예제 #1
0
 def test_persistence_separate_file(self):
     """Round-trip a trained model through save/load with arrays split into separate files."""
     original = PoincareModel(self.data, burn_in=0, negative=3)
     original.train(epochs=1)
     # sep_limit=1 forces gensim to store the numpy arrays in sidecar files.
     original.save(testfile(), sep_limit=1)
     restored = PoincareModel.load(testfile())
     self.models_equal(original, restored)
예제 #2
0
 def test_persistence_separate_file(self):
     """Check that save/load preserves the model when arrays are persisted separately."""
     trained = PoincareModel(self.data, burn_in=0, negative=3)
     trained.train(epochs=1)
     # A tiny sep_limit makes the embedding arrays go to separate files on disk.
     trained.save(testfile(), sep_limit=1)
     reloaded = PoincareModel.load(testfile())
     self.models_equal(trained, reloaded)
예제 #3
0
 def test_persistence(self):
     """Verify that saving and re-loading yields an equivalent model."""
     trained = PoincareModel(self.data, burn_in=0, negative=3)
     trained.train(epochs=1)
     trained.save(testfile())
     restored = PoincareModel.load(testfile())
     self.models_equal(trained, restored)
예제 #4
0
 def test_persistence(self):
     """Save a briefly-trained model and check the reloaded copy matches it."""
     source = PoincareModel(self.data, burn_in=0, negative=3)
     source.train(epochs=1)
     source.save(testfile())
     roundtripped = PoincareModel.load(testfile())
     self.models_equal(source, roundtripped)
예제 #5
0
def train(rels,
          lang=LANG,
          epochs=VALUE_DEFAULT_EPOCHS,
          epochs_load=0,
          size=VALUE_DEFAULT_SIZE,
          negative=VALUE_DEFAULT_NEGATIVE,
          memo=VALUE_DEFAULT_MEMO,
          burnin=None,
          reg=None,
          resume=False):
    """Train a Poincare model on `rels`, optionally resuming from a checkpoint.

    When `resume` is true, tries to load the model saved under the filename
    derived from (lang, epochs_load, size, negative, memo, burnin, reg); if
    loading fails (or resume is false) a fresh model is built from `rels`.
    After training, the model is saved under a filename whose epoch count is
    `epochs + epochs_load`, and the model is returned.
    """
    try:
        if resume:
            filename = make_filename_model(lang, epochs_load, size, negative, memo, burnin, reg)
            model = PoincareModel.load(filename)
            print("resume {}".format(filename))
        else:
            # Deliberately raise to fall through to the fresh-model branch below.
            print("first training")
            raise ValueError()
    # Was a bare `except:` -- that also swallows SystemExit/KeyboardInterrupt.
    # `except Exception` keeps the intended "fall back to fresh model" behavior
    # for load failures while letting process-control exceptions propagate.
    except Exception:
        if resume:
            print("file not found")
        model = PoincareModel(rels, burn_in=0, regularization_coeff=0, negative=negative, size=size)

    model.train(epochs=epochs, print_every=1500)
    model.save(make_filename_model(lang, epochs + epochs_load, size, negative, memo, burnin, reg))

    return model
예제 #6
0
def train_run(args):
    """Run one experiment: name it from the args, create log/params folders,
    train a Poincare model, then persist the model and the argument namespace."""
    # The experiment name encodes every hyper-parameter so runs are distinguishable.
    exp_name = (
        'HB' + 'time' + str(datetime.now()) + '_EXP' + str(args.train_dir)
        + '_prbt' + str(args.prob_threshold) + '_reg' + str(args.reg_coef)
        + '_dim' + str(args.embed_dim) + '_lr' + str(args.learning_rate)
        + '_neg' + str(args.negs) + '_epoc' + str(args.epochs) + '_burnin' + str(args.burn_in)
    )

    # Sanitize characters that are awkward or illegal in folder names.
    for bad_char in (":", "/", " "):
        exp_name = exp_name.replace(bad_char, "-")
    print(exp_name)

    # Training logs folder.
    exp_log_folder = args.log_folder + exp_name + '/'
    if not os.path.exists(exp_log_folder):
        os.makedirs(exp_log_folder)

    logging_file = exp_log_folder + 'logging.txt'
    logging.basicConfig(filename=logging_file, level=logging.INFO)

    # Model saving folder.
    exp_params_folder = args.params_folder + exp_name + '/'
    if not os.path.exists(exp_params_folder):
        os.makedirs(exp_params_folder)

    # Load the training pairs.
    training_file = args.train_dir + args.trn_file
    trn_dataset = data_loader.get_data_list(training_file, args.prob_threshold)
    print("Number of training examples: ", len(trn_dataset))

    # Build the model from the command-line hyper-parameters.
    model = PoincareModel(train_data=trn_dataset,
                          size=args.embed_dim,
                          alpha=args.learning_rate,
                          negative=args.negs,
                          regularization_coeff=args.reg_coef,
                          burn_in=args.burn_in,
                          burn_in_alpha=args.burn_in_alpha,
                          init_range=args.init_range,
                          seed=args.random_seed)

    # Train, then persist the weights.
    model.train(epochs=args.epochs,
                batch_size=args.batch_size,
                print_every=args.print_every)

    model.save(exp_params_folder + 'gensim_model.params')

    # Keep the argument namespace next to the weights for reproducibility.
    with open(exp_params_folder + 'args_model.pkl', "wb") as f:
        pickle.dump(args, f)

    return
예제 #7
0
 def test_train_after_load(self):
     """Ensure a model restored from disk trains identically to the in-memory one."""
     original = PoincareModel(self.data, burn_in=0, negative=3)
     original.train(epochs=1)
     original.save(testfile())
     restored = PoincareModel.load(testfile())
     # One more epoch on each copy; they must stay in lockstep.
     original.train(epochs=1)
     restored.train(epochs=1)
     self.models_equal(original, restored)
예제 #8
0
 def test_train_after_load(self):
     """Check that training can continue correctly on a model reloaded from disk."""
     source = PoincareModel(self.data, burn_in=0, negative=3)
     source.train(epochs=1)
     source.save(testfile())
     roundtripped = PoincareModel.load(testfile())
     # An extra epoch on each copy must leave them equal.
     source.train(epochs=1)
     roundtripped.train(epochs=1)
     self.models_equal(source, roundtripped)
def hello_world():
    """Train a tiny Poincare model on a toy taxonomy and demo the query API."""
    relations = [('math', 'science'), ('cs', 'science'), ('ml', 'cs'),
                 ('db', 'cs'), ('linalg', 'math')]
    model = PoincareModel(relations, size=8, negative=2)
    model.train(epochs=50)
    # Poincare distance between two entities
    print(model.kv.distance('ml', 'db'))
    # Compute absolute position in hierarchy of input node or vector.
    # Values range between 0 and 1. A lower value indicates the input
    # node or vector is higher in the hierarchy.
    # (The original printed this twice in a row; the duplicate was removed.)
    print(model.kv.norm('ml'))
    # Get the vectors
    print(model.kv.get_vector('ml'))
    model.save('test_embeddings.bin')
    model.kv.save_word2vec_format('test_embeddings.w2v')
    return
def main(poincare=''):
    """Wrap pre-trained Poincare word vectors in a PoincareModel, save it,
    reload it, and continue training from the loaded state."""
    from gensim.models.poincare import PoincareModel
    # Empty relation list: the model is only a container for the loaded vectors.
    model = PoincareModel([], size=300, dtype=np.float64)
    vectors = PoincareKeyedVectors.load_word2vec_format(
        poincare,
        fvocab=None,
        binary=False,
        encoding='utf8',
        unicode_errors='strict',
        limit=None,
        datatype=np.float64,
    )
    model.kv = vectors

    model.save('w2v_poincare.pickle', pickle_protocol=4)
    reloaded = PoincareModel.load('w2v_poincare.pickle')

    reloaded.train(10000, batch_size=10, print_every=1, check_gradients_every=None)
    reloaded.save('w2v_poincare_after_train.pickle', pickle_protocol=4)
def run(mode, language):
    """Train embeddings according to `mode`.

    Modes:
      - "train_poincare_custom": Poincare embeddings from a language-specific TSV.
      - "train_poincare_wordnet": Poincare embeddings from a filtered WordNet closure (EN only).
      - "train_word2vec": word2vec embeddings from a Wikipedia dump.
    """

    def _vocab(*gold_sets):
        """Lowercased set of both endpoints of every relation in the given gold sets."""
        words = set()
        for gold in gold_sets:
            words |= {relation[0].lower() for relation in gold}
            words |= {relation[1].lower() for relation in gold}
        return words

    if mode == "train_poincare_custom":
        gold_s, _ = read_all_data(domain="science", language=language)
        gold_e, _ = read_all_data(domain="environment", language=language)
        gold_f, _ = read_all_data(domain="food", language=language)
        vocabulary = _vocab(gold_s, gold_f, gold_e)  # NOTE(review): unused in this branch -- confirm
        relations = "data/" + language + "/poincare_common_and_domains_" + language + ".tsv"
        # Use a context manager so the handle is closed (original leaked the open file).
        with open(relations, 'r') as relations_file:
            relation_count = len(relations_file.readlines())
        assert relation_count > 10, "Not enough relations to train embeddings. Aborting ..."
        poincare_rel = PoincareRelations(relations)
        dim = 50
        model = PoincareModel(poincare_rel, size=dim)
        print("Starting Training...")
        model.train(epochs=400)
        model.save("embeddings/poincare_common_and_domains_5_3_" + language + "_" + str(dim))

    if mode == 'train_poincare_wordnet':
        assert language == 'EN', "Wordnet consists only of English nouns"

        gold_s, _ = read_all_data(domain="science")
        gold_e, _ = read_all_data(domain="environment")
        gold_f, _ = read_all_data(domain="food")
        vocabulary = _vocab(gold_s, gold_f, gold_e)

        # Filter the WordNet closure down to the gold vocabulary before training.
        preprocess_wordnet('data/EN/noun_closure.tsv', vocabulary)
        poincare_rel = PoincareRelations('data/EN/noun_closure_filtered.tsv')
        dim = 50
        model = PoincareModel(poincare_rel, size=dim)
        print("Starting Training...")
        model.train(epochs=400)
        model.save("embeddings/wordnet_filtered" + "_" + str(dim))

    if mode == "train_word2vec":
        gold_s, relations_s = read_all_data("science")
        gold_e, relations_e = read_all_data("environment")
        gold_f, relations_f = read_all_data("food")
        vocabulary = _vocab(gold_s, gold_f, gold_e)

        documents = list(read_input("/data/EN/wikipedia_utf8_filtered_20pageviews.csv", vocabulary))
        model = gensim.models.Word2Vec(documents, size=300, window=10, min_count=2, workers=10)
        model.train(documents, total_examples=len(documents), epochs=30)
        print("Finished building word2vec model")
        model.save("embeddings/own_embeddings_w2v")
예제 #12
0
# Train Poincare embeddings over (cuisine) type relations and persist them.
from gensim.models.poincare import PoincareModel, PoincareRelations
from gensim.test.utils import datapath
from utils import Parameters
import pandas as pd  # NOTE(review): unused in this snippet -- confirm before removing

# TSV of relation pairs; read with "gbk" encoding (labels below are Chinese).
data_path = datapath("D:/PyCharm/PyCharm_Project/paper/data/type_relation.tsv")
type_embedding_path = "data/type_embedding"
model = PoincareModel(train_data=PoincareRelations(data_path, encoding="gbk"),
                      size=Parameters.type_embedding_dim,
                      negative=3)  # 3 negative samples per positive pair
model.train(epochs=50, print_every=5)
# Sanity check: print the learned vector of one known type label.
print(model.kv.word_vec("川菜"))
model.save(type_embedding_path)

# poincareModel = PoincareModel.load("data/type_embedding")
# print(poincareModel.kv.word_vec('东北菜'))
예제 #13
0
# Train, save, reload and visualize a Poincare embedding for a custom word list.
path = "/path/to/training_set.txt" # poincare_dict.txt
path_lexical = "path/to/lexical.txt" # example file_lexical_fon.txt following the HyperLex Format
path_validation = "/file/to/validation_set.txt" # validation set poincare_embedding_validation.txt
relations_ = load_doc(path)

# parameters
size = 15 # dimension of the embedding space
# NOTE(review): in gensim's PoincareModel, `negative` is the number of negative
# samples per positive pair, not a curvature constant -- the comment below
# looks wrong; confirm the intended meaning.
c = 15 # constant of negative curvature
epochs = 2000 # number of training epochs
# define the model
model = PoincareModel(relations_, size=size, negative=c)
model.train(epochs)

# save the model
model.save('/path/to/model')
# save model embedding
model.kv.save_word2vec_format("/path/to/embedding")

# load the model and the embedding
model = PoincareModel.load("/path/to/model")
# NOTE(review): load_word2vec_format is a classmethod that RETURNS a new
# KeyedVectors object; called on model.kv its result is discarded. The
# PoincareModel.load above already restores the vectors, so this line appears
# redundant -- confirm before removing.
model.kv.load_word2vec_format("/path/to/embedding")

all_relations = set(relations_)
# add different classes to the labels to add them to the graph
labels = list(set([_[0] for _ in relations_])) + ["girl_name", "boy_name", "mixed_name", "body_part", "benin_city"]

title = "Title Figure"
# Fix: the original passed the undefined name `new_model_10` (NameError);
# the model loaded above is what should be visualized.
fig = poincare_2d_visualization(model, all_relations, title, show_node_labels=labels)

# NOTE(review): `plt.image.ishow` is not a matplotlib API; plotly figures are
# normally shown with `plotly.offline.iplot(fig)` or `fig.show()` -- confirm
# which plotting library is intended here.
plt.image.ishow(fig, width=1000, height=1000)

# Build aspect-aspect relations from discourse trees and train a 2-D Poincare
# embedding over them.
logging.basicConfig(level=logging.INFO)

DATASET_PATH = settings.AMAZON_REVIEWS_CELL_PHONES_AND_ACCESSORIES_DATASET_JSON

# Experiment container: derives the output paths used below (the discourse-tree
# pickle and the Poincare embedding path) from the dataset location.
aspect_analysis_gerani = AspectAnalysis(
    input_path=DATASET_PATH.as_posix(),
    output_path=settings.DEFAULT_OUTPUT_PATH / DATASET_PATH.stem,
    experiment_name='gerani',
    max_docs=50000)

discourse_tree_df = pd.read_pickle(
    aspect_analysis_gerani.paths.discourse_trees_df)

# Training pairs: for every rule edge between two EDUs, pair each aspect of the
# left EDU with each aspect of the right EDU (cartesian product).
relations = []

for row_id, row in tqdm(discourse_tree_df.iterrows(),
                        total=len(discourse_tree_df),
                        desc='Generating aspect-aspect graph based on rules'):
    # NOTE(review): `relation` and `weight` are unpacked but never used --
    # confirm whether weighting was meant to be applied.
    for edu_left, edu_right, relation, weight in row.rules:
        for aspect_left, aspect_right in product(row.aspects[edu_left],
                                                 row.aspects[edu_right]):
            relations.append((aspect_left, aspect_right))

# 2-dimensional embedding (easy to plot); burn-in disabled.
model = PoincareModel(train_data=relations, size=2, burn_in=0)

model.train(epochs=100, print_every=500)

model.save(aspect_analysis_gerani.paths.aspects_poincare_embeddings)
예제 #15
0
def run(mode,
        embedding,
        embedding_name,
        experiment_name=None,
        log=False,
        trial=False):
    if embedding == "fasttext":
        #model = gensim.models.KeyedVectors.load_word2vec_format('wiki-news-300d-1M-subword.vec', binary=False)
        model = gensim.models.FastText.load_fasttext_format('wiki.en.bin')
        #model = gensim.models.FastText.load_fasttext_format('crawl-300d-2M.vec')
    elif embedding == "wiki2M":
        #model = gensim.models.FastText.load_fasttext_format('crawl-300d-2M.vec','vec')
        model = gensim.models.KeyedVectors.load_word2vec_format(
            'embeddings/crawl-300d-2M.vec', binary=False)
        #model.save("crawl-300d-2M.bin")
    elif embedding == "wiki1M_subword":
        model = gensim.models.KeyedVectors.load_word2vec_format(
            'embeddings/wiki-news-300d-1M-subword.vec', binary=False)

    elif embedding == "own_w2v":
        model = gensim.models.KeyedVectors.load(
            'embeddings/own_embeddings_w2v')

    elif embedding == "quick":
        model = gensim.models.KeyedVectors.load_word2vec_format(
            'embeddings/crawl-300d-2M.vec', binary=False, limit=50000)
    elif embedding == "poincare":
        model = PoincareModel.load(
            'embeddings/poincare_common_domains02_5_3_50')
        print(len(model.kv.vocab))
        words = [
            "computer_science", "biology", "physics", "science", "virology",
            "life_science", "chemistry", "earth_science", "algebra",
            "economics", "optics"
            "immunology"
        ]
        for word in words:
            print("Current word: ", word)

            if word in model.kv.vocab:
                try:
                    print("Closest Parent: ", model.kv.closest_parent(word))
                    print("Closest Child ", model.kv.closest_child(word))
                    print("Descendants: ", model.kv.descendants(word))
                    print("Ancestors: ", model.kv.ancestors(word))
                    print("Hierarchy diff to Science: ",
                          model.kv.difference_in_hierarchy(word, "science"))
                    print('\n')
                except:
                    continue
            else:
                print("Word not in Vocab")

    if mode == "visualize_embedding_poincare":
        relations = set([])
        filename_in = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   "data/isas_1000.tsv")
        with open(filename_in, 'r') as f:
            reader = csv.reader(f, delimiter='\t')
            for i, line in enumerate(reader):
                relations.add((line[0], line[1]))
        plot = poincare_2d_visualization(model, relations, experiment_name)
        py.image.save_as(plot, "vis/" + experiment_name + '.png')
        print("Starting visualization")

        #visualize_taxonomy(vectors, names)


#todo own file for train
    if mode == "visualize_embedding":
        gold, relations = read_all_data()
        vectors = []
        names = []
        for relation in (
            [relation1[1].replace(" ", "_") for relation1 in relations] +
            [relation2[2].replace(" ", "_") for relation2 in relations]):
            if relation not in names:
                if relation not in model.wv:
                    print(relation)
                    continue
                vectors.append(model.wv[relation])
                names.append(relation)
        visualize_taxonomy(vectors, names, experiment_name)

    if mode == 'train_poincare':
        # gold,relations = read_all_data()
        # freq_science = [3,5]
        # for entry_science in freq_science:
        #     relations = './data/' + domain +'_crawl_' + str(entry_science) +'.tsv'
        #     #relations = './data/science_crawl_merge_10_3_02.tsv'
        #     poincare_rel = PoincareRelations(relations)
        #     dim = 50
        #     model = PoincareModel(poincare_rel, size = dim)
        #     print("Starting Training...")
        #     model.train(epochs=400)
        #     model.save("embeddings/embeddings_" + domain + "_crawl_poincare_" + str(entry_science) + "_" + str(dim))
        #     #model.save("embeddings/embeddings_science_crawl_merge_poincare_10_3_50_02")
        #     break
        relations = './data/poincare_common_domains.tsv'
        #relations = './data/science_crawl_merge_10_3_02.tsv'
        poincare_rel = PoincareRelations(relations)
        dim = 50
        model = PoincareModel(poincare_rel, size=dim)
        print("Starting Training...")
        model.train(epochs=400)
        model.save("embeddings/poincare_common_domains_5_3" + "_" + str(dim))

    if mode == "train_word2vec":
        gold_s, relations_s = read_all_data("science")
        gold_e, relations_e = read_all_data("environment")
        gold_f, relations_f = read_all_data("food")
        vocabulary = set([relation[2] for relation in gold_s] +
                         [relation[1] for relation in gold_s])
        vocabulary = vocabulary | set([relation[2] for relation in gold_f] +
                                      [relation[1] for relation in gold_f])
        vocabulary = vocabulary | set([relation[2] for relation in gold_e] +
                                      [relation[1] for relation in gold_e])
        documents = list(
            read_input(
                "/srv/data/5aly/data_text/wikipedia_utf8_filtered_20pageviews.csv",
                vocabulary))
        model = gensim.models.Word2Vec(size=300,
                                       window=5,
                                       min_count=5,
                                       workers=30)
        model.build_vocab(documents)
        #model.train(documents, total_examples = len(documents), epochs=10)
        model.train(documents, total_examples=model.corpus_count, epochs=30)
        model.save("embeddings/own_embeddings_w2v_all")

    elif mode == "analysis":
        gold, relations = read_all_data()
        voc_rel = set([relation[1] for relation in relations] +
                      [relation[2] for relation in relations])
        voc_gold = set([relation[1] for relation in gold] +
                       [relation[2] for relation in gold])
        print("Vokabeln in Gold: " + str(len(voc_gold)) +
              "Vokabeln in Taxonomy: " + str(len(voc_rel)))