def load_embeddings(include_co, exclude_parent, wordnet, domain, language='EN'):
    model = None
    model_poincare = None
    if include_co:
        if language == 'EN':
            model = gensim.models.KeyedVectors.load('embeddings/own_embeddings_w2v_all')
            print("Word2vec vocab size", len(model.wv.vocab))
        else:
            print("There is no word2vec model for a non-English language\nAbort...")
            sys.exit()
    if not exclude_parent:
        if wordnet:
            if language == 'EN':
                model_poincare = PoincareModel.load('embeddings/wordnet_filtered_50')
            else:
                print("There is no wordnet poincaré model for a non-English language\nAbort...")
                sys.exit()
        else:
            assert language in ['EN', 'FR', 'IT', 'NL'], "Language not supported. Aborting..."
            #model_poincare = PoincareModel.load('embeddings/poincare_common_domains_5_3_' + language + '_' + domain + '_50')
            model_poincare = PoincareModel.load('embeddings/poincare_common_and_domains_5_3_' + language + '_50')
            print("Poincare vocab size", len(model_poincare.kv.vocab))
            #print(model_poincare.kv.vocab)

    #wordlist = ["volcanic_eruption", "whipped_cream", 'ordinary_differential_equations', "Atlantic_Ocean", "electrical_engineering", "vanilla_extract", "wastewater", "lake", "freshwater", "water"]
    #wordlist = ["international_relations", "second_language_acquisition", "botany", "sweet_potatoes"]
    #for word in wordlist:
    #    print(word)
    #    distances = list(model_poincare.kv.distances(word))
    #    pairs = list(zip(distances, list(model_poincare.kv.vocab)))
    #    pairs = sorted(pairs)
    #    closest = [element[1] for element in pairs[:5]]
    #    print(closest, '\n')

    return [model, model_poincare]
def load_poincare_embeddings(file_name):
    """
    Load the pre-trained embeddings from a file
    :param file_name: the embeddings file
    :return: the vocabulary and the word vectors
    """
    model = PoincareModel.load(file_name)
    words = []
    vectors = []
    dim_size = EMBEDDINGS_DIM
    print("Loading %s poincare embeddings..." % len(model.kv.vocab))
    for i, term in enumerate(model.kv.vocab):
        # Normalise the term: drop non-ASCII characters, lowercase, replace underscores with spaces
        words.append(term.encode("ascii", errors="ignore").decode("ascii").lower().replace("_", " "))
        vector = model.kv.get_vector(term)
        dim_size = max(EMBEDDINGS_DIM, vector.size)
        vectors.append(vector_to_str(vector))
        if (i + 1) % max(1, len(model.kv.vocab) // 10) == 0:
            # Print progress 10 times over the whole vocabulary
            print("  %s / %s" % (i, len(model.kv.vocab)))
    print("Finished loading poincare embeddings.")
    return prepare_embeddings(words, vectors, dim_size)
def test_persistence_old_model(self):
    """Tests whether model from older gensim version is loaded correctly."""
    loaded = PoincareModel.load(datapath('poincare_test_3.4.0'))
    self.assertEqual(loaded.kv.syn0.shape, (239, 2))
    self.assertEqual(len(loaded.kv.vocab), 239)
    self.assertEqual(loaded.size, 2)
    self.assertEqual(len(loaded.all_relations), 200)
def test_persistence(self):
    """Tests whether the model is saved and loaded correctly."""
    model = PoincareModel(self.data, burn_in=0, negative=3)
    model.train(epochs=1)
    model.save(testfile())
    loaded = PoincareModel.load(testfile())
    self.models_equal(model, loaded)
def test_persistence_separate_file(self):
    """Tests whether the model is saved and loaded correctly when the arrays are stored separately."""
    model = PoincareModel(self.data, burn_in=0, negative=3)
    model.train(epochs=1)
    model.save(testfile(), sep_limit=1)
    loaded = PoincareModel.load(testfile())
    self.models_equal(model, loaded)
def train(rels, lang=LANG, epochs=VALUE_DEFAULT_EPOCHS, epochs_load=0, size=VALUE_DEFAULT_SIZE,
          negative=VALUE_DEFAULT_NEGATIVE, memo=VALUE_DEFAULT_MEMO, burnin=None, reg=None, resume=False):
    try:
        if resume:
            # Resume training from a previously saved model
            filename = make_filename_model(lang, epochs_load, size, negative, memo, burnin, reg)
            model = PoincareModel.load(filename)
            print("resume {}".format(filename))
        else:
            print("first training")
            raise ValueError()
    except:
        # First training run, or the saved model could not be found
        if resume:
            print("file not found")
        model = PoincareModel(rels, burn_in=0, regularization_coeff=0, negative=negative, size=size)
    model.train(epochs=epochs, print_every=1500)
    model.save(make_filename_model(lang, epochs + epochs_load, size, negative, memo, burnin, reg))
    return model
def test_train_after_load(self):
    """Tests whether the model can be trained correctly after loading from disk."""
    model = PoincareModel(self.data, burn_in=0, negative=3)
    model.train(epochs=1)
    model.save(testfile())
    loaded = PoincareModel.load(testfile())
    model.train(epochs=1)
    loaded.train(epochs=1)
    self.models_equal(model, loaded)
def load_vectors():
    """ Load word vectors. """
    embedding_dir = '/home/5aly/taxi/distributed_semantics/embeddings/'
    poincare_model = PoincareModel.load(embedding_dir + 'embeddings_poincare_wordnet')  # parent-cluster relationship
    own_model = gensim.models.KeyedVectors.load(embedding_dir + 'own_embeddings_w2v')  # family-cluster relationship
    return poincare_model, own_model
def load_poincare_model(path, word2vec_format=True, binary=False):
    """
    Load a Poincare embedding model.
    :param path: path of the file of the pre-trained Poincare embedding model
    :param word2vec_format: whether to load from word2vec format (default: True)
    :param binary: binary format (default: False)
    :return: a pre-trained Poincare embedding model
    :type path: str
    :type word2vec_format: bool
    :type binary: bool
    :rtype: gensim.models.poincare.PoincareKeyedVectors
    """
    if word2vec_format:
        return PoincareKeyedVectors.load_word2vec_format(path, binary=binary)
    else:
        return PoincareModel.load(path).kv
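A minimal usage sketch of the helper above; the two file paths are placeholders, not files shipped with gensim, and both calls return a PoincareKeyedVectors instance.

# Hypothetical paths: vectors exported in word2vec text format vs. a pickled PoincareModel.
kv_from_text = load_poincare_model('embeddings/poincare_vectors.txt', word2vec_format=True, binary=False)
kv_from_model = load_poincare_model('embeddings/poincare_model', word2vec_format=False)

print(len(kv_from_model.vocab))  # vocabulary size of the loaded embedding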
def main(poincare=''):
    import numpy as np
    from gensim.models.poincare import PoincareModel, PoincareKeyedVectors

    # Wrap pre-trained word2vec-format vectors in a PoincareModel so training can be continued
    pm = PoincareModel([], size=300, dtype=np.float64)
    emb = PoincareKeyedVectors.load_word2vec_format(poincare, fvocab=None, binary=False, encoding='utf8',
                                                    unicode_errors='strict', limit=None, datatype=np.float64)
    pm.kv = emb
    pm.save('w2v_poincare.pickle', pickle_protocol=4)

    pm2 = PoincareModel.load('w2v_poincare.pickle')
    pm2.train(10000, batch_size=10, print_every=1, check_gradients_every=None)
    pm2.save('w2v_poincare_after_train.pickle', pickle_protocol=4)
def run(mode, domain, embedding, exclude_parent=False, include_co=False, compound=False,
        wordnet=False, experiment_name=None, log=False):
    if embedding == "fasttext":
        #model = gensim.models.KeyedVectors.load_word2vec_format('wiki-news-300d-1M-subword.vec', binary=False)
        model = gensim.models.FastText.load_fasttext_format('wiki.en.bin')
        #model = gensim.models.FastText.load_fasttext_format('crawl-300d-2M.vec')
    elif embedding == "wiki2M":
        #model = gensim.models.FastText.load_fasttext_format('crawl-300d-2M.vec', 'vec')
        model = gensim.models.KeyedVectors.load_word2vec_format('embeddings/crawl-300d-2M.vec', binary=False)
        #model.save("crawl-300d-2M.bin")
    elif embedding == "wiki1M_subword":
        model = gensim.models.KeyedVectors.load_word2vec_format('embeddings/wiki-news-300d-1M-subword.vec', binary=False)
    elif embedding == "own_w2v":
        model = gensim.models.KeyedVectors.load('embeddings/own_embeddings_w2v')
    elif embedding == "quick":
        model = gensim.models.KeyedVectors.load_word2vec_format('embeddings/crawl-300d-2M.vec', binary=False, limit=50000)
    elif embedding == 'own_and_poincare':
        print("init")
        model = gensim.models.KeyedVectors.load('embeddings/own_embeddings_w2v_all')  #n2 #all
        #model_poincare = PoincareModel.load('embeddings/embeddings_' + domain + '_crawl_poincare_3_50')
        #model_poincare = PoincareModel.load('embeddings/embeddings_science_crawl_merge_poincare_10_3_50_02')
        model_poincare = PoincareModel.load('embeddings/poincare_common_domains02_5_3_50')
        #model_poincare = PoincareModel.load('embeddings/embeddings_poincare_wordnet')

    gold = []
    relations = []
    taxonomy = []
    outliers = []
    exclude_co = not include_co

    if mode == 'combined_embeddings_removal':
        #thresholds = [2, 4, 6, 8, 10, 12, 14]  # poincare and co-hyper test run
        thresholds = [6]
        for value in thresholds:
            gold, relations = read_all_data(domain)
            outliers = calculate_outliers(relations, model, threshold=value, model_poincare=model_poincare,
                                          compound=compound, no_parents=exclude_parent, no_co=exclude_co,
                                          wordnet=wordnet)
            compare_to_gold(gold=gold, taxonomy=relations, outliers=outliers, model=model,
                            log="logs/" + mode + "_" + embedding + "_" + str(value),
                            write_file="out/" + mode + "_" + embedding + "_" + str(value))

    elif mode == 'combined_embeddings_new_nodes':
        #thresholds = [2]
        thresholds = [2, 4, 6, 8, 10, 12, 14]  # poincare test run
        #thresholds = [12, 14, 18, 20]  # co-hyper test run
        for value in thresholds:
            gold, relations = read_all_data(domain)
            new_nodes = connect_new_nodes(taxonomy=relations, gold=gold, model=model,
                                          model_poincare=model_poincare, threshold=value,
                                          no_parents=exclude_parent, no_co=exclude_co, wordnet=wordnet)
            compare_to_gold(gold=gold, taxonomy=relations, model=model, model_poincare=model_poincare,
                            new_nodes=new_nodes)

    elif mode == 'combined_embeddings_removal_and_new':
        gold, relations = read_all_data(domain)
        new_nodes = connect_new_nodes(taxonomy=relations, gold=gold, model=model, model_poincare=model_poincare,
                                      threshold=2, no_parents=exclude_parent, no_co=exclude_co, wordnet=wordnet)
        outliers = calculate_outliers(relations, model, threshold=6, model_poincare=model_poincare,
                                      compound=compound, no_parents=exclude_parent, no_co=exclude_co,
                                      wordnet=wordnet)
        relations1 = compare_to_gold(gold=gold, taxonomy=relations, model=model, model_poincare=model_poincare,
                                     new_nodes=new_nodes)
        relations2 = compare_to_gold(gold=gold, taxonomy=relations, model=model, model_poincare=model_poincare,
                                     new_nodes=new_nodes, outliers=outliers)
        outliers = calculate_outliers(relations1, model, threshold=6, model_poincare=model_poincare,
                                      compound=compound, no_parents=exclude_parent, no_co=exclude_co,
                                      wordnet=wordnet)
        compare_to_gold(gold=gold, taxonomy=relations1, outliers=outliers, new_nodes=new_nodes, model=model,
                        model_poincare=model_poincare)
def run(mode, embedding, embedding_name, experiment_name=None, log=False, trial=False):
    if embedding == "fasttext":
        #model = gensim.models.KeyedVectors.load_word2vec_format('wiki-news-300d-1M-subword.vec', binary=False)
        model = gensim.models.FastText.load_fasttext_format('wiki.en.bin')
        #model = gensim.models.FastText.load_fasttext_format('crawl-300d-2M.vec')
    elif embedding == "wiki2M":
        #model = gensim.models.FastText.load_fasttext_format('crawl-300d-2M.vec', 'vec')
        model = gensim.models.KeyedVectors.load_word2vec_format('embeddings/crawl-300d-2M.vec', binary=False)
        #model.save("crawl-300d-2M.bin")
    elif embedding == "wiki1M_subword":
        model = gensim.models.KeyedVectors.load_word2vec_format('embeddings/wiki-news-300d-1M-subword.vec', binary=False)
    elif embedding == "own_w2v":
        model = gensim.models.KeyedVectors.load('embeddings/own_embeddings_w2v')
    elif embedding == "quick":
        model = gensim.models.KeyedVectors.load_word2vec_format('embeddings/crawl-300d-2M.vec', binary=False, limit=50000)
    elif embedding == "poincare":
        model = PoincareModel.load('embeddings/poincare_common_domains02_5_3_50')
        print(len(model.kv.vocab))
        words = ["computer_science", "biology", "physics", "science", "virology", "life_science", "chemistry",
                 "earth_science", "algebra", "economics", "optics", "immunology"]
        for word in words:
            print("Current word: ", word)
            if word in model.kv.vocab:
                try:
                    print("Closest Parent: ", model.kv.closest_parent(word))
                    print("Closest Child: ", model.kv.closest_child(word))
                    print("Descendants: ", model.kv.descendants(word))
                    print("Ancestors: ", model.kv.ancestors(word))
                    print("Hierarchy diff to Science: ", model.kv.difference_in_hierarchy(word, "science"))
                    print('\n')
                except:
                    continue
            else:
                print("Word not in Vocab")

    if mode == "visualize_embedding_poincare":
        relations = set([])
        filename_in = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data/isas_1000.tsv")
        with open(filename_in, 'r') as f:
            reader = csv.reader(f, delimiter='\t')
            for i, line in enumerate(reader):
                relations.add((line[0], line[1]))
        plot = poincare_2d_visualization(model, relations, experiment_name)
        py.image.save_as(plot, "vis/" + experiment_name + '.png')
        print("Starting visualization")
        #visualize_taxonomy(vectors, names)

    #todo own file for train
    if mode == "visualize_embedding":
        gold, relations = read_all_data()
        vectors = []
        names = []
        for relation in ([relation1[1].replace(" ", "_") for relation1 in relations] +
                         [relation2[2].replace(" ", "_") for relation2 in relations]):
            if relation not in names:
                if relation not in model.wv:
                    print(relation)
                    continue
                vectors.append(model.wv[relation])
                names.append(relation)
        visualize_taxonomy(vectors, names, experiment_name)

    if mode == 'train_poincare':
        # gold, relations = read_all_data()
        # freq_science = [3, 5]
        # for entry_science in freq_science:
        #     relations = './data/' + domain + '_crawl_' + str(entry_science) + '.tsv'
        #     #relations = './data/science_crawl_merge_10_3_02.tsv'
        #     poincare_rel = PoincareRelations(relations)
        #     dim = 50
        #     model = PoincareModel(poincare_rel, size=dim)
        #     print("Starting Training...")
        #     model.train(epochs=400)
        #     model.save("embeddings/embeddings_" + domain + "_crawl_poincare_" + str(entry_science) + "_" + str(dim))
        #     #model.save("embeddings/embeddings_science_crawl_merge_poincare_10_3_50_02")
        #     break
        relations = './data/poincare_common_domains.tsv'
        #relations = './data/science_crawl_merge_10_3_02.tsv'
        poincare_rel = PoincareRelations(relations)
        dim = 50
        model = PoincareModel(poincare_rel, size=dim)
        print("Starting Training...")
        model.train(epochs=400)
        model.save("embeddings/poincare_common_domains_5_3" + "_" + str(dim))

    if mode == "train_word2vec":
        gold_s, relations_s = read_all_data("science")
        gold_e, relations_e = read_all_data("environment")
        gold_f, relations_f = read_all_data("food")
        vocabulary = set([relation[2] for relation in gold_s] + [relation[1] for relation in gold_s])
        vocabulary = vocabulary | set([relation[2] for relation in gold_f] + [relation[1] for relation in gold_f])
        vocabulary = vocabulary | set([relation[2] for relation in gold_e] + [relation[1] for relation in gold_e])
        documents = list(read_input("/srv/data/5aly/data_text/wikipedia_utf8_filtered_20pageviews.csv", vocabulary))
        model = gensim.models.Word2Vec(size=300, window=5, min_count=5, workers=30)
        model.build_vocab(documents)
        #model.train(documents, total_examples=len(documents), epochs=10)
        model.train(documents, total_examples=model.corpus_count, epochs=30)
        model.save("embeddings/own_embeddings_w2v_all")

    elif mode == "analysis":
        gold, relations = read_all_data()
        voc_rel = set([relation[1] for relation in relations] + [relation[2] for relation in relations])
        voc_gold = set([relation[1] for relation in gold] + [relation[2] for relation in gold])
        print("Vocabulary in gold: " + str(len(voc_gold)) + ", vocabulary in taxonomy: " + str(len(voc_rel)))
# Similarity threshold used by the comparison below (the value was left unset in the original script)
border = ...

# Load the model trained in train.py
#from train import train
#model = Train.train()
# To save it:
#model.save("name.gz")

# Load the Poincare model (epochs=100) from the data folder
from gensim.models.poincare import PoincareModel, PoincareRelations
model = PoincareModel.load("../data/poincare_1.gz")

# Build the database of documents to compare against
from database import Database
database, id2doc = Database.database()

# Read the target document
import glob
path = glob.glob("../data/target_data/*")
path = path[0]
f = open(path)
text = f.read()
f.close()

# From the input (target document, model, document database, threshold), output the similar
# documents whose distance is below the threshold, in ascending order
from comparison import Comparison
ind = Comparison.comparison(text, database, model, border)
for i in ind:
    print(id2doc[i])
def test_train_old_model_after_load(self):
    """Tests whether loaded model from older gensim version can be trained correctly."""
    loaded = PoincareModel.load(datapath('poincare_test_3.4.0'))
    old_vectors = np.copy(loaded.kv.syn0)
    loaded.train(epochs=2)
    self.assertFalse(np.allclose(old_vectors, loaded.kv.syn0))
# Free cached gradients and GPU memory before rebuilding the model
if p.grad is not None:
    del p.grad
torch.cuda.empty_cache()
total_loss = 0.0
batch_begin_time = time()
model = self.train()
for p in model.parameters():
    if p.grad is not None:
        del p.grad
torch.cuda.empty_cache()
model = self.train()

# Build the inputs: POI index, type embeddings from the Poincare model, user histories, graph structure
shop_location_dict = process.create_POI_index(Parameters.shopMes_path, Parameters.shop_index_path)
shop_index = process.load_POI_index(Parameters.shop_index_path)
poincareModel = PoincareModel.load(Parameters.hieraechical_path)
type_embedding_matrix = process.create_type_matrix(Parameters.shopMes_path, poincareModel, shop_index)
user_history_data_dict = process.split_review_data(Parameters.review_path, shop_index)
struct_matrix = np.load(Parameters.concat_graph_matrix_path)
net = Net(poi_size=len(shop_index), embedding_matrix=type_embedding_matrix, struct_topology=struct_matrix,
          user_history_data_dict=user_history_data_dict).cuda()
net.fit("model.pkl")

count = 0
hit_recommend = 0
for test_dict in process.get_test_data(Parameters.test_review_path, user_history_data_dict, Parameters.MAX_LEN,
                                       shop_index, shop_location_dict):
    count += 1
    recommend_poi_score = dict()
    user = test_dict['user_index']
    poi = test_dict['poi_index']
# parameters
size = 15      # dimension of the embedding space
c = 15         # number of negative samples used during training
epochs = 2000  # number of training epochs

# define the model
model = PoincareModel(relations_, size=size, negative=c)
model.train(epochs)

# save the model
model.save('/path/to/model')
# save the model embedding
model.kv.save_word2vec_format("/path/to/embedding")

# load the model and the embedding
model = PoincareModel.load("/path/to/model")
model.kv = PoincareKeyedVectors.load_word2vec_format("/path/to/embedding")

all_relations = set(relations_)
# add different classes to the labels to add them to the graph
labels = list(set([_[0] for _ in relations_])) + ["girl_name", "boy_name", "mixed_name", "body_part", "benin_city"]
title = "Title Figure"
fig = poincare_2d_visualization(model, all_relations, title, show_node_labels=labels)
py.image.ishow(fig, width=1000, height=1000)
py.image.save_as(fig, filename='path/to/plot/plot_name.png')

# compute link prediction evaluation
link = LinkPredictionEvaluation(path, path_validation, model.kv)
print(link.evaluate())
wordrank_path = 'wordrank'  # path to Wordrank directory
out_dir = 'model'  # name of output directory to save data to
data = '../../gensim/test/test_data/lee.cor'  # sample corpus
model = Wordrank.train(wordrank_path, data, out_dir, iter=21, dump_period=10)

varembed_vectors = '../../gensim/test/test_data/varembed_leecorpus_vectors.pkl'
model = varembed.VarEmbed.load_varembed_format(vectors=varembed_vectors)

morfessors = '../../gensim/test/test_data/varembed_leecorpus_morfessor.bin'
model = varembed.VarEmbed.load_varembed_format(vectors=varembed_vectors, morfessor_model=morfessors)

import os
poincare_directory = os.path.join(os.getcwd(), 'docs', 'notebooks', 'poincare')
data_directory = os.path.join(poincare_directory, 'data')
wordnet_mammal_file = os.path.join(data_directory, 'wordnet_mammal_hypernyms.tsv')

from gensim.models.poincare import PoincareModel, PoincareKeyedVectors, PoincareRelations
relations = PoincareRelations(file_path=wordnet_mammal_file, delimiter='\t')
model = PoincareModel(train_data=relations, size=2, burn_in=0)
model.train(epochs=1, print_every=500)

models_directory = os.path.join(poincare_directory, 'models')
test_model_path = os.path.join(models_directory, 'gensim_model_batch_size_10_burn_in_0_epochs_50_neg_20_dim_50')
model = PoincareModel.load(test_model_path)
model = train(rels_n, epochs=200, size=200, memo="n", burnin=0, reg=0)

# In[ ]:

# train more (200 epochs, 200 dims, burnin and reg = 0)
model = train(rels_n, epochs=200, size=200, memo="n", epochs_load=200, resume=True, burnin=0, reg=0)

# #### Some attempts

# In[ ]:

model = PoincareModel.load(make_filename_model(LANG, 400, 200, 10, "n", 0, 0))

# In[ ]:

model.kv.descendants("ไก่")

# *Cleaning up the structure or separating it seems to be needed*

# ### visualize sample hypernyms (synsets)

# In[ ]: