def test_encoding_handling(self):
    """Verify that cp852- and utf8-encoded relation files both load correctly."""
    cases = [
        ('poincare_cp852.tsv', {'encoding': 'cp852'}),
        ('poincare_utf8.tsv', {}),
    ]
    for fixture, reader_kwargs in cases:
        relations = list(PoincareRelations(datapath(fixture), **reader_kwargs))
        self.assertEqual(len(relations), 2)
        self.assertEqual(relations[0], (u'tímto', u'budeš'))
def _collect_vocabulary(*relation_lists):
    """Return the set of lower-cased terms from both columns of each relation list."""
    vocabulary = set()
    for relations in relation_lists:
        vocabulary |= set(relation[0].lower() for relation in relations)
        vocabulary |= set(relation[1].lower() for relation in relations)
    return vocabulary


def run(mode, language):
    """Train the embedding selected by *mode*.

    Args:
        mode (str): one of 'train_poincare_custom', 'train_poincare_wordnet',
            'train_word2vec'.
        language (str): language code used to locate data files and name the
            saved embeddings (must be 'EN' for the wordnet mode).
    """
    if mode == "train_poincare_custom":
        gold_s, _ = read_all_data(domain="science", language=language)
        gold_e, _ = read_all_data(domain="environment", language=language)
        gold_f, _ = read_all_data(domain="food", language=language)
        # NOTE(review): vocabulary is built but not used in this branch — confirm intent.
        vocabulary = _collect_vocabulary(gold_s, gold_f, gold_e)
        relations = "data/" + language + "/poincare_common_and_domains_" + language + ".tsv"
        # Fix: the original `open(...).readlines()` leaked the file handle.
        with open(relations, 'r') as relations_file:
            num_relations = len(relations_file.readlines())
        assert num_relations > 10, "Not enough relations to train embeddings. Aborting ..."
        poincare_rel = PoincareRelations(relations)
        dim = 50
        model = PoincareModel(poincare_rel, size=dim)
        print("Starting Training...")
        model.train(epochs=400)
        model.save("embeddings/poincare_common_and_domains_5_3_" + language + "_" + str(dim))
    if mode == 'train_poincare_wordnet':
        assert language == 'EN', "Wordnet consists only of English nouns"
        gold_s, _ = read_all_data(domain="science")
        gold_e, _ = read_all_data(domain="environment")
        gold_f, _ = read_all_data(domain="food")
        vocabulary = _collect_vocabulary(gold_s, gold_f, gold_e)
        # Restrict the WordNet closure to the gold vocabulary before training.
        preprocess_wordnet('data/EN/noun_closure.tsv', vocabulary)
        poincare_rel = PoincareRelations('data/EN/noun_closure_filtered.tsv')
        dim = 50
        model = PoincareModel(poincare_rel, size=dim)
        print("Starting Training...")
        model.train(epochs=400)
        model.save("embeddings/wordnet_filtered" + "_" + str(dim))
    if mode == "train_word2vec":
        gold_s, relations_s = read_all_data("science")
        gold_e, relations_e = read_all_data("environment")
        gold_f, relations_f = read_all_data("food")
        vocabulary = _collect_vocabulary(gold_s, gold_f, gold_e)
        documents = list(read_input("/data/EN/wikipedia_utf8_filtered_20pageviews.csv", vocabulary))
        model = gensim.models.Word2Vec(documents, size=300, window=10, min_count=2, workers=10)
        model.train(documents, total_examples=len(documents), epochs=30)
        print("Finished building word2vec model")
        model.save("embeddings/own_embeddings_w2v")
def train_embeddings(input_path, delimiter, output_path, size=2, alpha=0.1,
                     burn_in=10, burn_in_alpha=0.01, workers=1, negative=10,
                     epochs=100, print_every=500, batch_size=10):
    """Train Poincare embeddings from an edge-relation file and save the vectors.

    Args:
        input_path: path to the input file of edge relations.
        delimiter: field delimiter of the input file.
        output_path: path where the embedding vectors are written
            (word2vec text format).
        size: embedding dimensionality.
        alpha: learning rate.
        burn_in: number of burn-in training rounds.
        burn_in_alpha: learning rate during burn-in.
        workers: number of training threads.
        negative: negative-sample count.
        epochs: number of training rounds.
        print_every: how often training progress is reported.
        batch_size: number of samples per training batch.
    """
    # Load the entity-relation edges, fit the model, then persist the vectors.
    edge_relations = PoincareRelations(file_path=input_path, delimiter=delimiter)
    poincare = PoincareModel(train_data=edge_relations, size=size, alpha=alpha,
                             burn_in=burn_in, burn_in_alpha=burn_in_alpha,
                             workers=workers, negative=negative)
    poincare.train(epochs=epochs, print_every=print_every, batch_size=batch_size)
    poincare.kv.save_word2vec_format(output_path)
    return
# Train Poincare embeddings over a type-relation hierarchy and persist the model.
from gensim.models.poincare import PoincareModel, PoincareRelations
from gensim.test.utils import datapath
from utils import Parameters
import pandas as pd

# Absolute (machine-specific) path to the tab-separated type-relation edge list.
data_path = datapath("D:/PyCharm/PyCharm_Project/paper/data/type_relation.tsv")
type_embedding_path = "data/type_embedding"
# NOTE(review): the relations file is read with the "gbk" codec — presumably it
# contains Chinese type names; confirm the file's actual encoding.
model = PoincareModel(train_data=PoincareRelations(data_path, encoding="gbk"),
                      size=Parameters.type_embedding_dim,
                      negative=3)
model.train(epochs=50, print_every=5)
# Sanity check: print the learned vector for one known type before saving.
print(model.kv.word_vec("川菜"))
model.save(type_embedding_path)
# poincareModel = PoincareModel.load("data/type_embedding")
# print(poincareModel.kv.word_vec('东北菜'))
def train():
    """Fit a 5-dimensional Poincare model on the word-relation CSV and return it."""
    from gensim.models.poincare import PoincareModel, PoincareRelations

    edge_data = PoincareRelations(file_path="../data/word_relation.csv", delimiter=',')
    poincare = PoincareModel(edge_data, negative=10, size=5)
    poincare.train(epochs=500)
    return poincare
Args: word (str): arbitrary word Returns: child_word (str) : The closest parent word in Wordnet format """ parent_word = None try: wnet_word = word_to_wn(word) parent_word = poincare_model.kv.closest_parent(wnet_word) except Exception as e : print(e) return parent_word if __name__ == '__main__': parser = argparse.ArgumentParser(description='Anotator options') parser.add_argument('-p', action="store", type=str, dest = 'dataset_path', help ='designated dataset path', default = None) parser.add_argument('-d', action="store", type=str, dest = 'delimiter', help ='the split for the hyper touples', default = '\t') path = parser.path dataset = PoincareRelations(file_path=path, delimiter='\t') model = poincare_train(dataset)
# NOTE: the hyp/hypTrain/hypTest writers that used to live here were already
# disabled; only the poly splits are written out.
splits = (
    ('polyFileEdgesRand.tsv', poly),
    ('polyTrainFileRand.tsv', polyTrain),
    ('polyTestFileRand.tsv', polyTest),
)
for split_name, split_rows in splits:
    with open(split_name, 'w+') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerows(split_rows)

print("POLY TO HYP")
# Train a Poincare model on the random-edge file, then score link prediction
# (train/test split) and reconstruction against the poly files written above.
model = PoincareModel(PoincareRelations("randFileEdges.tsv"), negative=2)
model.train(epochs=100)
test = LinkPredictionEvaluation("polyTrainFileRand.tsv", "polyTestFileRand.tsv", model.kv)
print(test.evaluate())
recon = ReconstructionEvaluation("polyFileEdgesRand.tsv", model.kv)
print(recon.evaluate())
# Train a Poincare embedding over an integer-labelled tree and recover the
# child->parent structure from the learned vectors.
import json
from gensim.models.poincare import PoincareModel, PoincareRelations
from gensim.test.utils import datapath

# leaves and total nodes
num = 1840
total = 8801
# NOTE(review): placeholder path — replace with the real comma-delimited edge file.
file_path = datapath('path of the original dataset for poincare')
model = PoincareModel(PoincareRelations(file_path, delimiter=','), negative=2, size=32)
model.train(epochs=10000, print_every=10)
# Map every node id (0 .. total-1) to its closest parent in the embedding;
# nodes without a parent (e.g. the root) are printed and skipped.
child2ParentDict = {}
for each in range(total):
    r = model.kv.closest_parent(str(each))
    if r is None:
        print(each)
    else:
        child2ParentDict[each] = r
# Invert the child->parent map into parent -> list of children.
# (This snippet is truncated here in the visible source.)
parent2ChildDict = {}
for child in child2ParentDict:
    parent = child2ParentDict[child]
    if (parent in parent2ChildDict):
        parent2ChildDict[parent].append(child)
    else:
def run(mode, embedding, embedding_name, experiment_name=None, log=False, trial=False):
    """Load the embedding selected by *embedding* and execute the experiment *mode*.

    Args:
        mode (str): experiment to run ('visualize_embedding_poincare',
            'visualize_embedding', 'train_poincare', 'train_word2vec', 'analysis').
        embedding (str): which pre-trained embedding to load
            ('fasttext', 'wiki2M', 'wiki1M_subword', 'own_w2v', 'quick', 'poincare').
        embedding_name (str): label for the embedding (unused in the visible code).
        experiment_name (str): name used for visualization output files.
        log, trial (bool): flags (unused in the visible code).
    """
    if embedding == "fasttext":
        model = gensim.models.FastText.load_fasttext_format('wiki.en.bin')
    elif embedding == "wiki2M":
        model = gensim.models.KeyedVectors.load_word2vec_format(
            'embeddings/crawl-300d-2M.vec', binary=False)
    elif embedding == "wiki1M_subword":
        model = gensim.models.KeyedVectors.load_word2vec_format(
            'embeddings/wiki-news-300d-1M-subword.vec', binary=False)
    elif embedding == "own_w2v":
        model = gensim.models.KeyedVectors.load('embeddings/own_embeddings_w2v')
    elif embedding == "quick":
        model = gensim.models.KeyedVectors.load_word2vec_format(
            'embeddings/crawl-300d-2M.vec', binary=False, limit=50000)
    elif embedding == "poincare":
        model = PoincareModel.load('embeddings/poincare_common_domains02_5_3_50')
        print(len(model.kv.vocab))
        words = [
            "computer_science", "biology", "physics", "science", "virology",
            "life_science", "chemistry", "earth_science", "algebra",
            "economics",
            # Bug fix: "optics" and "immunology" were missing the separating
            # comma and were silently concatenated to "opticsimmunology".
            "optics", "immunology"
        ]
        # Probe the hierarchy around each test word.
        for word in words:
            print("Current word: ", word)
            if word in model.kv.vocab:
                try:
                    print("Closest Parent: ", model.kv.closest_parent(word))
                    print("Closest Child ", model.kv.closest_child(word))
                    print("Descendants: ", model.kv.descendants(word))
                    print("Ancestors: ", model.kv.ancestors(word))
                    print("Hierarchy diff to Science: ",
                          model.kv.difference_in_hierarchy(word, "science"))
                    print('\n')
                except Exception:
                    # Narrowed from a bare `except:` so KeyboardInterrupt and
                    # SystemExit are no longer swallowed.
                    continue
            else:
                print("Word not in Vocab")
    if mode == "visualize_embedding_poincare":
        relations = set([])
        filename_in = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   "data/isas_1000.tsv")
        with open(filename_in, 'r') as f:
            reader = csv.reader(f, delimiter='\t')
            for i, line in enumerate(reader):
                relations.add((line[0], line[1]))
        plot = poincare_2d_visualization(model, relations, experiment_name)
        py.image.save_as(plot, "vis/" + experiment_name + '.png')
        print("Starting visualization")
    #todo own file for train
    if mode == "visualize_embedding":
        gold, relations = read_all_data()
        vectors = []
        names = []
        # Collect one vector per distinct term (columns 1 and 2 of relations).
        for relation in ([relation1[1].replace(" ", "_") for relation1 in relations]
                         + [relation2[2].replace(" ", "_") for relation2 in relations]):
            if relation not in names:
                if relation not in model.wv:
                    print(relation)
                    continue
                vectors.append(model.wv[relation])
                names.append(relation)
        visualize_taxonomy(vectors, names, experiment_name)
    if mode == 'train_poincare':
        relations = './data/poincare_common_domains.tsv'
        poincare_rel = PoincareRelations(relations)
        dim = 50
        model = PoincareModel(poincare_rel, size=dim)
        print("Starting Training...")
        model.train(epochs=400)
        model.save("embeddings/poincare_common_domains_5_3" + "_" + str(dim))
    if mode == "train_word2vec":
        gold_s, relations_s = read_all_data("science")
        gold_e, relations_e = read_all_data("environment")
        gold_f, relations_f = read_all_data("food")
        vocabulary = set([relation[2] for relation in gold_s]
                         + [relation[1] for relation in gold_s])
        vocabulary = vocabulary | set([relation[2] for relation in gold_f]
                                      + [relation[1] for relation in gold_f])
        vocabulary = vocabulary | set([relation[2] for relation in gold_e]
                                      + [relation[1] for relation in gold_e])
        documents = list(read_input(
            "/srv/data/5aly/data_text/wikipedia_utf8_filtered_20pageviews.csv",
            vocabulary))
        model = gensim.models.Word2Vec(size=300, window=5, min_count=5, workers=30)
        model.build_vocab(documents)
        model.train(documents, total_examples=model.corpus_count, epochs=30)
        model.save("embeddings/own_embeddings_w2v_all")
    elif mode == "analysis":
        gold, relations = read_all_data()
        voc_rel = set([relation[1] for relation in relations]
                      + [relation[2] for relation in relations])
        voc_gold = set([relation[1] for relation in gold]
                       + [relation[2] for relation in gold])
        print("Vokabeln in Gold: " + str(len(voc_gold))
              + "Vokabeln in Taxonomy: " + str(len(voc_rel)))
def setUp(self):
    """Load the small and large hypernym fixtures used by the tests."""
    small_path = datapath('poincare_hypernyms.tsv')
    large_path = datapath('poincare_hypernyms_large.tsv')
    self.data = PoincareRelations(small_path)
    self.data_large = PoincareRelations(large_path)
wordrank_path = 'wordrank' # path to Wordrank directory out_dir = 'model' # name of output directory to save data to data = '../../gensim/test/test_data/lee.cor' # sample corpus model = Wordrank.train(wordrank_path, data, out_dir, iter=21, dump_period=10) varembed_vectors = '../../gensim/test/test_data/varembed_leecorpus_vectors.pkl' model = varembed.VarEmbed.load_varembed_format(vectors=varembed_vectors) morfessors = '../../gensim/test/test_data/varembed_leecorpus_morfessor.bin' model = varembed.VarEmbed.load_varembed_format(vectors=varembed_vectors, morfessor_model=morfessors) import os poincare_directory = os.path.join(os.getcwd(), 'docs', 'notebooks', 'poincare') data_directory = os.path.join(poincare_directory, 'data') wordnet_mammal_file = os.path.join(data_directory, 'wordnet_mammal_hypernyms.tsv') from gensim.models.poincare import PoincareModel, PoincareKeyedVectors, PoincareRelations relations = PoincareRelations(file_path=wordnet_mammal_file, delimiter='\t') model = PoincareModel(train_data=relations, size=2, burn_in=0) model.train(epochs=1, print_every=500) models_directory = os.path.join(poincare_directory, 'models') test_model_path = os.path.join(models_directory, 'gensim_model_batch_size_10_burn_in_0_epochs_50_neg_20_dim_50') model = PoincareModel.load(test_model_path)
# Entry point: load hyperparameters from a JSON config, apply command-line
# overrides, and either resume training from a pickled model or build a new one.
# (This snippet is truncated mid-else in the visible source.)
if __name__ == "__main__":
    args = parse_arguments()
    config_path = args.config
    model_path = args.out
    hp = iom.load_json(config_path)
    # NOTE(review): data_path is read but never used below — the relations are
    # loaded from args.data instead; confirm which input should win.
    data_path = hp.get("input")
    # Command-line values override the config.
    hp["epochs"] = args.epochs
    hp["batch_size"] = args.batch_size
    relations = PoincareRelations(file_path=args.data, delimiter='\t')
    # load model and continue training
    if iom.check_exists(model_path):
        logger.info("Poincarè model found! Loading")
        model = iom.load_pickle(model_path)
    # create new model
    else:
        # get hyperparameters of model and model training
        size = hp.get("size", 100)
        nce = hp.get("nce", 20)
        burn_in = hp.get("burn_in", 10)
# Train a tiny Poincare model on the mammal hypernym pairs and draw a
# Poincare-distance heatmap.
from gensim.models.poincare import PoincareModel, PoincareKeyedVectors, PoincareRelations
from gensim.viz.poincare import poincare_distance_heatmap
from tensorflow.keras.layers import Embedding

# Hard-coded local path to the tab-separated mammal hypernym pairs.
wordnet_mamal_file_path = '/Users/pankaj/dev/git/smu/nlp337/data/mamals.tsv'
relations = PoincareRelations(wordnet_mamal_file_path, delimiter='\t')
model = PoincareModel(train_data=relations, size=2, burn_in=0)
model.train(epochs=2, print_every=500)
# NOTE(review): `pcv` is created but never used in this snippet.
pcv = PoincareKeyedVectors(vector_size=20)
# Heatmap of Poincare distances from the origin over the unit square.
poincare_distance_heatmap((0, 0), x_range=(-1.0, 1.0), y_range=(-1.0, 1.0), num_points=100)
model.kv.descendants("ไก่") # *Cleaning the structure or separating it seems to be needed* # ### visualize sample hypernyms (synsets) # In[ ]: from gensim.test.utils import datapath from gensim.models.poincare import PoincareRelations file_path = datapath("poincare_hypernyms_large.tsv") rels = PoincareRelations(file_path) for epochs in [5, 10, 20, 50, 100, 1000]: model = PoincareModel(rels, size=2) model.train(epochs=epochs) import plotly import gensim.viz.poincare plotly.offline.init_notebook_mode(connected=False) prefecutre_map = gensim.viz.poincare.poincare_2d_visualization(model=model, tree=rels, figure_title="{} epochs".format(epochs), show_node_labels=model.kv.vocab.keys()) plotly.offline.iplot(prefecutre_map)