def test_persistence_separate_file(self):
    """Tests whether the model is saved and loaded correctly when the arrays are stored separately."""
    model = PoincareModel(self.data, burn_in=0, negative=3)
    model.train(epochs=1)
    model.save(testfile(), sep_limit=1)
    loaded = PoincareModel.load(testfile())
    self.models_equal(model, loaded)
def test_persistence(self):
    """Tests whether the model is saved and loaded correctly."""
    model = PoincareModel(self.data, burn_in=0, negative=3)
    model.train(epochs=1)
    model.save(testfile())
    loaded = PoincareModel.load(testfile())
    self.models_equal(model, loaded)
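These persistence tests (and test_train_after_load further down) rely on a testfile() helper and a models_equal() assertion defined elsewhere in the test class. A minimal sketch of what they might look like, assuming unittest.TestCase and gensim's get_tmpfile; the names mirror the calls above, but the bodies are illustrative, not gensim's actual test code.

# Hypothetical helpers assumed by the persistence tests above; illustrative only.
import numpy as np
from gensim.test.utils import get_tmpfile

def testfile():
    # A single temporary path, so save() and load() within one test hit the same file.
    return get_tmpfile('gensim_poincare.tst')

class PoincareTestHelpers:
    """Hypothetical mix-in providing the equality assertion used above."""

    def models_equal(self, model, other):
        # Same vocabulary and (numerically) the same vector for every node.
        self.assertEqual(sorted(model.kv.vocab.keys()), sorted(other.kv.vocab.keys()))
        for word in model.kv.vocab:
            self.assertTrue(np.allclose(model.kv.word_vec(word), other.kv.word_vec(word)))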
def train(rels, lang=LANG, epochs=VALUE_DEFAULT_EPOCHS, epochs_load=0, size=VALUE_DEFAULT_SIZE,
          negative=VALUE_DEFAULT_NEGATIVE, memo=VALUE_DEFAULT_MEMO, burnin=None, reg=None, resume=False):
    try:
        if resume:
            # Resume training from a previously saved model.
            filename = make_filename_model(lang, epochs_load, size, negative, memo, burnin, reg)
            model = PoincareModel.load(filename)
            print("resume {}".format(filename))
        else:
            print("first training")
            raise ValueError()  # fall through to the fresh-model branch below
    except Exception:
        if resume:
            print("file not found")
        model = PoincareModel(rels, burn_in=0, regularization_coeff=0, negative=negative, size=size)
    model.train(epochs=epochs, print_every=1500)
    model.save(make_filename_model(lang, epochs + epochs_load, size, negative, memo, burnin, reg))
    return model
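train() depends on a make_filename_model() helper that is not shown. A minimal sketch of a plausible implementation; the models/ directory and the naming scheme are assumptions, not taken from the original code.

# Hypothetical sketch of the filename helper used by train(); directory and
# naming scheme are assumptions.
def make_filename_model(lang, epochs, size, negative, memo, burnin, reg):
    return "models/poincare_{lang}_e{epochs}_d{size}_n{negative}_{memo}_b{burnin}_r{reg}".format(
        lang=lang, epochs=epochs, size=size, negative=negative, memo=memo, burnin=burnin, reg=reg)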
def train_run(args):
    # Create experiment name from args, create log folder and params folder
    # from the experiment name, start logging in the exp log folder and save
    # the trained model in the exp params folder.
    exp_name = 'HB' + 'time' + str(datetime.now()) + '_EXP' + str(args.train_dir) + \
        '_prbt' + str(args.prob_threshold) + '_reg' + str(args.reg_coef) + \
        '_dim' + str(args.embed_dim) + '_lr' + str(args.learning_rate) + \
        '_neg' + str(args.negs) + '_epoc' + str(args.epochs) + '_burnin' + str(args.burn_in)
    exp_name = exp_name.replace(":", "-")
    exp_name = exp_name.replace("/", "-")
    exp_name = exp_name.replace(" ", "-")
    print(exp_name)

    # Training logs folder
    exp_log_folder = args.log_folder + exp_name + '/'
    if not os.path.exists(exp_log_folder):
        os.makedirs(exp_log_folder)
    logging_file = exp_log_folder + 'logging.txt'
    logging.basicConfig(filename=logging_file, level=logging.INFO)

    # Model saving folder
    exp_params_folder = args.params_folder + exp_name + '/'
    if not os.path.exists(exp_params_folder):
        os.makedirs(exp_params_folder)

    training_file = args.train_dir + args.trn_file
    trn_dataset = data_loader.get_data_list(training_file, args.prob_threshold)
    print("Number of training examples: ", len(trn_dataset))

    # Create the model definition
    model = PoincareModel(train_data=trn_dataset,
                          size=args.embed_dim,
                          alpha=args.learning_rate,
                          negative=args.negs,
                          regularization_coeff=args.reg_coef,
                          burn_in=args.burn_in,
                          burn_in_alpha=args.burn_in_alpha,
                          init_range=args.init_range,
                          seed=args.random_seed)

    # Start the model training
    model.train(epochs=args.epochs, batch_size=args.batch_size, print_every=args.print_every)

    # Save the model
    model_save_name = exp_params_folder + 'gensim_model.params'
    model.save(model_save_name)

    # Save the arguments in the params folder
    args_fname = exp_params_folder + 'args_model.pkl'
    with open(args_fname, "wb") as f:
        pickle.dump(args, f)
    return
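train_run() expects an args namespace carrying the fields it reads. One way to drive it, sketched with argparse.Namespace; only the field names come from the function body, every value is a placeholder (the numeric defaults happen to match gensim's PoincareModel defaults).

# Hypothetical driver for train_run(); all values below are placeholders.
from argparse import Namespace

args = Namespace(
    train_dir='data/', trn_file='train.tsv', prob_threshold=0.5,
    log_folder='logs/', params_folder='params/',
    embed_dim=50, learning_rate=0.1, negs=10, reg_coef=1.0,
    burn_in=10, burn_in_alpha=0.01, init_range=(-0.001, 0.001),
    random_seed=42, epochs=50, batch_size=10, print_every=500)
train_run(args)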
def test_train_after_load(self):
    """Tests whether the model can be trained correctly after loading from disk."""
    model = PoincareModel(self.data, burn_in=0, negative=3)
    model.train(epochs=1)
    model.save(testfile())
    loaded = PoincareModel.load(testfile())
    model.train(epochs=1)
    loaded.train(epochs=1)
    self.models_equal(model, loaded)
def hello_world():
    relations = [('math', 'science'), ('cs', 'science'), ('ml', 'cs'),
                 ('db', 'cs'), ('linalg', 'math')]
    model = PoincareModel(relations, size=8, negative=2)
    model.train(epochs=50)

    # Poincare distance between two entities
    print(model.kv.distance('ml', 'db'))

    # Compute absolute position in hierarchy of input node or vector.
    # Values range between 0 and 1. A lower value indicates the input
    # node or vector is higher in the hierarchy.
    print(model.kv.norm('ml'))

    # Get the vectors
    print(model.kv.get_vector('ml'))

    model.save('test_embeddings.bin')
    model.kv.save_word2vec_format('test_embeddings.w2v')
    return
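The two files written at the end of hello_world() can be reloaded independently. A short sketch, assuming the file names used above and gensim's documented load and query calls.

# Reload the full model and the plain-text vectors written by hello_world().
from gensim.models.poincare import PoincareModel, PoincareKeyedVectors

model = PoincareModel.load('test_embeddings.bin')
kv = PoincareKeyedVectors.load_word2vec_format('test_embeddings.w2v')

# Nearest entities to 'ml' by Poincare distance.
print(model.kv.most_similar('ml', topn=3))
# Parent/child queries on the reloaded vectors.
print(kv.closest_parent('ml'))
print(kv.descendants('cs'))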
def main(poincare=''):
    from gensim.models.poincare import PoincareModel

    # Start from an empty model and attach externally trained vectors to it.
    pm = PoincareModel([], size=300, dtype=np.float64)
    emb = PoincareKeyedVectors.load_word2vec_format(
        poincare, fvocab=None, binary=False, encoding='utf8',
        unicode_errors='strict', limit=None, datatype=np.float64)
    pm.kv = emb
    pm.save('w2v_poincare.pickle', pickle_protocol=4)

    # Reload and continue training from the attached vectors.
    pm2 = PoincareModel.load('w2v_poincare.pickle')
    pm2.train(10000, batch_size=10, print_every=1, check_gradients_every=None)
    pm2.save('w2v_poincare_after_train.pickle', pickle_protocol=4)
def run(mode, language):
    if mode == "train_poincare_custom":
        gold_s, _ = read_all_data(domain="science", language=language)
        gold_e, _ = read_all_data(domain="environment", language=language)
        gold_f, _ = read_all_data(domain="food", language=language)
        vocabulary = set([relation[0].lower() for relation in gold_s] + [relation[1].lower() for relation in gold_s])
        vocabulary = vocabulary | set([relation[0].lower() for relation in gold_f] + [relation[1].lower() for relation in gold_f])
        vocabulary = vocabulary | set([relation[0].lower() for relation in gold_e] + [relation[1].lower() for relation in gold_e])
        relations = "data/" + language + "/poincare_common_and_domains_" + language + ".tsv"
        assert len(open(relations, 'r').readlines()) > 10, "Not enough relations to train embeddings. Aborting ..."
        poincare_rel = PoincareRelations(relations)
        dim = 50
        model = PoincareModel(poincare_rel, size=dim)
        print("Starting Training...")
        model.train(epochs=400)
        model.save("embeddings/poincare_common_and_domains_5_3_" + language + "_" + str(dim))

    if mode == 'train_poincare_wordnet':
        assert language == 'EN', "Wordnet consists only of English nouns"
        gold_s, _ = read_all_data(domain="science")
        gold_e, _ = read_all_data(domain="environment")
        gold_f, _ = read_all_data(domain="food")
        vocabulary = set([relation[0].lower() for relation in gold_s] + [relation[1].lower() for relation in gold_s])
        vocabulary = vocabulary | set([relation[0].lower() for relation in gold_f] + [relation[1].lower() for relation in gold_f])
        vocabulary = vocabulary | set([relation[0].lower() for relation in gold_e] + [relation[1].lower() for relation in gold_e])
        preprocess_wordnet('data/EN/noun_closure.tsv', vocabulary)
        poincare_rel = PoincareRelations('data/EN/noun_closure_filtered.tsv')
        dim = 50
        model = PoincareModel(poincare_rel, size=dim)
        print("Starting Training...")
        model.train(epochs=400)
        model.save("embeddings/wordnet_filtered" + "_" + str(dim))

    if mode == "train_word2vec":
        gold_s, relations_s = read_all_data("science")
        gold_e, relations_e = read_all_data("environment")
        gold_f, relations_f = read_all_data("food")
        vocabulary = set([relation[0].lower() for relation in gold_s] + [relation[1].lower() for relation in gold_s])
        vocabulary = vocabulary | set([relation[0].lower() for relation in gold_f] + [relation[1].lower() for relation in gold_f])
        vocabulary = vocabulary | set([relation[0].lower() for relation in gold_e] + [relation[1].lower() for relation in gold_e])
        documents = list(read_input("/data/EN/wikipedia_utf8_filtered_20pageviews.csv", vocabulary))
        model = gensim.models.Word2Vec(documents, size=300, window=10, min_count=2, workers=10)
        model.train(documents, total_examples=len(documents), epochs=30)
        print("Finished building word2vec model")
        model.save("embeddings/own_embeddings_w2v")
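PoincareRelations streams one child/parent pair per line from a delimiter-separated file, which is the format the TSV paths above are expected to follow. A tiny self-contained sketch of that format; the file name and the toy relations are made up.

# Minimal sketch of the TSV format PoincareRelations expects:
# one hyponym<TAB>hypernym pair per line. The file name is made up.
from gensim.models.poincare import PoincareModel, PoincareRelations

with open('toy_relations.tsv', 'w', encoding='utf8') as f:
    f.write("kangaroo\tmarsupial\n")
    f.write("marsupial\tmammal\n")
    f.write("dog\tmammal\n")
    f.write("cat\tmammal\n")
    f.write("mammal\tanimal\n")

relations = PoincareRelations('toy_relations.tsv', delimiter='\t')
model = PoincareModel(relations, size=10, negative=2)
model.train(epochs=20)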
from gensim.models.poincare import PoincareModel, PoincareRelations
from gensim.test.utils import datapath
from utils import Parameters
import pandas as pd

data_path = datapath("D:/PyCharm/PyCharm_Project/paper/data/type_relation.tsv")
type_embedding_path = "data/type_embedding"

model = PoincareModel(train_data=PoincareRelations(data_path, encoding="gbk"),
                      size=Parameters.type_embedding_dim, negative=3)
model.train(epochs=50, print_every=5)
print(model.kv.word_vec("川菜"))
model.save(type_embedding_path)

# poincareModel = PoincareModel.load("data/type_embedding")
# print(poincareModel.kv.word_vec('东北菜'))
path = "/path/to/training_set.txt" # poincare_dict.txt path_lexical = "path/to/lexical.txt" # example file_lexical_fon.txt following the HyperLex Format path_validation = "/file/to/validation_set.txt" # validation set poincare_embedding_validation.txt relations_ = load_doc(path) # parameters size = 15 # dimension of the embedding space c = 15 # constant of negative curvature epochs = 2000 # number of training epochs # define the model model = PoincareModel(relations_, size=size, negative=c) model.train(epochs) # save the model model.save('/path/to/model') # save model embedding model.kv.save_word2vec_format("/path/to/embedding") # load the model and the embedding model = PoincareModel.load("/path/to/model") model.kv.load_word2vec_format("/path/to/embedding") all_relations = set(relations_) # add different classes to the labels to add them to the graph labels = list(set([_[0] for _ in relations_])) + ["girl_name", "boy_name", "mixed_name", "body_part", "benin_city"] title = "Title Figure" fig = poincare_2d_visualization(new_model_10, all_relations, title, show_node_labels=labels) plt.image.ishow(fig, width=1000, height=1000)
logging.basicConfig(level=logging.INFO)

DATASET_PATH = settings.AMAZON_REVIEWS_CELL_PHONES_AND_ACCESSORIES_DATASET_JSON
aspect_analysis_gerani = AspectAnalysis(
    input_path=DATASET_PATH.as_posix(),
    output_path=settings.DEFAULT_OUTPUT_PATH / DATASET_PATH.stem,
    experiment_name='gerani',
    max_docs=50000)

discourse_tree_df = pd.read_pickle(aspect_analysis_gerani.paths.discourse_trees_df)

relations = []
for row_id, row in tqdm(discourse_tree_df.iterrows(), total=len(discourse_tree_df),
                        desc='Generating aspect-aspect graph based on rules'):
    for edu_left, edu_right, relation, weight in row.rules:
        for aspect_left, aspect_right in product(row.aspects[edu_left], row.aspects[edu_right]):
            relations.append((aspect_left, aspect_right))

model = PoincareModel(train_data=relations, size=2, burn_in=0)
model.train(epochs=100, print_every=500)
model.save(aspect_analysis_gerani.paths.aspects_poincare_embeddings)
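Once saved, the 2-d aspect embeddings can be reloaded and queried like any Poincare model. A brief sketch; 'battery' is a placeholder aspect that would have to appear in relations.

# Brief sketch: reload the saved aspect embeddings and query them.
# 'battery' is a placeholder aspect; use any aspect seen in `relations`.
model = PoincareModel.load(aspect_analysis_gerani.paths.aspects_poincare_embeddings)
print(model.kv.most_similar('battery', topn=5))  # closest aspects by Poincare distance
print(model.kv.norm('battery'))                  # lower norm = more general aspect in the hierarchy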
def run(mode, embedding, embedding_name, experiment_name=None, log=False, trial=False):
    if embedding == "fasttext":
        #model = gensim.models.KeyedVectors.load_word2vec_format('wiki-news-300d-1M-subword.vec', binary=False)
        model = gensim.models.FastText.load_fasttext_format('wiki.en.bin')
        #model = gensim.models.FastText.load_fasttext_format('crawl-300d-2M.vec')
    elif embedding == "wiki2M":
        #model = gensim.models.FastText.load_fasttext_format('crawl-300d-2M.vec','vec')
        model = gensim.models.KeyedVectors.load_word2vec_format(
            'embeddings/crawl-300d-2M.vec', binary=False)
        #model.save("crawl-300d-2M.bin")
    elif embedding == "wiki1M_subword":
        model = gensim.models.KeyedVectors.load_word2vec_format(
            'embeddings/wiki-news-300d-1M-subword.vec', binary=False)
    elif embedding == "own_w2v":
        model = gensim.models.KeyedVectors.load('embeddings/own_embeddings_w2v')
    elif embedding == "quick":
        model = gensim.models.KeyedVectors.load_word2vec_format(
            'embeddings/crawl-300d-2M.vec', binary=False, limit=50000)
    elif embedding == "poincare":
        model = PoincareModel.load('embeddings/poincare_common_domains02_5_3_50')
        print(len(model.kv.vocab))
        words = [
            "computer_science", "biology", "physics", "science", "virology",
            "life_science", "chemistry", "earth_science", "algebra",
            "economics", "optics", "immunology"
        ]
        for word in words:
            print("Current word: ", word)
            if word in model.kv.vocab:
                try:
                    print("Closest Parent: ", model.kv.closest_parent(word))
                    print("Closest Child: ", model.kv.closest_child(word))
                    print("Descendants: ", model.kv.descendants(word))
                    print("Ancestors: ", model.kv.ancestors(word))
                    print("Hierarchy diff to Science: ",
                          model.kv.difference_in_hierarchy(word, "science"))
                    print('\n')
                except Exception:
                    continue
            else:
                print("Word not in Vocab")

    if mode == "visualize_embedding_poincare":
        relations = set([])
        filename_in = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data/isas_1000.tsv")
        with open(filename_in, 'r') as f:
            reader = csv.reader(f, delimiter='\t')
            for i, line in enumerate(reader):
                relations.add((line[0], line[1]))
        plot = poincare_2d_visualization(model, relations, experiment_name)
        py.image.save_as(plot, "vis/" + experiment_name + '.png')
        print("Starting visualization")
        #visualize_taxonomy(vectors, names)

    #todo own file for train
    if mode == "visualize_embedding":
        gold, relations = read_all_data()
        vectors = []
        names = []
        for relation in ([relation1[1].replace(" ", "_") for relation1 in relations]
                         + [relation2[2].replace(" ", "_") for relation2 in relations]):
            if relation not in names:
                if relation not in model.wv:
                    print(relation)
                    continue
                vectors.append(model.wv[relation])
                names.append(relation)
        visualize_taxonomy(vectors, names, experiment_name)

    if mode == 'train_poincare':
        # gold,relations = read_all_data()
        # freq_science = [3,5]
        # for entry_science in freq_science:
        #     relations = './data/' + domain + '_crawl_' + str(entry_science) + '.tsv'
        #     #relations = './data/science_crawl_merge_10_3_02.tsv'
        #     poincare_rel = PoincareRelations(relations)
        #     dim = 50
        #     model = PoincareModel(poincare_rel, size=dim)
        #     print("Starting Training...")
        #     model.train(epochs=400)
        #     model.save("embeddings/embeddings_" + domain + "_crawl_poincare_" + str(entry_science) + "_" + str(dim))
        #     #model.save("embeddings/embeddings_science_crawl_merge_poincare_10_3_50_02")
        #     break
        relations = './data/poincare_common_domains.tsv'
        #relations = './data/science_crawl_merge_10_3_02.tsv'
        poincare_rel = PoincareRelations(relations)
        dim = 50
        model = PoincareModel(poincare_rel, size=dim)
        print("Starting Training...")
        model.train(epochs=400)
model.save("embeddings/poincare_common_domains_5_3" + "_" + str(dim)) if mode == "train_word2vec": gold_s, relations_s = read_all_data("science") gold_e, relations_e = read_all_data("environment") gold_f, relations_f = read_all_data("food") vocabulary = set([relation[2] for relation in gold_s] + [relation[1] for relation in gold_s]) vocabulary = vocabulary | set([relation[2] for relation in gold_f] + [relation[1] for relation in gold_f]) vocabulary = vocabulary | set([relation[2] for relation in gold_e] + [relation[1] for relation in gold_e]) documents = list( read_input( "/srv/data/5aly/data_text/wikipedia_utf8_filtered_20pageviews.csv", vocabulary)) model = gensim.models.Word2Vec(size=300, window=5, min_count=5, workers=30) model.build_vocab(documents) #model.train(documents, total_examples = len(documents), epochs=10) model.train(documents, total_examples=model.corpus_count, epochs=30) model.save("embeddings/own_embeddings_w2v_all") elif mode == "analysis": gold, relations = read_all_data() voc_rel = set([relation[1] for relation in relations] + [relation[2] for relation in relations]) voc_gold = set([relation[1] for relation in gold] + [relation[2] for relation in gold]) print("Vokabeln in Gold: " + str(len(voc_gold)) + "Vokabeln in Taxonomy: " + str(len(voc_rel)))