def train_embeddings(
        input_path,
        delimiter,
        output_path,
        size=2,
        alpha=0.1,
        burn_in=10,
        burn_in_alpha=0.01,
        workers=1,
        negative=10,
        epochs=100,
        print_every=500,
        batch_size=10):
    """Train Poincare embeddings from an edge-relation file and save the vectors.

    Args:
        input_path: Path to the input file of edge relations.
        delimiter: Delimiter used in the input file.
        output_path: Path the embedding vectors are written to (word2vec format).
        size: Embedding dimension.
        alpha: Learning rate.
        burn_in: Number of burn-in training rounds.
        burn_in_alpha: Learning rate used during burn-in.
        workers: Number of training threads.
        negative: Negative sample size.
        epochs: Number of training rounds.
        print_every: Interval at which training info is printed.
        batch_size: Number of samples per batch.

    Returns:
        None.
    """
    # Edge relations between entities, read from the input file
    edge_relations = PoincareRelations(file_path=input_path, delimiter=delimiter)

    model_options = dict(
        train_data=edge_relations,
        size=size,
        alpha=alpha,
        burn_in=burn_in,
        burn_in_alpha=burn_in_alpha,
        workers=workers,
        negative=negative,
    )
    trainer = PoincareModel(**model_options)
    trainer.train(epochs=epochs, print_every=print_every, batch_size=batch_size)

    # Persist the learned vectors
    trainer.kv.save_word2vec_format(output_path)
def poincare(relations_file, key_epochs):
    """Train a 2-D Poincare embedding from a JSON relation file.

    The JSON document is read as a sequence of records. Every record whose
    element at index 3 is not the empty string contributes the pair
    ``(record[0], record[3])`` as one training relation.

    Args:
        relations_file: Path to the JSON relation file.
        key_epochs: Number of training epochs.

    Returns:
        A ``(vec, model)`` tuple: ``vec`` maps each vocabulary term to its
        embedding vector, ``model`` is the trained ``PoincareModel``.
    """
    # JSON text is UTF-8 by specification; do not depend on the platform's
    # default encoding when decoding it.
    with open(relations_file, encoding="utf-8") as f:
        records = json.load(f)
    relations = [(record[0], record[3]) for record in records if record[3] != ""]

    print(datetime.datetime.now(), "---poincare embedding Start", location())
    # workers=1 together with a fixed seed keeps the result identical for
    # identical input.
    model = PoincareModel(train_data=relations, size=2, negative=8, workers=1, seed=1)
    model.train(epochs=key_epochs)
    print(datetime.datetime.now(), "---poincare embedding End", location())

    # {term: coordinate vector}
    vec = {word: model.kv.get_vector(word) for word in model.kv.vocab.keys()}
    print(datetime.datetime.now(), "---model.kv.vocab.keys End", location())

    # Plotting is disabled; it previously called
    # gensim.viz.poincare.poincare_2d_visualization(model=model, tree=relations,
    # figure_title="tutorial", show_node_labels=model.kv.vocab.keys())
    # followed by offline.plot(poincare_map).
    return vec, model
def test_handle_duplicates(self):
    """Tests that updates for a repeated node index are merged into its last occurrence."""
    vector_updates = np.array([[0.5, 0.5], [0.1, 0.2], [0.3, -0.2]])
    node_indices = [0, 1, 0]  # node 0 appears in rows 0 and 2
    # The call is checked through its in-place effect on `vector_updates`.
    PoincareModel._handle_duplicates(vector_updates, node_indices)
    # Row 0 is expected to be zeroed; row 2 is expected to hold the sum of
    # both updates for node 0.
    vector_updates_expected = np.array([[0.0, 0.0], [0.1, 0.2], [0.8, 0.3]])
    # The expected values are floating-point sums, so compare with a
    # tolerance instead of bit-for-bit equality.
    self.assertTrue(np.allclose(vector_updates, vector_updates_expected))
def test_burn_in_only_done_once(self):
    """Checks that a second call to ``train`` does not run burn-in again."""
    poincare = PoincareModel(self.data, negative=3, burn_in=1)

    # Both calls use epochs=0, so only burn-in could move the vectors.
    poincare.train(epochs=0)
    vectors_after_first_call = np.copy(poincare.kv.syn0)

    poincare.train(epochs=0)
    unchanged = np.allclose(poincare.kv.syn0, vectors_after_first_call)
    self.assertTrue(unchanged)
def test_gradients_check(self):
    """Checks that training succeeds with ``check_gradients_every=1``."""
    poincare = PoincareModel(self.data, negative=3)
    train_options = dict(epochs=1, batch_size=1, check_gradients_every=1)
    try:
        poincare.train(**train_options)
    except Exception as err:
        # Turn any training error into a regular test failure
        message = 'Exception %s raised unexpectedly while training with gradient checking' % repr(err)
        self.fail(message)
def test_wrong_gradients_raises_assertion(self):
    """Checks that a gradient mismatch during training raises ``AssertionError``."""
    poincare = PoincareModel(self.data, negative=3)

    # Stub the gradient function so that it always returns zeros
    zero_gradient = np.zeros((2 + poincare.negative, poincare.size))
    poincare._loss_grad = Mock(return_value=zero_gradient)

    with self.assertRaises(AssertionError):
        poincare.train(epochs=1, batch_size=1, check_gradients_every=1)
def test_invalid_data_raises_error(self):
    """Checks that malformed training data is rejected with ``ValueError``."""
    malformed_inputs = (
        [("a", "b", "c")],
        ["a", "b", "c"],
        "ab",
    )
    for bad_data in malformed_inputs:
        with self.assertRaises(ValueError):
            PoincareModel(bad_data)
def get_poincare_model(relations, emb_size, num_threads=1):
    """Fit a Poincare model on ``relations`` for 50 epochs and print the training time.

    Args:
        relations: Training relations; must support ``len()``.
        emb_size: Embedding dimension passed to the model as ``size``.
        num_threads: Accepted but not used by this function.

    Returns:
        The trained ``PoincareModel``.
    """
    print('Learning Poincare embeddings with %d relations' % len(relations))
    poincare = PoincareModel(relations, size=emb_size, negative=2)

    started_at = datetime.now()
    poincare.train(epochs=50)
    elapsed = datetime.now() - started_at

    print('Training time: %s' % elapsed)
    return poincare
def test_online_learning(self):
    """Checks that a vocabulary update raises the counts of the nodes it mentions."""
    poincare = PoincareModel(self.data, burn_in=0, negative=3)

    def count_of(node):
        # Look the attribute up on the model each time rather than caching `kv`
        return poincare.kv.get_vecattr(node, 'count')

    self.assertEqual(len(poincare.kv), 7)
    self.assertEqual(count_of('kangaroo.n.01'), 3)
    self.assertEqual(count_of('cat.n.01'), 1)

    # Add one more relation between two already-known nodes
    poincare.build_vocab([('kangaroo.n.01', 'cat.n.01')], update=True)

    self.assertEqual(count_of('kangaroo.n.01'), 4)
    self.assertEqual(count_of('cat.n.01'), 2)
def test_no_duplicates_and_positives_in_negative_sample(self):
    """Checks that negative samples contain neither repeats nor positively related nodes."""
    poincare = PoincareModel(self.data_large, negative=3)
    related_to_node_0 = poincare.node_relations[0]

    # Sampling is repeated many times
    repetitions = 100
    for _ in range(repetitions):
        sampled = poincare._sample_negatives(0)
        unique_sampled = set(sampled)
        self.assertFalse(related_to_node_0 & unique_sampled)
        self.assertEqual(len(sampled), len(unique_sampled))
def embedding(namespace, emb_fname):
    """Train and save Poincare embeddings for one namespace of ``go_graph``.

    Works on a copy of the module-level ``go_graph``: every node whose
    ``'namespace'`` attribute differs from ``namespace`` is removed, and the
    edges that remain are used as training relations. The embedding dimension
    and the number of epochs come from the module-level ``dim`` and
    ``num_epochs``.

    Args:
        namespace: Value of the node attribute ``'namespace'`` to keep.
        emb_fname: Path the keyed vectors are saved to.

    Returns:
        The trained model's keyed vectors (``model.kv``).
    """
    graph = go_graph.copy()

    # Use the public node view rather than the graph's private ``_node`` dict.
    foreign_nodes = [
        node
        for node, attr in go_graph.nodes(data=True)
        if attr['namespace'] != namespace
    ]
    graph.remove_nodes_from(foreign_nodes)

    model = PoincareModel(train_data=graph.edges(), size=dim)
    model.train(epochs=num_epochs, print_every=500)
    model.kv.save(emb_fname)
    return model.kv
def test_vector_dtype(self):
    """Checks that the vectors have the requested dtype before and after training."""
    expected_dtype = np.float32
    poincare = PoincareModel(self.data_large, dtype=expected_dtype, burn_in=0, negative=3)
    self.assertEqual(poincare.kv.syn0.dtype, expected_dtype)

    poincare.train(epochs=1)
    self.assertEqual(poincare.kv.syn0.dtype, expected_dtype)
def poincare_disk_model(relations, dimension=2, workers=1, negative_sample=2, batch_number=10):
    """Build a Poincare model on ``relations`` and train it for 50 epochs.

    Args:
        relations: Training relations handed to ``PoincareModel``.
        dimension: Embedding size.
        workers: Passed through to ``PoincareModel`` as ``workers``.
        negative_sample: Passed through to ``PoincareModel`` as ``negative``.
        batch_number: Accepted but not used; the batch size is fixed at 100000.

    Returns:
        The trained ``PoincareModel``.
    """
    print("poincare ball model initialization")
    ball_model = PoincareModel(
        relations,
        negative=negative_sample,
        size=dimension,
        workers=workers,
    )

    print("start poincare ball model training")
    ball_model.train(epochs=50, print_every=1000, batch_size=100000)
    return ball_model
def test_train_after_load(self):
    """Checks that a model reloaded from disk trains to the same state as the original."""
    original = PoincareModel(self.data, burn_in=0, negative=3)
    original.train(epochs=1)
    original.save(testfile())
    restored = PoincareModel.load(testfile())

    # One further epoch on each copy; they must still agree afterwards.
    original.train(epochs=1)
    restored.train(epochs=1)
    self.models_equal(original, restored)
def test_reproducible(self):
    """Checks that two independent models trained with the same seed end up with the same vectors."""
    def trained_vectors():
        # Fresh model, fixed seed, two epochs
        poincare = PoincareModel(self.data_large, seed=1, negative=3, burn_in=1)
        poincare.train(epochs=2)
        return poincare.kv.syn0

    first_run = trained_vectors()
    second_run = trained_vectors()
    self.assertTrue(np.allclose(first_run, second_run))
def load_embeddings(include_co, exclude_parent, wordnet, domain, language='EN'):
    """Load the pre-trained embedding models selected by the flags.

    Args:
        include_co: If true, load the word2vec vectors from
            ``embeddings/own_embeddings_w2v_all`` (English only).
        exclude_parent: If false, load a Poincare model.
        wordnet: If true, load ``embeddings/wordnet_filtered_50`` (English
            only); otherwise load the per-language model
            ``embeddings/poincare_common_and_domains_5_3_<language>_50``.
        domain: Not used by the current implementation.
        language: Language code; the per-language Poincare model accepts
            'EN', 'FR', 'IT' and 'NL'.

    Returns:
        ``[model, model_poincare]``; an entry is ``None`` when that model was
        not requested.

    Raises:
        SystemExit: With status 1 when an English-only model is requested for
            another language.
        AssertionError: When the per-language Poincare model is requested for
            an unsupported language.
    """
    english_only_msg = (
        "There is no wordnet poincaré model for a non-english language\nAbort..."
    )
    model = None
    model_poincare = None

    if include_co:
        if language != 'EN':
            # NOTE(review): this branch loads word2vec vectors, yet the message
            # talks about the Poincare model - confirm the intended wording.
            print(english_only_msg)
            # Abort with a failure status so callers/shells can detect it.
            sys.exit(1)
        model = gensim.models.KeyedVectors.load(
            'embeddings/own_embeddings_w2v_all')
        print("Word2vec vocab size", len(model.wv.vocab))

    if not exclude_parent:
        if wordnet:
            if language != 'EN':
                print(english_only_msg)
                sys.exit(1)
            model_poincare = PoincareModel.load(
                'embeddings/wordnet_filtered_50')
        else:
            assert language in ['EN', 'FR', 'IT', 'NL'], "Language not supported. Aborting..."
            model_poincare = PoincareModel.load(
                'embeddings/poincare_common_and_domains_5_3_' + language + '_50')
        # Only reached after a Poincare model has actually been loaded.
        print("Poincare vocab size", len(model_poincare.kv.vocab))

    return [model, model_poincare]
def test_persistence_old_model(self):
    """Checks that a model saved by an older gensim version loads with the expected contents."""
    restored = PoincareModel.load(datapath('poincare_test_3.4.0'))

    expected_nodes, expected_dim, expected_relations = 239, 2, 200
    self.assertEqual(restored.kv.syn0.shape, (expected_nodes, expected_dim))
    self.assertEqual(len(restored.kv.vocab), expected_nodes)
    self.assertEqual(restored.size, expected_dim)
    self.assertEqual(len(restored.all_relations), expected_relations)
def test_data_counts_with_bytes(self):
    """Checks that relations given as bytes are loaded correctly and completely."""
    first_pair = (b'\x80\x01c', b'\x50\x71a')
    second_pair = (b'node.1', b'node.2')
    poincare = PoincareModel([first_pair, second_pair])

    self.assertEqual(len(poincare.all_relations), 2)
    source_index = poincare.kv.get_index(first_pair[0])
    self.assertEqual(len(poincare.node_relations[source_index]), 1)
    self.assertEqual(len(poincare.kv), 4)
    self.assertNotIn(first_pair[1], poincare.node_relations)
def load_poincare_embeddings(file_name):
    """
    Load the pre-trained embeddings from a file

    Each vocabulary term is reduced to ASCII, lower-cased, and has its
    underscores replaced by spaces before being collected.

    :param file_name: the embeddings file
    :return: the vocabulary and the word vectors, as produced by
        ``prepare_embeddings``
    """
    model = PoincareModel.load(file_name)
    vocab = model.kv.vocab
    vocab_size = len(vocab)
    # Report progress about ten times. Integer division keeps the step an int,
    # and max() keeps it non-zero for vocabularies smaller than ten terms.
    report_every = max(1, vocab_size // 10)

    words = []
    vectors = []
    dim_size = EMBEDDINGS_DIM
    print("Loading %s poincare embeddings..." % vocab_size)
    for i, term in enumerate(vocab):
        # Drop non-ASCII characters, then decode back to text so that the
        # str methods below receive str arguments on a str object.
        ascii_term = term.encode("ascii", errors="ignore").decode("ascii")
        words.append(ascii_term.lower().replace("_", " "))
        vector = model.kv.get_vector(term)
        dim_size = max(EMBEDDINGS_DIM, vector.size)
        vectors.append(vector_to_str(vector))
        if (i + 1) % report_every == 0:
            # i is zero-based; i + 1 terms have been processed at this point
            print(" %s / %s" % (i + 1, vocab_size))
    print("Finished loading poincare embeddings.")
    return prepare_embeddings(words, vectors, dim_size)
def test_data_counts(self):
    """Checks that the training data is loaded correctly and completely."""
    poincare = PoincareModel(self.data)

    self.assertEqual(len(poincare.all_relations), 5)
    kangaroo_index = poincare.kv.get_index('kangaroo.n.01')
    self.assertEqual(len(poincare.node_relations[kangaroo_index]), 3)
    self.assertEqual(len(poincare.kv), 7)
    self.assertNotIn('mammal.n.01', poincare.node_relations)
def test_persistence(self):
    """Checks that a saved model is equal to the original after loading."""
    original = PoincareModel(self.data, burn_in=0, negative=3)
    original.train(epochs=1)

    # Round-trip through disk
    original.save(testfile())
    restored = PoincareModel.load(testfile())

    self.models_equal(original, restored)
def train(rels, lang=LANG, epochs=VALUE_DEFAULT_EPOCHS, epochs_load=0, size=VALUE_DEFAULT_SIZE, negative=VALUE_DEFAULT_NEGATIVE, memo=VALUE_DEFAULT_MEMO, burnin=None, reg=None, resume=False):
    """Train (or resume training of) a Poincare model and save it.

    With ``resume`` set, the model stored under the file name built from
    ``epochs_load`` is loaded; if loading fails, a fresh model is created
    instead. The model is then trained for ``epochs`` epochs and saved under
    the file name built from ``epochs + epochs_load``.

    Args:
        rels: Training relations for a freshly created model.
        lang, memo: Only used to build the model file name.
        epochs: Number of epochs to train in this call.
        epochs_load: Epoch count of the stored model to resume from.
        size: Embedding size of a freshly created model.
        negative: Negative sample count of a freshly created model.
        burnin, reg: Only used to build the model file name; a freshly created
            model always uses ``burn_in=0`` and ``regularization_coeff=0``.
        resume: Whether to try loading a stored model first.

    Returns:
        The trained ``PoincareModel``.
    """
    model = None
    if resume:
        try:
            filename = make_filename_model(lang, epochs_load, size, negative, memo, burnin, reg)
            model = PoincareModel.load(filename)
            print("resume {}".format(filename))
        except Exception:
            # Best effort: any load failure falls back to a fresh model.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print("file not found")
            model = None
    else:
        print("first training")

    if model is None:
        model = PoincareModel(rels, burn_in=0, regularization_coeff=0, negative=negative, size=size)

    model.train(epochs=epochs, print_every=1500)
    model.save(make_filename_model(lang, epochs + epochs_load, size, negative, memo, burnin, reg))
    return model
def test_persistence_separate_file(self):
    """Checks the save/load round trip when the model is saved with ``sep_limit=1``."""
    original = PoincareModel(self.data, burn_in=0, negative=3)
    original.train(epochs=1)

    original.save(testfile(), sep_limit=1)
    restored = PoincareModel.load(testfile())

    self.models_equal(original, restored)
def test_training_multiple(self):
    """Checks that a further ``train`` call changes the vectors, and that ``epochs=0`` does not."""
    poincare = PoincareModel(self.data_large, burn_in=0, negative=3)
    poincare.train(epochs=2)

    # One more epoch must move the vectors
    snapshot = np.copy(poincare.kv.syn0)
    poincare.train(epochs=1)
    self.assertFalse(np.allclose(snapshot, poincare.kv.syn0))

    # Zero epochs must leave them as they are
    snapshot = np.copy(poincare.kv.syn0)
    poincare.train(epochs=0)
    self.assertTrue(np.allclose(snapshot, poincare.kv.syn0))
def test_poincare(self):
    """Trains a Poincare model on a 100-document citation graph and prints its vector size."""
    doc_count = 100
    citations, _ = self.get_citation_graph(doc_count)

    model_options = dict(
        size=300,
        alpha=0.1,
        negative=10,
        workers=1,
        epsilon=1e-05,
        regularization_coeff=1.0,
        burn_in=10,
        burn_in_alpha=0.01,
        init_range=(-0.001, 0.001),
    )
    poincare_model = PoincareModel(citations, **model_options)
    poincare_model.train(epochs=2)

    print(poincare_model.kv.vector_size)
def load_vectors(embedding_dir='/home/5aly/taxi/distributed_semantics/embeddings/'):
    """
    Load word vectors.

    :param embedding_dir: directory containing the files
        ``embeddings_poincare_wordnet`` and ``own_embeddings_w2v``; defaults
        to the location that was previously hard-coded
    :return: tuple ``(poincare_model, own_model)``
    """
    # Function-scope import keeps this block self-contained.
    import os

    # parent-cluster relationship
    poincare_model = PoincareModel.load(
        os.path.join(embedding_dir, 'embeddings_poincare_wordnet'))
    # family-cluster relationship
    own_model = gensim.models.KeyedVectors.load(
        os.path.join(embedding_dir, 'own_embeddings_w2v'))
    return poincare_model, own_model
def train_poincare_model(edgelist, dim=2, reg=0, nepochs=300):
    """Create a Poincare embedding model for ``edgelist`` and train it.

    Args:
        edgelist: Training relations handed to ``PoincareModel``.
        dim: Embedding size.
        reg: Regularization coefficient.
        nepochs: Number of training epochs.

    Returns:
        The trained ``PoincareModel``.
    """
    poincare = PoincareModel(edgelist, size=dim, regularization_coeff=reg)
    poincare.train(epochs=nepochs)
    return poincare
def train_run(args):
    """Run one training experiment described by ``args``.

    Steps, in order:
      1. Derive an experiment name from the current time and the arguments.
      2. Create the experiment's log folder and send logging to
         ``logging.txt`` inside it.
      3. Create the experiment's params folder.
      4. Load the training data, build the model and train it.
      5. Save the model and a pickle of ``args`` in the params folder.

    Returns:
        None.
    """
    def _ensure_folder(path):
        # Create the folder (with parents) unless the path already exists
        if not os.path.exists(path):
            os.makedirs(path)

    name_parts = [
        'HB', 'time', str(datetime.now()),
        '_EXP', str(args.train_dir),
        '_prbt', str(args.prob_threshold),
        '_reg', str(args.reg_coef),
        '_dim', str(args.embed_dim),
        '_lr', str(args.learning_rate),
        '_neg', str(args.negs),
        '_epoc', str(args.epochs),
        '_burnin', str(args.burn_in),
    ]
    exp_name = ''.join(name_parts)
    # These characters come from the timestamp and directory path
    for unsafe_char in (":", "/", " "):
        exp_name = exp_name.replace(unsafe_char, "-")
    print(exp_name)

    # Training logs folder
    exp_log_folder = args.log_folder + exp_name + '/'
    _ensure_folder(exp_log_folder)
    logging.basicConfig(filename=exp_log_folder + 'logging.txt', level=logging.INFO)

    # Model saving folder
    exp_params_folder = args.params_folder + exp_name + '/'
    _ensure_folder(exp_params_folder)

    trn_dataset = data_loader.get_data_list(args.train_dir + args.trn_file, args.prob_threshold)
    print("Number of training examples: ", len(trn_dataset))

    # Model definition
    model = PoincareModel(
        train_data=trn_dataset,
        size=args.embed_dim,
        alpha=args.learning_rate,
        negative=args.negs,
        regularization_coeff=args.reg_coef,
        burn_in=args.burn_in,
        burn_in_alpha=args.burn_in_alpha,
        init_range=args.init_range,
        seed=args.random_seed,
    )

    # Training
    model.train(epochs=args.epochs, batch_size=args.batch_size, print_every=args.print_every)

    # Persist the model, then the arguments it was trained with
    model.save(exp_params_folder + 'gensim_model.params')
    with open(exp_params_folder + 'args_model.pkl', "wb") as args_file:
        pickle.dump(args, args_file)
def poincare_train(hypertouple_dataset, size=2, burn_in=0, epochs=5, print_freq=100):
    """
    Train a Poincare embedding.

    Any exception raised while building or training the model is printed
    and not re-raised.

    Args:
        hypertouple_dataset (list): Dataset of tuples used as training data
        size (int): Embedding size of the model
        burn_in (int): Burn-in setting passed to the model
        epochs (int): Number of epochs to train
        print_freq (int): Passed to training as ``print_every``
    Returns:
        The Poincare model. ``None`` if the model could not be constructed;
        if training itself failed, the constructed model is still returned.
    """
    trained = None
    try:
        trained = PoincareModel(train_data=hypertouple_dataset, size=size, burn_in=burn_in)
        trained.train(epochs=epochs, print_every=print_freq)
    except Exception as err:
        print(err)
    return trained
def load_poincare_model(path, word2vec_format=True, binary=False):
    """ Load pre-trained Poincare embedding vectors.

    :param path: path of the file holding the pre-trained Poincare embedding model
    :param word2vec_format: if true, read the file as word2vec format; otherwise
        load a saved ``PoincareModel`` and take its vectors (default: True)
    :param binary: whether the word2vec file is binary; only used when
        ``word2vec_format`` is true (default: False)
    :return: the pre-trained Poincare keyed vectors
    :type path: str
    :type word2vec_format: bool
    :type binary: bool
    :rtype: gensim.models.poincare.PoincareKeyedVectors
    """
    if not word2vec_format:
        full_model = PoincareModel.load(path)
        return full_model.kv
    return PoincareKeyedVectors.load_word2vec_format(path, binary=binary)
def test_negatives(self):
    """Checks that the number of candidate negatives equals the ``negative`` setting."""
    requested = 5
    poincare = PoincareModel(self.data, negative=requested)
    candidates = poincare._get_candidate_negatives()
    self.assertEqual(len(candidates), requested)
def test_training(self):
    """Checks that training changes the vectors."""
    poincare = PoincareModel(self.data_large, burn_in=0, negative=3)
    vectors_before = np.copy(poincare.kv.syn0)

    poincare.train(epochs=2)

    still_equal = np.allclose(vectors_before, poincare.kv.syn0)
    self.assertFalse(still_equal)
def test_burn_in(self):
    """Checks that burn-in changes the vectors even when ``epochs=0``."""
    poincare = PoincareModel(self.data, burn_in=1, negative=3)
    vectors_before = np.copy(poincare.kv.syn0)

    poincare.train(epochs=0)

    still_equal = np.allclose(poincare.kv.syn0, vectors_before)
    self.assertFalse(still_equal)
def test_error_if_negative_more_than_population(self):
    """Checks that an error is raised when more negatives are requested than there are remaining nodes."""
    oversampling_model = PoincareModel(self.data, negative=5)
    with self.assertRaises(ValueError):
        oversampling_model.train(epochs=1)
def test_train_old_model_after_load(self):
    """Checks that a model saved by an older gensim version can still be trained after loading."""
    restored = PoincareModel.load(datapath('poincare_test_3.4.0'))
    vectors_before = np.copy(restored.kv.syn0)

    restored.train(epochs=2)

    still_equal = np.allclose(vectors_before, restored.kv.syn0)
    self.assertFalse(still_equal)