Example #1
    def test_encoding_handling(self):
        """Tests whether utf8 and non-utf8 data loaded correctly."""
        non_utf8_file = datapath('poincare_cp852.tsv')
        relations = [relation for relation in PoincareRelations(non_utf8_file, encoding='cp852')]
        self.assertEqual(len(relations), 2)
        self.assertEqual(relations[0], (u'tímto', u'budeš'))

        utf8_file = datapath('poincare_utf8.tsv')
        relations = [relation for relation in PoincareRelations(utf8_file)]
        self.assertEqual(len(relations), 2)
        self.assertEqual(relations[0], (u'tímto', u'budeš'))
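
For context, PoincareRelations streams one (node, parent) pair per line of a delimiter-separated file, which is what the assertions above rely on. A minimal self-contained sketch, using an illustrative throwaway file rather than the actual gensim test fixtures:

from gensim.models.poincare import PoincareRelations

# Write a tiny tab-separated relations file (illustrative data, hypothetical path).
with open("toy_relations.tsv", "w", encoding="utf8") as f:
    f.write("kangaroo\tmarsupial\n")
    f.write("marsupial\tmammal\n")

# Each iteration yields one relation as a tuple of strings.
print(list(PoincareRelations("toy_relations.tsv", delimiter="\t")))
# Expected output: [('kangaroo', 'marsupial'), ('marsupial', 'mammal')]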
Example #2
def run(mode, language):
    if mode == "train_poincare_custom":
        gold_s,_ = read_all_data(domain = "science", language = language)
        gold_e,_ = read_all_data(domain = "environment", language = language)
        gold_f,_ = read_all_data(domain = "food", language = language)
        vocabulary = set([relation[0].lower() for relation in gold_s] + [relation[1].lower() for relation in gold_s])
        vocabulary = vocabulary | set([relation[0].lower() for relation in gold_f] + [relation[1].lower() for relation in gold_f])
        vocabulary = vocabulary | set([relation[0].lower() for relation in gold_e] + [relation[1].lower() for relation in gold_e])
        relations ="data/" + language + "/poincare_common_and_domains_" + language + ".tsv"
        assert len(open(relations, 'r').readlines()) > 10, "Not enough relations to train embeddings. Aborting ..."
        poincare_rel = PoincareRelations(relations)
        dim = 50
        model = PoincareModel(poincare_rel, size = dim)
        print("Starting Training...")
        model.train(epochs=400)
        model.save("embeddings/poincare_common_and_domains_5_3_" + language + "_" + str(dim))


    if mode == 'train_poincare_wordnet':
        assert language == 'EN', "Wordnet consists only of English nouns"

        gold_s,_ = read_all_data(domain = "science")
        gold_e,_ = read_all_data(domain = "environment")
        gold_f,_ = read_all_data(domain = "food")
        vocabulary = set([relation[0].lower() for relation in gold_s] + [relation[1].lower() for relation in gold_s])
        vocabulary = vocabulary | set([relation[0].lower() for relation in gold_f] + [relation[1].lower() for relation in gold_f])
        vocabulary = vocabulary | set([relation[0].lower() for relation in gold_e] + [relation[1].lower() for relation in gold_e])

        preprocess_wordnet('data/EN/noun_closure.tsv', vocabulary)
        poincare_rel = PoincareRelations('data/EN/noun_closure_filtered.tsv')
        dim = 50
        model = PoincareModel(poincare_rel, size = dim)
        print("Starting Training...")
        model.train(epochs=400)
        model.save("embeddings/wordnet_filtered" + "_" + str(dim))

    if mode == "train_word2vec":
        gold_s,relations_s = read_all_data("science")
        gold_e,relations_e = read_all_data("environment")
        gold_f,relations_f = read_all_data("food")
        vocabulary = set([relation[0].lower() for relation in gold_s] + [relation[1].lower() for relation in gold_s])
        vocabulary = vocabulary | set([relation[0].lower() for relation in gold_f] + [relation[1].lower() for relation in gold_f])
        vocabulary = vocabulary | set([relation[0].lower() for relation in gold_e] + [relation[1].lower() for relation in gold_e])
        documents = list(read_input("/data/EN/wikipedia_utf8_filtered_20pageviews.csv", vocabulary))
        model = gensim.models.Word2Vec(documents, size=300, window=10, min_count=2, workers=10)
        model.train(documents, total_examples=len(documents), epochs=30)
        print("Finished building word2vec model")
        model.save("embeddings/own_embeddings_w2v")
Example #3
def train_embeddings(
        input_path,  # path to input edge relations
        delimiter,  # input file delim
        output_path,  # path to output embedding vectors 
        size=2,  # embed dimension
        alpha=0.1,  # learning rate
        burn_in=10,  # burn in train rounds
        burn_in_alpha=0.01,  # burn in learning rate
        workers=1,  # number of training threads used
        negative=10,  # negative sample size
        epochs=100,  # training rounds
        print_every=500,  # print train info
        batch_size=10):  # num samples in batch

    # load file with edge relations between entities
    relations = PoincareRelations(file_path=input_path, delimiter=delimiter)

    # train model
    model = PoincareModel(train_data=relations,
                          size=size,
                          alpha=alpha,
                          burn_in=burn_in,
                          burn_in_alpha=burn_in_alpha,
                          workers=workers,
                          negative=negative)
    model.train(epochs=epochs, print_every=print_every, batch_size=batch_size)

    # save output vectors
    model.kv.save_word2vec_format(output_path)

    return
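
A hedged usage sketch for the helper above; the input path, output path and hyperparameters are hypothetical placeholders, not values taken from the original project:

# Assumes a tab-separated edge file with one (node, parent) pair per line.
train_embeddings(
    input_path="data/toy_hypernyms.tsv",
    delimiter="\t",
    output_path="embeddings/toy_poincare_2d.w2v",
    size=2,
    epochs=50,
)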
Example #4
from gensim.models.poincare import PoincareModel, PoincareRelations
from gensim.test.utils import datapath
from utils import Parameters
import pandas as pd

data_path = datapath("D:/PyCharm/PyCharm_Project/paper/data/type_relation.tsv")
type_embedding_path = "data/type_embedding"
model = PoincareModel(train_data=PoincareRelations(data_path, encoding="gbk"),
                      size=Parameters.type_embedding_dim,
                      negative=3)
model.train(epochs=50, print_every=5)
print(model.kv.word_vec("川菜"))
model.save(type_embedding_path)

# poincareModel = PoincareModel.load("data/type_embedding")
# print(poincareModel.kv.word_vec('东北菜'))
Example #5
	def train():
		from gensim.models.poincare import PoincareModel,PoincareRelations
		relations = PoincareRelations(file_path="../data/word_relation.csv", delimiter=',')
		model = PoincareModel(relations, negative=10,size=5)
		model.train(epochs=500)
		return model
Example #6
    Args:
        word (str): arbitrary word

    Returns:
        parent_word (str): The closest parent word in WordNet format
    """

    parent_word = None
    try:
        wnet_word = word_to_wn(word)
        parent_word = poincare_model.kv.closest_parent(wnet_word)
    except Exception as e:
        print(e)

    return parent_word



if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Annotator options')
    parser.add_argument('-p', action="store", type=str, dest='dataset_path', help='designated dataset path', default=None)
    parser.add_argument('-d', action="store", type=str, dest='delimiter', help='the delimiter for the hypernym tuples', default='\t')

    args = parser.parse_args()

    dataset = PoincareRelations(file_path=args.dataset_path, delimiter=args.delimiter)

    model = poincare_train(dataset)
Example #7
#      writer = csv.writer(outfile, delimiter='\t')
#      writer.writerows(hyp)
with open('polyFileEdgesRand.tsv', 'w+') as outfile:
     writer = csv.writer(outfile, delimiter='\t')
     writer.writerows(poly)
with open('polyTrainFileRand.tsv', 'w+') as outfile:
     writer = csv.writer(outfile, delimiter='\t')
     writer.writerows(polyTrain)
with open('polyTestFileRand.tsv', 'w+') as outfile:
     writer = csv.writer(outfile, delimiter='\t')
     writer.writerows(polyTest)

# with open('hypTestFile.tsv', 'w+') as outfile:
#      writer = csv.writer(outfile, delimiter='\t')
#      writer.writerows(hypTest)
#
# with open('hypTrainFile.tsv', 'w+') as outfile:
#      writer = csv.writer(outfile, delimiter='\t')
#      writer.writerows(hypTrain)

#file_path = datapath('randFileEdges.tsv')
print("POLY TO HYP")
model = PoincareModel(PoincareRelations("randFileEdges.tsv"), negative=2)
model.train(epochs=100)
#print(model.kv.most_similar('pitch.n.02', topn=10))
test = LinkPredictionEvaluation("polyTrainFileRand.tsv", "polyTestFileRand.tsv", model.kv)
print(test.evaluate())

recon = ReconstructionEvaluation("polyFileEdgesRand.tsv", model.kv)
print(recon.evaluate())
Example #8
import json

from gensim.models.poincare import PoincareModel, PoincareRelations
from gensim.test.utils import datapath

# leaves and total nodes
num = 1840
total = 8801



file_path = datapath('path of the original dataset for poincare')
model = PoincareModel(PoincareRelations(file_path, delimiter=','), negative=2, size=32)
model.train(epochs=10000, print_every=10)

child2ParentDict = {}

# Map every node id to its closest parent in the trained hierarchy; closest_parent
# returns None when no node sits above it (e.g. the root of the tree).
for each in range(total):
    r = model.kv.closest_parent(str(each))
    if r is None:
        print(each)
    else:
        child2ParentDict[each] = r


parent2ChildDict = {}
for child in child2ParentDict:
    parent = child2ParentDict[child]
    if parent in parent2ChildDict:
        parent2ChildDict[parent].append(child)
    else:
        parent2ChildDict[parent] = [child]
Example #9
def run(mode,
        embedding,
        embedding_name,
        experiment_name=None,
        log=False,
        trial=False):
    if embedding == "fasttext":
        #model = gensim.models.KeyedVectors.load_word2vec_format('wiki-news-300d-1M-subword.vec', binary=False)
        model = gensim.models.FastText.load_fasttext_format('wiki.en.bin')
        #model = gensim.models.FastText.load_fasttext_format('crawl-300d-2M.vec')
    elif embedding == "wiki2M":
        #model = gensim.models.FastText.load_fasttext_format('crawl-300d-2M.vec','vec')
        model = gensim.models.KeyedVectors.load_word2vec_format(
            'embeddings/crawl-300d-2M.vec', binary=False)
        #model.save("crawl-300d-2M.bin")
    elif embedding == "wiki1M_subword":
        model = gensim.models.KeyedVectors.load_word2vec_format(
            'embeddings/wiki-news-300d-1M-subword.vec', binary=False)

    elif embedding == "own_w2v":
        model = gensim.models.KeyedVectors.load(
            'embeddings/own_embeddings_w2v')

    elif embedding == "quick":
        model = gensim.models.KeyedVectors.load_word2vec_format(
            'embeddings/crawl-300d-2M.vec', binary=False, limit=50000)
    elif embedding == "poincare":
        model = PoincareModel.load(
            'embeddings/poincare_common_domains02_5_3_50')
        print(len(model.kv.vocab))
        words = [
            "computer_science", "biology", "physics", "science", "virology",
            "life_science", "chemistry", "earth_science", "algebra",
            "economics", "optics"
            "immunology"
        ]
        for word in words:
            print("Current word: ", word)

            if word in model.kv.vocab:
                try:
                    print("Closest Parent: ", model.kv.closest_parent(word))
                    print("Closest Child ", model.kv.closest_child(word))
                    print("Descendants: ", model.kv.descendants(word))
                    print("Ancestors: ", model.kv.ancestors(word))
                    print("Hierarchy diff to Science: ",
                          model.kv.difference_in_hierarchy(word, "science"))
                    print('\n')
                except:
                    continue
            else:
                print("Word not in Vocab")

    if mode == "visualize_embedding_poincare":
        relations = set([])
        filename_in = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   "data/isas_1000.tsv")
        with open(filename_in, 'r') as f:
            reader = csv.reader(f, delimiter='\t')
            for i, line in enumerate(reader):
                relations.add((line[0], line[1]))
        plot = poincare_2d_visualization(model, relations, experiment_name)
        py.image.save_as(plot, "vis/" + experiment_name + '.png')
        print("Starting visualization")

        #visualize_taxonomy(vectors, names)


#todo own file for train
    if mode == "visualize_embedding":
        gold, relations = read_all_data()
        vectors = []
        names = []
        for relation in (
            [relation1[1].replace(" ", "_") for relation1 in relations] +
            [relation2[2].replace(" ", "_") for relation2 in relations]):
            if relation not in names:
                if relation not in model.wv:
                    print(relation)
                    continue
                vectors.append(model.wv[relation])
                names.append(relation)
        visualize_taxonomy(vectors, names, experiment_name)

    if mode == 'train_poincare':
        # gold,relations = read_all_data()
        # freq_science = [3,5]
        # for entry_science in freq_science:
        #     relations = './data/' + domain +'_crawl_' + str(entry_science) +'.tsv'
        #     #relations = './data/science_crawl_merge_10_3_02.tsv'
        #     poincare_rel = PoincareRelations(relations)
        #     dim = 50
        #     model = PoincareModel(poincare_rel, size = dim)
        #     print("Starting Training...")
        #     model.train(epochs=400)
        #     model.save("embeddings/embeddings_" + domain + "_crawl_poincare_" + str(entry_science) + "_" + str(dim))
        #     #model.save("embeddings/embeddings_science_crawl_merge_poincare_10_3_50_02")
        #     break
        relations = './data/poincare_common_domains.tsv'
        #relations = './data/science_crawl_merge_10_3_02.tsv'
        poincare_rel = PoincareRelations(relations)
        dim = 50
        model = PoincareModel(poincare_rel, size=dim)
        print("Starting Training...")
        model.train(epochs=400)
        model.save("embeddings/poincare_common_domains_5_3" + "_" + str(dim))

    if mode == "train_word2vec":
        gold_s, relations_s = read_all_data("science")
        gold_e, relations_e = read_all_data("environment")
        gold_f, relations_f = read_all_data("food")
        vocabulary = set([relation[2] for relation in gold_s] +
                         [relation[1] for relation in gold_s])
        vocabulary = vocabulary | set([relation[2] for relation in gold_f] +
                                      [relation[1] for relation in gold_f])
        vocabulary = vocabulary | set([relation[2] for relation in gold_e] +
                                      [relation[1] for relation in gold_e])
        documents = list(
            read_input(
                "/srv/data/5aly/data_text/wikipedia_utf8_filtered_20pageviews.csv",
                vocabulary))
        model = gensim.models.Word2Vec(size=300,
                                       window=5,
                                       min_count=5,
                                       workers=30)
        model.build_vocab(documents)
        #model.train(documents, total_examples = len(documents), epochs=10)
        model.train(documents, total_examples=model.corpus_count, epochs=30)
        model.save("embeddings/own_embeddings_w2v_all")

    elif mode == "analysis":
        gold, relations = read_all_data()
        voc_rel = set([relation[1] for relation in relations] +
                      [relation[2] for relation in relations])
        voc_gold = set([relation[1] for relation in gold] +
                       [relation[2] for relation in gold])
        print("Vokabeln in Gold: " + str(len(voc_gold)) +
              "Vokabeln in Taxonomy: " + str(len(voc_rel)))
Example #10
    def setUp(self):
        self.data = PoincareRelations(datapath('poincare_hypernyms.tsv'))
        self.data_large = PoincareRelations(datapath('poincare_hypernyms_large.tsv'))
wordrank_path = 'wordrank' # path to Wordrank directory
out_dir = 'model' # name of output directory to save data to
data = '../../gensim/test/test_data/lee.cor' # sample corpus

model = Wordrank.train(wordrank_path, data, out_dir, iter=21, dump_period=10)


varembed_vectors = '../../gensim/test/test_data/varembed_leecorpus_vectors.pkl'
model = varembed.VarEmbed.load_varembed_format(vectors=varembed_vectors)


morfessors = '../../gensim/test/test_data/varembed_leecorpus_morfessor.bin'
model = varembed.VarEmbed.load_varembed_format(vectors=varembed_vectors, morfessor_model=morfessors)

import os

poincare_directory = os.path.join(os.getcwd(), 'docs', 'notebooks', 'poincare')
data_directory = os.path.join(poincare_directory, 'data')
wordnet_mammal_file = os.path.join(data_directory, 'wordnet_mammal_hypernyms.tsv')

from gensim.models.poincare import PoincareModel, PoincareKeyedVectors, PoincareRelations
relations = PoincareRelations(file_path=wordnet_mammal_file, delimiter='\t')
model = PoincareModel(train_data=relations, size=2, burn_in=0)
model.train(epochs=1, print_every=500)

models_directory = os.path.join(poincare_directory, 'models')
test_model_path = os.path.join(models_directory, 'gensim_model_batch_size_10_burn_in_0_epochs_50_neg_20_dim_50')
model = PoincareModel.load(test_model_path)
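
Once loaded, the model's keyed vectors answer the usual Poincaré queries; a short sketch with hypothetical WordNet-style keys (substitute items that actually occur in the trained vocabulary):

print(model.kv.distance('mammal.n.01', 'dog.n.01'))
print(model.kv.most_similar('dog.n.01', topn=5))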


if __name__ == "__main__":

    args = parse_arguments()

    config_path = args.config
    model_path = args.out

    hp = iom.load_json(config_path)

    data_path = hp.get("input")

    hp["epochs"] = args.epochs
    hp["batch_size"] = args.batch_size

    relations = PoincareRelations(file_path=args.data, delimiter='\t')

    # load model and continue training
    if iom.check_exists(model_path):

        logger.info("Poincarè model found! Loading")

        model = iom.load_pickle(model_path)

    # create new model
    else:

        # get hyperparameters of model and model training
        size = hp.get("size", 100)
        nce = hp.get("nce", 20)
        burn_in = hp.get("burn_in", 10)
Example #13
from gensim.models.poincare import PoincareModel, PoincareKeyedVectors, PoincareRelations
from gensim.viz.poincare import poincare_distance_heatmap

from tensorflow.keras.layers import Embedding

wordnet_mamal_file_path = '/Users/pankaj/dev/git/smu/nlp337/data/mamals.tsv'
relations = PoincareRelations(wordnet_mamal_file_path, delimiter='\t')
model = PoincareModel(train_data=relations, size=2, burn_in=0)
model.train(epochs=2, print_every=500)

pcv = PoincareKeyedVectors(vector_size=20)

poincare_distance_heatmap((0, 0),
                          x_range=(-1.0, 1.0),
                          y_range=(-1.0, 1.0),
                          num_points=100)
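
poincare_distance_heatmap returns a Plotly figure rather than rendering anything itself; a minimal sketch for writing that figure to a standalone HTML file (the filename is an arbitrary choice):

import plotly

fig = poincare_distance_heatmap((0, 0),
                                x_range=(-1.0, 1.0),
                                y_range=(-1.0, 1.0),
                                num_points=100)
plotly.offline.plot(fig, filename="poincare_distance_heatmap.html", auto_open=False)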
Example #14
model.kv.descendants("ไก่")


# *Cleaning the structure or separating it seems to be needed*

# ### visualize sample hypernyms (synsets)

# In[ ]:


from gensim.test.utils import datapath
from gensim.models.poincare import PoincareModel, PoincareRelations

file_path = datapath("poincare_hypernyms_large.tsv")
rels = PoincareRelations(file_path)

for epochs in [5, 10, 20, 50, 100, 1000]:
    model = PoincareModel(rels, size=2)
    model.train(epochs=epochs)

    import plotly
    import gensim.viz.poincare

    plotly.offline.init_notebook_mode(connected=False)
    prefecture_map = gensim.viz.poincare.poincare_2d_visualization(model=model,
                                                                   tree=rels,
                                                                   figure_title="{} epochs".format(epochs),
                                                                   show_node_labels=model.kv.vocab.keys())
    plotly.offline.iplot(prefecture_map)