Example #1
def embedding_seperate(path, words, model_embedding, hParams):
    # Build a smaller KeyedVectors holding only the requested words.
    model_embedding_kv = KeyedVectors(hParams.embedding_size)

    for word in words:
        try:
            model_embedding_kv.add([word], [model_embedding.wv[word]])
        except KeyError:
            # word missing from the trained embedding; skip it
            pass

    model_embedding_kv.save(path)
    return model_embedding_kv
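A minimal usage sketch for the snippet above (the toy sentences, the hParams container, and the output path are assumptions, not from the original code). It trains a small Word2Vec model and keeps only the vectors for two of its words; assumes gensim 3.x, where KeyedVectors.add() accepts lists of keys and vectors.

from types import SimpleNamespace
from gensim.models import Word2Vec, KeyedVectors

sentences = [["hello", "world"], ["hello", "gensim"]]
w2v = Word2Vec(sentences, size=16, min_count=1)   # gensim 3.x uses `size`; 4.x uses `vector_size`
hParams = SimpleNamespace(embedding_size=16)      # stand-in for the original hyper-parameter object

subset = embedding_seperate("subset.kv", ["hello", "world"], w2v, hParams)
print(subset["hello"].shape)                      # -> (16,)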
Example #2
def embedding_ayir(dizin, kelimeler, model_embedding, hParams):
    model_embedding_kv = KeyedVectors(hParams.embedding_matris_boyut)

    for kelime in kelimeler:
        try:
            model_embedding_kv.add([kelime], [model_embedding.wv[kelime]])
        except KeyError:
            # word missing from the trained embedding; skip it
            pass

    model_embedding_kv.save(dizin)
    return model_embedding_kv
Example #3
def save_fasttext(vocab):
    model = FastText.load_word2vec_format('../../corpora/wiki.en.vec')
    # Create a new KeyedVectors to hold only the vocabulary words
    kmodel = KeyedVectors(300)
    loss = 0
    for word in vocab:
        try:
            vec = model[word]
        except KeyError:
            loss += 1
            continue
        kmodel.add(word, vec, replace=True)
    print('loss word: ', loss)
    kmodel.save('../../corpora/fasttext.wv')
Example #4
def save_gnews(vocab):
    model = KeyedVectors.load_word2vec_format('../../corpora/GoogleNews-vectors-negative300.bin', binary=True)
    # Create a new KeyedVectors to hold only the vocabulary words
    kmodel = KeyedVectors(300)
    loss = 0
    for word in vocab:
        try:
            vec = model[word]
        except KeyError:
            loss += 1
            continue
        kmodel.add(word, vec, replace=True)
    print('loss word: ', loss)
    kmodel.save('../../corpora/gnews.wv')
Example #5
def save_glove(vocab):
    #model = KeyedVectors.load_word2vec_format('../../corpora/glove.840B.300d.txt', binary=False)
    kmodel = KeyedVectors(300)
    vocab = set(vocab.to_list())
    f = open('../../corpora/glove.840B.300d.txt', encoding='utf-8')
    for line in f:
        values = line.split()
        word = str(values[0])
        if word not in vocab:
            continue
        try:
            vec = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # malformed line (e.g. the token itself contains whitespace); skip it
            continue
        kmodel.add(word, vec, replace=True)
    f.close()
    kmodel.save('../../corpora/glove.wv')
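The saved *.wv files above can be loaded back with KeyedVectors.load(); a minimal sketch (the query word is an assumption and must actually be present in the saved subset):

from gensim.models import KeyedVectors

kmodel = KeyedVectors.load('../../corpora/glove.wv')
vec = kmodel['the']            # assumes 'the' survived the vocabulary filtering above
print(vec.shape)               # -> (300,)
print(len(kmodel.vocab))       # gensim 3.x; on 4.x use len(kmodel.key_to_index)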
Example #6
    def handle(self, output_file, debug_output_file, **options):
        logger.info("Building definition vectors")

        definitions = Definition.objects.filter(
            auto_translation_source_id__isnull=True).prefetch_related(
                "wordform__lemma")

        count = definitions.count()

        news_vectors = google_news_vectors()

        definition_vector_keys = []
        definition_vector_vectors = []

        unknown_words = set()

        with create_debug_output(debug_output_file) as debug_output:
            for d in tqdm(definitions.iterator(), total=count):
                keys = extract_keyed_words(d.semantic_definition, news_vectors,
                                           unknown_words)
                debug_output(
                    json.dumps(
                        {
                            "definition": d.text,
                            "wordform_text": d.wordform.text,
                            "extracted_keys": keys,
                        },
                        ensure_ascii=False,
                    ))
                if keys:
                    vec_sum = vector_for_keys(news_vectors, keys)

                    definition_vector_keys.append(definition_to_cvd_key(d))
                    definition_vector_vectors.append(vec_sum)

            definition_vectors = KeyedVectors(
                vector_size=news_vectors.vector_size)
            definition_vectors.add_vectors(definition_vector_keys,
                                           definition_vector_vectors)
            output_file.parent.mkdir(exist_ok=True)
            definition_vectors.save(fspath(output_file))
Example #7
def optimize_embeddings(
    vocabulary,
    embedding_file,
    output_embedding_path,
    n_dim,
    logging,
):
    original_embeddings = KeyedVectors.load(embedding_file, mmap='r')
    embeddings_redux = KeyedVectors(n_dim)

    words = []
    weights = []

    logging.info(
        "Generating optimized W2V embedding based on vocabulary words...")

    count = 0
    for word in vocabulary:
        try:
            vector = original_embeddings[word]
            words.append(word)
            weights.append(vector)
            count += 1
        except KeyError:
            logging.info(f'Embeddings: word "{word}" not found on embeddings!')

    del original_embeddings
    embeddings_redux.add(words, weights)
    del words
    del weights

    os.makedirs(os.path.dirname(output_embedding_path), exist_ok=True)
    embeddings_redux.save(output_embedding_path)
    del embeddings_redux

    logging.info(
        f'\n\nGenerated optimized Gensim W2V embedding file at "{output_embedding_path}"'
    )
    del output_embedding_path
    logging.info(f'{count}/{len(vocabulary)} words found on embeddings')
Example #8
def main():
    """Entry point."""
    parser = argparse.ArgumentParser("AWD-LSTM Embeddings to Word Vectors")
    parser.add_argument("--model", required=True)
    parser.add_argument("--dictionary", required=True)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    dictionary = torch.load(args.dictionary)
    model = torch.load(args.model, map_location='cpu')
    embeddings = model[0].encoder.weight.data.cpu().numpy()

    kv = KeyedVectors(embeddings.shape[1])
    kv.syn0 = embeddings
    kv.vocab = {
        w: Vocab(index=i)
        for i, w in enumerate(dictionary.dictionary.idx2word)
    }
    kv.index2word = dictionary.dictionary.idx2word

    kv.save(args.output)
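The assignments to syn0, vocab and index2word above rely on pre-4.0 gensim internals. On gensim 4.x the same KeyedVectors can be built with add_vectors(); a minimal sketch, assuming the same dictionary and embeddings objects:

kv = KeyedVectors(embeddings.shape[1])
kv.add_vectors(dictionary.dictionary.idx2word, embeddings)  # register every word with its matrix row
kv.save(args.output)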
Example #9
def convert(dir_path):
    with open(os.path.join(dir_path, 'data/cord19q', 'cord19-300d.txt'), mode="r") as txtfile:
        lines = txtfile.readlines()
        num_entries = len(lines)
        vecs = np.zeros((num_entries-1, 300), float)
        words = []
        print(type(words))
        idx = 0
        for line in lines[1:]:
            tok = line.split(" ")
            words.append(tok[0])
            vals = tok[1:]
            vecs_ = np.array([float(item) for item in vals])
            vecs[idx, :] = vecs_
            idx = idx + 1

        model = KeyedVectors(vecs.shape[1])
        model.add(words, vecs)
        model.save(os.path.join(dir_path, 'data/cord19q', 'cord19-300d.wv'))

        # load the model back and verify results
        model_ = model.load(os.path.join(dir_path, 'data/cord19q', 'cord19-300d.wv'))
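        # A possible verification step (a sketch; assumes at least one word was read from the file):
        assert model_[words[0]].shape == (300,)
        print(model_.most_similar(words[0], topn=3))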
Example #10
def rename_wikipedia2vec_entities(src, tgt_w, tgt_f):
    from gensim.models import KeyedVectors
    import numpy as np

    old_model = KeyedVectors.load(src, mmap='r')
    words = [
        word for word in old_model.vocab.keys()
        if not word.startswith("ENTITY/")
    ]

    titles = [
        word[7:] for word in old_model.vocab.keys()
        if word.startswith("ENTITY/")
    ]
    titles = [title for title in titles if "#" not in title]
    title2deviant_title = {title.split("|")[0]: title for title in titles}
    titles = [title.split("|")[0] for title in titles]

    t2w = title2wikidata(titles)
    t2f = init_mapping(titles)
    w2f = wikidata2freebase(sum([list(x) for x in t2w.values()], []))

    for title in t2w:
        for w in t2w[title]:
            for f in w2f[w]:
                t2f[title].add(f)

    w2t = init_mapping(sum([list(x) for x in t2w.values()], []))
    f2t = init_mapping(sum([list(x) for x in t2f.values()], []))

    for title in titles:
        for w in t2w[title]:
            w2t[w].add(title)
        for f in t2f[title]:
            f2t[f].add(title)

    print("Some stats", flush=True)
    print("t2f", len(t2f), Counter([len(x) for x in t2f.values()]))
    print("w2f", len(w2f), Counter([len(x) for x in w2f.values()]))
    print("t2w", len(t2w), Counter([len(x) for x in t2w.values()]))
    print("f2t", len(f2t), Counter([len(x) for x in f2t.values()]))
    print("w2t", len(w2t), Counter([len(x) for x in w2t.values()]))

    w_vecs = {word: old_model[word] for word in words}
    freebase_model = KeyedVectors(old_model.vector_size)
    freebase_model.add(words, [w_vecs[word] for word in words])
    freebase_words = list(f2t.keys())
    f_vecs = {
        f: np.mean([
            old_model["ENTITY/" + title2deviant_title[title]]
            for title in f2t[f]
        ], 0)
        for f in freebase_words
    }
    freebase_model.add(freebase_words,
                       [f_vecs[word] for word in freebase_words])
    freebase_model.save(tgt_f)
    del freebase_model

    wikidata_model = KeyedVectors(old_model.vector_size)
    wikidata_model.add(words, [w_vecs[word] for word in words])
    wikidata_words = list(w2t.keys())
    w_vecs = {
        w: np.mean([
            old_model["ENTITY/" + title2deviant_title[title]]
            for title in w2t[w]
        ], 0)
        for w in wikidata_words
    }
    wikidata_model.add(wikidata_words,
                       [w_vecs[word] for word in wikidata_words])
    wikidata_model.save(tgt_w)
    del wikidata_model
Example #11
def test_build_vocab_build_vocab_from_embeddings():
    """
    This test shows that all fields in the embeddings will be included.

    In embeddings and data:
        blue
        green
        yellow
    In embeddings only:
        purple
        gold
    In data only:
        white

    Expected vocab:
        blue
        green
        yellow
        purple
        gold
        white
    """

    model = KeyedVectors(10)
    model.add('purple', np.random.rand(10))
    model.add('gold', np.random.rand(10))
    model.add('<unk>', np.random.rand(10))
    model.add('blue', np.random.rand(10))
    model.add('green', np.random.rand(10))
    model.add('<pad>', np.random.rand(10))
    model.add('yellow', np.random.rand(10))

    with tempfile.NamedTemporaryFile() as tmpfile:
        model.save(tmpfile.name)

        field = TextField.from_embeddings(
            embeddings=tmpfile.name,
            embeddings_format='gensim',
            build_vocab_from_embeddings=True,
        )

        dummy = ["blue green", "yellow", 'white']

        field.setup(dummy)

    # assert vocab setup in expected order
    assert field.vocab == odict([
        ('<pad>', 0),
        ('<unk>', 1),
        ('blue', 2),
        ('green', 3),
        ('yellow', 4),
        ('white', 1),
        ('purple', 5),
        ('gold', 6),
    ])

    # assert embedding matrix organized in expected order
    assert torch.equal(
        field.embedding_matrix,
        torch.stack([
            torch.tensor(model['<pad>']),
            torch.tensor(model['<unk>']),
            torch.tensor(model['blue']),
            torch.tensor(model['green']),
            torch.tensor(model['yellow']),
            torch.tensor(model['purple']),
            torch.tensor(model['gold'])
        ]),
    )
Example #12
        vector = embeddings[key]
        words.append(key)
        weights.append(vector)
        found += 1

if len(words) > 0 and len(weights) > 0:
    liwc_embeddings.add(words, weights)
else:
    del embeddings
    sys.exit("No words found on LIWC dictionary!")

del embeddings
del words
del weights

output_path = os.path.join(OUTPUT_FOLDER, f'{args.lang}_liwc.w2v')
logging.info(f'Saving LIWC embeddings to {output_path}...')
os.makedirs(os.path.dirname(output_path), exist_ok=True)
liwc_embeddings.save(output_path)
del liwc_embeddings
del output_path
logging.info(f'LIWC embeddings saved')

logging.info(f'{found}/{count} words on embeddings were found on LIWC')
del found
del count

end = time.time()

logging.info(f'Elapsed execution time: {end-start}s')