Example No. 1
import numpy as np
from gensim.models import KeyedVectors


def apply_w2v_regression(model, regression):
    """Given a word2vec model and a linear regression, apply that regression to all the vectors
    in the model.
    :param model: A gensim `KeyedVectors` or `Word2Vec` instance
    :param regression: A `sklearn.linear_model.LinearRegression` instance
    :returns: A gensim `KeyedVectors` instance
    """
    # Build an empty KeyedVectors and copy the vocabulary metadata over.
    # Legacy gensim (< 4.0) attribute names are used throughout (`vocab`,
    # `index2word`, `syn0`); newer gensim versions also require a vector_size
    # argument in the constructor.
    aligned_model = KeyedVectors()  # Word2Vec()
    aligned_model.vocab = model.vocab.copy()
    aligned_model.vector_size = model.vector_size
    aligned_model.index2word = model.index2word
    # aligned_model.reset_weights()
    # Transform all vectors in one call; sklearn predicts in float64, so cast back to float32.
    aligned_model.syn0 = regression.predict(model.syn0).astype(np.float32)
    return aligned_model
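
The snippet assumes the regression has already been fitted elsewhere. A minimal sketch of the full alignment flow, assuming two legacy-gensim models `source_model` and `target_model` with overlapping vocabularies (both names are illustrative, not from the original project):

import numpy as np
from sklearn.linear_model import LinearRegression

# Hypothetical inputs: two legacy gensim models covering overlapping vocabularies.
shared = [w for w in source_model.vocab if w in target_model.vocab]
X = np.array([source_model[w] for w in shared])   # vectors to be mapped
Y = np.array([target_model[w] for w in shared])   # vectors in the target space

regression = LinearRegression().fit(X, Y)         # learn the linear map between the two spaces
aligned = apply_w2v_regression(source_model, regression)
print(aligned.most_similar("night", topn=5))      # query the aligned space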
Example No. 2
from logging import debug  # assumption: `debug` is stdlib logging's helper (it may be a project-specific logger)

import numpy as np
from gensim.models import KeyedVectors


def apply_wv2_regression(model, regression):
    """Given a word2vec model and a linear regression, apply that regression to all the vectors
    in the model.

    :param model: A gensim `KeyedVectors` or `Word2Vec` instance
    :param regression: A `sklearn.linear_model.LinearRegression` instance
    :returns: A gensim `KeyedVectors` instance
    """
    debug("Applying transformation")
    # Same legacy-gensim pattern as the previous example; in the gensim versions
    # this code targets, `KeyedVectors` exposes a deprecated `wv` property that
    # returns the object itself, so the `.wv` and bare attribute assignments
    # below land on the same instance.
    model_t = KeyedVectors()  # Word2Vec()
    model_t.wv.vocab = model.vocab.copy()
    model_t.wv.vector_size = model.vector_size
    model_t.wv.index2word = model.index2word
    # model_t.reset_weights()
    debug("Transforming {:,} vectors".format(len(model.syn0)))
    # N.B. Somehow I get float64 here and that's not what I want, so I'm explicitly casting to float32
    # (sklearn's LinearRegression predicts in float64 by default).
    model_t.syn0 = regression.predict(model.syn0).astype(np.float32)
    return model_t
Example No. 3
import argparse

import torch
from gensim.models import KeyedVectors
from gensim.models.keyedvectors import Vocab  # vocab record class in legacy gensim (< 4.0)


def main():
    """Entry point."""
    parser = argparse.ArgumentParser("AWD-LSTM Embeddings to Word Vectors")
    parser.add_argument("--model", required=True)
    parser.add_argument("--dictionary", required=True)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    # An AWD-LSTM checkpoint (as saved by the reference awd-lstm-lm code) is a
    # [model, criterion, optimizer] list, so model[0] is the network and its
    # encoder holds the input embedding matrix.
    dictionary = torch.load(args.dictionary)
    model = torch.load(args.model, map_location='cpu')
    embeddings = model[0].encoder.weight.data.cpu().numpy()

    kv = KeyedVectors(embeddings.shape[1])
    kv.syn0 = embeddings
    kv.vocab = {
        w: Vocab(index=i)
        for i, w in enumerate(dictionary.dictionary.idx2word)
    }
    kv.index2word = dictionary.dictionary.idx2word

    kv.save(args.output)
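
Once the script has run, the exported file loads back like any other gensim KeyedVectors object. A minimal sketch, assuming the script above was invoked with `--output vectors.kv` (the file name and query word are illustrative):

from gensim.models import KeyedVectors

kv = KeyedVectors.load("vectors.kv")     # load the exported AWD-LSTM embeddings
print(kv["night"].shape)                 # one row of the encoder weight matrix
print(kv.most_similar("night", topn=5))  # nearest neighbours in that space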
Example No. 4
import argparse
import json
import os

import numpy as np
import tensorflow as tf
from gensim.models import KeyedVectors, Word2Vec
from tqdm import tqdm

# DataLoader, Model, ws_test, sp10k_test_overall and test_keller_overall are
# project-specific helpers that are not shown in this excerpt.


def main():

    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--pretrain", action="store_true")
    parser.add_argument("-t", "--center_trainable", action="store_true")
    parser.add_argument("-w", "--weight", type=float, default="0.5")
    parser.add_argument("-r", "--restrict", type=float, default="1")
    parser.add_argument("-c", "--cross_validate", type=int, default=0)
    parser.add_argument("-g", "--GPU", type=str, default="1")
    parser.add_argument("-e", "--num_epoch", type=int, default=20)
    parser.add_argument("-l",
                        "--relational_embedding_size",
                        type=int,
                        default=10)
    args = parser.parse_args()
    print(args)

    os.environ["CUDA_VISIBLE_DEVICES"] = args.GPU
    statsPath = "data/2018_corpus_stats.pkl"

    num_cross = str(args.cross_validate)

    train_data = DataLoader()
    train_data.read_stats(statsPath)
    train_data.load_pairs_counting("data/train_count.pkl")
    train_data.load_argument_sample_table("data/argument_sample_table.p")

    test_data = DataLoader()
    test_data.read_stats(statsPath)
    test_data.load_pairs_counting("data/test_count.pkl")
    test_data.load_argument_sample_table("data/argument_sample_table.p")

    modelPath = "data/stage_one.model"
    word2vecModel = Word2Vec.load(modelPath)

    # word2vec's syn1neg matrix holds the output ("context") vectors learned by
    # negative sampling; wrapping it in a KeyedVectors that reuses the model's
    # vocab and index2word lets it be queried by word just like the input vectors.
    context_wv = KeyedVectors(vector_size=300)
    context_wv.vocab = word2vecModel.wv.vocab
    context_wv.index2word = word2vecModel.wv.index2word
    context_wv.syn0 = word2vecModel.syn1neg

    pretrain_center_emb = list()
    pretrain_context_emb = list()

    counter = 0

    for i in range(len(train_data.id2word)):
        tmp_w = train_data.id2word[i]
        if tmp_w in context_wv.vocab:
            pretrain_center_emb.append(word2vecModel[tmp_w])
            pretrain_context_emb.append(context_wv[tmp_w])
        else:
            pretrain_center_emb.append(np.zeros(300))
            pretrain_context_emb.append(np.zeros(300))
            counter += 1

    print("empty count", counter)

    pretrain_center_emb = np.asarray(pretrain_center_emb)

    wordsim_dir = "./Word-similarity-dataset/Simlex/"

    with open(wordsim_dir + "verb.json", "r") as f:
        verb_list = json.load(f)

    with open(wordsim_dir + "noun.json", "r") as f:
        noun_list = json.load(f)

    with open(wordsim_dir + "adjective.json", "r") as f:
        adjective_list = json.load(f)

    with open(wordsim_dir + "all.json", "r") as f:
        all_list = json.load(f)

    simlex_corpora = [verb_list, noun_list, adjective_list, all_list]
    simlex_names = ["verb_list", "noun_list", "adjective_list", "all_list"]

    m = Model(train_data, args)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        if args.pretrain:

            # Initial assign
            sess.run(m.emb_init,
                     feed_dict={m.emb_placeholder: pretrain_center_emb})
            sess.run(
                m.emb_init_context,
                feed_dict={m.emb_placeholder_context: pretrain_context_emb})

        ws_test(sess, m, test_data, simlex_corpora, num_cross,
                args.relational_embedding_size, args.restrict)
        sp10k_test_overall(sess, m, train_data, args.relational_embedding_size,
                           args.restrict)
        test_keller_overall(sess, m, train_data)

        num_epoch = args.num_epoch
        num_batch = 256

        batch_size = 1024
        for epoch in range(num_epoch):

            print(" epoch:", str(epoch + 1), "/", num_epoch)

            process_bar = tqdm(range(num_batch))
            for i in process_bar:
                batch = train_data.get_sd_train_batch(batch_size)

                feed_dict = {
                    m.predicate_amod_ids: batch["amod"][:, 0],
                    m.argument_amod_ids: batch["amod"][:, 1],
                    m.argument_prime_amod_ids: batch["amod"][:, 2],
                    m.predicate_nsubj_ids: batch["nsubj"][:, 0],
                    m.argument_nsubj_ids: batch["nsubj"][:, 1],
                    m.argument_prime_nsubj_ids: batch["nsubj"][:, 2],
                    m.predicate_dobj_ids: batch["dobj"][:, 0],
                    m.argument_dobj_ids: batch["dobj"][:, 1],
                    m.argument_prime_dobj_ids: batch["dobj"][:, 2],
                }

                loss, _ = sess.run([m.loss, m.optimize], feed_dict=feed_dict)

                process_bar.set_description("Loss: %0.4f" % loss)

            # test
            ws_test(sess, m, test_data, simlex_corpora, num_cross,
                    args.relational_embedding_size, args.restrict)
            sp10k_test_overall(sess, m, train_data,
                               args.relational_embedding_size, args.restrict)
            test_keller_overall(sess, m, train_data)
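
The `m.emb_init` / `m.emb_placeholder` pair fed above is defined inside the project's `Model` class, which the excerpt does not show. A minimal sketch of the usual TF1 pattern it implies (variable names are illustrative, not the project's API); feeding the pretrained matrix through a placeholder keeps the large constant out of the serialized graph:

import numpy as np
import tensorflow as tf

vocab_size, emb_dim = 1000, 300
emb = tf.get_variable("center_emb", shape=[vocab_size, emb_dim])
emb_placeholder = tf.placeholder(tf.float32, shape=[vocab_size, emb_dim])
emb_init = tf.assign(emb, emb_placeholder)  # one-off assignment op

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # In the script above, `pretrain_center_emb` would be fed here instead of zeros.
    sess.run(emb_init, feed_dict={emb_placeholder: np.zeros((vocab_size, emb_dim), np.float32)})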
Example No. 5
print("text analysis")

# text += print_info_length(corpus_labels, lines_corpus_splitted, "corpus docs" + conf, "words", True)
text += print_info_length(queries_labels, lines_queries_splitted,
                          "queries" + conf, "words", True)

text += '\n' + str(corpus_model)

print("done.")

w1 = "night"

outv = KeyedVectors(300)
outv.vocab = corpus_model.wv.vocab  # same
outv.index2word = corpus_model.wv.index2word  # same
outv.syn0 = corpus_model.syn1neg  # different

text += '\nIN EMBEDDINGS COMPARISON:\n' + str(
    corpus_model.wv.most_similar(positive=[corpus_model[w1]], topn=6))
print("IN-IN done.")
text += '\nOUT EMBEDDINGS COMPARISON:\n' + str(
    outv.most_similar(positive=[outv[w1]], topn=6))
print("OUT-OUT done.")
text += '\nIN-OUT EMBEDDINGS COMPARISON:\n' + str(
    corpus_model.wv.most_similar(positive=[outv[w1]], topn=6))
print("IN-OUT done.")
text += '\nOUT-IN EMBEDDINGS COMPARISON:\n' + str(
    outv.most_similar(positive=[corpus_model[w1]], topn=6))
print("OUT-IN done.")

with open("data_analysis/data_analysis" + conf + ".txt", 'w') as file:
    file.write(text)  # the excerpt is cut off here; writing the accumulated `text` is the apparent intent
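
The four comparisons above contrast word2vec's input space (`wv`/`syn0`) with its output space (`syn1neg`). A minimal sketch of the underlying similarity computation, assuming the same `corpus_model`, `outv` and `w1` objects from the excerpt:

import numpy as np

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

v_in = corpus_model.wv[w1]   # input-space vector for "night"
v_out = outv[w1]             # output-space vector for the same word
print("IN-OUT cosine:", cosine(v_in, v_out))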
Example No. 6
# The excerpt starts mid-loop; the enclosing loop header (reconstructed here
# from context) and the earlier definitions of `queries_obj`, `encoded_queries`,
# `encoded_queries_oov`, `word_dict`, `encode`, `encode_oov`, `conf`, `glv` and
# `model` are not shown.
for query_id, query in queries_obj.items():
    encoded_queries[query_id] = encode(query.title, word_dict)
    print(query.title, encoded_queries[query_id])
    encoded_queries_oov[query_id] = encode_oov(query.title, word_dict)

print(encoded_queries_oov)

idf_filename = "preprocessing/pre_data/idfs/idfs" + conf
idfs = load_from_pickle_file(idf_filename)

idfs = encode_idf(idfs, word_dict)

if not glv:
    outv = KeyedVectors(300)
    outv.vocab = model.wv.vocab  # same
    outv.index2word = model.wv.index2word  # same
    outv.syn0 = model.syn1neg  # different
    we_out = encode_we(outv, word_dict, glv)

we = encode_we(model, word_dict, glv)

max_query_len = max([len(q.title.split()) for q in queries_obj.values()])

padded_query_idfs = {}
padded_query_embs = {}

print("Encoding padded queries idf and embeddings")

for query_id, query in tqdm(encoded_queries.items()):  # padding queries idfs and queries embeddings
    padded_query_idfs[query_id] = []
    padded_query_embs[query_id] = []
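
The excerpt ends as the padding loop begins. A generic zero-padding sketch of what such a loop typically does, not the project's actual code; it assumes `we` maps word ids to vectors, `idfs` maps word ids to idf weights, and every query is right-padded out to `max_query_len`:

import numpy as np

emb_dim = 300
for query_id, query in encoded_queries.items():
    idf_row, emb_row = [], []
    for word_id in query:
        idf_row.append(idfs.get(word_id, 0.0))
        emb_row.append(we[word_id])
    while len(idf_row) < max_query_len:          # right-pad with zeros
        idf_row.append(0.0)
        emb_row.append(np.zeros(emb_dim))
    padded_query_idfs[query_id] = idf_row
    padded_query_embs[query_id] = emb_row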