def apply_w2v_regression(model, regression):
    """Given a word2vec model and a linear regression, apply that
    regression to all the vectors in the model.

    :param model: A gensim `KeyedVectors` or `Word2Vec` instance
    :param regression: A `sklearn.linear_model.LinearRegression` instance
    :returns: A gensim `KeyedVectors` instance
    """
    aligned_model = KeyedVectors()  # Word2Vec()
    aligned_model.vocab = model.vocab.copy()
    aligned_model.vector_size = model.vector_size
    aligned_model.index2word = model.index2word
    # aligned_model.reset_weights()
    aligned_model.syn0 = regression.predict(model.syn0).astype(np.float32)
    return aligned_model
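# Usage sketch (not from the original file; the helper and model names are
# hypothetical). The regression passed to apply_w2v_regression() is typically
# fit on the vocabulary shared by a source and a target embedding space, so
# the whole source space gets mapped into the target space. This uses the
# gensim 3.x-style API (`.vocab`, item lookup) matching the snippet above.
import numpy as np
from sklearn.linear_model import LinearRegression

def fit_alignment_regression(source, target):
    """Fit a linear map from `source` vectors to `target` vectors."""
    shared = [w for w in source.vocab if w in target.vocab]
    X = np.array([source[w] for w in shared])
    Y = np.array([target[w] for w in shared])
    return LinearRegression().fit(X, Y)

# aligned = apply_w2v_regression(source_kv, fit_alignment_regression(source_kv, target_kv))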
def apply_w2v_regression(model, regression):
    """Given a word2vec model and a linear regression, apply that
    regression to all the vectors in the model.

    :param model: A gensim `KeyedVectors` or `Word2Vec` instance
    :param regression: A `sklearn.linear_model.LinearRegression` instance
    :returns: A gensim `KeyedVectors` instance
    """
    debug("Applying transformation")
    model_t = KeyedVectors()  # Word2Vec()
    model_t.vocab = model.vocab.copy()
    model_t.vector_size = model.vector_size
    model_t.index2word = model.index2word
    # model_t.reset_weights()
    debug("Transforming {:,} vectors".format(len(model.syn0)))
    # N.B. regression.predict() returns float64, which is not what we want
    # here, so explicitly cast back to float32.
    model_t.syn0 = regression.predict(model.syn0).astype(np.float32)
    return model_t
def main(): """Entry point.""" parser = argparse.ArgumentParser("AWD-LSTM Embeddings to Word Vectors") parser.add_argument("--model", required=True) parser.add_argument("--dictionary", required=True) parser.add_argument("--output", required=True) args = parser.parse_args() dictionary = torch.load(args.dictionary) model = torch.load(args.model, map_location='cpu') embeddings = model[0].encoder.weight.data.cpu().numpy() kv = KeyedVectors(embeddings.shape[1]) kv.syn0 = embeddings kv.vocab = { w: Vocab(index=i) for i, w in enumerate(dictionary.dictionary.idx2word) } kv.index2word = dictionary.dictionary.idx2word kv.save(args.output)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--pretrain", action="store_true")
    parser.add_argument("-t", "--center_trainable", action="store_true")
    parser.add_argument("-w", "--weight", type=float, default=0.5)
    parser.add_argument("-r", "--restrict", type=float, default=1.0)
    parser.add_argument("-c", "--cross_validate", type=int, default=0)
    parser.add_argument("-g", "--GPU", type=str, default="1")
    parser.add_argument("-e", "--num_epoch", type=int, default=20)
    parser.add_argument("-l", "--relational_embedding_size", type=int, default=10)
    args = parser.parse_args()
    print(args)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.GPU

    statsPath = "data/2018_corpus_stats.pkl"
    num_cross = str(args.cross_validate)

    train_data = DataLoader()
    train_data.read_stats(statsPath)
    train_data.load_pairs_counting("data/train_count.pkl")
    train_data.load_argument_sample_table("data/argument_sample_table.p")

    test_data = DataLoader()
    test_data.read_stats(statsPath)
    test_data.load_pairs_counting("data/test_count.pkl")
    test_data.load_argument_sample_table("data/argument_sample_table.p")

    modelPath = "data/stage_one.model"
    word2vecModel = Word2Vec.load(modelPath)

    # Build a KeyedVectors view over the output ("context") embeddings learned
    # by negative sampling (syn1neg); vocab and index2word are shared with the
    # input side, only the vectors differ.
    context_wv = KeyedVectors(vector_size=300)
    context_wv.vocab = word2vecModel.wv.vocab
    context_wv.index2word = word2vecModel.wv.index2word
    context_wv.syn0 = word2vecModel.syn1neg

    pretrain_center_emb = list()
    pretrain_context_emb = list()
    counter = 0
    for i in range(len(train_data.id2word)):
        tmp_w = train_data.id2word[i]
        if tmp_w in context_wv.vocab:
            pretrain_center_emb.append(word2vecModel[tmp_w])
            pretrain_context_emb.append(context_wv[tmp_w])
        else:
            pretrain_center_emb.append(np.zeros(300))
            pretrain_context_emb.append(np.zeros(300))
            counter += 1
    print("empty count", counter)
    pretrain_center_emb = np.asarray(pretrain_center_emb)

    wordsim_dir = "./Word-similarity-dataset/Simlex/"
    with open(wordsim_dir + "verb.json", "r") as f:
        verb_list = json.load(f)
    with open(wordsim_dir + "noun.json", "r") as f:
        noun_list = json.load(f)
    with open(wordsim_dir + "adjective.json", "r") as f:
        adjective_list = json.load(f)
    with open(wordsim_dir + "all.json", "r") as f:
        all_list = json.load(f)
    simlex_corpora = [verb_list, noun_list, adjective_list, all_list]
    simlex_names = ["verb_list", "noun_list", "adjective_list", "all_list"]

    m = Model(train_data, args)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if args.pretrain:
            # Initial assign
            sess.run(m.emb_init,
                     feed_dict={m.emb_placeholder: pretrain_center_emb})
            sess.run(m.emb_init_context,
                     feed_dict={m.emb_placeholder_context: pretrain_context_emb})

        ws_test(sess, m, test_data, simlex_corpora, num_cross,
                args.relational_embedding_size, args.restrict)
        sp10k_test_overall(sess, m, train_data,
                           args.relational_embedding_size, args.restrict)
        test_keller_overall(sess, m, train_data)

        num_epoch = args.num_epoch
        num_batch = 256
        batch_size = 1024
        for epoch in range(num_epoch):
            print(" epoch:", str(epoch + 1), "/", num_epoch)
            process_bar = tqdm(range(num_batch))
            for i in process_bar:
                batch = train_data.get_sd_train_batch(batch_size)
                feed_dict = {
                    m.predicate_amod_ids: batch["amod"][:, 0],
                    m.argument_amod_ids: batch["amod"][:, 1],
                    m.argument_prime_amod_ids: batch["amod"][:, 2],
                    m.predicate_nsubj_ids: batch["nsubj"][:, 0],
                    m.argument_nsubj_ids: batch["nsubj"][:, 1],
                    m.argument_prime_nsubj_ids: batch["nsubj"][:, 2],
                    m.predicate_dobj_ids: batch["dobj"][:, 0],
                    m.argument_dobj_ids: batch["dobj"][:, 1],
                    m.argument_prime_dobj_ids: batch["dobj"][:, 2],
                }
                loss, _ = sess.run([m.loss, m.optimize], feed_dict=feed_dict)
                process_bar.set_description("Loss: %0.4f" % loss)

            # test
            ws_test(sess, m, test_data, simlex_corpora, num_cross,
                    args.relational_embedding_size, args.restrict)
            sp10k_test_overall(sess, m, train_data,
                               args.relational_embedding_size, args.restrict)
            test_keller_overall(sess, m, train_data)
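# Sketch of the emb_init pattern assumed above (m.emb_init, m.emb_placeholder);
# the variable names and sizes here are illustrative, not taken from Model. In
# TF1, a large pretrained matrix is loaded through a placeholder-backed assign
# op so the values are fed at session time instead of baked into the graph:
import tensorflow as tf

vocab_size, emb_dim = 50000, 300  # illustrative sizes
emb = tf.get_variable("emb", shape=[vocab_size, emb_dim])
emb_placeholder = tf.placeholder(tf.float32, shape=[vocab_size, emb_dim])
emb_init = emb.assign(emb_placeholder)
# later: sess.run(emb_init, feed_dict={emb_placeholder: pretrain_center_emb})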
print("text analysis") # text += print_info_length(corpus_labels, lines_corpus_splitted, "corpus docs" + conf, "words", True) text += print_info_length(queries_labels, lines_queries_splitted, "queries" + conf, "words", True) text += '\n' + str(corpus_model) print("done.") w1 = "night" outv = KeyedVectors(300) outv.vocab = corpus_model.wv.vocab # same outv.index2word = corpus_model.wv.index2word # same outv.syn0 = corpus_model.syn1neg # different text += '\nIN EMBEDDINGS COMPARISON:\n' + str( corpus_model.wv.most_similar(positive=[corpus_model[w1]], topn=6)) print("IN-IN done.") text += '\nOUT EMBEDDINGS COMPARISON:\n' + str( outv.most_similar(positive=[outv[w1]], topn=6)) print("OUT-OUT done.") text += '\nIN-OUT EMBEDDINGS COMPARISON:\n' + str( corpus_model.wv.most_similar(positive=[outv[w1]], topn=6)) print("IN-OUT done.") text += '\nOUT-IN EMBEDDINGS COMPARISON:\n' + str( outv.most_similar(positive=[corpus_model[w1]], topn=6)) print("OUT-IN done.") with open("data_analysis/data_analysis" + conf + ".txt", 'w') as file:
encoded_queries[query_id] = encode(query.title, word_dict)
print(query.title, encoded_queries[query_id])
encoded_queries_oov[query_id] = encode_oov(query.title, word_dict)
print(encoded_queries_oov)

idf_filename = "preprocessing/pre_data/idfs/idfs" + conf
idfs = load_from_pickle_file(idf_filename)
idfs = encode_idf(idfs, word_dict)

if not glv:
    # Output (context) embeddings: same vocab and index2word as the input
    # side, but the vectors come from syn1neg.
    outv = KeyedVectors(300)
    outv.vocab = model.wv.vocab  # same
    outv.index2word = model.wv.index2word  # same
    outv.syn0 = model.syn1neg  # different
    we_out = encode_we(outv, word_dict, glv)
we = encode_we(model, word_dict, glv)

max_query_len = max([len(q.title.split()) for q in queries_obj.values()])
padded_query_idfs = {}
padded_query_embs = {}
print("Encoding padded queries idf and embeddings")
for query_id, query in tqdm(encoded_queries.items()):
    # padding queries idfs and queries embeddings
    padded_query_idfs[query_id] = []
    padded_query_embs[query_id] = []
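# Hedged sketch of the padding step the loop above begins (the original body
# is truncated): zero-pad each query's idf list and embedding list out to
# max_query_len. The lookup shapes for `idfs` and `we` (indexed by token id,
# 300-dimensional vectors) are assumptions, not taken from the original.
for query_id, token_ids in encoded_queries.items():
    pad = max_query_len - len(token_ids)
    padded_query_idfs[query_id] = [idfs[t] for t in token_ids] + [0.0] * pad
    padded_query_embs[query_id] = [we[t] for t in token_ids] + [np.zeros(300)] * pad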