Example #1
# loading/merging word embeddings
t_embeddings = text_embeddings.Embeddings()
t_embeddings.load_embeddings(embs_path,
                             200000,
                             language='en',
                             print_loading=True,
                             skip_first_line=True)
t_embeddings.inverse_vocabularies()
vocabulary_size = len(t_embeddings.lang_vocabularies["en"])
embeddings = t_embeddings.lang_embeddings["en"].astype(np.float64)
embedding_size = t_embeddings.emb_sizes["en"]

# loading simlex and evaluating initial embeddings
simlex_path_en = simlex_path
simlex_entries_en = io_helper.load_csv_lines(simlex_path_en,
                                             delimiter='\t',
                                             indices=[0, 1, 3])
simlex_corr_en = appleveleval.evaluate_reps_simlex(t_embeddings,
                                                   simlex_entries_en,
                                                   lang="en",
                                                   lower=False)
print("Evaluation dataset correlation before specialization: " +
      str(simlex_corr_en))

# preparing simlex pairs for the computation of the new embeddings with the model
simlex_data = []
for sim_ent in simlex_entries_en:
    if (sim_ent[0] in t_embeddings.lang_vocabularies["en"]
            and sim_ent[1] in t_embeddings.lang_vocabularies["en"]):
        simlex_data.append((t_embeddings.lang_vocabularies["en"][sim_ent[0]],
                            t_embeddings.lang_vocabularies["en"][sim_ent[1]],
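Note: the call to appleveleval.evaluate_reps_simlex above presumably reports a Spearman correlation between cosine similarities of the word vectors and the gold SimLex scores. A minimal, self-contained sketch of that kind of evaluation (the function simlex_spearman and the word_vectors dict are illustrative names, not part of the library):

import numpy as np
from scipy.stats import spearmanr

def simlex_spearman(word_vectors, entries):
    # entries: (word1, word2, gold_score) triples; word_vectors: word -> np.ndarray
    gold, predicted = [], []
    for w1, w2, score in entries:
        if w1 in word_vectors and w2 in word_vectors:
            v1, v2 = word_vectors[w1], word_vectors[w2]
            cosine = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
            gold.append(float(score))
            predicted.append(cosine)
    # rank correlation between model similarities and human judgements
    return spearmanr(gold, predicted).correlation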
Example #2
act = tf.nn.tanh
noise = 0

model = wordpair_classifier.WordPairClassifier(embeddings, embedding_size, mlp_hidden_layer_sizes,
                                               same_mlp = same_mlp, bilinear_softmax = bilinear_softmax,
                                               num_mappings = num_mlps, activation = act,
                                               num_classes = len(dist_labels), noise_std = noise)
#model.define_optimization(loss_functions.softmax_cross_entropy, l2_reg_fac, lr, loss_function_params = None)

print("Initializing tensorflow session...")
session = tf.InteractiveSession()
session.run(tf.global_variables_initializer())

model.set_variable_values(session, vars)


###### preparing test set for predictions #######
print("Loading dataset...")
predict_set = io_helper.load_csv_lines(args.data, delimiter = '\t')
predict_wordpairs = [(x[0], x[1]) for x in predict_set]
#predict_labels = [x[2] for x in predict_set]
print("Preparing prediction examples...")
predict_pairs = data_shaper.prep_word_tuples(predict_wordpairs, t_embeddings, "default", labels = None)
#predict_labels, dl = data_shaper.prep_labels_one_hot_encoding(predict_labels, dist_labels)
predict_data = list(zip(predict_pairs, [None]*len(predict_pairs)))
	
###### predicting and evaluating ################
print("Computing predictions...")
feed_dict = build_feed_dict_func(model, predict_data, predict = True)[0]
preds = model.preds_raw.eval(session = session, feed_dict = feed_dict)
pred_labels = [dist_labels[np.argmax(p)] for p in preds]

if args.preds is not None:
	print("Writing predictions to file...")
	to_write = list(zip(
		[t_embeddings.get_word_from_index(x[0], lang = "default") for x in predict_pairs],
		[t_embeddings.get_word_from_index(x[1], lang = "default") for x in predict_pairs],
		pred_labels))
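The snippet above breaks off right after assembling to_write; a plain-Python way to dump those (word1, word2, predicted_label) triples to a tab-separated file, reusing args.preds as the output path as in the condition above, could look like this:

with open(args.preds, "w", encoding="utf8") as out_file:
    for word1, word2, label in to_write:
        out_file.write(word1 + "\t" + word2 + "\t" + str(label) + "\n")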
Example #3
print("Loading pre-trained embeddings...")
t_embeddings = text_embeddings.Embeddings()
t_embeddings.load_embeddings(embs_path,
                             200000,
                             language='en',
                             print_loading=True,
                             skip_first_line=False)
t_embeddings.inverse_vocabularies()
vocabulary_size = len(t_embeddings.lang_vocabularies["en"])
embeddings = t_embeddings.lang_embeddings["en"].astype(np.float64)
embedding_size = t_embeddings.emb_sizes["en"]

# data loading and preprocessing
synonym_pairs = data_shaper.prep_word_tuples(
    [[x[0].split('_')[1], x[1].split('_')[1]]
     for x in io_helper.load_csv_lines(synonyms_path, delimiter=' ')],
    t_embeddings, "en")
antonym_pairs = data_shaper.prep_word_tuples(
    [[x[0].split('_')[1], x[1].split('_')[1]]
     for x in io_helper.load_csv_lines(antonyms_path, delimiter=' ')],
    t_embeddings, "en")
all_pairs = []
all_pairs.extend(synonym_pairs)
all_pairs.extend(antonym_pairs)
print("Num syn pairs: " + str(len(synonym_pairs)))
print("Num ant pairs: " + str(len(antonym_pairs)))

syn_constraints_dict = {
    (str(i1) + ":" + str(i2) if i1 < i2 else str(i2) + ":" + str(i1)): 0
    for (i1, i2) in synonym_pairs
}
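The comprehension above stores every pair under a single order-independent key of the form "smallerIndex:largerIndex", so (i, j) and (j, i) collapse to the same constraint. A tiny illustration with made-up indices:

pairs = [(42, 7), (7, 42)]
keys = {(str(i1) + ":" + str(i2) if i1 < i2 else str(i2) + ":" + str(i1)): 0
        for (i1, i2) in pairs}
print(keys)  # {'7:42': 0} -- both orderings map to one entry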
Example #4
if (args.modelpath is not None
        and not os.path.isdir(os.path.dirname(args.modelpath))
        and not os.path.dirname(args.modelpath) == ""):
	print("Error: Directory of the desired model output path not found.")
	exit(code = 1)

# loading/merging word embeddings
print("Loading pre-trained embeddings...")
t_embeddings = text_embeddings.Embeddings()
t_embeddings.load_embeddings(embs_path,
                             200000,
                             language='en',
                             print_loading=True,
                             skip_first_line=False)
t_embeddings.inverse_vocabularies()
vocabulary_size = len(t_embeddings.lang_vocabularies["en"])
embeddings = t_embeddings.lang_embeddings["en"].astype(np.float64)
embedding_size = t_embeddings.emb_sizes["en"]

# data loading and preprocessing
synonym_pairs = data_shaper.prep_word_tuples(
    [[x[0].split('_')[1], x[1].split('_')[1]]
     for x in io_helper.load_csv_lines(synonyms_path, delimiter=' ')],
    t_embeddings, "en")
antonym_pairs = data_shaper.prep_word_tuples(
    [[x[0].split('_')[1], x[1].split('_')[1]]
     for x in io_helper.load_csv_lines(antonyms_path, delimiter=' ')],
    t_embeddings, "en")
all_pairs = []
all_pairs.extend(synonym_pairs)
all_pairs.extend(antonym_pairs)
print("Num syn pairs: " + str(len(synonym_pairs)))
print("Num ant pairs: " + str(len(antonym_pairs)))

syn_constraints_dict = {
    (str(i1) + ":" + str(i2) if i1 < i2 else str(i2) + ":" + str(i1)): 0
    for (i1, i2) in synonym_pairs
}
ant_constraints_dict = {
    (str(i1) + ":" + str(i2) if i1 < i2 else str(i2) + ":" + str(i1)): 0
    for (i1, i2) in antonym_pairs
}

# FAISS wrapper for quick comparison of vectors; initially we index the starting distributional vectors
print("Building FAISS index for fast retrieval of most similar vectors...")
faisser = faiss_sts.Faiss(embedding_size)
faisser.index(None,
              t_embeddings.lang_embeddings["en"],
              matrix_normalized=False,
              measure=distance_measure)
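faiss_sts.Faiss here wraps a FAISS index over the distributional vectors so that the most similar vectors can be retrieved quickly during specialization. A rough sketch of the same idea using the faiss library directly, assuming cosine similarity is the intended distance_measure (variable names are illustrative):

import faiss
import numpy as np

vectors = t_embeddings.lang_embeddings["en"].astype(np.float32)  # FAISS works on float32
faiss.normalize_L2(vectors)                # L2-normalize rows so inner product == cosine
index = faiss.IndexFlatIP(embedding_size)  # exact inner-product search
index.add(vectors)
# 10 nearest neighbours (including the word itself) for the first five vocabulary entries
sims, neighbour_ids = index.search(vectors[:5], 10)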