# loading/merging word embeddings
t_embeddings = text_embeddings.Embeddings()
t_embeddings.load_embeddings(embs_path, 200000, language='en', print_loading=True, skip_first_line=True)
t_embeddings.inverse_vocabularies()

vocabulary_size = len(t_embeddings.lang_vocabularies["en"])
embeddings = t_embeddings.lang_embeddings["en"].astype(np.float64)
embedding_size = t_embeddings.emb_sizes["en"]

# loading simlex and evaluating initial embeddings
simlex_path_en = simlex_path
simlex_entries_en = io_helper.load_csv_lines(simlex_path_en, delimiter='\t', indices=[0, 1, 3])
simlex_corr_en = appleveleval.evaluate_reps_simlex(t_embeddings, simlex_entries_en, lang="en", lower=False)
print("Evaluation dataset correlation before specialization: " + str(simlex_corr_en))

# preparing simlex pairs for the computation of the new embeddings with the model
simlex_data = []
for sim_ent in simlex_entries_en:
    if sim_ent[0] in t_embeddings.lang_vocabularies["en"] and sim_ent[1] in t_embeddings.lang_vocabularies["en"]:
        # keep the two vocabulary indices together with the gold similarity score
        # (sim_ent[2] corresponds to column 3 of the SimLex file loaded above)
        simlex_data.append((t_embeddings.lang_vocabularies["en"][sim_ent[0]],
                            t_embeddings.lang_vocabularies["en"][sim_ent[1]],
                            float(sim_ent[2])))
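# For reference, evaluate_reps_simlex above is assumed to compute the Spearman
# correlation between cosine similarities of the word vectors and the gold
# SimLex scores. A minimal standalone sketch of that evaluation (the helper
# name simlex_spearman_sketch and the scipy dependency are assumptions, not
# part of the original code):
def simlex_spearman_sketch(word_vectors, vocab, entries):
    import numpy as np
    from scipy.stats import spearmanr
    gold, predicted = [], []
    for w1, w2, score in entries:
        if w1 in vocab and w2 in vocab:
            v1 = word_vectors[vocab[w1]]
            v2 = word_vectors[vocab[w2]]
            cos = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
            gold.append(float(score))
            predicted.append(cos)
    return spearmanr(gold, predicted).correlation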
act = tf.nn.tanh
noise = 0

model = wordpair_classifier.WordPairClassifier(embeddings, embedding_size, mlp_hidden_layer_sizes,
                                               same_mlp=same_mlp, bilinear_softmax=bilinear_softmax,
                                               num_mappings=num_mlps, activation=act,
                                               num_classes=len(dist_labels), noise_std=noise)
# model.define_optimization(loss_functions.softmax_cross_entropy, l2_reg_fac, lr, loss_function_params=None)

print("Initializing tensorflow session...")
session = tf.InteractiveSession()
session.run(tf.global_variables_initializer())
model.set_variable_values(session, vars)

###### preparing test set for predictions #######
print("Loading dataset...")
predict_set = io_helper.load_csv_lines(args.data, delimiter='\t')
predict_wordpairs = [(x[0], x[1]) for x in predict_set]
# predict_labels = [x[2] for x in predict_set]

print("Preparing prediction examples...")
predict_pairs = data_shaper.prep_word_tuples(predict_wordpairs, t_embeddings, "default", labels=None)
# predict_labels, dl = data_shaper.prep_labels_one_hot_encoding(predict_labels, dist_labels)
predict_data = list(zip(predict_pairs, [None] * len(predict_pairs)))

###### predicting and evaluating ################
print("Computing predictions...")
preds = model.preds_raw.eval(session=session, feed_dict=build_feed_dict_func(model, predict_data, predict=True)[0])
pred_labels = [dist_labels[np.argmax(p)] for p in preds]

if args.preds is not None:
    print("Writing predictions to file...")
    to_write = list(zip([t_embeddings.get_word_from_index(x[0], lang="default") for x in predict_pairs],
                        [t_embeddings.get_word_from_index(x[1], lang="default") for x in predict_pairs],
                        pred_labels))
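    # The write step that follows to_write is not shown in the excerpt above.
    # A minimal sketch using plain file I/O (args.preds is the output path from
    # the original; the tab-separated layout itself is an assumption):
    with open(args.preds, "w", encoding="utf8") as out_file:
        for word1, word2, label in to_write:
            out_file.write(word1 + "\t" + word2 + "\t" + label + "\n")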
print("Loading pre-trained embeddings...") t_embeddings = text_embeddings.Embeddings() t_embeddings.load_embeddings(embs_path, 200000, language='en', print_loading=True, skip_first_line=False) t_embeddings.inverse_vocabularies() vocabulary_size = len(t_embeddings.lang_vocabularies["en"]) embeddings = t_embeddings.lang_embeddings["en"].astype(np.float64) embedding_size = t_embeddings.emb_sizes["en"] # data loading and preprocessing synonym_pairs = data_shaper.prep_word_tuples( [[x[0].split('_')[1], x[1].split('_')[1]] for x in io_helper.load_csv_lines(synonyms_path, delimiter=' ')], t_embeddings, "en") antonym_pairs = data_shaper.prep_word_tuples( [[x[0].split('_')[1], x[1].split('_')[1]] for x in io_helper.load_csv_lines(antonyms_path, delimiter=' ')], t_embeddings, "en") all_pairs = [] all_pairs.extend(synonym_pairs) all_pairs.extend(antonym_pairs) print("Num syn pairs: " + str(len(synonym_pairs))) print("Num ant pairs: " + str(len(antonym_pairs))) syn_constraints_dict = { (str(i1) + ":" + str(i2) if i1 < i2 else str(i2) + ":" + str(i1)): 0 for (i1, i2) in synonym_pairs }
if args.modelpath is not None and not os.path.isdir(os.path.dirname(args.modelpath)) and not os.path.dirname(args.modelpath) == "":
    print("Error: Directory of the desired model output path not found.")
    exit(code=1)

# loading/merging word embeddings
print("Loading pre-trained embeddings...")
t_embeddings = text_embeddings.Embeddings()
t_embeddings.load_embeddings(embs_path, 200000, language='en', print_loading=True, skip_first_line=False)
t_embeddings.inverse_vocabularies()

vocabulary_size = len(t_embeddings.lang_vocabularies["en"])
embeddings = t_embeddings.lang_embeddings["en"].astype(np.float64)
embedding_size = t_embeddings.emb_sizes["en"]

# data loading and preprocessing
synonym_pairs = data_shaper.prep_word_tuples(
    [[x[0].split('_')[1], x[1].split('_')[1]] for x in io_helper.load_csv_lines(synonyms_path, delimiter=' ')],
    t_embeddings, "en")
antonym_pairs = data_shaper.prep_word_tuples(
    [[x[0].split('_')[1], x[1].split('_')[1]] for x in io_helper.load_csv_lines(antonyms_path, delimiter=' ')],
    t_embeddings, "en")

all_pairs = []
all_pairs.extend(synonym_pairs)
all_pairs.extend(antonym_pairs)

print("Num syn pairs: " + str(len(synonym_pairs)))
print("Num ant pairs: " + str(len(antonym_pairs)))

syn_constraints_dict = {
    (str(i1) + ":" + str(i2) if i1 < i2 else str(i2) + ":" + str(i1)): 0
    for (i1, i2) in synonym_pairs
}
ant_constraints_dict = {
    (str(i1) + ":" + str(i2) if i1 < i2 else str(i2) + ":" + str(i1)): 0
    for (i1, i2) in antonym_pairs
}

# Faiss wrapper for quick comparison of vectors; initially we index the starting distributional vectors
print("Building FAISS index for fast retrieval of most similar vectors...")
faisser = faiss_sts.Faiss(embedding_size)
faisser.index(None, t_embeddings.lang_embeddings["en"], matrix_normalized=False, measure=distance_measure)
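# The faiss_sts.Faiss wrapper is project-specific. For reference, a minimal
# sketch of the same idea using the faiss library directly, assuming cosine
# similarity as the distance measure (vectors are L2-normalized and indexed
# for exact inner-product search); this is an illustration, not the wrapper's
# actual implementation:
import faiss

vectors = t_embeddings.lang_embeddings["en"].astype(np.float32).copy()
faiss.normalize_L2(vectors)                     # in-place L2 normalization
flat_index = faiss.IndexFlatIP(embedding_size)  # exact inner-product (cosine on normalized vectors)
flat_index.add(vectors)
# retrieve the 10 nearest neighbours of the first 5 vocabulary entries
similarities, neighbour_ids = flat_index.search(vectors[:5], 10)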