# key in data name
import sys, os

name = sys.argv[1]

# load data
from py.utils.load_data import read_dataset

X_train, _, X_test, _ = read_dataset(name)

from py.utils.sent2vec import sent2vec
from py.utils.safe_pickle import pickle_dump
from tqdm import tqdm

# p-mean calculator: concatenate the element-wise power means of a sentence's
# word vectors, one 300-dim block per power p. The complex dtype keeps
# fractional powers of negative components well-defined; only the real part
# of the result is kept.
import numpy


def p_mean_vector(powers, vectors):
    if len(vectors) <= 1:
        return numpy.zeros(300 * len(powers))
    embeddings = []
    for p in powers:
        embeddings.append(
            numpy.power(
                numpy.mean(numpy.power(numpy.array(vectors, dtype=complex), p), axis=0),
                1 / p).real)
    return numpy.hstack(embeddings)


powers_list = [[1.0], [1.0, 2.0], [1.0, 2.0, 3.0], [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]]
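# Illustrative sanity check of p_mean_vector (a sketch: random vectors stand
# in for real word embeddings). For p = 1 the power mean reduces to the plain
# average; each additional power appends another 300-dim block, so the output
# length is 300 * len(powers).
demo_vectors = [numpy.random.rand(300) for _ in range(5)]
for demo_powers in powers_list:
    demo_emb = p_mean_vector(demo_powers, demo_vectors)
    assert demo_emb.shape == (300 * len(demo_powers),)
    print(demo_powers, "->", demo_emb.shape)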
fname = "../../../produced/" + distname + "_" + dataset + ".numpyz.npz" from scipy.spatial.distance import squareform distmat = squareform(load(fname)["dist"]) distmat = nan_to_num(distmat) distmat[distmat < 0] = 0.0 distmat = distmat / max(distmat) print(distmat.shape) # need to split into train-test # to do this, need to load original and get the indices # load data from py.utils.load_data import read_dataset X_train, Y_train, X_test, Y_test = read_dataset(dataset) train_idx = len(X_train) X_train_mat = distmat[:train_idx, :train_idx] X_test_train_mat = distmat[train_idx:, :train_idx] from sklearn.neighbors import KNeighborsClassifier as KNN from sklearn.metrics import classification_report, accuracy_score knn = KNN(n_neighbors=1, metric="precomputed") knn.fit(X_train_mat, Y_train) predict = knn.predict(X_test_train_mat) report = classification_report(Y_test, predict, digits=5) acc = accuracy_score(Y_test, predict)
dataset = "amazon" distname = "wmddist" best_params = "100-1.0-1.0" from py.utils.safe_pickle import pickle_load embeddings = pickle_load("../../../exact_embeddings/" + distname + "_" + dataset + "/" + best_params + ".p") print(embeddings.shape) from py.utils.load_data import read_dataset X_train, _, X_test, _ = read_dataset(dataset) all_sent = X_train + X_test print(len(all_sent)) from scipy.spatial.distance import cosine def sim_sent(embeddings, query_idx): dist = 1 best_match_idx = None query_emb = embeddings[query_idx] for i in range(len(embeddings)): if i != idx: d = cosine(query_emb, embeddings[i]) if d < dist: dist = d best_match_idx = i return 1 - dist, best_match_idx
# key in data name
import sys

name = sys.argv[1]

# load data and write in a suitable format for wmd code
from py.utils.load_data import read_dataset

X_train, Y_train, X_test, Y_test = read_dataset(name)
with open("../../../data/" + name + "_for_wmd.txt", "w") as f:
    for i in range(len(X_train)):
        f.write("{}\t{}\n".format(Y_train[i], X_train[i]))
    for i in range(len(X_test)):
        f.write("{}\t{}\n".format(Y_test[i], X_test[i]))

from py.distances.wmd.get_word_vectors import read_line_by_line
import gensim

# load word2vec model (trained on Google News)
model = gensim.models.KeyedVectors.load_word2vec_format(
    '../../../resources/GoogleNews-vectors-negative300.bin.gz', binary=True)
vec_size = 300

# specify train/test datasets
train_dataset = "../../../data/" + name + "_for_wmd.txt"  # e.g.: 'twitter.txt'
save_file = "../../../produced/" + name + "_vec.pk"  # e.g.: 'twitter.pk'

# read document data
(X, BOW_X, y, C, words) = read_line_by_line(train_dataset, [], model, vec_size)

# save pickle of extracted variables
import pickle
with open(save_file, 'wb') as f:
    pickle.dump([X, BOW_X, y, C, words], f)
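# Illustrative round-trip check (a sketch mirroring the dump above): reload
# the pickle and confirm one entry per document came back.
with open(save_file, 'rb') as f:
    X_chk, BOW_X_chk, y_chk, C_chk, words_chk = pickle.load(f)
print(len(X_chk), "documents reloaded from", save_file)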