def distance(a, b):
    # Prints the Frobenius and TriUL scores; note that it returns the
    # converted input arrays, not the computed distances.
    a = np.array(a)
    b = np.array(b)
    f = Frobenius_Distance(a, b)
    t = TriUL_sim(a, b)
    print("Distance: ", round(f, 3), round(t, 3))
    return [a, b]
def worker(method, val1, val2, send_end):
    # Computes one similarity metric in a child process and reports the
    # result through the write end of a multiprocessing Pipe.
    # Method codes: 1 = cosine, 2 = Frobenius, 3 = TriUL.
    result = 0.0
    if method == 1:
        result = cosine_similarity(val1, val2)[0][0]
    elif method == 2:
        result = Frobenius_Distance(val1, val2)
    elif method == 3:
        result = TriUL_sim(val1, val2)
    send_end.send(result)
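# A minimal driver sketch for worker() (hypothetical helper, not part of the
# original module): each of the three metrics runs in its own process and
# reports back through a one-way Pipe, mirroring the disabled block in
# tfidf_similarity() below. Assumes `import multiprocessing` at module top.
def parallel_metrics(val1, val2):
    jobs, pipe_list = [], []
    for method in (1, 2, 3):
        recv_end, send_end = multiprocessing.Pipe(False)
        p = multiprocessing.Process(target=worker,
                                    args=(method, val1, val2, send_end))
        jobs.append(p)
        pipe_list.append(recv_end)
        p.start()
    results = [r.recv() for r in pipe_list]  # [cosine, frobenius, triUL]
    for p in jobs:
        p.join()
    return results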
def dense_ths_encoder(d, y, n_features, m_l):
    # Load the dense embedding model.
    max_len = m_l
    model_name = 'dense_t1'  # dense_t1, dense_t2, full_lstm, full_gru, gru_lstm
    auto_encoder, encoder = autoencoder(model_name, n_features, max_len)  # , init=kernel_init)
    # NOTE: auto_encoder/encoder are built but never applied below; the
    # similarity is computed on the averaged input embeddings directly.
    new_emb = d
    i = 0
    final_map_s1 = []
    final_map_s2 = []
    final_map_s3 = []
    start = time.time()
    for r1 in new_emb:
        temp_map_s1 = []  # cosine similarity
        temp_map_s2 = []  # frobenius similarity
        temp_map_s3 = []  # triUL similarity
        av_r1 = np.average(r1, axis=0)
        av1 = av_r1.reshape(1, -1)
        j = 0
        for r2 in new_emb:
            if i != j:
                av_r2 = np.average(r2, axis=0)
                av2 = av_r2.reshape(1, -1)
                result_list = [
                    cosine_similarity(av1, av2)[0][0],
                    Frobenius_Distance(av1, av2),
                    TriUL_sim(av1, av2)
                ]
                temp_map_s1.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[0]])
                temp_map_s2.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[1]])
                temp_map_s3.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[2]])
            j += 1
        # Sort each map by the score column (4), descending, and keep the top 5.
        temp_map_s1 = np.array(temp_map_s1)
        temp_map_s2 = np.array(temp_map_s2)
        temp_map_s3 = np.array(temp_map_s3)
        temp_map_s1 = temp_map_s1[temp_map_s1[:, 4].argsort()[::-1]][:5]
        temp_map_s2 = temp_map_s2[temp_map_s2[:, 4].argsort()[::-1]][:5]
        temp_map_s3 = temp_map_s3[temp_map_s3[:, 4].argsort()[::-1]][:5]
        final_map_s1.append(temp_map_s1)
        final_map_s2.append(temp_map_s2)
        final_map_s3.append(temp_map_s3)
        print("i: ", i)
        i += 1
        if i % 100 == 0:
            print("Row: ", i, "\n")
    end = time.time()
    print("dense encoder similarity computed", (end - start) / 60, " minutes")
    return final_map_s1, final_map_s2, final_map_s3
def universal_sentence_encoder(d, y):
    embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
    start = time.time()
    final_map_s1 = []
    final_map_s2 = []
    final_map_s3 = []
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        embeddings = sess.run(embed(d))
    print("universal encoder ready")
    i = 0
    for r1 in embeddings:
        r1 = r1.reshape(1, -1)
        temp_map_s1 = []  # cosine similarity
        temp_map_s2 = []  # frobenius similarity
        temp_map_s3 = []  # triUL similarity
        j = 0
        for r2 in embeddings:
            r2 = r2.reshape(1, -1)
            if i != j:
                result_list = [
                    cosine_similarity(r1, r2)[0][0],
                    Frobenius_Distance(r1, r2),
                    TriUL_sim(r1, r2)
                ]
                temp_map_s1.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[0]])
                temp_map_s2.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[1]])
                temp_map_s3.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[2]])
            j += 1
        temp_map_s1 = np.array(temp_map_s1)
        temp_map_s2 = np.array(temp_map_s2)
        temp_map_s3 = np.array(temp_map_s3)
        temp_map_s1 = temp_map_s1[temp_map_s1[:, 4].argsort()[::-1]][:5]
        temp_map_s2 = temp_map_s2[temp_map_s2[:, 4].argsort()[::-1]][:5]
        temp_map_s3 = temp_map_s3[temp_map_s3[:, 4].argsort()[::-1]][:5]
        final_map_s1.append(temp_map_s1)
        final_map_s2.append(temp_map_s2)
        final_map_s3.append(temp_map_s3)
        print("i: ", i)
        i += 1
        if i % 100 == 0:
            print("Row: ", i, "\n")
    end = time.time()
    print("Universal Encoder similarity computed", (end - start) / 60, " minutes")
    return final_map_s1, final_map_s2, final_map_s3
def encoder_infersent(d, y):
    start = time.time()
    model_version = 1
    MODEL_PATH = "../test/data/infersent%s.pickle" % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    use_cuda = True
    model = model.cuda() if use_cuda else model
    W2V_PATH = '../test/data/glove.840B.300d.txt' if model_version == 1 else '../test/data/crawl-300d-2M.vec'
    model.set_w2v_path(W2V_PATH)
    model.build_vocab_k_words(K=100000)
    embeddings = model.encode(d, bsize=128, tokenize=False, verbose=True)
    print("InferSent embedding computed")
    i = 0
    final_map_s1 = []
    final_map_s2 = []
    final_map_s3 = []
    for r1 in embeddings:
        r1 = r1.reshape(1, -1)
        temp_map_s1 = []  # cosine similarity
        temp_map_s2 = []  # frobenius similarity
        temp_map_s3 = []  # triUL similarity
        j = 0
        for r2 in embeddings:
            r2 = r2.reshape(1, -1)
            if i != j:
                result_list = [
                    cosine_similarity(r1, r2)[0][0],
                    Frobenius_Distance(r1, r2),
                    TriUL_sim(r1, r2)
                ]
                temp_map_s1.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[0]])
                temp_map_s2.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[1]])
                temp_map_s3.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[2]])
            j += 1
        temp_map_s1 = np.array(temp_map_s1)
        temp_map_s2 = np.array(temp_map_s2)
        temp_map_s3 = np.array(temp_map_s3)
        temp_map_s1 = temp_map_s1[temp_map_s1[:, 4].argsort()[::-1]][:5]
        temp_map_s2 = temp_map_s2[temp_map_s2[:, 4].argsort()[::-1]][:5]
        temp_map_s3 = temp_map_s3[temp_map_s3[:, 4].argsort()[::-1]][:5]
        final_map_s1.append(temp_map_s1)
        final_map_s2.append(temp_map_s2)
        final_map_s3.append(temp_map_s3)
        print("i: ", i)
        i += 1
        if i % 100 == 0:
            print("Row: ", i, "\n")
    end = time.time()
    print("InferSent similarity computed", (end - start) / 60, " minutes")
    return final_map_s1, final_map_s2, final_map_s3
def latent_semantic_indexing(d, y):
    start = time.time()
    vec = CountVectorizer()
    x = vec.fit_transform(d)
    df = pd.DataFrame(x.toarray(), columns=vec.get_feature_names())
    matrix = df.values
    # [v, s, u] = svds(matrix.astype("float64"), k=2)
    # u = u.T
    # np.linalg.svd returns (U, S, Vt): U holds document coordinates and
    # Vt holds term coordinates, hence the explicit names below.
    [doc_u, s, term_vt] = np.linalg.svd(matrix, full_matrices=True)
    uk = term_vt[0:2].T  # V_k: n_terms x 2 term coordinates
    vk = doc_u[:, 0:2]   # U_k: n_docs x 2 document coordinates
    sk = np.diag(s[0:2])
    # vtk = vk.T  # To plot 2D
    sk_inverse = np.diag(np.power(sk.diagonal(), -1))  # S_k^(-1)
    final_map_s1 = []
    final_map_s2 = []
    final_map_s3 = []
    i = 0
    for row in matrix:
        row = row.reshape((1, row.shape[0]))
        row_coordinate = np.dot(np.dot(row, uk), sk_inverse)  # fold-in: q_k = q x V_k x S_k^(-1)
        temp_map_s1 = []  # cosine similarity
        temp_map_s2 = []  # frobenius similarity
        temp_map_s3 = []  # triUL similarity
        j = 0
        for e in vk:
            if i != j:
                ev_val = e.reshape((1, 2))
                result_list = [
                    cosine_similarity(row_coordinate, ev_val)[0][0],
                    Frobenius_Distance(row_coordinate, ev_val),
                    TriUL_sim(row_coordinate, ev_val)
                ]
                temp_map_s1.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[0]])
                temp_map_s2.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[1]])
                temp_map_s3.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[2]])
            j += 1
        temp_map_s1 = np.array(temp_map_s1)
        temp_map_s2 = np.array(temp_map_s2)
        temp_map_s3 = np.array(temp_map_s3)
        temp_map_s1 = temp_map_s1[temp_map_s1[:, 4].argsort()[::-1]][:5]
        temp_map_s2 = temp_map_s2[temp_map_s2[:, 4].argsort()[::-1]][:5]
        temp_map_s3 = temp_map_s3[temp_map_s3[:, 4].argsort()[::-1]][:5]
        final_map_s1.append(temp_map_s1)
        final_map_s2.append(temp_map_s2)
        final_map_s3.append(temp_map_s3)
        print("i: ", i)
        i += 1
        if i % 100 == 0:
            print("Row: ", i, "\n")
    end = time.time()
    print("Latent Semantic Indexing similarity computed", (end - start) / 60, " minutes")
    return final_map_s1, final_map_s2, final_map_s3
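# Folding-in sketch for the LSI space above (hypothetical helper, not part of
# the original module): a new query is projected with q_k = q x V_k x S_k^(-1),
# the same mapping as row_coordinate in latent_semantic_indexing(). The
# parameters vec, uk and sk_inverse are assumed to be the fitted
# CountVectorizer, term matrix and inverse singular-value matrix built there.
def fold_in_query(text, vec, uk, sk_inverse):
    q = vec.transform([text]).toarray()  # 1 x n_terms bag-of-words row
    return np.dot(np.dot(q, uk), sk_inverse)  # 1 x 2 LSI coordinates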
def smooth_inverse_frequency(d, y, glove, m_l):
    d = fix_text_format(d)  # without labels: data = fix_text_format(data)
    print("Data cleaned")
    t = 1  # type = 1: Spacy Lemmatizer, 2: Pattern Lemmatizer
    d = lemmatizer(t, d)
    print("data lemmatizer finished")
    d = remove_stopwords(d)
    print("stop words removed")
    tokenized_list = tokenizer_sentence_list(d)
    word_counter = Counter(itertools.chain(*tokenized_list))
    se = get_glove(glove)
    print("glove loaded")
    start = time.time()
    a = 1e-3
    pre_emb = []
    for sentence in tokenized_list:
        token_length = len(sentence)
        vs = None
        for w in sentence:
            a_value = a / (a + word_counter[w] / len(word_counter))  # smooth inverse frequency, SIF
            wv = np.multiply(a_value, se.map_sentence(w, m_l)).sum(axis=0)
            vs = wv if vs is None else vs + wv  # vs += sif * word_vector
        vs = vs / float(token_length)  # weighted average
        pre_emb.append(vs)
    # Common-component removal: project out the first singular vector.
    [_, _, u] = svds(np.array(pre_emb), k=1)
    new_emb = []
    for v_s in pre_emb:
        v_s = v_s - v_s.dot(u * u.transpose())
        new_emb.append(v_s)
    print("SIF computed")
    i = 0
    final_map_s1 = []
    final_map_s2 = []
    final_map_s3 = []
    for r1 in new_emb:
        temp_map_s1 = []  # cosine similarity
        temp_map_s2 = []  # frobenius similarity
        temp_map_s3 = []  # triUL similarity
        r1 = r1.reshape(1, -1)
        j = 0
        for r2 in new_emb:
            if i != j:
                r2 = r2.reshape(1, -1)
                result_list = [
                    cosine_similarity(r1, r2)[0][0],
                    Frobenius_Distance(r1, r2),
                    TriUL_sim(r1, r2)
                ]
                temp_map_s1.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[0]])
                temp_map_s2.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[1]])
                temp_map_s3.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[2]])
            j += 1
        temp_map_s1 = np.array(temp_map_s1)
        temp_map_s2 = np.array(temp_map_s2)
        temp_map_s3 = np.array(temp_map_s3)
        temp_map_s1 = temp_map_s1[temp_map_s1[:, 4].argsort()[::-1]][:5]
        temp_map_s2 = temp_map_s2[temp_map_s2[:, 4].argsort()[::-1]][:5]
        temp_map_s3 = temp_map_s3[temp_map_s3[:, 4].argsort()[::-1]][:5]
        final_map_s1.append(temp_map_s1)
        final_map_s2.append(temp_map_s2)
        final_map_s3.append(temp_map_s3)
        print("i: ", i)
        i += 1
        if i % 100 == 0:
            print("Row: ", i, "\n")
    end = time.time()
    print("Smooth Inverse Frequency similarity computed", (end - start) / 60, " minutes")
    return final_map_s1, final_map_s2, final_map_s3
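# Quick numeric check of the SIF weight a / (a + p(w)) used above (values are
# illustrative, not from the original data): frequent words get weights near
# zero, rare words near one.
#   a = 1e-3, p("the")  = 0.05  ->  0.001 / 0.051   ~= 0.020
#   a = 1e-3, p("rare") = 1e-5  ->  0.001 / 0.00101 ~= 0.990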
def average_embedding(d, y):
    i = 0
    final_map_s1 = []
    final_map_s2 = []
    final_map_s3 = []
    start = time.time()
    for r1 in d:
        temp_map_s1 = []  # cosine similarity
        temp_map_s2 = []  # frobenius similarity
        temp_map_s3 = []  # triUL similarity
        av_r1 = np.average(r1, axis=0)
        av1 = av_r1.reshape(1, -1)
        j = 0
        for r2 in d:
            if i != j:
                av_r2 = np.average(r2, axis=0)
                av2 = av_r2.reshape(1, -1)
                result_list = [
                    cosine_similarity(av1, av2)[0][0],
                    Frobenius_Distance(av1, av2),
                    TriUL_sim(av1, av2)
                ]
                temp_map_s1.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[0]])
                temp_map_s2.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[1]])
                temp_map_s3.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[2]])
            j += 1
        temp_map_s1 = np.array(temp_map_s1)
        temp_map_s2 = np.array(temp_map_s2)
        temp_map_s3 = np.array(temp_map_s3)
        temp_map_s1 = temp_map_s1[temp_map_s1[:, 4].argsort()[::-1]][:5]
        temp_map_s2 = temp_map_s2[temp_map_s2[:, 4].argsort()[::-1]][:5]
        temp_map_s3 = temp_map_s3[temp_map_s3[:, 4].argsort()[::-1]][:5]
        final_map_s1.append(temp_map_s1)
        final_map_s2.append(temp_map_s2)
        final_map_s3.append(temp_map_s3)
        print("i: ", i)
        i += 1
        if i % 100 == 0:
            print("Row: ", i, "\n")
    end = time.time()
    print("average embedding similarity computed", (end - start) / 60, " minutes")
    return final_map_s1, final_map_s2, final_map_s3
def tfidf_similarity(d, y):
    start = time.time()
    tfidf_vectorizer = TfidfVectorizer()
    i = 0
    final_map_s1 = []
    final_map_s2 = []
    final_map_s3 = []
    for r1 in d:
        temp_map_s1 = []  # cosine similarity
        temp_map_s2 = []  # frobenius similarity
        temp_map_s3 = []  # triUL similarity
        j = 0
        for r2 in d:
            if i != j:
                documents = (r1, r2)
                tfidf_matrix = tfidf_vectorizer.fit_transform(documents).toarray()
                row1 = np.array(tfidf_matrix[0]).reshape(1, -1)
                row2 = np.array(tfidf_matrix[1]).reshape(1, -1)
                # Disabled multiprocessing variant. Note two fixes relative to
                # the original dead code: the loop variable must not shadow the
                # outer i, and worker() expects method codes 1-3, not 0-2.
                """
                jobs = []
                pipe_list = []
                for method in (1, 2, 3):
                    recv, send = multiprocessing.Pipe(False)
                    p = multiprocessing.Process(target=worker, args=(method, row1, row2, send))
                    jobs.append(p)
                    pipe_list.append(recv)
                    p.start()
                result_list = [x.recv() for x in pipe_list]
                """
                result_list = [
                    cosine_similarity(row1, row2)[0][0],
                    Frobenius_Distance(row1, row2),
                    TriUL_sim(row1, row2)
                ]
                temp_map_s1.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[0]])
                temp_map_s2.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[1]])
                temp_map_s3.append([round(i, 1), round(y[i], 1), round(j, 1), round(y[j], 1), result_list[2]])
            j += 1
        temp_map_s1 = np.array(temp_map_s1)
        temp_map_s2 = np.array(temp_map_s2)
        temp_map_s3 = np.array(temp_map_s3)
        temp_map_s1 = temp_map_s1[temp_map_s1[:, 4].argsort()[::-1]][:5]
        temp_map_s2 = temp_map_s2[temp_map_s2[:, 4].argsort()[::-1]][:5]
        temp_map_s3 = temp_map_s3[temp_map_s3[:, 4].argsort()[::-1]][:5]
        final_map_s1.append(temp_map_s1)
        final_map_s2.append(temp_map_s2)
        final_map_s3.append(temp_map_s3)
        print("i: ", i)
        i += 1
        if i % 100 == 0:
            print("Row: ", i, "\n")
    end = time.time()
    print("tf-idf similarity computed", (end - start) / 60, " minutes")
    return final_map_s1, final_map_s2, final_map_s3
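# The three-metric / top-5 neighbour loop is duplicated across the encoder
# functions above. A minimal refactoring sketch (hypothetical helper, not part
# of the original module): given a list of 1-D sentence vectors and labels y,
# it produces the same three ranked maps. Assumes np, cosine_similarity,
# Frobenius_Distance and TriUL_sim are imported as in this file.
def top_k_similarity_maps(vectors, y, k=5):
    maps_s1, maps_s2, maps_s3 = [], [], []
    for i, r1 in enumerate(vectors):
        r1 = np.asarray(r1).reshape(1, -1)
        rows = [[], [], []]  # cosine, frobenius, triUL
        for j, r2 in enumerate(vectors):
            if i == j:
                continue
            r2 = np.asarray(r2).reshape(1, -1)
            scores = [cosine_similarity(r1, r2)[0][0],
                      Frobenius_Distance(r1, r2),
                      TriUL_sim(r1, r2)]
            for m in range(3):
                rows[m].append([i, y[i], j, y[j], scores[m]])
        for out, row_m in zip((maps_s1, maps_s2, maps_s3), rows):
            ranked = np.array(row_m)
            ranked = ranked[ranked[:, 4].argsort()[::-1]]  # descending by score
            out.append(ranked[:k])
    return maps_s1, maps_s2, maps_s3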