Example #1
def distance(a, b):
    # Report both matrix-similarity metrics for a pair of arrays.
    a = np.array(a)
    b = np.array(b)
    f = Frobenius_Distance(a, b)
    t = TriUL_sim(a, b)
    print("Distance: ", round(f, 3), round(t, 3))
    return [a, b]
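Frobenius_Distance and TriUL_sim are defined elsewhere in this project. For reference only, a minimal sketch of a Frobenius distance between two 2-D arrays, assuming it is simply the Frobenius norm of their element-wise difference (TriUL_sim is project-specific and not reproduced here):

import numpy as np

def frobenius_distance_sketch(a, b):
    # Frobenius norm of the difference: sqrt of the sum of squared entries.
    return np.linalg.norm(np.asarray(a) - np.asarray(b), ord='fro')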
Example #2
def worker(method, val1, val2, send_end):
    # Compute one similarity metric in a child process and send the result
    # back through the write end of a multiprocessing.Pipe.
    result = 0.0
    if method == 1:
        result = cosine_similarity(val1, val2)[0][0]
    elif method == 2:
        result = Frobenius_Distance(val1, val2)
    elif method == 3:
        result = TriUL_sim(val1, val2)
    send_end.send(result)
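A hypothetical usage sketch, mirroring the commented-out block in Example #9: each metric runs in its own process and reports back through a Pipe. The placeholder vectors are assumptions, not project data.

import multiprocessing
import numpy as np

if __name__ == "__main__":
    row1 = np.random.rand(1, 8)  # placeholder (1, n) vectors
    row2 = np.random.rand(1, 8)
    jobs, pipes = [], []
    for m in (1, 2, 3):  # 1: cosine, 2: Frobenius, 3: TriUL
        recv, send = multiprocessing.Pipe(False)
        p = multiprocessing.Process(target=worker, args=(m, row1, row2, send))
        jobs.append(p)
        pipes.append(recv)
        p.start()
    result_list = [r.recv() for r in pipes]
    for p in jobs:
        p.join()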
Example #3
def dense_ths_encoder(d, y, n_features, m_l):
    # load dense embedding
    max_len = m_l
    model_name = 'dense_t1'  # dense_t1, dense_t2, full_lstm, full_gru, gru_lstm
    auto_encoder, encoder = autoencoder(model_name, n_features,
                                        max_len)  # , init=kernel_init)
    # NOTE: auto_encoder/encoder are built but never applied below; the loop
    # averages the input embeddings directly.
    new_emb = d
    i = 0
    final_map_s1 = []
    final_map_s2 = []
    final_map_s3 = []
    start = time.time()
    for r1 in new_emb:
        temp_map_s1 = []  # cosine similarity
        temp_map_s2 = []  # frobenius similarity
        temp_map_s3 = []  # triUL similarity
        av_r1 = np.average(r1, axis=0)
        av1 = av_r1.reshape(1, -1)
        j = 0
        for r2 in new_emb:
            if i != j:
                av_r2 = np.average(r2, axis=0)
                av2 = av_r2.reshape(1, -1)
                result_list = [
                    cosine_similarity(av1, av2)[0][0],
                    Frobenius_Distance(av1, av2),
                    TriUL_sim(av1, av2)
                ]
                temp_map_s1.append([
                    round(i, 1),
                    round(y[i], 1),
                    round(j, 1),
                    round(y[j], 1), result_list[0]
                ])
                temp_map_s2.append([
                    round(i, 1),
                    round(y[i], 1),
                    round(j, 1),
                    round(y[j], 1), result_list[1]
                ])
                temp_map_s3.append([
                    round(i, 1),
                    round(y[i], 1),
                    round(j, 1),
                    round(y[j], 1), result_list[2]
                ])
            j += 1
        temp_map_s1 = np.array(temp_map_s1)
        temp_map_s2 = np.array(temp_map_s2)
        temp_map_s3 = np.array(temp_map_s3)
        temp_map_s1 = temp_map_s1[temp_map_s1[:, 4].argsort()[::-1]]
        temp_map_s2 = temp_map_s2[temp_map_s2[:, 4].argsort()[::-1]]
        temp_map_s3 = temp_map_s3[temp_map_s3[:, 4].argsort()[::-1]]
        temp_map_s1 = temp_map_s1[:5]
        temp_map_s2 = temp_map_s2[:5]
        temp_map_s3 = temp_map_s3[:5]
        final_map_s1.append(temp_map_s1)
        final_map_s2.append(temp_map_s2)
        final_map_s3.append(temp_map_s3)
        print("i: ", i)
        i += 1
        if i % 100 == 0:
            print("Row: ", i, "\n")
    end = time.time()
    print("average embedding similarity computed", (end - start) / 60,
          " minutes")
    return final_map_s1, final_map_s2, final_map_s3
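The sort-and-truncate block above reappears verbatim in every example below: column 4 of each record holds the score, the records are sorted on it in descending order, and the five best are kept. A hypothetical helper capturing the pattern (note that Frobenius_Distance is a distance, so for that metric an ascending sort would rank the closest pairs first); numpy as np is assumed, as in the surrounding examples:

def top_k_by_score(rows, k=5, descending=True):
    # rows are records of the form [i, y[i], j, y[j], score].
    rows = np.array(rows)
    order = rows[:, 4].argsort()
    if descending:
        order = order[::-1]
    return rows[order][:k]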
Example #4
def universal_sentence_encoder(d, y):
    # TF1-style hub module (see the TF2 note after this example)
    embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
    start = time.time()
    final_map_s1 = []
    final_map_s2 = []
    final_map_s3 = []
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        embeddings = sess.run(embed(d))
        print("universal encoder ready")
        i = 0
        for r1 in embeddings:
            r1 = r1.reshape(1, -1)
            temp_map_s1 = []  # cosine similarity
            temp_map_s2 = []  # frobenius similarity
            temp_map_s3 = []  # triUL similarity
            j = 0
            for r2 in embeddings:
                r2 = r2.reshape(1, -1)
                if i != j:
                    result_list = [
                        cosine_similarity(r1, r2)[0][0],
                        Frobenius_Distance(r1, r2),
                        TriUL_sim(r1, r2)
                    ]
                    temp_map_s1.append([
                        round(i, 1),
                        round(y[i], 1),
                        round(j, 1),
                        round(y[j], 1), result_list[0]
                    ])
                    temp_map_s2.append([
                        round(i, 1),
                        round(y[i], 1),
                        round(j, 1),
                        round(y[j], 1), result_list[1]
                    ])
                    temp_map_s3.append([
                        round(i, 1),
                        round(y[i], 1),
                        round(j, 1),
                        round(y[j], 1), result_list[2]
                    ])
                j += 1
            temp_map_s1 = np.array(temp_map_s1)
            temp_map_s2 = np.array(temp_map_s2)
            temp_map_s3 = np.array(temp_map_s3)
            temp_map_s1 = temp_map_s1[temp_map_s1[:, 4].argsort()[::-1]]
            temp_map_s2 = temp_map_s2[temp_map_s2[:, 4].argsort()[::-1]]
            temp_map_s3 = temp_map_s3[temp_map_s3[:, 4].argsort()[::-1]]
            temp_map_s1 = temp_map_s1[:5]
            temp_map_s2 = temp_map_s2[:5]
            temp_map_s3 = temp_map_s3[:5]
            final_map_s1.append(temp_map_s1)
            final_map_s2.append(temp_map_s2)
            final_map_s3.append(temp_map_s3)
            print("i: ", i)
            i += 1
            if i % 100 == 0:
                print("Row: ", i, "\n")
        end = time.time()
    print("Universal Encoder similarity computed", (end - start) / 60,
          " minutes")
    return final_map_s1, final_map_s2, final_map_s3
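hub.Module, tf.Session, and the initializer calls above are TensorFlow 1.x APIs. A hedged sketch of the embedding step alone under TensorFlow 2, assuming tensorflow>=2, tensorflow_hub, and the v4 SavedModel of the Universal Sentence Encoder:

import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
embeddings = embed(["the cat sat on the mat",
                    "a dog barked at the mailman"]).numpy()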
Example #5
def encoder_infersent(d, y):
    start = time.time()
    model_version = 1
    MODEL_PATH = "../test/data/infersent%s.pickle" % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    use_cuda = torch.cuda.is_available()  # fall back to CPU when no GPU
    model = model.cuda() if use_cuda else model
    W2V_PATH = '../test/data/glove.840B.300d.txt' if model_version == 1 else '../test/data/crawl-300d-2M.vec'
    model.set_w2v_path(W2V_PATH)
    model.build_vocab_k_words(K=100000)
    embeddings = model.encode(d, bsize=128, tokenize=False, verbose=True)
    print("InferSent embedding computed")
    i = 0
    final_map_s1 = []
    final_map_s2 = []
    final_map_s3 = []
    for r1 in embeddings:
        r1 = r1.reshape(1, -1)
        temp_map_s1 = []  # cosine similarity
        temp_map_s2 = []  # frobenius similarity
        temp_map_s3 = []  # triUL similarity
        j = 0
        for r2 in embeddings:
            r2 = r2.reshape(1, -1)
            if i != j:
                result_list = [
                    cosine_similarity(r1, r2)[0][0],
                    Frobenius_Distance(r1, r2),
                    TriUL_sim(r1, r2)
                ]
                temp_map_s1.append([
                    round(i, 1),
                    round(y[i], 1),
                    round(j, 1),
                    round(y[j], 1), result_list[0]
                ])
                temp_map_s2.append([
                    round(i, 1),
                    round(y[i], 1),
                    round(j, 1),
                    round(y[j], 1), result_list[1]
                ])
                temp_map_s3.append([
                    round(i, 1),
                    round(y[i], 1),
                    round(j, 1),
                    round(y[j], 1), result_list[2]
                ])
            j += 1
        temp_map_s1 = np.array(temp_map_s1)
        temp_map_s2 = np.array(temp_map_s2)
        temp_map_s3 = np.array(temp_map_s3)
        temp_map_s1 = temp_map_s1[temp_map_s1[:, 4].argsort()[::-1]]
        temp_map_s2 = temp_map_s2[temp_map_s2[:, 4].argsort()[::-1]]
        temp_map_s3 = temp_map_s3[temp_map_s3[:, 4].argsort()[::-1]]
        temp_map_s1 = temp_map_s1[:5]
        temp_map_s2 = temp_map_s2[:5]
        temp_map_s3 = temp_map_s3[:5]
        final_map_s1.append(temp_map_s1)
        final_map_s2.append(temp_map_s2)
        final_map_s3.append(temp_map_s3)
        print("i: ", i)
        i += 1
        if i % 100 == 0:
            print("Row: ", i, "\n")
    end = time.time()
    print("InferSent similarity computed", (end - start) / 60, " minutes")
    return final_map_s1, final_map_s2, final_map_s3
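MODEL_PATH and W2V_PATH assume pre-downloaded files. The usual sources, per the facebookresearch/InferSent README (listed as assumptions, not verified against this project's layout):

# InferSent v1 weights:
#   https://dl.fbaipublicfiles.com/infersent/infersent1.pkl
# GloVe vectors (for model_version == 1, unzip before use):
#   http://nlp.stanford.edu/data/glove.840B.300d.zip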
Example #6
def latent_semantic_indexing(d, y):
    start = time.time()
    vec = CountVectorizer()
    x = vec.fit_transform(d)
    # get_feature_names() was renamed get_feature_names_out() in newer
    # scikit-learn releases
    df = pd.DataFrame(x.toarray(), columns=vec.get_feature_names())
    matrix = df.values
    #[v, s, u] = svds(matrix.astype("float64"), k=2)
    #u = u.T
    # np.linalg.svd returns (U, S, Vh); here v holds the document-side
    # vectors and u the term-side vectors
    [v, s, u] = np.linalg.svd(matrix, full_matrices=True)
    uk = u[0:2].T  # rank-2 term space
    vk = v[:, 0:2]  # rank-2 document coordinates
    sk = np.diag(s[0:2])
    # vtk = vk.T # To plot 2D
    sk_inverse = np.power(sk.diagonal(), -1)
    sk_inverse = np.diag(sk_inverse)  # s_k^(-1)
    final_map_s1 = []
    final_map_s2 = []
    final_map_s3 = []
    i = 0
    for row in matrix:
        row = row.reshape((1, row.shape[0]))
        row_coordinate = np.dot(np.dot(row, uk),
                                sk_inverse)  # q = q.T x u_k x s_k(-1)
        temp_map_s1 = []  # cosine similarity
        temp_map_s2 = []  # frobenius similarity
        temp_map_s3 = []  # triUL similarity
        j = 0
        for e in vk:
            if i != j:
                ev_val = e.reshape((1, 2))
                result_list = [
                    cosine_similarity(row_coordinate, ev_val)[0][0],
                    Frobenius_Distance(row_coordinate, ev_val),
                    TriUL_sim(row_coordinate, ev_val)
                ]
                temp_map_s1.append([
                    round(i, 1),
                    round(y[i], 1),
                    round(j, 1),
                    round(y[j], 1), result_list[0]
                ])
                temp_map_s2.append([
                    round(i, 1),
                    round(y[i], 1),
                    round(j, 1),
                    round(y[j], 1), result_list[1]
                ])
                temp_map_s3.append([
                    round(i, 1),
                    round(y[i], 1),
                    round(j, 1),
                    round(y[j], 1), result_list[2]
                ])
            j += 1
        temp_map_s1 = np.array(temp_map_s1)
        temp_map_s2 = np.array(temp_map_s2)
        temp_map_s3 = np.array(temp_map_s3)
        temp_map_s1 = temp_map_s1[temp_map_s1[:, 4].argsort()[::-1]]
        temp_map_s2 = temp_map_s2[temp_map_s2[:, 4].argsort()[::-1]]
        temp_map_s3 = temp_map_s3[temp_map_s3[:, 4].argsort()[::-1]]
        temp_map_s1 = temp_map_s1[:5]
        temp_map_s2 = temp_map_s2[:5]
        temp_map_s3 = temp_map_s3[:5]
        final_map_s1.append(temp_map_s1)
        final_map_s2.append(temp_map_s2)
        final_map_s3.append(temp_map_s3)
        print("i: ", i)
        i += 1
        if i % 100 == 0:
            print("Row: ", i, "\n")
    end = time.time()
    print("Latent Semantic Indexing similarity computed", (end - start) / 60,
          " minutes")
    return final_map_s1, final_map_s2, final_map_s3
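A toy sketch of the folding-in step used above, q_k = q · u_k · s_k^(-1), on an assumed 3×3 term-document matrix; the variable names mirror the function's own:

import numpy as np

m = np.array([[1., 0., 1.],
              [0., 1., 1.],
              [1., 1., 0.]])
v, s, u = np.linalg.svd(m, full_matrices=True)
uk = u[0:2].T                    # rank-2 term space
sk_inverse = np.diag(1.0 / s[0:2])
q = m[0].reshape(1, -1)          # fold one document back in
q_k = q.dot(uk).dot(sk_inverse)  # its 2-D coordinates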
Example #7
def smooth_inverse_frequency(d, y, glove, m_l):
    d = fix_text_format(d)  # without labels: data = fix_text_format(data)
    print("Data cleaned")
    t = 1  # type = 1: Spacy Lemmatizer, 2: Pattern Lemmatizer
    d = lemmatizer(t, d)
    print("data lemmatizer finished")
    d = remove_stopwords(d)
    print("stop words removed")
    tokenized_list = tokenizer_sentence_list(d)
    word_counter = Counter(itertools.chain(*tokenized_list))
    se = get_glove(glove)
    print("glove loaded")
    start = time.time()
    a = 1e-3
    pre_emb = []
    for sentence in tokenized_list:
        token_length = len(sentence)
        vs = 0.0
        for w in sentence:
            a_value = a / (a + word_counter[w] / len(word_counter)
                           )  # smooth inverse frequency, SIF
            vs = vs + np.multiply(a_value, se.map_sentence(w, m_l)).sum(
                axis=0)  # vs += sif * word_vector
        vs = vs / float(token_length)  # weighted average
        pre_emb.append(vs)
    # remove the projection on the first singular vector (common component)
    [_, _, u] = svds(np.asarray(pre_emb), k=1)
    new_emb = []
    for v_s in pre_emb:
        v_s = v_s - v_s.dot(u * u.transpose())
        new_emb.append(v_s)
    print("SIF computed")
    i = 0
    final_map_s1 = []
    final_map_s2 = []
    final_map_s3 = []
    for r1 in new_emb:
        temp_map_s1 = []  # cosine similarity
        temp_map_s2 = []  # frobenius similarity
        temp_map_s3 = []  # triUL similarity
        r1 = r1.reshape(1, -1)
        j = 0
        for r2 in new_emb:
            if i != j:
                r2 = r2.reshape(1, -1)
                result_list = [
                    cosine_similarity(r1, r2)[0][0],
                    Frobenius_Distance(r1, r2),
                    TriUL_sim(r1, r2)
                ]
                temp_map_s1.append([
                    round(i, 1),
                    round(y[i], 1),
                    round(j, 1),
                    round(y[j], 1), result_list[0]
                ])
                temp_map_s2.append([
                    round(i, 1),
                    round(y[i], 1),
                    round(j, 1),
                    round(y[j], 1), result_list[1]
                ])
                temp_map_s3.append([
                    round(i, 1),
                    round(y[i], 1),
                    round(j, 1),
                    round(y[j], 1), result_list[2]
                ])
            j += 1
        temp_map_s1 = np.array(temp_map_s1)
        temp_map_s2 = np.array(temp_map_s2)
        temp_map_s3 = np.array(temp_map_s3)
        temp_map_s1 = temp_map_s1[temp_map_s1[:, 4].argsort()[::-1]]
        temp_map_s2 = temp_map_s2[temp_map_s2[:, 4].argsort()[::-1]]
        temp_map_s3 = temp_map_s3[temp_map_s3[:, 4].argsort()[::-1]]
        temp_map_s1 = temp_map_s1[:5]
        temp_map_s2 = temp_map_s2[:5]
        temp_map_s3 = temp_map_s3[:5]
        final_map_s1.append(temp_map_s1)
        final_map_s2.append(temp_map_s2)
        final_map_s3.append(temp_map_s3)
        print("i: ", i)
        i += 1
        if i % 100 == 0:
            print("Row: ", i, "\n")
    end = time.time()
    print("Smooth Inverse Frequency similarity computed", (end - start) / 60,
          " minutes")
    return final_map_s1, final_map_s2, final_map_s3
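A toy computation of the SIF weight a / (a + p(w)), using the same frequency estimate as above (count divided by vocabulary size; note that the SIF paper estimates p(w) as count over total tokens instead):

from collections import Counter

a = 1e-3
word_counter = Counter("the cat sat on the mat".split())
p_w = word_counter["the"] / len(word_counter)  # 2 / 5 with this toy text
sif_weight = a / (a + p_w)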
Example #8
def average_embedding(d, y):
    i = 0
    final_map_s1 = []
    final_map_s2 = []
    final_map_s3 = []
    start = time.time()
    for r1 in d:
        temp_map_s1 = []  # cosine similarity
        temp_map_s2 = []  # frobenius similarity
        temp_map_s3 = []  # triUL similarity
        av_r1 = np.average(r1, axis=0)
        av1 = av_r1.reshape(1, -1)
        j = 0
        for r2 in d:
            if i != j:
                av_r2 = np.average(r2, axis=0)
                av2 = av_r2.reshape(1, -1)
                result_list = [
                    cosine_similarity(av1, av2)[0][0],
                    Frobenius_Distance(av1, av2),
                    TriUL_sim(av1, av2)
                ]
                temp_map_s1.append([
                    round(i, 1),
                    round(y[i], 1),
                    round(j, 1),
                    round(y[j], 1), result_list[0]
                ])
                temp_map_s2.append([
                    round(i, 1),
                    round(y[i], 1),
                    round(j, 1),
                    round(y[j], 1), result_list[1]
                ])
                temp_map_s3.append([
                    round(i, 1),
                    round(y[i], 1),
                    round(j, 1),
                    round(y[j], 1), result_list[2]
                ])
            j += 1
        temp_map_s1 = np.array(temp_map_s1)
        temp_map_s2 = np.array(temp_map_s2)
        temp_map_s3 = np.array(temp_map_s3)
        temp_map_s1 = temp_map_s1[temp_map_s1[:, 4].argsort()[::-1]]
        temp_map_s2 = temp_map_s2[temp_map_s2[:, 4].argsort()[::-1]]
        temp_map_s3 = temp_map_s3[temp_map_s3[:, 4].argsort()[::-1]]
        temp_map_s1 = temp_map_s1[:5]
        temp_map_s2 = temp_map_s2[:5]
        temp_map_s3 = temp_map_s3[:5]
        final_map_s1.append(temp_map_s1)
        final_map_s2.append(temp_map_s2)
        final_map_s3.append(temp_map_s3)
        print("i: ", i)
        i += 1
        if i % 100 == 0:
            print("Row: ", i, "\n")
    end = time.time()
    print("average embedding similarity computed", (end - start) / 60,
          " minutes")
    return final_map_s1, final_map_s2, final_map_s3
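A hypothetical call, assuming each element of d is a (num_tokens, dim) matrix of word vectors (np.average(r1, axis=0) expects 2-D input) and y holds one numeric label per document:

import numpy as np

docs = [np.random.rand(5, 50), np.random.rand(7, 50), np.random.rand(3, 50)]
labels = [0.0, 1.0, 0.0]
s1, s2, s3 = average_embedding(docs, labels)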
Example #9
def tfidf_similarity(d, y):
    start = time.time()
    tfidf_vectorizer = TfidfVectorizer()
    i = 0
    final_map_s1 = []
    final_map_s2 = []
    final_map_s3 = []
    for r1 in d:
        temp_map_s1 = []  # cosine similarity
        temp_map_s2 = []  # frobenius similarity
        temp_map_s3 = []  # triUL similarity
        j = 0
        for r2 in d:
            if i != j:
                documents = (r1, r2)
                # the vectorizer is refit for every pair, so each pair gets
                # its own vocabulary (see the note after this example)
                tfidf_matrix = tfidf_vectorizer.fit_transform(
                    documents).toarray()
                row1 = np.array(tfidf_matrix[0]).reshape(1, -1)
                row2 = np.array(tfidf_matrix[1]).reshape(1, -1)
                """
                jobs = []
                pipe_list = []
                for i in range(3):
                    recv, send = multiprocessing.Pipe(False)
                    if i == 0:
                        p = multiprocessing.Process(target=worker, args=(0, row1, row2, send))
                    if i == 1:
                        p = multiprocessing.Process(target=worker, args=(1, row1, row2, send))
                    if i == 2:
                        p = multiprocessing.Process(target=worker, args=(2, row1, row2, send))
                    jobs.append(p)
                    pipe_list.append(recv)
                    p.start()
                result_list = [x.recv() for x in pipe_list]
                """
                result_list = [
                    cosine_similarity(row1, row2)[0][0],
                    Frobenius_Distance(row1, row2),
                    TriUL_sim(row1, row2)
                ]
                temp_map_s1.append([
                    round(i, 1),
                    round(y[i], 1),
                    round(j, 1),
                    round(y[j], 1), result_list[0]
                ])
                temp_map_s2.append([
                    round(i, 1),
                    round(y[i], 1),
                    round(j, 1),
                    round(y[j], 1), result_list[1]
                ])
                temp_map_s3.append([
                    round(i, 1),
                    round(y[i], 1),
                    round(j, 1),
                    round(y[j], 1), result_list[2]
                ])
            j += 1
        temp_map_s1 = np.array(temp_map_s1)
        temp_map_s2 = np.array(temp_map_s2)
        temp_map_s3 = np.array(temp_map_s3)
        temp_map_s1 = temp_map_s1[temp_map_s1[:, 4].argsort()[::-1]]
        temp_map_s2 = temp_map_s2[temp_map_s2[:, 4].argsort()[::-1]]
        temp_map_s3 = temp_map_s3[temp_map_s3[:, 4].argsort()[::-1]]
        temp_map_s1 = temp_map_s1[:5]
        temp_map_s2 = temp_map_s2[:5]
        temp_map_s3 = temp_map_s3[:5]
        final_map_s1.append(temp_map_s1)
        final_map_s2.append(temp_map_s2)
        final_map_s3.append(temp_map_s3)
        print("i: ", i)
        i += 1
        if i % 100 == 0:
            print("Row: ", i, "\n")
    end = time.time()
    print("tf-idf similarity computed", (end - start) / 60, " minutes")
    return final_map_s1, final_map_s2, final_map_s3
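Because the vectorizer is refit for every pair, each comparison uses a different vocabulary, so scores are not strictly comparable across pairs. A hedged alternative that fits once over the whole corpus (toy documents assumed) and computes all cosine similarities in one call:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ["the cat sat", "a dog barked", "the cat barked"]
tfidf = TfidfVectorizer().fit_transform(docs)
sims = cosine_similarity(tfidf)  # (len(docs), len(docs)) similarity matrix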