Example #1
def test_text_similarity_jaccard_on_matrix():
    matrix_1 = np.array([["hello", "columbia university"],
                         ["bye", "nyu"]])
    matrix_2 = np.array([["helli", "columbia"],
                         ["bye", "new york"]])

    tmp = similarities().text_similarity_on_matrix(matrix_1, matrix_2, method="jaccard")
    assert tmp.shape == (4, 2)
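A hypothetical sketch of what the jaccard method presumably computes per cell pair: Jaccard similarity over whitespace-token sets, |A ∩ B| / |A ∪ B|. The test above only checks the output shape, so this is an assumption, and jaccard_tokens is not part of the library:

def jaccard_tokens(a, b):
    # Jaccard similarity of whitespace-split token sets (illustrative only)
    set_a, set_b = set(a.split()), set(b.split())
    if not set_a and not set_b:
        return 1.0
    return len(set_a & set_b) / len(set_a | set_b)

print(jaccard_tokens("columbia university", "columbia"))  # 0.5 (1 shared token of 2)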
Example #2
def test_vector_similarity_on_matrix():
    matrix_1 = np.array([[[1,1,1], [1,2,3],[1,3,1]],
                         [[3,6,7],[2,3,1],[1,1,1]]])
    matrix_2 = np.array([[[2,2,2],[2,3,4],[1,1,1]]])

    tmp = similarities().vector_similarity_on_matrix(matrix_1,matrix_2)
    desired = np.array([[1, 0.993, 0.87], [0.953, 0.844, 1]])
    assert np.allclose(tmp, desired, atol=1e-3)
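The expected values look like plain cosine similarities between corresponding vectors, rounded to three decimals; a minimal NumPy check, independent of the similarities class:

import numpy as np

def cosine(u, v):
    # cosine similarity between two 1-D vectors
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

matrix_1 = np.array([[[1, 1, 1], [1, 2, 3], [1, 3, 1]],
                     [[3, 6, 7], [2, 3, 1], [1, 1, 1]]])
matrix_2 = np.array([[[2, 2, 2], [2, 3, 4], [1, 1, 1]]])

# pair each row of matrix_1 with the single row of matrix_2 and take the
# cosine similarity column by column
out = np.round([[cosine(matrix_1[i, j], matrix_2[0, j]) for j in range(3)]
                for i in range(2)], 3)
# out -> [[1.    0.993 0.87 ]
#         [0.953 0.844 1.   ]]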
Example #3
def test_text_similarity_jaro_on_matrix():
    matrix_1 = np.array([["hello", "columbia university"],
                         ["bye", "nyu"]])
    matrix_2 = np.array([["helli", "columbia"],
                         ["bye", "new york"]])

    tmp = similarities().text_similarity_on_matrix(matrix_1, matrix_2, method="jaro_winkler")
    desired = np.array([[0.92, 0.88421053],
                        [0.51111111, 0.3998538],
                        [0.51111111, 0.48611111],
                        [1., 0.63888889]])
    assert np.allclose(tmp, desired)
Example #4
def test_text_similarity_on_matrix():
    matrix_1 = np.array([["hello","columbia university"],
                         ["bye","nyu"]])
    matrix_2 = np.array([["helli","columbia"],
                         ["bye","new york"]])

    tmp = similarities().text_similarity_on_matrix(matrix_1,matrix_2)
    desired = np.array([[ 1., 11.],
                        [ 5., 17.],
                        [ 5.,  7.],
                        [ 0.,  6.]])
    assert (np.array_equal(tmp, desired))
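With no method argument, the desired matrix matches classic Levenshtein edit distances taken over the cartesian product of the rows, column by column; a standalone sketch that reproduces it (assuming Levenshtein is indeed the default method):

from itertools import product
import numpy as np

def levenshtein(a, b):
    # classic dynamic-programming edit distance
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                 # deletion
                           cur[j - 1] + 1,              # insertion
                           prev[j - 1] + (ca != cb)))   # substitution
        prev = cur
    return prev[-1]

matrix_1 = np.array([["hello", "columbia university"], ["bye", "nyu"]])
matrix_2 = np.array([["helli", "columbia"], ["bye", "new york"]])

out = np.array([[levenshtein(r1[k], r2[k]) for k in range(2)]
                for r1, r2 in product(matrix_1, matrix_2)])
# out -> [[ 1 11]
#         [ 5 17]
#         [ 5  7]
#         [ 0  6]]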
Example #5
# save the id columns for later use when generating labels
df1_id_col = df1[df1_id]
df2_id_col = df2[df2_id]

# drop id columns because we don't need to compute id similarity
df1 = df1.drop(columns=[df1_id])
df2 = df2.drop(columns=[df2_id])

processed_data = Preprocessing().overall_preprocess(
    df1.drop(columns=['description']),
    df2.drop(columns=['description']),
    special_columns=['title', 'manufacturer'],
    word_embedding_model='none')  # may take a while because it loads a pretrained word-embedding model

num_matrix_1, num_matrix_2 = processed_data["numerical"][0], processed_data["numerical"][1]
spc_matrix_1, spc_matrix_2 = processed_data["special_fields"][0], processed_data["special_fields"][1]
num_final_data = similarities().numerical_similarity_on_matrix(num_matrix_1, num_matrix_2)
spc_final_data = similarities().text_similarity_on_matrix(spc_matrix_1, spc_matrix_2, method='jaccard')

df1['key'] = 0
df2['key'] = 0
merged = pd.merge(df1, df2, on='key')[['description_x', 'description_y']]

'''
train-test split
'''
non_empty = []

# keep only the non-empty similarity matrices (embed_mean_data, embed_max_data,
# embed_min_data are commented out in this example)
for m in (num_final_data, spc_final_data):
    if m.size != 0:
        non_empty.append(m)
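The key = 0 columns above implement a plain pandas cross join: every row of df1 is paired with every row of df2, matching the row-pair ordering of the similarity matrices. A tiny standalone illustration with toy frames (column names are illustrative only):

import pandas as pd

left = pd.DataFrame({'description': ['a', 'b']})
right = pd.DataFrame({'description': ['x', 'y', 'z']})

left['key'] = 0
right['key'] = 0
pairs = pd.merge(left, right, on='key')[['description_x', 'description_y']]
print(len(pairs))  # 6 rows: the 2 x 3 cartesian product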
Example #6
# Drop id columns because we don't need to compute id similarity
df1 = df1.drop(columns=[df1_id])
df2 = df2.drop(columns=[df2_id])

processed_data = Preprocessing().overall_preprocess(
    df1.drop(columns=['description']),
    df2.drop(columns=['description']),
    special_columns=['title', 'manufacturer'],
    word_embedding_model='none'
)  # may take a while because it loads a pretrained word-embedding model

num_matrix_1, num_matrix_2 = processed_data["numerical"][0], processed_data["numerical"][1]
spc_matrix_1, spc_matrix_2 = processed_data["special_fields"][0], processed_data["special_fields"][1]
num_final_data = similarities().numerical_similarity_on_matrix(
    num_matrix_1, num_matrix_2)
spc_final_data_0 = similarities().text_similarity_on_matrix(spc_matrix_1,
                                                            spc_matrix_2,
                                                            method='jaccard')
spc_final_data_1 = similarities().text_similarity_on_matrix(
    spc_matrix_1, spc_matrix_2, method='lavenshtein')
spc_final_data_2 = similarities().text_similarity_on_matrix(
    spc_matrix_1, spc_matrix_2, method='jaro_winkler')

df1['key'] = 0
df2['key'] = 0
merged = pd.merge(df1, df2, on='key')[['description_x', 'description_y']]
'''
train-test split
'''
non_empty = []
Example #7
    embedding_weight='tfidf')
# may take a while because it loads a pretrained word-embedding model
'''
get numerical data
'''
# TODO: fix addressZip so that it is not treated as numeric
num_matrix_1, num_matrix_2 = processed_data["numerical"][0], processed_data["numerical"][1]
embed_matrix_1, embed_matrix_2 = processed_data["word_embedding_fields"][0], processed_data["word_embedding_fields"][1]
spc_matrix_1, spc_matrix_2 = processed_data["special_fields"][0], processed_data["special_fields"][1]
'''
calculate similarities
'''
num_final_data = similarities().numerical_similarity_on_matrix(
    num_matrix_1, num_matrix_2)
embed_tfidf_data = similarities().vector_similarity_on_matrix(
    embed_matrix_1, embed_matrix_2)
#embed_mean_data = similarities().vector_similarity_on_matrix(embed_matrix_1,embed_matrix_2)
#embed_min_data = similarities().vector_similarity_on_matrix(embed_matrix_1,embed_matrix_2)
#embed_max_data = similarities().vector_similarity_on_matrix(embed_matrix_1,embed_matrix_2)
spc_final_data = similarities().text_similarity_on_matrix(
    spc_matrix_1, spc_matrix_2)
'''
concatenate all data
'''
# only concatenate non-empty similarity matrices
non_empty = []

for m in (num_final_data, spc_final_data, embed_tfidf_data):  # embed_min/max/mean_data left out here
    if m.size != 0:
        non_empty.append(m)
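The example ends here; judging by the "concatenate all data" comments, the surviving matrices are presumably stacked column-wise into one feature matrix before the train-test split, roughly as follows (a sketch, not the original code):

import numpy as np

# assuming every matrix in non_empty has shape (n_pairs, k_i)
features = np.concatenate(non_empty, axis=1) if non_empty else np.empty((0, 0))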
Example #8
def test_numerical_similarity_on_matrix():
    matrix_1 = np.array([[1,2,3,4],[5,6,7,8]])
    matrix_2 = np.array([[1,2,3,4]])

    tmp = similarities().numerical_similarity_on_matrix(matrix_1, matrix_2, method="min_max")
    desired = np.array([[1, 1, 1, 1], [0.2, 0.333, 0.429, 0.5]])
    assert np.allclose(tmp, desired, atol=1e-3)
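The expected values correspond to an element-wise min/max ratio, min(a, b) / max(a, b), rounded to three decimals; a quick NumPy check independent of the library:

import numpy as np

matrix_1 = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
matrix_2 = np.array([[1, 2, 3, 4]])

ratio = np.round(np.minimum(matrix_1, matrix_2) / np.maximum(matrix_1, matrix_2), 3)
# ratio -> [[1.    1.    1.    1.   ]
#           [0.2   0.333 0.429 0.5  ]]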
Example #9
def test_vector_similarity_on_matrix_empty():  # case where one of the input matrices is empty
    matrix_2 = np.array([[[2,2,2],[2,3,4],[1,1,1]]])

    tmp = similarities().vector_similarity_on_matrix(np.array([]),matrix_2)
    assert tmp.size == 0  # an empty input yields an empty result