Example #1
import time

import py_stringsimjoin as ssj

# WS and TWO_GRAM are module-level tokenizers; a sketch of their
# definitions follows this example.


def compute_all(A, B, idA, idB, keyA, keyB, merged, iters):
    ke = []
    jkws = []
    jk2g = []
    jws = []
    j2g = []
    for _ in range(iters):
        start = time.time()
        ssj.edit_distance_join(A, B, idA, idB, keyA, keyB, 30, n_jobs=-1)
        ke.append(time.time() - start)

        start = time.time()
        ssj.jaccard_join(A, B, idA, idB, keyA, keyB, WS, 0.1, n_jobs=-1)
        jkws.append(time.time() - start)

        start = time.time()
        ssj.jaccard_join(A, B, idA, idB, keyA, keyB, TWO_GRAM, 0.1, n_jobs=-1)
        jk2g.append(time.time() - start)

        start = time.time()
        ssj.jaccard_join(A, B, idA, idB, merged, merged, WS, 0.1, n_jobs=-1)
        jws.append(time.time() - start)

        start = time.time()
        ssj.jaccard_join(A,
                         B,
                         idA,
                         idB,
                         merged,
                         merged,
                         TWO_GRAM,
                         0.1,
                         n_jobs=-1)
        j2g.append(time.time() - start)
    return ke, jkws, jk2g, jws, j2g
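compute_all assumes py_stringsimjoin is imported as ssj and that module-level tokenizers WS and TWO_GRAM exist. A minimal sketch of those globals, built with py_stringmatching (qval=2 is an assumption matching the "2g" naming):

import py_stringmatching as sm

# Whitespace tokens for the "_ws" joins, 2-gram tokens for the "_2g" joins;
# jaccard_join requires set-returning tokenizers.
WS = sm.WhitespaceTokenizer(return_set=True)
TWO_GRAM = sm.QgramTokenizer(qval=2, return_set=True)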
Example #2
def compute_joins(A, B, idA, idB, keyA, keyB, merged, compute=5):
    # Run a subset of the joins depending on `compute`; WS and TWO_GRAM
    # are the module-level tokenizers defined for Example #1. The original
    # compute == 5 and compute == 3 branches were identical, so they are
    # merged here, and results defaults to an empty dict instead of
    # raising NameError for unknown compute values.
    results = {}
    if compute in (5, 3):
        results = {'key_edit': ssj.edit_distance_join(A, B, idA, idB, keyA, keyB, 30, n_jobs=-5),
                   'key_jac_ws': ssj.jaccard_join(A, B, idA, idB, keyA, keyB, WS, 0.1, n_jobs=-5),
                   'key_jac_2g': ssj.jaccard_join(A, B, idA, idB, keyA, keyB, TWO_GRAM, 0.1, n_jobs=-5),
                   'mer_jac_ws': ssj.jaccard_join(A, B, idA, idB, merged, merged, WS, 0.1, n_jobs=-5),
                   'mer_jac_2g': ssj.jaccard_join(A, B, idA, idB, merged, merged, TWO_GRAM, 0.1, n_jobs=-5)}
    elif compute == 2:
        results = {'key_jac_ws': ssj.jaccard_join(A, B, idA, idB, keyA, keyB, WS, 0.1, n_jobs=-5),
                   'key_jac_2g': ssj.jaccard_join(A, B, idA, idB, keyA, keyB, TWO_GRAM, 0.1, n_jobs=-5)}
    elif compute == 1:
        results = {'key_edit': ssj.edit_distance_join(A, B, idA, idB, keyA, keyB, 30, n_jobs=-5)}
    elif compute == 0:
        results = {'key_jac_2g': ssj.jaccard_join(A, B, idA, idB, keyA, keyB, TWO_GRAM, 0.1, n_jobs=-5)}
    return results
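A hypothetical invocation, borrowing the fuzzy-join tables and column names from the time_all example below; compute=2 restricts the run to the two key-based Jaccard joins:

results = compute_joins(A, B, 'FUZZY_ID', 'tconst',
                        'primaryTitle', 'primaryTitle', 'merged_all',
                        compute=2)
print(results['key_jac_2g'].head())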
Example #3
def apply_tables(self, ltable, rtable, l_key_attr, r_key_attr,
                 l_match_attr, r_match_attr, n_jobs=1):
    if self.sim_measure_type == 'JACCARD':
        return ssj.jaccard_join(ltable, rtable, l_key_attr, r_key_attr,
                                l_match_attr, r_match_attr, self.tokenizer,
                                self.threshold, comp_op=self.comp_op,
                                n_jobs=n_jobs)
    elif self.sim_measure_type == 'COSINE':
        return ssj.cosine_join(ltable, rtable, l_key_attr, r_key_attr,
                               l_match_attr, r_match_attr, self.tokenizer,
                               self.threshold, comp_op=self.comp_op,
                               n_jobs=n_jobs)
    elif self.sim_measure_type == 'DICE':
        return ssj.dice_join(ltable, rtable, l_key_attr, r_key_attr,
                             l_match_attr, r_match_attr, self.tokenizer,
                             self.threshold, comp_op=self.comp_op,
                             n_jobs=n_jobs)
    elif self.sim_measure_type == 'EDIT_DISTANCE':
        return ssj.edit_distance_join(ltable, rtable,
                                      l_key_attr, r_key_attr,
                                      l_match_attr, r_match_attr,
                                      self.threshold, comp_op=self.comp_op,
                                      n_jobs=n_jobs)
    elif self.sim_measure_type == 'OVERLAP':
        return ssj.overlap_join(ltable, rtable, l_key_attr, r_key_attr,
                                l_match_attr, r_match_attr, self.tokenizer,
                                self.threshold, comp_op=self.comp_op,
                                n_jobs=n_jobs)
    elif self.sim_measure_type == 'OVERLAP_COEFFICIENT':
        return ssj.overlap_coefficient_join(ltable, rtable,
                                            l_key_attr, r_key_attr,
                                            l_match_attr, r_match_attr,
                                            self.tokenizer, self.threshold,
                                            comp_op=self.comp_op,
                                            n_jobs=n_jobs)
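The dispatcher only reads sim_measure_type, tokenizer, threshold, and comp_op from self, so it can be exercised without the (unshown) class it belongs to. A hypothetical driver with illustrative tables and a lightweight stand-in object:

import types

import pandas as pd
import py_stringmatching as sm
import py_stringsimjoin as ssj

A = pd.DataFrame({'id': [1, 2], 'name': ['apple inc', 'orange ltd']})
B = pd.DataFrame({'id': [10, 20], 'name': ['apple inc.', 'banana co']})

# Stand-in object carrying only the attributes apply_tables reads.
matcher = types.SimpleNamespace(sim_measure_type='JACCARD',
                                tokenizer=sm.WhitespaceTokenizer(return_set=True),
                                threshold=0.3,
                                comp_op='>=')

pairs = apply_tables(matcher, A, B, 'id', 'id', 'name', 'name')
print(pairs)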
Example #4
# Find pairs where the edit distance between 'name' (blog table) and
# 'attraction' (TripAdvisor table) is at most 3.
# l_out_attrs and r_out_attrs denote the attributes from the
# left table (A) and right table (B) that need to be included in the output.
# blogTable and tripAdvisorTable are assumed to be DataFrames loaded earlier;
# ssj is py_stringsimjoin.
# Blog table columns:
#   name, locType, longitude, latitude, closest_to, blogLinks, src, dest, address, srcLoc, destLoc
# TripAdvisor table columns:
#   attraction_in, no_of_attractions, no_of_reviews, url, reviewComment, activity_type, rank, max_rank,
#   attraction, contact_no, knownFor, address, pinCode

output_pairs = ssj.edit_distance_join(
    blogTable,
    tripAdvisorTable,
    'name',
    'url',
    'name',
    'attraction',
    3,
    l_out_prefix='blogs_',
    r_out_prefix='tripAdvisor_',
    l_out_attrs=['name', 'locType', 'address'],
    r_out_attrs=[
        'attraction', 'knownFor', 'address', 'pinCode', 'reviewComment',
        'no_of_reviews', 'url', 'contact_no', 'rank', 'max_rank',
        'activity_type', 'attraction_in', 'no_of_attractions'
    ],
    n_jobs=-1)

print(len(output_pairs))
# examine the output pairs
output_pairs.to_csv('editDistance.csv')
print(output_pairs.loc[:, '_sim_score'])
Example #5
data_attempt1 = pd.merge(imdb_data, kaggle_data, how='inner', left_on=['norm_title', 'norm_year'],
                         right_on=['norm_movie_title', 'norm_title_year'])
data_attempt1.shape


# But given that the IMDB and Kaggle datasets are collected from different sources, chances are that the name of a movie will be slightly different in these datasets (e.g. "Wall.E" vs "WallE"). To be able to find such matches, one can look at the similarity of movie titles and consider titles with high similarity to be the same entity. BigGorilla's recommendation for doing a similarity join across two datasets is the python package **py_stringsimjoin**. The following code snippet uses **py_stringsimjoin** to match all the titles that have an edit distance of one or less (i.e., there is at most one character that needs to be changed/added/removed to make both titles identical). Once the similarity join is complete, it only selects the title pairs that were produced in the same year.

# In[18]:

import py_stringsimjoin as ssj
import py_stringmatching as sm

imdb_data['id'] = range(imdb_data.shape[0])
kaggle_data['id'] = range(kaggle_data.shape[0])
similar_titles = ssj.edit_distance_join(imdb_data, kaggle_data, 'id', 'id', 'norm_title',
                                        'norm_movie_title', l_out_attrs=['norm_title', 'norm_year'],
                                        r_out_attrs=['norm_movie_title', 'norm_title_year'], threshold=1)
# selecting the entries that have the same production year
data_attempt2 = similar_titles[similar_titles.r_norm_title_year == similar_titles.l_norm_year]
data_attempt2.shape


# We can see that the similarity join matched 4,689 titles. Let's look at some of the title pairs that the similarity join matched even though they are not identical.

# In[19]:

data_attempt2[data_attempt2.l_norm_title != data_attempt2.r_norm_movie_title].head()


# While instances such as "walle" and "wall.e" are correctly matched, we can see that this technique also makes some errors (e.g., "grave" and "brave"). This raises the following questions: "what method should be used for data matching?" and "how can we determine the quality of the matching?". BigGorilla's recommendation for dealing with this problem is the python package **py_entitymatching**, which is developed as part of the [Magellan project](https://sites.google.com/site/anhaidgroup/projects/magellan).
# 
Example #6
import os

import pandas as pd
import py_stringmatching as sm
import py_stringsimjoin as ssj

print('py_stringsimjoin version: ' + ssj.__version__)
print('py_stringmatching version: ' + sm.__version__)
print('pandas version: ' + pd.__version__)

table_A_path = os.sep.join(
    [ssj.get_install_path(), 'datasets', 'data', 'person_table_A.csv'])
table_B_path = os.sep.join(
    [ssj.get_install_path(), 'datasets', 'data', 'person_table_B.csv'])

# Load csv files as dataframes.
A = pd.read_csv(table_A_path)
B = pd.read_csv(table_B_path)
print('Number of records in A: ' + str(len(A)))
print('Number of records in B: ' + str(len(B)))
ssj.profile_table_for_join(A)

B['new_key_attr'] = range(0, len(B))
ws = sm.WhitespaceTokenizer(return_set=True)
ws.tokenize('William Bridge')
output_pairs = ssj.edit_distance_join(A,
                                      B,
                                      'A.ID',
                                      'B.ID',
                                      'A.name',
                                      'B.name',
                                      5,
                                      l_out_attrs=['A.name'],
                                      r_out_attrs=['B.name'])
print(len(output_pairs))
print(output_pairs)
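The ws tokenizer above is built and demonstrated but never passed to the join. A minimal sketch of a Jaccard join on the same tables that would actually use it (the 0.3 threshold is an illustrative assumption):

jaccard_pairs = ssj.jaccard_join(A, B, 'A.ID', 'B.ID', 'A.name', 'B.name',
                                 ws, 0.3,
                                 l_out_attrs=['A.name'], r_out_attrs=['B.name'])
print(len(jaccard_pairs))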
Example #7
import time
from collections import defaultdict

import numpy as np
import pandas as pd
import py_stringsimjoin as ssj


def time_all(iters=25):
    # Benchmarks the five join variants from compute_all (Example #1)
    # across several datasets; the WS and TWO_GRAM tokenizers are assumed
    # to be defined at module level.
    timings = defaultdict(list)
    join_types = [
        'key_edit', 'key_jac_ws', 'key_jac_2g', 'mer_jac_ws', 'mer_jac_2g'
    ]
    data_base = '/lfs/1/sahaana/enrichment/data'

    # Fuzzy Join
    A = data_base + "/main_fuzzy/test_tableA_processed.pkl"
    B = data_base + "/main_fuzzy/test_tableB_processed.pkl"
    supervision = data_base + "/main_fuzzy/supervision_test.pkl"
    idA = 'FUZZY_ID'
    idB = 'tconst'
    keyA = 'primaryTitle'
    keyB = 'primaryTitle'
    merged = 'merged_all'

    A = pd.read_pickle(A).reset_index()
    B = pd.read_pickle(B).reset_index()
    supervision = pd.read_pickle(supervision)
    supervision = supervision.set_index(idA)

    results = compute_all(A, B, idA, idB, keyA, keyB, merged, iters)
    print('FJ')
    for j in range(len(join_types)):
        timings['data'].append('main_fuzzy')
        timings['type'].append(join_types[j])
        timings['results'].append(results[j])
        timings['avg'].append(np.mean(results[j]))
        print(timings['data'][-1], timings['type'][-1], timings['results'][-1],
              timings['avg'][-1])
    print()

    # IMDB_wiki
    A = data_base + "/imdb_wiki/dev_tableA_processed.pkl"
    B = data_base + "/imdb_wiki/dev_tableB_processed.pkl"
    supervision = data_base + "/imdb_wiki/supervision_test.pkl"
    idA = 'IMDB_ID'
    idB = 'QID'
    keyA = 'primaryTitle'
    keyB = 'title'
    merged = 'merged_all'

    A = pd.read_pickle(A).reset_index()
    B = pd.read_pickle(B).reset_index()
    supervision = pd.read_pickle(supervision)
    supervision = supervision.set_index(idA)

    results = compute_all(A, B, idA, idB, keyA, keyB, merged, iters)
    print('imdb_wiki')
    for j in range(len(join_types)):
        timings['data'].append('imdb_wiki')
        timings['type'].append(join_types[j])
        timings['results'].append(results[j])
        timings['avg'].append(np.mean(results[j]))
        print(timings['data'][-1], timings['type'][-1], timings['results'][-1],
              timings['avg'][-1])
    print()

    # SQUAD
    A = data_base + "/SQuAD/dev_tableA_processed.pkl"
    B = data_base + "/SQuAD/dev_tableB_sent_processed.pkl"
    supervision = data_base + "/SQuAD/dev_sent_labels.pkl"
    idA = 'QID'
    idB = 'SID'
    keyA = 'question'
    keyB = 'sentence'
    merged = 'merged_all'

    A = pd.read_pickle(A).reset_index()
    B = pd.read_pickle(B).reset_index()
    supervision = pd.read_pickle(supervision)
    supervision = supervision.set_index(idA)

    results = compute_all(A, B, idA, idB, keyA, keyB, merged, iters)
    print('SQUAD')
    for j in range(len(join_types)):
        timings['data'].append('squad')
        timings['type'].append(join_types[j])
        timings['results'].append(results[j])
        timings['avg'].append(np.mean(results[j]))
        print(timings['data'][-1], timings['type'][-1], timings['results'][-1],
              timings['avg'][-1])
    print()

    # Deepmatchers
    dm_data = {
        0: "joined_abt_buy_exp_data",
        1: "joined_amazon_google_exp_data",
        2: "joined_beer_exp_data",
        4: "joined_dblp_acm_exp_data",
        5: "joined_dblp_scholar_exp_data",
        6: "joined_dirty_dblp_acm_exp_data",
        7: "joined_dirty_dblp_scholar_exp_data",
        8: "joined_dirty_itunes_amazon_exp_data",
        9: "joined_dirty_walmart_amazon_exp_data",
        10: "joined_fodors_zagat_exp_data",
        11: "joined_itunes_amazon_exp_data",
        12: "joined_walmart_amazon_exp_data",
        30: "joined_company_exp_data"
    }

    dm_keys = {
        0: "name",
        1: "title",
        2: "Beer_Name",
        4: "title",
        5: "title",
        6: "title",
        7: "title",
        8: "Song_Name",
        9: "title",
        10: "name",
        11: "Song_Name",
        12: "title",
        30: "content"
    }
    for i in dm_data:
        print()
        print(dm_data[i])
        A = data_base + f"/dm_blocked/{dm_data[i]}/tableA_processed.pkl"
        B = data_base + f"/dm_blocked/{dm_data[i]}/tableB_processed.pkl"
        supervision = data_base + f"/dm_blocked/{dm_data[i]}/supervision_test.pkl"
        idA = 'id'
        idB = 'id'
        keyA = dm_keys[i]
        keyB = dm_keys[i]
        merged = 'merged_all'

        A = pd.read_pickle(A).reset_index()
        B = pd.read_pickle(B).reset_index()
        supervision = pd.read_pickle(supervision)
        supervision = supervision.set_index('ltable_id')
        results = compute_all(A, B, idA, idB, keyA, keyB, merged, iters)
        print(dm_data[i])
        for j in range(len(join_types)):
            timings['data'].append(dm_data[i])
            timings['type'].append(join_types[j])
            timings['results'].append(results[j])
            timings['avg'].append(np.mean(results[j]))
            print(timings['data'][-1], timings['type'][-1],
                  timings['results'][-1], timings['avg'][-1])
        print()

    # MS Marco
    data_base = '/lfs/1/sahaana/enrichment/data/MSMARCO/'
    A = pd.read_pickle(data_base + 'dev_tableA_processed.pkl')
    B = pd.read_pickle(data_base + 'tableB_processed.pkl')
    supervision = pd.read_pickle(data_base +
                                 'supervision_test.pkl').set_index('QID')
    tk = pd.read_csv(data_base + 'top1000.dev',
                     sep='\t',
                     names=['QID', 'PID', 'Query', 'Passage'])
    grouped_tk = tk.groupby('QID').agg(list)
    grouped_tk['Query'] = grouped_tk['Query'].apply(lambda x: x[0])
    grouped_tk = grouped_tk.reset_index()
    ke = []
    jkws = []
    jk2g = []
    jws = []
    j2g = []

    for _ in range(iters):
        tt = 0
        for idx, row in grouped_tk.iterrows():
            df_l = pd.DataFrame({'QID': [row.QID], 'Query': [row.Query]})
            df_r = pd.DataFrame({'PID': row.PID, 'Passage': row.Passage})
            start = time.time()
            scores = ssj.edit_distance_join(df_l,
                                            df_r,
                                            'QID',
                                            'PID',
                                            'Query',
                                            'Passage',
                                            100,
                                            l_out_attrs=['Query'],
                                            r_out_attrs=['Passage'],
                                            n_jobs=-3)
            tt += time.time() - start
        ke.append(tt)

        tt = 0
        for idx, row in grouped_tk.iterrows():
            df_l = pd.DataFrame({'QID': [row.QID], 'Query': [row.Query]})
            df_r = pd.DataFrame({'PID': row.PID, 'Passage': row.Passage})
            start = time.time()
            scores = ssj.jaccard_join(df_l,
                                      df_r,
                                      'QID',
                                      'PID',
                                      'Query',
                                      'Passage',
                                      WS,
                                      0.05,
                                      l_out_attrs=['Query'],
                                      r_out_attrs=['Passage'],
                                      n_jobs=-3)
            tt += time.time() - start
        jkws.append(tt)
        # No separate merged-text join is run for MS MARCO; reuse the
        # whitespace-Jaccard timing for the merged variant.
        jws.append(jkws[-1])

        tt = 0
        for idx, row in grouped_tk.iterrows():
            df_l = pd.DataFrame({'QID': [row.QID], 'Query': [row.Query]})
            df_r = pd.DataFrame({'PID': row.PID, 'Passage': row.Passage})
            start = time.time()
            scores = ssj.jaccard_join(df_l,
                                      df_r,
                                      'QID',
                                      'PID',
                                      'Query',
                                      'Passage',
                                      TWO_GRAM,
                                      0.05,
                                      l_out_attrs=['Query'],
                                      r_out_attrs=['Passage'],
                                      n_jobs=-3)
            tt += time.time() - start
        jk2g.append(tt)
        # Likewise reuse the 2-gram Jaccard timing for the merged variant.
        j2g.append(jk2g[-1])

    results = (ke, jkws, jk2g, jws, j2g)
    print('marco')
    for j in range(len(join_types)):
        timings['data'].append('marco')
        timings['type'].append(join_types[j])
        timings['results'].append(results[j])
        timings['avg'].append(np.mean(results[j]))
        print(timings['data'][-1], timings['type'][-1], timings['results'][-1],
              timings['avg'][-1])
    print()

    timings = pd.DataFrame(timings)
    timings.to_pickle(
        f'/lfs/1/sahaana/enrichment/ember/postprocessing/simjoin-iters_{iters}.pkl'
    )
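The pickled timings DataFrame can be inspected afterwards; a small sketch, assuming the default iters=25 run above:

import pandas as pd

timings = pd.read_pickle(
    '/lfs/1/sahaana/enrichment/ember/postprocessing/simjoin-iters_25.pkl')
# Mean runtime (seconds) per dataset and join type.
print(timings.pivot(index='data', columns='type', values='avg'))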