def compute_all(A, B, idA, idB, keyA, keyB, merged, iters):
    """Time each join variant `iters` times; return lists of per-iteration runtimes."""
    ke = []    # edit distance join on the key columns
    jkws = []  # Jaccard join on the key columns, whitespace tokens
    jk2g = []  # Jaccard join on the key columns, 2-gram tokens
    jws = []   # Jaccard join on the merged column, whitespace tokens
    j2g = []   # Jaccard join on the merged column, 2-gram tokens
    for _ in range(iters):
        start = time.time()
        ssj.edit_distance_join(A, B, idA, idB, keyA, keyB, 30, n_jobs=-1)
        ke.append(time.time() - start)

        start = time.time()
        ssj.jaccard_join(A, B, idA, idB, keyA, keyB, WS, 0.1, n_jobs=-1)
        jkws.append(time.time() - start)

        start = time.time()
        ssj.jaccard_join(A, B, idA, idB, keyA, keyB, TWO_GRAM, 0.1, n_jobs=-1)
        jk2g.append(time.time() - start)

        start = time.time()
        ssj.jaccard_join(A, B, idA, idB, merged, merged, WS, 0.1, n_jobs=-1)
        jws.append(time.time() - start)

        start = time.time()
        ssj.jaccard_join(A, B, idA, idB, merged, merged, TWO_GRAM, 0.1, n_jobs=-1)
        j2g.append(time.time() - start)
    return ke, jkws, jk2g, jws, j2g
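# compute_all and compute_joins below rely on module-level tokenizers named WS and
# TWO_GRAM that are not defined in this excerpt. A minimal sketch of plausible
# definitions with py_stringmatching, assuming whitespace tokens and character
# 2-grams returned as sets (an assumption, not taken from the original pipeline;
# skip this if the real definitions live elsewhere):
import py_stringmatching as sm

WS = sm.WhitespaceTokenizer(return_set=True)            # word-level tokens for Jaccard
TWO_GRAM = sm.QgramTokenizer(qval=2, return_set=True)   # character 2-grams for Jaccard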
def compute_joins(A, B, idA, idB, keyA, keyB, merged, compute=5):
    """Run the configured subset of similarity joins and return their output tables."""
    results = {}
    if compute in (5, 3):
        results = {
            'key_edit': ssj.edit_distance_join(A, B, idA, idB, keyA, keyB, 30, n_jobs=-5),
            'key_jac_ws': ssj.jaccard_join(A, B, idA, idB, keyA, keyB, WS, 0.1, n_jobs=-5),
            'key_jac_2g': ssj.jaccard_join(A, B, idA, idB, keyA, keyB, TWO_GRAM, 0.1, n_jobs=-5),
            'mer_jac_ws': ssj.jaccard_join(A, B, idA, idB, merged, merged, WS, 0.1, n_jobs=-5),
            'mer_jac_2g': ssj.jaccard_join(A, B, idA, idB, merged, merged, TWO_GRAM, 0.1, n_jobs=-5),
        }
    if compute == 2:
        results = {
            'key_jac_ws': ssj.jaccard_join(A, B, idA, idB, keyA, keyB, WS, 0.1, n_jobs=-5),
            'key_jac_2g': ssj.jaccard_join(A, B, idA, idB, keyA, keyB, TWO_GRAM, 0.1, n_jobs=-5),
        }
    if compute == 1:
        results = {
            'key_edit': ssj.edit_distance_join(A, B, idA, idB, keyA, keyB, 30, n_jobs=-5),
        }
    if compute == 0:
        results = {
            'key_jac_2g': ssj.jaccard_join(A, B, idA, idB, keyA, keyB, TWO_GRAM, 0.1, n_jobs=-5),
        }
    return results
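# A minimal usage sketch for compute_joins. The tiny tables and column names
# ('id', 'title', 'merged_all') are hypothetical and serve only this illustration;
# nothing runs until the function is called.
def _example_compute_joins():
    dfA = pd.DataFrame({'id': [0, 1],
                        'title': ['wall e', 'brave'],
                        'merged_all': ['wall e 2008', 'brave 2012']})
    dfB = pd.DataFrame({'id': [0, 1],
                        'title': ['wall.e', 'grave'],
                        'merged_all': ['wall.e 2008', 'grave 2012']})
    # compute=2 runs only the key-column Jaccard joins (whitespace and 2-gram tokens).
    joins = compute_joins(dfA, dfB, 'id', 'id', 'title', 'title', 'merged_all', compute=2)
    for name, pairs in joins.items():
        print(name, len(pairs))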
def apply_tables(self, ltable, rtable, l_key_attr, r_key_attr,
                 l_match_attr, r_match_attr, n_jobs=1):
    if self.sim_measure_type == 'JACCARD':
        return ssj.jaccard_join(ltable, rtable, l_key_attr, r_key_attr,
                                l_match_attr, r_match_attr, self.tokenizer,
                                self.threshold, comp_op=self.comp_op,
                                n_jobs=n_jobs)
    elif self.sim_measure_type == 'COSINE':
        return ssj.cosine_join(ltable, rtable, l_key_attr, r_key_attr,
                               l_match_attr, r_match_attr, self.tokenizer,
                               self.threshold, comp_op=self.comp_op,
                               n_jobs=n_jobs)
    elif self.sim_measure_type == 'DICE':
        return ssj.dice_join(ltable, rtable, l_key_attr, r_key_attr,
                             l_match_attr, r_match_attr, self.tokenizer,
                             self.threshold, comp_op=self.comp_op,
                             n_jobs=n_jobs)
    elif self.sim_measure_type == 'EDIT_DISTANCE':
        return ssj.edit_distance_join(ltable, rtable, l_key_attr, r_key_attr,
                                      l_match_attr, r_match_attr,
                                      self.threshold, comp_op=self.comp_op,
                                      n_jobs=n_jobs)
    elif self.sim_measure_type == 'OVERLAP':
        return ssj.overlap_join(ltable, rtable, l_key_attr, r_key_attr,
                                l_match_attr, r_match_attr, self.tokenizer,
                                self.threshold, comp_op=self.comp_op,
                                n_jobs=n_jobs)
    elif self.sim_measure_type == 'OVERLAP_COEFFICIENT':
        return ssj.overlap_coefficient_join(ltable, rtable, l_key_attr, r_key_attr,
                                            l_match_attr, r_match_attr, self.tokenizer,
                                            self.threshold, comp_op=self.comp_op,
                                            n_jobs=n_jobs)
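# A sketch of driving apply_tables. The enclosing class is not shown in this excerpt,
# so a hypothetical configuration object stands in for `self`; its attribute names
# mirror the ones apply_tables reads (sim_measure_type, tokenizer, threshold, comp_op),
# and the key/match column names ('id', 'name') are placeholders.
from types import SimpleNamespace
import py_stringmatching as sm

def _example_apply_tables(ltable, rtable):
    config = SimpleNamespace(sim_measure_type='JACCARD',
                             tokenizer=sm.WhitespaceTokenizer(return_set=True),
                             threshold=0.3,
                             comp_op='>=')
    # For this configuration, apply_tables dispatches to ssj.jaccard_join.
    return apply_tables(config, ltable, rtable, 'id', 'id', 'name', 'name', n_jobs=1)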
# Join pairs whose edit distance on 'name' and 'attraction' is at most 3
# (the threshold argument below).
# l_out_attrs and r_out_attrs denote the attributes from the
# left table (A) and right table (B) that need to be included in the output.
#
# Blog table attributes:
#   name locType longitude latitude closest_to blogLinks src dest address srcLoc destLoc
# TripAdvisor table attributes:
#   attraction_in no_of_attractions no_of_reviews url reviewComment activity_type rank max_rank
#   attraction contact_no knownFor address pinCode
output_pairs = ssj.edit_distance_join(
    blogTable, tripAdvisorTable, 'name', 'url', 'name', 'attraction', 3,
    l_out_prefix='blogs_', r_out_prefix='tripAdvisor_',
    l_out_attrs=['name', 'locType', 'address'],
    r_out_attrs=[
        'attraction', 'knownFor', 'address', 'pinCode', 'reviewComment',
        'no_of_reviews', 'url', 'contact_no', 'rank', 'max_rank',
        'activity_type', 'attraction_in', 'no_of_attractions'
    ],
    n_jobs=-1)
print(len(output_pairs))

# examine the output pairs: write them to CSV and inspect the similarity scores
output_pairs.to_csv('editDistance.csv')
print(output_pairs.loc[:, '_sim_score'])
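# The '_sim_score' column of an edit_distance_join result holds the edit distance for
# each pair, so tighter matches can be pulled out after the join. A short sketch
# (the cutoff of 1 is chosen only for illustration):
close_pairs = output_pairs[output_pairs['_sim_score'] <= 1]
print(len(close_pairs))
print(close_pairs.head())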
data_attempt1 = pd.merge(imdb_data, kaggle_data, how='inner',
                         left_on=['norm_title', 'norm_year'],
                         right_on=['norm_movie_title', 'norm_title_year'])
data_attempt1.shape

# But given that the IMDB and Kaggle datasets are collected from different sources,
# chances are that the name of a movie will be slightly different in these datasets
# (e.g., "Wall.E" vs. "WallE"). To find such matches, one can look at the similarity
# of movie titles and consider titles with high similarity to be the same entity.
# BigGorilla's recommendation for doing a similarity join across two datasets is the
# python package **py_stringsimjoin**. The following code snippet uses
# **py_stringsimjoin** to match all the titles that have an edit distance of one or
# less (i.e., at most one character needs to be changed/added/removed to make both
# titles identical). Once the similarity join is complete, it only keeps the title
# pairs that were produced in the same year.

# In[18]:

import py_stringsimjoin as ssj
import py_stringmatching as sm

imdb_data['id'] = range(imdb_data.shape[0])
kaggle_data['id'] = range(kaggle_data.shape[0])
similar_titles = ssj.edit_distance_join(imdb_data, kaggle_data, 'id', 'id',
                                        'norm_title', 'norm_movie_title',
                                        l_out_attrs=['norm_title', 'norm_year'],
                                        r_out_attrs=['norm_movie_title', 'norm_title_year'],
                                        threshold=1)
# selecting the entries that have the same production year
data_attempt2 = similar_titles[similar_titles.r_norm_title_year == similar_titles.l_norm_year]
data_attempt2.shape

# We can see that the similarity join matched 4,689 titles. Let's look at some of the
# titles that are matched by the similarity join but are not identical.

# In[19]:

data_attempt2[data_attempt2.l_norm_title != data_attempt2.r_norm_movie_title].head()

# While instances such as "walle" and "wall.e" are correctly matched, we can see that
# this technique also makes some errors (e.g., "grave" and "brave"). This raises the
# following questions: "What method should be used for data matching?" and "How can we
# determine the quality of the matching?". BigGorilla's recommendation for dealing with
# this problem is the python package **py_entitymatching**, which is developed as part
# of the [Magellan project](https://sites.google.com/site/anhaidgroup/projects/magellan).
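# To make the threshold concrete: "walle" and "wall.e" differ by one inserted character,
# so their edit distance is 1 and the pair survives threshold=1; "grave" vs. "brave" is
# a single substitution, which is why that false match also slips through. A quick check
# with py_stringmatching's Levenshtein measure (illustrative only):
lev = sm.Levenshtein()
print(lev.get_raw_score('walle', 'wall.e'))  # 1
print(lev.get_raw_score('grave', 'brave'))   # 1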
print('py_stringsimjoin version: ' + ssj.__version__)
print('py_stringmatching version: ' + sm.__version__)
print('pandas version: ' + pd.__version__)

table_A_path = os.sep.join(
    [ssj.get_install_path(), 'datasets', 'data', 'person_table_A.csv'])
table_B_path = os.sep.join(
    [ssj.get_install_path(), 'datasets', 'data', 'person_table_B.csv'])

# Load csv files as dataframes.
A = pd.read_csv(table_A_path)
B = pd.read_csv(table_B_path)
print('Number of records in A: ' + str(len(A)))
print('Number of records in B: ' + str(len(B)))

ssj.profile_table_for_join(A)

B['new_key_attr'] = range(0, len(B))

ws = sm.WhitespaceTokenizer(return_set=True)
ws.tokenize('William Bridge')

output_pairs = ssj.edit_distance_join(A, B, 'A.id', 'B.id', 'A.name', 'B.name', 5,
                                      l_out_attrs=['A.name'], r_out_attrs=['B.name'])
print(len(output_pairs))
print(output_pairs)
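# The whitespace tokenizer built above can also drive a set-based join; a sketch of a
# Jaccard join over the same key and name attributes (the 0.3 threshold is chosen only
# for illustration):
jaccard_pairs = ssj.jaccard_join(A, B, 'A.id', 'B.id', 'A.name', 'B.name', ws, 0.3,
                                 l_out_attrs=['A.name'], r_out_attrs=['B.name'])
print(len(jaccard_pairs))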
def time_all(iters=25):
    timings = defaultdict(list)
    join_types = [
        'key_edit', 'key_jac_ws', 'key_jac_2g', 'mer_jac_ws', 'mer_jac_2g'
    ]
    data_base = '/lfs/1/sahaana/enrichment/data'

    # Fuzzy Join
    A = data_base + "/main_fuzzy/test_tableA_processed.pkl"
    B = data_base + "/main_fuzzy/test_tableB_processed.pkl"
    supervision = data_base + "/main_fuzzy/supervision_test.pkl"
    idA = 'FUZZY_ID'
    idB = 'tconst'
    keyA = 'primaryTitle'
    keyB = 'primaryTitle'
    merged = 'merged_all'

    A = pd.read_pickle(A).reset_index()
    B = pd.read_pickle(B).reset_index()
    supervision = pd.read_pickle(supervision)
    supervision = supervision.set_index(idA)

    results = compute_all(A, B, idA, idB, keyA, keyB, merged, iters)
    print('FJ')
    for j in range(len(join_types)):
        timings['data'].append('main_fuzzy')
        timings['type'].append(join_types[j])
        timings['results'].append(results[j])
        timings['avg'].append(np.mean(results[j]))
        print(timings['data'][-1], timings['type'][-1],
              timings['results'][-1], timings['avg'][-1])
    print()

    # IMDB_wiki
    A = data_base + "/imdb_wiki/dev_tableA_processed.pkl"
    B = data_base + "/imdb_wiki/dev_tableB_processed.pkl"
    supervision = data_base + "/imdb_wiki/supervision_test.pkl"
    idA = 'IMDB_ID'
    idB = 'QID'
    keyA = 'primaryTitle'
    keyB = 'title'
    merged = 'merged_all'

    A = pd.read_pickle(A).reset_index()
    B = pd.read_pickle(B).reset_index()
    supervision = pd.read_pickle(supervision)
    supervision = supervision.set_index(idA)

    results = compute_all(A, B, idA, idB, keyA, keyB, merged, iters)
    print('imdb_wiki')
    for j in range(len(join_types)):
        timings['data'].append('imdb_wiki')
        timings['type'].append(join_types[j])
        timings['results'].append(results[j])
        timings['avg'].append(np.mean(results[j]))
        print(timings['data'][-1], timings['type'][-1],
              timings['results'][-1], timings['avg'][-1])
    print()

    # SQUAD
    A = data_base + "/SQuAD/dev_tableA_processed.pkl"
    B = data_base + "/SQuAD/dev_tableB_sent_processed.pkl"
    supervision = data_base + "/SQuAD/dev_sent_labels.pkl"
    idA = 'QID'
    idB = 'SID'
    keyA = 'question'
    keyB = 'sentence'
    merged = 'merged_all'

    A = pd.read_pickle(A).reset_index()
    B = pd.read_pickle(B).reset_index()
    supervision = pd.read_pickle(supervision)
    supervision = supervision.set_index(idA)

    results = compute_all(A, B, idA, idB, keyA, keyB, merged, iters)
    print('SQUAD')
    for j in range(len(join_types)):
        timings['data'].append('squad')
        timings['type'].append(join_types[j])
        timings['results'].append(results[j])
        timings['avg'].append(np.mean(results[j]))
        print(timings['data'][-1], timings['type'][-1],
              timings['results'][-1], timings['avg'][-1])
    print()

    # Deepmatchers
    dm_data = {
        0: "joined_abt_buy_exp_data",
        1: "joined_amazon_google_exp_data",
        2: "joined_beer_exp_data",
        4: "joined_dblp_acm_exp_data",
        5: "joined_dblp_scholar_exp_data",
        6: "joined_dirty_dblp_acm_exp_data",
        7: "joined_dirty_dblp_scholar_exp_data",
        8: "joined_dirty_itunes_amazon_exp_data",
        9: "joined_dirty_walmart_amazon_exp_data",
        10: "joined_fodors_zagat_exp_data",
        11: "joined_itunes_amazon_exp_data",
        12: "joined_walmart_amazon_exp_data",
        30: "joined_company_exp_data"
    }
    dm_keys = {
        0: "name",
        1: "title",
        2: "Beer_Name",
        4: "title",
        5: "title",
        6: "title",
        7: "title",
        8: "Song_Name",
        9: "title",
        10: "name",
        11: "Song_Name",
        12: "title",
        30: "content"
    }
    for i in dm_data:
        print()
        print(dm_data[i])
        A = data_base + f"/dm_blocked/{dm_data[i]}/tableA_processed.pkl"
        B = data_base + f"/dm_blocked/{dm_data[i]}/tableB_processed.pkl"
        supervision = data_base + f"/dm_blocked/{dm_data[i]}/supervision_test.pkl"
        idA = 'id'
        idB = 'id'
        keyA = dm_keys[i]
        keyB = dm_keys[i]
        merged = 'merged_all'
        A = pd.read_pickle(A).reset_index()
        B = pd.read_pickle(B).reset_index()
        supervision = pd.read_pickle(supervision)
        supervision = supervision.set_index('ltable_id')

        results = compute_all(A, B, idA, idB, keyA, keyB, merged, iters)
        print(dm_data[i])
        for j in range(len(join_types)):
            timings['data'].append(dm_data[i])
            timings['type'].append(join_types[j])
            timings['results'].append(results[j])
            timings['avg'].append(np.mean(results[j]))
            print(timings['data'][-1], timings['type'][-1],
                  timings['results'][-1], timings['avg'][-1])
        print()

    # MS Marco
    data_base = '/lfs/1/sahaana/enrichment/data/MSMARCO/'
    A = pd.read_pickle(data_base + 'dev_tableA_processed.pkl')
    B = pd.read_pickle(data_base + 'tableB_processed.pkl')
    supervision = pd.read_pickle(data_base + 'supervision_test.pkl').set_index('QID')

    tk = pd.read_csv(data_base + 'top1000.dev', sep='\t',
                     names=['QID', 'PID', 'Query', 'Passage'])
    grouped_tk = tk.groupby('QID').agg(list)
    grouped_tk['Query'] = grouped_tk['Query'].apply(lambda x: x[0])
    grouped_tk = grouped_tk.reset_index()

    ke = []
    jkws = []
    jk2g = []
    jws = []
    j2g = []
    for _ in range(iters):
        # Edit distance join, one query table against its candidate passages at a time.
        tt = 0
        for idx, row in grouped_tk.iterrows():
            df_l = pd.DataFrame({'QID': [row.QID], 'Query': [row.Query]})
            df_r = pd.DataFrame({'PID': row.PID, 'Passage': row.Passage})
            start = time.time()
            scores = ssj.edit_distance_join(df_l, df_r, 'QID', 'PID',
                                            'Query', 'Passage', 100,
                                            l_out_attrs=['Query'],
                                            r_out_attrs=['Passage'],
                                            n_jobs=-3)
            tt += time.time() - start
        ke.append(tt)

        # Jaccard join with whitespace tokens.
        tt = 0
        for idx, row in grouped_tk.iterrows():
            df_l = pd.DataFrame({'QID': [row.QID], 'Query': [row.Query]})
            df_r = pd.DataFrame({'PID': row.PID, 'Passage': row.Passage})
            start = time.time()
            scores = ssj.jaccard_join(df_l, df_r, 'QID', 'PID',
                                      'Query', 'Passage', WS, 0.05,
                                      l_out_attrs=['Query'],
                                      r_out_attrs=['Passage'],
                                      n_jobs=-3)
            tt += time.time() - start
        jkws.append(tt)
        # Record the same timing under the merged-column slot as well.
        jws.append(jkws[-1])

        # Jaccard join with 2-gram tokens.
        tt = 0
        for idx, row in grouped_tk.iterrows():
            df_l = pd.DataFrame({'QID': [row.QID], 'Query': [row.Query]})
            df_r = pd.DataFrame({'PID': row.PID, 'Passage': row.Passage})
            start = time.time()
            scores = ssj.jaccard_join(df_l, df_r, 'QID', 'PID',
                                      'Query', 'Passage', TWO_GRAM, 0.05,
                                      l_out_attrs=['Query'],
                                      r_out_attrs=['Passage'],
                                      n_jobs=-3)
            tt += time.time() - start
        jk2g.append(tt)
        j2g.append(jk2g[-1])

    results = (ke, jkws, jk2g, jws, j2g)
    print('marco')
    for j in range(len(join_types)):
        timings['data'].append('marco')
        timings['type'].append(join_types[j])
        timings['results'].append(results[j])
        timings['avg'].append(np.mean(results[j]))
        print(timings['data'][-1], timings['type'][-1],
              timings['results'][-1], timings['avg'][-1])
    print()

    timings = pd.DataFrame(timings)
    timings.to_pickle(
        f'/lfs/1/sahaana/enrichment/ember/postprocessing/simjoin-iters_{iters}.pkl'
    )
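# A sketch of how the benchmark could be driven and its output inspected afterwards;
# the pickle path mirrors the one hard-coded in time_all above.
if __name__ == '__main__':
    time_all(iters=25)
    timings = pd.read_pickle(
        '/lfs/1/sahaana/enrichment/ember/postprocessing/simjoin-iters_25.pkl')
    print(timings.groupby(['data', 'type'])['avg'].mean())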