def setUp(self):
    """Load tables A and B, key both on 'ID', and prepare a blocking
    feature table plus a fresh rule-based blocker for each test."""
    self.A = em.read_csv_metadata(path_for_A)
    em.set_key(self.A, 'ID')
    self.B = em.read_csv_metadata(path_for_B)
    em.set_key(self.B, 'ID')
    # Skip attribute-type validation when auto-generating features.
    self.feature_table = em.get_features_for_blocking(
        self.A, self.B, validate_inferred_attr_types=False)
    self.rb = em.RuleBasedBlocker()
def setUp(self):
    """Load tables A and B, key both on 'ID', and prepare a blocking
    feature table plus a fresh rule-based blocker for each test."""
    self.A = em.read_csv_metadata(path_for_A)
    em.set_key(self.A, 'ID')
    self.B = em.read_csv_metadata(path_for_B)
    em.set_key(self.B, 'ID')
    self.feature_table = em.get_features_for_blocking(self.A, self.B)
    self.rb = em.RuleBasedBlocker()
def setup(self):
    """Read the 'anime' A/B tables, key them on 'ID', and build the
    blocking feature table; exit with a hint if the dataset is missing."""
    path_for_A = os.sep.join([datasets_path, 'anime', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'anime', 'B.csv'])
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
        self.feature_table = mg.get_features_for_blocking(self.A, self.B)
    except AssertionError:
        # read_csv_metadata asserts on a missing file — treat as "not downloaded".
        print("Dataset \'anime\' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit
def setup(self):
    """Benchmark setup for the 'ebooks' dataset: overlap-block A/B on
    'title', then prepare a rule-based blocker on date similarity.

    Exits with a download hint if the dataset files are missing.
    """
    path_for_A = os.sep.join([datasets_path, 'ebooks', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'ebooks', 'B.csv'])
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'record_id')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'record_id')
        # Candidate set: titles must share at least 2 non-stopword tokens.
        ob = mg.OverlapBlocker()
        self.C = ob.block_tables(
            A, B, 'title', 'title', overlap_size=2, rem_stop_words=True,
            l_output_attrs=['title', 'author', 'publisher', 'date'],
            r_output_attrs=['title', 'author', 'publisher', 'date'])
        feature_table = mg.get_features_for_blocking(A, B)
        # Rule blocker: drop pairs whose dates have Levenshtein sim < 0.6.
        self.rb = mg.RuleBasedBlocker()
        self.rb.add_rule(['date_date_lev_sim(ltuple, rtuple) < 0.6'],
                         feature_table)
    except AssertionError:
        # BUG FIX: the message previously named the 'beer' dataset even
        # though this setup loads 'ebooks'.
        print("Dataset \'ebooks\' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit
def setup(self):
    """Benchmark setup for the 'restaurants' dataset: overlap-block A/B on
    'ADDRESS', then prepare a rule-based blocker on address similarity.

    Exits with a download hint if the dataset files are missing.
    """
    path_for_A = os.sep.join([datasets_path, 'restaurants', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'restaurants', 'B.csv'])
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'ID')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'ID')
        # Candidate set: addresses must share at least 4 tokens.
        ob = mg.OverlapBlocker()
        self.C = ob.block_tables(
            A, B, 'ADDRESS', 'ADDRESS', overlap_size=4,
            l_output_attrs=['NAME', 'PHONENUMBER', 'ADDRESS'],
            r_output_attrs=['NAME', 'PHONENUMBER', 'ADDRESS'])
        feature_table = mg.get_features_for_blocking(A, B)
        # Rule blocker: drop pairs with 3-gram Jaccard on ADDRESS < 0.44.
        self.rb = mg.RuleBasedBlocker()
        self.rb.add_rule(
            ['ADDRESS_ADDRESS_jac_qgm_3_qgm_3(ltuple,rtuple) < 0.44'],
            feature_table)
    except AssertionError:
        # BUG FIX: the message previously named the 'beer' dataset even
        # though this setup loads 'restaurants'.
        print("Dataset \'restaurants\' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit
# Notebook cells: sanity-check missing values, then build two rule-based
# blockers over the auto-generated blocking features.

# In[21]: missing-value count per column of A (ID column has no NaNs).
len(A.ID) - A.count()

# In[22]: same missing-value check for B.
len(B.ID) - B.count()

# In[41]: display table A.
A

# In[79]: auto-generate blocking features for A x B.
block_f = em.get_features_for_blocking(A, B)

# In[80]: display the feature table.
block_f

# In[81]: one blocker per rule — drop pairs with low name / address similarity.
rb1 = em.RuleBasedBlocker()
rule1 = 'name_name_lev_sim(ltuple,rtuple) < 0.8'
# FIX: add_rule takes a list of conjunct strings (every other call site in
# this codebase passes a list); wrap the bare rule string.
rb1.add_rule([rule1], block_f)
rb2 = em.RuleBasedBlocker()
rule2 = 'address_address_jac_qgm_3_qgm_3(ltuple,rtuple) < 0.9'
rb2.add_rule([rule2], block_f)
# Set up py_entitymatching and build blocking features for the bundled
# person tables, then register one hand-written Jaccard feature.
import py_entitymatching as em
import os
import pandas as pd

# Locate the sample datasets shipped with the package.
datasets_dir = em.get_install_path() + os.sep + 'datasets'
path_A = datasets_dir + os.sep + 'person_table_A.csv'
path_B = datasets_dir + os.sep + 'person_table_B.csv'

# Load both tables, keyed on 'ID'.
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')

# Auto-generate the blocking feature table and show a sample.
feature_table = em.get_features_for_blocking(A, B)
print(feature_table.head())

# Custom feature: whitespace-token Jaccard over lowercased name + address.
sim = em.get_sim_funs_for_matching()
tok = em.get_tokenizers_for_matching()
feature_string = """jaccard(wspace((ltuple['name'] + ' ' + ltuple['address']).lower()), wspace((rtuple['name'] + ' ' + rtuple['address']).lower()))"""
feature = em.get_feature_fn(feature_string, sim, tok)

# Register the custom feature in the feature table.
em.add_feature(feature_table, 'jac_ws_name_address', feature)
# NOTE(review): incomplete fragment — it opens inside a blocker call whose
# start is outside this view and ends inside a read_csv_metadata(...) call;
# kept byte-identical. Visible steps: union blocker outputs C1/C2/C3 into D,
# prune D with a Title Levenshtein-similarity rule (sim < 0.4 blocked) to get
# candidate set C, then begin loading labeled data G from '../data/G.csv'.
l_output_attrs=[ 'Title', 'Genre', 'Score', 'Release Date', 'Rating', 'Directed By', 'Written By', 'Studio' ], r_output_attrs=[ 'Title', 'Genre', 'Score', 'Release Date', 'Rating', 'Directed By', 'Written By', 'Studio' ]) # Combine the outputs from attr. equivalence blocker and overlap blocker # union because if there is an error in the release date, at least the movies should have their names in common D = em.combine_blocker_outputs_via_union([C1, C2, C3]) # Rule based blocker after D block_f = em.get_features_for_blocking(A, B, validate_inferred_attr_types=False) rb = em.RuleBasedBlocker() # print(block_f) rb.add_rule(['Title_Title_lev_sim(ltuple, rtuple) < 0.4'], block_f) C = rb.block_candset(D, show_progress=False) print('Candidate Match set C Size: ', len(C)) print('Finish Blocking stage') ################################## Matcher Portion ################################## # Open up out labeled data from the last Project. path_G = '../data/G.csv' G = em.read_csv_metadata(path_G, key='_id', ltable=A, rtable=B,
# NOTE(review): incomplete fragment — opens inside a call whose start is not
# visible here; kept byte-identical. It uses Python 2 `print` statements, so
# this snippet will not parse under Python 3 as-is — TODO confirm target
# interpreter. Visible steps: overlap-block candidate_pairs on 'artist',
# rebuild the blocking feature table from attribute correspondences,
# tokenizers and similarity functions, then prune candidates with a
# name-Jaccard rule (sim < 0.3 blocked).
show_progress=True) print len(candidate_pairs) candidate_pairs = ob.block_candset(candidate_pairs, 'artist', 'artist', word_level=True, overlap_size=1, show_progress=True) print len(candidate_pairs) #em.to_csv_metadata(reduced_pairs,'C:/Users/Daniel/Documents/UW/838/Project/Stage3/data/pairs_after_ob_title_and_artist.csv') block_f = em.get_features_for_blocking(songs, tracks) block_c = em.get_attr_corres(songs, tracks) block_t = em.get_tokenizers_for_blocking() block_s = em.get_sim_funs_for_blocking() atypes1 = em.get_attr_types(songs) atypes2 = em.get_attr_types(tracks) block_f = em.get_features(songs, tracks, atypes1, atypes2, block_c, block_t, block_s) rb = em.RuleBasedBlocker() rb.add_rule(["name_name_jac_dlm_dc0_dlm_dc0(ltuple, rtuple) < 0.3"], block_f) candidate_pairs = rb.block_candset(candidate_pairs, show_progress=True)
# NOTE(review): incomplete fragment — begins mid-argument-list of a blocker
# call whose start is outside this view, and `rb` is defined elsewhere; kept
# byte-identical. Visible steps: overlap-block C1 on 'title' to get C2, apply
# two title-similarity rules (cosine and 3-gram Jaccard, both < 0.6 blocked)
# to C1 to get C3, then union C2 and C3 into D. NOTE(review): C3 is built
# from C1, not C2 — presumably intentional (each blocker prunes C1
# independently before the union); confirm with the author.
'year', l_output_attrs=['title', 'year'], r_output_attrs=['title', 'year']) C2 = ob.block_candset(C1, 'title', 'title', word_level=True, rem_stop_words=True, overlap_size=1) # C2 = ob.block_tables(sample_movies, sample_tracks, 'title', 'title', word_level=True, rem_stop_words=True, # overlap_size=1, l_output_attrs=['title', 'year'], # r_output_attrs=['title', 'year'], # show_progress=False) block_f = em.get_features_for_blocking(sample_movies, sample_tracks) rb.add_rule([ 'title_title_cos_dlm_dc0_dlm_dc0(ltuple, rtuple) < 0.6', 'title_title_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.6' ], block_f) C3 = rb.block_candset(C1, n_jobs=-1, show_progress=False) # C3 = ab.block_candset(C1, l_block_attr='year', r_block_attr='year') D = em.combine_blocker_outputs_via_union([C2, C3]) # C3 = C1 # Use block_tables to apply blocking over two input tables. # corres = [('title', 'title'), ('year', 'year')]
def workflow(path_A, path_B, path_labeled):
    """End-to-end matching workflow over two product tables.

    Blocks A x B in three stages (brand equality, clean-name similarity,
    black-box hardware comparison), trains a random-forest matcher on the
    labeled data at ``path_labeled``, and scores the candidate set.

    Returns only the rows predicted as matches (``predicted == 1``), with
    columns ['_id', 'ltable_ID', 'rtable_ID', 'predicted'].
    """
    # Load both input tables, keyed on 'ID'.
    A = em.read_csv_metadata(path_A, key='ID')
    B = em.read_csv_metadata(path_B, key='ID')

    # Attributes carried through to the candidate set (same on both sides).
    out_attrs = [
        'Name', 'Price', 'Brand', 'Screen Size', 'RAM',
        'Hard Drive Capacity', 'Processor Type', 'Processor Speed',
        'Operating System', 'Clean Name'
    ]

    # Stage 1: keep only pairs with equal 'Brand'.
    ab = em.AttrEquivalenceBlocker()
    C1 = ab.block_tables(A, B, 'Brand', 'Brand',
                         l_output_attrs=out_attrs,
                         r_output_attrs=out_attrs)

    # Stage 2: drop pairs whose cleaned names have low 3-gram Jaccard sim.
    block_f = em.get_features_for_blocking(
        A, B, validate_inferred_attr_types=False)
    rb = em.RuleBasedBlocker()
    rb.add_rule(
        ['Clean_Name_Clean_Name_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.2'],
        block_f)
    C2 = rb.block_candset(C1)

    # Stage 3: black-box check on screen size, RAM and hard drive capacity.
    bb_screen = em.BlackBoxBlocker()
    bb_screen.set_black_box_function(screen_ram_hd_equal)
    C = bb_screen.block_candset(C2)

    # Load the labeled development data.
    L = em.read_csv_metadata(path_labeled, key='_id', ltable=A, rtable=B,
                             fk_ltable='ltable_ID', fk_rtable='rtable_ID')

    # Matching features: a hand-picked row subset of the generated table,
    # plus one custom black-box feature.
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)
    feature_subset = feature_table.iloc[np.r_[4:10, 40:len(feature_table)], :]
    em.add_blackbox_feature(feature_subset, 'refurbished', refurbished)

    # Feature vectors for the labeled data; impute missing values with the
    # column means.
    feature_vectors_dev = em.extract_feature_vecs(
        L, feature_table=feature_subset, attrs_after='gold')
    feature_vectors_dev = em.impute_table(
        feature_vectors_dev,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'],
        strategy='mean')

    # Train a random-forest matcher on the labeled vectors.
    matcher = em.RFMatcher(name='RF')
    matcher.fit(table=feature_vectors_dev,
                exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'],
                target_attr='gold')

    # Score the full candidate set with the same features and imputation.
    feature_vectors = em.extract_feature_vecs(C, feature_table=feature_subset)
    feature_vectors = em.impute_table(
        feature_vectors,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
        strategy='mean')
    predictions = matcher.predict(
        table=feature_vectors,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
        append=True, target_attr='predicted', inplace=False)

    # Keep only the predicted matches.
    predictions = predictions.loc[:, ['_id', 'ltable_ID', 'rtable_ID',
                                      'predicted']]
    return predictions[predictions['predicted'] == 1]