def time_block_tables(self):
    """Benchmark rule-based blocking of A and B on movie-name 3-gram Jaccard."""
    blocker = mg.RuleBasedBlocker()
    blocker.add_rule(
        ['movie_name_movie_name_jac_qgm_3_qgm_3(ltuple,rtuple) < 0.6'],
        self.feature_table)
    left_attrs = ['movie_name', 'year', 'directors', 'actors',
                  'critic_rating', 'genre', 'pg_rating', 'duration']
    right_attrs = ['movie_name', 'year', 'directors', 'actors',
                   'movie_rating', 'genre', 'duration']
    blocker.block_tables(self.A, self.B, left_attrs, right_attrs)
def setUp(self):
    """Load tables A and B, register their keys, and prepare blocking features."""
    self.A = em.read_csv_metadata(path_for_A)
    em.set_key(self.A, 'ID')
    self.B = em.read_csv_metadata(path_for_B)
    em.set_key(self.B, 'ID')
    # Feature generation skips attribute-type validation here.
    self.feature_table = em.get_features_for_blocking(
        self.A, self.B, validate_inferred_attr_types=False)
    self.rb = em.RuleBasedBlocker()
def setUp(self):
    """Load tables A and B, register their keys, and prepare blocking features."""
    self.A = em.read_csv_metadata(path_for_A)
    em.set_key(self.A, 'ID')
    self.B = em.read_csv_metadata(path_for_B)
    em.set_key(self.B, 'ID')
    self.feature_table = em.get_features_for_blocking(self.A, self.B)
    self.rb = em.RuleBasedBlocker()
def test_rb_block_candset_empty_output_njobs_2(self):
    """block_candset with two jobs where the second rule prunes every pair."""
    first_blocker = em.RuleBasedBlocker()
    first_blocker.add_rule(rule_1, self.feature_table)
    C = first_blocker.block_tables(self.A, self.B)
    validate_metadata(C)
    validate_data(C, expected_ids_1)
    # Apply a second rule over the candidate set in parallel.
    self.rb.add_rule(rule_5, self.feature_table)
    D = self.rb.block_candset(C, n_jobs=2)
    validate_metadata_two_candsets(C, D)
    validate_data(D)
def test_rb_block_candset_empty_input(self):
    """block_candset applied to a candidate set that is already empty."""
    first_blocker = em.RuleBasedBlocker()
    first_blocker.add_rule(rule_5, self.feature_table)
    C = first_blocker.block_tables(self.A, self.B)
    validate_metadata(C)
    validate_data(C)
    self.rb.add_rule(rule_1, self.feature_table)
    D = self.rb.block_candset(C)
    validate_metadata_two_candsets(C, D)
    validate_data(D)
def test_rb_block_candset_njobs_2(self):
    """block_candset with two parallel jobs applying a second rule."""
    first_blocker = em.RuleBasedBlocker()
    first_blocker.add_rule(rule_1, self.feature_table)
    C = first_blocker.block_tables(self.A, self.B,
                                   l_output_attrs, r_output_attrs,
                                   l_output_prefix, r_output_prefix)
    validate_metadata(C, l_output_attrs, r_output_attrs,
                      l_output_prefix, r_output_prefix)
    validate_data(C, expected_ids_1)
    self.rb.add_rule(rule_2, self.feature_table)
    D = self.rb.block_candset(C, n_jobs=2)
    validate_metadata_two_candsets(C, D)
    validate_data(D, expected_ids_1_and_2)
def setup(self):
    """Load the 'books' dataset, register keys, and prepare a rule-based blocker.

    Exits the process if the dataset files are missing.
    """
    path_for_A = os.sep.join([datasets_path, 'books', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'books', 'B.csv'])
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
        self.feature_table = mg.get_features_for_blocking(self.A, self.B)
        self.rb = mg.RuleBasedBlocker()
    except AssertionError:
        # Fix copy-paste defect: the message previously referred to the
        # 'beer' dataset although this benchmark loads 'books'.
        print("Dataset 'books' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit
def setup(self):
    """Load the 'ebooks' dataset, overlap-block on title, and build a
    rule-based blocker for the candidate set.

    Exits the process if the dataset files are missing.
    """
    path_for_A = os.sep.join([datasets_path, 'ebooks', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'ebooks', 'B.csv'])
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'record_id')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'record_id')
        ob = mg.OverlapBlocker()
        self.C = ob.block_tables(
            A, B, 'title', 'title', overlap_size=2, rem_stop_words=True,
            l_output_attrs=['title', 'author', 'publisher', 'date'],
            r_output_attrs=['title', 'author', 'publisher', 'date'])
        feature_table = mg.get_features_for_blocking(A, B)
        self.rb = mg.RuleBasedBlocker()
        self.rb.add_rule(['date_date_lev_sim(ltuple, rtuple) < 0.6'],
                         feature_table)
    except AssertionError:
        # Fix copy-paste defect: the message previously referred to the
        # 'beer' dataset although this benchmark loads 'ebooks'.
        print("Dataset 'ebooks' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit
def setup(self):
    """Load the 'restaurants' dataset, overlap-block on address, and build a
    rule-based blocker for the candidate set.

    Exits the process if the dataset files are missing.
    """
    path_for_A = os.sep.join([datasets_path, 'restaurants', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'restaurants', 'B.csv'])
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'ID')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'ID')
        ob = mg.OverlapBlocker()
        self.C = ob.block_tables(
            A, B, 'ADDRESS', 'ADDRESS', overlap_size=4,
            l_output_attrs=['NAME', 'PHONENUMBER', 'ADDRESS'],
            r_output_attrs=['NAME', 'PHONENUMBER', 'ADDRESS'])
        feature_table = mg.get_features_for_blocking(A, B)
        self.rb = mg.RuleBasedBlocker()
        self.rb.add_rule(
            ['ADDRESS_ADDRESS_jac_qgm_3_qgm_3(ltuple,rtuple) < 0.44'],
            feature_table)
    except AssertionError:
        # Fix copy-paste defect: the message previously referred to the
        # 'beer' dataset although this benchmark loads 'restaurants'.
        print("Dataset 'restaurants' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit
# In[41]: A # In[79]: block_f = em.get_features_for_blocking(A, B) # In[80]: block_f # In[81]: rb1 = em.RuleBasedBlocker() rule1 = 'name_name_lev_sim(ltuple,rtuple) < 0.8' rb1.add_rule(rule1, block_f) rb2 = em.RuleBasedBlocker() rule2 = 'address_address_jac_qgm_3_qgm_3(ltuple,rtuple) < 0.9' rb2.add_rule(rule2, block_f) # In[63]: C1 = rb1.block_tables(A, B, l_output_attrs=[ 'ID', 'name', 'address', 'ratingValue', 'price_range', 'number_of_reviews' ],
def test_rb_block_candset_no_rules(self):
    """Call block_candset with a blocker that has had no rules added."""
    first_blocker = em.RuleBasedBlocker()
    first_blocker.add_rule(rule_1, self.feature_table)
    candset = first_blocker.block_tables(self.A, self.B, show_progress=False)
    # NOTE(review): self.rb has no rules here, so this call presumably
    # raises — confirm the expected-failure decorator on this test.
    self.rb.block_candset(candset)
'Directed By', 'Written By', 'Studio' ], r_output_attrs=[ 'Title', 'Genre', 'Score', 'Release Date', 'Rating', 'Directed By', 'Written By', 'Studio' ]) # Combine the outputs from attr. equivalence blocker and overlap blocker # union because if there is an error in the release date, at least the movies should have their names in common D = em.combine_blocker_outputs_via_union([C1, C2, C3]) # Rule based blocker after D block_f = em.get_features_for_blocking(A, B, validate_inferred_attr_types=False) rb = em.RuleBasedBlocker() # print(block_f) rb.add_rule(['Title_Title_lev_sim(ltuple, rtuple) < 0.4'], block_f) C = rb.block_candset(D, show_progress=False) print('Candidate Match set C Size: ', len(C)) print('Finish Blocking stage') ################################## Matcher Portion ################################## # Open up out labeled data from the last Project. path_G = '../data/G.csv' G = em.read_csv_metadata(path_G, key='_id', ltable=A, rtable=B, fk_ltable='ltable_ID', fk_rtable='rtable_ID')
def workflow(path_A, path_B, path_labeled):
    """End-to-end matching workflow: block the two tables, train a
    random-forest matcher on labeled data, and return predicted matches.

    Args:
        path_A: CSV path for the left table (key column 'ID').
        path_B: CSV path for the right table (key column 'ID').
        path_labeled: CSV path for labeled pairs (key '_id', label in 'gold').

    Returns:
        DataFrame with columns ['_id', 'ltable_ID', 'rtable_ID', 'predicted']
        restricted to rows the matcher predicted as matches (predicted == 1).
    """
    # Load csv files as dataframes and set the key attribute in each.
    A = em.read_csv_metadata(path_A, key='ID')
    B = em.read_csv_metadata(path_B, key='ID')

    output_attrs = ['Name', 'Price', 'Brand', 'Screen Size', 'RAM',
                    'Hard Drive Capacity', 'Processor Type',
                    'Processor Speed', 'Operating System', 'Clean Name']

    # Stage 1: attribute-equivalence blocking on Brand.
    ab = em.AttrEquivalenceBlocker()
    C1 = ab.block_tables(A, B, 'Brand', 'Brand',
                         l_output_attrs=list(output_attrs),
                         r_output_attrs=list(output_attrs))

    # Stage 2: rule-based blocking on 3-gram Jaccard of the Clean Name column.
    block_f = em.get_features_for_blocking(
        A, B, validate_inferred_attr_types=False)
    rb = em.RuleBasedBlocker()
    rb.add_rule(
        ['Clean_Name_Clean_Name_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.2'],
        block_f)
    C2 = rb.block_candset(C1)

    # Stage 3: black-box blocking comparing screen size, RAM and hard drive
    # capacity.
    bb_screen = em.BlackBoxBlocker()
    bb_screen.set_black_box_function(screen_ram_hd_equal)
    C = bb_screen.block_candset(C2)

    # Load the labeled development data.
    L = em.read_csv_metadata(path_labeled, key='_id', ltable=A, rtable=B,
                             fk_ltable='ltable_ID', fk_rtable='rtable_ID')

    # Build the matching feature set: a hand-picked subset of the generated
    # features plus a custom black-box 'refurbished' feature.
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)
    feature_subset = feature_table.iloc[
        np.r_[4:10, 40:len(feature_table)], :]
    em.add_blackbox_feature(feature_subset, 'refurbished', refurbished)

    # Feature vectors for the labeled data; impute missing values with the
    # column mean before training.
    feature_vectors_dev = em.extract_feature_vecs(
        L, feature_table=feature_subset, attrs_after='gold')
    feature_vectors_dev = em.impute_table(
        feature_vectors_dev,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'],
        strategy='mean')

    # Train a random-forest matcher on the labeled feature vectors.
    matcher = em.RFMatcher(name='RF')
    matcher.fit(table=feature_vectors_dev,
                exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'],
                target_attr='gold')

    # Feature vectors for the full candidate set, imputed the same way.
    feature_vectors = em.extract_feature_vecs(C, feature_table=feature_subset)
    feature_vectors = em.impute_table(
        feature_vectors,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
        strategy='mean')

    # Predict over the whole candidate set and keep only predicted matches.
    predictions = matcher.predict(
        table=feature_vectors,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
        append=True, target_attr='predicted', inplace=False)
    predictions = predictions.loc[
        :, ['_id', 'ltable_ID', 'rtable_ID', 'predicted']]
    return predictions[predictions['predicted'] == 1]
def time_block_tables(self):
    """Benchmark rule-based blocking of A and B on Title 3-gram Jaccard."""
    blocker = mg.RuleBasedBlocker()
    blocker.add_rule(
        ['Title_Title_jac_qgm_3_qgm_3(ltuple,rtuple) < 0.4'],
        self.feature_table)
    blocker.block_tables(self.A, self.B, ['Title'], ['Title'])
def time_block_tables_features_jac(self):
    """Benchmark rule-based blocking on Features 3-gram Jaccard."""
    blocker = mg.RuleBasedBlocker()
    blocker.add_rule(
        ['Features_Features_jac_qgm_3_qgm_3(ltuple,rtuple) < 0.6'],
        self.feature_table)
    blocker.block_tables(self.A, self.B, ['Features'], ['Features'])
def time_block_tables_name_cos(self):
    """Benchmark rule-based blocking on Name token cosine similarity."""
    blocker = mg.RuleBasedBlocker()
    blocker.add_rule(
        ['Name_Name_cos_dlm_dc0_dlm_dc0(ltuple,rtuple) < 0.3'],
        self.feature_table)
    blocker.block_tables(self.A, self.B, ['Name'], ['Name'])