def test_build_id_to_index_map_2(self):
    """Building the id->index map over a table whose key column has duplicates."""
    frame = pd.DataFrame([['a1', 'hello'], ['a1', 'world']],
                         columns=['ID', 'title'])
    em.set_key(frame, 'ID')
    db._build_id_to_index_map(frame, 'ID')
def setUp(self):
    """Load tables A and B, register their 'ID' keys, and prepare a
    blocking feature table plus a rule-based blocker for each test."""
    self.A = em.read_csv_metadata(path_for_A)
    em.set_key(self.A, 'ID')
    self.B = em.read_csv_metadata(path_for_B)
    em.set_key(self.B, 'ID')
    self.feature_table = em.get_features_for_blocking(
        self.A, self.B, validate_inferred_attr_types=False)
    self.rb = em.RuleBasedBlocker()
def test_bb_block_tables_empty_rtable(self):
    """Black-box blocking against an empty right table yields a valid, empty candset."""
    empty_B = pd.DataFrame(columns=self.B.columns)
    em.set_key(empty_B, 'ID')
    self.bb.set_black_box_function(_block_fn)
    C = self.bb.block_tables(self.A, empty_B)
    validate_metadata(C)
    validate_data(C)
def test_ob_block_tables_empty_rtable(self):
    """Overlap blocking against an empty right table yields a valid, empty candset."""
    empty_B = pd.DataFrame(columns=self.B.columns)
    em.set_key(empty_B, 'ID')
    C = self.ob.block_tables(self.A, empty_B,
                             l_overlap_attr_1, r_overlap_attr_1)
    validate_metadata(C)
    validate_data(C)
def setup(self):
    """Benchmark fixture for the 'bikes' dataset: block on city_posted, then
    narrow the candset on model_year, and configure the black-box blocker.

    Fix: dropped the unused ``p = mg.get_install_path()`` assignment.
    """
    path_for_A = os.sep.join([datasets_path, 'bikes', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'bikes', 'B.csv'])
    l_output_attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
                      'color', 'model_year']
    r_output_attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
                      'color', 'model_year']
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'id')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'id')
        C = ab.block_tables(A, B, 'city_posted', 'city_posted',
                            l_output_attrs, r_output_attrs)
        self.D = ab.block_candset(C, 'model_year', 'model_year')
        bb.set_black_box_function(_bikes_function)
    except AssertionError:
        # read_csv_metadata asserts when the file is absent.
        print("Dataset 'bikes' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit
def setup(self):
    """Benchmark fixture for the 'music' dataset: overlap-block on Album_Name
    (stop words removed), then narrow the candset on Artist_Name."""
    path_for_A = os.sep.join([datasets_path, 'music', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'music', 'B.csv'])
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'Sno')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'Sno')
        l_output_attrs = ['Album_Name', 'Artist_Name', 'CopyRight',
                          'Released', 'Song_Name', 'Time']
        r_output_attrs = ['Album_Name', 'Artist_Name', 'Copyright',
                          'Released', 'Song_Name', 'Time']
        C = ob.block_tables(A, B, 'Album_Name', 'Album_Name',
                            rem_stop_words=True,
                            l_output_attrs=l_output_attrs,
                            r_output_attrs=r_output_attrs)
        self.D = ob.block_candset(C, 'Artist_Name', 'Artist_Name',
                                  rem_stop_words=True)
    except AssertionError:
        print("Dataset 'music' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit
def run_quora_blocking(sampler="iterative", lsh_args=None, sequential_args=None):
    """LSH-block a 1500-row sample of the Quora training split.

    Parameters
    ----------
    sampler : str
        "iterative" or "naive" (completely random) training-set sampler.
    lsh_args : dict
        Must provide "char_ngram", "seeds" and "bands" for ``lsh_blocking``.
    sequential_args : dict, optional
        Currently unused; kept for interface compatibility.

    Raises
    ------
    ValueError
        If the sampler is unknown or ``lsh_args`` is missing.
    NotImplementedError
        Always, after printing the candidate count — the remainder of the
        pipeline is not implemented yet.  (Previously the function mistakenly
        *returned* the NotImplementedError class instead of raising it, and
        compared the sampler with bitwise ``&`` instead of boolean logic.)
    """
    if sampler not in ("iterative", "naive"):
        raise ValueError("Sampler should be iterative or naive (completely random).")
    if lsh_args is None:
        # Subscripting None below would raise an opaque TypeError.
        raise ValueError("lsh_args must provide 'char_ngram', 'seeds' and 'bands'.")

    # Load the training split produced by the chosen sampler.
    em.del_catalog()
    lhs_table = (em.read_csv_metadata("../data/processed_quora/quora_" + sampler + "_X_train_lhs.csv")
                 .rename(columns={"Unnamed: 0": "id_lhs"})
                 .sample(n=1500, random_state=52))
    rhs_table = (em.read_csv_metadata("../data/processed_quora/quora_" + sampler + "_X_train_rhs.csv")
                 .rename(columns={"Unnamed: 0": "id_rhs"})
                 .sample(n=1500, random_state=52))
    # Labels are loaded for parity with the rest of the pipeline even though
    # blocking itself does not use them.
    y_train = pd.read_csv("../data/processed_quora/quora_" + sampler + "_y_train.csv")
    em.del_catalog()
    em.set_key(lhs_table, "id_lhs")
    em.set_key(rhs_table, "id_rhs")

    # Blocking: column index 1 holds the question text used for LSH.
    print("Blocking Train Set of Quora using LSH only.")
    candidates = lsh_blocking(lhs_table, rhs_table, 1, 2, ["qid1", "qid2"],
                              char_ngram=lsh_args["char_ngram"],
                              seeds=lsh_args["seeds"],
                              bands=lsh_args["bands"])
    print(f"Generated Candidate size has {candidates.shape[0]} rows")
    raise NotImplementedError
def setup(self):
    """Benchmark fixture for the 'bikes' dataset: build candset C by
    attribute-equivalence blocking on city_posted; later stages block
    the candset on model_year."""
    path_for_A = os.sep.join([datasets_path, 'bikes', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'bikes', 'B.csv'])
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'id')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'id')
        l_output_attrs = ['bike_name', 'city_posted', 'km_driven',
                          'price', 'color', 'model_year']
        r_output_attrs = ['bike_name', 'city_posted', 'km_driven',
                          'price', 'color', 'model_year']
        self.C = ab.block_tables(A, B, 'city_posted', 'city_posted',
                                 l_output_attrs, r_output_attrs)
    except AssertionError:
        print("Dataset 'bikes' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit
    self.l_block_attr = 'model_year'
    self.r_block_attr = 'model_year'
def test_ab_block_tuples_wi_missing_values_disallow_missing(self):
    """Tuple-level blocking when attributes contain missing values and
    missing values are not allowed to survive blocking."""
    path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_A_wi_missing_vals.csv'])
    path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_B_wi_missing_vals.csv'])
    A = em.read_csv_metadata(path_a)
    em.set_key(A, 'ID')
    B = em.read_csv_metadata(path_b)
    em.set_key(B, 'ID')
    # (left row, right row) -> expected block decision
    cases = [((0, 0), True), ((1, 2), False), ((2, 1), True),
             ((0, 1), True), ((2, 2), True)]
    for (i, j), expected in cases:
        assert_equal(
            self.ab.block_tuples(A.loc[i], B.loc[j],
                                 l_block_attr_1, r_block_attr_1),
            expected)
def test_build_id_to_index_map_2(self):
    """Exercise _build_id_to_index_map when the key column is not unique."""
    records = [['a1', 'hello'], ['a1', 'world']]
    dataframe = pd.DataFrame(records, columns=['ID', 'title'])
    em.set_key(dataframe, 'ID')
    db._build_id_to_index_map(dataframe, 'ID')
def test_ob_block_tables_empty_rtable(self):
    """An empty right table should still produce a well-formed (empty) candset."""
    empty_B = pd.DataFrame(columns=self.B.columns)
    em.set_key(empty_B, 'ID')
    C = self.ob.block_tables(self.A, empty_B,
                             l_overlap_attr_1, r_overlap_attr_1)
    validate_metadata(C)
    validate_data(C)
def test_ob_block_candset_wi_missing_vals_disallow_missing(self):
    """Block a candset built over tables with missing values.

    NOTE(review): block_tables is called with allow_missing=True even though
    the test name says "disallow" — confirm the intended fixture.
    """
    path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_A_wi_missing_vals.csv'])
    path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_B_wi_missing_vals.csv'])
    A = em.read_csv_metadata(path_a)
    em.set_key(A, 'ID')
    B = em.read_csv_metadata(path_b)
    em.set_key(B, 'ID')
    C = self.ob.block_tables(A, B, l_overlap_attr_1, r_overlap_attr_1,
                             allow_missing=True)
    validate_metadata(C)
    validate_data(C, expected_ids_4)
    D = self.ob.block_candset(C, l_overlap_attr_2, r_overlap_attr_2,
                              rem_stop_words=True, overlap_size=4)
    validate_metadata_two_candsets(C, D)
    validate_data(D, expected_ids_2)
def setUp(self):
    """Load tables A and B with 'ID' keys, infer blocking features, and
    create a fresh rule-based blocker for each test."""
    self.A = em.read_csv_metadata(path_for_A)
    em.set_key(self.A, 'ID')
    self.B = em.read_csv_metadata(path_for_B)
    em.set_key(self.B, 'ID')
    self.feature_table = em.get_features_for_blocking(self.A, self.B)
    self.rb = em.RuleBasedBlocker()
def test_ob_block_tables_empty_ltable(self):
    """An empty left table should produce an empty but valid candset.

    Fix: removed a leftover debug ``print(empty_A.dtypes)``.
    """
    empty_A = pd.DataFrame(columns=self.A.columns)
    em.set_key(empty_A, 'ID')
    C = self.ob.block_tables(empty_A, self.B,
                             l_overlap_attr_1, r_overlap_attr_1)
    validate_metadata(C)
    validate_data(C)
def _get_filtered_table(ltable, rtable, lkey, rkey, corres_list):
    """Project each table down to the columns named in corres_list and
    re-register the key on each projection.

    corres_list holds (left_column, right_column) pairs.
    """
    left_cols = [left for left, _ in corres_list]
    right_cols = [right for _, right in corres_list]
    lfiltered_table = ltable[left_cols]
    rfiltered_table = rtable[right_cols]
    em.set_key(lfiltered_table, lkey)
    em.set_key(rfiltered_table, rkey)
    return lfiltered_table, rfiltered_table
def test_ob_block_tables_empty_ltable(self):
    """Blocking with an empty left table yields a valid, empty candset.

    Fix: dropped the stray debug statement ``print(empty_A.dtypes)``.
    """
    empty_A = pd.DataFrame(columns=self.A.columns)
    em.set_key(empty_A, 'ID')
    C = self.ob.block_tables(empty_A, self.B,
                             l_overlap_attr_1, r_overlap_attr_1)
    validate_metadata(C)
    validate_data(C)
def _get_filtered_table(ltable, rtable, lkey, rkey, corres_list):
    """Return (left, right) projections of the input tables restricted to the
    corresponding columns, with keys re-registered on the projections."""
    lcolumns = [pair[0] for pair in corres_list]
    rcolumns = [pair[1] for pair in corres_list]
    left_view = ltable[lcolumns]
    right_view = rtable[rcolumns]
    em.set_key(left_view, lkey)
    em.set_key(right_view, rkey)
    return left_view, right_view
def setUp(self):
    """Load A, B and the labeled candset C, build matching features, and
    create a BooleanRuleMatcher for each test."""
    self.A = em.read_csv_metadata(path_for_A)
    em.set_key(self.A, 'ID')
    self.B = em.read_csv_metadata(path_for_B)
    em.set_key(self.B, 'ID')
    self.C = em.read_csv_metadata(path_for_C, ltable=self.A, rtable=self.B)
    self.feature_table = em.get_features_for_matching(
        self.A, self.B, validate_inferred_attr_types=False)
    self.brm = em.BooleanRuleMatcher()
def test_build_col_name_index_dict_2(self):
    """The column-name -> position map must follow DataFrame column order."""
    A = pd.DataFrame([[0, 'A', 0.11, 'ASDF']],
                     columns=['ID', 'name', 'price', 'desc'])
    em.set_key(A, 'ID')
    col_index = db._build_col_name_index_dict(A)
    for position, column in enumerate(['ID', 'name', 'price', 'desc']):
        self.assertEqual(col_index[column], position)
def test_build_col_name_index_dict_2(self):
    """Each column name should map to its zero-based position."""
    table = pd.DataFrame([[0, 'A', 0.11, 'ASDF']])
    table.columns = ['ID', 'name', 'price', 'desc']
    em.set_key(table, 'ID')
    col_index = db._build_col_name_index_dict(table)
    expected = {'ID': 0, 'name': 1, 'price': 2, 'desc': 3}
    for column, position in expected.items():
        self.assertEqual(col_index[column], position)
def read_pair_csv_with_metadata(tuples, filename, key, dtype=None):
    """Read a pair CSV and register its catalog metadata against *tuples*.

    Parameters
    ----------
    tuples : DataFrame
        The base table both sides of each pair refer to.
    filename : str
        Path of the CSV holding the pairs.
    key : str
        Key column of the pair table.
    dtype : dict, optional
        Forwarded to ``pd.read_csv``.  Fix: the previous default was the
        mutable literal ``{}`` (shared across calls); ``None`` is equivalent
        for ``read_csv`` and safe.

    Returns
    -------
    DataFrame with key, ltable/rtable and foreign-key metadata set.
    """
    # Way to inject dtype parameter into native? For now, do it "by hand"
    pairs = pd.read_csv(filename, dtype=dtype)
    em.set_key(pairs, key)
    em.set_ltable(pairs, tuples)
    em.set_fk_ltable(pairs, f'ltable_{em.get_key(tuples)}')
    em.set_rtable(pairs, tuples)
    em.set_fk_rtable(pairs, f'rtable_{em.get_key(tuples)}')
    return pairs
def setUp(self):
    """Load A, B and candset C, build matching features, attach constant
    trigger-label columns, and create a MatchTrigger."""
    self.A = em.read_csv_metadata(path_for_A)
    em.set_key(self.A, 'ID')
    self.B = em.read_csv_metadata(path_for_B)
    em.set_key(self.B, 'ID')
    self.C = em.read_csv_metadata(path_for_C, ltable=self.A, rtable=self.B)
    self.feature_table = em.get_features_for_matching(
        self.A, self.B, validate_inferred_attr_types=False)
    # One label per candset row (15 rows): all-match / all-non-match.
    self.C['neg_trig_labels'] = [1] * 15
    self.C['pos_trig_labels'] = [0] * 15
    self.mt = em.MatchTrigger()
def test_ob_block_tables_wi_missing_values_disallow_missing(self):
    """Overlap blocking over tables with missing values; missing rows are
    dropped by default (allow_missing not set)."""
    path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_A_wi_missing_vals.csv'])
    path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_B_wi_missing_vals.csv'])
    A = em.read_csv_metadata(path_a)
    em.set_key(A, 'ID')
    B = em.read_csv_metadata(path_b)
    em.set_key(B, 'ID')
    C = self.ob.block_tables(A, B, l_overlap_attr_1, r_overlap_attr_1)
    validate_metadata(C)
    validate_data(C, expected_ids_1)
def setup(self):
    """Benchmark fixture: load the 'citations' dataset tables A and B.

    Fix: the failure message previously referred to the 'anime' dataset.
    """
    path_for_A = os.sep.join([datasets_path, 'citations', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'citations', 'B.csv'])
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
    except AssertionError:
        print("Dataset 'citations' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit
def setup(self):
    """Benchmark fixture: load the 'electronics' tables and infer blocking features."""
    path_for_A = os.sep.join([datasets_path, 'electronics', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'electronics', 'B.csv'])
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
        self.feature_table = mg.get_features_for_blocking(self.A, self.B)
    except AssertionError:
        print("Dataset 'electronics' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit
def setup(self):
    """Benchmark fixture: load the 'anime' tables and infer blocking features."""
    path_for_A = os.sep.join([datasets_path, 'anime', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'anime', 'B.csv'])
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
        self.feature_table = mg.get_features_for_blocking(self.A, self.B)
    except AssertionError:
        print("Dataset 'anime' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit
def setup(self):
    """Benchmark fixture: load the 'beer' tables (keyed on 'Label') and
    configure the black-box blocking function."""
    path_for_A = os.sep.join([datasets_path, 'beer', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'beer', 'B.csv'])
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'Label')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'Label')
        bb.set_black_box_function(_beer_function)
    except AssertionError:
        print("Dataset 'beer' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit
def setup(self):
    """Benchmark fixture: load the 'restaurants' tables and record the
    output attributes used by later blocking calls."""
    path_for_A = os.sep.join([datasets_path, 'restaurants', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'restaurants', 'B.csv'])
    self.l_output_attrs = ['NAME', 'PHONENUMBER', 'ADDRESS']
    self.r_output_attrs = ['NAME', 'PHONENUMBER', 'ADDRESS']
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
    except AssertionError:
        print("Dataset 'restaurants' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit
def test_get_filtered_table(self):
    """After filtering the correspondence list, the projected tables should
    keep only the surviving columns ('ID' and 'name')."""
    A = pd.DataFrame([['a1', 'A', 0.11, 53704]],
                     columns=['ID', 'name', 'price', 'zip code'])
    em.set_key(A, 'ID')
    B = pd.DataFrame([['b1', 'A', 0.11, 54321]],
                     columns=['ID', 'name', 'price', 'zip code'])
    em.set_key(B, 'ID')
    A_key = 'ID'
    B_key = 'ID'
    ltable_col_dict = db._build_col_name_index_dict(A)
    rtable_col_dict = db._build_col_name_index_dict(B)
    attr_corres = [('ID', 'ID'), ('name', 'name'),
                   ('price', 'price'), ('zip code', 'zip code')]
    db._filter_corres_list(A, B, A_key, B_key,
                           ltable_col_dict, rtable_col_dict, attr_corres)
    filtered_A, filtered_B = db._get_filtered_table(A, B, attr_corres)

    expected_A = pd.DataFrame([['a1', 'A']], columns=['ID', 'name'])
    em.set_key(expected_A, 'ID')
    expected_B = pd.DataFrame([['b1', 'A']], columns=['ID', 'name'])
    em.set_key(expected_B, 'ID')
    self.assertEqual(expected_A.equals(filtered_A), True)
    self.assertEqual(expected_B.equals(filtered_B), True)
def test_get_filtered_table(self):
    """Filtering then projecting should drop numeric-only columns, leaving
    just the key and 'name' in both tables."""
    A = pd.DataFrame([['a1', 'A', 0.11, 53704]])
    A.columns = ['ID', 'name', 'price', 'zip code']
    em.set_key(A, 'ID')
    B = pd.DataFrame([['b1', 'A', 0.11, 54321]])
    B.columns = ['ID', 'name', 'price', 'zip code']
    em.set_key(B, 'ID')
    A_key, B_key = 'ID', 'ID'
    lcol_dict = db._build_col_name_index_dict(A)
    rcol_dict = db._build_col_name_index_dict(B)
    attr_corres = [('ID', 'ID'), ('name', 'name'),
                   ('price', 'price'), ('zip code', 'zip code')]
    db._filter_corres_list(A, B, A_key, B_key,
                           lcol_dict, rcol_dict, attr_corres)
    filtered_A, filtered_B = db._get_filtered_table(A, B, attr_corres)

    want_A = pd.DataFrame([['a1', 'A']])
    want_A.columns = ['ID', 'name']
    em.set_key(want_A, 'ID')
    want_B = pd.DataFrame([['b1', 'A']])
    want_B.columns = ['ID', 'name']
    em.set_key(want_B, 'ID')
    self.assertEqual(want_A.equals(filtered_A), True)
    self.assertEqual(want_B.equals(filtered_B), True)
def test_index_candidate_set_6(self):
    """An empty candidate set indexes to an empty dict."""
    A = pd.DataFrame([[1, 'asdf', 'fdas'], [2, 'fdsa', 'asdf']],
                     columns=['ID', 'f1', 'f2'])
    em.set_key(A, 'ID')
    B = pd.DataFrame([['B002', 'qqqq', 'wwww'], ['B003', 'rrrr', 'fdsa']],
                     columns=['ID', 'f1', 'f2'])
    em.set_key(B, 'ID')
    lmap = db._build_id_to_index_map(A, 'ID')
    rmap = db._build_id_to_index_map(B, 'ID')
    new_C = db._index_candidate_set(pd.DataFrame(), lmap, rmap, False)
    self.assertEqual(new_C, {})
def setUp(self):
    """Load A, B and candset C, infer matching features, add constant
    trigger-label columns (15 candset rows) and create a MatchTrigger."""
    self.A = em.read_csv_metadata(path_for_A)
    em.set_key(self.A, 'ID')
    self.B = em.read_csv_metadata(path_for_B)
    em.set_key(self.B, 'ID')
    self.C = em.read_csv_metadata(path_for_C, ltable=self.A, rtable=self.B)
    self.feature_table = em.get_features_for_matching(
        self.A, self.B, validate_inferred_attr_types=False)
    self.C['neg_trig_labels'] = [1] * 15
    self.C['pos_trig_labels'] = [0] * 15
    self.mt = em.MatchTrigger()
def test_ab_block_candset_wi_missing_values_disallow_missing(self):
    """Two-stage attribute-equivalence blocking over tables with missing
    values; only the (a5, b5) pair should survive the second stage."""
    path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_A_wi_missing_vals.csv'])
    path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_B_wi_missing_vals.csv'])
    A = em.read_csv_metadata(path_a)
    em.set_key(A, 'ID')
    B = em.read_csv_metadata(path_b)
    em.set_key(B, 'ID')
    C = self.ab.block_tables(A, B, l_block_attr_1, r_block_attr_1)
    validate_metadata(C)
    validate_data(C, expected_ids_4)
    D = self.ab.block_candset(C, l_block_attr_2, r_block_attr_2)
    validate_metadata_two_candsets(C, D)
    validate_data(D, [('a5', 'b5')])
def test_ob_block_tuples_wi_missing_vals_disallow_missing(self):
    """Tuple pairs with missing overlap values are blocked when missing
    values are disallowed.

    Fix: ``.ix`` was deprecated and removed from pandas; ``.loc`` is the
    label-based equivalent for these default integer indexes.
    """
    path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_A_wi_missing_vals.csv'])
    path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_B_wi_missing_vals.csv'])
    A = em.read_csv_metadata(path_a)
    em.set_key(A, 'ID')
    B = em.read_csv_metadata(path_b)
    em.set_key(B, 'ID')
    assert_equal(self.ob.block_tuples(A.loc[1], B.loc[3],
                                      l_overlap_attr_1, r_overlap_attr_1),
                 True)
    assert_equal(self.ob.block_tuples(A.loc[3], B.loc[2],
                                      l_overlap_attr_1, r_overlap_attr_1),
                 True)
    assert_equal(self.ob.block_tuples(A.loc[3], B.loc[3],
                                      l_overlap_attr_1, r_overlap_attr_1),
                 True)
def test_index_candidate_set_6(self):
    """Indexing an empty candset must return an empty mapping."""
    left_rows = [[1, 'asdf', 'fdas'], [2, 'fdsa', 'asdf']]
    right_rows = [['B002', 'qqqq', 'wwww'], ['B003', 'rrrr', 'fdsa']]
    A = pd.DataFrame(left_rows)
    A.columns = ['ID', 'f1', 'f2']
    em.set_key(A, 'ID')
    B = pd.DataFrame(right_rows)
    B.columns = ['ID', 'f1', 'f2']
    em.set_key(B, 'ID')
    empty_candset = pd.DataFrame()
    lrecord_id_to_index_map = db._build_id_to_index_map(A, 'ID')
    rrecord_id_to_index_map = db._build_id_to_index_map(B, 'ID')
    indexed = db._index_candidate_set(empty_candset,
                                      lrecord_id_to_index_map,
                                      rrecord_id_to_index_map, False)
    self.assertEqual(indexed, {})
def test_ab_block_tables_wi_missing_values_allow_missing(self):
    """Attribute-equivalence blocking with allow_missing=True keeps pairs
    whose blocking attribute is missing (expected_ids_3)."""
    path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_A_wi_missing_vals.csv'])
    path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_B_wi_missing_vals.csv'])
    A = em.read_csv_metadata(path_a)
    em.set_key(A, 'ID')
    B = em.read_csv_metadata(path_b)
    em.set_key(B, 'ID')
    # Final positional True enables allow_missing.
    C = self.ab.block_tables(A, B, l_block_attr_1, r_block_attr_1,
                             l_output_attrs, r_output_attrs,
                             l_output_prefix, r_output_prefix, True)
    validate_metadata(C, l_output_attrs, r_output_attrs,
                      l_output_prefix, r_output_prefix)
    validate_data(C, expected_ids_3)
def test_get_tokenized_table_3(self):
    """Tokenize selected feature columns; repeated tokens within a record
    get _N suffixes (abc, abc_1, abc_2, ...).

    Fix: ``pd.np.nan`` — the ``pandas.np`` alias was removed in pandas 1.0 —
    replaced with ``float('nan')``.
    """
    table = [[1, 'abc abc asdf', '123-3456-7890', float('nan'), '',
              '135 east abc st'],
             [2, 'aaa bbb', '000-111-2222', '', '', '246 west abc st'],
             [3, 'cc dd', '123-123-1231', 'cc', 'unknown',
              ' 246 west def st']]
    dataframe = pd.DataFrame(table)
    dataframe.columns = ['ID', 'name', 'phone', 'department', 'school',
                         'address']
    key = 'ID'
    em.set_key(dataframe, key)
    # Tokenize columns name, department, school, address (by position).
    feature_list = [1, 3, 4, 5]
    actual_record_list = db._get_tokenized_table(dataframe, key, feature_list)
    expected_record_list = [
        ['abc', 'abc_1', 'asdf', '135', 'east', 'abc_2', 'st'],
        ['aaa', 'bbb', '246', 'west', 'abc', 'st'],
        ['cc', 'dd', 'cc_1', 'unknown', '246', 'west', 'def', 'st']]
    self.assertEqual(actual_record_list, expected_record_list)
def setup(self):
    """Benchmark fixture: 'anime' tables with Year as the blocking attribute
    and Title/Year/Episodes as output attributes."""
    path_for_A = os.sep.join([datasets_path, 'anime', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'anime', 'B.csv'])
    self.l_block_attr = 'Year'
    self.r_block_attr = 'Year'
    self.l_output_attrs = ['Title', 'Year', 'Episodes']
    self.r_output_attrs = ['Title', 'Year', 'Episodes']
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
    except AssertionError:
        print("Dataset 'anime' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit
def setup(self):
    """Benchmark fixture: 'restaurants' tables blocked on PHONENUMBER."""
    path_for_A = os.sep.join([datasets_path, 'restaurants', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'restaurants', 'B.csv'])
    self.l_block_attr = 'PHONENUMBER'
    self.r_block_attr = 'PHONENUMBER'
    self.l_output_attrs = ['NAME', 'PHONENUMBER', 'ADDRESS']
    self.r_output_attrs = ['NAME', 'PHONENUMBER', 'ADDRESS']
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
    except AssertionError:
        print("Dataset 'restaurants' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit
def setup(self):
    """Benchmark fixture: 'electronics' tables blocked on Brand; note the
    right table's price column is named 'Price', the left 'Amazon_Price'."""
    path_for_A = os.sep.join([datasets_path, 'electronics', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'electronics', 'B.csv'])
    self.l_block_attr = 'Brand'
    self.r_block_attr = 'Brand'
    self.l_output_attrs = ['Brand', 'Amazon_Price']
    self.r_output_attrs = ['Brand', 'Price']
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
    except AssertionError:
        print("Dataset 'electronics' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit
def setup(self):
    """Benchmark fixture: load 'anime' A/B keyed on 'ID'; block on Year."""
    path_for_A = os.sep.join([datasets_path, 'anime', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'anime', 'B.csv'])
    self.l_block_attr = self.r_block_attr = 'Year'
    self.l_output_attrs = ['Title', 'Year', 'Episodes']
    self.r_output_attrs = ['Title', 'Year', 'Episodes']
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
    except AssertionError:
        print("Dataset 'anime' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit
def test_filter_corres_list_1(self):
    """Filtering a correspondence list over two all-numeric tables.

    Fix: the second ``em.set_key`` call targeted A twice; B's key was never
    registered. It now sets the key on B.
    """
    A = pd.DataFrame([[0, 20, 0.11, 4576]],
                     columns=['ID', 'age', 'price', 'zip code'])
    em.set_key(A, 'ID')
    B = pd.DataFrame([[0, 240, 0.311, 4474]],
                     columns=['ID', 'age', 'price', 'zip code'])
    em.set_key(B, 'ID')
    A_key = 'ID'
    B_key = 'ID'
    ltable_col_dict = db._build_col_name_index_dict(A)
    rtable_col_dict = db._build_col_name_index_dict(B)
    attr_corres = [('ID', 'ID'), ('age', 'age'),
                   ('price', 'price'), ('zip code', 'zip code')]
    db._filter_corres_list(A, B, A_key, B_key,
                           ltable_col_dict, rtable_col_dict, attr_corres)
def main():
    """Block A.csv/B.csv with the black-box blocker and write both the
    candset and its debug-blocker output to disk.

    If A.csv / B.csv do not exist yet, regenerate them first (the original
    workflow used a clean_up() step for that).
    """
    # Read the input tables and register their keys.
    A = pd.read_csv(Directory + 'A.csv')
    B = pd.read_csv(Directory + 'B.csv')
    em.set_key(A, 'id')
    em.set_key(B, 'id')

    # Black-box blocking, then persist the candset.
    C = black_box_blocker(A, B)
    C.to_csv('C.csv', index=False)

    # Debug the blocker output and persist the result.
    dbq = blocker_debugging(C, A, B)
    dbq.to_csv('debugged_result.csv')
def test_ob_block_tables_wi_missing_values_disallow_missing(self):
    """Default overlap blocking (missing values disallowed) over tables
    containing missing values; result must match expected_ids_1."""
    blocker_dir = [p, 'tests', 'test_datasets', 'blocker']
    path_a = os.sep.join(blocker_dir + ['table_A_wi_missing_vals.csv'])
    path_b = os.sep.join(blocker_dir + ['table_B_wi_missing_vals.csv'])
    A = em.read_csv_metadata(path_a)
    em.set_key(A, 'ID')
    B = em.read_csv_metadata(path_b)
    em.set_key(B, 'ID')
    C = self.ob.block_tables(A, B, l_overlap_attr_1, r_overlap_attr_1)
    validate_metadata(C)
    validate_data(C, expected_ids_1)
def test_get_field_correspondence_list_5(self):
    """User-supplied correspondences are preserved and the key pair is
    appended at the end."""
    A = pd.DataFrame([[0, 'A', 0.11, 'ASDF']],
                     columns=['ID', 'name', 'price', 'desc'])
    em.set_key(A, 'ID')
    A_key = em.get_key(A)
    B = pd.DataFrame([['B', 'B001', 'ASDF', 0.111]],
                     columns=['item_name', 'item_id', 'item_desc',
                              'item_price'])
    em.set_key(B, 'item_id')
    B_key = em.get_key(B)
    attr_corres = [('name', 'item_name'), ('price', 'item_price')]
    actual_attr_corres = db._get_field_correspondence_list(
        A, B, A_key, B_key, attr_corres)
    expected_attr_corres = [('name', 'item_name'),
                            ('price', 'item_price'),
                            ('ID', 'item_id')]
    self.assertEqual(expected_attr_corres, actual_attr_corres)
def test_filter_corres_list_1(self):
    """Exercise _filter_corres_list over two numeric tables.

    Fix: ``em.set_key`` was called on A twice — B never had its key
    registered. The second call now sets the key on B.
    """
    A = pd.DataFrame([[0, 20, 0.11, 4576]])
    A.columns = ['ID', 'age', 'price', 'zip code']
    em.set_key(A, 'ID')
    B = pd.DataFrame([[0, 240, 0.311, 4474]])
    B.columns = ['ID', 'age', 'price', 'zip code']
    em.set_key(B, 'ID')
    A_key = 'ID'
    B_key = 'ID'
    ltable_col_dict = db._build_col_name_index_dict(A)
    rtable_col_dict = db._build_col_name_index_dict(B)
    attr_corres = [('ID', 'ID'), ('age', 'age'),
                   ('price', 'price'), ('zip code', 'zip code')]
    db._filter_corres_list(A, B, A_key, B_key,
                           ltable_col_dict, rtable_col_dict, attr_corres)
def setup(self):
    """Benchmark fixture: 'electronics' tables, blocking on Brand.
    Output price columns differ by side ('Amazon_Price' vs 'Price')."""
    path_for_A = os.sep.join([datasets_path, 'electronics', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'electronics', 'B.csv'])
    self.l_block_attr = self.r_block_attr = 'Brand'
    self.l_output_attrs = ['Brand', 'Amazon_Price']
    self.r_output_attrs = ['Brand', 'Price']
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
    except AssertionError:
        print("Dataset 'electronics' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit
def test_ob_block_candset_wi_missing_vals_disallow_missing(self):
    """Overlap-block tables with missing values, then block the candset
    with stop-word removal and overlap_size=4.

    NOTE(review): the first stage passes allow_missing=True although the
    test name says "disallow" — verify the fixture's intent.
    """
    blocker_dir = [p, 'tests', 'test_datasets', 'blocker']
    path_a = os.sep.join(blocker_dir + ['table_A_wi_missing_vals.csv'])
    path_b = os.sep.join(blocker_dir + ['table_B_wi_missing_vals.csv'])
    A = em.read_csv_metadata(path_a)
    em.set_key(A, 'ID')
    B = em.read_csv_metadata(path_b)
    em.set_key(B, 'ID')
    C = self.ob.block_tables(A, B, l_overlap_attr_1, r_overlap_attr_1,
                             allow_missing=True)
    validate_metadata(C)
    validate_data(C, expected_ids_4)
    D = self.ob.block_candset(C, l_overlap_attr_2, r_overlap_attr_2,
                              rem_stop_words=True, overlap_size=4)
    validate_metadata_two_candsets(C, D)
    validate_data(D, expected_ids_2)
def test_debugblocker_12(self):
    """debug_blocker should cope with key-only (single-column) tables."""
    ltable = pd.DataFrame([[0]], columns=['ID'])
    rtable = pd.DataFrame([[0]], columns=['ID'])
    em.set_key(ltable, 'ID')
    em.set_key(rtable, 'ID')
    cand_set = pd.DataFrame([[0, 0, 0]],
                            columns=['_id', 'ltable_ID', 'rtable_ID'])
    cm.set_candset_properties(cand_set, '_id', 'ltable_ID', 'rtable_ID',
                              ltable, rtable)
    db.debug_blocker(cand_set, ltable, rtable)
def test_get_tokenized_table_3(self):
    """Tokenize selected columns into (token, source-column) pairs; repeated
    tokens get _N suffixes within a record.

    Fix: ``pd.np.nan`` — ``pandas.np`` was removed in pandas 1.0 — replaced
    with ``float('nan')``.
    """
    table = [[1, 'abc abc asdf', '123-3456-7890', float('nan'), '',
              '135 east abc st'],
             [2, 'aaa bbb', '000-111-2222', '', '', '246 west abc st'],
             [3, 'cc dd', '123-123-1231', 'cc', 'unknown',
              ' 246 west def st']]
    dataframe = pd.DataFrame(table)
    dataframe.columns = ['ID', 'name', 'phone', 'department', 'school',
                         'address']
    key = 'ID'
    em.set_key(dataframe, key)
    # Tokenize columns name, department, school, address (by position).
    feature_list = [1, 3, 4, 5]
    actual_record_list = db._get_tokenized_table(dataframe, key, feature_list)
    expected_record_list = [
        [('abc', 0), ('abc_1', 0), ('asdf', 0), ('135', 3), ('east', 3),
         ('abc_2', 3), ('st', 3)],
        [('aaa', 0), ('bbb', 0), ('246', 3), ('west', 3), ('abc', 3),
         ('st', 3)],
        [('cc', 0), ('dd', 0), ('cc_1', 1), ('unknown', 2), ('246', 3),
         ('west', 3), ('def', 3), ('st', 3)]]
    self.assertEqual(actual_record_list, expected_record_list)
def setup(self):
    """Benchmark fixture: 'books' tables blocked on Author."""
    path_for_A = os.sep.join([datasets_path, 'books', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'books', 'B.csv'])
    self.l_block_attr = 'Author'
    self.r_block_attr = 'Author'
    self.l_output_attrs = ['Title', 'Author', 'ISBN13', 'Publisher',
                           'Publication_Date']
    self.r_output_attrs = ['Title', 'Author', 'ISBN13', 'Publisher',
                           'Publication_Date']
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
    except AssertionError:
        print("Dataset 'books' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit
def test_get_field_correspondence_list_5(self):
    """When the user passes partial correspondences the key pair must be
    appended after them, preserving the given order."""
    A = pd.DataFrame([[0, 'A', 0.11, 'ASDF']])
    A.columns = ['ID', 'name', 'price', 'desc']
    em.set_key(A, 'ID')
    B = pd.DataFrame([['B', 'B001', 'ASDF', 0.111]])
    B.columns = ['item_name', 'item_id', 'item_desc', 'item_price']
    em.set_key(B, 'item_id')
    given_corres = [('name', 'item_name'), ('price', 'item_price')]
    actual = db._get_field_correspondence_list(
        A, B, em.get_key(A), em.get_key(B), given_corres)
    expected = [('name', 'item_name'), ('price', 'item_price'),
                ('ID', 'item_id')]
    self.assertEqual(expected, actual)
def test_index_candidate_set_5(self):
    """Index a candset that references an rtable id ('B001') absent from B."""
    A = pd.DataFrame([[1, 'asdf', 'fdas'], [2, 'fdsa', 'asdf']],
                     columns=['ID', 'f1', 'f2'])
    em.set_key(A, 'ID')
    B = pd.DataFrame([['B002', 'qqqq', 'wwww'], ['B003', 'rrrr', 'fdsa']],
                     columns=['ID', 'f1', 'f2'])
    em.set_key(B, 'ID')
    C = pd.DataFrame([[0, 1, 'B001'], [1, 2, 'B002']],
                     columns=['_id', 'ltable_ID', 'rtable_ID'])
    cm.set_candset_properties(C, '_id', 'ltable_ID', 'rtable_ID', A, B)
    lrecord_id_to_index_map = db._build_id_to_index_map(A, 'ID')
    rrecord_id_to_index_map = db._build_id_to_index_map(B, 'ID')
    db._index_candidate_set(C, lrecord_id_to_index_map,
                            rrecord_id_to_index_map, False)
def setup(self):
    """Benchmark fixture: build candset C from 'books' via sorted-neighborhood
    blocking on Author; later stages block C on ISBN13."""
    path_for_A = os.sep.join([datasets_path, 'books', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'books', 'B.csv'])
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'ID')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'ID')
        output_attrs = ['Title', 'Author', 'ISBN13', 'Publisher']
        self.C = snb.block_tables(A, B, 'Author', 'Author',
                                  list(output_attrs), list(output_attrs))
    except AssertionError:
        print("Dataset 'books' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit
    self.l_block_attr = 'ISBN13'
    self.r_block_attr = 'ISBN13'
def setup(self):
    """Benchmark fixture: overlap-block 'restaurants' on ADDRESS, then
    prepare a rule-based blocker over an address-similarity rule.

    Fix: the failure message previously referred to the 'beer' dataset.
    """
    path_for_A = os.sep.join([datasets_path, 'restaurants', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'restaurants', 'B.csv'])
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'ID')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'ID')
        ob = mg.OverlapBlocker()
        self.C = ob.block_tables(
            A, B, 'ADDRESS', 'ADDRESS', overlap_size=4,
            l_output_attrs=['NAME', 'PHONENUMBER', 'ADDRESS'],
            r_output_attrs=['NAME', 'PHONENUMBER', 'ADDRESS'])
        feature_table = mg.get_features_for_blocking(A, B)
        self.rb = mg.RuleBasedBlocker()
        self.rb.add_rule(
            ['ADDRESS_ADDRESS_jac_qgm_3_qgm_3(ltuple,rtuple) < 0.44'],
            feature_table)
    except AssertionError:
        print("Dataset 'restaurants' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit
def setup(self):
    """Benchmark fixture: overlap-block 'ebooks' on title (stop words
    removed), then prepare a rule-based blocker over a date-similarity rule.

    Fix: the failure message previously referred to the 'beer' dataset.
    """
    path_for_A = os.sep.join([datasets_path, 'ebooks', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'ebooks', 'B.csv'])
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'record_id')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'record_id')
        ob = mg.OverlapBlocker()
        self.C = ob.block_tables(
            A, B, 'title', 'title', overlap_size=2, rem_stop_words=True,
            l_output_attrs=['title', 'author', 'publisher', 'date'],
            r_output_attrs=['title', 'author', 'publisher', 'date'])
        feature_table = mg.get_features_for_blocking(A, B)
        self.rb = mg.RuleBasedBlocker()
        self.rb.add_rule(['date_date_lev_sim(ltuple, rtuple) < 0.6'],
                         feature_table)
    except AssertionError:
        print("Dataset 'ebooks' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit
def test_index_candidate_set_4(self):
    """Candset rows map to {ltable index: set of rtable indices}."""
    A = pd.DataFrame([[1, 'asdf', 'fdas'], [2, 'fdsa', 'asdf']],
                     columns=['ID', 'f1', 'f2'])
    em.set_key(A, 'ID')
    B = pd.DataFrame([['B002', 'qqqq', 'wwww'], ['B003', 'rrrr', 'fdsa']],
                     columns=['ID', 'f1', 'f2'])
    em.set_key(B, 'ID')
    C = pd.DataFrame([[0, 1, 'B003'], [1, 2, 'B002']],
                     columns=['_id', 'ltable_ID', 'rtable_ID'])
    cm.set_candset_properties(C, '_id', 'ltable_ID', 'rtable_ID', A, B)
    lrecord_id_to_index_map = db._build_id_to_index_map(A, 'ID')
    rrecord_id_to_index_map = db._build_id_to_index_map(B, 'ID')
    # (1, 'B003') -> indices (0, 1); (2, 'B002') -> indices (1, 0).
    expected_cand_set = {0: {1}, 1: {0}}
    actual_cand_set = db._index_candidate_set(
        C, lrecord_id_to_index_map, rrecord_id_to_index_map, False)
    self.assertEqual(expected_cand_set, actual_cand_set)
def setup(self):
    """Benchmark fixture: load 'bikes' A/B keyed on 'id' and record the
    blocking/output attributes for later stages.

    Fix: dropped the unused ``p = mg.get_install_path()`` assignment.
    """
    path_for_A = os.sep.join([datasets_path, 'bikes', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'bikes', 'B.csv'])
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'id')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'id')
    except AssertionError:
        print("Dataset 'bikes' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit
    self.l_block_attr = 'city_posted'
    self.r_block_attr = 'city_posted'
    self.l_output_attrs = ['bike_name', 'city_posted', 'km_driven',
                           'price', 'color', 'model_year']
    self.r_output_attrs = ['bike_name', 'city_posted', 'km_driven',
                           'price', 'color', 'model_year']