def test_build_id_to_index_map_2(self):
     table = [['a1', 'hello'], ['a1', 'world']]
     key = 'ID'
     dataframe = pd.DataFrame(table)
     dataframe.columns = ['ID', 'title']
     em.set_key(dataframe, key)
     db._build_id_to_index_map(dataframe, key)
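The map built here presumably sends each key value to its row position; a plain-pandas sketch of that idea (an illustration, not db's internals):

import pandas as pd

df = pd.DataFrame([['a1', 'hello'], ['a2', 'world']], columns=['ID', 'title'])
# One dictionary entry per row: key value -> positional index.
id_to_index = {rid: pos for pos, rid in enumerate(df['ID'])}
# id_to_index == {'a1': 0, 'a2': 1}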
 def setUp(self):
     self.A = em.read_csv_metadata(path_for_A)
     em.set_key(self.A, 'ID')
     self.B = em.read_csv_metadata(path_for_B)
     em.set_key(self.B, 'ID')
     self.feature_table = em.get_features_for_blocking(self.A, self.B, validate_inferred_attr_types=False)
     self.rb = em.RuleBasedBlocker()
 def test_bb_block_tables_empty_rtable(self):
     empty_B = pd.DataFrame(columns=self.B.columns)
     em.set_key(empty_B, 'ID')
     self.bb.set_black_box_function(_block_fn)
     C = self.bb.block_tables(self.A, empty_B)
     validate_metadata(C)
     validate_data(C)
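set_black_box_function expects a predicate over a tuple pair that returns True when the pair should be blocked (dropped). _block_fn itself is not shown here, so the following sketch of one is an assumption, including the 'title' column it reads:

def _block_fn(ltuple, rtuple):
    # Block the pair unless the two titles share at least one token.
    l_tokens = set(str(ltuple['title']).lower().split())
    r_tokens = set(str(rtuple['title']).lower().split())
    return len(l_tokens & r_tokens) == 0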
 def test_ob_block_tables_empty_rtable(self):
     empty_B = pd.DataFrame(columns=self.B.columns)
     em.set_key(empty_B, 'ID')
     C = self.ob.block_tables(self.A, empty_B,
                              l_overlap_attr_1, r_overlap_attr_1)
     validate_metadata(C)
     validate_data(C)
 def setup(self):
     p = mg.get_install_path()
     path_for_A = os.sep.join([datasets_path, 'bikes', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'bikes', 'B.csv'])
     l_output_attrs = [
         'bike_name', 'city_posted', 'km_driven', 'price', 'color',
         'model_year'
     ]
     r_output_attrs = [
         'bike_name', 'city_posted', 'km_driven', 'price', 'color',
         'model_year'
     ]
     try:
         A = mg.read_csv_metadata(path_for_A)
         mg.set_key(A, 'id')
         B = mg.read_csv_metadata(path_for_B)
         mg.set_key(B, 'id')
         C = ab.block_tables(A, B, 'city_posted', 'city_posted',
                             l_output_attrs, r_output_attrs)
         self.D = ab.block_candset(C, 'model_year', 'model_year')
         bb.set_black_box_function(_bikes_function)
     except AssertionError:
         print("Dataset \'bikes\' not found. Please visit the project "
               "website to download the dataset.")
         raise SystemExit
Example #6
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'music', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'music', 'B.csv'])
     try:
         A = mg.read_csv_metadata(path_for_A)
         mg.set_key(A, 'Sno')
         B = mg.read_csv_metadata(path_for_B)
         mg.set_key(B, 'Sno')
         l_output_attrs = [
             'Album_Name', 'Artist_Name', 'CopyRight', 'Released',
             'Song_Name', 'Time'
         ]
         r_output_attrs = [
             'Album_Name', 'Artist_Name', 'Copyright', 'Released',
             'Song_Name', 'Time'
         ]
         C = ob.block_tables(A,
                             B,
                             'Album_Name',
                             'Album_Name',
                             rem_stop_words=True,
                             l_output_attrs=l_output_attrs,
                             r_output_attrs=r_output_attrs)
         self.D = ob.block_candset(C,
                                   'Artist_Name',
                                   'Artist_Name',
                                   rem_stop_words=True)
     except AssertionError:
         print("Dataset \'music\' not found. Please visit the project "
               "website to download the dataset.")
         raise SystemExit
Example #7
def run_quora_blocking(sampler="iterative", lsh_args=None, sequential_args=None):
    if sampler not in ("iterative", "naive"):
        raise ValueError("Sampler should be iterative or naive (completely random).")

    # Load the training set according to the sampler
    em.del_catalog()
    lhs_table = em.read_csv_metadata(
        "../data/processed_quora/quora_" + sampler + "_X_train_lhs.csv"
    ).rename(columns={"Unnamed: 0": "id_lhs"}).sample(n=1500, random_state=52)
    rhs_table = em.read_csv_metadata(
        "../data/processed_quora/quora_" + sampler + "_X_train_rhs.csv"
    ).rename(columns={"Unnamed: 0": "id_rhs"}).sample(n=1500, random_state=52)
    y_train = pd.read_csv("../data/processed_quora/quora_" + sampler + "_y_train.csv")
    em.del_catalog()
    em.set_key(lhs_table, "id_lhs")
    em.set_key(rhs_table, "id_rhs")

    n_train = lhs_table.shape[0]

    # Blocking
    blocking_cols = ["question1", "question2"]
    feature_cols = [["question1"], ["question2"]]
    id_names = ["qid1", "qid2"]
    lsh_blocking_col_ids = 1

    print("Blocking Train Set of Quora using LSH only.")
    candidates = lsh_blocking(lhs_table, rhs_table, lsh_blocking_col_ids, 2,
                              ["qid1", "qid2"], char_ngram=lsh_args["char_ngram"],
                              seeds=lsh_args["seeds"], bands=lsh_args["bands"])
    print(f"Generated candidate set has {candidates.shape[0]} rows")

    raise NotImplementedError
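A hypothetical invocation; the lsh_args keys mirror how the function reads them above, the values are arbitrary, and the call ends in the NotImplementedError raised by the unfinished body:

lsh_args = {"char_ngram": 4, "seeds": 100, "bands": 10}
run_quora_blocking(sampler="naive", lsh_args=lsh_args)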
Example #8
    def setup(self):
        path_for_A = os.sep.join([datasets_path, 'bikes', 'A.csv'])
        path_for_B = os.sep.join([datasets_path, 'bikes', 'B.csv'])
        try:
            A = mg.read_csv_metadata(path_for_A)
            mg.set_key(A, 'id')
            B = mg.read_csv_metadata(path_for_B)
            mg.set_key(B, 'id')
            l_output_attrs = [
                'bike_name', 'city_posted', 'km_driven', 'price', 'color',
                'model_year'
            ]
            r_output_attrs = [
                'bike_name', 'city_posted', 'km_driven', 'price', 'color',
                'model_year'
            ]
            self.C = ab.block_tables(A, B, 'city_posted', 'city_posted',
                                     l_output_attrs, r_output_attrs)
        except AssertionError:
            print("Dataset \'bikes\' not found. Please visit the project"
                  " website to download the dataset.")
            raise SystemExit

        self.l_block_attr = 'model_year'
        self.r_block_attr = 'model_year'
 def test_ab_block_tuples_wi_missing_values_disallow_missing(self):
     path_a = os.sep.join([
         p, 'tests', 'test_datasets', 'blocker',
         'table_A_wi_missing_vals.csv'
     ])
     path_b = os.sep.join([
         p, 'tests', 'test_datasets', 'blocker',
         'table_B_wi_missing_vals.csv'
     ])
     A = em.read_csv_metadata(path_a)
     em.set_key(A, 'ID')
     B = em.read_csv_metadata(path_b)
     em.set_key(B, 'ID')
     assert_equal(
         self.ab.block_tuples(A.loc[0], B.loc[0], l_block_attr_1,
                              r_block_attr_1), True)
     assert_equal(
         self.ab.block_tuples(A.loc[1], B.loc[2], l_block_attr_1,
                              r_block_attr_1), False)
     assert_equal(
         self.ab.block_tuples(A.loc[2], B.loc[1], l_block_attr_1,
                              r_block_attr_1), True)
     assert_equal(
         self.ab.block_tuples(A.loc[0], B.loc[1], l_block_attr_1,
                              r_block_attr_1), True)
     assert_equal(
         self.ab.block_tuples(A.loc[2], B.loc[2], l_block_attr_1,
                              r_block_attr_1), True)
Example #12
 def test_ob_block_candset_wi_missing_vals_disallow_missing(self):
     path_a = os.sep.join([
         p, 'tests', 'test_datasets', 'blocker',
         'table_A_wi_missing_vals.csv'
     ])
     path_b = os.sep.join([
         p, 'tests', 'test_datasets', 'blocker',
         'table_B_wi_missing_vals.csv'
     ])
     A = em.read_csv_metadata(path_a)
     em.set_key(A, 'ID')
     B = em.read_csv_metadata(path_b)
     em.set_key(B, 'ID')
     C = self.ob.block_tables(A,
                              B,
                              l_overlap_attr_1,
                              r_overlap_attr_1,
                              allow_missing=True)
     validate_metadata(C)
     validate_data(C, expected_ids_4)
     D = self.ob.block_candset(C,
                               l_overlap_attr_2,
                               r_overlap_attr_2,
                               rem_stop_words=True,
                               overlap_size=4)
     validate_metadata_two_candsets(C, D)
     validate_data(D, expected_ids_2)
 def setUp(self):
     self.A = em.read_csv_metadata(path_for_A)
     em.set_key(self.A, 'ID')
     self.B = em.read_csv_metadata(path_for_B)
     em.set_key(self.B, 'ID')
     self.feature_table = em.get_features_for_blocking(self.A, self.B)
     self.rb = em.RuleBasedBlocker()
Example #14
 def test_ob_block_tables_empty_ltable(self):
     empty_A = pd.DataFrame(columns=self.A.columns)
     print(empty_A.dtypes)
     em.set_key(empty_A, 'ID')
     C = self.ob.block_tables(empty_A, self.B, l_overlap_attr_1,
                              r_overlap_attr_1)
     validate_metadata(C)
     validate_data(C)
Example #15
def _get_filtered_table(ltable, rtable, lkey, rkey, corres_list):
    ltable_cols = [col_pair[0] for col_pair in corres_list]
    rtable_cols = [col_pair[1] for col_pair in corres_list]
    lfiltered_table = ltable[ltable_cols]
    rfiltered_table = rtable[rtable_cols]
    em.set_key(lfiltered_table, lkey)
    em.set_key(rfiltered_table, rkey)
    return lfiltered_table, rfiltered_table
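A minimal usage sketch for the helper above, with illustrative tables (the column names and values are assumptions):

import pandas as pd
import py_entitymatching as em

ltable = pd.DataFrame([['a1', 'X', 0.11]], columns=['ID', 'name', 'price'])
rtable = pd.DataFrame([['b1', 'Y', 0.22]], columns=['ID', 'name', 'price'])
em.set_key(ltable, 'ID')
em.set_key(rtable, 'ID')
corres_list = [('ID', 'ID'), ('name', 'name')]
# Both tables come back projected to the corresponding columns
# ['ID', 'name'], with their keys re-registered in the catalog.
lf, rf = _get_filtered_table(ltable, rtable, 'ID', 'ID', corres_list)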
Example #18
 def setUp(self):
     self.A = em.read_csv_metadata(path_for_A)
     em.set_key(self.A, 'ID')
     self.B = em.read_csv_metadata(path_for_B)
     em.set_key(self.B, 'ID')
     self.C = em.read_csv_metadata(path_for_C, ltable=self.A, rtable=self.B)
     self.feature_table = em.get_features_for_matching(self.A, self.B, validate_inferred_attr_types=False)
     self.brm = em.BooleanRuleMatcher()
 def test_build_col_name_index_dict_2(self):
     A = pd.DataFrame([[0, 'A', 0.11, 'ASDF']])
     A.columns = ['ID', 'name', 'price', 'desc']
     em.set_key(A, 'ID')
     col_index = db._build_col_name_index_dict(A)
     self.assertEqual(col_index['ID'], 0)
     self.assertEqual(col_index['name'], 1)
     self.assertEqual(col_index['price'], 2)
     self.assertEqual(col_index['desc'], 3)
Example #21
def read_pair_csv_with_metadata(tuples, filename, key, dtype={}):
    # Is there a way to pass dtype through em's native reader? For now, set it "by hand".
    pairs = pd.read_csv(filename, dtype=dtype)
    em.set_key(pairs, key)
    em.set_ltable(pairs, tuples)
    em.set_fk_ltable(pairs, f'ltable_{em.get_key(tuples)}')
    em.set_rtable(pairs, tuples)
    em.set_fk_rtable(pairs, f'rtable_{em.get_key(tuples)}')
    return pairs
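A hypothetical call for context; the filename and dtype mapping are assumptions, and tuples is expected to be a DataFrame already registered with em.set_key:

pairs = read_pair_csv_with_metadata(tuples, 'pairs.csv', '_id',
                                    dtype={'ltable_id': str, 'rtable_id': str})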
 def setUp(self):
     self.A = em.read_csv_metadata(path_for_A)
     em.set_key(self.A, 'ID')
     self.B = em.read_csv_metadata(path_for_B)
     em.set_key(self.B, 'ID')
     self.C = em.read_csv_metadata(path_for_C, ltable=self.A, rtable=self.B)
     self.feature_table = em.get_features_for_matching(self.A, self.B, validate_inferred_attr_types=False)
     self.C['neg_trig_labels'] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
     self.C['pos_trig_labels'] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
     self.mt = em.MatchTrigger()
 def test_ob_block_tables_wi_missing_values_disallow_missing(self):
     path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_A_wi_missing_vals.csv'])
     path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_B_wi_missing_vals.csv'])
     A = em.read_csv_metadata(path_a)
     em.set_key(A, 'ID')
     B = em.read_csv_metadata(path_b)
     em.set_key(B, 'ID')
     C = self.ob.block_tables(A, B, l_overlap_attr_1, r_overlap_attr_1)
     validate_metadata(C)
     validate_data(C, expected_ids_1)
Example #24
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'citations', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'citations', 'B.csv'])
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
     except AssertionError:
         print("Dataset \'anime\' not found. Please visit the project"
               " website to download the dataset.")
         raise SystemExit
Example #25
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'electronics', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'electronics', 'B.csv'])
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
         self.feature_table = mg.get_features_for_blocking(self.A, self.B)
     except AssertionError:
         print("Dataset \'electronics\' not found. Please visit the project "
               "website to download the dataset.")
         raise SystemExit
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'anime', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'anime', 'B.csv'])
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
         self.feature_table = mg.get_features_for_blocking(self.A, self.B)
     except AssertionError:
         print("Dataset \'anime\' not found. Please visit the project "
               "website to download the dataset.")
         raise SystemExit
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'beer', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'beer', 'B.csv'])
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'Label')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'Label')
         bb.set_black_box_function(_beer_function)
     except AssertionError:
         print("Dataset \'beer\' not found. Please visit the project "
               "website to download the dataset.")
         raise SystemExit
Example #28
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'restaurants', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'restaurants', 'B.csv'])
     self.l_output_attrs = ['NAME', 'PHONENUMBER', 'ADDRESS']
     self.r_output_attrs = ['NAME', 'PHONENUMBER', 'ADDRESS']
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
     except AssertionError:
         print("Dataset \'restaurants\' not found. Please visit the project"
               " website to download the dataset.")
         raise SystemExit
    def test_get_filtered_table(self):
        A = pd.DataFrame([['a1', 'A', 0.11, 53704]])
        A.columns = ['ID', 'name', 'price', 'zip code']
        em.set_key(A, 'ID')
        B = pd.DataFrame([['b1', 'A', 0.11, 54321]])
        B.columns = ['ID', 'name', 'price', 'zip code']
        em.set_key(B, 'ID')
        A_key = 'ID'
        B_key = 'ID'
        ltable_col_dict = db._build_col_name_index_dict(A)
        rtable_col_dict = db._build_col_name_index_dict(B)
        attr_corres = [('ID', 'ID'), ('name', 'name'),
                         ('price', 'price'),
                         ('zip code', 'zip code')]
        db._filter_corres_list(A, B, A_key, B_key, ltable_col_dict,
                rtable_col_dict, attr_corres)

        filtered_A, filtered_B = db._get_filtered_table(A, B, attr_corres)

        expected_filtered_A = pd.DataFrame([['a1', 'A']])
        expected_filtered_A.columns = ['ID', 'name']
        em.set_key(expected_filtered_A, 'ID')
        expected_filtered_B = pd.DataFrame([['b1', 'A']])
        expected_filtered_B.columns = ['ID', 'name']
        em.set_key(expected_filtered_B, 'ID')

        self.assertEqual(expected_filtered_A.equals(filtered_A), True)
        self.assertEqual(expected_filtered_B.equals(filtered_B), True)
 def test_index_candidate_set_6(self):
     A_list = [[1, 'asdf', 'fdas'], [2, 'fdsa', 'asdf']]
     B_list = [['B002', 'qqqq', 'wwww'], ['B003', 'rrrr', 'fdsa']]
     A = pd.DataFrame(A_list)
     A.columns = ['ID', 'f1', 'f2']
     em.set_key(A, 'ID')
     B = pd.DataFrame(B_list)
     B.columns = ['ID', 'f1', 'f2']
     em.set_key(B, 'ID')
     C = pd.DataFrame()
     lrecord_id_to_index_map = db._build_id_to_index_map(A, 'ID')
     rrecord_id_to_index_map = db._build_id_to_index_map(B, 'ID')
     new_C = db._index_candidate_set(C,
             lrecord_id_to_index_map, rrecord_id_to_index_map, False)
     self.assertEqual(new_C, {})
 def test_ab_block_candset_wi_missing_values_disallow_missing(self):
     path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_A_wi_missing_vals.csv'])
     path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_B_wi_missing_vals.csv'])
     A = em.read_csv_metadata(path_a)
     em.set_key(A, 'ID')
     B = em.read_csv_metadata(path_b)
     em.set_key(B, 'ID')
     C = self.ab.block_tables(A, B, l_block_attr_1, r_block_attr_1)
     validate_metadata(C)
     validate_data(C, expected_ids_4)
     D = self.ab.block_candset(C, l_block_attr_2, r_block_attr_2)
     validate_metadata_two_candsets(C, D)
     validate_data(D, [('a5','b5')])
 def test_ob_block_tuples_wi_missing_vals_disallow_missing(self):
     path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_A_wi_missing_vals.csv'])
     path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_B_wi_missing_vals.csv'])
     A = em.read_csv_metadata(path_a)
     em.set_key(A, 'ID')
     B = em.read_csv_metadata(path_b)
     em.set_key(B, 'ID')
     assert_equal(self.ob.block_tuples(A.loc[1], B.loc[3], l_overlap_attr_1,
                                       r_overlap_attr_1), True)
     assert_equal(self.ob.block_tuples(A.loc[3], B.loc[2], l_overlap_attr_1,
                                       r_overlap_attr_1), True)
     assert_equal(self.ob.block_tuples(A.loc[3], B.loc[3], l_overlap_attr_1,
                                       r_overlap_attr_1), True)
 def test_ab_block_tables_wi_missing_values_allow_missing(self):
     path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_A_wi_missing_vals.csv'])
     path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_B_wi_missing_vals.csv'])
     A = em.read_csv_metadata(path_a)
     em.set_key(A, 'ID')
     B = em.read_csv_metadata(path_b)
     em.set_key(B, 'ID')
     C = self.ab.block_tables(A, B, l_block_attr_1, r_block_attr_1,
                              l_output_attrs, r_output_attrs,
                              l_output_prefix, r_output_prefix, True)
     validate_metadata(C, l_output_attrs, r_output_attrs,
                       l_output_prefix, r_output_prefix)
     validate_data(C, expected_ids_3)
 def test_get_tokenized_table_3(self):
     table = [[1, 'abc abc asdf', '123-3456-7890', np.nan, '',
               '135 east  abc  st'],
              [2, 'aaa bbb', '000-111-2222', '', '', '246  west abc st'],
              [3, 'cc dd', '123-123-1231', 'cc', 'unknown', ' 246 west def st']]
     dataframe = pd.DataFrame(table)
     dataframe.columns = ['ID', 'name', 'phone', 'department', 'school', 'address']
     key = 'ID'
     em.set_key(dataframe, key)
     feature_list = [1, 3, 4, 5]
     actual_record_list = db._get_tokenized_table(dataframe, key, feature_list)
     expected_record_list = [['abc', 'abc_1', 'asdf', '135', 'east', 'abc_2', 'st'],
                             ['aaa', 'bbb', '246', 'west', 'abc', 'st'],
                             ['cc', 'dd', 'cc_1', 'unknown', '246', 'west', 'def', 'st']]
     self.assertEqual(actual_record_list, expected_record_list)
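The expected output above shows repeated tokens within a record being disambiguated with _1, _2 suffixes; a plain-Python sketch of that scheme (an illustration, not db's implementation):

def dedup_tokens(tokens):
    # Append _n to the n-th repeat of a token within one record.
    seen = {}
    out = []
    for tok in tokens:
        count = seen.get(tok, 0)
        out.append(tok if count == 0 else '%s_%d' % (tok, count))
        seen[tok] = count + 1
    return out

dedup_tokens(['abc', 'abc', 'asdf', 'abc'])  # ['abc', 'abc_1', 'asdf', 'abc_2']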
Example #38
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'anime', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'anime', 'B.csv'])
     self.l_block_attr = 'Year'
     self.r_block_attr = 'Year'
     self.l_output_attrs = ['Title', 'Year', 'Episodes']
     self.r_output_attrs = ['Title', 'Year', 'Episodes']
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
     except AssertionError:
         print("Dataset \'anime\' not found. Please visit the project"
               " website to download the dataset.")
         raise SystemExit
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'restaurants', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'restaurants', 'B.csv'])
     self.l_block_attr = 'PHONENUMBER'
     self.r_block_attr = 'PHONENUMBER'
     self.l_output_attrs = ['NAME', 'PHONENUMBER', 'ADDRESS']
     self.r_output_attrs = ['NAME', 'PHONENUMBER', 'ADDRESS']
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
     except AssertionError:
         print("Dataset \'restaurants\' not found. Please visit the project"
               " website to download the dataset.")
         raise SystemExit
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'electronics', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'electronics', 'B.csv'])
     self.l_block_attr = 'Brand'
     self.r_block_attr = 'Brand'
     self.l_output_attrs = ['Brand', 'Amazon_Price']
     self.r_output_attrs = ['Brand', 'Price']
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
     except AssertionError:
         print("Dataset \'electronics\' not found. Please visit the project"
               " website to download the dataset.")
         raise SystemExit
 def test_filter_corres_list_1(self):
     A = pd.DataFrame([[0, 20, 0.11, 4576]])
     A.columns = ['ID', 'age', 'price', 'zip code']
     em.set_key(A, 'ID')
     B = pd.DataFrame([[0, 240, 0.311, 4474]])
     B.columns = ['ID', 'age', 'price', 'zip code']
      em.set_key(B, 'ID')
     A_key = 'ID'
     B_key = 'ID'
     ltable_col_dict = db._build_col_name_index_dict(A)
     rtable_col_dict = db._build_col_name_index_dict(B)
     attr_corres = [('ID', 'ID'), ('age', 'age'),
                      ('price', 'price'),
                      ('zip code', 'zip code')]
     db._filter_corres_list(A, B, A_key, B_key, ltable_col_dict,
             rtable_col_dict, attr_corres)
Example #43
def main():
    # clean up original csv
    # comment out this code if A.csv and B.csv already exists
    #clean_up()
    # read csv tables
    A = pd.read_csv(Directory+'A.csv')
    B = pd.read_csv(Directory+'B.csv')
    # set keys to tables
    em.set_key(A, 'id')
    em.set_key(B, 'id')
    # block tables using black-box blocker
    C = black_box_blocker(A, B)
    C.to_csv('C.csv', index = False)
    # debug blocker
    dbq = blocker_debugging(C, A, B)
    dbq.to_csv('debugged_result.csv')
Example #44
    def test_get_field_correspondence_list_5(self):
        A = pd.DataFrame([[0, 'A', 0.11, 'ASDF']])
        A.columns = ['ID', 'name', 'price', 'desc']
        em.set_key(A, 'ID')
        A_key = em.get_key(A)
        B = pd.DataFrame([['B', 'B001', 'ASDF', 0.111]])
        B.columns = ['item_name', 'item_id', 'item_desc', 'item_price']
        em.set_key(B, 'item_id')
        B_key = em.get_key(B)
        attr_corres = [('name', 'item_name'), ('price', 'item_price')]
        actual_attr_corres = db._get_field_correspondence_list(
            A, B, A_key, B_key, attr_corres)

        expected_attr_corres = [('name', 'item_name'), ('price', 'item_price'),
                                ('ID', 'item_id')]
        self.assertEqual(expected_attr_corres, actual_attr_corres)
Example #47
    def test_debugblocker_12(self):
        llist = [[0]]
        rlist = [[0]]
        ltable = pd.DataFrame(llist)
        rtable = pd.DataFrame(rlist)
        ltable.columns = ['ID']
        rtable.columns = ['ID']
        lkey = 'ID'
        rkey = 'ID'
        em.set_key(ltable, lkey)
        em.set_key(rtable, rkey)
        cand_set = pd.DataFrame([[0, 0, 0]])
        cand_set.columns = ['_id', 'ltable_ID', 'rtable_ID']
        cm.set_candset_properties(cand_set, '_id', 'ltable_ID',
                                  'rtable_ID', ltable, rtable)

        db.debug_blocker(cand_set, ltable, rtable)
 def test_get_tokenized_table_3(self):
      table = [[1, 'abc abc asdf', '123-3456-7890', np.nan, '',
               '135 east  abc  st'],
              [2, 'aaa bbb', '000-111-2222', '', '', '246  west abc st'],
              [3, 'cc dd', '123-123-1231', 'cc', 'unknown', ' 246 west def st']]
     dataframe = pd.DataFrame(table)
     dataframe.columns = ['ID', 'name', 'phone', 'department', 'school', 'address']
     key = 'ID'
     em.set_key(dataframe, key)
     feature_list = [1, 3, 4, 5]
     actual_record_list = db._get_tokenized_table(dataframe, key, feature_list)
     expected_record_list = [[('abc', 0), ('abc_1', 0), ('asdf', 0), ('135', 3), ('east', 3),
                             ('abc_2', 3), ('st', 3)], [('aaa', 0), ('bbb', 0), ('246', 3),
                             ('west', 3), ('abc', 3), ('st', 3)], [('cc', 0), ('dd', 0),
                             ('cc_1', 1), ('unknown', 2), ('246', 3), ('west', 3),
                             ('def', 3), ('st', 3)]]
     self.assertEqual(actual_record_list, expected_record_list)
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'books', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'books', 'B.csv'])
     self.l_block_attr = 'Author'
     self.r_block_attr = 'Author'
     self.l_output_attrs = ['Title', 'Author', 'ISBN13', 'Publisher',
                            'Publication_Date']
     self.r_output_attrs = ['Title', 'Author', 'ISBN13', 'Publisher',
                            'Publication_Date']
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
     except AssertionError:
         print("Dataset \'books\' not found. Please visit the project"
               " website to download the dataset.")        
         raise SystemExit
 def test_index_candidate_set_5(self):
     A_list = [[1, 'asdf', 'fdas'], [2, 'fdsa', 'asdf']]
     B_list = [['B002', 'qqqq', 'wwww'], ['B003', 'rrrr', 'fdsa']]
     A = pd.DataFrame(A_list)
     A.columns = ['ID', 'f1', 'f2']
     em.set_key(A, 'ID')
     B = pd.DataFrame(B_list)
     B.columns = ['ID', 'f1', 'f2']
     em.set_key(B, 'ID')
     C_list = [[0, 1, 'B001'], [1, 2, 'B002']]
     C = pd.DataFrame(C_list)
     C.columns = ['_id', 'ltable_ID', 'rtable_ID']
     cm.set_candset_properties(C, '_id', 'ltable_ID',
                               'rtable_ID', A, B)
     lrecord_id_to_index_map = db._build_id_to_index_map(A, 'ID')
     rrecord_id_to_index_map = db._build_id_to_index_map(B, 'ID')
     db._index_candidate_set(C,
             lrecord_id_to_index_map, rrecord_id_to_index_map, False)
    def setup(self):
        path_for_A = os.sep.join([datasets_path, 'books', 'A.csv'])
        path_for_B = os.sep.join([datasets_path, 'books', 'B.csv'])
        try:
            A = mg.read_csv_metadata(path_for_A)
            mg.set_key(A, 'ID')
            B = mg.read_csv_metadata(path_for_B)
            mg.set_key(B, 'ID')
            self.C = snb.block_tables(A, B, 'Author', 'Author',
                                     ['Title', 'Author', 'ISBN13', 'Publisher'],
                                     ['Title', 'Author', 'ISBN13', 'Publisher'])
        except AssertionError:
            print("Dataset \'books\' not found. Please visit the project"
                  " website to download the dataset.")        
            raise SystemExit

        self.l_block_attr = 'ISBN13'
        self.r_block_attr = 'ISBN13'
    def setup(self):
        path_for_A = os.sep.join([datasets_path, 'restaurants', 'A.csv'])
        path_for_B = os.sep.join([datasets_path, 'restaurants', 'B.csv'])
        try:
            A = mg.read_csv_metadata(path_for_A)
            mg.set_key(A, 'ID')
            B = mg.read_csv_metadata(path_for_B)
            mg.set_key(B, 'ID')
            ob = mg.OverlapBlocker()
            self.C = ob.block_tables(A, B, 'ADDRESS', 'ADDRESS', overlap_size=4,
                                     l_output_attrs=['NAME', 'PHONENUMBER', 'ADDRESS'],
                                     r_output_attrs=['NAME', 'PHONENUMBER', 'ADDRESS'])
            feature_table = mg.get_features_for_blocking(A, B)
            self.rb = mg.RuleBasedBlocker()
            self.rb.add_rule(['ADDRESS_ADDRESS_jac_qgm_3_qgm_3(ltuple,rtuple) < 0.44'],
                             feature_table)
        except AssertionError:
            print("Dataset \'restaurants\' not found. Please visit the project "
                  "website to download the dataset.")
            raise SystemExit
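Once the rule is added, the benchmark would presumably apply the rule-based blocker to the earlier candidate set; a minimal sketch using py_entitymatching's blocker API:

D = self.rb.block_candset(self.C)  # keeps only the pairs the rule does not block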
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'ebooks', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'ebooks', 'B.csv'])
     try:
         A = mg.read_csv_metadata(path_for_A)
         mg.set_key(A, 'record_id')
         B = mg.read_csv_metadata(path_for_B)
         mg.set_key(B, 'record_id')
         ob = mg.OverlapBlocker()
         self.C = ob.block_tables(A, B, 'title', 'title', overlap_size=2,
                                  rem_stop_words=True,
                                  l_output_attrs=['title', 'author', 'publisher', 'date'],
                                  r_output_attrs=['title', 'author', 'publisher', 'date'])
         feature_table = mg.get_features_for_blocking(A, B)
         self.rb = mg.RuleBasedBlocker()
         self.rb.add_rule(['date_date_lev_sim(ltuple, rtuple) < 0.6'], feature_table)
     except AssertionError:
         print("Dataset \'beer\' not found. Please visit the project "
               "website to download the dataset.")
         raise SystemExit
 def test_index_candidate_set_4(self):
     A_list = [[1, 'asdf', 'fdas'], [2, 'fdsa', 'asdf']]
     B_list = [['B002', 'qqqq', 'wwww'], ['B003', 'rrrr', 'fdsa']]
     A = pd.DataFrame(A_list)
     A.columns = ['ID', 'f1', 'f2']
     em.set_key(A, 'ID')
     B = pd.DataFrame(B_list)
     B.columns = ['ID', 'f1', 'f2']
     em.set_key(B, 'ID')
     C_list = [[0, 1, 'B003'], [1, 2, 'B002']]
     C = pd.DataFrame(C_list)
     C.columns = ['_id', 'ltable_ID', 'rtable_ID']
     cm.set_candset_properties(C, '_id', 'ltable_ID',
                               'rtable_ID', A, B)
     lrecord_id_to_index_map = db._build_id_to_index_map(A, 'ID')
     rrecord_id_to_index_map = db._build_id_to_index_map(B, 'ID')
     expected_cand_set = {0: set([1]), 1: set([0])}
     actual_cand_set = db._index_candidate_set(C,
             lrecord_id_to_index_map, rrecord_id_to_index_map, False)
     self.assertEqual(expected_cand_set, actual_cand_set)
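A plain-Python sketch of the indexing these tests exercise: each candidate pair of IDs is rewritten as ltable row position -> set of rtable row positions (an illustration, not db's internals):

lmap = {1: 0, 2: 1}            # ltable ID -> row position
rmap = {'B002': 0, 'B003': 1}  # rtable ID -> row position
cand_pairs = [(1, 'B003'), (2, 'B002')]
indexed = {}
for lid, rid in cand_pairs:
    indexed.setdefault(lmap[lid], set()).add(rmap[rid])
# indexed == {0: {1}, 1: {0}}, matching expected_cand_set above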
    def setup(self):
        p = mg.get_install_path()
        path_for_A = os.sep.join([datasets_path, 'bikes', 'A.csv'])
        path_for_B = os.sep.join([datasets_path, 'bikes', 'B.csv'])
        try:
            self.A = mg.read_csv_metadata(path_for_A)
            mg.set_key(self.A, 'id')
            self.B = mg.read_csv_metadata(path_for_B)
            mg.set_key(self.B, 'id')
        except AssertionError:
            print("Dataset \'bikes\' not found. Please visit the project"
                  " website to download the dataset.")        
            raise SystemExit

        self.l_block_attr = 'city_posted'
        self.r_block_attr = 'city_posted'
        self.l_output_attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
                               'color', 'model_year']
        self.r_output_attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
                               'color', 'model_year']