def test_sample_table_valid_1(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     D = sample_table(C, 10, False)
     self.assertEqual(cm.get_all_properties(C), cm.get_all_properties(D))
     self.assertEqual(len(D), 10)
Example #2
 def test_label_table_valid_3(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     D = label_table(C, 'label')
     p1, p2 = cm.get_all_properties(C), cm.get_all_properties(D)
     self.assertEqual(p1, p2)
Example #3
 def test_label_table_valid_3(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     D = label_table(C, 'label')
     p1, p2 = cm.get_all_properties(C), cm.get_all_properties(D)
     self.assertEqual(p1, p2)
    def test_blocker_combiner_valid_8(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C1 = read_csv_metadata(os.sep.join([bc_datasets_path, 'C4_ex_1.csv']), ltable=A, rtable=B)
        C1.rename(columns={'l_ID':'ltable_ID'}, inplace=True)
        C1.rename(columns={'r_ID':'rtable_ID'}, inplace=True)
        cm.set_fk_ltable(C1, 'ltable_ID')
        cm.set_fk_rtable(C1, 'rtable_ID')
        C2 = read_csv_metadata(os.sep.join([bc_datasets_path, 'C4_ex_2.csv']), ltable=A, rtable=B)
        C2.rename(columns={'l_ID':'ltable_ID'}, inplace=True)
        C2.rename(columns={'r_ID':'rtable_ID'}, inplace=True)
        cm.set_fk_ltable(C2, 'ltable_ID')
        cm.set_fk_rtable(C2, 'rtable_ID')

        C = combine_blocker_outputs_via_union([C1, C2], 'l_', 'r_')
        C_exp = read_csv_metadata(os.sep.join([bc_datasets_path, 'C_ex_4.csv']), ltable=A, rtable=B)
        C_exp.rename(columns={'l_ID':'ltable_ID'}, inplace=True)
        C_exp.rename(columns={'r_ID':'rtable_ID'}, inplace=True)
        cm.set_fk_ltable(C_exp, 'ltable_ID')
        cm.set_fk_rtable(C_exp, 'rtable_ID')

        # C_exp.sort_values(['l_ID', 'r_ID'], inplace=True)
        # C_exp.reset_index(inplace=True, drop=True)
        # C_exp['_id'] = six.moves.range(0, len(C_exp))
        # C_exp.drop('r_address', axis=1, inplace=True)
        if os.name != 'nt':
            self.assertEqual(C.equals(C_exp), True)
        p1 = cm.get_all_properties(C)
        p2 = cm.get_all_properties(C_exp)
        self.assertEqual(p1, p2)
Example #5
    def test_blocker_combiner_valid_6(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C1 = read_csv_metadata(os.sep.join([bc_datasets_path, 'C4_ex_1.csv']),
                               ltable=A,
                               rtable=B)
        C2 = read_csv_metadata(os.sep.join([bc_datasets_path, 'C4_ex_2.csv']),
                               ltable=A,
                               rtable=B)
        C = combine_blocker_outputs_via_union([C1, C2], 'l_', 'r_')

        C_exp = read_csv_metadata(os.sep.join([bc_datasets_path,
                                               'C_ex_4.csv']),
                                  ltable=A,
                                  rtable=B)
        # try:
        #     C_exp.sort_values(['l_ID', 'r_ID'], inplace=True)
        # except AttributeError:
        #     C_exp.sort(['l_ID', 'r_ID'], inplace=True)
        C_exp.reset_index(inplace=True, drop=True)
        C_exp['_id'] = six.moves.range(0, len(C_exp))

        if os.name != 'nt':
            self.assertEqual(C.equals(C_exp), True)
        p1 = cm.get_all_properties(C)
        p2 = cm.get_all_properties(C_exp)
        self.assertEqual(p1, p2)
 def test_sample_table_valid_1(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     D = sample_table(C, 10, False)
     self.assertEqual(cm.get_all_properties(C), cm.get_all_properties(D))
     self.assertEqual(len(D), 10)
 def test_set_properties_valid_1(self):
     A = read_csv_metadata(path_a)
     p = cm.get_all_properties(A)
     B = pd.read_csv(path_b)
     cm.init_properties(B)
     cm.set_properties(B,p)
     self.assertEqual(cm.get_all_properties(B)==p, True)
 def test_copy_properties_update_false_2(self):
     A = read_csv_metadata(path_a)
     A1 = pd.read_csv(path_a)
     cm.copy_properties(A, A1, replace=False)
     p = cm.get_all_properties(A)
     p1 = cm.get_all_properties(A1)
     self.assertEqual(p, p1)
     self.assertEqual(cm.get_key(A1), cm.get_key(A))
 def test_copy_properties_valid_1(self):
     A = read_csv_metadata(path_a)
     A1 = pd.read_csv(path_a)
     cm.copy_properties(A, A1)
     self.assertEqual(cm.is_dfinfo_present(A1), True)
     p = cm.get_all_properties(A)
     p1 = cm.get_all_properties(A1)
     self.assertEqual(p, p1)
     self.assertEqual(cm.get_key(A1), cm.get_key(A))
 def test_blocker_combiner_valid_5(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C1 = read_csv_metadata(os.sep.join([bc_datasets_path, 'C3_ex_2.csv']), ltable=A, rtable=B)
     C = combine_blocker_outputs_via_union([C1, C1])
     self.assertEqual(len(C), 0)
     p1 = cm.get_all_properties(C)
     p2 = cm.get_all_properties(C1)
     self.assertEqual(p1, p2)
Example #11
 def test_blocker_combiner_valid_5(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C1 = read_csv_metadata(os.sep.join([bc_datasets_path, 'C3_ex_2.csv']),
                            ltable=A,
                            rtable=B)
     C = combine_blocker_outputs_via_union([C1, C1])
     self.assertEqual(len(C), 0)
     p1 = cm.get_all_properties(C)
     p2 = cm.get_all_properties(C1)
     self.assertEqual(p1, p2)
Example #12
 def test_label_table_valid_2(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     col_name = 'label'
     num_zeros, num_ones = 8, 7
     label_values = [0] * num_zeros
     label_values.extend([1] * num_ones)
     D = self._test_label_table(C, col_name, label_values)
     self.assertEqual(np.sum(D[col_name]), num_ones)
     p1, p2 = cm.get_all_properties(C), cm.get_all_properties(D)
     self.assertEqual(p1, p2)
Example #13
 def test_label_table_valid_2(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     col_name = 'label'
     num_zeros, num_ones = 8, 7
     label_values = [0]*num_zeros
     label_values.extend([1]*num_ones)
     D = self._test_label_table(C, col_name, label_values)
     self.assertEqual(np.sum(D[col_name]), num_ones)
     p1, p2 = cm.get_all_properties(C), cm.get_all_properties(D)
     self.assertEqual(p1, p2)
    def test_blocker_combiner_valid_2(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C1 = read_csv_metadata(os.sep.join([bc_datasets_path, 'C1_ex_1.csv']), ltable=A, rtable=B)
        C2 = read_csv_metadata(os.sep.join([bc_datasets_path, 'C2_ex_1.csv']), ltable=A, rtable=B)
        C = combine_blocker_outputs_via_union([C1, C2])

        C_exp = read_csv_metadata(os.sep.join([bc_datasets_path, 'C_ex_1.csv']), ltable=A, rtable=B)
        if os.name != 'nt':
            self.assertEqual(C.equals(C_exp), True)
        p1 = cm.get_all_properties(C)
        p2 = cm.get_all_properties(C_exp)
        self.assertEqual(p1, p2)
 def test_extract_feature_vecs_with_parralel_job_count_less_than_zero(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     col_pos = len(C.columns)
     C.insert(col_pos, 'label', [0] * len(C))
     feature_table = get_features_for_matching(A, B, validate_inferred_attr_types=False)
     F = extract_feature_vecs(C, attrs_before=['ltable_name', 'rtable_name'], feature_table=feature_table, n_jobs=-1)
     self.assertEqual(isinstance(F, pd.DataFrame), True)
     self.assertEqual(F.columns[0], '_id')
     self.assertEqual(F.columns[1], cm.get_fk_ltable(C))
     self.assertEqual(F.columns[2], cm.get_fk_rtable(C))
     self.assertEqual(F.columns[4], 'rtable_name')
     self.assertEqual(F.columns[len(F.columns) - 1] == 'label', False)
     self.assertEqual(cm.get_all_properties(C) == cm.get_all_properties(F), True)
    def test_valid_path_df_chk_catalog_2(self):
        cm.del_catalog()
        del_files_in_dir(sndbx_path)
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')

        C = read_csv_metadata(path_c, ltable=A, rtable=B)

        p = os.sep.join([sndbx_path, 'C_saved.csv'])
        creat_dir_ifnot_exists(sndbx_path)
        to_csv_metadata(C, p)

        C1 = read_csv_metadata(p, ltable=A, rtable=B)

        self.assertEqual(cm.get_all_properties(C1), cm.get_all_properties(C),
                         'The properties in the catalog are not the same')
Example #17
    def test_copy_properties_valid_2(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b)
        C = read_csv_metadata(path_c, ltable=A, rtable=B)

        C1 = pd.read_csv(path_c)
        cm.copy_properties(C, C1)
        self.assertEqual(cm.is_dfinfo_present(C1), True)
        p = cm.get_all_properties(C)
        p1 = cm.get_all_properties(C1)
        self.assertEqual(p, p1)
        self.assertEqual(cm.get_key(C1), cm.get_key(C))
        self.assertEqual(cm.get_ltable(C1).equals(A), True)
        self.assertEqual(cm.get_rtable(C1).equals(B), True)
        self.assertEqual(cm.get_fk_ltable(C1), cm.get_fk_ltable(C))
        self.assertEqual(cm.get_fk_rtable(C1), cm.get_fk_rtable(C))
Example #18
def _is_table(df):
    table_props = ['key']
    properties = cm.get_all_properties(df)
    keys = list(properties)
    if len(gh.list_diff(keys, table_props)) == 0:
        return True
    else:
        return False
Example #19
def _is_table(df):
    table_props = ['key']
    properties = cm.get_all_properties(df)
    keys = list(properties)
    if len(gh.list_diff(keys, table_props)) == 0:
        return True
    else:
        return False
    def test_valid_path_df_chk_catalog_2(self):
        cm.del_catalog()
        del_files_in_dir(sndbx_path)
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')

        C = read_csv_metadata(path_c, ltable=A, rtable=B)

        p = os.sep.join([sndbx_path, 'C_saved.csv'])
        creat_dir_ifnot_exists(sndbx_path)
        to_csv_metadata(C, p)

        C1 = read_csv_metadata(p, ltable=A, rtable=B)

        self.assertEqual(cm.get_all_properties(C1), cm.get_all_properties(C),
                         'The properties in the catalog are not the same')
 def test_extract_feature_vecs_valid_8(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     col_pos = len(C.columns)
     C.insert(col_pos, 'label', [0] * len(C))
     feature_table = get_features_for_matching(A, B, validate_inferred_attr_types=False)
     F = extract_feature_vecs(C,
                              feature_table=pd.DataFrame(columns=feature_table.columns),
                              attrs_after=['label', '_id'])
     self.assertEqual(isinstance(F, pd.DataFrame), True)
     self.assertEqual(F.columns[0], '_id')
     self.assertEqual(F.columns[1], cm.get_fk_ltable(C))
     self.assertEqual(F.columns[2], cm.get_fk_rtable(C))
     # self.assertEqual(F.columns[3], 'ltable_name')
     self.assertEqual(F.columns[len(F.columns) - 1] == 'label', True)
     self.assertEqual(cm.get_all_properties(C) == cm.get_all_properties(F), True)
    def test_blocker_combiner_valid_4(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C1 = read_csv_metadata(os.sep.join([bc_datasets_path, 'C1_ex_1.csv']), ltable=A, rtable=B)
        C = combine_blocker_outputs_via_union([C1, C1])
        # try:
        #     C1.sort_values(['ltable_ID', 'rtable_ID'], inplace=True)
        # except AttributeError:
        #     C1.sort(['ltable_ID', 'rtable_ID'], inplace=True)
        # to_csv_metadata(C1, os.sep.join([bc_datasets_path, 'C1_ex_1.csv']))

        C1.reset_index(inplace=True, drop=True)
        C1['_id'] = six.moves.range(0, len(C1))

        if os.name != 'nt':
            self.assertEqual(C.equals(C1), True)
        p1 = cm.get_all_properties(C)
        p2 = cm.get_all_properties(C1)
        self.assertEqual(p1, p2)
 def test_train_test_split_valid_1(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     result = mu.split_train_test(C)
     train = result['train']
     test = result['test']
     self.assertEqual(len(train)+len(test), len(C))
     p1 = cm.get_all_properties(C)
     p2 = cm.get_all_properties(train)
     p3 = cm.get_all_properties(test)
     # d = {}
     # d['ltable'] = A
     # d['rtable'] = A
     # d['key'] = '_id'
     # d['fk_ltable'] = 'ltable_ID'
     # d['fk_rtable'] = 'rtable_ID'
     self.assertEqual(p1 == p2, True)
     self.assertEqual(p1 == p3, True)
Example #24
 def test_train_test_split_valid_1(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     result = mu.split_train_test(C)
     train = result['train']
     test = result['test']
     self.assertEqual(len(train) + len(test), len(C))
     p1 = cm.get_all_properties(C)
     p2 = cm.get_all_properties(train)
     p3 = cm.get_all_properties(test)
     # d = {}
     # d['ltable'] = A
     # d['rtable'] = A
     # d['key'] = '_id'
     # d['fk_ltable'] = 'ltable_ID'
     # d['fk_rtable'] = 'rtable_ID'
     self.assertEqual(p1 == p2, True)
     self.assertEqual(p1 == p3, True)
 def test_extract_feature_vecs_valid_2(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     col_pos = len(C.columns)
     C.insert(col_pos, 'label', [0] * len(C))
     feature_table = get_features_for_matching(A, B)
     F = extract_feature_vecs(C,
                              attrs_before=['ltable_name', 'rtable_name'],
                              feature_table=feature_table)
     self.assertEqual(isinstance(F, pd.DataFrame), True)
     self.assertEqual(F.columns[0], '_id')
     self.assertEqual(F.columns[1], cm.get_fk_ltable(C))
     self.assertEqual(F.columns[2], cm.get_fk_rtable(C))
     self.assertEqual(F.columns[3], 'ltable_name')
     self.assertEqual(F.columns[4], 'rtable_name')
     self.assertEqual(F.columns[len(F.columns) - 1] == 'label', False)
     self.assertEqual(
         cm.get_all_properties(C) == cm.get_all_properties(F), True)
Example #26
    def test_blocker_combiner_valid_2(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C1 = read_csv_metadata(os.sep.join([bc_datasets_path, 'C1_ex_1.csv']),
                               ltable=A,
                               rtable=B)
        C2 = read_csv_metadata(os.sep.join([bc_datasets_path, 'C2_ex_1.csv']),
                               ltable=A,
                               rtable=B)
        C = combine_blocker_outputs_via_union([C1, C2])

        C_exp = read_csv_metadata(os.sep.join([bc_datasets_path,
                                               'C_ex_1.csv']),
                                  ltable=A,
                                  rtable=B)
        if os.name != 'nt':
            self.assertEqual(C.equals(C_exp), True)
        p1 = cm.get_all_properties(C)
        p2 = cm.get_all_properties(C_exp)
        self.assertEqual(p1, p2)
Example #27
 def test_extract_feature_vecs_with_parralel_job_count_more_than_one(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     col_pos = len(C.columns)
     C.insert(col_pos, 'label', [0] * len(C))
     feature_table = get_features_for_matching(
         A, B, validate_inferred_attr_types=False)
     F = extract_feature_vecs(C,
                              attrs_before=['ltable_name', 'rtable_name'],
                              feature_table=feature_table,
                              n_jobs=2)
     self.assertEqual(isinstance(F, pd.DataFrame), True)
     self.assertEqual(F.columns[0], '_id')
     self.assertEqual(F.columns[1], cm.get_fk_ltable(C))
     self.assertEqual(F.columns[2], cm.get_fk_rtable(C))
     self.assertEqual(F.columns[4], 'rtable_name')
     self.assertEqual(F.columns[len(F.columns) - 1] == 'label', False)
     self.assertEqual(
         cm.get_all_properties(C) == cm.get_all_properties(F), True)
Example #28
def _is_table_or_candset(df):
    table_props = ['key']
    candset_props = ['key', 'fk_ltable', 'fk_rtable', 'ltable', 'rtable']
    properties = cm.get_all_properties(df)
    keys = list(properties)
    if len(gh.list_diff(keys, table_props)) == 0:
        return True
    elif len(gh.list_diff(keys, candset_props)) == 0:
        return True
    else:
        return False
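
The two helpers above classify a DataFrame purely by which property names are recorded for it in the catalog. The sketch below is not from the source; the data is invented and the cm import path is an assumption based on the calls used in the tests above. It shows the property sets that a plain table and a candidate set end up with, which is exactly what gh.list_diff compares against table_props and candset_props.

import pandas as pd
import py_entitymatching.catalog.catalog_manager as cm  # assumed import path

# A base table: after set_key, the catalog records only the 'key' property,
# so list_diff(keys, ['key']) is empty and the helper returns True.
A = pd.DataFrame({'ID': [1, 2], 'name': ['a', 'b']})
cm.set_key(A, 'ID')
print(sorted(cm.get_all_properties(A)))   # ['key']

# A candidate set: it additionally records fk_ltable, fk_rtable, ltable, rtable,
# matching candset_props in _is_table_or_candset.
C = pd.DataFrame({'_id': [0, 1], 'ltable_ID': [1, 2], 'rtable_ID': [2, 1]})
cm.set_key(C, '_id')
cm.set_property(C, 'ltable', A)
cm.set_property(C, 'rtable', A)
cm.set_fk_ltable(C, 'ltable_ID')
cm.set_fk_rtable(C, 'rtable_ID')
print(sorted(cm.get_all_properties(C)))
# ['fk_ltable', 'fk_rtable', 'key', 'ltable', 'rtable']
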
 def test_valid_path_candset_wi_valid_metadata(self):
     cm.del_catalog()
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')  # not setting key='ID' here would raise a KeyError
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     pd_C = pd.read_csv(path_c)
     self.assertEqual(C.equals(pd_C), True)
     self.assertEqual(len(cm.get_all_properties(C).keys()), 5)
     self.assertEqual(cm.get_key(C), '_id')
     self.assertEqual(cm.get_fk_ltable(C), 'ltable_ID')
     self.assertEqual(cm.get_fk_rtable(C), 'rtable_ID')
Example #30
def _is_table_or_candset(df):
    table_props = ['key']
    candset_props = ['key', 'fk_ltable', 'fk_rtable', 'ltable', 'rtable']
    properties = cm.get_all_properties(df)
    keys = list(properties)
    if len(gh.list_diff(keys, table_props)) == 0:
        return True
    elif len(gh.list_diff(keys, candset_props)) == 0:
        return True
    else:
        return False
Example #31
 def test_extract_feature_vecs_valid_8(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     col_pos = len(C.columns)
     C.insert(col_pos, 'label', [0] * len(C))
     feature_table = get_features_for_matching(
         A, B, validate_inferred_attr_types=False)
     F = extract_feature_vecs(
         C,
         feature_table=pd.DataFrame(columns=feature_table.columns),
         attrs_after=['label', '_id'])
     self.assertEqual(isinstance(F, pd.DataFrame), True)
     self.assertEqual(F.columns[0], '_id')
     self.assertEqual(F.columns[1], cm.get_fk_ltable(C))
     self.assertEqual(F.columns[2], cm.get_fk_rtable(C))
     # self.assertEqual(F.columns[3], 'ltable_name')
     self.assertEqual(F.columns[len(F.columns) - 1] == 'label', True)
     self.assertEqual(
         cm.get_all_properties(C) == cm.get_all_properties(F), True)
Example #32
    def test_blocker_combiner_valid_1(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C1 = read_csv_metadata(path_c1, ltable=A, rtable=B)
        C2 = read_csv_metadata(path_c2, ltable=A, rtable=B)
        C3 = read_csv_metadata(path_c3, ltable=A, rtable=B)
        C = combine_blocker_outputs_via_union([C1, C2, C3])
        C_exp = read_csv_metadata(path_c, ltable=A, rtable=B)
        # try:
        #     C_exp.sort_values(['ltable_ID', 'rtable_ID'], inplace=True)
        # except AttributeError:
        #     C_exp.sort(['ltable_ID', 'rtable_ID'], inplace=True)
        # to_csv_metadata(C_exp, path_c)

        C_exp.reset_index(inplace=True, drop=True)
        C_exp['_id'] = six.moves.range(0, len(C_exp))
        if os.name != 'nt':
            self.assertEqual(C.equals(C_exp), True)
        p1 = cm.get_all_properties(C)
        p2 = cm.get_all_properties(C_exp)
        self.assertEqual(p1, p2)
Example #33
 def test_get_all_properties_valid_2(self):
     # cm.del_catalog()
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     m = cm.get_all_properties(C)
     self.assertEqual(len(m), 5)
     self.assertEqual(m['key'], '_id')
     self.assertEqual(m['fk_ltable'], 'ltable_ID')
     self.assertEqual(m['fk_rtable'], 'rtable_ID')
     self.assertEqual(m['ltable'].equals(A), True)
     self.assertEqual(m['rtable'].equals(B), True)
 def test_valid_path_candset_wi_valid_metadata(self):
     cm.del_catalog()
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(
         path_b, key='ID')  # not setting key='ID' here would raise a KeyError
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     pd_C = pd.read_csv(path_c)
     self.assertEqual(C.equals(pd_C), True)
     self.assertEqual(len(cm.get_all_properties(C).keys()), 5)
     self.assertEqual(cm.get_key(C), '_id')
     self.assertEqual(cm.get_fk_ltable(C), 'ltable_ID')
     self.assertEqual(cm.get_fk_rtable(C), 'rtable_ID')
    def test_blocker_combiner_valid_1(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C1 = read_csv_metadata(path_c1, ltable=A, rtable=B)
        C2 = read_csv_metadata(path_c2, ltable=A, rtable=B)
        C3 = read_csv_metadata(path_c3, ltable=A, rtable=B)
        C = combine_blocker_outputs_via_union([C1, C2, C3])
        C_exp = read_csv_metadata(path_c, ltable=A, rtable=B)
        # try:
        #     C_exp.sort_values(['ltable_ID', 'rtable_ID'], inplace=True)
        # except AttributeError:
        #     C_exp.sort(['ltable_ID', 'rtable_ID'], inplace=True)
        # to_csv_metadata(C_exp, path_c)


        C_exp.reset_index(inplace=True, drop=True)
        C_exp['_id'] = six.moves.range(0, len(C_exp))
        if os.name != 'nt':
            self.assertEqual(C.equals(C_exp), True)
        p1 = cm.get_all_properties(C)
        p2 = cm.get_all_properties(C_exp)
        self.assertEqual(p1, p2)
Example #36
    def test_blocker_combiner_valid_8(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C1 = read_csv_metadata(os.sep.join([bc_datasets_path, 'C4_ex_1.csv']),
                               ltable=A,
                               rtable=B)
        C1.rename(columns={'l_ID': 'ltable_ID'}, inplace=True)
        C1.rename(columns={'r_ID': 'rtable_ID'}, inplace=True)
        cm.set_fk_ltable(C1, 'ltable_ID')
        cm.set_fk_rtable(C1, 'rtable_ID')
        C2 = read_csv_metadata(os.sep.join([bc_datasets_path, 'C4_ex_2.csv']),
                               ltable=A,
                               rtable=B)
        C2.rename(columns={'l_ID': 'ltable_ID'}, inplace=True)
        C2.rename(columns={'r_ID': 'rtable_ID'}, inplace=True)
        cm.set_fk_ltable(C2, 'ltable_ID')
        cm.set_fk_rtable(C2, 'rtable_ID')

        C = combine_blocker_outputs_via_union([C1, C2], 'l_', 'r_')
        C_exp = read_csv_metadata(os.sep.join([bc_datasets_path,
                                               'C_ex_4.csv']),
                                  ltable=A,
                                  rtable=B)
        C_exp.rename(columns={'l_ID': 'ltable_ID'}, inplace=True)
        C_exp.rename(columns={'r_ID': 'rtable_ID'}, inplace=True)
        cm.set_fk_ltable(C_exp, 'ltable_ID')
        cm.set_fk_rtable(C_exp, 'rtable_ID')

        # C_exp.sort_values(['l_ID', 'r_ID'], inplace=True)
        # C_exp.reset_index(inplace=True, drop=True)
        # C_exp['_id'] = six.moves.range(0, len(C_exp))
        # C_exp.drop('r_address', axis=1, inplace=True)
        if os.name != 'nt':
            self.assertEqual(C.equals(C_exp), True)
        p1 = cm.get_all_properties(C)
        p2 = cm.get_all_properties(C_exp)
        self.assertEqual(p1, p2)
Example #37
def _write_metadata(data_frame, file_path):
    """
    Write metadata contents to disk.
    """
    # Initialize a metadata dictionary to store the metadata.
    metadata_dict = collections.OrderedDict()

    # Get all the properties for the input data frame
    if cm.is_dfinfo_present(data_frame) is True:
        properties = cm.get_all_properties(data_frame)
    else:
        # If the data_frame is not in the catalog, then return immediately.
        return False

    # If the properties are present in the catalog, then write properties to
    # disk
    if len(properties) > 0:
        for property_name, property_value in six.iteritems(properties):
            # If the property value is not of type string, then just write it
            #  as 'POINTER'. This will be useful while writing the candidate
            # sets to disk. The candidate set will have properties such as
            # ltable and rtable which are DataFrames. We do not have a simple
            # way to write them to disk and link them back to the candidate set
            # when reading them back from disk. So to get around this problem we
            # will use 'POINTER' as the special value to indicate objects
            # other than strings.
            if isinstance(property_value, six.string_types) is False:
                metadata_dict[property_name] = 'POINTER'
            else:
                metadata_dict[property_name] = property_value

        # Write the properties to a file on disk, one property per line,
        # using the syntax:
        # #property_name=property_value
        with open(file_path, 'w') as file_handler:
            for property_name, property_value in six.iteritems(metadata_dict):
                file_handler.write('#%s=%s\n' %
                                   (property_name, property_value))

    return True
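
As the comments above describe, the metadata file ends up with one '#property_name=property_value' line per property, with 'POINTER' standing in for non-string values. The following sketch is made up for illustration; the file name and the _read_metadata_back helper are not part of the library. It writes a dictionary in that format and parses it back using only the standard library.

import collections

metadata = collections.OrderedDict([
    ('key', '_id'), ('fk_ltable', 'ltable_ID'), ('fk_rtable', 'rtable_ID'),
    ('ltable', 'POINTER'), ('rtable', 'POINTER')])

# Write the same '#name=value' lines that _write_metadata would emit.
with open('C.metadata', 'w') as file_handler:
    for property_name, property_value in metadata.items():
        file_handler.write('#%s=%s\n' % (property_name, property_value))

def _read_metadata_back(file_path):
    # Hypothetical reader: parse '#name=value' lines back into a dict.
    # 'POINTER' entries still have to be re-linked to real DataFrames.
    parsed = {}
    with open(file_path) as file_handler:
        for line in file_handler:
            if line.startswith('#'):
                name, _, value = line[1:].rstrip('\n').partition('=')
                parsed[name] = value
    return parsed

print(_read_metadata_back('C.metadata'))
# prints the five properties, with 'POINTER' for ltable and rtable
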
Example #38
def save_table(data_frame, file_path, metadata_ext='.pklmetadata'):
    """
    Saves a DataFrame to disk along with its metadata in a pickle format.

    This function saves a  DataFrame to disk along with its metadata from
    the catalog.

    Specifically, this function saves the DataFrame in the given
    file path, and saves the metadata in the same directory (as the
    file path) but with a different extension. This extension can be
    optionally given by the user (defaults to '.pklmetadata').

    Args:
        data_frame (DataFrame): The DataFrame that should be saved.

        file_path (string): The file path where the DataFrame must be stored.

        metadata_ext (string): The metadata extension that should be used while
            storing the metadata information. The default value is
            '.pklmetadata'.

    Returns:
        A Boolean value of True is returned if the DataFrame is successfully
        saved.

    Raises:
        AssertionError: If `data_frame` is not of type pandas
         DataFrame.
        AssertionError: If `file_path` is not of type string.
        AssertionError: If `metadata_ext` is not of type string.
        AssertionError: If a file cannot be written to the given `file_path`.

    Examples:

        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.save_table(A, './A.pkl') # will store two files ./A.pkl and ./A.pklmetadata

        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.save_table(A, './A.pkl', metadata_ext='.pklmeta') # will store two files ./A.pkl and ./A.pklmeta


    See Also:
        :meth:`~py_entitymatching.load_table`

    Note:
        This function is a bit different from to_csv_metadata, where the
        DataFrame is stored in a CSV file format. The CSV file format can be
        viewed using a text editor. But a DataFrame stored using 'save_table' is
        stored in a special format, which cannot be viewed with a text editor.
        The reason we have save_table is that, for larger DataFrames, it is
        more efficient to pickle the DataFrame to disk than to write it in
        CSV format.
    """
    # Validate the input parameters

    validate_object_type(data_frame, pd.DataFrame)

    validate_object_type(file_path, six.string_types)

    validate_object_type(metadata_ext, six.string_types)

    # Get the file_name (without extension) and the extension from the given
    #  file path. For example, if the file_path was /Users/foo/file.csv then
    # the file_name will be /Users/foo/file and the extension will be '.csv'
    file_name, _ = os.path.splitext(file_path)

    # The metadata file name is the same file name but with the extension
    # given by the user
    metadata_filename = file_name + metadata_ext

    # Check if the file exists in the file_path and whether we have
    # sufficient access privileges to write in that path
    can_write, file_exists = ps._check_file_path(file_path)

    if can_write:
        # If the file already exists then issue a warning and overwrite the
        # file
        if file_exists:
            logger.warning(
                'File already exists at %s; Overwriting it', file_path)
            # we open the file_path in binary mode, as we are writing in
            # binary format
            with open(file_path, 'wb') as file_handler:
                cloudpickle.dump(data_frame, file_handler)
        else:
            # The file does not exist yet; create it and write the DataFrame
            with open(file_path, 'wb') as file_handler:
                cloudpickle.dump(data_frame, file_handler)
    else:
        # Looks like we cannot write the file in the given path. Raise an
        # error in this case.
        logger.error('Cannot write in the file path %s; Exiting', file_path)
        raise AssertionError('Cannot write in the file path %s' % file_path)

    # Once we are done with writing the DataFrame, we will write the metadata
    #  now

    # Initialize a metadata dictionary to hold the metadata of DataFrame from
    #  the catalog
    metadata_dict = collections.OrderedDict()

    # get all the properties for the input data frame
    # # Check if the DataFrame information is present in the catalog
    properties = {}
    if cm.is_dfinfo_present(data_frame) is True:
        properties = cm.get_all_properties(data_frame)

    # If the properties are present in the catalog, then write properties to
    # disk
    if len(properties) > 0:
        for property_name, property_value in six.iteritems(properties):
            if isinstance(property_value, six.string_types) is True:
                metadata_dict[property_name] = property_value

    # try to save metadata
    can_write, file_exists = ps._check_file_path(metadata_filename)
    if can_write:
        # If the file already exists, then issue a warning and overwrite the
        # file
        if file_exists:
            logger.warning(
                'Metadata file already exists at %s. Overwriting it',
                metadata_filename)
            # write metadata contents
            with open(metadata_filename, 'wb') as file_handler:
                cloudpickle.dump(metadata_dict, file_handler)
        else:
            # write metadata contents
            with open(metadata_filename, 'wb') as file_handler:
                cloudpickle.dump(metadata_dict, file_handler)
    else:
        logger.warning(
            'Cannot write metadata at the file path %s. Skip writing metadata '
            'file', metadata_filename)

    return True
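
A minimal usage sketch of the pickle round-trip follows; the paths and data are made up, and it assumes py_entitymatching is imported as em and that load_table (referenced in See Also) restores the string-valued properties saved above.

import pandas as pd
import py_entitymatching as em

A = pd.DataFrame({'id': [1, 2, 3], 'colA': ['a', 'b', 'c'], 'colB': [10, 20, 30]})
em.set_key(A, 'id')             # 'key' is a string property, so it is written to metadata

em.save_table(A, './A.pkl')     # writes ./A.pkl and ./A.pklmetadata
A2 = em.load_table('./A.pkl')   # reads the DataFrame and its metadata back

assert A2.equals(A)
assert em.get_key(A2) == 'id'   # the catalog entry for A2 was restored

Only string-valued properties survive this round trip; DataFrame-valued properties such as ltable and rtable are skipped by the loop above and must be re-attached after loading.
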
Example #39
def sample_table(table, sample_size, replace=False, verbose=False):
    """
    Samples a candidate set of tuple pairs (for labeling purposes).

    This function samples a DataFrame, typically used for labeling
    purposes. This function expects the input DataFrame containing the
    metadata of a candidate set (such as key, fk_ltable, fk_rtable, ltable,
    rtable). Specifically, this function creates a copy of the input
    DataFrame, samples the data using uniform random sampling (using numpy's
    random choice to sample) and returns the sampled DataFrame.
    Further, it also copies the properties from the input DataFrame to the
    output DataFrame.

    Args:
        table (DataFrame): The input DataFrame to be sampled.
            Specifically,
            a DataFrame containing the metadata of a candidate set (such as
            key, fk_ltable, fk_rtable, ltable, rtable) in the catalog.
        sample_size (int): The number of samples to be picked from the input
            DataFrame.
        replace (boolean): A flag to indicate whether sampling should be
            done with replacement or not (defaults to False).
        verbose (boolean): A flag to indicate whether more detailed information
            about the execution steps should be printed out (defaults to False).

    Returns:
        A new DataFrame with 'sample_size' number of rows.

        Further,
        this function sets the output DataFrame's properties to be the same
        as the input DataFrame's.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
        AssertionError: If the size of `table` is 0.
        AssertionError: If the `sample_size` is greater than the input
            DataFrame size.

    Examples:
        >>> import py_entitymatching as em
        >>> S = em.sample_table(C, sample_size=450) # C is the candidate set to be sampled from.


    Note:
        As mentioned in the above description, the output DataFrame is
        updated (in the catalog) with the properties from the input
        DataFrame. A subtle point to note here is that, when the replace flag
        is set to True, the output DataFrame can contain duplicate keys.
        In that case, this function will not set the key and it is up to
        the user to fix it after the function returns.
    """
    # Validate input parameters.

    # # The input DataFrame is expected to be of type pandas DataFrame.
    validate_object_type(table, pd.DataFrame)

    # # There should be at least one row to sample from
    if len(table) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    # # The sample size should be less than or equal to the number of rows in
    #  the input DataFrame
    if len(table) < sample_size:
        logger.error('Sample size is larger than the input table size')
        raise AssertionError('Sample size is larger than the input table size')

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Get the sample set for the output table
    sample_indices = np.random.choice(len(table), sample_size, replace=replace)
    # Sort the indices ordered by index value
    sample_indices = sorted(sample_indices)
    sampled_table = table.iloc[list(sample_indices)]

    # Copy the properties
    cm.init_properties(sampled_table)

    # # If the replace is set to True, then we should check for the validity
    # of key before setting it
    if replace:
        properties = cm.get_all_properties(table)
        for property_name, property_value in six.iteritems(properties):
            if property_name == 'key':
                # Check for the validity of key before setting it
                cm.set_key(sampled_table, property_value)
            else:
                # Copy the other properties as is
                cm.set_property(sampled_table, property_name, property_value)
    else:
        cm.copy_properties(table, sampled_table)

    # Return the sampled table
    return sampled_table
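
A small end-to-end sketch of sample_table on an in-memory candidate set follows; the tables are invented, and it assumes the top-level setters (em.set_key, em.set_ltable, em.set_rtable, em.set_fk_ltable, em.set_fk_rtable) mirror the cm.* calls used in the tests above.

import pandas as pd
import py_entitymatching as em

A = pd.DataFrame({'ID': [1, 2], 'name': ['apple', 'banana']})
B = pd.DataFrame({'ID': [10, 20], 'name': ['apple inc', 'banana co']})
em.set_key(A, 'ID')
em.set_key(B, 'ID')

# A tiny candidate set carrying the metadata that sample_table validates above.
C = pd.DataFrame({'_id': [0, 1, 2, 3],
                  'ltable_ID': [1, 1, 2, 2],
                  'rtable_ID': [10, 20, 10, 20]})
em.set_key(C, '_id')
em.set_ltable(C, A)
em.set_rtable(C, B)
em.set_fk_ltable(C, 'ltable_ID')
em.set_fk_rtable(C, 'rtable_ID')

S = em.sample_table(C, 3)                 # uniform sample without replacement
assert len(S) == 3 and em.get_key(S) == '_id'

With replace=True the sampled rows can repeat, so '_id' may no longer be a valid key; as the Note above says, the key is then left for the user to fix.
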
Example #40
 def test_get_all_properties_valid_1(self):
     # cm.del_catalog()
     A = read_csv_metadata(path_a)
     m = cm.get_all_properties(A)
     self.assertEqual(len(m), 1)
     self.assertEqual(m['key'], 'ID')
Example #41
def sample_table(table, sample_size, replace=False, verbose=False):
    """
    Samples a candidate set of tuple pairs (for labeling purposes).

    This function samples a DataFrame, typically used for labeling
    purposes. This function expects the input DataFrame containing the
    metadata of a candidate set (such as key, fk_ltable, fk_rtable, ltable,
    rtable). Specifically, this function creates a copy of the input
    DataFrame, samples the data using uniform random sampling (using numpy's
    random choice to sample) and returns the sampled DataFrame.
    Further, it also copies the properties from the input DataFrame to the
    output DataFrame.

    Args:
        table (DataFrame): The input DataFrame to be sampled.
            Specifically,
            a DataFrame containing the metadata of a candidate set (such as
            key, fk_ltable, fk_rtable, ltable, rtable) in the catalog.
        sample_size (int): The number of samples to be picked from the input
            DataFrame.
        replace (boolean): A flag to indicate whether sampling should be
            done with replacement or not (defaults to False).
        verbose (boolean): A flag to indicate whether more detailed information
            about the execution steps should be printed out (defaults to False).

    Returns:
        A new DataFrame with 'sample_size' number of rows.

        Further,
        this function sets the output DataFrame's properties to be the same
        as the input DataFrame's.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
        AssertionError: If the size of `table` is 0.
        AssertionError: If the `sample_size` is greater than the input
            DataFrame size.

    Examples:
        >>> import py_entitymatching as em
        >>> S = em.sample_table(C, sample_size=450) # C is the candidate set to be sampled from.


    Note:
        As mentioned in the above description, the output DataFrame is
        updated (in the catalog) with the properties from the input
        DataFrame. A subtle point to note here is that, when the replace flag
        is set to True, the output DataFrame can contain duplicate keys.
        In that case, this function will not set the key and it is up to
        the user to fix it after the function returns.
    """
    # Validate input parameters.

    # # The input DataFrame is expected to be of type pandas DataFrame.
    validate_object_type(table, pd.DataFrame)

    # # There should be at least one row to sample from
    if len(table) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    # # The sample size should be less than or equal to the number of rows in
    #  the input DataFrame
    if len(table) < sample_size:
        logger.error('Sample size is larger than the input table size')
        raise AssertionError('Sample size is larger than the input table size')

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
        'fk rtable, ltable, rtable, ltable key, rtable key', verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable,
                                      rtable, l_key, r_key, logger, verbose)

    # Get the sample set for the output table
    sample_indices = np.random.choice(len(table), sample_size, replace=replace)
    # Sort the indices ordered by index value
    sample_indices = sorted(sample_indices)
    sampled_table = table.iloc[list(sample_indices)]

    # Copy the properties
    cm.init_properties(sampled_table)

    # # If the replace is set to True, then we should check for the validity
    # of key before setting it
    if replace:
        properties = cm.get_all_properties(table)
        for property_name, property_value in six.iteritems(properties):
            if property_name == 'key':
                # Check for the validity of key before setting it
                cm.set_key(sampled_table, property_value)
            else:
                # Copy the other properties as is
                cm.set_property(sampled_table, property_name, property_value)
    else:
        cm.copy_properties(table, sampled_table)

    # Return the sampled table
    return sampled_table
Example #42
def save_table(data_frame, file_path, metadata_ext='.pklmetadata'):
    """
    Saves a DataFrame to disk along with its metadata in a pickle format.

    This function saves a  DataFrame to disk along with its metadata from
    the catalog.

    Specifically, this function saves the DataFrame in the given
    file path, and saves the metadata in the same directory (as the
    file path) but with a different extension. This extension can be
    optionally given by the user (defaults to '.pklmetadata').

    Args:
        data_frame (DataFrame): The DataFrame that should be saved.

        file_path (string): The file path where the DataFrame must be stored.

        metadata_ext (string): The metadata extension that should be used while
            storing the metadata information. The default value is
            '.pklmetadata'.

    Returns:
        A Boolean value of True is returned if the DataFrame is successfully
        saved.

    Raises:
        AssertionError: If `data_frame` is not of type pandas
         DataFrame.
        AssertionError: If `file_path` is not of type string.
        AssertionError: If `metadata_ext` is not of type string.
        AssertionError: If a file cannot be written to the given `file_path`.

    Examples:

        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.save_table(A, './A.pkl') # will store two files ./A.pkl and ./A.pklmetadata

        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.save_table(A, './A.pkl', metadata_ext='.pklmeta') # will store two files ./A.pkl and ./A.pklmeta


    See Also:
        :meth:`~py_entitymatching.load_table`

    Note:
        This function is a bit different from to_csv_metadata, where the
        DataFrame is stored in a CSV file format. The CSV file format can be
        viewed using a text editor. But a DataFrame stored using 'save_table' is
        stored in a special format, which cannot be viewed with a text editor.
        The reason we have save_table is that, for larger DataFrames, it is
        more efficient to pickle the DataFrame to disk than to write it in
        CSV format.
    """
    # Validate the input parameters

    validate_object_type(data_frame, pd.DataFrame)

    validate_object_type(file_path, six.string_types, error_prefix='Input file path')

    validate_object_type(metadata_ext, six.string_types, error_prefix='Input Metadata ext')

    # Get the file_name (without extension) and the extension from the given
    #  file path. For example, if the file_path was /Users/foo/file.csv then
    # the file_name will be /Users/foo/file and the extension will be '.csv'
    file_name, _ = os.path.splitext(file_path)

    # The metadata file name is the same file name but with the extension
    # given by the user
    metadata_filename = file_name + metadata_ext

    # Check if the file exists in the file_path and whether we have
    # sufficient access privileges to write in that path
    can_write, file_exists = ps._check_file_path(file_path)

    if can_write:
        # If the file already exists then issue a warning and overwrite the
        # file
        if file_exists:
            logger.warning(
                'File already exists at %s; Overwriting it', file_path)
            # we open the file_path in binary mode, as we are writing in
            # binary format
            with open(file_path, 'wb') as file_handler:
                cloudpickle.dump(data_frame, file_handler)
        else:
            # The file does not exist yet; create it and write the DataFrame
            with open(file_path, 'wb') as file_handler:
                cloudpickle.dump(data_frame, file_handler)
    else:
        # Looks like we cannot write the file in the given path. Raise an
        # error in this case.
        logger.error('Cannot write in the file path %s; Exiting', file_path)
        raise AssertionError('Cannot write in the file path %s' % file_path)

    # Once we are done with writing the DataFrame, we will write the metadata
    #  now

    # Initialize a metadata dictionary to hold the metadata of DataFrame from
    #  the catalog
    metadata_dict = collections.OrderedDict()

    # get all the properties for the input data frame
    # # Check if the DataFrame information is present in the catalog
    properties = {}
    if cm.is_dfinfo_present(data_frame) is True:
        properties = cm.get_all_properties(data_frame)

    # If the properties are present in the catalog, then write properties to
    # disk
    if len(properties) > 0:
        for property_name, property_value in six.iteritems(properties):
            if isinstance(property_value, six.string_types) is True:
                metadata_dict[property_name] = property_value

    # try to save metadata
    can_write, file_exists = ps._check_file_path(metadata_filename)
    if can_write:
        # If the file already exists, then issue a warning and overwrite the
        # file
        if file_exists:
            logger.warning(
                'Metadata file already exists at %s. Overwriting it',
                metadata_filename)
            # write metadata contents
            with open(metadata_filename, 'wb') as file_handler:
                cloudpickle.dump(metadata_dict, file_handler)
        else:
            # write metadata contents
            with open(metadata_filename, 'wb') as file_handler:
                cloudpickle.dump(metadata_dict, file_handler)
    else:
        logger.warning(
            'Cannot write metadata at the file path %s. Skip writing metadata '
            'file', metadata_filename)

    return True
Example #43
 def test_del_property_valid_df_name(self):
     A = read_csv_metadata(path_a)
     cm.del_property(A, 'key')
     self.assertEqual(len(cm.get_all_properties(A)), 0)
Example #44
 def test_get_all_properties_invalid_df_2(self):
     # cm.del_catalog()
     A = pd.read_csv(path_a)
     C = cm.get_all_properties(A)
Example #45
 def test_get_all_properties_invalid_df_1(self):
     # cm.del_catalog()
     C = cm.get_all_properties(None)