def test_copy_properties_update_false_2(self):
     A = read_csv_metadata(path_a)
     A1 = pd.read_csv(path_a)
     cm.copy_properties(A, A1, replace=False)
     p = cm.get_all_properties(A)
     p1 = cm.get_all_properties(A1)
     self.assertEqual(p, p1)
     self.assertEqual(cm.get_key(A1), cm.get_key(A))
 def test_copy_properties_valid_1(self):
     A = read_csv_metadata(path_a)
     A1 = pd.read_csv(path_a)
     cm.copy_properties(A, A1)
     self.assertEqual(cm.is_dfinfo_present(A1), True)
     p = cm.get_all_properties(A)
     p1 = cm.get_all_properties(A1)
     self.assertEqual(p, p1)
     self.assertEqual(cm.get_key(A1), cm.get_key(A))
    def test_valid_path_table_1(self):
        cm.del_catalog()
        del_files_in_dir(sndbx_path)
        A = read_csv_metadata(path_a)
        p = os.sep.join([sndbx_path, 'A.pkl'])
        save_table(A, p)

        A1 = load_table(p)
        self.assertEqual(A.equals(A), True)
        self.assertEqual(cm.get_key(A), cm.get_key(A1))
    def test_valid_path_table_1(self):
        cm.del_catalog()
        del_files_in_dir(sndbx_path)
        A = read_csv_metadata(path_a)
        p = os.sep.join([sndbx_path, 'A.pkl'])
        save_table(A, p)

        A1 = load_table(p)
        self.assertEqual(A.equals(A), True)
        self.assertEqual(cm.get_key(A), cm.get_key(A1))
    def test_index_candidate_set_3(self):
        A = read_csv_metadata(path_a, key='ID')
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        l_key = cm.get_key(A)
        r_key = cm.get_key(B)
        C.ix[0, 'rtable_ID'] = 'bbbb'

        lrecord_id_to_index_map = db._get_record_id_to_index_map(A, l_key)
        rrecord_id_to_index_map = db._get_record_id_to_index_map(B, r_key)

        db._index_candidate_set(C, lrecord_id_to_index_map,
                                rrecord_id_to_index_map, False)
    def test_valid_path_df_chk_catalog_1(self):
        cm.del_catalog()
        del_files_in_dir(sndbx_path)
        A = read_csv_metadata(path_a)

        p = os.sep.join([sndbx_path, 'A_saved.csv'])

        creat_dir_ifnot_exists(sndbx_path)
        to_csv_metadata(A, p)

        A1 = read_csv_metadata(p)

        self.assertEqual(cm.get_key(A1), cm.get_key(A), 'The keys in the catalog are not same')
    def test_index_candidate_set_3(self):
        A = read_csv_metadata(path_a, key='ID')
        B = read_csv_metadata(path_b, key='ID')
        l_key = cm.get_key(A)
        r_key = cm.get_key(B)
        C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable_' +
                l_key, fk_rtable='rtable_' + r_key, key = '_id')
        C.loc[0, 'rtable_ID'] = 'bbbb'

        lrecord_id_to_index_map = db._build_id_to_index_map(A, l_key)
        rrecord_id_to_index_map = db._build_id_to_index_map(B, r_key)

        db._index_candidate_set(C,
                lrecord_id_to_index_map, rrecord_id_to_index_map, False)
    def test_valid_path_df_chk_catalog_1(self):
        cm.del_catalog()
        del_files_in_dir(sndbx_path)
        A = read_csv_metadata(path_a)

        p = os.sep.join([sndbx_path, 'A_saved.csv'])

        creat_dir_ifnot_exists(sndbx_path)
        to_csv_metadata(A, p)

        A1 = read_csv_metadata(p)

        self.assertEqual(cm.get_key(A1), cm.get_key(A),
                         'The keys in the catalog are not same')
    def test_index_candidate_set_3(self):
        A = read_csv_metadata(path_a, key='ID')
        B = read_csv_metadata(path_b, key='ID')
        l_key = cm.get_key(A)
        r_key = cm.get_key(B)
        C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable_' +
                l_key, fk_rtable='rtable_' + r_key, key = '_id')
        C.ix[0, 'rtable_ID'] = 'bbbb'

        lrecord_id_to_index_map = db._build_id_to_index_map(A, l_key)
        rrecord_id_to_index_map = db._build_id_to_index_map(B, r_key)

        db._index_candidate_set(C,
                lrecord_id_to_index_map, rrecord_id_to_index_map, False)
def rename_col(df, old_col_name, new_col_name):
    new_df = df.rename(columns={old_col_name: new_col_name})

    if cm.is_dfinfo_present(df):
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)

        if _is_table_or_candset(df):
            if not _is_table(df):
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
                    cm.get_metadata_for_candset(df, logger, False)
                if key == old_col_name:
                    cm.set_key(new_df, new_col_name)
                elif fk_ltable == old_col_name:
                    cm.set_fk_ltable(new_df, new_col_name)
                elif fk_rtable == old_col_name:
                    cm.set_fk_rtable(new_df, new_col_name)
                else:
                    pass
            else:
                key = cm.get_key(df)
                if key == old_col_name:
                    cm.set_key(new_df, new_col_name)

    return new_df
예제 #11
0
def rename_col(df, old_col_name, new_col_name):
    new_df = df.rename(columns={old_col_name: new_col_name})

    if cm.is_dfinfo_present(df):
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)

        if _is_table_or_candset(df):
            if not _is_table(df):
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
                    cm.get_metadata_for_candset(df, logger, False)
                if key == old_col_name:
                    cm.set_key(new_df, new_col_name)
                elif fk_ltable == old_col_name:
                    cm.set_fk_ltable(new_df, new_col_name)
                elif fk_rtable == old_col_name:
                    cm.set_fk_rtable(new_df, new_col_name)
                else:
                    pass
            else:
                key = cm.get_key(df)
                if key == old_col_name:
                    cm.set_key(new_df, new_col_name)

    return new_df
    def test_index_candidate_set_1(self):
        A = read_csv_metadata(path_a, key='ID')
        B = read_csv_metadata(path_b, key='ID')
        l_key = cm.get_key(A)
        r_key = cm.get_key(B)
        C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable_' +
                l_key, fk_rtable='rtable_' + r_key, key = '_id')

        lrecord_id_to_index_map = db._build_id_to_index_map(A, l_key)
        rrecord_id_to_index_map = db._build_id_to_index_map(B, r_key)

        expected_cand_set = {0: set([0, 1, 5]), 1: set([2, 3, 4]), 2: set([0, 1,
            5]), 3: set([2, 3, 4]), 4: set([2, 3, 4])}
        actual_cand_set = db._index_candidate_set(C,
                lrecord_id_to_index_map, rrecord_id_to_index_map, False)
        self.assertEqual(expected_cand_set, actual_cand_set)
 def test_select_matcher_valid_2(self):
     A = read_csv_metadata(path_a, key='id')
     B = read_csv_metadata(path_b, key='id')
     # C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable.id',
     #                       fk_rtable='rtable.id', key='_id')
     # labels = [0] * 7
     # labels.extend([1] * 8)
     # C['labels'] = labels
     # feature_table = get_features_for_matching(A, B)
     # feature_vectors = extract_feature_vecs(C, feature_table=feature_table, attrs_after='gold')
     # feature_vectors.fillna(0, inplace=True)
     feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
     dtmatcher = DTMatcher()
     nbmatcher = NBMatcher()
     rfmatcher = RFMatcher()
     svmmatcher = SVMMatcher()
     linregmatcher = LinRegMatcher()
     logregmatcher = LogRegMatcher()
     matchers = [dtmatcher, nbmatcher, rfmatcher, svmmatcher, linregmatcher, logregmatcher]
     col_list = list(feature_vectors.columns)
     l = list_diff(col_list, [cm.get_key(feature_vectors), cm.get_fk_ltable(feature_vectors),
                              cm.get_fk_rtable(feature_vectors),
                              'gold'])
     X = feature_vectors[l]
     Y = feature_vectors['gold']
     result = select_matcher(matchers, x=X, y=Y)
     header = ['Name', 'Matcher', 'Num folds']
     result_df = result['drill_down_cv_stats']['precision']
     self.assertEqual(set(header) == set(list(result_df.columns[[0, 1, 2]])), True)
     self.assertEqual('Mean score', result_df.columns[len(result_df.columns) - 1])
     d = result_df.set_index('Name')
     p_max = d.ix[result['selected_matcher'].name, 'Mean score']
     a_max = pd.np.max(d['Mean score'])
     self.assertEqual(p_max, a_max)
    def test_index_candidate_set_1(self):
        A = read_csv_metadata(path_a, key='ID')
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        l_key = cm.get_key(A)
        r_key = cm.get_key(B)

        lrecord_id_to_index_map = db._get_record_id_to_index_map(A, l_key)
        rrecord_id_to_index_map = db._get_record_id_to_index_map(B, r_key)

        expected_cand_set = {(0, 1), (1, 2), (3, 2), (0, 0), (3, 3),
                             (4, 4), (1, 4), (2, 0), (1, 3), (0, 5),
                             (2, 1), (4, 3), (4, 2), (2, 5), (3, 4)}
        actual_cand_set = db._index_candidate_set(C,
                lrecord_id_to_index_map, rrecord_id_to_index_map, False)
        self.assertEqual(expected_cand_set, actual_cand_set)
 def test_valid_path_candset_with_diff_metadataextn_2(self):
     cm.del_catalog()
     path_a = os.sep.join([io_datasets_path, 'A.csv'])
     A = read_csv_metadata(path_a, metadata_extn='.mdx')
     pd_A = pd.read_csv(path_a)
     self.assertEqual(A.equals(pd_A), True)
     self.assertEqual(cm.get_key(A), 'ID')
 def test_valid_path_candset_with_diff_metadataextn_2(self):
     cm.del_catalog()
     path_a = os.sep.join([io_datasets_path, 'A.csv'])
     A = read_csv_metadata(path_a, metadata_extn='.mdx')
     pd_A = pd.read_csv(path_a)
     self.assertEqual(A.equals(pd_A), True)
     self.assertEqual(cm.get_key(A), 'ID')
    def test_valid_path_table_2(self):
        cm.del_catalog()
        del_files_in_dir(sndbx_path)
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        p = os.sep.join([sndbx_path, 'C.pkl'])
        save_table(C, p)

        C1 = load_table(p)
        self.assertEqual(C.equals(C1), True)
        self.assertEqual(cm.get_key(C), cm.get_key(C1))
        # self.assertEqual(cm.get_ltable(C).equals(cm.get_ltable(C1)), True)
        # self.assertEqual(cm.get_rtable(C).equals(cm.get_rtable(C1)), True)
        self.assertEqual(cm.get_fk_ltable(C), cm.get_fk_ltable(C1))
        self.assertEqual(cm.get_fk_rtable(C), cm.get_fk_rtable(C1))
    def test_valid_path_table_2(self):
        cm.del_catalog()
        del_files_in_dir(sndbx_path)
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        p = os.sep.join([sndbx_path, 'C.pkl'])
        save_table(C, p)

        C1 = load_table(p)
        self.assertEqual(C.equals(C1), True)
        self.assertEqual(cm.get_key(C), cm.get_key(C1))
        # self.assertEqual(cm.get_ltable(C).equals(cm.get_ltable(C1)), True)
        # self.assertEqual(cm.get_rtable(C).equals(cm.get_rtable(C1)), True)
        self.assertEqual(cm.get_fk_ltable(C), cm.get_fk_ltable(C1))
        self.assertEqual(cm.get_fk_rtable(C), cm.get_fk_rtable(C1))
예제 #19
0
    def test_copy_properties_valid_2(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b)
        C = read_csv_metadata(path_c, ltable=A, rtable=B)

        C1 = pd.read_csv(path_c)
        cm.copy_properties(C, C1)
        self.assertEqual(cm.is_dfinfo_present(C1), True)
        p = cm.get_all_properties(C1)
        p1 = cm.get_all_properties(C1)
        self.assertEqual(p, p1)
        self.assertEqual(cm.get_key(C1), cm.get_key(C))
        self.assertEqual(cm.get_ltable(C1).equals(A), True)
        self.assertEqual(cm.get_rtable(C1).equals(B), True)
        self.assertEqual(cm.get_fk_ltable(C1), cm.get_fk_ltable(C))
        self.assertEqual(cm.get_fk_rtable(C1), cm.get_fk_rtable(C))
예제 #20
0
 def test_select_matcher_valid_2(self):
     A = read_csv_metadata(path_a, key='id')
     B = read_csv_metadata(path_b, key='id')
     # C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable.id',
     #                       fk_rtable='rtable.id', key='_id')
     # labels = [0] * 7
     # labels.extend([1] * 8)
     # C['labels'] = labels
     # feature_table = get_features_for_matching(A, B)
     # feature_vectors = extract_feature_vecs(C, feature_table=feature_table, attrs_after='gold')
     # feature_vectors.fillna(0, inplace=True)
     feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
     dtmatcher = DTMatcher()
     nbmatcher = NBMatcher()
     rfmatcher = RFMatcher()
     svmmatcher = SVMMatcher()
     linregmatcher = LinRegMatcher()
     logregmatcher = LogRegMatcher()
     matchers = [dtmatcher, nbmatcher, rfmatcher, svmmatcher, linregmatcher, logregmatcher]
     col_list = list(feature_vectors.columns)
     l = list_diff(col_list, [cm.get_key(feature_vectors), cm.get_fk_ltable(feature_vectors),
                              cm.get_fk_rtable(feature_vectors),
                              'gold'])
     X = feature_vectors[l]
     Y = feature_vectors['gold']
     result = select_matcher(matchers, x=X, y=Y)
     header = ['Name', 'Matcher', 'Num folds']
     result_df = result['drill_down_cv_stats']['precision']
     self.assertEqual(set(header) == set(list(result_df.columns[[0, 1, 2]])), True)
     self.assertEqual('Mean score', result_df.columns[len(result_df.columns) - 1])
     d = result_df.set_index('Name')
     p_max = d.loc[result['selected_matcher'].name, 'Mean score']
     a_max = pd.np.max(d['Mean score'])
     self.assertEqual(p_max, a_max)
    def test_index_candidate_set_1(self):
        A = read_csv_metadata(path_a, key='ID')
        B = read_csv_metadata(path_b, key='ID')
        l_key = cm.get_key(A)
        r_key = cm.get_key(B)
        C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable_' +
                l_key, fk_rtable='rtable_' + r_key, key = '_id')

        lrecord_id_to_index_map = db._build_id_to_index_map(A, l_key)
        rrecord_id_to_index_map = db._build_id_to_index_map(B, r_key)

        expected_cand_set = {0: set([0, 1, 5]), 1: set([2, 3, 4]), 2: set([0, 1,
            5]), 3: set([2, 3, 4]), 4: set([2, 3, 4])}
        actual_cand_set = db._index_candidate_set(C,
                lrecord_id_to_index_map, rrecord_id_to_index_map, False)
        self.assertEqual(expected_cand_set, actual_cand_set)
    def __init__(self, matcher, matcher_type, exclude_attrs_or_feature_table,
                 dictionary, table,
                 fp_dataframe, fn_dataframe):
        super(MainWindowManager, self).__init__()
        # Set the parameters as attributes to the class.
        self.matcher = matcher
        self.matcher_type = matcher_type
        self.exclude_attrs_or_feature_table = exclude_attrs_or_feature_table
        self.dictionary = dictionary
        self.table = table
        self.fp_dataframe = fp_dataframe
        self.fn_dataframe = fn_dataframe
        # Get the instance for QtWidgets
        em._viewapp = QtWidgets.QApplication.instance()
        if em._viewapp is None:
            em._viewapp = QtWidgets.QApplication([])
        app = em._viewapp

        ltable = cm.get_ltable(self.table)
        rtable = cm.get_rtable(self.table)

        # Get the copy of ltable and rtable
        l_df = ltable.copy()
        r_df = rtable.copy()

        # Set it as dataframes in the class
        self.l_df = l_df.set_index(cm.get_key(ltable), drop=False)
        self.r_df = r_df.set_index(cm.get_key(rtable), drop=False)

        # Set the metric widget, dataframe widget , combo box and the
        # dataframe correctly.
        self.metric_widget = None
        self.dataframe_widget = None
        self.combo_box = None
        self.current_combo_text = 'False Positives'
        self.current_dataframe = self.fp_dataframe
        self.setup_gui()

        width = min((40 + 1) * 105, app.desktop().screenGeometry().width() - 50)
        height = min((50 + 1) * 41, app.desktop().screenGeometry().width() -
                     100)
        # set the size of height and width corrrectly.
        self.resize(width, height)
예제 #23
0
    def __init__(self, matcher, matcher_type, exclude_attrs_or_feature_table,
                 dictionary, table, fp_dataframe, fn_dataframe):
        super(MainWindowManager, self).__init__()
        # Set the parameters as attributes to the class.
        self.matcher = matcher
        self.matcher_type = matcher_type
        self.exclude_attrs_or_feature_table = exclude_attrs_or_feature_table
        self.dictionary = dictionary
        self.table = table
        self.fp_dataframe = fp_dataframe
        self.fn_dataframe = fn_dataframe
        # Get the instance for QtGui
        em._viewapp = QtGui.QApplication.instance()
        if em._viewapp is None:
            em._viewapp = QtGui.QApplication([])
        app = em._viewapp

        ltable = cm.get_ltable(self.table)
        rtable = cm.get_rtable(self.table)

        # Get the copy of ltable and rtable
        l_df = ltable.copy()
        r_df = rtable.copy()

        # Set it as dataframes in the class
        self.l_df = l_df.set_index(cm.get_key(ltable), drop=False)
        self.r_df = r_df.set_index(cm.get_key(rtable), drop=False)

        # Set the metric widget, dataframe widget , combo box and the
        # dataframe correctly.
        self.metric_widget = None
        self.dataframe_widget = None
        self.combo_box = None
        self.current_combo_text = 'False Positives'
        self.current_dataframe = self.fp_dataframe
        self.setup_gui()

        width = min((40 + 1) * 105,
                    app.desktop().screenGeometry().width() - 50)
        height = min((50 + 1) * 41,
                     app.desktop().screenGeometry().width() - 100)
        # set the size of height and width corrrectly.
        self.resize(width, height)
 def test_valid_path_candset_wi_valid_metadata(self):
     cm.del_catalog()
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID') # not initializing with ID will raise key_error
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     pd_C = pd.read_csv(path_c)
     self.assertEqual(C.equals(pd_C), True)
     self.assertEqual(len(cm.get_all_properties(C).keys()), 5)
     self.assertEqual(cm.get_key(C), '_id')
     self.assertEqual(cm.get_fk_ltable(C), 'ltable_ID')
     self.assertEqual(cm.get_fk_rtable(C), 'rtable_ID')
 def test_valid_path_candset_wi_valid_metadata(self):
     cm.del_catalog()
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(
         path_b, key='ID')  # not initializing with ID will raise key_error
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     pd_C = pd.read_csv(path_c)
     self.assertEqual(C.equals(pd_C), True)
     self.assertEqual(len(cm.get_all_properties(C).keys()), 5)
     self.assertEqual(cm.get_key(C), '_id')
     self.assertEqual(cm.get_fk_ltable(C), 'ltable_ID')
     self.assertEqual(cm.get_fk_rtable(C), 'rtable_ID')
    def test_ml_matcher_valid_2(self):
        A = read_csv_metadata(fpath_a, key='id')
        B = read_csv_metadata(fpath_b, key='id')
        feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)
        train_test = mu.split_train_test(feature_vectors)
        train, test = train_test['train'], train_test['test']
        dt = DTMatcher(name='DecisionTree')

        col_list = list(feature_vectors.columns)
        l = list_diff(col_list, [cm.get_key(feature_vectors), cm.get_fk_ltable(feature_vectors),
                                 cm.get_fk_rtable(feature_vectors),
                                 'gold'])
        X = train[l]
        Y = train['gold']

        dt.fit(x=X, y=Y)
        predictions = dt.predict(test[l])
        self.assertEqual(len(predictions), len(test))
예제 #27
0
    def test_ml_matcher_valid_2(self):
        A = read_csv_metadata(fpath_a, key='id')
        B = read_csv_metadata(fpath_b, key='id')
        feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)
        train_test = mu.split_train_test(feature_vectors)
        train, test = train_test['train'], train_test['test']
        dt = DTMatcher(name='DecisionTree')

        col_list = list(feature_vectors.columns)
        l = list_diff(col_list, [
            cm.get_key(feature_vectors),
            cm.get_fk_ltable(feature_vectors),
            cm.get_fk_rtable(feature_vectors), 'gold'
        ])
        X = train[l]
        Y = train['gold']

        dt.fit(x=X, y=Y)
        predictions = dt.predict(test[l])
        self.assertEqual(len(predictions), len(test))
def drop_cols(df, col_list):
    if not isinstance(col_list, list):
        col_list = [col_list]
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            if not _is_table(df):
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key\
                    = cm.get_metadata_for_candset(df, logger, False)
                col_list = gh.list_diff(col_list, [key, fk_ltable, fk_rtable])
                col_list = gh.list_drop_duplicates(col_list)
            else:
                key = cm.get_key(df)
                col_list = gh.list_diff(col_list, [key])
                col_list = gh.list_drop_duplicates(col_list)
        new_df = df.drop(col_list, axis=1)
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
    else:
        new_df = df[col_list]

    return new_df
def preserve_metadata(df, new_df):
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            if not _is_table(df):
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key\
                    = cm.get_metadata_for_candset(df, logger, False)
                if not ch.check_attrs_present(new_df,
                                              [key, fk_ltable, fk_rtable]):
                    logger.warning('Not setting the metadata as some attrs '
                                   'are not present')
                    return new_df
            else:
                key = cm.get_key(df)
                if not ch.check_attrs_present(new_df, [key]):
                    logger.warning('Not setting the metadata as some attrs '
                                   'are not present')
                    return new_df

        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
    return new_df
예제 #30
0
def drop_cols(df, col_list):
    if not isinstance(col_list, list):
        col_list = [col_list]
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            if not _is_table(df):
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key\
                    = cm.get_metadata_for_candset(df, logger, False)
                col_list = gh.list_diff(col_list, [key, fk_ltable, fk_rtable])
                col_list = gh.list_drop_duplicates(col_list)
            else:
                key = cm.get_key(df)
                col_list = gh.list_diff(col_list, [key])
                col_list = gh.list_drop_duplicates(col_list)
        new_df = df.drop(col_list, axis=1)
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
    else:
        new_df = df[col_list]

    return new_df
예제 #31
0
def preserve_metadata(df, new_df):
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            if not _is_table(df):
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key\
                    = cm.get_metadata_for_candset(df, logger, False)
                if not ch.check_attrs_present(new_df, [key, fk_ltable,
                                                     fk_rtable]):
                    logger.warning('Not setting the metadata as some attrs '
                                   'are not present')
                    return new_df
            else:
                key = cm.get_key(df)
                if not ch.check_attrs_present(new_df, [key]):
                    logger.warning('Not setting the metadata as some attrs '
                                   'are not present')
                    return new_df


        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
    return new_df
예제 #32
0
def project_cols(df, col_list):
    if not isinstance(col_list, list):
        col_list = [col_list]
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            if not _is_table(df):
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key\
                    = cm.get_metadata_for_candset(df, logger, False)
                updated_col_list = [key, fk_ltable, fk_rtable]
                updated_col_list.extend(col_list)
                col_list = gh.list_drop_duplicates(updated_col_list)
            else:
                key = cm.get_key(df)
                updated_col_list = [key]
                updated_col_list.extend(col_list)
                col_list = gh.list_drop_duplicates(updated_col_list)
        new_df = df[col_list]
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
    else:
        new_df = df[col_list]

    return new_df
def project_cols(df, col_list):
    if not isinstance(col_list, list):
        col_list = [col_list]
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            if not _is_table(df):
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key\
                    = cm.get_metadata_for_candset(df, logger, False)
                updated_col_list = [key, fk_ltable, fk_rtable]
                updated_col_list.extend(col_list)
                col_list = gh.list_drop_duplicates(updated_col_list)
            else:
                key = cm.get_key(df)
                updated_col_list = [key]
                updated_col_list.extend(col_list)
                col_list = gh.list_drop_duplicates(updated_col_list)
        new_df = df[col_list]
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
    else:
        new_df = df[col_list]

    return new_df
예제 #34
0
 def test_set_key_valid(self):
     A = pd.read_csv(path_a)
     cm.set_key(A, 'ID')
     self.assertEqual(cm.get_key(A), 'ID')
예제 #35
0
 def test_get_key_df_notin_catalog(self):
     A = pd.read_csv(path_a)
     cm.get_key(A)
예제 #36
0
 def test_get_key_invalid_df(self):
     cm.get_key(None)
def combine_blocker_outputs_via_union(
        blocker_output_list,
        l_prefix='ltable_',
        r_prefix='rtable_',
        verbose=False):
    """
    Combines multiple blocker outputs by doing a union of their tuple pair
    ids (foreign key ltable, foreign key rtable).

    Specifically, this function takes in a list of DataFrames (candidate
    sets, typically the
    output from blockers) and returns a consolidated DataFrame. The output
    DataFrame contains the union of tuple pair ids (foreign key ltable,
    foreign key rtable) and other attributes from the input list of DataFrames.

    This function makes some assumptions about the input DataFrames. First,
    each DataFrame is expected to contain the following metadata in the
    catalog: key, fk_ltable, fk_rtable, ltable, and rtable. Second,
    all the DataFrames must be a result of blocking from the same underlying
    tables. Concretely the ltable and rtable properties must refer to the
    same DataFrame across all the input tables. Third, all the input
    DataFrames must have the same fk_ltable and fk_rtable properties.
    Finally, in each input DataFrame, for the attributes included from the
    ltable or rtable, the attribute names must be prefixed with the given
    l_prefix and r_prefix in the function.

    The input DataFrames may contain different attribute lists and it demands
    the question of how to combine them. Currently py_entitymatching takes an union
    of attribute names that has prefix l_prefix or r_prefix across
    input tables. After taking the union, for each tuple id pair included
    in output, the attribute values (for union-ed attribute names) are
    probed from ltable/rtable and included in the output.

    A subtle point to note here is,  if an input DataFrame has a column
    added by user (say label for some reason), then that column will not
    be present in the output. The reason is, the same column may not be
    present in other candidate sets so it is not clear about how to
    combine them. One possibility is to include label in output for all
    tuple id pairs, but set as NaN for the values not present. Currently
    py_entitymatching does not include such columns and addressing it will be part
    of future work.

    Args:
        blocker_output_list (list of DataFrames): The list of DataFrames that
            should be combined.
        l_prefix (string): The prefix given to the attributes from the ltable.
        r_prefix (string): The prefix given to the attributes from the rtable.
        verbose (boolean): A flag to indicate whether more detailed information
            about the execution steps should be printed out (default value is
            False).

    Returns:
        A new DataFrame with the combined tuple pairs and other attributes from
        all the blocker lists.

    Raises:
        AssertionError: If `l_prefix` is not of type string.
        AssertionError: If `r_prefix` is not of type string.
        AssertionError: If the length of the input DataFrame list is 0.
        AssertionError: If `blocker_output_list` is not a list of
            DataFrames.
        AssertionError: If the ltables are different across the input list of
            DataFrames.
        AssertionError: If the rtables are different across the input list of
            DataFrames.
        AssertionError: If the `fk_ltable` values are different across the
            input list of DataFrames.
        AssertionError: If the `fk_rtable` values are different across the
            input list of DataFrames.

    Examples:

        >>> import py_entitymatching as em
        >>> ab = em.AttrEquivalenceBlocker()
        >>> C = ab.block_tables(A, B, 'zipcode', 'zipcode')
        >>> ob = em.OverlapBlocker()
        >>> D = ob.block_candset(C, 'address', 'address')
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> rb = em.RuleBasedBlocker()
        >>> rule = ['address_address_lev(ltuple, rtuple) > 6']
        >>> rb.add_rule(rule, block_f)
        >>> E = rb.block_tables(A, B)
        >>> F = em.combine_blocker_outputs_via_union([C, E])


    """

    # validate input parameters

    # The l_prefix is expected to be of type string
    py_entitymatching.utils.validation_helper.validate_object_type(l_prefix, six.string_types, 'l_prefix')

    # The r_prefix is expected to be of type string
    py_entitymatching.utils.validation_helper.validate_object_type(r_prefix, six.string_types, 'r_prefix')

    # We cannot combine empty DataFrame list
    if not len(blocker_output_list) > 0:
        logger.error('There no DataFrames to combine')
        raise AssertionError('There are no DataFrames to combine')

    # Validate the assumptions about the input tables.
    # # 1) All the input object must be DataFrames
    # # 2) All the input DataFrames must have the metadata as that of a
    # candidate set
    # # 3) All the input DataFrames must have the same fk_ltable and fk_rtable
    _validate_lr_tables(blocker_output_list)

    # # Get the ltable and rtable. We take it from the first DataFrame as all
    #  the DataFrames contain the same ltables and rtables
    ltable = cm.get_ltable(blocker_output_list[0])
    rtable = cm.get_rtable(blocker_output_list[0])

    # # Get the fk_ltable and fk_rtable. We take it from the first DataFrame as
    #  all the DataFrames contain the same ltables and rtables
    fk_ltable = cm.get_fk_ltable(blocker_output_list[0])
    fk_rtable = cm.get_fk_rtable(blocker_output_list[0])

    # Retrieve the keys for the ltable and rtables.
    l_key = cm.get_key(ltable)
    r_key = cm.get_key(rtable)

    # Check if the fk_ltable is starting with the given prefix, if not its
    # not an error. Just raise a warning.
    if fk_ltable.startswith(l_prefix) is False:
        logger.warning(
            'Foreign key for ltable is not starting with the given prefix ('
            '%s)', l_prefix)

    # Check if the fk_rtable is starting with the given prefix, if not its
    # not an error. Just raise a warning.
    if fk_rtable.startswith(r_prefix) is False:
        logger.warning(
            'Foreign key for rtable is not starting with the given prefix ('
            '%s)', r_prefix)

    # Initialize lists
    # # keep track of projected tuple pair ids
    tuple_pair_ids = []
    # # keep track of output attributes from the left table
    l_output_attrs = []
    # # keep track of output attributes from the right table
    r_output_attrs = []

    # for each DataFrame in the given list, project out tuple pair ids, get the
    #  attributes from the ltable and rtable
    for data_frame in blocker_output_list:
        # Project out the tuple pair ids. A tuple pair id is a fk_ltable,
        # fk_rtable pair
        projected_tuple_pair_ids = data_frame[[fk_ltable, fk_rtable]]
        # Update the list that tracks tuple pair ids
        tuple_pair_ids.append(projected_tuple_pair_ids)

        # Get the columns, which should be segregated into the attributes
        # from the ltable and table
        col_set = (
            gh.list_diff(list(data_frame.columns),
                         [fk_ltable, fk_rtable, cm.get_key(data_frame)]))

        # Segregate the columns as attributes from the ltable and rtable
        l_attrs, r_attrs = _lr_cols(col_set, l_prefix, r_prefix)

        # Update the l_output_attrs, r_output_attrs
        l_output_attrs.extend(l_attrs)
        # the reason we use extend because l_attrs a list
        r_output_attrs.extend(r_attrs)

    ch.log_info(logger, 'Concatenating the tuple pair ids across given '
                        'blockers ...', verbose)

    # concatenate the tuple pair ids from the list of input DataFrames
    concatenated_tuple_pair_ids = pd.concat(tuple_pair_ids)

    ch.log_info(logger, 'Concatenating the tuple pair ids ... DONE', verbose)
    ch.log_info(logger, 'Deduplicating the tuple pair ids ...', verbose)

    # Deduplicate the DataFrame. Now the returned DataFrame will contain
    # unique tuple pair ids.

    # noinspection PyUnresolvedReferences
    deduplicated_tuple_pair_ids = concatenated_tuple_pair_ids.drop_duplicates()

    ch.log_info(logger, 'Deduplicating the tuple pair ids ... DONE', verbose)

    # Construct output table
    # # Get unique list of attributes across different tables
    l_output_attrs = gh.list_drop_duplicates(l_output_attrs)
    r_output_attrs = gh.list_drop_duplicates(r_output_attrs)

    # Reset the index that might have lingered from concatenation.
    deduplicated_tuple_pair_ids.reset_index(inplace=True, drop=True)

    # Add the output attribtues from the ltable and rtable.
    # NOTE: This approach may be inefficient as it probes the ltable, rtable
    # to get the attribute values. A better way would be to fill the
    # attribute values from the input list of DataFrames. This attribute values
    # could be harvested (at the expense of some space) while we iterate the
    # input blocker output list for the first time.

    # noinspection PyProtectedMember
    consolidated_data_frame = gh._add_output_attributes(
        deduplicated_tuple_pair_ids, fk_ltable,
        fk_rtable,
        ltable, rtable, l_key, r_key,
        l_output_attrs, r_output_attrs,
        l_prefix,
        r_prefix,
        validate=False)
    # Sort the DataFrame ordered by fk_ltable and fk_rtable.
    # The function "sort" will be depreciated in the newer versions of
    # pandas DataFrame, and it will replaced by 'sort_values' function. So we
    # will first try to use sort_values if this fails we will use sort.
    try:
        consolidated_data_frame.sort_values([fk_ltable, fk_rtable],
                                            inplace=True)
    except AttributeError:
        consolidated_data_frame.sort([fk_ltable, fk_rtable], inplace=True)

    # update the catalog for the consolidated DataFrame
    # First get a column name for the key
    key = ch.get_name_for_key(consolidated_data_frame.columns)
    # Second, add the column name as the key
    consolidated_data_frame = ch.add_key_column(consolidated_data_frame, key)
    # Third, reset the index to remove any out of order index  values from
    # the sort.
    consolidated_data_frame.reset_index(inplace=True, drop=True)
    # Finally, set the properties for the consolidated DataFrame in the catalog
    cm.set_candset_properties(consolidated_data_frame, key, fk_ltable,
                              fk_rtable, ltable,
                              rtable)

    # Return the consolidated DataFrame
    return consolidated_data_frame
 def test_validpath_metadata_set_to_none_1(self):
     cm.del_catalog()
     del_files_in_dir(sndbx_path)
     A = read_csv_metadata(path_a, key=None)
     self.assertEqual(cm.is_dfinfo_present(A), True)
     cm.get_key(A)
예제 #39
0
 def test_set_properties_df_notin_catalog_replace_false(self):
     A = read_csv_metadata(path_a)
     cm.set_properties(A, {}, replace=False)
     self.assertEqual(cm.get_key(A), 'ID')
 def test_validpath_metadata_set_to_none_1(self):
     cm.del_catalog()
     del_files_in_dir(sndbx_path)
     A = read_csv_metadata(path_a, key=None)
     self.assertEqual(cm.is_dfinfo_present(A), True)
     cm.get_key(A)
 def test_valid_path_wi_valid_metadata(self):
     cm.del_catalog()
     A = read_csv_metadata(path_a)
     pd_A = pd.read_csv(path_a)
     self.assertEqual(A.equals(pd_A), True)
     self.assertEqual(cm.get_key(A), 'ID')
 def test_valid_path_wi_valid_metadata(self):
     cm.del_catalog()
     A = read_csv_metadata(path_a)
     pd_A = pd.read_csv(path_a)
     self.assertEqual(A.equals(pd_A), True)
     self.assertEqual(cm.get_key(A), 'ID')
예제 #43
0
def combine_blocker_outputs_via_union(blocker_output_list,
                                      l_prefix='ltable_',
                                      r_prefix='rtable_',
                                      verbose=False):
    """
    Combines multiple blocker outputs by doing a union of their tuple pair
    ids (foreign key ltable, foreign key rtable).

    Specifically, this function takes in a list of DataFrames (candidate
    sets, typically the
    output from blockers) and returns a consolidated DataFrame. The output
    DataFrame contains the union of tuple pair ids (foreign key ltable,
    foreign key rtable) and other attributes from the input list of DataFrames.

    This function makes some assumptions about the input DataFrames. First,
    each DataFrame is expected to contain the following metadata in the
    catalog: key, fk_ltable, fk_rtable, ltable, and rtable. Second,
    all the DataFrames must be a result of blocking from the same underlying
    tables. Concretely the ltable and rtable properties must refer to the
    same DataFrame across all the input tables. Third, all the input
    DataFrames must have the same fk_ltable and fk_rtable properties.
    Finally, in each input DataFrame, for the attributes included from the
    ltable or rtable, the attribute names must be prefixed with the given
    l_prefix and r_prefix in the function.

    The input DataFrames may contain different attribute lists and it demands
    the question of how to combine them. Currently py_entitymatching takes an union
    of attribute names that has prefix l_prefix or r_prefix across
    input tables. After taking the union, for each tuple id pair included
    in output, the attribute values (for union-ed attribute names) are
    probed from ltable/rtable and included in the output.

    A subtle point to note here is,  if an input DataFrame has a column
    added by user (say label for some reason), then that column will not
    be present in the output. The reason is, the same column may not be
    present in other candidate sets so it is not clear about how to
    combine them. One possibility is to include label in output for all
    tuple id pairs, but set as NaN for the values not present. Currently
    py_entitymatching does not include such columns and addressing it will be part
    of future work.

    Args:
        blocker_output_list (list of DataFrames): The list of DataFrames that
            should be combined.
        l_prefix (string): The prefix given to the attributes from the ltable.
        r_prefix (string): The prefix given to the attributes from the rtable.
        verbose (boolean): A flag to indicate whether more detailed information
            about the execution steps should be printed out (default value is
            False).

    Returns:
        A new DataFrame with the combined tuple pairs and other attributes from
        all the blocker lists.

    Raises:
        AssertionError: If `l_prefix` is not of type string.
        AssertionError: If `r_prefix` is not of type string.
        AssertionError: If the length of the input DataFrame list is 0.
        AssertionError: If `blocker_output_list` is not a list of
            DataFrames.
        AssertionError: If the ltables are different across the input list of
            DataFrames.
        AssertionError: If the rtables are different across the input list of
            DataFrames.
        AssertionError: If the `fk_ltable` values are different across the
            input list of DataFrames.
        AssertionError: If the `fk_rtable` values are different across the
            input list of DataFrames.
    """

    # validate input parameters

    # The l_prefix is expected to be of type string
    if not isinstance(l_prefix, six.string_types):
        logger.error('l_prefix is not of type string')
        raise AssertionError('l_prefix is not of type string')

    # The r_prefix is expected to be of type string
    if not isinstance(r_prefix, six.string_types):
        logger.error('r_prefix is not of type string')
        raise AssertionError('r_prefix is not of type string')

    # We cannot combine empty DataFrame list
    if not len(blocker_output_list) > 0:
        logger.error('There no DataFrames to combine')
        raise AssertionError('There are no DataFrames to combine')

    # Validate the assumptions about the input tables.
    # # 1) All the input object must be DataFrames
    # # 2) All the input DataFrames must have the metadata as that of a
    # candidate set
    # # 3) All the input DataFrames must have the same fk_ltable and fk_rtable
    _validate_lr_tables(blocker_output_list)

    # # Get the ltable and rtable. We take it from the first DataFrame as all
    #  the DataFrames contain the same ltables and rtables
    ltable = cm.get_ltable(blocker_output_list[0])
    rtable = cm.get_rtable(blocker_output_list[0])

    # # Get the fk_ltable and fk_rtable. We take it from the first DataFrame as
    #  all the DataFrames contain the same ltables and rtables
    fk_ltable = cm.get_fk_ltable(blocker_output_list[0])
    fk_rtable = cm.get_fk_rtable(blocker_output_list[0])

    # Retrieve the keys for the ltable and rtables.
    l_key = cm.get_key(ltable)
    r_key = cm.get_key(rtable)

    # Check if the fk_ltable is starting with the given prefix, if not its
    # not an error. Just raise a warning.
    if fk_ltable.startswith(l_prefix) is False:
        logger.warning(
            'Foreign key for ltable is not starting with the given prefix ('
            '%s)', l_prefix)

    # Check if the fk_rtable is starting with the given prefix, if not its
    # not an error. Just raise a warning.
    if fk_rtable.startswith(r_prefix) is False:
        logger.warning(
            'Foreign key for rtable is not starting with the given prefix ('
            '%s)', r_prefix)

    # Initialize lists
    # # keep track of projected tuple pair ids
    tuple_pair_ids = []
    # # keep track of output attributes from the left table
    l_output_attrs = []
    # # keep track of output attributes from the right table
    r_output_attrs = []

    # for each DataFrame in the given list, project out tuple pair ids, get the
    #  attributes from the ltable and rtable
    for data_frame in blocker_output_list:
        # Project out the tuple pair ids. A tuple pair id is a fk_ltable,
        # fk_rtable pair
        projected_tuple_pair_ids = data_frame[[fk_ltable, fk_rtable]]
        # Update the list that tracks tuple pair ids
        tuple_pair_ids.append(projected_tuple_pair_ids)

        # Get the columns, which should be segregated into the attributes
        # from the ltable and table
        col_set = (gh.list_diff(list(data_frame.columns),
                                [fk_ltable, fk_rtable,
                                 cm.get_key(data_frame)]))

        # Segregate the columns as attributes from the ltable and rtable
        l_attrs, r_attrs = _lr_cols(col_set, l_prefix, r_prefix)

        # Update the l_output_attrs, r_output_attrs
        l_output_attrs.extend(l_attrs)
        # the reason we use extend because l_attrs a list
        r_output_attrs.extend(r_attrs)

    ch.log_info(
        logger, 'Concatenating the tuple pair ids across given '
        'blockers ...', verbose)

    # concatenate the tuple pair ids from the list of input DataFrames
    concatenated_tuple_pair_ids = pd.concat(tuple_pair_ids)

    ch.log_info(logger, 'Concatenating the tuple pair ids ... DONE', verbose)
    ch.log_info(logger, 'Deduplicating the tuple pair ids ...', verbose)

    # Deduplicate the DataFrame. Now the returned DataFrame will contain
    # unique tuple pair ids.

    # noinspection PyUnresolvedReferences
    deduplicated_tuple_pair_ids = concatenated_tuple_pair_ids.drop_duplicates()

    ch.log_info(logger, 'Deduplicating the tuple pair ids ... DONE', verbose)

    # Construct output table
    # # Get unique list of attributes across different tables
    l_output_attrs = gh.list_drop_duplicates(l_output_attrs)
    r_output_attrs = gh.list_drop_duplicates(r_output_attrs)

    # Reset the index that might have lingered from concatenation.
    deduplicated_tuple_pair_ids.reset_index(inplace=True, drop=True)

    # Add the output attribtues from the ltable and rtable.
    # NOTE: This approach may be inefficient as it probes the ltable, rtable
    # to get the attribute values. A better way would be to fill the
    # attribute values from the input list of DataFrames. This attribute values
    # could be harvested (at the expense of some space) while we iterate the
    # input blocker output list for the first time.

    # noinspection PyProtectedMember
    consolidated_data_frame = gh._add_output_attributes(
        deduplicated_tuple_pair_ids,
        fk_ltable,
        fk_rtable,
        ltable,
        rtable,
        l_key,
        r_key,
        l_output_attrs,
        r_output_attrs,
        l_prefix,
        r_prefix,
        validate=False)
    # Sort the DataFrame ordered by fk_ltable and fk_rtable.
    # The function "sort" will be depreciated in the newer versions of
    # pandas DataFrame, and it will replaced by 'sort_values' function. So we
    # will first try to use sort_values if this fails we will use sort.
    try:
        consolidated_data_frame.sort_values([fk_ltable, fk_rtable],
                                            inplace=True)
    except AttributeError:
        consolidated_data_frame.sort([fk_ltable, fk_rtable], inplace=True)

    # update the catalog for the consolidated DataFrame
    # First get a column name for the key
    key = ch.get_name_for_key(consolidated_data_frame.columns)
    # Second, add the column name as the key
    consolidated_data_frame = ch.add_key_column(consolidated_data_frame, key)
    # Third, reset the index to remove any out of order index  values from
    # the sort.
    consolidated_data_frame.reset_index(inplace=True, drop=True)
    # Finally, set the properties for the consolidated DataFrame in the catalog
    cm.set_candset_properties(consolidated_data_frame, key, fk_ltable,
                              fk_rtable, ltable, rtable)

    # Return the consolidated DataFrame
    return consolidated_data_frame