예제 #1
0
    def test_vis_debug_matcher_dt_valid_1(self):
        """Drive ``_vis_debug_dt`` with a decision-tree matcher and no window.

        Feature vectors are extracted from the labelled candidate set, split
        into train/test parts and handed to the debugger. Nothing is asserted
        after the call.
        """
        ltable = read_csv_metadata(path_a)
        rtable = read_csv_metadata(path_b, key='ID')
        candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
        # Seven zeros followed by eight ones.
        candset['labels'] = [0] * 7 + [1] * 8

        features = get_features_for_matching(ltable, rtable)
        vectors = extract_feature_vecs(
            candset, feature_table=features, attrs_after='labels')

        matcher = DTMatcher()
        split = mu.train_test_split(vectors)

        _vis_debug_dt(matcher, split['train'], split['test'],
                      exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                      target_attr='labels',
                      show_window=False)
 def test_select_matcher_valid_3(self):
     """Check ``select_matcher`` when called with explicit x/y and metric='recall'.

     The result's 'cv_stats' frame must start with the Name/Matcher/Num folds
     columns and end with 'Mean score', and the selected matcher must hold
     the maximum mean score.
     """
     A = read_csv_metadata(path_a, key='id')
     B = read_csv_metadata(path_b, key='id')
     feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
     matchers = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                 LinRegMatcher(), LogRegMatcher()]
     # The key, both foreign keys and the target column are not features.
     non_features = [cm.get_key(feature_vectors),
                     cm.get_fk_ltable(feature_vectors),
                     cm.get_fk_rtable(feature_vectors),
                     'gold']
     feature_cols = list_diff(list(feature_vectors.columns), non_features)
     X = feature_vectors[feature_cols]
     Y = feature_vectors['gold']
     result = select_matcher(matchers, x=X, y=Y, metric='recall')
     result_df = result['cv_stats']
     # Comparing the sets directly yields a useful diff on failure.
     self.assertEqual({'Name', 'Matcher', 'Num folds'},
                      set(result_df.columns[[0, 1, 2]]))
     self.assertEqual('Mean score', result_df.columns[-1])
     d = result_df.set_index('Name')
     # DataFrame.ix and the pd.np alias are gone from current pandas;
     # .loc (label lookup by matcher name) and Series.max replace them.
     p_max = d.loc[result['selected_matcher'].name, 'Mean score']
     a_max = d['Mean score'].max()
     self.assertEqual(p_max, a_max)
예제 #3
0
    def test_add_feature_invalid_df_columns(self):
        """Call ``add_feature`` with an empty DataFrame as the feature table.

        NOTE(review): nothing is asserted; presumably the call is expected to
        raise and the expected-exception marker lives outside this view --
        confirm.
        """
        ltable = read_csv_metadata(path_a)
        rtable = read_csv_metadata(path_b, key='ID')

        expression = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
        tokenizers = get_tokenizers_for_matching()
        sim_funs = get_sim_funs_for_matching()
        feature_fn = get_feature_fn(expression, tokenizers, sim_funs)
        add_feature(pd.DataFrame(), 'test', feature_fn)
예제 #4
0
 def test_valid_path_df_metadata_split_betn_file_kw(self):
     """Read 'C_partialmeta.csv' with ltable, rtable and fk_ltable as keywords.

     The catalog and the sandbox directory are cleared first. No assertion
     follows the final read.
     """
     cm.del_catalog()
     del_files_in_dir(sndbx_path)
     ltable = read_csv_metadata(path_a)
     rtable = read_csv_metadata(path_b, key='ID')
     partial_meta_path = os.sep.join((io_datasets_path, 'C_partialmeta.csv'))
     candset = read_csv_metadata(partial_meta_path,
                                 ltable=ltable,
                                 rtable=rtable,
                                 fk_ltable='ltable_ID')
예제 #5
0
 def test_label_table_valid_3(self):
     """``label_table`` output must carry the same catalog properties as its input."""
     ltable = read_csv_metadata(path_a)
     rtable = read_csv_metadata(path_b, key='ID')
     candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
     labelled = label_table(candset, 'label')
     props_in = cm.get_all_properties(candset)
     props_out = cm.get_all_properties(labelled)
     self.assertEqual(props_in, props_out)
예제 #6
0
 def test_feature_fn_valid_nosim_tok(self):
     """Build a feature function with empty tokenizer and sim-function dicts.

     No assertion is made on the returned value.
     """
     ltable = read_csv_metadata(path_a)
     rtable = read_csv_metadata(path_b, key='ID')
     features = get_features_for_matching(ltable, rtable)
     initial_len = len(features)
     expression = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
     # Both lookup tables are deliberately empty.
     feature_fn = get_feature_fn(expression, {}, {})
예제 #7
0
    def test_eval_matches_valid_2(self):
        """Check ``eval_matches`` when one pair is labelled 0 and all are predicted 1.

        The label columns are passed to ``eval_matches`` as
        ('predicted', 'gold'); the expected counts below are hard-coded
        (14 of 15 pairs).
        """
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
        n_pairs = len(C1)
        # 'gold' is 0 for the first pair and 1 elsewhere; 'predicted' is all 1.
        gold = [0] + [1] * (n_pairs - 1)
        predicted = [1] * n_pairs

        C1.insert(len(C1.columns), 'gold', gold)
        C1.insert(len(C1.columns), 'predicted', predicted)
        cm.copy_properties(C, C1)

        result = eval_matches(C1, 'predicted', 'gold')
        self.assertIsInstance(result, dict)
        self.assertEqual(result['prec_numerator'], 14)
        self.assertEqual(result['prec_denominator'], 14)
        self.assertEqual(result['recall_numerator'], 14)
        self.assertEqual(result['recall_denominator'], 15)
        # Ratios are floats: compare approximately rather than bit-for-bit.
        self.assertAlmostEqual(result['precision'], 1)
        self.assertAlmostEqual(result['recall'], 0.9333333333333333)
        self.assertAlmostEqual(result['f1'], 0.9655172413793104)
        self.assertEqual(result['pred_pos_num'], 14)
        self.assertEqual(result['false_pos_num'], 0.0)
        self.assertEqual(len(result['false_pos_ls']), 0)
        self.assertEqual(result['pred_neg_num'], 1)
        self.assertEqual(result['false_neg_num'], 1.0)
        self.assertEqual(len(result['false_neg_ls']), 1)
        t = result['false_neg_ls'][0]
        self.assertEqual(t[0], 'a1')
        self.assertEqual(t[1], 'b1')
예제 #8
0
    def test_eval_matches_valid_3(self):
        """``eval_matches`` on a row-less candidate set must report zero everywhere."""
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
        n_pairs = len(C1)
        C1.insert(len(C1.columns), 'gold', [0] * n_pairs)
        C1.insert(len(C1.columns), 'predicted', [1] * n_pairs)
        # Only C1's column layout is reused; D itself has no rows.
        D = pd.DataFrame(columns=C1.columns)
        cm.copy_properties(C, D)
        result = eval_matches(D, 'gold', 'predicted')

        self.assertIsInstance(result, dict)
        self.assertAlmostEqual(result['precision'], 0)
        for stat in ('prec_numerator', 'prec_denominator',
                     'recall_numerator', 'recall_denominator',
                     'recall', 'f1',
                     'pred_pos_num', 'false_pos_num',
                     'pred_neg_num', 'false_neg_num'):
            # The stat name is passed as the message so a failure names the key.
            self.assertEqual(result[stat], 0, stat)
        self.assertEqual(len(result['false_pos_ls']), 0)
        self.assertEqual(len(result['false_neg_ls']), 0)
예제 #9
0
    def test_eval_matches_valid_2(self):
        """Check ``eval_matches`` with a single 0 in 'gold' and all-ones 'predicted'.

        ``eval_matches`` receives the columns in the order
        ('predicted', 'gold'); the expected figures are hard-coded below.
        """
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
        total = len(C1)
        # One leading zero, ones for the remainder.
        gold = [0] + [1] * (total - 1)
        predicted = [1] * total

        C1.insert(len(C1.columns), 'gold', gold)
        C1.insert(len(C1.columns), 'predicted', predicted)
        cm.copy_properties(C, C1)

        result = eval_matches(C1, 'predicted', 'gold')
        self.assertIsInstance(result, dict)
        self.assertEqual(result['prec_numerator'], 14)
        self.assertEqual(result['prec_denominator'], 14)
        self.assertEqual(result['recall_numerator'], 14)
        self.assertEqual(result['recall_denominator'], 15)
        # Floating-point ratios are compared with a tolerance, not exactly.
        self.assertAlmostEqual(result['precision'], 1)
        self.assertAlmostEqual(result['recall'], 0.9333333333333333)
        self.assertAlmostEqual(result['f1'], 0.9655172413793104)
        self.assertEqual(result['pred_pos_num'], 14)
        self.assertEqual(result['false_pos_num'], 0.0)
        self.assertEqual(len(result['false_pos_ls']), 0)
        self.assertEqual(result['pred_neg_num'], 1)
        self.assertEqual(result['false_neg_num'], 1.0)
        self.assertEqual(len(result['false_neg_ls']), 1)
        t = result['false_neg_ls'][0]
        self.assertEqual(t[0], 'a1')
        self.assertEqual(t[1], 'b1')
예제 #10
0
    def test_eval_matches_valid_3(self):
        """Every statistic from ``eval_matches`` must be zero for an empty table."""
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
        total = len(C1)
        C1.insert(len(C1.columns), 'gold', [0] * total)
        C1.insert(len(C1.columns), 'predicted', [1] * total)
        # D borrows C1's columns but contains no rows at all.
        D = pd.DataFrame(columns=C1.columns)
        cm.copy_properties(C, D)
        result = eval_matches(D, 'gold', 'predicted')

        self.assertIsInstance(result, dict)
        self.assertAlmostEqual(result['precision'], 0)
        zero_stats = ('prec_numerator', 'prec_denominator',
                      'recall_numerator', 'recall_denominator',
                      'recall', 'f1',
                      'pred_pos_num', 'false_pos_num',
                      'pred_neg_num', 'false_neg_num')
        for stat in zero_stats:
            # Passing the key as the message identifies the failing statistic.
            self.assertEqual(result[stat], 0, stat)
        self.assertEqual(len(result['false_pos_ls']), 0)
        self.assertEqual(len(result['false_neg_ls']), 0)
예제 #11
0
 def test_sample_table_valid_1(self):
     """A 10-row sample (third argument False) must keep the catalog properties."""
     ltable = read_csv_metadata(path_a)
     rtable = read_csv_metadata(path_b, key='ID')
     candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
     sample = sample_table(candset, 10, False)
     props_full = cm.get_all_properties(candset)
     props_sample = cm.get_all_properties(sample)
     self.assertEqual(props_full, props_sample)
     self.assertEqual(len(sample), 10)
예제 #12
0
    def test_blocker_combiner_valid_8(self):
        """Union 'C4_ex_1.csv' and 'C4_ex_2.csv' and compare with 'C_ex_4.csv'.

        Each fixture has its l_ID/r_ID columns renamed to ltable_ID/rtable_ID
        and registered as foreign keys before use. The frame contents are
        compared only off Windows; the catalog properties are always compared.
        """
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')

        def load_candset(file_name):
            # Read one fixture, rename its id columns and register them as
            # the foreign keys into A and B.
            candset = read_csv_metadata(
                os.sep.join([bc_datasets_path, file_name]), ltable=A, rtable=B)
            candset.rename(columns={'l_ID': 'ltable_ID', 'r_ID': 'rtable_ID'},
                           inplace=True)
            cm.set_fk_ltable(candset, 'ltable_ID')
            cm.set_fk_rtable(candset, 'rtable_ID')
            return candset

        C1 = load_candset('C4_ex_1.csv')
        C2 = load_candset('C4_ex_2.csv')

        C = combine_blocker_outputs_via_union([C1, C2], 'l_', 'r_')
        C_exp = load_candset('C_ex_4.csv')

        # The content comparison is skipped when os.name is 'nt'.
        if os.name != 'nt':
            self.assertTrue(C.equals(C_exp))
        self.assertEqual(cm.get_all_properties(C),
                         cm.get_all_properties(C_exp))
예제 #13
0
    def test_ml_matcher_inplace_false_predict(self):
        """``predict(inplace=False, append=True)`` must return a separate table.

        The returned table must be a different object from the input, have
        the same number of rows, contain every input column and end with the
        'predicted' column.
        """
        A = read_csv_metadata(fpath_a, key='id')
        B = read_csv_metadata(fpath_b, key='id')
        feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)
        train_test = mu.train_test_split(feature_vectors)
        train, test = train_test['train'], train_test['test']
        dt = DTMatcher(name='DecisionTree')
        # Strip the foreign keys from both parts, and the target from test.
        train.drop(['ltable.id', 'rtable.id'], axis=1, inplace=True)
        test.drop(['ltable.id', 'rtable.id', 'gold'], axis=1, inplace=True)
        dt.fit(table=train, exclude_attrs='_id', target_attr='gold')
        predictions = dt.predict(table=test,
                                 exclude_attrs='_id',
                                 target_attr='predicted',
                                 inplace=False,
                                 append=True)

        self.assertIsNot(predictions, test)
        self.assertEqual(len(predictions), len(test))
        self.assertTrue(set(test.columns).issubset(predictions.columns))
        self.assertEqual(predictions.columns[-1], 'predicted')
예제 #14
0
 def test_label_table_valid_3(self):
     """Catalog properties must be identical before and after ``label_table``."""
     table_a = read_csv_metadata(path_a)
     table_b = read_csv_metadata(path_b, key='ID')
     pairs = read_csv_metadata(path_c, ltable=table_a, rtable=table_b)
     labelled_pairs = label_table(pairs, 'label')
     self.assertEqual(cm.get_all_properties(pairs),
                      cm.get_all_properties(labelled_pairs))
예제 #15
0
 def test_feature_fn_valid_nosim_tok(self):
     """Request a feature function while supplying no tokenizers or sim functions.

     The result of ``get_feature_fn`` is not checked.
     """
     table_a = read_csv_metadata(path_a)
     table_b = read_csv_metadata(path_b, key='ID')
     generated = get_features_for_matching(table_a, table_b)
     generated_count = len(generated)
     spec = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
     built = get_feature_fn(spec, {}, {})
    def test_select_matcher_target_attr_not_present(self):
        """Call ``select_matcher`` in table mode with target_attr='labels1'.

        NOTE(review): the result is never checked; presumably 'labels1' is
        absent from the table and the call is expected to raise -- confirm.
        """
        ltable = read_csv_metadata(path_a, key='id')
        rtable = read_csv_metadata(path_b, key='id')
        feature_vectors = read_csv_metadata(path_f, ltable=ltable, rtable=rtable)
        matchers = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                    LinRegMatcher(), LogRegMatcher()]

        # Remove both foreign-key columns before the table is handed over.
        all_columns = list(feature_vectors.columns)
        fk_columns = [cm.get_fk_ltable(feature_vectors),
                      cm.get_fk_rtable(feature_vectors)]
        feature_vectors = feature_vectors[list_diff(all_columns, fk_columns)]

        result = select_matcher(matchers,
                                x=None,
                                y=None,
                                table=feature_vectors,
                                exclude_attrs='_id',
                                target_attr='labels1',
                                k=2)
    def test_select_matcher_valid_1(self):
        """Check ``select_matcher`` in table mode with k=7 folds.

        The 'cv_stats' frame must start with the Name/Matcher/Num folds
        columns and end with 'Mean score', and the selected matcher must hold
        the maximum mean score.
        """
        A = read_csv_metadata(path_a, key='id')
        B = read_csv_metadata(path_b, key='id')
        feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
        matchers = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                    LinRegMatcher(), LogRegMatcher()]

        result = select_matcher(matchers, x=None, y=None, table=feature_vectors,
                                exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                                target_attr='gold', k=7)
        result_df = result['cv_stats']
        # A direct set comparison reports the differing names on failure.
        self.assertEqual({'Name', 'Matcher', 'Num folds'},
                         set(result_df.columns[[0, 1, 2]]))
        self.assertEqual('Mean score', result_df.columns[-1])
        d = result_df.set_index('Name')
        # DataFrame.ix and pd.np no longer exist in current pandas; use the
        # label-based .loc lookup and Series.max instead.
        p_max = d.loc[result['selected_matcher'].name, 'Mean score']
        a_max = d['Mean score'].max()
        self.assertEqual(p_max, a_max)
예제 #18
0
 def test_check_table_order_invalid_df2(self):
     """Call ``afg.check_table_order`` with ``None`` in place of the right table.

     NOTE(review): the returned status is never checked; presumably the call
     is expected to raise -- confirm.
     """
     ltable = read_csv_metadata(path_a)
     rtable = read_csv_metadata(path_b, key='ID')
     ltypes = afg.get_attr_types(ltable)
     rtypes = afg.get_attr_types(rtable)
     corres = afg.get_attr_corres(ltable, rtable)
     status = afg.check_table_order(ltable, None, ltypes, rtypes, corres)
예제 #19
0
 def test_ml_matcher_invalid_input_combn_fit(self):
     """Call ``DTMatcher.fit`` with both ``x`` and ``table`` supplied.

     NOTE(review): no assertion follows; presumably this argument combination
     is expected to be rejected -- confirm.
     """
     ltable = read_csv_metadata(fpath_a, key='id')
     rtable = read_csv_metadata(fpath_b, key='id')
     vectors = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
     split = mu.train_test_split(vectors)
     train = split['train']
     test = split['test']
     matcher = DTMatcher(name='DecisionTree')
     matcher.fit(x=train, table=train)
예제 #20
0
    def test_valid_path_df_metadata_invalid_rtable(self):
        """Read the candidate set while passing the string "temp" as ``rtable``.

        The catalog and sandbox directory are cleared first.
        NOTE(review): nothing is asserted; presumably the read is expected to
        raise -- confirm.
        """
        cm.del_catalog()
        del_files_in_dir(sndbx_path)
        ltable = read_csv_metadata(path_a)
        rtable = read_csv_metadata(path_b, key='ID')

        candset = read_csv_metadata(path_c, rtable="temp", ltable=ltable)
예제 #21
0
    def test_valid_path_df_metadata_set_to_none_2(self):
        """Read 'C_partialmeta.csv' with ``fk_ltable`` explicitly set to None.

        The catalog and sandbox directory are cleared first. No assertion
        follows the final read.
        """
        cm.del_catalog()
        del_files_in_dir(sndbx_path)
        ltable = read_csv_metadata(path_a)
        rtable = read_csv_metadata(path_b, key='ID')
        partial_meta_path = os.sep.join((io_datasets_path, 'C_partialmeta.csv'))

        candset = read_csv_metadata(partial_meta_path,
                                    ltable=ltable,
                                    rtable=rtable,
                                    fk_ltable=None)
예제 #22
0
 def test_check_table_order_valid(self):
     """``afg.check_table_order`` must return True for A, B and data derived from them."""
     ltable = read_csv_metadata(path_a)
     rtable = read_csv_metadata(path_b, key='ID')
     ltypes = afg.get_attr_types(ltable)
     rtypes = afg.get_attr_types(rtable)
     corres = afg.get_attr_corres(ltable, rtable)
     status = afg.check_table_order(ltable, rtable, ltypes, rtypes, corres)
     self.assertEqual(status, True)
예제 #23
0
    def test_valid_path_df_metadata_invalid_rtable(self):
        """Pass a plain string ("temp") as ``rtable`` when reading the candidate set.

        NOTE(review): the test ends at the read with no assertion; presumably
        an exception is expected -- confirm.
        """
        cm.del_catalog()
        del_files_in_dir(sndbx_path)
        table_a = read_csv_metadata(path_a)
        table_b = read_csv_metadata(path_b, key='ID')

        pairs = read_csv_metadata(path_c, rtable="temp", ltable=table_a)
예제 #24
0
    def test_add_bb_feature_invalid_df_columns(self):
        """Call ``add_blackbox_feature`` with an empty DataFrame as the feature table.

        NOTE(review): nothing is asserted; presumably the call is expected to
        raise -- confirm.
        """
        ltable = read_csv_metadata(path_a)
        rtable = read_csv_metadata(path_b, key='ID')

        def bb_fn(ltuple, rtuple):
            # Constant black-box feature: ignores both tuples.
            return 1.0

        empty_table = pd.DataFrame()
        add_blackbox_feature(empty_table, 'test', bb_fn)
예제 #25
0
    def test_add_feature_invalid_df_columns(self):
        """Hand ``add_feature`` an empty DataFrame instead of a feature table.

        NOTE(review): the test has no assertion; presumably an exception is
        expected from the final call -- confirm.
        """
        table_a = read_csv_metadata(path_a)
        table_b = read_csv_metadata(path_b, key='ID')

        spec = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
        tok = get_tokenizers_for_matching()
        sim = get_sim_funs_for_matching()
        built = get_feature_fn(spec, tok, sim)
        add_feature(pd.DataFrame(), 'test', built)
예제 #26
0
 def test_check_table_order_invalid_df2(self):
     """Pass ``None`` as the second table to ``afg.check_table_order``.

     NOTE(review): the status is assigned but not asserted; presumably an
     exception is expected -- confirm.
     """
     table_a = read_csv_metadata(path_a)
     table_b = read_csv_metadata(path_b, key='ID')
     types_a = afg.get_attr_types(table_a)
     types_b = afg.get_attr_types(table_b)
     mapping = afg.get_attr_corres(table_a, table_b)
     status = afg.check_table_order(
         table_a, None, types_a, types_b, mapping)
예제 #27
0
 def test_ml_matcher_invalid_input_combn_fit(self):
     """Supply both ``x`` and ``table`` to ``DTMatcher.fit`` in one call.

     NOTE(review): nothing is asserted afterwards; presumably the call is
     expected to fail -- confirm.
     """
     table_a = read_csv_metadata(fpath_a, key='id')
     table_b = read_csv_metadata(fpath_b, key='id')
     fv = read_csv_metadata(fpath_f, ltable=table_a, rtable=table_b)
     parts = mu.train_test_split(fv)
     train, test = parts['train'], parts['test']
     tree = DTMatcher(name='DecisionTree')
     tree.fit(x=train, table=train)
예제 #28
0
 def test_get_features_invalid_ltable_rtable_switch(self):
     """Call ``afg.get_features(A, B, ...)`` with correspondences built as (B, A).

     NOTE(review): the result is not checked; presumably the swapped
     correspondence is expected to be rejected -- confirm.
     """
     ltable = read_csv_metadata(path_a)
     rtable = read_csv_metadata(path_b, key='ID')
     ltypes = afg.get_attr_types(ltable)
     rtypes = afg.get_attr_types(rtable)
     # Note the swapped argument order: right table first.
     swapped_corres = afg.get_attr_corres(rtable, ltable)
     tokenizers = get_tokenizers_for_matching()
     sim_funs = get_sim_funs_for_matching()
     feat_table = afg.get_features(ltable, rtable, ltypes, rtypes,
                                   swapped_corres, tokenizers, sim_funs)
예제 #29
0
 def test_blocker_combiner_valid_5(self):
     """Union 'C3_ex_2.csv' with itself: expect zero rows and unchanged properties."""
     ltable = read_csv_metadata(path_a)
     rtable = read_csv_metadata(path_b, key='ID')
     fixture_path = os.sep.join([bc_datasets_path, 'C3_ex_2.csv'])
     candset = read_csv_metadata(fixture_path, ltable=ltable, rtable=rtable)
     combined = combine_blocker_outputs_via_union([candset, candset])
     self.assertEqual(len(combined), 0)
     props_combined = cm.get_all_properties(combined)
     props_input = cm.get_all_properties(candset)
     self.assertEqual(props_combined, props_input)
예제 #30
0
 def test_add_feature_name_already_present(self):
     """Add the same feature under the name 'test' twice.

     NOTE(review): nothing is asserted; presumably the second ``add_feature``
     call is expected to be rejected -- confirm.
     """
     ltable = read_csv_metadata(path_a)
     rtable = read_csv_metadata(path_b, key='ID')
     features = create_feature_table()
     initial_len = len(features)
     expression = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
     tokenizers = get_tokenizers_for_matching()
     sim_funs = get_sim_funs_for_matching()
     feature_fn = get_feature_fn(expression, tokenizers, sim_funs)
     for _ in range(2):
         add_feature(features, 'test', feature_fn)
예제 #31
0
    def test_get_attr_corres_valid_1(self):
        """Check the structure returned by ``get_attr_corres(A, B)``.

        Every pair in 'corres' must map an attribute to itself, and the
        'ltable'/'rtable' entries must hold the contents of A and B.
        """
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        ac = get_attr_corres(A, B)
        for c in ac['corres']:
            self.assertEqual(c[0], c[1])

        # all(frame == other) only iterates the column labels of the
        # comparison frame, so it passes for any truthy labels;
        # DataFrame.equals actually compares the table contents.
        self.assertTrue(ac['ltable'].equals(A))
        self.assertTrue(ac['rtable'].equals(B))
예제 #32
0
    def test_add_bb_feature_invalid_df_columns(self):
        """Register a black-box feature on an empty DataFrame.

        NOTE(review): the test ends without an assertion; presumably an
        exception is expected -- confirm.
        """
        table_a = read_csv_metadata(path_a)
        table_b = read_csv_metadata(path_b, key='ID')

        def bb_fn(ltuple, rtuple):
            # Always yields 1.0 regardless of the tuples.
            return 1.0

        target = pd.DataFrame()
        add_blackbox_feature(target, 'test', bb_fn)
예제 #33
0
 def test_check_table_order_valid(self):
     """Expect True from ``afg.check_table_order`` for matching A/B inputs."""
     table_a = read_csv_metadata(path_a)
     table_b = read_csv_metadata(path_b, key='ID')
     types_a = afg.get_attr_types(table_a)
     types_b = afg.get_attr_types(table_b)
     mapping = afg.get_attr_corres(table_a, table_b)
     status = afg.check_table_order(
         table_a, table_b, types_a, types_b, mapping)
     self.assertEqual(status, True)
예제 #34
0
 def test_get_features_for_matching_valid(self):
     """Each generated matching feature must be non-negative on one row pair.

     The feature table must be a DataFrame; every entry of its 'function'
     column is called with row 1 of A and row 2 of B.
     """
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     feat_table = afg.get_features_for_matching(A, B)
     self.assertIsInstance(feat_table, pd.DataFrame)
     # DataFrame.ix is gone from current pandas; .loc is the label-based
     # replacement (assumes A and B keep default integer row labels).
     l_row, r_row = A.loc[1], B.loc[2]
     for feature_fn in feat_table['function']:
         self.assertGreaterEqual(feature_fn(l_row, r_row), 0)
예제 #35
0
 def test_get_features_for_blocking_valid(self):
     """Each generated blocking feature must be non-negative on one row pair.

     The feature table must be a DataFrame; every entry of its 'function'
     column is called with row 1 of A and row 2 of B.
     """
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     feat_table = afg.get_features_for_blocking(A, B)
     self.assertIsInstance(feat_table, pd.DataFrame)
     # .loc replaces the removed DataFrame.ix indexer (assumes A and B keep
     # default integer row labels).
     l_row, r_row = A.loc[1], B.loc[2]
     for feature_fn in feat_table['function']:
         self.assertGreaterEqual(feature_fn(l_row, r_row), 0)
예제 #36
0
 def test_check_table_order_invalid_attrcorres_ltable(self):
     """``afg.check_table_order`` must return False once corres['ltable'] is replaced."""
     ltable = read_csv_metadata(path_a)
     rtable = read_csv_metadata(path_b, key='ID')
     ltypes = afg.get_attr_types(ltable)
     rtypes = afg.get_attr_types(rtable)
     corres = afg.get_attr_corres(ltable, rtable)
     # Swap the recorded left table for an unrelated empty frame.
     corres['ltable'] = pd.DataFrame()
     status = afg.check_table_order(ltable, rtable, ltypes, rtypes, corres)
     self.assertEqual(status, False)
예제 #37
0
    def test_add_bb_feature_name_already_present(self):
        """Register the same black-box feature name ('test') twice.

        NOTE(review): nothing is asserted; presumably the second registration
        is expected to be rejected -- confirm.
        """
        ltable = read_csv_metadata(path_a)
        rtable = read_csv_metadata(path_b, key='ID')
        features = create_feature_table()
        initial_len = len(features)

        def bb_fn(ltuple, rtuple):
            # Constant black-box feature.
            return 1.0

        for _ in range(2):
            add_blackbox_feature(features, 'test', bb_fn)
예제 #38
0
 def test_sample_table_valid_2(self):
     """A 10-row sample (third argument True) must share the base tables and keys."""
     ltable = read_csv_metadata(path_a)
     rtable = read_csv_metadata(path_b, key='ID')
     candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
     sample = sample_table(candset, 10, True)
     # The base tables must be the very same objects, hence the id() checks.
     sample_ltable_id = id(cm.get_ltable(sample))
     self.assertEqual(sample_ltable_id, id(cm.get_ltable(candset)))
     sample_rtable_id = id(cm.get_rtable(sample))
     self.assertEqual(sample_rtable_id, id(cm.get_rtable(candset)))
     self.assertEqual(cm.get_fk_ltable(sample), cm.get_fk_ltable(candset))
     self.assertEqual(cm.get_fk_rtable(sample), cm.get_fk_rtable(candset))
     self.assertEqual(len(sample), 10)
예제 #39
0
 def test_get_reqd_metadata_from_catalog_valid_3(self):
     """Requested key/foreign-key metadata must match the catalog getters.

     The base tables recorded for C must also equal A and B.
     """
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     requested = ['key', 'fk_ltable', 'fk_rtable', 'ltable', 'rtable']
     d = cm.get_reqd_metadata_from_catalog(C, requested)
     self.assertEqual(d['key'], cm.get_key(C))
     self.assertEqual(d['fk_ltable'], cm.get_fk_ltable(C))
     self.assertEqual(d['fk_rtable'], cm.get_fk_rtable(C))
     # DataFrame.equals returns a bool, so assertTrue states the intent
     # directly instead of comparing the result with True.
     self.assertTrue(cm.get_ltable(C).equals(A))
     self.assertTrue(cm.get_rtable(C).equals(B))
예제 #40
0
 def test_check_table_order_invalid_attrcorres_ltable(self):
     """Expect False when the correspondence's 'ltable' entry is an empty frame."""
     table_a = read_csv_metadata(path_a)
     table_b = read_csv_metadata(path_b, key='ID')
     types_a = afg.get_attr_types(table_a)
     types_b = afg.get_attr_types(table_b)
     mapping = afg.get_attr_corres(table_a, table_b)
     # Overwrite the stored left table with an empty DataFrame.
     mapping['ltable'] = pd.DataFrame()
     status = afg.check_table_order(
         table_a, table_b, types_a, types_b, mapping)
     self.assertEqual(status, False)
예제 #41
0
 def test_add_feature_name_already_present(self):
     """Call ``add_feature`` twice with the identical name 'test'.

     NOTE(review): there is no assertion; presumably the duplicate name is
     expected to be rejected -- confirm.
     """
     table_a = read_csv_metadata(path_a)
     table_b = read_csv_metadata(path_b, key='ID')
     table = create_feature_table()
     starting_len = len(table)
     spec = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
     tok = get_tokenizers_for_matching()
     sim = get_sim_funs_for_matching()
     built = get_feature_fn(spec, tok, sim)
     add_feature(table, 'test', built)
     add_feature(table, 'test', built)
 def test_extract_feature_vecs_invalid_feature_table(self):
     """Call ``extract_feature_vecs`` with ``feature_table=None``.

     NOTE(review): the result is not checked; presumably the call is expected
     to raise -- confirm.
     """
     ltable = read_csv_metadata(path_a)
     rtable = read_csv_metadata(path_b, key='ID')
     candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
     # Append an all-zero 'label' column at the end of the candidate set.
     zeros = [0] * len(candset)
     candset.insert(len(candset.columns), 'label', zeros)
     features = get_features_for_matching(ltable, rtable)
     F = extract_feature_vecs(candset,
                              attrs_before='ltable_name',
                              feature_table=None,
                              attrs_after=['label', '_id'])
예제 #43
0
 def test_get_features_invalid_ltable_rtable_switch(self):
     """Feed ``afg.get_features`` a correspondence computed for (B, A).

     NOTE(review): no assertion is present; presumably the mismatch is
     expected to be rejected -- confirm.
     """
     table_a = read_csv_metadata(path_a)
     table_b = read_csv_metadata(path_b, key='ID')
     types_a = afg.get_attr_types(table_a)
     types_b = afg.get_attr_types(table_b)
     # Built with the tables in the opposite order to the call below.
     reversed_mapping = afg.get_attr_corres(table_b, table_a)
     tok = get_tokenizers_for_matching()
     sim = get_sim_funs_for_matching()
     feat_table = afg.get_features(
         table_a, table_b, types_a, types_b, reversed_mapping, tok, sim)
예제 #44
0
 def test_valid_path_candset_wi_valid_metadata(self):
     """A candidate set read with metadata must equal the plain CSV read.

     The catalog must hold five properties for it, with '_id' as key and
     'ltable_ID'/'rtable_ID' as foreign keys.
     """
     cm.del_catalog()
     A = read_csv_metadata(path_a)
     # Not initializing B with key='ID' will raise a key error.
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     pd_C = pd.read_csv(path_c)
     # DataFrame.equals returns a bool; assertTrue states the intent directly.
     self.assertTrue(C.equals(pd_C))
     self.assertEqual(len(cm.get_all_properties(C)), 5)
     self.assertEqual(cm.get_key(C), '_id')
     self.assertEqual(cm.get_fk_ltable(C), 'ltable_ID')
     self.assertEqual(cm.get_fk_rtable(C), 'rtable_ID')
예제 #45
0
 def test_ml_matcher_invalid_df_predict(self):
     """Fit a decision tree, then call ``predict`` with an empty string as the table.

     NOTE(review): the predictions are never checked; presumably the call is
     expected to raise -- confirm.
     """
     ltable = read_csv_metadata(fpath_a, key='id')
     rtable = read_csv_metadata(fpath_b, key='id')
     vectors = read_csv_metadata(fpath_f, ltable=ltable, rtable=rtable)
     split = mu.train_test_split(vectors)
     train = split['train']
     test = split['test']
     matcher = DTMatcher(name='DecisionTree')
     matcher.fit(table=train,
                 exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                 target_attr='gold')
     predictions = matcher.predict(
         table="",
         exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
         target_attr='predicted',
         append=True)
예제 #46
0
 def test_add_features_valid_1(self):
     """``add_feature`` must grow the feature table by exactly one row.

     The function stored in the last row must return 1.0 for row 1 of A and
     row 2 of B.
     """
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     feature_table = get_features_for_matching(A, B)
     len1 = len(feature_table)
     feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
     f_dict = get_feature_fn(feature_string, get_tokenizers_for_matching(),
                             get_sim_funs_for_matching())
     add_feature(feature_table, 'test', f_dict)
     len2 = len(feature_table)
     self.assertEqual(len1 + 1, len2)
     # DataFrame.ix is gone from current pandas; .loc is the label-based
     # replacement (assumes the row labels are 0..n-1 -- TODO confirm).
     added_fn = feature_table.loc[len2 - 1, 'function']
     self.assertEqual(added_fn(A.loc[1], B.loc[2]), 1.0)
예제 #47
0
 def test_get_property_valid_df_name_2(self):
     """``cm.get_property`` must return each metadata item recorded for C.

     The catalog is asserted to be empty before any table is read.
     """
     self.assertEqual(cm.get_catalog_len(), 0)
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     self.assertEqual(cm.get_property(C, 'key'), '_id')
     self.assertEqual(cm.get_property(C, 'fk_ltable'), 'ltable_ID')
     self.assertEqual(cm.get_property(C, 'fk_rtable'), 'rtable_ID')
     # DataFrame.equals returns a bool; assertTrue states the intent directly.
     self.assertTrue(cm.get_property(C, 'ltable').equals(A))
     self.assertTrue(cm.get_property(C, 'rtable').equals(B))
예제 #48
0
 def test_add_bb_feature_valid_2(self):
     """``add_blackbox_feature`` must add one row to a fresh feature table.

     The function stored in the last row must return 1.0 for row 1 of A and
     row 2 of B.
     """
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     feature_table = create_feature_table()

     def bb_fn(ltuple, rtuple):
         # Constant black-box feature.
         return 1.0

     len1 = len(feature_table)
     add_blackbox_feature(feature_table, 'test', bb_fn)
     len2 = len(feature_table)
     self.assertEqual(len1 + 1, len2)
     # DataFrame.ix is gone from current pandas; .loc is the label-based
     # replacement (assumes the row labels are 0..n-1 -- TODO confirm).
     added_fn = feature_table.loc[len2 - 1, 'function']
     self.assertEqual(added_fn(A.loc[1], B.loc[2]), 1.0)
예제 #49
0
    def test_add_bb_feature_name_already_present(self):
        """Add a black-box feature named 'test' two times in a row.

        NOTE(review): no assertion is present; presumably the duplicate name
        is expected to be rejected -- confirm.
        """
        table_a = read_csv_metadata(path_a)
        table_b = read_csv_metadata(path_b, key='ID')
        table = create_feature_table()
        starting_len = len(table)

        def bb_fn(ltuple, rtuple):
            # Returns the same score for every tuple pair.
            return 1.0

        add_blackbox_feature(table, 'test', bb_fn)
        add_blackbox_feature(table, 'test', bb_fn)
예제 #50
0
    def test_label_table_with_colname_diff_values(self):
        """Run ``_test_label_table`` with label values that include 2.

        The candidate set already has a 'label' column set to 0.
        NOTE(review): the result is not checked; presumably labels other than
        0/1 are expected to be rejected -- confirm.
        """
        ltable = read_csv_metadata(path_a)
        rtable = read_csv_metadata(path_b, key='ID')
        candset = read_csv_metadata(path_c, ltable=ltable, rtable=rtable)
        candset['label'] = 0
        col_name = 'label'
        num_zeros, num_ones, num_twos = 8, 5, 2
        label_values = [0] * num_zeros + [1] * num_ones + [2] * num_twos

        D = self._test_label_table(candset, col_name, label_values)
예제 #51
0
 def test_get_metadata_for_candset_valid(self):
     """``cm.get_metadata_for_candset`` must return the seven metadata items for C.

     The keys and foreign keys are compared with their expected names, and
     the returned base tables must equal A and B.
     """
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     (key, fk_ltable, fk_rtable,
      ltable, rtable, l_key, r_key) = cm.get_metadata_for_candset(C, None, False)
     self.assertEqual(key, '_id')
     self.assertEqual(fk_ltable, 'ltable_ID')
     self.assertEqual(fk_rtable, 'rtable_ID')
     self.assertEqual(l_key, 'ID')
     self.assertEqual(r_key, 'ID')
     # DataFrame.equals returns a bool; assertTrue states the intent directly.
     self.assertTrue(ltable.equals(A))
     self.assertTrue(rtable.equals(B))
예제 #52
0
 def test_valid_path_candset_wi_valid_metadata(self):
     """Reading the candidate set with metadata must not change its contents.

     The frame must equal ``pd.read_csv`` of the same path, and the catalog
     must record five properties including the '_id' key and both foreign
     keys.
     """
     cm.del_catalog()
     A = read_csv_metadata(path_a)
     # Not initializing B with key='ID' will raise a key error.
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     pd_C = pd.read_csv(path_c)
     # assertTrue replaces the assertEqual(..., True) comparison.
     self.assertTrue(C.equals(pd_C))
     self.assertEqual(len(cm.get_all_properties(C)), 5)
     self.assertEqual(cm.get_key(C), '_id')
     self.assertEqual(cm.get_fk_ltable(C), 'ltable_ID')
     self.assertEqual(cm.get_fk_rtable(C), 'rtable_ID')
예제 #53
0
 def test_label_table_valid_2(self):
     """Label the candidate set with 8 zeros and 7 ones via ``_test_label_table``.

     The label column of the result must sum to the number of ones, and the
     catalog properties must match those of the input.
     """
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     col_name = 'label'
     num_zeros, num_ones = 8, 7
     label_values = [0] * num_zeros + [1] * num_ones
     D = self._test_label_table(C, col_name, label_values)
     # Series.sum replaces pd.np.sum: the pd.np alias no longer exists in
     # current pandas.
     self.assertEqual(D[col_name].sum(), num_ones)
     self.assertEqual(cm.get_all_properties(C), cm.get_all_properties(D))
예제 #54
0
    def test_label_table_with_colname_diff_values(self):
        """Label a table whose 'label' column exists, using values 0, 1 and 2.

        NOTE(review): nothing is asserted on the result; presumably the value
        2 is expected to be rejected -- confirm.
        """
        table_a = read_csv_metadata(path_a)
        table_b = read_csv_metadata(path_b, key='ID')
        pairs = read_csv_metadata(path_c, ltable=table_a, rtable=table_b)
        pairs['label'] = 0
        col_name = 'label'
        # 8 zeros, then 5 ones, then 2 twos.
        counts = ((0, 8), (1, 5), (2, 2))
        label_values = [value for value, count in counts for _ in range(count)]

        D = self._test_label_table(pairs, col_name, label_values)
예제 #55
0
 def test_get_all_properties_valid_2(self):
     """``cm.get_all_properties`` must return the five metadata entries of C."""
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     m = cm.get_all_properties(C)
     self.assertEqual(len(m), 5)
     self.assertEqual(m['key'], '_id')
     self.assertEqual(m['fk_ltable'], 'ltable_ID')
     self.assertEqual(m['fk_rtable'], 'rtable_ID')
     # DataFrame.equals returns a bool; assertTrue states the intent directly.
     self.assertTrue(m['ltable'].equals(A))
     self.assertTrue(m['rtable'].equals(B))
예제 #56
0
 def test_label_table_valid_2(self):
     """Check ``_test_label_table`` output for 8 zero labels and 7 one labels.

     The resulting label column must sum to 7 and the catalog properties of
     the output must equal those of the input.
     """
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     col_name = 'label'
     num_zeros, num_ones = 8, 7
     label_values = [0] * num_zeros + [1] * num_ones
     D = self._test_label_table(C, col_name, label_values)
     # pd.np is gone from current pandas; Series.sum gives the same total.
     self.assertEqual(D[col_name].sum(), num_ones)
     p1, p2 = cm.get_all_properties(C), cm.get_all_properties(D)
     self.assertEqual(p1, p2)