def test_feature_fn_valid_nosim_tok(self):
     """Build a feature function with empty tokenizer and sim-function dicts.

     The test passes as long as ``get_feature_fn`` completes for an
     ``exact_match`` feature string; the returned value is not inspected.
     """
     # NOTE(review): another copy of this test in this file passes
     # validate_inferred_attr_types=False to get_features_for_matching;
     # confirm which form matches the library version under test.
     left_table = read_csv_metadata(path_a)
     right_table = read_csv_metadata(path_b, key='ID')
     matching_features = get_features_for_matching(left_table, right_table)
     initial_count = len(matching_features)
     expression = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
     no_tokenizers, no_sim_funs = dict(), dict()
     f_dict = get_feature_fn(expression, no_tokenizers, no_sim_funs)
 def test_feature_fn_valid_nosim_tok(self):
     """Build a feature function with empty tokenizer and sim-function dicts.

     The test passes as long as ``get_feature_fn`` completes for an
     ``exact_match`` feature string; the returned value is not inspected.
     """
     left_table = read_csv_metadata(path_a)
     right_table = read_csv_metadata(path_b, key='ID')
     matching_features = get_features_for_matching(
         left_table, right_table, validate_inferred_attr_types=False)
     initial_count = len(matching_features)
     expression = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
     no_tokenizers, no_sim_funs = dict(), dict()
     f_dict = get_feature_fn(expression, no_tokenizers, no_sim_funs)
    def test_add_feature_invalid_df_columns(self):
        """Adding a feature to an empty DataFrame must raise AssertionError.

        An empty ``pd.DataFrame()`` has none of the feature-table columns, so
        ``add_feature`` is expected to reject it.
        """
        # The tables are loaded but not used below; the reads are kept in case
        # they have side effects the rest of the suite relies on.
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')

        feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
        f_dict = get_feature_fn(feature_string, get_tokenizers_for_matching(),
                                get_sim_funs_for_matching())
        # State the expected failure explicitly instead of letting the
        # exception escape and error out the test.
        with self.assertRaises(AssertionError):
            add_feature(pd.DataFrame(), 'test', f_dict)
 def test_add_feature_name_already_present(self):
     """Adding a feature under a name that already exists must be rejected.

     The feature ``'test'`` is added once to a fresh feature table; a second
     insertion under the same name is expected to raise AssertionError.
     """
     # The tables are loaded but not used below; the reads are kept in case
     # they have side effects the rest of the suite relies on.
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     feature_table = create_feature_table()
     feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
     f_dict = get_feature_fn(feature_string, get_tokenizers_for_matching(), get_sim_funs_for_matching())
     add_feature(feature_table, 'test', f_dict)
     # Only the duplicate insertion is wrapped, so a failure of the first
     # (valid) insertion still surfaces as a test error.
     with self.assertRaises(AssertionError):
         add_feature(feature_table, 'test', f_dict)
# Example #5
0
 def test_rulebased_matcher_rule_wi_no_auto_gen_feature(self):
     """Predict with a rule that uses a jaccard/qgm_3 feature added by hand.

     The feature is registered as ``'test'`` without setting
     ``is_auto_generated``; predictions on ``self.C`` are compared against
     ``expected_labels_1``.
     """
     name_similarity = "jaccard(qgm_3(ltuple['name']), qgm_3(rtuple['name']))"
     tokenizers = get_tokenizers_for_blocking()
     sim_funs = get_sim_funs_for_blocking()
     f_dict = get_feature_fn(name_similarity, tokenizers, sim_funs)
     add_feature(self.feature_table, 'test', f_dict)

     # same as rule_1
     rule = ['test(ltuple, rtuple) > 0.4']
     self.brm.add_rule(rule, self.feature_table)

     predicted = self.brm.predict(table=self.C)
     assert_equal(expected_labels_1, predicted)
# Example #6
0
 def test_rulebased_matcher_rule_wi_overlap_coeff_sim_fn(self):
     """Predict with a rule built on an overlap_coeff/dlm_dc0 feature.

     The feature is flagged ``is_auto_generated`` before being added as
     ``'test'``; predictions on ``self.C`` are compared against
     ``expected_labels_all_zeroes``.
     """
     name_overlap = "overlap_coeff(dlm_dc0(ltuple['name']), dlm_dc0(rtuple['name']))"
     tokenizers = get_tokenizers_for_blocking()
     sim_funs = get_sim_funs_for_blocking()
     f_dict = get_feature_fn(name_overlap, tokenizers, sim_funs)
     f_dict['is_auto_generated'] = True
     add_feature(self.feature_table, 'test', f_dict)

     # should return an empty set
     rule = ['test(ltuple, rtuple) > 1']
     self.brm.add_rule(rule, self.feature_table)

     predicted = self.brm.predict(table=self.C)
     assert_equal(expected_labels_all_zeroes, predicted)
 def test_rb_block_tables_rule_wi_no_auto_gen_feature(self):
     """Block two tables with a rule that uses a hand-added jaccard/qgm_3 feature.

     The feature is registered as ``'test'`` without setting
     ``is_auto_generated``; the blocked output is checked with
     ``validate_metadata`` and ``validate_data`` against ``expected_ids_1``.
     """
     name_similarity = "jaccard(qgm_3(ltuple['name']), qgm_3(rtuple['name']))"
     tokenizers = get_tokenizers_for_blocking()
     sim_funs = get_sim_funs_for_blocking()
     f_dict = get_feature_fn(name_similarity, tokenizers, sim_funs)
     add_feature(self.feature_table, 'test', f_dict)

     # same as rule_1
     rule = ['test(ltuple, rtuple) < 0.3']
     self.rb.add_rule(rule, self.feature_table)

     candidates = self.rb.block_tables(self.A, self.B, show_progress=False)
     validate_metadata(candidates)
     validate_data(candidates, expected_ids_1)
 def test_add_feature_name_already_present(self):
     """Adding a feature under a name that already exists must be rejected.

     The feature ``'test'`` is added once to a fresh feature table; a second
     insertion under the same name is expected to raise AssertionError.
     """
     # The tables are loaded but not used below; the reads are kept in case
     # they have side effects the rest of the suite relies on.
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     feature_table = create_feature_table()
     feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
     f_dict = get_feature_fn(feature_string, get_tokenizers_for_matching(),
                             get_sim_funs_for_matching())
     add_feature(feature_table, 'test', f_dict)
     # Only the duplicate insertion is wrapped, so a failure of the first
     # (valid) insertion still surfaces as a test error.
     with self.assertRaises(AssertionError):
         add_feature(feature_table, 'test', f_dict)
 def test_add_features_valid_1(self):
     """Add one feature and check both the table growth and the stored function.

     After ``add_feature`` the feature table must have exactly one more row,
     and the function stored in the last row must return 1.0 for row 1 of A
     and row 2 of B.
     """
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     feature_table = get_features_for_matching(A, B, validate_inferred_attr_types=False)
     len1 = len(feature_table)
     feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
     f_dict = get_feature_fn(feature_string, get_tokenizers_for_matching(), get_sim_funs_for_matching())
     add_feature(feature_table, 'test', f_dict)
     len2 = len(feature_table)
     self.assertEqual(len1 + 1, len2)
     # DataFrame.ix was deprecated and later removed from pandas. On an
     # integer index .ix resolved integers as labels, which is what .loc does.
     # NOTE(review): assumes all three frames carry integer index labels.
     added_fn = feature_table.loc[len2 - 1, 'function']
     self.assertEqual(added_fn(A.loc[1], B.loc[2]), 1.0)
 def test_rb_block_tables_rule_wi_overlap_coeff_sim_fn(self):
     """Block two tables with a rule built on an overlap_coeff/dlm_dc0 feature.

     The feature is flagged ``is_auto_generated`` before being added as
     ``'test'``; the blocked output is checked with ``validate_metadata`` and
     ``validate_data`` (called without an expected-id argument).
     """
     name_overlap = "overlap_coeff(dlm_dc0(ltuple['name']), dlm_dc0(rtuple['name']))"
     tokenizers = get_tokenizers_for_blocking()
     sim_funs = get_sim_funs_for_blocking()
     f_dict = get_feature_fn(name_overlap, tokenizers, sim_funs)
     f_dict['is_auto_generated'] = True
     add_feature(self.feature_table, 'test', f_dict)

     # should return an empty set
     rule = ['test(ltuple, rtuple) <= 1']
     self.rb.add_rule(rule, self.feature_table)

     candidates = self.rb.block_tables(self.A, self.B, show_progress=False)
     validate_metadata(candidates)
     validate_data(candidates)
    def test_trigger_rule_wi_no_auto_gen_feature(self):
        """Run a trigger whose condition uses a hand-added jaccard/qgm_3 feature.

        The feature is registered as ``'test'`` without setting
        ``is_auto_generated``. The trigger is executed on ``self.C`` and the
        ``'neg_trig_labels'`` column is compared against ``expected_labels_1``.
        """
        name_similarity = "jaccard(qgm_3(ltuple['name']), qgm_3(rtuple['name']))"
        tokenizers = get_tokenizers_for_blocking()
        sim_funs = get_sim_funs_for_blocking()
        f_dict = get_feature_fn(name_similarity, tokenizers, sim_funs)
        add_feature(self.feature_table, 'test', f_dict)

        # same as rule_1
        rule = ['test(ltuple, rtuple) > 0.4']

        # Configure the trigger: condition rule, condition status, action.
        self.mt.add_cond_rule(rule, self.feature_table)
        self.mt.add_cond_status(False)
        self.mt.add_action(0)

        label_column = 'neg_trig_labels'
        result = self.mt.execute(self.C, label_column, inplace=False)
        predicted = result['neg_trig_labels'].tolist()
        assert_equal(expected_labels_1, predicted)
    def test_trigger_rule_wi_no_auto_gen_feature(self):
        """Run a trigger whose condition uses a hand-added jaccard/qgm_3 feature.

        The feature is registered as ``'test'`` without setting
        ``is_auto_generated``. The trigger is executed on ``self.C`` and the
        ``'neg_trig_labels'`` column is compared against ``expected_labels_1``.
        """
        name_similarity = "jaccard(qgm_3(ltuple['name']), qgm_3(rtuple['name']))"
        tokenizers = get_tokenizers_for_blocking()
        sim_funs = get_sim_funs_for_blocking()
        f_dict = get_feature_fn(name_similarity, tokenizers, sim_funs)
        add_feature(self.feature_table, 'test', f_dict)

        # same as rule_1
        rule = ['test(ltuple, rtuple) > 0.4']

        # Configure the trigger: condition rule, condition status, action.
        self.mt.add_cond_rule(rule, self.feature_table)
        self.mt.add_cond_status(False)
        self.mt.add_action(0)

        label_column = 'neg_trig_labels'
        result = self.mt.execute(self.C, label_column, inplace=False)
        predicted = result['neg_trig_labels'].tolist()
        assert_equal(expected_labels_1, predicted)
    def test_trigger_rule_wi_diff_tokenizers(self):
        """Run a trigger whose feature tokenizes each side differently.

        The jaccard feature applies ``qgm_3`` to the left address and
        ``dlm_dc0`` to the right address and is flagged ``is_auto_generated``.
        The ``'neg_trig_labels'`` column produced on ``self.C`` is compared
        against ``expected_labels_zeros``.
        """
        address_similarity = "jaccard(qgm_3(ltuple['address']), dlm_dc0(rtuple['address']))"
        tokenizers = get_tokenizers_for_blocking()
        sim_funs = get_sim_funs_for_blocking()
        f_dict = get_feature_fn(address_similarity, tokenizers, sim_funs)
        f_dict['is_auto_generated'] = True
        add_feature(self.feature_table, 'test', f_dict)

        # should return an empty set
        rule = ['test(ltuple, rtuple) > 1']

        # Configure the trigger: condition rule, condition status, action.
        self.mt.add_cond_rule(rule, self.feature_table)
        self.mt.add_cond_status(False)
        self.mt.add_action(0)

        label_column = 'neg_trig_labels'
        result = self.mt.execute(self.C, label_column, inplace=False)
        predicted = result['neg_trig_labels'].tolist()
        assert_equal(expected_labels_zeros, predicted)
 def test_add_features_valid_1(self):
     """Add one feature and check both the table growth and the stored function.

     After ``add_feature`` the feature table must have exactly one more row,
     and the function stored in the last row must return 1.0 for row 1 of A
     and row 2 of B.
     """
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     # NOTE(review): another copy of this test in this file passes
     # validate_inferred_attr_types=False here; confirm which form matches
     # the library version under test.
     feature_table = get_features_for_matching(A, B)
     len1 = len(feature_table)
     feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
     f_dict = get_feature_fn(feature_string, get_tokenizers_for_matching(),
                             get_sim_funs_for_matching())
     add_feature(feature_table, 'test', f_dict)
     len2 = len(feature_table)
     self.assertEqual(len1 + 1, len2)
     # DataFrame.ix was deprecated and later removed from pandas. On an
     # integer index .ix resolved integers as labels, which is what .loc does.
     # NOTE(review): assumes all three frames carry integer index labels.
     added_fn = feature_table.loc[len2 - 1, 'function']
     self.assertEqual(added_fn(A.loc[1], B.loc[2]), 1.0)
    def test_trigger_rule_wi_diff_tokenizers(self):
        """Run a trigger whose feature tokenizes each side differently.

        The jaccard feature applies ``qgm_3`` to the left address and
        ``dlm_dc0`` to the right address and is flagged ``is_auto_generated``.
        The ``'neg_trig_labels'`` column produced on ``self.C`` is compared
        against ``expected_labels_zeros``.
        """
        address_similarity = "jaccard(qgm_3(ltuple['address']), dlm_dc0(rtuple['address']))"
        tokenizers = get_tokenizers_for_blocking()
        sim_funs = get_sim_funs_for_blocking()
        f_dict = get_feature_fn(address_similarity, tokenizers, sim_funs)
        f_dict['is_auto_generated'] = True
        add_feature(self.feature_table, 'test', f_dict)

        # should return an empty set
        rule = ['test(ltuple, rtuple) > 1']

        # Configure the trigger: condition rule, condition status, action.
        self.mt.add_cond_rule(rule, self.feature_table)
        self.mt.add_cond_status(False)
        self.mt.add_action(0)

        label_column = 'neg_trig_labels'
        result = self.mt.execute(self.C, label_column, inplace=False)
        predicted = result['neg_trig_labels'].tolist()
        assert_equal(expected_labels_zeros, predicted)
    def test_add_feature_invalid_df_columns(self):
        """Check the error text raised when the feature table lacks its columns.

        ``add_feature`` is handed an empty DataFrame; the message of the
        resulting AssertionError is compared with the full missing-column list.
        """
        left_table = read_csv_metadata(path_a)
        right_table = read_csv_metadata(path_b, key='ID')

        expression = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
        tokenizers = get_tokenizers_for_matching()
        sim_funs = get_sim_funs_for_matching()
        f_dict = get_feature_fn(expression, tokenizers, sim_funs)

        with self.assertRaises(AssertionError) as raised:
            add_feature(pd.DataFrame(), 'test', f_dict)

        message = str(raised.exception)
        print(message)
        # Adjacent literals are concatenated; the pieces are kept exactly as
        # the message is expected to read.
        expected = (
            'Feature table does not have all required columns\n '
            'The following columns are missing: feature_name, left_attribute, right_attribute, '
            'left_attr_tokenizer,'
            ' right_attr_tokenizer, simfunction, function, function_source, is_auto_generated'
        )
        self.assertEqual(message, expected)
    def test_add_feature_invalid_df_columns(self):
        """Check the error text raised when the feature table lacks its columns.

        ``add_feature`` is handed an empty DataFrame; the message of the
        resulting AssertionError is compared with the full missing-column list.
        """
        left_table = read_csv_metadata(path_a)
        right_table = read_csv_metadata(path_b, key='ID')

        expression = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
        tokenizers = get_tokenizers_for_matching()
        sim_funs = get_sim_funs_for_matching()
        f_dict = get_feature_fn(expression, tokenizers, sim_funs)

        with self.assertRaises(AssertionError) as raised:
            add_feature(pd.DataFrame(), 'test', f_dict)

        message = str(raised.exception)
        print(message)
        # Adjacent literals are concatenated; the pieces are kept exactly as
        # the message is expected to read.
        expected = (
            'Feature table does not have all required columns\n '
            'The following columns are missing: feature_name, left_attribute, right_attribute, '
            'left_attr_tokenizer,'
            ' right_attr_tokenizer, simfunction, function, function_source, is_auto_generated'
        )
        self.assertEqual(message, expected)
 def test_get_feature_fn_invalid_feat_str(self):
     """A ``None`` feature string must be rejected with AssertionError."""
     # State the expected failure explicitly so the invalid call does not
     # surface as an unhandled error in the test.
     with self.assertRaises(AssertionError):
         get_feature_fn(None, dict(), dict())
 def test_get_feature_fn_invalid_sim(self):
     """A ``None`` second argument must be rejected with AssertionError."""
     # NOTE(review): other calls in this file pass tokenizers as the second
     # argument, while this test's name refers to similarity functions;
     # confirm which argument was meant to be invalid.
     with self.assertRaises(AssertionError):
         get_feature_fn("", None, dict())
 def test_get_feature_fn_invalid_tok(self):
     """A ``None`` third argument must be rejected with AssertionError."""
     # NOTE(review): other calls in this file pass similarity functions as the
     # third argument, while this test's name refers to tokenizers; confirm
     # which argument was meant to be invalid.
     with self.assertRaises(AssertionError):
         get_feature_fn("", dict(), None)
 def test_get_feature_fn_invalid_tok(self):
     """A ``None`` third argument must be rejected with AssertionError."""
     # NOTE(review): other calls in this file pass similarity functions as the
     # third argument, while this test's name refers to tokenizers; confirm
     # which argument was meant to be invalid.
     with self.assertRaises(AssertionError):
         get_feature_fn("", dict(), None)
 def test_get_feature_fn_invalid_sim(self):
     """A ``None`` second argument must be rejected with AssertionError."""
     # NOTE(review): other calls in this file pass tokenizers as the second
     # argument, while this test's name refers to similarity functions;
     # confirm which argument was meant to be invalid.
     with self.assertRaises(AssertionError):
         get_feature_fn("", None, dict())
 def test_get_feature_fn_invalid_feat_str(self):
     """A ``None`` feature string must be rejected with AssertionError."""
     # State the expected failure explicitly so the invalid call does not
     # surface as an unhandled error in the test.
     with self.assertRaises(AssertionError):
         get_feature_fn(None, dict(), dict())