Пример #1
0
class JaccardTestCase(unittest.TestCase):
    """Check jaccard_match with filters against the sim_match reference."""

    def setUp(self):
        # Reference matches computed by sim_match at the same threshold; the
        # filtered runs in the tests are compared against this result.
        self.threshold = 0.3
        self.matches_using_cart_prod = sim_match(
            table_A, table_B, tokenized_table_A, tokenized_table_B, l_attr,
            r_attr, get_jaccard_fn(), self.threshold, ['id'], ['id'])

        # Constructor arguments shared by the filters built below.
        base_args = (table_A, tokenized_table_A, l_attr, tok)
        ordered_args = base_args + (self.threshold, token_ordering)

        self.size_filter = SizeFilter(*base_args)
        self.size_filter.build_index()
        self.prefix_filter = PrefixFilter(*ordered_args)
        self.prefix_filter.build_index()
        self.position_filter = PositionFilter(*ordered_args)
        self.position_filter.build_index()
        # build_index() is not called on the suffix filter.
        self.suffix_filter = SuffixFilter(*ordered_args)

    def _check_filters(self, filters):
        # Run jaccard_match with the given filters and compare the outcome
        # with the reference matches prepared in setUp.
        matches = jaccard_match(
            table_A, table_B, tokenized_table_A, tokenized_table_B, l_attr,
            r_attr, self.threshold, filters, ['id'], ['id'])
        self.assertTrue(compare_matches(self.matches_using_cart_prod, matches))

    def test_jaccard_match(self):
        # jaccard with position filter, size filter, suffix filter
        self._check_filters(
            [self.position_filter, self.size_filter, self.suffix_filter])

        # jaccard with prefix filter, size filter, suffix filter
        self._check_filters(
            [self.prefix_filter, self.size_filter, self.suffix_filter])
Пример #2
0
 def test_filter_pair(self, lstring, rstring, tokenizer, sim_measure_type,
                      threshold, allow_empty, allow_missing,
                      expected_output):
     """Check SizeFilter.filter_pair for one string pair.

     The arguments are presumably supplied by a parameterizing decorator
     that is not visible in this excerpt.
     """
     pair_filter = SizeFilter(tokenizer, sim_measure_type, threshold,
                              allow_empty, allow_missing)
     assert_equal(pair_filter.filter_pair(lstring, rstring), expected_output)
Пример #3
0
 def setUp(self):
     # Reference matches computed by sim_match at the shared threshold.
     self.threshold = 0.3
     self.matches_using_cart_prod = sim_match(
         table_A, table_B, tokenized_table_A, tokenized_table_B, l_attr,
         r_attr, get_jaccard_fn(), self.threshold, ['id'], ['id'])

     # Constructor arguments shared by the filters built below.
     base_args = (table_A, tokenized_table_A, l_attr, tok)
     ordered_args = base_args + (self.threshold, token_ordering)

     self.size_filter = SizeFilter(*base_args)
     self.size_filter.build_index()
     self.prefix_filter = PrefixFilter(*ordered_args)
     self.prefix_filter.build_index()
     self.position_filter = PositionFilter(*ordered_args)
     self.position_filter.build_index()
     # build_index() is not called on the suffix filter.
     self.suffix_filter = SuffixFilter(*ordered_args)
    def test_filter_tables(self, tokenizer, sim_measure_type, threshold,
                           allow_empty, allow_missing, args, expected_pairs):
        """Run SizeFilter.filter_tables(*args) and validate the result.

        Two things are verified: the exact, ordered column list of the
        returned table, and the set of pairs it contains.  Positions in
        ``args`` are read as follows: 2 and 3 name the columns identifying
        a pair, 6 and 7 are optional output attribute lists, 8 and 9 are
        optional output prefixes.
        """
        candset = SizeFilter(tokenizer, sim_measure_type, threshold,
                             allow_empty, allow_missing).filter_tables(*args)

        # Prefixes fall back to the defaults stored on the test instance
        # unless the call supplied them explicitly.
        l_prefix = self.default_l_out_prefix
        r_prefix = self.default_r_out_prefix
        if len(args) > 8:
            l_prefix = args[8]
        l_key_col = l_prefix + args[2]
        if len(args) > 9:
            r_prefix = args[9]
        r_key_col = r_prefix + args[3]

        # Expected layout: '_id', both pair columns, then the requested left
        # and right output attributes as returned by remove_redundant_attrs.
        expected_columns = ['_id', l_key_col, r_key_col]
        if len(args) > 6 and args[6]:
            expected_columns.extend(
                l_prefix + attr
                for attr in remove_redundant_attrs(args[6], args[2]))
        if len(args) > 7 and args[7]:
            expected_columns.extend(
                r_prefix + attr
                for attr in remove_redundant_attrs(args[7], args[3]))

        # The output table must expose exactly these columns, in order.
        assert_list_equal(list(candset.columns.values), expected_columns)

        # Each row is reduced to a "left,right" string for comparison.
        actual_pairs = {
            ','.join((str(row[l_key_col]), str(row[r_key_col])))
            for _, row in candset.iterrows()
        }

        # Same size and a full overlap means both collections agree.
        assert_equal(len(expected_pairs), len(actual_pairs))
        overlap = actual_pairs.intersection(expected_pairs)
        assert_equal(len(overlap), len(expected_pairs))
Пример #5
0
    def test_filter_candset(self, tokenizer, sim_measure_type, threshold, args,
                            expected_pairs):
        """Run SizeFilter.filter_candset(*args) and validate the result.

        The filtered table must keep the columns of the input table
        (``args[0]``), and the pairs read from columns ``args[1]`` and
        ``args[2]`` must agree with ``expected_pairs``.
        """
        filtered = SizeFilter(tokenizer, sim_measure_type,
                              threshold).filter_candset(*args)

        # The output table must expose the same columns as the input table.
        assert_list_equal(list(filtered.columns.values),
                          list(args[0].columns.values))

        # Each row is reduced to a "left,right" string for comparison.
        actual_pairs = {
            ','.join((str(row[args[1]]), str(row[args[2]])))
            for _, row in filtered.iterrows()
        }

        # Same size and a full overlap means both collections agree.
        assert_equal(len(expected_pairs), len(actual_pairs))
        overlap = actual_pairs.intersection(expected_pairs)
        assert_equal(len(overlap), len(expected_pairs))
    def test_filter_candset(self, tokenizer, sim_measure_type, threshold,
                            allow_empty, allow_missing, args, expected_pairs):
        """Run SizeFilter.filter_candset(*args) and validate the result.

        The filter is built with the given allow_empty / allow_missing
        settings.  The filtered table must keep the columns of the input
        table (``args[0]``), and the pairs read from columns ``args[1]`` and
        ``args[2]`` must agree with ``expected_pairs``.
        """
        filtered = SizeFilter(tokenizer, sim_measure_type, threshold,
                              allow_empty, allow_missing).filter_candset(*args)

        # The output table must expose the same columns as the input table.
        assert_list_equal(list(filtered.columns.values),
                          list(args[0].columns.values))

        # Each row is reduced to a "left,right" string for comparison.
        actual_pairs = {
            ','.join((str(row[args[1]]), str(row[args[2]])))
            for _, row in filtered.iterrows()
        }

        # Same size and a full overlap means both collections agree.
        assert_equal(len(expected_pairs), len(actual_pairs))
        overlap = actual_pairs.intersection(expected_pairs)
        assert_equal(len(overlap), len(expected_pairs))
Пример #7
0
class SizeFilterTestCase(unittest.TestCase):
    """Exercise SizeFilter.apply_filter and SizeFilter.find_candidates."""

    def setUp(self):
        # Every test works on a filter built over A with its index in place.
        self.size_filter = SizeFilter(A, A_tokenized, 'str', tok)
        self.size_filter.build_index()

    def _passes(self, l_tokens, r_tokens):
        # apply_filter on two token lists with their lengths, threshold 0.8.
        return self.size_filter.apply_filter(
            l_tokens, r_tokens, len(l_tokens), len(r_tokens), 0.8)

    def _candidates(self, tokens):
        # find_candidates for a probe token list, threshold 0.8.
        return self.size_filter.find_candidates(tokens, len(tokens), 0.8)

    def test_apply_filter(self):
        # size filter satisfies
        self.assertTrue(self._passes(['aa', 'bb', 'cd', 'ef', 'fg'],
                                     ['xx', 'yy', 'aa', 'bb']))

        # size filter doesn't satisfy
        self.assertFalse(self._passes(['aa', 'bb', 'cd', 'ef', 'fg'],
                                      ['xx']))

        # empty list of tokens, tried on either side
        non_empty = ['aa', 'bb', 'cd', 'ef', 'fg']
        empty = []
        self.assertFalse(self._passes(non_empty, empty))
        self.assertFalse(self._passes(empty, non_empty))

    def test_find_candidates(self):
        # default case (presence of candidates)
        self.assertSetEqual(
            self._candidates(['aa', 'xx', 'yy', 'uu']), {0, 3, 4})

        # empty set of candidates
        self.assertSetEqual(
            self._candidates(
                ['aa', 'op', 'xx', 'yy', 'uu', 'yu', 'iu', 'lp']),
            set())

        # empty list of probe tokens
        self.assertSetEqual(self._candidates([]), set())
Пример #8
0
 def test_invalid_threshold(self):
     """Construct a SizeFilter with 1.2 as the threshold.

     Construction alone is the test; the expected failure is presumably
     declared by a decorator that is not visible in this excerpt.
     """
     SizeFilter(self.tokenizer, self.sim_measure_type, 1.2)
Пример #9
0
 def test_invalid_r_out_attr(self):
     """Call filter_tables with 'B.invalid_attr' as a right output attr.

     The expected failure is presumably declared by a decorator that is
     not visible in this excerpt.
     """
     l_out_attrs = ['A.attr']
     r_out_attrs = ['B.invalid_attr']
     table_filter = SizeFilter(self.tokenizer, self.sim_measure_type,
                               self.threshold)
     table_filter.filter_tables(self.A, self.B, 'A.id', 'B.id',
                                'A.attr', 'B.attr',
                                l_out_attrs, r_out_attrs)
Пример #10
0
 def test_invalid_sim_measure_type(self):
     """Construct a SizeFilter with 'INVALID_TYPE' as the measure type.

     Construction alone is the test; the expected failure is presumably
     declared by a decorator that is not visible in this excerpt.
     """
     SizeFilter(self.tokenizer, 'INVALID_TYPE', self.threshold)
 def test_invalid_rtable(self):
     """Call filter_tables with an empty list in place of the right table.

     The expected failure is presumably declared by a decorator that is
     not visible in this excerpt.
     """
     table_filter = SizeFilter(self.tokenizer, self.sim_measure_type,
                               self.threshold)
     bad_rtable = []
     table_filter.filter_tables(self.A, bad_rtable, 'A.id', 'B.id',
                                'A.attr', 'B.attr')
Пример #12
0
 def test_invalid_rtable(self):
     """Call filter_tables with an empty list in place of the right table.

     The expected failure is presumably declared by a decorator that is
     not visible in this excerpt.
     """
     table_filter = SizeFilter(self.tokenizer, self.sim_measure_type,
                               self.threshold)
     bad_rtable = []
     table_filter.filter_tables(self.A, bad_rtable, 'A.id', 'B.id',
                                'A.attr', 'B.attr')
 def test_numeric_r_filter_attr(self):
     """Call filter_tables with 'B.int_attr' as the sixth argument.

     Judging by the test name, that position is the right-hand filter
     attribute and the column is numeric; the expected outcome is
     presumably declared by a decorator that is not visible in this
     excerpt.
     """
     table_filter = SizeFilter(self.tokenizer, self.sim_measure_type,
                               self.threshold)
     table_filter.filter_tables(self.A, self.B, 'A.id', 'B.id',
                                'A.attr', 'B.int_attr')
 def test_invalid_r_out_attr(self):
     """Call filter_tables with 'B.invalid_attr' as a right output attr.

     The expected failure is presumably declared by a decorator that is
     not visible in this excerpt.
     """
     l_out_attrs = ['A.attr']
     r_out_attrs = ['B.invalid_attr']
     table_filter = SizeFilter(self.tokenizer, self.sim_measure_type,
                               self.threshold)
     table_filter.filter_tables(self.A, self.B, 'A.id', 'B.id',
                                'A.attr', 'B.attr',
                                l_out_attrs, r_out_attrs)
Пример #15
0
 def setUp(self):
     # Build the filter over A, expose it to the tests, then build its index.
     indexed_filter = SizeFilter(A, A_tokenized, 'str', tok)
     self.size_filter = indexed_filter
     indexed_filter.build_index()
 def test_filter_pair(self, lstring, rstring, tokenizer, sim_measure_type,
                      threshold, allow_empty, allow_missing, expected_output):
     """Check SizeFilter.filter_pair for one string pair.

     The arguments are presumably supplied by a parameterizing decorator
     that is not visible in this excerpt.
     """
     pair_filter = SizeFilter(tokenizer, sim_measure_type, threshold,
                              allow_empty, allow_missing)
     assert_equal(pair_filter.filter_pair(lstring, rstring), expected_output)
 def test_invalid_tokenizer_for_edit_distance(self):
     """Construct a SizeFilter for 'EDIT_DISTANCE' with self.tokenizer.

     The threshold passed is 2.  Construction alone is the test; judging by
     the test name the tokenizer is unsuitable for edit distance, and the
     expected failure is presumably declared by a decorator that is not
     visible in this excerpt.
     """
     SizeFilter(self.tokenizer, 'EDIT_DISTANCE', 2)