    def setUp(self):
        self.dlm = DelimiterTokenizer(delim_set=[' '], return_set=True)

        self.A = pd.DataFrame([{'l_id': 1, 'l_attr':'ab cd ef aa bb'},
                               {'l_id': 2, 'l_attr':''},
                               {'l_id': 3, 'l_attr':'ab'},
                               {'l_id': 4, 'l_attr':'ll oo he'},
                               {'l_id': 5, 'l_attr':'xy xx zz fg'},
                               {'l_id': 6, 'l_attr': np.nan}])  # pd.np was removed in pandas 2.0; assumes numpy imported as np
        self.B = pd.DataFrame([{'r_id': 1, 'r_attr':'zz fg xx'},
                               {'r_id': 2, 'r_attr':'he ll'},
                               {'r_id': 3, 'r_attr':'xz pl ou'},
                               {'r_id': 4, 'r_attr':'aa'},
                               {'r_id': 5, 'r_attr':'fg cd aa ef ab'},
                               {'r_id': 6, 'r_attr':None}])

        # generate cartesian product A x B to be used as candset
        self.A['tmp_join_key'] = 1
        self.B['tmp_join_key'] = 1
        self.C = pd.merge(self.A[['l_id', 'tmp_join_key']],
                          self.B[['r_id', 'tmp_join_key']],
                          on='tmp_join_key').drop('tmp_join_key', axis=1)

        self.empty_A = pd.DataFrame(columns=['l_id', 'l_attr'])
        self.empty_B = pd.DataFrame(columns=['r_id', 'r_attr'])
        self.empty_candset = pd.DataFrame(columns=['l_id', 'r_id'])
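
The temporary-key merge in this setUp is the classic pandas cross-join idiom; on pandas 1.2+ the same candidate set can be built directly, as in this equivalent sketch (not part of the original test):

# same A x B candset without the throwaway tmp_join_key column
C = pd.merge(A[['l_id']], B[['r_id']], how='cross')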
Example #4
    def setUp(self):
        self.A = pd.DataFrame([{
            'A.id': 1,
            'A.attr': 'hello',
            'A.int_attr': 5
        }])
        self.B = pd.DataFrame([{
            'B.id': 1,
            'B.attr': 'world',
            'B.int_attr': 6
        }])
        self.tokenizer = DelimiterTokenizer(delim_set=[' '], return_set=True)
        self.threshold = 1
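
A sketch of how fixtures like these are exercised: overlap_join is py_stringsimjoin's public API, but the call below is illustrative rather than taken from the original test (the self. prefixes are dropped for brevity):

import py_stringsimjoin as ssj

# pairs of rows whose whitespace-token sets share at least `threshold` tokens
matches = ssj.overlap_join(A, B, 'A.id', 'B.id', 'A.attr', 'B.attr',
                           tokenizer, threshold)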
    def setUp(self):
        self.dlm = DelimiterTokenizer(delim_set=[' '], return_set=True)
        self.A = pd.DataFrame([{
            'id': 1,
            'attr': 'ab cd ef aa bb'
        }, {
            'id': 2,
            'attr': ''
        }, {
            'id': 3,
            'attr': 'ab'
        }, {
            'id': 4,
            'attr': 'll oo he'
        }, {
            'id': 5,
            'attr': 'xy xx zz fg'
        }, {
            'id': 6,
            'attr': np.nan  # pd.np was removed in pandas 2.0; assumes numpy as np
        }, {
            'id': 7,
            'attr': ''
        }])

        self.B = pd.DataFrame([{
            'id': 1,
            'attr': 'zz fg xx'
        }, {
            'id': 2,
            'attr': 'he ll'
        }, {
            'id': 3,
            'attr': 'xy pl ou'
        }, {
            'id': 4,
            'attr': 'aa'
        }, {
            'id': 5,
            'attr': 'fg cd aa ef ab'
        }, {
            'id': 6,
            'attr': None
        }, {
            'id': 7,
            'attr': ' '
        }])

        self.empty_table = pd.DataFrame(columns=['id', 'attr'])
        self.default_l_out_prefix = 'l_'
        self.default_r_out_prefix = 'r_'
Example #6
    def setup(self):
        ltable_path = os.sep.join([BASE_PATH, 'music', 'A.csv'])
        rtable_path = os.sep.join([BASE_PATH, 'music', 'B.csv'])

        # Raising NotImplementedError from setup() tells the benchmark
        # runner (asv-style) to skip this benchmark when data is absent.
        if not os.path.exists(ltable_path):
            raise NotImplementedError(
                'Left table not found. Skipping benchmark.')

        if not os.path.exists(rtable_path):
            raise NotImplementedError(
                'Right table not found. Skipping benchmark.')

        self.ltable = pd.read_csv(ltable_path, encoding="iso-8859-1")
        self.rtable = pd.read_csv(rtable_path, encoding="iso-8859-1")
        self.l_id_attr = 'Sno'
        self.r_id_attr = 'Sno'
        self.l_join_attr = 'Song_Name'
        self.r_join_attr = 'Song_Name'
        self.delim_tok = DelimiterTokenizer(delim_set=[' '], return_set=True)
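
A timing method consuming this setup might look like the sketch below; time_jaccard_join is a hypothetical name (and it assumes from py_stringsimjoin import jaccard_join), while jaccard_join itself is py_stringsimjoin's real join API:

    def time_jaccard_join(self):
        # hypothetical timing method over the music tables loaded above
        jaccard_join(self.ltable, self.rtable,
                     self.l_id_attr, self.r_id_attr,
                     self.l_join_attr, self.r_join_attr,
                     self.delim_tok, 0.7)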
Example #7
    def setup(self):
        ltable_path = os.sep.join([BASE_PATH, 'restaurants', 'A.csv'])
        rtable_path = os.sep.join([BASE_PATH, 'restaurants', 'B.csv'])

        if not os.path.exists(ltable_path):
            raise NotImplementedError(
                'Left table not found. Skipping benchmark.')

        if not os.path.exists(rtable_path):
            raise NotImplementedError(
                'Right table not found. Skipping benchmark.')

        self.ltable = pd.read_csv(ltable_path)
        self.rtable = pd.read_csv(rtable_path)
        self.l_id_attr = 'ID'
        self.r_id_attr = 'ID'
        self.l_join_attr = 'NAME'
        self.r_join_attr = 'NAME'
        self.delim_tok = DelimiterTokenizer(delim_set=[' '], return_set=True)
Example #8
    def setUp(self):
        self.dlm = DelimiterTokenizer(delim_set=[' '], return_set=True)
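
For reference, the return_set flag controls whether duplicate tokens are collapsed; these calls are illustrative, consistent with the assertions in the test class further down:

DelimiterTokenizer(delim_set=[' ']).tokenize('aa bb aa')
# -> ['aa', 'bb', 'aa']  (bag semantics, the default)
DelimiterTokenizer(delim_set=[' '], return_set=True).tokenize('aa bb aa')
# -> ['aa', 'bb']  (set semantics)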
Example #9
# Imports needed by this test generator; test_valid_join is a helper defined
# elsewhere in the same test module.
import os
from functools import partial

from six import iteritems

from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer
from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer


def test_set_sim_join():
    # data to be tested.
    test_scenario_1 = [(os.sep.join(['data',
                                     'table_A.csv']), 'A.ID', 'A.name'),
                       (os.sep.join(['data',
                                     'table_B.csv']), 'B.ID', 'B.name')]
    data = {'TEST_SCENARIO_1': test_scenario_1}

    # similarity measures to be tested.
    sim_measure_types = ['COSINE', 'DICE', 'JACCARD', 'OVERLAP_COEFFICIENT']

    # similarity thresholds to be tested.
    thresholds = {
        'JACCARD': [0.3, 0.5, 0.7, 0.85, 1],
        'COSINE': [0.3, 0.5, 0.7, 0.85, 1],
        'DICE': [0.3, 0.5, 0.7, 0.85, 1],
        'OVERLAP_COEFFICIENT': [0.3, 0.5, 0.7, 0.85, 1]
    }

    # tokenizers to be tested.
    tokenizers = {
        'SPACE_DELIMITER': DelimiterTokenizer(delim_set=[' '],
                                              return_set=True),
        '2_GRAM': QgramTokenizer(qval=2, return_set=True),
        '3_GRAM': QgramTokenizer(qval=3, return_set=True)
    }

    # Test each combination of similarity measure, threshold and tokenizer
    # for different test scenarios.
    for label, scenario in iteritems(data):
        for sim_measure_type in sim_measure_types:
            for threshold in thresholds.get(sim_measure_type):
                for tok_type, tok in iteritems(tokenizers):
                    test_function = partial(test_valid_join, scenario,
                                            sim_measure_type, (tok, threshold))
                    test_function.description = 'Test ' + sim_measure_type + \
                        ' with ' + str(threshold) + ' threshold and ' + \
                        tok_type + ' tokenizer for ' + label + '.'
                    yield test_function,

    # Test each similarity measure with different comparison operators.
    for sim_measure_type in sim_measure_types:
        for comp_op in ['>', '=']:
            test_function = partial(
                test_valid_join, test_scenario_1, sim_measure_type,
                (tokenizers['SPACE_DELIMITER'], 0.3, comp_op, False))
            test_function.description = 'Test ' + sim_measure_type + \
                                        ' with comp_op ' + comp_op + '.'
            yield test_function,

    # Test each similarity measure with allow_missing set to True.
    for sim_measure_type in sim_measure_types:
        test_function = partial(
            test_valid_join, test_scenario_1, sim_measure_type,
            (tokenizers['SPACE_DELIMITER'], 0.7, '>=', False, True))
        test_function.description = 'Test ' + sim_measure_type + \
                                    ' with allow_missing set to True.'
        yield test_function,

    # Test each similarity measure with output attributes added.
    for sim_measure_type in sim_measure_types:
        test_function = partial(
            test_valid_join, test_scenario_1, sim_measure_type,
            (tokenizers['SPACE_DELIMITER'], 0.3, '>=', False, False, [
                'A.ID', 'A.birth_year', 'A.zipcode'
            ], ['B.ID', 'B.name', 'B.zipcode']))
        test_function.description = 'Test ' + sim_measure_type + \
                                    ' with output attributes.'
        yield test_function,

    # Test each similarity measure with a different output prefix.
    for sim_measure_type in sim_measure_types:
        test_function = partial(
            test_valid_join, test_scenario_1, sim_measure_type,
            (tokenizers['SPACE_DELIMITER'], 0.7, '>=', False, False, [
                'A.birth_year', 'A.zipcode'
            ], ['B.name', 'B.zipcode'], 'ltable.', 'rtable.'))
        test_function.description = 'Test ' + sim_measure_type + \
                                    ' with output attributes and prefix.'
        yield test_function,

    # Test each similarity measure with output_sim_score disabled.
    for sim_measure_type in sim_measure_types:
        test_function = partial(
            test_valid_join, test_scenario_1, sim_measure_type,
            (tokenizers['SPACE_DELIMITER'], 0.7, '>=', False, False, [
                'A.birth_year', 'A.zipcode'
            ], ['B.name', 'B.zipcode'], 'ltable.', 'rtable.', False))
        test_function.description = 'Test ' + sim_measure_type + \
                                    ' with sim_score disabled.'
        yield test_function,

    # Test each similarity measure with n_jobs above 1.
    for sim_measure_type in sim_measure_types:
        test_function = partial(
            test_valid_join, test_scenario_1, sim_measure_type,
            (tokenizers['SPACE_DELIMITER'], 0.3, '>=', False, False, [
                'A.birth_year', 'A.zipcode'
            ], ['B.name', 'B.zipcode'], 'ltable.', 'rtable.', False, 2))
        test_function.description = 'Test ' + sim_measure_type + \
                                    ' with n_jobs above 1.'
        yield test_function,

    # scenario where join attributes are of type int
    test_scenario_2 = [(os.sep.join(['data',
                                     'table_A.csv']), 'A.ID', 'A.zipcode'),
                       (os.sep.join(['data',
                                     'table_B.csv']), 'B.ID', 'B.zipcode')]

    # Test each similarity measure with join attribute of type int.
    for sim_measure_type in sim_measure_types:
        test_function = partial(test_valid_join, test_scenario_2,
                                sim_measure_type, (tokenizers['2_GRAM'], 0.3),
                                True)
        test_function.description = 'Test ' + sim_measure_type + \
                                    ' with join attribute of type int.'
        yield test_function,

    # scenario where join attributes are of type float
    test_scenario_3 = [(os.sep.join(['data',
                                     'table_A.csv']), 'A.ID', 'A.hourly_wage'),
                       (os.sep.join(['data',
                                     'table_B.csv']), 'B.ID', 'B.hourly_wage')]

    # Test each similarity measure with join attribute of type float.
    for sim_measure_type in sim_measure_types:
        test_function = partial(test_valid_join, test_scenario_3,
                                sim_measure_type, (tokenizers['2_GRAM'], 0.3),
                                True)
        test_function.description = 'Test ' + sim_measure_type + \
                                    ' with join attribute of type float.'
        yield test_function,

    # Test each similarity measure with a tokenizer with return_set flag set to False.
    for sim_measure_type in sim_measure_types:
        tok = QgramTokenizer(2)
        test_function = partial(test_valid_join, test_scenario_1,
                                sim_measure_type, (tok, 0.3))
        test_function.description = 'Test ' + sim_measure_type + \
                    ' with a tokenizer with return_set flag set to False.'
        yield test_function,

    # Test each similarity measure with allow_empty set to True.
    for sim_measure_type in sim_measure_types:
        test_function = partial(
            test_valid_join, test_scenario_1, sim_measure_type,
            (tokenizers['SPACE_DELIMITER'], 0.7, '>=', True))
        test_function.description = 'Test ' + sim_measure_type + \
                                    ' with allow_empty set to True.'
        yield test_function,

    # Test each similarity measure with allow_empty set to True and with output attributes.
    for sim_measure_type in sim_measure_types:
        test_function = partial(test_valid_join, test_scenario_1,
                                sim_measure_type,
                                (tokenizers['SPACE_DELIMITER'], 0.7, '>=',
                                 True, False, ['A.name'], ['B.name']))
        test_function.description = 'Test ' + sim_measure_type + \
                    ' with allow_empty set to True and with output attributes.'
        yield test_function,
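
The generator above relies on nose's test-generator protocol: each yielded (callable,) tuple is collected and run as its own test, and the callable's description attribute becomes the reported test name. A minimal self-contained sketch of the same pattern:

from functools import partial

def check_at_most_one(threshold):
    assert threshold <= 1

def test_thresholds():
    # nose runs every yielded (callable,) tuple as a separate test case
    for threshold in [0.3, 0.5, 0.7, 0.85, 1]:
        func = partial(check_at_most_one, threshold)
        func.description = 'Test threshold ' + str(threshold) + '.'
        yield func,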
Example #10
    def setup(self):
        # generate_tokens / generate_table are presumably synthetic-data
        # helpers from the surrounding benchmark suite.
        tokens = generate_tokens(6, 2, 5000)
        self.ltable = generate_table(5, 1, tokens, 50000, 'id', 'attr')
        self.rtable = generate_table(5, 1, tokens, 50000, 'id', 'attr')
        self.delim_tok = DelimiterTokenizer(delim_set=[' '], return_set=True)
import unittest

from nose.tools import raises

from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer


class DelimiterTokenizerTestCases(unittest.TestCase):
    def setUp(self):
        self.delim_tok1 = DelimiterTokenizer()
        self.delim_tok2 = DelimiterTokenizer(set([',']))
        self.delim_tok3 = DelimiterTokenizer(set(['*', '.']))
        self.delim_tok4 = DelimiterTokenizer(set(['..', 'ab']))
        self.delim_tok4_list = DelimiterTokenizer(['..', 'ab', '..'])
        self.delim_tok4_return_set = DelimiterTokenizer(set(['..', 'ab']),
                                                        return_set=True)

    def test_delimiter_valid(self):
        self.assertEqual(self.delim_tok1.tokenize('data science'),
                         ['data', 'science'])
        self.assertEqual(self.delim_tok2.tokenize('data,science'),
                         ['data', 'science'])
        self.assertEqual(self.delim_tok2.tokenize('data science'),
                         ['data science'])
        self.assertEqual(self.delim_tok3.tokenize('ab cd*ef.*bb. gg.'),
                         ['ab cd', 'ef', 'bb', ' gg'])
        self.assertEqual(
            self.delim_tok4.tokenize('ab cd..efabbb....ggab cd..efabgh'),
            [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])
        self.assertEqual(
            self.delim_tok4_list.tokenize('ab cd..efabbb....ggab cd..efabgh'),
            [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])
        self.assertEqual(
            self.delim_tok4_return_set.tokenize(
                'ab cd..efabbb....ggab cd..efabgh'),
            [' cd', 'ef', 'bb', 'gg', 'gh'])

    def test_get_return_set(self):
        self.assertEqual(self.delim_tok4.get_return_set(), False)
        self.assertEqual(self.delim_tok4_return_set.get_return_set(), True)

    def test_get_delim_set(self):
        self.assertSetEqual(self.delim_tok1.get_delim_set(), {' '})
        self.assertSetEqual(self.delim_tok3.get_delim_set(), {'*', '.'})
        self.assertSetEqual(self.delim_tok4_list.get_delim_set(), {'..', 'ab'})

    def test_set_return_set(self):
        tok = DelimiterTokenizer(set(['..', 'ab']))
        self.assertEqual(tok.get_return_set(), False)
        self.assertEqual(tok.tokenize('ab cd..efabbb....ggab cd..efabgh'),
                         [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])
        self.assertEqual(tok.set_return_set(True), True)
        self.assertEqual(tok.get_return_set(), True)
        self.assertEqual(tok.tokenize('ab cd..efabbb....ggab cd..efabgh'),
                         [' cd', 'ef', 'bb', 'gg', 'gh'])
        self.assertEqual(tok.set_return_set(False), True)
        self.assertEqual(tok.get_return_set(), False)
        self.assertEqual(tok.tokenize('ab cd..efabbb....ggab cd..efabgh'),
                         [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])

    def test_set_delim_set(self):
        tok = DelimiterTokenizer(['*', '.'])
        self.assertSetEqual(tok.get_delim_set(), {'*', '.'})
        self.assertEqual(tok.tokenize('ab cd*ef.*bb. gg.'),
                         ['ab cd', 'ef', 'bb', ' gg'])
        self.assertEqual(tok.set_delim_set({'..', 'ab'}), True)
        self.assertSetEqual(tok.get_delim_set(), {'..', 'ab'})
        self.assertEqual(tok.tokenize('ab cd..efabbb....ggab cd..efabgh'),
                         [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])

    @raises(TypeError)
    def test_delimiter_invalid1(self):
        invalid_delim_tok = DelimiterTokenizer(set([',', 10]))

    @raises(TypeError)
    def test_delimiter_invalid2(self):
        self.delim_tok1.tokenize(None)

    @raises(TypeError)
    def test_delimiter_invalid3(self):
        self.delim_tok1.tokenize(99)
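
To run this test case stand-alone rather than through the suite's usual runner (a minimal sketch):

if __name__ == '__main__':
    unittest.main()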