def setUp(self):
    self.dlm = DelimiterTokenizer(delim_set=[' '], return_set=True)
    # np.nan assumes numpy is imported as np; pd.np.NaN is deprecated and
    # removed in newer pandas releases.
    self.A = pd.DataFrame([{'l_id': 1, 'l_attr': 'ab cd ef aa bb'},
                           {'l_id': 2, 'l_attr': ''},
                           {'l_id': 3, 'l_attr': 'ab'},
                           {'l_id': 4, 'l_attr': 'll oo he'},
                           {'l_id': 5, 'l_attr': 'xy xx zz fg'},
                           {'l_id': 6, 'l_attr': np.nan}])
    self.B = pd.DataFrame([{'r_id': 1, 'r_attr': 'zz fg xx'},
                           {'r_id': 2, 'r_attr': 'he ll'},
                           {'r_id': 3, 'r_attr': 'xz pl ou'},
                           {'r_id': 4, 'r_attr': 'aa'},
                           {'r_id': 5, 'r_attr': 'fg cd aa ef ab'},
                           {'r_id': 6, 'r_attr': None}])

    # generate cartesian product A x B to be used as candset
    self.A['tmp_join_key'] = 1
    self.B['tmp_join_key'] = 1
    self.C = pd.merge(self.A[['l_id', 'tmp_join_key']],
                      self.B[['r_id', 'tmp_join_key']],
                      on='tmp_join_key').drop('tmp_join_key', axis=1)

    self.empty_A = pd.DataFrame(columns=['l_id', 'l_attr'])
    self.empty_B = pd.DataFrame(columns=['r_id', 'r_attr'])
    self.empty_candset = pd.DataFrame(columns=['l_id', 'r_id'])
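# A minimal, standalone sketch of the cross-join trick used in the setUp
# above: merging on a constant temporary key yields the cartesian product
# A x B. The tiny DataFrames here are illustrative only. On pandas >= 1.2
# the same result is available directly via pd.merge(A, B, how='cross').
import pandas as pd

A = pd.DataFrame({'l_id': [1, 2]})
B = pd.DataFrame({'r_id': [10, 20]})

A['tmp_join_key'] = 1
B['tmp_join_key'] = 1
C = pd.merge(A, B, on='tmp_join_key').drop('tmp_join_key', axis=1)
# C now holds all four (l_id, r_id) pairs: (1, 10), (1, 20), (2, 10), (2, 20)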
def setUp(self):
    self.A = pd.DataFrame([{'A.id': 1, 'A.attr': 'hello', 'A.int_attr': 5}])
    self.B = pd.DataFrame([{'B.id': 1, 'B.attr': 'world', 'B.int_attr': 6}])
    self.tokenizer = DelimiterTokenizer(delim_set=[' '], return_set=True)
    self.threshold = 1
def setUp(self):
    self.dlm = DelimiterTokenizer(delim_set=[' '], return_set=True)
    self.A = pd.DataFrame([{'id': 1, 'attr': 'ab cd ef aa bb'},
                           {'id': 2, 'attr': ''},
                           {'id': 3, 'attr': 'ab'},
                           {'id': 4, 'attr': 'll oo he'},
                           {'id': 5, 'attr': 'xy xx zz fg'},
                           # np.nan replaces the removed pd.np.NaN alias
                           {'id': 6, 'attr': np.nan},
                           {'id': 7, 'attr': ''}])
    self.B = pd.DataFrame([{'id': 1, 'attr': 'zz fg xx'},
                           {'id': 2, 'attr': 'he ll'},
                           {'id': 3, 'attr': 'xy pl ou'},
                           {'id': 4, 'attr': 'aa'},
                           {'id': 5, 'attr': 'fg cd aa ef ab'},
                           {'id': 6, 'attr': None},
                           {'id': 7, 'attr': ' '}])
    self.empty_table = pd.DataFrame(columns=['id', 'attr'])
    self.default_l_out_prefix = 'l_'
    self.default_r_out_prefix = 'r_'
def setup(self):
    ltable_path = os.sep.join([BASE_PATH, 'music', 'A.csv'])
    rtable_path = os.sep.join([BASE_PATH, 'music', 'B.csv'])
    # Raising NotImplementedError in setup signals the benchmark runner
    # (asv) to skip this benchmark when the input data is unavailable.
    if not os.path.exists(ltable_path):
        raise NotImplementedError(
            'Left table not found. Skipping benchmark.')
    if not os.path.exists(rtable_path):
        raise NotImplementedError(
            'Right table not found. Skipping benchmark.')
    self.ltable = pd.read_csv(ltable_path, encoding="iso-8859-1")
    self.rtable = pd.read_csv(rtable_path, encoding="iso-8859-1")
    self.l_id_attr = 'Sno'
    self.r_id_attr = 'Sno'
    self.l_join_attr = 'Song_Name'
    self.r_join_attr = 'Song_Name'
    self.delim_tok = DelimiterTokenizer(delim_set=[' '], return_set=True)
def setup(self):
    ltable_path = os.sep.join([BASE_PATH, 'restaurants', 'A.csv'])
    rtable_path = os.sep.join([BASE_PATH, 'restaurants', 'B.csv'])
    if not os.path.exists(ltable_path):
        raise NotImplementedError(
            'Left table not found. Skipping benchmark.')
    if not os.path.exists(rtable_path):
        raise NotImplementedError(
            'Right table not found. Skipping benchmark.')
    self.ltable = pd.read_csv(ltable_path)
    self.rtable = pd.read_csv(rtable_path)
    self.l_id_attr = 'ID'
    self.r_id_attr = 'ID'
    self.l_join_attr = 'NAME'
    self.r_join_attr = 'NAME'
    self.delim_tok = DelimiterTokenizer(delim_set=[' '], return_set=True)
def setUp(self):
    self.dlm = DelimiterTokenizer(delim_set=[' '], return_set=True)
def test_set_sim_join():
    # data to be tested.
    test_scenario_1 = [(os.sep.join(['data', 'table_A.csv']), 'A.ID', 'A.name'),
                       (os.sep.join(['data', 'table_B.csv']), 'B.ID', 'B.name')]
    data = {'TEST_SCENARIO_1': test_scenario_1}

    # similarity measures to be tested.
    sim_measure_types = ['COSINE', 'DICE', 'JACCARD', 'OVERLAP_COEFFICIENT']

    # similarity thresholds to be tested.
    thresholds = {'JACCARD': [0.3, 0.5, 0.7, 0.85, 1],
                  'COSINE': [0.3, 0.5, 0.7, 0.85, 1],
                  'DICE': [0.3, 0.5, 0.7, 0.85, 1],
                  'OVERLAP_COEFFICIENT': [0.3, 0.5, 0.7, 0.85, 1]}

    # tokenizers to be tested.
    tokenizers = {'SPACE_DELIMITER': DelimiterTokenizer(delim_set=[' '],
                                                        return_set=True),
                  '2_GRAM': QgramTokenizer(qval=2, return_set=True),
                  '3_GRAM': QgramTokenizer(qval=3, return_set=True)}

    # Test each combination of similarity measure, threshold and tokenizer
    # for different test scenarios.
    for label, scenario in iteritems(data):
        for sim_measure_type in sim_measure_types:
            for threshold in thresholds.get(sim_measure_type):
                for tok_type, tok in iteritems(tokenizers):
                    test_function = partial(test_valid_join, scenario,
                                            sim_measure_type, (tok, threshold))
                    test_function.description = 'Test ' + sim_measure_type + \
                        ' with ' + str(threshold) + ' threshold and ' + \
                        tok_type + ' tokenizer for ' + label + '.'
                    yield test_function,

    # Test each similarity measure with different comparison operators.
    for sim_measure_type in sim_measure_types:
        for comp_op in ['>', '=']:
            test_function = partial(test_valid_join, test_scenario_1,
                                    sim_measure_type,
                                    (tokenizers['SPACE_DELIMITER'],
                                     0.3, comp_op, False))
            test_function.description = 'Test ' + sim_measure_type + \
                ' with comp_op ' + comp_op + '.'
            yield test_function,

    # Test each similarity measure with allow_missing set to True.
    for sim_measure_type in sim_measure_types:
        test_function = partial(test_valid_join, test_scenario_1,
                                sim_measure_type,
                                (tokenizers['SPACE_DELIMITER'],
                                 0.7, '>=', False, True))
        test_function.description = 'Test ' + sim_measure_type + \
            ' with allow_missing set to True.'
        yield test_function,

    # Test each similarity measure with output attributes added.
    for sim_measure_type in sim_measure_types:
        test_function = partial(test_valid_join, test_scenario_1,
                                sim_measure_type,
                                (tokenizers['SPACE_DELIMITER'], 0.3, '>=',
                                 False, False,
                                 ['A.ID', 'A.birth_year', 'A.zipcode'],
                                 ['B.ID', 'B.name', 'B.zipcode']))
        test_function.description = 'Test ' + sim_measure_type + \
            ' with output attributes.'
        yield test_function,

    # Test each similarity measure with a different output prefix.
    for sim_measure_type in sim_measure_types:
        test_function = partial(test_valid_join, test_scenario_1,
                                sim_measure_type,
                                (tokenizers['SPACE_DELIMITER'], 0.7, '>=',
                                 False, False,
                                 ['A.birth_year', 'A.zipcode'],
                                 ['B.name', 'B.zipcode'],
                                 'ltable.', 'rtable.'))
        test_function.description = 'Test ' + sim_measure_type + \
            ' with output attributes and prefix.'
        yield test_function,

    # Test each similarity measure with output_sim_score disabled.
    for sim_measure_type in sim_measure_types:
        test_function = partial(test_valid_join, test_scenario_1,
                                sim_measure_type,
                                (tokenizers['SPACE_DELIMITER'], 0.7, '>=',
                                 False, False,
                                 ['A.birth_year', 'A.zipcode'],
                                 ['B.name', 'B.zipcode'],
                                 'ltable.', 'rtable.', False))
        test_function.description = 'Test ' + sim_measure_type + \
            ' with sim_score disabled.'
        yield test_function,

    # Test each similarity measure with n_jobs above 1.
    for sim_measure_type in sim_measure_types:
        test_function = partial(test_valid_join, test_scenario_1,
                                sim_measure_type,
                                (tokenizers['SPACE_DELIMITER'], 0.3, '>=',
                                 False, False,
                                 ['A.birth_year', 'A.zipcode'],
                                 ['B.name', 'B.zipcode'],
                                 'ltable.', 'rtable.', False, 2))
        test_function.description = 'Test ' + sim_measure_type + \
            ' with n_jobs above 1.'
        yield test_function,

    # scenario where join attributes are of type int
    test_scenario_2 = [(os.sep.join(['data', 'table_A.csv']),
                        'A.ID', 'A.zipcode'),
                       (os.sep.join(['data', 'table_B.csv']),
                        'B.ID', 'B.zipcode')]

    # Test each similarity measure with join attribute of type int.
    for sim_measure_type in sim_measure_types:
        test_function = partial(test_valid_join, test_scenario_2,
                                sim_measure_type,
                                (tokenizers['2_GRAM'], 0.3), True)
        test_function.description = 'Test ' + sim_measure_type + \
            ' with join attribute of type int.'
        yield test_function,

    # scenario where join attributes are of type float
    test_scenario_3 = [(os.sep.join(['data', 'table_A.csv']),
                        'A.ID', 'A.hourly_wage'),
                       (os.sep.join(['data', 'table_B.csv']),
                        'B.ID', 'B.hourly_wage')]

    # Test each similarity measure with join attribute of type float.
    for sim_measure_type in sim_measure_types:
        test_function = partial(test_valid_join, test_scenario_3,
                                sim_measure_type,
                                (tokenizers['2_GRAM'], 0.3), True)
        test_function.description = 'Test ' + sim_measure_type + \
            ' with join attribute of type float.'
        yield test_function,

    # Test each similarity measure with a tokenizer whose return_set flag
    # is False.
    for sim_measure_type in sim_measure_types:
        tok = QgramTokenizer(2)
        test_function = partial(test_valid_join, test_scenario_1,
                                sim_measure_type, (tok, 0.3))
        test_function.description = 'Test ' + sim_measure_type + \
            ' with a tokenizer with return_set flag set to False.'
        yield test_function,

    # Test each similarity measure with allow_empty set to True.
    for sim_measure_type in sim_measure_types:
        test_function = partial(test_valid_join, test_scenario_1,
                                sim_measure_type,
                                (tokenizers['SPACE_DELIMITER'],
                                 0.7, '>=', True))
        test_function.description = 'Test ' + sim_measure_type + \
            ' with allow_empty set to True.'
        yield test_function,

    # Test each similarity measure with allow_empty set to True and with
    # output attributes.
    for sim_measure_type in sim_measure_types:
        test_function = partial(test_valid_join, test_scenario_1,
                                sim_measure_type,
                                (tokenizers['SPACE_DELIMITER'], 0.7, '>=',
                                 True, False, ['A.name'], ['B.name']))
        test_function.description = 'Test ' + sim_measure_type + \
            ' with allow_empty set to True and with output attributes.'
        yield test_function,
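# A minimal, standalone sketch of the nose-style test-generator pattern used
# above (the names check_join and test_generator are illustrative, not from
# the library): each yielded tuple holds a callable that nose runs as its own
# test case. functools.partial pins the parameters for one combination, and
# the attached .description attribute labels the generated test in the output.
from functools import partial

def check_join(sim_measure_type, threshold):
    # stand-in for test_valid_join; a real check would run the join here
    assert 0 <= threshold <= 1

def test_generator():
    for sim_measure_type in ['JACCARD', 'COSINE']:
        for threshold in [0.3, 0.7]:
            test_function = partial(check_join, sim_measure_type, threshold)
            test_function.description = ('Test ' + sim_measure_type +
                                         ' with threshold ' + str(threshold))
            yield test_function,

# Outside nose, the generated tests can be driven by hand:
#     for test_function, in test_generator():
#         test_function()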
def setup(self):
    tokens = generate_tokens(6, 2, 5000)
    self.ltable = generate_table(5, 1, tokens, 50000, 'id', 'attr')
    self.rtable = generate_table(5, 1, tokens, 50000, 'id', 'attr')
    self.delim_tok = DelimiterTokenizer(delim_set=[' '], return_set=True)
class DelimiterTokenizerTestCases(unittest.TestCase):
    def setUp(self):
        self.delim_tok1 = DelimiterTokenizer()
        self.delim_tok2 = DelimiterTokenizer(set([',']))
        self.delim_tok3 = DelimiterTokenizer(set(['*', '.']))
        self.delim_tok4 = DelimiterTokenizer(set(['..', 'ab']))
        self.delim_tok4_list = DelimiterTokenizer(['..', 'ab', '..'])
        self.delim_tok4_return_set = DelimiterTokenizer(set(['..', 'ab']),
                                                        return_set=True)

    def test_delimiter_valid(self):
        self.assertEqual(self.delim_tok1.tokenize('data science'),
                         ['data', 'science'])
        self.assertEqual(self.delim_tok2.tokenize('data,science'),
                         ['data', 'science'])
        self.assertEqual(self.delim_tok2.tokenize('data science'),
                         ['data science'])
        self.assertEqual(self.delim_tok3.tokenize('ab cd*ef.*bb. gg.'),
                         ['ab cd', 'ef', 'bb', ' gg'])
        self.assertEqual(
            self.delim_tok4.tokenize('ab cd..efabbb....ggab cd..efabgh'),
            [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])
        self.assertEqual(
            self.delim_tok4_list.tokenize('ab cd..efabbb....ggab cd..efabgh'),
            [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])
        self.assertEqual(
            self.delim_tok4_return_set.tokenize(
                'ab cd..efabbb....ggab cd..efabgh'),
            [' cd', 'ef', 'bb', 'gg', 'gh'])

    def test_get_return_set(self):
        self.assertEqual(self.delim_tok4.get_return_set(), False)
        self.assertEqual(self.delim_tok4_return_set.get_return_set(), True)

    def test_get_delim_set(self):
        self.assertSetEqual(self.delim_tok1.get_delim_set(), {' '})
        self.assertSetEqual(self.delim_tok3.get_delim_set(), {'*', '.'})
        self.assertSetEqual(self.delim_tok4_list.get_delim_set(),
                            {'..', 'ab'})

    def test_set_return_set(self):
        tok = DelimiterTokenizer(set(['..', 'ab']))
        self.assertEqual(tok.get_return_set(), False)
        self.assertEqual(tok.tokenize('ab cd..efabbb....ggab cd..efabgh'),
                         [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])
        self.assertEqual(tok.set_return_set(True), True)
        self.assertEqual(tok.get_return_set(), True)
        self.assertEqual(tok.tokenize('ab cd..efabbb....ggab cd..efabgh'),
                         [' cd', 'ef', 'bb', 'gg', 'gh'])
        self.assertEqual(tok.set_return_set(False), True)
        self.assertEqual(tok.get_return_set(), False)
        self.assertEqual(tok.tokenize('ab cd..efabbb....ggab cd..efabgh'),
                         [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])

    def test_set_delim_set(self):
        tok = DelimiterTokenizer(['*', '.'])
        self.assertSetEqual(tok.get_delim_set(), {'*', '.'})
        self.assertEqual(tok.tokenize('ab cd*ef.*bb. gg.'),
                         ['ab cd', 'ef', 'bb', ' gg'])
        self.assertEqual(tok.set_delim_set({'..', 'ab'}), True)
        self.assertSetEqual(tok.get_delim_set(), {'..', 'ab'})
        self.assertEqual(tok.tokenize('ab cd..efabbb....ggab cd..efabgh'),
                         [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])

    @raises(TypeError)
    def test_delimiter_invalid1(self):
        invalid_delim_tok = DelimiterTokenizer(set([',', 10]))

    @raises(TypeError)
    def test_delimiter_invalid2(self):
        self.delim_tok1.tokenize(None)

    @raises(TypeError)
    def test_delimiter_invalid3(self):
        self.delim_tok1.tokenize(99)
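# A minimal usage sketch distilled from the assertions above, assuming
# DelimiterTokenizer is exposed at py_stringmatching's top level as in its
# docs: with return_set=False (the default) the tokenizer keeps duplicate
# tokens in order; with return_set=True each distinct token appears once.
from py_stringmatching import DelimiterTokenizer

tok = DelimiterTokenizer(delim_set=set(['..', 'ab']))
print(tok.tokenize('ab cd..efabbb....ggab cd..efabgh'))
# -> [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh']

tok.set_return_set(True)
print(tok.tokenize('ab cd..efabbb....ggab cd..efabgh'))
# -> [' cd', 'ef', 'bb', 'gg', 'gh']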