class PrefixFilterTestCase(unittest.TestCase):
    """Exercise PrefixFilter.apply_filter and PrefixFilter.find_candidates.

    Every test runs against a PrefixFilter constructed with threshold 0.8
    whose index has been built in setUp.
    """

    # Left-hand token list shared by all apply_filter cases.
    _LEFT_TOKENS = ('aa', 'bb', 'cd', 'ef', 'fg')

    def setUp(self):
        self.prefix_filter = PrefixFilter(A, A_tokenized, 'str', tok, 0.8,
                                          token_ordering)
        self.prefix_filter.build_index()

    def _apply_to_left(self, right_tokens):
        """Order both token lists and return apply_filter's result."""
        l_ordered = order_using_token_ordering(list(self._LEFT_TOKENS),
                                               token_ordering)
        r_ordered = order_using_token_ordering(right_tokens, token_ordering)
        return self.prefix_filter.apply_filter(
            l_ordered, r_ordered, len(l_ordered), len(r_ordered), 0.8)

    def _candidates_for(self, probe_tokens):
        """Order the probe tokens and return find_candidates' result."""
        ordered = order_using_token_ordering(probe_tokens, token_ordering)
        return self.prefix_filter.find_candidates(ordered, len(ordered), 0.8)

    def test_apply_filter(self):
        # pairs for which apply_filter is expected to return a true value
        self.assertTrue(self._apply_to_left(['fg', 'cd', 'aa']))
        self.assertTrue(self._apply_to_left(['aa']))

        # pair for which apply_filter is expected to return a false value
        self.assertFalse(self._apply_to_left(['fg']))

        # an empty token list on either side must yield a false value;
        # the same two ordered lists are reused with the arguments swapped
        full = order_using_token_ordering(list(self._LEFT_TOKENS),
                                          token_ordering)
        empty = order_using_token_ordering([], token_ordering)
        self.assertFalse(
            self.prefix_filter.apply_filter(full, empty,
                                            len(full), len(empty), 0.8))
        self.assertFalse(
            self.prefix_filter.apply_filter(empty, full,
                                            len(empty), len(full), 0.8))

    def test_find_candidates(self):
        # default case: some candidates are found
        self.assertSetEqual(self._candidates_for(['aa', 'ef', 'lp']), {0, 3})

        # probe tokens for which no candidate is expected
        self.assertSetEqual(self._candidates_for(['op', 'lp', 'mp']), set())

        # empty list of probe tokens
        self.assertSetEqual(self._candidates_for([]), set())
def _edit_distance_join_split(ltable_list, rtable_list,
                              l_columns, r_columns,
                              l_key_attr, r_key_attr,
                              l_join_attr, r_join_attr,
                              tokenizer, threshold, comp_op,
                              l_out_attrs, r_out_attrs,
                              l_out_prefix, r_out_prefix,
                              out_sim_score, show_progress):
    """Perform edit distance join for a split of ltable and rtable.

    A prefix index is built over the left join column. Each right row is
    tokenized and used to probe that index, and every candidate that passes
    a length check has its similarity-function score computed and compared
    against `threshold` with the comparison selected by `comp_op`.

    Args:
        ltable_list, rtable_list: sequences of rows; each row is indexed by
            column position.
        l_columns, r_columns: column-name lists used to resolve positions.
        l_key_attr, r_key_attr: names of the key columns.
        l_join_attr, r_join_attr: names of the columns being joined.
        tokenizer: object providing `tokenize(string)`.
        threshold: value the computed score is compared against.
        comp_op: key into COMP_OP_MAP selecting the comparison function.
        l_out_attrs, r_out_attrs: extra output columns, or None.
        l_out_prefix, r_out_prefix: prefixes passed on to the header builder.
        out_sim_score: when true, append the score to each output row and
            "_sim_score" to the header.
        show_progress: when true, update a pyprind progress bar per right row.

    Returns:
        pandas.DataFrame holding one row per matching (left, right) pair.

    Raises:
        ValueError: if a key or join attribute is absent from its column list.
    """
    # ltable: positions of the key, join and output columns
    l_key_pos = l_columns.index(l_key_attr)
    l_join_pos = l_columns.index(l_join_attr)
    l_out_positions = find_output_attribute_indices(l_columns, l_out_attrs)

    # rtable: positions of the key, join and output columns
    r_key_pos = r_columns.index(r_key_attr)
    r_join_pos = r_columns.index(r_join_attr)
    r_out_positions = find_output_attribute_indices(r_columns, r_out_attrs)

    measure = 'EDIT_DISTANCE'

    # token ordering derived from the join columns of both tables
    ordering = gen_token_ordering_for_tables(
        [ltable_list, rtable_list],
        [l_join_pos, r_join_pos],
        tokenizer, measure)

    # lengths of the left join values, computed once up front
    l_lengths = [len(l_row[l_join_pos]) for l_row in ltable_list]

    # prefix index over the left join column
    index = PrefixIndex(ltable_list, l_join_pos, tokenizer, measure,
                        threshold, ordering)
    index.build(False)

    prefix_filter = PrefixFilter(tokenizer, measure, threshold)

    satisfies = COMP_OP_MAP[comp_op]
    distance_fn = get_sim_function(measure)

    matches = []
    wants_out_attrs = not (l_out_attrs is None and r_out_attrs is None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable_list))

    for r_row in rtable_list:
        r_value = r_row[r_join_pos]
        r_len = len(r_value)

        probe = order_using_token_ordering(tokenizer.tokenize(r_value),
                                           ordering)

        # candidates come from probing the prefix index
        for l_pos in prefix_filter.find_candidates(probe, index):
            # cheap length check before computing the distance: the edit
            # distance is never smaller than the length difference
            if not (r_len - threshold <= l_lengths[l_pos]
                    <= r_len + threshold):
                continue

            l_row = ltable_list[l_pos]

            # compute the actual edit distance
            distance = distance_fn(l_row[l_join_pos], r_value)
            if not satisfies(distance, threshold):
                continue

            if wants_out_attrs:
                record = get_output_row_from_tables(
                    l_row, r_row,
                    l_key_pos, r_key_pos,
                    l_out_positions, r_out_positions)
            else:
                record = [l_row[l_key_pos], r_row[r_key_pos]]

            # the score is the last field of the record when requested
            if out_sim_score:
                record.append(distance)

            matches.append(record)

        if show_progress:
            prog_bar.update()

    header = get_output_header_from_tables(
        l_key_attr, r_key_attr,
        l_out_attrs, r_out_attrs,
        l_out_prefix, r_out_prefix)
    if out_sim_score:
        header.append("_sim_score")

    # turn the collected rows into a dataframe
    return pd.DataFrame(matches, columns=header)
def _edit_distance_join_split(ltable_list, rtable_list,
                              l_columns, r_columns,
                              l_key_attr, r_key_attr,
                              l_join_attr, r_join_attr,
                              tokenizer, threshold, comp_op,
                              l_out_attrs, r_out_attrs,
                              l_out_prefix, r_out_prefix,
                              out_sim_score, show_progress):
    """Perform edit distance join for a split of ltable and rtable.

    Steps, in order: resolve column positions, derive a token ordering from
    both join columns, build a prefix index on the left join column, then
    probe it with every right row. Candidates passing the length check are
    scored with the similarity function and kept when
    COMP_OP_MAP[comp_op](score, threshold) is true.

    Returns a pandas DataFrame whose columns come from
    get_output_header_from_tables, followed by "_sim_score" when
    `out_sim_score` is set.
    """
    # column indices of key attr, join attr and output attrs in ltable
    l_key_col = l_columns.index(l_key_attr)
    l_join_col = l_columns.index(l_join_attr)
    l_out_cols = find_output_attribute_indices(l_columns, l_out_attrs)

    # column indices of key attr, join attr and output attrs in rtable
    r_key_col = r_columns.index(r_key_attr)
    r_join_col = r_columns.index(r_join_attr)
    r_out_cols = find_output_attribute_indices(r_columns, r_out_attrs)

    sim_type = 'EDIT_DISTANCE'

    # token ordering built from the tokens of l_join_attr and r_join_attr
    token_order = gen_token_ordering_for_tables(
        [ltable_list, rtable_list], [l_join_col, r_join_col],
        tokenizer, sim_type)

    # cached l_join_attr lengths, addressed by row position in ltable_list
    cached_l_lengths = [len(row[l_join_col]) for row in ltable_list]

    # prefix index on l_join_attr
    prefix_index = PrefixIndex(ltable_list, l_join_col, tokenizer,
                               sim_type, threshold, token_order)
    prefix_index.build(False)

    prefix_filter = PrefixFilter(tokenizer, sim_type, threshold)

    compare = COMP_OP_MAP[comp_op]
    score_fn = get_sim_function(sim_type)

    result_rows = []
    include_out_attrs = (l_out_attrs is not None or
                         r_out_attrs is not None)

    def build_row(l_row, r_row, score):
        """Assemble one output record for a matching pair."""
        if include_out_attrs:
            row = get_output_row_from_tables(
                l_row, r_row, l_key_col, r_key_col,
                l_out_cols, r_out_cols)
        else:
            row = [l_row[l_key_col], r_row[r_key_col]]
        # if out_sim_score flag is set, the score goes last in the record
        if out_sim_score:
            row.append(score)
        return row

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable_list))

    for r_row in rtable_list:
        r_text = r_row[r_join_col]
        r_text_len = len(r_text)

        r_tokens = order_using_token_ordering(
            tokenizer.tokenize(r_text), token_order)

        # obtain candidates by applying prefix filter
        candidate_ids = prefix_filter.find_candidates(r_tokens, prefix_index)

        for cand_id in candidate_ids:
            # only score candidates whose length lies within `threshold`
            # of the probe string's length
            if (r_text_len - threshold
                    <= cached_l_lengths[cand_id]
                    <= r_text_len + threshold):
                l_row = ltable_list[cand_id]

                # compute the actual edit distance
                score = score_fn(l_row[l_join_col], r_text)

                if compare(score, threshold):
                    result_rows.append(build_row(l_row, r_row, score))

        if show_progress:
            prog_bar.update()

    columns = get_output_header_from_tables(l_key_attr, r_key_attr,
                                            l_out_attrs, r_out_attrs,
                                            l_out_prefix, r_out_prefix)
    if out_sim_score:
        columns.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(result_rows, columns=columns)
    return output_table