def jaccard_join_auto(ltable, rtable, l_id_attr, l_join_attr,
                      r_id_attr, r_join_attr, threshold,
                      ltable_output_attrs=None, rtable_output_attrs=None):
    """Join ltable and rtable, keeping pairs whose Jaccard similarity on the
    join attributes is >= threshold.

    Args:
        ltable, rtable: pandas DataFrames to join.
        l_id_attr, r_id_attr: key attribute names in ltable / rtable.
        l_join_attr, r_join_attr: attributes whose values are compared.
        threshold: minimum Jaccard similarity for a pair to be output.
        ltable_output_attrs, rtable_output_attrs: optional lists of extra
            attributes to project into the output.

    Returns:
        pandas DataFrame of matching pairs (one row per match).
    """
    sim_function = get_jaccard_fn()
    token_ordering = gen_token_ordering(ltable, l_join_attr)

    # Index ltable prefixes so candidate generation avoids the full
    # cross product.
    position_filter = PositionFilter(ltable, l_id_attr, l_join_attr,
                                     threshold, token_ordering,
                                     adaptive_prefix=True)
    position_filter.build_index()

    prog_bar = pyprind.ProgBar(len(rtable.index))

    # Cache rows keyed by id so candidate verification does not re-scan
    # the DataFrames.
    l_row_dict = {l_row[l_id_attr]: l_row for _, l_row in ltable.iterrows()}
    r_row_dict = {r_row[r_id_attr]: r_row for _, r_row in rtable.iterrows()}

    matches_list = []
    for r_id, r_row in r_row_dict.items():
        r_tokens = order_using_token_ordering(list(r_row[r_join_attr]),
                                              token_ordering)
        l_cand_ids = position_filter.find_candidates(r_tokens, len(r_tokens),
                                                     threshold)
        for l_id in l_cand_ids:
            l_row = l_row_dict[l_id]
            # Candidates only pass the filter; verify with the exact
            # similarity measure before emitting a match.
            if sim_function(l_row[l_join_attr],
                            r_row[r_join_attr]) >= threshold:
                matches_list.append(get_output_attributes(
                    l_row, r_row, l_id_attr, l_id, r_id_attr, r_id,
                    ltable_output_attrs, rtable_output_attrs))
        prog_bar.update()

    return pd.DataFrame(matches_list)
def set_sim_join(ltable, rtable, l_columns, r_columns,
                 l_key_attr, r_key_attr, l_join_attr, r_join_attr,
                 tokenizer, sim_measure_type, threshold, comp_op,
                 allow_empty, l_out_attrs, r_out_attrs,
                 l_out_prefix, r_out_prefix,
                 out_sim_score, show_progress):
    """Perform set similarity join for a split of ltable and rtable.

    Returns a pandas DataFrame of the matching record pairs, with an
    optional '_sim_score' column when out_sim_score is set.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # generate token ordering using tokens in l_join_attr and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
        [ltable, rtable],
        [l_join_attr_index, r_join_attr_index],
        tokenizer, sim_measure_type)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable, l_join_attr_index,
                                   tokenizer, sim_measure_type,
                                   threshold, token_ordering)
    # While building the index, we cache the tokens and the empty records.
    # The tokens are cached so that strings in l_join_attr need not be
    # tokenized again when computing the similarity measure. The empty
    # record ids are cached to handle the allow_empty flag.
    cached_data = position_index.build(allow_empty, cache_tokens=True)
    l_empty_records = cached_data['empty_records']
    cached_l_tokens = cached_data['cached_tokens']

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)

    sim_fn = get_sim_function(sim_measure_type)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable:
        r_string = r_row[r_join_attr_index]

        # order the tokens using the token ordering.
        r_ordered_tokens = order_using_token_ordering(
            tokenizer.tokenize(r_string), token_ordering)

        # If allow_empty flag is set and the current rtable record has an
        # empty set of tokens in the join attribute, then generate output
        # pairs joining the current rtable record with those ltable records
        # having an empty set of tokens in the join attribute. These ltable
        # record ids were cached in l_empty_records while building the
        # position index.
        if allow_empty and len(r_ordered_tokens) == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                        ltable[l_id], r_row,
                        l_key_attr_index, r_key_attr_index,
                        l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]
                if out_sim_score:
                    # Two empty sets are treated as a perfect match.
                    output_row.append(1.0)
                output_rows.append(output_row)
            continue

        # obtain candidates by applying position filter.
        candidate_overlap = pos_filter.find_candidates(r_ordered_tokens,
                                                       position_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = cached_l_tokens[cand]
                # compute the actual similarity score
                sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)

                if comp_fn(sim_score, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                            ltable[cand], r_row,
                            l_key_attr_index, r_key_attr_index,
                            l_out_attrs_indices, r_out_attrs_indices)
                    else:
                        output_row = [ltable[cand][l_key_attr_index],
                                      r_row[r_key_attr_index]]

                    # if out_sim_score flag is set, append the similarity
                    # score to the output record.
                    if out_sim_score:
                        output_row.append(sim_score)

                    output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def set_sim_join(ltable, rtable, l_columns, r_columns,
                 l_key_attr, r_key_attr, l_join_attr, r_join_attr,
                 tokenizer, sim_measure_type, threshold, comp_op,
                 allow_empty, l_out_attrs, r_out_attrs,
                 l_out_prefix, r_out_prefix,
                 out_sim_score, show_progress):
    """Perform set similarity join for a split of ltable and rtable.

    Returns a pandas DataFrame of the matching record pairs, with an
    optional '_sim_score' column when out_sim_score is set.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # generate token ordering using tokens in l_join_attr and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
        [ltable, rtable],
        [l_join_attr_index, r_join_attr_index],
        tokenizer, sim_measure_type)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable, l_join_attr_index,
                                   tokenizer, sim_measure_type,
                                   threshold, token_ordering)
    # While building the index, we cache the tokens and the empty records.
    # The tokens are cached so that strings in l_join_attr need not be
    # tokenized again when computing the similarity measure. The empty
    # record ids are cached to handle the allow_empty flag.
    cached_data = position_index.build(allow_empty, cache_tokens=True)
    l_empty_records = cached_data['empty_records']
    cached_l_tokens = cached_data['cached_tokens']

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)

    sim_fn = get_sim_function(sim_measure_type)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable))

    for r_row in rtable:
        r_string = r_row[r_join_attr_index]

        # order the tokens using the token ordering.
        r_ordered_tokens = order_using_token_ordering(
            tokenizer.tokenize(r_string), token_ordering)

        # If allow_empty flag is set and the current rtable record has an
        # empty set of tokens in the join attribute, then generate output
        # pairs joining the current rtable record with those ltable records
        # having an empty set of tokens in the join attribute. These ltable
        # record ids were cached in l_empty_records while building the
        # position index.
        if allow_empty and len(r_ordered_tokens) == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                        ltable[l_id], r_row,
                        l_key_attr_index, r_key_attr_index,
                        l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]
                if out_sim_score:
                    # Two empty sets are treated as a perfect match.
                    output_row.append(1.0)
                output_rows.append(output_row)
            continue

        # obtain candidates by applying position filter.
        candidate_overlap = pos_filter.find_candidates(r_ordered_tokens,
                                                       position_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = cached_l_tokens[cand]
                # compute the actual similarity score
                sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)

                if comp_fn(sim_score, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                            ltable[cand], r_row,
                            l_key_attr_index, r_key_attr_index,
                            l_out_attrs_indices, r_out_attrs_indices)
                    else:
                        output_row = [ltable[cand][l_key_attr_index],
                                      r_row[r_key_attr_index]]

                    # if out_sim_score flag is set, append the similarity
                    # score to the output record.
                    if out_sim_score:
                        output_row.append(sim_score)

                    output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
class PositionFilterTestCase(unittest.TestCase):
    """Unit tests for PositionFilter.apply_filter and find_candidates."""

    def setUp(self):
        # Build a fresh filter over the module-level fixture table A
        # before every test.
        self.position_filter = PositionFilter(A, A_tokenized, 'str', tok,
                                              0.8, token_ordering)
        self.position_filter.build_index()

    def _filtered(self, probe, indexed):
        # Helper: run apply_filter on two ordered token lists.
        ordered_probe = order_using_token_ordering(probe, token_ordering)
        ordered_indexed = order_using_token_ordering(indexed, token_ordering)
        return self.position_filter.apply_filter(
            ordered_probe, ordered_indexed,
            len(ordered_probe), len(ordered_indexed), 0.8)

    def test_apply_filter(self):
        full = ['aa', 'bb', 'cd', 'ef', 'fg']

        # A pair with large overlap passes the position filter.
        self.assertTrue(self._filtered(full, ['fg', 'cd', 'aa', 'ef']))

        # A single-token probe cannot satisfy the filter.
        self.assertFalse(self._filtered(full, ['fg']))

        # The pair passes the prefix filter but not the position filter.
        self.assertFalse(self._filtered(full, ['aa']))

        # Empty token lists fail regardless of argument order.
        self.assertFalse(self._filtered(full, []))
        self.assertFalse(self._filtered([], full))

    def _candidates(self, tokens):
        # Helper: order tokens and fetch candidate ids.
        ordered = order_using_token_ordering(tokens, token_ordering)
        return self.position_filter.find_candidates(ordered, len(ordered),
                                                    0.8)

    def test_find_candidates(self):
        # Default case: candidates are present.
        self.assertSetEqual(self._candidates(['aa', 'ef', 'ab', 'cd']),
                            set([0, 3]))

        # No token overlaps the index: empty candidate set.
        self.assertSetEqual(self._candidates(['op', 'lp', 'mp']), set())

        # Prefix index would return 2 candidates, but the position
        # information prunes both.
        self.assertSetEqual(self._candidates(['aa', 'ef', 'lp']), set())

        # Empty list of probe tokens yields no candidates.
        self.assertSetEqual(self._candidates([]), set())