def overlap_join(ltable, rtable, l_key_attr, r_key_attr, l_join_attr,
                 r_join_attr, tokenizer, threshold, l_out_attrs=None,
                 r_out_attrs=None, l_out_prefix='l_', r_out_prefix='r_',
                 out_sim_score=True, n_jobs=1):
    """Join two tables using overlap similarity measure.

    Finds tuple pairs from ltable and rtable such that
    Overlap(ltable.l_join_attr, rtable.r_join_attr) >= threshold

    Args:
        ltable, rtable : Pandas data frame
        l_key_attr, r_key_attr : String, key attribute from ltable and rtable
        l_join_attr, r_join_attr : String, join attribute from ltable and
            rtable
        tokenizer : Tokenizer object, tokenizer to be used to tokenize join
            attributes
        threshold : float, overlap threshold to be satisfied
        l_out_attrs, r_out_attrs : list of attributes to be included in the
            output table from ltable and rtable
        l_out_prefix, r_out_prefix : String, prefix to be used in the
            attribute names of the output table
        out_sim_score : boolean, indicates if similarity score needs to be
            included in the output table

    Returns:
        result : Pandas data frame
    """
    # The join reduces to running an overlap filter over both tables with
    # the same threshold.
    join_filter = OverlapFilter(tokenizer, threshold)
    return join_filter.filter_tables(ltable, rtable,
                                     l_key_attr, r_key_attr,
                                     l_join_attr, r_join_attr,
                                     l_out_attrs, r_out_attrs,
                                     l_out_prefix, r_out_prefix,
                                     out_sim_score, n_jobs)
def test_apply_matcher_with_join_attr_of_type_int(self):
    """apply_matcher should handle join attributes of integer type."""
    tok = QgramTokenizer(qval=2, return_set=True)
    sim_func = get_sim_function('JACCARD')
    threshold = 0.3
    comp_op = '>='
    l_join_attr = 'A.zipcode'
    r_join_attr = 'B.zipcode'

    # Score every pair in the cartesian product to derive the expected
    # set of pairs satisfying the threshold.
    cartprod = self.cartprod
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_func(tok.tokenize(str(row[l_join_attr])),
                             tok.tokenize(str(row[r_join_attr]))),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]
    expected_pairs = set()
    for _, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), threshold):
            expected_pairs.add(','.join((str(row[self.l_key_attr]),
                                         str(row[self.r_key_attr]))))

    # Obtain a candidate set via the overlap filter.
    candset = OverlapFilter(tok, 1, comp_op).filter_tables(
        self.ltable, self.rtable, self.l_key_attr, self.r_key_attr,
        l_join_attr, r_join_attr)

    # Run a jaccard matcher over the candidate set.
    output_candset = apply_matcher(
        candset,
        DEFAULT_L_OUT_PREFIX + self.l_key_attr,
        DEFAULT_R_OUT_PREFIX + self.r_key_attr,
        self.ltable, self.rtable, self.l_key_attr, self.r_key_attr,
        l_join_attr, r_join_attr, tok, sim_func, threshold)

    # The output table must carry exactly the id columns plus the score.
    assert_list_equal(list(output_candset.columns.values),
                      ['_id',
                       DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                       DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                       '_sim_score'])

    actual_pairs = set()
    for _, row in output_candset.iterrows():
        actual_pairs.add(','.join(
            (str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
             str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

    # The matcher output must coincide with the expected pairs.
    assert_equal(len(expected_pairs), len(actual_pairs))
    assert_equal(len(actual_pairs.intersection(expected_pairs)),
                 len(expected_pairs))
def test_apply_matcher(self):
    """apply_matcher should keep exactly the pairs above the threshold."""
    tok = QgramTokenizer(qval=2, return_set=True)
    sim_func = get_sim_function('JACCARD')
    threshold = 0.3
    comp_op = '>='

    # Score every pair in the cartesian product to derive the expected
    # set of pairs satisfying the threshold.
    cartprod = self.cartprod
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_func(tok.tokenize(str(row[self.l_join_attr])),
                             tok.tokenize(str(row[self.r_join_attr]))),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]
    expected_pairs = set()
    for _, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), threshold):
            expected_pairs.add(','.join((str(row[self.l_key_attr]),
                                         str(row[self.r_key_attr]))))

    # Obtain a candidate set via the overlap filter.
    candset = OverlapFilter(tok, 1, comp_op).filter_tables(
        self.ltable, self.rtable, self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr)

    # Run a jaccard matcher over the candidate set, projecting the join
    # attributes into the output.
    output_candset = apply_matcher(
        candset,
        DEFAULT_L_OUT_PREFIX + self.l_key_attr,
        DEFAULT_R_OUT_PREFIX + self.r_key_attr,
        self.ltable, self.rtable, self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr, tok, sim_func, threshold,
        comp_op, False, [self.l_join_attr], [self.r_join_attr],
        out_sim_score=True)

    # The output must carry the id columns, the projected join
    # attributes, and the similarity score column, in that order.
    assert_list_equal(list(output_candset.columns.values),
                      ['_id',
                       DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                       DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                       DEFAULT_L_OUT_PREFIX + self.l_join_attr,
                       DEFAULT_R_OUT_PREFIX + self.r_join_attr,
                       '_sim_score'])

    actual_pairs = set()
    for _, row in output_candset.iterrows():
        actual_pairs.add(','.join(
            (str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
             str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

    # The matcher output must coincide with the expected pairs.
    assert_equal(len(expected_pairs), len(actual_pairs))
    assert_equal(len(actual_pairs.intersection(expected_pairs)),
                 len(expected_pairs))
def test_candset_with_numeric_r_filter_attr(self):
    """Filter a candset whose rtable filter attribute is numeric."""
    A = pd.DataFrame([{'l_id': 1, 'l_attr': '1990'}])
    B = pd.DataFrame([{'r_id': 1, 'r_attr': 2001}])

    # Build the cartesian product of A and B via a constant join key.
    A['tmp_join_key'] = 1
    B['tmp_join_key'] = 1
    # Fix: pass axis as a keyword. The positional form
    # DataFrame.drop(label, 1) was deprecated and removed in pandas 2.0.
    C = pd.merge(A[['l_id', 'tmp_join_key']],
                 B[['r_id', 'tmp_join_key']],
                 on='tmp_join_key').drop('tmp_join_key', axis=1)

    qg2_tok = QgramTokenizer(2, return_set=True)
    overlap_filter = OverlapFilter(qg2_tok)
    # 'r_attr' is numeric (2001) — exercise filter_candset with it.
    overlap_filter.filter_candset(C, 'l_id', 'r_id', A, B,
                                  'l_id', 'r_id', 'l_attr', 'r_attr')
def test_candset_with_numeric_r_filter_attr(self):
    """Filter a candset whose rtable filter attribute is numeric."""
    A = pd.DataFrame([{'l_id': 1, 'l_attr': '1990'}])
    B = pd.DataFrame([{'r_id': 1, 'r_attr': 2001}])

    # Build the cartesian product of A and B via a constant join key.
    A['tmp_join_key'] = 1
    B['tmp_join_key'] = 1
    # Fix: pass axis as a keyword. The positional form
    # DataFrame.drop(label, 1) was deprecated and removed in pandas 2.0.
    C = pd.merge(A[['l_id', 'tmp_join_key']],
                 B[['r_id', 'tmp_join_key']],
                 on='tmp_join_key').drop('tmp_join_key', axis=1)

    qg2_tok = QgramTokenizer(2, return_set=True)
    overlap_filter = OverlapFilter(qg2_tok)
    # 'r_attr' is numeric (2001) — exercise filter_candset with it.
    overlap_filter.filter_candset(C, 'l_id', 'r_id', A, B,
                                  'l_id', 'r_id', 'l_attr', 'r_attr')
def test_filter_tables(self, tokenizer, overlap_size, comp_op, allow_missing,
                       args, expected_pairs):
    """Parameterized check of OverlapFilter.filter_tables output."""
    overlap_filter = OverlapFilter(tokenizer, overlap_size, comp_op,
                                   allow_missing)
    actual_candset = overlap_filter.filter_tables(*args)

    # Reconstruct the column names the output table must carry, driven by
    # which optional arguments were present in args.
    l_out_prefix = args[8] if len(args) > 8 else self.default_l_out_prefix
    r_out_prefix = args[9] if len(args) > 9 else self.default_r_out_prefix

    expected_output_attrs = ['_id',
                             l_out_prefix + args[2],
                             r_out_prefix + args[3]]

    # Projected output attributes from the left table, if any.
    if len(args) > 6 and args[6]:
        for attr in remove_redundant_attrs(args[6], args[2]):
            expected_output_attrs.append(l_out_prefix + attr)

    # Projected output attributes from the right table, if any.
    if len(args) > 7 and args[7]:
        for attr in remove_redundant_attrs(args[7], args[3]):
            expected_output_attrs.append(r_out_prefix + attr)

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    # Collect the surviving (l_key, r_key) pairs.
    actual_pairs = set()
    for _, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + args[2]]),
                                   str(row[r_out_prefix + args[3]]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    assert_equal(len(actual_pairs.intersection(expected_pairs)),
                 len(expected_pairs))
def test_filter_tables(self, tokenizer, overlap_size, comp_op, allow_missing,
                       args, expected_pairs):
    """Parameterized check of OverlapFilter.filter_tables output."""
    overlap_filter = OverlapFilter(tokenizer, overlap_size, comp_op,
                                   allow_missing)
    actual_candset = overlap_filter.filter_tables(*args)

    # Reconstruct the column names the output table must carry, driven by
    # which optional arguments were present in args.
    l_out_prefix = args[8] if len(args) > 8 else self.default_l_out_prefix
    r_out_prefix = args[9] if len(args) > 9 else self.default_r_out_prefix

    expected_output_attrs = ['_id',
                             l_out_prefix + args[2],
                             r_out_prefix + args[3]]

    # Projected output attributes from the left table, if any.
    if len(args) > 6 and args[6]:
        for attr in remove_redundant_attrs(args[6], args[2]):
            expected_output_attrs.append(l_out_prefix + attr)

    # Projected output attributes from the right table, if any.
    if len(args) > 7 and args[7]:
        for attr in remove_redundant_attrs(args[7], args[3]):
            expected_output_attrs.append(r_out_prefix + attr)

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    # Collect the surviving (l_key, r_key) pairs.
    actual_pairs = set()
    for _, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + args[2]]),
                                   str(row[r_out_prefix + args[3]]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    assert_equal(len(actual_pairs.intersection(expected_pairs)),
                 len(expected_pairs))
def test_filter_candset(self, tokenizer, overlap_size, args, expected_pairs):
    """Parameterized check of OverlapFilter.filter_candset output."""
    overlap_filter = OverlapFilter(tokenizer, overlap_size)
    filtered = overlap_filter.filter_candset(*args)

    # The filtered candset must keep exactly the input candset's columns.
    assert_list_equal(list(filtered.columns.values),
                      list(args[0].columns.values))

    # Collect the surviving (l_key, r_key) pairs.
    actual_pairs = {','.join((str(row[args[1]]), str(row[args[2]])))
                    for _, row in filtered.iterrows()}

    # The surviving pairs must match the expected pairs exactly.
    assert_equal(len(expected_pairs), len(actual_pairs))
    assert_equal(len(actual_pairs.intersection(expected_pairs)),
                 len(expected_pairs))
def test_filter_candset(self, tokenizer, overlap_size, comp_op, allow_missing,
                        args, expected_pairs):
    """Parameterized check of OverlapFilter.filter_candset output."""
    overlap_filter = OverlapFilter(tokenizer, overlap_size, comp_op,
                                   allow_missing)
    filtered = overlap_filter.filter_candset(*args)

    # The filtered candset must keep exactly the input candset's columns.
    assert_list_equal(list(filtered.columns.values),
                      list(args[0].columns.values))

    # Collect the surviving (l_key, r_key) pairs.
    actual_pairs = {','.join((str(row[args[1]]), str(row[args[2]])))
                    for _, row in filtered.iterrows()}

    # The surviving pairs must match the expected pairs exactly.
    assert_equal(len(expected_pairs), len(actual_pairs))
    assert_equal(len(actual_pairs.intersection(expected_pairs)),
                 len(expected_pairs))
def _overlap_coefficient_join_split(
        ltable_list, rtable_list, l_columns, r_columns,
        l_key_attr, r_key_attr, l_join_attr, r_join_attr,
        tokenizer, threshold, comp_op, allow_empty,
        l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix,
        out_sim_score, show_progress):
    """Perform overlap coefficient join for a split of ltable and rtable.

    Builds an inverted index over the ltable split's join-attribute tokens
    and probes it with each rtable row. A pair is emitted whenever
    overlap / min(|l_tokens|, |r_tokens|) satisfies `comp_op` against
    `threshold`.

    Args:
        ltable_list, rtable_list: table splits as lists of row sequences.
        l_columns, r_columns: column names of the projected splits.
        l_key_attr, r_key_attr: key attribute names.
        l_join_attr, r_join_attr: join attribute names.
        tokenizer: tokenizer applied to the join attribute values.
        threshold: overlap coefficient threshold.
        comp_op: comparison operator string (a key of COMP_OP_MAP).
        allow_empty: if True, records whose join attribute tokenizes to the
            empty set are paired with each other and scored 1.0.
        l_out_attrs, r_out_attrs: optional attributes to project into the
            output (None for none).
        l_out_prefix, r_out_prefix: prefixes for output column names.
        out_sim_score: if True, append a '_sim_score' column.
        show_progress: if True, display a progress bar over rtable rows.

    Returns:
        pandas DataFrame of qualifying pairs.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # Build inverted index over ltable. cache_size_flag makes the index
    # remember each record's token-set size, needed for the denominator of
    # the overlap coefficient below.
    inverted_index = InvertedIndex(ltable_list, l_join_attr_index,
                                   tokenizer, cache_size_flag=True)
    # While building the index, we cache the record ids with empty set of
    # tokens. This is needed to handle the allow_empty flag.
    cached_data = inverted_index.build(allow_empty)
    l_empty_records = cached_data['empty_records']

    overlap_filter = OverlapFilter(tokenizer, 1)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable_list))

    for r_row in rtable_list:
        r_string = r_row[r_join_attr_index]
        r_join_attr_tokens = tokenizer.tokenize(r_string)
        r_num_tokens = len(r_join_attr_tokens)

        # If allow_empty flag is set and the current rtable record has empty
        # set of tokens in the join attribute, then generate output pairs
        # joining the current rtable record with those records in ltable with
        # empty set of tokens in the join attribute. These ltable record ids
        # are cached in l_empty_records list which was constructed when
        # building the inverted index.
        if allow_empty and r_num_tokens == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                        ltable_list[l_id], r_row,
                        l_key_attr_index, r_key_attr_index,
                        l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable_list[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]
                if out_sim_score:
                    # Two empty token sets are considered identical.
                    output_row.append(1.0)
                output_rows.append(output_row)
            continue

        # probe inverted index and find overlap of candidates
        candidate_overlap = overlap_filter.find_candidates(
            r_join_attr_tokens, inverted_index)

        for cand, overlap in iteritems(candidate_overlap):
            # compute the actual similarity score
            sim_score = (float(overlap) /
                         float(min(r_num_tokens,
                                   inverted_index.size_cache[cand])))

            if comp_fn(sim_score, threshold):
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                        ltable_list[cand], r_row,
                        l_key_attr_index, r_key_attr_index,
                        l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable_list[cand][l_key_attr_index],
                                  r_row[r_key_attr_index]]

                # if out_sim_score flag is set, append the overlap
                # coefficient score to the output record.
                if out_sim_score:
                    output_row.append(sim_score)

                output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def _overlap_coefficient_join_split(ltable_list, rtable_list,
                                    l_columns, r_columns,
                                    l_key_attr, r_key_attr,
                                    l_join_attr, r_join_attr,
                                    tokenizer, threshold, comp_op,
                                    allow_empty,
                                    l_out_attrs, r_out_attrs,
                                    l_out_prefix, r_out_prefix,
                                    out_sim_score, show_progress):
    """Perform overlap coefficient join for a split of ltable and rtable.

    Builds an inverted index over the ltable split's join-attribute tokens
    and probes it with each rtable row. A pair is emitted whenever
    overlap / min(|l_tokens|, |r_tokens|) satisfies `comp_op` against
    `threshold`.

    Args:
        ltable_list, rtable_list: table splits as lists of row sequences.
        l_columns, r_columns: column names of the projected splits.
        l_key_attr, r_key_attr: key attribute names.
        l_join_attr, r_join_attr: join attribute names.
        tokenizer: tokenizer applied to the join attribute values.
        threshold: overlap coefficient threshold.
        comp_op: comparison operator string (a key of COMP_OP_MAP).
        allow_empty: if True, records whose join attribute tokenizes to the
            empty set are paired with each other and scored 1.0.
        l_out_attrs, r_out_attrs: optional attributes to project into the
            output (None for none).
        l_out_prefix, r_out_prefix: prefixes for output column names.
        out_sim_score: if True, append a '_sim_score' column.
        show_progress: if True, display a progress bar over rtable rows.

    Returns:
        pandas DataFrame of qualifying pairs.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # Build inverted index over ltable. cache_size_flag makes the index
    # remember each record's token-set size, needed for the denominator of
    # the overlap coefficient below.
    inverted_index = InvertedIndex(ltable_list, l_join_attr_index,
                                   tokenizer, cache_size_flag=True)
    # While building the index, we cache the record ids with empty set of
    # tokens. This is needed to handle the allow_empty flag.
    cached_data = inverted_index.build(allow_empty)
    l_empty_records = cached_data['empty_records']

    overlap_filter = OverlapFilter(tokenizer, 1)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable_list))

    for r_row in rtable_list:
        r_string = r_row[r_join_attr_index]
        r_join_attr_tokens = tokenizer.tokenize(r_string)
        r_num_tokens = len(r_join_attr_tokens)

        # If allow_empty flag is set and the current rtable record has empty
        # set of tokens in the join attribute, then generate output pairs
        # joining the current rtable record with those records in ltable with
        # empty set of tokens in the join attribute. These ltable record ids
        # are cached in l_empty_records list which was constructed when
        # building the inverted index.
        if allow_empty and r_num_tokens == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                        ltable_list[l_id], r_row,
                        l_key_attr_index, r_key_attr_index,
                        l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable_list[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]
                if out_sim_score:
                    # Two empty token sets are considered identical.
                    output_row.append(1.0)
                output_rows.append(output_row)
            continue

        # probe inverted index and find overlap of candidates
        candidate_overlap = overlap_filter.find_candidates(
            r_join_attr_tokens, inverted_index)

        for cand, overlap in iteritems(candidate_overlap):
            # compute the actual similarity score
            sim_score = (float(overlap) /
                         float(min(r_num_tokens,
                                   inverted_index.size_cache[cand])))

            if comp_fn(sim_score, threshold):
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                        ltable_list[cand], r_row,
                        l_key_attr_index, r_key_attr_index,
                        l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable_list[cand][l_key_attr_index],
                                  r_row[r_key_attr_index]]

                # if out_sim_score flag is set, append the overlap
                # coefficient score to the output record.
                if out_sim_score:
                    output_row.append(sim_score)

                output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def test_rtable_r_key_attr_with_missing_value(self):
    """Use an rtable key attribute that contains a missing value."""
    dlm_filter = OverlapFilter(self.dlm)
    # 'r_attr' is deliberately passed as the rtable key attribute here
    # (per the test name, it holds a missing value).
    dlm_filter.filter_candset(self.C, 'l_id', 'r_id',
                              self.A, self.B,
                              'l_id', 'r_attr',
                              'l_attr', 'r_attr')
def test_apply_matcher_with_allow_missing(self):
    """apply_matcher should keep pairs with missing join values."""
    tok = QgramTokenizer(qval=2, return_set=True)
    sim_func = get_sim_function('JACCARD')
    threshold = 0.3
    comp_op = '>='

    # Score every pair in the cartesian product to derive the expected
    # set of pairs satisfying the threshold.
    cartprod = self.cartprod
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_func(tok.tokenize(str(row[self.l_join_attr])),
                             tok.tokenize(str(row[self.r_join_attr]))),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]
    expected_pairs = set()
    for _, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), threshold):
            expected_pairs.add(','.join((str(row[self.l_key_attr]),
                                         str(row[self.r_key_attr]))))

    # Pairs with a missing value in either join attribute must also be
    # present in the output when allow_missing is set.
    missing_pairs = set()
    for _, l_row in self.orig_ltable.iterrows():
        for _, r_row in self.orig_rtable.iterrows():
            if (pd.isnull(l_row[self.l_join_attr]) or
                    pd.isnull(r_row[self.r_join_attr])):
                missing_pairs.add(','.join((str(l_row[self.l_key_attr]),
                                            str(r_row[self.r_key_attr]))))
    expected_pairs = expected_pairs.union(missing_pairs)

    # Build a candset with allow_missing enabled.
    overlap_filter = OverlapFilter(tok, 1, comp_op, allow_missing=True)
    candset = overlap_filter.filter_tables(
        self.orig_ltable, self.orig_rtable,
        self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr)

    # Run a jaccard matcher over the candset with allow_missing enabled.
    output_candset = apply_matcher(
        candset,
        DEFAULT_L_OUT_PREFIX + self.l_key_attr,
        DEFAULT_R_OUT_PREFIX + self.r_key_attr,
        self.orig_ltable, self.orig_rtable,
        self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr,
        tok, sim_func, threshold, comp_op, True,
        out_sim_score=True)

    # The output table must carry exactly the id columns plus the score.
    assert_list_equal(list(output_candset.columns.values),
                      ['_id',
                       DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                       DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                       '_sim_score'])

    actual_pairs = set()
    for _, row in output_candset.iterrows():
        actual_pairs.add(','.join(
            (str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
             str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

    # The matcher output must coincide with the expected pairs.
    assert_equal(len(expected_pairs), len(actual_pairs))
    assert_equal(len(actual_pairs.intersection(expected_pairs)),
                 len(expected_pairs))
def test_numeric_r_filter_attr(self):
    """Use a numeric rtable attribute as the filter attribute."""
    numeric_filter = OverlapFilter(self.tokenizer, self.threshold)
    numeric_filter.filter_tables(self.A, self.B,
                                 'A.id', 'B.id',
                                 'A.attr', 'B.int_attr')
def test_invalid_ltable(self):
    """Pass an invalid (non-DataFrame) ltable to filter_candset."""
    dlm_filter = OverlapFilter(self.dlm)
    # an empty list stands in for the ltable argument
    dlm_filter.filter_candset(self.C, 'l_id', 'r_id',
                              [], self.B,
                              'l_id', 'r_id',
                              'l_attr', 'r_attr')
def test_invalid_rtable_r_filter_attr(self):
    """Pass an rtable filter attribute that does not exist."""
    dlm_filter = OverlapFilter(self.dlm)
    dlm_filter.filter_candset(self.C, 'l_id', 'r_id',
                              self.A, self.B,
                              'l_id', 'r_id',
                              'l_attr', 'invalid_attr')
def test_invalid_r_out_attr(self):
    """Request an rtable output attribute that does not exist."""
    out_filter = OverlapFilter(self.tokenizer, self.threshold)
    out_filter.filter_tables(self.A, self.B,
                             'A.id', 'B.id',
                             'A.attr', 'B.attr',
                             ['A.attr'], ['B.invalid_attr'])
def test_filter_pair(self, lstring, rstring, tokenizer, overlap_size,
                     comp_op, allow_missing, expected_output):
    """Parameterized check of OverlapFilter.filter_pair's verdict."""
    pair_filter = OverlapFilter(tokenizer, overlap_size, comp_op,
                                allow_missing)
    assert_equal(pair_filter.filter_pair(lstring, rstring),
                 expected_output)
def overlap_join(ltable, rtable, l_key_attr, r_key_attr, l_join_attr,
                 r_join_attr, tokenizer, threshold, comp_op='>=',
                 allow_missing=False, l_out_attrs=None, r_out_attrs=None,
                 l_out_prefix='l_', r_out_prefix='r_', out_sim_score=True,
                 n_jobs=1, show_progress=True):
    """Join two tables using overlap measure.

    For two sets X and Y, the overlap between them is given by:

        :math:`overlap(X, Y) = |X \\cap Y|`

    Finds tuple pairs from the left and right tables such that the overlap
    between the join attribute values satisfies the condition on the input
    threshold. For example, with comparison operator '>=', it finds tuple
    pairs whose overlap is greater than or equal to the threshold.

    Args:
        ltable (DataFrame): left input table.
        rtable (DataFrame): right input table.
        l_key_attr (string): key attribute in left table.
        r_key_attr (string): key attribute in right table.
        l_join_attr (string): join attribute in left table.
        r_join_attr (string): join attribute in right table.
        tokenizer (Tokenizer): tokenizer to be used to tokenize join
            attributes.
        threshold (float): overlap threshold to be satisfied.
        comp_op (string): comparison operator. Supported values are '>=',
            '>' and '=' (defaults to '>=').
        allow_missing (boolean): flag to indicate whether tuple pairs with
            missing value in at least one of the join attributes should be
            included in the output (defaults to False). If set to True, a
            tuple with a missing join value is matched with every tuple on
            the other side.
        l_out_attrs (list): attribute names from the left table to include
            in the output table (defaults to None).
        r_out_attrs (list): attribute names from the right table to include
            in the output table (defaults to None).
        l_out_prefix (string): prefix for output columns coming from the
            left table (defaults to 'l\_').
        r_out_prefix (string): prefix for output columns coming from the
            right table (defaults to 'r\_').
        out_sim_score (boolean): if True, add a '_sim_score' column holding
            the similarity score of each output pair (defaults to True).
        n_jobs (int): number of parallel jobs (defaults to 1). -1 uses all
            CPUs; 1 disables parallelism; for n_jobs below -1,
            (n_cpus + 1 + n_jobs) jobs are used, and if that is less than 1,
            no parallel code is used.
        show_progress (boolean): if True, display task progress
            (defaults to True).

    Returns:
        An output table containing tuple pairs that satisfy the join
        condition (DataFrame).
    """
    # check if the input tokenizer is valid
    validate_tokenizer(tokenizer)

    # The overlap measure works on token sets; temporarily switch the
    # tokenizer to set semantics if it currently returns bags.
    revert_flag = not tokenizer.get_return_set()
    if revert_flag:
        tokenizer.set_return_set(True)

    # delegate the join to the overlap filter.
    join_filter = OverlapFilter(tokenizer, threshold, comp_op, allow_missing)
    output_table = join_filter.filter_tables(ltable, rtable,
                                             l_key_attr, r_key_attr,
                                             l_join_attr, r_join_attr,
                                             l_out_attrs, r_out_attrs,
                                             l_out_prefix, r_out_prefix,
                                             out_sim_score, n_jobs,
                                             show_progress)

    # restore the tokenizer's original return_set flag, if it was changed.
    if revert_flag:
        tokenizer.set_return_set(False)

    return output_table
def sample_pairs(ltable, rtable, l_key_attr, r_key_attr, l_join_attr,
                 r_join_attr, sample_size, y_param, seed,
                 l_out_prefix='l_', r_out_prefix='r_', show_progress=True):
    """Sample tuple pairs from ltable and rtable.

    For about ceil(sample_size / y_param) randomly chosen rtable rows,
    select up to ceil(y_param / 2) ltable rows with the highest
    whitespace-token overlap against the rtable row, then pad the selection
    with randomly chosen ltable rows until y_param ltable rows are picked;
    every picked ltable row is paired with the rtable row. The pairs in
    `seed` are appended to the output unconditionally.

    Args:
        ltable, rtable: input pandas DataFrames.
        l_key_attr, r_key_attr: key attribute names.
        l_join_attr, r_join_attr: join attribute names.
        sample_size: target number of sampled pairs (excluding seed pairs).
        y_param: number of ltable rows paired with each sampled rtable row.
        seed: DataFrame whose first two columns are taken as
            (l_key, r_key) pairs to include in the output.
        l_out_prefix, r_out_prefix: prefixes for the output key columns.
        show_progress: if True, display a progress bar.

    Returns:
        pandas DataFrame with columns '_id', the prefixed l_key and the
        prefixed r_key.
    """
    # get attributes to project.
    l_proj_attrs = get_attrs_to_project(None, l_key_attr, l_join_attr)
    r_proj_attrs = get_attrs_to_project(None, r_key_attr, r_join_attr)

    # convert dataframe to array for faster access
    ltable_array = convert_dataframe_to_array(ltable, l_proj_attrs,
                                              l_join_attr)
    rtable_array = convert_dataframe_to_array(rtable, r_proj_attrs,
                                              r_join_attr)

    # find column indices of key attr and join attr in ltable array
    l_key_attr_index = l_proj_attrs.index(l_key_attr)
    l_join_attr_index = l_proj_attrs.index(l_join_attr)

    # find column indices of key attr and join attr in rtable array
    r_key_attr_index = r_proj_attrs.index(r_key_attr)
    r_join_attr_index = r_proj_attrs.index(r_join_attr)

    # create a whitespace tokenizer to tokenize join attributes
    ws_tok = WhitespaceTokenizer(return_set=True)

    # build inverted index on join attribute in ltable
    inverted_index = InvertedIndex(ltable_array, l_join_attr_index, ws_tok)
    inverted_index.build()

    number_of_r_tuples_to_sample = int(ceil(float(sample_size) /
                                            float(y_param)))
    sample_rtable_indices = random.sample(range(0, len(rtable_array)),
                                          number_of_r_tuples_to_sample)

    # at most half of each rtable row's y_param partners come from
    # overlap candidates (likely matches); the rest are random.
    cand_pos_ltuples_required = int(ceil(y_param / 2.0))

    overlap_filter = OverlapFilter(ws_tok, 1)

    output_rows = []

    if show_progress:
        prog_bar = pyprind.ProgBar(number_of_r_tuples_to_sample)

    for r_idx in sample_rtable_indices:
        r_row = rtable_array[r_idx]
        r_id = r_row[r_key_attr_index]
        r_join_attr_tokens = ws_tok.tokenize(r_row[r_join_attr_index])

        # probe inverted index and find ltable candidates
        cand_overlap = overlap_filter.find_candidates(r_join_attr_tokens,
                                                      inverted_index)

        # pick candidates in decreasing order of overlap, up to the cap.
        sampled_ltuples = set()
        for cand in sorted(cand_overlap.items(),
                           key=operator.itemgetter(1), reverse=True):
            if len(sampled_ltuples) == cand_pos_ltuples_required:
                break
            sampled_ltuples.add(cand[0])

        # fill the remaining slots with random ltable record ids
        # (duplicates with candidates are absorbed by the set).
        ltable_size = len(ltable_array)
        while len(sampled_ltuples) < y_param:
            rand_idx = random.randint(0, ltable_size - 1)
            sampled_ltuples.add(rand_idx)

        for l_idx in sampled_ltuples:
            output_rows.append([ltable_array[l_idx][l_key_attr_index], r_id])

        if show_progress:
            prog_bar.update()

    # append the caller-supplied seed pairs.
    for seed_pair_row in seed.itertuples(index=False):
        output_rows.append([seed_pair_row[0], seed_pair_row[1]])

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  None, None,
                                                  l_out_prefix, r_out_prefix)

    output_table = pd.DataFrame(output_rows, columns=output_header)

    # add an id column named '_id' to the output table.
    output_table.insert(0, '_id', range(0, len(output_table)))
    return output_table
def overlap_join_py(ltable, rtable, l_key_attr, r_key_attr, l_join_attr,
                    r_join_attr, tokenizer, threshold, comp_op='>=',
                    allow_missing=False, l_out_attrs=None, r_out_attrs=None,
                    l_out_prefix='l_', r_out_prefix='r_', out_sim_score=True,
                    n_jobs=1, show_progress=True):
    """Join two tables using overlap measure.

    For two sets X and Y, the overlap between them is given by:

        :math:`overlap(X, Y) = |X \\cap Y|`

    Finds tuple pairs from the left and right tables such that the overlap
    between the join attribute values satisfies the condition on the input
    threshold. For example, with comparison operator '>=', it finds tuple
    pairs whose overlap is greater than or equal to the threshold.

    Args:
        ltable (DataFrame): left input table.
        rtable (DataFrame): right input table.
        l_key_attr (string): key attribute in left table.
        r_key_attr (string): key attribute in right table.
        l_join_attr (string): join attribute in left table.
        r_join_attr (string): join attribute in right table.
        tokenizer (Tokenizer): tokenizer to be used to tokenize join
            attributes.
        threshold (float): overlap threshold to be satisfied.
        comp_op (string): comparison operator. Supported values are '>=',
            '>' and '=' (defaults to '>=').
        allow_missing (boolean): flag to indicate whether tuple pairs with
            missing value in at least one of the join attributes should be
            included in the output (defaults to False). If set to True, a
            tuple with a missing join value is matched with every tuple on
            the other side.
        l_out_attrs (list): attribute names from the left table to include
            in the output table (defaults to None).
        r_out_attrs (list): attribute names from the right table to include
            in the output table (defaults to None).
        l_out_prefix (string): prefix for output columns coming from the
            left table (defaults to 'l\_').
        r_out_prefix (string): prefix for output columns coming from the
            right table (defaults to 'r\_').
        out_sim_score (boolean): if True, add a '_sim_score' column holding
            the similarity score of each output pair (defaults to True).
        n_jobs (int): number of parallel jobs (defaults to 1). -1 uses all
            CPUs; 1 disables parallelism; for n_jobs below -1,
            (n_cpus + 1 + n_jobs) jobs are used, and if that is less than 1,
            no parallel code is used.
        show_progress (boolean): if True, display task progress
            (defaults to True).

    Returns:
        An output table containing tuple pairs that satisfy the join
        condition (DataFrame).
    """
    # check if the input tokenizer is valid
    validate_tokenizer(tokenizer)

    # The overlap measure works on token sets; temporarily switch the
    # tokenizer to set semantics if it currently returns bags.
    restore_return_set = not tokenizer.get_return_set()
    if restore_return_set:
        tokenizer.set_return_set(True)

    # delegate the join to the overlap filter.
    result = OverlapFilter(tokenizer, threshold, comp_op,
                           allow_missing).filter_tables(
        ltable, rtable, l_key_attr, r_key_attr, l_join_attr, r_join_attr,
        l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix,
        out_sim_score, n_jobs, show_progress)

    # restore the tokenizer's original return_set flag, if it was changed.
    if restore_return_set:
        tokenizer.set_return_set(False)

    return result
def test_filter_pair(self, lstring, rstring, tokenizer, overlap_size,
                     comp_op, allow_missing, expected_output):
    """Parameterized check of OverlapFilter.filter_pair's verdict."""
    pair_filter = OverlapFilter(tokenizer, overlap_size, comp_op,
                                allow_missing)
    assert_equal(pair_filter.filter_pair(lstring, rstring),
                 expected_output)
def test_invalid_r_out_attr(self):
    """'B.invalid_attr' is not a column of table B; filter_tables should raise."""
    filt = OverlapFilter(self.tokenizer, self.threshold)
    filt.filter_tables(self.A, self.B,
                       'A.id', 'B.id',
                       'A.attr', 'B.attr',
                       ['A.attr'], ['B.invalid_attr'])
def test_invalid_rtable_r_filter_attr(self):
    """'invalid_attr' is not a column of the right table; expected to raise."""
    filt = OverlapFilter(self.dlm)
    filt.filter_candset(self.C, 'l_id', 'r_id',
                        self.A, self.B,
                        'l_id', 'r_id',
                        'l_attr', 'invalid_attr')
def test_invalid_ltable(self):
    """Passing a plain list instead of a DataFrame for ltable should raise."""
    filt = OverlapFilter(self.dlm)
    filt.filter_candset(self.C, 'l_id', 'r_id',
                        [], self.B,
                        'l_id', 'r_id',
                        'l_attr', 'r_attr')
def test_invalid_threshold(self):
    """A negative overlap threshold is invalid; the constructor should raise."""
    OverlapFilter(self.tokenizer, -1)
def test_rtable_r_key_attr_with_missing_value(self):
    """Using 'r_attr' (which has missing values) as the rtable key should raise."""
    filt = OverlapFilter(self.dlm)
    filt.filter_candset(self.C, 'l_id', 'r_id',
                        self.A, self.B,
                        'l_id', 'r_attr',
                        'l_attr', 'r_attr')
def block_candset(self, candset, l_overlap_attr, r_overlap_attr, rem_stop_words=False, q_val=None, word_level=True, overlap_size=1, allow_missing=False, verbose=False, show_progress=True, n_jobs=1): """Blocks an input candidate set of tuple pairs based on the overlap of token sets of attribute values. Finds tuple pairs from an input candidate set of tuple pairs such that the overlap between (a) the set of tokens obtained by tokenizing the value of attribute l_overlap_attr of the left tuple in a tuple pair, and (b) the set of tokens obtained by tokenizing the value of attribute r_overlap_attr of the right tuple in the tuple pair, is above a certain threshold. Args: candset (DataFrame): The input candidate set of tuple pairs. l_overlap_attr (string): The overlap attribute in left table. r_overlap_attr (string): The overlap attribute in right table. rem_stop_words (boolean): A flag to indicate whether stop words (e.g., a, an, the) should be removed from the token sets of the overlap attribute values (defaults to False). q_val (int): The value of q to use if the overlap attributes values are to be tokenized as qgrams (defaults to None). word_level (boolean): A flag to indicate whether the overlap attributes should be tokenized as words (i.e, using whitespace as delimiter) (defaults to True). overlap_size (int): The minimum number of tokens that must overlap (defaults to 1). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple pair with missing value in either blocking attribute will be retained in the output candidate set. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). 
n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus are the total number of CPUs in the machine).Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `l_overlap_attr` is not of type string. AssertionError: If `r_overlap_attr` is not of type string. AssertionError: If `q_val` is not of type int. AssertionError: If `word_level` is not of type boolean. AssertionError: If `overlap_size` is not of type int. AssertionError: If `verbose` is not of type boolean. AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_overlap_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. SyntaxError: If `q_val` is set to a valid value and `word_level` is set to True. SyntaxError: If `q_val` is set to None and `word_level` is set to False. 
Examples: >>> import py_entitymatching as em >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> ob = em.OverlapBlocker() >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name']) >>> D1 = ob.block_candset(C, 'name', 'name', allow_missing=True) # Include all possible tuple pairs with missing values >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True) # Execute blocking using multiple cores >>> D3 = ob.block_candset(C, 'name', 'name', n_jobs=-1) # Use q-gram tokenizer >>> D2 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2) """ # validate data types of standard input parameters self.validate_types_params_candset(candset, verbose, show_progress, n_jobs) # validate data types of input parameters specific to overlap blocker self.validate_types_other_params(l_overlap_attr, r_overlap_attr, rem_stop_words, q_val, word_level, overlap_size) # get and validate metadata log_info( logger, 'Required metadata: cand.set key, fk ltable, fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset( candset, logger, verbose) # # validate metadata cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # validate overlap attrs self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr) # validate word_level and q_val self.validate_word_level_qval(word_level, q_val) # do blocking # # do projection before merge l_df = ltable[[l_key, l_overlap_attr]] r_df = rtable[[r_key, r_overlap_attr]] # # case the overlap attribute to string if required. 
l_df.is_copy, r_df.is_copy = False, False # to avoid setwithcopy warning ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True) ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True) # # cleanup the tables from non-ascii characters, punctuations, and stop words self.cleanup_table(l_df, l_overlap_attr, rem_stop_words) self.cleanup_table(r_df, r_overlap_attr, rem_stop_words) # # determine which tokenizer to use if word_level == True: # # # create a whitespace tokenizer tokenizer = WhitespaceTokenizer(return_set=True) else: # # # create a qgram tokenizer tokenizer = QgramTokenizer(qval=q_val, return_set=True) # # create a filter for overlap similarity join overlap_filter = OverlapFilter(tokenizer, overlap_size, allow_missing=allow_missing) # # perform overlap similarity filtering of the candset out_table = overlap_filter.filter_candset(candset, fk_ltable, fk_rtable, l_df, r_df, l_key, r_key, l_overlap_attr, r_overlap_attr, n_jobs, show_progress=show_progress) # update catalog cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable, ltable, rtable) # return candidate set return out_table
def test_invalid_rtable(self):
    """Passing a plain list instead of a DataFrame for rtable should raise."""
    filt = OverlapFilter(self.tokenizer, self.threshold)
    filt.filter_tables(self.A, [],
                       'A.id', 'B.id',
                       'A.attr', 'B.attr')
def test_invalid_rtable(self):
    """Passing a plain list instead of a DataFrame for rtable should raise."""
    filt = OverlapFilter(self.tokenizer, self.threshold)
    filt.filter_tables(self.A, [],
                       'A.id', 'B.id',
                       'A.attr', 'B.attr')
def test_numeric_r_filter_attr(self):
    """A numeric right filter attribute ('B.int_attr') should be rejected."""
    filt = OverlapFilter(self.tokenizer, self.threshold)
    filt.filter_tables(self.A, self.B,
                       'A.id', 'B.id',
                       'A.attr', 'B.int_attr')
def sample_pairs(ltable, rtable, l_key_attr, r_key_attr, l_join_attr,
                 r_join_attr, sample_size, y_param, seed,
                 l_out_prefix='l_', r_out_prefix='r_', show_progress=True):
    """Sample tuple pairs from the two input tables.

    For each of ceil(sample_size / y_param) randomly chosen rtable tuples,
    up to ceil(y_param / 2) ltable tuples that share whitespace tokens with
    it (likely positives, found via an overlap filter over an inverted
    index) are selected, and the selection is padded with random ltable
    tuples up to y_param pairs. Finally, the pairs in `seed` are appended
    to the output.

    Args:
        ltable, rtable (DataFrame): input tables.
        l_key_attr, r_key_attr (string): key attributes of the input tables.
        l_join_attr, r_join_attr (string): join attributes used to compute
            token overlap between tuples.
        sample_size (int): target total number of sampled pairs (excluding
            seed pairs).
        y_param (int): number of pairs to generate per sampled rtable tuple.
        seed (DataFrame): seed pairs always included in the output; the
            first two columns are assumed to hold the left and right keys
            -- TODO confirm against callers.
        l_out_prefix, r_out_prefix (string): prefixes used for the key
            attribute names in the output table.
        show_progress (boolean): flag to indicate whether a progress bar
            should be displayed.

    Returns:
        A DataFrame of key pairs with an '_id' column followed by the
        prefixed left and right key columns.
    """
    # get attributes to project.
    l_proj_attrs = get_attrs_to_project(None, l_key_attr, l_join_attr)
    r_proj_attrs = get_attrs_to_project(None, r_key_attr, r_join_attr)

    # convert dataframe to array for faster access
    ltable_array = convert_dataframe_to_array(ltable, l_proj_attrs,
                                              l_join_attr)
    rtable_array = convert_dataframe_to_array(rtable, r_proj_attrs,
                                              r_join_attr)

    # find column indices of key attr and join attr in ltable array
    l_key_attr_index = l_proj_attrs.index(l_key_attr)
    l_join_attr_index = l_proj_attrs.index(l_join_attr)

    # find column indices of key attr and join attr in rtable array
    r_key_attr_index = r_proj_attrs.index(r_key_attr)
    r_join_attr_index = r_proj_attrs.index(r_join_attr)

    # create a whitespace tokenizer to tokenize join attributes
    ws_tok = WhitespaceTokenizer(return_set=True)

    # build inverted index on join attribute in ltable
    inverted_index = InvertedIndex(ltable_array, l_join_attr_index, ws_tok)
    inverted_index.build()

    number_of_r_tuples_to_sample = int(
        ceil(float(sample_size) / float(y_param)))
    sample_rtable_indices = random.sample(range(0, len(rtable_array)),
                                          number_of_r_tuples_to_sample)

    # number of likely-positive ltable candidates to keep per rtable tuple
    cand_pos_ltuples_required = int(ceil(y_param / 2.0))

    overlap_filter = OverlapFilter(ws_tok, 1)

    # hoist loop invariant: the ltable size does not change per iteration
    ltable_size = len(ltable_array)
    # cap the per-rtuple target at the number of available ltable tuples;
    # without this, the random-padding loop below never terminates when
    # y_param > ltable_size (a set cannot hold more distinct indices).
    ltuples_per_rtuple = min(y_param, ltable_size)

    output_rows = []

    if show_progress:
        prog_bar = pyprind.ProgBar(number_of_r_tuples_to_sample)

    for r_idx in sample_rtable_indices:
        r_row = rtable_array[r_idx]
        r_id = r_row[r_key_attr_index]
        r_join_attr_tokens = ws_tok.tokenize(r_row[r_join_attr_index])

        # probe inverted index and find ltable candidates
        cand_overlap = overlap_filter.find_candidates(r_join_attr_tokens,
                                                      inverted_index)

        # keep the candidates with the highest overlap first
        sampled_ltuples = set()
        for cand in sorted(cand_overlap.items(),
                           key=operator.itemgetter(1), reverse=True):
            if len(sampled_ltuples) == cand_pos_ltuples_required:
                break
            sampled_ltuples.add(cand[0])

        # pad with random ltable tuples up to the (capped) target
        while len(sampled_ltuples) < ltuples_per_rtuple:
            rand_idx = random.randint(0, ltable_size - 1)
            sampled_ltuples.add(rand_idx)

        for l_idx in sampled_ltuples:
            output_rows.append([ltable_array[l_idx][l_key_attr_index], r_id])

        if show_progress:
            prog_bar.update()

    # always include the seed pairs in the output
    for seed_pair_row in seed.itertuples(index=False):
        output_rows.append([seed_pair_row[0], seed_pair_row[1]])

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  None, None,
                                                  l_out_prefix, r_out_prefix)
    output_table = pd.DataFrame(output_rows, columns=output_header)

    # add an id column named '_id' to the output table.
    output_table.insert(0, '_id', range(0, len(output_table)))

    return output_table
def test_invalid_tokenizer(self):
    """A list is not a valid tokenizer object; the constructor should raise."""
    OverlapFilter([], self.threshold)
def block_candset(self, candset, l_overlap_attr, r_overlap_attr, rem_stop_words=False, q_val=None, word_level=True, overlap_size=1, allow_missing=False, verbose=False, show_progress=True, n_jobs=1): """Blocks an input candidate set of tuple pairs based on the overlap of token sets of attribute values. Finds tuple pairs from an input candidate set of tuple pairs such that the overlap between (a) the set of tokens obtained by tokenizing the value of attribute l_overlap_attr of the left tuple in a tuple pair, and (b) the set of tokens obtained by tokenizing the value of attribute r_overlap_attr of the right tuple in the tuple pair, is above a certain threshold. Args: candset (DataFrame): The input candidate set of tuple pairs. l_overlap_attr (string): The overlap attribute in left table. r_overlap_attr (string): The overlap attribute in right table. rem_stop_words (boolean): A flag to indicate whether stop words (e.g., a, an, the) should be removed from the token sets of the overlap attribute values (defaults to False). q_val (int): The value of q to use if the overlap attributes values are to be tokenized as qgrams (defaults to None). word_level (boolean): A flag to indicate whether the overlap attributes should be tokenized as words (i.e, using whitespace as delimiter) (defaults to True). overlap_size (int): The minimum number of tokens that must overlap (defaults to 1). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple pair with missing value in either blocking attribute will be retained in the output candidate set. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). 
n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus are the total number of CPUs in the machine).Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `l_overlap_attr` is not of type string. AssertionError: If `r_overlap_attr` is not of type string. AssertionError: If `q_val` is not of type int. AssertionError: If `word_level` is not of type boolean. AssertionError: If `overlap_size` is not of type int. AssertionError: If `verbose` is not of type boolean. AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_overlap_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. SyntaxError: If `q_val` is set to a valid value and `word_level` is set to True. SyntaxError: If `q_val` is set to None and `word_level` is set to False. 
Examples: >>> import py_entitymatching as em >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> ob = em.OverlapBlocker() >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name']) >>> D1 = ob.block_candset(C, 'name', 'name', allow_missing=True) # Include all possible tuple pairs with missing values >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True) # Execute blocking using multiple cores >>> D3 = ob.block_candset(C, 'name', 'name', n_jobs=-1) # Use q-gram tokenizer >>> D2 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2) """ # validate data types of standard input parameters self.validate_types_params_candset(candset, verbose, show_progress, n_jobs) # validate data types of input parameters specific to overlap blocker self.validate_types_other_params(l_overlap_attr, r_overlap_attr, rem_stop_words, q_val, word_level, overlap_size) # get and validate metadata log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset( candset, logger, verbose) # # validate metadata cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # validate overlap attrs self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr) # validate word_level and q_val self.validate_word_level_qval(word_level, q_val) # do blocking # # do projection before merge l_df = ltable[[l_key, l_overlap_attr]] r_df = rtable[[r_key, r_overlap_attr]] # # case the overlap attribute to string if required. 
l_df.is_copy, r_df.is_copy = False, False # to avoid setwithcopy warning ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True) ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True) # # cleanup the tables from non-ascii characters, punctuations, and stop words self.cleanup_table(l_df, l_overlap_attr, rem_stop_words) self.cleanup_table(r_df, r_overlap_attr, rem_stop_words) # # determine which tokenizer to use if word_level == True: # # # create a whitespace tokenizer tokenizer = WhitespaceTokenizer(return_set=True) else: # # # create a qgram tokenizer tokenizer = QgramTokenizer(qval=q_val, return_set=True) # # create a filter for overlap similarity join overlap_filter = OverlapFilter(tokenizer, overlap_size, allow_missing=allow_missing) # # perform overlap similarity filtering of the candset out_table = overlap_filter.filter_candset(candset, fk_ltable, fk_rtable, l_df, r_df, l_key, r_key, l_overlap_attr, r_overlap_attr, n_jobs, show_progress=show_progress) # update catalog cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable, ltable, rtable) # return candidate set return out_table
def test_invalid_comp_op_le(self):
    """'<=' is not an allowed comparison operator; the constructor should raise."""
    OverlapFilter(self.tokenizer, self.threshold, '<=')
def block_tuples(self, ltuple, rtuple, l_overlap_attr, r_overlap_attr,
                 rem_stop_words=False, q_val=None, word_level=True,
                 overlap_size=1, allow_missing=False):
    """Blocks a tuple pair based on the overlap of token sets of attribute
       values.

    Args:
        ltuple (Series): The input left tuple.

        rtuple (Series): The input right tuple.

        l_overlap_attr (string): The overlap attribute in left tuple.

        r_overlap_attr (string): The overlap attribute in right tuple.

        rem_stop_words (boolean): A flag to indicate whether stop words
            (e.g., a, an, the) should be removed from the token sets of
            the overlap attribute values (defaults to False).

        q_val (int): A value of q to use if the overlap attributes values
            are to be tokenized as qgrams (defaults to None).

        word_level (boolean): A flag to indicate whether the overlap
            attributes should be tokenized as words (i.e, using whitespace
            as delimiter) (defaults to True).

        overlap_size (int): The minimum number of tokens that must overlap
            (defaults to 1).

        allow_missing (boolean): A flag to indicate whether a tuple pair
            with missing value in at least one of the blocking attributes
            should be blocked (defaults to False). If this flag is set to
            True, the pair will be kept if either ltuple has missing value
            in l_block_attr or rtuple has missing value in r_block_attr or
            both.

    Returns:
        A status indicating if the tuple pair is blocked (boolean).

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> ob = em.OverlapBlocker()
        >>> status = ob.block_tuples(A.ix[0], B.ix[0], 'address', 'address')
    """
    # validate the overlap-blocker-specific parameters
    self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                     rem_stop_words, q_val,
                                     word_level, overlap_size)

    # word_level and q_val must form a consistent combination
    self.validate_word_level_qval(word_level, q_val)

    # pick the tokenizer: whitespace for word-level, qgrams otherwise
    if word_level == True:
        tok = WhitespaceTokenizer(return_set=True)
    else:
        tok = QgramTokenizer(qval=q_val, return_set=True)

    # strip non-ascii characters, punctuation, and (optionally) stop words
    # from the attribute values before comparing them
    left_val = self.cleanup_tuple_val(ltuple[l_overlap_attr], rem_stop_words)
    right_val = self.cleanup_tuple_val(rtuple[r_overlap_attr], rem_stop_words)

    # delegate the actual overlap decision to an overlap filter
    filt = OverlapFilter(tok, overlap_size, allow_missing=allow_missing)
    return filt.filter_pair(left_val, right_val)
def block_tuples(self, ltuple, rtuple, l_overlap_attr, r_overlap_attr,
                 rem_stop_words=False, q_val=None, word_level=True,
                 overlap_size=1, allow_missing=False):
    """Blocks a tuple pair based on the overlap of token sets of attribute
       values.

    Args:
        ltuple (Series): The input left tuple.

        rtuple (Series): The input right tuple.

        l_overlap_attr (string): The overlap attribute in left tuple.

        r_overlap_attr (string): The overlap attribute in right tuple.

        rem_stop_words (boolean): A flag to indicate whether stop words
            (e.g., a, an, the) should be removed from the token sets of
            the overlap attribute values (defaults to False).

        q_val (int): A value of q to use if the overlap attributes values
            are to be tokenized as qgrams (defaults to None).

        word_level (boolean): A flag to indicate whether the overlap
            attributes should be tokenized as words (i.e, using whitespace
            as delimiter) (defaults to True).

        overlap_size (int): The minimum number of tokens that must overlap
            (defaults to 1).

        allow_missing (boolean): A flag to indicate whether a tuple pair
            with missing value in at least one of the blocking attributes
            should be blocked (defaults to False). If this flag is set to
            True, the pair will be kept if either ltuple has missing value
            in l_block_attr or rtuple has missing value in r_block_attr or
            both.

    Returns:
        A status indicating if the tuple pair is blocked (boolean).

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> ob = em.OverlapBlocker()
        >>> status = ob.block_tuples(A.ix[0], B.ix[0], 'address', 'address')
    """
    # validate the overlap-blocker-specific parameters
    self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                     rem_stop_words, q_val,
                                     word_level, overlap_size)

    # word_level and q_val must form a consistent combination
    self.validate_word_level_qval(word_level, q_val)

    # pick the tokenizer: whitespace for word-level, qgrams otherwise
    if word_level == True:
        tok = WhitespaceTokenizer(return_set=True)
    else:
        tok = QgramTokenizer(qval=q_val, return_set=True)

    # strip non-ascii characters, punctuation, and (optionally) stop words
    # from the attribute values before comparing them
    left_val = self.cleanup_tuple_val(ltuple[l_overlap_attr], rem_stop_words)
    right_val = self.cleanup_tuple_val(rtuple[r_overlap_attr], rem_stop_words)

    # delegate the actual overlap decision to an overlap filter
    filt = OverlapFilter(tok, overlap_size, allow_missing=allow_missing)
    return filt.filter_pair(left_val, right_val)
def test_apply_matcher_with_allow_missing(self):
    """Check apply_matcher with allow_missing=True: the output must contain
    all pairs above the similarity threshold plus every pair with a missing
    join-attribute value."""
    tok = QgramTokenizer(qval=2, return_set=True)
    sim_func = get_sim_function('JACCARD')
    threshold = 0.3
    comp_op = '>='

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod = self.cartprod
    cartprod['sim_score'] = cartprod.apply(lambda row: sim_func(
        tok.tokenize(str(row[self.l_join_attr])),
        tok.tokenize(str(row[self.r_join_attr]))), axis=1)

    # compute expected output pairs (key pairs encoded as 'l_key,r_key')
    comp_fn = COMP_OP_MAP[comp_op]
    expected_pairs = set()
    for idx, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), threshold):
            expected_pairs.add(','.join((str(row[self.l_key_attr]),
                                         str(row[self.r_key_attr]))))

    # find pairs that need to be included in output due to
    # the presence of missing value in one of the join attributes.
    missing_pairs = set()
    for l_idx, l_row in self.orig_ltable.iterrows():
        for r_idx, r_row in self.orig_rtable.iterrows():
            if (pd.isnull(l_row[self.l_join_attr]) or
                    pd.isnull(r_row[self.r_join_attr])):
                missing_pairs.add(','.join((str(l_row[self.l_key_attr]),
                                            str(r_row[self.r_key_attr]))))

    # add the pairs containing missing value to the set of expected pairs.
    expected_pairs = expected_pairs.union(missing_pairs)

    # use overlap filter to obtain a candset with allow_missing set to True.
    overlap_filter = OverlapFilter(tok, 1, comp_op, allow_missing=True)
    candset = overlap_filter.filter_tables(
        self.orig_ltable, self.orig_rtable,
        self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr)

    # apply a jaccard matcher to the candset with allow_missing set to True.
    output_candset = apply_matcher(
        candset,
        DEFAULT_L_OUT_PREFIX + self.l_key_attr,
        DEFAULT_R_OUT_PREFIX + self.r_key_attr,
        self.orig_ltable, self.orig_rtable,
        self.l_key_attr, self.r_key_attr,
        self.l_join_attr, self.r_join_attr,
        tok, sim_func, threshold, comp_op, True,
        out_sim_score=True)

    expected_output_attrs = ['_id',
                             DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                             DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                             '_sim_score']

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(output_candset.columns.values),
                      expected_output_attrs)

    # collect the key pairs actually produced by the matcher
    actual_pairs = set()
    for idx, row in output_candset.iterrows():
        actual_pairs.add(','.join(
            (str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
             str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))