def filter_candset(self, candset, candset_l_key_attr, candset_r_key_attr,
                   ltable, rtable,
                   l_key_attr, r_key_attr,
                   l_filter_attr, r_filter_attr,
                   n_jobs=1):
    """Filter a candidate set of tuple pairs.

    Args:
        candset : Pandas data frame holding the candidate pairs
        candset_l_key_attr, candset_r_key_attr : String, key attributes in
            candset (that refer to ltable and rtable)
        ltable, rtable : Pandas data frame, base tables from which candset
            was obtained
        l_key_attr, r_key_attr : String, key attribute from ltable and rtable
        l_filter_attr, r_filter_attr : String, filter attribute from ltable
            and rtable
        n_jobs : int, number of parallel jobs (1 runs sequentially)

    Returns:
        result : Pandas data frame
    """
    # an empty candset needs no filtering
    if candset.empty:
        return candset

    if n_jobs == 1:
        # sequential path: filter the whole candset in one shot
        return _filter_candset_split(candset,
                                     candset_l_key_attr, candset_r_key_attr,
                                     ltable, rtable,
                                     l_key_attr, r_key_attr,
                                     l_filter_attr, r_filter_attr,
                                     self)

    # parallel path: split the candset, filter each piece, then reassemble
    splits = split_table(candset, n_jobs)
    partial_results = Parallel(n_jobs=n_jobs)(
        delayed(_filter_candset_split)(split,
                                       candset_l_key_attr, candset_r_key_attr,
                                       ltable, rtable,
                                       l_key_attr, r_key_attr,
                                       l_filter_attr, r_filter_attr,
                                       self)
        for split in splits)
    return pd.concat(partial_results)
def edit_dist_join(ltable, rtable,
                   l_key_attr, r_key_attr,
                   l_join_attr, r_join_attr,
                   threshold,
                   l_out_attrs=None, r_out_attrs=None,
                   l_out_prefix='l_', r_out_prefix='r_',
                   out_sim_score=True,
                   n_jobs=1,
                   tokenizer=None):
    """Join two tables using edit distance similarity measure.

    Finds tuple pairs from ltable and rtable such that
    EditDistance(ltable.l_join_attr, rtable.r_join_attr) <= threshold

    Args:
        ltable, rtable : Pandas data frame
        l_key_attr, r_key_attr : String, key attribute from ltable and rtable
        l_join_attr, r_join_attr : String, join attribute from ltable and
            rtable
        tokenizer : Tokenizer object, tokenizer to be used to tokenize join
            attributes. Defaults to None, in which case a 2-gram tokenizer is
            created per call.
        threshold : int, edit distance threshold to be satisfied
        l_out_attrs, r_out_attrs : list of attributes to be included in the
            output table from ltable and rtable
        l_out_prefix, r_out_prefix : String, prefix to be used in the
            attribute names of the output table
        out_sim_score : boolean, indicates if edit distance needs to be
            included in the output table
        n_jobs : int, number of parallel jobs (1 runs sequentially)

    Returns:
        result : Pandas data frame
    """
    # BUGFIX: the tokenizer default was previously the call expression
    # create_qgram_tokenizer(2), which Python evaluates once at import time,
    # so every invocation silently shared one tokenizer object. Create a
    # fresh tokenizer lazily instead (behavior-compatible for all callers).
    if tokenizer is None:
        tokenizer = create_qgram_tokenizer(2)

    # check if the input tables are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # check if the key attributes and join attributes exist
    validate_attr(l_key_attr, ltable.columns, 'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns, 'key attribute', 'right table')
    validate_attr(l_join_attr, ltable.columns, 'join attribute', 'left table')
    validate_attr(r_join_attr, rtable.columns, 'join attribute', 'right table')

    # check if the input tokenizer is valid
    validate_tokenizer(tokenizer)

    # check if the input threshold is valid
    validate_threshold(threshold, 'EDIT_DISTANCE')

    # check if the output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns,
                          r_out_attrs, rtable.columns)

    # check if the key attributes are unique and do not contain missing values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    # convert threshold to integer (incase if it is float)
    threshold = int(floor(threshold))

    if n_jobs == 1:
        # sequential path
        output_table = _edit_dist_join_split(ltable, rtable,
                                             l_key_attr, r_key_attr,
                                             l_join_attr, r_join_attr,
                                             tokenizer, threshold,
                                             l_out_attrs, r_out_attrs,
                                             l_out_prefix, r_out_prefix,
                                             out_sim_score)
        output_table.insert(0, '_id', range(0, len(output_table)))
        return output_table
    else:
        # parallel path: split the right table, join each piece against
        # the full left table, then reassemble
        r_splits = split_table(rtable, n_jobs)
        results = Parallel(n_jobs=n_jobs)(delayed(_edit_dist_join_split)(
            ltable, s,
            l_key_attr, r_key_attr,
            l_join_attr, r_join_attr,
            tokenizer, threshold,
            l_out_attrs, r_out_attrs,
            l_out_prefix, r_out_prefix,
            out_sim_score) for s in r_splits)
        output_table = pd.concat(results)
        output_table.insert(0, '_id', range(0, len(output_table)))
        return output_table
def filter_tables(self, ltable, rtable,
                  l_key_attr, r_key_attr,
                  l_filter_attr, r_filter_attr,
                  l_out_attrs=None, r_out_attrs=None,
                  l_out_prefix='l_', r_out_prefix='r_',
                  n_jobs=1):
    """Filter tables with size filter.

    Args:
        ltable, rtable : Pandas data frame
        l_key_attr, r_key_attr : String, key attribute from ltable and rtable
        l_filter_attr, r_filter_attr : String, filter attribute from ltable
            and rtable
        l_out_attrs, r_out_attrs : list of attribtues to be included in the
            output table from ltable and rtable
        l_out_prefix, r_out_prefix : String, prefix to be used in the
            attribute names of the output table
        n_jobs : int, number of parallel jobs (1 runs sequentially)

    Returns:
        result : Pandas data frame
    """
    # validate that both inputs are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # validate that the key and filter attributes exist in their tables
    validate_attr(l_key_attr, ltable.columns, 'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns, 'key attribute', 'right table')
    validate_attr(l_filter_attr, ltable.columns,
                  'filter attribute', 'left table')
    validate_attr(r_filter_attr, rtable.columns,
                  'filter attribute', 'right table')

    # validate that the requested output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns,
                          r_out_attrs, rtable.columns)

    # validate that the key attributes are unique and free of missing values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    if n_jobs == 1:
        # sequential path: filter against the whole right table at once
        result = _filter_tables_split(ltable, rtable,
                                      l_key_attr, r_key_attr,
                                      l_filter_attr, r_filter_attr,
                                      self,
                                      l_out_attrs, r_out_attrs,
                                      l_out_prefix, r_out_prefix)
        result.insert(0, '_id', range(0, len(result)))
        return result

    # parallel path: split the right table, filter each piece against the
    # full left table, then reassemble
    r_splits = split_table(rtable, n_jobs)
    partial_results = Parallel(n_jobs=n_jobs)(
        delayed(_filter_tables_split)(ltable, r_split,
                                      l_key_attr, r_key_attr,
                                      l_filter_attr, r_filter_attr,
                                      self,
                                      l_out_attrs, r_out_attrs,
                                      l_out_prefix, r_out_prefix)
        for r_split in r_splits)
    result = pd.concat(partial_results)
    result.insert(0, '_id', range(0, len(result)))
    return result
def jaccard_join(ltable, rtable,
                 l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr,
                 tokenizer, threshold,
                 l_out_attrs=None, r_out_attrs=None,
                 l_out_prefix='l_', r_out_prefix='r_',
                 out_sim_score=True,
                 n_jobs=1):
    """Join two tables using jaccard similarity measure.

    Finds tuple pairs from ltable and rtable such that
    Jaccard(ltable.l_join_attr, rtable.r_join_attr) >= threshold

    Args:
        ltable, rtable : Pandas data frame
        l_key_attr, r_key_attr : String, key attribute from ltable and rtable
        l_join_attr, r_join_attr : String, join attribute from ltable and
            rtable
        tokenizer : Tokenizer object, tokenizer to be used to tokenize join
            attributes
        threshold : float, jaccard threshold to be satisfied
        l_out_attrs, r_out_attrs : list of attributes to be included in the
            output table from ltable and rtable
        l_out_prefix, r_out_prefix : String, prefix to be used in the
            attribute names of the output table
        out_sim_score : boolean, indicates if similarity score needs to be
            included in the output table
        n_jobs : int, number of parallel jobs (1 runs sequentially)

    Returns:
        result : Pandas data frame
    """
    # validate that both inputs are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # validate that the key and join attributes exist in their tables
    validate_attr(l_key_attr, ltable.columns, 'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns, 'key attribute', 'right table')
    validate_attr(l_join_attr, ltable.columns, 'join attribute', 'left table')
    validate_attr(r_join_attr, rtable.columns, 'join attribute', 'right table')

    # validate the tokenizer and the threshold
    validate_tokenizer(tokenizer)
    validate_threshold(threshold, 'JACCARD')

    # validate that the requested output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns,
                          r_out_attrs, rtable.columns)

    # validate that the key attributes are unique and free of missing values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    if n_jobs == 1:
        # sequential path: join against the whole right table at once
        result = _set_sim_join_split(ltable, rtable,
                                     l_key_attr, r_key_attr,
                                     l_join_attr, r_join_attr,
                                     tokenizer, 'JACCARD', threshold,
                                     l_out_attrs, r_out_attrs,
                                     l_out_prefix, r_out_prefix,
                                     out_sim_score)
        result.insert(0, '_id', range(0, len(result)))
        return result

    # parallel path: split the right table, join each piece against the
    # full left table, then reassemble
    r_splits = split_table(rtable, n_jobs)
    partial_results = Parallel(n_jobs=n_jobs)(
        delayed(_set_sim_join_split)(ltable, r_split,
                                     l_key_attr, r_key_attr,
                                     l_join_attr, r_join_attr,
                                     tokenizer, 'JACCARD', threshold,
                                     l_out_attrs, r_out_attrs,
                                     l_out_prefix, r_out_prefix,
                                     out_sim_score)
        for r_split in r_splits)
    result = pd.concat(partial_results)
    result.insert(0, '_id', range(0, len(result)))
    return result