def test_filter_tables(self, tokenizer, sim_measure_type, threshold, allow_empty, allow_missing, args, expected_pairs):
    """Run PositionFilter.filter_tables with the given args and verify both
    the output schema and the surviving key pairs against expected_pairs.

    The entries of ``args`` (after the two tables) are, in order:
    l_key_attr, r_key_attr, l_filter_attr, r_filter_attr, l_out_attrs,
    r_out_attrs, l_out_prefix, r_out_prefix; trailing entries may be omitted.
    """
    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold,
                                allow_empty, allow_missing)
    candset = pos_filter.filter_tables(*args)

    # Resolve output prefixes, falling back to the test-class defaults when
    # they were not supplied positionally.
    left_prefix = args[8] if len(args) > 8 else self.default_l_out_prefix
    right_prefix = args[9] if len(args) > 9 else self.default_r_out_prefix

    # Expected columns: '_id', the two prefixed key attributes, then any
    # requested output attributes (with redundant key attrs removed).
    expected_attrs = ['_id', left_prefix + args[2], right_prefix + args[3]]
    if len(args) > 6 and args[6]:
        for attr in remove_redundant_attrs(args[6], args[2]):
            expected_attrs.append(left_prefix + attr)
    if len(args) > 7 and args[7]:
        for attr in remove_redundant_attrs(args[7], args[3]):
            expected_attrs.append(right_prefix + attr)

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(candset.columns.values), expected_attrs)

    # Collect the (left key, right key) pairs produced by the filter.
    observed_pairs = set()
    for _, record in candset.iterrows():
        observed_pairs.add(','.join((str(record[left_prefix + args[2]]),
                                     str(record[right_prefix + args[3]]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(observed_pairs))
    assert_equal(len(observed_pairs.intersection(expected_pairs)),
                 len(expected_pairs))
def test_filter_tables(self, tokenizer, overlap_size, comp_op, allow_missing, args, expected_pairs):
    """Run OverlapFilter.filter_tables with the given args and verify both
    the output schema and the surviving key pairs against expected_pairs.

    The entries of ``args`` (after the two tables) are, in order:
    l_key_attr, r_key_attr, l_filter_attr, r_filter_attr, l_out_attrs,
    r_out_attrs, l_out_prefix, r_out_prefix; trailing entries may be omitted.
    """
    ov_filter = OverlapFilter(tokenizer, overlap_size, comp_op, allow_missing)
    candset = ov_filter.filter_tables(*args)

    # Resolve output prefixes, falling back to the test-class defaults when
    # they were not supplied positionally.
    left_prefix = args[8] if len(args) > 8 else self.default_l_out_prefix
    right_prefix = args[9] if len(args) > 9 else self.default_r_out_prefix

    # Expected columns: '_id', the two prefixed key attributes, then any
    # requested output attributes (with redundant key attrs removed).
    expected_attrs = ['_id', left_prefix + args[2], right_prefix + args[3]]
    if len(args) > 6 and args[6]:
        for attr in remove_redundant_attrs(args[6], args[2]):
            expected_attrs.append(left_prefix + attr)
    if len(args) > 7 and args[7]:
        for attr in remove_redundant_attrs(args[7], args[3]):
            expected_attrs.append(right_prefix + attr)

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(candset.columns.values), expected_attrs)

    # Collect the (left key, right key) pairs produced by the filter.
    observed_pairs = set()
    for _, record in candset.iterrows():
        observed_pairs.add(','.join((str(record[left_prefix + args[2]]),
                                     str(record[right_prefix + args[3]]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(observed_pairs))
    assert_equal(len(observed_pairs.intersection(expected_pairs)),
                 len(expected_pairs))
def dice_join(ltable, rtable,
              l_key_attr, r_key_attr,
              l_join_attr, r_join_attr,
              tokenizer, threshold, comp_op='>=',
              allow_empty=True, allow_missing=False,
              l_out_attrs=None, r_out_attrs=None,
              l_out_prefix='l_', r_out_prefix='r_',
              out_sim_score=True, n_jobs=1, show_progress=True):
    """Join two tables using Dice similarity measure.

    For two sets X and Y, the Dice similarity score between them is given by:

        :math:`dice(X, Y) = \\frac{2 * |X \\cap Y|}{|X| + |Y|}`

    In the case where both X and Y are empty sets, we define their Dice
    score to be 1.

    Finds tuple pairs from left table and right table such that the Dice
    similarity between the join attributes satisfies the condition on input
    threshold. For example, if the comparison operator is '>=', finds tuple
    pairs whose Dice similarity between the strings that are the values of
    the join attributes is greater than or equal to the input threshold, as
    specified in "threshold".

    Args:
        ltable (DataFrame): left input table.
        rtable (DataFrame): right input table.
        l_key_attr (string): key attribute in left table.
        r_key_attr (string): key attribute in right table.
        l_join_attr (string): join attribute in left table.
        r_join_attr (string): join attribute in right table.
        tokenizer (Tokenizer): tokenizer to be used to tokenize join
            attributes.
        threshold (float): Dice similarity threshold to be satisfied.
        comp_op (string): comparison operator. Supported values are '>=',
            '>' and '=' (defaults to '>=').
        allow_empty (boolean): flag to indicate whether tuple pairs with
            empty set of tokens in both the join attributes should be
            included in the output (defaults to True).
        allow_missing (boolean): flag to indicate whether tuple pairs with
            missing value in at least one of the join attributes should be
            included in the output (defaults to False). If this flag is set
            to True, a tuple in ltable with missing value in the join
            attribute will be matched with every tuple in rtable and vice
            versa.
        l_out_attrs (list): list of attribute names from the left table to
            be included in the output table (defaults to None).
        r_out_attrs (list): list of attribute names from the right table to
            be included in the output table (defaults to None).
        l_out_prefix (string): prefix to be used for the attribute names
            coming from the left table, in the output table
            (defaults to 'l\_').
        r_out_prefix (string): prefix to be used for the attribute names
            coming from the right table, in the output table
            (defaults to 'r\_').
        out_sim_score (boolean): flag to indicate whether similarity score
            should be included in the output table (defaults to True).
            Setting this flag to True will add a column named '_sim_score'
            in the output table. This column will contain the similarity
            scores for the tuple pairs in the output.
        n_jobs (int): number of parallel jobs to use for the computation
            (defaults to 1). If -1 is given, all CPUs are used. If 1 is
            given, no parallel computing code is used at all, which is
            useful for debugging. For n_jobs below -1,
            (n_cpus + 1 + n_jobs) are used (where n_cpus is the total
            number of CPUs in the machine). Thus for n_jobs = -2, all CPUs
            but one are used. If (n_cpus + 1 + n_jobs) becomes less than 1,
            then no parallel computing code will be used (i.e., equivalent
            to the default).
        show_progress (boolean): flag to indicate whether task progress
            should be displayed to the user (defaults to True).

    Returns:
        An output table containing tuple pairs that satisfy the join
        condition (DataFrame).
    """
    # check if the input tables are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # check if the key attributes and join attributes exist
    validate_attr(l_key_attr, ltable.columns,
                  'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns,
                  'key attribute', 'right table')
    validate_attr(l_join_attr, ltable.columns,
                  'join attribute', 'left table')
    validate_attr(r_join_attr, rtable.columns,
                  'join attribute', 'right table')

    # check if the join attributes are not of numeric type
    validate_attr_type(l_join_attr, ltable[l_join_attr].dtype,
                       'join attribute', 'left table')
    validate_attr_type(r_join_attr, rtable[r_join_attr].dtype,
                       'join attribute', 'right table')

    # check if the input tokenizer is valid
    validate_tokenizer(tokenizer)

    # check if the input threshold is valid
    validate_threshold(threshold, 'DICE')

    # check if the comparison operator is valid
    validate_comp_op_for_sim_measure(comp_op, 'DICE')

    # check if the output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns,
                          r_out_attrs, rtable.columns)

    # check if the key attributes are unique and do not contain missing
    # values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    # Set return_set flag of tokenizer to be True, in case it is set to
    # False. Dice is a set-based measure, so the join needs token SETS.
    # NOTE(review): the flag is reverted only at the end of the function;
    # if an exception escapes mid-join the caller's tokenizer is left
    # mutated — consider a try/finally. TODO confirm this is acceptable.
    revert_tokenizer_return_set_flag = False
    if not tokenizer.get_return_set():
        tokenizer.set_return_set(True)
        revert_tokenizer_return_set_flag = True

    # remove redundant attrs from output attrs.
    l_out_attrs = remove_redundant_attrs(l_out_attrs, l_key_attr)
    r_out_attrs = remove_redundant_attrs(r_out_attrs, r_key_attr)

    # get attributes to project.
    l_proj_attrs = get_attrs_to_project(l_out_attrs, l_key_attr, l_join_attr)
    r_proj_attrs = get_attrs_to_project(r_out_attrs, r_key_attr, r_join_attr)

    # Do a projection on the input dataframes to keep only the required
    # attributes. Then, remove rows with missing value in join attribute
    # from the input dataframes. Then, convert the resulting dataframes
    # into ndarray.
    ltable_array = convert_dataframe_to_array(ltable, l_proj_attrs,
                                              l_join_attr)
    rtable_array = convert_dataframe_to_array(rtable, r_proj_attrs,
                                              r_join_attr)

    # computes the actual number of jobs to launch.
    n_jobs = min(get_num_processes_to_launch(n_jobs), len(rtable_array))

    if n_jobs <= 1:
        # if n_jobs is 1, do not use any parallel code.
        output_table = set_sim_join(ltable_array, rtable_array,
                                    l_proj_attrs, r_proj_attrs,
                                    l_key_attr, r_key_attr,
                                    l_join_attr, r_join_attr,
                                    tokenizer, 'DICE',
                                    threshold, comp_op, allow_empty,
                                    l_out_attrs, r_out_attrs,
                                    l_out_prefix, r_out_prefix,
                                    out_sim_score, show_progress)
    else:
        # if n_jobs is above 1, split the right table into n_jobs splits and
        # join each right table split with the whole of left table in a
        # separate process. Progress is shown only for the last split.
        r_splits = split_table(rtable_array, n_jobs)
        results = Parallel(n_jobs=n_jobs)(delayed(set_sim_join)(
                                              ltable_array, r_splits[job_index],
                                              l_proj_attrs, r_proj_attrs,
                                              l_key_attr, r_key_attr,
                                              l_join_attr, r_join_attr,
                                              tokenizer, 'DICE',
                                              threshold, comp_op, allow_empty,
                                              l_out_attrs, r_out_attrs,
                                              l_out_prefix, r_out_prefix,
                                              out_sim_score,
                                              (show_progress and (job_index == n_jobs - 1)))
                                          for job_index in range(n_jobs))
        output_table = pd.concat(results)

    # If allow_missing flag is set, then compute all pairs with missing
    # value in at least one of the join attributes and then add it to the
    # output obtained from the join.
    if allow_missing:
        missing_pairs = get_pairs_with_missing_value(
                            ltable, rtable,
                            l_key_attr, r_key_attr,
                            l_join_attr, r_join_attr,
                            l_out_attrs, r_out_attrs,
                            l_out_prefix, r_out_prefix,
                            out_sim_score, show_progress)
        output_table = pd.concat([output_table, missing_pairs])

    # add an id column named '_id' to the output table.
    output_table.insert(0, '_id', range(0, len(output_table)))

    # revert the return_set flag of tokenizer, in case it was modified.
    if revert_tokenizer_return_set_flag:
        tokenizer.set_return_set(False)

    return output_table
def apply_matcher(candset, candset_l_key_attr, candset_r_key_attr, ltable, rtable, l_key_attr, r_key_attr, l_match_attr, r_match_attr, tokenizer, sim_function, threshold, comp_op='>=', allow_missing=False, l_out_attrs=None, r_out_attrs=None, l_out_prefix='l_', r_out_prefix='r_', out_sim_score=True, n_jobs=1, show_progress=True): """Find matching string pairs from the candidate set (typically produced by applying a filter to two tables) by applying a matcher of form (sim_function comp_op threshold). Specifically, this method computes the input similarity function on string pairs in the candidate set and checks if the resulting score satisfies the input threshold (depending on the comparison operator). Args: candset (DataFrame): input candidate set. candset_l_key_attr (string): attribute in candidate set which is a key in left table. candset_r_key_attr (string): attribute in candidate set which is a key in right table. ltable (DataFrame): left input table. rtable (DataFrame): right input table. l_key_attr (string): key attribute in left table. r_key_attr (string): key attribute in right table. l_match_attr (string): attribute in left table on which the matcher should be applied. r_match_attr (string): attribute in right table on which the matcher should be applied. tokenizer (Tokenizer): tokenizer to be used to tokenize the match attributes. If set to None, the matcher is applied directly on the match attributes. sim_function (function): matcher function to be applied. threshold (float): threshold to be satisfied. comp_op (string): comparison operator. Supported values are '>=', '>', ' <=', '<', '=' and '!=' (defaults to '>='). allow_missing (boolean): flag to indicate whether tuple pairs with missing value in at least one of the match attributes should be included in the output (defaults to False). l_out_attrs (list): list of attribute names from the left table to be included in the output table (defaults to None). 
r_out_attrs (list): list of attribute names from the right table to be included in the output table (defaults to None). l_out_prefix (string): prefix to be used for the attribute names coming from the left table, in the output table (defaults to 'l\_'). r_out_prefix (string): prefix to be used for the attribute names coming from the right table, in the output table (defaults to 'r\_'). out_sim_score (boolean): flag to indicate whether similarity score should be included in the output table (defaults to True). Setting this flag to True will add a column named '_sim_score' in the output table. This column will contain the similarity scores for the tuple pairs in the output. n_jobs (int): number of parallel jobs to use for the computation (defaults to 1). If -1 is given, all CPUs are used. If 1 is given, no parallel computing code is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is the total number of CPUs in the machine). Thus for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) becomes less than 1, then no parallel computing code will be used (i.e., equivalent to the default). show_progress (boolean): flag to indicate whether task progress should be displayed to the user (defaults to True). Returns: An output table containing tuple pairs from the candidate set that survive the matcher (DataFrame). 
""" # check if the input candset is a dataframe validate_input_table(candset, 'candset') # check if the candset key attributes exist validate_attr(candset_l_key_attr, candset.columns, 'left key attribute', 'candset') validate_attr(candset_r_key_attr, candset.columns, 'right key attribute', 'candset') # check if the input tables are dataframes validate_input_table(ltable, 'left table') validate_input_table(rtable, 'right table') # check if the key attributes and join attributes exist validate_attr(l_key_attr, ltable.columns, 'key attribute', 'left table') validate_attr(r_key_attr, rtable.columns, 'key attribute', 'right table') validate_attr(l_match_attr, ltable.columns, 'match attribute', 'left table') validate_attr(r_match_attr, rtable.columns, 'match attribute', 'right table') # check if the output attributes exist validate_output_attrs(l_out_attrs, ltable.columns, r_out_attrs, rtable.columns) # check if the input tokenizer is valid, if it is not None if tokenizer is not None: validate_tokenizer(tokenizer) # check if the comparison operator is valid validate_comp_op(comp_op) # check if the key attributes are unique and do not contain missing values validate_key_attr(l_key_attr, ltable, 'left table') validate_key_attr(r_key_attr, rtable, 'right table') # check for empty candset if candset.empty: return candset # remove redundant attrs from output attrs. l_out_attrs = remove_redundant_attrs(l_out_attrs, l_key_attr) r_out_attrs = remove_redundant_attrs(r_out_attrs, r_key_attr) # get attributes to project. l_proj_attrs = get_attrs_to_project(l_out_attrs, l_key_attr, l_match_attr) r_proj_attrs = get_attrs_to_project(r_out_attrs, r_key_attr, r_match_attr) # do a projection on the input dataframes. Note that this doesn't create a # copy of the dataframes. It only creates a view on original dataframes. ltable_projected = ltable[l_proj_attrs] rtable_projected = rtable[r_proj_attrs] # computes the actual number of jobs to launch. 
n_jobs = min(get_num_processes_to_launch(n_jobs), len(candset)) # If a tokenizer is provided, we can optimize by tokenizing each value # only once by caching the tokens of l_match_attr and r_match_attr. But, # this can be a bad strategy in case the candset has very few records # compared to the original tables. Hence, we check if the sum of tuples in # ltable and rtable is less than twice the number of tuples in the candset. # If yes, we decide to cache the token values. Else, we do not cache the # tokens as the candset is small. l_tokens = None r_tokens = None if tokenizer is not None and (len(ltable) + len(rtable) < len(candset) * 2): l_tokens = generate_tokens(ltable_projected, l_key_attr, l_match_attr, tokenizer) r_tokens = generate_tokens(rtable_projected, r_key_attr, r_match_attr, tokenizer) if n_jobs <= 1: # if n_jobs is 1, do not use any parallel code. output_table = _apply_matcher_split( candset, candset_l_key_attr, candset_r_key_attr, ltable_projected, rtable_projected, l_key_attr, r_key_attr, l_match_attr, r_match_attr, tokenizer, sim_function, threshold, comp_op, allow_missing, l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix, out_sim_score, show_progress, l_tokens, r_tokens) else: # if n_jobs is above 1, split the candset into n_jobs splits and apply # the matcher on each candset split in a separate process. candset_splits = split_table(candset, n_jobs) results = Parallel(n_jobs=n_jobs)(delayed(_apply_matcher_split)( candset_splits[job_index], candset_l_key_attr, candset_r_key_attr, ltable_projected, rtable_projected, l_key_attr, r_key_attr, l_match_attr, r_match_attr, tokenizer, sim_function, threshold, comp_op, allow_missing, l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix, out_sim_score, (show_progress and ( job_index == n_jobs - 1)), l_tokens, r_tokens) for job_index in range(n_jobs)) output_table = pd.concat(results) return output_table
def filter_tables(self, ltable, rtable,
                  l_key_attr, r_key_attr,
                  l_filter_attr, r_filter_attr,
                  l_out_attrs=None, r_out_attrs=None,
                  l_out_prefix='l_', r_out_prefix='r_',
                  out_sim_score=False, n_jobs=1, show_progress=True):
    """Finds candidate matching pairs of strings from the input tables using
    overlap filtering technique.

    Args:
        ltable (DataFrame): left input table.
        rtable (DataFrame): right input table.
        l_key_attr (string): key attribute in left table.
        r_key_attr (string): key attribute in right table.
        l_filter_attr (string): attribute in left table on which the filter
            should be applied.
        r_filter_attr (string): attribute in right table on which the filter
            should be applied.
        l_out_attrs (list): list of attribute names from the left table to
            be included in the output table (defaults to None).
        r_out_attrs (list): list of attribute names from the right table to
            be included in the output table (defaults to None).
        l_out_prefix (string): prefix to be used for the attribute names
            coming from the left table, in the output table
            (defaults to 'l\_').
        r_out_prefix (string): prefix to be used for the attribute names
            coming from the right table, in the output table
            (defaults to 'r\_').
        out_sim_score (boolean): flag to indicate whether the overlap score
            should be included in the output table (defaults to False).
            Setting this flag to True will add a column named '_sim_score'
            in the output table. This column will contain the overlap scores
            for the tuple pairs in the output.
        n_jobs (int): number of parallel jobs to use for the computation
            (defaults to 1). If -1 is given, all CPUs are used. If 1 is
            given, no parallel computing code is used at all, which is
            useful for debugging. For n_jobs below -1,
            (n_cpus + 1 + n_jobs) are used (where n_cpus is the total
            number of CPUs in the machine). Thus for n_jobs = -2, all CPUs
            but one are used. If (n_cpus + 1 + n_jobs) becomes less than 1,
            then no parallel computing code will be used (i.e., equivalent
            to the default).
        show_progress (boolean): flag to indicate whether task progress
            should be displayed to the user (defaults to True).

    Returns:
        An output table containing tuple pairs that survive the filter
        (DataFrame).
    """
    # check if the input tables are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # check if the key attributes and filter attributes exist
    validate_attr(l_key_attr, ltable.columns,
                  'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns,
                  'key attribute', 'right table')
    validate_attr(l_filter_attr, ltable.columns,
                  'attribute', 'left table')
    validate_attr(r_filter_attr, rtable.columns,
                  'attribute', 'right table')

    # check if the filter attributes are not of numeric type
    validate_attr_type(l_filter_attr, ltable[l_filter_attr].dtype,
                       'attribute', 'left table')
    validate_attr_type(r_filter_attr, rtable[r_filter_attr].dtype,
                       'attribute', 'right table')

    # check if the output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns,
                          r_out_attrs, rtable.columns)

    # check if the key attributes are unique and do not contain
    # missing values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    # remove redundant attrs from output attrs.
    l_out_attrs = remove_redundant_attrs(l_out_attrs, l_key_attr)
    r_out_attrs = remove_redundant_attrs(r_out_attrs, r_key_attr)

    # get attributes to project.
    l_proj_attrs = get_attrs_to_project(l_out_attrs, l_key_attr,
                                        l_filter_attr)
    r_proj_attrs = get_attrs_to_project(r_out_attrs, r_key_attr,
                                        r_filter_attr)

    # Do a projection on the input dataframes to keep only the required
    # attributes. Then, remove rows with missing value in filter attribute
    # from the input dataframes. Then, convert the resulting dataframes
    # into ndarray.
    ltable_array = convert_dataframe_to_array(ltable, l_proj_attrs,
                                              l_filter_attr)
    rtable_array = convert_dataframe_to_array(rtable, r_proj_attrs,
                                              r_filter_attr)

    # computes the actual number of jobs to launch.
    n_jobs = min(get_num_processes_to_launch(n_jobs), len(rtable_array))

    if n_jobs <= 1:
        # if n_jobs is 1, do not use any parallel code.
        output_table = _filter_tables_split(
                           ltable_array, rtable_array,
                           l_proj_attrs, r_proj_attrs,
                           l_key_attr, r_key_attr,
                           l_filter_attr, r_filter_attr,
                           self,
                           l_out_attrs, r_out_attrs,
                           l_out_prefix, r_out_prefix,
                           out_sim_score, show_progress)
    else:
        # if n_jobs is above 1, split the right table into n_jobs splits and
        # filter each right table split with the whole of left table in a
        # separate process. Progress is shown only for the last split.
        r_splits = split_table(rtable_array, n_jobs)
        results = Parallel(n_jobs=n_jobs)(delayed(_filter_tables_split)(
                                              ltable_array,
                                              r_splits[job_index],
                                              l_proj_attrs, r_proj_attrs,
                                              l_key_attr, r_key_attr,
                                              l_filter_attr, r_filter_attr,
                                              self,
                                              l_out_attrs, r_out_attrs,
                                              l_out_prefix, r_out_prefix,
                                              out_sim_score,
                                              (show_progress and (job_index==n_jobs-1)))
                                          for job_index in range(n_jobs))
        output_table = pd.concat(results)

    # If allow_missing flag is set, then compute all pairs with missing
    # value in at least one of the filter attributes and then add it to the
    # output obtained from applying the filter.
    if self.allow_missing:
        missing_pairs = get_pairs_with_missing_value(
                            ltable, rtable,
                            l_key_attr, r_key_attr,
                            l_filter_attr, r_filter_attr,
                            l_out_attrs, r_out_attrs,
                            l_out_prefix, r_out_prefix,
                            out_sim_score, show_progress)
        output_table = pd.concat([output_table, missing_pairs])

    # add an id column named '_id' to the output table.
    output_table.insert(0, '_id', range(0, len(output_table)))

    return output_table
def test_valid_join(scenario, sim_measure_type, args, convert_to_str=False):
    """Generic driver for set-similarity join tests.

    Computes the expected set of matching pairs by applying the similarity
    function over the full cartesian product of the two input tables, then
    runs the join function under test and verifies that the output schema
    and the output pairs match the expectation.

    Args:
        scenario: pair of (table_path, key_attr, join_attr) triples for the
            left and right input tables.
        sim_measure_type (string): key into JOIN_FN_MAP selecting the join
            function to test.
        args (tuple): positional args forwarded to the join function, in
            the order (tokenizer, threshold, comp_op, allow_empty,
            allow_missing, l_out_attrs, r_out_attrs, l_out_prefix,
            r_out_prefix, out_sim_score); trailing entries may be omitted.
        convert_to_str (boolean): flag to convert the join attributes to
            string type before running the test (defaults to False).
    """
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]
    join_fn = JOIN_FN_MAP[sim_measure_type]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 4 and args[4]:
        for _, l_row in ltable.iterrows():
            for _, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr]) or
                        pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[l_key_attr]),
                                                str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new
    # dataframes consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # if allow_empty flag is False, drop rows whose join attribute
    # tokenizes to an empty set, since they cannot appear in the output.
    if len(args) > 3 and (not args[3]):
        ltable_not_missing = ltable_not_missing[ltable_not_missing.apply(
            lambda row: len(args[0].tokenize(str(row[l_join_attr]))),
            axis=1) > 0]
        rtable_not_missing = rtable_not_missing[rtable_not_missing.apply(
            lambda row: len(args[0].tokenize(str(row[r_join_attr]))),
            axis=1) > 0]

    # generate cartesian product to be used as candset
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop('tmp_join_key', axis=1)
    # BUGFIX: the original called drop() without inplace=True and discarded
    # the result, so the temporary join key column was never removed.
    ltable_not_missing.drop('tmp_join_key', axis=1, inplace=True)
    rtable_not_missing.drop('tmp_join_key', axis=1, inplace=True)

    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(
        lambda row: round(sim_func(args[0].tokenize(str(row[l_join_attr])),
                                   args[0].tokenize(str(row[r_join_attr]))),
                          4),
        axis=1)

    comp_fn = COMP_OP_MAP[DEFAULT_COMP_OP]
    # Check for comp_op in args.
    if len(args) > 2:
        comp_fn = COMP_OP_MAP[args[2]]

    expected_pairs = set()
    for _, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), args[1]):
            expected_pairs.add(','.join((str(row[l_key_attr]),
                                         str(row[r_key_attr]))))
    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = args[0].get_return_set()

    # use join function to obtain actual output pairs.
    actual_candset = join_fn(ltable, rtable, l_key_attr, r_key_attr,
                             l_join_attr, r_join_attr, *args)

    # the join must leave the tokenizer's return_set flag unchanged.
    assert_equal(args[0].get_return_set(), orig_return_set_flag)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 7:
        l_out_prefix = args[7]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 8:
        r_out_prefix = args[8]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 5:
        if args[5]:
            l_out_attrs = remove_redundant_attrs(args[5], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 6:
        if args[6]:
            r_out_attrs = remove_redundant_attrs(args[6], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args. The score column is present by
    # default and absent only when out_sim_score is explicitly False.
    if len(args) > 9:
        if args[9]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for _, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
def edit_distance_join(ltable, rtable,
                       l_key_attr, r_key_attr,
                       l_join_attr, r_join_attr,
                       threshold, comp_op='<=',
                       allow_missing=False,
                       l_out_attrs=None, r_out_attrs=None,
                       l_out_prefix='l_', r_out_prefix='r_',
                       out_sim_score=True, n_jobs=1, show_progress=True,
                       tokenizer=QgramTokenizer(qval=2)):
    """Join two tables using edit distance measure.

    Finds tuple pairs from left table and right table such that the edit
    distance between the join attributes satisfies the condition on input
    threshold. For example, if the comparison operator is '<=', finds tuple
    pairs whose edit distance between the strings that are the values of
    the join attributes is less than or equal to the input threshold, as
    specified in "threshold".

    Note:
        Currently, this method only computes an approximate join result.
        This is because, to perform the join we transform an edit distance
        measure between strings into an overlap measure between qgrams of
        the strings. Hence, we need at least one qgram to be in common
        between two input strings, to appear in the join output. For
        smaller strings, where all qgrams of the strings differ, we cannot
        process them.

        This method implements a simplified version of the algorithm
        proposed in `Ed-Join: An Efficient Algorithm for Similarity Joins
        With Edit Distance Constraints (Chuan Xiao, Wei Wang and Xuemin
        Lin), VLDB 08 <http://www.vldb.org/pvldb/1/1453957.pdf>`_.

    Args:
        ltable (DataFrame): left input table.
        rtable (DataFrame): right input table.
        l_key_attr (string): key attribute in left table.
        r_key_attr (string): key attribute in right table.
        l_join_attr (string): join attribute in left table.
        r_join_attr (string): join attribute in right table.
        threshold (float): edit distance threshold to be satisfied.
        comp_op (string): comparison operator. Supported values are '<=',
            '<' and '=' (defaults to '<=').
        allow_missing (boolean): flag to indicate whether tuple pairs with
            missing value in at least one of the join attributes should be
            included in the output (defaults to False). If this flag is set
            to True, a tuple in ltable with missing value in the join
            attribute will be matched with every tuple in rtable and vice
            versa.
        l_out_attrs (list): list of attribute names from the left table to
            be included in the output table (defaults to None).
        r_out_attrs (list): list of attribute names from the right table to
            be included in the output table (defaults to None).
        l_out_prefix (string): prefix to be used for the attribute names
            coming from the left table, in the output table
            (defaults to 'l\_').
        r_out_prefix (string): prefix to be used for the attribute names
            coming from the right table, in the output table
            (defaults to 'r\_').
        out_sim_score (boolean): flag to indicate whether the edit distance
            score should be included in the output table (defaults to
            True). Setting this flag to True will add a column named
            '_sim_score' in the output table. This column will contain the
            edit distance scores for the tuple pairs in the output.
        n_jobs (int): number of parallel jobs to use for the computation
            (defaults to 1). If -1 is given, all CPUs are used. If 1 is
            given, no parallel computing code is used at all, which is
            useful for debugging. For n_jobs below -1,
            (n_cpus + 1 + n_jobs) are used (where n_cpus is the total
            number of CPUs in the machine). Thus for n_jobs = -2, all CPUs
            but one are used. If (n_cpus + 1 + n_jobs) becomes less than 1,
            then no parallel computing code will be used (i.e., equivalent
            to the default).
        show_progress (boolean): flag to indicate whether task progress
            should be displayed to the user (defaults to True).
        tokenizer (Tokenizer): tokenizer to be used to tokenize the join
            attributes during filtering, when edit distance measure is
            transformed into an overlap measure. This must be a q-gram
            tokenizer (defaults to 2-gram tokenizer).
            NOTE(review): the default is a single QgramTokenizer instance
            created at import time and shared across all calls; its
            return_set flag is mutated and reverted below — confirm this
            sharing is intentional.

    Returns:
        An output table containing tuple pairs that satisfy the join
        condition (DataFrame).
    """
    # check if the input tables are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # check if the key attributes and join attributes exist
    validate_attr(l_key_attr, ltable.columns,
                  'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns,
                  'key attribute', 'right table')
    validate_attr(l_join_attr, ltable.columns,
                  'join attribute', 'left table')
    validate_attr(r_join_attr, rtable.columns,
                  'join attribute', 'right table')

    # check if the join attributes are not of numeric type
    validate_attr_type(l_join_attr, ltable[l_join_attr].dtype,
                       'join attribute', 'left table')
    validate_attr_type(r_join_attr, rtable[r_join_attr].dtype,
                       'join attribute', 'right table')

    # check if the input tokenizer is valid for edit distance measure. Only
    # qgram tokenizer can be used for edit distance.
    validate_tokenizer_for_sim_measure(tokenizer, 'EDIT_DISTANCE')

    # check if the input threshold is valid
    validate_threshold(threshold, 'EDIT_DISTANCE')

    # check if the comparison operator is valid
    validate_comp_op_for_sim_measure(comp_op, 'EDIT_DISTANCE')

    # check if the output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns,
                          r_out_attrs, rtable.columns)

    # check if the key attributes are unique and do not contain missing
    # values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    # convert threshold to integer (incase if it is float)
    threshold = int(floor(threshold))

    # Set return_set flag of tokenizer to be False, in case it is set to
    # True. Edit distance filtering counts qgram multiplicities, so it
    # needs token BAGS, not sets.
    # NOTE(review): the flag is reverted only at the end of the function;
    # if an exception escapes mid-join the tokenizer is left mutated —
    # consider a try/finally. TODO confirm this is acceptable.
    revert_tokenizer_return_set_flag = False
    if tokenizer.get_return_set():
        tokenizer.set_return_set(False)
        revert_tokenizer_return_set_flag = True

    # remove redundant attrs from output attrs.
    l_out_attrs = remove_redundant_attrs(l_out_attrs, l_key_attr)
    r_out_attrs = remove_redundant_attrs(r_out_attrs, r_key_attr)

    # get attributes to project.
    l_proj_attrs = get_attrs_to_project(l_out_attrs, l_key_attr, l_join_attr)
    r_proj_attrs = get_attrs_to_project(r_out_attrs, r_key_attr, r_join_attr)

    # Do a projection on the input dataframes to keep only the required
    # attributes. Then, remove rows with missing value in join attribute
    # from the input dataframes. Then, convert the resulting dataframes
    # into ndarray.
    ltable_array = convert_dataframe_to_array(ltable, l_proj_attrs,
                                              l_join_attr)
    rtable_array = convert_dataframe_to_array(rtable, r_proj_attrs,
                                              r_join_attr)

    # computes the actual number of jobs to launch.
    n_jobs = min(get_num_processes_to_launch(n_jobs), len(rtable_array))

    if n_jobs <= 1:
        # if n_jobs is 1, do not use any parallel code.
        output_table = _edit_distance_join_split(
                           ltable_array, rtable_array,
                           l_proj_attrs, r_proj_attrs,
                           l_key_attr, r_key_attr,
                           l_join_attr, r_join_attr,
                           tokenizer, threshold, comp_op,
                           l_out_attrs, r_out_attrs,
                           l_out_prefix, r_out_prefix,
                           out_sim_score, show_progress)
    else:
        # if n_jobs is above 1, split the right table into n_jobs splits and
        # join each right table split with the whole of left table in a
        # separate process. Progress is shown only for the last split.
        r_splits = split_table(rtable_array, n_jobs)
        results = Parallel(n_jobs=n_jobs)(delayed(_edit_distance_join_split)(
                                              ltable_array,
                                              r_splits[job_index],
                                              l_proj_attrs, r_proj_attrs,
                                              l_key_attr, r_key_attr,
                                              l_join_attr, r_join_attr,
                                              tokenizer, threshold, comp_op,
                                              l_out_attrs, r_out_attrs,
                                              l_out_prefix, r_out_prefix,
                                              out_sim_score,
                                              (show_progress and (job_index==n_jobs-1)))
                                          for job_index in range(n_jobs))
        output_table = pd.concat(results)

    # If allow_missing flag is set, then compute all pairs with missing
    # value in at least one of the join attributes and then add it to the
    # output obtained from the join.
    if allow_missing:
        missing_pairs = get_pairs_with_missing_value(
                            ltable, rtable,
                            l_key_attr, r_key_attr,
                            l_join_attr, r_join_attr,
                            l_out_attrs, r_out_attrs,
                            l_out_prefix, r_out_prefix,
                            out_sim_score, show_progress)
        output_table = pd.concat([output_table, missing_pairs])

    # add an id column named '_id' to the output table.
    output_table.insert(0, '_id', range(0, len(output_table)))

    # revert the return_set flag of tokenizer, in case it was modified.
    if revert_tokenizer_return_set_flag:
        tokenizer.set_return_set(True)

    return output_table
def test_valid_join(scenario, tok, threshold, comp_op=DEFAULT_COMP_OP,
                    args=(), convert_to_str=False):
    """Validate edit_distance_join output against a brute-force computation.

    Computes the expected pair set by scoring the full cartesian product of
    the two input tables, then runs edit_distance_join and checks that the
    output schema and the produced pairs match.

    Args:
        scenario: pair of (csv_path, key_attr, join_attr) triples for the
            left and right input tables.
        tok: q-gram tokenizer; used to drop pairs with no common q-grams
            from the expected set (the join is approximate and cannot find
            such pairs).
        threshold: edit distance threshold.
        comp_op: comparison operator string ('<=', '<' or '=').
        args: extra positional args forwarded to edit_distance_join, in
            order: allow_missing, l_out_attrs, r_out_attrs, l_out_prefix,
            r_out_prefix, out_sim_score.
        convert_to_str: if True, coerce the join attributes to string dtype
            before joining.
    """
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__), ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__), rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 0 and args[0]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr]) or
                        pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[l_key_attr]),
                                                str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new
    # dataframes consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # generate cartesian product to be used as candset
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    # Fix: pass axis as a keyword. The positional form drop(label, 1) was
    # deprecated in pandas 1.0 and removed in pandas 2.0.
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop('tmp_join_key', axis=1)
    # Fix: the original called drop() and discarded the returned frame,
    # which is a no-op; drop the helper column in place instead.
    ltable_not_missing.drop('tmp_join_key', axis=1, inplace=True)
    rtable_not_missing.drop('tmp_join_key', axis=1, inplace=True)

    sim_measure_type = 'EDIT_DISTANCE'
    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_func(str(row[l_join_attr]), str(row[r_join_attr])),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]

    expected_pairs = set()
    overlap = get_sim_function('OVERLAP')
    for idx, row in cartprod.iterrows():
        l_tokens = tok.tokenize(str(row[l_join_attr]))
        r_tokens = tok.tokenize(str(row[r_join_attr]))
        if len(str(row[l_join_attr])) == 0 or len(str(row[r_join_attr])) == 0:
            continue
        # current edit distance join is approximate. It cannot find matching
        # strings which don't have any common q-grams. Hence, remove pairs
        # that don't have any common q-grams from expected pairs.
        if comp_fn(float(row['sim_score']), threshold):
            if overlap(l_tokens, r_tokens) > 0:
                expected_pairs.add(','.join((str(row[l_key_attr]),
                                             str(row[r_key_attr]))))

    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = tok.get_return_set()

    # use join function to obtain actual output pairs.
    actual_candset = edit_distance_join(ltable, rtable,
                                        l_key_attr, r_key_attr,
                                        l_join_attr, r_join_attr,
                                        threshold, comp_op, *args,
                                        tokenizer=tok)

    # the join must not permanently modify the tokenizer's return_set flag.
    assert_equal(tok.get_return_set(), orig_return_set_flag)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 3:
        l_out_prefix = args[3]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 4:
        r_out_prefix = args[4]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 1:
        if args[1]:
            l_out_attrs = remove_redundant_attrs(args[1], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 2:
        if args[2]:
            r_out_attrs = remove_redundant_attrs(args[2], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args. The score column is present by
    # default, and absent only when out_sim_score is explicitly False.
    if len(args) > 5:
        if args[5]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
def edit_distance_join(ltable, rtable,
                       l_key_attr, r_key_attr,
                       l_join_attr, r_join_attr,
                       threshold, comp_op='<=',
                       allow_missing=False,
                       l_out_attrs=None, r_out_attrs=None,
                       l_out_prefix='l_', r_out_prefix='r_',
                       out_sim_score=True, n_jobs=1, show_progress=True,
                       tokenizer=None):
    """Join two tables using edit distance measure.

    Finds tuple pairs from left table and right table such that the edit
    distance between the join attributes satisfies the condition on input
    threshold. For example, if the comparison operator is '<=', finds tuple
    pairs whose edit distance between the strings that are the values of
    the join attributes is less than or equal to the input threshold, as
    specified in "threshold".

    Note:
        Currently, this method only computes an approximate join result.
        This is because, to perform the join we transform an edit distance
        measure between strings into an overlap measure between qgrams of
        the strings. Hence, we need at least one qgram to be in common
        between two input strings, to appear in the join output. For
        smaller strings, where all qgrams of the strings differ, we cannot
        process them.

        This method implements a simplified version of the algorithm
        proposed in `Ed-Join: An Efficient Algorithm for Similarity Joins
        With Edit Distance Constraints (Chuan Xiao, Wei Wang and Xuemin
        Lin), VLDB 08 <http://www.vldb.org/pvldb/1/1453957.pdf>`_.

    Args:
        ltable (DataFrame): left input table.

        rtable (DataFrame): right input table.

        l_key_attr (string): key attribute in left table.

        r_key_attr (string): key attribute in right table.

        l_join_attr (string): join attribute in left table.

        r_join_attr (string): join attribute in right table.

        threshold (float): edit distance threshold to be satisfied.

        comp_op (string): comparison operator. Supported values are '<=',
            '<' and '=' (defaults to '<=').

        allow_missing (boolean): flag to indicate whether tuple pairs with
            missing value in at least one of the join attributes should be
            included in the output (defaults to False). If this flag is set
            to True, a tuple in ltable with missing value in the join
            attribute will be matched with every tuple in rtable and vice
            versa.

        l_out_attrs (list): list of attribute names from the left table to
            be included in the output table (defaults to None).

        r_out_attrs (list): list of attribute names from the right table to
            be included in the output table (defaults to None).

        l_out_prefix (string): prefix to be used for the attribute names
            coming from the left table, in the output table (defaults to
            'l\_').

        r_out_prefix (string): prefix to be used for the attribute names
            coming from the right table, in the output table (defaults to
            'r\_').

        out_sim_score (boolean): flag to indicate whether the edit distance
            score should be included in the output table (defaults to
            True). Setting this flag to True will add a column named
            '_sim_score' in the output table. This column will contain the
            edit distance scores for the tuple pairs in the output.

        n_jobs (int): number of parallel jobs to use for the computation
            (defaults to 1). If -1 is given, all CPUs are used. If 1 is
            given, no parallel computing code is used at all, which is
            useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs)
            are used (where n_cpus is the total number of CPUs in the
            machine). Thus for n_jobs = -2, all CPUs but one are used. If
            (n_cpus + 1 + n_jobs) becomes less than 1, then no parallel
            computing code will be used (i.e., equivalent to the default).

        show_progress (boolean): flag to indicate whether task progress
            should be displayed to the user (defaults to True).

        tokenizer (Tokenizer): tokenizer to be used to tokenize the join
            attributes during filtering, when edit distance measure is
            transformed into an overlap measure. This must be a q-gram
            tokenizer (defaults to None, in which case a fresh 2-gram
            tokenizer is created).

    Returns:
        An output table containing tuple pairs that satisfy the join
        condition (DataFrame).
    """
    # Fix: avoid a mutable default argument. The previous default
    # (tokenizer=QgramTokenizer(qval=2)) created a single tokenizer shared
    # by every call that omitted the argument, and this function mutates
    # the tokenizer's return_set flag below. Creating a fresh instance per
    # call preserves the documented default behavior without the shared
    # state.
    if tokenizer is None:
        tokenizer = QgramTokenizer(qval=2)

    # check if the input tables are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # check if the key attributes and join attributes exist
    validate_attr(l_key_attr, ltable.columns, 'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns, 'key attribute', 'right table')
    validate_attr(l_join_attr, ltable.columns, 'join attribute', 'left table')
    validate_attr(r_join_attr, rtable.columns, 'join attribute',
                  'right table')

    # check if the join attributes are not of numeric type
    validate_attr_type(l_join_attr, ltable[l_join_attr].dtype,
                       'join attribute', 'left table')
    validate_attr_type(r_join_attr, rtable[r_join_attr].dtype,
                       'join attribute', 'right table')

    # check if the input tokenizer is valid for edit distance measure. Only
    # qgram tokenizer can be used for edit distance.
    validate_tokenizer_for_sim_measure(tokenizer, 'EDIT_DISTANCE')

    # check if the input threshold is valid
    validate_threshold(threshold, 'EDIT_DISTANCE')

    # check if the comparison operator is valid
    validate_comp_op_for_sim_measure(comp_op, 'EDIT_DISTANCE')

    # check if the output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns,
                          r_out_attrs, rtable.columns)

    # check if the key attributes are unique and do not contain missing
    # values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    # convert threshold to integer (in case it is a float)
    threshold = int(floor(threshold))

    # set return_set flag of tokenizer to be False, in case it is set to
    # True
    revert_tokenizer_return_set_flag = False
    if tokenizer.get_return_set():
        tokenizer.set_return_set(False)
        revert_tokenizer_return_set_flag = True

    # remove redundant attrs from output attrs.
    l_out_attrs = remove_redundant_attrs(l_out_attrs, l_key_attr)
    r_out_attrs = remove_redundant_attrs(r_out_attrs, r_key_attr)

    # get attributes to project.
    l_proj_attrs = get_attrs_to_project(l_out_attrs, l_key_attr, l_join_attr)
    r_proj_attrs = get_attrs_to_project(r_out_attrs, r_key_attr, r_join_attr)

    # Do a projection on the input dataframes to keep only the required
    # attributes. Then, remove rows with missing value in join attribute
    # from the input dataframes. Then, convert the resulting dataframes
    # into ndarray.
    ltable_array = convert_dataframe_to_array(ltable, l_proj_attrs,
                                              l_join_attr)
    rtable_array = convert_dataframe_to_array(rtable, r_proj_attrs,
                                              r_join_attr)

    # computes the actual number of jobs to launch.
    n_jobs = min(get_num_processes_to_launch(n_jobs), len(rtable_array))

    if n_jobs <= 1:
        # if n_jobs is 1, do not use any parallel code.
        output_table = _edit_distance_join_split(
            ltable_array, rtable_array,
            l_proj_attrs, r_proj_attrs,
            l_key_attr, r_key_attr,
            l_join_attr, r_join_attr,
            tokenizer, threshold, comp_op,
            l_out_attrs, r_out_attrs,
            l_out_prefix, r_out_prefix,
            out_sim_score, show_progress)
    else:
        # if n_jobs is above 1, split the right table into n_jobs splits
        # and join each right table split with the whole of left table in a
        # separate process. Progress is only shown for the last split.
        r_splits = split_table(rtable_array, n_jobs)
        results = Parallel(n_jobs=n_jobs)(delayed(_edit_distance_join_split)(
            ltable_array, r_splits[job_index],
            l_proj_attrs, r_proj_attrs,
            l_key_attr, r_key_attr,
            l_join_attr, r_join_attr,
            tokenizer, threshold, comp_op,
            l_out_attrs, r_out_attrs,
            l_out_prefix, r_out_prefix,
            out_sim_score,
            (show_progress and (job_index == n_jobs - 1)))
            for job_index in range(n_jobs))
        output_table = pd.concat(results)

    # If allow_missing flag is set, then compute all pairs with missing
    # value in at least one of the join attributes and then add it to the
    # output obtained from the join.
    if allow_missing:
        missing_pairs = get_pairs_with_missing_value(
            ltable, rtable,
            l_key_attr, r_key_attr,
            l_join_attr, r_join_attr,
            l_out_attrs, r_out_attrs,
            l_out_prefix, r_out_prefix,
            out_sim_score, show_progress)
        output_table = pd.concat([output_table, missing_pairs])

    # add an id column named '_id' to the output table.
    output_table.insert(0, '_id', range(0, len(output_table)))

    # revert the return_set flag of tokenizer, in case it was modified.
    if revert_tokenizer_return_set_flag:
        tokenizer.set_return_set(True)

    return output_table
def dice_join_py(ltable, rtable,
                 l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr,
                 tokenizer, threshold, comp_op='>=',
                 allow_empty=True, allow_missing=False,
                 l_out_attrs=None, r_out_attrs=None,
                 l_out_prefix='l_', r_out_prefix='r_',
                 out_sim_score=True, n_jobs=1, show_progress=True):
    """Join two tables using Dice similarity measure.

    For two sets X and Y, the Dice similarity score between them is given
    by:

        :math:`dice(X, Y) = \\frac{2 * |X \\cap Y|}{|X| + |Y|}`

    In the case where both X and Y are empty sets, we define their Dice
    score to be 1.

    Finds tuple pairs from left table and right table such that the Dice
    similarity between the join attributes satisfies the condition on input
    threshold. For example, if the comparison operator is '>=', finds tuple
    pairs whose Dice similarity between the strings that are the values of
    the join attributes is greater than or equal to the input threshold, as
    specified in "threshold".

    Args:
        ltable (DataFrame): left input table.

        rtable (DataFrame): right input table.

        l_key_attr (string): key attribute in left table.

        r_key_attr (string): key attribute in right table.

        l_join_attr (string): join attribute in left table.

        r_join_attr (string): join attribute in right table.

        tokenizer (Tokenizer): tokenizer to be used to tokenize join
            attributes.

        threshold (float): Dice similarity threshold to be satisfied.

        comp_op (string): comparison operator. Supported values are '>=',
            '>' and '=' (defaults to '>=').

        allow_empty (boolean): flag to indicate whether tuple pairs with
            empty set of tokens in both the join attributes should be
            included in the output (defaults to True).

        allow_missing (boolean): flag to indicate whether tuple pairs with
            missing value in at least one of the join attributes should be
            included in the output (defaults to False). If this flag is set
            to True, a tuple in ltable with missing value in the join
            attribute will be matched with every tuple in rtable and vice
            versa.

        l_out_attrs (list): list of attribute names from the left table to
            be included in the output table (defaults to None).

        r_out_attrs (list): list of attribute names from the right table to
            be included in the output table (defaults to None).

        l_out_prefix (string): prefix to be used for the attribute names
            coming from the left table, in the output table (defaults to
            'l\_').

        r_out_prefix (string): prefix to be used for the attribute names
            coming from the right table, in the output table (defaults to
            'r\_').

        out_sim_score (boolean): flag to indicate whether similarity score
            should be included in the output table (defaults to True).
            Setting this flag to True will add a column named '_sim_score'
            in the output table. This column will contain the similarity
            scores for the tuple pairs in the output.

        n_jobs (int): number of parallel jobs to use for the computation
            (defaults to 1). If -1 is given, all CPUs are used. If 1 is
            given, no parallel computing code is used at all, which is
            useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs)
            are used (where n_cpus is the total number of CPUs in the
            machine). Thus for n_jobs = -2, all CPUs but one are used. If
            (n_cpus + 1 + n_jobs) becomes less than 1, then no parallel
            computing code will be used (i.e., equivalent to the default).

        show_progress (boolean): flag to indicate whether task progress
            should be displayed to the user (defaults to True).

    Returns:
        An output table containing tuple pairs that satisfy the join
        condition (DataFrame).
    """
    # check if the input tables are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # check if the key attributes and join attributes exist
    validate_attr(l_key_attr, ltable.columns, 'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns, 'key attribute', 'right table')
    validate_attr(l_join_attr, ltable.columns, 'join attribute', 'left table')
    validate_attr(r_join_attr, rtable.columns, 'join attribute',
                  'right table')

    # check if the join attributes are not of numeric type
    validate_attr_type(l_join_attr, ltable[l_join_attr].dtype,
                       'join attribute', 'left table')
    validate_attr_type(r_join_attr, rtable[r_join_attr].dtype,
                       'join attribute', 'right table')

    # check if the input tokenizer is valid
    validate_tokenizer(tokenizer)

    # check if the input threshold is valid
    validate_threshold(threshold, 'DICE')

    # check if the comparison operator is valid
    validate_comp_op_for_sim_measure(comp_op, 'DICE')

    # check if the output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns,
                          r_out_attrs, rtable.columns)

    # check if the key attributes are unique and do not contain missing
    # values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    # set return_set flag of tokenizer to be True, in case it is set to
    # False. Dice is a set-based measure, so duplicate tokens must be
    # dropped; the flag is reverted before returning.
    revert_tokenizer_return_set_flag = False
    if not tokenizer.get_return_set():
        tokenizer.set_return_set(True)
        revert_tokenizer_return_set_flag = True

    # remove redundant attrs from output attrs.
    l_out_attrs = remove_redundant_attrs(l_out_attrs, l_key_attr)
    r_out_attrs = remove_redundant_attrs(r_out_attrs, r_key_attr)

    # get attributes to project.
    l_proj_attrs = get_attrs_to_project(l_out_attrs, l_key_attr, l_join_attr)
    r_proj_attrs = get_attrs_to_project(r_out_attrs, r_key_attr, r_join_attr)

    # Do a projection on the input dataframes to keep only the required
    # attributes. Then, remove rows with missing value in join attribute
    # from the input dataframes. Then, convert the resulting dataframes
    # into ndarray.
    ltable_array = convert_dataframe_to_array(ltable, l_proj_attrs,
                                              l_join_attr)
    rtable_array = convert_dataframe_to_array(rtable, r_proj_attrs,
                                              r_join_attr)

    # computes the actual number of jobs to launch.
    n_jobs = min(get_num_processes_to_launch(n_jobs), len(rtable_array))

    if n_jobs <= 1:
        # if n_jobs is 1, do not use any parallel code.
        output_table = set_sim_join(ltable_array, rtable_array,
                                    l_proj_attrs, r_proj_attrs,
                                    l_key_attr, r_key_attr,
                                    l_join_attr, r_join_attr,
                                    tokenizer, 'DICE',
                                    threshold, comp_op, allow_empty,
                                    l_out_attrs, r_out_attrs,
                                    l_out_prefix, r_out_prefix,
                                    out_sim_score, show_progress)
    else:
        # if n_jobs is above 1, split the right table into n_jobs splits
        # and join each right table split with the whole of left table in a
        # separate process. Progress is shown only for the last split.
        r_splits = split_table(rtable_array, n_jobs)
        results = Parallel(n_jobs=n_jobs)(delayed(set_sim_join)(
            ltable_array, r_splits[job_index],
            l_proj_attrs, r_proj_attrs,
            l_key_attr, r_key_attr,
            l_join_attr, r_join_attr,
            tokenizer, 'DICE',
            threshold, comp_op, allow_empty,
            l_out_attrs, r_out_attrs,
            l_out_prefix, r_out_prefix,
            out_sim_score,
            (show_progress and (job_index == n_jobs - 1)))
            for job_index in range(n_jobs))
        output_table = pd.concat(results)

    # If allow_missing flag is set, then compute all pairs with missing
    # value in at least one of the join attributes and then add it to the
    # output obtained from the join.
    if allow_missing:
        missing_pairs = get_pairs_with_missing_value(
            ltable, rtable,
            l_key_attr, r_key_attr,
            l_join_attr, r_join_attr,
            l_out_attrs, r_out_attrs,
            l_out_prefix, r_out_prefix,
            out_sim_score, show_progress)
        output_table = pd.concat([output_table, missing_pairs])

    # add an id column named '_id' to the output table.
    output_table.insert(0, '_id', range(0, len(output_table)))

    # revert the return_set flag of tokenizer, in case it was modified.
    if revert_tokenizer_return_set_flag:
        tokenizer.set_return_set(False)

    return output_table
def test_valid_join(scenario, sim_measure_type, args, convert_to_str=False):
    """Validate a set-similarity join against a brute-force computation.

    Computes the expected pair set by scoring the full cartesian product of
    the two input tables with the given similarity measure, then runs the
    corresponding join function and checks both the output schema and the
    produced pairs.

    Args:
        scenario: pair of (csv_path, key_attr, join_attr) triples for the
            left and right input tables.
        sim_measure_type: similarity measure name; selects the join
            function via JOIN_FN_MAP.
        args: positional args forwarded to the join function, in order:
            tokenizer, threshold, comp_op, allow_empty, allow_missing,
            l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix,
            out_sim_score.
        convert_to_str: if True, coerce the join attributes to string
            dtype before joining.
    """
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]
    join_fn = JOIN_FN_MAP[sim_measure_type]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__), ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__), rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 4 and args[4]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr]) or
                        pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[l_key_attr]),
                                                str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new
    # dataframes consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # if allow_empty (args[3]) is False, drop rows whose join attribute
    # tokenizes to an empty token set, since they cannot appear in the
    # join output.
    if len(args) > 3 and (not args[3]):
        ltable_not_missing = ltable_not_missing[ltable_not_missing.apply(
            lambda row: len(args[0].tokenize(str(row[l_join_attr]))),
            axis=1) > 0]
        rtable_not_missing = rtable_not_missing[rtable_not_missing.apply(
            lambda row: len(args[0].tokenize(str(row[r_join_attr]))),
            axis=1) > 0]

    # generate cartesian product to be used as candset
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    # Fix: pass axis as a keyword. The positional form drop(label, 1) was
    # deprecated in pandas 1.0 and removed in pandas 2.0.
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop('tmp_join_key', axis=1)
    # Fix: the original called drop() and discarded the returned frame,
    # which is a no-op; drop the helper column in place instead.
    ltable_not_missing.drop('tmp_join_key', axis=1, inplace=True)
    rtable_not_missing.drop('tmp_join_key', axis=1, inplace=True)

    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold. Scores are
    # rounded to 4 digits to avoid float noise in the comparison.
    cartprod['sim_score'] = cartprod.apply(lambda row: round(
        sim_func(args[0].tokenize(str(row[l_join_attr])),
                 args[0].tokenize(str(row[r_join_attr]))), 4),
        axis=1)

    comp_fn = COMP_OP_MAP[DEFAULT_COMP_OP]
    # Check for comp_op in args.
    if len(args) > 2:
        comp_fn = COMP_OP_MAP[args[2]]

    expected_pairs = set()
    for idx, row in cartprod.iterrows():
        if comp_fn(float(row['sim_score']), args[1]):
            expected_pairs.add(','.join((str(row[l_key_attr]),
                                         str(row[r_key_attr]))))

    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = args[0].get_return_set()

    # use join function to obtain actual output pairs.
    actual_candset = join_fn(ltable, rtable,
                             l_key_attr, r_key_attr,
                             l_join_attr, r_join_attr, *args)

    # the join must not permanently modify the tokenizer's return_set flag.
    assert_equal(args[0].get_return_set(), orig_return_set_flag)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 7:
        l_out_prefix = args[7]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 8:
        r_out_prefix = args[8]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 5:
        if args[5]:
            l_out_attrs = remove_redundant_attrs(args[5], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 6:
        if args[6]:
            r_out_attrs = remove_redundant_attrs(args[6], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args. The score column is present by
    # default, and absent only when out_sim_score is explicitly False.
    if len(args) > 9:
        if args[9]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
def test_valid_join(scenario, tok, threshold, comp_op=DEFAULT_COMP_OP,
                    args=(), convert_to_str=False, data_limit=100000,
                    temp_dir=os.getcwd(),
                    output_file_path=default_output_file_path):
    """Validate disk_edit_distance_join against both a brute-force
    computation and the in-memory edit_distance_join.

    Runs the disk-based join (which writes its result to
    output_file_path), reloads that file, and checks that its schema and
    pairs match the brute-force expected set as well as the in-memory
    join's output.

    Args:
        scenario: pair of (csv_path, key_attr, join_attr) triples for the
            left and right input tables.
        tok: q-gram tokenizer; used to drop pairs with no common q-grams
            from the expected set (the join is approximate).
        threshold: edit distance threshold.
        comp_op: comparison operator string ('<=', '<' or '=').
        args: extra positional args forwarded to both join functions, in
            order: allow_missing, l_out_attrs, r_out_attrs, l_out_prefix,
            r_out_prefix, out_sim_score.
        convert_to_str: if True, coerce the join attributes to string
            dtype before joining.
        data_limit: in-memory record limit forwarded to the disk join.
        temp_dir: scratch directory for the disk join.
            NOTE(review): this default is evaluated once at import time,
            so it captures the cwd at import, not at call time.
        output_file_path: destination file for the disk join's output.
    """
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__), ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__), rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if allow_missing flag is set, compute missing pairs.
    if len(args) > 0 and args[0]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr]) or
                        pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[l_key_attr]),
                                                str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new
    # dataframes consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # generate cartesian product to be used as candset
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    # Fix: pass axis as a keyword. The positional form drop(label, 1) was
    # deprecated in pandas 1.0 and removed in pandas 2.0.
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop('tmp_join_key', axis=1)
    # Fix: the original called drop() and discarded the returned frame,
    # which is a no-op; drop the helper column in place instead.
    ltable_not_missing.drop('tmp_join_key', axis=1, inplace=True)
    rtable_not_missing.drop('tmp_join_key', axis=1, inplace=True)

    sim_measure_type = 'EDIT_DISTANCE'
    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_func(str(row[l_join_attr]), str(row[r_join_attr])),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]

    expected_pairs = set()
    overlap = get_sim_function('OVERLAP')
    for idx, row in cartprod.iterrows():
        l_tokens = tok.tokenize(str(row[l_join_attr]))
        r_tokens = tok.tokenize(str(row[r_join_attr]))
        if len(str(row[l_join_attr])) == 0 or len(str(row[r_join_attr])) == 0:
            continue
        # current edit distance join is approximate. It cannot find matching
        # strings which don't have any common q-grams. Hence, remove pairs
        # that don't have any common q-grams from expected pairs.
        if comp_fn(float(row['sim_score']), threshold):
            if overlap(l_tokens, r_tokens) > 0:
                expected_pairs.add(','.join((str(row[l_key_attr]),
                                             str(row[r_key_attr]))))

    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = tok.get_return_set()

    # Removing any previously existing output file path.
    if os.path.exists(output_file_path):
        os.remove(output_file_path)

    # Use join function to process the input data.
    # NOTE(review): the return value is assigned but never asserted;
    # presumably a boolean success flag — confirm before asserting on it.
    is_success = disk_edit_distance_join(ltable, rtable,
                                         l_key_attr, r_key_attr,
                                         l_join_attr, r_join_attr,
                                         threshold, data_limit,
                                         comp_op, *args,
                                         tokenizer=tok, temp_dir=temp_dir,
                                         output_file_path=output_file_path)

    # Use edit distance join without the disk version to get the dataframe
    # to compare.
    no_disk_candset = edit_distance_join(ltable, rtable,
                                         l_key_attr, r_key_attr,
                                         l_join_attr, r_join_attr,
                                         threshold, comp_op, *args,
                                         tokenizer=tok)

    # Deleting id to make the schema consistent for comparison.
    if '_id' in no_disk_candset:
        del no_disk_candset['_id']

    # the join must not permanently modify the tokenizer's return_set flag.
    assert_equal(tok.get_return_set(), orig_return_set_flag)

    # The disk output carries no '_id' column, so start from an empty list.
    expected_output_attrs = []
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 3:
        l_out_prefix = args[3]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 4:
        r_out_prefix = args[4]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 1:
        if args[1]:
            l_out_attrs = remove_redundant_attrs(args[1], l_key_attr)
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 2:
        if args[2]:
            r_out_attrs = remove_redundant_attrs(args[2], r_key_attr)
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args. The score column is present by
    # default, and absent only when out_sim_score is explicitly False.
    if len(args) > 5:
        if args[5]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # Verify whether the current output file path exists.
    assert_equal(True, os.path.exists(output_file_path))

    # Load the disk join's output for schema and pair comparison.
    actual_candset = pd.read_csv(output_file_path)

    # Comparing column header values.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)
    assert_list_equal(list(no_disk_candset.columns.values),
                      list(actual_candset.columns.values))

    actual_pairs = set()
    no_disk_pairs = set()

    # Creating sets for comparing the data tuples.
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))
    for idx, row in no_disk_candset.iterrows():
        no_disk_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                    str(row[r_out_prefix + r_key_attr]))))

    # Verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    assert_equal(len(expected_pairs), len(no_disk_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    common_pairs_no_disk = no_disk_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
    assert_equal(len(common_pairs_no_disk), len(expected_pairs))
def test_filter_tables(self, tokenizer, sim_measure_type, threshold,
                       allow_empty, allow_missing, args):
    """Check SuffixFilter.filter_tables output against a brute-force join.

    A filter must never prune a pair that actually satisfies the join
    condition, so every pair found by the brute-force join below must also
    appear in the filter's candidate set (the filter may keep extra pairs).

    args is forwarded positionally to filter_tables:
        args[0]/args[1]: left/right input dataframes,
        args[2]/args[3]: left/right key attributes,
        args[4]/args[5]: left/right filter (join) attributes,
        args[6]/args[7]: optional left/right output attribute lists,
        args[8]/args[9]: optional left/right output prefixes.
    """
    suffix_filter = SuffixFilter(tokenizer, sim_measure_type, threshold,
                                 allow_empty, allow_missing)
    sim_fn = get_sim_function(sim_measure_type)

    # compute the join output pairs by brute force over all row pairs.
    join_output_pairs = set()
    for l_idx, l_row in args[0].iterrows():
        for r_idx, r_row in args[1].iterrows():
            # if allow_missing is set to True, then add pairs containing
            # missing value to the join output.
            if pd.isnull(l_row[args[4]]) or pd.isnull(r_row[args[5]]):
                if allow_missing:
                    join_output_pairs.add(','.join((str(l_row[args[2]]),
                                                    str(r_row[args[3]]))))
                continue

            if sim_measure_type == 'EDIT_DISTANCE':
                # edit distance compares the raw strings, and a pair
                # matches when the distance is at most the threshold.
                l_join_val = str(l_row[args[4]])
                r_join_val = str(r_row[args[5]])
                comp_fn = COMP_OP_MAP['<=']
            else:
                # set-based measures compare token lists, and a pair
                # matches when the score is at least the threshold.
                l_join_val = tokenizer.tokenize(str(l_row[args[4]]))
                r_join_val = tokenizer.tokenize(str(r_row[args[5]]))
                comp_fn = COMP_OP_MAP['>=']

            # pairs that are empty on both sides (for set-based measures
            # other than OVERLAP) are join matches only when allow_empty.
            if (len(l_join_val) == 0 and len(r_join_val) == 0 and
                    sim_measure_type not in ['OVERLAP', 'EDIT_DISTANCE']):
                if allow_empty:
                    join_output_pairs.add(','.join((str(l_row[args[2]]),
                                                    str(r_row[args[3]]))))
                continue

            # if both attributes are not missing and not empty, then check
            # if the pair satisfies the join condition. If yes, then add it
            # to the join output.
            if comp_fn(sim_fn(l_join_val, r_join_val), threshold):
                join_output_pairs.add(','.join((str(l_row[args[2]]),
                                                str(r_row[args[3]]))))

    actual_candset = suffix_filter.filter_tables(*args)

    expected_output_attrs = ['_id']
    l_out_prefix = self.default_l_out_prefix
    r_out_prefix = self.default_r_out_prefix

    # Check for l_out_prefix in args.
    if len(args) > 8:
        l_out_prefix = args[8]
    expected_output_attrs.append(l_out_prefix + args[2])

    # Check for r_out_prefix in args.
    if len(args) > 9:
        r_out_prefix = args[9]
    expected_output_attrs.append(r_out_prefix + args[3])

    # Check for l_out_attrs in args.
    if len(args) > 6:
        if args[6]:
            l_out_attrs = remove_redundant_attrs(args[6], args[2])
            for attr in l_out_attrs:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 7:
        if args[7]:
            r_out_attrs = remove_redundant_attrs(args[7], args[3])
            for attr in r_out_attrs:
                expected_output_attrs.append(r_out_prefix + attr)

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        # cast through int before str — presumably the keys are numeric in
        # the fixtures and this normalizes their string form to match the
        # brute-force encoding above; confirm against the test data.
        actual_pairs.add(','.join((str(int(row[l_out_prefix + args[2]])),
                                   str(int(row[r_out_prefix + args[3]])))))

    # verify whether all the join output pairs are
    # present in the actual output pairs
    common_pairs = actual_pairs.intersection(join_output_pairs)
    assert_equal(len(common_pairs), len(join_output_pairs))
def test_filter_tables(self, tokenizer, sim_measure_type, threshold,
                       allow_empty, allow_missing, args):
    """Brute-force the expected join output pairs and verify that
    SuffixFilter.filter_tables produces the right schema and keeps
    every expected pair.

    args holds the positional arguments for filter_tables:
    (ltable, rtable, l_key_attr, r_key_attr, l_filter_attr, r_filter_attr,
     [l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix]).
    """
    ltable, rtable = args[0], args[1]
    l_key_attr, r_key_attr = args[2], args[3]
    l_filter_attr, r_filter_attr = args[4], args[5]

    suffix_filter = SuffixFilter(tokenizer, sim_measure_type, threshold,
                                 allow_empty, allow_missing)
    sim_fn = get_sim_function(sim_measure_type)
    edit_distance = sim_measure_type == 'EDIT_DISTANCE'
    # Edit distance matches when score <= threshold; set-based measures
    # match when score >= threshold.
    comp_fn = COMP_OP_MAP['<=' if edit_distance else '>=']

    # Expected output: evaluate the join predicate on the cross product.
    expected_pairs = set()
    for _, l_row in ltable.iterrows():
        for _, r_row in rtable.iterrows():
            l_raw, r_raw = l_row[l_filter_attr], r_row[r_filter_attr]
            if pd.isnull(l_raw) or pd.isnull(r_raw):
                # Missing values join only when allow_missing is set.
                keep = allow_missing
            else:
                if edit_distance:
                    l_val, r_val = str(l_raw), str(r_raw)
                else:
                    l_val = tokenizer.tokenize(str(l_raw))
                    r_val = tokenizer.tokenize(str(r_raw))
                if (len(l_val) == 0 and len(r_val) == 0 and
                        sim_measure_type not in ['OVERLAP',
                                                 'EDIT_DISTANCE']):
                    # Two empty values join only when allow_empty is set.
                    keep = allow_empty
                else:
                    keep = comp_fn(sim_fn(l_val, r_val), threshold)
            if keep:
                expected_pairs.add(','.join((str(l_row[l_key_attr]),
                                             str(r_row[r_key_attr]))))

    actual_candset = suffix_filter.filter_tables(*args)

    # Expected schema: _id, prefixed keys, then any non-redundant
    # requested output attributes (left before right).
    l_prefix = args[8] if len(args) > 8 else self.default_l_out_prefix
    r_prefix = args[9] if len(args) > 9 else self.default_r_out_prefix
    out_attrs = ['_id', l_prefix + l_key_attr, r_prefix + r_key_attr]
    if len(args) > 6 and args[6]:
        out_attrs.extend(
            l_prefix + attr
            for attr in remove_redundant_attrs(args[6], l_key_attr))
    if len(args) > 7 and args[7]:
        out_attrs.extend(
            r_prefix + attr
            for attr in remove_redundant_attrs(args[7], r_key_attr))

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values), out_attrs)

    observed_pairs = {
        ','.join((str(int(row[l_prefix + l_key_attr])),
                  str(int(row[r_prefix + r_key_attr]))))
        for _, row in actual_candset.iterrows()}

    # All expected pairs must be present in the filter output (the filter
    # may emit extra candidates, but must not miss a true match).
    assert_equal(len(observed_pairs.intersection(expected_pairs)),
                 len(expected_pairs))
def apply_matcher(candset, candset_l_key_attr, candset_r_key_attr, ltable, rtable, l_key_attr, r_key_attr, l_match_attr, r_match_attr, tokenizer, sim_function, threshold, comp_op='>=', allow_missing=False, l_out_attrs=None, r_out_attrs=None, l_out_prefix='l_', r_out_prefix='r_', out_sim_score=True, n_jobs=1, show_progress=True): """Find matching string pairs from the candidate set (typically produced by applying a filter to two tables) by applying a matcher of form (sim_function comp_op threshold). Specifically, this method computes the input similarity function on string pairs in the candidate set and checks if the resulting score satisfies the input threshold (depending on the comparison operator). Args: candset (DataFrame): input candidate set. candset_l_key_attr (string): attribute in candidate set which is a key in left table. candset_r_key_attr (string): attribute in candidate set which is a key in right table. ltable (DataFrame): left input table. rtable (DataFrame): right input table. l_key_attr (string): key attribute in left table. r_key_attr (string): key attribute in right table. l_match_attr (string): attribute in left table on which the matcher should be applied. r_match_attr (string): attribute in right table on which the matcher should be applied. tokenizer (Tokenizer): tokenizer to be used to tokenize the match attributes. If set to None, the matcher is applied directly on the match attributes. sim_function (function): matcher function to be applied. threshold (float): threshold to be satisfied. comp_op (string): comparison operator. Supported values are '>=', '>', ' <=', '<', '=' and '!=' (defaults to '>='). allow_missing (boolean): flag to indicate whether tuple pairs with missing value in at least one of the match attributes should be included in the output (defaults to False). l_out_attrs (list): list of attribute names from the left table to be included in the output table (defaults to None). 
r_out_attrs (list): list of attribute names from the right table to be included in the output table (defaults to None). l_out_prefix (string): prefix to be used for the attribute names coming from the left table, in the output table (defaults to 'l\_'). r_out_prefix (string): prefix to be used for the attribute names coming from the right table, in the output table (defaults to 'r\_'). out_sim_score (boolean): flag to indicate whether similarity score should be included in the output table (defaults to True). Setting this flag to True will add a column named '_sim_score' in the output table. This column will contain the similarity scores for the tuple pairs in the output. n_jobs (int): number of parallel jobs to use for the computation (defaults to 1). If -1 is given, all CPUs are used. If 1 is given, no parallel computing code is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is the total number of CPUs in the machine). Thus for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) becomes less than 1, then no parallel computing code will be used (i.e., equivalent to the default). show_progress (boolean): flag to indicate whether task progress should be displayed to the user (defaults to True). Returns: An output table containing tuple pairs from the candidate set that survive the matcher (DataFrame). 
""" # check if the input candset is a dataframe validate_input_table(candset, 'candset') # check if the candset key attributes exist validate_attr(candset_l_key_attr, candset.columns, 'left key attribute', 'candset') validate_attr(candset_r_key_attr, candset.columns, 'right key attribute', 'candset') # check if the input tables are dataframes validate_input_table(ltable, 'left table') validate_input_table(rtable, 'right table') # check if the key attributes and join attributes exist validate_attr(l_key_attr, ltable.columns, 'key attribute', 'left table') validate_attr(r_key_attr, rtable.columns, 'key attribute', 'right table') validate_attr(l_match_attr, ltable.columns, 'match attribute', 'left table') validate_attr(r_match_attr, rtable.columns, 'match attribute', 'right table') # check if the output attributes exist validate_output_attrs(l_out_attrs, ltable.columns, r_out_attrs, rtable.columns) # check if the input tokenizer is valid, if it is not None if tokenizer is not None: validate_tokenizer(tokenizer) # check if the comparison operator is valid validate_comp_op(comp_op) # check if the key attributes are unique and do not contain missing values validate_key_attr(l_key_attr, ltable, 'left table') validate_key_attr(r_key_attr, rtable, 'right table') # check for empty candset if candset.empty: return candset # remove redundant attrs from output attrs. l_out_attrs = remove_redundant_attrs(l_out_attrs, l_key_attr) r_out_attrs = remove_redundant_attrs(r_out_attrs, r_key_attr) # get attributes to project. l_proj_attrs = get_attrs_to_project(l_out_attrs, l_key_attr, l_match_attr) r_proj_attrs = get_attrs_to_project(r_out_attrs, r_key_attr, r_match_attr) # do a projection on the input dataframes. Note that this doesn't create a # copy of the dataframes. It only creates a view on original dataframes. ltable_projected = ltable[l_proj_attrs] rtable_projected = rtable[r_proj_attrs] # computes the actual number of jobs to launch. 
n_jobs = min(get_num_processes_to_launch(n_jobs), len(candset)) # If a tokenizer is provided, we can optimize by tokenizing each value # only once by caching the tokens of l_match_attr and r_match_attr. But, # this can be a bad strategy in case the candset has very few records # compared to the original tables. Hence, we check if the sum of tuples in # ltable and rtable is less than twice the number of tuples in the candset. # If yes, we decide to cache the token values. Else, we do not cache the # tokens as the candset is small. l_tokens = None r_tokens = None if tokenizer is not None and (len(ltable) + len(rtable) < len(candset)*2): l_tokens = generate_tokens(ltable_projected, l_key_attr, l_match_attr, tokenizer) r_tokens = generate_tokens(rtable_projected, r_key_attr, r_match_attr, tokenizer) if n_jobs <= 1: # if n_jobs is 1, do not use any parallel code. output_table = _apply_matcher_split(candset, candset_l_key_attr, candset_r_key_attr, ltable_projected, rtable_projected, l_key_attr, r_key_attr, l_match_attr, r_match_attr, tokenizer, sim_function, threshold, comp_op, allow_missing, l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix, out_sim_score, show_progress, l_tokens, r_tokens) else: # if n_jobs is above 1, split the candset into n_jobs splits and apply # the matcher on each candset split in a separate process. candset_splits = split_table(candset, n_jobs) results = Parallel(n_jobs=n_jobs)(delayed(_apply_matcher_split)( candset_splits[job_index], candset_l_key_attr, candset_r_key_attr, ltable_projected, rtable_projected, l_key_attr, r_key_attr, l_match_attr, r_match_attr, tokenizer, sim_function, threshold, comp_op, allow_missing, l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix, out_sim_score, (show_progress and (job_index==n_jobs-1)), l_tokens, r_tokens) for job_index in range(n_jobs)) output_table = pd.concat(results) return output_table