def apply_filterable_rule(self, rule_name, l_df, r_df, l_key, r_key,
                          l_output_attrs, r_output_attrs,
                          l_output_prefix, r_output_prefix,
                          verbose, show_progress, n_jobs):
    """Build the candidate set of one filterable rule via string similarity joins.

    Every conjunct of the rule is executed as a similarity join between
    ``l_df`` and ``r_df``; the per-conjunct results are unioned and
    de-duplicated on the (left key, right key) pair.

    Args:
        rule_name (string): Key into ``self.rule_str`` (list of conjuncts)
            and ``self.rule_ft`` (passed to ``parse_conjunct``).
        l_df (DataFrame): Left table. The join attribute column of each
            conjunct is converted in place with
            ``ssj.dataframe_column_to_str``.
        r_df (DataFrame): Right table, treated like ``l_df``.
        l_key (string): Key attribute of ``l_df``.
        r_key (string): Key attribute of ``r_df``.
        l_output_attrs (list): Left attributes forwarded to the join.
        r_output_attrs (list): Right attributes forwarded to the join.
        l_output_prefix (string): Prefix of left columns in the output.
        r_output_prefix (string): Prefix of right columns in the output.
        verbose (boolean): Not used by this method.
        show_progress (boolean): Forwarded to the join.
        n_jobs (int): Forwarded to the join.

    Returns:
        The union of the join results (DataFrame), or None when the rule
        has no conjuncts.

    Raises:
        ValueError: If a conjunct names a similarity function or (for the
            token-based joins) a tokenizer this method cannot map to a
            join. Previously such a conjunct silently reused the settings
            of the preceding conjunct or failed with UnboundLocalError.
    """
    # Constructors are wrapped so a tokenizer is only built when it is used.
    tokenizer_factories = {
        'dlm_dc0': lambda: WhitespaceTokenizer(return_set=True),
        'qgm_3': lambda: QgramTokenizer(qval=3, return_set=True),
    }
    join_fns = {
        'jaccard': ssj.jaccard_join,
        'cosine': ssj.cosine_join,
        'dice': ssj.dice_join,
        'overlap_coeff': ssj.overlap_coefficient_join,
        'lev_dist': ssj.edit_distance_join,
    }
    pair_cols = [l_output_prefix + l_key, r_output_prefix + r_key]

    candset = None
    for conjunct in self.rule_str[rule_name]:
        is_auto_gen, sim_fn, l_attr, r_attr, l_tok, r_tok, op, th = \
            parse_conjunct(conjunct, self.rule_ft[rule_name])

        if sim_fn not in join_fns:
            raise ValueError(
                'Unsupported similarity function %r in rule %r'
                % (sim_fn, rule_name))
        join_fn = join_fns[sim_fn]
        is_edit_distance = join_fn == ssj.edit_distance_join

        # The join operator is derived from the conjunct's operator:
        # '>=' maps to '<' for edit distance, '<=' maps to '>' for the
        # similarity joins; everything else falls back to '<=' / '>='.
        # NOTE(review): presumably the complement of the rule's operator,
        # because a blocking rule describes the pairs to drop -- confirm.
        if is_edit_distance:
            comp_op = '<' if op == '>=' else '<='
        else:
            comp_op = '>' if op == '<=' else '>='

        ssj.dataframe_column_to_str(l_df, l_attr, inplace=True)
        ssj.dataframe_column_to_str(r_df, r_attr, inplace=True)

        if is_edit_distance:
            # Edit distance join takes no tokenizer / allow_empty argument.
            c_df = join_fn(l_df, r_df, l_key, r_key, l_attr, r_attr,
                           float(th), comp_op, True,
                           l_output_attrs, r_output_attrs,
                           l_output_prefix, r_output_prefix,
                           False, n_jobs, show_progress)
        else:
            if l_tok not in tokenizer_factories:
                raise ValueError(
                    'Unsupported tokenizer %r in rule %r'
                    % (l_tok, rule_name))
            tokenizer = tokenizer_factories[l_tok]()
            c_df = join_fn(l_df, r_df, l_key, r_key, l_attr, r_attr,
                           tokenizer, float(th), comp_op, True, True,
                           l_output_attrs, r_output_attrs,
                           l_output_prefix, r_output_prefix,
                           False, n_jobs, show_progress)

        if candset is None:
            # candset from the first conjunct of the rule
            candset = c_df
        else:
            # union the candset of this conjunct with the existing candset
            candset = pd.concat([candset, c_df]).drop_duplicates(
                subset=pair_cols).reset_index(drop=True)
    return candset
def apply_filterable_rule(self, rule_name, l_df, r_df, l_key, r_key,
                          l_output_attrs, r_output_attrs,
                          l_output_prefix, r_output_prefix,
                          verbose, show_progress, n_chunks):
    """Build the candidate set of one filterable rule via string similarity joins.

    Every conjunct of the rule is executed as a similarity join between
    ``l_df`` and ``r_df``; the per-conjunct results are unioned and
    de-duplicated on the (left key, right key) pair.

    Args:
        rule_name (string): Key into ``self.rule_str`` (list of conjuncts)
            and ``self.rule_ft`` (passed to ``parse_conjunct``).
        l_df (DataFrame): Left table. The join attribute column of each
            conjunct is converted in place with
            ``ssj.dataframe_column_to_str``.
        r_df (DataFrame): Right table, treated like ``l_df``.
        l_key (string): Key attribute of ``l_df``.
        r_key (string): Key attribute of ``r_df``.
        l_output_attrs (list): Left attributes forwarded to the join.
        r_output_attrs (list): Right attributes forwarded to the join.
        l_output_prefix (string): Prefix of left columns in the output.
        r_output_prefix (string): Prefix of right columns in the output.
        verbose (boolean): Not used by this method.
        show_progress (boolean): Forwarded to the join.
        n_chunks (int): Forwarded positionally to the join.
            NOTE(review): it lands in the second-to-last positional slot
            of the join call -- confirm the join interprets the value as
            intended.

    Returns:
        The union of the join results (DataFrame), or None when the rule
        has no conjuncts.

    Raises:
        ValueError: If a conjunct names a similarity function or (for the
            token-based joins) a tokenizer this method cannot map to a
            join. Previously such a conjunct silently reused the settings
            of the preceding conjunct or failed with UnboundLocalError.
    """
    # Constructors are wrapped so a tokenizer is only built when it is used.
    tokenizer_factories = {
        'dlm_dc0': lambda: WhitespaceTokenizer(return_set=True),
        'qgm_3': lambda: QgramTokenizer(qval=3, return_set=True),
    }
    join_fns = {
        'jaccard': ssj.jaccard_join,
        'cosine': ssj.cosine_join,
        'dice': ssj.dice_join,
        'overlap_coeff': ssj.overlap_coefficient_join,
        'lev_dist': ssj.edit_distance_join,
    }
    pair_cols = [l_output_prefix + l_key, r_output_prefix + r_key]

    candset = None
    for conjunct in self.rule_str[rule_name]:
        is_auto_gen, sim_fn, l_attr, r_attr, l_tok, r_tok, op, th = \
            parse_conjunct(conjunct, self.rule_ft[rule_name])

        if sim_fn not in join_fns:
            raise ValueError(
                'Unsupported similarity function %r in rule %r'
                % (sim_fn, rule_name))
        join_fn = join_fns[sim_fn]
        is_edit_distance = join_fn == ssj.edit_distance_join

        # The join operator is derived from the conjunct's operator:
        # '>=' maps to '<' for edit distance, '<=' maps to '>' for the
        # similarity joins; everything else falls back to '<=' / '>='.
        # NOTE(review): presumably the complement of the rule's operator,
        # because a blocking rule describes the pairs to drop -- confirm.
        if is_edit_distance:
            comp_op = '<' if op == '>=' else '<='
        else:
            comp_op = '>' if op == '<=' else '>='

        ssj.dataframe_column_to_str(l_df, l_attr, inplace=True)
        ssj.dataframe_column_to_str(r_df, r_attr, inplace=True)

        if is_edit_distance:
            # Edit distance join takes no tokenizer / allow_empty argument.
            c_df = join_fn(l_df, r_df, l_key, r_key, l_attr, r_attr,
                           float(th), comp_op, True,
                           l_output_attrs, r_output_attrs,
                           l_output_prefix, r_output_prefix,
                           False, n_chunks, show_progress)
        else:
            if l_tok not in tokenizer_factories:
                raise ValueError(
                    'Unsupported tokenizer %r in rule %r'
                    % (l_tok, rule_name))
            tokenizer = tokenizer_factories[l_tok]()
            c_df = join_fn(l_df, r_df, l_key, r_key, l_attr, r_attr,
                           tokenizer, float(th), comp_op, True, True,
                           l_output_attrs, r_output_attrs,
                           l_output_prefix, r_output_prefix,
                           False, n_chunks, show_progress)

        if candset is None:
            # candset from the first conjunct of the rule
            candset = c_df
        else:
            # union the candset of this conjunct with the existing candset
            candset = pd.concat([candset, c_df]).drop_duplicates(
                subset=pair_cols).reset_index(drop=True)
    return candset
def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr,
                 rem_stop_words=False, q_val=None, word_level=True,
                 overlap_size=1,
                 l_output_attrs=None, r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 allow_missing=False, verbose=False, show_progress=True,
                 n_jobs=1):
    """
    Blocks two tables based on the overlap of token sets of attribute
    values.

    Finds tuple pairs from left and right tables such that the overlap
    between (a) the set of tokens obtained by tokenizing the value of
    attribute l_overlap_attr of a tuple from the left table, and (b) the
    set of tokens obtained by tokenizing the value of attribute
    r_overlap_attr of a tuple from the right table, is above a certain
    threshold.

    Args:
        ltable (DataFrame): The left input table.
        rtable (DataFrame): The right input table.
        l_overlap_attr (string): The overlap attribute in left table.
        r_overlap_attr (string): The overlap attribute in right table.
        rem_stop_words (boolean): A flag to indicate whether stop words
            (e.g., a, an, the) should be removed from the token sets of
            the overlap attribute values (defaults to False).
        q_val (int): The value of q to use if the overlap attributes
            values are to be tokenized as qgrams (defaults to None).
        word_level (boolean): A flag to indicate whether the overlap
            attributes should be tokenized as words (i.e, using
            whitespace as delimiter) (defaults to True).
        overlap_size (int): The minimum number of tokens that must
            overlap (defaults to 1).
        l_output_attrs (list): A list of attribute names from the left
            table to be included in the output candidate set (defaults
            to None).
        r_output_attrs (list): A list of attribute names from the right
            table to be included in the output candidate set (defaults
            to None).
        l_output_prefix (string): The prefix to be used for the attribute
            names coming from the left table in the output candidate set
            (defaults to ``'ltable_'``).
        r_output_prefix (string): The prefix to be used for the attribute
            names coming from the right table in the output candidate set
            (defaults to ``'rtable_'``).
        allow_missing (boolean): A flag to indicate whether tuple pairs
            with missing value in at least one of the blocking attributes
            should be included in the output candidate set (defaults to
            False). If this flag is set to True, a tuple in ltable with
            missing value in the blocking attribute will be matched with
            every tuple in rtable and vice versa.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress
            should be displayed to the user (defaults to True).
        n_jobs (int): The number of parallel jobs to be used for
            computation (defaults to 1). If -1 all CPUs are used. If 0 or
            1, no parallel computation is used at all, which is useful
            for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
            used (where n_cpus is the total number of CPUs in the
            machine). Thus, for n_jobs = -2, all CPUs but one are used.
            If (n_cpus + 1 + n_jobs) is less than 1, then no parallel
            computation is used (i.e., equivalent to the default).

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `ltable` is not of type pandas DataFrame.
        AssertionError: If `rtable` is not of type pandas DataFrame.
        AssertionError: If `l_overlap_attr` is not of type string.
        AssertionError: If `r_overlap_attr` is not of type string.
        AssertionError: If `l_output_attrs` is not of type of list.
        AssertionError: If `r_output_attrs` is not of type of list.
        AssertionError: If the values in `l_output_attrs` is not of type
            string.
        AssertionError: If the values in `r_output_attrs` is not of type
            string.
        AssertionError: If `l_output_prefix` is not of type string.
        AssertionError: If `r_output_prefix` is not of type string.
        AssertionError: If `q_val` is not of type int.
        AssertionError: If `word_level` is not of type boolean.
        AssertionError: If `overlap_size` is not of type int.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `allow_missing` is not of type boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_jobs` is not of type int.
        AssertionError: If `l_overlap_attr` is not in the ltable columns.
        AssertionError: If `r_overlap_attr` is not in the rtable columns.
        AssertionError: If `l_output_attrs` are not in the ltable.
        AssertionError: If `r_output_attrs` are not in the rtable.
        SyntaxError: If `q_val` is set to a valid value and `word_level`
            is set to True.
        SyntaxError: If `q_val` is set to None and `word_level` is set to
            False.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> ob = em.OverlapBlocker()
        # Use word-level tokenizer
        >>> C1 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=True, overlap_size=1)
        # Use q-gram tokenizer
        >>> C2 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=False, q_val=2)
        # Include all possible missing values
        >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True)
        # Use all the cores in the machine
        >>> C4 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], n_jobs=-1)
    """
    # validate data types of standard input parameters
    self.validate_types_params_tables(ltable, rtable,
                                      l_output_attrs, r_output_attrs,
                                      l_output_prefix, r_output_prefix,
                                      verbose, n_jobs)

    # validate data types of input parameters specific to overlap blocker
    self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                     rem_stop_words, q_val,
                                     word_level, overlap_size)

    # validate data type of allow_missing
    self.validate_allow_missing(allow_missing)

    # validate data type of show_progress
    self.validate_show_progress(show_progress)

    # validate overlap attributes
    self.validate_overlap_attrs(ltable, rtable,
                                l_overlap_attr, r_overlap_attr)

    # validate output attributes
    self.validate_output_attrs(ltable, rtable,
                               l_output_attrs, r_output_attrs)

    # get and validate required metadata
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # # get metadata
    l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable,
                                                 logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_table(ltable, l_key, 'ltable',
                                    logger, verbose)
    cm._validate_metadata_for_table(rtable, r_key, 'rtable',
                                    logger, verbose)

    # validate word_level and q_val
    self.validate_word_level_qval(word_level, q_val)

    # do blocking

    # # do projection before merge. The projections are explicit copies so
    # # the in-place edits below cannot touch the input tables or trigger
    # # SettingWithCopyWarning (the old `is_copy = False` trick no longer
    # # has any effect on current pandas).
    l_proj_attrs = self.get_attrs_to_project(l_key, l_overlap_attr,
                                             l_output_attrs)
    l_df = ltable[l_proj_attrs].copy()
    r_proj_attrs = self.get_attrs_to_project(r_key, r_overlap_attr,
                                             r_output_attrs)
    r_df = rtable[r_proj_attrs].copy()

    # # cast the column to string if required.
    ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
    ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

    # # cleanup the tables from non-ascii characters, punctuations, and
    # # stop words. The cleanup runs on scratch columns so the original
    # # overlap attribute values stay available for the output.
    l_dummy_overlap_attr = '@#__xx__overlap_ltable__#@'
    r_dummy_overlap_attr = '@#__xx__overlap_rtable__#@'
    l_df[l_dummy_overlap_attr] = l_df[l_overlap_attr]
    r_df[r_dummy_overlap_attr] = r_df[r_overlap_attr]

    if not l_df.empty:
        self.cleanup_table(l_df, l_dummy_overlap_attr, rem_stop_words)
    if not r_df.empty:
        self.cleanup_table(r_df, r_dummy_overlap_attr, rem_stop_words)

    # # determine which tokenizer to use
    if word_level:
        # # # create a whitespace tokenizer
        tokenizer = WhitespaceTokenizer(return_set=True)
    else:
        # # # create a qgram tokenizer
        tokenizer = QgramTokenizer(qval=q_val, return_set=True)

    # # perform overlap similarity join
    candset = overlap_join(l_df, r_df, l_key, r_key,
                           l_dummy_overlap_attr, r_dummy_overlap_attr,
                           tokenizer, overlap_size, '>=', allow_missing,
                           l_output_attrs, r_output_attrs,
                           l_output_prefix, r_output_prefix,
                           False, n_jobs, show_progress)

    # # retain only the required attributes in the output candidate set
    retain_cols = self.get_attrs_to_retain(l_key, r_key,
                                           l_output_attrs, r_output_attrs,
                                           l_output_prefix,
                                           r_output_prefix)
    candset = candset[retain_cols]

    # update metadata in the catalog
    key = get_name_for_key(candset.columns)
    candset = add_key_column(candset, key)
    cm.set_candset_properties(candset, key,
                              l_output_prefix + l_key,
                              r_output_prefix + r_key,
                              ltable, rtable)

    # return the candidate set
    return candset
def block_candset(self, candset, l_overlap_attr, r_overlap_attr,
                  rem_stop_words=False, q_val=None, word_level=True,
                  overlap_size=1, allow_missing=False,
                  verbose=False, show_progress=True, n_jobs=1):
    """Blocks an input candidate set of tuple pairs based on the overlap
    of token sets of attribute values.

    Finds tuple pairs from an input candidate set of tuple pairs such that
    the overlap between (a) the set of tokens obtained by tokenizing the
    value of attribute l_overlap_attr of the left tuple in a tuple pair,
    and (b) the set of tokens obtained by tokenizing the value of
    attribute r_overlap_attr of the right tuple in the tuple pair, is
    above a certain threshold.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        l_overlap_attr (string): The overlap attribute in left table.
        r_overlap_attr (string): The overlap attribute in right table.
        rem_stop_words (boolean): A flag to indicate whether stop words
            (e.g., a, an, the) should be removed from the token sets of
            the overlap attribute values (defaults to False).
        q_val (int): The value of q to use if the overlap attributes
            values are to be tokenized as qgrams (defaults to None).
        word_level (boolean): A flag to indicate whether the overlap
            attributes should be tokenized as words (i.e, using
            whitespace as delimiter) (defaults to True).
        overlap_size (int): The minimum number of tokens that must
            overlap (defaults to 1).
        allow_missing (boolean): A flag to indicate whether tuple pairs
            with missing value in at least one of the blocking attributes
            should be included in the output candidate set (defaults to
            False). If this flag is set to True, a tuple pair with
            missing value in either blocking attribute will be retained
            in the output candidate set. The value is handed to
            ``OverlapFilter`` as is; this method does not type-check it.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress
            should be displayed to the user (defaults to True).
        n_jobs (int): The number of parallel jobs to be used for
            computation (defaults to 1). If -1 all CPUs are used. If 0 or
            1, no parallel computation is used at all, which is useful
            for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
            used (where n_cpus is the total number of CPUs in the
            machine). Thus, for n_jobs = -2, all CPUs but one are used.
            If (n_cpus + 1 + n_jobs) is less than 1, then no parallel
            computation is used (i.e., equivalent to the default).

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `l_overlap_attr` is not of type string.
        AssertionError: If `r_overlap_attr` is not of type string.
        AssertionError: If `q_val` is not of type int.
        AssertionError: If `word_level` is not of type boolean.
        AssertionError: If `overlap_size` is not of type int.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_jobs` is not of type int.
        AssertionError: If `l_overlap_attr` is not in the ltable columns.
        AssertionError: If `r_overlap_attr` is not in the rtable columns.
        SyntaxError: If `q_val` is set to a valid value and `word_level`
            is set to True.
        SyntaxError: If `q_val` is set to None and `word_level` is set to
            False.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> ob = em.OverlapBlocker()
        >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'])
        >>> D1 = ob.block_candset(C, 'name', 'name')
        # Include all possible tuple pairs with missing values
        >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True)
        # Execute blocking using multiple cores
        >>> D3 = ob.block_candset(C, 'name', 'name', n_jobs=-1)
        # Use q-gram tokenizer
        >>> D4 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2)
    """
    # validate data types of standard input parameters
    self.validate_types_params_candset(candset, verbose,
                                       show_progress, n_jobs)

    # validate data types of input parameters specific to overlap blocker
    self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                     rem_stop_words, q_val,
                                     word_level, overlap_size)

    # get and validate metadata
    log_info(logger,
             'Required metadata: cand.set key, fk ltable, fk rtable, '
             'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # validate overlap attrs
    self.validate_overlap_attrs(ltable, rtable,
                                l_overlap_attr, r_overlap_attr)

    # validate word_level and q_val
    self.validate_word_level_qval(word_level, q_val)

    # do blocking

    # # do projection before merge. The projections are explicit copies so
    # # the in-place edits below cannot touch the base tables or trigger
    # # SettingWithCopyWarning (the old `is_copy = False` trick no longer
    # # has any effect on current pandas).
    l_df = ltable[[l_key, l_overlap_attr]].copy()
    r_df = rtable[[r_key, r_overlap_attr]].copy()

    # # cast the overlap attribute to string if required.
    ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
    ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

    # # cleanup the tables from non-ascii characters, punctuations, and
    # # stop words
    self.cleanup_table(l_df, l_overlap_attr, rem_stop_words)
    self.cleanup_table(r_df, r_overlap_attr, rem_stop_words)

    # # determine which tokenizer to use
    if word_level:
        # # # create a whitespace tokenizer
        tokenizer = WhitespaceTokenizer(return_set=True)
    else:
        # # # create a qgram tokenizer
        tokenizer = QgramTokenizer(qval=q_val, return_set=True)

    # # create a filter for overlap similarity join
    overlap_filter = OverlapFilter(tokenizer, overlap_size,
                                   allow_missing=allow_missing)

    # # perform overlap similarity filtering of the candset
    out_table = overlap_filter.filter_candset(candset,
                                              fk_ltable, fk_rtable,
                                              l_df, r_df,
                                              l_key, r_key,
                                              l_overlap_attr,
                                              r_overlap_attr,
                                              n_jobs,
                                              show_progress=show_progress)

    # update catalog
    cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                              ltable, rtable)

    # return candidate set
    return out_table
def block_candset(self, candset, l_overlap_attr, r_overlap_attr,
                  rem_stop_words=False, q_val=None, word_level=True,
                  overlap_size=1, allow_missing=False,
                  verbose=False, show_progress=True, n_chunks=-1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN
    RISK.

    Blocks an input candidate set of tuple pairs based on the overlap
    of token sets of attribute values.

    Finds tuple pairs from an input candidate set of tuple pairs such that
    the overlap between (a) the set of tokens obtained by tokenizing the
    value of attribute l_overlap_attr of the left tuple in a tuple pair,
    and (b) the set of tokens obtained by tokenizing the value of
    attribute r_overlap_attr of the right tuple in the tuple pair, is
    above a certain threshold.

    The candidate set is cut into consecutive row chunks; each chunk is
    checked by ``self._block_candset_split`` as a dask delayed task and
    the per-chunk results are concatenated into one row mask.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        l_overlap_attr (string): The overlap attribute in left table.
        r_overlap_attr (string): The overlap attribute in right table.
        rem_stop_words (boolean): A flag to indicate whether stop words
            (e.g., a, an, the) should be removed from the token sets of
            the overlap attribute values (defaults to False).
        q_val (int): The value of q to use if the overlap attributes
            values are to be tokenized as qgrams (defaults to None).
        word_level (boolean): A flag to indicate whether the overlap
            attributes should be tokenized as words (i.e, using
            whitespace as delimiter) (defaults to True).
        overlap_size (int): The minimum number of tokens that must
            overlap (defaults to 1).
        allow_missing (boolean): A flag to indicate whether tuple pairs
            with missing value in at least one of the blocking attributes
            should be included in the output candidate set (defaults to
            False). If this flag is set to True, a tuple pair with
            missing value in either blocking attribute will be retained
            in the output candidate set. The value is forwarded to
            ``self._block_candset_split``; this method does not itself
            type-check it.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress
            should be displayed to the user (defaults to True).
        n_chunks (int): The number of partitions to split the candidate
            set. If it is set to -1, the number of partitions will be set
            to the number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `l_overlap_attr` is not of type string.
        AssertionError: If `r_overlap_attr` is not of type string.
        AssertionError: If `q_val` is not of type int.
        AssertionError: If `word_level` is not of type boolean.
        AssertionError: If `overlap_size` is not of type int.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_chunks` is not of type int.
        AssertionError: If `l_overlap_attr` is not in the ltable columns.
        AssertionError: If `r_overlap_attr` is not in the rtable columns.
        SyntaxError: If `q_val` is set to a valid value and `word_level`
            is set to True.
        SyntaxError: If `q_val` is set to None and `word_level` is set to
            False.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> ob = DaskOverlapBlocker()
        >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'])
        >>> D1 = ob.block_candset(C, 'name', 'name')
        # Include all possible tuple pairs with missing values
        >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True)
        # Execute blocking using multiple cores
        >>> D3 = ob.block_candset(C, 'name', 'name', n_chunks=-1)
        # Use q-gram tokenizer
        >>> D4 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2)
    """
    import itertools

    logger.warning(
        "WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
        "RISK.")

    # Validate input parameters
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_chunks)
    self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                     rem_stop_words, q_val,
                                     word_level, overlap_size)

    # get and validate metadata
    log_info(logger,
             'Required metadata: cand.set key, fk ltable, fk rtable, '
             'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # validate overlap attrs
    self.validate_overlap_attrs(ltable, rtable,
                                l_overlap_attr, r_overlap_attr)

    # validate word_level and q_val
    self.validate_word_level_qval(word_level, q_val)

    # validate number of chunks
    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    if len(candset) == 0:
        # Nothing to block: skip the DAG entirely (splitting an empty
        # candset could otherwise be asked for zero partitions).
        out_table = pd.DataFrame(columns=candset.columns)
    else:
        # # do projection before merge
        l_df = ltable[[l_key, l_overlap_attr]]
        r_df = rtable[[r_key, r_overlap_attr]]

        # # set index for convenience; set_index returns new frames, so
        # # the in-place cast below does not touch ltable / rtable
        l_df = l_df.set_index(l_key, drop=False)
        r_df = r_df.set_index(r_key, drop=False)

        # # cast the overlap attribute to string if required.
        ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
        ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

        if word_level:
            tokenizer = WhitespaceTokenizer(return_set=True)
        else:
            # honour the caller's q instead of the tokenizer's default
            tokenizer = QgramTokenizer(qval=q_val, return_set=True)

        n_chunks = max(1, get_num_partitions(n_chunks, len(candset)))

        # Consecutive row slices; the first `n_larger` chunks carry one
        # extra row (same partitioning as numpy.array_split).
        base_size, n_larger = divmod(len(candset), n_chunks)
        c_splits = []
        start = 0
        for i in range(n_chunks):
            stop = start + base_size + (1 if i < n_larger else 0)
            c_splits.append(candset.iloc[start:stop])
            start = stop

        # Create DAG
        split_tasks = [
            delayed(self._block_candset_split)(c_split, l_df, r_df,
                                               l_key, r_key,
                                               l_overlap_attr,
                                               r_overlap_attr,
                                               fk_ltable, fk_rtable,
                                               allow_missing,
                                               rem_stop_words, tokenizer,
                                               overlap_size)
            for c_split in c_splits
        ]
        dag = delayed(wrap)(split_tasks)

        # Execute the DAG
        if show_progress:
            with ProgressBar():
                valid_splits = dag.compute(scheduler="processes",
                                           num_workers=get_num_cores())
        else:
            valid_splits = dag.compute(scheduler="processes",
                                       num_workers=get_num_cores())

        # flatten the per-chunk results in linear time
        valid = list(itertools.chain.from_iterable(valid_splits))

        # construct output table
        out_table = candset[valid]

    # update the catalog
    cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                              ltable, rtable)

    # return the output table
    return out_table
def _apply_filterable_rule(self, rule_name, ltable, rtable, l_key, r_key):
    """Build the candidate set of one filterable rule via string similarity joins.

    Every conjunct of the rule is executed as a similarity join between
    ``ltable`` and ``rtable``; the per-conjunct results are unioned and
    de-duplicated on the (left key, right key) pair. The joins are run
    with the output prefixes ``'l_'`` and ``'r_'``.

    Args:
        rule_name (string): Key into ``self.rule_str`` (list of
            conjuncts); also passed to ``self._parse_conjunct``.
        ltable (DataFrame): Left table. The join attribute column of each
            conjunct is converted in place with
            ``ssj.dataframe_column_to_str``.
        rtable (DataFrame): Right table, treated like ``ltable``.
        l_key (string): Key attribute of ``ltable``.
        r_key (string): Key attribute of ``rtable``.

    Returns:
        The union of the join results (DataFrame), or None when the rule
        has no conjuncts.

    Raises:
        ValueError: If a conjunct names a similarity function or (for the
            token-based joins) a tokenizer this method cannot map to a
            join. Previously such a conjunct silently reused the settings
            of the preceding conjunct or failed with UnboundLocalError.
    """
    # Prefixes handed to the joins; the de-duplication below must use the
    # same ones (the former code referenced undefined names here and
    # raised NameError as soon as a rule had a second conjunct).
    l_prefix, r_prefix = 'l_', 'r_'
    pair_cols = [l_prefix + l_key, r_prefix + r_key]

    # Constructors are wrapped so a tokenizer is only built when it is used.
    tokenizer_factories = {
        'dlm_dc0': lambda: WhitespaceTokenizer(return_set=True),
        'qgm_3': lambda: QgramTokenizer(qval=3, return_set=True),
    }
    join_fns = {
        'jaccard': ssj.jaccard_join,
        'cosine': ssj.cosine_join,
        'dice': ssj.dice_join,
        'overlap_coeff': ssj.overlap_coefficient_join,
        'lev_dist': ssj.edit_distance_join,
    }

    candset = None
    for conjunct in self.rule_str[rule_name]:
        is_auto_gen, sim_fn, l_attr, r_attr, l_tok, r_tok, op, th = \
            self._parse_conjunct(conjunct, rule_name)

        if sim_fn not in join_fns:
            raise ValueError(
                'Unsupported similarity function %r in rule %r'
                % (sim_fn, rule_name))
        join_fn = join_fns[sim_fn]
        is_edit_distance = join_fn == ssj.edit_distance_join

        # The join operator is derived from the conjunct's operator:
        # '>=' maps to '<' for edit distance, '<=' maps to '>' for the
        # similarity joins; everything else falls back to '<=' / '>='.
        # NOTE(review): presumably the complement of the rule's operator,
        # because a blocking rule describes the pairs to drop -- confirm.
        if is_edit_distance:
            comp_op = '<' if op == '>=' else '<='
        else:
            comp_op = '>' if op == '<=' else '>='

        ssj.dataframe_column_to_str(ltable, l_attr, inplace=True)
        ssj.dataframe_column_to_str(rtable, r_attr, inplace=True)

        if is_edit_distance:
            tokenizer = QgramTokenizer(qval=2, return_set=False)
            c_df = join_fn(ltable, rtable, l_key, r_key, l_attr, r_attr,
                           float(th), comp_op,
                           allow_missing=True,  # need to revisit allow_missing
                           out_sim_score=False,
                           l_out_prefix=l_prefix,
                           r_out_prefix=r_prefix,
                           show_progress=False,
                           tokenizer=tokenizer)
        else:
            if l_tok not in tokenizer_factories:
                raise ValueError(
                    'Unsupported tokenizer %r in rule %r'
                    % (l_tok, rule_name))
            tokenizer = tokenizer_factories[l_tok]()
            c_df = join_fn(ltable, rtable, l_key, r_key, l_attr, r_attr,
                           tokenizer, float(th), comp_op,
                           allow_empty=True,
                           allow_missing=True,
                           l_out_prefix=l_prefix,
                           r_out_prefix=r_prefix,
                           out_sim_score=False)

        if candset is None:
            # candset from the first conjunct of the rule
            candset = c_df
        else:
            # union the candset of this conjunct with the existing candset
            candset = pd.concat([candset, c_df]).drop_duplicates(
                subset=pair_cols).reset_index(drop=True)
    return candset
def block_candset(self, candset, l_overlap_attr, r_overlap_attr,
                  rem_stop_words=False, q_val=None, word_level=True,
                  overlap_size=1, allow_missing=False,
                  verbose=False, show_progress=True, n_jobs=1):
    """Blocks an input candidate set of tuple pairs based on the overlap
    of token sets of attribute values.

    Finds tuple pairs from an input candidate set of tuple pairs such that
    the overlap between (a) the set of tokens obtained by tokenizing the
    value of attribute l_overlap_attr of the left tuple in a tuple pair,
    and (b) the set of tokens obtained by tokenizing the value of
    attribute r_overlap_attr of the right tuple in the tuple pair, is
    above a certain threshold.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        l_overlap_attr (string): The overlap attribute in left table.
        r_overlap_attr (string): The overlap attribute in right table.
        rem_stop_words (boolean): A flag to indicate whether stop words
            (e.g., a, an, the) should be removed from the token sets of
            the overlap attribute values (defaults to False).
        q_val (int): The value of q to use if the overlap attributes
            values are to be tokenized as qgrams (defaults to None).
        word_level (boolean): A flag to indicate whether the overlap
            attributes should be tokenized as words (i.e, using
            whitespace as delimiter) (defaults to True).
        overlap_size (int): The minimum number of tokens that must
            overlap (defaults to 1).
        allow_missing (boolean): A flag to indicate whether tuple pairs
            with missing value in at least one of the blocking attributes
            should be included in the output candidate set (defaults to
            False). If this flag is set to True, a tuple pair with
            missing value in either blocking attribute will be retained
            in the output candidate set. The value is handed to
            ``OverlapFilter`` as is; this method does not type-check it.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress
            should be displayed to the user (defaults to True).
        n_jobs (int): The number of parallel jobs to be used for
            computation (defaults to 1). If -1 all CPUs are used. If 0 or
            1, no parallel computation is used at all, which is useful
            for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
            used (where n_cpus is the total number of CPUs in the
            machine). Thus, for n_jobs = -2, all CPUs but one are used.
            If (n_cpus + 1 + n_jobs) is less than 1, then no parallel
            computation is used (i.e., equivalent to the default).

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `l_overlap_attr` is not of type string.
        AssertionError: If `r_overlap_attr` is not of type string.
        AssertionError: If `q_val` is not of type int.
        AssertionError: If `word_level` is not of type boolean.
        AssertionError: If `overlap_size` is not of type int.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_jobs` is not of type int.
        AssertionError: If `l_overlap_attr` is not in the ltable columns.
        AssertionError: If `r_overlap_attr` is not in the rtable columns.
        SyntaxError: If `q_val` is set to a valid value and `word_level`
            is set to True.
        SyntaxError: If `q_val` is set to None and `word_level` is set to
            False.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> ob = em.OverlapBlocker()
        >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'])
        >>> D1 = ob.block_candset(C, 'name', 'name')
        # Include all possible tuple pairs with missing values
        >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True)
        # Execute blocking using multiple cores
        >>> D3 = ob.block_candset(C, 'name', 'name', n_jobs=-1)
        # Use q-gram tokenizer
        >>> D4 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2)
    """
    # validate data types of standard input parameters
    self.validate_types_params_candset(candset, verbose,
                                       show_progress, n_jobs)

    # validate data types of input parameters specific to overlap blocker
    self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                     rem_stop_words, q_val,
                                     word_level, overlap_size)

    # get and validate metadata
    log_info(logger,
             'Required metadata: cand.set key, fk ltable, fk rtable, '
             'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # validate overlap attrs
    self.validate_overlap_attrs(ltable, rtable,
                                l_overlap_attr, r_overlap_attr)

    # validate word_level and q_val
    self.validate_word_level_qval(word_level, q_val)

    # do blocking

    # # do projection before merge. The projections are explicit copies so
    # # the in-place edits below cannot touch the base tables or trigger
    # # SettingWithCopyWarning (the old `is_copy = False` trick no longer
    # # has any effect on current pandas).
    l_df = ltable[[l_key, l_overlap_attr]].copy()
    r_df = rtable[[r_key, r_overlap_attr]].copy()

    # # cast the overlap attribute to string if required.
    ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
    ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

    # # cleanup the tables from non-ascii characters, punctuations, and
    # # stop words
    self.cleanup_table(l_df, l_overlap_attr, rem_stop_words)
    self.cleanup_table(r_df, r_overlap_attr, rem_stop_words)

    # # determine which tokenizer to use
    if word_level:
        # # # create a whitespace tokenizer
        tokenizer = WhitespaceTokenizer(return_set=True)
    else:
        # # # create a qgram tokenizer
        tokenizer = QgramTokenizer(qval=q_val, return_set=True)

    # # create a filter for overlap similarity join
    overlap_filter = OverlapFilter(tokenizer, overlap_size,
                                   allow_missing=allow_missing)

    # # perform overlap similarity filtering of the candset
    out_table = overlap_filter.filter_candset(candset,
                                              fk_ltable, fk_rtable,
                                              l_df, r_df,
                                              l_key, r_key,
                                              l_overlap_attr,
                                              r_overlap_attr,
                                              n_jobs,
                                              show_progress=show_progress)

    # update catalog
    cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                              ltable, rtable)

    # return candidate set
    return out_table