def block_candset(self, candset, verbose=False, show_progress=True, n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    Blocks an input candidate set of tuple pairs based on a sequence of
    blocking rules supplied by the user.

    Finds tuple pairs from an input candidate set of tuple pairs that
    survive the sequence of blocking rules. A tuple pair survives the
    sequence of blocking rules if none of the rules in the sequence returns
    True for that pair. If any of the rules returns True, then the pair is
    blocked (dropped).

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).
        n_chunks (int): The number of partitions to split the candidate set.
            If it is set to -1, the number of partitions will be set to the
            number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `n_chunks` is not of type int.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If there are no rules to apply.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_rule_based_blocker import DaskRuleBasedBlocker
        >>> rb = DaskRuleBasedBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> rule = ['name_name_lev(ltuple, rtuple) > 3']
        >>> rb.add_rule(rule, feature_table=block_f)
        >>> D = rb.block_candset(C) # C is the candidate set.
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.")

    # validate data types of input parameters
    self.validate_types_params_candset(candset, verbose, show_progress, n_chunks)

    # get and validate metadata
    log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                     'fk rtable, ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # validate rules
    assert len(self.rules.keys()) > 0, 'There are no rules to apply'

    # validate n_chunks parameter
    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)
    n_chunks = get_num_partitions(n_chunks, len(candset))

    # do blocking

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # get attributes to project
    l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(l_key, r_key, [], [])
    l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs]

    c_df = self.block_candset_excluding_rule(candset, l_df, r_df, l_key, r_key,
                                             fk_ltable, fk_rtable, None,
                                             show_progress, n_chunks)

    # update catalog
    cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable, ltable, rtable)

    # return candidate set
    return c_df
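# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the blocker): the chunk/delayed/compute
# pattern that block_candset_excluding_rule and the other helpers in this
# module are built on. `_keep_pair` is a hypothetical stand-in for a compiled
# blocking rule; only public pandas/NumPy/Dask APIs are used, and the
# threaded scheduler is chosen so the sketch runs anywhere without pickling
# concerns.

import numpy as np
import pandas as pd
from dask import delayed
from dask.diagnostics import ProgressBar

def _keep_pair(row):
    # hypothetical rule: drop a pair when both sides share the same name
    return row['ltable_name'] != row['rtable_name']

def _survives(chunk):
    # one worker task: a boolean per row of its chunk
    return [_keep_pair(row) for _, row in chunk.iterrows()]

def sketch_block_in_chunks(candset, n_chunks=2):
    chunks = np.array_split(candset, n_chunks)
    tasks = [delayed(_survives)(c) for c in chunks]
    with ProgressBar():
        # sum(..., []) concatenates the per-chunk boolean lists
        flags = delayed(sum)(tasks, []).compute(scheduler="threads")
    return candset[flags]
# ---------------------------------------------------------------------------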
def dask_extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                              attrs_after=None, verbose=False,
                              show_progress=True, n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set). Specifically, this function uses the feature
    table, and the ltable and rtable present in the `candset`'s metadata,
    to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the feature
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset
            that should be added before the feature vectors (defaults to None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors
            (defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).
        n_chunks (int): The number of partitions to split the candidate set.
            If it is set to -1, the number of partitions will be set to the
            number of cores in the machine.

    Returns:
        A pandas DataFrame containing feature vectors. The DataFrame will
        have metadata ltable and rtable, pointing to the same ltable and
        rtable as the input candset. Also, the output DataFrame will have
        three columns: key, foreign key ltable, and foreign key rtable,
        copied from the input candset to the output DataFrame. These three
        columns precede the columns mentioned in `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `attrs_before` has attributes that are not
            present in the input candset.
        AssertionError: If `attrs_after` has attributes that are not
            present in the input candset.
        AssertionError: If `feature_table` is set to None.
        AssertionError: If `n_chunks` is not of type int.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_extract_features import dask_extract_feature_vecs
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = dask_extract_feature_vecs(G, feature_table=match_f, attrs_before=['title'], attrs_after=['gold_labels'])
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.")

    # Validate input parameters
    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # If attrs_before is given, check that the attrs_before are present
    # # in the input candset
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error('The attributes mentioned in attrs_before are not '
                         'present in the input table')
            raise AssertionError('The attributes mentioned in attrs_before '
                                 'are not present in the input table')

    # # If attrs_after is given, check that the attrs_after are present
    # # in the input candset
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error('The attributes mentioned in attrs_after are not '
                         'present in the input table')
            raise AssertionError('The attributes mentioned in attrs_after '
                                 'are not present in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features

    # # Set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)
    n_chunks = get_num_partitions(n_chunks, len(candset))

    c_splits = np.array_split(candset, n_chunks)

    # pickle the feature table so that dynamically compiled feature
    # functions can be shipped to the worker processes
    pickled_obj = cloudpickle.dumps(feature_table)

    feat_vals_by_splits = []
    for i in range(len(c_splits)):
        partial_result = delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df,
            c_splits[i], False)
        feat_vals_by_splits.append(partial_result)

    feat_vals_by_splits = delayed(wrap)(feat_vals_by_splits)
    if show_progress:
        with ProgressBar():
            feat_vals_by_splits = feat_vals_by_splits.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        feat_vals_by_splits = feat_vals_by_splits.compute(
            scheduler="processes", num_workers=get_num_cores())

    feat_vals = sum(feat_vals_by_splits, [])

    # Construct the output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)

    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)

    # # Insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])
    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # Insert attrs_after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
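# ---------------------------------------------------------------------------
# Illustrative sketch (assumptions flagged): why the feature table is
# serialized with cloudpickle before being handed to worker processes.
# py_entitymatching compiles feature functions from strings at runtime, and
# such dynamically created functions do not survive the plain pickle used by
# multiprocessing; cloudpickle handles them. `toy_feature` is hypothetical.

import cloudpickle

toy_feature = eval('lambda ltuple, rtuple: float(ltuple["name"] == rtuple["name"])')
payload = cloudpickle.dumps(toy_feature)      # safe to ship to a child process
restored = cloudpickle.loads(payload)
assert restored({'name': 'ann'}, {'name': 'ann'}) == 1.0
# ---------------------------------------------------------------------------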
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 verbose=False, show_progress=True, n_ltable_chunks=1,
                 n_rtable_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    Blocks two tables based on a black box blocking function specified by
    the user.

    Finds tuple pairs from left and right tables that survive the black box
    function. A tuple pair survives the black box blocking function if the
    function returns False for that pair, otherwise the tuple pair is
    dropped.

    Args:
        ltable (DataFrame): The left input table.
        rtable (DataFrame): The right input table.
        l_output_attrs (list): A list of attribute names from the left
            table to be included in the output candidate set (defaults to None).
        r_output_attrs (list): A list of attribute names from the right
            table to be included in the output candidate set (defaults to None).
        l_output_prefix (string): The prefix to be used for the attribute names
            coming from the left table in the output candidate set
            (defaults to 'ltable\_').
        r_output_prefix (string): The prefix to be used for the attribute names
            coming from the right table in the output candidate set
            (defaults to 'rtable\_').
        verbose (boolean): A flag to indicate whether the debug information
            should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).
        n_ltable_chunks (int): The number of partitions to split the left table
            (defaults to 1). If it is set to -1, then the number of
            partitions is set to the number of cores in the machine.
        n_rtable_chunks (int): The number of partitions to split the right table
            (defaults to 1). If it is set to -1, then the number of
            partitions is set to the number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `ltable` is not of type pandas DataFrame.
        AssertionError: If `rtable` is not of type pandas DataFrame.
        AssertionError: If `l_output_attrs` is not of type of list.
        AssertionError: If `r_output_attrs` is not of type of list.
        AssertionError: If the values in `l_output_attrs` are not of type
            string.
        AssertionError: If the values in `r_output_attrs` are not of type
            string.
        AssertionError: If `l_output_prefix` is not of type string.
        AssertionError: If `r_output_prefix` is not of type string.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_ltable_chunks` is not of type int.
        AssertionError: If `n_rtable_chunks` is not of type int.
        AssertionError: If `l_output_attrs` are not in the ltable.
        AssertionError: If `r_output_attrs` are not in the rtable.

    Examples:
        >>> def match_last_name(ltuple, rtuple):
                # assume that there is a 'name' attribute in the input tables
                # and each value in it has two words
                l_last_name = ltuple['name'].split()[1]
                r_last_name = rtuple['name'].split()[1]
                if l_last_name != r_last_name:
                    return True
                else:
                    return False
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_black_box_blocker import DaskBlackBoxBlocker
        >>> bb = DaskBlackBoxBlocker()
        >>> bb.set_black_box_function(match_last_name)
        >>> C = bb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'])
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.")
    # validate data types of standard input parameters
    self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                      r_output_attrs, l_output_prefix,
                                      r_output_prefix, verbose, 1)

    # validate data type of show_progress
    self.validate_show_progress(show_progress)

    # validate black box function
    assert self.black_box_function is not None, 'Black box function is not set'

    # validate output attributes
    self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

    # get and validate metadata
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # # get metadata
    l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
    cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)

    # validate number of ltable and rtable chunks
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')
    validate_chunks(n_ltable_chunks)
    validate_chunks(n_rtable_chunks)

    # # determine the number of chunks
    n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable))
    n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable))

    # do blocking

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # remove l_key from l_output_attrs and r_key from r_output_attrs
    l_output_attrs_1 = []
    if l_output_attrs:
        l_output_attrs_1 = [x for x in l_output_attrs if x != l_key]
    r_output_attrs_1 = []
    if r_output_attrs:
        r_output_attrs_1 = [x for x in r_output_attrs if x != r_key]

    # # pickle the black-box function before passing it as an arg to
    # # _block_tables_split to be executed by each child process
    black_box_function_pkl = cp.dumps(self.black_box_function)

    if n_ltable_chunks == 1 and n_rtable_chunks == 1:
        # single process
        candset = _block_tables_split(l_df, r_df, l_key, r_key,
                                      l_output_attrs_1, r_output_attrs_1,
                                      l_output_prefix, r_output_prefix,
                                      black_box_function_pkl, show_progress)
    else:
        # multiprocessing
        l_splits = np.array_split(l_df, n_ltable_chunks)
        r_splits = np.array_split(r_df, n_rtable_chunks)

        c_splits = []
        for i in range(len(l_splits)):
            for j in range(len(r_splits)):
                partial_result = delayed(_block_tables_split)(
                    l_splits[i], r_splits[j], l_key, r_key,
                    l_output_attrs_1, r_output_attrs_1,
                    l_output_prefix, r_output_prefix,
                    black_box_function_pkl, False)
                c_splits.append(partial_result)

        c_splits = delayed(wrap)(c_splits)
        if show_progress:
            with ProgressBar():
                c_splits = c_splits.compute(scheduler="processes",
                                            num_workers=get_num_cores())
        else:
            c_splits = c_splits.compute(scheduler="processes",
                                        num_workers=get_num_cores())
        candset = pd.concat(c_splits, ignore_index=True)

    # # determine the attributes to retain in the output candidate set
    retain_cols = self.get_attrs_to_retain(l_key, r_key,
                                           l_output_attrs, r_output_attrs,
                                           l_output_prefix, r_output_prefix)
    if len(candset) > 0:
        candset = candset[retain_cols]
    else:
        candset = pd.DataFrame(columns=retain_cols)

    # update catalog
    key = get_name_for_key(candset.columns)
    candset = add_key_column(candset, key)
    cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                              r_output_prefix + r_key, ltable, rtable)

    # return candidate set
    return candset
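# ---------------------------------------------------------------------------
# Illustrative sketch: the Cartesian fan-out used above. Every left chunk is
# paired with every right chunk, so n_ltable_chunks * n_rtable_chunks tasks
# are created; keep the product modest. Only NumPy/pandas are assumed.

import numpy as np
import pandas as pd

_l = pd.DataFrame({'id': range(6)})
_r = pd.DataFrame({'id': range(4)})
_l_splits = np.array_split(_l, 3)
_r_splits = np.array_split(_r, 2)
_tasks = [(l, r) for l in _l_splits for r in _r_splits]
assert len(_tasks) == 6      # 3 x 2 chunk pairs
# ---------------------------------------------------------------------------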
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 verbose=False, show_progress=True, n_ltable_chunks=1,
                 n_rtable_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    Blocks two tables based on the sequence of rules supplied by the user.

    Finds tuple pairs from left and right tables that survive the sequence
    of blocking rules. A tuple pair survives the sequence of blocking rules
    if none of the rules in the sequence returns True for that pair. If any
    of the rules returns True, then the pair is blocked.

    Args:
        ltable (DataFrame): The left input table.
        rtable (DataFrame): The right input table.
        l_output_attrs (list): A list of attribute names from the left
            table to be included in the output candidate set (defaults to None).
        r_output_attrs (list): A list of attribute names from the right
            table to be included in the output candidate set (defaults to None).
        l_output_prefix (string): The prefix to be used for the attribute names
            coming from the left table in the output candidate set
            (defaults to 'ltable\_').
        r_output_prefix (string): The prefix to be used for the attribute names
            coming from the right table in the output candidate set
            (defaults to 'rtable\_').
        verbose (boolean): A flag to indicate whether the debug information
            should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).
        n_ltable_chunks (int): The number of partitions to split the left table
            (defaults to 1). If it is set to -1, then the number of
            partitions is set to the number of cores in the machine.
        n_rtable_chunks (int): The number of partitions to split the right table
            (defaults to 1). If it is set to -1, then the number of
            partitions is set to the number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived the sequence of
        blocking rules (DataFrame).

    Raises:
        AssertionError: If `ltable` is not of type pandas DataFrame.
        AssertionError: If `rtable` is not of type pandas DataFrame.
        AssertionError: If `l_output_attrs` is not of type of list.
        AssertionError: If `r_output_attrs` is not of type of list.
        AssertionError: If the values in `l_output_attrs` are not of type
            string.
        AssertionError: If the values in `r_output_attrs` are not of type
            string.
        AssertionError: If the input `l_output_prefix` is not of type string.
        AssertionError: If the input `r_output_prefix` is not of type string.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_ltable_chunks` is not of type int.
        AssertionError: If `n_rtable_chunks` is not of type int.
        AssertionError: If `l_output_attrs` are not in the ltable.
        AssertionError: If `r_output_attrs` are not in the rtable.
        AssertionError: If there are no rules to apply.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_rule_based_blocker import DaskRuleBasedBlocker
        >>> rb = DaskRuleBasedBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> rule = ['name_name_lev(ltuple, rtuple) > 3']
        >>> rb.add_rule(rule, feature_table=block_f)
        >>> C = rb.block_tables(A, B)
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.")
    # validate data types of input parameters
    self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                      r_output_attrs, l_output_prefix,
                                      r_output_prefix, verbose, 1)

    # validate data type of show_progress
    self.validate_show_progress(show_progress)

    # validate input parameters
    self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

    # get and validate metadata
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # # get metadata
    l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
    cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)

    # validate rules
    assert len(self.rules.keys()) > 0, 'There are no rules to apply'

    # validate number of ltable and rtable chunks
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')
    validate_chunks(n_ltable_chunks)
    validate_chunks(n_rtable_chunks)

    # # determine the number of chunks
    n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable))
    n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable))

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # remove l_key from l_output_attrs and r_key from r_output_attrs
    l_output_attrs_1 = []
    if l_output_attrs:
        l_output_attrs_1 = [x for x in l_output_attrs if x != l_key]
    r_output_attrs_1 = []
    if r_output_attrs:
        r_output_attrs_1 = [x for x in r_output_attrs if x != r_key]

    # # get attributes to project
    l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(
        l_key, r_key, l_output_attrs_1, r_output_attrs_1)
    l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs]

    # apply a filterable rule first, if any; pass the number of cores in
    # the machine as the number of splits
    candset, rule_applied = self.block_tables_with_filters(
        l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1,
        l_output_prefix, r_output_prefix, verbose, show_progress,
        get_num_cores())

    if candset is None:
        # no filterable rule was applied
        candset = self.block_tables_without_filters(
            l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1,
            l_output_prefix, r_output_prefix, verbose, show_progress,
            n_ltable_chunks, n_rtable_chunks)
    elif len(self.rules) > 1:
        # one filterable rule was applied but other rules are left;
        # block the candset by applying the other rules, excluding the
        # applied rule
        candset = self.block_candset_excluding_rule(
            candset, l_df, r_df, l_key, r_key,
            l_output_prefix + l_key, r_output_prefix + r_key,
            rule_applied, show_progress, get_num_cores())

    retain_cols = self.get_attrs_to_retain(l_key, r_key,
                                           l_output_attrs_1, r_output_attrs_1,
                                           l_output_prefix, r_output_prefix)
    if len(candset) > 0:
        candset = candset[retain_cols]
    else:
        candset = pd.DataFrame(columns=retain_cols)

    # update catalog
    key = get_name_for_key(candset.columns)
    candset = add_key_column(candset, key)
    cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                              r_output_prefix + r_key, ltable, rtable)

    # return candidate set
    return candset
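# ---------------------------------------------------------------------------
# Illustrative sketch: what a rule string such as
# 'name_name_lev(ltuple, rtuple) > 3' behaves like once compiled. It is a
# predicate over a tuple pair, and a pair is BLOCKED when any rule returns
# True. `_lev` is a tiny stand-in for the real feature function.

def _lev(a, b):
    # naive Levenshtein distance; fine for a sketch
    if not a:
        return len(b)
    if not b:
        return len(a)
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1,
                           prev[j - 1] + (ca != cb)))
        prev = cur
    return prev[-1]

def _rule(ltuple, rtuple):
    # True means the pair is blocked (dropped)
    return _lev(ltuple['name'], rtuple['name']) > 3

# 'smith' vs 'smyth' differ by one edit, so the pair survives
assert not _rule({'name': 'smith'}, {'name': 'smyth'})
# ---------------------------------------------------------------------------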
def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr,
                 rem_stop_words=False, q_val=None, word_level=True,
                 overlap_size=1, l_output_attrs=None, r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 allow_missing=False, verbose=False, show_progress=True,
                 n_ltable_chunks=1, n_rtable_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    Blocks two tables based on the overlap of token sets of attribute
    values.

    Finds tuple pairs from left and right tables such that the overlap
    between (a) the set of tokens obtained by tokenizing the value of
    attribute l_overlap_attr of a tuple from the left table, and (b) the
    set of tokens obtained by tokenizing the value of attribute
    r_overlap_attr of a tuple from the right table, is above a certain
    threshold.

    Args:
        ltable (DataFrame): The left input table.
        rtable (DataFrame): The right input table.
        l_overlap_attr (string): The overlap attribute in left table.
        r_overlap_attr (string): The overlap attribute in right table.
        rem_stop_words (boolean): A flag to indicate whether stop words
            (e.g., a, an, the) should be removed from the token sets of the
            overlap attribute values (defaults to False).
        q_val (int): The value of q to use if the overlap attributes
            values are to be tokenized as qgrams (defaults to None).
        word_level (boolean): A flag to indicate whether the overlap
            attributes should be tokenized as words (i.e., using whitespace
            as delimiter) (defaults to True).
        overlap_size (int): The minimum number of tokens that must overlap
            (defaults to 1).
        l_output_attrs (list): A list of attribute names from the left
            table to be included in the output candidate set (defaults to None).
        r_output_attrs (list): A list of attribute names from the right
            table to be included in the output candidate set (defaults to None).
        l_output_prefix (string): The prefix to be used for the attribute names
            coming from the left table in the output candidate set
            (defaults to 'ltable\_').
        r_output_prefix (string): The prefix to be used for the attribute names
            coming from the right table in the output candidate set
            (defaults to 'rtable\_').
        allow_missing (boolean): A flag to indicate whether tuple pairs
            with missing value in at least one of the blocking attributes
            should be included in the output candidate set (defaults to
            False). If this flag is set to True, a tuple in ltable with
            missing value in the blocking attribute will be matched with
            every tuple in rtable and vice versa.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).
        n_ltable_chunks (int): The number of partitions to split the left table
            (defaults to 1). If it is set to -1, then the number of
            partitions is set to the number of cores in the machine.
        n_rtable_chunks (int): The number of partitions to split the right table
            (defaults to 1). If it is set to -1, then the number of
            partitions is set to the number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `ltable` is not of type pandas DataFrame.
        AssertionError: If `rtable` is not of type pandas DataFrame.
        AssertionError: If `l_overlap_attr` is not of type string.
        AssertionError: If `r_overlap_attr` is not of type string.
        AssertionError: If `l_output_attrs` is not of type of list.
        AssertionError: If `r_output_attrs` is not of type of list.
        AssertionError: If the values in `l_output_attrs` are not of type
            string.
        AssertionError: If the values in `r_output_attrs` are not of type
            string.
        AssertionError: If `l_output_prefix` is not of type string.
        AssertionError: If `r_output_prefix` is not of type string.
        AssertionError: If `q_val` is not of type int.
        AssertionError: If `word_level` is not of type boolean.
        AssertionError: If `overlap_size` is not of type int.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `allow_missing` is not of type boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_ltable_chunks` is not of type int.
        AssertionError: If `n_rtable_chunks` is not of type int.
        AssertionError: If `l_overlap_attr` is not in the ltable columns.
        AssertionError: If `r_overlap_attr` is not in the rtable columns.
        AssertionError: If `l_output_attrs` are not in the ltable.
        AssertionError: If `r_output_attrs` are not in the rtable.
        SyntaxError: If `q_val` is set to a valid value and `word_level` is
            set to True.
        SyntaxError: If `q_val` is set to None and `word_level` is set to
            False.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> ob = DaskOverlapBlocker()
        >>> # Use all cores with a word-level tokenizer
        >>> C1 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=True, overlap_size=1, n_ltable_chunks=-1, n_rtable_chunks=-1)
        >>> # Use q-gram tokenizer
        >>> C2 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=False, q_val=2, n_ltable_chunks=-1, n_rtable_chunks=-1)
        >>> # Include all possible missing values
        >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True, n_ltable_chunks=-1, n_rtable_chunks=-1)
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.")
    # Input validations
    self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                      r_output_attrs, l_output_prefix,
                                      r_output_prefix, verbose,
                                      n_ltable_chunks, n_rtable_chunks)
    self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                     rem_stop_words, q_val, word_level,
                                     overlap_size)
    self.validate_allow_missing(allow_missing)
    self.validate_show_progress(show_progress)
    self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr)
    self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)
    self.validate_word_level_qval(word_level, q_val)

    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose)

    # validate metadata
    cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
    cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)

    # validate input table chunks
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')
    validate_chunks(n_ltable_chunks)
    validate_chunks(n_rtable_chunks)

    if n_ltable_chunks == -1:
        n_ltable_chunks = multiprocessing.cpu_count()

    ltable_chunks = np.array_split(ltable, n_ltable_chunks)

    # preprocess/tokenize ltable
    if word_level:
        tokenizer = WhitespaceTokenizer(return_set=True)
    else:
        tokenizer = QgramTokenizer(qval=q_val, return_set=True)

    preprocessed_tokenized_ltbl = []

    # Construct the DAG for preprocessing/tokenizing the ltable chunks
    start_row_id = 0
    for i in range(len(ltable_chunks)):
        result = delayed(self.process_tokenize_block_attr)(
            ltable_chunks[i][l_overlap_attr], start_row_id,
            rem_stop_words, tokenizer)
        preprocessed_tokenized_ltbl.append(result)
        start_row_id += len(ltable_chunks[i])
    preprocessed_tokenized_ltbl = delayed(wrap)(preprocessed_tokenized_ltbl)

    # Execute the DAG
    if show_progress:
        with ProgressBar():
            logger.info('Preprocessing/tokenizing ltable')
            preprocessed_tokenized_ltbl_vals = preprocessed_tokenized_ltbl.compute(
                scheduler="processes", num_workers=multiprocessing.cpu_count())
    else:
        preprocessed_tokenized_ltbl_vals = preprocessed_tokenized_ltbl.compute(
            scheduler="processes", num_workers=multiprocessing.cpu_count())

    ltable_processed_dict = {}
    for i in range(len(preprocessed_tokenized_ltbl_vals)):
        ltable_processed_dict.update(preprocessed_tokenized_ltbl_vals[i])

    # build the inverted index over the tokenized ltable values
    inverted_index = self.build_inverted_index(ltable_processed_dict)

    if n_rtable_chunks == -1:
        n_rtable_chunks = multiprocessing.cpu_count()

    rtable_chunks = np.array_split(rtable, n_rtable_chunks)

    # Construct the DAG for probing
    probe_result = []
    start_row_id = 0
    for i in range(len(rtable_chunks)):
        result = delayed(self.probe)(rtable_chunks[i][r_overlap_attr],
                                     inverted_index, start_row_id,
                                     rem_stop_words, tokenizer, overlap_size)
        probe_result.append(result)
        start_row_id += len(rtable_chunks[i])
    probe_result = delayed(wrap)(probe_result)

    # Execute the DAG for probing
    if show_progress:
        with ProgressBar():
            logger.info('Probing using rtable')
            probe_result = probe_result.compute(
                scheduler="processes", num_workers=multiprocessing.cpu_count())
    else:
        probe_result = probe_result.compute(
            scheduler="processes", num_workers=multiprocessing.cpu_count())

    # construct a minimal dataframe that can be used to add more attributes
    flat_list = [item for sublist in probe_result for item in sublist]
    tmp = pd.DataFrame(flat_list, columns=['fk_ltable_rid', 'fk_rtable_rid'])
    fk_ltable = ltable.iloc[tmp.fk_ltable_rid][l_key].values
    fk_rtable = rtable.iloc[tmp.fk_rtable_rid][r_key].values
    id_vals = list(range(len(flat_list)))

    candset = pd.DataFrame.from_dict(
        {'_id': id_vals,
         l_output_prefix + l_key: fk_ltable,
         r_output_prefix + r_key: fk_rtable})

    # set the properties for the candidate set (use the output prefixes so
    # the foreign-key metadata matches the generated column names)
    cm.set_key(candset, '_id')
    cm.set_fk_ltable(candset, l_output_prefix + l_key)
    cm.set_fk_rtable(candset, r_output_prefix + r_key)
    cm.set_ltable(candset, ltable)
    cm.set_rtable(candset, rtable)

    ret_candset = gh.add_output_attributes(candset,
                                           l_output_attrs=l_output_attrs,
                                           r_output_attrs=r_output_attrs,
                                           l_output_prefix=l_output_prefix,
                                           r_output_prefix=r_output_prefix,
                                           validate=False)

    # handle missing values
    if allow_missing:
        missing_value_pairs = get_pairs_with_missing_value(
            ltable, rtable, l_key, r_key, l_overlap_attr, r_overlap_attr,
            l_output_attrs, r_output_attrs, l_output_prefix,
            r_output_prefix, False, False)
        missing_value_pairs.insert(
            0, '_id', range(len(ret_candset),
                            len(ret_candset) + len(missing_value_pairs)))

        if len(missing_value_pairs) > 0:
            ret_candset = pd.concat([ret_candset, missing_value_pairs],
                                    ignore_index=True, sort=False)
            cm.set_key(ret_candset, '_id')
            cm.set_fk_ltable(ret_candset, l_output_prefix + l_key)
            cm.set_fk_rtable(ret_candset, r_output_prefix + r_key)
            cm.set_ltable(ret_candset, ltable)
            cm.set_rtable(ret_candset, rtable)

    # Return the final candidate set to user.
    return ret_candset
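# ---------------------------------------------------------------------------
# Illustrative sketch: the token inverted index that drives the overlap
# blocker. Tokens map to the set of left row ids containing them; probing
# with a right row's tokens keeps pairs whose overlap count reaches
# overlap_size. This mirrors build_inverted_index/probe above in miniature.

from collections import defaultdict

def sketch_build_inverted_index(tokenized):      # {row_id: set(tokens)}
    index = defaultdict(set)
    for rid, tokens in tokenized.items():
        for tok in tokens:
            index[tok].add(rid)
    return index

def sketch_probe(tokens, index, overlap_size=1):
    counts = defaultdict(int)
    for tok in tokens:
        for rid in index.get(tok, ()):
            counts[rid] += 1
    return {rid for rid, c in counts.items() if c >= overlap_size}

_idx = sketch_build_inverted_index({0: {'12', 'main', 'st'}, 1: {'oak', 'ave'}})
assert sketch_probe({'main', 'st'}, _idx, overlap_size=2) == {0}
# ---------------------------------------------------------------------------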
def block_candset(self, candset, verbose=False, show_progress=True, n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    Blocks an input candidate set of tuple pairs based on a black box
    blocking function specified by the user.

    Finds tuple pairs from an input candidate set of tuple pairs that
    survive the black box function. A tuple pair survives the black box
    blocking function if the function returns False for that pair,
    otherwise the tuple pair is dropped.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        verbose (boolean): A flag to indicate whether logging should be
            done (defaults to False).
        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).
        n_chunks (int): The number of partitions to split the candidate set.
            If it is set to -1, the number of partitions will be set to the
            number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `n_chunks` is not of type int.
        AssertionError: If `show_progress` is not of type boolean.

    Examples:
        >>> def match_last_name(ltuple, rtuple):
                # assume that there is a 'name' attribute in the input tables
                # and each value in it has two words
                l_last_name = ltuple['name'].split()[1]
                r_last_name = rtuple['name'].split()[1]
                if l_last_name != r_last_name:
                    return True
                else:
                    return False
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_black_box_blocker import DaskBlackBoxBlocker
        >>> bb = DaskBlackBoxBlocker()
        >>> bb.set_black_box_function(match_last_name)
        >>> D = bb.block_candset(C) # C is an output from block_tables
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.")
    # validate data types of standard input parameters
    self.validate_types_params_candset(candset, verbose, show_progress, n_chunks)

    # validate black box function
    assert self.black_box_function is not None, 'Black box function is not set'

    # get and validate metadata
    log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
                     'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    # do blocking

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # project candset to keep only the ID attributes
    c_df = candset[[key, fk_ltable, fk_rtable]]

    # # determine the number of chunks to process in parallel
    n_chunks = get_num_partitions(n_chunks, len(candset))

    # # pickle the black-box function before passing it as an arg to
    # # _block_candset_split to be executed by each child process
    black_box_function_pkl = cp.dumps(self.black_box_function)

    valid = []
    if n_chunks == 1:
        # single process
        valid = _block_candset_split(c_df, l_df, r_df, l_key, r_key,
                                     fk_ltable, fk_rtable,
                                     black_box_function_pkl, show_progress)
    else:
        # multiprocessing
        c_splits = np.array_split(c_df, n_chunks)

        valid_splits = []
        for i in range(len(c_splits)):
            partial_result = delayed(_block_candset_split)(
                c_splits[i], l_df, r_df, l_key, r_key,
                fk_ltable, fk_rtable, black_box_function_pkl, False)
            valid_splits.append(partial_result)

        valid_splits = delayed(wrap)(valid_splits)
        if show_progress:
            with ProgressBar():
                valid_splits = valid_splits.compute(scheduler="processes",
                                                    num_workers=get_num_cores())
        else:
            valid_splits = valid_splits.compute(scheduler="processes",
                                                num_workers=get_num_cores())
        valid = sum(valid_splits, [])

    # construct the output table
    if len(c_df) > 0:
        c_df = candset[valid]
    else:
        c_df = pd.DataFrame(columns=candset.columns)

    # update catalog
    cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable, ltable, rtable)

    # return candidate set
    return c_df
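# ---------------------------------------------------------------------------
# Illustrative sketch: the survive-mask pattern used above. Each worker
# returns a list of booleans aligned with its chunk; the lists are
# concatenated with sum(..., []) and the resulting mask selects the
# surviving rows of the candidate set.

import pandas as pd

_candset = pd.DataFrame({'_id': [0, 1, 2, 3]})
_valid_splits = [[True, False], [False, True]]   # results from two chunks
_valid = sum(_valid_splits, [])                  # [True, False, False, True]
_surviving = _candset[_valid]
assert list(_surviving['_id']) == [0, 3]
# ---------------------------------------------------------------------------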
def block_tables(self, ltable, rtable, l_block_attr, r_block_attr,
                 l_output_attrs=None, r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 allow_missing=False, verbose=False,
                 n_ltable_chunks=1, n_rtable_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    Blocks two tables based on attribute equivalence.

    Conceptually, this will check `l_block_attr=r_block_attr` for each
    tuple pair from the Cartesian product of tables `ltable` and `rtable`.
    It outputs a pandas DataFrame with the tuple pairs that satisfy the
    equality condition. The DataFrame will include the attribute '_id', the
    key attribute from ltable, the key attribute from rtable, followed by
    the lists `l_output_attrs` and `r_output_attrs` if they are specified.
    Each of these output and key attributes will be prefixed with the given
    `l_output_prefix` and `r_output_prefix`. If `allow_missing` is set to
    `True` then all tuple pairs with a missing value in at least one of the
    tuples will be included in the output DataFrame. Further, this will
    update the following metadata in the catalog for the output table:
    (1) key, (2) ltable, (3) rtable, (4) fk_ltable, and (5) fk_rtable.

    Args:
        ltable (DataFrame): The left input table.
        rtable (DataFrame): The right input table.
        l_block_attr (string): The blocking attribute in left table.
        r_block_attr (string): The blocking attribute in right table.
        l_output_attrs (list): A list of attribute names from the left
            table to be included in the output candidate set (defaults to None).
        r_output_attrs (list): A list of attribute names from the right
            table to be included in the output candidate set (defaults to None).
        l_output_prefix (string): The prefix to be used for the attribute names
            coming from the left table in the output candidate set
            (defaults to 'ltable\_').
        r_output_prefix (string): The prefix to be used for the attribute names
            coming from the right table in the output candidate set
            (defaults to 'rtable\_').
        allow_missing (boolean): A flag to indicate whether tuple pairs
            with missing value in at least one of the blocking attributes
            should be included in the output candidate set (defaults to
            False). If this flag is set to True, a tuple in ltable with
            missing value in the blocking attribute will be matched with
            every tuple in rtable and vice versa.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).
        n_ltable_chunks (int): The number of partitions to split the left table
            (defaults to 1). If it is set to -1, then the number of
            partitions is set to the number of cores in the machine.
        n_rtable_chunks (int): The number of partitions to split the right table
            (defaults to 1). If it is set to -1, then the number of
            partitions is set to the number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `ltable` is not of type pandas DataFrame.
        AssertionError: If `rtable` is not of type pandas DataFrame.
        AssertionError: If `l_block_attr` is not of type string.
        AssertionError: If `r_block_attr` is not of type string.
        AssertionError: If `l_output_attrs` is not of type of list.
        AssertionError: If `r_output_attrs` is not of type of list.
        AssertionError: If the values in `l_output_attrs` are not of type
            string.
        AssertionError: If the values in `r_output_attrs` are not of type
            string.
        AssertionError: If `l_output_prefix` is not of type string.
        AssertionError: If `r_output_prefix` is not of type string.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `allow_missing` is not of type boolean.
        AssertionError: If `n_ltable_chunks` is not of type int.
        AssertionError: If `n_rtable_chunks` is not of type int.
        AssertionError: If `l_block_attr` is not in the ltable columns.
        AssertionError: If `r_block_attr` is not in the rtable columns.
        AssertionError: If `l_output_attrs` are not in the ltable.
        AssertionError: If `r_output_attrs` are not in the rtable.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker
        >>> ab = DaskAttrEquivalenceBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> C1 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'])
        >>> # Include all possible tuple pairs with missing values
        >>> C2 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True)
    """
    logger.warning("WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. "
                   "USE AT YOUR OWN RISK.")

    # validate data types of input parameters
    self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                      r_output_attrs, l_output_prefix,
                                      r_output_prefix, verbose, 1)
    # the last arg is set to 1 just to reuse the function from the old blocker

    # validate data types of input blocking attributes
    self.validate_types_block_attrs(l_block_attr, r_block_attr)

    # validate data type of allow_missing
    self.validate_allow_missing(allow_missing)

    # validate input parameters
    self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)
    self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

    # validate number of ltable and rtable chunks
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')
    validate_chunks(n_ltable_chunks)
    validate_chunks(n_rtable_chunks)

    # get and validate required metadata
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # # get metadata
    l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
    cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)

    # do blocking

    # # do projection of required attributes from the tables
    l_proj_attrs = self.get_attrs_to_project(l_key, l_block_attr, l_output_attrs)
    ltable_proj = ltable[l_proj_attrs]
    r_proj_attrs = self.get_attrs_to_project(r_key, r_block_attr, r_output_attrs)
    rtable_proj = rtable[r_proj_attrs]

    # # remove records with nans in the blocking attribute
    l_df = rem_nan(ltable_proj, l_block_attr)
    r_df = rem_nan(rtable_proj, r_block_attr)

    # # determine the number of chunks
    n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable))
    n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable))

    if n_ltable_chunks == 1 and n_rtable_chunks == 1:
        # single process
        candset = _block_tables_split(l_df, r_df, l_key, r_key,
                                      l_block_attr, r_block_attr,
                                      l_output_attrs, r_output_attrs,
                                      l_output_prefix, r_output_prefix,
                                      allow_missing)
    else:
        l_splits = np.array_split(l_df, n_ltable_chunks)
        r_splits = np.array_split(r_df, n_rtable_chunks)
        c_splits = []
        for l in l_splits:
            for r in r_splits:
                partial_result = delayed(_block_tables_split)(
                    l, r, l_key, r_key, l_block_attr, r_block_attr,
                    l_output_attrs, r_output_attrs,
                    l_output_prefix, r_output_prefix, allow_missing)
                c_splits.append(partial_result)
        c_splits = delayed(wrap)(c_splits)
        c_splits = c_splits.compute(scheduler="processes",
                                    num_workers=get_num_cores())
        candset = pd.concat(c_splits, ignore_index=True)

    # if the allow_missing flag is True, then compute all pairs with a
    # missing value in the left table, and all pairs with a missing value
    # in the right table
    if allow_missing:
        missing_pairs = self.get_pairs_with_missing_value(
            ltable_proj, rtable_proj, l_key, r_key,
            l_block_attr, r_block_attr, l_output_attrs, r_output_attrs,
            l_output_prefix, r_output_prefix)
        candset = pd.concat([candset, missing_pairs], ignore_index=True)

    # update catalog
    key = get_name_for_key(candset.columns)
    candset = add_key_column(candset, key)
    cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                              r_output_prefix + r_key, ltable, rtable)

    # return candidate set
    return candset
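# ---------------------------------------------------------------------------
# Illustrative sketch: what attribute-equivalence blocking computes, reduced
# to a single pandas merge on the blocking attributes. The real
# _block_tables_split presumably performs this join per chunk pair; the
# result is the same set of equal-valued pairs.

import pandas as pd

_A = pd.DataFrame({'ID': [1, 2], 'zipcode': ['53703', '94043']})
_B = pd.DataFrame({'ID': [7, 8], 'zipcode': ['53703', '10001']})
_pairs = _A.merge(_B, on='zipcode', suffixes=('_l', '_r'))
assert list(zip(_pairs['ID_l'], _pairs['ID_r'])) == [(1, 7)]
# ---------------------------------------------------------------------------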
def block_candset(self, candset, l_block_attr, r_block_attr,
                  allow_missing=False, verbose=False, show_progress=True,
                  n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    Blocks an input candidate set of tuple pairs based on attribute
    equivalence.

    Finds tuple pairs from an input candidate set of tuple pairs such that
    the value of attribute l_block_attr of the left tuple in a tuple pair
    exactly matches the value of attribute r_block_attr of the right tuple
    in the tuple pair.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        l_block_attr (string): The blocking attribute in left table.
        r_block_attr (string): The blocking attribute in right table.
        allow_missing (boolean): A flag to indicate whether tuple pairs
            with missing value in at least one of the blocking attributes
            should be included in the output candidate set (defaults to
            False). If this flag is set to True, a tuple pair with missing
            value in either blocking attribute will be retained in the
            output candidate set.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).
        n_chunks (int): The number of partitions to split the candidate set.
            If it is set to -1, the number of partitions will be set to the
            number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `l_block_attr` is not of type string.
        AssertionError: If `r_block_attr` is not of type string.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `n_chunks` is not of type int.
        AssertionError: If `l_block_attr` is not in the ltable columns.
        AssertionError: If `r_block_attr` is not in the rtable columns.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker
        >>> ab = DaskAttrEquivalenceBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> C = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'])
        >>> D1 = ab.block_candset(C, 'age', 'age')
        >>> # Include all possible tuple pairs with missing values
        >>> D2 = ab.block_candset(C, 'age', 'age', allow_missing=True)
        >>> # Execute blocking using multiple cores
        >>> D3 = ab.block_candset(C, 'age', 'age', n_chunks=-1)
    """
    logger.warning("WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. "
                   "USE AT YOUR OWN RISK.")
    # validate data types of input parameters
    self.validate_types_params_candset(candset, verbose, show_progress, n_chunks)

    # validate data types of input blocking attributes
    self.validate_types_block_attrs(l_block_attr, r_block_attr)

    # get and validate metadata
    log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                     'fk rtable, ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # validate input parameters
    self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)

    # validate n_chunks parameter
    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    # do blocking

    # # do projection before merge
    l_df = ltable[[l_key, l_block_attr]]
    r_df = rtable[[r_key, r_block_attr]]

    # # set index for convenience
    l_df = l_df.set_index(l_key, drop=False)
    r_df = r_df.set_index(r_key, drop=False)

    # # determine the number of chunks to process in parallel
    n_chunks = get_num_partitions(n_chunks, len(candset))

    valid = []
    if n_chunks == 1:
        # single process
        valid = _block_candset_split(candset, l_df, r_df, l_key, r_key,
                                     l_block_attr, r_block_attr,
                                     fk_ltable, fk_rtable,
                                     allow_missing, show_progress)
    else:
        c_splits = np.array_split(candset, n_chunks)
        valid_splits = []
        for i in range(len(c_splits)):
            # set show_progress to False here; Dask diagnostics will be
            # used to display the progress bar instead
            partial_result = delayed(_block_candset_split)(
                c_splits[i], l_df, r_df, l_key, r_key,
                l_block_attr, r_block_attr, fk_ltable, fk_rtable,
                allow_missing, False)
            valid_splits.append(partial_result)

        valid_splits = delayed(wrap)(valid_splits)
        if show_progress:
            with ProgressBar():
                valid_splits = valid_splits.compute(scheduler="processes",
                                                    num_workers=get_num_cores())
        else:
            valid_splits = valid_splits.compute(scheduler="processes",
                                                num_workers=get_num_cores())
        valid = sum(valid_splits, [])

    # construct the output table
    if len(candset) > 0:
        out_table = candset[valid]
    else:
        out_table = pd.DataFrame(columns=candset.columns)

    # update the catalog
    cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                              ltable, rtable)

    # return the output table
    return out_table
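# ---------------------------------------------------------------------------
# Illustrative sketch: the allow_missing semantics of block_candset, boiled
# down to a per-pair predicate. A pair with a missing value in either
# blocking attribute is retained only when allow_missing=True; otherwise the
# two values must be equal.

import pandas as pd

def sketch_pair_survives(l_val, r_val, allow_missing):
    if pd.isna(l_val) or pd.isna(r_val):
        return allow_missing
    return l_val == r_val

assert sketch_pair_survives('53703', '53703', allow_missing=False)
assert not sketch_pair_survives('53703', None, allow_missing=False)
assert sketch_pair_survives('53703', None, allow_missing=True)
# ---------------------------------------------------------------------------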
def block_candset(self, candset, l_overlap_attr, r_overlap_attr,
                  rem_stop_words=False, q_val=None, word_level=True,
                  overlap_size=1, allow_missing=False,
                  verbose=False, show_progress=True, n_chunks=-1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    Blocks an input candidate set of tuple pairs based on the overlap of
    token sets of attribute values.

    Finds tuple pairs from an input candidate set of tuple pairs such that
    the overlap between (a) the set of tokens obtained by tokenizing the
    value of attribute l_overlap_attr of the left tuple in a tuple pair,
    and (b) the set of tokens obtained by tokenizing the value of attribute
    r_overlap_attr of the right tuple in the tuple pair, is above a certain
    threshold.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        l_overlap_attr (string): The overlap attribute in left table.
        r_overlap_attr (string): The overlap attribute in right table.
        rem_stop_words (boolean): A flag to indicate whether stop words
            (e.g., a, an, the) should be removed from the token sets of the
            overlap attribute values (defaults to False).
        q_val (int): The value of q to use if the overlap attributes
            values are to be tokenized as qgrams (defaults to None).
        word_level (boolean): A flag to indicate whether the overlap
            attributes should be tokenized as words (i.e., using whitespace
            as delimiter) (defaults to True).
        overlap_size (int): The minimum number of tokens that must overlap
            (defaults to 1).
        allow_missing (boolean): A flag to indicate whether tuple pairs
            with missing value in at least one of the blocking attributes
            should be included in the output candidate set (defaults to
            False). If this flag is set to True, a tuple pair with missing
            value in either blocking attribute will be retained in the
            output candidate set.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).
        n_chunks (int): The number of partitions to split the candidate set.
            If it is set to -1, the number of partitions will be set to the
            number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `l_overlap_attr` is not of type string.
        AssertionError: If `r_overlap_attr` is not of type string.
        AssertionError: If `q_val` is not of type int.
        AssertionError: If `word_level` is not of type boolean.
        AssertionError: If `overlap_size` is not of type int.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `allow_missing` is not of type boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_chunks` is not of type int.
        AssertionError: If `l_overlap_attr` is not in the ltable columns.
        AssertionError: If `r_overlap_attr` is not in the rtable columns.
        SyntaxError: If `q_val` is set to a valid value and `word_level` is
            set to True.
        SyntaxError: If `q_val` is set to None and `word_level` is set to
            False.
Examples: >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> ob = DaskOverlapBlocker() >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name']) >>> D1 = ob.block_candset(C, 'name', 'name', allow_missing=True) # Include all possible tuple pairs with missing values >>> D2 = ob.block_candset(C, 'name', 'name', n_chunks=-1) # Execute blocking using multiple cores >>> D3 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2) # Use q-gram tokenizer """ logger.warning( "WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN " "RISK.") # Validate input parameters self.validate_types_params_candset(candset, verbose, show_progress, n_chunks) self.validate_types_other_params(l_overlap_attr, r_overlap_attr, rem_stop_words, q_val, word_level, overlap_size) # get and validate metadata log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset( candset, logger, verbose) # # validate metadata cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # validate overlap attrs self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr) # validate word_level and q_val self.validate_word_level_qval(word_level, q_val) # validate number of chunks validate_object_type(n_chunks, int, 'Parameter n_chunks') validate_chunks(n_chunks) # # do projection before merge l_df = ltable[[l_key, l_overlap_attr]] r_df = rtable[[r_key, r_overlap_attr]] # # set index for convenience l_df = l_df.set_index(l_key, drop=False) r_df = r_df.set_index(r_key, drop=False) # # cast the overlap attributes to string if required l_df.is_copy, r_df.is_copy = False, False # to avoid SettingWithCopyWarning ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True) ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True) if word_level: tokenizer = WhitespaceTokenizer(return_set=True) else: tokenizer = QgramTokenizer(qval=q_val, return_set=True) n_chunks = get_num_partitions(n_chunks, len(candset)) c_splits = np.array_split(candset, n_chunks) valid_splits = [] # Create DAG for i in range(n_chunks): result = delayed(self._block_candset_split)(c_splits[i], l_df, r_df, l_key, r_key, l_overlap_attr, r_overlap_attr, fk_ltable, fk_rtable, allow_missing, rem_stop_words, tokenizer, overlap_size) valid_splits.append(result) valid_splits = delayed(wrap)(valid_splits) # Execute the DAG if show_progress: with ProgressBar(): valid_splits = valid_splits.compute(scheduler="processes", num_workers=get_num_cores()) else: valid_splits = valid_splits.compute(scheduler="processes", num_workers=get_num_cores()) valid = sum(valid_splits, []) # construct output table if len(candset) > 0: out_table = candset[valid] else: out_table = pd.DataFrame(columns=candset.columns) # update the catalog cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable, ltable, rtable) # return the output table return out_table
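# --- Illustrative sketch (assumptions flagged, not library code): how the
# word-level vs. q-gram tokenizer choice above turns into an overlap test for
# a single tuple pair. Lowercasing here is an assumption about the library's
# preprocessing; stop-word removal is omitted for brevity.
from py_stringmatching import QgramTokenizer, WhitespaceTokenizer


def pair_survives_overlap(l_val, r_val, word_level=True, q_val=2,
                          overlap_size=1):
    # return_set=True makes tokenize() return only the unique tokens
    if word_level:
        tok = WhitespaceTokenizer(return_set=True)
    else:
        tok = QgramTokenizer(qval=q_val, return_set=True)
    l_tokens = set(tok.tokenize(str(l_val).lower()))
    r_tokens = set(tok.tokenize(str(r_val).lower()))
    return len(l_tokens & r_tokens) >= overlap_size

# e.g. pair_survives_overlap('12 main st', '12 main street') -> True
# (the token sets share '12' and 'main')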
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', verbose=False, show_progress=True, n_ltable_chunks=1, n_rtable_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. Blocks two tables based on a black box blocking function specified by the user. Finds tuple pairs from left and right tables that survive the black box function. A tuple pair survives the black box blocking function if the function returns False for that pair, otherwise the tuple pair is dropped. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_ltable_chunks (int): The number of partitions to split the left table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. n_rtable_chunks (int): The number of partitions to split the right table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` are not of type string. AssertionError: If the values in `r_output_attrs` are not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_ltable_chunks` is not of type int. AssertionError: If `n_rtable_chunks` is not of type int. AssertionError: If `l_output_attrs` are not in the ltable. AssertionError: If `r_output_attrs` are not in the rtable. Examples: >>> def match_last_name(ltuple, rtuple): # assume that there is a 'name' attribute in the input tables # and each value in it has two words l_last_name = ltuple['name'].split()[1] r_last_name = rtuple['name'].split()[1] if l_last_name != r_last_name: return True else: return False >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_black_box_blocker import DaskBlackBoxBlocker >>> bb = DaskBlackBoxBlocker() >>> bb.set_black_box_function(match_last_name) >>> C = bb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name']) """ logger.warning( "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED.
USE AT YOUR OWN RISK.") # validate data types of standard input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, 1) # validate data type of show_progress self.validate_show_progress(show_progress) # validate black box function assert self.black_box_function != None, 'Black box function is not set' # validate output attributes self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # validate number of ltable and rtable chunks validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks') validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks') validate_chunks(n_ltable_chunks) validate_chunks(n_rtable_chunks) # # determine the number of chunks n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable)) n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable)) # do blocking # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # remove l_key from l_output_attrs and r_key from r_output_attrs l_output_attrs_1 = [] if l_output_attrs: l_output_attrs_1 = [x for x in l_output_attrs if x != l_key] r_output_attrs_1 = [] if r_output_attrs: r_output_attrs_1 = [x for x in r_output_attrs if x != r_key] # # pickle the black-box function before passing it as an arg to # # _block_tables_split to be executed by each child process black_box_function_pkl = cp.dumps(self.black_box_function) if n_ltable_chunks == 1 and n_rtable_chunks == 1: # single process candset = _block_tables_split(l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, black_box_function_pkl, show_progress) else: # multiprocessing l_splits = pd.np.array_split(l_df, n_ltable_chunks) r_splits = pd.np.array_split(r_df, n_rtable_chunks) c_splits = [] for i in range(len(l_splits)): for j in range(len(r_splits)): partial_result = delayed(_block_tables_split)(l_splits[i], r_splits[j], l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, black_box_function_pkl, False) c_splits.append(partial_result) c_splits = delayed(wrap)(c_splits) if show_progress: with ProgressBar(): c_splits = c_splits.compute(scheduler="processes", num_workers=get_num_cores()) else: c_splits = c_splits.compute(scheduler="processes", num_workers=get_num_cores()) candset = pd.concat(c_splits, ignore_index=True) # # determine the attributes to retain in the output candidate set retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix) if len(candset) > 0: candset = candset[retain_cols] else: candset = pd.DataFrame(columns=retain_cols) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset
def block_candset(self, candset, verbose=False, show_progress=True, n_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. Blocks an input candidate set of tuple pairs based on a black box blocking function specified by the user. Finds tuple pairs from an input candidate set of tuple pairs that survive the black box function. A tuple pair survives the black box blocking function if the function returns False for that pair, otherwise the tuple pair is dropped. Args: candset (DataFrame): The input candidate set of tuple pairs. verbose (boolean): A flag to indicate whether logging should be done (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_chunks (int): The number of partitions to split the candidate set. If it is set to -1, the number of partitions will be set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `verbose` is not of type boolean. AssertionError: If `n_chunks` is not of type int. AssertionError: If `show_progress` is not of type boolean. Examples: >>> def match_last_name(ltuple, rtuple): # assume that there is a 'name' attribute in the input tables # and each value in it has two words l_last_name = ltuple['name'].split()[1] r_last_name = rtuple['name'].split()[1] if l_last_name != r_last_name: return True else: return False >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_black_box_blocker import DaskBlackBoxBlocker >>> bb = DaskBlackBoxBlocker() >>> bb.set_black_box_function(match_last_name) >>> D = bb.block_candset(C) # C is an output from block_tables """ logger.warning( "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED.
USE AT YOUR OWN RISK.") # validate data types of standard input parameters self.validate_types_params_candset(candset, verbose, show_progress, n_chunks) # validate black box functionn assert self.black_box_function != None, 'Black box function is not set' # get and validate metadata log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset( candset, logger, verbose) # # validate metadata cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) validate_object_type(n_chunks, int, 'Parameter n_chunks') validate_chunks(n_chunks) # do blocking # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # project candset to keep only the ID attributes c_df = candset[[key, fk_ltable, fk_rtable]] # # determine the number of processes to launch parallely n_chunks = get_num_partitions(n_chunks, len(candset)) # # pickle the black-box function before passing it as an arg to # # _block_candset_split to be executed by each child process black_box_function_pkl = cp.dumps(self.black_box_function) valid = [] if n_chunks == 1: # single process valid = _block_candset_split(c_df, l_df, r_df, l_key, r_key, fk_ltable, fk_rtable, black_box_function_pkl, show_progress) else: # multiprocessing c_splits = pd.np.array_split(c_df, n_chunks) valid_splits = [] for i in range(len(c_splits)): partial_result = delayed(_block_candset_split)(c_splits[i], l_df, r_df, l_key, r_key, fk_ltable, fk_rtable, black_box_function_pkl, False) valid_splits.append(partial_result) valid_splits = delayed(wrap)(valid_splits) if show_progress: with ProgressBar(): valid_splits = valid_splits.compute(scheduler="processes", num_workers=get_num_cores()) else: valid_splits = valid_splits.compute(scheduler="processes", num_workers=get_num_cores()) valid = sum(valid_splits, []) # construct output table if len(c_df) > 0: c_df = candset[valid] else: c_df = pd.DataFrame(columns=candset.columns) # update catalog cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable, ltable, rtable) # return candidate set return c_df
def block_candset(self, candset, l_block_attr, r_block_attr, allow_missing=False, verbose=False, show_progress=True, n_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. Blocks an input candidate set of tuple pairs based on attribute equivalence. Finds tuple pairs from an input candidate set of tuple pairs such that the value of attribute l_block_attr of the left tuple in a tuple pair exactly matches the value of attribute r_block_attr of the right tuple in the tuple pair. Args: candset (DataFrame): The input candidate set of tuple pairs. l_block_attr (string): The blocking attribute in the left table. r_block_attr (string): The blocking attribute in the right table. allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple pair with missing value in either blocking attribute will be retained in the output candidate set. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_chunks (int): The number of partitions to split the candidate set. If it is set to -1, the number of partitions will be set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `l_block_attr` is not of type string. AssertionError: If `r_block_attr` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `n_chunks` is not of type int. AssertionError: If `l_block_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. Examples: >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker >>> ab = DaskAttrEquivalenceBlocker() >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> C = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name']) >>> D1 = ab.block_candset(C, 'age', 'age', allow_missing=True) # Include all possible tuple pairs with missing values >>> D2 = ab.block_candset(C, 'age', 'age', n_chunks=-1) # Execute blocking using multiple cores """ logger.warning("WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED.
USE AT YOUR OWN " "RISK.") # validate data types of input parameters self.validate_types_params_candset(candset, verbose, show_progress, n_chunks) # validate data types of input blocking attributes self.validate_types_block_attrs(l_block_attr, r_block_attr) # get and validate metadata log_info(logger, 'Required metadata: cand.set key, fk ltable, ' 'fk rtable, ltable, rtable, ltable key, rtable key', verbose) # # get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset( candset, logger, verbose) # # validate metadata cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # validate input parameters self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr) # validate n_chunks parameter validate_object_type(n_chunks, int, 'Parameter n_chunks') validate_chunks(n_chunks) # do blocking # # do projection before merge l_df = ltable[[l_key, l_block_attr]] r_df = rtable[[r_key, r_block_attr]] # # set index for convenience l_df = l_df.set_index(l_key, drop=False) r_df = r_df.set_index(r_key, drop=False) # # determine the number of processes to launch in parallel n_chunks = get_num_partitions(n_chunks, len(candset)) valid = [] if n_chunks == 1: # single process valid = _block_candset_split(candset, l_df, r_df, l_key, r_key, l_block_attr, r_block_attr, fk_ltable, fk_rtable, allow_missing, show_progress) else: c_splits = np.array_split(candset, n_chunks) valid_splits = [] for i in range(len(c_splits)): partial_result = delayed(_block_candset_split)(c_splits[i], l_df, r_df, l_key, r_key, l_block_attr, r_block_attr, fk_ltable, fk_rtable, allow_missing, False) # set show_progress to False as we will use Dask diagnostics to display the progress bar valid_splits.append(partial_result) valid_splits = delayed(wrap)(valid_splits) if show_progress: with ProgressBar(): valid_splits = valid_splits.compute(scheduler="processes", num_workers=get_num_cores()) else: valid_splits = valid_splits.compute(scheduler="processes", num_workers=get_num_cores()) valid = sum(valid_splits, []) # construct output table if len(candset) > 0: out_table = candset[valid] else: out_table = pd.DataFrame(columns=candset.columns) # update the catalog cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable, ltable, rtable) # return the output table return out_table
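# --- Illustrative sketch (assumed semantics, mirroring the docstring above):
# the per-pair test that _block_candset_split is expected to apply for
# attribute equivalence. A pair survives when the two blocking-attribute
# values match exactly; with allow_missing=True, a missing value on either
# side also lets the pair through.
import pandas as pd


def pair_survives_equiv(l_val, r_val, allow_missing=False):
    if pd.isnull(l_val) or pd.isnull(r_val):
        return allow_missing
    return l_val == r_val

# e.g. pair_survives_equiv(53706, 53706) -> True
#      pair_survives_equiv(None, 53706, allow_missing=True) -> True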
def block_tables(self, ltable, rtable, l_block_attr, r_block_attr, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', allow_missing=False, verbose=False, n_ltable_chunks=1, n_rtable_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. Blocks two tables based on attribute equivalence. Conceptually, this will check `l_block_attr=r_block_attr` for each tuple pair from the Cartesian product of tables `ltable` and `rtable`. It outputs a pandas DataFrame with the tuple pairs that satisfy the equality condition. The DataFrame will include the attribute '_id', the key attribute from ltable, the key attribute from rtable, followed by the attributes in `l_output_attrs` and `r_output_attrs` if they are specified. Each of these output and key attributes will be prefixed with the given `l_output_prefix` and `r_output_prefix`. If `allow_missing` is set to `True`, then all tuple pairs with a missing value in at least one of the blocking attributes will be included in the output DataFrame. Further, this will update the following metadata in the catalog for the output table: (1) key, (2) ltable, (3) rtable, (4) fk_ltable, and (5) fk_rtable. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_block_attr (string): The blocking attribute in the left table. r_block_attr (string): The blocking attribute in the right table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple in ltable with missing value in the blocking attribute will be matched with every tuple in rtable and vice versa. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). n_ltable_chunks (int): The number of partitions to split the left table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. n_rtable_chunks (int): The number of partitions to split the right table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_block_attr` is not of type string. AssertionError: If `r_block_attr` is not of type string. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` are not of type string. AssertionError: If the values in `r_output_attrs` are not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean.
AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `n_ltable_chunks` is not of type int. AssertionError: If `n_rtable_chunks` is not of type int. AssertionError: If `l_block_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. AssertionError: If `l_output_attrs` are not in the ltable. AssertionError: If `r_output_attrs` are not in the rtable. Examples: >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker >>> ab = DaskAttrEquivalenceBlocker() >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> C1 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name']) >>> C2 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True) # Include all possible tuple pairs with missing values """ logger.warning("WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR " "OWN RISK.") # validate data types of input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, 1) # last arg is set to 1 just to reuse the function from the old blocker # validate data types of input blocking attributes self.validate_types_block_attrs(l_block_attr, r_block_attr) # validate data type of allow_missing self.validate_allow_missing(allow_missing) # validate input parameters self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr) self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # validate number of ltable and rtable chunks validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks') validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks') validate_chunks(n_ltable_chunks) validate_chunks(n_rtable_chunks) # get and validate required metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # do blocking # # do projection of required attributes from the tables l_proj_attrs = self.get_attrs_to_project(l_key, l_block_attr, l_output_attrs) ltable_proj = ltable[l_proj_attrs] r_proj_attrs = self.get_attrs_to_project(r_key, r_block_attr, r_output_attrs) rtable_proj = rtable[r_proj_attrs] # # remove records with nans in the blocking attribute l_df = rem_nan(ltable_proj, l_block_attr) r_df = rem_nan(rtable_proj, r_block_attr) # # determine the number of chunks n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable)) n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable)) if n_ltable_chunks == 1 and n_rtable_chunks == 1: # single process candset = _block_tables_split(l_df, r_df, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, allow_missing) else: l_splits = np.array_split(l_df, n_ltable_chunks) r_splits = np.array_split(r_df, n_rtable_chunks) c_splits = [] for l in l_splits: for r in r_splits: partial_result = delayed(_block_tables_split)(l, r, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, allow_missing) c_splits.append(partial_result)
c_splits = delayed(wrap)(c_splits) c_splits = c_splits.compute(scheduler="processes", num_workers=get_num_cores()) candset = pd.concat(c_splits, ignore_index=True) # if the allow_missing flag is True, then compute all pairs with missing value in the left table and all pairs with missing value in the right table if allow_missing: missing_pairs = self.get_pairs_with_missing_value(ltable_proj, rtable_proj, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix) candset = pd.concat([candset, missing_pairs], ignore_index=True) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset
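# --- Illustrative sketch (hypothetical helper, not the library's actual
# _block_tables_split): attribute-equivalence blocking over one pair of
# chunks is, at its core, an inner equi-join on the blocking attributes.
import pandas as pd


def equiv_block_chunk(l_df, r_df, l_key, r_key, l_block_attr, r_block_attr,
                      l_prefix='ltable_', r_prefix='rtable_'):
    # project to the key and blocking attribute, then prefix to avoid clashes
    l_proj = l_df[[l_key, l_block_attr]].add_prefix(l_prefix)
    r_proj = r_df[[r_key, r_block_attr]].add_prefix(r_prefix)
    # an inner join keeps exactly the pairs whose blocking values match
    return l_proj.merge(r_proj,
                        left_on=l_prefix + l_block_attr,
                        right_on=r_prefix + r_block_attr)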
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', verbose=False, show_progress=True, n_ltable_chunks=1, n_rtable_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. Blocks two tables based on the sequence of rules supplied by the user. Finds tuple pairs from left and right tables that survive the sequence of blocking rules. A tuple pair survives the sequence of blocking rules if none of the rules in the sequence returns True for that pair. If any of the rules returns True, then the pair is blocked. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_ltable_chunks (int): The number of partitions to split the left table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. n_rtable_chunks (int): The number of partitions to split the right table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived the sequence of blocking rules (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` are not of type string. AssertionError: If the values in `r_output_attrs` are not of type string. AssertionError: If the input `l_output_prefix` is not of type string. AssertionError: If the input `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_ltable_chunks` is not of type int. AssertionError: If `n_rtable_chunks` is not of type int. AssertionError: If `l_output_attrs` are not in the ltable. AssertionError: If `r_output_attrs` are not in the rtable. AssertionError: If there are no rules to apply. Examples: >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_rule_based_blocker import DaskRuleBasedBlocker >>> rb = DaskRuleBasedBlocker() >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id') >>> block_f = em.get_features_for_blocking(A, B) >>> rule = ['name_name_lev(ltuple, rtuple) > 3'] >>> rb.add_rule(rule, feature_table=block_f) >>> C = rb.block_tables(A, B) """ logger.warning( "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED.
USE AT YOUR OWN RISK.") # validate data types of input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, 1) # validate data type of show_progress self.validate_show_progress(show_progress) # validate input parameters self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # validate rules assert len(self.rules.keys()) > 0, 'There are no rules to apply' # validate number of ltable and rtable chunks validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks') validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks') validate_chunks(n_ltable_chunks) validate_chunks(n_rtable_chunks) # # determine the number of chunks n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable)) n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable)) # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # remove l_key from l_output_attrs and r_key from r_output_attrs l_output_attrs_1 = [] if l_output_attrs: l_output_attrs_1 = [x for x in l_output_attrs if x != l_key] r_output_attrs_1 = [] if r_output_attrs: r_output_attrs_1 = [x for x in r_output_attrs if x != r_key] # # get attributes to project l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(l_key, r_key, l_output_attrs_1, r_output_attrs_1) l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs] candset, rule_applied = self.block_tables_with_filters(l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, verbose, show_progress, get_num_cores()) # pass number of splits as # the number of cores in the machine if candset is None: # no filterable rule was applied candset = self.block_tables_without_filters(l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, verbose, show_progress, n_ltable_chunks, n_rtable_chunks) elif len(self.rules) > 1: # one filterable rule was applied but other rules are left # block candset by applying other rules and excluding the applied rule candset = self.block_candset_excluding_rule(candset, l_df, r_df, l_key, r_key, l_output_prefix + l_key, r_output_prefix + r_key, rule_applied, show_progress, get_num_cores()) retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix) if len(candset) > 0: candset = candset[retain_cols] else: candset = pd.DataFrame(columns=retain_cols) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset