def get_metadata_for_candset(candset, lgr, verbose):
    """
    Gets metadata for the candset.
    """
    # Validate input parameters
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input candset is not of type pandas data frame')
        raise AssertionError('Input candset is not of type pandas data frame')

    ch.log_info(lgr, 'Getting metadata from the catalog', verbose)

    # Get the key, the foreign keys, ltable, rtable and their keys
    # # Get the key
    key = get_key(candset)
    # # Get the foreign keys
    fk_ltable = get_fk_ltable(candset)
    fk_rtable = get_fk_rtable(candset)
    # # Get the base tables
    ltable = get_ltable(candset)
    rtable = get_rtable(candset)
    # # Get the base table keys
    l_key = get_key(ltable)
    r_key = get_key(rtable)

    ch.log_info(lgr, '..... Done', verbose)

    # Return the metadata
    return key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key
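# Illustrative sketch: how the catalog metadata consumed by
# get_metadata_for_candset is typically registered. A minimal, hedged example
# with made-up tables and column names; it assumes the usual top-level catalog
# helpers (em.set_key, em.set_ltable, em.set_rtable, em.set_fk_ltable,
# em.set_fk_rtable) are available.
def _sketch_register_candset_metadata():
    import pandas as pd
    import py_entitymatching as em

    # Two tiny base tables, each with a key column registered in the catalog
    A = pd.DataFrame({'a_id': [1, 2], 'name': ['apple', 'apricot']})
    B = pd.DataFrame({'b_id': [10, 20], 'name': ['apple', 'banana']})
    em.set_key(A, 'a_id')
    em.set_key(B, 'b_id')

    # A candidate set pairing tuples from A and B, with its own key and
    # foreign keys pointing back to the base tables
    C = pd.DataFrame({'_id': [0, 1],
                      'ltable_a_id': [1, 2],
                      'rtable_b_id': [10, 10]})
    em.set_key(C, '_id')
    em.set_ltable(C, A)
    em.set_rtable(C, B)
    em.set_fk_ltable(C, 'ltable_a_id')
    em.set_fk_rtable(C, 'rtable_b_id')

    # get_metadata_for_candset(C, logger, False) would now return
    # ('_id', 'ltable_a_id', 'rtable_b_id', A, B, 'a_id', 'b_id')
    return C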
def _validate_metadata_for_table(table, key, output_string, lgr, verbose):
    """
    Validates metadata for a table (DataFrame).
    """
    # Validate input parameters
    # # We expect the input table to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input object is not of type pandas DataFrame')
        raise AssertionError('Input object is not of type pandas DataFrame')

    # Check if the key column is present in the table
    if not ch.check_attrs_present(table, key):
        logger.error('Input key ( %s ) not in the DataFrame' % key)
        raise KeyError('Input key ( %s ) not in the DataFrame' % key)

    # Validate the key
    ch.log_info(lgr, 'Validating ' + output_string + ' key: ' + str(key),
                verbose)
    # # We expect the key to be of type string
    if not isinstance(key, six.string_types):
        logger.error('Key attribute must be of type string')
        raise AssertionError('Key attribute must be of type string')
    if not ch.is_key_attribute(table, key, verbose):
        logger.error('Attribute %s in the %s table does not '
                     'qualify to be the key' % (str(key), output_string))
        raise AssertionError('Attribute %s in the %s table does not '
                             'qualify to be the key'
                             % (str(key), output_string))

    ch.log_info(lgr, '..... Done', verbose)
    return True
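# Illustrative sketch of the intuition behind ch.is_key_attribute: a column
# qualifies as a key roughly when it has no missing values and no duplicates.
# A plain-pandas approximation, not the library's actual check.
def _sketch_is_key_like(table, column):
    col = table[column]
    return bool(col.notnull().all() and col.is_unique)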
def get_false_negatives_as_df(table, eval_summary, verbose=False):
    """
    Select only the false negatives from the input table and return them as a
    DataFrame based on the evaluation results.

    Args:
        table (DataFrame): The input table (pandas DataFrame) that was used
            for evaluation.
        eval_summary (dictionary): A Python dictionary containing evaluation
            results, typically from the 'eval_matches' command.

    Returns:
        A pandas DataFrame containing only the false negatives from
        the input table.

        Further, this function sets the output DataFrame's properties same as
        the input DataFrame.
    """
    # Validate input parameters
    # # We expect the input candset to be of type pandas DataFrame.
    if not isinstance(table, pd.DataFrame):
        logger.error('Input cand.set is not of type dataframe')
        raise AssertionError('Input cand.set is not of type dataframe')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from the catalog', verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    data_frame = _get_dataframe(table, eval_summary['false_neg_ls'])

    # # Update the catalog
    ch.log_info(logger, 'Updating catalog', verbose)
    cm.init_properties(data_frame)
    cm.copy_properties(table, data_frame)

    # # Return the DataFrame
    ch.log_info(logger, 'Returning the dataframe', verbose)
    return data_frame
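# Illustrative sketch of the selection performed by _get_dataframe: given a
# list of (fk_ltable value, fk_rtable value) pairs, such as
# eval_summary['false_neg_ls'], keep only the candset rows whose foreign keys
# match one of those pairs. Plain pandas; the pair format and the default
# column names below are assumptions made for illustration.
def _sketch_select_pairs(candset, pairs,
                         fk_ltable='ltable_id', fk_rtable='rtable_id'):
    wanted = set((left, right) for left, right in pairs)
    mask = [(left, right) in wanted
            for left, right in zip(candset[fk_ltable], candset[fk_rtable])]
    return candset[mask]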
def _validate_inputs(table, label_column_name, verbose):
    """
    This function validates the inputs for the label_table function.
    """
    # Validate the input parameters
    # # The input table is expected to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input object is not of type data frame')
        raise AssertionError('Input object is not of type data frame')

    # # The label column name is expected to be of type string
    if not isinstance(label_column_name, six.string_types):
        logger.error('Input attr. is not of type string')
        raise AssertionError('Input attr. is not of type string')

    # # Check if the label column name is already present in the input table
    if ch.check_attrs_present(table, label_column_name):
        logger.error('The label column name (%s) is already present in the '
                     'input table', label_column_name)
        raise AssertionError('The label column name (%s) is already present '
                             'in the input table' % label_column_name)

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Return True if everything was successful
    return True
def _validate_inputs(table, label_column_name, verbose):
    """
    This function validates the inputs for the label_table function.
    """
    # Validate the input parameters
    # # The input table is expected to be of type pandas DataFrame
    validate_object_type(table, pd.DataFrame)

    # # The label column name is expected to be of type string
    validate_object_type(label_column_name, six.string_types,
                         error_prefix='Input attr.')

    # # Check if the label column name is already present in the input table
    if ch.check_attrs_present(table, label_column_name):
        logger.error('The label column name (%s) is already present in the '
                     'input table', label_column_name)
        raise AssertionError('The label column name (%s) is already present '
                             'in the input table' % label_column_name)

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Return True if everything was successful
    return True
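# Illustrative sketch of what a type-validation helper in the spirit of
# validate_object_type amounts to: log an error and raise an AssertionError
# when the object is not of the expected type. A simplified stand-in, not the
# library's actual implementation.
def _sketch_validate_object_type(obj, expected_type,
                                 error_prefix='Input object'):
    if not isinstance(obj, expected_type):
        message = '%s is not of type %s' % (error_prefix, expected_type)
        logger.error(message)
        raise AssertionError(message)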
def extract_from(self, candset):
    # Get metadata for the candidate set
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, self.verbose)

    # Set the index on the base tables for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # Apply the feature functions
    ch.log_info(logger, 'Applying feature functions', self.verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    n_procs = get_num_procs(self.n_jobs, len(candset))

    c_splits = np.array_split(candset, n_procs)
    pickled_obj = cloudpickle.dumps(self.feature_table)

    feat_vals_by_splits = Parallel(n_jobs=n_procs)(
        delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df,
            c_split, self.show_progress and i == len(c_splits) - 1)
        for i, c_split in enumerate(c_splits))

    feat_vals = sum(feat_vals_by_splits, [])
    return feat_vals
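# Illustrative sketch of the split/apply pattern used by extract_from:
# partition a DataFrame with numpy, process each chunk in a separate joblib
# worker, and flatten the per-chunk lists. Self-contained toy example; the
# chunk worker here just doubles a column instead of computing features.
def _sketch_parallel_over_chunks():
    import numpy as np
    import pandas as pd
    from joblib import Parallel, delayed

    df = pd.DataFrame({'x': range(10)})
    chunks = np.array_split(df, 3)

    def work(chunk):
        # Each worker returns a list of per-row results
        return [int(value) * 2 for value in chunk['x']]

    results = Parallel(n_jobs=2)(delayed(work)(chunk) for chunk in chunks)
    # Same flattening idiom as in extract_from
    return sum(results, [])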
def get_keys_for_ltable_rtable(ltable, rtable, lgr, verbose):
    """
    Gets keys for the ltable and rtable.
    """
    # We expect the ltable to be of type pandas DataFrame
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input ltable is not of type pandas data frame')
        raise AssertionError('Input ltable is not of type pandas data frame')

    # We expect the rtable to be of type pandas DataFrame
    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input rtable is not of type pandas data frame')
        raise AssertionError('Input rtable is not of type pandas data frame')

    ch.log_info(lgr, 'Required metadata: ltable key, rtable key', verbose)
    ch.log_info(lgr, 'Getting metadata from the catalog', verbose)

    # Get the ltable key and rtable key from the catalog
    ltable_key = get_key(ltable)
    rtable_key = get_key(rtable)

    ch.log_info(lgr, '..... Done', verbose)

    # Return the ltable and rtable keys
    return ltable_key, rtable_key
def split_train_test(labeled_data, train_proportion=0.5, random_state=None,
                     verbose=True):
    """
    This function splits the input data into train and test.

    Specifically, this function is just a wrapper of scikit-learn's
    train_test_split function.

    This function also takes care of copying the metadata from the input
    table to the train and test splits.

    Args:
        labeled_data (DataFrame): The input pandas DataFrame that needs to be
            split into train and test.
        train_proportion (float): A number between 0 and 1, indicating the
            proportion of tuples that should be included in the train split
            (defaults to 0.5).
        random_state (object): A number or a random number generator object
            (as in scikit-learn) (defaults to None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to True).

    Returns:
        A Python dictionary containing two keys - train and test.

        The value for the key 'train' is a pandas DataFrame containing tuples
        allocated from the input table based on train_proportion.

        Similarly, the value for the key 'test' is a pandas DataFrame
        containing the remaining tuples, intended for evaluation.

        This function sets the output DataFrames (train, test) properties
        same as the input DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # G is the labeled data or the feature vectors that should be split
        >>> train_test = em.split_train_test(G, train_proportion=0.5)
        >>> train, test = train_test['train'], train_test['test']

    """
    # Validate input parameters
    # # We expect the labeled data to be of type pandas DataFrame
    if not isinstance(labeled_data, pd.DataFrame):
        logger.error('Input table is not of type DataFrame')
        raise AssertionError('Input table is not of type DataFrame')

    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # # Get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(labeled_data, logger, verbose)

    # # Validate metadata
    cm._validate_metadata_for_candset(labeled_data, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    num_rows = len(labeled_data)
    # We expect the train proportion to be between 0 and 1
    assert 0 <= train_proportion <= 1, \
        'Train proportion is expected to be between 0 and 1'
    # We expect the input table to be non-empty
    assert num_rows > 0, 'The input table is empty'

    # Explicitly get the train and test sizes in terms of tuples (based on
    # the given proportion)
    train_size = int(math.floor(num_rows * train_proportion))
    test_size = int(num_rows - train_size)

    # Use scikit-learn to split the data
    idx_values = pd.np.array(labeled_data.index.values)
    idx_train, idx_test = ms.train_test_split(idx_values, test_size=test_size,
                                              train_size=train_size,
                                              random_state=random_state)

    # Construct the output tables
    label_train = labeled_data.loc[idx_train]
    label_test = labeled_data.loc[idx_test]

    # Update the catalog
    cm.init_properties(label_train)
    cm.copy_properties(labeled_data, label_train)

    cm.init_properties(label_test)
    cm.copy_properties(labeled_data, label_test)

    # Return the output tables
    result = OrderedDict()
    result['train'] = label_train
    result['test'] = label_test

    # Finally, return the dictionary
    return result
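# Illustrative sketch of the core split performed by split_train_test: pick
# train/test index labels with scikit-learn and select the rows by label.
# Self-contained toy example; the catalog/metadata copying is omitted.
def _sketch_split_train_test():
    import pandas as pd
    from sklearn import model_selection as ms

    data = pd.DataFrame({'_id': range(10), 'label': [0, 1] * 5})
    idx_values = data.index.values
    idx_train, idx_test = ms.train_test_split(idx_values, train_size=7,
                                              test_size=3, random_state=0)
    return {'train': data.loc[idx_train], 'test': data.loc[idx_test]}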
def _validate_metadata_for_candset(candset, key, foreign_key_ltable,
                                   foreign_key_rtable, ltable, rtable,
                                   ltable_key, rtable_key, lgr, verbose):
    """
    Validates metadata for a candidate set.
    """
    # Validate input parameters
    # # We expect candset to be of type pandas DataFrame
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input candset is not of type pandas DataFrame')
        raise AssertionError('Input candset is not of type pandas DataFrame')

    # Check if the key column is present in the candset
    if not ch.check_attrs_present(candset, key):
        logger.error('Input key ( %s ) not in the DataFrame' % key)
        raise KeyError('Input key ( %s ) not in the DataFrame' % key)

    # Check if the foreign key ltable column is present in the candset
    if not ch.check_attrs_present(candset, foreign_key_ltable):
        logger.error('Input foreign_key_ltable ( %s ) not in the DataFrame'
                     % foreign_key_ltable)
        raise KeyError('Input foreign_key_ltable ( %s ) not in the DataFrame'
                       % foreign_key_ltable)

    # Check if the foreign key rtable column is present in the candset
    if not ch.check_attrs_present(candset, foreign_key_rtable):
        logger.error('Input fk_rtable ( %s ) not in the DataFrame'
                     % foreign_key_rtable)
        raise KeyError('Input fk_rtable ( %s ) not in the DataFrame'
                       % foreign_key_rtable)

    # We expect the ltable to be of type pandas DataFrame
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input ltable is not of type pandas data frame')
        raise AssertionError('Input ltable is not of type pandas data frame')

    # We expect the rtable to be of type pandas DataFrame
    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input rtable is not of type pandas data frame')
        raise AssertionError('Input rtable is not of type pandas data frame')

    # We expect the ltable key to be present in the ltable
    if not ch.check_attrs_present(ltable, ltable_key):
        logger.error('ltable key ( %s ) not in ltable' % ltable_key)
        raise KeyError('ltable key ( %s ) not in ltable' % ltable_key)

    # We expect the rtable key to be present in the rtable
    if not ch.check_attrs_present(rtable, rtable_key):
        logger.error('rtable key ( %s ) not in rtable' % rtable_key)
        raise KeyError('rtable key ( %s ) not in rtable' % rtable_key)

    # First, validate the metadata for the candidate set (as a table)
    _validate_metadata_for_table(candset, key, 'candset', lgr, verbose)

    # Second, check the foreign key constraints
    ch.log_info(lgr, 'Validating foreign key constraint for left table',
                verbose)
    if not ch.check_fk_constraint(candset, foreign_key_ltable,
                                  ltable, ltable_key):
        logger.error('Candset does not satisfy foreign key constraint with '
                     'the left table')
        raise AssertionError('Candset does not satisfy foreign key '
                             'constraint with the left table')
    ch.log_info(lgr, '..... Done', verbose)

    ch.log_info(lgr, 'Validating foreign key constraint for right table',
                verbose)
    if not ch.check_fk_constraint(candset, foreign_key_rtable,
                                  rtable, rtable_key):
        logger.error('Candset does not satisfy foreign key constraint with '
                     'the right table')
        raise AssertionError('Candset does not satisfy foreign key '
                             'constraint with the right table')
    ch.log_info(lgr, '..... Done', verbose)

    return True
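# Illustrative sketch of the foreign key constraint checked above: every value
# in the candset's foreign key column must be non-null and must appear in the
# key column of the corresponding base table. A plain-pandas approximation of
# ch.check_fk_constraint, not the library's actual implementation.
def _sketch_check_fk_constraint(candset, fk_column, base_table, base_key):
    fk_values = candset[fk_column]
    if fk_values.isnull().any():
        return False
    return bool(fk_values.isin(base_table[base_key]).all())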
def block_candset(self, candset, verbose=False, show_progress=True,
                  n_jobs=1):
    """
    Blocks an input candidate set of tuple pairs based on a sequence of
    blocking rules supplied by the user.

    Finds tuple pairs from an input candidate set of tuple pairs that
    survive the sequence of blocking rules. A tuple pair survives the
    sequence of blocking rules if none of the rules in the sequence returns
    True for that pair. If any of the rules returns True, then the pair is
    blocked (dropped).

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).
        n_jobs (int): The number of parallel jobs to be used for computation
            (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel
            computation is used at all, which is useful for debugging. For
            n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is
            the total number of CPUs in the machine). Thus, for n_jobs = -2,
            all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than
            1, then no parallel computation is used (i.e., equivalent to the
            default).

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `n_jobs` is not of type int.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If there are no rules to apply.

    Examples:
        >>> import py_entitymatching as em
        >>> rb = em.RuleBasedBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> rule = ['name_name_lev(ltuple, rtuple) > 3']
        >>> rb.add_rule(rule, feature_table=block_f)
        >>> D = rb.block_candset(C) # C is the candidate set.
    """
    # validate data types of input parameters
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_jobs)

    # get and validate metadata
    log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                     'fk rtable, ltable, rtable, ltable key, rtable key',
             verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # validate rules
    assert len(self.rules.keys()) > 0, 'There are no rules to apply'

    # do blocking
    # # initialize the progress bar
    if show_progress:
        bar = pyprind.ProgBar(len(candset))

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # get attributes to project
    l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(l_key, r_key,
                                                           [], [])
    l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs]

    c_df = self.block_candset_excluding_rule(candset, l_df, r_df,
                                             l_key, r_key,
                                             fk_ltable, fk_rtable, None,
                                             show_progress, n_jobs)

    # update catalog
    cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable, ltable,
                              rtable)

    # return candidate set
    return c_df
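# Illustrative sketch of the rule semantics applied by block_candset: each rule
# acts as a predicate over an (ltuple, rtuple) pair, and a pair survives only
# when no rule fires. A plain-Python illustration of the semantics, not the
# blocker's internal representation (which compiles rule strings against a
# feature table).
def _sketch_apply_rules(pairs, rules):
    # pairs: iterable of (ltuple, rtuple); rules: callables returning True
    # when the pair should be blocked (dropped).
    return [(ltuple, rtuple) for ltuple, rtuple in pairs
            if not any(rule(ltuple, rtuple) for rule in rules)]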
def dask_extract_feature_vecs(candset, attrs_before=None, feature_table=None, attrs_after=None, verbose=False, show_progress=True, n_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK This function extracts feature vectors from a DataFrame (typically a labeled candidate set). Specifically, this function uses feature table, ltable and rtable (that is present in the `candset`'s metadata) to extract feature vectors. Args: candset (DataFrame): The input candidate set for which the features vectors should be extracted. attrs_before (list): The list of attributes from the input candset, that should be added before the feature vectors (defaults to None). feature_table (DataFrame): A DataFrame containing a list of features that should be used to compute the feature vectors ( defaults to None). attrs_after (list): The list of attributes from the input candset that should be added after the feature vectors (defaults to None). verbose (boolean): A flag to indicate whether the debug information should be displayed (defaults to False). show_progress (boolean): A flag to indicate whether the progress of extracting feature vectors must be displayed (defaults to True). n_chunks (int): The number of partitions to split the candidate set. If it is set to -1, the number of partitions will be set to the number of cores in the machine. Returns: A pandas DataFrame containing feature vectors. The DataFrame will have metadata ltable and rtable, pointing to the same ltable and rtable as the input candset. Also, the output DataFrame will have three columns: key, foreign key ltable, foreign key rtable copied from input candset to the output DataFrame. These three columns precede the columns mentioned in `attrs_before`. Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `attrs_before` has attributes that are not present in the input candset. AssertionError: If `attrs_after` has attribtues that are not present in the input candset. AssertionError: If `feature_table` is set to None. AssertionError: If `n_chunks` is not of type int. Examples: >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_extract_features import dask_extract_feature_vecs >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> match_f = em.get_features_for_matching(A, B) >>> # G is the labeled dataframe which should be converted into feature vectors >>> H = dask_extract_feature_vecs(G, features=match_f, attrs_before=['title'], attrs_after=['gold_labels']) """ logger.warning( "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK." ) # Validate input parameters # # We expect the input candset to be of type pandas DataFrame. 
validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set') # # If the attrs_before is given, Check if the attrs_before are present in # the input candset if attrs_before != None: if not ch.check_attrs_present(candset, attrs_before): logger.error( 'The attributes mentioned in attrs_before is not present ' 'in the input table') raise AssertionError( 'The attributes mentioned in attrs_before is not present ' 'in the input table') # # If the attrs_after is given, Check if the attrs_after are present in # the input candset if attrs_after != None: if not ch.check_attrs_present(candset, attrs_after): logger.error( 'The attributes mentioned in attrs_after is not present ' 'in the input table') raise AssertionError( 'The attributes mentioned in attrs_after is not present ' 'in the input table') # We expect the feature table to be a valid object if feature_table is None: logger.error('Feature table cannot be null') raise AssertionError('The feature table cannot be null') # Do metadata checking # # Mention what metadata is required to the user ch.log_info( logger, 'Required metadata: cand.set key, fk ltable, ' 'fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # Get metadata ch.log_info(logger, 'Getting metadata from catalog', verbose) key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \ cm.get_metadata_for_candset( candset, logger, verbose) # # Validate metadata ch.log_info(logger, 'Validating metadata', verbose) cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # Extract features # id_list = [(row[fk_ltable], row[fk_rtable]) for i, row in # candset.iterrows()] # id_list = [tuple(tup) for tup in candset[[fk_ltable, fk_rtable]].values] # # Set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # Apply feature functions ch.log_info(logger, 'Applying feature functions', verbose) col_names = list(candset.columns) fk_ltable_idx = col_names.index(fk_ltable) fk_rtable_idx = col_names.index(fk_rtable) validate_object_type(n_chunks, int, 'Parameter n_chunks') validate_chunks(n_chunks) n_chunks = get_num_partitions(n_chunks, len(candset)) c_splits = np.array_split(candset, n_chunks) pickled_obj = cloudpickle.dumps(feature_table) feat_vals_by_splits = [] for i in range(len(c_splits)): partial_result = delayed(get_feature_vals_by_cand_split)( pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df, c_splits[i], False) feat_vals_by_splits.append(partial_result) feat_vals_by_splits = delayed(wrap)(feat_vals_by_splits) if show_progress: with ProgressBar(): feat_vals_by_splits = feat_vals_by_splits.compute( scheduler="processes", num_workers=get_num_cores()) else: feat_vals_by_splits = feat_vals_by_splits.compute( scheduler="processes", num_workers=get_num_cores()) feat_vals = sum(feat_vals_by_splits, []) # Construct output table feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values) # # Rearrange the feature names in the input feature table order feature_names = list(feature_table['feature_name']) feature_vectors = feature_vectors[feature_names] ch.log_info(logger, 'Constructing output table', verbose) # print(feature_vectors) # # Insert attrs_before if attrs_before: if not isinstance(attrs_before, list): attrs_before = [attrs_before] attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable]) attrs_before.reverse() for a in attrs_before: feature_vectors.insert(0, a, candset[a]) # # Insert keys feature_vectors.insert(0, 
fk_rtable, candset[fk_rtable]) feature_vectors.insert(0, fk_ltable, candset[fk_ltable]) feature_vectors.insert(0, key, candset[key]) # # insert attrs after if attrs_after: if not isinstance(attrs_after, list): attrs_after = [attrs_after] attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable]) attrs_after.reverse() col_pos = len(feature_vectors.columns) for a in attrs_after: feature_vectors.insert(col_pos, a, candset[a]) col_pos += 1 # Reset the index # feature_vectors.reset_index(inplace=True, drop=True) # # Update the catalog cm.init_properties(feature_vectors) cm.copy_properties(candset, feature_vectors) # Finally, return the feature vectors return feature_vectors
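# Illustrative sketch of what feature-vector extraction boils down to: for each
# candset row, look up the ltable/rtable tuples through the foreign keys and
# apply every feature function to the pair. It assumes the feature table
# carries one callable per feature in a 'function' column (alongside
# 'feature_name'), and that l_df/r_df are indexed by their keys as in the code
# above; all names are for illustration only.
def _sketch_extract_feature_vecs(candset, feature_table, l_df, r_df,
                                 fk_ltable, fk_rtable):
    import pandas as pd

    rows = []
    for _, pair in candset.iterrows():
        ltuple = l_df.loc[pair[fk_ltable]]
        rtuple = r_df.loc[pair[fk_rtable]]
        rows.append({name: func(ltuple, rtuple)
                     for name, func in zip(feature_table['feature_name'],
                                           feature_table['function'])})
    return pd.DataFrame(rows, index=candset.index)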
def down_sample(table_a, table_b, size, y_param, show_progress=True, verbose=False, seed=None, rem_stop_words=True, rem_puncs=True, n_jobs=1): """ This function down samples two tables A and B into smaller tables A' and B' respectively. Specifically, first it randomly selects `size` tuples from the table B to be table B'. Next, it builds an inverted index I (token, tuple_id) on table A. For each tuple x ∈ B', the algorithm finds a set P of k/2 tuples from I that match x, and a set Q of k/2 tuples randomly selected from A - P. The idea is for A' and B' to share some matches yet be as representative of A and B as possible. Args: table_a,table_b (DataFrame): The input tables A and B. size (int): The size that table B should be down sampled to. y_param (int): The parameter to control the down sample size of table A. Specifically, the down sampled size of table A should be close to size * y_param. show_progress (boolean): A flag to indicate whether a progress bar should be displayed (defaults to True). verbose (boolean): A flag to indicate whether the debug information should be displayed (defaults to False). seed (int): The seed for the pseudo random number generator to select the tuples from A and B (defaults to None). rem_stop_words (boolean): A flag to indicate whether a default set of stop words must be removed. rem_puncs (boolean): A flag to indicate whether the punctuations must be removed from the strings. n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is the total number of CPUs in the machine). Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: Down sampled tables A and B as pandas DataFrames. Raises: AssertionError: If any of the input tables (`table_a`, `table_b`) are empty or not a DataFrame. AssertionError: If `size` or `y_param` is empty or 0 or not a valid integer value. AssertionError: If `seed` is not a valid integer value. AssertionError: If `verbose` is not of type bool. AssertionError: If `show_progress` is not of type bool. AssertionError: If `n_jobs` is not of type int. Examples: >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> sample_A, sample_B = em.down_sample(A, B, 500, 1, n_jobs=-1) # Example with seed = 0. This means the same sample data set will be returned # each time this function is run. 
>>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> sample_A, sample_B = em.down_sample(A, B, 500, 1, seed=0, n_jobs=-1) """ if not isinstance(table_a, pd.DataFrame): logger.error('Input table A is not of type pandas DataFrame') raise AssertionError( 'Input table A is not of type pandas DataFrame') if not isinstance(table_b, pd.DataFrame): logger.error('Input table B is not of type pandas DataFrame') raise AssertionError( 'Input table B is not of type pandas DataFrame') if len(table_a) == 0 or len(table_b) == 0: logger.error('Size of the input table is 0') raise AssertionError('Size of the input table is 0') if size == 0 or y_param == 0: logger.error( 'size or y cannot be zero (3rd and 4th parameter of downsample)') raise AssertionError( 'size or y_param cannot be zero (3rd and 4th parameter of downsample)') if seed is not None and not isinstance(seed, int): logger.error('Seed is not of type integer') raise AssertionError('Seed is not of type integer') if len(table_b) < size: logger.warning( 'Size of table B is less than b_size parameter - using entire table B') validate_object_type(verbose, bool, 'Parameter verbose') validate_object_type(show_progress, bool, 'Parameter show_progress') validate_object_type(rem_stop_words, bool, 'Parameter rem_stop_words') validate_object_type(rem_puncs, bool, 'Parameter rem_puncs') validate_object_type(n_jobs, int, 'Parameter n_jobs') # get and validate required metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # # get metadata # l_key, r_key = cm.get_keys_for_ltable_rtable(table_a, table_b, logger, # verbose) # # # # validate metadata # cm._validate_metadata_for_table(table_a, l_key, 'ltable', logger, # verbose) # cm._validate_metadata_for_table(table_b, r_key, 'rtable', logger, # verbose) # Inverted index built on table A will consist of all tuples in such P's and Q's - central idea is to have # good coverage in the down sampled A' and B'. s_inv_index = _inv_index(table_a, rem_stop_words, rem_puncs) # Randomly select size tuples from table B to be B' # If a seed value has been give, use a RandomState with the given seed b_sample_size = min(math.floor(size), len(table_b)) if seed is not None: rand = RandomState(seed) else: rand = RandomState() b_tbl_indices = list(rand.choice(len(table_b), int(b_sample_size), replace=False)) n_jobs = get_num_procs(n_jobs, len(table_b)) sample_table_b = table_b.loc[b_tbl_indices] if n_jobs <= 1: # Probe inverted index to find all tuples in A that share tokens with tuples in B'. s_tbl_indices = _probe_index_split(sample_table_b, y_param, len(table_a), s_inv_index, show_progress, seed, rem_stop_words, rem_puncs) else: sample_table_splits = np.array_split(sample_table_b, n_jobs) results = Parallel(n_jobs=n_jobs)( delayed(_probe_index_split)(sample_table_splits[job_index], y_param, len(table_a), s_inv_index, (show_progress and (job_index == n_jobs - 1)), seed, rem_stop_words, rem_puncs) for job_index in range(n_jobs) ) results = map(list, results) s_tbl_indices = set(sum(results, [])) s_tbl_indices = list(s_tbl_indices) l_sampled = table_a.iloc[list(s_tbl_indices)] r_sampled = table_b.iloc[list(b_tbl_indices)] # update catalog if cm.is_dfinfo_present(table_a): cm.copy_properties(table_a, l_sampled) if cm.is_dfinfo_present(table_b): cm.copy_properties(table_b, r_sampled) return l_sampled, r_sampled
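# Illustrative sketch of the inverted-index idea behind down_sample: map each
# token of table A to the set of row positions containing it, then probe the
# index with the tokens of a sampled B row to find A rows that share
# vocabulary. Simplified toy version (no stop-word removal, no random padding
# of the result set).
def _sketch_probe_inverted_index():
    a_rows = ['cold brew coffee', 'green tea', 'espresso roast']
    b_row = 'iced cold coffee'

    inverted_index = {}
    for pos, text in enumerate(a_rows):
        for token in text.lower().split():
            inverted_index.setdefault(token, set()).add(pos)

    matches = set()
    for token in b_row.lower().split():
        matches |= inverted_index.get(token, set())
    return matches  # {0}: 'cold brew coffee' shares tokens with b_row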
def block_tables(self, ltable, rtable, l_block_attr, r_block_attr, window_size=2, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', allow_missing=False, verbose=False, n_jobs=1): """ WARNING: THIS IS AN EXPERIMENTAL COMMAND. THIS COMMAND IS NOT TESTED. USE AT YOUR OWN RISK. Blocks two tables based on sorted neighborhood. Finds tuple pairs from left and right tables such that when each table is sorted based upon a blocking attribute, tuple pairs are within a distance w of each other. The blocking attribute is created prior to calling this function. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_block_attr (string): The blocking attribute for left table. r_block_attr (string): The blocking attribute for right table. window_size (int): size of sliding window. Defaults to 2 l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple in ltable with missing value in the blocking attribute will be matched with every tuple in rtable and vice versa. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is the total number of CPUs in the machine). Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_block_attr` is not of type string. AssertionError: If `r_block_attr` is not of type string. AssertionError: If `window_size` is not of type of int or if window_size < 2. AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_block_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. 
""" # Warning that this code is still in alpha stage # display warning message print( "WARNING: THIS IS AN EXPERIMENTAL COMMAND. THIS COMMAND IS NOT TESTED. USE AT YOUR OWN RISK." ) # validate data types of input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, n_jobs) # validate data types of input blocking attributes self.validate_types_block_attrs(l_block_attr, r_block_attr) # validate data type of allow_missing self.validate_allow_missing(allow_missing) # validate input parameters self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr) self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate required metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # check if ltable or rtable are empty. if ltable.empty: raise AssertionError('Left table is empty') if rtable.empty: raise AssertionError('Right table is empty') # check if window_size < 2 if window_size < 2: raise AssertionError('window_size is < 2') # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # do blocking # # determine number of processes to launch parallely n_procs = self.get_num_procs(n_jobs, min(len(ltable), len(rtable))) # handle potential missing values c_missing = pd.DataFrame() if n_procs <= 1: # single process c_splits, c_missing = _sn_block_tables_split( ltable, rtable, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, allow_missing) else: # multiprocessing # Split l and r into n_procs chunks. # each core will get an l and an r, merge them, sort them. l_splits = pd.np.array_split(ltable, n_procs) r_splits = pd.np.array_split(rtable, n_procs) p_answer = Parallel(n_jobs=n_procs)( delayed(_sn_block_tables_split) (l_splits[i], r_splits[i], l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, allow_missing) for i in range(n_procs)) c_splits, c_missing = zip(*p_answer) c_splits = list(c_splits) c_missing = pd.concat(c_missing) # make a deque for the sliding window sliding_window = deque() result = [] c_missing = c_missing.to_dict(orient='records') # Use generator function to merge sorted runs. # If single core, generator is trivial (see fn below) for row in _gen_iter_merge(c_splits): row = row._asdict() # if the sliding window is full, remove the largest. The new tuple will be # compared against the (window_size-1) previously seen tuples. # (if at the beginning just compare with whatever we have) if len(sliding_window) >= window_size: sliding_window.popleft() # Now, iterate over the sliding window (plus any tuples missing BKV's, # if that was called for): for window_element in chain(sliding_window, c_missing): ltable = window_element rtable = row # SN blocking is often implemented on a single table. # In this implementation, we are only considering tuples that have # one tuple from the left table and one tuple from the right table. # Thus, only keep candidates that span both tables. # However, the restriction is that matches need to be (left, right) so # if we end up with (right, left) flip it. 
if ltable["source"] != rtable["source"]: # Span both tables if ltable[ "source"] == 'r': # Left is right, so flip it to make it sane again ltable, rtable = rtable, ltable merged = OrderedDict() merged[l_output_prefix + "ID"] = ltable[l_key] merged[r_output_prefix + "ID"] = rtable[r_key] merged[l_output_prefix + l_key] = ltable[l_key] merged[r_output_prefix + r_key] = rtable[r_key] # # add l/r output attributes to the ordered dictionary if l_output_attrs is not None: for attr in l_output_attrs: merged[l_output_prefix + attr] = ltable[attr] if r_output_attrs is not None: for attr in r_output_attrs: merged[r_output_prefix + attr] = rtable[attr] # # add the ordered dict to the list result.append(merged) sliding_window.append(row) candset = pd.DataFrame(result, columns=result[0].keys()) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) return candset
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', verbose=False, show_progress=True, n_jobs=1): """ Blocks two tables based on a black box blocking function specified by the user. Finds tuple pairs from left and right tables that survive the black box function. A tuple pair survives the black box blocking function if the function returns False for that pair, otherwise the tuple pair is dropped. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus are the total number of CPUs in the machine).Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If values in `l_output_attrs` is not of type string. AssertionError: If values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. 
Examples: >>> def match_last_name(ltuple, rtuple): # assume that there is a 'name' attribute in the input tables # and each value in it has two words l_last_name = ltuple['name'].split()[1] r_last_name = rtuple['name'].split()[1] if l_last_name != r_last_name: return True else: return False >>> import py_entitymatching as em >>> bb = em.BlackBoxBlocker() >>> bb.set_black_box_function(match_last_name) >>> C = bb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'] ) """ # validate data types of standard input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, n_jobs) # validate data type of show_progress self.validate_show_progress(show_progress) # validate black box function assert self.black_box_function != None, 'Black box function is not set' # validate output attributes self.validate_output_attrs(ltable, rtable, l_output_attrs,r_output_attrs) # get and validate metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # do blocking # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # remove l_key from l_output_attrs and r_key from r_output_attrs l_output_attrs_1 = [] if l_output_attrs: l_output_attrs_1 = [x for x in l_output_attrs if x != l_key] r_output_attrs_1 = [] if r_output_attrs: r_output_attrs_1 = [x for x in r_output_attrs if x != r_key] # # determine the number of processes to launch parallely n_procs = self.get_num_procs(n_jobs, len(l_df) * len(r_df)) # # pickle the black-box function before passing it as an arg to # # _block_tables_split to be executed by each child process black_box_function_pkl = cp.dumps(self.black_box_function) if n_procs <= 1: # single process candset = _block_tables_split(l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, black_box_function_pkl, show_progress) else: # multiprocessing m, n = self.get_split_params(n_procs, len(l_df), len(r_df)) l_splits = pd.np.array_split(l_df, m) r_splits = pd.np.array_split(r_df, n) c_splits = Parallel(n_jobs=m*n)(delayed(_block_tables_split)(l_splits[i], r_splits[j], l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, black_box_function_pkl, show_progress and i == len(l_splits) - 1 and j == len(r_splits) - 1) for i in range(len(l_splits)) for j in range(len(r_splits))) candset = pd.concat(c_splits, ignore_index=True) # # determine the attributes to retain in the output candidate set retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix) if len(candset) > 0: candset = candset[retain_cols] else: candset =pd.DataFrame(columns=retain_cols) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix+l_key, r_output_prefix+r_key, ltable, rtable) # return candidate set return candset
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', verbose=False, show_progress=True, n_ltable_chunks=1, n_rtable_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK Blocks two tables based on the sequence of rules supplied by the user. Finds tuple pairs from left and right tables that survive the sequence of blocking rules. A tuple pair survives the sequence of blocking rules if none of the rules in the sequence returns True for that pair. If any of the rules returns True, then the pair is blocked. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_ltable_chunks (int): The number of partitions to split the left table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. n_rtable_chunks (int): The number of partitions to split the right table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived the sequence of blocking rules (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If the input `l_output_prefix` is not of type string. AssertionError: If the input `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_ltable_chunks` is not of type int. AssertionError: If `n_rtable_chunks` is not of type int. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. AssertionError: If there are no rules to apply. Examples: >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_rule_based_blocker import DaskRuleBasedBlocker >>> rb = DaskRuleBasedBlocker() >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id') >>> block_f = em.get_features_for_blocking(A, B) >>> rule = ['name_name_lev(ltuple, rtuple) > 3'] >>> rb.add_rule(rule, feature_table=block_f) >>> C = rb.block_tables(A, B) """ logger.warning( "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. 
USE AT YOUR OWN RISK.") # validate data types of input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, 1) # validate data type of show_progress self.validate_show_progress(show_progress) # validate input parameters self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # validate rules assert len(self.rules.keys()) > 0, 'There are no rules to apply' # validate number of ltable and rtable chunks validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks') validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks') validate_chunks(n_ltable_chunks) validate_chunks(n_rtable_chunks) # # determine the number of chunks n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable)) n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable)) # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # remove l_key from l_output_attrs and r_key from r_output_attrs l_output_attrs_1 = [] if l_output_attrs: l_output_attrs_1 = [x for x in l_output_attrs if x != l_key] r_output_attrs_1 = [] if r_output_attrs: r_output_attrs_1 = [x for x in r_output_attrs if x != r_key] # # get attributes to project l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(l_key, r_key, l_output_attrs_1, r_output_attrs_1) l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs] candset, rule_applied = self.block_tables_with_filters(l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, verbose, show_progress, get_num_cores()) # pass number of splits as # the number of cores in the machine if candset is None: # no filterable rule was applied candset = self.block_tables_without_filters(l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, verbose, show_progress, n_ltable_chunks, n_rtable_chunks) elif len(self.rules) > 1: # one filterable rule was applied but other rules are left # block candset by applying other rules and excluding the applied rule candset = self.block_candset_excluding_rule(candset, l_df, r_df, l_key, r_key, l_output_prefix + l_key, r_output_prefix + r_key, rule_applied, show_progress, get_num_cores()) retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix) if len(candset) > 0: candset = candset[retain_cols] else: candset = pd.DataFrame(columns=retain_cols) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset
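# Illustrative sketch of the dask pattern used by the Dask blockers: wrap each
# per-chunk call in dask.delayed, combine the pieces with one more delayed
# call, and compute the whole graph on the "processes" scheduler. Self-contained
# toy example; the per-chunk worker just doubles numbers instead of blocking.
def _sketch_dask_delayed_chunks():
    from dask import delayed
    from dask.diagnostics import ProgressBar

    def work(chunk):
        return [value * 2 for value in chunk]

    chunks = [[1, 2], [3, 4], [5, 6]]
    parts = [delayed(work)(chunk) for chunk in chunks]
    combined = delayed(list)(parts)  # plays the role of wrap() in the code above

    with ProgressBar():
        results = combined.compute(scheduler='processes', num_workers=2)
    return sum(results, [])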
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', verbose=False, show_progress=True, n_ltable_chunks=1, n_rtable_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. Blocks two tables based on a black box blocking function specified by the user. Finds tuple pairs from left and right tables that survive the black box function. A tuple pair survives the black box blocking function if the function returns False for that pair, otherwise the tuple pair is dropped. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_ltable_chunks (int): The number of partitions to split the left table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. n_rtable_chunks (int): The number of partitions to split the right table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If values in `l_output_attrs` is not of type string. AssertionError: If values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_ltable_chunks` is not of type int. AssertionError: If `n_rtable_chunks` is not of type int. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. Examples: >>> def match_last_name(ltuple, rtuple): # assume that there is a 'name' attribute in the input tables # and each value in it has two words l_last_name = ltuple['name'].split()[1] r_last_name = rtuple['name'].split()[1] if l_last_name != r_last_name: return True else: return False >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_black_box_blocker DaskBlackBoxBlocker >>> bb = DaskBlackBoxBlocker() >>> bb.set_black_box_function(match_last_name) >>> C = bb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'] ) """ logger.warning( "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. 
USE AT YOUR OWN RISK.") # validate data types of standard input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, 1) # validate data type of show_progress self.validate_show_progress(show_progress) # validate black box function assert self.black_box_function != None, 'Black box function is not set' # validate output attributes self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # validate number of ltable and rtable chunks validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks') validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks') validate_chunks(n_ltable_chunks) validate_chunks(n_rtable_chunks) # # determine the number of chunks n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable)) n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable)) # do blocking # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # remove l_key from l_output_attrs and r_key from r_output_attrs l_output_attrs_1 = [] if l_output_attrs: l_output_attrs_1 = [x for x in l_output_attrs if x != l_key] r_output_attrs_1 = [] if r_output_attrs: r_output_attrs_1 = [x for x in r_output_attrs if x != r_key] # # pickle the black-box function before passing it as an arg to # # _block_tables_split to be executed by each child process black_box_function_pkl = cp.dumps(self.black_box_function) if n_ltable_chunks == 1 and n_rtable_chunks == 1: # single process candset = _block_tables_split(l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, black_box_function_pkl, show_progress) else: # multiprocessing l_splits = pd.np.array_split(l_df, n_ltable_chunks) r_splits = pd.np.array_split(r_df, n_rtable_chunks) c_splits = [] for i in range(len(l_splits)): for j in range(len(r_splits)): partial_result = delayed(_block_tables_split)(l_splits[i], r_splits[j], l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, black_box_function_pkl, False) c_splits.append(partial_result) c_splits = delayed(wrap)(c_splits) if show_progress: with ProgressBar(): c_splits = c_splits.compute(scheduler="processes", num_workers=get_num_cores()) else: c_splits = c_splits.compute(scheduler="processes", num_workers=get_num_cores()) candset = pd.concat(c_splits, ignore_index=True) # # determine the attributes to retain in the output candidate set retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix) if len(candset) > 0: candset = candset[retain_cols] else: candset = pd.DataFrame(columns=retain_cols) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset
def block_candset(self, candset, verbose=True, show_progress=True, n_jobs=1): """ Blocks an input candidate set of tuple pairs based on a black box blocking function specified by the user. Finds tuple pairs from an input candidate set of tuple pairs that survive the black box function. A tuple pair survives the black box blocking function if the function returns False for that pair, otherwise the tuple pair is dropped. Args: candset (DataFrame): The input candidate set of tuple pairs. verbose (boolean): A flag to indicate whether logging should be done (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is the total number of CPUs in the machine).Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `verbose` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `l_block_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. Examples: >>> def match_last_name(ltuple, rtuple): # assume that there is a 'name' attribute in the input tables # and each value in it has two words l_last_name = ltuple['name'].split()[1] r_last_name = rtuple['name'].split()[1] if l_last_name != r_last_name: return True else: return False >>> import py_entitymatching as em >>> bb = em.BlackBoxBlocker() >>> bb.set_black_box_function(match_last_name) >>> D = bb.block_candset(C) # C is an output from block_tables """ # validate data types of standard input parameters self.validate_types_params_candset(candset, verbose, show_progress, n_jobs) # validate black box functionn assert self.black_box_function != None, 'Black box function is not set' # get and validate metadata log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(candset, logger, verbose) # # validate metadata cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # do blocking # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # project candset to keep only the ID attributes c_df = candset[[key, fk_ltable, fk_rtable]] # # determine the number of processes to launch parallely n_procs = self.get_num_procs(n_jobs, len(c_df)) # # pickle the black-box function before passing it as an arg to # # _block_candset_split to be executed by each child process black_box_function_pkl = cp.dumps(self.black_box_function) valid = [] if n_procs <= 1: # single process valid = _block_candset_split(c_df, l_df, r_df, l_key, r_key, fk_ltable, fk_rtable, black_box_function_pkl, show_progress) else: # multiprocessing c_splits = pd.np.array_split(c_df, n_procs) valid_splits = Parallel(n_jobs=n_procs)(delayed(_block_candset_split)(c_splits[i], 
                l_df, r_df,
                l_key, r_key,
                fk_ltable, fk_rtable,
                black_box_function_pkl,
                show_progress and i == len(c_splits) - 1)
                for i in range(len(c_splits)))
            valid = sum(valid_splits, [])

        # construct output table
        if len(c_df) > 0:
            c_df = candset[valid]
        else:
            c_df = pd.DataFrame(columns=candset.columns)

        # update catalog
        cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable,
                                  ltable, rtable)

        # return candidate set
        return c_df
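# Usage sketch (illustrative, not part of the original source). It first
# builds a candidate set with block_tables and then tightens it with a
# second black-box predicate via block_candset. The table paths, the 'ID'
# keys and the 'name'/'zipcode' attributes are hypothetical.
import py_entitymatching as em

def _different_last_name(ltuple, rtuple):
    return ltuple['name'].split()[-1] != rtuple['name'].split()[-1]

def _different_zipcode(ltuple, rtuple):
    return ltuple['zipcode'] != rtuple['zipcode']

A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
bb = em.BlackBoxBlocker()
bb.set_black_box_function(_different_last_name)
C = bb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'])
bb.set_black_box_function(_different_zipcode)
D = bb.block_candset(C, n_jobs=-1)  # use all CPUs; n_jobs=1 for debugging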
def block_tables(self, ltable, rtable, l_block_attr, r_block_attr, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', allow_missing=False, verbose=False, n_ltable_chunks=1, n_rtable_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK Blocks two tables based on attribute equivalence. Conceptually, this will check `l_block_attr=r_block_attr` for each tuple pair from the Cartesian product of tables `ltable` and `rtable`. It outputs a Pandas dataframe object with tuple pairs that satisfy the equality condition. The dataframe will include attributes '_id', key attribute from ltable, key attributes from rtable, followed by lists `l_output_attrs` and `r_output_attrs` if they are specified. Each of these output and key attributes will be prefixed with given `l_output_prefix` and `r_output_prefix`. If `allow_missing` is set to `True` then all tuple pairs with missing value in at least one of the tuples will be included in the output dataframe. Further, this will update the following metadata in the catalog for the output table: (1) key, (2) ltable, (3) rtable, (4) fk_ltable, and (5) fk_rtable. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_block_attr (string): The blocking attribute in left table. r_block_attr (string): The blocking attribute in right table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple in ltable with missing value in the blocking attribute will be matched with every tuple in rtable and vice versa. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). n_ltable_chunks (int): The number of partitions to split the left table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. n_rtable_chunks (int): The number of partitions to split the right table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_block_attr` is not of type string. AssertionError: If `r_block_attr` is not of type string. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. 
AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `n_ltable_chunks` is not of type int. AssertionError: If `n_rtable_chunks` is not of type int. AssertionError: If `l_block_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. Examples: >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker >>> ab = DaskAttrEquivalenceBlocker() >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> C1 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name']) # Include all possible tuple pairs with missing values >>> C2 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True) """ logger.warning("WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR " "OWN RISK.") # validate data types of input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, 1) # last arg is # set to 1 just to reuse the function from the # old blocker. # validate data types of input blocking attributes self.validate_types_block_attrs(l_block_attr, r_block_attr) # validate data type of allow_missing self.validate_allow_missing(allow_missing) # validate input parameters self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr) self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # validate number of ltable and rtable chunks validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks') validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks') validate_chunks(n_ltable_chunks) validate_chunks(n_rtable_chunks) # get and validate required metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # do blocking # # do projection of required attributes from the tables l_proj_attrs = self.get_attrs_to_project(l_key, l_block_attr, l_output_attrs) ltable_proj = ltable[l_proj_attrs] r_proj_attrs = self.get_attrs_to_project(r_key, r_block_attr, r_output_attrs) rtable_proj = rtable[r_proj_attrs] # # remove records with nans in the blocking attribute l_df = rem_nan(ltable_proj, l_block_attr) r_df = rem_nan(rtable_proj, r_block_attr) # # determine the number of chunks n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable)) n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable)) if n_ltable_chunks == 1 and n_rtable_chunks == 1: # single process candset = _block_tables_split(l_df, r_df, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, allow_missing) else: l_splits = pd.np.array_split(l_df, n_ltable_chunks) r_splits = pd.np.array_split(r_df, n_rtable_chunks) c_splits = [] for l in l_splits: for r in r_splits: partial_result = delayed(_block_tables_split)(l, r, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, allow_missing) c_splits.append(partial_result) 
            c_splits = delayed(wrap)(c_splits)
            # dask's compute expects num_workers (not n_jobs) for the number
            # of worker processes
            c_splits = c_splits.compute(scheduler="processes",
                                        num_workers=get_num_cores())
            candset = pd.concat(c_splits, ignore_index=True)

        # if allow_missing flag is True, then compute
        # all pairs with missing value in left table, and
        # all pairs with missing value in right table
        if allow_missing:
            missing_pairs = self.get_pairs_with_missing_value(
                ltable_proj, rtable_proj, l_key, r_key,
                l_block_attr, r_block_attr,
                l_output_attrs, r_output_attrs,
                l_output_prefix, r_output_prefix)
            candset = pd.concat([candset, missing_pairs], ignore_index=True)

        # update catalog
        key = get_name_for_key(candset.columns)
        candset = add_key_column(candset, key)
        cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                                  r_output_prefix + r_key, ltable, rtable)

        # return candidate set
        return candset
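# Usage sketch (illustrative, not part of the original source). It equi-joins
# A and B on a 'zipcode' column and also keeps pairs where either side has a
# missing zipcode. Table paths, keys and attribute names are hypothetical.
import py_entitymatching as em
from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker

A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
ab = DaskAttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode',
                    l_output_attrs=['name', 'zipcode'],
                    r_output_attrs=['name', 'zipcode'],
                    allow_missing=True,
                    n_ltable_chunks=-1, n_rtable_chunks=-1)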
def block_candset(self, candset, l_block_attr, r_block_attr, allow_missing=False, verbose=False, show_progress=True, n_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. Blocks an input candidate set of tuple pairs based on attribute equivalence. Finds tuple pairs from an input candidate set of tuple pairs such that the value of attribute l_block_attr of the left tuple in a tuple pair exactly matches the value of attribute r_block_attr of the right tuple in the tuple pair. Args: candset (DataFrame): The input candidate set of tuple pairs. l_block_attr (string): The blocking attribute in left table. r_block_attr (string): The blocking attribute in right table. allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple pair with missing value in either blocking attribute will be retained in the output candidate set. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_chunks (int): The number of partitions to split the candidate set. If it is set to -1, the number of partitions will be set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `l_block_attr` is not of type string. AssertionError: If `r_block_attr` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `n_chunks` is not of type int. AssertionError: If `l_block_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. Examples: >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker >>> ab = DaskAttrEquivalenceBlocker() >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> C = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name']) >>> D1 = ab.block_candset(C, 'age', 'age', allow_missing=True) # Include all possible tuple pairs with missing values >>> D2 = ab.block_candset(C, 'age', 'age', allow_missing=True) # Execute blocking using multiple cores >>> D3 = ab.block_candset(C, 'age', 'age', n_chunks=-1) """ logger.warning("WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. 
USE AT YOUR OWN " "RISK.") # validate data types of input parameters self.validate_types_params_candset(candset, verbose, show_progress, n_chunks) # validate data types of input blocking attributes self.validate_types_block_attrs(l_block_attr, r_block_attr) # get and validate metadata log_info(logger, 'Required metadata: cand.set key, fk ltable, ' 'fk rtable, ltable, rtable, ltable key, rtable key', verbose) # # get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset( candset, logger, verbose) # # validate metadata cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # validate input parameters self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr) # validate n_chunks parameter validate_object_type(n_chunks, int, 'Parameter n_chunks') validate_chunks(n_chunks) # do blocking # # do projection before merge l_df = ltable[[l_key, l_block_attr]] r_df = rtable[[r_key, r_block_attr]] # # set index for convenience l_df = l_df.set_index(l_key, drop=False) r_df = r_df.set_index(r_key, drop=False) # # determine number of processes to launch parallely n_chunks = get_num_partitions(n_chunks, len(candset)) valid = [] if n_chunks == 1: # single process valid = _block_candset_split(candset, l_df, r_df, l_key, r_key, l_block_attr, r_block_attr, fk_ltable, fk_rtable, allow_missing, show_progress) else: c_splits = pd.np.array_split(candset, n_chunks) valid_splits = [] for i in range(len(c_splits)): partial_result = delayed(_block_candset_split)(c_splits[i], l_df, r_df, l_key, r_key, l_block_attr, r_block_attr, fk_ltable, fk_rtable, allow_missing, False) # setting show # progress to False as we will use Dask diagnostics to display progress # bar valid_splits.append(partial_result) valid_splits = delayed(wrap)(valid_splits) if show_progress: with ProgressBar(): valid_splits = valid_splits.compute(scheduler="processes", num_workers=get_num_cores()) else: valid_splits = valid_splits.compute(scheduler="processes", num_workers=get_num_cores()) valid = sum(valid_splits, []) # construct output table if len(candset) > 0: out_table = candset[valid] else: out_table = pd.DataFrame(columns=candset.columns) # update the catalog cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable, ltable, rtable) # return the output table return out_table
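# Usage sketch (illustrative, not part of the original source). It blocks on
# 'zipcode' first and then stacks a second equivalence condition on
# 'birth_year' over the resulting candidate set. Paths, keys and attribute
# names are hypothetical.
import py_entitymatching as em
from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker

A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
ab = DaskAttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode',
                    l_output_attrs=['name'], r_output_attrs=['name'])
D = ab.block_candset(C, 'birth_year', 'birth_year',
                     allow_missing=False,  # drop pairs with a missing year
                     n_chunks=-1)          # one partition per core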
def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr, rem_stop_words=False, q_val=None, word_level=True, overlap_size=1, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', allow_missing=False, verbose=False, show_progress=True, n_ltable_chunks=1, n_rtable_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. Blocks two tables based on the overlap of token sets of attribute values. Finds tuple pairs from left and right tables such that the overlap between (a) the set of tokens obtained by tokenizing the value of attribute l_overlap_attr of a tuple from the left table, and (b) the set of tokens obtained by tokenizing the value of attribute r_overlap_attr of a tuple from the right table, is above a certain threshold. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_overlap_attr (string): The overlap attribute in left table. r_overlap_attr (string): The overlap attribute in right table. rem_stop_words (boolean): A flag to indicate whether stop words (e.g., a, an, the) should be removed from the token sets of the overlap attribute values (defaults to False). q_val (int): The value of q to use if the overlap attributes values are to be tokenized as qgrams (defaults to None). word_level (boolean): A flag to indicate whether the overlap attributes should be tokenized as words (i.e, using whitespace as delimiter) (defaults to True). overlap_size (int): The minimum number of tokens that must overlap (defaults to 1). l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple in ltable with missing value in the blocking attribute will be matched with every tuple in rtable and vice versa. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_ltable_chunks (int): The number of partitions to split the left table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. n_rtable_chunks (int): The number of partitions to split the right table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_overlap_attr` is not of type string. AssertionError: If `r_overlap_attr` is not of type string. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. 
AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `q_val` is not of type int. AssertionError: If `word_level` is not of type boolean. AssertionError: If `overlap_size` is not of type int. AssertionError: If `verbose` is not of type boolean. AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_ltable_chunks` is not of type int. AssertionError: If `n_rtable_chunks` is not of type int. AssertionError: If `l_overlap_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. AssertionError: If `l_output_attrs` are not in the ltable. AssertionError: If `r_output_attrs` are not in the rtable. SyntaxError: If `q_val` is set to a valid value and `word_level` is set to True. SyntaxError: If `q_val` is set to None and `word_level` is set to False. Examples: >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> ob = DaskOverlapBlocker() # Use all cores # # Use word-level tokenizer >>> C1 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=True, overlap_size=1, n_ltable_chunks=-1, n_rtable_chunks=-1) # # Use q-gram tokenizer >>> C2 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=False, q_val=2, n_ltable_chunks=-1, n_rtable_chunks=-1) # # Include all possible missing values >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True, n_ltable_chunks=-1, n_rtable_chunks=-1) """ logger.warning( "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. 
USE AT YOUR OWN " "RISK.") # Input validations self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, n_ltable_chunks, n_rtable_chunks) self.validate_types_other_params(l_overlap_attr, r_overlap_attr, rem_stop_words, q_val, word_level, overlap_size) self.validate_allow_missing(allow_missing) self.validate_show_progress(show_progress) self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr) self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) self.validate_word_level_qval(word_level, q_val) log_info(logger, 'Required metadata: ltable key, rtable key', verbose) l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # validate input table chunks validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks') validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks') validate_chunks(n_ltable_chunks) validate_chunks(n_rtable_chunks) if n_ltable_chunks == -1: n_ltable_chunks = multiprocessing.cpu_count() ltable_chunks = pd.np.array_split(ltable, n_ltable_chunks) # preprocess/tokenize ltable if word_level == True: tokenizer = WhitespaceTokenizer(return_set=True) else: tokenizer = QgramTokenizer(qval=q_val, return_set=True) preprocessed_tokenized_ltbl = [] # Construct DAG for preprocessing/tokenizing ltable chunks start_row_id = 0 for i in range(len(ltable_chunks)): result = delayed(self.process_tokenize_block_attr)(ltable_chunks[i][ l_overlap_attr], start_row_id, rem_stop_words, tokenizer) preprocessed_tokenized_ltbl.append(result) start_row_id += len(ltable_chunks[i]) preprocessed_tokenized_ltbl = delayed(wrap)(preprocessed_tokenized_ltbl) # Execute the DAG if show_progress: with ProgressBar(): logger.info('Preprocessing/tokenizing ltable') preprocessed_tokenized_ltbl_vals = preprocessed_tokenized_ltbl.compute( scheduler="processes", num_workers=multiprocessing.cpu_count()) else: preprocessed_tokenized_ltbl_vals = preprocessed_tokenized_ltbl.compute( scheduler="processes", num_workers=multiprocessing.cpu_count()) ltable_processed_dict = {} for i in range(len(preprocessed_tokenized_ltbl_vals)): ltable_processed_dict.update(preprocessed_tokenized_ltbl_vals[i]) # build inverted index inverted_index = self.build_inverted_index(ltable_processed_dict) if n_rtable_chunks == -1: n_rtable_chunks = multiprocessing.cpu_count() rtable_chunks = pd.np.array_split(rtable, n_rtable_chunks) # Construct the DAG for probing probe_result = [] start_row_id = 0 for i in range(len(rtable_chunks)): result = delayed(self.probe)(rtable_chunks[i][r_overlap_attr], inverted_index, start_row_id, rem_stop_words, tokenizer, overlap_size) probe_result.append(result) start_row_id += len(rtable_chunks[i]) probe_result = delayed(wrap)(probe_result) # Execute the DAG for probing if show_progress: with ProgressBar(): logger.info('Probing using rtable') probe_result = probe_result.compute(scheduler="processes", num_workers=multiprocessing.cpu_count()) else: probe_result = probe_result.compute(scheduler="processes", num_workers=multiprocessing.cpu_count()) # construct a minimal dataframe that can be used to add more attributes flat_list = [item for sublist in probe_result for item in sublist] tmp = pd.DataFrame(flat_list, columns=['fk_ltable_rid', 'fk_rtable_rid']) fk_ltable = 
ltable.iloc[tmp.fk_ltable_rid][l_key].values
        fk_rtable = rtable.iloc[tmp.fk_rtable_rid][r_key].values
        id_vals = list(range(len(flat_list)))

        candset = pd.DataFrame.from_dict(
            {'_id': id_vals,
             l_output_prefix + l_key: fk_ltable,
             r_output_prefix + r_key: fk_rtable})

        # set the properties for the candidate set (use the user-supplied
        # output prefixes so the foreign-key columns always exist)
        cm.set_key(candset, '_id')
        cm.set_fk_ltable(candset, l_output_prefix + l_key)
        cm.set_fk_rtable(candset, r_output_prefix + r_key)
        cm.set_ltable(candset, ltable)
        cm.set_rtable(candset, rtable)

        ret_candset = gh.add_output_attributes(candset,
                                               l_output_attrs=l_output_attrs,
                                               r_output_attrs=r_output_attrs,
                                               l_output_prefix=l_output_prefix,
                                               r_output_prefix=r_output_prefix,
                                               validate=False)

        # handle missing values
        if allow_missing:
            missing_value_pairs = get_pairs_with_missing_value(
                ltable, rtable, l_key, r_key,
                l_overlap_attr, r_overlap_attr,
                l_output_attrs, r_output_attrs,
                l_output_prefix, r_output_prefix, False, False)
            missing_value_pairs.insert(0, '_id',
                                       range(len(ret_candset),
                                             len(ret_candset) +
                                             len(missing_value_pairs)))

            if len(missing_value_pairs) > 0:
                ret_candset = pd.concat([ret_candset, missing_value_pairs],
                                        ignore_index=True, sort=False)
                cm.set_key(ret_candset, '_id')
                cm.set_fk_ltable(ret_candset, l_output_prefix + l_key)
                cm.set_fk_rtable(ret_candset, r_output_prefix + r_key)
                cm.set_ltable(ret_candset, ltable)
                cm.set_rtable(ret_candset, rtable)

        # Return the final candidate set to the user.
        return ret_candset
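# Usage sketch (illustrative, not part of the original source). It shows two
# common configurations of the overlap blocker: word-level tokens with a
# minimum overlap of 2, and 3-gram tokens for short or dirty strings. Paths,
# keys and the 'address'/'name' attributes are hypothetical.
import py_entitymatching as em
from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker

A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
ob = DaskOverlapBlocker()
C_word = ob.block_tables(A, B, 'address', 'address', word_level=True,
                         overlap_size=2, rem_stop_words=True,
                         l_output_attrs=['name'], r_output_attrs=['name'])
C_qgram = ob.block_tables(A, B, 'address', 'address', word_level=False,
                          q_val=3, overlap_size=2,
                          l_output_attrs=['name'], r_output_attrs=['name'])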
def block_candset(self, candset, l_overlap_attr, r_overlap_attr, rem_stop_words=False, q_val=None, word_level=True, overlap_size=1, allow_missing=False, verbose=False, show_progress=True, n_chunks=-1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. Blocks an input candidate set of tuple pairs based on the overlap of token sets of attribute values. Finds tuple pairs from an input candidate set of tuple pairs such that the overlap between (a) the set of tokens obtained by tokenizing the value of attribute l_overlap_attr of the left tuple in a tuple pair, and (b) the set of tokens obtained by tokenizing the value of attribute r_overlap_attr of the right tuple in the tuple pair, is above a certain threshold. Args: candset (DataFrame): The input candidate set of tuple pairs. l_overlap_attr (string): The overlap attribute in left table. r_overlap_attr (string): The overlap attribute in right table. rem_stop_words (boolean): A flag to indicate whether stop words (e.g., a, an, the) should be removed from the token sets of the overlap attribute values (defaults to False). q_val (int): The value of q to use if the overlap attributes values are to be tokenized as qgrams (defaults to None). word_level (boolean): A flag to indicate whether the overlap attributes should be tokenized as words (i.e, using whitespace as delimiter) (defaults to True). overlap_size (int): The minimum number of tokens that must overlap (defaults to 1). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple pair with missing value in either blocking attribute will be retained in the output candidate set. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_chunks (int): The number of partitions to split the candidate set. If it is set to -1, the number of partitions will be set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `l_overlap_attr` is not of type string. AssertionError: If `r_overlap_attr` is not of type string. AssertionError: If `q_val` is not of type int. AssertionError: If `word_level` is not of type boolean. AssertionError: If `overlap_size` is not of type int. AssertionError: If `verbose` is not of type boolean. AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_chunks` is not of type int. AssertionError: If `l_overlap_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. SyntaxError: If `q_val` is set to a valid value and `word_level` is set to True. SyntaxError: If `q_val` is set to None and `word_level` is set to False. 
Examples: >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> ob = DaskOverlapBlocker() >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name']) >>> D1 = ob.block_candset(C, 'name', 'name', allow_missing=True) # Include all possible tuple pairs with missing values >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True) # Execute blocking using multiple cores >>> D3 = ob.block_candset(C, 'name', 'name', n_chunks=-1) # Use q-gram tokenizer >>> D2 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2) """ logger.warning( "WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN " "RISK.") # Validate input parameters self.validate_types_params_candset(candset, verbose, show_progress, n_chunks) self.validate_types_other_params(l_overlap_attr, r_overlap_attr, rem_stop_words, q_val, word_level, overlap_size) # get and validate metadata log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset( candset, logger, verbose) # # validate metadata cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # validate overlap attrs self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr) # validate word_level and q_val self.validate_word_level_qval(word_level, q_val) # validate number of chunks validate_object_type(n_chunks, int, 'Parameter n_chunks') validate_chunks(n_chunks) # # do projection before merge l_df = ltable[[l_key, l_overlap_attr]] r_df = rtable[[r_key, r_overlap_attr]] # # set index for convenience l_df = l_df.set_index(l_key, drop=False) r_df = r_df.set_index(r_key, drop=False) # # case the overlap attribute to string if required. l_df.is_copy, r_df.is_copy = False, False # to avoid setwithcopy warning ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True) ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True) if word_level == True: tokenizer = WhitespaceTokenizer(return_set=True) else: tokenizer = QgramTokenizer(return_set=True) n_chunks = get_num_partitions(n_chunks, len(candset)) c_splits = pd.np.array_split(candset, n_chunks) valid_splits = [] # Create DAG for i in range(n_chunks): result = delayed(self._block_candset_split)(c_splits[i], l_df, r_df, l_key, r_key, l_overlap_attr, r_overlap_attr, fk_ltable, fk_rtable, allow_missing, rem_stop_words, tokenizer, overlap_size) valid_splits.append(result) valid_splits = delayed(wrap)(valid_splits) # Execute the DAG if show_progress: with ProgressBar(): valid_splits = valid_splits.compute(scheduler="processes", num_workers=get_num_cores()) else: valid_splits = valid_splits.compute(scheduler="processes", num_workers=get_num_cores()) valid = sum(valid_splits, []) # construct output table if len(candset) > 0: out_table = candset[valid] else: out_table = pd.DataFrame(columns=candset.columns) # update the catalog cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable, ltable, rtable) # return the output table return out_table
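# Usage sketch (illustrative, not part of the original source). It builds a
# candidate set on 'address' and then requires surviving pairs to also share
# at least one 'name' token. Paths, keys and attribute names are hypothetical.
import py_entitymatching as em
from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker

A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
ob = DaskOverlapBlocker()
C = ob.block_tables(A, B, 'address', 'address',
                    l_output_attrs=['name'], r_output_attrs=['name'])
D = ob.block_candset(C, 'name', 'name', word_level=True, overlap_size=1,
                     n_chunks=-1)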
def block_candset(self, candset, l_overlap_attr, r_overlap_attr, rem_stop_words=False, q_val=None, word_level=True, overlap_size=1, allow_missing=False, verbose=False, show_progress=True, n_jobs=1): """Blocks an input candidate set of tuple pairs based on the overlap of token sets of attribute values. Finds tuple pairs from an input candidate set of tuple pairs such that the overlap between (a) the set of tokens obtained by tokenizing the value of attribute l_overlap_attr of the left tuple in a tuple pair, and (b) the set of tokens obtained by tokenizing the value of attribute r_overlap_attr of the right tuple in the tuple pair, is above a certain threshold. Args: candset (DataFrame): The input candidate set of tuple pairs. l_overlap_attr (string): The overlap attribute in left table. r_overlap_attr (string): The overlap attribute in right table. rem_stop_words (boolean): A flag to indicate whether stop words (e.g., a, an, the) should be removed from the token sets of the overlap attribute values (defaults to False). q_val (int): The value of q to use if the overlap attributes values are to be tokenized as qgrams (defaults to None). word_level (boolean): A flag to indicate whether the overlap attributes should be tokenized as words (i.e, using whitespace as delimiter) (defaults to True). overlap_size (int): The minimum number of tokens that must overlap (defaults to 1). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple pair with missing value in either blocking attribute will be retained in the output candidate set. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus are the total number of CPUs in the machine).Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `l_overlap_attr` is not of type string. AssertionError: If `r_overlap_attr` is not of type string. AssertionError: If `q_val` is not of type int. AssertionError: If `word_level` is not of type boolean. AssertionError: If `overlap_size` is not of type int. AssertionError: If `verbose` is not of type boolean. AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_overlap_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. SyntaxError: If `q_val` is set to a valid value and `word_level` is set to True. SyntaxError: If `q_val` is set to None and `word_level` is set to False. 
Examples: >>> import py_entitymatching as em >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> ob = em.OverlapBlocker() >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name']) >>> D1 = ob.block_candset(C, 'name', 'name', allow_missing=True) # Include all possible tuple pairs with missing values >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True) # Execute blocking using multiple cores >>> D3 = ob.block_candset(C, 'name', 'name', n_jobs=-1) # Use q-gram tokenizer >>> D2 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2) """ # validate data types of standard input parameters self.validate_types_params_candset(candset, verbose, show_progress, n_jobs) # validate data types of input parameters specific to overlap blocker self.validate_types_other_params(l_overlap_attr, r_overlap_attr, rem_stop_words, q_val, word_level, overlap_size) # get and validate metadata log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset( candset, logger, verbose) # # validate metadata cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # validate overlap attrs self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr) # validate word_level and q_val self.validate_word_level_qval(word_level, q_val) # do blocking # # do projection before merge l_df = ltable[[l_key, l_overlap_attr]] r_df = rtable[[r_key, r_overlap_attr]] # # case the overlap attribute to string if required. l_df.is_copy, r_df.is_copy = False, False # to avoid setwithcopy warning ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True) ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True) # # cleanup the tables from non-ascii characters, punctuations, and stop words self.cleanup_table(l_df, l_overlap_attr, rem_stop_words) self.cleanup_table(r_df, r_overlap_attr, rem_stop_words) # # determine which tokenizer to use if word_level == True: # # # create a whitespace tokenizer tokenizer = WhitespaceTokenizer(return_set=True) else: # # # create a qgram tokenizer tokenizer = QgramTokenizer(qval=q_val, return_set=True) # # create a filter for overlap similarity join overlap_filter = OverlapFilter(tokenizer, overlap_size, allow_missing=allow_missing) # # perform overlap similarity filtering of the candset out_table = overlap_filter.filter_candset(candset, fk_ltable, fk_rtable, l_df, r_df, l_key, r_key, l_overlap_attr, r_overlap_attr, n_jobs, show_progress=show_progress) # update catalog cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable, ltable, rtable) # return candidate set return out_table
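# Usage sketch (illustrative, not part of the original source). Same pattern
# with the multiprocessing-based OverlapBlocker: filter an existing candidate
# set with a q-gram overlap condition on 'name'. Paths, keys and attribute
# names are hypothetical.
import py_entitymatching as em

A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
ob = em.OverlapBlocker()
C = ob.block_tables(A, B, 'address', 'address',
                    l_output_attrs=['name'], r_output_attrs=['name'])
D = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2,
                     overlap_size=2, n_jobs=-1)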
def extract_feature_vecs(candset, attrs_before=None, feature_table=None, attrs_after=None, verbose=False, show_progress=True): """ This function extracts feature vectors from a DataFrame (typically a labeled candidate set). Specifically, this function uses feature table, ltable and rtable (that is present in the `candset`'s metadata) to extract feature vectors. Args: candset (DataFrame): The input candidate set for which the features vectors should be extracted. attrs_before (list): The list of attributes from the input candset, that should be added before the feature vectors (defaults to None). feature_table (DataFrame): A DataFrame containing a list of features that should be used to compute the feature vectors ( defaults to None). attrs_after (list): The list of attributes from the input candset that should be added after the feature vectors (defaults to None). verbose (boolean): A flag to indicate whether the debug information should be displayed (defaults to False). show_progress (boolean): A flag to indicate whether the progress of extracting feature vectors must be displayed (defaults to True). Returns: A pandas DataFrame containing feature vectors. The DataFrame will have metadata ltable and rtable, pointing to the same ltable and rtable as the input candset. Also, the output DataFrame will have three columns: key, foreign key ltable, foreign key rtable copied from input candset to the output DataFrame. These three columns precede the columns mentioned in `attrs_before`. Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `attrs_before` has attributes that are not present in the input candset. AssertionError: If `attrs_after` has attribtues that are not present in the input candset. AssertionError: If `feature_table` is set to None. """ # Validate input parameters # # We expect the input candset to be of type pandas DataFrame. 
if not isinstance(candset, pd.DataFrame): logger.error('Input cand.set is not of type dataframe') raise AssertionError('Input cand.set is not of type dataframe') # # If the attrs_before is given, Check if the attrs_before are present in # the input candset if attrs_before != None: if not ch.check_attrs_present(candset, attrs_before): logger.error( 'The attributes mentioned in attrs_before is not present ' 'in the input table') raise AssertionError( 'The attributes mentioned in attrs_before is not present ' 'in the input table') # # If the attrs_after is given, Check if the attrs_after are present in # the input candset if attrs_after != None: if not ch.check_attrs_present(candset, attrs_after): logger.error( 'The attributes mentioned in attrs_after is not present ' 'in the input table') raise AssertionError( 'The attributes mentioned in attrs_after is not present ' 'in the input table') # We expect the feature table to be a valid object if feature_table is None: logger.error('Feature table cannot be null') raise AssertionError('The feature table cannot be null') # Do metadata checking # # Mention what metadata is required to the user ch.log_info( logger, 'Required metadata: cand.set key, fk ltable, ' 'fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # Get metadata ch.log_info(logger, 'Getting metadata from catalog', verbose) key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \ cm.get_metadata_for_candset( candset, logger, verbose) # # Validate metadata ch.log_info(logger, 'Validating metadata', verbose) cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # Extract features # id_list = [(row[fk_ltable], row[fk_rtable]) for i, row in # candset.iterrows()] # id_list = [tuple(tup) for tup in candset[[fk_ltable, fk_rtable]].values] # # Set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) if show_progress: prog_bar = pyprind.ProgBar(len(candset)) # # Apply feature functions feat_vals = [] ch.log_info(logger, 'Applying feature functions', verbose) col_names = list(candset.columns) fk_ltable_idx = col_names.index(fk_ltable) fk_rtable_idx = col_names.index(fk_rtable) l_dict = {} r_dict = {} for row in candset.itertuples(index=False): if show_progress: prog_bar.update() fk_ltable_val = row[fk_ltable_idx] fk_rtable_val = row[fk_rtable_idx] if fk_ltable_val not in l_dict: l_dict[fk_ltable_val] = l_df.ix[fk_ltable_val] l_tuple = l_dict[fk_ltable_val] if fk_rtable_val not in r_dict: r_dict[fk_rtable_val] = r_df.ix[fk_rtable_val] r_tuple = r_dict[fk_rtable_val] f = apply_feat_fns(l_tuple, r_tuple, feature_table) feat_vals.append(f) # Construct output table feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values) # # Rearrange the feature names in the input feature table order feature_names = list(feature_table['feature_name']) feature_vectors = feature_vectors[feature_names] ch.log_info(logger, 'Constructing output table', verbose) # print(feature_vectors) # # Insert attrs_before if attrs_before: if not isinstance(attrs_before, list): attrs_before = [attrs_before] attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable]) attrs_before.reverse() for a in attrs_before: feature_vectors.insert(0, a, candset[a]) # # Insert keys feature_vectors.insert(0, fk_rtable, candset[fk_rtable]) feature_vectors.insert(0, fk_ltable, candset[fk_ltable]) feature_vectors.insert(0, key, candset[key]) # # insert attrs after if attrs_after: if not 
isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # Reset the index
    # feature_vectors.reset_index(inplace=True, drop=True)

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
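# Usage sketch (illustrative, not part of the original source). It reads a
# hypothetical, already-labeled candidate set ('labeled_pairs.csv' with a
# 'gold' column and candset metadata), generates matching features for tables
# A and B, and extracts feature vectors keeping the label after the vectors.
import py_entitymatching as em

A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
G = em.read_csv_metadata('path_to_csv_dir/labeled_pairs.csv', key='_id',
                         ltable=A, rtable=B,
                         fk_ltable='ltable_ID', fk_rtable='rtable_ID')
feature_table = em.get_features_for_matching(A, B)
H = em.extract_feature_vecs(G, feature_table=feature_table,
                            attrs_after=['gold'], show_progress=False)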
def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr, rem_stop_words=False, q_val=None, word_level=True, overlap_size=1, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', allow_missing=False, verbose=False, show_progress=True, n_jobs=1): """ Blocks two tables based on the overlap of token sets of attribute values. Finds tuple pairs from left and right tables such that the overlap between (a) the set of tokens obtained by tokenizing the value of attribute l_overlap_attr of a tuple from the left table, and (b) the set of tokens obtained by tokenizing the value of attribute r_overlap_attr of a tuple from the right table, is above a certain threshold. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_overlap_attr (string): The overlap attribute in left table. r_overlap_attr (string): The overlap attribute in right table. rem_stop_words (boolean): A flag to indicate whether stop words (e.g., a, an, the) should be removed from the token sets of the overlap attribute values (defaults to False). q_val (int): The value of q to use if the overlap attributes values are to be tokenized as qgrams (defaults to None). word_level (boolean): A flag to indicate whether the overlap attributes should be tokenized as words (i.e, using whitespace as delimiter) (defaults to True). overlap_size (int): The minimum number of tokens that must overlap (defaults to 1). l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple in ltable with missing value in the blocking attribute will be matched with every tuple in rtable and vice versa. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is the total number of CPUs in the machine). Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_overlap_attr` is not of type string. AssertionError: If `r_overlap_attr` is not of type string. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. 
AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `q_val` is not of type int. AssertionError: If `word_level` is not of type boolean. AssertionError: If `overlap_size` is not of type int. AssertionError: If `verbose` is not of type boolean. AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_overlap_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. AssertionError: If `l_output_attrs` are not in the ltable. AssertionError: If `r_output_attrs` are not in the rtable. SyntaxError: If `q_val` is set to a valid value and `word_level` is set to True. SyntaxError: If `q_val` is set to None and `word_level` is set to False. Examples: >>> import py_entitymatching as em >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> ob = em.OverlapBlocker() # Use word-level tokenizer >>> C1 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=True, overlap_size=1) # Use q-gram tokenizer >>> C2 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=False, q_val=2) # Include all possible missing values >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True) # Use all the cores in the machine >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], n_jobs=-1) """ # validate data types of standard input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, n_jobs) # validate data types of input parameters specific to overlap blocker self.validate_types_other_params(l_overlap_attr, r_overlap_attr, rem_stop_words, q_val, word_level, overlap_size) # validate data type of allow_missing self.validate_allow_missing(allow_missing) # validate data type of show_progress self.validate_show_progress(show_progress) # validate overlap attributes self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr) # validate output attributes self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate required metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # validate word_level and q_val self.validate_word_level_qval(word_level, q_val) # do blocking # # do projection before merge l_proj_attrs = self.get_attrs_to_project(l_key, l_overlap_attr, l_output_attrs) l_df = ltable[l_proj_attrs] r_proj_attrs = self.get_attrs_to_project(r_key, r_overlap_attr, r_output_attrs) r_df = rtable[r_proj_attrs] # # case the column to string if required. 
l_df.is_copy, r_df.is_copy = False, False # to avoid setwithcopy warning ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True) ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True) # # cleanup the tables from non-ascii characters, punctuations, and stop words l_dummy_overlap_attr = '@#__xx__overlap_ltable__#@' r_dummy_overlap_attr = '@#__xx__overlap_rtable__#@' l_df[l_dummy_overlap_attr] = l_df[l_overlap_attr] r_df[r_dummy_overlap_attr] = r_df[r_overlap_attr] if not l_df.empty: self.cleanup_table(l_df, l_dummy_overlap_attr, rem_stop_words) if not r_df.empty: self.cleanup_table(r_df, r_dummy_overlap_attr, rem_stop_words) # # determine which tokenizer to use if word_level == True: # # # create a whitespace tokenizer tokenizer = WhitespaceTokenizer(return_set=True) else: # # # create a qgram tokenizer tokenizer = QgramTokenizer(qval=q_val, return_set=True) # # perform overlap similarity join candset = overlap_join(l_df, r_df, l_key, r_key, l_dummy_overlap_attr, r_dummy_overlap_attr, tokenizer, overlap_size, '>=', allow_missing, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, False, n_jobs, show_progress) # # retain only the required attributes in the output candidate set retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix) candset = candset[retain_cols] # update metadata in the catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return the candidate set return candset
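# Usage sketch (illustrative, not part of the original source). Word-level
# overlap blocking on 'address' with stop words removed, run on all cores.
# Paths, keys and attribute names are hypothetical.
import py_entitymatching as em

A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
ob = em.OverlapBlocker()
C = ob.block_tables(A, B, 'address', 'address', word_level=True,
                    overlap_size=2, rem_stop_words=True,
                    l_output_attrs=['name'], r_output_attrs=['name'],
                    n_jobs=-1)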
def block_candset(self, candset, verbose=False, show_progress=True, n_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK Blocks an input candidate set of tuple pairs based on a sequence of blocking rules supplied by the user. Finds tuple pairs from an input candidate set of tuple pairs that survive the sequence of blocking rules. A tuple pair survives the sequence of blocking rules if none of the rules in the sequence returns True for that pair. If any of the rules returns True, then the pair is blocked (dropped). Args: candset (DataFrame): The input candidate set of tuple pairs. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_chunks (int): The number of partitions to split the candidate set. If it is set to -1, the number of partitions will be set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `verbose` is not of type boolean. AssertionError: If `n_chunks` is not of type int. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `l_block_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. AssertionError: If there are no rules to apply. Examples: >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_rule_based_blocker import DaskRuleBasedBlocker >>> rb = DaskRuleBasedBlocker() >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id') >>> block_f = em.get_features_for_blocking(A, B) >>> rule = ['name_name_lev(ltuple, rtuple) > 3'] >>> rb.add_rule(rule, feature_table=block_f) >>> D = rb.block_tables(C) # C is the candidate set. """ logger.warning( "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK." ) # validate data types of input parameters self.validate_types_params_candset(candset, verbose, show_progress, n_chunks) # get and validate metadata log_info( logger, 'Required metadata: cand.set key, fk ltable, ' + 'fk rtable, ltable, rtable, ltable key, rtable key', verbose) # # get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset( candset, logger, verbose) # # validate metadata cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # validate rules assert len(self.rules.keys()) > 0, 'There are no rules to apply' # validate n_chunks parameter validate_object_type(n_chunks, int, 'Parameter n_chunks') validate_chunks(n_chunks) n_chunks = get_num_partitions(n_chunks, len(candset)) # do blocking # # initialize the progress bar # if show_progress: # bar = pyprind.ProgBar(len(candset)) # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # get attributes to project l_proj_attrs, r_proj_attrs = self.get_attrs_to_project( l_key, r_key, [], []) l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs] c_df = self.block_candset_excluding_rule(candset, l_df, r_df, l_key, r_key, fk_ltable, fk_rtable, None, show_progress, n_chunks) # update catalog cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable, ltable, rtable) # return candidate set return c_df
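# Usage sketch (illustrative, not part of the original source). It registers
# a rule over auto-generated blocking features and uses block_candset to
# prune a candidate set built by another blocker. Paths, keys, the 'name'
# attribute and the feature name 'name_name_lev' are hypothetical.
import py_entitymatching as em
from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker
from py_entitymatching.dask.dask_rule_based_blocker import DaskRuleBasedBlocker

A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
ob = DaskOverlapBlocker()
C = ob.block_tables(A, B, 'name', 'name',
                    l_output_attrs=['name'], r_output_attrs=['name'])
block_f = em.get_features_for_blocking(A, B)
rb = DaskRuleBasedBlocker()
rb.add_rule(['name_name_lev(ltuple, rtuple) > 3'], feature_table=block_f)
D = rb.block_candset(C, n_chunks=-1)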
def sample_table(table, sample_size, replace=False, verbose=False): """ Samples a candidate set of tuple pairs (for labeling purposes). This function samples a DataFrame, typically used for labeling purposes. This function expects the input DataFrame containing the metadata of a candidate set (such as key, fk_ltable, fk_rtable, ltable, rtable). Specifically, this function creates a copy of the input DataFrame, samples the data using uniform random sampling (uses 'random' function from numpy to sample) and returns the sampled DataFrame. Further, also copies the properties from the input DataFrame to the output DataFrame. Args: table (DataFrame): The input DataFrame to be sampled. Specifically, a DataFrame containing the metadata of a candidate set (such as key, fk_ltable, fk_rtable, ltable, rtable) in the catalog. sample_size (int): The number of samples to be picked from the input DataFrame. replace (boolean): A flag to indicate whether sampling should be done with replacement or not (defaults to False). verbose (boolean): A flag to indicate whether more detailed information about the execution steps should be printed out (defaults to False). Returns: A new DataFrame with 'sample_size' number of rows. Further, this function sets the output DataFrame's properties same as input DataFrame. Raises: AssertionError: If `table` is not of type pandas DataFrame. AssertionError: If the size of `table` is 0. AssertionError: If the `sample_size` is greater than the input DataFrame size. Examples: >>> import py_entitymatching as em >>> S = em.sample_table(C, sample_size=450) # C is the candidate set to be sampled from. Note: As mentioned in the above description, the output DataFrame is updated (in the catalog) with the properties from the input DataFrame. A subtle point to note here is, when the replace flag is set to True, then the output DataFrame can contain duplicate keys. In that case, this function will not set the key and it is up to the user to fix it after the function returns. """ # Validate input parameters. # # The input DataFrame is expected to be of type pandas DataFrame. 
validate_object_type(table, pd.DataFrame) # # There should be at least one row to sample from if len(table) == 0: logger.error('Size of the input table is 0') raise AssertionError('Size of the input table is 0') # # The sample size should be less than or equal to the number of rows in # the input DataFrame if len(table) < sample_size: logger.error('Sample size is larger than the input table size') raise AssertionError('Sample size is larger than the input table size') # Now, validate the metadata for the input DataFrame as we have to copy # these properties to the output DataFrame # # First, display what metadata is required for this function ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, ' 'fk rtable, ltable, rtable, ltable key, rtable key', verbose) # # Second, get the metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \ cm.get_metadata_for_candset(table, logger, verbose) # # Third, validate the metadata cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # Get the sample set for the output table sample_indices = pd.np.random.choice(len(table), sample_size, replace=replace) # Sort the sampled indices in ascending order sample_indices = sorted(sample_indices) sampled_table = table.iloc[list(sample_indices)] # Copy the properties cm.init_properties(sampled_table) # # If replace is set to True, then we should check the validity # of the key before setting it if replace: properties = cm.get_all_properties(table) for property_name, property_value in six.iteritems(properties): if property_name == 'key': # Check for the validity of key before setting it cm.set_key(sampled_table, property_value) else: # Copy the other properties as is cm.set_property(sampled_table, property_name, property_value) else: cm.copy_properties(table, sampled_table) # Return the sampled table return sampled_table
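# A short, hedged sketch of the typical sampling workflow: block, sample, then
# label. It assumes C is a blocker output whose candset metadata is already in
# the catalog and that C has at least 450 rows.
import py_entitymatching as em

S = em.sample_table(C, sample_size=450)            # uniform sampling, no replacement
# S inherits C's key/fk_ltable/fk_rtable/ltable/rtable properties, so it can be
# handed straight to the labeler.
G = em.label_table(S, label_column_name='gold_labels')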
def get_false_negatives_as_df(table, eval_summary, verbose=False): """ Select only the false negatives from the input table and return as a DataFrame based on the evaluation results. Args: table (DataFrame): The input table (pandas DataFrame) that was used for evaluation. eval_summary (dictionary): A Python dictionary containing evaluation results, typically from 'eval_matches' command. Returns: A pandas DataFrame containing only the false negatives from the input table. Further, this function sets the output DataFrame's properties same as input DataFrame. Examples: >>> import py_entitymatching as em >>> # G is the labeled data used for development purposes, match_f is the feature table >>> H = em.extract_feature_vecs(G, feature_table=match_f, attrs_after='gold_labels') >>> dt = em.DTMatcher() >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels') >>> pred_table = dt.predict(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], append=True, target_attr='predicted_labels') >>> eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels') >>> false_neg_df = em.get_false_negatives_as_df(H, eval_summary) """ # Validate input parameters # # We expect the input candset to be of type pandas DataFrame. validate_object_type(table, pd.DataFrame, error_prefix='Input cand.set') # Do metadata checking # # Mention what metadata is required to the user ch.log_info( logger, 'Required metadata: cand.set key, fk ltable, ' 'fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # Get metadata ch.log_info(logger, 'Getting metadata from the catalog', verbose) key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \ cm.get_metadata_for_candset( table, logger, verbose) # # Validate metadata ch.log_info(logger, 'Validating metadata', verbose) cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) data_frame = _get_dataframe(table, eval_summary['false_neg_ls']) # # Update catalog ch.log_info(logger, 'Updating catalog', verbose) cm.init_properties(data_frame) cm.copy_properties(table, data_frame) # # Return the dataframe ch.log_info(logger, 'Returning the dataframe', verbose) return data_frame
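# A hedged follow-up to the docstring example above: the symmetric helper for
# false positives works the same way, and both outputs keep H's candset metadata
# (H and eval_summary are the objects from that example).
import py_entitymatching as em

fn_df = em.get_false_negatives_as_df(H, eval_summary)
fp_df = em.get_false_positives_as_df(H, eval_summary)
print(len(fn_df), len(fp_df))          # how many pairs the matcher got wrong
print(em.get_key(fn_df))               # metadata was copied from H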
def block_candset(self, candset, l_overlap_attr, r_overlap_attr, rem_stop_words=False, q_val=None, word_level=True, overlap_size=1, allow_missing=False, verbose=False, show_progress=True, n_chunks=-1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. Blocks an input candidate set of tuple pairs based on the overlap of token sets of attribute values. Finds tuple pairs from an input candidate set of tuple pairs such that the overlap between (a) the set of tokens obtained by tokenizing the value of attribute l_overlap_attr of the left tuple in a tuple pair, and (b) the set of tokens obtained by tokenizing the value of attribute r_overlap_attr of the right tuple in the tuple pair, is above a certain threshold. Args: candset (DataFrame): The input candidate set of tuple pairs. l_overlap_attr (string): The overlap attribute in left table. r_overlap_attr (string): The overlap attribute in right table. rem_stop_words (boolean): A flag to indicate whether stop words (e.g., a, an, the) should be removed from the token sets of the overlap attribute values (defaults to False). q_val (int): The value of q to use if the overlap attributes values are to be tokenized as qgrams (defaults to None). word_level (boolean): A flag to indicate whether the overlap attributes should be tokenized as words (i.e, using whitespace as delimiter) (defaults to True). overlap_size (int): The minimum number of tokens that must overlap (defaults to 1). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple pair with missing value in either blocking attribute will be retained in the output candidate set. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_chunks (int): The number of partitions to split the candidate set. If it is set to -1, the number of partitions will be set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `l_overlap_attr` is not of type string. AssertionError: If `r_overlap_attr` is not of type string. AssertionError: If `q_val` is not of type int. AssertionError: If `word_level` is not of type boolean. AssertionError: If `overlap_size` is not of type int. AssertionError: If `verbose` is not of type boolean. AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_chunks` is not of type int. AssertionError: If `l_overlap_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. SyntaxError: If `q_val` is set to a valid value and `word_level` is set to True. SyntaxError: If `q_val` is set to None and `word_level` is set to False. 
Examples: >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> ob = DaskOverlapBlocker() >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name']) >>> D1 = ob.block_candset(C, 'name', 'name') # Include all possible tuple pairs with missing values >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True) # Execute blocking using multiple cores >>> D3 = ob.block_candset(C, 'name', 'name', n_chunks=-1) # Use q-gram tokenizer >>> D4 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2) """ logger.warning( "WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN " "RISK.") # Validate input parameters self.validate_types_params_candset(candset, verbose, show_progress, n_chunks) self.validate_types_other_params(l_overlap_attr, r_overlap_attr, rem_stop_words, q_val, word_level, overlap_size) # get and validate metadata log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset( candset, logger, verbose) # # validate metadata cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # validate overlap attrs self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr) # validate word_level and q_val self.validate_word_level_qval(word_level, q_val) # validate number of chunks validate_object_type(n_chunks, int, 'Parameter n_chunks') validate_chunks(n_chunks) # # do projection before merge l_df = ltable[[l_key, l_overlap_attr]] r_df = rtable[[r_key, r_overlap_attr]] # # set index for convenience l_df = l_df.set_index(l_key, drop=False) r_df = r_df.set_index(r_key, drop=False) # # cast the overlap attribute to string if required. l_df.is_copy, r_df.is_copy = False, False # to avoid setwithcopy warning ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True) ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True) if word_level == True: tokenizer = WhitespaceTokenizer(return_set=True) else: tokenizer = QgramTokenizer(qval=q_val, return_set=True) n_chunks = get_num_partitions(n_chunks, len(candset)) c_splits = pd.np.array_split(candset, n_chunks) valid_splits = [] # Create DAG for i in range(n_chunks): result = delayed(self._block_candset_split)(c_splits[i], l_df, r_df, l_key, r_key, l_overlap_attr, r_overlap_attr, fk_ltable, fk_rtable, allow_missing, rem_stop_words, tokenizer, overlap_size) valid_splits.append(result) valid_splits = delayed(wrap)(valid_splits) # Execute the DAG if show_progress: with ProgressBar(): valid_splits = valid_splits.compute(scheduler="processes", num_workers=get_num_cores()) else: valid_splits = valid_splits.compute(scheduler="processes", num_workers=get_num_cores()) valid = sum(valid_splits, []) # construct output table if len(candset) > 0: out_table = candset[valid] else: out_table = pd.DataFrame(columns=candset.columns) # update the catalog cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable, ltable, rtable) # return the output table return out_table
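# The body above follows a split -> delayed -> compute pattern. A stripped-down,
# hedged sketch of that pattern; keep_pair and process_split are stand-ins for
# the real per-pair overlap test and the _block_candset_split helper, and the
# threaded scheduler is used here for simplicity (the library uses processes).
import numpy as np
import pandas as pd
from dask import delayed
from dask.diagnostics import ProgressBar

def keep_pair(row):
    return True                              # stand-in for the overlap check

def process_split(split):
    # one boolean per row of the split, in row order
    return [keep_pair(row) for _, row in split.iterrows()]

def filter_candset(candset, n_chunks):
    splits = np.array_split(candset, n_chunks)
    parts = [delayed(process_split)(s) for s in splits]
    merged = delayed(sum)(parts, [])         # concatenate the per-split lists
    with ProgressBar():
        valid = merged.compute(scheduler='threads')
    return candset[valid] if len(candset) > 0 else candset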
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', verbose=False, show_progress=True, n_jobs=1): """ Blocks two tables based on the sequence of rules supplied by the user. Finds tuple pairs from left and right tables that survive the sequence of blocking rules. A tuple pair survives the sequence of blocking rules if none of the rules in the sequence returns True for that pair. If any of the rules returns True, then the pair is blocked. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is the total number of CPUs in the machine).Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived the sequence of blocking rules (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If the input `l_output_prefix` is not of type string. AssertionError: If the input `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. AssertionError: If there are no rules to apply. 
Examples: >>> import py_entitymatching as em >>> rb = em.RuleBasedBlocker() >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id') >>> block_f = em.get_features_for_blocking(A, B) >>> rule = ['name_name_lev(ltuple, rtuple) > 3'] >>> rb.add_rule(rule, feature_table=block_f) >>> C = rb.block_tables(A, B) """ # validate data types of input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, n_jobs) # validate data type of show_progress self.validate_show_progress(show_progress) # validate input parameters self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # validate rules assert len(self.rules.keys()) > 0, 'There are no rules to apply' # do blocking # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # remove l_key from l_output_attrs and r_key from r_output_attrs l_output_attrs_1 = [] if l_output_attrs: l_output_attrs_1 = [x for x in l_output_attrs if x != l_key] r_output_attrs_1 = [] if r_output_attrs: r_output_attrs_1 = [x for x in r_output_attrs if x != r_key] # # get attributes to project l_proj_attrs, r_proj_attrs = self.get_attrs_to_project( l_key, r_key, l_output_attrs_1, r_output_attrs_1) l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs] candset, rule_applied = self.block_tables_with_filters( l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, verbose, show_progress, n_jobs) if candset is None: # no filterable rule was applied candset = self.block_tables_without_filters( l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, verbose, show_progress, n_jobs) elif len(self.rules) > 1: # one filterable rule was applied but other rules are left # block candset by applying other rules and excluding the applied rule candset = self.block_candset_excluding_rule( candset, l_df, r_df, l_key, r_key, l_output_prefix + l_key, r_output_prefix + r_key, rule_applied, show_progress, n_jobs) retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix) if len(candset) > 0: candset = candset[retain_cols] else: candset = pd.DataFrame(columns=retain_cols) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset
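# Rules behave as a disjunction, while the conjuncts inside one rule are ANDed:
# a pair is blocked if every conjunct of at least one rule is True. A hedged
# sketch (the feature names name_name_lev and zipcode_zipcode_exm are assumed to
# exist in block_f; adjust to the features generated for your tables).
import py_entitymatching as em

rb = em.RuleBasedBlocker()
block_f = em.get_features_for_blocking(A, B)

# Rule 1: block only if the name distance is large AND the zipcodes differ.
rb.add_rule(['name_name_lev(ltuple, rtuple) > 3',
             'zipcode_zipcode_exm(ltuple, rtuple) == 0'],
            feature_table=block_f)
# Rule 2: block whenever the name distance is very large.
rb.add_rule(['name_name_lev(ltuple, rtuple) > 6'], feature_table=block_f)

C = rb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'],
                    show_progress=True, n_jobs=1)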
def block_candset(self, candset, verbose=True, show_progress=True, n_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. Blocks an input candidate set of tuple pairs based on a black box blocking function specified by the user. Finds tuple pairs from an input candidate set of tuple pairs that survive the black box function. A tuple pair survives the black box blocking function if the function returns False for that pair, otherwise the tuple pair is dropped. Args: candset (DataFrame): The input candidate set of tuple pairs. verbose (boolean): A flag to indicate whether logging should be done (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_chunks (int): The number of partitions to split the candidate set. If it is set to -1, the number of partitions will be set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `verbose` is not of type boolean. AssertionError: If `n_chunks` is not of type int. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `l_block_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. Examples: >>> def match_last_name(ltuple, rtuple): # assume that there is a 'name' attribute in the input tables # and each value in it has two words l_last_name = ltuple['name'].split()[1] r_last_name = rtuple['name'].split()[1] if l_last_name != r_last_name: return True else: return False >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_black_box_blocker import DaskBlackBoxBlocker >>> bb = DaskBlackBoxBlocker() >>> bb.set_black_box_function(match_last_name) >>> D = bb.block_candset(C) # C is an output from block_tables """ logger.warning( "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. 
USE AT YOUR OWN RISK.") # validate data types of standard input parameters self.validate_types_params_candset(candset, verbose, show_progress, n_chunks) # validate black box function assert self.black_box_function is not None, 'Black box function is not set' # get and validate metadata log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset( candset, logger, verbose) # # validate metadata cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) validate_object_type(n_chunks, int, 'Parameter n_chunks') validate_chunks(n_chunks) # do blocking # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # project candset to keep only the ID attributes c_df = candset[[key, fk_ltable, fk_rtable]] # # determine the number of chunks to process in parallel n_chunks = get_num_partitions(n_chunks, len(candset)) # # pickle the black-box function before passing it as an arg to # # _block_candset_split to be executed by each child process black_box_function_pkl = cp.dumps(self.black_box_function) valid = [] if n_chunks == 1: # single process valid = _block_candset_split(c_df, l_df, r_df, l_key, r_key, fk_ltable, fk_rtable, black_box_function_pkl, show_progress) else: # multiprocessing c_splits = pd.np.array_split(c_df, n_chunks) valid_splits = [] for i in range(len(c_splits)): partial_result = delayed(_block_candset_split)(c_splits[i], l_df, r_df, l_key, r_key, fk_ltable, fk_rtable, black_box_function_pkl, False) valid_splits.append(partial_result) valid_splits = delayed(wrap)(valid_splits) if show_progress: with ProgressBar(): valid_splits = valid_splits.compute(scheduler="processes", num_workers=get_num_cores()) else: valid_splits = valid_splits.compute(scheduler="processes", num_workers=get_num_cores()) valid = sum(valid_splits, []) # construct output table if len(c_df) > 0: c_df = candset[valid] else: c_df = pd.DataFrame(columns=candset.columns) # update catalog cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable, ltable, rtable) # return candidate set return c_df
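# The body above ships the user function to worker processes as a cloudpickle
# payload (cp.dumps). A hedged sketch of that round trip, assuming cp refers to
# the cloudpickle module as in this file's imports:
import cloudpickle as cp

def match_last_name(ltuple, rtuple):
    # return True to block the pair (different last tokens of 'name')
    return ltuple['name'].split()[-1] != rtuple['name'].split()[-1]

payload = cp.dumps(match_last_name)        # what block_candset passes to workers
restored = cp.loads(payload)               # what _block_candset_split does
print(restored({'name': 'Ann Smith'}, {'name': 'Bo Smith'}))   # False -> pair survives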
def block_tables(self, ltable, rtable, l_block_attr, r_block_attr, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', allow_missing=False, verbose=False, n_jobs=1): """Blocks two tables based on attribute equivalence. Conceptually, this will check `l_block_attr=r_block_attr` for each tuple pair from the Cartesian product of tables `ltable` and `rtable`. It outputs a Pandas dataframe object with tuple pairs that satisfy the equality condition. The dataframe will include attributes '_id', key attribute from ltable, key attributes from rtable, followed by lists `l_output_attrs` and `r_output_attrs` if they are specified. Each of these output and key attributes will be prefixed with given `l_output_prefix` and `r_output_prefix`. If `allow_missing` is set to `True` then all tuple pairs with missing value in at least one of the tuples will be included in the output dataframe. Further, this will update the following metadata in the catalog for the output table: (1) key, (2) ltable, (3) rtable, (4) fk_ltable, and (5) fk_rtable. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_block_attr (string): The blocking attribute in left table. r_block_attr (string): The blocking attribute in right table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple in ltable with missing value in the blocking attribute will be matched with every tuple in rtable and vice versa. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is the total number of CPUs in the machine). Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_block_attr` is not of type string. AssertionError: If `r_block_attr` is not of type string. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. 
AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_block_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. Examples: >>> import py_entitymatching as em >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> ab = em.AttrEquivalenceBlocker() >>> C1 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name']) # Include all possible tuple pairs with missing values >>> C2 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True) """ # validate data types of input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, n_jobs) # validate data types of input blocking attributes self.validate_types_block_attrs(l_block_attr, r_block_attr) # validate data type of allow_missing self.validate_allow_missing(allow_missing) # validate input parameters self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr) self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate required metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # do blocking # # do projection of required attributes from the tables l_proj_attrs = self.get_attrs_to_project(l_key, l_block_attr, l_output_attrs) ltable_proj = ltable[l_proj_attrs] r_proj_attrs = self.get_attrs_to_project(r_key, r_block_attr, r_output_attrs) rtable_proj = rtable[r_proj_attrs] # # remove records with nans in the blocking attribute l_df = rem_nan(ltable_proj, l_block_attr) r_df = rem_nan(rtable_proj, r_block_attr) # # determine number of processes to launch parallely n_procs = self.get_num_procs(n_jobs, len(l_df) * len(r_df)) if n_procs <= 1: # single process candset = _block_tables_split(l_df, r_df, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, allow_missing) else: # multiprocessing m, n = self.get_split_params(n_procs, len(l_df), len(r_df)) l_splits = np.array_split(l_df, m) r_splits = np.array_split(r_df, n) c_splits = Parallel(n_jobs=m * n)(delayed(_block_tables_split)( l, r, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, allow_missing) for l in l_splits for r in r_splits) candset = pd.concat(c_splits, ignore_index=True) # if allow_missing flag is True, then compute # all pairs with missing value in left table, and # all pairs with missing value in right table if allow_missing: missing_pairs = self.get_pairs_with_missing_value( ltable_proj, rtable_proj, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix) candset = pd.concat([candset, missing_pairs], ignore_index=True) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, 
rtable) # return candidate set return candset
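# The n_jobs convention described in the docstring reduces to a little
# arithmetic. get_effective_jobs is a hypothetical helper written here only to
# illustrate that convention; it is not part of py_entitymatching.
import multiprocessing

def get_effective_jobs(n_jobs):
    n_cpus = multiprocessing.cpu_count()
    if n_jobs == -1:
        return n_cpus                      # all CPUs
    if n_jobs in (0, 1):
        return 1                           # no parallelism (debug friendly)
    if n_jobs < -1:
        candidate = n_cpus + 1 + n_jobs    # e.g. -2 -> all CPUs but one
        return candidate if candidate >= 1 else 1
    return n_jobs

print(get_effective_jobs(-2))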
def block_tables(self, ltable, rtable, l_block_attr, r_block_attr, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', allow_missing=False, verbose=False, n_ltable_chunks=1, n_rtable_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK Blocks two tables based on attribute equivalence. Conceptually, this will check `l_block_attr=r_block_attr` for each tuple pair from the Cartesian product of tables `ltable` and `rtable`. It outputs a Pandas dataframe object with tuple pairs that satisfy the equality condition. The dataframe will include attributes '_id', key attribute from ltable, key attributes from rtable, followed by lists `l_output_attrs` and `r_output_attrs` if they are specified. Each of these output and key attributes will be prefixed with given `l_output_prefix` and `r_output_prefix`. If `allow_missing` is set to `True` then all tuple pairs with missing value in at least one of the tuples will be included in the output dataframe. Further, this will update the following metadata in the catalog for the output table: (1) key, (2) ltable, (3) rtable, (4) fk_ltable, and (5) fk_rtable. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_block_attr (string): The blocking attribute in left table. r_block_attr (string): The blocking attribute in right table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple in ltable with missing value in the blocking attribute will be matched with every tuple in rtable and vice versa. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). n_ltable_chunks (int): The number of partitions to split the left table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. n_rtable_chunks (int): The number of partitions to split the right table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_block_attr` is not of type string. AssertionError: If `r_block_attr` is not of type string. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. 
AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `n_ltable_chunks` is not of type int. AssertionError: If `n_rtable_chunks` is not of type int. AssertionError: If `l_block_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. Examples: >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker >>> ab = DaskAttrEquivalenceBlocker() >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> C1 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name']) # Include all possible tuple pairs with missing values >>> C2 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True) """ logger.warning("WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR " "OWN RISK.") # validate data types of input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, 1) # last arg is # set to 1 just to reuse the function from the # old blocker. # validate data types of input blocking attributes self.validate_types_block_attrs(l_block_attr, r_block_attr) # validate data type of allow_missing self.validate_allow_missing(allow_missing) # validate input parameters self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr) self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # validate number of ltable and rtable chunks validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks') validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks') validate_chunks(n_ltable_chunks) validate_chunks(n_rtable_chunks) # get and validate required metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # do blocking # # do projection of required attributes from the tables l_proj_attrs = self.get_attrs_to_project(l_key, l_block_attr, l_output_attrs) ltable_proj = ltable[l_proj_attrs] r_proj_attrs = self.get_attrs_to_project(r_key, r_block_attr, r_output_attrs) rtable_proj = rtable[r_proj_attrs] # # remove records with nans in the blocking attribute l_df = rem_nan(ltable_proj, l_block_attr) r_df = rem_nan(rtable_proj, r_block_attr) # # determine the number of chunks n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable)) n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable)) if n_ltable_chunks == 1 and n_rtable_chunks == 1: # single process candset = _block_tables_split(l_df, r_df, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, allow_missing) else: l_splits = np.array_split(l_df, n_ltable_chunks) r_splits = np.array_split(r_df, n_rtable_chunks) c_splits = [] for l in l_splits: for r in r_splits: partial_result = delayed(_block_tables_split)(l, r, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, allow_missing) c_splits.append(partial_result) 
c_splits = delayed(wrap)(c_splits) c_splits = c_splits.compute(scheduler="processes", num_workers=get_num_cores()) candset = pd.concat(c_splits, ignore_index=True) # if allow_missing flag is True, then compute # all pairs with missing value in left table, and # all pairs with missing value in right table if allow_missing: missing_pairs = self.get_pairs_with_missing_value(ltable_proj, rtable_proj, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix) candset = pd.concat([candset, missing_pairs], ignore_index=True) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset
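# A hedged usage sketch for the Dask attribute-equivalence blocker; the CSV
# paths and the 'zipcode'/'name' attributes are placeholders taken from the
# docstring example above.
import py_entitymatching as em
from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker

A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')

ab = DaskAttrEquivalenceBlocker()
# One chunk per core for both input tables.
C = ab.block_tables(A, B, 'zipcode', 'zipcode',
                    l_output_attrs=['name'], r_output_attrs=['name'],
                    n_ltable_chunks=-1, n_rtable_chunks=-1)
print(em.get_key(C), len(C))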
def impute_table(table, exclude_attrs=None, missing_val='NaN', strategy='mean', axis=0, val_all_nans=0, verbose=True): """ Impute table containing missing values. Args: table (DataFrame): DataFrame which values should be imputed. exclude_attrs (List) : list of attribute names to be excluded from imputing (defaults to None). missing_val (string or int): The placeholder for the missing values. All occurrences of `missing_values` will be imputed. For missing values encoded as np.nan, use the string value 'NaN' (defaults to 'NaN'). strategy (string): String that specifies on how to impute values. Valid strings: 'mean', 'median', 'most_frequent' (defaults to 'mean'). axis (int): axis=1 along rows, and axis=0 along columns (defaults to 0). val_all_nans (float): Value to fill in if all the values in the column are NaN. Returns: Imputed DataFrame. Raises: AssertionError: If `table` is not of type pandas DataFrame. Examples: >>> import py_entitymatching as em >>> # H is the feature vector which should be imputed. Specifically, impute the missing values >>> # in each column, with the mean of that column >>> H = em.impute_table(H, exclude_attrs=['_id', 'ltable_id', 'rtable_id'], strategy='mean') """ # Validate input paramaters # # We expect the input table to be of type pandas DataFrame if not isinstance(table, pd.DataFrame): logger.error('Input table is not of type DataFrame') raise AssertionError('Input table is not of type DataFrame') ch.log_info( logger, 'Required metadata: cand.set key, fk ltable, ' 'fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # Get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \ cm.get_metadata_for_candset( table, logger, verbose) # # Validate metadata cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) fv_columns = table.columns if exclude_attrs == None: feature_names = fv_columns else: # Check if the exclude attributes are present in the input table if not ch.check_attrs_present(table, exclude_attrs): logger.error('The attributes mentioned in exclude_attrs ' 'is not present ' 'in the input table') raise AssertionError('The attributes mentioned in exclude_attrs ' 'is not present ' 'in the input table') # We expect exclude attributes to be of type list. If not convert it into # a list. if not isinstance(exclude_attrs, list): exclude_attrs = [exclude_attrs] # Drop the duplicates from the exclude attributes exclude_attrs = gh.list_drop_duplicates(exclude_attrs) cols = [c not in exclude_attrs for c in fv_columns] feature_names = fv_columns[cols] # print feature_names table_copy = table.copy() projected_table = table_copy[feature_names] projected_table_values = projected_table.values imp = Imputer(missing_values=missing_val, strategy=strategy, axis=axis) imp.fit(projected_table_values) imp.statistics_[pd.np.isnan(imp.statistics_)] = val_all_nans projected_table_values = imp.transform(projected_table_values) table_copy[feature_names] = projected_table_values # Update catalog cm.init_properties(table_copy) cm.copy_properties(table, table_copy) return table_copy
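# The function above fits scikit-learn's (since removed) Imputer and then patches
# columns that were entirely NaN with val_all_nans. A standalone, hedged sketch of
# that logic using the modern SimpleImputer as a stand-in for Imputer:
import numpy as np
from sklearn.impute import SimpleImputer

X = np.array([[1.0, np.nan, np.nan],
              [3.0, 4.0,    np.nan]])

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X)
# Mimic val_all_nans=0: columns whose statistic is NaN (all values missing) get a
# fixed fill value instead of being dropped at transform time.
imp.statistics_[np.isnan(imp.statistics_)] = 0
print(imp.transform(X))      # approximately [[1., 4., 0.], [3., 4., 0.]]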
def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr, rem_stop_words=False, q_val=None, word_level=True, overlap_size=1, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', allow_missing=False, verbose=False, show_progress=True, n_jobs=1): """ Blocks two tables based on the overlap of token sets of attribute values. Finds tuple pairs from left and right tables such that the overlap between (a) the set of tokens obtained by tokenizing the value of attribute l_overlap_attr of a tuple from the left table, and (b) the set of tokens obtained by tokenizing the value of attribute r_overlap_attr of a tuple from the right table, is above a certain threshold. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_overlap_attr (string): The overlap attribute in left table. r_overlap_attr (string): The overlap attribute in right table. rem_stop_words (boolean): A flag to indicate whether stop words (e.g., a, an, the) should be removed from the token sets of the overlap attribute values (defaults to False). q_val (int): The value of q to use if the overlap attributes values are to be tokenized as qgrams (defaults to None). word_level (boolean): A flag to indicate whether the overlap attributes should be tokenized as words (i.e, using whitespace as delimiter) (defaults to True). overlap_size (int): The minimum number of tokens that must overlap (defaults to 1). l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple in ltable with missing value in the blocking attribute will be matched with every tuple in rtable and vice versa. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is the total number of CPUs in the machine). Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_overlap_attr` is not of type string. AssertionError: If `r_overlap_attr` is not of type string. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. 
AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `q_val` is not of type int. AssertionError: If `word_level` is not of type boolean. AssertionError: If `overlap_size` is not of type int. AssertionError: If `verbose` is not of type boolean. AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_overlap_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. AssertionError: If `l_output_attrs` are not in the ltable. AssertionError: If `r_output_attrs` are not in the rtable. SyntaxError: If `q_val` is set to a valid value and `word_level` is set to True. SyntaxError: If `q_val` is set to None and `word_level` is set to False. Examples: >>> import py_entitymatching as em >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> ob = em.OverlapBlocker() # Use word-level tokenizer >>> C1 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=True, overlap_size=1) # Use q-gram tokenizer >>> C2 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=False, q_val=2) # Include all possible missing values >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True) # Use all the cores in the machine >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], n_jobs=-1) """ # validate data types of standard input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, n_jobs) # validate data types of input parameters specific to overlap blocker self.validate_types_other_params(l_overlap_attr, r_overlap_attr, rem_stop_words, q_val, word_level, overlap_size) # validate data type of allow_missing self.validate_allow_missing(allow_missing) # validate data type of show_progress self.validate_show_progress(show_progress) # validate overlap attributes self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr) # validate output attributes self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate required metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # validate word_level and q_val self.validate_word_level_qval(word_level, q_val) # do blocking # # do projection before merge l_proj_attrs = self.get_attrs_to_project(l_key, l_overlap_attr, l_output_attrs) l_df = ltable[l_proj_attrs] r_proj_attrs = self.get_attrs_to_project(r_key, r_overlap_attr, r_output_attrs) r_df = rtable[r_proj_attrs] # # case the column to string if required. 
l_df.is_copy, r_df.is_copy = False, False # to avoid setwithcopy warning ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True) ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True) # # cleanup the tables from non-ascii characters, punctuations, and stop words l_dummy_overlap_attr = '@#__xx__overlap_ltable__#@' r_dummy_overlap_attr = '@#__xx__overlap_rtable__#@' l_df[l_dummy_overlap_attr] = l_df[l_overlap_attr] r_df[r_dummy_overlap_attr] = r_df[r_overlap_attr] if not l_df.empty: self.cleanup_table(l_df, l_dummy_overlap_attr, rem_stop_words) if not r_df.empty: self.cleanup_table(r_df, r_dummy_overlap_attr, rem_stop_words) # # determine which tokenizer to use if word_level == True: # # # create a whitespace tokenizer tokenizer = WhitespaceTokenizer(return_set=True) else: # # # create a qgram tokenizer tokenizer = QgramTokenizer(qval=q_val, return_set=True) # # perform overlap similarity join candset = overlap_join(l_df, r_df, l_key, r_key, l_dummy_overlap_attr, r_dummy_overlap_attr, tokenizer, overlap_size, '>=', allow_missing, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, False, n_jobs, show_progress) # # retain only the required attributes in the output candidate set retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix) candset = candset[retain_cols] # update metadata in the catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return the candidate set return candset
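# A common workflow with the overlap blocker: a first pass with block_tables on
# one attribute, then a tightening pass with block_candset on another. A hedged
# sketch; the paths and attribute names are placeholders.
import py_entitymatching as em

A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')

ob = em.OverlapBlocker()
# First pass: at least one shared word-level token in the addresses.
C = ob.block_tables(A, B, 'address', 'address',
                    l_output_attrs=['name', 'address'],
                    r_output_attrs=['name', 'address'],
                    word_level=True, overlap_size=1)
# Second pass: additionally require a shared token in the names.
D = ob.block_candset(C, 'name', 'name', overlap_size=1)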
def block_tables(self, ltable, rtable, l_block_attr, r_block_attr, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', allow_missing=False, verbose=False, n_jobs=1): """Blocks two tables based on attribute equivalence. Finds tuple pairs from left and right tables such that the value of attribute l_block_attr of a tuple from the left table exactly matches the value of attribute r_block_attr of a tuple from the right table. This is similar to equi-join of two tables. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_block_attr (string): The blocking attribute in left table. r_block_attr (string): The blocking attribute in right table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple in ltable with missing value in the blocking attribute will be matched with every tuple in rtable and vice versa. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is the total number of CPUs in the machine). Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_block_attr` is not of type string. AssertionError: If `r_block_attr` is not of type string. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_block_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. 
""" # validate data types of input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, n_jobs) # validate data types of input blocking attributes self.validate_types_block_attrs(l_block_attr, r_block_attr) # validate data type of allow_missing self.validate_allow_missing(allow_missing) # validate input parameters self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr) self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate required metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # do blocking # # do projection of required attributes from the tables l_proj_attrs = self.get_attrs_to_project(l_key, l_block_attr, l_output_attrs) ltable_proj = ltable[l_proj_attrs] r_proj_attrs = self.get_attrs_to_project(r_key, r_block_attr, r_output_attrs) rtable_proj = rtable[r_proj_attrs] # # remove records with nans in the blocking attribute l_df = rem_nan(ltable_proj, l_block_attr) r_df = rem_nan(rtable_proj, r_block_attr) # # determine number of processes to launch parallely n_procs = self.get_num_procs(n_jobs, len(l_df) * len(r_df)) if n_procs <= 1: # single process candset = _block_tables_split(l_df, r_df, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, allow_missing) else: # multiprocessing m, n = self.get_split_params(n_procs, len(l_df), len(r_df)) l_splits = pd.np.array_split(l_df, m) r_splits = pd.np.array_split(r_df, n) c_splits = Parallel(n_jobs=m * n)(delayed(_block_tables_split)( l, r, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, allow_missing) for l in l_splits for r in r_splits) candset = pd.concat(c_splits, ignore_index=True) # if allow_missing flag is True, then compute # all pairs with missing value in left table, and # all pairs with missing value in right table if allow_missing: missing_pairs = self.get_pairs_with_missing_value( ltable_proj, rtable_proj, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix) candset = pd.concat([candset, missing_pairs], ignore_index=True) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset
def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr, rem_stop_words=False, q_val=None, word_level=True, overlap_size=1, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', allow_missing=False, verbose=False, show_progress=True, n_ltable_chunks=1, n_rtable_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. Blocks two tables based on the overlap of token sets of attribute values. Finds tuple pairs from left and right tables such that the overlap between (a) the set of tokens obtained by tokenizing the value of attribute l_overlap_attr of a tuple from the left table, and (b) the set of tokens obtained by tokenizing the value of attribute r_overlap_attr of a tuple from the right table, is above a certain threshold. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_overlap_attr (string): The overlap attribute in left table. r_overlap_attr (string): The overlap attribute in right table. rem_stop_words (boolean): A flag to indicate whether stop words (e.g., a, an, the) should be removed from the token sets of the overlap attribute values (defaults to False). q_val (int): The value of q to use if the overlap attributes values are to be tokenized as qgrams (defaults to None). word_level (boolean): A flag to indicate whether the overlap attributes should be tokenized as words (i.e, using whitespace as delimiter) (defaults to True). overlap_size (int): The minimum number of tokens that must overlap (defaults to 1). l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple in ltable with missing value in the blocking attribute will be matched with every tuple in rtable and vice versa. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_ltable_chunks (int): The number of partitions to split the left table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. n_rtable_chunks (int): The number of partitions to split the right table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_overlap_attr` is not of type string. AssertionError: If `r_overlap_attr` is not of type string. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. 
AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `q_val` is not of type int. AssertionError: If `word_level` is not of type boolean. AssertionError: If `overlap_size` is not of type int. AssertionError: If `verbose` is not of type boolean. AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_ltable_chunks` is not of type int. AssertionError: If `n_rtable_chunks` is not of type int. AssertionError: If `l_overlap_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. AssertionError: If `l_output_attrs` are not in the ltable. AssertionError: If `r_output_attrs` are not in the rtable. SyntaxError: If `q_val` is set to a valid value and `word_level` is set to True. SyntaxError: If `q_val` is set to None and `word_level` is set to False. Examples: >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> ob = DaskOverlapBlocker() # Use all cores # # Use word-level tokenizer >>> C1 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=True, overlap_size=1, n_ltable_chunks=-1, n_rtable_chunks=-1) # # Use q-gram tokenizer >>> C2 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=False, q_val=2, n_ltable_chunks=-1, n_rtable_chunks=-1) # # Include all possible missing values >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True, n_ltable_chunks=-1, n_rtable_chunks=-1) """ logger.warning( "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. 
USE AT YOUR OWN " "RISK.") # Input validations self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, n_ltable_chunks, n_rtable_chunks) self.validate_types_other_params(l_overlap_attr, r_overlap_attr, rem_stop_words, q_val, word_level, overlap_size) self.validate_allow_missing(allow_missing) self.validate_show_progress(show_progress) self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr) self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) self.validate_word_level_qval(word_level, q_val) log_info(logger, 'Required metadata: ltable key, rtable key', verbose) l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # validate input table chunks validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks') validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks') validate_chunks(n_ltable_chunks) validate_chunks(n_rtable_chunks) if n_ltable_chunks == -1: n_ltable_chunks = multiprocessing.cpu_count() ltable_chunks = pd.np.array_split(ltable, n_ltable_chunks) # preprocess/tokenize ltable if word_level == True: tokenizer = WhitespaceTokenizer(return_set=True) else: tokenizer = QgramTokenizer(qval=q_val, return_set=True) preprocessed_tokenized_ltbl = [] # Construct DAG for preprocessing/tokenizing ltable chunks start_row_id = 0 for i in range(len(ltable_chunks)): result = delayed(self.process_tokenize_block_attr)(ltable_chunks[i][ l_overlap_attr], start_row_id, rem_stop_words, tokenizer) preprocessed_tokenized_ltbl.append(result) start_row_id += len(ltable_chunks[i]) preprocessed_tokenized_ltbl = delayed(wrap)(preprocessed_tokenized_ltbl) # Execute the DAG if show_progress: with ProgressBar(): logger.info('Preprocessing/tokenizing ltable') preprocessed_tokenized_ltbl_vals = preprocessed_tokenized_ltbl.compute( scheduler="processes", num_workers=multiprocessing.cpu_count()) else: preprocessed_tokenized_ltbl_vals = preprocessed_tokenized_ltbl.compute( scheduler="processes", num_workers=multiprocessing.cpu_count()) ltable_processed_dict = {} for i in range(len(preprocessed_tokenized_ltbl_vals)): ltable_processed_dict.update(preprocessed_tokenized_ltbl_vals[i]) # build inverted index inverted_index = self.build_inverted_index(ltable_processed_dict) if n_rtable_chunks == -1: n_rtable_chunks = multiprocessing.cpu_count() rtable_chunks = pd.np.array_split(rtable, n_rtable_chunks) # Construct the DAG for probing probe_result = [] start_row_id = 0 for i in range(len(rtable_chunks)): result = delayed(self.probe)(rtable_chunks[i][r_overlap_attr], inverted_index, start_row_id, rem_stop_words, tokenizer, overlap_size) probe_result.append(result) start_row_id += len(rtable_chunks[i]) probe_result = delayed(wrap)(probe_result) # Execute the DAG for probing if show_progress: with ProgressBar(): logger.info('Probing using rtable') probe_result = probe_result.compute(scheduler="processes", num_workers=multiprocessing.cpu_count()) else: probe_result = probe_result.compute(scheduler="processes", num_workers=multiprocessing.cpu_count()) # construct a minimal dataframe that can be used to add more attributes flat_list = [item for sublist in probe_result for item in sublist] tmp = pd.DataFrame(flat_list, columns=['fk_ltable_rid', 'fk_rtable_rid']) fk_ltable = 
ltable.iloc[tmp.fk_ltable_rid][l_key].values fk_rtable = rtable.iloc[tmp.fk_rtable_rid][r_key].values id_vals = list(range(len(flat_list))) candset = pd.DataFrame.from_dict( {'_id': id_vals, l_output_prefix+l_key: fk_ltable, r_output_prefix+r_key: fk_rtable}) # set the properties for the candidate set cm.set_key(candset, '_id') cm.set_fk_ltable(candset, 'ltable_'+l_key) cm.set_fk_rtable(candset, 'rtable_'+r_key) cm.set_ltable(candset, ltable) cm.set_rtable(candset, rtable) ret_candset = gh.add_output_attributes(candset, l_output_attrs=l_output_attrs, r_output_attrs=r_output_attrs, l_output_prefix=l_output_prefix, r_output_prefix=r_output_prefix, validate=False) # handle missing values if allow_missing: missing_value_pairs = get_pairs_with_missing_value(ltable, rtable, l_key, r_key, l_overlap_attr, r_overlap_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, False, False) missing_value_pairs.insert(0, '_id', range(len(ret_candset), len(ret_candset)+len(missing_value_pairs))) if len(missing_value_pairs) > 0: ret_candset = pd.concat([ret_candset, missing_value_pairs], ignore_index=True, sort=False) cm.set_key(ret_candset, '_id') cm.set_fk_ltable(ret_candset, 'ltable_' + l_key) cm.set_fk_rtable(ret_candset, 'rtable_' + r_key) cm.set_ltable(ret_candset, ltable) cm.set_rtable(ret_candset, rtable) # Return the final candidate set to user. return ret_candset
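# --- Illustrative sketch (not part of the library) ---
# How the overlap condition described above is evaluated for one pair of
# attribute values. Assumes plain whitespace tokens when word_level=True and
# simple character q-grams otherwise; the blocker itself uses
# py_stringmatching tokenizers and an inverted index rather than this
# pairwise check.
def _sketch_pair_survives_overlap(l_val, r_val, word_level=True, q_val=2,
                                  overlap_size=1):
    if word_level:
        l_tokens = set(l_val.lower().split())
        r_tokens = set(r_val.lower().split())
    else:
        l_tokens = {l_val[i:i + q_val] for i in range(len(l_val) - q_val + 1)}
        r_tokens = {r_val[i:i + q_val] for i in range(len(r_val) - q_val + 1)}
    return len(l_tokens & r_tokens) >= overlap_size
# e.g. _sketch_pair_survives_overlap('12 Main St', 'Main Street 12') -> True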
def combine_blocker_outputs_via_union( blocker_output_list, l_prefix='ltable_', r_prefix='rtable_', verbose=False): """ Combines multiple blocker outputs by taking a union of their tuple pair ids (foreign key ltable, foreign key rtable). Specifically, this function takes in a list of DataFrames (candidate sets, typically the output from blockers) and returns a consolidated DataFrame. The output DataFrame contains the union of tuple pair ids (foreign key ltable, foreign key rtable) and other attributes from the input list of DataFrames. This function makes some assumptions about the input DataFrames. First, each DataFrame is expected to contain the following metadata in the catalog: key, fk_ltable, fk_rtable, ltable, and rtable. Second, all the DataFrames must be a result of blocking from the same underlying tables. Concretely, the ltable and rtable properties must refer to the same DataFrame across all the input tables. Third, all the input DataFrames must have the same fk_ltable and fk_rtable properties. Finally, in each input DataFrame, for the attributes included from the ltable or rtable, the attribute names must be prefixed with the given l_prefix and r_prefix in the function. The input DataFrames may contain different attribute lists, which raises the question of how to combine them. Currently py_entitymatching takes a union of the attribute names that have the prefix l_prefix or r_prefix across the input tables. After taking the union, for each tuple id pair included in the output, the attribute values (for the union-ed attribute names) are probed from ltable/rtable and included in the output. A subtle point to note: if an input DataFrame has a column added by the user (say, a label column), that column will not be present in the output, because the same column may not be present in the other candidate sets and it is not clear how to combine them. One possibility is to include the label column in the output for all tuple id pairs and set it to NaN where a value is not present. Currently py_entitymatching does not include such columns; addressing this will be part of future work. Args: blocker_output_list (list of DataFrames): The list of DataFrames that should be combined. l_prefix (string): The prefix given to the attributes from the ltable. r_prefix (string): The prefix given to the attributes from the rtable. verbose (boolean): A flag to indicate whether more detailed information about the execution steps should be printed out (default value is False). Returns: A new DataFrame with the combined tuple pairs and other attributes from all the blocker outputs. Raises: AssertionError: If `l_prefix` is not of type string. AssertionError: If `r_prefix` is not of type string. AssertionError: If the length of the input DataFrame list is 0. AssertionError: If `blocker_output_list` is not a list of DataFrames. AssertionError: If the ltables are different across the input list of DataFrames. AssertionError: If the rtables are different across the input list of DataFrames. AssertionError: If the `fk_ltable` values are different across the input list of DataFrames. AssertionError: If the `fk_rtable` values are different across the input list of DataFrames. 
Examples: >>> import py_entitymatching as em >>> ab = em.AttrEquivalenceBlocker() >>> C = ab.block_tables(A, B, 'zipcode', 'zipcode') >>> ob = em.OverlapBlocker() >>> D = ob.block_candset(C, 'address', 'address') >>> block_f = em.get_features_for_blocking(A, B) >>> rb = em.RuleBasedBlocker() >>> rule = ['address_address_lev(ltuple, rtuple) > 6'] >>> rb.add_rule(rule, block_f) >>> E = rb.block_tables(A, B) >>> F = em.combine_blocker_outputs_via_union([C, E]) """ # validate input parameters # The l_prefix is expected to be of type string py_entitymatching.utils.validation_helper.validate_object_type(l_prefix, six.string_types, 'l_prefix') # The r_prefix is expected to be of type string py_entitymatching.utils.validation_helper.validate_object_type(r_prefix, six.string_types, 'r_prefix') # We cannot combine empty DataFrame list if not len(blocker_output_list) > 0: logger.error('There no DataFrames to combine') raise AssertionError('There are no DataFrames to combine') # Validate the assumptions about the input tables. # # 1) All the input object must be DataFrames # # 2) All the input DataFrames must have the metadata as that of a # candidate set # # 3) All the input DataFrames must have the same fk_ltable and fk_rtable _validate_lr_tables(blocker_output_list) # # Get the ltable and rtable. We take it from the first DataFrame as all # the DataFrames contain the same ltables and rtables ltable = cm.get_ltable(blocker_output_list[0]) rtable = cm.get_rtable(blocker_output_list[0]) # # Get the fk_ltable and fk_rtable. We take it from the first DataFrame as # all the DataFrames contain the same ltables and rtables fk_ltable = cm.get_fk_ltable(blocker_output_list[0]) fk_rtable = cm.get_fk_rtable(blocker_output_list[0]) # Retrieve the keys for the ltable and rtables. l_key = cm.get_key(ltable) r_key = cm.get_key(rtable) # Check if the fk_ltable is starting with the given prefix, if not its # not an error. Just raise a warning. if fk_ltable.startswith(l_prefix) is False: logger.warning( 'Foreign key for ltable is not starting with the given prefix (' '%s)', l_prefix) # Check if the fk_rtable is starting with the given prefix, if not its # not an error. Just raise a warning. if fk_rtable.startswith(r_prefix) is False: logger.warning( 'Foreign key for rtable is not starting with the given prefix (' '%s)', r_prefix) # Initialize lists # # keep track of projected tuple pair ids tuple_pair_ids = [] # # keep track of output attributes from the left table l_output_attrs = [] # # keep track of output attributes from the right table r_output_attrs = [] # for each DataFrame in the given list, project out tuple pair ids, get the # attributes from the ltable and rtable for data_frame in blocker_output_list: # Project out the tuple pair ids. 
A tuple pair id is a fk_ltable, # fk_rtable pair projected_tuple_pair_ids = data_frame[[fk_ltable, fk_rtable]] # Update the list that tracks tuple pair ids tuple_pair_ids.append(projected_tuple_pair_ids) # Get the columns, which should be segregated into the attributes # from the ltable and table col_set = ( gh.list_diff(list(data_frame.columns), [fk_ltable, fk_rtable, cm.get_key(data_frame)])) # Segregate the columns as attributes from the ltable and rtable l_attrs, r_attrs = _lr_cols(col_set, l_prefix, r_prefix) # Update the l_output_attrs, r_output_attrs l_output_attrs.extend(l_attrs) # the reason we use extend because l_attrs a list r_output_attrs.extend(r_attrs) ch.log_info(logger, 'Concatenating the tuple pair ids across given ' 'blockers ...', verbose) # concatenate the tuple pair ids from the list of input DataFrames concatenated_tuple_pair_ids = pd.concat(tuple_pair_ids) ch.log_info(logger, 'Concatenating the tuple pair ids ... DONE', verbose) ch.log_info(logger, 'Deduplicating the tuple pair ids ...', verbose) # Deduplicate the DataFrame. Now the returned DataFrame will contain # unique tuple pair ids. # noinspection PyUnresolvedReferences deduplicated_tuple_pair_ids = concatenated_tuple_pair_ids.drop_duplicates() ch.log_info(logger, 'Deduplicating the tuple pair ids ... DONE', verbose) # Construct output table # # Get unique list of attributes across different tables l_output_attrs = gh.list_drop_duplicates(l_output_attrs) r_output_attrs = gh.list_drop_duplicates(r_output_attrs) # Reset the index that might have lingered from concatenation. deduplicated_tuple_pair_ids.reset_index(inplace=True, drop=True) # Add the output attribtues from the ltable and rtable. # NOTE: This approach may be inefficient as it probes the ltable, rtable # to get the attribute values. A better way would be to fill the # attribute values from the input list of DataFrames. This attribute values # could be harvested (at the expense of some space) while we iterate the # input blocker output list for the first time. # noinspection PyProtectedMember consolidated_data_frame = gh._add_output_attributes( deduplicated_tuple_pair_ids, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, l_output_attrs, r_output_attrs, l_prefix, r_prefix, validate=False) # Sort the DataFrame ordered by fk_ltable and fk_rtable. # The function "sort" will be depreciated in the newer versions of # pandas DataFrame, and it will replaced by 'sort_values' function. So we # will first try to use sort_values if this fails we will use sort. try: consolidated_data_frame.sort_values([fk_ltable, fk_rtable], inplace=True) except AttributeError: consolidated_data_frame.sort([fk_ltable, fk_rtable], inplace=True) # update the catalog for the consolidated DataFrame # First get a column name for the key key = ch.get_name_for_key(consolidated_data_frame.columns) # Second, add the column name as the key consolidated_data_frame = ch.add_key_column(consolidated_data_frame, key) # Third, reset the index to remove any out of order index values from # the sort. consolidated_data_frame.reset_index(inplace=True, drop=True) # Finally, set the properties for the consolidated DataFrame in the catalog cm.set_candset_properties(consolidated_data_frame, key, fk_ltable, fk_rtable, ltable, rtable) # Return the consolidated DataFrame return consolidated_data_frame
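# --- Illustrative sketch (not part of the library) ---
# The core of the union step above: project each candidate set down to its
# (fk_ltable, fk_rtable) pair columns, concatenate, and de-duplicate. The
# column names used here are just examples.
import pandas as pd

def _sketch_union_tuple_pair_ids(candset_list, fk_ltable='ltable_ID',
                                 fk_rtable='rtable_ID'):
    pair_ids = [df[[fk_ltable, fk_rtable]] for df in candset_list]
    combined = pd.concat(pair_ids, ignore_index=True)
    return combined.drop_duplicates().reset_index(drop=True)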
def block_candset(self, candset, l_block_attr, r_block_attr, allow_missing=False, verbose=False, show_progress=True, n_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. Blocks an input candidate set of tuple pairs based on attribute equivalence. Finds tuple pairs from an input candidate set of tuple pairs such that the value of attribute l_block_attr of the left tuple in a tuple pair exactly matches the value of attribute r_block_attr of the right tuple in the tuple pair. Args: candset (DataFrame): The input candidate set of tuple pairs. l_block_attr (string): The blocking attribute in left table. r_block_attr (string): The blocking attribute in right table. allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple pair with missing value in either blocking attribute will be retained in the output candidate set. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_chunks (int): The number of partitions to split the candidate set. If it is set to -1, the number of partitions will be set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `l_block_attr` is not of type string. AssertionError: If `r_block_attr` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `n_chunks` is not of type int. AssertionError: If `l_block_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. Examples: >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker >>> ab = DaskAttrEquivalenceBlocker() >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> C = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name']) >>> D1 = ab.block_candset(C, 'age', 'age', allow_missing=True) # Include all possible tuple pairs with missing values >>> D2 = ab.block_candset(C, 'age', 'age', allow_missing=True) # Execute blocking using multiple cores >>> D3 = ab.block_candset(C, 'age', 'age', n_chunks=-1) """ logger.warning("WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. 
USE AT YOUR OWN " "RISK.") # validate data types of input parameters self.validate_types_params_candset(candset, verbose, show_progress, n_chunks) # validate data types of input blocking attributes self.validate_types_block_attrs(l_block_attr, r_block_attr) # get and validate metadata log_info(logger, 'Required metadata: cand.set key, fk ltable, ' 'fk rtable, ltable, rtable, ltable key, rtable key', verbose) # # get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset( candset, logger, verbose) # # validate metadata cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # validate input parameters self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr) # validate n_chunks parameter validate_object_type(n_chunks, int, 'Parameter n_chunks') validate_chunks(n_chunks) # do blocking # # do projection before merge l_df = ltable[[l_key, l_block_attr]] r_df = rtable[[r_key, r_block_attr]] # # set index for convenience l_df = l_df.set_index(l_key, drop=False) r_df = r_df.set_index(r_key, drop=False) # # determine number of processes to launch parallely n_chunks = get_num_partitions(n_chunks, len(candset)) valid = [] if n_chunks == 1: # single process valid = _block_candset_split(candset, l_df, r_df, l_key, r_key, l_block_attr, r_block_attr, fk_ltable, fk_rtable, allow_missing, show_progress) else: c_splits = np.array_split(candset, n_chunks) valid_splits = [] for i in range(len(c_splits)): partial_result = delayed(_block_candset_split)(c_splits[i], l_df, r_df, l_key, r_key, l_block_attr, r_block_attr, fk_ltable, fk_rtable, allow_missing, False) # setting show # progress to False as we will use Dask diagnostics to display progress # bar valid_splits.append(partial_result) valid_splits = delayed(wrap)(valid_splits) if show_progress: with ProgressBar(): valid_splits = valid_splits.compute(scheduler="processes", num_workers=get_num_cores()) else: valid_splits = valid_splits.compute(scheduler="processes", num_workers=get_num_cores()) valid = sum(valid_splits, []) # construct output table if len(candset) > 0: out_table = candset[valid] else: out_table = pd.DataFrame(columns=candset.columns) # update the catalog cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable, ltable, rtable) # return the output table return out_table
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', verbose=False, show_progress=True, n_jobs=1): """ Blocks two tables based on the sequence of rules supplied by the user. Finds tuple pairs from left and right tables that survive the sequence of blocking rules. A tuple pair survives the sequence of blocking rules if none of the rules in the sequence returns True for that pair. If any of the rules returns True, then the pair is blocked. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is the total number of CPUs in the machine).Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived the sequence of blocking rules (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If the input `l_output_prefix` is not of type string. AssertionError: If the input `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. AssertionError: If there are no rules to apply. 
Examples: >>> import py_entitymatching as em >>> rb = em.RuleBasedBlocker() >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id') >>> block_f = em.get_features_for_blocking(A, B) >>> rule = ['name_name_lev(ltuple, rtuple) > 3'] >>> rb.add_rule(rule, feature_table=block_f) >>> C = rb.block_tables(A, B) """ # validate data types of input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, n_jobs) # validate data type of show_progress self.validate_show_progress(show_progress) # validate input parameters self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # validate rules assert len(self.rules.keys()) > 0, 'There are no rules to apply' # do blocking # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # remove l_key from l_output_attrs and r_key from r_output_attrs l_output_attrs_1 = [] if l_output_attrs: l_output_attrs_1 = [x for x in l_output_attrs if x != l_key] r_output_attrs_1 = [] if r_output_attrs: r_output_attrs_1 = [x for x in r_output_attrs if x != r_key] # # get attributes to project l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(l_key, r_key, l_output_attrs_1, r_output_attrs_1) l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs] candset, rule_applied = self.block_tables_with_filters(l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, verbose, show_progress, n_jobs) if candset is None: # no filterable rule was applied candset = self.block_tables_without_filters(l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, verbose, show_progress, n_jobs) elif len(self.rules) > 1: # one filterable rule was applied but other rules are left # block candset by applying other rules and excluding the applied rule candset = self.block_candset_excluding_rule(candset, l_df, r_df, l_key, r_key, l_output_prefix + l_key, r_output_prefix + r_key, rule_applied, show_progress, n_jobs) retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix) if len(candset) > 0: candset = candset[retain_cols] else: candset = pd.DataFrame(columns=retain_cols) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset
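# --- Illustrative sketch (not part of the library) ---
# The survival condition for rule-based blocking described above: a tuple
# pair survives only if none of the rules fires. Here `rules` is assumed to
# be an iterable of callables taking (ltuple, rtuple) and returning True to
# block the pair; the blocker itself compiles its rules from a feature table.
def _sketch_pair_survives_rules(ltuple, rtuple, rules):
    return not any(rule(ltuple, rtuple) for rule in rules)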
def block_candset(self, candset, l_overlap_attr, r_overlap_attr, rem_stop_words=False, q_val=None, word_level=True, overlap_size=1, allow_missing=False, verbose=False, show_progress=True, n_jobs=1): """Blocks an input candidate set of tuple pairs based on the overlap of token sets of attribute values. Finds tuple pairs from an input candidate set of tuple pairs such that the overlap between (a) the set of tokens obtained by tokenizing the value of attribute l_overlap_attr of the left tuple in a tuple pair, and (b) the set of tokens obtained by tokenizing the value of attribute r_overlap_attr of the right tuple in the tuple pair, is above a certain threshold. Args: candset (DataFrame): The input candidate set of tuple pairs. l_overlap_attr (string): The overlap attribute in left table. r_overlap_attr (string): The overlap attribute in right table. rem_stop_words (boolean): A flag to indicate whether stop words (e.g., a, an, the) should be removed from the token sets of the overlap attribute values (defaults to False). q_val (int): The value of q to use if the overlap attributes values are to be tokenized as qgrams (defaults to None). word_level (boolean): A flag to indicate whether the overlap attributes should be tokenized as words (i.e, using whitespace as delimiter) (defaults to True). overlap_size (int): The minimum number of tokens that must overlap (defaults to 1). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple pair with missing value in either blocking attribute will be retained in the output candidate set. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus are the total number of CPUs in the machine).Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `l_overlap_attr` is not of type string. AssertionError: If `r_overlap_attr` is not of type string. AssertionError: If `q_val` is not of type int. AssertionError: If `word_level` is not of type boolean. AssertionError: If `overlap_size` is not of type int. AssertionError: If `verbose` is not of type boolean. AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_overlap_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. SyntaxError: If `q_val` is set to a valid value and `word_level` is set to True. SyntaxError: If `q_val` is set to None and `word_level` is set to False. 
Examples: >>> import py_entitymatching as em >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> ob = em.OverlapBlocker() >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name']) >>> D1 = ob.block_candset(C, 'name', 'name', allow_missing=True) # Include all possible tuple pairs with missing values >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True) # Execute blocking using multiple cores >>> D3 = ob.block_candset(C, 'name', 'name', n_jobs=-1) # Use q-gram tokenizer >>> D2 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2) """ # validate data types of standard input parameters self.validate_types_params_candset(candset, verbose, show_progress, n_jobs) # validate data types of input parameters specific to overlap blocker self.validate_types_other_params(l_overlap_attr, r_overlap_attr, rem_stop_words, q_val, word_level, overlap_size) # get and validate metadata log_info( logger, 'Required metadata: cand.set key, fk ltable, fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset( candset, logger, verbose) # # validate metadata cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # validate overlap attrs self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr) # validate word_level and q_val self.validate_word_level_qval(word_level, q_val) # do blocking # # do projection before merge l_df = ltable[[l_key, l_overlap_attr]] r_df = rtable[[r_key, r_overlap_attr]] # # case the overlap attribute to string if required. l_df.is_copy, r_df.is_copy = False, False # to avoid setwithcopy warning ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True) ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True) # # cleanup the tables from non-ascii characters, punctuations, and stop words self.cleanup_table(l_df, l_overlap_attr, rem_stop_words) self.cleanup_table(r_df, r_overlap_attr, rem_stop_words) # # determine which tokenizer to use if word_level == True: # # # create a whitespace tokenizer tokenizer = WhitespaceTokenizer(return_set=True) else: # # # create a qgram tokenizer tokenizer = QgramTokenizer(qval=q_val, return_set=True) # # create a filter for overlap similarity join overlap_filter = OverlapFilter(tokenizer, overlap_size, allow_missing=allow_missing) # # perform overlap similarity filtering of the candset out_table = overlap_filter.filter_candset(candset, fk_ltable, fk_rtable, l_df, r_df, l_key, r_key, l_overlap_attr, r_overlap_attr, n_jobs, show_progress=show_progress) # update catalog cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable, ltable, rtable) # return candidate set return out_table
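# --- Illustrative sketch (not part of the library) ---
# The word_level / q_val consistency rule implied by the SyntaxError cases
# documented above: exactly one tokenization scheme must be selected.
def _sketch_check_word_level_qval(word_level, q_val):
    if word_level and q_val is not None:
        raise SyntaxError('q_val is set but word_level is True; '
                          'set word_level=False to use q-gram tokenization')
    if not word_level and q_val is None:
        raise SyntaxError('word_level is False but q_val is not set; '
                          'provide q_val to use q-gram tokenization')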
def block_candset(self, candset, verbose=False, show_progress=True, n_jobs=1): """ Blocks an input candidate set of tuple pairs based on a sequence of blocking rules supplied by the user. Finds tuple pairs from an input candidate set of tuple pairs that survive the sequence of blocking rules. A tuple pair survives the sequence of blocking rules if none of the rules in the sequence returns True for that pair. If any of the rules returns True, then the pair is blocked (dropped). Args: candset (DataFrame): The input candidate set of tuple pairs. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is the total number of CPUs in the machine). Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `verbose` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `show_progress` is not of type boolean. AssertionError: If there are no rules to apply. Examples: >>> import py_entitymatching as em >>> rb = em.RuleBasedBlocker() >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id') >>> block_f = em.get_features_for_blocking(A, B) >>> rule = ['name_name_lev(ltuple, rtuple) > 3'] >>> rb.add_rule(rule, feature_table=block_f) >>> D = rb.block_candset(C) # C is the candidate set. """ # validate data types of input parameters self.validate_types_params_candset(candset, verbose, show_progress, n_jobs) # get and validate metadata log_info(logger, 'Required metadata: cand.set key, fk ltable, ' + 'fk rtable, ltable, rtable, ltable key, rtable key', verbose) # # get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset( candset, logger, verbose) # # validate metadata cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # validate rules assert len(self.rules.keys()) > 0, 'There are no rules to apply' # do blocking # # initialize the progress bar if show_progress: bar = pyprind.ProgBar(len(candset)) # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # get attributes to project l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(l_key, r_key, [], []) l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs] c_df = self.block_candset_excluding_rule(candset, l_df, r_df, l_key, r_key, fk_ltable, fk_rtable, None, show_progress, n_jobs) # update catalog cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable, ltable, rtable) # return candidate set return c_df
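# --- Illustrative sketch (not part of the library) ---
# How a single candidate-set row is checked against the rules: the foreign
# keys are used to look up the full tuples in the key-indexed ltable and
# rtable projections, exactly as block_candset above prepares them.
def _sketch_candset_row_survives(row, l_df, r_df, fk_ltable, fk_rtable, rules):
    ltuple = l_df.loc[row[fk_ltable]]
    rtuple = r_df.loc[row[fk_rtable]]
    return not any(rule(ltuple, rtuple) for rule in rules)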
def down_sample(table_a, table_b, size, y_param, show_progress=True, verbose=False, seed=None): """ This function down samples two tables A and B into smaller tables A' and B' respectively. Specifically, first it randomly selects `size` tuples from the table B to be table B'. Next, it builds an inverted index I (token, tuple_id) on table A. For each tuple x ∈ B', the algorithm finds a set P of k/2 tuples from I that match x, and a set Q of k/2 tuples randomly selected from A - P. The idea is for A' and B' to share some matches yet be as representative of A and B as possible. Args: table_a,table_b (DataFrame): The input tables A and B. size (int): The size that table B should be down sampled to. y_param (int): The parameter to control the down sample size of table A. Specifically, the down sampled size of table A should be close to size * y_param. show_progress (boolean): A flag to indicate whether a progress bar should be displayed (defaults to True). verbose (boolean): A flag to indicate whether the debug information should be displayed (defaults to False). seed (int): The seed for the pseudo random number generator to select the tuples from A and B (defaults to None). Returns: Down sampled tables A and B as pandas DataFrames. Raises: AssertionError: If any of the input tables (`table_a`, `table_b`) are empty or not a DataFrame. AssertionError: If `size` or `y_param` is empty or 0 or not a valid integer value. AssertionError: If `seed` is not a valid integer value. Examples: >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> sample_A, sample_B = em.down_sample(A, B, 500, 1) # Example with seed = 0. This means the same sample data set will be returned # each time this function is run. >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> sample_A, sample_B = em.down_sample(A, B, 500, 1, seed=0) """ if not isinstance(table_a, pd.DataFrame): logger.error('Input table A is not of type pandas DataFrame') raise AssertionError( 'Input table A is not of type pandas DataFrame') if not isinstance(table_b, pd.DataFrame): logger.error('Input table B is not of type pandas DataFrame') raise AssertionError( 'Input table B is not of type pandas DataFrame') if len(table_a) == 0 or len(table_b) == 0: logger.error('Size of the input table is 0') raise AssertionError('Size of the input table is 0') if size == 0 or y_param == 0: logger.error( 'size or y cannot be zero (3rd and 4th parameter of downsample)') raise AssertionError( 'size or y_param cannot be zero (3rd and 4th parameter of downsample)') if seed is not None and not isinstance(seed, int): logger.error('Seed is not of type integer') raise AssertionError('Seed is not of type integer') if len(table_b) < size: logger.warning( 'Size of table B is less than b_size parameter - using entire table B') # get and validate required metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # # get metadata # l_key, r_key = cm.get_keys_for_ltable_rtable(table_a, table_b, logger, # verbose) # # # # validate metadata # cm._validate_metadata_for_table(table_a, l_key, 'ltable', logger, # verbose) # cm._validate_metadata_for_table(table_b, r_key, 'rtable', logger, # verbose) # Inverted index built on table A will consist of all tuples in such P's and Q's - central idea is to have # good coverage in the down sampled A' and B'. 
s_inv_index = _inv_index(table_a) # Randomly select size tuples from table B to be B' # If a seed value has been given, use a RandomState with the given seed b_sample_size = min(math.floor(size), len(table_b)) if seed is not None: rand = RandomState(seed) else: rand = RandomState() b_tbl_indices = list(rand.choice(len(table_b), int(b_sample_size), replace=False)) # Probe the inverted index to find all tuples in A that share tokens with tuples in B'. s_tbl_indices = _probe_index(table_b.iloc[b_tbl_indices], y_param, len(table_a), s_inv_index, show_progress, seed=seed) s_tbl_indices = list(s_tbl_indices) l_sampled = table_a.iloc[list(s_tbl_indices)] r_sampled = table_b.iloc[list(b_tbl_indices)] # update catalog if cm.is_dfinfo_present(table_a): cm.copy_properties(table_a, l_sampled) if cm.is_dfinfo_present(table_b): cm.copy_properties(table_b, r_sampled) return l_sampled, r_sampled
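# --- Illustrative sketch (not part of the library) ---
# The kind of (token -> set of tuple ids) inverted index that down_sample
# builds on table A and probes with the sampled B' tuples. The real
# _inv_index helper may tokenize, normalize, and select attributes
# differently.
def _sketch_build_inverted_index(table, attrs):
    inv_index = {}
    for row_id, row in table[attrs].iterrows():
        for value in row:
            for token in str(value).lower().split():
                inv_index.setdefault(token, set()).add(row_id)
    return inv_index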
def block_candset(self, candset, verbose=True, show_progress=True, n_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. Blocks an input candidate set of tuple pairs based on a black box blocking function specified by the user. Finds tuple pairs from an input candidate set of tuple pairs that survive the black box function. A tuple pair survives the black box blocking function if the function returns False for that pair, otherwise the tuple pair is dropped. Args: candset (DataFrame): The input candidate set of tuple pairs. verbose (boolean): A flag to indicate whether logging should be done (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_chunks (int): The number of partitions to split the candidate set. If it is set to -1, the number of partitions will be set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `verbose` is not of type boolean. AssertionError: If `n_chunks` is not of type int. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `l_block_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. Examples: >>> def match_last_name(ltuple, rtuple): # assume that there is a 'name' attribute in the input tables # and each value in it has two words l_last_name = ltuple['name'].split()[1] r_last_name = rtuple['name'].split()[1] if l_last_name != r_last_name: return True else: return False >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_black_box_blocker import DaskBlackBoxBlocker >>> bb = DaskBlackBoxBlocker() >>> bb.set_black_box_function(match_last_name) >>> D = bb.block_candset(C) # C is an output from block_tables """ logger.warning( "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK." 
) # validate data types of standard input parameters self.validate_types_params_candset(candset, verbose, show_progress, n_chunks) # validate black box functionn assert self.black_box_function != None, 'Black box function is not set' # get and validate metadata log_info( logger, 'Required metadata: cand.set key, fk ltable, fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset( candset, logger, verbose) # # validate metadata cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) validate_object_type(n_chunks, int, 'Parameter n_chunks') validate_chunks(n_chunks) # do blocking # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # project candset to keep only the ID attributes c_df = candset[[key, fk_ltable, fk_rtable]] # # determine the number of processes to launch parallely n_chunks = get_num_partitions(n_chunks, len(candset)) # # pickle the black-box function before passing it as an arg to # # _block_candset_split to be executed by each child process black_box_function_pkl = cp.dumps(self.black_box_function) valid = [] if n_chunks == 1: # single process valid = _block_candset_split(c_df, l_df, r_df, l_key, r_key, fk_ltable, fk_rtable, black_box_function_pkl, show_progress) else: # multiprocessing c_splits = pd.np.array_split(c_df, n_chunks) valid_splits = [] for i in range(len(c_splits)): partial_result = delayed(_block_candset_split)( c_splits[i], l_df, r_df, l_key, r_key, fk_ltable, fk_rtable, black_box_function_pkl, False) valid_splits.append(partial_result) valid_splits = delayed(wrap)(valid_splits) if show_progress: with ProgressBar(): valid_splits = valid_splits.compute( scheduler="processes", num_workers=get_num_cores()) else: valid_splits = valid_splits.compute( scheduler="processes", num_workers=get_num_cores()) valid = sum(valid_splits, []) # construct output table if len(c_df) > 0: c_df = candset[valid] else: c_df = pd.DataFrame(columns=candset.columns) # update catalog cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable, ltable, rtable) # return candidate set return c_df
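# --- Illustrative sketch (not part of the library) ---
# Why the black-box function is pickled above: it is serialized with
# cloudpickle in the parent process and deserialized inside each worker
# before being applied to that worker's chunk of pairs.
import cloudpickle

def _sketch_black_box_roundtrip(black_box_function, ltuple, rtuple):
    payload = cloudpickle.dumps(black_box_function)    # parent process
    fn_in_worker = cloudpickle.loads(payload)          # worker process
    return fn_in_worker(ltuple, rtuple)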
def block_candset(self, candset, l_block_attr, r_block_attr, allow_missing=False, verbose=False, show_progress=True, n_jobs=1): """Blocks an input candidate set of tuple pairs based on attribute equivalence. Finds tuple pairs from an input candidate set of tuple pairs such that the value of attribute l_block_attr of the left tuple in a tuple pair exactly matches the value of attribute r_block_attr of the right tuple in the tuple pair. Args: candset (DataFrame): The input candidate set of tuple pairs. l_block_attr (string): The blocking attribute in left table. r_block_attr (string): The blocking attribute in right table. allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple pair with missing value in either blocking attribute will be retained in the output candidate set. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is the total number of CPUs in the machine). Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `l_block_attr` is not of type string. AssertionError: If `r_block_attr` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_block_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. 
""" # validate data types of input parameters self.validate_types_params_candset(candset, verbose, show_progress, n_jobs) # validate data types of input blocking attributes self.validate_types_block_attrs(l_block_attr, r_block_attr) # get and validate metadata log_info( logger, 'Required metadata: cand.set key, fk ltable, ' 'fk rtable, ltable, rtable, ltable key, rtable key', verbose) # # get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset( candset, logger, verbose) # # validate metadata cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # validate input parameters self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr) # do blocking # # do projection before merge l_df = ltable[[l_key, l_block_attr]] r_df = rtable[[r_key, r_block_attr]] # # set index for convenience l_df = l_df.set_index(l_key, drop=False) r_df = r_df.set_index(r_key, drop=False) # # determine number of processes to launch parallely n_procs = self.get_num_procs(n_jobs, len(candset)) valid = [] if n_procs <= 1: # single process valid = _block_candset_split(candset, l_df, r_df, l_key, r_key, l_block_attr, r_block_attr, fk_ltable, fk_rtable, allow_missing, show_progress) else: c_splits = pd.np.array_split(candset, n_procs) valid_splits = Parallel(n_jobs=n_procs)( delayed(_block_candset_split) (c_splits[i], l_df, r_df, l_key, r_key, l_block_attr, r_block_attr, fk_ltable, fk_rtable, allow_missing, show_progress and i == len(c_splits) - 1) for i in range(len(c_splits))) valid = sum(valid_splits, []) # construct output table if len(candset) > 0: out_table = candset[valid] else: out_table = pd.DataFrame(columns=candset.columns) # update the catalog cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable, ltable, rtable) # return the output table return out_table
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', verbose=False, show_progress=True, n_ltable_chunks=1, n_rtable_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. Blocks two tables based on a black box blocking function specified by the user. Finds tuple pairs from left and right tables that survive the black box function. A tuple pair survives the black box blocking function if the function returns False for that pair, otherwise the tuple pair is dropped. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_ltable_chunks (int): The number of partitions to split the left table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. n_rtable_chunks (int): The number of partitions to split the right table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If values in `l_output_attrs` is not of type string. AssertionError: If values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_ltable_chunks` is not of type int. AssertionError: If `n_rtable_chunks` is not of type int. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. Examples: >>> def match_last_name(ltuple, rtuple): # assume that there is a 'name' attribute in the input tables # and each value in it has two words l_last_name = ltuple['name'].split()[1] r_last_name = rtuple['name'].split()[1] if l_last_name != r_last_name: return True else: return False >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_black_box_blocker DaskBlackBoxBlocker >>> bb = DaskBlackBoxBlocker() >>> bb.set_black_box_function(match_last_name) >>> C = bb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'] ) """ logger.warning( "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK." 
) # validate data types of standard input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, 1) # validate data type of show_progress self.validate_show_progress(show_progress) # validate black box function assert self.black_box_function != None, 'Black box function is not set' # validate output attributes self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # validate number of ltable and rtable chunks validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks') validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks') validate_chunks(n_ltable_chunks) validate_chunks(n_rtable_chunks) # # determine the number of chunks n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable)) n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable)) # do blocking # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # remove l_key from l_output_attrs and r_key from r_output_attrs l_output_attrs_1 = [] if l_output_attrs: l_output_attrs_1 = [x for x in l_output_attrs if x != l_key] r_output_attrs_1 = [] if r_output_attrs: r_output_attrs_1 = [x for x in r_output_attrs if x != r_key] # # pickle the black-box function before passing it as an arg to # # _block_tables_split to be executed by each child process black_box_function_pkl = cp.dumps(self.black_box_function) if n_ltable_chunks == 1 and n_rtable_chunks == 1: # single process candset = _block_tables_split(l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, black_box_function_pkl, show_progress) else: # multiprocessing l_splits = pd.np.array_split(l_df, n_ltable_chunks) r_splits = pd.np.array_split(r_df, n_rtable_chunks) c_splits = [] for i in range(len(l_splits)): for j in range(len(r_splits)): partial_result = delayed(_block_tables_split)( l_splits[i], r_splits[j], l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, black_box_function_pkl, False) c_splits.append(partial_result) c_splits = delayed(wrap)(c_splits) if show_progress: with ProgressBar(): c_splits = c_splits.compute(scheduler="processes", num_workers=get_num_cores()) else: c_splits = c_splits.compute(scheduler="processes", num_workers=get_num_cores()) candset = pd.concat(c_splits, ignore_index=True) # # determine the attributes to retain in the output candidate set retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix) if len(candset) > 0: candset = candset[retain_cols] else: candset = pd.DataFrame(columns=retain_cols) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset
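# --- Illustrative sketch (not part of the library) ---
# The chunking convention used by the Dask-based blockers above: -1 means one
# chunk per CPU core. Capping the chunk count by the table size is an
# assumption of this sketch; the library's own get_num_partitions helper may
# differ in detail.
import multiprocessing

def _sketch_num_chunks(n_chunks_requested, table_size):
    if n_chunks_requested == -1:
        n_chunks_requested = multiprocessing.cpu_count()
    return max(1, min(n_chunks_requested, table_size))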
def extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                         attrs_after=None, verbose=False,
                         show_progress=True, n_jobs=1):
    """
    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses the feature table, and the ltable and
    rtable (present in the `candset`'s metadata) to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the feature
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to
            None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors
            (defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to
            None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).
        n_jobs (int): The number of parallel jobs to be used for feature
            vector extraction (defaults to 1). If set to -1, all CPUs in
            the machine are used.

    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output DataFrame will have three columns: key, foreign
        key ltable, and foreign key rtable copied from the input candset to
        the output DataFrame. These three columns precede the columns
        mentioned in `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `attrs_before` has attributes that are not
            present in the input candset.
        AssertionError: If `attrs_after` has attributes that are not
            present in the input candset.
        AssertionError: If `feature_table` is set to None.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = em.extract_feature_vecs(G, feature_table=match_f, attrs_before=['title'], attrs_after=['gold_labels'])
    """
    # Validate input parameters
    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # If attrs_before is given, check if the attrs_before are present in
    # # the input candset
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')

    # # If attrs_after is given, check if the attrs_after are present in
    # # the input candset
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('The feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features
    # id_list = [(row[fk_ltable], row[fk_rtable]) for i, row in
    #            candset.iterrows()]
    # id_list = [tuple(tup) for tup in candset[[fk_ltable, fk_rtable]].values]

    # # Set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    n_procs = get_num_procs(n_jobs, len(candset))

    c_splits = pd.np.array_split(candset, n_procs)

    pickled_obj = cloudpickle.dumps(feature_table)

    feat_vals_by_splits = Parallel(n_jobs=n_procs)(
        delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df,
            c_splits[i], show_progress and i == len(c_splits) - 1)
        for i in range(len(c_splits)))

    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)

    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)
    # print(feature_vectors)

    # # Insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # Insert attrs_after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # Reset the index
    # feature_vectors.reset_index(inplace=True, drop=True)

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
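

# --- Illustrative sketch (not part of the library) --------------------------
# `get_feature_vals_by_cand_split` is imported from elsewhere in the package
# and is not shown here. The hypothetical helper below only sketches what such
# a worker could do for one split of the candidate set, assuming the feature
# table carries the feature name in a 'feature_name' column and a callable in
# a 'function' column: for every candidate pair, look up the left/right tuples
# by key and apply each feature function to them.
import cloudpickle


def _feature_vals_for_split_sketch(feature_table_pkl, fk_ltable_idx,
                                   fk_rtable_idx, l_df, r_df, cand_split):
    # Unpickle the feature table in the worker process.
    feature_table = cloudpickle.loads(feature_table_pkl)
    feat_vals = []
    for row in cand_split.itertuples(index=False):
        # The foreign keys are addressed positionally, mirroring how
        # fk_ltable_idx / fk_rtable_idx are computed from the column list.
        ltuple = l_df.loc[row[fk_ltable_idx]]
        rtuple = r_df.loc[row[fk_rtable_idx]]
        # Apply every feature function to the (ltuple, rtuple) pair.
        vals = {feat['feature_name']: feat['function'](ltuple, rtuple)
                for _, feat in feature_table.iterrows()}
        feat_vals.append(vals)
    return feat_vals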
def block_tables(self, ltable, rtable, l_output_attrs=None,
                 r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 verbose=False, show_progress=True, n_ltable_chunks=1,
                 n_rtable_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    Blocks two tables based on the sequence of rules supplied by the user.

    Finds tuple pairs from left and right tables that survive the sequence
    of blocking rules. A tuple pair survives the sequence of blocking rules
    if none of the rules in the sequence returns True for that pair. If any
    of the rules returns True, then the pair is blocked.

    Args:
        ltable (DataFrame): The left input table.
        rtable (DataFrame): The right input table.
        l_output_attrs (list): A list of attribute names from the left
            table to be included in the output candidate set (defaults
            to None).
        r_output_attrs (list): A list of attribute names from the right
            table to be included in the output candidate set (defaults
            to None).
        l_output_prefix (string): The prefix to be used for the attribute
            names coming from the left table in the output candidate set
            (defaults to 'ltable\_').
        r_output_prefix (string): The prefix to be used for the attribute
            names coming from the right table in the output candidate set
            (defaults to 'rtable\_').
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress
            should be displayed to the user (defaults to True).
        n_ltable_chunks (int): The number of partitions to split the left
            table (defaults to 1). If it is set to -1, then the number of
            partitions is set to the number of cores in the machine.
        n_rtable_chunks (int): The number of partitions to split the right
            table (defaults to 1). If it is set to -1, then the number of
            partitions is set to the number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived the sequence of
        blocking rules (DataFrame).

    Raises:
        AssertionError: If `ltable` is not of type pandas DataFrame.
        AssertionError: If `rtable` is not of type pandas DataFrame.
        AssertionError: If `l_output_attrs` is not of type list.
        AssertionError: If `r_output_attrs` is not of type list.
        AssertionError: If the values in `l_output_attrs` are not of type
            string.
        AssertionError: If the values in `r_output_attrs` are not of type
            string.
        AssertionError: If the input `l_output_prefix` is not of type
            string.
        AssertionError: If the input `r_output_prefix` is not of type
            string.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_ltable_chunks` is not of type int.
        AssertionError: If `n_rtable_chunks` is not of type int.
        AssertionError: If `l_output_attrs` are not in the ltable.
        AssertionError: If `r_output_attrs` are not in the rtable.
        AssertionError: If there are no rules to apply.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_rule_based_blocker import DaskRuleBasedBlocker
        >>> rb = DaskRuleBasedBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> rule = ['name_name_lev(ltuple, rtuple) > 3']
        >>> rb.add_rule(rule, feature_table=block_f)
        >>> C = rb.block_tables(A, B)
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.")

    # validate data types of input parameters
    self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                      r_output_attrs, l_output_prefix,
                                      r_output_prefix, verbose, 1)

    # validate data type of show_progress
    self.validate_show_progress(show_progress)

    # validate output attributes
    self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

    # get and validate metadata
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # # get metadata
    l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
    cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)

    # validate rules
    assert len(self.rules.keys()) > 0, 'There are no rules to apply'

    # validate number of ltable and rtable chunks
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')
    validate_chunks(n_ltable_chunks)
    validate_chunks(n_rtable_chunks)

    # # determine the number of chunks
    n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable))
    n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable))

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # remove l_key from l_output_attrs and r_key from r_output_attrs
    l_output_attrs_1 = []
    if l_output_attrs:
        l_output_attrs_1 = [x for x in l_output_attrs if x != l_key]
    r_output_attrs_1 = []
    if r_output_attrs:
        r_output_attrs_1 = [x for x in r_output_attrs if x != r_key]

    # # get attributes to project
    l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(
        l_key, r_key, l_output_attrs_1, r_output_attrs_1)
    l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs]

    # # try to apply a filterable rule first; the number of splits is passed
    # # as the number of cores in the machine
    candset, rule_applied = self.block_tables_with_filters(
        l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1,
        l_output_prefix, r_output_prefix, verbose, show_progress,
        get_num_cores())

    if candset is None:
        # no filterable rule was applied
        candset = self.block_tables_without_filters(
            l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1,
            l_output_prefix, r_output_prefix, verbose, show_progress,
            n_ltable_chunks, n_rtable_chunks)
    elif len(self.rules) > 1:
        # one filterable rule was applied but other rules are left
        # block candset by applying other rules and excluding the applied rule
        candset = self.block_candset_excluding_rule(
            candset, l_df, r_df, l_key, r_key,
            l_output_prefix + l_key, r_output_prefix + r_key,
            rule_applied, show_progress, get_num_cores())

    retain_cols = self.get_attrs_to_retain(l_key, r_key,
                                           l_output_attrs_1, r_output_attrs_1,
                                           l_output_prefix, r_output_prefix)
    if len(candset) > 0:
        candset = candset[retain_cols]
    else:
        candset = pd.DataFrame(columns=retain_cols)

    # update catalog
    key = get_name_for_key(candset.columns)
    candset = add_key_column(candset, key)
    cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                              r_output_prefix + r_key, ltable, rtable)

    # return candidate set
    return candset
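

# --- Illustrative sketch (not part of the library) --------------------------
# The rule bookkeeping (self.rules, block_tables_with_filters,
# block_candset_excluding_rule, ...) lives on the blocker object and is not
# shown here. The hypothetical helper below only restates the survival
# semantics from the docstring: a tuple pair survives the sequence of
# blocking rules only if no rule returns True for it.
def _pair_survives_rules_sketch(rules, ltuple, rtuple):
    # `rules` is assumed to be an iterable of callables taking
    # (ltuple, rtuple) and returning True when the pair should be blocked.
    return not any(rule(ltuple, rtuple) for rule in rules)


# For example, with a single last-name rule in the style of the blocker
# docstrings, only pairs whose last names agree would survive:
#   rules = [lambda l, r: l['name'].split()[1] != r['name'].split()[1]]
#   _pair_survives_rules_sketch(rules, ltuple, rtuple)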