def block_tables(self, ltable, rtable, l_block_attr, r_block_attr,
                 l_output_attrs=None, r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 verbose=True):

    # validate data types of input parameters
    self.validate_types_tables(ltable, rtable, l_block_attr, r_block_attr,
                               l_output_attrs, r_output_attrs,
                               l_output_prefix, r_output_prefix, verbose)

    # validate input parameters
    self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)
    self.validate_output_attrs(ltable, rtable, l_output_attrs,
                               r_output_attrs)

    # get and validate required metadata
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # # get metadata
    l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                 verbose)

    # # validate metadata
    cm.validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
    cm.validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)

    # do blocking

    # # remove NaNs; should be revisited based on the missing-data policy
    l_df, r_df = rem_nan(ltable, l_block_attr), rem_nan(rtable, r_block_attr)

    # # do projection before the merge
    l_proj_attrs = self.get_proj_attrs(l_key, l_block_attr, l_output_attrs)
    l_df = l_df[l_proj_attrs]
    r_proj_attrs = self.get_proj_attrs(r_key, r_block_attr, r_output_attrs)
    r_df = r_df[r_proj_attrs]

    # # use pandas merge to do an equi-join on the blocking attributes
    candset = pd.merge(l_df, r_df, left_on=l_block_attr,
                       right_on=r_block_attr,
                       suffixes=('_ltable', '_rtable'))

    # construct output table
    retain_cols, final_cols = self.output_columns(l_key, r_key,
                                                  list(candset.columns),
                                                  l_output_attrs,
                                                  r_output_attrs,
                                                  l_output_prefix,
                                                  r_output_prefix)
    candset = candset[retain_cols]
    candset.columns = final_cols

    # update catalog
    key = get_name_for_key(candset.columns)
    candset = add_key_column(candset, key)
    cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                              r_output_prefix + r_key, ltable, rtable)

    # return candidate set
    return candset
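# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the library source). It shows how
# the attribute-equivalence block_tables above might be invoked. The class
# name AttrEquivalenceBlocker, the module alias em, and the helper em.set_key
# follow the Magellan-style API but are assumptions here.
#
# >>> import pandas as pd
# >>> import magellan as em
# >>> A = pd.DataFrame({'ID': [1, 2, 3],
# ...                   'zipcode': ['53703', '94043', '53703']})
# >>> B = pd.DataFrame({'ID': [4, 5],
# ...                   'zipcode': ['53703', '60614']})
# >>> em.set_key(A, 'ID')
# >>> em.set_key(B, 'ID')
# >>> ab = em.AttrEquivalenceBlocker()
# >>> C = ab.block_tables(A, B, 'zipcode', 'zipcode',
# ...                     l_output_attrs=['zipcode'],
# ...                     r_output_attrs=['zipcode'])
# C now pairs rows 1 and 3 of A with row 4 of B (equal zipcodes), and the
# catalog records the candidate-set metadata via set_candset_properties.
# ---------------------------------------------------------------------------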
def combine_blocker_outputs_via_union(blocker_output_list,
                                      l_prefix='ltable_',
                                      r_prefix='rtable_',
                                      verbose=False):
    """
    Combine multiple blocker outputs by taking a union of their tuple pair
    ids (foreign key ltable, foreign key rtable).

    Specifically, this function takes in a list of DataFrames (candidate
    sets, typically the output from blockers) and returns a consolidated
    DataFrame. The output DataFrame contains the union of the tuple pair ids
    (foreign key ltable, foreign key rtable) and the other attributes from
    the input list of DataFrames.

    This function makes some assumptions about the input DataFrames. First,
    each DataFrame is expected to contain the following metadata in the
    catalog: key, fk_ltable, fk_rtable, ltable, and rtable. Second, all the
    DataFrames must be a result of blocking from the same underlying tables.
    Concretely, the ltable and rtable properties must refer to the same
    DataFrames across all the input tables. Third, all the input DataFrames
    must have the same fk_ltable and fk_rtable properties. Finally, in each
    input DataFrame, the attributes included from the ltable or rtable must
    be prefixed with the given l_prefix and r_prefix.

    As an example, the schema of an input DataFrame may look like this:
        _id, ltable_ID, rtable_ID, ltable_name, rtable_name

    The input DataFrames may contain different attribute lists, which raises
    the question of how to combine them. Currently, Magellan takes the union
    of the attribute names that have the prefix l_prefix or r_prefix across
    the input tables. After taking the union, for each tuple pair id
    included in the output, the attribute values (for the union-ed attribute
    names) are probed from the ltable/rtable and included in the output.

    A subtle point to note here is that if an input DataFrame has a column
    added by the user (say a label column), then that column will not be
    present in the output. The reason is that the same column may not be
    present in the other candidate sets, so it is not clear how to combine
    them. One possibility is to include the label column in the output for
    all tuple pair ids, but set its value to NaN for the pairs where it is
    not present. Currently, Magellan does not include such columns;
    addressing this is part of future work.

    Args:
        blocker_output_list (list of DataFrames): List of DataFrames that
            should be combined. Refer to the notes section for a detailed
            description of the assumptions made by the function about the
            input list of blocker outputs.
        l_prefix (str): Prefix given to the attributes from the ltable.
        r_prefix (str): Prefix given to the attributes from the rtable.
        verbose (boolean): Flag to indicate whether more detailed
            information about the execution steps should be printed out
            (default value is False).

    Returns:
        A new DataFrame with the combined tuple pairs and other attributes
        from all the blocker outputs.

    Raises:
        AssertionError: If l_prefix is not of type string.
        AssertionError: If r_prefix is not of type string.
        AssertionError: If the length of the input DataFrame list is 0.
        AssertionError: If the input blocker output list is not a list of
            DataFrames.
        AssertionError: If the ltables are different across the input list
            of DataFrames.
        AssertionError: If the rtables are different across the input list
            of DataFrames.
        AssertionError: If the fk_ltable values are different across the
            input list of DataFrames.
        AssertionError: If the fk_rtable values are different across the
            input list of DataFrames.
    """
    # validate input parameters

    # The l_prefix is expected to be of type string
    if not isinstance(l_prefix, six.string_types):
        logger.error('l_prefix is not of type string')
        raise AssertionError('l_prefix is not of type string')

    # The r_prefix is expected to be of type string
    if not isinstance(r_prefix, six.string_types):
        logger.error('r_prefix is not of type string')
        raise AssertionError('r_prefix is not of type string')

    # We cannot combine an empty DataFrame list
    if not len(blocker_output_list) > 0:
        logger.error('There are no DataFrames to combine')
        raise AssertionError('There are no DataFrames to combine')

    # Validate the assumptions about the input tables:
    # # 1) All the input objects must be DataFrames
    # # 2) All the input DataFrames must have the metadata of a candidate set
    # # 3) All the input DataFrames must have the same fk_ltable and fk_rtable
    _validate_lr_tables(blocker_output_list)

    # # Get the ltable and rtable. We take them from the first DataFrame, as
    # all the DataFrames refer to the same ltable and rtable.
    ltable = cm.get_ltable(blocker_output_list[0])
    rtable = cm.get_rtable(blocker_output_list[0])

    # # Get the fk_ltable and fk_rtable. We take them from the first
    # DataFrame, as all the DataFrames have the same fk_ltable and fk_rtable.
    fk_ltable = cm.get_fk_ltable(blocker_output_list[0])
    fk_rtable = cm.get_fk_rtable(blocker_output_list[0])

    # Retrieve the keys for the ltable and rtable.
    l_key = cm.get_key(ltable)
    r_key = cm.get_key(rtable)

    # Check whether fk_ltable starts with the given prefix; if not, it is
    # not an error. Just raise a warning.
    if fk_ltable.startswith(l_prefix) is False:
        logger.warning(
            'Foreign key for ltable is not starting with the given prefix ('
            '%s)', l_prefix)

    # Check whether fk_rtable starts with the given prefix; if not, it is
    # not an error. Just raise a warning.
    if fk_rtable.startswith(r_prefix) is False:
        logger.warning(
            'Foreign key for rtable is not starting with the given prefix ('
            '%s)', r_prefix)

    # Initialize lists
    # # keep track of projected tuple pair ids
    tuple_pair_ids = []
    # # keep track of output attributes from the left table
    l_output_attrs = []
    # # keep track of output attributes from the right table
    r_output_attrs = []

    # For each DataFrame in the given list, project out the tuple pair ids
    # and get the attributes from the ltable and rtable
    for data_frame in blocker_output_list:
        # Project out the tuple pair ids. A tuple pair id is a (fk_ltable,
        # fk_rtable) pair.
        projected_tuple_pair_ids = data_frame[[fk_ltable, fk_rtable]]
        # Update the list that tracks tuple pair ids
        tuple_pair_ids.append(projected_tuple_pair_ids)

        # Get the columns, which should be segregated into the attributes
        # from the ltable and rtable
        col_set = (gh.list_diff(list(data_frame.columns),
                                [fk_ltable, fk_rtable,
                                 cm.get_key(data_frame)]))

        # Segregate the columns as attributes from the ltable and rtable
        l_attrs, r_attrs = _lr_cols(col_set, l_prefix, r_prefix)

        # Update l_output_attrs and r_output_attrs. We use extend because
        # l_attrs and r_attrs are lists.
        l_output_attrs.extend(l_attrs)
        r_output_attrs.extend(r_attrs)

    ch.log_info(logger, 'Concatenating the tuple pair ids across given '
                        'blockers ...', verbose)

    # Concatenate the tuple pair ids from the list of input DataFrames
    concatenated_tuple_pair_ids = pd.concat(tuple_pair_ids)

    ch.log_info(logger, 'Concatenating the tuple pair ids ... DONE', verbose)
    ch.log_info(logger, 'Deduplicating the tuple pair ids ...', verbose)

    # Deduplicate the DataFrame. The returned DataFrame will contain unique
    # tuple pair ids.
    # noinspection PyUnresolvedReferences
    deduplicated_tuple_pair_ids = concatenated_tuple_pair_ids.drop_duplicates()

    ch.log_info(logger, 'Deduplicating the tuple pair ids ... DONE', verbose)

    # Construct output table
    # # Get the unique list of attributes across the different tables
    l_output_attrs = gh.list_drop_duplicates(l_output_attrs)
    r_output_attrs = gh.list_drop_duplicates(r_output_attrs)

    # Reset the index that might have lingered from the concatenation.
    deduplicated_tuple_pair_ids.reset_index(inplace=True, drop=True)

    # Add the output attributes from the ltable and rtable.
    # NOTE: This approach may be inefficient as it probes the ltable and
    # rtable to get the attribute values. A better way would be to fill in
    # the attribute values from the input list of DataFrames. These
    # attribute values could be harvested (at the expense of some space)
    # while we iterate over the input blocker output list the first time.
    # noinspection PyProtectedMember
    consolidated_data_frame = gh._add_output_attributes(
        deduplicated_tuple_pair_ids, fk_ltable, fk_rtable,
        ltable, rtable, l_key, r_key,
        l_output_attrs, r_output_attrs,
        l_prefix, r_prefix,
        validate=False)

    # Sort the DataFrame by fk_ltable and fk_rtable.
    # The method "sort" is deprecated in newer versions of pandas and is
    # replaced by "sort_values", so we first try sort_values and, if that
    # fails, fall back to sort.
    try:
        consolidated_data_frame.sort_values([fk_ltable, fk_rtable],
                                            inplace=True)
    except AttributeError:
        consolidated_data_frame.sort([fk_ltable, fk_rtable], inplace=True)

    # Update the catalog for the consolidated DataFrame.
    # First, get a column name for the key.
    key = ch.get_name_for_key(consolidated_data_frame.columns)
    # Second, add the column as the key.
    consolidated_data_frame = ch.add_key_column(consolidated_data_frame, key)
    # Third, reset the index to remove any out-of-order index values from
    # the sort.
    consolidated_data_frame.reset_index(inplace=True, drop=True)
    # Finally, set the properties for the consolidated DataFrame in the
    # catalog.
    cm.set_candset_properties(consolidated_data_frame, key,
                              fk_ltable, fk_rtable,
                              ltable, rtable)

    # Return the consolidated DataFrame
    return consolidated_data_frame
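# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the library source). Given two
# candidate sets C1 and C2 produced by different blockers over the same
# underlying tables A and B, the union keeps each (fk_ltable, fk_rtable)
# pair exactly once. The blocker objects ab and ob are assumed to be set up
# as in the sketches elsewhere in this file.
#
# >>> C1 = ab.block_tables(A, B, 'zipcode', 'zipcode')
# >>> C2 = ob.block_tables(A, B, 'name', 'name', overlap_size=2)
# >>> C = combine_blocker_outputs_via_union([C1, C2])
# len(C) <= len(C1) + len(C2); a pair surviving both blockers appears once,
# and the output carries the union of the prefixed ltable/rtable attributes.
# ---------------------------------------------------------------------------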
def block_tables(self, ltable, rtable,
                 l_output_attrs=None, r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 verbose=True, show_progress=True):

    # validate rules
    assert len(self.rules.keys()) > 0, 'There are no rules to apply'

    # validate input parameters
    self.validate_output_attrs(ltable, rtable, l_output_attrs,
                               r_output_attrs)

    # get and validate metadata
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # # get metadata
    l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                 verbose)

    # # validate metadata
    cm.validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
    cm.validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)

    # do blocking

    # # initialize progress bar
    if show_progress:
        bar = pyprind.ProgBar(len(ltable) * len(rtable))

    # # list to keep track of the tuple pairs that survive blocking
    valid = []

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # create lookup tables for faster processing
    l_dict = {}
    for k, r in l_df.iterrows():
        l_dict[k] = r
    r_dict = {}
    for k, r in r_df.iterrows():
        r_dict[k] = r

    # # get the position of the id attributes in the tables
    l_id_pos = list(ltable.columns).index(l_key)
    r_id_pos = list(rtable.columns).index(r_key)

    # # iterate through the tuple pairs and apply the rules
    for l_t in ltable.itertuples(index=False):
        ltuple = l_dict[l_t[l_id_pos]]
        for r_t in rtable.itertuples(index=False):
            # # update the progress bar
            if show_progress:
                bar.update()
            rtuple = r_dict[r_t[r_id_pos]]
            res = self.apply_rules(ltuple, rtuple)
            # keep the pair only if no rule blocks it
            if res is not True:
                d = OrderedDict()
                # # add ltable and rtable ids
                ltable_id = l_output_prefix + l_key
                rtable_id = r_output_prefix + r_key
                d[ltable_id] = ltuple[l_key]
                d[rtable_id] = rtuple[r_key]
                # # add l/r output attributes
                if l_output_attrs:
                    l_out = ltuple[l_output_attrs]
                    l_out.index = l_output_prefix + l_out.index
                    d.update(l_out)
                if r_output_attrs:
                    r_out = rtuple[r_output_attrs]
                    r_out.index = r_output_prefix + r_out.index
                    d.update(r_out)
                # # add the ordered dict to the list
                valid.append(d)

    # construct output table
    candset = pd.DataFrame(valid)
    l_output_attrs = self.process_output_attrs(ltable, l_key, l_output_attrs,
                                               l_output_prefix)
    r_output_attrs = self.process_output_attrs(rtable, r_key, r_output_attrs,
                                               r_output_prefix)
    retain_cols = self.get_attrs_to_retain(l_key, r_key,
                                           l_output_attrs, r_output_attrs,
                                           l_output_prefix, r_output_prefix)
    if len(candset) > 0:
        candset = candset[retain_cols]
    else:
        candset = pd.DataFrame(columns=retain_cols)

    # update catalog
    key = get_name_for_key(candset.columns)
    candset = add_key_column(candset, key)
    cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                              r_output_prefix + r_key, ltable, rtable)

    # return candidate set
    return candset
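# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the library source). The
# rule-based block_tables above drops a tuple pair whenever apply_rules
# returns True, so rules are phrased as "block this pair if ...". The
# add_rule signature and feature_table object below are assumptions modeled
# on the Magellan-style API, not confirmed by this file.
#
# >>> rb = em.RuleBasedBlocker()
# >>> rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], feature_table)
# >>> C = rb.block_tables(A, B, l_output_attrs=['name'],
# ...                     r_output_attrs=['name'])
# ---------------------------------------------------------------------------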
def block_tables(self, ltable, rtable,
                 l_output_attrs=None, r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 verbose=True, show_progress=True):

    # validate black box function
    assert self.black_box_function is not None, \
        'Black box function is not set'

    # validate input parameters
    self.validate_output_attrs(ltable, rtable, l_output_attrs,
                               r_output_attrs)

    # get and validate metadata
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # # get metadata
    l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                 verbose)

    # # validate metadata
    cm.validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
    cm.validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)

    # do blocking

    # # initialize progress bar
    if show_progress:
        bar = pyprind.ProgBar(len(ltable) * len(rtable))

    # # list to keep track of the tuple pairs that survive blocking
    valid = []

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # create lookup tables for faster processing
    l_dict = {}
    for k, r in l_df.iterrows():
        l_dict[k] = r
    r_dict = {}
    for k, r in r_df.iterrows():
        r_dict[k] = r

    # # get the position of the id attributes in the tables
    l_id_pos = list(ltable.columns).index(l_key)
    r_id_pos = list(rtable.columns).index(r_key)

    # # iterate through the tuple pairs and apply the black box function
    for l_t in ltable.itertuples(index=False):
        ltuple = l_dict[l_t[l_id_pos]]
        for r_t in rtable.itertuples(index=False):
            # # update the progress bar
            if show_progress:
                bar.update()
            rtuple = r_dict[r_t[r_id_pos]]
            res = self.black_box_function(ltuple, rtuple)
            # keep the pair only if the black box function does not block it
            if res is not True:
                d = OrderedDict()
                # # add ltable and rtable ids
                ltable_id = l_output_prefix + l_key
                rtable_id = r_output_prefix + r_key
                d[ltable_id] = ltuple[l_key]
                d[rtable_id] = rtuple[r_key]
                # # add l/r output attributes
                if l_output_attrs:
                    l_out = ltuple[l_output_attrs]
                    l_out.index = l_output_prefix + l_out.index
                    d.update(l_out)
                if r_output_attrs:
                    r_out = rtuple[r_output_attrs]
                    r_out.index = r_output_prefix + r_out.index
                    d.update(r_out)
                # # add the ordered dict to the list
                valid.append(d)

    # construct output table
    candset = pd.DataFrame(valid)
    l_output_attrs = self.process_output_attrs(ltable, l_key, l_output_attrs,
                                               l_output_prefix)
    r_output_attrs = self.process_output_attrs(rtable, r_key, r_output_attrs,
                                               r_output_prefix)
    retain_cols = self.get_attrs_to_retain(l_key, r_key,
                                           l_output_attrs, r_output_attrs,
                                           l_output_prefix, r_output_prefix)
    if len(candset) > 0:
        candset = candset[retain_cols]
    else:
        candset = pd.DataFrame(columns=retain_cols)

    # update catalog
    key = get_name_for_key(candset.columns)
    candset = add_key_column(candset, key)
    cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                              r_output_prefix + r_key, ltable, rtable)

    # return candidate set
    return candset
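# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the library source). The
# black-box predicate returns True to drop a pair; the loop above keeps a
# pair only when the function does not return True. The class name
# BlackBoxBlocker and the setter name set_black_box_function are assumptions
# modeled on the Magellan-style API.
#
# >>> def different_zipcode(ltuple, rtuple):
# ...     return ltuple['zipcode'] != rtuple['zipcode']
# >>> bb = em.BlackBoxBlocker()
# >>> bb.set_black_box_function(different_zipcode)
# >>> C = bb.block_tables(A, B, l_output_attrs=['zipcode'],
# ...                     r_output_attrs=['zipcode'])
# ---------------------------------------------------------------------------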
def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr,
                 rem_stop_words=False, q_val=None, word_level=True,
                 overlap_size=1,
                 l_output_attrs=None, r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 verbose=True, show_progress=True):

    # validations
    self.validate_overlap_attrs(ltable, rtable, l_overlap_attr,
                                r_overlap_attr)
    self.validate_output_attrs(ltable, rtable, l_output_attrs,
                               r_output_attrs)

    # required metadata: keys from ltable and rtable
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # get metadata
    l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                 verbose)

    # do blocking
    if word_level and q_val is not None:
        raise SyntaxError('Parameters word_level and q_val cannot be set '
                          'together; note that word_level is set to True by '
                          'default, so explicitly set word_level=False to '
                          'use qgrams with the specified q_val')

    # # remove NaNs
    l_df = rem_nan(ltable, l_overlap_attr)
    r_df = rem_nan(rtable, r_overlap_attr)

    # # reset the indexes in the DataFrames
    l_df.reset_index(inplace=True, drop=True)
    r_df.reset_index(inplace=True, drop=True)

    # # create a dummy column with all values set to 1, used as the join key
    # for the final merge. The dummy column name must not occur among the
    # existing column names; get_dummy_col_name is expected to ensure that.
    l_dummy_col_name = self.get_dummy_col_name(l_df.columns)
    r_dummy_col_name = self.get_dummy_col_name(r_df.columns)
    l_df[l_dummy_col_name] = 1
    r_df[r_dummy_col_name] = 1

    # # cast the overlap attribute to string if required
    if l_df.dtypes[l_overlap_attr] != object:
        logger.warning('Left overlap attribute is not of type string; '
                       'converting to string temporarily')
        l_df[l_overlap_attr] = l_df[l_overlap_attr].astype(str)
    if r_df.dtypes[r_overlap_attr] != object:
        logger.warning('Right overlap attribute is not of type string; '
                       'converting to string temporarily')
        r_df[r_overlap_attr] = r_df[r_overlap_attr].astype(str)

    # # create lookup tables for quick access
    l_dict = {}
    r_dict = {}
    for k, r in l_df.iterrows():
        l_dict[k] = r
    for k, r in r_df.iterrows():
        r_dict[k] = r

    # # tokenize the left overlap attribute and build an inverted index
    # from token to the left-table row indices that contain it
    l_colvalues_chopped = self.process_table(l_df, l_overlap_attr, q_val,
                                             rem_stop_words)
    zipped_l_colvalues = zip(l_colvalues_chopped,
                             range(0, len(l_colvalues_chopped)))
    appended_l_colidx_values = [self.append_index_values(val[0], val[1])
                                for val in zipped_l_colvalues]
    inv_idx = {}
    sink = [self.compute_inv_index(t, inv_idx)
            for c in appended_l_colidx_values for t in c]

    # # tokenize the right overlap attribute and probe the inverted index
    r_colvalues_chopped = self.process_table(r_df, r_overlap_attr, q_val,
                                             rem_stop_words)
    r_idx = 0
    white_list = []

    if show_progress:
        bar = pyprind.ProgBar(len(r_colvalues_chopped))

    df_list = []
    for col_values in r_colvalues_chopped:
        if show_progress:
            bar.update()
        qualifying_ltable_indices = self.get_potential_match_indices(
            col_values, inv_idx, overlap_size)
        r_row = r_dict[r_idx]
        r_row_dict = r_row.to_frame().T
        l_rows_dict = l_df.iloc[qualifying_ltable_indices]
        df = l_rows_dict.merge(r_row_dict, left_on=l_dummy_col_name,
                               right_on=r_dummy_col_name,
                               suffixes=('_ltable', '_rtable'))
        if len(df) > 0:
            df_list.append(df)
        r_idx += 1

    # construct the output table
    candset = pd.concat(df_list)
    l_output_attrs = self.process_output_attrs(ltable, l_key, l_output_attrs,
                                               'left')
    r_output_attrs = self.process_output_attrs(rtable, r_key, r_output_attrs,
                                               'right')
    retain_cols, final_cols = self.output_columns(l_key, r_key,
                                                  list(candset.columns),
                                                  l_output_attrs,
                                                  r_output_attrs,
                                                  l_output_prefix,
                                                  r_output_prefix)
    if len(candset) > 0:
        candset = candset[retain_cols]
        candset.columns = final_cols
    else:
        candset = pd.DataFrame(columns=final_cols)

    # update metadata in the catalog
    key = get_name_for_key(candset.columns)
    candset = add_key_column(candset, key)
    cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                              r_output_prefix + r_key, ltable, rtable)

    # return the candidate set
    return candset
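# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the library source). Word-level
# overlap of at least two tokens on the 'name' attribute; to block on
# q-grams instead, set word_level=False and pass q_val (the two options are
# mutually exclusive, as the check at the top of the method enforces). The
# class name OverlapBlocker is an assumption modeled on the Magellan-style
# API.
#
# >>> ob = em.OverlapBlocker()
# >>> C = ob.block_tables(A, B, 'name', 'name', word_level=True,
# ...                     overlap_size=2,
# ...                     l_output_attrs=['name'],
# ...                     r_output_attrs=['name'])
# ---------------------------------------------------------------------------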
def test_add_key_column_invalid_attr(self):
    # an invalid (non-string) key name should be rejected
    with self.assertRaises(AssertionError):
        ch.add_key_column(pd.DataFrame(), None)

def test_add_key_column_invalid_df(self):
    # a missing DataFrame should be rejected
    with self.assertRaises(AssertionError):
        ch.add_key_column(None, 'id')
def test_get_name_for_key_valid_1(self):
    A = pd.read_csv(path_a)
    ch.add_key_column(A, '_id')
    # '_id' is taken, so the next available name should be '_id0'
    s = ch.get_name_for_key(A.columns)
    self.assertEqual(s, '_id0')