def _link_index(self, df_a, df_b):
    """Return the record pairs that agree exactly on the blocking keys.

    The key columns come from ``self.on`` when it is truthy, otherwise from
    ``self.left_on`` / ``self.right_on``. Rows with a missing value in any
    key column are discarded before the two frames are inner-joined.

    Parameters
    ----------
    df_a, df_b : pandas.DataFrame
        The left and right frames to link.

    Returns
    -------
    pandas.MultiIndex
        Index of matching (left, right) index values, with level names
        taken from ``df_a.index.name`` and ``df_b.index.name``.

    Raises
    ------
    ValueError
        If no key labels are configured, or if the left and right key
        lists differ in length.
    """
    # Index name conflicts do not occur here; they are handled in the
    # decorator.
    keys_a, keys_b = listify(self.left_on), listify(self.right_on)
    if self.on:
        keys_a = listify(self.on)
        keys_b = listify(self.on)

    if not (keys_a and keys_b):
        raise ValueError("no column labels given")

    if len(keys_a) != len(keys_b):
        raise ValueError(
            "length of left and right keys needs to be the same")

    # Both sides get the same neutral column names so they can be merged.
    key_names = ["blocking_key_%d" % pos for pos in range(len(keys_a))]

    # left side: key columns without missing values + original index
    left = df_a[keys_a].dropna(axis=0, how='any', inplace=False)
    left.columns = key_names
    left['index_x'] = left.index

    # right side: key columns without missing values + original index
    right = df_b[keys_b].dropna(axis=0, how='any', inplace=False)
    right.columns = key_names
    right['index_y'] = right.index

    # inner join on all blocking keys
    merged = left.merge(right, how='inner', on=key_names)
    pair_index = merged.set_index(['index_x', 'index_y']).index

    return pair_index.rename([df_a.index.name, df_b.index.name])
def _link_index(self, df_a, df_b):
    """Return the record pairs that agree exactly on the blocking keys.

    The key columns are obtained from ``self._get_left_and_right_on()``.
    Rows with a missing value in any key column are discarded and the two
    frames are inner-joined on the keys.

    Parameters
    ----------
    df_a, df_b : pandas.DataFrame
        The left and right frames to link.

    Returns
    -------
    pandas.MultiIndex
        MultiIndex whose levels are the index values of ``df_a`` and
        ``df_b`` and whose codes are the row positions of the matched
        records.
    """
    left_on, right_on = self._get_left_and_right_on()
    left_on = listify(left_on)
    right_on = listify(right_on)

    blocking_keys = ["blocking_key_%d" % i for i in range(len(left_on))]

    # make a dataset for the data on the left
    # 1. make a dataframe
    # 2. rename columns
    # 3. add the row position of every record
    # 4. drop na (last step, so positions still refer to rows of df_a)
    data_left = pandas.DataFrame(df_a[left_on], copy=False)
    data_left.columns = blocking_keys
    data_left['index_x'] = numpy.arange(len(df_a))
    data_left.dropna(axis=0, how='any', subset=blocking_keys, inplace=True)

    # make a dataset for the data on the right (same four steps)
    data_right = pandas.DataFrame(df_b[right_on], copy=False)
    data_right.columns = blocking_keys
    data_right['index_y'] = numpy.arange(len(df_b))
    data_right.dropna(axis=0, how='any', subset=blocking_keys, inplace=True)

    # merge the dataframes
    pairs_df = data_left.merge(data_right, how='inner', on=blocking_keys)

    # ``codes`` is the current name of the keyword formerly called
    # ``labels``; the old spelling is rejected by recent pandas releases.
    return pandas.MultiIndex(
        levels=[df_a.index.values, df_b.index.values],
        codes=[pairs_df['index_x'].values, pairs_df['index_y'].values],
        verify_integrity=False)
def _compute(self, pairs, x, x_link=None):
    """Compute all features for ``pairs`` and log timing information.

    Parameters
    ----------
    pairs : pandas.MultiIndex
        Record pairs to compare.
    x : pandas.DataFrame
        Frame indexed by the first level of ``pairs``; also used for the
        second level when ``x_link`` is None.
    x_link : pandas.DataFrame, optional
        Frame indexed by the second level of ``pairs``.

    Returns
    -------
    The value returned by ``self._union`` for the computed features.
    """
    # start the timer for the comparing step
    t_start = time.time()

    left_labels = self._get_labels_left(validate=x)
    left_frame = frame_indexing(x[left_labels], pairs, 0)

    # without x_link both sides of a pair come from x
    right_source = x if x_link is None else x_link
    right_labels = self._get_labels_right(validate=right_source)
    right_frame = frame_indexing(right_source[right_labels], pairs, 1)

    computed = []
    for feature in self.features:
        left_data = tuple(
            [left_frame[lbl] for lbl in listify(feature.labels_left)])
        right_data = tuple(
            [right_frame[lbl] for lbl in listify(feature.labels_right)])

        outcome = feature._compute(left_data, right_data)
        computed.append((outcome, feature.label))

    combined = self._union(computed, pairs)

    # bookkeeping for the log output
    n_pairs = pairs.shape[0]
    if self._i_max is None:
        i_max = '?'
    else:
        i_max = self._i_max
    elapsed = time.time() - t_start
    self._eta.append(elapsed)
    self._n.append(n_pairs)

    # log this call
    logging.info("comparing [{:d}/{}] - time: {:.2f}s - pairs: {}".format(
        self._i, i_max, elapsed, n_pairs))

    # log the running totals over all calls so far
    if self._output_log_total:
        pairs_total = np.sum(self._n)
        elapsed_total = np.sum(self._eta)

        logging.info(
            "comparing [{:d}/{}] - time: {:.2f}s - pairs_total: {}".format(
                self._i, i_max, elapsed_total, pairs_total))

    self._i += 1

    return combined
def compute(self, pairs, x, x_link=None):
    """Compare the records of each record pair.

    Validates the arguments, selects the columns named by
    ``self.labels_left`` and ``self.labels_right`` for every pair and
    passes them, as positional arguments, to ``self._compute``.

    Parameters
    ----------
    pairs : pandas.MultiIndex
        A pandas MultiIndex with the record pairs to compare. The
        indices in the MultiIndex are indices of the DataFrame(s) to
        link.
    x : pandas.DataFrame
        The DataFrame to link. If `x_link` is given, the comparing is a
        linking problem. If `x_link` is not given, the problem is one of
        deduplication.
    x_link : pandas.DataFrame, optional
        The second DataFrame.

    Returns
    -------
    The value returned by ``self._compute`` for the selected columns.

    Raises
    ------
    ValueError
        If ``pairs`` is not accepted by ``is_pandas_2d_multiindex`` or
        if ``x`` / ``x_link`` is not a pandas.DataFrame.
    """
    if not is_pandas_2d_multiindex(pairs):
        raise ValueError(
            "expected pandas.MultiIndex with record pair indices "
            "as first argument")

    if not isinstance(x, pandas.DataFrame):
        raise ValueError("expected pandas.DataFrame as second argument")

    if not (x_link is None or isinstance(x_link, pandas.DataFrame)):
        raise ValueError("expected pandas.DataFrame as third argument")

    left_labels = listify(self.labels_left)
    right_labels = listify(self.labels_right)

    # without x_link both sides of a pair come from x
    right_source = x if x_link is None else x_link

    left_frame = frame_indexing(x[left_labels], pairs, 0)
    right_frame = frame_indexing(right_source[right_labels], pairs, 1)

    left_series = tuple([left_frame[lbl] for lbl in left_labels])
    right_series = tuple([right_frame[lbl] for lbl in right_labels])

    return self._compute(*(left_series + right_series))
def __init__(self,
             left_on=None,
             right_on=None,
             max_nulls=0,
             max_non_matches=0,
             windows=1,
             **kwargs):
    """Store the neighbourhood-blocking settings on the instance.

    ``left_on``, ``right_on`` and any extra keyword arguments are
    forwarded to the parent initialiser. ``windows`` is normalised with
    ``listify`` before it is stored.
    """
    parent = super(NeighbourhoodBlock, self)
    parent.__init__(left_on=left_on, right_on=right_on, **kwargs)

    self.max_nulls = max_nulls
    self.max_non_matches = max_non_matches
    self.windows = listify(windows)
def _link_index(self, df_a, df_b):
    """Return the record pairs that agree exactly on the blocking keys.

    Either ``self.on`` or both ``self.left_on`` and ``self.right_on``
    must be set. Rows with a missing value in any key column are
    discarded before the two frames are inner-joined.

    Parameters
    ----------
    df_a, df_b : pandas.DataFrame
        The left and right frames to link.

    Returns
    -------
    pandas.MultiIndex
        Index of matching (left, right) index values.

    Raises
    ------
    IndexError
        If "on" is combined with "left_on"/"right_on", or if the
        required key arguments are missing.
    """
    has_left = self.left_on is not None
    has_right = self.right_on is not None

    if self.on is not None:
        if has_left or has_right:
            raise IndexError('Can only pass argument "on" OR "left_on" '
                             'and "right_on", not a combination of both.')
        left_on = right_on = listify(self.on)
    elif not has_left and not has_right:
        raise IndexError('pass argument "on" OR "left_on" and '
                         '"right_on" at class initalization.')
    elif not has_left:
        raise IndexError('Argument "left_on" is missing '
                         'at class initalization.')
    elif not has_right:
        raise IndexError('Argument "right_on" is missing '
                         'at class initalization.')
    else:
        left_on = listify(self.left_on)
        right_on = listify(self.right_on)

    # Both sides get the same neutral column names so they can be merged.
    key_names = ["blocking_key_%d" % pos for pos in range(len(left_on))]

    # left side: key columns without missing values + original index
    left = df_a[left_on].dropna(axis=0, how='any', inplace=False)
    left.columns = key_names
    left['index_x'] = left.index

    # right side: key columns without missing values + original index
    right = df_b[right_on].dropna(axis=0, how='any', inplace=False)
    right.columns = key_names
    right['index_y'] = right.index

    # inner join on all blocking keys
    merged = left.merge(right, how='inner', on=key_names)

    return merged.set_index(['index_x', 'index_y']).index
def _get_labels_right(self, validate=None):
    """Collect the right-hand labels of all features.

    The ``labels_right`` of every entry in ``self.features`` is
    gathered, checked against ``validate`` with ``is_label_dataframe``
    and returned through ``unique``.

    Raises
    ------
    KeyError
        If ``is_label_dataframe`` rejects the collected labels.
    """
    collected = []
    for feature in self.features:
        collected.extend(listify(feature.labels_right))

    # check requested labels (for better error messages)
    if is_label_dataframe(collected, validate):
        return unique(collected)

    raise KeyError("label is not found in the dataframe")
def get_normalized_linkage_params():
    """Normalise the blocking configuration into rank arrays.

    ``self`` and ``dfs`` are read from the enclosing scope.

    Returns
    -------
    tuple
        ``(blocks, indices, rank_distance_limits)`` where ``blocks`` and
        ``indices`` come from ``deduped_blocks_and_indices`` and
        ``rank_distance_limits`` is a float array of shape ``(1, n)``
        holding ``window // 2`` for every window.

    Raises
    ------
    IndexError
        If the two sides have a different number of key columns, or
        none at all.
    ValueError
        If there are more windows than blocking keys, or a window is
        not a positive odd integer.
    """

    def default_on_possibilities():
        # Fallback order for a side without keys: the left keys, the
        # right keys, then the columns present in every frame.
        yield self.left_on
        yield self.right_on
        yield [
            c for c in dfs[0].columns
            if all(c in df.columns for df in dfs)
        ]

    default_on = next(
        iter(
            filter(lambda x: x is not None, default_on_possibilities())))
    key_columns = [
        listify(side_on or default_on)
        for side_on in [self.left_on, self.right_on]
    ]
    key_cols = set(map(len, key_columns))
    n_key_cols = next(iter(key_cols))
    if (len(key_cols) > 1) or (n_key_cols == 0):
        raise IndexError('Invalid blocking keys')

    # Dense-rank every key over the concatenation of all frames. Missing
    # values get rank 0, which becomes -1 after the shift and is turned
    # back into NaN below.
    combined_ranks = numpy.vstack([
        pandas.concat([df[col] for df, col in zip(dfs, col_grp)]).rank(
            method='dense',
            na_option='keep').fillna(0).astype(int).values - 1
        for col_grp in zip(*key_columns)
    ]).astype(float).T
    combined_ranks[combined_ranks < 0] = numpy.nan

    blocks, indices = deduped_blocks_and_indices(
        blocks=combined_ranks,
        indices=split_to_match(numpy.arange(len(combined_ranks)), dfs))

    # Pad the window list with its last entry up to the number of keys.
    n_keys = blocks.shape[1]
    windows = self.windows + self.windows[-1:] * (
        n_keys - len(self.windows))
    if (len(windows) > n_keys) or not all(
            isinstance(w, int) and (w > 0) and (w % 2 == 1)
            for w in windows):
        raise ValueError(
            'Windows must be positive odd integers and the maximum '
            'number allowed is the number of blocking keys'
        )
    rank_distance_limits = (
        numpy.array(windows) // 2).astype(float).reshape((1, -1))

    return blocks, indices, rank_distance_limits
def _get_labels(self, frame_i, validate=None):
    """Collect the labels of all compare functions for one frame.

    Parameters
    ----------
    frame_i
        Key used to look up the labels in every entry of
        ``self._compare_functions`` (``entry[frame_i]``).
    validate : optional
        Object passed to ``is_label_dataframe`` together with the
        collected labels.

    Raises
    ------
    KeyError
        If ``is_label_dataframe`` rejects the collected labels.
    """
    collected = []
    for entry in self._compare_functions:
        collected.extend(listify(entry[frame_i]))

    # check requested labels (for better error messages)
    if is_label_dataframe(collected, validate):
        return unique(collected)

    raise KeyError("label is not found in the dataframe")
def compute(self, pairs, x, x_link=None):
    """Compare the records of each record pair.

    Calling this method starts the comparing of records.

    Parameters
    ----------
    pairs : pandas.MultiIndex
        A pandas MultiIndex with the record pairs to compare. The
        indices in the MultiIndex are indices of the DataFrame(s) to
        link.
    x : pandas.DataFrame
        The DataFrame to link. If `x_link` is given, the comparing is a
        linking problem. If `x_link` is not given, the problem is one of
        deduplication.
    x_link : pandas.DataFrame, optional
        The second DataFrame.

    Returns
    -------
    pandas.DataFrame
        A pandas DataFrame with feature vectors, i.e. the result of
        comparing each record pair. A comparison that returns several
        columns contributes one column per label.
    """
    sublabels_left = self._get_labels_left(validate=x)
    df_a_indexed = self._loc2(x[sublabels_left], pairs, 0)

    if x_link is None:
        sublabels_right = self._get_labels_right(validate=x)
        df_b_indexed = self._loc2(x[sublabels_right], pairs, 1)
    else:
        sublabels_right = self._get_labels_right(validate=x_link)
        df_b_indexed = self._loc2(x_link[sublabels_right], pairs, 1)

    results = pandas.DataFrame(index=pairs)
    label_num = 0  # running counter used to name unlabelled columns

    for f, lbl1, lbl2, label, args, kwargs in self._compare_functions:

        data1 = tuple([df_a_indexed[lbl] for lbl in listify(lbl1)])
        data2 = tuple([df_b_indexed[lbl] for lbl in listify(lbl2)])

        c = f(*tuple(data1 + data2 + args), **kwargs)

        if isinstance(c, (pandas.Series, pandas.DataFrame)):
            c = c.values  # convert pandas into numpy

        if label is not None:
            label = listify(label)

        n_cols = 1 if len(c.shape) == 1 else c.shape[1]

        # one label per result column; the counter advances for every
        # column, whether or not an explicit label was given
        labels = []
        for i in range(n_cols):
            labels.append(label[i] if label is not None else label_num)
            label_num += 1

        if len(c.shape) == 1:
            results[labels[0]] = c
        else:
            # spread a multi-column result over its own labels instead
            # of assigning the whole array to a single column
            for i, label_val in enumerate(labels):
                results[label_val] = c[:, i]

    return results
def _link_index(self, df_a, df_b):
    """Return candidate pairs using a sorted-neighbourhood window.

    The sorting key of both frames is mapped to its position in
    ``self.sorting_key_values``. Records are paired when these positions
    differ by at most ``(window - 1) / 2`` and all other shared columns
    (the optional blocking keys) are equal.

    Parameters
    ----------
    df_a, df_b : pandas.DataFrame
        The left and right frames to link.

    Returns
    -------
    pandas.MultiIndex
        Index of candidate (left, right) index values.

    Raises
    ------
    IndexError
        If "on" is combined with "left_on"/"right_on", or if the
        required key arguments are missing.
    ValueError
        If ``self.window`` is not a positive odd integer.

    Notes
    -----
    When ``self.sorting_key_values`` is None it is computed from the
    data with ``self._get_sorting_key_values`` and stored on the
    instance.
    """
    has_left = self.left_on is not None
    has_right = self.right_on is not None

    if self.on is not None:
        if has_left or has_right:
            raise IndexError('Can only pass argument "on" OR "left_on" '
                             'and "right_on", not a combination of both.')
        left_on = right_on = listify(self.on)
    elif not has_left and not has_right:
        raise IndexError('pass argument "on" OR "left_on" and '
                         '"right_on" at class initalization.')
    elif not has_left:
        raise IndexError('Argument "left_on" is missing '
                         'at class initalization.')
    elif not has_right:
        raise IndexError('Argument "right_on" is missing '
                         'at class initalization.')
    else:
        left_on = listify(self.left_on)
        right_on = listify(self.right_on)

    window = self.window

    # the window has to be a non-negative, odd integer
    window_ok = (isinstance(window, int) and not (window < 0)
                 and bool(window % 2))
    if not window_ok:
        raise ValueError(
            'window is not a positive and odd integer')

    # optional exact-match blocking keys; block_on overrides both sides
    if self.block_on:
        block_left_on = listify(self.block_on)
        block_right_on = listify(self.block_on)
    else:
        block_left_on = listify(self.block_left_on)
        block_right_on = listify(self.block_right_on)

    # keep only the relevant columns and drop rows with missing values
    left = df_a[listify(left_on) + block_left_on].dropna(
        axis=0, how='any', inplace=False)
    left_names = ['sorting_key']
    left_names += ["blocking_key_%d" % pos
                   for pos in range(len(block_left_on))]
    left.columns = left_names
    left['index_x'] = left.index

    right = df_b[listify(right_on) + block_right_on].dropna(
        axis=0, how='any', inplace=False)
    right_names = ['sorting_key']
    right_names += ["blocking_key_%d" % pos
                    for pos in range(len(block_right_on))]
    right.columns = right_names
    right['index_y'] = right.index

    # sorting_key_values is the terminology in Data Matching [Christen,
    # 2012]
    if self.sorting_key_values is None:
        self.sorting_key_values = self._get_sorting_key_values(
            left['sorting_key'].values,
            right['sorting_key'].values
        )

    # replace every sorting key value by its position in the sorted list
    positions = numpy.arange(len(self.sorting_key_values))
    key_to_position = pandas.Series(
        positions, index=self.sorting_key_values)

    left['sorting_key'] = left['sorting_key'].map(key_to_position)
    right['sorting_key'] = right['sorting_key'].map(key_to_position)

    # number of positions to look at on either side
    half_window = int((window - 1) / 2)

    def merge_lagged(x, y, w):
        """Merge two dataframes with a lag on in the sorting key."""
        shifted = y.copy()
        shifted['sorting_key'] = shifted['sorting_key'] + w
        return x.merge(shifted, how='inner')

    lagged_merges = []
    for lag in range(-half_window, half_window + 1):
        lagged_merges.append(merge_lagged(left, right, lag))

    stacked = pandas.concat(lagged_merges, axis=0)

    return stacked.set_index(['index_x', 'index_y']).index
def __init__(self, left_on, right_on=None, **kwargs):
    """Store the key labels and create the Soundex encoder.

    ``right_on`` falls back to the (listified) ``left_on`` when it is
    falsy. Extra keyword arguments are forwarded to the parent
    initialiser.
    """
    super().__init__(**kwargs)

    self.left_on = listify(left_on)
    if right_on:
        self.right_on = listify(right_on)
    else:
        self.right_on = self.left_on

    self.soundex = fuzzy.Soundex(4)
def _compute(self, pairs, x, x_link=None):
    """Compute all features for ``pairs`` and log timing information.

    Parameters
    ----------
    pairs : pandas.MultiIndex
        Record pairs to compare.
    x : pandas.DataFrame
        Frame indexed by the first level of ``pairs``; also used for the
        second level when ``x_link`` is None.
    x_link : pandas.DataFrame, optional
        Frame indexed by the second level of ``pairs``.

    Returns
    -------
    The value returned by ``self._union`` for the computed features.
    """

    def _select(frame, labels):
        # None: no data is passed to the feature
        if labels is None:
            return tuple()
        # empty list: a frame without columns (index only) is passed
        if labels == []:
            return (frame[[]], )
        # otherwise: one Series per requested label
        return tuple([frame[lbl] for lbl in listify(labels)])

    # start the timer for the comparing step
    t_start = time.time()

    left_labels = self._get_labels_left(validate=x)
    left_frame = frame_indexing(x[left_labels], pairs, 0)

    # without x_link both sides of a pair come from x
    right_source = x if x_link is None else x_link
    right_labels = self._get_labels_right(validate=right_source)
    right_frame = frame_indexing(right_source[right_labels], pairs, 1)

    computed = []
    for feature in self.features:
        left_data = _select(left_frame, feature.labels_left)
        right_data = _select(right_frame, feature.labels_right)

        outcome = feature._compute(left_data, right_data)
        computed.append((outcome, feature.label))

    combined = self._union(computed, pairs)

    # bookkeeping for the log output
    n_pairs = pairs.shape[0]
    if self._i_max is None:
        i_max = '?'
    else:
        i_max = self._i_max
    elapsed = time.time() - t_start
    self._eta.append(elapsed)
    self._n.append(n_pairs)

    # log this call
    logging.info("comparing [{:d}/{}] - time: {:.2f}s - pairs: {}".format(
        self._i, i_max, elapsed, n_pairs))

    # log the running totals over all calls so far
    if self._output_log_total:
        pairs_total = np.sum(self._n)
        elapsed_total = np.sum(self._eta)

        logging.info(
            "comparing [{:d}/{}] - time: {:.2f}s - pairs_total: {}".format(
                self._i, i_max, elapsed_total, pairs_total))

    self._i += 1

    return combined
def compare(self, comp_func, labels_a, labels_b, *args, **kwargs):
    """Compare two records.

    Core method to compare record pairs. This method takes a function
    and data from both records in the record pair. The data is compared
    with the compare function. The built-in methods also use this
    function.

    The comparison always runs in the calling process. The former
    multi-process branch could not execute (it referenced undefined
    names), so ``njobs`` is not consulted here.

    Example
    -------

    >>> comp = recordlinkage.Compare(PAIRS, DATAFRAME1, DATAFRAME2)
    >>> comp.exact('first_name', 'name')

    >>> # same as
    >>> comp.compare(recordlinkage._compare_exact, 'first_name', 'name')

    Parameters
    ----------
    comp_func : function
        A comparison function. This function can be a built-in function
        or a user defined comparison function.
    labels_a : label, pandas.Series, pandas.DataFrame
        The labels, Series or DataFrame to compare.
    labels_b : label, pandas.Series, pandas.DataFrame
        The labels, Series or DataFrame to compare.
    name : label
        The name of the feature and the name of the column.
    store : bool, default True
        Store the result in the dataframe.
    *args, **kwargs
        Passed on to ``comp_func`` after the data arguments.

    Returns
    -------
    pandas.Series
        A pandas series with the result of comparing each record pair.

    Raises
    ------
    ValueError
        If there are no record pairs.
    KeyError
        If a label is not a column of the corresponding dataframe.
    """
    if len(self.pairs) == 0:
        raise ValueError("need at least one record pair")

    # the name and store arguments
    name = kwargs.pop('name', None)
    store = kwargs.pop('store', True)

    labels_a = listify(labels_a)
    labels_b = listify(labels_b)

    data_a = []

    for label_a in labels_a:

        # the label is a numpy or pandas object
        if is_numpy_like(label_a) or is_pandas_like(label_a):
            data_a.append(label_a)

        # check requested labels (for better error messages)
        elif label_a not in self.df_a.columns:
            raise KeyError("label '{}' is not found in the first "
                           "dataframe".format(label_a))

        # low memory: index only the requested column
        elif self.low_memory:
            data_a.append(self._loc2(self.df_a[label_a], self.pairs, 0))

        # not low memory: index the whole frame once and reuse it
        else:
            if self._df_a_indexed is None:
                self._df_a_indexed = self._loc2(
                    self.df_a, self.pairs, 0)

            data_a.append(self._df_a_indexed[label_a])

    data_a = tuple(data_a)

    data_b = []

    for label_b in labels_b:

        # the label is a numpy or pandas object
        if is_numpy_like(label_b) or is_pandas_like(label_b):
            data_b.append(label_b)

        # check requested labels (for better error messages)
        elif label_b not in self.df_b.columns:
            raise KeyError("label '{}' is not found in the second "
                           "dataframe".format(label_b))

        # low memory: index only the requested column
        elif self.low_memory:
            data_b.append(self._loc2(self.df_b[label_b], self.pairs, 1))

        # not low memory: index the whole frame once and reuse it
        else:
            if self._df_b_indexed is None:
                self._df_b_indexed = self._loc2(
                    self.df_b, self.pairs, 1)

            data_b.append(self._df_b_indexed[label_b])

    data_b = tuple(data_b)

    # Compute the comparison
    c = comp_func(*tuple(data_a + data_b + args), **kwargs)

    # if a pandas series is returned, overwrite the index. The
    # returned index can be different than the MultiIndex passed to
    # the compare function.
    if isinstance(c, pandas.Series):
        c.index = self.vectors.index

    # store=False: hand the result back without touching self.vectors
    if not store:
        if not isinstance(c, pandas.Series):
            c = pandas.Series(c, index=self.vectors.index)
        return c.rename(name)

    # append column to Compare.vectors
    name_or_id = name if name else len(self.vectors.columns)
    self.vectors[name_or_id] = c

    return self.vectors[name_or_id].rename(name)
def _link_index(self, df_a, df_b):
    """Return candidate pairs using a sorted-neighbourhood window.

    The sorting key of both frames is mapped to its position in
    ``self.sorting_key_values``. Records are paired when these positions
    differ by at most ``(window - 1) / 2`` and all other shared columns
    (the optional blocking keys) are equal.

    Parameters
    ----------
    df_a, df_b : pandas.DataFrame
        The left and right frames to link.

    Returns
    -------
    pandas.MultiIndex
        MultiIndex whose levels are the index values of ``df_a`` and
        ``df_b`` and whose codes are the row positions of the paired
        records.

    Raises
    ------
    ValueError
        If ``self.window`` is not a positive odd integer.

    Notes
    -----
    When ``self.sorting_key_values`` is None it is computed from the
    data with ``self._get_sorting_key_values`` and stored on the
    instance.
    """
    left_on, right_on = self._get_left_and_right_on()
    left_on = listify(left_on)
    right_on = listify(right_on)

    window = self.window

    # Check if window is an odd number
    if not isinstance(window, int) or (window < 0) or not bool(window % 2):
        raise ValueError('window is not a positive and odd integer')

    # make blocking keys correct; block_on overrides both sides
    block_left_on = listify(self.block_left_on)
    block_right_on = listify(self.block_right_on)

    if self.block_on:
        block_left_on = listify(self.block_on)
        block_right_on = listify(self.block_on)

    blocking_keys = ['sorting_key'] + \
        ["blocking_key_%d" % i for i in range(len(block_left_on))]

    # make a dataset for the data on the left
    # 1. make a dataframe
    # 2. rename columns
    # 3. add the row position of every record
    # 4. drop na (last step, so positions still refer to rows of df_a)
    data_left = pandas.DataFrame(df_a[listify(left_on) + block_left_on],
                                 copy=False)
    data_left.columns = blocking_keys
    data_left['index_x'] = numpy.arange(len(df_a))
    data_left.dropna(axis=0, how='any', subset=blocking_keys, inplace=True)

    # make a dataset for the data on the right (same four steps)
    data_right = pandas.DataFrame(df_b[listify(right_on) + block_right_on],
                                  copy=False)
    data_right.columns = blocking_keys
    data_right['index_y'] = numpy.arange(len(df_b))
    data_right.dropna(axis=0, how='any', subset=blocking_keys, inplace=True)

    # sorting_key_values is the terminology in Data Matching [Christen,
    # 2012]
    if self.sorting_key_values is None:

        self.sorting_key_values = self._get_sorting_key_values(
            data_left['sorting_key'].values,
            data_right['sorting_key'].values)

    # replace every sorting key value by its position in the sorted list
    sorting_key_factors = pandas.Series(
        numpy.arange(len(self.sorting_key_values)),
        index=self.sorting_key_values)

    data_left['sorting_key'] = data_left['sorting_key'].map(
        sorting_key_factors)
    data_right['sorting_key'] = data_right['sorting_key'].map(
        sorting_key_factors)

    # Internal window size
    _window = int((window - 1) / 2)

    def merge_lagged(x, y, w):
        """Merge two dataframes with a lag on in the sorting key."""

        y = y.copy()
        y['sorting_key'] = y['sorting_key'] + w

        return x.merge(y, how='inner')

    pairs_concat = [
        merge_lagged(data_left, data_right, w)
        for w in range(-_window, _window + 1)
    ]

    pairs_df = pandas.concat(pairs_concat, axis=0)

    # ``codes`` is the current name of the keyword formerly called
    # ``labels``; the old spelling is rejected by recent pandas releases.
    return pandas.MultiIndex(
        levels=[df_a.index.values, df_b.index.values],
        codes=[pairs_df['index_x'].values, pairs_df['index_y'].values],
        verify_integrity=False)
def compute(self, pairs, x, x_link=None):
    """Compare the records of each record pair.

    Calling this method starts the comparing of records.

    Parameters
    ----------
    pairs : pandas.MultiIndex
        A pandas MultiIndex with the record pairs to compare. The
        indices in the MultiIndex are indices of the DataFrame(s) to
        link.
    x : pandas.DataFrame
        The DataFrame to link. If `x_link` is given, the comparing is a
        linking problem. If `x_link` is not given, the problem is one of
        deduplication.
    x_link : pandas.DataFrame, optional
        The second DataFrame.

    Returns
    -------
    pandas.DataFrame
        A pandas DataFrame with feature vectors, i.e. the result of
        comparing each record pair. A comparison that returns several
        columns contributes one column per label.

    Raises
    ------
    ValueError
        If ``pairs`` is not a pandas.MultiIndex or if ``x`` / ``x_link``
        is not a pandas.DataFrame.
    """
    if not isinstance(pairs, pandas.MultiIndex):
        raise ValueError(
            "expected pandas.MultiIndex with record pair indices "
            "as first argument")

    if not isinstance(x, pandas.DataFrame):
        raise ValueError("expected pandas.DataFrame as second argument")

    if x_link is not None and not isinstance(x_link, pandas.DataFrame):
        raise ValueError("expected pandas.DataFrame as third argument")

    logging.info("Comparing - start comparing data")

    # start the timer for the comparing step
    start_time = time.time()

    sublabels_left = self._get_labels_left(validate=x)
    df_a_indexed = self._loc2(x[sublabels_left], pairs, 0)

    if x_link is None:
        sublabels_right = self._get_labels_right(validate=x)
        df_b_indexed = self._loc2(x[sublabels_right], pairs, 1)
    else:
        sublabels_right = self._get_labels_right(validate=x_link)
        df_b_indexed = self._loc2(x_link[sublabels_right], pairs, 1)

    # time spent on indexing only
    index_time = time.time() - start_time

    results = pandas.DataFrame(index=pairs)
    label_num = 0  # running counter used to name unlabelled columns

    for f, lbl1, lbl2, label, args, kwargs in self._compare_functions:

        data1 = tuple([df_a_indexed[lbl] for lbl in listify(lbl1)])
        data2 = tuple([df_b_indexed[lbl] for lbl in listify(lbl2)])

        c = f(*tuple(data1 + data2 + args), **kwargs)

        if isinstance(c, (pandas.Series, pandas.DataFrame)):
            c = c.values  # convert pandas into numpy

        if label is not None:
            label = listify(label)

        n_cols = 1 if len(c.shape) == 1 else c.shape[1]

        # one label per result column; the counter advances for every
        # column, whether or not an explicit label was given
        labels = []
        for i in range(n_cols):
            labels.append(label[i] if label is not None else label_num)
            label_num += 1

        if len(c.shape) == 1:
            results[labels[0]] = c
        else:
            # spread a multi-column result over its own labels instead
            # of assigning the whole array to a single column
            for i, label_val in enumerate(labels):
                results[label_val] = c[:, i]

    # total time including indexing
    total_time = time.time() - start_time

    # log timing
    logging.info("Comparing - computation time: ~{:.2f}s (from which "
                 "indexing: ~{:.2f}s)".format(total_time, index_time))

    # log results
    logf_result = "Comparing - summary shape={}"
    logging.info(logf_result.format(results.shape))

    return results
def _link_index(self, df_a, df_b):
    """Return candidate pairs using a sorted-neighbourhood window.

    The sorting key of both frames is mapped to its position in
    ``self.sorting_key_values``. Records are paired when these positions
    differ by at most ``(window - 1) / 2`` and all other shared columns
    (the optional blocking keys) are equal.

    Parameters
    ----------
    df_a, df_b : pandas.DataFrame
        The left and right frames to link.

    Returns
    -------
    pandas.MultiIndex
        Index of candidate (left, right) index values, with level names
        taken from ``df_a.index.name`` and ``df_b.index.name``.

    Raises
    ------
    ValueError
        If no key labels are configured, if the left and right key
        lists differ in length, or if ``self.window`` is not a positive
        odd integer.

    Notes
    -----
    When ``self.sorting_key_values`` is None it is computed from the
    data with ``self._get_sorting_key_values`` and stored on the
    instance.
    """
    # Index name conflicts do not occur here; they are handled in the
    # decorator.
    left_on, right_on = listify(self.left_on), listify(self.right_on)
    if self.on:
        left_on = listify(self.on)
        right_on = listify(self.on)

    if not (left_on and right_on):
        raise ValueError("no column labels given")

    if len(left_on) != len(right_on):
        raise ValueError(
            "length of left and right keys needs to be the same")

    window = self.window

    # the window has to be a non-negative, odd integer
    window_ok = (isinstance(window, int) and not (window < 0)
                 and bool(window % 2))
    if not window_ok:
        raise ValueError('window is not a positive and odd integer')

    # optional exact-match blocking keys; block_on overrides both sides
    if self.block_on:
        block_left_on = listify(self.block_on)
        block_right_on = listify(self.block_on)
    else:
        block_left_on = listify(self.block_left_on)
        block_right_on = listify(self.block_right_on)

    # keep only the relevant columns and drop rows with missing values
    left = df_a[listify(left_on) + block_left_on].dropna(
        axis=0, how='any', inplace=False)
    left_names = ['sorting_key']
    left_names += ["blocking_key_%d" % pos
                   for pos in range(len(block_left_on))]
    left.columns = left_names
    left['index_x'] = left.index

    right = df_b[listify(right_on) + block_right_on].dropna(
        axis=0, how='any', inplace=False)
    right_names = ['sorting_key']
    right_names += ["blocking_key_%d" % pos
                    for pos in range(len(block_right_on))]
    right.columns = right_names
    right['index_y'] = right.index

    # sorting_key_values is the terminology in Data Matching [Christen,
    # 2012]
    if self.sorting_key_values is None:
        self.sorting_key_values = self._get_sorting_key_values(
            left['sorting_key'].values,
            right['sorting_key'].values)

    # replace every sorting key value by its position in the sorted list
    positions = numpy.arange(len(self.sorting_key_values))
    key_to_position = pandas.Series(
        positions, index=self.sorting_key_values)

    left['sorting_key'] = left['sorting_key'].map(key_to_position)
    right['sorting_key'] = right['sorting_key'].map(key_to_position)

    # number of positions to look at on either side
    half_window = int((window - 1) / 2)

    def merge_lagged(x, y, w):
        """Merge two dataframes with a lag on in the sorting key."""
        shifted = y.copy()
        shifted['sorting_key'] = shifted['sorting_key'] + w
        return x.merge(shifted, how='inner')

    lagged_merges = []
    for lag in range(-half_window, half_window + 1):
        lagged_merges.append(merge_lagged(left, right, lag))

    stacked = pandas.concat(lagged_merges, axis=0)
    pair_index = stacked.set_index(['index_x', 'index_y']).index

    return pair_index.rename([df_a.index.name, df_b.index.name])
def compare(self, comp_func, labels_a, labels_b, *args, **kwargs):
    """[DEPRECATED] Compare two records.

    Core method to compare record pairs. This method takes a function
    and data from both records in the record pair. The data is compared
    with the compare function. The built-in methods also use this
    function.

    Example
    -------

    >>> comp = recordlinkage.Compare(PAIRS, DATAFRAME1, DATAFRAME2)
    >>> comp.exact('first_name', 'name')

    >>> # same as
    >>> comp.compare(recordlinkage._compare_exact, 'first_name', 'name')

    Parameters
    ----------
    comp_func : function
        A comparison function. This function can be a built-in function
        or a user defined comparison function.
    labels_a : label, pandas.Series, pandas.DataFrame
        The labels, Series or DataFrame to compare.
    labels_b : label, pandas.Series, pandas.DataFrame
        The labels, Series or DataFrame to compare.
    name : label
        The name of the feature and the name of the column.
    store : bool, default True
        Store the result in the dataframe.
    *args, **kwargs
        Passed on to ``comp_func`` after the data arguments.

    Returns
    -------
    pandas.Series
        A pandas series with the result of comparing each record pair.

    Raises
    ------
    ValueError
        If a pandas.MultiIndex is passed as first argument (new API
        usage) or if there are no record pairs.
    KeyError
        If a label is not a column of the corresponding dataframe.
    """
    if isinstance(comp_func, pandas.MultiIndex):
        raise ValueError("see new api documentation: "
                         "use method 'compute' instead of 'compare'")

    if len(self.pairs) == 0:
        raise ValueError("need at least one record pair")

    # the name and store arguments
    name = kwargs.pop('name', None)
    store = kwargs.pop('store', True)

    labels_a = listify(labels_a)
    labels_b = listify(labels_b)

    data_a = []

    for label_a in labels_a:

        # the label is a numpy or pandas object
        if is_numpy_like(label_a) or is_pandas_like(label_a):
            data_a.append(label_a)

        # check requested labels (for better error messages)
        elif label_a not in self.df_a.columns:
            raise KeyError("label '{}' is not found in the first "
                           "dataframe".format(label_a))

        # low memory: index only the requested column
        elif self.low_memory:
            data_a.append(self._loc2(self.df_a[label_a], self.pairs, 0))

        # not low memory: index the whole frame once and reuse it
        else:
            if self._df_a_indexed is None:
                self._df_a_indexed = self._loc2(
                    self.df_a, self.pairs, 0)

            data_a.append(self._df_a_indexed[label_a])

    data_a = tuple(data_a)

    data_b = []

    for label_b in labels_b:

        # the label is a numpy or pandas object
        if is_numpy_like(label_b) or is_pandas_like(label_b):
            data_b.append(label_b)

        # check requested labels (for better error messages)
        elif label_b not in self.df_b.columns:
            raise KeyError("label '{}' is not found in the second "
                           "dataframe".format(label_b))

        # low memory: index only the requested column
        elif self.low_memory:
            data_b.append(self._loc2(self.df_b[label_b], self.pairs, 1))

        # not low memory: index the whole frame once and reuse it
        else:
            if self._df_b_indexed is None:
                self._df_b_indexed = self._loc2(
                    self.df_b, self.pairs, 1)

            data_b.append(self._df_b_indexed[label_b])

    data_b = tuple(data_b)

    # Compute the comparison
    c = comp_func(*tuple(data_a + data_b + args), **kwargs)

    # if a pandas series is returned, overwrite the index. The
    # returned index can be different than the MultiIndex passed to
    # the compare function.
    if isinstance(c, pandas.Series):
        c.index = self.vectors.index

    # store=False: hand the result back without touching self.vectors
    if not store:
        if not isinstance(c, pandas.Series):
            c = pandas.Series(c, index=self.vectors.index)
        return c.rename(name)

    # append column to Compare.vectors
    name_or_id = name if name else len(self.vectors.columns)
    self.vectors[name_or_id] = c

    return self.vectors[name_or_id].rename(name)
def _compute(self, pairs, x, x_link=None):
    """Compute all features for ``pairs`` and collect them in a frame.

    Parameters
    ----------
    pairs : pandas.MultiIndex
        Record pairs to compare.
    x : pandas.DataFrame
        Frame indexed by the first level of ``pairs``; also used for the
        second level when ``x_link`` is None.
    x_link : pandas.DataFrame, optional
        Frame indexed by the second level of ``pairs``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``pairs`` with one column per computed feature
        column. A feature that returns several columns contributes one
        column per label.
    """
    logging.info("Comparing - start comparing data")

    # start the timer for the comparing step
    start_time = time.time()

    sublabels_left = self._get_labels_left(validate=x)
    df_a_indexed = frame_indexing(x[sublabels_left], pairs, 0)

    if x_link is None:
        sublabels_right = self._get_labels_right(validate=x)
        df_b_indexed = frame_indexing(x[sublabels_right], pairs, 1)
    else:
        sublabels_right = self._get_labels_right(validate=x_link)
        df_b_indexed = frame_indexing(x_link[sublabels_right], pairs, 1)

    # time spent on indexing only
    index_time = time.time() - start_time

    results = pandas.DataFrame(index=pairs)
    label_num = 0  # running counter used to name unlabelled columns

    for feat, label in self.features:

        lbl1 = feat.labels_left
        lbl2 = feat.labels_right

        data1 = tuple([df_a_indexed[lbl] for lbl in listify(lbl1)])
        data2 = tuple([df_b_indexed[lbl] for lbl in listify(lbl2)])

        c = feat._compute(*tuple(data1 + data2))

        if is_pandas_like(c):
            c = c.values  # convert pandas into numpy

        if label is not None:
            label = listify(label)

        n_cols = 1 if len(c.shape) == 1 else c.shape[1]

        # one label per result column; the counter advances for every
        # column, whether or not an explicit label was given
        labels = []
        for i in range(n_cols):
            labels.append(label[i] if label is not None else label_num)
            label_num += 1

        if len(c.shape) == 1:
            results[labels[0]] = c
        else:
            # spread a multi-column result over its own labels instead
            # of assigning the whole array to a single column
            for i, label_val in enumerate(labels):
                results[label_val] = c[:, i]

    # total time including indexing
    total_time = time.time() - start_time

    # log timing
    logging.info("Comparing - computation time: ~{:.2f}s (from which "
                 "indexing: ~{:.2f}s)".format(total_time, index_time))

    # log results
    logf_result = "Comparing - summary shape={}"
    logging.info(logf_result.format(results.shape))

    return results