def _compute(self, pairs, x, x_link=None):
    """Evaluate every registered feature for the given record pairs.

    Parameters
    ----------
    pairs : pandas.MultiIndex
        The record pairs to compare. Level 0 is used to index ``x``;
        level 1 is used to index ``x_link`` (or ``x`` when ``x_link``
        is None).
    x : pandas.DataFrame
        The frame supplying the left-hand records.
    x_link : pandas.DataFrame, optional
        The frame supplying the right-hand records. When omitted, the
        right-hand records are taken from ``x`` as well.

    Returns
    -------
    object
        Whatever ``self._union`` returns for the list of
        ``(result, label)`` tuples collected from the features.

    Notes
    -----
    Appends the elapsed time and the number of pairs to ``self._eta`` and
    ``self._n``, logs a progress line, and increments ``self._i``.
    """
    # Time the whole comparing step, indexing included.
    t_begin = time.time()

    # Without a second frame the right-hand side is read from ``x``.
    right_source = x if x_link is None else x_link

    left_columns = self._get_labels_left(validate=x)
    df_a_indexed = frame_indexing(x[left_columns], pairs, 0)

    right_columns = self._get_labels_right(validate=right_source)
    df_b_indexed = frame_indexing(right_source[right_columns], pairs, 1)

    collected = []
    for feature in self.features:
        left_labels = feature.labels_left
        right_labels = feature.labels_right

        # Each feature receives one Series per requested column.
        left_series = tuple(
            df_a_indexed[name] for name in listify(left_labels))
        right_series = tuple(
            df_b_indexed[name] for name in listify(right_labels))

        outcome = feature._compute(left_series, right_series)
        collected.append((outcome, feature.label))

    features = self._union(collected, pairs)

    # Bookkeeping used by the progress log lines below.
    n_pairs = pairs.shape[0]
    i_max = self._i_max if self._i_max is not None else '?'
    elapsed = time.time() - t_begin

    self._eta.append(elapsed)
    self._n.append(n_pairs)

    logging.info(
        "comparing [{:d}/{}] - time: {:.2f}s - pairs: {}".format(
            self._i, i_max, elapsed, n_pairs))

    if self._output_log_total:
        # Totals accumulated over all calls so far.
        pairs_so_far = np.sum(self._n)
        time_so_far = np.sum(self._eta)
        logging.info(
            "comparing [{:d}/{}] - time: {:.2f}s - pairs_total: {}".format(
                self._i, i_max, time_so_far, pairs_so_far))

    self._i += 1

    return features
def compute(self, pairs, x, x_link=None):
    """Compare the records of each record pair.

    Calling this method starts the comparing of records.

    Parameters
    ----------
    pairs : pandas.MultiIndex
        A pandas MultiIndex with the record pairs to compare. The indices
        in the MultiIndex are indices of the DataFrame(s) to link.
    x : pandas.DataFrame
        The DataFrame to link. If `x_link` is given, the comparing is a
        linking problem. If `x_link` is not given, the problem is one of
        deduplication.
    x_link : pandas.DataFrame, optional
        The second DataFrame.

    Returns
    -------
    pandas.Series, pandas.DataFrame, numpy.ndarray
        The result of comparing record pairs (the features). Can be a
        tuple with multiple pandas.Series, pandas.DataFrame,
        numpy.ndarray objects.

    Raises
    ------
    ValueError
        If `pairs` is not a two-level pandas.MultiIndex, or if `x` or a
        given `x_link` is not a pandas.DataFrame.
    """
    if not is_pandas_2d_multiindex(pairs):
        raise ValueError(
            "expected pandas.MultiIndex with record pair indices "
            "as first argument"
        )

    if not isinstance(x, pandas.DataFrame):
        raise ValueError("expected pandas.DataFrame as second argument")

    if x_link is not None and not isinstance(x_link, pandas.DataFrame):
        raise ValueError("expected pandas.DataFrame as third argument")

    labels_left = listify(self.labels_left, [])
    labels_right = listify(self.labels_right, [])

    # For deduplication the right-hand records also come from ``x``, but
    # they must be looked up with the *second* level of ``pairs``;
    # otherwise every record would be compared with itself.
    right_frame = x if x_link is None else x_link

    # Left and right columns are selected separately so that a label used
    # on both sides does not produce duplicate columns in one frame.
    df_a = frame_indexing(x[labels_left], pairs, 0)
    df_b = frame_indexing(right_frame[labels_right], pairs, 1)

    data1 = tuple([df_a[lbl] for lbl in listify(self.labels_left)])
    data2 = tuple([df_b[lbl] for lbl in listify(self.labels_right)])

    return self._compute(data1, data2)
def _compute(self, pairs, x, x_link=None):
    """Evaluate every registered feature for the given record pairs.

    Parameters
    ----------
    pairs : pandas.MultiIndex
        The record pairs to compare. Level 0 is used to index ``x``;
        level 1 is used to index ``x_link`` (or ``x`` when ``x_link``
        is None).
    x : pandas.DataFrame
        The frame supplying the left-hand records.
    x_link : pandas.DataFrame, optional
        The frame supplying the right-hand records. When omitted, the
        right-hand records are taken from ``x`` as well.

    Returns
    -------
    object
        Whatever ``self._union`` returns for the list of
        ``(result, label)`` tuples collected from the features.

    Notes
    -----
    Appends the elapsed time and the number of pairs to ``self._eta`` and
    ``self._n``, logs a progress line, and increments ``self._i``.
    """

    def _gather(indexed_frame, labels):
        """Build the data tuple handed to a feature for one side."""
        # None: the feature receives no data for this side.
        if labels is None:
            return tuple()
        # Empty list: the feature receives a column-less frame that
        # still carries the index.
        if labels == []:
            return (indexed_frame[[]], )
        # Otherwise: one Series per requested column.
        return tuple([indexed_frame[name] for name in listify(labels)])

    # Time the whole comparing step, indexing included.
    t_begin = time.time()

    # Without a second frame the right-hand side is read from ``x``.
    right_source = x if x_link is None else x_link

    left_columns = self._get_labels_left(validate=x)
    df_a_indexed = frame_indexing(x[left_columns], pairs, 0)

    right_columns = self._get_labels_right(validate=right_source)
    df_b_indexed = frame_indexing(right_source[right_columns], pairs, 1)

    collected = []
    for feature in self.features:
        left_data = _gather(df_a_indexed, feature.labels_left)
        right_data = _gather(df_b_indexed, feature.labels_right)

        outcome = feature._compute(left_data, right_data)
        collected.append((outcome, feature.label))

    features = self._union(collected, pairs)

    # Bookkeeping used by the progress log lines below.
    n_pairs = pairs.shape[0]
    i_max = self._i_max if self._i_max is not None else '?'
    elapsed = time.time() - t_begin

    self._eta.append(elapsed)
    self._n.append(n_pairs)

    logging.info(
        "comparing [{:d}/{}] - time: {:.2f}s - pairs: {}".format(
            self._i, i_max, elapsed, n_pairs))

    if self._output_log_total:
        # Totals accumulated over all calls so far.
        pairs_so_far = np.sum(self._n)
        time_so_far = np.sum(self._eta)
        logging.info(
            "comparing [{:d}/{}] - time: {:.2f}s - pairs_total: {}".format(
                self._i, i_max, time_so_far, pairs_so_far))

    self._i += 1

    return features
def _compute(self, pairs, x, x_link=None):
    """Evaluate every registered feature and collect the results in a frame.

    Parameters
    ----------
    pairs : pandas.MultiIndex
        The record pairs to compare. Level 0 is used to index ``x``;
        level 1 is used to index ``x_link`` (or ``x`` when ``x_link``
        is None).
    x : pandas.DataFrame
        The frame supplying the left-hand records.
    x_link : pandas.DataFrame, optional
        The frame supplying the right-hand records. When omitted, the
        right-hand records are taken from ``x`` as well.

    Returns
    -------
    pandas.DataFrame
        A frame indexed by ``pairs`` with one column per result column
        produced by the features. A column is named after the feature's
        label when one is given, otherwise after a running integer
        counter.
    """
    logging.info("Comparing - start comparing data")

    # start the timer for the comparing step
    start_time = time.time()

    sublabels_left = self._get_labels_left(validate=x)
    df_a_indexed = frame_indexing(x[sublabels_left], pairs, 0)

    if x_link is None:
        sublabels_right = self._get_labels_right(validate=x)
        df_b_indexed = frame_indexing(x[sublabels_right], pairs, 1)
    else:
        sublabels_right = self._get_labels_right(validate=x_link)
        df_b_indexed = frame_indexing(x_link[sublabels_right], pairs, 1)

    # time spent on indexing alone, reported separately in the log
    index_time = time.time() - start_time

    results = pandas.DataFrame(index=pairs)
    label_num = 0  # running counter; names a column if label is None

    for feat, label in self.features:
        data1 = tuple(
            [df_a_indexed[lbl] for lbl in listify(feat.labels_left)])
        data2 = tuple(
            [df_b_indexed[lbl] for lbl in listify(feat.labels_right)])

        # the feature receives all left Series followed by all right Series
        c = feat._compute(*tuple(data1 + data2))

        if is_pandas_like(c):
            c = c.values  # convert pandas into numpy

        if label is not None:
            label = listify(label)

        is_1d = len(c.shape) == 1
        n_cols = 1 if is_1d else c.shape[1]

        for i in range(n_cols):
            label_val = label[i] if label is not None else label_num
            label_num += 1

            # Store each result column under its own label. Assigning the
            # whole 2-D array to a single label would not split a
            # multi-column result into separate columns.
            results[label_val] = c if is_1d else c[:, i]

    # log timing
    total_time = time.time() - start_time
    logging.info("Comparing - computation time: ~{:.2f}s (from which "
                 "indexing: ~{:.2f}s)".format(total_time, index_time))

    # log results
    logf_result = "Comparing - summary shape={}"
    logging.info(logf_result.format(results.shape))

    return results