def random_pairs_without_replacement_large_frames(
        n, shape, random_state=None):
    """Make a sample of random pairs without replacement.

    Flat pair indices are drawn in batches from ``[0, max_pairs(shape))``
    and accumulated until at least ``n`` distinct indices are available.
    The first ``n`` distinct indices (in the order they were drawn) are
    then mapped to 2d indices.

    Parameters
    ----------
    n : int
        The number of distinct record pairs to sample.
    shape : tuple
        When it has a single element, the flat indices are mapped with
        ``_map_triu_1d_on_2d``; otherwise with ``numpy.unravel_index``.
    random_state : int, numpy.random.RandomState or None, optional
        A ``RandomState`` instance is used as is; anything else is passed
        to ``numpy.random.RandomState`` as the seed.

    Returns
    -------
    The 2d indices produced by ``_map_triu_1d_on_2d`` (one-element
    ``shape``) or ``numpy.unravel_index`` (otherwise).

    Raises
    ------
    ValueError
        If ``n`` is larger than the number of available pairs; sampling
        without replacement could never finish in that case.
    """
    if not isinstance(random_state, np.random.RandomState):
        random_state = np.random.RandomState(random_state)

    n_max = max_pairs(shape)

    if n > n_max:
        raise ValueError("n must be a integer satisfying 0<n<=%s" % n_max)

    # Integer dtype so that an empty result is still a valid index array.
    sample = np.array([], dtype=np.int64)

    # Run as long as the number of distinct pairs is less than the
    # requested number of pairs n.
    while len(sample) < n:

        # Draw twice the number of pairs still missing, because the
        # duplicates are dropped.
        n_sample_size = (n - len(sample)) * 2
        draws = random_state.randint(n_max, size=n_sample_size)

        # Add the new draws to the pairs collected so far and deduplicate.
        # np.unique sorts its output, so restore first-occurrence order;
        # otherwise truncating to n below would favour small indices.
        candidates = np.append(sample, draws)
        _, first_seen = np.unique(candidates, return_index=True)
        sample = candidates[np.sort(first_seen)]

    # return 2d indices
    if len(shape) == 1:
        return _map_triu_1d_on_2d(sample[0:n], shape[0])
    else:
        return np.unravel_index(sample[0:n], shape)
def _dedup_index(self, df_a):
    """Sample ``self.n`` random record pairs within a single DataFrame.

    Parameters
    ----------
    df_a : pandas.DataFrame
        The DataFrame to make record pairs of.

    Returns
    -------
    pandas.MultiIndex
        A two-level index whose levels are both the index values of
        ``df_a`` and whose codes are the sampled pairs.

    Raises
    ------
    ValueError
        When sampling without replacement and ``self.n`` is not an int
        in the range ``0 < n <= max_pairs``.
    """
    shape = (len(df_a), )

    # with replacement
    if self.replace:
        pairs = random_pairs_with_replacement(
            self.n, shape, self.random_state)

    # without replacement
    else:

        n_max = max_pairs(shape)

        if not isinstance(self.n, int) or self.n <= 0 or self.n > n_max:
            raise ValueError(
                "n must be a integer satisfying 0<n<=%s" % n_max)

        # small dataframes
        if n_max < 1e6:
            pairs = random_pairs_without_replacement_small_frames(
                self.n, shape, self.random_state)

        # large dataframes
        else:
            pairs = random_pairs_without_replacement_large_frames(
                self.n, shape, self.random_state)

    levels = [df_a.index.values, df_a.index.values]
    names = [df_a.index.name, df_a.index.name]

    # The second constructor argument was renamed from `labels` to `codes`
    # in pandas; passing it by position works with either name.
    return pandas.MultiIndex(
        levels, pairs, names=names, verify_integrity=False)
def _link_index(self, df_a, df_b):
    """Make all record pairs between two DataFrames.

    Parameters
    ----------
    df_a : pandas.DataFrame
        The first DataFrame; its index values form the first level.
    df_b : pandas.DataFrame
        The second DataFrame; its index values form the second level.

    Returns
    -------
    pandas.MultiIndex
        The Cartesian product of both indexes, with the level names
        taken from the index names of ``df_a`` and ``df_b``.
    """
    n_max = max_pairs((df_a, df_b))

    if n_max > 1e7:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning(
            "The number of record pairs is large. Consider a different "
            "indexation algorithm for better performance. ")

    return pandas.MultiIndex.from_product(
        [df_a.index.values, df_b.index.values],
        names=[df_a.index.name, df_b.index.name])
def _dedup_index(self, df_a):
    """Make all record pairs within a single DataFrame.

    The pairs are the positions above the diagonal
    (``numpy.triu_indices`` with ``k=1``), so a record is never paired
    with itself and each unordered pair appears once.

    Parameters
    ----------
    df_a : pandas.DataFrame
        The DataFrame to make record pairs of.

    Returns
    -------
    pandas.MultiIndex
        A two-level index whose levels are both the index values of
        ``df_a``.
    """
    n_max = max_pairs((df_a))

    if n_max > 1e7:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning(
            "The number of record pairs is large. Consider a different "
            "indexation algorithm for better performance. ")

    levels = [df_a.index.values, df_a.index.values]
    codes = numpy.triu_indices(len(df_a.index), k=1)
    names = [df_a.index.name, df_a.index.name]

    # The second constructor argument was renamed from `labels` to `codes`
    # in pandas; passing it by position works with either name.
    return pandas.MultiIndex(
        levels, codes, names=names, verify_integrity=False)
def random_pairs_with_replacement(n, shape, random_state=None):
    """Draw ``n`` random record pairs; the same pair may occur repeatedly.

    Parameters
    ----------
    n : int
        The number of pairs to draw.
    shape : tuple
        When it has a single element, the flat indices are mapped with
        ``_map_triu_1d_on_2d``; otherwise with ``numpy.unravel_index``.
    random_state : int, numpy.random.RandomState or None, optional
        A ``RandomState`` instance is used as is; anything else is passed
        to ``numpy.random.RandomState`` as the seed.

    Raises
    ------
    ValueError
        If ``max_pairs(shape)`` is not larger than 0.
    """
    rng = random_state
    if not isinstance(rng, np.random.RandomState):
        rng = np.random.RandomState(rng)

    total = max_pairs(shape)
    if total <= 0:
        raise ValueError('n_max must be larger than 0')

    # Flat pair indices, drawn independently of each other.
    flat = rng.randint(0, total, n)

    if len(shape) != 1:
        return np.unravel_index(flat, shape)
    return _map_triu_1d_on_2d(flat, shape[0])
def random_pairs_without_replacement_small_frames(
        n, shape, random_state=None):
    """Draw ``n`` distinct random record pairs.

    All ``max_pairs(shape)`` flat indices are materialised with
    ``numpy.arange`` and ``n`` of them are chosen without replacement.

    Parameters
    ----------
    n : int
        The number of distinct pairs to draw.
    shape : tuple
        When it has a single element, the flat indices are mapped with
        ``_map_triu_1d_on_2d``; otherwise with ``numpy.unravel_index``.
    random_state : int, numpy.random.RandomState or None, optional
        A ``RandomState`` instance is used as is; anything else is passed
        to ``numpy.random.RandomState`` as the seed.

    Raises
    ------
    ValueError
        If ``n`` is not an int in the range ``0 < n <= max_pairs(shape)``.
    """
    total = max_pairs(shape)

    rng = random_state
    if not isinstance(rng, np.random.RandomState):
        rng = np.random.RandomState(rng)

    n_is_valid = isinstance(n, int) and 0 < n <= total
    if not n_is_valid:
        raise ValueError("n must be a integer satisfying 0<n<=%s" % total)

    # Choose n flat indices out of all candidates, never the same twice.
    candidates = np.arange(total)
    chosen = rng.choice(candidates, n, replace=False)

    # Map the flat indices to 2d indices.
    if len(shape) != 1:
        return np.unravel_index(chosen, shape)
    return _map_triu_1d_on_2d(chosen, shape[0])
def index(self, x, x_link=None):
    """Make an index of record pairs.

    Every algorithm in ``self.algorithms`` is asked for its record pairs
    and the results are merged with ``union``. Timing and pair counts of
    the call are stored on the instance and logged.

    Parameters
    ----------
    x: pandas.DataFrame
        A pandas DataFrame. When `x_link` is None, the algorithm makes
        record pairs within the DataFrame. When `x_link` is not empty,
        the algorithm makes pairs between `x` and `x_link`.
    x_link: pandas.DataFrame, optional
        A second DataFrame to link with the DataFrame x.

    Returns
    -------
    pandas.MultiIndex
        A pandas.MultiIndex with record pairs. Each record pair
        contains the index labels of two records.

    Raises
    ------
    ValueError
        If ``self.algorithms`` is empty.
    """
    if not self.algorithms:
        raise ValueError("No algorithms given.")

    # start timing
    started = time.time()

    # Merge the pairs of all algorithms, in the order they are listed.
    combined = None
    for algorithm in self.algorithms:
        found = algorithm.index(x, x_link)
        combined = found if combined is None else combined.union(found)

    n_max = max_pairs(x) if x_link is None else max_pairs((x, x_link))

    # statistics of this call
    n = combined.shape[0]
    eta = time.time() - started
    rr = 1 - n / n_max
    i_max = self._i_max if self._i_max is not None else '?'

    # store them for the running totals
    self._eta.append(eta)
    self._n.append(n)
    self._n_max.append(n_max)

    # log
    call_template = ("indexing [{:d}/{}] - time: {:.2f}s - pairs: {:d}/{:d} - "
                     "rr: {:0.5f}")
    logging.info(call_template.format(self._i, i_max, eta, n, n_max, rr))

    # log total
    if self._output_log_total:
        n_total = np.sum(self._n)
        n_max_total = np.sum(self._n_max)
        rr_avg = 1 - n_total / n_max_total
        eta_total = np.sum(self._eta)

        total_template = ("indexing [{:d}/{}] - time: {:.2f}s - "
                          "pairs_total: {:d}/{:d} - rr_total: {:0.5f}")
        logging.info(total_template.format(
            self._i, i_max, eta_total, n_total, n_max_total, rr_avg))

    self._i += 1

    return combined
def index(self, x, x_link=None):
    """Make an index of record pairs.

    Dispatches to ``self._link_index`` or ``self._dedup_index``, renames
    the levels of the result, records the number of pairs and logs a
    short summary. Each of the two functions should return a
    pandas.MultiIndex with record pairs.

    Parameters
    ----------
    x: pandas.DataFrame
        A pandas DataFrame. When `x_link` is None, the algorithm makes
        record pairs within the DataFrame. When `x_link` is not empty,
        the algorithm makes pairs between `x` and `x_link`. A list or
        tuple of DataFrames is accepted as well when `x_link` is None.
    x_link: pandas.DataFrame, optional
        A second DataFrame to link with the DataFrame x.

    Returns
    -------
    pandas.MultiIndex
        A pandas.MultiIndex with record pairs. Each record pair
        contains the index labels of two records.

    Raises
    ------
    ValueError
        If `x` is None.
    """
    if x is None:
        raise ValueError("provide at least one dataframe")

    # Normalise the input to a tuple of DataFrames.
    if x_link is not None:
        # linking (two arguments)
        frames = (x, x_link)
    elif isinstance(x, (list, tuple)):
        # dedup or linking (single argument)
        frames = tuple(x)
    else:
        # dedup (single argument)
        frames = (x, )

    if self.verify_integrity:
        for frame in frames:
            self._verify_integrety(frame)

    # start timing
    started = time.time()

    if self._deduplication(frames):
        # deduplication
        logging.info("Indexing - start indexing single DataFrame")
        pairs = self._dedup_index(*frames)
        left_name = frames[0].index.name
        right_name = frames[0].index.name
    else:
        # linking
        logging.info("Indexing - start indexing two DataFrames")
        pairs = self._link_index(*frames)
        left_name = frames[0].index.name
        right_name = frames[1].index.name

    names = self._make_index_names(left_name, right_name)
    pairs.rename(names, inplace=True)

    # store the number of pairs
    self._n.append(pairs.shape[0])
    self._n_max.append(max_pairs(frames))

    # summary
    n = len(pairs)
    rr = 1 - self._n[-1] / self._n_max[-1]
    rr_avg = 1 - np.sum(self._n) / np.sum(self._n_max)

    # log timing
    elapsed = time.time() - started
    logging.info("Indexing - computation time: ~{:.2f}s".format(elapsed))

    # log results
    summary_template = ("Indexing - summary n={:d}, "
                        "reduction_ratio={:0.5f}, reduction_ratio_mean={:0.5f}")
    logging.info(summary_template.format(n, rr, rr_avg))

    return pairs