Exemplo n.º 1
0
    def maximum_number_of_pairs(self):
        """Return the maximum number of record pairs.

        When ``self.deduplication`` is truthy only ``self.df_a`` is passed
        to ``max_number_of_pairs``; otherwise both ``self.df_a`` and
        ``self.df_b`` are passed.
        """
        # Linking case: both data sets take part in the computation.
        if not self.deduplication:
            return max_number_of_pairs(self.df_a, self.df_b)

        # Deduplication case: a single data set is compared with itself.
        return max_number_of_pairs(self.df_a)
Exemplo n.º 2
0
def _random_small_link(df_a, df_b, n):
    """Draw ``n`` unique random record pairs between ``df_a`` and ``df_b``.

    Random positions are drawn from both inputs, combined into a
    ``pandas.MultiIndex`` and de-duplicated. Sampling is repeated until at
    least ``n`` unique pairs have been collected, after which the first
    ``n`` pairs are returned.

    Parameters
    ----------
    df_a, df_b : pandas objects (presumably DataFrames -- confirm with caller)
        Only ``len()``, ``.index.values`` and ``.index.name`` are used.
    n : int
        Number of pairs to sample. Must satisfy ``0 < n <= n_max`` where
        ``n_max`` is the value of ``max_number_of_pairs(df_a, df_b)``.

    Returns
    -------
    pandas.MultiIndex
        ``n`` distinct pairs; the levels are the index values of ``df_a``
        and ``df_b`` and the level names are the corresponding index names.

    Raises
    ------
    ValueError
        If ``n`` is not an ``int`` or lies outside ``(0, n_max]``.
    """

    n_max = max_number_of_pairs(df_a, df_b)

    if not isinstance(n, int) or n <= 0 or n > n_max:
        raise ValueError("n must be a integer satisfying 0<n<=%s" % n_max)

    levels = [df_a.index.values, df_b.index.values]
    names = [df_a.index.name, df_b.index.name]

    def _build_index(codes):
        # pandas 0.24 renamed the ``labels`` keyword to ``codes`` and
        # pandas 1.0 removed ``labels`` entirely. Try the current keyword
        # first and fall back to the legacy one so that both old and new
        # pandas releases are supported.
        try:
            return pandas.MultiIndex(levels=levels, codes=codes, names=names)
        except TypeError:
            return pandas.MultiIndex(levels=levels, labels=codes, names=names)

    # Start from an empty MultiIndex that already carries levels and names.
    pairs = _build_index([[], []])

    # Run as long as the number of pairs is less than the requested number
    # of pairs n.
    while len(pairs) < n:

        # The number of pairs to sample (sample twice as many record pairs
        # as are still missing, because the duplicates are dropped).
        n_sample = (n - len(pairs)) * 2
        sample_a = numpy.random.randint(len(df_a), size=n_sample)
        sample_b = numpy.random.randint(len(df_b), size=n_sample)

        # Merge the fresh sample into the pairs collected so far and drop
        # any pair that was drawn more than once.
        pairs = pairs.append(_build_index([sample_a, sample_b]))
        pairs = pairs.drop_duplicates()

    # The loop may overshoot; keep exactly n pairs.
    return pairs[0:n]
Exemplo n.º 3
0
def _random_large_link(df_a, df_b, n):
    """Pick ``n`` distinct record pairs from the full index of both inputs.

    The complete index is obtained from ``_fullindex_link(df_a, df_b)`` and
    ``n`` positions are drawn from it without replacement.

    Raises
    ------
    ValueError
        If ``n`` is not an ``int`` or does not satisfy ``0 < n <= n_max``,
        where ``n_max`` is ``max_number_of_pairs(df_a, df_b)``.
    """
    upper_bound = max_number_of_pairs(df_a, df_b)

    # Validate the requested sample size before building the full index.
    n_is_valid = isinstance(n, int) and 0 < n <= upper_bound
    if not n_is_valid:
        raise ValueError(
            "n must be a integer satisfying 0<n<=%s" % upper_bound
        )

    all_pairs = _fullindex_link(df_a, df_b)

    # Draw n positions without replacement, then select those pairs.
    positions = numpy.arange(len(all_pairs))
    chosen = numpy.random.choice(positions, n, replace=False)
    return all_pairs[chosen]