예제 #1
0
    def _dedup_index(self, df_a):
        """Sample random record pairs within a single frame.

        Parameters
        ----------
        df_a : dataframe-like
            Only ``len(df_a)`` and ``df_a.index.values`` are used.

        Returns
        -------
        pandas.MultiIndex
            Two-level index in which both levels are ``df_a.index.values``
            and the codes are whatever the sampling helper returned.

        Raises
        ------
        ValueError
            When sampling without replacement and ``self.n`` is not an
            ``int`` with ``0 < n <= n_max``, where ``n_max`` is the value
            ``full_index_size`` reports for this frame.
        """
        shape = (len(df_a),)

        if self.replace:
            # with replacement
            pairs = random_pairs_with_replacement(
                self.n, shape, self.random_state)
        else:
            # without replacement: n cannot exceed the number of candidates
            n_max = full_index_size(shape)

            if not isinstance(self.n, int) or self.n <= 0 or self.n > n_max:
                raise ValueError(
                    "n must be a integer satisfying 0<n<=%s" % n_max)

            if n_max < 1e6:
                # small dataframes
                pairs = random_pairs_without_replacement_small_frames(
                    self.n, shape, self.random_state)
            else:
                # large dataframes
                pairs = random_pairs_without_replacement_large_frames(
                    self.n, shape, self.random_state)

        levels = [df_a.index.values, df_a.index.values]

        # The ``labels`` keyword was renamed to ``codes`` in pandas 0.24 and
        # removed in pandas 1.0; passing ``labels`` raises TypeError there.
        return pandas.MultiIndex(
            levels=levels,
            codes=pairs,
            verify_integrity=False
        )
예제 #2
0
    def _link_index(self, df_a, df_b):
        """Sample random record pairs between two frames.

        Parameters
        ----------
        df_a, df_b : dataframe-like
            Only their lengths, ``index.values`` and ``index.name`` are used.

        Returns
        -------
        pandas.MultiIndex
            Two-level index whose levels are the index values of ``df_a``
            and ``df_b``, whose codes are whatever the sampling helper
            returned, and whose names are the two index names.

        Raises
        ------
        ValueError
            If ``self.n`` is not an ``int``; if sampling with replacement
            and ``full_index_size`` reports 0 for the two frames; or if
            sampling without replacement and ``n`` is outside
            ``0 < n <= n_max``.
        """
        shape = (len(df_a), len(df_b))
        n_max = full_index_size(shape)

        if not isinstance(self.n, int):
            raise ValueError('n must be an integer')

        if self.replace:
            # with replacement: there must be something to draw from
            if n_max == 0:
                raise ValueError(
                    "one of the dataframes is empty")

            pairs = random_pairs_with_replacement(
                self.n, shape, self.random_state)
        else:
            # without replacement: n cannot exceed the number of candidates
            if self.n <= 0 or self.n > n_max:
                raise ValueError(
                    "n must be a integer satisfying 0<n<=%s" % n_max)

            if n_max < 1e6:
                # small dataframes
                pairs = random_pairs_without_replacement_small_frames(
                    self.n, shape, self.random_state)
            else:
                # large dataframes
                pairs = random_pairs_without_replacement_large_frames(
                    self.n, shape, self.random_state)

        levels = [df_a.index.values, df_b.index.values]
        names = [df_a.index.name, df_b.index.name]

        # The ``labels`` keyword was renamed to ``codes`` in pandas 0.24 and
        # removed in pandas 1.0; passing ``labels`` raises TypeError there.
        return pandas.MultiIndex(
            levels=levels,
            codes=pairs,
            names=names,
            verify_integrity=False
        )
예제 #3
0
    def _link_index(self, df_a, df_b):
        """Draw a random sample of record pairs linking two frames.

        Parameters
        ----------
        df_a, df_b : dataframe-like
            Only their lengths and ``index.values`` are used.

        Returns
        -------
        pandas.MultiIndex
            Two-level index built from the index values of ``df_a`` and
            ``df_b``, with the sampled pairs passed through as codes.

        Raises
        ------
        ValueError
            If ``self.n`` is not an ``int``; if sampling with replacement
            and ``full_index_size`` reports 0 for the two frames; or if
            sampling without replacement and ``n`` is outside
            ``0 < n <= n_max``.
        """
        dims = (len(df_a), len(df_b))
        total = full_index_size(dims)

        if not isinstance(self.n, int):
            raise ValueError('n must be an integer')

        if not self.replace:
            # Without replacement the request must fit in the pair space.
            if not 0 < self.n <= total:
                raise ValueError("n must be a integer satisfying 0<n<=%s" %
                                 total)

            # Share of all candidate pairs that ends up in the sample.
            share = self.n / total

            # The low-memory sampler is chosen only when the pair space has
            # at least 1e6 entries and no more than half of it is requested.
            draw = (random_pairs_without_replacement
                    if total < 1e6 or share > 0.5
                    else random_pairs_without_replacement_low_memory)
            sampled = draw(self.n, dims, self.random_state)
        else:
            # With replacement only an empty pair space is rejected.
            if total == 0:
                raise ValueError("one of the dataframes is empty")

            sampled = random_pairs_with_replacement(
                self.n, dims, self.random_state)

        return pandas.MultiIndex(
            levels=[df_a.index.values, df_b.index.values],
            codes=sampled,
            verify_integrity=False,
        )