Пример #1
0
    def _validate_estimator(self, rng):
        """Validate the thresholds and build the helper estimators.

        Sets ``nn_`` (neighbors object), ``alpha_`` (evenly spaced fractions
        ``1/n_pickedup, ..., 1``) and ``gmm_`` (mixture model).

        Parameters
        ----------
        rng : object
            Random source forwarded to ``safe_random_state`` to obtain the
            ``random_state`` given to the mixture model.

        Raises
        ------
        ValueError
            If ``proba_threshold`` or ``weight_threshold`` is not strictly
            between 0 and 1, or if ``n_clusters`` is not an ``int`` and lacks
            one of the required estimator methods.
        """
        # Both thresholds must lie in the open interval (0, 1).
        if not (0 < self.proba_threshold < 1):
            raise ValueError("proba_threshold must be between 0 and 1")
        if not (0 < self.weight_threshold < 1):
            raise ValueError("weight_threshold must be between 0 and 1")

        self.nn_ = check_neighbors_object(
            "n_neighbors", self.n_neighbors, additional_neighbor=1)
        self.nn_.set_params(n_jobs=self.n_jobs, algorithm="ball_tree")

        steps = np.arange(1, self.n_pickedup + 1)
        self.alpha_ = steps / self.n_pickedup

        # An integer ``n_clusters`` is the component count of a fresh
        # GaussianMixture; ``gmm_params`` is only applied on this path.
        if isinstance(self.n_clusters, int):
            self.gmm_ = GaussianMixture(
                n_components=self.n_clusters,
                random_state=safe_random_state(rng),
            )
            if self.gmm_params:
                self.gmm_.set_params(**self.gmm_params)
            return

        # Otherwise ``n_clusters`` is treated as a user-supplied estimator
        # that must expose the methods listed below before it is cloned.
        gmm_methods = {'fit', 'predict_proba', 'set_params'}
        if not all(hasattr(self.n_clusters, attr) for attr in gmm_methods):
            raise ValueError(
                f'The GMM estimator must implement all of {gmm_methods}'
            )
        self.gmm_ = clone(self.n_clusters)
        self.gmm_.set_params(random_state=safe_random_state(rng))
Пример #2
0
        def _validate_estimator(self):
            """Build the nearest-neighbors estimator(s) and check ``version``.

            Sets ``nn_`` from ``n_neighbors`` and, when ``version == 3``,
            also ``nn_ver3_`` from ``n_neighbors_ver3``. Both receive the
            instance's ``n_jobs``.

            Raises
            ------
            ValueError
                If ``version`` is not one of 1, 2 or 3. The check runs after
                the estimators above have been created.
            """
            # ``random_state`` is deprecated; report it whenever it is set.
            if self.random_state is not None:
                deprecate_parameter(self, '0.4', 'random_state')

            self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors)
            self.nn_.set_params(n_jobs=self.n_jobs)

            if self.version == 3:
                self.nn_ver3_ = check_neighbors_object(
                    'n_neighbors_ver3', self.n_neighbors_ver3)
                self.nn_ver3_.set_params(n_jobs=self.n_jobs)

            allowed_versions = (1, 2, 3)
            if self.version in allowed_versions:
                return
            raise ValueError(
                'Parameter `version` must be 1, 2 or 3, got {}'.format(
                    self.version))
Пример #3
0
    def _initialize_params(self, X, y, rng):
        """Resolve the fitted (trailing-underscore) parameters from the inputs.

        Sets ``n_affine_``, ``manifold_learner_``, ``nn_``, ``n_shadow_`` and
        ``std_``.

        Parameters
        ----------
        X : array-like
            Feature matrix; only ``X.shape[1]`` is read here.
        y : array-like
            Target labels; their per-class counts pick the default number of
            neighbors.
        rng : object
            Random source forwarded to ``safe_random_state``.

        Raises
        ------
        ValueError
            If ``n_affine_`` is not smaller than ``n_neighbors * n_shadow_``.
        """
        n_features = X.shape[1]
        if self.n_affine is None:
            self.n_affine_ = n_features
        else:
            self.n_affine_ = self.n_affine

        # Use the user-supplied learner when given, else a 2-component TSNE.
        self.manifold_learner_ = (
            self._check_2d_manifold_learner()
            if self.manifold_learner
            else TSNE(n_components=2)
        )
        if self.manifold_learner_params is not None:
            self.manifold_learner_.set_params(**self.manifold_learner_params)
        # Best effort: a ValueError raised while seeding the learner is
        # deliberately ignored.
        try:
            seed = safe_random_state(rng)
            self.manifold_learner_.set_params(random_state=seed)
        except ValueError:
            pass

        class_counts = np.unique(y, return_counts=True)[1]
        n_neighbors = self.n_neighbors
        if n_neighbors is None:
            # Default depends on the size of the smallest class.
            if class_counts.min() >= 100:
                n_neighbors = 30
            else:
                n_neighbors = 5
        self.nn_ = check_neighbors_object("n_neighbors", n_neighbors)
        if self.n_jobs is not None:
            self.nn_.set_params(n_jobs=self.n_jobs)

        if self.n_shadow is not None:
            self.n_shadow_ = self.n_shadow
        else:
            self.n_shadow_ = max(
                ceil(2 * n_features / self.nn_.n_neighbors), 40)

        shadow_pool_size = self.nn_.n_neighbors * self.n_shadow_
        if self.n_affine_ >= shadow_pool_size:
            raise ValueError(
                "The number of shadow samples used to create an affine random "
                "combination must be less than `n_neighbors * n_shadow`.")

        # An iterable ``std`` is used as given; a non-iterable one is
        # repeated once per feature.
        try:
            iter(self.std)
        except TypeError:
            self.std_ = [self.std] * n_features
        else:
            self.std_ = self.std
Пример #4
0
    def _make_cluster_data(self, minority_data, X_not_minority,
                           samples_to_make):
        """Partition the minority samples into clusters and size each one.

        The minority samples nearest to the non-minority samples are peeled
        off repeatedly, at most ``max_clusters - 1`` times. Whatever is left
        becomes the last cluster. Each cluster gets an exponentially
        decaying weight ``exp(-decay_rate * position)``.

        Parameters
        ----------
        minority_data : object
            Container exposing ``X`` and the lists ``clusters``,
            ``n_new_samples`` and ``n_affine``; the lists are appended to
            in place.
        X_not_minority : array-like
            Samples not belonging to the minority class; used as the queries
            of the nearest-neighbors search.
        samples_to_make : int
            Total number of synthetic samples requested for this class.

        Returns
        -------
        object
            The same ``minority_data`` instance, mutated.
        """
        nn = check_neighbors_object(
            "n_neighbors_max",
            self.n_neighbors_max if self.n_neighbors_max else 5).set_params(
                n_jobs=self.n_jobs)

        X = minority_data.X
        weights = []
        for i in range(1, self.max_clusters):
            if X.shape[0] == 0:
                break
            # Never request more neighbors than there are samples left.
            k = min(X.shape[0], nn.n_neighbors)
            nn.set_params(n_neighbors=k).fit(X)
            cluster_indices = np.unique(
                nn.kneighbors(X_not_minority, return_distance=False))
            minority_data.clusters.append(X[cluster_indices])
            weights.append(math.exp(-self.decay_rate * (i - 1)))
            # Drop the rows just assigned to a cluster. A direct boolean
            # mask replaces the deprecated ``np.in1d`` lookup.
            keep = np.ones(X.shape[0], dtype=bool)
            keep[cluster_indices] = False
            X = X[keep]

        # append the remaining partition (if it exists) as the last cluster
        # if the last (max_cluster - 1) clusters have been used up.
        if X.shape[0]:  # pragma: no cover
            weights.append(math.exp(-self.decay_rate *
                                    (self.max_clusters - 1)))
            minority_data.clusters.append(X)

        # ``default=0`` mirrors the original running maximum, which started
        # at 0 and therefore tolerated an empty weight list.
        max_weight = max(weights, default=0)
        weight_sum = sum(weights)
        for weight in weights:
            # normalize weights
            share = weight / weight_sum
            minority_data.n_new_samples.append(
                math.ceil(samples_to_make * share))
            # NOTE(review): ``share`` is normalized while ``max_weight`` is
            # not, so the largest cluster gets ``max_affine / weight_sum``
            # rather than ``max_affine`` -- confirm this is intended.
            minority_data.n_affine.append(
                math.ceil(self.max_affine * share / max_weight))
        return minority_data
Пример #5
0
    def _fit_resample(self, X, y):
        """Generate synthetic samples per class and append them to the data.

        For every class in ``sampling_strategy_`` with a non-zero request,
        the class is split into clusters via ``_make_cluster_data``. Each
        cluster then contributes random affine (Dirichlet-weighted)
        combinations of "shadow" points. Shadows are either the neighborhood
        points themselves or Gaussian-perturbed copies of them.

        Parameters
        ----------
        X : ndarray
            Feature matrix; ``X.shape[1]`` is the number of features.
        y : ndarray
            Target labels aligned with ``X``.

        Returns
        -------
        tuple
            ``(X_resampled, y_resampled)``: the original data followed by
            the generated samples and their labels.

        Raises
        ------
        ValueError
            If ``std`` is not a number and its length differs from the number
            of features.
        """
        rng = check_random_state(self.random_state)
        X_parts = [X.copy()]
        y_parts = [y.copy()]

        n_features = X.shape[1]
        generated = defaultdict(list)

        std_is_scalar = isinstance(self.std, Number)
        if not std_is_scalar and len(self.std) != n_features:
            raise ValueError(
                "``std`` and number of features of `X` must be equal.")

        self.cdata_ = []
        for minority_class, samples_to_make in self.sampling_strategy_.items():
            if samples_to_make == 0:
                continue

            in_class = (y == minority_class)
            empty_data = MinorityData(X[in_class], [], [], [])
            cdata = self._make_cluster_data(empty_data, X[~in_class],
                                            samples_to_make)
            self.cdata_.append(cdata)

            cnn = check_neighbors_object("n_cluster_neighbors",
                                         self.n_cluster_neighbors)

            for idx, cluster in enumerate(cdata.clusters):
                n_samples = cdata.n_new_samples[idx]

                if cnn.n_neighbors < cluster.shape[0]:
                    # Enough points: one neighborhood per cluster member,
                    # then pick a random neighborhood for every new sample.
                    cnn.set_params(n_jobs=self.n_jobs).fit(cluster)
                    neighbor_ids = cnn.kneighbors(cluster,
                                                  return_distance=False)
                    neighborhoods = cluster[neighbor_ids]
                    picks = rng.integers(0, neighborhoods.shape[0],
                                         size=n_samples)
                else:
                    # Small cluster: the whole cluster is the only
                    # neighborhood.
                    neighborhoods = cluster[np.newaxis, :, :]
                    picks = [0] * n_samples

                if cdata.n_affine[idx] < n_features:
                    # Too few affine terms: combine pairs of the raw
                    # neighborhood points, without adding noise.
                    n_affine = 2
                    shadows = neighborhoods[picks].reshape(-1, n_features)
                else:
                    n_affine = cdata.n_affine[idx]
                    groups = neighborhoods[picks]
                    noise_shape = (n_samples, self.n_shadow,
                                   groups.shape[1], n_features)
                    noise = rng.normal(scale=self.std, size=noise_shape)
                    # ``n_shadow`` noisy copies of every picked neighborhood.
                    shadows = (groups[:, np.newaxis, :, :] + noise).reshape(
                        -1, n_features)

                shadow_idx = rng.integers(0, shadows.shape[0],
                                          size=(n_samples, n_affine))
                dirichlet = rng.dirichlet([1] * n_affine, size=n_samples)
                affine_weights = dirichlet[:, np.newaxis]
                combined = affine_weights @ shadows[shadow_idx]
                generated[minority_class].append(
                    combined.reshape(-1, n_features))

            # The per-cluster counts were rounded up, so the surplus rows
            # are dropped after shuffling the list of per-cluster arrays.
            samples_to_drop = sum(cdata.n_new_samples) - samples_to_make
            rng.shuffle(generated[minority_class])
            stacked = np.vstack(generated[minority_class])
            X_parts.append(stacked[samples_to_drop:])
            y_parts.append([minority_class] * X_parts[-1].shape[0])

        return np.concatenate(X_parts), np.concatenate(y_parts)