def _validate_estimator(self, rng): if not 0 < self.proba_threshold < 1: raise ValueError("proba_threshold must be between 0 and 1") if not 0 < self.weight_threshold < 1: raise ValueError("weight_threshold must be between 0 and 1") self.nn_ = check_neighbors_object("n_neighbors", self.n_neighbors, additional_neighbor=1) self.nn_.set_params(n_jobs=self.n_jobs, algorithm="ball_tree") self.alpha_ = np.arange(1, self.n_pickedup + 1) / self.n_pickedup if not isinstance(self.n_clusters, int): gmm_methods = {'fit', 'predict_proba', 'set_params'} for attr in gmm_methods: if not hasattr(self.n_clusters, attr): raise ValueError( f'The GMM estimator must implement all of {gmm_methods}' ) self.gmm_ = clone(self.n_clusters) self.gmm_.set_params(random_state=safe_random_state(rng)) else: self.gmm_ = GaussianMixture( n_components=self.n_clusters, random_state=safe_random_state(rng), ) if self.gmm_params: self.gmm_.set_params(**self.gmm_params)
def _validate_estimator(self):
    """Create the nearest-neighbours estimator(s) and validate ``version``.

    Sets ``nn_`` from ``n_neighbors`` and, when ``version == 3``, also
    ``nn_ver3_`` from ``n_neighbors_ver3``; both receive ``n_jobs``.

    Raises
    ------
    ValueError
        If ``version`` is not 1, 2 or 3. The check runs last, so the
        estimator attributes above are already set when it fires.
    """
    # A user-supplied ``random_state`` is routed to the deprecation helper.
    if self.random_state is not None:
        deprecate_parameter(self, '0.4', 'random_state')

    self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors)
    self.nn_.set_params(n_jobs=self.n_jobs)

    if self.version == 3:
        self.nn_ver3_ = check_neighbors_object(
            'n_neighbors_ver3', self.n_neighbors_ver3
        )
        self.nn_ver3_.set_params(n_jobs=self.n_jobs)

    if self.version in (1, 2, 3):
        return
    raise ValueError('Parameter `version` must be 1, 2 or 3, got'
                     ' {}'.format(self.version))
def _initialize_params(self, X, y, rng):
    """Resolve constructor parameters into their fitted counterparts.

    Sets ``n_affine_``, ``manifold_learner_``, ``nn_``, ``n_shadow_`` and
    ``std_`` on ``self``.

    Parameters
    ----------
    X
        Feature matrix; only ``X.shape[1]`` (the feature count) is read.
    y
        Target vector; its per-class counts pick the default
        ``n_neighbors``.
    rng
        Random source forwarded to ``safe_random_state`` when seeding the
        manifold learner.

    Raises
    ------
    ValueError
        If ``n_affine_ >= n_neighbors * n_shadow_``.
    """
    n_features = X.shape[1]
    self.n_affine_ = self.n_affine if self.n_affine is not None else n_features

    # Manifold learner: user-provided one (validated by the helper) or a
    # two-component TSNE.
    self.manifold_learner_ = (
        self._check_2d_manifold_learner()
        if self.manifold_learner
        else TSNE(n_components=2)
    )
    learner_params = self.manifold_learner_params
    if learner_params is not None:
        self.manifold_learner_.set_params(**learner_params)
    try:
        self.manifold_learner_.set_params(
            random_state=safe_random_state(rng))
    except ValueError:
        # Best effort: a ValueError while seeding is deliberately ignored
        # (presumably for learners without a ``random_state`` parameter).
        pass

    # Default neighbourhood size depends on the smallest class count.
    class_counts = np.unique(y, return_counts=True)[1]
    if self.n_neighbors is not None:
        k_neighbors = self.n_neighbors
    elif class_counts.min() >= 100:
        k_neighbors = 30
    else:
        k_neighbors = 5
    self.nn_ = check_neighbors_object("n_neighbors", k_neighbors)
    if self.n_jobs is not None:
        self.nn_.set_params(n_jobs=self.n_jobs)

    if self.n_shadow is not None:
        self.n_shadow_ = self.n_shadow
    else:
        self.n_shadow_ = max(
            ceil(2 * n_features / self.nn_.n_neighbors), 40)

    shadow_pool_size = self.nn_.n_neighbors * self.n_shadow_
    if self.n_affine_ >= shadow_pool_size:
        raise ValueError(
            "The number of shadow samples used to create an affine random "
            "combination must be less than `n_neighbors * n_shadow`.")

    # A non-iterable ``std`` is broadcast to one entry per feature.
    try:
        iter(self.std)
    except TypeError:
        self.std_ = [self.std] * n_features
    else:
        self.std_ = self.std
def _make_cluster_data(self, minority_data, X_not_minority, samples_to_make):
    """Split the minority samples into clusters and size each cluster's work.

    Up to ``max_clusters - 1`` times, the minority samples that are nearest
    neighbours of the non-minority samples are peeled off as a cluster and
    removed from the pool. Whatever remains afterwards becomes the final
    cluster. Cluster ``i`` (1-based) gets the raw weight
    ``exp(-decay_rate * (i - 1))``.

    Parameters
    ----------
    minority_data
        Object exposing ``X`` (the minority samples) and the lists
        ``clusters``, ``n_new_samples`` and ``n_affine``, which are
        appended to in place.
    X_not_minority
        Samples of all other classes, used as neighbour queries.
    samples_to_make
        Total number of synthetic samples requested for this class.

    Returns
    -------
    The same ``minority_data`` object, with one entry per cluster added to
    each of its three lists.
    """
    nn = check_neighbors_object(
        "n_neighbors_max",
        self.n_neighbors_max if self.n_neighbors_max else 5,
    ).set_params(n_jobs=self.n_jobs)

    X = minority_data.X
    weights = []
    for level in range(1, self.max_clusters):
        if X.shape[0] == 0:
            break
        # Never ask for more neighbours than there are samples left. The
        # reduced value sticks on ``nn`` for subsequent iterations.
        k = min(X.shape[0], nn.n_neighbors)
        nn.set_params(n_neighbors=k).fit(X)
        cluster_indices = np.unique(
            nn.kneighbors(X_not_minority, return_distance=False))
        minority_data.clusters.append(X[cluster_indices])
        weights.append(math.exp(-self.decay_rate * (level - 1)))
        # Drop the rows just assigned to a cluster. ``np.isin`` replaces
        # the deprecated ``np.in1d`` and is equivalent for 1-D input.
        X = X[np.isin(np.arange(X.shape[0]), cluster_indices, invert=True)]

    # append the remaining partition (if it exists) as the last cluster
    # if the last (max_cluster - 1) clusters have been used up.
    if X.shape[0]:  # pragma: no cover
        weights.append(math.exp(-self.decay_rate * (self.max_clusters - 1)))
        minority_data.clusters.append(X)

    # Largest *raw* weight; 0 when no cluster was produced (in which case
    # the loop below does not run).
    max_weight = max(weights, default=0)
    weight_sum = sum(weights)
    for raw_weight in weights:
        share = raw_weight / weight_sum  # normalised weight
        minority_data.n_new_samples.append(
            math.ceil(samples_to_make * share))
        # NOTE(review): ``share`` is normalised but ``max_weight`` is taken
        # from the raw weights, so the ratio is not ``w / max(w)``. Kept as
        # in the original -- confirm whether this is intended.
        minority_data.n_affine.append(
            math.ceil(self.max_affine * share / max_weight))
    return minority_data
def _fit_resample(self, X, y):
    """Oversample every class listed in ``sampling_strategy_``.

    For each class with a non-zero request the minority samples are
    clustered via ``_make_cluster_data``. Every cluster then produces
    synthetic samples as Dirichlet-weighted combinations of "shadow"
    points: either neighbourhood members themselves, or those members
    perturbed by normal noise with scale ``std``. The per-class cluster
    data objects are stored in ``cdata_``.

    Parameters
    ----------
    X
        Feature matrix; must provide ``copy()`` and a 2-D ``shape``.
    y
        Target vector aligned with ``X``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)``: copies of the inputs followed by
        the generated samples and their labels.

    Raises
    ------
    ValueError
        If ``std`` is not a ``Number`` and its length differs from the
        number of features.
    """
    # NOTE(review): ``.integers`` below is a ``numpy.random.Generator``
    # method -- confirm ``check_random_state`` returns a Generator here.
    rng = check_random_state(self.random_state)
    resampled_X = [X.copy()]
    resampled_y = [y.copy()]
    n_features = X.shape[1]
    generated = defaultdict(list)

    std_is_scalar = isinstance(self.std, Number)
    if not std_is_scalar and len(self.std) != n_features:
        raise ValueError(
            "``std`` and number of features of `X` must be equal.")

    self.cdata_ = []
    for minority_class, samples_to_make in self.sampling_strategy_.items():
        if samples_to_make == 0:
            continue

        is_minority = (y == minority_class)
        cdata = self._make_cluster_data(
            MinorityData(X[is_minority], [], [], []),
            X[~is_minority],
            samples_to_make,
        )
        self.cdata_.append(cdata)
        cnn = check_neighbors_object(
            "n_cluster_neighbors", self.n_cluster_neighbors)

        class_batches = generated[minority_class]
        for idx, cluster in enumerate(cdata.clusters):
            n_samples = cdata.n_new_samples[idx]

            if cnn.n_neighbors < cluster.shape[0]:
                # Enough points: one neighbourhood per cluster member,
                # then pick a random neighbourhood for each new sample.
                cnn.set_params(n_jobs=self.n_jobs).fit(cluster)
                neighborhoods = cluster[
                    cnn.kneighbors(cluster, return_distance=False)]
                picks = rng.integers(
                    0, neighborhoods.shape[0], size=n_samples)
            else:
                # Too few points: the whole cluster is the only
                # neighbourhood.
                neighborhoods = cluster[None, :, :]
                picks = [0] * n_samples

            requested_affine = cdata.n_affine[idx]
            if requested_affine < n_features:
                # Low affine count: combine pairs of real points.
                n_affine = 2
                shadows = neighborhoods[picks].reshape(-1, n_features)
            else:
                # Otherwise combine noisy copies ("shadows") of the points.
                n_affine = requested_affine
                groups = neighborhoods[picks]
                noise_shape = (
                    n_samples, self.n_shadow, groups.shape[1], n_features)
                noise = rng.normal(scale=self.std, size=noise_shape)
                shadows = (groups[:, None, :, :] + noise).reshape(
                    -1, n_features)

            shadow_idx = rng.integers(
                0, shadows.shape[0], size=(n_samples, n_affine))
            affine_weights = rng.dirichlet(
                [1] * n_affine, size=n_samples)[:, None]
            class_batches.append(
                (affine_weights @ shadows[shadow_idx]).reshape(
                    -1, n_features))

        # Rounding up per cluster can overshoot the request; the surplus
        # rows are cut from the front after shuffling the cluster batches.
        surplus = sum(cdata.n_new_samples) - samples_to_make
        rng.shuffle(class_batches)
        resampled_X.append(np.vstack(class_batches)[surplus:])
        resampled_y.append([minority_class] * resampled_X[-1].shape[0])

    return np.concatenate(resampled_X), np.concatenate(resampled_y)