def stack_predict(self, df, holdout, pipes, amount=2):
    """Fit a stacked model on ``df`` and predict on ``holdout``.

    The preprocessing steps (everything but the last step) of the top
    pipeline are fitted on the training features and applied to both
    sets. The final steps of the first ``amount`` top pipelines are then
    handed to a ``StackingTransformer``; its output features are used to
    fit the first of those estimators, which also produces the holdout
    predictions.

    Parameters
    ----------
    df : object
        Training data, split into features and target by
        ``self.split_x_y``.
    holdout : object
        Holdout data, split the same way.
    pipes : object
        Passed through to ``self.top_pipeline``.
    amount : int, default=2
        Number of top pipelines whose final estimator is stacked.

    Returns
    -------
    tuple
        ``(final_estimator, y_test, predictions)`` where
        ``final_estimator`` is the final step of the pipeline returned by
        ``self.top_pipeline(pipes, 0)``, fitted on the stacked training
        features.
    """
    regression_metrics = (
        "explained_variance",
        "neg_mean_absolute_error",
        "neg_mean_squared_error",
        "neg_mean_squared_log_error",
        "neg_median_absolute_error",
        "r2",
    )

    X, y = self.split_x_y(df)
    X_test, y_test = self.split_x_y(holdout)

    # Preprocessing = every step of the best pipeline except its estimator.
    preprocessing = Pipeline(self.top_pipeline(pipes).steps[:-1])
    X = preprocessing.fit_transform(X)
    X_test = preprocessing.transform(X_test)

    # Named (str(rank), estimator) pairs taken from the last pipeline step.
    estimators = [
        (str(rank), self.top_pipeline(pipes, rank).steps[-1][1])
        for rank in range(amount)
    ]

    # The scoring metric decides between regression and classification.
    is_regression = self.METRIC in regression_metrics

    stacker = StackingTransformer(estimators, is_regression)
    stacker.fit(X, y)
    stacked_train = stacker.transform(X)
    stacked_test = stacker.transform(X_test)

    # The first stacked estimator doubles as the second-level model.
    _, final_estimator = estimators[0]
    final_estimator.fit(stacked_train, y)
    predictions = final_estimator.predict(stacked_test)

    return final_estimator, y_test, predictions
class ClusterOverSampler(BaseOverSampler):
    """A class that handles clustering-based over-sampling.

    Any combination of over-sampler, clusterer and distributor can
    be used.

    Read more in the :ref:`user guide <user_guide>`.

    Parameters
    ----------
    oversampler : oversampler estimator or ``Pipeline``
        Over-sampler to apply to each selected cluster. When a
        ``Pipeline`` is given, its last step is used as the over-sampler
        and the preceding steps, if any, are fitted as a transformer.

    clusterer : clusterer estimator, default=None
        Clusterer to apply to input space before over-sampling.

        - When ``None``, it corresponds to a clusterer that assigns
          a single cluster to all the samples i.e. no clustering is applied.

        - When clusterer, it applies clustering to the input space. Then
          over-sampling is applied inside each cluster and between clusters.

    distributor : distributor estimator, default=None
        Distributor to distribute the generated samples per cluster label.

        - When ``None`` and a clusterer is provided then it corresponds to the
          density distributor. If clusterer is also ``None`` then the
          distributor does not affect the over-sampling procedure.

        - When distributor object is provided, it is used to distribute the
          generated samples to the clusters. Providing a distributor
          without a clusterer raises a ``ValueError``.

    raise_error : bool, default=True
        Raise an error when no samples are generated.

        - If ``True``, it raises an error when no filtered clusters are
          identified and therefore no samples are generated.

        - If ``False``, it displays a warning.

    {random_state}

    {n_jobs}

    Attributes
    ----------
    clusterer_ : object
        A fitted clone of the ``clusterer`` parameter or ``None`` when a
        clusterer is not given.

    distributor_ : object
        A fitted clone of the ``distributor`` parameter. When a distributor
        is not given, it is a fitted ``DensityDistributor`` if a clusterer
        is provided, otherwise a fitted ``BaseDistributor``.

    labels_ : array, shape (n_samples,)
        Labels of each sample.

    neighbors_ : array, (n_neighboring_pairs, 2) or None
        An array that contains all neighboring pairs with each row being
        a unique neighboring pair. It is ``None`` when the clusterer does not
        support this attribute.

    oversampler_ : object
        A fitted clone of the ``oversampler`` parameter.

    transformer_ : object or None
        A fitted pipeline made of clones of the steps that precede the
        over-sampler when ``oversampler`` is a ``Pipeline`` with more than
        one step, otherwise ``None``.

    random_state_ : object
        An instance of ``RandomState`` class.

    sampling_strategy_ : dict
        Actual sampling strategy.

    Examples
    --------
    >>> from collections import Counter
    >>> from clover.over_sampling import ClusterOverSampler
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.cluster import KMeans
    >>> from imblearn.over_sampling import SMOTE
    >>> X, y = make_classification(random_state=0, n_classes=2, weights=[0.9, 0.1])
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{0: 90, 1: 10}})
    >>> cluster_oversampler = ClusterOverSampler(
    ... oversampler=SMOTE(random_state=5),
    ... clusterer=KMeans(random_state=10))
    >>> X_res, y_res = cluster_oversampler.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{0: 90, 1: 90}})
    """

    def __init__(
        self,
        oversampler,
        clusterer=None,
        distributor=None,
        raise_error=True,
        random_state=None,
        n_jobs=None,
    ):
        self.oversampler = oversampler
        self.clusterer = clusterer
        self.distributor = distributor
        self.raise_error = raise_error
        self.random_state = random_state
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Check inputs and statistics of the sampler.

        You should use ``fit_resample`` in all cases.

        Parameters
        ----------
        X : {array-like, dataframe, sparse matrix} of shape \
                (n_samples, n_features)
            Data array.

        y : array-like of shape (n_samples,)
            Target array.

        Returns
        -------
        self : object
            Return the instance itself.
        """
        X, y, _ = self._check_X_y(X, y)
        self._check(X, y)
        return self

    def fit_resample(self, X, y, **fit_params):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, dataframe, sparse matrix} of shape \
                (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like of shape (n_samples,)
            Corresponding label for each sample in X.

        **fit_params : dict
            Extra keyword arguments forwarded to the ``fit`` method of the
            clusterer.

        Returns
        -------
        X_resampled : {array-like, dataframe, sparse matrix} of shape \
                (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : array-like of shape (n_samples_new,)
            The corresponding label of `X_resampled`.
        """
        check_classification_targets(y)
        # Remembers the input container types so the output can be
        # converted back after resampling.
        arrays_transformer = ArraysTransformer(X, y)
        X, y, binarize_y = self._check_X_y(X, y)

        self._check(X, y)._fit(X, y, **fit_params)

        output = self._fit_resample(X, y)

        y_ = (
            label_binarize(y=output[1], classes=np.unique(y))
            if binarize_y
            else output[1]
        )

        X_, y_ = arrays_transformer.transform(output[0], y_)
        return (X_, y_) if len(output) == 2 else (X_, y_, output[2])

    def _cluster_sample(self, clusters_data, X, y):
        """Generate artificial data inside clusters or between clusters.

        Returns a list ``[X_new, y_new]`` of concatenated generated
        arrays, or ``(None, None)`` when nothing was generated.
        """
        generated_data = Parallel(n_jobs=self.n_jobs)(
            delayed(_generate_in_cluster)(self.oversampler_, self.transformer_, *data)
            for data in clusters_data
        )
        if generated_data:
            return [np.concatenate(data) for data in zip(*generated_data)]
        return None, None

    def _intra_sample(self, X, y):
        """Intracluster resampling."""
        clusters_data = _extract_intra_data(
            X,
            y,
            self.labels_,
            self.distributor_.intra_distribution_,
            self.sampling_strategy_,
        )
        return self._cluster_sample(clusters_data, X, y)

    def _inter_sample(self, X, y):
        """Intercluster resampling."""
        clusters_data = _extract_inter_data(
            X,
            y,
            self.labels_,
            self.distributor_.inter_distribution_,
            self.sampling_strategy_,
            self.random_state_,
        )
        return self._cluster_sample(clusters_data, X, y)

    def _check_estimators(self, X, y):
        """Check various estimators.

        Sets ``oversampler_``, ``clusterer_`` and ``distributor_`` (and
        ``transformer_`` when the over-sampler is a multi-step pipeline).

        Raises
        ------
        ValueError
            If a distributor is given without a clusterer.
        """

        # Check transformer and oversampler
        if isinstance(self.oversampler, Pipeline):
            steps = self.oversampler.steps
            if steps[:-1]:
                # Fit a clone so that the estimators passed by the user as
                # a constructor parameter are never mutated by fitting.
                self.transformer_ = clone(Pipeline(steps[:-1])).fit(X)
            self.oversampler_ = clone(steps[-1][-1])
        else:
            self.oversampler_ = clone(self.oversampler)

        # Check clusterer and distributor
        if self.clusterer is None and self.distributor is not None:
            raise ValueError(
                'Distributor was found but clusterer is set to `None`. '
                'Either set parameter `distributor` to `None` or use a clusterer.'
            )
        elif self.clusterer is None and self.distributor is None:
            self.clusterer_ = None
            self.distributor_ = BaseDistributor()
        else:
            self.clusterer_ = clone(self.clusterer)
            self.distributor_ = (
                DensityDistributor()
                if self.distributor is None
                else clone(self.distributor)
            )
        return self

    def _check_sampling_strategy(self, y):
        """Check sampling strategy."""
        self.sampling_strategy_ = check_sampling_strategy(
            self.oversampler_.sampling_strategy,
            y,
            self._sampling_type,
        )
        return self

    def _check(self, X, y):
        """Apply various checks."""

        # Check random state
        self.random_state_ = check_random_state(self.random_state)

        # Check transformer: reset here, possibly set by _check_estimators
        self.transformer_ = None

        # Check estimators and sampling strategy
        self._check_estimators(X, y)._check_sampling_strategy(y)

        return self

    def _fit(self, X, y, **fit_params):
        """Fit the clusterer and distributor.

        Raises
        ------
        ValueError
            If the fitted distributor has empty intra- and inter-cluster
            distributions and ``raise_error`` is ``True``. A
            ``FitFailedWarning`` is issued instead when it is ``False``.
        """

        # Fit clusterer
        if self.clusterer_ is not None:
            self.clusterer_.fit(X, y, **fit_params)

        # Extract labels and neighbors. The sample count is read from the
        # shape when available because ``len`` is not defined for scipy
        # sparse matrices.
        n_samples = X.shape[0] if hasattr(X, 'shape') else len(X)
        self.labels_ = getattr(
            self.clusterer_, 'labels_', np.zeros(n_samples, dtype=int)
        )
        self.neighbors_ = getattr(self.clusterer_, 'neighbors_', None)

        # fit distributor
        self.distributor_.fit(X, y, labels=self.labels_, neighbors=self.neighbors_)

        # Case when no samples are generated
        if (
            not self.distributor_.intra_distribution_
            and not self.distributor_.inter_distribution_
        ):
            msg = (
                'No samples were generated. Try to modify the parameters '
                'of the clusterer or distributor.'
            )

            # Raise error
            if self.raise_error:
                raise ValueError(msg)

            # Display warning
            else:
                warnings.warn(msg, FitFailedWarning)

        return self

    def _fit_resample(self, X, y, **fit_params):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        **fit_params : dict
            Accepted for signature compatibility; not used here.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new,)
            The corresponding label of `X_resampled`
        """

        # Intracluster oversampling
        X_intra_new, y_intra_new = self._intra_sample(X, y)

        # Intercluster oversampling
        X_inter_new, y_inter_new = self._inter_sample(X, y)

        # Set sampling strategy to the number of samples actually generated
        # per class (``Counter(None)`` is an empty counter).
        intra_count, inter_count = Counter(y_intra_new), Counter(y_inter_new)
        self.sampling_strategy_ = OrderedDict({})
        for class_label in set(intra_count.keys()).union(inter_count.keys()):
            self.sampling_strategy_[class_label] = intra_count.get(
                class_label, 0
            ) + inter_count.get(class_label, 0)

        # Stack resampled data, skipping the parts that were not generated
        X_parts = [
            self.transformer_.transform(X) if self.transformer_ is not None else X,
            X_intra_new,
            X_inter_new,
        ]
        y_parts = [y, y_intra_new, y_inter_new]
        X_resampled, y_resampled = (
            np.vstack([X_part for X_part in X_parts if X_part is not None]),
            np.hstack([y_part for y_part in y_parts if y_part is not None]),
        )

        return X_resampled, y_resampled