Python Pipeline.transform примеры использования

Язык программирования: Python

Пространство имен/Пакет: imblearn.pipeline

Класс/Тип: Pipeline

Метод/Функция: transform

Примеров на hotexamples.com: 2

Python Pipeline.transform — найдено 2 примера. Это лучшие примеры Python-кода для imblearn.pipeline.Pipeline.transform, полученные из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

Pipeline(30)

predict(30)

fit(30)

fit_resample(30)

predict_proba(30)

set_params(19)

fit_transform(9)

score(8)

predict_log_proba(8)

inverse_transform(5)

get_params(4)

fit_predict(4)

decision_function(4)

fit_sample(2)

score_samples(2)

steps(2)

transform(2)

_kind(1)

Пример #1

Показать файл

Файл: helper.py Проект: akoury/ml-helper

    def stack_predict(self, df, holdout, pipes, amount=2):
        """Train a stacked model on ``df`` and predict on ``holdout``.

        Every step but the last of ``self.top_pipeline(pipes)`` is fitted on
        the training features and then applied to both feature sets. The last
        step of ``self.top_pipeline(pipes, i)`` for each ``i`` in
        ``range(amount)`` is collected as a base estimator of a
        ``StackingTransformer``. The first collected estimator is finally
        fitted on the stacked training features and used for prediction.

        Parameters
        ----------
        df : object
            Training data, split into features and target by
            ``self.split_x_y``.
        holdout : object
            Hold-out data, split the same way.
        pipes : object
            Forwarded unchanged to ``self.top_pipeline``.
        amount : int, default=2
            Number of base estimators to collect.

        Returns
        -------
        tuple
            ``(final_estimator, y_holdout, predictions)`` where
            ``predictions`` are made on the stacked hold-out features.
        """
        features, target = self.split_x_y(df)
        holdout_features, holdout_target = self.split_x_y(holdout)

        # Preprocessing only: drop the final step of the top pipeline.
        preprocessing = Pipeline(self.top_pipeline(pipes).steps[:-1])
        features = preprocessing.fit_transform(features)
        holdout_features = preprocessing.transform(holdout_features)

        base_estimators = [
            (str(rank), self.top_pipeline(pipes, rank).steps[-1][1])
            for rank in range(amount)
        ]

        # The stacker is told whether the configured metric is a regression one.
        regression_metrics = (
            "explained_variance",
            "neg_mean_absolute_error",
            "neg_mean_squared_error",
            "neg_mean_squared_log_error",
            "neg_median_absolute_error",
            "r2",
        )
        is_regression = self.METRIC in regression_metrics

        stacker = StackingTransformer(base_estimators, is_regression)
        stacker.fit(features, target)

        stacked_train = stacker.transform(features)
        stacked_holdout = stacker.transform(holdout_features)

        # The first base estimator doubles as the final (meta) model.
        _, meta_model = base_estimators[0]
        meta_model.fit(stacked_train, target)

        return meta_model, holdout_target, meta_model.predict(stacked_holdout)

Пример #2

Показать файл

Файл: _cluster.py Проект: joaopfonseca/cluster-over-sampling

class ClusterOverSampler(BaseOverSampler):
    """A class that handles clustering-based over-sampling.

    Any combination of over-sampler, clusterer and distributor can
    be used.

    Read more in the :ref:`user guide <user_guide>`.

    Parameters
    ----------
    oversampler : oversampler estimator
        Over-sampler to apply to each selected cluster. It may also be a
        pipeline whose final step is the over-sampler; the preceding steps
        are then fitted and used as a transformer.

    clusterer : clusterer estimator, default=None
        Clusterer to apply to input space before over-sampling.

        - When ``None``, it corresponds to a clusterer that assigns
          a single cluster to all the samples i.e. no clustering is applied.

        - When clusterer, it applies clustering to the input space. Then
          over-sampling is applied inside each cluster and between clusters.

    distributor : distributor estimator, default=None
        Distributor to distribute the generated samples per cluster label.

        - When ``None`` and a clusterer is provided then it corresponds to the
          density distributor. If clusterer is also ``None`` then the distributor
          does not affect the over-sampling procedure.

        - When distributor object is provided, it is used to distribute the
          generated samples to the clusters.

    raise_error : bool, default=True
        Raise an error when no samples are generated.

        - If ``True``, it raises an error when no filtered clusters are
          identified and therefore no samples are generated.

        - If ``False``, it displays a warning.

    {random_state}

    {n_jobs}

    Attributes
    ----------
    clusterer_ : object
        A fitted clone of the ``clusterer`` parameter or ``None`` when a
        clusterer is not given.

    distributor_ : object
        A fitted clone of the ``distributor`` parameter. When a distributor
        is not given it is a fitted ``DensityDistributor`` if a clusterer is
        provided, otherwise a fitted ``BaseDistributor``.

    labels_ : array, shape (n_samples,)
        Labels of each sample.

    neighbors_ : array, (n_neighboring_pairs, 2) or None
        An array that contains all neighboring pairs with each row being
        a unique neighboring pair. It is ``None`` when the clusterer does not
        support this attribute.

    oversampler_ : object
        A fitted clone of the ``oversampler`` parameter.

    transformer_ : object or None
        A fitted clone of the steps that precede the final step when the
        ``oversampler`` parameter is a pipeline with more than one step,
        otherwise ``None``.

    random_state_ : object
        An instance of ``RandomState`` class.

    sampling_strategy_ : dict
        Actual sampling strategy.

    Examples
    --------
    >>> from collections import Counter
    >>> from clover.over_sampling import ClusterOverSampler
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.cluster import KMeans
    >>> from imblearn.over_sampling import SMOTE
    >>> X, y = make_classification(random_state=0, n_classes=2, weights=[0.9, 0.1])
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{0: 90, 1: 10}})
    >>> cluster_oversampler = ClusterOverSampler(
    ... oversampler=SMOTE(random_state=5),
    ... clusterer=KMeans(random_state=10))
    >>> X_res, y_res = cluster_oversampler.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{0: 90, 1: 90}})
    """

    # NOTE(review): the ``{random_state}`` / ``{n_jobs}`` placeholders and the
    # doubled braces in the doctest output above suggest that the class
    # docstring is filled in by a formatting decorator that is not visible
    # here -- confirm before editing any braces in it.

    def __init__(
        self,
        oversampler,
        clusterer=None,
        distributor=None,
        raise_error=True,
        random_state=None,
        n_jobs=None,
    ):
        # Parameters are stored untouched; validation happens in ``_check``.
        self.oversampler = oversampler
        self.clusterer = clusterer
        self.distributor = distributor
        self.raise_error = raise_error
        self.random_state = random_state
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Check inputs and statistics of the sampler.

        You should use ``fit_resample`` in all cases.

        Parameters
        ----------
        X : {array-like, dataframe, sparse matrix} of shape \
                (n_samples, n_features)
            Data array.
        y : array-like of shape (n_samples,)
            Target array.

        Returns
        -------
        self : object
            Return the instance itself.
        """
        X, y, _ = self._check_X_y(X, y)
        self._check(X, y)
        return self

    def fit_resample(self, X, y, **fit_params):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, dataframe, sparse matrix} of shape \
                (n_samples, n_features)
            Matrix containing the data which have to be sampled.
        y : array-like of shape (n_samples,)
            Corresponding label for each sample in X.
        **fit_params : dict
            Additional keyword arguments forwarded to the ``fit`` method of
            the clusterer.

        Returns
        -------
        X_resampled : {array-like, dataframe, sparse matrix} of shape \
                (n_samples_new, n_features)
            The array containing the resampled data.
        y_resampled : array-like of shape (n_samples_new,)
            The corresponding label of `X_resampled`.
        """
        check_classification_targets(y)
        # Built from the raw inputs so the outputs can be converted back to
        # the caller's original container types at the end.
        arrays_transformer = ArraysTransformer(X, y)
        X, y, binarize_y = self._check_X_y(X, y)

        self._check(X, y)._fit(X, y, **fit_params)

        output = self._fit_resample(X, y)

        y_ = (
            label_binarize(y=output[1], classes=np.unique(y))
            if binarize_y
            else output[1]
        )

        X_, y_ = arrays_transformer.transform(output[0], y_)
        return (X_, y_) if len(output) == 2 else (X_, y_, output[2])

    def _cluster_sample(self, clusters_data, X, y):
        """Generate artificial data inside clusters or between clusters.

        Returns ``(None, None)`` when no data are generated, otherwise the
        per-cluster results concatenated position by position.
        """
        generated_data = Parallel(n_jobs=self.n_jobs)(
            delayed(_generate_in_cluster)(self.oversampler_, self.transformer_, *data)
            for data in clusters_data
        )
        if not generated_data:
            return None, None
        return [np.concatenate(data) for data in zip(*generated_data)]

    def _intra_sample(self, X, y):
        """Intracluster resampling."""
        clusters_data = _extract_intra_data(
            X,
            y,
            self.labels_,
            self.distributor_.intra_distribution_,
            self.sampling_strategy_,
        )
        return self._cluster_sample(clusters_data, X, y)

    def _inter_sample(self, X, y):
        """Intercluster resampling."""
        clusters_data = _extract_inter_data(
            X,
            y,
            self.labels_,
            self.distributor_.inter_distribution_,
            self.sampling_strategy_,
            self.random_state_,
        )
        return self._cluster_sample(clusters_data, X, y)

    def _check_estimators(self, X, y):
        """Check various estimators.

        Sets ``oversampler_``, ``clusterer_`` and ``distributor_`` (and
        ``transformer_`` when the over-sampler is a multi-step pipeline).

        Raises
        ------
        ValueError
            If a distributor is given without a clusterer.
        """

        # Check transformer and oversampler
        if isinstance(self.oversampler, Pipeline):
            transformer_steps = self.oversampler.steps[:-1]
            if transformer_steps:
                # Clone before fitting so that the estimators held by the
                # user-supplied pipeline are not modified, in line with how
                # the final step is cloned below.
                self.transformer_ = clone(Pipeline(transformer_steps)).fit(X)
            self.oversampler_ = clone(self.oversampler.steps[-1][-1])
        else:
            self.oversampler_ = clone(self.oversampler)

        # Check clusterer and distributor
        if self.clusterer is None and self.distributor is not None:
            raise ValueError(
                'Distributor was found but clusterer is set to `None`. '
                'Either set parameter `distributor` to `None` or use a clusterer.'
            )
        elif self.clusterer is None and self.distributor is None:
            self.clusterer_ = None
            self.distributor_ = BaseDistributor()
        else:
            self.clusterer_ = clone(self.clusterer)
            self.distributor_ = (
                DensityDistributor()
                if self.distributor is None
                else clone(self.distributor)
            )
        return self

    def _check_sampling_strategy(self, y):
        """Check sampling strategy.

        The strategy is read from the resolved ``oversampler_`` and therefore
        requires ``_check_estimators`` to have run first.
        """
        self.sampling_strategy_ = check_sampling_strategy(
            self.oversampler_.sampling_strategy,
            y,
            self._sampling_type,
        )
        return self

    def _check(self, X, y):
        """Apply various checks."""

        # Check random state
        self.random_state_ = check_random_state(self.random_state)

        # Check transformer: reset here, ``_check_estimators`` only sets it
        # when the over-sampler is a pipeline with preceding steps
        self.transformer_ = None

        # Check estimators and sampling strategy
        self._check_estimators(X, y)._check_sampling_strategy(y)

        return self

    def _fit(self, X, y, **fit_params):
        """Fit the clusterer and distributor.

        Raises
        ------
        ValueError
            If the fitted distributor yields neither an intra-cluster nor an
            inter-cluster distribution and ``raise_error`` is ``True``. A
            ``FitFailedWarning`` is issued instead when it is ``False``.
        """

        # Fit clusterer
        if self.clusterer_ is not None:
            self.clusterer_.fit(X, y, **fit_params)

        # Extract labels and neighbors. Without labels every sample is put in
        # a single cluster. ``X.shape[0]`` is used rather than ``len(X)``
        # because the default is always evaluated and ``len`` is not defined
        # for SciPy sparse matrices.
        self.labels_ = getattr(
            self.clusterer_, 'labels_', np.zeros(X.shape[0], dtype=int)
        )
        self.neighbors_ = getattr(self.clusterer_, 'neighbors_', None)

        # fit distributor
        self.distributor_.fit(X, y, labels=self.labels_, neighbors=self.neighbors_)

        # Case when no samples are generated
        if (
            not self.distributor_.intra_distribution_
            and not self.distributor_.inter_distribution_
        ):
            msg = (
                'No samples were generated. Try to modify the parameters '
                'of the clusterer or distributor.'
            )

            # Raise error
            if self.raise_error:
                raise ValueError(msg)

            # Display warning
            else:
                warnings.warn(msg, FitFailedWarning)

        return self

    def _fit_resample(self, X, y, **fit_params):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        **fit_params : dict
            Accepted but not used by this method.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new,)
            The corresponding label of `X_resampled`
        """

        # Intracluster oversampling
        X_intra_new, y_intra_new = self._intra_sample(X, y)

        # Intercluster oversampling
        X_inter_new, y_inter_new = self._inter_sample(X, y)

        # Set sampling strategy to the number of samples actually generated
        # per class (``Counter(None)`` is simply an empty counter)
        intra_count, inter_count = Counter(y_intra_new), Counter(y_inter_new)
        self.sampling_strategy_ = OrderedDict()
        for class_label in set(intra_count).union(inter_count):
            self.sampling_strategy_[class_label] = intra_count.get(
                class_label, 0
            ) + inter_count.get(class_label, 0)

        # Stack resampled data; the original samples are passed through the
        # transformer, when there is one, before being stacked with the
        # generated samples
        X_original = (
            self.transformer_.transform(X) if self.transformer_ is not None else X
        )
        X_parts = [X_original, X_intra_new, X_inter_new]
        y_parts = [y, y_intra_new, y_inter_new]
        X_resampled = np.vstack([part for part in X_parts if part is not None])
        y_resampled = np.hstack([part for part in y_parts if part is not None])

        return X_resampled, y_resampled