Пример #1
0
    def _validate_estimator(self):
        """Create the necessary attributes for Geometric SMOTE.

        Sets ``random_state_`` and, depending on ``selection_strategy``,
        the nearest-neighbors attributes ``nns_pos_`` (for ``'minority'``
        and ``'combined'``) and ``nn_neg_`` (for ``'majority'`` and
        ``'combined'``). Both neighbors objects receive ``self.n_jobs``.

        Raises
        ------
        ValueError
            If ``selection_strategy`` is not in ``SELECTION_STRATEGY``.
        """

        # Check random state
        self.random_state_ = check_random_state(self.random_state)

        # Validate strategy
        if self.selection_strategy not in SELECTION_STRATEGY:
            error_msg = (
                'Unknown selection_strategy for Geometric SMOTE algorithm. '
                'Choices are {}. Got {} instead.')
            raise ValueError(
                error_msg.format(SELECTION_STRATEGY, self.selection_strategy))

        # Create nearest neighbors object for positive class; one neighbor
        # more than ``k_neighbors`` is requested (presumably because each
        # sample is returned as its own nearest neighbor -- confirm).
        if self.selection_strategy in ('minority', 'combined'):
            self.nns_pos_ = check_neighbors_object('nns_positive',
                                                   self.k_neighbors,
                                                   additional_neighbor=1)
            self.nns_pos_.set_params(n_jobs=self.n_jobs)

        # Create nearest neighbors object for negative class (always built
        # from the fixed value 1, independent of ``k_neighbors``).
        if self.selection_strategy in ('majority', 'combined'):
            self.nn_neg_ = check_neighbors_object('nn_negative', nn_object=1)
            self.nn_neg_.set_params(n_jobs=self.n_jobs)
Пример #2
0
    def _validate_estimator(self):
        """Set up the random state and the three nearest-neighbors helpers.

        After this call the instance carries ``random_state_``, ``nn_mix_``,
        ``nns_pos_`` and ``nn_neg_``. Only the positive and negative helpers
        are given ``self.n_jobs``; the mixed one is left as created.
        """
        self.random_state_ = check_random_state(self.random_state)

        # Mixed-class neighbors, built straight from ``k_neighbors``.
        self.nn_mix_ = check_neighbors_object(
            'nns_mixed', nn_object=self.k_neighbors
        )

        # Positive-class neighbors, with one neighbor added on top.
        self.nns_pos_ = check_neighbors_object(
            'nns_positive', nn_object=self.k_neighbors, additional_neighbor=1
        )
        self.nns_pos_.set_params(**{'n_jobs': self.n_jobs})

        # Negative-class neighbors, always built from the value 1.
        self.nn_neg_ = check_neighbors_object('nn_negative', nn_object=1)
        self.nn_neg_.set_params(**{'n_jobs': self.n_jobs})
def test_check_neighbors_object():
    """Check ``check_neighbors_object`` for int, estimator and string input."""
    param_name = 'n_neighbors'

    # Integer input gives a KNeighborsMixin with that neighbor count.
    built = check_neighbors_object(param_name, 1)
    assert issubclass(type(built), KNeighborsMixin)
    assert built.n_neighbors == 1

    # The third positional argument is added to the requested count.
    built_plus_one = check_neighbors_object(param_name, 1, 1)
    assert issubclass(type(built_plus_one), KNeighborsMixin)
    assert built_plus_one.n_neighbors == 2

    # A ready-made estimator must come back as the very same object.
    prebuilt = NearestNeighbors(1)
    assert prebuilt is check_neighbors_object(param_name, prebuilt)

    # A string is rejected with a ValueError.
    with raises(ValueError, match="has to be one of"):
        check_neighbors_object(param_name, 'rnd')
Пример #4
0
def test_check_neighbors_object():
    """Verify the three input kinds handled by ``check_neighbors_object``."""
    label = 'n_neighbors'
    requested = 1

    # Case 1: plain integer.
    from_int = check_neighbors_object(label, requested)
    assert issubclass(type(from_int), KNeighborsMixin)
    assert from_int.n_neighbors == 1

    # Case 2: integer plus one additional neighbor (third positional arg).
    from_int_extra = check_neighbors_object(label, requested, 1)
    assert issubclass(type(from_int_extra), KNeighborsMixin)
    assert from_int_extra.n_neighbors == 2

    # Case 3: an estimator instance is passed through by identity.
    given = NearestNeighbors(requested)
    returned = check_neighbors_object(label, given)
    assert given is returned

    # Case 4: a string value raises ValueError with the expected message.
    with raises(ValueError, match="has to be one of"):
        check_neighbors_object(label, 'rnd')
def test_check_neighbors_object():
    """Check ``check_neighbors_object`` using the assert_* helper functions."""
    param_name = 'n_neighbors'

    # Integer input gives a KNeighborsMixin with that neighbor count.
    built = check_neighbors_object(param_name, 1)
    assert issubclass(type(built), KNeighborsMixin)
    assert_equal(built.n_neighbors, 1)

    # The third positional argument is added to the requested count.
    built_plus_one = check_neighbors_object(param_name, 1, 1)
    assert issubclass(type(built_plus_one), KNeighborsMixin)
    assert_equal(built_plus_one.n_neighbors, 2)

    # A ready-made estimator must come back as the very same object.
    prebuilt = NearestNeighbors(1)
    assert prebuilt is check_neighbors_object(param_name, prebuilt)

    # A string is rejected with a ValueError matching the pattern.
    assert_raises_regex(
        ValueError,
        "has to be one of",
        check_neighbors_object,
        param_name,
        'rnd',
    )
Пример #6
0
def test_check_neighbors_object():
    """Check ``check_neighbors_object`` for int, estimator and string input.

    In this variant an estimator input is only required to keep its
    ``n_neighbors`` value; object identity is not asserted.
    """
    param_name = "n_neighbors"

    # Integer input gives a KNeighborsMixin with that neighbor count.
    built = check_neighbors_object(param_name, 1)
    assert issubclass(type(built), KNeighborsMixin)
    assert built.n_neighbors == 1

    # The third positional argument is added to the requested count.
    built_plus_one = check_neighbors_object(param_name, 1, 1)
    assert issubclass(type(built_plus_one), KNeighborsMixin)
    assert built_plus_one.n_neighbors == 2

    # Estimator input: the returned object keeps the neighbor count.
    source = NearestNeighbors(1)
    returned = check_neighbors_object(param_name, source)
    assert source.n_neighbors == returned.n_neighbors

    # A string is rejected with a ValueError.
    with pytest.raises(ValueError, match="has to be one of"):
        check_neighbors_object(param_name, "rnd")
Пример #7
0
    def _validate_estimator(self):
        """Validate ``selection_strategy`` and build the neighbors objects.

        ``nns_pos_`` is created for the ``'minority'`` and ``'combined'``
        strategies, ``nn_neg_`` for ``'majority'`` and ``'combined'``.
        Each created object receives ``self.n_jobs``.

        Raises
        ------
        ValueError
            If ``selection_strategy`` is not in ``SELECTION_STRATEGY``.
        """
        strategy = self.selection_strategy

        # Reject unknown strategies up front.
        if strategy not in SELECTION_STRATEGY:
            template = (
                'Unknown selection_strategy for Geometric SMOTE algorithm. '
                'Choices are {}. Got {} instead.'
            )
            raise ValueError(template.format(SELECTION_STRATEGY, strategy))

        # Positive-class neighbors: ``k_neighbors`` plus one extra neighbor.
        if strategy in ('minority', 'combined'):
            self.nns_pos_ = check_neighbors_object(
                'nns_positive', self.k_neighbors, additional_neighbor=1
            )
            self.nns_pos_.set_params(**{'n_jobs': self.n_jobs})

        # Negative-class neighbors: always built from the value 1.
        if strategy in ('majority', 'combined'):
            self.nn_neg_ = check_neighbors_object('nn_negative', nn_object=1)
            self.nn_neg_.set_params(**{'n_jobs': self.n_jobs})
Пример #8
0
    def _validate_estimator(self, default=AdaBoostClassifier()):
        """
        Check the estimator and the n_estimators attribute, then set the
        'base_estimator_', 'nn_k_' and 'smote' attributes.

        :param default: classifier object used if base_estimator=None;
            it is cloned, never stored directly
        :return: None
        :raises ValueError: if n_estimators is not an integer, if it is
            not greater than zero, or if ratio is a non-empty dict
        """

        if not isinstance(self.n_estimators, (numbers.Integral, np.integer)):
            raise ValueError("n_estimators must be an integer, "
                             "got {0}.".format(type(self.n_estimators)))

        if self.n_estimators <= 0:
            raise ValueError("n_estimators must be greater than zero, "
                             "got {0}.".format(self.n_estimators))

        # Always keep a clone, so neither the user-supplied estimator nor the
        # shared default instance ends up stored on this object.
        if self.base_estimator is not None:
            base_estimator = clone(self.base_estimator)
        else:
            base_estimator = clone(default)

        # An empty dict is let through; only a populated dict is rejected.
        if isinstance(self.ratio, dict) and self.ratio != {}:
            raise ValueError(
                "'dict' type cannot be accepted for ratio in this class; "
                "use alternative options")

        # Neighbors object built from k_neighbors plus one extra neighbor.
        self.nn_k_ = check_neighbors_object('k_neighbors',
                                            self.k_neighbors,
                                            additional_neighbor=1)
        self.nn_k_.set_params(**{'n_jobs': self.n_jobs})

        # The SMOTE sampler gets the raw k_neighbors value, not nn_k_.
        self.smote = SMOTE(ratio=self.ratio,
                           k_neighbors=self.k_neighbors,
                           random_state=self.random_state)

        self.base_estimator_ = base_estimator
def test_check_neighbors_object():
    """Check ``check_neighbors_object`` for int, estimators and string input.

    Covers a stock ``NearestNeighbors`` and a ``_CustomNearestNeighbors``
    instance in addition to integer and invalid string input.
    """
    param_name = "n_neighbors"

    # Integer input gives a KNeighborsMixin with that neighbor count.
    built = check_neighbors_object(param_name, 1)
    assert issubclass(type(built), KNeighborsMixin)
    assert built.n_neighbors == 1

    # The third positional argument is added to the requested count.
    built_plus_one = check_neighbors_object(param_name, 1, 1)
    assert issubclass(type(built_plus_one), KNeighborsMixin)
    assert built_plus_one.n_neighbors == 2

    # Stock estimator input: the returned object keeps the neighbor count.
    stock = NearestNeighbors(n_neighbors=1)
    stock_result = check_neighbors_object(param_name, stock)
    assert stock.n_neighbors == stock_result.n_neighbors

    # Custom estimator input: the returned object keeps the custom type.
    custom = _CustomNearestNeighbors()
    custom_result = check_neighbors_object(param_name, custom)
    assert isinstance(custom_result, _CustomNearestNeighbors)

    # A string is rejected. NOTE(review): the spelling "interger" is part of
    # the matched message and presumably mirrors the library's own text --
    # confirm before changing it.
    expected_message = (
        "n_neighbors must be an interger or an object compatible with the "
        "KNeighborsMixin API of scikit-learn"
    )
    with pytest.raises(ValueError, match=expected_message):
        check_neighbors_object(param_name, "rnd")
Пример #10
0
 def _validate_estimator(self):
     """Build the nearest-neighbors object ``nn_`` used by ADASYN.

     The object is created from ``n_neighbors`` with one additional
     neighbor and then configured with ``self.n_jobs``.
     """
     self.nn_ = check_neighbors_object(
         "n_neighbors", self.n_neighbors, additional_neighbor=1
     )
     self.nn_.set_params(n_jobs=self.n_jobs)
Пример #11
0
 def _validate_estimator(self):
     """Build the nearest-neighbors object ``nn_k`` and set its ``n_jobs``.

     The object is created from ``k_neighbors`` with one additional
     neighbor.
     """
     self.nn_k = check_neighbors_object(
         'K_neighbors',
         self.k_neighbors,
         additional_neighbor=1,
     )
     self.nn_k.set_params(n_jobs=self.n_jobs)
Пример #12
0
    def _inter_sample(self, X, y, initial_sampling_strategy):
        """Intercluster resampling.

        For every cluster pair in ``self.distributor_.inter_distribution_``
        and every class in ``initial_sampling_strategy``, new samples are
        generated one at a time. Each one is produced by calling
        ``self._basic_sample`` on a small dataset built from one class
        sample of each cluster plus all other-class samples of both clusters.

        Parameters
        ----------
        X : array
            Input samples; indexed with boolean masks and read through
            ``X.dtype`` and ``X.shape[1]``.
        y : array
            Targets aligned with ``X``; indexed with the same masks.
        initial_sampling_strategy : mapping
            Maps class label to number of samples; iterated via ``.items()``.

        Returns
        -------
        X_new, y_new : arrays
            The generated samples and their targets. Both are empty when the
            instance has neither ``k_neighbors`` nor ``n_neighbors``.

        Notes
        -----
        ``self.sampling_strategy_`` is overwritten during the loop and set to
        ``initial_sampling_strategy`` before the normal return. The early
        return does not touch it. ``self.inter_sampling_strategies_`` is
        reset to an empty list and is not filled by this method.
        """

        # Random state (falls back to None when the attribute is missing)
        random_state = check_random_state(
            self.random_state if hasattr(self, 'random_state') else None)

        # Initialize arrays of new data (zero rows, same column count as X)
        X_new = np.array([], dtype=X.dtype).reshape(0, X.shape[1])
        y_new = np.array([], dtype=y.dtype)

        # Number of nearest neighbors; without either attribute there is
        # nothing to do and the empty arrays are returned as they are.
        if hasattr(self, 'k_neighbors'):
            k = self.k_neighbors
        elif hasattr(self, 'n_neighbors'):
            k = self.n_neighbors
        else:
            return X_new, y_new

        # Intercluster oversampling
        self.inter_sampling_strategies_ = []
        for (label1,
             label2), proportion in self.distributor_.inter_distribution_:

            # Filter data in cluster 1 and cluster 2
            mask1, mask2 = (self.clusterer_.labels_ == label1), (
                self.clusterer_.labels_ == label2)
            X_in_cluster1, y_in_cluster1, X_in_cluster2, y_in_cluster2 = X[
                mask1], y[mask1], X[mask2], y[mask2]

            # Calculate sampling strategy in the clusters: a class gets
            # int(n_samples * proportion) new samples only if it is present
            # in both clusters, otherwise 0.
            clusters_sampling_strategy = OrderedDict({
                class_label:
                (int(n_samples * proportion) if class_label in y_in_cluster1
                 and class_label in y_in_cluster2 else 0)
                for class_label, n_samples in
                initial_sampling_strategy.items()
            })

            # Resample data
            for class_label, n_samples in clusters_sampling_strategy.items():

                # Modify sampling strategy so each _basic_sample call below
                # is asked for a single sample of this class
                self.sampling_strategy_ = {class_label: 1}

                for _ in range(n_samples):

                    # Identify clusters: a step of 1 keeps the
                    # (cluster 1, cluster 2) order, a step of -1 reverses it
                    ind = random_state.choice([1, -1])
                    (X1, X2), (y1,
                               y2) = [X_in_cluster1, X_in_cluster2
                                      ][::ind], [y_in_cluster1,
                                                 y_in_cluster2][::ind]

                    # Select randomly one sample of the class from the first
                    # cluster; the slices keep it as a one-row array
                    ind1 = random_state.choice(np.where(y1 == class_label)[0])
                    X1_class, y1_class = X1[ind1:(ind1 + 1)], y1[ind1:(ind1 +
                                                                       1)]

                    # Select all samples of the class from the second cluster
                    ind2 = np.where(y2 == class_label)[0]
                    X2_class, y2_class = X2[ind2], y2[ind2]

                    # Fit neighbors on the stacked rows (row 0 is the sample
                    # picked above) and draw one of row 0's neighbors.
                    # NOTE(review): assumes kneighbors() without arguments
                    # excludes the query row itself, so ind_nn >= 1 and
                    # ind_nn - 1 indexes into X2_class -- confirm.
                    X_class = np.vstack((X1_class, X2_class))
                    k_nn = min(k, len(X_class) - 1)
                    nn = check_neighbors_object('nn', k_nn).fit(X_class)
                    ind_nn = random_state.choice(nn.kneighbors()[1][0])

                    # Build the dataset to resample: the two selected class
                    # samples followed by every other-class sample of both
                    # clusters
                    X_in_clusters = np.vstack(
                        (X1_class, X2_class[(ind_nn - 1):ind_nn],
                         X1[y1 != class_label], X2[y2 != class_label]))
                    y_in_clusters = np.hstack(
                        (y1_class, y2_class[(ind_nn - 1):ind_nn],
                         y1[y1 != class_label], y2[y2 != class_label]))

                    # Modify attributes for corner cases; the returned
                    # mapping holds the values restored further down
                    initial_attributes = self._modify_attributes(
                        1, y_in_clusters.size)

                    # Resample class data and keep only the rows after the
                    # first len(X_in_clusters) ones (presumably the input
                    # rows are returned first -- confirm against
                    # _basic_sample)
                    X_new_cluster, y_new_cluster = self._basic_sample(
                        X_in_clusters, y_in_clusters)
                    X_new_cluster, y_new_cluster = X_new_cluster[len(
                        X_in_clusters):], y_new_cluster[len(X_in_clusters):]
                    X_new, y_new = np.vstack(
                        (X_new, X_new_cluster)), np.hstack(
                            (y_new, y_new_cluster))

                    # Restore modified attributes
                    for attribute, value in initial_attributes.items():
                        setattr(self, attribute, value)

        # Restore initial sampling strategy
        self.sampling_strategy_ = initial_sampling_strategy

        return X_new, y_new