def _validate_estimator(self):
    """Create the necessary attributes for Geometric SMOTE.

    Sets ``random_state_`` from ``random_state`` and, depending on
    ``selection_strategy``, the nearest neighbors objects ``nns_pos_``
    (for 'minority' and 'combined') and ``nn_neg_`` (for 'majority' and
    'combined'). Each created object receives ``n_jobs``.

    Raises
    ------
    ValueError
        If ``selection_strategy`` is not one of ``SELECTION_STRATEGY``.
    """

    # Check random state
    self.random_state_ = check_random_state(self.random_state)

    # Validate strategy
    if self.selection_strategy not in SELECTION_STRATEGY:
        error_msg = (
            'Unknown selection_strategy for Geometric SMOTE algorithm. '
            'Choices are {}. Got {} instead.')
        raise ValueError(
            error_msg.format(SELECTION_STRATEGY, self.selection_strategy))

    # Create nearest neighbors object for positive class; one additional
    # neighbor is requested on top of k_neighbors
    if self.selection_strategy in ('minority', 'combined'):
        self.nns_pos_ = check_neighbors_object(
            'nns_positive', self.k_neighbors, additional_neighbor=1)
        self.nns_pos_.set_params(n_jobs=self.n_jobs)

    # Create nearest neighbors object for negative class; a single
    # neighbor is requested
    if self.selection_strategy in ('majority', 'combined'):
        self.nn_neg_ = check_neighbors_object('nn_negative', nn_object=1)
        self.nn_neg_.set_params(n_jobs=self.n_jobs)
def _validate_estimator(self):
    """Set up the random state and the nearest neighbors attributes.

    After this call the instance holds ``random_state_``, ``nn_mix_``,
    ``nns_pos_`` and ``nn_neg_``. Only the positive and negative class
    objects receive ``n_jobs``.
    """
    # Validated random state
    self.random_state_ = check_random_state(self.random_state)

    # Mixed class: built from k_neighbors as given
    self.nn_mix_ = check_neighbors_object('nns_mixed', self.k_neighbors)

    # Positive class: k_neighbors plus one additional neighbor
    self.nns_pos_ = check_neighbors_object(
        'nns_positive',
        self.k_neighbors,
        additional_neighbor=1,
    )
    self.nns_pos_.set_params(**{'n_jobs': self.n_jobs})

    # Negative class: a single neighbor
    self.nn_neg_ = check_neighbors_object('nn_negative', nn_object=1)
    self.nn_neg_.set_params(**{'n_jobs': self.n_jobs})
def test_check_neighbors_object():
    """Exercise check_neighbors_object with int, estimator and bad input."""
    param_name = 'n_neighbors'
    requested = 1

    # An integer yields a KNeighborsMixin estimator with that many neighbors
    from_int = check_neighbors_object(param_name, requested)
    assert isinstance(from_int, KNeighborsMixin)
    assert from_int.n_neighbors == 1

    # The third argument adds extra neighbors on top of the request
    with_extra = check_neighbors_object(param_name, requested, 1)
    assert isinstance(with_extra, KNeighborsMixin)
    assert with_extra.n_neighbors == 2

    # An existing estimator is handed back as the very same object
    prebuilt = NearestNeighbors(requested)
    assert check_neighbors_object(param_name, prebuilt) is prebuilt

    # Anything else is rejected
    with raises(ValueError, match="has to be one of"):
        check_neighbors_object(param_name, 'rnd')
def test_check_neighbors_object():
    """Exercise check_neighbors_object with int, estimator and bad input."""
    param_name = 'n_neighbors'
    requested = 1

    # An integer yields a KNeighborsMixin estimator with that many neighbors
    from_int = check_neighbors_object(param_name, requested)
    assert isinstance(from_int, KNeighborsMixin)
    assert_equal(from_int.n_neighbors, 1)

    # The third argument adds extra neighbors on top of the request
    with_extra = check_neighbors_object(param_name, requested, 1)
    assert isinstance(with_extra, KNeighborsMixin)
    assert_equal(with_extra.n_neighbors, 2)

    # An existing estimator is handed back as the very same object
    prebuilt = NearestNeighbors(requested)
    assert check_neighbors_object(param_name, prebuilt) is prebuilt

    # Anything else is rejected
    assert_raises_regex(
        ValueError,
        "has to be one of",
        check_neighbors_object,
        param_name,
        'rnd',
    )
def test_check_neighbors_object():
    """Exercise check_neighbors_object with int, estimator and bad input."""
    param_name = "n_neighbors"
    requested = 1

    # An integer yields a KNeighborsMixin estimator with that many neighbors
    from_int = check_neighbors_object(param_name, requested)
    assert isinstance(from_int, KNeighborsMixin)
    assert from_int.n_neighbors == 1

    # The third argument adds extra neighbors on top of the request
    with_extra = check_neighbors_object(param_name, requested, 1)
    assert isinstance(with_extra, KNeighborsMixin)
    assert with_extra.n_neighbors == 2

    # An estimator input comes back with the same n_neighbors setting
    prebuilt = NearestNeighbors(requested)
    returned = check_neighbors_object(param_name, prebuilt)
    assert prebuilt.n_neighbors == returned.n_neighbors

    # Anything else is rejected
    with pytest.raises(ValueError, match="has to be one of"):
        check_neighbors_object(param_name, "rnd")
def _validate_estimator(self):
    """Validate the selection strategy and build the neighbors objects.

    ``nns_pos_`` is created for the 'minority' and 'combined' strategies,
    ``nn_neg_`` for the 'majority' and 'combined' strategies. Each created
    object receives ``n_jobs``.

    Raises
    ------
    ValueError
        If ``selection_strategy`` is not one of ``SELECTION_STRATEGY``.
    """
    strategy = self.selection_strategy

    # Guard clause: reject unknown strategies up front
    if strategy not in SELECTION_STRATEGY:
        template = (
            'Unknown selection_strategy for Geometric SMOTE algorithm. '
            'Choices are {}. Got {} instead.'
        )
        raise ValueError(template.format(SELECTION_STRATEGY, strategy))

    # Positive class: k_neighbors plus one additional neighbor
    if strategy in ('minority', 'combined'):
        self.nns_pos_ = check_neighbors_object(
            'nns_positive',
            self.k_neighbors,
            additional_neighbor=1,
        )
        self.nns_pos_.set_params(**{'n_jobs': self.n_jobs})

    # Negative class: a single neighbor
    if strategy in ('majority', 'combined'):
        self.nn_neg_ = check_neighbors_object('nn_negative', nn_object=1)
        self.nn_neg_.set_params(**{'n_jobs': self.n_jobs})
def _validate_estimator(self, default=AdaBoostClassifier()):
    """Check the estimator and the n_estimators attribute.

    Sets the ``base_estimator_``, ``nn_k_`` and ``smote`` attributes.

    :param default: classifier object used if base_estimator=None. The
        default instance is created once at definition time, but it is
        only ever cloned here, never modified.
    :raises ValueError: if ``n_estimators`` is not an integer, if it is
        not greater than zero, or if ``ratio`` is a non-empty dict.
    :return: None
    """
    if not isinstance(self.n_estimators, (numbers.Integral, np.integer)):
        raise ValueError("n_estimators must be an integer, "
                         "got {0}.".format(type(self.n_estimators)))

    if self.n_estimators <= 0:
        raise ValueError("n_estimators must be greater than zero, "
                         "got {0}.".format(self.n_estimators))

    # Always work on a clone so neither the user's estimator nor the shared
    # default instance is touched.
    if self.base_estimator is not None:
        base_estimator = clone(self.base_estimator)
    else:
        base_estimator = clone(default)

    # An empty dict is tolerated; any populated dict is rejected.
    if isinstance(self.ratio, dict) and self.ratio != {}:
        raise ValueError(
            "'dict' type cannot be accepted for ratio in this class; "
            "use alternative options")

    # Neighbors object built from k_neighbors plus one additional neighbor
    self.nn_k_ = check_neighbors_object('k_neighbors',
                                        self.k_neighbors,
                                        additional_neighbor=1)
    self.nn_k_.set_params(n_jobs=self.n_jobs)

    self.smote = SMOTE(ratio=self.ratio, k_neighbors=self.k_neighbors,
                       random_state=self.random_state)

    self.base_estimator_ = base_estimator
def test_check_neighbors_object():
    """Exercise check_neighbors_object with int, estimators and bad input."""
    param_name = "n_neighbors"
    requested = 1

    # An integer yields a KNeighborsMixin estimator with that many neighbors
    from_int = check_neighbors_object(param_name, requested)
    assert isinstance(from_int, KNeighborsMixin)
    assert from_int.n_neighbors == 1

    # The third argument adds extra neighbors on top of the request
    with_extra = check_neighbors_object(param_name, requested, 1)
    assert isinstance(with_extra, KNeighborsMixin)
    assert with_extra.n_neighbors == 2

    # A scikit-learn estimator comes back with the same n_neighbors setting
    prebuilt = NearestNeighbors(n_neighbors=requested)
    returned = check_neighbors_object(param_name, prebuilt)
    assert prebuilt.n_neighbors == returned.n_neighbors

    # A custom estimator keeps its own type
    custom = _CustomNearestNeighbors()
    returned = check_neighbors_object(param_name, custom)
    assert isinstance(returned, _CustomNearestNeighbors)

    # Anything else is rejected; the pattern mirrors the raised message
    expected_message = (
        "n_neighbors must be an interger or an object compatible with the "
        "KNeighborsMixin API of scikit-learn"
    )
    with pytest.raises(ValueError, match=expected_message):
        check_neighbors_object(param_name, "rnd")
def _validate_estimator(self):
    """Build the nearest neighbors object used by ADASYN.

    ``nn_`` is created from ``n_neighbors`` with one additional neighbor
    and then receives ``n_jobs``.
    """
    self.nn_ = check_neighbors_object(
        "n_neighbors",
        self.n_neighbors,
        additional_neighbor=1,
    )
    self.nn_.set_params(n_jobs=self.n_jobs)
def _validate_estimator(self):
    """Build the nearest neighbors object stored in ``nn_k``.

    The object is created from ``k_neighbors`` with one additional
    neighbor and then receives ``n_jobs``.
    """
    self.nn_k = check_neighbors_object(
        'K_neighbors',
        self.k_neighbors,
        additional_neighbor=1,
    )
    self.nn_k.set_params(n_jobs=self.n_jobs)
def _inter_sample(self, X, y, initial_sampling_strategy):
    """Intercluster resampling.

    For every pair of clusters listed in
    ``self.distributor_.inter_distribution_`` and every class present in
    both clusters, new samples are generated one at a time by calling
    ``self._basic_sample`` on a small data set made of one sample of the
    class from each cluster plus all samples of the other classes from
    both clusters.

    Parameters
    ----------
    X : array with ``dtype`` and at least two dimensions
        Input data; indexed with boolean masks built from
        ``self.clusterer_.labels_``.
    y : array with ``dtype``
        Class labels matching ``X``.
    initial_sampling_strategy : mapping
        Maps each class label to a number of samples; each value is scaled
        by the cluster pair's proportion. It is also the value
        ``self.sampling_strategy_`` is reset to.

    Returns
    -------
    X_new, y_new : arrays
        The accumulated rows returned by ``_basic_sample`` beyond its input
        rows. Both are empty when the instance has neither a
        ``k_neighbors`` nor an ``n_neighbors`` attribute.
    """

    # Random state; falls back to None when the instance has no
    # random_state attribute
    random_state = check_random_state(
        self.random_state if hasattr(self, 'random_state') else None)

    # Initialize arrays of new data (zero rows, same number of columns as X)
    X_new = np.array([], dtype=X.dtype).reshape(0, X.shape[1])
    y_new = np.array([], dtype=y.dtype)

    # Number of nearest neighbors; without either attribute nothing is
    # generated and the empty arrays are returned
    if hasattr(self, 'k_neighbors'):
        k = self.k_neighbors
    elif hasattr(self, 'n_neighbors'):
        k = self.n_neighbors
    else:
        return X_new, y_new

    # Intercluster oversampling
    # NOTE(review): this list is initialized here but nothing in this
    # method appends to it.
    self.inter_sampling_strategies_ = []
    for (label1, label2), proportion in self.distributor_.inter_distribution_:

        # Filter data in cluster 1 and cluster 2
        mask1, mask2 = (self.clusterer_.labels_ == label1), (
            self.clusterer_.labels_ == label2)
        X_in_cluster1, y_in_cluster1, X_in_cluster2, y_in_cluster2 = X[
            mask1], y[mask1], X[mask2], y[mask2]

        # Calculate sampling strategy in the clusters: a class gets its
        # scaled (and truncated) count only when it appears in both
        # clusters, otherwise zero
        clusters_sampling_strategy = OrderedDict({
            class_label:
            (int(n_samples * proportion) if class_label in y_in_cluster1
             and class_label in y_in_cluster2 else 0)
            for class_label, n_samples in initial_sampling_strategy.items()
        })

        # Resample data
        for class_label, n_samples in clusters_sampling_strategy.items():

            # Modify sampling strategy
            # (presumably asks _basic_sample for one sample of this class
            # per call -- confirm against _basic_sample)
            self.sampling_strategy_ = {class_label: 1}

            for _ in range(n_samples):

                # Identify clusters: ind is 1 or -1, and a step of -1
                # reverses the pair, so the roles of the two clusters are
                # assigned at random
                ind = random_state.choice([1, -1])
                (X1, X2), (y1, y2) = [X_in_cluster1, X_in_cluster2
                                      ][::ind], [y_in_cluster1,
                                                 y_in_cluster2][::ind]

                # Select randomly one sample of the class from cluster 1;
                # slicing (rather than indexing) keeps the leading dimension
                ind1 = random_state.choice(np.where(y1 == class_label)[0])
                X1_class, y1_class = X1[ind1:(ind1 + 1)], y1[ind1:(ind1 + 1)]

                # Select all samples of the class from cluster 2
                ind2 = np.where(y2 == class_label)[0]
                X2_class, y2_class = X2[ind2], y2[ind2]

                # Fit nearest neighbors on the selected cluster 1 sample
                # (row 0) followed by the cluster 2 candidates; the number
                # of neighbors is capped at the number of candidates
                X_class = np.vstack((X1_class, X2_class))
                k_nn = min(k, len(X_class) - 1)
                nn = check_neighbors_object('nn', k_nn).fit(X_class)
                # Pick at random one of the neighbors found for row 0
                ind_nn = random_state.choice(nn.kneighbors()[1][0])

                # Assemble the data to resample: the cluster 1 sample, the
                # chosen cluster 2 sample and all samples of other classes.
                # ind_nn indexes X_class, so ind_nn - 1 indexes X2_class
                # (presumably ind_nn is never 0 because kneighbors() without
                # arguments excludes the query point -- confirm)
                X_in_clusters = np.vstack(
                    (X1_class, X2_class[(ind_nn - 1):ind_nn],
                     X1[y1 != class_label], X2[y2 != class_label]))
                y_in_clusters = np.hstack(
                    (y1_class, y2_class[(ind_nn - 1):ind_nn],
                     y1[y1 != class_label], y2[y2 != class_label]))

                # Modify attributes for corner cases; the returned mapping
                # holds the values that are restored below
                initial_attributes = self._modify_attributes(
                    1, y_in_clusters.size)

                # Resample class data and keep only the rows past the input
                # (presumably _basic_sample returns the input followed by
                # the generated samples -- confirm)
                X_new_cluster, y_new_cluster = self._basic_sample(
                    X_in_clusters, y_in_clusters)
                X_new_cluster, y_new_cluster = X_new_cluster[len(
                    X_in_clusters):], y_new_cluster[len(X_in_clusters):]
                X_new, y_new = np.vstack(
                    (X_new, X_new_cluster)), np.hstack(
                        (y_new, y_new_cluster))

                # Restore modified attributes
                for attribute, value in initial_attributes.items():
                    setattr(self, attribute, value)

            # Restore initial sampling strategy
            self.sampling_strategy_ = initial_sampling_strategy

    return X_new, y_new