def _fit_resample(self, X, y):
    n_samples = X.shape[0]

    # convert y to z_score
    y_z = (y - y.mean()) / y.std()

    index0 = np.arange(n_samples)
    index_negative = index0[y_z > self.negative_thres]
    index_positive = index0[y_z <= self.positive_thres]
    index_unclassified = [x for x in index0
                          if x not in index_negative and x not in index_positive]

    y_z[index_negative] = 0
    y_z[index_positive] = 1
    y_z[index_unclassified] = -1

    ros = RandomOverSampler(
        sampling_strategy=self.sampling_strategy,
        random_state=self.random_state,
        ratio=self.ratio)
    _, _ = ros.fit_resample(X, y_z)
    sample_indices = ros.sample_indices_

    print("Before sampler: %s. Total after: %s"
          % (Counter(y_z), sample_indices.shape))

    self.sample_indices_ = np.array(sample_indices)

    if self.return_indices:
        return (_safe_indexing(X, sample_indices),
                _safe_indexing(y, sample_indices),
                sample_indices)
    return (_safe_indexing(X, sample_indices),
            _safe_indexing(y, sample_indices))
def train_submission(self, module_path, X, y, train_idx=None):
    """Train the estimator of a given submission.

    Parameters
    ----------
    module_path : str
        The path to the submission where `filename` is located.
    X : {array-like, sparse matrix, dataframe} of shape \
            (n_samples, n_features)
        The data matrix.
    y : array-like of shape (n_samples,)
        The target vector.
    train_idx : array-like of shape (n_training_samples,), default=None
        The training indices. By default, the full dataset will be used
        to train the model. If an array is provided, `X` and `y` will be
        subsampled using these indices.

    Returns
    -------
    estimator : estimator object
        The scikit-learn estimator fitted on (`X`, `y`).
    """
    train_idx = slice(None, None, None) if train_idx is None else train_idx
    submission_module = import_module_from_source(
        os.path.join(module_path, self.filename),
        os.path.splitext(self.filename)[0],  # keep the module name only
        sanitize=True
    )
    estimator = submission_module.get_estimator()
    X_train = _safe_indexing(X, train_idx)
    y_train = _safe_indexing(y, train_idx)
    return estimator.fit(X_train, y_train)
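# Illustrative sketch (not part of the original code): a minimal submission
# module that `train_submission` could import. The file name ("estimator.py")
# and the pipeline returned by `get_estimator` are assumptions for
# illustration only; the worker calls `.fit` on whatever is returned.
# --- contents of <module_path>/estimator.py ---
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


def get_estimator():
    # Return an unfitted scikit-learn estimator.
    return make_pipeline(StandardScaler(), LogisticRegression())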
def test_check_fit_params(indices):
    X = np.random.randn(4, 2)
    fit_params = {
        'list': [1, 2, 3, 4],
        'array': np.array([1, 2, 3, 4]),
        'sparse-col': sp.csc_matrix([1, 2, 3, 4]).T,
        'sparse-row': sp.csc_matrix([1, 2, 3, 4]),
        'scalar-int': 1,
        'scalar-str': 'xxx',
        'None': None,
    }
    result = _check_fit_params(X, fit_params, indices)
    indices_ = indices if indices is not None else list(range(X.shape[0]))

    for key in ['sparse-row', 'scalar-int', 'scalar-str', 'None']:
        assert result[key] is fit_params[key]

    assert result['list'] == _safe_indexing(fit_params['list'], indices_)
    assert_array_equal(
        result['array'], _safe_indexing(fit_params['array'], indices_)
    )
    assert_allclose_dense_sparse(
        result['sparse-col'],
        _safe_indexing(fit_params['sparse-col'], indices_)
    )
def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices):
    # validation of the indices
    # we make a copy because indices is mutable and shared between tests
    indices_converted = copy(indices)
    if indices_type == "slice" and isinstance(indices[1], int):
        indices_converted[1] += 1

    columns_name = ["col_0", "col_1", "col_2"]
    array = _convert_container(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
    )
    indices_converted = _convert_container(indices_converted, indices_type)

    if isinstance(indices[0], str) and array_type != "dataframe":
        err_msg = (
            "Specifying the columns using strings is only supported "
            "for pandas DataFrames"
        )
        with pytest.raises(ValueError, match=err_msg):
            _safe_indexing(array, indices_converted, axis=1)
    else:
        subset = _safe_indexing(array, indices_converted, axis=1)
        assert_allclose_dense_sparse(
            subset, _convert_container([[2, 3], [5, 6], [8, 9]], array_type)
        )
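# Quick illustrative sketch (not part of the test suite) of what
# `_safe_indexing` does on different 2D containers. Note that `_safe_indexing`
# is a private scikit-learn helper, so its import path may change between
# versions; the data below is made up for the demo.
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.utils import _safe_indexing

rows = [0, 2]
X_array = np.arange(9).reshape(3, 3)
X_frame = pd.DataFrame(X_array, columns=["a", "b", "c"])
X_sparse = sparse.csr_matrix(X_array)

print(_safe_indexing(X_array, rows, axis=0))            # ndarray rows
print(_safe_indexing(X_frame, ["a", "c"], axis=1))      # DataFrame columns by name
print(_safe_indexing(X_sparse, rows, axis=0).toarray())  # sparse rows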
def __getitem__(self, index):
    X_resampled = _safe_indexing(
        self.X,
        self.indices_[index * self.batch_size:(index + 1) * self.batch_size],
    )
    y_resampled = _safe_indexing(
        self.y,
        self.indices_[index * self.batch_size:(index + 1) * self.batch_size],
    )
    if issparse(X_resampled) and not self.keep_sparse:
        X_resampled = X_resampled.toarray()
    if self.sample_weight is not None:
        sample_weight_resampled = _safe_indexing(
            self.sample_weight,
            self.indices_[
                index * self.batch_size:(index + 1) * self.batch_size
            ],
        )

    if self.sample_weight is None:
        return X_resampled, y_resampled
    else:
        return X_resampled, y_resampled, sample_weight_resampled
def _fit_resample(self, X, y):
    random_state = check_random_state(self.random_state)

    idx_under = np.empty((0,), dtype=int)

    for target_class in np.unique(y):
        if target_class in self.sampling_strategy_.keys():
            n_samples = self.sampling_strategy_[target_class]
            index_target_class = random_state.choice(
                range(np.count_nonzero(y == target_class)),
                size=n_samples,
                replace=self.replacement,
            )
        else:
            index_target_class = slice(None)

        idx_under = np.concatenate(
            (
                idx_under,
                np.flatnonzero(y == target_class)[index_target_class],
            ),
            axis=0,
        )

    self.sample_indices_ = idx_under

    return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under)
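# Usage sketch (not from the library source): the public API that wraps the
# `_fit_resample` above. `sample_indices_` exposes the `idx_under` indices
# built in the loop, so the resampled data can be reproduced with
# `_safe_indexing`. The toy dataset below is an assumption for illustration.
from collections import Counter
import numpy as np
from sklearn.datasets import make_classification
from sklearn.utils import _safe_indexing
from imblearn.under_sampling import RandomUnderSampler

X, y = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=0)
rus = RandomUnderSampler(random_state=0)
X_res, y_res = rus.fit_resample(X, y)

print(Counter(y), Counter(y_res))  # majority class reduced to the minority count
assert np.array_equal(X_res, _safe_indexing(X, rus.sample_indices_))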
def _fit_and_predict_oof_model(
    self,
    estimator: RegressorMixin,
    X: ArrayLike,
    y: ArrayLike,
    train_index: ArrayLike,
    val_index: ArrayLike,
    sample_weight: Optional[ArrayLike] = None,
) -> Tuple[RegressorMixin, NDArray, ArrayLike]:
    """
    Fit a single out-of-fold model on a given training set and
    perform predictions on a test set.

    Parameters
    ----------
    estimator : RegressorMixin
        Estimator to train.

    X : ArrayLike of shape (n_samples, n_features)
        Input data.

    y : ArrayLike of shape (n_samples,)
        Input labels.

    train_index : ArrayLike of shape (n_samples_train)
        Training data indices.

    val_index : ArrayLike of shape (n_samples_val)
        Validation data indices.

    sample_weight : Optional[ArrayLike] of shape (n_samples,)
        Sample weights. If None, then samples are equally weighted.
        By default ``None``.

    Returns
    -------
    Tuple[RegressorMixin, NDArray, ArrayLike]
        - [0]: RegressorMixin, fitted estimator
        - [1]: NDArray of shape (n_samples_val,), estimator predictions
          on the validation fold.
        - [2]: ArrayLike of shape (n_samples_val,), validation data indices.
    """
    X_train = _safe_indexing(X, train_index)
    y_train = _safe_indexing(y, train_index)
    X_val = _safe_indexing(X, val_index)
    if sample_weight is None:
        estimator = fit_estimator(estimator, X_train, y_train)
    else:
        sample_weight_train = _safe_indexing(sample_weight, train_index)
        estimator = fit_estimator(
            estimator, X_train, y_train, sample_weight_train
        )
    if _num_samples(X_val) > 0:
        y_pred = estimator.predict(X_val)
    else:
        y_pred = np.array([])
    return estimator, y_pred, val_index
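# Illustrative sketch (not from the original code) of the out-of-fold pattern
# this helper implements: each fold is fitted on its training indices and
# predicts only on its validation indices, both selected with `_safe_indexing`.
# The data and estimator below are assumptions for the demo.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.utils import _safe_indexing

X = np.random.RandomState(0).randn(20, 3)
y = X @ np.array([1.0, -2.0, 0.5])
oof_pred = np.empty_like(y)

for train_index, val_index in KFold(n_splits=5).split(X):
    model = LinearRegression().fit(
        _safe_indexing(X, train_index), _safe_indexing(y, train_index)
    )
    oof_pred[val_index] = model.predict(_safe_indexing(X, val_index))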
def _split_fit_score_trial(self, X, y, idx=0):
    """
    Splits the dataset, fits a clone of the estimator, then scores it
    according to the required metrics.

    The index of the split is added to the random_state if the random_state
    is not None; this ensures that every split is shuffled differently but
    in a deterministic fashion for testing purposes.
    """
    random_state = self.random_state
    if random_state is not None:
        random_state += idx

    splitter = self._check_cv(self.cv, random_state)
    for train_index, test_index in splitter.split(X, y):
        # Safe indexing handles multiple types of inputs including
        # DataFrames and structured arrays - required for generic splits.
        X_train = _safe_indexing(X, train_index)
        y_train = _safe_indexing(y, train_index)
        X_test = _safe_indexing(X, test_index)
        y_test = _safe_indexing(y, test_index)

        model = clone(self.estimator)
        model.fit(X_train, y_train)

        if hasattr(model, "predict_proba"):
            # Get the probabilities for the positive class
            y_scores = model.predict_proba(X_test)[:, 1]
        else:
            # Use the decision function to get the scores
            y_scores = model.decision_function(X_test)

        # Compute the curve metrics and thresholds
        curve_metrics = precision_recall_curve(y_test, y_scores)
        precision, recall, thresholds = curve_metrics

        # Compute the F1 score from precision and recall
        # Don't need to warn for F, precision/recall would have warned
        with np.errstate(divide="ignore", invalid="ignore"):
            beta = self.fbeta ** 2
            f_score = (1 + beta) * precision * recall / (beta * precision + recall)

        # Ensure thresholds ends at 1
        thresholds = np.append(thresholds, 1)

        # Compute the queue rate
        queue_rate = np.array([
            (y_scores >= threshold).mean() for threshold in thresholds
        ])

        yield {
            "thresholds": thresholds,
            "precision": precision,
            "recall": recall,
            "fscore": f_score,
            "queue_rate": queue_rate,
        }
def _fit_resample(self, X, y):
    self._validate_estimator()
    enn = EditedNearestNeighbours(
        sampling_strategy=self.sampling_strategy,
        n_neighbors=self.n_neighbors,
        kind_sel="mode",
        n_jobs=self.n_jobs,
    )
    enn.fit_resample(X, y)
    index_not_a1 = enn.sample_indices_
    index_a1 = np.ones(y.shape, dtype=bool)
    index_a1[index_not_a1] = False
    index_a1 = np.flatnonzero(index_a1)

    # clean the neighborhood
    target_stats = Counter(y)
    class_minority = min(target_stats, key=target_stats.get)
    # compute which classes to consider for cleaning for the A2 group
    classes_under_sample = [
        c
        for c, n_samples in target_stats.items()
        if (
            c in self.sampling_strategy_.keys()
            and (n_samples > X.shape[0] * self.threshold_cleaning)
        )
    ]
    self.nn_.fit(X)
    class_minority_indices = np.flatnonzero(y == class_minority)
    X_class = _safe_indexing(X, class_minority_indices)
    y_class = _safe_indexing(y, class_minority_indices)
    nnhood_idx = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]
    nnhood_label = y[nnhood_idx]
    if self.kind_sel == "mode":
        nnhood_label_majority, _ = mode(nnhood_label, axis=1)
        nnhood_bool = np.ravel(nnhood_label_majority) == y_class
    elif self.kind_sel == "all":
        nnhood_label_majority = nnhood_label == class_minority
        nnhood_bool = np.all(nnhood_label, axis=1)
    else:
        raise NotImplementedError
    # compute a2 group
    index_a2 = np.ravel(nnhood_idx[~nnhood_bool])
    index_a2 = np.unique(
        [index for index in index_a2 if y[index] in classes_under_sample]
    )

    union_a1_a2 = np.union1d(index_a1, index_a2).astype(int)
    selected_samples = np.ones(y.shape, dtype=bool)
    selected_samples[union_a1_a2] = False
    self.sample_indices_ = np.flatnonzero(selected_samples)

    return (
        _safe_indexing(X, self.sample_indices_),
        _safe_indexing(y, self.sample_indices_),
    )
def check_null_weight(
    sample_weight: Optional[ArrayLike],
    X: ArrayLike,
    y: ArrayLike
) -> Tuple[Optional[NDArray], ArrayLike, ArrayLike]:
    """
    Check sample weights and remove samples with null sample weights.

    Parameters
    ----------
    sample_weight : Optional[ArrayLike] of shape (n_samples,)
        Sample weights.
    X : ArrayLike of shape (n_samples, n_features)
        Training samples.
    y : ArrayLike of shape (n_samples,)
        Training labels.

    Returns
    -------
    sample_weight : Optional[NDArray] of shape (n_samples,)
        Non-null sample weights.

    X : ArrayLike of shape (n_samples, n_features)
        Training samples with non-null weights.

    y : ArrayLike of shape (n_samples,)
        Training labels with non-null weights.

    Examples
    --------
    >>> import numpy as np
    >>> from mapie.utils import check_null_weight
    >>> X = np.array([[0], [1], [2], [3], [4], [5]])
    >>> y = np.array([5, 7, 9, 11, 13, 15])
    >>> sample_weight = np.array([0, 1, 1, 1, 1, 1])
    >>> sample_weight, X, y = check_null_weight(sample_weight, X, y)
    >>> print(sample_weight)
    [1. 1. 1. 1. 1.]
    >>> print(X)
    [[1]
     [2]
     [3]
     [4]
     [5]]
    >>> print(y)
    [ 7  9 11 13 15]
    """
    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X)
        non_null_weight = sample_weight != 0
        X = _safe_indexing(X, non_null_weight)
        y = _safe_indexing(y, non_null_weight)
        sample_weight = _safe_indexing(sample_weight, non_null_weight)
    sample_weight = cast(Optional[NDArray], sample_weight)
    return sample_weight, X, y
def _fit_resample(self, X, y):
    # Find the nearest neighbour of every point
    nn = NearestNeighbors(n_neighbors=2, n_jobs=self.n_jobs)
    nn.fit(X)
    nns = nn.kneighbors(X, return_distance=False)[:, 1]

    links = self.is_tomek(y, nns, self.sampling_strategy_)
    self.sample_indices_ = np.flatnonzero(np.logical_not(links))

    return (
        _safe_indexing(X, self.sample_indices_),
        _safe_indexing(y, self.sample_indices_),
    )
def test_safe_indexing_1d_array_error(X_constructor):
    # check that we are raising an error if the array-like passed is 1D and
    # we try to index on the 2nd dimension
    X = list(range(5))
    if X_constructor == 'array':
        X_constructor = np.asarray(X)
    elif X_constructor == 'series':
        pd = pytest.importorskip("pandas")
        X_constructor = pd.Series(X)

    err_msg = "'X' should be a 2D NumPy array, 2D sparse matrix or pandas"
    with pytest.raises(ValueError, match=err_msg):
        _safe_indexing(X_constructor, [0, 1], axis=1)
def generator(X, y, sample_weight, indices, batch_size):
    # `keep_sparse` is expected to be available from the enclosing scope
    # (e.g. the factory function that builds this generator).
    while True:
        for index in range(0, len(indices), batch_size):
            X_res = _safe_indexing(X, indices[index:index + batch_size])
            y_res = _safe_indexing(y, indices[index:index + batch_size])
            if issparse(X_res) and not keep_sparse:
                X_res = X_res.toarray()
            if sample_weight is None:
                yield X_res, y_res
            else:
                sw_res = _safe_indexing(sample_weight,
                                        indices[index:index + batch_size])
                yield X_res, y_res, sw_res
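# Usage sketch (illustrative only): the generator above expects
# `_safe_indexing`, `issparse`, and `keep_sparse` to be resolvable in its
# enclosing scope, as they would be inside a balanced-batch-generator factory.
# Here they are provided at module level so the sketch is self-contained; the
# sampler and data are assumptions for the demo.
import numpy as np
from scipy.sparse import issparse
from sklearn.utils import _safe_indexing
from imblearn.under_sampling import RandomUnderSampler

keep_sparse = False
X = np.random.RandomState(0).randn(100, 4)
y = np.array([0] * 90 + [1] * 10)

sampler = RandomUnderSampler(random_state=0)
sampler.fit_resample(X, y)

batches = generator(X, y, None, sampler.sample_indices_, batch_size=8)
X_batch, y_batch = next(batches)  # first balanced mini-batch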
def accumulated_local_effects(est, x, feature, n_quantiles):
    """Calculate the accumulated local effects (ALE) for a feature."""
    features_indices = np.asarray(
        _get_column_indices(x, feature), dtype=np.int32, order='C'
    ).ravel()
    quantiles = _quantiles_from_x(
        _safe_indexing(x, features_indices, axis=1), n_quantiles
    )
    x_feat = _safe_indexing(x, feature, axis=1)
    if (x_feat.to_numpy().dtype.name == "category"
            or x_feat.to_numpy().dtype == "object"):
        ale = _ale_for_categorical(est, quantiles, x, x_feat)
    else:
        ale = _ale_for_numeric(est, quantiles, x, x_feat)
    return ale
def _fit_resample(self, X, y, sample_weight=None):
    self._validate_estimator()

    random_state = check_random_state(self.random_state)

    target_stats = Counter(y)
    class_minority = min(target_stats, key=target_stats.get)

    idx_under = np.empty((0,), dtype=int)

    for target_class in np.unique(y):
        if target_class in self.sampling_strategy_.keys():
            # select a sample from the current class
            idx_maj = np.flatnonzero(y == target_class)
            sel_idx_maj = random_state.randint(
                low=0, high=target_stats[target_class],
                size=self.n_seeds_S)
            idx_maj_sample = idx_maj[sel_idx_maj]

            minority_class_indices = np.flatnonzero(y == class_minority)
            C_indices = np.append(minority_class_indices, idx_maj_sample)

            # create the set composed of all minority samples and one
            # sample from the current class.
            C_x = _safe_indexing(X, C_indices)
            C_y = _safe_indexing(y, C_indices)

            # create the set S by removing the seed,
            # since it will be added back anyway
            idx_maj_extracted = np.delete(idx_maj, sel_idx_maj, axis=0)
            S_x = _safe_indexing(X, idx_maj_extracted)
            S_y = _safe_indexing(y, idx_maj_extracted)

            self.estimator_.fit(C_x, C_y)
            pred_S_y = self.estimator_.predict(S_x)

            S_misclassified_indices = np.flatnonzero(pred_S_y != S_y)
            idx_tmp = idx_maj_extracted[S_misclassified_indices]
            idx_under = np.concatenate(
                (idx_under, idx_maj_sample, idx_tmp), axis=0)
        else:
            idx_under = np.concatenate(
                (idx_under, np.flatnonzero(y == target_class)), axis=0)

    X_resampled = _safe_indexing(X, idx_under)
    y_resampled = _safe_indexing(y, idx_under)

    # apply Tomek cleaning
    tl = TomekLinks(sampling_strategy=list(self.sampling_strategy_.keys()))
    X_cleaned, y_cleaned = tl.fit_resample(X_resampled, y_resampled)

    self.sample_indices_ = _safe_indexing(idx_under, tl.sample_indices_)
    idx_under = self.sample_indices_
    if sample_weight is not None:
        # sample_weight is already validated in self.fit_resample()
        sample_weight_under = _safe_indexing(sample_weight, idx_under)
        return X_cleaned, y_cleaned, sample_weight_under
    else:
        return X_cleaned, y_cleaned
def evaluate(D, sol):
    phenotype = SamplingBenchmark.map_to_phenotype(
        CustomSamplingBenchmark.to_phenotype(sol))
    X_sampled = _safe_indexing(self.X_train, phenotype)
    y_sampled = _safe_indexing(self.y_train, phenotype)
    if X_sampled.shape[0] > 0:
        cls = self.evaluator.fit(X_sampled, y_sampled)
        y_predicted = cls.predict(self.X_valid)
        quality = accuracy_score(self.y_valid, y_predicted)
        size_percentage = len(y_sampled) / len(sol)
        return (1 - quality) * size_percentage
    else:
        return math.inf
def _fit_transformer(self, y):
    """Check transformer and fit transformer.

    Create the default transformer, fit it and make additional inverse
    check on a subset (optional).
    """
    if (self.transformer is not None and
            (self.func is not None or self.inverse_func is not None)):
        raise ValueError("'transformer' and functions 'func'/"
                         "'inverse_func' cannot both be set.")
    elif self.transformer is not None:
        self.transformer_ = clone(self.transformer)
    else:
        if self.func is not None and self.inverse_func is None:
            raise ValueError("When 'func' is provided, 'inverse_func' must"
                             " also be provided")
        self.transformer_ = FunctionTransformer(
            func=self.func, inverse_func=self.inverse_func, validate=True,
            check_inverse=self.check_inverse)
    # XXX: sample_weight is not currently passed to the
    # transformer. However, if transformer starts using sample_weight, the
    # code should be modified accordingly. At the time to consider the
    # sample_prop feature, it is also a good use case to be considered.
    self.transformer_.fit(y)
    if self.check_inverse:
        idx_selected = slice(None, None, max(1, y.shape[0] // 10))
        y_sel = _safe_indexing(y, idx_selected)
        y_sel_t = self.transformer_.transform(y_sel)
        if not np.allclose(y_sel,
                           self.transformer_.inverse_transform(y_sel_t)):
            warnings.warn("The provided functions or transformer are"
                          " not strictly inverse of each other. If"
                          " you are sure you want to proceed regardless"
                          ", set 'check_inverse=False'", UserWarning)
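# Illustrative sketch (not from the original code) of the round-trip check the
# snippet above performs: transform roughly every tenth row of y and verify
# that the inverse transform recovers it within numerical tolerance. The
# log1p/expm1 pair and the data are assumptions for the demo.
import numpy as np
from sklearn.preprocessing import FunctionTransformer
from sklearn.utils import _safe_indexing

y = np.linspace(1.0, 100.0, 200).reshape(-1, 1)
transformer = FunctionTransformer(func=np.log1p, inverse_func=np.expm1,
                                  validate=True).fit(y)

idx_selected = slice(None, None, max(1, y.shape[0] // 10))  # ~10% subsample
y_sel = _safe_indexing(y, idx_selected)
assert np.allclose(
    y_sel, transformer.inverse_transform(transformer.transform(y_sel))
)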
def _fit_resample(self, X, y):
    self._validate_estimator()

    X_resampled = [X.copy()]
    y_resampled = [y.copy()]

    for class_sample, n_samples in self.sampling_strategy_.items():
        print(self.sampling_strategy_.items())  # debug output
        if n_samples == 0:
            continue
        target_class_indices = np.flatnonzero(y == class_sample)
        X_class = _safe_indexing(X, target_class_indices)

        self.nn_k_.fit(X_class)
        nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
        X_new, y_new = self._make_samples(X_class, y.dtype, class_sample,
                                          X_class, nns, n_samples, 1.0)
        X_resampled.append(X_new)
        y_resampled.append(y_new)

    if sparse.issparse(X):
        X_resampled = sparse.vstack(X_resampled, format=X.format)
    else:
        X_resampled = np.vstack(X_resampled)
    y_resampled = np.hstack(y_resampled)

    # Inspection output: show the synthetic samples generated for the last
    # class as a pandas DataFrame (only renders inside IPython/Jupyter).
    from IPython.display import display, HTML
    import pandas as pd
    pd.set_option("display.max_rows", None, "display.max_columns", None)
    new = pd.DataFrame(X_new)
    display(new)

    return X_resampled, y_resampled
def _local_parallel_build_trees(sampler, tree, forest, X, y, sample_weight,
                                tree_idx, n_trees, verbose=0,
                                class_weight=None, n_samples_bootstrap=None):
    # resample before fitting the tree
    X_resampled, y_resampled = sampler.fit_resample(X, y)
    if sample_weight is not None:
        sample_weight = _safe_indexing(sample_weight, sampler.sample_indices_)
    if _get_n_samples_bootstrap is not None:
        n_samples_bootstrap = min(n_samples_bootstrap, X_resampled.shape[0])
    tree = _parallel_build_trees(
        tree,
        forest,
        X_resampled,
        y_resampled,
        sample_weight,
        tree_idx,
        n_trees,
        verbose=verbose,
        class_weight=class_weight,
        n_samples_bootstrap=n_samples_bootstrap,
    )
    return sampler, tree
def make_sample(imbalanced_data_arr2, diff):
    # Split the data set into minority-class and majority-class samples
    minor_data_arr2, major_data_arr2 = seperate_minor_and_major_data(
        imbalanced_data_arr2)

    imbalanced_featured_data = imbalanced_data_arr2[:, :-1]
    imbalanced_label_data = imbalanced_data_arr2[:, -1]

    # Feature matrix of the original minority samples
    old_feature_data = minor_data_arr2[:, :-1]
    # Label value of the original minority samples
    old_label_data = minor_data_arr2[0][-1]

    danger_index = in_danger(imbalanced_featured_data, old_feature_data,
                             old_label_data, imbalanced_label_data)
    # The "danger" subset of the minority samples, i.e. the samples from
    # which the new synthetic samples will be generated
    danger_index_data = _safe_indexing(old_feature_data, danger_index)

    # Index matrix of the n_neighbors - 1 nearest neighbours of each
    # danger sample among the minority samples
    nns = NearestNeighbors(n_neighbors=6).fit(old_feature_data).kneighbors(
        danger_index_data, return_distance=False)[:, 1:]

    # Draw `diff` random indices selecting the base samples used to
    # generate the new synthetic samples
    samples_indices = np.random.randint(
        low=0, high=np.shape(danger_index_data)[0], size=diff)
    # Draw `diff` random step sizes used for the interpolation
    steps = np.random.uniform(size=diff)
    cols = np.mod(samples_indices, nns.shape[1])

    reshaped_feature = np.zeros((diff, danger_index_data.shape[1]))
    for i, (col, step) in enumerate(zip(cols, steps)):
        row = samples_indices[i]
        reshaped_feature[i] = danger_index_data[row] - step * (
            danger_index_data[row] - old_feature_data[nns[row, col]])

    new_min_feature_data = np.vstack((reshaped_feature, old_feature_data))
    return new_min_feature_data
def test_safe_indexing_1d_container_mask(array_type, indices_type):
    indices = [False] + [True] * 2 + [False] * 6
    array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
    indices = _convert_container(indices, indices_type)
    subset = _safe_indexing(array, indices, axis=0)
    assert_allclose_dense_sparse(subset, _convert_container([2, 3], array_type))
def _init_pretest(self, features, target):
    """Set the sample of data used to verify pipelines work with the passed
    data set. This is not intended for anything other than perfunctory
    dataset-pipeline compatibility testing.
    """
    num_unique_target = len(np.unique(target))
    # make sure train_size is at least num_unique_target
    train_size = max(min(50, int(0.9 * features.shape[0])),
                     num_unique_target)

    self.pretest_X, _, self.pretest_y, _ = \
        train_test_split(
            features,
            target,
            random_state=self.random_state,
            test_size=None,
            train_size=train_size
        )

    # Make sure there is at least one example from each class
    # for this evaluative test sample
    if not np.array_equal(np.unique(target), np.unique(self.pretest_y)):
        unique_target_idx = np.unique(target, return_index=True)[1]
        self.pretest_y[0:unique_target_idx.shape[0]] = \
            _safe_indexing(target, unique_target_idx)
def _fit_resample(self, X, y):
    self._validate_estimator()

    X_resampled = [X.copy()]
    y_resampled = [y.copy()]

    for class_sample, n_samples in self.sampling_strategy_.items():
        if n_samples == 0:
            continue
        target_class_indices = np.flatnonzero(y == class_sample)
        X_class = _safe_indexing(X, target_class_indices)

        self.nn_k_.fit(X_class)
        nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
        X_new, y_new = self._make_samples(
            X_class, y.dtype, class_sample, X_class, nns, n_samples, 1.0
        )
        X_resampled.append(X_new)
        y_resampled.append(y_new)

    if sparse.issparse(X):
        X_resampled = sparse.vstack(X_resampled, format=X.format)
    else:
        X_resampled = np.vstack(X_resampled)
    y_resampled = np.hstack(y_resampled)

    return X_resampled, y_resampled
def calculate(self, chromosome: ndarray) -> float:
    labels = self.cluster.run(chromosome, self.samples)
    self.samples, labels = check_X_y(self.samples, labels)
    le = LabelEncoder()
    labels = le.fit_transform(labels)

    n_samples, _ = self.samples.shape
    n_labels = len(le.classes_)

    check_number_of_labels(n_labels, n_samples)

    intra_dists = np.zeros(n_labels)
    centroids = np.zeros((n_labels, len(self.samples[0])), dtype=float)
    for k in range(n_labels):
        cluster_k = _safe_indexing(self.samples, labels == k)
        centroid = chromosome[k]
        centroids[k] = centroid
        intra_dists[k] = np.average(
            pairwise_distances(cluster_k, [centroid]))

    centroid_distances = pairwise_distances(centroids)

    if np.allclose(intra_dists, 0) or np.allclose(centroid_distances, 0):
        return 0.0

    centroid_distances[centroid_distances == 0] = np.inf
    combined_intra_dists = intra_dists[:, None] + intra_dists
    scores = np.max(combined_intra_dists / centroid_distances, axis=1)
    return 1 / np.mean(scores)
def _boost_real(self, iboost, X, y, sample_weight, random_state):
    """Implement a single boost using the SAMME.R real algorithm."""
    estimator, sampler = self._make_sampler_estimator(random_state=random_state)

    X_res, y_res = sampler.fit_resample(X, y)
    sample_weight_res = _safe_indexing(sample_weight, sampler.sample_indices_)
    estimator.fit(X_res, y_res, sample_weight=sample_weight_res)

    y_predict_proba = estimator.predict_proba(X)

    if iboost == 0:
        self.classes_ = getattr(estimator, "classes_", None)
        self.n_classes_ = len(self.classes_)

    y_predict = self.classes_.take(np.argmax(y_predict_proba, axis=1), axis=0)

    # Instances incorrectly classified
    incorrect = y_predict != y

    # Error fraction
    estimator_error = np.mean(np.average(incorrect, weights=sample_weight, axis=0))

    # Stop if classification is perfect
    if estimator_error <= 0:
        return sample_weight, 1.0, 0.0

    # Construct y coding as described in Zhu et al [2]:
    #
    #    y_k = 1 if c == k else -1 / (K - 1)
    #
    # where K == n_classes_ and c, k in [0, K) are indices along the second
    # axis of the y coding with c being the index corresponding to the true
    # class label.
    n_classes = self.n_classes_
    classes = self.classes_
    y_codes = np.array([-1.0 / (n_classes - 1), 1.0])
    y_coding = y_codes.take(classes == y[:, np.newaxis])

    # Displace zero probabilities so the log is defined.
    # Also fix negative elements which may occur with
    # negative sample weights.
    proba = y_predict_proba  # alias for readability
    np.clip(proba, np.finfo(proba.dtype).eps, None, out=proba)

    # Boost weight using multi-class AdaBoost SAMME.R alg
    estimator_weight = (
        -1.0
        * self.learning_rate
        * ((n_classes - 1.0) / n_classes)
        * (y_coding * np.log(y_predict_proba)).sum(axis=1)
    )

    # Only boost the weights if it will fit again
    if not iboost == self.n_estimators - 1:
        # Only boost positive weights
        sample_weight *= np.exp(
            estimator_weight * ((sample_weight > 0) | (estimator_weight < 0))
        )

    return sample_weight, 1.0, estimator_error
def permutations(estimator, X, y, cv=None, n_permutations=100,
                 random_state=0, scoring=None):
    """
    This follows the scikit-learn API of
    sklearn.model_selection.permutation_test_score, modified accordingly to
    accommodate filtering of features with a correlation matrix before
    running cross-validation with the model.
    """
    Xs, ys = indexable(X, y)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    # corr = CorrMatrix()
    # corr.fit(X, y)
    # Xs, ys = corr.transform()

    score = _permutations(clone(estimator), Xs, ys, cv, scorer)
    permutation_scores = np.zeros(n_permutations)
    for i in range(n_permutations):
        # corr_p = CorrMatrix()
        # corr_p.fit(X, y)
        # Xp, yp = corr_p.transform()
        yp = _safe_indexing(y, random_state.permutation(len(y)))
        permutation_scores[i] = _permutations(clone(estimator), Xs, yp, cv, scorer)
    pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)
    return score, permutation_scores, pvalue
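# Usage sketch (illustrative, not from the original module). Since the helper
# `_permutations` above is not shown, this uses scikit-learn's built-in
# permutation_test_score on toy data to demonstrate the same protocol:
# one reference score plus a null distribution of scores on permuted labels.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import permutation_test_score

X, y = load_iris(return_X_y=True)
score, permutation_scores, pvalue = permutation_test_score(
    LogisticRegression(max_iter=1000), X, y,
    cv=5, n_permutations=100, random_state=0)
print(score, pvalue)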
def _fit_resample(self, X, y):
    self._validate_estimator()

    X_resampled = X.copy()
    y_resampled = y.copy()

    for class_sample, n_samples in self.sampling_strategy_.items():
        if n_samples == 0:
            continue
        target_class_indices = np.flatnonzero(y == class_sample)
        X_class = _safe_indexing(X, target_class_indices)

        self.nn_k_.fit(X_class)
        nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
        X_new, y_new = self._make_samples(X_class, y.dtype, class_sample,
                                          X_class, nns, n_samples, 1.0)

        if sparse.issparse(X_new):
            X_resampled = sparse.vstack([X_resampled, X_new])
            sparse_func = "tocsc" if X.format == "csc" else "tocsr"
            X_resampled = getattr(X_resampled, sparse_func)()
        else:
            X_resampled = np.vstack((X_resampled, X_new))
        y_resampled = np.hstack((y_resampled, y_new))

    return X_resampled, y_resampled
def db(X, labels):
    X, labels = check_X_y(X, labels)
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    n_samples, _ = X.shape
    n_labels = len(le.classes_)
    check_number_of_labels(n_labels, n_samples)

    intra_dists = np.zeros(n_labels)
    centroids = np.zeros((n_labels, len(X[0])), dtype=float)
    for k in range(n_labels):
        cluster_k = _safe_indexing(X, labels == k)
        centroid = cluster_k.mean(axis=0)
        centroids[k] = centroid
        intra_dists[k] = np.average(pairwise_distances(
            cluster_k, [centroid], metric='euclidean'))

    centroid_distances = pairwise_distances(centroids, metric='euclidean')

    if np.allclose(intra_dists, 0) or np.allclose(centroid_distances, 0):
        return 0.0

    centroid_distances[centroid_distances == 0] = np.inf
    combined_intra_dists = intra_dists[:, None] + intra_dists
    scores = np.max(combined_intra_dists / centroid_distances, axis=1)
    return np.mean(scores)
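# Sanity-check sketch (not part of the original code): the `db` function above
# mirrors scikit-learn's Davies-Bouldin computation, so its output should
# match `sklearn.metrics.davies_bouldin_score` on the same clustering. The
# blobs/KMeans setup below is an assumption for the demo.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import davies_bouldin_score

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
assert np.isclose(db(X, labels), davies_bouldin_score(X, labels))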
def SMOTE_Borderline_D(imbalanced_data_arr2):
    # Split the data set into minority-class and majority-class samples
    minor_data_arr2, major_data_arr2 = seperate_minor_and_major_data(
        imbalanced_data_arr2)
    print('number of majority samples:', len(major_data_arr2),
          ', number of minority samples:', len(minor_data_arr2))

    imbalanced_featured_data = imbalanced_data_arr2[:, :-1]
    imbalanced_label_data = imbalanced_data_arr2[:, -1]

    # Difference between the number of majority and minority samples,
    # i.e. the number of samples to over-sample
    n = major_data_arr2.shape[0] - minor_data_arr2.shape[0]

    # Feature matrix of the original minority samples
    old_feature_data = minor_data_arr2[:, :-1]
    # Label value of the original minority samples
    old_label_data = minor_data_arr2[0][-1]

    danger_index = in_danger(imbalanced_featured_data, old_feature_data,
                             old_label_data, imbalanced_label_data)
    # The "danger" subset of the minority samples, i.e. the samples from
    # which the new synthetic samples will be generated
    danger_index_data = _safe_indexing(old_feature_data, danger_index)

    # New feature matrix produced with the k-nearest-neighbours method
    new_feature_data = make_sample(old_feature_data, danger_index_data, n)

    # Extend the minority sample set: attach the label column to the new
    # minority features to build the new minority data set
    new_labels_data = np.array([old_label_data] * len(new_feature_data))
    new_minor_data_arr2 = np.column_stack((new_feature_data, new_labels_data))

    # balanced_data_arr2 = np.row_stack((new_minor_data_arr2, major_data_arr2))
    balanced_data_arr2 = np.row_stack((major_data_arr2, new_minor_data_arr2))
    # Merge the minority and majority data sets and shuffle the samples:
    # balanced_data_arr2 = concat_and_shuffle_data(new_minor_data_arr2, major_data_arr2)

    return balanced_data_arr2
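# Reference sketch (not from the original script): imbalanced-learn ships a
# maintained Borderline-SMOTE implementation that performs the same
# danger-set-based over-sampling and can be used to cross-check the output.
# The toy dataset below is an assumption for illustration.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import BorderlineSMOTE

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
X_res, y_res = BorderlineSMOTE(random_state=0).fit_resample(X, y)
print(Counter(y), Counter(y_res))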
def _fit_resample(self, X, y):
    # FIXME: to be removed in 0.12
    if self.n_jobs is not None:
        warnings.warn(
            "The parameter `n_jobs` has been deprecated in 0.10 and will be "
            "removed in 0.12. You can pass a nearest neighbors estimator "
            "where `n_jobs` is already set instead.",
            FutureWarning,
        )
    self._validate_estimator()

    X_resampled = [X.copy()]
    y_resampled = [y.copy()]

    for class_sample, n_samples in self.sampling_strategy_.items():
        if n_samples == 0:
            continue
        target_class_indices = np.flatnonzero(y == class_sample)
        X_class = _safe_indexing(X, target_class_indices)

        self.nn_k_.fit(X_class)
        nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
        X_new, y_new = self._make_samples(X_class, y.dtype, class_sample,
                                          X_class, nns, n_samples, 1.0)
        X_resampled.append(X_new)
        y_resampled.append(y_new)

    if sparse.issparse(X):
        X_resampled = sparse.vstack(X_resampled, format=X.format)
    else:
        X_resampled = np.vstack(X_resampled)
    y_resampled = np.hstack(y_resampled)

    return X_resampled, y_resampled