def _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter,
                 classes, sample_weight=None, coef_init=None,
                 intercept_init=None):
    X, y = check_X_y(X, y, 'csr', dtype=np.float64, order="C",
                     accept_large_sparse=False)
    n_samples, n_features = X.shape

    _check_partial_fit_first_call(self, classes)

    # This estimator is binary-only, so the class count is fixed.
    n_classes = 2

    # Allocate datastructures from input arguments
    self._expanded_class_weight = compute_class_weight(
        self.class_weight, self.classes_, y)

    if sample_weight is None:
        sample_weight = np.ones(n_samples, dtype=np.float64)

    if getattr(self, "coef_", None) is None or coef_init is not None:
        self._allocate_parameter_mem(n_classes, n_features, coef_init,
                                     intercept_init)
    elif n_features != self.coef_.shape[-1]:
        raise ValueError("Number of features %d does not match previous "
                         "data %d." % (n_features, self.coef_.shape[-1]))

    self.loss_function_ = self._get_loss_function(loss)
    if not hasattr(self, "t_"):
        self.t_ = 1.0

    self._fit_binary(X, y, alpha=alpha, C=C, learning_rate=learning_rate,
                     sample_weight=sample_weight, max_iter=max_iter)

    return self

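# Editor's sketch: every snippet in this collection leans on sklearn's
# private helper `_check_partial_fit_first_call` (its import path may vary
# across sklearn versions). This minimal demo of its contract uses an
# illustrative dummy class, not anything from the original sources.
import numpy as np
from sklearn.utils.multiclass import _check_partial_fit_first_call

class _Dummy:
    """Bare object standing in for an estimator."""

est = _Dummy()
print(_check_partial_fit_first_call(est, classes=[0, 1]))  # True: first call, sets est.classes_
print(est.classes_)                                        # [0 1]
print(_check_partial_fit_first_call(est, classes=[0, 1]))  # False: classes already set and matching
print(_check_partial_fit_first_call(est, classes=None))    # False: nothing to do on later calls
# _check_partial_fit_first_call(est, classes=[0, 2])  # would raise ValueError: class mismatch
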
def partial_fit(self, X, y, classes=None):
    """Partial fitting."""
    if not hasattr(self, "_base_clf"):
        self.set_base_clf()

    X, y = check_X_y(X, y)
    if _check_partial_fit_first_call(self, classes):
        self.classes_ = classes
        self.ensemble_ = []

    self.X_, self.y_ = X, y

    train_X, train_y = self.remove_outliers(X, y)

    # Testing all models
    # (`measure` is an externally provided metric function assumed in scope)
    scores = np.array([measure(y, clf.predict(X)) for clf in self.ensemble_])

    # Pruning
    if len(self.ensemble_) > 1:
        alpha_good = scores > (0.5 + self.alpha)
        self.ensemble_ = [self.ensemble_[i] for i in np.where(alpha_good)[0]]
        # Keep scores aligned with the pruned ensemble
        scores = scores[alpha_good]

    if len(self.ensemble_) > self.ensemble_size - 1:
        worst = np.argmin(scores)
        del self.ensemble_[worst]

    # Preparing and training new candidate
    self.ensemble_.append(base.clone(self._base_clf).fit(train_X, train_y))

    return self

def partial_fit(self, X, y, classes=None):
    """Partial fitting."""
    if not hasattr(self, "_base_clf"):
        self.set_base_clf()

    X, y = check_X_y(X, y)
    if _check_partial_fit_first_call(self, classes):
        self.classes_ = classes
        self.ensemble_ = []
        self.weights_ = []
        self.tresholds_ = []

    self.X_, self.y_ = X, y

    # Testing all models
    scores = np.array([self.metric(y, clf.predict(X))
                       for clf in self.ensemble_])

    # Pruning
    self.prune(scores)

    # Preparing and training new candidate
    candidate_clf = base.clone(self._base_clf).fit(self.X_, self.y_)

    # Checking tresholds
    if self.t_strategy == "auto":
        probas = candidate_clf.predict_proba(self.X_)[:, 0]
        treshold = self.opt_quants[np.argmax(
            [self.metric(self.y_, probas < t) for t in self.opt_quants])]
    else:
        treshold = self.t_strategy

    self.ensemble_.append(candidate_clf)
    self.tresholds_.append(treshold)

    return self

def partial_fit(self, X, y, classes=None):
    """Partially fit underlying estimators

    Should be used when memory is inefficient to train all data.
    Chunks of data can be passed in several iterations.

    Parameters
    ----------
    X : (sparse) array-like, shape = [n_samples, n_features]
        Data.

    y : (sparse) array-like, shape = [n_samples, ] or [n_samples, n_classes]
        Multi-class targets. An indicator matrix turns on multilabel
        classification.

    classes : array, shape (n_classes, )
        Classes across all calls to partial_fit.
        Can be obtained via `np.unique(y_all)`, where y_all is the target
        vector of the entire dataset.
        This argument is only required in the first call of partial_fit
        and can be omitted in the subsequent calls.

    Returns
    -------
    self
    """
    if _check_partial_fit_first_call(self, classes):
        if not hasattr(self.estimator, "partial_fit"):
            raise ValueError(("Base estimator {0}, doesn't have "
                              "partial_fit method").format(self.estimator))
        self.estimators_ = [clone(self.estimator)
                            for _ in range(self.n_classes_)]

        # A sparse LabelBinarizer, with sparse_output=True, has been
        # shown to outperform or match a dense label binarizer in all
        # cases and has also resulted in less or equal memory consumption
        # in the fit_ovr function overall.
        self.label_binarizer_ = LabelBinarizer(sparse_output=True)
        self.label_binarizer_.fit(self.classes_)

    if len(np.setdiff1d(y, self.classes_)):
        raise ValueError(("Mini-batch contains {0} while classes "
                          "must be subset of {1}").format(np.unique(y),
                                                          self.classes_))

    Y = self.label_binarizer_.transform(y)
    Y = Y.tocsc()
    columns = (col.toarray().ravel() for col in Y.T)

    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_partial_fit_binary)(estimator, X, column)
        for estimator, column in zip(self.estimators_, columns))

    return self

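# Editor's sketch: the method above mirrors
# sklearn.multiclass.OneVsRestClassifier.partial_fit, so a chunked training
# loop against the real sklearn class illustrates how it is driven. The
# dataset and chunking (np.array_split) are illustrative assumptions.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier

X_all, y_all = make_classification(n_samples=600, n_classes=3,
                                   n_informative=6, random_state=0)
ovr = OneVsRestClassifier(SGDClassifier(random_state=0))
classes = np.unique(y_all)  # all classes must be known up front
for X_chunk, y_chunk in zip(np.array_split(X_all, 6),
                            np.array_split(y_all, 6)):
    # `classes` is only required on the first call; passing the same
    # array on later calls is also accepted.
    ovr.partial_fit(X_chunk, y_chunk, classes=classes)
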
def _partial_fit(self, X, y, classes=None):
    if _check_partial_fit_first_call(self, classes):
        self._label_binarizer = LabelBinarizer()
        if type_of_target(y).startswith('multilabel'):
            self._label_binarizer.fit(y)
        else:
            self._label_binarizer.fit(classes)

    super(MLPClassifier_Custom, self)._partial_fit(X, y)
    return self

def partial_fit(self, X, y, classes=None):
    """Partial fitting."""
    X, y = check_X_y(X, y)
    self.X_ = X
    self.y_ = y

    if _check_partial_fit_first_call(self, classes):
        self.classes_ = classes
        self.ensemble_ = []
        self.weights_ = np.array([])

    if classes is not None:
        self.classes_ = classes
    elif self.classes_ is None:
        raise Exception('Classes not specified')

    # Preparing and training new candidate
    candidate_clf = clone(self.base_estimator)
    candidate_weight = self._get_weigth_for_candidate(candidate_clf)

    if self._sampling == 'over':
        ros = RandomOverSampler(random_state=0)
        X, y = ros.fit_resample(X, y)
    elif self._sampling == 'under':
        rus = RandomUnderSampler(random_state=0)
        X, y = rus.fit_resample(X, y)

    if not self._update:
        candidate_clf.fit(X, y)
    else:
        candidate_clf.partial_fit(X, y)

    self._set_weights()

    if self._update:
        random_cl_weight = self._weight_of_random_classifier()
        for i in range(len(self.ensemble_)):
            if self.weights_[i] > random_cl_weight:
                self.ensemble_[i].partial_fit(X, y)

    self.ensemble_.append(candidate_clf)
    self.weights_ = np.append(self.weights_, np.array([candidate_weight]))

    # Post-pruning
    if len(self.ensemble_) > self.n_estimators:
        self._prune()

    # Weights normalization
    self.weights_ = self.weights_ / np.sum(self.weights_)

    return self

def partial_fit(self, X, y, classes=None):
    X, y = check_X_y(X, y)

    # Check consistency
    if hasattr(self, 'X_') and X.shape[1] != self.X_.shape[1]:
        raise ValueError('number of features does not match')

    self.X_ = X
    self.y_ = y

    if not hasattr(self, 'n_features_'):
        self.n_features_ = X.shape[1]

    # Get subspace
    if not hasattr(self, 'subspace_'):
        if self.given_subspace is None:
            self.subspace_ = self._assumed_subspace()
        else:
            self.subspace_ = np.array(self.given_subspace)

    # Acquire subspaced X
    subspaced_X = X[:, self.subspace_].astype('float64')

    if _check_partial_fit_first_call(self, classes):
        self.classes_ = classes

        # Scaler
        self.scaler_ = MinMaxScaler()
        self.scaler_.fit(subspaced_X)

    # Store the classes seen during fit
    # TODO: It's definitely not optimal
    y = [list(self.classes_).index(a) for a in y]

    # Expose
    if hasattr(self, 'model_'):
        self.model_ += self.expose(subspaced_X, y)
    else:
        self.model_ = self.expose(subspaced_X, y)

    # HSV
    self._hue = np.argmax(self.model_, axis=2) / float(len(self.classes_))
    self._saturation = np.max(self.model_, axis=2) - \
        np.min(self.model_, axis=2)
    self._value = np.max(self.model_, axis=2)
    self._hsv = np.dstack((self._hue, self._saturation, self._value))

    # Calculate measures
    self._calculate_measures()

    return self

def partial_fit(self, X, y, classes=None):
    """Partial fitting."""
    if not hasattr(self, "_base_clf"):
        self.set_base_clf()

    X, y = check_X_y(X, y)
    self.X_ = X
    self.y_ = y

    if _check_partial_fit_first_call(self, classes):
        self.classes_ = classes
        self.ensemble_ = []
        self.weights_ = np.array([1])
        self.age_ = 0
        self.iterations_ = np.array([])

    if self.age_ > 0:
        self.overall_accuracy = self.score(self.previous_X, self.previous_y)

    # Pre-pruning
    if len(self.ensemble_) > self.ensemble_size and not self.post_pruning:
        self._prune()

    # Preparing and training new candidate
    candidate_clf = base.clone(self._base_clf)
    candidate_clf.fit(X, y)
    self.ensemble_.append(candidate_clf)
    self.iterations_ = np.append(self.iterations_, [1])

    self._set_weights()
    self._rejuvenate()
    self._aging()
    self._extinct()

    # Post-pruning
    if len(self.ensemble_) > self.ensemble_size and self.post_pruning:
        self._prune()

    # Weights normalization
    self.weights_ = self.weights_ / np.sum(self.weights_)

    # Ending procedure
    self.previous_X, self.previous_y = (X, y)
    self.age_ += 1
    self.iterations_ += 1

    return self

def partial_fit(self, X, y, classes=None):
    """Partial fitting."""
    if not hasattr(self, "_base_clf"):
        self.set_base_clf()

    X, y = check_X_y(X, y)
    if _check_partial_fit_first_call(self, classes):
        self.classes_ = classes
        self.ensemble_ = []

    self.X_, self.y_ = X, y

    # Preparing and training new candidate
    self.ensemble_.append(base.clone(self._base_clf).fit(self.X_, self.y_))

    if len(self.ensemble_) > self.ensemble_size:
        del self.ensemble_[0]

    return self

def partial_fit(self, X, y, classes=None):
    """Partially fit underlying estimators

    Should be used when memory is inefficient to train all data. Chunks
    of data can be passed in several iterations, where the first call
    should have an array of all target variables.

    Parameters
    ----------
    X : (sparse) array-like, shape = [n_samples, n_features]
        Data.

    y : array-like, shape = [n_samples]
        Multi-class targets.

    classes : array, shape (n_classes, )
        Classes across all calls to partial_fit.
        Can be obtained via `np.unique(y_all)`, where y_all is the target
        vector of the entire dataset.
        This argument is only required in the first call of partial_fit
        and can be omitted in the subsequent calls.

    Returns
    -------
    self
    """
    if _check_partial_fit_first_call(self, classes):
        self.estimators_ = [
            clone(self.estimator)
            for _ in range(self.n_classes_ * (self.n_classes_ - 1) // 2)
        ]

    if len(np.setdiff1d(y, self.classes_)):
        raise ValueError("Mini-batch contains {0} while it "
                         "must be subset of {1}".format(np.unique(y),
                                                        self.classes_))

    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
    check_classification_targets(y)
    combinations = itertools.combinations(range(self.n_classes_), 2)
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_partial_fit_ovo_binary)(estimator, X, y,
                                         self.classes_[i], self.classes_[j])
        for estimator, (i, j) in zip(self.estimators_, combinations))

    self.pairwise_indices_ = None
    return self

def partial_fit(self, X, y, classes=None):
    """Partial fitting."""
    if not hasattr(self, "_base_clf"):
        self.set_base_clf()

    X, y = check_X_y(X, y)
    if _check_partial_fit_first_call(self, classes):
        self.classes_ = classes
        self.ensemble_ = []

    if not self.oversampled:
        self.X_, self.y_ = X, y
    else:
        ros = RandomOverSampler(random_state=42)
        self.X_, self.y_ = ros.fit_resample(X, y)

    # Preparing and training new candidate
    self.ensemble_.append(base.clone(self._base_clf).fit(self.X_, self.y_))

    if len(self.ensemble_) > self.ensemble_size:
        del self.ensemble_[0]

    return self

def partial_fit(self, X, y, classes=None):
    """Partial fitting."""
    if not hasattr(self, "_base_clf"):
        self.set_base_clf()

    X, y = check_X_y(X, y)

    if _check_partial_fit_first_call(self, classes):
        self.classes_ = classes
        self.ensemble_ = []
        self.X_, self.y_ = X, y
        self.previous_X, self.previous_y = self.X_, self.y_

    # To reconsider
    # if len(self.ensemble_) > 1:
    #     test = self.region_of_competence_predict(X, n_neighbors=5)

    # Copy the old chunk before storing the new one
    self.previous_X, self.previous_y = self.X_, self.y_
    self.X_ = X
    self.y_ = y

    # Preparing and training new candidate
    candidate_clf = base.clone(self._base_clf)

    # Remove outliers
    # X_wo_outliers, y_wo_outliers = self.remove_outliers(X, y)
    candidate_clf.fit(X, y)
    self.ensemble_.append(candidate_clf)

    # Score base models
    base_scores = self.f1_score_base_classifiers(X, y)

    # Prune all classifiers below the F1 threshold
    base_scores = self.prune_threshold(base_scores, threshold=0.94)

    # Prune the worst classifier if ensemble size is exceeded
    _ = self.prune_worst_classifier(base_scores)

    return self

def partial_fit(self, X, y, classes=None, sample_weight=None):
    """Incremental fit on a batch of samples.

    This method is expected to be called several times consecutively
    on different chunks of a dataset so as to implement out-of-core
    or online learning.

    This is especially useful when the whole dataset is too big to fit
    in memory at once.

    This method has some performance overhead hence it is better to call
    partial_fit on chunks of data that are as large as possible
    (as long as fitting in the memory budget) to hide the overhead.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    classes : array-like, shape = [n_classes] (default=None)
        List of all the classes that can possibly appear in the y vector.
        Must be provided at the first call to partial_fit, can be omitted
        in subsequent calls.

    sample_weight : array-like, shape = [n_samples] (default=None)
        Weights applied to individual samples (1. for unweighted).

    Returns
    -------
    self : object
        Returns self.
    """
    X = check_array(X, accept_sparse='csr', dtype=np.float64)
    _, n_features = X.shape

    if _check_partial_fit_first_call(self, classes):
        # This is the first call to partial_fit:
        # initialize various cumulative counters
        n_effective_classes = len(classes) if len(classes) > 1 else 2
        self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
        self.feature_count_ = np.zeros((n_effective_classes, n_features),
                                       dtype=np.float64)
        self.complement_class_count_ = np.zeros(n_effective_classes,
                                                dtype=np.float64)
        self.complement_feature_count_ = np.zeros(
            (n_effective_classes, n_features), dtype=np.float64)
    elif n_features != self.coef_.shape[1]:
        msg = "Number of features %d does not match previous data %d."
        raise ValueError(msg % (n_features, self.coef_.shape[-1]))

    Y = label_binarize(y, classes=self.classes_)
    if Y.shape[1] == 1:
        Y = np.concatenate((1 - Y, Y), axis=1)

    n_samples, n_classes = Y.shape

    if X.shape[0] != Y.shape[0]:
        msg = "X.shape[0]=%d and y.shape[0]=%d are incompatible."
        raise ValueError(msg % (X.shape[0], y.shape[0]))

    # label_binarize() returns arrays with dtype=np.int64.
    # We convert it to np.float64 to support sample_weight consistently
    Y = Y.astype(np.float64)
    if sample_weight is not None:
        sample_weight = np.atleast_2d(sample_weight)
        Y *= check_array(sample_weight).T

    class_prior = self.class_prior

    # Count raw events from data before updating the class log prior
    # and feature log probas
    self._count(X, Y)

    # XXX: OPTIM: we could introduce a public finalization method to
    # be called by the user explicitly just once after several consecutive
    # calls to partial_fit and prior to any call to predict[_[log_]proba]
    # to avoid computing the smooth log probas at each call to partial_fit
    alpha = self._check_alpha()
    self._update_feature_log_prob(alpha)
    self._update_class_log_prior(class_prior=class_prior)
    return self

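# Editor's sketch: the method above follows the sklearn naive Bayes
# out-of-core pattern. Assuming MultinomialNB purely for illustration
# (the snippet itself maintains complement counters, as in ComplementNB),
# chunks are streamed like this; the synthetic count data is an assumption.
import numpy as np
from sklearn.naive_bayes import MultinomialNB

rng = np.random.RandomState(0)
X_all = rng.randint(0, 5, size=(300, 10)).astype(np.float64)  # count features
y_all = rng.randint(0, 3, size=300)

nb = MultinomialNB()
for X_chunk, y_chunk in zip(np.array_split(X_all, 5),
                            np.array_split(y_all, 5)):
    # `classes` is mandatory on the first call so counters can be allocated
    nb.partial_fit(X_chunk, y_chunk, classes=np.unique(y_all))
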
def fit(self, X, y, sample_weight=None):
    X, y = check_X_y(X, y, 'csr')
    _, n_features = X.shape

    labelbin = LabelBinarizer()
    Y = labelbin.fit_transform(y)
    self.classes_ = labelbin.classes_
    if Y.shape[1] == 1:
        Y = np.concatenate((1 - Y, Y), axis=1)

    # LabelBinarizer().fit_transform() returns arrays with dtype=np.int64.
    # We convert it to np.float64 to support sample_weight consistently;
    # this means we also don't have to cast X to floating point
    Y = Y.astype(np.float64)
    if sample_weight is not None:
        sample_weight = np.atleast_2d(sample_weight)
        Y *= check_array(sample_weight).T

    class_prior = self.class_prior

    # Count raw events from data before updating the class log prior
    # and feature log probas
    n_effective_classes = Y.shape[1]
    self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
    self.feature_count_ = np.zeros((n_effective_classes, n_features),
                                   dtype=np.float64)
    self._count(X, Y)
    self._update_feature_log_prob()
    self._update_class_log_prior(class_prior=class_prior)
    return self

def partial_fit(self, X, y, classes=None, sample_weight=None):
    X = check_array(X, accept_sparse='csr', dtype=np.float64)
    _, n_features = X.shape

    self.coef_ = self._get_coef()
    # self.intercept_ = self._get_intercept()

    if _check_partial_fit_first_call(self, classes):
        # This is the first call to partial_fit:
        # initialize various cumulative counters
        n_effective_classes = len(classes) if len(classes) > 1 else 2
        self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
        self.feature_count_ = np.zeros((n_effective_classes, n_features),
                                       dtype=np.float64)
    elif n_features != self.coef_.shape[1]:
        msg = "Number of features %d does not match previous data %d."
        raise ValueError(msg % (n_features, self.coef_.shape[-1]))

    Y = label_binarize(y, classes=self.classes_)
    if Y.shape[1] == 1:
        Y = np.concatenate((1 - Y, Y), axis=1)

    n_samples, n_classes = Y.shape

    if X.shape[0] != Y.shape[0]:
        msg = "X.shape[0]=%d and y.shape[0]=%d are incompatible."
        raise ValueError(msg % (X.shape[0], y.shape[0]))

    # label_binarize() returns arrays with dtype=np.int64.
    # We convert it to np.float64 to support sample_weight consistently
    Y = Y.astype(np.float64)
    if sample_weight is not None:
        sample_weight = np.atleast_2d(sample_weight)
        Y *= check_array(sample_weight).T

    class_prior = self.class_prior

    # Count raw events from data before updating the class log prior
    # and feature log probas
    self._count(X, Y)

    # XXX: OPTIM: we could introduce a public finalization method to
    # be called by the user explicitly just once after several consecutive
    # calls to partial_fit and prior to any call to predict[_[log_]proba]
    # to avoid computing the smooth log probas at each call to partial_fit
    self._update_feature_log_prob()
    self._update_class_log_prior(class_prior=class_prior)
    return self

def partial_fit(self, X: Union[np.ndarray, pd.DataFrame],
                y: Union[np.ndarray, pd.Series],
                classes: Union[list, np.ndarray] = None):
    """
    Fit a single DTC using the given subset of X and y.

    Passes the subset to fit, rather than using the same data each time.
    Wrap with Dask Incremental to handle subset feeding.

    The first call needs to be supplied with the expected classes (similar
    to existing models with .partial_fit()) in case not all classes are
    present in the first subset.

    TODO: This is currently expected on every call; an alternative could be
    checked with the existing sklearn mechanism. Additionally, the case
    where not all classes are presented in the first or subsequent subsets
    needs to be handled.

    For the RandomForestClassifier, tree predictions are averaged in the
    sklearn.ensemble.forest.accumulate_prediction function. This sums the
    output matrix with dimensions n rows x n classes and fails if the class
    dimension differs.

    The class dimension is defined at the individual estimator level during
    the .fit() call, which sets the following attributes:
      - self.n_outputs_ = y.shape[1], which is then used by
        _validate_y_class_weight(), always called in .fit(), to set:
      - self.classes_
      - self.n_classes_

    This object sets classes_ and n_classes_ depending on the supplied
    classes. The individual trees set theirs depending on the data
    available in the subset. The predict_proba method is modified to
    standardise shape to the dimensions defined in this object.

    :param X: Training features for this subset.
    :param y: Training targets for this subset.
    :param classes: Expected classes across all subsets.
    :return: self
    """
    # Set classes for forest (this only needs to be done once).
    # Not for each individual tree; those will be set by .fit() using the
    # classes available in the subset.

    # Check classes_ is set, or provided.
    # Returns False if there is nothing to do.
    classes_need_setting = _check_partial_fit_first_call(self, classes)

    # If classes are not set, set them.
    # The call above will error if they are not set and classes is None.
    if classes_need_setting:
        self.classes_ = np.array(classes)
        self.n_classes_ = len(classes)

    # Fit the next estimator, if not done
    if self._fit_estimators < self.max_n_estimators:
        t0 = time.time()
        self.fit(X, y)
        t1 = time.time()

        if self.verbose > 0:
            print(f"Fit estimators {self._fit_estimators} - "
                  f"{self._fit_estimators + self.n_estimators_per_chunk} "
                  f"/ {self.max_n_estimators}")
            print(f"Fit time: {round(t1 - t0, 2)}")
            print(len(self.estimators_))

        self._fit_estimators += self.n_estimators_per_chunk

        # If still not done, prep to fit next
        if self._fit_estimators < self.max_n_estimators:
            self.n_estimators += self.n_estimators_per_chunk
        else:
            if self.verbose > 0:
                print('Done')

    return self

def _partial_fit(self, X, y, classes=None, _refit=False,
                 sample_weight=None):
    X, y = check_X_y(X, y)

    # If the ratio of data variance between dimensions is too small, it
    # will cause numerical errors. To address this, we artificially
    # boost the variance by epsilon, a small fraction of the standard
    # deviation of the largest dimension.
    epsilon = 1e-9 * np.var(X, axis=0).max()

    if _refit:
        self.classes_ = None

    if _check_partial_fit_first_call(self, classes):
        # This is the first call to partial_fit:
        # initialize various cumulative counters
        n_features = X.shape[1]
        n_classes = len(self.classes_)
        self.theta_ = np.zeros((n_classes, n_features))
        self.sigma_ = np.zeros((n_classes, n_features))
        self.class_prior_ = np.zeros(n_classes)
        self.class_count_ = np.zeros(n_classes)
    else:
        if X.shape[1] != self.theta_.shape[1]:
            msg = "Number of features %d does not match previous data %d."
            raise ValueError(msg % (X.shape[1], self.theta_.shape[1]))
        # Put epsilon back in each time
        self.sigma_[:, :] -= epsilon

    classes = self.classes_

    unique_y = np.unique(y)
    unique_y_in_classes = np.in1d(unique_y, classes)

    if not np.all(unique_y_in_classes):
        raise ValueError("The target label(s) %s in y do not exist in the "
                         "initial classes %s" %
                         (unique_y[~unique_y_in_classes], classes))

    for y_i in unique_y:
        i = classes.searchsorted(y_i)
        X_i = X[y == y_i, :]

        if sample_weight is not None:
            sw_i = sample_weight[y == y_i]
            N_i = sw_i.sum()
        else:
            sw_i = None
            N_i = X_i.shape[0]

        new_theta, new_sigma = self._update_mean_variance(
            self.class_count_[i], self.theta_[i, :], self.sigma_[i, :],
            X_i, sw_i)

        self.theta_[i, :] = new_theta
        self.sigma_[i, :] = new_sigma
        self.class_count_[i] += N_i

    self.sigma_[:, :] += epsilon
    self.class_prior_[:] = self.class_count_ / np.sum(self.class_count_)
    return self

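# Editor's sketch: the subtract-epsilon / update / add-epsilon-back trick
# above mirrors sklearn's GaussianNB, whose public API is driven
# incrementally like this. The random data is an illustrative assumption.
import numpy as np
from sklearn.naive_bayes import GaussianNB

rng = np.random.RandomState(0)
X_all = rng.normal(size=(200, 4))
y_all = rng.randint(0, 2, size=200)

gnb = GaussianNB()
gnb.partial_fit(X_all[:100], y_all[:100], classes=np.array([0, 1]))
gnb.partial_fit(X_all[100:], y_all[100:])  # classes only needed first time
print(gnb.theta_.shape)  # (n_classes, n_features): running per-class means
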
def _partial_fit(self, X, y, classes=None):
    _check_partial_fit_first_call(self, classes)
    super(MLPClassifier, self)._partial_fit(X, y)
    return self

def partial_fit(self, X, y, classes=None):
    if _check_partial_fit_first_call(self, classes):
        self.fit(X, y)
    else:
        for e in self.ensemble_:
            e.partial_fit(X, y)
    return self

def partial_fit(self, X, y, classes=None):
    """Partial fitting."""
    if not hasattr(self, "_base_clf"):
        self.set_base_clf()

    X, y = check_X_y(X, y)
    if _check_partial_fit_first_call(self, classes):
        self.classes_ = classes
        self.ensemble_ = []

    self.X_, self.y_ = X, y
    train_X, train_y = X, y

    unique, counts = np.unique(train_y, return_counts=True)
    k_neighbors = 5
    if counts[0] - 1 < 5:
        k_neighbors = counts[0] - 1

    if self.oversampler == "SMOTE" and k_neighbors > 0:
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        train_X, train_y = smote.fit_resample(train_X, train_y)
    elif self.oversampler == "svmSMOTE" and k_neighbors > 0:
        try:
            svmSmote = SVMSMOTE(random_state=42, k_neighbors=k_neighbors)
            train_X, train_y = svmSmote.fit_resample(train_X, train_y)
        except ValueError:
            pass
    elif self.oversampler == "borderline1" and k_neighbors > 0:
        borderlineSmote1 = BorderlineSMOTE(random_state=42,
                                           k_neighbors=k_neighbors,
                                           kind='borderline-1')
        train_X, train_y = borderlineSmote1.fit_resample(train_X, train_y)
    elif self.oversampler == "borderline2" and k_neighbors > 0:
        borderlineSmote2 = BorderlineSMOTE(random_state=42,
                                           k_neighbors=k_neighbors,
                                           kind='borderline-2')
        train_X, train_y = borderlineSmote2.fit_resample(train_X, train_y)
    elif self.oversampler == "ADASYN" and k_neighbors > 0:
        try:
            adasyn = ADASYN(random_state=42, n_neighbors=k_neighbors)
            train_X, train_y = adasyn.fit_resample(train_X, train_y)
        except RuntimeError:
            pass
    elif self.oversampler == "SLS" and k_neighbors > 0:
        sls = Safe_Level_SMOTE(n_neighbors=k_neighbors)
        train_X, train_y = sls.sample(train_X, train_y)

    # Testing all models
    scores = np.array([ba(y, clf.predict(X)) for clf in self.ensemble_])

    # Pruning
    if len(self.ensemble_) > 1:
        alpha_good = scores > (0.5 + self.alpha)
        self.ensemble_ = [self.ensemble_[i] for i in np.where(alpha_good)[0]]
        # Keep scores aligned with the pruned ensemble
        scores = scores[alpha_good]

    if len(self.ensemble_) > self.ensemble_size - 1:
        worst = np.argmin(scores)
        del self.ensemble_[worst]

    # Preparing and training new candidate
    self.ensemble_.append(base.clone(self._base_clf).fit(train_X, train_y))

    return self

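# Editor's sketch: the k_neighbors guard above exists because SMOTE requires
# at least k_neighbors + 1 minority samples. A minimal, runnable
# imbalanced-learn example with an analogous guard; the toy data is an
# illustrative assumption (Safe_Level_SMOTE is an external implementation
# and is not shown here).
import numpy as np
from imblearn.over_sampling import SMOTE

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (40, 2)),   # majority class
               rng.normal(3, 1, (8, 2))])   # minority class
y = np.array([0] * 40 + [1] * 8)

k = min(5, np.min(np.bincount(y)) - 1)  # guard analogous to the snippet's
X_res, y_res = SMOTE(random_state=42, k_neighbors=k).fit_resample(X, y)
print(np.bincount(y_res))  # classes are now balanced
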
def _partial_fit(self, X, y, classes=None, _refit=False,
                 sample_weight=None):
    """Actual implementation of Gaussian NB fitting.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples,)
        Target values.

    classes : array-like, shape (n_classes,), optional (default=None)
        List of all the classes that can possibly appear in the y vector.
        Must be provided at the first call to partial_fit, can be omitted
        in subsequent calls.

    _refit : bool, optional (default=False)
        If true, act as though this were the first time we called
        _partial_fit (i.e., throw away any past fitting and start over).

    sample_weight : array-like, shape (n_samples,), optional (default=None)
        Weights applied to individual samples (1. for unweighted).

    Returns
    -------
    self : object
        Returns self.
    """
    X, y = check_X_y(X, y)
    if sample_weight is not None:
        sample_weight = check_array(sample_weight, ensure_2d=False)
        check_consistent_length(y, sample_weight)

    # If the ratio of data variance between dimensions is too small, it
    # will cause numerical errors. To address this, we artificially
    # boost the variance by epsilon, a small fraction of the standard
    # deviation of the largest dimension.
    epsilon = 1e-9 * np.var(X, axis=0).max()

    if _refit:
        self.classes_ = None

    if _check_partial_fit_first_call(self, classes):
        # This is the first call to partial_fit:
        # initialize various cumulative counters
        n_features = X.shape[1]
        n_classes = len(self.classes_)
        self.theta_ = np.zeros((n_classes, n_features))
        self.sigma_ = np.zeros((n_classes, n_features))
        self.class_count_ = np.zeros(n_classes, dtype=np.float64)

        # Initialise the class prior, taking any given priors into account
        if self.priors is not None:
            priors = np.asarray(self.priors)
            # Check that the provided priors match the number of classes
            if len(priors) != n_classes:
                raise ValueError('Number of priors must match number of'
                                 ' classes.')
            # Check that the priors sum to 1
            if not np.isclose(priors.sum(), 1.0):
                raise ValueError('The sum of the priors should be 1.')
            # Check that the priors are non-negative
            if (priors < 0).any():
                raise ValueError('Priors must be non-negative.')
            self.class_prior_ = priors
        else:
            # Initialize the priors to zeros for each class
            self.class_prior_ = np.zeros(len(self.classes_),
                                         dtype=np.float64)
    else:
        if X.shape[1] != self.theta_.shape[1]:
            msg = "Number of features %d does not match previous data %d."
            raise ValueError(msg % (X.shape[1], self.theta_.shape[1]))
        # Put epsilon back in each time
        self.sigma_[:, :] -= epsilon

    classes = self.classes_

    unique_y = np.unique(y)
    unique_y_in_classes = np.in1d(unique_y, classes)

    if not np.all(unique_y_in_classes):
        raise ValueError("The target label(s) %s in y do not exist in the "
                         "initial classes %s" %
                         (unique_y[~unique_y_in_classes], classes))

    for y_i in unique_y:
        i = classes.searchsorted(y_i)
        X_i = X[y == y_i, :]

        if sample_weight is not None:
            sw_i = sample_weight[y == y_i]
            N_i = sw_i.sum()
        else:
            sw_i = None
            N_i = X_i.shape[0]

        new_theta, new_sigma = self._update_mean_variance(
            self.class_count_[i], self.theta_[i, :], self.sigma_[i, :],
            X_i, sw_i)

        self.theta_[i, :] = new_theta
        self.sigma_[i, :] = new_sigma
        self.class_count_[i] += N_i

    self.sigma_[:, :] += epsilon

    # Update only if no priors are provided
    if self.priors is None:
        # Empirical prior, with sample_weight taken into account
        self.class_prior_ = self.class_count_ / self.class_count_.sum()

    return self

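# Editor's sketch of the merge performed by `_update_mean_variance` above:
# combining (count, mean, variance) of past data with a new batch via the
# Chan et al. parallel update, the same recurrence sklearn's GaussianNB
# uses. The helper name and self-check are illustrative, not library API.
import numpy as np

def merge_mean_var(n_past, mu_past, var_past, X_new):
    n_new = X_new.shape[0]
    mu_new = X_new.mean(axis=0)
    var_new = X_new.var(axis=0)

    n_total = n_past + n_new
    mu_total = (n_past * mu_past + n_new * mu_new) / n_total
    # Combine per-batch sums of squared differences, then correct for the
    # shift between the two batch means.
    ssd = (n_past * var_past + n_new * var_new
           + (n_past * n_new / n_total) * (mu_past - mu_new) ** 2)
    return n_total, mu_total, ssd / n_total

rng = np.random.RandomState(0)
A, B = rng.normal(size=(50, 3)), rng.normal(size=(70, 3))
n, mu, var = merge_mean_var(len(A), A.mean(0), A.var(0), B)
assert np.allclose(mu, np.vstack([A, B]).mean(0))
assert np.allclose(var, np.vstack([A, B]).var(0))
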
def _partial_fit(self, X, y, classes=None, _refit=False,
                 sample_weight=None):
    self.accountant.check(self.epsilon, 0)

    if sample_weight is not None:
        warn_unused_args("sample_weight")

    X, y = check_X_y(X, y)

    if self.bounds is None:
        warnings.warn(
            "Bounds have not been specified and will be calculated on the "
            "data provided. This will result in additional privacy "
            "leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify bounds for each dimension.",
            PrivacyLeakWarning)
        self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

    self.bounds = check_bounds(self.bounds, shape=X.shape[1])
    X = clip_to_bounds(X, self.bounds)

    self.epsilon_ = self.var_smoothing

    if _refit:
        self.classes_ = None

    if _check_partial_fit_first_call(self, classes):
        n_features = X.shape[1]
        n_classes = len(self.classes_)
        self.theta_ = np.zeros((n_classes, n_features))
        self.sigma_ = np.zeros((n_classes, n_features))
        self.class_count_ = np.zeros(n_classes, dtype=np.float64)

        if self.priors is not None:
            priors = np.asarray(self.priors)
            if len(priors) != n_classes:
                raise ValueError(
                    "Number of priors must match number of classes.")
            if not np.isclose(priors.sum(), 1.0):
                raise ValueError("The sum of the priors should be 1.")
            if (priors < 0).any():
                raise ValueError("Priors must be non-negative.")
            self.class_prior_ = priors
        else:
            # Initialize the priors to zeros for each class
            self.class_prior_ = np.zeros(len(self.classes_),
                                         dtype=np.float64)
    else:
        if X.shape[1] != self.theta_.shape[1]:
            raise ValueError(
                "Number of features %d does not match previous data %d." %
                (X.shape[1], self.theta_.shape[1]))
        # Put epsilon back in each time
        self.sigma_[:, :] -= self.epsilon_

    classes = self.classes_

    unique_y = np.unique(y)
    unique_y_in_classes = np.in1d(unique_y, classes)

    if not np.all(unique_y_in_classes):
        raise ValueError(
            "The target label(s) %s in y do not exist in the initial "
            "classes %s" % (unique_y[~unique_y_in_classes], classes))

    noisy_class_counts = self._noisy_class_counts(y)

    for _i, y_i in enumerate(unique_y):
        i = classes.searchsorted(y_i)
        X_i = X[y == y_i, :]

        n_i = noisy_class_counts[_i]

        new_theta, new_sigma = self._update_mean_variance(
            self.class_count_[i], self.theta_[i, :], self.sigma_[i, :],
            X_i, n_noisy=n_i)

        self.theta_[i, :] = new_theta
        self.sigma_[i, :] = new_sigma
        self.class_count_[i] += n_i

    self.sigma_[:, :] += self.epsilon_

    # Update only if no priors are provided
    if self.priors is None:
        # Empirical prior, with sample_weight taken into account
        self.class_prior_ = self.class_count_ / self.class_count_.sum()

    self.accountant.spend(self.epsilon, 0)

    return self

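# Editor's sketch (hedged): the snippet above follows IBM diffprivlib's
# GaussianNB, where supplying explicit per-feature bounds avoids the
# PrivacyLeakWarning raised when bounds are inferred from the data. Exact
# constructor arguments may differ between diffprivlib versions; the toy
# data is an illustrative assumption.
import numpy as np
from diffprivlib.models import GaussianNB as DPGaussianNB

rng = np.random.RandomState(0)
X = rng.uniform(0, 1, size=(200, 3))
y = rng.randint(0, 2, size=200)

# Bounds are (per-feature minimums, per-feature maximums)
clf = DPGaussianNB(epsilon=1.0, bounds=(np.zeros(3), np.ones(3)))
clf.fit(X, y)
print(clf.predict(X[:5]))
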
def _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter,
                 classes, sample_weight, coef_init, intercept_init,
                 per_feature_alpha, per_feature_beta, modal_vector):
    X, y = check_X_y(X, y, 'csr', dtype=np.float64, order="C")
    n_samples, n_features = X.shape

    _check_partial_fit_first_call(self, classes)

    n_classes = self.classes_.shape[0]

    # Allocate datastructures from input arguments
    self._expanded_class_weight = compute_class_weight(
        self.class_weight, self.classes_, y)
    sample_weight = self._validate_sample_weight(sample_weight, n_samples)

    fitted = getattr(self, "coef_", None) is not None or \
        coef_init is not None

    if getattr(self, "coef_", None) is None or coef_init is not None:
        self._allocate_parameter_mem(n_classes, n_features,
                                     coef_init, intercept_init)
    elif n_features != self.coef_.shape[-1]:
        raise ValueError("Number of features %d does not match previous "
                         "data %d." % (n_features, self.coef_.shape[-1]))

    # NOTE: put initialization of the 3 additional vectors here;
    # it might make more sense to put it in self._allocate_parameter_mem
    if modal_vector is None:
        modal_vector = np.zeros((n_classes, n_features))
    elif modal_vector.shape[-1] != n_features:
        raise ValueError('Shape of modal_vector must be the same as '
                         'the coefficient vectors')
    elif len(modal_vector.shape) == 1:
        # A single vector was passed in; broadcast it to all classes
        modal_vector = np.stack([modal_vector for _ in range(n_classes)])

    l1_ratio = self.l1_ratio
    if self.penalty == 'l2':
        l1_ratio = 0.0
    elif self.penalty == 'l1':
        l1_ratio = 1.0

    if per_feature_alpha is None and per_feature_beta is None:
        per_feature_alpha = np.ones(self.coef_.shape) * alpha * (1.0 - l1_ratio)
        per_feature_beta = np.ones(self.coef_.shape) * alpha * l1_ratio
    else:
        if per_feature_alpha is None:
            per_feature_alpha = np.zeros(self.coef_.shape)
        elif self.penalty == 'l1':
            raise ValueError('Penalty set to l1 but per_feature_alpha '
                             'is still provided')
        if per_feature_beta is None:
            per_feature_beta = np.zeros(self.coef_.shape)
        elif self.penalty == 'l2':
            raise ValueError('Penalty set to l2 but per_feature_beta '
                             'is still provided')

    if per_feature_alpha.shape[-1] != n_features:
        raise ValueError('Shape of per_feature_alpha must be the same as '
                         'the coefficient vectors')
    if per_feature_beta.shape[-1] != n_features:
        raise ValueError('Shape of per_feature_beta must be the same as '
                         'the coefficient vectors')

    self.loss_function_ = self._get_loss_function(loss)
    if not hasattr(self, "t_"):
        self.t_ = 1.0

    # Delegate to concrete training procedure
    if n_classes > 2:
        if not fitted:
            self.coef_ = modal_vector.copy()
        self._fit_multiclass(X, y, alpha=alpha, C=C,
                             per_feature_alpha=per_feature_alpha,
                             per_feature_beta=per_feature_beta,
                             modal_vector=modal_vector,
                             learning_rate=learning_rate,
                             sample_weight=sample_weight,
                             max_iter=max_iter)
    elif n_classes == 2:
        if not fitted:
            # Overwrite the initial weight vector with the modal vector
            self.coef_ = modal_vector[0].copy()
        self._fit_binary(X, y, alpha=alpha, C=C,
                         per_feature_alpha=per_feature_alpha,
                         per_feature_beta=per_feature_beta,
                         modal_vector=modal_vector[0],
                         learning_rate=learning_rate,
                         sample_weight=sample_weight,
                         max_iter=max_iter)
    else:
        raise ValueError("The number of classes has to be greater than "
                         "one; got %d class" % n_classes)

    return self