def _launch_svc(self, kernel_train, x_test, y_train, y_test, c):
    if self._algorithm_params['balanced']:
        svc = OneVsOneClassifier(SVC(C=c, kernel='precomputed', probability=True,
                                     tol=1e-6, class_weight='balanced'))
    else:
        svc = OneVsOneClassifier(SVC(C=c, kernel='precomputed', probability=True,
                                     tol=1e-6))
    svc.fit(kernel_train, y_train)
    y_hat_train = svc.predict(kernel_train)
    y_hat = svc.predict(x_test)
    # NOTE: OneVsOneClassifier does not expose predict_proba, so the original
    # call to svc.predict_proba(x_test) would raise AttributeError; test
    # probabilities are therefore not computed here.
    return svc, y_hat, y_hat_train
class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
    """Gaussian process classification (GPC) based on Laplace approximation.

    The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
    Gaussian Processes for Machine Learning (GPML) by Rasmussen and
    Williams.

    Internally, the Laplace approximation is used for approximating the
    non-Gaussian posterior by a Gaussian.

    Currently, the implementation is restricted to using the logistic link
    function. For multi-class classification, several binary one-versus rest
    classifiers are fitted. Note that this class thus does not implement
    a true multi-class Laplace approximation.

    Parameters
    ----------
    kernel : kernel object
        The kernel specifying the covariance function of the GP. If None is
        passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
        the kernel's hyperparameters are optimized during fitting.

    optimizer : string or callable, optional (default: "fmin_l_bfgs_b")
        Can either be one of the internally supported optimizers for optimizing
        the kernel's parameters, specified by a string, or an externally
        defined optimizer passed as a callable. If a callable is passed, it
        must have the signature::

            def optimizer(obj_func, initial_theta, bounds):
                # * 'obj_func' is the objective function to be maximized, which
                #   takes the hyperparameters theta as parameter and an
                #   optional flag eval_gradient, which determines if the
                #   gradient is returned additionally to the function value
                # * 'initial_theta': the initial value for theta, which can be
                #   used by local optimizers
                # * 'bounds': the bounds on the values of theta
                ....
                # Returned are the best found hyperparameters theta and
                # the corresponding value of the target function.
                return theta_opt, func_min

        Per default, the 'fmin_l_bfgs_b' algorithm from scipy.optimize
        is used. If None is passed, the kernel's parameters are kept fixed.
        Available internal optimizers are::

            'fmin_l_bfgs_b'

    n_restarts_optimizer : int, optional (default: 0)
        The number of restarts of the optimizer for finding the kernel's
        parameters which maximize the log-marginal likelihood. The first run
        of the optimizer is performed from the kernel's initial parameters,
        the remaining ones (if any) from thetas sampled log-uniform randomly
        from the space of allowed theta-values. If greater than 0, all bounds
        must be finite. Note that n_restarts_optimizer=0 implies that one
        run is performed.

    max_iter_predict : int, optional (default: 100)
        The maximum number of iterations in Newton's method for approximating
        the posterior during predict. Smaller values will reduce computation
        time at the cost of worse results.

    warm_start : bool, optional (default: False)
        If warm-starts are enabled, the solution of the last Newton iteration
        on the Laplace approximation of the posterior mode is used as
        initialization for the next call of _posterior_mode(). This can speed
        up convergence when _posterior_mode is called several times on similar
        problems as in hyperparameter optimization.

    copy_X_train : bool, optional (default: True)
        If True, a persistent copy of the training data is stored in the
        object. Otherwise, just a reference to the training data is stored,
        which might cause predictions to change if the data is modified
        externally.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    multi_class : string, default : "one_vs_rest"
        Specifies how multi-class classification problems are handled.
        Supported are "one_vs_rest" and "one_vs_one". In "one_vs_rest",
        one binary Gaussian process classifier is fitted for each class, which
        is trained to separate this class from the rest. In "one_vs_one", one
        binary Gaussian process classifier is fitted for each pair of classes,
        which is trained to separate these two classes. The predictions of
        these binary predictors are combined into multi-class predictions.
        Note that "one_vs_one" does not support predicting probability
        estimates.

    n_jobs : int, optional, default: 1
        The number of jobs to use for the computation. If -1 all CPUs are used.
        If 1 is given, no parallel computing code is used at all, which is
        useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
        used. Thus for n_jobs = -2, all CPUs but one are used.

    Attributes
    ----------
    kernel_ : kernel object
        The kernel used for prediction. In case of binary classification,
        the structure of the kernel is the same as the one passed as parameter
        but with optimized hyperparameters. In case of multi-class
        classification, a CompoundKernel is returned which consists of the
        different kernels used in the one-versus-rest classifiers.

    log_marginal_likelihood_value_ : float
        The log-marginal-likelihood of ``self.kernel_.theta``

    classes_ : array-like, shape = (n_classes,)
        Unique class labels.

    n_classes_ : int
        The number of classes in the training data

    .. versionadded:: 0.18
    """

    def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b",
                 n_restarts_optimizer=0, max_iter_predict=100,
                 warm_start=False, copy_X_train=True, random_state=None,
                 multi_class="one_vs_rest", n_jobs=1):
        self.kernel = kernel
        self.optimizer = optimizer
        self.n_restarts_optimizer = n_restarts_optimizer
        self.max_iter_predict = max_iter_predict
        self.warm_start = warm_start
        self.copy_X_train = copy_X_train
        self.random_state = random_state
        self.multi_class = multi_class
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Fit Gaussian process classification model

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples,)
            Target values, must be binary

        Returns
        -------
        self : returns an instance of self.
        """
        X, y = check_X_y(X, y, multi_output=False)

        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
            self.kernel, self.optimizer, self.n_restarts_optimizer,
            self.max_iter_predict, self.warm_start, self.copy_X_train,
            self.random_state)

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.size
        if self.n_classes_ == 1:
            raise ValueError("GaussianProcessClassifier requires 2 or more "
                             "distinct classes. Only class %s present."
                             % self.classes_[0])
        if self.n_classes_ > 2:
            if self.multi_class == "one_vs_rest":
                self.base_estimator_ = \
                    OneVsRestClassifier(self.base_estimator_,
                                        n_jobs=self.n_jobs)
            elif self.multi_class == "one_vs_one":
                self.base_estimator_ = \
                    OneVsOneClassifier(self.base_estimator_,
                                       n_jobs=self.n_jobs)
            else:
                raise ValueError("Unknown multi-class mode %s"
                                 % self.multi_class)

        self.base_estimator_.fit(X, y)

        if self.n_classes_ > 2:
            self.log_marginal_likelihood_value_ = np.mean(
                [estimator.log_marginal_likelihood()
                 for estimator in self.base_estimator_.estimators_])
        else:
            self.log_marginal_likelihood_value_ = \
                self.base_estimator_.log_marginal_likelihood()

        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array, shape = (n_samples,)
            Predicted target values for X, values are from ``classes_``
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        X = check_array(X)
        return self.base_estimator_.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array-like, shape = (n_samples, n_classes)
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute `classes_`.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        if self.n_classes_ > 2 and self.multi_class == "one_vs_one":
            raise ValueError("one_vs_one multi-class mode does not support "
                             "predicting probability estimates. Use "
                             "one_vs_rest mode instead.")
        X = check_array(X)
        return self.base_estimator_.predict_proba(X)

    @property
    def kernel_(self):
        if self.n_classes_ == 2:
            return self.base_estimator_.kernel_
        else:
            return CompoundKernel(
                [estimator.kernel_
                 for estimator in self.base_estimator_.estimators_])

    def log_marginal_likelihood(self, theta=None, eval_gradient=False):
        """Returns log-marginal likelihood of theta for training data.

        In the case of multi-class classification, the mean log-marginal
        likelihood of the one-versus-rest classifiers are returned.

        Parameters
        ----------
        theta : array-like, shape = (n_kernel_params,) or none
            Kernel hyperparameters for which the log-marginal likelihood is
            evaluated. In the case of multi-class classification, theta may
            be the hyperparameters of the compound kernel or of an individual
            kernel. In the latter case, all individual kernel get assigned the
            same theta values. If None, the precomputed log_marginal_likelihood
            of ``self.kernel_.theta`` is returned.

        eval_gradient : bool, default: False
            If True, the gradient of the log-marginal likelihood with respect
            to the kernel hyperparameters at position theta is returned
            additionally. Note that gradient computation is not supported
            for non-binary classification. If True, theta must not be None.

        Returns
        -------
        log_likelihood : float
            Log-marginal likelihood of theta for training data.

        log_likelihood_gradient : array, shape = (n_kernel_params,), optional
            Gradient of the log-marginal likelihood with respect to the kernel
            hyperparameters at position theta.
            Only returned when eval_gradient is True.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])

        if theta is None:
            if eval_gradient:
                raise ValueError(
                    "Gradient can only be evaluated for theta!=None")
            return self.log_marginal_likelihood_value_

        theta = np.asarray(theta)
        if self.n_classes_ == 2:
            return self.base_estimator_.log_marginal_likelihood(
                theta, eval_gradient)
        else:
            if eval_gradient:
                raise NotImplementedError(
                    "Gradient of log-marginal-likelihood not implemented for "
                    "multi-class GPC.")
            estimators = self.base_estimator_.estimators_
            n_dims = estimators[0].kernel_.n_dims
            if theta.shape[0] == n_dims:  # use same theta for all sub-kernels
                return np.mean(
                    [estimator.log_marginal_likelihood(theta)
                     for i, estimator in enumerate(estimators)])
            elif theta.shape[0] == n_dims * self.classes_.shape[0]:
                # theta for compound kernel
                return np.mean(
                    [estimator.log_marginal_likelihood(
                        theta[n_dims * i:n_dims * (i + 1)])
                     for i, estimator in enumerate(estimators)])
            else:
                raise ValueError("Shape of theta must be either %d or %d. "
                                 "Obtained theta with shape %d."
                                 % (n_dims, n_dims * self.classes_.shape[0],
                                    theta.shape[0]))
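# A short usage sketch for the class above. It relies only on the standard
# scikit-learn GaussianProcessClassifier API shown here; the iris data and the
# RBF kernel are illustrative choices, not taken from the source.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

X, y = load_iris(return_X_y=True)

ovr = GaussianProcessClassifier(kernel=1.0 * RBF(1.0),
                                multi_class="one_vs_rest").fit(X, y)
ovo = GaussianProcessClassifier(kernel=1.0 * RBF(1.0),
                                multi_class="one_vs_one").fit(X, y)

print(ovr.predict_proba(X[:3]))  # probabilities available in one_vs_rest mode
print(ovo.predict(X[:3]))        # one_vs_one supports predict() only;
                                 # predict_proba() raises ValueError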
size=0.2, seed=123)

# Training the model
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsOneClassifier  # Lets us treat each col of y independently

clf = OneVsOneClassifier(LogisticRegression())  # Fits a sep classifier for each of the cols
clf.fit(X_train, y_train)

# C - Making predictions
# Predicting on holdout data
holdout = pd.read_csv('HoldoutData.csv', index_col=0)
holdout = holdout[NUMERIC_COLUMNS].fillna(-1000)  # Select just numeric columns and replace NaNs
predictions = clf.predict_proba(holdout)  # Predicts probabilities for each label
# If .predict() was used - output would be 0 or 1
# Log loss penalises for being confident and wrong
# As a result, there would be a worse performance compared to .predict_proba()

# Submitting your predictions as a csv
# Submission - df with column headers and row with probabilities for each column
# All formatting can be done with the pandas to_csv function
# Cols have orig column name separated from value by two '_' (some already contained '_')
# Prediction - array of values, needs to be converted to a df
prediction_df = pd.DataFrame(
    columns=pd.get_dummies(df[LABELS], prefix_sep='_').columns,  # Separates orig col names from col values
class log_kernel_MOM(BaseEstimator):
    '''Logistic Regression Kernel MOM

    Kernel logistic regression MOM risk minimization using IRLS with L2
    regularization.

    Parameters
    ----------
    K : int, default 10
        number of blocks for the computation of the MOM. A big value of K
        deals with more outliers but small values of K are better for the
        performance when there are no outliers.

    eta0 : float, default 1
        step size parameter, the step size at the i-th iteration is defined
        by 1/(1+eta0*i).

    beta : float, default 1
        L2 regularization parameter.

    epoch : int, default 200
        number of iterations before the end of the algorithm.

    kernel : {'rbf','poly', callable function}, default 'rbf'
        kernel used in the algorithm. A callable function can be given, it
        should take as entry two matrices X1, X2 and return the pairwise
        kernel distance matrix.

    gamma : float, default 1/n_features
        coefficient used if the kernel is 'rbf' in which case the kernel
        function is exp(-gamma*x^2).

    degree : int, default 3
        degree of the polynomial if the kernel is 'poly'.

    agg : int, default 1
        number of runs of the algorithm on which we aggregate. One might want
        to decrease this number if the complexity is a problem.

    verbose : boolean, default True
        display a message at the end of each run if agg > 1.

    progress : boolean, default False
        display a progress bar to monitor the algorithm on each run
        (agg > 1 means several progress bars).

    compter : boolean, default False
        used for outlier detection, if compter=True, the number of times each
        point is used in the algorithm will be recorded in the attribute
        "counts".

    multi : {'ovr','ovo'}, default 'ovr'
        method used to go from binary classification to multiclass
        classification. 'ovr' means "one vs the rest" and 'ovo' means
        "one vs one".

    Attributes
    ----------
    alpha : array-like, length = n_sample
        alpha is updated in the algorithm, provides the final coefficients of
        the decision function.

    counts : array-like, length = n_sample
        the i-th element records the number of times the i-th element of the
        training dataset X has been used. Only if compter=True.

    Methods
    -------
    fit(X, y) : fit the model
        X : numpy matrix, size = (n_samples, n_features)
        y : array-like, length = n_samples

    predict(X) : predict the class of the points in X
        X : numpy matrix, size = (n_samples, n_features)
        returns array-like, length = n_samples.

    predict_proba(X) : predict the probability that each point belongs to each class.
        X : numpy matrix, size = (n_samples, n_features)
        returns matrix, size = (n_samples, n_class)
    '''

    def __init__(self, K=10, eta0=1, beta=1, epoch=200, kernel='rbf',
                 gamma=None, degree=3, agg=1, verbose=True, progress=False,
                 compter=False, multi='ovr', augmenter=1, power=2 / 3):
        args, _, _, values = inspect.getargvalues(inspect.currentframe())
        values.pop("self")
        for arg, val in values.items():
            setattr(self, arg, val)
        binary_clf = log_kernel_MOM_binary(K, eta0, beta, epoch, gamma, degree,
                                           agg, verbose, progress, compter,
                                           power)
        if multi == "ovr":
            self.clf = OneVsRestClassifier(binary_clf)
        elif multi == "ovo":
            self.clf = OneVsOneClassifier(binary_clf)
        else:
            raise NameError('Multiclass meta-algorithm not known')

    def fit(self, X, y):
        self.X = X
        perm = np.array([])
        if self.kernel == 'poly':
            kfunc = lambda x, y: polynomial_kernel(x, y, degree=self.degree,
                                                   gamma=self.gamma)
        elif self.kernel == 'rbf':
            kfunc = lambda x, y: rbf_kernel(x, y, self.gamma)
        else:
            kfunc = self.kernel
        Kernel = kfunc(np.array(X), np.array(X))
        for f in range(self.augmenter):
            perm = np.hstack([perm, np.random.permutation(len(X))])
        self.perm = perm.astype(np.int64)
        self.clf.fit(Kernel[self.perm][:, self.perm], y[self.perm])
        return self

    def predict(self, xtest):
        if self.kernel == 'poly':
            kfunc = lambda x, y: polynomial_kernel(x, y, degree=self.degree,
                                                   gamma=self.gamma)
        elif self.kernel == 'rbf':
            kfunc = lambda x, y: rbf_kernel(x, y, self.gamma)
        else:
            kfunc = self.kernel
        KC = kfunc(xtest, self.X[self.perm])
        return self.clf.predict(KC)

    def predict_proba(self, xtest):
        if self.kernel == 'poly':
            kfunc = lambda x, y: polynomial_kernel(x, y, degree=self.degree,
                                                   gamma=self.gamma)
        elif self.kernel == 'rbf':
            kfunc = lambda x, y: rbf_kernel(x, y, self.gamma)
        else:
            kfunc = self.kernel
        KC = kfunc(xtest, self.X[self.perm])
        return self.clf.predict_proba(KC)

    def score(self, X, y):
        return np.mean(self.predict(X) == y)

    def set_params(self, **params):
        self.__init__(**params)
        return self
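# Illustrative sketch only: a toy multiclass fit with the estimator above.
# It assumes log_kernel_MOM_binary (not shown in this excerpt) and the usual
# module-level imports (numpy, sklearn.metrics.pairwise kernels) are
# available; the blob data is an arbitrary choice.
import numpy as np
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=150, centers=3, random_state=0)

clf = log_kernel_MOM(K=5, kernel='rbf', gamma=0.1, epoch=50, multi='ovo')
clf.fit(X, y)
print(clf.score(X, y))
# Note: with multi='ovo' the underlying OneVsOneClassifier does not expose
# predict_proba, so predict_proba() is only usable with multi='ovr'.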
class ClassifierRunner():
    def __init__(self, pipeline, clf, clf_name, example_indices,
                 selector_params=None, multiclass=None,
                 compare_classifiers=None):
        self.pipeline = pipeline
        self.clf = clf
        self.clf_name = clf_name
        self.example_indices = example_indices
        self.selector_params = selector_params
        self.selector_param_str = self.selector_params['combine'] + str(
            self.selector_params['threshold'])
        self.multiclass = multiclass
        self.compare_classifiers = compare_classifiers

    def fit(self, X_train_transformed, y_train, num_classes):
        self.num_classes = num_classes
        self.num_features = len(X_train_transformed[0])
        if hasattr(self.clf, 'reset'):
            # reset classifier from sklearn library
            self.clf = self.clf.reset()
        else:
            self.clf = base.clone(self.clf)
        if self.num_classes > 2:
            if self.multiclass == 'ovr':
                self.clf = OneVsRestClassifier(self.clf)
            else:
                self.clf = OneVsOneClassifier(self.clf)
        self.clf.fit(X_train_transformed, y_train)

    def run_prediction(self, X_test_transformed, y_test):
        # get predictions
        y_pred = self.clf.predict(X_test_transformed)
        self.proba = None
        if (self.num_classes == 2 or self.multiclass == 'ovr') \
                and self.clf.__class__.__name__ != 'SVC':
            self.proba = self.clf.predict_proba(X_test_transformed)

        # get ids of examples that were misclassified
        if self.compare_classifiers == 'mcnemar':
            for i in range(len(y_pred)):
                if y_pred[i] != y_test[i]:
                    self.pipeline.misclassified_map[self.clf_name].append(
                        self.example_indices[i])

        self.write_predictions(y_pred)
        self.write_metrics(y_test, y_pred)

    def write_predictions(self, y_pred):
        if self.proba is not None and y_pred is not None:
            for i in range(len(y_pred)):
                self.pipeline.prediction_scores[self.selector_param_str][
                    self.clf_name][self.example_indices[i]] = self.proba[i]
                self.pipeline.predictions[self.selector_param_str][
                    self.clf_name][self.example_indices[i]] = y_pred[i]

    def write_metrics(self, y_test, y_pred):
        # 'binary' averaging is only valid for two-class problems
        average = 'binary' if self.num_classes == 2 else 'micro'
        auc_score = []
        if self.proba is not None:
            if average == 'binary':
                auc_score = roc_auc_score(y_test, y_pred)
            else:
                y_bin = label_binarize(y_test, classes=range(self.num_classes))
                for i in range(self.num_classes):
                    y_temp = y_bin[:, i]
                    auc_score.append(roc_auc_score(y_temp, self.proba[:, i]))
                auc_score = str(auc_score)
        self.pipeline.results = self.pipeline.results.append(
            {
                'base clf': self.clf_name,
                'num features': self.num_features,
                'accuracy': accuracy_score(y_test, y_pred),
                'precision': precision_score(y_test, y_pred, average=average),
                'recall': recall_score(y_test, y_pred, average=average),
                'auc': auc_score if auc_score is not None else -1,
                'f1': f1_score(y_test, y_pred, average=average)
            },
            ignore_index=True)
        if self.selector_params is not None:
            for k in self.selector_params.keys():
                self.pipeline.results.loc[
                    len(self.pipeline.results) - 1, k] = self.selector_params[k]
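# Standalone illustration (synthetic data, not from the original pipeline) of
# the per-class AUC computation that write_metrics() performs above: binarize
# the multiclass labels, then score each probability column separately.
import numpy as np
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score

rng = np.random.RandomState(0)
num_classes = 3
y_test = rng.randint(0, num_classes, size=50)
proba = rng.dirichlet(np.ones(num_classes), size=50)  # stand-in for clf.predict_proba

y_bin = label_binarize(y_test, classes=np.arange(num_classes))
auc_per_class = [roc_auc_score(y_bin[:, i], proba[:, i])
                 for i in range(num_classes)]
print(auc_per_class)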
class perceptronMOM(BaseEstimator):
    '''Perceptron MOM classifier.

    Perceptron MOM risk minimization. The Perceptron minimizes the perceptron
    loss using SGD without regularization.

    Parameters
    ----------
    w0 : array-like, length = n_features + 1, default ones(n_features + 1)
        initial coefficients (including the intercept) of the classifier.

    K : int, default 10
        number of blocks for the computation of the MOM. A big value of K
        deals with more outliers but small values of K are better for the
        performance when there are no outliers.

    eta0 : float, default 1
        step size parameter, the step size at the i-th iteration is defined
        by 1/(1+eta0*i).

    epoch : int, default 200
        number of iterations before the end of the algorithm.

    mu : float between 0 and 1, default 0.95
        coefficient in the momentum.

    agg : int, default 1
        number of runs of the algorithm on which we aggregate. One might want
        to decrease this number if the complexity is a problem.

    compter : boolean, default False
        used for outlier detection, if compter=True, the number of times each
        point is used in the algorithm will be recorded in the attribute
        "counts".

    progress : boolean, default False
        display a progress bar to monitor the algorithm on each run
        (agg > 1 means several progress bars).

    verbose : boolean, default True
        display a message at the end of each run if agg > 1.

    multi : {'ovr','ovo'}, default 'ovr'
        method used to go from binary classification to multiclass
        classification. 'ovr' means "one vs the rest" and 'ovo' means
        "one vs one".

    Attributes
    ----------
    w0 : array-like, length = n_features + 1
        w0 is updated in the algorithm, provides the final coefficients of
        the decision function.

    counts : array-like, length = n_sample
        the i-th element records the number of times the i-th element of the
        training dataset X has been used. Only if compter=True.

    Methods
    -------
    fit(X, y) : fit the model
        X : numpy matrix, size = (n_samples, n_features)
        y : array-like, length = n_samples

    predict(X) : predict the class of the points in X
        X : numpy matrix, size = (n_samples, n_features)
        returns array-like, length = n_samples.

    predict_proba(X) : predict the probability that each point belongs to each class.
        X : numpy matrix, size = (n_samples, n_features)
        returns matrix, size = (n_samples, n_class)
    '''

    def __init__(self, w0=None, K=10, eta0=1, epoch=100, mu=0.95, agg=1,
                 compter=False, progress=False, verbose=True, multi='ovr'):
        binary_clf = perceptronMOM_binary(w0, K, eta0, epoch, mu, agg,
                                          compter, progress, verbose)
        args, _, _, values = inspect.getargvalues(inspect.currentframe())
        values.pop("self")
        for arg, val in values.items():
            setattr(self, arg, val)
        if multi == "ovr":
            self.clf = OneVsRestClassifier(binary_clf)
        elif multi == "ovo":
            self.clf = OneVsOneClassifier(binary_clf)
        else:
            raise NameError('Multiclass meta-algorithm not known')

    def fit(self, X, y):
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        return self.clf.predict(X)

    def predict_proba(self, X):
        return self.clf.predict_proba(X)

    def score(self, X, y):
        return np.mean(self.predict(X) == y)

    def set_params(self, **params):
        self.__init__(**params)
        return self
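# Illustrative sketch only: the wrapper above simply delegates to the chosen
# sklearn multiclass meta-estimator. Assumes perceptronMOM_binary (not shown
# in this excerpt) is importable from the same module; the synthetic data and
# hyperparameters are arbitrary choices.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_classes=3, n_informative=4,
                           random_state=0)

clf_ovr = perceptronMOM(K=5, epoch=50, multi='ovr').fit(X, y)
clf_ovo = perceptronMOM(K=5, epoch=50, multi='ovo').fit(X, y)
print(clf_ovr.score(X, y), clf_ovo.score(X, y))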
class SeCoEstimator(BaseEstimator, ClassifierMixin):
    """A classifier using rules learned with the *Separate-and-Conquer* (SeCo)
    algorithm, also known as *Covering* algorithm.

    Wraps `_BaseSeCoEstimator` to handle multi-class problems, selecting a
    multi-class strategy and making sure that `_BaseSeCoEstimator` always sees
    an integer range [0..n_classes_) of class labels, where 0 is the intended
    fallback class; i.e. the biggest class in multi-class problems, or the
    negative class when learning a binary concept.

    The concrete SeCo variant to run is defined by `algorithm_config`.

    Fields
    -----
    algorithm_config : subclass of SeCoAlgorithmConfiguration
        Defines the concrete SeCo algorithm to run, see
        :class:`SeCoAlgorithmConfiguration`.

    Parameters
    -----
    multi_class : callable or str or None
        Which strategy to use for non-binary problems. Possible values:

        - None: auto-select; use 'direct' if possible
          (`algorithm_config.direct_multiclass_support()` returns True),
          'one_vs_rest' otherwise.
        - A callable: Construct
          `self.base_estimator_ = multi_class(_BaseSeCoEstimator())` and
          delegate to that estimator. Useful if you want to roll a different
          binarization strategy, e.g.

          >>> import sklearn.multiclass, functools
          >>> multi_class=functools.partial(
          ...     sklearn.multiclass.OutputCodeClassifier,
          ...     code_size=0.7, random_state=42)

          If you use this, make sure to pass to `_BaseSeCoEstimator` classes
          `y` from an integer range [0..n_classes_), e.g. using `LabelEncoder`.
          Also be aware of class order influence on tie-breaking.
        - 'direct': Directly learn a theory of rules with different heads
          (target classes). Uses :class:`BySizeLabelEncoder` internally.
        - 'one_vs_rest': Use `sklearn.multiclass.OneVsRestClassifier` for class
          binarization and learn binary theories.
        - 'one_vs_one': Use `sklearn.multiclass.OneVsOneClassifier` for class
          binarization and learn binary theories.
        - TODO: multi_class strategy of ripper: OneVsRest, remove C_i after
          learning rules for it

    random_state : None | int | instance of np.random.RandomState
        RNG, may be used by the algorithm. Value passed through
        `sklearn.utils.check_random_state`.

    n_jobs : int, optional
        Passed to `OneVsRestClassifier` or `OneVsOneClassifier` if these are
        used.

    Attributes
    -----
    base_estimator_ : estimator instance
        The estimator object that all tasks are delegated to. One of
        `sklearn.multiclass.OneVsRestClassifier`,
        `sklearn.multiclass.OneVsOneClassifier` or
        `sklearn_seco.util.TargetTransformingMetaEstimator` if demanded by the
        `multi_class_` strategy, a `_BaseSeCoEstimator` otherwise.

    multi_class_ : callable or str
        The actual strategy used on a non-binary problem. Relevant if
        `multi_class=None` demanded auto-selection.

    classes_ : np.ndarray
        `np.unique(y)`

    See Also
    -----
    `_BaseSeCoEstimator`
    """

    algorithm_config: Type[SeCoAlgorithmConfiguration]

    # TODO: _BaseSeCoEstimator.export_text equivalent inverting binarization
    #   & target transformation for display

    def _more_tags(self):
        # tell sklearn >= 0.21 that we can handle categorical data
        return {'X_types': ['2darray', 'categorical'], 'allow_nan': True}

    def __init__(self, multi_class=None, random_state=1, n_jobs=1):
        self.multi_class = multi_class
        self.random_state = random_state
        self.n_jobs = n_jobs

    def fit(self, X, y, **kwargs):
        """Learn SeCo theory/theories on training data `X, y`.
        For possible parameters (`**kwargs`), refer to
        :class:`_BaseSeCoEstimator`.
        """
        X, y = check_X_y(X, y, force_all_finite='allow-nan')
        self.multi_class_ = self.multi_class
        self.base_estimator_ = _BaseSeCoEstimator(
            self.algorithm_config, random_state=self.random_state, **kwargs)
        # NOTE: if using multiprocessing (e.g. through OvO or OvR), all
        #   sub-estimators share the same random seed/state.
        #   I think this should not harm.

        def wrapper_ordering_classes_by_size(estimator):
            # BySizeLabelEncoder ensures: first class = default = biggest
            # and that classes form an integer range [0..n_classes_)
            return TargetTransformingMetaEstimator(BySizeLabelEncoder(),
                                                   estimator)

        self.classes_ = np.unique(y)
        n_classes_ = self.classes_.size
        if n_classes_ == 1:
            raise ValueError("SeCoEstimator requires 2 or more distinct "
                             "classes. Only 1 class (%s) present."
                             % self.classes_[0])
        elif n_classes_ == 2:
            self.base_estimator_ = wrapper_ordering_classes_by_size(
                self.base_estimator_)
        else:  # n_classes_ > 2
            if self.multi_class_ is None:
                # default / auto-selection
                if self.algorithm_config.direct_multiclass_support():
                    self.multi_class_ = "direct"
                else:
                    self.multi_class_ = "one_vs_rest"
            if callable(self.multi_class_):
                self.base_estimator_ = self.multi_class_(self.base_estimator_)
            elif self.multi_class_ == "one_vs_rest":
                self.base_estimator_ = OneVsRestClassifier(
                    self.base_estimator_, n_jobs=self.n_jobs)
            elif self.multi_class_ == "one_vs_one":
                self.base_estimator_ = OneVsOneClassifier(
                    self.base_estimator_, n_jobs=self.n_jobs)
            elif self.multi_class_ == "direct":
                # TODO: if self.multi_class=='direct' (not `None` auto-detect),
                #   only assertion prevents binary-only learner to silently
                #   learn on multiclass training data
                self.base_estimator_ = wrapper_ordering_classes_by_size(
                    self.base_estimator_)
            else:
                raise ValueError("Unknown multi-class mode %s"
                                 % self.multi_class_)
        # NOTE: param categorical_features is data dependent, but OvR/OvO don't
        #   pass extra parameters through fit(), so it has to be in
        #   `_BaseSeCoEstimator.__init__`.
        self.base_estimator_.fit(X, y)
        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array, shape = (n_samples,)
            Predicted target values for X, values are from ``classes_``
        """
        check_is_fitted(self, ["classes_"])
        X = check_array(X, force_all_finite='allow-nan')
        return self.base_estimator_.predict(X)

    @if_delegate_has_method('base_estimator_')
    def predict_proba(self, X):
        # noinspection PyUnresolvedReferences
        return self.base_estimator_.predict_proba(X)

    @if_delegate_has_method('base_estimator_')
    def decision_function(self, X):
        # noinspection PyUnresolvedReferences
        return self.base_estimator_.decision_function(X)

    def get_seco_estimators(self) -> Sequence[_BaseSeCoEstimator]:
        """
        :return: The `_BaseSeCoEstimator` instances that were trained.
            Depending on the multi-class strategy, the class labels they use
            differ in order and value.
            Cannot be used when self.multi_class_ is a callable.
        """
        check_is_fitted(self, 'base_estimator_')
        is_binary = len(self.classes_) == 2
        if is_binary or self.multi_class_ == "direct":
            assert isinstance(self.base_estimator_,
                              TargetTransformingMetaEstimator)
            return [self.base_estimator_.estimator]
        elif self.multi_class_ == "one_vs_rest":
            assert isinstance(self.base_estimator_, OneVsRestClassifier)
            return self.base_estimator_.estimators_
        elif self.multi_class_ == "one_vs_one":
            assert isinstance(self.base_estimator_, OneVsOneClassifier)
            return self.base_estimator_.estimators_
        else:
            assert False, "invalid state: unknown type of base_estimator_ " \
                          f"({str(self.base_estimator_)})"
class RobustWeightedClassifier(BaseEstimator, ClassifierMixin):
    """Algorithm for robust classification using reweighting algorithm.

    This model uses iterative reweighting of samples to make a regression or
    classification estimator robust.

    The principle of the algorithm is to use an empirical risk minimization
    principle where the risk is estimated using a robust estimator (for
    example Huber estimator or median-of-means estimator)[1], [3]. The idea
    behind this algorithm was mentioned before in [2]. This idea translates
    into an iterative algorithm where the sample_weight are changed at each
    iteration and are dependent on the sample. Informally the outliers should
    have small weight while the inliers should have big weight, where outliers
    are samples with a big loss function.

    This algorithm enjoys a non-zero breakdown-point (it can handle
    arbitrarily bad outliers). When the "mom" weighting scheme is used, k
    outliers can be tolerated. When the "Huber" weighting scheme is used,
    asymptotically the number of outliers has to be less than half the
    sample size.

    Read more in the :ref:`User Guide <robust>`.

    Parameters
    ----------
    weighting : string, default="huber"
        Weighting scheme used to make the estimator robust.
        Can be 'huber' for huber-type weights or 'mom' for median-of-means
        type weights.

    max_iter : int, default=100
        Maximum number of iterations.
        For more information, see the optimization scheme of base_estimator
        and the eta0 and burn_in parameter.

    burn_in : int, default=10
        Number of steps used without changing the learning rate.
        Can be useful to make the weight estimation better at the beginning.

    eta0 : float, default=0.01
        Constant step-size used during the burn_in period. Used only if
        burn_in>0. Can have a big effect on efficiency.

    c : float>0 or None, default=None
        Parameter used for Huber weighting procedure, used only if weightings
        is 'huber'. Measure the robustness of the weighting procedure. A
        small value of c means a more robust estimator.
        Can have a big effect on efficiency.
        If None, c is estimated at each step using half the Inter-quartile
        range, this tends to be conservative (robust).

    k : int < sample_size/2, default=1
        Parameter used for mom weighting procedure, used only if weightings
        is 'mom'. 2k+1 is the number of blocks used for median-of-means
        estimation, higher value of k means a more robust estimator.
        Can have a big effect on efficiency.
        If None, k is estimated using the number of points distant from the
        median of means of more than 2 times a robust estimate of the scale
        (using the inter-quartile range), this tends to be conservative
        (robust).

    loss : string, None or callable, default="log"
        Name of the loss used, must be the same loss as the one optimized in
        base_estimator.
        Classification losses supported : 'log', 'hinge'.
        If 'log', then the base_estimator must support predict_proba.
        Regression losses supported : 'squared_loss', .

    sgd_args : dict, default={}
        arguments of the SGDClassifier base estimator.

    multi_class : string, default="ovr"
        multi-class scheme. Can be either "ovo" for OneVsOneClassifier or
        "ovr" for OneVsRestClassifier or "binary" for binary classification.

    n_jobs : int, default=1
        number of jobs used in the multi-class meta-algorithm computation.

    tol : float or None, (default = 1e-3)
        The stopping criterion. If it is not None, training will stop when
        (loss > best_loss - tol) for n_iter_no_change consecutive epochs.

    n_iter_no_change : int, default=10
        Number of iterations with no improvement to wait before early
        stopping.

    random_state : int, RandomState instance or None, optional (default=None)
        The seed of the pseudo random number generator to use when shuffling
        the data. If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random number
        generator; If None, the random number generator is the RandomState
        instance used by np.random.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes, )
        A list of class labels known to the classifier.

    coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)
        Coefficient of the features in the decision function.
        Only available if multi_class = "binary"

    intercept_ : ndarray of shape (1,) or (n_classes,)
        Intercept (a.k.a. bias) added to the decision function.
        Only available if multi_class = "binary"

    n_iter_ : ndarray of shape (n_classes,) or (1, )
        Actual number of iterations for all classes. If binary or multinomial,
        it returns only 1 element. For liblinear solver, only the maximum
        number of iteration across all classes is given.

    base_estimator_ : object,
        The fitted base estimator SGDClassifier.

    weights_ : array like, length = n_sample.
        Weight of each sample at the end of the algorithm. Can be used as a
        measure of how much of an outlier a sample is. Only available if
        multi_class = "binary"

    Notes
    -----
    Often, there is a need to use RobustScaler as preprocessing.

    Examples
    --------
    >>> from sklearn_extra.robust import RobustWeightedClassifier
    >>> from sklearn.datasets import make_blobs
    >>> import numpy as np
    >>> rng = np.random.RandomState(42)
    >>> X, y = make_blobs(n_samples=100, centers=np.array([[-1, -1], [1, 1]]),
    ...                   random_state=rng)
    >>> clf = RobustWeightedClassifier()
    >>> _ = clf.fit(X, y)
    >>> score = np.mean(clf.predict(X) == y)

    References
    ----------
    [1] Guillaume Lecué, Matthieu Lerasle and Timothée Mathieu.
        "Robust classification via MOM minimization", Mach Learn 109, (2020).
        https://doi.org/10.1007/s10994-019-05863-6 (2018). arXiv:1808.03106

    [2] Christian Brownlees, Emilien Joly and Gábor Lugosi.
        "Empirical risk minimization for heavy-tailed losses", Ann. Statist.
        Volume 43, Number 6 (2015), 2507-2536.

    [3] Stanislav Minsker and Timothée Mathieu.
        "Excess risk bounds in robust empirical risk minimization"
        arXiv preprint (2019). arXiv:1910.07485.
    """

    def __init__(
        self,
        weighting="huber",
        max_iter=100,
        burn_in=10,
        eta0=0.01,
        c=None,
        k=0,
        loss="log",
        sgd_args=None,
        multi_class="ovr",
        n_jobs=1,
        tol=1e-3,
        n_iter_no_change=10,
        random_state=None,
    ):
        self.weighting = weighting
        self.max_iter = max_iter
        self.burn_in = burn_in
        self.eta0 = eta0
        self.c = c
        self.k = k
        self.loss = loss
        self.sgd_args = sgd_args
        self.multi_class = multi_class
        self.n_jobs = n_jobs
        self.tol = tol
        self.n_iter_no_change = n_iter_no_change
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the model to data matrix X and target(s) y.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            The input data.

        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels in classification, real numbers in
            regression).

        Returns
        -------
        self : returns an estimator trained with RobustWeightedClassifier.
        """
        if self.sgd_args is None:
            sgd_args = {}
        else:
            sgd_args = self.sgd_args

        # Define the base estimator
        base_robust_estimator_ = _RobustWeightedEstimator(
            SGDClassifier(**sgd_args, loss=self.loss),
            weighting=self.weighting,
            loss=self.loss,
            burn_in=self.burn_in,
            c=self.c,
            k=self.k,
            eta0=self.eta0,
            max_iter=self.max_iter,
            tol=self.tol,
            n_iter_no_change=self.n_iter_no_change,
            random_state=self.random_state,
        )

        if self.multi_class == "ovr":
            self.base_estimator_ = OneVsRestClassifier(
                base_robust_estimator_, n_jobs=self.n_jobs
            )
        elif self.multi_class == "binary":
            self.base_estimator_ = base_robust_estimator_
        elif self.multi_class == "ovo":
            self.base_estimator_ = OneVsOneClassifier(
                base_robust_estimator_, n_jobs=self.n_jobs
            )
        else:
            raise ValueError("No such multiclass method implemented.")

        self.base_estimator_.fit(X, y)
        if self.multi_class == "binary":
            self.weights_ = self.base_estimator_.weights_
            self.coef_ = self.base_estimator_.coef_
            self.intercept_ = self.base_estimator_.intercept_
        self.n_iter_ = self.max_iter * len(X)
        self.classes_ = self.base_estimator_.classes_
        return self

    def predict(self, X):
        """Predict using the estimator trained with RobustWeightedClassifier.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.

        Returns
        -------
        y : array-like, shape (n_samples, n_outputs)
            The predicted values.
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.predict(X)

    def _check_proba(self):
        if self.loss != "log":
            raise AttributeError(
                "Probability estimates are not available for"
                " loss=%r" % self.loss
            )

    @property
    def predict_proba(self):
        """
        Probability estimates when binary classification.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Vector to be scored, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        Returns
        -------
        T : array-like of shape (n_samples, n_classes)
            Returns the probability of the sample for each class in the model.
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        self._check_proba()
        return self._predict_proba

    def _predict_proba(self, X):
        return self.base_estimator_.predict_proba(X)

    @property
    def _estimator_type(self):
        return self.base_estimator._estimator_type

    def score(self, X, y=None):
        """Returns the score on the given data, using
        ``base_estimator_.score``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like of shape (n_samples, n_output) or (n_samples,), optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        Returns
        -------
        score : float
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.score(X, y)

    def decision_function(self, X):
        """Predict using the linear model

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)

        Returns
        -------
        array, shape (n_samples,)
            Predicted target values per element in X.
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.decision_function(X)