def fit_transform(self, X, y=None):
    """
    Fit an sklearn classifier to data

    Parameters
    ----------
    X : pandas dataframe or array-like
        training samples
    y : array-like, required for array-like X and not used presently for pandas dataframe
        class labels

    Returns
    -------
    self: object
    """
    if isinstance(X, pd.DataFrame):
        df = X
        (X, y, self.vectorizer) = self.convert_numpy(df)
    else:
        check_X_y(X, y)

    self.clf.fit(X, y)
    return self
def check_consistent_shape(X_train, y_train, X_test, y_test,
                           y_train_pred, y_test_pred):
    """Internal function to check that the input data shapes are consistent.

    Parameters
    ----------
    X_train : numpy array of shape (n_samples, n_features)
        The training samples.
    y_train : list or array of shape (n_samples,)
        The ground truth of training samples.
    X_test : numpy array of shape (n_samples, n_features)
        The test samples.
    y_test : list or array of shape (n_samples,)
        The ground truth of test samples.
    y_train_pred : numpy array of shape (n_samples, n_features)
        The predicted binary labels of the training samples.
    y_test_pred : numpy array of shape (n_samples, n_features)
        The predicted binary labels of the test samples.

    Returns
    -------
    X_train : numpy array of shape (n_samples, n_features)
        The training samples.
    y_train : list or array of shape (n_samples,)
        The ground truth of training samples.
    X_test : numpy array of shape (n_samples, n_features)
        The test samples.
    y_test : list or array of shape (n_samples,)
        The ground truth of test samples.
    y_train_pred : numpy array of shape (n_samples, n_features)
        The predicted binary labels of the training samples.
    y_test_pred : numpy array of shape (n_samples, n_features)
        The predicted binary labels of the test samples.
    """
    # check input data shapes are consistent
    X_train, y_train = check_X_y(X_train, y_train)
    X_test, y_test = check_X_y(X_test, y_test)

    y_test_pred = column_or_1d(y_test_pred)
    y_train_pred = column_or_1d(y_train_pred)

    check_consistent_length(y_train, y_train_pred)
    check_consistent_length(y_test, y_test_pred)

    if X_train.shape[1] != X_test.shape[1]:
        raise ValueError("X_train {0} and X_test {1} have different number "
                         "of features.".format(X_train.shape, X_test.shape))

    return X_train, y_train, X_test, y_test, y_train_pred, y_test_pred
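# A minimal usage sketch for the shape check above; check_consistent_shape itself plus
# check_X_y, column_or_1d and check_consistent_length are assumed to come from this
# module / sklearn.utils, and the toy shapes are illustrative only.
import numpy as np

X_train, X_test = np.random.rand(8, 3), np.random.rand(4, 3)
y_train, y_test = np.zeros(8), np.zeros(4)
y_train_pred, y_test_pred = np.zeros(8), np.zeros(4)

# Passes silently when the shapes agree; raises ValueError if X_train and X_test
# disagree on the number of features.
checked = check_consistent_shape(X_train, y_train, X_test, y_test,
                                 y_train_pred, y_test_pred)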
def fit(self, X, y=None):
    """Fit a model.

    Parameters
    ----------
    X : pandas dataframe or array-like
        training samples. If a pandas dataframe, can handle a dict of features in one
        column or convert a set of columns
    y : array-like, required for array-like X and not used presently for pandas dataframe
        class labels

    Returns
    -------
    self: object
    """
    if isinstance(X, pd.DataFrame):
        df = X
        if not self.dict_feature is None:
            if not self.target_readable is None:
                self.create_class_id_map(df, self.target, self.target_readable)
            (X, y) = self._load_from_dict(df)
            num_class = len(np.unique(y))
        else:
            (X, y, self.vectorizer) = self.convert_numpy(df)
            num_class = len(y.unique())
    else:
        check_X_y(X, y)
        num_class = len(np.unique(y))

    self.clf = xgb.XGBClassifier(**self.params)
    print(self.clf.get_params(deep=True))
    self.clf.fit(X, y, verbose=True)
    return self
def test_check_array_warn_on_dtype_deprecation():
    X = np.asarray([[0.0], [1.0]])
    Y = np.asarray([[2.0], [3.0]])
    with pytest.warns(DeprecationWarning, match="'warn_on_dtype' is deprecated"):
        check_array(X, warn_on_dtype=True)
    with pytest.warns(DeprecationWarning, match="'warn_on_dtype' is deprecated"):
        check_X_y(X, Y, warn_on_dtype=True)
def fit(self, X, y=None):
    """Derived from https://github.com/fchollet/keras/blob/master/keras/wrappers/scikit_learn.py
    Adds:
      Handling pandas inputs
      Saving of model into the class to allow for easy pickling

    Parameters
    ----------
    X : pandas dataframe or array-like
        training samples
    y : array-like, required for array-like X and not used presently for pandas dataframe
        class labels

    Returns
    -------
    self: object
    """
    if isinstance(X, pd.DataFrame):
        df = X
        (X, y, self.vectorizer) = self.convert_numpy(df)
    else:
        check_X_y(X, y)

    input_width = X.shape[1]
    num_classes = len(y.unique())
    logger.info("input_width %d", input_width)
    logger.info("num_classes %d", num_classes)
    train_y = np_utils.to_categorical(y, num_classes)
    self.model = self.model_create(input_width, num_classes)

    if len(y.shape) == 1:
        self.classes_ = list(np.unique(y))
        if self.loss == 'categorical_crossentropy':
            y = to_categorical(y)
    else:
        self.classes_ = np.arange(0, y.shape[1])

    if self.compiled_model_ is None:
        self.compiled_model_ = copy.deepcopy(self.model)
        self.compiled_model_.compile(optimizer=self.optimizer, loss=self.loss)

    history = self.compiled_model_.fit(
        X, y, batch_size=self.train_batch_size, nb_epoch=self.nb_epoch,
        verbose=self.verbose, shuffle=self.shuffle,
        show_accuracy=self.show_accuracy,
        validation_split=self.validation_split,
        validation_data=self.validation_data,
        callbacks=self.callbacks)

    self.config_ = self.model.to_json()
    self.compiled_model_.save_weights(self.tmp_model)
    with open(self.tmp_model, mode='rb') as file:  # b is important -> binary
        self.model_saved = file.read()
    return self
def fit(self, X, y=None):
    """Convert data to vw lines and then train for required iterations

    Parameters
    ----------
    X : pandas dataframe or array-like
        training samples
    y : array-like, required for array-like X and not used presently for pandas dataframe
        class labels

    Returns
    -------
    self: object

    Caveats:
    1. A seldon specific fork of wabbit_wappa is needed to allow vw to run in server
       mode without save_resume. save_resume seems to cause issues with the scores
       returned. Maybe connected to
       https://github.com/JohnLangford/vowpal_wabbit/issues/262
    """
    if isinstance(X, pd.DataFrame):
        df = X
        df_base = self._exclude_include_features(df)
        df_base = df_base.fillna(0)
    else:
        check_X_y(X, y)
        df = pd.DataFrame(X)
        df_y = pd.DataFrame(y, columns=list('y'))
        self.target = 'y'
        df_base = pd.concat([df, df_y], axis=1)
        print(df_base.head())

    min_target = df_base[self.target].astype(float).min()
    print("min target ", min_target)
    if min_target == 0:
        self.zero_based = True
    else:
        self.zero_based = False
    if not self.target_readable is None:
        self.create_class_id_map(df, self.target, self.target_readable,
                                 zero_based=self.zero_based)

    self.num_classes = len(df_base[self.target].unique())
    print("num classes ", self.num_classes)
    self._start_vw_if_needed("train")
    df_vw = df_base.apply(self._convert_row, axis=1)
    for i in range(0, self.num_iterations):
        for (index, val) in df_vw.iteritems():
            self.vw.send_line(val, parse_result=False)
    self._save_model(self.model_file)
    return self
def fit(self, X, y):
    """Fit joint quantile regression model.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training data.
    y : {array-like}, shape = [n_samples]
        Target values.

    Returns
    -------
    self : returns an instance of self.
    """
    if self.eps > 0 and self.nc_const:
        raise UserWarning("eps is considered null because you chose to "
                          "enforce non-crossing constraints.")
    X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], y_numeric=True)
    y = asarray(y).flatten()
    self._validate_params()

    self.linop_ = self._get_kernel_map(X)
    gram = self.linop_.Gram_dense(X)
    self.reg_c_ = 1. / self.lbda

    # Solve the optimization problem
    # probs = asarray(self.probs).reshape((-1, 1))
    probs = asarray(self.probs).flatten()
    if self.nc_const:
        self._qp_nc(gram, y, probs)
    else:
        self._coneqp(gram, y, probs)
    return self
def fit(self, X, y=None):
    """Fit the model using X as training data.

    Parameters
    ----------
    X : {array-like, sparse matrix}, optional
        Training data. If array or matrix, shape = [n_samples, n_features].
        If X is None, a "lazy fitting" is performed. If kneighbors is called,
        the fitting with the data given there is done. Also the caching of
        computed hash values is deactivated in this case.
    y : list, optional (default = None)
        List of classes for the given input of X. Size has to be n_samples."""
    if y is not None:
        self._y_is_csr = True
        _, self._y = check_X_y(X, y, "csr", multi_output=True)
        if self._y.ndim == 1 or self._y.shape[1] == 1:
            self._y_is_csr = False
    else:
        self._y_is_csr = False
    X_csr = csr_matrix(X)

    self._index_elements_count = X_csr.shape[0]
    instances, features = X_csr.nonzero()
    maxFeatures = int(max(X_csr.getnnz(1)))

    data = X_csr.data

    # returns a pointer to the inverse index stored in c++
    self._pointer_address_of_nearestNeighbors_object = _nearestNeighbors.fit(
        instances.tolist(), features.tolist(), data.tolist(),
        X_csr.shape[0], maxFeatures,
        self._pointer_address_of_nearestNeighbors_object)
def fit(self, X, y):
    '''
    Fits variational relevance vector regression

    Parameters
    -----------
    X: array-like of size [n_samples, n_features]
       Training data, matrix of explanatory variables

    y: array-like of size [n_samples, n_features]
       Target values

    Returns
    -------
    self : object
        Returns self.
    '''
    X, y = check_X_y(X, y, dtype=np.float64)
    # kernelise features
    K = self._get_kernel(X, X)
    # use fit method of RegressionARD
    _ = super(VRVR, self).fit(K, y)
    self.relevant_ = np.where(self.active_ == True)[0]
    if X.ndim == 1:
        self.relevant_vectors_ = X[self.relevant_]
    else:
        self.relevant_vectors_ = X[self.relevant_, :]
    return self
def anotherfit(self, X, y):
    X, y = check_X_y(X, y)
    GaussianNB.fit(self, X, y)
    for name in self.equivalent:
        super(GaussianNB, self).__setattr__(
            name, self.__getattribute__(self.equivalent[name]))
def my_smote(X, y, minority_target=None, per=0.5):
    """
    This is an implementation of SMOTE - Synthetic Minority Over-sampling Technique,
    and the variations Borderline SMOTE 1, 2 and SVM-SMOTE.

    :param X: nd-array, sparse matrix, shape=[n_samples, n_features]
    :param y: nd-array, list, shape=[n_samples]
    :param minority_target: list
    :param per: float, proportion used to compute how much over-sampling is generated
    :return:
    """
    X, Y = check_X_y(X, y, 'csr')
    unique_label = list(set(Y))
    label_count = [np.sum(Y == i) for i in unique_label]

    if minority_target is None:
        minority_index = [np.argmin(label_count)]
    else:
        minority_index = [unique_label.index(target) for target in minority_target]

    majority = np.max(label_count)

    for i in minority_index:
        N = (int((majority * 1.0 / (1 - per) - majority) / label_count[i]) - 1) * 100
        safe, synthetic, danger = _smote._borderlineSMOTE(X, Y, unique_label[i], N, k=5)
        syn_label = np.array([unique_label[i]] * synthetic.shape[0])
        X = sp.vstack([X, synthetic])
        Y = np.concatenate([Y, syn_label])

    return X, Y
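# Hypothetical usage sketch for my_smote above (assumes it is importable from this
# module); the toy class sizes are illustrative only.
import numpy as np
from scipy import sparse

X = sparse.csr_matrix(np.random.rand(60, 5))
y = np.array([0] * 50 + [1] * 10)   # class 1 is the minority

# Over-sample the minority class towards the proportion implied by `per`.
X_res, y_res = my_smote(X, y, minority_target=[1], per=0.5)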
def fit(self, X, y):
    X, y = check_X_y(X, y)
    print("c=%s, cov_algo=%s" % (self.c, self.cov_algo))
    classes = np.unique(y)
    self.classes_ = np.unique(y)
    n_classes = len(self.classes_)
    self.class_prior_ = np.zeros(n_classes)
    self.class_count_ = np.zeros(n_classes)
    unique_y = np.unique(y)

    for y_i in unique_y:
        i = classes.searchsorted(y_i)
        X_i = X[y == y_i, :]
        sw_i = None
        N_i = X_i.shape[0]
        self.class_count_[i] += N_i

    self.class_prior_[:] = self.class_count_ / np.sum(self.class_count_)
    self.priors = self.class_prior_
    self.posteriors = []

    for klass in self.classes_:
        examples = self._examples_for_class(klass, X, y)
        mean = np.array(examples.mean(0))[0]
        cov = self._cov(examples)
        cov_smoothed = cov + (self.c * np.eye(mean.shape[0]))
        p_x = multivariate_normal(mean=mean, cov=cov_smoothed)
        self.posteriors.append(p_x)

    return self
def fit(self, X, y):
    """Fit ORFF ridge regression model.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training data.
    y : {array-like}, shape = [n_samples] or [n_samples, n_targets]
        Target values.

    Returns
    -------
    self : returns an instance of self.
    """
    X, y = check_X_y(X, y, ['csr', 'csc', 'coo'],
                     y_numeric=True, multi_output=True)
    self._validate_params()
    self.p = y.shape[1] if y.ndim > 1 else 1

    solver_params = self.solver_params or {}

    self.linop_ = self._get_kernel(X, y)
    self.phix_ = self.linop_.get_orff_map(X, self.D)
    risk = ORFFRidgeRisk(self.lbda, 'LS')
    self.solver_res_ = minimize(
        risk.functional_grad_val,
        zeros(self.phix_.shape[1], dtype=X.dtype),
        args=(y.ravel(), self.phix_, self.linop_),
        method=self.solver, jac=True, options=solver_params)
    self.coefs_ = self.solver_res_.x
    return self
def fit(self, x, y):
    """
    Constructs GAM model(s) to predict y from X

    x: 1 or 2 dimensional array of predictor values with each row being one observation
    y: 1 or 2 dimensional array of predicted values (a GAM model is constructed for
       each output if y is 2 dimensional)
    """
    # Input validation for standard estimators using sklearn utils
    x, y = check_X_y(x, y, accept_sparse=["csr", "csc", "coo"], multi_output=True)
    # Convert to R matrices
    if x.ndim == 1:
        # If we're only looking at one x at a time, shape[1] will give an error for
        # one-dimensional arrays. Sklearn input validation doesn't change that.
        rX = r.matrix(x, nrow=x.shape[0], ncol=1)
    else:
        rX = r.matrix(x, nrow=x.shape[0], ncol=x.shape[1])
    if y.ndim == 1:
        # Same issue for a one-dimensional y
        rY = r.matrix(y, nrow=y.shape[0], ncol=1)
    else:
        rY = r.matrix(y, nrow=y.shape[0], ncol=y.shape[1])
    # Compute models (one for each column in y)
    self.gammodels = self.computeGAM(rX, rY)
    return self
def _check_params(self, X, y):
    # checking input data and scaling it if y is continuous
    X, y = check_X_y(X, y)

    if not self.categorical:
        ss = StandardScaler()
        X = ss.fit_transform(X)
        y = ss.fit_transform(y)

    # sanity checks
    methods = ['JMI', 'JMIM', 'MRMR']
    if self.method not in methods:
        raise ValueError('Please choose one of the following methods:\n' +
                         '\n'.join(methods))

    if not isinstance(self.k, int):
        raise ValueError("k must be an integer.")
    if self.k < 1:
        raise ValueError('k must be larger than 0.')
    if self.categorical and np.any(self.k > np.bincount(y)):
        raise ValueError('k must be smaller than your smallest class.')

    if not isinstance(self.categorical, bool):
        raise ValueError('Categorical must be Boolean.')
    if self.categorical and np.unique(y).shape[0] > 5:
        print('Are you sure y is categorical? It has more than 5 levels.')

    if not self.categorical and self._isinteger(y):
        print('Are you sure y is continuous? It seems to be discrete.')
    if self._isinteger(X):
        print('The values of X seem to be discrete. MI_FS will treat them '
              'as continuous.')
    return X, y
def fit(self, X, y):
    '''
    Fits L2VM model

    Parameters
    -----------
    X: numpy array of size 'n x m'
       Matrix of explanatory variables

    y: numpy array of size 'n x 1'
       Vector of dependent variable

    Returns
    -------
    obj: self
       self
    '''
    X, y = check_X_y(X, y, dtype=np.float64)
    K = get_kernel(X, X, self.gamma, self.degree, self.coef0,
                   self.kernel, self.kernel_params)
    self._model = LogisticRegression(penalty="l1",
                                     dual=False,
                                     C=self.C,
                                     tol=self.tol,
                                     fit_intercept=self.fit_intercept,
                                     intercept_scaling=self.intercept_scaling,
                                     n_jobs=self.n_jobs,
                                     solver='liblinear',
                                     multi_class='ovr',
                                     max_iter=self.max_iter,
                                     verbose=self.verbose,
                                     random_state=self.random_state)
    self._model = self._model.fit(K, y)
    self.relevant_indices_ = [np.where(coefs != 0)[0] for coefs in self._model.coef_]
    self.relevant_vectors_ = [X[rvi, :] for rvi in self.relevant_indices_]
    self.classes_ = self._model.classes_
    return self
def fit(self, X, y):
    '''
    Fits ElasticNet Regression with kernelised features

    Parameters
    ----------
    X: array-like of size [n_samples, n_features]
       Matrix of explanatory variables

    y: array-like of size (n_samples,)
       Vector of dependent variable

    Returns
    -------
    obj: self
       self
    '''
    X, y = check_X_y(X, y, dtype=np.float64)
    K = get_kernel(X, X, self.gamma, self.degree, self.coef0,
                   self.kernel, self.kernel_params)
    model = ElasticNet(self.alpha, self.l1_ratio, self.fit_intercept,
                       self.normalize, self.precompute, self.max_iter,
                       self.copy_X, self.tol, self.warm_start, self.positive,
                       self.random_state, self.selection)
    self._model = model.fit(K, y)
    self.relevant_indices_ = np.where(self._model.coef_ != 0)[0]
    self.relevant_vectors_ = X[self.relevant_indices_, :]
    return self
def fit(self, X, y):
    X, y = check_X_y(X, y, dtype=np.int64, accept_sparse='csr')
    n_rows = X.shape[0]
    self.classes_ = np.unique(y)

    if sp.sparse.issparse(X):
        if self.debug:
            print('Features are sparse, choosing faster learning')

        self.classifier = TimblClassifier(
            self.prefix,
            "-a{} -k{} -N{} -vf".format(self.algorithm, self.k, X.shape[1]),
            format='Sparse', debug=True, sklearn=True,
            flushdir=self.flushdir, flushthreshold=20000,
            normalize=self.normalize)

        for i in range(n_rows):
            sparse = ['({},{})'.format(i + 1, c)
                      for i, c in zip(X[i].indices, X[i].data)]
            self.classifier.append(sparse, str(y[i]))
    else:
        self.classifier = TimblClassifier(
            self.prefix,
            "-a{} -k{} -N{} -vf".format(self.algorithm, self.k, X.shape[1]),
            debug=True, sklearn=True, flushdir=self.flushdir,
            flushthreshold=20000, normalize=self.normalize)

        if y.dtype != 'O':
            y = y.astype(str)

        for i in range(n_rows):
            self.classifier.append(list(X[i].toarray()[0]), y[i])

    self.classifier.train()
    return self
def sample(self, X, y):
    """Resample the dataset.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.
    y : array-like, shape (n_samples,)
        Corresponding label for each sample in X.

    Returns
    -------
    X_resampled : {ndarray, sparse matrix}, shape (n_samples_new, n_features)
        The array containing the resampled data.
    y_resampled : ndarray, shape (n_samples_new)
        The corresponding label of `X_resampled`
    """
    # Check the consistency of X and y
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])

    check_is_fitted(self, 'ratio_')
    self._check_X_y(X, y)

    return self._sample(X, y)
def fit(self, X, y):
    """Fit ONORMA model.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training data.
    y : {array-like}, shape = [n_samples] or [n_samples, n_targets]
        Target values.

    Returns
    -------
    self : returns an instance of self.
    """
    X, y = check_X_y(X, y, False, y_numeric=True, multi_output=True)
    self._validate_params()

    self.T_ = X.shape[0] if self.T is None else self.T
    self.t_ = 0

    if y.ndim > 1:
        self.coefs_ = zeros(self.T_ * y.shape[1])
        for i in range(self.T_):
            idx = i % X.shape[0]
            self.partial_fit(X[idx, :], y[idx, :])
    else:
        self.coefs_ = zeros(self.T_)
        for i in range(self.T_):
            idx = i % X.shape[0]
            self.partial_fit(X[idx, :], y[idx])
    return self
def fit(self, X, y):
    '''
    Fit Relevance Vector Regression Model

    Parameters
    -----------
    X: {array-like, sparse matrix} of size [n_samples, n_features]
       Training data, matrix of explanatory variables

    y: array-like of size [n_samples, n_features]
       Target values

    Returns
    -------
    self: object
       self
    '''
    X, y = check_X_y(X, y, accept_sparse=['csr', 'coo', 'bsr'],
                     dtype=np.float64)
    # kernelise features
    K = get_kernel(X, X, self.gamma, self.degree, self.coef0,
                   self.kernel, self.kernel_params)

    # use fit method of RegressionARD
    _ = super(RVR, self).fit(K, y)

    # convert to csr (need to use __getitem__)
    convert_tocsr = [scipy.sparse.coo.coo_matrix,
                     scipy.sparse.dia.dia_matrix,
                     scipy.sparse.bsr.bsr_matrix]
    if type(X) in convert_tocsr:
        X = X.tocsr()
    self.relevant_ = np.where(self.active_ == True)[0]
    if X.ndim == 1:
        self.relevant_vectors_ = X[self.relevant_]
    else:
        self.relevant_vectors_ = X[self.relevant_, :]
    return self
def fit(self, X, y):
    '''
    Fits Logistic Regression with ARD

    Parameters
    ----------
    X: array-like of size [n_samples, n_features]
       Training data, matrix of explanatory variables

    y: array-like of size [n_samples]
       Target values

    Returns
    -------
    self : object
        Returns self.
    '''
    X, y = check_X_y(X, y, accept_sparse=None, dtype=np.float64)
    n_samples, n_features = X.shape

    # preprocess features
    self._X_mean = np.zeros(n_features)
    self._X_std = np.ones(n_features)
    if self.normalize:
        self._X_mean, self._X_std = np.mean(X, 0), np.std(X, 0)
        X = (X - self._X_mean) / self._X_std
    if self.fit_intercept:
        X = np.concatenate((np.ones([n_samples, 1]), X), 1)
        n_features += 1

    # preprocess targets
    check_classification_targets(y)
    self.classes_ = np.unique(y)
    n_classes = len(self.classes_)
    if n_classes < 2:
        raise ValueError("Need samples of at least 2 classes"
                         " in the data, but the data contains only one"
                         " class: %r" % self.classes_[0])

    # if multiclass use OVR (i.e. fit classifier for each class)
    self.coef_, self.active_, self.lambda_ = list(), list(), list()
    self.intercept_, self.sigma_ = list(), list()
    for pos_class in self.classes_:
        if n_classes == 2:
            pos_class = self.classes_[1]
        mask = (y == pos_class)
        y_bin = np.zeros(y.shape, dtype=np.float64)
        y_bin[mask] = 1
        coef_, intercept_, active_, sigma_, A = self._fit(X, y_bin,
                                                          n_samples, n_features)
        self.coef_.append(coef_)
        self.active_.append(active_)
        self.intercept_.append(intercept_)
        self.sigma_.append(sigma_)
        self.lambda_.append(A)
        # in case of binary classification fit only one classifier
        if n_classes == 2:
            break
    return self
def fit(self, X, y):
    '''
    Fits variational Bayesian Logistic Regression

    Parameters
    ----------
    X: array-like of size [n_samples, n_features]
       Matrix of explanatory variables

    y: array-like of size [n_samples]
       Vector of dependent variables

    Returns
    -------
    self: object
       self
    '''
    # preprocess data
    X, y = check_X_y(X, y, dtype=np.float64)
    check_classification_targets(y)
    self.classes_ = np.unique(y)
    n_classes = len(self.classes_)

    # take into account bias term if required
    n_samples, n_features = X.shape
    n_features = n_features + int(self.fit_intercept)
    if self.fit_intercept:
        X = np.hstack((np.ones([n_samples, 1]), X))

    # handle multiclass problems using One-vs-Rest
    if n_classes < 2:
        raise ValueError("Need samples of at least 2 classes")
    if n_classes > 2:
        self.coef_, self.sigma_ = [0] * n_classes, [0] * n_classes
        self.intercept_ = [0] * n_classes
    else:
        self.coef_, self.sigma_, self.intercept_ = [0], [0], [0]

    # hyperparameters of precision prior
    a = self.a + 0.5 * n_features
    b = self.b

    for i in range(len(self.coef_)):
        if n_classes == 2:
            pos_class = self.classes_[1]
        else:
            pos_class = self.classes_[i]
        mask = (y == pos_class)
        y_bin = np.ones(y.shape, dtype=np.float64)
        y_bin[~mask] = 0
        coef_, sigma_ = self._fit(X, y_bin, a, b)
        intercept_ = 0
        if self.fit_intercept:
            intercept_ = coef_[0]
            coef_ = coef_[1:]
        self.coef_[i] = coef_
        self.intercept_[i] = intercept_
        self.sigma_[i] = sigma_
    self.coef_ = np.asarray(self.coef_)
    return self
def fit(self, X, y):
    '''
    Fit Relevance Vector Classifier

    Parameters
    -----------
    X: array-like of size [n_samples, n_features]
       Training data, matrix of explanatory variables

    y: array-like of size [n_samples, n_features]
       Target values

    Returns
    -------
    self: object
       self
    '''
    X, y = check_X_y(X, y, accept_sparse=None, dtype=np.float64)

    # kernelise features
    K = get_kernel(X, X, self.gamma, self.degree, self.coef0,
                   self.kernel, self.kernel_params)

    # use fit method of RegressionARD
    _ = super(RVC, self).fit(K, y)
    self.relevant_ = [np.where(active == True)[0] for active in self.active_]
    if X.ndim == 1:
        self.relevant_vectors_ = [X[relevant_] for relevant_ in self.relevant_]
    else:
        self.relevant_vectors_ = [X[relevant_, :] for relevant_ in self.relevant_]
    return self
def f_classifNumba(X, y):
    """Compute the ANOVA F-value for the provided sample.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} shape = [n_samples, n_features]
        The set of regressors that will be tested sequentially.

    y : array of shape (n_samples,)
        The target vector.

    Returns
    -------
    F : array, shape = [n_features,]
        The set of F values.

    pval : array, shape = [n_features,]
        The set of p-values.

    See also
    --------
    chi2: Chi-squared stats of non-negative features for classification tasks.
    f_regression: F-value between label/feature for regression tasks.
    """
    X, y = check_X_y(X, y, ['csr', 'csc', 'coo'])
    args = [X[safe_mask(X, y == k)] for k in np.unique(y)]
    return f_onewayNumba(*args)
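# For comparison, a quick sketch of the same ANOVA F-test using scikit-learn's own
# f_classif; the Numba variant above is intended to mirror these F/p arrays. The toy
# data below is illustrative only.
import numpy as np
from sklearn.feature_selection import f_classif

X = np.random.rand(30, 4)
y = np.array([0] * 15 + [1] * 15)
F, pval = f_classif(X, y)   # one F-value and one p-value per feature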
def fit(self, X, y):
    """Find the class statistics before performing sampling.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.
    y : ndarray, shape (n_samples, )
        Corresponding label for each sample in X.

    Returns
    -------
    self : object,
        Return self.
    """
    # Check the consistency of X and y
    X, y = check_X_y(X, y)

    super(SMOTEENN, self).fit(X, y)

    # Fit using SMOTE
    self.sm.fit(X, y)

    return self
def sample(self, X, y):
    """Resample the dataset.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.
    y : ndarray, shape (n_samples, )
        Corresponding label for each sample in X.

    Returns
    -------
    X_resampled : ndarray, shape (n_samples_new, n_features)
        The array containing the resampled data.
    y_resampled : ndarray, shape (n_samples_new)
        The corresponding label of `X_resampled`
    """
    # Check the consistency of X and y
    X, y = check_X_y(X, y)

    super(SMOTEENN, self).sample(X, y)

    # Transform using SMOTE
    X, y = self.sm.sample(X, y)

    # Fit and transform using ENN
    return self.enn.fit_sample(X, y)
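# Hedged usage sketch for the SMOTEENN fit/sample pair above, written against the
# current imbalanced-learn API (fit_resample); class sizes are illustrative only.
import numpy as np
from imblearn.combine import SMOTEENN

X = np.random.rand(100, 4)
y = np.array([0] * 90 + [1] * 10)
sm = SMOTEENN(random_state=0)
X_res, y_res = sm.fit_resample(X, y)   # SMOTE over-sampling followed by ENN cleaning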
def fit(self, X, y):
    X, y = check_X_y(X, y, 'csr')
    _, n_features = X.shape

    labelbin = LabelBinarizer()
    Y = labelbin.fit_transform(y)
    self.classes_ = labelbin.classes_
    if Y.shape[1] == 1:
        Y = np.concatenate((1 - Y, Y), axis=1)

    # LabelBinarizer().fit_transform() returns arrays with dtype=np.int64,
    # so we don't have to cast X to floating point
    Y = Y.astype(np.float64)

    # Count raw events from data
    n_effective_classes = Y.shape[1]
    self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
    self.ratios_ = np.full((n_effective_classes, n_features), self.alpha,
                           dtype=np.float64)
    self._compute_ratios(X, Y)

    # flugglyness
    for i in range(n_effective_classes):
        X_i = X.multiply(self.ratios_[i])
        svm = LinearSVC(C=self.C, max_iter=self.max_iter)
        Y_i = Y[:, i]
        svm.fit(X_i, Y_i)
        self.svm_.append(svm)

    return self
def _check_X_y(X, y):
    """Overwrite the checking to let strings pass for categorical features."""
    y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None)
    return X, y, binarize_y
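# Sketch of why dtype=None matters in the override above: with the default dtype,
# check_X_y would try to cast string categories to float and fail, whereas dtype=None
# preserves the object dtype (example calls sklearn.utils.check_X_y directly).
import numpy as np
from sklearn.utils import check_X_y

X = np.array([["red", 1.0], ["blue", 2.0]], dtype=object)
y = np.array([0, 1])
X_checked, y_checked = check_X_y(X, y, dtype=None)  # string entries pass through unchanged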
def transform(self, X, y):
    """Resample the dataset.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.
    y : ndarray, shape (n_samples, )
        Corresponding label for each sample in X.

    Returns
    -------
    X_resampled : ndarray, shape (n_subset, n_samples_new, n_features)
        The array containing the resampled data.
    y_resampled : ndarray, shape (n_subset, n_samples_new)
        The corresponding label of `X_resampled`
    idx_under : ndarray, shape (n_subset, n_samples, )
        If `return_indices` is `True`, a boolean array will be returned
        indicating which samples have been selected.
    """
    # Check the consistency of X and y
    X, y = check_X_y(X, y)

    super(EasyEnsemble, self).transform(X, y)

    X_resampled = []
    y_resampled = []
    if self.return_indices:
        idx_under = []

    for s in range(self.n_subsets):
        if self.verbose:
            print("Creation of the set #{}".format(s))

        # Create the object for random under-sampling
        rus = RandomUnderSampler(ratio=self.ratio_,
                                 return_indices=self.return_indices,
                                 random_state=self.rs_,
                                 verbose=self.verbose,
                                 replacement=self.replacement)
        if self.return_indices:
            sel_x, sel_y, sel_idx = rus.fit_transform(X, y)
        else:
            sel_x, sel_y = rus.fit_transform(X, y)
        X_resampled.append(sel_x)
        y_resampled.append(sel_y)
        if self.return_indices:
            idx_under.append(sel_idx)

    if self.return_indices:
        return (np.array(X_resampled), np.array(y_resampled),
                np.array(idx_under))
    else:
        return np.array(X_resampled), np.array(y_resampled)
def check_Xs_y(Xs, y, multiview=False, enforce_views=None,
               return_dimensions=False):
    r"""
    Checks Xs and y for consistent length. Xs is set to be of dimension 3.

    Parameters
    ----------
    Xs : nd-array, list
        Input data.

    y : nd-array, list
        Labels.

    multiview : boolean, (default=False)
        If True, throws error if just 1 data matrix given.

    enforce_views : int, (default=not checked)
        If provided, ensures this number of views in Xs. Otherwise not
        checked.

    return_dimensions : boolean, (default=False)
        If True, the function also returns the dimensions of the multiview
        dataset. The dimensions are n_views, n_samples, n_features where
        n_views and n_samples are respectively the number of views and the
        number of samples, and n_features is a list of length n_views
        containing the number of features of each view.

    Returns
    -------
    Xs_converted : object
        The converted and validated Xs (list of data arrays).

    y_converted : object
        The converted and validated y.

    n_views : int
        The number of views in the dataset. Returned only if
        ``return_dimensions`` is ``True``.

    n_samples : int
        The number of samples in the dataset. Returned only if
        ``return_dimensions`` is ``True``.

    n_features : list
        List of length ``n_views`` containing the number of features in
        each view. Returned only if ``return_dimensions`` is ``True``.
    """
    if return_dimensions:
        Xs_converted, n_views, n_samples, n_features = check_Xs(
            Xs, multiview=multiview, enforce_views=enforce_views,
            return_dimensions=True)
    else:
        Xs_converted = check_Xs(Xs, multiview=multiview,
                                enforce_views=enforce_views)
    _, y_converted = check_X_y(Xs_converted[0], y, allow_nd=False)

    if return_dimensions:
        return Xs_converted, y_converted, n_views, n_samples, n_features
    else:
        return Xs_converted, y_converted
def fit(self, X, y):
    # Check if the dimensions are okay
    X, y = check_X_y(X, y)
    check_classification_targets(y)

    # Get the unique labels
    self.classes_, y = np.unique(y, return_inverse=True)
    n_samples, n_features = X.shape
    n_classes = len(self.classes_)

    # Check the number of classes
    if n_classes < 2:
        raise ValueError('y has less than 2 classes')
    if self.priors is None:
        self.priors_ = np.bincount(y) / float(n_samples)
    else:
        self.priors_ = self.priors

    cov = None
    store_covariance = self.store_covariance or self.store_covariances
    # Store the covariance if flag is true
    if store_covariance:
        cov = []
    means = []      # Stores the class means
    scalings = []   # The variance in the rotated coordinate system (scaling)
    rotations = []  # Rotation of the gaussian to principal axes

    # For all the given classes
    for ind in range(n_classes):
        # Subset the classes
        Xg = X[y == ind, :]
        # Find the means of the classes
        meang = Xg.mean(0)
        means.append(meang)
        if len(Xg) == 1:
            raise ValueError('y has only 1 sample in class %s, covariance '
                             'is ill defined.' % str(self.classes_[ind]))
        # Center the data
        Xgc = Xg - meang
        # Xgc = U * S * V.T
        U, S, Vt = np.linalg.svd(Xgc, full_matrices=False)
        rank = np.sum(S > self.tol)
        if rank < n_features:
            warnings.warn("Variables are collinear")
        S2 = (S**2) / (len(Xg) - 1)
        S2 = ((1 - self.reg_param) * S2) + self.reg_param
        if self.store_covariance or store_covariance:
            # cov = V * (S^2 / (n-1)) * V.T
            cov.append(np.dot(S2 * Vt.T, Vt))  # .T gives the transpose
        scalings.append(S2)
        rotations.append(Vt.T)

    # Get the pooled covariance matrix estimate
    self.class_covariance_ = _class_cov(X, y)

    # Store the covariance matrices
    if self.store_covariance or store_covariance:
        self.covariance_ = cov

    # Initialize total_covariance_
    self.total_covariance_ = []

    # Change the covariance matrices depending on alpha
    for ind in range(n_classes):
        # New estimate of the covariance matrix
        self.total_covariance_.append(
            self.alpha * cov[ind] + (1 - self.alpha) * self.class_covariance_)

    # Store the other attributes
    self.means_ = np.asarray(means)
    self.scalings_ = scalings
    self.rotations_ = rotations

    return self
def fit(self, X, y, groups=None, sample_weight=None):
    """ Fit ensemble classifiers and the meta-classifier.

    Parameters
    ----------
    X : numpy array, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.

    y : numpy array, shape = [n_samples]
        Target values.

    groups : numpy array/None, shape = [n_samples]
        The group that each sample belongs to. This is used by specific
        folding strategies such as GroupKFold()

    sample_weight : array-like, shape = [n_samples], optional
        Sample weights passed as sample_weights to each regressor in the
        regressors list as well as the meta_regressor. Raises error if some
        regressor does not support sample_weight in the fit() method.

    Returns
    -------
    self : object
    """
    if self.use_clones:
        self.clfs_ = clone(self.classifiers)
        self.meta_clf_ = clone(self.meta_classifier)
    else:
        self.clfs_ = self.classifiers
        self.meta_clf_ = self.meta_classifier
    if self.verbose > 0:
        print("Fitting %d classifiers..." % (len(self.classifiers)))

    final_cv = check_cv(self.cv, y, classifier=self.stratify)
    if isinstance(self.cv, int):
        # Override shuffle parameter in case of self generated
        # cross-validation strategy
        final_cv.shuffle = self.shuffle
        final_cv.random_state = self.random_state

    # Input validation.
    X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], dtype=None)

    if sample_weight is None:
        fit_params = None
    else:
        fit_params = dict(sample_weight=sample_weight)

    meta_features = None

    for n, model in enumerate(self.clfs_):

        if self.verbose > 0:
            i = self.clfs_.index(model) + 1
            print("Fitting classifier%d: %s (%d/%d)" %
                  (i, _name_estimators((model,))[0][0], i, len(self.clfs_)))

        if self.verbose > 2:
            if hasattr(model, 'verbose'):
                model.set_params(verbose=self.verbose - 2)

        if self.verbose > 1:
            print(_name_estimators((model,))[0][1])

        prediction = cross_val_predict(
            model, X, y, groups=groups, cv=final_cv,
            n_jobs=self.n_jobs, fit_params=fit_params,
            verbose=self.verbose, pre_dispatch=self.pre_dispatch,
            method='predict_proba' if self.use_probas else 'predict')

        if not self.use_probas:
            prediction = prediction[:, np.newaxis]
        elif self.drop_last_proba:
            prediction = prediction[:, :-1]

        if meta_features is None:
            meta_features = prediction
        else:
            meta_features = np.column_stack((meta_features, prediction))

    if self.store_train_meta_features:
        self.train_meta_features_ = meta_features

    # Fit the base models correctly this time using ALL the training set
    for model in self.clfs_:
        if sample_weight is None:
            model.fit(X, y)
        else:
            model.fit(X, y, sample_weight=sample_weight)

    # Fit the secondary model
    if self.use_features_in_secondary:
        meta_features = self._stack_first_level_features(X, meta_features)

    if sample_weight is None:
        self.meta_clf_.fit(meta_features, y)
    else:
        self.meta_clf_.fit(meta_features, y, sample_weight=sample_weight)

    return self
def _fit(self, X, y):
    X, y = check_X_y(X, y, "csr")
    if self.test_data is not None:
        test_data = check_array(self.test_data, "csr")
    # Initialization
    cv = check_cv(self.cv, y, is_classifier(self.estimator))
    scorer = check_scoring(self.estimator, scoring=self.scoring)
    n_features = X.shape[1]

    estimator = clone(self.estimator)

    # Genetic Algorithm
    toolbox = base.Toolbox()

    init_features = partial(_init_selected_features, n_features=n_features)
    toolbox.register("individual", tools.initIterate,
                     creator.Individual, init_features)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("evaluate", _eval_function, gaobject=self,
                     estimator=estimator, X=X, y=y, cv=cv, scorer=scorer,
                     verbose=self.verbose, fit_params=self.fit_params,
                     caching=self.caching, test_data=test_data)
    toolbox.register("mate", tools.cxUniform,
                     indpb=self.crossover_independent_proba)
    toolbox.register("mutate", tools.mutFlipBit,
                     indpb=self.mutation_independent_proba)
    toolbox.register("select", tools.selTournament,
                     tournsize=self.tournament_size)

    if self.n_jobs > 1:
        pool = multiprocessing.Pool(processes=self.n_jobs)
        toolbox.register("map", pool.map)
    elif self.n_jobs < 0:
        pool = multiprocessing.Pool(
            processes=max(cpu_count() + 1 + self.n_jobs, 1))
        toolbox.register("map", pool.map)

    pop = toolbox.population(n=self.n_population)
    hof = tools.HallOfFame(5, similar=np.array_equal)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", self.rounded_mean)
    stats.register("std", self.rounded_std)
    stats.register("min", self.rounded_min)
    stats.register("max", self.rounded_max)

    if self.verbose > 0:
        print("Selecting features with genetic algorithm.")

    _, log = algorithms.eaSimple(pop, toolbox,
                                 cxpb=self.crossover_proba,
                                 mutpb=self.mutation_proba,
                                 ngen=self.n_generations,
                                 stats=stats, halloffame=hof,
                                 verbose=self.verbose)
    if self.n_jobs != 1:
        pool.close()
        pool.join()
    print('done')

    # Set final attributes
    support_ = np.array(hof, dtype=np.bool)[0]
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(X[:, support_], y)

    self.generation_scores_ = np.array(
        [score for score, _ in log.select("max")])
    self.n_features_ = support_.sum()
    self.support_ = support_

    return self
def make_imbalance(self, ratio=None, random_state=None):
    """
    Built on the imblearn.make_imbalance function

    :param ratio: dict or list
        Ratio to use for resampling the data set.
        - When 'dict', the keys correspond to the targeted classes. The values
          correspond to the desired number of samples for each targeted class.
        - When 'list', the values correspond to the proportions of samples (float)
          assigned to each class. In this case the number of samples is maintained
          but the samples per class are adjusted to the given proportions.
    :param random_state: int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used by
        `np.random`.
    :return:
    """
    x, y = check_X_y(self.data, self.target)
    original_dataset_size = len(y)
    n_classes = len(self.target_names)
    if isinstance(ratio, dict):
        ratio_ = ratio
    elif isinstance(ratio, list):
        weights = ratio
        if len(weights) != n_classes:
            raise ValueError(
                "{} classes available but only {} values provided".format(
                    n_classes, len(weights)))
        ratio_ = {}
        for i in range(n_classes):
            ratio_[i] = int(round(weights[i] * original_dataset_size, 0))
    else:
        raise TypeError("Expected dict or list; {} provided".format(type(ratio)))

    if sum(ratio_.values()) < original_dataset_size:
        rus = RandomUnderSampler(ratio=ratio_, random_state=random_state)
        self.data, self.target = rus.fit_sample(x, y)
    elif sum(ratio_.values()) == original_dataset_size:
        original_distribution = Counter(y)
        interim_ratio = {}
        for key in ratio_:
            if ratio_[key] >= original_distribution[key]:
                interim_ratio[key] = original_distribution[key]
            else:
                interim_ratio[key] = ratio_[key]
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            rus = RandomUnderSampler(ratio=interim_ratio,
                                     random_state=random_state)
            x_int, y_int = rus.fit_sample(x, y)
        with warnings.catch_warnings():
            # Silencing RandomOverSampler UserWarning: After over-sampling, the
            # number of samples in class A will be larger than the number of
            # samples in the majority class
            warnings.simplefilter("ignore")
            ros = RandomOverSampler(ratio=ratio_, random_state=random_state)
            self.data, self.target = ros.fit_sample(x_int, y_int)
    else:
        raise ValueError(
            "The requested dataset cannot be larger than the original dataset")
def partial_fit(self, X=None, y=None, labels=None, n_features=10):
    if X is None:
        if labels is None:
            raise ValueError("labels should be provided at first call to "
                             "partial_fit.")
        if n_features is None:
            raise ValueError("n_features should be provided at first call "
                             "to partial_fit.")
        self.rng_ = check_random_state(self.random_state)
        n_hidden = self.n_hidden
        self.classes_ = labels

        self.wi_ = self.rng_.multivariate_normal(
            np.zeros((n_features + 1) * n_hidden),
            self.prior_scale * np.eye((n_features + 1) * n_hidden),
            size=self.n_iter)
        self.wo_ = self.rng_.multivariate_normal(
            np.zeros(n_hidden * len(labels)),
            self.prior_scale * np.eye(n_hidden * len(self.classes_)),
            size=self.n_iter)
    else:
        n_hidden = self.n_hidden
        X, y = check_X_y(X, y)

        n_features = self.wi_.shape[1] // n_hidden
        samples_i = np.zeros((self.n_iter, n_features, n_hidden))
        samples_o = np.zeros((self.n_iter, n_hidden, len(self.classes_)))
        weights = np.zeros(self.n_iter)
        cov_i = self.scale * np.eye(n_features * n_hidden)
        cov_o = self.scale * np.eye(n_hidden * len(self.classes_))

        for i in range(self.n_iter):
            s_i = self.rng_.multivariate_normal(self.wi_[i], cov_i)
            samples_i[i] = s_i.reshape(n_features, n_hidden)

            s_o = self.rng_.multivariate_normal(self.wo_[i], cov_o)
            samples_o[i] = s_o.reshape(n_hidden, len(self.classes_))

            reg = self.alpha * (np.dot(s_i, s_i) + np.dot(s_o, s_o))
            loss = -log_loss(y, self.forward(X, samples_i[i], samples_o[i]),
                             labels=self.classes_)
            weights[i] = loss - reg

        self.samples_i_ = samples_i
        self.samples_o_ = samples_o
        self.weights_ = softmax_1D(weights)
        self.multi_ = self.rng_.multinomial(self.n_iter, self.weights_)
        resampled = np.repeat(np.arange(self.n_iter), self.multi_)
        self.wi_ = self.wi_[resampled]
        self.wo_ = self.wo_[resampled]

        if self.local not in [None, "mh", "basinhopping"]:
            raise ValueError(
                "local should be one of None, mh or basinhopping")

        if self.local == "mh":
            for i in range(self.n_iter):
                self.wi_[i], self.wo_[i] = self.mh_step(
                    X, y, self.wi_[i], self.wo_[i])
        elif self.local == "basinhopping":
            wi_len = len(self.wi_[0])
            for i in range(self.n_iter):
                x0 = np.concatenate((self.wi_[i], self.wo_[i]))
                opt_func = partial(log_likelihood, X=X, y=y, mlp=self)
                res = basinhopping(opt_func, x0)
                self.wi_[i], self.wo_[i] = res.x[:wi_len], res.x[wi_len:]

        self.coef_i_ = np.mean(
            self.wi_.reshape(self.n_iter, n_features, n_hidden), axis=0)
        self.coef_o_ = np.mean(
            self.wo_.reshape(self.n_iter, n_hidden, len(self.classes_)),
            axis=0)
    return self
def _batch_fit(self, X, y, check_input=False):
    print('Batch fit')
    if check_input:
        X, y = check_X_y(X, y, ensure_min_samples=2, estimator=self)
    current_n_samples, n_features = X.shape

    # Update stats - they are 0 if this is the first step
    updated_mean, updated_var, updated_n_samples_seen_ = _incremental_mean_and_var(
        X, last_mean=self.mean_, last_variance=self.var_,
        last_sample_count=self.n_samples_seen_)

    # Whitening
    if self.n_samples_seen_ == 0:
        # If it is the first step, simply whiten X
        X = np.subtract(X, updated_mean)
    else:
        col_batch_mean = np.mean(X, axis=0)
        X = np.subtract(X, col_batch_mean)

    # Updating algorithm
    # First update class means
    updated_class_mean = self.class_mean_
    updated_class_n_samples_seen_ = self.class_n_samples_seen_

    for i, current_class in enumerate(self.classes_):
        current_class_samples = X[y == current_class, :]
        n_current_class_samples = current_class_samples.shape[0]
        previous_n_class_samples = updated_class_n_samples_seen_[i]
        if n_current_class_samples > 0 and previous_n_class_samples > 0:
            previous_class_sum_current_class = updated_class_mean[i, :] * \
                updated_class_n_samples_seen_[i]
            current_class_sum_current_class = np.sum(current_class_samples, axis=0)
            updated_class_n_samples_seen_[i] += n_current_class_samples
            # new class mean = accumulated sum / updated sample count
            updated_class_mean[i, :] = (previous_class_sum_current_class +
                                        current_class_sum_current_class) / \
                updated_class_n_samples_seen_[i]
        elif n_current_class_samples > 0:
            updated_class_mean[i, :] = np.mean(current_class_samples, axis=0)
            updated_class_n_samples_seen_[i] = n_current_class_samples

    # Then update between class scatter
    updated_between_scatter = self.between_scatter
    for i, current_class_mean in enumerate(updated_class_mean):
        n = X[y == self.classes_[i], :].shape[0]
        current_class_mean = current_class_mean.reshape(1, n_features)
        updated_mean = updated_mean.reshape(1, n_features)
        if n > 0:
            updated_between_scatter += n * (
                current_class_mean - updated_mean).T.dot(current_class_mean -
                                                         updated_mean)

    # Finally update the per-class within scatter matrices
    updated_class_within_scatter = self.class_within_scatter
    for i, current_class_mean in enumerate(updated_class_mean):
        current_class_samples = X[y == self.classes_[i], :]
        n_current_class_samples = current_class_samples.shape[0]
        l_c = current_class_samples.shape[0]
        n_c = self.class_n_samples_seen_[i]
        mean_y_c = np.reshape(np.mean(current_class_samples, axis=0),
                              (n_features, 1))
        if n_current_class_samples > 0 and n_c > 0:
            mean_x_c = np.reshape(self.class_mean_[i, :], (n_features, 1))
            D_c = (mean_y_c - mean_x_c).dot((mean_y_c - mean_x_c).T)
            E_c = np.zeros(D_c.shape)
            for j, current_sample in enumerate(current_class_samples):
                current_sample = current_sample.reshape(n_features, 1)
                E_c += (current_sample - mean_x_c).dot(
                    (current_sample - mean_x_c).T)
            F_c = np.zeros(D_c.shape)
            for j, current_sample in enumerate(current_class_samples):
                current_sample = current_sample.reshape(n_features, 1)
                F_c += (current_sample - mean_y_c).dot(
                    (current_sample - mean_y_c).T)
            updated_class_within_scatter[:, :, i] += \
                ((n_c * l_c * l_c) * D_c / np.square(n_c + l_c)) + \
                ((np.square(n_c) * E_c) / np.square(n_c + l_c)) + \
                ((l_c * (l_c + (2 * n_c)) * F_c) / np.square(n_c + l_c))
        elif n_current_class_samples > 0:
            updated_class_within_scatter[:, :, i] = (
                current_class_samples - mean_y_c).dot(
                    (current_class_samples - mean_y_c).T)

    updated_within_scatter = np.sum(updated_class_within_scatter, axis=2)

    # Final values after computation
    self.n_samples_seen_ = updated_n_samples_seen_
    self.class_n_samples_seen_ = updated_class_n_samples_seen_
    self.mean_ = updated_mean
    self.class_mean_ = updated_class_mean
    self.var_ = updated_var
    self.between_scatter = updated_between_scatter
    self.within_scatter = updated_within_scatter
    self.class_within_scatter = updated_class_within_scatter
def _single_fit(self, X, y, check_input=False):
    print('Single Fit')
    if check_input:
        X, y = check_X_y(X, y, ensure_min_samples=2, estimator=self)
def fit(self, X, y):
    if self.model is not None:
        thundersvm.model_free(c_void_p(self.model))
        self.model = None
    sparse = sp.isspmatrix(X)
    self._sparse = sparse and not callable(self.kernel)
    X, y = check_X_y(X, y, dtype=np.float64, order='C', accept_sparse='csr')
    y = self.label_validate(y)

    solver_type = SVM_TYPE.index(self._impl)

    if self.gamma == 'auto':
        self._gamma = 1.0 / X.shape[1]
    else:
        self._gamma = self.gamma

    if self.kernel not in KERNEL_TYPE:
        print("The kernel parameter not recognized, please refer to the document.")
        exit()
    else:
        kernel = KERNEL_TYPE.index(self.kernel)

    fit = self._sparse_fit if self._sparse else self._dense_fit
    thundersvm.model_new.restype = c_void_p
    self.model = thundersvm.model_new(solver_type)
    if self.max_mem_size != -1:
        thundersvm.set_memory_size(c_void_p(self.model), self.max_mem_size)
    fit(X, y, solver_type, kernel)
    if self._train_succeed[0] == -1:
        print("Training failed!")
        return
    self.n_sv = thundersvm.n_sv(c_void_p(self.model))
    csr_row = (c_int * (self.n_sv + 1))()
    csr_col = (c_int * (self.n_sv * self.n_features))()
    csr_data = (c_float * (self.n_sv * self.n_features))()
    data_size = (c_int * 1)()
    thundersvm.get_sv(csr_row, csr_col, csr_data, data_size,
                      c_void_p(self.model))
    dual_coef = (c_float * ((self.n_classes - 1) * self.n_sv))()
    thundersvm.get_coef(dual_coef, self.n_classes, self.n_sv,
                        c_void_p(self.model))

    self.dual_coef_ = np.array(
        [dual_coef[index]
         for index in range(0, (self.n_classes - 1) * self.n_sv)]).astype(float)
    self.dual_coef_ = np.reshape(self.dual_coef_,
                                 (self.n_classes - 1, self.n_sv))

    rho_size = int(self.n_classes * (self.n_classes - 1) / 2)
    self.n_binary_model = rho_size
    rho = (c_float * rho_size)()
    thundersvm.get_rho(rho, rho_size, c_void_p(self.model))

    if self.kernel == 'linear':
        coef = (c_float * (self.n_binary_model * self.n_sv))()
        thundersvm.get_linear_coef(coef, self.n_binary_model,
                                   self.n_features, c_void_p(self.model))
        self.coef_ = np.array(
            [coef[index]
             for index in range(0, self.n_binary_model * self.n_features)]
        ).astype(float)
        self.coef_ = np.reshape(self.coef_,
                                (self.n_binary_model, self.n_features))

    self.intercept_ = np.array(
        [rho[index] for index in range(0, rho_size)]).astype(float)
    self.row = np.array([csr_row[index] for index in range(0, self.n_sv + 1)])
    self.col = np.array([csr_col[index] for index in range(0, data_size[0])])
    self.data = np.array([csr_data[index] for index in range(0, data_size[0])])

    self.support_vectors_ = sp.csr_matrix((self.data, self.col, self.row))
    if self._sparse == False:
        self.support_vectors_ = self.support_vectors_.toarray(order='C')
    n_support_ = (c_int * self.n_classes)()
    thundersvm.get_support_classes(n_support_, self.n_classes,
                                   c_void_p(self.model))

    self.n_support_ = np.array(
        [n_support_[index] for index in range(0, self.n_classes)]).astype(int)

    self.shape_fit_ = X.shape
    return self
def make_imbalance(X, y, sampling_strategy=None, random_state=None,
                   verbose=False, **kwargs):
    """Turns a dataset into an imbalanced dataset with a specific sampling strategy.

    A simple toy dataset to visualize clustering and classification algorithms.

    Read more in the :ref:`User Guide <make_imbalanced>`.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Matrix containing the data to be imbalanced.

    y : ndarray, shape (n_samples, )
        Corresponding label for each sample in X.

    sampling_strategy : dict, or callable,
        Ratio to use for resampling the data set.

        - When ``dict``, the keys correspond to the targeted classes. The
          values correspond to the desired number of samples for each targeted
          class.

        - When callable, function taking ``y`` and returns a ``dict``. The keys
          correspond to the targeted classes. The values correspond to the
          desired number of samples for each class.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by np.random.

    verbose : bool, optional (default=False)
        Show information regarding the sampling.

    kwargs : dict, optional
        Dictionary of additional keyword arguments to pass to
        ``sampling_strategy``.

    Returns
    -------
    X_resampled : ndarray, shape (n_samples_new, n_features)
        The array containing the imbalanced data.

    y_resampled : ndarray, shape (n_samples_new)
        The corresponding label of `X_resampled`

    Notes
    -----
    See
    :ref:`sphx_glr_auto_examples_applications_plot_multi_class_under_sampling.py`,
    :ref:`sphx_glr_auto_examples_datasets_plot_make_imbalance.py`, and
    :ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py`.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import load_iris
    >>> from imblearn.datasets import make_imbalance

    >>> data = load_iris()
    >>> X, y = data.data, data.target
    >>> print('Distribution before imbalancing: {}'.format(Counter(y)))
    Distribution before imbalancing: Counter({0: 50, 1: 50, 2: 50})
    >>> X_res, y_res = make_imbalance(X, y,
    ...                               sampling_strategy={0: 10, 1: 20, 2: 30},
    ...                               random_state=42)
    >>> print('Distribution after imbalancing: {}'.format(Counter(y_res)))
    Distribution after imbalancing: Counter({2: 30, 1: 20, 0: 10})
    """
    X, y = check_X_y(X, y)
    target_stats = Counter(y)
    # restrict ratio to be a dict or a callable
    if isinstance(sampling_strategy, dict) or callable(sampling_strategy):
        sampling_strategy_ = check_sampling_strategy(
            sampling_strategy, y, "under-sampling", **kwargs)
    else:
        raise ValueError(
            "'sampling_strategy' has to be a dictionary or a "
            "function returning a dictionary. Got {} instead.".format(
                type(sampling_strategy)))

    if verbose:
        print("The original target distribution in the dataset is: %s" %
              target_stats)

    rus = RandomUnderSampler(
        sampling_strategy=sampling_strategy_,
        replacement=False,
        random_state=random_state)
    X_resampled, y_resampled = rus.fit_resample(X, y)

    if verbose:
        print("Make the dataset imbalanced: %s" % Counter(y_resampled))

    return X_resampled, y_resampled
def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
    X, y = check_X_y(X, y)

    # If the ratio of data variance between dimensions is too small, it
    # will cause numerical errors. To address this, we artificially
    # boost the variance by epsilon, a small fraction of the standard
    # deviation of the largest dimension.
    epsilon = 1e-9 * np.var(X, axis=0).max()

    if _refit:
        self.classes_ = None

    if _check_partial_fit_first_call(self, classes):
        # This is the first call to partial_fit:
        # initialize various cumulative counters
        n_features = X.shape[1]
        n_classes = len(self.classes_)
        self.theta_ = np.zeros((n_classes, n_features))
        self.sigma_ = np.zeros((n_classes, n_features))
        self.class_prior_ = np.zeros(n_classes)
        self.class_count_ = np.zeros(n_classes)
    else:
        if X.shape[1] != self.theta_.shape[1]:
            msg = "Number of features %d does not match previous data %d."
            raise ValueError(msg % (X.shape[1], self.theta_.shape[1]))
        # Put epsilon back in each time
        self.sigma_[:, :] -= epsilon

    classes = self.classes_

    unique_y = np.unique(y)
    unique_y_in_classes = in1d(unique_y, classes)

    if not np.all(unique_y_in_classes):
        raise ValueError("The target label(s) %s in y do not exist in the "
                         "initial classes %s" %
                         (y[~unique_y_in_classes], classes))

    for y_i in unique_y:
        i = classes.searchsorted(y_i)
        X_i = X[y == y_i, :]

        if sample_weight is not None:
            sw_i = sample_weight[y == y_i]
            N_i = sw_i.sum()
        else:
            sw_i = None
            N_i = X_i.shape[0]

        new_theta, new_sigma = self._update_mean_variance(
            self.class_count_[i], self.theta_[i, :], self.sigma_[i, :],
            X_i, sw_i)

        self.theta_[i, :] = new_theta
        self.sigma_[i, :] = new_sigma
        self.class_count_[i] += N_i

    self.sigma_[:, :] += epsilon
    self.class_prior_[:] = self.class_count_ / np.sum(self.class_count_)
    # print(self.class_prior_[:])
    return self
def test_check_array_min_samples_and_features_messages():
    # empty list is considered 2D by default:
    msg = "0 feature(s) (shape=(1, 0)) while a minimum of 1 is required."
    assert_raise_message(ValueError, msg, check_array, [[]])

    # If considered a 1D collection when ensure_2d=False, then the minimum
    # number of samples will break:
    msg = "0 sample(s) (shape=(0,)) while a minimum of 1 is required."
    assert_raise_message(ValueError, msg, check_array, [], ensure_2d=False)

    # Invalid edge case when checking the default minimum sample of a scalar
    msg = "Singleton array array(42) cannot be considered a valid collection."
    assert_raise_message(TypeError, msg, check_array, 42, ensure_2d=False)

    # Simulate a model that would need at least 2 samples to be well defined
    X = np.ones((1, 10))
    y = np.ones(1)
    msg = "1 sample(s) (shape=(1, 10)) while a minimum of 2 is required."
    assert_raise_message(ValueError, msg, check_X_y, X, y,
                         ensure_min_samples=2)

    # The same message is raised if the data has 2 dimensions even if this is
    # not mandatory
    assert_raise_message(ValueError, msg, check_X_y, X, y,
                         ensure_min_samples=2, ensure_2d=False)

    # Simulate a model that would require at least 3 features (e.g. SelectKBest
    # with k=3)
    X = np.ones((10, 2))
    y = np.ones(2)
    msg = "2 feature(s) (shape=(10, 2)) while a minimum of 3 is required."
    assert_raise_message(ValueError, msg, check_X_y, X, y,
                         ensure_min_features=3)

    # Only the feature check is enabled whenever the number of dimensions is 2
    # even if allow_nd is enabled:
    assert_raise_message(ValueError, msg, check_X_y, X, y,
                         ensure_min_features=3, allow_nd=True)

    # Simulate a case where a pipeline stage has trimmed all the features of a
    # 2D dataset.
    X = np.empty(0).reshape(10, 0)
    y = np.ones(10)
    msg = "0 feature(s) (shape=(10, 0)) while a minimum of 1 is required."
    assert_raise_message(ValueError, msg, check_X_y, X, y)

    # nd-data is not checked for any minimum number of features by default:
    X = np.ones((10, 0, 28, 28))
    y = np.ones(10)
    X_checked, y_checked = check_X_y(X, y, allow_nd=True)
    assert_array_equal(X, X_checked)
    assert_array_equal(y, y_checked)
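# A tiny positive-path counterpart to the error-message tests above: with enough
# samples and features, check_X_y simply returns the validated arrays (sketch using
# sklearn.utils.check_X_y).
import numpy as np
from sklearn.utils import check_X_y

X = np.ones((5, 3))
y = np.zeros(5)
X_checked, y_checked = check_X_y(X, y, ensure_min_samples=2, ensure_min_features=1)
assert X_checked.shape == (5, 3) and y_checked.shape == (5,)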
def fit(self, X, y):
    """Find the class statistics before performing sampling.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.
    y : ndarray, shape (n_samples, )
        Corresponding label for each sample in X.

    Returns
    -------
    self : object,
        Return self.
    """
    # Check the consistency of X and y
    X, y = check_X_y(X, y)

    self.min_c_ = None
    self.maj_c_ = None
    self.stats_c_ = {}
    self.X_shape_ = None

    if hasattr(self, 'ratio'):
        self._validate_ratio()

    if hasattr(self, 'size_ngh'):
        self._validate_size_ngh_deprecation()
    elif hasattr(self, 'k') and not hasattr(self, 'm'):
        self._validate_k_deprecation()
    elif hasattr(self, 'k') and hasattr(self, 'm'):
        self._validate_k_m_deprecation()

    self.logger.info('Compute classes statistics ...')

    # Raise an error if there is only one class
    # if uniques.size == 1:
    #     raise RuntimeError("Only one class detected, aborting...")
    # Raise a warning for the moment to be compatible with BaseEstimator
    self.logger.debug('The number of classes is %s', np.unique(y).size)
    self.logger.debug('Shall we raise a warning: %s', np.unique(y).size == 1)
    if np.unique(y).size == 1:
        warnings.simplefilter('always', UserWarning)
        warnings.warn('Only one class detected, something will get wrong')
        self.logger.debug('The warning should have been raised.')

    # Store the size of X to check at sampling time if we have the same data
    self.X_shape_ = X.shape

    # Create a dictionary containing the class statistics
    self.stats_c_ = Counter(y)

    # Find the minority and majority classes
    self.min_c_ = min(self.stats_c_, key=self.stats_c_.get)
    self.maj_c_ = max(self.stats_c_, key=self.stats_c_.get)

    self.logger.info('%s classes detected: %s',
                     np.unique(y).size, self.stats_c_)

    # Check if the ratio provided at initialisation makes sense
    if isinstance(self.ratio, float):
        if self.ratio < (self.stats_c_[self.min_c_] /
                         self.stats_c_[self.maj_c_]):
            raise RuntimeError('The ratio requested at initialisation'
                               ' should be greater or equal than the'
                               ' balancing ratio of the current data.')

    return self
def _eval_function(individual, gaobject, estimator, X, y, cv, scorer, verbose, fit_params, caching, test_data=None): individual_sum = np.sum(individual, axis=0) if individual_sum == 0: return -10000, individual_sum individual_tuple = tuple(individual) if caching and individual_tuple in gaobject.scores_cache: return gaobject.scores_cache[individual_tuple], individual_sum x_selected = X[:, np.array(individual, dtype=np.bool)] scores = [] x_holdout_selected = [] if fit_params['eval_set'] is not None: eval_set_params = copy.deepcopy(fit_params) for i, valid_data in enumerate(eval_set_params['eval_set']): x_holdout, y_holdout = check_X_y(valid_data[0], valid_data[1], "csr") x_holdout_selected = x_holdout[:, np.array(individual, dtype=np.bool)] eval_set_params['eval_set'][i][0] = x_holdout_selected eval_set_params['eval_set'][i][1] = y_holdout else: eval_set_params = fit_params fold = 0 x_test_selected = oof_test = oof_train = oof_test_skf = oof_holdout = None if test_data is not None: x_test_selected = test_data[:, np.array(individual, dtype=np.bool)] oof_train = np.zeros((x_selected.shape[0], )) oof_holdout = np.empty( (cv.get_n_splits(), x_holdout_selected.shape[0])) oof_test = np.zeros((x_test_selected.shape[0], )) oof_test_skf = np.empty((cv.get_n_splits(), x_test_selected.shape[0])) fit_time = score_time = 0 start_time = time.time() for train, test in cv.split(x_selected, y): x_selected_test, y_test = check_X_y(x_selected[test], y[test], "csr") eval_set_params['eval_set'].append([x_selected_test, y_test]) eval_set_params['eval_names'].append('cv-valid') print('reset estimator') estimator = clone(estimator) score = _fit_and_score(estimator=estimator, X=x_selected, y=y, scorer=scorer, train=train, test=test, verbose=verbose, parameters=None, fit_params=eval_set_params) # cleanup for the next round del eval_set_params['eval_names'][1] del eval_set_params['eval_set'][1], x_selected_test, y_test gc.collect() fit_time = time.time() - start_time print('Learning done in {:f} seconds'.format(fit_time)) scores.append(score) # if it is not empty - we want oof predictions if test_data is not None: oof_train[test] = estimator.booster_.predict( x_selected[test], num_iteration=estimator.best_iteration_) oof_test_skf[fold, :] = estimator.booster_.predict( x_test_selected, num_iteration=estimator.best_iteration_) oof_holdout[fold, :] = estimator.booster_.predict( x_holdout_selected, num_iteration=estimator.best_iteration_) fold += 1 score_time = time.time() - start_time - fit_time print('predicting done in {:f} seconds'.format(score_time)) total_time = score_time + fit_time print('individual done in {:f} seconds'.format(total_time)) scores_mean = np.mean(scores) scores_std = np.std(scores) data_dict = {} if test_data is not None: oof_test[:] = oof_test_skf.mean(axis=0) oof_train = oof_train.reshape(-1, 1) oof_test = oof_test.reshape(-1, 1) data_dict = { 'holdout_score': float(estimator.best_score_['oof']['auc']), 'holdout_prediction_folds': oof_holdout, 'estimator_scores': estimator.best_score_, 'oof_test_folds': oof_test_skf, 'oof_train': oof_train, 'oof_test_mean': oof_test, 'estimator_params': estimator.get_params(), 'estimator_feature_importance': estimator.feature_importances_, 'estimator_best_iteration': int(estimator.best_iteration_), 'estimator_n_features_': estimator.n_features_, 'original_n_features': X.shape[0], 'cv_scores': scores, 'cv_score': scores_mean, 'cv_score_std': scores_std, 'folds': fold, 'individual': individual, 'individual_hash': str(hash(tuple(individual))), 'time': time.time() } 
del scores, oof_test_skf, oof_test, oof_train, eval_set_params del x_test_selected gc.collect() name = '{:.5f}_{:d}_{:.4f}_{:.4f}_{}_oof_data'.format( data_dict['holdout_score'], data_dict['estimator_n_features_'], data_dict['cv_score'], data_dict['cv_score_std'], data_dict['individual_hash']) save_oof_predictions(name, data_dict) if caching: gaobject.scores_cache[individual_tuple] = scores_mean filename = os.path.join(os.getcwd(), 'cache.z') joblib.dump(gaobject.scores_cache, filename, compress=True) del filename print(80 * '=') print(80 * '=') print('Individual scored') print('holdout-score: {:.5f}'.format(data_dict['holdout_score'])) print('cv-score : {:.5f}'.format(data_dict['cv_score'])) print('n_features : {:6d}'.format(data_dict['estimator_n_features_'])) print(80 * '=') print(80 * '=') del data_dict gc.collect() return scores_mean, individual_sum
def fit(self, X, y, sample_weight=None, relative_penalties=None): """Fit the model to training data. If n_splits > 1 also run n-fold cross validation on all values in lambda_path. The model will be fit n+1 times. On the first pass, the lambda_path will be determined, on the remaining passes, the model performance for each value of lambda. After cross validation, the attribute `cv_mean_score_` will contain the mean score over all folds for each value of lambda, and `cv_standard_error_` will contain the standard error of `cv_mean_score_` for each value of lambda. The value of lambda which achieves the best performance in cross validation will be saved to `lambda_max_` additionally, the largest value of lambda s.t.: cv_score(l) >= cv_score(lambda_max_) -\ cut_point * standard_error(lambda_max_) will be saved to `lambda_best_`. Parameters ---------- X : array, shape (n_samples, n_features) Input features Y : array, shape (n_samples,) Target values sample_weight : array, shape (n_samples,) Optional weight vector for observations relative_penalties: array, shape (n_features,) Optional relative weight vector for penalty. 0 entries remove penalty. Returns ------- self : object Returns self. """ if self.alpha > 1 or self.alpha < 0: raise ValueError("alpha must be between 0 and 1") if self.n_splits > 0 and self.n_splits < 3: raise ValueError("n_splits must be at least 3") X, y = check_X_y(X, y, accept_sparse='csr', ensure_min_samples=2) if sample_weight is None: sample_weight = np.ones(X.shape[0]) self._fit(X, y, sample_weight, relative_penalties) if self.n_splits >= 3: cv_scores = _score_lambda_path(self, X, y, sample_weight, relative_penalties, self.n_splits, self.scoring, classifier=False, n_jobs=self.n_jobs, verbose=self.verbose) self.cv_mean_score_ = np.atleast_1d(np.mean(cv_scores, axis=0)) self.cv_standard_error_ = np.atleast_1d(stats.sem(cv_scores)) self.lambda_max_inx_ = np.argmax(self.cv_mean_score_) self.lambda_max_ = self.lambda_path_[self.lambda_max_inx_] target_score = self.cv_mean_score_[self.lambda_max_inx_] -\ self.cut_point * self.cv_standard_error_[self.lambda_max_inx_] self.lambda_best_inx_ = np.argwhere( self.cv_mean_score_ >= target_score)[0] self.lambda_best_ = self.lambda_path_[self.lambda_best_inx_] self.coef_ = self.coef_path_[..., self.lambda_best_inx_] self.coef_ = self.coef_.squeeze(axis=self.coef_.ndim - 1) self.intercept_ = self.intercept_path_[ ..., self.lambda_best_inx_].squeeze() if self.intercept_.shape == (): # convert 0d array to scalar self.intercept_ = float(self.intercept_) return self
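# Illustrative sketch (made-up numbers, not the estimator above): how
# lambda_best_ follows from cv_mean_score_ / cv_standard_error_ via the
# cut_point rule quoted in the docstring. lambda_path is assumed to be sorted
# from largest to smallest penalty, as in glmnet-style paths.
import numpy as np

lambda_path = np.array([1.0, 0.5, 0.1, 0.05])
cv_mean_score = np.array([0.60, 0.70, 0.72, 0.71])
cv_standard_error = np.array([0.02, 0.02, 0.02, 0.02])
cut_point = 1.0

lambda_max_inx = np.argmax(cv_mean_score)
target = cv_mean_score[lambda_max_inx] - cut_point * cv_standard_error[lambda_max_inx]
lambda_best_inx = np.argwhere(cv_mean_score >= target)[0]  # first, i.e. largest, lambda
print(lambda_path[lambda_max_inx], lambda_path[lambda_best_inx])  # 0.1, [0.5]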
def discr_stat(X, Y, dissimilarity="euclidean", remove_isolates=True,
               return_rdfs=True):
    """
    Computes the discriminability statistic.

    Parameters
    ----------
    X : array, shape (n_samples, n_features) or (n_samples, n_samples)
        Input data. If dissimilarity=='precomputed', the input should be the
        dissimilarity matrix.
    Y : 1d-array, shape (n_samples)
        Input labels.
    dissimilarity : str, {"euclidean" (default), "precomputed"}
        Dissimilarity measure can be 'euclidean' (pairwise Euclidean distances
        between points in the dataset) or 'precomputed' (pre-computed
        dissimilarities).
    remove_isolates : bool, optional, default=True
        Whether to remove data points whose label occurs only once.
    return_rdfs : bool, optional, default=True
        Whether to return rdfs for all data points.

    Returns
    -------
    stat : float
        Discriminability statistic.
    rdfs : array, shape (n_samples, max{len(id)})
        Rdfs for each sample. Only returned if ``return_rdfs==True``.
    """
    check_X_y(X, Y, accept_sparse=True)

    uniques, counts = np.unique(Y, return_counts=True)
    if remove_isolates:
        idx = np.isin(Y, uniques[counts != 1])
        labels = Y[idx]

        if (dissimilarity == "euclidean"
                or dissimilarity == "cosine"
                or dissimilarity == "haversine"
                or dissimilarity == "manhattan"
                or dissimilarity == "mahalanobis"):
            X = X[idx]
        else:
            X = X[np.ix_(idx, idx)]
    else:
        labels = Y

    if dissimilarity == "euclidean":
        dissimilarities = euclidean_distances(X)
    elif dissimilarity == "cosine":
        dissimilarities = cosine_distances(X)
    elif dissimilarity == "haversine":
        dissimilarities = haversine_distances(X)
    elif dissimilarity == "manhattan":
        dissimilarities = manhattan_distances(X)
    else:
        dissimilarities = X

    rdfs = _discr_rdf(dissimilarities, labels)
    rdfs[rdfs < 0.5] = np.nan
    stat = np.nanmean(rdfs)

    if return_rdfs:
        return stat, rdfs
    else:
        return stat
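# Illustrative sketch (toy data, not the function above): the remove_isolates
# step keeps only samples whose label occurs more than once; for a precomputed
# dissimilarity matrix the same mask is applied to both axes with np.ix_.
import numpy as np

Y = np.array([0, 0, 1, 2, 2])
D = np.random.rand(5, 5)                 # stand-in for a precomputed matrix
uniques, counts = np.unique(Y, return_counts=True)
idx = np.isin(Y, uniques[counts != 1])   # drops the lone sample with label 1
labels, D_kept = Y[idx], D[np.ix_(idx, idx)]
print(labels, D_kept.shape)              # [0 0 2 2] (4, 4)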
def fit(self, X, y, labels=None, neighbors=None): """Generate the intra-label and inter-label distribution. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. y : array-like, shape (n_samples,) Corresponding label for each sample in X. labels : array-like, shape (n_samples,) Labels of each sample. neighbors : array-like, (n_neighboring_pairs, 2) An array that contains all neighboring pairs. Each row is a unique neighboring pair. Returns ------- self : object, Return self. """ # Check data X, y = check_X_y(X, y, dtype=None) # Set statistics self.majority_class_label_ = Counter(y).most_common()[0][0] self.unique_cluster_labels_ = ( np.unique(labels) if labels is not None else np.array(0, dtype=int) ) self.unique_class_labels_ = np.unique(y) self.n_samples_ = len(X) # Set default attributes self.labels_ = ( np.repeat(0, len(X)) if labels is None else check_array(labels, ensure_2d=False) ) self.neighbors_ = ( np.empty((0, 2), dtype=int) if neighbors is None else check_array(neighbors, ensure_2d=False) ) self.intra_distribution_ = { (0, class_label): 1.0 for class_label in np.unique(y) if class_label != self.majority_class_label_ } self.inter_distribution_ = {} # Fit distributor self._fit(X, y, labels, neighbors) # Validate fitting procedure self._validate_fitting() return self
def fit(self, X, Y): """Fit the model to data matrix X and targets Y. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data. Y : array-like, shape (n_samples, n_classes) The target values. Returns ------- self : object Returns self. """ X, Y = check_X_y(X, Y, multi_output=True, accept_sparse=True) random_state = check_random_state(self.random_state) check_array(X, accept_sparse=True) self.order_ = self.order if self.order_ is None: self.order_ = np.array(range(Y.shape[1])) elif isinstance(self.order_, str): if self.order_ == 'random': self.order_ = random_state.permutation(Y.shape[1]) elif sorted(self.order_) != list(range(Y.shape[1])): raise ValueError("invalid order") self.estimators_ = [clone(self.base_estimator) for _ in range(Y.shape[1])] self.classes_ = [] if self.cv is None: Y_pred_chain = Y[:, self.order_] if sp.issparse(X): X_aug = sp.hstack((X, Y_pred_chain), format='lil') X_aug = X_aug.tocsr() else: X_aug = np.hstack((X, Y_pred_chain)) elif sp.issparse(X): Y_pred_chain = sp.lil_matrix((X.shape[0], Y.shape[1])) X_aug = sp.hstack((X, Y_pred_chain), format='lil') else: Y_pred_chain = np.zeros((X.shape[0], Y.shape[1])) X_aug = np.hstack((X, Y_pred_chain)) del Y_pred_chain for chain_idx, estimator in enumerate(self.estimators_): y = Y[:, self.order_[chain_idx]] estimator.fit(X_aug[:, :(X.shape[1] + chain_idx)], y) if self.cv is not None and chain_idx < len(self.estimators_) - 1: col_idx = X.shape[1] + chain_idx cv_result = cross_val_predict( self.base_estimator, X_aug[:, :col_idx], y=y, cv=self.cv) if sp.issparse(X_aug): X_aug[:, col_idx] = np.expand_dims(cv_result, 1) else: X_aug[:, col_idx] = cv_result self.classes_.append(estimator.classes_) return self
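# Illustrative sketch (toy shapes, not the estimator above): in a classifier
# chain, estimator k sees the original features plus the first k targets of the
# chain, i.e. a growing slice of the horizontally stacked [X | Y] matrix.
import numpy as np

X = np.random.rand(6, 3)
Y = np.random.randint(0, 2, size=(6, 2))
X_aug = np.hstack((X, Y))                    # shape (6, 5)
for chain_idx in range(Y.shape[1]):
    X_k = X_aug[:, :X.shape[1] + chain_idx]  # input for estimator chain_idx
    print(chain_idx, X_k.shape)              # (6, 3) then (6, 4)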
def visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred, y_test_pred, show_figure=True, save_figure=False): # pragma: no cover """ Utility function for visualizing the results in examples Internal use only :param clf_name: The name of the detector :type clf_name: str :param X_train: The training samples :param X_train: numpy array of shape (n_samples, n_features) :param y_train: The ground truth of training samples :type y_train: list or array of shape (n_samples,) :param X_test: The test samples :type X_test: numpy array of shape (n_samples, n_features) :param y_test: The ground truth of test samples :type y_test: list or array of shape (n_samples,) :param y_train_pred: The predicted outlier scores on the training samples :type y_train_pred: numpy array of shape (n_samples, n_features) :param y_test_pred: The predicted outlier scores on the test samples :type y_test_pred: numpy array of shape (n_samples, n_features) :param show_figure: If set to True, show the figure :type show_figure: bool, optional (default=True) :param save_figure: If set to True, save the figure to the local :type save_figure: bool, optional (default=False) """ if X_train.shape[1] != 2 or X_test.shape[1] != 2: raise ValueError("Input data has to be 2-d for visualization. The " "input data has {shape}.".format(shape=X_train.shape)) X_train, y_train = check_X_y(X_train, y_train) X_test, y_test = check_X_y(X_test, y_test) c_train = get_color_codes(y_train) c_test = get_color_codes(y_test) fig = plt.figure(figsize=(12, 10)) plt.suptitle("Demo of {clf_name}".format(clf_name=clf_name)) fig.add_subplot(221) plt.scatter(X_train[:, 0], X_train[:, 1], c=c_train) plt.title('Train ground truth') legend_elements = [ Line2D([0], [0], marker='o', color='w', label='normal', markerfacecolor='b', markersize=8), Line2D([0], [0], marker='o', color='w', label='outlier', markerfacecolor='r', markersize=8) ] plt.legend(handles=legend_elements, loc=4) fig.add_subplot(222) plt.scatter(X_test[:, 0], X_test[:, 1], c=c_test) plt.title('Test ground truth') plt.legend(handles=legend_elements, loc=4) fig.add_subplot(223) plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train_pred) plt.title('Train prediction by {clf_name}'.format(clf_name=clf_name)) legend_elements = [ Line2D([0], [0], marker='o', color='w', label='normal', markerfacecolor='0', markersize=8), Line2D([0], [0], marker='o', color='w', label='outlier', markerfacecolor='yellow', markersize=8) ] plt.legend(handles=legend_elements, loc=4) fig.add_subplot(224) plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test_pred) plt.title('Test prediction by {clf_name}'.format(clf_name=clf_name)) plt.legend(handles=legend_elements, loc=4) if save_figure: plt.savefig('{clf_name}.png'.format(clf_name=clf_name), dpi=300) if show_figure: plt.show() return
def fit(self, X, y, feature_labels=None):  # -1 for unlabeled
    """Fit rule lists to data

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training data

    y : array_like, shape = [n_samples]
        Labels

    feature_labels : array_like, shape = [n_features], optional (default: None)
        String labels for each feature. If None, features are simply enumerated

    Returns
    -------
    self : returns an instance of self.
    """
    if len(set(y)) != 2:
        raise Exception(
            "Only binary classification is supported at this time!")
    X, y = check_X_y(X, y, ensure_min_samples=2, estimator=self)
    if feature_labels is None:
        feature_labels = ["ft" + str(i + 1) for i in range(len(X[0]))]
    self.feature_labels = feature_labels

    if not isinstance(X, list):
        X = np.array(X).tolist()
    if 'str' not in str(type(X[0][0])):
        if self.verbose:
            print("Warning: non-categorical data. Trying to discretize. "
                  "(Please convert categorical values to strings to avoid this.)")
        X = self.discretize(X, y)

    permsdic = defaultdict(default_permsdic)  # We will store the MCMC results here
    data = list(X[:])

    # Now find frequent itemsets
    # Mine separately for each class
    data_pos = [x for i, x in enumerate(data) if y[i] == 0]
    data_neg = [x for i, x in enumerate(data) if y[i] == 1]
    assert len(data_pos) + len(data_neg) == len(data)
    try:
        itemsets = [
            r[0] for r in fpgrowth(
                data_pos, supp=self.minsupport, zmax=self.maxcardinality)
        ]
        itemsets.extend([
            r[0] for r in fpgrowth(
                data_neg, supp=self.minsupport, zmax=self.maxcardinality)
        ])
    except TypeError:
        itemsets = [
            r[0] for r in fpgrowth(
                data_pos, supp=self.minsupport, max=self.maxcardinality)
        ]
        itemsets.extend([
            r[0] for r in fpgrowth(
                data_neg, supp=self.minsupport, max=self.maxcardinality)
        ])
    itemsets = list(set(itemsets))
    if self.verbose:
        print(len(itemsets), 'rules mined')

    # Now form the data-vs.-lhs set
    # X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
    X = [set() for j in range(len(itemsets) + 1)]
    X[0] = set(range(len(data)))  # the default rule satisfies all data
    for (j, lhs) in enumerate(itemsets):
        X[j + 1] = set(
            [i for (i, xi) in enumerate(data) if set(lhs).issubset(xi)])

    # now form lhs_len
    lhs_len = [0]
    for lhs in itemsets:
        lhs_len.append(len(lhs))
    nruleslen = Counter(lhs_len)
    lhs_len = np.array(lhs_len)
    itemsets_all = ['null']
    itemsets_all.extend(itemsets)

    Xtrain, Ytrain, nruleslen, lhs_len, self.itemsets = (
        X, np.vstack((y, 1 - y)).T.astype(int), nruleslen, lhs_len,
        itemsets_all)

    # Do MCMC
    res, Rhat = run_bdl_multichain_serial(self.max_iter, self.thinning,
                                          self.alpha, self.listlengthprior,
                                          self.listwidthprior, Xtrain, Ytrain,
                                          nruleslen, lhs_len,
                                          self.maxcardinality, permsdic,
                                          self.burnin, self.n_chains,
                                          [None] * self.n_chains,
                                          verbose=self.verbose)

    # Merge the chains
    permsdic = merge_chains(res)

    # The point estimate, BRL-point
    self.d_star = get_point_estimate(
        permsdic, lhs_len, Xtrain, Ytrain, self.alpha, nruleslen,
        self.maxcardinality, self.listlengthprior, self.listwidthprior,
        verbose=self.verbose)  # get the point estimate

    if self.d_star:
        # Compute the rule consequent
        self.theta, self.ci_theta = get_rule_rhs(Xtrain, Ytrain, self.d_star,
                                                 self.alpha, True)

    return self
def fit(self, X, y, sample_weight=None): """Fit linear model. Derived-from - and meant to override - the fit method of the base class. Parameters ---------- X : array-like or sparse matrix, shape (n_samples, n_features) Training data y : array_like, shape (n_samples, n_targets) Target values. Will be cast to X's dtype if necessary sample_weight : numpy array of shape [n_samples] Individual weights for each sample .. versionadded:: 0.17 parameter *sample_weight* support to LinearRegression. Returns ------- self : returns an instance of self. """ def lstsq(a, b, cond=None, overwrite_a=False, overwrite_b=False, check_finite=True, lapack_driver=None): """ Compute least-squares solution to equation Ax = b. Compute a vector x such that the 2-norm ``|b - A x|`` is minimized. This code was adapted from the Scipy distribution: https://github.com/scipy/scipy/blob/v1.2.1/scipy/linalg/basic.py#L1047-L1264 Parameters ---------- a : (M, N) array_like Left hand side matrix (2-D array). b : (M,) or (M, K) array_like Right hand side matrix or vector (1-D or 2-D array). cond : float, optional Cutoff for 'small' singular values; used to determine effective rank of a. Singular values smaller than ``rcond * largest_singular_value`` are considered zero. overwrite_a : bool, optional Discard data in `a` (may enhance performance). Default is False. overwrite_b : bool, optional Discard data in `b` (may enhance performance). Default is False. check_finite : bool, optional Whether to check that the input matrices contain only finite numbers. Disabling may give a performance gain, but may result in problems (crashes, non-termination) if the inputs do contain infinities or NaNs. lapack_driver : str, optional Which LAPACK driver is used to solve the least-squares problem. Options are ``'gelsd'``, ``'gelsy'``, ``'gelss'``. Default (``'gelsd'``) is a good choice. However, ``'gelsy'`` can be slightly faster on many problems. ``'gelss'`` was used historically. It is generally slow but uses less memory. .. versionadded:: 0.17.0 Returns ------- x : (N,) or (N, K) ndarray Least-squares solution. Return shape matches shape of `b`. residues : (0,) or () or (K,) ndarray Sums of residues, squared 2-norm for each column in ``b - a x``. If rank of matrix a is ``< N`` or ``N > M``, or ``'gelsy'`` is used, this is a length zero array. If b was 1-D, this is a () shape array (numpy scalar), otherwise the shape is (K,). rank : int Effective rank of matrix `a`. s : (min(M,N),) ndarray or None Singular values of `a`. The condition number of a is ``abs(s[0] / s[-1])``. None is returned when ``'gelsy'`` is used. Raises ------ LinAlgError If computation does not converge. ValueError When parameters are wrong. See Also -------- optimize.nnls : linear least squares with non-negativity constraint Examples -------- >>> from scipy.linalg import lstsq >>> import matplotlib.pyplot as plt Suppose we have the following data: >>> x = np.array([1, 2.5, 3.5, 4, 5, 7, 8.5]) >>> y = np.array([0.3, 1.1, 1.5, 2.0, 3.2, 6.6, 8.6]) We want to fit a quadratic polynomial of the form ``y = a + b*x**2`` to this data. We first form the "design matrix" M, with a constant column of 1s and a column containing ``x**2``: >>> M = x[:, np.newaxis]**[0, 2] >>> M array([[ 1. , 1. ], [ 1. , 6.25], [ 1. , 12.25], [ 1. , 16. ], [ 1. , 25. ], [ 1. , 49. ], [ 1. , 72.25]]) We want to find the least-squares solution to ``M.dot(p) = y``, where ``p`` is a vector with length 2 that holds the parameters ``a`` and ``b``. 
>>> p, res, rnk, s = lstsq(M, y) >>> p array([ 0.20925829, 0.12013861]) Plot the data and the fitted curve. >>> plt.plot(x, y, 'o', label='data') >>> xx = np.linspace(0, 9, 101) >>> yy = p[0] + p[1]*xx**2 >>> plt.plot(xx, yy, label='least squares fit, $y = a + bx^2$') >>> plt.xlabel('x') >>> plt.ylabel('y') >>> plt.legend(framealpha=1, shadow=True) >>> plt.grid(alpha=0.25) >>> plt.show() """ a1 = _asarray_validated(a, check_finite=check_finite) b1 = _asarray_validated(b, check_finite=check_finite) if len(a1.shape) != 2: raise ValueError('expected matrix') m, n = a1.shape if len(b1.shape) == 2: nrhs = b1.shape[1] else: nrhs = 1 if m != b1.shape[0]: raise ValueError('incompatible dimensions') if m == 0 or n == 0: # Zero-sized problem, confuses LAPACK x = np.zeros((n, ) + b1.shape[1:], dtype=np.common_type(a1, b1)) if n == 0: residues = np.linalg.norm(b1, axis=0)**2 else: residues = np.empty((0, )) return x, residues, 0, np.empty((0, )) driver = lapack_driver if driver is None: global default_lapack_driver driver = default_lapack_driver if driver not in ('gelsd', 'gelsy', 'gelss'): raise ValueError('LAPACK driver "%s" is not found' % driver) lapack_func, lapack_lwork = get_lapack_funcs( (driver, '%s_lwork' % driver), (a1, b1)) real_data = True if (lapack_func.dtype.kind == 'f') else False if m < n: # need to extend b matrix as it will be filled with # a larger solution matrix if len(b1.shape) == 2: b2 = np.zeros((n, nrhs), dtype=lapack_func.dtype) b2[:m, :] = b1 else: b2 = np.zeros(n, dtype=lapack_func.dtype) b2[:m] = b1 b1 = b2 overwrite_a = overwrite_a or _datacopied(a1, a) overwrite_b = overwrite_b or _datacopied(b1, b) if cond is None: cond = np.finfo(lapack_func.dtype).eps a1_wrk = np.copy(a1) b1_wrk = np.copy(b1) lwork, iwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond) x_check, s_check, rank_check, info = lapack_func( a1_wrk, b1_wrk, lwork, iwork, cond, False, False) driver = 'gelss' if driver in ('gelss', 'gelsd'): if driver == 'gelss': if not context: a1_wrk = np.copy(a1) b1_wrk = np.copy(b1) lwork, iwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond) x, s, rank, info = lapack_func(a1_wrk, b1_wrk, lwork, iwork, cond, False, False) else: try: # Check that we aren't dealing with an underconstrained problem ... if m < n: pkg.log.error( Exception( "Underconstrained problems not yet supported by Magma." 
)) # Initialize a1_trans = np.copy(a1, order='F') a1_gpu = gpuarray.to_gpu(a1_trans) # Note that the result for 'x' gets written to the vector inputted for b x_trans = np.copy(b1, order='F') x_gpu = gpuarray.to_gpu(x_trans) # Init singular-value decomposition (SVD) output & buffer arrays s = np.zeros(min(m, n), np.float32) u = np.zeros((m, m), np.float32) vh = np.zeros((n, n), np.float32) # Query and allocate optimal workspace # n.b.: - the result for 'x' gets written to the input vector for b, so we just label b->x # - assume magma variables lda=ldb=m throughout here lwork_SVD = magma.magma_sgesvd_buffersize( 'A', 'A', m, n, a1_trans.ctypes.data, m, s.ctypes.data, u.ctypes.data, m, vh.ctypes.data, n) # For some reason, magma_sgels_buffersize() does not return the right value for large problems, so # we compute the values used for the validation check (see Magma SGELS documentation) directly and use that #lwork_LS = magma.magma_sgels_buffersize('n', m, n, nrhs, a1_trans.ctypes.data, m, x_trans.ctypes.data, m) nb = magma.magma_get_sgeqrf_nb(m, n) check = (m - n + nb) * (nrhs + nb) + nrhs * nb lwork_LS = check # Allocate workspaces hwork_SVD = np.zeros(lwork_SVD, np.float32, order='F') hwork_LS = np.zeros(lwork_LS, np.float32) # Compute SVD timer.start("SVD") magma.magma_sgesvd('A', 'A', m, n, a1_trans.ctypes.data, m, s.ctypes.data, u.ctypes.data, m, vh.ctypes.data, n, hwork_SVD.ctypes.data, lwork_SVD) timer.stop("SVD") # Note, the use of s_i>rcond here; this is meant to select # values that are effectively non-zero. Results will depend # somewhat on the choice for this value. This criterion was # adopted from that utilized by scipy.linalg.basic.lstsq() rcond = np.finfo(lapack_func.dtype).eps * s[0] rank = sum(1 for s_i in s if s_i > rcond) # Run LS solver timer.start("LS") magma.magma_sgels_gpu('n', m, n, nrhs, a1_gpu.gpudata, m, x_gpu.gpudata, m, hwork_LS.ctypes.data, lwork_LS) timer.stop("LS") # Unload result from GPU x = x_gpu.get() except magma.MagmaError as e: info = e._status else: info = 0 elif driver == 'gelsd': if real_data: if not context: raise Exception( "For some reason, the CUDA implementation of fit() is being called when context is False." ) else: raise Exception( "gelsd not supported using Cuda yet") else: # complex data raise LinAlgError( "driver=%s not yet supported for complex data" % (driver)) if info > 0: raise LinAlgError( "SVD did not converge in Linear Least Squares") if info < 0: raise ValueError( 'illegal value in %d-th argument of internal %s' % (-info, lapack_driver)) resids = np.asarray([], dtype=x.dtype) if m > n: x1 = x[:n] if rank == n: resids = np.sum(np.abs(x[n:])**2, axis=0) x = x1 elif driver == 'gelsy': raise LinAlgError("driver=%s not yet supported" % (driver)) #pkg.log.close("Done", time_elapsed=True) return x, resids, rank, s n_jobs_ = self.n_jobs X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], y_numeric=True, multi_output=True) if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1: raise ValueError("Sample weights must be 1D array or scalar") X, y, X_offset, y_offset, X_scale = self._preprocess_data( X, y, fit_intercept=self.fit_intercept, normalize=self.normalize, copy=self.copy_X, sample_weight=sample_weight) if sample_weight is not None: # Sample weight can be implemented via a simple rescaling. 
X, y = _rescale_data(X, y, sample_weight) if sp.issparse(X): raise Exception( "Sparse matrices not supported yet for Cuda implementation.") else: ############################### self.coef_, self._residues, self.rank_, self.singular_ = lstsq( X, y) ############################### self.coef_ = self.coef_.T if y.ndim == 1: self.coef_ = np.ravel(self.coef_) self._set_intercept(X_offset, y_offset, X_scale) return self
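# Illustrative sketch (CPU only, not the MAGMA/GPU path above): the quantities
# this fit() ultimately stores (coefficients, residues, rank, singular values)
# are what an ordinary least-squares solver returns; numpy's lstsq is a simple
# reference point.
import numpy as np

X = np.array([[1.0, 1.0], [1.0, 2.0], [1.0, 3.0], [1.0, 4.0]])
y = np.array([1.1, 1.9, 3.2, 3.9])
coef, residues, rank, singular = np.linalg.lstsq(X, y, rcond=None)
print(coef, rank)  # intercept/slope estimates and the effective rank (2)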
def fit(self, X, y):
    """Fit classifier.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.

    y : numpy array of shape (n_samples,)
        The ground truth of the input samples (labels).

    Returns
    -------
    self : object
        The fitted estimator.
    """
    # Validate inputs X and y
    X, y = check_X_y(X, y)
    X = check_array(X)
    self._set_n_classes(y)
    n_samples = X.shape[0]

    # initialize matrix for storing newly generated features
    new_features = np.zeros([n_samples, self.n_base_estimators_])

    # build CV datasets
    X_new, y_new, index_lists = split_datasets(
        X, y, n_folds=self.n_folds, shuffle_data=self.shuffle_data,
        random_state=self.random_state)

    # iterate over all base classifiers
    for i, clf in enumerate(self.base_estimators):
        # iterate over all folds
        for j in range(self.n_folds):
            # build train and test index
            full_idx = list(range(n_samples))
            test_idx = index_lists[j]
            train_idx = list_diff(full_idx, test_idx)
            X_train, y_train = X_new[train_idx, :], y_new[train_idx]
            X_test, y_test = X_new[test_idx, :], y_new[test_idx]

            # train the classifier
            clf.fit(X_train, y_train)

            # generate the new features on the pseudo test set
            if self.use_proba:
                new_features[test_idx, i] = clf.predict_proba(X_test)[:, 1]
            else:
                new_features[test_idx, i] = clf.predict(X_test)

    # build the new dataset for training
    if self.keep_original:
        X_new_comb = np.concatenate([X_new, new_features], axis=1)
    else:
        X_new_comb = new_features
    y_new_comb = y_new

    # train the meta classifier
    self.meta_clf.fit(X_new_comb, y_new_comb)
    self.fitted_ = True

    # train all base classifiers on the full train dataset
    # iterate over all base classifiers
    for i, clf in enumerate(self.base_estimators):
        clf.fit(X_new, y_new)

    return self
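# Illustrative sketch (plain scikit-learn, not the class above): producing one
# column of out-of-fold "new features" for a single base classifier, which is
# what the nested loop over base_estimators and folds builds. The estimator and
# splitter choices here are arbitrary examples.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

X, y = make_classification(n_samples=100, random_state=0)
oof = np.zeros(X.shape[0])
for train_idx, test_idx in KFold(n_splits=4, shuffle=True, random_state=0).split(X):
    clf = LogisticRegression(max_iter=1000).fit(X[train_idx], y[train_idx])
    oof[test_idx] = clf.predict_proba(X[test_idx])[:, 1]   # use_proba=True branch
print(oof[:5])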
def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. Parameters ---------- X : array-like, shape (n_samples, n_features) Training vector, where n_samples in the number of samples and n_features is the number of features. y : array-like, shape (n_samples,) Target vector relative to X. sample_weight : array-like, shape (n_samples,) Weight given to each sample. Returns ------- self : object """ X, y = check_X_y(X, y, copy=False, accept_sparse=['csr'], y_numeric=True, dtype=[np.float64, np.float32]) if sample_weight is not None: sample_weight = np.array(sample_weight) check_consistent_length(y, sample_weight) else: sample_weight = np.ones_like(y) if self.epsilon < 1.0: raise ValueError( "epsilon should be greater than or equal to 1.0, got %f" % self.epsilon) if self.warm_start and hasattr(self, 'coef_'): parameters = np.concatenate((self.coef_, [self.intercept_])) else: if self.fit_intercept: parameters = np.zeros(X.shape[1] + 1) else: parameters = np.zeros(X.shape[1]) # Make sure to initialize the scale parameter to a strictly # positive value: parameters[-1] = 1 # Sigma or the scale factor should be non-negative. # Setting it to be zero might cause undefined bounds hence we set it # to a value close to zero. bounds = np.tile([-np.inf, np.inf], (parameters.shape[0], 1)) bounds[-1][0] = np.finfo(np.float64).eps * 10 parameters, f, dict_ = optimize.fmin_l_bfgs_b( _huber_loss_and_gradient, parameters, args=(X, y, self.epsilon, self.alpha, self.sigma, sample_weight), maxiter=self.max_iter, pgtol=self.tol, bounds=bounds, iprint=0) if dict_['warnflag'] == 2: raise ValueError("HuberRegressor convergence failed:" " l-BFGS-b solver terminated with %s" % dict_['task'].decode('ascii')) # In scipy <= 1.0.0, nit may exceed maxiter. # See https://github.com/scipy/scipy/issues/7854. self.n_iter_ = min(dict_['nit'], self.max_iter) if self.fit_intercept: self.intercept_ = parameters[-1] else: self.intercept_ = 0.0 self.coef_ = parameters[:X.shape[1]] residual = np.abs(y - safe_sparse_dot(X, self.coef_) - self.intercept_) return self
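# Illustrative sketch (toy objective, not the Huber loss above): the same
# bounds construction, keeping only the last parameter (the scale) strictly
# positive, passed to scipy's fmin_l_bfgs_b. The quadratic objective is a
# made-up stand-in.
import numpy as np
from scipy import optimize

def objective(params):
    return np.sum((params - np.array([1.0, 2.0, 0.5])) ** 2)

x0 = np.zeros(3)
bounds = np.tile([-np.inf, np.inf], (3, 1))
bounds[-1][0] = np.finfo(np.float64).eps * 10   # scale parameter stays > 0
params, f_min, info = optimize.fmin_l_bfgs_b(
    objective, x0, bounds=bounds, approx_grad=True)
print(params, info['warnflag'])                 # ~[1, 2, 0.5], 0 on convergence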
def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None): self.accountant.check(self.epsilon, 0) if sample_weight is not None: warn_unused_args("sample_weight") X, y = check_X_y(X, y) if self.bounds is None: warnings.warn( "Bounds have not been specified and will be calculated on the data provided. This will " "result in additional privacy leakage. To ensure differential privacy and no additional " "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning) self.bounds = (np.min(X, axis=0), np.max(X, axis=0)) self.bounds = check_bounds(self.bounds, shape=X.shape[1]) X = clip_to_bounds(X, self.bounds) self.epsilon_ = self.var_smoothing if _refit: self.classes_ = None if _check_partial_fit_first_call(self, classes): n_features = X.shape[1] n_classes = len(self.classes_) self.theta_ = np.zeros((n_classes, n_features)) self.sigma_ = np.zeros((n_classes, n_features)) self.class_count_ = np.zeros(n_classes, dtype=np.float64) if self.priors is not None: priors = np.asarray(self.priors) if len(priors) != n_classes: raise ValueError( "Number of priors must match number of classes.") if not np.isclose(priors.sum(), 1.0): raise ValueError("The sum of the priors should be 1.") if (priors < 0).any(): raise ValueError("Priors must be non-negative.") self.class_prior_ = priors else: # Initialize the priors to zeros for each class self.class_prior_ = np.zeros(len(self.classes_), dtype=np.float64) else: if X.shape[1] != self.theta_.shape[1]: raise ValueError( "Number of features %d does not match previous data %d." % (X.shape[1], self.theta_.shape[1])) # Put epsilon back in each time self.sigma_[:, :] -= self.epsilon_ classes = self.classes_ unique_y = np.unique(y) unique_y_in_classes = np.in1d(unique_y, classes) if not np.all(unique_y_in_classes): raise ValueError( "The target label(s) %s in y do not exist in the initial classes %s" % (unique_y[~unique_y_in_classes], classes)) noisy_class_counts = self._noisy_class_counts(y) for _i, y_i in enumerate(unique_y): i = classes.searchsorted(y_i) X_i = X[y == y_i, :] n_i = noisy_class_counts[_i] new_theta, new_sigma = self._update_mean_variance( self.class_count_[i], self.theta_[i, :], self.sigma_[i, :], X_i, n_noisy=n_i) self.theta_[i, :] = new_theta self.sigma_[i, :] = new_sigma self.class_count_[i] += n_i self.sigma_[:, :] += self.epsilon_ # Update if only no priors is provided if self.priors is None: # Empirical prior, with sample_weight taken into account self.class_prior_ = self.class_count_ / self.class_count_.sum() self.accountant.spend(self.epsilon, 0) return self
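# Illustrative sketch (pure numpy, not the library call above): clipping each
# feature to per-dimension (lower, upper) bounds before any statistics are
# computed, which is what keeps the sensitivity of the noisy estimates bounded.
import numpy as np

X = np.array([[0.5, 10.0], [2.0, -3.0], [1.0, 4.0]])
lower, upper = np.array([0.0, 0.0]), np.array([1.5, 5.0])
X_clipped = np.clip(X, lower, upper)   # bounds broadcast across rows
print(X_clipped)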
def fit(self, X, y): y, _ = check_target_type(y, indicate_one_vs_all=True) X, y = check_X_y(X, y, accept_sparse=True) return self
def fit(self, X, y): """Fit the gradient boosting model. Parameters ---------- X : array-like, shape=(n_samples, n_features) The input samples. y : array-like, shape=(n_samples,) Target values. Returns ------- self : object """ fit_start_time = time() acc_find_split_time = 0. # time spent finding the best splits acc_apply_split_time = 0. # time spent splitting nodes # time spent predicting X for gradient and hessians update acc_prediction_time = 0. # TODO: add support for mixed-typed (numerical + categorical) data # TODO: add support for missing data # TODO: add support for pre-binned data (pass-through)? # TODO: test input checking X, y = check_X_y(X, y, dtype=[np.float32, np.float64]) y = self._encode_y(y) if X.shape[0] == 1 or X.shape[1] == 1: raise ValueError( 'Passing only one sample or one feature is not supported yet. ' 'See numba issue #3569.' ) rng = check_random_state(self.random_state) self._validate_parameters() self.n_features_ = X.shape[1] # used for validation in predict() if self.verbose: print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ", end="", flush=True) tic = time() self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng) X_binned = self.bin_mapper_.fit_transform(X) toc = time() if self.verbose: duration = toc - tic troughput = X.nbytes / duration print(f"{duration:.3f} s ({troughput / 1e6:.3f} MB/s)") self.loss_ = self._get_loss() if self.scoring is not None and self.validation_split is not None: # stratify for classification stratify = y if hasattr(self.loss_, 'predict_proba') else None X_binned_train, X_binned_val, y_train, y_val = train_test_split( X_binned, y, test_size=self.validation_split, stratify=stratify, random_state=rng) if X_binned_train.size == 0 or X_binned_val.size == 0: raise ValueError( f'Not enough data (n_samples={X_binned.shape[0]}) to ' f'perform early stopping with validation_split=' f'{self.validation_split}. Use more training data or ' f'adjust validation_split.' ) # Histogram computation is faster on feature-aligned data. X_binned_train = np.asfortranarray(X_binned_train) else: X_binned_train, y_train = X_binned, y X_binned_val, y_val = None, None # Subsample the training set for score-based monitoring. subsample_size = 10000 if X_binned_train.shape[0] < subsample_size: X_binned_small_train = np.ascontiguousarray(X_binned_train) y_small_train = y_train else: indices = rng.choice( np.arange(X_binned_train.shape[0]), subsample_size) X_binned_small_train = X_binned_train[indices] y_small_train = y_train[indices] if self.verbose: print("Fitting gradient boosted rounds:") n_samples = X_binned_train.shape[0] # values predicted by the trees. 
Used as-is in regression, and # transformed into probas and / or classes for classification raw_predictions = np.zeros( shape=(n_samples, self.n_trees_per_iteration_), dtype=y_train.dtype ) # gradients and hessians are 1D arrays of size # n_samples * n_trees_per_iteration gradients, hessians = self.loss_.init_gradients_and_hessians( n_samples=n_samples, n_trees_per_iteration=self.n_trees_per_iteration_ ) # predictors_ is a matrix of TreePredictor objects with shape # (n_iter_, n_trees_per_iteration) self.predictors_ = predictors = [] scorer = check_scoring(self, self.scoring) self.train_scores_ = [] if self.scoring is not None: # Add predictions of the initial model (before the first tree) predicted_train = self._predict_binned(X_binned_train) score_train = scorer._sign * scorer._score_func(y_train, predicted_train) self.train_scores_.append(score_train) if self.validation_split is not None: self.validation_scores_ = [] predicted_val = self._predict_binned(X_binned_val) score_val = scorer._sign * scorer._score_func(y_val, predicted_val) self.validation_scores_.append(score_val) for iteration in range(self.max_iter): if self.verbose: iteration_start_time = time() print(f"[{iteration + 1}/{self.max_iter}] ", end='', flush=True) # Update gradients and hessians, inplace self.loss_.update_gradients_and_hessians(gradients, hessians, y_train, raw_predictions) predictors.append([]) # Build `n_trees_per_iteration` trees. for k, (gradients_at_k, hessians_at_k) in enumerate(zip( np.array_split(gradients, self.n_trees_per_iteration_), np.array_split(hessians, self.n_trees_per_iteration_))): # the xxxx_at_k arrays are **views** on the original arrays. # Note that for binary classif and regressions, # n_trees_per_iteration is 1 and xxxx_at_k is equivalent to the # whole array. 
grower = TreeGrower( X_binned_train, gradients_at_k, hessians_at_k, max_bins=self.max_bins, n_bins_per_feature=self.bin_mapper_.n_bins_per_feature_, max_leaf_nodes=self.max_leaf_nodes, max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, l2_regularization=self.l2_regularization, shrinkage=self.learning_rate) grower.grow() acc_apply_split_time += grower.total_apply_split_time acc_find_split_time += grower.total_find_split_time predictor = grower.make_predictor( bin_thresholds=self.bin_mapper_.bin_thresholds_) predictors[-1].append(predictor) tic_pred = time() # prepare leaves_data so that _update_raw_predictions can be # @njitted leaves_data = [(l.value, l.sample_indices) for l in grower.finalized_leaves] _update_raw_predictions(leaves_data, raw_predictions[:, k]) toc_pred = time() acc_prediction_time += toc_pred - tic_pred should_stop = self._check_early_stopping( scorer, X_binned_small_train, y_small_train, X_binned_val, y_val) if self.verbose: self._print_iteration_stats(iteration_start_time) if should_stop: break if self.verbose: duration = time() - fit_start_time n_total_leaves = sum( predictor.get_n_leaf_nodes() for predictors_at_ith_iteration in self.predictors_ for predictor in predictors_at_ith_iteration) n_predictors = sum( len(predictors_at_ith_iteration) for predictors_at_ith_iteration in self.predictors_) print(f"Fit {n_predictors} trees in {duration:.3f} s, " f"({n_total_leaves} total leaves)") print(f"{'Time spent finding best splits:':<32} " f"{acc_find_split_time:.3f}s") print(f"{'Time spent applying splits:':<32} " f"{acc_apply_split_time:.3f}s") print(f"{'Time spent predicting:':<32} " f"{acc_prediction_time:.3f}s") self.train_scores_ = np.asarray(self.train_scores_) if self.scoring is not None and self.validation_split is not None: self.validation_scores_ = np.asarray(self.validation_scores_) return self
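# Illustrative sketch (toy sizes, not the grower above): gradients/hessians are
# flat arrays of length n_samples * n_trees_per_iteration, and each tree k gets
# a contiguous slice of them via np.array_split, one slice per class.
import numpy as np

n_samples, n_trees_per_iteration = 5, 3
gradients = np.arange(n_samples * n_trees_per_iteration, dtype=np.float64)
for k, gradients_at_k in enumerate(np.array_split(gradients, n_trees_per_iteration)):
    print(k, gradients_at_k)   # three slices of length n_samples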
def fit(self, X, y, sample_weight=None, relative_penalties=None, groups=None): """Fit the model to training data. If n_splits > 1 also run n-fold cross validation on all values in lambda_path. The model will be fit n+1 times. On the first pass, the lambda_path will be determined, on the remaining passes, the model performance for each value of lambda. After cross validation, the attribute `cv_mean_score_` will contain the mean score over all folds for each value of lambda, and `cv_standard_error_` will contain the standard error of `cv_mean_score_` for each value of lambda. The value of lambda which achieves the best performance in cross validation will be saved to `lambda_max_` additionally, the largest value of lambda s.t.: cv_score(l) >= cv_score(lambda_max_) -\ cut_point * standard_error(lambda_max_) will be saved to `lambda_best_`. Parameters ---------- X : array, shape (n_samples, n_features) Input features y : array, shape (n_samples,) Target values sample_weight : array, shape (n_samples,) Optional weight vector for observations relative_penalties: array, shape (n_features,) Optional relative weight vector for penalty. 0 entries remove penalty. groups: array, shape (n_samples,) Group labels for the samples used while splitting the dataset into train/test set. If the groups are specified, the groups will be passed to sklearn.model_selection.GroupKFold. If None, then data will be split randomly for K-fold cross-validation via sklearn.model_selection.KFold. Returns ------- self : object Returns self. """ X, y = check_X_y(X, y, accept_sparse='csr', ensure_min_samples=2) if sample_weight is None: sample_weight = np.ones(X.shape[0]) else: sample_weight = np.asarray(sample_weight) if not np.isscalar(self.lower_limits): self.lower_limits = np.asarray(self.lower_limits) if len(self.lower_limits) != X.shape[1]: raise ValueError("lower_limits must equal number of features") if not np.isscalar(self.upper_limits): self.upper_limits = np.asarray(self.upper_limits) if len(self.upper_limits) != X.shape[1]: raise ValueError("upper_limits must equal number of features") if any(self.lower_limits > 0) if isinstance( self.lower_limits, np.ndarray) else self.lower_limits > 0: raise ValueError("lower_limits must be non-positive") if any(self.upper_limits < 0) if isinstance( self.upper_limits, np.ndarray) else self.upper_limits < 0: raise ValueError("upper_limits must be positive") if self.alpha > 1 or self.alpha < 0: raise ValueError("alpha must be between 0 and 1") if self.n_splits > 0 and self.n_splits < 3: raise ValueError("n_splits must be at least 3") self._fit(X, y, sample_weight, relative_penalties) if self.n_splits >= 3: if groups is None: self._cv = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state) else: self._cv = GroupKFold(n_splits=self.n_splits) cv_scores = _score_lambda_path(self, X, y, groups, sample_weight, relative_penalties, self.scoring, n_jobs=self.n_jobs, verbose=self.verbose) self.cv_mean_score_ = np.atleast_1d(np.mean(cv_scores, axis=0)) self.cv_standard_error_ = np.atleast_1d(stats.sem(cv_scores)) self.lambda_max_inx_ = np.argmax(self.cv_mean_score_) self.lambda_max_ = self.lambda_path_[self.lambda_max_inx_] target_score = self.cv_mean_score_[self.lambda_max_inx_] -\ self.cut_point * self.cv_standard_error_[self.lambda_max_inx_] self.lambda_best_inx_ = np.argwhere( self.cv_mean_score_ >= target_score)[0] self.lambda_best_ = self.lambda_path_[self.lambda_best_inx_] self.coef_ = self.coef_path_[..., self.lambda_best_inx_] self.coef_ = 
self.coef_.squeeze(axis=self.coef_.ndim - 1) self.intercept_ = self.intercept_path_[ ..., self.lambda_best_inx_].squeeze() if self.intercept_.shape == (): # convert 0d array to scalar self.intercept_ = float(self.intercept_) return self
def fit(self, X, y): y, _ = check_target_type(y, indicate_one_vs_all=True) X, y = check_X_y(X, y, accept_sparse=False) self.sampling_strategy_ = "sampling_strategy_" return self
def sample(self, X, y):
    """Resample the dataset.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.

    y : ndarray, shape (n_samples, )
        Corresponding label for each sample in X.

    Returns
    -------
    X_resampled : ndarray, shape (n_samples_new, n_features)
        The array containing the resampled data.

    y_resampled : ndarray, shape (n_samples_new)
        The corresponding label of `X_resampled`

    """
    # Check the consistency of X and y
    X, y = check_X_y(X, y)

    # Call the parent function
    super(ADASYN, self).sample(X, y)

    # Keep the samples from the majority class
    X_resampled = X.copy()
    y_resampled = y.copy()

    # Define the number of samples to create
    # We handle only two-class problems for the moment.
    if self.ratio == 'auto':
        num_samples = (self.stats_c_[self.maj_c_] -
                       self.stats_c_[self.min_c_])
    else:
        num_samples = int((self.ratio * self.stats_c_[self.maj_c_]) -
                          self.stats_c_[self.min_c_])

    # Start by separating minority class features and target values.
    X_min = X[y == self.min_c_]

    # Print if verbose is true
    if self.verbose:
        print('Finding the {} nearest neighbours...'.format(self.k))

    # Look for the k-th nearest neighbours, excluding, of course, the
    # point itself.
    self.nearest_neighbour.fit(X)

    # Get the distance to the NN
    _, ind_nn = self.nearest_neighbour.kneighbors(X_min)

    # Compute the ratio of majority samples next to minority samples
    ratio_nn = np.sum(y[ind_nn[:, 1:]] == self.maj_c_, axis=1) / self.k

    # Normalize the ratio
    ratio_nn /= np.sum(ratio_nn)

    # Compute the number of samples to be generated
    num_samples_nn = np.round(ratio_nn * num_samples).astype(int)

    # For each minority sample
    for x_i, x_i_nn, num_sample_i in zip(X_min, ind_nn, num_samples_nn):

        # Fix the seed
        np.random.seed(self.random_state)

        # Pick the wanted neighbours
        nn_zs = np.random.randint(1, high=self.k + 1, size=num_sample_i)

        # Create a new sample
        for nn_z in nn_zs:
            step = np.random.uniform()
            x_gen = x_i + step * (x_i - X[x_i_nn[nn_z], :])
            X_resampled = np.vstack((X_resampled, x_gen))
            y_resampled = np.hstack((y_resampled, self.min_c_))

    if self.verbose:
        print("Over-sampling performed: {}".format(Counter(y_resampled)))

    return X_resampled, y_resampled
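# Illustrative sketch (toy vectors, not the sampler above): a synthetic sample
# is generated by combining a minority point with one of its neighbours using a
# uniform step; the textbook SMOTE/ADASYN interpolation has the form
# x_new = x_i + step * (x_neighbour - x_i).
import numpy as np

rng = np.random.RandomState(0)
x_i = np.array([1.0, 1.0])
x_neighbour = np.array([2.0, 0.0])
step = rng.uniform()
x_new = x_i + step * (x_neighbour - x_i)   # lies on the segment between the two
print(x_new)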