def predict(self, X): """Predict regression target for X. The predicted regression target of an input sample is computed as the mean predicted regression targets of the trees in the forest. Parameters ---------- X : array-like or sparse matrix of shape = [n_samples, n_features] The input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will be converted into a sparse ``csr_matrix``. Returns ------- y : array of shape = [n_samples] or [n_samples, n_outputs] The predicted values. """ self.check_is_fitted() # Check data X = check_X(X, enforce_univariate=True) X = self._validate_X_predict(X) # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) # Parallel loop y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose)( delayed(e.predict)(X, check_input=True) for e in self.estimators_) return np.sum(y_hat, axis=0) / len(self.estimators_)
def predict_proba(self, X): """Predict class probabilities for X. The predicted class probabilities of an input sample are computed as the mean predicted class probabilities of the trees in the forest. The class probability of a single tree is the fraction of samples of the same class in a leaf. Parameters ---------- X : array-like or sparse matrix of shape = [n_samples, n_features] The input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will be converted into a sparse ``csr_matrix``. Returns ------- p : array of shape = [n_samples, n_classes], or a list of n_outputs such arrays if n_outputs > 1. The class probabilities of the input samples. The order of the classes corresponds to that in the attribute `classes_`. """ # Check data self.check_is_fitted() X = check_X(X, enforce_univariate=True) X = self._validate_X_predict(X) # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) all_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose)(delayed(e.predict_proba)(X) for e in self.estimators_) return np.sum(all_proba, axis=0) / len(self.estimators_)
def predict_proba(self, X):
    check_is_fitted(self)

    # Check data
    X = self._validate_X_predict(X)

    # Assign chunk of trees to jobs
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

    # One shared buffer per output; every tree adds its probabilities into it.
    all_proba = [
        np.zeros((X.shape[0], j), dtype=np.float64)
        for j in np.atleast_1d(self.n_classes_)
    ]

    lock = threading.Lock()
    Parallel(n_jobs=n_jobs, verbose=self.verbose,
             **_joblib_parallel_args(require="sharedmem"))(
        delayed(_accumulate_prediction)(e.predict_proba, X, all_proba, lock)
        for e in self.estimators_)

    for proba in all_proba:
        proba /= len(self.estimators_)

    if len(all_proba) == 1:
        return all_proba[0]
    else:
        return all_proba
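# Several snippets in this section (including the one above) accumulate
# per-tree predictions into shared-memory buffers through
# `_accumulate_prediction`. A minimal sketch consistent with the private
# helper in `sklearn.ensemble._forest`; note that one fork further below
# calls a same-named helper with a different argument list:

def _accumulate_prediction(predict, X, out, lock):
    """Add one estimator's prediction into the shared output buffer(s).

    The lock serializes the in-place additions so that threads sharing
    `out` do not race; `out` holds one array per output.
    """
    prediction = predict(X, check_input=False)
    with lock:
        if len(out) == 1:
            out[0] += prediction
        else:
            for i in range(len(out)):
                out[i] += prediction[i]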
def _predict(self, predict_fn, X):
    check_is_fitted(self, 'estimators_')

    # Check data
    X = self._validate_X_predict(X)

    # Assign chunk of trees to jobs
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

    # Avoid storing the output of every estimator by summing them here
    if predict_fn == "predict":
        y_hat = np.zeros((X.shape[0]), dtype=np.float64)
    else:
        y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64)

    def _get_fn(est, name):
        fn = getattr(est, name)
        if name in ("predict_cumulative_hazard_function",
                    "predict_survival_function"):
            fn = partial(fn, return_array=True)
        return fn

    # Parallel loop
    lock = threading.Lock()
    Parallel(n_jobs=n_jobs, verbose=self.verbose,
             **_joblib_parallel_args(require="sharedmem"))(
        delayed(_accumulate_prediction)(_get_fn(e, predict_fn), X,
                                        [y_hat], lock)
        for e in self.estimators_)

    y_hat /= len(self.estimators_)

    return y_hat
def predict(self, X): """Predict regression target for X. The predicted regression target of an input sample is computed as the mean predicted regression targets of the estimators in the ensemble. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Sparse matrices are accepted only if they are supported by the base estimator. Returns ------- y : ndarray of shape (n_samples,) The predicted values. """ check_is_fitted(self) # Check data X = check_array(X, accept_sparse=['csr', 'csc'], dtype=None, force_all_finite=False) # Parallel loop n_jobs, n_estimators, starts = _partition_estimators( self.n_estimators, self.n_jobs) all_y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose)( delayed(_parallel_predict_regression)( self.estimators_[starts[i]:starts[i + 1]], self.estimators_features_[starts[i]:starts[i + 1]], X) for i in range(n_jobs)) # Reduce y_hat = sum(all_y_hat) / self.n_estimators return y_hat
def predict_log_proba(self, X):
    """Predict class log-probabilities for X.

    The predicted class log-probabilities of an input sample are computed
    as the log of the mean predicted class probabilities of the base
    estimators in the ensemble.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples. Sparse matrices are accepted only if
        they are supported by the base estimator.

    Returns
    -------
    p : ndarray of shape (n_samples, n_classes)
        The class log-probabilities of the input samples. The order of the
        classes corresponds to that in the attribute :term:`classes_`.
    """
    check_is_fitted(self)

    if hasattr(self.base_estimator_, "predict_log_proba"):
        # Check data
        X = check_array(X, accept_sparse=["csr", "csc"], dtype=None,
                        force_all_finite=False)

        if self.n_features_ != X.shape[1]:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is {0} "
                             "and input n_features is {1} "
                             "".format(self.n_features_, X.shape[1]))

        # Partition the estimators
        n_jobs, n_estimators, starts = _partition_estimators(
            self.n_estimators, self.n_jobs)

        all_log_proba = [
            _parallel_predict_log_proba(
                self.estimators_[starts[i]:starts[i + 1]],
                self.estimators_features_[starts[i]:starts[i + 1]],
                X,
                self.n_classes_,
            )
            for i in range(n_jobs)
        ]

        # Reduce: mean of the probabilities, computed entirely in log space
        log_proba = all_log_proba[0]
        for j in range(1, len(all_log_proba)):
            log_proba = np.logaddexp(log_proba, all_log_proba[j])
        log_proba -= np.log(self.n_estimators)

        return log_proba

    else:
        return np.log(self.predict_proba(X))
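# `_parallel_predict_log_proba` sums the probabilities of one chunk of
# estimators in log space. A simplified sketch consistent with the private
# helper in `sklearn.ensemble._bagging`; the real version also handles
# estimators that were fitted on a subset of the classes:

import numpy as np

def _parallel_predict_log_proba(estimators, estimators_features, X, n_classes):
    """Log-sum the class probabilities of one chunk of estimators."""
    # Start from log(0) so that logaddexp acts as a running sum.
    log_proba = np.full((X.shape[0], n_classes), -np.inf)

    for estimator, features in zip(estimators, estimators_features):
        # Each estimator only sees the feature subset it was trained on.
        log_proba = np.logaddexp(
            log_proba, estimator.predict_log_proba(X[:, features]))

    return log_proba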
def predict_proba(self, X): """Predict class probabilities for X. The predicted class probabilities of an input sample is computed as the mean predicted class probabilities of the trees in the forest. The class probability of a single tree is the fraction of samples of the same class in a leaf. Parameters ---------- X : array-like or sparse matrix of shape = [n_samples, n_features] The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. Returns ------- p : array of shape = [n_samples, n_classes], or a list of n_outputs such arrays if n_outputs > 1. The class probabilities of the input samples. The order of the classes corresponds to that in the attribute `classes_`. """ # Check data if self.scaling: X = self._scale(X) X = self._validate_X_predict(X) # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) # Parallel loop all_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose, backend="threading")( delayed(_parallel_helper)(e, 'predict_proba', X, check_input=False) for e in self.estimators_) # Reduce proba = all_proba[0] if self.n_outputs_ == 1: for j in range(1, len(all_proba)): proba += self.estimator_weights[j] * all_proba[j] # proba /= len(self.estimators_) proba /= np.sum(self.estimator_weights[j]) else: for j in range(1, len(all_proba)): for k in range(self.n_outputs_): proba[k] += self.estimator_weights[j] * all_proba[j][k] for k in range(self.n_outputs_): proba[k] /= np.sum(self.estimator_weights[j]) return proba
def predict_proba(self, X): """Predict class probabilities for X. The predicted class probabilities of an input sample is computed as the mean predicted class probabilities of the base estimators in the ensemble. If base estimators do not implement a ``predict_proba`` method, then it resorts to voting and the predicted class probabilities of an input sample represents the proportion of estimators predicting each class. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Sparse matrices are accepted only if they are supported by the base estimator. Returns ------- p : ndarray of shape (n_samples, n_classes) The class probabilities of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. """ check_is_fitted(self) # Check data X = check_array(X, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False) if self.n_features_ != X.shape[1]: raise ValueError("Number of features of the model must " "match the input. Model n_features is {0} and " "input n_features is {1}." "".format(self.n_features_, X.shape[1])) # Partition the estimators n_jobs, n_estimators, starts = _partition_estimators( self.n_estimators, self.n_jobs) all_proba = [ _parallel_predict_proba( self.estimators_[starts[i]:starts[i + 1]], self.estimators_features_[starts[i]:starts[i + 1]], X, self.n_classes_, ) for i in range(n_jobs) ] # Reduce proba = sum(all_proba) / self.n_estimators return proba
def predict_log_proba(self, X):
    """Predict class log-probabilities for X.

    The predicted class log-probabilities of an input sample are computed
    as the log of the mean predicted class probabilities of the base
    estimators in the ensemble.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape = [n_samples, n_features]
        The training input samples. Sparse matrices are accepted only if
        they are supported by the base estimator.

    Returns
    -------
    p : array of shape = [n_samples, n_classes]
        The class log-probabilities of the input samples. The order of the
        classes corresponds to that in the attribute `classes_`.
    """
    check_is_fitted(self, "classes_")

    if hasattr(self.base_estimator_, "predict_log_proba"):
        # Check data
        X = check_array(X, accept_sparse=['csr', 'csc'])

        if self.n_features_ != X.shape[1]:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is {0} "
                             "and input n_features is {1} "
                             "".format(self.n_features_, X.shape[1]))

        # Parallel loop
        n_jobs, n_estimators, starts = _partition_estimators(
            self.n_estimators, self.n_jobs)

        all_log_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_parallel_predict_log_proba)(
                self.estimators_[starts[i]:starts[i + 1]],
                self.estimators_features_[starts[i]:starts[i + 1]],
                X,
                self.n_classes_)
            for i in range(n_jobs))

        # Reduce
        log_proba = all_log_proba[0]
        for j in range(1, len(all_log_proba)):  # pragma: no cover
            log_proba = np.logaddexp(log_proba, all_log_proba[j])
        log_proba -= np.log(self.n_estimators)

        return log_proba

    # Otherwise the base estimator has no predict_log_proba, so fall back
    # to the log of predict_proba.
    return np.log(self.predict_proba(X))
def predict_proba(self, X): """ Find probability estimates for each class for all cases in X. Parameters ---------- X : array-like or sparse matrix of shape = [n_instances, n_columns] The input samples. If a Pandas data frame is passed it must have a single column (i.e., univariate classification). RISE has no bespoke method for multivariate classification as yet. Local variables --------------- n_instances : int Number of cases to classify. n_columns : int Number of attributes in X, must match `series_length` determined in `fit`. Returns ------- output : array of shape = [n_instances, n_classes] The class probabilities of all cases. """ # Check data self.check_is_fitted() X = check_X(X, enforce_univariate=True, coerce_to_numpy=True) X = X.squeeze(1) n_instances, n_columns = X.shape if n_columns != self.series_length: raise TypeError( "ERROR number of attributes in the train does not match " "that in the test data." ) # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) # Parallel loop all_proba = Parallel(n_jobs=n_jobs)( delayed(_predict_proba_for_estimator)( X, self.estimators_[i], self.intervals[i], self.lags[i], ) for i in range(self.n_estimators) ) return np.sum(all_proba, axis=0) / self.n_estimators
def predict_proba(self, X): """Predict class probabilities for X. The predicted class probabilities of an input sample is computed as the mean predicted class probabilities of the base estimators in the ensemble. If base estimators do not implement a ``predict_proba`` method, then it resorts to voting and the predicted class probabilities of an input sample represents the proportion of estimators predicting each class. Parameters ---------- X : {array-like, sparse matrix} of shape = [n_samples, n_features] The training input samples. Sparse matrices are accepted only if they are supported by the base estimator. Returns ------- p : array of shape = [n_samples, n_classes] The class probabilities of the input samples. """ check_is_fitted(self) # Check data X = check_array( X, accept_sparse=['csr', 'csc'], dtype=None, force_all_finite=False ) if self.n_features_ != X.shape[1]: raise ValueError("Number of features of the model must " "match the input. Model n_features is {0} and " "input n_features is {1}." "".format(self.n_features_, X.shape[1])) # Parallel loop n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs) all_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose, **self._parallel_args())( delayed(_parallel_predict_proba)( self.estimators_[starts[i]:starts[i + 1]], self.estimators_features_[starts[i]:starts[i + 1]], X, self.n_classes_) for i in range(n_jobs)) # Reduce proba = sum(all_proba) / len(self.estimators_) return proba
def predict(self, X, eval_MSE=False):
    """Predict regression target for `X`.

    The predicted regression target of an input sample is computed as the
    mean predicted regression targets of the trees in the forest.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The input samples. Internally, its dtype will be converted to
        ``dtype=np.float32``. If a sparse matrix is provided, it will be
        converted into a sparse ``csr_matrix``.

    Returns
    -------
    y : ndarray of shape (n_samples,) or (n_samples, n_outputs)
        The predicted values.
    """
    check_is_fitted(self)

    # Check data
    X = self._check_X(X)
    X = self._validate_X_predict(X)

    # Assign chunk of trees to jobs
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

    # Store the output of every estimator, since all of them are required
    # to estimate the MSE
    if self.n_outputs_ > 1:
        y_hat_all = np.zeros(
            (X.shape[0], self.n_outputs_, self.n_estimators),
            dtype=np.float64)
    else:
        y_hat_all = np.zeros((X.shape[0], self.n_estimators),
                             dtype=np.float64)

    for i, e in enumerate(self.estimators_):
        y_hat_all[..., i] = e.predict(X, check_input=False)

    # TODO: this actually takes much longer than the sequential execution,
    # which might be caused by the overhead of spawning the threads.
    # Parallel loop
    # Parallel(n_jobs=n_jobs, verbose=self.verbose, backend="threading")(
    #     delayed(_save_prediction)(e.predict, X, i, y_hat_all)
    #     for i, e in enumerate(self.estimators_)
    # )

    # Average over the estimator axis (the last one)
    y_hat = np.mean(y_hat_all, axis=-1)

    if eval_MSE:
        # TODO: implement the jackknife estimate of variance
        _MSE_hat = np.std(y_hat_all, axis=-1, ddof=1) ** 2.0

    return (y_hat, _MSE_hat) if eval_MSE else y_hat
def decision_function(self, X):
    """Average of the decision functions of the base classifiers.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples. Sparse matrices are accepted only if
        they are supported by the base estimator.

    Returns
    -------
    score : ndarray of shape (n_samples, k)
        The decision function of the input samples. The columns correspond
        to the classes in sorted order, as they appear in the attribute
        ``classes_``. Regression and binary classification are special
        cases with ``k == 1``, otherwise ``k == n_classes``.
    """
    check_is_fitted(self)

    # Check data
    X = check_array(X, accept_sparse=["csr", "csc"], dtype=None,
                    force_all_finite=False)

    if self.n_features_ != X.shape[1]:
        raise ValueError("Number of features of the model must "
                         "match the input. Model n_features is {0} and "
                         "input n_features is {1} "
                         "".format(self.n_features_, X.shape[1]))

    # Partition the estimators
    n_jobs, n_estimators, starts = _partition_estimators(
        self.n_estimators, self.n_jobs)

    all_decisions = [
        _parallel_decision_function(
            self.estimators_[starts[i]:starts[i + 1]],
            self.estimators_features_[starts[i]:starts[i + 1]],
            X,
        )
        for i in range(n_jobs)
    ]

    # Reduce
    decisions = sum(all_decisions) / self.n_estimators

    return decisions
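# `_parallel_decision_function` sums the decision values of one chunk of
# estimators; the caller above divides by n_estimators to get the average.
# A sketch consistent with the private helper in `sklearn.ensemble._bagging`:

def _parallel_decision_function(estimators, estimators_features, X):
    """Sum the decision functions of one chunk of estimators."""
    return sum(
        estimator.decision_function(X[:, features])
        for estimator, features in zip(estimators, estimators_features)
    )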
def predict(self, X): """ Predict regression target for X. The predicted regression target of an input sample is computed as the mean predicted regression targets of the trees in the forest. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will be converted into a sparse ``csr_matrix``. Returns ------- y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The predicted values. """ check_is_fitted(self) # Check data X = self._validate_X_predict(X) # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) # avoid storing the output of every estimator by summing them here if self.n_outputs_ > 1: y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64) else: y_hat = np.zeros((X.shape[0]), dtype=np.float64) # Parallel loop lock = threading.Lock() # <<< sklearn # Parallel(n_jobs=n_jobs, verbose=self.verbose, # **_joblib_parallel_args(require="sharedmem"))( # delayed(_accumulate_prediction)(e.predict, X, [y_hat], lock) # for e in self.estimators_) # >>> monkey patch for e in self.estimators_: _accumulate_prediction(e.predict, X, [y_hat], lock) # --------------- y_hat /= len(self.estimators_) return y_hat
def predict_proba_trees(self, X):
    check_is_fitted(self)

    # Check data
    X = self._validate_X_predict(X)

    # TODO: we can also avoid data binning for predictions...
    X_binned = self._bin_data(X, is_training_data=False)

    n_samples, n_features = X.shape
    n_estimators = len(self.trees)
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

    # One (n_samples, n_classes) probability slab per tree
    probas = np.empty((n_estimators, n_samples, self.n_classes_))

    lock = threading.Lock()
    Parallel(
        n_jobs=n_jobs,
        verbose=self.verbose,
        **_joblib_parallel_args(require="sharedmem"),
    )(
        delayed(_get_tree_prediction)(e.predict_proba, X_binned, probas,
                                      lock, tree_idx)
        for tree_idx, e in enumerate(self.trees)
    )

    return probas
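# `_get_tree_prediction` is not shown in this section; its name and
# signature come from the call above, and the body below is an assumption:

def _get_tree_prediction(predict_fn, X, out, lock, tree_idx):
    """Store one tree's probabilities in its slice of the shared array."""
    prediction = predict_fn(X)  # shape (n_samples, n_classes)
    with lock:
        out[tree_idx] = prediction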
def predict(self, X): """ Predict regression target for X. The predicted regression target of an input sample is computed as the mean predicted regression targets of the trees in the forest. Parameters ---------- X : array-like or sparse matrix of shape (n_samples, n_features) The input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will be converted into a sparse ``csr_matrix``. Returns ------- y : array-like of shape (n_samples,) or (n_samples, n_outputs) The predicted values. """ check_is_fitted(self) # Check data X = self._validate_X_predict(X) # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) # Parallel loop # Store the output of every estimator in order to compute confidence intervals y_hat = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(require="sharedmem"))( delayed(_accumulate_prediction)( e.predict, X, self.minimum_value) for e in self.forest.estimators_) y_hat_below = np.percentile(y_hat, self.confidence_interval_lower, axis=0) y_hat_above = np.percentile(y_hat, self.confidence_interval_upper, axis=0) return np.dstack((y_hat_below, y_hat_above))
def predict(self, X: Union[Solution, List, np.ndarray],
            eval_MSE=False) -> np.ndarray:
    """Predict regression target for `X`.

    The predicted regression target of an input sample is computed as the
    mean predicted regression targets of the trees in the forest.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The input samples. Internally, its dtype will be converted to
        ``dtype=np.float32``. If a sparse matrix is provided, it will be
        converted into a sparse ``csr_matrix``.

    Returns
    -------
    y : ndarray of shape (n_samples,) or (n_samples, n_outputs)
        The predicted values.
    """
    check_is_fitted(self)

    # check data
    X = self._check_X(X)
    X = self._validate_X_predict(X)

    # assign chunk of trees to jobs
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

    # storing the output of every estimator since those are required to
    # estimate the MSE
    y_hat_all = (
        np.zeros((X.shape[0], self.n_outputs_, self.n_estimators),
                 dtype=np.float64)
        if self.n_outputs_ > 1
        else np.zeros((X.shape[0], self.n_estimators), dtype=np.float64)
    )

    # parallel loop
    Parallel(n_jobs=n_jobs, verbose=self.verbose, backend="threading")(
        delayed(_save_prediction)(e.predict, X, i, y_hat_all)
        for i, e in enumerate(self.estimators_))

    y_hat = np.mean(y_hat_all, axis=-1)

    if eval_MSE:
        # TODO: implement the jackknife estimate of variance
        MSE_hat = np.std(y_hat_all, axis=-1, ddof=1) ** 2.0

    return (y_hat, MSE_hat) if eval_MSE else y_hat
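# `_save_prediction` is not shown here. Its body is an assumption, chosen
# to match the sequential fallback used in the earlier variant of this
# method (`y_hat_all[..., i] = e.predict(X, check_input=False)`):

def _save_prediction(predict, X, index, out):
    """Write one estimator's prediction into its slice of the shared array."""
    out[..., index] = predict(X, check_input=False)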
def _forest_predict_var(forest, X_test, n_jobs):
    """Helper function to accumulate predictions and their variances.

    Parameters
    ----------
    forest : RandomForestRegressor
        Regressor object.

    X_test : ndarray, shape (n_test_samples, n_features)
        The design matrix for testing data.

    n_jobs : int or None, optional (default=None)
        The number of jobs to run in parallel. ``None`` means 1. ``-1``
        means use all processors.
    """
    check_is_fitted(forest)
    X_test = forest._validate_X_predict(X_test)

    n_jobs, _, _ = _partition_estimators(forest.n_estimators, n_jobs)

    y_hat = np.zeros((X_test.shape[0]), dtype=np.float64)
    y_var = np.zeros((X_test.shape[0]), dtype=np.float64)

    # Parallel loop
    lock = threading.Lock()
    Parallel(n_jobs=n_jobs, verbose=forest.verbose,
             **_joblib_parallel_args(require='sharedmem'))(
        delayed(_accumulate_predictions_and_var)(e.predict, X_test,
                                                 [y_hat, y_var], lock)
        for e in forest.estimators_)

    y_hat /= len(forest.estimators_)
    y_var /= len(forest.estimators_)
    # Var[y] = E[y**2] - E[y]**2
    y_var -= y_hat ** 2

    return [y_hat, y_var]
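# `_accumulate_predictions_and_var` is not shown here. The sketch below is
# an assumption, chosen so that the reduce step above
# (y_var / n_trees - y_hat ** 2) recovers Var[y] = E[y**2] - E[y]**2:

def _accumulate_predictions_and_var(predict, X, out, lock):
    """Accumulate one tree's prediction and squared prediction."""
    prediction = predict(X, check_input=False)
    with lock:
        out[0] += prediction
        out[1] += prediction ** 2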
def _predict_proba(self, X):
    """Find probability estimates for each class for all cases in X.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, n_columns]
        The input samples. If a Pandas data frame is passed it must have a
        single column (i.e., univariate classification). RISE has no
        bespoke method for multivariate classification as yet.

    Attributes
    ----------
    n_instances : int
        Number of cases to classify.
    n_columns : int
        Number of attributes in X, must match `series_length` determined
        in `fit`.

    Returns
    -------
    output : array of shape = [n_instances, n_classes]
        The class probabilities of all cases.
    """
    X = X.squeeze(1)

    # Assign chunk of trees to jobs
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

    # Parallel loop
    all_proba = Parallel(n_jobs=n_jobs)(
        delayed(_predict_proba_for_estimator)(
            X,
            self.estimators_[i],
            self.intervals[i],
            self.lags[i],
        )
        for i in range(self.n_estimators))

    return np.sum(all_proba, axis=0) / self.n_estimators
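# `_predict_proba_for_estimator` is not defined in this section. The sketch
# below is an assumption based on how RISE is described (per-tree features
# computed from one random interval); the real sktime helper also adds
# power-spectrum features, so treat this as illustrative only:

import numpy as np

def _predict_proba_for_estimator(X, estimator, interval, lag):
    """Recompute one tree's interval features at predict time and query it."""
    segment = X[:, interval[0]:interval[1]]
    n_instances = segment.shape[0]

    # Autocorrelation features up to the tree's max lag.
    acf = np.empty((n_instances, lag))
    for j in range(1, lag + 1):
        a, b = segment[:, :-j], segment[:, j:]
        num = ((a - a.mean(axis=1, keepdims=True)) *
               (b - b.mean(axis=1, keepdims=True))).sum(axis=1)
        den = a.std(axis=1) * b.std(axis=1) * a.shape[1]
        den[den == 0] = 1.0  # guard against flat segments
        acf[:, j - 1] = num / den

    return estimator.predict_proba(acf)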
def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
    """Build a Bagging ensemble of estimators from the training set (X, y).

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape = [n_samples, n_features]
        The training input samples. Sparse matrices are accepted only if
        they are supported by the base estimator.

    y : array-like, shape = [n_samples]
        The target values (1 for positive, 0 for unlabeled).

    max_samples : int or float, optional (default=None)
        Argument to use instead of self.max_samples.

    max_depth : int, optional (default=None)
        Override value used when constructing base estimator. Only
        supported if the base estimator has a max_depth parameter.

    sample_weight : array-like, shape = [n_samples] or None
        Sample weights. If None, then samples are equally weighted.
        Note that this is supported only if the base estimator supports
        sample weighting.

    Returns
    -------
    self : object
        Returns self.
    """
    random_state = check_random_state(self.random_state)

    self.y = y

    # Convert data
    X, y = check_X_y(X, y, ['csr', 'csc'])
    if sample_weight is not None:
        sample_weight = check_array(sample_weight, ensure_2d=False)
        check_consistent_length(y, sample_weight)

    # Remap output
    n_samples, self.n_features_ = X.shape
    self._n_samples = n_samples
    y = self._validate_y(y)

    # Check parameters
    self._validate_estimator()

    if max_depth is not None:
        self.base_estimator_.max_depth = max_depth

    # Validate max_samples
    if max_samples is None:
        max_samples = self.max_samples
    elif not isinstance(max_samples, (numbers.Integral, np.integer)):
        max_samples = int(max_samples * sum(y < 1))

    if not (0 < max_samples <= sum(y < 1)):
        raise ValueError("max_samples must be positive"
                         " and no larger than the number of unlabeled points")

    # Store validated integer row sampling value
    self._max_samples = max_samples

    # Validate max_features
    if isinstance(self.max_features, (numbers.Integral, np.integer)):
        max_features = self.max_features
    else:  # float
        max_features = int(self.max_features * self.n_features_)

    if not (0 < max_features <= self.n_features_):
        raise ValueError("max_features must be in (0, n_features]")

    # Store validated integer feature sampling value
    self._max_features = max_features

    # Other checks
    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    if self.warm_start and self.oob_score:
        raise ValueError("Out of bag estimate only available"
                         " if warm_start=False")

    if hasattr(self, "oob_score_") and self.warm_start:
        del self.oob_score_

    if not self.warm_start or not hasattr(self, 'estimators_'):
        # Free allocated memory, if any
        self.estimators_ = []
        self.estimators_features_ = []

    n_more_estimators = self.n_estimators - len(self.estimators_)

    if n_more_estimators < 0:
        raise ValueError('n_estimators=%d must be larger or equal to '
                         'len(estimators_)=%d when warm_start==True'
                         % (self.n_estimators, len(self.estimators_)))
    elif n_more_estimators == 0:
        warn("Warm-start fitting without increasing n_estimators does not "
             "fit new trees.")
        return self

    # Parallel loop
    n_jobs, n_estimators, starts = _partition_estimators(n_more_estimators,
                                                         self.n_jobs)
    total_n_estimators = sum(n_estimators)

    # Advance random state to state after training
    # the first n_estimators
    if self.warm_start and len(self.estimators_) > 0:
        random_state.randint(MAX_INT, size=len(self.estimators_))

    seeds = random_state.randint(MAX_INT, size=n_more_estimators)
    self._seeds = seeds

    all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
        delayed(_parallel_build_estimators)(
            n_estimators[i],
            self,
            X,
            y,
            sample_weight,
            seeds[starts[i]:starts[i + 1]],
            total_n_estimators,
            verbose=self.verbose)
        for i in range(n_jobs))

    # Reduce
    self.estimators_ += list(itertools.chain.from_iterable(
        t[0] for t in all_results))
    self.estimators_features_ += list(itertools.chain.from_iterable(
        t[1] for t in all_results))

    if self.oob_score:
        self._set_oob_score(X, y)

    return self
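# `_parallel_build_estimators` is the worker that each job runs. The real
# helper in `sklearn.ensemble._bagging` also handles sample weights,
# sampling without replacement, and verbose logging; the simplified sketch
# below only shows the core idea and is an assumption, not the exact code:

import numpy as np
from sklearn.base import clone
from sklearn.utils import check_random_state

def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, total_n_estimators, verbose=0):
    """Fit one chunk of estimators on independently drawn row/feature sets."""
    estimators, estimators_features = [], []
    for i in range(n_estimators):
        rng = check_random_state(seeds[i])
        # Draw the feature subset and the bootstrap rows for this estimator.
        features = rng.choice(X.shape[1], ensemble._max_features,
                              replace=False)
        indices = rng.randint(0, X.shape[0], ensemble._max_samples)
        estimator = clone(ensemble.base_estimator_)
        estimator.fit(X[indices][:, features], y[indices])
        estimators.append(estimator)
        estimators_features.append(features)
    return estimators, estimators_features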
def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
    """Build a Bagging ensemble of estimators from the training set (X, y).

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples. Sparse matrices are accepted only if
        they are supported by the base estimator.

    y : array-like of shape (n_samples,)
        The target values (class labels in classification, real numbers
        in regression).

    max_samples : int or float, default=None
        Argument to use instead of self.max_samples.

    max_depth : int, default=None
        Override value used when constructing base estimator. Only
        supported if the base estimator has a max_depth parameter.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If None, then samples are equally weighted.
        Note that this is supported only if the base estimator supports
        sample weighting.

    Returns
    -------
    self : object
    """
    random_state = check_random_state(self.random_state)

    # Convert data (X is required to be 2d and indexable)
    X, y = self._validate_data(
        X,
        y,
        accept_sparse=["csr", "csc"],
        dtype=None,
        force_all_finite=False,
        multi_output=True,
    )

    if sample_weight is not None:  # pragma: no cover
        sample_weight = _check_sample_weight(sample_weight, X, dtype=None)

    # Remap output
    n_samples, self.n_features_ = X.shape
    self._n_samples = n_samples
    y = self._validate_y(y)

    # Check parameters
    self._validate_estimator()

    if max_depth is not None:  # pragma: no cover
        self.base_estimator_.max_depth = max_depth

    # Validate max_samples
    if max_samples is None:  # pragma: no cover
        max_samples = self.max_samples
    elif not isinstance(max_samples, numbers.Integral):  # pragma: no cover
        max_samples = int(max_samples * X.shape[0])

    if not (0 < max_samples <= X.shape[0]):  # pragma: no cover
        raise ValueError("max_samples must be in (0, n_samples]")

    # Store validated integer row sampling value
    self._max_samples = max_samples

    # Validate max_features
    if isinstance(self.max_features, numbers.Integral):
        max_features = self.max_features
    elif isinstance(self.max_features, float):  # pragma: no cover
        max_features = self.max_features * self.n_features_
    else:  # pragma: no cover
        raise ValueError("max_features must be int or float")

    if not (0 < max_features <= self.n_features_):  # pragma: no cover
        raise ValueError("max_features must be in (0, n_features]")

    max_features = max(1, int(max_features))

    # Store validated integer feature sampling value
    self._max_features = max_features

    # Other checks
    if not self.bootstrap and self.oob_score:  # pragma: no cover
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    if self.warm_start and self.oob_score:  # pragma: no cover
        raise ValueError("Out of bag estimate only available"
                         " if warm_start=False")

    if hasattr(self, "oob_score_") and self.warm_start:  # pragma: no cover
        del self.oob_score_

    if not self.warm_start or not hasattr(
            self, "estimators_"):  # pragma: no cover
        # Free allocated memory, if any
        self.estimators_ = []
        self.estimators_features_ = []

    n_more_estimators = self.n_estimators - len(self.estimators_)

    if n_more_estimators < 0:  # pragma: no cover
        raise ValueError("n_estimators=%d must be larger or equal to "
                         "len(estimators_)=%d when warm_start==True"
                         % (self.n_estimators, len(self.estimators_)))
    elif n_more_estimators == 0:  # pragma: no cover
        warn("Warm-start fitting without increasing n_estimators does not "
             "fit new trees.")
        return self

    # Partition the estimators
    n_jobs, n_estimators, starts = _partition_estimators(
        n_more_estimators, self.n_jobs)
    total_n_estimators = sum(n_estimators)

    # Advance random state to state after training
    # the first n_estimators
    if self.warm_start and len(self.estimators_) > 0:  # pragma: no cover
        random_state.randint(MAX_INT, size=len(self.estimators_))

    seeds = random_state.randint(MAX_INT, size=n_more_estimators)
    self._seeds = seeds

    all_results = [
        _parallel_build_estimators(
            n_estimators[i],
            self,
            X,
            y,
            sample_weight,
            seeds[starts[i]:starts[i + 1]],
            total_n_estimators,
            verbose=self.verbose,
        )
        for i in range(n_jobs)
    ]

    # Reduce
    self.estimators_ += list(
        itertools.chain.from_iterable(t[0] for t in all_results))
    self.estimators_features_ += list(
        itertools.chain.from_iterable(t[1] for t in all_results))

    if self.oob_score:
        self._set_oob_score(X, y)

    return self
def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
    """Build a Sequentially Bootstrapped Bagging ensemble of estimators
    from the training set (X, y).

    Parameters
    ----------
    X : (array-like, sparse matrix) of shape = [n_samples, n_features]
        The training input samples. Sparse matrices are accepted only if
        they are supported by the base estimator.

    y : (array-like), shape = [n_samples]
        The target values (class labels in classification, real numbers
        in regression).

    max_samples : (int or float), optional (default=None)
        Argument to use instead of self.max_samples.

    max_depth : (int), optional (default=None)
        Override value used when constructing base estimator. Only
        supported if the base estimator has a max_depth parameter.

    sample_weight : (array-like), shape = [n_samples] or None
        Sample weights. If None, then samples are equally weighted.
        Note that this is supported only if the base estimator supports
        sample weighting.

    Returns
    -------
    self : (object)
    """
    random_state = check_random_state(self.random_state)
    self.X_time_index = X.index  # Remember X index for future sampling

    # Generate subsample ind_matrix (we need this during subsampling
    # cross_validation)
    subsampled_ind_mat = self.ind_mat[:, self.timestamp_int_index_mapping
                                      .loc[self.X_time_index]]

    # Convert data (X is required to be 2d and indexable)
    X, y = check_X_y(X, y, ['csr', 'csc'], dtype=None,
                     force_all_finite=False, multi_output=True)
    if sample_weight is not None:
        sample_weight = check_array(sample_weight, ensure_2d=False)
        check_consistent_length(y, sample_weight)

    # Remap output
    n_samples, self.n_features_ = X.shape
    self._n_samples = n_samples
    y = self._validate_y(y)

    # Check parameters
    self._validate_estimator()

    # Validate max_samples
    if not isinstance(max_samples, (numbers.Integral, np.integer)):
        max_samples = int(max_samples * X.shape[0])

    if not (0 < max_samples <= X.shape[0]):
        raise ValueError("max_samples must be in (0, n_samples]")

    # Store validated integer row sampling value
    self._max_samples = max_samples

    # Validate max_features
    if isinstance(self.max_features, (numbers.Integral, np.integer)):
        max_features = self.max_features
    elif isinstance(self.max_features, float):
        max_features = self.max_features * self.n_features_
    else:
        raise ValueError("max_features must be int or float")

    if not (0 < max_features <= self.n_features_):
        raise ValueError("max_features must be in (0, n_features]")

    max_features = max(1, int(max_features))

    # Store validated integer feature sampling value
    self._max_features = max_features

    if self.warm_start and self.oob_score:
        raise ValueError("Out of bag estimate only available"
                         " if warm_start=False")

    if not self.warm_start or not hasattr(self, 'estimators_'):
        # Free allocated memory, if any
        self.estimators_ = []
        self.estimators_features_ = []
        self.sequentially_bootstrapped_samples_ = []

    n_more_estimators = self.n_estimators - len(self.estimators_)

    if n_more_estimators < 0:
        raise ValueError('n_estimators=%d must be larger or equal to '
                         'len(estimators_)=%d when warm_start==True'
                         % (self.n_estimators, len(self.estimators_)))
    elif n_more_estimators == 0:
        warn("Warm-start fitting without increasing n_estimators does not "
             "fit new trees.")
        return self

    # Parallel loop
    n_jobs, n_estimators, starts = _partition_estimators(
        n_more_estimators, self.n_jobs)
    total_n_estimators = sum(n_estimators)

    # Advance random state to state after training
    # the first n_estimators
    if self.warm_start and len(self.estimators_) > 0:
        random_state.randint(MAX_INT, size=len(self.estimators_))

    seeds = random_state.randint(MAX_INT, size=n_more_estimators)
    self._seeds = seeds

    # pylint: disable=C0330
    all_results = Parallel(
        n_jobs=n_jobs,
        verbose=self.verbose,
    )(
        delayed(_parallel_build_estimators)(n_estimators[i],
                                            self,
                                            X,
                                            y,
                                            subsampled_ind_mat,
                                            sample_weight,
                                            seeds[starts[i]:starts[i + 1]],
                                            total_n_estimators,
                                            verbose=self.verbose)
        for i in range(n_jobs))

    # Reduce
    self.estimators_ += list(
        itertools.chain.from_iterable(t[0] for t in all_results))
    self.estimators_features_ += list(
        itertools.chain.from_iterable(t[1] for t in all_results))
    self.sequentially_bootstrapped_samples_ += list(
        itertools.chain.from_iterable(t[2] for t in all_results))

    if self.oob_score:
        self._set_oob_score(X, y)

    return self
def fit(self, X, y, sample_weight=None):
    random_state = check_random_state(self.random_state)

    self._max_samples = int(self.max_samples * X.shape[0])

    # Convert data (X is required to be 2d and indexable)
    X, y = check_X_y(X, y, ['csr', 'csc'], dtype=None,
                     force_all_finite=False, multi_output=True)
    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X, dtype=None)

    # Remap output
    n_samples, self.n_features_ = X.shape
    self._n_samples = n_samples
    y = self._validate_y(y)

    # Check parameters
    self._validate_estimator()

    # Validate max_features
    if isinstance(self.max_features, numbers.Integral):
        max_features = self.max_features
    elif isinstance(self.max_features, float):
        max_features = self.max_features * self.n_features_
    else:
        raise ValueError("max_features must be int or float")

    if not (0 < max_features <= self.n_features_):
        raise ValueError("max_features must be in (0, n_features]")

    max_features = max(1, int(max_features))

    # Store validated integer feature sampling value
    self._max_features = max_features

    # Other checks
    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    if self.warm_start and self.oob_score:
        raise ValueError("Out of bag estimate only available"
                         " if warm_start=False")

    if hasattr(self, "oob_score_") and self.warm_start:
        del self.oob_score_

    if not self.warm_start or not hasattr(self, 'estimators_'):
        # Free allocated memory, if any
        self.estimators_ = []
        self.estimators_features_ = []

    n_more_estimators = self.n_estimators - len(self.estimators_)

    if n_more_estimators < 0:
        raise ValueError('n_estimators=%d must be larger or equal to '
                         'len(estimators_)=%d when warm_start==True'
                         % (self.n_estimators, len(self.estimators_)))
    elif n_more_estimators == 0:
        warn("Warm-start fitting without increasing n_estimators does not "
             "fit new trees.")
        return self

    # Parallel loop
    n_jobs, n_estimators, starts = _partition_estimators(
        n_more_estimators, self.n_jobs)
    total_n_estimators = sum(n_estimators)

    # Advance random state to state after training
    # the first n_estimators
    if self.warm_start and len(self.estimators_) > 0:
        random_state.randint(MAX_INT, size=len(self.estimators_))

    seeds = random_state.randint(MAX_INT, size=n_more_estimators)
    self._seeds = seeds

    all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose,
                           **self._parallel_args())(
        delayed(_parallel_build_estimators)(n_estimators[i],
                                            self,
                                            X,
                                            y,
                                            sample_weight,
                                            seeds[starts[i]:starts[i + 1]],
                                            total_n_estimators,
                                            verbose=self.verbose)
        for i in range(n_jobs))

    # Reduce
    self.estimators_ += list(
        itertools.chain.from_iterable(t[0] for t in all_results))
    self.estimators_features_ += list(
        itertools.chain.from_iterable(t[1] for t in all_results))

    if self.oob_score:
        self._set_oob_score(X, y)

    return self
def predict_proba(self, X): """ Predict class probabilities for X. The predicted class probabilities of an input sample are computed as the mean predicted class probabilities of the trees in the forest. The class probability of a single tree is the fraction of samples of the same class in a leaf. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will be converted into a sparse ``csr_matrix``. Returns ------- p : ndarray of shape (n_samples, n_classes), or a list of n_outputs such arrays if n_outputs > 1. The class probabilities of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. """ # TODO: c'est un copier / coller de scikit-learn. Simplifier au cas # classification binaire. Et il faut binner les features avant de predire check_is_fitted(self) # Check data X = self._validate_X_predict(X, check_input=True) # TODO: we can also avoid data binning for predictions... # Bin the data X_binned = self._bin_data(X, is_training_data=False) # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) # TODO: on ne gere pas encore le cas multi-output mais juste un label binaire # avoid storing the output of every estimator by summing them here # all_proba = [ # np.zeros((X.shape[0], j), dtype=np.float64) # for j in np.atleast_1d(self.n_classes_) # ] all_proba = np.zeros((X_binned.shape[0], self.n_classes_)) lock = threading.Lock() Parallel( n_jobs=n_jobs, verbose=self.verbose, **_joblib_parallel_args(require="sharedmem"), )(delayed(_accumulate_prediction)(e.predict_proba, X_binned, all_proba, lock) for e in self.trees) # for proba in all_proba: # proba /= len(self.trees) all_proba /= len(self.trees) # if len(all_proba) == 1: # return all_proba[0] # else: # return all_proba return all_proba
def _fit(
    self,
    X,
    y,
    *,
    sample_weight=None,
    sampler_kwargs: dict = {},
    max_samples=None,
    eval_datasets: dict = None,
    eval_metrics: dict = None,
    train_verbose: bool or int or dict,
):
    """Build a Bagging ensemble of estimators from the training set (X, y).

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples. Sparse matrices are accepted only if
        they are supported by the base estimator.

    y : array-like of shape (n_samples,)
        The target values (class labels in classification, real numbers
        in regression).

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If None, then samples are equally weighted.
        Note that this is supported only if the base estimator supports
        sample weighting.

    sampler_kwargs : dict, default={}
        The kwargs to use as additional parameters when instantiating a
        new sampler. If none are given, default parameters are used.

    max_samples : int or float, default=None
        Argument to use instead of self.max_samples.

    %(eval_datasets)s

    %(eval_metrics)s

    %(train_verbose)s

    Returns
    -------
    self : object
    """
    # Check data, sampler_kwargs and random_state
    check_target_type(y)

    self.sampler_kwargs_ = check_type(sampler_kwargs, 'sampler_kwargs', dict)

    random_state = check_random_state(self.random_state)

    # Convert data (X is required to be 2d and indexable)
    check_x_y_args = {
        'accept_sparse': ['csr', 'csc'],
        'dtype': None,
        'force_all_finite': False,
        'multi_output': True,
    }
    X, y = self._validate_data(X, y, **check_x_y_args)

    # Check evaluation data
    self.eval_datasets_ = check_eval_datasets(eval_datasets, X, y,
                                              **check_x_y_args)

    # Check evaluation metrics
    self.eval_metrics_ = check_eval_metrics(eval_metrics)

    # Check verbose
    self.train_verbose_ = check_train_verbose(train_verbose,
                                              self.n_estimators,
                                              **self._properties)
    self._init_training_log_format()

    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X, dtype=None)

    # Remap output
    n_samples, self.n_features_in_ = X.shape
    self._n_samples = n_samples
    y = self._validate_y(y)

    # Check parameters
    self._validate_estimator()

    # Validate max_samples
    if max_samples is None:
        max_samples = self.max_samples
    if not isinstance(max_samples, numbers.Integral):
        max_samples = int(max_samples * X.shape[0])

    if not (0 < max_samples <= X.shape[0]):
        raise ValueError("max_samples must be in (0, n_samples]")

    # Store validated integer row sampling value
    self._max_samples = max_samples

    # Validate max_features
    if isinstance(self.max_features, numbers.Integral):
        max_features = self.max_features
    elif isinstance(self.max_features, float):
        max_features = self.max_features * self.n_features_in_
    else:
        raise ValueError("max_features must be int or float")

    if not (0 < max_features <= self.n_features_in_):
        raise ValueError("max_features must be in (0, n_features]")

    max_features = max(1, int(max_features))

    # Store validated integer feature sampling value
    self._max_features = max_features

    # Other checks
    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    if self.warm_start and self.oob_score:
        raise ValueError("Out of bag estimate only available"
                         " if warm_start=False")

    if hasattr(self, "oob_score_") and self.warm_start:
        del self.oob_score_

    if not self.warm_start or not hasattr(self, 'estimators_'):
        # Free allocated memory, if any
        self.estimators_ = []
        self.estimators_features_ = []
        self.estimators_n_training_samples_ = []

    n_more_estimators = self.n_estimators - len(self.estimators_)

    if n_more_estimators < 0:
        raise ValueError('n_estimators=%d must be larger or equal to '
                         'len(estimators_)=%d when warm_start==True'
                         % (self.n_estimators, len(self.estimators_)))
    elif n_more_estimators == 0:
        warn("Warm-start fitting without increasing n_estimators does not "
             "fit new trees.")
        return self

    # Parallel loop
    n_jobs, n_estimators, starts = _partition_estimators(
        n_more_estimators, self.n_jobs)
    total_n_estimators = sum(n_estimators)

    # Advance random state to state after training
    # the first n_estimators
    if self.warm_start and len(self.estimators_) > 0:
        random_state.randint(MAX_INT, size=len(self.estimators_))

    seeds = random_state.randint(MAX_INT, size=n_more_estimators)
    self._seeds = seeds

    all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose,
                           **self._parallel_args())(
        delayed(_parallel_build_estimators)(n_estimators[i],
                                            self,
                                            X,
                                            y,
                                            sample_weight,
                                            seeds[starts[i]:starts[i + 1]],
                                            total_n_estimators,
                                            verbose=self.verbose)
        for i in range(n_jobs))

    # Reduce
    self.estimators_ += list(
        itertools.chain.from_iterable(t[0] for t in all_results))
    self.estimators_features_ += list(
        itertools.chain.from_iterable(t[1] for t in all_results))
    self.estimators_n_training_samples_ += list(
        itertools.chain.from_iterable(t[2] for t in all_results))

    if self.oob_score:
        self._set_oob_score(X, y)

    # Print training information to console.
    self._training_log_to_console()

    return self