def calc_outofbag(n_samples, rf):
    """
    Recovers the out-of-bag (OOB) samples for each tree in scikit-learn
    RandomForest objects.

    See https://github.com/scikit-learn-contrib/forest-confidence-interval

    Parameters
    ----------
    n_samples : int
        The number of samples used to fit the scikit-learn RandomForest
        object.

    rf : RandomForest
        Regressor or Classifier object that is already fit by scikit-learn.

    Returns
    -------
    sample_idx : list
        For each tree, the indices of the samples that were *not* used to
        train that tree.
    """
    assert rf.bootstrap, "Forest was not trained with bootstrapping."

    n_trees = rf.n_estimators
    sample_idx = []
    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, rf.max_samples)

    for t_idx in range(n_trees):
        sample_idx.append(
            _generate_unsampled_indices(rf.estimators_[t_idx].random_state,
                                        n_samples, n_samples_bootstrap))
    return sample_idx
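# Usage sketch for calc_outofbag above (illustrative, not from the original
# source). It assumes scikit-learn >= 0.24, where the private helpers
# _get_n_samples_bootstrap and _generate_unsampled_indices live in
# sklearn.ensemble._forest; private APIs like these can move between releases.
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble._forest import (
    _generate_unsampled_indices,
    _get_n_samples_bootstrap,
)

X, y = make_regression(n_samples=100, n_features=5, random_state=0)
rf = RandomForestRegressor(n_estimators=10, random_state=0).fit(X, y)

oob_idx = calc_outofbag(X.shape[0], rf)
assert len(oob_idx) == rf.n_estimators  # one index array per tree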
def _set_oob_score(self, X, y):
    """Calculate out-of-bag predictions and score."""
    X = check_array(X, dtype=DTYPE)

    n_samples = X.shape[0]
    event, time = y

    predictions = np.zeros(n_samples)
    n_predictions = np.zeros(n_samples)

    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples,
                                                   self.max_samples)

    for estimator in self.estimators_:
        unsampled_indices = _generate_unsampled_indices(
            estimator.random_state, n_samples, n_samples_bootstrap)
        p_estimator = estimator.predict(X[unsampled_indices, :],
                                        check_input=False)

        predictions[unsampled_indices] += p_estimator
        n_predictions[unsampled_indices] += 1

    if (n_predictions == 0).any():
        warnings.warn("Some inputs do not have OOB scores. "
                      "This probably means too few trees were used "
                      "to compute any reliable oob estimates.")
        n_predictions[n_predictions == 0] = 1

    predictions /= n_predictions

    self.oob_prediction_ = predictions
    self.oob_score_ = concordance_index_censored(event, time, predictions)[0]
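# The OOB score above is Harrell's concordance index from scikit-survival.
# A minimal standalone sketch of that metric (assumes scikit-survival is
# installed; the arrays below are made-up toy data):
import numpy as np
from sksurv.metrics import concordance_index_censored

event = np.array([True, False, True, True])  # event observed vs. censored
time = np.array([5.0, 10.0, 2.0, 8.0])       # event or censoring times
risk = np.array([0.9, 0.2, 1.5, 0.4])        # higher risk -> earlier event
cindex = concordance_index_censored(event, time, risk)[0]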
def _set_oob_score(self, X, y):
    """Compute out-of-bag score."""
    X = check_array(X, dtype=DTYPE, accept_sparse="csr")

    n_classes_ = self.n_classes_
    n_samples = y.shape[0]

    oob_decision_function = []
    oob_score = 0.0
    predictions = [
        np.zeros((n_samples, n_classes_[k])) for k in range(self.n_outputs_)
    ]

    for sampler, estimator in zip(self.samplers_, self.estimators_):
        X_resample = X[sampler.sample_indices_]
        y_resample = y[sampler.sample_indices_]

        n_sample_subset = y_resample.shape[0]
        n_samples_bootstrap = _get_n_samples_bootstrap(
            n_sample_subset, self.max_samples)

        unsampled_indices = _generate_unsampled_indices(
            estimator.random_state, n_sample_subset, n_samples_bootstrap)
        p_estimator = estimator.predict_proba(
            X_resample[unsampled_indices, :], check_input=False)

        if self.n_outputs_ == 1:
            p_estimator = [p_estimator]

        for k in range(self.n_outputs_):
            indices = sampler.sample_indices_[unsampled_indices]
            predictions[k][indices, :] += p_estimator[k]

    for k in range(self.n_outputs_):
        if (predictions[k].sum(axis=1) == 0).any():
            warn("Some inputs do not have OOB scores. "
                 "This probably means too few trees were used "
                 "to compute any reliable oob estimates.")

        with np.errstate(invalid="ignore", divide="ignore"):
            # with the resampling, we are likely to have rows not included
            # for the OOB score leading to division by zero
            decision = (predictions[k] /
                        predictions[k].sum(axis=1)[:, np.newaxis])

        mask_scores = np.isnan(np.sum(decision, axis=1))
        oob_decision_function.append(decision)
        oob_score += np.mean(
            y[~mask_scores, k] == np.argmax(
                predictions[k][~mask_scores], axis=1),
            axis=0,
        )

    if self.n_outputs_ == 1:
        self.oob_decision_function_ = oob_decision_function[0]
    else:
        self.oob_decision_function_ = oob_decision_function

    self.oob_score_ = oob_score / self.n_outputs_
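# Index bookkeeping used above (standalone sketch on made-up data): the OOB
# indices are drawn in the *resampled* space, so they must be mapped back to
# original rows through sampler.sample_indices_ before accumulating votes.
import numpy as np

sample_indices_ = np.array([7, 2, 2, 5])    # rows drawn by a sampler
unsampled = np.array([0, 3])                # OOB positions, resampled space
original_rows = sample_indices_[unsampled]  # -> array([7, 5])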
def _set_oob_score(self, X, y):
    """Compute out-of-bag score."""
    X = check_array(X, dtype=DTYPE, accept_sparse='csr')

    if self.n_classes_[0] > 2:
        n_classes_ = list(np.asarray(self.n_classes_) - 1)  # CHANGED TO K-1
    else:
        n_classes_ = self.n_classes_
    n_samples = y.shape[0]

    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples,
                                                   self.max_samples)

    oob_decision_function = []
    oob_score = 0.0
    predictions = []

    for k in range(self.n_outputs_):
        predictions.append(np.zeros((n_samples, n_classes_[k])))

    for estimator in self.estimators_:
        unsampled_indices = _generate_unsampled_indices(
            estimator.random_state, n_samples, n_samples_bootstrap)
        p_estimator = estimator.predict_cum_proba(X[unsampled_indices, :],
                                                  check_input=False)

        if self.n_outputs_ == 1:
            p_estimator = [p_estimator]

        for k in range(self.n_outputs_):
            predictions[k][unsampled_indices, :] += p_estimator[k]

    for k in range(self.n_outputs_):
        if (predictions[k].sum(axis=1) == 0).any():
            warn("Some inputs do not have OOB scores. "
                 "This probably means too few trees were used "
                 "to compute any reliable oob estimates.")

        decision = (predictions[k] /
                    predictions[k].sum(axis=1)[:, np.newaxis])
        oob_decision_function.append(decision)
        if self.n_classes_[0] <= 2:
            oob_score += np.mean(y[:, k] == np.argmax(predictions[k], axis=1),
                                 axis=0)
        else:
            # np.int was removed from NumPy; the builtin int is equivalent.
            class_index = np.sum((predictions[k] > 0.5).astype(int), axis=1)
            oob_score += np.mean(y[:, k] == class_index, axis=0)

    if self.n_outputs_ == 1:
        self.oob_decision_function_ = oob_decision_function[0]
    else:
        self.oob_decision_function_ = oob_decision_function

    self.oob_score_ = oob_score / self.n_outputs_
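# Decoding idea behind the K-1 cumulative-probability branch above
# (standalone sketch on made-up probabilities): the predicted ordinal class
# is the number of cumulative probabilities that exceed 0.5.
import numpy as np

cum_proba = np.array([[0.9, 0.7, 0.2],   # -> class 2
                      [0.4, 0.1, 0.0]])  # -> class 0
class_index = np.sum(cum_proba > 0.5, axis=1)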
def _get_unsampled_indices(tree, n_samples):
    """
    An interface to get unsampled indices regardless of sklearn version.
    """
    if LooseVersion(sklearn.__version__) >= LooseVersion("0.24"):
        # Version 0.24 moved the forest module to sklearn.ensemble._forest.
        from sklearn.ensemble._forest import _get_n_samples_bootstrap
        n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
        return _generate_unsampled_indices(tree.random_state, n_samples,
                                           n_samples_bootstrap)
    elif LooseVersion(sklearn.__version__) >= LooseVersion("0.22"):
        # Version 0.22 or newer uses 3 arguments.
        from sklearn.ensemble.forest import _get_n_samples_bootstrap
        n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
        return _generate_unsampled_indices(tree.random_state, n_samples,
                                           n_samples_bootstrap)
    else:
        # Version 0.21 or older uses only two arguments.
        return _generate_unsampled_indices(tree.random_state, n_samples)
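# Sanity-check sketch for _get_unsampled_indices (illustrative; assumes
# scikit-learn >= 0.24 so the private helpers import from
# sklearn.ensemble._forest): for every tree, the OOB indices should be
# exactly the complement of that tree's bootstrap sample.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble._forest import (
    _generate_sample_indices,
    _get_n_samples_bootstrap,
)

X, y = make_classification(n_samples=50, random_state=0)
clf = RandomForestClassifier(n_estimators=5, random_state=0).fit(X, y)

n = X.shape[0]
n_boot = _get_n_samples_bootstrap(n, None)  # None -> full bootstrap size
for tree in clf.estimators_:
    inbag = set(_generate_sample_indices(tree.random_state, n, n_boot))
    oob = set(_get_unsampled_indices(tree, n))
    assert inbag.isdisjoint(oob) and inbag | oob == set(range(n))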
def partial_fit(self, X, y, classes=None):
    """
    Partially fits the forest to data X with labels y.

    Parameters
    ----------
    X : ndarray
        Input data matrix.

    y : ndarray
        Output (i.e. response data matrix).

    classes : ndarray, default=None
        List of all the classes that can possibly appear in the y vector.
        Must be provided at the first call to partial_fit, can be omitted
        in subsequent calls.

    Returns
    -------
    self : CascadeStreamForest
        The object itself.
    """
    X, y = check_X_y(X, y)

    if self.bootstrap:
        n_samples_bootstrap = _get_n_samples_bootstrap(
            X.shape[0], self.max_samples)
    else:
        n_samples_bootstrap = X.shape[0]

    # Update existing stream decision trees
    trees = Parallel(n_jobs=self.n_jobs)(
        delayed(_partial_fit)(tree, X, y,
                              n_samples_bootstrap=n_samples_bootstrap,
                              classes=classes)
        for tree in self.forest_)
    self.forest_ = trees

    # Before the maximum number of trees
    if len(self.forest_) < self.n_estimators:
        # Add a new decision tree based on new data
        sdt = DecisionTreeClassifier(splitter=self.splitter,
                                     max_features=self.max_features)
        _partial_fit(sdt, X, y,
                     n_samples_bootstrap=n_samples_bootstrap,
                     classes=classes)
        self.forest_.append(sdt)

    return self
def _set_oob_score(self, X, y):
    """Compute out-of-bag score."""
    check_X_y(X, y)
    check_X(X, enforce_univariate=True)

    n_classes_ = self.n_classes_
    n_samples = y.shape[0]

    oob_decision_function = []
    oob_score = 0.0
    predictions = [
        np.zeros((n_samples, n_classes_[k])) for k in range(self.n_outputs_)
    ]

    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples,
                                                   self.max_samples)

    for estimator in self.estimators_:
        # Each estimator is a pipeline; the tree at the final step holds
        # the random state that seeded the bootstrap sample.
        final_estimator = estimator.steps[-1][1]
        unsampled_indices = _generate_unsampled_indices(
            final_estimator.random_state, n_samples, n_samples_bootstrap)
        p_estimator = estimator.predict_proba(X.iloc[unsampled_indices, :])

        if self.n_outputs_ == 1:
            p_estimator = [p_estimator]

        for k in range(self.n_outputs_):
            predictions[k][unsampled_indices, :] += p_estimator[k]

    for k in range(self.n_outputs_):
        if (predictions[k].sum(axis=1) == 0).any():
            warn("Some inputs do not have OOB scores. "
                 "This probably means too few trees were used "
                 "to compute any reliable oob estimates.")

        decision = predictions[k] / predictions[k].sum(axis=1)[:, np.newaxis]
        oob_decision_function.append(decision)
        oob_score += np.mean(y[:, k] == np.argmax(predictions[k], axis=1),
                             axis=0)

    if self.n_outputs_ == 1:
        self.oob_decision_function_ = oob_decision_function[0]
    else:
        self.oob_decision_function_ = oob_decision_function

    self.oob_score_ = oob_score / self.n_outputs_
def calc_inbag(n_samples, forest):
    """
    Derive the samples used to create trees in scikit-learn RandomForest
    objects.

    Recovers the samples in each tree from the random state of that tree
    using :func:`forest._generate_sample_indices`.

    Parameters
    ----------
    n_samples : int
        The number of samples used to fit the scikit-learn RandomForest
        object.

    forest : RandomForest
        Regressor or Classifier object that is already fit by scikit-learn.

    Returns
    -------
    inbag : ndarray
        Array that records how many times each data point was placed in
        each tree. Rows are samples, columns are individual trees, and each
        entry counts how often that sample was drawn for that tree.
    """
    if not forest.bootstrap:
        e_s = "Cannot calculate the inbag from a forest that has bootstrap=False"
        raise ValueError(e_s)

    n_trees = forest.n_estimators
    inbag = np.zeros((n_samples, n_trees))
    sample_idx = []

    if isinstance(forest, BaseForest):
        n_samples_bootstrap = _get_n_samples_bootstrap(n_samples,
                                                       forest.max_samples)
        for t_idx in range(n_trees):
            sample_idx.append(
                _generate_sample_indices(
                    forest.estimators_[t_idx].random_state,
                    n_samples,
                    n_samples_bootstrap,
                ))
            inbag[:, t_idx] = np.bincount(sample_idx[-1],
                                          minlength=n_samples)
    elif isinstance(forest, BaseBagging):
        for t_idx, estimator_sample in enumerate(forest.estimators_samples_):
            sample_idx.append(estimator_sample)
            inbag[:, t_idx] = np.bincount(sample_idx[-1],
                                          minlength=n_samples)

    return inbag
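# Quick usage sketch for calc_inbag (illustrative; assumes a scikit-learn
# BaseForest fit with bootstrap=True and max_samples=None, so every tree
# draws exactly n_samples indices with replacement).
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=80, random_state=0)
forest = RandomForestClassifier(n_estimators=20, random_state=0).fit(X, y)

inbag = calc_inbag(X.shape[0], forest)
# Each column sums to the bootstrap sample size (here n_samples = 80).
assert (inbag.sum(axis=0) == X.shape[0]).all()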
def _set_oob_score(self, X, y):
    """Compute out-of-bag scores."""
    X, y = check_X_y(X, y, enforce_univariate=True)

    n_samples = y.shape[0]

    predictions = np.zeros((n_samples, self.n_outputs_))
    n_predictions = np.zeros((n_samples, self.n_outputs_))

    n_samples_bootstrap = _get_n_samples_bootstrap(
        n_samples, self.max_samples
    )

    for estimator in self.estimators_:
        final_estimator = estimator.steps[-1][1]
        unsampled_indices = _generate_unsampled_indices(
            final_estimator.random_state, n_samples, n_samples_bootstrap)
        p_estimator = estimator.predict(
            X[unsampled_indices, :], check_input=False)

        if self.n_outputs_ == 1:
            p_estimator = p_estimator[:, np.newaxis]

        predictions[unsampled_indices, :] += p_estimator
        n_predictions[unsampled_indices, :] += 1

    if (n_predictions == 0).any():
        warn("Some inputs do not have OOB scores. "
             "This probably means too few trees were used "
             "to compute any reliable oob estimates.")
        n_predictions[n_predictions == 0] = 1

    predictions /= n_predictions
    self.oob_prediction_ = predictions

    if self.n_outputs_ == 1:
        self.oob_prediction_ = \
            self.oob_prediction_.reshape((n_samples, ))

    self.oob_score_ = 0.0
    for k in range(self.n_outputs_):
        self.oob_score_ += r2_score(y[:, k], predictions[:, k])

    self.oob_score_ /= self.n_outputs_
def fit(self, X, y, sample_weight=None):
    """Build a forest of trees from the training set (X, y).

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples. Internally, its dtype will be converted
        to ``dtype=np.float32``. If a sparse matrix is provided, it will be
        converted into a sparse ``csc_matrix``.

    y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        The target values (class labels in classification, real numbers in
        regression).

    sample_weight : array-like of shape (n_samples,)
        Sample weights. If None, then samples are equally weighted. Splits
        that would create child nodes with net zero or negative weight are
        ignored while searching for a split in each node. In the case of
        classification, splits are also ignored if they would result in any
        single class carrying a negative weight in either child node.

    Returns
    -------
    self : object
        The fitted instance.
    """
    # Validate or convert input data
    X = check_array(X, accept_sparse="csc", dtype=DTYPE)
    y = check_array(y, accept_sparse="csc", ensure_2d=False, dtype=None)

    if sample_weight is not None:
        sample_weight = check_array(sample_weight, ensure_2d=False)

    if issparse(X):
        # Pre-sort indices to avoid that each individual tree of the
        # ensemble sorts the indices.
        X.sort_indices()

    # Remap output
    _, self.n_features_ = X.shape

    y = np.atleast_1d(y)
    if y.ndim == 2 and y.shape[1] == 1:
        warn(
            "A column-vector y was passed when a 1d array was"
            " expected. Please change the shape of y to "
            "(n_samples,), for example using ravel().",
            DataConversionWarning,
            stacklevel=2,
        )

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity against vs
        # [:, np.newaxis] that does not.
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]

    y, expanded_class_weight = self._validate_y_class_weight(y)

    if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
        y = np.ascontiguousarray(y, dtype=DOUBLE)

    if expanded_class_weight is not None:
        if sample_weight is not None:
            sample_weight = sample_weight * expanded_class_weight
        else:
            sample_weight = expanded_class_weight

    # Get bootstrap sample size
    n_samples_bootstrap = _get_n_samples_bootstrap(
        n_samples=X.shape[0], max_samples=self.max_samples)

    # Check parameters
    self._validate_estimator()

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    random_state = check_random_state(self.random_state)

    if not self.warm_start or not hasattr(self, "estimators_"):
        # Free allocated memory, if any
        self.estimators_ = []
        self.samplers_ = []
        self.pipelines_ = []

    n_more_estimators = self.n_estimators - len(self.estimators_)

    if n_more_estimators < 0:
        raise ValueError("n_estimators=%d must be larger or equal to "
                         "len(estimators_)=%d when warm_start==True" %
                         (self.n_estimators, len(self.estimators_)))
    elif n_more_estimators == 0:
        warn("Warm-start fitting without increasing n_estimators does not "
             "fit new trees.")
    else:
        if self.warm_start and len(self.estimators_) > 0:
            # We draw from the random state to get the random state we
            # would have got if we hadn't used a warm_start.
            random_state.randint(MAX_INT, size=len(self.estimators_))

        trees = []
        samplers = []
        for _ in range(n_more_estimators):
            tree, sampler = self._make_sampler_estimator(
                random_state=random_state)
            trees.append(tree)
            samplers.append(sampler)

        # Parallel loop: we prefer the threading backend as the Cython code
        # for fitting the trees is internally releasing the Python GIL
        # making threading more efficient than multiprocessing in that
        # case. However, we respect any parallel_backend contexts set at a
        # higher level, since correctness does not rely on using threads.
        samplers_trees = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose, prefer="threads")(
                delayed(_local_parallel_build_trees)(
                    s,
                    t,
                    self,
                    X,
                    y,
                    sample_weight,
                    i,
                    len(trees),
                    verbose=self.verbose,
                    class_weight=self.class_weight,
                    n_samples_bootstrap=n_samples_bootstrap,
                ) for i, (s, t) in enumerate(zip(samplers, trees)))
        samplers, trees = zip(*samplers_trees)

        # Collect newly grown trees
        self.estimators_.extend(trees)
        self.samplers_.extend(samplers)

        # Create pipelines with the fitted samplers and trees
        self.pipelines_.extend([
            make_pipeline(deepcopy(s), deepcopy(t))
            for s, t in zip(samplers, trees)
        ])

    if self.oob_score:
        self._set_oob_score(X, y)

    # Decapsulate classes_ attributes
    if hasattr(self, "classes_") and self.n_outputs_ == 1:
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]

    return self
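# Behavior of _get_n_samples_bootstrap for the max_samples values accepted
# by these fit methods (sketch; private scikit-learn API, >= 0.22, so the
# import path and rounding behavior may differ across releases):
from sklearn.ensemble._forest import _get_n_samples_bootstrap

assert _get_n_samples_bootstrap(100, None) == 100  # default: full bootstrap
assert _get_n_samples_bootstrap(100, 0.5) == 50    # float: fraction of rows
assert _get_n_samples_bootstrap(100, 30) == 30     # int: absolute count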
def fit(self, X, y, sample_weight=None):
    """Build a forest of survival trees from the training set (X, y).

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features)
        Data matrix

    y : structured array, shape = (n_samples,)
        A structured array containing the binary event indicator
        as first field, and time of event or time of censoring as
        second field.

    sample_weight : array-like, shape = (n_samples,), optional
        Sample weights. If None, then samples are equally weighted.

    Returns
    -------
    self
    """
    X, event, time = check_arrays_survival(X, y)

    self.n_features_ = X.shape[1]
    time = time.astype(np.float64)
    self.event_times_ = np.unique(time[event])
    self.n_outputs_ = self.event_times_.shape[0]

    y_numeric = np.empty((X.shape[0], 2), dtype=np.float64)
    y_numeric[:, 0] = time
    y_numeric[:, 1] = event.astype(np.float64)

    # Get bootstrap sample size
    n_samples_bootstrap = _get_n_samples_bootstrap(
        n_samples=X.shape[0], max_samples=self.max_samples)

    # Check parameters
    self._validate_estimator()

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    random_state = check_random_state(self.random_state)

    if not self.warm_start or not hasattr(self, "estimators_"):
        # Free allocated memory, if any
        self.estimators_ = []

    n_more_estimators = self.n_estimators - len(self.estimators_)

    if n_more_estimators < 0:
        raise ValueError("n_estimators=%d must be larger or equal to "
                         "len(estimators_)=%d when warm_start==True" %
                         (self.n_estimators, len(self.estimators_)))
    elif n_more_estimators == 0:
        warnings.warn("Warm-start fitting without increasing n_estimators "
                      "does not fit new trees.")
    else:
        if self.warm_start and len(self.estimators_) > 0:
            # We draw from the random state to get the random state we
            # would have got if we hadn't used a warm_start.
            random_state.randint(MAX_INT, size=len(self.estimators_))

        trees = [
            self._make_estimator(append=False, random_state=random_state)
            for i in range(n_more_estimators)
        ]

        # Parallel loop: we prefer the threading backend as the Cython code
        # for fitting the trees is internally releasing the Python GIL
        # making threading more efficient than multiprocessing in that
        # case. However, for joblib 0.12+ we respect any parallel_backend
        # contexts set at a higher level, since correctness does not rely
        # on using threads.
        trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                         **_joblib_parallel_args(prefer='threads'))(
            delayed(_parallel_build_trees)(
                t, self, X, (y_numeric, self.event_times_), sample_weight,
                i, len(trees),
                verbose=self.verbose,
                n_samples_bootstrap=n_samples_bootstrap)
            for i, t in enumerate(trees))

        # Collect newly grown trees
        self.estimators_.extend(trees)

    if self.oob_score:
        self._set_oob_score(X, (event, time))

    return self
def fit(self, X, y, sample_weight=None):
    # Validate or convert input data
    if issparse(y):
        raise ValueError(
            "sparse multilabel-indicator for y is not supported.")

    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X)

    if issparse(X):
        # Pre-sort indices to avoid that each individual tree of the
        # ensemble sorts the indices.
        X.sort_indices()

    # Remap output
    self.n_features_ = X.shape[1]

    y = np.atleast_1d(y)
    if y.ndim == 2 and y.shape[1] == 1:
        warn(
            "A column-vector y was passed when a 1d array was"
            " expected. Please change the shape of y to "
            "(n_samples,), for example using ravel().",
            DataConversionWarning, stacklevel=2)

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity against vs
        # [:, np.newaxis] that does not.
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]

    y, expanded_class_weight = self._validate_y_class_weight(y)

    if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
        y = np.ascontiguousarray(y, dtype=DOUBLE)

    if expanded_class_weight is not None:
        if sample_weight is not None:
            sample_weight = sample_weight * expanded_class_weight
        else:
            sample_weight = expanded_class_weight

    # Get bootstrap sample size
    n_samples_bootstrap = _get_n_samples_bootstrap(
        n_samples=X.shape[0], max_samples=self.max_samples)

    # Check parameters
    self._validate_estimator()

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    random_state = check_random_state(self.random_state)

    if not self.warm_start or not hasattr(self, "estimators_"):
        # Free allocated memory, if any
        self.estimators_ = []

    n_more_estimators = self.n_estimators - len(self.estimators_)

    if n_more_estimators < 0:
        raise ValueError('n_estimators=%d must be larger or equal to '
                         'len(estimators_)=%d when warm_start==True' %
                         (self.n_estimators, len(self.estimators_)))
    elif n_more_estimators == 0:
        warn("Warm-start fitting without increasing n_estimators does not "
             "fit new trees.")
    else:
        # CHANGE: use the custom tree class as the base estimator.
        self.base_estimator_ = MyDecisionTreeClassifier(
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            # The original passed min_samples_split here, which looks like
            # a copy-paste bug; the leaf parameter is the intended value.
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease,
            min_impurity_split=self.min_impurity_split,
            random_state=self.random_state)

        if self.warm_start and len(self.estimators_) > 0:
            # We draw from the random state to get the random state we
            # would have got if we hadn't used a warm_start.
            random_state.randint(MAX_INT, size=len(self.estimators_))

        trees = [
            self._make_estimator(append=False, random_state=random_state)
            for i in range(n_more_estimators)
        ]

        # Parallel loop: we prefer the threading backend as the Cython code
        # for fitting the trees is internally releasing the Python GIL
        # making threading more efficient than multiprocessing in that
        # case. However, for joblib 0.12+ we respect any parallel_backend
        # contexts set at a higher level, since correctness does not rely
        # on using threads.
        trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                         **_joblib_parallel_args(prefer='threads'))(
            delayed(_parallel_build_trees)(
                t, self, X, y, sample_weight, i, len(trees),
                verbose=self.verbose,
                class_weight=self.class_weight,
                n_samples_bootstrap=n_samples_bootstrap)
            for i, t in enumerate(trees))

        # Collect newly grown trees
        self.estimators_.extend(trees)

    if self.oob_score:
        self._set_oob_score(X, y)

    # Decapsulate classes_ attributes
    if hasattr(self, "classes_") and self.n_outputs_ == 1:
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]

    return self
def _compute_oob_predictions(self, X, y):
    """Compute and set the OOB score.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data matrix.
    y : ndarray of shape (n_samples, n_outputs)
        The target matrix.

    Returns
    -------
    oob_pred : ndarray of shape (n_samples, n_classes, n_outputs) or \
            (n_samples, 1, n_outputs)
        The OOB predictions.
    """
    # Prediction requires X to be in CSR format
    if issparse(X):
        X = X.tocsr()

    n_samples = y.shape[0]
    n_outputs = self.n_outputs_

    if is_classifier(self) and hasattr(self, "n_classes_"):
        # n_classes_ is a ndarray at this stage
        # all the supported type of target will have the same number of
        # classes in all outputs
        oob_pred_shape = (n_samples, self.n_classes_[0], n_outputs)
    else:
        # for regression, n_classes_ does not exist and we create an empty
        # axis to be consistent with the classification case and make
        # the array operations compatible with the 2 settings
        oob_pred_shape = (n_samples, 1, n_outputs)

    oob_pred = np.zeros(shape=oob_pred_shape, dtype=np.float64)
    n_oob_pred = np.zeros((n_samples, n_outputs), dtype=np.int64)

    for sampler, estimator in zip(self.samplers_, self.estimators_):
        X_resample = X[sampler.sample_indices_]
        y_resample = y[sampler.sample_indices_]

        n_sample_subset = y_resample.shape[0]
        n_samples_bootstrap = _get_n_samples_bootstrap(
            n_sample_subset, self.max_samples)

        unsampled_indices = _generate_unsampled_indices(
            estimator.random_state, n_sample_subset, n_samples_bootstrap)

        y_pred = self._get_oob_predictions(
            estimator, X_resample[unsampled_indices, :])

        indices = sampler.sample_indices_[unsampled_indices]
        oob_pred[indices, ...] += y_pred
        n_oob_pred[indices, :] += 1

    for k in range(n_outputs):
        if (n_oob_pred == 0).any():
            warn(
                "Some inputs do not have OOB scores. This probably means "
                "too few trees were used to compute any reliable OOB "
                "estimates.",
                UserWarning,
            )
            n_oob_pred[n_oob_pred == 0] = 1
        oob_pred[..., k] /= n_oob_pred[..., [k]]

    return oob_pred
def fit(self, X, y, sample_weight=None):
    """
    Build a forest of trees from the training set (X, y).

    Parameters
    ----------
    X : array-like or sparse matrix of shape (n_samples, n_features)
        The training input samples. Internally, its dtype will be converted
        to ``dtype=np.float32``. If a sparse matrix is provided, it will be
        converted into a sparse ``csc_matrix``.

    y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        The target values (class labels in classification, real numbers in
        regression).

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If None, then samples are equally weighted. Splits
        that would create child nodes with net zero or negative weight are
        ignored while searching for a split in each node. In the case of
        classification, splits are also ignored if they would result in any
        single class carrying a negative weight in either child node.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, enforce_univariate=True)

    # Validate or convert input data
    if sample_weight is not None:
        sample_weight = check_array(sample_weight, ensure_2d=False)

    if issparse(X):
        # Pre-sort indices to avoid that each individual tree of the
        # ensemble sorts the indices.
        X.sort_indices()

    # Remap output
    self.n_columns = X.shape[1]
    self.n_features_ = X.shape[1] if X.ndim == 2 else 1

    y = np.atleast_1d(y)
    if y.ndim == 2 and y.shape[1] == 1:
        warn(
            "A column-vector y was passed when a 1d array was"
            " expected. Please change the shape of y to "
            "(n_samples,), for example using ravel().",
            DataConversionWarning, stacklevel=2)

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity against vs
        # [:, np.newaxis] that does not.
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]

    y, expanded_class_weight = self._validate_y_class_weight(y)

    if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
        y = np.ascontiguousarray(y, dtype=DOUBLE)

    if expanded_class_weight is not None:
        if sample_weight is not None:
            sample_weight = sample_weight * expanded_class_weight
        else:
            sample_weight = expanded_class_weight

    # Get bootstrap sample size
    n_samples_bootstrap = _get_n_samples_bootstrap(
        n_samples=X.shape[0], max_samples=self.max_samples)

    # Check parameters
    self._validate_estimator()

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    random_state = check_random_state(self.random_state)

    if not self.warm_start or not hasattr(self, "estimators_"):
        # Free allocated memory, if any
        self.estimators_ = []

    n_more_estimators = self.n_estimators - len(self.estimators_)

    if n_more_estimators < 0:
        raise ValueError('n_estimators=%d must be larger or equal to '
                         'len(estimators_)=%d when warm_start==True' %
                         (self.n_estimators, len(self.estimators_)))
    elif n_more_estimators == 0:
        warn("Warm-start fitting without increasing n_estimators does not "
             "fit new trees.")
    else:
        if self.warm_start and len(self.estimators_) > 0:
            # We draw from the random state to get the random state we
            # would have got if we hadn't used a warm_start.
            random_state.randint(MAX_INT, size=len(self.estimators_))

        trees = [
            self._make_estimator(append=False, random_state=random_state)
            for i in range(n_more_estimators)
        ]

        # Parallel loop: for standard random forests, the threading backend
        # is preferred as the Cython code for fitting the trees is
        # internally releasing the Python GIL, making threading more
        # efficient than multiprocessing in that case. However, in this
        # case, for fitting pipelines in parallel, multiprocessing is more
        # efficient.
        trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(_parallel_build_trees)(
                t, self, X, y, sample_weight, i, len(trees),
                verbose=self.verbose,
                class_weight=self.class_weight,
                n_samples_bootstrap=n_samples_bootstrap)
            for i, t in enumerate(trees))

        # Collect newly grown trees
        self.estimators_.extend(trees)

    if self.oob_score:
        self._set_oob_score(X, y)

    # Decapsulate classes_ attributes
    if hasattr(self, "classes_") and self.n_outputs_ == 1:
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]

    self._is_fitted = True
    return self
def partial_fit(self, X, y, classes=None):
    """
    Partially fits the forest to data X with labels y.

    Parameters
    ----------
    X : ndarray
        Input data matrix.

    y : ndarray
        Output (i.e. response data matrix).

    classes : ndarray, default=None
        List of all the classes that can possibly appear in the y vector.
        Must be provided at the first call to partial_fit, can be omitted
        in subsequent calls.

    Returns
    -------
    self : StreamDecisionForest
        The object itself.
    """
    X, y = check_X_y(X, y)
    if classes is not None:
        self.classes_ = classes

    # Update stream decision trees with random inputs
    if self.bootstrap:
        n_samples_bootstrap = _get_n_samples_bootstrap(
            X.shape[0], self.max_samples)
    else:
        n_samples_bootstrap = X.shape[0]

    # Update existing stream decision trees
    trees = Parallel(n_jobs=self.n_jobs)(
        delayed(_partial_fit)(
            tree, X, y,
            n_samples_bootstrap=n_samples_bootstrap,
            classes=self.classes_,
        ) for tree in self.forest_)

    self.n_batches_ += 1

    # Calculate probability of swaps
    swap_prob = 1 / self.n_batches_
    if self.n_batches_ >= 2 and np.random.random() <= swap_prob:
        # Evaluate forest performance
        results = Parallel(n_jobs=self.n_jobs)(
            delayed(tree.predict)(X) for tree in trees)

        # Sort predictions by accuracy
        acc_l = []
        for idx, result in enumerate(results):
            acc_l.append([accuracy_score(result, y), idx])
        acc_l = sorted(acc_l, key=lambda x: x[0])

        # Generate new trees
        new_trees = Parallel(n_jobs=self.n_jobs)(
            delayed(_partial_fit)(
                DecisionTreeClassifier(max_features=self.max_features,
                                       splitter=self.splitter),
                X, y,
                n_samples_bootstrap=n_samples_bootstrap,
                classes=self.classes_,
            ) for i in range(self.n_swaps))

        # Swap worst performing trees with new trees
        for i in range(self.n_swaps):
            trees[acc_l[i][1]] = new_trees[i]

    self.forest_ = trees

    return self
def _get_oof_pred_proba(self, X, y, **kwargs):
    if self._daal:
        raise AssertionError(
            'DAAL forest backend does not support out-of-bag predictions.')
    if not self.model.bootstrap:
        raise ValueError(
            'Forest models must set `bootstrap=True` to compute '
            'out-of-fold predictions via out-of-bag predictions.')

    # TODO: This can also be done via setting `oob_score=True` in model
    #  params, but getting the correct `pred_time_val` that way is not
    #  easy, since we can't time the internal call.
    if (getattr(self.model, "oob_decision_function_", None) is None
            and getattr(self.model, "oob_prediction_", None) is None) \
            and callable(getattr(self.model, "_set_oob_score", None)):
        X = self.preprocess(X)
        if getattr(self.model, "n_classes_", None) is not None:
            if self.model.n_outputs_ == 1:
                self.model.n_classes_ = [self.model.n_classes_]

        from sklearn.tree._tree import DTYPE, DOUBLE
        X, y = self.model._validate_data(X, y, multi_output=True,
                                         accept_sparse="csc", dtype=DTYPE)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity against
            # vs [:, np.newaxis] that does not.
            y = np.reshape(y, (-1, 1))

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        self.model._set_oob_score(X, y)

        if getattr(self.model, "n_classes_", None) is not None:
            if self.model.n_outputs_ == 1:
                self.model.n_classes_ = self.model.n_classes_[0]

    if getattr(self.model, "oob_decision_function_", None) is not None:
        y_oof_pred_proba = self.model.oob_decision_function_
        self.model.oob_decision_function_ = None  # save memory
    elif getattr(self.model, "oob_prediction_", None) is not None:
        y_oof_pred_proba = self.model.oob_prediction_
        self.model.oob_prediction_ = None  # save memory
    else:
        raise AssertionError(
            f'Model class {type(self.model)} does not support '
            f'out-of-fold prediction generation.')

    # TODO: Regression does not return NaN for missing rows; instead it
    #  sets them to 0. This makes life hard.
    # The code below corrects the missing rows to NaN instead of 0.
    # Don't bother if >60 trees: it is then near impossible to have missing
    # rows. If using 68% of the data for training, the chance of a row being
    # missing is 1 in 11 billion.
    if self.problem_type == REGRESSION and self.model.n_estimators <= 60:
        from sklearn.ensemble._forest import (
            _get_n_samples_bootstrap,
            _generate_unsampled_indices,
        )
        n_samples = len(y)
        n_predictions = np.zeros(n_samples)
        n_samples_bootstrap = _get_n_samples_bootstrap(
            n_samples, self.model.max_samples)
        for estimator in self.model.estimators_:
            unsampled_indices = _generate_unsampled_indices(
                estimator.random_state, n_samples, n_samples_bootstrap)
            n_predictions[unsampled_indices] += 1
        missing_row_mask = n_predictions == 0
        y_oof_pred_proba[missing_row_mask] = np.nan

    # Fill missing prediction rows with the average of non-missing rows
    if np.isnan(np.sum(y_oof_pred_proba)):
        if len(y_oof_pred_proba.shape) == 1:
            col_mean = np.nanmean(y_oof_pred_proba)
            y_oof_pred_proba[np.isnan(y_oof_pred_proba)] = col_mean
        else:
            col_mean = np.nanmean(y_oof_pred_proba, axis=0)
            inds = np.where(np.isnan(y_oof_pred_proba))
            y_oof_pred_proba[inds] = np.take(col_mean, inds[1])

    return self._convert_proba_to_unified_form(y_oof_pred_proba)
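# Standalone sketch of the NaN back-fill at the end of the method above:
# missing OOB rows are replaced by the per-column mean of non-missing rows.
import numpy as np

proba = np.array([[0.2, 0.8],
                  [np.nan, np.nan],  # row with no OOB prediction
                  [0.6, 0.4]])
col_mean = np.nanmean(proba, axis=0)
inds = np.where(np.isnan(proba))
proba[inds] = np.take(col_mean, inds[1])  # fill each NaN with its column mean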
def get_oob_indices(tree, n_samples):
    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
    return _generate_unsampled_indices(
        tree.random_state, n_samples, n_samples_bootstrap)
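# Example (sketch) for get_oob_indices: count how often each row is OOB
# across a fitted forest. With full bootstrap samples, a row is OOB for
# roughly (1 - 1/n)^n ~ 36.8% of the trees.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=100, random_state=0)
clf = RandomForestClassifier(n_estimators=200, random_state=0).fit(X, y)

oob_counts = np.zeros(X.shape[0])
for tree in clf.estimators_:
    oob_counts[get_oob_indices(tree, X.shape[0])] += 1
print(oob_counts.mean() / clf.n_estimators)  # ~0.368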