def _mean_fn(self, X, fn, acc, slice=None):
    # Helper function that accumulates an arbitrary function in parallel on
    # the accumulator acc, calling the function fn on each tree e, and
    # returns the mean output. The function fn should take as input a tree e
    # and return another function g_e, which takes as input X, check_input.
    # If slice is not None, but rather a tuple (start, end), then only the
    # subset of trees from index start to index end is used. The returned
    # result is essentially: (mean over e in slice)(g_e(X)).
    check_is_fitted(self, 'estimators_')
    # Check data
    X = self._validate_X_predict(X)

    if slice is None:
        estimator_slice = self.estimators_
    else:
        estimator_slice = self.estimators_[slice[0]:slice[1]]

    # Assign chunk of trees to jobs
    n_jobs, _, _ = _partition_estimators(len(estimator_slice), self.n_jobs)
    lock = threading.Lock()
    Parallel(n_jobs=n_jobs, verbose=self.verbose,
             **_joblib_parallel_args(require="sharedmem"))(
        delayed(_accumulate_prediction)(fn(e), X, [acc], lock)
        for e in estimator_slice)
    acc /= len(estimator_slice)
    return acc
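
# A minimal sketch of the `_accumulate_prediction` helper that `_mean_fn`
# above (and several methods below) assume; it mirrors the scikit-learn
# implementation and is reproduced here only for illustration. It calls a
# tree's prediction function and adds the result into the shared output
# buffer(s) under a lock, which is why the callers request the "sharedmem"
# (threading) backend through _joblib_parallel_args.
def _accumulate_prediction(predict, X, out, lock):
    prediction = predict(X, check_input=False)
    with lock:
        if len(out) == 1:
            out[0] += prediction
        else:
            for i in range(len(out)):
                out[i] += prediction[i]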
def _predict(self, predict_fn, X):
    check_is_fitted(self, 'estimators_')
    # Check data
    X = self._validate_X_predict(X)

    # Assign chunk of trees to jobs
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

    # Avoid storing the output of every estimator by summing them here
    if predict_fn == "predict":
        y_hat = np.zeros((X.shape[0]), dtype=np.float64)
    else:
        y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64)

    def _get_fn(est, name):
        fn = getattr(est, name)
        if name in ("predict_cumulative_hazard_function",
                    "predict_survival_function"):
            fn = partial(fn, return_array=True)
        return fn

    # Parallel loop
    lock = threading.Lock()
    Parallel(n_jobs=n_jobs, verbose=self.verbose,
             **_joblib_parallel_args(require="sharedmem"))(
        delayed(_accumulate_prediction)(_get_fn(e, predict_fn), X, [y_hat], lock)
        for e in self.estimators_)

    y_hat /= len(self.estimators_)

    return y_hat
def apply(self, X):
    """
    Apply trees in the forest to X, return leaf indices.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The input samples. Internally, its dtype will be converted to
        ``dtype=np.float32``. If a sparse matrix is provided, it will be
        converted into a sparse ``csr_matrix``.

    Returns
    -------
    X_leaves : ndarray of shape (n_samples, n_estimators)
        For each datapoint x in X and for each tree in the forest,
        return the index of the leaf x ends up in.
    """
    X = self._validate_X_predict(X)
    results = Parallel(
        n_jobs=self.n_jobs,
        verbose=self.verbose,
        **_joblib_parallel_args(prefer="threads"),
    )(delayed(tree.apply)(X, check_input=False) for tree in self.estimators_)

    return np.array(results).T
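
# Hypothetical usage sketch for `apply` (the estimator and data below are
# placeholders, not part of the original code): each column of the returned
# matrix holds the leaf index a sample reaches in one tree, which can be used
# e.g. for leaf co-occurrence or leaf-embedding features.
def _example_apply_usage():
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=50, n_features=4, random_state=0)
    rf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)
    leaves = rf.apply(X)  # shape (n_samples, n_estimators)
    return leaves.shape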
def predict_proba(self, X):
    check_is_fitted(self)
    # Assign chunk of trees to jobs
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

    # Avoid storing the output of every estimator by summing them here
    all_proba = [
        np.zeros((X.shape[0], j), dtype=np.float64)
        for j in np.atleast_1d(self.n_classes_)
    ]
    lock = threading.Lock()
    Parallel(n_jobs=n_jobs, verbose=self.verbose,
             **_joblib_parallel_args(require="sharedmem"))(
        delayed(_accumulate_prediction)(
            self.features[i], self.thresholds[i], self.childrens[i],
            self.values[i], X, all_proba, lock)
        for i in range(self.n_estimators))

    for proba in all_proba:
        proba /= len(self.features)

    if len(all_proba) == 1:
        return all_proba[0]
    else:
        return all_proba
def predict_proba(self, X):
    check_is_fitted(self)
    X = self._validate_X_predict(X)

    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

    all_proba = [
        np.zeros((X.shape[0], j), dtype=np.float64)
        for j in np.atleast_1d(self.n_classes_)
    ]
    lock = threading.Lock()
    Parallel(n_jobs=n_jobs, verbose=self.verbose,
             **_joblib_parallel_args(require="sharedmem"))(
        delayed(_accumulate_prediction)(e.predict_proba, X, all_proba, lock)
        for e in self.estimators_)

    for proba in all_proba:
        proba /= len(self.estimators_)

    if len(all_proba) == 1:
        return all_proba[0]
    else:
        return all_proba
def feature_importances_(self):
    """The impurity-based feature importances.

    The higher, the more important the feature.
    The importance of a feature is computed as the (normalized) total
    reduction of the criterion brought by that feature. It is also known as
    the Gini importance.

    .. warning::
        Impurity-based feature importances can be misleading for
        high-cardinality features (many unique values). See
        https://scikit-learn.org/stable/modules/generated/sklearn.inspection.permutation_importance.html
        as an alternative.

    Returns
    -------
    feature_importances_ : ndarray of shape (n_features,)
        The values of this array sum to 1, unless all trees are single node
        trees consisting of only the root node, in which case it will be an
        array of zeros.
    """
    check_is_fitted(self)

    all_importances = Parallel(
        n_jobs=self.n_jobs, **_joblib_parallel_args(prefer="threads"))(
        delayed(getattr)(tree, "feature_importances_")
        for tree in self.detector_.estimators_
        if tree.tree_.node_count > 1)

    if not all_importances:
        return np.zeros(self.n_features_in_, dtype=np.float64)

    all_importances = np.mean(all_importances, axis=0, dtype=np.float64)
    return all_importances / np.sum(all_importances)
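
# Hedged sketch of the permutation-importance alternative referenced in the
# warning above (the estimator and data are placeholders, not from the
# original code): unlike impurity-based importances, it is computed on
# held-out data and is not biased toward high-cardinality features.
def _example_permutation_importance():
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.inspection import permutation_importance
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=200, n_features=6, random_state=0)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
    rf = RandomForestClassifier(n_estimators=50, random_state=0).fit(X_tr, y_tr)
    result = permutation_importance(rf, X_te, y_te, n_repeats=5, random_state=0)
    return result.importances_mean  # shape (n_features,)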
def fit(random_state):
    np.random.seed(random_state)
    random_states = np.random.randint(np.iinfo(np.intp).max, size=n_jobs)
    trees = Parallel(
        n_jobs=16,
        **_joblib_parallel_args(prefer="threads"),
    )(delayed(f)(out, n, random_state, idx)
      for idx, random_state in zip(range(n_jobs), random_states))
def fit(self, X, y, sample_weight=None):
    """
    Trains the binner and an estimator on every bucket.

    Parameters
    ----------
    X: features, *X* is converted into an array if *X* is a dataframe
    y: target
    sample_weight: sample weights

    Returns
    -------
    self: returns an instance of self.

    Attributes
    ----------
    binner_ : binner
    estimators_ : dictionary of estimators, each of them mapped to a leaf
        of the tree
    mean_estimator_ : estimator trained on the whole dataset in case the
        binner cannot find a bucket for a new observation
    dim_ : dimension of the output
    mean_ : average targets
    """
    self.estimators_ = []
    estimators = [clone(self.estimator) for i in range(self.n_estimators)]

    loop = (tqdm(range(len(estimators)))
            if self.verbose == 'tqdm' else range(len(estimators)))
    verbose = 1 if self.verbose == 'tqdm' else (1 if self.verbose else 0)

    def _fit_piecewise_estimator(i, est, X, y, sample_weight, alpha):
        new_size = int(X.shape[0] * alpha + 0.5)
        rnd = numpy.random.randint(0, X.shape[0] - 1, new_size)
        Xr = X[rnd]
        yr = y[rnd]
        sr = sample_weight[rnd] if sample_weight is not None else None
        return est.fit(Xr, yr, sr)

    self.estimators_ = \
        Parallel(n_jobs=self.n_jobs, verbose=verbose,
                 **_joblib_parallel_args(prefer='threads'))(
            delayed(_fit_piecewise_estimator)(
                i, estimators[i], X, y, sample_weight, self.alpha)
            for i in loop)

    return self
def predict(self, X, mask):
    """Predict regression target for X.

    The predicted regression target of an input sample is computed as the
    mean predicted regression targets of the trees in the forest.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        The input samples. Internally, its dtype will be converted to
        ``dtype=np.float32``. If a sparse matrix is provided, it will be
        converted into a sparse ``csr_matrix``.
    mask : array storing, for each tree, whether or not that tree is part
        of the forest.
    fileCache : path where the trees to be processed are stored; note that
        each collection and each fold has its own unique set of trees.

    Returns
    -------
    y : array of shape = [n_samples] or [n_samples, n_outputs]
        The predicted values.
    """
    mask = [i == '1' or i == 1 for i in mask]
    self.n_outputs_ = 1
    check_is_fitted(self, 'estimators_')
    # Check data
    X = self._validate_X_predict(X)

    # Assign chunk of trees to jobs
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

    # Avoid storing the output of every estimator by summing them here
    if self.n_outputs_ > 1:
        y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64)
    else:
        y_hat = np.zeros((X.shape[0]), dtype=np.float64)

    # Parallel loop
    lock = threading.Lock()
    Parallel(n_jobs=n_jobs, verbose=self.verbose,
             **_joblib_parallel_args(require="sharedmem"))(
        delayed(_accumulate_prediction_mod)(e.predict, X, gene, [y_hat], lock)
        for e, gene in zip(self.estimators_, mask))

    n_trees = 0
    for g in mask:
        if g:
            n_trees += 1

    y_hat /= n_trees

    return y_hat
def oob_predict(self, X, y, genes, parallel=True):
    """
    Compute out-of-bag prediction.
    """
    X = check_array(X, dtype=DTYPE, accept_sparse='csr')

    n_samples = y.shape[0]

    predictions = np.zeros((n_samples, self.n_outputs_))
    n_predictions = np.zeros((n_samples, self.n_outputs_))

    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, None)

    genes = [i == '1' or i == 1 for i in genes]

    if parallel:
        # Assign chunk of trees to jobs
        n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

        # Parallel loop
        lock = threading.Lock()
        Parallel(n_jobs=n_jobs, verbose=self.verbose,
                 **_joblib_parallel_args(require="sharedmem"))(
            delayed(_oob_accumulate_prediction)(
                e.predict, X, gene, [predictions, n_predictions], lock,
                n_samples, n_samples_bootstrap, self.n_outputs_,
                e.random_state)
            for e, gene in zip(self.estimators_, genes))
    else:
        for e, gene in zip(self.estimators_, genes):
            if gene:
                unsampled_indices = _generate_unsampled_indices(
                    e.random_state, n_samples, n_samples_bootstrap)
                p_estimator = e.predict(X[unsampled_indices, :],
                                        check_input=False)

                if self.n_outputs_ == 1:
                    p_estimator = p_estimator[:, np.newaxis]

                predictions[unsampled_indices, :] += p_estimator
                n_predictions[unsampled_indices, :] += 1

    if (n_predictions == 0).any():
        warn("Some inputs do not have OOB scores. "
             "This probably means too few trees were used "
             "to compute any reliable oob estimates.")
        n_predictions[n_predictions == 0] = 1

    predictions /= n_predictions

    return predictions
def from_df(cls, args: CommonArgs, df: pd.DataFrame):
    args.update_columns(df.keys().to_list())
    if args.group_reading:
        pure_columns = args.pure_columns or []
        mixture_columns = args.mixture_columns or []
        reaction_columns = args.reaction_columns or []
        n1 = len(pure_columns)
        n2 = len(mixture_columns)
        n3 = len(reaction_columns)
        groups = df.groupby(pure_columns + mixture_columns + reaction_columns)
        data = Parallel(
            n_jobs=args.n_jobs, verbose=True,
            **_joblib_parallel_args(prefer='processes'))(
            delayed(cls.get_subDataset)(
                (lambda x: [x] if x.__class__ == str else tolist(x))(g[0])[0:n1],
                (lambda x: tolist([x]) if x.__class__ == str else tolist(x))(g[0])[n1:n1 + n2],
                args.mixture_type,
                (lambda x: [x] if x.__class__ == str else tolist(x))(g[0])[n1 + n2:n1 + n2 + n3],
                args.reaction_type,
                to_numpy(g[1][args.target_columns]),
                to_numpy(g[1][args.feature_columns]),
                args.features_generator)
            for g in groups)
    else:
        data = Parallel(
            n_jobs=args.n_jobs, verbose=True,
            **_joblib_parallel_args(prefer='processes'))(
            delayed(cls.get_subDataset)(
                tolist(df.iloc[i].get(args.pure_columns)),
                tolist(df.iloc[i].get(args.mixture_columns)),
                args.mixture_type,
                tolist(df.iloc[i].get(args.reaction_columns)),
                args.reaction_type,
                to_numpy(df.iloc[i:i + 1][args.target_columns]),
                to_numpy(df.iloc[i:i + 1].get(args.feature_columns)),
                args.features_generator,
            ) for i in df.index)
    return cls(data)
def _decision_path(isolation_forest, X, n_jobs):
    # Code adapted from sklearn RandomForest.
    X = check_array(X, dtype=DTYPE, accept_sparse='csr')
    indicators = Parallel(n_jobs=n_jobs,
                          **_joblib_parallel_args(prefer='threads'))(
        delayed(parallel_helper)(tree, 'decision_path', X, check_input=False)
        for tree in isolation_forest.estimators_)

    n_nodes = [0]
    n_nodes.extend([i.shape[1] for i in indicators])
    n_nodes_ptr = np.array(n_nodes).cumsum()

    indicators = sparse_hstack(indicators).tocsr()
    return indicators, n_nodes_ptr
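
# Hypothetical usage sketch for `_decision_path` above (the arguments are
# placeholders; `parallel_helper`, `DTYPE` and `sparse_hstack` are assumed to
# be defined elsewhere in the module): the columns of `indicators` belonging
# to tree i are delimited by n_nodes_ptr[i]:n_nodes_ptr[i + 1], mirroring the
# layout of decision_path() on scikit-learn forests.
def _example_decision_path_usage(iso_forest, X):
    indicators, n_nodes_ptr = _decision_path(iso_forest, X, n_jobs=2)
    first_tree_nodes = indicators[:, n_nodes_ptr[0]:n_nodes_ptr[1]]
    return first_tree_nodes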
def fit(self, X, y):
    X, y = self._validate_data(X, y)
    self.X_y_correlations_ = np.array(
        Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                 **_joblib_parallel_args())(
            delayed(self.corr_method)(X[:, X_idx], y)
            for X_idx in range(X.shape[1])))
    self.create_masks(y)
    return self
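
# Hypothetical sketch (not from the original code) of a callable that could
# serve as `corr_method` in the fit above: it receives one feature column and
# the target, and returns a scalar, here a Pearson correlation coefficient.
def _example_corr_method(x_col, y):
    import numpy as np

    return np.corrcoef(x_col, y)[0, 1]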
def feature_importances_(self):
    check_is_fitted(self)

    all_importances = Parallel(
        n_jobs=-1, **_joblib_parallel_args(prefer='threads'))(
        delayed(getattr)(tree, 'feature_importances_')
        for tree in self.estimators_ if tree.tree_.node_count > 1)

    if not all_importances:
        return np.zeros(self.n_features_, dtype=np.float64)

    all_importances = np.mean(all_importances, axis=0, dtype=np.float64)
    return all_importances / np.sum(all_importances)
def predict(self, X):
    """
    Predict regression target for X.

    The predicted regression target of an input sample is computed as the
    mean predicted regression targets of the trees in the forest.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The input samples. Internally, its dtype will be converted to
        ``dtype=np.float32``. If a sparse matrix is provided, it will be
        converted into a sparse ``csr_matrix``.

    Returns
    -------
    y : ndarray of shape (n_samples,) or (n_samples, n_outputs)
        The predicted values.
    """
    check_is_fitted(self)

    # Assign chunk of trees to jobs
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

    # Avoid storing the output of every estimator by summing them here
    y_hat = np.zeros((X.shape[0], 1), dtype=np.float64)

    # Parallel loop
    lock = threading.Lock()
    Parallel(
        n_jobs=n_jobs,
        verbose=self.verbose,
        **_joblib_parallel_args(require="sharedmem")
    )(
        delayed(_accumulate_prediction)(
            self.features[i],
            self.thresholds[i],
            self.childrens[i],
            self.values[i],
            X,
            [y_hat],
            lock,
        )
        for i in range(self.n_estimators)
    )

    y_hat /= self.n_estimators

    return y_hat
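
# Heavily hedged sketch (hypothetical, not the project's actual helper) of how
# a single prediction could be read from the flattened tree arrays used in the
# two methods above: `children[node]` holds the left/right child ids (-1 for a
# leaf), `features` and `thresholds` define the split at each internal node,
# and `values[node]` is the leaf output that the parallel workers would sum
# into the shared buffer.
def _example_flat_tree_predict(features, thresholds, children, values, x):
    node = 0
    while children[node][0] != -1:  # -1 marks a leaf in this sketch
        if x[features[node]] <= thresholds[node]:
            node = children[node][0]
        else:
            node = children[node][1]
    return values[node]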
def predict_proba_trees(self, X):
    check_is_fitted(self)
    # Check data
    X = self._validate_X_predict(X)
    # TODO: we can also avoid data binning for predictions...
    X_binned = self._bin_data(X, is_training_data=False)
    n_samples, n_features = X.shape
    n_estimators = len(self.trees)
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)
    probas = np.empty((n_estimators, n_samples, n_features))

    lock = threading.Lock()
    Parallel(
        n_jobs=n_jobs,
        verbose=self.verbose,
        **_joblib_parallel_args(require="sharedmem"),
    )(delayed(_get_tree_prediction)(e.predict_proba, X_binned, probas, lock,
                                    tree_idx)
      for tree_idx, e in enumerate(self.trees))

    return probas
def predict(self, X):
    """
    Predict regression target for X.

    The predicted regression target of an input sample is computed as the
    mean predicted regression targets of the trees in the forest.

    Parameters
    ----------
    X : array-like or sparse matrix of shape (n_samples, n_features)
        The input samples. Internally, its dtype will be converted to
        ``dtype=np.float32``. If a sparse matrix is provided, it will be
        converted into a sparse ``csr_matrix``.

    Returns
    -------
    y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        The predicted values.
    """
    check_is_fitted(self)
    # Check data
    X = self._validate_X_predict(X)

    # Assign chunk of trees to jobs
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

    # Parallel loop
    # Store the output of every estimator in order to compute confidence
    # intervals
    y_hat = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                     **_joblib_parallel_args(require="sharedmem"))(
        delayed(_accumulate_prediction)(e.predict, X, self.minimum_value)
        for e in self.forest.estimators_)

    y_hat_below = np.percentile(y_hat, self.confidence_interval_lower, axis=0)
    y_hat_above = np.percentile(y_hat, self.confidence_interval_upper, axis=0)

    return np.dstack((y_hat_below, y_hat_above))
def _apply_predict_method(self, X, method, parallelized, dimout):
    """
    Generic *predict* method, works for *predict_proba* and
    *decision_function* as well.
    """
    check_is_fitted(self, 'estimators_')
    if len(self.estimators_) == 0:
        raise RuntimeError(
            "Estimator was apparently fitted but contains no estimator.")
    if not hasattr(self.estimators_[0], method):
        raise TypeError("Estimator {} does not have method '{}'.".format(
            type(self.estimators_[0]), method))
    if isinstance(X, pandas.DataFrame):
        X = X.values

    association = self.transform_bins(X)

    indpred = Parallel(n_jobs=self.n_jobs,
                       **_joblib_parallel_args(prefer='threads'))(
        delayed(parallelized)(i, model, X, association)
        for i, model in enumerate(self.estimators_))

    pred = numpy.zeros((X.shape[0], dimout) if dimout > 1 else (X.shape[0], ))
    indall = numpy.empty((X.shape[0], ))
    indall[:] = False
    for ind, p in indpred:
        if ind is None:
            continue
        pred[ind] = p
        indall = numpy.logical_or(indall, ind)  # pylint: disable=E1111

    # not in a bucket
    indall = numpy.logical_not(indall)  # pylint: disable=E1111
    Xmissed = X[indall]
    if Xmissed.shape[0] > 0:
        meth = getattr(self.mean_estimator_, method)
        missed = meth(Xmissed)
        pred[indall] = missed
    return pred
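
# Hypothetical sketch (not the project's actual helper) of a `parallelized`
# callable compatible with `_apply_predict_method` above: it selects the rows
# associated with bucket i and returns the row mask together with that
# bucket's predictions, or (None, None) when the bucket is empty, which is
# exactly how the (ind, p) pairs are consumed by the caller.
def _example_parallelized(i, model, X, association):
    ind = association == i
    if not ind.any():
        return None, None
    return ind, model.predict(X[ind])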
def oob_predict_buffer(self, X, y, parallel=True):
    X = check_array(X, dtype=DTYPE, accept_sparse='csr')

    n_samples = X.shape[0]
    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, None)

    prediction_buffer = np.zeros(
        (n_samples_bootstrap, len(self.estimators_)), dtype='float32')
    prediction_buffer[:, :] = np.nan

    if parallel:
        # Assign chunk of trees to jobs
        n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

        # Parallel loop
        lock = threading.Lock()
        Parallel(
            n_jobs=n_jobs, verbose=self.verbose,
            **_joblib_parallel_args(require="sharedmem"))(
            delayed(_oob_bufferize_prediction)(
                e.predict, X, estimator, prediction_buffer, lock,
                n_samples, n_samples_bootstrap, self.n_outputs_,
                e.random_state)
            for e, estimator in zip(self.estimators_,
                                    range(0, len(self.estimators_))))
    else:
        for e, estimator in zip(self.estimators_,
                                range(0, len(self.estimators_))):
            unsampled_indices = _generate_unsampled_indices(
                e.random_state, n_samples, n_samples_bootstrap)
            p_estimator = e.predict(X[unsampled_indices, :],
                                    check_input=False)
            prediction_buffer[unsampled_indices, estimator] = p_estimator

    self.__buffer = prediction_buffer
def from_public(cls, args: CommonArgs):
    if args.data_public == 'qm7':
        qm_data = QM7(ase=True)
    elif args.data_public == 'qm9':
        qm_data = QM9(ase=True)
    else:
        raise RuntimeError(f'Unknown public data set {args.data_public}')
    data = Parallel(
        n_jobs=args.n_jobs, verbose=True,
        **_joblib_parallel_args(prefer='processes'))(
        delayed(cls.get_subDataset)(
            [],
            [],
            args.mixture_type,
            [],
            args.reaction_type,
            tolist(qm_data.iloc[i].get('atoms')),
            to_numpy(qm_data.iloc[i:i + 1][args.target_columns]),
            to_numpy(qm_data.iloc[i:i + 1].get(args.feature_columns)),
            args.features_generator,
        ) for i in qm_data.index)
    return cls(data)
def _forest_predict_var(forest, X_test, n_jobs):
    """Helper function to accumulate predictions and their variances.

    Parameters
    ----------
    forest : RandomForestRegressor
        Regressor object.

    X_test : ndarray, shape (n_test_samples, n_features)
        The design matrix for testing data.

    n_jobs : int or None, optional (default=None)
        The number of jobs to run in parallel. ``None`` means 1.
        ``-1`` means use all processors.
    """
    check_is_fitted(forest)
    X_test = forest._validate_X_predict(X_test)

    n_jobs, _, _ = _partition_estimators(forest.n_estimators, n_jobs)

    y_hat = np.zeros((X_test.shape[0]), dtype=np.float64)
    y_var = np.zeros((X_test.shape[0]), dtype=np.float64)

    # Parallel loop
    lock = threading.Lock()
    Parallel(n_jobs=n_jobs, verbose=forest.verbose,
             **_joblib_parallel_args(require='sharedmem'))(
        delayed(_accumulate_predictions_and_var)(e.predict, X_test,
                                                 [y_hat, y_var], lock)
        for e in forest.estimators_)

    y_hat /= len(forest.estimators_)
    y_var /= len(forest.estimators_)
    y_var -= y_hat**2

    return [y_hat, y_var]
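
# Hypothetical usage sketch for `_forest_predict_var` (the estimator and data
# are placeholders, and the `_accumulate_predictions_and_var` helper it
# delegates to is assumed to be defined elsewhere in the module): the helper
# returns the per-sample mean prediction and the variance of the individual
# tree predictions, since E[y^2] - (E[y])^2 is accumulated across trees.
def _example_forest_predict_var():
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor

    X, y = make_regression(n_samples=100, n_features=5, random_state=0)
    forest = RandomForestRegressor(n_estimators=20, random_state=0).fit(X, y)
    y_hat, y_var = _forest_predict_var(forest, X[:10], n_jobs=2)
    return y_hat.shape, y_var.shape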
def fit(self, X, y=None, *, groups=None, **fit_kwargs):
    cv = check_cv(self.cv)
    jobs = (delayed(self._compute_score_path)(X, y, train, test, **fit_kwargs)
            for train, test in cv.split(X, y, groups=groups))
    score_path = Parallel(n_jobs=self.n_jobs, verbose=self.verbose_cv,
                          **_joblib_parallel_args(prefer='threads'))(jobs)

    self.mean_scores_ = np.mean(score_path, axis=0)
    self.best_index_ = np.argmin(self.mean_scores_)
    self.best_score_ = np.min(self.mean_scores_)
    self.best_hyperparams_ = self.hyperparams_grid_[self.best_index_]

    for name in self.hyperparam_names:
        setattr(self.model, name, self.best_hyperparams_[name])
    self.model.fit(X, y, **fit_kwargs)
    return self
def fit(self, X, y, sample_weight=None):
    """
    Build a forest of trees from the training set (X, y).

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples. Internally, its dtype will be converted
        to ``dtype=np.float32``. If a sparse matrix is provided, it will be
        converted into a sparse ``csc_matrix``.

    y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        The target values (class labels in classification, real numbers in
        regression).

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If None, then samples are equally weighted. Splits
        that would create child nodes with net zero or negative weight are
        ignored while searching for a split in each node. In the case of
        classification, splits are also ignored if they would result in any
        single class carrying a negative weight in either child node.

    Returns
    -------
    self : object
    """
    # Validate or convert input data
    if issparse(y):
        raise ValueError("sparse multilabel-indicator for y is not supported.")
    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X)

    if issparse(X):
        # Pre-sort indices to avoid that each individual tree of the
        # ensemble sorts the indices.
        X.sort_indices()

    # Remap output
    n_samples, self.n_features_ = X.shape
    y = np.atleast_1d(y)
    if y.ndim == 2 and y.shape[1] == 1:
        warn("A column-vector y was passed when a 1d array was"
             " expected. Please change the shape of y to "
             "(n_samples,), for example using ravel().",
             DataConversionWarning, stacklevel=2)
    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity against vs
        # [:, np.newaxis] that does not.
        y = np.reshape(y, (-1, 1))
    self.n_outputs_ = y.shape[1]
    y, expanded_class_weight = self._validate_y_class_weight(y)
    if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
        y = np.ascontiguousarray(y, dtype=DOUBLE)

    # Get bootstrap sample size
    n_samples_bootstrap = _get_n_samples_bootstrap(
        n_samples=X.shape[0], max_samples=self.max_samples)

    # Check parameters
    self._validate_estimator()

    random_state = check_random_state(self.random_state)
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)
    trees = [
        self._make_estimator(append=False, random_state=random_state)
        for i in range(self.n_estimators)
    ]

    # Pre-allocate OOB estimations
    oob_decision_function = np.zeros((n_samples, self.classes_[0].shape[0]))
    lock = threading.Lock()
    rets = Parallel(
        n_jobs=n_jobs, verbose=self.verbose,
        **_joblib_parallel_args(prefer='threads', require="sharedmem"))(
        delayed(_parallel_build_trees)(
            t, X, y, n_samples_bootstrap, sample_weight,
            oob_decision_function, lock)
        for i, t in enumerate(trees))

    # Collect newly grown trees
    for feature, threshold, children, value in rets:
        # No check on feature and threshold since 1-D array is always
        # C-aligned and F-aligned.
        self.features.append(feature)
        self.thresholds.append(threshold)
        self.childrens.append(children)
        self.values.append(value)

    # Check the OOB predictions
    if (oob_decision_function.sum(axis=1) == 0).any():
        warn("Some inputs do not have OOB predictions. "
             "This probably means too few trees were used "
             "to compute any reliable oob predictions.")

    prediction = (oob_decision_function /
                  oob_decision_function.sum(axis=1)[:, np.newaxis])
    self.oob_decision_function_ = prediction

    # Decapsulate classes_ attributes
    if hasattr(self, "classes_") and self.n_outputs_ == 1:
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]

    return self
def fit(self, X, y, sample_weight=None):
    """Build a forest of survival trees from the training set (X, y).

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features)
        Data matrix

    y : structured array, shape = (n_samples,)
        A structured array containing the binary event indicator
        as first field, and time of event or time of censoring as
        second field.

    Returns
    -------
    self
    """
    X, event, time = check_arrays_survival(X, y)

    self.n_features_ = X.shape[1]
    time = time.astype(np.float64)
    self.event_times_ = np.unique(time[event])
    self.n_outputs_ = self.event_times_.shape[0]

    y_numeric = np.empty((X.shape[0], 2), dtype=np.float64)
    y_numeric[:, 0] = time
    y_numeric[:, 1] = event.astype(np.float64)

    # Get bootstrap sample size
    n_samples_bootstrap = _get_n_samples_bootstrap(
        n_samples=X.shape[0], max_samples=self.max_samples)

    # Check parameters
    self._validate_estimator()

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    random_state = check_random_state(self.random_state)

    if not self.warm_start or not hasattr(self, "estimators_"):
        # Free allocated memory, if any
        self.estimators_ = []

    n_more_estimators = self.n_estimators - len(self.estimators_)

    if n_more_estimators < 0:
        raise ValueError("n_estimators=%d must be larger or equal to "
                         "len(estimators_)=%d when warm_start==True"
                         % (self.n_estimators, len(self.estimators_)))
    elif n_more_estimators == 0:
        warnings.warn("Warm-start fitting without increasing n_estimators "
                      "does not fit new trees.")
    else:
        if self.warm_start and len(self.estimators_) > 0:
            # We draw from the random state to get the random state we
            # would have got if we hadn't used a warm_start.
            random_state.randint(MAX_INT, size=len(self.estimators_))

        trees = [
            self._make_estimator(append=False, random_state=random_state)
            for i in range(n_more_estimators)
        ]

        # Parallel loop: we prefer the threading backend as the Cython code
        # for fitting the trees is internally releasing the Python GIL
        # making threading more efficient than multiprocessing in
        # that case. However, for joblib 0.12+ we respect any
        # parallel_backend contexts set at a higher level,
        # since correctness does not rely on using threads.
        trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                         **_joblib_parallel_args(prefer='threads'))(
            delayed(_parallel_build_trees)(
                t, self, X, (y_numeric, self.event_times_), sample_weight,
                i, len(trees),
                verbose=self.verbose,
                n_samples_bootstrap=n_samples_bootstrap)
            for i, t in enumerate(trees))

        # Collect newly grown trees
        self.estimators_.extend(trees)

    if self.oob_score:
        self._set_oob_score(X, (event, time))

    return self
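
# Hedged sketch (synthetic data, not from the original code) of the structured
# target array that the survival-forest `fit` above expects: the first field
# is the boolean event indicator and the second is the observed time, which is
# the layout `check_arrays_survival` unpacks; the field names here are only
# examples.
def _example_survival_target():
    import numpy as np

    rng = np.random.RandomState(0)
    time = rng.exponential(scale=10.0, size=50)
    event = rng.rand(50) < 0.7
    y = np.empty(50, dtype=[("event", bool), ("time", float)])
    y["event"] = event
    y["time"] = time
    return y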
def test_joblib_parallel_args(monkeypatch, joblib_version):
    import joblib

    monkeypatch.setattr(joblib, "__version__", joblib_version)

    if joblib_version == "0.12.0":
        # arguments are simply passed through
        assert _joblib_parallel_args(prefer="threads") == {"prefer": "threads"}
        assert _joblib_parallel_args(prefer="processes", require=None) == {
            "prefer": "processes",
            "require": None,
        }
        assert _joblib_parallel_args(non_existing=1) == {"non_existing": 1}
    elif joblib_version == "0.11":
        # arguments are mapped to the corresponding backend
        assert _joblib_parallel_args(prefer="threads") == {
            "backend": "threading"
        }
        assert _joblib_parallel_args(prefer="processes") == {
            "backend": "multiprocessing"
        }
        with pytest.raises(ValueError):
            _joblib_parallel_args(prefer="invalid")
        assert _joblib_parallel_args(prefer="processes", require="sharedmem") == {
            "backend": "threading"
        }
        with pytest.raises(ValueError):
            _joblib_parallel_args(require="invalid")
        with pytest.raises(NotImplementedError):
            _joblib_parallel_args(verbose=True)
    else:
        raise ValueError
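
# A hedged sketch (not from the original test module) of why the snippets in
# this file funnel backend hints through `_joblib_parallel_args`: on joblib
# >= 0.12 the soft hints `prefer`/`require` are forwarded as-is, while on
# joblib 0.11 they are translated into a hard `backend` choice, so the same
# call site works on both versions.
def _example_joblib_parallel_args_usage():
    from joblib import Parallel, delayed

    def square(v):
        return v * v

    kwargs = _joblib_parallel_args(prefer="threads")
    return Parallel(n_jobs=2, **kwargs)(delayed(square)(v) for v in range(5))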
def fit(self, X, y, sample_weight=None):
    """
    Trains the binner and an estimator on every bucket.

    :param X: features, *X* is converted into an array if *X* is a dataframe
    :param y: target
    :param sample_weight: sample weights
    :return: self: returns an instance of self.

    Fitted attributes:

    * `binner_`: binner
    * `estimators_`: dictionary of estimators, each of them mapped to a leaf
      of the tree
    * `mean_estimator_`: estimator trained on the whole dataset in case the
      binner cannot find a bucket for a new observation
    * `dim_`: dimension of the output
    * `mean_`: average targets
    """
    if isinstance(X, pandas.DataFrame):
        X = X.values
    if isinstance(X, list):
        raise TypeError(  # pragma: no cover
            "X cannot be a list.")
    binner = clone(self.binner)
    if sample_weight is None:
        self.binner_ = binner.fit(X, y)
    else:
        self.binner_ = binner.fit(X, y, sample_weight=sample_weight)

    association, self.mapping_, self.leaves_ = self._mapping_train(
        X, self.binner_)

    estimators = [clone(self.estimator) for i in self.mapping_]

    loop = (tqdm(range(len(estimators)))
            if self.verbose == 'tqdm' else range(len(estimators)))
    verbose = 1 if self.verbose == 'tqdm' else (1 if self.verbose else 0)

    self.mean_estimator_ = clone(self.estimator).fit(X, y, sample_weight)

    nb_classes = (None if not hasattr(self.mean_estimator_, 'classes_')
                  else len(set(self.mean_estimator_.classes_)))

    if hasattr(self, 'random_state') and self.random_state is not None:  # pylint: disable=E1101
        rnd = numpy.random.RandomState(  # pylint: disable=E1101
            self.random_state)  # pylint: disable=E1101
    else:
        rnd = None

    self.estimators_ = \
        Parallel(n_jobs=self.n_jobs, verbose=verbose,
                 **_joblib_parallel_args(prefer='threads'))(
            delayed(_fit_piecewise_estimator)(
                i, estimators[i], X, y, sample_weight, association,
                nb_classes, rnd)
            for i in loop)

    self.dim_ = 1 if len(y.shape) == 1 else y.shape[1]
    if hasattr(self.estimators_[0], 'classes_'):
        self.classes_ = self.estimators_[0].classes_
    return self
def test_joblib_parallel_args(monkeypatch, joblib_version):
    import sklearn.utils._joblib
    monkeypatch.setattr(sklearn.utils._joblib, '__version__', joblib_version)

    if joblib_version == '0.12.0':
        # arguments are simply passed through
        assert _joblib_parallel_args(prefer='threads') == {'prefer': 'threads'}
        assert _joblib_parallel_args(prefer='processes', require=None) == {
            'prefer': 'processes', 'require': None}
        assert _joblib_parallel_args(non_existing=1) == {'non_existing': 1}
    elif joblib_version == '0.11':
        # arguments are mapped to the corresponding backend
        assert _joblib_parallel_args(prefer='threads') == {
            'backend': 'threading'}
        assert _joblib_parallel_args(prefer='processes') == {
            'backend': 'multiprocessing'}
        with pytest.raises(ValueError):
            _joblib_parallel_args(prefer='invalid')
        assert _joblib_parallel_args(
            prefer='processes', require='sharedmem') == {
                'backend': 'threading'}
        with pytest.raises(ValueError):
            _joblib_parallel_args(require='invalid')
        with pytest.raises(NotImplementedError):
            _joblib_parallel_args(verbose=True)
    else:
        raise ValueError
def fit(self, X, y, sample_weight=None):
    """Build a forest of trees from the training set (X, y).

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples. Internally, its dtype will be converted
        to ``dtype=np.float32``. If a sparse matrix is provided, it will be
        converted into a sparse ``csc_matrix``.

    y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        The target values (class labels in classification, real numbers in
        regression).

    sample_weight : array-like of shape (n_samples,)
        Sample weights. If None, then samples are equally weighted. Splits
        that would create child nodes with net zero or negative weight are
        ignored while searching for a split in each node. In the case of
        classification, splits are also ignored if they would result in any
        single class carrying a negative weight in either child node.

    Returns
    -------
    self : object
        The fitted instance.
    """
    # Validate or convert input data
    if issparse(y):
        raise ValueError("sparse multilabel-indicator for y is not supported.")
    X, y = self._validate_data(X, y, multi_output=True,
                               accept_sparse="csc", dtype=DTYPE)
    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X)

    self._n_features = X.shape[1]

    if issparse(X):
        # Pre-sort indices to avoid that each individual tree of the
        # ensemble sorts the indices.
        X.sort_indices()

    y = np.atleast_1d(y)
    if y.ndim == 2 and y.shape[1] == 1:
        warn(
            "A column-vector y was passed when a 1d array was"
            " expected. Please change the shape of y to "
            "(n_samples,), for example using ravel().",
            DataConversionWarning,
            stacklevel=2,
        )

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity against vs
        # [:, np.newaxis] that does not.
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]

    y_encoded, expanded_class_weight = self._validate_y_class_weight(y)

    if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
        y_encoded = np.ascontiguousarray(y_encoded, dtype=DOUBLE)

    if isinstance(self.sampling_strategy, dict):
        self._sampling_strategy = {
            np.where(self.classes_[0] == key)[0][0]: value
            for key, value in check_sampling_strategy(
                self.sampling_strategy,
                y,
                "under-sampling",
            ).items()
        }
    else:
        self._sampling_strategy = self.sampling_strategy

    if expanded_class_weight is not None:
        if sample_weight is not None:
            sample_weight = sample_weight * expanded_class_weight
        else:
            sample_weight = expanded_class_weight

    # Get bootstrap sample size
    n_samples_bootstrap = _get_n_samples_bootstrap(
        n_samples=X.shape[0], max_samples=self.max_samples)

    # Check parameters
    self._validate_estimator()

    if not self.bootstrap and self.oob_score:
        raise ValueError(
            "Out of bag estimation only available if bootstrap=True")

    random_state = check_random_state(self.random_state)

    if not self.warm_start or not hasattr(self, "estimators_"):
        # Free allocated memory, if any
        self.estimators_ = []
        self.samplers_ = []
        self.pipelines_ = []

    n_more_estimators = self.n_estimators - len(self.estimators_)

    if n_more_estimators < 0:
        raise ValueError("n_estimators=%d must be larger or equal to "
                         "len(estimators_)=%d when warm_start==True"
                         % (self.n_estimators, len(self.estimators_)))
    elif n_more_estimators == 0:
        warn("Warm-start fitting without increasing n_estimators does not "
             "fit new trees.")
    else:
        if self.warm_start and len(self.estimators_) > 0:
            # We draw from the random state to get the random state we
            # would have got if we hadn't used a warm_start.
            random_state.randint(MAX_INT, size=len(self.estimators_))

        trees = []
        samplers = []
        for _ in range(n_more_estimators):
            tree, sampler = self._make_sampler_estimator(
                random_state=random_state)
            trees.append(tree)
            samplers.append(sampler)

        # Parallel loop: we prefer the threading backend as the Cython code
        # for fitting the trees is internally releasing the Python GIL
        # making threading more efficient than multiprocessing in
        # that case. However, we respect any parallel_backend contexts set
        # at a higher level, since correctness does not rely on using
        # threads.
        samplers_trees = Parallel(
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            **_joblib_parallel_args(prefer="threads"),
        )(delayed(_local_parallel_build_trees)(
            s,
            t,
            self,
            X,
            y_encoded,
            sample_weight,
            i,
            len(trees),
            verbose=self.verbose,
            class_weight=self.class_weight,
            n_samples_bootstrap=n_samples_bootstrap,
        ) for i, (s, t) in enumerate(zip(samplers, trees)))
        samplers, trees = zip(*samplers_trees)

        # Collect newly grown trees
        self.estimators_.extend(trees)
        self.samplers_.extend(samplers)

        # Create pipeline with the fitted samplers and trees
        self.pipelines_.extend([
            make_pipeline(deepcopy(s), deepcopy(t))
            for s, t in zip(samplers, trees)
        ])

    if self.oob_score:
        self._set_oob_score(X, y_encoded)

    # Decapsulate classes_ attributes
    if hasattr(self, "classes_") and self.n_outputs_ == 1:
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]

    return self
def compute_feature_importance(self, x, y, partition_feature=None,
                               norm=True, n_jobs=None):
    '''
    :param x: input X of the data; must be pandas.core.frame.DataFrame or
        pandas.core.series.Series
    :param y: input Y of the data; the type does not need to be specified,
        but it must be supported by numpy
    :param partition_feature: used for partitioning the data into local data
        subspaces; must be a column of data that can be hashed, but is
        optional. You can also partition the data in advance and input the
        feature subspaces one by one. For example, to compute local variable
        importance for each day, let partition_feature be the day of year
        (1-365), or input the feature subspace for each day one by one.
    :param norm: whether to normalise the output so that each row sums to one.
    :param n_jobs: the number of jobs running in parallel. Please refer to
        the class Parallel in the package sklearn for more detailed
        information.
    :return: local variable importance
    '''
    # Obtain the names of the variables
    if not isinstance(x, Series) and not isinstance(x, DataFrame):
        raise TypeError(
            "{0} must be pandas.core.frame.DataFrame or pandas.core.series.Series not {1}"
            .format(x, type(x)))
    columns = x.columns
    # Convert input X into a numpy.array
    x = array(x, dtype=float64)
    # Convert input Y to a 1-D array
    y = array(y).ravel()
    # Obtain the number of variables
    self.FN = x.shape[1]

    # Produce the data_choose array. This array contains bool values used to
    # choose the rows of each feature-subspace dataset.
    if type(partition_feature) != type(None):
        partition_factor = list(partition_feature)
        # Use a set structure to extract the factors
        partition_factor_set = set(partition_factor)
        partition_factor_list = list(partition_factor_set)
        # Obtain the number of group attributes
        self.FL = len(partition_factor_list)
        partition_factor_arr = np.array(partition_factor_list).reshape(
            self.FL, 1)
        # For each factor, find the rows of the input group_by equal to it
        data_choose_bool = partition_factor_arr == partition_factor
    else:
        # If no group_by is given, use all input rows
        self.FL = 1
        partition_factor_list = None
        data_choose_bool = np.ones((1, x.shape[0])) == 1

    # Parallelise over the trees. This is inherited from sklearn; refer to
    # sklearn for a more detailed description.
    indicators = Parallel(
        n_jobs=n_jobs, verbose=self.verbose, max_nbytes='1M',
        **_joblib_parallel_args(prefer='threads'))(
        delayed(self.__traverse__)(tree, x, y, data_choose_bool)
        for tree in self.estimators_)  # traverse each tree in the forest

    # Vertically stack the arrays returned by the traversal
    feature_importance_trees = vstack(indicators)
    # Compute the averaged feature importance
    feature_importance_forest = np.average(feature_importance_trees, axis=0)

    if not isinstance(norm, bool):
        raise TypeError('{0} must be True or False not {1}'.format(
            norm, type(norm)))
    if norm:
        # Standardise the output:
        # sum up each row
        sum_of_feature_importance = feature_importance_forest.sum(
            axis=1).reshape(feature_importance_forest.shape[0], 1)
        # divide each entry by the sum of its row
        feature_importance_norm = feature_importance_forest / (
            sum_of_feature_importance + (sum_of_feature_importance == 0))
    else:
        # Output directly without normalisation
        feature_importance_norm = feature_importance_forest

    # Return the result as a DataFrame
    return pd.DataFrame(feature_importance_norm, columns=columns,
                        index=partition_factor_list)
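
# Hypothetical usage sketch for `compute_feature_importance` (the fitted
# forest, the DataFrame and the monthly grouping are placeholders, not from
# the original code; it assumes `x_df` has a DatetimeIndex): grouping rows by
# month yields a DataFrame with one row of normalised importances per month.
def _example_local_importance_usage(fitted_forest, x_df, y):
    months = x_df.index.month
    return fitted_forest.compute_feature_importance(
        x_df, y, partition_feature=months, norm=True, n_jobs=2)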
def test_joblib_parallel_args(monkeypatch, joblib_version):
    import joblib
    monkeypatch.setattr(joblib, '__version__', joblib_version)

    if joblib_version == '0.12.0':
        # arguments are simply passed through
        assert _joblib_parallel_args(prefer='threads') == {'prefer': 'threads'}
        assert _joblib_parallel_args(prefer='processes', require=None) == {
            'prefer': 'processes', 'require': None
        }
        assert _joblib_parallel_args(non_existing=1) == {'non_existing': 1}
    elif joblib_version == '0.11':
        # arguments are mapped to the corresponding backend
        assert _joblib_parallel_args(prefer='threads') == {
            'backend': 'threading'
        }
        assert _joblib_parallel_args(prefer='processes') == {
            'backend': 'multiprocessing'
        }
        with pytest.raises(ValueError):
            _joblib_parallel_args(prefer='invalid')
        assert _joblib_parallel_args(prefer='processes',
                                     require='sharedmem') == {
            'backend': 'threading'
        }
        with pytest.raises(ValueError):
            _joblib_parallel_args(require='invalid')
        with pytest.raises(NotImplementedError):
            _joblib_parallel_args(verbose=True)
    else:
        raise ValueError
def fit(self, X, y, sample_weight=None):
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples,)
        Target vector relative to X.

    sample_weight : ignored
        Ignored by diffprivlib. Present for consistency with sklearn API.

    Returns
    -------
    self : class
    """
    self.accountant.check(self.epsilon, 0)

    if sample_weight is not None:
        warn_unused_args("sample_weight")

    if not isinstance(self.C, numbers.Real) or self.C < 0:
        raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
    if not isinstance(self.max_iter, numbers.Integral) or self.max_iter < 0:
        raise ValueError(
            "Maximum number of iteration must be positive; got (max_iter=%r)"
            % self.max_iter)
    if not isinstance(self.tol, numbers.Real) or self.tol < 0:
        raise ValueError(
            "Tolerance for stopping criteria must be positive; got (tol=%r)"
            % self.tol)

    solver = _check_solver(self.solver, self.penalty, self.dual)

    X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64, order="C",
                     accept_large_sparse=solver != 'liblinear')
    check_classification_targets(y)
    self.classes_ = np.unique(y)
    _, n_features = X.shape

    if self.data_norm is None:
        warnings.warn(
            "Data norm has not been specified and will be calculated on the data provided. This will "
            "result in additional privacy leakage. To ensure differential privacy and no additional "
            "privacy leakage, specify `data_norm` at initialisation.",
            PrivacyLeakWarning)
        self.data_norm = np.linalg.norm(X, axis=1).max()

    X = clip_to_norm(X, self.data_norm)

    self.multi_class = _check_multi_class(self.multi_class, solver,
                                          len(self.classes_))

    n_classes = len(self.classes_)
    classes_ = self.classes_
    if n_classes < 2:
        raise ValueError(
            "This solver needs samples of at least 2 classes in the data, but the data contains only "
            "one class: %r" % classes_[0])

    if len(self.classes_) == 2:
        n_classes = 1
        classes_ = classes_[1:]

    if self.warm_start:
        warm_start_coef = getattr(self, 'coef_', None)
    else:
        warm_start_coef = None
    if warm_start_coef is not None and self.fit_intercept:
        warm_start_coef = np.append(warm_start_coef,
                                    self.intercept_[:, np.newaxis], axis=1)

    self.coef_ = list()
    self.intercept_ = np.zeros(n_classes)

    if warm_start_coef is None:
        warm_start_coef = [None] * n_classes

    path_func = delayed(_logistic_regression_path)

    fold_coefs_ = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        **_joblib_parallel_args(prefer='processes'))(
        path_func(X, y, epsilon=self.epsilon / n_classes,
                  data_norm=self.data_norm, pos_class=class_, Cs=[self.C],
                  fit_intercept=self.fit_intercept, max_iter=self.max_iter,
                  tol=self.tol, verbose=self.verbose, coef=warm_start_coef_,
                  check_input=False)
        for class_, warm_start_coef_ in zip(classes_, warm_start_coef))

    fold_coefs_, _, n_iter_ = zip(*fold_coefs_)
    self.n_iter_ = np.asarray(n_iter_, dtype=np.int32)[:, 0]

    self.coef_ = np.asarray(fold_coefs_)
    self.coef_ = self.coef_.reshape(n_classes,
                                    n_features + int(self.fit_intercept))

    if self.fit_intercept:
        self.intercept_ = self.coef_[:, -1]
        self.coef_ = self.coef_[:, :-1]

    self.accountant.spend(self.epsilon, 0)

    return self
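
# Hedged usage sketch for the differentially private `fit` above (the data and
# hyper-parameter values are placeholders, not from the original code):
# supplying `data_norm` up front avoids the PrivacyLeakWarning, because
# otherwise the clipping norm is computed from the data itself.
def _example_dp_logistic_regression():
    import numpy as np
    from diffprivlib.models import LogisticRegression

    rng = np.random.RandomState(0)
    X = rng.normal(size=(100, 3))
    y = (X[:, 0] + 0.1 * rng.normal(size=100) > 0).astype(int)
    clf = LogisticRegression(epsilon=1.0, data_norm=3.0).fit(X, y)
    return clf.predict(X[:5])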