def _get_unsampled_indices(tree, n_samples):
    """
    Return the out-of-bag sample indices of ``tree`` on any sklearn version.

    ``_generate_unsampled_indices`` takes two arguments up to sklearn 0.21
    and three (adding the bootstrap sample size) from 0.22 onwards; this
    wrapper picks the matching call.
    """
    legacy_api = LooseVersion(sklearn.__version__) < LooseVersion("0.22")
    if legacy_api:
        # Version 0.21 or older uses only two arguments.
        return _generate_unsampled_indices(tree.random_state, n_samples)

    # Version 0.22 or newer uses 3 arguments.
    from sklearn.ensemble.forest import _get_n_samples_bootstrap
    bootstrap_size = _get_n_samples_bootstrap(n_samples, n_samples)
    return _generate_unsampled_indices(
        tree.random_state, n_samples, bootstrap_size)
def oob_r2(rf_reg, X_train, y_train):
    """Compute the out-of-bag R2 of random forest regressor.

    Every tree predicts only the training rows it did not sample; the
    per-row predictions are averaged over the trees for which the row was
    out-of-bag and scored against ``y_train`` with ``r2_score``.

    Rows that were never out-of-bag are dropped (with a warning) from the
    targets, the prediction sums AND the prediction counts, so the final
    division is always between arrays of the same length and never divides
    by zero.

    Parameters
    ----------
    rf_reg : fitted forest exposing ``estimators_``
    X_train : array indexable as ``X_train[rows, :]``
    y_train : array indexable with a boolean mask

    Returns
    -------
    The R2 score returned by ``r2_score``.
    """
    n_samples = X_train.shape[0]
    n_preds = np.zeros(n_samples)
    preds_sum = np.zeros(n_samples)

    # Iterate over all trees
    for tree in rf_reg.estimators_:
        # Generate unsampled indices
        unsampled_idxs = _generate_unsampled_indices(tree.random_state,
                                                     n_samples)
        preds_sum[unsampled_idxs] += tree.predict(X_train[unsampled_idxs, :])
        n_preds[unsampled_idxs] += 1

    has_oob = n_preds != 0
    if not has_oob.all():
        warnings.warn("Some features didn't have OOB samples.")
        # Discard samples that weren't OOB in any tree. The counts must be
        # filtered together with the sums, otherwise the division below
        # mixes arrays of different lengths.
        y_train = y_train[has_oob]
        preds_sum = preds_sum[has_oob]
        n_preds = n_preds[has_oob]

    avg_preds = preds_sum / n_preds
    return r2_score(y_train, avg_preds)
def _set_oob_score(self, X, y):
    """Calculate out of bag predictions and score.

    Stores the averaged out-of-bag prediction per sample in
    ``self.oob_prediction_`` and the first element of
    ``concordance_index_censored(event, time, predictions)`` in
    ``self.oob_score_``. ``y`` is unpacked as ``(event, time)``.
    """
    X = check_array(X, dtype=DTYPE)
    n_samples = X.shape[0]
    event, time = y

    oob_sum = np.zeros(n_samples)
    oob_hits = np.zeros(n_samples)
    for est in self.estimators_:
        oob_rows = _generate_unsampled_indices(est.random_state, n_samples)
        oob_sum[oob_rows] += est.predict(X[oob_rows, :], check_input=False)
        oob_hits[oob_rows] += 1

    never_oob = oob_hits == 0
    if never_oob.any():
        warnings.warn("Some inputs do not have OOB scores. "
                      "This probably means too few trees were used "
                      "to compute any reliable oob estimates.")
        # Divide by one instead of zero; those rows keep a prediction of 0.
        oob_hits[never_oob] = 1

    oob_sum /= oob_hits
    self.oob_prediction_ = oob_sum
    self.oob_score_ = concordance_index_censored(event, time, oob_sum)[0]
def _get_unsampled_indices(self, tree, n_samples):
    """
    Return the out-of-bag sample indices of ``tree`` on any sklearn version.

    Taken from rfpimp module to decouple dependency and modify
    <https://github.com/parrt/random-forest-importances>
    """
    call_args = [tree.random_state, n_samples]
    if LooseVersion(sklearn.__version__) >= LooseVersion("0.22"):
        # Version 0.22 or newer uses 3 arguments; 0.21 or older only two.
        call_args.append(_get_n_samples_bootstrap(n_samples, n_samples))
    return _generate_unsampled_indices(*call_args)
def oob_accuracy(rf_clf, X_train, y_train):
    """Compute the out-of-bag accuracy of random forest classifier.

    Each tree contributes ``predict_proba`` only for the training rows it
    did not sample. The summed probabilities are turned into a label by
    taking the arg-max column and looking it up in ``rf_clf.classes_``, so
    labels need not be the integers ``0..k-1`` and the number of columns
    always matches what the trees return.

    Rows that were never out-of-bag are dropped with a warning.

    Parameters
    ----------
    rf_clf : fitted forest classifier exposing ``estimators_`` and ``classes_``
    X_train : array indexable as ``X_train[rows, :]``
    y_train : array of labels, indexable with a boolean mask

    Returns
    -------
    Fraction of out-of-bag rows whose predicted label equals ``y_train``.
    """
    n_samples = X_train.shape[0]
    classes = np.asarray(rf_clf.classes_)
    n_preds = np.zeros(n_samples)
    proba_sum = np.zeros((n_samples, len(classes)))

    # Iterate over all trees
    for tree in rf_clf.estimators_:
        # Generate unsampled indices
        unsampled_idxs = _generate_unsampled_indices(tree.random_state,
                                                     n_samples)
        proba_sum[unsampled_idxs, :] += tree.predict_proba(
            X_train[unsampled_idxs, :])
        n_preds[unsampled_idxs] += 1

    has_oob = n_preds != 0
    if not has_oob.all():
        warnings.warn("Some features didn't have OOB samples.")
        y_train = y_train[has_oob]
        proba_sum = proba_sum[has_oob, :]

    # No need to divide by n_preds: arg-max is unaffected by the row scale.
    preds_classes = classes[np.argmax(proba_sum, axis=1)]
    return (y_train == preds_classes).mean()
def _getOOBIndices(self):
    '''
    Collect, for every tree of ``self.rf``, the rows of ``self.X`` that
    were left out of that tree's bootstrap sample.

    Output: dictionary keyed by the tree's position in
    ``self.rf.estimators_``; each value is whatever
    ``_generate_unsampled_indices`` returns for that tree.
    '''
    n_rows = self.X.shape[0]
    return {
        tree_id: _generate_unsampled_indices(est.random_state, n_rows)
        for tree_id, est in enumerate(self.rf.estimators_)
    }
def rf_accuracy(rf, X, y, type = 'oob', metric = 'accuracy'):
    """Score a forest on test, training or out-of-bag rows.

    ``metric`` selects ``accuracy_score`` ('accuracy') or ``neg_mse``
    ('mse'). For ``type='test'`` (and for ``type='train'`` on a forest
    without bootstrap) the whole forest is scored on all of ``X``.
    Otherwise each tree is scored on its own rows ('oob': unsampled rows,
    'train': sampled rows) and the per-tree scores are averaged weighted by
    the number of rows used.

    Raises ``ValueError`` for an unknown ``metric`` or ``type``, or when
    'oob' is requested from a forest fitted without bootstrap.
    """
    if metric not in ('accuracy', 'mse'):
        raise ValueError('metric type not understood')
    score = accuracy_score if metric == 'accuracy' else neg_mse

    n_samples, _ = X.shape

    if type == 'test' or (type == 'train' and not rf.bootstrap):
        return score(y, rf.predict(X))

    weighted_total = 0
    rows_scored = 0
    for tree in rf.estimators_:
        if type == 'oob':
            if not rf.bootstrap:
                raise ValueError('Without bootstrap, it is not possible to calculate oob.')
            indices = _generate_unsampled_indices(tree.random_state, n_samples, n_samples)
        elif type == 'train':
            indices = _generate_sample_indices(tree.random_state, n_samples, n_samples)
        else:
            raise ValueError('type is not recognized. (%s)'%(type))
        n_rows = len(indices)
        weighted_total += score(y[indices,:], tree.predict(X[indices, :])) * n_rows
        rows_scored += n_rows
    return weighted_total / rows_scored
def oob_regression_r2_score(rf, X_train, y_train):
    """
    Compute out-of-bag (OOB) R^2 for a scikit-learn random forest regressor.

    ``X_train`` may be a DataFrame and ``y_train`` a Series; both are
    unwrapped to their ``.values`` first, anything else is used as given.
    Rows that were never out-of-bag keep a prediction of 0 (a warning is
    issued).

    We learned the guts of scikit's RF from the BSD licensed code:
    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L702
    """
    X = X_train
    if isinstance(X_train, pd.DataFrame):
        X = X_train.values
    y = y_train
    if isinstance(y_train, pd.Series):
        y = y_train.values

    n_rows = len(X)
    oob_sum = np.zeros(n_rows)
    oob_hits = np.zeros(n_rows)
    for tree in rf.estimators_:
        oob_rows = _generate_unsampled_indices(tree.random_state, n_rows)
        oob_sum[oob_rows] += tree.predict(X[oob_rows, :])
        oob_hits[oob_rows] += 1

    never_oob = oob_hits == 0
    if never_oob.any():
        warnings.warn("Too few trees; some variables do not have OOB scores.")
        # Avoid dividing by zero for rows no tree left out.
        oob_hits[never_oob] = 1

    return r2_score(y, oob_sum / oob_hits)
def get_tree_oob_score(tree, X_train, y_train):
    """Return the RMSE of ``tree`` on the training rows it did not sample."""
    oob_rows = _generate_unsampled_indices(tree.random_state,
                                           X_train.shape[0])
    targets = y_train[oob_rows]
    residuals = targets - tree.predict(X_train[oob_rows])
    return np.sqrt(np.mean(residuals ** 2))
def _calculate_ps_prox(self):
    """Calculate the out of bag proximities between the rest and the
    treated, for the rest.

    Builds, per tree of ``self._clf``, a boolean membership column and the
    leaf each observation falls into, then hands both (for all rows and for
    the rows with ``z == 1``) to ``counterfactuals.proximity``. The result
    is stored in ``self._proximity``.
    """
    from counterfactuals import proximity
    from sklearn.ensemble.forest import _generate_unsampled_indices

    n_rows = self._data.shape[0]

    # (2d) boolean array, one column per tree: True where the row's index
    # label is among that tree's unsampled (out-of-bag) positions.
    # NOTE(review): labels are compared against positional indices, which
    # presumably relies on a default 0..n-1 index - confirm.
    # A list (not a generator) is required: NumPy no longer accepts
    # non-sequence iterables in the stack functions.
    Zin = np.column_stack([
        np.isin(self._data.index,
                _generate_unsampled_indices(e.random_state, n_rows))
        for e in self._clf.estimators_
    ])

    # (2d) array of leaves for each observation in each tree
    leaves = np.array(self._clf.apply(self._data[self._feature_names]),
                      dtype=np.int32, order='c')

    treated = self._data['z'] == 1
    ZT = Zin[treated, :]
    LT = leaves[treated, :]
    ZO = np.ascontiguousarray(Zin)
    LO = leaves

    # proximity matrix (others x treated)
    self._proximity = proximity(len(self._clf.estimators_), LO.shape[0],
                                LT.shape[0], LO, LT, ZO, ZT)
def oob_regression_r2_score(rf, X_train, y_train):
    """
    Compute out-of-bag (OOB) R^2 for a scikit-learn random forest regressor.

    Both ``X_train`` and ``y_train`` are read through ``.values``. Each
    tree predicts only the rows outside its bootstrap sample; predictions
    are averaged per row and scored with ``r2_score``.

    We learned the guts of scikit's RF from the BSD licensed code:
    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L702
    """
    X, y = X_train.values, y_train.values
    total_rows = len(X)
    summed = np.zeros(total_rows)
    counts = np.zeros(total_rows)

    for tree in rf.estimators_:
        left_out = _generate_unsampled_indices(tree.random_state, total_rows)
        summed[left_out] += tree.predict(X[left_out, :])
        counts[left_out] += 1

    if (counts == 0).any():
        warnings.warn("Too few trees; some variables do not have OOB scores.")
        # Rows never left out keep a sum of 0; divide them by 1, not 0.
        counts[counts == 0] = 1

    summed /= counts
    return r2_score(y, summed)
def oob_classifier_accuracy(rf, X_train, y_train):
    """
    Compute out-of-bag (OOB) accuracy for a scikit-learn random forest
    classifier.

    Class probabilities from each tree are accumulated only on the rows
    that tree did not sample; the column with the largest total is mapped
    to a label through ``rf.classes_``. Rows that were never out-of-bag
    keep an all-zero row and therefore resolve to ``rf.classes_[0]``.

    We learned the guts of scikit's RF from the BSD licensed code:
    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425
    """
    features = X_train.values
    y = y_train.values
    n_rows = len(features)

    # One column of accumulated probability per distinct label in y.
    votes = np.zeros((n_rows, len(np.unique(y))))
    for tree in rf.estimators_:
        oob_rows = _generate_unsampled_indices(tree.random_state, n_rows)
        votes[oob_rows] += tree.predict_proba(features[oob_rows, :])

    predicted_classes = [rf.classes_[col] for col in np.argmax(votes, axis=1)]
    return np.mean(y == predicted_classes)
def _get_unsampled_indices(tree, n_samples):
    """
    Return the out-of-bag sample indices of ``tree``.

    Always uses the three-argument form of ``_generate_unsampled_indices``,
    with the bootstrap size obtained from
    ``_get_n_samples_bootstrap(n_samples, n_samples)``.
    """
    from sklearn.ensemble.forest import _get_n_samples_bootstrap

    bootstrap_size = _get_n_samples_bootstrap(n_samples, n_samples)
    return _generate_unsampled_indices(
        tree.random_state, n_samples, bootstrap_size)
def feature_importance(rf, X, y, type = 'oob', normalized = False, balanced = False, demean=False,normal_fX = False):
    """Per-feature importance built from per-tree prediction contributions.

    For every tree a set of rows is chosen according to ``type``:

    * ``'oob'``     - rows from ``_generate_unsampled_indices`` (requires
      ``rf.bootstrap``, otherwise ``ValueError``);
    * ``'test'``    - all rows of ``X``;
    * ``'classic'`` - rows from ``_generate_sample_indices`` when
      ``rf.bootstrap`` is set, otherwise all rows.

    The third value returned by ``_predict_tree`` ("contributions") is then
    contracted against ``y`` on those rows with ``np.tensordot``.

    Parameters
    ----------
    rf : fitted forest exposing ``estimators_``, ``bootstrap`` and
        ``n_estimators``.
    X : 2-D array.
    y : 2-D array; a ``ValueError`` is raised for any other rank.
    normalized : divide each tree's vector by its own sum instead of by the
        number of rows used.
    balanced : for 'oob'/'test', reweight each row by (count of its leaf
        among the tree's sampled rows) / (count of its leaf among the rows
        being evaluated), rescaled to mean 1.
    demean : subtract the column means from ``y`` first.
    normal_fX : pass every ``contributions[:, :, k]`` slice through
        ``scale`` before use.

    Returns
    -------
    (out, SE) : the mean over trees of the per-tree vectors, and
        ``sqrt((mean of squares - out**2) / rf.n_estimators)``.
    """
    n_samples, n_features = X.shape
    if len(y.shape) != 2:
        raise ValueError('y must be 2d array (n_samples, 1) if numerical or (n_samples, n_categories).')
    out = np.zeros((n_features,))
    SE = np.zeros((n_features,))
    if demean:
        # demean y
        y = y - np.mean(y, axis=0)
    for tree in rf.estimators_:
        # Pick the rows this tree is evaluated on.
        if type == 'oob':
            if rf.bootstrap:
                indices = _generate_unsampled_indices(tree.random_state, n_samples, n_samples)
            else:
                raise ValueError('Without bootstrap, it is not possible to calculate oob.')
        elif type == 'test':
            indices = np.arange(n_samples)
        elif type == 'classic':
            if rf.bootstrap:
                indices = _generate_sample_indices(tree.random_state, n_samples, n_samples)
            else:
                indices = np.arange(n_samples)
        else:
            raise ValueError('type is not recognized. (%s)'%(type))
        # Only the third return value of _predict_tree is used.
        # presumably shaped (rows, features) or (rows, features, outputs) - TODO confirm
        _, _, contributions = _predict_tree(tree, X[indices,:])
        if balanced and (type == 'oob' or type == 'test'):
            base_indices = _generate_sample_indices(tree.random_state, n_samples, n_samples)
            # Leaf id of every evaluated row and of every sampled row.
            ids = tree.apply(X[indices, :])
            base_ids = tree.apply(X[base_indices, :])
            # weight1: 1 / (number of evaluated rows in the leaf)
            tmp1, tmp2 = np.unique(ids, return_counts = True)
            weight1 = {key: 1. / value for key, value in zip(tmp1, tmp2)}
            # weight2: number of sampled rows in the leaf
            tmp1, tmp2 = np.unique(base_ids, return_counts = True)
            weight2 = {key: value for key, value in zip(tmp1, tmp2)}
            # NOTE(review): weight2[id] raises KeyError if an evaluated row
            # lands in a leaf that no sampled row reaches - confirm this
            # cannot happen for the trees passed in.
            final_weights = np.array([[weight1[id] * weight2[id]] for id in ids])
            final_weights /= np.mean(final_weights)
        else:
            final_weights = 1
        # Give 2-D contributions a trailing axis so the tensordot below
        # always sees three dimensions.
        if len(contributions.shape) == 2:
            contributions = contributions[:,:,np.newaxis]
        #print(contributions.shape, y[indices,:].shape)
        if normal_fX:
            for k in range(contributions.shape[-1]):
                contributions[:, :, k] = scale(contributions[:, :, k])
        # Contract axis 0 and axis 1 of the weighted y with axes 0 and 2 of
        # contributions, leaving one value per entry of axis 1 (features).
        tmp = np.tensordot(np.array(y[indices,:]) * final_weights, contributions, axes=([0, 1], [0, 2]))
        if normalized:
            out += tmp / sum(tmp)
        else:
            out += tmp / len(indices)
        # Accumulate squares for the spread estimate computed below.
        if normalized:
            SE += (tmp / sum(tmp)) ** 2
        else:
            SE += (tmp / len(indices)) ** 2
    out /= rf.n_estimators
    SE /= rf.n_estimators
    # sqrt((E[v^2] - E[v]^2) / n_estimators)
    SE = ((SE - out ** 2) / rf.n_estimators) ** .5
    return out, SE
def _extract_oob(estimator, x, y, sample_weight=None):
    """Returns OOB sample data for the given estimator.

    The result is ``(x_oob, y_oob, w_oob)``; ``w_oob`` is ``None`` when no
    ``sample_weight`` was supplied.
    """
    oob_rows = _generate_unsampled_indices(estimator.random_state,
                                           x.shape[0])
    x_oob, y_oob = x[oob_rows, :], y[oob_rows]
    if sample_weight is None:
        return x_oob, y_oob, None
    return x_oob, y_oob, sample_weight[oob_rows]
def worker(tree):
    """Return per-class count increments contributed by one tree.

    The tree's unsampled training rows are dropped down the tree to build a
    class histogram per leaf; the histograms are normalised, corrected with
    ``self._finite_sample_correct`` and then looked up for every row of
    ``X_eval``, scaled by a per-row node count.
    """
    n_classes = model.n_classes_

    # Estimation set: the rows NOT used for learning this tree.
    estimation_indices = _generate_unsampled_indices(tree.random_state, n)

    node_counts = tree.tree_.n_node_samples
    unique_leaf_nodes = np.unique(self._get_leaves(tree))

    def leaf_slot(node_id):
        # Position of a node id inside the sorted unique leaf ids.
        return np.where(unique_leaf_nodes == node_id)[0][0]

    # Drop each estimation example down the tree, and record its 'y' value.
    class_counts_per_leaf = np.zeros((len(unique_leaf_nodes), n_classes))
    for i in estimation_indices:
        reached = tree.apply(X_train[i].reshape((1, -1))).item()
        class_counts_per_leaf[leaf_slot(reached), y_train[i]] += 1

    # Number of estimation points per leaf; 1 where empty to avoid 0/0.
    n_per_leaf = class_counts_per_leaf.sum(axis=1)
    n_per_leaf[n_per_leaf == 0] = 1

    # Posterior probability distributions in each leaf (one row per leaf).
    leaf_totals = np.repeat(n_per_leaf.reshape((-1, 1)), n_classes, axis=1)
    posterior_per_leaf = self._finite_sample_correct(
        np.divide(class_counts_per_leaf, leaf_totals), n_per_leaf)

    # Slot of the leaf each evaluation row falls into.
    eval_slots = [leaf_slot(node) for node in tree.apply(X_eval)]

    # Posterior for each element of the evaluation set.
    eval_posteriors = np.array([posterior_per_leaf[slot] for slot in eval_slots])

    # Node count looked up at the same slot for each evaluation row.
    n_per_eval_leaf = np.asarray([node_counts[slot] for slot in eval_slots])

    eval_totals = np.repeat(n_per_eval_leaf.reshape((-1, 1)), n_classes, axis=1)
    return np.multiply(eval_posteriors, eval_totals)
def _map(u, x):
    # Index of the first position at which array `u` equals `x`.
    # Raises IndexError when `x` does not occur in `u`.
    return np.where(u == x)[0][0]

# NOTE(review): the statements below belong to an enclosing function whose
# header is not visible here; `m`, `n`, `model`, `in_task`, `subsample`,
# `train`, `y` and `test` come from that scope. They repeat the lookup
# `_map` performs inline rather than calling it.
class_counts = np.zeros((m, model.n_classes_))
for tree in model:
    # get out of bag indicies
    if in_task:
        prob_indices = _generate_unsampled_indices(tree.random_state, n)
        # in_bag_idx = _generate_sample_indices(tree.random_state, n) # this is not behaving as i expected
    else:
        # Random subset (without replacement) of int(subsample*n) rows.
        prob_indices = np.random.choice(range(n), size=int(subsample*n), replace=False)
    leaf_nodes = get_leaves(tree)
    unique_leaf_nodes = np.unique(leaf_nodes)
    # get all node counts
    node_counts = tree.tree_.n_node_samples
    # get probs for eval samples: one histogram row per unique leaf
    posterior_class_counts = np.zeros((len(unique_leaf_nodes), model.n_classes_))
    for prob_index in prob_indices:
        # Leaf reached by this single training row.
        temp_node = tree.apply(train[prob_index].reshape(1, -1)).item()
        posterior_class_counts[np.where(unique_leaf_nodes == temp_node)[0][0], y[prob_index]] += 1
    # total number of points in a node
    row_sums = posterior_class_counts.sum(axis=1)
    # no divide by zero
    row_sums[row_sums == 0] = 1
    # posteriors
    class_probs = (posterior_class_counts / row_sums[:, None])
    # posteriors with finite sampling correction
    class_probs = finite_sample_correction(class_probs, row_sums)
    # The list produced here is discarded; class_probs itself is unchanged.
    class_probs.tolist()
    # node_counts is indexed by the leaf's position within
    # unique_leaf_nodes. NOTE(review): confirm that is the intended index
    # into the per-node count array.
    partition_counts = np.asarray([node_counts[np.where(unique_leaf_nodes == x)[0][0]] for x in tree.apply(test)])
    # get probability for out of bag samples
    eval_class_probs = [class_probs[np.where(unique_leaf_nodes == x)[0][0]] for x in tree.apply(test)]
    eval_class_probs = np.array(eval_class_probs)
    # find total elements for out of bag samples
    elems = np.multiply(eval_class_probs, partition_counts[:, np.newaxis])
    # store counts for each x (repeat this for each tree)
    class_counts += elems
# calculate p(y|X = x) for all x's
probs = class_counts / class_counts.sum(axis=1, keepdims=True)
return probs
def learn_rf(self, X_train, y_train, t):
    """Fit a bootstrapped random forest seeded with ``t``.

    Returns ``(model, unsampled_indices)`` where ``unsampled_indices`` is
    the union of the rows each tree left out of its bootstrap sample.
    """
    forest = RandomForestClassifier(n_estimators=self.n_trees,
                                    max_depth=self.max_depth,
                                    bootstrap=True,
                                    random_state=t,
                                    verbose=0)
    model = forest.fit(X_train, y_train)
    n_samples = X_train.shape[0]

    def left_out_rows(tree):
        # Rows not drawn by this tree's bootstrap sample.
        return _generate_unsampled_indices(
            tree.random_state, n_samples,
            _get_n_samples_bootstrap(n_samples, None))

    trees = model.estimators_
    unsampled_indices = left_out_rows(trees[0])
    for tree in trees[1:]:
        merged = np.concatenate((unsampled_indices, left_out_rows(tree)), 0)
        unsampled_indices = np.unique(merged)
    return model, unsampled_indices
def computeError(tree, n_samples,X, y, typ="MSE"):
    """Score one tree on its out-of-bag rows.

    With ``typ == "MSE"`` the error is the mean absolute difference between
    targets and predictions (despite the name); any other ``typ`` uses
    ``r2_score``. ``X`` is indexed with ``.iloc`` and ``y`` through
    ``.values``.

    Returns ``[round(error, 2), round(1 - error, 2)]``.
    """
    oob_rows = _generate_unsampled_indices(tree.random_state, n_samples,n_samples)

    # Features and targets restricted to the out-of-bag rows.
    oob_features = X.iloc[oob_rows,:]
    oob_targets = y.values[oob_rows]

    predicted = tree.predict(oob_features)

    if typ != "MSE":
        error = r2_score(oob_targets, predicted)
    else:
        error = sum(abs(oob_targets-predicted))/len(predicted)

    # [OOB_Err, OOB_Acc]
    return [round(error,2), round(1-error,2)]
def _set_oob_score(self, X, y):
    """Compute out-of-bag score.

    Sets ``self.oob_decision_function_`` (row-normalised out-of-bag class
    probabilities; a list with one array per output when there are several
    outputs) and ``self.oob_score_`` (accuracy averaged over outputs).
    """
    validate_X_y(X, y)
    check_X_is_univariate(X)

    n_outputs = self.n_outputs_
    n_classes_ = self.n_classes_
    n_samples = y.shape[0]

    predictions = [
        np.zeros((n_samples, n_classes_[k])) for k in range(n_outputs)
    ]

    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples,
                                                   self.max_samples)

    for estimator in self.estimators_:
        oob_rows = _generate_unsampled_indices(
            estimator.random_state, n_samples, n_samples_bootstrap)
        p_estimator = estimator.predict_proba(X.iloc[oob_rows, :])

        # Single-output estimators return a bare array; wrap it so both
        # cases can be indexed by output.
        if n_outputs == 1:
            p_estimator = [p_estimator]

        for k, votes in enumerate(predictions):
            votes[oob_rows, :] += p_estimator[k]

    oob_decision_function = []
    oob_score = 0.0
    for k, votes in enumerate(predictions):
        if (votes.sum(axis=1) == 0).any():
            warn("Some inputs do not have OOB scores. "
                 "This probably means too few trees were used "
                 "to compute any reliable oob estimates.")

        oob_decision_function.append(
            votes / votes.sum(axis=1)[:, np.newaxis])
        oob_score += np.mean(y[:, k] == np.argmax(votes, axis=1), axis=0)

    self.oob_decision_function_ = (
        oob_decision_function[0] if n_outputs == 1
        else oob_decision_function
    )
    self.oob_score_ = oob_score / n_outputs
def oob_classifier_accuracy(rf, X_train, y_train):
    """
    Compute out-of-bag (OOB) accuracy for a random forest classifier.

    Each tree adds its ``predict_proba`` output to the rows it did not
    sample; the winning column per row is translated to a label with
    ``rf.classes_`` and compared with ``y_train.values``.
    """
    X = X_train.values
    y = y_train.values
    sample_count = len(X)
    class_count = len(np.unique(y))

    proba_totals = np.zeros((sample_count, class_count))
    for tree in rf.estimators_:
        left_out = _generate_unsampled_indices(tree.random_state,
                                               sample_count)
        proba_totals[left_out] += tree.predict_proba(X[left_out, :])

    winners = np.argmax(proba_totals, axis=1)
    predicted_classes = [rf.classes_[w] for w in winners]
    return np.mean(y == predicted_classes)
def cat_rf_entropy_estimate(X, y, n_estimators = 200, max_samples = .32, bootstrap = True, depth = 30, min_samples_leaf = 1):
    """Estimate a mean conditional entropy with a bagged ensemble of trees.

    A ``BaggingClassifier`` of ``DecisionTreeClassifier`` is fitted on
    ``(X, y)``. For every tree, the rows returned by
    ``_generate_unsampled_indices`` are shuffled and split in half: the
    first half fills a class histogram per tree node, the second half is
    dropped down the tree and receives the (corrected) class probabilities
    of the node it lands in, scaled by that node's ``n_node_samples``.
    The accumulated counts are normalised per row, turned into entropies,
    and averaged.

    Returns the mean entropy, with NaN entropies replaced by 0.
    """
    model = BaggingClassifier(DecisionTreeClassifier(max_depth = depth, min_samples_leaf = min_samples_leaf),
                              n_estimators = n_estimators,
                              max_samples= max_samples,
                              bootstrap = bootstrap)
    model.fit(X, y)
    class_counts = np.zeros((X.shape[0], model.n_classes_))
    for tree in model:
        # get out of bag indices
        # NOTE(review): this derives the rows from the tree's random_state;
        # confirm that matches how BaggingClassifier drew its samples.
        unsampled_indices = _generate_unsampled_indices(tree.random_state, len(X))
        total_unsampled = len(unsampled_indices)
        # In-place shuffle using the global NumPy RNG, then split in half.
        np.random.shuffle(unsampled_indices)
        prob_indices, eval_indices = unsampled_indices[:total_unsampled//2], unsampled_indices[total_unsampled//2:]
        # get all node counts
        node_counts = tree.tree_.n_node_samples
        # get probs for eval samples: one histogram row per tree node
        posterior_class_counts = np.zeros((len(node_counts), model.n_classes_))
        for prob_index in prob_indices:
            # NOTE(review): X[prob_index].item() only succeeds when the row
            # holds a single element, and tree.apply is then handed a
            # scalar - presumably X is (n, 1); confirm this call works.
            posterior_class_counts[tree.apply(X[prob_index].item()).item(), y[prob_index]] += 1
        row_sums = posterior_class_counts.sum(axis=1)
        # Avoid dividing by zero for nodes that received no rows.
        row_sums[row_sums == 0] = 1
        class_probs = (posterior_class_counts/row_sums[:, None])
        # Replace exact 0 / 1 probabilities by 1/(2n) and 1 - 1/(2n), where
        # n is the node's row sum, so the logarithm below stays finite.
        where_0 = np.argwhere(class_probs == 0)
        for elem in where_0:
            class_probs[elem[0], elem[1]] = 1/(2*row_sums[elem[0], None])
        where_1 = np.argwhere(class_probs == 1)
        for elem in where_1:
            class_probs[elem[0], elem[1]] = 1 - 1/(2*row_sums[elem[0], None])
        # The list produced here is discarded; class_probs is unchanged.
        class_probs.tolist()
        partition_counts = np.asarray([node_counts[x] for x in tree.apply(X[eval_indices])])
        # get probability for out of bag samples
        eval_class_probs = [class_probs[x] for x in tree.apply(X[eval_indices])]
        eval_class_probs = np.array(eval_class_probs)
        # find total elements for out of bag samples
        elems = np.multiply(eval_class_probs, partition_counts[:, np.newaxis])
        # store counts for each x (repeat this for each tree)
        class_counts[eval_indices] += elems
    # calculate p(y|X = x) for all x's; rows never evaluated give 0/0 = NaN
    probs = class_counts/class_counts.sum(axis = 1, keepdims = True)
    entropies = -np.sum(np.log(probs)*probs, axis = 1)
    # convert nan to 0
    entropies = np.nan_to_num(entropies)
    return np.mean(entropies)
def getOOBErrorTree(model, X, y):
    """Per-tree out-of-bag error and its complement.

    Sets ``model.n_jobs = -1`` as a side effect. For each tree (numbered
    from 1) the error is the mean absolute difference between
    ``y.values`` and the tree's predictions on its out-of-bag rows of
    ``X`` (indexed with ``.iloc``).

    Returns two dictionaries keyed by tree number: the rounded error and
    ``1 - error`` rounded.
    """
    n_samples = X.shape[0]
    model.n_jobs = -1

    OOB_Err, OOB_Acc = {}, {}
    for tree_id, tree in enumerate(model.estimators_, start=1):
        # Rows this tree did not see during fitting.
        oob_rows = _generate_unsampled_indices(tree.random_state,
                                               n_samples, n_samples)
        oob_features = X.iloc[oob_rows, :]
        oob_targets = y.values[oob_rows]
        predicted = tree.predict(oob_features)

        error = sum(abs(oob_targets - predicted)) / len(predicted)
        OOB_Err[tree_id] = round(error, 2)
        OOB_Acc[tree_id] = round(1 - error, 2)
    return OOB_Err, OOB_Acc
def _oob_predictions_and_indices(estimators, data):
    """Computes the out-of-bag predictions and indices for each provided
    estimator.

    Args:
        estimators (List[sklearn.tree.DecisionTreeClassifier]): Estimators
            of the ensemble model.
        data (array-like, shape=(n_samples, n_features)): The training
            input samples that were used to fit the model.

    Returns:
        List[Tuple[np.array, np.array]]: One ``(predictions, indices)``
        pair per estimator. ``predictions`` has length ``n_samples`` and is
        zero everywhere except at the out-of-bag ``indices``.
    """
    import numpy as np
    from sklearn.ensemble.forest import _generate_unsampled_indices

    n_samples = data.shape[0]

    def oob_pair(estimator):
        rows = _generate_unsampled_indices(estimator.random_state, n_samples)
        padded = np.zeros(n_samples)
        padded[rows] = estimator.predict(data[rows, :])
        return padded, rows

    return [oob_pair(estimator) for estimator in estimators]
def check_oob(self, x, y):
    """Mark, per tree, which rows were predicted correctly.

    Returns ``(in_sample_tensor, out_sample_tensor, y)``. Both tensors have
    one row per estimator of ``self.dt_classifier`` and one column per row
    of ``x``: 1.0 where the tree's integer-cast prediction equals ``y``,
    -1.0 where it differs, 0.0 where the row is not part of that set
    (sampled rows for the first tensor, unsampled rows for the second).
    """
    n_samples = y.shape[0]
    estimators = self.dt_classifier.estimators_
    tensor_shape = (len(estimators), x.shape[0])
    in_sample_tensor = numpy.zeros(shape=tensor_shape)
    out_sample_tensor = numpy.zeros(shape=tensor_shape)

    def stamp(row, indices, estimated, real):
        # `row` is a view into the tensor, so the writes land in place.
        predicted = estimated.astype(int)
        row[indices[numpy.where(predicted == real)]] = 1.0
        row[indices[numpy.where(predicted != real)]] = -1.0

    for i, estimator in enumerate(estimators):
        oob_rows = _generate_unsampled_indices(
            estimator.random_state, n_samples)
        bag_rows = _generate_sample_indices(
            estimator.random_state, n_samples)
        assert set(oob_rows).isdisjoint(bag_rows)

        oob_estimated = estimator.predict(x[oob_rows, :])
        oob_real = y[oob_rows]
        bag_estimated = estimator.predict(x[bag_rows, :])
        bag_real = y[bag_rows]

        stamp(out_sample_tensor[i], oob_rows, oob_estimated, oob_real)
        stamp(in_sample_tensor[i], bag_rows, bag_estimated, bag_real)

    return in_sample_tensor, out_sample_tensor, y
def oob_classifier_accuracy(rf, X_train, y_train):
    """
    Adjusted...
    Compute out-of-bag (OOB) accuracy for a scikit-learn random forest
    classifier.

    ``X_train`` / ``y_train`` may be pandas objects (their ``.values`` are
    used) or plain arrays (a copy is used). Only the missing-attribute case
    falls back to the copy; any other error raised while reading
    ``.values`` is allowed to propagate instead of being swallowed.

    Rows that were never out-of-bag keep an all-zero probability row and
    therefore resolve to ``rf.classes_[0]``.

    We learned the guts of scikit's RF from the BSD licensed code:
    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425

    Returns:
        Fraction of rows whose out-of-bag predicted class equals the label.
    """
    try:
        X = X_train.values
    except AttributeError:
        X = X_train.copy()

    try:
        y = y_train.values
    except AttributeError:
        y = y_train.copy()

    n_samples = len(X)
    n_classes = len(np.unique(y))

    # Accumulated class probabilities, one column per distinct label.
    predictions = np.zeros((n_samples, n_classes))
    for tree in rf.estimators_:
        # Rows this tree did not draw in its bootstrap sample.
        unsampled_indices = _generate_unsampled_indices(
            tree.random_state, n_samples)
        tree_preds = tree.predict_proba(X[unsampled_indices, :])
        predictions[unsampled_indices] += tree_preds

    # Threshold the probabilistic predictions, then map the winning column
    # back to a class label.
    predicted_class_indexes = np.argmax(predictions, axis=1)
    predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]

    oob_score = sum(y == predicted_classes) / float(len(y))
    return oob_score
def oob_classifier_accuracy(rf, X_train, y_train):
    """
    Compute out-of-bag (OOB) accuracy for a scikit-learn random forest
    classifier.

    Probabilities are summed per row over the trees for which the row was
    out-of-bag; the arg-max column is mapped through ``rf.classes_`` and
    compared against ``y_train.values``.

    We learned the guts of scikit's RF from the BSD licensed code:
    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425
    """
    X, y = X_train.values, y_train.values
    n_obs = len(X)
    scores = np.zeros((n_obs, len(np.unique(y))))

    for member in rf.estimators_:
        held_out = _generate_unsampled_indices(member.random_state, n_obs)
        member_proba = member.predict_proba(X[held_out, :])
        scores[held_out] += member_proba

    predicted_classes = [rf.classes_[best] for best in np.argmax(scores, axis=1)]
    return np.mean(y == predicted_classes)
def calculate_cond_pair_vi(model, X, y, sample_weight=None, sampling_weight=None):
    """Computes pairwise permutation VI score for given model, X and y.

    For every tree and every feature pair ``(i, j)`` with ``j >= i``, the
    out-of-bag rows are scored with feature ``i`` shuffled, with feature
    ``j`` shuffled, and with both shuffled. The stored value is
    ``max(0, min(mse_both - mse_i, mse_both - mse_j))``.

    ``sampling_weight`` is forwarded as the third argument of
    ``_generate_unsampled_indices``; ``sample_weight`` weights the permuted
    MSE values. Shuffling uses the global NumPy RNG.

    Returns an ``(n_features, n_features)`` array holding the mean over
    trees; only entries with ``j >= i`` are ever written.
    """
    #if y.ndim == 1:
    #    # reshape is necessary to preserve the data contiguity against vs
    #    # [:, np.newaxis] that does not.
    #    y = np.reshape(y, (-1, 1))
    X = check_array(X, dtype=DTYPE, accept_sparse='csr')
    n_samples = y.shape[0]
    n_features = X.shape[1]
    vi = np.zeros((len(model.estimators_), n_features, n_features), dtype=np.float32)
    for t, estimator in enumerate(model.estimators_):
        # Extract oob features and response values.
        unsampled_indices = _generate_unsampled_indices(
            estimator.random_state, n_samples, sampling_weight)
        X_unsampled = X[unsampled_indices, :]
        y_unsampled = y[unsampled_indices]
        if sample_weight is None:
            weight_unsampled = None
        else:
            weight_unsampled = sample_weight[unsampled_indices]
        # Calculate MSE.
        # NOTE(review): `mse` is not read again in this function.
        y_estimator = estimator.predict(X_unsampled, check_input=False)
        mse = mean_squared_error(y_unsampled, y_estimator)
        # Copy X for second permutation.
        X_copy = np.copy(X_unsampled)
        # Permute variable in X.
        for i in range(n_features):
            # Keep the original column i, then shuffle it in place.
            i_orig = np.array(X_unsampled[:, i])
            np.random.shuffle(X_unsampled[:, i])
            # MSE of permuted i.
            y_perm_i = estimator.predict(X_unsampled, check_input=False)
            mse_i = mean_squared_error(y_unsampled, y_perm_i, sample_weight=weight_unsampled)
            for j in range(i, n_features):
                # Copy and shuffle feature values.
                # NOTE(review): when j == i, j_orig captures the already
                # shuffled column i, so the "restore" below leaves
                # X_copy[:, i] holding shuffled values - confirm intended.
                j_orig = np.array(X_unsampled[:, j])
                np.random.shuffle(X_unsampled[:, j])
                X_copy[:, j] = X_unsampled[:, j]
                # MSE of permuted j.
                y_perm_j = estimator.predict(X_copy, check_input=False)
                mse_j = mean_squared_error(y_unsampled, y_perm_j, sample_weight=weight_unsampled)
                # MSE of permuted i and j.
                y_perm_both = estimator.predict(X_unsampled, check_input=False)
                mse_both = mean_squared_error(y_unsampled, y_perm_both, sample_weight=weight_unsampled)
                # Restore unpermuted feature values.
                X_unsampled[:, j] = j_orig
                X_copy[:, j] = j_orig
                # Store difference for the pair (i, j) in tree t.
                cond_vi = min((mse_both - mse_i), (mse_both - mse_j))
                vi[t, i, j] = max(0, cond_vi)
            # Undo the shuffle of column i before moving to the next i.
            X_unsampled[:, i] = i_orig
    # Calculate overall VI score.
    score = np.mean(vi, axis=0)
    return score
def calculate_perm_vi(model, X, y, sample_weight=None, sampling_weight=None, normalize=False):
    """Computes permutation VI score for given model, X and y.

    For every tree, the out-of-bag rows are scored once as they are and
    once per feature with that feature's column shuffled. A feature's
    importance in a tree is the increase in MSE caused by the shuffle,
    clipped at zero; the result is the mean over trees.

    The baseline MSE and the permuted MSE both use ``sample_weight`` so
    that their difference compares like with like.

    Parameters
    ----------
    model : fitted ensemble exposing ``estimators_``.
    X : feature matrix, validated with ``check_array``.
    y : targets, indexable with the out-of-bag row indices.
    sample_weight : optional per-row weights for the MSE computations.
    sampling_weight : forwarded as the third argument of
        ``_generate_unsampled_indices``.
    normalize : when true, divide the scores by their sum.

    Returns
    -------
    Array with one score per feature.
    """
    X = check_array(X, dtype=DTYPE, accept_sparse='csr')
    n_samples = y.shape[0]
    n_features = X.shape[1]
    vi = np.zeros((len(model.estimators_), n_features), dtype=np.float32)

    for t, estimator in enumerate(model.estimators_):
        # Extract oob features and response values.
        unsampled_indices = _generate_unsampled_indices(
            estimator.random_state, n_samples, sampling_weight)
        X_unsampled = X[unsampled_indices, :]
        y_unsampled = y[unsampled_indices]
        if sample_weight is None:
            weight_unsampled = None
        else:
            weight_unsampled = sample_weight[unsampled_indices]

        # Baseline MSE, weighted exactly like the permuted MSE below.
        y_estimator = estimator.predict(X_unsampled, check_input=False)
        mse = mean_squared_error(y_unsampled, y_estimator,
                                 sample_weight=weight_unsampled)

        # Permute variable in X.
        for i in range(n_features):
            # Copy and shuffle feature values (in place, global NumPy RNG).
            f_orig = np.array(X_unsampled[:, i])
            np.random.shuffle(X_unsampled[:, i])

            # Calculate permuted MSE.
            y_estimator_perm = estimator.predict(X_unsampled,
                                                 check_input=False)
            mse_perm = mean_squared_error(y_unsampled, y_estimator_perm,
                                          sample_weight=weight_unsampled)

            # Restore unpermuted feature values.
            X_unsampled[:, i] = f_orig

            # Store difference for feature i in tree t.
            vi[t, i] = max(0, mse_perm - mse)

    # Calculate overall VI score.
    score = np.mean(vi, axis=0)
    if normalize:
        score /= np.sum(score)
    return score
def _estimate_posteriors(self, test, representation=0, decider=0, subsample=1, acorn=None):
    r"""
    An internal function to estimate the posteriors.

    Input
    test: array-like; test observation(s)
    representation: int; index into self.models_ of the forest to use
    decider: int; index into self.X_ / self.y_ of the data used to fill
    the leaves
    subsample: float; when representation != decider, int(subsample * n)
    training rows are drawn at random to fill the leaves
    acorn: optional seed passed to np.random.seed

    Return
    probs: numpy array; probs[i, k] is the probability of observation i
    being class k
    """
    if acorn is not None:
        np.random.seed(acorn)

    in_task = representation == decider

    train = self.X_[decider]
    y = self.y_[decider]
    model = self.models_[representation]

    n, _ = train.shape
    if test.ndim > 1:
        m, _ = test.shape
    else:
        m = len(test)

    size = len(np.unique(y))
    class_counts = np.zeros((m, size))

    for tree in model:
        # Rows used to fill the leaves: the tree's own out-of-bag rows when
        # in-task, otherwise a random subset of the decider's data.
        if in_task:
            prob_indices = _generate_unsampled_indices(tree.random_state, n)
        else:
            prob_indices = np.random.choice(range(n),
                                            size=int(subsample * n),
                                            replace=False)

        unique_leaf_nodes = np.unique(self._get_leaves(tree))

        def leaf_slot(node_id):
            # Position of a node id inside the sorted unique leaf ids.
            return np.where(unique_leaf_nodes == node_id)[0][0]

        # get all node counts
        node_counts = tree.tree_.n_node_samples

        # Class histogram per leaf.
        posterior_class_counts = np.zeros((len(unique_leaf_nodes), size))
        for prob_index in prob_indices:
            temp_node = tree.apply(train[prob_index].reshape(1, -1)).item()
            posterior_class_counts[leaf_slot(temp_node), y[prob_index]] += 1

        # total number of points in a node; 1 where empty (no 0/0)
        row_sums = posterior_class_counts.sum(axis=1)
        row_sums[row_sums == 0] = 1

        # posteriors with finite sampling correction
        class_probs = self._finite_sample_correction(
            posterior_class_counts / row_sums[:, None], row_sums)

        # Slot of the leaf every test row falls into.
        test_slots = [leaf_slot(x) for x in tree.apply(test)]

        partition_counts = np.asarray([node_counts[slot] for slot in test_slots])
        eval_class_probs = np.array([class_probs[slot] for slot in test_slots])

        # store counts for each x (repeat this for each tree)
        class_counts += np.multiply(eval_class_probs,
                                    partition_counts[:, np.newaxis])

    # calculate p(y|X = x) for all x's
    return class_counts / class_counts.sum(axis=1, keepdims=True)
def get_oob_indices(forest, X_train):
    """Map each estimator's position in ``forest`` to the rows of
    ``X_train`` it left out of its bootstrap sample."""
    return {
        position: _generate_unsampled_indices(member.random_state,
                                              X_train.shape[0])
        for position, member in enumerate(forest)
    }
n_redundant=0, random_state=0, shuffle=False) feature_names = ['x' + str(i) for i in range(X.shape[1])] data = pd.DataFrame(data=X, columns=feature_names) data['y'] = y # fit a random forest clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0, oob_score=True) clf.fit(data[feature_names], data['y']) # print some stuff about it print(clf.feature_importances_) print print(clf.apply([[0, 0, 0, 0]])) # this can be used as estimate of the propensity score. It is only produced if # the argument *oob_score* is passed to the constructor. It is possible that it # is NaN if an observation makes it into all trees print(clf.oob_decision_function_) # We can find out which rows of X were/were not used in a tree using the random # state attributed of the tree like... print(_generate_sample_indices(clf.estimators_[0].random_state, X.shape[0])) print(_generate_unsampled_indices(clf.estimators_[0].random_state, X.shape[0]))