# NOTE: these snippets rely on scikit-learn's private bootstrap helpers. Their
# home and signature vary by version: before 0.22 they live in
# sklearn.ensemble.forest and take (random_state, n_samples); from 0.22 they
# live in sklearn.ensemble._forest and take an extra n_samples_bootstrap
# argument. Adjust the import to match the snippet being used.
import numpy as np
from sklearn.ensemble._forest import (_generate_sample_indices,
                                      _generate_unsampled_indices)
from sklearn.preprocessing import scale


def feature_importance(rf, X, y, type='oob', normalized=False, balanced=False,
                       demean=False, normal_fX=False):
    n_samples, n_features = X.shape
    if len(y.shape) != 2:
        raise ValueError('y must be 2d array (n_samples, 1) if numerical or '
                         '(n_samples, n_categories).')
    out = np.zeros((n_features,))
    SE = np.zeros((n_features,))
    if demean:
        # demean y
        y = y - np.mean(y, axis=0)
    for tree in rf.estimators_:
        if type == 'oob':
            if rf.bootstrap:
                indices = _generate_unsampled_indices(tree.random_state,
                                                      n_samples, n_samples)
            else:
                raise ValueError('Without bootstrap, it is not possible to '
                                 'calculate oob.')
        elif type == 'test':
            indices = np.arange(n_samples)
        elif type == 'classic':
            if rf.bootstrap:
                indices = _generate_sample_indices(tree.random_state,
                                                   n_samples, n_samples)
            else:
                indices = np.arange(n_samples)
        else:
            raise ValueError('type is not recognized. (%s)' % (type))
        _, _, contributions = _predict_tree(tree, X[indices, :])
        if balanced and (type == 'oob' or type == 'test'):
            # reweight each leaf so it contributes in proportion to its
            # in-bag occupancy rather than its OOB/test occupancy
            base_indices = _generate_sample_indices(tree.random_state,
                                                    n_samples, n_samples)
            ids = tree.apply(X[indices, :])
            base_ids = tree.apply(X[base_indices, :])
            tmp1, tmp2 = np.unique(ids, return_counts=True)
            weight1 = {key: 1. / value for key, value in zip(tmp1, tmp2)}
            tmp1, tmp2 = np.unique(base_ids, return_counts=True)
            weight2 = {key: value for key, value in zip(tmp1, tmp2)}
            final_weights = np.array([[weight1[id] * weight2[id]]
                                      for id in ids])
            final_weights /= np.mean(final_weights)
        else:
            final_weights = 1
        if len(contributions.shape) == 2:
            contributions = contributions[:, :, np.newaxis]
        if normal_fX:
            for k in range(contributions.shape[-1]):
                contributions[:, :, k] = scale(contributions[:, :, k])
        tmp = np.tensordot(np.array(y[indices, :]) * final_weights,
                           contributions, axes=([0, 1], [0, 2]))
        if normalized:
            out += tmp / sum(tmp)
            SE += (tmp / sum(tmp)) ** 2
        else:
            out += tmp / len(indices)
            SE += (tmp / len(indices)) ** 2
    out /= rf.n_estimators
    SE /= rf.n_estimators
    SE = ((SE - out ** 2) / rf.n_estimators) ** .5
    return out, SE
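
# A minimal usage sketch for feature_importance. It assumes _predict_tree is
# treeinterpreter's helper returning (prediction, bias, contributions); the
# original source of these snippets may bundle its own variant.
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from treeinterpreter.treeinterpreter import _predict_tree

X_demo, y_demo = make_regression(n_samples=200, n_features=5, random_state=0)
rf_demo = RandomForestRegressor(n_estimators=50, bootstrap=True,
                                random_state=0).fit(X_demo, y_demo)
imp, se = feature_importance(rf_demo, X_demo, y_demo.reshape(-1, 1),
                             type='oob', demean=True)
print(imp, se)  # one importance value and one standard error per feature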

def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
                          verbose=0, class_weight=None):
    """Private function used to fit a single tree in parallel."""
    if verbose > 1:
        print("building tree %d of %d" % (tree_idx + 1, n_trees))
    if forest.bootstrap:
        n_samples = X.shape[0]
        if sample_weight is None:
            curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
        else:
            curr_sample_weight = sample_weight.copy()
        indices = _generate_sample_indices(tree.random_state, n_samples)
        # record the bootstrap indices on the fitted tree itself
        tree.used_indices = indices
        sample_counts = np.bincount(indices, minlength=n_samples)
        curr_sample_weight *= sample_counts
        if class_weight is not None:
            raise RuntimeError(
                "not compatible with the hacked parallel_build_trees")
        tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
    else:
        tree.fit(X, y, sample_weight=sample_weight, check_input=False)
    return tree
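
# A sketch of how a hacked builder like this is typically wired in: it is
# monkey-patched over scikit-learn's module-level function before fitting, so
# every fitted tree then carries its bootstrap indices. Assumes the pre-0.22
# module path and single-process fitting (n_jobs=1).
import numpy as np
import sklearn.ensemble.forest as skforest

skforest._parallel_build_trees = _parallel_build_trees  # patch before fit
rng = np.random.RandomState(0)
X_p, y_p = rng.randn(100, 3), rng.randint(0, 2, 100)
rf_p = skforest.RandomForestClassifier(n_estimators=5, bootstrap=True,
                                       n_jobs=1)
rf_p.fit(X_p, y_p)
print(rf_p.estimators_[0].used_indices[:10])  # bootstrap rows for tree 0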

def rf_accuracy(rf, X, y, type='oob', metric='accuracy'):
    if metric == 'accuracy':
        score = accuracy_score
    elif metric == 'mse':
        score = neg_mse
    else:
        raise ValueError('metric type not understood')
    n_samples, n_features = X.shape
    tmp = 0
    count = 0
    if type == 'test':
        return score(y, rf.predict(X))
    elif type == 'train' and not rf.bootstrap:
        return score(y, rf.predict(X))
    for tree in rf.estimators_:
        if type == 'oob':
            if rf.bootstrap:
                indices = _generate_unsampled_indices(tree.random_state,
                                                      n_samples, n_samples)
            else:
                raise ValueError('Without bootstrap, it is not possible to '
                                 'calculate oob.')
        elif type == 'train':
            indices = _generate_sample_indices(tree.random_state, n_samples,
                                               n_samples)
        else:
            raise ValueError('type is not recognized. (%s)' % (type))
        # weight each tree's score by the number of samples it is scored on
        tmp += score(y[indices, :], tree.predict(X[indices, :])) * len(indices)
        count += len(indices)
    return tmp / count
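
# rf_accuracy references a neg_mse helper that is not defined in this file; a
# minimal sketch of it (negated so that "higher is better" holds for both
# metrics), plus an illustrative call:
from sklearn.metrics import accuracy_score, mean_squared_error

def neg_mse(y_true, y_pred):
    return -mean_squared_error(y_true, y_pred)

# e.g. OOB error of the regression forest built above:
# acc = rf_accuracy(rf_demo, X_demo, y_demo.reshape(-1, 1), type='oob',
#                   metric='mse')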

def test_class_prob():
    """Testing class probabilities from random forests."""
    n_trees = 100
    num_classes = 20
    X, y = make_blobs(n_samples=1000, centers=num_classes, random_state=2,
                      cluster_std=2.0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                        random_state=0)
    forest = get_models('RandomForest', 'classify')
    forest.set_params(n_estimators=n_trees)
    forest.fit(X_train, y_train)
    y_pred_forest = forest.predict(X_test)
    prob_val_all = np.zeros(shape=(len(y_test), num_classes))
    n_samples = X_train.shape[0]
    print('Diff over all trees :')
    for t, estimator in enumerate(forest):
        sample_indices = _generate_sample_indices(estimator.random_state,
                                                  n_samples)
        y_tree_predict = estimator.predict(X_test)
        class_prob = get_class_prob(estimator)
        test_leaves_id = estimator.apply(X_test)
        y_tree_mine = class_prob[test_leaves_id, :]
        diff = np.linalg.norm(y_tree_predict - np.argmax(y_tree_mine, axis=1))
        print("%.2f" % round(diff, 2), end=', ')
        prob_val_all += y_tree_mine  # (n_nodes, num_classes)
    print('')
    prob_val_all = prob_val_all / n_trees
    y_pred_mine_rf = np.argmax(prob_val_all, axis=1)
    print('% Predictions diff = ')
    print(np.linalg.norm(y_pred_forest - y_pred_mine_rf))
    return
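
# test_class_prob relies on get_class_prob and get_models helpers that are not
# defined in this file. A minimal sketch of get_class_prob, assuming it maps
# every node to the class distribution stored in tree_.value, normalized to
# probabilities (this mirrors what predict_proba does at the leaves):
import numpy as np

def get_class_prob(estimator):
    value = estimator.tree_.value.squeeze(axis=1)  # (n_nodes, n_classes)
    return value / value.sum(axis=1, keepdims=True)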

def calc_inbag(n_samples, forest):
    """
    Derive the samples used to create the trees in a scikit-learn
    RandomForest object.

    Recovers the samples in each tree from the random state of that tree
    using :func:`forest._generate_sample_indices`.

    Parameters
    ----------
    n_samples : int
        The number of samples used to fit the scikit-learn RandomForest
        object.
    forest : RandomForest
        Regressor or Classifier object that is already fit by scikit-learn.

    Returns
    -------
    Array of shape (n_samples, n_trees) recording how many times each sample
    was drawn into each tree's bootstrap: rows are samples, columns are
    individual trees.
    """
    n_trees = forest.n_estimators
    inbag = np.zeros((n_samples, n_trees))
    sample_idx = []
    for t_idx in range(n_trees):
        sample_idx.append(
            _generate_sample_indices(forest.estimators_[t_idx].random_state,
                                     n_samples))
        inbag[:, t_idx] = np.bincount(sample_idx[-1], minlength=n_samples)
    return inbag
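
# A quick sanity check for calc_inbag (a sketch, assuming the pre-0.22
# two-argument _generate_sample_indices used above): with bootstrap=True each
# tree draws exactly n_samples rows with replacement, so every column of the
# in-bag matrix sums to n_samples.
import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(0)
X_b, y_b = rng.randn(50, 3), rng.randn(50)
rf_b = RandomForestRegressor(n_estimators=8, bootstrap=True,
                             random_state=0).fit(X_b, y_b)
inbag = calc_inbag(X_b.shape[0], rf_b)
assert (inbag.sum(axis=0) == X_b.shape[0]).all()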

def calculate_inbag(forest, n_samples):
    n_trees = forest.n_estimators
    inbag = np.zeros((n_samples, n_trees))
    for t_idx in range(n_trees):
        sample_idx = _generate_sample_indices(
            forest.estimators_[t_idx].random_state, n_samples)
        inbag[:, t_idx] = np.bincount(sample_idx, minlength=n_samples)
    return inbag

def test_regress_forest():
    """Testing the random forest regression predict function."""
    n_trees = 4
    boston = load_boston()
    X = boston.data
    y = boston.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                        random_state=0)
    # X_train = np.array([range(1, 4), range(4, 7)])
    # y_train = np.array([9, 5])
    # X_test = X_train
    # y_test = y_train
    print('Single regression tree test : ')
    estimator = DecisionTreeRegressor()
    estimator.fit(X_train, y_train)
    y_pred_dt = estimator.predict(X_test)
    node_indicator = estimator.decision_path(X_train)
    mean_vals, _ = get_node_means(node_indicator, y_train)
    test_leaves_id = estimator.apply(X_test)
    y_pred_mine_dt = mean_vals[test_leaves_id]
    diff = np.linalg.norm(y_pred_dt - y_pred_mine_dt)
    print('Tree predictions diff :' + repr(diff))
    print('Regression Forest Test : ')
    forest = get_models('RandomForest', 'regress')
    forest.set_params(n_estimators=n_trees)
    forest.fit(X_train, y_train)
    y_pred_all = np.zeros(shape=(len(y_test)))
    n_samples = X_train.shape[0]
    indicator, n_nodes_ptr = forest.decision_path(X_train)
    for t, estimator in enumerate(forest):
        t_idx = _generate_sample_indices(estimator.random_state, n_samples)
        y_tree_predict = estimator.predict(X_test)
        print('Num nodes = ' + repr(estimator.tree_.node_count))
        node_indicator = indicator[:, n_nodes_ptr[t]:n_nodes_ptr[t + 1]]
        # node_indicator = estimator.decision_path(X_train)
        mean_vals, _ = get_node_means(node_indicator, y_train[t_idx])
        leaves_id = estimator.apply(X_test)
        y_tree_mine = mean_vals[leaves_id]
        diff = np.linalg.norm(y_tree_predict - y_tree_mine)
        # print(y_tree_predict, y_tree_mine)
        print('Tree#' + repr(t) + ': Diff = ' + repr(diff))
        y_pred_all += y_tree_mine
    y_pred_rf = forest.predict(X_test)
    y_pred_mine_rf = y_pred_all / n_trees
    diff = np.linalg.norm(y_pred_rf - y_pred_mine_rf)
    print('Forest predictions difference :' + repr(diff))
    print("#BUG#--> Trees in the forest don't match my tree predictions")
    return
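
# test_regress_forest also relies on an undefined get_node_means helper. A
# minimal sketch, assuming it returns per-node means of y over the samples
# whose decision path crosses each node (node_indicator is the sparse
# samples-by-nodes matrix from decision_path):
import numpy as np

def get_node_means(node_indicator, y):
    counts = np.asarray(node_indicator.sum(axis=0)).ravel()
    sums = node_indicator.T.dot(y)
    return sums / np.maximum(counts, 1), counts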

def _parallel_build_trees(self, tree, forest, X, y, sample_weight, tree_idx,
                          n_trees, verbose=0, class_weight=None):
    """
    Private function used to fit a single tree in parallel.

    Copied from sklearn.ensemble.forest and converted to a class method to
    perform undersampling prior to fitting the single tree.

    :param tree: base_estimator (default=DecisionTreeClassifier())
    :param forest: self (BalancedRandomForestClassifier object)
    :param X: {array-like, sparse matrix}, shape (n_samples, n_features)
        Matrix containing the training data.
    :param y: array-like, shape (n_samples,)
        Corresponding label for each sample in X.
    :param sample_weight: array-like of shape = [n_samples], optional
        Sample weights.
    :param tree_idx: index of this specific tree
    :param n_trees: total number of trees
    :param verbose: int, optional (default=0)
        Controls the verbosity of the building process.
    :param class_weight: dict, list of dicts, "balanced",
        "balanced_subsample" or None, optional (default=None)
        Weights associated with classes in the form
        ``{class_label: weight}``. If not given, all classes are supposed
        to have weight one. For multi-output problems, a list of dicts can
        be provided in the same order as the columns of y.
    :return: fitted tree
    """
    if verbose > 1:
        print("building tree %d of %d" % (tree_idx + 1, n_trees))
    # undersample the majority class before any bootstrap resampling
    X_res, y_res, indices = self.rus.fit_sample(X, y)
    if forest.bootstrap:
        n_samples = X_res.shape[0]
        if sample_weight is None:
            curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
        else:
            curr_sample_weight = sample_weight[indices]
        indices = _generate_sample_indices(tree.random_state, n_samples)
        sample_counts = np.bincount(indices, minlength=n_samples)
        curr_sample_weight *= sample_counts
        if class_weight == 'subsample':
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', DeprecationWarning)
                curr_sample_weight *= compute_sample_weight('auto', y, indices)
        elif class_weight == 'balanced_subsample':
            curr_sample_weight *= compute_sample_weight('balanced', y, indices)
        tree.fit(X_res, y_res, sample_weight=curr_sample_weight,
                 check_input=False)
    else:
        tree.fit(X_res, y_res, sample_weight=sample_weight, check_input=False)
    return tree

def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
                          verbose=0, class_weight=None,
                          n_samples_bootstrap=None):
    """Private function used to fit a single tree in parallel, adjusted for
    pipeline trees."""
    if verbose > 1:
        print("building tree %d of %d" % (tree_idx + 1, n_trees))
    # name of the final-estimator step in the pipeline
    estimator = tree.steps[-1][0]
    if forest.bootstrap:
        n_samples = X.shape[0]
        if sample_weight is None:
            curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
        else:
            curr_sample_weight = sample_weight.copy()
        indices = _generate_sample_indices(tree.random_state, n_samples,
                                           n_samples_bootstrap)
        sample_counts = np.bincount(indices, minlength=n_samples)
        curr_sample_weight *= sample_counts
        if class_weight == 'subsample':
            with catch_warnings():
                simplefilter('ignore', DeprecationWarning)
                curr_sample_weight *= compute_sample_weight('auto', y, indices)
        elif class_weight == 'balanced_subsample':
            curr_sample_weight *= compute_sample_weight('balanced', y, indices)
        fit_params = {
            f'{estimator}__sample_weight': curr_sample_weight,
            f'{estimator}__check_input': True,
        }
        tree.fit(X, y, **fit_params)
    else:
        fit_params = {
            f'{estimator}__sample_weight': sample_weight,
            f'{estimator}__check_input': True,
        }
        tree.fit(X, y, **fit_params)
    return tree
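
# The builder above forwards per-sample weights through Pipeline.fit's
# step-prefixed parameter routing ('<step_name>__<param>'). A minimal sketch
# of the kind of "pipeline tree" it expects; the step names are assumptions.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

pipeline_tree = Pipeline([
    ('scaler', StandardScaler()),
    ('tree', DecisionTreeClassifier(random_state=0)),
])
# tree.steps[-1][0] == 'tree', so the fit params above become
# {'tree__sample_weight': ..., 'tree__check_input': True}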

def _parallel_build_trees_under(tree, forest, X, y, sample_weight, tree_idx,
                                n_trees, verbose=0, class_weight=None):
    """Private function used to fit a single tree in parallel."""
    if verbose > 1:
        print("building tree %d of %d" % (tree_idx + 1, n_trees))
    # undersample X and y first
    if forest.undersample is not None:
        rus = RandomUnderSampler(ratio=lambda y: {
            0: int(Counter(y)[0] / forest.undersample),
            1: Counter(y)[1],
        }, return_indices=True)
        X, y, indices_under = rus.fit_sample(X, y)
        if sample_weight is not None:
            sample_weight = sample_weight[indices_under]
    if forest.bootstrap:
        n_samples = X.shape[0]
        if sample_weight is None:
            curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
        else:
            curr_sample_weight = sample_weight.copy()
        indices = _generate_sample_indices(tree.random_state, n_samples)
        sample_counts = np.bincount(indices, minlength=n_samples)
        curr_sample_weight *= sample_counts
        if class_weight == 'subsample':
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', DeprecationWarning)
                curr_sample_weight *= compute_sample_weight('auto', y, indices)
        elif class_weight == 'balanced_subsample':
            curr_sample_weight *= compute_sample_weight('balanced', y, indices)
        tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
    else:
        tree.fit(X, y, sample_weight=sample_weight, check_input=False)
    return tree

def check_oob(self, x, y):
    n_samples = y.shape[0]
    in_sample_tensor = numpy.zeros(shape=(
        len(self.dt_classifier.estimators_),
        x.shape[0],
    ))
    out_sample_tensor = numpy.zeros(shape=(
        len(self.dt_classifier.estimators_),
        x.shape[0],
    ))
    for i, estimator in enumerate(self.dt_classifier.estimators_):
        unsampled_indices = _generate_unsampled_indices(
            estimator.random_state, n_samples)
        sampled_indices = _generate_sample_indices(
            estimator.random_state, n_samples)
        assert len(set(unsampled_indices) & set(sampled_indices)) == 0
        unsampled_estimated = estimator.predict(x[unsampled_indices, :])
        unsampled_real = y[unsampled_indices]
        sample_estimated = estimator.predict(x[sampled_indices, :])
        sample_real = y[sampled_indices]
        out_sample_success = numpy.where(
            unsampled_estimated.astype(int) == unsampled_real)
        out_sample_fail = numpy.where(
            unsampled_estimated.astype(int) != unsampled_real)
        out_sample_success_indices = unsampled_indices[out_sample_success]
        out_sample_fail_indices = unsampled_indices[out_sample_fail]
        out_sample_tensor[i, out_sample_success_indices] = 1.0
        out_sample_tensor[i, out_sample_fail_indices] = -1.0
        in_sample_success = numpy.where(
            sample_estimated.astype(int) == sample_real)
        in_sample_fail = numpy.where(
            sample_estimated.astype(int) != sample_real)
        in_sample_success_indices = sampled_indices[in_sample_success]
        in_sample_fail_indices = sampled_indices[in_sample_fail]
        in_sample_tensor[i, in_sample_success_indices] = 1.0
        in_sample_tensor[i, in_sample_fail_indices] = -1.0
    return in_sample_tensor, out_sample_tensor, y
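
# The tensors above encode +1 for a correct prediction, -1 for a wrong one,
# and 0 where a tree never saw the sample, so per-tree accuracies fall out of
# two reductions. A minimal sketch over the out-of-bag tensor:
import numpy

def per_tree_oob_accuracy(out_sample_tensor):
    # correct (+1) entries vs. all scored (non-zero) entries, per tree
    correct = (out_sample_tensor == 1.0).sum(axis=1)
    scored = (out_sample_tensor != 0.0).sum(axis=1)
    return correct / numpy.maximum(scored, 1)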

# NOTE: the head of this call was truncated in the source; the values before
# n_redundant are reconstructed from the standard make_classification example
# (4 features, matching clf.apply([[0, 0, 0, 0]]) below).
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

X, y = make_classification(n_samples=1000, n_features=4, n_informative=2,
                           n_redundant=0, random_state=0, shuffle=False)
feature_names = ['x' + str(i) for i in range(X.shape[1])]
data = pd.DataFrame(data=X, columns=feature_names)
data['y'] = y

# fit a random forest
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0,
                             oob_score=True)
clf.fit(data[feature_names], data['y'])

# print some stuff about it
print(clf.feature_importances_)
print()
print(clf.apply([[0, 0, 0, 0]]))

# This can be used as an estimate of the propensity score. It is only produced
# if the argument *oob_score* is passed to the constructor. It can be NaN if
# an observation makes it into all trees.
print(clf.oob_decision_function_)

# We can find out which rows of X were/were not used in a tree using the
# random_state attribute of the tree, like...
print(_generate_sample_indices(clf.estimators_[0].random_state, X.shape[0]))
print(_generate_unsampled_indices(clf.estimators_[0].random_state, X.shape[0]))
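
# Sanity check: for a bootstrap forest, a tree's sampled and unsampled index
# sets are disjoint and together cover every training row (same two-argument
# helpers as above).
sampled = set(_generate_sample_indices(clf.estimators_[0].random_state,
                                       X.shape[0]))
unsampled = set(_generate_unsampled_indices(clf.estimators_[0].random_state,
                                            X.shape[0]))
assert sampled.isdisjoint(unsampled)
assert sampled | unsampled == set(range(X.shape[0]))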

forest = RandomForestRegressor(n_estimators=n_trees, oob_score=True)
oob_indices, oob_leaves_id, OOB_tree_indicator = {}, {}, {}
# fit
forest.fit(X_train, y_train)
forest_oob_score = forest.oob_score_
n_trees, train_size = forest.n_estimators, len(y_train)
indicator, n_nodes_ptr = forest.decision_path(X_train)
node_indicator = {}
sample_index = {}
for t, estimator in enumerate(forest):
    oob_indices[t] = _generate_unsampled_indices(estimator.random_state,
                                                 X_train.shape[0])
    oob_leaves_id[t] = estimator.apply(X_train[oob_indices[t], :])
    sample_index[t] = _generate_sample_indices(estimator.random_state,
                                               X_train.shape[0])
    node_indicator[t] = indicator[:, n_nodes_ptr[t]:n_nodes_ptr[t + 1]]
mean_vals = {}
for t in range(n_trees):
    mean_vals[t] = np.zeros(node_indicator[t].shape[1])
    for node in range(node_indicator[t].shape[1]):
        r, c = node_indicator[t][:, node].nonzero()
        mean_vals[t][node] = np.mean(y_train[sample_index[t]][r])
alpha_list, _, node_score = get_alpha(forest, X_train, y_train, predicttype)
y_pred_oob = np.zeros(len(y_train))
print('Forest size, trees : ' + repr(get_forest_size(forest)) + ',' +
      repr(n_trees))
if (predicttype == 'classify'):