Exemplo n.º 1
0
def _get_unsampled_indices(tree, n_samples):
    """
    An interface to get unsampled indices regardless of sklearn version.

    Parameters
    ----------
    tree : fitted estimator exposing ``random_state``
        The tree whose bootstrap draw is replayed.
    n_samples : int
        Number of rows in the training set the tree was fitted on.

    Returns
    -------
    Whatever ``_generate_unsampled_indices`` returns for this tree.
    """
    if LooseVersion(sklearn.__version__) >= LooseVersion("0.22"):
        # Version 0.22 or newer uses 3 arguments.
        try:
            # 0.22 renamed the module to the private ``_forest``; the old
            # ``sklearn.ensemble.forest`` path was only kept as a deprecated
            # shim and no longer exists in later releases.
            from sklearn.ensemble._forest import _get_n_samples_bootstrap
        except ImportError:
            from sklearn.ensemble.forest import _get_n_samples_bootstrap
        n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
        return _generate_unsampled_indices(tree.random_state, n_samples,
                                           n_samples_bootstrap)
    # Version 0.21 or older uses only two arguments.
    return _generate_unsampled_indices(tree.random_state, n_samples)
Exemplo n.º 2
0
def oob_r2(rf_reg, X_train, y_train):
    """Compute the out-of-bag R2 of random forest regressor.

    Each tree predicts only the training rows returned by
    ``_generate_unsampled_indices`` for its ``random_state``; predictions are
    averaged per row and scored against ``y_train`` with ``r2_score``.

    Parameters
    ----------
    rf_reg : fitted forest exposing ``estimators_``
    X_train : 2-D array indexable as ``X_train[rows, :]``
    y_train : 1-D array aligned with the rows of ``X_train``

    Returns
    -------
    The value returned by ``r2_score`` on the rows that were out-of-bag for
    at least one tree. A warning is issued when some rows are dropped.
    """
    n_samples = X_train.shape[0]
    n_preds = np.zeros(n_samples)
    preds_sum = np.zeros(n_samples)

    # Accumulate, per training row, the sum and count of OOB predictions.
    for tree in rf_reg.estimators_:
        unsampled_idxs = _generate_unsampled_indices(tree.random_state,
                                                     n_samples)
        preds_sum[unsampled_idxs] += tree.predict(X_train[unsampled_idxs, :])
        n_preds[unsampled_idxs] += 1

    # Rows that were never out-of-bag cannot be scored: drop them from the
    # targets, the sums AND the counts so the three arrays stay aligned
    # (filtering only two of them made the division below fail).
    has_oob = n_preds != 0
    if not has_oob.all():
        warnings.warn("Some features didn't have OOB samples.")
        y_train = y_train[has_oob]
        preds_sum = preds_sum[has_oob]
        n_preds = n_preds[has_oob]

    avg_preds = preds_sum / n_preds
    return r2_score(y_train, avg_preds)
Exemplo n.º 3
0
    def _set_oob_score(self, X, y):
        """Store the averaged out-of-bag predictions and their score.

        ``y`` is unpacked as ``(event, time)``. Sets ``oob_prediction_`` to
        the per-row mean of the OOB tree predictions and ``oob_score_`` to
        the first element returned by ``concordance_index_censored``.
        """
        X = check_array(X, dtype=DTYPE)

        n_samples = X.shape[0]
        event, time = y

        oob_sum = np.zeros(n_samples)
        oob_count = np.zeros(n_samples)

        for tree in self.estimators_:
            oob_rows = _generate_unsampled_indices(tree.random_state,
                                                   n_samples)
            oob_sum[oob_rows] += tree.predict(X[oob_rows, :],
                                              check_input=False)
            oob_count[oob_rows] += 1

        never_oob = oob_count == 0
        if never_oob.any():
            warnings.warn("Some inputs do not have OOB scores. "
                          "This probably means too few trees were used "
                          "to compute any reliable oob estimates.")
            # Divide by one so rows without any OOB prediction stay at zero.
            oob_count[never_oob] = 1

        oob_sum /= oob_count
        self.oob_prediction_ = oob_sum

        concordance = concordance_index_censored(event, time, oob_sum)
        self.oob_score_ = concordance[0]
Exemplo n.º 4
0
 def _get_unsampled_indices(self, tree, n_samples):
     """
     Version-independent access to a tree's unsampled indices.

     Taken from the rfpimp module (to avoid depending on it) and modified:
     <https://github.com/parrt/random-forest-importances>
     """
     seed = tree.random_state
     if LooseVersion(sklearn.__version__) < LooseVersion("0.22"):
         # Version 0.21 or older: the helper takes only two arguments.
         return _generate_unsampled_indices(seed, n_samples)

     # Version 0.22 or newer: the bootstrap size is a third argument.
     bootstrap_size = _get_n_samples_bootstrap(n_samples, n_samples)
     return _generate_unsampled_indices(seed, n_samples, bootstrap_size)
Exemplo n.º 5
0
def oob_accuracy(rf_clf, X_train, y_train):
    """Out-of-bag accuracy of a random forest classifier.

    Class probabilities from every tree are summed over the rows that tree
    did not sample; the arg-max column is compared directly with
    ``y_train``, so labels are expected to be the column indices
    (``np.bincount`` is used to count the classes).
    """
    n_samples = X_train.shape[0]
    n_classes = len(np.bincount(y_train))
    oob_counts = np.zeros((n_samples))
    proba_sums = np.zeros((n_samples, n_classes))

    for estimator in rf_clf.estimators_:
        oob_rows = _generate_unsampled_indices(estimator.random_state,
                                               n_samples)
        proba_sums[oob_rows, :] += estimator.predict_proba(
            X_train[oob_rows, :])
        oob_counts[oob_rows] += 1

    # Rows that were never out-of-bag are excluded from the score.
    seen = oob_counts != 0
    if not seen.all():
        warnings.warn("Some features didn't have OOB samples.")
        y_train = y_train[seen]
        proba_sums = proba_sums[seen, :]

    winners = np.argmax(proba_sums, axis=1)
    return (y_train == winners).mean()
Exemplo n.º 6
0
    def _getOOBIndices(self):
        '''
        Collect, per tree, the training points left out of its bootstrap
        sample.

        Reads ``self.X`` (training data) and ``self.rf`` (fitted forest with
        ``estimators_``).

        Output:
        dictionary mapping each tree's position in ``self.rf.estimators_``
        to the array returned by ``_generate_unsampled_indices`` for it
        '''
        nSamples = self.X.shape[0]
        return {
            treeIdx: _generate_unsampled_indices(tree.random_state, nSamples)
            for treeIdx, tree in enumerate(self.rf.estimators_)
        }
def rf_accuracy(rf, X, y, type = 'oob', metric = 'accuracy'):
    """Score a forest on test data, training data or out-of-bag rows.

    ``metric`` selects ``accuracy_score`` ('accuracy') or ``neg_mse``
    ('mse'). ``type`` is 'test', 'train' or 'oob'. For 'test', and for
    'train' without bootstrap, the whole forest is scored on ``X``.
    Otherwise each tree is scored on its own rows and the per-tree scores
    are averaged, weighted by the number of rows.

    Raises ValueError for an unknown ``metric`` or ``type``, or when 'oob'
    is requested from a forest fitted without bootstrap.
    """
    if metric not in ('accuracy', 'mse'):
        raise ValueError('metric type not understood')
    scorer = accuracy_score if metric == 'accuracy' else neg_mse

    n_samples, n_features = X.shape
    if type == 'test' or (type == 'train' and not rf.bootstrap):
        return scorer(y, rf.predict(X))

    weighted_total = 0
    rows_scored = 0
    for tree in rf.estimators_:
        if type == 'oob':
            if not rf.bootstrap:
                raise ValueError('Without bootstrap, it is not possible to calculate oob.')
            rows = _generate_unsampled_indices(tree.random_state, n_samples, n_samples)
        elif type == 'train':
            rows = _generate_sample_indices(tree.random_state, n_samples, n_samples)
        else:
            raise ValueError('type is not recognized. (%s)'%(type))
        tree_score = scorer(y[rows,:], tree.predict(X[rows, :]))
        weighted_total += tree_score * len(rows)
        rows_scored += len(rows)
    return weighted_total / rows_scored
Exemplo n.º 8
0
def oob_regression_r2_score(rf, X_train, y_train):
    """
    Out-of-bag (OOB) R^2 for a scikit-learn random forest regressor.

    Accepts pandas or array inputs: a DataFrame / Series is unwrapped to
    its ``.values``. The approach follows scikit-learn's own BSD licensed
    implementation:

    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L702
    """
    if isinstance(X_train, pd.DataFrame):
        X = X_train.values
    else:
        X = X_train
    if isinstance(y_train, pd.Series):
        y = y_train.values
    else:
        y = y_train

    n_samples = len(X)
    oob_sum = np.zeros(n_samples)
    oob_hits = np.zeros(n_samples)
    for tree in rf.estimators_:
        oob_rows = _generate_unsampled_indices(tree.random_state, n_samples)
        oob_sum[oob_rows] += tree.predict(X[oob_rows, :])
        oob_hits[oob_rows] += 1

    never_oob = oob_hits == 0
    if never_oob.any():
        warnings.warn("Too few trees; some variables do not have OOB scores.")
        # Divide by one so those rows keep a prediction of zero.
        oob_hits[never_oob] = 1

    oob_sum /= oob_hits
    return r2_score(y, oob_sum)
def get_tree_oob_score(tree, X_train, y_train):
    """Return the RMSE of ``tree`` on the training rows it did not sample."""
    oob_rows = _generate_unsampled_indices(tree.random_state,
                                           X_train.shape[0])
    residuals = y_train[oob_rows] - tree.predict(X_train[oob_rows])
    return np.sqrt(np.mean(residuals ** 2))
Exemplo n.º 10
0
    def _calculate_ps_prox(self):
        """Compute the out-of-bag proximity matrix and store it.

        Builds, for every tree of ``self._clf``, a boolean column marking
        which rows of ``self._data`` were NOT sampled by that tree, plus the
        leaf each row falls into, and hands both (for all rows and for the
        rows with ``z == 1``) to ``counterfactuals.proximity``. The result is
        stored in ``self._proximity``.
        """

        from counterfactuals import proximity
        from sklearn.ensemble.forest import _generate_unsampled_indices

        n_rows = self._data.shape[0]

        # (2d) boolean array: row i, column t is True when observation i is
        # out-of-bag (unsampled) for tree t. A list is passed because
        # np.column_stack requires a sequence; generators are deprecated and
        # rejected by recent NumPy releases.
        Zin = np.column_stack([
            np.isin(self._data.index,
                    _generate_unsampled_indices(e.random_state, n_rows))
            for e in self._clf.estimators_
        ])

        # (2d) array of leaves for each observation in each tree
        leaves = np.array(self._clf.apply(self._data[self._feature_names]),
                          dtype=np.int32,
                          order='c')

        # Rows flagged z == 1 (the treated group).
        treated = self._data['z'] == 1
        ZT = Zin[treated, :]
        LT = leaves[treated, :]

        ZO = np.ascontiguousarray(Zin)
        LO = leaves

        # proximity matrix (others x treated)
        self._proximity = proximity(len(self._clf.estimators_), LO.shape[0],
                                    LT.shape[0], LO, LT, ZO, ZT)
def oob_regression_r2_score(rf, X_train, y_train):
    """
    Out-of-bag (OOB) R^2 of a scikit-learn random forest regressor.

    ``X_train`` and ``y_train`` must expose ``.values`` (pandas objects).
    The logic mirrors scikit-learn's BSD licensed implementation:

    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L702
    """
    X, y = X_train.values, y_train.values

    n_samples = len(X)
    totals = np.zeros(n_samples)
    counts = np.zeros(n_samples)
    for estimator in rf.estimators_:
        oob = _generate_unsampled_indices(estimator.random_state, n_samples)
        totals[oob] += estimator.predict(X[oob, :])
        counts[oob] += 1

    unseen = counts == 0
    if unseen.any():
        warnings.warn("Too few trees; some variables do not have OOB scores.")
        # Avoid 0/0: rows never out-of-bag keep a prediction of zero.
        counts[unseen] = 1

    totals /= counts
    return r2_score(y, totals)
def oob_classifier_accuracy(rf, X_train, y_train):
    """
    Out-of-bag (OOB) accuracy of a scikit-learn random forest classifier.

    ``X_train`` and ``y_train`` must expose ``.values`` (pandas objects).
    Each tree's class probabilities are summed over the rows it did not
    sample and the arg-max column is mapped to a label via ``rf.classes_``.
    The logic mirrors scikit-learn's BSD licensed implementation:

    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425
    """
    X, y = X_train.values, y_train.values

    n_samples = len(X)
    proba_totals = np.zeros((n_samples, len(np.unique(y))))
    for tree in rf.estimators_:
        oob_rows = _generate_unsampled_indices(tree.random_state, n_samples)
        proba_totals[oob_rows] += tree.predict_proba(X[oob_rows, :])

    winners = np.argmax(proba_totals, axis=1)
    predicted_labels = [rf.classes_[column] for column in winners]
    return np.mean(y == predicted_labels)
Exemplo n.º 13
0
def _get_unsampled_indices(tree, n_samples):
    """
    An interface to get unsampled indices regardless of sklearn version.

    The bootstrap-size helper is looked up in the private ``_forest`` module
    first, then in the legacy ``forest`` module. When neither provides it,
    the two-argument form of ``_generate_unsampled_indices`` is used.
    """
    try:
        from sklearn.ensemble._forest import _get_n_samples_bootstrap
    except ImportError:
        try:
            # Deprecated location, kept for releases that still ship it.
            from sklearn.ensemble.forest import _get_n_samples_bootstrap
        except ImportError:
            # No bootstrap-size helper available: the unsampled-indices
            # function takes only two arguments here.
            return _generate_unsampled_indices(tree.random_state, n_samples)

    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
    return _generate_unsampled_indices(tree.random_state, n_samples,
                                       n_samples_bootstrap)
Exemplo n.º 14
0
def feature_importance(rf, X, y, type = 'oob', normalized = False, balanced = False, demean=False,normal_fX = False):
    """Aggregate per-tree importance scores and their standard error.

    For every tree in ``rf.estimators_`` a set of rows is chosen according
    to ``type``, ``_predict_tree`` is evaluated on those rows, and its third
    return value (``contributions``) is contracted against ``y`` with
    ``np.tensordot``. The per-tree results are averaged over
    ``rf.n_estimators``.

    Parameters
    ----------
    rf : fitted forest exposing ``estimators_``, ``bootstrap`` and
        ``n_estimators``.
    X : 2-D array, unpacked as ``(n_samples, n_features)``.
    y : 2-D array; a ``ValueError`` is raised otherwise.
    type : 'oob' (rows not sampled by the tree; requires ``rf.bootstrap``),
        'test' (all rows) or 'classic' (the tree's bootstrap rows, or all
        rows when the forest was fitted without bootstrap).
    normalized : divide each tree's result by its own sum instead of by the
        number of rows used.
    balanced : for 'oob' / 'test', re-weight rows per leaf (see below).
    demean : subtract the column means of ``y`` first.
    normal_fX : pass each last-axis slice of ``contributions`` through
        ``scale``.

    Returns
    -------
    (out, SE) : two arrays of length ``n_features``: the mean per-tree
        score and the quantity ``sqrt((mean(score**2) - mean(score)**2) /
        n_estimators)``.

    Raises
    ------
    ValueError : ``y`` is not 2-D, ``type`` is unknown, or 'oob' is
        requested without bootstrap.
    """
    n_samples, n_features = X.shape
    if len(y.shape) != 2:
        raise ValueError('y must be 2d array (n_samples, 1) if numerical or (n_samples, n_categories).')
    out = np.zeros((n_features,))
    SE = np.zeros((n_features,))
    if demean:
        # demean y
        y = y - np.mean(y, axis=0)
        
    for tree in rf.estimators_:
        # Pick the rows this tree is evaluated on.
        if type == 'oob':
            if rf.bootstrap:
                indices = _generate_unsampled_indices(tree.random_state, n_samples, n_samples)
            else:
                raise ValueError('Without bootstrap, it is not possible to calculate oob.')
        elif type == 'test':
            indices = np.arange(n_samples)
        elif type == 'classic':
            if rf.bootstrap:
                indices = _generate_sample_indices(tree.random_state, n_samples, n_samples)
            else:
                indices = np.arange(n_samples)
        else:
            raise ValueError('type is not recognized. (%s)'%(type))
        # NOTE(review): _predict_tree is defined elsewhere; only its third
        # return value is used here. Presumably per-row, per-feature
        # contributions; confirm against its definition.
        _, _, contributions = _predict_tree(tree, X[indices,:])
        if balanced and (type == 'oob' or type == 'test'):
            # Weight each evaluated row by
            #   (# bootstrap rows in its leaf) / (# evaluated rows in its leaf),
            # then rescale so the weights average to one.
            base_indices = _generate_sample_indices(tree.random_state, n_samples, n_samples)
            ids = tree.apply(X[indices, :])
            base_ids = tree.apply(X[base_indices, :])
            tmp1, tmp2 = np.unique(ids, return_counts = True)
            weight1 = {key: 1. / value for key, value in zip(tmp1, tmp2)}
            tmp1, tmp2 = np.unique(base_ids, return_counts = True)
            weight2 = {key: value for key, value in zip(tmp1, tmp2)}
            # NOTE(review): weight2[id] raises KeyError if an evaluated row
            # lands in a leaf that no bootstrap row reaches; verify this
            # cannot happen for the trees used here.
            final_weights = np.array([[weight1[id] * weight2[id]] for id in ids])
            final_weights /= np.mean(final_weights)
        else:
            final_weights = 1
        if len(contributions.shape) == 2:
            # Add a trailing axis so the tensordot below always sees 3-D input.
            contributions = contributions[:,:,np.newaxis]
        #print(contributions.shape, y[indices,:].shape)
        if normal_fX:
            for k in range(contributions.shape[-1]):
                contributions[:, :, k] = scale(contributions[:, :, k]) 
        # Contract the (weighted) targets with the contributions over
        # axes 0 and 1 of y and axes 0 and 2 of contributions, leaving one
        # value per entry along contributions' axis 1.
        tmp =  np.tensordot(np.array(y[indices,:]) * final_weights, contributions, axes=([0, 1], [0, 2])) 
        if normalized:
            out +=  tmp / sum(tmp)
        else:
            out += tmp / len(indices)
        # Accumulate the squared per-tree score for the variance estimate.
        if normalized:
            SE += (tmp / sum(tmp)) ** 2
        else:
            SE += (tmp / len(indices)) ** 2
    out /= rf.n_estimators
    SE /= rf.n_estimators
    # sqrt of (mean of squares - square of mean) / number of trees.
    SE = ((SE - out ** 2) / rf.n_estimators) ** .5 
    return out, SE
Exemplo n.º 15
0
def _extract_oob(estimator, x, y, sample_weight=None):
    """Slice out the rows of ``x``, ``y`` (and weights) that are OOB for
    ``estimator``.

    Returns ``(x_oob, y_oob, w_oob)``; ``w_oob`` is ``None`` when no
    ``sample_weight`` is given.
    """
    oob_rows = _generate_unsampled_indices(estimator.random_state,
                                           x.shape[0])

    features, targets = x[oob_rows, :], y[oob_rows]
    if sample_weight is None:
        weights = None
    else:
        weights = sample_weight[oob_rows]
    return features, targets, weights
Exemplo n.º 16
0
        def worker(tree):
            """Return this tree's per-class count contribution for X_eval.

            Uses ``n``, ``X_train``, ``y_train``, ``X_eval``, ``model`` and
            ``self`` from the enclosing scope. The rows of ``X_train`` not
            sampled by ``tree`` are dropped down the tree to estimate a
            class posterior per leaf; each evaluation point then receives
            its leaf's posterior multiplied by a node count.
            """
            # Get indices of estimation set, i.e. those NOT used in for learning trees of the forest.
            estimation_indices = _generate_unsampled_indices(
                tree.random_state, n)

            # Count the occurences of each class in each leaf node, by first extracting the leaves.
            node_counts = tree.tree_.n_node_samples
            leaf_nodes = self._get_leaves(tree)
            unique_leaf_nodes = np.unique(leaf_nodes)
            class_counts_per_leaf = np.zeros(
                (len(unique_leaf_nodes), model.n_classes_))

            def leaf_row(node):
                # Position of ``node`` within the sorted unique leaf ids.
                return np.where(unique_leaf_nodes == node)[0][0]

            # Drop each estimation example down the tree, and record its 'y' value.
            for i in estimation_indices:
                temp_node = tree.apply(X_train[i].reshape((1, -1))).item()
                class_counts_per_leaf[leaf_row(temp_node), y_train[i]] += 1

            # Count the number of data points in each leaf in.
            n_per_leaf = class_counts_per_leaf.sum(axis=1)
            n_per_leaf[n_per_leaf == 0] = 1  # Avoid divide by zero.

            # Posterior probability distributions in each leaf. Each row is length num_classes.
            posterior_per_leaf = np.divide(
                class_counts_per_leaf,
                np.repeat(n_per_leaf.reshape((-1, 1)),
                          model.n_classes_,
                          axis=1))
            posterior_per_leaf = self._finite_sample_correct(
                posterior_per_leaf, n_per_leaf)

            # Route the evaluation set through the tree once and reuse the
            # resulting leaf positions for both lookups below.
            eval_rows = [leaf_row(node) for node in tree.apply(X_eval)]

            # Posterior probability for each element of the evaluation set.
            eval_posteriors = np.array(
                [posterior_per_leaf[row] for row in eval_rows])

            # Number of estimation points in the cell of each eval point.
            # NOTE(review): node_counts is tree_.n_node_samples but is
            # indexed here by position within unique_leaf_nodes rather than
            # by node id; confirm this is intended.
            n_per_eval_leaf = np.asarray(
                [node_counts[row] for row in eval_rows])

            class_count_increment = np.multiply(
                eval_posteriors,
                np.repeat(n_per_eval_leaf.reshape((-1, 1)),
                          model.n_classes_,
                          axis=1))
            return class_count_increment
Exemplo n.º 17
0
    def _map(u, x):
        """Return the first position at which ``u`` equals ``x``."""
        (positions,) = np.where(u == x)[:1]
        return positions[0]

	class_counts = np.zeros((m, model.n_classes_))
	for tree in model:
	    # get out of bag indicies
	    if in_task:
	        prob_indices = _generate_unsampled_indices(tree.random_state, n)
	        # in_bag_idx = _generate_sample_indices(tree.random_state, n) # this is not behaving as i expected
	    else:
	        prob_indices = np.random.choice(range(n), size=int(subsample*n), replace=False)
	    
	    leaf_nodes = get_leaves(tree)
	    unique_leaf_nodes = np.unique(leaf_nodes)
	        
	    # get all node counts
	    node_counts = tree.tree_.n_node_samples
	    # get probs for eval samples
	    posterior_class_counts = np.zeros((len(unique_leaf_nodes), model.n_classes_))

	    for prob_index in prob_indices:
	        temp_node = tree.apply(train[prob_index].reshape(1, -1)).item()
	        posterior_class_counts[np.where(unique_leaf_nodes == temp_node)[0][0], y[prob_index]] += 1
	        
	    # total number of points in a node
	    row_sums = posterior_class_counts.sum(axis=1)
	    
	    # no divide by zero
	    row_sums[row_sums == 0] = 1

	    # posteriors
	    class_probs = (posterior_class_counts / row_sums[:, None])
	    # posteriors with finite sampling correction
	    
	    class_probs = finite_sample_correction(class_probs, row_sums)

	    # posteriors as a list
	    class_probs.tolist()
	    
	    partition_counts = np.asarray([node_counts[np.where(unique_leaf_nodes == x)[0][0]] for x in tree.apply(test)])
	    # get probability for out of bag samples
	    eval_class_probs = [class_probs[np.where(unique_leaf_nodes == x)[0][0]] for x in tree.apply(test)]
	    eval_class_probs = np.array(eval_class_probs)
	    # find total elements for out of bag samples
	    elems = np.multiply(eval_class_probs, partition_counts[:, np.newaxis])
	    # store counts for each x (repeat fhis for each tree)
	    class_counts += elems
	# calculate p(y|X = x) for all x's
	probs = class_counts / class_counts.sum(axis=1, keepdims=True)

	return probs
Exemplo n.º 18
0
    def learn_rf(self, X_train, y_train, t):
        """Fit a bootstrap random forest and collect its out-of-bag rows.

        The forest uses ``self.n_trees`` trees of depth ``self.max_depth``
        and ``t`` as its random state.

        Returns ``(model, unsampled_indices)`` where ``unsampled_indices``
        is the union of the rows left unsampled by the individual trees.
        """
        forest = RandomForestClassifier(n_estimators=self.n_trees,
                                        max_depth=self.max_depth,
                                        bootstrap=True,
                                        random_state=t,
                                        verbose=0)
        model = forest.fit(X_train, y_train)

        n_samples = X_train.shape[0]

        def oob_rows(estimator):
            # Rows the given tree did not draw in its bootstrap sample.
            return _generate_unsampled_indices(
                estimator.random_state, n_samples,
                _get_n_samples_bootstrap(n_samples, None))

        trees = model.estimators_
        unsampled_indices = oob_rows(trees[0])
        for estimator in trees[1:]:
            merged = np.concatenate((unsampled_indices, oob_rows(estimator)), 0)
            unsampled_indices = np.unique(merged)

        return model, unsampled_indices
Exemplo n.º 19
0
def computeError(tree, n_samples,X, y, typ="MSE"):
    """Score ``tree`` on its out-of-bag rows.

    ``X`` is indexed with ``.iloc`` and ``y`` through ``.values``. With
    ``typ == "MSE"`` the error is the mean absolute difference between
    targets and predictions (despite the name); any other ``typ`` uses
    ``r2_score``.

    Returns ``[error, 1 - error]``, both rounded to two decimals.
    """
    oob_rows = _generate_unsampled_indices(tree.random_state, n_samples,n_samples)
    # Features and targets of the rows this tree did not sample.
    oob_features = X.iloc[oob_rows,:]
    oob_targets = y.values[oob_rows]
    predicted = tree.predict(oob_features)
    if typ != "MSE":
        error = r2_score(oob_targets, predicted)
    else:
        error = sum(abs(oob_targets-predicted))/len(predicted)

    # [OOB_Err, OOB_Acc]
    return [round(error,2), round(1-error,2)]
Exemplo n.º 20
0
    def _set_oob_score(self, X, y):
        """Compute the out-of-bag decision function and accuracy.

        Sets ``oob_decision_function_`` (one array, or a list of arrays when
        there are several outputs) and ``oob_score_`` (accuracy averaged
        over the outputs). ``X`` is indexed with ``.iloc``; ``y`` is
        indexed as ``y[:, k]``.
        """
        validate_X_y(X, y)
        check_X_is_univariate(X)

        n_classes_ = self.n_classes_
        n_samples = y.shape[0]

        # One (n_samples, n_classes) accumulator per output.
        vote_sums = [
            np.zeros((n_samples, n_classes_[k]))
            for k in range(self.n_outputs_)
        ]

        n_samples_bootstrap = _get_n_samples_bootstrap(n_samples,
                                                       self.max_samples)

        for estimator in self.estimators_:
            oob_rows = _generate_unsampled_indices(
                estimator.random_state, n_samples, n_samples_bootstrap)
            proba = estimator.predict_proba(X.iloc[oob_rows, :])

            # Single-output estimators return one array; wrap it so both
            # cases can be handled per output.
            per_output = [proba] if self.n_outputs_ == 1 else proba

            for k in range(self.n_outputs_):
                vote_sums[k][oob_rows, :] += per_output[k]

        decisions = []
        accuracy_total = 0.0
        for k in range(self.n_outputs_):
            votes = vote_sums[k]
            row_totals = votes.sum(axis=1)
            if (row_totals == 0).any():
                warn("Some inputs do not have OOB scores. "
                     "This probably means too few trees were used "
                     "to compute any reliable oob estimates.")

            decisions.append(votes / row_totals[:, np.newaxis])
            hits = y[:, k] == np.argmax(votes, axis=1)
            accuracy_total += np.mean(hits, axis=0)

        if self.n_outputs_ == 1:
            self.oob_decision_function_ = decisions[0]
        else:
            self.oob_decision_function_ = decisions

        self.oob_score_ = accuracy_total / self.n_outputs_
Exemplo n.º 21
0
def oob_classifier_accuracy(rf, X_train, y_train):
    """Out-of-bag accuracy of a fitted random forest classifier.

    ``X_train`` / ``y_train`` must expose ``.values``. Probabilities are
    summed per row over the trees that did not sample it; the winning
    column is translated to a label with ``rf.classes_``.
    """
    features = X_train.values
    labels = y_train.values

    row_count = len(features)
    class_count = len(np.unique(labels))
    summed_proba = np.zeros((row_count, class_count))
    for estimator in rf.estimators_:
        held_out = _generate_unsampled_indices(
            estimator.random_state, row_count)
        summed_proba[held_out] += estimator.predict_proba(
            features[held_out, :])

    best_columns = np.argmax(summed_proba, axis=1)
    guessed = [rf.classes_[col] for col in best_columns]

    return np.mean(labels == guessed)
def cat_rf_entropy_estimate(X, y, n_estimators = 200, max_samples = .32, bootstrap = True, depth = 30, min_samples_leaf = 1):
    """Estimate a mean entropy from a bagged ensemble of decision trees.

    Fits a ``BaggingClassifier`` of ``DecisionTreeClassifier`` on ``(X, y)``.
    For each tree, its unsampled rows are shuffled and split in half: the
    first half estimates class posteriors per tree node, the second half is
    routed through the tree and accumulates posterior * node-size counts.
    The accumulated counts are normalised per row into probabilities and
    the mean of ``-sum(p * log p)`` over rows is returned.

    Parameters mirror the ensemble / tree constructor arguments they are
    forwarded to (``depth`` is passed as ``max_depth``).

    NOTE(review): the result is not deterministic; ``np.random.shuffle``
    and the unseeded ensemble are used.
    """
    model = BaggingClassifier(DecisionTreeClassifier(max_depth = depth, min_samples_leaf = min_samples_leaf), 
                              n_estimators = n_estimators, 
                              max_samples= max_samples, 
                              bootstrap = bootstrap)
    model.fit(X, y)
    class_counts = np.zeros((X.shape[0], model.n_classes_))
    for tree in model:
        # get out of bag indices
        # NOTE(review): the helper is called with len(X) only, while the
        # ensemble is fitted with max_samples; confirm these indices match
        # the rows bagging actually left out.
        unsampled_indices = _generate_unsampled_indices(tree.random_state, len(X))
        
        # Randomly split the unsampled rows into an estimation half and an
        # evaluation half.
        total_unsampled = len(unsampled_indices)
        np.random.shuffle(unsampled_indices)
        prob_indices, eval_indices = unsampled_indices[:total_unsampled//2], unsampled_indices[total_unsampled//2:]
        # get all node counts
        node_counts = tree.tree_.n_node_samples
        # get probs for eval samples
        posterior_class_counts = np.zeros((len(node_counts), model.n_classes_))
        for prob_index in prob_indices:
            # NOTE(review): X[prob_index].item() only works when the row
            # holds a single element, and passes a scalar to tree.apply;
            # presumably X is (n_samples, 1), TODO confirm.
            posterior_class_counts[tree.apply(X[prob_index].item()).item(), y[prob_index]] += 1
        row_sums = posterior_class_counts.sum(axis=1)
        row_sums[row_sums == 0] = 1  # avoid dividing by zero for empty nodes
        class_probs = (posterior_class_counts/row_sums[:, None])
        # Finite-sample correction: move exact 0 / 1 probabilities away from
        # the boundary by 1 / (2 * row count).
        where_0 = np.argwhere(class_probs == 0)
        for elem in where_0:
            class_probs[elem[0], elem[1]] = 1/(2*row_sums[elem[0], None])
        where_1 = np.argwhere(class_probs == 1)
        for elem in where_1:
            class_probs[elem[0], elem[1]] = 1 - 1/(2*row_sums[elem[0], None])
        # NOTE(review): the result of tolist() is discarded, so this line
        # has no effect.
        class_probs.tolist()
        partition_counts = np.asarray([node_counts[x] for x in tree.apply(X[eval_indices])])
        # get probability for out of bag samples
        eval_class_probs = [class_probs[x] for x in tree.apply(X[eval_indices])]
        eval_class_probs = np.array(eval_class_probs)
        # find total elements for out of bag samples
        elems = np.multiply(eval_class_probs, partition_counts[:, np.newaxis])
        # store counts for each x (repeat this for each tree)
        class_counts[eval_indices] += elems
    # calculate p(y|X = x) for all x's
    probs = class_counts/class_counts.sum(axis = 1, keepdims = True)
    entropies = -np.sum(np.log(probs)*probs, axis = 1)
    # convert nan to 0
    entropies = np.nan_to_num(entropies)
    return np.mean(entropies)
Exemplo n.º 23
0
def getOOBErrorTree(model, X, y):
    """Compute a per-tree out-of-bag error and accuracy.

    For each tree in ``model.estimators_`` the error is the mean absolute
    difference between the targets and the tree's predictions on the rows
    it did not sample; the accuracy is ``1 - error``.

    Parameters
    ----------
    model : fitted ensemble exposing ``estimators_``. Its ``n_jobs``
        attribute is set to -1 as a side effect.
    X : object with ``.shape`` and ``.iloc`` row indexing.
    y : object exposing ``.values``.

    Returns
    -------
    (OOB_Err, OOB_Acc) : two dicts keyed by the 1-based tree position, with
        values rounded to two decimals.
    """
    n_samples = X.shape[0]
    OOB_Err, OOB_Acc = {}, {}
    model.n_jobs = -1
    # Here at each iteration we obtain out of bag samples for every tree.
    for IdT, tree in enumerate(model.estimators_, start=1):
        unsampled_indices = _generate_unsampled_indices(tree.random_state, n_samples, n_samples)
        # get info for dataset and classes for OOB indices
        OOB_DSet = X.iloc[unsampled_indices, :]
        OOB_Y = y.values[unsampled_indices]
        # predict on the rows this tree did not sample
        predic = tree.predict(OOB_DSet)
        error = sum(abs(OOB_Y - predic)) / len(predic)
        OOB_Err[IdT] = round(error, 2)
        OOB_Acc[IdT] = round(1 - error, 2)

    return OOB_Err, OOB_Acc
Exemplo n.º 24
0
def _oob_predictions_and_indices(estimators, data):
    """Computes each estimator's out-of-bag predictions and indices.

    Args:
        estimators (List[sklearn.tree.DecisionTreeClassifier]): Estimators of the ensemble model.
        data (array-like, shape=(n_samples, n_features)): The training input samples that were used to fit the model.

    Returns:
        List[Tuple[np.array, np.array]]: One ``(predictions, indices)`` tuple per estimator, where
            ``predictions`` has length n_samples and is zero everywhere except at ``indices``.
    """
    import numpy as np
    from sklearn.ensemble.forest import _generate_unsampled_indices

    n_samples = data.shape[0]

    def oob_result(estimator):
        # Predict only the rows this estimator did not sample.
        oob_rows = _generate_unsampled_indices(estimator.random_state, n_samples)
        padded = np.zeros(n_samples)
        padded[oob_rows] = estimator.predict(data[oob_rows, :])
        return padded, oob_rows

    return [oob_result(estimator) for estimator in estimators]
Exemplo n.º 25
0
    def check_oob(self, x, y):
        """Record, per tree, where it predicts in-bag and OOB rows correctly.

        Returns ``(in_sample_tensor, out_sample_tensor, y)``. Both tensors
        have one row per estimator of ``self.dt_classifier`` and one column
        per row of ``x``: 1.0 where the (integer-cast) prediction equals the
        target, -1.0 where it differs, 0.0 where the row does not belong to
        that set for that tree.
        """
        n_samples = y.shape[0]
        estimators = self.dt_classifier.estimators_
        tensor_shape = (len(estimators), x.shape[0])
        in_sample_tensor = numpy.zeros(shape=tensor_shape)
        out_sample_tensor = numpy.zeros(shape=tensor_shape)

        for i, estimator in enumerate(estimators):
            oob_rows = _generate_unsampled_indices(
                estimator.random_state, n_samples)
            bag_rows = _generate_sample_indices(
                estimator.random_state, n_samples)

            # The two index sets must not share any row.
            assert len(set(oob_rows) & set(bag_rows)) == 0

            oob_predicted = estimator.predict(x[oob_rows, :]).astype(int)
            oob_actual = y[oob_rows]
            bag_predicted = estimator.predict(x[bag_rows, :]).astype(int)
            bag_actual = y[bag_rows]

            groups = (
                (out_sample_tensor, oob_rows, oob_predicted, oob_actual),
                (in_sample_tensor, bag_rows, bag_predicted, bag_actual),
            )
            for tensor, rows, predicted, actual in groups:
                tensor[i, rows[numpy.where(predicted == actual)]] = 1.0
                tensor[i, rows[numpy.where(predicted != actual)]] = -1.0

        return in_sample_tensor, out_sample_tensor, y
Exemplo n.º 26
0
def oob_classifier_accuracy(rf, X_train, y_train):
    """
    Adjusted...
    Compute out-of-bag (OOB) accuracy for a scikit-learn random forest
    classifier. We learned the guts of scikit's RF from the BSD licensed
    code:
    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425

    ``X_train`` / ``y_train`` may be pandas objects (their ``.values`` are
    used) or anything else providing ``.copy()``, e.g. NumPy arrays.

    Returns the fraction of rows whose OOB arg-max label equals ``y``.
    """
    # Only a missing ``.values`` attribute triggers the fallback; any other
    # error (including KeyboardInterrupt) is no longer swallowed.
    try:
        X = X_train.values
    except AttributeError:
        X = X_train.copy()
    try:
        y = y_train.values
    except AttributeError:
        y = y_train.copy()

    n_samples = len(X)
    n_classes = len(np.unique(y))
    # preallocation
    predictions = np.zeros((n_samples, n_classes))
    for tree in rf.estimators_:  # for each decision tree in the random forest
        # Rows this tree did not sample during fitting.
        unsampled_indices = _generate_unsampled_indices(
            tree.random_state, n_samples)
        tree_preds = tree.predict_proba(X[unsampled_indices, :])
        predictions[unsampled_indices] += tree_preds

    # Pick the class with the largest summed probability per row ...
    predicted_class_indexes = np.argmax(predictions, axis=1)
    # ... and translate the column index into the actual class label.
    predicted_classes = [rf.classes_[i] for i in predicted_class_indexes]

    oob_score = sum(y == predicted_classes) / float(len(y))
    return oob_score
Exemplo n.º 27
0
def oob_classifier_accuracy(rf, X_train, y_train):
    """
    Out-of-bag (OOB) accuracy for a scikit-learn random forest classifier.

    Requires pandas inputs (``.values`` is read from both). The approach
    follows scikit-learn's BSD licensed implementation:

    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L425
    """
    X = X_train.values
    y = y_train.values

    n_rows = len(X)
    votes = np.zeros((n_rows, len(np.unique(y))))
    for member in rf.estimators_:
        left_out = _generate_unsampled_indices(member.random_state, n_rows)
        member_proba = member.predict_proba(X[left_out, :])
        votes[left_out] += member_proba

    # Translate the winning column of every row into its class label.
    chosen = []
    for column in np.argmax(votes, axis=1):
        chosen.append(rf.classes_[column])

    return np.mean(y == chosen)
Exemplo n.º 28
0
def calculate_cond_pair_vi(model,
                           X,
                           y,
                           sample_weight=None,
                           sampling_weight=None):
    """Computes pairwise permutation VI score for given model, X and y.

    For every estimator in ``model.estimators_`` the unsampled (out-of-bag)
    rows are selected and, for each feature pair ``(i, j)`` with ``j >= i``,
    three mean squared errors are measured on those rows:

    * ``mse_i``    -- only feature ``i`` permuted,
    * ``mse_j``    -- only feature ``j`` permuted,
    * ``mse_both`` -- features ``i`` and ``j`` permuted.

    The pair's score for that estimator is
    ``max(0, min(mse_both - mse_i, mse_both - mse_j))``.

    Parameters
    ----------
    model : fitted ensemble exposing ``estimators_``.
    X : array-like, validated with ``check_array``.
    y : array supporting ``.shape`` and integer-array indexing.
    sample_weight : array or None
        Per-sample weights forwarded to ``mean_squared_error``.
    sampling_weight : object or None
        Forwarded unchanged to ``_generate_unsampled_indices``.

    Returns
    -------
    score : ndarray of shape (n_features, n_features), dtype float32
        Mean of the per-estimator scores.  Only entries with ``j >= i`` are
        written; the remaining entries stay zero.
    """
    X = check_array(X, dtype=DTYPE, accept_sparse='csr')

    n_samples = y.shape[0]
    n_features = X.shape[1]

    vi = np.zeros((len(model.estimators_), n_features, n_features),
                  dtype=np.float32)

    for t, estimator in enumerate(model.estimators_):
        # Extract oob features and response values.
        unsampled_indices = _generate_unsampled_indices(
            estimator.random_state, n_samples, sampling_weight)

        # Integer-array indexing yields a copy, so the in-place shuffles
        # below never touch the caller's X.
        # NOTE(review): check_array above admits CSR input, but np.copy and
        # the in-place column shuffles look dense-only -- confirm.
        X_unsampled = X[unsampled_indices, :]
        y_unsampled = y[unsampled_indices]

        if sample_weight is None:
            weight_unsampled = None
        else:
            weight_unsampled = sample_weight[unsampled_indices]

        # Second matrix in which only feature j is ever permuted.
        X_copy = np.copy(X_unsampled)

        for i in range(n_features):
            # Keep the true values of column i, then permute it in place.
            i_orig = np.array(X_unsampled[:, i])
            np.random.shuffle(X_unsampled[:, i])

            # MSE of permuted i.
            y_perm_i = estimator.predict(X_unsampled, check_input=False)
            mse_i = mean_squared_error(y_unsampled,
                                       y_perm_i,
                                       sample_weight=weight_unsampled)

            for j in range(i, n_features):
                # Snapshot column j as it currently is in X_unsampled.  For
                # j == i this is the *permuted* column i, which must remain
                # in X_unsampled until the end of the outer iteration.
                j_orig = np.array(X_unsampled[:, j])
                np.random.shuffle(X_unsampled[:, j])

                X_copy[:, j] = X_unsampled[:, j]

                # MSE of permuted j.
                y_perm_j = estimator.predict(X_copy, check_input=False)
                mse_j = mean_squared_error(y_unsampled,
                                           y_perm_j,
                                           sample_weight=weight_unsampled)

                # MSE of permuted i and j.
                y_perm_both = estimator.predict(X_unsampled, check_input=False)
                mse_both = mean_squared_error(y_unsampled,
                                              y_perm_both,
                                              sample_weight=weight_unsampled)

                # Restore X_unsampled to "only i permuted".
                X_unsampled[:, j] = j_orig
                # Restore X_copy to the *unpermuted* values.  When j == i the
                # snapshot holds permuted data, so use i_orig instead;
                # otherwise X_copy would keep a stale permuted column i and
                # corrupt every later "only j permuted" measurement.
                X_copy[:, j] = i_orig if j == i else j_orig

                # Store difference for pair (i, j) in tree t.
                cond_vi = min((mse_both - mse_i), (mse_both - mse_j))
                vi[t, i, j] = max(0, cond_vi)

            X_unsampled[:, i] = i_orig

    # Calculate overall VI score.
    score = np.mean(vi, axis=0)

    return score
Exemplo n.º 29
0
def calculate_perm_vi(model,
                      X,
                      y,
                      sample_weight=None,
                      sampling_weight=None,
                      normalize=False):
    """Computes permutation VI score for given model, X and y.

    For every estimator in ``model.estimators_`` the unsampled (out-of-bag)
    rows are selected, each feature column is permuted in turn, and the
    increase in mean squared error over the unpermuted baseline (clipped at
    zero) is recorded.  The result is the mean of those increases over all
    estimators.

    Parameters
    ----------
    model : fitted ensemble exposing ``estimators_``.
    X : array-like, validated with ``check_array``.
    y : array supporting ``.shape`` and integer-array indexing.
    sample_weight : array or None
        Per-sample weights forwarded to ``mean_squared_error`` for both the
        baseline and the permuted error.
    sampling_weight : object or None
        Forwarded unchanged to ``_generate_unsampled_indices``.
    normalize : bool
        If true, scale the scores to sum to one.  Scores are left untouched
        when their sum is not positive, to avoid dividing by zero.

    Returns
    -------
    score : ndarray of shape (n_features,), dtype float32
    """
    X = check_array(X, dtype=DTYPE, accept_sparse='csr')

    n_samples = y.shape[0]
    n_features = X.shape[1]

    vi = np.zeros((len(model.estimators_), n_features), dtype=np.float32)

    for t, estimator in enumerate(model.estimators_):
        # Extract oob features and response values.
        unsampled_indices = _generate_unsampled_indices(
            estimator.random_state, n_samples, sampling_weight)

        # Integer-array indexing yields a copy, so the in-place shuffles
        # below never touch the caller's X.
        X_unsampled = X[unsampled_indices, :]
        y_unsampled = y[unsampled_indices]

        if sample_weight is None:
            weight_unsampled = None
        else:
            weight_unsampled = sample_weight[unsampled_indices]

        # Baseline MSE.  It must use the same weights as the permuted MSE
        # below, otherwise the difference mixes weighted and unweighted
        # errors.
        y_estimator = estimator.predict(X_unsampled, check_input=False)
        mse = mean_squared_error(y_unsampled,
                                 y_estimator,
                                 sample_weight=weight_unsampled)

        # Permute variable in X.
        for i in range(n_features):
            # Copy and shuffle feature values.
            f_orig = np.array(X_unsampled[:, i])
            np.random.shuffle(X_unsampled[:, i])

            # Calculate permuted MSE.
            y_estimator_perm = estimator.predict(X_unsampled,
                                                 check_input=False)
            mse_perm = mean_squared_error(y_unsampled,
                                          y_estimator_perm,
                                          sample_weight=weight_unsampled)

            # Restore unpermuted feature values.
            X_unsampled[:, i] = f_orig

            # Store difference for feature i in tree t.
            vi[t, i] = max(0, mse_perm - mse)

    # Calculate overall VI score.
    score = np.mean(vi, axis=0)

    if normalize:
        total = np.sum(score)
        # All-zero importances would otherwise turn into NaN.
        if total > 0:
            score /= total

    return score
Exemplo n.º 30
0
 def _estimate_posteriors(self,
                          test,
                          representation=0,
                          decider=0,
                          subsample=1,
                          acorn=None):
     r"""
     An internal function to estimate the posteriors.
     Input
     test: array-like; test observation(s), one per row
     representation: int; index into self.models_ selecting the collection
         of trees used to partition the data
     decider: int; index into self.X_ and self.y_ selecting the training
         data and labels used to fill the leaves
     subsample: float in (0, 1]; when representation != decider, the
         proportion of training rows drawn (without replacement, per
         tree) to estimate posteriors
     acorn: int or None; if not None, passed to np.random.seed
     Return
     probs: numpy array; probs[i, k] is the probability of observation i
         being class k
     Usage
     predict(..)

     When representation == decider the rows used for each tree are that
     tree's unsampled (out-of-bag) rows instead of a random subsample.
     Labels are used directly as column indices, so they are assumed to
     be integers in range(number of unique labels).
     """
     if acorn is not None:
         # np.random.seed returns None, so there is nothing worth keeping.
         np.random.seed(acorn)
     in_task = representation == decider
     train = self.X_[decider]
     y = self.y_[decider]
     model = self.models_[representation]
     n = train.shape[0]
     m = len(test)
     size = len(np.unique(y))
     class_counts = np.zeros((m, size))
     for tree in model:
         # get out of bag indices
         if in_task:
             prob_indices = _generate_unsampled_indices(
                 tree.random_state, n)
         else:
             prob_indices = np.random.choice(range(n),
                                             size=int(subsample * n),
                                             replace=False)
         leaf_nodes = self._get_leaves(tree)
         unique_leaf_nodes = np.unique(leaf_nodes)
         # get all node counts
         node_counts = tree.tree_.n_node_samples
         # per-leaf label counts for the selected training rows; row r
         # corresponds to unique_leaf_nodes[r]
         posterior_class_counts = np.zeros((len(unique_leaf_nodes), size))
         for prob_index in prob_indices:
             temp_node = tree.apply(train[prob_index].reshape(1, -1)).item()
             leaf_row = np.where(unique_leaf_nodes == temp_node)[0][0]
             posterior_class_counts[leaf_row, y[prob_index]] += 1
         # total number of points in a node
         row_sums = posterior_class_counts.sum(axis=1)
         # no divide by zero
         row_sums[row_sums == 0] = 1
         # posteriors
         class_probs = posterior_class_counts / row_sums[:, None]
         # posteriors with finite sampling correction
         class_probs = self._finite_sample_correction(class_probs, row_sums)
         # Row position (within unique_leaf_nodes) of the leaf each test
         # observation falls into; computed once and reused below.
         test_leaf_rows = [
             np.where(unique_leaf_nodes == x)[0][0]
             for x in tree.apply(test)
         ]
         # NOTE(review): node_counts comes from tree.tree_.n_node_samples
         # but is indexed by the row position within unique_leaf_nodes,
         # not by the leaf id returned from tree.apply -- confirm this is
         # intended.
         partition_counts = np.asarray(
             [node_counts[row] for row in test_leaf_rows])
         # get probability for each test observation's leaf
         eval_class_probs = np.array(
             [class_probs[row] for row in test_leaf_rows])
         # scale the posteriors by the partition counts
         elems = np.multiply(eval_class_probs,
                             partition_counts[:, np.newaxis])
         # store counts for each x (repeat this for each tree)
         class_counts += elems
     # calculate p(y|X = x) for all x's
     probs = class_counts / class_counts.sum(axis=1, keepdims=True)
     return probs
Exemplo n.º 31
0
def get_oob_indices(forest, X_train):
    """Map each estimator's position in ``forest`` to its unsampled rows.

    Iterates ``forest`` in order and, for every member, calls
    ``_generate_unsampled_indices`` with the member's ``random_state`` and
    the number of rows in ``X_train``.  Returns a dict keyed by the
    member's 0-based position.
    """
    return {
        position: _generate_unsampled_indices(member.random_state,
                                              X_train.shape[0])
        for position, member in enumerate(forest)
    }
Exemplo n.º 32
0
                           n_redundant=0,
                           random_state=0,
                           shuffle=False)

# Wrap the feature matrix in a DataFrame with columns x0, x1, ... plus the
# target column 'y'.
feature_names = ['x' + str(i) for i in range(X.shape[1])]
data = pd.DataFrame(data=X, columns=feature_names)
data['y'] = y

# fit a random forest
clf = RandomForestClassifier(n_estimators=100,
                             max_depth=2,
                             random_state=0,
                             oob_score=True)

clf.fit(data[feature_names], data['y'])

# print some stuff about it
print(clf.feature_importances_)
# Blank separator line.  A bare `print` is a no-op expression on Python 3,
# while print("") emits an empty line on both Python 2 and Python 3.
print("")
print(clf.apply([[0, 0, 0, 0]]))

# this can be used as estimate of the propensity score. It is only produced if
# the argument *oob_score* is passed to the constructor. It is possible that it
# is NaN if an observation makes it into all trees
print(clf.oob_decision_function_)

# We can find out which rows of X were/were not used in a tree using the random
# state attribute of the tree like...
print(_generate_sample_indices(clf.estimators_[0].random_state, X.shape[0]))
print(_generate_unsampled_indices(clf.estimators_[0].random_state, X.shape[0]))