def test_zero_weights():
    """Samples carrying zero weight must not affect the weighted percentile."""
    values = [1, 2, 3, 4, 5]
    weights = [0, 0, 0, 0.1, 0.1]
    for percentile in np.arange(0, 110, 10):
        with_zero_weights = weighted_percentile(values, percentile, weights)
        # Reference: the same call restricted to the samples whose weight
        # is non-zero.
        nonzero_only = weighted_percentile([4, 5], percentile, [0.1, 0.1])
        assert_equal(with_zero_weights, nonzero_only)
def test_percentile_equal_weights():
    """Check weighted percentiles of ten equally weighted random samples."""
    random_state = np.random.RandomState(0)
    samples = random_state.randn(10)
    equal_weights = np.full(10, 0.1)
    ordered = np.sort(samples)

    # With equal weights, the deciles 10, 20, ..., 90 are expected to fall
    # on the midpoints between consecutive sorted samples.
    midpoints = (ordered[1:] + ordered[:-1]) / 2.0
    at_deciles = []
    for q in np.arange(10, 100, 10):
        at_deciles.append(weighted_percentile(samples, q, equal_weights))
    assert_array_almost_equal(midpoints, at_deciles)

    # Percentiles 5, 15, ..., 95 are expected to reproduce the sorted
    # samples themselves.
    at_offsets = []
    for q in np.arange(5, 105, 10):
        at_offsets.append(weighted_percentile(samples, q, equal_weights))
    assert_array_almost_equal(ordered, at_offsets)
def predict(self, X, quantiles=None):
    """
    Predict conditional quantiles (or the mean) of the target for X.

    Parameters
    ----------
    X : pandas.DataFrame or array-like of shape = [n_samples, n_features]
        The input samples. Internally converted with ``check_array`` to
        ``dtype=np.float32`` (sparse input is accepted in CSC format).
        When X is a pandas object, its index is reused as the index of
        the returned frame; otherwise a default index is used.

    quantiles : iterable of float, 'mean' or None, optional
        Quantile levels expressed as fractions (e.g. ``0.9`` for the 90th
        percentile). ``None`` (the default) is treated as ``[0.5]``, i.e.
        the median. The string ``'mean'`` returns the prediction of the
        parent class's ``predict`` instead.

    Returns
    -------
    preds : pandas.DataFrame
        One row per sample. For ``quantiles='mean'`` there is a single
        column named ``'mean'``; otherwise there is one column per
        requested quantile, named like ``'50.0%'``.
    """
    if quantiles is None:
        quantiles = [0.50]

    # Only compare against 'mean' when given a string: ``array == 'mean'``
    # is an elementwise comparison and cannot be used as an ``if`` condition.
    want_mean = isinstance(quantiles, str) and quantiles == 'mean'
    if want_mean:
        column_names = ['mean']
    else:
        quantiles = list(quantiles)
        column_names = [
            str(round(100 * quantile, 1)) + '%' for quantile in quantiles
        ]

    # Capture the index before check_array turns X into a plain array.
    # Non-pandas input has no usable index, so fall back to the default.
    index = X.index if isinstance(X, (pd.DataFrame, pd.Series)) else None

    # apply method requires X to be of dtype np.float32
    X = check_array(X, dtype=np.float32, accept_sparse="csc")

    if want_mean:
        preds = super(MyRandomForestQuantileRegressor, self).predict(X)
        return pd.DataFrame(preds, index=index, columns=column_names)

    # Round instead of truncating: ``int(100 * 0.29)`` is 28 because of
    # floating point noise, which disagrees with the '29.0%' column label.
    # NOTE(review): levels are still reduced to whole percentiles, as in
    # the original; if weighted_percentile accepts a float q, passing
    # ``100 * quantile`` directly would also honour levels such as 0.025.
    percentiles = [int(round(100 * quantile)) for quantile in quantiles]

    sorter = np.argsort(self.y_train_)
    X_leaves = self.apply(X)
    quantile_values = np.zeros((X.shape[0], len(percentiles)))
    for i, x_leaf in enumerate(X_leaves):
        # Mask out every stored weight whose training leaf differs from
        # the leaf this sample landed in, then sum what is left along
        # axis 0.
        # Assumes y_train_leaves_ and y_weights_ are laid out as
        # (n_estimators, n_train_samples) -- TODO confirm against fit().
        mask = self.y_train_leaves_ != np.expand_dims(x_leaf, 1)
        x_weights = ma.masked_array(self.y_weights_, mask)
        weights = x_weights.sum(axis=0)
        for i_q, percentile in enumerate(percentiles):
            quantile_values[i, i_q] = weighted_percentile(
                self.y_train_, percentile, weights, sorter)
    return pd.DataFrame(quantile_values, index=index, columns=column_names)
def test_quantiles():
    """Compare depth-1 quantile predictions with per-side percentiles."""
    # Test with max depth 1.
    for est in estimators:
        est.set_params(max_depth=1)
        est.fit(X_train, y_train)
        tree = est.tree_

        # A depth-1 tree has a single split at the root (node 0).
        root_feature = tree.feature[0]
        root_threshold = tree.threshold[0]
        train_goes_left = X_train[:, root_feature] <= root_threshold
        train_goes_right = X_train[:, root_feature] > root_threshold

        for q in [20, 40, 50, 60, 80, 90]:
            # fixme (carried over from the original; no further detail given)
            left_q = weighted_percentile(y_train[train_goes_left], q)
            right_q = weighted_percentile(y_train[train_goes_right], q)

            data_splits = ((X_train, y_train), (X_test, y_test))
            for curr_X, _ in data_splits:
                # Build the reference by assigning each sample the
                # percentile of the training targets on its side.
                actual_q = np.zeros(curr_X.shape[0])
                goes_left = curr_X[:, root_feature] <= root_threshold
                actual_q[goes_left] = left_q
                goes_right = curr_X[:, root_feature] > root_threshold
                actual_q[goes_right] = right_q

                expected_q = est.predict(curr_X, quantile=q)
                assert_array_almost_equal(expected_q, actual_q)
def test_percentile_toy_data():
    """Check weighted percentiles on a three-point toy sample."""
    x = [1, 2, 3]
    weights = [1, 4, 5]

    # (percentile, expected value) pairs that must match exactly; the
    # first two entries cover the 0th and 100th percentile.
    exact_cases = [
        (0, 1),
        (100, 3),
        (5, 1),
        (30, 2),
        (75, 3),
    ]
    for percentile, expected in exact_cases:
        assert_equal(weighted_percentile(x, percentile, weights), expected)

    # The median is only checked to two decimal places.
    assert_almost_equal(weighted_percentile(x, 50, weights), 2.44, 2)
def predict(self, X, quantiles=None, check_input=False):
    """
    Predict the mean or conditional quantiles of the target for X.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        The input samples. Internally converted with ``check_array`` to
        ``dtype=np.float32`` (sparse input is accepted in CSC format).

    quantiles : iterable of float or None, optional
        Quantile levels expressed as fractions (e.g. ``0.9`` for the 90th
        percentile). By default (``None``) the prediction of the parent
        class's ``predict`` is returned.

    check_input : boolean, (default=False)
        Forwarded to the parent class's ``predict``; only used when
        ``quantiles`` is None.

    Returns
    -------
    y : array
        If ``quantiles`` is None, whatever the parent class's ``predict``
        returns. Otherwise an array of shape = [n_samples, n_quantiles]
        with one column per requested quantile, in the order given.
    """
    # apply method requires X to be of dtype np.float32
    X = check_array(X, dtype=np.float32, accept_sparse="csc")
    if quantiles is None:
        return super(MyDecisionTreeQuantileRegressor, self).predict(
            X, check_input=check_input)

    # Round instead of truncating: ``int(100 * 0.29)`` is 28 because of
    # floating point noise, so the wrong percentile was computed.
    # NOTE(review): levels are still reduced to whole percentiles, as in
    # the original; if weighted_percentile accepts a float q, passing
    # ``100 * quantile`` directly would also honour levels such as 0.025.
    percentiles = [int(round(100 * quantile)) for quantile in quantiles]

    quantile_values = np.zeros((X.shape[0], len(percentiles)))
    X_leaves = self.apply(X)
    for leaf in np.unique(X_leaves):
        # Every sample in the same leaf gets the same values, so select
        # the rows and the leaf's training targets once per leaf.
        in_leaf = X_leaves == leaf
        leaf_targets = self.y_train_[self.y_train_leaves_ == leaf]
        for i_q, percentile in enumerate(percentiles):
            quantile_values[in_leaf, i_q] = weighted_percentile(
                leaf_targets, percentile)
    return quantile_values