def __init__(
    self,
    template_splitter,
    split_criterion,
    pred_model,
    drift_detector,
):
    """Init: store the node's prediction model and drift detector and set
    up the running statistics.

    Parameters
    ----------
    template_splitter
        Forwarded to the parent node constructor.
    split_criterion
        Forwarded to the parent node constructor.
    pred_model
        Model used to produce this node's predictions.
    drift_detector
        Detector monitoring this node for concept drift.
    """
    super().__init__(
        template_splitter=template_splitter,
        split_criterion=split_criterion,
    )
    self.pred_model = pred_model
    self.drift_detector = drift_detector

    # Running variance of the target values seen at this node.
    self._target_stats = stats.Var()
    # Per-feature running variance. `stats.Var` is a valid default factory
    # on its own; the original wrapped it in a no-op `functools.partial`.
    self._feat_stats = collections.defaultdict(stats.Var)
def _eval_numerical_splits(
    self, feature_idx, candidate, sgt
) -> typing.Tuple[BranchFactory, bool]:
    """Scan the quantizer bins of a numerical feature and record the best
    binary split found in `candidate`.

    Returns the (possibly updated) candidate together with a flag telling
    the caller whether the feature had too few bins to be split at all.
    """
    quantizer = self._split_stats[feature_idx]
    # Keep the stored split parameters in sync with the quantizer's state.
    self.split_params[feature_idx].update(quantizer._get_params())

    if len(quantizer) == 1:
        # A single bin cannot be partitioned into two branches.
        return candidate, True

    candidate.merit.loss_mean = math.inf
    candidate.merit.delta_pred = {}

    # Accumulators for the left branch: gradient/hessian statistics and
    # the running variance of the delta-loss estimates.
    ghs_left = GradHessStats()
    dlms_left = stats.Var()

    for threshold, bin_ghs in quantizer:
        ghs_left += bin_ghs
        pred_left = self.delta_prediction(ghs_left.mean, sgt.lambda_value)
        dlms_left += ghs_left.delta_loss_mean_var(pred_left)

        # The right branch statistics are everything not yet on the left.
        ghs_right = self._update_stats - ghs_left
        pred_right = self.delta_prediction(ghs_right.mean, sgt.lambda_value)
        dlms_right = ghs_right.delta_loss_mean_var(pred_right)

        combined = dlms_left + dlms_right
        mean_loss = combined.mean.get()

        if mean_loss < candidate.merit.loss_mean:
            # The gamma term penalizes the added complexity of splitting.
            candidate.merit.loss_mean = mean_loss + 2.0 * sgt.gamma / self.total_weight
            candidate.merit.loss_var = combined.get()
            candidate.merit.delta_pred[0] = pred_left
            candidate.merit.delta_pred[1] = pred_right
            candidate.split_info = threshold

    return candidate, False
def __init__(
    self,
    index_original: int,
    model: BaseTreeRegressor,
    created_on: int,
    drift_detector: base.DriftDetector,
    warning_detector: base.DriftDetector,
    is_background_learner,
    metric: metrics.RegressionMetric,
):
    """Init: delegate the shared ensemble-member setup to the parent and
    attach a running-variance statistic used by the drift tracking."""
    super().__init__(
        index_original=index_original,
        model=model,
        created_on=created_on,
        drift_detector=drift_detector,
        warning_detector=warning_detector,
        is_background_learner=is_background_learner,
        metric=metric,
    )
    # Running variance statistic; used to track drift.
    self._var = stats.Var()
def reset(self, n_samples_seen):
    """Reset this member, including the drift-tracking variance."""
    super().reset(n_samples_seen)
    # Start the drift detector's statistic from scratch as well.
    self._var = stats.Var()
def __init__(self, seed=None):
    """Init: forward the seed to the parent and set up running moments."""
    super().__init__(seed=seed)
    # Incremental second and first moments of the observed values.
    self.variance = stats.Var()
    self.mean = stats.Mean()
    self.seed = seed
def find_best_split(self, sgt):
    """Evaluate every tracked feature and return the split candidate with
    the smallest mean loss.

    The "null split" -- keeping the leaf as-is and only refreshing its
    prediction with the accumulated gradient information -- serves as the
    baseline that every real split has to beat.

    Parameters
    ----------
    sgt
        The host stochastic gradient tree; supplies the regularization
        hyperparameters (`lambda_value`, `gamma`) and the nominal
        attribute bookkeeping.
    """
    best = SGTSplit()

    # Null split: update the prediction using the new gradient information
    best.delta_pred = delta_prediction(self._update_stats.mean(), sgt.lambda_value)
    dlms = self._update_stats.delta_loss_mean_var(best.delta_pred)
    best.loss_mean = dlms.mean.get()
    best.loss_var = dlms.get()

    for feature_idx in self._split_stats:
        candidate = SGTSplit()
        candidate.feature_idx = feature_idx

        if sgt.nominal_attributes is not None and feature_idx in sgt.nominal_attributes:
            # Nominal attribute has been already used in a previous split
            if feature_idx in sgt._split_features:
                continue

            candidate.is_nominal = True
            candidate.delta_pred = {}
            all_dlms = stats.Var()

            # One branch (and one delta prediction) per observed category.
            cat_collection = self._split_stats[feature_idx]
            for category in cat_collection:
                dp = delta_prediction(cat_collection[category].mean(), sgt.lambda_value)
                dlms = cat_collection[category].delta_loss_mean_var(dp)

                candidate.delta_pred[category] = dp
                all_dlms += dlms

            # The gamma penalty scales with the number of resulting branches
            # (compare with the 2 * gamma term for binary numerical splits).
            candidate.loss_mean = (
                all_dlms.mean.get() + len(cat_collection) * sgt.gamma / self.total_weight)
            candidate.loss_var = all_dlms.get()
        else:  # Numerical features
            quantizer = self._split_stats[feature_idx]
            # Offset used to place thresholds midway between adjacent bins.
            half_radius = quantizer.radius / 2.
            n_bins = len(quantizer)

            if n_bins == 1:
                # Insufficient number of bins to perform splits
                continue

            candidate.loss_mean = math.inf
            candidate.delta_pred = {}

            # Auxiliary gradient and hessian statistics
            left_ghs = GradHessStats()
            left_dlms = stats.Var()
            for i, ghs in enumerate(quantizer):
                # Grow the left branch one bin at a time; the right branch
                # is whatever remains of the node's accumulated statistics.
                left_ghs += ghs
                left_delta_pred = delta_prediction(left_ghs.mean(), sgt.lambda_value)
                left_dlms += left_ghs.delta_loss_mean_var(left_delta_pred)

                right_ghs = self._update_stats - left_ghs
                right_delta_pred = delta_prediction(right_ghs.mean(), sgt.lambda_value)
                right_dlms = right_ghs.delta_loss_mean_var(right_delta_pred)

                all_dlms = left_dlms + right_dlms

                loss_mean = all_dlms.mean.get()
                loss_var = all_dlms.get()
                if loss_mean < candidate.loss_mean:
                    candidate.loss_mean = (
                        loss_mean + 2.0 * sgt.gamma / self.total_weight)
                    candidate.loss_var = loss_var
                    candidate.delta_pred[0] = left_delta_pred
                    candidate.delta_pred[1] = right_delta_pred

                    # Define split point
                    if i == n_bins - 1:  # Last bin
                        candidate.feature_val = ghs.get_x()
                    else:  # Use middle point between bins
                        candidate.feature_val = ghs.get_x() + half_radius

        if candidate.loss_mean < best.loss_mean:
            best = candidate

    return best
def __init__(self):
    """Init: prepare the accumulators backing this metric."""
    super().__init__()
    # Running variance of the observed targets.
    self._y_var = stats.Var()
    # Incrementally maintained sums of squares.
    self._total_sum_of_squares = 0
    self._residual_sum_of_squares = 0
    # Correction terms; entries are managed by the update logic.
    self.sample_correction = {}
def __init__(self):
    """Init: running first/second-order statistics of the gradient data."""
    # Running mean of the x values (presumably the feature values -- confirm).
    self.x_m = stats.Mean()
    # Variances of gradients and hessians, and their covariance.
    self.g_var = stats.Var()
    self.h_var = stats.Var()
    self.gh_cov = stats.Cov()
def __init__(self, regressor: base.Regressor):
    """Init: wrap `regressor` with the target scale/unscale callbacks.

    NOTE: `self.var` is created *before* calling the parent constructor,
    since the `_scale`/`_unscale` callbacks passed to it presumably read
    this attribute.
    """
    self.var = stats.Var()
    super().__init__(regressor=regressor, func=self._scale, inverse_func=self._unscale)
# Check the statistic has a working __str__ and name method assert isinstance(str(stat), str) if isinstance(stat, stats.Univariate): assert isinstance(stat.name, str) @pytest.mark.parametrize( 'stat, func', [(stats.Kurtosis(bias=True), sp_stats.kurtosis), (stats.Kurtosis(bias=False), functools.partial(sp_stats.kurtosis, bias=False)), (stats.Mean(), statistics.mean), (stats.Skew(bias=True), sp_stats.skew), (stats.Skew(bias=False), functools.partial(sp_stats.skew, bias=False)), (stats.Var(ddof=0), np.var), (stats.Var(), functools.partial(np.var, ddof=1))]) def test_univariate(stat, func): # Shut up np.warnings.filterwarnings('ignore') X = [random.random() for _ in range(30)] for i, x in enumerate(X): stat.update(x) if i >= 1: assert math.isclose(stat.get(), func(X[:i + 1]), abs_tol=1e-10) @pytest.mark.parametrize('stat, func',
def __init__(self, seed=None):
    """Init: running moments plus a dedicated RNG built from `seed`."""
    super().__init__()
    # Incremental second and first moments of the observed values.
    self.variance = stats.Var()
    self.mean = stats.Mean()
    self.seed = seed
    # Dedicated generator so runs are reproducible for a fixed seed.
    self._rng = random.Random(seed)
def __init__(self):
    """Init: accumulators for this regression metric."""
    # Running variance of the observed targets.
    self._y_var = stats.Var()
    # Incrementally maintained sums of squares.
    self._total_sum_of_squares = 0
    self._residual_sum_of_squares = 0
import copy
import functools
import math
import random

import numpy as np
import pytest

from river import stats


@pytest.mark.parametrize(
    "stat",
    [
        pytest.param(stat, id=stat.__class__.__name__)
        for stat in [stats.Mean(), stats.Var(ddof=0), stats.Var(ddof=1)]
    ],
)
def test_add(stat):
    """Update two copies with separate streams and a third with both."""
    A = copy.deepcopy(stat)
    B = copy.deepcopy(stat)
    C = copy.deepcopy(stat)

    X = [random.random() for _ in range(30)]
    Y = [random.random() for _ in range(30)]
    W = [random.random() for _ in range(30)]

    for x, y, w in zip(X, Y, W):
        A.update(x, w)
        B.update(y, w)
        # C sees both streams, with the same weights.
        C.update(x, w).update(y, w)