Example #1
    def _init_estimator(self, y):
        if isinstance(y, dict):
            self.is_single_target = False
            self.y_stats = VectorDict(default_factory=functools.partial(Var))
            self._update_estimator = self._update_estimator_multivariate
        else:
            self.y_stats = Var()
            self._update_estimator = self._update_estimator_univariate
Example #2
    def __init__(self, index_original: int, base_model: BaseTreeRegressor,
                 created_on: int, base_drift_detector: base.DriftDetector,
                 base_warning_detector: base.DriftDetector,
                 is_background_learner, base_metric: RegressionMetric):
        super().__init__(index_original=index_original,
                         base_model=base_model,
                         created_on=created_on,
                         base_drift_detector=base_drift_detector,
                         base_warning_detector=base_warning_detector,
                         is_background_learner=is_background_learner,
                         base_metric=base_metric)
        self._var = Var()  # Used to track drift
Example #3
    def __init__(self, split_test, stats, depth, adwin_delta, seed):
        stats = stats if stats else Var()
        super().__init__(split_test, stats, depth)
        self.adwin_delta = adwin_delta
        self._adwin = ADWIN(delta=self.adwin_delta)
        self._alternate_tree = None
        self._error_change = False

        self._rng = check_random_state(seed)

        # Normalization of info monitored by drift detectors (using Welford's algorithm)
        self._error_normalizer = Var(ddof=1)
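The _error_normalizer above is simply a running sample variance maintained with Welford's algorithm. As a quick illustration (not part of the original snippet), and assuming Var here is river.stats.Var, the incremental estimate matches the batch result:

import statistics

from river.stats import Var

xs = [3.0, 1.0, 4.0, 1.0, 5.0]
v = Var(ddof=1)
for x in xs:
    v.update(x)

print(v.get())                  # incremental (Welford) sample variance
print(statistics.variance(xs))  # batch sample variance; same value up to rounding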
Example #4
class Slot:
    """ The element stored in the quantization hash.

    Each slot keeps the mean values of the numerical feature, as well as the variance
    and mean of the target.

    """
    def __init__(self,
                 x: float,
                 y: typing.Union[float, VectorDict],
                 weight: float = 1.0):
        self.x_stats = Mean()
        self.x_stats.update(x, weight)

        self.y_stats: typing.Union[Var, VectorDict]

        self._update_estimator: typing.Callable[
            [typing.Union[float, VectorDict], float], None]
        self.is_single_target = True

        self._init_estimator(y)
        self._update_estimator(y, weight)

    def _init_estimator(self, y):
        if isinstance(y, dict):
            self.is_single_target = False
            self.y_stats = VectorDict(default_factory=functools.partial(Var))
            self._update_estimator = self._update_estimator_multivariate
        else:
            self.y_stats = Var()
            self._update_estimator = self._update_estimator_univariate

    def _update_estimator_univariate(self, target, sample_weight):
        self.y_stats.update(target, sample_weight)

    def _update_estimator_multivariate(self, target, sample_weight):
        for t in target:
            self.y_stats[t].update(target[t], sample_weight)

    def __iadd__(self, o):
        self.x_stats += o.x_stats
        self.y_stats += o.y_stats

        return self

    def update(self, x, y, sample_weight):
        self.x_stats.update(x, sample_weight)
        self._update_estimator(y, sample_weight)
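The docstring above describes what a slot stores; the following is a minimal, illustrative sketch of how two slots could be combined. It assumes the Slot class from this example is in scope, with Mean and Var taken from river.stats and VectorDict from river.utils, as in the surrounding code.

# Hypothetical standalone usage of Slot (single-target case).
a = Slot(x=1.0, y=10.0)
a.update(x=1.2, y=11.0, sample_weight=1.0)

b = Slot(x=1.1, y=9.5)

a += b                   # __iadd__ merges the running Mean/Var estimators
print(a.x_stats.get())   # running mean of the feature values seen by either slot
print(a.y_stats.get())   # running variance of the corresponding targets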
Example #5
    def update(self, att_val, target, sample_weight=1.0):
        if att_val is None or sample_weight is None:
            return
        else:
            try:
                estimator = self._statistics[att_val]
            except KeyError:
                if isinstance(target, dict):  # Multi-target case
                    self._statistics[att_val] = VectorDict(default_factory=lambda: Var())
                    self._update_estimator = self._update_estimator_multivariate
                else:
                    self._statistics[att_val] = Var()
                estimator = self._statistics[att_val]
            self._update_estimator(estimator, target, sample_weight)

        return self
Example #6
    def best_evaluated_split_suggestion(self,
                                        criterion,
                                        pre_split_dist,
                                        att_idx,
                                        binary_only=True):
        candidate = AttributeSplitSuggestion(None, [{}], -float('inf'))

        if self._root is None:
            return candidate

        self._criterion = criterion
        self._pre_split_dist = pre_split_dist
        self._att_idx = att_idx

        # Handles both single-target and multi-target tasks
        if isinstance(pre_split_dist, VectorDict):
            self._aux_estimator = VectorDict(
                default_factory=functools.partial(Var))
        else:
            self._aux_estimator = Var()

        best_split = self._find_best_split(self._root, candidate)

        # Delete auxiliary variables
        del self._criterion
        del self._pre_split_dist
        del self._att_idx
        del self._aux_estimator

        return best_split
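In the method above, the right-hand distribution of a candidate split is obtained by subtracting running statistics (right_dist = pre_split_dist - left_dist). A tiny illustration of that arithmetic with river.stats.Var, using made-up target values:

from river.stats import Var

pre_split = Var()
for y in [1.0, 1.2, 3.0, 3.4]:
    pre_split.update(y)

left = Var()
for y in [1.0, 1.2]:       # targets falling on the left side of a candidate split
    left.update(y)

right = pre_split - left   # Var supports subtraction of running statistics
print(right.mean.get())    # 3.2, the mean of the remaining (right-side) targets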
Example #7
    def __iter__(self):
        aux_stats = (Var() if next(iter(self.hash.values())).is_single_target
                     else VectorDict(default_factory=functools.partial(Var)))

        for i in sorted(self.hash.keys()):
            x = self.hash[i].x_stats.get()
            aux_stats += self.hash[i].y_stats
            yield x, aux_stats
Example #8
class ForestMemberRegressor(BaseForestMember, base.Regressor):
    """Forest member class for regression"""
    def __init__(
        self,
        index_original: int,
        model: BaseTreeRegressor,
        created_on: int,
        drift_detector: base.DriftDetector,
        warning_detector: base.DriftDetector,
        is_background_learner,
        metric: RegressionMetric,
    ):
        super().__init__(
            index_original=index_original,
            model=model,
            created_on=created_on,
            drift_detector=drift_detector,
            warning_detector=warning_detector,
            is_background_learner=is_background_learner,
            metric=metric,
        )
        self._var = Var()  # Used to track drift

    def _drift_detector_input(self, y_true: float, y_pred: float):
        drift_input = y_true - y_pred
        self._var.update(drift_input)

        if self._var.mean.n == 1:
            return 0.5  # The expected error is the normalized mean error

        sd = math.sqrt(self._var.sigma)

        # We assume the error follows a normal distribution; by the empirical rule,
        # 99.73% of the values lie between [mean - 3*sd, mean + 3*sd]. We take this
        # range as the bounds of the data and apply min-max normalization to meet
        # ADWIN's input requirements.
        return (drift_input + 3 * sd) / (6 * sd) if sd > 0 else 0.5

    def reset(self, n_samples_seen):
        super().reset(n_samples_seen)
        # Reset the stats for the drift detector
        self._var = Var()

    def predict_one(self, x):
        return self.model.predict_one(x)
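The comment in _drift_detector_input explains the min-max normalization applied to the error before it is fed to ADWIN. Below is a rough standalone illustration of the same mapping (the error values are made up), assuming Var is river.stats.Var as in the snippet:

import math

from river.stats import Var

errors = [0.4, -0.2, 0.1, -0.5, 0.3]   # hypothetical drift inputs (y_true - y_pred)
var = Var()
for e in errors:
    var.update(e)

sd = math.sqrt(var.get())              # get() returns the running variance
normalized = (errors[-1] + 3 * sd) / (6 * sd) if sd > 0 else 0.5
print(round(normalized, 3))            # value in [0, 1], suitable as ADWIN input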
Example #9
    def __init__(self, stats, depth, attr_obs, attr_obs_params, leaf_model, adwin_delta, seed):
        super().__init__(stats, depth, attr_obs, attr_obs_params, leaf_model)

        self.adwin_delta = adwin_delta
        self._adwin = ADWIN(delta=self.adwin_delta)
        self.error_change = False
        self._rng = check_random_state(seed)

        # Normalization of info monitored by drift detectors (using Welford's algorithm)
        self._error_normalizer = Var(ddof=1)
Example #10
    def __init__(self, stats, depth, splitter, adwin_delta, seed, **kwargs):
        super().__init__(stats, depth, splitter, **kwargs)

        self.adwin_delta = adwin_delta
        self._adwin = ADWIN(delta=self.adwin_delta)
        self._error_change = False
        self._rng = check_random_state(seed)

        # Normalization of info monitored by drift detectors (using Welford's algorithm)
        self._error_normalizer = Var(ddof=1)
Example #11
    def delta_loss_mean_var(self, delta_pred: float) -> Var:
        m = self.mean
        n = self.total_weight
        mean = delta_pred * m.gradient + 0.5 * m.hessian * delta_pred * delta_pred

        variance = self.variance
        covariance = self.covariance

        grad_term_var = delta_pred * delta_pred * variance.gradient
        hess_term_var = 0.25 * variance.hessian * (delta_pred ** 4.0)
        sigma = max(0.0, grad_term_var + hess_term_var + (delta_pred ** 3) * covariance)
        return Var._from_state(n, mean, sigma)  # noqa
Example #12
    def __init__(self, att_val, target_val, sample_weight):
        self.att_val = att_val

        if isinstance(target_val, dict):
            self.estimator = VectorDict(default_factory=functools.partial(Var))
            self._update_estimator = self._update_estimator_multivariate
        else:
            self.estimator = Var()
            self._update_estimator = self._update_estimator_univariate

        self._update_estimator(self, target_val, sample_weight)

        self._left = None
        self._right = None
Example #13
    def __init__(self, stats, depth, attr_obs, attr_obs_params):
        if stats is None:
            # Enforce the usage of Var to keep track of target statistics
            stats = Var()
        super().__init__(stats, depth, attr_obs, attr_obs_params)
Example #14
    def __init__(self, stats, depth, splitter, **kwargs):
        if stats is None:
            # Enforce the usage of Var to keep track of target statistics
            stats = Var()
        super().__init__(stats, depth, splitter, **kwargs)
Example #15
    def remove_bad_splits(self, criterion, last_check_ratio, last_check_vr,
                          last_check_e, pre_split_dist):
        """Remove bad splits.

        Based on FIMT-DD's [^1] procedure to remove bad split candidates from the E-BST. This
        mechanism is triggered every time a split attempt fails. The rationale is to remove
        points whose split merit is much worse than the best candidate overall (for which the
        growth decision already failed).

        Let $m_1$ be the merit of the best split point and $m_2$ be the merit of the
        second best split candidate. The ratio $r = m_2/m_1$ along with the Hoeffding bound
        ($\\epsilon$) are used to decide upon creating a split. A split occurs when
        $r < 1 - \\epsilon$. A split candidate with merit $m_i$ is considered bad
        if $m_i / m_1 < r - 2\\epsilon$. The rationale is the following: if the merit ratio
        for this point is smaller than the lower bound of $r$, then the true merit of that
        split relative to the best one is small. Hence, this candidate can be safely removed.

        To avoid excessive and costly manipulations of the E-BST to update the stored statistics,
        only the nodes whose children are all bad split points are pruned, as defined in [^1].

        Parameters
        ----------
        criterion
            The split criterion used by the regression tree.
        last_check_ratio
            The ratio between the merit of the second best split candidate and the merit of the
            best split candidate observed in the last failed split attempt.
        last_check_vr
            The merit (variance reduction) of the best split candidate observed in the last
            failed split attempt.
        last_check_e
            The Hoeffding bound value calculated in the last failed split attempt.
        pre_split_dist
            The complete statistics of the target observed in the leaf node.

        References
        ----------
        [^1]: Ikonomovska, E., Gama, J., & Džeroski, S. (2011). Learning model trees from evolving
        data streams. Data mining and knowledge discovery, 23(1), 128-168.
        """

        if self._root is None:
            return

        # Auxiliary variables
        self._criterion = criterion
        self._pre_split_dist = pre_split_dist
        self._last_check_ratio = last_check_ratio
        self._last_check_vr = last_check_vr
        self._last_check_e = last_check_e

        # Handles both single-target and multi-target tasks
        if isinstance(pre_split_dist, VectorDict):
            self._aux_estimator = VectorDict(
                default_factory=functools.partial(Var))
        else:
            self._aux_estimator = Var()

        self._remove_bad_split_nodes(self._root)

        # Delete auxiliary variables
        del self._criterion
        del self._pre_split_dist
        del self._last_check_ratio
        del self._last_check_vr
        del self._last_check_e
        del self._aux_estimator
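To make the pruning rule from the docstring concrete, here is a small, made-up numeric check of the condition m_i / m_1 < r - 2 * eps:

m_1 = 0.80               # merit of the best split candidate in the failed attempt
m_2 = 0.78               # merit of the second best candidate
eps = 0.05               # Hoeffding bound computed in the same attempt

r = m_2 / m_1            # 0.975 >= 1 - eps, which is why the split attempt failed
threshold = r - 2 * eps  # 0.875

for m_i in (0.72, 0.60):
    print(m_i, "bad" if m_i / m_1 < threshold else "kept")
# 0.72 -> kept (0.90 >= 0.875); 0.60 -> bad (0.75 < 0.875)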
Example #16
class GradHessStats:
    """Class used to monitor and update the gradient/hessian information in Stochastic Gradient
    Trees.

    Represents the aggregated gradient/hessian data in a node (global node statistics), category,
    or numerical feature's discretized bin.
    """
    def __init__(self):
        self.g_var = Var()
        self.h_var = Var()
        self.gh_cov = Cov()

    def __iadd__(self, other):
        self.g_var += other.g_var
        self.h_var += other.h_var
        self.gh_cov += other.gh_cov

        return self

    def __isub__(self, other):
        self.g_var -= other.g_var
        self.h_var -= other.h_var
        self.gh_cov -= other.gh_cov

        return self

    def __add__(self, other):
        new = copy.deepcopy(self)
        new += other

        return new

    def __sub__(self, other):
        new = copy.deepcopy(self)
        new -= other

        return new

    def update(self, gh: GradHess, w: float = 1.0):
        self.g_var.update(gh.gradient, w)
        self.h_var.update(gh.hessian, w)
        self.gh_cov.update(gh.gradient, gh.hessian, w)

    @property
    def mean(self) -> GradHess:
        return GradHess(self.g_var.mean.get(), self.h_var.mean.get())

    @property
    def variance(self) -> GradHess:
        return GradHess(self.g_var.get(), self.h_var.get())

    @property
    def covariance(self) -> float:
        return self.gh_cov.get()

    @property
    def total_weight(self) -> float:
        return self.g_var.mean.n

    # This method ignores correlations between delta_pred and the gradients/hessians! Considering
    # delta_pred is derived from the gradient and hessian sample, this assumption is definitely
    # violated. However, as empirically demonstrated in the original SGT, this fact does not seem
    # to significantly impact the obtained results.
    def delta_loss_mean_var(self, delta_pred: float) -> Var:
        m = self.mean
        n = self.total_weight
        mean = delta_pred * m.gradient + 0.5 * m.hessian * delta_pred * delta_pred

        variance = self.variance
        covariance = self.covariance

        grad_term_var = delta_pred * delta_pred * variance.gradient
        hess_term_var = 0.25 * variance.hessian * (delta_pred**4.0)
        sigma = max(
            0.0, grad_term_var + hess_term_var + (delta_pred**3) * covariance)
        return Var._from_state(n, mean, sigma)  # noqa
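A minimal usage sketch of the class above (illustrative only). It assumes GradHessStats and a GradHess(gradient, hessian) container from the SGT code are in scope, with Var and Cov imported from river.stats:

stats = GradHessStats()
for g, h in [(0.8, 1.0), (-0.3, 1.0), (0.5, 1.0)]:
    stats.update(GradHess(g, h))    # assumes GradHess takes (gradient, hessian)

delta_pred = -stats.mean.gradient / stats.mean.hessian   # Newton-like step
dl = stats.delta_loss_mean_var(delta_pred)
print(dl.mean.get())                # expected change in loss for this step
print(dl.get())                     # variance of that estimate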
Example #17
    def __init__(self):
        self.g_var = Var()
        self.h_var = Var()
        self.gh_cov = Cov()
Example #18
    def __init__(self, radius: float = 0.01):
        super().__init__()
        self.radius = radius if radius > 0 else 0.01
        self._x_var = Var()
        self._quantizer = FeatureQuantizer(radius=self.radius)
Example #19
class NumericAttributeRegressionQuantizerObserver(AttributeObserver):
    """Quantizer observer (QO).

    Utilizes a dynamical hash-based quantization algorithm to keep track of the target statistics
    and evaluate split candidates. This class implements the algorithm described in [^1].
    This attribute observer keeps an internal estimator of the input feature's variance. By doing
    that, QO can calculate better values for its radius parameter to be passed to future learning
    nodes.

    Parameters
    ----------
    radius
        The quantization radius.

    References
    ----------
    [^1]: Mastelini, S.M. and de Leon Ferreira, A.C.P., 2021. Using dynamical quantization to
    perform split attempts in online tree regressors. Pattern Recognition Letters.

    """
    def __init__(self, radius: float = 0.01):
        super().__init__()
        self.radius = radius if radius > 0 else 0.01
        self._x_var = Var()
        self._quantizer = FeatureQuantizer(radius=self.radius)

    def update(self, x, y, sample_weight):
        if x is None:
            return
        else:
            self._x_var.update(x, sample_weight)
            self._quantizer.update(x, y, sample_weight)

    def probability_of_attribute_value_given_class(self, x, y):
        raise NotImplementedError

    def best_evaluated_split_suggestion(self,
                                        criterion,
                                        pre_split_dist,
                                        att_idx,
                                        binary_only=True):
        candidate = AttributeSplitSuggestion(None, [{}], -math.inf)

        # The previously evaluated x value
        prev_x = None

        for (x, left_dist) in self._quantizer:
            # First hash element
            if prev_x is None:
                # In case the hash carries just one element return the null split
                if len(self._quantizer) == 1:
                    return candidate
                prev_x = x
                continue

            right_dist = pre_split_dist - left_dist
            post_split_dists = [left_dist, right_dist]
            merit = criterion.merit_of_split(pre_split_dist, post_split_dists)

            if merit > candidate.merit:
                split_point = (prev_x + x) / 2.0
                candidate = self._update_candidate(split_point, att_idx,
                                                   post_split_dists, merit)

            prev_x = x
        return candidate

    @property
    def x_var(self):
        return self._x_var

    @staticmethod
    def _update_candidate(split_point, att_idx, post_split_dists, merit):
        num_att_binary_test = NumericAttributeBinaryTest(
            att_idx, split_point, True)
        candidate = AttributeSplitSuggestion(num_att_binary_test,
                                             post_split_dists, merit)

        return candidate
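A small usage sketch for the observer above (illustrative only); it assumes NumericAttributeRegressionQuantizerObserver and its FeatureQuantizer dependency are in scope, which in the library are tree-internal classes:

obs = NumericAttributeRegressionQuantizerObserver(radius=0.25)
for x, y in [(0.10, 1.0), (0.12, 1.1), (0.90, 3.0), (0.95, 3.2)]:
    obs.update(x, y, sample_weight=1.0)

print(len(obs._quantizer))   # number of occupied quantization slots
print(obs.x_var.get())       # running variance of the feature, used to tune radius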
Example #20
    def reset(self, n_samples_seen):
        super().reset(n_samples_seen)
        # Reset the stats for the drift detector
        self._var = Var()