Example #1
    def __init__(self, split_test, stats, depth, adwin_delta, seed):
        stats = stats if stats else Var()
        super().__init__(split_test, stats, depth)
        self.adwin_delta = adwin_delta
        self._adwin = ADWIN(delta=self.adwin_delta)
        self._alternate_tree = None
        self._error_change = False

        self._rng = check_random_state(seed)

        # Normalization of info monitored by drift detectors (using Welford's algorithm)
        self._error_normalizer = Var(ddof=1)
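For context, Var(ddof=1) maintains a running (and, with ddof=1, unbiased) variance via Welford's algorithm, as the comment above notes. The standalone class below is an illustrative sketch of that update rule, not river's actual implementation:

    class WelfordVar:
        """Running mean/variance via Welford's algorithm (sketch)."""

        def __init__(self, ddof=1):
            self.ddof = ddof
            self.n = 0
            self.mean = 0.0
            self._m2 = 0.0  # Sum of squared deviations from the running mean

        def update(self, x):
            self.n += 1
            delta = x - self.mean
            self.mean += delta / self.n
            self._m2 += delta * (x - self.mean)

        def get(self):
            # The sample variance is undefined until n > ddof
            return self._m2 / (self.n - self.ddof) if self.n > self.ddof else 0.0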
Example #2
    def best_evaluated_split_suggestion(self,
                                        criterion,
                                        pre_split_dist,
                                        att_idx,
                                        binary_only=True):
        candidate = AttributeSplitSuggestion(None, [{}], -float('inf'))

        if self._root is None:
            return candidate

        self._criterion = criterion
        self._pre_split_dist = pre_split_dist
        self._att_idx = att_idx

        # Handles both single-target and multi-target tasks
        if isinstance(pre_split_dist, VectorDict):
            self._aux_estimator = VectorDict(
                default_factory=functools.partial(Var))
        else:
            self._aux_estimator = Var()

        best_split = self._find_best_split(self._root, candidate)

        # Delete auxiliary variables
        del self._criterion
        del self._pre_split_dist
        del self._att_idx
        del self._aux_estimator

        return best_split
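The VectorDict-or-Var dispatch above works because both accumulators support in-place merging with +=, exactly as the E-BST traversal in Example #5 below relies on. A minimal sketch of that shared interface (import paths assumed to be river.stats and river.utils):

    import functools

    from river.stats import Var
    from river.utils import VectorDict

    # Single-target: one running variance estimator
    single = Var()
    single += Var()

    # Multi-target: one Var per target, merged key-wise
    multi = VectorDict(default_factory=functools.partial(Var))
    multi += VectorDict(default_factory=functools.partial(Var))

Because the merging only relies on +=, _find_best_split does not need to branch on the task type.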
Example #3
    def update(self, att_val, target, sample_weight=1.0):
        if att_val is None or sample_weight is None:
            return self
        try:
            estimator = self._statistics[att_val]
        except KeyError:
            if isinstance(target, dict):  # Multi-target case
                self._statistics[att_val] = VectorDict(default_factory=functools.partial(Var))
                self._update_estimator = self._update_estimator_multivariate
            else:
                # The univariate updater is assumed to be the default
                self._statistics[att_val] = Var()
            estimator = self._statistics[att_val]
        self._update_estimator(estimator, target, sample_weight)

        return self
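Since update returns self, successive calls can be chained. A hypothetical usage sketch (the splitter instance and the values are made up):

    splitter = SomeSplitter()  # hypothetical instance of the class above
    splitter.update(1.5, 10.0, sample_weight=1.0).update(2.0, 12.5)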
Example #4
    def _init_estimator(self, y):
        if isinstance(y, dict):
            self.is_single_target = False
            self.y_stats = VectorDict(default_factory=functools.partial(Var))
            self._update_estimator = self._update_estimator_multivariate
        else:
            self.y_stats = Var()
            self._update_estimator = self._update_estimator_univariate
Example #5
    def __iter__(self):
        aux_stats = (Var() if next(iter(self.hash.values())).is_single_target
                     else VectorDict(default_factory=functools.partial(Var)))

        # Traverse the stored attribute values in ascending order, yielding
        # the *cumulative* target statistics observed up to each value
        for i in sorted(self.hash.keys()):
            x = self.hash[i].x_stats.get()
            aux_stats += self.hash[i].y_stats
            yield x, aux_stats
Example #6
    def __init__(self, stats, depth, splitter, adwin_delta, seed, **kwargs):
        super().__init__(stats, depth, splitter, **kwargs)

        self.adwin_delta = adwin_delta
        self._adwin = ADWIN(delta=self.adwin_delta)
        self._error_change = False
        self._rng = check_random_state(seed)

        # Normalization of info monitored by drift detectors (using Welford's algorithm)
        self._error_normalizer = Var(ddof=1)
Example #7
    def __init__(self, stats, depth, attr_obs, attr_obs_params, leaf_model, adwin_delta, seed):
        super().__init__(stats, depth, attr_obs, attr_obs_params, leaf_model)

        self.adwin_delta = adwin_delta
        self._adwin = ADWIN(delta=self.adwin_delta)
        self._error_change = False
        self._rng = check_random_state(seed)

        # Normalization of info monitored by drift detectors (using Welford's algorithm)
        self._error_normalizer = Var(ddof=1)
Example #8
    def __init__(self, index_original: int, base_model: BaseTreeRegressor,
                 created_on: int, base_drift_detector: base.DriftDetector,
                 base_warning_detector: base.DriftDetector,
                 is_background_learner, base_metric: RegressionMetric):
        super().__init__(index_original=index_original,
                         base_model=base_model,
                         created_on=created_on,
                         base_drift_detector=base_drift_detector,
                         base_warning_detector=base_warning_detector,
                         is_background_learner=is_background_learner,
                         base_metric=base_metric)
        self._var = Var()  # Used to track drift
Example #9
    def __init__(self, att_val, target_val, sample_weight):
        self.att_val = att_val

        if isinstance(target_val, dict):
            self.estimator = VectorDict(default_factory=functools.partial(Var))
            self._update_estimator = self._update_estimator_multivariate
        else:
            self.estimator = Var()
            self._update_estimator = self._update_estimator_univariate

        # The update functions receive the node explicitly (they act as staticmethods)
        self._update_estimator(self, target_val, sample_weight)

        self._left = None
        self._right = None
Example #10
    def __init__(self, stats, depth, attr_obs, attr_obs_params):
        if stats is None:
            # Enforce the usage of Var to keep track of target statistics
            stats = Var()
        super().__init__(stats, depth, attr_obs, attr_obs_params)
Example #11
    def __init__(self, stats, depth, splitter, **kwargs):
        if stats is None:
            # Enforce the usage of Var to keep track of target statistics
            stats = Var()
        super().__init__(stats, depth, splitter, **kwargs)
Example #12
    def remove_bad_splits(self, criterion, last_check_ratio, last_check_vr,
                          last_check_e, pre_split_dist):
        """Remove bad splits.

        Based on FIMT-DD's [^1] procedure to remove bad split candidates from the E-BST. This
        mechanism is triggered every time a split attempt fails. The rationale is to remove
        points whose split merit is much worse than the best candidate overall (for which the
        growth decision already failed).

        Let $m_1$ be the merit of the best split point and $m_2$ be the merit of the
        second best split candidate. The ratio $r = m_2/m_1$ along with the Hoeffding bound
        ($\\epsilon$) are used to decide upon creating a split. A split occurs when
        $r < 1 - \\epsilon$. A split candidate, with merit $m_i$, is considered bad
        if $m_i / m_1 < r - 2\\epsilon$. The rationale is the following: if the merit ratio
        for this point is smaller than the lower bound of $r$, then the true merit of that
        split relative to the best one is small. Hence, this candidate can be safely removed.

        To avoid excessive and costly manipulations of the E-BST to update the stored statistics,
        only the nodes whose children are all bad split points are pruned, as defined in [^1].

        Parameters
        ----------
        criterion
            The split criterion used by the regression tree.
        last_check_ratio
            The ratio between the merit of the second best split candidate and the merit of the
            best split candidate observed in the last failed split attempt.
        last_check_vr
            The merit (variance reduction) of the best split candidate observed in the last
            failed split attempt.
        last_check_e
            The Hoeffding bound value calculated in the last failed split attempt.
        pre_split_dist
            The complete statistics of the target observed in the leaf node.

        References
        ----------
        [^1]: Ikonomovska, E., Gama, J., & Džeroski, S. (2011). Learning model trees from evolving
        data streams. Data mining and knowledge discovery, 23(1), 128-168.
        """

        if self._root is None:
            return

        # Auxiliary variables
        self._criterion = criterion
        self._pre_split_dist = pre_split_dist
        self._last_check_ratio = last_check_ratio
        self._last_check_vr = last_check_vr
        self._last_check_e = last_check_e

        # Handles both single-target and multi-target tasks
        if isinstance(pre_split_dist, VectorDict):
            self._aux_estimator = VectorDict(
                default_factory=functools.partial(Var))
        else:
            self._aux_estimator = Var()

        self._remove_bad_split_nodes(self._root)

        # Delete auxiliary variables
        del self._criterion
        del self._pre_split_dist
        del self._last_check_ratio
        del self._last_check_vr
        del self._last_check_e
        del self._aux_estimator
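For intuition, the pruning rule described in the docstring reduces to a one-line predicate. The helper below is hypothetical (not part of the class), with made-up numbers in the comment:

    def is_bad_split(merit_i, best_merit, last_check_ratio, last_check_e):
        # E.g., with last_check_ratio = 0.8 and last_check_e = 0.05, any
        # candidate whose merit ratio falls below 0.8 - 2 * 0.05 = 0.7
        # is considered bad and may be pruned
        return merit_i / best_merit < last_check_ratio - 2 * last_check_e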
Example #13
    def __init__(self, radius: float = 0.01):
        super().__init__()
        # Fall back to the default radius when a non-positive value is given
        self.radius = radius if radius > 0 else 0.01
        self._x_var = Var()
        self._quantizer = FeatureQuantizer(radius=self.radius)
Example #14
    def reset(self, n_samples_seen):
        super().reset(n_samples_seen)
        # Reset the stats for the drift detector
        self._var = Var()
Example #15
    def __init__(self):
        self.g_var = Var()
        self.h_var = Var()
        self.gh_cov = Cov()
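A minimal usage sketch (with made-up numbers) of how such gradient/hessian statistics are typically fed, relying only on river's Var.update(x), Cov.update(x, y) and get():

    from river import stats

    g_var, h_var, gh_cov = stats.Var(), stats.Var(), stats.Cov()

    for g, h in [(0.5, 1.0), (-0.2, 0.9), (0.1, 1.1)]:
        g_var.update(g)      # variance of the gradients
        h_var.update(h)      # variance of the hessians
        gh_cov.update(g, h)  # gradient/hessian covariance

    print(g_var.get(), h_var.get(), gh_cov.get())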