    def best_evaluated_split_suggestion(self,
                                        criterion,
                                        pre_split_dist,
                                        att_idx,
                                        binary_only=True):
        # Null suggestion: no split test, empty resulting stats, merit of -inf
        candidate = AttributeSplitSuggestion(None, [{}], -float('inf'))

        if self._root is None:
            return candidate

        self._criterion = criterion
        self._pre_split_dist = pre_split_dist
        self._att_idx = att_idx

        # Handles both single-target and multi-target tasks
        if isinstance(pre_split_dist, VectorDict):
            self._aux_estimator = VectorDict(
                default_factory=functools.partial(Var))
        else:
            self._aux_estimator = Var()

        best_split = self._find_best_split(self._root, candidate)

        # Delete auxiliary variables
        del self._criterion
        del self._pre_split_dist
        del self._att_idx
        del self._aux_estimator

        return best_split
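The dispatch above relies on `VectorDict(default_factory=functools.partial(Var))` behaving like a per-target collection of `Var` estimators, so the traversal can treat single-target and multi-target statistics uniformly. A minimal sketch of that idea, assuming river's `river.stats.Var` and `river.utils.VectorDict` (note that `functools.partial(Var)` with no bound arguments is equivalent to passing `Var` itself):

import functools

from river.stats import Var
from river.utils import VectorDict

# Single-target: one running variance estimator
single = Var()
single.update(1.5, w=1.0)

# Multi-target: one Var per target key, created on first access
multi = VectorDict(default_factory=functools.partial(Var))
multi["y0"].update(1.5, w=1.0)
multi["y1"].update(-2.0, w=1.0)

print(single.get(), multi["y0"].get())  # same Var API either way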
Example #2
    def __iter__(self):
        aux_stats = (Var() if next(iter(self.hash.values())).is_single_target
                     else VectorDict(default_factory=functools.partial(Var)))

        for i in sorted(self.hash.keys()):
            x = self.hash[i].x_stats.get()
            aux_stats += self.hash[i].y_stats
            yield x, aux_stats
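The `aux_stats += ...` accumulation works because river's running statistics can be merged in place. A quick standalone check of that mechanism (assuming `river.stats.Var`; `+=` is exactly what the loop above uses):

from river.stats import Var

left, right = Var(), Var()
for x in (1.0, 2.0, 3.0):
    left.update(x)
for x in (10.0, 20.0):
    right.update(x)

merged = Var()
merged += left   # merge running means/variances without replaying the data
merged += right
print(merged.mean.get(), merged.get())  # mean and variance over all five values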
Example #3
    def _init_estimator(self, y):
        if isinstance(y, dict):  # Multi-target case
            self.is_single_target = False
            self.y_stats = VectorDict(default_factory=functools.partial(Var))
            self._update_estimator = self._update_estimator_multivariate
        else:
            # is_single_target is presumably initialized to True elsewhere
            self.y_stats = Var()
            self._update_estimator = self._update_estimator_univariate
Example #4
    def __init__(self, att_val, target_val, sample_weight):
        self.att_val = att_val

        if isinstance(target_val, dict):
            self.estimator = VectorDict(default_factory=functools.partial(Var))
            self._update_estimator = self._update_estimator_multivariate
        else:
            self.estimator = Var()
            self._update_estimator = self._update_estimator_univariate

        self._update_estimator(self, target_val, sample_weight)

        self._left = None
        self._right = None
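Note that the constructor calls `self._update_estimator(self, target_val, sample_weight)` with `self` passed explicitly, which suggests the handlers are stored as plain functions (e.g. staticmethods) that take the node as their first argument. A stripped-down sketch of that dispatch pattern, with hypothetical names:

class Node:
    @staticmethod
    def _update_univariate(node, y, w):
        node.total += y * w  # stand-in for estimator.update(y, w)

    @staticmethod
    def _update_multivariate(node, ys, w):
        for t, y in ys.items():
            node.totals[t] = node.totals.get(t, 0.0) + y * w

    def __init__(self, y, w):
        self.total, self.totals = 0.0, {}
        self._update = (Node._update_multivariate if isinstance(y, dict)
                        else Node._update_univariate)
        self._update(self, y, w)  # explicit `self`, as in the snippet above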
Example #5
    def update(self, att_val, target_val, sample_weight):
        if att_val is None or sample_weight is None:
            return
        else:
            try:
                estimator = self._statistics[att_val]
            except KeyError:
                if isinstance(target_val, dict):  # Multi-target case
                    self._statistics[att_val] = VectorDict(
                        default_factory=functools.partial(Var))
                    self._update_estimator = self._update_estimator_multivariate
                else:
                    # _update_estimator presumably defaults to the univariate
                    # handler, so only the statistics need to be created here
                    self._statistics[att_val] = Var()
                estimator = self._statistics[att_val]
            self._update_estimator(estimator, target_val, sample_weight)
Example #6
import numpy as np
import pytest

from river.utils import VectorDict


def test_vectordict():

    # test empty init
    x = dict()
    vx = VectorDict()
    assert vx == x

    # test basics
    x = {"a": 8, "b": -1.2, 4: 2.7}
    vx = VectorDict(x)
    assert vx == x
    assert vx["a"] == 8
    assert vx[4] == 2.7
    vx[9] = 8.9
    assert x[9] == vx[9] == 8.9

    # test copy
    x = {"a": 8, "b": -1.2, 4: 2.7}
    vx = VectorDict(x, copy=True)
    assert vx == x
    vx["a"] = 2
    assert x["a"] == 8
    assert vx["a"] == 2

    # test operations
    x = {"a": 1, "b": -5, "c": -3}
    y = {"a": 2, "b": 0.5, "d": 4}
    vx = VectorDict(x)
    vy = VectorDict(y)
    assert vx == vx == x
    assert +vx == vx == x
    assert -vx == {"a": -1, "b": 5, "c": 3}
    assert vx + 2 == 2 + vx == {"a": 3, "b": -3, "c": -1}
    assert vx * 2 == 2 * vx == {"a": 2, "b": -10, "c": -6}
    assert vx - 2 == {"a": -1, "b": -7, "c": -5}
    assert 2 - vx == -(vx - 2)
    assert vx / 2 == {"a": 0.5, "b": -2.5, "c": -1.5}
    assert 2 / vx == {"a": 2, "b": -0.4, "c": -2 / 3}
    assert vx + vy == vy + vx == {"a": 3, "b": -4.5, "c": -3, "d": 4}
    assert vx - vy == {"a": -1, "b": -5.5, "c": -3, "d": -4}
    assert vx**2 == pow(vx, 2) == {"a": 1, "b": 25, "c": 9}
    assert vx * vy == vy * vx == {"a": 2, "b": -2.5, "d": 0, "c": 0}
    assert vx / vx == {"a": 1.0, "b": 1.0, "c": 1.0}
    with pytest.raises(ZeroDivisionError):
        vx / vy
    assert vx @ vy == vy @ vx == -0.5
    vz = VectorDict(x, copy=True)
    vz += 2
    assert vz == vx + 2
    vz = VectorDict(x, copy=True)
    vz -= 2
    assert vz == vx - 2
    vz = VectorDict(x, copy=True)
    vz *= 2
    assert vz == vx * 2
    vz = VectorDict(x, copy=True)
    vz /= 2
    assert vz == vx / 2
    vz = VectorDict(x, copy=True)
    vz += vy
    assert vz == vx + vy
    vz = VectorDict(x, copy=True)
    vz -= vy
    assert vz == vx - vy
    vz = VectorDict(x, copy=True)
    vz *= vy
    assert vz == vx * vy
    vz = VectorDict(x, copy=True)
    vz /= vz
    assert vz == vx / vx
    vz = VectorDict(x, copy=True)
    vz **= 2
    assert vz == vx**2

    # test default_factory
    x = {"a": 1, "b": -5}
    y = {"b": 0.5, "d": 4, "e": 3, "f": 8}
    counter = iter(range(100))
    vx = VectorDict(x, default_factory=counter.__next__)
    vy = VectorDict(y)
    assert vx @ vy == 16.5
    assert counter.__next__() == 3
    assert x["f"] == 2

    # test mask
    x = {"a": 1, "b": -5, "e": 2}
    y = {"b": 0.5, "d": 4, "e": 3, "f": 8}
    z = {"b": 4, "d": 2, "g": -1}
    vx = VectorDict(x)
    vy = VectorDict(y)
    assert vx + vy == vy + vx == {"a": 1, "b": -4.5, "d": 4, "e": 5, "f": 8}
    vy = VectorDict(y, mask=z)
    assert vx + vy == vy + vx == {"a": 1, "b": -4.5, "d": 4, "e": 2}
    vy = VectorDict(y).with_mask(z.keys())
    assert vx + vy == vy + vx == {"a": 1, "b": -4.5, "d": 4, "e": 2}
    vy = VectorDict(y).with_mask(x)
    assert vy / vx == {"b": -0.1, "a": 0.0, "e": 1.5}

    # test export
    x = {"a": 1, "b": -5}
    vx = VectorDict(x)
    nx = vx.to_numpy(["b", "c"])
    assert isinstance(nx, np.ndarray)
    assert (vx.to_numpy(["b", "c"]) == np.array([-5, 0])).all()

    # other methods
    x = {"a": 1, "b": -5}
    vx = VectorDict(x)
    assert vx.abs() == abs(vx) == {"a": 1, "b": 5}
    assert vx.min() == -5
    assert vx.max() == 1
    assert vx.with_mask(["a"]).min() == 1
    assert vx.with_mask(["b"]).max() == -5
    assert vx.minimum(-2) == {"a": -2, "b": -5}
    assert vx.maximum(-2) == {"a": 1, "b": -2}
    y = {"b": 0.5, "c": 4}
    vy = VectorDict(y)
    assert vx.minimum(vy) == vy.minimum(vx) == {"a": 0, "b": -5, "c": 0}
    assert vx.maximum(vy) == vy.maximum(vx) == {"a": 1, "b": 0.5, "c": 4}
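One detail from the export block above worth calling out: `to_numpy` takes an explicit key order and fills in 0 for missing keys, which yields a fixed-dimension dense vector from a sparse `VectorDict`. For instance (same assumptions as in the test):

from river.utils import VectorDict

grad = VectorDict({"w1": 0.5, "w3": -1.0})
dense = grad.to_numpy(["w1", "w2", "w3"])  # missing "w2" becomes 0
print(dense)  # [ 0.5  0.  -1. ]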
Example #7
    def __init__(self, stats, depth, attr_obs, attr_obs_params):
        # Default to an empty multi-target estimator when no stats are given
        stats = stats if stats else VectorDict(
            default_factory=functools.partial(Var))
        super().__init__(stats, depth, attr_obs, attr_obs_params)
Example #8
    def remove_bad_splits(self, criterion, last_check_ratio, last_check_vr,
                          last_check_e, pre_split_dist):
        """Remove bad splits.

        Based on FIMT-DD's [^1] procedure to remove bad split candidates from the E-BST. This
        mechanism is triggered every time a split attempt fails. The rationale is to remove
        points whose split merit is much worse than the best candidate overall (for which the
        growth decision already failed).

        Let $m_1$ be the merit of the best split point and $m_2$ be the merit of the
        second best split candidate. The ratio $r = m_2/m_1$ along with the Hoeffding bound
        ($\\epsilon$) are used to decide upon creating a split. A split occurs when
        $r < 1 - \\epsilon$. A split candidate with merit $m_i$ is considered bad
        if $m_i / m_1 < r - 2\\epsilon$. The rationale is the following: if the merit ratio
        for this point is smaller than the lower bound of $r$, then the true merit of that
        split relative to the best one is small. Hence, this candidate can be safely removed.

        To avoid excessive and costly manipulations of the E-BST to update the stored statistics,
        only the nodes whose children are all bad split points are pruned, as defined in [^1].

        Parameters
        ----------
        criterion
            The split criterion used by the regression tree.
        last_check_ratio
            The ratio between the merit of the second best split candidate and the merit of the
            best split candidate observed in the last failed split attempt.
        last_check_vr
            The merit (variance reduction) of the best split candidate observed in the last
            failed split attempt.
        last_check_e
            The Hoeffding bound value calculated in the last failed split attempt.
        pre_split_dist
            The complete statistics of the target observed in the leaf node.

        References
        ----------
        [^1]: Ikonomovska, E., Gama, J., & Džeroski, S. (2011). Learning model trees from evolving
        data streams. Data mining and knowledge discovery, 23(1), 128-168.
        """

        if self._root is None:
            return

        # Auxiliary variables
        self._criterion = criterion
        self._pre_split_dist = pre_split_dist
        self._last_check_ratio = last_check_ratio
        self._last_check_vr = last_check_vr
        self._last_check_e = last_check_e

        # Handles both single-target and multi-target tasks
        if isinstance(pre_split_dist, VectorDict):
            self._aux_estimator = VectorDict(
                default_factory=functools.partial(Var))
        else:
            self._aux_estimator = Var()

        self._remove_bad_split_nodes(self._root)

        # Delete auxiliary variables
        del self._criterion
        del self._pre_split_dist
        del self._last_check_ratio
        del self._last_check_vr
        del self._last_check_e
        del self._aux_estimator
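To make the pruning rule concrete, here is a tiny numeric walk-through of the inequality from the docstring; the numbers are made up:

m_1 = 0.80   # merit of the best split candidate (last_check_vr)
m_2 = 0.76   # merit of the second best candidate
eps = 0.10   # Hoeffding bound (last_check_e)

r = m_2 / m_1              # 0.95 >= 1 - eps, so the last split attempt failed
threshold = r - 2 * eps    # 0.75

m_i = 0.55                 # merit of some stored split point
print(m_i / m_1 < threshold)  # True -> this candidate can be safely removed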
Example #9
    def __init__(self, stats, depth, splitter, **kwargs):
        stats = stats if stats else VectorDict(
            default_factory=functools.partial(Var))
        super().__init__(stats, depth, splitter, **kwargs)