def best_evaluated_split_suggestion(self, criterion, pre_split_dist, att_idx, binary_only=True):
    """Return the best split suggestion found by searching the E-BST.

    Parameters
    ----------
    criterion
        The split criterion used to evaluate candidate split points.
    pre_split_dist
        Target statistics observed in the leaf before splitting. A
        `VectorDict` signals a multi-target task.
    att_idx
        Index of the attribute the E-BST monitors.
    binary_only
        Unused here; kept for interface compatibility with other
        attribute observers.
    """
    candidate = AttributeSplitSuggestion(None, [{}], -float("inf"))
    if self._root is None:
        return candidate

    # Stash the search context on the instance so the recursive helper
    # can reach it without threading extra parameters through each call.
    self._criterion = criterion
    self._pre_split_dist = pre_split_dist
    self._att_idx = att_idx

    # Multi-target tasks carry their statistics in a VectorDict of Var;
    # single-target tasks use a plain Var.
    if isinstance(pre_split_dist, VectorDict):
        self._aux_estimator = VectorDict(default_factory=functools.partial(Var))
    else:
        self._aux_estimator = Var()

    best_split = self._find_best_split(self._root, candidate)

    # Drop the temporary context attributes now that the search is done.
    for attr in ("_criterion", "_pre_split_dist", "_att_idx", "_aux_estimator"):
        delattr(self, attr)

    return best_split
def __iter__(self):
    """Yield ``(x, cumulative_target_stats)`` pairs in ascending order of x.

    The running statistics object is shared across iterations and grows
    by each node's ``y_stats`` as the traversal proceeds.
    """
    first = next(iter(self.hash.values()))
    if first.is_single_target:
        running = Var()
    else:
        running = VectorDict(default_factory=functools.partial(Var))

    for key in sorted(self.hash):
        node = self.hash[key]
        running += node.y_stats
        yield node.x_stats.get(), running
def _init_estimator(self, y):
    """Set up the target-statistics estimator from the first target seen.

    A dict target marks a multi-target task; otherwise the univariate
    machinery is used. Note that ``is_single_target`` is only flipped to
    False in the multi-target branch (its default is assumed True).
    """
    if not isinstance(y, dict):
        self.y_stats = Var()
        self._update_estimator = self._update_estimator_univariate
    else:
        self.is_single_target = False
        self.y_stats = VectorDict(default_factory=functools.partial(Var))
        self._update_estimator = self._update_estimator_multivariate
def __init__(self, att_val, target_val, sample_weight):
    """E-BST node: stores one attribute value and its target statistics."""
    self.att_val = att_val

    # A dict target marks a multi-target task.
    multi_target = isinstance(target_val, dict)
    if multi_target:
        self.estimator = VectorDict(default_factory=functools.partial(Var))
    else:
        self.estimator = Var()
    self._update_estimator = (
        self._update_estimator_multivariate
        if multi_target
        else self._update_estimator_univariate
    )

    # Fold the first observation into the freshly created estimator.
    self._update_estimator(self, target_val, sample_weight)

    self._left = None
    self._right = None
def update(self, att_val, target_val, sample_weight):
    """Fold one (attribute value, target, weight) observation into the stats.

    Observations with a missing attribute value or weight are ignored.
    A new estimator is lazily created the first time a given attribute
    value is seen; a dict target marks a multi-target task.
    """
    if att_val is None or sample_weight is None:
        return

    estimator = self._statistics.get(att_val)
    if estimator is None:  # first time this attribute value is observed
        if isinstance(target_val, dict):  # multi-target case
            estimator = VectorDict(default_factory=functools.partial(Var))
            self._update_estimator = self._update_estimator_multivariate
        else:
            estimator = Var()
        self._statistics[att_val] = estimator

    self._update_estimator(estimator, target_val, sample_weight)
def test_vectordict():
    """Exercise VectorDict: construction, copy, arithmetic, masking, export."""
    # test empty init
    x = dict()
    vx = VectorDict()
    assert vx == x

    # test basics
    x = {"a": 8, "b": -1.2, 4: 2.7}
    vx = VectorDict(x)
    assert vx == x
    assert vx["a"] == 8
    assert vx[4] == 2.7
    vx[9] = 8.9
    # without copy=True the VectorDict wraps the original dict in place
    assert x[9] == vx[9] == 8.9

    # test copy
    x = {"a": 8, "b": -1.2, 4: 2.7}
    vx = VectorDict(x, copy=True)
    assert vx == x
    vx["a"] = 2
    # copy=True detaches the VectorDict from the source dict
    assert x["a"] == 8
    assert vx["a"] == 2

    # test operations
    x = {"a": 1, "b": -5, "c": -3}
    y = {"a": 2, "b": 0.5, "d": 4}
    vx = VectorDict(x)
    vy = VectorDict(y)
    assert vx == vx == x
    assert +vx == vx == x
    assert -vx == {"a": -1, "b": 5, "c": 3}
    assert vx + 2 == 2 + vx == {"a": 3, "b": -3, "c": -1}
    assert vx * 2 == 2 * vx == {"a": 2, "b": -10, "c": -6}
    assert vx - 2 == {"a": -1, "b": -7, "c": -5}
    assert 2 - vx == -(vx - 2)
    assert vx / 2 == {"a": 0.5, "b": -2.5, "c": -1.5}
    assert 2 / vx == {"a": 2, "b": -0.4, "c": -2 / 3}
    # missing keys behave as zeros in elementwise binary operations
    assert vx + vy == vy + vx == {"a": 3, "b": -4.5, "c": -3, "d": 4}
    assert vx - vy == {"a": -1, "b": -5.5, "c": -3, "d": -4}
    assert vx**2 == pow(vx, 2) == {"a": 1, "b": 25, "c": 9}
    assert vx * vy == vy * vx == {"a": 2, "b": -2.5, "d": 0, "c": 0}
    assert vx / vx == {"a": 1.0, "b": 1.0, "c": 1.0}
    with pytest.raises(ZeroDivisionError):
        vx / vy
    assert vx @ vy == vy @ vx == -0.5
    # in-place operators mirror their binary counterparts
    vz = VectorDict(x, copy=True)
    vz += 2
    assert vz == vx + 2
    vz = VectorDict(x, copy=True)
    vz -= 2
    assert vz == vx - 2
    vz = VectorDict(x, copy=True)
    vz *= 2
    assert vz == vx * 2
    vz = VectorDict(x, copy=True)
    vz /= 2
    assert vz == vx / 2
    vz = VectorDict(x, copy=True)
    vz += vy
    assert vz == vx + vy
    vz = VectorDict(x, copy=True)
    vz -= vy
    assert vz == vx - vy
    vz = VectorDict(x, copy=True)
    vz *= vy
    assert vz == vx * vy
    vz = VectorDict(x, copy=True)
    vz /= vz
    assert vz == vx / vx
    vz = VectorDict(x, copy=True)
    vz **= 2
    assert vz == vx**2

    # test default_factory
    x = {"a": 1, "b": -5}
    y = {"b": 0.5, "d": 4, "e": 3, "f": 8}
    counter = iter(range(100))
    vx = VectorDict(x, default_factory=counter.__next__)
    vy = VectorDict(y)
    # the dot product materializes missing keys d, e, f with values 0, 1, 2
    assert vx @ vy == 16.5
    assert counter.__next__() == 3
    assert x["f"] == 2

    # test mask
    x = {"a": 1, "b": -5, "e": 2}
    y = {"b": 0.5, "d": 4, "e": 3, "f": 8}
    z = {"b": 4, "d": 2, "g": -1}
    vx = VectorDict(x)
    vy = VectorDict(y)
    assert vx + vy == vy + vx == {"a": 1, "b": -4.5, "d": 4, "e": 5, "f": 8}
    # masking vy restricts its visible keys to those of the mask
    vy = VectorDict(y, mask=z)
    assert vx + vy == vy + vx == {"a": 1, "b": -4.5, "d": 4, "e": 2}
    vy = VectorDict(y).with_mask(z.keys())
    assert vx + vy == vy + vx == {"a": 1, "b": -4.5, "d": 4, "e": 2}
    vy = VectorDict(y).with_mask(x)
    assert vy / vx == {"b": -0.1, "a": 0.0, "e": 1.5}

    # test export
    x = {"a": 1, "b": -5}
    vx = VectorDict(x)
    nx = vx.to_numpy(["b", "c"])
    assert isinstance(nx, np.ndarray)
    # keys absent from the VectorDict export as 0
    assert (vx.to_numpy(["b", "c"]) == np.array([-5, 0])).all()

    # other methods
    x = {"a": 1, "b": -5}
    vx = VectorDict(x)
    assert vx.abs() == abs(vx) == {"a": 1, "b": 5}
    assert vx.min() == -5
    assert vx.max() == 1
    assert vx.with_mask(["a"]).min() == 1
    assert vx.with_mask(["b"]).max() == -5
    assert vx.minimum(-2) == {"a": -2, "b": -5}
    assert vx.maximum(-2) == {"a": 1, "b": -2}
    y = {"b": 0.5, "c": 4}
    vy = VectorDict(y)
    # elementwise min/max over the key union, missing keys treated as 0
    assert vx.minimum(vy) == vy.minimum(vx) == {"a": 0, "b": -5, "c": 0}
    assert vx.maximum(vy) == vy.maximum(vx) == {"a": 1, "b": 0.5, "c": 4}
def __init__(self, stats, depth, attr_obs, attr_obs_params):
    """Leaf node constructor; falls back to fresh multi-target stats.

    When no statistics are supplied, a `VectorDict` of `Var` estimators
    is created so each target dimension gets its own variance tracker.
    """
    if not stats:
        stats = VectorDict(default_factory=functools.partial(Var))
    super().__init__(stats, depth, attr_obs, attr_obs_params)
def remove_bad_splits(self, criterion, last_check_ratio, last_check_vr, last_check_e, pre_split_dist):
    """Remove bad splits.

    Based on FIMT-DD's [^1] procedure to remove bad split candidates from
    the E-BST. This mechanism is triggered every time a split attempt
    fails. The rationale is to remove points whose split merit is much
    worse than the best candidate overall (for which the growth decision
    already failed).

    Let $m_1$ be the merit of the best split point and $m_2$ be the merit
    of the second best split candidate. The ratio $r = m_2/m_1$ along with
    the Hoeffding bound ($\\epsilon$) are used to decide upon creating a
    split. A split occurs when $r < 1 - \\epsilon$. A split candidate,
    with merit $m_i$, is considered bad if $m_i / m_1 < r - 2\\epsilon$.
    The rationale is the following: if the merit ratio for this point is
    smaller than the lower bound of $r$, then the true merit of that
    split relative to the best one is small. Hence, this candidate can be
    safely removed.

    To avoid excessive and costly manipulations of the E-BST to update
    the stored statistics, only the nodes whose children are all bad
    split points are pruned, as defined in [^1].

    Parameters
    ----------
    criterion
        The split criterion used by the regression tree.
    last_check_ratio
        The ratio between the merit of the second best split candidate
        and the merit of the best split candidate observed in the last
        failed split attempt.
    last_check_vr
        The merit (variance reduction) of the best split candidate
        observed in the last failed split attempt.
    last_check_e
        The Hoeffding bound value calculated in the last failed split
        attempt.
    pre_split_dist
        The complete statistics of the target observed in the leaf node.

    References
    ----------
    [^1]: Ikonomovska, E., Gama, J., & Džeroski, S. (2011). Learning
        model trees from evolving data streams. Data mining and
        knowledge discovery, 23(1), 128-168.
    """
    # Nothing to prune on an empty tree.
    if self._root is None:
        return

    # Auxiliary variables: stashed on the instance so the recursive
    # pruning helper can reach them without extra parameters.
    self._criterion = criterion
    self._pre_split_dist = pre_split_dist
    self._last_check_ratio = last_check_ratio
    self._last_check_vr = last_check_vr
    self._last_check_e = last_check_e

    # Handles both single-target and multi-target tasks
    if isinstance(pre_split_dist, VectorDict):
        self._aux_estimator = VectorDict(
            default_factory=functools.partial(Var))
    else:
        self._aux_estimator = Var()

    self._remove_bad_split_nodes(self._root)

    # Delete auxiliary variables
    del self._criterion
    del self._pre_split_dist
    del self._last_check_ratio
    del self._last_check_vr
    del self._last_check_e
    del self._aux_estimator
def __init__(self, stats, depth, splitter, **kwargs):
    """Leaf node constructor; falls back to fresh multi-target stats.

    When no statistics are supplied, a `VectorDict` of `Var` estimators
    is created so each target dimension gets its own variance tracker.
    """
    super().__init__(
        stats or VectorDict(default_factory=functools.partial(Var)),
        depth,
        splitter,
        **kwargs,
    )