def enumerate_splits(self, target_dist: proba.Multinomial):
    """Generate candidate ``x < threshold`` splits from the per-class histograms.

    Thresholds are taken from ``decimal_range`` between the smallest first-bin
    right edge and the smallest last-bin right edge found in ``self.hists``.
    At most ``self.n_splits`` thresholds are requested, and never more than
    the longest histogram's length minus one.

    Parameters:
        target_dist: Class distribution of the samples to split. It is
            iterated to obtain the classes, and its ``pmf`` and ``n_samples``
            are used to weight each class.

    Yields:
        Tuples ``(LT, threshold, left, right)``. ``left`` and ``right`` are
        ``proba.Multinomial`` objects built from per-class weights
        ``n_samples * P(y) * P(x < t | y)`` and
        ``n_samples * P(y) * (1 - P(x < t | y))`` respectively.
        Nothing is yielded when the threshold range is empty.
    """
    histograms = self.hists

    # NOTE(review): both bounds use ``min``; the upper bound is therefore the
    # smallest of the last right edges across classes -- confirm this is intended.
    lower = min(hist[0].right for hist in histograms.values())
    upper = min(hist[-1].right for hist in histograms.values())

    # If only one single value has been observed, then no split can be proposed.
    # The function stays a generator thanks to the ``yield`` in the loop below,
    # so a bare ``return`` simply produces an empty iteration.
    if lower >= upper:
        return

    longest = max(len(hist) for hist in histograms.values())
    n_thresholds = min(self.n_splits, longest - 1)
    candidates = list(decimal_range(start=lower, stop=upper, num=n_thresholds))

    # One CDF iterator per class; each one is advanced once per candidate
    # threshold, so they must be consumed in lockstep with ``candidates``.
    cdf_iters = {label: hist.iter_cdf(candidates) for label, hist in histograms.items()}

    for threshold in candidates:
        left_weights = {}
        right_weights = {}

        for label in target_dist:
            # P(x < t | y); a class without a histogram puts no mass on the left
            if label in cdf_iters:
                p_below = next(cdf_iters[label])
            else:
                p_below = 0.

            # n_samples * P(y): weight of this class before splitting
            class_weight = target_dist.n_samples * target_dist.pmf(label)

            left_weights[label] = class_weight * p_below  # proportional to P(y | x < t)
            right_weights[label] = class_weight * (1 - p_below)  # proportional to P(y | x >= t)

        left = proba.Multinomial(left_weights)
        right = proba.Multinomial(right_weights)
        yield LT, threshold, left, right
def enumerate_splits(self, target_dist: proba.Multinomial):
    """Generate candidate ``x == category`` splits from the per-class category counts.

    Parameters:
        target_dist: Class distribution of the samples to split. It is
            iterated to obtain the classes, and its ``pmf`` and ``n_samples``
            are used to weight each class.

    Yields:
        Tuples ``(EQ, category, left, right)``, one per distinct category found
        in ``self.P_xy``. ``left`` and ``right`` are ``proba.Multinomial``
        objects built from per-class weights ``n_samples * P(y) * P(cat | y)``
        and ``n_samples * P(y) * (1 - P(cat | y))`` respectively.
        Nothing is yielded when fewer than two categories have been seen.
    """
    # Merge the categories seen under every class. They have to be combined
    # with ``union``: unpacking them directly into ``set(...)`` passes one
    # positional argument per class, and ``set`` accepts at most one iterable,
    # so that form raises ``TypeError`` as soon as two classes are present.
    categories = set().union(*(p_x.keys() for p_x in self.P_xy.values()))

    # There has to be at least two categories for a split to be possible.
    # The function stays a generator thanks to the ``yield`` in the loop below,
    # so a bare ``return`` simply produces an empty iteration.
    if len(categories) < 2:
        return

    for cat in categories:
        l_dist = {}
        r_dist = {}

        for y in target_dist:
            p_xy = self.P_xy[y].pmf(cat)  # P(cat | y)
            p_y = target_dist.pmf(y)  # P(y)
            weight = target_dist.n_samples * p_y  # n_samples * P(y)
            l_dist[y] = weight * p_xy  # proportional to P(y | cat)
            r_dist[y] = weight * (1. - p_xy)  # proportional to P(y | !cat)

        l_dist = proba.Multinomial(l_dist)
        r_dist = proba.Multinomial(r_dist)

        yield EQ, cat, l_dist, r_dist
def __init__(self, criterion='gini', patience=250, max_depth=5, min_split_gain=0.,
             min_child_samples=20, confidence=1e-10, tie_threshold=5e-2, n_split_points=30,
             max_bins=60, curtail_under=10):
    """Store the hyperparameters and create the initial root leaf.

    Every argument is kept unchanged on the instance under an attribute of the
    same name. ``criterion`` is additionally used as a key into
    ``CRITERIA_CLF`` to resolve ``self.criterion_func``.

    NOTE(review): how each hyperparameter is used is not visible from this
    method; see the code that reads the corresponding attribute.

    Raises:
        Whatever ``CRITERIA_CLF[criterion]`` raises for an unknown criterion
        (``KeyError`` if ``CRITERIA_CLF`` is a plain dict).
    """
    # Split-quality settings
    self.criterion = criterion
    self.min_split_gain = min_split_gain
    self.confidence = confidence
    self.tie_threshold = tie_threshold

    # Growth settings
    self.patience = patience
    self.max_depth = max_depth
    self.min_child_samples = min_child_samples
    self.curtail_under = curtail_under

    # Split-candidate settings
    self.n_split_points = n_split_points
    self.max_bins = max_bins

    # Resolve the criterion name to its implementation
    self.criterion_func = CRITERIA_CLF[criterion]

    # The root is created last because the leaf receives a reference to this
    # partially built tree and may read the attributes set above.
    empty_dist = proba.Multinomial()
    self.root = leaf.Leaf(depth=0, tree=self, target_dist=empty_dist)
def _make_leaf_dist(self):
    """Return a new ``proba.Multinomial`` constructed with no arguments.

    NOTE(review): presumably the target distribution given to newly created
    leaves -- confirm against the callers.
    """
    fresh_dist = proba.Multinomial()
    return fresh_dist