示例#1
0
    def get_best_split(self, features, target, root, node, father, side):
        """Propose the best split for ``node``, or nothing if it stays a leaf.

        Args:
            features: Feature data forwarded to ``get_selector_for_node`` and
                ``find_best_split``.
            target: Target array; it is indexed with the node's selector and
                summed along axis 0.
            root: Root of the tree, used to resolve the node's selector and
                stored on the returned location.
            node: Node for which a split is searched.
            father: Forwarded unchanged to ``SplitLocation``.
            side: Forwarded unchanged to ``SplitLocation``.

        Returns:
            An ``mc.List`` holding one ``SplitLocation``, or an empty
            ``mc.List`` when the selector has fewer than 10 entries or one
            target column holds (almost) the entire target mass.
        """
        selector = get_selector_for_node(root, node, features)

        # Size guard first: on an empty selection the ratio below would be
        # 0 / 0, which only produces a NumPy warning and NaNs before the node
        # is rejected anyway.
        if len(selector) < 10:
            return mc.List([])

        selected_target = target[selector]
        # Share of the total target mass carried by each target column.
        positive_target_ratio = (np.sum(selected_target, axis=0) /
                                 np.sum(selected_target))

        # A column with (numerically) all the mass means there is nothing
        # left to separate in this node.
        if (positive_target_ratio > 1 - 1e-8).any():
            return mc.List([])

        split = find_best_split(features, target, selector, kind=self.kind)
        return mc.List([SplitLocation(split, side, father, root)])
示例#2
0
    def get_best_split(self, features, target, root, node, father, side):
        """Return the best beta-scored split for ``node`` as a 0/1-element list.

        Args:
            features: Feature data forwarded to the selector and split search.
            target: Target data forwarded to ``find_best_split_beta``.
            root: Tree root, used to resolve the selector and stored on the
                returned location.
            node: Node for which a split is searched.
            father: Forwarded unchanged to ``BetaSplitLocation``.
            side: Forwarded unchanged to ``BetaSplitLocation``.

        Returns:
            An ``mc.List`` with a single ``BetaSplitLocation`` when the
            selector has at least 10 entries and the found split has a
            strictly positive ``score``; otherwise an empty ``mc.List``.
        """
        selector = get_selector_for_node(root, node, features)
        if len(selector) < 10:
            # Too few entries to attempt a split.
            return mc.List([])

        best = find_best_split_beta(features, target, selector, self.cfg)
        if best.score > 0:
            proposals = [BetaSplitLocation(best, side, father, root)]
        else:
            # Non-positive score: the split is not worth proposing.
            proposals = []
        return mc.List(proposals)
示例#3
0
def get_path_model(model, columns=None):
    """Extract the simplified path to the last-ranked leaf of a model's tree.

    The tree in ``model.tree_initial`` is converted with ``jsonify``, parent
    links are attached by ``set_fathers``, and the leaves whose ``proba[0]`` is
    not NaN are ordered by ``proba[0]``. The last leaf in that ordering is
    turned into a path via ``node_to_path``, each entry is passed through
    ``drop_fr_dic``, and the result is reduced by ``path_redundancy_removal``.

    Args:
        model: Object exposing a ``tree_initial`` attribute.
        columns: Forwarded to ``jsonify``; defaults to ``None``.

    Returns:
        Whatever ``path_redundancy_removal`` returns for the selected path.

    NOTE(review): if every leaf has a NaN ``proba[0]`` (or there are no
    leaves) the ``[-1]`` lookup has nothing to select -- presumably this
    raises; confirm against ``mc.List`` and the callers.
    """
    tree_dict = jsonify(model.tree_initial, columns)
    set_fathers(tree_dict)

    def first_proba(leaf):
        return leaf["proba"][0]

    all_leaves = mc.List(get_all_leaves(tree_dict))
    valid_leaves = all_leaves.filter(
        lambda leaf: not np.isnan(first_proba(leaf)))
    ranked_leaves = valid_leaves.sorted(key=first_proba)

    # presumably ascending order, so [-1] is the largest proba[0] -- confirm
    # against mc.List.sorted.
    chosen_leaf = ranked_leaves[-1]
    raw_path = mc.List(node_to_path(chosen_leaf)).map(drop_fr_dic)
    return path_redundancy_removal(raw_path)
示例#4
0
    def fit(self, features, target, sample_weight=None):
        """Grow the tree greedily from a priority queue of candidate splits.

        The root split is found with ``find_best_split_beta`` over all rows,
        turned into the initial tree by ``get_basic_thump``, and the queue is
        seeded with that tree's candidate splits. Up to ``self.max_depth``
        candidates are then popped; each one that passes its
        ``emptiness_check`` is applied and the splits of the resulting node are
        pushed back onto the queue.

        Args:
            features: Feature data; ``features.shape[0]`` rows are used.
            target: Two-column target (asserted); ``shape`` and ``dtype`` are
                recorded on the instance.
            sample_weight: Forwarded to each applied split; defaults to
                ``None``.

        Side effects:
            Sets ``self.target_dim``, ``self.target_dtype`` and
            ``self.tree_initial``; prints progress when ``self.verbosity > 0``.
        """
        self.target_dim = target.shape[1]
        assert self.target_dim == 2, "only target dim 2 supported, one hot encoded"
        self.target_dtype = target.dtype

        root_split = find_best_split_beta(features, target,
                                          np.arange(target.shape[0]),
                                          self.cfg)
        if self.verbosity > 0:
            print(root_split)

        self.tree_initial = get_basic_thump(root_split.feature_num,
                                            root_split.value, 1, target,
                                            features,
                                            np.arange(features.shape[0]))
        initial_candidates = self.get_splits_for_node(self.tree_initial,
                                                      self.tree_initial,
                                                      features, target)
        pq = PriorityQueue(initial_candidates)

        if self.verbosity > 0:
            # NOTE(review): self.pvalue is not assigned anywhere in this
            # method -- presumably set elsewhere on the instance; confirm, or
            # verbose mode fails here.
            print(self.pvalue)

        for _ in range(self.max_depth):
            if pq.empty():
                break
            candidate = pq.pop()

            if self.verbosity > 0:
                print("\nn split is", str(candidate.split), end="\n")
                queue_head = mc.List(pq.container[0:3])
                print(
                    "\nqueue is ",
                    queue_head.map(lambda loc: loc.split).map(str).mk_string())

            # A rejected candidate still uses up one iteration.
            if not candidate.emptiness_check(features):
                continue

            new_node = candidate.apply(features, target, sample_weight)
            follow_ups = self.get_splits_for_node(self.tree_initial, new_node,
                                                  features, target)
            pq.push_list(follow_ups)
示例#5
0
def get_split_approx(k, num_st, n_k, num_2nd, prior, pval):
    """Pick the last-ranked of four beta-quantile candidates.

    Four calls to ``get_approx_max`` are made, each with two beta shape
    arguments, the constant 10000, and an objective comparing a beta quantile
    with ``prior``:

    * ``ppf(pval) - prior`` (used for candidates 0 and 2)
    * ``prior - ppf(1 - pval)`` (used for candidates 1 and 3)

    Each result has an integer 0..3 added to it, is reshaped to
    ``(x[0], (x[1], x[2]))``, and the list is sorted by its first component;
    the final element is returned.

    Args:
        k, num_st: Combined as ``(k + 1, num_st - k + 1)`` for candidates 0, 1.
        n_k, num_2nd: Used in the shape arguments for candidates 2 and 3.
        prior: Reference value the quantiles are compared against.
        pval: Probability passed to ``ppf`` (as ``pval`` or ``1 - pval``).

    Returns:
        The last element of the sorted ``(x[0], (x[1], x[2]))`` list.

    NOTE(review): ``+ 0`` .. ``+ 3`` is numeric addition (the original wrote
    ``+ (0)``, which is not a one-element tuple). If the intent was to append
    a candidate tag, ``(0,)`` would be required -- confirm what
    ``get_approx_max`` returns.
    NOTE(review): candidate 2 uses ``(n_k + 1, num_2nd - k + 1)`` and
    candidate 3 uses ``(k + 1, num_2nd - k + 1)``; this asymmetry looks like
    a possible copy-paste slip (``n_k`` vs ``k``) -- confirm.
    """

    def lower_quantile_minus_prior(a, b):
        return SST.beta(a, b).ppf(pval) - prior

    def prior_minus_upper_quantile(a, b):
        return prior - SST.beta(a, b).ppf(1 - pval)

    st_lower = get_approx_max(k + 1, num_st - k + 1, 10000,
                              lower_quantile_minus_prior) + 0
    st_upper = get_approx_max(k + 1, num_st - k + 1, 10000,
                              prior_minus_upper_quantile) + 1
    nd_lower = get_approx_max(n_k + 1, num_2nd - k + 1, 10000,
                              lower_quantile_minus_prior) + 2
    nd_upper = get_approx_max(k + 1, num_2nd - k + 1, 10000,
                              prior_minus_upper_quantile) + 3

    candidates = mc.List([st_lower, st_upper, nd_lower, nd_upper])
    keyed = candidates.map(lambda c: (c[0], (c[1], c[2])))
    return keyed.sort(lambda c: c[0])[-1]
示例#6
0
def node_to_path(node):
    """Build the chain of nodes from the topmost ancestor down to ``node``.

    Starting from a shallow copy of ``node``, the ``"father"`` links are
    followed upwards. Each father is shallow-copied and tagged with a
    ``"sign"`` derived from the ``"side"`` of the child it was reached from:
    ``">"`` for ``"right"`` and ``"<"`` for ``"left"``; any other side leaves
    the copy untagged. The collected chain is returned reversed, so the node
    without a ``"father"`` key comes first and the copy of ``node`` last.

    Args:
        node: Mapping with a ``"side"`` key whenever it has a ``"father"`` key.

    Returns:
        The reversed ``mc.List``; the entry for ``node`` itself is the plain
        shallow copy, every ancestor entry is wrapped in ``mc.Dict``.
    """
    import copy

    current = copy.copy(node)
    chain = mc.List([current])
    while "father" in current:
        # Copy so the sign annotation never touches the original tree.
        parent = copy.copy(current["father"])
        side = current["side"]
        if side == "right":
            parent["sign"] = ">"
        elif side == "left":
            parent["sign"] = "<"
        chain.append(mc.Dict(parent))
        current = parent
    return chain[::-1]
示例#7
0
    def fit(self, features, target, sample_weight=None):
        """Grow the tree greedily until the accumulated p-value budget is spent.

        The root split is found with ``find_best_split`` over all rows and
        turned into the initial tree by ``get_basic_thump``. ``self.pvalue``
        starts at ``1 - root_split.pvalue()`` and is multiplied by
        ``1 - split.pvalue()`` for every further split that is accepted.
        Candidates are popped from a priority queue; growth stops after
        ``self.max_depth`` iterations, when the queue runs dry, or when
        accepting the next candidate would push ``self.pvalue`` below
        ``self.pvalue_limit``.

        Args:
            features: Feature data; ``features.shape[0]`` rows are used.
            target: Two-column target (asserted); ``shape`` and ``dtype`` are
                recorded on the instance.
            sample_weight: Forwarded to each applied split; defaults to
                ``None``.

        Side effects:
            Sets ``self.target_dim``, ``self.target_dtype``,
            ``self.tree_initial`` and ``self.pvalue``; prints progress when
            ``self.verbosity > 0``.
        """
        self.target_dim = target.shape[1]
        assert self.target_dim == 2, "only target dim 2 supported, one hot encoded"
        self.target_dtype = target.dtype
        split = find_best_split(features,
                                target,
                                np.arange(target.shape[0]),
                                kind=self.kind)
        if self.verbosity > 0:
            print(split)
        self.tree_initial = get_basic_thump(split.feature_num, split.value, 1,
                                            target, features,
                                            np.arange(features.shape[0]))
        pq = PriorityQueue(
            self.get_splits_for_node(self.tree_initial, self.tree_initial,
                                     features, target))
        self.pvalue = 1.0 - split.pvalue()

        if self.verbosity > 0:
            print(self.pvalue)
        for i in range(self.max_depth):
            # No candidates left: stop instead of popping from an empty queue.
            if pq.empty():
                break
            split = pq.pop()

            # Evaluate the candidate value before committing it. The previous
            # multiply-then-divide rollback divided by zero when a split's
            # pvalue() was 1 and otherwise left rounding drift in self.pvalue.
            next_pvalue = self.pvalue * (1 - split.split.pvalue())

            if next_pvalue < self.pvalue_limit:
                if self.verbosity > 0:
                    print(
                        f"Next pval would be {next_pvalue}, final complexity {i-1}"
                    )
                break
            self.pvalue = next_pvalue

            if self.verbosity > 0:
                print(str(split.split), end="    ")
                print(self.pvalue, end="    ")
                print(
                    "que",
                    mc.List(pq.container[0:3]).map(lambda x: x.split).map(
                        str).mk_string())
            new_node = split.apply(features, target, sample_weight)
            pq.push_list(
                self.get_splits_for_node(self.tree_initial, new_node, features,
                                         target))