Exemplo n.º 1
0
        def recurse(node, depth, rule=None):
            """Walk from ``node`` towards a leaf, always taking the purer child.

            At every internal node one ``Condition`` describing the chosen
            branch is appended to ``rule``; the walk stops at the first node
            whose feature is ``_tree.TREE_UNDEFINED``.

            Args:
                node: index of the tree node to start from.
                depth: current depth; only passed on (incremented) to the
                    recursive call.
                rule: list to which conditions are appended in place. When
                    omitted, a new empty list is created for this call.

            Returns:
                The list of collected ``Condition`` objects (the same list
                object as ``rule`` when one was passed in).
            """
            # A ``rule=[]`` default would be created once and shared by every
            # call, so conditions from earlier walks would leak into later
            # results. Create the accumulator per call instead.
            if rule is None:
                rule = []

            if tree_.feature[node] == _tree.TREE_UNDEFINED:
                return rule

            feature = feature_name[node]
            threshold = tree_.threshold[node]
            # left <= threshold < right
            child_l = tree_.children_left[node]
            child_r = tree_.children_right[node]
            impure_l, impure_r = tree_.impurity[child_l], tree_.impurity[child_r]
            # Ties go to the right child, as only a strictly lower impurity
            # selects the left one.
            go_left = impure_l < impure_r

            if ds_context.parametric_types[feature] == Categorical:
                # assuming binary! The split threshold lies between the two
                # category codes, so floor/ceil recover the code of the
                # left/right side respectively.
                op = np.equal
                if go_left:
                    value = int(np.floor(threshold))
                else:
                    value = int(np.ceil(threshold))
            else:  # todo nonbinary case? then broken?
                op = np.less_equal if go_left else np.greater
                value = threshold

            rule.append(Condition(feature, op, value))
            next_node = child_l if go_left else child_r
            return recurse(next_node, depth + 1, rule)
Exemplo n.º 2
0
def get_labeled_rule(head, body, value_dict):
    """Translate an index-encoded (head, body) pair into its labeled form.

    Every condition's variable is looked up in ``value_dict``; each entry is
    unpacked as ``(_, name, index_to_label)``. The condition is rebuilt with
    ``name`` as its variable and ``index_to_label[threshold]`` as its
    threshold. Body conditions keep their operator; the head is always
    rebuilt with ``np.equal``.

    Only a single head condition is supported for now.

    Args:
        head: ``Condition`` forming the rule head.
        body: ``Rule`` whose conditions form the rule body.
        value_dict: mapping from a condition's ``var`` to the triple
            described above.

    Returns:
        Tuple ``(labeled_head, labeled_body)`` where ``labeled_body`` is a
        new ``Rule``.

    Raises:
        AssertionError: if the argument types are wrong, or if ``body``
            already contains a condition on the head variable.
    """
    assert isinstance(body, Rule) and isinstance(head, Condition)

    labeled_conditions = []
    for cond in body:
        _, var_name, i2str = value_dict[cond.var]
        labeled_conditions.append(
            Condition(var_name, cond.op, i2str[cond.threshold]))

    _, head_name, head_dict = value_dict[head.var]
    # The head variable must not also appear in the body.
    assert len(body.get_similar_conditions(head.var)) == 0
    labeled_head = Condition(head_name, np.equal, head_dict[head.threshold])
    return labeled_head, Rule(labeled_conditions)
Exemplo n.º 3
0
def get_leaf_rules(leaf):
    """Pair the leaf's rule with one equality head per value of the leaf.

    For every index ``i`` in ``range(len(leaf.p))`` a head
    ``Condition(leaf.scope[0], np.equal, i)`` is created.

    Returns:
        List of ``[leaf.rule, head]`` pairs, one per index.

    Raises:
        AssertionError: if ``leaf.rule`` already holds a condition on
            ``leaf.scope[0]``.
    """
    assert len(leaf.rule.get_similar_conditions(leaf.scope[0])) == 0
    return [
        [leaf.rule, Condition(leaf.scope[0], np.equal, value_idx)]
        for value_idx in range(len(leaf.p))
    ]
Exemplo n.º 4
0
def format_mlxtend2rule_ex(head=None, body=None):
    """Convert itemset-style head/body collections into Condition/Rule objects.

    Every item becomes ``Condition(item, np.equal, 1)``.

    Args:
        head: collection holding exactly one item, or a falsy value to skip.
        body: iterable of items, or a falsy value to skip.

    Returns:
        ``(head_condition, Rule(body_conditions))`` when both are given,
        only the head condition or only the ``Rule`` when just one is
        given, and ``None`` when neither is.

    Raises:
        AssertionError: if ``head`` holds more than one item.
    """
    has_head = bool(head)
    if has_head:
        assert len(head) == 1, 'longer heads not implemented'
        head_condition = Condition(tuple(head)[0], np.equal, 1)

    has_body = bool(body)
    if has_body:
        body_conditions = [Condition(item, np.equal, 1) for item in body]

    if has_head and has_body:
        return head_condition, Rule(body_conditions)
    if has_head:
        return head_condition
    if has_body:
        return Rule(body_conditions)
    return None
Exemplo n.º 5
0
def reverse_label_rule(body, head, value_dict):
    """Collect index-valued conditions for the labeled conditions in ``body``.

    For each body condition, entries of ``value_dict`` are searched for a
    matching variable name; for every ``(index, label)`` pair of that entry
    whose label equals the condition's threshold, a new ``Condition`` with
    the index as threshold is appended to ``new_cs``.

    NOTE(review): as visible here the function never returns ``new_cs`` and
    never uses ``head`` -- the definition may be truncated; confirm against
    the full source before relying on it.
    """
    new_cs = []
    for cond in body:
        # NOTE(review): iterating ``value_dict`` directly and unpacking
        # triples assumes it yields (_, name, index_to_label) tuples; if it
        # is a dict this iterates its keys instead -- confirm.
        for _, var_name, i2str in value_dict:
            # NOTE(review): ``cond[0]`` is presumably the variable name
            # (other fields are read via attributes) -- confirm Condition
            # supports indexing.
            if var_name == cond[0]:
                # Reverse lookup: find the index whose label is the threshold.
                for i, s in i2str.items():
                    if s == cond.threshold:
                        new_cs.append(Condition(var_name, cond.op, i))
Exemplo n.º 6
0
            def yield_rules(eligable_leaves, ):
                """Yield one ``Rule`` per combination of eligible leaves.

                Each leaf in a combination contributes a condition
                ``leaf == argmax(p)``: a ``Categorical`` uses its own
                ``scope[0]`` and ``p``; an ``int`` is treated as a variable
                index whose distribution comes from ``p_from_scope`` and is
                only used if its maximum reaches ``self.min_local_p``.
                Combinations that end up without any condition are skipped.

                Raises:
                    ValueError: for a leaf that is neither ``Categorical``
                        nor ``int``.
                """
                leaves = list(eligable_leaves)
                max_len = min((self.body_max_len, len(leaves)))
                # Size 0 only produces the empty combination, which never
                # yields a rule, so enumeration starts at 1.
                # NOTE(review): sizes stop at ``max_len - 1``, so bodies of
                # exactly ``body_max_len`` conditions are never generated --
                # confirm whether ``range(1, max_len + 1)`` was intended.
                for size in range(1, max_len):
                    for rule_leaves in itertools.combinations(leaves, size):
                        conditions = []
                        for var in rule_leaves:
                            if isinstance(var, Categorical):
                                conditions.append(
                                    Condition(var.scope[0], np.equal,
                                              np.argmax(var.p)))
                            elif isinstance(var, int):
                                varp = p_from_scope(node, var, value_dict)
                                # A falsy ``min_local_p`` means "no
                                # threshold"; comparing against ``None``
                                # would raise TypeError.
                                if (not self.min_local_p
                                        or max(varp) >= self.min_local_p):
                                    conditions.append(
                                        Condition(var, np.equal,
                                                  np.argmax(varp)))
                            else:
                                raise ValueError(var)
                        if conditions:
                            yield Rule(conditions)
Exemplo n.º 7
0
def rule_str2idx(r, value_dict):
    """Convert labeled condition(s) back to their index-encoded form.

    For every condition, ``value_dict`` is searched for entries whose name
    equals the condition's ``var``; the entry's key becomes the new variable
    and the inverse of its ``index -> label`` mapping turns the labeled
    threshold back into an index. The resulting conditions always use
    ``np.equal``, regardless of the input condition's operator.

    Args:
        r: a single ``Condition`` or an iterable of conditions with a
            length (e.g. a ``Rule``).
        value_dict: mapping ``index -> (_, name, index_to_label)``.

    Returns:
        A ``Condition`` when a single ``Condition`` was passed in,
        otherwise a ``Rule``.

    Raises:
        AssertionError: if the number of converted conditions differs from
            the number of input conditions.
        KeyError: if a threshold label is not among an entry's values.
    """
    # Record the input kind before wrapping: once the condition is placed
    # in a list, an isinstance check on that list can never succeed.
    single = isinstance(r, Condition)
    conditions = [r] if single else r

    res_conds = []
    for cond in conditions:
        for i, (_, name, vals) in value_dict.items():
            if name == cond.var:
                inv_vals = {v: k for k, v in vals.items()}
                res_conds.append(
                    Condition(i, np.equal, inv_vals[cond.threshold]))
    assert len(res_conds) == len(conditions)

    if single:
        return res_conds[0]
    return Rule(res_conds)
Exemplo n.º 8
0
        def _leaves_target_allrules(
            node,
            target,
            vars,
            value_dict,
            root=None,
            targetp=None,
            ncandidates=None,
        ):
            """Generate rules from the given leaves towards ``target``.

            Steps:
              1. Keep the leaves in ``vars`` that pass the local filters
                 (``self.min_local_p`` and ``self.min_local_js``; each is
                 skipped when falsy).
              2. Build candidate bodies from combinations of those leaves.
              3. Build one head per target value whose probability exceeds
                 its prior.
              4. For each not-yet-seen ``(head, body)`` pair compute rule
                 statistics and yield the pair if it passes the confidence,
                 recall and criterion thresholds. The outcome is recorded in
                 ``self.rules_yielded``.

            Args:
                node: passed to ``p_from_scope`` to obtain the distribution
                    of integer-indexed variables.
                target: ``Categorical`` or a variable index.
                vars: iterable of ``Categorical`` objects or ``int`` indices.
                value_dict: per-variable metadata; ``value_dict[var][2]`` is
                    used for its length when computing the 'above_random'
                    confidence.
                root: passed to ``calculate_prior`` and ``rule_stats``.
                targetp: target distribution; replaced by ``target.p`` when
                    ``target`` is a ``Categorical``.
                ncandidates: currently unused.

            Yields:
                ``[head, body, *real_stats]`` for every accepted rule.

            Raises:
                ValueError: for an element of ``vars`` that is neither
                    ``Categorical`` nor ``int``.
            """
            eligable_leaves = []

            for var in vars:
                if isinstance(var, Categorical):
                    varp = var.p
                    if self.min_local_p and max(var.p) < self.min_local_p:
                        continue
                    leaf = var.scope[0]
                elif isinstance(var, int):
                    varp = p_from_scope(node, var, value_dict)
                    if self.min_local_p and max(varp) < self.min_local_p:
                        continue
                    leaf = var
                else:
                    raise ValueError(type(var))
                if self.min_local_js:
                    # NOTE(review): ``var`` (possibly a Categorical object)
                    # is passed here, while the target prior below is
                    # computed from a scope index -- confirm which one
                    # ``calculate_prior`` expects.
                    leaf_prior = self.prior_gen.calculate_prior(
                        root, var, value_dict)
                    js = jensenshannon(leaf_prior, varp)
                    if js >= self.min_local_js:
                        eligable_leaves.append(leaf)
                else:
                    eligable_leaves.append(leaf)

            def yield_rules(eligable_leaves, ):
                """Yield one ``Rule`` per combination of eligible leaves."""
                leaves = list(eligable_leaves)
                max_len = min((self.body_max_len, len(leaves)))
                # Size 0 only produces the empty combination, which never
                # yields a rule, so enumeration starts at 1.
                # NOTE(review): sizes stop at ``max_len - 1``, so bodies of
                # exactly ``body_max_len`` conditions are never generated --
                # confirm whether ``range(1, max_len + 1)`` was intended.
                for size in range(1, max_len):
                    for rule_leaves in itertools.combinations(leaves, size):
                        conditions = []
                        for var in rule_leaves:
                            if isinstance(var, Categorical):
                                conditions.append(
                                    Condition(var.scope[0], np.equal,
                                              np.argmax(var.p)))
                            elif isinstance(var, int):
                                varp = p_from_scope(node, var, value_dict)
                                # Same convention as the filter above: a
                                # falsy ``min_local_p`` disables the check
                                # (comparing against ``None`` would raise
                                # TypeError).
                                if (not self.min_local_p
                                        or max(varp) >= self.min_local_p):
                                    conditions.append(
                                        Condition(var, np.equal,
                                                  np.argmax(varp)))
                            else:
                                raise ValueError(var)
                        if conditions:
                            yield Rule(conditions)

            if isinstance(target, Categorical):
                targetp = target.p
                target = target.scope[0]

            # todo use all heads where p[head] > p[prior_head] ?
            heads = []
            n_values = len(targetp)
            if n_values:
                # The prior does not depend on ``val``; compute it once
                # instead of once per target value.
                # NOTE(review): ``spn`` is not a parameter of this function
                # (the leaf prior above uses ``root``) -- confirm it is
                # provided by the enclosing scope.
                target_prior = self.prior_gen.calculate_prior(
                    spn, target, value_dict)
                heads = [
                    Condition(target, np.equal, val)
                    for val in range(n_values)
                    if targetp[val] > target_prior[val]
                ]

            # local rule quality check
            candidate_rules = list(yield_rules(eligable_leaves, ))
            for r in candidate_rules:
                for head in heads:
                    if (head, r) in self.rules_yielded:
                        continue
                    # exact stats only evaluation TODO if you don't need
                    # evaluation, only use spn_stats
                    # NOTE(review): these are computed even for rules that
                    # are rejected below; if ``rule_stats`` has no side
                    # effects this call could move into the accepting branch.
                    real_stats = rule_stats(root,
                                            r,
                                            head,
                                            metrics=self.metrics,
                                            real_data=self.df,
                                            beta=self.beta,
                                            value_dict=value_dict)
                    # local quickly calculated stats
                    spn_stats = rule_stats(root,
                                           r,
                                           head,
                                           metrics=self.metrics,
                                           beta=self.beta,
                                           value_dict=value_dict)
                    if isinstance(
                            self.min_global_conf, str
                    ) and self.min_global_conf == 'above_random':
                        # Confidence of a uniform guess over the head
                        # variable's values.
                        min_conf = 1. / len(value_dict[head.var][2])
                    else:
                        min_conf = self.min_global_conf

                    criterion_value = spn_stats[self.metrics.index(
                        self.criterion)]
                    if self.criterion == 'cosine_distance':
                        # Compared with <=, unlike every other criterion.
                        better_than_crit = (criterion_value <=
                                            self.min_global_criterion)
                    else:
                        better_than_crit = (criterion_value >=
                                            self.min_global_criterion)

                    if spn_stats[self.metrics.index('conf')] >= min_conf \
                        and spn_stats[self.metrics.index('recall')] >= self.min_recall \
                        and better_than_crit:
                        self.rules_yielded[(head, r)] = True
                        yield [head, r, *real_stats]
                    else:
                        self.rules_yielded[(head, r)] = False