def recurse(node, depth, rule=None):
    """Walk from ``node`` down to a leaf, always taking the purer child.

    At every split the child with the strictly lower impurity is followed
    (ties go to the right child) and one ``Condition`` describing that
    decision is appended to ``rule``.

    ``tree_``, ``_tree``, ``feature_name``, ``ds_context``, ``Categorical``
    and ``Condition`` are not defined in this function; they are resolved
    from the enclosing/module scope.

    Args:
        node: Index of the tree node to start from.
        depth: Depth counter; it is only incremented and passed along to the
            recursive call.
        rule: List that collected conditions are appended to (mutated in
            place). A new list is created when omitted.

    Returns:
        The list of conditions along the followed path.
    """
    # The former ``rule=[]`` default was created once at definition time and
    # shared by every call, so conditions leaked from one call into the next.
    if rule is None:
        rule = []

    if tree_.feature[node] == _tree.TREE_UNDEFINED:
        # Nothing left to split on: the path is complete.
        return rule

    feature = feature_name[node]
    threshold = tree_.threshold[node]
    # left <= threshold < right
    child_l, child_r = tree_.children_left[node], tree_.children_right[node]
    impure_l, impure_r = tree_.impurity[child_l], tree_.impurity[child_r]
    go_left = impure_l < impure_r

    if ds_context.parametric_types[feature] == Categorical:
        # assuming binary! The threshold is rounded to the category index on
        # the chosen side of the split.
        if go_left:
            value = int(np.floor(threshold))
        else:
            value = int(np.ceil(threshold))
        rule.append(Condition(feature, np.equal, value))
    else:
        # todo nonbinary case? then broken?
        op = np.less_equal if go_left else np.greater
        rule.append(Condition(feature, op, threshold))

    next_node = child_l if go_left else child_r
    return recurse(next_node, depth + 1, rule)
def get_labeled_rule(head, body, value_dict):
    """Translate an index-encoded rule into its labeled form.

    Every condition's variable is replaced by the name stored in
    ``value_dict[cond.var]`` and its threshold by the matching entry of that
    variable's index-to-label mapping. Only a single-condition head is
    supported for now.

    Args:
        head: ``Condition`` acting as the consequent.
        body: ``Rule`` whose conditions form the antecedent.
        value_dict: Lookup indexed by ``cond.var``; each entry unpacks into
            three items, of which the second is the variable name and the
            third maps thresholds to labels.

    Returns:
        ``(labeled_head, labeled_body)`` where the head is a ``Condition``
        using ``np.equal`` and the body is a ``Rule``.

    Raises:
        AssertionError: If the argument types are wrong or the body already
            contains a condition on the head variable.
    """
    assert isinstance(body, Rule) and isinstance(head, Condition)

    def _relabel(cond):
        # Body conditions keep their original operator.
        _, name, idx_to_label = value_dict[cond.var]
        return Condition(name, cond.op, idx_to_label[cond.threshold])

    labeled_conditions = [_relabel(cond) for cond in body]

    _, head_name, head_labels = value_dict[head.var]
    clashes = body.get_similar_conditions(head.var)
    assert len(clashes) == 0

    return (
        Condition(head_name, np.equal, head_labels[head.threshold]),
        Rule(labeled_conditions),
    )
def get_leaf_rules(leaf):
    """Pair the leaf's rule with one candidate head per entry of ``leaf.p``.

    Args:
        leaf: Object exposing ``rule``, ``scope`` and ``p``. The head
            variable is ``leaf.scope[0]``.

    Returns:
        A list of ``[leaf.rule, Condition(leaf.scope[0], np.equal, i)]``
        pairs, one for each index ``i`` of ``leaf.p``.

    Raises:
        AssertionError: If ``leaf.rule`` already has a condition on the
            head variable.
    """
    target = leaf.scope[0]
    assert len(leaf.rule.get_similar_conditions(target)) == 0
    body = leaf.rule
    return [
        [body, Condition(target, np.equal, value)]
        for value in range(len(leaf.p))
    ]
def format_mlxtend2rule_ex(
        head=None,
        body=None,
):
    """Convert item collections into ``Condition``/``Rule`` objects.

    Each item ``x`` becomes ``Condition(x, np.equal, 1)``.

    Args:
        head: Optional collection holding exactly one item.
        body: Optional iterable of items.

    Returns:
        ``(head_condition, Rule)`` when both are given and non-empty, only
        the head ``Condition`` or only the body ``Rule`` when just one is,
        and ``None`` when neither is.

    Raises:
        AssertionError: If a non-empty ``head`` does not have exactly one
            item.
    """
    has_head = bool(head)
    has_body = bool(body)

    head_cond = None
    if has_head:
        assert len(head) == 1, 'longer heads not implemented'
        head_cond = Condition(tuple(head)[0], np.equal, 1)

    body_rule = None
    if has_body:
        body_rule = Rule([Condition(item, np.equal, 1) for item in body])

    if has_head and has_body:
        return head_cond, body_rule
    if has_head:
        return head_cond
    if has_body:
        return body_rule
    return None
def reverse_label_rule(body, head, value_dict):
    """Map the labeled thresholds of ``body`` back to their indices.

    For each condition, the ``value_dict`` entry whose name equals
    ``cond.var`` is searched, and every index whose label equals
    ``cond.threshold`` yields a ``Condition(var_name, cond.op, index)``.

    Fixes relative to the previous version:
      * ``value_dict`` was iterated directly; for a mapping this yields keys
        instead of the ``(_, var_name, i2str)`` triples that get unpacked.
      * ``cond[0]`` was mixed with attribute access; ``cond.var`` is used
        consistently now.
      * the collected conditions were discarded (no ``return``).

    Args:
        body: Iterable of conditions exposing ``var``, ``op`` and
            ``threshold``.
        head: Accepted for interface compatibility; not used.
        value_dict: Mapping whose values are ``(_, var_name, i2str)``
            triples, or directly an iterable of such triples.

    Returns:
        A ``Rule`` built from the index-encoded conditions.
    """
    # NOTE(review): the variable *name* is kept in the resulting condition,
    # exactly as before; only the threshold is converted. Confirm whether the
    # variable should be converted to its index as well.
    if isinstance(value_dict, dict):
        entries = list(value_dict.values())
    else:
        entries = list(value_dict)

    new_cs = []
    for cond in body:
        for _, var_name, i2str in entries:
            if var_name != cond.var:
                continue
            for i, s in i2str.items():
                if s == cond.threshold:
                    new_cs.append(Condition(var_name, cond.op, i))
    return Rule(new_cs)
def yield_rules(eligable_leaves, ):
    """Generate candidate rule bodies from combinations of eligible leaves.

    Combinations of ``eligable_leaves`` are enumerated by increasing size and
    each one is converted to a ``Rule`` of equality conditions on the most
    probable value of every leaf. Combinations that produce no condition
    (including the empty combination) are skipped.

    ``self``, ``node``, ``value_dict``, ``p_from_scope``, ``Categorical``,
    ``Condition`` and ``Rule`` are not defined here; they are resolved from
    the enclosing/module scope.

    Args:
        eligable_leaves: Iterable of ``Categorical`` objects or ``int``
            scopes.

    Yields:
        ``Rule`` objects with at least one condition.

    Raises:
        ValueError: If a leaf is neither a ``Categorical`` nor an ``int``.
    """

    def _to_condition(leaf):
        # Returns None when an int-scoped leaf is not confident enough.
        if isinstance(leaf, Categorical):
            return Condition(leaf.scope[0], np.equal, np.argmax(leaf.p))
        if isinstance(leaf, int):
            dist = p_from_scope(node, leaf, value_dict)
            if max(dist) >= self.min_local_p:
                return Condition(leaf, np.equal, np.argmax(dist))
            return None
        raise ValueError(leaf)

    pool = list(eligable_leaves)
    size_limit = min((self.body_max_len, len(pool)))
    # NOTE(review): range() stops at size_limit - 1, so bodies with exactly
    # ``body_max_len`` conditions are never produced - confirm this is
    # intended.
    for size in range(size_limit):
        for combo in itertools.combinations(pool, size):
            if len(combo) > self.body_max_len:
                break
            candidates = (_to_condition(leaf) for leaf in combo)
            conditions = [c for c in candidates if c is not None]
            if conditions:
                yield Rule(conditions)
def rule_str2idx(r, value_dict):
    """Convert a labeled condition or rule back to its index encoding.

    Each condition's variable name is looked up among the ``value_dict``
    entries; the condition is rebuilt with the entry's key as variable and
    the index of ``cond.threshold`` in the entry's (inverted) value mapping
    as threshold. The operator is always ``np.equal``; ``cond.op`` is not
    consulted.

    Previously a single ``Condition`` was wrapped into a list and the name
    ``r`` rebound, so the final ``isinstance(r, Condition)`` test could never
    succeed and a ``Rule`` was returned even for a single condition. The
    input kind is now remembered before wrapping.

    Args:
        r: A ``Condition`` or an iterable of conditions (e.g. a ``Rule``).
        value_dict: Mapping from index to ``(_, name, vals)`` where ``vals``
            maps value indices to labels.

    Returns:
        A ``Condition`` if a single ``Condition`` was given, otherwise a
        ``Rule``.

    Raises:
        AssertionError: If the number of converted conditions differs from
            the number of input conditions (unknown or duplicated names).
        KeyError: If a threshold label is not present in the matched entry.
    """
    single = isinstance(r, Condition)
    conds = [r] if single else r

    res_conds = []
    for cond in conds:
        for i, (_, name, vals) in value_dict.items():
            if name == cond.var:
                # label -> index, the reverse of the stored mapping
                inv_vals = {v: k for k, v in vals.items()}
                res_conds.append(
                    Condition(i, np.equal, inv_vals[cond.threshold]))
    assert len(res_conds) == len(conds)

    if single:
        return res_conds[0]
    return Rule(res_conds)
def _leaves_target_allrules(
        node,
        target,
        vars,
        value_dict,
        root=None,
        targetp=None,
        ncandidates=None,
):
    """Generate rules ``body -> head`` for ``target`` that pass the thresholds.

    The function works in three steps:

    1. Filter ``vars`` into eligible leaves using ``self.min_local_p`` and,
       when set, ``self.min_local_js``.
    2. Build the candidate heads: every value of the target whose
       probability in ``targetp`` exceeds its prior.
    3. Combine each candidate body with each head, compute its statistics
       with ``rule_stats`` and yield the rule when confidence, recall and the
       configured criterion pass.

    ``self`` and ``spn`` are not parameters of this function; they are
    resolved from the enclosing scope (NOTE(review): ``spn`` is used for the
    head prior while ``root`` is used for the leaf prior - confirm both refer
    to the intended network).

    Args:
        node: Passed to ``p_from_scope`` to obtain the distribution of an
            ``int`` scope.
        target: A ``Categorical`` (then ``targetp`` and the scope are taken
            from it) or a scope used as is together with ``targetp``.
        vars: Iterable of ``Categorical`` objects or ``int`` scopes that may
            appear in rule bodies.
        value_dict: Lookup indexed by variable; the third item of an entry
            is measured with ``len`` for the ``'above_random'`` confidence.
        root: Passed to ``calculate_prior`` and ``rule_stats``.
        targetp: Distribution of the target; overwritten when ``target`` is
            a ``Categorical``.
        ncandidates: Not used in this function.

    Yields:
        ``[head, body, *real_stats]`` for every accepted rule.

    Raises:
        ValueError: If an element of ``vars`` (or of the eligible leaves) is
            neither a ``Categorical`` nor an ``int``.
    """
    eligable_leaves = []
    for var in vars:
        if isinstance(var, Categorical):
            varp = var.p
            # Skip leaves whose most probable value is not confident enough.
            if self.min_local_p and max(var.p) < self.min_local_p:
                continue
            leaf = var.scope[0]
        elif isinstance(var, int):
            varp = p_from_scope(node, var, value_dict)
            if self.min_local_p and max(varp) < self.min_local_p:
                continue
            leaf = var
        else:
            raise ValueError(type(var))
        if self.min_local_js:
            # Keep the leaf only if its distribution differs enough from the
            # prior returned by ``calculate_prior``.
            leaf_prior = self.prior_gen.calculate_prior(
                root, var, value_dict)
            js = jensenshannon(leaf_prior, varp)
            if js >= self.min_local_js:
                eligable_leaves.append(leaf)
        else:
            # No divergence threshold configured: every leaf that got here
            # is eligible.
            eligable_leaves.append(leaf)

    def yield_rules(eligable_leaves, ):
        """Yield a ``Rule`` for each combination of eligible leaves.

        Combinations that produce no condition are skipped.
        """
        # An earlier commented-out ``__powerset`` helper enumerated the same
        # combinations; the enumeration is inlined below.
        rules = []
        s = list(eligable_leaves)
        max_len = min((self.body_max_len, len(s)))
        # NOTE(review): range(max_len) covers sizes 0 .. max_len - 1, so
        # bodies of exactly ``body_max_len`` conditions are never generated -
        # confirm this is intended.
        for r in range(max_len):
            for rule_leaves in itertools.combinations(s, r):
                if len(rule_leaves) > self.body_max_len:
                    break
                conditions = []
                for var in rule_leaves:
                    if isinstance(var, Categorical):
                        conditions.append(
                            Condition(var.scope[0], np.equal,
                                      np.argmax(var.p)))
                    elif isinstance(var, int):
                        varp = p_from_scope(node, var, value_dict)
                        if max(varp) >= self.min_local_p:
                            conditions.append(
                                Condition(var, np.equal, np.argmax(varp)))
                    else:
                        raise ValueError(var)
                if conditions:
                    yield Rule(conditions)

    if isinstance(target, Categorical):
        targetp = target.p
        target = target.scope[0]
    # head = Condition(target, np.equal, np.argmax(targetp))
    # todo use all heads where p[head] > p[prior_head] ?
    heads = []
    for val in range(len(targetp)):
        # A value becomes a head only if it is more probable here than in
        # the prior.
        if targetp[val] > self.prior_gen.calculate_prior(
                spn, target, value_dict)[val]:
            heads.append(Condition(target, np.equal, val))
    # local rule quality check
    l = list(yield_rules(eligable_leaves, ))
    for r in l:
        for head in heads:
            # ``self.rules_yielded`` records every (head, body) pair already
            # evaluated, so each pair is scored at most once.
            if (head, r) not in self.rules_yielded:
                # exact stats only evaluation TODO if you don't need
                # evaluation, only use spn_stats
                real_stats = rule_stats(root,
                                        r,
                                        head,
                                        metrics=self.metrics,
                                        real_data=self.df,
                                        beta=self.beta,
                                        value_dict=value_dict)
                # local quickly calculated stats
                spn_stats = rule_stats(root,
                                       r,
                                       head,
                                       metrics=self.metrics,
                                       beta=self.beta,
                                       value_dict=value_dict)
                if isinstance(
                        self.min_global_conf, str
                ) and self.min_global_conf == 'above_random':
                    # Threshold is one over the number of entries in the
                    # third item of the head variable's ``value_dict`` entry.
                    min_conf = 1. / len(value_dict[head.var][2])
                else:
                    min_conf = self.min_global_conf
                # For 'cosine_distance' lower is better; every other
                # criterion is treated as higher-is-better.
                if self.criterion == 'cosine_distance':
                    better_than_crit = spn_stats[self.metrics.index(
                        self.criterion)] <= self.min_global_criterion
                else:
                    better_than_crit = spn_stats[self.metrics.index(
                        self.criterion)] >= self.min_global_criterion
                # Filtering uses the stats computed without ``real_data``;
                # the stats computed with ``real_data`` are what gets yielded.
                if spn_stats[self.metrics.index('conf')] >= min_conf \
                        and spn_stats[self.metrics.index('recall')] >= self.min_recall \
                        and better_than_crit:
                    self.rules_yielded[(head, r)] = True
                    yield [head, r, *real_stats]
                else:
                    self.rules_yielded[(head, r)] = False