def find_rules_and_measure_progress(self, X, Y, W, target_class,
                                    base_rules, domain, progress_amount):
    """
    Top-level control procedure of the separate-and-conquer algorithm.

    For the given data and target class (may be None), return a list of
    rules that all strictly adhere to the requirements of the rule
    finder's validators. To induce decision lists (ordered rules), set
    target class to None. To induce rule sets (unordered rules), learn
    rules for each class individually, in regard to the original
    learning data.

    Parameters
    ----------
    X, Y, W : ndarray
        Learning data.
    target_class : int
        Index of the class to model.
    base_rules : list of Rule
        An optional list of initial rules to constrain the search.
    domain : Orange.data.domain.Domain
        Data domain, used to calculate class distributions.
    progress_amount : int, percentage
        Part of the learning algorithm covered by this function call.

    Returns
    -------
    rule_list : list of Rule
        Induced rules.
    """
    prior_dist = get_dist(Y, W, domain)
    rules = []
    # Repeatedly induce one rule and remove the instances it covers;
    # stop as soon as the remaining data no longer permits learning or
    # no acceptable new rule can be produced.
    while not self.data_stopping(X, Y, W, target_class):
        # snapshot the distribution so progress can be updated correctly
        dist_before = get_dist(Y, W, domain)
        # generate a new rule that has not been seen before
        rule = self.rule_finder(X, Y, W, target_class, base_rules,
                                domain, prior_dist, rules)
        # rule_finder returns None when no new, unique rule passing the
        # general requirements can be found
        if rule is None or self.rule_stopping(rule):
            break
        # covering is either exclusive or weighted
        X, Y, W = self.cover_and_remove(X, Y, W, rule)
        rules.append(rule)
        if self.progress_advance_callback is not None:
            dist_after = get_dist(Y, W, domain)
            if target_class is not None:
                advance = (dist_before[target_class] -
                           dist_after[target_class]) \
                    / prior_dist[target_class] * progress_amount
            else:
                advance = (dist_before - dist_after).sum() \
                    / prior_dist.sum() * progress_amount
            self.progress_advance_callback(advance)
    return rules
def find_rules_and_measure_progress(self, X, Y, W, target_class,
                                    base_rules, domain, progress_amount):
    """
    The top-level control procedure of the separate-and-conquer
    algorithm.

    For given data and target class (may be None), return a list of
    rules which all must strictly adhere to the requirements of the
    rule finder's validators. To induce decision lists (ordered rules),
    set target class to None. To induce rule sets (unordered rules),
    learn rules for each class individually, in regard to the original
    learning data.

    NOTE(review): this definition duplicates an identical method above
    and shadows it; consider removing one copy.

    Parameters
    ----------
    X, Y, W : ndarray
        Learning data.
    target_class : int
        Index of the class to model.
    base_rules : list of Rule
        An optional list of initial rules to constrain the search.
    domain : Orange.data.domain.Domain
        Data domain, used to calculate class distributions.
    progress_amount : int, percentage
        Part of the learning algorithm covered by this function call.

    Returns
    -------
    rule_list : list of Rule
        Induced rules.
    """
    initial_dist = get_dist(Y, W, domain)
    learned = []
    # While the data allows, keep finding new rules; break the loop if
    # minimum requirements cannot be met. After each found rule the
    # covered instances are removed.
    while not self.data_stopping(X, Y, W, target_class):
        # class distribution before covering — needed for progress
        before = get_dist(Y, W, domain)
        # ask the rule finder for a rule that has not been seen before
        candidate = self.rule_finder(X, Y, W, target_class, base_rules,
                                     domain, initial_dist, learned)
        if candidate is None or self.rule_stopping(candidate):
            # None: no new, unique rule passes the general requirements
            break
        # exclusive or weighted covering
        X, Y, W = self.cover_and_remove(X, Y, W, candidate)
        learned.append(candidate)
        if self.progress_advance_callback is not None:
            after = get_dist(Y, W, domain)
            if target_class is None:
                fraction = (before - after).sum() / initial_dist.sum()
            else:
                fraction = ((before[target_class] - after[target_class])
                            / initial_dist[target_class])
            self.progress_advance_callback(fraction * progress_amount)
    return learned
def fit_storage(self, data):
    """
    Induce a rule list from ``data`` with a beam ("star") search and
    return a rule classifier built from the best rules found.

    Parameters
    ----------
    data : Orange.data.Table
        Learning data; ``data.W`` carries optional instance weights.

    Returns
    -------
    classifier
        ``self.classifier`` instantiated with the induced rule list.
    """
    # BUGFIX: ``data.W if data.W else None`` tested the truth value of
    # an ndarray, which raises ValueError for arrays with more than one
    # element; use .size to detect absent weights instead.
    X, Y, W = data.X, data.Y, data.W if data.W.size else None
    Y = Y.astype(dtype=int)
    # estimate extreme value distributions (if necessary)
    if self.evc and self.to_calc_evds:
        self.calculate_evds(data)
    if self.evc and not self.evds:
        warn("""Extreme value distributions not set. Need to calculate them first. """)
        self.calculate_evds(data)
    prior = get_dist(Y, W, self.domain)
    if not prior.sum():
        # no (weighted) examples at all — nothing to learn
        return self.classifier(domain=self.domain, rule_list=[])
    # create initial star
    star = self.create_initial_star(X, Y, W, prior)
    # use visited to prevent learning the same rule all over again;
    # the coverage bitmap serves as the rule's identity
    # (tobytes() replaces the deprecated ndarray.tostring())
    visited = set((r.covered_examples.tobytes(), r.target_class)
                  for r in star)
    # per-instance best rule and its quality
    bestr = np.empty(X.shape[0], dtype=object)
    bestq = np.zeros(X.shape[0], dtype=float)
    for r in star:
        if self.rule_validator.validate_rule(r):
            self.update_best(bestr, bestq, r, Y)
    # loop until star has rules
    self.inter_rules = []  # store intermediate rules
    while star:
        # specialize each rule in star
        new_star = []
        for r in star:
            if r.curr_class_dist[r.target_class] == \
                    r.curr_class_dist.sum():
                # rule is already pure — cannot usefully specialize
                continue
            # refine rule
            rules = self.rule_finder.search_strategy.refine_rule(
                X, Y, W, r)
            # work refined rules
            for nr in rules:
                nr.default_rule = nr.parent_rule.default_rule
                nr.do_evaluate()
                rkey = (nr.covered_examples.tobytes(), nr.target_class)
                if (rkey not in visited and
                        self.rule_finder.general_validator.validate_rule(nr) and
                        nr.quality >= nr.parent_rule.quality):
                    # rule is consistent with basic conditions
                    # can it be new best?
                    if self.rule_validator.validate_rule(nr):
                        self.update_best(bestr, bestq, nr, Y)
                    # can it be further specialized?
                    if (self.specialization_validator.validate_rule(nr) and
                            nr.length < self.max_rule_length):
                        new_star.append(nr)
                # mark as seen regardless of acceptance so the same
                # coverage is never re-examined
                visited.add(rkey)
        # assign a rank to each rule in new star: per-instance quality
        # of every rule covering that instance with the correct class
        nrules = len(new_star)
        inst_quality = np.zeros((X.shape[0], nrules))
        for ri, r in enumerate(new_star):
            if self.target_instances:
                # learn rules for specific instances only
                c2 = np.zeros(r.covered_examples.shape, dtype=bool)
                c2[self.target_instances] = 1
                cov = np.where(c2 & r.covered_examples &
                               (r.target_class == Y))[0]
            else:
                cov = np.where(r.covered_examples &
                               (r.target_class == Y))[0]
            inst_quality[cov, ri] = r.quality
        # keep (up to) the 5 best rules per instance
        sel_rules = -(min(nrules, 5))
        queues = np.argsort(inst_quality)[:, sel_rules:]
        # create new star from queues
        new_star_set = set()
        index = -1
        while len(new_star_set) < self.width:
            if index < sel_rules:
                break
            # pop one rule from each queue into a temporary counter,
            # then admit rules in order of popularity
            cnt = Counter()
            for qi, q in enumerate(queues):
                ri = q[index]
                if inst_quality[qi, ri] > 0:
                    cnt[ri] += 1
            if not cnt:
                break
            elts = cnt.most_common()
            for e, _ in elts:
                if e in new_star_set:
                    continue
                new_star_set.add(e)
                if len(new_star_set) >= self.width:
                    break
            index -= 1
        star = [new_star[ri] for ri in new_star_set]
        if self.store_intermediate_rules:
            # snapshot the current best rules for later inspection
            rl = []
            vis = set()
            for ri, r in enumerate(bestr):
                if r is None:
                    continue
                self.add_rule(rl, vis, r)
            self.inter_rules.append(rl)
    # select best rules
    rule_list = []
    visited = set()
    for ri, r in enumerate(bestr):
        # add r
        if r is None:
            continue
        self.add_rule(rule_list, visited, r)
        # remember which instance(s) this rule was best for
        if not hasattr(r, "best_instance"):
            r.best_instance = [data[ri]]
        else:
            r.best_instance.append(data[ri])
        if self.add_sub_rules:
            # also add the whole chain of parent (more general) rules
            pr = self.create_parent(r, X, Y, W)
            while pr is not None:
                self.add_rule(rule_list, visited, pr)
                pr = self.create_parent(pr, X, Y, W)
    rule_list = sorted(rule_list, key=lambda r: -r.quality)
    if self.min_unique_examples > 1:
        # greedily keep only rules that cover enough not-yet-covered
        # examples
        filter_rules = []
        covered = np.zeros(X.shape[0], dtype=bool)
        for r in rule_list:
            if (~covered & r.covered_examples).sum() >= \
                    self.min_unique_examples:
                filter_rules.append(r)
                covered |= r.covered_examples
        rule_list = filter_rules
    return self.classifier(domain=self.domain, rule_list=rule_list)