def get_default_rule(labels, num_labels, labels_weights=None):
    """Build a fallback rule that predicts the majority label.

    :param labels: label index per uncovered line
    :param num_labels: total number of distinct labels
    :param labels_weights: optional per-line weights for the counts
    :return: MRule predicting the most frequent (weighted) label,
             with its cover/correct error statistics filled in
    """
    lg.debug(f'labels = {labels}')
    # Weighted label frequencies; minlength guarantees a slot for every label.
    counts = np.bincount(labels, labels_weights, minlength=num_labels)
    majority = np.argmax(counts)
    default_rule = MRule(majority)
    # cover = all lines, correct = lines carrying the majority label
    default_rule.update_errors(np.sum(counts), counts[majority])
    return default_rule
def build_classifier(inst: Instances, params: MParam = None, add_default_rule=False):
    """Greedily build an ordered rule list covering the instances.

    Repeatedly extracts one rule from the still-uncovered lines until no
    more lines can be covered; optionally appends a majority-label
    default rule for whatever remains.

    :param inst: Instances holding the nominal data and labels
    :param params: MParam with search parameters (e.g. min_occ)
    :param add_default_rule: append a fallback rule for uncovered lines
    :return: non-empty list of MRule objects
    """
    rules = []
    lines = inst.all_lines()
    entry_lines = lines
    atts = inst.all_nominal_attributes()
    num_remaining_lines = len(lines)
    while num_remaining_lines > 0:
        rule, r_lines, covered = get_one_rule(inst,
                                              available_atts=list(atts),
                                              available_lines=entry_lines,
                                              params=params)
        entry_lines = r_lines
        # BUG FIX: num_remaining_lines was never updated, leaving the
        # while-condition permanently true and terminating only via the
        # break below.  Recompute it so the loop condition is meaningful.
        num_remaining_lines = len(entry_lines)
        if covered == 0:  # DO NOT remove this
            break
        rules.append(rule)
        lg.debug(f'rule = {rule}')
        lg.debug(f'covered = {covered}')
    if add_default_rule and len(entry_lines) > 0:
        labels = inst.label_data[entry_lines]
        lg.debug(f'add default with lines ={entry_lines}')
        lg.debug(f'add default with labels ={labels}')
        default = get_default_rule(
            inst.label_data[entry_lines],
            num_labels=inst.num_items_label,
            labels_weights=None)  # TODO set to params.labels_weights
        rules.append(default)
    assert len(rules) > 0
    lg.debug(f'number of found rules = {len(rules)}')
    return rules
def best_att_item_label(self, min_freq, weight=WEIGHT):
    """Pick the (attribute, item, label) triple with the lowest entropy rank.

    Items whose weighted label counts never reach ``min_freq`` are
    excluded via a large sentinel rank.

    :param min_freq: minimum weighted frequency an item must reach
    :param weight: per-label weights applied to the raw counts
    :return: tuple (best_att, best_item, best_label)
    """
    lbl_w = np.multiply(self.items_labels, weight)
    # Entropy ranking plus a tiny inverse-support term: among entropy
    # ties the better-supported item wins the argmin below.
    rank = entropy(lbl_w, base=2, axis=1) + 10e-8 / np.amax(lbl_w, axis=1)
    not_passed = np.all(lbl_w < min_freq, axis=1)
    # BUG FIX: message said 'passed' while logging the NOT-passed mask.
    lg.debug(f'not passed = {not_passed}')
    not_passed_index = np.where(not_passed)
    # BUG FIX: was a stray print(); use the module logger consistently.
    lg.debug(f'not passed indexes = {not_passed_index}')
    rank[not_passed_index] = 10e10  # sentinel: exclude below-threshold items
    b_index = np.argmin(rank)
    b_label = np.argmax(lbl_w[b_index])
    b_att, b_item = self.c_index.att_item(b_index)
    lg.debug(f'b_att={b_att}, b_item={b_item}, b_label={b_label}')
    lg.debug(f'self.att_items = {self.att_items}')
    return b_att, b_item, b_label
def mutual_info(self):
    """Mutual information between each attribute and the labels.

    Computed per attribute as H(label) + H(att) - H(att, label) from the
    item/label co-occurrence counts.

    :return: np.ndarray with one mutual-information value per attribute
    """
    # TODO reimplementing all method
    att_lines = self.c_index.atts_lines()
    # One item-by-label count table per attribute.
    atts_labels = [self.items_labels[aline] for aline in att_lines]
    # Label marginals per attribute.  TODO change later for performance
    u_labels = [np.sum(alabel, axis=0) for alabel in atts_labels]
    labels_entropy = np.array(list(map(v_entropy, u_labels)))
    lg.debug(f'labels_entropy = {labels_entropy}')
    lg.debug(f'att_lines = {att_lines}')
    # att_label = [np.sum(self.items_labels[a_line], axis=1) for a_line in att_lines]
    for aline, alabel in zip(att_lines, atts_labels):
        lg.debug(f'aline = {aline}, item_lables = {alabel}')
    # Item marginals per attribute.
    att_label = [np.sum(alabel, axis=1) for alabel in atts_labels]
    lg.debug(f'att_label = {att_label}')
    att_entrop = list(map(v_entropy, att_label))
    lg.debug(f'att_entrop = {att_entrop}')
    # Joint entropy of the flattened item-label table.
    j_entropy = [v_entropy(it.flat) for it in atts_labels]
    lg.debug(f'j_entropy = {j_entropy}')
    # MI = H(label) + H(att) - H(att, label), elementwise per attribute.
    result = labels_entropy + att_entrop - j_entropy
    lg.debug(f'result = {result}')
    return result
def get_one_rule(inst: Instances, available_atts=None, available_lines=None, params: MParam = None) -> (MRule, np.array, int): """ calc step and get one rule from available lines :param inst: Instances :param available_atts: list :param available_lines: np.array :param params: MParam :return: tuple(MRule, item_lines, covered) """ # lg.debug(f'start with available_lines ={available_lines}') # lg.debug(f' data =\n {inst.nominal_data.T[available_lines]}') # lg.debug(f' data =\n {inst.data[available_lines]}') if len(available_lines) < params.min_occ: return None, available_lines, 0 if available_atts is None: available_atts = inst.all_nominal_attributes() if len(available_atts) == 0: return None, available_lines, 0 entry_lines = available_lines max_label = None m_rule = None while True: lg.debug( f'count_step with entry_lines ={len(entry_lines)}, max_label = {max_label}' ) lg.debug(f'current available_atts ={available_atts}') t_max = count_step(inst, available_atts, entry_lines, params, max_label) if m_rule is None: # first iteration m_rule = MRule(t_max.label_index) max_label = t_max.label_index lg.debug(f'firt iteration created rule = {m_rule}') # if m_rule.rank >= t_max.rank: # can not enhance rank lg.debug(f'new t_max = {t_max}') if m_rule.rank > t_max.rank: # if m_rule.is_better_than(t_max): # TODO suhel lg.debug(f'm_rule is better than t_max') return m_rule, entry_lines, 0 # better prediction m_rule.rank = t_max.rank m_rule.add_test(t_max.att_index, t_max.item_index) m_rule.update_errors(t_max.cover, t_max.correct) entry_lines, num_not_covered = \ lines_of_item(entry_lines, inst.nominal_data[t_max.att_index], t_max.item_index) available_atts.remove(t_max.att_index) if m_rule.errors == 0 \ or len(available_atts) == 0 \ or num_not_covered == 0 \ or m_rule.correct < params.min_occ: break # TODO check of rule is empty remaining_lines = np.setdiff1d(available_lines, entry_lines, assume_unique=True) return m_rule, remaining_lines, len(available_lines) - 
len(remaining_lines)
from log_settings import lg
from odri.arffreader import loadarff
from odri.m_instances import Instances
from odri.m_params import MParam
from odri.m_utils import build_classifier

if __name__ == '__main__':
    # The later assignment deliberately overrides the earlier one;
    # switch datasets by editing/commenting these lines.
    filename = 'data/contact-lenses.arff'
    filename = 'data/tic-tac-toe.arff'
    # filename = 'data/cl.arff'
    # BUG FIX: the message carried no placeholder, so the actual file
    # name was never logged.
    lg.debug(f'Starting with file name = {filename}')
    data, meta = loadarff(filename)
    # BUG FIX: reuse the already-parsed (data, meta) instead of reading
    # and parsing the ARFF file a second time.
    inst = Instances(data, meta)
    lg.debug(f'inst num_lines = {inst.num_lines}')
    lg.debug(f'inst num_items = {inst.num_items}')
    params = MParam()
    rules = build_classifier(inst, params=params, add_default_rule=True)
    for i, rule in enumerate(rules):
        print(f'{i} -> rule = {rule}')
    lg.debug('end of application')