예제 #1
0
def get_default_rule(labels, num_labels, labels_weights=None):
    """Build a rule with no tests that predicts the most frequent label.

    :param labels: label indices of the lines the rule should summarize
    :param num_labels: minimum length of the label histogram
    :param labels_weights: optional per-line weights forwarded to
        ``np.bincount``; ``None`` counts every line once
    :return: an ``MRule`` created for the winning label index, after
        ``update_errors`` has been called with the histogram total and the
        winning label's count
    """
    lg.debug(f'labels = {labels}')
    histogram = np.bincount(labels,
                            weights=labels_weights,
                            minlength=num_labels)
    # argmax returns the first index on ties.
    majority = np.argmax(histogram)
    default_rule = MRule(majority)
    default_rule.update_errors(histogram.sum(), histogram[majority])
    return default_rule
예제 #2
0
def build_classifier(inst: Instances,
                     params: MParam = None,
                     add_default_rule=False):
    """Learn a list of rules by repeatedly extracting one rule at a time.

    Each pass asks ``get_one_rule`` for a rule over the lines that are still
    uncovered and continues with the lines that rule left over. The loop ends
    when no lines remain or when ``get_one_rule`` reports that it covered
    nothing.

    :param inst: Instances the rules are learned from
    :param params: MParam with the learning settings; a default ``MParam()``
        is used when omitted
    :param add_default_rule: when true and some lines are left uncovered,
        append a test-free rule predicting their most frequent label
    :return: list of the rules found, in the order they were produced
    :raises AssertionError: if no rule at all could be produced
    """
    if params is None:
        # get_one_rule reads params.min_occ unconditionally, so a missing
        # params object would otherwise fail there with an AttributeError.
        params = MParam()

    rules = []
    entry_lines = inst.all_lines()
    atts = inst.all_nominal_attributes()

    # Test the live set of uncovered lines on every pass; a count taken once
    # before the loop never changes and cannot end the loop.
    while len(entry_lines) > 0:
        # A fresh copy of the attribute list is passed each time because
        # get_one_rule removes the attributes it uses from that list.
        rule, entry_lines, covered = get_one_rule(inst,
                                                  available_atts=list(atts),
                                                  available_lines=entry_lines,
                                                  params=params)
        if covered == 0:  # DO NOT remove this
            break
        rules.append(rule)
        lg.debug(f'rule = {rule}')
        lg.debug(f'covered = {covered}')

    if add_default_rule and len(entry_lines) > 0:
        labels = inst.label_data[entry_lines]
        lg.debug(f'add default with lines ={entry_lines}')
        lg.debug(f'add default with labels ={labels}')

        default = get_default_rule(
            labels,
            num_labels=inst.num_items_label,
            labels_weights=None)  # TODO set to params.labels_weights
        rules.append(default)

    assert len(rules) > 0, 'no rules were found'
    lg.debug(f'number of found rules = {len(rules)}')
    return rules
예제 #3
0
 def best_att_item_label(self, min_freq, weight=WEIGHT):
     """Pick the best-ranked row of ``self.items_labels`` and its top label.

     Every row of the weighted label counts gets a rank; the row with the
     smallest rank wins.

     :param min_freq: rows whose weighted counts are all below this value
         are pushed to the back of the ranking
     :param weight: multiplier applied to ``self.items_labels`` before
         ranking (broadcast by ``np.multiply``)
     :return: tuple ``(b_att, b_item, b_label)`` where ``b_att`` and
         ``b_item`` come from ``self.c_index.att_item`` for the winning row
         and ``b_label`` is the column with the largest weighted count in
         that row
     """
     lbl_w = np.multiply(self.items_labels, weight)
     # Rank = row entropy plus a tiny term that shrinks as the row's largest
     # weighted count grows, so among rows of equal entropy the one with the
     # larger maximum count wins (add support advantage).
     # NOTE(review): entropy is presumably scipy.stats.entropy - confirm.
     rank = entropy(lbl_w, base=2, axis=1) + 10e-8 / np.amax(lbl_w, axis=1)
     not_passed = np.all(lbl_w < min_freq, axis=1)
     lg.debug(f'not_passed = {not_passed}')
     not_passed_index = np.where(not_passed)
     # Diagnostics go through the logger like the rest of the method rather
     # than being printed to stdout.
     lg.debug(f'not passed indexes = {not_passed_index}')
     # A huge rank keeps rejected rows from winning argmin unless every row
     # was rejected.
     rank[not_passed_index] = 10e10
     b_index = np.argmin(rank)
     b_label = np.argmax(lbl_w[b_index])
     b_att, b_item = self.c_index.att_item(b_index)
     lg.debug(f'b_att={b_att}, b_item={b_item}, b_label={b_label}')
     lg.debug(f'self.att_items = {self.att_items}')
     return b_att, b_item, b_label
예제 #4
0
    def mutual_info(self):
        """Return one score per attribute: H(labels) + H(items) - H(joint).

        For every group of rows returned by ``self.c_index.atts_lines()`` the
        matching slice of ``self.items_labels`` is summarized three ways:
        summed over axis 0, summed over axis 1, and flattened. Each summary
        is passed to ``v_entropy`` and the three values are combined.

        NOTE(review): v_entropy is presumably the entropy of a count vector,
        which would make the result the mutual information - confirm.

        :return: numpy array with one value per attribute group
        """
        # TODO reimplementing all method
        lines_per_att = self.c_index.atts_lines()
        tables = [self.items_labels[rows] for rows in lines_per_att]

        # TODO change later for performance
        label_totals = [np.sum(table, axis=0) for table in tables]
        marginal_label_h = np.array(
            [v_entropy(totals) for totals in label_totals])
        lg.debug(f'labels_entropy = {marginal_label_h}')

        lg.debug(f'att_lines = {lines_per_att}')

        # Debug dump of each attribute's rows next to its label table.
        for rows, table in zip(lines_per_att, tables):
            lg.debug(f'aline = {rows}, item_lables = {table}')

        item_totals = [np.sum(table, axis=1) for table in tables]
        lg.debug(f'att_label = {item_totals}')
        marginal_item_h = [v_entropy(totals) for totals in item_totals]
        lg.debug(f'att_entrop = {marginal_item_h}')
        joint_h = [v_entropy(table.flat) for table in tables]
        lg.debug(f'j_entropy = {joint_h}')

        # The first operand is an ndarray, so the plain lists are combined
        # element-wise.
        scores = marginal_label_h + marginal_item_h - joint_h
        lg.debug(f'result = {scores}')
        return scores
예제 #5
0
def get_one_rule(inst: Instances,
                 available_atts=None,
                 available_lines=None,
                 params: MParam = None) -> (MRule, np.array, int):
    """
    Grow a single rule over the available lines, one test at a time.

    Each iteration asks count_step for the best candidate, adds its
    (attribute, item) test to the rule, narrows the working set of lines with
    lines_of_item and removes the used attribute from available_atts.

    NOTE(review): available_lines and params default to None, but
    len(available_lines) and params.min_occ are read unconditionally, so
    callers have to supply both.
    NOTE(review): available_atts is modified in place (attributes are removed
    as they are used); pass a copy if the original list must survive.

    :param inst: Instances
    :param available_atts: list of attributes that may still be tested;
        defaults to inst.all_nominal_attributes()
    :param available_lines: np.array of the lines to build the rule from
    :param params: MParam; only min_occ is read here, the object is also
        forwarded to count_step
    :return: tuple(MRule, item_lines, covered). On normal completion
        item_lines holds the lines of available_lines that are not in the
        rule's final working set and covered is how many lines were removed.
        (None, available_lines, 0) is returned when there are fewer lines
        than params.min_occ or no attributes to test.
    """
    # lg.debug(f'start with available_lines ={available_lines}')
    # lg.debug(f' data =\n {inst.nominal_data.T[available_lines]}')
    # lg.debug(f' data =\n {inst.data[available_lines]}')
    # Too few lines to build a rule from: report nothing covered.
    if len(available_lines) < params.min_occ:
        return None, available_lines, 0

    if available_atts is None:
        available_atts = inst.all_nominal_attributes()

    # No attribute left to test: report nothing covered.
    if len(available_atts) == 0:
        return None, available_lines, 0

    entry_lines = available_lines  # working set, narrowed on every iteration
    max_label = None  # set on the first iteration, then passed to count_step
    m_rule = None
    while True:
        lg.debug(
            f'count_step with entry_lines ={len(entry_lines)}, max_label = {max_label}'
        )
        lg.debug(f'current available_atts ={available_atts}')

        t_max = count_step(inst, available_atts, entry_lines, params,
                           max_label)
        if m_rule is None:  # first iteration
            # The rule is created for the label of the first candidate; later
            # count_step calls receive that same label through max_label.
            m_rule = MRule(t_max.label_index)
            max_label = t_max.label_index
            lg.debug(f'firt iteration created rule = {m_rule}')

        # if m_rule.rank >= t_max.rank:  # can not enhance rank
        lg.debug(f'new t_max = {t_max}')
        # The candidate's rank is lower than the rule's current rank: stop
        # without adding it.
        # NOTE(review): this path returns the current working set (not the
        # set difference) together with covered == 0, even if earlier
        # iterations already added tests to m_rule - confirm this is intended.
        if m_rule.rank > t_max.rank:
            # if m_rule.is_better_than(t_max): # TODO suhel
            lg.debug(f'm_rule is better than t_max')
            return m_rule, entry_lines, 0

        # better prediction
        m_rule.rank = t_max.rank
        m_rule.add_test(t_max.att_index, t_max.item_index)
        m_rule.update_errors(t_max.cover, t_max.correct)
        # Narrow the working set using the chosen attribute's column and item.
        # NOTE(review): presumably entry_lines becomes the lines holding that
        # item and num_not_covered counts the dropped ones - verify against
        # lines_of_item.
        entry_lines, num_not_covered = \
            lines_of_item(entry_lines,
                          inst.nominal_data[t_max.att_index],
                          t_max.item_index)
        # Each attribute is used at most once per rule (in-place removal).
        available_atts.remove(t_max.att_index)

        # Stop growing when the rule has no errors, no attribute is left,
        # the last test dropped no lines, or the number of correct lines
        # fell below params.min_occ.
        if m_rule.errors == 0 \
                or len(available_atts) == 0 \
                or num_not_covered == 0 \
                or m_rule.correct < params.min_occ:
            break
    # TODO check of rule is empty
    # Lines that were available but are not in the final working set;
    # assume_unique=True relies on neither array containing duplicates.
    remaining_lines = np.setdiff1d(available_lines,
                                   entry_lines,
                                   assume_unique=True)
    return m_rule, remaining_lines, len(available_lines) - len(remaining_lines)
예제 #6
0
from log_settings import lg
from odri.arffreader import loadarff
from odri.m_instances import Instances
from odri.m_params import MParam
from odri.m_utils import build_classifier

if __name__ == '__main__':
    # Demo entry point: load an ARFF dataset, learn the rules and print them.
    # Alternative datasets; swap the active assignment to try one:
    # filename = 'data/contact-lenses.arff'
    # filename = 'data/cl.arff'
    filename = 'data/tic-tac-toe.arff'
    lg.debug(f'Starting with file name = {filename}')
    # Parse the file once and hand both returned parts to Instances instead
    # of reading the same file a second time.
    data, meta = loadarff(filename)
    inst = Instances(data, meta)
    lg.debug(f'inst num_lines = {inst.num_lines}')
    lg.debug(f'inst num_items = {inst.num_items}')

    params = MParam()
    rules = build_classifier(inst, params=params, add_default_rule=True)
    for i, rule in enumerate(rules):
        print(f'{i} -> rule = {rule}')
    lg.debug('end of application')