Exemplo n.º 1
0
def boosting_last_predict(tree, fields, input_data, path=None):
    """Predict with a boosting tree using the "last prediction" strategy.

    Starting at `tree`, the walk moves into the first child whose predicate
    holds for `input_data`, appending the corresponding rule to `path` at
    each step. It stops at the first node that has no matching child (or no
    children at all) and returns a `Prediction` built from that node's
    output and count.

    `path` is extended in place when the caller provides it; a new list is
    created otherwise.
    """
    if path is None:
        path = []

    current = tree
    while True:
        node = get_node(current)
        if node[OFFSETS["children#"]] == 0:
            children = []
        else:
            children = node[OFFSETS["children"]]
        count = node[OFFSETS["count"]]

        for child in children:
            operator, field, value, term, missing = get_predicate(child)
            if apply_predicate(operator, field, value, term, missing,
                               input_data, fields[field]):
                rule = predicate_to_rule(operator, fields[field],
                                         value, term, missing)
                path.append(rule)
                # descend into the first matching child only
                current = child
                break
        else:
            # no child matched: this node provides the prediction
            return Prediction(
                node[OFFSETS["output"]],
                path,
                None,
                distribution=None,
                count=count,
                median=None,
                distribution_unit=None,
                children=children,
                d_min=None,
                d_max=None)
Exemplo n.º 2
0
    def get_tree_leaves(tree, fields, path, leaves, filter_function=None):
        """Collect the leaves found under `tree` into `leaves`.

        `path` is the list of lisp-style rules leading to `tree`; the rule
        for this node's own predicate is appended to it when the predicate
        is a list. Each child receives its own copy of the path.

        `leaves` is the accumulator list: it is extended in place and the
        same list object is returned. A leaf is described by a dict with
        the keys `id`, `confidence`, `count`, `distribution`, `impurity`,
        `output` and `path` (plus `weighted_distribution` and `weight` when
        the enclosing `offsets` mapping has a `weighted_distribution` entry).

        When `filter_function` is callable, only the leaves for which it
        returns a truthy value are kept.
        """
        node = get_node(tree)
        predicate = get_predicate(tree)
        if isinstance(predicate, list):
            operator, field, value, term, missing = predicate
            path.append(
                to_lisp_rule(operator, field, value, term, missing,
                             fields[field]))

        children_number = node[offsets["children#"]]
        children = [] if children_number == 0 else node[offsets["children"]]

        if children:
            for child in children:
                # The recursive call appends to `leaves` in place and
                # returns that very same list, so its result must not be
                # added to `leaves` again: `leaves += leaves` would
                # duplicate every leaf collected so far.
                get_tree_leaves(child,
                                fields,
                                path[:],
                                leaves,
                                filter_function=filter_function)
            return leaves

        leaf = {
            'id': node[offsets["id"]],
            'confidence': node[offsets["confidence"]],
            'count': node[offsets["count"]],
            'distribution': node[offsets["distribution"]],
            'impurity': gini_impurity(node[offsets["distribution"]],
                                      node[offsets["count"]]),
            'output': node[offsets["output"]],
            'path': path
        }
        if 'weighted_distribution' in offsets:
            leaf.update(
                {"weighted_distribution":
                     node[offsets["weighted_distribution"]],
                 "weight": node[offsets["weight"]]})
        # a non-callable filter (e.g. None) means "keep every leaf"
        if not callable(filter_function) or filter_function(leaf):
            leaves.append(leaf)
        return leaves
Exemplo n.º 3
0
def boosting_proportional_predict(tree, fields, input_data, path=None,
                                  missing_found=False):
    """Makes a prediction based on a number of field values considering all
       the predictions of the leaves that fall in a subtree.

       Each time a splitting field has no value assigned, we consider
       both branches of the split to be true, merging their
       predictions.

       Returns a tuple `(g_sum, h_sum, count, path)`: the values read from
       the node reached (or their sums over every child when the split
       could not be decided) and the list of rules of the unique path
       followed. `path` is extended in place; rules stop being added once
       `missing_found` is set.

    """

    if path is None:
        path = []

    node = get_node(tree)
    children_number = node[OFFSETS["children#"]]
    children = [] if children_number == 0 else node[OFFSETS["children"]]
    g_sum = node[OFFSETS["g_sum"]]
    h_sum = node[OFFSETS["h_sum"]]
    count = node[OFFSETS["count"]]

    if not children:
        return (g_sum, h_sum, count, path)

    if one_branch(children, input_data) or \
            fields[children[0][FIELD_OFFSET]]["optype"] in \
            ["text", "items"]:
        for child in children:
            [operator, field, value, term, missing] = get_predicate(child)
            if apply_predicate(operator, field, value, term, missing,
                               input_data, fields[field]):
                new_rule = predicate_to_rule(operator, fields[field], value,
                                             term, missing)
                if new_rule not in path and not missing_found:
                    path.append(new_rule)
                return boosting_proportional_predict( \
                    child, fields,
                    input_data, path, missing_found)
        # No child predicate matched. Falling off the end here would return
        # None and break the tuple unpacking done by the recursive caller,
        # so the walk stops at this node and its own values are returned.
        return (g_sum, h_sum, count, path)

    # missing value found, the unique path stops
    missing_found = True
    g_sums = 0.0
    h_sums = 0.0
    population = 0
    for child in children:
        child_g_sum, child_h_sum, child_count, _ = \
            boosting_proportional_predict( \
                child, fields, input_data,
                path, missing_found)
        g_sums += child_g_sum
        h_sums += child_h_sum
        population += child_count
    return (g_sums, h_sums, population, path)
Exemplo n.º 4
0
def missing_prefix_code(tree, fields, field, input_map, cmv):
    """Build the prefix of a split condition that tests for missing values
    when missing_splits has been used.

    The prefix reads "<input> is None or " when the node's predicate has
    its missing flag set and "<input> is not None and " otherwise. In the
    latter case the field's slug is also appended to `cmv`.
    """
    accepts_missing = get_predicate(tree)[MISSING_OFFSET]
    slug = fields[field]['slug']
    if accepts_missing:
        negation, connection = "", "or"
    else:
        negation, connection = " not", "and"
        cmv.append(slug)
    input_reference = map_data(slug, input_map, True)
    return "%s is%s None %s " % (input_reference, negation, connection)
Exemplo n.º 5
0
 def __init__(self, tree, offsets):
     """Build the node object from its compact `tree` representation.

     The predicate is stored as-is when it is a boolean; otherwise it is
     wrapped in a `Predicate` whose operator is translated through
     `INVERSE_OP`. Every entry of `offsets` except "children#" and
     "children" becomes an attribute read from the node, and `children`
     is set to the node's children (an empty list when the node's
     children count is 0).
     """
     predicate = get_predicate(tree)
     if not isinstance(predicate, bool):
         operator, field, value, term, _ = predicate
         predicate = Predicate(INVERSE_OP[operator], field, value,
                               term)
     self.predicate = predicate

     node = get_node(tree)
     structural = ("children#", "children")
     for name in offsets:
         if name in structural:
             continue
         setattr(self, name, node[offsets[name]])

     if node[offsets["children#"]] == 0:
         self.children = []
     else:
         self.children = node[offsets["children"]]
Exemplo n.º 6
0
def generate_rules(tree,
                   offsets,
                   objective_id,
                   fields,
                   depth=0,
                   ids_path=None,
                   subtree=True):
    """Render a tree model as nested IF-THEN rules, one per line.

    Every child kept by `filter_nodes` produces an "IF <rule> AND" line
    (or "IF <rule> THEN" when that child has no children) followed by the
    rules of its own subtree, indented one level deeper. A node without
    children produces "<objective slug> = <output>", using "Prediction"
    as the name when `objective_id` is falsy.
    """
    node = get_node(tree)
    if node[offsets["children#"]] == 0:
        children = []
    else:
        children = node[offsets["children"]]
    children = filter_nodes(children, offsets, ids=ids_path, subtree=subtree)

    if not children:
        if objective_id:
            target = fields[objective_id]['slug']
        else:
            target = "Prediction"
        return "%s %s = %s\n" % (INDENT * depth, target,
                                 node[offsets["output"]])

    indentation = INDENT * depth
    chunks = []
    for child in children:
        predicate = get_predicate(child)
        # NOTE(review): when the predicate is not a list, the values left
        # over from the previous iteration are reused (or the names are
        # unbound on the first one) -- confirm children always carry a
        # list predicate.
        if isinstance(predicate, list):
            operator, field, value, term, missing = predicate
            child_node = get_node(child)
        condition = predicate_to_rule(operator,
                                      fields[field],
                                      value,
                                      term,
                                      missing,
                                      label='slug')
        if child_node[offsets["children#"]] > 0:
            connector = "AND"
        else:
            connector = "THEN"
        chunks.append("%s IF %s %s\n" % (indentation, condition, connector))
        chunks.append(generate_rules(child,
                                     offsets,
                                     objective_id,
                                     fields,
                                     depth + 1,
                                     ids_path=ids_path,
                                     subtree=subtree))
    return "".join(chunks)
Exemplo n.º 7
0
def split_condition_code(tree, fields, depth, input_map, pre_condition,
                         term_analysis_fields, item_analysis_fields, cmv):
    """Return the python "if (...):" line for the split held by `tree`.

    For text and items fields the condition calls `term_matches` or
    `item_matches`, and the (field, term) pair is recorded in
    `term_analysis_fields` or `item_analysis_fields` respectively. For
    any other field the condition compares the mapped input with the
    printable predicate value. `pre_condition` is inserted verbatim right
    after the opening parenthesis.
    """
    operation, field, value, term, _ = get_predicate(tree)
    optype = fields[field]['optype']
    value = value_to_print(value, optype)
    indentation = INDENT * depth

    if optype in ('text', 'items'):
        if optype == 'text':
            collector, matching_function = \
                term_analysis_fields, "term_matches"
        else:
            collector, matching_function = \
                item_analysis_fields, "item_matches"
        collector.append((field, term))

        slug = fields[field]['slug']
        input_reference = map_data(slug, input_map, False)
        term_prefix = 'u' if isinstance(term, str) else ''
        printable_term = value_to_print(term, 'categorical')
        return "%sif (%s%s(%s, \"%s\", %s%s) %s %s):\n" % \
            (indentation, pre_condition, matching_function,
             input_reference, slug, term_prefix, printable_term,
             PYTHON_OPERATOR[operation], value)

    # NOTE(review): `value` is the result of value_to_print here, not the
    # raw predicate value; if value_to_print never returns None this branch
    # cannot be taken -- confirm.
    if value is None:
        operator = MISSING_OPERATOR[operation]
        cmv.append(fields[field]['slug'])
    else:
        operator = PYTHON_OPERATOR[operation]

    input_reference = map_data(fields[field]['slug'], input_map, False)
    return "%sif (%s%s %s %s):\n" % \
        (indentation, pre_condition, input_reference, operator, value)
Exemplo n.º 8
0
    def depth_first_search(tree, path):
        """Walk `tree` depth-first, registering leaves through
        `add_to_groups`, and return the node's instance count.

        The node's predicate (when it is a list) is appended to `path`
        with its operator translated through `INVERSE_OP`, and its term,
        if any, is recorded in `model.terms`. Children are visited in
        reverse order, each with its own copy of the path. When the
        children account for fewer instances than the node itself, the
        difference is registered under the node's own output.
        """
        node = get_node(tree)
        predicate = get_predicate(tree)
        if isinstance(predicate, list):
            operation, field, value, term, _ = predicate
            path.append(Predicate(INVERSE_OP[operation], field, value, term))
            if term:
                if field not in model.terms:
                    model.terms[field] = []
                field_terms = model.terms[field]
                if term not in field_terms:
                    field_terms.append(term)

        def register(instances):
            # group `instances` of this node under its output
            add_to_groups(
                groups, node[offsets["output"]], path, instances,
                node[offsets["confidence"]],
                gini_impurity(node[offsets["distribution"]],
                              node[offsets["count"]]))

        is_leaf = node[offsets["children#"]] == 0
        total = node[offsets["count"]]
        if is_leaf:
            register(total)
            return total

        covered = sum(depth_first_search(child, path[:])
                      for child in node[offsets["children"]][::-1])
        if covered < total:
            register(total - covered)
        return total
Exemplo n.º 9
0
def plug_in_body(tree,
                 offsets,
                 fields,
                 objective_id,
                 regression,
                 depth=1,
                 cmv=None,
                 input_map=False,
                 ids_path=None,
                 subtree=True):
    """Translate the model into nested python "if" statements.

    `depth` controls the size of indentation. As soon as a value is missing
    that node is returned without further evaluation.

    Returns a tuple `(body, term_analysis_fields, item_analysis_fields)`:
    the generated code plus the (field, term) pairs collected for text and
    items splits in this node and all of its descendants.
    """
    # label for the confidence measure
    metric = "error" if regression else "confidence"
    if cmv is None:
        cmv = []
    term_analysis_fields = []
    item_analysis_fields = []

    node = get_node(tree)
    if node[offsets["children#"]] == 0:
        children = []
    else:
        children = node[offsets["children"]]
    children = filter_nodes(children, offsets, ids=ids_path, subtree=subtree)

    if not children:
        # terminal node: emit the return statement with the prediction
        value = value_to_print(node[offsets["output"]],
                               fields[objective_id]['optype'])
        body = "%sreturn {\"prediction\":%s, \"%s\":%s}\n" % (
            INDENT * depth, value, metric, node[offsets["confidence"]])
        return body, term_analysis_fields, item_analysis_fields

    # field used in the split
    field = mintree_split(children)
    has_missing_branch = (missing_branch(children) or none_value(children))
    # the missing is singled out as a special case only when there's
    # no missing branch in the children list
    single_branch = not has_missing_branch or \
        fields[field]['optype'] in COMPOSED_FIELDS

    pieces = []
    if single_branch and fields[field]['slug'] not in cmv:
        pieces.append(
            missing_check_code(tree, offsets, fields, objective_id,
                               field, depth, input_map, cmv, metric))

    for child in children:
        _, field, value, _, _ = get_predicate(child)
        # extra prefix needed when missing_splits has been used
        if has_missing_branch and value is not None:
            pre_condition = missing_prefix_code(child, fields, field,
                                                input_map, cmv)
        else:
            pre_condition = ""

        # complete split condition code
        pieces.append(
            split_condition_code(child, fields, depth, input_map,
                                 pre_condition, term_analysis_fields,
                                 item_analysis_fields, cmv))

        # the subtree works on its own copy of the checked-missing list
        child_body, child_terms, child_items = plug_in_body(
            child,
            offsets,
            fields,
            objective_id,
            regression,
            depth + 1,
            cmv=cmv[:],
            input_map=input_map,
            ids_path=ids_path,
            subtree=subtree)

        pieces.append(child_body)
        term_analysis_fields.extend(child_terms)
        item_analysis_fields.extend(child_items)

    return "".join(pieces), term_analysis_fields, item_analysis_fields
Exemplo n.º 10
0
def tableau_body(tree,
                 offsets,
                 fields,
                 objective_id,
                 body="",
                 conditions=None,
                 cmv=None,
                 ids_path=None,
                 subtree=True,
                 attr=DFT_ATTR):
    """Translate the model into a set of "if" statements in Tableau syntax

    `body` is the code generated so far: when it is empty the next clause
    starts with IF, otherwise with ELSEIF. `conditions` holds the
    conditions accumulated on the way down to `tree` and `cmv` the names
    of the fields whose missing values have already been handled. `attr`
    selects which node attribute is written as the result of each clause.

    As soon as a value is missing that node is returned without further
    evaluation. Returns the extended body, or an empty string when a text
    or items split is found.

    """

    if cmv is None:
        cmv = []
    # `conditions` is needed whatever the value of `body`; leaving it as
    # None when a non-empty body is given would fail on append / join
    if conditions is None:
        conditions = []
    alternate = "ELSEIF" if body else "IF"

    node = get_node(tree)

    def node_value():
        # value of `attr` for the current node, formatted for Tableau
        if fields[objective_id]['optype'] == 'numeric':
            return node[offsets[attr]]
        return tableau_string(node[offsets[attr]])

    children_number = node[offsets["children#"]]
    children = [] if children_number == 0 else node[offsets["children"]]
    children = filter_nodes(children, offsets, ids=ids_path, subtree=subtree)
    if children:
        [_, field, _, _, _] = get_predicate(children[0])
        has_missing_branch = (missing_branch(children) or none_value(children))
        # the missing is singled out as a special case only when there's
        # no missing branch in the children list
        if (not has_missing_branch and fields[field]['name'] not in cmv):
            conditions.append("ISNULL([%s])" % fields[field]['name'])
            body += ("%s %s THEN " % (alternate, " AND ".join(conditions)))
            body += ("%s\n" % node_value())
            cmv.append(fields[field]['name'])
            alternate = "ELSEIF"
            del conditions[-1]

        for child in children:
            pre_condition = ""
            post_condition = ""
            [predicate_operator, field, ch_value, _, missing] = \
                get_predicate(child)
            if has_missing_branch and ch_value is not None:
                negation = "" if missing else "NOT "
                connection = "OR" if missing else "AND"
                pre_condition = ("(%sISNULL([%s]) %s " %
                                 (negation, fields[field]['name'], connection))
                if not missing:
                    cmv.append(fields[field]['name'])
                post_condition = ")"
            optype = fields[field]['optype']
            if ch_value is None:
                value = ""
                operator = ""
                # the missing-value prefix depends on the predicate's own
                # operator, so it is looked up before the printed operator
                # is blanked
                pre_condition = T_MISSING_OPERATOR[predicate_operator]
                post_condition = ")"
            elif optype in ['text', 'items']:
                # text and items splits are not translated: the whole
                # body is discarded
                return ""
            else:
                value = ch_value if optype == 'numeric' else repr(ch_value)
                operator = PYTHON_OPERATOR[predicate_operator]

            conditions.append("%s[%s]%s%s%s" %
                              (pre_condition, fields[field]['name'], operator,
                               value, post_condition))
            body = tableau_body(child,
                                offsets,
                                fields,
                                objective_id,
                                body,
                                conditions[:],
                                cmv=cmv[:],
                                ids_path=ids_path,
                                subtree=subtree,
                                attr=attr)
            del conditions[-1]
    else:
        # the attribute is read from the node contents (not from the raw
        # tree) for numeric objectives too
        value = node_value()
        body += ("%s %s THEN" % (alternate, " AND ".join(conditions)))
        body += " %s\n" % value

    return body