Пример #1
0
class Tree(object):
    """A tree-like predictive model.

    """
    def __init__(self, tree, fields, objective_field=None,
                 root_distribution=None, parent_id=None, ids_map=None,
                 subtree=True, tree_info=None):
        """Builds this node from its dictionary description and, recursively,
           the nodes of its children.

        Arguments:
            tree: dictionary describing the node. The keys `output`,
                  `predicate` and `count` are required; `id`, `children`,
                  `confidence`, `distribution`, `objective_summary` and
                  `weighted_objective_summary` are read when present.
            fields: fields structure, stored as is and shared with children.
            objective_field: id of the objective field.
            root_distribution: summary used to fill the distribution when
                               the node has neither `distribution` nor
                               `objective_summary`.
            parent_id: id of the parent node.
            ids_map: when it is a dictionary, every node that has an id is
                     registered in it as id -> node.
            subtree: only forwarded to the children nodes.
            tree_info: dictionary shared by all the nodes in the tree, where
                       the `regression` and `max_bins` keys are
                       accumulated. A new one is created when not provided.

        """
        # The accumulator is written unconditionally below, so callers that
        # rely on the default must not crash
        if tree_info is None:
            tree_info = {}

        self.fields = fields
        self.objective_id = objective_field
        self.output = tree['output']

        predicate = tree['predicate']
        if predicate is True:
            self.predicate = True
        else:
            self.predicate = Predicate(
                predicate['operator'],
                predicate['field'],
                predicate['value'],
                predicate.get('term', None))

        # parent_id is always defined, even for nodes that carry no id
        self.parent_id = parent_id
        if 'id' in tree:
            self.id = tree['id']
            if isinstance(ids_map, dict):
                ids_map[self.id] = self
        else:
            self.id = None

        self.children = [Tree(child,
                              self.fields,
                              objective_field=objective_field,
                              parent_id=self.id,
                              ids_map=ids_map,
                              subtree=subtree,
                              tree_info=tree_info)
                         for child in tree.get('children', [])]

        self.regression = self.is_regression()
        # the tree is a regression only if every node can be a regression
        tree_info['regression'] = (self.regression and
                                   tree_info.get('regression', True))
        self.count = tree['count']
        self.confidence = tree.get('confidence', None)
        # Defaults for the attributes that only some of the branches below
        # fill in, so that reading them later never raises AttributeError
        self.distribution = None
        self.distribution_unit = None
        self.weighted_distribution = None
        self.weighted_distribution_unit = None
        self.median = None
        self.max = None
        self.min = None
        self.weighted = False
        summary = None
        if 'distribution' in tree:
            self.distribution = tree['distribution']
        elif 'objective_summary' in tree:
            summary = tree['objective_summary']
            (self.distribution_unit,
             self.distribution) = extract_distribution(summary)
            if 'weighted_objective_summary' in tree:
                # from here on, summary refers to the weighted summary
                summary = tree['weighted_objective_summary']
                (self.weighted_distribution_unit,
                 self.weighted_distribution) = extract_distribution(summary)
                self.weighted = True
        else:
            summary = root_distribution
            (self.distribution_unit,
             self.distribution) = extract_distribution(summary)

        if self.regression:
            tree_info['max_bins'] = max(tree_info.get('max_bins', 0),
                                        len(self.distribution))
            # summary is None when the node provides a plain distribution
            statistics = summary if summary else {}
            # `is None` checks: 0 is a legitimate median, maximum or minimum
            self.median = statistics.get('median')
            if self.median is None:
                self.median = dist_median(self.distribution, self.count)
            self.max = statistics.get('maximum')
            if self.max is None:
                self.max = max([value for [value, _] in self.distribution])
            self.min = statistics.get('minimum')
            if self.min is None:
                self.min = min([value for [value, _] in self.distribution])

        self.impurity = None
        if not self.regression and self.distribution is not None:
            self.impurity = self.gini_impurity()

    def gini_impurity(self):
        """Returns the gini impurity score associated to the distribution
           in the node

        """
        purity = 0.0
        if self.distribution is None:
            return None
        for _, instances in self.distribution:
            purity += math.pow(instances / float(self.count), 2)
        return 1.0 - purity

    def list_fields(self, out):
        """Writes the name and optype of every field to `out`.

        The objective field goes first, enclosed in angle brackets. The
        rest of the fields follow in the order given by `sort_fields`,
        enclosed in square brackets. The stream is flushed after each line.
        Returns the fields structure.

        """
        objective = self.fields[self.objective_id]
        objective_line = u'<%-32s : %s>\n' % (objective['name'],
                                              objective['optype'])
        out.write(utf8(objective_line))
        out.flush()

        # descriptions are collected before anything else is written
        input_fields = [(info['name'], info['optype'])
                        for field_id, info in sort_fields(self.fields)
                        if field_id != self.objective_id]
        for name, optype in input_fields:
            out.write(utf8(u'[%-32s : %s]\n' % (name, optype)))
            out.flush()
        return self.fields

    def is_regression(self):
        """Tells whether this node and its direct children are compatible
           with a regression.

        Returns False as soon as the output of the node, or the output of
        any of its children, is a string (a category). Returns True
        otherwise.

        """
        def has_category_output(node):
            """True when the node's output is a string"""
            return isinstance(node.output, basestring)

        if has_category_output(self):
            return False
        if not self.children:
            return True
        for child in self.children:
            if has_category_output(child):
                return False
        return True

    def get_leaves(self, path=None, filter_function=None):
        """Returns a list that includes all the leaves of the tree.

        """
        leaves = []
        if path is None:
            path = []
        if not isinstance(self.predicate, bool):
            path.append(self.predicate.to_lisp_rule(self.fields))

        if self.children:
            for child in self.children:
                leaves += child.get_leaves(path=path[:],
                                           filter_function=filter_function)
        else:
            leaf = {
                'id': self.id,
                'confidence': self.confidence,
                'count': self.count,
                'distribution': self.distribution,
                'impurity': self.impurity,
                'output': self.output,
                'path': path}
            if (not hasattr(filter_function, '__call__')
                    or filter_function(leaf)):
                leaves += [leaf]
        return leaves

    def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
        """Predicts the objective field for the given input data.

        The input data must be keyed by field id. `path` accumulates the
        rules of the branches that are followed. `missing_strategy` selects
        what to do when the value of a splitting field is missing:
            0 - LAST_PREDICTION: the prediction of the last node reached
                is returned.
            1 - PROPORTIONAL: all the branches that stem from the split are
                followed and the distributions of the leaves reached are
                merged to build the final prediction.
        """

        if path is None:
            path = []

        if missing_strategy != PROPORTIONAL:
            # walk down the first child whose predicate holds, if any
            if self.children:
                for child in self.children:
                    if child.predicate.apply(input_data, self.fields):
                        path.append(child.predicate.to_rule(self.fields))
                        return child.predict(input_data, path=path)

            # no child applies: this node issues the prediction
            return Prediction(
                self.output,
                path,
                self.confidence,
                distribution=self.distribution,
                count=get_instances(self.distribution),
                median=self.median if self.regression else None,
                distribution_unit=self.distribution_unit,
                children=self.children,
                d_min=self.min if self.regression else None,
                d_max=self.max if self.regression else None)

        (final_distribution,
         d_min,
         d_max,
         last_node,
         population) = self.predict_proportional(input_data, path=path)

        if not self.regression:
            # categories sorted by decreasing instances, then by name
            ranked = [list(pair) for pair in
                      sorted(final_distribution.items(),
                             key=lambda pair: (-pair[1], pair[0]))]
            return Prediction(
                ranked[0][0],
                path,
                ws_confidence(ranked[0][0], final_distribution,
                              ws_n=population),
                distribution=ranked,
                count=population,
                median=None,
                distribution_unit='categorical',
                children=last_node.children)

        # singular case:
        # the prediction is the one stored in a node that has 1 instance
        if len(final_distribution) == 1:
            (prediction, instances), = final_distribution.items()
            if instances == 1:
                return Prediction(
                    last_node.output,
                    path,
                    last_node.confidence,
                    distribution=last_node.distribution,
                    count=instances,
                    median=last_node.median,
                    distribution_unit=last_node.distribution_unit,
                    children=last_node.children,
                    d_min=last_node.min,
                    d_max=last_node.max)

        # general case: the merged points are sorted by their value
        distribution = [list(point) for point in
                        sorted(final_distribution.items(),
                               key=lambda point: point[0])]
        if len(distribution) > BINS_LIMIT:
            distribution_unit = 'bins'
        else:
            distribution_unit = 'counts'
        distribution = merge_bins(distribution, BINS_LIMIT)
        total_instances = sum([point[1] for point in distribution])
        prediction = mean(distribution)
        variance = unbiased_sample_variance(distribution, prediction)
        confidence = regression_error(variance, total_instances)
        return Prediction(
            prediction,
            path,
            confidence,
            distribution=distribution,
            count=total_instances,
            median=dist_median(distribution, total_instances),
            distribution_unit=distribution_unit,
            children=last_node.children,
            d_min=d_min,
            d_max=d_max)

    def predict_proportional(self, input_data, path=None,
                             missing_found=False, median=False):
        """Merges the distributions of all the leaves that the input data
           can reach in the subtree.

           Whenever the splitting field has no value in the input data,
           every branch of the split is followed and the distributions
           found are merged. Returns a tuple with the merged distribution,
           the minimum and maximum found, the last node reached and the
           population involved.

        """

        if path is None:
            path = []

        if not self.children:
            # leaf: its own distribution (the weighted one if available)
            if self.weighted:
                leaf_distribution = self.weighted_distribution
            else:
                leaf_distribution = self.distribution
            as_dict = dict((point[0], point[1]) for point in leaf_distribution)
            return (merge_distributions({}, as_dict),
                    self.min, self.max, self, self.count)

        single_branch = (
            one_branch(self.children, input_data) or
            self.fields[split(self.children)]["optype"] in ["text", "items"])

        if single_branch:
            for child in self.children:
                if child.predicate.apply(input_data, self.fields):
                    new_rule = child.predicate.to_rule(self.fields)
                    # the path only grows while no missing value was found
                    if new_rule not in path and not missing_found:
                        path.append(new_rule)
                    return child.predict_proportional(input_data, path,
                                                      missing_found, median)
            # no child predicate holds: nothing is returned
            return None

        # missing value found, the unique path stops
        missing_found = True
        final_distribution = {}
        minimums = []
        maximums = []
        population = 0
        for child in self.children:
            child_result = child.predict_proportional(input_data, path,
                                                      missing_found, median)
            (subtree_distribution, subtree_min,
             subtree_max, _, subtree_pop) = child_result
            if subtree_min is not None:
                minimums.append(subtree_min)
            if subtree_max is not None:
                maximums.append(subtree_max)
            population += subtree_pop
            final_distribution = merge_distributions(
                final_distribution, subtree_distribution)
        lowest = min(minimums) if minimums else None
        highest = max(maximums) if maximums else None
        return (final_distribution, lowest, highest, self, population)

    def generate_rules(self, depth=0, ids_path=None, subtree=True):
        """Builds the IF-THEN rules text for the subtree under this node.

        `depth` sets the number of indentation units used as prefix.
        `ids_path` and `subtree` are handed to `filter_nodes` to select
        the children that will be described.

        """
        children = filter_nodes(self.children, ids=ids_path,
                                subtree=subtree)
        if not children:
            # final node: the rule ends with the predicted value
            if self.objective_id:
                target = self.fields[self.objective_id]['slug']
            else:
                target = "Prediction"
            return u"%s %s = %s\n" % (INDENT * depth, target, self.output)

        fragments = []
        for child in children:
            connector = "AND" if child.children else "THEN"
            fragments.append(
                u"%s IF %s %s\n" %
                (INDENT * depth,
                 child.predicate.to_rule(self.fields, 'slug'),
                 connector))
            fragments.append(
                child.generate_rules(depth + 1, ids_path=ids_path,
                                     subtree=subtree))
        return u"".join(fragments)

    def rules(self, out, ids_path=None, subtree=True):
        """Writes the IF-THEN rules of the tree to `out`.

        Before generating the rules, the `slug` of every field is set to
        the slugified version of its name.

        """
        field_ids = [field_id for field_id, _ in sort_fields(self.fields)]
        for field_id in field_ids:
            field_info = self.fields[field_id]
            field_info['slug'] = slugify(field_info['name'])

        rules_text = self.generate_rules(ids_path=ids_path, subtree=subtree)
        out.write(utf8(rules_text))
        out.flush()

    def python_body(self, depth=1, cmv=None, input_map=False,
                    ids_path=None, subtree=True):
        """Translates the subtree under this node into Python "if"
           statements.

        Arguments, as they are used in this method:
            depth: nesting level. Every generated line is prefixed with
                   INDENT * depth.
            cmv: list of the slugs of the fields whose missing value has
                 already been tested in the generated code (presumably
                 "checked missing values"). A new list is used when None.
            input_map: when true, the fields are read from a `data`
                       dictionary instead of being used as variable names.
            ids_path, subtree: handed to `filter_nodes` to select the
                               children to be translated.

        Returns the tuple (body, term_analysis_fields, item_analysis_fields)
        that contains the generated text and the (field id, term) pairs
        collected from the `text` and `items` predicates in the subtree.

        When the children have no branch for missing values, the generated
        code returns the output of this node as soon as the value of the
        splitting field is None.

        """

        def map_data(field, missing=False):
            """Returns the expression used to read `field` in the generated
               code: the bare name, or a lookup in the `data` dictionary
               when `input_map` is set (`data.get` if `missing` is true,
               so that an absent key yields None).
            """
            if input_map:
                if missing:
                    return "data.get('%s')" % field
                else:
                    return "data['%s']" % field
            return field
        if cmv is None:
            cmv = []
        body = u""
        term_analysis_fields = []
        item_analysis_fields = []
        children = filter_nodes(self.children, ids=ids_path,
                                subtree=subtree)
        if children:
            # NOTE(review): split() presumably returns the id of the field
            # that all the children split on -- confirm in its definition
            field = split(children)
            has_missing_branch = (missing_branch(children) or
                                  none_value(children))
            # the missing is singled out as a special case only when there's
            # no missing branch in the children list
            if not has_missing_branch and \
                    self.fields[field]["optype"] not in ["text", "items"] and \
                    self.fields[field]['slug'] not in cmv:
                body += (u"%sif (%s is None):\n" %
                         (INDENT * depth,
                          map_data(self.fields[field]['slug'], True)))
                # numeric outputs are written as is, the rest are quoted
                if self.fields[self.objective_id]['optype'] == 'numeric':
                    value = self.output
                else:
                    value = repr(self.output)
                body += (u"%sreturn %s\n" %
                         (INDENT * (depth + 1),
                          value))
                # remember the test so that it is not generated again deeper
                cmv.append(self.fields[field]['slug'])

            for child in children:
                field = child.predicate.field
                pre_condition = u""
                # with a missing branch, each condition explicitly includes
                # ("is None or") or excludes ("is not None and") the missing
                if has_missing_branch and child.predicate.value is not None:
                    negation = u"" if child.predicate.missing else u" not"
                    connection = u"or" if child.predicate.missing else u"and"
                    pre_condition = (
                        u"%s is%s None %s " % (
                            map_data(self.fields[field]['slug'], True),
                            negation,
                            connection))
                    if not child.predicate.missing:
                        cmv.append(self.fields[field]['slug'])
                optype = self.fields[field]['optype']
                # only the values of the remaining optypes need quoting
                if (optype == 'numeric' or optype == 'text' or
                        optype == 'items'
                        or child.predicate.value is None):
                    value = child.predicate.value
                else:
                    value = repr(child.predicate.value)
                if optype == 'text' or optype == 'items':
                    # text and items predicates compare the result of a
                    # matching function; the pair is collected for the caller
                    if optype == 'text':
                        term_analysis_fields.append((field,
                                                     child.predicate.term))
                        matching_function = "term_matches"
                    else:
                        item_analysis_fields.append((field,
                                                     child.predicate.term))
                        matching_function = "item_matches"

                    # the term is written as a double quoted literal, with
                    # the `u` prefix for unicode terms
                    body += (
                        u"%sif (%s%s(%s, \"%s\", %s\"%s\") %s %s):"
                        u"\n" %
                        (INDENT * depth, pre_condition, matching_function,
                         map_data(self.fields[field]['slug'],
                                  False),
                         self.fields[field]['slug'],
                         ('u' if isinstance(child.predicate.term, unicode)
                          else ''),
                         child.predicate.term.replace("\"", "\\\""),
                         PYTHON_OPERATOR[child.predicate.operator],
                         value))
                else:
                    # predicates with a None value use the operators in
                    # MISSING_OPERATOR instead of the regular ones
                    operator = (MISSING_OPERATOR[child.predicate.operator] if
                                child.predicate.value is None else
                                PYTHON_OPERATOR[child.predicate.operator])
                    if child.predicate.value is None:
                        cmv.append(self.fields[field]['slug'])
                    body += (
                        u"%sif (%s%s %s %s):\n" %
                        (INDENT * depth, pre_condition,
                         map_data(self.fields[field]['slug'],
                                  False),
                         operator,
                         value))
                # the child receives a copy of cmv, so what is added deeper
                # in its subtree is not seen by its siblings
                next_level = child.python_body(depth + 1, cmv=cmv[:],
                                               input_map=input_map,
                                               ids_path=ids_path,
                                               subtree=subtree)
                body += next_level[0]
                term_analysis_fields.extend(next_level[1])
                item_analysis_fields.extend(next_level[2])
        else:
            # final node: the generated code returns the node's output
            if self.fields[self.objective_id]['optype'] == 'numeric':
                value = self.output
            else:
                value = repr(self.output)
            body = u"%sreturn %s\n" % (INDENT * depth, value)

        return body, term_analysis_fields, item_analysis_fields

    def python(self, out, docstring, input_map=False,
               ids_path=None, subtree=True):
        """Writes to `out` the source of a python function that predicts
           like the model.

        The function receives one keyword argument per input field, or a
        single `data` dictionary when `input_map` is set or the number of
        fields exceeds MAX_ARGS_LENGTH. The `slug` of every field is
        updated as a side effect.

        """
        parameters = sort_fields(self.fields)
        if not input_map:
            input_map = len(parameters) > MAX_ARGS_LENGTH
        # slugs used as argument names must not clash with python keywords
        if input_map:
            reserved_keywords = None
            prefix = ""
        else:
            reserved_keywords = keyword.kwlist
            prefix = "_"

        args = []
        field_ids = [field_id for field_id, _ in parameters]
        for field_id in field_ids:
            field_info = self.fields[field_id]
            slug = slugify(field_info['name'],
                           reserved_keywords=reserved_keywords, prefix=prefix)
            field_info['slug'] = slug
            if not input_map and field_id != self.objective_id:
                args.append("%s=None" % slug)
        if input_map:
            args.append("data={}")

        predictor_definition = (u"def predict_%s" %
                                self.fields[self.objective_id]['slug'])
        # arguments are aligned under the opening parenthesis
        separator = ",\n" + " " * (len(predictor_definition) + 1)
        predictor = u"%s(%s):\n" % (predictor_definition,
                                    separator.join(args))
        predictor_doc = (INDENT + u"\"\"\" " + docstring +
                         u"\n" + INDENT + u"\"\"\"\n")

        body, term_predicates, item_predicates = self.python_body(
            input_map=input_map, ids_path=ids_path, subtree=subtree)
        terms_body = u""
        if term_predicates or item_predicates:
            terms_body = self.term_analysis_body(term_predicates,
                                                 item_predicates)

        out.write(utf8(predictor + predictor_doc + terms_body + body))
        out.flush()

    def term_analysis_body(self, term_analysis_predicates,
                           item_analysis_predicates):
        """ Writes auxiliary functions to handle the term and item
        analysis fields

        """
        body = u""
        # static content
        body += """
    import re

    tm_tokens = '%s'
    tm_full_term = '%s'
    tm_all = '%s'

"""  % (TM_TOKENS, TM_FULL_TERM, TM_ALL)
        if term_analysis_predicates:
            body += """
    def term_matches(text, field_name, term):
        \"\"\" Counts the number of occurences of term and its variants in text

        \"\"\"
        if text is None:
            text = ""
        forms_list = term_forms[field_name].get(term, [term])
        options = term_analysis[field_name]
        token_mode = options.get('token_mode', tm_tokens)
        case_sensitive = options.get('case_sensitive', False)
        first_term = forms_list[0]
        if token_mode == tm_full_term:
            return full_term_match(text, first_term, case_sensitive)
        else:
            # In token_mode='all' we will match full terms using equals and
            # tokens using contains
            if token_mode == tm_all and len(forms_list) == 1:
                pattern = re.compile(r'^.+\\b.+$', re.U)
                if re.match(pattern, first_term):
                    return full_term_match(text, first_term, case_sensitive)
            return term_matches_tokens(text, forms_list, case_sensitive)


    def full_term_match(text, full_term, case_sensitive):
        \"\"\"Counts the match for full terms according to the case_sensitive
              option

        \"\"\"
        if not case_sensitive:
            text = text.lower()
            full_term = full_term.lower()
        return 1 if text == full_term else 0

    def get_tokens_flags(case_sensitive):
        \"\"\"Returns flags for regular expression matching depending on text
              analysis options

        \"\"\"
        flags = re.U
        if not case_sensitive:
            flags = (re.I | flags)
        return flags


    def term_matches_tokens(text, forms_list, case_sensitive):
        \"\"\" Counts the number of occurences of the words in forms_list in
               the text

        \"\"\"
        flags = get_tokens_flags(case_sensitive)
        expression = ur'(\\b|_)%s(\\b|_)' % '(\\\\b|_)|(\\\\b|_)'.join(forms_list)
        pattern = re.compile(expression, flags=flags)
        matches = re.findall(pattern, text)
        return len(matches)

"""

            term_analysis_options = set([predicate[0] for predicate in
                                         term_analysis_predicates])
            term_analysis_predicates = set(term_analysis_predicates)
            body += """
    term_analysis = {"""
            for field_id in term_analysis_options:
                field = self.fields[field_id]
                body += """
        \"%s\": {""" % field['slug']
                for option in field['term_analysis']:
                    if option in TERM_OPTIONS:
                        body += """
                \"%s\": %s,""" % (option, repr(field['term_analysis'][option]))
                body += """
        },"""
                body += """
    }"""
            term_forms = {}
            fields = self.fields
            for field_id, term in term_analysis_predicates:
                alternatives = []
                field = fields[field_id]
                if field['slug'] not in term_forms:
                    term_forms[field['slug']] = {}
                all_forms = field['summary'].get('term_forms', {})
                if all_forms:
                    alternatives = all_forms.get(term, [])
                    if alternatives:
                        terms = [term]
                        terms.extend(all_forms.get(term, []))
                        term_forms[field['slug']][term] = terms
                body += """
    term_forms = {"""
                for field in term_forms:
                    body += """
        \"%s\": {""" % field
                    for term in term_forms[field]:
                        body += """
            u\"%s\": %s,""" % (term, term_forms[field][term])
                    body += """
        },
                """
                body += """
    }
"""
        if item_analysis_predicates:
            body += """
    def item_matches(text, field_name, item):
        \"\"\" Counts the number of occurences of item in text

        \"\"\"
        if text is None:
            text = ""
        options = item_analysis[field_name]
        separator = options.get('separator', ' ')
        regexp = options.get('separator_regexp')
        if regexp is None:
            regexp = r\"%s\" % separator
        return count_items_matches(text, item, regexp)


    def count_items_matches(text, item, regexp):
        \"\"\" Counts the number of occurences of the item in the text

        \"\"\"
        expression = r'(^|%s)%s($|%s)' % (regexp, item, regexp)
        pattern = re.compile(expression, flags=re.U)
        matches = re.findall(pattern, text)
        return len(matches)
"""

            item_analysis_options = set([predicate[0] for predicate in
                                         item_analysis_predicates])
            item_analysis_predicates = set(item_analysis_predicates)
            body += """
    item_analysis = {"""
            for field_id in item_analysis_options:
                field = self.fields[field_id]
                body += """
        \"%s\": {""" % field['slug']
                for option in field['item_analysis']:
                    if option in ITEM_OPTIONS:
                        body += """
                \"%s\": %s,""" % (option, repr(field['item_analysis'][option]))
                body += """
        },"""
            body += """
    }
"""

        return body

    def tableau_body(self, body=u"", conditions=None, cmv=None,
                     ids_path=None, subtree=True):
        """Translates the subtree under this node into a chain of
           IF / ELSEIF ... THEN statements in Tableau syntax.

        Arguments, as they are used in this method:
            body: text generated so far. When it is empty the next
                  statement starts with IF, otherwise with ELSEIF.
            conditions: list of the conditions accumulated from the root,
                        joined with " AND " in every generated statement.
                        It is only defaulted to a new list when `body` is
                        empty.
            cmv: list of the names of the fields whose missing value has
                 already been tested (presumably "checked missing values").
                 A new list is used when None.
            ids_path, subtree: handed to `filter_nodes` to select the
                               children to be translated.

        Returns the body extended with the statements of the subtree. The
        call returns an empty string as soon as it finds a predicate with a
        value on a `text` or `items` field, as those are not translated.

        """

        if cmv is None:
            cmv = []
        if body:
            alternate = u"ELSEIF"
        else:
            if conditions is None:
                conditions = []
            alternate = u"IF"

        children = filter_nodes(self.children, ids=ids_path,
                                subtree=subtree)
        if children:
            # NOTE(review): split() presumably returns the id of the field
            # that all the children split on -- confirm in its definition
            field = split(children)
            has_missing_branch = (missing_branch(children) or
                                  none_value(children))
            # the missing is singled out as a special case only when there's
            # no missing branch in the children list
            if (not has_missing_branch and
                    self.fields[field]['name'] not in cmv):
                conditions.append("ISNULL([%s])" % self.fields[field]['name'])
                body += (u"%s %s THEN " %
                         (alternate, " AND ".join(conditions)))
                # numeric outputs are written as is; the rest go through
                # tableau_string
                if self.fields[self.objective_id]['optype'] == 'numeric':
                    value = self.output
                else:
                    value = tableau_string(self.output)
                body += (u"%s\n" % value)
                cmv.append(self.fields[field]['name'])
                alternate = u"ELSEIF"
                # the ISNULL test only applies to the statement just written
                del conditions[-1]

            for child in children:
                pre_condition = u""
                post_condition = u""
                # with a missing branch, each condition explicitly includes
                # ("ISNULL ... OR") or excludes ("NOT ISNULL ... AND") the
                # missing values, and is wrapped in parentheses
                if has_missing_branch and child.predicate.value is not None:
                    negation = u"" if child.predicate.missing else u"NOT "
                    connection = u"OR" if child.predicate.missing else u"AND"
                    pre_condition = (
                        u"(%sISNULL([%s]) %s " % (
                            negation, self.fields[field]['name'], connection))
                    if not child.predicate.missing:
                        cmv.append(self.fields[field]['name'])
                    post_condition = u")"
                optype = self.fields[child.predicate.field]['optype']
                if child.predicate.value is None:
                    value = ""
                elif optype == 'text' or optype == 'items':
                    # not translated: the call ends with an empty body
                    return u""
                elif optype == 'numeric':
                    value = child.predicate.value
                else:
                    value = repr(child.predicate.value)

                # predicates with a None value have neither operator nor
                # value: the prefix found in T_MISSING_OPERATOR and the
                # closing parenthesis surround the field instead
                operator = (u"" if child.predicate.value is None else
                            PYTHON_OPERATOR[child.predicate.operator])
                if child.predicate.value is None:
                    pre_condition = (
                        T_MISSING_OPERATOR[child.predicate.operator])
                    post_condition = u")"

                conditions.append("%s[%s]%s%s%s" % (
                    pre_condition,
                    self.fields[child.predicate.field]['name'],
                    operator,
                    value,
                    post_condition))
                # the child works on copies of conditions and cmv, so its
                # additions are not seen by its siblings
                body = child.tableau_body(body, conditions[:], cmv=cmv[:],
                                          ids_path=ids_path, subtree=subtree)
                # drop this child's condition before the next sibling
                del conditions[-1]
        else:
            # final node: one statement with all the accumulated conditions
            if self.fields[self.objective_id]['optype'] == 'numeric':
                value = self.output
            else:
                value = tableau_string(self.output)
            body += (
                u"%s %s THEN" % (alternate, " AND ".join(conditions)))
            body += u" %s\n" % value

        return body

    def tableau(self, out, ids_path=None, subtree=True):
        """Writes to `out` the Tableau version of the model.

        Returns True when the code has been written and False when the
        generated body is empty, in which case nothing is written.

        """
        tableau_code = self.tableau_body(ids_path=ids_path, subtree=subtree)
        if tableau_code:
            out.write(utf8(tableau_code))
            out.flush()
            return True
        return False

    def get_nodes_info(self, headers=None, leaves_only=False):
        """Yields the information associated to each of the tree nodes

        """
        row = []
        if not self.regression:
            category_dict = dict(self.distribution)
        for header in headers:
            if header == self.fields[self.objective_id]['name']:
                row.append(self.output)
                continue
            if header in ['confidence', 'error']:
                row.append(self.confidence)
                continue
            if header == 'impurity':
                row.append(self.impurity)
                continue
            if self.regression and header.startswith('bin'):
                for bin_value, bin_instances in self.distribution:
                    row.append(bin_value)
                    row.append(bin_instances)
                break
            if not self.regression:
                row.append(category_dict.get(header))
        while len(row) < len(headers):
            row.append(None)
        if not leaves_only or not self.children:
            yield row

        if self.children:
            for child in self.children:
                for row in child.get_nodes_info(headers,
                                                leaves_only=leaves_only):
                    yield row
Пример #2
0
class BoostedTree(object):
    """Node of a boosted tree model.

    Every node stores its output, predicate, instance count and the
    `g_sum` / `h_sum` values found in the tree structure, plus the list of
    its children nodes (instances of the same class).

    """
    def __init__(self, tree, fields, objective_field=None):
        """Builds the node described by the `tree` dictionary and,
           recursively, all of its children.

        """
        self.fields = fields
        self.objective_id = objective_field
        self.output = tree['output']

        predicate_info = tree['predicate']
        if predicate_info is True:
            # nodes with no split condition keep the literal True
            self.predicate = True
        else:
            self.predicate = Predicate(predicate_info['operator'],
                                       predicate_info['field'],
                                       predicate_info['value'],
                                       predicate_info.get('term', None))

        self.id = tree.get('id')
        node_class = self.__class__
        self.children = [
            node_class(child, self.fields, objective_field=objective_field)
            for child in tree.get('children', [])]
        self.count = tree['count']
        self.g_sum = tree.get('g_sum')
        self.h_sum = tree.get('h_sum')

    def list_fields(self, out):
        """Writes one `[name : optype]` line per field to `out`, in the
           order given by `sort_fields`, and returns the fields structure.

        """
        descriptions = [(val['name'], val['optype'])
                        for _, val in sort_fields(self.fields)]
        for name, optype in descriptions:
            out.write(utf8(u'[%-32s : %s]\n' % (name, optype)))
            out.flush()
        return self.fields

    def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
        """Predicts using the values in `input_data`.

        The input fields must be keyed by Id. There are two possible
        strategies to predict when the value for the splitting field
        is missing:
            0 - LAST_PREDICTION: the last issued prediction is returned.
            1 - PROPORTIONAL: we consider all possible outcomes and create
                              an average prediction.

        With PROPORTIONAL the result is whatever `predict_proportional`
        returns; otherwise a `Prediction` built from the last node reached.
        The rules followed are appended to `path`.
        """
        if path is None:
            path = []
        if missing_strategy == PROPORTIONAL:
            return self.predict_proportional(input_data, path=path)

        # go down through the first child whose predicate holds
        for child in (self.children or ()):
            if child.predicate.apply(input_data, self.fields):
                path.append(child.predicate.to_rule(self.fields))
                return child.predict(input_data, path=path)

        # no child applies (or there are none): this node is the answer
        return Prediction(self.output,
                          path,
                          None,
                          distribution=None,
                          count=self.count,
                          median=None,
                          distribution_unit=None,
                          children=self.children,
                          d_min=None,
                          d_max=None)

    def predict_proportional(self, input_data, path=None, missing_found=False):
        """Predicts taking into account every leaf reachable in a subtree.

           Whenever the splitting field has no value in `input_data`, all
           the branches of the split are followed and their results are
           added up. Returns the tuple (g_sum, h_sum, count, path), where
           `path` holds the rules followed until the first missing value.

        """
        if path is None:
            path = []

        if not self.children:
            return (self.g_sum, self.h_sum, self.count, path)

        follows_single_branch = (
            one_branch(self.children, input_data) or
            self.fields[split(self.children)]["optype"] in
            ["text", "items"])

        if not follows_single_branch:
            # missing value found, the unique path stops
            g_total = 0.0
            h_total = 0.0
            population = 0
            for child in self.children:
                child_g, child_h, child_count, _ = \
                    child.predict_proportional(input_data, path, True)
                g_total += child_g
                h_total += child_h
                population += child_count
            return (g_total, h_total, population, path)

        for child in self.children:
            if not child.predicate.apply(input_data, self.fields):
                continue
            new_rule = child.predicate.to_rule(self.fields)
            if not (new_rule in path or missing_found):
                path.append(new_rule)
            return child.predict_proportional(input_data, path,
                                              missing_found)
        # no child predicate applied: nothing is computed
        return None

    def get_leaves(self, path=None, filter_function=None):
        """Collects the leaves found below (and including) this node.

        Each leaf is a dictionary with the keys 'id', 'count', 'g_sum',
        'h_sum', 'output' and 'path' (the lisp rules from the starting node).
        When `filter_function` is callable only the leaves for which it
        returns a true value are kept.

        """
        if path is None:
            path = []
        if not isinstance(self.predicate, bool):
            path.append(self.predicate.to_lisp_rule(self.fields))

        if self.children:
            collected = []
            for child in self.children:
                # every branch gets its own copy of the path so far
                collected.extend(
                    child.get_leaves(path=path[:],
                                     filter_function=filter_function))
            return collected

        leaf = {
            'id': self.id,
            'count': self.count,
            'g_sum': self.g_sum,
            'h_sum': self.h_sum,
            'output': self.output,
            'path': path
        }
        unfiltered = not hasattr(filter_function, '__call__')
        if unfiltered or filter_function(leaf):
            return [leaf]
        return []
Пример #3
0
class Tree(object):
    """A tree-like predictive model.

    """
    def __init__(self,
                 tree,
                 fields,
                 objective_field=None,
                 root_distribution=None,
                 parent_id=None,
                 ids_map=None,
                 subtree=True,
                 tree_info=None):
        """Builds the node described by `tree` and, recursively, its
           children.

        :param tree: dictionary describing the node. The keys read are
                     'output', 'predicate', 'count' and, optionally, 'id',
                     'children', 'confidence', 'distribution',
                     'objective_summary', 'weighted_objective_summary'
                     and 'weight'.
        :param fields: fields structure, stored and shared with the children
        :param objective_field: id of the objective field
        :param root_distribution: summary used as the node's distribution
                                  when `tree` has neither 'distribution'
                                  nor 'objective_summary'
        :param parent_id: id of the parent node (stored only when the node
                          has an id)
        :param ids_map: when it is a dict, every node with an id is
                        registered in it under that id
        :param subtree: only forwarded to the children nodes
        :param tree_info: dict where the 'regression' flag and the
                          'max_bins' found in the nodes are accumulated.
                          A fresh dict is used when omitted.

        """
        if tree_info is None:
            # the declared default used to fail when storing the
            # 'regression' flag below
            tree_info = {}

        self.fields = fields
        self.objective_id = objective_field
        self.output = tree['output']

        if tree['predicate'] is True:
            self.predicate = True
        else:
            self.predicate = Predicate(tree['predicate']['operator'],
                                       tree['predicate']['field'],
                                       tree['predicate']['value'],
                                       tree['predicate'].get('term', None))
        if 'id' in tree:
            self.id = tree['id']
            self.parent_id = parent_id
            if isinstance(ids_map, dict):
                ids_map[self.id] = self
        else:
            self.id = None

        children = []
        if 'children' in tree:
            for child in tree['children']:
                children.append(self.__class__( \
                    child,
                    self.fields,
                    objective_field=objective_field,
                    parent_id=self.id,
                    ids_map=ids_map,
                    subtree=subtree,
                    tree_info=tree_info))

        self.children = children
        self.regression = self.is_regression()
        # the model is a regression only if every node so far agrees
        tree_info['regression'] = (self.regression
                                   and tree_info.get('regression', True))
        self.count = tree['count']
        self.confidence = tree.get('confidence', None)
        self.distribution = None
        self.max = None
        self.min = None
        self.weighted = False
        summary = None
        if 'distribution' in tree:
            self.distribution = tree['distribution']
        elif 'objective_summary' in tree:
            summary = tree['objective_summary']
            (self.distribution_unit,
             self.distribution) = extract_distribution(summary)
            if 'weighted_objective_summary' in tree:
                summary = tree['weighted_objective_summary']
                (self.weighted_distribution_unit,
                 self.weighted_distribution) = extract_distribution(summary)
                self.weight = tree['weight']
                self.weighted = True
        else:
            summary = root_distribution
            (self.distribution_unit,
             self.distribution) = extract_distribution(summary)
        if self.regression:
            tree_info['max_bins'] = max(tree_info.get('max_bins', 0),
                                        len(self.distribution))
            # there is no summary when the node carries an explicit
            # 'distribution': every statistic is then derived from it
            summary_info = summary or {}
            # `is None` checks: a median, maximum or minimum equal to 0 is
            # a valid value and must not trigger the fallback
            self.median = summary_info.get('median')
            if self.median is None:
                self.median = dist_median(self.distribution, self.count)
            self.max = summary_info.get('maximum')
            if self.max is None:
                self.max = max([value for [value, _] in self.distribution])
            self.min = summary_info.get('minimum')
            if self.min is None:
                self.min = min([value for [value, _] in self.distribution])
        self.impurity = None
        if not self.regression and self.distribution is not None:
            self.impurity = self.gini_impurity()

    def gini_impurity(self):
        """Computes the gini impurity of the node's distribution: one minus
           the sum of the squared proportions (instances over the node's
           count) of every entry. Returns None when the node has no
           distribution.

        """
        if self.distribution is None:
            return None
        squared_shares = 0.0
        for entry in self.distribution:
            _, instances = entry
            share = instances / float(self.count)
            squared_shares += math.pow(share, 2)
        return 1.0 - squared_shares

    def list_fields(self, out):
        """Writes a description of the model's fields to `out`.

        The objective field goes first, as `<name : optype>`, followed by
        one `[name : optype]` line for each of the other fields in the order
        given by `sort_fields`. Returns the fields structure.

        """
        objective_line = u'<%-32s : %s>\n' % (
            self.fields[self.objective_id]['name'],
            self.fields[self.objective_id]['optype'])
        out.write(utf8(objective_line))
        out.flush()

        inputs = [(val['name'], val['optype'])
                  for key, val in sort_fields(self.fields)
                  if key != self.objective_id]
        for name, optype in inputs:
            out.write(utf8(u'[%-32s : %s]\n' % (name, optype)))
            out.flush()
        return self.fields

    def is_regression(self):
        """Tells whether this node and its direct children are compatible
           with a regression: none of their outputs is a string.

        """
        def holds_category(node):
            """True when the node's output is a string (a category)

            """
            return isinstance(node.output, basestring)

        if holds_category(self):
            return False
        if not self.children:
            return True
        for child in self.children:
            if holds_category(child):
                return False
        return True

    def get_leaves(self, path=None, filter_function=None):
        """Collects the leaves found below (and including) this node.

        Each leaf is a dictionary with the keys 'id', 'confidence', 'count',
        'distribution', 'impurity', 'output' and 'path' (the lisp rules
        from the starting node), plus 'weighted_distribution' and 'weight'
        when the node has a weighted distribution. When `filter_function`
        is callable only the leaves for which it returns a true value are
        kept.

        """
        if path is None:
            path = []
        if not isinstance(self.predicate, bool):
            path.append(self.predicate.to_lisp_rule(self.fields))

        if self.children:
            collected = []
            for child in self.children:
                # every branch gets its own copy of the path so far
                collected.extend(
                    child.get_leaves(path=path[:],
                                     filter_function=filter_function))
            return collected

        leaf = {
            'id': self.id,
            'confidence': self.confidence,
            'count': self.count,
            'distribution': self.distribution,
            'impurity': self.impurity,
            'output': self.output,
            'path': path
        }
        if hasattr(self, 'weighted_distribution'):
            leaf["weighted_distribution"] = self.weighted_distribution
            leaf["weight"] = self.weight
        unfiltered = not hasattr(filter_function, '__call__')
        if unfiltered or filter_function(leaf):
            return [leaf]
        return []

    def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
        """Makes a prediction based on a number of field values.

        The input fields must be keyed by Id. There are two possible
        strategies to predict when the value for the splitting field
        is missing:
            0 - LAST_PREDICTION: the last issued prediction is returned.
            1 - PROPORTIONAL: as we cannot choose between the two branches
                in the tree that stem from this split, we consider both. The
                algorithm goes on until the final leaves are reached and
                all their predictions are used to decide the final prediction.

        The rules followed are appended to `path` (a new list is used when
        it is omitted). Returns a `Prediction` object.
        """

        if path is None:
            path = []
        if missing_strategy == PROPORTIONAL:
            (final_distribution, d_min, d_max, last_node, population,
             parent_node) = self.predict_proportional(input_data, path=path)

            if self.regression:
                # singular case:
                # when the prediction is the one given in a 1-instance node
                if len(final_distribution) == 1:
                    # list() keeps the lookup valid when items() is a view
                    prediction, instances = \
                        list(final_distribution.items())[0]
                    if instances == 1:
                        return Prediction(
                            last_node.output,
                            path,
                            last_node.confidence,
                            distribution=(last_node.distribution if not  \
                                self.weighted else \
                                last_node.weighted_distribution),
                            count=instances,
                            median=last_node.median,
                            distribution_unit=last_node.distribution_unit,
                            children=last_node.children,
                            d_min=last_node.min,
                            d_max=last_node.max)
                # when there's more instances, sort elements by their mean
                distribution = [
                    list(element)
                    for element in sorted(final_distribution.items(),
                                          key=lambda x: x[0])
                ]
                distribution_unit = ('bins' if len(distribution) > BINS_LIMIT
                                     else 'counts')
                distribution = merge_bins(distribution, BINS_LIMIT)
                total_instances = sum(
                    [instances for _, instances in distribution])
                if len(distribution) == 1:
                    # where there's only one bin, there will be no error, but
                    # we use a correction derived from the parent's error
                    prediction = distribution[0][0]
                    if total_instances < 2:
                        total_instances = 1
                    try:
                        # some strange models can have nodes with no confidence
                        confidence = round(
                            parent_node.confidence /
                            math.sqrt(total_instances), PRECISION)
                    except (AttributeError, TypeError):
                        # AttributeError: there is no parent node;
                        # TypeError: the parent's confidence is None
                        confidence = None
                else:
                    prediction = mean(distribution)
                    confidence = round(
                        regression_error(
                            unbiased_sample_variance(distribution, prediction),
                            total_instances), PRECISION)
                return Prediction(prediction,
                                  path,
                                  confidence,
                                  distribution=distribution,
                                  count=total_instances,
                                  median=dist_median(distribution,
                                                     total_instances),
                                  distribution_unit=distribution_unit,
                                  children=last_node.children,
                                  d_min=d_min,
                                  d_max=d_max)
            else:
                # categories sorted by decreasing number of instances and,
                # on ties, by category: the first one is the prediction
                distribution = [
                    list(element)
                    for element in sorted(final_distribution.items(),
                                          key=lambda x: (-x[1], x[0]))
                ]
                return Prediction(distribution[0][0],
                                  path,
                                  ws_confidence(distribution[0][0],
                                                final_distribution,
                                                ws_n=population),
                                  distribution=distribution,
                                  count=population,
                                  median=None,
                                  distribution_unit='categorical',
                                  children=last_node.children)

        else:
            if self.children:
                # go down through the first child whose predicate holds
                for child in self.children:
                    if child.predicate.apply(input_data, self.fields):
                        path.append(child.predicate.to_rule(self.fields))
                        return child.predict(input_data, path=path)

            # no child applies (or there are none): this node is the answer
            if self.weighted:
                output_distribution = self.weighted_distribution
                output_unit = self.weighted_distribution_unit
            else:
                output_distribution = self.distribution
                output_unit = self.distribution_unit

            return Prediction(
                self.output,
                path,
                self.confidence,
                distribution=output_distribution,
                count=get_instances(output_distribution),
                median=None if not self.regression else self.median,
                distribution_unit=output_unit,
                children=self.children,
                d_min=None if not self.regression else self.min,
                d_max=None if not self.regression else self.max)

    def predict_proportional(self,
                             input_data,
                             path=None,
                             missing_found=False,
                             median=False,
                             parent=None):
        """Predicts merging the distributions of every leaf reachable in a
           subtree.

           Whenever the splitting field has no value in `input_data`, all
           the branches of the split are followed and their distributions
           merged. Returns the tuple (distribution, minimum, maximum,
           last node, population, parent node), where the last node is the
           last one reached by a unique path. The rules followed until the
           first missing value are appended to `path`; `median` is only
           forwarded to the children.

        """
        if path is None:
            path = []

        if not self.children:
            if self.weighted:
                leaf_distribution = self.weighted_distribution
            else:
                leaf_distribution = self.distribution
            counts = dict((entry[0], entry[1]) for entry in leaf_distribution)
            return (merge_distributions({}, counts), self.min, self.max,
                    self, self.count, parent)

        follows_single_branch = (
            one_branch(self.children, input_data) or
            self.fields[split(self.children)]["optype"] in
            ["text", "items"])

        if follows_single_branch:
            for child in self.children:
                if not child.predicate.apply(input_data, self.fields):
                    continue
                new_rule = child.predicate.to_rule(self.fields)
                if not (new_rule in path or missing_found):
                    path.append(new_rule)
                return child.predict_proportional(input_data,
                                                  path,
                                                  missing_found,
                                                  median,
                                                  parent=self)
            # no child predicate applied: nothing is computed
            return None

        # missing value found, the unique path stops
        merged = {}
        minimums = []
        maximums = []
        population = 0
        for child in self.children:
            outcome = child.predict_proportional(input_data, path,
                                                 True, median,
                                                 parent=self)
            (child_distribution, child_min,
             child_max, _, child_population, _) = outcome
            if child_min is not None:
                minimums.append(child_min)
            if child_max is not None:
                maximums.append(child_max)
            population += child_population
            merged = merge_distributions(merged, child_distribution)
        lowest = min(minimums) if minimums else None
        highest = max(maximums) if maximums else None
        return (merged, lowest, highest, self, population, self)

    def generate_rules(self, depth=0, ids_path=None, subtree=True):
        """Returns the IF-THEN rules text for the subtree under this node.

        Each child kept by `filter_nodes` produces an `IF <rule>` line,
        ending in AND when the child has children or THEN otherwise,
        followed by the child's own rules one level deeper. A node with no
        children to show produces the `<objective slug> = <output>` line
        ("Prediction" is used when there is no objective id).

        """
        children = filter_nodes(self.children, ids=ids_path, subtree=subtree)
        if not children:
            if self.objective_id:
                target = self.fields[self.objective_id]['slug']
            else:
                target = "Prediction"
            return u"%s %s = %s\n" % (INDENT * depth, target, self.output)

        pieces = []
        for child in children:
            condition = child.predicate.to_rule(self.fields, 'slug')
            connector = "AND" if child.children else "THEN"
            pieces.append(u"%s IF %s %s\n" %
                          (INDENT * depth, condition, connector))
            pieces.append(child.generate_rules(depth + 1,
                                               ids_path=ids_path,
                                               subtree=subtree))
        return u"".join(pieces)

    def rules(self, out, ids_path=None, subtree=True):
        """Writes the IF-THEN rules version of the tree to `out`.

        Before generating the rules, every field gets a 'slug' attribute
        computed from its name with `slugify`.

        """
        field_ids = [key for key, _ in sort_fields(self.fields)]
        for field_id in field_ids:
            field = self.fields[field_id]
            field.update(slug=slugify(field['name']))
        rules_text = self.generate_rules(ids_path=ids_path, subtree=subtree)
        out.write(utf8(rules_text))
        out.flush()

    def python_body(self,
                    depth=1,
                    cmv=None,
                    input_map=False,
                    ids_path=None,
                    subtree=True):
        """Translate the model into a set of "if" python statements.

        `depth` controls the size of indentation. As soon as a value is missing
        that node is returned without further evaluation.

        `cmv` is the list of field slugs for which a check on None has
        already been generated in the current branch (each child receives
        its own copy). When `input_map` is true the generated conditions
        read the values from a `data` dictionary instead of using one
        variable per field. `ids_path` and `subtree` are handed over to
        `filter_nodes` to select the children to translate.

        Returns the tuple (body, term_analysis_fields, item_analysis_fields),
        where the last two are the lists of (field id, term) pairs used in
        the text and items predicates found in the subtree.

        """
        def map_data(field, missing=False):
            """Returns the subject of the condition in map format when
               more than MAX_ARGS_LENGTH arguments are used.
            """
            if input_map:
                if missing:
                    # .get is used in the checks on None, as the key might
                    # not be in the input data
                    return "data.get('%s')" % field
                else:
                    return "data['%s']" % field
            return field

        if cmv is None:
            cmv = []
        body = u""
        term_analysis_fields = []
        item_analysis_fields = []
        children = filter_nodes(self.children, ids=ids_path, subtree=subtree)
        if children:
            field = split(children)
            has_missing_branch = (missing_branch(children)
                                  or none_value(children))
            # the missing is singled out as a special case only when there's
            # no missing branch in the children list
            if not has_missing_branch and \
                    self.fields[field]["optype"] not in ["text", "items"] and \
                    self.fields[field]['slug'] not in cmv:
                body += (u"%sif (%s is None):\n" %
                         (INDENT * depth,
                          map_data(self.fields[field]['slug'], True)))
                # numeric outputs are written as they are, the rest as
                # python literals
                if self.fields[self.objective_id]['optype'] == 'numeric':
                    value = self.output
                else:
                    value = repr(self.output)
                body += (u"%sreturn %s\n" % (INDENT * (depth + 1), value))
                # the check on None has been written for this field
                cmv.append(self.fields[field]['slug'])

            for child in children:
                field = child.predicate.field
                pre_condition = u""
                if has_missing_branch and child.predicate.value is not None:
                    # the condition is preceded by an explicit test on None:
                    # "is None or" when the predicate accepts missings,
                    # "is not None and" otherwise
                    negation = u"" if child.predicate.missing else u" not"
                    connection = u"or" if child.predicate.missing else u"and"
                    pre_condition = (u"%s is%s None %s " %
                                     (map_data(self.fields[field]['slug'],
                                               True), negation, connection))
                    if not child.predicate.missing:
                        cmv.append(self.fields[field]['slug'])
                optype = self.fields[field]['optype']
                # only the values of the remaining optypes are written
                # through repr
                if (optype == 'numeric' or optype == 'text'
                        or optype == 'items' or child.predicate.value is None):
                    value = child.predicate.value
                else:
                    value = repr(child.predicate.value)
                if optype == 'text' or optype == 'items':
                    # text and items conditions compare the result of the
                    # matching function to the predicate value
                    if optype == 'text':
                        term_analysis_fields.append(
                            (field, child.predicate.term))
                        matching_function = "term_matches"
                    else:
                        item_analysis_fields.append(
                            (field, child.predicate.term))
                        matching_function = "item_matches"

                    body += (
                        u"%sif (%s%s(%s, \"%s\", %s\"%s\") %s %s):"
                        u"\n" %
                        (INDENT * depth, pre_condition, matching_function,
                         map_data(self.fields[field]['slug'],
                                  False), self.fields[field]['slug'],
                         ('u' if isinstance(child.predicate.term, unicode) else
                          ''), child.predicate.term.replace("\"", "\\\""),
                         PYTHON_OPERATOR[child.predicate.operator], value))
                else:
                    # predicates whose value is None use the operators in
                    # MISSING_OPERATOR
                    operator = (MISSING_OPERATOR[child.predicate.operator]
                                if child.predicate.value is None else
                                PYTHON_OPERATOR[child.predicate.operator])
                    if child.predicate.value is None:
                        cmv.append(self.fields[field]['slug'])
                    body += (u"%sif (%s%s %s %s):\n" %
                             (INDENT * depth, pre_condition,
                              map_data(self.fields[field]['slug'],
                                       False), operator, value))
                # the child's code goes one level deeper; it receives a copy
                # of cmv, so what it adds is not seen by its siblings
                next_level = child.python_body(depth + 1,
                                               cmv=cmv[:],
                                               input_map=input_map,
                                               ids_path=ids_path,
                                               subtree=subtree)
                body += next_level[0]
                term_analysis_fields.extend(next_level[1])
                item_analysis_fields.extend(next_level[2])
        else:
            # no children to translate: the node's output is returned
            if self.fields[self.objective_id]['optype'] == 'numeric':
                value = self.output
            else:
                value = repr(self.output)
            body = u"%sreturn %s\n" % (INDENT * depth, value)

        return body, term_analysis_fields, item_analysis_fields

    def python(self,
               out,
               docstring,
               input_map=False,
               ids_path=None,
               subtree=True):
        """Writes to `out` a python function that implements the model.

        The generated function is named `predict_<objective slug>` and has
        `docstring` as its docstring. Its arguments are one keyword per
        field other than the objective or, when `input_map` is true or there
        are more than MAX_ARGS_LENGTH fields, a single `data` dictionary.
        Every field gets its 'slug' attribute updated in the process.

        """
        parameters = sort_fields(self.fields)
        input_map = input_map or len(parameters) > MAX_ARGS_LENGTH
        if input_map:
            # slugs are used as dictionary keys
            reserved_keywords = None
            prefix = ""
        else:
            # slugs are used as argument names
            reserved_keywords = keyword.kwlist
            prefix = "_"

        args = []
        field_ids = [key for key, _ in parameters]
        for field_id in field_ids:
            field = self.fields[field_id]
            slug = slugify(field['name'],
                           reserved_keywords=reserved_keywords,
                           prefix=prefix)
            field.update(slug=slug)
            if not input_map and field_id != self.objective_id:
                args.append("%s=None" % (slug))
        if input_map:
            args.append("data={}")

        signature = (u"def predict_%s" %
                     self.fields[self.objective_id]['slug'])
        # arguments are aligned with the opening parenthesis
        separator = ",\n" + " " * (len(signature) + 1)
        header = u"%s(%s):\n" % (signature, separator.join(args))
        function_doc = (INDENT + u"\"\"\" " + docstring + u"\n" + INDENT +
                        u"\"\"\"\n")
        body, term_predicates, item_predicates = \
            self.python_body(input_map=input_map,
                             ids_path=ids_path,
                             subtree=subtree)
        if term_predicates or item_predicates:
            terms_body = self.term_analysis_body(term_predicates,
                                                 item_predicates)
        else:
            terms_body = u""
        out.write(utf8(header + function_doc + terms_body + body))
        out.flush()

    def term_analysis_body(self, term_analysis_predicates,
                           item_analysis_predicates):
        """ Writes auxiliary functions to handle the term and item
        analysis fields

        """
        body = u""
        # static content
        body += """
    import re

    tm_tokens = '%s'
    tm_full_term = '%s'
    tm_all = '%s'

""" % (TM_TOKENS, TM_FULL_TERM, TM_ALL)
        if term_analysis_predicates:
            body += """
    def term_matches(text, field_name, term):
        \"\"\" Counts the number of occurences of term and its variants in text

        \"\"\"
        if text is None:
            text = ""
        forms_list = term_forms[field_name].get(term, [term])
        options = term_analysis[field_name]
        token_mode = options.get('token_mode', tm_tokens)
        case_sensitive = options.get('case_sensitive', False)
        first_term = forms_list[0]
        if token_mode == tm_full_term:
            return full_term_match(text, first_term, case_sensitive)
        else:
            # In token_mode='all' we will match full terms using equals and
            # tokens using contains
            if token_mode == tm_all and len(forms_list) == 1:
                pattern = re.compile(r'^.+\\b.+$', re.U)
                if re.match(pattern, first_term):
                    return full_term_match(text, first_term, case_sensitive)
            return term_matches_tokens(text, forms_list, case_sensitive)


    def full_term_match(text, full_term, case_sensitive):
        \"\"\"Counts the match for full terms according to the case_sensitive
              option

        \"\"\"
        if not case_sensitive:
            text = text.lower()
            full_term = full_term.lower()
        return 1 if text == full_term else 0

    def get_tokens_flags(case_sensitive):
        \"\"\"Returns flags for regular expression matching depending on text
              analysis options

        \"\"\"
        flags = re.U
        if not case_sensitive:
            flags = (re.I | flags)
        return flags


    def term_matches_tokens(text, forms_list, case_sensitive):
        \"\"\" Counts the number of occurences of the words in forms_list in
               the text

        \"\"\"
        flags = get_tokens_flags(case_sensitive)
        expression = ur'(\\b|_)%s(\\b|_)' % '(\\\\b|_)|(\\\\b|_)'.join(forms_list)
        pattern = re.compile(expression, flags=flags)
        matches = re.findall(pattern, text)
        return len(matches)

"""

            term_analysis_options = set(
                [predicate[0] for predicate in term_analysis_predicates])
            term_analysis_predicates = set(term_analysis_predicates)
            body += """
    term_analysis = {"""
            for field_id in term_analysis_options:
                field = self.fields[field_id]
                body += """
        \"%s\": {""" % field['slug']
                for option in field['term_analysis']:
                    if option in TERM_OPTIONS:
                        body += """
                \"%s\": %s,""" % (option, repr(field['term_analysis'][option]))
                body += """
        },"""
                body += """
    }"""
            term_forms = {}
            fields = self.fields
            for field_id, term in term_analysis_predicates:
                alternatives = []
                field = fields[field_id]
                if field['slug'] not in term_forms:
                    term_forms[field['slug']] = {}
                all_forms = field['summary'].get('term_forms', {})
                if all_forms:
                    alternatives = all_forms.get(term, [])
                    if alternatives:
                        terms = [term]
                        terms.extend(all_forms.get(term, []))
                        term_forms[field['slug']][term] = terms
                body += """
    term_forms = {"""
                for field in term_forms:
                    body += """
        \"%s\": {""" % field
                    for term in term_forms[field]:
                        body += """
            u\"%s\": %s,""" % (term, term_forms[field][term])
                    body += """
        },
                """
                body += """
    }
"""
        if item_analysis_predicates:
            body += """
    def item_matches(text, field_name, item):
        \"\"\" Counts the number of occurences of item in text

        \"\"\"
        if text is None:
            text = ""
        options = item_analysis[field_name]
        separator = options.get('separator', ' ')
        regexp = options.get('separator_regexp')
        if regexp is None:
            regexp = r\"%s\" % separator
        return count_items_matches(text, item, regexp)


    def count_items_matches(text, item, regexp):
        \"\"\" Counts the number of occurences of the item in the text

        \"\"\"
        expression = r'(^|%s)%s($|%s)' % (regexp, item, regexp)
        pattern = re.compile(expression, flags=re.U)
        matches = re.findall(pattern, text)
        return len(matches)
"""

            item_analysis_options = set(
                [predicate[0] for predicate in item_analysis_predicates])
            item_analysis_predicates = set(item_analysis_predicates)
            body += """
    item_analysis = {"""
            for field_id in item_analysis_options:
                field = self.fields[field_id]
                body += """
        \"%s\": {""" % field['slug']
                for option in field['item_analysis']:
                    if option in ITEM_OPTIONS:
                        body += """
                \"%s\": %s,""" % (option, repr(field['item_analysis'][option]))
                body += """
        },"""
            body += """
    }
"""

        return body

    def tableau_body(self,
                     body=u"",
                     conditions=None,
                     cmv=None,
                     ids_path=None,
                     subtree=True):
        """Translate the model into a set of "if" statements in Tableau syntax

        The tree is walked depth-first. Each leaf appends one
        `IF <conditions> THEN <output>` line to `body` (`ELSEIF` once `body`
        already has content), where the conditions are the predicates
        collected on the way down joined with `AND`. When the children of a
        node have no branch that handles missing values, an extra
        `ISNULL([field])` line returning this node's own output is emitted
        first, so a missing value stops the evaluation at this node.

        Arguments:
            body: Tableau text generated so far.
            conditions: list of condition strings that lead to this node.
            cmv: names of the fields whose missing-value case has already
                 been handled on the current path.
            ids_path: forwarded to `filter_nodes` to restrict the children.
            subtree: forwarded to `filter_nodes`.

        Returns the accumulated Tableau text. An empty string is returned
        when a split on a `text` or `items` field is found anywhere below
        this node, as those predicates are not translated.

        """

        if cmv is None:
            cmv = []
        # `conditions` is normalized regardless of `body`, so that a call
        # with a non-empty body and no conditions does not fail later on
        if conditions is None:
            conditions = []
        alternate = u"ELSEIF" if body else u"IF"

        children = filter_nodes(self.children, ids=ids_path, subtree=subtree)
        if children:
            field = split(children)
            has_missing_branch = (missing_branch(children)
                                  or none_value(children))
            # the missing is singled out as a special case only when there's
            # no missing branch in the children list
            if (not has_missing_branch
                    and self.fields[field]['name'] not in cmv):
                conditions.append("ISNULL([%s])" % self.fields[field]['name'])
                body += (u"%s %s THEN " %
                         (alternate, " AND ".join(conditions)))
                if self.fields[self.objective_id]['optype'] == 'numeric':
                    value = self.output
                else:
                    value = tableau_string(self.output)
                body += (u"%s\n" % value)
                cmv.append(self.fields[field]['name'])
                alternate = u"ELSEIF"
                del conditions[-1]

            for child in children:
                pre_condition = u""
                post_condition = u""
                if has_missing_branch and child.predicate.value is not None:
                    # the predicate is wrapped so that the null check is
                    # combined with the comparison itself
                    negation = u"" if child.predicate.missing else u"NOT "
                    connection = u"OR" if child.predicate.missing else u"AND"
                    pre_condition = (
                        u"(%sISNULL([%s]) %s " %
                        (negation, self.fields[field]['name'], connection))
                    if not child.predicate.missing:
                        cmv.append(self.fields[field]['name'])
                    post_condition = u")"
                optype = self.fields[child.predicate.field]['optype']
                if child.predicate.value is None:
                    value = ""
                elif optype == 'text' or optype == 'items':
                    # text and items predicates cannot be expressed: abort
                    return u""
                elif optype == 'numeric':
                    value = child.predicate.value
                else:
                    value = repr(child.predicate.value)

                operator = (u"" if child.predicate.value is None else
                            PYTHON_OPERATOR[child.predicate.operator])
                if child.predicate.value is None:
                    # NOTE(review): T_MISSING_OPERATOR presumably holds the
                    # opening text of the null check that `post_condition`
                    # closes -- confirm against its definition
                    pre_condition = (
                        T_MISSING_OPERATOR[child.predicate.operator])
                    post_condition = u")"

                conditions.append(
                    "%s[%s]%s%s%s" %
                    (pre_condition, self.fields[child.predicate.field]['name'],
                     operator, value, post_condition))
                body = child.tableau_body(body,
                                          conditions[:],
                                          cmv=cmv[:],
                                          ids_path=ids_path,
                                          subtree=subtree)
                del conditions[-1]
                if not body:
                    # a successful call always adds at least one line, so an
                    # empty result means an unsupported split was found
                    # below. Propagate it instead of starting over with the
                    # next sibling and emitting a truncated expression.
                    return u""
        else:
            if self.fields[self.objective_id]['optype'] == 'numeric':
                value = self.output
            else:
                value = tableau_string(self.output)
            body += (u"%s %s THEN" % (alternate, " AND ".join(conditions)))
            body += u" %s\n" % value

        return body

    def tableau(self, out, ids_path=None, subtree=True):
        """Writes the Tableau translation of the model to `out`.

        Returns True when some text was generated and written, and False
        (writing nothing) when `tableau_body` produced an empty result.

        """
        expression = self.tableau_body(ids_path=ids_path, subtree=subtree)
        if expression:
            out.write(utf8(expression))
            out.flush()
            return True
        return False

    def get_nodes_info(self, headers=None, leaves_only=False):
        """Yields the information associated to each of the tree nodes

        """
        row = []
        if not self.regression:
            category_dict = dict(self.distribution)
        for header in headers:
            if header == self.fields[self.objective_id]['name']:
                row.append(self.output)
                continue
            if header in ['confidence', 'error']:
                row.append(self.confidence)
                continue
            if header == 'impurity':
                row.append(self.impurity)
                continue
            if self.regression and header.startswith('bin'):
                for bin_value, bin_instances in self.distribution:
                    row.append(bin_value)
                    row.append(bin_instances)
                break
            if not self.regression:
                row.append(category_dict.get(header))
        while len(row) < len(headers):
            row.append(None)
        if not leaves_only or not self.children:
            yield row

        if self.children:
            for child in self.children:
                for row in child.get_nodes_info(headers,
                                                leaves_only=leaves_only):
                    yield row
Пример #4
0
class BoostedTree(object):
    """A boosted tree-like predictive model.

    """
    def __init__(self, tree, fields, objective_field=None):

        self.fields = fields
        self.objective_id = objective_field
        self.output = tree['output']

        if tree['predicate'] is True:
            self.predicate = True
        else:
            self.predicate = Predicate(
                tree['predicate']['operator'],
                tree['predicate']['field'],
                tree['predicate']['value'],
                tree['predicate'].get('term', None))

        self.id = tree.get('id')
        children = []
        if 'children' in tree:
            for child in tree['children']:
                children.append(BoostedTree(child,
                                            self.fields,
                                            objective_field=objective_field))
        self.children = children
        self.count = tree['count']
        self.g_sum = tree.get('g_sum')
        self.h_sum = tree.get('h_sum')

    def list_fields(self, out):
        """Writes one `[name : optype]` line per field to `out`.

        The fields are written in the order given by `sort_fields` and
        `out` is flushed after each line. Returns the fields structure.

        """
        descriptions = [(info['name'], info['optype'])
                        for _, info in sort_fields(self.fields)]
        for name, optype in descriptions:
            out.write(utf8(u'[%-32s : %s]\n' % (name, optype)))
            out.flush()
        return self.fields

    def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
        """Predicts from the given field values.

        The input fields must be keyed by Id. The `missing_strategy`
        decides what is done when the splitting field has no value:
            0 - LAST_PREDICTION: the prediction of the last node reached
                                 is returned as a `Prediction` object.
            1 - PROPORTIONAL: the work is delegated to
                              `predict_proportional` and its result is
                              returned unchanged.
        The rules of the branches that are followed are appended to
        `path`.
        """

        if path is None:
            path = []
        if missing_strategy == PROPORTIONAL:
            return self.predict_proportional(input_data, path=path)

        # follow the first child whose predicate holds for the input
        for child in (self.children or ()):
            if child.predicate.apply(input_data, self.fields):
                path.append(child.predicate.to_rule(self.fields))
                return child.predict(input_data, path=path)

        # leaf, or no child matched: this node gives the prediction
        return Prediction(
            self.output,
            path,
            None,
            distribution=None,
            count=self.count,
            median=None,
            distribution_unit=None,
            children=self.children,
            d_min=None,
            d_max=None)

    def predict_proportional(self, input_data, path=None,
                             missing_found=False):
        """Makes a prediction based on a number of field values considering all
           the predictions of the leaves that fall in a subtree.

           Each time a splitting field has no value assigned, we consider
           both branches of the split to be true, merging their
           predictions. The function returns the merged distribution and the
           last node reached by a unique path.

        """

        if path is None:
            path = []

        if not self.children:
            return (self.g_sum, self.h_sum, self.count, path)
        if one_branch(self.children, input_data) or \
                self.fields[split(self.children)]["optype"] in \
                ["text", "items"]:
            for child in self.children:
                if child.predicate.apply(input_data, self.fields):
                    new_rule = child.predicate.to_rule(self.fields)
                    if new_rule not in path and not missing_found:
                        path.append(new_rule)
                    return child.predict_proportional(input_data, path,
                                                      missing_found)
        else:
            # missing value found, the unique path stops
            missing_found = True
            g_sums = 0.0
            h_sums = 0.0
            population = 0
            for child in self.children:
                g_sum, h_sum, count, _ = \
                    child.predict_proportional(input_data, path,
                                               missing_found)
                g_sums += g_sum
                h_sums += h_sum
                population += count
            return (g_sums, h_sums, population, path)


    def get_leaves(self, path=None, filter_function=None):
        """Returns a list that includes all the leaves of the tree.

        """
        leaves = []
        if path is None:
            path = []
        if not isinstance(self.predicate, bool):
            path.append(self.predicate.to_lisp_rule(self.fields))

        if self.children:
            for child in self.children:
                leaves += child.get_leaves(path=path[:],
                                           filter_function=filter_function)
        else:
            leaf = {
                'id': self.id,
                'count': self.count,
                'g_sum': self.g_sum,
                'h_sum': self.h_sum,
                'output': self.output,
                'path': path}
            if (not hasattr(filter_function, '__call__')
                    or filter_function(leaf)):
                leaves += [leaf]
        return leaves