Exemplo n.º 1
0
    def tree_rules(tree,
                   offsets,
                   objective_id,
                   fields,
                   out,
                   ids_path=None,
                   subtree=True):
        """Writes an IF-THEN rule version of the tree to ``out``.

        Every field first gets a ``slug`` entry (slugified field name)
        so the rule generator can reference fields by slug.
        """
        for field_id, _ in sort_fields(fields):
            fields[field_id]['slug'] = slugify(fields[field_id]['name'])
        rules = generate_rules(tree,
                               offsets,
                               objective_id,
                               fields,
                               ids_path=ids_path,
                               subtree=subtree)
        out.write(utf8(rules))
        out.flush()
Exemplo n.º 2
0
def dot(name):
    """Return a slugified, dot-separated version of ``name``.

    Blanks in ``name`` become dots before slugifying.
    """
    dotted = ".".join(name.split(" "))
    return slugify(dotted)
Exemplo n.º 3
0
def dot(name):
    """Create a dot-separated name.

    Spaces are replaced by dots, then the result is slugified.
    """
    without_blanks = name.replace(" ", ".")
    return slugify(without_blanks)
Exemplo n.º 4
0
    def python(self, out, docstring, ids_path=None, subtree=True):
        """Generates a python function that implements the model.

        Writes to ``out`` a standalone ``predict_<objective>`` function
        plus a generic ``predict`` wrapper that delegates to it.
        ``docstring`` becomes the generated function's docstring;
        ``ids_path`` and ``subtree`` are forwarded to the tree body
        generator to restrict the plugged-in subtree.
        """

        args = []
        args_tree = []
        parameters = sort_fields(self.fields)
        # With too many fields, a single `data` dict argument is generated
        # instead of one keyword argument per field.
        input_map = len(parameters) > MAX_ARGS_LENGTH and MAX_ARGS_LENGTH > 0
        reserved_keywords = PYTHON_KEYWORDS if not input_map else None
        prefix = "_" if not input_map else ""
        for field in [(key, val) for key, val in parameters]:
            field_name_to_show = self.fields[field[0]]['name'].strip()
            if field_name_to_show == "":
                field_name_to_show = field[0]
            slug = slugify(field_name_to_show,
                           reserved_keywords=reserved_keywords,
                           prefix=prefix)
            self.fields[field[0]].update(slug=slug)
            if not input_map:
                if field[0] != self.objective_id:
                    args.append("%s=None" % (slug))
                    args_tree.append("%s=%s" % (slug, slug))
        if input_map:
            args.append("data={}")
            args_tree.append("data=data")

        function_name = self.fields[self.objective_id]['slug'] if \
            not self.boosting else \
            self.fields[self.boosting["objective_field"]]['slug']
        # Fix: use startswith instead of indexing so an empty slug cannot
        # raise IndexError; the empty-name fallback below then applies.
        if prefix == "_" and function_name.startswith(prefix):
            function_name = function_name[1:]
        if function_name == "":
            function_name = "field_" + self.objective_id
        python_header = u"#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n"
        predictor_definition = (u"def predict_%s" % function_name)
        depth = len(predictor_definition) + 1
        predictor = u"%s(%s):\n" % (predictor_definition,
                                    (",\n" + " " * depth).join(args))
        predictor_doc = (INDENT + u"\"\"\" " + docstring + u"\n" + INDENT +
                         u"\"\"\"\n")
        body, term_analysis_predicates, item_analysis_predicates = \
            self.tree.plug_in_body(input_map=input_map,
                                   ids_path=ids_path,
                                   subtree=subtree)
        terms_body = ""
        if term_analysis_predicates or item_analysis_predicates:
            terms_body = self.term_analysis_body(term_analysis_predicates,
                                                 item_analysis_predicates)
        predictor = python_header + predictor + \
            predictor_doc + terms_body + body

        # Generic `predict` entry point delegating to the generated function.
        predictor_model = u"def predict"
        depth = len(predictor_model) + 1
        predictor += u"\n\n%s(%s):\n" % (predictor_model,
                                         (",\n" + " " * depth).join(args))
        predictor += u"%sprediction = predict_%s(%s)\n" % ( \
            INDENT, function_name, ", ".join(args_tree))

        if self.boosting is not None:
            # Boosted trees also carry their weight and (for classification)
            # the class they vote for.
            predictor += u"%sprediction.update({\"weight\": %s})\n" % \
                (INDENT, self.boosting.get("weight"))
            if self.boosting.get("objective_class") is not None:
                predictor += u"%sprediction.update({\"class\": \"%s\"})\n" % \
                    (INDENT, self.boosting.get("objective_class"))
        predictor += u"%sreturn prediction" % INDENT

        if not PY3:
            predictor = predictor.encode("utf8")
        out.write(predictor)
        out.flush()
Exemplo n.º 5
0
    def hadoop_python_mapper(self,
                             out=sys.stdout,
                             ids_path=None,
                             subtree=True):
        """Generates a hadoop mapper header to make predictions in python

        """
        input_fields = [(value, key) for (
            key,
            value) in sorted(self.inverted_fields.items(), key=lambda x: x[1])]
        parameters = [
            value for (key, value) in input_fields
            if key != self.tree.objective_id
        ]
        args = []
        for field in input_fields:
            slug = slugify(self.tree.fields[field[0]]['name'])
            self.tree.fields[field[0]].update(slug=slug)
            if field[0] != self.tree.objective_id:
                args.append("\"" + self.tree.fields[field[0]]['slug'] + "\"")

        with open(HADOOP_CSV_TEMPLATE) as template_hander:
            output = template_handler.read() % u",".join(parameters)

        output += u"\n%sself.INPUT_FIELDS = [%s]\n" % \
            ((INDENT * 3), (",\n " + INDENT * 8).join(args))

        input_types = []
        prefixes = []
        suffixes = []
        count = 0
        fields = self.tree.fields
        for key in [
                field[0] for field in input_fields
                if field[0] != self.tree.objective_id
        ]:
            input_type = ('None' if not fields[key]['datatype'] in PYTHON_CONV
                          else PYTHON_CONV[fields[key]['datatype']])
            input_types.append(input_type)
            if 'prefix' in fields[key]:
                prefixes.append("%s: %s" %
                                (count, repr(fields[key]['prefix'])))
            if 'suffix' in fields[key]:
                suffixes.append("%s: %s" %
                                (count, repr(fields[key]['suffix'])))
            count += 1
        static_content = "%sself.INPUT_TYPES = [" % (INDENT * 3)
        formatter = ",\n%s" % (" " * len(static_content))
        output += u"\n%s%s%s" % (static_content, formatter.join(input_types),
                                 "]\n")
        static_content = "%sself.PREFIXES = {" % (INDENT * 3)
        formatter = ",\n%s" % (" " * len(static_content))
        output += u"\n%s%s%s" % (static_content, formatter.join(prefixes),
                                 "}\n")
        static_content = "%sself.SUFFIXES = {" % (INDENT * 3)
        formatter = ",\n%s" % (" " * len(static_content))
        output += u"\n%s%s%s" % (static_content, formatter.join(suffixes),
                                 "}\n")

        with open(HADOOP_NEXT_TEMPLATE) as template_hander:
            output += template_handler.read()

        out.write(output)
        out.flush()

        self.tree.python(out,
                         self.docstring(),
                         input_map=True,
                         ids_path=ids_path,
                         subtree=subtree)

        output = \
u"""
csv = CSVInput()
for values in csv:
    if not isinstance(values, bool):
        print u'%%s\\t%%s' %% (repr(values), repr(predict_%s(values)))
\n\n
""" % fields[self.tree.objective_id]['slug']
        out.write(output)
        out.flush()
Exemplo n.º 6
0
    def python(self, out, docstring, ids_path=None, subtree=True):
        """Generates a python function that implements the model.

        Writes to ``out`` a standalone ``predict_<objective>`` function
        plus a generic ``predict`` wrapper that delegates to it.
        ``docstring`` becomes the generated function's docstring;
        ``ids_path`` and ``subtree`` are forwarded to the tree body
        generator to restrict the plugged-in subtree.
        """

        args = []
        args_tree = []
        parameters = sort_fields(self.fields)
        # With too many fields, a single `data` dict argument is generated
        # instead of one keyword argument per field.
        input_map = len(parameters) > MAX_ARGS_LENGTH and MAX_ARGS_LENGTH > 0
        reserved_keywords = PYTHON_KEYWORDS if not input_map else None
        prefix = "_" if not input_map else ""
        for field in [(key, val) for key, val in parameters]:
            field_name_to_show = self.fields[field[0]]['name'].strip()
            if field_name_to_show == "":
                field_name_to_show = field[0]
            slug = slugify(field_name_to_show,
                           reserved_keywords=reserved_keywords, prefix=prefix)
            self.fields[field[0]].update(slug=slug)
            if not input_map:
                if field[0] != self.objective_id:
                    args.append("%s=None" % (slug))
                    args_tree.append("%s=%s" % (slug, slug))
        if input_map:
            args.append("data={}")
            args_tree.append("data=data")

        function_name = self.fields[self.objective_id]['slug'] if \
            not self.boosting else \
            self.fields[self.boosting["objective_field"]]['slug']
        # Fix: use startswith instead of indexing so an empty slug cannot
        # raise IndexError; the empty-name fallback below then applies.
        if prefix == "_" and function_name.startswith(prefix):
            function_name = function_name[1:]
        if function_name == "":
            function_name = "field_" + self.objective_id
        python_header = u"#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n"
        predictor_definition = (u"def predict_%s" %
                                function_name)
        depth = len(predictor_definition) + 1
        predictor = u"%s(%s):\n" % (predictor_definition,
                                   (",\n" + " " * depth).join(args))
        predictor_doc = (INDENT + u"\"\"\" " + docstring +
                         u"\n" + INDENT + u"\"\"\"\n")
        body, term_analysis_predicates, item_analysis_predicates = \
            self.tree.plug_in_body(input_map=input_map,
                                   ids_path=ids_path,
                                   subtree=subtree)
        terms_body = ""
        if term_analysis_predicates or item_analysis_predicates:
            terms_body = self.term_analysis_body(term_analysis_predicates,
                                                 item_analysis_predicates)
        predictor = python_header + predictor + \
            predictor_doc + terms_body + body

        # Generic `predict` entry point delegating to the generated function.
        predictor_model = u"def predict"
        depth = len(predictor_model) + 1
        predictor += u"\n\n%s(%s):\n" % (predictor_model,
                                         (",\n" + " " * depth).join(args))
        predictor += u"%sprediction = predict_%s(%s)\n" % ( \
            INDENT, function_name, ", ".join(args_tree))

        if self.boosting is not None:
            # Boosted trees also carry their weight and (for classification)
            # the class they vote for.
            predictor += u"%sprediction.update({\"weight\": %s})\n" % \
                (INDENT, self.boosting.get("weight"))
            if self.boosting.get("objective_class") is not None:
                predictor += u"%sprediction.update({\"class\": \"%s\"})\n" % \
                    (INDENT, self.boosting.get("objective_class"))
        predictor += u"%sreturn prediction" % INDENT

        if not PY3:
            predictor = predictor.encode("utf8")
        out.write(predictor)
        out.flush()
Exemplo n.º 7
0
    def hadoop_python_mapper(self, out=sys.stdout, ids_path=None,
                             subtree=True):
        """Generates a hadoop mapper header to make predictions in python

        """
        input_fields = [(value, key) for (key, value) in
                        sorted(self.inverted_fields.items(),
                               key=lambda x: x[1])]
        parameters = [value for (key, value) in
                      input_fields if key != self.tree.objective_id]
        args = []
        for field in input_fields:
            slug = slugify(self.tree.fields[field[0]]['name'])
            self.tree.fields[field[0]].update(slug=slug)
            if field[0] != self.tree.objective_id:
                args.append("\"" + self.tree.fields[field[0]]['slug'] + "\"")

        with open(HADOOP_CSV_TEMPLATE) as template_hander:
            output = template_handler.read() % u",".join(parameters)

        output += u"\n%sself.INPUT_FIELDS = [%s]\n" % \
            ((INDENT * 3), (",\n " + INDENT * 8).join(args))

        input_types = []
        prefixes = []
        suffixes = []
        count = 0
        fields = self.tree.fields
        for key in [field[0] for field in input_fields
                    if field[0] != self.tree.objective_id]:
            input_type = ('None' if not fields[key]['datatype'] in
                          PYTHON_CONV
                          else PYTHON_CONV[fields[key]['datatype']])
            input_types.append(input_type)
            if 'prefix' in fields[key]:
                prefixes.append("%s: %s" % (count,
                                            repr(fields[key]['prefix'])))
            if 'suffix' in fields[key]:
                suffixes.append("%s: %s" % (count,
                                            repr(fields[key]['suffix'])))
            count += 1
        static_content = "%sself.INPUT_TYPES = [" % (INDENT * 3)
        formatter = ",\n%s" % (" " * len(static_content))
        output += u"\n%s%s%s" % (static_content,
                                 formatter.join(input_types),
                                 "]\n")
        static_content = "%sself.PREFIXES = {" % (INDENT * 3)
        formatter = ",\n%s" % (" " * len(static_content))
        output += u"\n%s%s%s" % (static_content,
                                 formatter.join(prefixes),
                                 "}\n")
        static_content = "%sself.SUFFIXES = {" % (INDENT * 3)
        formatter = ",\n%s" % (" " * len(static_content))
        output += u"\n%s%s%s" % (static_content,
                                 formatter.join(suffixes),
                                 "}\n")

        with open(HADOOP_NEXT_TEMPLATE) as template_hander:
            output += template_handler.read()

        out.write(output)
        out.flush()

        self.tree.python(out, self.docstring(),
                         input_map=True,
                         ids_path=ids_path,
                         subtree=subtree)

        output = \
u"""
csv = CSVInput()
for values in csv:
    if not isinstance(values, bool):
        print u'%%s\\t%%s' %% (repr(values), repr(predict_%s(values)))
\n\n
""" % fields[self.tree.objective_id]['slug']
        out.write(output)
        out.flush()
Exemplo n.º 8
0
def tree_python(tree,
                offsets,
                fields,
                objective_id,
                boosting,
                out,
                docstring_str,
                input_map=False,
                ids_path=None,
                subtree=True):
    """Writes a python function that implements the model.

    Generates a standalone ``predict_<objective>`` function plus a
    generic ``predict`` wrapper and writes both to ``out``.
    ``docstring_str`` becomes the generated function's docstring;
    ``ids_path`` and ``subtree`` restrict the plugged-in subtree.
    """
    args = []
    args_tree = []
    parameters = sort_fields(fields)
    if not input_map:
        # With too many fields, fall back to a single `data` dict argument.
        input_map = len(parameters) > MAX_ARGS_LENGTH
    reserved_keywords = keyword.kwlist if not input_map else None
    prefix = "_" if not input_map else ""
    for field in parameters:
        field_name_to_show = fields[field[0]]['name'].strip()
        if field_name_to_show == "":
            field_name_to_show = field[0]
        slug = slugify(field_name_to_show,
                       reserved_keywords=reserved_keywords,
                       prefix=prefix)
        fields[field[0]].update(slug=slug)
        if not input_map:
            if field[0] != objective_id:
                args.append("%s=None" % (slug))
                args_tree.append("%s=%s" % (slug, slug))
    if input_map:
        args.append("data={}")
        args_tree.append("data=data")

    function_name = fields[objective_id]['slug'] if \
        not boosting else fields[boosting["objective_field"]]['slug']
    # Fix: use startswith instead of indexing so an empty slug cannot
    # raise IndexError; the empty-name fallback below then applies.
    if prefix == "_" and function_name.startswith(prefix):
        function_name = function_name[1:]
    if function_name == "":
        function_name = "field_" + objective_id
    python_header = "# -*- coding: utf-8 -*-\n"
    predictor_definition = ("def predict_%s" % function_name)
    depth = len(predictor_definition) + 1
    predictor = "%s(%s):\n" % (predictor_definition,
                               (",\n" + " " * depth).join(args))

    predictor_doc = (INDENT + "\"\"\" " + docstring_str + "\n" + INDENT +
                     "\"\"\"\n")
    body_fn = boosted_plug_in_body if boosting else plug_in_body
    body, term_analysis_predicates, item_analysis_predicates = \
        body_fn(tree, offsets, fields, objective_id,
                fields[objective_id]["optype"] == NUMERIC,
                input_map=input_map,
                ids_path=ids_path, subtree=subtree)
    terms_body = ""
    if term_analysis_predicates or item_analysis_predicates:
        terms_body = term_analysis_body(fields, term_analysis_predicates,
                                        item_analysis_predicates)
    predictor = python_header + predictor + \
        predictor_doc + terms_body + body

    # Generic `predict` entry point delegating to the generated function.
    predictor_model = "def predict"
    depth = len(predictor_model) + 1
    predictor += "\n\n%s(%s):\n" % (predictor_model,
                                    (",\n" + " " * depth).join(args))
    predictor += "%sprediction = predict_%s(%s)\n" % ( \
        INDENT, function_name, ", ".join(args_tree))

    if boosting is not None:
        # Boosted trees also carry their weight and (for classification)
        # the class they vote for.
        predictor += "%sprediction.update({\"weight\": %s})\n" % \
            (INDENT, boosting.get("weight"))
        if boosting.get("objective_class") is not None:
            predictor += "%sprediction.update({\"class\": \"%s\"})\n" % \
                (INDENT, boosting.get("objective_class"))
    predictor += "%sreturn prediction" % INDENT

    out.write(utf8(predictor))
    out.flush()