示例#1
0
    def __init__(self,
                 training_set,
                 training_set_header,
                 objective_field,
                 multi_label=False,
                 labels=None,
                 label_separator=None,
                 training_separator=None,
                 multi_label_fields=None,
                 label_aggregates=None,
                 objective=True):
        """Builds a generator from a csv file

           `training_set`: path to the training data file
           `training_set_header`: boolean, True means that headers are first
                                 row in the file
           `objective_field`: objective field column or field name
           `multi_label`: flag stored in `self.multi_label`
           `labels`: labels for the objective field. When None, the labels
                     that `_get_labels` returns for the objective column
                     are used instead
           `label_separator`: escaped string stored as the label separator
                              (defaults to the csv delimiter)
           `training_separator`: escaped string used as training data
                                 delimiter, it must be one character long
                                 (defaults to the csv delimiter)
           `multi_label_fields`: fields that are resolved to columns
                                 through `_get_columns`
           `label_aggregates`: stored in `self.label_aggregates` (an empty
                               list when None)
           `objective`: boolean, when False the objective column, labels
                        and objective name attributes are not set

           Exits the program when the training separator is longer than
           one character.
        """
        self.training_set = training_set
        self.training_set_header = training_set_header
        self.training_set_handler = None
        self.training_reader = None
        self.multi_label = multi_label
        self.objective = objective
        # a fresh list per instance avoids sharing a mutable default
        self.label_aggregates = ([] if label_aggregates is None
                                 else label_aggregates)

        if training_separator is None:
            self.training_separator = get_csv_delimiter()
        else:
            self.training_separator = training_separator.decode(
                "string_escape")
        if len(self.training_separator) > 1:
            sys.exit("Only one character can be used as training data"
                     " separator.")
        # opening csv reader
        self.reset()
        if label_separator is None:
            self.label_separator = get_csv_delimiter()
        else:
            self.label_separator = label_separator.decode("string_escape")

        # when there is no headers row, the reader is reset after peeking at
        # the first row (it is only used to know the number of columns)
        first_row = self.next(reset=not training_set_header)
        self.row_length = len(first_row)

        if training_set_header:
            self.headers = [unicode(header, "utf-8") for header in first_row]
        else:
            self.headers = [("field_%s" % index)
                            for index in range(0, self.row_length)]

        self.multi_label_fields = sorted(self._get_columns(multi_label_fields))
        if objective:
            self.objective_column = self._get_columns([objective_field])[0]
            if self.objective_column not in self.multi_label_fields:
                self.multi_label_fields.append(self.objective_column)
            # set before calling `_get_labels`, as in the original order
            self.labels = labels
        self.fields_labels = self._get_labels()
        if objective:
            if labels is None:
                self.labels = self.fields_labels[self.objective_column]
            self.objective_name = self.headers[self.objective_column]
示例#2
0
    def __init__(self, training_set, training_set_header, objective_field,
                 multi_label=False, labels=None, label_separator=None,
                 training_separator=None, multi_label_fields=None,
                 label_aggregates=None, objective=True):
        """Builds a generator from a csv file

           `training_set`: path to the training data file. An object whose
                           class is named StringIO is also accepted and
                           gets wrapped in a `UTF8Recoder`
           `training_set_header`: boolean, True means that headers are first
                                 row in the file
           `objective_field`: objective field column or field name
           `multi_label`: flag stored in `self.multi_label`
           `labels`: labels for the objective field. When None, the labels
                     that `_get_labels` returns for the objective column
                     are used instead
           `label_separator`: escaped string stored as the label separator
                              (defaults to the csv delimiter)
           `training_separator`: escaped string used as training data
                                 delimiter, it must be one character long
                                 (defaults to the csv delimiter)
           `multi_label_fields`: fields that are resolved to columns
                                 through `_get_columns`
           `label_aggregates`: stored in `self.label_aggregates` (an empty
                               list when None)
           `objective`: boolean, when False the objective column, labels
                        and objective name attributes are not set

           Exits the program when the training separator is longer than
           one character.
        """
        self.training_set = training_set

        if training_set.__class__.__name__ == "StringIO":
            self.encode = None
            self.training_set = UTF8Recoder(training_set, SYSTEM_ENCODING)
        else:
            self.encode = None if PYTHON3 else FILE_ENCODING
        self.training_set_header = training_set_header
        self.training_reader = None
        self.multi_label = multi_label
        self.objective = objective
        # a fresh list per instance avoids sharing a mutable default
        self.label_aggregates = ([] if label_aggregates is None
                                 else label_aggregates)

        if training_separator is None:
            self.training_separator = get_csv_delimiter()
        else:
            self.training_separator = decode2(training_separator,
                                              encoding="string_escape")
        if len(self.training_separator) > 1:
            sys.exit("Only one character can be used as training data"
                     " separator.")
        # opening csv reader
        self.reset()
        if label_separator is None:
            self.label_separator = get_csv_delimiter()
        else:
            self.label_separator = decode2(label_separator,
                                           encoding="string_escape")

        # when there is no headers row, the reader is reset after peeking at
        # the first row (it is only used to know the number of columns)
        first_row = self.get_next(reset=not training_set_header)
        self.row_length = len(first_row)

        if training_set_header:
            self.headers = first_row
        else:
            self.headers = [("field_%s" % index) for index in
                            range(0, self.row_length)]

        self.multi_label_fields = sorted(self._get_columns(multi_label_fields))
        if objective:
            self.objective_column = self._get_columns([objective_field])[0]
            if self.objective_column not in self.multi_label_fields:
                self.multi_label_fields.append(self.objective_column)
            # set before calling `_get_labels`, as in the original order
            self.labels = labels
        self.fields_labels = self._get_labels()
        if objective:
            if labels is None:
                self.labels = self.fields_labels[self.objective_column]
            self.objective_name = self.headers[self.objective_column]
示例#3
0
    def __init__(self, training_set, training_set_header, objective_field,
                 multi_label=False, labels=None, label_separator=None,
                 training_separator=None):
        """Builds a generator from a csv file

           `training_set`: path to the training data file
           `training_set_header`: boolean, True means that headers are first
                                 row in the file
           `objective_field`: objective field column (int) or field name.
                              When None, the last column is used
           `multi_label`: flag stored in `self.multi_label`
           `labels`: initial value for `self.labels`, which is then replaced
                     by the result of `get_labels`
           `label_separator`: escaped string stored as the label separator
                              (defaults to the csv delimiter)
           `training_separator`: escaped string used as training data
                                 delimiter, it must be one character long
                                 (defaults to the csv delimiter)

           Exits the program when the training separator is longer than
           one character or when the objective field name is not found in
           the headers.
        """
        self.training_set = training_set
        self.training_set_header = training_set_header
        self.training_set_handler = None
        self.training_reader = None
        self.multi_label = multi_label

        if training_separator is None:
            self.training_separator = get_csv_delimiter()
        else:
            self.training_separator = training_separator.decode(
                "string_escape")
        if len(self.training_separator) > 1:
            sys.exit("Only one character can be used as training data"
                     " separator.")
        # opening csv reader
        self.reset()
        if label_separator is None:
            self.label_separator = get_csv_delimiter()
        else:
            self.label_separator = label_separator.decode("string_escape")

        # when there is no headers row, the reader is reset after peeking at
        # the first row (it is only used to know the number of columns)
        first_row = self.next(reset=not training_set_header)
        row_length = len(first_row)

        if training_set_header:
            self.headers = [unicode(header, "utf-8") for header in first_row]
        else:
            self.headers = [("field_%s" % index) for index in
                            range(0, row_length)]

        if isinstance(objective_field, int):
            self.objective_column = objective_field
        elif objective_field is None:
            # no objective given: the last column is used
            self.objective_column = row_length - 1
        else:
            try:
                self.objective_column = self.headers.index(objective_field)
            except ValueError:
                sys.exit("The %s has been set as objective field but"
                         " it cannot be found in the headers row: \n %s" %
                         (objective_field,
                          ", ".join([header.encode("utf-8")
                                     for header in self.headers])))

        # NOTE(review): `get_labels` presumably reads `self.labels`, so the
        # user-given value is stored before calling it -- confirm
        self.labels = labels
        self.labels = self.get_labels()
        self.objective_name = self.headers[self.objective_column]
示例#4
0
    def __init__(self, test_set, test_set_header, fields, objective_field):
        """Opens a csv reader on the test file and validates its headers

           `test_set`: path to the test data file
           `test_set_header`: boolean, True means that the first row in the
                              file is read as headers
           `fields`: object holding the model fields structure
           `objective_field`: objective field, resolved to a column number
                              through `fields.field_column_number`

           When headers are read, `self.exclude` holds (in descending
           order) the positions of the headers that are not model field
           names, and those headers are removed from `self.headers`. An
           exception is raised if no header matches.
        """
        self.test_set = test_set
        self.test_set_header = test_set_header
        self.fields = fields
        self.objective_field = objective_field
        try:
            test_handler = open(test_set, "U")
            self.test_reader = csv.reader(test_handler,
                                          delimiter=get_csv_delimiter(),
                                          lineterminator="\n")
        except IOError:
            sys.exit("Error: cannot read test %s" % test_set)

        self.headers = None
        self.exclude = []
        if not test_set_header:
            return

        self.headers = self.test_reader.next()
        # validate headers against model fields excluding objective_field,
        # that may be present or not
        objective_column = fields.field_column_number(objective_field)
        fields_names = []
        for column in sorted(fields.fields_by_column_number.keys()):
            if column != objective_column:
                fields_names.append(
                    fields.fields[fields.field_id(column)]['name'])
        self.headers = [unicode(header, "utf-8") for header in self.headers]
        # positions are kept in descending order, so deleting them one by
        # one does not shift the ones still to be deleted
        self.exclude = [
            position for position in reversed(range(len(self.headers)))
            if self.headers[position] not in fields_names
        ]
        if not self.exclude:
            return

        if len(self.exclude) < len(self.headers):
            warning = (u"WARNING: predictions will be processed but some "
                       u"data might not be used. The used fields will be:"
                       u"\n\n%s"
                       u"\n\nwhile the headers found in the test file are:"
                       u"\n\n%s" % (",".join(fields_names),
                                    ",".join(self.headers)))
            print(warning.encode("utf-8"))
            for position in self.exclude:
                del self.headers[position]
            return

        error = (u"No test field matches the model fields."
                 u"\nThe expected fields are:\n\n%s\n\n"
                 u"while "
                 u"the headers found in the test file are:"
                 u"\n\n%s\n\n"
                 u"Use --no-test-header flag if first li"
                 u"ne should not be interpreted as"
                 u" headers." % (",".join(fields_names),
                                 ",".join(self.headers)))
        raise Exception(error.encode("utf-8"))
示例#5
0
    def __init__(self, test_set, test_set_header, fields, objective_field):
        """Creates the csv reader for the test data and checks the headers

           `test_set`: path to the test data file
           `test_set_header`: boolean, True means that the first row in the
                              file is read as headers
           `fields`: object holding the model fields structure
           `objective_field`: objective field, resolved to a column number
                              through `fields.field_column_number`

           Headers that are not names of model fields (objective excluded)
           are dropped from `self.headers` and their positions are stored,
           in descending order, in `self.exclude`. If none of the headers
           matches, an exception is raised.
        """
        self.test_set = test_set
        self.test_set_header = test_set_header
        self.fields = fields
        self.objective_field = objective_field
        try:
            self.test_reader = csv.reader(
                open(test_set, "U"),
                delimiter=get_csv_delimiter(),
                lineterminator="\n")
        except IOError:
            sys.exit("Error: cannot read test %s" % test_set)

        self.headers = None
        self.exclude = []
        if test_set_header:
            self.headers = self.test_reader.next()
            # validate headers against model fields excluding
            # objective_field, that may be present or not
            objective_field = fields.field_column_number(objective_field)
            model_columns = sorted(fields.fields_by_column_number.keys())
            fields_names = [fields.fields[fields.field_id(column)]['name']
                            for column in model_columns
                            if column != objective_field]
            decoded_headers = []
            for header in self.headers:
                decoded_headers.append(unicode(header, "utf-8"))
            self.headers = decoded_headers

            unknown = [index for index, header in enumerate(self.headers)
                       if header not in fields_names]
            # descending order: deletions do not shift pending positions
            self.exclude = unknown[::-1]

            if self.exclude:
                expected = ",".join(fields_names)
                found = ",".join(self.headers)
                if len(self.headers) <= len(self.exclude):
                    message = (u"No test field matches the model fields."
                               u"\nThe expected fields are:\n\n%s\n\n"
                               u"while "
                               u"the headers found in the test file are:"
                               u"\n\n%s\n\n"
                               u"Use --no-test-header flag if first li"
                               u"ne should not be interpreted as"
                               u" headers." % (expected, found))
                    raise Exception(message.encode("utf-8"))
                message = (u"WARNING: predictions will be processed but some "
                           u"data might not be used. The used fields will be:"
                           u"\n\n%s"
                           u"\n\nwhile the headers found in the test file are:"
                           u"\n\n%s" % (expected, found))
                print(message.encode("utf-8"))
                for index in self.exclude:
                    del self.headers[index]
示例#6
0
class TstReader(object):
    """Retrieves csv info and builds a input data dict

    """
    def __init__(self,
                 test_set,
                 test_set_header,
                 fields,
                 objective_field,
                 test_separator=None):
        """Builds a generator from a csv file and the fields' model structure

           `test_set`: path to the test data file. An object whose class is
                       named StringIO is also accepted and gets wrapped in
                       a `UTF8Recoder`
           `test_set_header`: boolean, True means that headers are first row
                              in the file
           `fields`: Fields object with the expected fields structure.
           `objective_field`: field_id of the objective field. Any other
                              value is resolved through `fields.field_id`.
                              It can be None
           `test_separator`: one-character string used as delimiter in the
                             test data (defaults to the csv delimiter)

           Exits the program when the objective field cannot be resolved,
           when the separator is longer than one character or when the
           test data cannot be read. Raises an exception when none of the
           headers matches a model field name.
        """
        self.test_set = test_set
        if test_set.__class__.__name__ == "StringIO":
            self.encode = None
            self.test_set = UTF8Recoder(test_set, SYSTEM_ENCODING)
        else:
            self.encode = None if PYTHON3 else FILE_ENCODING
        self.test_set_header = test_set_header
        self.fields = fields
        if (objective_field is not None
                and objective_field not in fields.fields):
            try:
                objective_field = fields.field_id(objective_field)
            # `except ... as ...` is valid from Python 2.6 on and is the
            # only form that Python 3 accepts
            except ValueError as exc:
                sys.exit(exc)
        self.objective_field = objective_field
        if test_separator and not PYTHON3:
            test_separator = decode2(test_separator, encoding="string_escape")
        self.test_separator = (test_separator if test_separator is not None
                               else get_csv_delimiter())
        if len(self.test_separator) > 1:
            sys.exit("Only one character can be used as test data separator.")
        try:
            self.test_reader = UnicodeReader(
                self.test_set,
                delimiter=self.test_separator,
                lineterminator="\n").open_reader()
        except IOError:
            sys.exit("Error: cannot read test %s" % test_set)

        self.headers = None
        self.raw_headers = None
        self.exclude = []
        if test_set_header:
            self.headers = self.test_reader.next()
            # validate headers against model fields excluding objective_field,
            # that may be present or not
            if objective_field is not None:
                objective_field = fields.field_column_number(objective_field)
            try:
                fields_names = [
                    fields.fields[fields.field_id(i)]['name']
                    for i in sorted(fields.fields_by_column_number.keys())
                    if objective_field is None or i != objective_field
                ]
            except ValueError as exc:
                sys.exit(exc)
            # copy of the headers as read, taken before dropping any of them
            self.raw_headers = self.headers[:]

            self.exclude = [
                i for i in range(len(self.headers))
                if self.headers[i] not in fields_names
            ]

            # positions are deleted in descending order, so that removing
            # one does not shift the ones still to be removed
            self.exclude.reverse()
            if self.exclude:
                if len(self.headers) > len(self.exclude):
                    for index in self.exclude:
                        del self.headers[index]
                else:
                    raise Exception(
                        (u"No test field matches the model fields."
                         u"\nThe expected fields are:\n\n%s\n\n"
                         u"while "
                         u"the headers found in the test file are:"
                         u"\n\n%s\n\n"
                         u"Use --no-test-header flag if first li"
                         u"ne should not be interpreted as"
                         u" headers." % (",".join(fields_names), ",".join(
                             self.headers))).encode("utf-8"))
示例#7
0
def predict(test_set, test_set_header, models, fields, output,
            objective_field, remote=False, api=None, log=None,
            max_models=MAX_MODELS, method=0, resume=False,
            tags=None, verbosity=1, session_file=None, debug=False):
    """Computes a prediction for each entry in the `test_set`.

       Predictions can be computed remotely, locally using MultiModels built
       on all the models or locally using MultiModels on subgroups of models.
       Choosing a max_batch_models value not bigger than the number_of_models
       flag will lead to the last case, where memory usage is bounded and each
       model predictions are saved for further use.

       `test_set`: path to the test data file
       `test_set_header`: boolean, True means that the first row in the
                          file is read as headers and checked against the
                          names of the model fields
       `output`: path to the predictions file, that is opened for writing
       `remote`: boolean, True delegates to `remote_predict`. Otherwise
                 `local_predict` is used when there are less than
                 `max_models` models and `local_batch_predict` if not
       `resume`: boolean, True causes the number of test rows to be
                 computed and handed to the prediction functions

       The rest of arguments are handed over unchanged to the prediction
       functions. Exits the program when the test file cannot be opened
       and raises an exception when no header matches a model field name.
       Both the test file and the output file are closed on exit, also
       when an error happens.
    """

    try:
        test_file = open(test_set, "U")
        test_reader = csv.reader(test_file,
                                 delimiter=get_csv_delimiter(),
                                 lineterminator="\n")
    except IOError:
        sys.exit("Error: cannot read test %s" % test_set)

    headers = None
    exclude = []
    output_file = None
    try:
        if test_set_header:
            headers = next(test_reader)
            # validate headers against model fields excluding
            # objective_field, that may be present or not
            objective_column = fields.field_column_number(objective_field)
            fields_names = [fields.fields[fields.field_id(i)]['name']
                            for i in
                            sorted(fields.fields_by_column_number.keys())
                            if i != objective_column]
            headers = [unicode(header, "utf-8") for header in headers]
            exclude = [i for i, header in enumerate(headers)
                       if header not in fields_names]
            # positions are deleted in descending order, so that removing
            # one does not shift the ones still to be removed
            exclude.reverse()
            if exclude:
                if len(headers) > len(exclude):
                    warning = (u"WARNING: predictions will be processed but"
                               u" some data"
                               u" might not be used. The used fields will"
                               u" be:\n\n%s"
                               u"\n\nwhile the headers found in the test"
                               u" file are:"
                               u"\n\n%s" %
                               (",".join(fields_names),
                                ",".join(headers)))
                    print(warning.encode("utf-8"))
                    for index in exclude:
                        del headers[index]
                else:
                    raise Exception(
                        (u"No test field matches the model fields.\n"
                         u"The expected fields are:\n\n%s\n\nwhile "
                         u"the headers found in the test file are:\n\n"
                         u"%s\n\nUse --no-test-header flag if first li"
                         u"ne should not be interpreted as headers." %
                         (",".join(fields_names),
                          ",".join(headers))).encode("utf-8"))

        prediction_file = output
        output_path = u.check_dir(output)
        output_file = open(output, 'w', 0)
        number_of_tests = None
        if resume:
            number_of_tests = u.file_number_of_lines(test_set)
            if test_set_header:
                number_of_tests -= 1
        # Remote predictions: predictions are computed in bigml.com and
        # stored in a file named after the model in the following syntax:
        #     model_[id of the model]__predictions.csv
        # For instance,
        #     model_50c0de043b563519830001c2_predictions.csv
        if remote:
            remote_predict(models, headers, output_path, number_of_tests,
                           resume, verbosity, test_reader, exclude, fields,
                           api, prediction_file, method, tags,
                           objective_field, session_file, test_set_header,
                           log, debug)
        # Local predictions: Predictions are computed locally using models'
        # rules with MultiModel's predict method
        else:
            message = u.dated("Creating local predictions.\n")
            u.log_message(message, log_file=session_file, console=verbosity)
            # For a small number of models, we build a MultiModel using all
            # of the given models and issue a combined prediction
            if len(models) < max_models:
                local_predict(models, headers, test_reader, exclude, fields,
                              method, objective_field, output_file,
                              test_set_header)
            # For large numbers of models, we split the list of models in
            # chunks and build a MultiModel for each chunk, issue and store
            # predictions for each model and combine all of them eventually.
            else:
                local_batch_predict(models, headers, test_reader, exclude,
                                    fields, resume, output_path, max_models,
                                    number_of_tests, api, output_file,
                                    verbosity, method, objective_field,
                                    session_file, debug)
    finally:
        # release both files even if the prediction fails half-way
        if output_file is not None:
            output_file.close()
        test_file.close()
示例#8
0
    def __init__(self, test_set, test_set_header, fields, objective_field,
                 test_separator=None):
        """Builds a generator from a csv file and the fields' model structure

           `test_set`: path to the test data file. An object whose class is
                       named StringIO is also accepted and gets wrapped in
                       a `UTF8Recoder`
           `test_set_header`: boolean, True means that headers are first row
                              in the file
           `fields`: Fields object with the expected fields structure.
           `objective_field`: field_id of the objective field. Any other
                              value is resolved through `fields.field_id`
           `test_separator`: escaped string used as delimiter in the test
                             data, it must be one character long (defaults
                             to the csv delimiter)

           Exits the program when the test data cannot be read or when the
           separator is longer than one character. Raises an exception
           when none of the headers matches a model field name.
        """
        self.test_set = test_set
        # the file is opened inside the `try`, so that a missing or
        # unreadable file ends in the friendly exit message
        try:
            if test_set.__class__.__name__ == "StringIO":
                self.encode = "utf-8"
                self.test_set_handler = UTF8Recoder(test_set, self.encode)
            else:
                self.encode = None
                self.test_set_handler = open(test_set, "U")
        except IOError:
            sys.exit("Error: cannot read test %s" % test_set)
        self.test_set_header = test_set_header
        self.fields = fields
        if objective_field not in fields.fields:
            objective_field = fields.field_id(objective_field)
        self.objective_field = objective_field
        self.test_separator = (test_separator.decode("string_escape")
                               if test_separator is not None
                               else get_csv_delimiter())
        if len(self.test_separator) > 1:
            sys.exit("Only one character can be used as test data separator.")
        try:
            self.test_reader = csv.reader(self.test_set_handler,
                                          delimiter=self.test_separator,
                                          lineterminator="\n")
        except IOError:
            sys.exit("Error: cannot read test %s" % test_set)

        self.headers = None
        self.raw_headers = None
        self.exclude = []
        if test_set_header:
            self.headers = next(self.test_reader)
            # headers as read from the file: `self.headers` is rebound to a
            # new list below, so this one is not affected by the deletions
            self.raw_headers = self.headers
            # validate headers against model fields excluding objective_field,
            # that may be present or not
            objective_field = fields.field_column_number(objective_field)
            fields_names = [fields.fields[fields.field_id(i)]
                            ['name'] for i in
                            sorted(fields.fields_by_column_number.keys())
                            if i != objective_field]
            self.headers = [unicode(header, "utf-8")
                            for header in self.headers]
            self.exclude = [i for i, header in enumerate(self.headers)
                            if header not in fields_names]

            # positions are deleted in descending order, so that removing
            # one does not shift the ones still to be removed
            self.exclude.reverse()
            if self.exclude:
                if len(self.headers) > len(self.exclude):
                    warning = (u"WARNING: predictions will be processed but"
                               u" some "
                               u"data might not be used. The used fields"
                               u" will be:"
                               u"\n\n%s"
                               u"\n\nwhile the headers found in the test"
                               u" file are:"
                               u"\n\n%s" %
                               (",".join(fields_names),
                                ",".join(self.headers)))
                    print(warning.encode("utf-8"))
                    for index in self.exclude:
                        del self.headers[index]
                else:
                    raise Exception((u"No test field matches the model fields."
                                     u"\nThe expected fields are:\n\n%s\n\n"
                                     u"while "
                                     u"the headers found in the test file are:"
                                     u"\n\n%s\n\n"
                                     u"Use --no-test-header flag if first li"
                                     u"ne should not be interpreted as"
                                     u" headers." %
                                     (",".join(fields_names),
                                      ",".join(self.headers))).encode("utf-8"))
示例#9
0
    def __init__(self,
                 training_set,
                 training_set_header,
                 objective_field,
                 multi_label=False,
                 labels=None,
                 label_separator=None,
                 training_separator=None,
                 multi_label_fields=None,
                 label_aggregates=None,
                 objective=True):
        """Builds a generator from a csv file

           `training_set`: path to the training data file. An object whose
                           class is named StringIO is also accepted and
                           gets wrapped in a `UTF8Recoder`
           `training_set_header`: boolean, True means that headers are first
                                 row in the file
           `objective_field`: objective field column or field name
           `multi_label`: flag stored in `self.multi_label`
           `labels`: labels for the objective field. When None, the labels
                     that `_get_labels` returns for the objective column
                     are used instead
           `label_separator`: escaped string stored as the label separator
                              (defaults to the csv delimiter)
           `training_separator`: escaped string used as training data
                                 delimiter, it must be one character long
                                 (defaults to the csv delimiter)
           `multi_label_fields`: fields that are resolved to columns
                                 through `_get_columns`
           `label_aggregates`: stored in `self.label_aggregates` (an empty
                               list when None)
           `objective`: boolean, when False the objective column, labels
                        and objective name attributes are not set

           Exits the program when the training separator is longer than
           one character.
        """
        self.training_set = training_set

        if training_set.__class__.__name__ == "StringIO":
            self.encode = None
            self.training_set = UTF8Recoder(training_set, BIGML_SYS_ENCODING)
        else:
            self.encode = None if PYTHON3 else FILE_ENCODING
        self.training_set_header = training_set_header
        self.training_reader = None
        self.multi_label = multi_label
        self.objective = objective
        # a fresh list per instance avoids sharing a mutable default
        self.label_aggregates = ([] if label_aggregates is None
                                 else label_aggregates)

        if training_separator is None:
            self.training_separator = get_csv_delimiter()
        else:
            self.training_separator = decode2(training_separator,
                                              encoding="string_escape")
        if len(self.training_separator) > 1:
            sys.exit("Only one character can be used as training data"
                     " separator.")
        # opening csv reader
        self.reset()
        if label_separator is None:
            self.label_separator = get_csv_delimiter()
        else:
            self.label_separator = decode2(label_separator,
                                           encoding="string_escape")

        # when there is no headers row, the reader is reset after peeking at
        # the first row (it is only used to know the number of columns)
        first_row = self.get_next(reset=not training_set_header)
        self.row_length = len(first_row)

        if training_set_header:
            self.headers = first_row
        else:
            self.headers = [("field_%s" % index)
                            for index in range(0, self.row_length)]

        self.multi_label_fields = sorted(self._get_columns(multi_label_fields))
        if objective:
            self.objective_column = self._get_columns([objective_field])[0]
            if self.objective_column not in self.multi_label_fields:
                self.multi_label_fields.append(self.objective_column)
            # set before calling `_get_labels`, as in the original order
            self.labels = labels
        self.fields_labels = self._get_labels()
        if objective:
            if labels is None:
                self.labels = self.fields_labels[self.objective_column]
            self.objective_name = self.headers[self.objective_column]