def __init__(self, training_set, training_set_header, objective_field,
             multi_label=False, labels=None, label_separator=None,
             training_separator=None, multi_label_fields=None,
             label_aggregates=None, objective=True):
    """Builds a generator from a csv file

       `training_set`: path to the training data file
       `training_set_header`: boolean, True means that headers are first
                              row in the file
       `objective_field`: objective field column or field name
       `labels`: list of expected labels (multi-label fields only)
    """
    self.training_set = training_set
    self.training_set_header = training_set_header
    self.training_set_handler = None
    self.training_reader = None
    self.multi_label = multi_label
    self.objective = objective
    if label_aggregates is None:
        label_aggregates = []
    self.label_aggregates = label_aggregates

    self.training_separator = (training_separator.decode("string_escape")
                               if training_separator is not None
                               else get_csv_delimiter())
    if len(self.training_separator) > 1:
        sys.exit("Only one character can be used as training data"
                 " separator.")
    # opening csv reader
    self.reset()
    self.label_separator = (label_separator.decode("string_escape")
                            if label_separator is not None
                            else get_csv_delimiter())
    first_row = self.next(reset=not training_set_header)
    self.row_length = len(first_row)

    if training_set_header:
        self.headers = [unicode(header, "utf-8") for header in first_row]
    else:
        self.headers = [("field_%s" % index) for index in
                        range(0, self.row_length)]

    self.multi_label_fields = sorted(self._get_columns(multi_label_fields))
    if objective:
        self.objective_column = self._get_columns([objective_field])[0]
        if not self.objective_column in self.multi_label_fields:
            self.multi_label_fields.append(self.objective_column)
    self.labels = labels
    self.fields_labels = self._get_labels()
    if objective:
        if labels is None:
            self.labels = self.fields_labels[self.objective_column]
        self.objective_name = self.headers[self.objective_column]

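
# --- Illustration (not part of the original reader) -----------------------
# A minimal, self-contained sketch of the escape-decoding step applied to
# the separator options above: a separator typed on the command line as the
# two characters backslash + "t" must be unescaped into a real tab before it
# reaches the csv module, and only single-character separators are accepted.
# `codecs.decode(..., "unicode_escape")` stands in here for the Python 2
# "string_escape" codec used in the reader; the helper name is hypothetical.
import codecs


def unescape_separator(separator, default=","):
    """Return a one-character separator, unescaping sequences like backslash-t."""
    if separator is None:
        return default
    separator = codecs.decode(separator, "unicode_escape")
    if len(separator) > 1:
        raise ValueError("Only one character can be used as a separator.")
    return separator

# unescape_separator("\\t") == "\t"; unescape_separator(None) == ","
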
def __init__(self, training_set, training_set_header, objective_field,
             multi_label=False, labels=None, label_separator=None,
             training_separator=None, multi_label_fields=None,
             label_aggregates=None, objective=True):
    """Builds a generator from a csv file

       `training_set`: path to the training data file
       `training_set_header`: boolean, True means that headers are first
                              row in the file
       `objective_field`: objective field column or field name
       `labels`: list of expected labels (multi-label fields only)
    """
    self.training_set = training_set
    if training_set.__class__.__name__ == "StringIO":
        self.encode = None
        self.training_set = UTF8Recoder(training_set, SYSTEM_ENCODING)
    else:
        self.encode = None if PYTHON3 else FILE_ENCODING
    self.training_set_header = training_set_header
    self.training_reader = None
    self.multi_label = multi_label
    self.objective = objective
    if label_aggregates is None:
        label_aggregates = []
    self.label_aggregates = label_aggregates

    self.training_separator = (decode2(training_separator,
                                       encoding="string_escape")
                               if training_separator is not None
                               else get_csv_delimiter())
    if len(self.training_separator) > 1:
        sys.exit("Only one character can be used as training data"
                 " separator.")
    # opening csv reader
    self.reset()
    self.label_separator = (decode2(label_separator,
                                    encoding="string_escape")
                            if label_separator is not None
                            else get_csv_delimiter())
    first_row = self.get_next(reset=not training_set_header)
    self.row_length = len(first_row)

    if training_set_header:
        self.headers = first_row
    else:
        self.headers = [("field_%s" % index) for index in
                        range(0, self.row_length)]

    self.multi_label_fields = sorted(self._get_columns(multi_label_fields))
    if objective:
        self.objective_column = self._get_columns([objective_field])[0]
        if not self.objective_column in self.multi_label_fields:
            self.multi_label_fields.append(self.objective_column)
    self.labels = labels
    self.fields_labels = self._get_labels()
    if objective:
        if labels is None:
            self.labels = self.fields_labels[self.objective_column]
        self.objective_name = self.headers[self.objective_column]

def __init__(self, training_set, training_set_header, objective_field,
             multi_label=False, labels=None, label_separator=None,
             training_separator=None):
    """Builds a generator from a csv file

       `training_set`: path to the training data file
       `training_set_header`: boolean, True means that headers are first
                              row in the file
       `objective_field`: objective field column or field name
       `labels`: list of expected labels (multi-label fields only)
    """
    self.training_set = training_set
    self.training_set_header = training_set_header
    self.training_set_handler = None
    self.training_reader = None
    self.multi_label = multi_label
    self.training_separator = (training_separator.decode("string_escape")
                               if training_separator is not None
                               else get_csv_delimiter())
    if len(self.training_separator) > 1:
        sys.exit("Only one character can be used as training data"
                 " separator.")
    # opening csv reader
    self.reset()
    self.label_separator = (label_separator.decode("string_escape")
                            if label_separator is not None
                            else get_csv_delimiter())
    first_row = self.next(reset=not training_set_header)
    row_length = len(first_row)

    if training_set_header:
        self.headers = [unicode(header, "utf-8") for header in first_row]
    else:
        self.headers = [("field_%s" % index) for index in
                        range(0, row_length)]

    if isinstance(objective_field, int):
        self.objective_column = objective_field
    elif objective_field is None:
        self.objective_column = row_length - 1
    else:
        try:
            self.objective_column = self.headers.index(objective_field)
        except ValueError:
            sys.exit("The %s has been set as objective field but"
                     " it cannot be found in the headers row: \n %s" %
                     (objective_field,
                      ", ".join([header.encode("utf-8")
                                 for header in self.headers])))
    self.labels = labels
    self.labels = self.get_labels()
    self.objective_name = self.headers[self.objective_column]

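
# --- Illustration (not part of the original reader) -----------------------
# A hedged sketch of how the objective field resolves to a column index in
# the branch above: it can be given as a column number, as a header name, or
# omitted, in which case the last column is assumed. The helper name and the
# exact error handling are assumptions for illustration only.
def resolve_objective_column(objective_field, headers):
    """Return the column index of the objective field."""
    if isinstance(objective_field, int):
        return objective_field
    if objective_field is None:
        # default: the objective is the last column
        return len(headers) - 1
    try:
        return headers.index(objective_field)
    except ValueError:
        raise ValueError("%s was set as objective field but cannot be found"
                         " in the headers row: %s" %
                         (objective_field, ", ".join(headers)))

# resolve_objective_column("species", ["petal width", "species"]) == 1
# resolve_objective_column(None, ["petal width", "species"]) == 1
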
def __init__(self, test_set, test_set_header, fields, objective_field):
    """Builds a generator from a csv file and the fields' model structure

    """
    self.test_set = test_set
    self.test_set_header = test_set_header
    self.fields = fields
    self.objective_field = objective_field
    try:
        self.test_reader = csv.reader(open(test_set, "U"),
                                      delimiter=get_csv_delimiter(),
                                      lineterminator="\n")
    except IOError:
        sys.exit("Error: cannot read test %s" % test_set)

    self.headers = None
    self.exclude = []
    if test_set_header:
        self.headers = self.test_reader.next()
        # validate headers against model fields excluding objective_field,
        # that may be present or not
        objective_field = fields.field_column_number(objective_field)
        fields_names = [fields.fields[fields.field_id(i)]['name']
                        for i in
                        sorted(fields.fields_by_column_number.keys())
                        if i != objective_field]
        self.headers = [unicode(header, "utf-8")
                        for header in self.headers]
        self.exclude = [i for i in range(len(self.headers))
                        if not self.headers[i] in fields_names]

        self.exclude.reverse()
        if self.exclude:
            if len(self.headers) > len(self.exclude):
                print (u"WARNING: predictions will be processed but some "
                       u"data might not be used. The used fields will be:"
                       u"\n\n%s"
                       u"\n\nwhile the headers found in the test file are:"
                       u"\n\n%s" %
                       (",".join(fields_names),
                        ",".join(self.headers))).encode("utf-8")
                for index in self.exclude:
                    del self.headers[index]
            else:
                raise Exception((u"No test field matches the model fields."
                                 u"\nThe expected fields are:\n\n%s\n\n"
                                 u"while "
                                 u"the headers found in the test file are:"
                                 u"\n\n%s\n\n"
                                 u"Use --no-test-header flag if first li"
                                 u"ne should not be interpreted as"
                                 u" headers." %
                                 (",".join(fields_names),
                                  ",".join(self.headers))).encode("utf-8"))

class TstReader(object):
    """Retrieves csv info and builds an input data dict

    """
    def __init__(self, test_set, test_set_header, fields, objective_field,
                 test_separator=None):
        """Builds a generator from a csv file and the fields' model structure

           `test_set`: path to the test data file
           `test_set_header`: boolean, True means that headers are first
                              row in the file
           `fields`: Fields object with the expected fields structure.
           `objective_field`: field_id of the objective field
        """
        self.test_set = test_set
        if test_set.__class__.__name__ == "StringIO":
            self.encode = None
            self.test_set = UTF8Recoder(test_set, SYSTEM_ENCODING)
        else:
            self.encode = None if PYTHON3 else FILE_ENCODING
        self.test_set_header = test_set_header
        self.fields = fields
        if (objective_field is not None and
                not objective_field in fields.fields):
            try:
                objective_field = fields.field_id(objective_field)
            except ValueError, exc:
                sys.exit(exc)
        self.objective_field = objective_field
        if test_separator and not PYTHON3:
            test_separator = decode2(test_separator,
                                     encoding="string_escape")
        self.test_separator = (test_separator
                               if test_separator is not None
                               else get_csv_delimiter())
        if len(self.test_separator) > 1:
            sys.exit("Only one character can be used as test data"
                     " separator.")
        try:
            self.test_reader = UnicodeReader(
                self.test_set, delimiter=self.test_separator,
                lineterminator="\n").open_reader()
        except IOError:
            sys.exit("Error: cannot read test %s" % test_set)

        self.headers = None
        self.raw_headers = None
        self.exclude = []
        if test_set_header:
            self.headers = self.test_reader.next()
            # validate headers against model fields excluding
            # objective_field, that may be present or not
            if objective_field is not None:
                objective_field = fields.field_column_number(objective_field)
            try:
                fields_names = [
                    fields.fields[fields.field_id(i)]['name']
                    for i in sorted(fields.fields_by_column_number.keys())
                    if objective_field is None or i != objective_field]
            except ValueError, exc:
                sys.exit(exc)
            self.raw_headers = self.headers[:]

            self.exclude = [i for i in range(len(self.headers))
                            if not self.headers[i] in fields_names]

            self.exclude.reverse()
            if self.exclude:
                if len(self.headers) > len(self.exclude):
                    for index in self.exclude:
                        del self.headers[index]
                else:
                    raise Exception(
                        (u"No test field matches the model fields."
                         u"\nThe expected fields are:\n\n%s\n\n"
                         u"while "
                         u"the headers found in the test file are:"
                         u"\n\n%s\n\n"
                         u"Use --no-test-header flag if first li"
                         u"ne should not be interpreted as"
                         u" headers." %
                         (",".join(fields_names),
                          ",".join(self.headers))).encode("utf-8"))

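
# --- Illustration (not part of the original reader) -----------------------
# A self-contained sketch of the header-validation step used by the test
# readers above (plain lists of strings are assumed here instead of a Fields
# object): columns in the test file that do not match any model field are
# collected and then deleted in reverse order, so that earlier indices stay
# valid while deleting. The helper name is hypothetical.
def excluded_columns(headers, fields_names):
    """Return the indices of unmatched headers, in reverse order."""
    exclude = [index for index in range(len(headers))
               if headers[index] not in fields_names]
    exclude.reverse()
    return exclude


headers = ["sepal length", "notes", "petal width"]
model_fields = ["sepal length", "petal width"]
for index in excluded_columns(headers, model_fields):
    del headers[index]
# headers is now ["sepal length", "petal width"]
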
def predict(test_set, test_set_header, models, fields, output,
            objective_field, remote=False, api=None, log=None,
            max_models=MAX_MODELS, method=0, resume=False,
            tags=None, verbosity=1, session_file=None, debug=False):
    """Computes a prediction for each entry in the `test_set`.

       Predictions can be computed remotely, locally using MultiModels built
       on all the models or locally using MultiModels on subgroups of models.
       Choosing a max_batch_models value not bigger than the number_of_models
       flag will lead to the last case, where memory usage is bounded and
       each model's predictions are saved for further use.
    """
    try:
        test_reader = csv.reader(open(test_set, "U"),
                                 delimiter=get_csv_delimiter(),
                                 lineterminator="\n")
    except IOError:
        sys.exit("Error: cannot read test %s" % test_set)

    headers = None
    exclude = []
    if test_set_header:
        headers = test_reader.next()
        # validate headers against model fields excluding objective_field,
        # that may be present or not
        fields_names = [fields.fields[fields.field_id(i)]['name']
                        for i in
                        sorted(fields.fields_by_column_number.keys())
                        if i != fields.field_column_number(objective_field)]
        headers = [unicode(header, "utf-8") for header in headers]
        exclude = [i for i in range(len(headers))
                   if not headers[i] in fields_names]

        exclude.reverse()
        if len(exclude):
            if (len(headers) - len(exclude)):
                print (u"WARNING: predictions will be processed but some data"
                       u" might not be used. The used fields will be:\n\n%s"
                       u"\n\nwhile the headers found in the test file are:"
                       u"\n\n%s" %
                       (",".join(fields_names),
                        ",".join(headers))).encode("utf-8")
                for index in exclude:
                    del headers[index]
            else:
                raise Exception((u"No test field matches the model fields.\n"
                                 u"The expected fields are:\n\n%s\n\nwhile "
                                 u"the headers found in the test file are:\n\n"
                                 u"%s\n\nUse --no-test-header flag if first li"
                                 u"ne should not be interpreted as headers." %
                                 (",".join(fields_names),
                                  ",".join(headers))).encode("utf-8"))

    prediction_file = output
    output_path = u.check_dir(output)
    output = open(output, 'w', 0)
    number_of_tests = None
    if resume:
        number_of_tests = u.file_number_of_lines(test_set)
        if test_set_header:
            number_of_tests -= 1
    # Remote predictions: predictions are computed in bigml.com and stored
    # in a file named after the model in the following syntax:
    #     model_[id of the model]__predictions.csv
    # For instance,
    #     model_50c0de043b563519830001c2_predictions.csv
    if remote:
        remote_predict(models, headers, output_path, number_of_tests,
                       resume, verbosity, test_reader, exclude, fields,
                       api, prediction_file, method, tags, objective_field,
                       session_file, test_set_header, log, debug)
    # Local predictions: Predictions are computed locally using models' rules
    # with MultiModel's predict method
    else:
        message = u.dated("Creating local predictions.\n")
        u.log_message(message, log_file=session_file, console=verbosity)
        # For a small number of models, we build a MultiModel using all of
        # the given models and issue a combined prediction
        if len(models) < max_models:
            local_predict(models, headers, test_reader, exclude, fields,
                          method, objective_field, output, test_set_header)
        # For large numbers of models, we split the list of models in chunks
        # and build a MultiModel for each chunk, issue and store predictions
        # for each model and combine all of them eventually.
        else:
            local_batch_predict(models, headers, test_reader, exclude,
                                fields, resume, output_path, max_models,
                                number_of_tests, api, output, verbosity,
                                method, objective_field, session_file, debug)
    output.close()

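
# --- Illustration (not part of the original module) -----------------------
# A hedged sketch of the bounded-memory case described in the docstring
# above: when more models than `max_models` are given, the list is handled
# in chunks so that only `max_models` models need to be held at a time. The
# helper name below is an assumption; the actual chunking happens inside
# local_batch_predict.
def model_chunks(models, max_models):
    """Yield successive slices of `models`, each at most `max_models` long."""
    for start in range(0, len(models), max_models):
        yield models[start:start + max_models]

# list(model_chunks(["m1", "m2", "m3", "m4", "m5"], 2))
# -> [["m1", "m2"], ["m3", "m4"], ["m5"]]
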
def __init__(self, test_set, test_set_header, fields, objective_field,
             test_separator=None):
    """Builds a generator from a csv file and the fields' model structure

       `test_set`: path to the test data file
       `test_set_header`: boolean, True means that headers are first
                          row in the file
       `fields`: Fields object with the expected fields structure.
       `objective_field`: field_id of the objective field
    """
    self.test_set = test_set
    if test_set.__class__.__name__ == "StringIO":
        self.encode = "utf-8"
        self.test_set_handler = UTF8Recoder(test_set, self.encode)
    else:
        self.encode = None
        self.test_set_handler = open(test_set, "U")
    self.test_set_header = test_set_header
    self.fields = fields
    if not objective_field in fields.fields:
        objective_field = fields.field_id(objective_field)
    self.objective_field = objective_field
    self.test_separator = (test_separator.decode("string_escape")
                           if test_separator is not None
                           else get_csv_delimiter())
    if len(self.test_separator) > 1:
        sys.exit("Only one character can be used as test data separator.")
    try:
        self.test_reader = csv.reader(self.test_set_handler,
                                      delimiter=self.test_separator,
                                      lineterminator="\n")
    except IOError:
        sys.exit("Error: cannot read test %s" % test_set)

    self.headers = None
    self.raw_headers = None
    self.exclude = []
    if test_set_header:
        self.headers = self.test_reader.next()
        self.raw_headers = self.headers
        # validate headers against model fields excluding objective_field,
        # that may be present or not
        objective_field = fields.field_column_number(objective_field)
        fields_names = [fields.fields[fields.field_id(i)]['name']
                        for i in
                        sorted(fields.fields_by_column_number.keys())
                        if i != objective_field]
        self.headers = [unicode(header, "utf-8")
                        for header in self.headers]
        self.exclude = [i for i in range(len(self.headers))
                        if not self.headers[i] in fields_names]

        self.exclude.reverse()
        if self.exclude:
            if len(self.headers) > len(self.exclude):
                print (u"WARNING: predictions will be processed but some "
                       u"data might not be used. The used fields will be:"
                       u"\n\n%s"
                       u"\n\nwhile the headers found in the test file are:"
                       u"\n\n%s" %
                       (",".join(fields_names),
                        ",".join(self.headers))).encode("utf-8")
                for index in self.exclude:
                    del self.headers[index]
            else:
                raise Exception((u"No test field matches the model fields."
                                 u"\nThe expected fields are:\n\n%s\n\n"
                                 u"while "
                                 u"the headers found in the test file are:"
                                 u"\n\n%s\n\n"
                                 u"Use --no-test-header flag if first li"
                                 u"ne should not be interpreted as"
                                 u" headers." %
                                 (",".join(fields_names),
                                  ",".join(self.headers))).encode("utf-8"))

def __init__(self, training_set, training_set_header, objective_field,
             multi_label=False, labels=None, label_separator=None,
             training_separator=None, multi_label_fields=None,
             label_aggregates=None, objective=True):
    """Builds a generator from a csv file

       `training_set`: path to the training data file
       `training_set_header`: boolean, True means that headers are first
                              row in the file
       `objective_field`: objective field column or field name
       `labels`: list of expected labels (multi-label fields only)
    """
    self.training_set = training_set
    if training_set.__class__.__name__ == "StringIO":
        self.encode = None
        self.training_set = UTF8Recoder(training_set, BIGML_SYS_ENCODING)
    else:
        self.encode = None if PYTHON3 else FILE_ENCODING
    self.training_set_header = training_set_header
    self.training_reader = None
    self.multi_label = multi_label
    self.objective = objective
    if label_aggregates is None:
        label_aggregates = []
    self.label_aggregates = label_aggregates

    self.training_separator = (decode2(
        training_separator, encoding="string_escape")
        if training_separator is not None else get_csv_delimiter())
    if len(self.training_separator) > 1:
        sys.exit("Only one character can be used as training data"
                 " separator.")
    # opening csv reader
    self.reset()
    self.label_separator = (decode2(
        label_separator, encoding="string_escape")
        if label_separator is not None else get_csv_delimiter())
    first_row = self.get_next(reset=not training_set_header)
    self.row_length = len(first_row)

    if training_set_header:
        self.headers = first_row
    else:
        self.headers = [("field_%s" % index) for index in
                        range(0, self.row_length)]

    self.multi_label_fields = sorted(self._get_columns(multi_label_fields))
    if objective:
        self.objective_column = self._get_columns([objective_field])[0]
        if not self.objective_column in self.multi_label_fields:
            self.multi_label_fields.append(self.objective_column)
    self.labels = labels
    self.fields_labels = self._get_labels()
    if objective:
        if labels is None:
            self.labels = self.fields_labels[self.objective_column]
        self.objective_name = self.headers[self.objective_column]

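
# --- Illustration (not part of the original reader) -----------------------
# A hedged sketch of how the `label_separator` above could be used to
# collect the distinct labels of a multi-label column. The helper name and
# the exact collection strategy are assumptions for illustration only; the
# reader performs this work in its _get_labels method.
def distinct_labels(values, label_separator=","):
    """Return the sorted set of labels found in a multi-label column."""
    labels = set()
    for value in values:
        labels.update(label.strip()
                      for label in value.split(label_separator))
    return sorted(labels)

# distinct_labels(["cat:dog", "dog", "bird:cat"], ":")
# -> ["bird", "cat", "dog"]
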