def number_of_tests(self):
    """Return the number of test instances in the test file.

    Counts the lines of ``self.test_set`` and discounts the header
    row when ``self.test_set_header`` is set.
    """
    line_count = u.file_number_of_lines(self.test_set)
    return line_count - 1 if self.test_set_header else line_count
def predict(test_set, test_set_header, models, fields, output,
            objective_field, remote=False, api=None, log=None,
            max_models=MAX_MODELS, method=0, resume=False,
            tags=None, verbosity=1, session_file=None, debug=False):
    """Computes a prediction for each entry in the `test_set`.

       Predictions can be computed remotely, locally using MultiModels built
       on all the models or locally using MultiModels on subgroups of models.
       Chosing a max_batch_models value not bigger than the number_of_models
       flag will lead to the last case, where memory usage is bounded and each
       model predictions are saved for further use.
    """
    # Open the test CSV for reading. "U" is Python 2 universal-newline mode.
    # NOTE(review): the file handle passed to csv.reader is never closed
    # explicitly; it stays open until garbage collection.
    try:
        test_reader = csv.reader(open(test_set, "U"),
                                 delimiter=get_csv_delimiter(),
                                 lineterminator="\n")
    except IOError:
        sys.exit("Error: cannot read test %s" % test_set)
    headers = None
    # Column indices of test-file headers that match no model field; those
    # columns are skipped when building predictions.
    exclude = []
    if test_set_header:
        # Python 2 iterator protocol: .next() consumes the header row.
        headers = test_reader.next()
        # validate headers against model fields excluding objective_field,
        # that may be present or not
        fields_names = [fields.fields[fields.field_id(i)]
                        ['name'] for i in
                        sorted(fields.fields_by_column_number.keys())
                        if i != fields.field_column_number(objective_field)]
        # Decode raw header bytes so they compare equal to the (unicode)
        # model field names. `unicode` is Python 2 only.
        headers = [unicode(header, "utf-8") for header in headers]
        exclude = [i for i in range(len(headers))
                   if not headers[i] in fields_names]
        # Reverse so later deletions by index don't shift earlier indices.
        exclude.reverse()
        if len(exclude):
            if (len(headers) - len(exclude)):
                # Some headers match: warn about the ignored columns and
                # drop them from the header list.
                print (u"WARNING: predictions will be processed but some data"
                       u" might not be used. The used fields will be:\n\n%s"
                       u"\n\nwhile the headers found in the test file are:"
                       u"\n\n%s" %
                       (",".join(fields_names),
                        ",".join(headers))).encode("utf-8")
                for index in exclude:
                    del headers[index]
            else:
                # No header matches any model field: abort.
                raise Exception((u"No test field matches the model fields.\n"
                                 u"The expected fields are:\n\n%s\n\nwhile "
                                 u"the headers found in the test file are:\n\n"
                                 u"%s\n\nUse --no-test-header flag if first li"
                                 u"ne should not be interpreted as headers."
                                 % (",".join(fields_names),
                                    ",".join(headers))).encode("utf-8"))
    # Keep the output path before `output` is rebound to a file object below.
    prediction_file = output
    output_path = u.check_dir(output)
    # Python 2 three-argument open: buffering=0 means unbuffered writes, so
    # partial predictions survive a crash (useful for --resume).
    output = open(output, 'w', 0)
    number_of_tests = None
    if resume:
        # Total expected predictions, used to detect how far a previous run
        # got; minus one if the first line is a header.
        number_of_tests = u.file_number_of_lines(test_set)
        if test_set_header:
            number_of_tests -= 1
    # Remote predictions: predictions are computed in bigml.com and stored
    # in a file named after the model in the following syntax:
    #     model_[id of the model]__predictions.csv
    # For instance,
    #     model_50c0de043b563519830001c2_predictions.csv
    if remote:
        remote_predict(models, headers, output_path, number_of_tests, resume,
                       verbosity, test_reader, exclude, fields, api,
                       prediction_file, method, tags, objective_field,
                       session_file, test_set_header, log, debug)
    # Local predictions: Predictions are computed locally using models' rules
    # with MultiModel's predict method
    else:
        message = u.dated("Creating local predictions.\n")
        u.log_message(message, log_file=session_file, console=verbosity)
        # For a small number of models, we build a MultiModel using all of
        # the given models and issue a combined prediction
        if len(models) < max_models:
            local_predict(models, headers, test_reader, exclude, fields,
                          method, objective_field, output, test_set_header)
        # For large numbers of models, we split the list of models in chunks
        # and build a MultiModel for each chunk, issue and store predictions
        # for each model and combine all of them eventually.
        else:
            local_batch_predict(models, headers, test_reader, exclude, fields,
                                resume, output_path, max_models,
                                number_of_tests, api, output, verbosity,
                                method, objective_field, session_file, debug)
    output.close()