        if args.has_test_datasets_:
            test_dataset = get_test_dataset(args)
        if args.dataset_off and not args.has_test_datasets_:
            args.test_dataset_ids = datasets
        if args.test_dataset_ids and args.dataset_off:
            eval_ensembles = len(ensemble_ids) == len(args.test_dataset_ids)
            models_or_ensembles = ensemble_ids if eval_ensembles else models
            # Evaluate the models with the corresponding test datasets.
            resume = evaluate(
                models_or_ensembles, args.test_dataset_ids, api, args,
                resume, fields=fields, dataset_fields=dataset_fields,
                session_file=session_file, path=path, log=log,
                labels=labels, all_labels=all_labels,
                objective_field=args.objective_field,
            )
        else:
            if args.multi_label and args.test_set is not None:
                # When evaluation starts from existing models, the
                # multi_label_fields can be retrieved from the user_metadata
                # in the models
                if args.multi_label_fields is None and multi_label_fields:
                    args.multi_label_fields = multi_label_fields
                test_set = ps.multi_label_expansion(
    combine_votes(votes_files, local_model.to_prediction,
                  output, args.method)

    # If evaluate flag is on, create remote evaluation and save results in
    # json and human-readable format.
    if args.evaluate:
        # When we resume evaluation and models were already completed, we
        # should use the datasets array as test datasets
        if args.dataset_off and not args.test_dataset_ids:
            args.test_dataset_ids = datasets
        if args.test_dataset_ids:
            # Evaluate the models with the corresponding test datasets.
            resume = evaluate(models, args.test_dataset_ids, output, api,
                              args, resume, name=name,
                              description=description, fields=fields,
                              dataset_fields=dataset_fields,
                              fields_map=fields_map,
                              session_file=session_file, path=path, log=log,
                              labels=labels, all_labels=all_labels,
                              objective_field=objective_field)
        else:
            if args.multi_label and args.test_set is not None:
                # When evaluation starts from existing models, the
                # multi_label_fields can be retrieved from the user_metadata
                # in the models
                if args.multi_label_fields is None and multi_label_fields:
                    args.multi_label_fields = multi_label_fields
                test_set = ps.multi_label_expansion(
                    test_set, test_set_header, objective_field, args, path,
                    labels=labels, session_file=session_file)[0]
                test_set_header = True
def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

    """

    logistic_regression = None
    logistic_regressions = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    logistic_regression_ids = args.logistic_regression_ids_
    output = args.predictions
    # there's only one logistic regression to be generated at present
    args.max_parallel_logistic_regressions = 1
    # logistic regressions cannot be published yet.
    args.public_logistic_regression = False

    # It is compulsory to have a description to publish either datasets or
    # logistic regressions
    if (not args.description_ and (args.public_logistic_regression or
                                   args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(api, args, resume, source,
                                              csv_properties, fields,
                                              session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties
    if datasets:
        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)

    if args.logistic_file:
        # logistic regression is retrieved from the contents of the given
        # local JSON file
        logistic_regression, csv_properties, fields = u.read_local_resource(
            args.logistic_file,
            csv_properties=csv_properties)
        logistic_regressions = [logistic_regression]
        logistic_regression_ids = [logistic_regression['resource']]
    else:
        # logistic regression is retrieved from the remote object
        logistic_regressions, logistic_regression_ids, resume = \
            plr.logistic_regressions_processing(
                datasets, logistic_regressions, logistic_regression_ids,
                api, args, resume, fields=fields,
                session_file=session_file, path=path, log=log)
        if logistic_regressions:
            logistic_regression = logistic_regressions[0]

    # We update the logistic regression's public state if needed
    if logistic_regression:
        if isinstance(logistic_regression, basestring):
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            elif args.export_fields:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = ''
            logistic_regression = u.check_resource(
                logistic_regression, api.get_logistic_regression,
                query_string=query_string)
            logistic_regressions[0] = logistic_regression
        if (args.public_logistic_regression or
                (args.shared_flag and
                 r.shared_changed(args.shared, logistic_regression))):
            logistic_regression_args = {}
            if args.shared_flag and r.shared_changed(args.shared,
                                                     logistic_regression):
                logistic_regression_args.update(shared=args.shared)
            if args.public_logistic_regression:
                logistic_regression_args.update(
                    r.set_publish_logistic_regression_args(args))
            if logistic_regression_args:
                logistic_regression = r.update_logistic_regression(
                    logistic_regression, logistic_regression_args, args,
                    api=api, path=path, session_file=session_file)
                logistic_regressions[0] = logistic_regression

    # We get the fields of the logistic_regression if we haven't got
    # them yet and need them
    if logistic_regression and (args.test_set or args.export_fields):
        fields = plr.get_logistic_fields(
            logistic_regression, csv_properties, args)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    # If predicting
    if logistic_regressions and (a.has_test(args) or
                                 (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_prediction_args = r.set_batch_prediction_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_lr_prediction(logistic_regression, test_dataset,
                                 batch_prediction_args, args,
                                 api, resume, prediction_file=output,
                                 session_file=session_file, path=path,
                                 log=log)
        else:
            lr_prediction(logistic_regressions, fields, args,
                          session_file=session_file)

    # If evaluate flag is on, create remote evaluation and save results in
    # json and human-readable format.
    if args.evaluate:
        # When we resume evaluation and models were already completed, we
        # should use the datasets array as test datasets
        if args.has_test_datasets_:
            test_dataset = get_test_dataset(args)
        if args.dataset_off and not args.has_test_datasets_:
            args.test_dataset_ids = datasets
        if args.test_dataset_ids and args.dataset_off:
            # Evaluate the models with the corresponding test datasets.
            test_dataset_id = bigml.api.get_dataset_id(
                args.test_dataset_ids[0])
            test_dataset = api.check_resource(test_dataset_id)
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            resume = evaluate(logistic_regressions, args.test_dataset_ids,
                              api, args, resume,
                              fields=fields, dataset_fields=test_fields,
                              session_file=session_file, path=path, log=log,
                              objective_field=args.objective_field)
        else:
            dataset = datasets[0]
            if args.test_split > 0 or args.has_test_datasets_:
                dataset = test_dataset
            dataset = u.check_resource(dataset, api=api,
                                       query_string=r.ALL_FIELDS_QS)
            dataset_fields = pd.get_fields_structure(dataset, None)
            resume = evaluate(logistic_regressions, [dataset], api,
                              args, resume,
                              fields=fields, dataset_fields=dataset_fields,
                              session_file=session_file, path=path, log=log,
                              objective_field=args.objective_field)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
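# --- Illustrative sketch (not BigMLer code) ---------------------------------
# The --evaluate branch above interleaves dataset selection with the calls
# that run the evaluations. As a minimal sketch, assuming an `args` namespace
# with the same attributes used above, the selection rule on its own looks
# like this; the helper name is hypothetical.
def pick_evaluation_datasets(args, datasets, test_dataset):
    """Return the list of datasets that the evaluations should run on."""
    if args.dataset_off and not args.has_test_datasets_:
        # resumed evaluations reuse the training datasets as test datasets
        args.test_dataset_ids = datasets
    if args.test_dataset_ids and args.dataset_off:
        # one evaluation per explicit (or reused) test dataset id
        return args.test_dataset_ids
    if args.test_split > 0 or args.has_test_datasets_:
        # a held-out split or a single explicit test dataset
        return [test_dataset]
    # default: evaluate against the first training dataset
    return [datasets[0]]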
                    labels=labels, session_file=session_file)[0]
                test_set_header = True
            if args.test_split > 0:
                dataset = test_dataset
            dataset_fields = pd.get_fields_structure(dataset, None)
            models_or_ensembles = (ensemble_ids if ensemble_ids != []
                                   else models)
            resume = evaluate(models_or_ensembles, [dataset], output, api,
                              args, resume, name=name,
                              description=description, fields=fields,
                              dataset_fields=dataset_fields,
                              fields_map=fields_map,
                              session_file=session_file, path=path, log=log,
                              labels=labels, all_labels=all_labels,
                              objective_field=objective_field)

    # If cross_validation_rate is > 0, create remote evaluations and save
    # results in json and human-readable format. Then average the results to
    # issue a cross_validation measure set.
    if args.cross_validation_rate > 0:
        args.sample_rate = 1 - args.cross_validation_rate
        cross_validate(models, dataset,
def compute_output(api, args):
    """ Creates a fusion using the `models` list or uses the ids
        of a previously created BigML fusion to make predictions for the
        `test_set`.

    """

    fusion = None

    # variables from command-line options
    resume = args.resume_
    fusion_ids = args.fusion_ids_
    output = args.predictions
    # there's only one fusion to be generated at present
    args.max_parallel_fusions = 1
    # fusion cannot be published yet.
    args.public_fusion = False

    # It is compulsory to have a description to publish either datasets or
    # fusions
    if (not args.description_ and args.public_fusion):
        sys.exit("You should provide a description to publish.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    if args.fusion_file:
        # fusion is retrieved from the contents of the given local
        # JSON file
        fusion, csv_properties, fields = u.read_local_resource(
            args.fusion_file,
            csv_properties=csv_properties)
        fusion_ids = [fusion]
    else:
        # fusion is retrieved from the remote object or created
        fusion, resume = \
            pf.fusion_processing(
                fusion, fusion_ids,
                api, args, resume,
                session_file=session_file, path=path, log=log)

    # We update the fusion public state if needed
    if fusion:
        if isinstance(fusion, basestring):
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            elif args.export_fields:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = ''
            fusion = u.check_resource(fusion, api.get_fusion,
                                      query_string=query_string)
        if (args.public_fusion or
                (args.shared_flag and
                 r.shared_changed(args.shared, fusion))):
            fusion_args = {}
            if args.shared_flag and r.shared_changed(args.shared, fusion):
                fusion_args.update(shared=args.shared)
            if args.public_fusion:
                fusion_args.update(
                    r.set_publish_fusion_args(args))
            if fusion_args:
                fusion = r.update_fusion(
                    fusion, fusion_args, args,
                    api=api, path=path, session_file=session_file)

    # We get the fields of the fusion if we haven't got
    # them yet and need them
    if fusion and (args.test_set or args.evaluate):
        fields = pf.get_fusion_fields(
            fusion, csv_properties, args)

    # If predicting
    if fusion and (a.has_test(args) or args.remote):
        test_dataset = get_test_dataset(args)

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            if not args.evaluate:
                batch_prediction_args = r.set_batch_prediction_args(
                    args, fields=fields,
                    dataset_fields=test_fields)

                remote_prediction(fusion, test_dataset,
                                  batch_prediction_args, args,
                                  api, resume, prediction_file=output,
                                  session_file=session_file, path=path,
                                  log=log)
        else:
            prediction([fusion], fields, args,
                       session_file=session_file)

    # If evaluate flag is on, create remote evaluation and save results in
    # json and human-readable format.
    if args.evaluate:
        # When we resume evaluation and models were already completed, we
        # should use the datasets array as test datasets
        args.max_parallel_evaluations = 1  # only one evaluation at present
        args.cross_validation_rate = 0  # no cross-validation
        args.number_of_evaluations = 1  # only one evaluation
        if args.has_test_datasets_:
            test_dataset = get_test_dataset(args)
        dataset = test_dataset
        dataset = u.check_resource(dataset, api=api,
                                   query_string=r.ALL_FIELDS_QS)
        dataset_fields = pd.get_fields_structure(dataset, None)
        resume = evaluate([fusion], [dataset], api, args, resume,
                          fields=fields, dataset_fields=dataset_fields,
                          session_file=session_file, path=path, log=log,
                          objective_field=args.objective_field)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
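# --- Illustrative sketch (not BigMLer code) ---------------------------------
# The fusion, deepnet and logistic regression branches above all pick the
# same query string before calling u.check_resource: a minimal model when no
# local prediction is needed, the full field list when --export-fields is
# set, and the default retrieval otherwise. A standalone sketch of that
# choice; has_test, minimum_model_qs and all_fields_qs are passed in because
# this helper is hypothetical and not part of the code base.
def choose_query_string(args, has_test, minimum_model_qs, all_fields_qs):
    if not has_test(args):
        return minimum_model_qs
    if args.export_fields:
        return all_fields_qs
    return ''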
def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

    """

    deepnet = None
    deepnets = None

    # variables from command-line options
    resume = args.resume_
    deepnet_ids = args.deepnet_ids_
    output = args.predictions
    # there's only one deepnet to be generated at present
    args.max_parallel_deepnets = 1
    # deepnets cannot be published yet.
    args.public_deepnet = False

    # It is compulsory to have a description to publish either datasets or
    # deepnets
    if (not args.description_ and (args.public_deepnet or
                                   args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties
    if datasets:
        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)

    if args.deepnet_file:
        # deepnet is retrieved from the contents of the given local
        # JSON file
        deepnet, csv_properties, fields = u.read_local_resource(
            args.deepnet_file,
            csv_properties=csv_properties)
        deepnets = [deepnet]
        deepnet_ids = [deepnet['resource']]
    else:
        # deepnet is retrieved from the remote object
        deepnets, deepnet_ids, resume = \
            pdn.deepnets_processing(
                datasets, deepnets, deepnet_ids,
                api, args, resume, fields=fields,
                session_file=session_file, path=path, log=log)
        if deepnets:
            deepnet = deepnets[0]

    # We update the deepnet's public state if needed
    if deepnet:
        if isinstance(deepnet, basestring) or \
                api.status(deepnet) != bigml.api.FINISHED:
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            elif args.export_fields:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = ''
            deepnet = u.check_resource(deepnet, api.get_deepnet,
                                       query_string=query_string)
            deepnets[0] = deepnet
        if (args.public_deepnet or
                (args.shared_flag and
                 r.shared_changed(args.shared, deepnet))):
            deepnet_args = {}
            if args.shared_flag and r.shared_changed(args.shared, deepnet):
                deepnet_args.update(shared=args.shared)
            if args.public_deepnet:
                deepnet_args.update(
                    r.set_publish_deepnet_args(args))
            if deepnet_args:
                deepnet = r.update_deepnet(
                    deepnet, deepnet_args, args,
                    api=api, path=path, session_file=session_file)
                deepnets[0] = deepnet

    # We get the fields of the deepnet if we haven't got
    # them yet and need them
    if deepnet and (args.test_set or args.export_fields):
        fields = pdn.get_deepnet_fields(
            deepnet, csv_properties, args)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    # If predicting
    if deepnets and (a.has_test(args) or
                     (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_prediction_args = r.set_batch_prediction_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_dn_prediction(deepnet, test_dataset,
                                 batch_prediction_args, args,
                                 api, resume, prediction_file=output,
                                 session_file=session_file, path=path,
                                 log=log)
        else:
            dn_prediction(deepnets, fields, args,
                          session_file=session_file)

    # If evaluate flag is on, create remote evaluation and save results in
    # json and human-readable format.
    if args.evaluate:
        # When we resume evaluation and models were already completed, we
        # should use the datasets array as test datasets
        if args.has_test_datasets_:
            test_dataset = get_test_dataset(args)
        if args.dataset_off and not args.has_test_datasets_:
            args.test_dataset_ids = datasets
        if args.test_dataset_ids and args.dataset_off:
            # Evaluate the models with the corresponding test datasets.
            test_dataset_id = bigml.api.get_dataset_id(
                args.test_dataset_ids[0])
            test_dataset = api.check_resource(test_dataset_id)
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            resume = evaluate(deepnets, args.test_dataset_ids,
                              api, args, resume,
                              fields=fields, dataset_fields=test_fields,
                              session_file=session_file, path=path, log=log,
                              objective_field=args.objective_field)
        else:
            dataset = datasets[0]
            if args.test_split > 0 or args.has_test_datasets_:
                dataset = test_dataset
            dataset = u.check_resource(dataset, api=api,
                                       query_string=r.ALL_FIELDS_QS)
            dataset_fields = pd.get_fields_structure(dataset, None)
            resume = evaluate(deepnets, [dataset], api, args, resume,
                              fields=fields, dataset_fields=dataset_fields,
                              session_file=session_file, path=path, log=log,
                              objective_field=args.objective_field)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
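# --- Illustrative sketch (not BigMLer code) ---------------------------------
# The share/publish update that the deepnet, logistic regression and fusion
# branches above repeat can be read as one small step: collect the attribute
# changes, then call the matching r.update_* helper only when the dict is
# non-empty. The helper below is hypothetical; `shared_changed` stands in for
# r.shared_changed and `publish_args_builder` for helpers such as
# r.set_publish_deepnet_args.
def build_visibility_args(args, resource, public_flag, shared_changed,
                          publish_args_builder):
    """Collect the changes needed to share and/or publish a resource;
    an empty dict means no update call is required."""
    update_args = {}
    if args.shared_flag and shared_changed(args.shared, resource):
        update_args.update(shared=args.shared)
    if public_flag:
        update_args.update(publish_args_builder(args))
    return update_args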
    # json and human-readable format.
    if args.evaluate:
        # When we resume evaluation and models were already completed, we
        # should use the datasets array as test datasets
        if args.has_test_datasets_:
            test_dataset = get_test_dataset(args)
        if args.dataset_off and not args.has_test_datasets_:
            args.test_dataset_ids = datasets
        if args.test_dataset_ids and args.dataset_off:
            eval_ensembles = len(ensemble_ids) == len(args.test_dataset_ids)
            models_or_ensembles = (ensemble_ids if eval_ensembles
                                   else models)
            # Evaluate the models with the corresponding test datasets.
            resume = evaluate(models_or_ensembles, args.test_dataset_ids,
                              api, args, resume, fields=fields,
                              dataset_fields=dataset_fields,
                              session_file=session_file, path=path, log=log,
                              labels=labels, all_labels=all_labels,
                              objective_field=args.objective_field)
        else:
            if args.multi_label and args.test_set is not None:
                # When evaluation starts from existing models, the
                # multi_label_fields can be retrieved from the user_metadata
                # in the models
                if args.multi_label_fields is None and multi_label_fields:
                    args.multi_label_fields = multi_label_fields
                test_set = ps.multi_label_expansion(
                    test_set, test_set_header, args, path,
                    labels=labels, session_file=session_file)[0]
                test_set_header = True
            if args.test_split > 0 or args.has_test_datasets_:
        sys.exit("Failed to get model %s: %s" % (model_id,
                                                 str(exception)))
    local_model = Model(model)
    message = u.dated("Combining votes.\n")
    u.log_message(message, log_file=session_file,
                  console=args.verbosity)
    u.combine_votes(votes_files, local_model.to_prediction,
                    output, args.method)

    # If evaluate flag is on, create remote evaluation and save results in
    # json and human-readable format.
    if args.evaluate:
        if args.test_split > 0:
            dataset = test_dataset
        resume = evaluate(model, dataset, name, description, fields,
                          fields_map, output, api, args, resume,
                          session_file=session_file, path=path, log=log)

    # If cross_validation_rate is > 0, create remote evaluations and save
    # results in json and human-readable format. Then average the results to
    # issue a cross_validation measure set.
    if args.cross_validation_rate > 0:
        args.sample_rate = 1 - args.cross_validation_rate
        if args.number_of_evaluations > 0:
            number_of_evaluations = args.number_of_evaluations
        else:
            number_of_evaluations = int(MONTECARLO_FACTOR *
                                        args.cross_validation_rate)
        cross_validate(models, dataset, number_of_evaluations, name,
                       description, fields, fields_map, api, args, resume,
                       session_file=session_file, path=path, log=log)
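# --- Illustrative sketch (not BigMLer code) ---------------------------------
# The cross-validation bookkeeping above boils down to two numbers: the
# sampling rate left after reserving the holdout fraction, and the number of
# evaluation rounds, which defaults to MONTECARLO_FACTOR times the holdout
# fraction unless --number-of-evaluations fixes it. A minimal sketch, with
# the factor passed in because this helper is hypothetical:
def cross_validation_plan(args, montecarlo_factor):
    sample_rate = 1 - args.cross_validation_rate
    if args.number_of_evaluations > 0:
        number_of_evaluations = args.number_of_evaluations
    else:
        number_of_evaluations = int(montecarlo_factor *
                                    args.cross_validation_rate)
    return sample_rate, number_of_evaluations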