def compute_output(api, args, training_set, test_set=None, output=None,
                   objective_field=None, description=None,
                   field_attributes=None, types=None, dataset_fields=None,
                   model_fields=None, name=None, training_set_header=True,
                   test_set_header=True, model_ids=None, votes_files=None,
                   resume=False, fields_map=None):
    """ Creates one or more models using the `training_set` or uses the ids
    of previously created BigML models to make predictions for the `test_set`.

    Orchestrates the full pipeline: source -> dataset -> models ->
    predictions, honoring the command-line flags carried in `args`
    (multi-label expansion, test split, publishing, resume, vote
    combination).

    :param api: authenticated BigML API connection object
    :param args: parsed command-line arguments namespace
    :param training_set: path to the training data file (or None)
    :param test_set: path to the test data file (or None)
    :param output: path used both for predictions output and to derive
                   the session directory
    :param objective_field: field to predict; may be rewritten by
                            multi-label expansion
    :param resume: when True, previously created resources are retrieved
                   from checkpoints instead of re-created
    :param fields_map: NOTE(review): accepted but never read in this
                       function body — presumably consumed by callers of a
                       sibling code path; verify before removing.
    """
    source = None
    dataset = None
    model = None
    models = None
    fields = None
    ensemble_ids = []

    # It is compulsory to have a description to publish either datasets or
    # models
    if (not description and (args.black_box or args.white_box
                             or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # Session log lives next to the predictions output.
    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required, open the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        if args.clear_logs:
            try:
                # Unbuffered truncate (Python 2 three-arg open); a failure
                # to clear is deliberately non-fatal.
                open(log, 'w', 0).close()
            except IOError:
                pass

    # labels to be used in multi-label expansion
    labels = (map(str.strip, args.labels.split(','))
              if args.labels is not None else None)
    if labels is not None:
        # Normalize to sorted unicode labels (Python 2 str.decode).
        labels = sorted([label.decode("utf-8") for label in labels])

    # multi_label file must be preprocessed to obtain a new extended file
    # with one column per label.
    if args.multi_label and training_set is not None:
        (training_set, labels,
         field_attributes, objective_field) = multi_label_expansion(
             training_set, training_set_header, objective_field, args, path,
             field_attributes=field_attributes, labels=labels,
             session_file=session_file)
        # The expanded file always carries a header row.
        training_set_header = True
    all_labels = labels
    # Evaluations on multi-label data need the test file expanded the same
    # way as the training file.
    if args.multi_label and args.evaluate and args.test_set is not None:
        (test_set, test_labels,
         field_attributes, objective_field) = multi_label_expansion(
             test_set, test_set_header, objective_field, args, path,
             field_attributes=field_attributes, labels=labels,
             session_file=session_file)
        test_set_header = True

    # Create (or resume) the source and dataset resources.
    source, resume, csv_properties, fields = source_processing(
        training_set, test_set, training_set_header, test_set_header,
        api, args, resume, name=name, description=description,
        csv_properties=csv_properties, field_attributes=field_attributes,
        types=types, session_file=session_file, path=path, log=log)

    dataset, resume, csv_properties, fields = dataset_processing(
        source, training_set, test_set, fields, api, args, resume,
        name=name, description=description, dataset_fields=dataset_fields,
        csv_properties=csv_properties, session_file=session_file,
        path=path, log=log)

    # If test_split is used, split the dataset in a training and a test
    # dataset according to the given split
    if args.test_split > 0:
        dataset, test_dataset, resume = split_processing(
            dataset, api, args, resume, name=name, description=description,
            session_file=session_file, path=path, log=log)

    # Create or retrieve the models (or ensembles) to predict with.
    models, model_ids, ensemble_ids, resume = models_processing(
        dataset, models, model_ids, objective_field, fields, api, args,
        resume, name=name, description=description,
        model_fields=model_fields, session_file=session_file, path=path,
        log=log, labels=labels, all_labels=all_labels)
    if models:
        model = models[0]
        single_model = len(models) == 1

    # We get the fields of the model if we haven't got
    # them yet and update its public state if needed
    if model and not args.evaluate and (test_set or args.black_box
                                        or args.white_box):
        if args.black_box or args.white_box:
            model = r.publish_model(model, args, api, session_file)
            models[0] = model
        # If more than one model, use the full field structure
        fields, objective_field = get_model_fields(
            model, csv_properties, args, single_model=single_model)

    # If multi-label flag is set and no training_set was provided, label
    # info is extracted from the fields structure
    if args.multi_label and training_set is None:
        fields_list = []
        for model in models:
            # Finish downloading any model that is still a bare id or not
            # in FINISHED state before reading its field structure.
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] !=
                    bigml.api.FINISHED):
                # if there's more than one model the first one must contain
                # the entire field structure to be used as reference.
                query_string = (r.FIELDS_QS if single_model
                                else r.ALL_FIELDS_QS)
                model = bigml.api.check_resource(model, api.get_model,
                                                 query_string=query_string)
            fields_list.append(model['object']['model']['fields'])
        fields_list.reverse()
        all_labels, labels = l.retrieve_labels(fields_list, labels)

    # If predicting
    if models and test_set and not args.evaluate:
        models_per_label = 1
        if args.multi_label:
            # Python 2 integer division: models are created in equal groups,
            # one group per label.
            models_per_label = len(models) / len(all_labels)
        predict(test_set, test_set_header, models, fields, output,
                objective_field, args, api, log, args.max_batch_models,
                resume, session_file, labels=labels,
                models_per_label=models_per_label)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if votes_files:
        # Recover the model id from the first predictions file name, e.g.
        # ".../model_<24 hex chars>__predictions.csv" -> "model/<hex>".
        model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                          r'\1', votes_files[0]).replace("_", "/")
        try:
            model = u.check_resource(model_id, api.get_model)
        except ValueError, exception:
            sys.exit("Failed to get model %s: %s" % (model_id,
                                                     str(exception)))
        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        combine_votes(votes_files, local_model.to_prediction,
                      output, args.method)
def models_processing(dataset, models, model_ids, objective_field, fields,
                      api, args, resume, name=None, description=None,
                      model_fields=None, session_file=None, path=None,
                      log=None, labels=None, all_labels=None):
    """Creates or retrieves models from the input data

    Depending on the flags in `args`, models come from one of several
    sources: freshly created from `dataset` (single models, one model or
    ensemble per label in multi-label mode, or an ensemble /
    cross-validation set of models), or retrieved from user-provided ids
    (--model, --models, --model-tag, --ensemble, --ensembles,
    --ensemble-tag).

    NOTE(review): this function mutates `args` in place
    (`args.number_of_models`, `args.ensemble`) — callers appear to rely on
    that; confirm before refactoring.

    :param dataset: dataset id/resource used to create new models
    :param models, model_ids: accumulators; may arrive pre-filled when
                              resuming
    :param resume: when True, checkpoints are consulted to skip creation
    :return: tuple (models, model_ids, ensemble_ids, resume)
    """
    log_models = False
    ensemble_ids = []
    # If we have a dataset but not a model, we create the model if the
    # no_model flag hasn't been set up.
    if dataset and not (has_models(args) or args.no_model):
        model_ids = []
        models = []
        if args.multi_label:
            # Create one model per column choosing only the label column
            if args.training_set is None:
                # No local training file: labels must be recovered from the
                # dataset's field structure instead.
                all_labels, labels = l.retrieve_labels(fields.fields, labels)
            # If --number-of-models is not set or is 1, create one model per
            # label. Otherwise, create one ensemble per label with the
            # required number of models
            if args.number_of_models < 2:
                models, model_ids, resume = model_per_label(
                    labels, all_labels, dataset, fields, objective_field,
                    api, args, resume, name, description, model_fields,
                    session_file, path, log)
            else:
                (ensembles, ensemble_ids,
                 models, model_ids, resume) = ensemble_per_label(
                     labels, all_labels, dataset, fields, objective_field,
                     api, args, resume, name, description, model_fields,
                     session_file, path, log)
        elif args.number_of_models > 1:
            ensembles = []
            # Ensemble of models
            (ensembles, ensemble_ids,
             models, model_ids, resume) = ensemble_processing(
                 dataset, objective_field, fields, api, args, resume,
                 name=name, description=description,
                 model_fields=model_fields, session_file=session_file,
                 path=path, log=log)
            ensemble = ensembles[0]
            # Remember the ensemble id so the `args.ensemble` branch below
            # retrieves its member models.
            args.ensemble = bigml.api.get_ensemble_id(ensemble)
            log_models = True
        else:
            # Cross-validation case: we create 2 * n models to be validated
            # holding out an n% of data
            if args.cross_validation_rate > 0:
                if args.number_of_evaluations > 0:
                    args.number_of_models = args.number_of_evaluations
                else:
                    args.number_of_models = int(MONTECARLO_FACTOR *
                                                args.cross_validation_rate)
            if resume:
                # Check how many models the previous run already created.
                resume, model_ids = c.checkpoint(
                    c.are_models_created, path, args.number_of_models,
                    debug=args.debug)
                if not resume:
                    message = u.dated("Found %s models out of %s."
                                      " Resuming.\n"
                                      % (len(model_ids),
                                         args.number_of_models))
                    u.log_message(message, log_file=session_file,
                                  console=args.verbosity)
                    # Only the remaining models need to be created.
                    models = model_ids
                    args.number_of_models -= len(model_ids)
            model_args = r.set_model_args(name, description, args,
                                          objective_field, fields,
                                          model_fields)
            models, model_ids = r.create_models(dataset, models, model_args,
                                                args, api, path,
                                                session_file, log)
    # If a model is provided, we use it.
    elif args.model:
        model_ids = [args.model]
        models = model_ids[:]
    elif args.models or args.model_tag:
        # model_ids is expected to arrive pre-filled by the caller in this
        # case — presumably parsed from --models / --model-tag upstream;
        # TODO(review): confirm against the calling code.
        models = model_ids[:]

    if args.ensemble:
        # A single ensemble: its member model ids replace model_ids.
        ensemble = r.get_ensemble(args.ensemble, api, args.verbosity,
                                  session_file)
        ensemble_ids = [ensemble]
        model_ids = ensemble['object']['models']
        if log_models and args.number_of_models > 1:
            # Record member models only for ensembles created in this run.
            for model_id in model_ids:
                u.log_created_resources("models", path, model_id,
                                        open_mode='a')
        models = model_ids[:]

    if args.ensembles or args.ensemble_tag:
        model_ids = []
        ensemble_ids = []
        # Parses ensemble/ids if provided.
        if args.ensemble_tag:
            ensemble_ids = (ensemble_ids +
                            u.list_ids(api.list_ensembles,
                                       "tags__in=%s" % args.ensemble_tag))
        else:
            ensemble_ids = u.read_resources(args.ensembles)
        for ensemble_id in ensemble_ids:
            ensemble = r.get_ensemble(ensemble_id, api)
            if args.ensemble is None:
                # First ensemble id becomes the reference ensemble.
                args.ensemble = ensemble_id
            model_ids.extend(ensemble['object']['models'])
        models = model_ids[:]

    # If we are going to predict we must retrieve the models
    if model_ids and args.test_set and not args.evaluate:
        models, model_ids = r.get_models(models, args, api, session_file)

    return models, model_ids, ensemble_ids, resume