def compute_output(api, args, training_set, test_set=None, output=None,
                   objective_field=None, description=None,
                   field_attributes=None, types=None, dataset_fields=None,
                   model_fields=None, name=None, training_set_header=True,
                   test_set_header=True, model_ids=None, votes_files=None,
                   resume=False, fields_map=None):
    """Creates one or more models using the `training_set` or uses the ids
    of previously created BigML models to make predictions for the
    `test_set`.

    Pipeline: source -> dataset -> (optional train/test split) -> models ->
    predictions, with each stage delegated to a *_processing helper that
    can resume from previous runs. When `votes_files` is given, previously
    saved per-model prediction files are combined instead.

    NOTE(review): this function's steps are strictly order-dependent (each
    stage consumes ids/fields produced by the previous one) and each helper
    performs remote API calls and local logging side effects.
    """
    source = None
    dataset = None
    model = None
    models = None
    fields = None
    # It is compulsory to have a description to publish either datasets or
    # models
    if (not description and (args.black_box or args.white_box
                             or args.public_dataset)):
        raise Exception("You should provide a description to publish.")
    # All session output goes under the directory derived from `output`
    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required, open the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        if args.clear_logs:
            try:
                # open with buffering=0 truncates the log immediately
                open(log, 'w', 0).close()
            except IOError:
                # best-effort: a missing/locked log file is not fatal
                pass
    source, resume, csv_properties, fields = source_processing(
        training_set, test_set, training_set_header, test_set_header,
        name, description, api, args, resume, csv_properties=csv_properties,
        field_attributes=field_attributes,
        types=types, session_file=session_file, path=path, log=log)
    dataset, resume, csv_properties, fields = dataset_processing(
        source, training_set, test_set, model_ids, name, description, fields,
        dataset_fields, api, args, resume, csv_properties=csv_properties,
        session_file=session_file, path=path, log=log)
    # If test_split is used, split the dataset in a training and a test
    # dataset according to the given split
    if args.test_split > 0:
        dataset, test_dataset, resume = split_processing(
            dataset, name, description, api, args, resume,
            session_file=session_file, path=path, log=log)
    models, model_ids, resume = models_processing(
        dataset, models, model_ids, name, description, test_set,
        objective_field, fields, model_fields, api, args, resume,
        session_file=session_file, path=path, log=log)
    if models:
        # the first model is used as the reference model from here on
        model = models[0]

    # We get the fields of the model if we haven't got
    # them yet and update its public state if needed
    if model and not args.evaluate and (test_set or args.black_box
                                        or args.white_box):
        if args.black_box or args.white_box:
            model = r.publish_model(model, args, api, session_file)
            models[0] = model
        fields, objective_field = get_model_fields(model, model_fields,
                                                   csv_properties, args)

    # If predicting
    if models and test_set and not args.evaluate:
        predict(test_set, test_set_header, models, fields, output,
                objective_field, args.remote, api, log,
                args.max_batch_models, args.method, resume, args.tag,
                args.verbosity, session_file, args.debug, args.ensemble,
                args.prediction_info)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if votes_files:
        # extract the model id embedded in the predictions file name,
        # e.g. ".../model_<24 hex>__predictions.csv" -> "model/<24 hex>"
        model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                          r'\1', votes_files[0]).replace("_", "/")
        try:
            model = bigml.api.check_resource(model_id, api.get_model)
        except ValueError, exception:
            sys.exit("Failed to get model %s: %s" % (model_id,
                                                     str(exception)))
        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        u.combine_votes(votes_files, local_model.to_prediction,
                        output, args.method)
def compute_output(api, args, training_set, test_set=None, output=None,
                   objective_field=None, description=None,
                   field_attributes=None, types=None, dataset_fields=None,
                   model_fields=None, name=None, training_set_header=True,
                   test_set_header=True, model_ids=None, votes_files=None,
                   resume=False, fields_map=None, test_field_attributes=None,
                   test_types=None):
    """Creates one or more models using the `training_set` or uses the ids
    of previously created BigML models to make predictions for the
    `test_set`.

    This revision adds multi-label expansion, --max-categories dataset
    partitioning, --new-fields dataset generation, ensembles and remote
    (batch) predictions on top of the basic
    source -> dataset -> models -> predict pipeline.

    NOTE(review): the stages are strictly order-dependent and thread
    mutable state (`labels`, `multi_label_data`, `other_label`, `resume`)
    through each helper; every helper performs remote API calls.
    """
    source = None
    dataset = None
    model = None
    models = None
    fields = None
    # label used for the catch-all category created by --max-categories
    other_label = OTHER
    ensemble_ids = []
    multi_label_data = None
    multi_label_fields = []
    local_ensemble = None

    # It is compulsory to have a description to publish either datasets or
    # models
    if (not description and (args.black_box or args.white_box
                             or args.public_dataset)):
        sys.exit("You should provide a description to publish.")
    # When using --max-categories, it is compulsory to specify also the
    # objective_field
    if args.max_categories > 0 and objective_field is None:
        sys.exit("When --max-categories is used, you must also provide the"
                 " --objective field name or column number")
    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")
    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required, open the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        if args.clear_logs:
            try:
                open(log, 'w', 0).close()
            except IOError:
                # best-effort truncation; failure to clear is not fatal
                pass

    # labels to be used in multi-label expansion
    labels = (map(str.strip, args.labels.split(','))
              if args.labels is not None else None)
    if labels is not None:
        labels = sorted([label.decode("utf-8") for label in labels])

    # multi_label file must be preprocessed to obtain a new extended file
    if args.multi_label and training_set is not None:
        (training_set, multi_label_data) = ps.multi_label_expansion(
            training_set, training_set_header, objective_field, args, path,
            labels=labels, session_file=session_file)
        # the expanded file always carries a header row
        training_set_header = True
        objective_field = multi_label_data["objective_name"]
        all_labels = l.get_all_labels(multi_label_data)
        if not labels:
            labels = all_labels
    else:
        all_labels = labels

    source, resume, csv_properties, fields = ps.source_processing(
        training_set, test_set, training_set_header, test_set_header,
        api, args, resume, name=name, description=description,
        csv_properties=csv_properties, field_attributes=field_attributes,
        types=types, multi_label_data=multi_label_data,
        session_file=session_file, path=path, log=log)
    if args.multi_label and source:
        # sync label info with the metadata stored in the source
        multi_label_data = l.get_multi_label_data(source)
        (objective_field, labels,
         all_labels, multi_label_fields) = l.multi_label_sync(
             objective_field, labels, multi_label_data, fields,
             multi_label_fields)
    datasets, resume, csv_properties, fields = pd.dataset_processing(
        source, training_set, test_set, fields, objective_field,
        api, args, resume, name=name, description=description,
        dataset_fields=dataset_fields, multi_label_data=multi_label_data,
        csv_properties=csv_properties,
        session_file=session_file, path=path, log=log)
    if datasets:
        # datasets[0] is the working dataset for all subsequent stages
        dataset = datasets[0]

    # If test_split is used, split the dataset in a training and a test
    # dataset according to the given split
    if args.test_split > 0:
        dataset, test_dataset, resume = pd.split_processing(
            dataset, api, args, resume, name=name, description=description,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset

    # Check if the dataset has a categorical objective field and it
    # has a max_categories limit for categories
    if args.max_categories > 0 and len(datasets) == 1:
        objective_id = fields.field_id(fields.objective_field)
        if pd.check_max_categories(fields.fields[objective_id]):
            distribution = pd.get_categories_distribution(dataset,
                                                          objective_id)
            if distribution and len(distribution) > args.max_categories:
                # partition the dataset into one dataset per category group
                categories = [element[0] for element in distribution]
                other_label = pd.create_other_label(categories, other_label)
                datasets, resume = pd.create_categories_datasets(
                    dataset, distribution, fields, args,
                    api, resume, session_file=session_file, path=path,
                    log=log, other_label=other_label)
        else:
            sys.exit("The provided objective field is not categorical nor "
                     "a full terms only text field. "
                     "Only these fields can be used with"
                     " --max-categories")

    # Check if the dataset a generators file associated with it, and
    # generate a new dataset with the specified field structure
    if args.new_fields:
        dataset, resume = pd.create_new_dataset(
            dataset, api, args, resume, name=name,
            description=description, session_file=session_file, path=path,
            log=log)
        datasets[0] = dataset
    if args.multi_label and dataset and multi_label_data is None:
        # label info can also live in the dataset's metadata
        multi_label_data = l.get_multi_label_data(dataset)
        (objective_field, labels,
         all_labels, multi_label_fields) = l.multi_label_sync(
             objective_field, labels, multi_label_data,
             fields, multi_label_fields)
    if dataset:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(dataset, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(dataset, 'other_label', other_label)

    models, model_ids, ensemble_ids, resume = pm.models_processing(
        datasets, models, model_ids,
        objective_field, fields, api, args, resume,
        name=name, description=description, model_fields=model_fields,
        session_file=session_file, path=path, log=log, labels=labels,
        multi_label_data=multi_label_data, other_label=other_label)
    if models:
        model = models[0]
        single_model = len(models) == 1

    # If multi-label flag is set and no training_set was provided, label
    # info is extracted from the user_metadata. If models belong to an
    # ensemble, the ensemble must be retrieved to get the user_metadata.
    if model and args.multi_label and multi_label_data is None:
        if len(ensemble_ids) > 0 and isinstance(ensemble_ids[0], dict):
            # ensemble resource already fetched
            resource = ensemble_ids[0]
        elif belongs_to_ensemble(model):
            ensemble_id = get_ensemble_id(model)
            resource = r.get_ensemble(ensemble_id, api=api,
                                      verbosity=args.verbosity,
                                      session_file=session_file)
        else:
            resource = model
        multi_label_data = l.get_multi_label_data(resource)

    # We get the fields of the model if we haven't got
    # them yet and update its public state if needed
    if model and not args.evaluate and (test_set or args.black_box
                                        or args.white_box):
        if args.black_box or args.white_box:
            model = r.publish_model(model, args, api, session_file)
            models[0] = model
        # If more than one model, use the full field structure
        if (not single_model and not args.multi_label
                and belongs_to_ensemble(model)):
            if len(ensemble_ids) > 0:
                ensemble_id = ensemble_ids[0]
            else:
                ensemble_id = get_ensemble_id(model)
            local_ensemble = Ensemble(ensemble_id, api=api)
        fields, objective_field = pm.get_model_fields(
            model, csv_properties, args, single_model=single_model,
            multi_label_data=multi_label_data,
            local_ensemble=local_ensemble)

    # Fills in all_labels from user_metadata
    if args.multi_label and not all_labels:
        (objective_field, labels,
         all_labels, multi_label_fields) = l.multi_label_sync(
             objective_field, labels, multi_label_data, fields,
             multi_label_fields)
    if model:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(model, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(model, 'other_label', other_label)

    # If predicting
    if models and has_test(args) and not args.evaluate:
        models_per_label = 1
        test_dataset = None
        if args.multi_label:
            # When prediction starts from existing models, the
            # multi_label_fields can be retrieved from the user_metadata
            # in the models
            if args.multi_label_fields is None and multi_label_fields:
                multi_label_field_names = [field[1] for field
                                           in multi_label_fields]
                args.multi_label_fields = ",".join(multi_label_field_names)
            test_set = ps.multi_label_expansion(
                test_set, test_set_header, objective_field, args, path,
                labels=labels, session_file=session_file,
                input_flag=True)[0]
            test_set_header = True

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on or multi-label
        # or max-categories are used
        if (args.remote and not args.no_batch and not args.multi_label
                and not args.method in [THRESHOLD_CODE, COMBINATION]):
            # create test source from file
            test_name = "%s - test" % name
            if args.test_source is None:
                (test_source, resume,
                 csv_properties, test_fields) = ps.test_source_processing(
                     test_set, test_set_header,
                     api, args, resume, name=test_name,
                     description=description,
                     field_attributes=test_field_attributes,
                     types=test_types,
                     session_file=session_file, path=path, log=log)
            else:
                # reuse the source id supplied by the user
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id,
                                                 api.get_source)
            if args.test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(test_name,
                                                        description, args)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                # reuse the dataset id supplied by the user
                test_dataset_id = bigml.api.get_dataset_id(
                    args.test_dataset)
                test_dataset = api.check_resource(test_dataset_id,
                                                  api.get_dataset)
            # the test dataset has no objective field of its own
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_prediction_args = r.set_batch_prediction_args(
                name, description, args, fields=fields,
                dataset_fields=test_fields, fields_map=fields_map)
            remote_predict(model, test_dataset, batch_prediction_args,
                           args, api, resume, prediction_file=output,
                           session_file=session_file, path=path, log=log)
        else:
            models_per_label = args.number_of_models
            if (args.multi_label and len(ensemble_ids) > 0
                    and args.number_of_models == 1):
                # use case where ensembles are read from a file
                models_per_label = len(models) / len(ensemble_ids)
            predict(test_set, test_set_header, models, fields, output,
                    objective_field, args, api=api, log=log,
                    max_models=args.max_batch_models, resume=resume,
                    session_file=session_file, labels=labels,
                    models_per_label=models_per_label,
                    other_label=other_label,
                    multi_label_data=multi_label_data)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if votes_files:
        # extract the model id embedded in the predictions file name,
        # e.g. ".../model_<24 hex>__predictions.csv" -> "model/<24 hex>"
        model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                          r'\1', votes_files[0]).replace("_", "/")
        try:
            model = u.check_resource(model_id, api.get_model)
        except ValueError, exception:
            sys.exit("Failed to get model %s: %s" % (model_id,
                                                     str(exception)))
        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        combine_votes(votes_files, local_model.to_prediction,
                      output, args.method)
def compute_output(api, args, training_set, test_set=None, output=None, objective_field=None, description=None, field_attributes=None, types=None, dataset_fields=None, model_fields=None, name=None, training_set_header=True, test_set_header=True, model_ids=None, votes_files=None, resume=False, fields_map=None): """ Creates one or more models using the `training_set` or uses the ids of previously created BigML models to make predictions for the `test_set`. """ source = None dataset = None model = None models = None fields = None ensemble_ids = [] # It is compulsory to have a description to publish either datasets or # models if (not description and (args.black_box or args.white_box or args.public_dataset)): sys.exit("You should provide a description to publish.") path = u.check_dir(output) session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG) csv_properties = {} # If logging is required, open the file for logging log = None if args.log_file: u.check_dir(args.log_file) log = args.log_file # If --clear_logs the log files are cleared if args.clear_logs: try: open(log, 'w', 0).close() except IOError: pass # labels to be used in multi-label expansion labels = (map(str.strip, args.labels.split(',')) if args.labels is not None else None) if labels is not None: labels = sorted([label.decode("utf-8") for label in labels]) # multi_label file must be preprocessed to obtain a new extended file if args.multi_label and training_set is not None: (training_set, labels, field_attributes, objective_field) = multi_label_expansion( training_set, training_set_header, objective_field, args, path, field_attributes=field_attributes, labels=labels, session_file=session_file) training_set_header = True all_labels = labels if args.multi_label and args.evaluate and args.test_set is not None: (test_set, test_labels, field_attributes, objective_field) = multi_label_expansion( test_set, test_set_header, objective_field, args, path, field_attributes=field_attributes, labels=labels, 
session_file=session_file) test_set_header = True source, resume, csv_properties, fields = source_processing( training_set, test_set, training_set_header, test_set_header, api, args, resume, name=name, description=description, csv_properties=csv_properties, field_attributes=field_attributes, types=types, session_file=session_file, path=path, log=log) dataset, resume, csv_properties, fields = dataset_processing( source, training_set, test_set, fields, api, args, resume, name=name, description=description, dataset_fields=dataset_fields, csv_properties=csv_properties, session_file=session_file, path=path, log=log) # If test_split is used, split the dataset in a training and a test dataset # according to the given split if args.test_split > 0: dataset, test_dataset, resume = split_processing( dataset, api, args, resume, name=name, description=description, session_file=session_file, path=path, log=log) models, model_ids, ensemble_ids, resume = models_processing( dataset, models, model_ids, objective_field, fields, api, args, resume, name=name, description=description, model_fields=model_fields, session_file=session_file, path=path, log=log, labels=labels, all_labels=all_labels) if models: model = models[0] single_model = len(models) == 1 # We get the fields of the model if we haven't got # them yet and update its public state if needed if model and not args.evaluate and (test_set or args.black_box or args.white_box): if args.black_box or args.white_box: model = r.publish_model(model, args, api, session_file) models[0] = model # If more than one model, use the full field structure fields, objective_field = get_model_fields( model, csv_properties, args, single_model=single_model) # If multi-label flag is set and no training_set was provided, label # info is extracted from the fields structure if args.multi_label and training_set is None: fields_list = [] for model in models: if (isinstance(model, basestring) or bigml.api.get_status(model)['code'] != bigml.api.FINISHED): # 
if there's more than one model the first one must contain # the entire field structure to be used as reference. query_string = (r.FIELDS_QS if single_model else r.ALL_FIELDS_QS) model = bigml.api.check_resource(model, api.get_model, query_string=query_string) fields_list.append(model['object']['model']['fields']) fields_list.reverse() all_labels, labels = l.retrieve_labels(fields_list, labels) # If predicting if models and test_set and not args.evaluate: models_per_label = 1 if args.multi_label: models_per_label = len(models) / len(all_labels) predict(test_set, test_set_header, models, fields, output, objective_field, args, api, log, args.max_batch_models, resume, session_file, labels=labels, models_per_label=models_per_label) # When combine_votes flag is used, retrieve the predictions files saved # in the comma separated list of directories and combine them if votes_files: model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$', r'\1', votes_files[0]).replace("_", "/") try: model = u.check_resource(model_id, api.get_model) except ValueError, exception: sys.exit("Failed to get model %s: %s" % (model_id, str(exception))) local_model = Model(model) message = u.dated("Combining votes.\n") u.log_message(message, log_file=session_file, console=args.verbosity) combine_votes(votes_files, local_model.to_prediction, output, args.method)
def compute_output(api, args, training_set, test_set=None, output=None,
                   objective_field=None, description=None,
                   field_attributes=None, types=None, dataset_fields=None,
                   model_fields=None, name=None, training_set_header=True,
                   test_set_header=True, model_ids=None, votes_files=None,
                   resume=False, fields_map=None):
    """Creates one or more models using the `training_set` or uses the ids
    of previously created BigML models to make predictions for the
    `test_set`.

    Pipeline: source -> dataset -> (optional train/test split) -> models ->
    predictions, each stage delegated to a *_processing helper that can
    resume from previous runs. When `votes_files` is given, previously
    saved per-model prediction files are combined instead.
    """
    source = None
    dataset = None
    model = None
    models = None
    fields = None
    # It is compulsory to have a description to publish either datasets or
    # models
    if (not description and (args.black_box or args.white_box
                             or args.public_dataset)):
        raise Exception("You should provide a description to publish.")
    # All session output goes under the directory derived from `output`
    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required, open the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        if args.clear_logs:
            try:
                # open with buffering=0 truncates the log immediately
                open(log, 'w', 0).close()
            except IOError:
                # best-effort: failure to clear the log is not fatal
                pass
    source, resume, csv_properties, fields = source_processing(
        training_set, test_set, training_set_header, test_set_header,
        name, description, api, args, resume, csv_properties=csv_properties,
        field_attributes=field_attributes,
        types=types, session_file=session_file, path=path, log=log)
    dataset, resume, csv_properties, fields = dataset_processing(
        source, training_set, test_set, model_ids, name, description, fields,
        dataset_fields, api, args, resume, csv_properties=csv_properties,
        session_file=session_file, path=path, log=log)
    # If test_split is used, split the dataset in a training and a test
    # dataset according to the given split
    if args.test_split > 0:
        dataset, test_dataset, resume = split_processing(
            dataset, name, description, api, args, resume,
            session_file=session_file, path=path, log=log)
    models, model_ids, resume = models_processing(
        dataset, models, model_ids, name, description, test_set,
        objective_field, fields, model_fields, api, args, resume,
        session_file=session_file, path=path, log=log)
    if models:
        # the first model is used as the reference model from here on
        model = models[0]

    # We get the fields of the model if we haven't got
    # them yet and update its public state if needed
    if model and not args.evaluate and (test_set or args.black_box
                                        or args.white_box):
        if args.black_box or args.white_box:
            model = r.publish_model(model, args, api, session_file)
            models[0] = model
        fields, objective_field = get_model_fields(model, model_fields,
                                                   csv_properties, args)

    # If predicting
    if models and test_set and not args.evaluate:
        predict(test_set, test_set_header, models, fields, output,
                objective_field, args.remote, api, log,
                args.max_batch_models, args.method, resume, args.tag,
                args.verbosity, session_file, args.debug, args.ensemble,
                args.prediction_info)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if votes_files:
        # extract the model id embedded in the predictions file name,
        # e.g. ".../model_<24 hex>__predictions.csv" -> "model/<24 hex>"
        model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                          r'\1', votes_files[0]).replace("_", "/")
        try:
            model = bigml.api.check_resource(model_id, api.get_model)
        except ValueError, exception:
            sys.exit("Failed to get model %s: %s" % (model_id,
                                                     str(exception)))
        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        u.combine_votes(votes_files, local_model.to_prediction,
                        output, args.method)
def compute_output(api, args, training_set, test_set=None, output=None,
                   objective_field=None, description=None,
                   field_attributes=None, types=None, dataset_fields=None,
                   model_fields=None, name=None, training_set_header=True,
                   test_set_header=True, model_ids=None, votes_files=None,
                   resume=False, fields_map=None):
    """Creates one or more models using the `training_set` or uses the ids
    of previously created BigML models to make predictions for the
    `test_set`.

    This revision performs source, dataset and model processing inline
    (rather than delegating to *_processing helpers), including
    checkpoint-based resuming of interrupted runs. When `votes_files` is
    given, previously saved per-model prediction files are combined
    instead.
    """
    source = None
    dataset = None
    model = None
    models = None
    fields = None
    # It is compulsory to have a description to publish either datasets or
    # models
    if (not description and (args.black_box or args.white_box
                             or args.public_dataset)):
        raise Exception("You should provide a description to publish.")
    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required, open the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        if args.clear_logs:
            try:
                # open with buffering=0 truncates the log immediately
                open(log, 'w', 0).close()
            except IOError:
                # best-effort: failure to clear the log is not fatal
                pass

    # Starting source processing
    if (training_set or (args.evaluate and test_set)):
        # If resuming, try to extract args.source form log files
        if resume:
            resume, args.source = u.checkpoint(u.is_source_created, path,
                                               debug=args.debug)
            if not resume:
                # checkpoint not found: fall through and create the source
                message = u.dated("Source not found. Resuming.\n")
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
        # If neither a previous source, dataset or model are provided.
        # we create a new one. Also if --evaluate and test data are provided
        # we create a new dataset to test with.
        data_set, data_set_header = r.data_to_source(training_set, test_set,
                                                     training_set_header,
                                                     test_set_header, args)
        if data_set is not None:
            source_args = r.set_source_args(data_set_header, name,
                                            description, args)
            source = r.create_source(data_set, source_args, args, api,
                                     path, session_file, log)
    # If a source is provided either through the command line or in resume
    # steps, we use it.
    elif args.source:
        source = bigml.api.get_source_id(args.source)
    # If we already have source, we check that is finished, extract the
    # fields, and update them if needed.
    if source:
        source = r.get_source(source, api, args.verbosity, session_file)
        if 'source_parser' in source['object']:
            # propagate parser settings into the local csv properties
            source_parser = source['object']['source_parser']
            if 'missing_tokens' in source_parser:
                csv_properties['missing_tokens'] = (
                    source_parser['missing_tokens'])
            if 'data_locale' in source_parser:
                csv_properties['data_locale'] = source_parser['locale']
        fields = Fields(source['object']['fields'], **csv_properties)
        if field_attributes:
            source = r.update_source_fields(source, field_attributes,
                                            fields, api, args.verbosity,
                                            session_file)
        if types:
            source = r.update_source_fields(source, types, fields, api,
                                            args.verbosity, session_file)
    # End of source processing

    # Starting dataset processing
    if (training_set or args.source or (args.evaluate and test_set)):
        # if resuming, try to extract args.dataset form log files
        if resume:
            resume, args.dataset = u.checkpoint(u.is_dataset_created, path,
                                                debug=args.debug)
            if not resume:
                # checkpoint not found: fall through and create the dataset
                message = u.dated("Dataset not found. Resuming.\n")
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
        # If we have a source but not dataset or model has been provided, we
        # create a new dataset if the no_dataset option isn't set up. Also
        # if evaluate is set and test_set has been provided.
        if ((source and not args.dataset and not args.model
                and not model_ids and not args.no_dataset) or
                (args.evaluate and args.test_set and not args.dataset)):
            dataset_args = r.set_dataset_args(name, description, args,
                                              fields, dataset_fields)
            dataset = r.create_dataset(source, dataset_args, args, api,
                                       path, session_file, log)
    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)
    # If we already have a dataset, we check the status and get the fields
    # if we hadn't them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
        if not csv_properties and 'locale' in dataset['object']:
            csv_properties = {
                'data_locale': dataset['object']['locale']}
        fields = Fields(dataset['object']['fields'], **csv_properties)
        if args.public_dataset:
            r.publish_dataset(dataset, api, args, session_file)
    #end of dataset processing

    #start of model processing
    # If we have a dataset but not a model, we create the model if the
    # no_model flag hasn't been set up.
    if (dataset and not args.model and not model_ids and not args.no_model):
        model_ids = []
        models = []
        if resume:
            # reuse the models already created in the interrupted run and
            # only create the remaining ones
            resume, model_ids = u.checkpoint(u.are_models_created, path,
                                             args.number_of_models,
                                             debug=args.debug)
            if not resume:
                message = u.dated("Found %s models out of %s."
                                  " Resuming.\n" %
                                  (len(model_ids),
                                   args.number_of_models))
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
            models = model_ids
            args.number_of_models -= len(model_ids)
        model_args = r.set_model_args(name, description, args,
                                      objective_field, fields,
                                      model_fields)
        models, model_ids = r.create_models(dataset, models, model_args,
                                            args, api, path, session_file,
                                            log)
        model = models[0]
    # If a model is provided, we use it.
    elif args.model:
        model = args.model
        model_ids = [model]
        models = [model]
    elif args.models or args.model_tag:
        models = model_ids[:]
        model = models[0]
    # If we are going to predict we must retrieve the models
    if model_ids and test_set and not args.evaluate:
        models, model_ids = r.get_models(models, args, api, session_file)
        model = models[0]

    # We get the fields of the model if we haven't got
    # them yet and update its public state if needed
    if model and not args.evaluate and (test_set or args.black_box
                                        or args.white_box):
        if args.black_box or args.white_box:
            model = r.publish_model(model, args, api, session_file)
            models[0] = model
        if not csv_properties:
            csv_properties = {}
        csv_properties.update(verbose=True)
        if args.user_locale is None:
            # default to the locale stored in the model
            args.user_locale = model['object'].get('locale', None)
        csv_properties.update(data_locale=args.user_locale)
        if 'model_fields' in model['object']['model']:
            # restrict the fields to the ones the model actually uses
            model_fields = model['object']['model']['model_fields'].keys()
            csv_properties.update(include=model_fields)
        if 'missing_tokens' in model['object']['model']:
            missing_tokens = model['object']['model']['missing_tokens']
        else:
            missing_tokens = MISSING_TOKENS
        csv_properties.update(missing_tokens=missing_tokens)
        objective_field = models[0]['object']['objective_fields']
        # objective_fields may be a list; keep only the first entry
        if isinstance(objective_field, list):
            objective_field = objective_field[0]
        csv_properties.update(objective_field=objective_field)
        fields = Fields(model['object']['model']['fields'],
                        **csv_properties)
    # end of model processing

    # If predicting
    if models and test_set and not args.evaluate:
        predict(test_set, test_set_header, models, fields, output,
                objective_field, args.remote, api, log,
                args.max_batch_models, args.method, resume, args.tag,
                args.verbosity, session_file, args.debug)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if votes_files:
        # extract the model id embedded in the predictions file name,
        # e.g. ".../model_<24 hex>__predictions.csv" -> "model/<24 hex>"
        model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                          r'\1', votes_files[0]).replace("_", "/")
        try:
            model = api.check_resource(model_id, api.get_model)
        except ValueError, exception:
            sys.exit("Failed to get model %s: %s" % (model_id,
                                                     str(exception)))
        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        u.combine_votes(votes_files, local_model.to_prediction,
                        output, args.method)