def local_batch_predict(models, headers, test_reader, exclude, fields,
                        resume, output_path, max_models, number_of_tests,
                        api, output, verbosity, method, objective_field,
                        session_file, debug):
    """Get local predictions form partial Multimodel, combine and save to
       file

       Models are processed in slots of at most `max_models` models each;
       each slot predicts on every test row and its votes are merged into
       the accumulated `total_votes`, which are finally combined per row
       and written to `output`.
    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.
        """
        # NOTE: Python 2 integer division keeps pct an int
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct))
    models_total = len(models)
    # split the models list into slots of at most max_models models
    models_splits = [models[index:(index + max_models)] for index
                     in range(0, models_total, max_models)]
    # build the input data once; each model slot predicts on all rows
    input_data_list = []
    for row in test_reader:
        # drop the columns the user asked to exclude (indices assumed
        # sorted so deletions don't shift later ones — TODO confirm)
        for index in exclude:
            del row[index]
        input_data_list.append(fields.pair(row, headers, objective_field))
    total_votes = []
    models_count = 0
    for models_split in models_splits:
        if resume:
            # on resume, verify the per-model prediction files exist
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                u.checkpoint(u.are_predictions_created, pred_file,
                             number_of_tests, debug=debug)
        # download the full model resources for this slot
        complete_models = []
        for index in range(len(models_split)):
            complete_models.append(api.check_resource(
                models_split[index], api.get_model))
        local_model = MultiModel(complete_models)
        # reuse=True lets previously created prediction files be reused
        local_model.batch_predict(input_data_list, output_path, reuse=True)
        votes = local_model.batch_votes(output_path)
        models_count += max_models
        if models_count > models_total:
            models_count = models_total
        if verbosity:
            draw_progress_bar(models_count, models_total)
        # merge this slot's votes into the accumulated per-row votes
        if total_votes:
            for index in range(0, len(votes)):
                predictions = total_votes[index].predictions
                predictions.extend(votes[index].predictions)
        else:
            total_votes = votes
    message = u.dated("Combining predictions.\n")
    u.log_message(message, log_file=session_file, console=verbosity)
    # combine the votes of all models per row and write the final prediction
    for multivote in total_votes:
        u.write_prediction(multivote.combine(method), output)
def local_batch_predict(models, test_reader, prediction_file, api, max_models=MAX_MODELS, resume=False, output_path=None, output=None, verbosity=True, method=PLURALITY_CODE, session_file=None, debug=False, prediction_info=None): """Get local predictions form partial Multimodel, combine and save to file """ def draw_progress_bar(current, total): """Draws a text based progress report. """ pct = 100 - ((total - current) * 100) / (total) console_log("Predicted on %s out of %s models [%s%%]" % ( localize(current), localize(total), pct)) test_set_header = test_reader.has_headers() if output_path is None: output_path = u.check_dir(prediction_file) if output is None: try: output = open(prediction_file, 'w', 0) except IOError: raise IOError("Failed to write in %s" % prediction_file) models_total = len(models) models_splits = [models[index:(index + max_models)] for index in range(0, models_total, max_models)] input_data_list = [] raw_input_data_list = [] for input_data in test_reader: raw_input_data_list.append(input_data) input_data_list.append(test_reader.dict(input_data)) total_votes = [] models_count = 0 for models_split in models_splits: if resume: for model in models_split: pred_file = get_predictions_file_name(model, output_path) c.checkpoint(c.are_predictions_created, pred_file, test_reader.number_of_tests(), debug=debug) complete_models = [] for index in range(len(models_split)): model = models_split[index] if (isinstance(model, basestring) or bigml.api.get_status(model)['code'] != bigml.api.FINISHED): try: model = u.check_resource(model, api.get_model, FIELDS_QS) except ValueError, exception: sys.exit("Failed to get model: %s" % (model, str(exception))) complete_models.append(model) local_model = MultiModel(complete_models) local_model.batch_predict(input_data_list, output_path, by_name=test_set_header, reuse=True) votes = local_model.batch_votes(output_path) models_count += max_models if models_count > models_total: models_count = models_total if verbosity: 
draw_progress_bar(models_count, models_total) if total_votes: for index in range(0, len(votes)): predictions = total_votes[index].predictions predictions.extend(votes[index].predictions) else: total_votes = votes
def local_batch_predict(models, test_reader, prediction_file, api, args,
                        resume=False, output_path=None, output=None,
                        method=PLURALITY_CODE, options=None,
                        session_file=None, labels=None, ordered=True,
                        exclude=None, models_per_label=1,
                        other_label=OTHER, multi_label_data=None):
    """Get local predictions form partial Multimodel, combine and save to
       file

       Models are processed in slots of at most `args.max_batch_models`
       models; each slot's votes are merged into `total_votes`, which are
       combined per input row and written to `output`.
    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.
        """
        # NOTE: Python 2 integer division keeps pct an int
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" %
                    (localize(current), localize(total), pct), reset=True)

    max_models = args.max_batch_models
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            # third arg 0 requests unbuffered writes (Python 2 semantics)
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [
        models[index:(index + max_models)]
        for index in range(0, models_total, max_models)
    ]
    # Input data is stored as a list and predictions are made for all rows
    # with each model
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
    total_votes = []
    models_order = []
    models_count = 0
    single_model = models_total == 1
    # single models can use the lighter fields query string
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    # processing the models in slots
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created, pred_file,
                             test_reader.number_of_tests(), debug=args.debug)
        # retrieving the full models allowed by --max-batch-models to be used
        # in a multimodel slot
        complete_models, models_order = retrieve_models_split(
            models_split, api, query_string=query_string, labels=labels,
            multi_label_data=multi_label_data, ordered=ordered,
            models_order=models_order)
        # predicting with the multimodel slot
        if complete_models:
            local_model = MultiModel(complete_models, api=api)
            # added to ensure garbage collection at each step of the loop
            gc.collect()
            try:
                # with args.fast, batch_predict returns the votes directly
                # (to_file=False); otherwise they are read back from disk
                votes = local_model.batch_predict(
                    raw_input_data_list, output_path,
                    by_name=test_set_header, reuse=True,
                    missing_strategy=args.missing_strategy,
                    headers=test_reader.raw_headers,
                    to_file=(not args.fast),
                    use_median=args.median)
            except ImportError:
                sys.exit("Failed to find the numpy and scipy libraries needed"
                         " to use proportional missing strategy for"
                         " regressions. Please, install them manually")
            # extending the votes for each input data with the new model-slot
            # predictions
            if not args.fast:
                votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if args.verbosity:
                draw_progress_bar(models_count, models_total)
            if total_votes:
                for index in range(0, len(votes)):
                    # presumably total_votes items are MultiVote objects
                    # whose extend() appends prediction dicts — TODO confirm
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes
    if not single_model:
        message = u.dated("Combining predictions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
    # combining the votes to issue the final prediction for each input data
    for index in range(0, len(total_votes)):
        multivote = total_votes[index]
        input_data = raw_input_data_list[index]
        if single_model:
            # single model predictions need no combination
            prediction = [
                multivote.predictions[0]['prediction'],
                multivote.predictions[0]['confidence']
            ]
        elif method == AGGREGATION:
            # multi-labeled fields: predictions are concatenated
            prediction = aggregate_multivote(
                multivote, options, labels, models_per_label, ordered,
                models_order, label_separator=args.label_separator)
        elif method == COMBINATION:
            # used in --max-categories flag: each model slot contains a
            # subset of categories and the predictions for all of them
            # are combined in a global distribution to obtain the final
            # prediction
            prediction = combine_multivote(multivote,
                                           other_label=other_label)
        else:
            prediction = multivote.combine(method=method,
                                           with_confidence=True,
                                           options=options)
        write_prediction(prediction, output, args.prediction_info,
                         input_data, exclude)
def local_batch_predict(models, test_reader, prediction_file, api,
                        max_models=MAX_MODELS, resume=False,
                        output_path=None, output=None, verbosity=True,
                        method=PLURALITY_CODE, options=None,
                        session_file=None, debug=False,
                        prediction_info=NORMAL_FORMAT, labels=None,
                        label_separator=None, ordered=True, exclude=None,
                        models_per_label=1, other_label=OTHER,
                        multi_label_data=None):
    """Get local predictions form partial Multimodel, combine and save to
       file

       Models are processed in slots of at most `max_models` models.
       For multi-label predictions, models can be filtered by the labels
       the user selected.
    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.
        """
        # NOTE: Python 2 integer division keeps pct an int
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct))
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            # third arg 0 requests unbuffered writes (Python 2 semantics)
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [models[index:(index + max_models)] for index
                     in range(0, models_total, max_models)]
    # read the input data once; each model slot predicts on all rows
    input_data_list = []
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
        input_data_list.append(test_reader.dict(input_data))
    total_votes = []
    models_count = 0
    if not ordered:
        # keep track of each model's label column when order is unknown
        models_order = []
    single_model = models_total == 1
    # single models can use the lighter fields query string
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created, pred_file,
                             test_reader.number_of_tests(), debug=debug)
        complete_models = []
        for index in range(len(models_split)):
            model = models_split[index]
            # download the full model unless we already hold a finished one
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] !=
                    bigml.api.FINISHED):
                try:
                    model = u.check_resource(model, api.get_model,
                                             query_string)
                except ValueError, exception:
                    sys.exit("Failed to get model: %s. %s" %
                             (model, str(exception)))
            # When user selects the labels in multi-label predictions, we must
            # filter the models that will be used to predict
            if labels:
                objective_column = str(multi_label_data['objective_column'])
                labels_info = multi_label_data[
                    'generated_fields'][objective_column]
                labels_columns = [label_info[1] for label_info in labels_info
                                  if label_info[0] in labels]
                model_objective_id = model['object']['objective_fields'][0]
                model_fields = model['object']['model']['fields']
                model_objective = model_fields[model_objective_id]
                model_column = model_objective['column_number']
                if (model_column in labels_columns):
                    # When the list of models comes from a --model-tag
                    # selection, the models are not retrieved in the same
                    # order they were created. We must keep track of the
                    # label they are associated with to label their
                    # predictions properly
                    if not ordered:
                        models_order.append(model_column)
                    complete_models.append(model)
            else:
                complete_models.append(model)
        if complete_models:
            local_model = MultiModel(complete_models)
            local_model.batch_predict(input_data_list, output_path,
                                      by_name=test_set_header, reuse=True)
            votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if verbosity:
                draw_progress_bar(models_count, models_total)
            # merge this slot's votes into the accumulated per-row votes
            if total_votes:
                for index in range(0, len(votes)):
                    # presumably total_votes items are MultiVote objects
                    # whose extend() appends prediction dicts — TODO confirm
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes
def local_batch_predict(models, test_reader, prediction_file, api, args,
                        resume=False, output_path=None, output=None,
                        method=PLURALITY_CODE, options=None,
                        session_file=None, labels=None, ordered=True,
                        exclude=None, models_per_label=1,
                        other_label=OTHER, multi_label_data=None):
    """Get local predictions form partial Multimodel, combine and save to
       file

       Models are processed in slots of at most `args.max_batch_models`
       models; each slot's votes are merged into `total_votes`, which are
       combined per input row and written to `output`.
    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.
        """
        # NOTE: Python 2 integer division keeps pct an int
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct), reset=True)

    max_models = args.max_batch_models
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            # third arg 0 requests unbuffered writes (Python 2 semantics)
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [models[index:(index + max_models)]
                     for index in range(0, models_total, max_models)]
    # Input data is stored as a list and predictions are made for all rows
    # with each model
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
    total_votes = []
    models_order = []
    models_count = 0
    single_model = models_total == 1
    # single models can use the lighter fields query string
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    # processing the models in slots
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created, pred_file,
                             test_reader.number_of_tests(), debug=args.debug)
        # retrieving the full models allowed by --max-batch-models to be used
        # in a multimodel slot
        complete_models, models_order = retrieve_models_split(
            models_split, api, query_string=query_string, labels=labels,
            multi_label_data=multi_label_data, ordered=ordered,
            models_order=models_order)
        # predicting with the multimodel slot
        if complete_models:
            local_model = MultiModel(complete_models, api=api)
            # added to ensure garbage collection at each step of the loop
            gc.collect()
            try:
                # with args.fast, batch_predict returns the votes directly
                # (to_file=False); otherwise they are read back from disk
                votes = local_model.batch_predict(
                    raw_input_data_list, output_path,
                    by_name=test_set_header, reuse=True,
                    missing_strategy=args.missing_strategy,
                    headers=test_reader.raw_headers,
                    to_file=(not args.fast),
                    use_median=args.median)
            except ImportError:
                sys.exit("Failed to find the numpy and scipy libraries needed"
                         " to use proportional missing strategy for"
                         " regressions. Please, install them manually")
            # extending the votes for each input data with the new model-slot
            # predictions
            if not args.fast:
                votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if args.verbosity:
                draw_progress_bar(models_count, models_total)
            if total_votes:
                for index in range(0, len(votes)):
                    # presumably total_votes items are MultiVote objects
                    # whose extend() appends prediction dicts — TODO confirm
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes
    if not single_model:
        message = u.dated("Combining predictions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
    # combining the votes to issue the final prediction for each input data
    for index in range(0, len(total_votes)):
        multivote = total_votes[index]
        input_data = raw_input_data_list[index]
        if single_model:
            # single model predictions need no combination
            prediction = [multivote.predictions[0]['prediction'],
                          multivote.predictions[0]['confidence']]
        elif method == AGGREGATION:
            # multi-labeled fields: predictions are concatenated
            prediction = aggregate_multivote(
                multivote, options, labels, models_per_label, ordered,
                models_order, label_separator=args.label_separator)
        elif method == COMBINATION:
            # used in --max-categories flag: each model slot contains a
            # subset of categories and the predictions for all of them
            # are combined in a global distribution to obtain the final
            # prediction
            prediction = combine_multivote(multivote,
                                           other_label=other_label)
        else:
            prediction = multivote.combine(method=method,
                                           with_confidence=True,
                                           options=options)
        write_prediction(prediction, output, args.prediction_info,
                         input_data, exclude)
def local_batch_predict(models, test_reader, prediction_file, api, max_models=MAX_MODELS, resume=False, output_path=None, output=None, verbosity=True, method=PLURALITY_CODE, session_file=None, debug=False, prediction_info=None): """Get local predictions form partial Multimodel, combine and save to file """ def draw_progress_bar(current, total): """Draws a text based progress report. """ pct = 100 - ((total - current) * 100) / (total) console_log("Predicted on %s out of %s models [%s%%]" % (localize(current), localize(total), pct)) test_set_header = test_reader.has_headers() if output_path is None: output_path = u.check_dir(prediction_file) if output is None: try: output = open(prediction_file, 'w', 0) except IOError: raise IOError("Failed to write in %s" % prediction_file) models_total = len(models) models_splits = [ models[index:(index + max_models)] for index in range(0, models_total, max_models) ] input_data_list = [] raw_input_data_list = [] for input_data in test_reader: raw_input_data_list.append(input_data) input_data_list.append(test_reader.dict(input_data)) total_votes = [] models_count = 0 for models_split in models_splits: if resume: for model in models_split: pred_file = get_predictions_file_name(model, output_path) c.checkpoint(c.are_predictions_created, pred_file, test_reader.number_of_tests(), debug=debug) complete_models = [] for index in range(len(models_split)): model = models_split[index] if (isinstance(model, basestring) or bigml.api.get_status(model)['code'] != bigml.api.FINISHED): try: model = u.check_resource(model, api.get_model, FIELDS_QS) except ValueError, exception: sys.exit("Failed to get model: %s" % (model, str(exception))) complete_models.append(model) local_model = MultiModel(complete_models) local_model.batch_predict(input_data_list, output_path, by_name=test_set_header, reuse=True) votes = local_model.batch_votes(output_path) models_count += max_models if models_count > models_total: models_count = models_total if verbosity: 
draw_progress_bar(models_count, models_total) if total_votes: for index in range(0, len(votes)): predictions = total_votes[index].predictions predictions.extend(votes[index].predictions) else: total_votes = votes
def local_batch_predict(models, test_reader, prediction_file, api, args,
                        resume=False, output_path=None, output=None,
                        method=PLURALITY_CODE, options=None,
                        session_file=None, labels=None, ordered=True,
                        exclude=None, models_per_label=1,
                        other_label=OTHER, multi_label_data=None):
    """Get local predictions form partial Multimodel, combine and save to
       file

       Models are processed in slots of at most `args.max_batch_models`
       models. For multi-label predictions, models can be filtered by the
       labels the user selected.
    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.
        """
        # NOTE: Python 2 integer division keeps pct an int
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" %
                    (localize(current), localize(total), pct))
    max_models = args.max_batch_models
    label_separator = args.label_separator
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            # third arg 0 requests unbuffered writes (Python 2 semantics)
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [
        models[index:(index + max_models)]
        for index in range(0, models_total, max_models)
    ]
    # read the input data once; each model slot predicts on all rows
    input_data_list = []
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
        input_data_list.append(test_reader.dict(input_data))
    total_votes = []
    models_count = 0
    if not ordered:
        # keep track of each model's label column when order is unknown
        models_order = []
    single_model = models_total == 1
    # single models can use the lighter fields query string
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created, pred_file,
                             test_reader.number_of_tests(),
                             debug=args.debug)
        complete_models = []
        for index in range(len(models_split)):
            model = models_split[index]
            # download the full model unless we already hold a finished one
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] !=
                    bigml.api.FINISHED):
                try:
                    model = u.check_resource(model, api.get_model,
                                             query_string)
                except ValueError, exception:
                    sys.exit("Failed to get model: %s. %s" %
                             (model, str(exception)))
            # When user selects the labels in multi-label predictions, we must
            # filter the models that will be used to predict
            if labels:
                objective_column = str(multi_label_data['objective_column'])
                labels_info = multi_label_data['generated_fields'][
                    objective_column]
                labels_columns = [
                    label_info[1] for label_info in labels_info
                    if label_info[0] in labels
                ]
                model_objective_id = model['object']['objective_fields'][0]
                model_fields = model['object']['model']['fields']
                model_objective = model_fields[model_objective_id]
                model_column = model_objective['column_number']
                if (model_column in labels_columns):
                    # When the list of models comes from a --model-tag
                    # selection, the models are not retrieved in the same
                    # order they were created. We must keep track of the
                    # label they are associated with to label their
                    # predictions properly
                    if not ordered:
                        models_order.append(model_column)
                    complete_models.append(model)
            else:
                complete_models.append(model)
        if complete_models:
            local_model = MultiModel(complete_models)
            try:
                local_model.batch_predict(
                    input_data_list, output_path,
                    by_name=test_set_header, reuse=True,
                    missing_strategy=args.missing_strategy)
            except ImportError:
                sys.exit("Failed to find the numpy and scipy libraries needed"
                         " to use proportional missing strategy for"
                         " regressions. Please, install them manually")
            votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if args.verbosity:
                draw_progress_bar(models_count, models_total)
            # merge this slot's votes into the accumulated per-row votes
            if total_votes:
                for index in range(0, len(votes)):
                    # presumably total_votes items are MultiVote objects
                    # whose extend() appends prediction dicts — TODO confirm
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes