def generate_votes(self, input_data, by_name=True,
                   missing_strategy=LAST_PREDICTION,
                   add_median=False, add_min=False, add_max=False,
                   add_unused_fields=False):
    """Collects the prediction of every model into a MultiVote object.

    Each model in `self.models` is asked for its prediction (always
    requesting confidence, distribution and count; median, min, max and
    unused fields on demand). For models that carry boosting information
    the vote is extended with the boosting "weight" and, when available,
    the boosting "objective_class" (stored under the "class" key), and the
    resulting MultiVote is flagged as a boosting one.
    """
    predict_args = {
        "by_name": by_name,
        "add_confidence": True,
        "add_distribution": True,
        "add_count": True,
        "add_median": add_median,
        "add_min": add_min,
        "add_max": add_max,
        "add_unused_fields": add_unused_fields,
        "missing_strategy": missing_strategy,
    }
    multivote = MultiVote([])
    for model in self.models:
        vote = model.predict(input_data, **predict_args)
        boosting = model.boosting
        if boosting is not None:
            multivote.boosting = True
            extra_info = {"weight": boosting.get("weight")}
            objective_class = boosting.get("objective_class")
            if objective_class is not None:
                extra_info["class"] = objective_class
            vote.update(extra_info)
        multivote.append(vote)
    return multivote
def predict(self, input_data, by_name=True, method=PLURALITY_CODE):
    """Combines the predictions of every model into a single prediction.

    :param input_data: Test data to be used as input
    :param by_name: Boolean that is set to True if field_names (as
                    alternative to field ids) are used in the
                    input_data dict
    :param method: numeric key code for the following combination
                   methods in classifications/regressions:
          0 - majority vote (plurality)/ average: PLURALITY_CODE
          1 - confidence weighted majority vote / error weighted:
              CONFIDENCE_CODE
          2 - probability weighted majority vote / average:
              PROBABILITY_CODE
    """
    # The formatted input is expanded as positional arguments of every
    # predict function
    input_data_array = self.format_input_data(input_data, by_name=by_name)
    votes_split = [fun(*input_data_array) for fun in self.predict_functions]
    votes = MultiVote(votes_split, boosting_offsets=self.boosting_offsets)

    options = None
    if not (self.boosting is None or self.regression):
        # boosted classifications need the objective field categories
        summary = self.fields[self.objective_id]["summary"]
        options = {
            "categories": [category[0] for category in summary["categories"]]
        }
    return votes.combine(method=method, options=options)
def predict(self, input_data, by_name=True, method=PLURALITY_CODE,
            with_confidence=False, options=None):
    """Combines the predictions of every model into a single prediction.

    The method parameter is a numeric key to the following combination
    methods in classifications/regressions:
          0 - majority vote (plurality)/ average: PLURALITY_CODE
          1 - confidence weighted majority vote / error weighted:
              CONFIDENCE_CODE
          2 - probability weighted majority vote / average:
              PROBABILITY_CODE
          3 - threshold filtered vote / doesn't apply:
              THRESHOLD_CODE
    """
    if len(self.models_splits) <= 1:
        # A single group of models: the multimodel already built for it
        # generates all the votes
        single_votes = self.multi_model.generate_votes(input_data,
                                                       by_name=by_name)
        votes = MultiVote(single_votes.predictions)
    else:
        # Several chunks of models: each chunk is retrieved and used in
        # turn, and its votes are accumulated
        votes = MultiVote([])
        for models_split in self.models_splits:
            models = []
            for model_id in models_split:
                models.append(retrieve_resource(self.api, model_id,
                                                query_string=ONLY_MODEL))
            chunk_votes = MultiModel(models, api=self.api).generate_votes(
                input_data, by_name=by_name)
            votes.extend(chunk_votes.predictions)

    return votes.combine(method=method, with_confidence=with_confidence,
                         options=options)
def aggregate_multivote(multivote, options, labels, models_per_label,
                        ordered, models_order, label_separator=None):
    """Aggregate the model's predictions for multi-label fields in a
    concatenated format into a final prediction

    :param multivote: object whose `predictions` attribute is the list of
                      per-model votes (dicts with 'prediction' and
                      'confidence' keys)
    :param options: options forwarded to the per-label vote combination
    :param labels: list of labels; there must be exactly
                   `models_per_label` votes for each of them
    :param models_per_label: number of models voting for each label
    :param ordered: when True and there is one model per label, the votes
                    are used in reverse order
    :param models_order: sort keys used to order the votes otherwise
    :param label_separator: separator used to join labels and confidences
                            (defaults to ",")
    :return: a two-element list: the joined labels predicted as true and
             their joined confidences

    The process is terminated through `sys.exit` when the labels
    information does not match the number of votes.
    """
    if label_separator is None:
        label_separator = ","

    if ordered and models_per_label == 1:
        # as multi-labeled models are created from end to start votes
        # must be reversed to match. A reversed copy is used so that the
        # caller's multivote is left untouched (an in-place reverse would
        # flip the result of every other call on the same object)
        predictions = list(reversed(multivote.predictions))
    else:
        predictions = [
            prediction for (_, prediction)
            in sorted(zip(models_order, multivote.predictions),
                      key=lambda x: x[0])
        ]

    if (labels is None or
            len(labels) * models_per_label != len(predictions)):
        sys.exit("Failed to make a multi-label prediction. No"
                 " valid label info is found.")

    # In the following case, we must vote each label using the models
    # in the ensemble and the chosen method
    if models_per_label > 1:
        label_predictions = [
            predictions[i:i + models_per_label]
            for i in range(0, len(predictions), models_per_label)
        ]
        predictions = []
        for label_prediction in label_predictions:
            label_multivote = MultiVote(label_prediction)
            prediction_info = label_multivote.combine(method=AGGREGATION,
                                                      full=True,
                                                      options=options)
            predictions.append({
                'prediction': prediction_info["prediction"],
                'confidence': prediction_info["confidence"]
            })

    prediction_list = []
    confidence_list = []
    for vote_index, vote_prediction in enumerate(predictions):
        # predictions are stored as strings holding a boolean literal
        if ast.literal_eval(vote_prediction['prediction']):
            prediction_list.append(labels[vote_index])
            confidence_list.append(str(vote_prediction['confidence']))

    return [
        label_separator.join(prediction_list),
        label_separator.join(confidence_list)
    ]
def _generate_votes(self, input_data, missing_strategy=LAST_PREDICTION,
                    unused_fields=None):
    """Collects the prediction of every model into a MultiVote object.

    Please note that the models' `_predict` method is used, which assumes
    that input data has been properly checked against the model fields.
    Only casting to the correct type will be applied.

    For models that carry boosting information the vote is extended with
    the boosting "weight" and, when available, the boosting
    "objective_class" (stored under the "class" key), and the resulting
    MultiVote is flagged as a boosting one.
    """
    multivote = MultiVote([])
    for model in self.models:
        vote = model._predict(input_data,
                              missing_strategy=missing_strategy,
                              unused_fields=unused_fields)
        boosting = model.boosting
        if boosting is not None:
            multivote.boosting = True
            extra_info = {"weight": boosting.get("weight")}
            objective_class = boosting.get("objective_class")
            if objective_class is not None:
                extra_info["class"] = objective_class
            vote.update(extra_info)
        multivote.append(vote)
    return multivote
def predict(self, input_data, method=PLURALITY_CODE, full=False):
    """Combines the predictions of every model into a single prediction.

    :param input_data: Test data to be used as input
    :param method: numeric key code for the following combination
                   methods in classifications/regressions:
          0 - majority vote (plurality)/ average: PLURALITY_CODE
          1 - confidence weighted majority vote / error weighted:
              CONFIDENCE_CODE
          2 - probability weighted majority vote / average:
              PROBABILITY_CODE
    :param full: forwarded to the votes combination; when the combined
                 result is a dictionary its 'count' key is removed
    """
    votes_split = []
    # every vote is tagged with its 1-based position and a unit count
    for position, fun in enumerate(self.predict_functions, 1):
        vote = fun(input_data)
        vote.update({"order": position, "count": 1})
        votes_split.append(vote)
    votes = MultiVote(votes_split, boosting_offsets=self.boosting_offsets)

    options = None
    if not (self.boosting is None or self.regression):
        # boosted classifications need the objective field categories
        summary = self.fields[self.objective_id]["summary"]
        options = {
            "categories": [category[0] for category in summary["categories"]]
        }

    result = votes.combine(method=method, options=options, full=full)
    if isinstance(result, dict):
        del result['count']
    return result
def generate_votes(self, input_data, by_name=True):
    """Collects the prediction of every model into a MultiVote object.

    Each model contributes a row with its prediction, confidence,
    position in `self.models`, distribution and instances.
    """
    multivote = MultiVote([])
    for position, model in enumerate(self.models):
        (prediction, confidence,
         distribution, instances) = model.predict(input_data,
                                                  by_name=by_name,
                                                  with_confidence=True)
        multivote.append_row(
            [prediction, confidence, position, distribution, instances])
    return multivote
def generate_probability_votes(self, input_data, by_name=True,
                               missing_strategy=LAST_PREDICTION,
                               method=PROBABILITY_CODE):
    """Generates a MultiVote object that contains one per-class list of
    values for each of the models.

    :param input_data: Test data to be used as input
    :param by_name: forwarded to the models' prediction methods
    :param missing_strategy: forwarded to the models' prediction methods
    :param method: decides which values are used as votes:
          PROBABILITY_CODE - the model's `predict_probability` output
          CONFIDENCE_CODE - the model's `predict_confidence` output
          PLURALITY_CODE - a list with 1.0 in the position of the
                           predicted class (according to
                           `self.class_names`) and 0.0 elsewhere

    Any other `method` raises a ValueError.

    When a model raises AttributeError or TypeError, the prediction is
    retried without the `missing_strategy` argument; in that fallback
    every method other than PLURALITY_CODE uses `predict_probability`.
    """
    votes = MultiVote([])
    for model in self.models:
        model.class_names = self.class_names
        votes.probabilities = True
        try:
            if method == PROBABILITY_CODE:
                prediction_info = model.predict_probability(
                    input_data, by_name=by_name, compact=True,
                    missing_strategy=missing_strategy)
            elif method == CONFIDENCE_CODE:
                prediction_info = model.predict_confidence(
                    input_data, by_name=by_name, compact=True,
                    missing_strategy=missing_strategy)
            elif method == PLURALITY_CODE:
                prediction_info = [0.0] * len(self.class_names)
                prediction = model.predict(
                    input_data, by_name=by_name,
                    missing_strategy=missing_strategy)
                prediction_info[self.class_names.index(prediction)] = 1.0
            else:
                # %s (not %d) is used on purpose: formatting a non-numeric
                # method with %d raises a TypeError that the handler below
                # would swallow, hiding the invalid method
                raise ValueError('%s is not a valid "method"' % method)
        except (AttributeError, TypeError):
            # fallback for models that do not support the previous calls
            if method == PLURALITY_CODE:
                prediction_info = [0.0] * len(self.class_names)
                prediction = model.predict(input_data, by_name=by_name)
                prediction_info[self.class_names.index(prediction)] = 1.0
            else:
                prediction_info = model.predict_probability(
                    input_data, by_name=by_name, compact=True)
        votes.append(prediction_info)
    return votes
def aggregate_multivote(multivote, options, labels, models_per_label,
                        ordered, models_order, label_separator=None):
    """Aggregate the model's predictions for multi-label fields in a
    concatenated format into a final prediction

    :param multivote: object whose `predictions` attribute is the list of
                      per-model votes (dicts with 'prediction' and
                      'confidence' keys)
    :param options: options forwarded to the per-label vote combination
    :param labels: list of labels; there must be exactly
                   `models_per_label` votes for each of them
    :param models_per_label: number of models voting for each label
    :param ordered: when True and there is one model per label, the votes
                    are used in reverse order
    :param models_order: sort keys used to order the votes otherwise
    :param label_separator: separator used to join labels and confidences
                            (defaults to ",")
    :return: a two-element list: the joined labels predicted as true and
             their joined confidences

    The process is terminated through `sys.exit` when the labels
    information does not match the number of votes.
    """
    if label_separator is None:
        label_separator = ","

    if ordered and models_per_label == 1:
        # as multi-labeled models are created from end to start votes
        # must be reversed to match. A reversed copy is used so that the
        # caller's multivote is left untouched (an in-place reverse would
        # flip the result of every other call on the same object)
        predictions = list(reversed(multivote.predictions))
    else:
        predictions = [prediction for (_, prediction)
                       in sorted(zip(models_order, multivote.predictions),
                                 key=lambda x: x[0])]

    if (labels is None or
            len(labels) * models_per_label != len(predictions)):
        sys.exit("Failed to make a multi-label prediction. No"
                 " valid label info is found.")

    # In the following case, we must vote each label using the models
    # in the ensemble and the chosen method
    if models_per_label > 1:
        label_predictions = [predictions[i: i + models_per_label]
                             for i in range(0, len(predictions),
                                            models_per_label)]
        predictions = []
        for label_prediction in label_predictions:
            label_multivote = MultiVote(label_prediction)
            prediction, confidence = label_multivote.combine(
                method=AGGREGATION, with_confidence=True, options=options)
            predictions.append({'prediction': prediction,
                                'confidence': confidence})

    prediction_list = []
    confidence_list = []
    for vote_index, vote_prediction in enumerate(predictions):
        # predictions are stored as strings holding a boolean literal
        if ast.literal_eval(vote_prediction['prediction']):
            prediction_list.append(labels[vote_index])
            confidence_list.append(str(vote_prediction['confidence']))

    return [label_separator.join(prediction_list),
            label_separator.join(confidence_list)]
def i_create_a_multivote(step, predictions_file):
    """Builds a MultiVote from the JSON predictions stored in the given
    resource file and keeps it in `world.multivote`.

    The step fails when the file cannot be read.
    """
    predictions_file = res_filename(predictions_file)
    try:
        # the handler gets its own name so that the failure message below
        # always reports the file path
        with open(predictions_file, 'r') as predictions_handler:
            world.multivote = MultiVote(json.load(predictions_handler))
    except IOError:
        assert False, "Failed to read %s" % predictions_file
def generate_votes(self, input_data, by_name=True,
                   missing_strategy=LAST_PREDICTION):
    """Collects the prediction of every model into a MultiVote object.

    Each model contributes a row with its prediction, confidence,
    position in `self.models`, distribution and instances.
    """
    multivote = MultiVote([])
    for position, model in enumerate(self.models):
        (prediction, confidence,
         distribution, instances) = model.predict(
             input_data, by_name=by_name, with_confidence=True,
             missing_strategy=missing_strategy)
        multivote.append_row(
            [prediction, confidence, position, distribution, instances])
    return multivote
def generate_votes(self, input_data, by_name=True,
                   missing_strategy=LAST_PREDICTION, add_median=False):
    """Collects the prediction of every model into a MultiVote object.

    Every model is asked for its prediction together with its confidence,
    distribution and count (and its median when `add_median` is set).
    """
    predict_args = {
        "by_name": by_name,
        "add_confidence": True,
        "add_distribution": True,
        "add_count": True,
        "add_median": add_median,
        "missing_strategy": missing_strategy,
    }
    multivote = MultiVote([])
    for model in self.models:
        multivote.append(model.predict(input_data, **predict_args))
    return multivote
def predict(self, input_data, by_name=True, method=PLURALITY_CODE,
            with_confidence=False):
    """Combines the predictions of every model into a single prediction.

    The method parameter is a numeric key to the following combination
    methods in classifications/regressions:
          0 - majority vote (plurality)/ average: PLURALITY_CODE
          1 - confidence weighted majority vote / error weighted:
              CONFIDENCE_CODE
          2 - probability weighted majority vote / average:
              PROBABILITY_CODE
    """
    votes = MultiVote([])
    # every split of models is retrieved and used in turn, accumulating
    # its votes
    for models_split in self.models_splits:
        models = []
        for model_id in models_split:
            models.append(retrieve_model(self.api, model_id))
        split_votes = MultiModel(models).generate_votes(input_data,
                                                        by_name=by_name)
        votes.extend(split_votes.predictions)
    return votes.combine(method=method, with_confidence=with_confidence)
def generate_votes(self, input_data, missing_strategy=LAST_PREDICTION):
    """Collects the full prediction of every model into a MultiVote
    object.

    For models that carry boosting information the vote is extended with
    the boosting "weight" and, when available, the boosting
    "objective_class" (stored under the "class" key), and the resulting
    MultiVote is flagged as a boosting one.
    """
    multivote = MultiVote([])
    for model in self.models:
        vote = model.predict(input_data,
                             missing_strategy=missing_strategy,
                             full=True)
        boosting = model.boosting
        if boosting is not None:
            multivote.boosting = True
            extra_info = {"weight": boosting.get("weight")}
            objective_class = boosting.get("objective_class")
            if objective_class is not None:
                extra_info["class"] = objective_class
            vote.update(extra_info)
        multivote.append(vote)
    return multivote
def predict(self, input_data, by_name=True, method=PLURALITY_CODE):
    """Combines the predictions of every model into a single prediction.

    The method parameter is a numeric key to the following combination
    methods in classifications/regressions:
          0 - majority vote (plurality)/ average: PLURALITY_CODE
          1 - confidence weighted majority vote / error weighted:
              CONFIDENCE_CODE
          2 - probability weighted majority vote / average:
              PROBABILITY_CODE
    """
    multivote = MultiVote([])
    # one row per model: prediction, confidence, model position,
    # distribution and instances
    for position, model in enumerate(self.models):
        (prediction, confidence,
         distribution, instances) = model.predict(input_data,
                                                  by_name=by_name,
                                                  with_confidence=True)
        multivote.append_row(
            [prediction, confidence, position, distribution, instances])
    return multivote.combine(method=method)
def read_votes(votes_files, to_prediction, data_locale=None):
    """Builds the list of MultiVote objects stored in the votes' files.

    Every file in `votes_files` holds the predictions of one model, one
    row per input. The n-th MultiVote of the returned list gathers the
    n-th row of every file.

    `to_prediction` is expected to be the method of a local model object
    that casts the string prediction values read from the file to their
    real type. For instance

        >>> local_model = Model(model)
        >>> prediction = local_model.to_prediction("1")
        >>> isinstance(prediction, int)
        True
        >>> read_votes(["my_predictions_file"], local_model.to_prediction)

    `data_locale` should contain the string identification for the locale
    used in numeric formatting.
    """
    votes = []
    for order, votes_file in enumerate(votes_files):
        with UnicodeReader(votes_file) as rdr:
            for index, row in enumerate(rdr):
                prediction = to_prediction(row[0], data_locale=data_locale)
                if index >= len(votes):
                    # first file reaching this row: create its MultiVote
                    votes.append(MultiVote([]))
                if len(row) > 2:
                    distribution = ast.literal_eval(row[2])
                    instances = int(row[3])
                else:
                    distribution = None
                    instances = None
                try:
                    confidence = float(row[1])
                except ValueError:
                    # non-numeric confidences are read as zero
                    confidence = 0.0
                votes[index].append_row(
                    [prediction, confidence, order, distribution, instances])
    return votes
def batch_predict(self, input_data_list, output_file_path=None,
                  by_name=True, reuse=False,
                  missing_strategy=LAST_PREDICTION, headers=None,
                  to_file=True, use_median=False):
    """Makes predictions for a list of input data.

    When the to_file argument is set to True, the predictions
    generated for each model are stored in an output
    file. The name of the file will use the following syntax:
        model_[id of the model]__predictions.csv
    For instance, when using model/50c0de043b563519830001c2 to predict,
    the output file name will be
        model_50c0de043b563519830001c2__predictions.csv
    On the contrary, if it is False, the function returns a list
    of MultiVote objects with the model's predictions.

    :param input_data_list: list of inputs, either dictionaries or lists
                            of values matching `headers`
    :param output_file_path: path used to build the output file names
    :param by_name: forwarded to the models' predict method
    :param reuse: when True, models whose output file can already be
                  opened are skipped
    :param missing_strategy: forwarded to the models' predict method
    :param headers: names used as keys when the inputs are lists
    :param to_file: whether predictions are written to files or returned
    :param use_median: for regression models, the last element of the
                       prediction row replaces the prediction
    :raises ValueError: when the inputs are neither dictionaries nor
                        lists consistent with `headers`
    :raises Exception: when the output file cannot be created
    """
    add_headers = (isinstance(input_data_list[0], list) and
                   headers is not None and
                   len(headers) == len(input_data_list[0]))
    if not add_headers and not isinstance(input_data_list[0], dict):
        raise ValueError("Input data list is not a dictionary or the"
                         " headers and input data information are not"
                         " consistent.")
    order = 0
    if not to_file:
        votes = []

    for model in self.models:
        order += 1
        out = None
        if to_file:
            output_file = get_predictions_file_name(model.resource_id,
                                                    output_file_path)
            if reuse:
                # a readable output file means that the model's
                # predictions are already available
                try:
                    with open(output_file):
                        pass
                except IOError:
                    pass
                else:
                    continue
            try:
                out = UnicodeWriter(output_file)
            except IOError:
                raise Exception("Cannot find %s directory." %
                                output_file_path)

        if out:
            out.open_writer()
        try:
            for index, input_data in enumerate(input_data_list):
                if add_headers:
                    input_data = dict(zip(headers, input_data))
                prediction = model.predict(
                    input_data, by_name=by_name, with_confidence=True,
                    missing_strategy=missing_strategy)
                if use_median and model.tree.regression:
                    # if median is to be used, we just place it as
                    # prediction starting the list
                    prediction[0] = prediction[-1]
                prediction = prediction[:-1]
                if to_file:
                    out.writerow(prediction)
                else:
                    # prediction is a row that contains prediction,
                    # confidence, distribution, instances
                    prediction_row = prediction[0:2]
                    prediction_row.append(order)
                    prediction_row.extend(prediction[2:])
                    if len(votes) <= index:
                        votes.append(MultiVote([]))
                    votes[index].append_row(prediction_row)
        finally:
            # the writer is closed even when a prediction fails
            if out:
                out.close_writer()

    if not to_file:
        return votes
if (labels is None or len(labels) * models_per_label != len(predictions)): sys.exit("Failed to make a multi-label prediction. No" " valid label info is found.") prediction_list = [] confidence_list = [] # In the following case, we must vote each label using the models # in the ensemble and the chosen method if models_per_label > 1: label_predictions = [predictions[i: i + models_per_label] for i in range(0, len(predictions), models_per_label)] predictions = [] for label_prediction in label_predictions: label_multivote = MultiVote(label_prediction) prediction, confidence = label_multivote.combine( method=method, with_confidence=True, options=options) predictions.append({'prediction': prediction, 'confidence': confidence}) for vote_index in range(0, len(predictions)): if ast.literal_eval(predictions[vote_index]['prediction']): prediction_list.append(labels[vote_index]) confidence = str(predictions[vote_index]['confidence']) confidence_list.append(confidence) prediction = [label_separator.join(prediction_list), label_separator.join(confidence_list)] elif method == COMBINATION: predictions = multivote.predictions global_distribution = [] for prediction in predictions:
def batch_predict(self, input_data_list, output_file_path=None,
                  by_name=True, reuse=False,
                  missing_strategy=LAST_PREDICTION, headers=None,
                  to_file=True):
    """Makes predictions for a list of input data.

    When the to_file argument is set to True, the predictions
    generated for each model are stored in an output
    file. The name of the file will use the following syntax:
        model_[id of the model]__predictions.csv
    For instance, when using model/50c0de043b563519830001c2 to predict,
    the output file name will be
        model_50c0de043b563519830001c2__predictions.csv
    On the contrary, if it is False, the function returns a list
    of MultiVote objects with the model's predictions.

    :param input_data_list: list of inputs, either dictionaries or lists
                            of values matching `headers`
    :param output_file_path: path used to build the output file names
    :param by_name: forwarded to the models' predict method
    :param reuse: when True, models whose output file can already be
                  opened are skipped
    :param missing_strategy: forwarded to the models' predict method
    :param headers: names used as keys when the inputs are lists
    :param to_file: whether predictions are written to files or returned
    :raises ValueError: when the inputs are neither dictionaries nor
                        lists consistent with `headers`
    :raises Exception: when the output file cannot be created
    """
    add_headers = (isinstance(input_data_list[0], list) and
                   headers is not None and
                   len(headers) == len(input_data_list[0]))
    if not add_headers and not isinstance(input_data_list[0], dict):
        raise ValueError("Input data list is not a dictionary or the"
                         " headers and input data information are not"
                         " consistent.")
    order = 0
    if not to_file:
        votes = []

    for model in self.models:
        order += 1
        output_handler = None
        if to_file:
            output_file = get_predictions_file_name(model.resource_id,
                                                    output_file_path)
            if reuse:
                # a readable output file means that the model's
                # predictions are already available
                try:
                    with open(output_file):
                        pass
                except IOError:
                    pass
                else:
                    continue
            try:
                output_handler = open(output_file, 'w', 0)
            except IOError:
                raise Exception("Cannot find %s directory." %
                                output_file_path)
            predictions_file = csv.writer(output_handler,
                                          lineterminator="\n")
        try:
            for index, input_data in enumerate(input_data_list):
                if add_headers:
                    input_data = dict(zip(headers, input_data))
                prediction = model.predict(
                    input_data, by_name=by_name, with_confidence=True,
                    missing_strategy=missing_strategy)
                if to_file:
                    if isinstance(prediction[0], basestring):
                        prediction[0] = prediction[0].encode("utf-8")
                    predictions_file.writerow(prediction)
                else:
                    prediction, confidence, distribution, instances = \
                        prediction
                    prediction_row = [prediction, confidence, order,
                                      distribution, instances]
                    if len(votes) <= index:
                        votes.append(MultiVote([]))
                    votes[index].append_row(prediction_row)
        finally:
            # the output file is closed even when a prediction fails
            if output_handler is not None:
                output_handler.close()

    if not to_file:
        return votes
or len(labels) * models_per_label != len(predictions)): sys.exit("Failed to make a multi-label prediction. No" " valid label info is found.") prediction_list = [] confidence_list = [] # In the following case, we must vote each label using the models # in the ensemble and the chosen method if models_per_label > 1: label_predictions = [ predictions[i:i + models_per_label] for i in range(0, len(predictions), models_per_label) ] predictions = [] for label_prediction in label_predictions: label_multivote = MultiVote(label_prediction) prediction, confidence = label_multivote.combine( method=method, with_confidence=True, options=options) predictions.append({ 'prediction': prediction, 'confidence': confidence }) for vote_index in range(0, len(predictions)): if ast.literal_eval(predictions[vote_index]['prediction']): prediction_list.append(labels[vote_index]) confidence = str(predictions[vote_index]['confidence']) confidence_list.append(confidence) prediction = [ label_separator.join(prediction_list), label_separator.join(confidence_list) ]
def predict(self, input_data, method=None,
            options=None, missing_strategy=LAST_PREDICTION,
            operating_point=None, operating_kind=None, median=False,
            full=False):
    """Makes a prediction based on the prediction made by every model.

    The way the individual predictions are combined is decided in this
    order of precedence: `operating_point`, `operating_kind` and finally
    the (deprecated) `method` combiner. When none of them is given and
    `median` is not set, `operating_kind` defaults to "probability".

    :param input_data: Test data to be used as input
    :param method: **deprecated**. Please check the `operating_kind`
                   attribute. Numeric key code for the following
                   combination methods in classifications/regressions:
          0 - majority vote (plurality)/ average: PLURALITY_CODE
          1 - confidence weighted majority vote / error weighted:
              CONFIDENCE_CODE
          2 - probability weighted majority vote / average:
              PROBABILITY_CODE
          3 - threshold filtered vote / doesn't apply:
              THRESHOLD_CODE
    :param options: Options to be used in threshold filtered votes.
    :param missing_strategy: numeric key for the individual model's
                             prediction method. See the model predict
                             method.
    :param operating_point: In classification models, this is the point of
                            the ROC curve where the model will be used at.
                            The operating point can be defined in terms of:
                              - the positive_class, the class that is
                                important to predict accurately
                              - its kind: probability, confidence or voting
                              - its threshold: the minimum established
                                for the positive_class to be predicted.
                            The operating_point is then defined as a
                            map with three attributes, e.g.:
                               {"positive_class": "Iris-setosa",
                                "kind": "probability",
                                "threshold": 0.5}
                            Using it in a regression raises a ValueError.
    :param operating_kind: "probability", "confidence" or "votes". Sets the
                           property that decides the prediction.
                           Used only if no operating_point is used
    :param median: Uses the median of each individual model's predicted
                   node as individual prediction for the specified
                   combination method. If no method is given,
                   PLURALITY_CODE is used.
    :param full: Boolean that controls whether to include the prediction's
                 attributes. By default, only the prediction is produced.
                 If set to True, the rest of available information is
                 added in a dictionary format. The dictionary keys can be:
                  - prediction: the prediction value
                  - confidence: prediction's confidence
                  - probability: prediction's probability
                  - path: rules that lead to the prediction
                  - count: number of training instances supporting the
                           prediction
                  - next: field to check in the next split
                  - min: minimum value of the training instances in the
                         predicted node
                  - max: maximum value of the training instances in the
                         predicted node
                  - median: median of the values of the training instances
                            in the predicted node
                  - unused_fields: list of fields in the input data that
                                   are not being used in the model
    """

    # Checks and cleans input_data leaving the fields used in the model
    new_data = self.filter_input_data( \
        input_data,
        add_unused_fields=full)
    unused_fields = None
    # when unused fields are requested, the filter returns a pair
    if full:
        input_data, unused_fields = new_data
    else:
        input_data = new_data

    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)

    if median and method is None:
        # predictions with median are only available with old combiners
        method = PLURALITY_CODE

    if method is None and operating_point is None and \
        operating_kind is None and not median:
        # operating_point has precedence over operating_kind. If no
        # combiner is set, default operating kind is "probability"
        operating_kind = "probability"

    if operating_point:
        if self.regression:
            raise ValueError("The operating_point argument can only be"
                             " used in classifications.")
        prediction = self.predict_operating( \
            input_data,
            missing_strategy=missing_strategy,
            operating_point=operating_point)
        if full:
            return prediction
        return prediction["prediction"]

    if operating_kind:
        if self.regression:
            # for regressions, operating_kind defaults to the old
            # combiners
            method = 1 if operating_kind == "confidence" else 0
            # NOTE(review): `median` is not forwarded to this recursive
            # call -- confirm that this is intended
            return self.predict( \
                input_data, method=method,
                options=options, missing_strategy=missing_strategy,
                operating_point=None, operating_kind=None, full=full)
        # NOTE(review): the result is returned as is, whatever the value
        # of `full` -- confirm against predict_operating_kind's output
        prediction = self.predict_operating_kind( \
            input_data,
            missing_strategy=missing_strategy,
            operating_kind=operating_kind)
        return prediction

    if len(self.models_splits) > 1:
        # If there's more than one chunk of models, they must be
        # sequentially used to generate the votes for the prediction
        votes = MultiVote([], boosting_offsets=self.boosting_offsets)

        for models_split in self.models_splits:
            models = self._get_models(models_split)
            multi_model = MultiModel(models,
                                     api=self.api,
                                     fields=self.fields)

            votes_split = multi_model._generate_votes(
                input_data,
                missing_strategy=missing_strategy,
                unused_fields=unused_fields)
            if median:
                # the median replaces each model's prediction
                for prediction in votes_split.predictions:
                    prediction['prediction'] = prediction['median']
            votes.extend(votes_split.predictions)
    else:
        # When only one group of models is found you use the
        # corresponding multimodel to predict
        votes_split = self.multi_model._generate_votes(
            input_data, missing_strategy=missing_strategy,
            unused_fields=unused_fields)

        votes = MultiVote(votes_split.predictions,
                          boosting_offsets=self.boosting_offsets)
        if median:
            # the median replaces each model's prediction
            for prediction in votes.predictions:
                prediction['prediction'] = prediction['median']

    if self.boosting is not None and not self.regression:
        # boosted classifications replace any given options with the
        # objective field categories
        categories = [ \
            d[0] for d in
            self.fields[self.objective_id]["summary"]["categories"]]
        options = {"categories": categories}
    result = votes.combine(method=method, options=options, full=full)
    if full:
        # only the input fields reported as unused by every single vote
        # are kept as unused fields
        unused_fields = set(input_data.keys())
        for prediction in votes.predictions:
            unused_fields = unused_fields.intersection( \
                set(prediction.get("unused_fields", [])))
        if not isinstance(result, dict):
            result = {"prediction": result}
        result['unused_fields'] = list(unused_fields)

    return result