示例#1
0
    def generate_votes(self, input_data, by_name=True,
                       missing_strategy=LAST_PREDICTION,
                       add_median=False, add_min=False, add_max=False,
                       add_unused_fields=False):
        """Collect the prediction of every model into a MultiVote object.

        Each model in ``self.models`` predicts ``input_data``; confidence,
        distribution and count are always requested, while ``by_name``,
        ``missing_strategy`` and the ``add_*`` flags are forwarded as given.
        When a model's ``boosting`` attribute is not None, the MultiVote is
        flagged as boosted and the prediction is extended with the boosting
        "weight" (plus "class" if an "objective_class" is available).
        """
        votes = MultiVote([])
        for model in self.models:
            prediction_info = model.predict(
                input_data,
                by_name=by_name,
                add_confidence=True,
                add_distribution=True,
                add_count=True,
                add_median=add_median,
                add_min=add_min,
                add_max=add_max,
                add_unused_fields=add_unused_fields,
                missing_strategy=missing_strategy)

            boosting = model.boosting
            if boosting is not None:
                votes.boosting = True
                # gather the boosting attributes and add them in one go
                boosting_info = {"weight": boosting.get("weight")}
                objective_class = boosting.get("objective_class")
                if objective_class is not None:
                    boosting_info["class"] = objective_class
                prediction_info.update(boosting_info)

            votes.append(prediction_info)

        return votes
示例#2
0
    def generate_votes(self, input_data, by_name=True,
                       missing_strategy=LAST_PREDICTION,
                       add_median=False, add_min=False, add_max=False,
                       add_unused_fields=False):
        """Build a MultiVote holding one prediction per model.

        The models in ``self.models`` are used in order. Confidence,
        distribution and count are always asked for; the remaining keyword
        arguments are passed through to each model's ``predict`` call.
        Predictions of models with a non-None ``boosting`` attribute also
        receive the boosting "weight" and, when defined, the
        "objective_class" stored under the "class" key.
        """
        votes = MultiVote([])
        predict_kwargs = {
            "by_name": by_name,
            "add_confidence": True,
            "add_distribution": True,
            "add_count": True,
            "add_median": add_median,
            "add_min": add_min,
            "add_max": add_max,
            "add_unused_fields": add_unused_fields,
            "missing_strategy": missing_strategy}
        for model in self.models:
            prediction_info = model.predict(input_data, **predict_kwargs)

            boosting = model.boosting
            if boosting is not None:
                votes.boosting = True
                prediction_info.update({"weight": boosting.get("weight")})
                objective_class = boosting.get("objective_class")
                if objective_class is not None:
                    prediction_info.update({"class": objective_class})

            votes.append(prediction_info)

        return votes
示例#3
0
    def predict(self, input_data, by_name=True, method=PLURALITY_CODE):
        """Combine the predictions of all the predict functions.

        :param input_data: Test data to be used as input
        :param by_name: Boolean that is set to True if field_names (as
                        alternative to field ids) are used in the
                        input_data dict
        :param method: numeric key code for the following combination
                       methods in classifications/regressions:
              0 - majority vote (plurality)/ average: PLURALITY_CODE
              1 - confidence weighted majority vote / error weighted:
                  CONFIDENCE_CODE
              2 - probability weighted majority vote / average:
                  PROBABILITY_CODE
        :return: whatever ``MultiVote.combine`` returns for the votes
        """
        # the formatted input is expanded as positional arguments for
        # every function in self.predict_functions
        input_data_array = self.format_input_data(input_data, by_name=by_name)
        votes_split = [fun(*input_data_array)
                       for fun in self.predict_functions]
        votes = MultiVote(votes_split, boosting_offsets=self.boosting_offsets)

        # boosted classifications need the objective field categories
        options = None
        if self.boosting is not None and not self.regression:
            summary = self.fields[self.objective_id]["summary"]
            options = {
                "categories": [info[0] for info in summary["categories"]]}

        return votes.combine(method=method, options=options)
示例#4
0
    def predict(self, input_data, by_name=True, method=PLURALITY_CODE,
                with_confidence=False, options=None):
        """Combine the predictions of every model into a single prediction.

           ``method`` is the numeric key of the combination method used
           in classifications/regressions:
              0 - majority vote (plurality)/ average: PLURALITY_CODE
              1 - confidence weighted majority vote / error weighted:
                  CONFIDENCE_CODE
              2 - probability weighted majority vote / average:
                  PROBABILITY_CODE
              3 - threshold filtered vote / doesn't apply:
                  THRESHOLD_CODE

           ``with_confidence`` and ``options`` are handed over unchanged to
           the votes' ``combine`` method.
        """

        if len(self.models_splits) <= 1:
            # A single group of models: the multimodel already built for
            # it generates all the votes
            single_votes = self.multi_model.generate_votes(input_data,
                                                           by_name=by_name)
            votes = MultiVote(single_votes.predictions)
        else:
            # Several chunks of models: each chunk is retrieved and used
            # in turn, and its votes are accumulated
            votes = MultiVote([])
            for model_ids in self.models_splits:
                chunk_models = [
                    retrieve_resource(self.api, model_id,
                                      query_string=ONLY_MODEL)
                    for model_id in model_ids]
                chunk_multi_model = MultiModel(chunk_models, api=self.api)
                chunk_votes = chunk_multi_model.generate_votes(
                    input_data, by_name=by_name)
                votes.extend(chunk_votes.predictions)

        return votes.combine(method=method, with_confidence=with_confidence,
                             options=options)
示例#5
0
def aggregate_multivote(multivote,
                        options,
                        labels,
                        models_per_label,
                        ordered,
                        models_order,
                        label_separator=None):
    """Aggregate the model's predictions for multi-label fields in a
       concatenated format into a final prediction

    :param multivote: object whose ``predictions`` attribute is the list of
                      per-model predictions (dicts with 'prediction' and
                      'confidence' keys). It is not modified.
    :param options: options forwarded to ``MultiVote.combine`` when there
                    is more than one model per label
    :param labels: list of labels, one per group of ``models_per_label``
                   predictions
    :param models_per_label: number of models that vote for each label
    :param ordered: when True and there is one model per label, the
                    predictions are used in reversed order
    :param models_order: sort keys used to order the predictions in any
                         other case
    :param label_separator: string used to join labels and confidences
                            (defaults to ",")
    :return: two-element list: the joined labels whose prediction
             evaluates to a true value and the joined confidences
    """

    if label_separator is None:
        label_separator = ","

    if ordered and models_per_label == 1:
        # as multi-labeled models are created from end to start votes
        # must be reversed to match. A reversed copy is used so that the
        # caller's multivote keeps its order and repeated calls on the same
        # object are consistent.
        predictions = list(reversed(multivote.predictions))
    else:
        predictions = [
            prediction
            for (_, prediction) in sorted(zip(models_order,
                                              multivote.predictions),
                                          key=lambda x: x[0])
        ]

    if (labels is None or len(labels) * models_per_label != len(predictions)):
        sys.exit("Failed to make a multi-label prediction. No"
                 " valid label info is found.")
    prediction_list = []
    confidence_list = []
    # In the following case, we must vote each label using the models
    # in the ensemble and the chosen method

    if models_per_label > 1:
        label_predictions = [
            predictions[i:i + models_per_label]
            for i in range(0, len(predictions), models_per_label)
        ]
        predictions = []
        for label_prediction in label_predictions:
            label_multivote = MultiVote(label_prediction)
            prediction_info = label_multivote.combine(method=AGGREGATION,
                                                      full=True,
                                                      options=options)
            predictions.append({
                'prediction': prediction_info["prediction"],
                'confidence': prediction_info["confidence"]
            })
    for vote_index, vote_prediction in enumerate(predictions):
        # predictions are literals stored as strings (e.g. "True")
        if ast.literal_eval(vote_prediction['prediction']):
            prediction_list.append(labels[vote_index])
            confidence = str(vote_prediction['confidence'])
            confidence_list.append(confidence)
    prediction = [
        label_separator.join(prediction_list),
        label_separator.join(confidence_list)
    ]
    return prediction
示例#6
0
    def _generate_votes(self,
                        input_data,
                        missing_strategy=LAST_PREDICTION,
                        unused_fields=None):
        """Collect the prediction of every model into a MultiVote object.

            Each model's ``_predict`` method is called, which assumes that
            the input data has already been checked against the model
            fields, so only casting to the correct type will be applied.
            Predictions of models with a non-None ``boosting`` attribute
            are extended with the boosting "weight" and, when available,
            the "objective_class" (stored as "class").
        """
        votes = MultiVote([])
        for model in self.models:
            prediction_info = model._predict(
                input_data,
                missing_strategy=missing_strategy,
                unused_fields=unused_fields)

            boosting = model.boosting
            if boosting is not None:
                votes.boosting = True
                # gather the boosting attributes and add them in one go
                boosting_info = {"weight": boosting.get("weight")}
                objective_class = boosting.get("objective_class")
                if objective_class is not None:
                    boosting_info["class"] = objective_class
                prediction_info.update(boosting_info)

            votes.append(prediction_info)

        return votes
示例#7
0
    def predict(self, input_data, method=PLURALITY_CODE, full=False):
        """Combine the predictions of all the predict functions.

        :param input_data: Test data to be used as input
        :param method: numeric key code for the following combination
                       methods in classifications/regressions:
              0 - majority vote (plurality)/ average: PLURALITY_CODE
              1 - confidence weighted majority vote / error weighted:
                  CONFIDENCE_CODE
              2 - probability weighted majority vote / average:
                  PROBABILITY_CODE
        :param full: forwarded to the votes' ``combine`` method
        :return: the combined result; when it is a dict its 'count' key
                 is removed
        """

        votes_split = []
        # every prediction is tagged with its 1-based position and a
        # count of one
        for position, fun in enumerate(self.predict_functions, 1):
            prediction = fun(input_data)
            prediction.update({"order": position, "count": 1})
            votes_split.append(prediction)
        votes = MultiVote(votes_split, boosting_offsets=self.boosting_offsets)

        # boosted classifications need the objective field categories
        options = None
        if self.boosting is not None and not self.regression:
            summary = self.fields[self.objective_id]["summary"]
            options = {
                "categories": [info[0] for info in summary["categories"]]}

        result = votes.combine(method=method, options=options, full=full)
        if isinstance(result, dict):
            del result['count']

        return result
示例#8
0
 def generate_votes(self, input_data, by_name=True):
     """Build a MultiVote with one row per model.

     Each model predicts ``input_data`` with confidence and the resulting
     prediction, confidence, distribution and instances are stored,
     together with the model position in ``self.models``, as a row.
     """
     votes = MultiVote([])
     for order, model in enumerate(self.models):
         (prediction, confidence,
          distribution, instances) = model.predict(input_data,
                                                   by_name=by_name,
                                                   with_confidence=True)
         votes.append_row(
             [prediction, confidence, order, distribution, instances])
     return votes
示例#9
0
    def generate_probability_votes(self,
                                   input_data,
                                   by_name=True,
                                   missing_strategy=LAST_PREDICTION,
                                   method=PROBABILITY_CODE):
        """Collect one list of per-class scores per model into a MultiVote.

        Each model gets ``self.class_names`` assigned and the MultiVote is
        flagged with ``probabilities = True``. Depending on ``method`` the
        scores come from the model's ``predict_probability``
        (PROBABILITY_CODE), ``predict_confidence`` (CONFIDENCE_CODE) or a
        one-hot list built from ``predict`` (PLURALITY_CODE). Any other
        numeric method raises ValueError.

        If the call raises AttributeError or TypeError, it is retried
        without ``missing_strategy``: with ``predict`` for PLURALITY_CODE
        and with ``predict_probability`` for any other method.
        """

        votes = MultiVote([])
        for model in self.models:
            model.class_names = self.class_names
            votes.probabilities = True

            try:
                if method == PROBABILITY_CODE:
                    scores = model.predict_probability(
                        input_data,
                        by_name=by_name,
                        compact=True,
                        missing_strategy=missing_strategy)
                elif method == CONFIDENCE_CODE:
                    scores = model.predict_confidence(
                        input_data,
                        by_name=by_name,
                        compact=True,
                        missing_strategy=missing_strategy)
                elif method == PLURALITY_CODE:
                    # one-hot encoding of the predicted class
                    scores = [0.0] * len(self.class_names)
                    winner = model.predict(
                        input_data,
                        by_name=by_name,
                        missing_strategy=missing_strategy)
                    scores[self.class_names.index(winner)] = 1.0
                else:
                    # NOTE(review): for a non-numeric method the %d
                    # formatting itself raises TypeError, which is caught
                    # below and ends up in the fallback - confirm intended
                    raise ValueError('%d is not a valid "method"' % method)
            except (AttributeError, TypeError):
                # fallback that does not use missing_strategy
                if method == PLURALITY_CODE:
                    scores = [0.0] * len(self.class_names)
                    winner = model.predict(input_data, by_name=by_name)
                    scores[self.class_names.index(winner)] = 1.0
                else:
                    scores = model.predict_probability(
                        input_data,
                        by_name=by_name,
                        compact=True)

            votes.append(scores)

        return votes
示例#10
0
def aggregate_multivote(multivote, options, labels, models_per_label, ordered,
                        models_order, label_separator=None):
    """Aggregate the model's predictions for multi-label fields in a
       concatenated format into a final prediction

    :param multivote: object whose ``predictions`` attribute is the list of
                      per-model predictions (dicts with 'prediction' and
                      'confidence' keys). It is not modified.
    :param options: options forwarded to ``MultiVote.combine`` when there
                    is more than one model per label
    :param labels: list of labels, one per group of ``models_per_label``
                   predictions
    :param models_per_label: number of models that vote for each label
    :param ordered: when True and there is one model per label, the
                    predictions are used in reversed order
    :param models_order: sort keys used to order the predictions in any
                         other case
    :param label_separator: string used to join labels and confidences
                            (defaults to ",")
    :return: two-element list: the joined labels whose prediction
             evaluates to a true value and the joined confidences
    """

    if label_separator is None:
        label_separator = ","

    if ordered and models_per_label == 1:
        # as multi-labeled models are created from end to start votes
        # must be reversed to match. A reversed copy is used so that the
        # caller's multivote keeps its order and repeated calls on the same
        # object are consistent.
        predictions = list(reversed(multivote.predictions))
    else:
        predictions = [prediction for (_, prediction)
                       in sorted(zip(models_order, multivote.predictions),
                                 key=lambda x: x[0])]

    if (labels is None or
            len(labels) * models_per_label != len(predictions)):
        sys.exit("Failed to make a multi-label prediction. No"
                 " valid label info is found.")
    prediction_list = []
    confidence_list = []
    # In the following case, we must vote each label using the models
    # in the ensemble and the chosen method

    if models_per_label > 1:
        label_predictions = [predictions[i: i + models_per_label] for
                             i in range(0, len(predictions),
                                        models_per_label)]
        predictions = []
        for label_prediction in label_predictions:
            label_multivote = MultiVote(label_prediction)
            prediction, confidence = label_multivote.combine(
                method=AGGREGATION, with_confidence=True, options=options)
            predictions.append({'prediction': prediction,
                                'confidence': confidence})
    for vote_index, vote_prediction in enumerate(predictions):
        # predictions are literals stored as strings (e.g. "True")
        if ast.literal_eval(vote_prediction['prediction']):
            prediction_list.append(labels[vote_index])
            confidence = str(vote_prediction['confidence'])
            confidence_list.append(confidence)
    prediction = [label_separator.join(prediction_list),
                  label_separator.join(confidence_list)]
    return prediction
def i_create_a_multivote(step, predictions_file):
    """Store in ``world.multivote`` a MultiVote built from a JSON file.

    ``predictions_file`` is resolved with ``res_filename`` and its JSON
    contents are used as the MultiVote predictions. If the file cannot be
    read (IOError) the step fails with an assertion that names the path.
    """
    predictions_path = res_filename(predictions_file)
    try:
        # a separate name for the handle keeps the path available for the
        # error message even when the failure happens while reading
        with open(predictions_path, 'r') as predictions_handle:
            world.multivote = MultiVote(json.load(predictions_handle))
    except IOError:
        assert False, "Failed to read %s" % predictions_path
示例#12
0
 def generate_votes(self, input_data, by_name=True,
                    missing_strategy=LAST_PREDICTION):
     """Build a MultiVote with one row per model.

     Each model predicts ``input_data`` with confidence, using the given
     ``missing_strategy``. The prediction, confidence, distribution and
     instances it returns are stored as a row, together with the model
     position in ``self.models``.
     """
     votes = MultiVote([])
     for order, model in enumerate(self.models):
         (prediction, confidence,
          distribution, instances) = model.predict(
              input_data, by_name=by_name,
              with_confidence=True,
              missing_strategy=missing_strategy)
         votes.append_row(
             [prediction, confidence, order, distribution, instances])
     return votes
示例#13
0
 def generate_votes(self, input_data, by_name=True,
                    missing_strategy=LAST_PREDICTION,
                    add_median=False):
     """Collect the prediction of every model into a MultiVote object.

     Confidence, distribution and count are always requested from each
     model; ``by_name``, ``missing_strategy`` and ``add_median`` are
     forwarded as given.
     """
     votes = MultiVote([])
     for model in self.models:
         votes.append(model.predict(input_data, by_name=by_name,
                                    add_confidence=True,
                                    add_distribution=True,
                                    add_count=True,
                                    add_median=add_median,
                                    missing_strategy=missing_strategy))
     return votes
示例#14
0
    def predict(self, input_data, by_name=True, method=PLURALITY_CODE,
                with_confidence=False, options=None):
        """Combine the predictions of every model into a single prediction.

           ``method`` is the numeric key of the combination method used
           in classifications/regressions:
              0 - majority vote (plurality)/ average: PLURALITY_CODE
              1 - confidence weighted majority vote / error weighted:
                  CONFIDENCE_CODE
              2 - probability weighted majority vote / average:
                  PROBABILITY_CODE
              3 - threshold filtered vote / doesn't apply:
                  THRESHOLD_CODE

           ``with_confidence`` and ``options`` are handed over unchanged to
           the votes' ``combine`` method.
        """

        if len(self.models_splits) <= 1:
            # Only one group of models: its multimodel generates the votes
            group_votes = self.multi_model.generate_votes(input_data,
                                                          by_name=by_name)
            votes = MultiVote(group_votes.predictions)
        else:
            # More than one chunk of models: they are retrieved and used
            # sequentially to accumulate the votes for the prediction
            votes = MultiVote([])
            for split_ids in self.models_splits:
                split_models = [
                    retrieve_resource(self.api, model_id,
                                      query_string=ONLY_MODEL)
                    for model_id in split_ids]
                split_multi_model = MultiModel(split_models, api=self.api)
                split_votes = split_multi_model.generate_votes(
                    input_data, by_name=by_name)
                votes.extend(split_votes.predictions)

        return votes.combine(method=method, with_confidence=with_confidence,
                             options=options)
示例#15
0
 def generate_votes(self,
                    input_data,
                    by_name=True,
                    missing_strategy=LAST_PREDICTION,
                    add_median=False):
     """Build a MultiVote holding one prediction per model.

     The models in ``self.models`` are used in order. Each prediction is
     requested with confidence, distribution and count, and with the
     given ``by_name``, ``missing_strategy`` and ``add_median`` values.
     """
     votes = MultiVote([])
     predict_kwargs = {
         "by_name": by_name,
         "add_confidence": True,
         "add_distribution": True,
         "add_count": True,
         "add_median": add_median,
         "missing_strategy": missing_strategy}
     for model in self.models:
         votes.append(model.predict(input_data, **predict_kwargs))
     return votes
示例#16
0
    def predict(self, input_data, by_name=True, method=PLURALITY_CODE,
                with_confidence=False):
        """Combine the predictions of every model into a single prediction.

           ``method`` is the numeric key of the combination method used
           in classifications/regressions:
              0 - majority vote (plurality)/ average: PLURALITY_CODE
              1 - confidence weighted majority vote / error weighted:
                  CONFIDENCE_CODE
              2 - probability weighted majority vote / average:
                  PROBABILITY_CODE

           The models of each split in ``self.models_splits`` are
           retrieved and used in turn, and their votes are accumulated
           before being combined.
        """
        votes = MultiVote([])
        for split_ids in self.models_splits:
            split_models = [retrieve_model(self.api, model_id)
                            for model_id in split_ids]
            split_votes = MultiModel(split_models).generate_votes(
                input_data, by_name=by_name)
            votes.extend(split_votes.predictions)
        return votes.combine(method=method, with_confidence=with_confidence)
示例#17
0
    def generate_votes(self, input_data, missing_strategy=LAST_PREDICTION):
        """Collect the full prediction of every model into a MultiVote.

            Each model in ``self.models`` predicts ``input_data`` with
            ``full=True`` and the given ``missing_strategy``. Predictions
            of models with a non-None ``boosting`` attribute are extended
            with the boosting "weight" and, when available, the
            "objective_class" (stored as "class").
        """
        votes = MultiVote([])
        for model in self.models:
            prediction_info = model.predict(
                input_data, missing_strategy=missing_strategy, full=True)

            boosting = model.boosting
            if boosting is not None:
                votes.boosting = True
                # gather the boosting attributes and add them in one go
                boosting_info = {"weight": boosting.get("weight")}
                objective_class = boosting.get("objective_class")
                if objective_class is not None:
                    boosting_info["class"] = objective_class
                prediction_info.update(boosting_info)

            votes.append(prediction_info)

        return votes
示例#18
0
    def predict(self, input_data, by_name=True, method=PLURALITY_CODE):
        """Combine the predictions of every model into a single prediction.

           ``method`` is the numeric key of the combination method used
           in classifications/regressions:
              0 - majority vote (plurality)/ average: PLURALITY_CODE
              1 - confidence weighted majority vote / error weighted:
                  CONFIDENCE_CODE
              2 - probability weighted majority vote / average:
                  PROBABILITY_CODE
        """

        votes = MultiVote([])
        for order, model in enumerate(self.models):
            # each row keeps the model position next to its prediction
            (prediction, confidence,
             distribution, instances) = model.predict(input_data,
                                                      by_name=by_name,
                                                      with_confidence=True)
            votes.append_row(
                [prediction, confidence, order, distribution, instances])

        return votes.combine(method=method)
示例#19
0
    def predict(self, input_data, by_name=True, method=PLURALITY_CODE):
        """Combine the predictions of all the predict functions.

        :param input_data: Test data to be used as input
        :param by_name: Boolean that is set to True if field_names (as
                        alternative to field ids) are used in the
                        input_data dict
        :param method: numeric key code for the following combination
                       methods in classifications/regressions:
              0 - majority vote (plurality)/ average: PLURALITY_CODE
              1 - confidence weighted majority vote / error weighted:
                  CONFIDENCE_CODE
              2 - probability weighted majority vote / average:
                  PROBABILITY_CODE
        :return: whatever ``MultiVote.combine`` returns for the votes
        """

        # the formatted input is expanded as positional arguments for
        # every function in self.predict_functions
        formatted_input = self.format_input_data(input_data, by_name=by_name)
        votes = MultiVote(
            [fun(*formatted_input) for fun in self.predict_functions],
            boosting_offsets=self.boosting_offsets)

        # boosted classifications need the objective field categories
        options = None
        if self.boosting is not None and not self.regression:
            objective_summary = self.fields[self.objective_id]["summary"]
            categories = [info[0]
                          for info in objective_summary["categories"]]
            options = {"categories": categories}

        return votes.combine(method=method, options=options)
示例#20
0
def read_votes(votes_files, to_prediction, data_locale=None):
    """Reads the votes found in the votes' files.

       Returns a list of MultiVote objects containing the list of predictions.
       The n-th MultiVote gathers the n-th row of every file, tagged with
       the position of the file in votes_files.
       votes_files parameter should contain the path to the files where votes
       are stored
       In to_prediction parameter we expect the method of a local model object
       that casts the string prediction values read from the file to their
       real type. For instance
           >>> local_model = Model(model)
           >>> prediction = local_model.to_prediction("1")
           >>> isinstance(prediction, int)
           True
           >>> read_votes(["my_predictions_file"], local_model.to_prediction)
       data_locale should contain the string identification for the locale
       used in numeric formatting.

       Each row is read as: prediction, confidence and, when the row has
       more than two columns, distribution and instances. A missing or
       non-numeric confidence is stored as 0.0.
    """
    votes = []
    for order, votes_file in enumerate(votes_files):
        with UnicodeReader(votes_file) as rdr:
            for index, row in enumerate(rdr):
                prediction = to_prediction(row[0], data_locale=data_locale)
                if index >= len(votes):
                    votes.append(MultiVote([]))
                # per-row defaults, so that short rows neither fail nor
                # inherit the confidence of the previous row
                confidence = 0.0
                distribution = None
                instances = None
                if len(row) > 2:
                    distribution = ast.literal_eval(row[2])
                    instances = int(row[3])
                if len(row) > 1:
                    try:
                        confidence = float(row[1])
                    except ValueError:
                        confidence = 0.0
                prediction_row = [
                    prediction, confidence, order, distribution, instances
                ]
                votes[index].append_row(prediction_row)
    return votes
示例#21
0
    def batch_predict(self,
                      input_data_list,
                      output_file_path=None,
                      by_name=True,
                      reuse=False,
                      missing_strategy=LAST_PREDICTION,
                      headers=None,
                      to_file=True,
                      use_median=False):
        """Makes predictions for a list of input data.

           When the to_file argument is set to True, the predictions
           generated for each model are stored in an output
           file. The name of the file will use the following syntax:
                model_[id of the model]__predictions.csv
           For instance, when using model/50c0de043b563519830001c2 to predict,
           the output file name will be
                model_50c0de043b563519830001c2__predictions.csv
            On the contrary, if it is False, the function returns a list
            of MultiVote objects with the model's predictions.

           The input data can be a list of dicts or a list of lists. In the
           latter case, headers must have as many elements as each row and
           is used to build the input data dicts; otherwise a ValueError
           is raised.
           When reuse is True, the models whose output file can already be
           opened are skipped.
           When use_median is True and the model is a regression, the last
           element of the prediction replaces the predicted value.
           Returns None when to_file is True.
        """
        add_headers = (isinstance(input_data_list[0], list)
                       and headers is not None
                       and len(headers) == len(input_data_list[0]))
        if not add_headers and not isinstance(input_data_list[0], dict):
            raise ValueError("Input data list is not a dictionary or the"
                             " headers and input data information are not"
                             " consistent.")
        if not to_file:
            votes = []

        # order is 1-based and also counts the models that are skipped
        for order, model in enumerate(self.models, 1):
            out = None
            if to_file:
                output_file = get_predictions_file_name(
                    model.resource_id, output_file_path)
                if reuse:
                    # an output file that can be opened is reused as is
                    try:
                        with open(output_file):
                            pass
                    except IOError:
                        pass
                    else:
                        continue
                try:
                    out = UnicodeWriter(output_file)
                except IOError:
                    raise Exception("Cannot find %s directory." %
                                    output_file_path)

            if out:
                out.open_writer()
            try:
                for index, input_data in enumerate(input_data_list):
                    if add_headers:
                        input_data = dict(zip(headers, input_data))
                    prediction = model.predict(
                        input_data,
                        by_name=by_name,
                        with_confidence=True,
                        missing_strategy=missing_strategy)
                    if use_median and model.tree.regression:
                        # if median is to be used, we just place it as
                        # prediction starting the list
                        prediction[0] = prediction[-1]
                    # the last element is never part of the stored row
                    prediction = prediction[:-1]
                    if to_file:
                        out.writerow(prediction)
                    else:
                        # prediction is a row that contains prediction,
                        # confidence, distribution, instances. The model
                        # order is inserted after the confidence.
                        prediction_row = prediction[0:2]
                        prediction_row.append(order)
                        prediction_row.extend(prediction[2:])

                        if len(votes) <= index:
                            votes.append(MultiVote([]))
                        votes[index].append_row(prediction_row)
            finally:
                # the writer is closed even if a prediction fails, so the
                # output file is not left open
                if out:
                    out.close_writer()
        if not to_file:
            return votes
        return None
示例#22
0
            if (labels is None or
                    len(labels) * models_per_label != len(predictions)):
                sys.exit("Failed to make a multi-label prediction. No"
                         " valid label info is found.")
            prediction_list = []
            confidence_list = []
            # In the following case, we must vote each label using the models
            # in the ensemble and the chosen method

            if models_per_label > 1:
                label_predictions = [predictions[i: i + models_per_label] for
                                     i in range(0, len(predictions),
                                                models_per_label)]
                predictions = []
                for label_prediction in label_predictions:
                    label_multivote = MultiVote(label_prediction)
                    prediction, confidence = label_multivote.combine(
                        method=method, with_confidence=True, options=options)
                    predictions.append({'prediction': prediction,
                                        'confidence': confidence})
            for vote_index in range(0, len(predictions)):
                if ast.literal_eval(predictions[vote_index]['prediction']):
                    prediction_list.append(labels[vote_index])
                    confidence = str(predictions[vote_index]['confidence'])
                    confidence_list.append(confidence)
            prediction = [label_separator.join(prediction_list),
                          label_separator.join(confidence_list)]
        elif method == COMBINATION:
            predictions = multivote.predictions
            global_distribution = []
            for prediction in predictions:
示例#23
0
    def batch_predict(self,
                      input_data_list,
                      output_file_path=None,
                      by_name=True,
                      reuse=False,
                      missing_strategy=LAST_PREDICTION,
                      headers=None,
                      to_file=True):
        """Makes predictions for a list of input data.

           When the to_file argument is set to True, the predictions
           generated for each model are stored in an output
           file. The name of the file will use the following syntax:
                model_[id of the model]__predictions.csv
           For instance, when using model/50c0de043b563519830001c2 to predict,
           the output file name will be
                model_50c0de043b563519830001c2__predictions.csv
           On the contrary, if it is False, the function returns a list
           of MultiVote objects with the model's predictions.

           :param input_data_list: list of inputs to predict for. Each
                                   element is either a dictionary or, when
                                   `headers` is given, a list of values.
                                   Only the first element is inspected to
                                   decide which of the two formats is used.
           :param output_file_path: forwarded to
                                    `get_predictions_file_name` to build
                                    each model's output file name.
           :param by_name: forwarded to every model's `predict` call.
           :param reuse: when True, a model whose output file can already
                         be opened for reading is skipped.
           :param missing_strategy: forwarded to every model's `predict`
                                    call.
           :param headers: field names zipped with each row when the rows
                           are lists. Its length must match the first row.
           :param to_file: True to write one CSV file per model, False to
                           collect the predictions in memory.
           :return: when `to_file` is False, a list with one MultiVote per
                    input row (same order as `input_data_list`); nothing
                    is returned otherwise.
           :raises ValueError: if the first row is neither a dictionary nor
                               a list consistent with `headers`.
           :raises IndexError: if `input_data_list` is empty.
           :raises Exception: if a model's output file cannot be opened for
                              writing.
        """
        first_row = input_data_list[0]
        add_headers = (isinstance(first_row, list)
                       and headers is not None
                       and len(headers) == len(first_row))
        if not add_headers and not isinstance(first_row, dict):
            raise ValueError("Input data list is not a dictionary or the"
                             " headers and input data information are not"
                             " consistent.")
        order = 0
        if not to_file:
            votes = []

        for model in self.models:
            order += 1
            # Handle of the CSV file being written for this model, if any.
            # It is tracked so it can be closed once the model is done.
            output_handle = None
            if to_file:
                output_file = get_predictions_file_name(
                    model.resource_id, output_file_path)
                if reuse:
                    # An output file that can be opened means this model's
                    # predictions already exist, so the model is skipped
                    try:
                        existing_file = open(output_file)
                        existing_file.close()
                        continue
                    except IOError:
                        pass
                try:
                    # NOTE: buffering=0 (unbuffered) in text mode is only
                    # accepted by Python 2, which this code targets
                    output_handle = open(output_file, 'w', 0)
                except IOError:
                    raise Exception("Cannot find %s directory." %
                                    output_file_path)
                predictions_file = csv.writer(output_handle,
                                              lineterminator="\n")

            try:
                for index, input_data in enumerate(input_data_list):
                    if add_headers:
                        input_data = dict(zip(headers, input_data))
                    prediction = model.predict(
                        input_data,
                        by_name=by_name,
                        with_confidence=True,
                        missing_strategy=missing_strategy)
                    if to_file:
                        if isinstance(prediction[0], basestring):
                            prediction[0] = prediction[0].encode("utf-8")
                        predictions_file.writerow(prediction)
                    else:
                        # the model's answer is unpacked and stored as a
                        # row with the model's order in third position
                        (prediction, confidence,
                         distribution, instances) = prediction
                        prediction_row = [
                            prediction, confidence, order,
                            distribution, instances
                        ]
                        # one MultiVote per input row, created the first
                        # time that row index is reached
                        if len(votes) <= index:
                            votes.append(MultiVote([]))
                        votes[index].append_row(prediction_row)
            finally:
                # Always release the file, even if a prediction fails
                if output_handle is not None:
                    output_handle.close()

        if not to_file:
            return votes
示例#24
0
                    or len(labels) * models_per_label != len(predictions)):
                sys.exit("Failed to make a multi-label prediction. No"
                         " valid label info is found.")
            prediction_list = []
            confidence_list = []
            # In the following case, we must vote each label using the models
            # in the ensemble and the chosen method

            if models_per_label > 1:
                label_predictions = [
                    predictions[i:i + models_per_label]
                    for i in range(0, len(predictions), models_per_label)
                ]
                predictions = []
                for label_prediction in label_predictions:
                    label_multivote = MultiVote(label_prediction)
                    prediction, confidence = label_multivote.combine(
                        method=method, with_confidence=True, options=options)
                    predictions.append({
                        'prediction': prediction,
                        'confidence': confidence
                    })
            for vote_index in range(0, len(predictions)):
                if ast.literal_eval(predictions[vote_index]['prediction']):
                    prediction_list.append(labels[vote_index])
                    confidence = str(predictions[vote_index]['confidence'])
                    confidence_list.append(confidence)
            prediction = [
                label_separator.join(prediction_list),
                label_separator.join(confidence_list)
            ]
示例#25
0
    def predict(self,
                input_data,
                method=None,
                options=None,
                missing_strategy=LAST_PREDICTION,
                operating_point=None,
                operating_kind=None,
                median=False,
                full=False):
        """Builds a single prediction out of the predictions of every model.

        :param input_data: Input data to predict for
        :param method: **deprecated** in favour of `operating_kind`.
                       Numeric code of the combiner to apply in
                       classifications/regressions:
              0 - majority vote (plurality)/ average: PLURALITY_CODE
              1 - confidence weighted majority vote / error weighted:
                  CONFIDENCE_CODE
              2 - probability weighted majority vote / average:
                  PROBABILITY_CODE
              3 - threshold filtered vote / doesn't apply:
                  THRESHOLD_CODE
        :param options: Options for the threshold filtered vote.
        :param missing_strategy: numeric code of the strategy each
                                 individual model follows to predict. See
                                 the model predict method.
        :param operating_point: Only for classifications. Point of the ROC
                                curve where the model will be used at,
                                described by:
                                  - the positive_class: the class that is
                                    important to predict accurately
                                  - its kind: probability, confidence or
                                    voting
                                  - its threshold: the minimum required for
                                    the positive_class to be predicted.
                                It is given as a map with three
                                attributes, e.g.:
                                   {"positive_class": "Iris-setosa",
                                    "kind": "probability",
                                    "threshold": 0.5}
        :param operating_kind: "probability", "confidence" or "votes". The
                               property that decides the prediction.
                               Ignored when an operating_point is given.
        :param median: When set, each model contributes the median of its
                       predicted node instead of its prediction to the
                       chosen combination method.
        :param full: Boolean. When False (the default) only the prediction
                     is produced. When True, the rest of the available
                     information is added in a dictionary whose keys can
                     be:
                      - prediction: the prediction value
                      - confidence: prediction's confidence
                      - probability: prediction's probability
                      - path: rules that lead to the prediction
                      - count: number of training instances supporting the
                               prediction
                      - next: field to check in the next split
                      - min: minimum value of the training instances in the
                             predicted node
                      - max: maximum value of the training instances in the
                             predicted node
                      - median: median of the values of the training
                                instances in the predicted node
                      - unused_fields: list of fields in the input data
                                       that are not being used in the model
        :raises ValueError: when operating_point is used in a regression.
        """

        # Keep only the fields the model uses. When `full` is set the
        # filter also hands back the unused fields.
        filtered = self.filter_input_data(input_data,
                                          add_unused_fields=full)
        if full:
            input_data, unused_fields = filtered
        else:
            input_data, unused_fields = filtered, None

        # Remove affixes from numeric values and cast to the field types
        cast(input_data, self.fields)

        # Median-based predictions are only available with old combiners
        if median and method is None:
            method = PLURALITY_CODE

        # operating_point takes precedence over operating_kind. With no
        # combiner selected at all, "probability" is the operating kind.
        nothing_selected = (method is None and operating_point is None
                            and operating_kind is None)
        if nothing_selected and not median:
            operating_kind = "probability"

        if operating_point:
            if self.regression:
                raise ValueError("The operating_point argument can only be"
                                 " used in classifications.")
            prediction = self.predict_operating(
                input_data,
                missing_strategy=missing_strategy,
                operating_point=operating_point)
            return prediction if full else prediction["prediction"]

        if operating_kind:
            if not self.regression:
                return self.predict_operating_kind(
                    input_data,
                    missing_strategy=missing_strategy,
                    operating_kind=operating_kind)
            # regressions translate the operating kind into one of the old
            # combiners and predict again with it
            method = 1 if operating_kind == "confidence" else 0
            return self.predict(
                input_data, method=method,
                options=options, missing_strategy=missing_strategy,
                operating_point=None, operating_kind=None, full=full)

        if len(self.models_splits) > 1:
            # Several chunks of models: each chunk is used in turn and its
            # votes are accumulated
            votes = MultiVote([], boosting_offsets=self.boosting_offsets)

            for models_split in self.models_splits:
                split_model = MultiModel(self._get_models(models_split),
                                         api=self.api,
                                         fields=self.fields)
                split_votes = split_model._generate_votes(
                    input_data,
                    missing_strategy=missing_strategy,
                    unused_fields=unused_fields)
                if median:
                    for vote in split_votes.predictions:
                        vote['prediction'] = vote['median']
                votes.extend(split_votes.predictions)
        else:
            # A single group of models: its multimodel produces the votes
            single_votes = self.multi_model._generate_votes(
                input_data,
                missing_strategy=missing_strategy,
                unused_fields=unused_fields)

            votes = MultiVote(single_votes.predictions,
                              boosting_offsets=self.boosting_offsets)
            if median:
                for vote in votes.predictions:
                    vote['prediction'] = vote['median']

        if self.boosting is not None and not self.regression:
            # boosted classifications replace the options with the
            # categories of the objective field
            summary = self.fields[self.objective_id]["summary"]
            options = {"categories": [category[0] for category
                                      in summary["categories"]]}

        result = votes.combine(method=method, options=options, full=full)
        if not full:
            return result

        # A field is reported as unused only if every vote lists it
        unused_fields = set(input_data.keys())
        for vote in votes.predictions:
            unused_fields = unused_fields.intersection(
                set(vote.get("unused_fields", [])))
        if not isinstance(result, dict):
            result = {"prediction": result}
        result['unused_fields'] = list(unused_fields)
        return result