Exemplo n.º 1
0
    def predict(self,
                input_data,
                missing_strategy=LAST_PREDICTION,
                operating_point=None,
                full=False):
        """Predicts the objective value for a set of field values.

        input_data: Input data to be predicted
        missing_strategy: LAST_PREDICTION|PROPORTIONAL strategy applied to
                          missing fields
        operating_point: In classification models, the point of the ROC
                         curve at which the model will be used. It is
                         described by:
                         - the positive_class: the class that is important
                           to predict accurately
                         - the threshold: the minimum value established
                           for the positive_class to be predicted
                         - the kind of measure the threshold refers to:
                           probability or confidence (if available)
                         and is given as a map, e.g.:
                           {"positive_class": "Iris-setosa",
                            "threshold": 0.5,
                            "kind": "probability"}
        full: Boolean that selects the shape of the result. When False
              (the default) only the prediction value is returned. When
              True, a dictionary is returned holding every attribute of
              the prediction whose value is not None. Its keys can be:
                  - prediction: the prediction value
                  - probability: prediction's probability
                  - unused_fields: list of fields in the input data that
                                   are not being used in the model
        """

        # Keeps only the fields used by the model. The list of discarded
        # fields is only returned by the filter when `full` is requested.
        filtered = self.filter_input_data(input_data,
                                          add_unused_fields=full)
        if full:
            input_data, unused_fields = filtered
        else:
            input_data, unused_fields = filtered, []

        if not self.missing_numerics:
            check_no_missing_numerics(input_data, self.model_fields)

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        full_prediction = self._predict(input_data,
                                        missing_strategy=missing_strategy,
                                        operating_point=operating_point,
                                        unused_fields=unused_fields)
        if not full:
            return full_prediction['prediction']

        # Attributes with no value are left out of the full result
        return {key: value for key, value in full_prediction.items()
                if value is not None}
Exemplo n.º 2
0
    def predict(self, input_data, missing_strategy=LAST_PREDICTION,
                operating_point=None, full=False):
        """Makes a prediction based on a number of field values.

        input_data: Input data to be predicted
        missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                          missing fields
        operating_point: In classification models, this is the point of the
                         ROC curve where the model will be used at. The
                         operating point can be defined in terms of:
                         - the positive_class, the class that is important to
                           predict accurately
                         - the probability_threshold,
                           the probability that is established
                           as minimum for the positive_class to be predicted.
                         The operating_point is then defined as a map with
                         two attributes, e.g.:
                           {"positive_class": "Iris-setosa",
                            "probability_threshold": 0.5}
        full: Boolean that controls whether to include the prediction's
              attributes. By default, only the prediction is produced. If set
              to True, the rest of available information is added in a
              dictionary format (attributes whose value is None are
              omitted). The dictionary keys can be:
                  - prediction: the prediction value
                  - probability: prediction's probability
                  - unused_fields: list of fields in the input data that
                                   are not being used in the model
        """

        # Checks and cleans input_data leaving the fields used in the model.
        # The filter only returns the (data, unused_fields) pair when the
        # full output has been requested.
        unused_fields = []
        new_data = self.filter_input_data( \
            input_data,
            add_unused_fields=full)
        if full:
            input_data, unused_fields = new_data
        else:
            input_data = new_data

        if not self.missing_numerics:
            check_no_missing_numerics(input_data, self.fields)

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        full_prediction = self._predict( \
            input_data, missing_strategy=missing_strategy,
            operating_point=operating_point,
            unused_fields=unused_fields)
        if full:
            # `items()` is available in both Python 2 and Python 3, whereas
            # `iteritems()` only exists in Python 2 dictionaries.
            return dict((key, value) for key, value in \
                full_prediction.items() if value is not None)

        return full_prediction['prediction']
Exemplo n.º 3
0
    def predict(self,
                input_data,
                operating_point=None,
                operating_kind=None,
                full=False):
        """Returns the class prediction and the probability distribution

        input_data: Input data to be predicted
        operating_point: In classification models, this is the point of the
                         ROC curve where the model will be used at. The
                         operating point can be defined in terms of:
                         - the positive_class, the class that is important to
                           predict accurately
                         - the probability_threshold,
                           the probability that is established
                           as minimum for the positive_class to be predicted.
                         The operating_point is then defined as a map with
                         two attributes, e.g.:
                           {"positive_class": "Iris-setosa",
                            "probability_threshold": 0.5}
                         When given, the result of `predict_operating` is
                         returned as is.
        operating_kind: "probability". Sets the
                        property that decides the prediction. Used only if
                        no operating_point is used. When given, the result
                        of `predict_operating_kind` is returned as is.
        full: Boolean that controls whether to include the prediction's
              attributes. By default, only the prediction is produced. If set
              to True, the rest of available information is added in a
              dictionary format. The dictionary keys can be:
                  - prediction: the prediction value
                  - probability: prediction's probability
                  - distribution: distribution of probabilities for each
                                  of the objective field classes
                  - unused_fields: list of fields in the input data that
                                   are not being used in the model

        """

        # Checks and cleans input_data leaving the fields used in the model
        unused_fields = []
        new_data = self.filter_input_data( \
            input_data,
            add_unused_fields=full)
        if full:
            input_data, unused_fields = new_data
        else:
            input_data = new_data

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        # When operating_point is used, we need the probabilities
        # of all possible classes to decide, so we delegate to the
        # operating methods. Note that these paths return before the
        # missing numerics check and the `full` handling below.
        if operating_point:
            return self.predict_operating( \
                input_data, operating_point=operating_point)
        if operating_kind:
            return self.predict_operating_kind( \
                input_data, operating_kind=operating_kind)

        # In case that missing_numerics is False, checks that all numeric
        # fields are present in input data.
        if not self.missing_numerics:
            check_no_missing_numerics(input_data, self.model_fields,
                                      self.weight_field)

        if self.balance_fields:
            balance_input(input_data, self.fields)

        # Computes text and categorical field expansion
        unique_terms = self.get_unique_terms(input_data)

        probabilities = {}
        total = 0
        # Computes the contributions for each category
        for category in self.coefficients:
            probability = self.category_probability( \
                input_data, unique_terms, category)
            objective_categories = self.categories[self.objective_id]
            try:
                order = objective_categories.index(category)
            except ValueError:
                # Categories that are not listed for the objective field
                # (e.g. the '' category) are placed after the listed ones.
                # Always assigning a value avoids an unbound `order` or the
                # silent reuse of the previous category's order.
                order = len(objective_categories)
            probabilities[category] = {
                "category": category,
                "probability": probability,
                "order": order
            }
            total += probability
        # Normalizes the contributions to get a probability
        for info in probabilities.values():
            info["probability"] = round(info["probability"] / total,
                                        PRECISION)

        # Chooses the most probable category as prediction. Ties are
        # resolved in favour of the category with the lowest order.
        predictions = sorted(probabilities.items(),
                             key=lambda x:
                             (x[1]["probability"], -x[1]["order"]),
                             reverse=True)
        prediction, best = predictions[0]

        result = {
            "prediction": prediction,
            "probability": best["probability"],
            "distribution": [{
                "category": category,
                "probability": info["probability"]
            } for category, info in predictions]
        }

        if full:
            result.update({'unused_fields': unused_fields})
        else:
            result = result["prediction"]

        return result
Exemplo n.º 4
0
    def predict_probability(self, input_data,
                            missing_strategy=LAST_PREDICTION,
                            compact=False):

        """Combines the predictions of the component models.

        For classification models, predicts a probability for each
        possible output class, based on input values. For regressions,
        the component predictions are added and divided by the total
        weight.

        :param input_data: Input data to be predicted
        :param missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy
                                 for missing fields
        :param compact: For classifications, if False the result is a list
                        of maps, one per class, with the keys "category" and
                        "probability" mapped to the name of the class and
                        its probability, respectively. If True, the
                        combined distribution is returned as is.
                        For regressions, if False the result is a map with
                        the single key "prediction"; if True, a single
                        element list containing the prediction.
        """
        votes = MultiVoteList([])
        if not self.missing_numerics:
            check_no_missing_numerics(input_data, self.fields)

        for models_split in self.models_splits:
            # All the local models of a split are built before predicting
            members = [
                Fusion(model, api=self.api)
                if get_resource_type(model) == "fusion"
                else SupervisedModel(model, api=self.api)
                for model in models_split]
            split_votes = []
            for member in members:
                try:
                    vote = member.predict_probability( \
                        input_data,
                        missing_strategy=missing_strategy,
                        compact=True)
                except ValueError:
                    # logistic regressions can raise this error if they
                    # have missing_numerics=False and some numeric missings
                    # are found. Such models cast no vote.
                    continue

                if self.regression:
                    # compact regression predictions are one-element lists
                    vote = vote[0]
                if self.weights is not None:
                    vote = self.weigh(vote, member.resource_id)
                # we need to check that all classes in the fusion
                # are also in the composing model
                if not self.regression and \
                        self.class_names != member.class_names:
                    try:
                        vote = rearrange_prediction(member.class_names,
                                                    self.class_names,
                                                    vote)
                    except AttributeError:
                        # class_names should be defined, but just in case
                        pass
                split_votes.append(vote)

            votes.extend(split_votes)

        if not self.regression:
            distribution = votes.combine_to_distribution(normalize=True)
            if compact:
                return distribution
            return [{'category': class_name,
                     'probability': probability}
                    for class_name, probability in
                    zip(self.class_names, distribution)]

        # Regression: sum of the votes divided by the number of votes or,
        # when weights are set, by the sum of the weights
        if self.weights is None:
            total_weight = len(votes.predictions)
        else:
            total_weight = sum(self.weights)
        prediction = sum(votes.predictions) / float(total_weight)
        if compact:
            return [prediction]
        return {"prediction": prediction}