Example #1
    def missing_factors(self, input_mask):
        """Returns the factors to divide the PCA values when input
        data has missings

        """

        sum_eigenvectors = []
        for row in self.eigenvectors:
            eigenvector = [a * b for a, b in zip(input_mask, row)]
            sum_eigenvectors.append(dot([eigenvector], [eigenvector])[0][0])
        return sum_eigenvectors
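
Each factor is simply the squared norm of the corresponding eigenvector after zeroing the positions of missing inputs; the `projection` method shown in the later examples divides by it. A minimal standalone sketch, assuming `dot` is the row-by-row dot product that the calls above imply, with made-up eigenvector values:

def dot(mat_a, mat_b):
    # Row-by-row dot product: entry [i][j] is mat_a[i] . mat_b[j]
    return [[sum(a * b for a, b in zip(row_a, row_b)) for row_b in mat_b]
            for row_a in mat_a]

eigenvectors = [[0.5, 0.5], [0.75, -0.25]]   # made-up components
input_mask = [1, 0]                          # the second input field is missing
factors = []
for row in eigenvectors:
    masked = [m * v for m, v in zip(input_mask, row)]
    factors.append(dot([masked], [masked])[0][0])   # squared norm of masked row
print(factors)                               # [0.25, 0.5625]
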
Example #2
    def confidence_bounds(self, input_array):
        """Computes the confidence interval for the prediction

        """
        product = dot(dot([input_array], self.xtx_inverse),
                      [input_array])[0][0]
        try:
            confidence_interval = self.t_crit * math.sqrt(
                self.mean_squared_error * product)
            prediction_interval = self.t_crit * math.sqrt(
                self.mean_squared_error * (product + 1))
            valid = True
        except ValueError:
            valid = False
            confidence_interval, prediction_interval = (0, 0)

        return {"confidence_interval": confidence_interval,
                "prediction_interval": prediction_interval,
                "valid": valid}
Example #3
    def predict(self, input_data, full=False):
        """Returns the prediction and the confidence intervals

        input_data: Input data to be predicted
        full: Boolean that controls whether to include the prediction's
              attributes. By default, only the prediction is produced. If set
              to True, the rest of available information is added in a
              dictionary format. The dictionary keys can be:
                  - prediction: the prediction value
                  - confidence_bounds: the confidence and prediction
                                       intervals, when available
                  - unused_fields: list of fields in the input data that
                                   are not being used in the model

        """

        # Checks and cleans input_data leaving the fields used in the model
        unused_fields = []
        norm_input_data = self.filter_input_data( \
            input_data,
            add_unused_fields=full)
        if full:
            norm_input_data, unused_fields = norm_input_data

        # Strips affixes for numeric values and casts to the final field type
        cast(norm_input_data, self.fields)

        # If the training data had no missing values, the input data
        # must not have them either
        check_no_training_missings(norm_input_data, self.model_fields,
                                   self.weight_field,
                                   self.objective_id)

        # Computes text and categorical field expansion
        unique_terms = self.get_unique_terms(norm_input_data)

        # Creates an input vector with the values for all expanded fields.
        input_array = self.expand_input(norm_input_data, unique_terms)
        compact_input_array = self.expand_input(norm_input_data, unique_terms,
                                                True)

        prediction = dot([flatten(self.coefficients)], [input_array])[0][0]

        result = {
            "prediction": prediction}
        if self.xtx_inverse:
            result.update({"confidence_bounds": self.confidence_bounds( \
                compact_input_array)})

        if full:
            result.update({"unused_fields": unused_fields})
        else:
            result = result["prediction"]

        return result
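
The prediction itself is just the dot product of the flattened coefficient list with the expanded input vector; the rest of the method prepares that vector and packs the optional extras. A minimal standalone sketch of that final step, assuming `dot` and `flatten` behave as the other examples suggest and using made-up coefficients:

def dot(mat_a, mat_b):
    # Row-by-row dot product: entry [i][j] is mat_a[i] . mat_b[j]
    return [[sum(a * b for a, b in zip(row_a, row_b)) for row_b in mat_b]
            for row_a in mat_a]

def flatten(list_of_lists):
    # Concatenates the inner lists into a single flat list
    return [item for inner in list_of_lists for item in inner]

coefficients = [[0.5, -1.5], [3.0]]   # made-up coefficient groups
input_array = [2.0, 1.0, 1.0]         # made-up expanded input of matching length
prediction = dot([flatten(coefficients)], [input_array])[0][0]
print(prediction)                     # 0.5*2.0 - 1.5*1.0 + 3.0*1.0 = 2.5
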
Example #4
    def predict(self, input_data, full=False):
        """Returns the prediction and the confidence intervals

        input_data: Input data to be predicted
        full: Boolean that controls whether to include the prediction's
              attributes. By default, only the prediction is produced. If set
              to True, the rest of available information is added in a
              dictionary format. The dictionary keys can be:
                  - prediction: the prediction value
                  - confidence_bounds: the confidence and prediction
                                       intervals, when available
                  - unused_fields: list of fields in the input data that
                                   are not being used in the model

        """

        # Checks and cleans input_data leaving the fields used in the model
        unused_fields = []
        new_data = self.filter_input_data( \
            input_data,
            add_unused_fields=full)
        if full:
            new_data, unused_fields = new_data

        # Strips affixes for numeric values and casts to the final field type
        cast(new_data, self.fields)

        # If the training data had no missing values, the input data
        # must not have them either
        check_no_training_missings(new_data, self.fields, self.weight_field,
                                   self.objective_id)

        # Computes text and categorical field expansion
        unique_terms = self.get_unique_terms(new_data)

        # Creates an input vector with the values for all expanded fields.
        input_array = self.expand_input(new_data, unique_terms)
        compact_input_array = self.expand_input(new_data, unique_terms, True)

        prediction = dot([flatten(self.coefficients)], [input_array])[0][0]

        result = {
            "prediction": prediction}
        if self.xtx_inverse is not None:
            result.update({"confidence_bounds": self.confidence_bounds( \
                compact_input_array)})

        if full:
            result.update({"unused_fields": unused_fields})
        else:
            result = result["prediction"]

        return result
Example #5
    def projection(self,
                   input_data,
                   max_components=None,
                   variance_threshold=None,
                   full=False):
        """Returns the projection of input data in the new components

        input_data: Input data to be projected

        """

        new_data = self.filter_input_data( \
            input_data,
            add_unused_fields=False)

        # Strips affixes for numeric values and casts to the final field type
        cast(new_data, self.fields)

        # Computes text and categorical field expansion into an input array of
        # terms and frequencies
        unique_terms = self.get_unique_terms(new_data)

        # Creates an input vector with the values for all expanded fields.
        # The input mask marks the non-missing or categorical fields.
        # The `missings` variable is a boolean indicating whether any
        # non-categorical field is missing
        input_array, missings, input_mask = self.expand_input(
            new_data, unique_terms)
        components = self.eigenvectors[:]
        if max_components is not None:
            components = components[0:max_components]
        if variance_threshold is not None:
            for index, cumulative in enumerate(self.cumulative_variance):
                if cumulative > variance_threshold:
                    components = components[0:index + 1]

        result = [value[0] for value in dot(components, [input_array])]

        # If non-categorical field values are missing in the input data,
        # an additional normalization is applied
        if missings:
            missing_sums = self.missing_factors(input_mask)
            for index, value in enumerate(result):
                result[index] = value / missing_sums[index] \
                    if missing_sums[index] > 0 else value
        if full:
            result = dict(zip(["PC%s" % index \
                for index in range(1, len(components) + 1)], result))
        return result
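
A minimal standalone sketch of the core projection step (the dot product of each retained component with the expanded input, plus the PC labelling used when full=True), assuming the same row-by-row `dot` helper and made-up eigenvectors and input values:

def dot(mat_a, mat_b):
    # Row-by-row dot product: entry [i][j] is mat_a[i] . mat_b[j]
    return [[sum(a * b for a, b in zip(row_a, row_b)) for row_b in mat_b]
            for row_a in mat_a]

eigenvectors = [[0.5, 0.5], [0.5, -0.5]]   # made-up components
input_array = [1.0, 2.0]                   # made-up expanded input
components = eigenvectors[:2]              # e.g. max_components=2

result = [value[0] for value in dot(components, [input_array])]
print(result)                              # [1.5, -0.5]
print(dict(zip(["PC%s" % index
                for index in range(1, len(components) + 1)], result)))
# {'PC1': 1.5, 'PC2': -0.5}
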
Example #6
    def projection(self, input_data, max_components=None,
                   variance_threshold=None, full=False):
        """Returns the projection of input data in the new components

        input_data: Input data to be projected

        """

        new_data = self.filter_input_data( \
            input_data,
            add_unused_fields=False)

        # Strips affixes for numeric values and casts to the final field type
        cast(new_data, self.fields)

        # Computes text and categorical field expansion into an input array of
        # terms and frequencies
        unique_terms = self.get_unique_terms(new_data)

        # Creates an input vector with the values for all expanded fields.
        # The input mask marks the non-missing or categorical fields.
        # The `missings` variable is a boolean indicating whether any
        # non-categorical field is missing
        input_array, missings, input_mask = self.expand_input(new_data,
                                                              unique_terms)
        components = self.eigenvectors[:]
        if max_components is not None:
            components = components[0: max_components]
        if variance_threshold is not None:
            for index, cumulative in enumerate(self.cumulative_variance):
                if cumulative > variance_threshold:
                    components = components[0: index + 1]

        result = [value[0] for value in dot(components, [input_array])]

        # If non-categorical field values are missing in the input data,
        # an additional normalization is applied
        if missings:
            missing_sums = self.missing_factors(input_mask)
            for index, value in enumerate(result):
                result[index] = value / missing_sums[index] \
                    if missing_sums[index] > 0 else value
        if full:
            result = dict(zip(["PC%s" % index \
                for index in range(1, len(components) + 1)], result))
        return result
Example #7
    def categorical_encoding(self, inputs, field_id, compact):
        """Returns the result of combining the encoded categories
        according to the field_codings projections

        The result is the components generated by the categorical field
        """

        new_inputs = inputs[:]

        projections = self.field_codings[field_id].get( \
                CONTRAST, self.field_codings[field_id].get(OTHER))
        if projections is not None:
            new_inputs = flatten(dot(projections, [new_inputs]))

        if compact and self.field_codings[field_id].get(DUMMY) is not None:
            dummy_class = self.field_codings[field_id][DUMMY]
            index = self.categories[field_id].index(dummy_class)
            cat_new_inputs = new_inputs[0:index]
            if len(new_inputs) > (index + 1):
                cat_new_inputs.extend(new_inputs[index + 1:])
            new_inputs = cat_new_inputs

        return new_inputs
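
A minimal standalone sketch of the contrast-coding branch: the one-hot category vector is multiplied by the coding matrix, and each coding row yields one output component (the dummy-coding branch in compact mode simply drops the dummy class position instead). The coding matrix and category below are made up, and `dot`/`flatten` are the same assumed helpers as above:

def dot(mat_a, mat_b):
    # Row-by-row dot product: entry [i][j] is mat_a[i] . mat_b[j]
    return [[sum(a * b for a, b in zip(row_a, row_b)) for row_b in mat_b]
            for row_a in mat_a]

def flatten(list_of_lists):
    # Concatenates the inner lists into a single flat list
    return [item for inner in list_of_lists for item in inner]

one_hot = [0, 1, 0]                 # made-up encoding: second of three categories
projections = [[1, -0.5, -0.5],     # made-up coding matrix, one row per component
               [0, 1, -1]]
print(flatten(dot(projections, [one_hot])))   # [-0.5, 1]
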
Example #8
    def categorical_encoding(self, inputs, field_id, compact):
        """Returns the result of combining the encoded categories
        according to the field_codings projections

        The result is the components generated by the categorical field
        """

        new_inputs = inputs[:]

        projections = self.field_codings[field_id].get( \
                CONTRAST, self.field_codings[field_id].get(OTHER))
        if projections is not None:
            new_inputs = flatten(dot(projections, [new_inputs]))

        if compact and self.field_codings[field_id].get(DUMMY) is not None:
            dummy_class = self.field_codings[field_id][DUMMY]
            index = self.categories[field_id].index(dummy_class)
            cat_new_inputs = new_inputs[0:index]
            if len(new_inputs) > (index + 1):
                cat_new_inputs.extend(new_inputs[index + 1:])
            new_inputs = cat_new_inputs

        return new_inputs