Example #1
    def run(self):
        """
        Runs the model on the validation set in order to fit a probabilistic
        model that will evaluate the accuracy of future predictions.
        """
        np.seterr(divide='warn', invalid='warn')

        output_columns = self.transaction.lmd['predict_columns']
        input_columns = [col for col in self.transaction.lmd['columns'] if col not in output_columns and col not in self.transaction.lmd['columns_to_ignore']]

        # Make predictions on the validation dataset normally and with various columns missing
        normal_predictions = self.transaction.model_backend.predict('validate')
        normal_accuracy = evaluate_accuracy(normal_predictions, self.transaction.input_data.validation_df, self.transaction.lmd['column_stats'], output_columns)

        empty_input_predictions = {}
        empty_input_accuracy = {}

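        # Columns that can be blanked out one at a time: skip file-path columns and any order-by columns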
        ignorable_input_columns = [x for x in input_columns if self.transaction.lmd['column_stats'][x]['data_type'] != DATA_TYPES.FILE_PATH and x not in [y[0] for y in self.transaction.lmd['model_order_by']]]
        for col in ignorable_input_columns:
            empty_input_predictions[col] = self.transaction.model_backend.predict('validate', ignore_columns=[col])
            empty_input_accuracy[col] = evaluate_accuracy(empty_input_predictions[col], self.transaction.input_data.validation_df, self.transaction.lmd['column_stats'], output_columns)

        # Get some information about the importance of each column
        if not self.transaction.lmd['disable_optional_analysis']:
            self.transaction.lmd['column_importances'] = {}
            for col in ignorable_input_columns:
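                # Importance is the relative accuracy drop when the column is blanked out, scaled by 10, rounded up, and capped at 10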
                column_importance = (1 - empty_input_accuracy[col]/normal_accuracy)
                column_importance = np.ceil(10*column_importance)
                self.transaction.lmd['column_importances'][col] = float(10 if column_importance > 10 else column_importance)

        # Run Probabilistic Validator
        overall_accuracy_arr = []
        self.transaction.lmd['accuracy_histogram'] = {}
        self.transaction.lmd['confusion_matrices'] = {}
        self.transaction.lmd['accuracy_samples'] = {}
        self.transaction.hmd['probabilistic_validators'] = {}

        for col in output_columns:
            pval = ProbabilisticValidator(col_stats=self.transaction.lmd['column_stats'][col], col_name=col, input_columns=input_columns)
            predictions_arr = [normal_predictions] + [empty_input_predictions[col] for col in ignorable_input_columns]

            pval.fit(self.transaction.input_data.validation_df, predictions_arr, [[x] for x in ignorable_input_columns])
            overall_accuracy, accuracy_histogram, cm, accuracy_samples = pval.get_accuracy_stats()
            overall_accuracy_arr.append(overall_accuracy)

            self.transaction.lmd['accuracy_histogram'][col] = accuracy_histogram
            self.transaction.lmd['confusion_matrices'][col] = cm
            self.transaction.lmd['accuracy_samples'][col] = accuracy_samples
            self.transaction.hmd['probabilistic_validators'][col] = pickle_obj(pval)

        print(overall_accuracy_arr)
        self.transaction.lmd['validation_set_accuracy'] = sum(overall_accuracy_arr)/len(overall_accuracy_arr)
Example #2
    def run(self):
        """
        # Runs the model on the validation set in order to fit a probabilistic model that will evaluate the accuracy of future predictions
        """

        output_columns = self.transaction.lmd['predict_columns']
        input_columns = [
            col for col in self.transaction.lmd['columns']
            if col not in output_columns
            and col not in self.transaction.lmd['malformed_columns']['names']
        ]

        # Test some hypotheses about our columns

        if self.transaction.lmd['disable_optional_analysis'] is False:
            column_evaluator = ColumnEvaluator(self.transaction)
            column_importances, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution = column_evaluator.get_column_importance(
                model=self.transaction.model_backend,
                output_columns=output_columns,
                input_columns=input_columns,
                full_dataset=self.transaction.input_data.validation_df,
                stats=self.transaction.lmd['column_stats'])

            self.transaction.lmd['column_importances'] = column_importances
            self.transaction.lmd['columns_buckets_importances'] = buckets_stats
            self.transaction.lmd[
                'columnless_prediction_distribution'] = columnless_prediction_distribution
            self.transaction.lmd[
                'all_columns_prediction_distribution'] = all_columns_prediction_distribution

        # Create the probabilistic validators for each of the predict columns
        probabilistic_validators = {}
        for col in output_columns:
            probabilistic_validators[col] = ProbabilisticValidator(
                col_stats=self.transaction.lmd['column_stats'][col])

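        # Columns that can be blanked out one at a time: skip file-path columns and any order-by columns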
        ignorable_input_columns = []
        for input_column in input_columns:
            if self.transaction.lmd['column_stats'][input_column][
                    'data_type'] != DATA_TYPES.FILE_PATH and input_column not in [
                        x[0] for x in self.transaction.lmd['model_order_by']
                    ]:
                ignorable_input_columns.append(input_column)

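        # Baseline predictions on the validation set with every input column present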
        normal_predictions = self.transaction.model_backend.predict('validate')

        # If there are no ignorable columns, register observations on the validation dataset with all features present
        if len(ignorable_input_columns) == 0:
            for pcol in output_columns:
                for i in range(
                        len(self.transaction.input_data.validation_df[pcol])):
                    probabilistic_validators[pcol].register_observation(
                        features_existence=[True for col in input_columns],
                        real_value=self.transaction.input_data.
                        validation_df[pcol].iloc[i],
                        predicted_value=normal_predictions[pcol][i],
                        hmd=self.transaction.hmd)

        # Run on the validation set multiple times, each time with one of the columns blanked out
        for column_name in ignorable_input_columns:
            ignore_columns = [column_name]

            ignore_col_predictions = self.transaction.model_backend.predict(
                'validate', ignore_columns)

            # create a vector that has True for each feature that was passed to the model tester and False if it was blanked
            features_existence = [
                np_col not in ignore_columns for np_col in input_columns
            ]

            for pcol in output_columns:
                for i in range(
                        len(self.transaction.input_data.validation_df[pcol])):
                    probabilistic_validators[pcol].register_observation(
                        features_existence=features_existence,
                        real_value=self.transaction.input_data.
                        validation_df[pcol].iloc[i],
                        predicted_value=ignore_col_predictions[pcol][i],
                        hmd=self.transaction.hmd)
                    probabilistic_validators[pcol].register_observation(
                        features_existence=[True for col in input_columns],
                        real_value=self.transaction.input_data.
                        validation_df[pcol].iloc[i],
                        predicted_value=normal_predictions[pcol][i],
                        hmd=self.transaction.hmd)
        self.transaction.lmd['accuracy_histogram'] = {}

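        # Fit each probabilistic validator and average the per-column accuracies into the overall validation-set accuracy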
        total_accuracy = 0
        for pcol in output_columns:
            probabilistic_validators[pcol].partial_fit()
            accuracy_histogram, validation_set_accuracy = probabilistic_validators[
                pcol].get_accuracy_histogram()
            self.transaction.lmd['accuracy_histogram'][
                pcol] = accuracy_histogram
            total_accuracy += validation_set_accuracy
        self.transaction.lmd['validation_set_accuracy'] = total_accuracy / len(
            output_columns)

        self.transaction.lmd['confusion_matrices'] = {}

        # Pickle for later use
        self.transaction.hmd['probabilistic_validators'] = {}
        for col in probabilistic_validators:
            confusion_matrix = probabilistic_validators[
                col].get_confusion_matrix()
            self.transaction.lmd['confusion_matrices'][col] = confusion_matrix
            self.transaction.hmd['probabilistic_validators'][col] = pickle_obj(
                probabilistic_validators[col])
Example #3
    def run(self):
        """
        # Runs the model on the validation set in order to fit a probabilistic model that will evaluate the accuracy of future predictions
        """

        output_columns = self.transaction.lmd['predict_columns']
        input_columns = [
            col for col in self.transaction.lmd['columns']
            if col not in output_columns
            and col not in self.transaction.lmd['malformed_columns']['names']
        ]
        validation_dataset = {}

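        # Rebuild the validation rows into a column-keyed dict, skipping malformed columns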
        for row_ind in self.transaction.input_data.validation_indexes[
                KEY_NO_GROUP_BY]:
            for col_ind, col in enumerate(self.transaction.lmd['columns']):
                if col in self.transaction.lmd['malformed_columns']['names']:
                    continue
                if col not in validation_dataset:
                    validation_dataset[col] = []
                validation_dataset[col].append(
                    self.transaction.input_data.data_array[row_ind][col_ind])

        # Test some hypotheses about our columns
        column_evaluator = ColumnEvaluator(self.transaction)
        column_importances, buckets_stats = column_evaluator.get_column_importance(
            model=self.transaction.model_backend,
            output_columns=output_columns,
            input_columns=input_columns,
            full_dataset=validation_dataset,
            stats=self.transaction.lmd['column_stats'])
        self.transaction.lmd['column_importances'] = column_importances
        self.transaction.lmd[
            'unusual_columns_buckets_importances'] = buckets_stats

        # Create the probabilistic validators for each of the predict columns
        probabilistic_validators = {}
        for col in output_columns:
            probabilistic_validators[col] = ProbabilisticValidator(
                col_stats=self.transaction.lmd['column_stats'][col])

        # Run on the validation set multiple times, each time with one of the columns blanked out
        for column_name in input_columns:
            ignore_columns = [column_name]

            predictions = self.transaction.model_backend.predict(
                'validate', ignore_columns)

            # create a vector that has True for each feature that was passed to the model tester and False if it was blanked
            features_existence = [
                np_col not in ignore_columns for np_col in input_columns
            ]

            # A separate probabilistic model is trained for each predicted column; we may want to change this in the future (@TODO)
            for pcol in output_columns:
                for i in range(len(predictions[pcol])):
                    predicted_val = predictions[pcol][i]
                    real_val = validation_dataset[pcol][i]
                    probabilistic_validators[pcol].register_observation(
                        features_existence=features_existence,
                        real_value=real_val,
                        predicted_value=predicted_val)

        for pcol in output_columns:
            probabilistic_validators[pcol].partial_fit()

        # Pickle for later use
        self.transaction.hmd['probabilistic_validators'] = {}
        for col in probabilistic_validators:
            self.transaction.hmd['probabilistic_validators'][col] = pickle_obj(
                probabilistic_validators[col])
Example #4
    def run(self):
        """
        # Runs the model on the validation set in order to fit a probabilistic model that will evaluate the accuracy of future predictions
        """

        output_columns = self.transaction.lmd['predict_columns']
        input_columns = [
            col for col in self.transaction.lmd['columns']
            if col not in output_columns
            and col not in self.transaction.lmd['malformed_columns']['names']
        ]

        # Test some hypotheses about our columns
        column_evaluator = ColumnEvaluator(self.transaction)
        column_importances, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution = column_evaluator.get_column_importance(
            model=self.transaction.model_backend,
            output_columns=output_columns,
            input_columns=input_columns,
            full_dataset=self.transaction.input_data.validation_df,
            stats=self.transaction.lmd['column_stats'])

        self.transaction.lmd['column_importances'] = column_importances
        self.transaction.lmd['columns_buckets_importances'] = buckets_stats
        self.transaction.lmd[
            'columnless_prediction_distribution'] = columnless_prediction_distribution
        self.transaction.lmd[
            'all_columns_prediction_distribution'] = all_columns_prediction_distribution

        # Create the probabilistic validators for each of the predict columns
        probabilistic_validators = {}
        for col in output_columns:
            probabilistic_validators[col] = ProbabilisticValidator(
                col_stats=self.transaction.lmd['column_stats'][col])

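        # Columns that can be blanked out one at a time: skip file-path columns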
        ignorable_input_columns = []
        for input_column in input_columns:
            if self.transaction.lmd['column_stats'][input_column][
                    'data_type'] != DATA_TYPES.FILE_PATH:
                ignorable_input_columns.append(input_column)

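        # Predict with every input column present and register those observations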
        predictions = self.transaction.model_backend.predict('validate')
        for pcol in output_columns:
            i = 0
            for real_val in self.transaction.input_data.validation_df[pcol]:
                predicted_val = predictions[pcol][i]
                probabilistic_validators[pcol].register_observation(
                    features_existence=[True for col in input_columns],
                    real_value=real_val,
                    predicted_value=predicted_val)
                i += 1

        # Run on the validation set multiple times, each time with one of the columns blanked out
        for column_name in ignorable_input_columns:
            ignore_columns = [column_name]

            predictions = self.transaction.model_backend.predict(
                'validate', ignore_columns)

            # create a vector that has True for each feature that was passed to the model tester and False if it was blanked
            features_existence = [
                np_col not in ignore_columns for np_col in input_columns
            ]

            # A separate probabilistic model is trained for each predicted column; we may want to change this in the future (@TODO)
            for pcol in output_columns:
                i = 0
                for real_val in self.transaction.input_data.validation_df[pcol]:
                    predicted_val = predictions[pcol][i]
                    probabilistic_validators[pcol].register_observation(
                        features_existence=features_existence,
                        real_value=real_val,
                        predicted_value=predicted_val)
                    i += 1

        self.transaction.lmd['accuracy_histogram'] = {}
        for pcol in output_columns:
            probabilistic_validators[pcol].partial_fit()
            accuracy_histogram = probabilistic_validators[
                pcol].get_accuracy_histogram()
            self.transaction.lmd['accuracy_histogram'][
                pcol] = accuracy_histogram

        # Pickle for later use
        self.transaction.hmd['probabilistic_validators'] = {}
        for col in probabilistic_validators:
            self.transaction.hmd['probabilistic_validators'][col] = pickle_obj(
                probabilistic_validators[col])