예제 #1
0
    def run(self):
        """Run the model on the validation set in order to fit a probabilistic
        model that will evaluate the accuracy of future predictions.

        Populates ``self.transaction.lmd`` with column importances, accuracy
        histograms, confusion matrices and accuracy samples, and stores a
        pickled ``ProbabilisticValidator`` per output column in
        ``self.transaction.hmd['probabilistic_validators']``.
        """
        # Warn (rather than raise) on divide-by-zero / invalid float ops,
        # e.g. when a measured accuracy happens to be 0.
        np.seterr(divide='warn', invalid='warn')

        output_columns = self.transaction.lmd['predict_columns']
        input_columns = [col for col in self.transaction.lmd['columns']
                         if col not in output_columns
                         and col not in self.transaction.lmd['columns_to_ignore']]

        # Make predictions on the validation dataset normally, then once per
        # ignorable column with that column missing.
        normal_predictions = self.transaction.model_backend.predict('validate')
        normal_accuracy = evaluate_accuracy(normal_predictions,
                                            self.transaction.input_data.validation_df,
                                            self.transaction.lmd['column_stats'],
                                            output_columns)

        empty_input_predictions = {}
        empty_input_accuracy = {}  # fixed typo: was `empty_inpurt_accuracy`

        # Columns we may drop: file-path columns and order-by columns must stay.
        order_by_columns = [y[0] for y in self.transaction.lmd['model_order_by']]
        ignorable_input_columns = [
            x for x in input_columns
            if self.transaction.lmd['column_stats'][x]['data_type'] != DATA_TYPES.FILE_PATH
            and x not in order_by_columns
        ]
        for col in ignorable_input_columns:
            empty_input_predictions[col] = self.transaction.model_backend.predict('validate', ignore_columns=[col])
            empty_input_accuracy[col] = evaluate_accuracy(empty_input_predictions[col],
                                                          self.transaction.input_data.validation_df,
                                                          self.transaction.lmd['column_stats'],
                                                          output_columns)

        # Column importance: relative accuracy drop when the column is missing,
        # scaled and clamped to [0, 10].
        if not self.transaction.lmd['disable_optional_analysis']:
            self.transaction.lmd['column_importances'] = {}
            for col in ignorable_input_columns:
                column_importance = 1 - empty_input_accuracy[col] / normal_accuracy
                column_importance = np.ceil(10 * column_importance)
                self.transaction.lmd['column_importances'][col] = float(min(column_importance, 10))

        # Run the Probabilistic Validator once per output column.
        overall_accuracy_arr = []
        self.transaction.lmd['accuracy_histogram'] = {}
        self.transaction.lmd['confusion_matrices'] = {}
        self.transaction.lmd['accuracy_samples'] = {}
        self.transaction.hmd['probabilistic_validators'] = {}

        # NOTE: renamed the loop variable from `col` so it no longer shadows
        # the comprehension over ignorable columns below.
        for output_col in output_columns:
            pval = ProbabilisticValidator(col_stats=self.transaction.lmd['column_stats'][output_col],
                                          col_name=output_col,
                                          input_columns=input_columns)
            predictions_arr = [normal_predictions] + [empty_input_predictions[col] for col in ignorable_input_columns]

            pval.fit(self.transaction.input_data.validation_df, predictions_arr,
                     [[x] for x in ignorable_input_columns])
            overall_accuracy, accuracy_histogram, cm, accuracy_samples = pval.get_accuracy_stats()
            overall_accuracy_arr.append(overall_accuracy)

            self.transaction.lmd['accuracy_histogram'][output_col] = accuracy_histogram
            self.transaction.lmd['confusion_matrices'][output_col] = cm
            self.transaction.lmd['accuracy_samples'][output_col] = accuracy_samples
            self.transaction.hmd['probabilistic_validators'][output_col] = pickle_obj(pval)

        # Removed stray debug print of the per-column accuracy array.
        self.transaction.lmd['validation_set_accuracy'] = sum(overall_accuracy_arr) / len(overall_accuracy_arr)
예제 #2
0
    def get_column_importance(self, model, output_columns, input_columns,
                              full_dataset, stats):
        """Estimate how important each input column is to the model's accuracy.

        For every ignorable input column, the validation accuracy is measured
        (a) with only that column present and (b) with only that column
        missing; the two normalized scores are averaged into an importance
        value. Also collects prediction histograms (used to plot force
        vectors) and per-bucket stats for each output column.

        Returns:
            tuple: (column_importance_dict, buckets_stats,
                    columnless_prediction_distribution,
                    all_columns_prediction_distribution)
        """
        columnless_prediction_distribution = {}
        all_columns_prediction_distribution = {}

        self.normal_predictions = model.predict('validate')
        normal_accuracy = evaluate_accuracy(self.normal_predictions,
                                            full_dataset, stats,
                                            output_columns)
        column_importance_dict = {}
        buckets_stats = {}

        # Histogram for when all columns are present, in order to plot the force vectors
        for output_column in output_columns:
            stats_generator = StatsGenerator(session=None,
                                             transaction=self.transaction)
            input_data = TransactionData()
            input_data.data_frame = self.normal_predictions[[output_column]]
            input_data.columns = [output_column]
            # @TODO: Running stats generator just to get the histogram is very inefficient, change this
            validation_set_output_stats = stats_generator.run(
                input_data=input_data, modify_light_metadata=False)

            if (validation_set_output_stats is not None
                    and 'histogram' in validation_set_output_stats[output_column]):
                all_columns_prediction_distribution[output_column] = \
                    validation_set_output_stats[output_column]['histogram']

        # File-path columns can't be meaningfully ignored.
        ignorable_input_columns = [
            col for col in input_columns
            if stats[col]['data_type'] != DATA_TYPES.FILE_PATH
        ]

        for input_column in ignorable_input_columns:
            # See what happens with the accuracy of the outputs if only this column is present
            ignore_columns = [
                col for col in ignorable_input_columns if col != input_column
            ]
            col_only_predictions = model.predict('validate', ignore_columns)
            col_only_accuracy = evaluate_accuracy(col_only_predictions,
                                                  full_dataset, stats,
                                                  output_columns)

            col_only_normalized_accuracy = col_only_accuracy / normal_accuracy

            # See what happens with the accuracy if all columns but this one are present
            ignore_columns = [input_column]
            col_missing_predictions = model.predict('validate', ignore_columns)

            col_missing_accuracy = evaluate_accuracy(col_missing_predictions,
                                                     full_dataset, stats,
                                                     output_columns)

            col_missing_reverse_accuracy = (
                normal_accuracy - col_missing_accuracy) / normal_accuracy
            column_importance = (col_only_normalized_accuracy +
                                 col_missing_reverse_accuracy) / 2
            column_importance_dict[input_column] = column_importance

            # Histogram for when the column is missing, in order to plot the force vectors
            for output_column in output_columns:
                if output_column not in columnless_prediction_distribution:
                    columnless_prediction_distribution[output_column] = {}
                stats_generator = StatsGenerator(session=None,
                                                 transaction=self.transaction)
                input_data = TransactionData()
                input_data.data_frame = col_missing_predictions[[output_column]]
                input_data.columns = [output_column]

                # @TODO: Running stats generator just to get the histogram is very inefficient, change this
                col_missing_output_stats = stats_generator.run(
                    input_data=input_data, modify_light_metadata=False)

                if (col_missing_output_stats is not None
                        and 'histogram' in col_missing_output_stats[output_column]):
                    columnless_prediction_distribution[output_column][input_column] = \
                        col_missing_output_stats[output_column]['histogram']

        # @TODO should we go back to generating this information based on the buckets of the input columns ? Or just keep doing the stats generation for the input columns based on the indexes of the buckets for the output column

        for output_column in output_columns:
            buckets_stats[output_column] = {}

            # Group validation rows by the bucket their output value falls into.
            # (Removed stray debug print of the row index.)
            bucket_indexes = {}
            for index, row in full_dataset.iterrows():
                value = row[output_column]
                percentage_buckets = stats[output_column].get('percentage_buckets')

                value_bucket = get_value_bucket(value, percentage_buckets,
                                                stats[output_column])
                if value_bucket not in bucket_indexes:
                    bucket_indexes[value_bucket] = []
                bucket_indexes[value_bucket].append(index)

            for bucket in bucket_indexes:
                buckets_stats[output_column][bucket] = {}
                input_data = TransactionData()
                input_data.data_frame = full_dataset.loc[bucket_indexes[bucket]]
                input_data.columns = input_data.data_frame.columns

                stats_generator = StatsGenerator(session=None,
                                                 transaction=self.transaction)
                try:
                    col_buckets_stats = stats_generator.run(
                        input_data=input_data, modify_light_metadata=False)
                    buckets_stats[output_column][bucket].update(col_buckets_stats)
                except Exception:
                    # Best-effort: some buckets may be too small or odd to
                    # generate stats for. (Was a bare `except:`; also fixed
                    # the "Cloud not" typo in the message.)
                    print('Could not generate bucket stats for sub-bucket: {}'.
                          format(bucket))

        return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution
예제 #3
0
    def get_column_importance(self, model, output_columns, input_columns,
                              full_dataset, stats):
        """Estimate how important each input column is to the model's accuracy.

        For every ignorable input column, the validation accuracy is measured
        (a) with only that column present and (b) with only that column
        missing; the two normalized scores are averaged into an importance
        value. Also collects prediction histograms (used to plot force
        vectors) and, for extreme-importance columns, per-bucket stats.

        Returns:
            tuple: (column_importance_dict, buckets_stats,
                    columnless_prediction_distribution,
                    all_columns_prediction_distribution)
        """
        columnless_prediction_distribution = {}
        all_columns_prediction_distribution = {}

        self.normal_predictions = model.predict('validate')
        normal_accuracy = evaluate_accuracy(self.normal_predictions,
                                            full_dataset, stats,
                                            output_columns)
        column_importance_dict = {}
        buckets_stats = {}

        # Histogram for when all columns are present, in order to plot the force vectors
        for output_column in output_columns:
            stats_generator = StatsGenerator(session=None,
                                             transaction=self.transaction)
            input_data = TransactionData()
            input_data.data_array = [[x] for x in
                                     self.normal_predictions[output_column]]
            input_data.columns = [output_column]
            validation_set_output_stats = stats_generator.run(
                input_data=input_data, modify_light_metadata=False)

            if (validation_set_output_stats is not None
                    and 'histogram' in validation_set_output_stats[output_column]):
                all_columns_prediction_distribution[output_column] = \
                    validation_set_output_stats[output_column]['histogram']

        # File-path columns can't be meaningfully ignored.
        ignorable_input_columns = [
            col for col in input_columns
            if stats[col]['data_type'] != DATA_TYPES.FILE_PATH
        ]

        for input_column in ignorable_input_columns:
            # See what happens with the accuracy of the outputs if only this column is present
            ignore_columns = [
                col for col in ignorable_input_columns if col != input_column
            ]
            col_only_predictions = model.predict('validate', ignore_columns)
            col_only_accuracy = evaluate_accuracy(col_only_predictions,
                                                  full_dataset, stats,
                                                  output_columns)

            col_only_normalized_accuracy = col_only_accuracy / normal_accuracy

            # See what happens with the accuracy if all columns but this one are present
            ignore_columns = [input_column]
            col_missing_predictions = model.predict('validate', ignore_columns)

            col_missing_accuracy = evaluate_accuracy(col_missing_predictions,
                                                     full_dataset, stats,
                                                     output_columns)

            col_missing_reverse_accuracy = (
                normal_accuracy - col_missing_accuracy) / normal_accuracy
            column_importance = (col_only_normalized_accuracy +
                                 col_missing_reverse_accuracy) / 2
            column_importance_dict[input_column] = column_importance

            # Histogram for when the column is missing, in order to plot the force vectors
            for output_column in output_columns:
                if output_column not in columnless_prediction_distribution:
                    columnless_prediction_distribution[output_column] = {}
                stats_generator = StatsGenerator(session=None,
                                                 transaction=self.transaction)
                input_data = TransactionData()
                input_data.data_array = [[x] for x in
                                         col_missing_predictions[output_column]]
                input_data.columns = [output_column]
                col_missing_output_stats = stats_generator.run(
                    input_data=input_data, modify_light_metadata=False)

                if (col_missing_output_stats is not None
                        and 'histogram' in col_missing_output_stats[output_column]):
                    columnless_prediction_distribution[output_column][input_column] = \
                        col_missing_output_stats[output_column]['histogram']

            # If this column is either very important or not important at all,
            # compute stats for each of the buckets (in the validation data)
            if column_importance > 0.8 or column_importance < 0.2:
                split_data = {}
                for value in full_dataset[input_column]:
                    bucket = stats[input_column].get('percentage_buckets')

                    vb = get_value_bucket(value, bucket, stats[input_column])
                    split_data.setdefault(f'{input_column}_bucket_{vb}',
                                          []).append(value)

                max_length = max(map(len, split_data.values()))

                # A sub-bucket with 6 or fewer values is not relevant.
                # Bug fix: `columns` used to be appended INSIDE the row loop
                # below, growing max_length times too long and no longer
                # lining up with the width of each row.
                columns = [k for k in split_data if len(split_data[k]) > 6]

                # Transpose the per-bucket value lists into rows, padding the
                # shorter buckets with None.
                row_wise_data = []
                for i in range(max_length):
                    row_wise_data.append([
                        split_data[k][i] if len(split_data[k]) > i else None
                        for k in columns
                    ])

                input_data = TransactionData()
                input_data.data_array = row_wise_data
                input_data.columns = columns

                stats_generator = StatsGenerator(session=None,
                                                 transaction=self.transaction)
                col_buckets_stats = stats_generator.run(
                    input_data=input_data, modify_light_metadata=False)

                buckets_stats.update(col_buckets_stats)

        return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution
예제 #4
0
    def get_column_importance(self, model, output_columns, input_columns, full_dataset, stats):
        """Estimate how important each input column is to the model's accuracy.

        For every ignorable input column, the validation accuracy is measured
        (a) with only that column present and (b) with only that column
        missing; the combined score is scaled into an importance value in
        [1, 10]. Also collects prediction histograms (used to plot force
        vectors) and per-bucket stats for each output column.

        Returns:
            tuple: (column_importance_dict, buckets_stats,
                    columnless_prediction_distribution,
                    all_columns_prediction_distribution)
        """
        columnless_prediction_distribution = {}
        all_columns_prediction_distribution = {}

        with disable_console_output(True):
            normal_predictions = model.predict('validate')
        normal_accuracy = evaluate_accuracy(normal_predictions, full_dataset, stats, output_columns)
        column_importance_dict = {}
        buckets_stats = {}

        # Histogram for when all columns are present, in order to plot the force vectors
        for output_column in output_columns:
            # @TODO: Running stats generator just to get the histogram is very inefficient, change this
            validation_set_output_column_histogram, _ = StatsGenerator.get_histogram(
                normal_predictions[output_column],
                data_type=stats[output_column]['data_type'],
                data_subtype=stats[output_column]['data_subtype'])

            if validation_set_output_column_histogram is not None:
                all_columns_prediction_distribution[output_column] = validation_set_output_column_histogram

        # Columns we may drop: file-path columns and order-by columns must stay.
        order_by_columns = [x[0] for x in self.transaction.lmd['model_order_by']]
        ignorable_input_columns = [
            col for col in input_columns
            if stats[col]['data_type'] != DATA_TYPES.FILE_PATH
            and col not in order_by_columns
        ]

        for input_column in ignorable_input_columns:
            # See what happens with the accuracy of the outputs if only this column is present
            ignore_columns = [col for col in ignorable_input_columns if col != input_column]
            with disable_console_output(True):
                col_only_predictions = model.predict('validate', ignore_columns)
            col_only_accuracy = evaluate_accuracy(col_only_predictions, full_dataset, stats, output_columns)

            # See what happens with the accuracy if all columns but this one are present
            ignore_columns = [input_column]
            with disable_console_output(True):
                col_missing_predictions = model.predict('validate', ignore_columns)
            col_missing_accuracy = evaluate_accuracy(col_missing_predictions, full_dataset, stats, output_columns)

            # Combine both measurements, then scale into [1, 10].
            combined_column_accuracy = ((normal_accuracy - col_missing_accuracy) + col_only_accuracy) / 2
            combined_column_accuracy = max(combined_column_accuracy, 0)
            column_importance = 10 * (1 - (normal_accuracy - combined_column_accuracy) / normal_accuracy)
            column_importance_dict[input_column] = max(column_importance, 1)

            # Histogram for when the column is missing, in order to plot the force vectors
            for output_column in output_columns:
                if output_column not in columnless_prediction_distribution:
                    columnless_prediction_distribution[output_column] = {}

                col_missing_output_histogram, _ = StatsGenerator.get_histogram(
                    col_missing_predictions[output_column],
                    data_type=stats[output_column]['data_type'],
                    data_subtype=stats[output_column]['data_subtype'])

                if col_missing_output_histogram is not None:
                    columnless_prediction_distribution[output_column][input_column] = col_missing_output_histogram

        # @TODO should we go back to generating this information based on the buckets of the input columns ? Or just keep doing the stats generation for the input columns based on the indexes of the buckets for the output column
        for output_column in output_columns:
            buckets_stats[output_column] = {}

            # Group validation rows by the bucket their output value falls into.
            bucket_indexes = {}
            for index, row in full_dataset.iterrows():
                value = row[output_column]
                percentage_buckets = stats[output_column].get('percentage_buckets')

                value_bucket = get_value_bucket(value, percentage_buckets, stats[output_column], self.transaction.hmd)
                if value_bucket not in bucket_indexes:
                    bucket_indexes[value_bucket] = []
                bucket_indexes[value_bucket].append(index)

            for bucket in bucket_indexes:
                buckets_stats[output_column][bucket] = {}
                input_data = TransactionData()
                input_data.data_frame = full_dataset.loc[bucket_indexes[bucket]]
                # Bug fix: was a no-op self-assignment
                # (`input_data.columns = input_data.columns`); mirror the
                # sibling implementation and take the frame's columns.
                input_data.columns = input_data.data_frame.columns

                stats_generator = StatsGenerator(session=None, transaction=self.transaction)
                try:
                    with disable_console_output():
                        col_buckets_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False, print_logs=False)
                    buckets_stats[output_column][bucket].update(col_buckets_stats)
                except Exception:
                    # Deliberately best-effort: some buckets may be too small
                    # or odd to generate stats for; skip them silently.
                    pass

        return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution