Пример #1
0
    def register_observation(self, features_existence, real_value,
                             predicted_value):
        """
        # Register an observation in the validator's internal buffers

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param real_value: The real value/label for this prediction
        :param predicted_value: The predicted value/label
        """
        # For numeric columns coerce both values to float; a real value that
        # can't be parsed is treated as missing (None).
        if self.data_type == DATA_TYPES.NUMERIC:
            predicted_value = float(predicted_value)
            try:
                # Accept comma decimal separators, e.g. "3,14"
                real_value = float(str(real_value).replace(',', '.'))
            except (ValueError, TypeError):
                real_value = None

        if self.buckets is not None:
            # Bucketized mode: encode the predicted bucket as a one-hot vector
            # (one extra slot for out-of-range values), then append the
            # feature-existence flags.
            predicted_value_b = get_value_bucket(predicted_value, self.buckets,
                                                 self.col_stats)
            real_value_b = get_value_bucket(real_value, self.buckets,
                                            self.col_stats)
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = X + features_existence
            self.X_buff.append(X)
            self.Y_buff.append(real_value_b)
        else:
            # Bucket-less mode: features only; the target is whether the
            # prediction matched the real value exactly.
            predicted_value_b = predicted_value
            real_value_b = real_value
            self.X_buff.append(features_existence)
            self.Y_buff.append(real_value_b == predicted_value_b)
Пример #2
0
    def register_observation(self,
                             features_existence,
                             real_value,
                             predicted_value,
                             hmd=None):
        """
        # Register an observation in the validator's internal buffers

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param real_value: The real value/label for this prediction
        :param predicted_value: The predicted value/label
        :param hmd: Optional heavy metadata, forwarded to `get_value_bucket`
        """
        # For numeric columns coerce both values to float; unparsable values
        # are treated as missing (None).
        if self.data_type == DATA_TYPES.NUMERIC:
            try:
                predicted_value = float(predicted_value)
            except (ValueError, TypeError):
                predicted_value = None
            try:
                # Accept comma decimal separators, e.g. "3,14"
                real_value = float(str(real_value).replace(',', '.'))
            except (ValueError, TypeError):
                real_value = None

        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets,
                                                 self.col_stats, hmd)
            real_value_b = get_value_bucket(real_value, self.buckets,
                                            self.col_stats, hmd)
            # One-hot encode the predicted bucket (one extra slot for
            # out-of-range values), then append feature-existence flags.
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = X + features_existence
            self._X_buff.append(X)

            self._Y_buff.append(real_value_b)
            # BUG FIX: the original rebound `self._real_buckets_buff` as an
            # alias of `self._Y_buff` instead of appending, unlike the else
            # branch below.
            self._real_buckets_buff.append(real_value_b)
            self._predicted_buckets_buff.append(predicted_value_b)

            # If no column is ignored, compute the accuracy for this bucket
            # (flags are 0/1 or booleans, so falsy == missing).
            nr_missing_features = len(
                [x for x in features_existence if not x])
            if nr_missing_features == 0:
                if real_value_b not in self.bucket_accuracy:
                    self.bucket_accuracy[real_value_b] = []
                self.bucket_accuracy[real_value_b].append(
                    int(real_value_b == predicted_value_b))
        else:
            predicted_value_b = predicted_value
            real_value_b = real_value
            self._X_buff.append(features_existence)
            self._Y_buff.append(real_value_b == predicted_value_b)
            self._real_buckets_buff.append(real_value_b)
            self._predicted_buckets_buff.append(predicted_value_b)
Пример #3
0
    def evaluate_prediction_accuracy(self, features_existence, predicted_value):
        """
        # Evaluate how confident we can be in a prediction

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param predicted_value: The predicted value/label
        :return: The probability (from 0 to 1) of our prediction being accurate (within the same histogram bucket as the real value)
        """
        # Build the feature vector the same way `register_observation` does:
        # one-hot predicted bucket + feature-existence flags when buckets are
        # available, existence flags alone otherwise.
        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = [X + features_existence]
        else:
            X = [features_existence]

        #X = [[predicted_value_b, *features_existence]]
        # Silence divide-by-zero warnings during predict_proba, then restore
        # the caller's numpy error settings.
        log_types = np.seterr()
        np.seterr(divide='ignore')
        distribution = self._probabilistic_model.predict_proba(np.array(X))
        np.seterr(divide=log_types['divide'])

        if self.buckets is not None:
            return ProbabilityEvaluation(self.buckets, distribution[0].tolist(), predicted_value).most_likely_probability
        else:
            # Binary model: column 1 is the probability of a correct prediction.
            return distribution[0][1]
Пример #4
0
    def evaluate_prediction_accuracy(self, features_existence,
                                     predicted_value):
        """
        # Evaluate how confident we can be in a prediction

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param predicted_value: The predicted value/label
        :return: The probability (from 0 to 1) of our prediction being accurate (within the same histogram bucket as the real value)
        """
        # Build the feature vector the same way the fit-time encoding does:
        # one-hot predicted bucket + feature-existence flags when buckets are
        # available, existence flags alone otherwise.
        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets,
                                                 self.col_stats)
            X = [0] * (len(self.buckets) + 1)
            X[predicted_value_b] = 1
            X = [X + features_existence]
        else:
            X = [features_existence]

        # The model's Y vector may contain only False (e.g. nothing was ever
        # predicted correctly during fitting), in which case there is no
        # `True` class column to read.
        try:
            true_index = self._probabilistic_model.classes_.tolist().index(
                True)
        except ValueError:
            print('Only got classes: ',
                  str(self._probabilistic_model.classes_.tolist()),
                  ' in the probabilistic model\'s Y vector !')
            true_index = None

        if true_index is None:
            probability_true_prediction = 0
        else:
            probability_true_prediction = self._probabilistic_model.predict_proba(
                np.array(X))[0][true_index]

        return probability_true_prediction
Пример #5
0
def compute_outlier_buckets(outlier_values, hist_x, hist_y, percentage_buckets,
                            col_stats):
    """
    Determine which histogram buckets are dominated by outlier values.

    :param outlier_values: Values flagged as outliers for this column
    :param hist_x: Histogram bucket labels
    :param hist_y: Histogram counts, parallel to `hist_x`
    :param percentage_buckets: Bucket boundaries used by `get_value_bucket`
    :param col_stats: Statistics dict for the column
    :return: List of buckets that are either in the top 5% by outlier count
             or have more than half of their values flagged as outliers
    """
    outlier_buckets = []
    # map each bucket to list of outliers in it
    bucket_outliers = defaultdict(list)
    for value in outlier_values:
        vb_index = get_value_bucket(value, percentage_buckets, col_stats)
        bucket_outliers[percentage_buckets[vb_index]].append(value)

    # Sort buckets by number of outliers in ascending order. (Entries only
    # exist for buckets that received at least one outlier, so no filtering
    # is needed.)
    buckets_with_outliers = sorted(bucket_outliers.items(),
                                   key=lambda kv: len(kv[1]))

    # NOTE: renamed loop variable — the original shadowed the
    # `outlier_values` parameter.
    for i, (bucket, bucket_outlier_values) in enumerate(buckets_with_outliers):
        bucket_index = hist_x.index(bucket)

        bucket_values_num = hist_y[bucket_index]
        bucket_outliers_num = len(bucket_outlier_values)

        # Is the bucket in the 95th percentile by number of outliers?
        percentile_outlier = ((i + 1) / len(buckets_with_outliers)) >= 0.95

        # Are half of values in the bucket outliers?
        predominantly_outlier = False
        if bucket_values_num:
            predominantly_outlier = (bucket_outliers_num /
                                     bucket_values_num) > 0.5

        if predominantly_outlier or percentile_outlier:
            outlier_buckets.append(bucket)
    return outlier_buckets
Пример #6
0
    def evaluate_prediction_accuracy(self, features_existence, predicted_value,
                                     always_use_model_prediction):
        """
        # Evaluate how confident we can be in a prediction

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param predicted_value: The predicted value/label
        :param always_use_model_prediction: Forwarded to `ProbabilityEvaluation`
        :return: A `ProbabilityEvaluation` over the bucket distribution
        """
        # Build the feature vector the same way the fit-time encoding does.
        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets,
                                                 self.col_stats)
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = [X + features_existence]
        else:
            X = [features_existence]

        distribution = self._probabilistic_model.predict_proba(np.array(X))[0]
        distribution = distribution.tolist()

        if len([x for x in distribution if x > 0.01]) > 4:
            # @HACK
            # When probability mass is smeared over many buckets, sharpen the
            # distribution: zero everything below one standard deviation under
            # the mean, renormalize, subtract the smallest surviving mass,
            # then renormalize again.
            mean = np.mean(distribution)
            std = np.std(distribution)

            distribution = [x if x > (mean - std) else 0 for x in distribution]

            sum_dist = sum(distribution)
            # Avoid divison by zero in certain edge cases
            sum_dist = 0.00001 if sum_dist == 0 else sum_dist
            distribution = [x / sum_dist for x in distribution]

            min_val = min([x for x in distribution if x > 0.001])
            distribution = [
                x - min_val if x > min_val else 0 for x in distribution
            ]

            sum_dist = sum(distribution)
            # Avoid divison by zero in certain edge cases
            sum_dist = 0.00001 if sum_dist == 0 else sum_dist
            distribution = [x / sum_dist for x in distribution]
            # @HACK

        return ProbabilityEvaluation(self.buckets, distribution,
                                     predicted_value,
                                     always_use_model_prediction)
Пример #7
0
    def get_column_importance(self, model, output_columns, input_columns,
                              full_dataset, stats):
        """
        Estimate how important each input column is to the model's accuracy.

        For every ignorable input column, accuracy is measured both with only
        that column present and with only that column missing; the importance
        is the mean of the two normalized accuracy deltas.

        :param model: Model exposing `predict(split[, ignore_columns])`
        :param output_columns: Columns the model predicts
        :param input_columns: Columns the model takes as input
        :param full_dataset: Dataframe with the validation data
        :param stats: Per-column statistics dict
        :return: Tuple of (column_importance_dict, buckets_stats,
                 columnless_prediction_distribution,
                 all_columns_prediction_distribution)
        """
        columnless_prediction_distribution = {}
        all_columns_prediction_distribution = {}

        self.normal_predictions = model.predict('validate')
        normal_accuracy = evaluate_accuracy(self.normal_predictions,
                                            full_dataset, stats,
                                            output_columns)
        column_importance_dict = {}
        buckets_stats = {}

        # Histogram for when all columns are present, in order to plot the force vectors
        for output_column in output_columns:
            stats_generator = StatsGenerator(session=None,
                                             transaction=self.transaction)
            input_data = TransactionData()
            input_data.data_frame = self.normal_predictions[[output_column]]
            input_data.columns = [output_column]
            # @TODO: Running stats generator just to get the histogram is very inefficient, change this
            validation_set_output_stats = stats_generator.run(
                input_data=input_data, modify_light_metadata=False)

            if validation_set_output_stats is None:
                pass
            elif 'histogram' in validation_set_output_stats[output_column]:
                all_columns_prediction_distribution[
                    output_column] = validation_set_output_stats[
                        output_column]['histogram']

        # File-path columns can't be meaningfully ignored
        ignorable_input_columns = []
        for input_column in input_columns:
            if stats[input_column]['data_type'] != DATA_TYPES.FILE_PATH:
                ignorable_input_columns.append(input_column)

        for input_column in ignorable_input_columns:
            # See what happens with the accuracy of the outputs if only this column is present
            ignore_columns = [
                col for col in ignorable_input_columns if col != input_column
            ]
            col_only_predictions = model.predict('validate', ignore_columns)
            col_only_accuracy = evaluate_accuracy(col_only_predictions,
                                                  full_dataset, stats,
                                                  output_columns)

            col_only_normalized_accuracy = col_only_accuracy / normal_accuracy

            # See what happens with the accuracy if all columns but this one are present
            ignore_columns = [input_column]
            col_missing_predictions = model.predict('validate', ignore_columns)

            col_missing_accuracy = evaluate_accuracy(col_missing_predictions,
                                                     full_dataset, stats,
                                                     output_columns)

            col_missing_reverse_accuracy = (
                normal_accuracy - col_missing_accuracy) / normal_accuracy
            column_importance = (col_only_normalized_accuracy +
                                 col_missing_reverse_accuracy) / 2
            column_importance_dict[input_column] = column_importance

            # Histogram for when the column is missing, in order to plot the force vectors
            for output_column in output_columns:
                if output_column not in columnless_prediction_distribution:
                    columnless_prediction_distribution[output_column] = {}
                stats_generator = StatsGenerator(session=None,
                                                 transaction=self.transaction)
                input_data = TransactionData()
                input_data.data_frame = col_missing_predictions[[
                    output_column
                ]]
                input_data.columns = [output_column]

                # @TODO: Running stats generator just to get the histogram is very inefficient, change this
                col_missing_output_stats = stats_generator.run(
                    input_data=input_data, modify_light_metadata=False)

                if col_missing_output_stats is None:
                    pass
                elif 'histogram' in col_missing_output_stats[output_column]:
                    columnless_prediction_distribution[output_column][
                        input_column] = col_missing_output_stats[
                            output_column]['histogram']

        # @TODO should be go back to generating this information based on the buckets of the input columns ? Or just keep doing the stats generation for the input columns based on the indexes of the buckets for the output column
        #for column in ignorable_input_columns:
        #    if c(column_importance_dict[column] > 0.8 or column_importance_dict[column] < 0.2):

        # Group validation rows by the bucket of each output value, then run
        # the stats generator on each group.
        for output_column in output_columns:
            buckets_stats[output_column] = {}

            bucket_indexes = {}
            for index, row in full_dataset.iterrows():
                value = row[output_column]
                if 'percentage_buckets' in stats[output_column]:
                    percentage_buckets = stats[output_column][
                        'percentage_buckets']
                else:
                    percentage_buckets = None

                value_bucket = get_value_bucket(value, percentage_buckets,
                                                stats[output_column])
                if value_bucket not in bucket_indexes:
                    bucket_indexes[value_bucket] = []
                bucket_indexes[value_bucket].append(index)

            for bucket in bucket_indexes:
                buckets_stats[output_column][bucket] = {}
                input_data = TransactionData()
                input_data.data_frame = full_dataset.loc[
                    bucket_indexes[bucket]]
                input_data.columns = input_data.data_frame.columns

                stats_generator = StatsGenerator(session=None,
                                                 transaction=self.transaction)
                # Best effort: some buckets are too small/odd for stats
                try:
                    col_buckets_stats = stats_generator.run(
                        input_data=input_data, modify_light_metadata=False)
                    buckets_stats[output_column][bucket].update(
                        col_buckets_stats)
                except Exception:
                    print('Could not generate bucket stats for sub-bucket: {}'.
                          format(bucket))

        return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution
Пример #8
0
    def get_column_importance(self, model, output_columns, input_columns,
                              full_dataset, stats):
        """
        Estimate how important each input column is to the model's accuracy.

        For every ignorable input column, accuracy is measured both with only
        that column present and with only that column missing; the importance
        is the mean of the two normalized accuracy deltas.

        :param model: Model exposing `predict(split[, ignore_columns])`
        :param output_columns: Columns the model predicts
        :param input_columns: Columns the model takes as input
        :param full_dataset: Dataframe with the validation data
        :param stats: Per-column statistics dict
        :return: Tuple of (column_importance_dict, buckets_stats,
                 columnless_prediction_distribution,
                 all_columns_prediction_distribution)
        """
        columnless_prediction_distribution = {}
        all_columns_prediction_distribution = {}

        self.normal_predictions = model.predict('validate')
        normal_accuracy = evaluate_accuracy(self.normal_predictions,
                                            full_dataset, stats,
                                            output_columns)
        column_importance_dict = {}
        buckets_stats = {}

        # Histogram for when all columns are present, in order to plot the force vectors
        for output_column in output_columns:
            stats_generator = StatsGenerator(session=None,
                                             transaction=self.transaction)
            input_data = TransactionData()
            input_data.data_array = list(
                map(lambda x: [x],
                    list(self.normal_predictions[output_column])))
            input_data.columns = [output_column]
            validation_set_output_stats = stats_generator.run(
                input_data=input_data, modify_light_metadata=False)

            if validation_set_output_stats is None:
                pass
            elif 'histogram' in validation_set_output_stats[output_column]:
                all_columns_prediction_distribution[
                    output_column] = validation_set_output_stats[
                        output_column]['histogram']

        # File-path columns can't be meaningfully ignored
        ignorable_input_columns = []
        for input_column in input_columns:
            if stats[input_column]['data_type'] != DATA_TYPES.FILE_PATH:
                ignorable_input_columns.append(input_column)

        for input_column in ignorable_input_columns:
            # See what happens with the accuracy of the outputs if only this column is present
            ignore_columns = [
                col for col in ignorable_input_columns if col != input_column
            ]
            col_only_predictions = model.predict('validate', ignore_columns)
            col_only_accuracy = evaluate_accuracy(col_only_predictions,
                                                  full_dataset, stats,
                                                  output_columns)

            col_only_normalized_accuracy = col_only_accuracy / normal_accuracy

            # See what happens with the accuracy if all columns but this one are present
            ignore_columns = [input_column]
            col_missing_predictions = model.predict('validate', ignore_columns)

            col_missing_accuracy = evaluate_accuracy(col_missing_predictions,
                                                     full_dataset, stats,
                                                     output_columns)

            col_missing_reverse_accuracy = (
                normal_accuracy - col_missing_accuracy) / normal_accuracy
            column_importance = (col_only_normalized_accuracy +
                                 col_missing_reverse_accuracy) / 2
            column_importance_dict[input_column] = column_importance

            # Histogram for when the column is missing, in order to plot the force vectors
            for output_column in output_columns:
                if output_column not in columnless_prediction_distribution:
                    columnless_prediction_distribution[output_column] = {}
                stats_generator = StatsGenerator(session=None,
                                                 transaction=self.transaction)
                input_data = TransactionData()
                input_data.data_array = list(
                    map(lambda x: [x],
                        list(col_missing_predictions[output_column])))
                input_data.columns = [output_column]
                col_missing_output_stats = stats_generator.run(
                    input_data=input_data, modify_light_metadata=False)

                if col_missing_output_stats is None:
                    pass
                elif 'histogram' in col_missing_output_stats[output_column]:
                    columnless_prediction_distribution[output_column][
                        input_column] = col_missing_output_stats[
                            output_column]['histogram']

            # If this column is either very important or not important at all, compute stats for each of the buckets (in the validation data)
            if column_importance > 0.8 or column_importance < 0.2:
                split_data = {}
                for value in full_dataset[input_column]:

                    if 'percentage_buckets' in stats[input_column]:
                        bucket = stats[input_column]['percentage_buckets']
                    else:
                        bucket = None

                    vb = get_value_bucket(value, bucket, stats[input_column])
                    if f'{input_column}_bucket_{vb}' not in split_data:
                        split_data[f'{input_column}_bucket_{vb}'] = []

                    split_data[f'{input_column}_bucket_{vb}'].append(value)

                # If a sub bucket has less than 6 values, it's not relevant
                # BUG FIX: the original appended to `columns` inside the row
                # loop, duplicating every column name once per row.
                columns = [
                    k for k in split_data.keys() if len(split_data[k]) > 6
                ]

                max_length = max(list(map(len, split_data.values())))

                # Pad shorter buckets with None so every row has a value for
                # every retained column.
                row_wise_data = []
                for i in range(max_length):
                    row_wise_data.append([])
                    for k in columns:
                        if len(split_data[k]) > i:
                            row_wise_data[-1].append(split_data[k][i])
                        else:
                            row_wise_data[-1].append(None)

                input_data = TransactionData()
                input_data.data_array = row_wise_data
                input_data.columns = columns

                stats_generator = StatsGenerator(session=None,
                                                 transaction=self.transaction)
                col_buckets_stats = stats_generator.run(
                    input_data=input_data, modify_light_metadata=False)

                buckets_stats.update(col_buckets_stats)

        return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution
Пример #9
0
    def fit(self, real_df, predictions_arr, missing_col_arr, hmd=None):
        """
        # Fit the probabilistic validator

        :param real_df: A dataframe with the real inputs and outputs for every row
        :param predictions_arr: An array containing arrays of predictions, one containing the "normal" predictions and the rest containing predictions with various missing column
        :param missing_col_arr: The missing columns for each of the prediction arrays, same order as the arrays in `predictions_arr`, starting from the second element of `predictions_arr` (The first is assumed to have no missing columns)
        :param hmd: Optional heavy metadata, forwarded to `get_value_bucket`
        """
        self.real_values_bucketized = []
        self.normal_predictions_bucketized = []

        # Per-row 0/1 flags for whether each input column has a usable value
        real_present_inputs_arr = []
        for _, row in real_df.iterrows():
            present_inputs = [1] * len(self.input_columns)
            for i, col in enumerate(self.input_columns):
                if str(row[col]) in ('None', 'nan', '', 'Nan', 'NAN', 'NaN'):
                    present_inputs[i] = 0
            real_present_inputs_arr.append(present_inputs)

        X = []
        Y = []
        for n in range(len(predictions_arr)):
            for m in range(len(real_df)):
                row = real_df.iloc[m]
                predicted_value = predictions_arr[n][self.col_name][m]
                real_value = row[self.col_name]

                # For numeric columns coerce both values to float; unparsable
                # values are treated as missing (None).
                if self.col_stats['data_type'] == DATA_TYPES.NUMERIC:
                    try:
                        predicted_value = float(predicted_value)
                    except (ValueError, TypeError):
                        predicted_value = None
                    try:
                        real_value = float(str(real_value).replace(',', '.'))
                    except (ValueError, TypeError):
                        real_value = None

                if self.buckets is not None:
                    predicted_value_b = get_value_bucket(
                        predicted_value, self.buckets, self.col_stats, hmd)
                    real_value_b = get_value_bucket(real_value, self.buckets,
                                                    self.col_stats, hmd)

                    # One-hot encode the predicted bucket (one extra slot for
                    # out-of-range values)
                    X.append([0] * (len(self.buckets) + 1))
                    X[-1][predicted_value_b] = 1
                else:
                    predicted_value_b = predicted_value
                    # BUG FIX: the original read `real_value_b = real_value_b`
                    # (self-assignment), a NameError on the first bucket-less
                    # row.
                    real_value_b = real_value

                    X.append([])

                Y.append(real_value_b == predicted_value_b)

                # Only the "normal" prediction array defines the reference
                # bucketized values.
                if n == 0:
                    self.real_values_bucketized.append(real_value_b)
                    self.normal_predictions_bucketized.append(
                        predicted_value_b)

                # BUG FIX: copy before zeroing — the original mutated the
                # shared row in `real_present_inputs_arr`, so missing-column
                # zeros leaked into later iterations of `n`.
                feature_existence = list(real_present_inputs_arr[m])
                if n > 0:
                    for missing_col in missing_col_arr[n - 1]:
                        feature_existence[self.input_columns.index(
                            missing_col)] = 0

                X[-1] += feature_existence

        # Silence divide-by-zero warnings during fitting, then restore the
        # caller's numpy error settings.
        log_types = np.seterr()
        np.seterr(divide='ignore')
        self._probabilistic_model.fit(X, Y)
        np.seterr(divide=log_types['divide'])
Пример #10
0
    def get_column_importance(self, model, output_columns, input_columns, full_dataset, stats):
        """
        Estimate how important each input column is to the model's accuracy,
        on a 1-10 scale.

        :param model: Model exposing `predict(split[, ignore_columns])`
        :param output_columns: Columns the model predicts
        :param input_columns: Columns the model takes as input
        :param full_dataset: Dataframe with the validation data
        :param stats: Per-column statistics dict
        :return: Tuple of (column_importance_dict, buckets_stats,
                 columnless_prediction_distribution,
                 all_columns_prediction_distribution)
        """
        columnless_prediction_distribution = {}
        all_columns_prediction_distribution = {}

        with disable_console_output(True):
            normal_predictions = model.predict('validate')
        normal_accuracy = evaluate_accuracy(normal_predictions, full_dataset, stats, output_columns)
        column_importance_dict = {}
        buckets_stats = {}

        # Histogram for when all columns are present, in order to plot the force vectors
        for output_column in output_columns:
            # @TODO: Running stats generator just to get the histogram is very inefficient, change this
            validation_set_output_column_histogram, _ = StatsGenerator.get_histogram(normal_predictions[output_column], data_type=stats[output_column]['data_type'],data_subtype=stats[output_column]['data_subtype'])

            if validation_set_output_column_histogram is not None:
                all_columns_prediction_distribution[output_column] = validation_set_output_column_histogram

        # File-path columns and order-by columns can't be meaningfully ignored
        ignorable_input_columns = []
        for input_column in input_columns:
            if stats[input_column]['data_type'] != DATA_TYPES.FILE_PATH and input_column not in [x[0] for x in self.transaction.lmd['model_order_by']]:
                ignorable_input_columns.append(input_column)

        for input_column in ignorable_input_columns:
            # See what happens with the accuracy of the outputs if only this column is present
            ignore_columns = [col for col in ignorable_input_columns if col != input_column]
            with disable_console_output(True):
                col_only_predictions = model.predict('validate', ignore_columns)
            col_only_accuracy = evaluate_accuracy(col_only_predictions, full_dataset, stats, output_columns)

            # See what happens with the accuracy if all columns but this one are present
            ignore_columns = [input_column]
            with disable_console_output(True):
                col_missing_predictions = model.predict('validate', ignore_columns)
            col_missing_accuracy = evaluate_accuracy(col_missing_predictions, full_dataset, stats, output_columns)

            # Blend the two accuracy deltas, then map onto a clamped 1-10 scale
            combined_column_accuracy = ((normal_accuracy - col_missing_accuracy) + col_only_accuracy)/2
            if combined_column_accuracy < 0:
                combined_column_accuracy = 0
            column_importance = 10*(1 - (normal_accuracy - combined_column_accuracy)/normal_accuracy)
            if column_importance < 1:
                column_importance = 1
            column_importance_dict[input_column] = column_importance

            # Histogram for when the column is missing, in order to plot the force vectors
            for output_column in output_columns:
                if output_column not in columnless_prediction_distribution:
                    columnless_prediction_distribution[output_column] = {}

                col_missing_output_histogram, _ = StatsGenerator.get_histogram(col_missing_predictions[output_column], data_type=stats[output_column]['data_type'],data_subtype=stats[output_column]['data_subtype'])

                if col_missing_output_histogram is not None:
                    columnless_prediction_distribution[output_column][input_column] = col_missing_output_histogram

        # @TODO should be go back to generating this information based on the buckets of the input columns ? Or just keep doing the stats generation for the input columns based on the indexes of the buckets for the output column
        # Group validation rows by the bucket of each output value, then run
        # the stats generator on each group.
        for output_column in output_columns:
            buckets_stats[output_column] = {}

            bucket_indexes = {}
            for index, row in full_dataset.iterrows():
                value = row[output_column]
                if 'percentage_buckets' in stats[output_column]:
                    percentage_buckets = stats[output_column]['percentage_buckets']
                else:
                    percentage_buckets = None

                value_bucket = get_value_bucket(value, percentage_buckets, stats[output_column], self.transaction.hmd)
                if value_bucket not in bucket_indexes:
                    bucket_indexes[value_bucket] = []
                bucket_indexes[value_bucket].append(index)

            for bucket in bucket_indexes:
                buckets_stats[output_column][bucket] = {}
                input_data = TransactionData()
                input_data.data_frame = full_dataset.loc[bucket_indexes[bucket]]
                # BUG FIX: the original self-assigned `input_data.columns`;
                # the columns must come from the sliced data frame.
                input_data.columns = input_data.data_frame.columns

                stats_generator = StatsGenerator(session=None, transaction=self.transaction)
                # Best effort: some buckets are too small/odd for stats
                try:
                    with disable_console_output():
                        col_buckets_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False, print_logs=False)
                    buckets_stats[output_column][bucket].update(col_buckets_stats)
                except Exception:
                    pass

        return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution