def run(self):
    """
    Run the model on the validation set in order to fit a probabilistic
    model that will evaluate the accuracy of future predictions.

    Side effects: populates `self.transaction.lmd` with column importances,
    accuracy histograms, confusion matrices, accuracy samples and the
    overall validation-set accuracy, and stores one pickled
    ProbabilisticValidator per output column in `self.transaction.hmd`.
    """
    np.seterr(divide='warn', invalid='warn')

    output_columns = self.transaction.lmd['predict_columns']
    input_columns = [
        col for col in self.transaction.lmd['columns']
        if col not in output_columns
        and col not in self.transaction.lmd['columns_to_ignore']
    ]

    # Make predictions on the validation dataset normally and with various
    # columns missing, to measure how much each input column matters.
    normal_predictions = self.transaction.model_backend.predict('validate')
    normal_accuracy = evaluate_accuracy(
        normal_predictions,
        self.transaction.input_data.validation_df,
        self.transaction.lmd['column_stats'],
        output_columns
    )

    empty_input_predictions = {}
    empty_input_accuracy = {}
    # Columns we can drop one at a time: file-path columns and the
    # order-by columns cannot be ignored by the backend.
    ignorable_input_columns = [
        x for x in input_columns
        if self.transaction.lmd['column_stats'][x]['data_type'] != DATA_TYPES.FILE_PATH
        and x not in [y[0] for y in self.transaction.lmd['model_order_by']]
    ]

    for col in ignorable_input_columns:
        empty_input_predictions[col] = self.transaction.model_backend.predict('validate', ignore_columns=[col])
        empty_input_accuracy[col] = evaluate_accuracy(
            empty_input_predictions[col],
            self.transaction.input_data.validation_df,
            self.transaction.lmd['column_stats'],
            output_columns
        )

    # Column importance: the relative accuracy drop when the column is
    # missing, scaled to an integer in [0, 10] (capped at 10).
    if not self.transaction.lmd['disable_optional_analysis']:
        self.transaction.lmd['column_importances'] = {}
        for col in ignorable_input_columns:
            column_importance = (1 - empty_input_accuracy[col] / normal_accuracy)
            column_importance = np.ceil(10 * column_importance)
            self.transaction.lmd['column_importances'][col] = float(10 if column_importance > 10 else column_importance)

    # Run the Probabilistic Validator: fit one per output column on the
    # normal predictions plus every "column missing" prediction set.
    overall_accuracy_arr = []
    self.transaction.lmd['accuracy_histogram'] = {}
    self.transaction.lmd['confusion_matrices'] = {}
    self.transaction.lmd['accuracy_samples'] = {}
    self.transaction.hmd['probabilistic_validators'] = {}
    for col in output_columns:
        pval = ProbabilisticValidator(
            col_stats=self.transaction.lmd['column_stats'][col],
            col_name=col,
            input_columns=input_columns
        )
        # `c` deliberately distinct from the enclosing loop variable `col`.
        predictions_arr = [normal_predictions] + [empty_input_predictions[c] for c in ignorable_input_columns]

        pval.fit(self.transaction.input_data.validation_df, predictions_arr, [[x] for x in ignorable_input_columns])
        overall_accuracy, accuracy_histogram, cm, accuracy_samples = pval.get_accuracy_stats()
        overall_accuracy_arr.append(overall_accuracy)

        self.transaction.lmd['accuracy_histogram'][col] = accuracy_histogram
        self.transaction.lmd['confusion_matrices'][col] = cm
        self.transaction.lmd['accuracy_samples'][col] = accuracy_samples
        self.transaction.hmd['probabilistic_validators'][col] = pickle_obj(pval)

    # Overall validation accuracy is the mean across output columns.
    self.transaction.lmd['validation_set_accuracy'] = sum(overall_accuracy_arr) / len(overall_accuracy_arr)
def get_column_importance(self, model, output_columns, input_columns, full_dataset, stats):
    """
    Estimate each input column's importance by comparing prediction accuracy
    with only that column present versus with only that column missing, and
    collect histograms / per-bucket stats used to plot force vectors.

    :param model: backend exposing `predict('validate', ignore_columns)`
    :param output_columns: columns the model predicts
    :param input_columns: columns the model consumes
    :param full_dataset: validation dataframe
    :param stats: per-column statistics (data types, percentage buckets, ...)
    :return: (column_importance_dict, buckets_stats,
              columnless_prediction_distribution, all_columns_prediction_distribution)
    """
    columnless_prediction_distribution = {}
    all_columns_prediction_distribution = {}

    self.normal_predictions = model.predict('validate')
    normal_accuracy = evaluate_accuracy(self.normal_predictions, full_dataset, stats, output_columns)
    column_importance_dict = {}
    buckets_stats = {}

    # Histogram for when all columns are present, in order to plot the force vectors
    for output_column in output_columns:
        stats_generator = StatsGenerator(session=None, transaction=self.transaction)
        input_data = TransactionData()
        input_data.data_frame = self.normal_predictions[[output_column]]
        input_data.columns = [output_column]
        # @TODO: Running stats generator just to get the histogram is very inefficient, change this
        validation_set_output_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False)

        if validation_set_output_stats is None:
            pass
        elif 'histogram' in validation_set_output_stats[output_column]:
            all_columns_prediction_distribution[output_column] = validation_set_output_stats[output_column]['histogram']

    # File-path columns cannot be dropped from the input, so skip them.
    ignorable_input_columns = []
    for input_column in input_columns:
        if stats[input_column]['data_type'] != DATA_TYPES.FILE_PATH:
            ignorable_input_columns.append(input_column)

    for input_column in ignorable_input_columns:
        # See what happens with the accuracy of the outputs if only this column is present
        ignore_columns = [col for col in ignorable_input_columns if col != input_column]
        col_only_predictions = model.predict('validate', ignore_columns)
        col_only_accuracy = evaluate_accuracy(col_only_predictions, full_dataset, stats, output_columns)
        col_only_normalized_accuracy = col_only_accuracy / normal_accuracy

        # See what happens with the accuracy if all columns but this one are present
        ignore_columns = [input_column]
        col_missing_predictions = model.predict('validate', ignore_columns)
        col_missing_accuracy = evaluate_accuracy(col_missing_predictions, full_dataset, stats, output_columns)
        col_missing_reverse_accuracy = (normal_accuracy - col_missing_accuracy) / normal_accuracy

        column_importance = (col_only_normalized_accuracy + col_missing_reverse_accuracy) / 2
        column_importance_dict[input_column] = column_importance

        # Histogram for when the column is missing, in order to plot the force vectors
        for output_column in output_columns:
            if output_column not in columnless_prediction_distribution:
                columnless_prediction_distribution[output_column] = {}
            stats_generator = StatsGenerator(session=None, transaction=self.transaction)
            input_data = TransactionData()
            input_data.data_frame = col_missing_predictions[[output_column]]
            input_data.columns = [output_column]
            # @TODO: Running stats generator just to get the histogram is very inefficient, change this
            col_missing_output_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False)

            if col_missing_output_stats is None:
                pass
            elif 'histogram' in col_missing_output_stats[output_column]:
                columnless_prediction_distribution[output_column][input_column] = col_missing_output_stats[output_column]['histogram']

    # @TODO: should we go back to generating this information based on the buckets
    # of the input columns, or keep doing the stats generation for the input
    # columns based on the indexes of the buckets for the output column?
    for output_column in output_columns:
        buckets_stats[output_column] = {}

        # Group validation rows by which bucket their output value falls in.
        bucket_indexes = {}
        for index, row in full_dataset.iterrows():
            value = row[output_column]
            if 'percentage_buckets' in stats[output_column]:
                percentage_buckets = stats[output_column]['percentage_buckets']
            else:
                percentage_buckets = None

            value_bucket = get_value_bucket(value, percentage_buckets, stats[output_column])
            if value_bucket not in bucket_indexes:
                bucket_indexes[value_bucket] = []
            bucket_indexes[value_bucket].append(index)

        # Compute stats for the sub-dataframe of each bucket (best effort).
        for bucket in bucket_indexes:
            buckets_stats[output_column][bucket] = {}
            input_data = TransactionData()
            input_data.data_frame = full_dataset.loc[bucket_indexes[bucket]]
            input_data.columns = input_data.data_frame.columns

            stats_generator = StatsGenerator(session=None, transaction=self.transaction)
            try:
                col_buckets_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False)
                buckets_stats[output_column][bucket].update(col_buckets_stats)
            except Exception:
                print('Could not generate bucket stats for sub-bucket: {}'.format(bucket))

    return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution
def get_column_importance(self, model, output_columns, input_columns, full_dataset, stats):
    """
    Estimate each input column's importance by comparing prediction accuracy
    with only that column present versus with only that column missing.

    Also builds prediction-value histograms (all columns present, and per
    missing column) and, for columns of extreme importance, per-bucket stats.

    :param model: backend exposing `predict('validate', ignore_columns)`
    :param output_columns: columns the model predicts
    :param input_columns: columns the model consumes
    :param full_dataset: validation dataset (indexable by column name)
    :param stats: per-column statistics (data types, percentage buckets, ...)
    :return: (column_importance_dict, buckets_stats,
              columnless_prediction_distribution, all_columns_prediction_distribution)
    """
    columnless_prediction_distribution = {}
    all_columns_prediction_distribution = {}

    self.normal_predictions = model.predict('validate')
    normal_accuracy = evaluate_accuracy(self.normal_predictions, full_dataset, stats, output_columns)
    column_importance_dict = {}
    buckets_stats = {}

    # Histogram for when all columns are present, in order to plot the force vectors
    for output_column in output_columns:
        stats_generator = StatsGenerator(session=None, transaction=self.transaction)
        input_data = TransactionData()
        # Wrap each predicted value in a single-element row for the stats generator.
        input_data.data_array = list(map(lambda x: [x], list(self.normal_predictions[output_column])))
        input_data.columns = [output_column]
        validation_set_output_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False)

        if validation_set_output_stats is None:
            pass
        elif 'histogram' in validation_set_output_stats[output_column]:
            all_columns_prediction_distribution[output_column] = validation_set_output_stats[output_column]['histogram']

    # File-path columns cannot be dropped from the input, so skip them.
    ignorable_input_columns = []
    for input_column in input_columns:
        if stats[input_column]['data_type'] != DATA_TYPES.FILE_PATH:
            ignorable_input_columns.append(input_column)

    for input_column in ignorable_input_columns:
        # See what happens with the accuracy of the outputs if only this column is present
        ignore_columns = [col for col in ignorable_input_columns if col != input_column]
        col_only_predictions = model.predict('validate', ignore_columns)
        col_only_accuracy = evaluate_accuracy(col_only_predictions, full_dataset, stats, output_columns)
        col_only_normalized_accuracy = col_only_accuracy / normal_accuracy

        # See what happens with the accuracy if all columns but this one are present
        ignore_columns = [input_column]
        col_missing_predictions = model.predict('validate', ignore_columns)
        col_missing_accuracy = evaluate_accuracy(col_missing_predictions, full_dataset, stats, output_columns)
        col_missing_reverse_accuracy = (normal_accuracy - col_missing_accuracy) / normal_accuracy

        # Importance = mean of "accuracy with this column alone" and
        # "accuracy lost when this column is missing", both normalized.
        column_importance = (col_only_normalized_accuracy + col_missing_reverse_accuracy) / 2
        column_importance_dict[input_column] = column_importance

        # Histogram for when the column is missing, in order to plot the force vectors
        for output_column in output_columns:
            if output_column not in columnless_prediction_distribution:
                columnless_prediction_distribution[output_column] = {}
            stats_generator = StatsGenerator(session=None, transaction=self.transaction)
            input_data = TransactionData()
            input_data.data_array = list(map(lambda x: [x], list(col_missing_predictions[output_column])))
            input_data.columns = [output_column]
            col_missing_output_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False)

            if col_missing_output_stats is None:
                pass
            elif 'histogram' in col_missing_output_stats[output_column]:
                columnless_prediction_distribution[output_column][input_column] = col_missing_output_stats[output_column]['histogram']

        # If this column is either very important or not important at all,
        # compute stats for each of the buckets (in the validation data)
        if column_importance > 0.8 or column_importance < 0.2:
            # Split this column's values by the bucket they fall into.
            split_data = {}
            for value in full_dataset[input_column]:
                if 'percentage_buckets' in stats[input_column]:
                    bucket = stats[input_column]['percentage_buckets']
                else:
                    bucket = None

                vb = get_value_bucket(value, bucket, stats[input_column])
                if f'{input_column}_bucket_{vb}' not in split_data:
                    split_data[f'{input_column}_bucket_{vb}'] = []
                split_data[f'{input_column}_bucket_{vb}'].append(value)

            # Transpose the per-bucket lists into row-wise data, padding
            # shorter buckets with None.
            row_wise_data = []
            max_length = max(list(map(len, split_data.values())))
            columns = []
            for i in range(max_length):
                row_wise_data.append([])
                for k in split_data.keys():
                    # If the sub bucket has less than 6 values, it's not relevant
                    if len(split_data[k]) > 6:
                        # NOTE(review): `columns.append(k)` executes once per row,
                        # so each qualifying bucket name is appended `max_length`
                        # times — `columns` ends up with duplicates. Looks like it
                        # was meant to run once per key; confirm downstream usage.
                        columns.append(k)
                        if len(split_data[k]) > i:
                            row_wise_data[-1].append(split_data[k][i])
                        else:
                            row_wise_data[-1].append(None)

            input_data = TransactionData()
            input_data.data_array = row_wise_data
            input_data.columns = columns
            stats_generator = StatsGenerator(session=None, transaction=self.transaction)
            col_buckets_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False)
            buckets_stats.update(col_buckets_stats)

    return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution
def get_column_importance(self, model, output_columns, input_columns, full_dataset, stats):
    """
    Estimate each input column's importance (scaled 1..10) by comparing
    prediction accuracy with only that column present versus with only that
    column missing, and collect histograms / per-bucket stats used to plot
    force vectors.

    :param model: backend exposing `predict('validate', ignore_columns)`
    :param output_columns: columns the model predicts
    :param input_columns: columns the model consumes
    :param full_dataset: validation dataframe
    :param stats: per-column statistics (data types/subtypes, percentage buckets, ...)
    :return: (column_importance_dict, buckets_stats,
              columnless_prediction_distribution, all_columns_prediction_distribution)
    """
    columnless_prediction_distribution = {}
    all_columns_prediction_distribution = {}

    with disable_console_output(True):
        normal_predictions = model.predict('validate')
    normal_accuracy = evaluate_accuracy(normal_predictions, full_dataset, stats, output_columns)
    column_importance_dict = {}
    buckets_stats = {}

    # Histogram for when all columns are present, in order to plot the force vectors
    for output_column in output_columns:
        # @TODO: Running stats generator just to get the histogram is very inefficient, change this
        validation_set_output_column_histogram, _ = StatsGenerator.get_histogram(
            normal_predictions[output_column],
            data_type=stats[output_column]['data_type'],
            data_subtype=stats[output_column]['data_subtype']
        )

        if validation_set_output_column_histogram is not None:
            all_columns_prediction_distribution[output_column] = validation_set_output_column_histogram

    # Columns we can drop one at a time: file-path columns and the
    # order-by columns cannot be ignored by the backend.
    ignorable_input_columns = []
    for input_column in input_columns:
        if stats[input_column]['data_type'] != DATA_TYPES.FILE_PATH and input_column not in [x[0] for x in self.transaction.lmd['model_order_by']]:
            ignorable_input_columns.append(input_column)

    for input_column in ignorable_input_columns:
        # See what happens with the accuracy of the outputs if only this column is present
        ignore_columns = [col for col in ignorable_input_columns if col != input_column]
        with disable_console_output(True):
            col_only_predictions = model.predict('validate', ignore_columns)
        col_only_accuracy = evaluate_accuracy(col_only_predictions, full_dataset, stats, output_columns)

        # See what happens with the accuracy if all columns but this one are present
        ignore_columns = [input_column]
        with disable_console_output(True):
            col_missing_predictions = model.predict('validate', ignore_columns)
        col_missing_accuracy = evaluate_accuracy(col_missing_predictions, full_dataset, stats, output_columns)

        # Blend "accuracy lost when missing" with "accuracy when alone",
        # clamp into [0, normal_accuracy], then scale to the range [1, 10].
        combined_column_accuracy = ((normal_accuracy - col_missing_accuracy) + col_only_accuracy) / 2
        if combined_column_accuracy < 0:
            combined_column_accuracy = 0
        column_importance = 10 * (1 - (normal_accuracy - combined_column_accuracy) / normal_accuracy)
        if column_importance < 1:
            column_importance = 1
        column_importance_dict[input_column] = column_importance

        # Histogram for when the column is missing, in order to plot the force vectors
        for output_column in output_columns:
            if output_column not in columnless_prediction_distribution:
                columnless_prediction_distribution[output_column] = {}
            col_missing_output_histogram, _ = StatsGenerator.get_histogram(
                col_missing_predictions[output_column],
                data_type=stats[output_column]['data_type'],
                data_subtype=stats[output_column]['data_subtype']
            )

            if col_missing_output_histogram is not None:
                columnless_prediction_distribution[output_column][input_column] = col_missing_output_histogram

    # @TODO: should we go back to generating this information based on the buckets
    # of the input columns, or keep doing the stats generation for the input
    # columns based on the indexes of the buckets for the output column?
    for output_column in output_columns:
        buckets_stats[output_column] = {}

        # Group validation rows by which bucket their output value falls in.
        bucket_indexes = {}
        for index, row in full_dataset.iterrows():
            value = row[output_column]
            if 'percentage_buckets' in stats[output_column]:
                percentage_buckets = stats[output_column]['percentage_buckets']
            else:
                percentage_buckets = None

            value_bucket = get_value_bucket(value, percentage_buckets, stats[output_column], self.transaction.hmd)
            if value_bucket not in bucket_indexes:
                bucket_indexes[value_bucket] = []
            bucket_indexes[value_bucket].append(index)

        # Compute stats for the sub-dataframe of each bucket (best effort).
        for bucket in bucket_indexes:
            buckets_stats[output_column][bucket] = {}
            input_data = TransactionData()
            input_data.data_frame = full_dataset.loc[bucket_indexes[bucket]]
            # Bug fix: was `input_data.columns = input_data.columns` (a no-op
            # self-assignment); take the columns from the sliced dataframe.
            input_data.columns = input_data.data_frame.columns

            stats_generator = StatsGenerator(session=None, transaction=self.transaction)
            try:
                with disable_console_output():
                    col_buckets_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False, print_logs=False)
                buckets_stats[output_column][bucket].update(col_buckets_stats)
            except Exception:
                # Best effort: bucket stats are optional analysis, skip on failure.
                pass

    return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution