def audit(self, verbose=False):
    if self.features_to_audit is None:
        features_to_audit = [h for i, h in enumerate(self.headers)
                             if i not in self.features_to_ignore]
    else:
        features_to_audit = self.features_to_audit

    output_files = []
    for i, feature in enumerate(features_to_audit):
        message = "Auditing: '{}' ({}/{}).".format(feature, i + 1, len(features_to_audit))
        vprint(message, verbose)

        cleaned_feature_name = feature.replace(".", "_").replace(" ", "_")
        output_file = "{}.audit".format(cleaned_feature_name)
        full_filepath = self.OUTPUT_DIR + "/" + output_file
        output_files.append(full_filepath)

        self.audit_feature(feature, full_filepath)

    audit_msg1 = "Audit file dump set to {}".format(self.dump_all)
    audit_msg2 = ("All audit files have been saved." if self.dump_all
                  else "Only minimal audit files have been saved.")
    print("{}: {}".format(audit_msg1, audit_msg2))
    print("Audit files dumped to: {}.\n".format(self.OUTPUT_DIR))

    return output_files
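# A minimal usage sketch for `audit` (illustrative only; the `auditor`
# object below is assumed to be an already-constructed instance):
#
#   output_files = auditor.audit(verbose=True)
#   # output_files holds one "<feature>.audit" path per audited feature,
#   # all under auditor.OUTPUT_DIR. Each file records model performance
#   # at every repair step for that feature, and can later be fed to
#   # rank_audit_files or graph_audit.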
def __call__(self, data, output_dir=None, dump_all=False, features_to_audit=None):
    start_time = datetime.now()

    headers, train_set, test_set, response_header, features_to_ignore, correct_types = data
    self._audits_data = {
        "headers": headers,
        "train": train_set,
        "test": test_set,
        "response": response_header,
        "ignore": features_to_ignore,
        "types": correct_types,
        "full_audit": features_to_audit is None
    }

    if self.trained_model is None:
        # ModelFactories require a `build` method that accepts some training
        # data with which to train a brand new model. This `build` method
        # should output a Model object that has a `test` method -- which,
        # when given test data in the same format as the training data,
        # yields a confusion table detailing the correct and incorrect
        # predictions of the model.
        all_data = train_set + test_set
        model_factory = self.ModelFactory(all_data, headers, response_header,
                                          features_to_ignore=features_to_ignore,
                                          options=self.model_options)

    # Predictions are only computed on verbose runs; initialize them so the
    # `dump_all` block below can check whether they exist.
    train_pred_tuples = None
    test_pred_tuples = None

    if self.trained_model is not None:
        model_or_factory = self.trained_model
    elif not self.RETRAIN_MODEL_PER_REPAIR:
        vprint("Training initial model.", self.verbose)
        model = model_factory.build(train_set)

        # Check the quality of the initial model on verbose runs.
        if self.verbose:
            print("Calculating original model statistics on test data:")
            print("\tTraining Set:")
            train_pred_tuples = model.test(train_set)
            train_conf_matrix = get_conf_matrix(train_pred_tuples)
            print("\t\tConf-Matrix:", train_conf_matrix)
            for measurer in self.measurers:
                print("\t\t{}: {}".format(measurer.__name__, measurer(train_conf_matrix)))
            print("\tTesting Set:")
            test_pred_tuples = model.test(test_set)
            test_conf_matrix = get_conf_matrix(test_pred_tuples)
            print("\t\tConf-Matrix:", test_conf_matrix)
            for measurer in self.measurers:
                print("\t\t{}: {}".format(measurer.__name__, measurer(test_conf_matrix)))

        model_or_factory = model
    else:
        model_or_factory = model_factory

    # Translate the headers into indexes for the auditor.
    audit_indices_to_ignore = [headers.index(f) for f in features_to_ignore]

    # Don't audit the response feature.
    audit_indices_to_ignore.append(headers.index(response_header))

    # Prepare the auditor.
    auditor = GradientFeatureAuditor(model_or_factory, headers, train_set, test_set,
                                     repair_steps=self.REPAIR_STEPS,
                                     kdd=self.kdd,
                                     features_to_ignore=audit_indices_to_ignore,
                                     features_to_audit=features_to_audit,
                                     output_dir=output_dir,
                                     dump_all=dump_all)

    # Perform the Gradient Feature Audit and dump the audit results into files.
    audit_filenames = auditor.audit(verbose=self.verbose)

    # Retrieve the repaired data from the audit.
    self._audits_data["rep_test"] = auditor._rep_test

    ranked_features = []
    for measurer in self.measurers:
        vprint("Ranking audit files by {}.".format(measurer.__name__), self.verbose)
        ranks = rank_audit_files(audit_filenames, measurer)
        vprint("\t{}".format(ranks), self.verbose)
        ranked_features.append((measurer, ranks))

    end_time = datetime.now()

    # Store a summary of this experiment.
    model_id = model_factory.factory_name if self.trained_model is None else "Pretrained"
    model_name = model_factory.verbose_factory_name if self.trained_model is None else "Pretrained"
    summary = [
        "Audit Start Time: {}".format(start_time),
        "Audit End Time: {}".format(end_time),
        "Retrained Per Repair: {}".format(self.RETRAIN_MODEL_PER_REPAIR),
        "Model Factory ID: {}".format(model_id),
        "Model Type: {}".format(model_name),
        "Non-standard Model Options: {}".format(self.model_options),
        "Train Size: {}".format(len(train_set)),
        "Test Size: {}".format(len(test_set)),
        "Non-standard Ignored Features: {}".format(features_to_ignore),
        "Features: {}\n".format(headers)
    ]

    # Print the summary.
    for line in summary:
        print(line)

    for ranker, ranks in ranked_features:
        print("Ranked Features by {}: {}".format(ranker.__name__, ranks))
        groups = group_audit_ranks(audit_filenames, ranker)
        print("\tApprox. Trend Groups: {}\n".format(groups))
        if ranker.__name__ == "accuracy":
            self._audits_data["ranks"] = ranks

    # Dump all experiment results if requested.
    if dump_all:
        vprint("Dumping original training data.", self.verbose)

        # Dump the training data to the log.
        train_dump = "{}/original_train_data".format(auditor.OUTPUT_DIR)
        with open(train_dump + ".csv", "w") as f:
            writer = csv.writer(f)
            writer.writerow(headers)
            for row in train_set:
                writer.writerow(row)

        # Predictions are only available if they were computed above.
        if self.WRITE_ORIGINAL_PREDICTIONS and train_pred_tuples is not None:
            # Dump the predictions on the training data.
            with open(train_dump + ".predictions", "w") as f:
                writer = csv.writer(f)
                file_headers = ["Response", "Prediction"]
                writer.writerow(file_headers)
                for response, guess in train_pred_tuples:
                    writer.writerow([response, guess])

        vprint("Dumping original testing data.", self.verbose)

        # Dump the test data to the log.
        test_dump = "{}/original_test_data".format(auditor.OUTPUT_DIR)
        with open(test_dump + ".csv", "w") as f:
            writer = csv.writer(f)
            writer.writerow(headers)
            for row in test_set:
                writer.writerow(row)

        if self.WRITE_ORIGINAL_PREDICTIONS and test_pred_tuples is not None:
            # Dump the predictions on the test data.
            with open(test_dump + ".predictions", "w") as f:
                writer = csv.writer(f)
                file_headers = ["Response", "Prediction"]
                writer.writerow(file_headers)
                for response, guess in test_pred_tuples:
                    writer.writerow([response, guess])

        # Graph the audit files.
        vprint("Graphing audit files.", self.verbose)
        for audit_filename in audit_filenames:
            audit_image_filename = audit_filename + ".png"
            graph_audit(audit_filename, self.measurers, audit_image_filename)

        # Store a graph of how many predictions change as features are repaired.
        vprint("Graphing prediction changes throughout repair.", self.verbose)
        output_image = auditor.OUTPUT_DIR + "/similarity_to_original_predictions.png"
        graph_prediction_consistency(auditor.OUTPUT_DIR, output_image)

        for measurer in self.measurers:
            ranked_graph_filename = "{}/{}.png".format(auditor.OUTPUT_DIR, measurer.__name__)
            graph_audits(audit_filenames, measurer, ranked_graph_filename)

        # Store a summary of this experiment to file.
        summary_file = "{}/summary.txt".format(auditor.OUTPUT_DIR)
        with open(summary_file, "w") as f:
            for line in summary:
                f.write(line + "\n")
            for ranker, ranks in ranked_features:
                f.write("Ranked Features by {}: {}\n".format(ranker.__name__, ranks))
                groups = group_audit_ranks(audit_filenames, ranker)
                f.write("\tApprox. Trend Groups: {}\n".format(groups))
        vprint("Summary file written to: {}\n".format(summary_file), self.verbose)
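# A minimal sketch of how `__call__` is typically invoked (illustrative only;
# the dataset values and the `Auditor` name below are hypothetical). `data`
# must be a 6-tuple matching the unpacking at the top of `__call__`; the
# format of `correct_types` (assumed here to be one type per column) should
# be checked against the loader that produces `data`:
#
#   headers = ["age", "income", "outcome"]
#   train = [[23, 50000, "yes"], [31, 62000, "no"]]
#   test = [[40, 48000, "no"]]
#   data = (headers, train, test, "outcome", [], [int, int, str])
#
#   auditor = Auditor()           # hypothetical class that owns this __call__
#   auditor(data, dump_all=True)  # runs the full gradient feature audit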
def train(self, train_set, test_set, headers, response_header, features_to_ignore=[]):
    """
    A method to train a model using model factories.

    ModelFactories require a `build` method that accepts some training data
    with which to train a brand new model. This `build` method should output
    a Model object that has a `test` method -- which, when given test data
    in the same format as the training data, yields a confusion table
    detailing the correct and incorrect predictions of the model.

    Parameters
    ----------
    train_set, test_set : list of list or numpy.array
        Data for training and testing the model, with the dimensions
        (# of samples) * (# of features).
    headers : list of strings
        The headers of the data.
    response_header : string
        The response header of the data.
    features_to_ignore : list of strings (default = [])
        The features we want to ignore.
    """
    all_data = train_set + test_set
    model_factory = self.ModelFactory(all_data, headers, response_header,
                                      features_to_ignore=features_to_ignore,
                                      options=self.model_options)
    vprint("Training initial model.", self.verbose)
    model = model_factory.build(train_set)

    # Check the quality of the initial model on verbose runs.
    if self.verbose:
        print("Calculating original model statistics on test data:")
        print("\tTraining Set:")
        train_pred_tuples = model.test(train_set)
        train_conf_matrix = get_conf_matrix(train_pred_tuples)
        print("\t\tConf-Matrix:", train_conf_matrix)
        for measurer in self.measurers:
            print("\t\t{}: {}".format(measurer.__name__, measurer(train_conf_matrix)))
        print("\tTesting Set:")
        test_pred_tuples = model.test(test_set)
        test_conf_matrix = get_conf_matrix(test_pred_tuples)
        print("\t\tConf-Matrix:", test_conf_matrix)
        for measurer in self.measurers:
            print("\t\t{}: {}".format(measurer.__name__, measurer(test_conf_matrix)))

    return model
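# A minimal sketch of the ModelFactory/Model contract described in the
# docstrings above (an assumption for illustration, not a factory shipped
# with this package): `build` trains on rows shaped like the training data,
# and `test` returns (actual, predicted) tuples suitable for get_conf_matrix.
#
#   class MajorityClassModelFactory:
#       factory_name = "majority_class"
#       verbose_factory_name = "Majority-Class Baseline"
#
#       def __init__(self, all_data, headers, response_header,
#                    features_to_ignore=None, options=None):
#           self.response_index = headers.index(response_header)
#
#       def build(self, train_set):
#           # Predict whichever response value is most common in training.
#           responses = [row[self.response_index] for row in train_set]
#           majority = max(set(responses), key=responses.count)
#           return MajorityClassModel(majority, self.response_index)
#
#   class MajorityClassModel:
#       def __init__(self, majority, response_index):
#           self.majority = majority
#           self.response_index = response_index
#
#       def test(self, test_set):
#           # One (actual, predicted) pair per row.
#           return [(row[self.response_index], self.majority) for row in test_set]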