Example #1
    def audit(self, verbose=False):

        features_to_audit = [
            h for i, h in enumerate(self.headers)
            if i not in self.features_to_ignore
        ] if self.features_to_audit is None else self.features_to_audit

        output_files = []
        for i, feature in enumerate(features_to_audit):
            message = "Auditing: '{}' ({}/{}).".format(feature, i + 1,
                                                       len(features_to_audit))
            vprint(message, verbose)

            cleaned_feature_name = feature.replace(".", "_").replace(" ", "_")
            output_file = "{}.audit".format(cleaned_feature_name)
            full_filepath = self.OUTPUT_DIR + "/" + output_file
            output_files.append(full_filepath)

            self.audit_feature(feature, full_filepath)

        audit_msg1 = "Audit file dump set to {}".format(self.dump_all)
        audit_msg2 = "All audit files have been saved." if self.dump_all else "Only minimal audit files have been saved."
        print("{}: {}".format(audit_msg1, audit_msg2))
        print("Audit files dumped to: {}.\n".format(self.OUTPUT_DIR))

        return output_files
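
Both this method and the ones below call a `vprint` helper that only prints when verbose output was requested. Its implementation is not shown in these examples; the following is a minimal sketch consistent with how it is called here (`vprint(message, verbose)`).

def vprint(message, verbose):
    # Print the message only when the caller asked for verbose output.
    if verbose:
        print(message)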
Example #2
    def __call__(self,
                 data,
                 output_dir=None,
                 dump_all=False,
                 features_to_audit=None):
        start_time = datetime.now()

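        # `data` is expected to be a 6-tuple: column headers, training rows,
        # test rows, the name of the response column, the features to skip
        # when auditing, and the expected column types.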
        headers, train_set, test_set, response_header, features_to_ignore, correct_types = data

        self._audits_data = {
            "headers": headers,
            "train": train_set,
            "test": test_set,
            "response": response_header,
            "ignore": features_to_ignore,
            "types": correct_types,
            "full_audit": True if features_to_audit is None else False
        }

        if self.trained_model is None:
            """
       ModelFactories require a `build` method that accepts some training data
       with which to train a brand new model. This `build` method should output
       a Model object that has a `test` method -- which, when given test data
       in the same format as the training data, yields a confusion table detailing
       the correct and incorrect predictions of the model.
      """

            all_data = train_set + test_set
            model_factory = self.ModelFactory(
                all_data,
                headers,
                response_header,
                features_to_ignore=features_to_ignore,
                options=self.model_options)

        if self.trained_model is not None:
            model_or_factory = self.trained_model
        elif not self.RETRAIN_MODEL_PER_REPAIR:
            vprint("Training initial model.", self.verbose)
            model = model_factory.build(train_set)

            # Check the quality of the initial model on verbose runs.
            if self.verbose:
                print("Calculating original model statistics on test data:")
                print("\tTraining Set:")
                train_pred_tuples = model.test(train_set)
                train_conf_matrix = get_conf_matrix(train_pred_tuples)
                print("\t\tConf-Matrix:", train_conf_matrix)
                for measurer in self.measurers:
                    print("\t\t{}: {}".format(measurer.__name__,
                                              measurer(train_conf_matrix)))

                print("\tTesting Set:")
                test_pred_tuples = model.test(test_set)
                test_conf_matrix = get_conf_matrix(test_pred_tuples)
                print("\t\tConf-Matrix", test_conf_matrix)
                for measurer in self.measurers:
                    print("\t\t{}: {}".format(measurer.__name__,
                                              measurer(test_conf_matrix)))

            model_or_factory = model
        else:
            model_or_factory = model_factory

        # Translate the headers into indexes for the auditor.
        audit_indices_to_ignore = [
            headers.index(f) for f in features_to_ignore
        ]

        # Don't audit the response feature.
        audit_indices_to_ignore.append(headers.index(response_header))

        # Prepare the auditor.
        auditor = GradientFeatureAuditor(
            model_or_factory,
            headers,
            train_set,
            test_set,
            repair_steps=self.REPAIR_STEPS,
            kdd=self.kdd,
            features_to_ignore=audit_indices_to_ignore,
            features_to_audit=features_to_audit,
            output_dir=output_dir,
            dump_all=dump_all)

        # Perform the Gradient Feature Audit and dump the audit results into files.
        audit_filenames = auditor.audit(verbose=self.verbose)

        # Retrieve repaired data from audit
        self._audits_data["rep_test"] = auditor._rep_test

        ranked_features = []
        for measurer in self.measurers:
            vprint("Ranking audit files by {}.".format(measurer.__name__),
                   self.verbose)
            #ranked_graph_filename = "{}/{}.png".format(auditor.OUTPUT_DIR, measurer.__name__)
            ranks = rank_audit_files(audit_filenames, measurer)
            vprint("\t{}".format(ranks), self.verbose)
            ranked_features.append((measurer, ranks))

        end_time = datetime.now()

        # Store a summary of this experiment.
        model_id = model_factory.factory_name if self.trained_model is None else "Pretrained"
        model_name = model_factory.verbose_factory_name if self.trained_model is None else "Pretrained"
        summary = [
            "Audit Start Time: {}".format(start_time),
            "Audit End Time: {}".format(end_time),
            "Retrained Per Repair: {}".format(self.RETRAIN_MODEL_PER_REPAIR),
            "Model Factory ID: {}".format(model_id),
            "Model Type: {}".format(model_name),
            "Non-standard Model Options: {}".format(self.model_options),
            "Train Size: {}".format(len(train_set)),
            "Test Size: {}".format(len(test_set)),
            "Non-standard Ignored Features: {}".format(features_to_ignore),
            "Features: {}\n".format(headers)
        ]

        # Print summary
        for line in summary:
            print(line)

        for ranker, ranks in ranked_features:
            print("Ranked Features by {}: {}".format(ranker.__name__, ranks))
            groups = group_audit_ranks(audit_filenames, ranker)
            print("\tApprox. Trend Groups: {}\n".format(groups))

            if ranker.__name__ == "accuracy":
                self._audits_data["ranks"] = ranks

        # Dump all experiment results if opted
        if dump_all:
            vprint("Dumping original training data.", self.verbose)
            # Dump the train data to the log.
            train_dump = "{}/original_train_data".format(auditor.OUTPUT_DIR)
            with open(train_dump + ".csv", "w") as f:
                writer = csv.writer(f)
                writer.writerow(headers)
                for row in train_set:
                    writer.writerow(row)

            if self.WRITE_ORIGINAL_PREDICTIONS:
                # Dump the model's predictions on the training data.
                with open(train_dump + ".predictions", "w") as f:
                    writer = csv.writer(f)
                    file_headers = ["Response", "Prediction"]
                    writer.writerow(file_headers)
                    for response, guess in train_pred_tuples:
                        writer.writerow([response, guess])

            vprint("Dumping original testing data.", self.verbose)
            # Dump the test data to the log.
            test_dump = "{}/original_test_data".format(auditor.OUTPUT_DIR)
            with open(test_dump + ".csv", "w") as f:
                writer = csv.writer(f)
                writer.writerow(headers)
                for row in test_set:
                    writer.writerow(row)

            if self.WRITE_ORIGINAL_PREDICTIONS:
                # Dump the predictions on the test data.
                with open(test_dump + ".predictions", "w") as f:
                    writer = csv.writer(f)
                    file_headers = ["Response", "Prediction"]
                    writer.writerow(file_headers)
                    for response, guess in test_pred_tuples:
                        writer.writerow([response, guess])

            # Graph the audit files.
            vprint("Graphing audit files.", self.verbose)
            for audit_filename in audit_filenames:
                audit_image_filename = audit_filename + ".png"
                graph_audit(audit_filename, self.measurers,
                            audit_image_filename)

            # Store a graph of how many predictions change as features are repaired.
            vprint("Graphing prediction changes throughout repair.",
                   self.verbose)
            output_image = auditor.OUTPUT_DIR + "/similarity_to_original_predictions.png"
            graph_prediction_consistency(auditor.OUTPUT_DIR, output_image)

        for measurer in self.measurers:
            ranked_graph_filename = "{}/{}.png".format(auditor.OUTPUT_DIR,
                                                       measurer.__name__)
            graph_audits(audit_filenames, measurer, ranked_graph_filename)

        # Store a summary of this experiment to file.
        summary_file = "{}/summary.txt".format(auditor.OUTPUT_DIR)
        with open(summary_file, "w") as f:
            for line in summary:
                f.write(line + '\n')

            for ranker, ranks in ranked_features:
                f.write("Ranked Features by {}: {}\n".format(
                    ranker.__name__, ranks))
                groups = group_audit_ranks(audit_filenames, ranker)
                f.write("\tApprox. Trend Groups: {}\n".format(groups))

        vprint("Summary file written to: {}\n".format(summary_file),
               self.verbose)
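
The inline docstring in Example #2 (and the `train` docstring in Example #3) describes the interface a `ModelFactory` is expected to satisfy: a `build` method that trains a brand-new model on the given rows and returns a Model object whose `test` method yields (response, prediction) pairs for data in the same row format, which `get_conf_matrix` then turns into a confusion table. The sketch below is illustrative only; the class names `ConstantModelFactory` and `ConstantModel` are hypothetical and not part of the repository, while `factory_name` and `verbose_factory_name` mirror the attributes referenced in Example #2.

class ConstantModel:
    # Hypothetical Model: always predicts the majority response seen at build time.
    def __init__(self, prediction, response_index):
        self.prediction = prediction
        self.response_index = response_index

    def test(self, test_set):
        # Return (actual response, predicted response) pairs, the format
        # consumed by get_conf_matrix in the examples above.
        return [(row[self.response_index], self.prediction) for row in test_set]

class ConstantModelFactory:
    factory_name = "constant"
    verbose_factory_name = "Constant Majority-Class Baseline"

    def __init__(self, all_data, headers, response_header,
                 features_to_ignore=None, options=None):
        self.response_index = headers.index(response_header)

    def build(self, train_set):
        # Train a "model" by memorizing the most common response value.
        responses = [row[self.response_index] for row in train_set]
        majority = max(set(responses), key=responses.count)
        return ConstantModel(majority, self.response_index)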
Example #3
    def train(self,
              train_set,
              test_set,
              headers,
              response_header,
              features_to_ignore=[]):
        """
    A method to train a model using model factories. 
    ModelFactories require a `build` method that accepts some training data
    with which to train a brand new model. This `build` method should output
    a Model object that has a `test` method -- which, when given test data
    in the same format as the training data, yields a confusion table detailing
    the correct and incorrect predictions of the model.
    
    Parameters
    ----------
    train_set, test_set : list of lists or numpy.array with dimensions (# of samples) * (# of features).
      Data for training the model and testing the model.

    headers : list of strings
      The headers of the data.

    response_header : string
      The response header of the data.

    features_to_ignore : list of strings (default = [])
      The features we want to ignore.

    """

        all_data = train_set + test_set
        model_factory = self.ModelFactory(
            all_data,
            headers,
            response_header,
            features_to_ignore=features_to_ignore,
            options=self.model_options)

        vprint("Training initial model.", self.verbose)
        model = model_factory.build(train_set)

        # Check the quality of the initial model on verbose runs.
        if self.verbose:
            print("Calculating original model statistics on test data:")
            print("\tTraining Set:")
            train_pred_tuples = model.test(train_set)
            train_conf_matrix = get_conf_matrix(train_pred_tuples)
            print("\t\tConf-Matrix:", train_conf_matrix)
            for measurer in self.measurers:
                print("\t\t{}: {}".format(measurer.__name__,
                                          measurer(train_conf_matrix)))

            print("\tTesting Set:")
            test_pred_tuples = model.test(test_set)
            test_conf_matrix = get_conf_matrix(test_pred_tuples)
            print("\t\tConf-Matrix", test_conf_matrix)
            for measurer in self.measurers:
                print("\t\t{}: {}".format(measurer.__name__,
                                          measurer(test_conf_matrix)))

        return model
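
As a usage sketch, assuming a small made-up dataset: the column names, values, and the `auditor` object holding the `train` method are all hypothetical and only illustrate the expected call shape.

# Hypothetical toy data; the last column is the response.
headers = ["age", "income", "approved"]
train_set = [[25, 40000, "yes"], [52, 30000, "no"], [38, 60000, "yes"]]
test_set = [[41, 35000, "no"], [29, 52000, "yes"]]

# `auditor` stands in for whatever object exposes the train() method above.
model = auditor.train(train_set, test_set, headers,
                      response_header="approved",
                      features_to_ignore=[])
print(get_conf_matrix(model.test(test_set)))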