Example #1
    def test_matrix_filter_binary_classification(self):
        X_train, y_train, X_test, y_test, _ = \
            create_binary_classification_dataset()
        feature_names = list(X_train.columns)
        model_task = ModelTask.CLASSIFICATION
        run_error_analyzer_on_models(X_train, y_train, X_test, y_test,
                                     feature_names, model_task)
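All of these tests share a create_binary_classification_dataset helper from
the repo's test utilities. A minimal sketch of what it presumably does,
assuming it wraps scikit-learn's make_classification and train_test_split
and returns pandas DataFrames plus the class labels (the default row count,
split ratio, and seed here are assumptions, not the repo's values):

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split


def create_binary_classification_dataset(num_rows=100):
    # hypothetical sketch of the shared test helper: build a small
    # synthetic binary classification problem with named columns
    X, y = make_classification(n_samples=num_rows, random_state=777)
    feature_names = ['col' + str(i) for i in range(X.shape[1])]
    X = pd.DataFrame(X, columns=feature_names)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=777)
    classes = np.unique(y_train).tolist()
    return X_train, y_train, X_test, y_test, classes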
Example #2
    def test_importances_binary_classification(self):
        X_train, y_train, X_test, y_test, _ = \
            create_binary_classification_dataset()
        feature_names = list(X_train.columns)
        models = create_models_classification(X_train, y_train)

        for model in models:
            categorical_features = []
            run_error_analyzer(model, X_test, y_test, feature_names,
                               categorical_features)
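run_error_analyzer is another shared helper. A plausible minimal sketch,
assuming it drives the same ModelAnalyzer API used in the large-data tests
below and that compute_importances returns one score per feature (an
assumption; the real helper likely performs additional checks):

def run_error_analyzer(model, X_test, y_test, feature_names,
                       categorical_features):
    # hypothetical sketch: build the analyzer and validate that
    # importances come back with one score per input feature
    model_analyzer = ModelAnalyzer(model, X_test, y_test,
                                   feature_names, categorical_features)
    scores = model_analyzer.compute_importances()
    assert len(scores) == len(feature_names)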
Example #3
    def test_matrix_filter_binary_classification(self):
        x_train, y_train, x_test, y_test, _ = \
            create_binary_classification_dataset()
        feature_names = list(x_train.columns)
        models = create_models(x_train, y_train)

        for model in models:
            categorical_features = []
            run_error_analyzer(model, x_test, y_test, feature_names,
                               categorical_features)
    def test_raianalyzer_binary(self):
        x_train, y_train, x_test, y_test, classes = \
            create_binary_classification_dataset()
        x_train = pd.DataFrame(x_train)
        x_test = pd.DataFrame(x_test)
        models = create_models(x_train, y_train)
        x_train[LABELS] = y_train
        x_test[LABELS] = y_test

        for model in models:
            run_raianalyzer(model, x_train, x_test, LABELS, classes)
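run_raianalyzer is repo-specific and its API is not shown here. As a rough,
hypothetical sketch of the convention the test exercises (the label column
travels inside the train/test frames under the LABELS column name), not the
real helper:

import numpy as np


def run_raianalyzer(model, train_data, test_data, target_column, classes):
    # hypothetical sketch: split features from the in-frame label column,
    # then sanity-check the model's predictions against the known classes
    X_test = test_data.drop(columns=[target_column])
    y_test = test_data[target_column]
    preds = model.predict(X_test)
    assert len(preds) == len(y_test)
    assert set(np.unique(preds)).issubset(set(classes))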
    def test_explain_model_binary_classification_with_different_format_predictions(
            self, mimic_explainer, if_predictions_as_dataframe, explainable_model):
        x_train, y_train, x_test, y_test, classes = create_binary_classification_dataset()
        model = LogisticRegression(random_state=42).fit(x_train, y_train)

        model = PredictAsDataFrameClassificationTestModel(
            model, return_predictions_as_dataframe=if_predictions_as_dataframe)
        kwargs = {}
        explainer = mimic_explainer(model, x_train, explainable_model, **kwargs)
        global_explanation = explainer.explain_global(evaluation_examples=x_test)
        assert global_explanation is not None
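PredictAsDataFrameClassificationTestModel is a test double whose name
suggests it only changes the container type of the predictions. A minimal
sketch, assuming it delegates to the already-fitted classifier and
optionally wraps the outputs in a pandas DataFrame:

import pandas as pd


class PredictAsDataFrameClassificationTestModel(object):
    # hypothetical sketch: wrap a fitted classifier so predictions can be
    # returned as a DataFrame instead of a numpy array, to test that
    # explainers handle both output formats
    def __init__(self, model, return_predictions_as_dataframe=True):
        self.model = model
        self.return_predictions_as_dataframe = return_predictions_as_dataframe

    def predict(self, X):
        preds = self.model.predict(X)
        if self.return_predictions_as_dataframe:
            return pd.DataFrame(preds)
        return preds

    def predict_proba(self, X):
        probs = self.model.predict_proba(X)
        if self.return_predictions_as_dataframe:
            return pd.DataFrame(probs)
        return probs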
    def test_large_data_surrogate_error_tree(self):
        # validate tree trains quickly for large data
        X_train, y_train, X_test, y_test, _ = \
            create_binary_classification_dataset(100)
        feature_names = list(X_train.columns)
        model = create_sklearn_random_forest_regressor(X_train, y_train)
        X_test, y_test = replicate_dataset(X_test, y_test)
        assert X_test.shape[0] > 1000000
        t0 = time.time()
        categorical_features = []
        model_analyzer = ModelAnalyzer(model, X_test, y_test, feature_names,
                                       categorical_features)
        max_depth = 3
        num_leaves = 31
        min_child_samples = 20
        categories_reindexed = []
        cat_ind_reindexed = []
        diff = model_analyzer.get_diff()
        surrogate = create_surrogate_model(model_analyzer, X_test, diff,
                                           max_depth, num_leaves,
                                           min_child_samples,
                                           cat_ind_reindexed)
        t1 = time.time()
        execution_time = t1 - t0
        print(
            "creating surrogate model took {} seconds".format(execution_time))
        # assert we don't take too long to train the tree on 1 million rows
        # note we train on >1 million rows in ~1 second
        assert execution_time < 20
        model_json = surrogate._Booster.dump_model()
        tree_structure = model_json['tree_info'][0]['tree_structure']
        max_split_index = get_max_split_index(tree_structure) + 1
        assert max_split_index == 3
        cache_subtree_features(tree_structure, feature_names)
        pred_y = model_analyzer.model.predict(X_test)
        traversed_X_test = X_test.copy()
        traversed_X_test[DIFF] = diff
        traversed_X_test[TRUE_Y] = y_test
        traversed_X_test[PRED_Y] = pred_y
        t2 = time.time()
        tree = traverse(traversed_X_test,
                        tree_structure,
                        max_split_index,
                        (categories_reindexed, cat_ind_reindexed), [],
                        feature_names,
                        metric=model_analyzer.metric,
                        classes=model_analyzer.classes)
        t3 = time.time()
        execution_time = t3 - t2
        print("traversing tree took {} seconds".format(execution_time))
        assert tree is not None
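get_max_split_index walks the JSON produced by LightGBM's dump_model(). A
minimal sketch, assuming the standard dump format in which internal nodes
carry a split_index with children under left_child/right_child while leaf
nodes have neither:

def get_max_split_index(tree):
    # hypothetical sketch: recursively find the largest split_index in a
    # LightGBM tree_structure dict; leaf nodes contribute 0
    if 'split_index' in tree:
        max_left = get_max_split_index(tree['left_child'])
        max_right = get_max_split_index(tree['right_child'])
        return max(tree['split_index'], max_left, max_right)
    return 0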
    def test_model_analysis_binary(self, manager_type):
        x_train, y_train, x_test, y_test, classes = \
            create_binary_classification_dataset()
        x_train = pd.DataFrame(x_train)
        x_test = pd.DataFrame(x_test)
        models = create_models_classification(x_train, y_train)
        x_train[LABELS] = y_train
        x_test[LABELS] = y_test
        manager_args = None

        for model in models:
            run_model_analysis(model,
                               x_train,
                               x_test,
                               LABELS, [],
                               manager_type,
                               manager_args,
                               classes=classes)
    def test_large_data_importances(self):
        # mutual information can be very costly for a large number of rows,
        # hence assert we downsample to compute importances for large data
        X_train, y_train, X_test, y_test, _ = \
            create_binary_classification_dataset(100)
        feature_names = list(X_train.columns)
        model = create_sklearn_random_forest_regressor(X_train, y_train)
        X_test, y_test = replicate_dataset(X_test, y_test)
        assert X_test.shape[0] > 1000000
        t0 = time.time()
        categorical_features = []
        model_analyzer = ModelAnalyzer(model, X_test, y_test,
                                       feature_names,
                                       categorical_features)
        model_analyzer.compute_importances()
        t1 = time.time()
        execution_time = t1 - t0
        print("computing importances took {} seconds".format(execution_time))
        # assert we don't take too long, which validates that the dataset
        # was downsampled; note execution time is in seconds
        assert execution_time < 20
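Both large-data tests lean on replicate_dataset to inflate the small test
split past one million rows. A minimal sketch, assuming it simply doubles
the data a fixed number of times (the replication count is an assumption
chosen so that a 50-row split exceeds 1,000,000 rows):

import numpy as np
import pandas as pd


def replicate_dataset(X, y, replications=16):
    # hypothetical sketch: repeatedly double the dataset;
    # e.g. 50 rows * 2**16 = 3,276,800 rows > 1,000,000
    for _ in range(replications):
        X = pd.concat([X, X], ignore_index=True)
        y = np.concatenate([y, y])
    return X, y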