def test_matrix_filter_binary_classification(self):
    X_train, y_train, X_test, y_test, _ = \
        create_binary_classification_dataset()
    feature_names = list(X_train.columns)
    model_task = ModelTask.CLASSIFICATION
    run_error_analyzer_on_models(X_train, y_train, X_test, y_test,
                                 feature_names, model_task)
def test_importances_binary_classification(self):
    X_train, y_train, X_test, y_test, _ = \
        create_binary_classification_dataset()
    feature_names = list(X_train.columns)
    models = create_models_classification(X_train, y_train)

    for model in models:
        categorical_features = []
        run_error_analyzer(model, X_test, y_test, feature_names,
                           categorical_features)
def test_matrix_filter_binary_classification(self):
    x_train, y_train, x_test, y_test, _ = \
        create_binary_classification_dataset()
    feature_names = list(x_train.columns)
    models = create_models(x_train, y_train)

    for model in models:
        categorical_features = []
        run_error_analyzer(model, x_test, y_test, feature_names,
                           categorical_features)
def test_raianalyzer_binary(self):
    x_train, y_train, x_test, y_test, classes = \
        create_binary_classification_dataset()
    x_train = pd.DataFrame(x_train)
    x_test = pd.DataFrame(x_test)
    models = create_models(x_train, y_train)
    x_train[LABELS] = y_train
    x_test[LABELS] = y_test

    for model in models:
        run_raianalyzer(model, x_train, x_test, LABELS, classes)
def test_explain_model_binary_classification_with_different_format_predictions(
        self, mimic_explainer, if_predictions_as_dataframe, explainable_model):
    x_train, y_train, x_test, y_test, classes = \
        create_binary_classification_dataset()
    # fit once; the chained .fit() already trains the model
    model = LogisticRegression(random_state=42).fit(x_train, y_train)
    model = PredictAsDataFrameClassificationTestModel(
        model, return_predictions_as_dataframe=if_predictions_as_dataframe)
    kwargs = {}
    explainer = mimic_explainer(model, x_train, explainable_model, **kwargs)
    global_explanation = explainer.explain_global(evaluation_examples=x_test)
    assert global_explanation is not None
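# Hedged sketch (not the actual PredictAsDataFrameClassificationTestModel
# helper): a wrapper in the same spirit, showing how a fitted classifier's
# predictions could optionally be returned as pandas DataFrames so the
# explainer's handling of non-ndarray outputs can be exercised. The class
# name and behavior below are assumptions for illustration only.
import numpy as np
import pandas as pd


class DataFramePredictionWrapper:
    """Wraps a fitted classifier and optionally returns DataFrame outputs."""

    def __init__(self, model, return_predictions_as_dataframe=True):
        self.model = model
        self.return_predictions_as_dataframe = return_predictions_as_dataframe

    def predict(self, X):
        preds = self.model.predict(X)
        if self.return_predictions_as_dataframe:
            return pd.DataFrame(preds)
        return np.array(preds)

    def predict_proba(self, X):
        probs = self.model.predict_proba(X)
        if self.return_predictions_as_dataframe:
            return pd.DataFrame(probs)
        return np.array(probs)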
def test_large_data_surrogate_error_tree(self):
    # validate tree trains quickly for large data
    X_train, y_train, X_test, y_test, _ = \
        create_binary_classification_dataset(100)
    feature_names = list(X_train.columns)
    model = create_sklearn_random_forest_regressor(X_train, y_train)
    X_test, y_test = replicate_dataset(X_test, y_test)
    assert X_test.shape[0] > 1000000
    t0 = time.time()
    categorical_features = []
    model_analyzer = ModelAnalyzer(model, X_test, y_test,
                                   feature_names,
                                   categorical_features)
    max_depth = 3
    num_leaves = 31
    min_child_samples = 20
    categories_reindexed = []
    cat_ind_reindexed = []
    diff = model_analyzer.get_diff()
    surrogate = create_surrogate_model(model_analyzer,
                                       X_test,
                                       diff,
                                       max_depth,
                                       num_leaves,
                                       min_child_samples,
                                       cat_ind_reindexed)
    t1 = time.time()
    execution_time = t1 - t0
    print("creating surrogate model took {} seconds".format(execution_time))
    # assert we don't take too long to train the tree on 1 million rows
    # note we train on >1 million rows in ~1 second
    assert execution_time < 20
    model_json = surrogate._Booster.dump_model()
    tree_structure = model_json["tree_info"][0]['tree_structure']
    max_split_index = get_max_split_index(tree_structure) + 1
    assert max_split_index == 3
    cache_subtree_features(tree_structure, feature_names)
    pred_y = model_analyzer.model.predict(X_test)
    traversed_X_test = X_test.copy()
    traversed_X_test[DIFF] = diff
    traversed_X_test[TRUE_Y] = y_test
    traversed_X_test[PRED_Y] = pred_y
    t2 = time.time()
    tree = traverse(traversed_X_test,
                    tree_structure,
                    max_split_index,
                    (categories_reindexed, cat_ind_reindexed),
                    [],
                    feature_names,
                    metric=model_analyzer.metric,
                    classes=model_analyzer.classes)
    t3 = time.time()
    execution_time = t3 - t2
    print("traversing tree took {} seconds".format(execution_time))
    assert tree is not None
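# Hedged sketch (illustrative only, not the shared replicate_dataset test
# utility): one way a helper could tile a small test set until it exceeds one
# million rows, which the timing assertions above rely on. The function name,
# signature, and threshold are assumptions for illustration.
import pandas as pd


def replicate_dataset_sketch(X_test, y_test, min_rows=1000000):
    """Repeatedly concatenate the test set with itself until it has more than min_rows rows."""
    X_big = X_test.copy()
    y_big = pd.Series(y_test).copy()
    while X_big.shape[0] <= min_rows:
        X_big = pd.concat([X_big, X_big], ignore_index=True)
        y_big = pd.concat([y_big, y_big], ignore_index=True)
    return X_big, y_big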
def test_model_analysis_binary(self, manager_type):
    x_train, y_train, x_test, y_test, classes = \
        create_binary_classification_dataset()
    x_train = pd.DataFrame(x_train)
    x_test = pd.DataFrame(x_test)
    models = create_models_classification(x_train, y_train)
    x_train[LABELS] = y_train
    x_test[LABELS] = y_test
    manager_args = None

    for model in models:
        run_model_analysis(model, x_train, x_test, LABELS, [],
                           manager_type, manager_args, classes=classes)
def test_large_data_importances(self):
    # mutual information can be very costly for large number of rows,
    # hence assert we downsample to compute importances for large data
    X_train, y_train, X_test, y_test, _ = \
        create_binary_classification_dataset(100)
    feature_names = list(X_train.columns)
    model = create_sklearn_random_forest_regressor(X_train, y_train)
    X_test, y_test = replicate_dataset(X_test, y_test)
    assert X_test.shape[0] > 1000000
    t0 = time.time()
    categorical_features = []
    model_analyzer = ModelAnalyzer(model, X_test, y_test,
                                   feature_names,
                                   categorical_features)
    model_analyzer.compute_importances()
    t1 = time.time()
    execution_time = t1 - t0
    print(execution_time)
    # assert we don't take too long and downsample the dataset
    # note execution time is in seconds
    assert execution_time < 20
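# Hedged sketch (not the ModelAnalyzer internals): mutual information scales
# poorly with row count, so importances for very large datasets can be
# computed on a random subsample. This standalone example uses
# sklearn.feature_selection.mutual_info_classif on a capped sample; the
# function name, sample size, and sampling strategy are assumptions for
# illustration.
import numpy as np
from sklearn.feature_selection import mutual_info_classif


def sampled_mutual_info_importances(X, y, max_rows=10000, random_state=0):
    """Compute mutual-information feature importances on at most max_rows rows."""
    rng = np.random.default_rng(random_state)
    if X.shape[0] > max_rows:
        indices = rng.choice(X.shape[0], size=max_rows, replace=False)
        X = X.iloc[indices]
        y = np.asarray(y)[indices]
    return mutual_info_classif(X, y, random_state=random_state)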