def test_fast_permutation_importance_matches_sklearn_output(
        mock_supports_fast_importance, pipeline_class, parameters,
        has_minimal_dependencies):
    """The fast permutation-importance path must produce exactly the same
    frame as the slow (sklearn-backed) path for the same seed."""
    if has_minimal_dependencies and pipeline_class == LinearPipelineWithTargetEncoderAndOHE:
        pytest.skip(
            "Skipping test_fast_permutation_importance_matches_sklearn_output for target encoder cause "
            "dependency not installed.")

    X, y = load_fraud(100)
    if pipeline_class == LinearPipelineWithTextFeatures:
        X = X.set_types(logical_types={'provider': 'NaturalLanguage'})

    # Do this to make sure we use the same int as sklearn under the hood
    rng = np.random.RandomState(0)
    sklearn_seed = rng.randint(np.iinfo(np.int32).max + 1)

    parameters['Random Forest Classifier'] = {'n_jobs': 1}
    pipeline = pipeline_class(parameters=parameters)
    pipeline.fit(X, y)

    mock_supports_fast_importance.return_value = True
    fast_scores = calculate_permutation_importance(
        pipeline, X, y, objective='Log Loss Binary', random_seed=sklearn_seed)

    mock_supports_fast_importance.return_value = False
    slow_scores = calculate_permutation_importance(
        pipeline, X, y, objective='Log Loss Binary', random_seed=0)

    pd.testing.assert_frame_equal(fast_scores, slow_scores)
def test_partial_dependence_more_categories_than_grid_resolution(logistic_regression_binary_pipeline_class):
    # NOTE(review): a function with this exact name also appears later in the
    # file with different expected values; if both are in one module, the
    # later definition shadows this one — confirm that is intentional.
    """Categorical partial dependence should be identical whether
    grid_resolution is below, equal to, or above the category count."""
    def round_dict_keys(dictionary, places=6):
        """Round all (float) keys of a dictionary to ``places`` decimals."""
        return {round(key, places): value for key, value in dictionary.items()}

    X, y = load_fraud(1000)
    X = X.drop(columns=['datetime', 'expiration_date', 'country', 'region', 'provider'])

    pipeline = logistic_regression_binary_pipeline_class({})
    pipeline.fit(X, y)

    num_cat_features = len(set(X["currency"].to_series()))
    assert num_cat_features == 164

    part_dep_ans = {0.1432616813857269: 154, 0.1502346349971562: 1,
                    0.14487916687594762: 1, 0.1573183451314127: 1,
                    0.11695462432136654: 1, 0.07950579532536253: 1,
                    0.006794444792966759: 1, 0.17745270478939879: 1,
                    0.1666874487986626: 1, 0.13357573073236878: 1,
                    0.06778096366056789: 1}
    expected = round_dict_keys(part_dep_ans)

    # Check resolutions below, equal to, and above the number of categories.
    for resolution in (round(num_cat_features / 2),
                       round(num_cat_features),
                       round(num_cat_features * 2)):
        part_dep = partial_dependence(pipeline, X, 'currency', grid_resolution=resolution)
        observed = dict(part_dep["partial_dependence"].value_counts())
        assert expected == round_dict_keys(observed)
def test_fraud():
    """load_fraud returns woodwork structures by default, supports row
    sampling, and returns pandas objects when return_pandas=True."""
    full_X, full_y = demos.load_fraud()
    assert full_X.shape == (99992, 12)
    assert full_y.shape == (99992,)
    assert isinstance(full_X, ww.DataTable)
    assert isinstance(full_y, ww.DataColumn)

    sampled_X, sampled_y = demos.load_fraud(1000)
    assert sampled_X.shape == (1000, 12)
    assert sampled_y.shape == (1000,)

    pandas_X, pandas_y = demos.load_fraud(1000, return_pandas=True)
    assert pandas_X.shape == (1000, 12)
    assert pandas_y.shape == (1000,)
    assert isinstance(pandas_X, pd.DataFrame)
    assert isinstance(pandas_y, pd.Series)
def test_partial_dependence_more_categories_than_grid_resolution(
        logistic_regression_binary_pipeline_class):
    # NOTE(review): a function with this exact name also appears earlier in
    # the file with different expected values; if both are in one module,
    # this definition shadows the earlier one — confirm that is intentional.
    """Categorical partial dependence should be identical whether
    grid_resolution is below, equal to, or above the category count."""
    def round_dict_keys(dictionary, places=6):
        """Round all (float) keys of a dictionary to ``places`` decimals."""
        return {round(key, places): value for key, value in dictionary.items()}

    X, y = load_fraud(1000)
    X = X.drop(columns=['datetime', 'expiration_date', 'country', 'region', 'provider'])

    pipeline = logistic_regression_binary_pipeline_class({})
    pipeline.fit(X, y)

    num_cat_features = len(set(X["currency"].to_series()))
    assert num_cat_features == 164

    part_dep_ans = {0.1424060057413758: 154, 0.006837318701999957: 1,
                    0.24445532203317386: 1, 0.15637574440029903: 1,
                    0.11676042311300606: 1, 0.13434069071819482: 1,
                    0.1502609021969637: 1, 0.14486201259150977: 1,
                    0.16687406140200164: 1, 0.06815227785761911: 1,
                    0.0791821060634158: 1}
    expected = round_dict_keys(part_dep_ans)

    # Check resolutions below, equal to, and above the number of categories.
    for resolution in (round(num_cat_features / 2),
                       round(num_cat_features),
                       round(num_cat_features * 2)):
        part_dep = partial_dependence(pipeline, X, 'currency', grid_resolution=resolution)
        observed = dict(part_dep["partial_dependence"].value_counts())
        assert expected == round_dict_keys(observed)
def test_partial_dependence_respect_grid_resolution():
    """grid_resolution limits the grid for numeric features, while
    categorical features keep one grid point per unique category."""
    X, y = load_fraud(1000)
    pipeline = BinaryClassificationPipeline(
        component_graph=["DateTime Featurization Component",
                         "One Hot Encoder",
                         "Random Forest Classifier"])
    pipeline.fit(X, y)

    # Numeric feature: grid has exactly grid_resolution rows.
    numeric_dep = partial_dependence(pipeline, X, features="amount", grid_resolution=20)
    assert numeric_dep.shape[0] == 20
    assert numeric_dep.shape[0] != max(X.select('categorical').describe().loc["nunique"]) + 1

    # Categorical feature: grid has one row per category, ignoring resolution.
    categorical_dep = partial_dependence(pipeline, X, features="provider", grid_resolution=20)
    assert categorical_dep.shape[0] == X['provider'].to_series().nunique()
    assert categorical_dep.shape[0] != max(X.select('categorical').describe().loc["nunique"]) + 1
def fraud_100():
    """Return the first 100 rows of the fraud demo dataset."""
    return load_fraud(n_rows=100)