示例#1
0
def test_partial_dependence_baseline():
    """Check that partial dependence is rejected for a Baseline Classifier pipeline.

    Fits a binary pipeline whose only component is "Baseline Classifier" on a
    two-row frame and expects ``partial_dependence`` to raise ``ValueError``
    with the "not supported for Baseline pipelines" message.
    """
    features = pd.DataFrame([[1, 0], [0, 1]])
    target = pd.Series([0, 1])

    baseline_pipeline = BinaryClassificationPipeline(
        component_graph=["Baseline Classifier"],
        parameters={},
    )
    baseline_pipeline.fit(features, target)

    expected_message = "Partial dependence plots are not supported for Baseline pipelines"
    with pytest.raises(ValueError, match=expected_message):
        partial_dependence(baseline_pipeline, features, features=0, grid_resolution=20)
示例#2
0
def test_partial_dependence_respect_grid_resolution():
    """Check the row count of partial dependence results on the fraud dataset.

    For the "amount" feature the result must have exactly ``grid_resolution``
    (20) rows. For the "provider" feature the row count must equal the number
    of distinct provider values. In both cases the row count must differ from
    the largest categorical cardinality plus one.
    """
    X, y = load_fraud(1000)

    def largest_categorical_cardinality_plus_one():
        # Re-evaluated on every call so it is computed at the same points
        # (and under the same conditions) as the inline expression it replaces.
        nunique_per_column = X.select('categorical').describe().loc["nunique"]
        return max(nunique_per_column) + 1

    fraud_pipeline = BinaryClassificationPipeline(
        component_graph=[
            "DateTime Featurization Component",
            "One Hot Encoder",
            "Random Forest Classifier",
        ]
    )
    fraud_pipeline.fit(X, y)

    amount_dep = partial_dependence(fraud_pipeline, X, features="amount", grid_resolution=20)
    amount_rows = amount_dep.shape[0]
    assert amount_rows == 20
    # NOTE(review): presumably guards against the grid size leaking in from
    # the widest categorical column -- confirm the intent with the implementation.
    assert amount_rows != largest_categorical_cardinality_plus_one()

    provider_dep = partial_dependence(fraud_pipeline, X, features="provider", grid_resolution=20)
    provider_rows = provider_dep.shape[0]
    assert provider_rows == X['provider'].to_series().nunique()
    assert provider_rows != largest_categorical_cardinality_plus_one()
示例#3
0
def test_graph_partial_dependence_regression_date_order(X_y_binary):
    """Check that the partial dependence graph lists datetime values in date order.

    A shuffled daily date column is appended to the binary fixture data, a
    binary pipeline is fitted, and the first trace of the resulting figure
    must be a scatter trace whose x values equal the unshuffled date range.

    NOTE(review): the test name says "regression" but the pipeline built here
    is a BinaryClassificationPipeline -- confirm whether the name is stale.
    """
    pytest.importorskip(
        'plotly.graph_objects',
        reason='Skipping plotting test because plotly not installed',
    )

    raw_X, raw_y = X_y_binary
    pipeline = BinaryClassificationPipeline(
        component_graph=[
            'Imputer',
            'One Hot Encoder',
            'DateTime Featurization Component',
            'Standard Scaler',
            'Logistic Regression Classifier',
        ]
    )

    column_names = list(map(str, range(raw_X.shape[1])))
    X = pd.DataFrame(raw_X, columns=column_names)
    y = pd.Series(raw_y)

    # Shuffle the dates so the input order differs from chronological order.
    ordered_dates = pd.Series(pd.date_range('20200101', periods=X.shape[0]))
    shuffled_dates = ordered_dates.sample(frac=1).reset_index(drop=True)
    X['dt_column'] = pd.to_datetime(shuffled_dates, errors='coerce')

    pipeline.fit(X, y)

    figure = graph_partial_dependence(pipeline, X, features='dt_column', grid_resolution=5)
    first_trace = figure.to_dict()['data'][0]
    assert first_trace['type'] == 'scatter'

    expected_x = list(pd.date_range('20200101', periods=X.shape[0]))
    assert first_trace['x'].tolist() == expected_x
示例#4
0
def test_partial_dependence_datetime(problem_type, X_y_regression, X_y_binary, X_y_multi):
    """Check one-way partial dependence on a datetime column and rejection of two-way.

    Builds a binary, multiclass, or regression pipeline depending on
    ``problem_type`` (anything other than 'binary' or 'multiclass' selects the
    regression pipeline), appends a daily datetime column, and verifies that:

    * one-way partial dependence on the datetime column, addressed by name and
      by the integer 20, yields the expected number of rows with no nulls;
    * two-way partial dependence involving the datetime column raises
      ``ValueError``.
    """
    preprocessing = ['Imputer', 'One Hot Encoder', 'DateTime Featurization Component', 'Standard Scaler']

    if problem_type == 'binary':
        dataset, pipeline_cls, estimator = X_y_binary, BinaryClassificationPipeline, 'Logistic Regression Classifier'
    elif problem_type == 'multiclass':
        dataset, pipeline_cls, estimator = X_y_multi, MulticlassClassificationPipeline, 'Logistic Regression Classifier'
    else:
        dataset, pipeline_cls, estimator = X_y_regression, RegressionPipeline, 'Linear Regressor'

    X, y = dataset
    pipeline = pipeline_cls(component_graph=preprocessing + [estimator])

    X = pd.DataFrame(X, columns=[str(col) for col in range(X.shape[1])])
    y = pd.Series(y)
    X['dt_column'] = pd.Series(pd.date_range('20200101', periods=X.shape[0]))

    pipeline.fit(X, y)

    # Multiclass output has 100 rows * 3 classes; otherwise 100 rows.
    expected_rows = 300 if problem_type == 'multiclass' else 100

    # NOTE(review): 20 is presumably the positional index of 'dt_column'
    # (i.e. the fixtures provide 20 feature columns) -- confirm.
    for datetime_feature in ('dt_column', 20):
        part_dep = partial_dependence(pipeline, X, features=datetime_feature)
        assert len(part_dep["partial_dependence"]) == expected_rows
        assert len(part_dep["feature_values"]) == expected_rows
        assert not part_dep.isnull().any(axis=None)

    for feature_pair in (('0', 'dt_column'), (0, 20)):
        with pytest.raises(ValueError, match='Two-way partial dependence is not supported for datetime columns.'):
            partial_dependence(pipeline, X, features=feature_pair)
示例#5
0
def decision_tree_classification_pipeline_class(X_y_categorical_classification):
    """Return a binary decision-tree pipeline fitted on the categorical fixture data.

    Despite the ``_class`` suffix in the name, the value returned is a fitted
    pipeline instance, not a class.
    """
    features, target = X_y_categorical_classification
    component_names = [
        'Simple Imputer',
        'One Hot Encoder',
        'Standard Scaler',
        'Decision Tree Classifier',
    ]
    fitted_pipeline = BinaryClassificationPipeline(component_names)
    fitted_pipeline.fit(features, target)
    return fitted_pipeline