示例#1
0
def test_wine():
    """Verify ``demos.load_wine`` output shapes and container types.

    The default call must yield a ``ww.DataTable`` / ``ww.DataColumn`` pair,
    and ``return_pandas=True`` must yield a ``pd.DataFrame`` / ``pd.Series``
    pair. In both modes the features are expected to be 178 x 13 and the
    target to have 178 entries.
    """
    # (keyword arguments for the loader, expected features type, expected target type)
    cases = (
        ({}, ww.DataTable, ww.DataColumn),
        ({"return_pandas": True}, pd.DataFrame, pd.Series),
    )
    for load_kwargs, features_type, target_type in cases:
        features, target = demos.load_wine(**load_kwargs)
        assert features.shape == (178, 13)
        assert target.shape == (178, )
        assert isinstance(features, features_type)
        assert isinstance(target, target_type)
示例#2
0
def test_partial_dependence_multiclass(
        logistic_regression_multiclass_pipeline_class):
    """Check the layout of one-way and two-way partial dependence results.

    A multiclass logistic regression pipeline is fit on the wine data, then
    ``partial_dependence`` is evaluated for a single feature and for a pair
    of features. Both results must carry a ``class_label`` column covering
    every class and contain ``num_classes * grid_resolution`` rows.

    Args:
        logistic_regression_multiclass_pipeline_class: Fixture supplying the
            pipeline class to instantiate.
    """
    X, y = load_wine()
    estimator_parameters = {"Logistic Regression Classifier": {"n_jobs": 1}}
    pipeline = logistic_regression_multiclass_pipeline_class(
        parameters=estimator_parameters)
    pipeline.fit(X, y)

    num_classes = y.to_series().nunique()
    grid_resolution = 20
    expected_row_count = num_classes * grid_resolution

    # One-way: exactly three columns, one group of grid points per class.
    one_way = partial_dependence(pipeline=pipeline,
                                 X=X,
                                 features="magnesium",
                                 grid_resolution=grid_resolution)
    assert "class_label" in one_way.columns
    assert one_way["class_label"].nunique() == num_classes
    assert len(one_way.index) == expected_row_count
    expected_one_way_columns = [
        "feature_values", "partial_dependence", "class_label"
    ]
    assert list(one_way.columns) == expected_one_way_columns

    # Two-way: one column per grid point plus the class label column.
    two_way = partial_dependence(pipeline=pipeline,
                                 X=X,
                                 features=("magnesium", "alcohol"),
                                 grid_resolution=grid_resolution)

    assert "class_label" in two_way.columns
    assert two_way["class_label"].nunique() == num_classes
    assert len(two_way.index) == expected_row_count
    assert len(two_way.columns) == grid_resolution + 1
def test_pipeline_has_classes_property(
        logistic_regression_binary_pipeline_class,
        logistic_regression_multiclass_pipeline_class, problem_type, use_ints):
    """Check that ``classes_`` fails before fitting and is correct afterwards.

    Args:
        logistic_regression_binary_pipeline_class: Fixture supplying the
            pipeline class used when ``problem_type == "binary"``.
        logistic_regression_multiclass_pipeline_class: Fixture supplying the
            pipeline class used when ``problem_type == "multi"``.
        problem_type: Either ``"binary"`` (breast cancer data) or ``"multi"``
            (wine data).
        use_ints: When truthy, the string target labels are mapped to
            integers before fitting and integer classes are expected.

    Raises:
        ValueError: If ``problem_type`` is neither ``"binary"`` nor
            ``"multi"``.
    """
    parameters = {"Logistic Regression Classifier": {"n_jobs": 1}}

    if problem_type == "binary":
        X, y = load_breast_cancer(return_pandas=True)
        pipeline = logistic_regression_binary_pipeline_class(
            parameters=parameters)
        if use_ints:
            y = y.map({'malignant': 0, 'benign': 1})
            answer = [0, 1]
        else:
            answer = ["benign", "malignant"]
    elif problem_type == "multi":
        X, y = load_wine(return_pandas=True)
        pipeline = logistic_regression_multiclass_pipeline_class(
            parameters=parameters)
        if use_ints:
            y = y.map({"class_0": 0, "class_1": 1, "class_2": 2})
            answer = [0, 1, 2]
        else:
            answer = ["class_0", "class_1", "class_2"]
    else:
        # Fail loudly here; otherwise an unexpected value only surfaces
        # further down as an unrelated UnboundLocalError on ``pipeline``.
        raise ValueError(
            f"Unsupported problem_type {problem_type!r}; "
            "expected 'binary' or 'multi'.")

    # Accessing the property on an unfitted pipeline must raise.
    with pytest.raises(
            AttributeError,
            match="Cannot access class names before fitting the pipeline."):
        pipeline.classes_

    pipeline.fit(X, y)
    pd.testing.assert_series_equal(pd.Series(pipeline.classes_),
                                   pd.Series(answer))
示例#4
0
def test_graph_partial_dependence_multiclass(logistic_regression_multiclass_pipeline_class):
    """Check the figures built by ``graph_partial_dependence`` for a multiclass pipeline.

    Covers one-way and two-way plots, each with and without an explicit
    ``class_label``, and verifies that an unknown class label is rejected
    with a ``ValueError``. The test is skipped when plotly is not installed.

    Args:
        logistic_regression_multiclass_pipeline_class: Fixture supplying the
            pipeline class to instantiate.
    """
    go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed')
    X, y = load_wine()
    pipeline = logistic_regression_multiclass_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}})
    pipeline.fit(X, y)

    unknown_class_msg = "Class wine is not one of the classes the pipeline was fit on: class_0, class_1, class_2"

    def assert_matching_axis_ranges(figure_dict, axis_pairs):
        # Every listed pair of subplot axes must share the same x and y range.
        for first_axis, second_axis in axis_pairs:
            for axis_type in ('x', 'y'):
                assert figure_dict['layout'][axis_type + first_axis]['range'] == figure_dict['layout'][axis_type + second_axis]['range']

    def assert_single_class_1_trace(figure_dict):
        # A plot restricted to one class holds exactly one 20-point trace.
        assert len(figure_dict['data']) == 1
        assert len(figure_dict['data'][0]['x']) == 20
        assert len(figure_dict['data'][0]['y']) == 20
        assert figure_dict['data'][0]['name'] == 'class_1'

    # Test one-way without class labels
    one_way_all_classes = graph_partial_dependence(pipeline, X, features='magnesium', grid_resolution=20)
    assert isinstance(one_way_all_classes, go.Figure)
    fig_dict = one_way_all_classes.to_dict()
    assert len(fig_dict['data']) == len(pipeline.classes_)
    for trace, class_name in zip(fig_dict['data'], pipeline.classes_):
        assert len(trace['x']) == 20
        assert len(trace['y']) == 20
        assert trace['name'] == class_name

    # Check that all the subplots axes have the same range
    assert_matching_axis_ranges(fig_dict, [('axis2', 'axis3'), ('axis2', 'axis4'), ('axis3', 'axis4')])

    # Test one-way with class labels
    one_way_single_class = graph_partial_dependence(pipeline, X, features='magnesium', class_label='class_1', grid_resolution=20)
    assert isinstance(one_way_single_class, go.Figure)
    fig_dict = one_way_single_class.to_dict()
    assert_single_class_1_trace(fig_dict)

    with pytest.raises(ValueError, match=unknown_class_msg):
        graph_partial_dependence(pipeline, X, features='alcohol', class_label='wine')

    # Test two-way without class labels
    two_way_all_classes = graph_partial_dependence(pipeline, X, features=('magnesium', 'alcohol'), grid_resolution=20)
    assert isinstance(two_way_all_classes, go.Figure)
    fig_dict = two_way_all_classes.to_dict()
    assert len(fig_dict['data']) == 3, "Figure does not have partial dependence data for each class."
    class_indices = range(3)
    assert all(len(fig_dict["data"][i]['x']) == 20 for i in class_indices)
    assert all(len(fig_dict["data"][i]['y']) == 20 for i in class_indices)
    assert [fig_dict["data"][i]['name'] for i in class_indices] == ["class_0", "class_1", "class_2"]

    # Check that all the subplots axes have the same range
    assert_matching_axis_ranges(fig_dict, [('axis', 'axis2'), ('axis', 'axis3'), ('axis2', 'axis3')])

    # Test two-way with class labels
    two_way_single_class = graph_partial_dependence(pipeline, X, features=('magnesium', 'alcohol'), class_label='class_1', grid_resolution=20)
    assert isinstance(two_way_single_class, go.Figure)
    fig_dict = two_way_single_class.to_dict()
    assert_single_class_1_trace(fig_dict)

    # NOTE(review): this repeats the one-way 'alcohol' call from above rather
    # than a two-way call; kept as in the original.
    with pytest.raises(ValueError, match=unknown_class_msg):
        graph_partial_dependence(pipeline, X, features='alcohol', class_label='wine')
示例#5
0
def test_invalid_targets_regression_pipeline(target_type, dummy_regression_pipeline_class):
    """Check that fitting a regression pipeline on a non-numeric target raises.

    The wine data is loaded first; for ``"category"`` its target is cast to
    the pandas category dtype, and for ``"bool"`` the data is replaced by the
    breast cancer set with its labels mapped to booleans.

    Args:
        target_type: Selects the target variant; ``"category"`` and ``"bool"``
            trigger the conversions above, any other value leaves the wine
            target unchanged.
        dummy_regression_pipeline_class: Fixture supplying the regression
            pipeline class to instantiate.
    """
    expected_error = "Regression pipeline can only handle numeric target data"
    label_to_bool = {"malignant": False, "benign": True}

    X, y = load_wine(return_pandas=True)
    if target_type == "category":
        y = pd.Series(y).astype("category")
    elif target_type == "bool":
        X, y = load_breast_cancer(return_pandas=True)
        y = y.map(label_to_bool)

    regression_pipeline = dummy_regression_pipeline_class(parameters={})
    with pytest.raises(ValueError, match=expected_error):
        regression_pipeline.fit(X, y)
示例#6
0
def test_partial_dependence_multiclass_categorical(class_label,
                                                   logistic_regression_multiclass_pipeline_class):
    """Check partial dependence plots that involve categorical features.

    Two string-valued categorical columns (3 and 6 distinct values) are added
    to the wine data before fitting. The test then checks the trace type,
    the category tick labels and the trace names for a one-way categorical
    plot, a numeric/categorical two-way plot and a categorical/categorical
    two-way plot. The test is skipped when plotly is not installed.

    Args:
        class_label: Class to restrict the plot to, or ``None`` to plot every
            class (traces are then expected to be named ``class_<i>``).
        logistic_regression_multiclass_pipeline_class: Fixture supplying the
            pipeline class to instantiate.
    """
    pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed')

    X, y = load_wine()
    X['categorical_column'] = ww.DataColumn(pd.Series([i % 3 for i in range(X.shape[0])]).astype(str),
                                            logical_type="Categorical")
    X['categorical_column_2'] = ww.DataColumn(pd.Series([i % 6 for i in range(X.shape[0])]).astype(str),
                                              logical_type="Categorical")

    pipeline = logistic_regression_multiclass_pipeline_class({"Logistic Regression Classifier": {"n_jobs": 1}})

    pipeline.fit(X, y)

    def expected_trace_name(position):
        # Without an explicit label, traces are named after their class index.
        if class_label is None:
            return f'class_{position}'
        return class_label

    # One-way plot of a categorical feature: bar traces over the categories.
    fig = graph_partial_dependence(pipeline, X, features='categorical_column', class_label=class_label,
                                   grid_resolution=5)
    # Convert once per figure instead of once per assertion.
    fig_dict = fig.to_dict()
    # Guard against a vacuous pass: the loop below checks nothing when empty.
    assert len(fig_dict['data']) > 0, "Figure has no traces to check."
    for i, plot_data in enumerate(fig_dict['data']):
        assert plot_data['type'] == 'bar'
        assert plot_data['x'].tolist() == ['0', '1', '2']
        assert plot_data['name'] == expected_trace_name(i)

    # Two-way plot, numeric x categorical: contour with categories on the y axis.
    fig = graph_partial_dependence(pipeline, X, features=('alcohol', 'categorical_column'), class_label=class_label,
                                   grid_resolution=5)
    fig_dict = fig.to_dict()
    assert len(fig_dict['data']) > 0, "Figure has no traces to check."
    # The layout is per figure, so it is checked once rather than per trace.
    assert fig_dict['layout']['yaxis']['ticktext'] == ['0', '1', '2']
    for i, plot_data in enumerate(fig_dict['data']):
        assert plot_data['type'] == 'contour'
        assert plot_data['name'] == expected_trace_name(i)

    # Two-way plot, categorical x categorical: categories on both axes.
    fig = graph_partial_dependence(pipeline, X, features=('categorical_column_2', 'categorical_column'),
                                   class_label=class_label, grid_resolution=5)
    fig_dict = fig.to_dict()
    assert len(fig_dict['data']) > 0, "Figure has no traces to check."
    assert fig_dict['layout']['xaxis']['ticktext'] == ['0', '1', '2']
    assert fig_dict['layout']['yaxis']['ticktext'] == ['0', '1', '2', '3', '4', '5']
    for i, plot_data in enumerate(fig_dict['data']):
        assert plot_data['type'] == 'contour'
        assert plot_data['name'] == expected_trace_name(i)