def test_histogram_projection():
    """
    Checks the HistogramForColumns output for column 'A' across a chain of projections
    """
    test_code = cleandoc("""
            import pandas as pd

            pandas_df = pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c', 'cat_b'], 
                'B': [1, 2, 4, 5, 7], 'C': [2, 2, 10, 5, 7]})
            pandas_df = pandas_df[['B', 'C']]
            pandas_df = pandas_df[['C']]
            """)

    inspector_result = (
        PipelineInspector
        .on_pipeline_from_string(test_code)
        .add_required_inspection(HistogramForColumns(["A"]))
        .execute()
    )
    per_node_results = list(inspector_result.dag_node_to_inspection_results.values())

    # The pipeline code projects 'A' away, yet the same 'A' counts are expected
    # for each of the first three DAG nodes.
    expected_histogram = {'A': {'cat_a': 2, 'cat_b': 2, 'cat_c': 1}}
    for node_index in range(3):
        # Look up with a freshly built inspection object, not the registered instance
        histogram_output = per_node_results[node_index][HistogramForColumns(["A"])]
        compare(histogram_output, expected_histogram)
def test_histogram_merge():
    """
    Checks the HistogramForColumns output for column 'A' around a DataFrame merge
    """
    test_code = cleandoc("""
            import pandas as pd

            df_a = pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c', 'cat_b'], 'B': [1, 2, 4, 5, 7]})
            df_b = pd.DataFrame({'B': [1, 2, 3, 4, 5], 'C': [1, 5, 4, 11, None]})
            df_merged = df_a.merge(df_b, on='B')
            """)

    inspector_result = (
        PipelineInspector
        .on_pipeline_from_string(test_code)
        .add_required_inspection(HistogramForColumns(["A"]))
        .execute()
    )
    per_node_results = list(inspector_result.dag_node_to_inspection_results.values())

    # Expected histograms in DAG-node order.
    # NOTE(review): presumably the nodes are df_a, df_b and the merge, in that order - confirm.
    expected_per_node = [
        {'A': {'cat_a': 2, 'cat_b': 2, 'cat_c': 1}},
        {'A': {}},
        {'A': {'cat_a': 2, 'cat_b': 1, 'cat_c': 1}},
    ]
    for node_index, expected_histogram in enumerate(expected_per_node):
        # Look up with a freshly built inspection object, not the registered instance
        histogram_output = per_node_results[node_index][HistogramForColumns(["A"])]
        compare(histogram_output, expected_histogram)
# Example #3
# 0
def test_inspector_adult_easy_str_pipeline():
    """
    Tests whether the str version of the inspector works

    Reads the pipeline source from ADULT_SIMPLE_PY, runs it through
    on_pipeline_from_string, and checks the extracted DAG, the presence of the
    'race' histogram on the first DAG node, and the status of both checks.
    """
    # Only the read needs the file handle; release it before the pipeline runs
    # so the file is not held open during execution and the assertions.
    with open(ADULT_SIMPLE_PY) as file:
        code = file.read()

    inspector_result = PipelineInspector\
        .on_pipeline_from_string(code)\
        .add_required_inspection(MaterializeFirstOutputRows(5)) \
        .add_check(NoBiasIntroducedFor(['race'])) \
        .add_check(NoIllegalFeatures()) \
        .execute()
    extracted_dag = inspector_result.dag
    expected_dag = get_expected_dag_adult_easy("<string-source>")
    compare(networkx.to_dict_of_dicts(extracted_dag),
            networkx.to_dict_of_dicts(expected_dag))

    # HistogramForColumns is never registered explicitly in this test.
    # NOTE(review): presumably NoBiasIntroducedFor adds it as a required inspection - confirm.
    assert HistogramForColumns(['race']) in list(
        inspector_result.dag_node_to_inspection_results.values())[0]
    check_to_check_results = inspector_result.check_to_check_results
    assert check_to_check_results[NoBiasIntroducedFor(
        ['race'])].status == CheckStatus.SUCCESS
    assert check_to_check_results[
        NoIllegalFeatures()].status == CheckStatus.FAILURE
# Example #4
# 0
def run_and_assert_all_op_outputs_inspected(py_file_path,
                                            sensitive_columns,
                                            dag_png_path,
                                            custom_monkey_patching=None):
    """
    Run the pipeline file with a fixed set of checks and inspections and assert
    that every DAG node has a result entry for each inspection.
    Afterwards the DAG is saved as an image to dag_png_path and returned.
    """
    patch_modules = [] if custom_monkey_patching is None else custom_monkey_patching

    inspector_result = (
        PipelineInspector
        .on_pipeline_from_py_file(py_file_path)
        .add_check(NoBiasIntroducedFor(sensitive_columns))
        .add_check(NoIllegalFeatures())
        .add_required_inspection(MissingEmbeddings(20))
        .add_required_inspection(RowLineage(5))
        .add_required_inspection(MaterializeFirstOutputRows(5))
        .add_custom_monkey_patching_modules(patch_modules)
        .execute()
    )

    # Lookup keys are built separately from the instances registered above
    first_rows_key = MaterializeFirstOutputRows(5)
    lineage_key = RowLineage(5)
    embeddings_key = MissingEmbeddings(20)
    histogram_key = HistogramForColumns(sensitive_columns)

    node_to_results = inspector_result.dag_node_to_inspection_results
    for dag_node, node_results in node_to_results.items():
        operator = dag_node.operator_info.operator
        assert operator != OperatorType.MISSING_OP

        # Every inspection must have an entry for every node
        assert first_rows_key in node_results
        assert lineage_key in node_results
        assert embeddings_key in node_results
        assert histogram_key in node_results

        # Estimator does not have output: first rows and histogram must be None
        # exactly for that operator, while row lineage is expected for all nodes.
        produces_output = operator != OperatorType.ESTIMATOR
        assert (node_results[first_rows_key] is not None) == produces_output
        assert node_results[lineage_key] is not None
        assert (node_results[histogram_key] is not None) == produces_output

    save_fig_to_path(inspector_result.dag, dag_png_path)
    assert os.path.isfile(dag_png_path)

    return inspector_result.dag
# Example #5
# 0
def test_inspector_adult_easy_ipynb_pipeline():
    """
    Runs the inspector on the .ipynb pipeline at ADULT_SIMPLE_IPYNB and checks
    the extracted DAG, the 'race' histogram and the status of both checks
    """
    inspector_result = (
        PipelineInspector
        .on_pipeline_from_ipynb_file(ADULT_SIMPLE_IPYNB)
        .add_required_inspection(MaterializeFirstOutputRows(5))
        .add_check(NoBiasIntroducedFor(['race']))
        .add_check(NoIllegalFeatures())
        .execute()
    )

    actual_dag = inspector_result.dag
    reference_dag = get_expected_dag_adult_easy_ipynb()
    compare(networkx.to_dict_of_dicts(actual_dag), networkx.to_dict_of_dicts(reference_dag))

    # NOTE(review): other tests in this file read dag_node_to_inspection_results;
    # confirm inspection_to_annotations is the intended attribute here.
    assert HistogramForColumns(['race']) in inspector_result.inspection_to_annotations

    results_by_check = inspector_result.check_to_check_results
    bias_result = results_by_check[NoBiasIntroducedFor(['race'])]
    assert bias_result.status == CheckStatus.SUCCESS
    illegal_features_result = results_by_check[NoIllegalFeatures()]
    assert illegal_features_result.status == CheckStatus.FAILURE