def test_histogram_projection():
    """
    Tests whether HistogramForColumns works for projections
    """
    # Pipeline under test: build a frame with columns A, B, C and then
    # project twice, so column 'A' is no longer part of the frame.
    test_code = cleandoc("""
        import pandas as pd
        pandas_df = pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c', 'cat_b'], 'B': [1, 2, 4, 5, 7], 'C': [2, 2, 10, 5, 7]})
        pandas_df = pandas_df[['B', 'C']]
        pandas_df = pandas_df[['C']]
        """)

    inspector_result = (
        PipelineInspector
        .on_pipeline_from_string(test_code)
        .add_required_inspection(HistogramForColumns(["A"]))
        .execute()
    )
    per_node_results = list(
        inspector_result.dag_node_to_inspection_results.values())

    # The histogram of 'A' is expected to be identical for the first three
    # inspection results, even after the projections that drop 'A'.
    for node_index in range(3):
        histogram_output = per_node_results[node_index][
            HistogramForColumns(["A"])]
        expected_histogram = {'A': {'cat_a': 2, 'cat_b': 2, 'cat_c': 1}}
        compare(histogram_output, expected_histogram)
def test_histogram_merge():
    """
    Tests whether HistogramForColumns works for joins
    """
    # Pipeline under test: two frames joined on 'B'; only df_a has column 'A'.
    test_code = cleandoc("""
        import pandas as pd
        df_a = pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c', 'cat_b'], 'B': [1, 2, 4, 5, 7]})
        df_b = pd.DataFrame({'B': [1, 2, 3, 4, 5], 'C': [1, 5, 4, 11, None]})
        df_merged = df_a.merge(df_b, on='B')
        """)

    inspector_result = (
        PipelineInspector
        .on_pipeline_from_string(test_code)
        .add_required_inspection(HistogramForColumns(["A"]))
        .execute()
    )
    per_node_results = list(
        inspector_result.dag_node_to_inspection_results.values())

    # Expected histogram of 'A' for the first three inspection results, in
    # order. Presumably these correspond to df_a, df_b (no column 'A', hence
    # the empty histogram) and the merge output (the row with B=7 has no
    # join partner, so one 'cat_b' is lost) -- verify against the DAG order.
    expected_histograms = [
        {'A': {'cat_a': 2, 'cat_b': 2, 'cat_c': 1}},
        {'A': {}},
        {'A': {'cat_a': 2, 'cat_b': 1, 'cat_c': 1}},
    ]
    for node_index, expected_histogram in enumerate(expected_histograms):
        histogram_output = per_node_results[node_index][
            HistogramForColumns(["A"])]
        compare(histogram_output, expected_histogram)
def test_inspector_adult_easy_str_pipeline():
    """
    Tests whether the str version of the inspector works
    """
    # Load the pipeline source so it can be handed over as a plain string.
    with open(ADULT_SIMPLE_PY) as pipeline_file:
        pipeline_source = pipeline_file.read()

    inspector_result = (
        PipelineInspector
        .on_pipeline_from_string(pipeline_source)
        .add_required_inspection(MaterializeFirstOutputRows(5))
        .add_check(NoBiasIntroducedFor(['race']))
        .add_check(NoIllegalFeatures())
        .execute()
    )

    # The extracted DAG has to match the reference DAG for string sources.
    actual_graph = networkx.to_dict_of_dicts(inspector_result.dag)
    reference_graph = networkx.to_dict_of_dicts(
        get_expected_dag_adult_easy("<string-source>"))
    compare(actual_graph, reference_graph)

    # HistogramForColumns was not added explicitly above but is expected in
    # the results (presumably pulled in by NoBiasIntroducedFor -- confirm).
    first_node_results = list(
        inspector_result.dag_node_to_inspection_results.values())[0]
    assert HistogramForColumns(['race']) in first_node_results

    check_results = inspector_result.check_to_check_results
    bias_check_result = check_results[NoBiasIntroducedFor(['race'])]
    assert bias_check_result.status == CheckStatus.SUCCESS
    illegal_features_result = check_results[NoIllegalFeatures()]
    assert illegal_features_result.status == CheckStatus.FAILURE
def run_and_assert_all_op_outputs_inspected(py_file_path,
                                            sensitive_columns,
                                            dag_png_path,
                                            custom_monkey_patching=None):
    """
    Execute the pipeline with a few checks and inspections.
    Assert that mlinspect properly lets inspections inspect all DAG nodes.

    Saves the extracted DAG as an image to ``dag_png_path``, asserts that the
    file exists afterwards and returns the extracted DAG.
    """
    # Fall back to a fresh empty list when no extra patching modules are given.
    patch_modules = ([] if custom_monkey_patching is None
                     else custom_monkey_patching)

    inspector_result = (
        PipelineInspector
        .on_pipeline_from_py_file(py_file_path)
        .add_check(NoBiasIntroducedFor(sensitive_columns))
        .add_check(NoIllegalFeatures())
        .add_required_inspection(MissingEmbeddings(20))
        .add_required_inspection(RowLineage(5))
        .add_required_inspection(MaterializeFirstOutputRows(5))
        .add_custom_monkey_patching_modules(patch_modules)
        .execute()
    )

    node_results = inspector_result.dag_node_to_inspection_results
    for dag_node, inspection_result in node_results.items():
        operator_type = dag_node.operator_info.operator
        assert operator_type != OperatorType.MISSING_OP

        # Every inspection must have an entry for every DAG node.
        expected_inspections = (
            MaterializeFirstOutputRows(5),
            RowLineage(5),
            MissingEmbeddings(20),
            HistogramForColumns(sensitive_columns),
        )
        for expected_inspection in expected_inspections:
            assert expected_inspection in inspection_result

        if operator_type == OperatorType.ESTIMATOR:
            # Estimator does not have output
            assert inspection_result[MaterializeFirstOutputRows(5)] is None
            assert inspection_result[RowLineage(5)] is not None
            assert inspection_result[HistogramForColumns(
                sensitive_columns)] is None
        else:
            assert inspection_result[MaterializeFirstOutputRows(5)] is not None
            assert inspection_result[RowLineage(5)] is not None
            assert inspection_result[HistogramForColumns(
                sensitive_columns)] is not None

    save_fig_to_path(inspector_result.dag, dag_png_path)
    assert os.path.isfile(dag_png_path)
    return inspector_result.dag
def test_inspector_adult_easy_ipynb_pipeline():
    """
    Tests whether the .ipynb version of the inspector works
    """
    inspector_result = PipelineInspector\
        .on_pipeline_from_ipynb_file(ADULT_SIMPLE_IPYNB)\
        .add_required_inspection(MaterializeFirstOutputRows(5)) \
        .add_check(NoBiasIntroducedFor(['race'])) \
        .add_check(NoIllegalFeatures()) \
        .execute()

    # The DAG extracted from the notebook has to match the reference DAG.
    extracted_dag = inspector_result.dag
    expected_dag = get_expected_dag_adult_easy_ipynb()
    compare(networkx.to_dict_of_dicts(extracted_dag),
            networkx.to_dict_of_dicts(expected_dag))

    # Inspection results are looked up per DAG node via
    # ``dag_node_to_inspection_results``, the attribute the other tests in
    # this file use; the former ``inspection_to_annotations`` attribute
    # appears to be stale.
    # HistogramForColumns was not added explicitly above but is expected in
    # the results (presumably pulled in by NoBiasIntroducedFor -- confirm).
    first_node_results = list(
        inspector_result.dag_node_to_inspection_results.values())[0]
    assert HistogramForColumns(['race']) in first_node_results

    check_to_check_results = inspector_result.check_to_check_results
    assert check_to_check_results[NoBiasIntroducedFor(
        ['race'])].status == CheckStatus.SUCCESS
    assert check_to_check_results[
        NoIllegalFeatures()].status == CheckStatus.FAILURE