def test_materialize_first_rows_inspection():
    """
    Tests whether the MaterializeFirstOutputRows works
    """
    inspector_result = PipelineInspector \
        .on_pipeline_from_py_file(ADULT_SIMPLE_PY) \
        .add_required_inspection(MaterializeFirstOutputRows(2)) \
        .execute()
    inspection_result = inspector_result.inspection_to_annotations
    assert MaterializeFirstOutputRows(2) in inspection_result
    result = inspection_result[MaterializeFirstOutputRows(2)]

    assert_df_dicts_equal(result, get_expected_result())
def run_and_assert_all_op_outputs_inspected(py_file_path, sensitive_columns, dag_png_path):
    """
    Execute the pipeline with a few checks and inspections.
    Assert that mlinspect properly lets inspections inspect all DAG nodes
    """

    inspector_result = PipelineInspector \
        .on_pipeline_from_py_file(py_file_path) \
        .add_check(NoBiasIntroducedFor(sensitive_columns)) \
        .add_check(NoIllegalFeatures()) \
        .add_required_inspection(MissingEmbeddings(20)) \
        .add_required_inspection(RowLineage(5)) \
        .add_required_inspection(MaterializeFirstOutputRows(5)) \
        .execute()
    materialize_output = inspector_result.inspection_to_annotations[MaterializeFirstOutputRows(5)]
    assert len(materialize_output) == (len(inspector_result.dag.nodes) - 1)  # Estimator does not have output

    save_fig_to_path(inspector_result.dag, dag_png_path)
    assert os.path.isfile(dag_png_path)
def run_and_assert_all_op_outputs_inspected(py_file_path,
                                            sensitive_columns,
                                            dag_png_path,
                                            custom_monkey_patching=None):
    """
    Execute the pipeline with a few checks and inspections.
    Assert that mlinspect properly lets inspections inspect all DAG nodes
    """
    if custom_monkey_patching is None:
        custom_monkey_patching = []

    inspector_result = PipelineInspector \
        .on_pipeline_from_py_file(py_file_path) \
        .add_check(NoBiasIntroducedFor(sensitive_columns)) \
        .add_check(NoIllegalFeatures()) \
        .add_required_inspection(MissingEmbeddings(20)) \
        .add_required_inspection(RowLineage(5)) \
        .add_required_inspection(MaterializeFirstOutputRows(5)) \
        .add_custom_monkey_patching_modules(custom_monkey_patching) \
        .execute()

    for dag_node, inspection_result in inspector_result.dag_node_to_inspection_results.items(
    ):
        assert dag_node.operator_info.operator != OperatorType.MISSING_OP
        assert MaterializeFirstOutputRows(5) in inspection_result
        assert RowLineage(5) in inspection_result
        assert MissingEmbeddings(20) in inspection_result
        assert HistogramForColumns(sensitive_columns) in inspection_result
        if dag_node.operator_info.operator != OperatorType.ESTIMATOR:  # Estimator does not have output
            assert inspection_result[MaterializeFirstOutputRows(5)] is not None
            assert inspection_result[RowLineage(5)] is not None
            assert inspection_result[HistogramForColumns(
                sensitive_columns)] is not None
        else:
            assert inspection_result[MaterializeFirstOutputRows(5)] is None
            assert inspection_result[RowLineage(5)] is not None
            assert inspection_result[HistogramForColumns(
                sensitive_columns)] is None

    save_fig_to_path(inspector_result.dag, dag_png_path)
    assert os.path.isfile(dag_png_path)

    return inspector_result.dag
def test_materialize_first_rows_inspection():
    """
    Tests whether the MaterializeFirstOutputRows works
    """
    inspector_result = PipelineInspector \
        .on_pipeline_from_py_file(ADULT_SIMPLE_PY) \
        .add_required_inspection(MaterializeFirstOutputRows(2)) \
        .execute()

    dag_node_to_inspection_results = list(inspector_result.dag_node_to_inspection_results.items())

    assert_output_looks_as_expected(dag_node_to_inspection_results)
def run_multiple_test_analyzers(code):
    """
   An utility function to test backends.
   Also useful to debug annotation propagation.
   """
    analyzers = [RandomAnnotationTestingInspection(2), MaterializeFirstOutputRows(5),
                 RowLineage(2)]
    result = PipelineInspector \
        .on_pipeline_from_string(code) \
        .add_required_inspections(analyzers) \
        .execute()
    inspection_results = result.inspection_to_annotations
    return inspection_results, analyzers
def assert_output_looks_as_expected(dag_node_to_inspection_results):
    """
    Tests whether the output of MaterializeFirstOutputRows looks as expected for the adult_simple pipeline
    """
    assert dag_node_to_inspection_results[0][0].optional_code_info.source_code == \
           "pd.read_csv(train_file, na_values='?', index_col=0)"
    actual_df = dag_node_to_inspection_results[0][1][MaterializeFirstOutputRows(2)]
    expected_df = DataFrame([[46, 'Private', 128645, 'Some-college', 10, 'Divorced', 'Prof-specialty',
                              'Not-in-family', 'White', 'Female', 0, 0, 40, 'United-States', '<=50K'],
                             [29, 'Local-gov', 115585, 'Some-college', 10, 'Never-married', 'Handlers-cleaners',
                              'Not-in-family', 'White', 'Male', 0, 0, 50, 'United-States', '<=50K']],
                            columns=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                                     'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                                     'hours-per-week', 'native-country', 'income-per-year'])
    pandas.testing.assert_frame_equal(actual_df.reset_index(drop=True), expected_df.reset_index(drop=True))
    assert dag_node_to_inspection_results[1][0].optional_code_info.source_code == 'raw_data.dropna()'
    actual_df = dag_node_to_inspection_results[1][1][MaterializeFirstOutputRows(2)]
    expected_df = DataFrame([[46, 'Private', 128645, 'Some-college', 10, 'Divorced', 'Prof-specialty',
                              'Not-in-family', 'White', 'Female', 0, 0, 40, 'United-States', '<=50K'],
                             [29, 'Local-gov', 115585, 'Some-college', 10, 'Never-married', 'Handlers-cleaners',
                              'Not-in-family', 'White', 'Male', 0, 0, 50, 'United-States', '<=50K']],
                            columns=['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                                     'marital-status', 'occupation', 'relationship', 'race', 'sex',
                                     'capital-gain', 'capital-loss', 'hours-per-week',
                                     'native-country', 'income-per-year'])
    pandas.testing.assert_frame_equal(actual_df.reset_index(drop=True), expected_df.reset_index(drop=True))

    assert dag_node_to_inspection_results[2][0].optional_code_info.source_code == "data['income-per-year']"
    actual_df = dag_node_to_inspection_results[2][1][MaterializeFirstOutputRows(2)]
    expected_df = DataFrame([['<=50K'], ['<=50K']], columns=['income-per-year'])
    pandas.testing.assert_frame_equal(actual_df.reset_index(drop=True), expected_df.reset_index(drop=True))

    assert dag_node_to_inspection_results[3][0].optional_code_info.source_code == \
           "preprocessing.label_binarize(data['income-per-year'], classes=['>50K', '<=50K'])"
    actual_df = dag_node_to_inspection_results[3][1][MaterializeFirstOutputRows(2)]
    expected_df = DataFrame([[array(1)], [array(1)]], columns=['array'])
    pandas.testing.assert_frame_equal(actual_df.reset_index(drop=True), expected_df.reset_index(drop=True))

    assert dag_node_to_inspection_results[4][0].code_location.lineno == 18
    actual_df = dag_node_to_inspection_results[4][1][MaterializeFirstOutputRows(2)]
    expected_df = DataFrame([['Some-college', 'Private'], ['Some-college', 'Local-gov']],
                            columns=['education', 'workclass'])
    pandas.testing.assert_frame_equal(actual_df.reset_index(drop=True), expected_df.reset_index(drop=True))

    assert dag_node_to_inspection_results[5][0].optional_code_info.source_code == \
           "preprocessing.OneHotEncoder(handle_unknown='ignore')"
    actual_df = dag_node_to_inspection_results[5][1][MaterializeFirstOutputRows(2)]
    expected_df = DataFrame([[([array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0.,
                                       0., 0., 0.])])],
                             [[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.,
                                      0., 0., 0.])]]],
                            columns=['array'])
    pandas.testing.assert_frame_equal(actual_df.reset_index(drop=True), expected_df.reset_index(drop=True))

    assert dag_node_to_inspection_results[6][0].code_location.lineno == 18
    actual_df = dag_node_to_inspection_results[6][1][MaterializeFirstOutputRows(2)]
    expected_df = DataFrame([[46, 40], [29, 50]], columns=['age', 'hours-per-week'])
    pandas.testing.assert_frame_equal(actual_df.reset_index(drop=True), expected_df.reset_index(drop=True))

    assert dag_node_to_inspection_results[7][0].optional_code_info.source_code == 'preprocessing.StandardScaler()'
    actual_df = dag_node_to_inspection_results[7][1][MaterializeFirstOutputRows(2)]
    expected_df = DataFrame([[array([RangeComparison(0.5, 0.6), RangeComparison(-0.1, -0.05)])],
                             [array([RangeComparison(-0.8, -0.7), RangeComparison(0.7, 0.8)])]],
                            columns=['array'])
    pandas.testing.assert_frame_equal(actual_df.reset_index(drop=True), expected_df.reset_index(drop=True))

    assert dag_node_to_inspection_results[8][0].code_location.lineno == 18
    actual_df = dag_node_to_inspection_results[8][1][MaterializeFirstOutputRows(2)]
    expected_df = DataFrame([[array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,
                                     0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, RangeComparison(0.5, 0.6),
                                     RangeComparison(-0.1, -0.05)])],
                             [array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,
                                     0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, RangeComparison(-0.8, -0.7),
                                     RangeComparison(0.7, 0.8)])]],
                            columns=['array'])
    pandas.testing.assert_frame_equal(actual_df.reset_index(drop=True), expected_df.reset_index(drop=True))

    assert dag_node_to_inspection_results[9][0].optional_code_info.source_code == 'tree.DecisionTreeClassifier()'
    actual_df = dag_node_to_inspection_results[9][1][MaterializeFirstOutputRows(2)]
    expected_df = DataFrame([[array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,
                                     0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, RangeComparison(0.5, 0.6),
                                     RangeComparison(-0.1, -0.05)])],
                             [array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,
                                     0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, RangeComparison(-0.8, -0.7),
                                     RangeComparison(0.7, 0.8)])]],
                            columns=['array'])
    pandas.testing.assert_frame_equal(actual_df.reset_index(drop=True), expected_df.reset_index(drop=True))

    assert dag_node_to_inspection_results[10][0].optional_code_info.source_code == 'tree.DecisionTreeClassifier()'
    actual_df = dag_node_to_inspection_results[10][1][MaterializeFirstOutputRows(2)]
    expected_df = DataFrame([[array([1])], [array([1])]], columns=['array'])
    pandas.testing.assert_frame_equal(actual_df.reset_index(drop=True), expected_df.reset_index(drop=True))

    assert dag_node_to_inspection_results[11][0].optional_code_info.source_code == 'tree.DecisionTreeClassifier()'
    actual_df = dag_node_to_inspection_results[11][1][MaterializeFirstOutputRows(2)]
    assert actual_df is None