def test_frame_merge_sorted():
    """
    Checks that the patch for ('pandas.core.frame', 'merge') yields the expected DAG and row lineage
    when the merge is invoked with sort=True
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df_a = pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [7, 5, 4, 2, 1]})
        df_b = pd.DataFrame({'B': [1, 4, 3, 2, 5], 'C': [1, 5, 4, 11, None]})
        df_merged = df_a.merge(df_b, on='B', sort=True)
        df_expected = pd.DataFrame({'A': [5, 8, 4, 2], 'B': [1, 2, 4, 5], 'C': [1, 11, 5, None]})
        pd.testing.assert_frame_equal(df_merged, df_expected)
        """)
    run_result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                                  track_code_references=True,
                                                  inspections=[RowLineage(5)])
    # The node at index 3 is not part of the expected DAG built below, so it is dropped first
    # (presumably the DataFrame created for df_expected -- confirm against the executor).
    extracted_nodes = list(run_result.dag.nodes)
    run_result.dag.remove_node(extracted_nodes[3])

    frame_ctor = FunctionInfo('pandas.core.frame', 'DataFrame')
    left_source = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE, frame_ctor),
                          DagNodeDetails(None, ['A', 'B']),
                          OptionalCodeInfo(CodeReference(3, 7, 3, 65),
                                           "pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [7, 5, 4, 2, 1]})"))
    right_source = DagNode(1,
                           BasicCodeLocation("<string-source>", 4),
                           OperatorContext(OperatorType.DATA_SOURCE, frame_ctor),
                           DagNodeDetails(None, ['B', 'C']),
                           OptionalCodeInfo(CodeReference(4, 7, 4, 69),
                                            "pd.DataFrame({'B': [1, 4, 3, 2, 5], 'C': [1, 5, 4, 11, None]})"))
    join_node = DagNode(2,
                        BasicCodeLocation("<string-source>", 5),
                        OperatorContext(OperatorType.JOIN, FunctionInfo('pandas.core.frame', 'merge')),
                        DagNodeDetails("on 'B'", ['A', 'B', 'C']),
                        OptionalCodeInfo(CodeReference(5, 12, 5, 47), "df_a.merge(df_b, on='B', sort=True)"))

    reference_dag = networkx.DiGraph()
    reference_dag.add_edges_from([(left_source, join_node), (right_source, join_node)])
    compare(networkx.to_dict_of_dicts(run_result.dag), networkx.to_dict_of_dicts(reference_dag))

    # Row lineage of the join output: each row carries one lineage id from each input frame.
    join_results = run_result.dag_node_to_inspection_results[join_node]
    actual_lineage = join_results[RowLineage(5)]
    lineage_rows = [
        [5, 1, 1., {LineageId(0, 4), LineageId(1, 0)}],
        [8, 2, 11., {LineageId(0, 3), LineageId(1, 3)}],
        [4, 4, 5., {LineageId(0, 2), LineageId(1, 1)}],
        [2, 5, math.nan, {LineageId(0, 1), LineageId(1, 4)}],
    ]
    expected_lineage = DataFrame(lineage_rows, columns=['A', 'B', 'C', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
def test_black_box_operation():
    """
    Checks that a DataFrame coming from an operation without mlinspect support is represented by a
    MISSING_OP node that feeds the first supported operation (here: dropna)
    """
    pipeline_code = cleandoc("""
        import pandas
        from mlinspect.testing._testing_helper_utils import black_box_df_op
        
        df = black_box_df_op()
        df = df.dropna()
        print("df")
        """)

    run_result = _pipeline_executor.singleton.run(python_code=pipeline_code, track_code_references=True)
    actual_dag = run_result.dag

    # The missing-op warning node and the dropna node both point at the same source location.
    dropna_location = BasicCodeLocation("<string-source>", 5)
    dropna_code = OptionalCodeInfo(CodeReference(5, 5, 5, 16), 'df.dropna()')
    warning_text = ('Warning! Operator <string-source>:5 (df.dropna()) encountered a '
                    'DataFrame resulting from an operation without mlinspect support!')

    missing_op_node = DagNode(-1,
                              dropna_location,
                              OperatorContext(OperatorType.MISSING_OP, None),
                              DagNodeDetails(warning_text, ['A']),
                              dropna_code)
    dropna_node = DagNode(0,
                          dropna_location,
                          OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', 'dropna')),
                          DagNodeDetails('dropna', ['A']),
                          dropna_code)

    reference_dag = networkx.DiGraph()
    reference_dag.add_edge(missing_op_node, dropna_node)
    compare(networkx.to_dict_of_dicts(actual_dag), networkx.to_dict_of_dicts(reference_dag))
def test_func_defs_and_loops():
    """
    Checks DAG extraction for pipeline code containing a function definition and a for loop
    (the code is supplied by get_test_code_with_function_def_and_for_loop)
    """
    pipeline_code = get_test_code_with_function_def_and_for_loop()
    run_result = _pipeline_executor.singleton.run(python_code=pipeline_code, track_code_references=True)
    actual_dag = run_result.dag

    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 4),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A']),
                          OptionalCodeInfo(CodeReference(4, 9, 4, 44), "pd.DataFrame([0, 1], columns=['A'])"))

    # Both selections refer to the same source line (8) and differ only in their operator id.
    dropna_info = FunctionInfo('pandas.core.frame', 'dropna')
    dropna_code = OptionalCodeInfo(CodeReference(8, 9, 8, 20), 'df.dropna()')
    first_dropna, second_dropna = [
        DagNode(op_id,
                BasicCodeLocation("<string-source>", 8),
                OperatorContext(OperatorType.SELECTION, dropna_info),
                DagNodeDetails('dropna', ['A']),
                dropna_code)
        for op_id in (1, 2)]

    reference_dag = networkx.DiGraph()
    reference_dag.add_edges_from([(source_node, first_dropna), (first_dropna, second_dropna)])
    compare(networkx.to_dict_of_dicts(actual_dag), networkx.to_dict_of_dicts(reference_dag))
def test_statsmodels_add_constant():
    """
    Checks the DAG and the row lineage produced by the ('statsmodel.api', 'add_constant') patch
    """
    pipeline_code = cleandoc("""
        import numpy as np
        import statsmodels.api as sm
        np.random.seed(42)
        test = np.random.random(100)
        test = sm.add_constant(test)
        assert len(test) == 100
        """)

    run_result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                                  track_code_references=True,
                                                  inspections=[RowLineage(2)])

    random_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 4),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('numpy.random', 'random')),
                          DagNodeDetails('random', ['array']),
                          OptionalCodeInfo(CodeReference(4, 7, 4, 28), "np.random.random(100)"))
    constant_node = DagNode(1,
                            BasicCodeLocation("<string-source>", 5),
                            OperatorContext(OperatorType.PROJECTION_MODIFY,
                                            FunctionInfo('statsmodel.api', 'add_constant')),
                            DagNodeDetails('Adds const column', ['array']),
                            OptionalCodeInfo(CodeReference(5, 7, 5, 28), "sm.add_constant(test)"))

    reference_dag = networkx.DiGraph()
    reference_dag.add_edge(random_node, constant_node)
    compare(networkx.to_dict_of_dicts(run_result.dag), networkx.to_dict_of_dicts(reference_dag))

    # Both lineage comparisons pass atol=1, so the 0.5 entries only need to be close to the real values
    # (presumably they stand in for the random numbers -- confirm).
    random_results = run_result.dag_node_to_inspection_results[random_node]
    actual_random_lineage = random_results[RowLineage(2)]
    expected_random_lineage = DataFrame([[0.5, {LineageId(0, 0)}],
                                         [0.5, {LineageId(0, 1)}]],
                                        columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_random_lineage.reset_index(drop=True),
                                      expected_random_lineage.reset_index(drop=True),
                                      atol=1)

    constant_results = run_result.dag_node_to_inspection_results[constant_node]
    actual_constant_lineage = constant_results[RowLineage(2)]
    expected_constant_lineage = DataFrame([[numpy.array([0.5, 1.]), {LineageId(0, 0)}],
                                           [numpy.array([0.5, 1.]), {LineageId(0, 1)}]],
                                          columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_constant_lineage.reset_index(drop=True),
                                      expected_constant_lineage.reset_index(drop=True),
                                      atol=1)
# Example no. 5
# 0
def test_frame__setitem__():
    """
    Checks the DAG and the row lineage produced by the ('pandas.core.frame', '__setitem__') patch
    """
    pipeline_code = cleandoc("""
                import pandas as pd

                pandas_df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
                              'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                              'baz': [1, 2, 3, 4, 5, 6],
                              'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
                pandas_df['baz'] = pandas_df['baz'] + 1
                df_expected = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
                              'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                              'baz': [2, 3, 4, 5, 6, 7],
                              'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
                pd.testing.assert_frame_equal(pandas_df, df_expected)
                """)
    run_result = _pipeline_executor.singleton.run(
        python_code=pipeline_code,
        track_code_references=True,
        inspections=[RowLineage(2)])
    # The node at index 3 is not part of the expected DAG built below, so it is dropped first
    # (presumably the DataFrame created for df_expected -- confirm against the executor).
    extracted_nodes = list(run_result.dag.nodes)
    run_result.dag.remove_node(extracted_nodes[3])

    all_columns = ['foo', 'bar', 'baz', 'zoo']
    source_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['foo', 'bar', 'baz', 'zoo']),
        OptionalCodeInfo(
            CodeReference(3, 12, 6, 53),
            "pd.DataFrame({'foo': ['one', 'one', 'one', 'two', "
            "'two', 'two'],\n"
            "              'bar': ['A', 'B', 'C', 'A', 'B', 'C'],\n"
            "              'baz': [1, 2, 3, 4, 5, 6],\n"
            "              'zoo': ['x', 'y', 'z', 'q', 'w', 't']})"))
    projection_node = DagNode(
        1,
        BasicCodeLocation("<string-source>", 7),
        OperatorContext(OperatorType.PROJECTION, FunctionInfo('pandas.core.frame', '__getitem__')),
        DagNodeDetails("to ['baz']", ['baz']),
        OptionalCodeInfo(CodeReference(7, 19, 7, 35), "pandas_df['baz']"))
    setitem_node = DagNode(
        2,
        BasicCodeLocation("<string-source>", 7),
        OperatorContext(OperatorType.PROJECTION_MODIFY, FunctionInfo('pandas.core.frame', '__setitem__')),
        DagNodeDetails("modifies ['baz']", all_columns),
        OptionalCodeInfo(CodeReference(7, 0, 7, 39), "pandas_df['baz'] = pandas_df['baz'] + 1"))

    # Both the projection and the __setitem__ node hang off the data source.
    reference_dag = networkx.DiGraph()
    reference_dag.add_edges_from([(source_node, projection_node), (source_node, setitem_node)])
    compare(networkx.to_dict_of_dicts(run_result.dag), networkx.to_dict_of_dicts(reference_dag))

    setitem_results = run_result.dag_node_to_inspection_results[setitem_node]
    actual_lineage = setitem_results[RowLineage(2)]
    lineage_rows = [
        ['one', 'A', 2, 'x', {LineageId(0, 0)}],
        ['one', 'B', 3, 'y', {LineageId(0, 1)}],
    ]
    expected_lineage = DataFrame(lineage_rows, columns=['foo', 'bar', 'baz', 'zoo', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
def test_my_word_to_vec_transformer():
    """
    Checks the DAG and the row lineage produced by the custom patch for
    ('example_pipelines.healthcare.healthcare_utils', 'MyW2VTransformer')
    """
    pipeline_code = cleandoc("""
                import pandas as pd
                from example_pipelines.healthcare.healthcare_utils import MyW2VTransformer
                import numpy as np

                df = pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c']})
                word_to_vec = MyW2VTransformer(min_count=2, size=2, workers=1)
                encoded_data = word_to_vec.fit_transform(df)
                assert encoded_data.shape == (4, 2)
                """)
    # The transformer is only tracked because the custom patch module is handed to the executor.
    run_result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                                  track_code_references=True,
                                                  inspections=[RowLineage(3)],
                                                  custom_monkey_patching=[custom_monkeypatching])

    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 5),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A']),
                          OptionalCodeInfo(CodeReference(5, 5, 5, 62),
                                           "pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c']})"))
    transformer_info = FunctionInfo('example_pipelines.healthcare.healthcare_utils', 'MyW2VTransformer')
    transformer_node = DagNode(1,
                               BasicCodeLocation("<string-source>", 6),
                               OperatorContext(OperatorType.TRANSFORMER, transformer_info),
                               DagNodeDetails('Word2Vec', ['array']),
                               OptionalCodeInfo(CodeReference(6, 14, 6, 62),
                                                'MyW2VTransformer(min_count=2, size=2, workers=1)'))

    reference_dag = networkx.DiGraph()
    reference_dag.add_edge(source_node, transformer_node)
    compare(networkx.to_dict_of_dicts(run_result.dag), networkx.to_dict_of_dicts(reference_dag))

    transformer_results = run_result.dag_node_to_inspection_results[transformer_node]
    actual_lineage = transformer_results[RowLineage(3)]
    # Only the lineage column is compared; the zero vectors in the 'array' column are never checked
    # against the extracted embeddings.
    lineage_rows = [[numpy.array([0.0, 0.0, 0.0]), {LineageId(0, row_index)}] for row_index in range(3)]
    expected_lineage = DataFrame(lineage_rows, columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_series_equal(actual_lineage["mlinspect_lineage"],
                                       expected_lineage["mlinspect_lineage"])
    # NOTE(review): this asserts on the expected frame built above, not on the extracted lineage output,
    # so it cannot fail -- confirm whether the extracted output was meant to be checked here.
    assert expected_lineage.iloc[0, 0].shape == (3, )
def test_frame__getitem__selection():
    """
    Checks the DAG and the row lineage produced by the ('pandas.core.frame', '__getitem__') patch
    when it is used to filter rows with a boolean Series
    """
    pipeline_code = cleandoc("""
                import pandas as pd

                df = pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [1, 5, 4, 11, None]})
                df_selection = df[df['A'] > 3]
                df_expected = pd.DataFrame({'A': [4, 8, 5], 'B': [4, 11, None]})
                pd.testing.assert_frame_equal(df_selection.reset_index(drop=True), df_expected.reset_index(drop=True))
                """)
    run_result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                                  track_code_references=True,
                                                  inspections=[RowLineage(2)])
    # The node at index 3 is not part of the expected DAG built below, so it is dropped first
    # (presumably the DataFrame created for df_expected -- confirm against the executor).
    extracted_nodes = list(run_result.dag.nodes)
    run_result.dag.remove_node(extracted_nodes[3])

    getitem_info = FunctionInfo('pandas.core.frame', '__getitem__')
    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A', 'B']),
                          OptionalCodeInfo(CodeReference(3, 5, 3, 67),
                                           "pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [1, 5, 4, 11, None]})"))
    projection_node = DagNode(1,
                              BasicCodeLocation("<string-source>", 4),
                              OperatorContext(OperatorType.PROJECTION, getitem_info),
                              DagNodeDetails("to ['A']", ['A']),
                              OptionalCodeInfo(CodeReference(4, 18, 4, 25), "df['A']"))
    selection_node = DagNode(2,
                             BasicCodeLocation("<string-source>", 4),
                             OperatorContext(OperatorType.SELECTION, getitem_info),
                             DagNodeDetails("Select by Series: df[df['A'] > 3]", ['A', 'B']),
                             OptionalCodeInfo(CodeReference(4, 15, 4, 30), "df[df['A'] > 3]"))

    # Both the projection and the selection hang off the data source.
    reference_dag = networkx.DiGraph()
    reference_dag.add_edges_from([(source_node, projection_node), (source_node, selection_node)])
    compare(networkx.to_dict_of_dicts(run_result.dag), networkx.to_dict_of_dicts(reference_dag))

    selection_results = run_result.dag_node_to_inspection_results[selection_node]
    actual_lineage = selection_results[RowLineage(2)]
    lineage_rows = [
        [4, 4., {LineageId(0, 2)}],
        [8, 11., {LineageId(0, 3)}],
    ]
    expected_lineage = DataFrame(lineage_rows, columns=['A', 'B', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
def test_groupby_agg():
    """
    Checks the DAG and the row lineage produced by the patches for ('pandas.core.frame', 'groupby') and
    ('pandas.core.groupby.generic', 'agg')
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame({'group': ['A', 'B', 'A', 'C', 'B'], 'value': [1, 2, 1, 3, 4]})
        df_groupby_agg = df.groupby('group').agg(mean_value=('value', 'mean'))
        
        df_expected = pd.DataFrame({'group': ['A', 'B', 'C'], 'mean_value': [1, 3, 3]})
        pd.testing.assert_frame_equal(df_groupby_agg.reset_index(drop=False), df_expected.reset_index(drop=True))
        """)
    run_result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                                  track_code_references=True,
                                                  inspections=[RowLineage(2)])
    # The node at index 2 is not part of the expected DAG built below, so it is dropped first
    # (presumably the DataFrame created for df_expected -- confirm against the executor).
    extracted_nodes = list(run_result.dag.nodes)
    run_result.dag.remove_node(extracted_nodes[2])

    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['group', 'value']),
                          OptionalCodeInfo(CodeReference(3, 5, 3, 81),
                                           "pd.DataFrame({'group': ['A', 'B', 'A', 'C', 'B'], "
                                           "'value': [1, 2, 1, 3, 4]})"))
    # groupby and agg are expected to show up as one combined GROUP_BY_AGG node.
    groupby_agg_node = DagNode(1,
                               BasicCodeLocation("<string-source>", 4),
                               OperatorContext(OperatorType.GROUP_BY_AGG,
                                               FunctionInfo('pandas.core.groupby.generic', 'agg')),
                               DagNodeDetails("Groupby 'group', Aggregate: '{'mean_value': ('value', 'mean')}'",
                                              ['group', 'mean_value']),
                               OptionalCodeInfo(CodeReference(4, 17, 4, 70),
                                                "df.groupby('group').agg(mean_value=('value', 'mean'))"))

    reference_dag = networkx.DiGraph()
    reference_dag.add_edge(source_node, groupby_agg_node)
    compare(networkx.to_dict_of_dicts(run_result.dag), networkx.to_dict_of_dicts(reference_dag))

    groupby_results = run_result.dag_node_to_inspection_results[groupby_agg_node]
    actual_lineage = groupby_results[RowLineage(2)]
    lineage_rows = [
        ["A", 1, {LineageId(1, 0)}],
        ['B', 3, {LineageId(1, 1)}],
    ]
    expected_lineage = DataFrame(lineage_rows, columns=['group', 'mean_value', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
def test_frame__getitem__frame():
    """
    Checks the DAG and the row lineage produced by the ('pandas.core.frame', '__getitem__') patch
    when several column names are selected at once
    """
    pipeline_code = cleandoc("""
                import pandas as pd

                df = pd.DataFrame([[0, None, 2], [1, 2, 3], [4, None, 2], [9, 2, 3], [6, 1, 2], [1, 2, 3]], 
                    columns=['A', 'B', 'C'])
                df_projection = df[['A', 'C']]
                df_expected = pd.DataFrame([[0, 2], [1, 3], [4, 2], [9, 3], [6, 2], [1, 3]], columns=['A', 'C'])
                pd.testing.assert_frame_equal(df_projection, df_expected)
                """)
    run_result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                                  track_code_references=True,
                                                  inspections=[RowLineage(2)])
    # The node at index 2 is not part of the expected DAG built below, so it is dropped first
    # (presumably the DataFrame created for df_expected -- confirm against the executor).
    extracted_nodes = list(run_result.dag.nodes)
    run_result.dag.remove_node(extracted_nodes[2])

    # The recorded source text spans two lines and keeps the trailing space of the first one.
    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A', 'B', 'C']),
                          OptionalCodeInfo(CodeReference(3, 5, 4, 28),
                                           "pd.DataFrame([[0, None, 2], [1, 2, 3], [4, None, 2], "
                                           "[9, 2, 3], [6, 1, 2], [1, 2, 3]], \n"
                                           "    columns=['A', 'B', 'C'])"))
    projection_node = DagNode(1,
                              BasicCodeLocation("<string-source>", 5),
                              OperatorContext(OperatorType.PROJECTION,
                                              FunctionInfo('pandas.core.frame', '__getitem__')),
                              DagNodeDetails("to ['A', 'C']", ['A', 'C']),
                              OptionalCodeInfo(CodeReference(5, 16, 5, 30), "df[['A', 'C']]"))

    reference_dag = networkx.DiGraph()
    reference_dag.add_edge(source_node, projection_node)
    compare(networkx.to_dict_of_dicts(run_result.dag), networkx.to_dict_of_dicts(reference_dag))

    projection_results = run_result.dag_node_to_inspection_results[projection_node]
    actual_lineage = projection_results[RowLineage(2)]
    lineage_rows = [
        [0, 2, {LineageId(0, 0)}],
        [1, 3, {LineageId(0, 1)}],
    ]
    expected_lineage = DataFrame(lineage_rows, columns=['A', 'C', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
def test_frame_replace():
    """
    Checks the DAG and the row lineage produced by the ('pandas.core.frame', 'replace') patch
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame(['Low', 'Medium', 'Low', 'High', None], columns=['A'])
        df_replace = df.replace('Medium', 'Low')
        df_expected = pd.DataFrame(['Low', 'Low', 'Low', 'High', None], columns=['A'])
        pd.testing.assert_frame_equal(df_replace.reset_index(drop=True), df_expected.reset_index(drop=True))
        """)
    run_result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                                  track_code_references=True,
                                                  inspections=[RowLineage(2)])
    # The node at index 2 is not part of the expected DAG built below, so it is dropped first
    # (presumably the DataFrame created for df_expected -- confirm against the executor).
    extracted_nodes = list(run_result.dag.nodes)
    run_result.dag.remove_node(extracted_nodes[2])

    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A']),
                          OptionalCodeInfo(CodeReference(3, 5, 3, 72),
                                           "pd.DataFrame(['Low', 'Medium', 'Low', 'High', None], "
                                           "columns=['A'])"))
    replace_node = DagNode(1,
                           BasicCodeLocation("<string-source>", 4),
                           OperatorContext(OperatorType.PROJECTION_MODIFY,
                                           FunctionInfo('pandas.core.frame', 'replace')),
                           DagNodeDetails("Replace 'Medium' with 'Low'", ['A']),
                           OptionalCodeInfo(CodeReference(4, 13, 4, 40), "df.replace('Medium', 'Low')"))

    reference_dag = networkx.DiGraph()
    reference_dag.add_edge(source_node, replace_node)
    compare(networkx.to_dict_of_dicts(run_result.dag), networkx.to_dict_of_dicts(reference_dag))

    replace_results = run_result.dag_node_to_inspection_results[replace_node]
    actual_lineage = replace_results[RowLineage(2)]
    lineage_rows = [
        ['Low', {LineageId(0, 0)}],
        ['Low', {LineageId(0, 1)}],
    ]
    expected_lineage = DataFrame(lineage_rows, columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
def add_test_label_node(test_label_arg, caller_filename, function_info, lineno,
                        optional_code_reference, optional_source_code):
    """
    Register a TEST_LABELS DAG node for the labels of an estimator.score call

    Returns a tuple of the backend result, the newly created DAG node and the result data of the
    backend's annotated object.
    """
    # pylint: disable=too-many-arguments
    context = OperatorContext(OperatorType.TEST_LABELS, function_info)
    # The input info is resolved before the op id for the new node is requested.
    label_input = get_input_info(test_label_arg, caller_filename, lineno, function_info,
                                 optional_code_reference, optional_source_code)
    op_id = _pipeline_executor.singleton.get_next_op_id()

    code_location = BasicCodeLocation(caller_filename, lineno)
    details = DagNodeDetails(None, get_column_names(test_label_arg))
    code_info = get_optional_code_info_or_none(optional_code_reference, optional_source_code)
    label_node = DagNode(op_id, code_location, context, details, code_info)

    prepared_inputs = SklearnBackend.before_call(context, [label_input.annotated_dfobject])
    backend_result = SklearnBackend.after_call(context, prepared_inputs, test_label_arg)
    add_dag_node(label_node, [label_input.dag_node], backend_result)
    return backend_result, label_node, backend_result.annotated_dfobject.result_data
# Example no. 12
# 0
def test_numpy_random():
    """
    Checks the DAG node and the row lineage produced by the ('numpy.random', 'random') patch
    """
    pipeline_code = cleandoc("""
        import numpy as np
        np.random.seed(42)
        test = np.random.random(100)
        assert len(test) == 100
        """)

    run_result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                                  track_code_references=True,
                                                  inspections=[RowLineage(2)])
    # Only the first extracted node is inspected in this test.
    extracted_nodes = list(run_result.dag.nodes)
    actual_node: DagNode = extracted_nodes[0]

    reference_node = DagNode(0,
                             BasicCodeLocation("<string-source>", 3),
                             OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('numpy.random', 'random')),
                             DagNodeDetails('random', ['array']),
                             OptionalCodeInfo(CodeReference(3, 7, 3, 28), "np.random.random(100)"))
    compare(actual_node, reference_node)

    node_results = run_result.dag_node_to_inspection_results[actual_node]
    actual_lineage = node_results[RowLineage(2)]
    # The comparison passes atol=1, so the 0.5 entries only need to be close to the real values
    # (presumably they stand in for the random numbers -- confirm).
    lineage_rows = [
        [0.5, {LineageId(0, 0)}],
        [0.5, {LineageId(0, 1)}],
    ]
    expected_lineage = DataFrame(lineage_rows, columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True),
                                      atol=1)
def add_train_data_node(estimator, train_data_arg, function_info):
    """
    Register a TRAIN_DATA DAG node for the data of an estimator.fit call

    The code location and optional code info are read from the mlinspect_* attributes of the estimator.
    Returns a tuple of the backend result, the newly created DAG node and the result data of the
    backend's annotated object.
    """
    caller_filename = estimator.mlinspect_caller_filename
    lineno = estimator.mlinspect_lineno
    # The input info is resolved before the op id for the new node is requested.
    data_input = get_input_info(train_data_arg, caller_filename, lineno, function_info,
                                estimator.mlinspect_optional_code_reference,
                                estimator.mlinspect_optional_source_code)
    op_id = _pipeline_executor.singleton.get_next_op_id()
    context = OperatorContext(OperatorType.TRAIN_DATA, function_info)

    code_location = BasicCodeLocation(estimator.mlinspect_caller_filename, estimator.mlinspect_lineno)
    details = DagNodeDetails(None, ["array"])
    code_info = get_optional_code_info_or_none(estimator.mlinspect_optional_code_reference,
                                               estimator.mlinspect_optional_source_code)
    data_node = DagNode(op_id, code_location, context, details, code_info)

    prepared_inputs = SklearnBackend.before_call(context, [data_input.annotated_dfobject])
    backend_result = SklearnBackend.after_call(context, prepared_inputs, train_data_arg)
    add_dag_node(data_node, [data_input.dag_node], backend_result)
    return backend_result, data_node, backend_result.annotated_dfobject.result_data
# Example no. 14
# 0
    def patched_fit_transform(self, *args, **kwargs):
        """
        Patch for ('example_pipelines.healthcare.healthcare_utils.MyW2VTransformer', 'fit_transform')

        Calls the original fit_transform on the input prepared by the backend, records a TRANSFORMER
        DAG node for the call and returns the result data of the backend's annotated object.

        While the call is running, ``mlinspect_fit_transform_active`` is True. It is reset to False on
        every exit path, including when the original call or the backend raises, so a failed call cannot
        leave the instance stuck in the "active" state.
        """
        # pylint: disable=no-method-argument
        self.mlinspect_fit_transform_active = True  # pylint: disable=attribute-defined-outside-init
        try:
            original = gorilla.get_original_attribute(
                healthcare_utils.MyW2VTransformer, 'fit_transform')
            function_info = FunctionInfo(
                'example_pipelines.healthcare.healthcare_utils',
                'MyW2VTransformer')
            # The first positional argument is the data to transform; its DAG node becomes the parent
            input_info = get_input_info(args[0], self.mlinspect_caller_filename,
                                        self.mlinspect_lineno, function_info,
                                        self.mlinspect_optional_code_reference,
                                        self.mlinspect_optional_source_code)

            operator_context = OperatorContext(OperatorType.TRANSFORMER,
                                               function_info)
            input_infos = SklearnBackend.before_call(
                operator_context, [input_info.annotated_dfobject])
            # Run the original method on the backend-prepared data instead of the raw argument
            result = original(self, input_infos[0].result_data, *args[1:],
                              **kwargs)
            backend_result = SklearnBackend.after_call(operator_context,
                                                       input_infos, result)
            new_return_value = backend_result.annotated_dfobject.result_data
            assert isinstance(new_return_value, MlinspectNdarray)
            dag_node = DagNode(
                singleton.get_next_op_id(),
                BasicCodeLocation(self.mlinspect_caller_filename,
                                  self.mlinspect_lineno), operator_context,
                DagNodeDetails("Word2Vec: fit_transform", ['array']),
                get_optional_code_info_or_none(
                    self.mlinspect_optional_code_reference,
                    self.mlinspect_optional_source_code))
            add_dag_node(dag_node, [input_info.dag_node], backend_result)
            return new_return_value
        finally:
            # Always clear the flag, also when an exception propagates out of the block above
            self.mlinspect_fit_transform_active = False  # pylint: disable=attribute-defined-outside-init
def test_frame__init__():
    """
    Checks the DAG node and the row lineage produced by the ('pandas.core.frame', 'DataFrame') patch
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame([0, 1, 2], columns=['A'])
        assert len(df) == 3
        """)

    run_result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                                  track_code_references=True,
                                                  inspections=[RowLineage(2)])
    # Only the first extracted node is inspected in this test.
    extracted_nodes = list(run_result.dag.nodes)
    actual_node: DagNode = extracted_nodes[0]

    reference_node = DagNode(0,
                             BasicCodeLocation("<string-source>", 3),
                             OperatorContext(OperatorType.DATA_SOURCE,
                                             FunctionInfo('pandas.core.frame', 'DataFrame')),
                             DagNodeDetails(None, ['A']),
                             OptionalCodeInfo(CodeReference(3, 5, 3, 43),
                                              "pd.DataFrame([0, 1, 2], columns=['A'])"))
    compare(actual_node, reference_node)

    node_results = run_result.dag_node_to_inspection_results[actual_node]
    actual_lineage = node_results[RowLineage(2)]
    lineage_rows = [
        [0, {LineageId(0, 0)}],
        [1, {LineageId(0, 1)}],
    ]
    expected_lineage = DataFrame(lineage_rows, columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
def test_get_rdataset():
    """
    Checks that loading a dataset through the patched ('statsmodels.datasets', 'get_rdataset')
    function produces the expected data source DAG node and RowLineage output
    """
    pipeline_code = cleandoc("""
        import statsmodels.api as sm

        dat = sm.datasets.get_rdataset("Guerry", "HistData").data
        assert len(dat) == 86
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])

    # Column names expected both on the DAG node and in the lineage output
    guerry_columns = [
        'dept', 'Region', 'Department', 'Crime_pers', 'Crime_prop', 'Literacy', 'Donations', 'Infants',
        'Suicides', 'MainCity', 'Wealth', 'Commerce', 'Clergy', 'Crime_parents', 'Infanticide',
        'Donation_clergy', 'Lottery', 'Desertion', 'Instruction', 'Prostitutes', 'Distance', 'Area',
        'Pop1831'
    ]

    # Only the first node of the extracted DAG is inspected by this test
    actual_node: DagNode = list(result.dag.nodes)[0]
    operator = OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('statsmodels.datasets', 'get_rdataset'))
    details = DagNodeDetails('Data from A.-M. Guerry, "Essay on the Moral Statistics of France"',
                             list(guerry_columns))
    code_info = OptionalCodeInfo(CodeReference(3, 6, 3, 52),
                                 'sm.datasets.get_rdataset("Guerry", "HistData")')
    expected_node = DagNode(0, BasicCodeLocation("<string-source>", 3), operator, details, code_info)
    compare(actual_node, expected_node)

    actual_lineage = result.dag_node_to_inspection_results[actual_node][RowLineage(2)]
    first_row = [
        1, 'E', 'Ain', 28870, 15890, 37, 5098, 33120, 35039, '2:Med', 73, 58, 11, 71, 60, 69, 41, 55, 46,
        13, 218.372, 5762, 346.03, {LineageId(0, 0)}
    ]
    second_row = [
        2, 'N', 'Aisne', 26226, 5521, 51, 8901, 14572, 12831, '2:Med', 22, 10, 82, 4, 82, 36, 38, 82, 24,
        327, 65.945, 7369, 513.0, {LineageId(0, 1)}
    ]
    expected_lineage = DataFrame([first_row, second_row],
                                 columns=guerry_columns + ['mlinspect_lineage'])

    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
def test_read_csv():
    """
    Checks that reading a file through the patched ('pandas.io.parsers', 'read_csv')
    function produces the expected data source DAG node and RowLineage output
    """
    pipeline_code = cleandoc("""
        import os
        import pandas as pd
        from mlinspect.utils import get_project_root
        
        train_file = os.path.join(str(get_project_root()), "example_pipelines", "adult_complex", "adult_train.csv")
        raw_data = pd.read_csv(train_file, na_values='?', index_col=0)
        assert len(raw_data) == 22792
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])

    # Column names expected both on the DAG node and in the lineage output
    adult_columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
        'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income-per-year'
    ]

    # Only the first node of the extracted DAG is inspected by this test
    actual_node: DagNode = list(result.dag.nodes)[0]
    operator = OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.io.parsers', 'read_csv'))
    # The node description is matched against a pattern instead of a fixed string
    details = DagNodeDetails(StringComparison(r".*\.csv"), list(adult_columns))
    code_info = OptionalCodeInfo(CodeReference(6, 11, 6, 62),
                                 "pd.read_csv(train_file, na_values='?', index_col=0)")
    expected_node = DagNode(0, BasicCodeLocation("<string-source>", 6), operator, details, code_info)
    compare(actual_node, expected_node)

    actual_lineage = result.dag_node_to_inspection_results[actual_node][RowLineage(2)]
    first_row = [
        46, 'Private', 128645, 'Some-college', 10, 'Divorced', 'Prof-specialty', 'Not-in-family', 'White',
        'Female', 0, 0, 40, 'United-States', '<=50K', {LineageId(0, 0)}
    ]
    second_row = [
        29, 'Local-gov', 115585, 'Some-college', 10, 'Never-married', 'Handlers-cleaners', 'Not-in-family',
        'White', 'Male', 0, 0, 50, 'United-States', '<=50K', {LineageId(0, 1)}
    ]
    expected_lineage = DataFrame([first_row, second_row],
                                 columns=adult_columns + ['mlinspect_lineage'])

    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
def test_frame_dropna():
    """
    Checks that calling the patched ('pandas.core.frame', 'dropna') method adds a selection
    node behind the data source node and yields the expected RowLineage output
    """
    pipeline_code = cleandoc("""
        import pandas as pd
        
        df = pd.DataFrame([0, 2, 4, 5, None], columns=['A'])
        assert len(df) == 5
        df = df.dropna()
        assert len(df) == 4
        """)
    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])

    # Expected DAG: DataFrame constructor -> dropna
    source_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A']),
        OptionalCodeInfo(CodeReference(3, 5, 3, 52), "pd.DataFrame([0, 2, 4, 5, None], columns=['A'])"))
    dropna_node = DagNode(
        1,
        BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', 'dropna')),
        DagNodeDetails('dropna', ['A']),
        OptionalCodeInfo(CodeReference(5, 5, 5, 16), 'df.dropna()'))
    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(source_node, dropna_node)

    actual_as_dict = networkx.to_dict_of_dicts(result.dag)
    expected_as_dict = networkx.to_dict_of_dicts(expected_dag)
    compare(actual_as_dict, expected_as_dict)

    # The lineage is checked on the selection node, not on the data source
    actual_lineage = result.dag_node_to_inspection_results[dropna_node][RowLineage(2)]
    expected_rows = [[0., {LineageId(0, 0)}], [2., {LineageId(0, 1)}]]
    expected_lineage = DataFrame(expected_rows, columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
Exemplo n.º 19
0
def get_input_info(df_object, caller_filename, lineno, function_info, optional_code_reference, optional_source_code) \
        -> InputInfo:
    """
    Resolve the parent DAG node and annotations for df_object.

    If df_object carries the patched _mlinspect_annotation attribute, its _mlinspect_dag_node id is looked up in
    singleton.op_id_to_dag_node and returned together with the stored annotations. Otherwise the data source
    inspection visits are run on df_object, a MISSING_OP DAG node with a warning description is created and
    registered via add_dag_node, and that node is returned with the freshly computed annotations.

    Raises NotImplementedError if df_object is not a DataFrame, Series, csr_matrix or numpy.ndarray.
    """
    # pylint: disable=too-many-arguments, unused-argument, protected-access, unused-variable, too-many-locals
    if not isinstance(df_object, (DataFrame, Series, csr_matrix, numpy.ndarray)):
        raise NotImplementedError("TODO: Mlinspect info storage for type: '{}'".format(type(df_object)))

    # Column names are derived up front, in the same precedence order as the type check above
    if isinstance(df_object, DataFrame):
        columns = list(df_object.columns)  # TODO: Update this for numpy arrays etc. later
    elif isinstance(df_object, Series):
        columns = [df_object.name]
    else:
        columns = ['array']

    if hasattr(df_object, "_mlinspect_annotation"):
        # Known object: reuse the DAG node and annotations that were attached to it
        parent_dag_node = singleton.op_id_to_dag_node[df_object._mlinspect_dag_node]
        stored_annotations = df_object._mlinspect_annotation
        return InputInfo(parent_dag_node, AnnotatedDfObject(df_object, stored_annotations))

    # Unknown object: run the data source inspections and represent its origin as a missing operator
    visit_result = execute_inspection_visits_data_source(
        OperatorContext(OperatorType.DATA_SOURCE, function_info), df_object)
    source_snippet = "({})".format(optional_source_code) if optional_code_reference else ""
    warning_text = ("Warning! Operator {}:{} {} encountered a DataFrame resulting from an operation "
                    "without mlinspect support!").format(caller_filename, lineno, source_snippet)
    placeholder_id = singleton.get_next_missing_op_id()
    placeholder_node = DagNode(placeholder_id,
                               BasicCodeLocation(caller_filename, lineno),
                               OperatorContext(OperatorType.MISSING_OP, None),
                               DagNodeDetails(warning_text, columns),
                               OptionalCodeInfo(optional_code_reference, optional_source_code))
    add_dag_node(placeholder_node, [], visit_result)
    fresh_annotations = visit_result.annotated_dfobject.result_annotation
    return InputInfo(placeholder_node, AnnotatedDfObject(df_object, fresh_annotations))
Exemplo n.º 20
0
def get_expected_dag_adult_easy_py():
    """
    Build and return the networkx.DiGraph that is expected for the adult_easy pipeline
    """
    # pylint: disable=too-many-locals
    adult_columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                     'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                     'hours-per-week', 'native-country', 'income-per-year']
    pipeline_str = ("compose.ColumnTransformer(transformers=[\n"
                    "    ('categorical', preprocessing.OneHotEncoder(handle_unknown='ignore'), "
                    "['education', 'workclass']),\n"
                    "    ('numeric', preprocessing.StandardScaler(), ['age', 'hours-per-week'])\n"
                    "])")
    fit_call = 'income_pipeline.fit(data, labels)'

    # --- pandas part of the pipeline ---
    data_source = DagNode(18, OperatorType.DATA_SOURCE, CodeReference(12, 11, 12, 62),
                          ('pandas.io.parsers', 'read_csv'), "adult_train.csv", list(adult_columns),
                          "pd.read_csv(train_file, na_values='?', index_col=0)")
    select = DagNode(20, OperatorType.SELECTION, CodeReference(14, 7, 14, 24),
                     ('pandas.core.frame', 'dropna'), "dropna", list(adult_columns), 'raw_data.dropna()')
    train_data = DagNode(56, OperatorType.TRAIN_DATA, CodeReference(24, 18, 26, 51),
                         ('sklearn.pipeline', 'fit', 'Train Data'), None, list(adult_columns), fit_call)

    # --- one projection and one transformer per column handled by the ColumnTransformer ---
    projection_specs = [
        (34, "to ['education'] (ColumnTransformer)", 'education'),
        (35, "to ['workclass'] (ColumnTransformer)", 'workclass'),
        (40, "to ['age'] (ColumnTransformer)", 'age'),
        (41, "to ['hours-per-week'] (ColumnTransformer)", 'hours-per-week'),
    ]
    projections = [
        DagNode(node_id, OperatorType.PROJECTION, CodeReference(18, 25, 21, 2),
                ('sklearn.compose._column_transformer', 'ColumnTransformer', 'Projection'),
                description, [column], pipeline_str)
        for node_id, description, column in projection_specs
    ]

    one_hot_module = ('sklearn.preprocessing._encoders', 'OneHotEncoder', 'Pipeline')
    one_hot_source = "preprocessing.OneHotEncoder(handle_unknown='ignore')"
    scaler_module = ('sklearn.preprocessing._data', 'StandardScaler', 'Pipeline')
    scaler_source = 'preprocessing.StandardScaler()'
    transformers = [
        DagNode(34, OperatorType.TRANSFORMER, CodeReference(19, 20, 19, 72), one_hot_module,
                "Categorical Encoder (OneHotEncoder), Column: 'education'", ['education'], one_hot_source),
        DagNode(35, OperatorType.TRANSFORMER, CodeReference(19, 20, 19, 72), one_hot_module,
                "Categorical Encoder (OneHotEncoder), Column: 'workclass'", ['workclass'], one_hot_source),
        DagNode(40, OperatorType.TRANSFORMER, CodeReference(20, 16, 20, 46), scaler_module,
                "Numerical Encoder (StandardScaler), Column: 'age'", ['age'], scaler_source),
        DagNode(41, OperatorType.TRANSFORMER, CodeReference(20, 16, 20, 46), scaler_module,
                "Numerical Encoder (StandardScaler), Column: 'hours-per-week'", ['hours-per-week'],
                scaler_source),
    ]

    concatenation = DagNode(46, OperatorType.CONCATENATION, CodeReference(18, 25, 21, 2),
                            ('sklearn.compose._column_transformer', 'ColumnTransformer', 'Concatenation'),
                            None, ['array'], pipeline_str)
    estimator = DagNode(51, OperatorType.ESTIMATOR, CodeReference(26, 19, 26, 48),
                        ('sklearn.tree._classes', 'DecisionTreeClassifier', 'Pipeline'),
                        "Decision Tree", source_code='tree.DecisionTreeClassifier()')
    pipeline_fit = DagNode(56, OperatorType.FIT, CodeReference(24, 18, 26, 51),
                           ('sklearn.pipeline', 'fit', 'Pipeline'), source_code=fit_call)

    # --- label branch ---
    project = DagNode(23, OperatorType.PROJECTION, CodeReference(16, 38, 16, 61),
                      ('pandas.core.frame', '__getitem__', 'Projection'), "to ['income-per-year']",
                      ['income-per-year'], "data['income-per-year']")
    project_modify = DagNode(28, OperatorType.PROJECTION_MODIFY, CodeReference(16, 9, 16, 89),
                             ('sklearn.preprocessing._label', 'label_binarize'),
                             "label_binarize, classes: ['>50K', '<=50K']", ['array'],
                             "preprocessing.label_binarize(data['income-per-year'], "
                             "classes=['>50K', '<=50K'])")
    train_labels = DagNode(56, OperatorType.TRAIN_LABELS, CodeReference(24, 18, 26, 51),
                           ('sklearn.pipeline', 'fit', 'Train Labels'), None, ['array'], fit_call)

    # Edges are listed in the order in which they are inserted into the graph
    edges = [(data_source, select), (select, train_data)]
    edges += [(train_data, projection) for projection in projections]
    edges += list(zip(projections, transformers))
    edges += [(transformer, concatenation) for transformer in transformers]
    edges += [
        (concatenation, estimator),
        (estimator, pipeline_fit),
        (select, project),
        (project, project_modify),
        (project_modify, train_labels),
        (train_labels, pipeline_fit),
    ]

    expected_graph = networkx.DiGraph()
    expected_graph.add_node(data_source)
    for parent, child in edges:
        expected_graph.add_edge(parent, child)

    return expected_graph
Exemplo n.º 21
0
def get_expected_dag_adult_easy(caller_filename: str,
                                line_offset: int = 0,
                                with_code_references=True):
    """
    Build and return the networkx.DiGraph that is expected for the adult_easy pipeline.

    caller_filename is used as the file of every node's code location, line_offset is added to
    every line number (the line numbers differ slightly between the .py file and the .ipynb file),
    and if with_code_references is falsy the optional_code_info of every node is set to None.
    """
    # pylint: disable=too-many-locals

    def location(line):
        """Code location in caller_filename, shifted by line_offset"""
        return BasicCodeLocation(caller_filename, line + line_offset)

    def operator(operator_type, module, function):
        """Operator context for the given type and patched function"""
        return OperatorContext(operator_type, FunctionInfo(module, function))

    def code_info(start_line, start_col, end_line, end_col, source):
        """Code reference with both line numbers shifted by line_offset, plus its source text"""
        reference = CodeReference(start_line + line_offset, start_col, end_line + line_offset, end_col)
        return OptionalCodeInfo(reference, source)

    adult_columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
        'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income-per-year'
    ]
    pipeline_str = ("compose.ColumnTransformer(transformers=[\n"
                    "    ('categorical', preprocessing.OneHotEncoder(handle_unknown='ignore'), "
                    "['education', 'workclass']),\n"
                    "    ('numeric', preprocessing.StandardScaler(), ['age', 'hours-per-week'])\n"
                    "])")
    column_transformer = ('sklearn.compose._column_transformer', 'ColumnTransformer')
    decision_tree = ('sklearn.tree._classes', 'DecisionTreeClassifier')
    decision_tree_source = 'tree.DecisionTreeClassifier()'

    # --- pandas part of the pipeline ---
    data_source = DagNode(0, location(12),
                          operator(OperatorType.DATA_SOURCE, 'pandas.io.parsers', 'read_csv'),
                          DagNodeDetails('adult_train.csv', list(adult_columns)),
                          code_info(12, 11, 12, 62, "pd.read_csv(train_file, na_values='?', index_col=0)"))
    select = DagNode(1, location(14),
                     operator(OperatorType.SELECTION, 'pandas.core.frame', 'dropna'),
                     DagNodeDetails('dropna', list(adult_columns)),
                     code_info(14, 7, 14, 24, 'raw_data.dropna()'))

    # --- feature branch: ColumnTransformer projections, transformers and concatenation ---
    project_categorical = DagNode(4, location(18),
                                  operator(OperatorType.PROJECTION, *column_transformer),
                                  DagNodeDetails("to ['education', 'workclass']", ['education', 'workclass']),
                                  code_info(18, 25, 21, 2, pipeline_str))
    project_numeric = DagNode(6, location(18),
                              operator(OperatorType.PROJECTION, *column_transformer),
                              DagNodeDetails("to ['age', 'hours-per-week']", ['age', 'hours-per-week']),
                              code_info(18, 25, 21, 2, pipeline_str))
    one_hot_encoder = DagNode(5, location(19),
                              operator(OperatorType.TRANSFORMER, 'sklearn.preprocessing._encoders',
                                       'OneHotEncoder'),
                              DagNodeDetails('One-Hot Encoder: fit_transform', ['array']),
                              code_info(19, 20, 19, 72,
                                        "preprocessing.OneHotEncoder(handle_unknown='ignore')"))
    standard_scaler = DagNode(7, location(20),
                              operator(OperatorType.TRANSFORMER, 'sklearn.preprocessing._data',
                                       'StandardScaler'),
                              DagNodeDetails('Standard Scaler: fit_transform', ['array']),
                              code_info(20, 16, 20, 46, 'preprocessing.StandardScaler()'))
    concatenation = DagNode(8, location(18),
                            operator(OperatorType.CONCATENATION, *column_transformer),
                            DagNodeDetails(None, ['array']),
                            code_info(18, 25, 21, 2, pipeline_str))
    train_data = DagNode(9, location(26),
                         operator(OperatorType.TRAIN_DATA, *decision_tree),
                         DagNodeDetails(None, ['array']),
                         code_info(26, 19, 26, 48, decision_tree_source))

    # --- label branch ---
    project_labels = DagNode(2, location(16),
                             operator(OperatorType.PROJECTION, 'pandas.core.frame', '__getitem__'),
                             DagNodeDetails("to ['income-per-year']", ['income-per-year']),
                             code_info(16, 38, 16, 61, "data['income-per-year']"))
    binarize_labels = DagNode(3, location(16),
                              operator(OperatorType.PROJECTION_MODIFY, 'sklearn.preprocessing._label',
                                       'label_binarize'),
                              DagNodeDetails("label_binarize, classes: ['>50K', '<=50K']", ['array']),
                              code_info(16, 9, 16, 89,
                                        "preprocessing.label_binarize(data['income-per-year'], "
                                        "classes=['>50K', '<=50K'])"))
    train_labels = DagNode(10, location(26),
                           operator(OperatorType.TRAIN_LABELS, *decision_tree),
                           DagNodeDetails(None, ['array']),
                           code_info(26, 19, 26, 48, decision_tree_source))

    # --- estimator fed by both branches ---
    estimator = DagNode(11, location(26),
                        operator(OperatorType.ESTIMATOR, *decision_tree),
                        DagNodeDetails('Decision Tree', []),
                        code_info(26, 19, 26, 48, decision_tree_source))

    # Edges are listed in the order in which they are inserted into the graph
    edges = [
        (data_source, select),
        (select, project_categorical),
        (select, project_numeric),
        (project_categorical, one_hot_encoder),
        (project_numeric, standard_scaler),
        (one_hot_encoder, concatenation),
        (standard_scaler, concatenation),
        (concatenation, train_data),
        (select, project_labels),
        (project_labels, binarize_labels),
        (binarize_labels, train_labels),
        (train_data, estimator),
        (train_labels, estimator),
    ]
    expected_graph = networkx.DiGraph()
    expected_graph.add_node(data_source)
    for parent, child in edges:
        expected_graph.add_edge(parent, child)

    if with_code_references:
        return expected_graph

    # Without code references, the code info is cleared on every node
    for dag_node in expected_graph.nodes:
        dag_node.optional_code_info = None
    return expected_graph
def test_ols_fit():
    """
    Checks that fitting through the patched ('statsmodels.regression.linear_model.OLS', 'fit')
    method yields the expected train data, train labels and estimator nodes with their lineage
    """
    pipeline_code = cleandoc("""
        import numpy as np
        import statsmodels.api as sm
        np.random.seed(42)
        nobs = 100
        X = np.random.random((nobs, 2))
        X = sm.add_constant(X)
        beta = [1, .1, .5]
        e = np.random.random(nobs)
        y = np.dot(X, beta) + e
        results = sm.OLS(y, X).fit()
        assert results.summary() is not None
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(3)])
    # Drop the first four nodes, then the second of the remaining ones, before comparing
    leading_nodes = list(result.dag.nodes)[0:4]
    result.dag.remove_nodes_from(leading_nodes)
    result.dag.remove_node(list(result.dag.nodes)[1])

    def ols_node(node_id, operator_type, details):
        """Expected node for the sm.OLS(y, X) call with the given id, operator type and details"""
        return DagNode(node_id, BasicCodeLocation("<string-source>", 10),
                       OperatorContext(operator_type, FunctionInfo('statsmodel.api.OLS', 'fit')),
                       details,
                       OptionalCodeInfo(CodeReference(10, 10, 10, 22), 'sm.OLS(y, X)'))

    train_data_node = ols_node(3, OperatorType.TRAIN_DATA, DagNodeDetails(None, ['array']))
    train_labels_node = ols_node(4, OperatorType.TRAIN_LABELS, DagNodeDetails(None, ['array']))
    estimator_node = ols_node(5, OperatorType.ESTIMATOR, DagNodeDetails('Decision Tree', []))

    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(train_data_node, estimator_node)
    expected_dag.add_edge(train_labels_node, estimator_node)

    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(expected_dag))

    # Train data lineage
    actual_lineage = result.dag_node_to_inspection_results[train_data_node][RowLineage(3)]
    expected_features = [
        numpy.array([1.0, 0.3745401188473625, 0.9507143064099162]),
        numpy.array([1.0, 0.7319939418114051, 0.5986584841970366]),
        numpy.array([1.0, 0.15601864044243652, 0.15599452033620265]),
    ]
    expected_lineage = DataFrame(
        [[features, {LineageId(3, row)}] for row, features in enumerate(expected_features)],
        columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True),
                                      atol=0.1)

    # Train labels lineage
    actual_lineage = result.dag_node_to_inspection_results[train_labels_node][RowLineage(3)]
    expected_labels = [2.154842811243982, 1.4566686012747074, 1.2552278383069588]
    expected_lineage = DataFrame(
        [[label, {LineageId(5, row)}] for row, label in enumerate(expected_labels)],
        columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True),
                                      atol=0.1)

    # Estimator lineage: each row carries the ids of its label and its feature row
    actual_lineage = result.dag_node_to_inspection_results[estimator_node][RowLineage(3)]
    expected_lineage = DataFrame(
        [[{LineageId(5, row), LineageId(3, row)}] for row in range(3)],
        columns=['mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True),
                                      check_column_type=False)
def get_expected_result():
    """
    Build the expected PrintFirstRowsAnalyzer(2) result for the adult_easy example.

    Returns a dict that maps every expected DagNode to the two-row DataFrame expected
    for it; the estimator node maps to None.
    """
    pipeline_str = (
        "compose.ColumnTransformer(transformers=[\n    "
        "('categorical', preprocessing.OneHotEncoder(handle_unknown='ignore'), "
        "['education', 'workclass']),\n    "
        "('numeric', preprocessing.StandardScaler(), ['age', 'hours-per-week'])\n])"
    )

    # The factories below hand out a fresh object on every call, so no two nodes or
    # frames share a list/DataFrame instance.
    def adult_columns():
        return ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                'hours-per-week', 'native-country', 'income-per-year']

    def raw_rows_frame():
        first_row = [46, 'Private', 128645, 'Some-college', 10, 'Divorced', 'Prof-specialty',
                     'Not-in-family', 'White', 'Female', 0, 0, 40, 'United-States', '<=50K']
        second_row = [29, 'Local-gov', 115585, 'Some-college', 10, 'Never-married', 'Handlers-cleaners',
                      'Not-in-family', 'White', 'Male', 0, 0, 50, 'United-States', '<=50K']
        return DataFrame([first_row, second_row], columns=adult_columns())

    def binarized_labels_frame():
        return DataFrame([[array(1)], [array(1)]], columns=['array'])

    def transformer_projection(node_id, column):
        return DagNode(node_id=node_id, operator_type=OperatorType.PROJECTION,
                       code_reference=CodeReference(18, 25, 21, 2),
                       module=('sklearn.compose._column_transformer', 'ColumnTransformer', 'Projection'),
                       description=f"to ['{column}'] (ColumnTransformer)", columns=[column],
                       source_code=pipeline_str)

    def standard_scaler(node_id, column):
        return DagNode(node_id=node_id, operator_type=OperatorType.TRANSFORMER,
                       code_reference=CodeReference(20, 16, 20, 46),
                       module=('sklearn.preprocessing._data', 'StandardScaler', 'Pipeline'),
                       description=f"Numerical Encoder (StandardScaler), Column: '{column}'",
                       columns=[column], source_code='preprocessing.StandardScaler()')

    def one_hot_encoder(node_id, column):
        return DagNode(node_id=node_id, operator_type=OperatorType.TRANSFORMER,
                       code_reference=CodeReference(19, 20, 19, 72),
                       module=('sklearn.preprocessing._encoders', 'OneHotEncoder', 'Pipeline'),
                       description=f"Categorical Encoder (OneHotEncoder), Column: '{column}'",
                       columns=[column],
                       source_code="preprocessing.OneHotEncoder(handle_unknown='ignore')")

    def pipeline_fit(operator_type, kind, columns):
        return DagNode(node_id=56, operator_type=operator_type, module=('sklearn.pipeline', 'fit', kind),
                       code_reference=CodeReference(24, 18, 26, 51), description=None,
                       columns=columns, source_code='income_pipeline.fit(data, labels)')

    # --- pandas / label preprocessing part of the pipeline ---
    read_csv_node = DagNode(node_id=18, operator_type=OperatorType.DATA_SOURCE,
                            module=('pandas.io.parsers', 'read_csv'),
                            code_reference=CodeReference(12, 11, 12, 62), description='adult_train.csv',
                            columns=adult_columns(),
                            source_code="pd.read_csv(train_file, na_values='?', index_col=0)")
    dropna_node = DagNode(node_id=20, operator_type=OperatorType.SELECTION,
                          module=('pandas.core.frame', 'dropna'),
                          code_reference=CodeReference(14, 7, 14, 24), description='dropna',
                          columns=adult_columns(), source_code='raw_data.dropna()')
    label_projection_node = DagNode(node_id=23, operator_type=OperatorType.PROJECTION,
                                    module=('pandas.core.frame', '__getitem__', 'Projection'),
                                    code_reference=CodeReference(16, 38, 16, 61),
                                    description="to ['income-per-year']",
                                    columns=['income-per-year'], source_code="data['income-per-year']")
    label_binarize_node = DagNode(
        node_id=28, operator_type=OperatorType.PROJECTION_MODIFY,
        module=('sklearn.preprocessing._label', 'label_binarize'),
        code_reference=CodeReference(16, 9, 16, 89),
        description="label_binarize, classes: ['>50K', '<=50K']", columns=['array'],
        source_code="preprocessing.label_binarize(data['income-per-year'], classes=['>50K', '<=50K'])")

    # --- feature concatenation and the estimator ---
    concatenation_node = DagNode(
        node_id=46, operator_type=OperatorType.CONCATENATION, code_reference=CodeReference(18, 25, 21, 2),
        module=('sklearn.compose._column_transformer', 'ColumnTransformer', 'Concatenation'),
        description=None, columns=['array'], source_code=pipeline_str)
    concatenated_frame = DataFrame(
        [[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                 0., 0., 0., 0., 0., 1., 0., 0., 1., 0.,
                 0., 0., 0., RangeComparison(0.5, 0.6),
                 RangeComparison(-0.09, -0.08)])],
         [array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.,
                 0., 0., 0., RangeComparison(-0.8, -0.7),
                 RangeComparison(0.7, 0.8)])]],
        columns=['array'])
    estimator_node = DagNode(node_id=51, operator_type=OperatorType.ESTIMATOR,
                             code_reference=CodeReference(26, 19, 26, 48),
                             module=('sklearn.tree._classes', 'DecisionTreeClassifier', 'Pipeline'),
                             description='Decision Tree', source_code='tree.DecisionTreeClassifier()')

    # Entries are listed in the same order as before so the dict's insertion order is unchanged.
    expected_result = {
        read_csv_node: raw_rows_frame(),
        dropna_node: raw_rows_frame(),
        label_projection_node: DataFrame([['<=50K'], ['<=50K']], columns=['array']),
        label_binarize_node: binarized_labels_frame(),
        pipeline_fit(OperatorType.TRAIN_DATA, 'Train Data', adult_columns()): raw_rows_frame(),
        pipeline_fit(OperatorType.TRAIN_LABELS, 'Train Labels', ['array']): binarized_labels_frame(),
        transformer_projection(40, 'age'):
            DataFrame([[46], [29]], columns=['age']),
        transformer_projection(34, 'education'):
            DataFrame([['Some-college'], ['Some-college']], columns=['education']),
        transformer_projection(41, 'hours-per-week'):
            DataFrame([[40], [50]], columns=['hours-per-week']),
        transformer_projection(35, 'workclass'):
            DataFrame([['Private'], ['Local-gov']], columns=['workclass']),
        standard_scaler(40, 'age'):
            DataFrame([[array(RangeComparison(0.5, 0.6))], [array(RangeComparison(-0.8, -0.7))]],
                      columns=['age']),
        standard_scaler(41, 'hours-per-week'):
            DataFrame([[array(RangeComparison(-0.09, -0.08))], [array(RangeComparison(0.7, 0.8))]],
                      columns=['hours-per-week']),
        one_hot_encoder(34, 'education'):
            DataFrame([[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])],
                       [array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])]],
                      columns=['education']),
        one_hot_encoder(35, 'workclass'):
            DataFrame([[array([0., 0., 1., 0., 0., 0., 0.])], [array([0., 1., 0., 0., 0., 0., 0.])]],
                      columns=['workclass']),
        concatenation_node: concatenated_frame,
        estimator_node: None,
    }
    return expected_result