def test_func_defs_and_loops():
    """
    Tests whether the DAG is extracted correctly for the pipeline returned by
    get_test_code_with_function_def_and_for_loop(): one data source followed by two chained
    dropna selections that share the same source position.
    """

    def dropna_node(op_id):
        # Both selections are reported for the same source span, so only the operator id varies
        return DagNode(
            op_id, BasicCodeLocation("<string-source>", 8),
            OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', 'dropna')),
            DagNodeDetails('dropna', ['A']),
            OptionalCodeInfo(CodeReference(8, 9, 8, 20), 'df.dropna()'))

    pipeline_code = get_test_code_with_function_def_and_for_loop()
    run_result = _pipeline_executor.singleton.run(python_code=pipeline_code, track_code_references=True)

    source_node = DagNode(
        0, BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A']),
        OptionalCodeInfo(CodeReference(4, 9, 4, 44), "pd.DataFrame([0, 1], columns=['A'])"))
    first_select = dropna_node(1)
    second_select = dropna_node(2)

    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(source_node, first_select)
    wanted_dag.add_edge(first_select, second_select)

    compare(networkx.to_dict_of_dicts(run_result.dag), networkx.to_dict_of_dicts(wanted_dag))
def test_frame_merge_sorted():
    """
    Tests whether the monkey patching of ('pandas.core.frame', 'merge') works if the sort option is set to True
    """
    # The embedded pipeline must keep its line structure: the expected DAG below refers to
    # lines 3, 4 and 5 of this program (line 2 is intentionally blank).
    test_code = cleandoc("""
        import pandas as pd

        df_a = pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [7, 5, 4, 2, 1]})
        df_b = pd.DataFrame({'B': [1, 4, 3, 2, 5], 'C': [1, 5, 4, 11, None]})
        df_merged = df_a.merge(df_b, on='B', sort=True)
        df_expected = pd.DataFrame({'A': [5, 8, 4, 2], 'B': [1, 2, 4, 5], 'C': [1, 11, 5, None]})
        pd.testing.assert_frame_equal(df_merged, df_expected)
        """)
    inspector_result = _pipeline_executor.singleton.run(python_code=test_code, track_code_references=True,
                                                        inspections=[RowLineage(5)])
    # Drop the fourth extracted node before comparing; presumably this is the data source
    # created for df_expected, which is not part of the expected DAG.
    inspector_result.dag.remove_node(list(inspector_result.dag.nodes)[3])

    expected_dag = networkx.DiGraph()
    expected_a = DagNode(
        0, BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A', 'B']),
        OptionalCodeInfo(CodeReference(3, 7, 3, 65),
                         "pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [7, 5, 4, 2, 1]})"))
    expected_b = DagNode(
        1, BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['B', 'C']),
        OptionalCodeInfo(CodeReference(4, 7, 4, 69),
                         "pd.DataFrame({'B': [1, 4, 3, 2, 5], 'C': [1, 5, 4, 11, None]})"))
    expected_join = DagNode(
        2, BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.JOIN, FunctionInfo('pandas.core.frame', 'merge')),
        DagNodeDetails("on 'B'", ['A', 'B', 'C']),
        OptionalCodeInfo(CodeReference(5, 12, 5, 47), "df_a.merge(df_b, on='B', sort=True)"))
    expected_dag.add_edge(expected_a, expected_join)
    expected_dag.add_edge(expected_b, expected_join)
    compare(networkx.to_dict_of_dicts(inspector_result.dag), networkx.to_dict_of_dicts(expected_dag))

    # Row lineage materialised for the join output
    inspection_results_join = inspector_result.dag_node_to_inspection_results[expected_join]
    lineage_output = inspection_results_join[RowLineage(5)]
    expected_lineage_df = DataFrame(
        [[5, 1, 1., {LineageId(0, 4), LineageId(1, 0)}],
         [8, 2, 11., {LineageId(0, 3), LineageId(1, 3)}],
         [4, 4, 5., {LineageId(0, 2), LineageId(1, 1)}],
         [2, 5, math.nan, {LineageId(0, 1), LineageId(1, 4)}]],
        columns=['A', 'B', 'C', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(lineage_output.reset_index(drop=True),
                                      expected_lineage_df.reset_index(drop=True))
def test_black_box_operation():
    """
    Tests whether a DataFrame coming from an operation without mlinspect support is represented
    by a MISSING_OP node that feeds the first supported operator
    """
    # The blank third line matters: the expected nodes below refer to line 5 of this program.
    test_code = cleandoc("""
        import pandas
        from mlinspect.testing._testing_helper_utils import black_box_df_op

        df = black_box_df_op()
        df = df.dropna()
        print("df")
        """)

    extracted_dag = _pipeline_executor.singleton.run(python_code=test_code, track_code_references=True).dag

    expected_dag = networkx.DiGraph()
    # The placeholder node is expected to carry the id -1 and the location of the consuming operator
    expected_missing_op = DagNode(
        -1, BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.MISSING_OP, None),
        DagNodeDetails(
            'Warning! Operator <string-source>:5 (df.dropna()) encountered a '
            'DataFrame resulting from an operation without mlinspect support!', ['A']),
        OptionalCodeInfo(CodeReference(5, 5, 5, 16), 'df.dropna()'))
    expected_select = DagNode(
        0, BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', 'dropna')),
        DagNodeDetails('dropna', ['A']),
        OptionalCodeInfo(CodeReference(5, 5, 5, 16), 'df.dropna()'))
    expected_dag.add_edge(expected_missing_op, expected_select)
    compare(networkx.to_dict_of_dicts(extracted_dag), networkx.to_dict_of_dicts(expected_dag))
def test_statsmodels_add_constant():
    """
    Tests whether the monkey patching of ('statsmodel.api', 'add_constant') works
    """
    # No blank lines here: the expected nodes below refer to lines 4 and 5 of this program.
    test_code = cleandoc("""
        import numpy as np
        import statsmodels.api as sm
        np.random.seed(42)
        test = np.random.random(100)
        test = sm.add_constant(test)
        assert len(test) == 100
        """)

    inspector_result = _pipeline_executor.singleton.run(python_code=test_code, track_code_references=True,
                                                        inspections=[RowLineage(2)])

    expected_dag = networkx.DiGraph()
    expected_random = DagNode(
        0, BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('numpy.random', 'random')),
        DagNodeDetails('random', ['array']),
        OptionalCodeInfo(CodeReference(4, 7, 4, 28), "np.random.random(100)"))
    expected_constant = DagNode(
        1, BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.PROJECTION_MODIFY, FunctionInfo('statsmodel.api', 'add_constant')),
        DagNodeDetails('Adds const column', ['array']),
        OptionalCodeInfo(CodeReference(5, 7, 5, 28), "sm.add_constant(test)"))
    expected_dag.add_edge(expected_random, expected_constant)
    compare(networkx.to_dict_of_dicts(inspector_result.dag), networkx.to_dict_of_dicts(expected_dag))

    # Lineage of the random data source. The values are random, so they are only compared
    # with an absolute tolerance of 1 around 0.5.
    inspection_results_random = inspector_result.dag_node_to_inspection_results[expected_random]
    lineage_output = inspection_results_random[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [[0.5, {LineageId(0, 0)}],
         [0.5, {LineageId(0, 1)}]],
        columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(lineage_output.reset_index(drop=True),
                                      expected_lineage_df.reset_index(drop=True),
                                      atol=1)

    # Lineage after add_constant: the expected rows are arrays holding the value and the constant 1
    inspection_results_constant = inspector_result.dag_node_to_inspection_results[expected_constant]
    lineage_output = inspection_results_constant[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [[numpy.array([0.5, 1.]), {LineageId(0, 0)}],
         [numpy.array([0.5, 1.]), {LineageId(0, 1)}]],
        columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(lineage_output.reset_index(drop=True),
                                      expected_lineage_df.reset_index(drop=True),
                                      atol=1)
def test_frame__setitem__():
    """
    Tests whether the monkey patching of ('pandas.core.frame', '__setitem__') works
    """
    # The DataFrame literal spans lines 3-6 of the embedded program and its continuation lines
    # are indented by 14 spaces; CodeReference(3, 12, 6, 53) and the expected source text
    # below depend on exactly this layout.
    test_code = cleandoc("""
        import pandas as pd

        pandas_df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
                      'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                      'baz': [1, 2, 3, 4, 5, 6],
                      'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
        pandas_df['baz'] = pandas_df['baz'] + 1
        df_expected = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
                      'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                      'baz': [2, 3, 4, 5, 6, 7],
                      'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
        pd.testing.assert_frame_equal(pandas_df, df_expected)
        """)
    inspector_result = _pipeline_executor.singleton.run(python_code=test_code, track_code_references=True,
                                                        inspections=[RowLineage(2)])
    # Drop the fourth extracted node before comparing; presumably this is the data source
    # created for df_expected, which is not part of the expected DAG.
    inspector_result.dag.remove_node(list(inspector_result.dag.nodes)[3])

    expected_dag = networkx.DiGraph()
    expected_data_source = DagNode(
        0, BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['foo', 'bar', 'baz', 'zoo']),
        OptionalCodeInfo(CodeReference(3, 12, 6, 53),
                         "pd.DataFrame({'foo': ['one', 'one', 'one', 'two', "
                         "'two', 'two'],\n"
                         "              'bar': ['A', 'B', 'C', 'A', 'B', 'C'],\n"
                         "              'baz': [1, 2, 3, 4, 5, 6],\n"
                         "              'zoo': ['x', 'y', 'z', 'q', 'w', 't']})"))
    expected_project = DagNode(
        1, BasicCodeLocation("<string-source>", 7),
        OperatorContext(OperatorType.PROJECTION, FunctionInfo('pandas.core.frame', '__getitem__')),
        DagNodeDetails("to ['baz']", ['baz']),
        OptionalCodeInfo(CodeReference(7, 19, 7, 35), "pandas_df['baz']"))
    expected_dag.add_edge(expected_data_source, expected_project)
    expected_project_modify = DagNode(
        2, BasicCodeLocation("<string-source>", 7),
        OperatorContext(OperatorType.PROJECTION_MODIFY, FunctionInfo('pandas.core.frame', '__setitem__')),
        DagNodeDetails("modifies ['baz']", ['foo', 'bar', 'baz', 'zoo']),
        OptionalCodeInfo(CodeReference(7, 0, 7, 39), "pandas_df['baz'] = pandas_df['baz'] + 1"))
    expected_dag.add_edge(expected_data_source, expected_project_modify)
    compare(networkx.to_dict_of_dicts(inspector_result.dag), networkx.to_dict_of_dicts(expected_dag))

    # Row lineage materialised for the __setitem__ output (first two rows)
    inspection_results_modify = inspector_result.dag_node_to_inspection_results[expected_project_modify]
    lineage_output = inspection_results_modify[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [['one', 'A', 2, 'x', {LineageId(0, 0)}],
         ['one', 'B', 3, 'y', {LineageId(0, 1)}]],
        columns=['foo', 'bar', 'baz', 'zoo', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(lineage_output.reset_index(drop=True),
                                      expected_lineage_df.reset_index(drop=True))
def test_my_word_to_vec_transformer():
    """
    Tests whether the monkey patching of ('example_pipelines.healthcare.healthcare_utils', 'MyW2VTransformer') works
    """
    # The blank fourth line matters: the expected nodes below refer to lines 5 and 6 of this program.
    test_code = cleandoc("""
        import pandas as pd
        from example_pipelines.healthcare.healthcare_utils import MyW2VTransformer
        import numpy as np

        df = pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c']})
        word_to_vec = MyW2VTransformer(min_count=2, size=2, workers=1)
        encoded_data = word_to_vec.fit_transform(df)
        assert encoded_data.shape == (4, 2)
        """)

    inspector_result = _pipeline_executor.singleton.run(python_code=test_code, track_code_references=True,
                                                        inspections=[RowLineage(3)],
                                                        custom_monkey_patching=[custom_monkeypatching])

    expected_dag = networkx.DiGraph()
    expected_data_source = DagNode(
        0, BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A']),
        OptionalCodeInfo(CodeReference(5, 5, 5, 62),
                         "pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c']})"))
    expected_estimator = DagNode(
        1, BasicCodeLocation("<string-source>", 6),
        OperatorContext(OperatorType.TRANSFORMER,
                        FunctionInfo('example_pipelines.healthcare.healthcare_utils', 'MyW2VTransformer')),
        DagNodeDetails('Word2Vec', ['array']),
        OptionalCodeInfo(CodeReference(6, 14, 6, 62), 'MyW2VTransformer(min_count=2, size=2, workers=1)'))
    expected_dag.add_edge(expected_data_source, expected_estimator)
    compare(networkx.to_dict_of_dicts(inspector_result.dag), networkx.to_dict_of_dicts(expected_dag))

    inspection_results_estimator = inspector_result.dag_node_to_inspection_results[expected_estimator]
    lineage_output = inspection_results_estimator[RowLineage(3)]
    expected_lineage_df = DataFrame(
        [[numpy.array([0.0, 0.0, 0.0]), {LineageId(0, 0)}],
         [numpy.array([0.0, 0.0, 0.0]), {LineageId(0, 1)}],
         [numpy.array([0.0, 0.0, 0.0]), {LineageId(0, 2)}]],
        columns=['array', 'mlinspect_lineage'])
    # Only the lineage column is compared; the 'array' values in expected_lineage_df are placeholders
    pandas.testing.assert_series_equal(lineage_output["mlinspect_lineage"],
                                       expected_lineage_df["mlinspect_lineage"])
    # NOTE(review): this assertion inspects expected_lineage_df (built just above), not
    # lineage_output, so it cannot fail because of the transformer. Confirm whether
    # lineage_output was intended, and which shape it should have given size=2 in the pipeline.
    assert expected_lineage_df.iloc[0, 0].shape == (3, )
def test_frame__getitem__selection():
    """
    Tests whether the monkey patching of ('pandas.core.frame', '__getitem__') works for filtering
    """
    # The blank second line matters: the expected nodes below refer to lines 3 and 4 of this program.
    test_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [1, 5, 4, 11, None]})
        df_selection = df[df['A'] > 3]
        df_expected = pd.DataFrame({'A': [4, 8, 5], 'B': [4, 11, None]})
        pd.testing.assert_frame_equal(df_selection.reset_index(drop=True), df_expected.reset_index(drop=True))
        """)
    inspector_result = _pipeline_executor.singleton.run(python_code=test_code, track_code_references=True,
                                                        inspections=[RowLineage(2)])
    # Drop the fourth extracted node before comparing; presumably this is the data source
    # created for df_expected, which is not part of the expected DAG.
    inspector_result.dag.remove_node(list(inspector_result.dag.nodes)[3])

    expected_dag = networkx.DiGraph()
    expected_data_source = DagNode(
        0, BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A', 'B']),
        OptionalCodeInfo(CodeReference(3, 5, 3, 67),
                         "pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [1, 5, 4, 11, None]})"))
    # The inner df['A'] is reported as a projection, the outer df[...] as the selection
    expected_projection = DagNode(
        1, BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.PROJECTION, FunctionInfo('pandas.core.frame', '__getitem__')),
        DagNodeDetails("to ['A']", ['A']),
        OptionalCodeInfo(CodeReference(4, 18, 4, 25), "df['A']"))
    expected_dag.add_edge(expected_data_source, expected_projection)
    expected_selection = DagNode(
        2, BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', '__getitem__')),
        DagNodeDetails("Select by Series: df[df['A'] > 3]", ['A', 'B']),
        OptionalCodeInfo(CodeReference(4, 15, 4, 30), "df[df['A'] > 3]"))
    expected_dag.add_edge(expected_data_source, expected_selection)
    compare(networkx.to_dict_of_dicts(inspector_result.dag), networkx.to_dict_of_dicts(expected_dag))

    # Row lineage materialised for the selection output (first two surviving rows)
    inspection_results_selection = inspector_result.dag_node_to_inspection_results[expected_selection]
    lineage_output = inspection_results_selection[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [[4, 4., {LineageId(0, 2)}],
         [8, 11., {LineageId(0, 3)}]],
        columns=['A', 'B', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(lineage_output.reset_index(drop=True),
                                      expected_lineage_df.reset_index(drop=True))
def test_groupby_agg():
    """
    Tests whether the monkey patching of ('pandas.core.frame', 'groupby') and
    ('pandas.core.groupby.generic', 'agg') works.
    """
    # The blank second line matters: the expected nodes below refer to lines 3 and 4 of this program.
    test_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame({'group': ['A', 'B', 'A', 'C', 'B'], 'value': [1, 2, 1, 3, 4]})
        df_groupby_agg = df.groupby('group').agg(mean_value=('value', 'mean'))
        df_expected = pd.DataFrame({'group': ['A', 'B', 'C'], 'mean_value': [1, 3, 3]})
        pd.testing.assert_frame_equal(df_groupby_agg.reset_index(drop=False), df_expected.reset_index(drop=True))
        """)
    inspector_result = _pipeline_executor.singleton.run(python_code=test_code, track_code_references=True,
                                                        inspections=[RowLineage(2)])
    # Drop the third extracted node before comparing; presumably this is the data source
    # created for df_expected, which is not part of the expected DAG.
    inspector_result.dag.remove_node(list(inspector_result.dag.nodes)[2])

    expected_dag = networkx.DiGraph()
    expected_data = DagNode(
        0, BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['group', 'value']),
        OptionalCodeInfo(CodeReference(3, 5, 3, 81),
                         "pd.DataFrame({'group': ['A', 'B', 'A', 'C', 'B'], "
                         "'value': [1, 2, 1, 3, 4]})"))
    # groupby and agg are expected to show up as one combined GROUP_BY_AGG node
    expected_groupby_agg = DagNode(
        1, BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.GROUP_BY_AGG, FunctionInfo('pandas.core.groupby.generic', 'agg')),
        DagNodeDetails("Groupby 'group', Aggregate: '{'mean_value': ('value', 'mean')}'",
                       ['group', 'mean_value']),
        OptionalCodeInfo(CodeReference(4, 17, 4, 70),
                         "df.groupby('group').agg(mean_value=('value', 'mean'))"))
    expected_dag.add_edge(expected_data, expected_groupby_agg)
    compare(networkx.to_dict_of_dicts(inspector_result.dag), networkx.to_dict_of_dicts(expected_dag))

    # The expected lineage ids carry the aggregation's own operator id (1), not the source's (0)
    inspection_results_groupby_agg = inspector_result.dag_node_to_inspection_results[expected_groupby_agg]
    lineage_output = inspection_results_groupby_agg[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [["A", 1, {LineageId(1, 0)}],
         ['B', 3, {LineageId(1, 1)}]],
        columns=['group', 'mean_value', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(lineage_output.reset_index(drop=True),
                                      expected_lineage_df.reset_index(drop=True))
def test_frame__getitem__frame():
    """
    Tests whether the monkey patching of ('pandas.core.frame', '__getitem__') works for multiple string arguments
    """
    # Layout of the embedded program is significant for CodeReference(3, 5, 4, 28) and the
    # expected source text below:
    #  * line 3 ends with a space after the comma, written as \x20 so that editors and
    #    formatters cannot silently strip it
    #  * the continuation on line 4 is indented by 4 spaces
    test_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame([[0, None, 2], [1, 2, 3], [4, None, 2], [9, 2, 3], [6, 1, 2], [1, 2, 3]],\x20
            columns=['A', 'B', 'C'])
        df_projection = df[['A', 'C']]
        df_expected = pd.DataFrame([[0, 2], [1, 3], [4, 2], [9, 3], [6, 2], [1, 3]], columns=['A', 'C'])
        pd.testing.assert_frame_equal(df_projection, df_expected)
        """)
    inspector_result = _pipeline_executor.singleton.run(python_code=test_code, track_code_references=True,
                                                        inspections=[RowLineage(2)])
    # Drop the third extracted node before comparing; presumably this is the data source
    # created for df_expected, which is not part of the expected DAG.
    inspector_result.dag.remove_node(list(inspector_result.dag.nodes)[2])

    expected_dag = networkx.DiGraph()
    expected_data_source = DagNode(
        0, BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A', 'B', 'C']),
        OptionalCodeInfo(CodeReference(3, 5, 4, 28),
                         "pd.DataFrame([[0, None, 2], [1, 2, 3], [4, None, 2], "
                         "[9, 2, 3], [6, 1, 2], [1, 2, 3]], \n"
                         "    columns=['A', 'B', 'C'])"))
    expected_project = DagNode(
        1, BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.PROJECTION, FunctionInfo('pandas.core.frame', '__getitem__')),
        DagNodeDetails("to ['A', 'C']", ['A', 'C']),
        OptionalCodeInfo(CodeReference(5, 16, 5, 30), "df[['A', 'C']]"))
    expected_dag.add_edge(expected_data_source, expected_project)
    compare(networkx.to_dict_of_dicts(inspector_result.dag), networkx.to_dict_of_dicts(expected_dag))

    # Row lineage materialised for the projection output (first two rows)
    inspection_results_project = inspector_result.dag_node_to_inspection_results[expected_project]
    lineage_output = inspection_results_project[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [[0, 2, {LineageId(0, 0)}],
         [1, 3, {LineageId(0, 1)}]],
        columns=['A', 'C', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(lineage_output.reset_index(drop=True),
                                      expected_lineage_df.reset_index(drop=True))
def test_frame_replace():
    """
    Tests whether the monkey patching of ('pandas.core.frame', 'replace') works
    """
    # The blank second line matters: the expected nodes below refer to lines 3 and 4 of this program.
    test_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame(['Low', 'Medium', 'Low', 'High', None], columns=['A'])
        df_replace = df.replace('Medium', 'Low')
        df_expected = pd.DataFrame(['Low', 'Low', 'Low', 'High', None], columns=['A'])
        pd.testing.assert_frame_equal(df_replace.reset_index(drop=True), df_expected.reset_index(drop=True))
        """)
    inspector_result = _pipeline_executor.singleton.run(python_code=test_code, track_code_references=True,
                                                        inspections=[RowLineage(2)])
    # Drop the third extracted node before comparing; presumably this is the data source
    # created for df_expected, which is not part of the expected DAG.
    inspector_result.dag.remove_node(list(inspector_result.dag.nodes)[2])

    expected_dag = networkx.DiGraph()
    expected_data_source = DagNode(
        0, BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A']),
        OptionalCodeInfo(CodeReference(3, 5, 3, 72),
                         "pd.DataFrame(['Low', 'Medium', 'Low', 'High', None], "
                         "columns=['A'])"))
    expected_modify = DagNode(
        1, BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.PROJECTION_MODIFY, FunctionInfo('pandas.core.frame', 'replace')),
        DagNodeDetails("Replace 'Medium' with 'Low'", ['A']),
        OptionalCodeInfo(CodeReference(4, 13, 4, 40), "df.replace('Medium', 'Low')"))
    expected_dag.add_edge(expected_data_source, expected_modify)
    compare(networkx.to_dict_of_dicts(inspector_result.dag), networkx.to_dict_of_dicts(expected_dag))

    # Row lineage materialised for the replace output: the 'Medium' in row 1 must already read 'Low'
    inspection_results_modify = inspector_result.dag_node_to_inspection_results[expected_modify]
    lineage_output = inspection_results_modify[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [['Low', {LineageId(0, 0)}],
         ['Low', {LineageId(0, 1)}]],
        columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(lineage_output.reset_index(drop=True),
                                      expected_lineage_df.reset_index(drop=True))
def test_frame__init__():
    """
    Tests whether the monkey patching of ('pandas.core.frame', 'DataFrame') works
    """
    # The blank second line matters: the expected node below refers to line 3 of this program.
    test_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame([0, 1, 2], columns=['A'])
        assert len(df) == 3
        """)

    inspector_result = _pipeline_executor.singleton.run(python_code=test_code, track_code_references=True,
                                                        inspections=[RowLineage(2)])
    # Only the first node of the extracted DAG is checked
    extracted_node: DagNode = list(inspector_result.dag.nodes)[0]

    expected_node = DagNode(
        0, BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A']),
        OptionalCodeInfo(CodeReference(3, 5, 3, 43), "pd.DataFrame([0, 1, 2], columns=['A'])"))
    compare(extracted_node, expected_node)

    # Row lineage materialised for the data source (first two rows)
    inspection_results_data_source = inspector_result.dag_node_to_inspection_results[extracted_node]
    lineage_output = inspection_results_data_source[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [[0, {LineageId(0, 0)}],
         [1, {LineageId(0, 1)}]],
        columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(lineage_output.reset_index(drop=True),
                                      expected_lineage_df.reset_index(drop=True))
def test_numpy_random():
    """
    Tests whether the monkey patching of ('numpy.random', 'random') works
    """
    # No blank lines here: the expected node below refers to line 3 of this program.
    test_code = cleandoc("""
        import numpy as np
        np.random.seed(42)
        test = np.random.random(100)
        assert len(test) == 100
        """)

    inspector_result = _pipeline_executor.singleton.run(python_code=test_code, track_code_references=True,
                                                        inspections=[RowLineage(2)])
    # Only the first node of the extracted DAG is checked
    extracted_node: DagNode = list(inspector_result.dag.nodes)[0]

    expected_node = DagNode(
        0, BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('numpy.random', 'random')),
        DagNodeDetails('random', ['array']),
        OptionalCodeInfo(CodeReference(3, 7, 3, 28), "np.random.random(100)"))
    compare(extracted_node, expected_node)

    # The values are random, so they are only compared with an absolute tolerance of 1 around 0.5
    inspection_results_data_source = inspector_result.dag_node_to_inspection_results[extracted_node]
    lineage_output = inspection_results_data_source[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [[0.5, {LineageId(0, 0)}],
         [0.5, {LineageId(0, 1)}]],
        columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(lineage_output.reset_index(drop=True),
                                      expected_lineage_df.reset_index(drop=True),
                                      atol=1)
def test_get_rdataset():
    """
    Tests whether the monkey patching of ('statsmodels.datasets', 'get_rdataset') works
    """
    # The blank second line matters: the expected node below refers to line 3 of this program.
    test_code = cleandoc("""
        import statsmodels.api as sm

        dat = sm.datasets.get_rdataset("Guerry", "HistData").data
        assert len(dat) == 86
        """)

    inspector_result = _pipeline_executor.singleton.run(python_code=test_code, track_code_references=True,
                                                        inspections=[RowLineage(2)])
    # Only the first node of the extracted DAG is checked
    extracted_node: DagNode = list(inspector_result.dag.nodes)[0]

    # Column names expected for the dataset; used for the DAG node and for the lineage frame
    guerry_columns = [
        'dept', 'Region', 'Department', 'Crime_pers', 'Crime_prop', 'Literacy', 'Donations', 'Infants',
        'Suicides', 'MainCity', 'Wealth', 'Commerce', 'Clergy', 'Crime_parents', 'Infanticide',
        'Donation_clergy', 'Lottery', 'Desertion', 'Instruction', 'Prostitutes', 'Distance', 'Area',
        'Pop1831'
    ]
    expected_node = DagNode(
        0, BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('statsmodels.datasets', 'get_rdataset')),
        DagNodeDetails('Data from A.-M. Guerry, "Essay on the Moral Statistics of France"',
                       list(guerry_columns)),
        OptionalCodeInfo(CodeReference(3, 6, 3, 52), """sm.datasets.get_rdataset("Guerry", "HistData")"""))
    compare(extracted_node, expected_node)

    # Row lineage materialised for the data source (first two rows of the dataset)
    inspection_results_data_source = inspector_result.dag_node_to_inspection_results[extracted_node]
    lineage_output = inspection_results_data_source[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [[1, 'E', 'Ain', 28870, 15890, 37, 5098, 33120, 35039, '2:Med', 73, 58, 11, 71, 60, 69, 41, 55, 46,
          13, 218.372, 5762, 346.03, {LineageId(0, 0)}],
         [2, 'N', 'Aisne', 26226, 5521, 51, 8901, 14572, 12831, '2:Med', 22, 10, 82, 4, 82, 36, 38, 82, 24,
          327, 65.945, 7369, 513.0, {LineageId(0, 1)}]],
        columns=guerry_columns + ['mlinspect_lineage'])
    pandas.testing.assert_frame_equal(lineage_output.reset_index(drop=True),
                                      expected_lineage_df.reset_index(drop=True))
def test_read_csv():
    """
    Tests whether the monkey patching of ('pandas.io.parsers', 'read_csv') works
    """
    # The blank fourth line matters: the expected node below refers to line 6 of this program.
    test_code = cleandoc("""
        import os
        import pandas as pd
        from mlinspect.utils import get_project_root

        train_file = os.path.join(str(get_project_root()), "example_pipelines", "adult_complex", "adult_train.csv")
        raw_data = pd.read_csv(train_file, na_values='?', index_col=0)
        assert len(raw_data) == 22792
        """)

    inspector_result = _pipeline_executor.singleton.run(python_code=test_code, track_code_references=True,
                                                        inspections=[RowLineage(2)])
    # Only the first node of the extracted DAG is checked
    extracted_node: DagNode = list(inspector_result.dag.nodes)[0]

    # Column names expected for the csv file; used for the DAG node and for the lineage frame
    adult_columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
        'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income-per-year'
    ]
    # The node description is matched by regular expression instead of a fixed string
    expected_node = DagNode(
        0, BasicCodeLocation("<string-source>", 6),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.io.parsers', 'read_csv')),
        DagNodeDetails(StringComparison(r".*\.csv"), list(adult_columns)),
        OptionalCodeInfo(CodeReference(6, 11, 6, 62),
                         "pd.read_csv(train_file, na_values='?', index_col=0)"))
    compare(extracted_node, expected_node)

    # Row lineage materialised for the data source (first two rows of the file)
    inspection_results_data_source = inspector_result.dag_node_to_inspection_results[extracted_node]
    lineage_output = inspection_results_data_source[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [[46, 'Private', 128645, 'Some-college', 10, 'Divorced', 'Prof-specialty', 'Not-in-family', 'White',
          'Female', 0, 0, 40, 'United-States', '<=50K', {LineageId(0, 0)}],
         [29, 'Local-gov', 115585, 'Some-college', 10, 'Never-married', 'Handlers-cleaners', 'Not-in-family',
          'White', 'Male', 0, 0, 50, 'United-States', '<=50K', {LineageId(0, 1)}]],
        columns=adult_columns + ['mlinspect_lineage'])
    pandas.testing.assert_frame_equal(lineage_output.reset_index(drop=True),
                                      expected_lineage_df.reset_index(drop=True))
def test_frame_dropna():
    """
    Tests whether the monkey patching of ('pandas.core.frame', 'dropna') works
    """
    # The blank second line matters: the expected nodes below refer to lines 3 and 5 of this program.
    test_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame([0, 2, 4, 5, None], columns=['A'])
        assert len(df) == 5
        df = df.dropna()
        assert len(df) == 4
        """)

    inspector_result = _pipeline_executor.singleton.run(python_code=test_code, track_code_references=True,
                                                        inspections=[RowLineage(2)])

    expected_dag = networkx.DiGraph()
    expected_data_source = DagNode(
        0, BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A']),
        OptionalCodeInfo(CodeReference(3, 5, 3, 52), "pd.DataFrame([0, 2, 4, 5, None], columns=['A'])"))
    expected_select = DagNode(
        1, BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', 'dropna')),
        DagNodeDetails('dropna', ['A']),
        OptionalCodeInfo(CodeReference(5, 5, 5, 16), 'df.dropna()'))
    expected_dag.add_edge(expected_data_source, expected_select)
    compare(networkx.to_dict_of_dicts(inspector_result.dag), networkx.to_dict_of_dicts(expected_dag))

    # Row lineage materialised for the dropna output (first two rows)
    inspection_results_select = inspector_result.dag_node_to_inspection_results[expected_select]
    lineage_output = inspection_results_select[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [[0., {LineageId(0, 0)}],
         [2., {LineageId(0, 1)}]],
        columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(lineage_output.reset_index(drop=True),
                                      expected_lineage_df.reset_index(drop=True))
def get_optional_code_info_or_none(optional_code_reference: CodeReference or None,
                                   optional_source_code: str or None) -> OptionalCodeInfo or None:
    """
    Wrap the code reference and source code into an OptionalCodeInfo when
    singleton.track_code_references is set; return None otherwise.

    Both arguments must be present when tracking is enabled and both must be None when it is
    disabled; either violation trips an assertion.
    """
    if not singleton.track_code_references:
        # Tracking is off: callers are expected to pass nothing
        assert optional_code_reference is None
        assert optional_source_code is None
        return None

    # Tracking is on: both pieces of information are required
    assert optional_code_reference is not None
    assert optional_source_code is not None
    return OptionalCodeInfo(optional_code_reference, optional_source_code)
def get_input_info(df_object, caller_filename, lineno, function_info, optional_code_reference, optional_source_code) \
        -> InputInfo:
    """
    Find the parent DAG node of ``df_object`` for the DAG node that is inserted in the next step.

    If the object carries the patched ``_mlinspect_annotation`` attribute, the parent node is looked
    up in singleton.op_id_to_dag_node via the object's ``_mlinspect_dag_node`` id. Otherwise the
    object is run through execute_inspection_visits_data_source, and a MISSING_OP node with a
    warning description is created, registered with add_dag_node and used as parent.

    Returns an InputInfo with the parent DAG node and the object paired with its annotations.
    Raises NotImplementedError if ``df_object`` is not a DataFrame, Series, csr_matrix or numpy.ndarray.
    """
    # pylint: disable=too-many-arguments, unused-argument, protected-access, unused-variable, too-many-locals
    # The column names are derived first, so unsupported types fail before anything else happens
    if isinstance(df_object, DataFrame):
        column_names = list(df_object.columns)  # TODO: Update this for numpy arrays etc. later
    elif isinstance(df_object, Series):
        column_names = [df_object.name]
    elif isinstance(df_object, (csr_matrix, numpy.ndarray)):
        column_names = ['array']
    else:
        raise NotImplementedError("TODO: Mlinspect info storage for type: '{}'".format(type(df_object)))

    # NOTE(review): the check is on _mlinspect_annotation while _mlinspect_dag_node is read
    # first; presumably both attributes are always set together - confirm.
    if hasattr(df_object, "_mlinspect_annotation"):
        parent_node = singleton.op_id_to_dag_node[df_object._mlinspect_dag_node]
        return InputInfo(parent_node, AnnotatedDfObject(df_object, df_object._mlinspect_annotation))

    # Unknown origin: treat the object like a data source for the inspection visits
    source_context = OperatorContext(OperatorType.DATA_SOURCE, function_info)
    visit_result = execute_inspection_visits_data_source(source_context, df_object)

    # The source snippet only shows up in the warning if a code reference is available
    source_snippet = "({})".format(optional_source_code) if optional_code_reference else ""
    warning_text = "Warning! Operator {}:{} {} encountered a DataFrame resulting from an operation " \
                   "without mlinspect support!".format(caller_filename, lineno, source_snippet)

    placeholder_op_id = singleton.get_next_missing_op_id()
    # NOTE(review): OptionalCodeInfo is built directly here rather than through
    # get_optional_code_info_or_none, so the node carries an OptionalCodeInfo even when the
    # reference and source code are None - confirm that this is intended.
    placeholder_node = DagNode(placeholder_op_id,
                               BasicCodeLocation(caller_filename, lineno),
                               OperatorContext(OperatorType.MISSING_OP, None),
                               DagNodeDetails(warning_text, column_names),
                               OptionalCodeInfo(optional_code_reference, optional_source_code))
    add_dag_node(placeholder_node, [], visit_result)

    fresh_annotations = visit_result.annotated_dfobject.result_annotation
    return InputInfo(placeholder_node, AnnotatedDfObject(df_object, fresh_annotations))
def get_expected_dag_adult_easy(caller_filename: str, line_offset: int = 0, with_code_references=True):
    """
    Build the DAG that is expected for the adult_easy pipeline.

    ``line_offset`` is added to every line number used in the code locations and code
    references. With ``with_code_references=False`` the ``optional_code_info`` of every node
    is set to None before the graph is returned.
    """
    # pylint: disable=too-many-locals
    # The line numbers differ slightly between the .py file and the .ipynb file

    def location(line):
        return BasicCodeLocation(caller_filename, line + line_offset)

    def code_info(start_line, start_col, end_line, end_col, source):
        reference = CodeReference(start_line + line_offset, start_col, end_line + line_offset, end_col)
        return OptionalCodeInfo(reference, source)

    def adult_columns():
        # A fresh list per call, so no two nodes share the same list object.
        return ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
                'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'income-per-year']

    def column_transformer_context(operator_type):
        return OperatorContext(operator_type,
                               FunctionInfo('sklearn.compose._column_transformer', 'ColumnTransformer'))

    def decision_tree_context(operator_type):
        return OperatorContext(operator_type, FunctionInfo('sklearn.tree._classes', 'DecisionTreeClassifier'))

    def decision_tree_code_info():
        return code_info(26, 19, 26, 48, 'tree.DecisionTreeClassifier()')

    pipeline_str = "compose.ColumnTransformer(transformers=[\n" \
                   " ('categorical', preprocessing.OneHotEncoder(handle_unknown='ignore'), " \
                   "['education', 'workclass']),\n" \
                   " ('numeric', preprocessing.StandardScaler(), ['age', 'hours-per-week'])\n" \
                   "])"

    def pipeline_code_info():
        return code_info(18, 25, 21, 2, pipeline_str)

    # --- nodes -------------------------------------------------------------------------------
    data_source = DagNode(0,
                          location(12),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.io.parsers', 'read_csv')),
                          DagNodeDetails('adult_train.csv', adult_columns()),
                          code_info(12, 11, 12, 62, "pd.read_csv(train_file, na_values='?', index_col=0)"))
    select = DagNode(1,
                     location(14),
                     OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', 'dropna')),
                     DagNodeDetails('dropna', adult_columns()),
                     code_info(14, 7, 14, 24, 'raw_data.dropna()'))
    pipeline_project_one = DagNode(4,
                                   location(18),
                                   column_transformer_context(OperatorType.PROJECTION),
                                   DagNodeDetails("to ['education', 'workclass']", ['education', 'workclass']),
                                   pipeline_code_info())
    pipeline_project_two = DagNode(6,
                                   location(18),
                                   column_transformer_context(OperatorType.PROJECTION),
                                   DagNodeDetails("to ['age', 'hours-per-week']", ['age', 'hours-per-week']),
                                   pipeline_code_info())
    pipeline_transformer_one = DagNode(5,
                                       location(19),
                                       OperatorContext(OperatorType.TRANSFORMER,
                                                       FunctionInfo('sklearn.preprocessing._encoders',
                                                                    'OneHotEncoder')),
                                       DagNodeDetails('One-Hot Encoder: fit_transform', ['array']),
                                       code_info(19, 20, 19, 72,
                                                 "preprocessing.OneHotEncoder(handle_unknown='ignore')"))
    pipeline_transformer_two = DagNode(7,
                                       location(20),
                                       OperatorContext(OperatorType.TRANSFORMER,
                                                       FunctionInfo('sklearn.preprocessing._data',
                                                                    'StandardScaler')),
                                       DagNodeDetails('Standard Scaler: fit_transform', ['array']),
                                       code_info(20, 16, 20, 46, 'preprocessing.StandardScaler()'))
    pipeline_concatenation = DagNode(8,
                                     location(18),
                                     column_transformer_context(OperatorType.CONCATENATION),
                                     DagNodeDetails(None, ['array']),
                                     pipeline_code_info())
    train_data = DagNode(9,
                         location(26),
                         decision_tree_context(OperatorType.TRAIN_DATA),
                         DagNodeDetails(None, ['array']),
                         decision_tree_code_info())
    project = DagNode(2,
                      location(16),
                      OperatorContext(OperatorType.PROJECTION, FunctionInfo('pandas.core.frame', '__getitem__')),
                      DagNodeDetails("to ['income-per-year']", ['income-per-year']),
                      code_info(16, 38, 16, 61, "data['income-per-year']"))
    project_modify = DagNode(3,
                             location(16),
                             OperatorContext(OperatorType.PROJECTION_MODIFY,
                                             FunctionInfo('sklearn.preprocessing._label', 'label_binarize')),
                             DagNodeDetails("label_binarize, classes: ['>50K', '<=50K']", ['array']),
                             code_info(16, 9, 16, 89,
                                       "preprocessing.label_binarize(data['income-per-year'], "
                                       "classes=['>50K', '<=50K'])"))
    train_labels = DagNode(10,
                           location(26),
                           decision_tree_context(OperatorType.TRAIN_LABELS),
                           DagNodeDetails(None, ['array']),
                           decision_tree_code_info())
    estimator = DagNode(11,
                        location(26),
                        decision_tree_context(OperatorType.ESTIMATOR),
                        DagNodeDetails('Decision Tree', []),
                        decision_tree_code_info())

    # --- wiring ------------------------------------------------------------------------------
    # The edge order below determines the node insertion order of the graph, so keep it stable.
    expected_graph = networkx.DiGraph()
    expected_graph.add_node(data_source)
    expected_graph.add_edges_from([
        (data_source, select),
        (select, pipeline_project_one),
        (select, pipeline_project_two),
        (pipeline_project_one, pipeline_transformer_one),
        (pipeline_project_two, pipeline_transformer_two),
        (pipeline_transformer_one, pipeline_concatenation),
        (pipeline_transformer_two, pipeline_concatenation),
        (pipeline_concatenation, train_data),
        (select, project),
        (project, project_modify),
        (project_modify, train_labels),
        (train_data, estimator),
        (train_labels, estimator),
    ])

    if not with_code_references:
        for node in expected_graph.nodes:
            node.optional_code_info = None

    return expected_graph
def test_ols_fit():
    """
    Checks the DAG and the row lineage produced when the patched
    ('statsmodels.regression.linear_model.OLS', 'fit') is called inside a pipeline.
    """
    test_code = cleandoc("""
        import numpy as np
        import statsmodels.api as sm
        np.random.seed(42)
        nobs = 100
        X = np.random.random((nobs, 2))
        X = sm.add_constant(X)
        beta = [1, .1, .5]
        e = np.random.random(nobs)
        y = np.dot(X, beta) + e
        results = sm.OLS(y, X).fit()
        assert results.summary() is not None
        """)

    def ols_node(node_id, operator_type, description, columns):
        # All expected nodes stem from the `sm.OLS(y, X)` call on line 10 of the snippet.
        return DagNode(node_id,
                       BasicCodeLocation("<string-source>", 10),
                       OperatorContext(operator_type, FunctionInfo('statsmodel.api.OLS', 'fit')),
                       DagNodeDetails(description, columns),
                       OptionalCodeInfo(CodeReference(10, 10, 10, 22), 'sm.OLS(y, X)'))

    def lineage_of(dag_node):
        return inspector_result.dag_node_to_inspection_results[dag_node][RowLineage(3)]

    def assert_same_frame(actual_df, expected_df, **compare_options):
        pandas.testing.assert_frame_equal(actual_df.reset_index(drop=True),
                                          expected_df.reset_index(drop=True),
                                          **compare_options)

    inspector_result = _pipeline_executor.singleton.run(python_code=test_code, track_code_references=True,
                                                        inspections=[RowLineage(3)])
    # Drop the nodes that are not compared here: first the leading four, then the one at index 1.
    extracted_dag = inspector_result.dag
    extracted_dag.remove_nodes_from(list(extracted_dag.nodes)[0:4])
    extracted_dag.remove_node(list(extracted_dag.nodes)[1])

    expected_train_data = ols_node(3, OperatorType.TRAIN_DATA, None, ['array'])
    expected_train_labels = ols_node(4, OperatorType.TRAIN_LABELS, None, ['array'])
    expected_ols = ols_node(5, OperatorType.ESTIMATOR, 'Decision Tree', [])

    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(expected_train_data, expected_ols)
    expected_dag.add_edge(expected_train_labels, expected_ols)
    compare(networkx.to_dict_of_dicts(inspector_result.dag), networkx.to_dict_of_dicts(expected_dag))

    # Lineage of the training data
    train_data_lineage = lineage_of(expected_train_data)
    expected_train_data_lineage = DataFrame(
        [[numpy.array([1.0, 0.3745401188473625, 0.9507143064099162]), {LineageId(3, 0)}],
         [numpy.array([1.0, 0.7319939418114051, 0.5986584841970366]), {LineageId(3, 1)}],
         [numpy.array([1.0, 0.15601864044243652, 0.15599452033620265]), {LineageId(3, 2)}]],
        columns=['array', 'mlinspect_lineage'])
    assert_same_frame(train_data_lineage, expected_train_data_lineage, atol=0.1)

    # Lineage of the training labels
    train_labels_lineage = lineage_of(expected_train_labels)
    expected_train_labels_lineage = DataFrame(
        [[2.154842811243982, {LineageId(5, 0)}],
         [1.4566686012747074, {LineageId(5, 1)}],
         [1.2552278383069588, {LineageId(5, 2)}]],
        columns=['array', 'mlinspect_lineage'])
    assert_same_frame(train_labels_lineage, expected_train_labels_lineage, atol=0.1)

    # Lineage of the fitted estimator
    estimator_lineage = lineage_of(expected_ols)
    expected_estimator_lineage = DataFrame(
        [[{LineageId(5, 0), LineageId(3, 0)}],
         [{LineageId(5, 1), LineageId(3, 1)}],
         [{LineageId(5, 2), LineageId(3, 2)}]],
        columns=['mlinspect_lineage'])
    assert_same_frame(estimator_lineage, expected_estimator_lineage, check_column_type=False)