def test_save_fig_to_path():
    """
    Tests whether save_fig_to_path creates a file at the requested path
    """
    extracted_dag = get_expected_dag_adult_easy_py()

    filename = os.path.join(str(get_project_root()), "test", "pipelines", "adult_easy.png")
    # A file left over from an earlier run would let the final assertion pass
    # even if nothing gets written, so remove it before saving.
    if os.path.isfile(filename):
        os.remove(filename)

    save_fig_to_path(extracted_dag, filename)

    assert os.path.isfile(filename)
def test_save_fig_to_path():
    """
    Tests whether save_fig_to_path creates a file at the requested path
    """
    extracted_dag = get_expected_dag_adult_easy("<string-source>")

    filename = os.path.join(str(get_project_root()), "example_pipelines", "adult_simple", "adult_simple.png")
    # A file left over from an earlier run would let the final assertion pass
    # even if nothing gets written, so remove it before saving.
    if os.path.isfile(filename):
        os.remove(filename)

    save_fig_to_path(extracted_dag, filename)

    assert os.path.isfile(filename)
"""
COMPAS pipeline

This part of the script loads the COMPAS train and test CSV files and
restricts both frames to the same fixed list of columns.
"""
import os

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, label_binarize

from mlinspect.utils import get_project_root

# The training data is read from the "experiments/user_interviews" folder,
# the test data from the "example_pipelines/compas" folder.
train_file = os.path.join(str(get_project_root()), "experiments", "user_interviews", "compas_train_modified.csv")
# '?' entries are parsed as missing values; the first CSV column becomes the index
train = pd.read_csv(train_file, na_values='?', index_col=0)
test_file = os.path.join(str(get_project_root()), "example_pipelines", "compas", "compas_test.csv")
test = pd.read_csv(test_file, na_values='?', index_col=0)

# Keep only the listed columns of the training data
train = train[[
    'sex', 'dob', 'age', 'c_charge_degree', 'race', 'score_text', 'priors_count', 'days_b_screening_arrest',
    'decile_score', 'is_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out'
]]
# Apply the identical column selection to the test data
test = test[[
    'sex', 'dob', 'age', 'c_charge_degree', 'race', 'score_text', 'priors_count', 'days_b_screening_arrest',
    'decile_score', 'is_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out'
]]
"""
Tests whether the MaterializeFirstRowsInspection works
"""
import os

from testfixtures import compare, RangeComparison
from numpy.ma import array

from mlinspect.inspections.inspection_input import InspectionInputRow
from mlinspect.inspections.materialize_first_rows_inspection import MaterializeFirstRowsInspection
from mlinspect.instrumentation.dag_node import DagNode, OperatorType, CodeReference
from mlinspect.pipeline_inspector import PipelineInspector
from mlinspect.utils import get_project_root

FILE_PY = os.path.join(
    str(get_project_root()),
    "test", "pipelines", "adult_easy.py",
)


def test_materialize_first_rows_inspection():
    """
    Runs the pipeline in FILE_PY with a MaterializeFirstRowsInspection(2) attached
    and compares the annotations stored for that inspection with the expected result
    """
    # Build the inspector step by step instead of one chained expression
    inspector_builder = PipelineInspector.on_pipeline_from_py_file(FILE_PY)
    inspector_builder = inspector_builder.add_inspection(MaterializeFirstRowsInspection(2))
    inspector_result = inspector_builder.execute()

    annotations_by_inspection = inspector_result.inspection_to_annotations

    # Lookups use freshly constructed inspections, so they only succeed if an
    # equal inspection was registered as key
    assert MaterializeFirstRowsInspection(2) in annotations_by_inspection
    actual = annotations_by_inspection[MaterializeFirstRowsInspection(2)]
    expected = get_expected_result()
    compare(actual, expected)
"""
An example pipeline

This part of the script loads the COMPAS train and test CSV files, restricts
both frames to a fixed list of columns and filters the training rows.
"""
import os

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, label_binarize

from mlinspect.utils import get_project_root

# Both input files live in the "example_pipelines/compas" folder of the project
train_file = os.path.join(str(get_project_root()), "example_pipelines", "compas", "compas_train.csv")
# '?' entries are parsed as missing values; the first CSV column becomes the index
train_data = pd.read_csv(train_file, na_values='?', index_col=0)
test_file = os.path.join(str(get_project_root()), "example_pipelines", "compas", "compas_test.csv")
test_data = pd.read_csv(test_file, na_values='?', index_col=0)

# Keep only the listed columns of the training data
train_data = train_data[
    ['sex', 'dob', 'age', 'c_charge_degree', 'race', 'score_text', 'priors_count', 'days_b_screening_arrest',
     'decile_score', 'is_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out']]
# Apply the identical column selection to the test data
test_data = test_data[
    ['sex', 'dob', 'age', 'c_charge_degree', 'race', 'score_text', 'priors_count', 'days_b_screening_arrest',
     'decile_score', 'is_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out']]

# Row filters below are applied to the training data only.
# Keep rows whose days_b_screening_arrest lies in the closed range [-30, 30]
train_data = train_data[(train_data['days_b_screening_arrest'] <= 30) & (train_data['days_b_screening_arrest'] >= -30)]
# Drop rows where is_recid equals -1
train_data = train_data[train_data['is_recid'] != -1]
# Drop rows whose charge degree is "O"
train_data = train_data[train_data['c_charge_degree'] != "O"]
# Drop rows whose score_text is the literal string 'N/A'
train_data = train_data[train_data['score_text'] != 'N/A']
"""
Tests whether the adult_easy test pipeline works
"""
import ast
import os
import nbformat
from nbconvert import PythonExporter
from mlinspect.utils import get_project_root

# Script and notebook versions of the same test pipeline
FILE_PY = os.path.join(str(get_project_root()), "test", "pipelines", "adult_easy.py")
FILE_NB = os.path.join(str(get_project_root()), "test", "pipelines", "adult_easy.ipynb")


def test_py_pipeline_runs():
    """
    Tests whether the .py version of the pipeline works

    The source in FILE_PY is read, parsed into an AST, compiled and executed;
    the test contains no assertions, so it fails only if one of these steps raises.
    """
    with open(FILE_PY) as file:
        text = file.read()
        parsed_ast = ast.parse(text)
        # The pipeline is compiled from the parsed AST and then executed
        exec(compile(parsed_ast, filename="<ast>", mode="exec"))


def test_nb_pipeline_runs():
    """
    Tests whether the .ipynb version of the pipeline works
    """
    with open(FILE_NB) as file:
        # NO_CONVERT is passed as the notebook version argument of nbformat.reads
        notebook = nbformat.reads(file.read(), nbformat.NO_CONVERT)
"""
Some useful utils for the project
"""
import os

from mlinspect.utils import get_project_root


def _example_pipeline_path(*parts):
    """
    Builds the path of a file below the "example_pipelines" directory of the project
    """
    return os.path.join(str(get_project_root()), "example_pipelines", *parts)


# adult_simple: script, notebook and rendered DAG image
ADULT_SIMPLE_PY = _example_pipeline_path("adult_simple", "adult_simple.py")
ADULT_SIMPLE_IPYNB = _example_pipeline_path("adult_simple", "adult_simple.ipynb")
ADULT_SIMPLE_PNG = _example_pipeline_path("adult_simple", "adult_simple.png")

# adult_complex
ADULT_COMPLEX_PY = _example_pipeline_path("adult_complex", "adult_complex.py")
ADULT_COMPLEX_PNG = _example_pipeline_path("adult_complex", "adult_complex.png")

# compas
COMPAS_PY = _example_pipeline_path("compas", "compas.py")
COMPAS_PNG = _example_pipeline_path("compas", "compas.png")

# healthcare
HEALTHCARE_PY = _example_pipeline_path("healthcare", "healthcare.py")
HEALTHCARE_PNG = _example_pipeline_path("healthcare", "healthcare.png")
""" Adult income pipeline """ import os import pandas as pd from sklearn import compose, preprocessing, tree, pipeline from mlinspect.utils import get_project_root print('pipeline start') train_file_a = os.path.join(str(get_project_root()), "experiments", "user_interviews", "adult_simple_train_a.csv") raw_data_a = pd.read_csv(train_file_a, na_values='?', index_col=0) train_file_b = os.path.join(str(get_project_root()), "experiments", "user_interviews", "adult_simple_train_b.csv") raw_data_b = pd.read_csv(train_file_b, na_values='?', index_col=0) merged_raw_data = raw_data_a.merge(raw_data_b, on="id") data = merged_raw_data.dropna() labels = preprocessing.label_binarize(data['income-per-year'], classes=['>50K', '<=50K']) column_transformer = compose.ColumnTransformer( transformers=[('categorical', preprocessing.OneHotEncoder(handle_unknown='ignore'), ['education', 'workclass']), ('numeric', preprocessing.StandardScaler(),
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from example_pipelines.healthcare.healthcare_utils import MyW2VTransformer, create_model
from mlinspect.utils import get_project_root

# FutureWarning: Given feature/column names or counts do not match the ones for the data given during fit
# NOTE: this call silences all warnings for the rest of the script, not only the FutureWarning above
warnings.filterwarnings('ignore')

COUNTIES_OF_INTEREST = ['county2', 'county3']

# load input data sources (data generated with https://www.mockaroo.com as a single file and then split into two)
# '?' entries in either file are parsed as missing values
patients = pd.read_csv(os.path.join(str(get_project_root()), "example_pipelines", "healthcare",
                                    "healthcare_patients.csv"), na_values='?')
histories = pd.read_csv(os.path.join(str(get_project_root()), "example_pipelines", "healthcare",
                                     "healthcare_histories.csv"), na_values='?')

# combine input data into a single table (joined on the shared 'ssn' column)
data = patients.merge(histories, on=['ssn'])

# compute mean complications per age group, append as column
# (this statement only builds the per-group aggregate named 'mean_complications')
complications = data.groupby('age_group').agg(
    mean_complications=('complications', 'mean'))
"""
Tests whether the feature overview demo notebook works
"""
import os

from importnb import Notebook
import matplotlib

from mlinspect.utils import get_project_root

DEMO_NB_FILE = os.path.join(
    str(get_project_root()),
    "demo", "feature_overview", "feature_overview.ipynb",
)


def test_demo_nb():
    """
    Loads the demo notebook through importnb; the test fails if loading raises
    """
    # Switch to the "template" backend first so that plt.show is disabled
    # while the notebook is executed as part of this test
    matplotlib.use("template")

    Notebook.load(DEMO_NB_FILE)
""" Tests whether the healthcare demo works """ import os from importnb import Notebook import matplotlib from mlinspect.utils import get_project_root ADULT_SIMPLE_TASK_NB = os.path.join(str(get_project_root()), "experiments", "user_interviews", "example-task-with-solution.ipynb") COMPAS_TASK_NB = os.path.join(str(get_project_root()), "experiments", "user_interviews", "task-1-solution.ipynb") HEALTHCARE_TASK_NB = os.path.join(str(get_project_root()), "experiments", "user_interviews", "task-2-solution.ipynb") def test_adult_simple_task_nb(): """ Tests whether this task notebook works """ matplotlib.use( "template") # Disable plt.show when executing nb as part of this test Notebook.load(ADULT_SIMPLE_TASK_NB) def test_compas_task_nb(): """ Tests whether this task notebook works
"""
Tests whether the performance benchmark notebook works
"""
import os

import matplotlib
from importnb import Notebook

from mlinspect.utils import get_project_root

EXPERIMENT_NB_FILE = os.path.join(
    str(get_project_root()),
    "experiments", "performance", "performance_benchmarks.ipynb",
)


def test_experiment_nb():
    """
    Loads the experiment notebook through importnb; the test fails if loading raises
    """
    # Switch to the "template" backend first so that plt.show is disabled
    # while the notebook is executed as part of this test
    matplotlib.use("template")

    Notebook.load(EXPERIMENT_NB_FILE)
def test_get_project_root():
    """
    Checks that get_project_root returns the parent of the directory containing this file
    """
    this_file = Path(__file__)
    expected_root = this_file.parent.parent

    assert get_project_root() == expected_root
"""
Tests whether the healthcare demo works
"""
import ast
import os

from importnb import Notebook

from mlinspect.utils import get_project_root

# Script and notebook versions of the adult_easy test pipeline
ADULT_EASY_FILE_PY = os.path.join(str(get_project_root()), "test", "pipelines", "adult_easy.py")
FILE_NB = os.path.join(str(get_project_root()), "test", "pipelines", "adult_easy.ipynb")
# Healthcare demo: pipeline script and demo notebook
PIPELINE_FILE_PY = os.path.join(str(get_project_root()), "demo", "healthcare", "healthcare.py")
DEMO_NB_FILE = os.path.join(str(get_project_root()), "demo", "healthcare", "healthcare_demo.ipynb")


def test_py_pipeline_runs():
    """
    Tests whether the .py version of the pipeline works

    The source in PIPELINE_FILE_PY is read, parsed into an AST, compiled and
    executed; the test contains no assertions, so it fails only if one of
    these steps raises.
    """
    # Python source is UTF-8 by default, so decode it explicitly instead of
    # relying on the platform's locale encoding
    with open(PIPELINE_FILE_PY, encoding="utf-8") as file:
        healthcare_code = file.read()

    # The file handle is no longer needed once the source text is in memory
    parsed_ast = ast.parse(healthcare_code)
    exec(compile(parsed_ast, filename="<ast>", mode="exec"))
"""
An example pipeline

Loads the adult training data, drops incomplete rows, binarizes the income
label and fits a decision tree on one-hot encoded and standardized features.
"""
import os

import pandas as pd
from sklearn import compose, preprocessing, tree, pipeline

from mlinspect.utils import get_project_root

print('pipeline start')
train_file = os.path.join(str(get_project_root()), "test", "data", "adult_train.csv")
# '?' entries are parsed as missing values; the first CSV column becomes the index
raw_data = pd.read_csv(train_file, na_values='?', index_col=0)

# Drop every row that contains a missing value
data = raw_data.dropna()

# Encode the income column as a binary target, using the given class order
labels = preprocessing.label_binarize(data['income-per-year'], classes=['>50K', '<=50K'])

# One-hot encode the two categorical columns (categories unseen during fit are
# ignored at transform time) and standardize the two numeric columns
feature_transformation = compose.ColumnTransformer(
    transformers=[('categorical', preprocessing.OneHotEncoder(handle_unknown='ignore'), ['education', 'workclass']),
                  ('numeric', preprocessing.StandardScaler(), ['age', 'hours-per-week'])])

# Chain the feature transformation with a decision tree classifier
income_pipeline = pipeline.Pipeline([('features', feature_transformation),
                                     ('classifier', tree.DecisionTreeClassifier())])

# Fit on the cleaned frame; the transformer picks the columns named above
income_pipeline.fit(data, labels)
"""
An example pipeline

Loads the adult training data, drops incomplete rows, binarizes the income
label and fits a decision tree on one-hot encoded and standardized features.
"""
import os

import pandas as pd
from sklearn import compose, preprocessing, tree, pipeline

from mlinspect.utils import get_project_root

print('pipeline start')
train_file = os.path.join(str(get_project_root()), "example_pipelines", "adult_complex", "adult_train.csv")
# '?' entries are parsed as missing values; the first CSV column becomes the index
raw_data = pd.read_csv(train_file, na_values='?', index_col=0)

# Drop every row that contains a missing value
data = raw_data.dropna()

# Encode the income column as a binary target, using the given class order
labels = preprocessing.label_binarize(data['income-per-year'], classes=['>50K', '<=50K'])

# One-hot encode the two categorical columns (categories unseen during fit are
# ignored at transform time) and standardize the two numeric columns
feature_transformation = compose.ColumnTransformer(transformers=[
    ('categorical', preprocessing.OneHotEncoder(handle_unknown='ignore'), ['education', 'workclass']),
    ('numeric', preprocessing.StandardScaler(), ['age', 'hours-per-week'])
])

# Chain the feature transformation with a decision tree classifier
income_pipeline = pipeline.Pipeline([
    ('features', feature_transformation),
    ('classifier', tree.DecisionTreeClassifier())])

# Fit on the cleaned frame; the transformer picks the columns named above
income_pipeline.fit(data, labels)
print('pipeline finished')