from azureml.core.run import Run
from azureml.interpret import ExplanationClient


def upload_interpretability_aml(global_explanation, comment, y_test):
    # Get the explanation client from the current run context
    # (works for both remote and local submitted runs).
    run = Run.get_context()
    client = ExplanationClient.from_run(run)
    client.upload_model_explanation(global_explanation,
                                    comment=comment,
                                    true_ys=y_test)
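# Minimal usage sketch (assumed, not part of the original snippet): it presumes
# a fitted interpret-community explainer named `explainer` and hold-out data
# `X_test`/`y_test`; those names are illustrative only.
global_explanation = explainer.explain_global(X_test)
upload_interpretability_aml(global_explanation,
                            comment='Global Explanation: All Features',
                            y_test=y_test)

# The uploaded explanation can later be pulled back from run history:
client = ExplanationClient.from_run(Run.get_context())
downloaded = client.download_model_explanation()
print(downloaded.get_feature_importance_dict())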
fig.show()
# End - Story No. 3018 modified Mukesh Dutta 9/3/2021

# Story No. 3404

# COMMAND ----------

# DBTITLE 1,Model explanation
import time

import pandas as pd
from azureml.interpret import ExplanationClient
from azureml.interpret.common.exceptions import ExplanationNotFoundException

# Get model explanation data
if new_training == "True":
    # Give the AutoML explanation run time to finish before downloading.
    time.sleep(1200)
    try:
        explanation_client = ExplanationClient.from_run(best_run)
        if explanation_client is not None:  # TS: added if condition - 2021-05-17 12:57pm
            client = explanation_client

            engineered_explanations = client.download_model_explanation(raw=False)
            # print(engineered_explanations.get_feature_importance_dict())
            print("You can visualize the engineered explanations under the "
                  "'Explanations (preview)' tab in the AutoML run at:\n"
                  + best_run.get_portal_url())
            feature_imp_dict_eng = pd.DataFrame(
                engineered_explanations.get_feature_importance_dict().items())
            feature_imp_dict_eng.columns = ["Feature", "Importance"]

            raw_explanations = client.download_model_explanation(raw=True)
            # print(raw_explanations.get_feature_importance_dict())
            print("You can visualize the raw explanations under the "
                  "'Explanations (preview)' tab in the AutoML run at:\n"
                  + best_run.get_portal_url())
    except ExplanationNotFoundException as ex:
        # No explanation has been uploaded for this run yet.
        print(ex)
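# For context, a minimal sketch of how `best_run` might have been produced
# (assumed, not part of the original notebook; `ws` is a Workspace and
# `automl_config` an AutoMLConfig defined elsewhere):
from azureml.core.experiment import Experiment

experiment = Experiment(ws, 'automl-model-explanation')
remote_run = experiment.submit(automl_config, show_output=False)
remote_run.wait_for_completion()
best_run, fitted_model = remote_run.get_output()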
from azureml.core.run import Run
from azureml.interpret import ExplanationClient
from interpret.ext.blackbox import TabularExplainer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler


def model_train(df):
    run = Run.get_context()

    df.drop("Sno", axis=1, inplace=True)

    y_raw = df['Risk']
    X_raw = df.drop('Risk', axis=1)

    categorical_features = X_raw.select_dtypes(include=['object']).columns
    numeric_features = X_raw.select_dtypes(include=['int64', 'float']).columns

    categorical_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
               ('onehotencoder', OneHotEncoder(categories='auto', sparse=False))])
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

    feature_engineering_pipeline = ColumnTransformer(transformers=[
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features)
    ], remainder="drop")

    # Encode labels
    le = LabelEncoder()
    encoded_y = le.fit_transform(y_raw)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_raw, encoded_y,
                                                        test_size=0.20,
                                                        stratify=encoded_y,
                                                        random_state=42)

    # Create sklearn pipeline
    lr_clf = Pipeline(steps=[('preprocessor', feature_engineering_pipeline),
                             ('classifier', LogisticRegression(solver="lbfgs"))])

    # Train the model
    lr_clf.fit(X_train, y_train)

    # Capture metrics
    train_acc = lr_clf.score(X_train, y_train)
    test_acc = lr_clf.score(X_test, y_test)
    print("Training accuracy: %.3f" % train_acc)
    print("Testing accuracy: %.3f" % test_acc)

    # Log to Azure ML
    run.log('Train accuracy', train_acc)
    run.log('Test accuracy', test_acc)

    # Explain the model: wrap only the classifier and hand the preprocessing
    # pipeline to the explainer via `transformations`.
    explainer = TabularExplainer(lr_clf.steps[-1][1],
                                 initialization_examples=X_train,
                                 features=X_raw.columns,
                                 classes=["Good", "Bad"],
                                 transformations=feature_engineering_pipeline)

    # Explain overall model predictions (global explanation)
    global_explanation = explainer.explain_global(X_test)

    # Sorted SHAP values
    print('ranked global importance values: {}'.format(
        global_explanation.get_ranked_global_values()))
    # Corresponding feature names
    print('ranked global importance names: {}'.format(
        global_explanation.get_ranked_global_names()))
    # Feature ranks (based on original order of features)
    print('global importance rank: {}'.format(
        global_explanation.global_importance_rank))

    client = ExplanationClient.from_run(run)
    client.upload_model_explanation(global_explanation,
                                    comment='Global Explanation: All Features')

    return lr_clf
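# Minimal driver sketch (assumed): the Sno/Risk columns suggest the German
# Credit sample dataset; the file name below is illustrative.
import pandas as pd

df = pd.read_csv('german_credit_data.csv')
model = model_train(df)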
import os

import cuml
import joblib
from azureml.core.run import Run
from azureml.interpret import ExplanationClient
from cuml.benchmark.datagen import load_higgs
from cuml.model_selection import train_test_split

OUTPUT_DIR = './outputs/'
os.makedirs(OUTPUT_DIR, exist_ok=True)

X, y = load_higgs()
N_ROWS = 1000000

run = Run.get_context()
client = ExplanationClient.from_run(run)
run.log('N_ROWS', N_ROWS)

X_train, X_test, y_train, y_test = train_test_split(X[:N_ROWS], y[:N_ROWS],
                                                    random_state=1)

# Write X_test out as a pickle file for later visualization.
# joblib.dump takes a file name directly, so no open() call is needed.
x_test_pkl = 'x_test.pkl'
joblib.dump(value=X_test, filename=os.path.join(OUTPUT_DIR, x_test_pkl))
run.upload_file('x_test_higgs.pkl', os.path.join(OUTPUT_DIR, x_test_pkl))

gamma = 0.001
C = 100.

# Use the SVC algorithm to create a model
reg = cuml.svm.SVC(C=C, gamma=gamma, probability=True)
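# A plausible continuation sketch (assumed; the original snippet is truncated
# here): fit the classifier and log a simple hold-out accuracy to the run.
reg.fit(X_train, y_train)
preds = reg.predict(X_test)
accuracy = float((preds == y_test).mean())
run.log('accuracy', accuracy)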
import argparse
import json
import os

import joblib
from azureml.core import Dataset
from azureml.core.run import Run
from azureml.interpret import ExplanationClient
from interpret.ext.blackbox import TabularExplainer

# Helper functions; in MLOpsPython-style repos these live in sibling modules
# (the module names below are assumptions).
from train import get_model_metrics, prepare_data, split_data, train_model
from train_utils import register_dataset


def main():
    print("Running train_aml.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="diabetes_model.pkl",
    )
    parser.add_argument("--step_output",
                        type=str,
                        help="output for passing data to next step")
    parser.add_argument("--dataset_version",
                        type=str,
                        help="dataset version")
    parser.add_argument("--data_file_path",
                        type=str,
                        help="data file path; if specified, a new version of "
                             "the dataset will be registered")
    parser.add_argument(
        "--caller_run_id",
        type=str,
        help="caller run id, for example ADF pipeline run id")
    parser.add_argument("--dataset_name",
                        type=str,
                        help="Dataset name. Dataset must be passed by name "
                             "to always get the desired dataset version "
                             "rather than the one used while the pipeline "
                             "was created")
    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [dataset_version]: %s" % args.dataset_version)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [caller_run_id]: %s" % args.caller_run_id)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    model_name = args.model_name
    step_output_path = args.step_output
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()

    print("Getting training parameters")

    # Load the training parameters from the parameters file
    with open("parameters.json") as f:
        pars = json.load(f)
    try:
        train_args = pars["training"]
    except KeyError:
        print("Could not load training values from file")
        train_args = {}

    # Log the training parameters
    print(f"Parameters: {train_args}")
    for (k, v) in train_args.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Get the dataset
    if dataset_name:
        if data_file_path == 'none':
            dataset = Dataset.get_by_name(run.experiment.workspace,
                                          dataset_name,
                                          dataset_version)
        else:
            dataset = register_dataset(run.experiment.workspace,
                                       dataset_name,
                                       os.environ.get("DATASTORE_NAME"),
                                       data_file_path)
    else:
        e = "No dataset provided"
        print(e)
        raise Exception(e)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset
    run.parent.tag("dataset_id", value=dataset.id)

    # Split the data into test/train
    df0 = dataset.to_pandas_dataframe()
    df = prepare_data(df0)
    data = split_data(df)

    # Train the model
    model = train_model(data, train_args)

    # Explain the model and upload the explanation to the run
    explainer = TabularExplainer(model,
                                 data["train"]["X"],
                                 features=df0.drop(['car name', 'mpg'],
                                                   axis=1).columns)
    global_explanation = explainer.explain_global(data["test"]["X"])
    client = ExplanationClient.from_run(run)
    client.upload_model_explanation(global_explanation,
                                    comment='MPG Prediction Explanation')

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(model, data)
    for (k, v) in metrics.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Pass model file to next step
    os.makedirs(step_output_path, exist_ok=True)
    model_output_path = os.path.join(step_output_path, model_name)
    joblib.dump(value=model, filename=model_output_path)

    # Also upload model file to run outputs for history
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)
    joblib.dump(value=model, filename=output_path)

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
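# The script above expects a parameters.json next to it with a "training"
# section; a minimal sketch (the key names inside "training" depend on what
# train_model consumes and are illustrative):
#
# {
#     "training": {
#         "alpha": 0.5
#     }
# }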
import os

import joblib
from azureml.core import Dataset
from azureml.interpret import ExplanationClient
from interpret.ext.blackbox import MimicExplainer
from interpret.ext.glassbox import LGBMExplainableModel
from sklearn.model_selection import ShuffleSplit
from xgboost import XGBClassifier

# `run`, `args`, and the helper functions (split_x_y, score_log_*,
# log_classification_charts, save_*_explanations,
# make_classification_predictions) are defined elsewhere in this script.


def main():
    # Connect to your AMLS Workspace and retrieve your data
    ws = run.experiment.workspace
    training_dataset_name = args.train_dataset_name
    train_dataset = Dataset.get_by_name(ws, training_dataset_name,
                                        version='latest')
    val_dataset_name = args.val_dataset_name
    val_dataset = Dataset.get_by_name(ws, val_dataset_name, version='latest')
    print('Datasets Retrieved')

    # Transform your data to Pandas
    trainTab = train_dataset
    trainDF = trainTab.to_pandas_dataframe()
    valTab = val_dataset
    valDF = valTab.to_pandas_dataframe()
    print('Datasets Converted to Pandas')

    # Split out X and Y variables for both training and validation data
    X, Y = split_x_y(trainDF, args.target_column_name)
    val_X, val_Y = split_x_y(valDF, args.target_column_name)
    print("Data Ready for Scoring")

    # Set your model and hyperparameters
    hyperparameters = dict(eta=args.eta,
                           learning_rate=args.learning_rate,
                           scale_pos_weight=args.scale_pos_weight,
                           booster=args.booster,
                           min_child_weight=args.min_child_weight,
                           max_depth=args.max_depth,
                           gamma=args.gamma,
                           subsample=args.subsample,
                           colsample_bytree=args.colsample_bytree,
                           reg_lambda=args.reg_lambda,
                           alpha=args.alpha,
                           objective=args.objective)
    model = XGBClassifier(**hyperparameters)
    print('Hyperparameters Set')

    # Fit your model
    xgbModel = model.fit(X, Y)
    print("Model Fit")

    # Score your training data with cross validation and log metrics
    ss = ShuffleSplit(n_splits=args.k_folds,
                      test_size=args.shuffle_split_size,
                      random_state=33)
    bootstrap_sample_number = args.k_folds * 100
    score_log_classification_training_data(model, X, Y, ss,
                                           bootstrap_sample_number)

    # Log a Confusion Matrix and Precision-Recall Curve for your training data
    log_classification_charts("Training", xgbModel, X, Y)

    # Score your validation data and log metrics
    score_log_classification_validation_data(xgbModel, val_X, val_Y)
    print("Scoring Done for Validation Data")

    # Log a Confusion Matrix and Precision-Recall Curve for your validation data
    log_classification_charts("Validation", xgbModel, val_X, val_Y)

    # Model Explanations
    client = ExplanationClient.from_run(run)
    explainer = MimicExplainer(xgbModel,
                               X,
                               LGBMExplainableModel,
                               classes=list(val_Y.unique()),
                               features=val_X.columns,
                               shap_values_output='probability',
                               model_task='classification')
    global_explanation = explainer.explain_global(X)
    print(global_explanation)
    client.upload_model_explanation(global_explanation, top_k=30)
    print("Global Explanations Created")

    # Save local explanations in JSON format to a column in the validation set
    valDF = save_local_explanations(explainer, valDF, val_X)
    print("Explanations Saved to Validation Data")

    # Save global explanations as a pandas dataframe
    globalExplanations = save_global_explanations(explainer, val_X)
    print("Global Explanations Saved as Pandas Dataframe")

    # Make a folder in which to save your output
    os.makedirs('outputs', exist_ok=True)

    # Save your model
    joblib.dump(xgbModel, 'outputs/XGBmodel.pkl')
    print("Model Saved")

    # Save your explainer model
    joblib.dump(explainer, 'outputs/LGBMexplainer.pkl')
    print("Explainer Model Saved")

    # Save your validation set predictions
    valDF = make_classification_predictions(xgbModel, valDF, val_X, val_Y)
    valDF.to_csv('outputs/validationPredictions.csv', index=False)
    print('Validation Predictions written to CSV file in logs')

    # Save your global explanations
    globalExplanations.to_csv('outputs/globalExplanations.csv', index=False)
    print('Global Explanations written to CSV file in logs')
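# A minimal sketch of what save_local_explanations might look like (assumed;
# the original helper is defined elsewhere). It stores each row's top-ranked
# feature names and importance values as a JSON string column.
import json


def save_local_explanations(explainer, df, df_X, top_k=5):
    local_explanation = explainer.explain_local(df_X)
    names = local_explanation.get_ranked_local_names()
    values = local_explanation.get_ranked_local_values()
    # For classification these are ranked per class; take the first class.
    names, values = names[0], values[0]
    df = df.copy()
    df['localExplanations'] = [
        json.dumps(dict(zip(n[:top_k], v[:top_k])))
        for n, v in zip(names, values)
    ]
    return df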