Example #1
from azureml.core.run import Run
from azureml.interpret import ExplanationClient


def upload_interpretability_aml(global_explanation, comment, y_test):
    # Get an explanation client for the current run (remote or local)
    run = Run.get_context()
    client = ExplanationClient.from_run(run)
    client.upload_model_explanation(global_explanation,
                                    comment=comment,
                                    true_ys=y_test)
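
A minimal usage sketch for this helper, assuming a fitted scikit-learn model clf and a train/test split from the surrounding training script (clf, X_train, X_test and y_test are hypothetical names; TabularExplainer is the interpret-community explainer also used in Example #3 below):

from interpret.ext.blackbox import TabularExplainer

# Hypothetical driver code: clf, X_train, X_test and y_test are assumed
# to exist in the calling script, which runs as an Azure ML job.
explainer = TabularExplainer(clf, X_train)
global_explanation = explainer.explain_global(X_test)
upload_interpretability_aml(global_explanation,
                            comment='Global Explanation: All Features',
                            y_test=y_test)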
Example #2
# COMMAND ----------

# DBTITLE 1,Model explanation
import time

import pandas as pd
from azureml.interpret import ExplanationClient
from azureml.interpret.common.exceptions import ExplanationNotFoundException

# Get model explanation data (best_run and new_training come from the
# preceding notebook cells)
if new_training == "True":
    # Give AutoML time to finish writing the explanation artifacts
    time.sleep(1200)
try:
    explanation_client = ExplanationClient.from_run(best_run)
    if explanation_client is not None:  # TS : added if condition - 2021-05-17 12:57pm
        client = explanation_client
        engineered_explanations = client.download_model_explanation(raw=False)
        # print(engineered_explanations.get_feature_importance_dict())
        print(
            "You can visualize the engineered explanations under the 'Explanations (preview)' tab in the AutoML run at:\n"
            + best_run.get_portal_url())

        feature_imp_dict_eng = pd.DataFrame(
            engineered_explanations.get_feature_importance_dict().items())
        feature_imp_dict_eng.columns = ["Feature", "Importance"]

        raw_explanations = client.download_model_explanation(raw=True)
        # print(raw_explanations.get_feature_importance_dict())
        print(
            "You can visualize the raw explanations under the 'Explanations (preview)' tab in the AutoML run at:\n"
            + best_run.get_portal_url())
except ExplanationNotFoundException as ex:
    print(ex)
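
The original cell is truncated at this point; judging by the engineered-importance dataframe above, the missing tail presumably built the parallel dataframe for the raw explanations. A hypothetical sketch of that continuation:

# Hypothetical reconstruction of the truncated tail, mirroring the
# engineered-importance dataframe above:
feature_imp_dict_raw = pd.DataFrame(
    raw_explanations.get_feature_importance_dict().items())
feature_imp_dict_raw.columns = ["Feature", "Importance"]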
Example #3
from azureml.core.run import Run
from azureml.interpret import ExplanationClient
from interpret.ext.blackbox import TabularExplainer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler


def model_train(df):
    run = Run.get_context()

    df.drop("Sno", axis=1, inplace=True)

    y_raw = df['Risk']
    X_raw = df.drop('Risk', axis=1)

    categorical_features = X_raw.select_dtypes(include=['object']).columns
    numeric_features = X_raw.select_dtypes(include=['int64', 'float']).columns

    categorical_transformer = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='constant', fill_value="missing")),
               ('onehotencoder',
                OneHotEncoder(categories='auto', sparse=False))])

    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

    feature_engineering_pipeline = ColumnTransformer(
        transformers=[
            ('numeric', numeric_transformer, numeric_features),
            ('categorical', categorical_transformer, categorical_features)
        ],
        remainder="drop")

    # Encode Labels
    le = LabelEncoder()
    encoded_y = le.fit_transform(y_raw)

    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(X_raw,
                                                        encoded_y,
                                                        test_size=0.20,
                                                        stratify=encoded_y,
                                                        random_state=42)

    # Create sklearn pipeline
    lr_clf = Pipeline(
        steps=[('preprocessor', feature_engineering_pipeline),
               ('classifier', LogisticRegression(solver="lbfgs"))])
    # Train the model
    lr_clf.fit(X_train, y_train)

    # Capture metrics
    train_acc = lr_clf.score(X_train, y_train)
    test_acc = lr_clf.score(X_test, y_test)
    print("Training accuracy: %.3f" % train_acc)
    print("Testing accuracy: %.3f" % test_acc)

    # Log to Azure ML
    run.log('Train accuracy', train_acc)
    run.log('Test accuracy', test_acc)

    # Explain model
    explainer = TabularExplainer(lr_clf.steps[-1][1],
                                 initialization_examples=X_train,
                                 features=X_raw.columns,
                                 classes=["Good", "Bad"],
                                 transformations=feature_engineering_pipeline)

    # explain overall model predictions (global explanation)
    global_explanation = explainer.explain_global(X_test)

    # Sorted SHAP values
    print('ranked global importance values: {}'.format(
        global_explanation.get_ranked_global_values()))
    # Corresponding feature names
    print('ranked global importance names: {}'.format(
        global_explanation.get_ranked_global_names()))
    # Feature ranks (based on original order of features)
    print('global importance rank: {}'.format(
        global_explanation.global_importance_rank))

    client = ExplanationClient.from_run(run)
    client.upload_model_explanation(global_explanation,
                                    comment='Global Explanation: All Features')

    return lr_clf
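
A sketch of how model_train might be driven (the CSV path is hypothetical; the function expects the German-credit-style columns Sno and Risk used above, and should run inside an Azure ML job so Run.get_context() and ExplanationClient resolve):

import pandas as pd

# Hypothetical entry point: load the credit-risk data and train.
df = pd.read_csv('german_credit_data.csv')  # hypothetical path
model = model_train(df)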
Example #4
from azureml.interpret import ExplanationClient
from cuml.model_selection import train_test_split
from azureml.core.run import Run
import joblib
import os
import cuml
from cuml.benchmark.datagen import load_higgs


OUTPUT_DIR = './outputs/'
os.makedirs(OUTPUT_DIR, exist_ok=True)

X, y = load_higgs()
N_ROWS = 1000000
run = Run.get_context()
client = ExplanationClient.from_run(run)
run.log('N_ROWS', N_ROWS)
X_train, X_test, y_train, y_test = train_test_split(X[:N_ROWS],
                                                    y[:N_ROWS],
                                                    random_state=1)
# Write X_test out as a pickle file for later visualization
x_test_pkl = 'x_test.pkl'
joblib.dump(value=X_test, filename=os.path.join(OUTPUT_DIR, x_test_pkl))
run.upload_file('x_test_higgs.pkl', os.path.join(OUTPUT_DIR, x_test_pkl))


gamma = 0.001
C = 100.
# Use the SVC algorithm to create a model
reg = cuml.svm.SVC(C=C, gamma=gamma, probability=True)
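
The snippet stops right after constructing the classifier. A minimal, hypothetical continuation that fits the model and uploads a global explanation (interpret-community's TabularExplainer works with any model exposing predict/predict_proba, which cuml's SVC does; the comment string and sample size are invented):

from interpret.ext.blackbox import TabularExplainer

# Hypothetical continuation of the training script above.
reg.fit(X_train, y_train)
explainer = TabularExplainer(reg, X_train)
# SHAP-style explanation is expensive; explain a small sample only.
global_explanation = explainer.explain_global(X_test[:1000])
client.upload_model_explanation(global_explanation,
                                comment='Global explanation: Higgs SVC')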
Example #5
def main():
    print("Running train_aml.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="diabetes_model.pkl",
    )

    parser.add_argument("--step_output",
                        type=str,
                        help=("output for passing data to next step"))

    parser.add_argument("--dataset_version",
                        type=str,
                        help=("dataset version"))

    parser.add_argument("--data_file_path",
                        type=str,
                        help=("data file path, if specified,\
               a new version of the dataset will be registered"))

    parser.add_argument(
        "--caller_run_id",
        type=str,
        help=("caller run id, for example ADF pipeline run id"))

    parser.add_argument("--dataset_name",
                        type=str,
                        help=("Dataset name. Dataset must be passed by name\
              to always get the desired dataset version\
              rather than the one used while the pipeline creation"))

    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [dataset_version]: %s" % args.dataset_version)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [caller_run_id]: %s" % args.caller_run_id)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    model_name = args.model_name
    step_output_path = args.step_output
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()

    print("Getting training parameters")

    # Load the training parameters from the parameters file
    with open("parameters.json") as f:
        pars = json.load(f)
    try:
        train_args = pars["training"]
    except KeyError:
        print("Could not load training values from file")
        train_args = {}

    # Log the training parameters
    print(f"Parameters: {train_args}")
    for (k, v) in train_args.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Get the dataset
    if dataset_name:
        if data_file_path == 'none':
            dataset = Dataset.get_by_name(run.experiment.workspace,
                                          dataset_name,
                                          dataset_version)  # NOQA: E402, E501
        else:
            dataset = register_dataset(run.experiment.workspace, dataset_name,
                                       os.environ.get("DATASTORE_NAME"),
                                       data_file_path)
    else:
        e = "No dataset provided"
        print(e)
        raise Exception(e)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset
    run.parent.tag("dataset_id", value=dataset.id)

    # Split the data into test/train
    df0 = dataset.to_pandas_dataframe()
    df = prepare_data(df0)
    data = split_data(df)

    # Train the model
    model = train_model(data, train_args)
    explainer = TabularExplainer(model,
                                 data["train"]["X"],
                                 features=df0.drop(['car name', 'mpg'],
                                                   axis=1).columns)
    global_explanation = explainer.explain_global(data["test"]["X"])
    client = ExplanationClient.from_run(run)
    client.upload_model_explanation(global_explanation,
                                    comment='MPG Prediction Explanation')

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(model, data)
    for (k, v) in metrics.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Pass model file to next step
    os.makedirs(step_output_path, exist_ok=True)
    model_output_path = os.path.join(step_output_path, model_name)
    joblib.dump(value=model, filename=model_output_path)

    # Also upload model file to run outputs for history
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)
    joblib.dump(value=model, filename=output_path)

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()


# Second, separate example script: XGBoost training with MimicExplainer.
# Here `run` and `args` are defined elsewhere in that script (argparse and
# Run.get_context()), which this excerpt does not show.
def main():
    # Connect to your AMLS Workspace and retrieve your data
    ws = run.experiment.workspace
    training_dataset_name = args.train_dataset_name
    train_dataset = Dataset.get_by_name(ws,
                                        training_dataset_name,
                                        version='latest')
    val_dataset_name = args.val_dataset_name
    val_dataset = Dataset.get_by_name(ws, val_dataset_name, version='latest')
    print('Datasets Retrieved')

    # Transform your data to Pandas
    trainTab = train_dataset
    trainDF = trainTab.to_pandas_dataframe()
    valTab = val_dataset
    valDF = valTab.to_pandas_dataframe()
    print('Datasets Converted to Pandas')

    # Split out X and Y variables for both training and validation data
    X, Y = split_x_y(trainDF, args.target_column_name)
    val_X, val_Y = split_x_y(valDF, args.target_column_name)
    print("Data Ready for Scoring")

    # Set your model and hyperparameters
    hyperparameters = dict(eta=args.eta,
                           learning_rate=args.learning_rate,
                           scale_pos_weight=args.scale_pos_weight,
                           booster=args.booster,
                           min_child_weight=args.min_child_weight,
                           max_depth=args.max_depth,
                           gamma=args.gamma,
                           subsample=args.subsample,
                           colsample_bytree=args.colsample_bytree,
                           reg_lambda=args.reg_lambda,
                           alpha=args.alpha,
                           objective=args.objective)

    model = XGBClassifier(**hyperparameters)
    print('Hyperparameters Set')

    # Fit your model
    xgbModel = model.fit(X, Y)
    print("Model Fit")

    # Score your training data with cross validation and log metrics
    ss = ShuffleSplit(n_splits=args.k_folds,
                      test_size=args.shuffle_split_size,
                      random_state=33)
    bootstrap_sample_number = args.k_folds * 100
    score_log_classification_training_data(model, X, Y, ss,
                                           bootstrap_sample_number)

    # Log a Confusion Matrix and Precision Recall Curve for your training data
    log_classification_charts("Training", xgbModel, X, Y)

    # Score your validation data and log metrics
    score_log_classification_validation_data(xgbModel, val_X, val_Y)
    print("Scoring Done for Validation Data")

    # Log a Confusion Matrix and Precision Recall Curve for your validation data
    log_classification_charts("Validation", xgbModel, val_X, val_Y)

    # Model Explanations
    client = ExplanationClient.from_run(run)
    explainer = MimicExplainer(xgbModel,
                               X,
                               LGBMExplainableModel,
                               classes=list(val_Y.unique()),
                               features=val_X.columns,
                               shap_values_output='probability',
                               model_task='classification')
    global_explanation = explainer.explain_global(X)
    print(global_explanation)
    client.upload_model_explanation(global_explanation, top_k=30)
    print("Global Explanations Created")

    # Save local Explanations in json format to a column in the Validation Set
    valDF = save_local_explanations(explainer, valDF, val_X)
    print("Explanations Saved to Validation Data")

    # Save Global Explanations as a pandas dataframe
    globalExplanations = save_global_explanations(explainer, val_X)
    print("Global Explanations Saved as Pandas Dataframe")

    # Make a folder in which to save your output
    os.makedirs('outputs', exist_ok=True)

    # Save your Model
    joblib.dump(xgbModel, 'outputs/XGBmodel.pkl')
    print("Model Saved")

    # Save your Explainer Model
    joblib.dump(explainer, 'outputs/LGBMexplainer.pkl')
    print("Explainer Model Saved")

    # Save your Validation Set Predictions
    valDF = make_classification_predictions(xgbModel, valDF, val_X, val_Y)
    valDF.to_csv('outputs/validationPredictions.csv', index=False)
    print('Validation Predictions written to CSV file in logs')

    # Save your Global Explanations
    globalExplanations.to_csv('outputs/globalExplanations.csv', index=False)
    print('Global Explanations written to CSV file in logs')
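
The helpers used above (split_x_y, score_log_classification_training_data, save_local_explanations, save_global_explanations, make_classification_predictions) are defined elsewhere in that script. As a rough illustration only, a minimal sketch of what save_global_explanations could look like, assuming it reduces the mimic explainer's global importances to a two-column dataframe:

import pandas as pd

def save_global_explanations(explainer, val_X):
    # Hypothetical implementation: explain globally and tabulate the
    # per-feature importance values.
    global_explanation = explainer.explain_global(val_X)
    return pd.DataFrame(
        global_explanation.get_feature_importance_dict().items(),
        columns=['Feature', 'Importance'])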