Example #1
def shap_values(model, x_train, x_test, features=None, classes=None):
    """
    Provides feature importances to explain the model.

    Parameters:
    model: trained model
    x_train: input dataset used to train the model
    x_test: test dataset
    features: list of feature names. Optional
    classes: list of output class labels or names. Optional, used if doing classification

    Returns:
    explainer (object): explainer that computes the feature importances behind the model's predictions
    global_explanation (object): global feature importances that determine the model's predictions
    local_explanation (object): local (per-instance) feature importances that determine the model's predictions

    """
    explainer = TabularExplainer(model, x_train, features=features, classes=classes)

    # you can use the training data or the test data here
    global_explanation = explainer.explain_global(x_test)

    # explain the selected data point in the test set
    local_explanation = explainer.explain_local(x_test)

    return explainer, global_explanation, local_explanation
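A hypothetical usage sketch for the helper above; `model`, `x_train`, `x_test` and `feature_names` are assumed to come from a prior training step:

explainer, global_explanation, local_explanation = shap_values(
    model, x_train, x_test, features=feature_names)
# dictionary of feature name -> global importance value
print(global_explanation.get_feature_importance_dict())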
Example #2
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    
    parser.add_argument('--C', type=float, default=1.0, help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100, help="Maximum number of iterations to converge")

    args = parser.parse_args()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))
    
    os.makedirs('outputs', exist_ok=True)
    # files saved in the "outputs" folder are automatically uploaded into run history
    joblib.dump(model, 'outputs/modelht.pkl')
    
    #model_name
    model_file_name = 'modelht.pkl'
    
    # register the model
    run.upload_file('original_model.pkl', os.path.join('./outputs/', model_file_name))
    original_model = run.register_model(model_name='model_explain',model_path='original_model.pkl')

    # Explain predictions on your local machine
    tabular_explainer = TabularExplainer(model, x_train, features=feature_names)
    global_explanation = tabular_explainer.explain_global(x_test)

    # The explanation can then be downloaded on any compute
    comment = 'Global explanation on regression model trained on bank marketing campaign dataset'
    client.upload_model_explanation(global_explanation, comment=comment, model_id=original_model.id)    
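The uploaded explanation can later be retrieved on any compute - a minimal sketch, assuming `ws`, `experiment_name` and `run_id` identify the run that uploaded it (the optional `top_k` limits how many features are returned):

from azureml.interpret import ExplanationClient

client = ExplanationClient.from_run_id(ws, experiment_name, run_id)
downloaded_explanation = client.download_model_explanation(top_k=10)
print(downloaded_explanation.get_feature_importance_dict())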
Example #3
def interpret_model(model, x_train, x_test, feature_names=None, classes=None):
    #Using SHAP TabularExplainer
    explainer = TabularExplainer(model,
                                 x_train,
                                 features=feature_names,
                                 classes=classes)
    #Generate global explanations
    global_explanation = explainer.explain_global(x_test)
    #Return Generated Explanation dashboard
    return ExplanationDashboard(global_explanation, model, datasetX=x_test)
Example #4
def Global_Model_Explanation(model,
                             x_train,
                             x_test,
                             feature_names=None,
                             classes=None,
                             explanation_data=None):
    #Using SHAP TabularExplainer
    explainer = TabularExplainer(model,
                                 x_train,
                                 features=feature_names,
                                 classes=classes)
    #Generate global explanations
    if explanation_data == 'Training':
        global_explanation = explainer.explain_global(x_train)
    else:
        global_explanation = explainer.explain_global(x_test)
    # print the global importance rank data
    print('global importance rank: {}'.format(
        global_explanation.get_feature_importance_dict()))
    #Return Generated Explanation dashboard
    return global_explanation
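A hypothetical call, assuming a trained `model` and split data are in scope; passing explanation_data='Training' evaluates the explanation on the training set instead of the test set:

global_explanation = Global_Model_Explanation(model,
                                              x_train,
                                              x_test,
                                              feature_names=feature_names,
                                              classes=classes,
                                              explanation_data='Training')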
Example #5
def interpret_global(model,
                     train,
                     test,
                     features=None,
                     classes=None,
                     local=False,
                     task=None):
    # explain predictions on your local machine
    # "features" and "classes" fields are optional
    explainer = TabularExplainer(model,
                                 train,
                                 features=features,
                                 classes=classes,
                                 model_task=task)

    # explain overall model predictions (global explanation)
    global_explanation = explainer.explain_global(test)

    # uploading global model explanation data for storage or visualization in webUX
    # the explanation can then be downloaded on any compute
    # multiple explanations can be uploaded
    return global_explanation
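The returned explanation can be uploaded from within an Azure ML run, as the comments above suggest - a sketch, assuming `run` comes from Run.get_context() and `feature_names`/`class_names` exist, as in the other examples:

from azureml.interpret import ExplanationClient

global_explanation = interpret_global(model, x_train, x_test,
                                      features=feature_names,
                                      classes=class_names)
client = ExplanationClient.from_run(run)
client.upload_model_explanation(global_explanation,
                                comment='global explanation: all features')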
Example #6
    def test_serialize_kernel(self):
        test_logger.info("Running test_serialize_kernel to validate inner explainer and wrapped model serialization")
        x_train, _, y_train, _, feature_names, target_names = create_scikit_cancer_data()
        model = create_sklearn_svm_classifier(x_train, y_train)
        explainer = TabularExplainer(model,
                                     x_train,
                                     features=feature_names,
                                     classes=target_names)
        # Validate wrapped model and inner explainer can be serialized
        model_name = 'wrapped_model.joblib'
        explainer_name = 'inner_explainer.joblib'
        with open(explainer_name, 'wb') as stream:
            dump(explainer.explainer.explainer, stream)
        with open(model_name, 'wb') as stream:
            dump(explainer.model, stream)
        assert path.exists(model_name)
        assert path.exists(explainer_name)
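A minimal counterpart sketch (assumed, not part of the test above) showing that the serialized artifacts can be restored with joblib:

from joblib import load

with open('inner_explainer.joblib', 'rb') as stream:
    inner_explainer = load(stream)
with open('wrapped_model.joblib', 'rb') as stream:
    wrapped_model = load(stream)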
Example #7
    def train_interpret(self, X, model="tabular"):
        """
        Train a interpret model

        Parameters
        ----------
        self    : object Wrapper
        X       : pd.DataFrame
                  Data that were used in the train for interpret
        model   : string, optional
                  Model to use for the interpret [tabular,mimic_LGBME,
                  mimic_Linear,mimic_SGDE,mimic_Dec_Tree]
        Returns
        -------
        None
        """
        mimic_models = {
            "mimic_LGBME": LGBMExplainableModel,
            "mimic_Linear": LinearExplainableModel,
            "mimic_SGDE": SGDExplainableModel,
            "mimic_Dec_Tree": DecisionTreeExplainableModel,
        }
        if model == "tabular":
            explainer = TabularExplainer(self.artifacts["model"],
                                         X,
                                         features=self.artifacts["columns"])
        else:
            explainer = MimicExplainer(
                self.artifacts["model"],
                X,
                mimic_models[model],
                augment_data=True,
                max_num_of_augmentations=10,
                features=self.artifacts["columns"],
            )
        self.artifacts["explainer"] = explainer
Example #8
def train_model(df, target):
    # Creating dummy columns for each categorical feature
    categorical = []
    for col, value in df.items():
        if value.dtype == 'object':
            categorical.append(col)
    # Store the numerical columns in a list numerical
    numerical = df.columns.difference(categorical)
    numeric_transformations = [
        ([f],
         Pipeline(steps=[('imputer', SimpleImputer(
             strategy='median')), ('scaler', StandardScaler())]))
        for f in numerical
    ]
    categorical_transformations = [([f],
                                    OneHotEncoder(handle_unknown='ignore',
                                                  sparse=False))
                                   for f in categorical]
    transformations = numeric_transformations + categorical_transformations
    # Append classifier to preprocessing pipeline
    clf = Pipeline(steps=[('preprocessor', DataFrameMapper(transformations)
                           ), ('classifier',
                               LogisticRegression(solver='lbfgs'))])
    # Split data into train and test
    x_train, x_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        test_size=0.35,
                                                        random_state=0,
                                                        stratify=target)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(classification_report(y_test, y_pred))
    accu = accuracy_score(y_test, y_pred)
    model_file_name = 'classifier.pkl'
    # save model in the outputs folder so it automatically gets uploaded
    with open(model_file_name, 'wb') as file:
        joblib.dump(value=clf,
                    filename=os.path.join('./outputs/', model_file_name))
    run = Run.get_context()
    run.log("accuracy", accu)
    # we upload the model into the experiment artifact store, but do not register it as a model until unit tests have passed successfully in the next ML step
    run.upload_file(model_file_name, os.path.join('./outputs/',
                                                  model_file_name))
    #Interpret steps
    client = ExplanationClient.from_run(run)
    # Using SHAP TabularExplainer
    explainer = TabularExplainer(clf.steps[-1][1],
                                 initialization_examples=x_train,
                                 features=df.columns,
                                 classes=["Not leaving", "leaving"],
                                 transformations=transformations)
    # explain overall model predictions (global explanation)
    global_explanation = explainer.explain_global(x_test)
    # Sorted SHAP values
    print('ranked global importance values: {}'.format(
        global_explanation.get_ranked_global_values()))
    # Corresponding feature names
    print('ranked global importance names: {}'.format(
        global_explanation.get_ranked_global_names()))
    # Feature ranks (based on original order of features)
    print('global importance rank: {}'.format(
        global_explanation.global_importance_rank))
    # uploading global model explanation data for storage or visualization in webUX
    # the explanation can then be downloaded on any compute
    # multiple explanations can be uploaded
    client.upload_model_explanation(global_explanation,
                                    comment='global explanation: all features')
Example #9
def model_train(df):
    run = Run.get_context()

    df.drop("Sno", axis=1, inplace=True)

    y_raw = df['Risk']
    X_raw = df.drop('Risk', axis=1)

    categorical_features = X_raw.select_dtypes(include=['object']).columns
    numeric_features = X_raw.select_dtypes(include=['int64', 'float']).columns

    categorical_transformer = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='constant', fill_value="missing")),
               ('onehotencoder',
                OneHotEncoder(categories='auto', sparse=False))])

    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

    feature_engineering_pipeline = ColumnTransformer(transformers=[
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features)
    ],
                                                     remainder="drop")

    # Encode Labels
    le = LabelEncoder()
    encoded_y = le.fit_transform(y_raw)

    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(X_raw,
                                                        encoded_y,
                                                        test_size=0.20,
                                                        stratify=encoded_y,
                                                        random_state=42)

    # Create sklearn pipeline
    lr_clf = Pipeline(
        steps=[('preprocessor', feature_engineering_pipeline
                ), ('classifier', LogisticRegression(solver="lbfgs"))])
    # Train the model
    lr_clf.fit(X_train, y_train)

    # Capture metrics
    train_acc = lr_clf.score(X_train, y_train)
    test_acc = lr_clf.score(X_test, y_test)
    print("Training accuracy: %.3f" % train_acc)
    print("Testing accuracy: %.3f" % test_acc)

    # Log to Azure ML
    run.log('Train accuracy', train_acc)
    run.log('Test accuracy', test_acc)

    # Explain model
    explainer = TabularExplainer(lr_clf.steps[-1][1],
                                 initialization_examples=X_train,
                                 features=X_raw.columns,
                                 classes=["Good", "Bad"],
                                 transformations=feature_engineering_pipeline)

    # explain overall model predictions (global explanation)
    global_explanation = explainer.explain_global(X_test)

    # Sorted SHAP values
    print('ranked global importance values: {}'.format(
        global_explanation.get_ranked_global_values()))
    # Corresponding feature names
    print('ranked global importance names: {}'.format(
        global_explanation.get_ranked_global_names()))
    # Feature ranks (based on original order of features)
    print('global importance rank: {}'.format(
        global_explanation.global_importance_rank))

    client = ExplanationClient.from_run(run)
    client.upload_model_explanation(global_explanation,
                                    comment='Global Explanation: All Features')

    return lr_clf
Example #10
model_file_name = 'ridge_{0:.2f}.pkl'.format(alpha)
# save model in the outputs folder so it automatically gets uploaded
with open(model_file_name, 'wb') as file:
    joblib.dump(value=reg, filename=os.path.join(OUTPUT_DIR, model_file_name))

# register the model
run.upload_file('original_model.pkl',
                os.path.join('./outputs/', model_file_name))
original_model = run.register_model(
    model_name='model_explain_model_on_amlcomp',
    model_path='original_model.pkl')

# Explain predictions on your local machine
tabular_explainer = TabularExplainer(model,
                                     X_train,
                                     features=boston_data.feature_names)

# Explain overall model predictions (global explanation)
# Passing in test dataset for evaluation examples - note it must be a representative sample of the original data
# x_train can be passed as well; with more evaluation examples the explanation
# takes longer to compute, but may be more accurate
global_explanation = tabular_explainer.explain_global(X_test)

# Uploading model explanation data for storage or visualization in webUX
# The explanation can then be downloaded on any compute
comment = 'Global explanation on regression model trained on boston dataset'
client.upload_model_explanation(global_explanation,
                                comment=comment,
                                model_id=original_model.id)
Example #11
def main():
    print("Running train_aml.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="diabetes_model.pkl",
    )

    parser.add_argument("--step_output",
                        type=str,
                        help=("output for passing data to next step"))

    parser.add_argument("--dataset_version",
                        type=str,
                        help=("dataset version"))

    parser.add_argument("--data_file_path",
                        type=str,
                        help=("data file path, if specified,\
               a new version of the dataset will be registered"))

    parser.add_argument(
        "--caller_run_id",
        type=str,
        help=("caller run id, for example ADF pipeline run id"))

    parser.add_argument("--dataset_name",
                        type=str,
                        help=("Dataset name. Dataset must be passed by name\
              to always get the desired dataset version\
              rather than the one used while the pipeline creation"))

    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [dataset_version]: %s" % args.dataset_version)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [caller_run_id]: %s" % args.caller_run_id)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    model_name = args.model_name
    step_output_path = args.step_output
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()

    print("Getting training parameters")

    # Load the training parameters from the parameters file
    with open("parameters.json") as f:
        pars = json.load(f)
    try:
        train_args = pars["training"]
    except KeyError:
        print("Could not load training values from file")
        train_args = {}

    # Log the training parameters
    print(f"Parameters: {train_args}")
    for (k, v) in train_args.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Get the dataset
    if dataset_name:
        if data_file_path == 'none':
            dataset = Dataset.get_by_name(run.experiment.workspace,
                                          dataset_name,
                                          dataset_version)  # NOQA: E402, E501
        else:
            dataset = register_dataset(run.experiment.workspace, dataset_name,
                                       os.environ.get("DATASTORE_NAME"),
                                       data_file_path)
    else:
        e = ("No dataset provided")
        print(e)
        raise Exception(e)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset
    run.parent.tag("dataset_id", value=dataset.id)

    # Split the data into test/train
    df0 = dataset.to_pandas_dataframe()
    df = prepare_data(df0)
    data = split_data(df)

    # Train the model
    model = train_model(data, train_args)
    explainer = TabularExplainer(model,
                                 data["train"]["X"],
                                 features=df0.drop(['car name', 'mpg'],
                                                   axis=1).columns)
    global_explanation = explainer.explain_global(data["test"]["X"])
    client = ExplanationClient.from_run(run)
    client.upload_model_explanation(global_explanation,
                                    comment='MPG Prediction Explanation')

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(model, data)
    for (k, v) in metrics.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Pass model file to next step
    os.makedirs(step_output_path, exist_ok=True)
    model_output_path = os.path.join(step_output_path, model_name)
    joblib.dump(value=model, filename=model_output_path)

    # Also upload model file to run outputs for history
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)
    joblib.dump(value=model, filename=output_path)

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
Example #12
# preds = reg.predict(X_test)
run.log('C', C)
run.log('gamma', gamma)


model_file_name = 'svc.pkl'
# save model in the outputs folder so it automatically gets uploaded
with open(model_file_name, 'wb') as file:
    joblib.dump(value=reg, filename=os.path.join(OUTPUT_DIR,
                                                 model_file_name))

# register the model
run.upload_file('original_model.pkl', os.path.join('./outputs/', model_file_name))
original_model = run.register_model(model_name='model_explain_model_on_amlcomp',
                                    model_path='original_model.pkl')

# Explain predictions on your local machine
tabular_explainer = TabularExplainer(reg, X_train.to_pandas(), features=X_train.columns, use_gpu=True)

# Explain overall model predictions (global explanation)
# Passing in test dataset for evaluation examples - note it must be a representative sample of the original data
# x_train can be passed as well; with more evaluation examples the explanation
# takes longer to compute, but may be more accurate
global_explanation = tabular_explainer.explain_global(X_test.to_pandas()[:50])

# Uploading model explanation data for storage or visualization in webUX
# The explanation can then be downloaded on any compute
comment = 'Global explanation on regression model trained on boston dataset'
client.upload_model_explanation(global_explanation, comment=comment, model_id=original_model.id)
Example #13
# MimicExplainer
from interpret.ext.blackbox import MimicExplainer
from interpret.ext.glassbox import DecisionTreeExplainableModel

mim_explainer = MimicExplainer(model=loan_model,
                               initialization_examples=X_test,
                               explainable_model=DecisionTreeExplainableModel,
                               features=['loan_amount', 'income', 'age', 'marital_status'],
                               classes=['reject', 'approve'])

# TabularExplainer
from interpret.ext.blackbox import TabularExplainer

tab_explainer = TabularExplainer(model=loan_model,
                                 initialization_examples=X_test,
                                 features=['loan_amount', 'income', 'age', 'marital_status'],
                                 classes=['reject', 'approve'])


# PFIExplainer
from interpret.ext.blackbox import PFIExplainer

pfi_explainer = PFIExplainer(model=loan_model,
                             features=['loan_amount', 'income', 'age', 'marital_status'],
                             classes=['reject', 'approve'])



# MimicExplainer
global_mim_explanation = mim_explainer.explain_global(X_train)
global_mim_feature_importance = global_mim_explanation.get_feature_importance_dict()
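The same call pattern applies to the other two explainers; note that PFIExplainer computes permutation importance and therefore needs the true labels - a sketch, assuming `y_train` holds them:

# TabularExplainer
global_tab_explanation = tab_explainer.explain_global(X_train)
global_tab_feature_importance = global_tab_explanation.get_feature_importance_dict()

# PFIExplainer - requires the true labels for permutation importance
global_pfi_explanation = pfi_explainer.explain_global(X_train, true_labels=y_train)
global_pfi_feature_importance = global_pfi_explanation.get_feature_importance_dict()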
Example #14
# save model for use outside the script
model_file_name = 'original_model.pkl'
with open(model_file_name, 'wb') as file:
    joblib.dump(value=clf, filename=os.path.join(OUTPUT_DIR, model_file_name))

# register the model with the model management service for later use
run.upload_file('original_model.pkl', os.path.join(OUTPUT_DIR,
                                                   model_file_name))
original_model = run.register_model(model_name='amlcompute_deploy_model',
                                    model_path='original_model.pkl')

# create an explainer to validate or debug the model
tabular_explainer = TabularExplainer(model,
                                     initialization_examples=x_train,
                                     features=attritionXData.columns,
                                     classes=["Staying", "Leaving"],
                                     transformations=transformations)

# explain overall model predictions (global explanation)
# passing in test dataset for evaluation examples - note it must be a representative sample of the original data
# more data (e.g. x_train) will likely lead to higher accuracy, but at a time cost
global_explanation = tabular_explainer.explain_global(x_test)

# uploading model explanation data for storage or visualization
comment = 'Global explanation on classification model trained on IBM employee attrition dataset'
client.upload_model_explanation(global_explanation,
                                comment=comment,
                                model_id=original_model.id)

# also create a lightweight explainer for scoring time
Example #15
print("Running train.py")

# Randomly pick alpha
alphas = np.arange(0.0, 0.5, 0.01)
alpha = alphas[np.random.choice(alphas.shape[0], 1, replace=False)][0]
print(alpha)
run.log("alpha", alpha)
reg = Ridge(alpha=alpha)
reg.fit(data["train"]["X"], data["train"]["y"])
preds = reg.predict(data["test"]["X"])
run.log("mse", mean_squared_error(preds, data["test"]["y"]))

# create an explainer to validate or debug the model
tabular_explainer = TabularExplainer(reg,
                                     initialization_examples=X_train,
                                     features=columns)
# explain overall model predictions (global explanation)
# passing in test dataset for evaluation examples

global_explanation = tabular_explainer.explain_global(X_test)

# uploading model explanation data for storage or visualization
comment = 'Global explanation of Diabetes Regression'
client.upload_model_explanation(global_explanation, comment=comment)

with open(model_name, "wb") as file:
    joblib.dump(value=reg, filename=model_name)

# upload the model file explicitly into artifacts
run.upload_file(name="./outputs/" + model_name, path_or_stream=model_name)
Example #16
from interpret.ext.blackbox import TabularExplainer

iris = load_iris()
X = iris['data']
y = iris['target']
classes = iris['target_names']
feature_names = iris['feature_names']

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

clf = svm.SVC(gamma=0.001, C=100., probability=True)
model = clf.fit(x_train, y_train)

explainer = TabularExplainer(model,
                             x_train,
                             features=feature_names,
                             classes=classes)

global_explanation = explainer.explain_global(x_test)


instance_num = 0
local_explanation = explainer.explain_local(x_test[instance_num, :])

prediction_value = clf.predict(x_test)[instance_num]

sorted_local_importance_values = local_explanation.get_ranked_local_values()[
    prediction_value]
sorted_local_importance_names = local_explanation.get_ranked_local_names()[
    prediction_value]
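A small sketch (assumed) pairing the ranked names and values for the predicted class of the explained instance:

for name, value in zip(sorted_local_importance_names,
                       sorted_local_importance_values):
    print('{}: {:.4f}'.format(name, value))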
Example #17
Y_predict = rfc.predict(X_test)

# Evaluate the RFC model
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_predict)
score = rfc.score(X_test, Y_test)

#explainability

from interpret.ext.blackbox import TabularExplainer

classes = ['Not Greater than 50k', 'Greater than 50k']
features = list(X.columns)

tab_explainer = TabularExplainer(trained_model,
                                 X_train,
                                 features=features,
                                 classes=classes)

#global
global_explanation = tab_explainer.explain_global(X_train)

global_fi = global_explanation.get_feature_importance_dict()

#local

X_explain = X_test[:5]

local_explanation = tab_explainer.explain_local(X_explain)

local_f = local_explanation.get_ranked_local_names()
local_importance = local_explanation.get_ranked_local_values()
Example #18
# Train a decision tree model
print('Training a decision tree model')
model = DecisionTreeClassifier().fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
run.log('Accuracy', float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_scores[:, 1])
run.log('AUC', float(auc))

os.makedirs('outputs', exist_ok=True)
# note: files saved in the outputs folder are automatically uploaded into the experiment record
joblib.dump(value=model, filename='outputs/diabetes.pkl')

# Get explanation
explainer = TabularExplainer(model, X_train, features=features, classes=labels)
explanation = explainer.explain_global(X_test)

# Get an Explanation Client and upload the explanation
explain_client = ExplanationClient.from_run(run)
explain_client.upload_model_explanation(explanation,
                                        comment='Tabular Explanation')

# Complete the run
run.complete()
Example #19
model = clf.steps[-1][1]

# save model for use outside the script
model_file_name = 'log_reg.pkl'
joblib.dump(value=clf, filename=os.path.join(OUTPUT_DIR, model_file_name))

# register the model with the model management service for later use
run.upload_file('model.pkl', os.path.join(OUTPUT_DIR, model_file_name))
original_model = run.register_model(model_name='creditmodel_explainer_remote',
                                    model_path='model.pkl')

print('create explainer')
# create an explainer to validate or debug the model
tabular_explainer = TabularExplainer(model,
                                     x_train,
                                     features=creditXData.columns,
                                     classes=[0, 1],
                                     transformations=transformations)

# explain overall model predictions (global explanation)
# passing in test dataset for evaluation examples - note it must be a representative sample of the original data
# more data (e.g. x_train) will likely lead to higher accuracy, but at a time cost
global_explanation = tabular_explainer.explain_global(x_test)

print('upload explanation')
# uploading model explanation data for storage or visualization
comment = 'Global explanation on classification model trained on German credit dataset'
client.upload_model_explanation(global_explanation, comment=comment)

if not is_remote_run:
    run.complete()
Example #20
# 2. Mimic Explainer
from interpret.ext.blackbox import MimicExplainer
# You can use one of the following four interpretable models as a global surrogate to the black box model
from interpret.ext.glassbox import LGBMExplainableModel
from interpret.ext.glassbox import LinearExplainableModel
from interpret.ext.glassbox import SGDExplainableModel
from interpret.ext.glassbox import DecisionTreeExplainableModel

# OR

# 3. PFI Explainer
from interpret.ext.blackbox import PFIExplainer

# In[9]:

explainer = TabularExplainer(m, X_train, features=featu_na, classes=classes)

# In[13]:

X_tests = joblib.load("X_tests")

# In[15]:

X_test = joblib.load("X_test")

# In[16]:

X_t, X_tests, y_train, y_tests = train_test_split(X_test,
                                                  y_test,
                                                  test_size=0.02,
                                                  random_state=56)
Example #21
from sklearn.model_selection import train_test_split
from interpret.ext.blackbox import TabularExplainer

breast_cancer_data = load_breast_cancer()
classes = breast_cancer_data.target_names.tolist()

x_train, x_test, y_train, y_test = train_test_split(breast_cancer_data.data,
                                                    breast_cancer_data.target,
                                                    test_size=0.2,
                                                    random_state=42)
clf = RandomForestClassifier()
model = clf.fit(x_train, y_train)

# "features" and "classes" fields are optional
explainer = TabularExplainer(model,
                             x_train,
                             features=breast_cancer_data.feature_names,
                             classes=classes)

# you can use the training data or the test data here
global_explanation = explainer.explain_global(x_train)

# sorted feature importance values and feature names
sorted_global_importance_values = global_explanation.get_ranked_global_values()
sorted_global_importance_names = global_explanation.get_ranked_global_names()
print(
    f'Type: {type(sorted_global_importance_values)}, Value: {sorted_global_importance_values}'
)
print(
    f'Type: {type(sorted_global_importance_names)}, Value: {sorted_global_importance_names}'
)
Example #22
run.log("Test R2 Score", test_r2)
run.log("RMSE", rmse)

print("Saving the model to outputs ...")

model_file_name = 'gbr_tickfund.pkl'
joblib.dump(value=gbr, filename='outputs/model.pkl')

with open(model_file_name, 'wb') as file:
    joblib.dump(value=gbr, filename=os.path.join(OUTPUT_DIR, model_file_name))
# register the model
run.upload_file('dev_model.pkl', os.path.join('./outputs/', model_file_name))
original_model = run.register_model(model_name='gbr_model_train_msft',
                                    model_path='dev_model.pkl')

# Explain predictions on your local machine
tabular_explainer = TabularExplainer(gbr, train_features, features=df.columns)

# Explain overall model predictions (global explanation)
# Passing in test dataset for evaluation examples - note it must be a representative sample of the original data
# x_train can be passed as well; with more evaluation examples the explanation
# takes longer to compute, but may be more accurate
global_explanation = tabular_explainer.explain_global(test_features)

# Uploading model explanation data for storage or visualization in webUX
# The explanation can then be downloaded on any compute
comment = 'Global explanation on regression model trained on ticker fund dataset'
client.upload_model_explanation(global_explanation,
                                comment=comment,
                                model_id=original_model.id)
Example #23
print(cm)

# Save the model as .pkl file and
# save model file to the outputs/ folder - this will auto upload the model to the run in AzureML
model_file_name = 'log_reg.pkl'

#TODO: Write out the real model file :)
from interpret.ext.blackbox import TabularExplainer
from azureml.interpret import ExplanationClient
# create an explanation client to store the explanation (contrib API)
client = ExplanationClient.from_run(run)
print('create explainer')
# create an explainer to validate or debug the model
tabular_explainer = TabularExplainer(clf.steps[-1][1],
                                     initialization_examples=x_train,
                                     features=attritionXData.columns,
                                     classes=["Not leaving", "leaving"],
                                     transformations=transformations)

# explain overall model predictions (global explanation)
# passing in test dataset for evaluation examples - note it must be a representative sample of the original data
# more data (e.g. x_train) will likely lead to higher accuracy, but at a time cost
global_explanation = tabular_explainer.explain_global(x_test)

print('upload explanation')
# uploading model explanation data for storage or visualization
comment = 'Global explanation on classification model trained on IBM employee attrition dataset'
client.upload_model_explanation(global_explanation, comment=comment)

# save model in the outputs folder so it automatically gets uploaded
with open(model_file_name, 'wb') as file: