Example #1
    def test_serialize_mimic_lightgbm(self):
        test_logger.info("Running test_serialize_mimic_lightgbm to validate serializing explainer with lightgbm model")
        x_train, x_test, y_train, _, feature_names, target_names = create_scikit_cancer_data()
        model = create_sklearn_svm_classifier(x_train, y_train)
        model_task = ModelTask.Unknown
        kwargs = {'explainable_model_args': {'n_jobs': 1}, 'augment_data': False, 'reset_index': True}
        explainer = MimicExplainer(model, x_train, LGBMExplainableModel, features=feature_names,
                                   model_task=model_task, classes=target_names, **kwargs)
        explanation = explainer.explain_global(x_test)
        assert explanation.method == LIGHTGBM_METHOD

        tree_explainer = shap.TreeExplainer(explainer.surrogate_model.model)

        # Validate wrapped model, surrogate, and tree explainer with surrogate can be serialized
        model_name = 'wrapped_model.joblib'
        surrogate_name = 'surrogate_model.joblib'
        tree_explainer_name = 'tree_explainer_model.joblib'
        with open(model_name, 'wb') as stream:
            dump(explainer.model, stream)
        with open(surrogate_name, 'wb') as stream:
            dump(explainer.surrogate_model.model, stream)
        with open(tree_explainer_name, 'wb') as stream:
            dump(tree_explainer, stream)
        assert path.exists(model_name)
        assert path.exists(surrogate_name)
        assert path.exists(tree_explainer_name)
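
A minimal sketch of reading those artifacts back, assuming the same joblib file names the test above writes:

from joblib import load

# Reload the serialized wrapped model, surrogate, and tree explainer.
with open('wrapped_model.joblib', 'rb') as stream:
    wrapped_model = load(stream)
with open('surrogate_model.joblib', 'rb') as stream:
    surrogate = load(stream)
with open('tree_explainer_model.joblib', 'rb') as stream:
    tree_explainer = load(stream)
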
    def test_error_analysis_iris_numeric_feature_names(self):
        # e2e test of error analysis with numeric feature names
        X_train, X_test, y_train, y_test, _, _ = create_iris_data()
        knn = sklearn.neighbors.KNeighborsClassifier()
        knn.fit(X_train, y_train)

        model_task = ModelTask.Classification
        explainer = MimicExplainer(knn,
                                   X_train,
                                   LGBMExplainableModel,
                                   model_task=model_task)
        global_explanation = explainer.explain_global(X_test)

        dashboard = ErrorAnalysisDashboard(global_explanation,
                                           knn,
                                           dataset=X_test,
                                           true_y=y_test)
        metric = metric_to_display_name[Metrics.ERROR_RATE]
        result = dashboard.input.debug_ml([
            global_explanation.features, [], [], DEFAULT_MAX_DEPTH,
            DEFAULT_NUM_LEAVES, DEFAULT_MIN_CHILD_SAMPLES, metric
        ])
        assert WidgetRequestResponseConstants.ERROR not in result
        matrix_features = global_explanation.features[0:1]
        result = dashboard.input.matrix(matrix_features, [], [], True, 8,
                                        metric)
        assert WidgetRequestResponseConstants.ERROR not in result

    def test_explanation_dashboard_many_columns(self):
        X, y = make_classification(n_features=2000)

        # Split data into train and test
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=0)
        classes = np.unique(y_train).tolist()
        feature_names = ["col" + str(i) for i in list(range(X_train.shape[1]))]
        X_train = pd.DataFrame(X_train, columns=feature_names)
        X_test = pd.DataFrame(X_test, columns=feature_names)
        knn = sklearn.neighbors.KNeighborsClassifier()
        knn.fit(X_train, y_train)

        model_task = ModelTask.Classification
        explainer = MimicExplainer(knn,
                                   X_train,
                                   LGBMExplainableModel,
                                   model_task=model_task)
        global_explanation = explainer.explain_global(X_test)

        ExplanationDashboard(explanation=global_explanation,
                             model=knn,
                             dataset=X_test,
                             true_y=y_test,
                             classes=classes)
Example #4
def mimic_values(x_train,
                 x_test,
                 model,
                 features,
                 augment_data,
                 max_num_of_augmentations,
                 explainable_model=LinearExplainableModel):
    """  
    Parameters:
    
    Provides feature importances to explain the model using a surrogate model
    
    x_train: input dataset to train the model
    x_test: test dataset
    model: trained model
    explainable_model: interpretable model as a global surrogate to the black box model
    features: list of feature names. Optional, used if doing classification
    classes: list of output class labels or names. Optional, used if doing classification
    augment_data:is optional and if true, oversamples the initialization examples to improve surrogate model accuracy to fit originalmodel.                  Useful for high-dimensional data where the number of rows is less than the number of columns.
    max_num_of_augmentations: is optional and defines max number of times we can increase the input data size.
    
    Returns: 
    explainer (object): provides the feature importances that determines the prediction of the model
    global_explanation (object): provides the global feature importances that determines the prediction of the model
    local_explanation (object): provides the local feature importances that determines the prediction of the model
    
    """
    explainer = MimicExplainer(
        model,
        x_train,
        explainable_model,
        augment_data=augment_data,
        max_num_of_augmentations=max_num_of_augmentations,
        features=features)

    # you can use the training data or the test data here
    global_explanation = explainer.explain_global(x_test)

    # explain the selected data point in the test set
    local_explanation = explainer.explain_local(x_test)

    return explainer, global_explanation, local_explanation
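
A hypothetical usage sketch for mimic_values; the synthetic data and model below are assumptions for illustration:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Build a small synthetic classification problem and a black-box model.
X, y = make_classification(n_samples=500, n_features=10, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = RandomForestClassifier(random_state=0).fit(x_train, y_train)

feature_names = ['f' + str(i) for i in range(X.shape[1])]
explainer, global_exp, local_exp = mimic_values(
    x_train, x_test, clf, feature_names,
    augment_data=True, max_num_of_augmentations=10)
print(global_exp.get_feature_importance_dict())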
Example #5
    def test_pickle_unpickle_mimic_explainer_classification(self, surrogate_model):
        x_train, x_test, y_train, _, feature_names, target_names = create_scikit_cancer_data()
        model = create_sklearn_svm_classifier(x_train, y_train)
        model_task = ModelTask.Unknown
        explainer = MimicExplainer(model, x_train, surrogate_model, features=feature_names,
                                   model_task=model_task, classes=target_names)

        self._verify_explanations(explainer, x_test, get_mimic_method(surrogate_model))
        recovered_explainer = self.pickle_unpickle_explainer(explainer)
        self._verify_explanations(recovered_explainer, x_test, get_mimic_method(surrogate_model))
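
The pickle_unpickle_explainer helper is referenced but not shown; a plausible round-trip sketch (an assumption, not necessarily the suite's actual code):

import pickle

def pickle_unpickle_explainer(explainer):
    # Serialize and immediately deserialize to verify round-trip support.
    return pickle.loads(pickle.dumps(explainer))
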
def run_error_analysis_adult_census(X, y, categorical_features):
    X, y = sklearn.utils.resample(X,
                                  y,
                                  n_samples=1000,
                                  random_state=7,
                                  stratify=y)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=7,
                                                        stratify=y)

    knn = sklearn.neighbors.KNeighborsClassifier()
    knn.fit(X_train, y_train)

    model_task = ModelTask.Classification
    explainer = MimicExplainer(knn,
                               X_train,
                               LGBMExplainableModel,
                               augment_data=True,
                               max_num_of_augmentations=10,
                               model_task=model_task)
    global_explanation = explainer.explain_global(X_test)

    dashboard = ErrorAnalysisDashboard(
        global_explanation,
        knn,
        dataset=X_test,
        true_y=y_test,
        categorical_features=categorical_features)
    metric = metric_to_display_name[Metrics.ERROR_RATE]
    result = dashboard.input.debug_ml([
        global_explanation.features, [], [], DEFAULT_MAX_DEPTH,
        DEFAULT_NUM_LEAVES, DEFAULT_MIN_CHILD_SAMPLES, metric
    ])
    assert WidgetRequestResponseConstants.ERROR not in result
    matrix_features = global_explanation.features[0:1]
    result = dashboard.input.matrix(matrix_features, [], [], True, 8, metric)
    assert WidgetRequestResponseConstants.ERROR not in result
Example #7
    def test_error_analysis_adult_census(self):
        X, y = shap.datasets.adult()
        y = [1 if r else 0 for r in y]

        X, y = sklearn.utils.resample(X,
                                      y,
                                      n_samples=1000,
                                      random_state=7,
                                      stratify=y)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=7,
                                                            stratify=y)

        knn = sklearn.neighbors.KNeighborsClassifier()
        knn.fit(X_train, y_train)

        model_task = ModelTask.Classification
        explainer = MimicExplainer(knn,
                                   X_train,
                                   LGBMExplainableModel,
                                   augment_data=True,
                                   max_num_of_augmentations=10,
                                   model_task=model_task)
        global_explanation = explainer.explain_global(X_test)

        categorical_features = [
            'Workclass', 'Education-Num', 'Marital Status', 'Occupation',
            'Relationship', 'Race', 'Sex', 'Country'
        ]
        ErrorAnalysisDashboard(global_explanation,
                               knn,
                               dataset=X_test,
                               true_y=y_test,
                               categorical_features=categorical_features)

    def test_error_analysis_pandas(self):
        X_train, X_test, y_train, y_test, feature_names, _ = create_iris_data()

        # Validate error analysis dashboard on pandas DataFrame
        # and pandas Series
        X_train = pd.DataFrame(X_train, columns=feature_names)
        X_test = pd.DataFrame(X_test, columns=feature_names)
        y_train = pd.Series(y_train)
        y_test = pd.Series(y_test)

        knn = sklearn.neighbors.KNeighborsClassifier()
        knn.fit(X_train, y_train)

        model_task = ModelTask.Classification
        explainer = MimicExplainer(knn,
                                   X_train,
                                   LGBMExplainableModel,
                                   model_task=model_task)
        global_explanation = explainer.explain_global(X_test)

        ErrorAnalysisDashboard(global_explanation,
                               knn,
                               dataset=X_test,
                               true_y=y_test)
Example #9
    def test_pickle_unpickle_mimic_explainer_regression(self, surrogate_model):
        num_features = 100
        num_rows = 1000
        test_size = 0.2
        X, y = make_regression(n_samples=num_rows, n_features=num_features)
        x_train, x_test, y_train, _ = train_test_split(X, y, test_size=test_size, random_state=42)

        # LinearRegression(normalize=True) was removed in scikit-learn 1.2;
        # standardize features explicitly beforehand if normalization is needed.
        model = LinearRegression()
        model.fit(x_train, y_train)
        explainer = MimicExplainer(model, x_train, surrogate_model)

        self._verify_explanations(explainer, x_test, get_mimic_method(surrogate_model))
        recovered_explainer = self.pickle_unpickle_explainer(explainer)
        self._verify_explanations(recovered_explainer, x_test, get_mimic_method(surrogate_model))
Example #10
    def train_interpret(self, X, model="tabular"):
        """
        Train a interpret model

        Parameters
        ----------
        self    : object Wrapper
        X       : pd.DataFrame
                  Data that were used in the train for interpret
        model   : string, optional
                  Model to use for the interpret [tabular,mimic_LGBME,
                  mimic_Linear,mimic_SGDE,mimic_Dec_Tree]
        Returns
        -------
        None
        """
        mimic_models = {
            "mimic_LGBME": LGBMExplainableModel,
            "mimic_Linear": LinearExplainableModel,
            "mimic_SGDE": SGDExplainableModel,
            "mimic_Dec_Tree": DecisionTreeExplainableModel,
        }
        if model == "tabular":
            explainer = TabularExplainer(self.artifacts["model"],
                                         X,
                                         features=self.artifacts["columns"])
        else:
            explainer = MimicExplainer(
                self.artifacts["model"],
                X,
                mimic_models[model],
                augment_data=True,
                max_num_of_augmentations=10,
                features=self.artifacts["columns"],
            )
        self.artifacts["explainer"] = explainer
Example #11
# pip install azureml-interpret

# Explainers:

# - MimicExplainer - An explainer that creates a global surrogate model that approximates your trained model and can be used to generate explanations. This explainable model must have the same kind of architecture as your trained model (for example, linear or tree-based).
# - TabularExplainer - An explainer that acts as a wrapper around various SHAP explainer algorithms, automatically choosing the one that is most appropriate for your model architecture.
# - PFIExplainer - A Permutation Feature Importance explainer that analyzes feature importance by shuffling feature values and measuring the impact on prediction performance.


# MimicExplainer
from interpret.ext.blackbox import MimicExplainer
from interpret.ext.glassbox import DecisionTreeExplainableModel

mim_explainer = MimicExplainer(model=loan_model,
                               initialization_examples=X_test,
                               explainable_model=DecisionTreeExplainableModel,
                               features=['loan_amount','income','age','marital_status'],
                               classes=['reject', 'approve'])

# TabularExplainer
from interpret.ext.blackbox import TabularExplainer

tab_explainer = TabularExplainer(model=loan_model,
                             initialization_examples=X_test,
                             features=['loan_amount','income','age','marital_status'],
                             classes=['reject', 'approve'])


# PFIExplainer
from interpret.ext.blackbox import PFIExplainer

# Completing the pattern above; PFIExplainer wraps the model directly and
# needs no surrogate model or initialization examples.
pfi_explainer = PFIExplainer(model=loan_model,
                             features=['loan_amount','income','age','marital_status'],
                             classes=['reject', 'approve'])
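
Getting explanations differs slightly across the three: PFI measures the performance drop when each feature is shuffled, so its explain_global call needs the true labels (y_train here is an assumed variable following the snippets above):

# Global feature importance from each explainer
mim_global = mim_explainer.explain_global(X_train)
tab_global = tab_explainer.explain_global(X_train)
pfi_global = pfi_explainer.explain_global(X_train, true_labels=y_train)
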
def main():
    # Connect to your AMLS Workspace and retrieve your data
    ws = run.experiment.workspace
    training_dataset_name = args.train_dataset_name
    train_dataset = Dataset.get_by_name(ws,
                                        training_dataset_name,
                                        version='latest')
    val_dataset_name = args.val_dataset_name
    val_dataset = Dataset.get_by_name(ws, val_dataset_name, version='latest')
    print('Datasets Retrieved')

    # Transform your data to Pandas
    trainTab = train_dataset
    trainDF = trainTab.to_pandas_dataframe()
    valTab = val_dataset
    valDF = valTab.to_pandas_dataframe()
    print('Datasets Converted to Pandas')

    # Split out X and Y variables for both training and validation data
    X, Y = split_x_y(trainDF, args.target_column_name)
    val_X, val_Y = split_x_y(valDF, args.target_column_name)
    print("Data Ready for Scoring")

    # Set your model and hyperparameters
    hyperparameters = dict(eta=args.eta,
                           learning_rate=args.learning_rate,
                           scale_pos_weight=args.scale_pos_weight,
                           booster=args.booster,
                           min_child_weight=args.min_child_weight,
                           max_depth=args.max_depth,
                           gamma=args.gamma,
                           subsample=args.subsample,
                           colsample_bytree=args.colsample_bytree,
                           reg_lambda=args.reg_lambda,
                           alpha=args.alpha,
                           objective=args.objective)

    model = XGBClassifier(**hyperparameters)
    print('Hyperparameters Set')

    # Fit your model
    xgbModel = model.fit(X, Y)
    print("Model Fit")

    # Score your training data with cross validation and log metrics
    ss = ShuffleSplit(n_splits=args.k_folds,
                      test_size=args.shuffle_split_size,
                      random_state=33)
    bootstrap_sample_number = args.k_folds * 100
    score_log_classification_training_data(model, X, Y, ss,
                                           bootstrap_sample_number)

    # Log a Confusion Matrix and Precision Recall Curve for your training data
    log_classification_charts("Training", xgbModel, X, Y)

    # Score your validation data and log metrics
    score_log_classification_validation_data(xgbModel, val_X, val_Y)
    print("Scoring Done for Validation Data")

    # Log a Confusion Matrix and Precision Recall Curve for your validation data
    log_classification_charts("Validation", xgbModel, val_X, val_Y)

    # Model Explanations
    client = ExplanationClient.from_run(run)
    explainer = MimicExplainer(xgbModel,
                               X,
                               LGBMExplainableModel,
                               classes=list(val_Y.unique()),
                               features=val_X.columns,
                               shap_values_output='probability',
                               model_task='classification')
    global_explanation = explainer.explain_global(X)
    print(global_explanation)
    client.upload_model_explanation(global_explanation, top_k=30)
    print("Global Explanations Created")

    # Save local Explanations in json format to a column in the Validation Set
    valDF = save_local_explanations(explainer, valDF, val_X)
    print("Explanations Saved to Validation Data")

    # Save Global Explanations as a pandas dataframe
    globalExplanations = save_global_explanations(explainer, val_X)
    print("Global Explanations Saved as Pandas Dataframe")

    # Make a folder in which to save your output
    os.makedirs('outputs', exist_ok=True)

    # Save your Model
    joblib.dump(xgbModel, 'outputs/XGBmodel.pkl')
    print("Model Saved")

    # Save your Explainer Model
    joblib.dump(explainer, 'outputs/LGBMexplainer.pkl')
    print("Explainer Model Saved")

    # Save your Validation Set Predictions
    valDF = make_classification_predictions(xgbModel, valDF, val_X, val_Y)
    valDF.to_csv('outputs/validationPredictions.csv', index=False)
    print('Validation Predictions written to CSV file in logs')

    # Save your Global Explanations
    globalExplanations.to_csv('outputs/globalExplanations.csv', index=False)
    print('Global Explanations written to CSV file in logs')
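
The save_local_explanations and save_global_explanations helpers are referenced above but not shown; a plausible sketch of the global one, using interpret-community's get_feature_importance_dict (an assumption; the script's real helper may differ):

import pandas as pd

def save_global_explanations(explainer, data):
    # Rank features by global importance and return them as a DataFrame.
    explanation = explainer.explain_global(data)
    importances = explanation.get_feature_importance_dict()
    return pd.DataFrame(list(importances.items()),
                        columns=['feature', 'importance'])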