def explain(skater_exp: Explanation, training_df, test_df, explanation_target, prefix_target):
    """Build a Skater tree-surrogate explanation for a trained model and return it as SVG text.

    Loads the predictive model referenced by the explanation's job, fits a decision-tree
    surrogate against the model's predictions on the training data, renders it with
    dtreeviz, and returns the SVG file content as a string.

    :param skater_exp: Explanation whose ``job`` links to the persisted predictive model.
    :param training_df: training DataFrame containing 'trace_id' and 'label' columns.
    :param test_df: test DataFrame (currently unused; kept for interface compatibility).
    :param explanation_target: unused here; kept for interface compatibility.
    :param prefix_target: unused here; kept for interface compatibility.
    :return: the SVG markup as a string, or an error message when the plot exceeds 15 MB.
    """
    job = skater_exp.job
    # joblib persists the model wrapped in a sequence; the estimator is the first element.
    model = joblib.load(job.predictive_model.model_path)
    model = model[0]
    # Feature matrix excludes the identifier and target columns.
    # NOTE: keyword axis=1 replaces the positional form removed in pandas 2.0.
    features = list(training_df.drop(['trace_id', 'label'], axis=1).columns.values)
    interpreter = Interpretation(training_df, feature_names=features)
    X_train = training_df.drop(['trace_id', 'label'], axis=1)
    Y_train = training_df['label'].values
    model_inst = InMemoryModel(model.predict, examples=X_train,
                               model_type=model._estimator_type,
                               unique_values=[1, 2], feature_names=features,
                               target_names=['label'])
    surrogate_explainer = interpreter.tree_surrogate(model_inst, seed=5)
    surrogate_explainer.fit(X_train, Y_train, use_oracle=True, prune='post',
                            scorer_type='default')
    # NOTE(review): class_names is set to the *feature* names here, mirroring the
    # original code — presumably intentional for the visualization; confirm.
    surrogate_explainer.class_names = features
    viz = dtreeviz(surrogate_explainer.estimator_, X_train, Y_train,
                   target_name='label', feature_names=features, orientation="TD",
                   class_names=list(surrogate_explainer.class_names), fancy=True,
                   X=None, label_fontsize=12, ticks_fontsize=8, fontname="Arial")
    name = create_unique_name("skater_plot.svg")
    viz.save(name)
    if os.path.getsize(name) > 15000000:
        # Clean up the oversized plot instead of leaking it on disk.
        os.remove(name)
        return 'The file size is too big'
    # Context manager guarantees the handle is closed before the file is removed
    # (removal of an open file fails on Windows).
    with open(name, "r") as f:
        response = f.read()
    os.remove(name)
    # dtreeviz leaves an extension-less intermediate file behind; remove it too.
    if os.path.isfile(name.split('.svg')[0]):
        os.remove(name.split('.svg')[0])
    return response
def analyze(model_prediction, X_train, y_train):
    """Fit a Skater tree surrogate on a model's predictions and render it.

    Wraps the prediction callable, fits a post-pruned decision-tree surrogate
    against the oracle on the training data, writes the tree plot to
    'simple_tree_pre.png', and returns it as an IPython Image.

    :param model_prediction: callable producing predictions for the wrapped model.
    :param X_train: training feature DataFrame (column names used as feature names).
    :param y_train: training target values.
    :return: Image displaying the saved surrogate-tree plot.
    """
    oracle = InMemoryModel(model_prediction, examples=X_train)
    interp = Interpretation(X_train, feature_names=X_train.columns)
    surrogate = interp.tree_surrogate(oracle, seed=5)
    surrogate.fit(X_train, y_train,
                  use_oracle=True, prune='post', scorer_type='default')
    surrogate.plot_global_decisions(
        colors=['coral', 'lightsteelblue', 'darkkhaki'],
        file_name='simple_tree_pre.png')
    return Image(filename='simple_tree_pre.png')
def handle(self, *args, **kwargs):
    """Django management-command entry point: render a tree-surrogate plot for one job.

    Loads the hard-coded job (pk=71), fits a Skater tree surrogate against the
    persisted model's predictions on that job's training data, and saves the
    dtreeviz rendering to 'skater_plot_train_2_2.svg'.
    """
    # get model
    TARGET_MODEL = 71
    # [0] raises IndexError if the job does not exist — acceptable for a one-off command.
    job = Job.objects.filter(pk=TARGET_MODEL)[0]
    # joblib persists the model wrapped in a sequence; the estimator is the first element.
    model = joblib.load(job.predictive_model.model_path)[0]

    # load data
    training_df, test_df = get_encoded_logs(job)

    # Feature matrix excludes the identifier and target columns.
    # NOTE: keyword axis=1 replaces the positional form removed in pandas 2.0.
    features = list(
        training_df.drop(['trace_id', 'label'], axis=1).columns.values)
    interpreter = Interpretation(training_df, feature_names=features)
    X_train = training_df.drop(['trace_id', 'label'], axis=1)
    Y_train = training_df['label'].values
    model_inst = InMemoryModel(model.predict, examples=X_train,
                               model_type='classifier', unique_values=[1, 2],
                               feature_names=features, target_names=['label'])
    surrogate_explainer = interpreter.tree_surrogate(model_inst, seed=5)
    surrogate_explainer.fit(X_train, Y_train, use_oracle=True, prune='post',
                            scorer_type='default')
    # NOTE(review): class_names is set to the *feature* names, mirroring the
    # sibling explain() — presumably intentional for the visualization; confirm.
    surrogate_explainer.class_names = features
    viz = dtreeviz(surrogate_explainer.estimator_, X_train, Y_train,
                   target_name='label', feature_names=features, orientation="TD",
                   class_names=list(surrogate_explainer.class_names), fancy=True,
                   X=None, label_fontsize=12, ticks_fontsize=8, fontname="Arial")
    viz.save("skater_plot_train_2_2.svg")
## To avoid clutter I only produce plots for gradient boosting and one fold only if (fold == 2 and modelno == 5): # Plot PDPs of variable "alm" since it is the most important feature, for 3 of the 4 models ## alm not the most important feature for Gaussian Naive bayes tho, explain that # for other variables just change the name # for other models just change the number # interpreter.partial_dependence.plot_partial_dependence(["alm"], # pyint_model, grid_resolution=30, # with_variance=True) # # PDP interaction between two variables, for each class # interpreter.partial_dependence.plot_partial_dependence([("nuc", "mit")], pyint_model, # grid_resolution=10) surrogate_explainer = interpreter.tree_surrogate( oracle=pyint_model, seed=5, max_depth=4) surrogate_explainer.fit(train_data, train_target, use_oracle=True, prune='pre', scorer_type='default') surrogate_explainer.plot_global_decisions( file_name='mlp_tree_class_md4.png', fig_size=(8, 8)) #show_in_notebook('simple_tree_pre.png', width=400, height=300) # This initialization, although showcased on the docs, does not work # surrogate_explainer = interpreter.tree_surrogate(estimator_type_='classifier', # feature_names=featureNames[1:9], # class_names=["CYT", "ME3", "MIT", "NUC"], seed=5) # y_hat_train = model.predict(train_data)