def progressify(iterable, n):
    """Pass the items of ``iterable`` through while driving a progress bar.

    An ``IntProgress`` bar (maximum ``n``) and a ``Label`` are displayed
    when iteration starts.  Each time the consumer resumes the generator
    after receiving an item, the bar advances by one, its description is
    refreshed with the completed percentage and count, and the label shows
    the elapsed wall-clock time.  Once ``iterable`` is exhausted the bar is
    switched to the ``'success'`` style.

    Args:
        iterable: Any iterable; its items are yielded unchanged.
        n: Expected number of items.  Used as the bar maximum and as the
            denominator of the displayed percentage.

    Yields:
        Every item of ``iterable``, in order.
    """
    began = time.time()
    bar = IntProgress(min=0, max=n, layout=Layout(width='100%'))
    status = Label(layout=Layout(width='100%'))
    for widget in (bar, status):
        display(widget)

    for item in iterable:
        yield item

        # Bookkeeping happens only after the consumer asks for the next item.
        bar.value += 1
        elapsed = datetime.timedelta(seconds=time.time() - began)
        done = bar.value
        bar.description = '%.1f%% (%s / %s)' % (done * 100.0 / n, done, n)
        status.value = 'elapsed %s' % elapsed

    bar.bar_style = 'success'
def compare_models(data_gen, model_2d, model_3d=None):
    """Run ``model_2d`` over every batch of ``data_gen`` and tabulate the results.

    For each index ``i`` in ``range(data_gen.epochs)`` the batch
    ``data_gen[i]`` is fetched, ``X[0]`` is reshaped to
    ``(80, 120, 120, 1)`` and passed to ``model_2d.predict``, and the
    prediction is compared with ``y[0]`` through ``compared_segments``.
    A progress bar with an ``"<done>/<total>"`` label is displayed and
    updated after every batch.

    The per-batch results are collected in a list and concatenated once at
    the end.  This replaces ``DataFrame.append``, which was removed in
    pandas 2.0 and copied the whole frame on every iteration.

    Args:
        data_gen: Indexable batch generator exposing an ``epochs`` attribute
            and returning ``(X, y)`` for an integer index.
        model_2d: Model exposing ``predict``.
        model_3d: Accepted for interface compatibility; currently unused.

    Returns:
        pandas.DataFrame: The stacked comparison results with the index
        renumbered from zero (empty when ``data_gen.epochs`` is 0).
    """
    total = data_gen.epochs
    bar = IntProgress(min=0, max=total)
    counter = Label("0/" + str(total))
    display(HBox([bar, counter]))  # display the bar and label

    frames = []
    for i in range(total):
        X, y = data_gen[i]
        y_pred = model_2d.predict(X[0].reshape(80, 120, 120, 1))
        result = compared_segments(y_pred, y[0])

        # Accept everything DataFrame.append used to accept: a frame, a
        # single row given as dict/Series, or a list of such rows.
        if isinstance(result, pd.DataFrame):
            frames.append(result)
        elif isinstance(result, (dict, pd.Series)):
            frames.append(pd.DataFrame([result]))
        else:
            frames.append(pd.DataFrame(result))

        # Update progress bar
        bar.value += 1
        counter.value = str(bar.value) + "/" + str(total)

    if not frames:
        # pd.concat raises on an empty list; mirror the original empty frame.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
def downsample(img_folder, out_folder, sample=True, split=.8, down_rate=1,
               crop=None, dim="3D"):
    """Crop, down-sample and split image/segmentation volumes into train/test.

    Every file in ``img_folder`` whose name ends with ``"MR.npz"`` is loaded
    together with its segmentation (same name with the last four characters
    replaced by ``"seg.npz"``).  Both arrays are cropped and then strided
    with step ``down_rate`` along each of the three axes, once for every
    offset combination, which yields ``down_rate ** 3`` sub-volumes per
    file.  The sub-volumes are written with ``np.savez_compressed`` below
    ``out_folder/train`` or ``out_folder/test``; files are sent to the test
    set once ``(index + 1) > len(files) * split``.

    Args:
        img_folder: Directory containing the ``*MR.npz`` inputs.
        out_folder: Output root; ``train``/``test`` and their ``imgs``/``segs``
            sub-directories are created through ``create_dir``.
        sample: If true, call ``display_train_test`` when finished.
        split: Fraction of the files assigned to the training set.
        down_rate: Stride used along every axis.
        crop: Optional ``(a1, a2, b1, b2, c1, c2)`` crop bounds.  When falsy
            the full shape of the first input file is used.
        dim: ``"3D"`` writes whole sub-volumes, ``"2D"`` writes one file per
            slice along the first axis.

    Returns:
        The shape of the last down-sampled image volume, or ``None`` when
        ``dim`` is invalid or no input files are found.
    """
    # Check if dim is properly defined
    if dim not in ["2D", "3D"]:
        print("dim is not either 2D or 3D")
        return

    # Load all of the base filenames, ignoring all other files in directory
    base_files = [file for file in os.listdir(img_folder)
                  if file.endswith("MR.npz")]
    if not base_files:
        # Without this guard the function fails later with an
        # IndexError/NameError that hides the real cause.
        print("No files ending in MR.npz found in " + str(img_folder))
        return

    # Check if the output directories exist. If not, create them.
    create_dir(out_folder)
    create_dir(out_folder + "/train")
    create_dir(out_folder + "/test")
    create_dir(out_folder + "/train/imgs")
    create_dir(out_folder + "/train/segs")
    create_dir(out_folder + "/test/imgs")
    create_dir(out_folder + "/test/segs")

    # Set up progress bar.
    progress = IntProgress(min=0, max=len(base_files))
    status = Label("Loading File")
    display(HBox([progress, status]))  # display the bar and label

    # Set up the output folders
    out_fol_img = out_folder + "/train/imgs/"
    out_fol_seg = out_folder + "/train/segs/"
    tt = "Train: "  # The label for the progress bar

    # If crop is not given, use the full extent of the first file.
    if not crop:
        a1 = b1 = c1 = 0
        with np.load(img_folder + "/" + base_files[0]) as first:
            (a2, b2, c2) = first['arr_0'].shape
    else:
        (a1, a2, b1, b2, c1, c2) = crop
        print("Cropping to ", a1, a2, b1, b2, c1, c2)

    # For each file, load both the image and segmentation. Downsample both
    # and write them out.
    ds = down_rate
    for n, file in enumerate(base_files):
        stem = file[:-4]
        # Use context managers so the .npz handles are closed promptly.
        with np.load(img_folder + "/" + file) as img_npz:
            img = img_npz['arr_0'][a1:a2, b1:b2, c1:c2]
        with np.load(img_folder + "/" + stem + "seg.npz") as seg_npz:
            seg = seg_npz['arr_0'][a1:a2, b1:b2, c1:c2]

        # Once past the split point, all remaining files go to the test set.
        if (n + 1) > len(base_files) * split:
            out_fol_img = out_folder + "/test/imgs/"
            out_fol_seg = out_folder + "/test/segs/"
            tt = "Test: "

        for i in range(ds):
            for j in range(ds):
                for k in range(ds):
                    # Flat index of the (i, j, k) offset combination.
                    N = str(i + ds * j + (ds ** 2) * k)
                    ds_img = img[i::ds, j::ds, k::ds]
                    ds_seg = seg[i::ds, j::ds, k::ds]
                    # Compare by value: `is` on string literals tests
                    # identity and is not guaranteed to hold.
                    if dim == "3D":
                        np.savez_compressed(
                            out_fol_img + stem + N + ".npz", ds_img)
                        np.savez_compressed(
                            out_fol_seg + stem + N + ".npz", ds_seg)
                    elif dim == "2D":
                        # Iterate over the slices actually present in the
                        # down-sampled volume; the cropped extent (a2 - a1)
                        # overruns it whenever down_rate > 1.
                        for r in range(ds_img.shape[0]):
                            suffix = N + "_" + str(r) + ".npz"
                            np.savez_compressed(
                                out_fol_img + stem + suffix, ds_img[r, :, :])
                            np.savez_compressed(
                                out_fol_seg + stem + suffix, ds_seg[r, :, :])

        progress.value += 1  # signal to increment the progress bar
        status.value = tt + file

    # Display a sample output if requested
    if sample:
        display_train_test(out_folder, dim=dim)

    # Summarize preprocessing info
    per_file = ds ** 3
    n_train_files = np.floor(len(base_files) * split)
    print("Train Images:", int(per_file * n_train_files))
    print("Test Images:", int(per_file * (len(base_files) - n_train_files)))
    print("Dimensions:", ds_img.shape)
    return ds_img.shape
def explain_instance(self, instance, num_reps=50, num_features=4,
                     neighborhood_samples=10000, use_cov_matrix=False,
                     verbose=False, figure_dir=None):
    """Explain ``instance`` repeatedly with LIME and SHAP and plot LEAF metrics.

    The black-box classifier ``self.bb_classifier`` is queried once for the
    instance, then the instance is explained ``num_reps`` times with
    ``self.LIMEEXPL`` and ``self.SHAPEXPL``.  For each repetition the
    surrogate models returned by ``get_LIME_classifier`` and
    ``get_SHAP_classifier`` are evaluated with ``eval_whitebox_classifier``
    and one row of metrics is recorded.  After the loop the Jaccard
    similarity between the binary top-feature masks of the repetitions is
    computed, the explanations are shown in the notebook and one box plot
    per method is drawn.

    Reads ``self.EX``, ``self.StdX``, ``self.F``, ``self.class_names`` and
    ``self.explanation_samples``.  Writes ``self.metrics`` (one row per
    repetition) and ``self.{lime,shap}_{avg,std}_jaccard_bin``.

    Changes with respect to the previous implementation:
      * the top-``num_features`` selection used ``argtop[-k]`` for
        ``k = 0 .. K-1``, which picks index 0 (the *least* important
        feature) instead of the most important one; it now uses
        ``argtop[-(k + 1)]``;
      * rows are accumulated in a list and converted to a DataFrame once
        (``DataFrame.append`` was removed in pandas 2.0);
      * the standardisation of ``x0`` passes an explicit ``out`` buffer so
        that entries masked out by ``where`` are 0 rather than
        uninitialised memory;
      * the disabled "prescription" branch referenced ``LIMEEXPL``,
        ``SHAPEXPL`` and ``F`` without ``self.``.

    Args:
        instance: The instance to explain (deep-copied before use).
        num_reps: Number of times the explanation is repeated.
        num_features: Number of features kept in each explanation.
        neighborhood_samples: Number of random neighbourhood samples drawn.
        use_cov_matrix: If true, draw the samples with an empirical
            covariance matrix instead of unit covariance.
        verbose: If true, print the black-box probability and label of x0.
        figure_dir: If not None, prefix under which ``<method>_leaf.pdf``
            is saved (plain string concatenation, so include any trailing
            separator).

    Returns:
        None.
    """
    npEX = np.array(self.EX)
    cls_proba = self.bb_classifier.predict_proba
    x0 = copy.deepcopy(instance)  # instance to be explained
    # Pristine Mock: its __dict__ lists the bookkeeping attributes that must
    # not end up as metric columns.
    mockobj = mock.Mock()

    # Neighborhood random samples
    # NOTE(review): `X` is not defined in this method; use_cov_matrix=True
    # only works if a module-level `X` exists - confirm and pass it in
    # explicitly.
    cov_matrix = np.cov(
        ((X - npEX) / self.StdX).T) if use_cov_matrix else 1.0
    NormV = scipy.stats.multivariate_normal.rvs(
        mean=np.zeros(self.F), cov=cov_matrix,
        size=neighborhood_samples, random_state=10)

    # Get the output of the black-box classifier on x0
    output = cls_proba([x0])[0]
    label_x0 = 1 if output[1] >= output[0] else 0
    prob_x0 = output[label_x0]
    if verbose:
        print('prob_x0', prob_x0, ' label_x0', self.class_names[label_x0])

    # Prepare instance for LIME (standardised) and SHAP (centred only).
    shap_x0 = (x0 - npEX)
    centered_x0 = np.asarray(shap_x0, dtype=float)
    std = np.asarray(self.StdX, dtype=float)
    # Without `out`, positions where `where` is False are left
    # uninitialised; zero-variance features are mapped to 0 instead.
    lime_x0 = np.divide(
        centered_x0, std,
        out=np.zeros(np.broadcast(centered_x0, std).shape),
        where=np.logical_not(np.isclose(std, 0)))

    records = []  # one dict of metrics per repetition
    progbar = IntProgress(min=0, max=num_reps)
    label = Label(value="")
    display(HBox([Label("K=%d " % (num_features)), progbar, label]))

    # Explain the same instance x0 multiple times
    for rnum in range(num_reps):
        label.value = "%d/%d" % (rnum + 1, num_reps)
        R = mock.Mock()  # store all the computed metrics
        R.rnum, R.prob_x0 = rnum, prob_x0

        # Explain the instance x0 with LIME
        lime_expl = self.LIMEEXPL.explain_instance(
            np.array(x0), cls_proba, num_features=num_features,
            top_labels=1, num_samples=self.explanation_samples)

        # Explain x0 using SHAP
        shap_phi = self.SHAPEXPL.shap_values(x0, l1_reg="num_features(10)")
        shap_phi0 = self.SHAPEXPL.expected_value

        # Take only the top @num_features from shap_phi
        argtop = np.argsort(np.abs(shap_phi[0]))
        for k in range(len(shap_phi)):
            shap_phi[k][argtop[:(self.F - num_features)]] = 0

        # Recover both the LIME and the SHAP classifiers
        R.lime_g = get_LIME_classifier(lime_expl, label_x0, x0)
        R.shap_g = get_SHAP_classifier(label_x0, shap_phi, shap_phi0, x0,
                                       self.EX)

        # ----------------------------------------------------------
        # Evaluate the white box classifiers
        EL = eval_whitebox_classifier(
            R, R.lime_g, npEX, self.StdX, NormV, x0, label_x0, cls_proba,
            "lime", precision_recalls=True)
        ES = eval_whitebox_classifier(
            R, R.shap_g, npEX, np.ones(len(x0)), NormV * self.StdX, x0,
            label_x0, cls_proba, "shap", precision_recalls=True)

        R.lime_local_discr = np.abs(
            R.lime_g.predict([lime_x0])[0] - prob_x0)
        R.shap_local_discr = np.abs(
            R.shap_g.predict([shap_x0])[0] - prob_x0)

        # Feature indices sorted by ascending absolute coefficient.
        R.lime_argtop = np.argsort(np.abs(R.lime_g.coef_))
        R.shap_argtop = np.argsort(np.abs(R.shap_g.coef_))

        # The K most important features are the LAST K entries of the
        # ascending argsort, hence -(k + 1) rather than -k (where -0 == 0
        # would select the least important feature).
        R.mcf_lime = tuple(
            [R.lime_argtop[-(k + 1)] for k in range(num_features)])
        R.mcf_shap = tuple(
            [R.shap_argtop[-(k + 1)] for k in range(num_features)])

        # Binary masks of the argtops
        R.lime_bin_expl, R.shap_bin_expl = np.zeros(self.F), np.zeros(
            self.F)
        R.lime_bin_expl[np.array(R.mcf_lime)] = 1
        R.shap_bin_expl[np.array(R.mcf_shap)] = 1

        # Keep only the metric attributes of R, dropping the Mock internals.
        R_keys = copy.copy(R.__dict__)
        for key in copy.copy(list(R_keys.keys())):
            if key.startswith("wb_"):
                # NOTE(review): `wb_name` is not defined in this method;
                # this branch would raise NameError if any attribute
                # starting with "wb_" were ever set on R - confirm whether
                # it is still needed.
                R_keys[wb_name + key[2:]] = R_keys.pop(key)
            elif key in mockobj.__dict__:
                del R_keys[key]

        records.append({k: R.__dict__[k] for k in R_keys})
        progbar.value += 1

    label.value += " Done."

    # Build the metrics table in one step (one row per repetition).
    rows = pd.DataFrame(records)

    # Use the multiple explanations to compute the LEAF metrics.
    # Jaccard similarities between the various explanations (stability)
    lime_jaccard_mat = 1 - pdist(np.stack(rows.lime_bin_expl, axis=0),
                                 'jaccard')
    shap_jaccard_mat = 1 - pdist(np.stack(rows.shap_bin_expl, axis=0),
                                 'jaccard')
    self.lime_avg_jaccard_bin, self.lime_std_jaccard_bin = np.mean(
        lime_jaccard_mat), np.std(lime_jaccard_mat)
    self.shap_avg_jaccard_bin, self.shap_std_jaccard_bin = np.mean(
        shap_jaccard_mat), np.std(shap_jaccard_mat)

    # LIME/SHAP explanation comparisons (computed but not yet stored or
    # displayed anywhere in this method).
    lime_shap_jaccard_mat = 1 - cdist(np.stack(rows.lime_bin_expl, axis=0),
                                      np.stack(rows.shap_bin_expl, axis=0),
                                      'jaccard')
    lime_shap_avg_jaccard_bin, lime_shap_std_jaccard_bin = np.mean(
        lime_shap_jaccard_mat), np.std(lime_shap_jaccard_mat)

    # store the metrics for later use
    self.metrics = rows

    def leaf_plot(stability, method):
        """Draw the horizontal box plot of the metrics for ``method``."""
        fig, ax1 = plt.subplots(figsize=(6, 2.2))
        data = [
            stability.flatten(),
            1 - rows[method + '_local_discr'],
            rows[method + '_fidelity_f1'],
            1 - 2 * np.abs(rows[method + '_boundary_discr'])
        ]

        ax1.tick_params(axis='both', which='major', labelsize=12)
        ax1.set_xlabel('distribution')
        ax1.set_ylabel('LEAF metrics', color='black', fontsize=15)
        ax1.boxplot(data, vert=False, widths=0.7)
        ax1.tick_params(axis='y', labelcolor='#500000')
        ax1.set_yticks(np.arange(1, len(data) + 1))
        ax1.set_yticklabels([
            'Stability', 'Local Concordance', 'Fidelity', 'Prescriptivity'
        ])
        ax1.set_xlim([-0.05, 1.05])
        ax1.invert_yaxis()

        # Second axes sharing the same x-axis; its tick labels carry the
        # numeric mean +/- std of every metric.
        ax2 = ax1.twinx()
        ax2.tick_params(axis='both', which='major', labelsize=12)
        ax2.set_ylabel('Values', color='#000080')
        ax2.boxplot(data, vert=False, widths=0.7)
        ax2.tick_params(axis='y', labelcolor='#000080')
        ax2.set_yticks(np.arange(1, len(data) + 1))
        ax2.set_yticklabels(
            [" %.3f ± %.3f " % (np.mean(d), np.std(d)) for d in data])
        ax2.invert_yaxis()

        fig.tight_layout()  # otherwise the right y-label is slightly clipped
        if figure_dir is not None:
            imgname = figure_dir + method + "_leaf.pdf"
            print('Saving', imgname)
            plt.savefig(imgname, dpi=150, bbox_inches='tight')
        plt.show()

    # Show LIME explanation (of the last repetition)
    display(HTML("<h2>LIME</h2>"))
    lime_expl.show_in_notebook(show_table=True, show_all=False)
    leaf_plot(lime_jaccard_mat, 'lime')

    # Show SHAP explanation (of the last repetition)
    display(HTML("<h2>SHAP</h2>"))
    display(shap.force_plot(shap_phi0[label_x0], shap_phi[label_x0], x0))
    leaf_plot(shap_jaccard_mat, 'shap')

    # Hard-wired switch: the prescription report below is currently disabled.
    prescription = False
    if prescription:
        print("====================================================")
        lime_x1, lime_sx1 = EL
        shap_x1, shap_sx1 = ES

        print(
            'SHAP accuracy %f balanced_accuracy %f precision %f recall %f'
            % (rows.shap_prescriptivity.mean(),
               rows.shap_bal_prescriptivity.mean(),
               rows.shap_precision_x1.mean(), rows.shap_recall_x1.mean()))

        lime_diff = (rows.iloc[-1].lime_g.coef_ != 0) * (lime_x1 - x0)
        shap_diff = (rows.iloc[-1].shap_g.coef_ != 0) * (shap_x1 - x0)

        print(np.array(rows.iloc[-1].lime_g.coef_ != 0))
        print('lime_diff\n', lime_diff)
        print('shap_diff\n', shap_diff)

        lime_output_x1 = cls_proba([lime_x1])[0]
        shap_output_x1 = cls_proba([shap_x1])[0]
        lime_label_x1 = 1 if lime_output_x1[1] >= lime_output_x1[0] else 0
        shap_label_x1 = 1 if shap_output_x1[1] >= shap_output_x1[0] else 0

        print("LIME(x1) prob =", lime_output_x1)
        print("SHAP(x1) prob =", shap_output_x1)

        # Show LIME explanation of the prescribed point
        lime_expl = self.LIMEEXPL.explain_instance(
            np.array(shap_x1), cls_proba, num_features=num_features,
            top_labels=1, num_samples=self.explanation_samples)
        lime_expl.show_in_notebook(show_table=True, show_all=False)

        # Show SHAP explanation of the prescribed point
        shap_phi = self.SHAPEXPL.shap_values(shap_x1,
                                             l1_reg="num_features(10)")
        shap_phi0 = self.SHAPEXPL.expected_value
        argtop = np.argsort(np.abs(shap_phi[0]))
        for k in range(len(shap_phi)):
            shap_phi[k][argtop[:(self.F - num_features)]] = 0
        display(
            shap.force_plot(shap_phi0[shap_label_x1],
                            shap_phi[shap_label_x1], shap_x1))
def worker(self):
    """Poll the Spark status tracker and render one progress bar per job.

    While ``self.running == 1`` the running jobs of the job group
    ``self.job_info.group_id`` are polled every 0.2 seconds.  The first
    time a job is seen, a ``FloatProgress`` bar and a ``Label`` are created
    for it and appended to the displayed container; on every poll the bar
    description, label text and completion percentage are refreshed from
    the job's stages.  When a new job appears, and once more after the loop
    if ``self.running == 0``, the bar of the most recent job is set to
    100%.

    ``getJobInfo`` and ``getStageInfo`` may return ``None`` (information
    not found or already garbage-collected); such entries are skipped
    instead of raising ``AttributeError``, and ``getJobInfo`` is called
    once per job id rather than twice.
    """

    def cancel(b):
        self.sc.cancelJobGroup(self.job_info.group_id)

    def toggle(widget):
        """Return a click handler that collapses/expands ``widget``'s rows."""

        def f(b):
            for w in widget.children:
                h = w.layout.height
                if h is None or h == "16px":
                    w.layout.height = "0px"
                    b.icon = "arrow-circle-down"
                else:
                    w.layout.height = "16px"
                    b.icon = "arrow-circle-right"

        return f

    style = {"description_width": "initial"}
    bars = {}
    labels = {}
    lastJob = None

    progressbars = VBox([])

    # NOTE(review): cancel_button is configured but never added to
    # `indicator`, so it is not displayed - confirm whether that is intended.
    cancel_button = Button(button_style="", tooltip="Cancel Spark Job",
                           icon="window-close")
    cancel_button.add_class("db-button")
    cancel_button.on_click(cancel)

    toggle_button = Button(button_style="", tooltip="Toggle progress bar",
                           icon="arrow-circle-right")
    toggle_button.add_class("db-button")
    toggle_button.on_click(toggle(progressbars))

    indicator = HBox([toggle_button, progressbars])

    while self.running == 1:
        time.sleep(0.2)

        # Query each job once; skip jobs whose info is no longer available.
        jobs = []
        for jobid in self.tracker.getJobIdsForGroup(self.job_info.group_id):
            info = self.tracker.getJobInfo(jobid)
            if info is not None and info.status == "RUNNING":
                jobs.append((jobid, info))

        for j, job in jobs:
            if bars.get(j, None) is None:
                # A new job started: the previous one must have finished.
                if lastJob is not None:
                    bars[lastJob].value = 100.0
                bars[j] = FloatProgress(
                    value=0.0,
                    min=0.0,
                    max=100.0,
                    description="Job: %04d Stage: %04d" % (j, 0),
                    bar_style="info",
                    orientation="horizontal",
                    style=style,
                )
                bars[j].add_class("db-bar")
                labels[j] = Label(
                    value="",
                    description="Code:",
                    disabled=False,
                    layout=Layout(width="800px", height="100%",
                                  margin="0 0 0 5px"),
                )
                labels[j].add_class("db-label")

                progressbar = HBox([bars[j], labels[j]])
                progressbars.children = progressbars.children + (
                    progressbar,
                )
                # Show the container only once, on the first job.
                if not self.progressbar_showing:
                    self.progressbar_showing = True
                    display(indicator)

            lastJob = j
            stageIds = sorted(job.stageIds)
            for s in stageIds:
                stageInfo = self.tracker.getStageInfo(s)
                if stageInfo is None:
                    # Stage info not available (yet, or any more).
                    continue
                bars[j].description = "Job: %04d Stage: %04d" % (j, s)
                labels[j].value = "code: '%s' / stages: %s" % (
                    stageInfo.name,
                    str(stageIds)[1:-1],
                )
                if stageInfo.numActiveTasks > 0:
                    progress = int(100 * stageInfo.numCompletedTasks /
                                   stageInfo.numTasks)
                    bars[j].value = progress

    if lastJob is not None and self.running == 0:
        bars[lastJob].value = 100.0
def getTS(self, startDate, endDate):
    """Download one time series per day from ``startDate`` to ``endDate``.

    ``self.df`` is reset to an empty frame and then extended, day by day,
    with 24 hourly rows built from the dataset opened at
    ``self.getUrlMERRA(day)``.  Each variable is read at the grid indices
    ``self.ilat`` / ``self.ilon`` set by ``self.lonlatToIndex``.  A progress
    bar and two status labels are displayed inside ``self.out_cp``.  On the
    first call (``self.init`` true) the session is initialised through
    ``self.initSession`` and counted as one extra progress step.

    Args:
        startDate: First day to fetch (inclusive).
        endDate: Last day to fetch (inclusive).

    Side effects:
        Sets ``self.df``, may clear ``self.init``, and sets
        ``self.stateChange`` to ``False``.
    """
    columns = [
        'datetime', 'AOD', 'DUST_PM', 'SALT_PM', 'ORG_CARB', 'BLK_CARB',
        'SO4', 'PM2.5'
    ]
    # (output column, dataset variable) pairs that get the common scaling,
    # listed in the order they are fetched and stored.
    scaled_vars = (
        ('DUST_PM', 'DUSMASS25'),
        ('SALT_PM', 'SSSMASS25'),
        ('ORG_CARB', 'OCSMASS'),
        ('BLK_CARB', 'BCSMASS'),
        ('SO4', 'SO4SMASS'),
    )
    scale = 1000000000.0

    total_steps = (endDate - startDate).days + 1
    if self.init:
        total_steps += 1  # session initialisation counts as one step
    one_day = timedelta(days=1)
    day = startDate
    self.df = pd.DataFrame(columns=columns)

    def percent_text(done):
        return '{:.1f}%'.format((float(done) / float(total_steps)) * 100.0)

    def at_point(dataset, name):
        # All time steps of `name` at the selected grid cell.
        return np.squeeze(dataset[name][:, self.ilat, self.ilon])

    with self.out_cp:
        self.out_cp.clear_output()
        bar = IntProgress(min=0, max=int(total_steps))
        bar.description = 'Progress:'
        percent_label = Label('0%')
        status_label = Label(' ')
        top_row = HBox([bar, percent_label])
        bottom_row = HBox([status_label],
                          layout=Layout(justify_content='center'))
        display(VBox([top_row, bottom_row]))

        steps_done = 0
        if self.init:
            status_label.value = 'Initializing NASA Earth Data Connection..'
            self.initSession()
            self.init = False
            bar.value += 1
            steps_done += 1
            percent_label.value = percent_text(steps_done)

        self.lonlatToIndex(self.plon, self.plat)

        while day <= endDate:
            url = self.getUrlMERRA(day)
            status_label.value = 'Accessing data for {}'.format(day)
            dataset = open_url(url, session=self.session)

            aod = at_point(dataset, 'TOTEXTTAU')
            mass = {}
            for column, variable in scaled_vars:
                mass[column] = at_point(dataset, variable) * scale
            pm25 = (1.375 * mass['SO4'] + 1.6 * mass['ORG_CARB'] +
                    mass['BLK_CARB'] + mass['DUST_PM'] + mass['SALT_PM'])

            hours = pd.date_range(day, periods=24, freq='H')
            day_values = {'datetime': hours, 'AOD': aod}
            day_values.update(mass)
            day_values['PM2.5'] = pm25

            self.df = pd.concat([self.df, pd.DataFrame(day_values)])

            day += one_day
            steps_done += 1
            percent_label.value = percent_text(steps_done)
            bar.value += 1

    self.stateChange = False