Exemplo n.º 1
0
def progressify(iterable, n):
    """Yield the items of *iterable* while driving a notebook progress bar.

    An ``IntProgress`` bar (range ``0..n``) and a ``Label`` are displayed
    up front.  Each time the consumer asks for the next item, the bar is
    advanced by one and the percentage / elapsed-time texts are refreshed.
    The bar is marked ``'success'`` only once *iterable* is exhausted.

    Args:
        iterable: Source of items; they are yielded unchanged and in order.
        n: Expected number of items.  Used as the bar maximum and as the
            denominator of the displayed percentage.

    Yields:
        Each item of *iterable*.
    """
    start_time = time.time()
    progress = IntProgress(min=0, max=n, layout=Layout(width='100%'))
    text = Label(layout=Layout(width='100%'))
    display(progress)
    display(text)
    for it in iterable:
        yield it
        # Everything below runs when the consumer requests the next item,
        # i.e. after it has finished processing `it`.
        progress.value += 1
        elapsed_time = time.time() - start_time
        # A zero `n` with a non-empty iterable used to raise
        # ZeroDivisionError here; report 0% instead of aborting iteration.
        percent = progress.value * 100.0 / n if n else 0.0
        progress.description = '%.1f%% (%s / %s)' % (percent, progress.value,
                                                     n)
        text.value = 'elapsed %s' % datetime.timedelta(seconds=elapsed_time)
    progress.bar_style = 'success'
Exemplo n.º 2
0
def compare_models(data_gen, model_2d, model_3d=None):
    """Run ``model_2d`` over every item of ``data_gen`` and collect comparisons.

    For each index ``i`` in ``range(data_gen.epochs)`` the first sample of the
    batch is reshaped to ``(80, 120, 120, 1)``, predicted with ``model_2d`` and
    passed together with the first target to ``compared_segments``.  A progress
    bar with an ``"i/total"`` label is shown while the loop runs.

    Args:
        data_gen: Indexable data generator exposing an ``epochs`` attribute;
            ``data_gen[i]`` must return an ``(X, y)`` pair.
        model_2d: Model exposing ``predict``.
        model_3d: Unused; kept for interface compatibility.

    Returns:
        pandas.DataFrame: the results of ``compared_segments`` stacked in
        iteration order with a fresh 0..N-1 index (empty if there are no
        items).
    """
    total = data_gen.epochs
    f = IntProgress(min=0, max=total)
    l = Label("0/" + str(total))
    H = HBox([f, l])
    display(H)  # display the bar and label

    # Collect per-item frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    frames = []

    for i in range(total):
        X, y = data_gen[i]
        y_pred = model_2d.predict(X[0].reshape(80, 120, 120, 1))
        result = compared_segments(y_pred, y[0])
        # Mirror what DataFrame.append accepted: a frame, a list of
        # row-likes, or a single dict/Series describing one row.
        if isinstance(result, list):
            result = pd.DataFrame(result)
        elif not isinstance(result, pd.DataFrame):
            result = pd.DataFrame([result])
        frames.append(result)

        # Update Progress Bar
        f.value += 1
        l.value = str(f.value) + "/" + str(total)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
Exemplo n.º 3
0
def downsample(img_folder, out_folder, sample = True, split = .8, down_rate = 1, crop = None, dim="3D"):
    """Crop, downsample and split image/segmentation volumes into train/test.

    Every file in ``img_folder`` whose name ends in ``"MR.npz"`` is loaded
    together with its segmentation (same name with ``".npz"`` replaced by
    ``"seg.npz"``).  Both arrays are cropped, then split into
    ``down_rate**3`` interleaved sub-volumes (one per ``(i, j, k)`` offset)
    which are written with ``np.savez_compressed`` below
    ``out_folder/{train,test}/{imgs,segs}/``.

    Args:
        img_folder: Directory holding the input ``.npz`` files.
        out_folder: Root of the output directory tree (created if missing
            via ``create_dir``).
        sample: If true, call ``display_train_test`` on the output when done.
        split: Fraction of the files (in listing order) that goes to train;
            the remainder goes to test.
        down_rate: Stride used along each of the three axes.
        crop: Optional ``(a1, a2, b1, b2, c1, c2)`` index bounds.  When
            falsy, the full shape of the first file is used.
        dim: ``"3D"`` to save whole sub-volumes, ``"2D"`` to save every
            slice along the first axis as its own file.

    Returns:
        The shape of the last downsampled image volume, or ``None`` when
        ``dim`` is invalid or no input files are found.
    """
    # Check if dim is properly defined
    if dim not in ("2D", "3D"):
        print("dim is not either 2D or 3D")
        return

    # Load all of the base filenames, ignoring all other files in directory
    base_files = [file for file in os.listdir(img_folder) if file.endswith("MR.npz")]

    # Nothing to process: previously this crashed further down with an
    # IndexError / NameError after partially setting up the output.
    if not base_files:
        print("No files ending in 'MR.npz' found in", img_folder)
        return

    # Check if the output directories exist. If not, create them.
    for sub in ("", "/train", "/test", "/train/imgs", "/train/segs",
                "/test/imgs", "/test/segs"):
        create_dir(out_folder + sub)

    # Set up progress bar.
    progress = IntProgress(min=0, max=len(base_files))
    label = Label("Loading File")
    display(HBox([progress, label]))  # display the bar and label

    # Start out writing to the training folders.
    out_fol_img = out_folder + "/train/imgs/"
    out_fol_seg = out_folder + "/train/segs/"
    tt = "Train: "  # The label for the progress bar

    # Without an explicit crop, use the full extent of the first file.
    if not crop:
        a1 = b1 = c1 = 0
        with np.load(img_folder + "/" + base_files[0]) as first:
            (a2, b2, c2) = first['arr_0'].shape
    else:
        (a1, a2, b1, b2, c1, c2) = crop

    print("Cropping to ", a1,a2,b1,b2,c1,c2)

    # For each file, load both the file and segmentation in. Downsample both and output.
    ds = down_rate
    for n, file in enumerate(base_files):
        stem = file[:-4]  # file name without the ".npz" extension
        # Use context managers so the underlying zip handles are closed.
        with np.load(img_folder + "/" + file) as data:
            img = data['arr_0'][a1:a2, b1:b2, c1:c2]
        with np.load(img_folder + "/" + stem + "seg.npz") as data:
            seg = data['arr_0'][a1:a2, b1:b2, c1:c2]

        # Once past the split point, all remaining files go to test.
        if (n + 1) > len(base_files) * split:
            out_fol_img = out_folder + "/test/imgs/"
            out_fol_seg = out_folder + "/test/segs/"
            tt = "Test: "

        for i in range(ds):
            for j in range(ds):
                for k in range(ds):
                    # Unique index of this (i, j, k) offset combination.
                    N = str(i + ds*j + (ds**2)*k)
                    ds_img = img[i::ds, j::ds, k::ds]
                    ds_seg = seg[i::ds, j::ds, k::ds]

                    # Compare by value: `is` on string literals tests
                    # identity and is not guaranteed to match equal strings.
                    if dim == "3D":
                        np.savez_compressed(out_fol_img + stem + N + ".npz", ds_img)
                        np.savez_compressed(out_fol_seg + stem + N + ".npz", ds_seg)
                    elif dim == "2D":
                        # Iterate over the slices that actually exist in the
                        # downsampled volume; range(a2 - a1) overran it
                        # whenever down_rate > 1.
                        for r in range(ds_img.shape[0]):
                            np.savez_compressed(out_fol_img + stem + N + "_" + str(r) + ".npz", ds_img[r, :, :])
                            np.savez_compressed(out_fol_seg + stem + N + "_" + str(r) + ".npz", ds_seg[r, :, :])

        progress.value += 1  # signal to increment the progress bar
        label.value = tt + file

    # Display a sample output if requested
    if sample:
        display_train_test(out_folder, dim=dim)

    # Summarize preprocessing info
    volumes_per_file = ds**3
    n_train_files = np.floor(len(base_files) * split)

    print("Train Images:", int(volumes_per_file * n_train_files))
    print("Test Images:", int(volumes_per_file * (len(base_files) - n_train_files)))
    print("Dimensions:", ds_img.shape)

    return ds_img.shape
Exemplo n.º 4
0
    def explain_instance(self,
                         instance,
                         num_reps=50,
                         num_features=4,
                         neighborhood_samples=10000,
                         use_cov_matrix=False,
                         verbose=False,
                         figure_dir=None):
        """Explain one instance repeatedly with LIME and SHAP and plot metrics.

        The instance is explained ``num_reps`` times with ``self.LIMEEXPL``
        and ``self.SHAPEXPL``.  For every repetition the surrogate models
        returned by ``get_LIME_classifier`` / ``get_SHAP_classifier`` are
        evaluated with ``eval_whitebox_classifier`` and the resulting values
        are stored as one row of ``self.metrics``.  The Jaccard similarities
        between the per-repetition top-feature masks are stored in
        ``self.{lime,shap}_{avg,std}_jaccard_bin``.  Finally the last LIME and
        SHAP explanations and one box-plot summary per method are displayed.

        Args:
            instance: The instance to explain (deep-copied before use).
            num_reps: Number of times the explanation is repeated (>= 1).
            num_features: Number of top features kept in each explanation.
            neighborhood_samples: Number of random neighbourhood samples
                drawn around the instance.
            use_cov_matrix: If true, sample the neighbourhood with an
                empirical covariance matrix instead of unit covariance.
            verbose: If true, print the black-box probability and label.
            figure_dir: Optional path prefix; when given, each summary plot
                is saved as ``figure_dir + method + "_leaf.pdf"``.

        Raises:
            ValueError: If ``num_reps`` is smaller than 1.
        """
        if num_reps < 1:
            # The metrics below need at least one explanation; previously
            # this failed late with an AttributeError on `None`.
            raise ValueError("num_reps must be at least 1")

        npEX = np.array(self.EX)
        cls_proba = self.bb_classifier.predict_proba

        x0 = copy.deepcopy(instance)  # instance to be explained
        # Pristine Mock: its __dict__ lists the bookkeeping attributes that
        # must be filtered out of the per-repetition result objects below.
        mockobj = mock.Mock()

        # Neighborhood random samples
        # NOTE(review): `X` is not defined in this method; it must come from
        # module scope for use_cov_matrix=True to work -- confirm.
        cov_matrix = np.cov(
            ((X - npEX) / self.StdX).T) if use_cov_matrix else 1.0
        NormV = scipy.stats.multivariate_normal.rvs(mean=np.zeros(self.F),
                                                    cov=cov_matrix,
                                                    size=neighborhood_samples,
                                                    random_state=10)

        # Get the output of the black-box classifier on x0
        output = cls_proba([x0])[0]
        label_x0 = 1 if output[1] >= output[0] else 0
        prob_x0 = output[label_x0]
        if verbose:
            print('prob_x0', prob_x0, '   label_x0',
                  self.class_names[label_x0])

        # Prepare instance for LIME: standardise x0.  `where=` without
        # `out=` leaves the masked positions uninitialised, so provide a
        # zero-filled output for features whose std is (close to) zero.
        centered_x0 = np.asarray(x0 - npEX, dtype=float)
        std_x = np.asarray(self.StdX, dtype=float)
        lime_x0 = np.divide(centered_x0,
                            std_x,
                            out=np.zeros_like(centered_x0),
                            where=np.logical_not(np.isclose(std_x, 0)))
        shap_x0 = (x0 - npEX)

        # One dict per repetition; turned into a DataFrame after the loop
        # (DataFrame.append was removed in pandas 2.0).
        records = []
        progbar = IntProgress(min=0, max=num_reps)
        label = Label(value="")
        display(HBox([Label("K=%d " % (num_features)), progbar, label]))

        # Explain the same instance x0 multiple times
        for rnum in range(num_reps):
            label.value = "%d/%d" % (rnum + 1, num_reps)
            R = mock.Mock()  # store all the computed metrics
            R.rnum, R.prob_x0 = rnum, prob_x0

            # Explain the instance x0 with LIME
            lime_expl = self.LIMEEXPL.explain_instance(
                np.array(x0),
                cls_proba,
                num_features=num_features,
                top_labels=1,
                num_samples=self.explanation_samples)

            # Explain x0 using SHAP
            shap_phi = self.SHAPEXPL.shap_values(x0, l1_reg="num_features(10)")
            shap_phi0 = self.SHAPEXPL.expected_value

            # Take only the top @num_features from shap_phi
            argtop = np.argsort(np.abs(shap_phi[0]))
            for k in range(len(shap_phi)):
                shap_phi[k][argtop[:(self.F - num_features)]] = 0

            # Recover both the LIME and the SHAP classifiers
            R.lime_g = get_LIME_classifier(lime_expl, label_x0, x0)
            R.shap_g = get_SHAP_classifier(label_x0, shap_phi, shap_phi0, x0,
                                           self.EX)

            #----------------------------------------------------------
            # Evaluate the white box classifiers
            EL = eval_whitebox_classifier(R,
                                          R.lime_g,
                                          npEX,
                                          self.StdX,
                                          NormV,
                                          x0,
                                          label_x0,
                                          cls_proba,
                                          "lime",
                                          precision_recalls=True)
            ES = eval_whitebox_classifier(R,
                                          R.shap_g,
                                          npEX,
                                          np.ones(len(x0)),
                                          NormV * self.StdX,
                                          x0,
                                          label_x0,
                                          cls_proba,
                                          "shap",
                                          precision_recalls=True)

            R.lime_local_discr = np.abs(
                R.lime_g.predict([lime_x0])[0] - prob_x0)
            R.shap_local_discr = np.abs(
                R.shap_g.predict([shap_x0])[0] - prob_x0)

            # Indices of the features, ordered by ascending absolute weight
            R.lime_argtop = np.argsort(np.abs(R.lime_g.coef_))
            R.shap_argtop = np.argsort(np.abs(R.shap_g.coef_))

            # Get the K most important features in the explanation of x0.
            # The ordering is ascending, so the top K are the LAST K entries:
            # indices -1 .. -K.  (Indexing with -k for k in range(K) wrongly
            # started at -0 == 0, i.e. the least important feature.)
            R.mcf_lime = tuple(
                [R.lime_argtop[-(k + 1)] for k in range(num_features)])
            R.mcf_shap = tuple(
                [R.shap_argtop[-(k + 1)] for k in range(num_features)])

            # Binary masks of the argtops
            R.lime_bin_expl, R.shap_bin_expl = np.zeros(self.F), np.zeros(
                self.F)
            R.lime_bin_expl[np.array(R.mcf_lime)] = 1
            R.shap_bin_expl[np.array(R.mcf_shap)] = 1

            # Keep only the attributes that were set on R explicitly, i.e.
            # drop everything a pristine Mock already carries.
            R_keys = copy.copy(R.__dict__)
            for key in list(R_keys):
                if key.startswith("wb_"):
                    # NOTE(review): `wb_name` is not defined in this method,
                    # so this branch raises NameError if any "wb_*" attribute
                    # is ever present -- confirm the intended prefix.
                    R_keys[wb_name + key[2:]] = R_keys.pop(key)
                elif key in mockobj.__dict__:
                    del R_keys[key]

            records.append({k: R.__dict__[k] for k in R_keys})
            progbar.value += 1

        label.value += " Done."

        # One row per repetition, one column per recorded attribute.
        rows = pd.DataFrame(records)

        # Jaccard similarities between the various explanations (stability)
        lime_jaccard_mat = 1 - pdist(np.stack(rows.lime_bin_expl, axis=0),
                                     'jaccard')
        shap_jaccard_mat = 1 - pdist(np.stack(rows.shap_bin_expl, axis=0),
                                     'jaccard')
        self.lime_avg_jaccard_bin, self.lime_std_jaccard_bin = np.mean(
            lime_jaccard_mat), np.std(lime_jaccard_mat)
        self.shap_avg_jaccard_bin, self.shap_std_jaccard_bin = np.mean(
            shap_jaccard_mat), np.std(shap_jaccard_mat)

        # LIME/SHAP explanation comparisons (computed but not used further
        # in this method)
        lime_shap_jaccard_mat = 1 - cdist(np.stack(rows.lime_bin_expl, axis=0),
                                          np.stack(rows.shap_bin_expl, axis=0),
                                          'jaccard')
        lime_shap_avg_jaccard_bin, lime_shap_std_jaccard_bin = np.mean(
            lime_shap_jaccard_mat), np.std(lime_shap_jaccard_mat)

        # store the metrics for later use
        self.metrics = rows

        def leaf_plot(stability, method):
            """Box-plot the collected metrics for one method ('lime'/'shap')."""
            fig, ax1 = plt.subplots(figsize=(6, 2.2))
            data = [
                stability.flatten(),
                1 - rows[method + '_local_discr'],
                rows[method + '_fidelity_f1'],
                1 - 2 * np.abs(rows[method + '_boundary_discr'])
            ]

            ax1.tick_params(axis='both', which='major', labelsize=12)
            ax1.set_xlabel('distribution')
            ax1.set_ylabel('LEAF metrics', color='black', fontsize=15)
            ax1.boxplot(data, vert=False, widths=0.7)
            ax1.tick_params(axis='y', labelcolor='#500000')
            ax1.set_yticks(np.arange(1, len(data) + 1))
            ax1.set_yticklabels([
                'Stability', 'Local Concordance', 'Fidelity', 'Prescriptivity'
            ])
            ax1.set_xlim([-0.05, 1.05])
            ax1.invert_yaxis()

            ax2 = ax1.twinx(
            )  # instantiate a second axes that shares the same x-axis
            ax2.tick_params(axis='both', which='major', labelsize=12)
            ax2.set_ylabel(
                'Values',
                color='#000080')  # we already handled the x-label with ax1
            ax2.boxplot(data, vert=False, widths=0.7)
            ax2.tick_params(axis='y', labelcolor='#000080')
            ax2.set_yticks(np.arange(1, len(data) + 1))
            # Right-hand tick labels show "mean ± std" of each metric.
            ax2.set_yticklabels(
                ["  %.3f ± %.3f  " % (np.mean(d), np.std(d)) for d in data])
            ax2.invert_yaxis()

            fig.tight_layout(
            )  # otherwise the right y-label is slightly clipped
            if figure_dir is not None:
                imgname = figure_dir + method + "_leaf.pdf"
                print('Saving', imgname)
                plt.savefig(imgname, dpi=150, bbox_inches='tight')
            plt.show()

        # Show LIME explanation (from the last repetition)
        display(HTML("<h2>LIME</h2>"))
        lime_expl.show_in_notebook(show_table=True, show_all=False)
        leaf_plot(lime_jaccard_mat, 'lime')

        # Show SHAP explanation (from the last repetition)
        display(HTML("<h2>SHAP</h2>"))
        display(shap.force_plot(shap_phi0[label_x0], shap_phi[label_x0], x0))
        leaf_plot(shap_jaccard_mat, 'shap')

        # Hard-wired off: the prescription report below is currently never
        # executed.
        prescription = False
        if prescription:
            print("====================================================")
            lime_x1, lime_sx1 = EL
            shap_x1, shap_sx1 = ES

            print(
                'SHAP accuracy %f balanced_accuracy %f precision %f recall %f'
                % (rows.shap_prescriptivity.mean(),
                   rows.shap_bal_prescriptivity.mean(),
                   rows.shap_precision_x1.mean(), rows.shap_recall_x1.mean()))

            lime_diff = (rows.iloc[-1].lime_g.coef_ != 0) * (lime_x1 - x0)
            shap_diff = (rows.iloc[-1].shap_g.coef_ != 0) * (shap_x1 - x0)

            print(np.array(rows.iloc[-1].lime_g.coef_ != 0))
            print('lime_diff\n', lime_diff)
            print('shap_diff\n', shap_diff)

            lime_output_x1 = cls_proba([lime_x1])[0]
            shap_output_x1 = cls_proba([shap_x1])[0]
            lime_label_x1 = 1 if lime_output_x1[1] >= lime_output_x1[0] else 0
            shap_label_x1 = 1 if shap_output_x1[1] >= shap_output_x1[0] else 0

            print("LIME(x1) prob =", lime_output_x1)
            print("SHAP(x1) prob =", shap_output_x1)

            # Show LIME explanation.  The explainers and the feature count
            # live on the instance; the bare names used before were
            # undefined in this scope.
            lime_expl = self.LIMEEXPL.explain_instance(
                np.array(shap_x1),
                cls_proba,
                num_features=num_features,
                top_labels=1,
                num_samples=self.explanation_samples)
            lime_expl.show_in_notebook(show_table=True, show_all=False)

            # Show SHAP explanation
            shap_phi = self.SHAPEXPL.shap_values(shap_x1,
                                                 l1_reg="num_features(10)")
            shap_phi0 = self.SHAPEXPL.expected_value
            argtop = np.argsort(np.abs(shap_phi[0]))
            for k in range(len(shap_phi)):
                shap_phi[k][argtop[:(self.F - num_features)]] = 0
            display(
                shap.force_plot(shap_phi0[shap_label_x1],
                                shap_phi[shap_label_x1], shap_x1))
Exemplo n.º 5
0
    def worker(self):
        """Poll the Spark status tracker and render per-job progress bars.

        While ``self.running == 1`` the jobs of ``self.job_info.group_id``
        are polled every 0.2 s.  For every job seen in state ``"RUNNING"`` a
        ``FloatProgress`` bar and a ``Label`` are created on first sight and
        then updated from the job's stage information.  The bar of the
        previously seen job is set to 100 % when a new job appears, and the
        last bar is completed when the loop ends with ``self.running == 0``.
        """
        def cancel(b):
            # Button callback: cancel every job of the tracked job group.
            self.sc.cancelJobGroup(self.job_info.group_id)

        def toggle(widget):
            # Build a button callback that collapses/expands the children of
            # `widget` by switching their layout height between 0px and 16px.
            def f(b):
                for w in widget.children:
                    h = w.layout.height
                    if h is None or h == "16px":
                        w.layout.height = "0px"
                        b.icon = "arrow-circle-down"
                    else:
                        w.layout.height = "16px"
                        b.icon = "arrow-circle-right"

            return f

        style = {"description_width": "initial"}
        bars = {}    # job id -> FloatProgress
        labels = {}  # job id -> Label
        lastJob = None

        progressbars = VBox([])

        cancel_button = Button(button_style="",
                               tooltip="Cancel Spark Job",
                               icon="window-close")
        cancel_button.add_class("db-button")
        cancel_button.on_click(cancel)

        toggle_button = Button(button_style="",
                               tooltip="Toggle progress bar",
                               icon="arrow-circle-right")
        toggle_button.add_class("db-button")
        toggle_button.on_click(toggle(progressbars))

        indicator = HBox([toggle_button, progressbars])

        while self.running == 1:
            time.sleep(0.2)
            # Query each job exactly once and skip jobs whose info is no
            # longer available (the tracker may return None), so that the
            # status check and the stored info come from the same snapshot.
            jobs = []
            for jobid in self.tracker.getJobIdsForGroup(
                    self.job_info.group_id):
                info = self.tracker.getJobInfo(jobid)
                if info is not None and info.status == "RUNNING":
                    jobs.append((jobid, info))

            for j, job in jobs:
                if bars.get(j, None) is None:
                    # A new job showed up: the previous one is finished.
                    if lastJob is not None:
                        bars[lastJob].value = 100.0
                    bars[j] = FloatProgress(
                        value=0.0,
                        min=0.0,
                        max=100.0,
                        description="Job: %04d Stage: %04d" % (j, 0),
                        bar_style="info",
                        orientation="horizontal",
                        style=style,
                    )
                    bars[j].add_class("db-bar")
                    labels[j] = Label(
                        value="",
                        description="Code:",
                        disabled=False,
                        layout=Layout(width="800px",
                                      height="100%",
                                      margin="0 0 0 5px"),
                    )
                    labels[j].add_class("db-label")

                    progressbar = HBox([bars[j], labels[j]])
                    progressbars.children = progressbars.children + (
                        progressbar, )
                    # Show the indicator only once, on the first bar.
                    if not self.progressbar_showing:
                        self.progressbar_showing = True
                        display(indicator)

                lastJob = j
                stageIds = sorted(job.stageIds)
                for s in stageIds:
                    stageInfo = self.tracker.getStageInfo(s)
                    if stageInfo is None:
                        # Stage info not available (any more); nothing to
                        # report for this stage.
                        continue
                    bars[j].description = "Job: %04d Stage: %04d" % (j, s)
                    labels[j].value = "code: '%s' / stages: %s" % (
                        stageInfo.name,
                        str(stageIds)[1:-1],
                    )
                    # Only stages with running tasks drive the bar.
                    if stageInfo.numActiveTasks > 0:
                        progress = int(100 * stageInfo.numCompletedTasks /
                                       stageInfo.numTasks)
                        bars[j].value = progress

        if lastJob is not None and self.running == 0:
            bars[lastJob].value = 100.0
Exemplo n.º 6
0
    def getTS(self, startDate, endDate):
        """Fetch the per-day datasets between two dates into ``self.df``.

        For every day from ``startDate`` to ``endDate`` (inclusive) the URL
        returned by ``self.getUrlMERRA`` is opened with ``open_url`` and the
        time series at grid index ``(self.ilat, self.ilon)`` is extracted for
        each variable.  Each day contributes 24 rows (hourly timestamps
        starting at that date).  Progress is shown inside ``self.out_cp``.
        On first use (``self.init`` true) the session is initialised via
        ``self.initSession`` and counted as one extra progress step.

        Args:
            startDate: First day to fetch.
            endDate: Last day to fetch (inclusive).

        Side effects:
            Sets ``self.df`` (columns ``datetime``, ``AOD``, ``DUST_PM``,
            ``SALT_PM``, ``ORG_CARB``, ``BLK_CARB``, ``SO4``, ``PM2.5``) and,
            on success, resets ``self.stateChange`` to ``False``.
        """
        ndays = (endDate - startDate).days + 1
        if self.init:
            ndays += 1  # session initialisation counts as one progress step
        currentDate = startDate
        delta = timedelta(days=1)

        self.df = pd.DataFrame(columns=[
            'datetime', 'AOD', 'DUST_PM', 'SALT_PM', 'ORG_CARB', 'BLK_CARB',
            'SO4', 'PM2.5'
        ])
        # Per-day frames are collected and concatenated once at the end:
        # concatenating onto the empty frame in every iteration is quadratic
        # and relies on pandas' deprecated treatment of empty frames when
        # determining result dtypes.
        frames = []

        # Scale factor applied to all mass variables.
        # NOTE(review): presumably kg -> micrograms; confirm against the
        # dataset's units.
        scale = 1000000000.0

        def at_point(dataset, name):
            # Time series of variable `name` at the selected grid cell.
            return np.squeeze(dataset[name][:, self.ilat, self.ilon])

        with self.out_cp:
            self.out_cp.clear_output()

            pbar = IntProgress(min=0, max=int(ndays))
            pbar.description = 'Progress:'
            info1 = Label('0%')
            info2 = Label(' ')
            display(
                VBox([
                    HBox([pbar, info1]),
                    HBox([info2], layout=Layout(justify_content='center'))
                ]))

            progVal = 0
            if self.init:
                info2.value = 'Initializing NASA Earth Data Connection..'
                self.initSession()
                self.init = False
                pbar.value += 1
                progVal += 1
                info1.value = '{:.1f}%'.format(
                    (float(progVal) / float(ndays)) * 100.0)

            self.lonlatToIndex(self.plon, self.plat)

            try:
                while currentDate <= endDate:
                    url = self.getUrlMERRA(currentDate)
                    info2.value = 'Accessing data for {}'.format(currentDate)

                    dataset = open_url(url, session=self.session)
                    aod = at_point(dataset, 'TOTEXTTAU')
                    dust_pm = at_point(dataset, 'DUSMASS25') * scale
                    salt_pm = at_point(dataset, 'SSSMASS25') * scale
                    org_carb = at_point(dataset, 'OCSMASS') * scale
                    blk_carb = at_point(dataset, 'BCSMASS') * scale
                    so4 = at_point(dataset, 'SO4SMASS') * scale
                    pm25 = (1.375 * so4 + 1.6 * org_carb + blk_carb +
                            dust_pm + salt_pm)
                    dt = pd.date_range(currentDate, periods=24, freq='H')
                    frames.append(
                        pd.DataFrame({
                            'datetime': dt,
                            'AOD': aod,
                            'DUST_PM': dust_pm,
                            'SALT_PM': salt_pm,
                            'ORG_CARB': org_carb,
                            'BLK_CARB': blk_carb,
                            'SO4': so4,
                            'PM2.5': pm25
                        }))
                    currentDate += delta
                    progVal += 1
                    info1.value = '{:.1f}%'.format(
                        (float(progVal) / float(ndays)) * 100.0)
                    pbar.value += 1
            finally:
                # Keep whatever was fetched, even if a later day failed
                # (the incremental version also retained partial results).
                if frames:
                    self.df = pd.concat(frames)
        self.stateChange = False