Python make_dataset 예제들, experiment_data.make_dataset Python 예제들

예제 #1

0

파일 보기

def load_prediction_scores(outcome, feats, split="test"):
    """Load saved prediction scores for one outcome plus the matching data rows.

    Args:
        outcome: outcome column name. Any name containing ``"trop_"`` is read
            from the shared ``troponin`` dataset and output directory.
        feats: if it contains ``".pkl"`` it is treated as a model file and
            loaded with ``base.load_model``; otherwise it is interpolated into
            ``lreg-<feats>.pkl`` and the pickled results for ``outcome`` are
            used.
        split: which scores to return (``"train"``, ``"val"`` or ``"test"``).

    Returns:
        Tuple ``(zs, Ys, Xs, encs)``: the scores, and the rows of the outcome
        column, the feature frame and the encounter frame selected by the
        scores' index.
    """
    # every troponin sub-outcome shares one dataset and one output directory
    dataset = 'troponin' if "trop_" in outcome else outcome
    outcome_dir = "prediction-output/%s" % dataset
    Xdf, Ydf, encdf = ed.make_dataset(outcome=dataset,
                                      features='simple',
                                      do_split=False)

    # per-split metadata; only needed when the stored scores carry no index
    _, _, split_meta = misc.split_data(Xdf.values, Ydf.values,
                                       encdf.split, encdf)

    if '.pkl' not in feats:
        # logistic-regression results pickled per feature set
        lreg_path = os.path.join(outcome_dir, "lreg-%s.pkl" % feats)
        with open(lreg_path, 'rb') as f:
            res = pickle.load(f)[outcome]
        zs = res['z%s' % split]
    else:
        print("  ... loading mod file %s" % feats)
        mod = base.load_model(os.path.join(outcome_dir, feats))
        zs = mod.fit_res['z%s-enc-%s' % (split, outcome)]
        if not hasattr(zs, 'index'):
            # un-indexed scores: attach the index of the requested split
            position = ['train', 'val', 'test'].index(split)
            zs = pd.Series(zs, index=split_meta[position].index)

    # restrict every frame to the rows that have a score
    rows = zs.index
    return zs, Ydf.loc[rows][outcome], Xdf.loc[rows], encdf.loc[rows]

예제 #2

0

파일 보기

파일: simple_baseline.py 프로젝트: andymiller/ekg-risk-models

def run_logistic(outcome="mace", features="simple"):
    """Fit L2-penalized logistic regressions, one per outcome column.

    Args:
        outcome: outcome name passed to ``ed.make_dataset``.
        features: feature-set name passed to ``ed.make_dataset``.

    Returns:
        dict mapping each outcome column to the result dict returned by
        ``misc.run_logistic``, extended with ``ztrain`` / ``zval`` / ``ztest``
        Series holding ``logit(exp(lnp_<split>))`` indexed like the
        corresponding split metadata.
    """
    Xdf, Ydf, encdf = ed.make_dataset(
        outcome=outcome, features=features, do_split=False)

    fits = {}
    for col in Ydf.columns:
        print("  working on outcome %s "%col)

        # drop rows whose outcome is missing, then split
        target = Ydf[col]
        missing = pd.isnull(target)
        Xdata, Ydata, mdata = misc.split_data(
            Xdf[~missing].values,
            target[~missing].values,
            encdf.split[~missing],
            encdf[~missing])

        penalty_grid = np.logspace(-2.5, 1, 20)
        fit = misc.run_logistic(Xdata, Ydata, Xdf.columns.tolist(),
                                penalty="l2", cs=penalty_grid)

        # tie the train / val / test scores back to the split's row index
        for position, part in enumerate(("train", "val", "test")):
            key = "z" + part
            fit[key] = pd.Series(logit(np.exp(fit["lnp_" + part])),
                                 index=mdata[position].index,
                                 name=key)
        fits[col] = fit

    return fits

예제 #3

0

파일 보기

def plot_full_ekg_example():
    """Save one full-EKG example figure and two figures of segmented beats.

    Writes ``example_ekg.png``, ``example_beats.png`` and
    ``example_beats-2.png`` under ``prediction-output/``.
    """
    # only the encounter frame is needed to look up the EKG traces
    _, _, encdf = ed.make_dataset(outcome='mace',
                                  features='remarks',
                                  do_split=False)
    Xmat, _, tgrid = ed.load_ekg_data(encdf=encdf, constrain_range=False)

    # full trace of the record at position 300
    fig = plt.figure(figsize=(12, 3))
    ax = plt.gca()
    ax = viz.plot_stacked_ecg(ax, Xmat[300], linewidth=1.5)
    ax.set_xlabel("time (seconds)", fontsize=16)
    ax.get_yaxis().set_ticks([])
    fig.tight_layout()
    fig.savefig('prediction-output/example_ekg.png',
                bbox_inches='tight', dpi=150)
    plt.close("all")

    # segment the traces into individual beats
    from ekgmodels import preproc
    beatdf, beatmat = preproc.create_beat_dataset_fixed(
        encdf, Xmat, tgrid, detrend=False)

    def save_beat_row(beat_ids, name="example_beats.png"):
        """Plot up to three beats side by side and save them under ``name``."""
        beat_fig, panels = plt.subplots(1, 3, figsize=(6, 3.5))
        for beat_id, panel in zip(beat_ids, panels.flatten()):
            beat_len = beatdf['beat_len'].iloc[beat_id]
            viz.plot_stacked_beat(panel, beatmat[beat_id], beatlen=beat_len)

        # blank the x label on the outer panels and the y label on all but
        # the first panel
        panels[0].set_xlabel("")
        panels[2].set_xlabel("")
        panels[1].set_ylabel("")
        panels[2].set_ylabel("")
        beat_fig.savefig("prediction-output/%s" % name,
                         bbox_inches='tight', dpi=150)
        plt.close("all")

    examples = (
        ([0, 1, 2], "example_beats.png"),
        ([300, 301, 302], "example_beats-2.png"),
    )
    for beat_ids, filename in examples:
        save_beat_row(beat_ids, name=filename)

예제 #4

0

파일 보기

파일: simple_baseline.py 프로젝트: andymiller/ekg-risk-models

def run_beat_mlp(outcome='mace', use_features=False, features='simple'):
    """Train a beat-level MLP and aggregate its scores back to encounters.

    Args:
        outcome: outcome name passed to ``ed.make_dataset``.
        use_features: when truthy, the ``features`` columns are appended to
            every flattened beat and a ``BeatDeepWideMlpClassifier`` is
            trained; otherwise a ``BeatMlpClassifier`` sees the beats only.
        features: feature-set name used for the appended ("wide") columns.

    Returns:
        The fitted model. For every outcome column and every split its
        ``fit_res`` additionally holds ``z<split>-enc-<outcome>`` (mean score
        per encounter index), ``y<split>-enc-<outcome>`` (the matching labels)
        and ``<split>df-<outcome>`` (output of ``misc.classification_stats``
        on the non-missing labels).
    """
    # load outcome and ekg remarks; the remark columns become extra targets
    Xdf, Ydf, encdf = ed.make_dataset(outcome=outcome, features='remarks', do_split=False)
    outcome_names = Ydf.columns
    # NOTE(review): remark_feats is not defined here -- presumably a
    # module-level list of remark column names; confirm.
    Ydf = pd.concat([Ydf, Xdf[remark_feats]], axis=1)
    Xsimp, _, _ = ed.make_dataset(outcome=outcome, features=features, do_split=False)
    assert np.all(Xsimp.index == Ydf.index)

    # beat data ---- beatdf has 'index' as a column (ed_enc_idx)
    beatdf, beatmat = ed.load_beat_data(encdf=encdf, constrain_range=True)

    # expand outcomes and simple features from one row per encounter to one
    # row per beat
    Ybeat      = Ydf.loc[ beatdf['index'] ].values
    Xsimp_beat = Xsimp.loc[ beatdf['index'] ].values

    # split beats and wide features identically
    Xdata, Ydata, mdata = misc.split_data(beatmat, Ybeat, beatdf.split, beatdf)
    Wdata, _, _ = misc.split_data(Xsimp_beat, Ybeat, beatdf.split, beatdf)
    Xtrain, Xval, Xtest = Xdata
    _, n_channels, n_samples = Xtrain.shape

    # both model variants consume one flat vector per beat
    Xtrain, Xval, Xtest = (X.reshape(len(X), -1) for X in (Xtrain, Xval, Xtest))

    # plain truthiness test, consistent with run_ekg_mlp
    if use_features:
        print("Adding %s features to EKG in last layer!"%features)
        Xtrain = np.column_stack([Xtrain, Wdata[0]])
        Xval   = np.column_stack([Xval,   Wdata[1]])
        Xtest  = np.column_stack([Xtest,  Wdata[2]])
        model = mlp.BeatDeepWideMlpClassifier(
                data_dim = n_channels * n_samples,
                total_n_outputs = Ydata[0].shape[1],
                verbose = False,
                hdims   = [500, 500],
                h_dim   = 100,
                dim_wide = Wdata[0].shape[1])
    else:
        model  = mlp.BeatMlpClassifier(data_dim = n_channels*n_samples,
                                       n_outputs=Ydata[0].shape[1],
                                       verbose=False,
                                       hdims=[500, 500])

    # Run optimization
    model.fit((Xtrain, Xval, Xtest), Ydata,
              lr_reduce_interval=20,
              epochs=80,
              learning_rate=1e-3,
              class_weights=False)

    # contract z values back down to encounter-level, save classification data
    for oi, outcome in enumerate(outcome_names):
        for si, split in enumerate(['train', 'val', 'test']):
            zs = pd.Series(model.fit_res['z%s'%split].numpy()[:,oi],
                           index = mdata[si]['index'])
            # average the beat scores that share an encounter index
            zs_mu = zs.groupby(level=0, sort=False).mean()
            yt    = Ydf.loc[zs_mu.index][outcome]
            model.fit_res['z%s-enc-%s'%(split, outcome)] = zs_mu
            model.fit_res['y%s-enc-%s'%(split, outcome)] = yt
            # evaluate only where the label is known
            nan_idx = pd.isnull(yt)
            model.fit_res['%sdf-%s'%(split, outcome)] = \
                misc.classification_stats(yt[~nan_idx].values, zs_mu[~nan_idx].values)

    return model

예제 #5

0

파일 보기

파일: simple_baseline.py 프로젝트: andymiller/ekg-risk-models

def run_ekg_mlp(outcome='mace', use_features=False, features='simple'):
    """Train a ResNet on full EKG traces and store per-split scores.

    Args:
        outcome: outcome name passed to ``ed.make_dataset``.
        use_features: when truthy, the ``features`` columns are appended to
            every flattened trace and an ``EKGDeepWideResNetClassifier`` is
            trained; otherwise an ``EKGResNetClassifier`` sees the traces only.
        features: feature-set name used for the appended ("wide") columns.

    Returns:
        The fitted model. For every outcome column and every split its
        ``fit_res`` additionally holds ``<split>df-<outcome>`` (output of
        ``misc.classification_stats`` on the non-missing labels) and the
        Series ``z<split>-enc-<outcome>`` / ``y<split>-enc-<outcome>``
        indexed like that split's metadata.
    """
    # load outcome and ekg remarks; the remark columns become extra targets
    Xdf, Ydf, encdf = ed.make_dataset(outcome=outcome, features='remarks', do_split=False)
    outcome_names = Ydf.columns
    # NOTE(review): remark_feats is not defined here -- presumably a
    # module-level list of remark column names; confirm.
    Ydf = pd.concat([Ydf, Xdf[remark_feats]], axis=1)
    Xsimp, _, _ = ed.make_dataset(outcome=outcome, features=features, do_split=False)
    assert np.all(Xsimp.index == Ydf.index)

    # full EKG traces, one row per encounter
    Xmat, _, _ = ed.load_ekg_data(encdf=encdf, constrain_range=True)

    # split traces and wide features identically
    Xdata, Ydata, mdata = misc.split_data(Xmat, Ydf.values, encdf.split, encdf)
    Wdata, _, _ = misc.split_data(Xsimp, Ydf.values, encdf.split, encdf)
    Xtrain, Xval, Xtest = Xdata
    # unpacking into three names also checks that the traces are 3-D
    _, n_channels, n_samples = Xtrain.shape

    # drop the last sample of every trace to match the n_samples=1000-1 the
    # models below are built with
    Xtrain, Xval, Xtest = (X[:,:,:-1] for X in Xdata)
    Ytrain, Yval, Ytest = Ydata

    if use_features:
        # flatten each trace and append the wide features as extra columns
        Xtrain, Xval, Xtest = (
            np.column_stack([X.reshape(len(X), -1), W])
            for X, W in zip((Xtrain, Xval, Xtest), Wdata))
        model = resnet.EKGDeepWideResNetClassifier(n_channels=3,
                                           n_samples=1000-1,
                                           total_n_outputs=Ytrain.shape[1],
                                           num_rep_blocks=8,
                                           kernel_size=16,
                                           h_dim = 100,
                                           dim_wide = Wdata[0].shape[1],
                                           verbose=False)
    else:
        model = resnet.EKGResNetClassifier(n_channels=3,
                                           n_samples=1000-1,
                                           n_outputs=Ytrain.shape[1],
                                           num_rep_blocks=8,
                                           kernel_size=16,
                                           verbose=False)
    model.fit((Xtrain, Xval, Xtest), (Ytrain, Yval, Ytest),
              epochs=80,
              learning_rate=1e-3,
              lr_reduce_interval=10,
              class_weights=False)

    # save classification data and index-aligned scores for every
    # outcome / split pair
    for oi, outcome in enumerate(outcome_names):
        for si, split in enumerate(['train', 'val', 'test']):
            zs  = model.fit_res['z%s'%split].numpy()[:, oi]
            yt  = Ydata[si][:, oi]
            # evaluate only where the label is known
            nan_idx = pd.isnull(yt)
            model.fit_res['%sdf-%s'%(split, outcome)] = \
                misc.classification_stats(yt[~nan_idx], zs[~nan_idx], num_samples=1000)
            # The index must come from the *split* (si), not the outcome
            # (oi): zs and yt have one entry per row of split si.
            model.fit_res['z%s-enc-%s'%(split, outcome)] = \
                pd.Series(zs, index=mdata[si].index)
            model.fit_res['y%s-enc-%s'%(split, outcome)] = \
                pd.Series(yt, index=mdata[si].index)

    return model

예제 #6

0

파일 보기

def _no_history_mask(Xs, encs, outcome):
    """Return a boolean mask selecting the "no_history" subset of encounters.

    Keeps rows with ``mi``, ``diabetes``, ``stroke``, ``hypertense`` and
    ``has_afib_past`` all equal to 0 and ``age`` below 50; for the ``"mace"``
    outcome rows with a missing ``has_mace`` are dropped as well.
    """
    mask = (Xs['mi'] == 0.) & (Xs['diabetes'] == 0.) & \
           (Xs['stroke'] == 0.) & (Xs['hypertense'] == 0.) & \
           (encs['has_afib_past'] == 0.) & \
           (encs['age'] < 50.)
    if outcome == "mace":
        mask = mask & ~pd.isnull(encs['has_mace'])
    return mask


def _align_scores_and_labels(zs, Xdf, Ydf, encdf, outcome, subset):
    """Look up the labels for ``zs`` and apply the optional subset filter."""
    Ys = Ydf.loc[zs.index][outcome]
    Xs = Xdf.loc[zs.index]
    encs = encdf.loc[zs.index]
    if subset == "no_history":
        keep = _no_history_mask(Xs, encs, outcome)
        zs = zs[keep]
        Ys = Ys[keep]
    return zs, Ys


def _bootstrap_auc_string(Ys, zs):
    """Format the bootstrap AUC as ``"mean [2.5th pct, 97.5th pct]"``."""
    baucs = misc.bootstrap_auc(Ys.values, zs.values, num_samples=1000)
    return "%2.3f [%2.3f, %2.3f]" % (baucs.mean(),
                                     np.percentile(baucs, 2.5),
                                     np.percentile(baucs, 97.5))


def results_table(outcomes=["future_afib"],
                  features=["simple", "remark"],
                  split="test",
                  subset=None,
                  do_logreg=True,
                  do_net=False):
    """Build a table of AUC strings: one column per outcome, one row per model.

    Args:
        outcomes: outcome names; names containing ``"trop_"`` are read from
            the shared ``troponin`` dataset and output directory.
        features: feature-set names; each contributes one row, read from
            ``lreg-<feats>.pkl`` (``do_logreg``) or ``xgb-<feats>.pkl``.
        split: which split's scores to evaluate.
        subset: ``"no_history"`` restricts the evaluation to the subset
            selected by ``_no_history_mask``; any other value uses all rows.
            The xgb rows reuse a stored string and ignore ``subset``.
        do_logreg: use logistic-regression results (True) or stored xgboost
            AUC strings (False) for the feature rows.
        do_net: also add rows for the four saved network models.

    Returns:
        ``pandas.DataFrame`` of AUC strings indexed by model / feature name.
    """
    auc_cols = OrderedDict()
    for outcome in outcomes:
        print("\n===== outcome %s ========" % outcome)
        # every troponin sub-outcome shares one dataset / output directory
        dataset = 'troponin' if "trop_" in outcome else outcome
        outcome_dir = "prediction-output/%s" % dataset
        Xdf, Ydf, encdf = ed.make_dataset(outcome=dataset,
                                          features='simple',
                                          do_split=False)
        # Computed for troponin outcomes too: previously it was only set in
        # the non-troponin branch, so a "trop_" outcome hit a NameError or
        # silently reused the previous outcome's split metadata.
        _, _, mdata = misc.split_data(Xdf.values, Ydf.values, encdf.split,
                                      encdf)
        rows = []
        for feats in features:
            # lreg results
            if do_logreg:
                print(" lreg w/ feats: ", feats)
                with open(os.path.join(outcome_dir, "lreg-%s.pkl" % feats),
                          'rb') as f:
                    res = pickle.load(f)[outcome]

                zs, Ys = _align_scores_and_labels(
                    res['z%s' % split], Xdf, Ydf, encdf, outcome, subset)
                rows.append(_bootstrap_auc_string(Ys, zs))

            # xg boost results
            else:
                with open(os.path.join(outcome_dir, "xgb-%s.pkl" % feats),
                          'rb') as f:
                    res = pickle.load(f)
                tdf = res['%sdf' % split]
                auc = tdf[tdf['metric'] == 'auc']['string'].iloc[0]
                rows.append(auc)

        if do_net:
            modfiles = [
                'beatnet-raw-ekg.pkl', 'beatnet-simple.pkl',
                'resnet-raw-ekg.pkl', 'resnet-simple.pkl'
            ]
            for modfile in modfiles:
                print("  ... loading mod file %s" % modfile)
                mod = base.load_model(os.path.join(outcome_dir, modfile))

                zs = mod.fit_res['z%s-enc-%s' % (split, outcome)]
                if not hasattr(zs, 'index'):
                    # un-indexed scores: attach the index of the split
                    split_idx = ['train', 'val', 'test'].index(split)
                    zs = pd.Series(zs, index=mdata[split_idx].index)
                zs, Ys = _align_scores_and_labels(
                    zs, Xdf, Ydf, encdf, outcome, subset)
                print(Ys, zs)
                rows.append(_bootstrap_auc_string(Ys, zs))

        auc_cols[outcome] = rows

    # row labels: the feature sets, then the network models in modfiles order
    fidx = list(features)
    if do_net:
        fidx += ['beatnet', 'beatnet+simple', 'resnet', 'resnet+simple']
    return pd.DataFrame(auc_cols, index=fidx)

예제 #7

0

파일 보기

def pairwise_compare_aucs(outcome="mace", features=["simple", "remark"]):
    """Print bootstrap intervals for pairwise AUC differences between models.

    Scores are taken from the test split and restricted to the "no_history"
    subset (no mi / diabetes / stroke / hypertension / past afib, age < 50,
    and for ``"mace"`` a non-missing ``has_mace``). All comparisons are made
    on the encounters scored by ``beatnet-raw-ekg.pkl``.

    Args:
        outcome: outcome name; names containing ``"trop_"`` are read from the
            shared ``troponin`` dataset and output directory.
        features: feature-set names whose ``lreg-<feats>.pkl`` results are
            loaded. Must include ``"simple"`` and ``"remark"``, which the
            comparisons refer to by name.

    Returns:
        None. For each pair, the 2.5th and 97.5th percentiles of the ``diff``
        returned by ``misc.bootstrap_auc_comparison`` are printed.

    Raises:
        KeyError: if ``"simple"`` or ``"remark"`` is missing from ``features``.
    """
    # every troponin sub-outcome shares one dataset / output directory
    dataset = 'troponin' if "trop_" in outcome else outcome
    outcome_dir = "prediction-output/%s" % dataset
    Xdf, Ydf, encdf = ed.make_dataset(outcome=dataset,
                                      features='simple',
                                      do_split=False)
    # Computed for troponin outcomes too; previously it was only set in the
    # non-troponin branch and a "trop_" outcome could hit a NameError below.
    _, _, mdata = misc.split_data(Xdf.values, Ydf.values, encdf.split,
                                  encdf)

    split = "test"
    subset = "no_history"

    def restrict(zs):
        """Return ``(zs, Ys)`` limited to the configured subset."""
        Ys = Ydf.loc[zs.index][outcome]
        Xs = Xdf.loc[zs.index]
        encs = encdf.loc[zs.index]
        if subset == "no_history":
            no_idx = (Xs['mi']==0.) & (Xs['diabetes']==0.) & \
                     (Xs['stroke']==0.) & (Xs['hypertense']==0.) & \
                     (encs['has_afib_past'] == 0.) & \
                     (encs['age'] < 50.)
            if outcome == "mace":
                no_idx = no_idx & ~pd.isnull(encs['has_mace'])
            zs = zs[no_idx]
            Ys = Ys[no_idx]
        return zs, Ys

    zs_mod, ys_mod = {}, {}
    for feats in features:
        print(" lreg w/ feats: ", feats)
        with open(os.path.join(outcome_dir, "lreg-%s.pkl" % feats), 'rb') as f:
            res = pickle.load(f)[outcome]
        zs_mod[feats], ys_mod[feats] = restrict(res['z%s' % split])

    modfiles = [
        'beatnet-raw-ekg.pkl', 'resnet-raw-ekg.pkl', 'beatnet-simple.pkl'
    ]
    for modfile in modfiles:
        print("  ... loading mod file %s" % modfile)
        mod = base.load_model(os.path.join(outcome_dir, modfile))
        print("    ... has %d params" % mod.num_params())

        zs = mod.fit_res['z%s-enc-%s' % (split, outcome)]
        if not hasattr(zs, 'index'):
            # un-indexed scores: attach the index of the split
            split_idx = ['train', 'val', 'test'].index(split)
            zs = pd.Series(zs, index=mdata[split_idx].index)
        zs_mod[modfile], ys_mod[modfile] = restrict(zs)

    # align every model on the encounters scored by the raw-EKG beatnet
    zsekg = zs_mod['beatnet-raw-ekg.pkl']
    zsresnet = zs_mod['resnet-raw-ekg.pkl'].loc[zsekg.index]
    zsbase = zs_mod['simple'].loc[zsekg.index]
    # Previously undefined (its assignment was commented out), which made the
    # "rem" comparisons raise NameError.
    zsrem = zs_mod['remark'].loc[zsekg.index]
    # Use the labels stored with the reference model rather than whatever the
    # loading loop happened to leave behind.
    ysbase = ys_mod['beatnet-raw-ekg.pkl'].loc[zsekg.index]

    comparisons = [
        (" simple => beatnet ", zsbase, zsekg),
        (" rem => beatnet ", zsrem, zsekg),
        (" resnet => beatnet ", zsresnet, zsekg),
        (" rem => resnet ", zsrem, zsresnet),
    ]
    for label, zs_a, zs_b in comparisons:
        _, _, diff = misc.bootstrap_auc_comparison(ysbase.values,
                                                   zs_a.values,
                                                   zs_b.values,
                                                   num_samples=1000)
        print(label, np.percentile(diff, [2.5, 97.5]))