def load_prediction_scores(outcome, feats, split="test"):
    """Load one model's saved scores together with the matching data rows.

    Args:
        outcome: outcome name. Any name containing "trop_" is served from the
            shared 'troponin' dataset and output directory.
        feats: either a model file name containing '.pkl' (loaded through
            ``base.load_model``) or a feature-set name, in which case the
            pickle "lreg-<feats>.pkl" is read and indexed by ``outcome``.
        split: name of the split whose scores are returned ('train', 'val'
            or 'test').

    Returns:
        Tuple ``(zs, Ys, Xs, encs)``: the scores for ``split`` and the rows of
        the outcome column, the 'simple' feature frame and the encounter
        frame selected by ``zs.index``.
    """
    # "trop_*" outcomes all live in the single troponin dataset / directory
    dataset_name = 'troponin' if "trop_" in outcome else outcome
    outcome_dir = os.path.join("prediction-output/%s" % dataset_name)
    Xdf, Ydf, encdf = ed.make_dataset(
        outcome=dataset_name, features='simple', do_split=False)
    _, _, mdata = misc.split_data(Xdf.values, Ydf.values, encdf.split, encdf)

    if '.pkl' not in feats:
        # logistic-regression results are pickled per feature set
        lreg_path = os.path.join(outcome_dir, "lreg-%s.pkl" % feats)
        with open(lreg_path, 'rb') as f:
            res = pickle.load(f)[outcome]
        zs = res['z%s' % split]
    else:
        print(" ... loading mod file %s" % feats)
        mod = base.load_model(os.path.join(outcome_dir, feats))
        zs = mod.fit_res['z%s-enc-%s' % (split, outcome)]
        if not hasattr(zs, 'index'):
            # scores stored without an index: attach the index of the
            # metadata frame for the requested split
            split_idx = ['train', 'val', 'test'].index(split)
            zs = pd.Series(zs, index=mdata[split_idx].index)

    # restrict every frame to the rows that have a score
    scored_rows = zs.index
    Ys = Ydf.loc[scored_rows][outcome]
    Xs = Xdf.loc[scored_rows]
    encs = encdf.loc[scored_rows]
    return zs, Ys, Xs, encs
def run_logistic(outcome="mace", features="simple"):
    """Fit an L2-penalised logistic regression for every outcome column.

    Args:
        outcome: outcome name forwarded to ``ed.make_dataset``.
        features: feature-set name forwarded to ``ed.make_dataset``.

    Returns:
        dict mapping each column of the outcome frame to the result object
        returned by ``misc.run_logistic``, extended with 'ztrain', 'zval' and
        'ztest' Series (logit of the exponentiated 'lnp_*' entries) indexed
        like the corresponding split's metadata frame.
    """
    Xdf, Ydf, encdf = ed.make_dataset(
        outcome=outcome, features=features, do_split=False)

    split_names = ('train', 'val', 'test')
    resdict = {}
    for col in Ydf.columns:
        print(" working on outcome %s "%col)

        # drop rows whose label is missing before splitting
        target = Ydf[col]
        observed = ~pd.isnull(target)
        Xdata, Ydata, mdata = misc.split_data(Xdf[observed].values,
                                              target[observed].values,
                                              encdf.split[observed],
                                              encdf[observed])

        cs = np.logspace(-2.5, 1, 20)
        fit = misc.run_logistic(Xdata, Ydata, Xdf.columns.tolist(),
                                penalty="l2", cs=cs)

        # tie ztrain / zval / ztest to the index of their split's metadata
        for pos, split_name in enumerate(split_names):
            z_key = 'z%s' % split_name
            fit[z_key] = pd.Series(
                logit(np.exp(fit['lnp_%s' % split_name])),
                index=mdata[pos].index, name=z_key)

        resdict[col] = fit

    return resdict
def plot_full_ekg_example():
    """Save one example full-EKG figure and two example segmented-beat figures.

    Writes 'example_ekg.png', 'example_beats.png' and 'example_beats-2.png'
    into 'prediction-output/'. Returns nothing.
    """
    # outcome data and encounter dataframe ('mace' outcome, 'remarks' features)
    Xdf, Ydf, encdf = ed.make_dataset(
        outcome='mace', features='remarks', do_split=False)
    Xmat, _, tgrid = ed.load_ekg_data(encdf=encdf, constrain_range=False)

    # full recording for row 300
    fig = plt.figure(figsize=(12, 3))
    ax = plt.gca()
    ax = viz.plot_stacked_ecg(ax, Xmat[300], linewidth=1.5)
    ax.set_xlabel("time (seconds)", fontsize=16)
    ax.get_yaxis().set_ticks([])
    fig.tight_layout()
    fig.savefig('prediction-output/example_ekg.png',
                bbox_inches='tight', dpi=150)
    plt.close("all")

    # segmented beats
    from ekgmodels import preproc
    beatdf, beatmat = preproc.create_beat_dataset_fixed(
        encdf, Xmat, tgrid, detrend=False)

    def savebeat(bis, name="example_beats.png"):
        """Plot the beats at positions ``bis`` side by side and save the figure."""
        beat_fig, panels = plt.subplots(1, 3, figsize=(6, 3.5))
        for beat_pos, panel in zip(bis, panels.flatten()):
            beat_len = beatdf['beat_len'].iloc[beat_pos]
            viz.plot_stacked_beat(panel, beatmat[beat_pos], beatlen=beat_len)
        # keep the x label on the middle panel and the y label on the left one
        panels[0].set_xlabel("")
        panels[2].set_xlabel("")
        panels[1].set_ylabel("")
        panels[2].set_ylabel("")
        beat_fig.savefig("prediction-output/%s" % name,
                         bbox_inches='tight', dpi=150)
        plt.close("all")

    examples = (
        ("example_beats.png", [0, 1, 2]),
        ("example_beats-2.png", [300, 301, 302]),
    )
    for file_name, beat_positions in examples:
        savebeat(beat_positions, name=file_name)
def run_beat_mlp(outcome='mace', use_features=False, features='simple'):
    """Fit an MLP on individual beats and aggregate its scores per encounter.

    Args:
        outcome: outcome name forwarded to ``ed.make_dataset``.
        use_features: when truthy, the ``features`` columns are appended to
            the flattened beat and a ``BeatDeepWideMlpClassifier`` is used;
            otherwise a plain ``BeatMlpClassifier`` is fit on the beat alone.
        features: feature-set name used for the appended ("wide") columns.

    Returns:
        The fitted model. For every original outcome column and every split,
        ``model.fit_res`` gains 'z<split>-enc-<outcome>' (mean beat score per
        encounter), 'y<split>-enc-<outcome>' (matching labels) and
        '<split>df-<outcome>' (output of ``misc.classification_stats`` on the
        rows whose label is not NaN).
    """
    # load outcome and ekg remarks; the remark columns are appended to the
    # targets, while outcome_names remembers the original outcome columns
    Xdf, Ydf, encdf = ed.make_dataset(outcome=outcome, features='remarks',
                                      do_split=False)
    outcome_names = Ydf.columns
    Ydf = pd.concat([Ydf, Xdf[remark_feats]], axis=1)
    Xsimp, _, _ = ed.make_dataset(outcome=outcome, features=features,
                                  do_split=False)
    assert np.all(Xsimp.index == Ydf.index)

    # beat data ---- beatdf has 'index' as a column (ed_enc_idx)
    beatdf, beatmat = ed.load_beat_data(encdf=encdf, constrain_range=True)

    # expand outcomes and simple features from encounter level to beat level
    Ybeat = Ydf.loc[beatdf['index']].values
    Xsimp_beat = Xsimp.loc[beatdf['index']].values

    # now split the data
    Xdata, Ydata, mdata = misc.split_data(beatmat, Ybeat, beatdf.split, beatdf)
    Wdata, _, _ = misc.split_data(Xsimp_beat, Ybeat, beatdf.split, beatdf)
    _, n_channels, n_samples = Xdata[0].shape

    # flatten every beat to one row per example
    Xtrain, Xval, Xtest = (X.reshape((len(X), -1)) for X in Xdata)

    # Truthiness test (not `== True`) so this matches run_ekg_mlp.
    if use_features:
        print("Adding %s features to EKG in last layer!"%features)
        Xtrain = np.column_stack([Xtrain, Wdata[0]])
        Xval = np.column_stack([Xval, Wdata[1]])
        Xtest = np.column_stack([Xtest, Wdata[2]])
        model = mlp.BeatDeepWideMlpClassifier(
            data_dim=n_channels * n_samples,
            total_n_outputs=Ydata[0].shape[1],
            verbose=False,
            hdims=[500, 500],
            h_dim=100,
            dim_wide=Wdata[0].shape[1])
    else:
        model = mlp.BeatMlpClassifier(data_dim=n_channels * n_samples,
                                      n_outputs=Ydata[0].shape[1],
                                      verbose=False,
                                      hdims=[500, 500])

    # Run optimization
    model.fit((Xtrain, Xval, Xtest), Ydata,
              lr_reduce_interval=20,
              epochs=80,
              learning_rate=1e-3,
              class_weights=False)

    # contract z values back down to encounter-level, save classification data
    # (loop variable is outcome_name so the `outcome` argument is not rebound)
    for oi, outcome_name in enumerate(outcome_names):
        for si, split in enumerate(['train', 'val', 'test']):
            zs = pd.Series(model.fit_res['z%s' % split].numpy()[:, oi],
                           index=mdata[si]['index'])
            # average the beat scores that share an encounter id
            zs_mu = zs.groupby(level=0, sort=False).mean()
            yt = Ydf.loc[zs_mu.index][outcome_name]
            model.fit_res['z%s-enc-%s' % (split, outcome_name)] = zs_mu
            model.fit_res['y%s-enc-%s' % (split, outcome_name)] = yt
            nan_idx = pd.isnull(yt)
            model.fit_res['%sdf-%s' % (split, outcome_name)] = \
                misc.classification_stats(yt[~nan_idx].values,
                                          zs_mu[~nan_idx].values)

    return model
def run_ekg_mlp(outcome='mace', use_features=False, features='simple'):
    """Fit a ResNet classifier on full EKG recordings.

    Args:
        outcome: outcome name forwarded to ``ed.make_dataset``.
        use_features: when truthy, the ``features`` columns are appended to
            the flattened EKG and an ``EKGDeepWideResNetClassifier`` is used;
            otherwise an ``EKGResNetClassifier`` is fit on the EKG alone.
        features: feature-set name used for the appended ("wide") columns.

    Returns:
        The fitted model. For every original outcome column and every split,
        ``model.fit_res`` gains '<split>df-<outcome>' (output of
        ``misc.classification_stats`` on rows whose label is not NaN) and the
        Series 'z<split>-enc-<outcome>' / 'y<split>-enc-<outcome>', indexed
        like that split's metadata frame.
    """
    # load outcome and ekg remarks; the remark columns are appended to the
    # targets, while outcome_names remembers the original outcome columns
    Xdf, Ydf, encdf = ed.make_dataset(outcome=outcome, features='remarks',
                                      do_split=False)
    outcome_names = Ydf.columns
    Ydf = pd.concat([Ydf, Xdf[remark_feats]], axis=1)
    Xsimp, _, _ = ed.make_dataset(outcome=outcome, features=features,
                                  do_split=False)
    assert np.all(Xsimp.index == Ydf.index)

    # full-length EKG recordings, one per encounter
    Xmat, _, _ = ed.load_ekg_data(encdf=encdf, constrain_range=True)

    # now split the data
    Xdata, Ydata, mdata = misc.split_data(Xmat, Ydf.values, encdf.split, encdf)
    Wdata, _, _ = misc.split_data(Xsimp, Ydf.values, encdf.split, encdf)

    # drop the last sample of every recording (the models below are built
    # with n_samples=1000-1)
    Xtrain, Xval, Xtest = (X[:, :, :-1] for X in Xdata)
    Ytrain, Yval, Ytest = Ydata

    if use_features:
        Xtrain = np.column_stack([Xtrain.reshape(len(Xtrain), -1), Wdata[0]])
        Xval = np.column_stack([Xval.reshape(len(Xval), -1), Wdata[1]])
        Xtest = np.column_stack([Xtest.reshape(len(Xtest), -1), Wdata[2]])
        model = resnet.EKGDeepWideResNetClassifier(
            n_channels=3,
            n_samples=1000-1,
            total_n_outputs=Ytrain.shape[1],
            num_rep_blocks=8,
            kernel_size=16,
            h_dim=100,
            dim_wide=Wdata[0].shape[1],
            verbose=False)
    else:
        model = resnet.EKGResNetClassifier(n_channels=3,
                                           n_samples=1000-1,
                                           n_outputs=Ytrain.shape[1],
                                           num_rep_blocks=8,
                                           kernel_size=16,
                                           verbose=False)

    model.fit((Xtrain, Xval, Xtest), (Ytrain, Yval, Ytest),
              epochs=80,
              learning_rate=1e-3,
              lr_reduce_interval=10,
              class_weights=False)

    # save classification data and encounter-indexed scores / labels
    for oi, outcome_name in enumerate(outcome_names):
        for si, split in enumerate(['train', 'val', 'test']):
            zs = model.fit_res['z%s' % split].numpy()[:, oi]
            yt = Ydata[si][:, oi]
            nan_idx = pd.isnull(yt)
            model.fit_res['%sdf-%s' % (split, outcome_name)] = \
                misc.classification_stats(yt[~nan_idx], zs[~nan_idx],
                                          num_samples=1000)
            # The index must come from the *split's* metadata frame (si).
            # Indexing mdata by the outcome position (oi) pairs the scores
            # with the wrong split's encounters.
            split_index = mdata[si].index
            model.fit_res['z%s-enc-%s' % (split, outcome_name)] = \
                pd.Series(zs, index=split_index)
            model.fit_res['y%s-enc-%s' % (split, outcome_name)] = \
                pd.Series(yt, index=split_index)

    return model
def results_table(outcomes=["future_afib"],
                  features=["simple", "remark"],
                  split="test",
                  subset=None,
                  do_logreg=True,
                  do_net=False):
    """Build a table of AUC strings, one column per outcome.

    Args:
        outcomes: outcome names; each becomes a column of the table.
        features: feature-set names; each becomes a row (read-only here).
        split: split whose scores are evaluated.
        subset: when equal to "no_history", scores are restricted to rows
            with no recorded mi / diabetes / stroke / hypertension / past
            afib and age below 50 (and, for "mace", a non-null 'has_mace').
        do_logreg: if true, rows come from "lreg-<feats>.pkl" with a
            bootstrapped AUC; otherwise the stored AUC string of
            "xgb-<feats>.pkl" is used.
        do_net: if true, four extra rows are added for the beatnet / resnet
            model files.

    Returns:
        pandas DataFrame of AUC strings indexed by feature / model label.
    """

    def no_history_rows(Xs, encs, outcome):
        """Boolean mask of the rows that belong to the no-history subset."""
        keep = (Xs['mi']==0.) & (Xs['diabetes']==0.) & \
               (Xs['stroke']==0.) & (Xs['hypertense']==0.) & \
               (encs['has_afib_past'] == 0.) & \
               (encs['age'] < 50.)
        if outcome == "mace":
            keep = keep & ~pd.isnull(encs['has_mace'])
        return keep

    def bootstrap_auc_string(Ys, zs):
        """Format the bootstrapped AUC as 'mean [2.5th pct, 97.5th pct]'."""
        baucs = misc.bootstrap_auc(Ys.values, zs.values, num_samples=1000)
        return "%2.3f [%2.3f, %2.3f]" % (baucs.mean(),
                                         np.percentile(baucs, 2.5),
                                         np.percentile(baucs, 97.5))

    net_modfiles = ['beatnet-raw-ekg.pkl', 'beatnet-simple.pkl',
                    'resnet-raw-ekg.pkl', 'resnet-simple.pkl']

    auc_cols = OrderedDict()
    for outcome in outcomes:
        print("\n===== outcome %s ========" % outcome)
        # "trop_*" outcomes all live in the single troponin dataset / directory
        dataset_name = 'troponin' if "trop_" in outcome else outcome
        outcome_dir = os.path.join("prediction-output/%s" % dataset_name)
        Xdf, Ydf, encdf = ed.make_dataset(
            outcome=dataset_name, features='simple', do_split=False)
        _, _, mdata = misc.split_data(Xdf.values, Ydf.values,
                                      encdf.split, encdf)

        rows = []
        for feats in features:
            if not do_logreg:
                # xgboost results: reuse the AUC string stored with the model
                with open(os.path.join(outcome_dir, "xgb-%s.pkl" % feats), 'rb') as f:
                    res = pickle.load(f)
                tdf = res['%sdf' % split]
                rows.append(tdf[tdf['metric'] == 'auc']['string'].iloc[0])
                continue

            # logistic-regression results: AUC is always re-bootstrapped
            print(" lreg w/ feats: ", feats)
            with open(os.path.join(outcome_dir, "lreg-%s.pkl" % feats), 'rb') as f:
                res = pickle.load(f)[outcome]
            zs = res['z%s' % split]
            Ys = Ydf.loc[zs.index][outcome]
            Xs = Xdf.loc[zs.index]
            encs = encdf.loc[zs.index]
            if subset == "no_history":
                keep = no_history_rows(Xs, encs, outcome)
                zs = zs[keep]
                Ys = Ys[keep]
            rows.append(bootstrap_auc_string(Ys, zs))

        if do_net:
            for modfile in net_modfiles:
                print(" ... loading mod file %s" % modfile)
                mod = base.load_model(os.path.join(outcome_dir, modfile))
                mdf = mod.fit_res['%sdf-%s' % (split, outcome)]
                # stored AUC string; replaced below for the subset analysis
                mauc = mdf[mdf['metric'] == 'auc']['string'].iloc[0]

                zs = mod.fit_res['z%s-enc-%s' % (split, outcome)]
                if not hasattr(zs, 'index'):
                    split_idx = ['train', 'val', 'test'].index(split)
                    zs = pd.Series(zs, index=mdata[split_idx].index)

                Ys = Ydf.loc[zs.index][outcome]
                Xs = Xdf.loc[zs.index]
                encs = encdf.loc[zs.index]
                if subset == "no_history":
                    keep = no_history_rows(Xs, encs, outcome)
                    zs = zs[keep]
                    Ys = Ys[keep]
                    print(Ys, zs)
                    mauc = bootstrap_auc_string(Ys, zs)

                rows.append(mauc)

        auc_cols[outcome] = rows

    # copy so the caller's (or the default) feature list is never extended
    import copy
    fidx = copy.deepcopy(features)
    if do_net:
        fidx += ['beatnet', 'beatnet+simple', 'resnet', 'resnet+simple']
    return pd.DataFrame(auc_cols, index=fidx)
def pairwise_compare_aucs(outcome="mace", features=["simple", "remark"]):
    """Print bootstrap intervals for pairwise AUC differences between models.

    Test-split scores of the logistic-regression models (one per entry in
    ``features``) and of the beatnet / resnet model files are restricted to
    the "no_history" subset. Selected pairs are then compared with
    ``misc.bootstrap_auc_comparison`` on the rows scored by
    'beatnet-raw-ekg.pkl'.

    Args:
        outcome: outcome name. Any name containing "trop_" is served from the
            shared 'troponin' dataset and output directory.
        features: feature-set names whose "lreg-<feats>.pkl" results are
            loaded (read-only here). 'simple' is required; the "rem"
            comparisons are skipped when 'remark' is absent.

    Returns:
        None. The 2.5th / 97.5th percentiles of each difference are printed.
    """
    # "trop_*" outcomes all live in the single troponin dataset / directory
    dataset_name = 'troponin' if "trop_" in outcome else outcome
    outcome_dir = os.path.join("prediction-output/%s" % dataset_name)
    Xdf, Ydf, encdf = ed.make_dataset(
        outcome=dataset_name, features='simple', do_split=False)
    _, _, mdata = misc.split_data(Xdf.values, Ydf.values, encdf.split, encdf)

    split = "test"
    subset = "no_history"

    def select_rows(zs):
        """Return (scores, labels) restricted to the configured subset."""
        Ys = Ydf.loc[zs.index][outcome]
        if subset != "no_history":
            return zs, Ys
        Xs = Xdf.loc[zs.index]
        encs = encdf.loc[zs.index]
        keep = (Xs['mi']==0.) & (Xs['diabetes']==0.) & \
               (Xs['stroke']==0.) & (Xs['hypertense']==0.) & \
               (encs['has_afib_past'] == 0.) & \
               (encs['age'] < 50.)
        if outcome == "mace":
            keep = keep & ~pd.isnull(encs['has_mace'])
        return zs[keep], Ys[keep]

    zs_mod, ys_mod = {}, {}

    # logistic-regression baselines
    for feats in features:
        print(" lreg w/ feats: ", feats)
        with open(os.path.join(outcome_dir, "lreg-%s.pkl" % feats), 'rb') as f:
            res = pickle.load(f)[outcome]
        zs_mod[feats], ys_mod[feats] = select_rows(res['z%s' % split])

    # network models
    modfiles = ['beatnet-raw-ekg.pkl',
                'resnet-raw-ekg.pkl',
                'beatnet-simple.pkl']
    for modfile in modfiles:
        print(" ... loading mod file %s" % modfile)
        mod = base.load_model(os.path.join(outcome_dir, modfile))
        print(" ... has %d params" % mod.num_params())
        zs = mod.fit_res['z%s-enc-%s' % (split, outcome)]
        if not hasattr(zs, 'index'):
            # scores stored without an index: attach the index of the
            # metadata frame for the requested split
            split_idx = ['train', 'val', 'test'].index(split)
            zs = pd.Series(zs, index=mdata[split_idx].index)
        zs_mod[modfile], ys_mod[modfile] = select_rows(zs)

    # compare pairs, all aligned to the rows scored by the raw-EKG beatnet
    zsekg = zs_mod['beatnet-raw-ekg.pkl']
    # take the labels that were selected together with zsekg, not whatever
    # the last loop iteration happened to leave behind
    ysbase = ys_mod['beatnet-raw-ekg.pkl']
    zsresnet = zs_mod['resnet-raw-ekg.pkl'].loc[zsekg.index]
    zsbase = zs_mod['simple'].loc[zsekg.index]
    # the "rem" baseline is the logistic regression on the 'remark' features
    zsrem = zs_mod['remark'].loc[zsekg.index] if 'remark' in zs_mod else None

    comparisons = [
        (" simple => beatnet ", zsbase, zsekg),
        (" rem => beatnet ", zsrem, zsekg),
        (" resnet => beatnet ", zsresnet, zsekg),
        (" rem => resnet ", zsrem, zsresnet),
    ]
    for label, zs_a, zs_b in comparisons:
        if zs_a is None:
            print(" skipping%s(no 'remark' scores loaded)" % label)
            continue
        _, _, diff = misc.bootstrap_auc_comparison(
            ysbase.values, zs_a.values, zs_b.values, num_samples=1000)
        print(label, np.percentile(diff, [2.5, 97.5]))