示例#1
0
def get_counts_and_clasif(up_to_dataset_dict, timepoint):
    """Build the count table and ROC/PR curve data for one timepoint.

    Args:
        up_to_dataset_dict: Mapping keyed by timepoint; each entry must
            provide 'input_file' and 'model_file' values, which are passed
            to ``unpack_input_data``.
        timepoint: Key selecting the entry of ``up_to_dataset_dict`` to use.

    Returns:
        Tuple ``(cor_df, sorted_fprs, sorted_tprs, sorted_recall,
        sorted_precision, auc, pr_auc, pr_chance)``. ``cor_df`` is the
        result of ``creat_count_df``; the sorted values come from
        ``prep_for_auc``; ``auc`` and ``pr_auc`` come from ``calc_auc``;
        ``pr_chance`` is ``sum(y_test) / len(y_test)`` rounded to two
        decimals.
    """
    paths = up_to_dataset_dict[timepoint]

    # Only the test split and the fitted model are used below, so the
    # remaining unpacked values are discarded.
    _, _, X_test, y_test, xgb_model, _ = unpack_input_data(
        paths['input_file'], paths['model_file'])

    cor_df = creat_count_df(xgb_model, X_test, y_test)

    # NOTE(review): `classifier` receives the raw test-split features. A
    # binarised copy of X_test used to be computed here but was never passed
    # anywhere; confirm that the raw features are the intended input.
    tprs, fprs, prs, rcs, thresholds = classifier(X_test, y_test)
    sorted_fprs, sorted_tprs, sorted_recall, sorted_precision, _ = prep_for_auc(
        fprs, tprs, prs, rcs, thresholds)
    auc, pr_auc = calc_auc(
        sorted_tprs, sorted_fprs, sorted_precision, sorted_recall)

    # Mean of y_test; presumably the positive-class prevalence when labels
    # are 0/1, i.e. the chance level of a precision-recall curve.
    pr_chance = np.round(np.sum(y_test) / len(y_test), 2)

    return (cor_df, sorted_fprs, sorted_tprs, sorted_recall,
            sorted_precision, auc, pr_auc, pr_chance)
示例#2
0
def calc_npv(y_true, y_pred):
    """Return the negative predictive value, TN / (TN + FN).

    Labels are expected to be binary and encoded as 0 (negative) and
    1 (positive).

    Args:
        y_true: Sequence of true labels.
        y_pred: Sequence of predicted labels, same shape as ``y_true``.

    Returns:
        The fraction of predicted negatives that are truly negative, or NaN
        when no sample was predicted negative.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in shape.
    """
    import numpy as np  # local import keeps this block self-contained

    true_labels = np.asarray(y_true)
    pred_labels = np.asarray(y_pred)
    if true_labels.shape != pred_labels.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got "
            f"{true_labels.shape} and {pred_labels.shape}")

    # Count the two cells directly instead of unpacking a confusion matrix:
    # when only one class occurs the matrix is 1x1 and a 4-way unpack fails.
    predicted_negative = pred_labels == 0
    tn = np.count_nonzero(predicted_negative & (true_labels == 0))
    fn = np.count_nonzero(predicted_negative & (true_labels == 1))

    # NPV is undefined when nothing was predicted negative.
    if tn + fn == 0:
        return float("nan")
    return tn / (tn + fn)



# -----------
# MAIN
# -----------

# %%
###
###   get model predictions
###
# Load the data splits and fitted model, then evaluate on the test split.
(X_train, y_train, X_test, y_test,
 xgb_model, input_data) = unpack_input_data(input_file, model_file)
metrics_results, metrics_df, model_params = validate_best_model(
    xgb_model, X_test, y_test)
y_pred, y_proba = get_preds(xgb_model, X_test)

# One row per test sample, identified by the index of X_test.
# Column 1 of y_proba is kept (presumably the positive-class probability;
# confirm against get_preds).
pred_df = pd.DataFrame({
    'GRIDS': X_test.index.tolist(),
    'y_true': y_test,
    'y_pred': y_pred,
    'y_proba': y_proba[:, 1],
})

# %%
###
###    load risk fx
###

# Long-format risk table: one row per (GRID, RISK_LABEL) pair is what the
# pivot below requires.
long_risk_df = harmonize_risk_fx(risk_file_dict, risk_cols_to_keep_dict)

# Combined "<category>, <label>" string column.
long_risk_df['RISK_CAT_LABEL'] = (
    long_risk_df['RISK_CATEGORY'] + ", " + long_risk_df['RISK_LABEL']
)

# Wide format: rows are GRIDs, columns are risk labels, cells hold the
# category.
wide_risk_df = long_risk_df.pivot(
    index='GRID',
    columns='RISK_LABEL',
    values='RISK_CATEGORY',
)
risk_cols = long_risk_df['RISK_LABEL'].unique()

# Evaluate one saved model per timepoint. Each timepoint has its own results
# directory under ICDCPT_DIR.
timeseries = ['0_weeks', '13_weeks','28_weeks', '35_weeks', '37_weeks']

# Per-timepoint results, keyed by the timepoint string.
roc_dict = dict()
pr_dict = dict()
f1_score = dict()
for timepoint in timeseries:
    results_dir = os.path.join(ICDCPT_DIR, f'{timepoint}_notwins_timeseries_v1')
    # The first glob match is used; an IndexError here means the directory
    # holds no matching input/model file.
    input_file = glob.glob(results_dir+"/input_data*.tsv")[0]
    model_file = glob.glob(results_dir+"/best_xgb_model*.pickle")[0]

    # load models and input files
    _, _, ehr_X_test, ehr_y_test, ehr_xgb_model, ehr_input_data = unpack_input_data(input_file, model_file)
    ehr_metrics_results, ehr_metrics_df, _ = validate_best_model(ehr_xgb_model, ehr_X_test, ehr_y_test)

    # ROC curve coordinates and area
    ehr_interp_fpr, ehr_interp_tpr, ehr_auc = get_auroc_coords(ehr_metrics_results)
    temp_roc_dict = {'interp_fpr':ehr_interp_fpr, 'interp_tpr':ehr_interp_tpr, 'auc':ehr_auc}
    roc_dict[timepoint] = temp_roc_dict

    # point metrics copied from the validation results
    f1_score[timepoint] = {'f1_score': ehr_metrics_results['f1_score'], 'pr_score': ehr_metrics_results['pr_score'], 'rc_score': ehr_metrics_results['rc_score']}

    # precision-recall curve coordinates, area and positive proportion
    ehr_interp_rc, ehr_interp_pr, ehr_pr_auc, ehr_pos_prop = get_pr_coord(ehr_metrics_results, ehr_y_test)
    temp_pr_dict = {'interp_rc':ehr_interp_rc, 'interp_pr':ehr_interp_pr, 'pr_auc':ehr_pr_auc, 'pos_prop':ehr_pos_prop}
    pr_dict[timepoint] = temp_pr_dict
    # An unconditional `break` used to end the loop here, so only the first
    # timepoint was ever evaluated; it is removed so that every entry of
    # `timeseries` is processed.

# bare expression: shows the result when run as an interactive cell
roc_dict


示例#4
0
###
###    MAIN
###

### define file paths
ehr_input_file=os.path.join(RF_DIR,'input_data_up_to_28_weeks_since_preg_start_icd9_cpt_count-2019-06-19.tsv')
riskfx_input_file=os.path.join(CLIN_RISK_DIR,'input_data_up_to_28_weeks_since_preg_start_risk_fx-2020-03-24.tsv')


### load models and input files
ehr_model_file=os.path.join(RF_DIR,'best_xgb_model_up_to_28_weeks_since_preg_start_icd9_cpt_count-2019-06-19.pickle')
riskfx_model_file=os.path.join(CLIN_RISK_DIR,'best_xgb_model_up_to_28_weeks_since_preg_start_risk_fx-2020-03-24.pickle')


ehr_X_train, ehr_y_train, ehr_X_test, ehr_y_test, ehr_xgb_model, ehr_input_data = unpack_input_data(ehr_input_file, ehr_model_file)
riskfx_X_train, riskfx_y_train, riskfx_X_test, riskfx_y_test, riskfx_xgb_model, riskfx_input_data = unpack_input_data(riskfx_input_file, riskfx_model_file)

# check that grids match!
# The comparison result used to be discarded, so a mismatch went unnoticed.
# The two models are compared row by row below, which is only meaningful
# when both inputs list the same GRIDs in the same order.
if not np.all(ehr_input_data.GRID == riskfx_input_data.GRID):
    raise ValueError(
        "GRID columns of the EHR and risk-factor input data do not match")

# %%
###
###    compare predictions
###

# Predictions from each model on its own test split.
ehr_y_pred, ehr_y_proba = get_preds(
    ehr_xgb_model, ehr_X_test)
riskfx_y_pred, riskfx_y_proba = get_preds(
    riskfx_xgb_model, riskfx_X_test)

# measure ppv and npv (precision is the positive predictive value)
ehr_ppv = metrics.precision_score(y_true=ehr_y_test, y_pred=ehr_y_pred)
    # force a 1,pos_prop end
    interp_rc = np.hstack((interp_rc, np.array([1])))
    interp_pr = np.hstack((interp_pr, np.array([pos_prop])))

    return interp_rc, interp_pr, pr_auc, pos_prop

# %%
###
### main
###

# Key used to look up the same model in both vu_dicts and uc_dicts.
THIS_MODEL = '28_weeks_icd9'

# vu: load the input data and model from the paths in vu_dicts, then
# validate on the test split (only the metrics results are kept).
(X_train, y_train, X_test, y_test,
 xgb_model, input_data) = unpack_input_data(
    vu_dicts[THIS_MODEL]['input_file'],
    vu_dicts[THIS_MODEL]['model_file'],
)
vu_metrics_results, _, _ = validate_best_model(xgb_model, X_test, y_test)

# uc: metrics are read directly from uc_dicts and turned into ROC and
# precision-recall coordinates.
uc_metrics = uc_dicts[THIS_MODEL]
interp_fpr, interp_tpr, auc = get_auroc_coords(uc_metrics)
interp_rc, interp_pr, pr_auc, pos_prop = get_pr_coord(uc_metrics)



# %%

import matplotlib.font_manager as fm
# Font file used for figure text.
# NOTE(review): this is an absolute path into one specific conda
# environment, so it will only resolve on the machine it was written for.
fpath = '/dors/capra_lab/users/abraha1/conda/envs/py36_r_ml/lib/python3.6/site-packages/matplotlib/mpl-data/fonts/ttf/Arial.ttf'

# Two font handles from the same file: `prop` at size 11, `sprop` at size 9.
prop, sprop = (
    fm.FontProperties(fname=fpath, size=font_size) for font_size in (11, 9)
)
vg_input_file = os.path.join(
    VG_DIR,
    'input_data_vaginal_delivery_up_to_28_weeks_since_preg_start_icd9_cpt_no_twins_count-2020-06-04.tsv',
)

csec_model_file = os.path.join(
    CSEC_DIR,
    'best_xgb_model_csection_up_to_28_weeks_since_preg_start_icd9_cpt_no_twins_count-2020-06-04.pickle',
)
vg_model_file = os.path.join(
    VG_DIR,
    'best_xgb_model_vaginal_delivery_up_to_28_weeks_since_preg_start_icd9_cpt_no_twins_count-2020-06-04.pickle',
)

# load models and input files
(csec_X_train, csec_y_train, csec_X_test, csec_y_test,
 csec_xgb_model, csec_input_data) = unpack_input_data(
    csec_input_file, csec_model_file)
(vg_X_train, vg_y_train, vg_X_test, vg_y_test,
 vg_xgb_model, vg_input_data) = unpack_input_data(
    vg_input_file, vg_model_file)


def _test_grids_with_label(X_test, y_test, label):
    """Return the 'GRID' values of the rows of X_test where y_test == label.

    The index of X_test is first moved into columns with reset_index(), so
    the 'GRID' column is expected to come from that index.
    """
    return X_test.reset_index().loc[y_test == label, 'GRID'].values


# GRIDs of the test split, separated by label value (0 / 1) for each cohort.
csec_no_ptb_test_grids = _test_grids_with_label(csec_X_test, csec_y_test, 0)
csec_ptb_test_grids = _test_grids_with_label(csec_X_test, csec_y_test, 1)
vg_no_ptb_test_grids = _test_grids_with_label(vg_X_test, vg_y_test, 0)
vg_ptb_test_grids = _test_grids_with_label(vg_X_test, vg_y_test, 1)

# consider working in probability space..
# explainer = shap.TreeExplainer(csec_xgb_model.get_booster(), data=shap.sample(csec_X_train, 100), model_output='probability')
# shap_values = explainer.shap_values(csec_X_train)
示例#7
0
    RF_DIR,
    'input_data_up_to_28_weeks_since_preg_start_icd9_cpt_count-2019-06-19.tsv')
riskfx_input_file = os.path.join(
    CLIN_RISK_DIR,
    'input_data_up_to_28_weeks_since_preg_start_risk_fx-2020-03-24.tsv',
)

ehr_model_file = os.path.join(
    RF_DIR,
    'best_xgb_model_up_to_28_weeks_since_preg_start_icd9_cpt_count-2019-06-19.pickle',
)
riskfx_model_file = os.path.join(
    CLIN_RISK_DIR,
    'best_xgb_model_up_to_28_weeks_since_preg_start_risk_fx-2020-03-24.pickle',
)

# load models and input files (the training splits are not needed here)
(_, _, ehr_X_test, ehr_y_test,
 ehr_xgb_model, ehr_input_data) = unpack_input_data(
    ehr_input_file, ehr_model_file)
(_, _, riskfac_X_test, riskfac_y_test,
 riskfac_xgb_model, riskfac_input_data) = unpack_input_data(
    riskfx_input_file, riskfx_model_file)

# validate each model on its own test split
ehr_metrics_results, ehr_metrics_df, _ = validate_best_model(
    ehr_xgb_model, ehr_X_test, ehr_y_test)
riskfac_metrics_results, riskfac_metrics_df, _ = validate_best_model(
    riskfac_xgb_model, riskfac_X_test, riskfac_y_test)

###
###    plot
###
# %%
# figure parameters
sns.set(
    style='whitegrid',
    font_scale=1.5,
    rc={'figure.figsize': (6, 6)},
)
sns.set_style({
vg_input_file = os.path.join(
    VG_DIR,
    'input_data_vaginal_delivery_up_to_28_weeks_since_preg_start_icd9_cpt_count-2020-04-12.tsv',
)

cs_model_file = os.path.join(
    CSEC_DIR,
    'best_xgb_model_csection_up_to_28_weeks_since_preg_start_icd9_cpt_count-2020-04-12.pickle',
)
vg_model_file = os.path.join(
    VG_DIR,
    'best_xgb_model_vaginal_delivery_up_to_28_weeks_since_preg_start_icd9_cpt_count-2020-04-12.pickle',
)

# load models and input files (the training splits are not needed here)
(_, _, cs_X_test, cs_y_test,
 cs_xgb_model, cs_input_data) = unpack_input_data(
    cs_input_file, cs_model_file)
(_, _, vg_X_test, vg_y_test,
 vg_xgb_model, vg_input_data) = unpack_input_data(
    vg_input_file, vg_model_file)

# validate each model on its own test split
cs_metrics_results, cs_metrics_df, _ = validate_best_model(
    cs_xgb_model, cs_X_test, cs_y_test)
vg_metrics_results, vg_metrics_df, _ = validate_best_model(
    vg_xgb_model, vg_X_test, vg_y_test)

# %%
###
###    plot
###

# plot - PR
mult = 1
示例#9
0
    long_top_shap_df = pd.melt(top_shap_df, id_vars="GRID", var_name='feat', value_name='feat_shap')

    top_feat_shap_df = pd.merge(long_top_feat_count_df, long_top_shap_df, on=['GRID','feat'], how='inner')

    return top_feat_shap_df

# %%
# -----------
# MAIN
# -----------


# -----------  load and melt data  -----------

# load feature matrix, labels, and xgboost model
X_train, y_train, X_test, y_test, xgb_model, input_df =  unpack_input_data(INPUT_DF_FILE, XGB_MODEL_FILE)
train_df, train_df_w_labels = extract_train_df(input_df)
test_df, test_df_w_labels = extract_test_df(input_df)

# load pickled shap values; `with` closes each file handle as soon as the
# load finishes instead of leaving it open until garbage collection.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open(SHAP_TRAIN_PICKLE, 'rb') as train_shap_fh:
    train_shap = pickle.load(train_shap_fh)
with open(SHAP_TEST_PICKLE, 'rb') as test_shap_fh:
    test_shap = pickle.load(test_shap_fh)

# take top 15 shap features
# NOTE(review): filter_shap receives train_shap without its last column,
# while melt_feat_and_shap receives the full array -- confirm this is intended.
train_top_feats_descrip = filter_shap(train_shap[:,:-1], train_df, top_n=15)
long_shap_feat_df = melt_feat_and_shap(train_shap, train_df_w_labels, train_top_feats_descrip)

# write the long-format table as a tab-separated file without the index
long_shap_feat_df.to_csv(os.path.join(OUTPUT_DIR, 'long_shap_feat_df.tsv'), sep="\t", index=False)
# train_top_feats_descrip.to_csv(os.path.join(OUTPUT_DIR, 'top15_feat_w_descript.tsv'), sep="\t", index=False)