def load_if_exists(STORED_PICKLE_FILE, filepaths_dict, *, use_cache=False):
    """Evaluate every model listed in ``filepaths_dict`` and pickle the result.

    For each ``label -> inner_dict`` entry, the test split and model are
    obtained from ``get_test_performance(inner_dict['input_file'],
    inner_dict['model_file'])`` and scored with ``validate_best_model``.
    The outputs are stored back on the entry under the keys
    ``'metrics_df'``, ``'metrics_results'``, ``'y_test'`` and ``'X_test'``.
    The populated dictionary is then written to ``STORED_PICKLE_FILE``.

    Args:
        STORED_PICKLE_FILE: Path of the pickle file used to store results.
        filepaths_dict: Mapping of label to a dict holding at least
            ``'input_file'`` and ``'model_file'``. Mutated in place.
        use_cache: When true and ``STORED_PICKLE_FILE`` can be opened, return
            its unpickled content instead of recomputing. Defaults to
            ``False``, which keeps the historical behaviour of always
            recomputing (the existence check used to be hard-coded off).

    Returns:
        ``filepaths_dict`` with the evaluation results added, or the
        unpickled cache content when ``use_cache`` is true and the file exists.
    """
    if use_cache:
        try:
            # NOTE: pickle can execute arbitrary code while loading; only
            # point this at cache files written by this function.
            with open(STORED_PICKLE_FILE, 'rb') as metrics_file:
                print("loading pickled file...")
                return pickle.load(metrics_file)
        except FileNotFoundError:
            # No cache yet: fall through and build it.
            pass

    print("creating data...")
    for label, inner_dict in filepaths_dict.items():
        print(label)

        X_test, y_test, xgb_model = get_test_performance(
            inner_dict['input_file'], inner_dict['model_file'])
        # The third returned value (model parameters) is not needed here.
        metrics_results, metrics_df, _ = validate_best_model(
            xgb_model, X_test, y_test)

        # ``inner_dict`` is ``filepaths_dict[label]``, so this updates the
        # caller's mapping; only inner keys are added while iterating.
        inner_dict['metrics_df'] = metrics_df
        inner_dict['metrics_results'] = metrics_results
        inner_dict['y_test'] = y_test
        inner_dict['X_test'] = X_test

    # The context manager guarantees the file is flushed and closed.
    with open(STORED_PICKLE_FILE, 'wb') as pickle_out:
        pickle.dump(filepaths_dict, pickle_out)
    print("pickled model.")

    return filepaths_dict
# load if the data already exists...
# UP TO
if os.path.isfile(UPTO_STORED_DATA_FILE):
    print("loading pickled file...")
    # NOTE: pickle can execute arbitrary code while loading; only use a cache
    # file produced by this analysis.
    # The context manager closes the handle; the name ``metrics_file`` is
    # kept so any later cell that refers to it still finds it defined.
    with open(UPTO_STORED_DATA_FILE, 'rb') as metrics_file:
        up_to_dataset_dict = pickle.load(metrics_file)
else:
    print("creating data...")
    store_results = {}

    # ``up_to_dataset_dict`` must already map each label to a dict holding
    # 'input_file' and 'model_file'; the evaluation outputs are added to it.
    for label, inner_dict in up_to_dataset_dict.items():
        print(label)

        X_test, y_test, xgb_model = get_test_performance(
            inner_dict['input_file'], inner_dict['model_file'])
        metrics_results, metrics_df, model_params = validate_best_model(
            xgb_model, X_test, y_test)

        up_to_dataset_dict[label]['metrics_df'] = metrics_df
        up_to_dataset_dict[label]['metrics_results'] = metrics_results
        up_to_dataset_dict[label]['y_test'] = y_test
        up_to_dataset_dict[label]['X_test'] = X_test

# Display name for every dataset label, e.g. 'X' -> 'up_to_X_before_delivery'.
mod_names_up_to = {
    key: 'up_to_{}_before_delivery'.format(key)
    for key in up_to_dataset_dict
}

# In[ ]:

# SET UP FIGURE PARAMETERS ....
###
###    PATHS
###

DATE = datetime.now().strftime('%Y-%m-%d')
ROOT_DATA_DIR = "/dors/capra_lab/users/abraha1/projects/PTB_phenotyping/results/ptb_predict_machine_learning/2019_09_06_2nd_ptb_icd_cpt/equal_sample_size"
OUTPUT_DIR = "/dors/capra_lab/users/abraha1/projects/PTB_phenotyping/scripts/rand_forest_ptb_classification/manuscript/second_delivery"

# Load the input table and fitted model for each dataset, score the model on
# its test split, and keep the precision-recall coordinates per dataset.
# NOTE: despite its name, ``num_weeks`` is substituted into the
# '..._days_before_second_delivery' file names and the '<n>_days' keys.
up_to_dataset_dict = OrderedDict()
for num_weeks in ('0', '90', '365'):
    input_file = os.path.join(
        ROOT_DATA_DIR,
        'eq_up_to_{0}d_before_second_delivery/input_data_eq_samp_size_raw_counts_icd_cpt_up_to_{0}_days_before_second_delivery-2019-09-08.tsv'.format(num_weeks))
    model_file = os.path.join(
        ROOT_DATA_DIR,
        'eq_up_to_{0}d_before_second_delivery/best_xgb_model_eq_samp_size_raw_counts_icd_cpt_up_to_{0}_days_before_second_delivery-2019-09-08.pickle'.format(num_weeks))

    # The first two values returned by unpack_input_data are not used here.
    (_, _, X_test, y_test,
     xgb_model, input_data) = unpack_input_data(input_file, model_file)
    metrics_results, _, _ = validate_best_model(xgb_model, X_test, y_test)
    interp_rc, interp_pr, pr_auc, pos_prop = get_pr_coord(metrics_results, y_test)

    up_to_dataset_dict['{}_days'.format(num_weeks)] = dict(
        interp_rc=interp_rc,
        interp_pr=interp_pr,
        pr_auc=pr_auc,
        pos_prop=pos_prop)

###
###    FUNCTIONS
###

def get_auroc_coords(metric_results):
    # unpack data
    metrics_results = metric_results
    fpr = metrics_results['fpr']
# Collect ROC, precision-recall and F1 summaries for each timepoint model.
timeseries = ['0_weeks', '13_weeks', '28_weeks', '35_weeks', '37_weeks']
roc_dict = {}
pr_dict = {}
f1_score = {}

for timepoint in timeseries:
    results_dir = os.path.join(ICDCPT_DIR, f'{timepoint}_notwins_timeseries_v1')
    # Take the first file matching each pattern inside the results directory.
    input_file = glob.glob(results_dir + "/input_data*.tsv")[0]
    model_file = glob.glob(results_dir + "/best_xgb_model*.pickle")[0]

    # load the model and its input data, then score it on the test split
    (_, _, ehr_X_test, ehr_y_test,
     ehr_xgb_model, ehr_input_data) = unpack_input_data(input_file, model_file)
    ehr_metrics_results, ehr_metrics_df, _ = validate_best_model(
        ehr_xgb_model, ehr_X_test, ehr_y_test)

    # ROC curve coordinates
    ehr_interp_fpr, ehr_interp_tpr, ehr_auc = get_auroc_coords(ehr_metrics_results)
    temp_roc_dict = dict(
        interp_fpr=ehr_interp_fpr,
        interp_tpr=ehr_interp_tpr,
        auc=ehr_auc)
    roc_dict[timepoint] = temp_roc_dict

    # scalar scores copied straight from the metrics results
    f1_score[timepoint] = {
        score_name: ehr_metrics_results[score_name]
        for score_name in ('f1_score', 'pr_score', 'rc_score')
    }

    # precision-recall curve coordinates
    ehr_interp_rc, ehr_interp_pr, ehr_pr_auc, ehr_pos_prop = get_pr_coord(
        ehr_metrics_results, ehr_y_test)
    temp_pr_dict = dict(
        interp_rc=ehr_interp_rc,
        interp_pr=ehr_interp_pr,
        pr_auc=ehr_pr_auc,
        pos_prop=ehr_pos_prop)
    pr_dict[timepoint] = temp_pr_dict

    # NOTE(review): this unconditional break stops after the first timepoint
    # ('0_weeks'); it looks like a debugging shortcut -- confirm before
    # relying on the other timepoints being present in the dictionaries.
    break

roc_dict
# Fitted model pickles: billing-code (EHR) model vs. clinical risk-factor
# model, both named "up_to_28_weeks_since_preg_start".
ehr_model_file = os.path.join(
    RF_DIR,
    'best_xgb_model_up_to_28_weeks_since_preg_start_icd9_cpt_count-2019-06-19.pickle')
riskfx_model_file = os.path.join(
    CLIN_RISK_DIR,
    'best_xgb_model_up_to_28_weeks_since_preg_start_risk_fx-2020-03-24.pickle')

# Load each model with its input data; the first two returned values are
# discarded.
(_, _, ehr_X_test, ehr_y_test,
 ehr_xgb_model, ehr_input_data) = unpack_input_data(ehr_input_file, ehr_model_file)
(_, _, riskfac_X_test, riskfac_y_test,
 riskfac_xgb_model, riskfac_input_data) = unpack_input_data(riskfx_input_file, riskfx_model_file)

# Score both models on their own test splits; the third returned value is
# not needed.
ehr_metrics_results, ehr_metrics_df, _ = validate_best_model(
    ehr_xgb_model, ehr_X_test, ehr_y_test)
riskfac_metrics_results, riskfac_metrics_df, _ = validate_best_model(
    riskfac_xgb_model, riskfac_X_test, riskfac_y_test)

###
###    plot
###

# %%
# figure parameters
sns.set(style='whitegrid', font_scale=1.5, rc={'figure.figsize': (6, 6)})
sns.set_style({'axes.grid': True, 'axes.edgecolor': 'k', 'grid.color': '#e1e1e1'})
fsize = 14
# Paths of the fitted model pickles; per the file names, "cs" is the
# c-section model and "vg" the vaginal-delivery model.
cs_model_file = os.path.join(
    CSEC_DIR,
    'best_xgb_model_csection_up_to_28_weeks_since_preg_start_icd9_cpt_count-2020-04-12.pickle'
)
vg_model_file = os.path.join(
    VG_DIR,
    'best_xgb_model_vaginal_delivery_up_to_28_weeks_since_preg_start_icd9_cpt_count-2020-04-12.pickle'
)

# load models and input files
# (cs_input_file / vg_input_file are defined earlier, outside this excerpt;
# the first two values returned by unpack_input_data are discarded)
_, _, cs_X_test, cs_y_test, cs_xgb_model, cs_input_data = unpack_input_data(
    cs_input_file, cs_model_file)
_, _, vg_X_test, vg_y_test, vg_xgb_model, vg_input_data = unpack_input_data(
    vg_input_file, vg_model_file)

# Score each model on its own test split; the third returned value is unused.
cs_metrics_results, cs_metrics_df, _ = validate_best_model(
    cs_xgb_model, cs_X_test, cs_y_test)
vg_metrics_results, vg_metrics_df, _ = validate_best_model(
    vg_xgb_model, vg_X_test, vg_y_test)

# %%
###
###    plot
###

# plot - PR
# ``mult`` scales both dimensions of the 2.2 x 2.2 'figure.figsize' setting.
mult = 1
sns.set(style='ticks',
        context='paper',
        font_scale=1.0,
        rc={'figure.figsize': (2.2 * mult, 2.2 * mult)})
sns.set_style({