f_history=os.path.join(args.rnn_models_dir, best_model_prefix+'history.json')) clf_models_dict['rnn'] = rnn models = shallow_models + ['rnn'] # get id's of 3 patients with clinical deterioration and 3 different stay lengths chosen_thresh = 192 chosen_stay_ids_df = y_test_df[(y_test_df[args.outcome_column_name]==1)][id_cols].copy().reset_index(drop=True) chosen_stay_ids_df = chosen_stay_ids_df.drop_duplicates(subset=id_cols).reset_index(drop=True) # for each patient get their vitals, labs, demographics labs_df, labs_data_dict, vitals_df, vitals_data_dict, \ demographics_df, demographics_data_dict, outcomes_df, outcomes_data_dict = get_preprocessed_data(args.preproc_data_dir) vitals = parse_feature_cols(vitals_data_dict) labs = parse_feature_cols(labs_data_dict) chosen_stay_labs_df = pd.merge(labs_df, chosen_stay_ids_df, on=id_cols, how='inner') chosen_stay_vitals_df = pd.merge(vitals_df, chosen_stay_ids_df, on=id_cols, how='inner') chosen_stay_highfreq_df = pd.merge(chosen_stay_labs_df, chosen_stay_vitals_df, on = id_cols + ['hours_since_admission', 'timestamp'], how='outer') highfreq_features_dict = merge_data_dicts([labs_data_dict, vitals_data_dict]) highfreq_features_dict['fields'] = highfreq_features_dict['schema']['fields'] # choose a subject chosen_short_stay_subj_list = ['14343967', '18115638', '18826316', '17245153',
# Extend the import path before the imports below; presumably the project
# modules imported here live under src/rnn - TODO confirm.
sys.path.append(os.path.join(PROJECT_REPO_DIR, 'src', 'rnn'))
from feature_transformation import *
from filter_admissions_by_tslice import get_preprocessed_data
from merge_features_all_tslices import merge_data_dicts, get_all_features_data
import argparse
from progressbar import ProgressBar

if __name__ == '__main__':
    # Command-line interface:
    #   --preproc_data_dir     directory handed to get_preprocessed_data
    #   --output_dir           output directory (not used in the visible part)
    #   --include_medications  the literal string 'True' enables medications;
    #                          any other value disables them
    parser = argparse.ArgumentParser()
    parser.add_argument('--preproc_data_dir', type=str)
    parser.add_argument('--output_dir', type=str)
    parser.add_argument('--include_medications', type=str, default='True')
    args = parser.parse_args()

    # Get all the labs, vitals, demographics, medications and outcomes
    # tables together with their data dicts.
    (labs_df, labs_data_dict,
     vitals_df, vitals_data_dict,
     demographics_df, demographics_data_dict,
     medications_df, medications_data_dict,
     outcomes_df, outcomes_data_dict) = get_preprocessed_data(
        args.preproc_data_dir)

    # The flag arrives as a string; only the exact value 'True' turns
    # medications on (same comparison as before).
    include_medications = args.include_medications == 'True'
    if include_medications:
        print('Getting labs, vitals, medications and demographics...')
    else:
        print('Getting labs, vitals and demographics...')

    # Merge everything into a single features table. The call is identical
    # in both cases apart from the trailing boolean, so it is issued once.
    features_df, features_data_dict = get_all_features_data(
        labs_df, labs_data_dict,
        vitals_df, vitals_data_dict,
        demographics_df, demographics_data_dict,
        medications_df, medications_data_dict,
        include_medications)
# import the y dict to get the id cols y_test_dict = load_data_dict_json(y_test_dict_file) id_cols = parse_id_cols(y_test_dict) tslice_folders = os.path.join(args.tslice_folder, 'TSLICE=') collapsed_tslice_folders = os.path.join(args.collapsed_tslice_folder, 'TSLICE=') outcome_col = args.outcome_column_name tslices_list = args.evaluation_tslices.split(' ') y_test_ids_df = y_test_df[id_cols].drop_duplicates( subset=id_cols).reset_index(drop=True) # get demographics csv and data_dict # for each patient get their vitals, labs, demographics _, _, _, _, demographics_df, demographics_data_dict, _, _ = get_preprocessed_data( args.preproc_data_dir) prctile_vals = [5, 50, 95] random_seed_list = args.random_seed_list.split(' ') perf_df = pd.DataFrame() for p, tslice in enumerate(tslices_list): tslice_folder = tslice_folders + tslice collapsed_tslice_folder = collapsed_tslice_folders + tslice # get test set collapsed labs and vitals collapsed_vitals_df = pd.read_csv( os.path.join(collapsed_tslice_folder, 'CollapsedVitalsPerSequence.csv')) collapsed_labs_df = pd.read_csv( os.path.join(collapsed_tslice_folder, 'CollapsedLabsPerSequence.csv')) mews_df = pd.read_csv(
# Feature columns (mask features included) are read from the x data dict.
x_test_dict_file = os.path.join(args.clf_train_test_split_dir, 'x_dict.json')
x_test_dict = load_data_dict_json(x_test_dict_file)
feature_cols_with_mask_features = parse_feature_cols(x_test_dict)

# The y data dict is loaded only to learn which columns identify a sequence.
y_test_dict = load_data_dict_json(y_test_dict_file)
id_cols = parse_id_cols(y_test_dict)

# Path prefix for the per-tslice folders.
tslice_folders = os.path.join(args.tslice_folder, 'TSLICE=')
outcome_col_name = args.outcome_column_name
tslices_list = args.evaluation_tslices.split(' ')

# One row per distinct id combination in the test set.
y_test_ids_df = (
    y_test_df[id_cols]
    .drop_duplicates(subset=id_cols)
    .reset_index(drop=True)
)

# Only the data dicts (labs, vitals, demographics, medications) are kept
# from the preprocessed data; the tables themselves and both outcome
# values are discarded.
(_, labs_data_dict,
 _, vitals_data_dict,
 _, demographics_data_dict,
 _, medications_data_dict,
 _, _) = get_preprocessed_data(args.preproc_data_dir)
time_col = parse_time_col(vitals_data_dict)

# Models are read straight from args.clf_models_dir. A commented-out
# alternative pointed at its 'current_best_model' subfolder instead:
#   os.path.join(args.clf_models_dir, 'current_best_model')
clf_models_dir = args.clf_models_dir

# predict on each tslice
prctile_vals = [5, 50, 95]
random_seed_list = args.random_seed_list.split(' ')
perf_df = pd.DataFrame()

# get training set normalization estimates