def tune_image_hyperparameters(
        data: ImageDataset,
        param_distributions: dict) -> Tuple[List[float], List[dict], int, dict]:
    """
    Performs randomized hyperparameter search for the current hyperparameter
    specification. Evaluates the best model using the test set.
    """
    hyperparameters = Hyperparameters(param_distributions)
    print(f"Number of combinations: {len(hyperparameters.combinations)}")

    configurations = hyperparameters.sample_combinations(RANDOM_SAMPLES)
    configuration_count = len(configurations)
    print(f"Sampled combinations: {configuration_count}")

    results = []
    start_time = time.monotonic()
    for (index, configuration) in enumerate(configurations):
        tuning_io_utils.print_configuration(configuration, index,
                                            configuration_count, start_time)
        result = evaluate_hyperparameters(data, configuration)
        results.append(result)

    # Figure out the index of the configuration that produced the best score.
    scores = [result["val_accuracy"] for result in results]
    best_index = np.argmax(scores)

    # Retrain the best configuration using all the training data and measure
    # accuracy on the test data.
    best_history = train_and_evaluate(data, configurations[best_index])

    return (scores, configurations, best_index, best_history)
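# A minimal sketch of the Hyperparameters helper assumed above: it expands a
# dict of value lists into the full grid of combinations and can draw a random
# subset. The attribute and method names follow the calls in the function;
# the implementation itself is an assumption, not the original class.
import itertools
import random

class Hyperparameters:
    """Hyperparameter grid with random sub-sampling (sketch)."""

    def __init__(self, param_distributions: dict):
        names = list(param_distributions.keys())
        # Cartesian product of all value lists, one dict per combination.
        self.combinations = [
            dict(zip(names, values))
            for values in itertools.product(*param_distributions.values())
        ]

    def sample_combinations(self, n: int) -> list:
        # Sample without replacement, capped at the grid size.
        return random.sample(self.combinations, min(n, len(self.combinations)))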
def main():
    pp = Hyperparameters()

    print('Load data...')
    data = np.load(pp.data_pp_dir + 'data_arrays_' + pp.gender + '.npz')
    df_index_code = feather.read_dataframe(pp.data_pp_dir + 'df_index_code_' + pp.gender + '.feather')
    df_code_cols = feather.read_dataframe(pp.data_pp_dir + 'df_code_cols_' + pp.gender + '.feather')
    cols_list = load_obj(pp.data_pp_dir + 'cols_list.pkl')

    df = pd.DataFrame(data['x'], columns=cols_list)
    df['TIME'] = data['time']
    df['EVENT'] = data['event']
    df = pd.concat([df, df_code_cols], axis=1)

    idx_trn = (data['fold'][:, 0] != 99)
    df_trn = df[idx_trn]
    idx_val = (data['fold'][:, 0] == 99)
    df_val = df[idx_val]

    print('Begin study...')
    study = optuna.create_study(sampler=optuna.samplers.TPESampler(), direction='maximize')
    study.optimize(lambda trial: objective(trial, df_trn, df_val), n_trials=100)

    print('Save...')
    save_obj(study, pp.log_dir + 'cel_study_' + pp.gender + '.pkl')
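# save_obj/load_obj are used throughout these scripts but not defined here;
# a plausible minimal implementation (an assumption) is a thin pickle wrapper:
import pickle

def save_obj(obj, path):
    """Pickle an object to disk."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path):
    """Load a pickled object from disk."""
    with open(path, 'rb') as f:
        return pickle.load(f)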
def main():
    # Load data
    print('Load data...')
    hp = Hyperparameters()
    data = np.load('../' + hp.data_pp_dir + 'data_arrays_' + hp.gender + '.npz')

    print('Use all data for model fitting...')
    x = data['x']
    time = data['time']
    event = data['event']
    cols_list = load_obj('../' + hp.data_pp_dir + 'cols_list.pkl')

    df = pd.DataFrame(x, columns=cols_list)
    df['TIME'] = time
    df['EVENT'] = event

    ###################################################################

    print('Add additional columns...')
    df_index_code = feather.read_dataframe('../' + hp.results_dir + 'hr_addcodes_' + hp.gender + '.feather')
    df_index_code = pd.concat([df_index_code[df_index_code['TYPE'] == 1].head(10),
                               df_index_code[df_index_code['TYPE'] == 0].head(10)], sort=False)
    for index, row in df_index_code.iterrows():
        print(row['DESCRIPTION'])
        df[row['DESCRIPTION']] = (data['codes'] == row['INDEX_CODE']).max(axis=1)
        cols_list = cols_list + [row['DESCRIPTION']]

    ###################################################################

    print('Fitting...')
    cph = CoxPHFitter()
    cph.fit(df, duration_col='TIME', event_col='EVENT', show_progress=True, step_size=0.5)
    cph.print_summary()
    print('done')
def main():
    hp = Hyperparameters()

    df = feather.read_dataframe(hp.data_dir + 'ALL_PHARMS_2008_2012_v3-1.feather')
    df['chem_id'] = df['chem_id'].astype(int)
    df['dispmonth_index'] = df['dispmonth_index'].astype(int)
    df.drop_duplicates(inplace=True)

    print('Remove future data...')
    df = df[df['dispmonth_index'] < 60]

    print('Split males and females...')
    males = feather.read_dataframe(hp.data_pp_dir + 'Py_VARIANZ_2012_v3-1_pp_males.feather')['VSIMPLE_INDEX_MASTER']
    females = feather.read_dataframe(hp.data_pp_dir + 'Py_VARIANZ_2012_v3-1_pp_females.feather')['VSIMPLE_INDEX_MASTER']
    df_males = df.merge(males, how='inner', on='VSIMPLE_INDEX_MASTER')
    df_females = df.merge(females, how='inner', on='VSIMPLE_INDEX_MASTER')

    print('Remove codes associated with less than min_count persons...')
    df_males = df_males[df_males.groupby('chem_id')['VSIMPLE_INDEX_MASTER'].transform('nunique') >= hp.min_count]
    df_females = df_females[df_females.groupby('chem_id')['VSIMPLE_INDEX_MASTER'].transform('nunique') >= hp.min_count]

    print('Code prevalence...')
    info_ph_males = df_males.groupby(['chem_id'])['VSIMPLE_INDEX_MASTER'].nunique().to_frame().reset_index()
    info_ph_males.rename(columns={'VSIMPLE_INDEX_MASTER': 'PREVALENCE'}, inplace=True)
    info_ph_females = df_females.groupby(['chem_id'])['VSIMPLE_INDEX_MASTER'].nunique().to_frame().reset_index()
    info_ph_females.rename(columns={'VSIMPLE_INDEX_MASTER': 'PREVALENCE'}, inplace=True)

    print('Save...')
    info_ph_males.to_feather(hp.data_pp_dir + 'info_ph_males.feather')
    info_ph_females.to_feather(hp.data_pp_dir + 'info_ph_females.feather')

    df_males.sort_values(by=['VSIMPLE_INDEX_MASTER', 'dispmonth_index', 'chem_id'], ascending=True, inplace=True)
    df_males.reset_index(drop=True, inplace=True)
    df_males.to_feather(hp.data_pp_dir + 'PH_pp_males.feather')

    df_females.sort_values(by=['VSIMPLE_INDEX_MASTER', 'dispmonth_index', 'chem_id'], ascending=True, inplace=True)
    df_females.reset_index(drop=True, inplace=True)
    df_females.to_feather(hp.data_pp_dir + 'PH_pp_females.feather')
def main():
    # Load data
    print('Load data...')
    hp = Hyperparameters()
    df_index_code = feather.read_dataframe(hp.data_pp_dir + 'df_index_code_' + hp.gender + '.feather')
    num_embeddings = df_index_code.shape[0]
    means = np.load(hp.data_pp_dir + 'means_' + hp.gender + '.npz')

    print('Add standard columns...')
    if hp.redundant_predictors:
        cols_list = load_obj(hp.data_pp_dir + 'cols_list.pkl')
    else:
        cols_list = hp.reduced_col_list
    num_cols = len(cols_list)

    #######################################################################################################

    print('Compute HRs...')
    # Trained models
    if hp.redundant_predictors:
        tmp = listdir(hp.log_dir + 'all/')
        models = ['all/' + i for i in tmp if '.pt' in i]
    else:
        tmp = listdir(hp.log_dir + 'all_no_redundancies/')
        models = ['all_no_redundancies/' + i for i in tmp if '.pt' in i]
    log_hr_matrix = np.zeros((len(range(30, 75)), len(models)))

    # Neural Net
    num_input = num_cols + 1 if hp.nonprop_hazards else num_cols
    net = NetRNNFinal(num_input, num_embeddings + 1, hp).to(hp.device)  # +1 for zero padding
    net.eval()

    for i in range(len(models)):
        print('HRs for model {}'.format(i))

        # Restore variables from disk
        net.load_state_dict(torch.load(hp.log_dir + models[i], map_location=hp.device))

        # Compute risk for all ages
        for j in tqdm(range(30, 75)):
            with torch.no_grad():
                x_b = torch.zeros((1, num_cols), device=hp.device)
                codes_b = torch.zeros((1, 1), device=hp.device)
                month_b = torch.zeros((1, 1), device=hp.device)
                diagt_b = torch.zeros((1, 1), device=hp.device)
                x_b[0, cols_list.index('nhi_age')] = j - means['mean_age']
                log_hr = net(x_b, codes_b, month_b, diagt_b).detach().cpu().numpy().squeeze()
            # Store
            log_hr_matrix[j - 30, i] = log_hr

    # Compute HRs: the 'HR' column holds the mean log hazard ratio across
    # models; 'diff_hr' is the hazard ratio per additional year of age.
    mean_log_hr = log_hr_matrix.mean(axis=1)
    df = pd.DataFrame({'age': range(30, 75), 'HR': mean_log_hr})
    df['diff_hr'] = np.exp(df['HR'].diff())
    print(df.describe())
def classifier_hyperparameters():
    to_give_neurons = input("Do you want to set neurons in FC layer(y/n) ")
    while to_give_neurons != "y" and to_give_neurons != "n":
        print("")
        print("Invalid answer")
        to_give_neurons = input("Do you want to set neurons in FC layer(y/n) ")

    neurons = 64
    if to_give_neurons == "y":
        print("")
        neurons = int(input("Give number of neurons: "))
        while neurons <= 0:
            print("")
            print("Invalid answer")
            neurons = int(input("Give number of neurons: "))

    print("")
    to_give_dropout = input("Do you want to add a Dropout layer in Flatten layer(y/n) ")
    dropouts = []
    for i in range(2):
        while to_give_dropout != "y" and to_give_dropout != "n":
            print("")
            print("Invalid answer")
            if i == 0:
                to_give_dropout = input("Do you want to add a Dropout layer in Flatten layer(y/n) ")
            else:
                to_give_dropout = input("Do you want to add a Dropout layer in FC layer(y/n) ")
        if to_give_dropout == "n":
            dropouts.append(0.0)
        else:
            dropout_rate = float(input("Give dropout rate of Dropout's layer: "))
            while dropout_rate <= 0.0 or dropout_rate >= 1.0:
                print("")
                print("Invalid answer (should be between 0.0 and 1.0)")
                dropout_rate = float(input("Give dropout rate of Dropout's layer: "))
            dropouts.append(dropout_rate)
        if i == 0:
            print("")
            to_give_dropout = input("Do you want to add a Dropout layer in FC layer(y/n) ")

    print("")
    epochs = input_fns.input_epochs()
    print("")
    batch_size = input_fns.input_batch_size()
    print("")

    return Hyperparameters(0, 0, 0, dropouts, 0, 0, epochs, batch_size, neurons)
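# input_fns is not shown; a sketch of the two helpers used above, mirroring
# the validation loops already in this function (names taken from the calls,
# bodies assumed):

def input_epochs() -> int:
    """Prompt for a positive number of training epochs."""
    epochs = int(input("Give number of epochs: "))
    while epochs <= 0:
        print("\nInvalid answer")
        epochs = int(input("Give number of epochs: "))
    return epochs

def input_batch_size() -> int:
    """Prompt for a positive batch size."""
    batch_size = int(input("Give batch size: "))
    while batch_size <= 0:
        print("\nInvalid answer")
        batch_size = int(input("Give batch size: "))
    return batch_size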
def distillation(model_fname, dataset, n_classes, train_loader, test_loader, n_epochs):
    listed_dir, f = os.path.split(model_fname)
    logging.info(f"Distilling model in {f}")

    model_to_distil, hparams = load_model_and_hyperparameters(f, listed_dir, n_classes)
    distillation_hparams = Hyperparameters(hparams.learning_rate, hparams.weight_decay,
                                           hparams.momentum, hparams.loss_function,
                                           hparams.gradient_method, hparams.model_name,
                                           hparams.scheduler)
    optimizer, scheduler = get_optimizer_and_scheduler(distillation_hparams, model_to_distil, n_epochs)

    # Recover the regularization function and coefficient encoded in the file
    # name (searched on the reversed string, hence the reversed "Regul_" pattern).
    regul_function = f[-findnth_right(f[::-1], "_lugeR", 0):-findnth_left(f[::-1], "_", 0)]
    regul_coefficient = f[-findnth_right(f[::-1], "_", 0):-4]

    fname_origin = RegularizationHyperparameters(hparams.learning_rate, hparams.weight_decay,
                                                 hparams.momentum, hparams.loss_function,
                                                 hparams.gradient_method, hparams.model_name,
                                                 hparams.scheduler, regul_coefficient,
                                                 regul_function).build_name()
    model_origin, hparams_origin = load_model_and_hyperparameters(
        fname_origin + ".run", f"./{dataset}/models/models_regularized", n_classes)

    tensorboard_logdir = distillation_hparams.get_tensorboard_name()
    writer = SummaryWriter(os.path.join(CN.TBOARD, tensorboard_logdir))

    results = train_model_distillation_hinton(model_to_distil, model_origin, CN.DEVICE,
                                              hparams.loss_function, n_epochs,
                                              train_loader, test_loader,
                                              scheduler, optimizer, writer)

    fname = "Distilled_" + f[:-4]
    model_dir = f"./{dataset}/models/models_distilled/"
    results_dir = f"./{dataset}/results/"
    fname_model = fname + ".run"
    fname_results = fname + ".csv"

    logging.info("Saving distilled model " + fname)
    distilled_run = ModelRun(model_to_distil.state_dict(), distillation_hparams)
    torch.save(distilled_run, model_dir + fname_model)
    results.to_csv(results_dir + fname_results)
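# findnth_left/findnth_right are assumed helpers that locate the n-th
# occurrence of a substring; the sketch below is consistent with how they are
# used in the slicing above, but it is an assumption about the originals:

def findnth_left(haystack: str, needle: str, n: int) -> int:
    """Index of the start of the n-th (0-based) occurrence of needle."""
    idx = -1
    for _ in range(n + 1):
        idx = haystack.find(needle, idx + 1)
        if idx == -1:
            raise ValueError("needle occurs fewer than n + 1 times")
    return idx

def findnth_right(haystack: str, needle: str, n: int) -> int:
    """Index just past the end of the n-th occurrence of needle."""
    return findnth_left(haystack, needle, n) + len(needle)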
def main():
    hp = Hyperparameters()

    for gender in ['females', 'males']:
        print(gender)
        data = np.load(hp.data_pp_dir + 'data_arrays_' + gender + '.npz')
        df = feather.read_dataframe(hp.data_pp_dir + 'df_index_person_' + gender + '.feather')
        df_geo = feather.read_dataframe(hp.data_dir + 'Py_VARIANZ_2012_v3-1_GEO.feather')[['VSIMPLE_INDEX_MASTER', 'MB2020_code']]
        df_mb_sa2 = read_ods(hp.data_dir + 'MB_SA2.ods', 1).rename(columns={'MB2020_V1_': 'MB2020_code'}).astype(int)
        df_geo = df_geo.merge(df_mb_sa2, how='left', on='MB2020_code').drop(['MB2020_code'], axis=1)
        df = df.merge(df_geo, how='left', on='VSIMPLE_INDEX_MASTER')

        # load predicted risk
        df['RISK_PERC'] = feather.read_dataframe(hp.results_dir + 'df_cml_' + gender + '.feather')['RISK_PERC']

        # median risk
        print('Median risk: {:.3} IQR: [{:.3}, {:.3}]'.format(
            np.percentile(df['RISK_PERC'].values, 50),
            np.percentile(df['RISK_PERC'].values, 25),
            np.percentile(df['RISK_PERC'].values, 75)))

        # set SA2s with less than 5 people to NaN
        df.loc[df.groupby('SA22020_V1')['VSIMPLE_INDEX_MASTER'].transform('nunique') < 5, 'RISK_PERC'] = np.nan

        # get median risk by SA2
        df = df.groupby('SA22020_V1')['RISK_PERC'].median().reset_index()

        # save
        df.to_csv(hp.results_dir + 'df_sa2_' + gender + '.csv')

        if gender == 'females':
            df_females = df
        else:
            df_males = df

    df = df_females.merge(df_males, on='SA22020_V1', how='inner').dropna()
    corr_coeff, lcl, ucl = corr(df['RISK_PERC_x'].values, df['RISK_PERC_y'].values)
    print("Pearson's correlation: {:.3} [{:.3}, {:.3}]".format(corr_coeff, lcl, ucl))
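# corr is not defined in this file; it evidently returns a Pearson coefficient
# with a 95% confidence interval. A minimal sketch using the Fisher
# z-transform (an assumption about the original helper):
import numpy as np
from scipy import stats

def corr(x: np.ndarray, y: np.ndarray, alpha: float = 0.05):
    """Pearson correlation with an approximate (1 - alpha) CI via Fisher's z."""
    r, _ = stats.pearsonr(x, y)
    z = np.arctanh(r)                       # Fisher z-transform
    se = 1.0 / np.sqrt(len(x) - 3)          # standard error of z
    z_crit = stats.norm.ppf(1 - alpha / 2)
    lcl, ucl = np.tanh(z - z_crit * se), np.tanh(z + z_crit * se)
    return r, lcl, ucl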
def _get_ff_hyperparameters() -> Hyperparameters:
    """Returns hyperparameters used to tune the feed-forward network."""
    # First pass:
    # hyperparameter_values = Hyperparameters({
    #     'learning_rate': [0.1, 0.01, 0.001],
    #     'batch_size': [32, 64, 128],
    #     'optimizer': ['adam', 'sgd']
    # })
    # Best:
    # optimizer: sgd, batch size: 64, learning rate: 0.1

    # Second pass:
    hyperparameter_values = Hyperparameters({
        'learning_rate': [0.05, 0.1, 0.2],
        'batch_size': [16, 32, 64],
        'optimizer': ['sgd']
    })
    # Best:
    # optimizer: sgd, batch size: 16, learning rate: 0.1

    return hyperparameter_values
def _get_cnn_hyperparameters() -> Hyperparameters:
    """Returns hyperparameters used to tune the network."""
    # Spectrograms
    # First pass:
    # hyperparameter_values = Hyperparameters({
    #     'learning_rate': [0.1, 0.01, 0.001],
    #     'batch_size': [32, 64, 128],
    #     'optimizer': ['adam', 'sgd']
    # })
    # Results:
    # optimizer: adam, batch size: 64, learning rate: 0.001
    # Adam with learning rate 0.001 seems to work best, regardless of batch size.

    # Second pass:
    # hyperparameter_values = Hyperparameters({
    #     'learning_rate': [0.001],
    #     'batch_size': [8, 16, 32, 64, 256],
    #     'optimizer': ['adam']
    # })
    # Best:
    # optimizer: adam, batch size: 64, learning rate: 0.001

    # Scaleograms
    # First pass:
    # hyperparameter_values = Hyperparameters({
    #     'learning_rate': [0.1, 0.01, 0.001],
    #     'batch_size': [32, 64, 128],
    #     'optimizer': ['adam', 'sgd']
    # })
    # Results:
    # optimizer: adam, batch size: 32, learning rate: 0.001
    # Adam with learning rate 0.001 seems to work best, regardless of batch size.

    # Second pass:
    hyperparameter_values = Hyperparameters({
        'learning_rate': [0.001],
        'batch_size': [8, 16, 32, 256],
        'optimizer': ['adam']
    })
    # Best:
    # optimizer: adam, batch size: 32, learning rate: 0.001

    return hyperparameter_values
def main():
    pp = Hyperparameters()

    print('Load data...')
    data = np.load(pp.data_pp_dir + 'data_arrays_' + pp.gender + '.npz')
    df_index_code = feather.read_dataframe(pp.data_pp_dir + 'df_index_code_' + pp.gender + '.feather')

    print('Begin study...')
    # study = optuna.create_study(sampler=optuna.samplers.TPESampler(), pruner=optuna.pruners.SuccessiveHalvingPruner())
    study = optuna.create_study(sampler=optuna.samplers.GridSampler({'summarize': ['output_attention']}),
                                pruner=optuna.pruners.NopPruner())
    study.optimize(lambda trial: objective(trial, data, df_index_code), n_trials=1)

    print('Save...')
    save_obj(study, pp.log_dir + 'study_' + pp.gender + '.pkl')
def main():
    pp = Hyperparameters()

    print('Load data...')
    data = np.load(pp.data_pp_dir + 'data_arrays_' + pp.gender + '.npz')
    df_index_code = feather.read_dataframe(pp.data_pp_dir + 'df_index_code_' + pp.gender + '.feather')

    codes = data['codes']
    code_cols = np.zeros((codes.shape[0], df_index_code.shape[0]), dtype=bool)

    print('Codes to columns...')
    for i in tqdm(range(df_index_code.shape[0])):
        code_cols[:, i] = np.bitwise_or.reduce(codes == (i + 1), 1)

    df_code_cols = pd.DataFrame(code_cols, columns=[str(i + 1) for i in range(df_index_code.shape[0])])

    print('Save...')
    df_code_cols.to_feather(pp.data_pp_dir + 'df_code_cols_' + pp.gender + '.feather')
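# For each index code i + 1, the loop above marks persons with at least one
# occurrence; np.bitwise_or.reduce(mask, 1) is equivalent to mask.any(axis=1).
# A fully vectorized alternative (a sketch: it builds a large
# persons x sequence x codes intermediate, so it is only practical when the
# arrays are small):
num_index_codes = df_index_code.shape[0]
code_cols = (codes[:, :, None] == np.arange(1, num_index_codes + 1)[None, None, :]).any(axis=1)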
def main():
    create_directory("saved_runs")
    device = torch.device(args.device)
    hy = Hyperparameters()
    writer = SummaryWriter() if args.tensor_log else None

    sentences = get_sentences(args.train_file)
    test_sentences = get_sentences(args.test_file)
    vocab, reverse_vocab = create_vocab()
    print("Loaded vocab of size {}".format(len(vocab)))

    test_perplexities = []
    for epoch in range(hy.num_epochs):
        model, train_perplexity = train(sentences, vocab, reverse_vocab, hy, writer, device)
        test_perplexity = evaluate(model, test_sentences, vocab, reverse_vocab, hy, writer, device)
        test_perplexities.append(test_perplexity)

    print("=" * 80)
    print("Final Test Perplexity = {:.2f}".format(min(test_perplexities)))
    print("=" * 80)
def main():
    # Load data
    print('Load data...')
    hp = Hyperparameters()
    data = np.load(hp.data_pp_dir + 'data_arrays_' + hp.gender + '.npz')
    means = np.load(hp.data_pp_dir + 'means_' + hp.gender + '.npz')
    x = data['x']
    time = data['time']
    event = data['event']
    cols_list = load_obj(hp.data_pp_dir + 'cols_list.pkl')

    # restore original age and en_nzdep_q before centering
    x[:, cols_list.index('nhi_age')] += means['mean_age']
    x[:, cols_list.index('en_nzdep_q')] += means['mean_nzdep']

    df_cox = pd.DataFrame(x, columns=cols_list)
    df_cox['TIME'] = time
    df_cox['EVENT'] = event
    df_cml = pd.DataFrame(x, columns=cols_list)
    df_cml['TIME'] = time
    df_cml['EVENT'] = event

    # load predicted risk
    lph_matrix_cox = np.zeros((df_cox.shape[0], hp.num_folds))
    lph_matrix_cml = np.zeros((df_cml.shape[0], hp.num_folds))
    for fold in range(hp.num_folds):
        for swap in range(2):
            print('Fold: {} Swap: {}'.format(fold, swap))
            idx = (data['fold'][:, fold] == swap)
            lph_matrix_cox[idx, fold] = feather.read_dataframe(hp.results_dir + 'df_cox_' + hp.gender + '_fold_' + str(fold) + '_' + str(swap) + '.feather')['LPH']
            lph_matrix_cml[idx, fold] = feather.read_dataframe(hp.results_dir + 'df_cml_' + hp.gender + '_fold_' + str(fold) + '_' + str(swap) + '.feather')['LPH']
    df_cox['LPH'] = lph_matrix_cox.mean(axis=1)
    df_cml['LPH'] = lph_matrix_cml.mean(axis=1)

    # remove validation data (exclude validation fold)
    idx = (data['fold'][:, fold] != 99)
    df_cox = df_cox[idx].reset_index(drop=True)
    df_cml = df_cml[idx].reset_index(drop=True)

    es_cox = EvalSurv(df_cox.copy())
    es_cml = EvalSurv(df_cml.copy())
    df_cox['RISK_PERC'] = es_cox.get_risk_perc(1826)
    df_cml['RISK_PERC'] = es_cml.get_risk_perc(1826)

    sex = 'Men' if hp.gender == 'males' else 'Women'

    ################################################################################################

    print('Plot all...')
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 7))
    calibration_plot(df_cox, df_cml, ax[0])
    ax[0].set_title('Calibration: ' + sex)
    discrimination_plot(df_cox, df_cml, ax[1])
    ax[1].set_title('Discrimination: ' + sex)
    plt.tight_layout()
    plt.subplots_adjust(wspace=0.3, hspace=0.3)
    fig.savefig(hp.plots_dir + hp.gender + '_all.png')
    plt.close()

    ################################################################################################

    print('Plot by age...')
    fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(16, 21))
    age_bands = [(30, 45, '30-44 years'), (45, 60, '45-59 years'), (60, 75, '60-74 years')]
    for row, (lo, hi, label) in enumerate(age_bands):
        condition = (df_cox['nhi_age'] >= lo) & (df_cox['nhi_age'] < hi)
        print('Num people: ', sum(condition))
        df_cox_red = df_cox.loc[condition].copy()
        df_cml_red = df_cml.loc[condition].copy()
        calibration_plot(df_cox_red, df_cml_red, ax[row][0])
        ax[row][0].set_title('Calibration: ' + sex + ' ' + label)
        discrimination_plot(df_cox_red, df_cml_red, ax[row][1])
        ax[row][1].set_title('Discrimination: ' + sex + ' ' + label)
    plt.tight_layout()
    plt.subplots_adjust(wspace=0.3, hspace=0.3)
    fig.savefig(hp.plots_dir + hp.gender + '_age.png')
    plt.close()

    ################################################################################################

    print('Plot by ethnicity...')
    fig_cal, ax_cal = plt.subplots(nrows=3, ncols=2, figsize=(16, 21))
    fig_dis, ax_dis = plt.subplots(nrows=3, ncols=2, figsize=(16, 21))
    eth_groups = [
        ((0, 0), df_cox['en_prtsd_eth_2'].astype(bool), 'Maori ' + sex),
        ((0, 1), df_cox['en_prtsd_eth_3'].astype(bool), 'Pacific ' + sex),
        ((1, 0), df_cox['en_prtsd_eth_43'].astype(bool), 'Indian ' + sex),
        ((1, 1), df_cox['en_prtsd_eth_9'].astype(bool), sex + ' of Other Ethnicity'),
        # NZ European: none of the above ethnicity indicators set
        ((2, 0), (~df_cox['en_prtsd_eth_2'].astype(bool)) & (~df_cox['en_prtsd_eth_3'].astype(bool))
                 & (~df_cox['en_prtsd_eth_43'].astype(bool)) & (~df_cox['en_prtsd_eth_9'].astype(bool)),
         'European ' + sex),
    ]
    for (row, col), condition, label in eth_groups:
        print('Num people: ', sum(condition))
        df_cox_red = df_cox.loc[condition].copy()
        df_cml_red = df_cml.loc[condition].copy()
        calibration_plot(df_cox_red, df_cml_red, ax_cal[row][col])
        ax_cal[row][col].set_title('Calibration: ' + label)
        discrimination_plot(df_cox_red, df_cml_red, ax_dis[row][col])
        ax_dis[row][col].set_title('Discrimination: ' + label)
    ax_cal[2, 1].axis('off')
    ax_dis[2, 1].axis('off')
    fig_cal.tight_layout()
    fig_cal.subplots_adjust(wspace=0.3, hspace=0.3)
    fig_cal.savefig(hp.plots_dir + hp.gender + '_ethnicity_calibration.png')
    fig_dis.tight_layout()
    fig_dis.subplots_adjust(wspace=0.3, hspace=0.3)
    fig_dis.savefig(hp.plots_dir + hp.gender + '_ethnicity_discrimination.png')
    plt.close()

    ################################################################################################

    print('Plot by deprivation...')
    fig_cal, ax_cal = plt.subplots(nrows=3, ncols=2, figsize=(16, 21))
    fig_dis, ax_dis = plt.subplots(nrows=3, ncols=2, figsize=(16, 21))
    positions = [(0, 0), (0, 1), (1, 0), (1, 1), (2, 0)]
    for q, (row, col) in zip(range(1, 6), positions):
        condition = (df_cox['en_nzdep_q'].round().astype(int) == q)
        print('Num people: ', sum(condition))
        df_cox_red = df_cox.loc[condition].copy()
        df_cml_red = df_cml.loc[condition].copy()
        calibration_plot(df_cox_red, df_cml_red, ax_cal[row][col])
        ax_cal[row][col].set_title('Calibration: {} Deprivation Q{}'.format(sex, q))
        discrimination_plot(df_cox_red, df_cml_red, ax_dis[row][col])
        ax_dis[row][col].set_title('Discrimination: {} Deprivation Q{}'.format(sex, q))
    ax_cal[2, 1].axis('off')
    ax_dis[2, 1].axis('off')
    fig_cal.tight_layout()
    fig_cal.subplots_adjust(wspace=0.3, hspace=0.3)
    fig_cal.savefig(hp.plots_dir + hp.gender + '_deprivation_calibration.png')
    fig_dis.tight_layout()
    fig_dis.subplots_adjust(wspace=0.3, hspace=0.3)
    fig_dis.savefig(hp.plots_dir + hp.gender + '_deprivation_discrimination.png')
    plt.close()

    ################################################################################################

    print('Plot by medication...')
    fig_cal, ax_cal = plt.subplots(nrows=3, ncols=2, figsize=(16, 21))
    fig_dis, ax_dis = plt.subplots(nrows=3, ncols=2, figsize=(16, 21))
    med_groups = [
        ((0, 0), df_cox['ph_bp_lowering_prior_6mths'].astype(bool), sex + ' with BPL Meds'),
        ((0, 1), ~df_cox['ph_bp_lowering_prior_6mths'].astype(bool), sex + ' without BPL Meds'),
        ((1, 0), df_cox['ph_lipid_lowering_prior_6mths'].astype(bool), sex + ' with LL Meds'),
        ((1, 1), ~df_cox['ph_lipid_lowering_prior_6mths'].astype(bool), sex + ' without LL Meds'),
        ((2, 0), df_cox['ph_antiplat_anticoag_prior_6mths'].astype(bool), sex + ' with APL/AC Meds'),
        ((2, 1), ~df_cox['ph_antiplat_anticoag_prior_6mths'].astype(bool), sex + ' without APL/AC Meds'),
    ]
    for (row, col), condition, label in med_groups:
        print('Num people: ', sum(condition))
        df_cox_red = df_cox.loc[condition].copy()
        df_cml_red = df_cml.loc[condition].copy()
        calibration_plot(df_cox_red, df_cml_red, ax_cal[row][col])
        ax_cal[row][col].set_title('Calibration: ' + label)
        discrimination_plot(df_cox_red, df_cml_red, ax_dis[row][col])
        ax_dis[row][col].set_title('Discrimination: ' + label)
    fig_cal.tight_layout()
    fig_cal.subplots_adjust(wspace=0.3, hspace=0.3)
    fig_cal.savefig(hp.plots_dir + hp.gender + '_medication_calibration.png')
    fig_dis.tight_layout()
    fig_dis.subplots_adjust(wspace=0.3, hspace=0.3)
    fig_dis.savefig(hp.plots_dir + hp.gender + '_medication_discrimination.png')
    plt.close()
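# EvalSurv.get_risk_perc(1826) above converts each log partial hazard into a
# 5-year risk percentage. EvalSurv is not shown; under a proportional-hazards
# model the standard transform is the one sketched below, where base_surv_t,
# the baseline survival S0(t) at the horizon, is assumed to come from
# something like EvalSurv.get_base_surv(1826).
import numpy as np

def risk_perc_at(base_surv_t: float, lph: np.ndarray) -> np.ndarray:
    """Cox PH risk at a fixed horizon t: 100 * (1 - S0(t)^exp(LPH))."""
    return 100.0 * (1.0 - base_surv_t ** np.exp(lph))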
def main():
    # Load data
    print('Load data...')
    hp = Hyperparameters()
    data = np.load(hp.data_pp_dir + 'data_arrays_' + hp.gender + '.npz')
    time = data['time']
    event = data['event']
    df = pd.DataFrame({'TIME': data['time'], 'EVENT': data['event']})

    # baseline survival CML
    # df_cml = df.copy()
    # lph_matrix = np.zeros((df_cml.shape[0], hp.num_folds))
    # for fold in range(hp.num_folds):
    #     for swap in range(2):
    #         print('Fold: {} Swap: {}'.format(fold, swap))
    #         idx = (data['fold'][:, fold] == swap)
    #         if hp.redundant_predictors:
    #             lph_matrix[idx, fold] = feather.read_dataframe(hp.results_dir + 'df_cml_' + hp.gender + '_fold_' + str(fold) + '_' + str(swap) + '.feather')['LPH']
    #         else:
    #             lph_matrix[idx, fold] = feather.read_dataframe(hp.results_dir + 'df_cml_' + hp.gender + '_fold_' + str(fold) + '_' + str(swap) + '_no_redundancies.feather')['LPH']
    # df_cml['LPH'] = lph_matrix.mean(axis=1)
    # idx = (data['fold'][:, fold] != 99)  # exclude validation fold
    # df_cml = df_cml[idx].reset_index(drop=True)
    # es_cml = EvalSurv(df_cml.copy())
    # print('Base survival CML: {:.13}'.format(es_cml.get_base_surv(1826)))
    # return

    # evaluation vectors
    d_index_vec_cox = np.zeros((hp.num_folds, 2))
    r2_vec_cox = np.zeros((hp.num_folds, 2))
    concordance_vec_cox = np.zeros((hp.num_folds, 2))
    ibs_vec_cox = np.zeros((hp.num_folds, 2))
    auc_vec_cox = np.zeros((hp.num_folds, 2))

    d_index_vec_cml = np.zeros((hp.num_folds, 2))
    r2_vec_cml = np.zeros((hp.num_folds, 2))
    concordance_vec_cml = np.zeros((hp.num_folds, 2))
    ibs_vec_cml = np.zeros((hp.num_folds, 2))
    auc_vec_cml = np.zeros((hp.num_folds, 2))

    print('Evaluate on each fold...')
    for fold in range(hp.num_folds):
        for swap in range(2):
            print('Fold: {} Swap: {}'.format(fold, swap))
            idx = (data['fold'][:, fold] == swap)
            df_fold = df[idx].reset_index(drop=True)

            df_cox = df_fold.copy()
            df_cml = df_fold.copy()

            # load log partial hazards
            df_cox['LPH'] = feather.read_dataframe(hp.results_dir + 'df_cox_' + hp.gender + '_fold_' + str(fold) + '_' + str(swap) + '.feather')['LPH']
            if hp.redundant_predictors:
                df_cml['LPH'] = feather.read_dataframe(hp.results_dir + 'df_cml_' + hp.gender + '_fold_' + str(fold) + '_' + str(swap) + '.feather')['LPH']
            else:
                df_cml['LPH'] = feather.read_dataframe(hp.results_dir + 'df_cml_' + hp.gender + '_fold_' + str(fold) + '_' + str(swap) + '_no_redundancies.feather')['LPH']

            ################################################################################################

            es_cox = EvalSurv(df_cox.copy())
            es_cml = EvalSurv(df_cml.copy())

            r2_vec_cox[fold, swap] = es_cox.R_squared_D()
            d_index_vec_cox[fold, swap], _ = es_cox.D_index()
            concordance_vec_cox[fold, swap] = es_cox.concordance_index()
            ibs_vec_cox[fold, swap] = es_cox.integrated_brier_score()
            auc_vec_cox[fold, swap] = es_cox.auc(1826)

            r2_vec_cml[fold, swap] = es_cml.R_squared_D()
            d_index_vec_cml[fold, swap], _ = es_cml.D_index()
            concordance_vec_cml[fold, swap] = es_cml.concordance_index()
            ibs_vec_cml[fold, swap] = es_cml.integrated_brier_score()
            auc_vec_cml[fold, swap] = es_cml.auc(1826)

    print('Save...')
    if hp.redundant_predictors:
        np.savez(hp.results_dir + 'eval_vecs_' + hp.gender + '.npz',
                 r2_vec_cox=r2_vec_cox, d_index_vec_cox=d_index_vec_cox,
                 concordance_vec_cox=concordance_vec_cox, ibs_vec_cox=ibs_vec_cox,
                 auc_vec_cox=auc_vec_cox,
                 r2_vec_cml=r2_vec_cml, d_index_vec_cml=d_index_vec_cml,
                 concordance_vec_cml=concordance_vec_cml, ibs_vec_cml=ibs_vec_cml,
                 auc_vec_cml=auc_vec_cml)
    else:
        np.savez(hp.results_dir + 'eval_vecs_' + hp.gender + '_no_redundancies.npz',
                 r2_vec_cox=r2_vec_cox, d_index_vec_cox=d_index_vec_cox,
                 concordance_vec_cox=concordance_vec_cox, ibs_vec_cox=ibs_vec_cox,
                 auc_vec_cox=auc_vec_cox,
                 r2_vec_cml=r2_vec_cml, d_index_vec_cml=d_index_vec_cml,
                 concordance_vec_cml=concordance_vec_cml, ibs_vec_cml=ibs_vec_cml,
                 auc_vec_cml=auc_vec_cml)
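# EvalSurv.concordance_index() above is Harrell's C. The class internals are
# not shown, so as a reference for what is being computed, here is a minimal
# O(n^2) sketch over the (TIME, EVENT, LPH) columns:
import numpy as np

def harrell_c(time: np.ndarray, event: np.ndarray, lph: np.ndarray) -> float:
    """Fraction of comparable pairs where the subject with the earlier event
    has the higher predicted hazard; prediction ties count as half."""
    concordant, comparable = 0.0, 0
    for i in range(len(time)):
        if not event[i]:
            continue  # pairs are anchored on an observed event
        for j in range(len(time)):
            if time[j] > time[i]:  # j survived past i's event time
                comparable += 1
                if lph[i] > lph[j]:
                    concordant += 1.0
                elif lph[i] == lph[j]:
                    concordant += 0.5
    return concordant / comparable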
def main():
    # Load data
    print('Load data...')
    hp = Hyperparameters()
    df_index_code = feather.read_dataframe(hp.data_pp_dir + 'df_index_code_' + hp.gender + '.feather')

    print('Create list of codes...')
    pharm_lookup = feather.read_dataframe(hp.data_dir + 'CURRENT_VIEW_PHARMS_LOOKUP.feather')
    icd10_lookup = feather.read_dataframe(hp.data_dir + 'CURRENT_ICD10_ALL_LOOKUP.feather')

    pharm_lookup = pharm_lookup[['CHEMICAL_ID', 'CHEMICAL_NAME']]
    pharm_lookup.rename(columns={'CHEMICAL_ID': 'CODE', 'CHEMICAL_NAME': 'DESCRIPTION'}, inplace=True)
    pharm_lookup['CODE'] = pharm_lookup['CODE'].fillna(0).astype(int).astype(str)
    pharm_lookup.drop_duplicates(subset='CODE', inplace=True)
    pharm_lookup['TYPE'] = 0

    icd10_lookup = icd10_lookup[['code', 'code_description']]
    icd10_lookup.rename(columns={'code': 'CODE', 'code_description': 'DESCRIPTION'}, inplace=True)
    icd10_lookup['CODE'] = icd10_lookup['CODE'].astype(str)
    icd10_lookup.drop_duplicates(subset='CODE', inplace=True)
    icd10_lookup['TYPE'] = 1

    print('Get prevalences and most frequent he code type...')
    pharm_lookup['DIAG_TYPE'] = 0
    info_ph = feather.read_dataframe(hp.data_pp_dir + 'info_ph_' + hp.gender + '.feather')
    info_ph.rename(columns={'chem_id': 'CODE'}, inplace=True)
    info_ph['CODE'] = info_ph['CODE'].astype(str)
    pharm_lookup = pharm_lookup.merge(info_ph, how='left', on='CODE')
    info_he = feather.read_dataframe(hp.data_pp_dir + 'info_he_' + hp.gender + '.feather')
    info_he.rename(columns={'CLIN_CD_10': 'CODE'}, inplace=True)
    icd10_lookup = icd10_lookup.merge(info_he, how='left', on='CODE')

    print('Merge with lookup table...')
    lookup = pd.concat([pharm_lookup, icd10_lookup], ignore_index=True, sort=False)
    df_index_code['CODE'] = df_index_code['CODE'].astype(str)
    df_index_code = df_index_code.merge(lookup, how='left', on=['CODE', 'TYPE'])
    num_embeddings = df_index_code.shape[0]
    # Per-embedding diag types, captured before the standard columns are
    # prepended below, so that embedding j still lines up with row j.
    diag_type = df_index_code['DIAG_TYPE'].values

    print('Add standard columns...')
    if hp.redundant_predictors:
        cols_list = load_obj(hp.data_pp_dir + 'cols_list.pkl')
    else:
        cols_list = hp.reduced_col_list
    num_cols = len(cols_list)
    df_cols = pd.DataFrame({'TYPE': 2, 'DESCRIPTION': cols_list})
    df_index_code = pd.concat([df_cols, df_index_code], sort=False)

    #######################################################################################################

    print('Compute HRs...')
    # Trained models
    if hp.redundant_predictors:
        tmp = listdir(hp.log_dir + 'all/')
        models = ['all/' + i for i in tmp if '.pt' in i]
    else:
        tmp = listdir(hp.log_dir + 'all_no_redundancies/')
        models = ['all_no_redundancies/' + i for i in tmp if '.pt' in i]
    log_hr_columns = np.zeros((num_cols, len(models)))
    log_hr_embeddings = np.zeros((num_embeddings, len(models)))

    # Neural Net
    num_input = num_cols + 1 if hp.nonprop_hazards else num_cols
    net = NetRNNFinal(num_input, num_embeddings + 1, hp).to(hp.device)  # +1 for zero padding
    net.eval()

    for i in range(len(models)):
        print('HRs for model {}'.format(i))

        # Restore variables from disk
        net.load_state_dict(torch.load(hp.log_dir + models[i], map_location=hp.device))

        # Baseline risk (all inputs zero)
        with torch.no_grad():
            x_b = torch.zeros((1, num_cols), device=hp.device)
            codes_b = torch.zeros((1, 1), device=hp.device)
            month_b = torch.zeros((1, 1), device=hp.device)
            diagt_b = torch.zeros((1, 1), device=hp.device)
            risk_baseline = net(x_b, codes_b, month_b, diagt_b).detach().cpu().numpy().squeeze()

        # Compute risk for standard columns
        for j in tqdm(range(num_cols)):
            with torch.no_grad():
                x_b = torch.zeros((1, num_cols), device=hp.device)
                codes_b = torch.zeros((1, 1), device=hp.device)
                month_b = torch.zeros((1, 1), device=hp.device)
                diagt_b = torch.zeros((1, 1), device=hp.device)
                x_b[0, j] = 1
                risk_mod = net(x_b, codes_b, month_b, diagt_b).detach().cpu().numpy().squeeze() - risk_baseline
            # Store
            log_hr_columns[j, i] = risk_mod

        # Compute risk for embeddings
        for j in tqdm(range(num_embeddings)):
            with torch.no_grad():
                x_b = torch.zeros((1, num_cols), device=hp.device)
                codes_b = torch.zeros((1, 1), device=hp.device)
                month_b = torch.zeros((1, 1), device=hp.device)
                diagt_b = torch.zeros((1, 1), device=hp.device)
                codes_b[0] = (j + 1)
                diagt_b[0] = diag_type[j]
                risk_mod = net(x_b, codes_b, month_b, diagt_b).detach().cpu().numpy().squeeze() - risk_baseline
            # Store
            log_hr_embeddings[j, i] = risk_mod

    # Compute HRs
    log_hr_matrix = np.concatenate((log_hr_columns, log_hr_embeddings))
    mean_hr = np.exp(log_hr_matrix.mean(axis=1))
    lCI, uCI = np.exp(sms.DescrStatsW(log_hr_matrix.transpose()).tconfint_mean())
    df_index_code['HR'] = mean_hr
    df_index_code['lCI'] = lCI
    df_index_code['uCI'] = uCI

    # Save
    df_index_code.sort_values(by=['TYPE', 'HR'], ascending=False, inplace=True)
    if hp.redundant_predictors:
        df_index_code.to_csv(hp.results_dir + 'hr_addcodes_' + hp.gender + '.csv', index=False)
        df_index_code.reset_index(drop=True).to_feather(hp.results_dir + 'hr_addcodes_' + hp.gender + '.feather')
    else:
        df_index_code.to_csv(hp.results_dir + 'hr_addcodes_' + hp.gender + '_no_redundancies.csv', index=False)
        df_index_code.reset_index(drop=True).to_feather(hp.results_dir + 'hr_addcodes_' + hp.gender + '_no_redundancies.feather')
def main():
    hp = Hyperparameters()

    df = feather.read_dataframe(hp.data_dir + 'HX_ADM_2008_2012_v3-1.feather')
    df.rename(columns={'eventmonth_index': 'dispmonth_index'}, inplace=True)
    df['dispmonth_index'] = df['dispmonth_index'].astype(int)
    df.drop_duplicates(inplace=True)

    print('Remove future data...')
    df = df[df['dispmonth_index'] < 60]

    print('Replace DIAG_TYP with numerical values...')
    df.rename(columns={'DIAG_TYP': 'DIAG_TYPE'}, inplace=True)
    df['DIAG_TYPE'] = df['DIAG_TYPE'].replace({'A': 1, 'B': 2, 'E': 3, 'O': 4})

    print('Split males and females...')
    males = feather.read_dataframe(hp.data_pp_dir + 'Py_VARIANZ_2012_v3-1_pp_males.feather')['VSIMPLE_INDEX_MASTER']
    females = feather.read_dataframe(hp.data_pp_dir + 'Py_VARIANZ_2012_v3-1_pp_females.feather')['VSIMPLE_INDEX_MASTER']
    df_males = df.merge(males, how='inner', on='VSIMPLE_INDEX_MASTER')
    df_females = df.merge(females, how='inner', on='VSIMPLE_INDEX_MASTER')

    print('Remove codes associated with less than min_count persons...')
    df_males = df_males[df_males.groupby('CLIN_CD_10')['VSIMPLE_INDEX_MASTER'].transform('nunique') >= hp.min_count]
    df_females = df_females[df_females.groupby('CLIN_CD_10')['VSIMPLE_INDEX_MASTER'].transform('nunique') >= hp.min_count]

    print('Code prevalence and most frequent diag type...')
    info_he_males = df_males.groupby(['CLIN_CD_10'])[['VSIMPLE_INDEX_MASTER', 'DIAG_TYPE']]
    info_he_males = info_he_males.agg({
        'VSIMPLE_INDEX_MASTER': lambda x: x.nunique(),
        'DIAG_TYPE': lambda x: pd.Series.mode(x)[0]
    }).reset_index()
    info_he_males.rename(columns={'VSIMPLE_INDEX_MASTER': 'PREVALENCE'}, inplace=True)

    info_he_females = df_females.groupby(['CLIN_CD_10'])[['VSIMPLE_INDEX_MASTER', 'DIAG_TYPE']]
    info_he_females = info_he_females.agg({
        'VSIMPLE_INDEX_MASTER': lambda x: x.nunique(),
        'DIAG_TYPE': lambda x: pd.Series.mode(x)[0]
    }).reset_index()
    info_he_females.rename(columns={'VSIMPLE_INDEX_MASTER': 'PREVALENCE'}, inplace=True)

    print('Save...')
    info_he_males.to_feather(hp.data_pp_dir + 'info_he_males.feather')
    info_he_females.to_feather(hp.data_pp_dir + 'info_he_females.feather')

    df_males.sort_values(by=['VSIMPLE_INDEX_MASTER', 'dispmonth_index', 'CLIN_CD_10'], ascending=True, inplace=True)
    df_males.reset_index(drop=True, inplace=True)
    df_males.to_feather(hp.data_pp_dir + 'HE_pp_males.feather')

    df_females.sort_values(by=['VSIMPLE_INDEX_MASTER', 'dispmonth_index', 'CLIN_CD_10'], ascending=True, inplace=True)
    df_females.reset_index(drop=True, inplace=True)
    df_females.to_feather(hp.data_pp_dir + 'HE_pp_females.feather')
def main():
    hp = Hyperparameters()

    print('Load data...')
    data = np.load(hp.data_pp_dir + 'data_arrays_' + hp.gender + '.npz')
    df_index_code = feather.read_dataframe(hp.data_pp_dir + 'df_index_code_' + hp.gender + '.feather')

    print('Test on each fold...')
    for fold in range(hp.num_folds):
        for swap in range(2):
            print('Fold: {} Swap: {}'.format(fold, swap))
            idx = (data['fold'][:, fold] == swap)
            x = data['x'][idx]
            codes = data['codes'][idx]
            month = data['month'][idx]
            diagt = data['diagt'][idx]

            if not hp.redundant_predictors:
                cols_list = load_obj(hp.data_pp_dir + 'cols_list.pkl')
                x = x[:, [cols_list.index(i) for i in hp.reduced_col_list]]

            #######################################################################################################

            print('Create data loaders and tensors...')
            dataset = utils.TensorDataset(torch.from_numpy(x), torch.from_numpy(codes),
                                          torch.from_numpy(month), torch.from_numpy(diagt))

            # Create batch queues
            loader = utils.DataLoader(dataset, batch_size=hp.batch_size, shuffle=False, drop_last=False)

            # Neural Net
            net = NetRNNFinal(x.shape[1], df_index_code.shape[0] + 1, hp).to(hp.device)  # +1 for zero padding
            net.eval()

            # Trained models (trained on the swapped half of the fold)
            if hp.redundant_predictors:
                tmp = listdir(hp.log_dir + 'fold_' + str(fold) + '_' + str(1 - swap) + '/')
                models = ['fold_' + str(fold) + '_' + str(1 - swap) + '/' + i for i in tmp if '.pt' in i]
            else:
                tmp = listdir(hp.log_dir + 'fold_' + str(fold) + '_' + str(1 - swap) + '_no_redundancies/')
                models = ['fold_' + str(fold) + '_' + str(1 - swap) + '_no_redundancies/' + i for i in tmp if '.pt' in i]

            lph_matrix = np.zeros((x.shape[0], len(models)))
            for i in range(len(models)):
                print('Model {}'.format(models[i]))

                # Restore variables from disk
                net.load_state_dict(torch.load(hp.log_dir + models[i], map_location=hp.device))

                # Prediction
                log_partial_hazard = np.array([])
                print('Computing partial hazard for test data...')
                with torch.no_grad():
                    for _, (x_b, codes_b, month_b, diagt_b) in enumerate(tqdm(loader)):
                        x_b, codes_b, month_b, diagt_b = x_b.to(hp.device), codes_b.to(hp.device), month_b.to(hp.device), diagt_b.to(hp.device)
                        log_partial_hazard = np.append(log_partial_hazard, net(x_b, codes_b, month_b, diagt_b).detach().cpu().numpy())
                lph_matrix[:, i] = log_partial_hazard

            print('Create dataframe...')
            df_cml = pd.DataFrame(lph_matrix, columns=models)
            df_cml['LPH'] = lph_matrix.mean(axis=1)

            print('Saving log proportional hazards for fold...')
            if hp.redundant_predictors:
                df_cml.to_feather(hp.results_dir + 'df_cml_' + hp.gender + '_fold_' + str(fold) + '_' + str(swap) + '.feather')
            else:
                df_cml.to_feather(hp.results_dir + 'df_cml_' + hp.gender + '_fold_' + str(fold) + '_' + str(swap) + '_no_redundancies.feather')
def main():
    # Load data
    print('Load data...')
    hp = Hyperparameters()
    data = np.load(hp.data_pp_dir + 'data_arrays_' + hp.gender + '.npz')

    print('Use all data for model fitting...')
    x = data['x']
    time = data['time']
    event = data['event']
    cols_list = load_obj(hp.data_pp_dir + 'cols_list.pkl')

    df = pd.DataFrame(x, columns=cols_list)
    df['TIME'] = time
    df['EVENT'] = event

    ###################################################################

    print('Fitting all data...')
    cph = CoxPHFitter()
    cph.fit(df, duration_col='TIME', event_col='EVENT', show_progress=True, step_size=0.5)
    cph.print_summary()

    print('Saving...')
    df_summary = cph.summary
    df_summary['PREDICTOR'] = cols_list
    df_summary.to_csv(hp.results_dir + 'hr_' + hp.gender + '.csv', index=False)

    ###################################################################

    print('Test on each fold (train on swapped)...')
    for fold in range(hp.num_folds):
        for swap in range(2):
            print('Fold: {} Swap: {}'.format(fold, swap))

            # train on the complementary half of the fold
            idx = (data['fold'][:, fold] == (1 - swap))
            x = data['x'][idx]
            time = data['time'][idx]
            event = data['event'][idx]

            df = pd.DataFrame(x, columns=cols_list)
            df['TIME'] = time
            df['EVENT'] = event

            print('Fitting fold data...')
            cph = CoxPHFitter()
            cph.fit(df, duration_col='TIME', event_col='EVENT', show_progress=True, step_size=0.5)
            print('done')

            # predict log partial hazards on the held-out half
            idx = (data['fold'][:, fold] == swap)
            x = data['x'][idx]
            df_cox = pd.DataFrame({'LPH': np.dot(x - cph._norm_mean.values, cph.params_)})

            print('Saving log proportional hazards for fold...')
            df_cox.to_feather(hp.results_dir + 'df_cox_' + hp.gender + '_fold_' + str(fold) + '_' + str(swap) + '.feather')
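# The manual dot product above reproduces lifelines' log partial hazard
# (lifelines centers covariates by the training mean internally). An
# equivalent cross-check through the public API, shown here as an aside
# rather than part of the original script:
df_held_out = pd.DataFrame(x, columns=cols_list)
lph = cph.predict_log_partial_hazard(df_held_out)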
#!/usr/bin/env python
import logging, pickle
from hyperparameters import Hyperparameters

if __name__ == "__main__":
    hyperparameters = Hyperparameters("language-model.cfg")

    import os.path, os
    # Setting up a log file. This is handy to follow progress during
    # the program's execution without resorting to printing to stdout.
    logfile = os.path.join(hyperparameters.run_dir, hyperparameters.logfile)
    verboselogfile = os.path.join(hyperparameters.run_dir, hyperparameters.verboselogfile)
    logging.basicConfig(filename=logfile, filemode="w", level=logging.DEBUG)
    print("Logging to %s, and creating link %s" % (logfile, verboselogfile))

    try:
        logging.info("Trying to read training state from %s..." % hyperparameters.run_dir)
        filename = os.path.join(hyperparameters.run_dir, "trainstate.pkl")
        with open(filename, 'rb') as f:
            saved_state = pickle.load(f)
        # first three elements: corpus, dictionary, and hyperparameter state
        corpus_state, dictionary_state, hyperparameters = saved_state[0:3]

        from lexicon import Corpus, Dictionary
        corpus = Corpus(*corpus_state)
        dictionary = Dictionary(*dictionary_state)

        from state import TrainingState
        trainstate = TrainingState(corpus, dictionary, hyperparameters)
    except IOError:
        # No saved training state was found; the branch that builds a fresh
        # corpus, dictionary, and TrainingState is truncated in this excerpt.
        logging.info("No training state found in %s." % hyperparameters.run_dir)
        raise
        # (Fragment: tail of the latent-space sampling loop inside generatePDFs;
        #  x, y, i, and nn are defined earlier in the truncated part.)
        point = np.array([x, y], dtype=np.float32)
        sample = sampleLatentSpace(cfg, nn, point)
        plt.imshow(sample, cmap='Greys')
        plt.title("figure {}; Latent Space Vector ({},{})".format(i, round(x, 3), round(y, 3)))
        # build the output path portably
        plt.savefig(os.path.join(cfg.savePDFLocation, str(i)))
        plt.clf()
    print("Results saved in {}".format(cfg.savePDFLocation))


if __name__ == "__main__":
    cfg = Config()
    cfg.getArgs()
    hyp = Hyperparameters()

    data, balanceTracker = getDataArray(cfg.dataPath, cfg)
    trainingData = shuffleData(data)

    net = NeralNet(hyp)
    trainer = Trainer(cfg, trainingData)
    trainer.train(net, hyp)
    if trainer.trainingLoss[-1] > 20300:
        print(">>Network Stuck in Local Minimum, Please Re-run to get Proper Results")

    generatePDFs(net, cfg, hyp, trainer)
def main(): hp = Hyperparameters() # Load data #df = feather.read_dataframe(hp.data_dir + 'Py_VARIANZ_2012_v3-1.feather') df = pd.read_feather(hp.data_dir + 'Py_VARIANZ_2012_v3-1.feather') # Exclude df = df[~df['ph_loopdiuretics_prior_5yrs_3evts'].astype(bool)] df = df[~df['ph_antianginals_prior_5yrs_3evts'].astype(bool)] df.dropna(subset=['end_fu_date'], inplace=True) # Adjust data types df['nhi_age'] = df['nhi_age'].astype(int) df['gender_code'] = df['gender_code'].astype(bool) df['en_prtsd_eth'] = df['en_prtsd_eth'].astype(int) df['en_nzdep_q'] = df['en_nzdep_q'].astype(int) df['hx_vdr_diabetes'] = df['hx_vdr_diabetes'].astype(bool) df['hx_af'] = df['hx_af'].astype(bool) df['ph_bp_lowering_prior_6mths'] = df['ph_bp_lowering_prior_6mths'].astype( bool) df['ph_lipid_lowering_prior_6mths'] = df[ 'ph_lipid_lowering_prior_6mths'].astype(bool) df['ph_anticoagulants_prior_6mths'] = df[ 'ph_anticoagulants_prior_6mths'].astype(bool) df['ph_antiplatelets_prior_6mths'] = df[ 'ph_antiplatelets_prior_6mths'].astype(bool) df['out_broad_cvd_adm_date'] = pd.to_datetime(df['out_broad_cvd_adm_date'], format='%Y-%m-%d', errors='coerce') df['end_fu_date'] = pd.to_datetime(df['end_fu_date'], format='%Y-%m-%d', errors='coerce') # Map Other Asian, Chinese, MELAA to 'other' df['en_prtsd_eth'].replace({4: 9, 42: 9, 5: 9}, inplace=True) # Create antiplatelet/anticoagulant column df['ph_antiplat_anticoag_prior_6mths'] = df[ 'ph_antiplatelets_prior_6mths'] | df['ph_anticoagulants_prior_6mths'] # Time to event and binary event column df['EVENT_DATE'] = df[['out_broad_cvd_adm_date', 'end_fu_date']].min(axis=1) beginning = pd.to_datetime({'year': [2012], 'month': [12], 'day': [31]})[0] df['TIME'] = (df['EVENT_DATE'] - beginning).dt.days.astype(int) df['EVENT'] = df['out_broad_cvd'] | df['imp_fatal_cvd'] # Descriptive statistics num_participants = len(df.index) print('Total participants: {}'.format(num_participants)) num_males = len(df.loc[df['gender_code']].index) num_females = len(df.loc[~df['gender_code']].index) print('Men: {} ({:.1f}%)'.format(num_males, 100 * num_males / num_participants)) print('Women: {} ({:.1f}%)'.format(num_females, 100 * num_females / num_participants)) mean_age_males, std_age_males = df.loc[ df['gender_code'], 'nhi_age'].mean(), df.loc[df['gender_code'], 'nhi_age'].std() mean_age_females, std_age_females = df.loc[ ~df['gender_code'], 'nhi_age'].mean(), df.loc[~df['gender_code'], 'nhi_age'].std() print('Age Men: {:.1f} ({:.1f})'.format(mean_age_males, std_age_males)) print('Age Women: {:.1f} ({:.1f})'.format(mean_age_females, std_age_females)) num_nze_males = (df.loc[df['gender_code'], 'en_prtsd_eth'] == 1).sum() num_nze_females = (df.loc[~df['gender_code'], 'en_prtsd_eth'] == 1).sum() print('NZE Men: {} ({:.1f}%)'.format(num_nze_males, 100 * num_nze_males / num_males)) print('NZE Women: {} ({:.1f}%)'.format(num_nze_females, 100 * num_nze_females / num_females)) num_maori_males = (df.loc[df['gender_code'], 'en_prtsd_eth'] == 2).sum() num_maori_females = (df.loc[~df['gender_code'], 'en_prtsd_eth'] == 2).sum() print('Maori Men: {} ({:.1f}%)'.format(num_maori_males, 100 * num_maori_males / num_males)) print('Maori Women: {} ({:.1f}%)'.format( num_maori_females, 100 * num_maori_females / num_females)) num_pacific_males = (df.loc[df['gender_code'], 'en_prtsd_eth'] == 3).sum() num_pacific_females = (df.loc[~df['gender_code'], 'en_prtsd_eth'] == 3).sum() print('Pacific Men: {} ({:.1f}%)'.format( num_pacific_males, 100 * num_pacific_males / num_males)) print('Pacific Women: {} ({:.1f}%)'.format( 
                                         num_pacific_females, 100 * num_pacific_females / num_females))

    # The remaining ethnicity, deprivation, and history/medication breakdowns
    # all follow the same count-and-percentage pattern, so use one helper.
    # df['gender_code'] is True for males.
    is_male = df['gender_code']

    def print_rate(label, col, value=None):
        # Count people meeting a condition (binary column, or column == value)
        # and print the count and percentage for men and women.
        if value is None:
            n_males = df.loc[is_male, col].sum()
            n_females = df.loc[~is_male, col].sum()
        else:
            n_males = (df.loc[is_male, col] == value).sum()
            n_females = (df.loc[~is_male, col] == value).sum()
        print('{} Men: {} ({:.1f}%)'.format(label, n_males, 100 * n_males / num_males))
        print('{} Women: {} ({:.1f}%)'.format(label, n_females, 100 * n_females / num_females))

    print_rate('Indian', 'en_prtsd_eth', 43)
    print_rate('Other', 'en_prtsd_eth', 9)

    # Deprivation quintiles 1..5.
    for quintile in range(1, 6):
        print_rate('dp{}'.format(quintile), 'en_nzdep_q', quintile)

    # Medical history and prior medications.
    print_rate('Diabetes', 'hx_vdr_diabetes')
    print_rate('AF', 'hx_af')
    print_rate('BP', 'ph_bp_lowering_prior_6mths')
    print_rate('LL', 'ph_lipid_lowering_prior_6mths')
    print_rate('APAC', 'ph_antiplat_anticoag_prior_6mths')

    # Follow-up: total and mean person-years (TIME is in days).
    follow_up_males = df.loc[is_male, 'TIME'].sum() / 365
    follow_up_males_mean = df.loc[is_male, 'TIME'].mean() / 365
    follow_up_females = df.loc[~is_male, 'TIME'].sum() / 365
    follow_up_females_mean = df.loc[~is_male, 'TIME'].mean() / 365
    print('Follow up Men: {:.0f} ({:.1f})'.format(follow_up_males, follow_up_males_mean))
    print('Follow up Women: {:.0f} ({:.1f})'.format(follow_up_females, follow_up_females_mean))

    print_rate('CVD death', 'imp_fatal_cvd')
    print_rate('CVD event', 'EVENT')

    # Median (IQR) time to CVD event, in years.
    tmp_males = df.loc[is_male & df['EVENT'], 'TIME'] / 365
    tmp_females = df.loc[~is_male & df['EVENT'], 'TIME'] / 365
    print('Time to CVD Men: {:.1f} ({:.1f}, {:.1f})'.format(
        tmp_males.median(), tmp_males.quantile(0.25), tmp_males.quantile(0.75)))
    print('Time to CVD Women: {:.1f} ({:.1f}, {:.1f})'.format(
        tmp_females.median(), tmp_females.quantile(0.25), tmp_females.quantile(0.75)))

    # Administrative censoring at five years (1826 days).
    num_censored_5y_males = (1 - df.loc[is_male & (df['TIME'] == 1826), 'EVENT']).sum()
    num_censored_5y_females = (1 - df.loc[~is_male & (df['TIME'] == 1826), 'EVENT']).sum()
    print('Censored at 5 years Men: {} ({:.1f}%)'.format(
        num_censored_5y_males, 100 * num_censored_5y_males / num_males))
    print('Censored at 5 years Women: {} ({:.1f}%)'.format(
        num_censored_5y_females, 100 * num_censored_5y_females / num_females))

    # Center age and deprivation index, separately for males and females
    mean_age_males = df.loc[is_male, 'nhi_age'].mean()
    mean_age_females = df.loc[~is_male, 'nhi_age'].mean()
    df.loc[is_male, 'nhi_age'] = df.loc[is_male, 'nhi_age'] - mean_age_males
    df.loc[~is_male, 'nhi_age'] = df.loc[~is_male, 'nhi_age'] - mean_age_females
    mean_nzdep_males = 3
    mean_nzdep_females = 3
    df.loc[is_male, 'en_nzdep_q'] = df.loc[is_male, 'en_nzdep_q'] - mean_nzdep_males
    df.loc[~is_male, 'en_nzdep_q'] = df.loc[~is_male, 'en_nzdep_q'] - mean_nzdep_females

    # Create interaction columns
    df['age_X_bp'] = df['nhi_age'] * df['ph_bp_lowering_prior_6mths']
    df['age_X_diabetes'] = df['nhi_age'] * df['hx_vdr_diabetes']
    df['age_X_af'] = df['nhi_age'] * df['hx_af']
    df['bp_X_diabetes'] = df['ph_bp_lowering_prior_6mths'] & df['hx_vdr_diabetes']
    df['antiplat_anticoag_X_diabetes'] = df['ph_antiplat_anticoag_prior_6mths'] & df['hx_vdr_diabetes']
    df['bp_X_lipid'] = df['ph_bp_lowering_prior_6mths'] & df['ph_lipid_lowering_prior_6mths']

    # Keep all VARIANZ risk equations columns
    keep_cols = [
        'VSIMPLE_INDEX_MASTER', 'nhi_age', 'gender_code', 'en_prtsd_eth',
        'en_nzdep_q', 'hx_vdr_diabetes', 'hx_af', 'ph_bp_lowering_prior_6mths',
        'ph_lipid_lowering_prior_6mths', 'ph_antiplat_anticoag_prior_6mths',
        'age_X_bp', 'age_X_diabetes', 'age_X_af', 'bp_X_diabetes',
        'antiplat_anticoag_X_diabetes', 'bp_X_lipid', 'TIME', 'EVENT'
    ]
    df = df[keep_cols]

    # Save
    df_males = df[df['gender_code']]
    df_males.reset_index(drop=True, inplace=True)
    df_males.to_feather(hp.data_pp_dir + 'Py_VARIANZ_2012_v3-1_pp_males.feather')
    np.savez(hp.data_pp_dir + 'means_males.npz', mean_age=mean_age_males, mean_nzdep=mean_nzdep_males)

    df_females = df[~df['gender_code']]
    df_females.reset_index(drop=True, inplace=True)
    df_females.to_feather(hp.data_pp_dir + 'Py_VARIANZ_2012_v3-1_pp_females.feather')
    np.savez(hp.data_pp_dir + 'means_females.npz', mean_age=mean_age_females, mean_nzdep=mean_nzdep_females)
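# Minimal sketch (not part of the original pipeline): how the centering
# constants saved in means_{gender}.npz above would be applied to new data at
# scoring time. `apply_centering` and `new_df` are hypothetical names.
import numpy as np

def apply_centering(new_df, means_path):
    # Load the per-gender means written by np.savez above and subtract them,
    # mirroring the in-place centering done during preprocessing.
    means = np.load(means_path)
    new_df['nhi_age'] = new_df['nhi_age'] - float(means['mean_age'])
    new_df['en_nzdep_q'] = new_df['en_nzdep_q'] - int(means['mean_nzdep'])
    return new_df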
def objective(trial, data, df_index_code):
    hp = Hyperparameters(trial)
    print(trial.params)

    # Train/validation split: fold label 99 marks the held-out validation set.
    idx_trn = (data['fold'] != 99)
    x_trn = data['x'][idx_trn]
    time_trn = data['time'][idx_trn]
    event_trn = data['event'][idx_trn]
    codes_trn = data['codes'][idx_trn]
    month_trn = data['month'][idx_trn]
    diagt_trn = data['diagt'][idx_trn]

    idx_val = (data['fold'] == 99)
    x_val = data['x'][idx_val]
    time_val = data['time'][idx_val]
    event_val = data['event'][idx_val]
    codes_val = data['codes'][idx_val]
    month_val = data['month'][idx_val]
    diagt_val = data['diagt'][idx_val]

    # Could move this outside the objective function for efficiency.
    sort_idx_trn, case_idx_trn, max_idx_control_trn = sort_and_case_indices(x_trn, time_trn, event_trn)
    sort_idx_val, case_idx_val, max_idx_control_val = sort_and_case_indices(x_val, time_val, event_val)

    x_trn, time_trn, event_trn = x_trn[sort_idx_trn], time_trn[sort_idx_trn], event_trn[sort_idx_trn]
    codes_trn, month_trn, diagt_trn = codes_trn[sort_idx_trn], month_trn[sort_idx_trn], diagt_trn[sort_idx_trn]
    x_val, time_val, event_val = x_val[sort_idx_val], time_val[sort_idx_val], event_val[sort_idx_val]
    codes_val, month_val, diagt_val = codes_val[sort_idx_val], month_val[sort_idx_val], diagt_val[sort_idx_val]

    print('Create data loaders and tensors...')
    case_trn = utils.TensorDataset(torch.from_numpy(x_trn[case_idx_trn]),
                                   torch.from_numpy(time_trn[case_idx_trn]),
                                   torch.from_numpy(max_idx_control_trn),
                                   torch.from_numpy(codes_trn[case_idx_trn]),
                                   torch.from_numpy(month_trn[case_idx_trn]),
                                   torch.from_numpy(diagt_trn[case_idx_trn]))
    case_val = utils.TensorDataset(torch.from_numpy(x_val[case_idx_val]),
                                   torch.from_numpy(time_val[case_idx_val]),
                                   torch.from_numpy(max_idx_control_val),
                                   torch.from_numpy(codes_val[case_idx_val]),
                                   torch.from_numpy(month_val[case_idx_val]),
                                   torch.from_numpy(diagt_val[case_idx_val]))

    x_trn, x_val = torch.from_numpy(x_trn), torch.from_numpy(x_val)
    time_trn, time_val = torch.from_numpy(time_trn), torch.from_numpy(time_val)
    event_trn, event_val = torch.from_numpy(event_trn), torch.from_numpy(event_val)
    codes_trn, codes_val = torch.from_numpy(codes_trn), torch.from_numpy(codes_val)
    month_trn, month_val = torch.from_numpy(month_trn), torch.from_numpy(month_val)
    diagt_trn, diagt_val = torch.from_numpy(diagt_trn), torch.from_numpy(diagt_val)

    # Create batch queues
    trn_loader = utils.DataLoader(case_trn, batch_size=hp.batch_size, shuffle=True, drop_last=True)
    val_loader = utils.DataLoader(case_val, batch_size=hp.batch_size, shuffle=False, drop_last=False)

    print('Train...')
    # Neural net: one model file per Optuna trial.
    hp.model_name = str(trial.number) + '_' + hp.model_name
    num_input = x_trn.shape[1] + 1 if hp.nonprop_hazards else x_trn.shape[1]
    net = NetRNNFinal(num_input, df_index_code.shape[0] + 1, hp).to(hp.device)  # +1 for zero padding
    criterion = CoxPHLoss().to(hp.device)
    optimizer = optim.Adam(net.parameters(), lr=hp.learning_rate)

    best, num_bad_epochs = 100., 0
    for epoch in range(1000):
        trn(trn_loader, x_trn, codes_trn, month_trn, diagt_trn, net, criterion, optimizer, hp)
        loss_val = val(val_loader, x_val, codes_val, month_val, diagt_val, net, criterion, epoch, hp)
        # Early stopping on validation loss.
        if loss_val < best:
            print('############### Saving good model ###############################')
            torch.save(net.state_dict(), hp.log_dir + hp.model_name)
            best = loss_val
            num_bad_epochs = 0
        else:
            num_bad_epochs += 1
            if num_bad_epochs == hp.patience:
                break
        # Pruning: let Optuna terminate unpromising trials early.
        trial.report(best, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()
    print('Done')
    return best
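# Sketch of how this objective would be driven by an Optuna study. The study
# direction is 'minimize' because the objective returns the best validation
# loss; `run_study`, the pruner choice, and the trial count are assumptions,
# not taken from the original scripts.
def run_study(data, df_index_code, n_trials=100):
    import optuna
    study = optuna.create_study(direction='minimize',
                                pruner=optuna.pruners.MedianPruner())
    study.optimize(lambda trial: objective(trial, data, df_index_code),
                   n_trials=n_trials)
    return study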
def _restore_hp(self):
    self.hyperparameters = Hyperparameters.from_csv(self.path)
help="Name of experiment", default="") parser.add_argument("--batch_size", type=int, help="batch size", default=1) parser.add_argument("--epochs", type=int, help="number of training epochs", default=10) parser.add_argument("--embedding_size", type=int, help="embedding size", default=500) parser.add_argument("--hidden_size", type=int, help="RNN size", default=512) parser.add_argument("--lr", type=float, help="Learning rate", default=1e-3) parser.add_argument("--bidirectional", type=bool, help="Bidirectional RNN", default=False) parser.add_argument("--num_rnn_layers", type=int, help="# RNN Layers", default=2) args = parser.parse_args() writer = SummaryWriter(args.experiment_name) hyperparameters = Hyperparameters(args) main()
def main():
    hp = Hyperparameters()

    print('Load data...')
    data = np.load(hp.data_pp_dir + 'data_arrays_' + hp.gender + '.npz')
    df_index_code = feather.read_dataframe(hp.data_pp_dir + 'df_index_code_' + hp.gender + '.feather')

    print('Train on each fold...')
    for fold in range(hp.num_folds):
        for swap in range(2):
            print('Fold: {} Swap: {}'.format(fold, swap))

            idx = (data['fold'][:, fold] == swap)
            x = data['x'][idx]
            time = data['time'][idx]
            event = data['event'][idx]
            codes = data['codes'][idx]
            month = data['month'][idx]
            diagt = data['diagt'][idx]

            if not hp.redundant_predictors:
                cols_list = load_obj(hp.data_pp_dir + 'cols_list.pkl')
                x = x[:, [cols_list.index(i) for i in hp.reduced_col_list]]

            sort_idx, case_idx, max_idx_control = sort_and_case_indices(x, time, event)
            x, time, event = x[sort_idx], time[sort_idx], event[sort_idx]
            codes, month, diagt = codes[sort_idx], month[sort_idx], diagt[sort_idx]

            print('Create data loaders and tensors...')
            case = utils.TensorDataset(torch.from_numpy(x[case_idx]),
                                       torch.from_numpy(time[case_idx]),
                                       torch.from_numpy(max_idx_control),
                                       torch.from_numpy(codes[case_idx]),
                                       torch.from_numpy(month[case_idx]),
                                       torch.from_numpy(diagt[case_idx]))
            x = torch.from_numpy(x)
            time = torch.from_numpy(time)
            event = torch.from_numpy(event)
            codes = torch.from_numpy(codes)
            month = torch.from_numpy(month)
            diagt = torch.from_numpy(diagt)

            for trial in range(hp.num_trials):
                print('Trial: {}'.format(trial))

                # Create batch queues
                trn_loader = utils.DataLoader(case, batch_size=hp.batch_size, shuffle=True, drop_last=True)

                print('Train...')
                # Neural net: a fresh model per trial, named by trial index and timestamp.
                hp.model_name = str(trial) + '_' + datetime.now().strftime('%Y%m%d_%H%M%S_%f') + '.pt'
                num_input = x.shape[1] + 1 if hp.nonprop_hazards else x.shape[1]
                net = NetRNNFinal(num_input, df_index_code.shape[0] + 1, hp).to(hp.device)  # +1 for zero padding
                criterion = CoxPHLoss().to(hp.device)
                optimizer = optim.Adam(net.parameters(), lr=hp.learning_rate)

                for epoch in range(hp.max_epochs):
                    trn(trn_loader, x, codes, month, diagt, net, criterion, optimizer, hp)

                if hp.redundant_predictors:
                    torch.save(net.state_dict(),
                               hp.log_dir + 'fold_' + str(fold) + '_' + str(swap) + '/' + hp.model_name)
                else:
                    torch.save(net.state_dict(),
                               hp.log_dir + 'fold_' + str(fold) + '_' + str(swap) + '_no_redundancies/' + hp.model_name)
    print('Done')
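# Sketch (assumption, not from the original scripts): restoring one of the
# saved ensemble members for evaluation. `load_member` and its arguments are
# hypothetical; NetRNNFinal and hp follow the training code above.
def load_member(hp, fold, swap, model_name, num_input, num_codes):
    net = NetRNNFinal(num_input, num_codes + 1, hp).to(hp.device)  # +1 for zero padding
    state = torch.load(hp.log_dir + 'fold_' + str(fold) + '_' + str(swap) + '/' + model_name,
                       map_location=hp.device)
    net.load_state_dict(state)
    net.eval()  # disable dropout etc. for inference
    return net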
def main():
    # Load data
    print('Load data...')
    hp = Hyperparameters()
    if hp.redundant_predictors:
        data = np.load(hp.results_dir + 'eval_vecs_' + hp.gender + '.npz')
    else:
        data = np.load(hp.results_dir + 'eval_vecs_' + hp.gender + '_no_redundancies.npz')

    # Evaluation arrays: one vector per metric and model (Cox vs CML), over
    # the CV folds. The keys follow the pattern '<metric>_vec_<model>'.
    metrics = ['r2', 'd_index', 'concordance', 'ibs', 'auc']
    labels = {'r2': 'R-squared(D)', 'd_index': 'D-index',
              'concordance': 'Concordance', 'ibs': 'IBS', 'auc': 'AUC'}
    model_labels = {'cox': 'Cox', 'cml': 'CML'}

    # Paired comparison of Cox vs CML on each metric.
    for metric in metrics:
        p = robust_cv_test(data[metric + '_vec_cox'], data[metric + '_vec_cml'])
        print('{} p-value: {:.3}'.format(labels[metric], p))

    # Mean and 95% CI for each model and metric.
    for model in ['cox', 'cml']:
        for metric in metrics:
            vec = np.reshape(data[metric + '_vec_' + model], -1)
            mean, (lci, uci) = vec.mean(), sms.DescrStatsW(vec).tconfint_mean()
            print('{} {} (95% CI): {:.3} ({:.3}, {:.3})'.format(
                labels[metric], model_labels[model], mean, lci, uci))
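# Sketch (assumption): quick inspection of an eval_vecs file before running
# the comparison above; the path is illustrative. Each metric vector is
# expected to hold one value per CV fold/repeat.
def check_eval_vecs(path):
    data = np.load(path)
    for key in data.files:
        print(key, data[key].shape)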
def main():
    hp = Hyperparameters()
    np.random.seed(hp.np_seed)

    for gender in ['males', 'females']:
        print('Processing ' + gender + '...')

        print('Loading VARIANZ data...')
        df = feather.read_dataframe(hp.data_pp_dir + 'Py_VARIANZ_2012_v3-1_pp_' + gender + '.feather')

        print('Loading medications...')
        ph = feather.read_dataframe(hp.data_pp_dir + 'PH_pp_' + gender + '.feather')
        ph.rename(columns={'chem_id': 'CODE', 'dispmonth_index': 'MONTH'}, inplace=True)
        ph['TYPE'] = 0

        print('Loading hospital events...')
        he = feather.read_dataframe(hp.data_pp_dir + 'HE_pp_' + gender + '.feather')
        he.rename(columns={'CLIN_CD_10': 'CODE', 'dispmonth_index': 'MONTH'}, inplace=True)
        he['TYPE'] = 1

        print('-----------------------------------------')
        # Numerical index for each person.
        df.reset_index(drop=True, inplace=True)
        df_index_person = df['VSIMPLE_INDEX_MASTER'].reset_index().rename(columns={'index': 'INDEX_PERSON'})

        # Convert categorical ethnicity into indicator variables.
        print('Create dummy variables...')
        df = pd.get_dummies(df, prefix='en_prtsd_eth', columns=['en_prtsd_eth'], drop_first=True)

        print('-----------------------------------------')
        print('Concatenating codes...')
        ac = pd.concat([ph, he], ignore_index=True, sort=False)
        ac['DIAG_TYPE'] = ac['DIAG_TYPE'].fillna(0).astype(int)  # medications have no diag type

        print('Get max number of codes per person...')
        ac['COUNT'] = ac.groupby(['VSIMPLE_INDEX_MASTER']).cumcount()
        max_count = ac['COUNT'].max() + 1
        print('max_count {}'.format(max_count))

        # Code index (add 1 to reserve 0 for padding).
        df_index_code = ac[['CODE', 'TYPE']].drop_duplicates().reset_index(drop=True)
        df_index_code['CODE'] = df_index_code['CODE'].astype(str)
        df_index_code['INDEX_CODE'] = df_index_code.index + 1

        # Codes, months, and diag-type arrays.
        codes = np.zeros((len(df_index_person), max_count), dtype=np.int16)  # uint16 not supported by torch
        month = np.zeros((len(df_index_person), max_count), dtype=np.uint8)
        diagt = np.zeros((len(df_index_person), max_count), dtype=np.uint8)

        print('Merging index_person...')
        ac = ac.merge(df_index_person, how='inner', on='VSIMPLE_INDEX_MASTER')
        print('Merging index_code...')
        ac['CODE'] = ac['CODE'].astype(str)
        ac = ac.merge(df_index_code, how='inner', on=['CODE', 'TYPE'])

        print('Updating arrays...')
        codes[ac['INDEX_PERSON'].values, ac['COUNT'].values] = ac['INDEX_CODE'].values
        month[ac['INDEX_PERSON'].values, ac['COUNT'].values] = ac['MONTH'].values
        diagt[ac['INDEX_PERSON'].values, ac['COUNT'].values] = ac['DIAG_TYPE'].values

        print('-----------------------------------------')
        # Data folds, stratified by event, for 5x2 CV.
        print('Exclude validation data...')
        # Done this way for historical reasons: 10% of the data (neither in
        # df_trn nor df_tst) is held out and keeps the fold label 99 below.
        df_trn, df_tst = train_test_split(df, test_size=0.1, train_size=0.8, shuffle=True, stratify=df['EVENT'])
        df_tmp = pd.concat([df_trn, df_tst])

        print('Split data into folds...')
        for i in range(hp.num_folds):
            df_trn, df_tst = train_test_split(df_tmp, test_size=0.5, train_size=0.5, shuffle=True, stratify=df_tmp['EVENT'])
            df['FOLD_' + str(i)] = 99
            df.loc[df_trn.index, 'FOLD_' + str(i)] = 0
            df.loc[df_tst.index, 'FOLD_' + str(i)] = 1

        # Other arrays.
        fold_cols = ['FOLD_' + str(i) for i in range(hp.num_folds)]
        time = df['TIME'].values
        event = df['EVENT'].values.astype(int)
        fold = df[fold_cols].values
        df.drop(['TIME', 'EVENT', 'VSIMPLE_INDEX_MASTER', 'gender_code'] + fold_cols, axis=1, inplace=True)
        x = df.values.astype('float32')

        print('-----------------------------------------')
        print('Save...')
        np.savez(hp.data_pp_dir + 'data_arrays_' + gender + '.npz',
                 x=x, time=time, event=event, codes=codes, month=month, diagt=diagt, fold=fold)
        df_index_person.to_feather(hp.data_pp_dir + 'df_index_person_' + gender + '.feather')
        df_index_code.to_feather(hp.data_pp_dir + 'df_index_code_' + gender + '.feather')
        save_obj(list(df.columns), hp.data_pp_dir + 'cols_list.pkl')
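# Sketch (assumption): sanity-checking that the arrays saved above line up
# row-for-row; `check_arrays` is a hypothetical helper, paths match np.savez.
def check_arrays(hp, gender):
    data = np.load(hp.data_pp_dir + 'data_arrays_' + gender + '.npz')
    n = data['x'].shape[0]
    # Every per-person array should have one row per person in x.
    assert all(data[k].shape[0] == n for k in ['time', 'event', 'codes', 'month', 'diagt', 'fold'])
    print(gender, 'rows:', n, 'max codes per person:', data['codes'].shape[1])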
def __init__(self, steps, gym, network_descriptions, curriculum_designer, hyperparameters=None):
    # Avoid a mutable default argument: build the default Hyperparameters
    # object per call instead of once at definition time.
    if hyperparameters is None:
        hyperparameters = Hyperparameters()
    ray.init(log_to_driver=hyperparameters.log_to_driver)
    # The number of remote workers should be orders of magnitude smaller than
    # min_num_runs_generated.
    self.num_remotes = 32
    self.min_num_runs_generated = 100
    self.gam = 0.999  # reward discount (gamma)
    self.lam = 0.97   # advantage-estimation lambda
    self.finish_runs_time = 1.
    self.hyperparameters = hyperparameters
    # Number of iterations of first gathering samples, then optimizing on them.
    self.steps = steps
    self.gym = gym
    self.network_descriptions = network_descriptions
    self.curriculum_designer = curriculum_designer
    self.actor_weights = []
    # Get initial actor and critic weights from a remote worker.
    a_w, c_w = ray.get(get_initial_weights.remote(self.network_descriptions))
    self.actor_weights.append(a_w)
    self.critic_weights = c_w
    # Track which training iteration this is in.
    self.iteration = 0
    # Logging object: one dict per iteration.
    self.logger = [dict()]
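# Sketch (assumption): how gam and lam above are typically combined for
# generalized advantage estimation on a finished run. `compute_gae`,
# `rewards`, and `values` are hypothetical; this is the standard GAE
# recurrence, not code taken from the original trainer.
import numpy as np

def compute_gae(rewards, values, gam=0.999, lam=0.97):
    # values has one extra entry: the bootstrap value of the final state.
    advantages = np.zeros(len(rewards), dtype=np.float32)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        # TD residual at step t, then exponentially weighted accumulation.
        delta = rewards[t] + gam * values[t + 1] - values[t]
        gae = delta + gam * lam * gae
        advantages[t] = gae
    return advantages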