Example #1
def tune_image_hyperparameters(
        data: ImageDataset, param_distributions: dict
) -> Tuple[List[float], List[dict], int, dict]:
    """
    Performs randomized hyperparameter search for the current hyperparameter
    specification. Evaluates the best model using the test set.
    """

    hyperparameters = Hyperparameters(param_distributions)
    print(f"Number of combinations: {len(hyperparameters.combinations)}")
    configurations = hyperparameters.sample_combinations(RANDOM_SAMPLES)
    configuration_count = len(configurations)
    print(f"Sampled combinations: {configuration_count}")

    results = []
    start_time = time.monotonic()
    for (index, configuration) in enumerate(configurations):
        tuning_io_utils.print_configuration(configuration, index,
                                            configuration_count, start_time)

        result = evaluate_hyperparameters(data, configuration)
        results.append(result)

    # Figure out the index of the configuration that produced the best score.
    scores = [result["val_accuracy"] for result in results]
    best_index = np.argmax(scores)

    # Retrain the best configuration using all the training data and measure
    # accuracy on the test data.
    best_history = train_and_evaluate(data, configurations[best_index])

    return (scores, configurations, best_index, best_history)
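
A minimal usage sketch for the function above; the parameter names and values are illustrative, and the ImageDataset loader is assumed to be constructed elsewhere in the surrounding project:

param_distributions = {
    'learning_rate': [0.1, 0.01, 0.001],
    'batch_size': [32, 64, 128],
    'optimizer': ['adam', 'sgd'],
}
# data = ImageDataset(...)  # assumed to come from the surrounding project
scores, configurations, best_index, best_history = tune_image_hyperparameters(
    data, param_distributions)
print('Best configuration:', configurations[best_index])
print('Best validation accuracy:', scores[best_index])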
Example #2
def main():
    pp = Hyperparameters()

    print('Load data...')
    data = np.load(pp.data_pp_dir + 'data_arrays_' + pp.gender + '.npz')
    df_index_code = feather.read_dataframe(pp.data_pp_dir + 'df_index_code_' +
                                           pp.gender + '.feather')
    df_code_cols = feather.read_dataframe(pp.data_pp_dir + 'df_code_cols_' +
                                          pp.gender + '.feather')
    cols_list = load_obj(pp.data_pp_dir + 'cols_list.pkl')

    df = pd.DataFrame(data['x'], columns=cols_list)
    df['TIME'] = data['time']
    df['EVENT'] = data['event']
    df = pd.concat([df, df_code_cols], axis=1)

    idx_trn = (data['fold'][:, 0] != 99)
    df_trn = df[idx_trn]
    idx_val = (data['fold'][:, 0] == 99)
    df_val = df[idx_val]

    print('Begin study...')
    study = optuna.create_study(sampler=optuna.samplers.TPESampler(),
                                direction='maximize')
    study.optimize(lambda trial: objective(trial, df_trn, df_val),
                   n_trials=100)

    print('Save...')
    save_obj(study, pp.log_dir + 'cel_study_' + pp.gender + '.pkl')
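
The `objective` optimized above is defined elsewhere; a minimal sketch of the usual Optuna objective pattern it presumably follows (the search space and the `fit_model`/`score_model` helpers are hypothetical):

def objective(trial, df_trn, df_val):
    # Hypothetical search space; the real objective is not part of this snippet.
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
    num_layers = trial.suggest_int('num_layers', 1, 3)
    model = fit_model(df_trn, learning_rate, num_layers)  # hypothetical helper
    return score_model(model, df_val)  # hypothetical helper; higher is better (direction='maximize')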
Example #3
def main():
    # Load data
    print('Load data...')
    hp = Hyperparameters()
    data = np.load('../' + hp.data_pp_dir + 'data_arrays_' + hp.gender + '.npz')
    
    print('Use all data for model fitting...')
    x = data['x']
    time = data['time']
    event = data['event']
    
    cols_list = load_obj('../' + hp.data_pp_dir + 'cols_list.pkl')
    
    df = pd.DataFrame(x, columns=cols_list)
    df['TIME'] = time
    df['EVENT'] = event

    ###################################################################
    
    print('Add additional columns...')
    df_index_code = feather.read_dataframe('../' + hp.results_dir + 'hr_addcodes_' + hp.gender + '.feather')
    df_index_code = pd.concat([df_index_code[df_index_code['TYPE']==1].head(10), df_index_code[df_index_code['TYPE']==0].head(10)], sort=False)
    
    for index, row in df_index_code.iterrows():
        print(row['DESCRIPTION'])
        df[row['DESCRIPTION']] = (data['codes'] == row['INDEX_CODE']).max(axis=1)
        cols_list = cols_list + [row['DESCRIPTION']]
    
    ###################################################################
    
    print('Fitting...')
    cph = CoxPHFitter()
    cph.fit(df, duration_col='TIME', event_col='EVENT', show_progress=True, step_size=0.5)
    cph.print_summary()
    print('done')
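
After fitting, the hazard ratios for the added code columns can be read from the lifelines summary; a small hedged sketch ('exp(coef)' is the standard lifelines summary column, and the DESCRIPTION values were added as columns in the loop above):

hr = cph.summary['exp(coef)']
print(hr.loc[df_index_code['DESCRIPTION'].values])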
Example #4
def main():
    hp = Hyperparameters()

    df = feather.read_dataframe(hp.data_dir +
                                'ALL_PHARMS_2008_2012_v3-1.feather')
    df['chem_id'] = df['chem_id'].astype(int)
    df['dispmonth_index'] = df['dispmonth_index'].astype(int)

    df.drop_duplicates(inplace=True)

    print('Remove future data...')
    df = df[df['dispmonth_index'] < 60]

    print('Split males and females...')
    males = feather.read_dataframe(
        hp.data_pp_dir +
        'Py_VARIANZ_2012_v3-1_pp_males.feather')['VSIMPLE_INDEX_MASTER']
    females = feather.read_dataframe(
        hp.data_pp_dir +
        'Py_VARIANZ_2012_v3-1_pp_females.feather')['VSIMPLE_INDEX_MASTER']
    df_males = df.merge(males, how='inner', on='VSIMPLE_INDEX_MASTER')
    df_females = df.merge(females, how='inner', on='VSIMPLE_INDEX_MASTER')

    print('Remove codes associated with less than min_count persons...')
    df_males = df_males[df_males.groupby('chem_id')['VSIMPLE_INDEX_MASTER'].
                        transform('nunique') >= hp.min_count]
    df_females = df_females[
        df_females.groupby('chem_id')['VSIMPLE_INDEX_MASTER'].transform(
            'nunique') >= hp.min_count]

    print('Code prevalence...')
    info_ph_males = df_males.groupby(['chem_id'])['VSIMPLE_INDEX_MASTER']
    info_ph_males = info_ph_males.agg(
        lambda x: x.nunique()).to_frame().reset_index()
    info_ph_males.rename(columns={'VSIMPLE_INDEX_MASTER': 'PREVALENCE'},
                         inplace=True)
    info_ph_females = df_females.groupby(['chem_id'])['VSIMPLE_INDEX_MASTER']
    info_ph_females = info_ph_females.agg(
        lambda x: x.nunique()).to_frame().reset_index()
    info_ph_females.rename(columns={'VSIMPLE_INDEX_MASTER': 'PREVALENCE'},
                           inplace=True)

    print('Save...')
    info_ph_males.to_feather(hp.data_pp_dir + 'info_ph_males.feather')
    info_ph_females.to_feather(hp.data_pp_dir + 'info_ph_females.feather')

    df_males.sort_values(
        by=['VSIMPLE_INDEX_MASTER', 'dispmonth_index', 'chem_id'],
        ascending=True,
        inplace=True)
    df_males.reset_index(drop=True, inplace=True)
    df_males.to_feather(hp.data_pp_dir + 'PH_pp_males.feather')

    df_females.sort_values(
        by=['VSIMPLE_INDEX_MASTER', 'dispmonth_index', 'chem_id'],
        ascending=True,
        inplace=True)
    df_females.reset_index(drop=True, inplace=True)
    df_females.to_feather(hp.data_pp_dir + 'PH_pp_females.feather')
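
A self-contained toy illustration of the `min_count` filter used above (the data values are made up):

import pandas as pd

toy = pd.DataFrame({'VSIMPLE_INDEX_MASTER': ['a', 'b', 'b'],
                    'chem_id': [1, 1, 2]})
min_count = 2
# Keep only codes dispensed to at least min_count distinct people.
kept = toy[toy.groupby('chem_id')['VSIMPLE_INDEX_MASTER']
              .transform('nunique') >= min_count]
print(kept)  # chem_id 1 is kept (2 distinct people); chem_id 2 is dropped (1 person)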
Example #5
def main():
    # Load data
    print('Load data...')
    hp = Hyperparameters()
    df_index_code = feather.read_dataframe(hp.data_pp_dir + 'df_index_code_' + hp.gender + '.feather')
    num_embeddings = df_index_code.shape[0]
    means = np.load(hp.data_pp_dir + 'means_' + hp.gender + '.npz')

    print('Add standard columns...')
    if hp.redundant_predictors:
        cols_list = load_obj(hp.data_pp_dir + 'cols_list.pkl')
    else:
        cols_list = hp.reduced_col_list
    num_cols = len(cols_list)

    #######################################################################################################
        
    print('Compute HRs...')

    # Trained models
    if hp.redundant_predictors:
        tmp = listdir(hp.log_dir + 'all/')
        models = ['all/' + i for i in tmp if '.pt' in i]    
    else:
        tmp = listdir(hp.log_dir + 'all_no_redundancies/')
        models = ['all_no_redundancies/' + i for i in tmp if '.pt' in i]    

    log_hr_matrix = np.zeros((len(range(30, 75)), len(models)))

    # Neural Net
    num_input = num_cols+1 if hp.nonprop_hazards else num_cols
    net = NetRNNFinal(num_input, num_embeddings+1, hp).to(hp.device) #+1 for zero padding
    net.eval()

    for i in range(len(models)):
        print('HRs for model {}'.format(i))
        
        # Restore variables from disk
        net.load_state_dict(torch.load(hp.log_dir + models[i], map_location=hp.device))

        # Compute risk for all ages
        for j in tqdm(range(30, 75)):
            with torch.no_grad():
                x_b = torch.zeros((1, num_cols), device=hp.device)
                codes_b = torch.zeros((1, 1), device=hp.device)
                month_b = torch.zeros((1, 1), device=hp.device)
                diagt_b = torch.zeros((1, 1), device=hp.device)
                x_b[0, cols_list.index('nhi_age')] = j - means['mean_age']
                log_hr = net(x_b, codes_b, month_b, diagt_b).detach().cpu().numpy().squeeze()
            
            # Store
            log_hr_matrix[j-30, i] = log_hr
    
    # Compute HRs
    # Average the log hazard ratios over the trained models
    mean_log_hr = log_hr_matrix.mean(axis=1)
    df = pd.DataFrame({'age': range(30, 75), 'HR': mean_log_hr})
    # Hazard ratio associated with a one-year increase in age
    df['diff_hr'] = np.exp(df['HR'].diff())
    print(df.describe())
Example #6
def classifier_hyperparameters():
    to_give_neurons = input("Do you want to set neurons in FC layer(y/n) ")

    while to_give_neurons != "y" and to_give_neurons != "n":
        print("")
        print("Invalid answer")
        to_give_neurons = input("Do you want to set neurons in FC layer(y/n) ")


    neurons = 64
    if to_give_neurons == "y":
        print("")
        neurons = int(input("Give number of neurons: "))
        while neurons <= 0:
            print("")
            print("Invalid answer")
            neurons = int(input("Give number of neurons: "))

    print("")
    to_give_dropout = input("Do you want to add a Dropout layer in Flatten layer(y/n) ")

    dropouts = []

    for i in range(2):
        while to_give_dropout != "y" and to_give_dropout != "n":
            print("")
            print("Invalid answer")
            if i == 0:
                to_give_dropout = input("Do you want to add a Dropout layer in Flatten layer(y/n) ")
            else:
                to_give_dropout = input("Do you want to add a Dropout layer in FC layer(y/n) ")

        if to_give_dropout == "n":
            dropouts.append(0.0)
        else:
            dropout_rate = float(input("Give dropout rate of Dropout's layer: "))

            while (dropout_rate <= 0.0 or dropout_rate >= 1.0):
                print("")
                print("Invalid answer(should be between 0.0 and 1.0")
                dropout_rate = float(input("Give dropout rate of Dropout's layer: "))
            dropouts.append(dropout_rate)

        if i == 0:
            print("")
            to_give_dropout = input("Do you want to add a Dropout layer in FC layer(y/n) ")


    print("")
    epochs = input_fns.input_epochs()
    print("")
    batch_size = input_fns.input_batch_size()
    print("")

    return Hyperparameters(0, 0, 0, dropouts,
                        0, 0, epochs, batch_size, neurons)
Example #7
def distillation(model_fname, dataset, n_classes, train_loader, test_loader,
                 n_epochs):
    listed_dir, f = os.path.split(model_fname)
    logging.info(f"Distilling model in {f}")
    model_to_distil, hparams = load_model_and_hyperparameters(
        f, listed_dir, n_classes)
    distillation_hparams = Hyperparameters(
        hparams.learning_rate, hparams.weight_decay, hparams.momentum,
        hparams.loss_function, hparams.gradient_method, hparams.model_name,
        hparams.scheduler)

    optimizer, scheduler = get_optimizer_and_scheduler(distillation_hparams,
                                                       model_to_distil,
                                                       n_epochs)
    # Recover the regularization function name and coefficient from the file
    # name by searching the reversed string for the "Regul_" marker.
    regul_function = f[-findnth_right(f[::-1], "_lugeR", 0
                                      ):-findnth_left(f[::-1], "_", 0)]
    regul_coefficient = f[-findnth_right(f[::-1], "_", 0):-4]
    fname_origin = RegularizationHyperparameters(
        hparams.learning_rate, hparams.weight_decay, hparams.momentum,
        hparams.loss_function, hparams.gradient_method, hparams.model_name,
        hparams.scheduler, regul_coefficient, regul_function).build_name()
    model_origin, hparams_origin = load_model_and_hyperparameters(
        fname_origin + ".run", f"./{dataset}/models/models_regularized",
        n_classes)
    tensorboard_logdir = distillation_hparams.get_tensorboard_name()
    writer = SummaryWriter(os.path.join(CN.TBOARD, tensorboard_logdir))
    results = train_model_distillation_hinton(model_to_distil, model_origin,
                                              CN.DEVICE, hparams.loss_function,
                                              n_epochs, train_loader,
                                              test_loader, scheduler,
                                              optimizer, writer)
    fname = "Distilled_" + f[:-4]
    model_dir = f"./{dataset}/models/models_distilled/"
    results_dir = f"./{dataset}/results/"
    fname_model = fname + ".run"
    fname_results = fname + ".csv"
    logging.info("Saving model distilled" + fname)
    distilled_run = ModelRun(model_to_distil.state_dict(),
                             distillation_hparams)
    torch.save(distilled_run, model_dir + fname_model)
    results.to_csv(results_dir + fname_results)
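
For reference, a minimal sketch of the Hinton soft-target distillation loss that `train_model_distillation_hinton` presumably implements (the temperature and weighting values are illustrative; the actual training routine is defined elsewhere):

import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, labels, T=4.0, alpha=0.9):
    # Soft-target term: KL divergence between temperature-softened distributions,
    # scaled by T^2 as in Hinton et al.
    kd = F.kl_div(F.log_softmax(student_logits / T, dim=1),
                  F.softmax(teacher_logits / T, dim=1),
                  reduction='batchmean') * (T * T)
    # Hard-target term: ordinary cross-entropy against the true labels.
    ce = F.cross_entropy(student_logits, labels)
    return alpha * kd + (1.0 - alpha) * ce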
Example #8
def main():
    hp = Hyperparameters()

    for gender in ['females', 'males']:
        print(gender)
        data = np.load(hp.data_pp_dir + 'data_arrays_' + gender + '.npz')
        df = feather.read_dataframe(hp.data_pp_dir + 'df_index_person_' +
                                    gender + '.feather')
        df_geo = feather.read_dataframe(hp.data_dir +
                                        'Py_VARIANZ_2012_v3-1_GEO.feather')[[
                                            'VSIMPLE_INDEX_MASTER',
                                            'MB2020_code'
                                        ]]
        df_mb_sa2 = read_ods(hp.data_dir + 'MB_SA2.ods',
                             1).rename(columns={
                                 'MB2020_V1_': 'MB2020_code'
                             }).astype(int)
        df_geo = df_geo.merge(df_mb_sa2, how='left',
                              on='MB2020_code').drop(['MB2020_code'], axis=1)
        df = df.merge(df_geo, how='left', on='VSIMPLE_INDEX_MASTER')

        # load predicted risk
        df['RISK_PERC'] = feather.read_dataframe(hp.results_dir + 'df_cml_' +
                                                 gender +
                                                 '.feather')['RISK_PERC']

        # median risk
        print('Median risk: {:.3} IQR: [{:.3}, {:.3}]'.format(
            np.percentile(df['RISK_PERC'].values, 50),
            np.percentile(df['RISK_PERC'].values, 25),
            np.percentile(df['RISK_PERC'].values, 75)))

        # set SA2s with less than 5 people to NaN
        df.loc[df.groupby('SA22020_V1')['VSIMPLE_INDEX_MASTER'].
               transform('nunique') < 5, 'RISK_PERC'] = np.nan

        # get median risk by SA2
        df = df.groupby('SA22020_V1')['RISK_PERC'].median().reset_index()

        # save
        df.to_csv(hp.results_dir + 'df_sa2_' + gender + '.csv')
        if gender == 'females':
            df_females = df
        else:
            df_males = df

    df = df_females.merge(df_males, on='SA22020_V1', how='inner').dropna()
    corr_coeff, lcl, ucl = corr(df['RISK_PERC_x'].values,
                                df['RISK_PERC_y'].values)
    print("Pearson's correlation: {:.3} [{:.3}, {:.3}]".format(
        corr_coeff, lcl, ucl))
Example #9
def _get_ff_hyperparameters() -> Hyperparameters:
    """Returns hyperparameters used to tune the feed-forward network.
    """
    # First pass:
    hyperparameter_values = Hyperparameters({
        'learning_rate': [0.1, 0.01, 0.001],
        'batch_size': [32, 64, 128],
        'optimizer': ['adam', 'sgd']
    })
    # Best:
    # optimizer: sgd, batch size: 64, learning rate: 0.1

    # Second pass:
    hyperparameter_values = Hyperparameters({
        'learning_rate': [0.05, 0.1, 0.2],
        'batch_size': [16, 32, 64],
        'optimizer': ['sgd']
    })

    # Best:
    # optimizer: sgd, batch size: 16, learning rate: 0.1

    return hyperparameter_values
Example #10
def _get_cnn_hyperparameters() -> Hyperparameters:
    """Returns hyperparameters used to tune the network.
    """
    # Spectrograms

    # First pass:
    # hyperparameter_values = Hyperparameters({
    #     'learning_rate': [0.1, 0.01, 0.001],
    #     'batch_size': [32, 64, 128],
    #     'optimizer': ['adam', 'sgd']
    #     })
    # Results:
    # optimizer: adam, batch size: 64, learning rate: 0.001
    # Adam with learning rate 0.001 seems to work best, regardless of batch size.

    # Second pass:
    # hyperparameter_values = Hyperparameters({
    #     'learning_rate': [0.001],
    #     'batch_size': [8, 16, 32, 64, 256],
    #     'optimizer': ['adam']
    #     })

    # Best:
    # optimizer: adam, batch size: 64, learning rate: 0.001

    # Scaleograms

    # First pass:
    # hyperparameter_values = Hyperparameters({
    #     'learning_rate': [0.1, 0.01, 0.001],
    #     'batch_size': [32, 64, 128],
    #     'optimizer': ['adam', 'sgd']
    #     })
    # Results:
    # optimizer: adam, batch size: 32, learning rate: 0.001
    # Adam with learning rate 0.001 seems to work best, regardless of batch size.

    # Second pass:
    hyperparameter_values = Hyperparameters({
        'learning_rate': [0.001],
        'batch_size': [8, 16, 32, 256],
        'optimizer': ['adam']
    })

    # Best:
    # optimizer: adam, batch size: 32, learning rate: 0.001

    return hyperparameter_values
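
For context, a minimal sketch of the grid-style Hyperparameters object these tuning helpers assume, exposing the `combinations` and `sample_combinations` members used in Example #1 (the real class may differ; note that other examples in this listing use a separate configuration-object `Hyperparameters` with attributes such as `data_pp_dir`):

import itertools
import random

class Hyperparameters:
    def __init__(self, param_distributions):
        # One dict per point of the Cartesian product of all value lists.
        keys = list(param_distributions.keys())
        self.combinations = [
            dict(zip(keys, values))
            for values in itertools.product(*param_distributions.values())
        ]

    def sample_combinations(self, n):
        # Sample without replacement, capped at the number of combinations.
        return random.sample(self.combinations, min(n, len(self.combinations)))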
Example #11
def main():
    pp = Hyperparameters()

    print('Load data...')
    data = np.load(pp.data_pp_dir + 'data_arrays_' + pp.gender + '.npz')
    df_index_code = feather.read_dataframe(pp.data_pp_dir + 'df_index_code_' +
                                           pp.gender + '.feather')

    print('Begin study...')
    #study = optuna.create_study(sampler=optuna.samplers.TPESampler(), pruner=optuna.pruners.SuccessiveHalvingPruner())
    study = optuna.create_study(sampler=optuna.samplers.GridSampler(
        {'summarize': ['output_attention']}),
                                pruner=optuna.pruners.NopPruner())
    study.optimize(lambda trial: objective(trial, data, df_index_code),
                   n_trials=1)

    print('Save...')
    save_obj(study, pp.log_dir + 'study_' + pp.gender + '.pkl')
Example #12
def main():
    pp = Hyperparameters()

    print('Load data...')
    data = np.load(pp.data_pp_dir + 'data_arrays_' + pp.gender + '.npz')
    df_index_code = feather.read_dataframe(pp.data_pp_dir + 'df_index_code_' +
                                           pp.gender + '.feather')

    codes = data['codes']
    code_cols = np.zeros((codes.shape[0], df_index_code.shape[0]), dtype=bool)

    print('Codes to columns...')
    for i in tqdm(range(df_index_code.shape[0])):
        code_cols[:, i] = np.bitwise_or.reduce(codes == (i + 1), 1)
    df_code_cols = pd.DataFrame(
        code_cols, columns=[str(i + 1) for i in range(df_index_code.shape[0])])

    print('Save...')
    df_code_cols.to_feather(pp.data_pp_dir + 'df_code_cols_' + pp.gender +
                            '.feather')
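
A self-contained toy version of the codes-to-columns conversion above (the arrays are made up):

import numpy as np
import pandas as pd

# 3 people, up to 4 codes each; 0 is padding, valid codes are 1..3.
codes = np.array([[1, 3, 0, 0],
                  [2, 2, 0, 0],
                  [0, 0, 0, 0]])
num_codes = 3
code_cols = np.zeros((codes.shape[0], num_codes), dtype=bool)
for i in range(num_codes):
    # True if code i+1 appears anywhere in the person's row.
    code_cols[:, i] = np.bitwise_or.reduce(codes == (i + 1), 1)
print(pd.DataFrame(code_cols, columns=[str(i + 1) for i in range(num_codes)]))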
Example #13
def main():
    create_directory("saved_runs")

    device = torch.device(args.device)
    hy = Hyperparameters()
    writer = SummaryWriter() if args.tensor_log else None
    sentences = get_sentences(args.train_file)
    test_sentences = get_sentences(args.test_file)

    vocab, reverse_vocab = create_vocab()
    print("Loaded vocab of size {}".format(len(vocab)))

    test_perplexities = []

    for epoch in range(hy.num_epochs):
        model, train_perplexity = train(sentences, vocab, reverse_vocab, hy,
                                        writer, device)
        test_perplexity = evaluate(model, test_sentences, vocab, reverse_vocab,
                                   hy, writer, device)
        test_perplexities.append(test_perplexity)

    print("=" * 80)
    print("Final Test Perplexity = {:.2f}".format(min(test_perplexities)))
    print("=" * 80)
Example #14
def main():
    # Load data
    print('Load data...')
    hp = Hyperparameters()
    data = np.load(hp.data_pp_dir + 'data_arrays_' + hp.gender + '.npz')
    means = np.load(hp.data_pp_dir + 'means_' + hp.gender + '.npz')
    x = data['x']
    time = data['time']
    event = data['event']
    cols_list = load_obj(hp.data_pp_dir + 'cols_list.pkl')

    # restore original age and en_nzdep_q before centering
    x[:, cols_list.index('nhi_age')] += means['mean_age']
    x[:, cols_list.index('en_nzdep_q')] += means['mean_nzdep']

    df_cox = pd.DataFrame(x, columns=cols_list)
    df_cox['TIME'] = time
    df_cox['EVENT'] = event

    df_cml = pd.DataFrame(x, columns=cols_list)
    df_cml['TIME'] = time
    df_cml['EVENT'] = event

    # load predicted risk
    lph_matrix_cox = np.zeros((df_cox.shape[0], hp.num_folds))
    lph_matrix_cml = np.zeros((df_cml.shape[0], hp.num_folds))
    for fold in range(hp.num_folds):
        for swap in range(2):
            print('Fold: {} Swap: {}'.format(fold, swap))
            idx = (data['fold'][:, fold] == swap)
            fname_cox = (hp.results_dir + 'df_cox_' + hp.gender + '_fold_' +
                         str(fold) + '_' + str(swap) + '.feather')
            fname_cml = (hp.results_dir + 'df_cml_' + hp.gender + '_fold_' +
                         str(fold) + '_' + str(swap) + '.feather')
            lph_matrix_cox[idx, fold] = feather.read_dataframe(fname_cox)['LPH']
            lph_matrix_cml[idx, fold] = feather.read_dataframe(fname_cml)['LPH']
    df_cox['LPH'] = lph_matrix_cox.mean(axis=1)
    df_cml['LPH'] = lph_matrix_cml.mean(axis=1)

    # remove validation data (fold column value 99 marks the validation subset)
    idx = (data['fold'][:, fold] != 99)
    df_cox = df_cox[idx].reset_index(drop=True)
    df_cml = df_cml[idx].reset_index(drop=True)
    es_cox = EvalSurv(df_cox.copy())
    es_cml = EvalSurv(df_cml.copy())

    df_cox['RISK_PERC'] = es_cox.get_risk_perc(1826)
    df_cml['RISK_PERC'] = es_cml.get_risk_perc(1826)

    ################################################################################################

    print('Plot all...')
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 7))

    ax_plt = ax[0]
    calibration_plot(df_cox, df_cml, ax_plt)
    ax_plt.title.set_text(
        'Calibration: Men') if hp.gender == 'males' else ax_plt.title.set_text(
            'Calibration: Women')

    ax_plt = ax[1]
    discrimination_plot(df_cox, df_cml, ax_plt)
    ax_plt.title.set_text('Discrimination: Men'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: Women')

    plt.tight_layout()
    plt.subplots_adjust(wspace=0.3, hspace=0.3)
    fig.savefig(hp.plots_dir + hp.gender + '_all.png')
    plt.close()

    ################################################################################################

    print('Plot by age...')
    fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(16, 21))

    #30-44
    condition = (df_cox['nhi_age'] >= 30) & (df_cox['nhi_age'] < 45)
    print('Num people: ', sum(condition))
    df_cox_red = df_cox.loc[condition].copy()
    df_cml_red = df_cml.loc[condition].copy()

    ax_plt = ax[0][0]
    calibration_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Calibration: Men 30-44 years'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Calibration: Women 30-44 years')

    ax_plt = ax[0][1]
    discrimination_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Discrimination: Men 30-44 years'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: Women 30-44 years')

    #45-59
    condition = (df_cox['nhi_age'] >= 45) & (df_cox['nhi_age'] < 60)
    print('Num people: ', sum(condition))
    df_cox_red = df_cox.loc[condition].copy()
    df_cml_red = df_cml.loc[condition].copy()

    ax_plt = ax[1][0]
    calibration_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Calibration: Men 45-59 years'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Calibration: Women 45-59 years')

    ax_plt = ax[1][1]
    discrimination_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Discrimination: Men 45-59 years'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: Women 45-59 years')

    #60-74
    condition = (df_cox['nhi_age'] >= 60) & (df_cox['nhi_age'] < 75)
    print('Num people: ', sum(condition))
    df_cox_red = df_cox.loc[condition].copy()
    df_cml_red = df_cml.loc[condition].copy()

    ax_plt = ax[2][0]
    calibration_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Calibration: Men 60-74 years'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Calibration: Women 60-74 years')

    ax_plt = ax[2][1]
    discrimination_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Discrimination: Men 60-74 years'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: Women 60-74 years')

    plt.tight_layout()
    plt.subplots_adjust(wspace=0.3, hspace=0.3)
    fig.savefig(hp.plots_dir + hp.gender + '_age.png')
    plt.close()

    ################################################################################################

    print('Plot by ethnicity...')
    fig_cal, ax_cal = plt.subplots(nrows=3, ncols=2, figsize=(16, 21))
    fig_dis, ax_dis = plt.subplots(nrows=3, ncols=2, figsize=(16, 21))

    #Maori
    condition = df_cox['en_prtsd_eth_2'].astype(bool)
    print('Num people: ', sum(condition))
    df_cox_red = df_cox.loc[condition].copy()
    df_cml_red = df_cml.loc[condition].copy()

    ax_plt = ax_cal[0][0]
    calibration_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Calibration: Maori Men'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Calibration: Maori Women')

    ax_plt = ax_dis[0][0]
    discrimination_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Discrimination: Maori Men'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: Maori Women')

    #Pacific
    condition = df_cox['en_prtsd_eth_3'].astype(bool)
    print('Num people: ', sum(condition))
    df_cox_red = df_cox.loc[condition].copy()
    df_cml_red = df_cml.loc[condition].copy()

    ax_plt = ax_cal[0][1]
    calibration_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Calibration: Pacific Men'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Calibration: Pacific Women')

    ax_plt = ax_dis[0][1]
    discrimination_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Discrimination: Pacific Men'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: Pacific Women')

    #Indian
    condition = df_cox['en_prtsd_eth_43'].astype(bool)
    print('Num people: ', sum(condition))
    df_cox_red = df_cox.loc[condition].copy()
    df_cml_red = df_cml.loc[condition].copy()

    ax_plt = ax_cal[1][0]
    calibration_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Calibration: Indian Men'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Calibration: Indian Women')

    ax_plt = ax_dis[1][0]
    discrimination_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Discrimination: Indian Men'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: Indian Women')

    #Other
    condition = df_cox['en_prtsd_eth_9'].astype(bool)
    print('Num people: ', sum(condition))
    df_cox_red = df_cox.loc[condition].copy()
    df_cml_red = df_cml.loc[condition].copy()

    ax_plt = ax_cal[1][1]
    calibration_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Calibration: Men of Other Ethnicity'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Calibration: Women of Other Ethnicity')

    ax_plt = ax_dis[1][1]
    discrimination_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Discrimination: Men of Other Ethnicity'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: Women of Other Ethnicity')

    #NZ European
    condition = (~df_cox['en_prtsd_eth_2'].astype(bool)) & (
        ~df_cox['en_prtsd_eth_3'].astype(bool)) & (
            ~df_cox['en_prtsd_eth_43'].astype(bool)) & (
                ~df_cox['en_prtsd_eth_9'].astype(bool))
    print('Num people: ', sum(condition))
    df_cox_red = df_cox.loc[condition].copy()
    df_cml_red = df_cml.loc[condition].copy()

    ax_plt = ax_cal[2][0]
    calibration_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Calibration: European Men'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Calibration: European Women')

    ax_plt = ax_dis[2][0]
    discrimination_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Discrimination: European Men'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: European Women')

    ax_cal[2, 1].axis('off')
    ax_dis[2, 1].axis('off')
    fig_cal.tight_layout()
    fig_cal.subplots_adjust(wspace=0.3, hspace=0.3)
    fig_cal.savefig(hp.plots_dir + hp.gender + '_ethnicity_calibration.png')
    fig_dis.tight_layout()
    fig_dis.subplots_adjust(wspace=0.3, hspace=0.3)
    fig_dis.savefig(hp.plots_dir + hp.gender + '_ethnicity_discrimination.png')
    plt.close()

    ################################################################################################

    print('Plot by deprivation...')
    fig_cal, ax_cal = plt.subplots(nrows=3, ncols=2, figsize=(16, 21))
    fig_dis, ax_dis = plt.subplots(nrows=3, ncols=2, figsize=(16, 21))

    #1
    condition = (df_cox['en_nzdep_q'].round().astype(int) == 1)
    print('Num people: ', sum(condition))
    df_cox_red = df_cox.loc[condition].copy()
    df_cml_red = df_cml.loc[condition].copy()

    ax_plt = ax_cal[0][0]
    calibration_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Calibration: Men Deprivation Q1'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Calibration: Women Deprivation Q1')

    ax_plt = ax_dis[0][0]
    discrimination_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Discrimination: Men Deprivation Q1'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: Women Deprivation Q1')

    #2
    condition = (df_cox['en_nzdep_q'].round().astype(int) == 2)
    print('Num people: ', sum(condition))
    df_cox_red = df_cox.loc[condition].copy()
    df_cml_red = df_cml.loc[condition].copy()

    ax_plt = ax_cal[0][1]
    calibration_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Calibration: Men Deprivation Q2'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Calibration: Women Deprivation Q2')

    ax_plt = ax_dis[0][1]
    discrimination_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Discrimination: Men Deprivation Q2'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: Women Deprivation Q2')

    #3
    condition = (df_cox['en_nzdep_q'].round().astype(int) == 3)
    print('Num people: ', sum(condition))
    df_cox_red = df_cox.loc[condition].copy()
    df_cml_red = df_cml.loc[condition].copy()

    ax_plt = ax_cal[1][0]
    calibration_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Calibration: Men Deprivation Q3'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Calibration: Women Deprivation Q3')

    ax_plt = ax_dis[1][0]
    discrimination_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Discrimination: Men Deprivation Q3'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: Women Deprivation Q3')

    #4
    condition = (df_cox['en_nzdep_q'].round().astype(int) == 4)
    print('Num people: ', sum(condition))
    df_cox_red = df_cox.loc[condition].copy()
    df_cml_red = df_cml.loc[condition].copy()

    ax_plt = ax_cal[1][1]
    calibration_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Calibration: Men Deprivation Q4'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Calibration: Women Deprivation Q4')

    ax_plt = ax_dis[1][1]
    discrimination_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Discrimination: Men Deprivation Q4'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: Women Deprivation Q4')

    #5
    condition = (df_cox['en_nzdep_q'].round().astype(int) == 5)
    print('Num people: ', sum(condition))
    df_cox_red = df_cox.loc[condition].copy()
    df_cml_red = df_cml.loc[condition].copy()

    ax_plt = ax_cal[2][0]
    calibration_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Calibration: Men Deprivation Q5'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Calibration: Women Deprivation Q5')

    ax_plt = ax_dis[2][0]
    discrimination_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Discrimination: Men Deprivation Q5'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: Women Deprivation Q5')

    ax_cal[2, 1].axis('off')
    ax_dis[2, 1].axis('off')
    fig_cal.tight_layout()
    fig_cal.subplots_adjust(wspace=0.3, hspace=0.3)
    fig_cal.savefig(hp.plots_dir + hp.gender + '_deprivation_calibration.png')
    fig_dis.tight_layout()
    fig_dis.subplots_adjust(wspace=0.3, hspace=0.3)
    fig_dis.savefig(hp.plots_dir + hp.gender +
                    '_deprivation_discrimination.png')
    plt.close()

    ################################################################################################

    print('Plot by medication...')
    fig_cal, ax_cal = plt.subplots(nrows=3, ncols=2, figsize=(16, 21))
    fig_dis, ax_dis = plt.subplots(nrows=3, ncols=2, figsize=(16, 21))

    #BPL
    condition = df_cox['ph_bp_lowering_prior_6mths'].astype(bool)
    print('Num people: ', sum(condition))
    df_cox_red = df_cox.loc[condition].copy()
    df_cml_red = df_cml.loc[condition].copy()

    ax_plt = ax_cal[0][0]
    calibration_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Calibration: Men with BPL Meds'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Calibration: Women with BPL Meds')

    ax_plt = ax_dis[0][0]
    discrimination_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Discrimination: Men with BPL Meds'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: Women with BPL Meds')

    #No BPL
    condition = ~df_cox['ph_bp_lowering_prior_6mths'].astype(bool)
    print('Num people: ', sum(condition))
    df_cox_red = df_cox.loc[condition].copy()
    df_cml_red = df_cml.loc[condition].copy()

    ax_plt = ax_cal[0][1]
    calibration_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Calibration: Men without BPL Meds'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Calibration: Women without BPL Meds')

    ax_plt = ax_dis[0][1]
    discrimination_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Discrimination: Men without BPL Meds'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: Women without BPL Meds')

    #LL
    condition = df_cox['ph_lipid_lowering_prior_6mths'].astype(bool)
    print('Num people: ', sum(condition))
    df_cox_red = df_cox.loc[condition].copy()
    df_cml_red = df_cml.loc[condition].copy()

    ax_plt = ax_cal[1][0]
    calibration_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Calibration: Men with LL Meds'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Calibration: Women with LL Meds')

    ax_plt = ax_dis[1][0]
    discrimination_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Discrimination: Men with LL Meds'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: Women with LL Meds')

    #No LL
    condition = ~df_cox['ph_lipid_lowering_prior_6mths'].astype(bool)
    print('Num people: ', sum(condition))
    df_cox_red = df_cox.loc[condition].copy()
    df_cml_red = df_cml.loc[condition].copy()

    ax_plt = ax_cal[1][1]
    calibration_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Calibration: Men without LL Meds'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Calibration: Women without LL Meds')

    ax_plt = ax_dis[1][1]
    discrimination_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Discrimination: Men without LL Meds'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: Women without LL Meds')

    #APL/AC
    condition = df_cox['ph_antiplat_anticoag_prior_6mths'].astype(bool)
    print('Num people: ', sum(condition))
    df_cox_red = df_cox.loc[condition].copy()
    df_cml_red = df_cml.loc[condition].copy()

    ax_plt = ax_cal[2][0]
    calibration_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Calibration: Men with APL/AC Meds'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Calibration: Women with APL/AC Meds')

    ax_plt = ax_dis[2][0]
    discrimination_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Discrimination: Men with APL/AC Meds'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: Women with APL/AC Meds')

    #No APL/AC
    condition = ~df_cox['ph_antiplat_anticoag_prior_6mths'].astype(bool)
    print('Num people: ', sum(condition))
    df_cox_red = df_cox.loc[condition].copy()
    df_cml_red = df_cml.loc[condition].copy()

    ax_plt = ax_cal[2][1]
    calibration_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Calibration: Men without APL/AC Meds'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Calibration: Women without APL/AC Meds')

    ax_plt = ax_dis[2][1]
    discrimination_plot(df_cox_red, df_cml_red, ax_plt)
    ax_plt.title.set_text('Discrimination: Men without APL/AC Meds'
                          ) if hp.gender == 'males' else ax_plt.title.set_text(
                              'Discrimination: Women without APL/AC Meds')

    fig_cal.tight_layout()
    fig_cal.subplots_adjust(wspace=0.3, hspace=0.3)
    fig_cal.savefig(hp.plots_dir + hp.gender + '_medication_calibration.png')
    fig_dis.tight_layout()
    fig_dis.subplots_adjust(wspace=0.3, hspace=0.3)
    fig_dis.savefig(hp.plots_dir + hp.gender +
                    '_medication_discrimination.png')
    plt.close()
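
The subgroup plots in this example repeat one pattern; purely as a sketch (assuming the `calibration_plot` and `discrimination_plot` helpers and the column names from above), the per-subgroup blocks could be driven by a loop like this:

import matplotlib.pyplot as plt

def plot_subgroups(df_cox, df_cml, subgroups, gender,
                   calibration_plot, discrimination_plot):
    """Sketch: draw calibration/discrimination panels for a list of
    (label, boolean condition) subgroups instead of repeating each block."""
    fig_cal, ax_cal = plt.subplots(nrows=3, ncols=2, figsize=(16, 21))
    fig_dis, ax_dis = plt.subplots(nrows=3, ncols=2, figsize=(16, 21))
    prefix = 'Men' if gender == 'males' else 'Women'
    for (label, condition), ax_c, ax_d in zip(subgroups, ax_cal.flat, ax_dis.flat):
        df_cox_red = df_cox.loc[condition].copy()
        df_cml_red = df_cml.loc[condition].copy()
        calibration_plot(df_cox_red, df_cml_red, ax_c)
        discrimination_plot(df_cox_red, df_cml_red, ax_d)
        ax_c.title.set_text('Calibration: {} {}'.format(prefix, label))
        ax_d.title.set_text('Discrimination: {} {}'.format(prefix, label))
    # Hide any unused panels when there are fewer subgroups than axes.
    for ax in list(ax_cal.flat)[len(subgroups):] + list(ax_dis.flat)[len(subgroups):]:
        ax.axis('off')
    return fig_cal, fig_dis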
Example #15
def main():
    # Load data
    print('Load data...')
    hp = Hyperparameters()
    data = np.load(hp.data_pp_dir + 'data_arrays_' + hp.gender + '.npz')
    time = data['time']
    event = data['event']
    
    df = pd.DataFrame({'TIME': data['time'], 'EVENT': data['event']})

    #baseline survival CML
    # df_cml = df.copy()
    # lph_matrix = np.zeros((df_cml.shape[0], hp.num_folds))
    # for fold in range(hp.num_folds):
        # for swap in range(2):
            # print('Fold: {} Swap: {}'.format(fold, swap))
            # idx = (data['fold'][:, fold] == swap)
            # if hp.redundant_predictors:
                # lph_matrix[idx, fold] = feather.read_dataframe(hp.results_dir + 'df_cml_' + hp.gender + '_fold_' + str(fold) + '_' + str(swap) + '.feather')['LPH']
            # else:
                # lph_matrix[idx, fold] = feather.read_dataframe(hp.results_dir + 'df_cml_' + hp.gender + '_fold_' + str(fold) + '_' + str(swap) + '_no_redundancies.feather')['LPH']
    # df_cml['LPH'] = lph_matrix.mean(axis=1)
    # idx = (data['fold'][:, fold] != 99) #exclude validation fold
    # df_cml = df_cml[idx].reset_index(drop=True)
    # es_cml = EvalSurv(df_cml.copy())
    # print('Base survival CML: {:.13}'.format(es_cml.get_base_surv(1826)))
    # return    

    # evaluation vectors
    d_index_vec_cox = np.zeros((hp.num_folds, 2))
    r2_vec_cox = np.zeros((hp.num_folds, 2))
    concordance_vec_cox = np.zeros((hp.num_folds, 2))
    ibs_vec_cox = np.zeros((hp.num_folds, 2))
    auc_vec_cox = np.zeros((hp.num_folds, 2))

    d_index_vec_cml = np.zeros((hp.num_folds, 2))
    r2_vec_cml = np.zeros((hp.num_folds, 2))
    concordance_vec_cml = np.zeros((hp.num_folds, 2))
    ibs_vec_cml = np.zeros((hp.num_folds, 2))
    auc_vec_cml = np.zeros((hp.num_folds, 2))
    
    print('Evaluate on each fold...')
    for fold in range(hp.num_folds):
        for swap in range(2):
            print('Fold: {} Swap: {}'.format(fold, swap))
            
            idx = (data['fold'][:, fold] == swap)
            df_fold = df[idx].reset_index(drop=True)
    
            df_cox = df_fold.copy()
            df_cml = df_fold.copy()

            # load log partial hazards
            df_cox['LPH'] = feather.read_dataframe(hp.results_dir + 'df_cox_' + hp.gender + '_fold_' + str(fold) + '_' + str(swap) + '.feather')['LPH']
            if hp.redundant_predictors:
                df_cml['LPH'] = feather.read_dataframe(hp.results_dir + 'df_cml_' + hp.gender + '_fold_' + str(fold) + '_' + str(swap) + '.feather')['LPH']
            else:
                df_cml['LPH'] = feather.read_dataframe(hp.results_dir + 'df_cml_' + hp.gender + '_fold_' + str(fold) + '_' + str(swap) + '_no_redundancies.feather')['LPH']
    
            ################################################################################################
                                        
            es_cox = EvalSurv(df_cox.copy())
            es_cml = EvalSurv(df_cml.copy())

            r2_vec_cox[fold, swap] = es_cox.R_squared_D()
            d_index_vec_cox[fold, swap], _ = es_cox.D_index()
            concordance_vec_cox[fold, swap] = es_cox.concordance_index()
            ibs_vec_cox[fold, swap] = es_cox.integrated_brier_score()
            auc_vec_cox[fold, swap] = es_cox.auc(1826)
            
            r2_vec_cml[fold, swap] = es_cml.R_squared_D()
            d_index_vec_cml[fold, swap], _ = es_cml.D_index()
            concordance_vec_cml[fold, swap] = es_cml.concordance_index()
            ibs_vec_cml[fold, swap] = es_cml.integrated_brier_score()
            auc_vec_cml[fold, swap] = es_cml.auc(1826)

    print('Save...')
    if hp.redundant_predictors:
        np.savez(hp.results_dir + 'eval_vecs_' + hp.gender + '.npz', 
                 r2_vec_cox=r2_vec_cox, d_index_vec_cox=d_index_vec_cox, concordance_vec_cox=concordance_vec_cox, ibs_vec_cox=ibs_vec_cox, auc_vec_cox=auc_vec_cox, 
                 r2_vec_cml=r2_vec_cml, d_index_vec_cml=d_index_vec_cml, concordance_vec_cml=concordance_vec_cml, ibs_vec_cml=ibs_vec_cml, auc_vec_cml=auc_vec_cml)
    else:
        np.savez(hp.results_dir + 'eval_vecs_' + hp.gender + '_no_redundancies.npz', 
                 r2_vec_cox=r2_vec_cox, d_index_vec_cox=d_index_vec_cox, concordance_vec_cox=concordance_vec_cox, ibs_vec_cox=ibs_vec_cox, auc_vec_cox=auc_vec_cox, 
                 r2_vec_cml=r2_vec_cml, d_index_vec_cml=d_index_vec_cml, concordance_vec_cml=concordance_vec_cml, ibs_vec_cml=ibs_vec_cml, auc_vec_cml=auc_vec_cml)
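
A hedged sketch of reading the saved evaluation vectors back and summarising them across folds and swaps (the keys match the savez call above; `hp` is the same Hyperparameters configuration):

import numpy as np

vecs = np.load(hp.results_dir + 'eval_vecs_' + hp.gender + '.npz')
for key in ['concordance_vec_cox', 'concordance_vec_cml', 'auc_vec_cox', 'auc_vec_cml']:
    values = vecs[key].ravel()
    print('{}: mean {:.4f}, std {:.4f}'.format(key, values.mean(), values.std()))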
Example #16
def main():
    # Load data
    print('Load data...')
    hp = Hyperparameters()
    df_index_code = feather.read_dataframe(hp.data_pp_dir + 'df_index_code_' +
                                           hp.gender + '.feather')

    print('Create list of codes...')
    pharm_lookup = feather.read_dataframe(hp.data_dir +
                                          'CURRENT_VIEW_PHARMS_LOOKUP.feather')
    icd10_lookup = feather.read_dataframe(hp.data_dir +
                                          'CURRENT_ICD10_ALL_LOOKUP.feather')

    pharm_lookup = pharm_lookup[['CHEMICAL_ID', 'CHEMICAL_NAME']]
    pharm_lookup.rename(columns={
        'CHEMICAL_ID': 'CODE',
        'CHEMICAL_NAME': 'DESCRIPTION'
    },
                        inplace=True)
    pharm_lookup['CODE'] = pharm_lookup['CODE'].fillna(0).astype(int).astype(
        str)
    pharm_lookup.drop_duplicates(subset='CODE', inplace=True)
    pharm_lookup['TYPE'] = 0

    icd10_lookup = icd10_lookup[['code', 'code_description']]
    icd10_lookup.rename(columns={
        'code': 'CODE',
        'code_description': 'DESCRIPTION'
    },
                        inplace=True)
    icd10_lookup['CODE'] = icd10_lookup['CODE'].astype(str)
    icd10_lookup.drop_duplicates(subset='CODE', inplace=True)
    icd10_lookup['TYPE'] = 1

    print('Get prevalences and most frequent he code type...')
    pharm_lookup['DIAG_TYPE'] = 0
    info_ph = feather.read_dataframe(hp.data_pp_dir + 'info_ph_' + hp.gender +
                                     '.feather')
    info_ph.rename(columns={'chem_id': 'CODE'}, inplace=True)
    info_ph['CODE'] = info_ph['CODE'].astype(str)
    pharm_lookup = pharm_lookup.merge(info_ph, how='left', on='CODE')

    info_he = feather.read_dataframe(hp.data_pp_dir + 'info_he_' + hp.gender +
                                     '.feather')
    info_he.rename(columns={'CLIN_CD_10': 'CODE'}, inplace=True)
    icd10_lookup = icd10_lookup.merge(info_he, how='left', on='CODE')

    print('Merge with lookup table...')
    lookup = pd.concat([pharm_lookup, icd10_lookup],
                       ignore_index=True,
                       sort=False)
    df_index_code['CODE'] = df_index_code['CODE'].astype(str)
    df_index_code = df_index_code.merge(lookup,
                                        how='left',
                                        on=['CODE', 'TYPE'])
    num_embeddings = df_index_code.shape[0]

    print('Add standard columns...')
    if hp.redundant_predictors:
        cols_list = load_obj(hp.data_pp_dir + 'cols_list.pkl')
    else:
        cols_list = hp.reduced_col_list
    num_cols = len(cols_list)
    df_cols = pd.DataFrame({'TYPE': 2, 'DESCRIPTION': cols_list})
    df_index_code = pd.concat([df_cols, df_index_code], sort=False)

    #######################################################################################################

    print('Compute HRs...')

    # Trained models
    if hp.redundant_predictors:
        tmp = listdir(hp.log_dir + 'all/')
        models = ['all/' + i for i in tmp if '.pt' in i]
    else:
        tmp = listdir(hp.log_dir + 'all_no_redundancies/')
        models = ['all_no_redundancies/' + i for i in tmp if '.pt' in i]

    log_hr_columns = np.zeros((num_cols, len(models)))
    log_hr_embeddings = np.zeros((num_embeddings, len(models)))

    # Neural Net
    num_input = num_cols + 1 if hp.nonprop_hazards else num_cols
    net = NetRNNFinal(num_input, num_embeddings + 1,
                      hp).to(hp.device)  #+1 for zero padding
    net.eval()

    for i in range(len(models)):
        print('HRs for model {}'.format(i))

        # Restore variables from disk
        net.load_state_dict(
            torch.load(hp.log_dir + models[i], map_location=hp.device))

        with torch.no_grad():
            x_b = torch.zeros((1, num_cols), device=hp.device)
            codes_b = torch.zeros((1, 1), device=hp.device)
            month_b = torch.zeros((1, 1), device=hp.device)
            diagt_b = torch.zeros((1, 1), device=hp.device)
            risk_baseline = net(x_b, codes_b, month_b,
                                diagt_b).detach().cpu().numpy().squeeze()

        # Compute risk for standard columns
        for j in tqdm(range(num_cols)):
            with torch.no_grad():
                x_b = torch.zeros((1, num_cols), device=hp.device)
                codes_b = torch.zeros((1, 1), device=hp.device)
                month_b = torch.zeros((1, 1), device=hp.device)
                diagt_b = torch.zeros((1, 1), device=hp.device)
                x_b[0, j] = 1
                risk_mod = net(
                    x_b, codes_b, month_b,
                    diagt_b).detach().cpu().numpy().squeeze() - risk_baseline

            # Store
            log_hr_columns[j, i] = risk_mod

        # Compute risk for embeddings
        for j in tqdm(range(num_embeddings)):
            with torch.no_grad():
                x_b = torch.zeros((1, num_cols), device=hp.device)
                codes_b = torch.zeros((1, 1), device=hp.device)
                month_b = torch.zeros((1, 1), device=hp.device)
                diagt_b = torch.zeros((1, 1), device=hp.device)
                codes_b[0] = (j + 1)
                # Offset by num_cols: the standard columns were prepended to df_index_code above
                diagt_b[0] = df_index_code['DIAG_TYPE'].values[num_cols + j]
                risk_mod = net(
                    x_b, codes_b, month_b,
                    diagt_b).detach().cpu().numpy().squeeze() - risk_baseline

            # Store
            log_hr_embeddings[j, i] = risk_mod

    # Compute HRs
    log_hr_matrix = np.concatenate((log_hr_columns, log_hr_embeddings))
    mean_hr = np.exp(log_hr_matrix.mean(axis=1))
    lCI, uCI = np.exp(
        sms.DescrStatsW(log_hr_matrix.transpose()).tconfint_mean())
    df_index_code['HR'] = mean_hr
    df_index_code['lCI'] = lCI
    df_index_code['uCI'] = uCI

    # Save
    df_index_code.sort_values(by=['TYPE', 'HR'], ascending=False, inplace=True)
    if hp.redundant_predictors:
        df_index_code.to_csv(hp.results_dir + 'hr_addcodes_' + hp.gender +
                             '.csv',
                             index=False)
        df_index_code.reset_index(drop=True).to_feather(hp.results_dir +
                                                        'hr_addcodes_' +
                                                        hp.gender + '.feather')
    else:
        df_index_code.to_csv(hp.results_dir + 'hr_addcodes_' + hp.gender +
                             '_no_redundancies.csv',
                             index=False)
        df_index_code.reset_index(
            drop=True).to_feather(hp.results_dir + 'hr_addcodes_' + hp.gender +
                                  '_no_redundancies.feather')
Example #17
def main():
    hp = Hyperparameters()

    df = feather.read_dataframe(hp.data_dir + 'HX_ADM_2008_2012_v3-1.feather')
    df.rename(columns={'eventmonth_index': 'dispmonth_index'}, inplace=True)
    df['dispmonth_index'] = df['dispmonth_index'].astype(int)

    df.drop_duplicates(inplace=True)

    print('Remove future data...')
    df = df[df['dispmonth_index'] < 60]

    print('Replace DIAG_TYP with numerical values...')
    df.rename(columns={'DIAG_TYP': 'DIAG_TYPE'}, inplace=True)
    df['DIAG_TYPE'] = df['DIAG_TYPE'].replace({'A': 1, 'B': 2, 'E': 3, 'O': 4})

    print('Split males and females...')
    males = feather.read_dataframe(
        hp.data_pp_dir +
        'Py_VARIANZ_2012_v3-1_pp_males.feather')['VSIMPLE_INDEX_MASTER']
    females = feather.read_dataframe(
        hp.data_pp_dir +
        'Py_VARIANZ_2012_v3-1_pp_females.feather')['VSIMPLE_INDEX_MASTER']
    df_males = df.merge(males, how='inner', on='VSIMPLE_INDEX_MASTER')
    df_females = df.merge(females, how='inner', on='VSIMPLE_INDEX_MASTER')

    print('Remove codes associated with less than min_count persons...')
    df_males = df_males[df_males.groupby('CLIN_CD_10')['VSIMPLE_INDEX_MASTER'].
                        transform('nunique') >= hp.min_count]
    df_females = df_females[
        df_females.groupby('CLIN_CD_10')['VSIMPLE_INDEX_MASTER'].transform(
            'nunique') >= hp.min_count]

    print('Code prevalence and most frequent diag type...')
    info_he_males = df_males.groupby(['CLIN_CD_10'
                                      ])[['VSIMPLE_INDEX_MASTER', 'DIAG_TYPE']]
    info_he_males = info_he_males.agg({
        'VSIMPLE_INDEX_MASTER':
        lambda x: x.nunique(),
        'DIAG_TYPE':
        lambda x: pd.Series.mode(x)[0]
    }).reset_index()
    info_he_males.rename(columns={'VSIMPLE_INDEX_MASTER': 'PREVALENCE'},
                         inplace=True)
    info_he_females = df_females.groupby(
        ['CLIN_CD_10'])[['VSIMPLE_INDEX_MASTER', 'DIAG_TYPE']]
    info_he_females = info_he_females.agg({
        'VSIMPLE_INDEX_MASTER':
        lambda x: x.nunique(),
        'DIAG_TYPE':
        lambda x: pd.Series.mode(x)[0]
    }).reset_index()
    info_he_females.rename(columns={'VSIMPLE_INDEX_MASTER': 'PREVALENCE'},
                           inplace=True)

    print('Save...')
    info_he_males.to_feather(hp.data_pp_dir + 'info_he_males.feather')
    info_he_females.to_feather(hp.data_pp_dir + 'info_he_females.feather')

    df_males.sort_values(
        by=['VSIMPLE_INDEX_MASTER', 'dispmonth_index', 'CLIN_CD_10'],
        ascending=True,
        inplace=True)
    df_males.reset_index(drop=True, inplace=True)
    df_males.to_feather(hp.data_pp_dir + 'HE_pp_males.feather')

    df_females.sort_values(
        by=['VSIMPLE_INDEX_MASTER', 'dispmonth_index', 'CLIN_CD_10'],
        ascending=True,
        inplace=True)
    df_females.reset_index(drop=True, inplace=True)
    df_females.to_feather(hp.data_pp_dir + 'HE_pp_females.feather')
Example #18
def main():
    hp = Hyperparameters()
    
    print('Load data...')
    data = np.load(hp.data_pp_dir + 'data_arrays_' + hp.gender + '.npz')
    df_index_code = feather.read_dataframe(hp.data_pp_dir + 'df_index_code_' + hp.gender + '.feather')
    
    print('Test on each fold...')
    for fold in range(hp.num_folds):
        for swap in range(2):
            print('Fold: {} Swap: {}'.format(fold, swap))
            
            idx = (data['fold'][:, fold] == swap)
            x = data['x'][idx]
            codes = data['codes'][idx]
            month = data['month'][idx]
            diagt = data['diagt'][idx]

            if not hp.redundant_predictors:
                cols_list = load_obj(hp.data_pp_dir + 'cols_list.pkl')
                x = x[:, [cols_list.index(i) for i in hp.reduced_col_list]]            

            ####################################################################################################### 

            print('Create data loaders and tensors...')
            dataset = utils.TensorDataset(torch.from_numpy(x), torch.from_numpy(codes), torch.from_numpy(month), torch.from_numpy(diagt))

            # Create batch queues
            loader = utils.DataLoader(dataset, batch_size = hp.batch_size, shuffle = False, drop_last = False)

            # Neural Net
            net = NetRNNFinal(x.shape[1], df_index_code.shape[0]+1, hp).to(hp.device) #+1 for zero padding
            net.eval()

            # Trained models
            if hp.redundant_predictors:
                tmp = listdir(hp.log_dir + 'fold_' + str(fold) + '_' + str(1-swap) + '/')
                models = ['fold_' + str(fold) + '_' + str(1-swap) + '/' + i for i in tmp if '.pt' in i]
            else:
                tmp = listdir(hp.log_dir + 'fold_' + str(fold) + '_' + str(1-swap) + '_no_redundancies/')
                models = ['fold_' + str(fold) + '_' + str(1-swap) + '_no_redundancies/' + i for i in tmp if '.pt' in i]            
            lph_matrix = np.zeros((x.shape[0], len(models)))

            for i in range(len(models)):
                print('Model {}'.format(models[i]))
                # Restore variables from disk
                net.load_state_dict(torch.load(hp.log_dir + models[i], map_location=hp.device))
        
                # Prediction
                log_partial_hazard = np.array([])
                print('Computing partial hazard for test data...')
                with torch.no_grad():
                    for x_b, codes_b, month_b, diagt_b in tqdm(loader):  # batch variables renamed to avoid shadowing the full arrays above
                        x_b, codes_b, month_b, diagt_b = x_b.to(hp.device), codes_b.to(hp.device), month_b.to(hp.device), diagt_b.to(hp.device)
                        log_partial_hazard = np.append(log_partial_hazard, net(x_b, codes_b, month_b, diagt_b).detach().cpu().numpy())
                lph_matrix[:, i] = log_partial_hazard

            print('Create dataframe...')
            df_cml = pd.DataFrame(lph_matrix, columns=models)
            df_cml['LPH'] = lph_matrix.mean(axis=1)
            
            print('Saving log proportional hazards for fold...')
            if hp.redundant_predictors:
                df_cml.to_feather(hp.results_dir + 'df_cml_' + hp.gender + '_fold_' + str(fold) + '_' + str(swap) + '.feather')
            else:
                df_cml.to_feather(hp.results_dir + 'df_cml_' + hp.gender + '_fold_' + str(fold) + '_' + str(swap) + '_no_redundancies.feather')
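
The final ensembling step above stacks each checkpoint's per-person log partial hazards column-wise and averages them into a single 'LPH' column. A minimal sketch of just that step, with invented numbers and model names:

import numpy as np
import pandas as pd

# Toy predictions: 5 people x 3 saved checkpoints.
lph_matrix = np.array([[0.1, 0.2, 0.0],
                       [1.0, 0.9, 1.1],
                       [-0.5, -0.4, -0.6],
                       [0.0, 0.1, -0.1],
                       [2.0, 1.8, 2.2]])
df_cml = pd.DataFrame(lph_matrix, columns=['m0.pt', 'm1.pt', 'm2.pt'])
df_cml['LPH'] = lph_matrix.mean(axis=1)  # the ensemble prediction used downstream
print(df_cml)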
Example #19
0
def main():
    # Load data
    print('Load data...')
    hp = Hyperparameters()
    data = np.load(hp.data_pp_dir + 'data_arrays_' + hp.gender + '.npz')

    print('Use all data for model fitting...')
    x = data['x']
    time = data['time']
    event = data['event']

    cols_list = load_obj(hp.data_pp_dir + 'cols_list.pkl')

    df = pd.DataFrame(x, columns=cols_list)
    df['TIME'] = time
    df['EVENT'] = event

    ###################################################################

    print('Fitting all data...')
    cph = CoxPHFitter()
    cph.fit(df,
            duration_col='TIME',
            event_col='EVENT',
            show_progress=True,
            step_size=0.5)
    cph.print_summary()

    print('Saving...')
    df_summary = cph.summary
    df_summary['PREDICTOR'] = cols_list
    df_summary.to_csv(hp.results_dir + 'hr_' + hp.gender + '.csv', index=False)

    ###################################################################

    print('Test on each fold (train on swapped)...')
    for fold in range(hp.num_folds):
        for swap in range(2):
            print('Fold: {} Swap: {}'.format(fold, swap))

            idx = (data['fold'][:, fold] == (1 - swap))
            x = data['x'][idx]
            time = data['time'][idx]
            event = data['event'][idx]

            df = pd.DataFrame(x, columns=cols_list)
            df['TIME'] = time
            df['EVENT'] = event

            print('Fitting all data...')
            cph = CoxPHFitter()
            cph.fit(df,
                    duration_col='TIME',
                    event_col='EVENT',
                    show_progress=True,
                    step_size=0.5)
            print('done')

            idx = (data['fold'][:, fold] == swap)
            x = data['x'][idx]
            df_cox = pd.DataFrame(
                {'LPH': np.dot(x - cph._norm_mean.values, cph.params_)})

            print('Saving log proportional hazards for fold...')
            df_cox.to_feather(hp.results_dir + 'df_cox_' + hp.gender +
                              '_fold_' + str(fold) + '_' + str(swap) +
                              '.feather')
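
The per-fold loop above builds the linear predictor by hand as np.dot(x - cph._norm_mean.values, cph.params_), where _norm_mean is a private lifelines attribute. A hedged sketch, using lifelines' bundled Rossi dataset, comparing that manual computation with the public predict_log_partial_hazard API; they should agree as long as lifelines keeps centering by the training means internally.

import numpy as np
from lifelines import CoxPHFitter
from lifelines.datasets import load_rossi

rossi = load_rossi()
cph = CoxPHFitter()
cph.fit(rossi, duration_col='week', event_col='arrest')

X = rossi.drop(columns=['week', 'arrest'])
manual = np.dot(X.values - cph._norm_mean.values, cph.params_)         # as in the script above (private attribute)
builtin = np.asarray(cph.predict_log_partial_hazard(X)).ravel()        # lifelines' public API
print(np.allclose(manual, builtin))  # expected True under the assumption above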
Example #20
0
#!/usr/bin/env python

import logging, pickle
from hyperparameters import Hyperparameters

if __name__ == "__main__":
    hyperparameters = Hyperparameters("language-model.cfg")

    import os.path, os
    # Setting up a log file. This is handy to follow progress during
    # the program's execution without resorting to printing to stdout.
    logfile = os.path.join(hyperparameters.run_dir, hyperparameters.logfile)
    verboselogfile = os.path.join(hyperparameters.run_dir,
                                  hyperparameters.verboselogfile)
    logging.basicConfig(filename=logfile, filemode="w", level=logging.DEBUG)
    print("Logging to %s, and creating link %s" % (logfile, verboselogfile))

    try:
        logging.info("Trying to read training state from %s..." %
                     hyperparameters.run_dir)
        filename = os.path.join(hyperparameters.run_dir, "trainstate.pkl")
        with open(filename, 'rb') as f:
            saved_state = pickle.load(f)

        corpus_state, dictionary_state, hyperparameters = saved_state[0:3]  # all three pieces are needed; slicing [0:2] would fail to unpack

        from lexicon import Corpus, Dictionary
        corpus = Corpus(*corpus_state)
        dictionary = Dictionary(*dictionary_state)

        from state import TrainingState
Example #21
0
File: train.py Project: sinopeus/thrax
#!/usr/bin/env python

import logging, pickle
from hyperparameters import Hyperparameters


if __name__ == "__main__":
    hyperparameters = Hyperparameters("language-model.cfg")

    import os.path, os
    # Setting up a log file. This is handy to follow progress during
    # the program's execution without resorting to printing to stdout.
    logfile = os.path.join(hyperparameters.run_dir, hyperparameters.logfile)
    verboselogfile = os.path.join(hyperparameters.run_dir, hyperparameters.verboselogfile)
    logging.basicConfig(filename=logfile, filemode="w", level=logging.DEBUG)
    print("Logging to %s, and creating link %s" % (logfile, verboselogfile))

    try:
        logging.info("Trying to read training state from %s..." % hyperparameters.run_dir)
        filename = os.path.join(hyperparameters.run_dir, "trainstate.pkl")
        with open(filename, 'rb') as f:
            saved_state = pickle.load(f)

        corpus_state, dictionary_state, hyperparameters = saved_state[0:3]  # all three pieces are needed; slicing [0:2] would fail to unpack

        from lexicon import Corpus, Dictionary
        corpus = Corpus(*corpus_state)
        dictionary = Dictionary(*dictionary_state)

        from state import TrainingState
        trainstate = TrainingState(corpus, dictionary, hyperparameters)
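
Both train.py excerpts restore (corpus state, dictionary state, hyperparameters) from a single trainstate.pkl. The save-side counterpart is not shown in this listing; a hypothetical sketch of what it might look like, under the assumption that each object exposes a picklable state tuple:

import os, pickle

def save_training_state(run_dir, corpus_state, dictionary_state, hyperparameters):
    # Hypothetical counterpart to the restore code above: pickle the pieces
    # needed to resume training into run_dir/trainstate.pkl.
    filename = os.path.join(run_dir, "trainstate.pkl")
    with open(filename, "wb") as f:
        pickle.dump((corpus_state, dictionary_state, hyperparameters), f)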
Example #22
0
        point = np.array([x, y], dtype=np.float32)
        sample = sampleLatentSpace(cfg, nn, point)
        plt.imshow(sample, cmap='Greys')
        plt.title("figure {}; Latent Space Vector ({},{})".format(
            i, round(x, 3), round(y, 3)))
        plt.savefig(os.path.join(cfg.savePDFLocation, str(i)))  # portable path join (assumes os is imported) instead of a hard-coded backslash
        plt.clf()

    print("Results saved in {}".format(cfg.savePDFLocation))


if __name__ == "__main__":
    cfg = Config()
    cfg.getArgs()

    hyp = Hyperparameters()

    data, balanceTracker = getDataArray(cfg.dataPath, cfg)
    trainingData = shuffleData(data)

    net = NeralNet(hyp)
    trainer = Trainer(cfg, trainingData)

    trainer.train(net, hyp)
    if trainer.trainingLoss[-1] > 20300:
        print(
            ">>Network Stuck in Local Minimum, Please Re-run to get Proper Results"
        )

    generatePDFs(net, cfg, hyp, trainer)
Example #23
0
def main():
    hp = Hyperparameters()

    # Load data
    #df = feather.read_dataframe(hp.data_dir + 'Py_VARIANZ_2012_v3-1.feather')
    df = pd.read_feather(hp.data_dir + 'Py_VARIANZ_2012_v3-1.feather')

    # Exclude
    df = df[~df['ph_loopdiuretics_prior_5yrs_3evts'].astype(bool)]
    df = df[~df['ph_antianginals_prior_5yrs_3evts'].astype(bool)]
    df.dropna(subset=['end_fu_date'], inplace=True)

    # Adjust data types
    df['nhi_age'] = df['nhi_age'].astype(int)
    df['gender_code'] = df['gender_code'].astype(bool)
    df['en_prtsd_eth'] = df['en_prtsd_eth'].astype(int)
    df['en_nzdep_q'] = df['en_nzdep_q'].astype(int)
    df['hx_vdr_diabetes'] = df['hx_vdr_diabetes'].astype(bool)
    df['hx_af'] = df['hx_af'].astype(bool)
    df['ph_bp_lowering_prior_6mths'] = df['ph_bp_lowering_prior_6mths'].astype(
        bool)
    df['ph_lipid_lowering_prior_6mths'] = df[
        'ph_lipid_lowering_prior_6mths'].astype(bool)
    df['ph_anticoagulants_prior_6mths'] = df[
        'ph_anticoagulants_prior_6mths'].astype(bool)
    df['ph_antiplatelets_prior_6mths'] = df[
        'ph_antiplatelets_prior_6mths'].astype(bool)
    df['out_broad_cvd_adm_date'] = pd.to_datetime(df['out_broad_cvd_adm_date'],
                                                  format='%Y-%m-%d',
                                                  errors='coerce')
    df['end_fu_date'] = pd.to_datetime(df['end_fu_date'],
                                       format='%Y-%m-%d',
                                       errors='coerce')

    # Map Other Asian, Chinese, MELAA to 'other'
    df['en_prtsd_eth'].replace({4: 9, 42: 9, 5: 9}, inplace=True)

    # Create antiplatelet/anticoagulant column
    df['ph_antiplat_anticoag_prior_6mths'] = df[
        'ph_antiplatelets_prior_6mths'] | df['ph_anticoagulants_prior_6mths']

    # Time to event and binary event column
    df['EVENT_DATE'] = df[['out_broad_cvd_adm_date',
                           'end_fu_date']].min(axis=1)
    beginning = pd.to_datetime({'year': [2012], 'month': [12], 'day': [31]})[0]
    df['TIME'] = (df['EVENT_DATE'] - beginning).dt.days.astype(int)
    df['EVENT'] = df['out_broad_cvd'] | df['imp_fatal_cvd']

    # Descriptive statistics
    num_participants = len(df.index)
    print('Total participants: {}'.format(num_participants))
    num_males = len(df.loc[df['gender_code']].index)
    num_females = len(df.loc[~df['gender_code']].index)
    print('Men: {} ({:.1f}%)'.format(num_males,
                                     100 * num_males / num_participants))
    print('Women: {} ({:.1f}%)'.format(num_females,
                                       100 * num_females / num_participants))
    mean_age_males, std_age_males = df.loc[
        df['gender_code'], 'nhi_age'].mean(), df.loc[df['gender_code'],
                                                     'nhi_age'].std()
    mean_age_females, std_age_females = df.loc[
        ~df['gender_code'], 'nhi_age'].mean(), df.loc[~df['gender_code'],
                                                      'nhi_age'].std()
    print('Age Men: {:.1f} ({:.1f})'.format(mean_age_males, std_age_males))
    print('Age Women: {:.1f} ({:.1f})'.format(mean_age_females,
                                              std_age_females))
    num_nze_males = (df.loc[df['gender_code'], 'en_prtsd_eth'] == 1).sum()
    num_nze_females = (df.loc[~df['gender_code'], 'en_prtsd_eth'] == 1).sum()
    print('NZE Men: {} ({:.1f}%)'.format(num_nze_males,
                                         100 * num_nze_males / num_males))
    print('NZE Women: {} ({:.1f}%)'.format(num_nze_females, 100 *
                                           num_nze_females / num_females))
    num_maori_males = (df.loc[df['gender_code'], 'en_prtsd_eth'] == 2).sum()
    num_maori_females = (df.loc[~df['gender_code'], 'en_prtsd_eth'] == 2).sum()
    print('Maori Men: {} ({:.1f}%)'.format(num_maori_males,
                                           100 * num_maori_males / num_males))
    print('Maori Women: {} ({:.1f}%)'.format(
        num_maori_females, 100 * num_maori_females / num_females))
    num_pacific_males = (df.loc[df['gender_code'], 'en_prtsd_eth'] == 3).sum()
    num_pacific_females = (df.loc[~df['gender_code'],
                                  'en_prtsd_eth'] == 3).sum()
    print('Pacific Men: {} ({:.1f}%)'.format(
        num_pacific_males, 100 * num_pacific_males / num_males))
    print('Pacific Women: {} ({:.1f}%)'.format(
        num_pacific_females, 100 * num_pacific_females / num_females))
    num_indian_males = (df.loc[df['gender_code'], 'en_prtsd_eth'] == 43).sum()
    num_indian_females = (df.loc[~df['gender_code'],
                                 'en_prtsd_eth'] == 43).sum()
    print('Indian Men: {} ({:.1f}%)'.format(num_indian_males, 100 *
                                            num_indian_males / num_males))
    print('Indian Women: {} ({:.1f}%)'.format(
        num_indian_females, 100 * num_indian_females / num_females))
    num_other_males = (df.loc[df['gender_code'], 'en_prtsd_eth'] == 9).sum()
    num_other_females = (df.loc[~df['gender_code'], 'en_prtsd_eth'] == 9).sum()
    print('Other Men: {} ({:.1f}%)'.format(num_other_males,
                                           100 * num_other_males / num_males))
    print('Other Women: {} ({:.1f}%)'.format(
        num_other_females, 100 * num_other_females / num_females))
    num_dp1_males = (df.loc[df['gender_code'], 'en_nzdep_q'] == 1).sum()
    num_dp1_females = (df.loc[~df['gender_code'], 'en_nzdep_q'] == 1).sum()
    print('dp1 Men: {} ({:.1f}%)'.format(num_dp1_males,
                                         100 * num_dp1_males / num_males))
    print('dp1 Women: {} ({:.1f}%)'.format(num_dp1_females, 100 *
                                           num_dp1_females / num_females))
    num_dp2_males = (df.loc[df['gender_code'], 'en_nzdep_q'] == 2).sum()
    num_dp2_females = (df.loc[~df['gender_code'], 'en_nzdep_q'] == 2).sum()
    print('dp2 Men: {} ({:.1f}%)'.format(num_dp2_males,
                                         100 * num_dp2_males / num_males))
    print('dp2 Women: {} ({:.1f}%)'.format(num_dp2_females, 100 *
                                           num_dp2_females / num_females))
    num_dp3_males = (df.loc[df['gender_code'], 'en_nzdep_q'] == 3).sum()
    num_dp3_females = (df.loc[~df['gender_code'], 'en_nzdep_q'] == 3).sum()
    print('dp3 Men: {} ({:.1f}%)'.format(num_dp3_males,
                                         100 * num_dp3_males / num_males))
    print('dp3 Women: {} ({:.1f}%)'.format(num_dp3_females, 100 *
                                           num_dp3_females / num_females))
    num_dp4_males = (df.loc[df['gender_code'], 'en_nzdep_q'] == 4).sum()
    num_dp4_females = (df.loc[~df['gender_code'], 'en_nzdep_q'] == 4).sum()
    print('dp4 Men: {} ({:.1f}%)'.format(num_dp4_males,
                                         100 * num_dp4_males / num_males))
    print('dp4 Women: {} ({:.1f}%)'.format(num_dp4_females, 100 *
                                           num_dp4_females / num_females))
    num_dp5_males = (df.loc[df['gender_code'], 'en_nzdep_q'] == 5).sum()
    num_dp5_females = (df.loc[~df['gender_code'], 'en_nzdep_q'] == 5).sum()
    print('dp5 Men: {} ({:.1f}%)'.format(num_dp5_males,
                                         100 * num_dp5_males / num_males))
    print('dp5 Women: {} ({:.1f}%)'.format(num_dp5_females, 100 *
                                           num_dp5_females / num_females))
    num_diabetes_males = df.loc[df['gender_code'], 'hx_vdr_diabetes'].sum()
    num_diabetes_females = df.loc[~df['gender_code'], 'hx_vdr_diabetes'].sum()
    print('Diabetes Men: {} ({:.1f}%)'.format(
        num_diabetes_males, 100 * num_diabetes_males / num_males))
    print('Diabetes Women: {} ({:.1f}%)'.format(
        num_diabetes_females, 100 * num_diabetes_females / num_females))
    num_AF_males = df.loc[df['gender_code'], 'hx_af'].sum()
    num_AF_females = df.loc[~df['gender_code'], 'hx_af'].sum()
    print('AF Men: {} ({:.1f}%)'.format(num_AF_males,
                                        100 * num_AF_males / num_males))
    print('AF Women: {} ({:.1f}%)'.format(num_AF_females,
                                          100 * num_AF_females / num_females))
    num_BP_males = df.loc[df['gender_code'],
                          'ph_bp_lowering_prior_6mths'].sum()
    num_BP_females = df.loc[~df['gender_code'],
                            'ph_bp_lowering_prior_6mths'].sum()
    print('BP Men: {} ({:.1f}%)'.format(num_BP_males,
                                        100 * num_BP_males / num_males))
    print('BP Women: {} ({:.1f}%)'.format(num_BP_females,
                                          100 * num_BP_females / num_females))
    num_LL_males = df.loc[df['gender_code'],
                          'ph_lipid_lowering_prior_6mths'].sum()
    num_LL_females = df.loc[~df['gender_code'],
                            'ph_lipid_lowering_prior_6mths'].sum()
    print('LL Men: {} ({:.1f}%)'.format(num_LL_males,
                                        100 * num_LL_males / num_males))
    print('LL Women: {} ({:.1f}%)'.format(num_LL_females,
                                          100 * num_LL_females / num_females))
    num_APAC_males = df.loc[df['gender_code'],
                            'ph_antiplat_anticoag_prior_6mths'].sum()
    num_APAC_females = df.loc[~df['gender_code'],
                              'ph_antiplat_anticoag_prior_6mths'].sum()
    print('APAC Men: {} ({:.1f}%)'.format(num_APAC_males,
                                          100 * num_APAC_males / num_males))
    print('APAC Women: {} ({:.1f}%)'.format(
        num_APAC_females, 100 * num_APAC_females / num_females))
    follow_up_males, follow_up_males_mean = df.loc[
        df['gender_code'], 'TIME'].sum() / 365, df.loc[df['gender_code'],
                                                       'TIME'].mean() / 365
    follow_up_females, follow_up_females_mean = df.loc[
        ~df['gender_code'], 'TIME'].sum() / 365, df.loc[~df['gender_code'],
                                                        'TIME'].mean() / 365
    print('Follow up Men: {:.0f} ({:.1f})'.format(follow_up_males,
                                                  follow_up_males_mean))
    print('Follow up Women: {:.0f} ({:.1f})'.format(follow_up_females,
                                                    follow_up_females_mean))
    num_CVD_death_males = df.loc[df['gender_code'], 'imp_fatal_cvd'].sum()
    num_CVD_death_females = df.loc[~df['gender_code'], 'imp_fatal_cvd'].sum()
    print('CVD death Men: {} ({:.1f}%)'.format(
        num_CVD_death_males, 100 * num_CVD_death_males / num_males))
    print('CVD death Women: {} ({:.1f}%)'.format(
        num_CVD_death_females, 100 * num_CVD_death_females / num_females))
    num_CVD_event_males = df.loc[df['gender_code'], 'EVENT'].sum()
    num_CVD_event_females = df.loc[~df['gender_code'], 'EVENT'].sum()
    print('CVD event Men: {} ({:.1f}%)'.format(
        num_CVD_event_males, 100 * num_CVD_event_males / num_males))
    print('CVD event Women: {} ({:.1f}%)'.format(
        num_CVD_event_females, 100 * num_CVD_event_females / num_females))
    tmp_males = df.loc[df['gender_code'] & df['EVENT'], 'TIME'] / 365
    time_to_CVD_males = tmp_males.median()
    time_to_CVD_males_Q1, time_to_CVD_males_Q3 = tmp_males.quantile(0.25), tmp_males.quantile(0.75)
    tmp_females = df.loc[~df['gender_code'] & df['EVENT'], 'TIME'] / 365
    time_to_CVD_females = tmp_females.median()
    time_to_CVD_females_Q1, time_to_CVD_females_Q3 = tmp_females.quantile(0.25), tmp_females.quantile(0.75)
    print('Time to CVD Men: {:.1f} ({:.1f}, {:.1f})'.format(
        time_to_CVD_males, time_to_CVD_males_Q1, time_to_CVD_males_Q3))
    print('Time to CVD Women: {:.1f} ({:.1f}, {:.1f})'.format(
        time_to_CVD_females, time_to_CVD_females_Q1, time_to_CVD_females_Q3))
    num_censored_5y_males = (1 - df.loc[df['gender_code'] &
                                        (df['TIME'] == 1826), 'EVENT']).sum()
    num_censored_5y_females = (1 -
                               df.loc[~df['gender_code'] &
                                      (df['TIME'] == 1826), 'EVENT']).sum()
    print('Censored at 5 years Men: {} ({:.1f}%)'.format(
        num_censored_5y_males, 100 * num_censored_5y_males / num_males))
    print('Censored at 5 years Women: {} ({:.1f}%)'.format(
        num_censored_5y_females, 100 * num_censored_5y_females / num_females))

    # Center age and deprivation index, separately for males and females
    mean_age_males = df.loc[df['gender_code'], 'nhi_age'].mean()
    mean_age_females = df.loc[~df['gender_code'], 'nhi_age'].mean()
    df.loc[df['gender_code'],
           'nhi_age'] = df.loc[df['gender_code'], 'nhi_age'] - mean_age_males
    df.loc[~df['gender_code'],
           'nhi_age'] = df.loc[~df['gender_code'],
                               'nhi_age'] - mean_age_females

    mean_nzdep_males = 3
    mean_nzdep_females = 3
    df.loc[df['gender_code'],
           'en_nzdep_q'] = df.loc[df['gender_code'],
                                  'en_nzdep_q'] - mean_nzdep_males
    df.loc[~df['gender_code'],
           'en_nzdep_q'] = df.loc[~df['gender_code'],
                                  'en_nzdep_q'] - mean_nzdep_females

    # Create interaction columns
    df['age_X_bp'] = df['nhi_age'] * df['ph_bp_lowering_prior_6mths']
    df['age_X_diabetes'] = df['nhi_age'] * df['hx_vdr_diabetes']
    df['age_X_af'] = df['nhi_age'] * df['hx_af']
    df['bp_X_diabetes'] = df['ph_bp_lowering_prior_6mths'] & df[
        'hx_vdr_diabetes']
    df['antiplat_anticoag_X_diabetes'] = df[
        'ph_antiplat_anticoag_prior_6mths'] & df['hx_vdr_diabetes']
    df['bp_X_lipid'] = df['ph_bp_lowering_prior_6mths'] & df[
        'ph_lipid_lowering_prior_6mths']

    # Keep all VARIANZ risk equations columns
    keep_cols = [
        'VSIMPLE_INDEX_MASTER', 'nhi_age', 'gender_code', 'en_prtsd_eth',
        'en_nzdep_q', 'hx_vdr_diabetes', 'hx_af', 'ph_bp_lowering_prior_6mths',
        'ph_lipid_lowering_prior_6mths', 'ph_antiplat_anticoag_prior_6mths',
        'age_X_bp', 'age_X_diabetes', 'age_X_af', 'bp_X_diabetes',
        'antiplat_anticoag_X_diabetes', 'bp_X_lipid', 'TIME', 'EVENT'
    ]
    df = df[keep_cols]

    # Save
    df_males = df[df['gender_code']]
    df_males.reset_index(drop=True, inplace=True)
    df_males.to_feather(hp.data_pp_dir +
                        'Py_VARIANZ_2012_v3-1_pp_males.feather')
    np.savez(hp.data_pp_dir + 'means_males.npz',
             mean_age=mean_age_males,
             mean_nzdep=mean_nzdep_males)

    df_females = df[~df['gender_code']]
    df_females.reset_index(drop=True, inplace=True)
    df_females.to_feather(hp.data_pp_dir +
                          'Py_VARIANZ_2012_v3-1_pp_females.feather')
    np.savez(hp.data_pp_dir + 'means_females.npz',
             mean_age=mean_age_females,
             mean_nzdep=mean_nzdep_females)
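
The time-to-event construction above takes the earlier of the CVD admission date and the end of follow-up, measures days from the 31 Dec 2012 baseline, and flags an event if either a non-fatal or fatal CVD outcome occurred. A toy illustration of those three columns (dates invented):

import pandas as pd

toy = pd.DataFrame({
    'out_broad_cvd_adm_date': pd.to_datetime(['2014-06-01', pd.NaT]),
    'end_fu_date':            pd.to_datetime(['2017-12-31', '2017-12-31']),
    'out_broad_cvd':          [True, False],
    'imp_fatal_cvd':          [False, False],
})
baseline = pd.Timestamp('2012-12-31')
toy['EVENT_DATE'] = toy[['out_broad_cvd_adm_date', 'end_fu_date']].min(axis=1)   # NaT is ignored by min
toy['TIME'] = (toy['EVENT_DATE'] - baseline).dt.days.astype(int)
toy['EVENT'] = toy['out_broad_cvd'] | toy['imp_fatal_cvd']
print(toy[['EVENT_DATE', 'TIME', 'EVENT']])
# Person 1 has an event at roughly 1.4 years; person 2 is censored at the end of follow-up.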
Example #24
0
def objective(trial, data, df_index_code):
    hp = Hyperparameters(trial)
    #hp = Hyperparameters()
    print(trial.params)

    idx_trn = (data['fold'] != 99)
    x_trn = data['x'][idx_trn]
    time_trn = data['time'][idx_trn]
    event_trn = data['event'][idx_trn]
    codes_trn = data['codes'][idx_trn]
    month_trn = data['month'][idx_trn]
    diagt_trn = data['diagt'][idx_trn]

    idx_val = (data['fold'] == 99)
    x_val = data['x'][idx_val]
    time_val = data['time'][idx_val]
    event_val = data['event'][idx_val]
    codes_val = data['codes'][idx_val]
    month_val = data['month'][idx_val]
    diagt_val = data['diagt'][idx_val]

    # could move this outside objective function for efficiency
    sort_idx_trn, case_idx_trn, max_idx_control_trn = sort_and_case_indices(
        x_trn, time_trn, event_trn)
    sort_idx_val, case_idx_val, max_idx_control_val = sort_and_case_indices(
        x_val, time_val, event_val)

    x_trn, time_trn, event_trn = x_trn[sort_idx_trn], time_trn[
        sort_idx_trn], event_trn[sort_idx_trn]
    codes_trn, month_trn, diagt_trn = codes_trn[sort_idx_trn], month_trn[
        sort_idx_trn], diagt_trn[sort_idx_trn]

    x_val, time_val, event_val = x_val[sort_idx_val], time_val[
        sort_idx_val], event_val[sort_idx_val]
    codes_val, month_val, diagt_val = codes_val[sort_idx_val], month_val[
        sort_idx_val], diagt_val[sort_idx_val]

    #######################################################################################################

    print('Create data loaders and tensors...')
    case_trn = utils.TensorDataset(torch.from_numpy(x_trn[case_idx_trn]),
                                   torch.from_numpy(time_trn[case_idx_trn]),
                                   torch.from_numpy(max_idx_control_trn),
                                   torch.from_numpy(codes_trn[case_idx_trn]),
                                   torch.from_numpy(month_trn[case_idx_trn]),
                                   torch.from_numpy(diagt_trn[case_idx_trn]))
    case_val = utils.TensorDataset(torch.from_numpy(x_val[case_idx_val]),
                                   torch.from_numpy(time_val[case_idx_val]),
                                   torch.from_numpy(max_idx_control_val),
                                   torch.from_numpy(codes_val[case_idx_val]),
                                   torch.from_numpy(month_val[case_idx_val]),
                                   torch.from_numpy(diagt_val[case_idx_val]))

    x_trn, x_val = torch.from_numpy(x_trn), torch.from_numpy(x_val)
    time_trn, time_val = torch.from_numpy(time_trn), torch.from_numpy(time_val)
    event_trn, event_val = torch.from_numpy(event_trn), torch.from_numpy(
        event_val)
    codes_trn, codes_val = torch.from_numpy(codes_trn), torch.from_numpy(
        codes_val)
    month_trn, month_val = torch.from_numpy(month_trn), torch.from_numpy(
        month_val)
    diagt_trn, diagt_val = torch.from_numpy(diagt_trn), torch.from_numpy(
        diagt_val)

    # Create batch queues
    trn_loader = utils.DataLoader(case_trn,
                                  batch_size=hp.batch_size,
                                  shuffle=True,
                                  drop_last=True)
    val_loader = utils.DataLoader(case_val,
                                  batch_size=hp.batch_size,
                                  shuffle=False,
                                  drop_last=False)

    print('Train...')
    # Neural Net
    hp.model_name = str(trial.number) + '_' + hp.model_name
    num_input = x_trn.shape[1] + 1 if hp.nonprop_hazards else x_trn.shape[1]
    net = NetRNNFinal(num_input, df_index_code.shape[0] + 1,
                      hp).to(hp.device)  #+1 for zero padding
    criterion = CoxPHLoss().to(hp.device)
    optimizer = optim.Adam(net.parameters(), lr=hp.learning_rate)

    best, num_bad_epochs = 100., 0
    for epoch in range(1000):
        trn(trn_loader, x_trn, codes_trn, month_trn, diagt_trn, net, criterion,
            optimizer, hp)
        loss_val = val(val_loader, x_val, codes_val, month_val, diagt_val, net,
                       criterion, epoch, hp)
        # early stopping
        if loss_val < best:
            print(
                '############### Saving good model ###############################'
            )
            torch.save(net.state_dict(), hp.log_dir + hp.model_name)
            best = loss_val
            num_bad_epochs = 0
        else:
            num_bad_epochs += 1
            if num_bad_epochs == hp.patience:
                break
        # pruning
        trial.report(best, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

    print('Done')
    return best
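
The objective above reports its running best validation loss to Optuna every epoch and raises optuna.TrialPruned when the study's pruner says to stop. A self-contained toy sketch of that report/prune pattern, with an invented loss curve standing in for the real training loop (the real objective is the function above, not this one):

import optuna

def toy_objective(trial):
    lr = trial.suggest_float('lr', 1e-4, 1e-1, log=True)
    best = float('inf')
    for epoch in range(20):
        loss = 1.0 / (epoch + 1) + lr          # dummy "validation loss"
        best = min(best, loss)
        trial.report(best, epoch)              # same report/prune pattern as above
        if trial.should_prune():
            raise optuna.TrialPruned()
    return best

study = optuna.create_study(direction='minimize',
                            pruner=optuna.pruners.MedianPruner(n_warmup_steps=5))
study.optimize(toy_objective, n_trials=10)
print(study.best_params)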
Example #25
0
 def _restore_hp(self):
     self.hyperparameters = Hyperparameters.from_csv(self.path)
Example #26
0
File: main.py Project: abagaria/seq2seq
                        help="Name of experiment",
                        default="")
    parser.add_argument("--batch_size", type=int, help="batch size", default=1)
    parser.add_argument("--epochs",
                        type=int,
                        help="number of training epochs",
                        default=10)
    parser.add_argument("--embedding_size",
                        type=int,
                        help="embedding size",
                        default=500)
    parser.add_argument("--hidden_size",
                        type=int,
                        help="RNN size",
                        default=512)
    parser.add_argument("--lr", type=float, help="Learning rate", default=1e-3)
    parser.add_argument("--bidirectional",
                        type=bool,
                        help="Bidirectional RNN",
                        default=False)
    parser.add_argument("--num_rnn_layers",
                        type=int,
                        help="# RNN Layers",
                        default=2)
    args = parser.parse_args()

    writer = SummaryWriter(args.experiment_name)
    hyperparameters = Hyperparameters(args)

    main()
Example #27
0
def main():
    hp = Hyperparameters()

    print('Load data...')
    data = np.load(hp.data_pp_dir + 'data_arrays_' + hp.gender + '.npz')
    df_index_code = feather.read_dataframe(hp.data_pp_dir + 'df_index_code_' +
                                           hp.gender + '.feather')

    print('Train on each fold...')
    for fold in range(hp.num_folds):
        for swap in range(2):
            print('Fold: {} Swap: {}'.format(fold, swap))

            idx = (data['fold'][:, fold] == swap)
            x = data['x'][idx]
            time = data['time'][idx]
            event = data['event'][idx]
            codes = data['codes'][idx]
            month = data['month'][idx]
            diagt = data['diagt'][idx]

            if not hp.redundant_predictors:
                cols_list = load_obj(hp.data_pp_dir + 'cols_list.pkl')
                x = x[:, [cols_list.index(i) for i in hp.reduced_col_list]]

            sort_idx, case_idx, max_idx_control = sort_and_case_indices(
                x, time, event)
            x, time, event = x[sort_idx], time[sort_idx], event[sort_idx]
            codes, month, diagt = codes[sort_idx], month[sort_idx], diagt[
                sort_idx]

            print('Create data loaders and tensors...')
            case = utils.TensorDataset(torch.from_numpy(x[case_idx]),
                                       torch.from_numpy(time[case_idx]),
                                       torch.from_numpy(max_idx_control),
                                       torch.from_numpy(codes[case_idx]),
                                       torch.from_numpy(month[case_idx]),
                                       torch.from_numpy(diagt[case_idx]))

            x = torch.from_numpy(x)
            time = torch.from_numpy(time)
            event = torch.from_numpy(event)
            codes = torch.from_numpy(codes)
            month = torch.from_numpy(month)
            diagt = torch.from_numpy(diagt)

            for trial in range(hp.num_trials):
                print('Trial: {}'.format(trial))

                # Create batch queues
                trn_loader = utils.DataLoader(case,
                                              batch_size=hp.batch_size,
                                              shuffle=True,
                                              drop_last=True)

                print('Train...')
                # Neural Net
                hp.model_name = str(trial) + '_' + datetime.now().strftime(
                    '%Y%m%d_%H%M%S_%f') + '.pt'
                num_input = x.shape[1] + 1 if hp.nonprop_hazards else x.shape[1]
                net = NetRNNFinal(num_input, df_index_code.shape[0] + 1,
                                  hp).to(hp.device)  #+1 for zero padding
                criterion = CoxPHLoss().to(hp.device)
                optimizer = optim.Adam(net.parameters(), lr=hp.learning_rate)

                for epoch in range(hp.max_epochs):
                    trn(trn_loader, x, codes, month, diagt, net, criterion,
                        optimizer, hp)
                if hp.redundant_predictors:
                    torch.save(
                        net.state_dict(), hp.log_dir + 'fold_' + str(fold) +
                        '_' + str(swap) + '/' + hp.model_name)
                else:
                    torch.save(
                        net.state_dict(), hp.log_dir + 'fold_' + str(fold) +
                        '_' + str(swap) + '_no_redundancies/' + hp.model_name)
                print('Done')
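
The fold/swap indexing used above (and in the matching test script in Example #18) implements a 5x2-CV style split: each fold column assigns people to one of two halves (0 or 1), training runs on fold[:, i] == swap, and the opposite half is scored later; 99 marks rows held out from cross-validation altogether. A toy numpy sketch of that selection:

import numpy as np

# Toy fold matrix for 6 people and 2 folds: 0/1 assign the two halves; 99 = excluded.
fold = np.array([[0, 1],
                 [1, 0],
                 [0, 0],
                 [1, 1],
                 [99, 99],
                 [0, 1]])

fold_idx, swap = 0, 0
trn_rows = np.where(fold[:, fold_idx] == swap)[0]       # trained on here (this script)
tst_rows = np.where(fold[:, fold_idx] == 1 - swap)[0]   # scored by the test script
print(trn_rows, tst_rows)  # -> [0 2 5] [1 3]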
Example #28
0
def main():
    # Load data
    print('Load data...')
    hp = Hyperparameters()
    if hp.redundant_predictors:
        data = np.load(hp.results_dir + 'eval_vecs_' + hp.gender + '.npz')
    else:
        data = np.load(hp.results_dir + 'eval_vecs_' + hp.gender +
                       '_no_redundancies.npz')

    # evaluation arrays
    r2_vec_cox = data['r2_vec_cox']
    d_index_vec_cox = data['d_index_vec_cox']
    concordance_vec_cox = data['concordance_vec_cox']
    ibs_vec_cox = data['ibs_vec_cox']
    auc_vec_cox = data['auc_vec_cox']

    r2_vec_cml = data['r2_vec_cml']
    d_index_vec_cml = data['d_index_vec_cml']
    concordance_vec_cml = data['concordance_vec_cml']
    ibs_vec_cml = data['ibs_vec_cml']
    auc_vec_cml = data['auc_vec_cml']

    r2_p = robust_cv_test(r2_vec_cox, r2_vec_cml)
    print('R-squared(D) p-value: {:.3}'.format(r2_p))
    d_index_p = robust_cv_test(d_index_vec_cox, d_index_vec_cml)
    print('D-index p-value: {:.3}'.format(d_index_p))
    concordance_p = robust_cv_test(concordance_vec_cox, concordance_vec_cml)
    print('Concordance p-value: {:.3}'.format(concordance_p))
    ibs_p = robust_cv_test(ibs_vec_cox, ibs_vec_cml)
    print('IBS p-value: {:.3}'.format(ibs_p))
    auc_p = robust_cv_test(auc_vec_cox, auc_vec_cml)
    print('AUC p-value: {:.3}'.format(auc_p))

    r2_vec_cox = np.reshape(r2_vec_cox, -1)
    d_index_vec_cox = np.reshape(d_index_vec_cox, -1)
    concordance_vec_cox = np.reshape(concordance_vec_cox, -1)
    ibs_vec_cox = np.reshape(ibs_vec_cox, -1)
    auc_vec_cox = np.reshape(auc_vec_cox, -1)
    r2_vec_cml = np.reshape(r2_vec_cml, -1)
    d_index_vec_cml = np.reshape(d_index_vec_cml, -1)
    concordance_vec_cml = np.reshape(concordance_vec_cml, -1)
    ibs_vec_cml = np.reshape(ibs_vec_cml, -1)
    auc_vec_cml = np.reshape(auc_vec_cml, -1)

    r2_mean, (r2_lci, r2_uci) = r2_vec_cox.mean(), sms.DescrStatsW(
        r2_vec_cox).tconfint_mean()
    print('R-squared(D) Cox (95% CI): {:.3} ({:.3}, {:.3})'.format(
        r2_mean, r2_lci, r2_uci))
    d_index_mean, (d_index_lci,
                   d_index_uci) = d_index_vec_cox.mean(), sms.DescrStatsW(
                       d_index_vec_cox).tconfint_mean()
    print('D-index Cox (95% CI): {:.3} ({:.3}, {:.3})'.format(
        d_index_mean, d_index_lci, d_index_uci))
    concordance_mean, (concordance_lci,
                       concordance_uci) = concordance_vec_cox.mean(
                       ), sms.DescrStatsW(concordance_vec_cox).tconfint_mean()
    print('Concordance Cox (95% CI): {:.3} ({:.3}, {:.3})'.format(
        concordance_mean, concordance_lci, concordance_uci))
    ibs_mean, (ibs_lci, ibs_uci) = ibs_vec_cox.mean(), sms.DescrStatsW(
        ibs_vec_cox).tconfint_mean()
    print('IBS Cox (95% CI): {:.3} ({:.3}, {:.3})'.format(
        ibs_mean, ibs_lci, ibs_uci))
    auc_mean, (auc_lci, auc_uci) = auc_vec_cox.mean(), sms.DescrStatsW(
        auc_vec_cox).tconfint_mean()
    print('AUC Cox (95% CI): {:.3} ({:.3}, {:.3})'.format(
        auc_mean, auc_lci, auc_uci))

    r2_mean, (r2_lci, r2_uci) = r2_vec_cml.mean(), sms.DescrStatsW(
        r2_vec_cml).tconfint_mean()
    print('R-squared(D) CML (95% CI): {:.3} ({:.3}, {:.3})'.format(
        r2_mean, r2_lci, r2_uci))
    d_index_mean, (d_index_lci,
                   d_index_uci) = d_index_vec_cml.mean(), sms.DescrStatsW(
                       d_index_vec_cml).tconfint_mean()
    print('D-index CML (95% CI): {:.3} ({:.3}, {:.3})'.format(
        d_index_mean, d_index_lci, d_index_uci))
    concordance_mean, (concordance_lci,
                       concordance_uci) = concordance_vec_cml.mean(
                       ), sms.DescrStatsW(concordance_vec_cml).tconfint_mean()
    print('Concordance CML (95% CI): {:.3} ({:.3}, {:.3})'.format(
        concordance_mean, concordance_lci, concordance_uci))
    ibs_mean, (ibs_lci, ibs_uci) = ibs_vec_cml.mean(), sms.DescrStatsW(
        ibs_vec_cml).tconfint_mean()
    print('IBS CML (95% CI): {:.3} ({:.3}, {:.3})'.format(
        ibs_mean, ibs_lci, ibs_uci))
    auc_mean, (auc_lci, auc_uci) = auc_vec_cml.mean(), sms.DescrStatsW(
        auc_vec_cml).tconfint_mean()
    print('AUC CML (95% CI): {:.3} ({:.3}, {:.3})'.format(
        auc_mean, auc_lci, auc_uci))
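
Every metric above follows the same pattern: flatten the per-fold estimates, take the mean, and get a 95% confidence interval from statsmodels' DescrStatsW.tconfint_mean. A compact sketch of that pattern as a small loop over toy vectors (robust_cv_test is the repository's own significance test and is not reimplemented here):

import numpy as np
import statsmodels.stats.api as sms

rng = np.random.default_rng(0)
metrics = {
    'R-squared(D)': rng.normal(0.35, 0.01, size=10),   # toy per-fold values
    'Concordance':  rng.normal(0.80, 0.005, size=10),
}
for name, vec in metrics.items():
    mean, (lci, uci) = vec.mean(), sms.DescrStatsW(vec).tconfint_mean()
    print('{} (95% CI): {:.3} ({:.3}, {:.3})'.format(name, mean, lci, uci))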
def main():
    hp = Hyperparameters()
    np.random.seed(hp.np_seed)

    for gender in ['males', 'females']:
        print('Processing ' + gender + '...')

        print('Loading VARIANZ data...')
        df = feather.read_dataframe(hp.data_pp_dir + 'Py_VARIANZ_2012_v3-1_pp_' + gender + '.feather')

        print('Loading medications...')
        ph = feather.read_dataframe(hp.data_pp_dir + 'PH_pp_' + gender + '.feather')
        ph.rename(columns={'chem_id': 'CODE', 'dispmonth_index': 'MONTH'}, inplace=True)
        ph['TYPE'] = 0

        print('Loading hospital events...')
        he = feather.read_dataframe(hp.data_pp_dir + 'HE_pp_' + gender + '.feather')
        he.rename(columns={'CLIN_CD_10': 'CODE', 'dispmonth_index': 'MONTH'}, inplace=True)
        he['TYPE'] = 1
        
        print('-----------------------------------------')
        # numerical index for each person
        df.reset_index(drop=True, inplace=True)
        df_index_person = df['VSIMPLE_INDEX_MASTER'].reset_index().rename(columns={'index': 'INDEX_PERSON'})

        # convert categorical ethnicity into indicator variables
        print('Create dummy variables...')
        df = pd.get_dummies(df, prefix='en_prtsd_eth', columns=['en_prtsd_eth'], drop_first=True)
        
        print('-----------------------------------------')
        print('Concatenating codes...')
        ac = pd.concat([ph, he], ignore_index=True, sort=False)
        ac['DIAG_TYPE'] = ac['DIAG_TYPE'].fillna(0).astype(int)

        # medications and hospital events
        print('Get max number of codes per person...')
        ac['COUNT'] = ac.groupby(['VSIMPLE_INDEX_MASTER']).cumcount()
        max_count = ac['COUNT'].max()+1
        print('max_count {}'.format(max_count))

        # code index (add 1 to reserve 0 for padding)
        df_index_code = ac[['CODE', 'TYPE']].drop_duplicates().reset_index(drop=True)
        df_index_code['CODE'] = df_index_code['CODE'].astype(str)
        df_index_code['INDEX_CODE'] = df_index_code.index + 1
            
        # codes, times, diag_type arrays
        codes = np.zeros((len(df_index_person), max_count), dtype=np.int16) # uint16 not supported by torch
        month = np.zeros((len(df_index_person), max_count), dtype=np.uint8)
        diagt = np.zeros((len(df_index_person), max_count), dtype=np.uint8)

        print('Merging index_person...')
        ac = ac.merge(df_index_person, how='inner', on='VSIMPLE_INDEX_MASTER')
        print('Merging index_code...')
        ac['CODE'] = ac['CODE'].astype(str)
        ac = ac.merge(df_index_code,   how='inner', on=['CODE', 'TYPE'])
        print('Updating arrays...')
        codes[ac['INDEX_PERSON'].values, ac['COUNT'].values] = ac['INDEX_CODE'].values
        month[ac['INDEX_PERSON'].values, ac['COUNT'].values] = ac['MONTH'].values
        diagt[ac['INDEX_PERSON'].values, ac['COUNT'].values] = ac['DIAG_TYPE'].values
        
        print('-----------------------------------------')
        # data folds, stratified by event, for 5x2 cv
        print('Exclude validation data...') # done this way for historical reasons
        df_trn, df_tst = train_test_split(df, test_size=0.1, train_size=0.8, shuffle=True, stratify=df['EVENT'])
        df_tmp = pd.concat([df_trn, df_tst])

        print('Split data into folds...')
        for i in range(hp.num_folds):
            df_trn, df_tst = train_test_split(df_tmp, test_size=0.5, train_size=0.5, shuffle=True, stratify=df_tmp['EVENT'])
            df['FOLD_' + str(i)] = 99
            df.loc[df_trn.index, 'FOLD_' + str(i)] = 0
            df.loc[df_tst.index, 'FOLD_' + str(i)] = 1

        # Other arrays
        fold_cols = ['FOLD_' + str(i) for i in range(hp.num_folds)]
        time = df['TIME'].values
        event = df['EVENT'].values.astype(int)
        fold = df[fold_cols].values
        df.drop(['TIME', 'EVENT', 'VSIMPLE_INDEX_MASTER', 'gender_code'] + fold_cols, axis=1, inplace=True)
        x = df.values.astype('float32')

        print('-----------------------------------------')
        print('Save...')
        np.savez(hp.data_pp_dir + 'data_arrays_' + gender + '.npz', x=x, time=time, event=event, codes=codes, month=month, diagt=diagt, fold=fold)
        df_index_person.to_feather(hp.data_pp_dir + 'df_index_person_' + gender + '.feather')
        df_index_code.to_feather(hp.data_pp_dir + 'df_index_code_' + gender + '.feather')
        save_obj(list(df.columns), hp.data_pp_dir + 'cols_list.pkl')
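
The padded codes/month/diagt arrays above are built by giving every person a row index, giving each of their records a within-person position via cumcount, and scattering the values into a (person x max_count) matrix, with 0 reserved for padding. A toy sketch of that scatter (real column names, invented values, and the person index already merged in for brevity):

import numpy as np
import pandas as pd

# Toy long-format records: person row index and 1-based code index (0 = padding).
ac = pd.DataFrame({'INDEX_PERSON': [0, 0, 0, 1, 2, 2],
                   'INDEX_CODE':   [5, 3, 5, 2, 7, 1]})
ac['COUNT'] = ac.groupby('INDEX_PERSON').cumcount()   # position within each person
max_count = ac['COUNT'].max() + 1

codes = np.zeros((3, max_count), dtype=np.int16)
codes[ac['INDEX_PERSON'].values, ac['COUNT'].values] = ac['INDEX_CODE'].values
print(codes)
# [[5 3 5]
#  [2 0 0]
#  [7 1 0]]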
    def __init__(self, steps, gym, network_descriptions, curriculum_designer, hyperparameters=Hyperparameters()):
        ray.init(log_to_driver=hyperparameters.log_to_driver)
        # min num remotes should be orders of magnitude smaller than min_num_runs_generated
        self.num_remotes = 32
        self.min_num_runs_generated  = 100
        self.gam = 0.999
        self.lam = 0.97
        self.finish_runs_time = 1.
        self.hyperparameters = hyperparameters

        #number of iterations of first gathering samples, then optimizing on them to run here
        self.steps = steps
        self.gym = gym
        self.network_descriptions = network_descriptions
        self.curriculum_designer = curriculum_designer
        self.actor_weights = []
        #get actor and critic weights
        a_w, c_w = ray.get(get_initial_weights.remote(self.network_descriptions))
        self.actor_weights.append(a_w)
        self.critic_weights = c_w
        # track which training iteration this is in
        self.iteration = 0
        #build logging object for this!
        self.logger = [dict()]