Example #1
def stat_report():

    stat_sums = stat_summary()

    # One row per metric (TPR, TNR, PPV, NPV, F1, ACC, five entries each);
    # one column for the mean and one for each 95% CI bound.
    stats = pd.DataFrame(
        stat_sums,
        index=[
            'TPR_1', 'TPR_2', 'TPR_3', 'TPR_4', 'TPR_5',
            'TNR_1', 'TNR_2', 'TNR_3', 'TNR_4', 'TNR_5',
            'PPV_1', 'PPV_2', 'PPV_3', 'PPV_4', 'PPV_5',
            'NPV_1', 'NPV_2', 'NPV_3', 'NPV_4', 'NPV_5',
            'F1_1', 'F1_2', 'F1_3', 'F1_4', 'F1_5',
            'ACC_1', 'ACC_2', 'ACC_3', 'ACC_4', 'ACC_5',
        ],
        columns=['mean', '95% CI -', '95% CI +'],
    )

    # Filename encodes the model, iteration, epoch count, and a UTC timestamp.
    filename = '_'.join([
        str(DNN_model),
        str(iteration),
        str(epochs),
        strftime("%d-%b-%Y-%H-%M-%S", gmtime()),
    ]) + '.csv'

    stats.to_csv(os.path.join(result_dir, filename))

    return stats
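
# A minimal standalone sketch of the timestamped-filename pattern above; the
# model name, iteration and epoch count are hypothetical stand-ins.
import os
from time import gmtime, strftime

import pandas as pd

demo_stats = pd.DataFrame({'mean': [0.9], '95% CI -': [0.88], '95% CI +': [0.92]})
stamp = strftime("%d-%b-%Y-%H-%M-%S", gmtime())
demo_stats.to_csv(os.path.join('.', 'demo-model_1_100_' + stamp + '.csv'))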
Example #2
def stats_to_csv():
    # Experiment matrix: configuration name -> list of scenario labels.
    exp = {
        'uni': ['l09', 'l15', 'l2', 'l5'],
        'bin_old': ['l14', 'l15', 'l2', 'l5'],
        'bin': ['l15', 'l2', 'l5']
    }

    for m, labels in exp.items():
        for l in labels:
            data = scalar_parse(m, l)
            stats = scalar_stats(data)
            stats.to_csv('stats_' + m + '_' + l + '.csv')
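
# The same filenames via an f-string (equivalent output, purely stylistic):
#     stats.to_csv(f'stats_{m}_{l}.csv')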
Example #3
def plot_week_data_with_stats(sample_md,
                              metric,
                              time_column,
                              hue=None,
                              alphas=alphas,
                              reference_time=1,
                              output_figure_filepath=None,
                              output_table_filepath=None):
    fig = plot_week_data(sample_md, metric, time_column, label_axes=True)
    stats = tabulate_week_to_reference_week_paired_stats(
        sample_md, metric, reference_time, time_column)
    ymax = fig.axes[0].get_ylim()[1]
    stats = stats.sort_index()  # sort_index returns a copy; assign it back
    for i, w in enumerate(stats.index):
        t, q = stats.loc[w, 'test-statistic'], stats.loc[w, 'q-value']
        sig_text = get_sig_text(q, alphas)
        fig.axes[0].text(i, 1.02 * ymax, sig_text, ha='center', va='center')
    if output_table_filepath is not None:
        stats.to_csv(output_table_filepath)
    if output_figure_filepath is not None:
        fig.savefig(output_figure_filepath, dpi=300)
    else:
        return fig
Example #4
def write_stats(stats, filepath_prefix, overall):
    stats.to_csv(filepath_prefix + '.csv', index_label='Copy #',
                 header=['TPR', 'FNR', 'PPV', 'FDR', 'F1'])
    # Append an overall summary row; the context manager closes the file handle.
    with open(filepath_prefix + '.csv', 'a', newline='') as f:
        csv.writer(f).writerow(['Overall', overall, 1 - overall, '', '', ''])
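
# A pandas-only sketch producing the same file in a single to_csv call,
# assuming `stats` has five columns in TPR/FNR/PPV/FDR/F1 order:
def write_stats_pandas(stats, filepath_prefix, overall):
    out = stats.copy()
    out.columns = ['TPR', 'FNR', 'PPV', 'FDR', 'F1']
    out.loc['Overall'] = [overall, 1 - overall, '', '', '']
    out.to_csv(filepath_prefix + '.csv', index_label='Copy #')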
Example #5
# K (the kinship matrix) is loaded earlier; this excerpt begins mid-script.
snps = np.load('SNPs_0.1.npy')
# If x were given as just the SNPs, they would be treated as fixed effects.
os.chdir(
    "/srv/uom-data1-q.unimelb.edu.au/6300-afournier/home/student.unimelb.edu.au/andhikap/Clim_GWAS/New_Microclim"
)
env = pd.read_csv('Microclimate_minmaxDaily_threshold_0_0.csv', sep=',')
logical = env.columns.str.startswith(('PAR', 'TT', 'PTT', 'daylength'))
env = env.iloc[:, ~logical]  # keep columns not starting with PAR/TT/PTT/daylength
environment = np.array(env)
assert K.shape[0] == K.shape[1], 'K MATRIX IS NOT SQUARE'
assert K.shape[0] == env.shape[0], \
    'NO. OF INDIVIDUALS DOES NOT MATCH K MATRIX DIMENSIONS'
os.chdir(
    "/srv/uom-data1-q.unimelb.edu.au/6300-afournier/home/student.unimelb.edu.au/andhikap/Clim_GWAS/Clim_GWAS_2"
)
y = np.load("DTB_NEW.npy")

iscan_model = iscan(snps, y, K=K, M=env)

#𝐲 ~ 𝓝(𝙼𝜶, 4777.187⋅𝙺 + 394.918⋅𝙸) --prop explained by K: 0.9236446282509733

# Test every combination of SNP and covariate (dim = 86760 * 406).
h1_effsizes = iscan_model.effsizes['h1']
h1_effsizes.to_csv('h1_effsizes_minmaxEMMAXIBS.csv')
h2_effsizes = iscan_model.effsizes['h2']
h2_effsizes.to_csv('h2_effsizes_minmaxEMMAXIBS.csv')
stats = iscan_model.stats
stats.to_csv('stats_minmaxEMMAXIBS.csv')
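
# Note: the squareness assert earlier does not establish symmetry; an explicit
# symmetry check would be (sketch, assuming K is a dense NumPy array):
assert np.allclose(K, K.T), 'K MATRIX IS NOT SYMMETRICAL'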
Example #6
def fit_deconfounder(
        data_dir,
        save_dir,
        factor_model,
        learning_rate,
        max_steps,
        latent_dim,
        layer_dim,
        batch_size,
        num_samples,  # number of samples from variational distribution
        holdout_portion,
        print_steps,
        tolerance,
        num_confounder_samples,  # number of samples of the substitute confounder from the posterior
        CV,
        outcome_type):

    param_save_dir = os.path.join(
        save_dir,
        "{}_lr{}_maxsteps{}_latentdim{}_layerdim{}_batchsize{}_numsamples{}_holdoutp{}_tolerance{}_numconfsamples{}_CV{}_outType{}/"
        .format(factor_model, learning_rate, max_steps, latent_dim, layer_dim,
                batch_size, num_samples, holdout_portion, tolerance,
                num_confounder_samples, CV, outcome_type))

    if os.path.exists(param_save_dir):
        print("Deleting old log directory at {}".format(param_save_dir))
        shutil.rmtree(param_save_dir)
    if not os.path.exists(param_save_dir):
        os.makedirs(param_save_dir)

    kwargs = ({
        'num_workers': 1,
        'pin_memory': True
    } if torch.cuda.is_available() else {})

    df = pd.read_csv(os.path.join(data_dir, "drug_exposure_sparse_matrix.csv"),
                     index_col=0)
    data = df.values
    dataset = CustomDataset(data, holdout_portion)
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              **kwargs)
    iterator = iter(data_loader)

    num_datapoints, data_dim = dataset.counts.shape

    summary_writer = SummaryWriter(save_dir)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if factor_model == 'PPCA':
        stddv_datapoints = 0.5
        model = PPCA(device, num_datapoints, data_dim, latent_dim,
                     stddv_datapoints, num_samples, print_steps,
                     summary_writer).to(device)
    if factor_model == 'PMF':
        model = PMF(device, num_datapoints, data_dim, latent_dim, num_samples,
                    print_steps, summary_writer).to(device)
    if factor_model == 'DEF':
        model = DEF(device, num_datapoints, data_dim, layer_dim, num_samples,
                    print_steps, summary_writer).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    start_time = time.time()
    prev_loss = 1e8
    tol = 0  # tolerance counter
    for step in range(max_steps):
        try:
            datapoints_indices, x_train, holdout_mask = next(iterator)
        except StopIteration:
            # Restart the epoch once the loader is exhausted.
            iterator = iter(data_loader)
            datapoints_indices, x_train, holdout_mask = next(iterator)

        datapoints_indices = datapoints_indices.to(device)
        x_train = x_train.to(device)
        optimizer.zero_grad()
        elbo = model(datapoints_indices, x_train, holdout_mask, step)
        loss = -elbo
        loss.backward()
        optimizer.step()

        if step == 0 or step % print_steps == print_steps - 1:
            duration = (time.time() - start_time) / (step + 1)
            print("Step: {:>3d} ELBO: {:.3f} ({:.3f} sec)".format(
                step + 1, -loss, duration))
            summary_writer.add_scalar("loss", loss, step)

            if loss < prev_loss:
                tol = 0
                prev_loss = loss
            else:
                tol += 1
                prev_loss = loss

            if step == max_steps - 1 or tol >= tolerance:
                model.predictive_check(dataset.vad, dataset.holdout_mask,
                                       dataset.holdout_subjects, 100, 100)

                if factor_model == "PPCA":
                    np.savetxt(os.path.join(param_save_dir, "qw_loc"),
                               model.qw_distribution.location.cpu().detach())
                    np.savetxt(os.path.join(param_save_dir, "qw_scale"),
                               model.qw_distribution.scale().cpu().detach())
                    np.savetxt(os.path.join(param_save_dir, "qz_loc"),
                               model.qz_distribution.location.cpu().detach())
                    np.savetxt(os.path.join(param_save_dir, "qz_scale"),
                               model.qz_distribution.scale().cpu().detach())

                if factor_model == "PMF":
                    np.savetxt(os.path.join(param_save_dir, "qv_loc"),
                               model.qv_distribution.location.cpu().detach())
                    np.savetxt(os.path.join(param_save_dir, "qv_scale"),
                               model.qv_distribution.scale().cpu().detach())
                    np.savetxt(os.path.join(param_save_dir, "qu_loc"),
                               model.qu_distribution.location.cpu().detach())
                    np.savetxt(os.path.join(param_save_dir, "qu_scale"),
                               model.qu_distribution.scale().cpu().detach())

                if factor_model == "DEF":
                    np.savetxt(os.path.join(param_save_dir, "qw1_loc"),
                               model.qw1_distribution.location.cpu().detach())
                    np.savetxt(os.path.join(param_save_dir, "qw1_scale"),
                               model.qw1_distribution.scale().cpu().detach())
                    np.savetxt(os.path.join(param_save_dir, "qw0_loc"),
                               model.qw0_distribution.location.cpu().detach())
                    np.savetxt(os.path.join(param_save_dir, "qw0_scale"),
                               model.qw0_distribution.scale().cpu().detach())
                    np.savetxt(os.path.join(param_save_dir, "qz2_loc"),
                               model.qz2_distribution.location.cpu().detach())
                    np.savetxt(os.path.join(param_save_dir, "qz2_scale"),
                               model.qz2_distribution.scale().cpu().detach())
                    np.savetxt(os.path.join(param_save_dir, "qz1_loc"),
                               model.qz1_distribution.location.cpu().detach())
                    np.savetxt(os.path.join(param_save_dir, "qz1_scale"),
                               model.qz1_distribution.scale().cpu().detach())

                if tol >= tolerance:
                    print("Loss increased for {} consecutive checks; "
                          "stopping training.".format(tol))
                    break

                if step == max_steps - 1:
                    print("Maximum number of steps reached; stopping training.")

    # fit outcome model
    covariates_df = pd.read_csv(os.path.join(data_dir,
                                             "pre_treatment_lab.csv"))
    covariates = covariates_df['value_as_number'].values
    outcome_df = pd.read_csv(os.path.join(data_dir, "post_treatment_lab.csv"))
    y = outcome_df['value_as_number'].values - covariates

    # Unadjusted model
    coefficients = np.zeros((1, data_dim))
    outcome_model = fit_outcome_model(dataset.counts,
                                      y,
                                      data_dim,
                                      outcome_type,
                                      CV=CV,
                                      verbose=True)
    if outcome_type == 'linear':
        coefficients[0, :] = outcome_model.coef_[:data_dim]
    if outcome_type == 'binary':
        coefficients[0, :] = outcome_model.coef_[0][:data_dim]

    coefficients = pd.DataFrame(coefficients, columns=df.columns)
    coefficients.to_csv(os.path.join(param_save_dir, "coefficients.csv"))

    # Deconfounder (adjusting for substitute confounder)
    treatment_effects = np.zeros((num_confounder_samples, data_dim))
    for sample in range(num_confounder_samples):
        if factor_model == "PPCA":
            substitute_confounder = np.transpose(
                np.squeeze(model.qz_distribution.sample(1).detach().numpy()))
        if factor_model == "PMF":
            substitute_confounder = np.transpose(
                np.squeeze(model.qu_distribution.sample(1).detach().numpy()))
        if factor_model == "DEF":
            substitute_confounder = np.squeeze(
                model.qz1_distribution.sample(1).detach().numpy())
        X = np.column_stack([dataset.counts, substitute_confounder])
        outcome_model = fit_outcome_model(X,
                                          y,
                                          data_dim,
                                          outcome_type,
                                          CV=CV,
                                          verbose=True)
        if outcome_type == 'linear':
            treatment_effects[sample, :] = outcome_model.coef_[:data_dim]
        if outcome_type == 'binary':
            treatment_effects[sample, :] = outcome_model.coef_[0][:data_dim]
    treatment_effects = pd.DataFrame(treatment_effects, columns=df.columns)
    treatment_effects.to_csv(
        os.path.join(param_save_dir, "treatment_effects.csv"))

    mean = treatment_effects.mean(axis=0).values
    sem = treatment_effects.sem(axis=0).values
    stats = pd.DataFrame({
        "drug_name": treatment_effects.columns.values,
        "mean": mean,
        "stderr": sem,
        "ci95_lower": mean - 1.96 * sem,
        "ci95_upper": mean + 1.96 * sem,
    })
    stats.to_csv(os.path.join(param_save_dir, "treatment_effects_stats.csv"),
                 index=False)
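
# A standalone sketch of the interval arithmetic above: a normal-approximation
# 95% confidence interval from sampled effects (pandas only).
def ci95_summary(samples: pd.DataFrame) -> pd.DataFrame:
    mean = samples.mean(axis=0)
    sem = samples.sem(axis=0)  # standard error of the mean
    return pd.DataFrame({
        "mean": mean,
        "stderr": sem,
        "ci95_lower": mean - 1.96 * sem,
        "ci95_upper": mean + 1.96 * sem,
    })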


# For testing in python
# outputFolder = "C:/Users/lz2629/git/zhangly811/MvDeconfounder/res"
# dataFolder = "C:/Users/lz2629/git/zhangly811/MvDeconfounder/dat"
# factorModel = 'DEF'
# fit_deconfounder(data_dir=dataFolder,
#                  save_dir=outputFolder,
#                  factor_model=factorModel,
#                  learning_rate=0.0001,
#                  max_steps=100000,
#                  latent_dim=1,
#                  layer_dim=[30, 5],
#                  batch_size=1024,
#                  num_samples=1, # number of samples from variational distribution
#                  holdout_portion=0.5,
#                  print_steps=50,
#                  tolerance=3,
#                  num_confounder_samples=30, # number of samples of substitute confounder from the posterior
#                  CV=5,
#                  outcome_type='linear')

# def debugfunc(data_dir,
#              save_dir,
#              factor_model,
#              learning_rate,
#              max_steps,
#              latent_dim,
#              layer_dim,
#              batch_size,
#              num_samples, # number of samples from variational distribution
#              holdout_portion,
#              print_steps,
#              tolerance,
#              num_confounder_samples, #number of samples of substitute confounder from the posterior
#              CV,
#              outcome_type):
#     param_save_dir = os.path.join(save_dir, "{}_lr{}_maxsteps{}_latentdim{}_layerdim{}_batchsize{}_numsamples{}_holdoutp{}_tolerance{}_numconfsamples{}_CV{}_outType{}/".format(
#         factor_model, learning_rate, max_steps, latent_dim, layer_dim, batch_size, num_samples, holdout_portion, tolerance, num_confounder_samples, CV, outcome_type
#     ))
#     df = pd.read_csv(os.path.join(data_dir, "drug_exposure_sparse_matrix.csv"), index_col=0)
#
#     covariates_df = pd.read_csv(os.path.join(data_dir, "pre_treatment_lab.csv"))
#     data_dim=df.shape[1]
#     covariates = covariates_df['value_as_number'].values
#     y_df = pd.read_csv(os.path.join(data_dir, "post_treatment_lab.csv"))
#     y = y_df['value_as_number'].values - covariates
#
#     treatment_effects = np.zeros((num_confounder_samples, data_dim))
#     for sample in range(num_confounder_samples):
#         X = df.values
#         outcome_model = fit_outcome_model(X, y, data_dim, outcome_type, CV=CV, verbose=True)
#         if outcome_type == 'linear':
#             treatment_effects[sample, :] = outcome_model.coef_[:data_dim]
#         if outcome_type == 'binary':
#             treatment_effects[sample, :] = outcome_model.coef_[0][:data_dim]
#     treatment_effects = pd.DataFrame(treatment_effects, columns=df.columns)
#     treatment_effects.to_csv(os.path.join(param_save_dir, "treatment_effects.csv"))
#
#     stats = pd.DataFrame({
#         "drug_name": treatment_effects.columns.values,
#         "mean": treatment_effects.mean(axis=0).values,
#         "stderr":treatment_effects.sem(axis=0).values,
#         "ci95_lower": treatment_effects.mean(axis=0).values - 1.96*treatment_effects.sem(axis=0).values,
#         "ci95_upper": treatment_effects.mean(axis=0).values + 1.96*treatment_effects.sem(axis=0).values,
#     })
#     stats.to_csv(os.path.join(param_save_dir, "treatment_effects_stats_test.csv"), index=False)
#
# debugfunc(data_dir=dataFolder,
#           save_dir=outputFolder,
#           factor_model=factorModel,
#           learning_rate=0.0001,
#           max_steps=100000,
#           latent_dim=1,
#           layer_dim=[30, 5],
#           batch_size=1024,
#           num_samples=1, # number of samples from variational distribution
#           holdout_portion=0.5,
#           print_steps=50,
#           tolerance=3,
#           num_confounder_samples=30, # number of samples of substitute confounder from the posterior
#           CV=5,
#           outcome_type='linear')
Example #7
def plot_ecdf_single(iteration=0, sample_size=1000, replace=False):
    # One input CSV and one set of output files per inter-arrival scenario.
    in_path = "csv/analisiScenario/{}/Nonmonitoring-lognormal.csv"
    out_dir = "./analysis/analisiScenario/queueLength/non-monitoring-lognormal/"
    labels = ["20ms", "35ms", "50ms", "10ms"]
    dfs = {label: scalar_df_parse(in_path.format(label)) for label in labels}

    for label in labels:
        sample = dfs[label][dfs[label].name == "queueLength"]

        # Empirical CDF: sort the sample and assign probability i/n to the
        # i-th smallest observation.
        x = np.sort(sample['value'].dropna())
        n = x.size
        y = np.arange(1, n + 1) / n

        plt.scatter(x=x, y=y, label=label)
        plt.legend(loc='best')
        plt.grid(True)
        plt.xlabel('x')
        plt.ylabel('F(x)')
        plt.title("ECDF for " + label)
        plt.savefig(out_dir + label + '.png')
        plt.show()

    for label in labels:
        stats = data_analysis(dfs[label], "queueLength")
        stats.to_csv(out_dir + 'stats' + label + '.csv', index=False)
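
# The ECDF construction above, factored into a reusable helper (sketch):
def ecdf(values):
    """Return the sorted sample x and cumulative probabilities y = i/n."""
    x = np.sort(np.asarray(values))
    y = np.arange(1, x.size + 1) / x.size
    return x, y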


def build_stats_df(rawData: pd.DataFrame) -> pd.DataFrame:
    # ... (earlier body truncated in the source excerpt) ...
    stData['weekday'] = stData['weekday'].apply(recalculate_weekday)
    return stData


def get_stats(rawData: pd.DataFrame, group='ALL'):
    stData = build_stats_df(rawData)
    Explorer = ExploratoryAnalysis(stData)
    stats = Explorer.all_column_stats(group)
    corr = Explorer.correlation_measures(group)
    return [stats, corr]


dataFile = '/home/gauss/arm/importante/work/ai/projects/revolico/clean_data/ads_dump1.csv'

df = pd.read_csv(dataFile)  # [0:1000]
df['classification'] = df['classification'].apply(get_first_classification)

# stats, corr = get_stats(df)  # Explorer.all_column_stats()

groups = df.groupby('classification')
groups = [(name, build_stats_df(group)) for name, group in groups]
statsDF = build_stats_df(df)
groups.append(("ALL", statsDF))
exp = ExploratoryAnalysis(groups)
# exp.set_groups(groups)


stats, corr = exp.groups_column_stats()
stats.to_csv('ExploratoryStats.csv')
corr.to_csv('Correlation.csv')
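
# For comparison, a minimal pandas-only sketch of per-group summaries without
# the custom ExploratoryAnalysis class (output file names here are
# hypothetical; correlation restricted to numeric columns):
summary = df.groupby('classification').describe()
summary.to_csv('ExploratoryStatsBuiltin.csv')
df.select_dtypes('number').corr().to_csv('CorrelationBuiltin.csv')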