def stat_report():
    stat_sums = stat_summary()
    stats = pd.DataFrame(
        stat_sums,
        index=['TPR_1', 'TPR_2', 'TPR_3', 'TPR_4', 'TPR_5',
               'TNR_1', 'TNR_2', 'TNR_3', 'TNR_4', 'TNR_5',
               'PPV_1', 'PPV_2', 'PPV_3', 'PPV_4', 'PPV_5',
               'NPV_1', 'NPV_2', 'NPV_3', 'NPV_4', 'NPV_5',
               'F1_1', 'F1_2', 'F1_3', 'F1_4', 'F1_5',
               'ACC_1', 'ACC_2', 'ACC_3', 'ACC_4', 'ACC_5'],
        columns=['mean', '95% CI -', '95% CI +'])
    filename = str(DNN_model) + '_' + \
        str(iteration) + '_' + \
        str(epochs) + '_' + \
        strftime("%d-%b-%Y-%H-%M-%S", gmtime()) + \
        '.csv'
    stats.to_csv(os.path.join(result_dir, filename))
    return stats
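# Illustration (not from the original source) of the timestamped filename that
# stat_report() produces. 'cnn', 3 and 50 stand in for the module-level
# DNN_model, iteration and epochs values, which are assumptions here.
from time import gmtime, strftime

example_name = 'cnn' + '_' + str(3) + '_' + str(50) + '_' + \
    strftime("%d-%b-%Y-%H-%M-%S", gmtime()) + '.csv'
# -> e.g. 'cnn_3_50_27-Mar-2024-10-15-42.csv'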
def stats_to_csv():
    exp = {
        'uni': ['l09', 'l15', 'l2', 'l5'],
        'bin_old': ['l14', 'l15', 'l2', 'l5'],
        'bin': ['l15', 'l2', 'l5']
    }
    for m in exp.keys():
        for l in exp[m]:
            data = scalar_parse(m, l)
            stats = scalar_stats(data)
            stats.to_csv('stats_' + m + '_' + l + '.csv')
    return
def plot_week_data_with_stats(sample_md, metric, time_column, hue=None,
                              alphas=alphas, reference_time=1,
                              output_figure_filepath=None,
                              output_table_filepath=None):
    fig = plot_week_data(sample_md, metric, time_column, label_axes=True)
    stats = tabulate_week_to_reference_week_paired_stats(
        sample_md, metric, reference_time, time_column)
    ymax = fig.axes[0].get_ylim()[1]
    # sort_index() returns a new frame; assign it so the annotations follow
    # the sorted week order.
    stats = stats.sort_index()
    for i, w in enumerate(stats.index):
        t, q = stats['test-statistic'][w], stats['q-value'][w]
        sig_text = get_sig_text(q, alphas)
        fig.axes[0].text(i, 1.02 * ymax, sig_text, ha='center', va='center')
    if output_table_filepath is not None:
        stats.to_csv(output_table_filepath)
    if output_figure_filepath is not None:
        fig.savefig(output_figure_filepath, dpi=300)
    else:
        return fig
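# The original get_sig_text helper is not shown here; the function below is a
# minimal sketch of what such a q-value-to-annotation mapping might look like,
# assuming `alphas` is a collection of significance thresholds
# (e.g. [0.05, 0.01, 0.001]): one asterisk per threshold the q-value falls
# below, 'ns' otherwise. The naming and thresholds are illustrative
# assumptions, not the source's implementation.
def get_sig_text_sketch(q, alphas):
    n_passed = sum(1 for alpha in alphas if q < alpha)
    return '*' * n_passed if n_passed else 'ns'

# get_sig_text_sketch(0.004, [0.05, 0.01, 0.001])  ->  '**'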
def write_stats(stats, filepath_prefix, overall):
    stats.to_csv(filepath_prefix + '.csv', index_label='Copy #',
                 header=['TPR', 'FNR', 'PPV', 'FDR', 'F1'])
    # Append an overall summary row; a context manager makes sure the file
    # opened for appending is closed.
    with open(filepath_prefix + '.csv', 'a', newline='') as f:
        csv.writer(f).writerow(['Overall', overall, 1 - overall, '', '', ''])
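# Usage sketch (illustrative, not from the original source): the metric values
# and the '/tmp/demo_stats' prefix below are made up. The per-copy table is
# written first, then csv.writer appends the 'Overall' summary row.
import csv
import pandas as pd

demo = pd.DataFrame([[0.9, 0.1, 0.8, 0.2, 0.85],
                     [0.7, 0.3, 0.6, 0.4, 0.65]])
write_stats(demo, '/tmp/demo_stats', overall=0.8)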
# K (the kinship matrix) is assumed to have been loaded earlier (not shown here).
snps = np.load('SNPs_0.1.npy')  # If x were given as just snps then they would be fixed effects, right?
os.chdir(
    "/srv/uom-data1-q.unimelb.edu.au/6300-afournier/home/student.unimelb.edu.au/andhikap/Clim_GWAS/New_Microclim"
)
env = pd.read_csv('Microclimate_minmaxDaily_threshold_0_0.csv', sep=',')
logical = env.columns.str.startswith(('PAR', 'TT', 'PTT', 'daylength'))
env = env.iloc[:, ~logical]  # drop columns starting with PAR/TT/PTT/daylength
environment = np.array(env)
assert K.shape[0] == K.shape[1], 'K MATRIX IS NOT SQUARE'
assert K.shape[0] == env.shape[0], 'NO. OF INDIVIDUALS DOES NOT MATCH K MATRIX DIMENSIONS'
os.chdir(
    "/srv/uom-data1-q.unimelb.edu.au/6300-afournier/home/student.unimelb.edu.au/andhikap/Clim_GWAS/Clim_GWAS_2"
)
y = np.load("DTB_NEW.npy")
# 𝐲 ~ 𝓝(𝙼𝜶, 4777.187⋅𝙺 + 394.918⋅𝙸) -- proportion explained by K: 0.9236446282509733
iscan_model = iscan(snps, y, K=K, M=env)
# h1 effect sizes: every combination of SNP and covariate is tested (dim = 86760 * 406)
h1_effsizes = iscan_model.effsizes['h1']
h1_effsizes.to_csv('h1_effsizes_minmaxEMMAXIBS.csv')
h2_effsizes = iscan_model.effsizes['h2']
h2_effsizes.to_csv('h2_effsizes_minmaxEMMAXIBS.csv')
stats = iscan_model.stats
stats.to_csv('stats_minmaxEMMAXIBS.csv')
def fit_deconfounder(
        data_dir,
        save_dir,
        factor_model,
        learning_rate,
        max_steps,
        latent_dim,
        layer_dim,
        batch_size,
        num_samples,  # number of samples from variational distribution
        holdout_portion,
        print_steps,
        tolerance,
        num_confounder_samples,  # number of samples of substitute confounder from the posterior
        CV,
        outcome_type):
    param_save_dir = os.path.join(
        save_dir,
        "{}_lr{}_maxsteps{}_latentdim{}_layerdim{}_batchsize{}_numsamples{}_holdoutp{}_tolerance{}_numconfsamples{}_CV{}_outType{}/"
        .format(factor_model, learning_rate, max_steps, latent_dim, layer_dim,
                batch_size, num_samples, holdout_portion, tolerance,
                num_confounder_samples, CV, outcome_type))
    if os.path.exists(param_save_dir):
        print("Deleting old log directory at {}".format(param_save_dir))
        shutil.rmtree(param_save_dir)
    if not os.path.exists(param_save_dir):
        os.makedirs(param_save_dir)

    kwargs = ({'num_workers': 1, 'pin_memory': True}
              if torch.cuda.is_available() else {})
    df = pd.read_csv(os.path.join(data_dir, "drug_exposure_sparse_matrix.csv"),
                     index_col=0)
    data = df.values
    dataset = CustomDataset(data, holdout_portion)
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              **kwargs)
    iterator = iter(data_loader)
    num_datapoints, data_dim = dataset.counts.shape

    summary_writer = SummaryWriter(save_dir)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if factor_model == 'PPCA':
        stddv_datapoints = 0.5
        model = PPCA(device, num_datapoints, data_dim, latent_dim,
                     stddv_datapoints, num_samples, print_steps,
                     summary_writer).to(device)
    if factor_model == 'PMF':
        model = PMF(device, num_datapoints, data_dim, latent_dim, num_samples,
                    print_steps, summary_writer).to(device)
    if factor_model == 'DEF':
        model = DEF(device, num_datapoints, data_dim, layer_dim, num_samples,
                    print_steps, summary_writer).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    start_time = time.time()
    prev_loss = 1e8
    tol = 0  # tolerance counter
    for step in range(max_steps):
        try:
            # next(iterator) rather than iterator.next(): the .next() method is
            # not available on DataLoader iterators in recent PyTorch versions.
            datapoints_indices, x_train, holdout_mask = next(iterator)
        except StopIteration:
            iterator = iter(data_loader)
            datapoints_indices, x_train, holdout_mask = next(iterator)
        datapoints_indices = datapoints_indices.to(device)
        x_train = x_train.to(device)
        optimizer.zero_grad()
        elbo = model(datapoints_indices, x_train, holdout_mask, step)
        loss = -elbo
        loss.backward()
        optimizer.step()

        if step == 0 or step % print_steps == print_steps - 1:
            duration = (time.time() - start_time) / (step + 1)
            print("Step: {:>3d} ELBO: {:.3f} ({:.3f} sec)".format(
                step + 1, -loss, duration))
            summary_writer.add_scalar("loss", loss, step)
            if loss < prev_loss:
                tol = 0
                prev_loss = loss
            else:
                tol += 1
                prev_loss = loss

        if step == max_steps - 1 or tol >= tolerance:
            model.predictive_check(dataset.vad, dataset.holdout_mask,
                                   dataset.holdout_subjects, 100, 100)
            if factor_model == "PPCA":
                np.savetxt(os.path.join(param_save_dir, "qw_loc"),
                           model.qw_distribution.location.cpu().detach())
                np.savetxt(os.path.join(param_save_dir, "qw_scale"),
                           model.qw_distribution.scale().cpu().detach())
                np.savetxt(os.path.join(param_save_dir, "qz_loc"),
                           model.qz_distribution.location.cpu().detach())
                np.savetxt(os.path.join(param_save_dir, "qz_scale"),
                           model.qz_distribution.scale().cpu().detach())
            if factor_model == "PMF":
                np.savetxt(os.path.join(param_save_dir, "qv_loc"),
                           model.qv_distribution.location.cpu().detach())
                np.savetxt(os.path.join(param_save_dir, "qv_scale"),
                           model.qv_distribution.scale().cpu().detach())
                np.savetxt(os.path.join(param_save_dir, "qu_loc"),
                           model.qu_distribution.location.cpu().detach())
                np.savetxt(os.path.join(param_save_dir, "qu_scale"),
                           model.qu_distribution.scale().cpu().detach())
            if factor_model == "DEF":
                np.savetxt(os.path.join(param_save_dir, "qw1_loc"),
                           model.qw1_distribution.location.cpu().detach())
                np.savetxt(os.path.join(param_save_dir, "qw1_scale"),
                           model.qw1_distribution.scale().cpu().detach())
                np.savetxt(os.path.join(param_save_dir, "qw0_loc"),
                           model.qw0_distribution.location.cpu().detach())
                np.savetxt(os.path.join(param_save_dir, "qw0_scale"),
                           model.qw0_distribution.scale().cpu().detach())
                np.savetxt(os.path.join(param_save_dir, "qz2_loc"),
                           model.qz2_distribution.location.cpu().detach())
                np.savetxt(os.path.join(param_save_dir, "qz2_scale"),
                           model.qz2_distribution.scale().cpu().detach())
                np.savetxt(os.path.join(param_save_dir, "qz1_loc"),
                           model.qz1_distribution.location.cpu().detach())
                np.savetxt(os.path.join(param_save_dir, "qz1_scale"),
                           model.qz1_distribution.scale().cpu().detach())
            if tol >= tolerance:
                print("Loss goes up for {} consecutive prints. Stop training."
                      .format(tol))
                break
            if step == max_steps - 1:
                print("Maximum step reached. Stop training.")

    # fit outcome model
    covariates_df = pd.read_csv(os.path.join(data_dir, "pre_treatment_lab.csv"))
    covariates = covariates_df['value_as_number'].values
    outcome_df = pd.read_csv(os.path.join(data_dir, "post_treatment_lab.csv"))
    y = outcome_df['value_as_number'].values - covariates

    # Unadjusted model
    coefficients = np.zeros((1, data_dim))
    outcome_model = fit_outcome_model(dataset.counts, y, data_dim,
                                      outcome_type, CV=CV, verbose=True)
    if outcome_type == 'linear':
        coefficients[0, :] = outcome_model.coef_[:data_dim]
    if outcome_type == 'binary':
        coefficients[0, :] = outcome_model.coef_[0][:data_dim]
    coefficients = pd.DataFrame(coefficients, columns=df.columns)
    coefficients.to_csv(os.path.join(param_save_dir, "coefficients.csv"))

    # Deconfounder (adjusting for substitute confounder)
    treatment_effects = np.zeros((num_confounder_samples, data_dim))
    for sample in range(num_confounder_samples):
        if factor_model == "PPCA":
            substitute_confounder = np.transpose(
                np.squeeze(model.qz_distribution.sample(1).detach().numpy()))
        if factor_model == "PMF":
            substitute_confounder = np.transpose(
                np.squeeze(model.qu_distribution.sample(1).detach().numpy()))
        if factor_model == "DEF":
            substitute_confounder = np.squeeze(
                model.qz1_distribution.sample(1).detach().numpy())

        X = np.column_stack([dataset.counts, substitute_confounder])
        outcome_model = fit_outcome_model(X, y, data_dim, outcome_type,
                                          CV=CV, verbose=True)
        if outcome_type == 'linear':
            treatment_effects[sample, :] = outcome_model.coef_[:data_dim]
        if outcome_type == 'binary':
            treatment_effects[sample, :] = outcome_model.coef_[0][:data_dim]

    treatment_effects = pd.DataFrame(treatment_effects, columns=df.columns)
    treatment_effects.to_csv(
        os.path.join(param_save_dir, "treatment_effects.csv"))

    stats = pd.DataFrame({
        "drug_name": treatment_effects.columns.values,
        "mean": treatment_effects.mean(axis=0).values,
        "stderr": treatment_effects.sem(axis=0).values,
        "ci95_lower": treatment_effects.mean(axis=0).values -
        1.96 * treatment_effects.sem(axis=0).values,
        "ci95_upper": treatment_effects.mean(axis=0).values +
        1.96 * treatment_effects.sem(axis=0).values,
    })
    stats.to_csv(os.path.join(param_save_dir, "treatment_effects_stats.csv"),
                 index=False)


# For testing in python
# outputFolder = "C:/Users/lz2629/git/zhangly811/MvDeconfounder/res"
# dataFolder = "C:/Users/lz2629/git/zhangly811/MvDeconfounder/dat"
# factorModel = 'DEF'
#
# fit_deconfounder(data_dir=dataFolder,
#                  save_dir=outputFolder,
#                  factor_model=factorModel,
#                  learning_rate=0.0001,
#                  max_steps=100000,
#                  latent_dim=1,
#                  layer_dim=[30, 5],
#                  batch_size=1024,
#                  num_samples=1,  # number of samples from variational distribution
#                  holdout_portion=0.5,
#                  print_steps=50,
#                  tolerance=3,
#                  num_confounder_samples=30,  # number of samples of substitute confounder from the posterior
#                  CV=5,
#                  outcome_type='linear')
#
# def debugfunc(data_dir,
#               save_dir,
#               factor_model,
#               learning_rate,
#               max_steps,
#               latent_dim,
#               layer_dim,
#               batch_size,
#               num_samples,  # number of samples from variational distribution
#               holdout_portion,
#               print_steps,
#               tolerance,
#               num_confounder_samples,  # number of samples of substitute confounder from the posterior
#               CV,
#               outcome_type):
#     param_save_dir = os.path.join(save_dir, "{}_lr{}_maxsteps{}_latentdim{}_layerdim{}_batchsize{}_numsamples{}_holdoutp{}_tolerance{}_numconfsamples{}_CV{}_outType{}/".format(
#         factor_model, learning_rate, max_steps, latent_dim, layer_dim, batch_size, num_samples, holdout_portion, tolerance, num_confounder_samples, CV, outcome_type
#     ))
#     df = pd.read_csv(os.path.join(data_dir, "drug_exposure_sparse_matrix.csv"), index_col=0)
#
#     covariates_df = pd.read_csv(os.path.join(data_dir, "pre_treatment_lab.csv"))
#     data_dim = df.shape[1]
#     covariates = covariates_df['value_as_number'].values
#     y_df = pd.read_csv(os.path.join(data_dir, "post_treatment_lab.csv"))
#     y = y_df['value_as_number'].values - covariates
#
#     treatment_effects = np.zeros((num_confounder_samples, data_dim))
#     for sample in range(num_confounder_samples):
#         X = df.values
#         outcome_model = fit_outcome_model(X, y, data_dim, outcome_type, CV=CV, verbose=True)
#         if outcome_type == 'linear':
#             treatment_effects[sample, :] = outcome_model.coef_[:data_dim]
#         if outcome_type == 'binary':
#             treatment_effects[sample, :] = outcome_model.coef_[0][:data_dim]
#     treatment_effects = pd.DataFrame(treatment_effects, columns=df.columns)
#     treatment_effects.to_csv(os.path.join(param_save_dir, "treatment_effects.csv"))
#
#     stats = pd.DataFrame({
#         "drug_name": treatment_effects.columns.values,
#         "mean": treatment_effects.mean(axis=0).values,
#         "stderr": treatment_effects.sem(axis=0).values,
#         "ci95_lower": treatment_effects.mean(axis=0).values - 1.96 * treatment_effects.sem(axis=0).values,
#         "ci95_upper": treatment_effects.mean(axis=0).values + 1.96 * treatment_effects.sem(axis=0).values,
#     })
#     stats.to_csv(os.path.join(param_save_dir, "treatment_effects_stats_test.csv"), index=False)
#
# debugfunc(data_dir=dataFolder,
#           save_dir=outputFolder,
#           factor_model=factorModel,
#           learning_rate=0.0001,
#           max_steps=100000,
#           latent_dim=1,
#           layer_dim=[30, 5],
#           batch_size=1024,
#           num_samples=1,  # number of samples from variational distribution
#           holdout_portion=0.5,
#           print_steps=50,
#           tolerance=3,
#           num_confounder_samples=30,  # number of samples of substitute confounder from the posterior
#           CV=5,
#           outcome_type='linear')
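# Toy illustration (separate from the pipeline above) of the confidence-interval
# summary written to treatment_effects_stats.csv: across the sampled substitute
# confounders, each column's effect is reported as mean +/- 1.96 * SEM. The
# column names and random values here are placeholders, not real results.
import numpy as np
import pandas as pd

_demo_effects = pd.DataFrame(
    np.random.default_rng(0).normal(size=(30, 3)),
    columns=['drug_a', 'drug_b', 'drug_c'])
_demo_ci = pd.DataFrame({
    'mean': _demo_effects.mean(axis=0),
    'ci95_lower': _demo_effects.mean(axis=0) - 1.96 * _demo_effects.sem(axis=0),
    'ci95_upper': _demo_effects.mean(axis=0) + 1.96 * _demo_effects.sem(axis=0),
})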
def plot_ecdf_single(iteration=0, sample_size=1000, replace=False):
    # Same ECDF plot and summary statistics for each inter-arrival time
    # (20, 35, 50 and 10 ms).
    out_dir = './analysis/analisiScenario/queueLength/non-monitoring-lognormal/'
    for delay in ["20ms", "35ms", "50ms", "10ms"]:
        df = scalar_df_parse(
            "csv/analisiScenario/" + delay + "/Nonmonitoring-lognormal.csv")

        sample = df[df.name == "queueLength"]
        x = np.sort(sample['value'].dropna())
        n = x.size
        y = np.arange(1, n + 1) / n

        plt.scatter(x=x, y=y, label=delay)
        plt.legend(loc='best')
        plt.grid(True)
        plt.xlabel('x')
        plt.ylabel('F(x)')
        plt.title("ECDF for " + delay)
        plt.savefig(out_dir + delay + '.png')
        plt.show()

        stats = data_analysis(df, "queueLength")
        stats.to_csv(out_dir + 'stats' + delay + '.csv', index=False)
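# Helper sketch (not in the original source): the empirical CDF computed above,
# in one place -- sort the sample, then pair each value with its rank / n.
import numpy as np

def ecdf(values):
    x = np.sort(np.asarray(values, dtype=float))
    y = np.arange(1, x.size + 1) / x.size
    return x, y

# Example: x, y = ecdf(df[df.name == "queueLength"]['value'].dropna())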
    # (tail of a helper defined above, not shown in this excerpt)
    stData['weekday'] = stData['weekday'].apply(recalculate_weekday)
    return stData


def get_stats(rawData: pd.DataFrame, group='ALL'):
    stData = build_stats_df(rawData)
    Explorer = ExploratoryAnalysis(stData)
    stats = Explorer.all_column_stats(group)
    corr = Explorer.correlation_measures(group)
    return [stats, corr]


dataFile = '/home/gauss/arm/importante/work/ai/projects/revolico/clean_data/ads_dump1.csv'
df = pd.read_csv(dataFile)  # [0:1000]
df['classification'] = df['classification'].apply(get_first_classification)
# stats, corr = get_stats(df)
# Explorer.all_column_stats()

groups = df.groupby('classification')
groups = [(group[0], build_stats_df(group[1])) for group in groups]
statsDF = build_stats_df(df)
groups.append(("ALL", statsDF))

exp = ExploratoryAnalysis(groups)
# exp.set_groups(groups)
stats, corr = exp.groups_column_stats()
stats.to_csv('ExploratoryStats.csv')
corr.to_csv('Correlation.csv')