def task(args):
    """Run the feature-selection drug-sensitivity experiment for one setting.

    Gene expressions are read from ``data/<gdsc_data_name>.h5`` and drug
    responses from ``data/GDSC_drugres.h5``.  For every drug id, every
    epsilon in ``eps``, every model seed and every split seed, features are
    chosen with ``select_features``, the data are processed and a predictor
    is fitted through ``diffpri``.  Two score arrays per (drug, model seed)
    are written as ``.npy`` files under ``drugsens_res/``.

    Args:
        args: Tuple ``(d, (gdsc_data_name, gdsc_data_type), drugids)``.
            ``d`` is the representation dimension requested from
            ``select_features``; the inner pair is the HDF5 file stem and
            the key read from it; ``drugids`` is an iterable of column
            indices into the drug-response matrix.

    Module-level names read here: ``n_npv``, ``n_test``, ``seeds``, ``eps``,
    ``model_seeds``, ``sparsity``, ``lasso_max_iter``, ``clipping_only``,
    ``mcmc`` and ``select_features``.
    """
    (d, (gdsc_data_name, gdsc_data_type), drugids) = args
    logging.info("repr_dim = %s", d)
    logging.info("gdsc_data_name = %s", gdsc_data_name)
    import diffpri as dp
    import pandas

    # Import data
    logging.info("Loading gene expressions...")
    geneexpr = pandas.read_hdf("data/%s.h5" % (gdsc_data_name), gdsc_data_type)
    # DataFrame.as_matrix() no longer exists in current pandas; .values
    # returns the same ndarray.
    x_full = geneexpr.values
    n, _ = x_full.shape
    logging.info(" * size = %s x %s", *x_full.shape)

    logging.info("Loading drug sensitivity data...")
    drugres = pandas.read_hdf("data/GDSC_drugres.h5", 'drug_responses')
    y = drugres.values
    logging.info(" * size = %s x %s", *y.shape)
    assert x_full.shape[0] == y.shape[0]

    n_pv = n - n_npv - n_test
    pv_max = n_pv

    logging.info("Running the tests...")
    for drugid in drugids:
        logging.info("drugid = %d", drugid)
        sd = np.nanstd(y[:, drugid], ddof=1)
        # Axes: (split seed, epsilon, model seed), all addressed by position.
        S = np.zeros((len(seeds), len(eps), len(model_seeds)), dtype=np.float64)
        R = np.zeros((len(seeds), len(eps), len(model_seeds)), dtype=np.float64)
        for j, e in enumerate(eps):
            logging.info(" epsilon = %s", e)
            # The budget is split evenly between feature selection and
            # prediction.
            repr_eps = e / 2
            pred_eps = e / 2
            if np.isinf(e):
                w_x = np.inf
                w_y = np.inf
            else:
                # np.asscalar was removed from NumPy; ndarray.item() is the
                # documented replacement.
                w_x = np.loadtxt(
                    "drugsens_params/clipping/wx_n%d_d%d_e%s.txt"
                    % (n_pv, d, pred_eps)).item()
                w_y = np.loadtxt(
                    "drugsens_params/clipping/wy_n%d_d%d_e%s.txt"
                    % (n_pv, d, pred_eps)).item()
            for k, model_seed in enumerate(model_seeds):
                logging.info(" model seed = %d", model_seed)
                np.random.seed(model_seed)
                logging.info(" selecting features...")
                selected_features = select_features(
                    x_full, y[:, drugid], [d], sparsity, repr_eps,
                    lasso_max_iter=lasso_max_iter, fit_intercept=True)
                x = x_full[:, selected_features[d]]
                # Keep the number of columns actually selected separate from
                # the requested ``d`` so it cannot leak into later
                # iterations, parameter file names or result file names.
                d_sel = x.shape[1]
                for i, seed in enumerate(seeds):
                    logging.info(" seed = %d", seed)
                    # Process data
                    (nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv,
                     x_test, y_test, B_x, B_y, n_train,
                     private) = dp.processData(
                        x, y, d_sel, n_test, n_pv, n_npv, pv_max,
                        w_x, w_y, drugid, seed)
                    if np.isinf(e) or clipping_only:
                        private = False
                    # Fit model
                    if mcmc:
                        pred = dp.predictMCMC(
                            n_train, nxx_pv, nxx_npv, nxy_pv, nxy_npv,
                            nyy_pv, nyy_npv, B_x, B_y, pred_eps, x_test,
                            private)
                    else:
                        pred = dp.predictL(
                            nxx_pv, nxx_npv, nxy_pv, nxy_npv, B_x, B_y,
                            pred_eps, x_test, private)
                    # Evaluate; index by position, not by seed value, so
                    # arbitrary seed lists fit the arrays allocated above.
                    S[i, j, k] = dp.precision(pred, y_test)
                    R[i, j, k] = dp.pc(pred, y_test, sd)

        # Save results
        for k, model_seed in enumerate(model_seeds):
            dim_red = '%s-kifer_%d-%d' % (gdsc_data_name, d, model_seed)
            resname = "%s-pv%dnpv%dtst%d%s%s-%d" % (
                dim_red, n_pv, n_npv, n_test,
                ("-cliponly" if clipping_only else ""),
                ("-mcmc" if mcmc else "-fixed"),
                drugid,
            )
            filename = "drugsens_res/corr-%s.npy" % (resname)
            np.save(filename, S[:, :, k])
            logging.info("saved %s", filename)
            filename = "drugsens_res/wpc-%s.npy" % (resname)
            np.save(filename, R[:, :, k])
            logging.info("saved %s", filename)
# Baseline run with a fixed dimension of 10 and privacy switched off.
d = 10

# Process data
(nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv,
 x_test, y_test, B_x, B_y, n_train, private) = dp.processData(
    x, y, d, n_test, n_pv, n_npv, pv_max, w_x, w_y, drugid, seed)
# The flag returned by processData is overridden unconditionally.
private = False

# Fit model: the same statistics feed both predictors.
pred = dp.predictMCMC(
    n_train, nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv,
    B_x, B_y, e, x_test, private)
pred_mcmc_d10 = pred
pred_fixd_d10 = dp.predictL(
    nxx_pv, nxx_npv, nxy_pv, nxy_npv, B_x, B_y, e, x_test, private)

# Evaluate & save: each score is wrapped in a one-element array for savetxt.
base_mcmc_d10_corr = dp.precision(pred_mcmc_d10, y_test)
tmp = np.array([base_mcmc_d10_corr])
np.savetxt(
    datapath + 'base-corr-mcmc-d10' + '-' + str(drugid) + '-' + str(seed) + '.csv',
    tmp,
    delimiter=',')

base_mcmc_d10_wpc = dp.pc(pred_mcmc_d10, y_test, sd)
tmp = np.array([base_mcmc_d10_wpc])
np.savetxt(
    datapath + 'base-wpc-mcmc-d10' + '-' + str(drugid) + '-' + str(seed) + '.csv',
    tmp,
    delimiter=',')

base_fixd_d10_corr = dp.precision(pred_fixd_d10, y_test)
tmp = np.array([base_fixd_d10_corr])
n_cv = 50
# One repetition per seed; every repetition writes its own pair of CSV files.
for seed in range(n_cv):
    S = np.zeros((len(pv_size), 1), dtype=np.float64)
    R = np.zeros_like(S)

    for i, n_pv in enumerate(pv_size):
        # Row i of WX / WY, column w_ind, gives the two w values for this size.
        w_x, w_y = WX[i, w_ind], WY[i, w_ind]

        # Process data
        (nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv,
         x_test, y_test, B_x, B_y, n_train, private) = dp.processData(
            x, y, d, n_test, n_pv, n_npv, pv_max, w_x, w_y, drugid, seed)

        # Fit model
        pred = (
            dp.predictMCMC(n_train, nxx_pv, nxx_npv, nxy_pv, nxy_npv,
                           nyy_pv, nyy_npv, B_x, B_y, e, x_test, private)
            if mcmc else
            dp.predictL(nxx_pv, nxx_npv, nxy_pv, nxy_npv,
                        B_x, B_y, e, x_test, private)
        )

        # Evaluate
        S[i, 0] = dp.precision(pred, y_test)
        R[i, 0] = dp.pc(pred, y_test, sd)

    # Save results into file
    csvpath = (datapath + 'cliptest-drugsens-corr-' + stre
               + '-' + str(drugid) + '-' + str(seed) + '.csv')
    np.savetxt(csvpath, S, delimiter=',')
    csvpath = (datapath + 'cliptest-drugsens-wpc-' + stre
               + '-' + str(drugid) + '-' + str(seed) + '.csv')
    np.savetxt(csvpath, R, delimiter=',')
# Process data nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv, x_test, y_test, B_x, B_y, n_train, private = dp.processData( x, y, d, n_test, n_pv, n_npv, pv_max, w_x, w_y, drugid, seed) # Fit model if mcmc: pred = dp.predictMCMC(n_train, nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv, B_x, B_y, e, x_test, private) else: pred = dp.predictL(nxx_pv, nxx_npv, nxy_pv, nxy_npv, B_x, B_y, e, x_test, private) # Evaluate R[i, j] = dp.precision(pred, y_test) # Save results into file csvpath = datapath + 'cliptest-drugsens-A-' + str(drugid) + '-' + str( seed) + '.csv' np.savetxt(csvpath, R, delimiter=',') # B) Non-private data size if test == 1: # Fetch clipping threshold omegas for test B (in A-W*[:,1]) f = open(datapath + 'A-WX.csv', 'rt') reader = csv.reader(f, delimiter=',') WX = np.array(list(reader)).astype(float) f.close() f = open(datapath + 'A-WY.csv', 'rt')
# Process data nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv, x_test, y_test, B_x, B_y, n_train, private = dp.processData( x, y, d, n_test, n_pv, n_npv, pv_max, w_x, w_y, drugid, seed) # Fit model if mcmc: pred = dp.predictMCMC(n_train, nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv, B_x, B_y, e, x_test, private) else: pred = dp.predictL(nxx_pv, nxx_npv, nxy_pv, nxy_npv, B_x, B_y, e, x_test, private) # Evaluate S[i] = dp.precision(pred, y_test) # Save results into file csvpath = datapath + 'synth-rplr-' + t + '-' + str(seed) + '.csv' np.savetxt(csvpath, S, delimiter=',') if param == 1: # Cross-validation for seed in range(n_cv): S = np.zeros(len(pv_size), dtype=np.float64) for i in range(len(pv_size)): n_pv = pv_size[i] e = 2.0
def task(args):
    """Run the drug-sensitivity experiment on a precomputed representation.

    The representation is read from ``data_repr/<dim_red>.csv`` and the drug
    responses from ``data/GDSC_drugres.h5``.  When the module-level ``ica``
    flag is set, the representation is passed through FastICA first.  For
    every drug id in the module-level ``drugids``, every split seed and every
    epsilon in ``eps`` the data are processed and a predictor is fitted
    through ``diffpri``.  Two score arrays per drug are written as ``.npy``
    files under ``drugsens_res/``.

    Args:
        args: One-element tuple ``(dim_red,)`` holding the file stem of the
            representation CSV; it is also used in the result file names.

    Module-level names read here: ``n_npv``, ``n_test``, ``ica``,
    ``drugids``, ``seeds``, ``eps``, ``clipping_only`` and ``mcmc``.
    """
    (dim_red, ) = args
    logging.info("dim_red = %s", dim_red)
    import diffpri as dp
    import csv
    import pandas

    # Import data
    logging.info("Loading representation...")
    filename = "%s.csv" % (dim_red)
    # The context manager closes the file even if parsing fails.
    with open("data_repr/" + filename, 'rt') as f:
        reader = csv.reader(f, delimiter=',')
        x = np.array(list(reader)).astype(float)
    (n, d) = x.shape
    logging.info(" * size = %s x %s", *x.shape)

    logging.info("Loading drug sensitivity data...")
    drugres = pandas.read_hdf("data/GDSC_drugres.h5", 'drug_responses')
    # DataFrame.as_matrix() no longer exists in current pandas; .values
    # returns the same ndarray.
    y = drugres.values
    logging.info(" * size = %s x %s", *y.shape)
    assert x.shape[0] == y.shape[0]

    n_pv = n - n_npv - n_test
    pv_max = n_pv

    if ica:
        logging.info("Running FastICA...")
        from sklearn.decomposition import FastICA
        x = FastICA(max_iter=2000).fit_transform(x)

    logging.info("Running the tests...")
    for drugid in drugids:
        logging.info("drugid = %d", drugid)
        # Axes: (split seed, epsilon), both addressed by position.
        S = np.zeros((len(seeds), len(eps)), dtype=np.float64)
        R = np.zeros((len(seeds), len(eps)), dtype=np.float64)
        # Depends only on the drug, so compute it once per drug.
        sd = np.nanstd(y[:, drugid], ddof=1)
        for i, seed in enumerate(seeds):
            logging.info("seed = %d", seed)
            for j, e in enumerate(eps):
                if np.isinf(e):
                    w_x = np.inf
                    w_y = np.inf
                else:
                    # np.asscalar was removed from NumPy; ndarray.item() is
                    # the documented replacement.
                    w_x = np.loadtxt(
                        "drugsens_params/clipping/wx_n%d_d%d_e%s.txt"
                        % (n_pv, d, e)).item()
                    w_y = np.loadtxt(
                        "drugsens_params/clipping/wy_n%d_d%d_e%s.txt"
                        % (n_pv, d, e)).item()
                # Process data
                (nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv,
                 x_test, y_test, B_x, B_y, n_train,
                 private) = dp.processData(
                    x, y, d, n_test, n_pv, n_npv, pv_max,
                    w_x, w_y, drugid, seed)
                if np.isinf(e) or clipping_only:
                    private = False
                # Fit model
                if mcmc:
                    pred = dp.predictMCMC(
                        n_train, nxx_pv, nxx_npv, nxy_pv, nxy_npv,
                        nyy_pv, nyy_npv, B_x, B_y, e, x_test, private)
                else:
                    pred = dp.predictL(
                        nxx_pv, nxx_npv, nxy_pv, nxy_npv, B_x, B_y,
                        e, x_test, private)
                # Evaluate; index by position, not by seed value, so
                # arbitrary seed lists fit the arrays allocated above.
                S[i, j] = dp.precision(pred, y_test)
                R[i, j] = dp.pc(pred, y_test, sd)

        # Save results
        resname = "%s-pv%dnpv%dtst%d%s%s%s-%d" % (
            dim_red, n_pv, n_npv, n_test,
            ("-ica" if ica else ""),
            ("-cliponly" if clipping_only else ""),
            ("-mcmc" if mcmc else "-fixed"),
            drugid,
        )
        filename = "drugsens_res/corr-%s.npy" % (resname)
        np.save(filename, S)
        logging.info("saved %s", filename)
        filename = "drugsens_res/wpc-%s.npy" % (resname)
        np.save(filename, R)
        logging.info("saved %s", filename)
# Sweep over private-data sizes (rows) and epsilon/delta pairs (columns).
# NOTE(review): the original indentation is lost; the nesting below is
# inferred from data flow (res_all is indexed by [i, j], the pickle name only
# by drugid and seed) -- confirm against the upstream script.
for i, n_pv in enumerate(pv_size):
    d = pars['dim']
    for j, eps_j in enumerate(eps):
        pars['epsilon'] = eps_j
        pars['delta'] = delta_list[j]
        w_x = WX[i, j]
        w_y = WY[i, j]

        # check amount of data, use maximum amount if too few samples
        if n_data < n_pv + n_test:  # n_npv+n_test:
            print('Not enough non-missing data! Continuing with maximum amount of private data: ' + str(n_data-n_test))
            n_pv = n_data - n_test

        # Process data
        (suff_stats_all, sigma_all, added_noise_dict,
         x_test, y_test, B_x, B_y, n_train) = dp.processData(
            x, y, d, n_test, n_pv, pv_max, w_x, w_y, drugid, seed, pars)

        # calculate predictions, one entry of res_all per key m
        for m in suff_stats_all:
            stats_m = suff_stats_all[m]
            pred = dp.predictL(stats_m[0], stats_m[1], x_test)
            res_all[m][i, j] = dp.precision(pred, y_test)

with open('res/cliptest-drugsens-'+str(drugid)+'-'+str(seed)+'.pickle', 'wb') as f:
    pickle.dump(res_all, f, pickle.HIGHEST_PROTOCOL)
print('Done.')
# Check if split is sensible t1 = minp <= p1 and p1 <= maxp t2 = minp <= p2 and p2 <= maxp t3 = minp <= p3 and p3 <= maxp if not all([t1,t2,t3]): continue # Clipping omega and thresholds w_x,w_y = dp.omega(n,d,eps,True,ln=ln,p1=p1,p2=p2,p3=p3) c1 = sx * w_x c2 = sy * w_y # Clip data xc,yc = dp.clip(x,y,c1,c2) # Perturbed suff.stats. xx = dp.nxx(xc) + W*(c1**2.0)/p1 xy = dp.nxy(xc,yc) + L*c1*c2/p2 yy = dp.nyy(yc) + V*(c2**2.0)/p3 # Prediction pred = dp.doADVI(n,xx,xy,yy,x) # Precision err[k,m] = dp.precision(pred,y) # Save result csvpath = datapath+'budgetsplit-result-'+str(i)+'-'+str(j)+'-'+str(k)+'.csv' np.savetxt(csvpath,err,delimiter=',')