def task(args):
    """Run the feature-selection + prediction experiment for one job.

    Loads the gene expression table and the drug response table, then for
    every drug id, every privacy level in ``eps`` and every model seed it
    selects features with ``select_features`` and, for every data seed, fits
    a ``diffpri`` predictor and records ``dp.precision`` and ``dp.pc``.
    One ``corr-*.npy`` and one ``wpc-*.npy`` file is written per
    (drug id, model seed) pair under ``drugsens_res/``.

    Args:
        args: tuple ``(d, (gdsc_data_name, gdsc_data_type), drugids)``.
            ``d`` is the representation dimension handed to
            ``select_features`` (and used as the key into its result),
            ``gdsc_data_name`` names the HDF5 file under ``data/``,
            ``gdsc_data_type`` is the key read from that file, and
            ``drugids`` is an iterable of column indices into the drug
            response matrix.

    Module-level names read: ``n_npv``, ``n_test``, ``seeds``, ``eps``,
    ``model_seeds``, ``sparsity``, ``lasso_max_iter``, ``clipping_only``,
    ``mcmc``, ``select_features``, ``np`` and ``logging``.

    Raises:
        AssertionError: if the two tables do not have the same number of rows.
    """
    (d, (gdsc_data_name, gdsc_data_type), drugids) = args
    logging.info("repr_dim = %s", d)
    logging.info("gdsc_data_name = %s", gdsc_data_name)

    import diffpri as dp
    import pandas

    # Import data
    logging.info("Loading gene expressions...")
    geneexpr = pandas.read_hdf("data/%s.h5" % (gdsc_data_name), gdsc_data_type)
    # DataFrame.as_matrix() no longer exists in pandas >= 1.0; .values is
    # available in both old and new releases.
    x_full = geneexpr.values
    logging.info(" * size = %s x %s", *x_full.shape)
    n, _ = x_full.shape

    logging.info("Loading drug sensitivity data...")
    drugres = pandas.read_hdf("data/GDSC_drugres.h5", 'drug_responses')
    y = drugres.values
    logging.info(" * size = %s x %s", *y.shape)

    assert x_full.shape[0] == y.shape[0]

    # Whatever is not reserved for the non-private and test parts is private.
    n_pv = n - n_npv - n_test
    pv_max = n_pv

    logging.info("Running the tests...")

    for drugid in drugids:
        logging.info("drugid = %d", drugid)

        sd = np.nanstd(y[:, drugid], ddof=1)

        # Result arrays are addressed by *position* of the seed / epsilon /
        # model seed, so the seed values themselves need not be 0..len-1.
        result_shape = (len(seeds), len(eps), len(model_seeds))
        S = np.zeros(result_shape, dtype=np.float64)
        R = np.zeros(result_shape, dtype=np.float64)

        for j, e in enumerate(eps):
            logging.info(" epsilon = %s", e)

            # The privacy budget is split evenly between feature selection
            # and prediction.
            repr_eps = e / 2
            pred_eps = e / 2

            if np.isinf(e):
                w_x = np.inf
                w_y = np.inf
            else:
                # Clipping thresholds are stored one value per file, keyed by
                # the prediction share of the budget. ndarray.item() replaces
                # the removed np.asscalar().
                w_x = np.loadtxt(
                    "drugsens_params/clipping/wx_n%d_d%d_e%s.txt" %
                    (n_pv, d, pred_eps)).item()
                w_y = np.loadtxt(
                    "drugsens_params/clipping/wy_n%d_d%d_e%s.txt" %
                    (n_pv, d, pred_eps)).item()

            for k, model_seed in enumerate(model_seeds):
                logging.info("  model seed = %d", model_seed)
                np.random.seed(model_seed)

                logging.info("   selecting features...")
                selected_features = select_features(
                    x_full,
                    y[:, drugid], [d],
                    sparsity,
                    repr_eps,
                    lasso_max_iter=lasso_max_iter,
                    fit_intercept=True)

                x = x_full[:, selected_features[d]]
                # NOTE(review): d is rebound to the number of columns actually
                # selected; later iterations and the output file names use
                # this value. Kept as in the original.
                d = x.shape[1]

                for i, seed in enumerate(seeds):
                    logging.info("   seed = %d", seed)

                    # Process data
                    (nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv,
                     x_test, y_test, B_x, B_y, n_train,
                     private) = dp.processData(
                         x, y, d, n_test, n_pv, n_npv, pv_max, w_x, w_y,
                         drugid, seed)

                    # An infinite budget or clipping-only mode overrides the
                    # privacy flag returned by processData.
                    if np.isinf(e) or clipping_only:
                        private = False

                    # Fit model
                    if mcmc:
                        pred = dp.predictMCMC(n_train, nxx_pv, nxx_npv, nxy_pv,
                                              nxy_npv, nyy_pv, nyy_npv, B_x,
                                              B_y, pred_eps, x_test, private)
                    else:
                        pred = dp.predictL(nxx_pv, nxx_npv, nxy_pv, nxy_npv,
                                           B_x, B_y, pred_eps, x_test, private)

                    # Evaluate
                    S[i, j, k] = dp.precision(pred, y_test)
                    R[i, j, k] = dp.pc(pred, y_test, sd)

        # Save results: one (seeds x eps) slice per model seed.
        for k, model_seed in enumerate(model_seeds):
            dim_red = '%s-kifer_%d-%d' % (gdsc_data_name, d, model_seed)
            resname = "%s-pv%dnpv%dtst%d%s%s-%d" % (
                dim_red,
                n_pv,
                n_npv,
                n_test,
                ("-cliponly" if clipping_only else ""),
                ("-mcmc" if mcmc else "-fixed"),
                drugid,
            )
            filename = "drugsens_res/corr-%s.npy" % (resname)
            np.save(filename, S[:, :, k])
            logging.info("saved %s", filename)
            filename = "drugsens_res/wpc-%s.npy" % (resname)
            np.save(filename, R[:, :, k])
            logging.info("saved %s", filename)
Пример #2
0
    # Fix the representation dimension to 10 for this run; the results below
    # are written under 'base-...-d10' file names.
    d = 10

    # Process data
    nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv, x_test, y_test, B_x, B_y, n_train, private = dp.processData(
        x, y, d, n_test, n_pv, n_npv, pv_max, w_x, w_y, drugid, seed)
    # Override the flag returned by processData: both fits below are called
    # with private=False.
    private = False

    # Fit model
    # The same processData outputs feed both predictors; the chained
    # assignment also binds `pred` to the MCMC result.
    pred_mcmc_d10 = pred = dp.predictMCMC(n_train, nxx_pv, nxx_npv, nxy_pv,
                                          nxy_npv, nyy_pv, nyy_npv, B_x, B_y,
                                          e, x_test, private)
    pred_fixd_d10 = dp.predictL(nxx_pv, nxx_npv, nxy_pv, nxy_npv, B_x, B_y, e,
                                x_test, private)

    # Evaluate & save
    # Each metric is wrapped in a one-element array so np.savetxt can write it
    # to a CSV file named after the drug id and the seed.
    base_mcmc_d10_corr = dp.precision(pred_mcmc_d10, y_test)
    tmp = np.array([base_mcmc_d10_corr])
    np.savetxt(datapath + 'base-corr-mcmc-d10' + '-' + str(drugid) + '-' +
               str(seed) + '.csv',
               tmp,
               delimiter=',')

    base_mcmc_d10_wpc = dp.pc(pred_mcmc_d10, y_test, sd)
    tmp = np.array([base_mcmc_d10_wpc])
    np.savetxt(datapath + 'base-wpc-mcmc-d10' + '-' + str(drugid) + '-' +
               str(seed) + '.csv',
               tmp,
               delimiter=',')

    # Same evaluation for the dp.predictL prediction (the excerpt ends before
    # this value is written out).
    base_fixd_d10_corr = dp.precision(pred_fixd_d10, y_test)
    tmp = np.array([base_fixd_d10_corr])
Пример #3
0
# Number of cross-validation repetitions; the repetition index doubles as the
# seed handed to dp.processData and appears in the output file names.
n_cv = 50
for seed in range(n_cv):

    # One row per entry of pv_size, a single column each.
    S = np.zeros((len(pv_size), 1), dtype=np.float64)
    R = np.zeros((len(pv_size), 1), dtype=np.float64)

    for i, n_pv in enumerate(pv_size):

        # Clipping thresholds for this private-data size (column w_ind).
        w_x, w_y = WX[i, w_ind], WY[i, w_ind]

        # Process data
        (nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv,
         x_test, y_test, B_x, B_y, n_train, private) = dp.processData(
             x, y, d, n_test, n_pv, n_npv, pv_max, w_x, w_y, drugid, seed)

        # Fit model: `mcmc` selects which diffpri predictor is used.
        if mcmc:
            pred = dp.predictMCMC(
                n_train, nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv,
                B_x, B_y, e, x_test, private)
        else:
            pred = dp.predictL(
                nxx_pv, nxx_npv, nxy_pv, nxy_npv, B_x, B_y, e, x_test,
                private)

        # Evaluate
        S[i, 0] = dp.precision(pred, y_test)
        R[i, 0] = dp.pc(pred, y_test, sd)

    # Save results into file: one CSV per metric, drug id and seed.
    csvpath = ''.join([
        datapath, 'cliptest-drugsens-corr-', stre, '-', str(drugid), '-',
        str(seed), '.csv'
    ])
    np.savetxt(csvpath, S, delimiter=',')
    csvpath = ''.join([
        datapath, 'cliptest-drugsens-wpc-', stre, '-', str(drugid), '-',
        str(seed), '.csv'
    ])
    np.savetxt(csvpath, R, delimiter=',')
Пример #4
0
                # Process data
                # NOTE(review): the loops that define i, j and seed start
                # before this excerpt.
                nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv, x_test, y_test, B_x, B_y, n_train, private = dp.processData(
                    x, y, d, n_test, n_pv, n_npv, pv_max, w_x, w_y, drugid,
                    seed)

                # Fit model
                # `mcmc` chooses between dp.predictMCMC and dp.predictL; only
                # the MCMC call receives n_train, nyy_pv and nyy_npv.
                if mcmc:
                    pred = dp.predictMCMC(n_train, nxx_pv, nxx_npv, nxy_pv,
                                          nxy_npv, nyy_pv, nyy_npv, B_x, B_y,
                                          e, x_test, private)
                else:
                    pred = dp.predictL(nxx_pv, nxx_npv, nxy_pv, nxy_npv, B_x,
                                       B_y, e, x_test, private)

                # Evaluate
                # Store dp.precision of the prediction at position (i, j) of R.
                R[i, j] = dp.precision(pred, y_test)

        # Save results into file
        # The whole R matrix goes into one CSV named after drugid and seed.
        csvpath = datapath + 'cliptest-drugsens-A-' + str(drugid) + '-' + str(
            seed) + '.csv'
        np.savetxt(csvpath, R, delimiter=',')

    # B) Non-private data size
    if test == 1:

        # Fetch clipping threshold omegas for test B (in A-W*[:,1])
        # A-WX.csv is parsed with csv.reader and converted to a float array.
        f = open(datapath + 'A-WX.csv', 'rt')
        reader = csv.reader(f, delimiter=',')
        WX = np.array(list(reader)).astype(float)
        f.close()
        # A-WY.csv is opened next; the code that reads it lies past the end of
        # this excerpt.
        f = open(datapath + 'A-WY.csv', 'rt')
Пример #5
0
            # Process data
            # NOTE(review): the loops that define i and seed start before this
            # excerpt.
            nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv, x_test, y_test, B_x, B_y, n_train, private = dp.processData(
                x, y, d, n_test, n_pv, n_npv, pv_max, w_x, w_y, drugid, seed)

            # Fit model
            # `mcmc` chooses between dp.predictMCMC and dp.predictL.
            if mcmc:
                pred = dp.predictMCMC(n_train, nxx_pv, nxx_npv, nxy_pv,
                                      nxy_npv, nyy_pv, nyy_npv, B_x, B_y, e,
                                      x_test, private)
            else:
                pred = dp.predictL(nxx_pv, nxx_npv, nxy_pv, nxy_npv, B_x, B_y,
                                   e, x_test, private)

            # Evaluate
            # Store dp.precision of the prediction at position i of S.
            S[i] = dp.precision(pred, y_test)

        # Save results into file
        # S is written to one CSV whose name combines `t` and the seed.
        csvpath = datapath + 'synth-rplr-' + t + '-' + str(seed) + '.csv'
        np.savetxt(csvpath, S, delimiter=',')

# Second parameter setting; the body continues past the end of this excerpt.
if param == 1:
    # Cross-validation
    for seed in range(n_cv):

        # One result slot per entry of pv_size.
        S = np.zeros(len(pv_size), dtype=np.float64)

        for i in range(len(pv_size)):

            n_pv = pv_size[i]
            # Epsilon is fixed to 2.0 in this branch.
            e = 2.0
def task(args):
    """Run the prediction experiment on a precomputed representation.

    Reads the representation matrix from ``data_repr/<dim_red>.csv`` and the
    drug response table from ``data/GDSC_drugres.h5``, optionally transforms
    the representation with FastICA, and then, for every drug id, data seed
    and privacy level in ``eps``, fits a ``diffpri`` predictor and records
    ``dp.precision`` and ``dp.pc``. One ``corr-*.npy`` and one ``wpc-*.npy``
    file is written per drug id under ``drugsens_res/``.

    Args:
        args: one-element tuple ``(dim_red,)``; ``dim_red`` is the base name
            of the representation CSV and the prefix of the result files.

    Module-level names read: ``n_npv``, ``n_test``, ``ica``, ``drugids``,
    ``seeds``, ``eps``, ``clipping_only``, ``mcmc``, ``np`` and ``logging``.

    Raises:
        AssertionError: if the representation and the drug response table do
            not have the same number of rows.
    """
    (dim_red, ) = args
    logging.info("dim_red = %s", dim_red)

    import diffpri as dp
    import csv
    import pandas

    # Import data
    logging.info("Loading representation...")
    filename = "%s.csv" % (dim_red)
    # The context manager closes the file even if parsing fails.
    with open("data_repr/" + filename, 'rt') as f:
        reader = csv.reader(f, delimiter=',')
        x = np.array(list(reader)).astype(float)
    logging.info(" * size = %s x %s", *x.shape)
    (n, d) = x.shape

    logging.info("Loading drug sensitivity data...")
    drugres = pandas.read_hdf("data/GDSC_drugres.h5", 'drug_responses')
    # DataFrame.as_matrix() no longer exists in pandas >= 1.0; .values is
    # available in both old and new releases.
    y = drugres.values
    logging.info(" * size = %s x %s", *y.shape)

    assert x.shape[0] == y.shape[0]

    # Whatever is not reserved for the non-private and test parts is private.
    n_pv = n - n_npv - n_test
    pv_max = n_pv

    if ica:
        logging.info("Running FastICA...")
        from sklearn.decomposition import FastICA
        x = FastICA(max_iter=2000).fit_transform(x)

    logging.info("Running the tests...")

    for drugid in drugids:
        logging.info("drugid = %d", drugid)
        # Result arrays are addressed by *position* of the seed / epsilon, so
        # the seed values themselves need not be 0..len(seeds)-1.
        S = np.zeros((len(seeds), len(eps)), dtype=np.float64)
        R = np.zeros((len(seeds), len(eps)), dtype=np.float64)

        # Depends only on the drug, so compute it once per drug.
        sd = np.nanstd(y[:, drugid], ddof=1)

        for i, seed in enumerate(seeds):
            logging.info("seed = %d", seed)

            for j, e in enumerate(eps):
                if np.isinf(e):
                    w_x = np.inf
                    w_y = np.inf
                else:
                    # Clipping thresholds are stored one value per file;
                    # ndarray.item() replaces the removed np.asscalar().
                    w_x = np.loadtxt(
                        "drugsens_params/clipping/wx_n%d_d%d_e%s.txt" %
                        (n_pv, d, e)).item()
                    w_y = np.loadtxt(
                        "drugsens_params/clipping/wy_n%d_d%d_e%s.txt" %
                        (n_pv, d, e)).item()

                # Process data
                (nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv,
                 x_test, y_test, B_x, B_y, n_train,
                 private) = dp.processData(
                     x, y, d, n_test, n_pv, n_npv, pv_max, w_x, w_y, drugid,
                     seed)

                # An infinite budget or clipping-only mode overrides the
                # privacy flag returned by processData.
                if np.isinf(e) or clipping_only:
                    private = False

                # Fit model
                if mcmc:
                    pred = dp.predictMCMC(n_train, nxx_pv, nxx_npv, nxy_pv,
                                          nxy_npv, nyy_pv, nyy_npv, B_x, B_y,
                                          e, x_test, private)
                else:
                    pred = dp.predictL(nxx_pv, nxx_npv, nxy_pv, nxy_npv, B_x,
                                       B_y, e, x_test, private)

                # Evaluate
                S[i, j] = dp.precision(pred, y_test)
                R[i, j] = dp.pc(pred, y_test, sd)

        # Save results
        resname = "%s-pv%dnpv%dtst%d%s%s%s-%d" % (
            dim_red,
            n_pv,
            n_npv,
            n_test,
            ("-ica" if ica else ""),
            ("-cliponly" if clipping_only else ""),
            ("-mcmc" if mcmc else "-fixed"),
            drugid,
        )
        filename = "drugsens_res/corr-%s.npy" % (resname)
        np.save(filename, S)
        logging.info("saved %s", filename)
        filename = "drugsens_res/wpc-%s.npy" % (resname)
        np.save(filename, R)
        logging.info("saved %s", filename)
Пример #7
0
# Sweep every private-data size against every (epsilon, delta) setting and
# record dp.precision for each entry returned by dp.processData.
for i, n_pv in enumerate(pv_size):

    d = pars['dim']
    for j, eps_j in enumerate(eps):
        pars['epsilon'] = eps_j
        pars['delta'] = delta_list[j]

        w_x, w_y = WX[i, j], WY[i, j]

        # check amount of data, use maximum amount if too few samples
        # (the reduced n_pv stays in effect for the remaining j iterations)
        if n_pv + n_test > n_data:
            print('Not enough non-missing data! Continuing with maximum amount of private data: ' + str(n_data-n_test))
            n_pv = n_data - n_test

        # Process data
        (suff_stats_all, sigma_all, added_noise_dict, x_test, y_test,
         B_x, B_y, n_train) = dp.processData(
             x, y, d, n_test, n_pv, pv_max, w_x, w_y, drugid, seed, pars)

        # calculate predictions
        for m in suff_stats_all:
            stats = suff_stats_all[m]
            pred = dp.predictL(stats[0], stats[1], x_test)
            res_all[m][i, j] = dp.precision(pred, y_test)


# Persist all results in one pickle named after the drug id and the seed.
result_path = ''.join(
    ['res/cliptest-drugsens-', str(drugid), '-', str(seed), '.pickle'])
with open(result_path, 'wb') as f:
    pickle.dump(res_all, f, pickle.HIGHEST_PROTOCOL)

print('Done.')
Пример #8
0
	# Check if split is sensible
	# Each of p1, p2, p3 must lie in [minp, maxp]; otherwise this iteration is
	# skipped. NOTE(review): the enclosing loop header is outside this excerpt.
	t1 = minp <= p1 and p1 <= maxp
	t2 = minp <= p2 and p2 <= maxp
	t3 = minp <= p3 and p3 <= maxp

	if not all([t1,t2,t3]):
		continue

	# Clipping omega and thresholds
	# dp.omega returns one omega for x and one for y; they are scaled by sx
	# and sy to obtain the clipping thresholds c1 and c2.
	w_x,w_y = dp.omega(n,d,eps,True,ln=ln,p1=p1,p2=p2,p3=p3)
	c1 = sx * w_x
	c2 = sy * w_y

	# Clip data
	xc,yc = dp.clip(x,y,c1,c2)

	# Perturbed suff.stats.
	# W, L and V are added to the clipped statistics, scaled by the threshold
	# products and divided by the matching p1, p2, p3.
	xx = dp.nxx(xc) + W*(c1**2.0)/p1
	xy = dp.nxy(xc,yc) + L*c1*c2/p2
	yy = dp.nyy(yc) + V*(c2**2.0)/p3

	# Prediction
	# Predictions are made for the unclipped x.
	pred = dp.doADVI(n,xx,xy,yy,x)

	# Precision
	# Compared against the unclipped y and stored at position (k, m) of err.
	err[k,m] = dp.precision(pred,y)

# Save result
# The err matrix is written to one CSV whose name combines i, j and k.
csvpath = datapath+'budgetsplit-result-'+str(i)+'-'+str(j)+'-'+str(k)+'.csv'
np.savetxt(csvpath,err,delimiter=',')