def task(args):
    """Compute and store the PCA explained-variance ratios of one data set.

    Loads ``data/<data_set>.h5`` (HDF key ``data_type``), optionally
    normalizes the matrix, fits ``sk_PCA`` with default settings and writes
    ``explained_variance_ratio_`` to
    ``res/pca-explained-variance/<data_set>.txt``.

    Args:
        args: Task arguments. Currently unused: the unpacking of ``data_set``
            from it is commented out, so ``data_set`` (like ``data_type`` and
            ``normalize_data``) is read from module scope.
    """
    import pandas

    #data_set, = args
    logging.info("dataset = %s", data_set)

    # read the data sets
    logging.info("Reading data...")
    data = pandas.read_hdf("data/%s.h5" % (data_set), data_type)
    logging.info(" * gene expression shape: %d x %d" % data.shape)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent accessor and exists in old and new releases alike.
    x = data.values
    if normalize_data:
        # these shouldn't affect the results
        # Out-of-place arithmetic avoids mutating the DataFrame's buffer and
        # also works when .values hands back a read-only view.
        x = x - np.mean(x)
        x = x / np.std(x)
        x = x - np.mean(x, axis=0)

    logging.info("Running PCA...")
    pca = sk_PCA()
    pca.fit(x)

    logging.info("Writing results...")
    res_dir = 'res/pca-explained-variance'
    res_filename = "%s/%s.txt" % (res_dir, data_set)
    ensure_dir_exists(res_dir)
    np.savetxt(res_filename, pca.explained_variance_ratio_)
def run_batch(args, params):
    """Run one batch of parameter candidates and return their mean scores.

    The candidates are converted with ``get_params``, saved to
    ``run_parameters/params.npy`` and executed through ``batch.run_tasks``
    (blocking, because ``args.wait`` is set). Afterwards one result file per
    (parameter, validation split, seed) combination is read from
    ``param_opt/`` and averaged.

    Args:
        args: Batch runner arguments; ``args.wait`` is forced to ``True``.
        params: Iterable of raw parameter vectors, one per batch slot.

    Returns:
        ``numpy.ndarray`` of shape ``(gpyopt_batch_size, 1)`` holding the
        average result of every parameter set. A run whose result file cannot
        be read contributes ``gpyopt_fail_res`` instead.
    """
    import os

    ensure_dir_exists("run_parameters")
    params = [get_params(p, domain) for p in params]
    np.save("run_parameters/params.npy", params)
    assert len(params) == gpyopt_batch_size
    args.wait = True
    batch.run_tasks(args)

    # get results
    #return np.random.randn(gpyopt_batch_size, 1)
    res = np.zeros((gpyopt_batch_size, 1))
    runs_per_param = len(val_cancer_type_splits) * len(seeds)
    for param_id in range(gpyopt_batch_size):
        tot_res = 0
        for val_cancertypes in val_cancer_type_splits:
            data_name = '-'.join(val_cancertypes).replace(' ', '_')
            for seed in seeds:
                #full_model_id = "%s-%d-%s-s%d%s" % (data_name, repr_dim, param_id, seed, id_suffix)
                full_model_id = "%s-%s-%d%s" % (param_id, data_name, seed, id_suffix)
                filename = "param_opt/opt_result-%s.txt" % (full_model_id)
                # Loading and deleting are handled separately: previously a
                # failing delete after a successful load added BOTH the real
                # value and the failure penalty to the total.
                try:
                    value = np.loadtxt(filename)
                except Exception:
                    tot_res += gpyopt_fail_res
                    logging.info('Warning, could not load "%s"' % filename)
                    continue
                tot_res += value
                try:
                    os.remove(filename)
                except OSError:
                    logging.info('Warning, could not remove "%s"' % filename)
        res[param_id] = tot_res / runs_per_param
    return res
def run_learning_and_mapping(args, fixed_params, task_param, seeds, slurm_args=None):
    """Launch one 'learn_and_map' task per seed and wait for completion.

    Every seed gets its own shallow copy of ``task_param`` tagged with a
    running ``param_id`` and the seed. All of ``cancer_type_pairs`` is treated
    as public data and no private pairs are passed.

    Args:
        args: Batch runner arguments; ``args.wait`` is forced to ``True``.
        fixed_params: Namespace whose attributes are shared by all tasks.
        task_param: Template namespace copied once per seed.
        seeds: Sequence of random seeds.
        slurm_args: Optional arguments forwarded to ``batch2.run_tasks``.
    """
    import copy

    logging.info('Running final tests with...')

    # one independent copy of the template per seed
    task_params = [copy.copy(task_param) for _ in seeds]
    for index, (seed, single_param) in enumerate(zip(seeds, task_params)):
        single_param.param_id = index
        single_param.seed = seed

    ensure_dir_exists("run_parameters")

    all_cancertypes = sum(cancer_type_pairs, [])
    common_params = SimpleNamespace(
        **vars(fixed_params),
        priv_cancertype_pairs=[],
        pub_cancertypes=all_cancertypes,
        task_type='learn_and_map',
    )

    params_file = "run_parameters/batch-%s.pkl" % (fixed_params.test_name)
    args.wait = True
    batch2.run_tasks(args, common_params, task_params,
                     slurm_args=slurm_args,
                     params_file=params_file)
def cleanup_files(self):
    """Flush stderr/stdout, then delete and recreate both work directories.

    The shared network directory is removed before the top directory, and
    the top directory is recreated before the shared one.
    """
    for stream in (sys.stderr, sys.stdout):
        stream.flush()

    shared_dir_attr = 'network_shared_path'
    top_dir_attr = 'top_directory'

    # tear down: shared path first, then the top directory
    for attr in (shared_dir_attr, top_dir_attr):
        deltree(getattr(self.params, attr))

    # rebuild in the opposite order
    for attr in (top_dir_attr, shared_dir_attr):
        ensure_dir_exists(getattr(self.params, attr))
def task(args):
    """Train an encoder/decoder and record its reconstruction error.

    Reads the data set ``<data_type>_<seed>`` through ``dataReader.main``,
    trains the algorithm built by ``makeAlg`` while logging the relative MSE
    on the test set after every callback, then writes the final MSE and
    relative MSE to ``res/final-encdec-mse-<data_type>-<seed>-<algName>.txt``.
    When the module-level ``save_pred`` flag is set, the final predictions
    are stored under ``pred/``.

    Args:
        args: ``((data_type, repr_dim), seed, (algName, _, makeAlg))``.
    """
    (data_type, repr_dim), seed, (algName, _, makeAlg) = args
    logging.info("datatype = %s, seed = %d, algorithm = %s", data_type, seed, algName)

    # read the data sets
    logging.info("Reading data...")
    y_train, x_train, y_test, x_test = dataReader.main("%s_%d" % (data_type, seed))
    data_dim = x_train.shape[1]
    logging.info(" * training set: %d x %d" % x_train.shape)
    logging.info(" * testing set: %d x %d" % x_test.shape)

    # init rng
    np.random.seed(seed)

    logging.info("Running and evaluating the algorithm...")
    logging.info(" * using representation with dimension = %d", repr_dim)

    # init the algorithm
    alg = makeAlg(data_dim, repr_dim)

    # create output dir if does not exist
    ensure_dir_exists('res')

    progress_filename = 'res/progress-encdec-mse-%s-%d-%s.txt' % (data_type, seed, algName)
    # The progress file is only needed while learning; the context manager
    # guarantees it is flushed and closed even if training raises (it used
    # to be left open for the rest of the process).
    with open(progress_filename, 'w', encoding='utf-8') as progress_file:

        # define the progress saving function
        def save_progress():
            x_test_pred = alg.decode(alg.encode(x_test))
            rel_mse = relative_mean_squared_error(x_test, x_test_pred)
            progress_file.write("%g\n" % rel_mse)

        # fit to the training data
        alg.learn(x_train,
                  log_file_prefix=("log/%s-%d-%s" % (data_type, seed, algName)),
                  callbacks=[save_progress])

    # TODO: remove?
    # NOTE: from here on the "test" data is the training data, so the final
    # numbers below measure the training reconstruction error.
    x_test = x_train

    # test with the testing data
    x_test_pred = alg.decode(alg.encode(x_test))

    ensure_dir_exists('pred')
    pred_filename = 'pred/final-encdec-%s-%d-%s' % (data_type, seed, algName)
    if save_pred:
        np.save(pred_filename, x_test_pred)

    #from sklearn import metrics
    #mse = metrics.mean_squared_error(x_test, x_test_pred,
    #    multioutput='uniform_average')
    #explained_var = metrics.explained_variance_score(x_test, x_test_pred,
    #    multioutput='uniform_average')
    mse = mean_squared_error(x_test, x_test_pred)
    rel_mse = relative_mean_squared_error(x_test, x_test_pred)
    logging.info("Result: rel_mse = %g", rel_mse)

    logging.info("Writing results to a file...")
    res_filename = 'res/final-encdec-mse-%s-%d-%s.txt' % (data_type, seed, algName)
    with open(res_filename, 'w', encoding='utf-8') as f:
        f.write("data = %-16s seed = %-4d alg = %-10s " % (data_type, seed, algName))
        f.write("mse = %.6f " % mse)
        f.write("rel_mse = %.6f " % rel_mse)
        f.write("\n")
def task(args):
    """Encode the private data set with a stored model and save the result.

    Loads ``data/<data_set>.h5`` (HDF key ``data_type``), optionally
    normalizes it, obtains a model via ``load_model(repr_dim)``, writes the
    relative reconstruction MSE to ``res/`` and the encoded representation as
    CSV to ``data_repr/``.

    Args:
        args: ``(repr_dim, (alg_id, seed, load_model))`` where ``load_model``
            is a callable taking the representation size.
    """
    repr_dim, (alg_id, seed, load_model) = args
    logging.info("representation size = %d, algorithm = %s, seed = %d", repr_dim, alg_id, seed)

    # read the PADS gene expression data
    logging.info("Reading gene expression data...")
    import pandas
    data = pandas.read_hdf("data/%s.h5" % (data_set), data_type)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent accessor in both old and new releases.
    x = data.values
    logging.info(" * data shape: %d x %d" % x.shape)

    #logging.info("Filter and normalize...")
    ## load gene names that appear also in TCGA data
    #tcga_gene_names = np.array(getHDF5data("data/%s_genes.h5" % (aux_data_set),
    #                           True, False)[0], dtype=str)
    #in_tcga = np.array([(gene_name in tcga_gene_names) for gene_name in gene_names])
    #assert(np.sum(in_tcga) == len(tcga_gene_names))
    ## use only those genes
    #x = x[:,in_tcga]

    # normalize the input to _total_ unit variance and zero mean
    if normalize_data:
        # Out-of-place arithmetic leaves the DataFrame's buffer untouched and
        # also works when .values returns a read-only view.
        x = x - np.mean(x)
        x = x / np.std(x)
        x = x - np.mean(x, axis=0)

    logging.info(" * data shape after preprocessing: %d x %d" % x.shape)

    # init rng
    np.random.seed(seed)

    # load the model
    logging.info("Loading the model...")
    alg = load_model(repr_dim)

    # get the representation
    logging.info("Computing the representation or size %d..." % (repr_dim))
    x_repr = alg.encode(x)

    # test to predict the data itself
    x_pred = alg.decode(x_repr)
    rel_mse = relative_mean_squared_error(x, x_pred)
    logging.info(" * reconstruct the data: rel_mse = %g", rel_mse)
    ensure_dir_exists("res")
    with open("res/private-encdec-rel_mse-%d-%s-%s-s%d%s.txt" %
              (repr_dim, aux_data_set, alg_id, seed, id_suffix),
              'w', encoding='utf-8') as f:
        f.write("%.6f\n" % rel_mse)

    # save the representation
    logging.info("Saving the representation...")
    ensure_dir_exists("data_repr")
    np.savetxt("data_repr/repr-%d-%s-%s-s%d%s.csv" %
               (repr_dim, aux_data_set, alg_id, seed, id_suffix),
               x_repr, delimiter=',')
def run_optimization_batch(args, fixed_params, task_params, seeds, slurm_args=None):
    """Evaluate a batch of parameter candidates with cross-validation folds.

    The non-private cancer type pairs are split round-robin into
    ``fixed_params.param_opt_folds`` folds. For every fold the pairs of that
    fold are used for validation and the remaining ones as public learning
    data; all candidates are run through ``batch2.run_tasks`` and their
    result files are read from ``param_opt/``.

    Args:
        args: Batch runner arguments; ``args.wait`` is forced to ``True``.
        fixed_params: Namespace shared by all tasks; must provide
            ``priv_cancertype_pair``, ``param_opt_folds`` and ``test_name``.
        task_params: One namespace per candidate; ``param_id`` and ``seed``
            are assigned in place.
        seeds: Seeds, paired with ``task_params`` positionally.
        slurm_args: Optional arguments forwarded to ``batch2.run_tasks``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(task_params), 1)`` with the result
        of each candidate averaged over the folds. A missing or unreadable
        result is replaced by ``gpyopt_fail_res`` plus Gaussian noise scaled
        by ``gpyopt_fail_res_std``.
    """
    import os

    ensure_dir_exists("run_parameters")

    param_ids = range(len(task_params))
    for param, seed, param_id in zip(task_params, seeds, param_ids):
        param.param_id = param_id
        param.seed = seed
    assert len(task_params) == gpyopt_batch_size

    nonpriv_cancertype_pairs = [
        ctp for ctp in cancer_type_pairs
        if ctp != fixed_params.priv_cancertype_pair
    ]
    assert len(nonpriv_cancertype_pairs) == len(cancer_type_pairs) - 1

    n_folds = fixed_params.param_opt_folds
    res = np.zeros((len(task_params), n_folds))
    for fold in range(n_folds):
        val_cancertype_pairs = [
            ctp for (i, ctp) in enumerate(nonpriv_cancertype_pairs)
            if i % n_folds == fold
        ]
        learn_cancertype_pairs = [
            ctp for (i, ctp) in enumerate(nonpriv_cancertype_pairs)
            if i % n_folds != fold
        ]
        assert (len(val_cancertype_pairs) + len(learn_cancertype_pairs)
                == len(nonpriv_cancertype_pairs))

        common_params = SimpleNamespace(
            **fixed_params.__dict__,
            priv_cancertype_pairs=val_cancertype_pairs,
            pub_cancertypes=sum(learn_cancertype_pairs, []),
            task_type='paramopt',
        )
        args.wait = True
        batch2.run_tasks(args, common_params, task_params,
                         slurm_args=slurm_args,
                         params_file=("run_parameters/batch-%s.pkl" %
                                      (fixed_params.test_name)))

        # get results
        for param_id in param_ids:
            full_model_id = "%s-%s" % (fixed_params.test_name, param_id)
            filename = "param_opt/opt_result-%s.txt" % (full_model_id)
            # Load and delete are separate steps so that a failing delete
            # cannot overwrite a correctly loaded result with the penalty.
            try:
                res[param_id, fold] = np.loadtxt(filename)
            except Exception:
                res[param_id, fold] = (gpyopt_fail_res +
                                       np.random.randn() * gpyopt_fail_res_std)
                logging.info('Warning, could not load "%s"' % filename)
                continue
            try:
                os.remove(filename)
            except OSError:
                logging.info('Warning, could not remove "%s"' % filename)

    return np.mean(res, axis=1, keepdims=True)
def rclone_upload(local_path: str,
                  rc_remote_path: str,
                  rc_bwlimit: str = None,
                  rc_logfile: str = None,
                  rc_dry_run: bool = False) -> None:
    """Use rclone to move something.

    Runs ``rclone move <local_path> <rc_remote_path>`` as a subprocess. The
    process' stdout and stderr are captured in fixed files under ``debug/``.

    https://rclone.org/docs/

    Args:
        local_path: Source path handed to rclone.
        rc_remote_path: Destination handed to rclone.
        rc_bwlimit: Optional value for ``--bwlimit``.
        rc_logfile: Optional rclone log file; also switches rclone to
            ``--log-level DEBUG``.
        rc_dry_run: When true, ``--dry-run`` is appended.

    Raises:
        AssertionError: If rclone exits with a nonzero return code.
    """
    logging.debug('rclone_upload() args={0!r}'.format(locals()))  # SUPER DEBUG
    logging.info(
        'Using rclone to move local_path={l!r} to rc_remote_path={r!r}'.format(
            l=local_path, r=rc_remote_path))

    # command - prepare args
    # https://rclone.org/commands/rclone_move/
    cmd = [
        'rclone',
        'move',
        local_path,
        rc_remote_path,
    ]
    if rc_bwlimit:
        # Throttle/ratelimit
        cmd.append('--bwlimit')
        cmd.append('{0}'.format(rc_bwlimit))
    if rc_logfile:
        # Verbose debugging info. https://rclone.org/docs/#log-level-level
        cmd.append('--log-file={0}'.format(rc_logfile))
        cmd.append('--log-level')
        cmd.append('DEBUG')
    if rc_dry_run:
        # https://rclone.org/docs/#n-dry-run
        cmd.append('--dry-run')
    logging.debug('cmd={0!r}'.format(cmd))

    # command - execute
    # Protect against missing dir for stdout/stderr temp files.
    common.ensure_dir_exists(dir_path=os.path.join('debug'))
    stdout_path = os.path.join('debug', 'ia2rc.rclone_upload.stdout.txt')
    stderr_path = os.path.join('debug', 'ia2rc.rclone_upload.stderr.txt')
    # File objects required to capture stdout and stderr.
    with open(stdout_path, 'w') as f_stdout, open(stderr_path, 'w') as f_stderr:
        cmd_res = subprocess.run(
            args=cmd,
            encoding='utf8',
            stdout=f_stdout,
            stderr=f_stderr,
        )

    # command - capture and tolerate result
    logging.debug('cmd={0!r}'.format(cmd))  # Extra-detailed logging for dev only
    logging.debug('cmd_res={0!r}'.format(cmd_res))
    logging.debug('cmd_res.returncode={0!r}'.format(cmd_res.returncode))
    # Nonzero means problems occured. An explicit raise is used instead of an
    # assert statement so the check survives `python -O`; the exception type
    # is kept for existing callers.
    # TODO: Check return code better -2020-07-21.
    if cmd_res.returncode != 0:
        raise AssertionError(
            'rclone move failed: returncode={0!r}, see stderr capture at {1!r}'
            .format(cmd_res.returncode, stderr_path))

    logging.info(
        'Finished rclone move local_path={l!r} to rc_remote_path={r!r}'.format(
            l=local_path, r=rc_remote_path))
def task(args):
    """Train an algorithm on synthetic 2-D Gaussian data and log its error.

    1000 standard-normal samples are linearly mixed into a correlated 2-D
    data set, the algorithm built by ``makeAlg`` is trained on it and the
    relative reconstruction MSE on the same data is logged. When the
    module-level ``save_pred`` flag is set, both the generated data and the
    reconstruction are saved as ``.npy`` files.

    Args:
        args: ``(seed, (algName, _, makeAlg))``. The representation size
            ``repr_dim`` is read from module scope.
    """
    seed, (algName, _, makeAlg) = args
    data_type = "vae_test"
    logging.info("datatype = %s, seed = %d, algorithm = %s", data_type, seed, algName)

    # init rng, then generate the data
    np.random.seed(seed)
    x = np.random.normal(0.0, 1.0, (1000, 2))
    x = np.dot(x, np.array([[5.0, 3.0], [0.3, -0.5]]))
    data_dim = x.shape[1]
    # the same matrix serves as training and testing set
    logging.info(" * training set: %d x %d" % x.shape)
    logging.info(" * testing set: %d x %d" % x.shape)

    logging.info("Running and evaluating the algorithm...")
    logging.info(" * using representation with dimension = %d", repr_dim)

    # init the algorithm
    alg = makeAlg(data_dim, repr_dim)

    # create output dir if does not exist
    #ensure_dir_exists('res')

    # define the progress saving function
    #progress_filename = 'res/progress-encdec-mse-%s-%d-%s.txt' % (data_type, seed, algName)
    #progress_file = open(progress_filename, 'w', encoding='utf-8')
    #def save_progress():
    #  x_test_pred = alg.decode(alg.encode(x_test))
    #  rel_mse = relative_mean_squared_error(x_test, x_test_pred)
    #  progress_file.write("%g\n" % rel_mse)

    # fit to the training data
    alg.learn(x, log_file_prefix=("log/%s-%d-%s" % (data_type, seed, algName)))

    x_test = x

    # test with the testing data
    x_test_pred = alg.decode(alg.encode(x_test))
    #x_test_pred = alg.decode_generate(alg.encode(x_test))
    #x_test_pred = alg.decode_generate(alg.encode_generate(x_test))
    #x_test_pred = alg.decode(alg.encode_generate(x_test))

    ensure_dir_exists('pred')
    data_filename = 'data/generated/%s-%d' % (data_type, seed)
    pred_filename = 'pred/final-encdec-%s-%d-%s' % (data_type, seed, algName)
    if save_pred:
        # np.save does not create parent directories; previously only
        # 'pred' was ensured, so the first save failed on a fresh checkout.
        ensure_dir_exists('data/generated')
        np.save(data_filename, x_test)
        np.save(pred_filename, x_test_pred)

    #from sklearn import metrics
    #mse = metrics.mean_squared_error(x_test, x_test_pred,
    #    multioutput='uniform_average')
    #explained_var = metrics.explained_variance_score(x_test, x_test_pred,
    #    multioutput='uniform_average')
    mse = mean_squared_error(x_test, x_test_pred)
    rel_mse = relative_mean_squared_error(x_test, x_test_pred)
    logging.info("Result: rel_mse = %g", rel_mse)
def create_top_dirs(prm):
    """Recreate the shared network directory from scratch.

    An existing ``prm.network_shared_path`` is removed first. When
    ``prm.host_set`` is anything other than an empty list, pauses are
    inserted after the removal and after the re-creation.
    """
    share_dir = prm.network_shared_path
    multi_host = prm.host_set != []

    if os.path.exists(share_dir):
        shutil.rmtree(share_dir)
        if multi_host:
            # so all remote clients see that directory was recreated
            time.sleep(2.1)

    common.ensure_dir_exists(share_dir)

    if not multi_host:
        return

    # workaround to force cross-host synchronization
    os.listdir(share_dir)
    # lets NFS mount option actimeo=1 take effect
    time.sleep(1.1)
def task(args):
    """Compute the clipping thresholds for one (n, d, e) setting and save them.

    With no private data (``n == 0``) or an infinite ``e`` both thresholds
    are 0.0; otherwise they come from ``dp.omega``. Each threshold is written
    to its own text file under ``drugsens_params/clipping``.

    Args:
        args: ``(n, d, e)`` triple.
    """
    import diffpri as dp

    n, d, e = args
    logging.info("n = %d, d = %d, e = %s", n, d, e)

    # no pv data -> no clipping
    skip_clipping = n == 0 or np.isinf(e)
    wx, wy = (0.0, 0.0) if skip_clipping else dp.omega(n, d, e, mcmc)

    ensure_dir_exists("drugsens_params/clipping")
    outputs = (
        ("drugsens_params/clipping/wx_n%d_d%d_e%s.txt", wx),
        ("drugsens_params/clipping/wy_n%d_d%d_e%s.txt", wy),
    )
    for path_template, threshold in outputs:
        with open(path_template % (n, d, e), 'w') as f:
            f.write("%s" % threshold)
def run_optimization(args, domain, constraints, batch_size, max_iter,
                     max_duration=None, deadline=None):
    """Run batched Bayesian optimization and return the best parameters.

    A random initial design of ``batch_size`` points is evaluated with
    ``run_batch``. Then up to ``max_iter`` further batches are suggested by
    GPyOpt (EI acquisition, local penalization). All evaluated parameters and
    results are checkpointed to ``param_opt/`` after every iteration.

    Args:
        args: Arguments forwarded to ``run_batch``.
        domain: GPyOpt domain description.
        constraints: GPyOpt constraint description.
        batch_size: Number of candidates evaluated per batch.
        max_iter: Maximum number of optimization iterations after the
            initial design.
        max_duration: Optional ``datetime.timedelta``; converted to a
            deadline relative to now.
        deadline: Optional absolute ``datetime.datetime``. If both limits
            are given, the earlier one applies.

    Returns:
        The row of parameters with the highest observed result (results are
        maximized: they are negated before being handed to the minimizer).
    """
    logging.info('Starting parameter optimization...')
    import GPyOpt

    ensure_dir_exists("param_opt")

    if max_duration is not None:
        new_dl = datetime.datetime.now() + max_duration
        if deadline is None or new_dl < deadline:
            deadline = new_dl

    initial_design_type = 'random'
    initial_design_numdata = batch_size
    logging.info('Selecting initial parameters...')
    space = GPyOpt.core.task.space.Design_space(domain, constraints)
    params = GPyOpt.experiment_design.initial_design(
        initial_design_type, space, initial_design_numdata)

    logging.info('Running...')
    results = run_batch(args, params)
    all_params = params
    all_results = results

    for i in range(max_iter):
        print(np.hstack((all_params, all_results)), flush=True)
        logging.info('Selecting a new set of parameters...')
        bo = GPyOpt.methods.BayesianOptimization(
            f=None,
            domain=domain,
            X=all_params,
            Y=-all_results,
            acquisition_type='EI',
            normalize_Y=True,
            evaluator_type='local_penalization',
            batch_size=batch_size,
            acquisition_jitter=0,
            maximize=False)
        params = bo.suggest_next_locations()

        logging.info('Running...')
        results = run_batch(args, params)
        all_params = np.vstack((all_params, params))
        all_results = np.vstack((all_results, results))

        np.save("param_opt/opt_params%s.npy" % id_suffix, all_params)
        np.save("param_opt/opt_results%s.npy" % id_suffix, all_results)

        # Without any time limit `deadline` stays None; comparing a datetime
        # with None would raise TypeError, so the check is guarded.
        if deadline is not None and datetime.datetime.now() >= deadline:
            logging.info('Gpyopt iteration %d: Time based stopping' % (i))
            break

    # all_results has one row per evaluated parameter set, so the flat
    # argmax is the row index. (The original referenced the non-existent
    # `np.results` here.)
    return all_params[np.argmax(all_results)]
def run_test(args, fixed_params, task_param, seeds, slurm_args=None):
    """Run the final test once per seed and store the collected results.

    Every seed gets a shallow copy of ``task_param``. The private cancer type
    pair of ``fixed_params`` is used for prediction and all remaining pairs
    as public data. The per-seed results are read from ``param_opt/`` and
    written to ``res/test_results-<test_name>.txt``; a result that cannot be
    loaded is stored as NaN.

    Args:
        args: Batch runner arguments; ``args.wait`` is forced to ``True``.
        fixed_params: Namespace shared by all tasks; must provide
            ``priv_cancertype_pair`` and ``test_name``.
        task_param: Template namespace copied once per seed.
        seeds: Sequence of random seeds.
        slurm_args: Optional arguments forwarded to ``batch2.run_tasks``.
    """
    import copy
    import os

    logging.info('Running final tests with...')

    task_params = [copy.copy(task_param) for _ in seeds]
    param_ids = range(len(task_params))
    for param, seed, param_id in zip(task_params, seeds, param_ids):
        param.param_id = param_id
        param.seed = seed

    ensure_dir_exists("run_parameters")

    nonpriv_cancertype_pairs = [
        ctp for ctp in cancer_type_pairs
        if ctp != fixed_params.priv_cancertype_pair
    ]
    assert len(nonpriv_cancertype_pairs) == len(cancer_type_pairs) - 1

    common_params = SimpleNamespace(
        **fixed_params.__dict__,
        priv_cancertype_pairs=[fixed_params.priv_cancertype_pair],
        pub_cancertypes=sum(nonpriv_cancertype_pairs, []),
        task_type='test',
    )
    args.wait = True
    batch2.run_tasks(args, common_params, task_params,
                     slurm_args=slurm_args,
                     params_file=("run_parameters/batch-%s.pkl" %
                                  (fixed_params.test_name)))

    # get results
    res = np.zeros((len(task_params), 1))
    for param_id in param_ids:
        full_model_id = "%s-%s" % (fixed_params.test_name, param_id)
        filename = "param_opt/opt_result-%s.txt" % (full_model_id)
        # Load and delete are separate steps so that a failing delete cannot
        # replace a correctly loaded result with NaN.
        try:
            res[param_id] = np.loadtxt(filename)
        except Exception:
            res[param_id] = np.nan  #gpyopt_fail_res
            logging.info('Warning, could not load "%s"' % filename)
            continue
        try:
            os.remove(filename)
        except OSError:
            logging.info('Warning, could not remove "%s"' % filename)

    # np.savetxt does not create the target directory.
    ensure_dir_exists("res")
    filename = "res/test_results-%s.txt" % (fixed_params.test_name)
    logging.info("Writing final results to '%s'" % filename)
    np.savetxt(filename, res)
def run_optimization_batch(args, fixed_params, task_params, seeds, slurm_args=None):
    """Evaluate one batch of parameter candidates on the validation types.

    Each candidate is tagged with a ``param_id`` and a seed and executed via
    ``batch2.run_tasks`` with the validation cancer types as prediction
    target and the private cancer types skipped. The results are read from
    ``param_opt/``.

    Args:
        args: Batch runner arguments; ``args.wait`` is forced to ``True``.
        fixed_params: Namespace shared by all tasks; must provide
            ``val_cancertypes``, ``priv_cancertypes`` and ``test_name``.
        task_params: One namespace per candidate, modified in place.
        seeds: Seeds, paired with ``task_params`` positionally.
        slurm_args: Optional arguments forwarded to ``batch2.run_tasks``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(task_params), 1)``. A missing or
        unreadable result is replaced by ``gpyopt_fail_res`` plus Gaussian
        noise scaled by ``gpyopt_fail_res_std``.
    """
    import os

    ensure_dir_exists("run_parameters")

    param_ids = range(len(task_params))
    for param, seed, param_id in zip(task_params, seeds, param_ids):
        param.param_id = param_id
        #param.priv_cancertypes = val_cancertypes
        #param.skip_cancertypes = priv_cancertypes
        param.seed = seed
    #np.save("run_parameters/params-%s.npy" % (test_name), params)
    assert len(task_params) == gpyopt_batch_size

    common_params = SimpleNamespace(
        **fixed_params.__dict__,
        pred_cancertypes=fixed_params.val_cancertypes,
        skip_cancertypes=fixed_params.priv_cancertypes,
        task_type='paramopt',
    )
    args.wait = True
    batch2.run_tasks(args, common_params, task_params,
                     slurm_args=slurm_args,
                     params_file=("run_parameters/batch-%s.pkl" %
                                  (fixed_params.test_name)))

    # get results
    #return np.random.randn(gpyopt_batch_size, 1)
    res = np.zeros((len(task_params), 1))
    for param_id in param_ids:
        full_model_id = "%s-%s" % (fixed_params.test_name, param_id)
        filename = "param_opt/opt_result-%s.txt" % (full_model_id)
        # Load and delete are separate steps so that a failing delete cannot
        # overwrite a correctly loaded result with the failure penalty.
        try:
            res[param_id] = np.loadtxt(filename)
        except Exception:
            res[param_id] = (gpyopt_fail_res +
                             np.random.randn() * gpyopt_fail_res_std)
            logging.info('Warning, could not load "%s"' % filename)
            continue
        try:
            os.remove(filename)
        except OSError:
            logging.info('Warning, could not remove "%s"' % filename)
    return res
def run_test(args, fixed_params, task_param, seeds, slurm_args=None):
    """Run the test for every cancer type pair and seed; store the results.

    One task is created per (cancer type pair, seed) combination, ordered
    pair-major. After the batch finished, the results are read from
    ``param_opt/`` and written to ``res/test_results-<test_name>.txt`` as a
    matrix with one row per cancer type pair and one column per seed. A
    result that cannot be loaded is stored as NaN.

    Args:
        args: Batch runner arguments; ``args.wait`` is forced to ``True``.
        fixed_params: Namespace shared by all tasks; must provide
            ``test_name``.
        task_param: Unused; kept for signature compatibility.
        seeds: Sequence of random seeds.
        slurm_args: Optional arguments forwarded to ``batch2.run_tasks``.
    """
    import os

    logging.info('Running tests...')

    # pair-major task order; the final reshape below relies on it
    task_params = [
        SimpleNamespace(
            pred_cancertypes=priv_cancertypes,
            seed=seed,
        )
        for priv_cancertypes in cancer_type_pairs
        for seed in seeds
    ]
    param_ids = range(len(task_params))
    for param, param_id in zip(task_params, param_ids):
        param.param_id = param_id

    ensure_dir_exists("run_parameters")
    #np.save("run_parameters/params-%s.npy" % (test_name), params)

    common_params = SimpleNamespace(**fixed_params.__dict__, )
    args.wait = True
    batch2.run_tasks(args, common_params, task_params,
                     slurm_args=slurm_args,
                     params_file=("run_parameters/batch-%s.pkl" %
                                  (fixed_params.test_name)))

    # get results
    res = np.zeros((len(task_params), 1))
    for param_id in param_ids:
        full_model_id = "%s-%s" % (fixed_params.test_name, param_id)
        filename = "param_opt/opt_result-%s.txt" % (full_model_id)
        # Load and delete are separate steps so that a failing delete cannot
        # replace a correctly loaded result with NaN.
        try:
            res[param_id] = np.loadtxt(filename)
        except Exception:
            res[param_id] = np.nan
            logging.info('Warning, could not load "%s"' % filename)
            continue
        try:
            os.remove(filename)
        except OSError:
            logging.info('Warning, could not remove "%s"' % filename)

    res = np.reshape(res, (len(cancer_type_pairs), len(seeds)))

    # np.savetxt does not create the target directory.
    ensure_dir_exists("res")
    filename = "res/test_results-%s.txt" % (fixed_params.test_name)
    logging.info("Writing final results to '%s'" % filename)
    np.savetxt(filename, res)
def task(args):
    """Train an encoder to reproduce the PCA projection of the data.

    Reads the data set ``<data_type>_<seed>`` through ``dataReader.main``,
    fits a scikit-learn PCA with ``repr_dim`` components on the training
    data and uses its projection as the learning target. The relative MSE
    between the PCA projection and the encoder output is appended to a
    progress file under ``res/`` from every callback.

    Args:
        args: ``(data_type, seed, (algName, _, makeAlg))``. The
            representation size ``repr_dim`` is read from module scope.
    """
    data_type, seed, (algName, _, makeAlg) = args
    logging.info("datatype = %s, seed = %d, algorithm = %s", data_type, seed, algName)

    # read the data sets
    logging.info("Reading data...")
    y_train, x_train, y_test, x_test = dataReader.main("%s_%d" % (data_type, seed))
    data_dim = x_train.shape[1]
    logging.info(" * training set: %d x %d" % x_train.shape)
    logging.info(" * testing set: %d x %d" % x_test.shape)

    # init rng
    np.random.seed(seed)

    # NOTE: the training data doubles as test data from here on
    x_test = x_train

    logging.info("Running and evaluating the algorithm...")

    # init the algorithm
    alg = makeAlg(data_dim, repr_dim)

    # create output dir if does not exist
    ensure_dir_exists('res')

    # the targets read from disk are replaced by the PCA projections
    from sklearn.decomposition import PCA as sk_PCA
    pca = sk_PCA(n_components=repr_dim)
    pca.fit(x_train)
    y_train = pca.transform(x_train)
    y_test = pca.transform(x_test)

    progress_filename = 'res/progress-enc-mse-%s-%d-%s.txt' % (data_type, seed, algName)
    # The context manager guarantees the progress file is flushed and closed
    # once learning ends or fails (it used to be left open).
    with open(progress_filename, 'w', encoding='utf-8') as progress_file:

        # define the progress saving function
        def save_progress():
            y_test_pred = alg.encode(x_test)
            rel_mse = relative_mean_squared_error(y_test, y_test_pred)
            progress_file.write("%g\n" % rel_mse)

        # fit
        alg.learn(x_train, y_train,
                  log_file_prefix=("log/%s-%d-%s" % (data_type, seed, algName)),
                  callbacks=[save_progress])
def run_optimization(args, domain, constraints, batch_size, max_iter):
    """Run batched Bayesian optimization for a fixed number of iterations.

    Starts from a random design of ``batch_size`` points evaluated with
    ``run_batch``; every iteration asks GPyOpt for the next batch, evaluates
    it and checkpoints all parameters and results under ``param_opt/``.

    Args:
        args: Arguments forwarded to ``run_batch``.
        domain: GPyOpt domain description.
        constraints: GPyOpt constraint description.
        batch_size: Number of candidates per batch.
        max_iter: Number of optimization iterations after the initial design.
    """
    logging.info('Starting parameter optimization...')
    import GPyOpt

    ensure_dir_exists("param_opt")

    logging.info('Selecting initial parameters...')
    design_space = GPyOpt.core.task.space.Design_space(domain, constraints)
    candidates = GPyOpt.experiment_design.initial_design(
        'random', design_space, batch_size)

    logging.info('Running...')
    history_params = candidates
    history_results = run_batch(args, candidates)

    # options that stay the same for every optimizer instance
    optimizer_options = dict(
        f=None,
        domain=domain,
        acquisition_type='EI',
        normalize_Y=True,
        evaluator_type='local_penalization',
        batch_size=batch_size,
        acquisition_jitter=0,
    )

    for _ in range(max_iter):
        for snapshot in (history_params, history_results):
            print(snapshot, flush=True)

        logging.info('Selecting a new set of parameters...')
        optimizer = GPyOpt.methods.BayesianOptimization(
            X=history_params, Y=history_results, **optimizer_options)
        candidates = optimizer.suggest_next_locations()

        logging.info('Running...')
        outcome = run_batch(args, candidates)
        history_params = np.vstack((history_params, candidates))
        history_results = np.vstack((history_results, outcome))

        # checkpoint everything evaluated so far
        np.save("param_opt/opt_params%s.npy" % id_suffix, history_params)
        np.save("param_opt/opt_results%s.npy" % id_suffix, history_results)
def task(args):
    """Plot how each representation component projects back onto the genes.

    Loads the gene expression matrix from ``data/<data_set>.h5``, encodes it
    with the model returned by ``load_model(repr_dim)`` and, for every
    representation component, decodes a copy of the representation in which
    all other components are fixed to their mean. The per-gene standard
    deviation of that decoding is drawn as one bar chart per component and
    the figure is saved under ``figs/tcga_repr_projections``.

    Args:
        args: ``(repr_dim, (alg_id, load_model))``.
    """
    repr_dim, (alg_id, load_model) = args
    logging.info("representation size = %d, algorithm = %s", repr_dim, alg_id)

    # read the GDSC gene expression data
    logging.info("Reading gene expression data...")
    import pandas
    data = pandas.read_hdf("data/%s.h5" % (data_set),
                           'redistributed_gene_expressions')
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent accessor in both old and new releases.
    x = data.values
    logging.info(" * data shape: %d x %d" % x.shape)

    # normalize the input to _total_ unit variance and zero mean
    if normalize_data:
        # Out-of-place arithmetic leaves the DataFrame's buffer untouched and
        # also works when .values returns a read-only view.
        x = x - np.mean(x)
        x = x / np.std(x)

    # init rng
    np.random.seed(0)

    # load the model
    logging.info("Loading the model...")
    alg = load_model(repr_dim)

    # get the representation
    logging.info("Computing the representation or size %d..." % (repr_dim))
    x_repr = alg.encode(x)

    # variance of each representation component
    #repr_vars = np.var(x_repr, axis=0)
    repr_avg = np.mean(x_repr, axis=0)

    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    # squeeze=False keeps `axes` a 2-D array even for repr_dim == 1; with
    # the default a single Axes object is returned and indexing it fails.
    fig, axes = plt.subplots(nrows=repr_dim, ncols=1, figsize=(16, 10),
                             sharex=True, sharey=True, squeeze=False)

    logging.info("Computing and plotting projections...")
    x_repr_onedim = np.empty(x_repr.shape)
    for i in range(repr_dim):
        logging.info(" * component %d/%d" % (i + 1, repr_dim))
        # keep component i, replace every other component by its mean
        x_repr_onedim[:, :] = repr_avg
        x_repr_onedim[:, i] = x_repr[:, i]
        repr_proj = alg.decode(x_repr_onedim)
        proj_std = np.std(repr_proj, axis=0)
        #plt.subplot(repr_dim, 1, i+1)
        axes[i, 0].bar(np.arange(x.shape[1]), proj_std, color='b', edgecolor='none')
        #axes[i].bar(np.arange(50), proj_std[0:50], color='b', edgecolor='none')

    # pyplot labels apply to the current axes only
    plt.ylabel("projection std")
    plt.xlabel("gene")
    #plt.title("repr component %d" % i)

    ensure_dir_exists("figs/tcga_repr_projections")
    figname = "figs/tcga_repr_projections/d%d_%s.png" % (repr_dim, alg_id)
    plt.savefig(figname, format='png', dpi=200)
    plt.close(fig)
def run_alg(x_pub, y_pub, x_priv, y_priv, params, full_model_id):
    """Learn a representation on public data and score it on private data.

    Three stages:

    1. A VAE is trained on the public data (optionally holding out a
       validation split of size ``validation_split``); the relative
       reconstruction MSE is appended to progress files after every epoch.
    2. The private data is encoded with the trained model.
    3. A logistic regression (differentially private by default) is
       evaluated on the encoded private data with stratified
       ``pred_cv_folds``-fold cross-validation.

    Args:
        x_pub: Public feature matrix.
        y_pub: Public labels (only used for splitting).
        x_priv: Private feature matrix.
        y_priv: Private labels.
        params: Namespace with ``repr_dim``, ``hidden_layer_size_mul_log10``,
            ``n_hidden_layers`` and ``learning_rate_log10``.
        full_model_id: Identifier used in progress and log file names.

    Returns:
        The test accuracy averaged over the cross-validation folds.
    """
    ##################################
    #   representation learning
    #################################
    x = x_pub
    y = y_pub

    # separate validation set if needed
    val_x = None
    #val_y = None
    if validation_split:
        logging.info("Splitting into training and validation sets")
        from sklearn.model_selection import train_test_split
        train_x, val_x, train_y, val_y = train_test_split(
            x, y, test_size=validation_split, random_state=0)
        x, y = train_x, train_y
        logging.info(" * training set shape: %d x %d" % x.shape)
        logging.info(" * validation set shape: %d x %d" % val_x.shape)

    data_dim = x.shape[1]
    logging.info(" * data shape after preprocessing: %d x %d" % x.shape)

    repr_dim = int(round(params.repr_dim))

    logging.info("Learning the representation on public data...")
    logging.info(" * learning a representation of size %d", repr_dim)
    start_time = time.time()

    # init the algorithm
    #alg = make_alg(data_dim, repr_dim, num_classes)
    #alg = make_alg(data_dim, repr_dim)
    from models.vae_pytorch import VAE
    alg = VAE().init(
        input_dim = data_dim,
        latent_dim = repr_dim,
        #enc_dims = [],
        enc_dims = [int(10 ** params.hidden_layer_size_mul_log10)*repr_dim] * int(params.n_hidden_layers),
        dec_dims = 'same',
        enc_activations = 'relu',
        dec_activations = 'relu',
        prediction_mean_activation = 'sigmoid',
        prediction_var = 'gs',
        prediction_log_var_min = math.log(0.01**2),
        normalize_input_type = 'quantiles',
        normalize_input_quantile = 0.05,
        normalize_input_axis = 'global',
        normalize_input_target = (0, 1),
        normalize_input_clip = True,
        optimizer = 'Adam',
        optimizer_params = {'lr': 10.0 ** params.learning_rate_log10},
        n_epochs = n_epochs,
        early_stopping = True,
        reduce_lr_on_plateau = False,
        batch_size = batch_size)

    # create output dir if does not exist
    #ensure_dir_exists('res')

    # define the progress saving function
    ensure_dir_exists('param_opt/progress')
    progress_filename = 'param_opt/progress/encdec-mse-%s.txt' % (full_model_id)
    progress_file = open(progress_filename, 'w', encoding='utf-8')
    #aux_progress_filename = 'param_opt/progress/aux-ce-%s.txt' % (full_model_id)
    #aux_progress_file = open(aux_progress_filename, 'w', encoding='utf-8')
    val_progress_file = None
    # try/finally makes sure the progress files are flushed and closed once
    # learning ends or fails; they used to stay open.
    try:
        if val_x is not None:
            val_progress_filename = 'param_opt/progress/encdec-validation-mse-%s.txt' % (full_model_id)
            val_progress_file = open(val_progress_filename, 'w', encoding='utf-8')
            #aux_val_progress_filename = 'param_opt/progress/aux-validation-ce-%s.txt' % (full_model_id)
            #aux_val_progress_file = open(aux_val_progress_filename, 'w', encoding='utf-8')

        def save_progress():
            x_pred = alg.decode(alg.encode(x))
            rel_mse = relative_mean_squared_error(x, x_pred)
            progress_file.write("%g\n" % rel_mse)
            #aux_pred = alg.predict_secondary(x)
            #aux_rel_ce = relative_cross_entropy(y, aux_pred)
            #aux_progress_file.write("%g\n" % aux_rel_ce)
            if val_x is not None:
                val_x_pred = alg.decode(alg.encode(val_x))
                rel_mse = relative_mean_squared_error(val_x, val_x_pred)
                val_progress_file.write("%g\n" % rel_mse)
                #val_aux_pred = alg.predict_secondary(val_x)
                #aux_rel_ce = relative_cross_entropy(val_y, val_aux_pred)
                #aux_val_progress_file.write("%g\n" % aux_rel_ce)

        # fit to the training data
        ensure_dir_exists("param_opt/log/")
        alg.learn(x, validation_data=val_x,
                  log_file_prefix=("param_opt/log/%s" % (full_model_id)),
                  per_epoch_callback_funs=[save_progress],
                  deadline=None, max_duration=repr_max_duration)
    finally:
        progress_file.close()
        if val_progress_file is not None:
            val_progress_file.close()

    # test reconstruction error
    x_pred = alg.decode(alg.encode(x))
    rel_mse = relative_mean_squared_error(x, x_pred)
    # Without a validation split there is nothing to encode; report NaN
    # instead of crashing on encode(None).
    if val_x is not None:
        val_x_pred = alg.decode(alg.encode(val_x))
        val_rel_mse = relative_mean_squared_error(val_x, val_x_pred)
    else:
        val_rel_mse = np.nan
    logging.info(" * final error: rel_mse = %g, val_rel_mse = %g",
                 rel_mse, val_rel_mse)

    elapsed = time.time() - start_time
    logging.info(" * running time = %s", pretty_duration(elapsed))

    ##################################
    #   representation mapping
    #################################
    x = x_priv
    y = y_priv

    # get the representation
    logging.info("Making the representation of private data...")
    x_repr = alg.encode(x)

    # test to predict the data itself
    x_pred = alg.decode(x_repr)
    rel_mse = relative_mean_squared_error(x, x_pred)
    logging.info(" * reconstruct the data: rel_mse = %g", rel_mse)

    ##################################
    #   prediction
    #################################
    x = x_repr

    # private or non-private logistic regression
    private = True

    # test prediction with cross validation
    logging.info("Prediction with %d-fold cross validation...", pred_cv_folds)
    from sklearn.model_selection import StratifiedKFold
    cv = StratifiedKFold(n_splits=pred_cv_folds, shuffle=True, random_state=0)
    avg_test_acc = 0
    for fold, (train, test) in enumerate(cv.split(x, y)):
        logging.info("Fold %d...", fold)
        # index-array selection yields copies, so the in-place scaling below
        # does not touch x
        x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]

        # init rng
        #np.random.seed(seed0)

        logging.info("Bounding the data to 1-sphere...")
        if scale_fun == "norm_max":
            logging.info(" * scale by max norm")
            scale_factor = np.amax(np.linalg.norm(x_train, axis=1))
        elif scale_fun == "dims_max":
            logging.info(" * scale each dimension by max absolute value")
            scale_factor = np.amax(np.abs(x_train), axis=0)
        elif scale_fun == "norm_avg":
            logging.info(" * scale by average norm")
            scale_factor = np.mean(np.linalg.norm(x_train, axis=1))
        elif scale_fun == "dims_std":
            logging.info(" * scale each dimension by standard deviation")
            scale_factor = np.std(x_train, axis=0)
        elif scale_fun == "none":
            scale_factor = 1.0
        else:
            assert False

        # the factor is estimated on the training fold and applied to both
        x_train /= scale_factor * scale_const
        x_test /= scale_factor * scale_const

        if clip == "norm":
            logging.info(" * clip norms to max 1")
            x_train /= np.maximum(np.linalg.norm(x_train, axis=1, keepdims=True) * (1 + bounding_slack), 1)
            x_test /= np.maximum(np.linalg.norm(x_test, axis=1, keepdims=True) * (1 + bounding_slack), 1)
        elif clip == "dims":
            assert False, "not implemented"
        elif clip == "none":
            logging.info(" * no clipping -> no bounding")
            assert private == False  #or np.isinf(epsilon)
        else:
            assert False

        # fit
        logging.info("Fitting a model...")
        if private:
            logging.info(" * DP logistic regression: epsilon=%g, alpha=%g",
                         epsilon, regularizer_strength)
            from models.logistic_regression import DPLogisticRegression
            model = DPLogisticRegression().init(repr_dim, classes=np.unique(y),
                                                alpha=regularizer_strength,
                                                epsilon=epsilon)
        else:
            logging.info(" * logistic regression: alpha=%g", regularizer_strength)
            from sklearn.linear_model import LogisticRegression
            model = LogisticRegression(C=1/regularizer_strength)

        model.fit(x_train, y_train)
        #print(model.predict(x_test))

        # compute mean accuracy on test set
        logging.info("Testing the model...")
        #acc = model.score(x_test, y_test)
        from sklearn.metrics import accuracy_score
        train_acc = accuracy_score(y_train, model.predict(x_train))
        test_acc = accuracy_score(y_test, model.predict(x_test))
        logging.info(" * train accuracy = %.6f", train_acc)
        logging.info(" * test accuracy = %.6f", test_acc)
        avg_test_acc += test_acc

    avg_test_acc /= pred_cv_folds
    logging.info("Average test accuracy = %.6f", avg_test_acc)

    return avg_test_acc
def run_test(args, params):
    """Store the test parameters on disk and run the batch, blocking until done.

    Args:
        args: Batch runner arguments; its ``wait`` attribute is set to
            ``True`` before the tasks are started.
        params: Parameters saved to ``run_parameters/test_params.npy``.
    """
    logging.info('Running final tests with...')

    param_dir = "run_parameters"
    ensure_dir_exists(param_dir)

    param_file = "run_parameters/test_params.npy"
    np.save(param_file, params)

    setattr(args, 'wait', True)
    batch.run_tasks(args)
def learn_repr(x, y, params, full_model_id):
    """Train a representation learning algorithm and return it.

    Optionally holds out a validation split, builds the algorithm selected
    by ``params.repr_alg`` and trains it. After every epoch the relative
    reconstruction MSE on the training (and, if present, validation) data is
    appended to progress files under ``param_opt/progress``.

    Args:
        x: Feature matrix.
        y: Labels (only used for splitting).
        params: Namespace with ``repr_learn_validation_split``, ``repr_dim``,
            ``repr_alg`` and ``repr_learn_max_duration``; also forwarded to
            the algorithm factory.
        full_model_id: Identifier used in progress and log file names.

    Returns:
        The trained algorithm object.
    """
    # separate validation set if needed
    val_x = None
    #val_y = None
    if params.repr_learn_validation_split:
        logging.info("Splitting into training and validation sets")
        from sklearn.model_selection import train_test_split
        train_x, val_x, train_y, val_y = train_test_split(
            x, y,
            test_size=params.repr_learn_validation_split,
            random_state=0)
        x, y = train_x, train_y
        logging.info(" * training set shape: %d x %d" % x.shape)
        logging.info(" * validation set shape: %d x %d" % val_x.shape)

    data_dim = x.shape[1]
    logging.info(" * data shape after preprocessing: %d x %d" % x.shape)

    repr_dim = int(round(params.repr_dim))

    logging.info("Learning the representation on public data...")
    logging.info(" * learning a representation of size %d", repr_dim)
    start_time = time.time()

    (_, _, _, make_alg, _) = select_repr_alg(params.repr_alg)

    # init the algorithm
    #alg = make_alg(data_dim, repr_dim, num_classes)
    #alg = make_alg(data_dim, repr_dim)
    alg = make_alg(data_dim, repr_dim, params)

    # create output dir if does not exist
    #ensure_dir_exists('res')

    # define the progress saving function
    ensure_dir_exists('param_opt/progress')
    progress_filename = 'param_opt/progress/encdec-mse-%s.txt' % (
        full_model_id)
    progress_file = open(progress_filename, 'w', encoding='utf-8')
    #aux_progress_filename = 'param_opt/progress/aux-ce-%s.txt' % (full_model_id)
    #aux_progress_file = open(aux_progress_filename, 'w', encoding='utf-8')
    val_progress_file = None
    # try/finally makes sure the progress files are flushed and closed once
    # learning ends or fails; they used to stay open.
    try:
        if val_x is not None:
            val_progress_filename = 'param_opt/progress/encdec-validation-mse-%s.txt' % (
                full_model_id)
            val_progress_file = open(val_progress_filename, 'w', encoding='utf-8')
            #aux_val_progress_filename = 'param_opt/progress/aux-validation-ce-%s.txt' % (full_model_id)
            #aux_val_progress_file = open(aux_val_progress_filename, 'w', encoding='utf-8')

        def save_progress():
            x_pred = alg.decode(alg.encode(x))
            rel_mse = relative_mean_squared_error(x, x_pred)
            progress_file.write("%g\n" % rel_mse)
            #aux_pred = alg.predict_secondary(x)
            #aux_rel_ce = relative_cross_entropy(y, aux_pred)
            #aux_progress_file.write("%g\n" % aux_rel_ce)
            if val_x is not None:
                val_x_pred = alg.decode(alg.encode(val_x))
                rel_mse = relative_mean_squared_error(val_x, val_x_pred)
                val_progress_file.write("%g\n" % rel_mse)
                #val_aux_pred = alg.predict_secondary(val_x)
                #aux_rel_ce = relative_cross_entropy(val_y, val_aux_pred)
                #aux_val_progress_file.write("%g\n" % aux_rel_ce)

        # fit to the training data
        ensure_dir_exists("param_opt/log/")
        alg.learn(x, validation_data=val_x,
                  log_file_prefix=("param_opt/log/%s" % (full_model_id)),
                  per_epoch_callback_funs=[save_progress],
                  deadline=None,
                  max_duration=params.repr_learn_max_duration)
    finally:
        progress_file.close()
        if val_progress_file is not None:
            val_progress_file.close()

    # test reconstruction error
    x_pred = alg.decode(alg.encode(x))
    rel_mse = relative_mean_squared_error(x, x_pred)
    if val_x is not None:
        val_x_pred = alg.decode(alg.encode(val_x))
        val_rel_mse = relative_mean_squared_error(val_x, val_x_pred)
    else:
        val_rel_mse = np.nan
    logging.info(" * final error: rel_mse = %g, val_rel_mse = %g",
                 rel_mse, val_rel_mse)

    elapsed = time.time() - start_time
    logging.info(" * running time = %s", pretty_duration(elapsed))

    return alg
def task(common_params, task_params):
    """Run one parameter-optimization or learn-and-map task.

    Learns a representation on the public cancer types and then, depending on
    ``task_type``:

    * ``'paramopt'``: maps each private cancer-type pair, runs the prediction
      and writes the average accuracy to
      ``param_opt/opt_result-<full_model_id>.txt``;
    * ``'learn_and_map'``: maps the GDSC data and writes the representation to
      ``data_repr/<gdsc_data_set>-<full_model_id>.csv``.

    Args:
        common_params: Namespace shared by all tasks of a batch; reads
            ``test_name`` and ``task_type`` directly.
        task_params: Namespace specific to this task; reads ``seed`` and
            ``param_id`` directly.  Both namespaces are merged into a single
            ``params`` object (their attribute names must not overlap).

    Raises:
        ValueError: If the task type is neither ``'paramopt'`` nor
            ``'learn_and_map'``.
    """
    # add logging file
    ensure_dir_exists("log")
    log_file_name = "log/opttest-task-%s-%s-s%d.log" % (
        common_params.test_name, common_params.task_type, task_params.seed)
    log_file_handler = logging.FileHandler(log_file_name, mode='w')
    log_file_handler.setFormatter(log_file_formatter)
    logging.getLogger().addHandler(log_file_handler)

    logging.info("test_name = %s", common_params.test_name)
    logging.info("params_id = %s", task_params.param_id)
    logging.info("Running with params: %s" % task_params)
    params = SimpleNamespace(**common_params.__dict__,
                             **task_params.__dict__)

    # fail before the expensive representation learning, not after it
    if params.task_type not in ('paramopt', 'learn_and_map'):
        raise ValueError("invalid task type: %r" % (params.task_type,))

    (gene_expr, cancer_type) = load_data()

    # split
    logging.info("Splitting...")
    logging.info(" * private cancertype pairs: %s"
                 % params.priv_cancertype_pairs)
    logging.info(" * public cancertypes: %s" % params.pub_cancertypes)
    # flatten the list of pairs into one list of private cancer types
    priv_cancertypes = sum(params.priv_cancertype_pairs, [])
    priv = cancer_type.isin(priv_cancertypes)
    pub = cancer_type.isin(params.pub_cancertypes)
    logging.info(" * %d private samples, %d public samples (of %d total)" %
                 (sum(priv), sum(pub), priv.size))

    # `.values` replaces `.as_matrix()`, which was removed in pandas 1.0
    x_pub = gene_expr[pub].values
    y_pub = cancer_type[pub].cat.codes.values

    seed = int(params.seed)

    # init rng
    np.random.seed(seed)
    import torch
    torch.manual_seed(seed)
    if torch.cuda.is_available() and torch.cuda.device_count() > 0:
        torch.cuda.manual_seed(seed)

    full_model_id = "%s-%s" % (common_params.test_name, task_params.param_id)

    logging.info("Representation learning...")
    repr_alg = learn_repr(x_pub, y_pub, params, full_model_id)

    x_pub_repr = map_repr(x_pub, repr_alg, params, full_model_id)

    if params.task_type == 'paramopt':
        acc = np.zeros(len(params.priv_cancertype_pairs))
        for p, priv_cancertype_pair in enumerate(
                params.priv_cancertype_pairs):
            logging.info("Prediction with private cancertypes %s..."
                         % priv_cancertype_pair)
            priv = cancer_type.isin(priv_cancertype_pair)
            x_priv = gene_expr[priv].values
            y_priv = cancer_type[priv].cat.codes.values

            x_priv_repr = map_repr(x_priv, repr_alg, params, full_model_id)

            acc[p] = predict(x_priv_repr, y_priv, x_pub_repr, params,
                             full_model_id)

        avg_acc = np.mean(acc)
        logging.info("Total average prediction accuracy: %.6f" % avg_acc)

        logging.info("Writing results to disk...")
        filename = "param_opt/opt_result-%s.txt" % (full_model_id)
        logging.info(" * filename: %s", filename)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("%.6f\n" % avg_acc)
    else:  # 'learn_and_map' (validated above)
        gdsc_gene_expr = load_gdsc_data()
        x_gdsc = gdsc_gene_expr.values
        x_gdsc_repr = map_repr(x_gdsc, repr_alg, params, full_model_id)
        logging.info("Saving the representation...")
        ensure_dir_exists("data_repr")
        np.savetxt("data_repr/%s-%s.csv" % (gdsc_data_set, full_model_id),
                   x_gdsc_repr, delimiter=',')
def run_optimization(args, fixed_params, domain, constraints, batch_size,
                     max_iter, max_duration=None, deadline=None,
                     slurm_args=None):
    """Run batched Bayesian optimization (GPyOpt) over the given domain.

    Starts either from a random initial design or, when
    ``fixed_params.param_opt_continue`` is set, from the parameters/results
    stored by an earlier run.  Each iteration asks GPyOpt for ``batch_size``
    new locations, evaluates them with ``run_optimization_batch`` and
    checkpoints all parameters and results to ``param_opt/``.

    Args:
        args: Passed through to ``run_optimization_batch``.
        fixed_params: Namespace; reads ``param_opt_continue`` and
            ``test_name`` (used in all output file names).
        domain: GPyOpt domain description of the searched parameters.
        constraints: GPyOpt constraints for the design space.
        batch_size: Number of parameter sets evaluated per iteration.
        max_iter: Maximum number of optimization iterations.
        max_duration: Optional duration added to the current time to form a
            deadline; the earlier of this and ``deadline`` applies.
        deadline: Optional ``datetime`` after which no new iteration starts.
        slurm_args: Passed through to ``run_optimization_batch``.

    Returns:
        The best parameter set found (as produced by ``get_params``).
    """
    logging.info('Starting parameter optimization...')
    import GPyOpt

    ensure_dir_exists("param_opt")
    params_checkpoint = "param_opt/opt_params-%s.npy" % (
        fixed_params.test_name)
    results_checkpoint = "param_opt/opt_results-%s.npy" % (
        fixed_params.test_name)

    if max_duration is not None:
        new_dl = datetime.datetime.now() + max_duration
        if deadline is None or new_dl < deadline:
            deadline = new_dl

    # initial parameters and values
    if fixed_params.param_opt_continue:
        logging.info('Loading earlier params and results...')
        all_params = np.load(params_checkpoint)
        all_results = np.load(results_checkpoint)
        opt_seeds = range(len(all_params))
    else:
        logging.info('Selecting initial parameters...')
        initial_design_type = 'random'
        initial_design_numdata = batch_size
        space = GPyOpt.core.task.space.Design_space(domain, constraints)
        opt_params = GPyOpt.experiment_design.initial_design(
            initial_design_type, space, initial_design_numdata)

        logging.info('Running...')
        opt_seeds = range(len(opt_params))
        task_params = [get_params(p, domain) for p in opt_params]
        results = run_optimization_batch(args, fixed_params, task_params,
                                         opt_seeds, slurm_args)
        all_params = opt_params
        all_results = results
        # checkpoint the initial design so a later failure does not lose it
        np.save(params_checkpoint, all_params)
        np.save(results_checkpoint, all_results)

    for i in range(max_iter):
        logging.info('Best result this far: %g', np.amax(all_results))

        logging.info('Selecting a new set of parameters...')
        # GPyOpt minimizes, so the (to be maximized) results are negated
        bo = GPyOpt.methods.BayesianOptimization(
            f=None, domain=domain, X=all_params, Y=-all_results,
            acquisition_type='EI', normalize_Y=True,
            evaluator_type='local_penalization', batch_size=batch_size,
            acquisition_jitter=0, maximize=False)

        opt_params = bo.suggest_next_locations()
        # continue the seed sequence so every evaluation gets a fresh seed
        next_seed = max(opt_seeds) + 1
        opt_seeds = range(next_seed, next_seed + len(opt_params))

        logging.info('Running...')
        task_params = [get_params(p, domain) for p in opt_params]
        results = run_optimization_batch(args, fixed_params, task_params,
                                         opt_seeds, slurm_args)
        all_params = np.vstack((all_params, opt_params))
        all_results = np.vstack((all_results, results))

        np.save(params_checkpoint, all_params)
        np.save(results_checkpoint, all_results)

        # without any time limit `deadline` stays None and must not be
        # compared against a datetime (TypeError)
        if deadline is not None and datetime.datetime.now() >= deadline:
            logging.info('Gpyopt iteration %d: Time based stopping' % (i))
            break

    all_params = [get_params(p, domain) for p in all_params]
    all_results = list(all_results)

    filename = "param_opt/paramopt-%s.txt" % (fixed_params.test_name)
    logging.info("Writing params and result to '%s'" % filename)
    with open(filename, 'wb') as f:
        pickle.dump(all_params, f)
        pickle.dump(all_results, f)

    best_params_id = np.argmax(all_results)
    best_params = all_params[best_params_id]
    best_result = all_results[best_params_id]
    logging.info('Final best result: %g', best_result)
    logging.info(' * obtained with: %s', best_params)

    ensure_dir_exists("res")
    filename = "res/paramopt_best_result-%s.txt" % (fixed_params.test_name)
    logging.info("Writing best result to '%s'" % filename)
    np.savetxt(filename, best_result)

    filename = "param_opt/paramopt_best_params-%s.txt" % (
        fixed_params.test_name)
    logging.info("Writing best params to '%s'" % filename)
    with open(filename, 'wb') as f:
        pickle.dump(best_params, f)

    return best_params
plt.imshow(x_test[sample,:].reshape((28,28)), cmap='gray') if s == 0: plt.title("original") s = 0 seed = seeds[s] for a, alg_id in enumerate(algorithms): print(" alg = %s ..." % alg_id) #pred_filename = 'pred/final-encdec-%s-%d-%s.npy' % (data_type, seed, alg_id) #pred_rand_filename = 'pred/final-encdec-rand-%s-%d-%s.npy' % (data_type, seed, alg_id) pred_filename = 'pred/final-encdec-%s-r%d-s%d-%s.npy' % (data_type, repr_dim, seed, alg_id) pred_rand_filename = 'pred/final-encdec-rand-%s-r%d-s%d-%s.npy' % (data_type, repr_dim, seed, alg_id) x_test_pred = np.load(pred_filename) x_test_pred_rand = np.load(pred_rand_filename) for s, sample in enumerate(samples): plt.subplot(tiled[1], tiled[0], s * tiled[0] + 2*a + 2) plt.axis('off') plt.imshow(x_test_pred[sample,:].clip(0,1).reshape((28,28)), cmap='gray') if s == 0: plt.title(alg_id) plt.subplot(tiled[1], tiled[0], s * tiled[0] + 2*a + 3) plt.axis('off') plt.imshow(x_test_pred_rand[sample,:].clip(0,1).reshape((28,28)), cmap='gray') ensure_dir_exists("figs/predictions") #figname = "figs/predictions/%s" % (data_type) figname = "figs/predictions/%s-r%d" % (data_type, repr_dim) plt.savefig(figname) plt.close()
def task(args):
    """Learn a VAE representation on public data and evaluate it privately.

    Pipeline: load gene expression and cancer types, split the samples into
    private (the given cancer types) and public (everything else), learn a
    VAE on the public part, encode the private part, and fit a (differentially
    private) logistic regression on the encoded private data.  Test accuracy
    is written under ``res/`` and ``param_opt/``.

    Data set names, network/training settings, scaling/clipping options and
    the privacy parameters are read from module-level configuration.

    Args:
        args: Tuple ``(param_id, priv_cancertypes, seed)`` where ``param_id``
            indexes ``run_parameters/params.npy``, ``priv_cancertypes`` is the
            list of cancer types treated as private, and ``seed`` is only
            used in the output file names (the RNG is seeded from the clock,
            see below).

    Raises:
        ValueError: On an unknown scaling/clipping option, or when clipping is
            disabled although a private model is to be fitted.
    """
    import pandas
    param_id, priv_cancertypes, seed = args
    logging.info("priv classes = %s, params_id = %s, seed = %d",
                 priv_cancertypes, param_id, seed)

    # the parameter set doubles as the "algorithm id" in file names
    alg_id = param_id
    logging.info("Loading parameters...")
    # The file holds pickled parameter objects (they are accessed by
    # attribute below), which numpy >= 1.16.3 only loads with
    # allow_pickle=True.  NOTE: only safe because the file is written by
    # this pipeline itself; never load untrusted files this way.
    params = np.load("run_parameters/params.npy", allow_pickle=True)
    params = params[param_id]

    logging.info("Reading data...")
    gene_expr = pandas.read_hdf("data/%s.h5" % (data_set), data_type)
    logging.info(" * gene expression shape: %d x %d" % gene_expr.shape)

    logging.info("Filtering out genes with low expressions...")
    low_expr = (np.median(gene_expr, axis=0) < 0.0)
    gene_expr = gene_expr.iloc[:, ~low_expr]
    logging.info(" * %d of %d remaining (%d removed)" %
                 (sum(~low_expr), low_expr.size, sum(low_expr)))

    logging.info("Loading cancer types...")
    cancer_type = pandas.read_hdf("data/%s.h5" % (target_set), target_type)
    assert np.array_equal(gene_expr.index, cancer_type.index)

    # split
    logging.info("Splitting...")
    priv = cancer_type.isin(priv_cancertypes)
    logging.info(" * %d private samples, %d public samples (of %d total)" %
                 (sum(priv), sum(~priv), priv.size))

    # `.values` replaces `.as_matrix()`, which was removed in pandas 1.0
    x_pub = gene_expr[~priv].values
    y_pub = cancer_type[~priv].cat.codes.values
    x_priv = gene_expr[priv].values
    y_priv = cancer_type[priv].cat.codes.values

    data_name = '-'.join(priv_cancertypes).replace(' ', '_')

    # A hack to have a different seed if the algorithm is run multiple times
    # with the same parameters. Destroys reproducibility...
    import time
    seed0 = int(time.time() * 100) % (2**32)

    # init rng
    np.random.seed(seed0)
    import torch
    torch.manual_seed(seed0)
    if torch.cuda.is_available() and torch.cuda.device_count() > 0:
        torch.cuda.manual_seed(seed0)

    ##################################
    #   representation learning
    #################################
    x = x_pub
    y = y_pub

    # separate validation set if needed
    val_x = None
    if validation_split:
        logging.info("Splitting into training and validation sets")
        from sklearn.model_selection import train_test_split
        train_x, val_x, train_y, val_y = train_test_split(
            x, y, test_size=validation_split, random_state=0)
        x, y = train_x, train_y
        logging.info(" * training set shape: %d x %d" % x.shape)
        logging.info(" * validation set shape: %d x %d" % val_x.shape)

    data_dim = x.shape[1]
    logging.info(" * data shape after preprocessing: %d x %d" % x.shape)

    logging.info("Learning the representaiton on public data...")
    logging.info(" * learning a representation of size %d", repr_dim)
    start_time = time.time()

    # init the algorithm
    from models.vae_pytorch import VAE
    alg = VAE().init(
        input_dim=data_dim,
        latent_dim=repr_dim,
        enc_dims=[int(10**params.hidden_layer_size_mul_log10) * repr_dim]
        * int(params.n_hidden_layers),
        dec_dims='same',
        enc_activations='relu',
        dec_activations='relu',
        prediction_mean_activation='sigmoid',
        prediction_var='gs',
        prediction_log_var_min=math.log(0.01**2),
        normalize_input_type='quantiles',
        normalize_input_quantile=0.05,
        normalize_input_axis='global',
        normalize_input_target=(0, 1),
        normalize_input_clip=True,
        optimizer='Adam',
        optimizer_params={'lr': 10.0**params.learning_rate_log10},
        n_epochs=n_epochs,
        early_stopping=True,
        reduce_lr_on_plateau=False,
        batch_size=batch_size)

    # create output dirs if they do not exist
    ensure_dir_exists('res')
    ensure_dir_exists('log')

    full_model_id = "%s-%d-%s-s%d%s" % (data_name, repr_dim, alg_id, seed,
                                        id_suffix)

    # define the progress saving function
    progress_filename = 'res/progress-encdec-mse-%s.txt' % (full_model_id)
    val_progress_filename = (
        'res/progress-encdec-validation-mse-%s.txt' % (full_model_id))

    progress_file = open(progress_filename, 'w', encoding='utf-8')
    val_progress_file = None
    try:
        if val_x is not None:
            val_progress_file = open(val_progress_filename, 'w',
                                     encoding='utf-8')

        def save_progress():
            # per-epoch callback: record train (and validation) error
            x_pred = alg.decode(alg.encode(x))
            rel_mse = relative_mean_squared_error(x, x_pred)
            progress_file.write("%g\n" % rel_mse)
            if val_x is not None:
                val_x_pred = alg.decode(alg.encode(val_x))
                rel_mse = relative_mean_squared_error(val_x, val_x_pred)
                val_progress_file.write("%g\n" % rel_mse)

        # fit to the training data
        alg.learn(x, validation_data=val_x,
                  log_file_prefix=("log/%s" % (full_model_id)),
                  per_epoch_callback_funs=[save_progress],
                  deadline=deadline, max_duration=max_duration)
    finally:
        # always release the progress files, even if training fails
        progress_file.close()
        if val_progress_file is not None:
            val_progress_file.close()

    # test reconstruction error
    x_pred = alg.decode(alg.encode(x))
    rel_mse = relative_mean_squared_error(x, x_pred)
    if val_x is not None:
        val_x_pred = alg.decode(alg.encode(val_x))
        val_rel_mse = relative_mean_squared_error(val_x, val_x_pred)
    else:
        # no validation split configured
        val_rel_mse = np.nan
    logging.info(" * final error: rel_mse = %g, val_rel_mse = %g",
                 rel_mse, val_rel_mse)

    elapsed = time.time() - start_time
    logging.info(" * running time = %s", pretty_duration(elapsed))

    ##################################
    #   representation mapping
    #################################
    x = x_priv
    y = y_priv

    # get the representation
    logging.info("Making the representation of private data...")
    x_repr = alg.encode(x)

    # test to predict the data itself
    x_pred = alg.decode(x_repr)
    rel_mse = relative_mean_squared_error(x, x_pred)
    logging.info(" * reconstruct the data: rel_mse = %g", rel_mse)
    ensure_dir_exists("res")
    with open("res/private-encdec-rel_mse-%d-%s-%s-s%d%s.txt" %
              (repr_dim, data_name, alg_id, seed, id_suffix),
              'w', encoding='utf-8') as f:
        f.write("%.6f\n" % rel_mse)

    ##################################
    #   prediction
    #################################
    x = x_repr

    # Learning modes to evaluate.  Declared before the clipping check below,
    # which needs to know whether a private model will be fitted.
    private_modes = [True]

    # split train and test sets
    logging.info("Splitting to train and test sets...")
    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=pred_test_size, random_state=0)
    logging.info(" * train samples: %d" % x_train.shape[0])
    logging.info(" * test samples: %d" % x_test.shape[0])

    # init rng
    np.random.seed(seed0)

    logging.info("Bounding the data to 1-sphere...")
    if scale_fun == "norm_max":
        logging.info(" * scale by max norm")
        scale_factor = np.amax(np.linalg.norm(x_train, axis=1))
    elif scale_fun == "dims_max":
        logging.info(" * scale each dimension by max absolute value")
        scale_factor = np.amax(np.abs(x_train), axis=0)
    elif scale_fun == "norm_avg":
        logging.info(" * scale by average norm")
        scale_factor = np.mean(np.linalg.norm(x_train, axis=1))
    elif scale_fun == "dims_std":
        logging.info(" * scale each dimension by standard deviation")
        scale_factor = np.std(x_train, axis=0)
    elif scale_fun == "none":
        scale_factor = 1.0
    else:
        raise ValueError("unknown scale_fun: %r" % (scale_fun,))
    # the test set is scaled with statistics of the training set only
    x_train /= scale_factor * scale_const
    x_test /= scale_factor * scale_const

    if clip == "norm":
        logging.info(" * clip norms to max 1")
        x_train /= np.maximum(
            np.linalg.norm(x_train, axis=1, keepdims=True)
            * (1 + bounding_slack), 1)
        x_test /= np.maximum(
            np.linalg.norm(x_test, axis=1, keepdims=True)
            * (1 + bounding_slack), 1)
    elif clip == "dims":
        raise ValueError("clip='dims' is not implemented")
    elif clip == "none":
        logging.info(" * no clipping -> no bounding")
        # unbounded data is only acceptable for non-private learning
        if any(private_modes):
            raise ValueError(
                "clip='none' cannot be combined with private learning")
    else:
        raise ValueError("unknown clip: %r" % (clip,))

    for private in private_modes:
        # fit
        logging.info("Fitting a model...")
        if private:
            logging.info(" * DP logistic regression: epsilon=%g, alpha=%g",
                         epsilon, regularizer_strength)
            from models.logistic_regression import DPLogisticRegression
            model = DPLogisticRegression().init(
                repr_dim, classes=np.unique(y),
                alpha=regularizer_strength, epsilon=epsilon)
        else:
            logging.info(" * logistic regression: alpha=%g",
                         regularizer_strength)
            from sklearn.linear_model import LogisticRegression
            model = LogisticRegression(C=1 / regularizer_strength)

        model.fit(x_train, y_train)

        # compute mean accuracy on test set
        logging.info("Testing the model...")
        from sklearn.metrics import accuracy_score
        train_acc = accuracy_score(y_train, model.predict(x_train))
        test_acc = accuracy_score(y_test, model.predict(x_test))
        logging.info(" * train accuracy = %.6f", train_acc)
        logging.info(" * test accuracy = %.6f", test_acc)

        logging.info("Writing results to disk...")
        ensure_dir_exists("res")
        filename = (
            "res/cancertype-pred-accuracy-%d-%s-%s-s%d-%s-%d-%s%s.txt" %
            (repr_dim, data_name, alg_id, seed, scale_fun, scale_const, clip,
             ("-e%g" % (epsilon) if private else "-nonpriv")))
        logging.info(" * filename: %s", filename)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("%.6f\n" % test_acc)

        ensure_dir_exists("param_opt")
        filename = "param_opt/opt_result%s-%s.txt" % (id_suffix,
                                                      full_model_id)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("%.6f\n" % test_acc)
plt.gca().set_xticklabels([" " for a in x0]) plt.gca().tick_params(axis='x', which='both',length=0) #plt.gca().set_xticks([]) plt.gca().set_xlim(-0.5, 0.5) plt.gca().set_ylim(0.08, 0.38) plt.legend() plt.gca().set_ylabel("prediction accuracy") plt.gca().set_xlabel(" ") if n_files_not_found > 0: print("Warning: '%s' and %d other files not found." % (last_not_found, n_files_not_found-1)) #plt.show() ensure_dir_exists(figpath) figname = "%s%s%s%s" % (figname, ("-ica" if ica else ""), ("-cliponly" if clipping_only else ""), ("-mcmc" if mcmc else "-fixed"), ) plt.tight_layout() #plt.savefig(figname, format='png', dpi=300, bbox_inches='tight') plt.savefig(figname + ".png", format='png', dpi=300) plt.savefig(figname + ".pdf", format='pdf', dpi=300)
def task(args):
    """Train one representation algorithm on MNIST and evaluate it.

    Loads MNIST through Keras, scales the images to [0, 1] and flattens them,
    trains the algorithm on the training images (using the test images as
    validation data), optionally stores the test images and their
    reconstructions as ``.npy`` files, and logs the relative reconstruction
    error on the test set.

    The representation size (``repr_dim``) and the ``save_pred`` switch are
    read from module-level configuration.

    Args:
        args: Tuple ``(seed, (algName, _, makeAlg))``; ``makeAlg`` is called
            as ``makeAlg(data_dim, repr_dim)`` to build the algorithm and
            ``algName`` is used in log and file names.
    """
    seed, (algName, _, makeAlg) = args
    data_type = "mnist"
    logging.info("datatype = %s, seed = %d, algorithm = %s",
                 data_type, seed, algName)

    # init rng
    np.random.seed(seed)

    # load mnist (labels are not needed for reconstruction)
    from keras.datasets import mnist
    (x_train, _), (x_test, _) = mnist.load_data()
    x_train = x_train.astype('float32') / 255.
    x_test = x_test.astype('float32') / 255.
    # flatten each image into one feature vector
    x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:])))
    x_test = x_test.reshape((len(x_test), np.prod(x_test.shape[1:])))

    x = x_train

    data_dim = x.shape[1]
    logging.info(" * training set: %d x %d" % x.shape)
    logging.info(" * testing set: %d x %d" % x_test.shape)

    logging.info("Running and evaluating the algorithm...")
    logging.info(" * using representation with dimension = %d", repr_dim)

    # init the algorithm
    alg = makeAlg(data_dim, repr_dim)

    # fit to the training data
    ensure_dir_exists('log')
    alg.learn(x, validation_data=x_test,
              log_file_prefix=("log/%s-%d-%s" % (data_type, seed, algName)),
              verbose='print_epochs')

    # test with the testing data
    x_test_pred = alg.decode(alg.encode(x_test))
    x_test_pred_rand = alg.decode_generate(alg.encode(x_test))

    ensure_dir_exists('pred')
    data_filename = 'data/generated/%s' % (data_type)
    pred_filename = 'pred/final-encdec-%s-r%d-s%d-%s' % (
        data_type, repr_dim, seed, algName)
    pred_rand_filename = 'pred/final-encdec-rand-%s-r%d-s%d-%s' % (
        data_type, repr_dim, seed, algName)
    if save_pred:
        # np.save does not create missing directories
        ensure_dir_exists('data/generated')
        np.save(data_filename, x_test)
        np.save(pred_filename, x_test_pred)
        np.save(pred_rand_filename, x_test_pred_rand)

    rel_mse = relative_mean_squared_error(x_test, x_test_pred)

    logging.info("Result: rel_mse = %g", rel_mse)
def task(args):
    """Predict cancer types from a stored representation.

    Loads a previously saved representation (CSV) and the cancer-type labels,
    splits them into train and test sets, scales/clips the features according
    to the module-level ``scale_fun``/``scale_const``/``clip`` settings, fits
    either a differentially private or a plain logistic regression (module
    level ``private``), and writes the test accuracy under ``res/``.

    Args:
        args: Tuple ``(repr_dim, alg_id, seed)`` identifying the stored
            representation and seeding NumPy's RNG before model fitting.

    Raises:
        ValueError: On an unknown scaling/clipping option, or when clipping is
            disabled although a private model is requested.
    """
    repr_dim, alg_id, seed = args
    logging.info("representation size = %d, algorithm = %s, seed = %d",
                 repr_dim, alg_id, seed)

    # read the reduced gene expression data
    logging.info("Reading reduced gene expression data...")
    filename = ("data_repr/repr-%s-%d-%s-%s-s%d%s.csv" %
                (data_set, repr_dim, aux_data_set, alg_id, seed, id_suffix))
    logging.info(" * filename: %s" % filename)
    x = np.loadtxt(filename, delimiter=',')
    if x.ndim < 2:
        # a one-dimensional representation is read back as a flat vector
        x = x[:, np.newaxis]
    logging.info(" * data shape: %d x %d" % x.shape)

    logging.info("Reading cancer types...")
    filename = "data/%s.h5" % (target_set)
    logging.info(" * filename: %s" % filename)
    import pandas
    target = pandas.read_hdf(filename, 'cancer_types')
    logging.info(" * target size: %d" % target.shape)
    # `.values` replaces `.as_matrix()`, which was removed in pandas 1.0
    y = target.cat.codes.values

    # split train and test sets
    logging.info("Splitting to train and test sets...")
    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=0)
    logging.info(" * train samples: %d" % x_train.shape[0])
    logging.info(" * test samples: %d" % x_test.shape[0])

    # init rng
    np.random.seed(seed)

    logging.info("Bounding the data to 1-sphere...")
    if scale_fun == "norm_max":
        logging.info(" * scale by max norm")
        scale_factor = np.amax(np.linalg.norm(x_train, axis=1))
    elif scale_fun == "dims_max":
        logging.info(" * scale each dimension by max absolute value")
        scale_factor = np.amax(np.abs(x_train), axis=0)
    elif scale_fun == "norm_avg":
        logging.info(" * scale by average norm")
        scale_factor = np.mean(np.linalg.norm(x_train, axis=1))
    elif scale_fun == "dims_std":
        logging.info(" * scale each dimension by standard deviation")
        scale_factor = np.std(x_train, axis=0)
    elif scale_fun == "none":
        scale_factor = 1.0
    else:
        raise ValueError("unknown scale_fun: %r" % (scale_fun,))
    # the test set is scaled with statistics of the training set only
    x_train /= scale_factor * scale_const
    x_test /= scale_factor * scale_const

    if clip == "norm":
        logging.info(" * clip norms to max 1")
        x_train /= np.maximum(
            np.linalg.norm(x_train, axis=1, keepdims=True)
            * (1 + bounding_slack), 1)
        x_test /= np.maximum(
            np.linalg.norm(x_test, axis=1, keepdims=True)
            * (1 + bounding_slack), 1)
    elif clip == "dims":
        raise ValueError("clip='dims' is not implemented")
    elif clip == "none":
        logging.info(" * no clipping -> no bounding")
        # unbounded data is only acceptable for non-private learning
        if private:
            raise ValueError(
                "clip='none' cannot be combined with private learning")
    else:
        raise ValueError("unknown clip: %r" % (clip,))

    # fit
    logging.info("Fitting a model...")
    if private:
        logging.info(" * DP logistic regression: epsilon=%g, alpha=%g",
                     epsilon, regularizer_strength)
        from models.logistic_regression import DPLogisticRegression
        model = DPLogisticRegression().init(
            repr_dim, classes=np.unique(y),
            alpha=regularizer_strength, epsilon=epsilon)
    else:
        logging.info(" * logistic regression: alpha=%g",
                     regularizer_strength)
        from sklearn.linear_model import LogisticRegression
        model = LogisticRegression(C=1 / regularizer_strength)

    model.fit(x_train, y_train)

    # compute mean accuracy on test set
    logging.info("Testing the model...")
    from sklearn.metrics import accuracy_score
    train_acc = accuracy_score(y_train, model.predict(x_train))
    test_acc = accuracy_score(y_test, model.predict(x_test))
    logging.info(" * train accuracy = %.6f", train_acc)
    logging.info(" * test accuracy = %.6f", test_acc)

    logging.info("Writing results to disk...")
    ensure_dir_exists("res")
    filename = ("res/cancertype-pred-accuracy-%d-%s-%s-s%d-%s-%d-%s%s.txt" %
                (repr_dim, aux_data_set, alg_id, seed, scale_fun,
                 scale_const, clip,
                 ("-e%g" % (epsilon) if private else "-nonpriv")))
    logging.info(" * filename: %s", filename)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write("%.6f\n" % test_acc)
#color=line.get_color(), label=alg_id+" (val)") #plt.plot((max_epochs-1) * np.array([1, 1.02, 1.04]), ) #plt.gca().annotate('foo', xy=(0.2, 0.0), xytext=(-2.0, 0.3), bbox=dict(boxstyle="round", fc="w")) # offset = transforms.ScaledTranslation(dx, dy, # fig.dpi_scale_trans) # y = ax.transData.inverted().transform(last_rel_mse) # y = y + #shadow_transform = ax.transData.inverted().transform() #plt.plot((max_epochs-1) * np.array([1, 1.05]), [last_rel_mse]) plt.yscale('log') #plt.yscale('symlog', linthreshy=1e-1) plt.xlabel("epoch") if relative_to is None: #plt.ylim([0, 1e1]) plt.ylim([1e-1, 2e0]) plt.ylabel("relative mse") else: plt.ylabel("relative mse diff from " + relative_to) ensure_dir_exists("figs") plt.legend() if not tiled: figname = "figs/%s-progress-mse-%s-%s-%d%s" % (task, data_set, input_dim, repr_dim, fig_name_suffix) plt.savefig(figname) plt.close() if tiled: #figname = "figs/progress-mse-tcga-%s%s" % (input_dim, fig_name_suffix) figname = "figs/%s-progress-mse-tcga%s" % (task, fig_name_suffix) plt.savefig(figname) plt.close()
def task(args):
    """Learn a representation of one data set and save the trained model.

    Reads the data set named by the module-level ``data_set``/``data_type``,
    optionally normalizes it and holds out a validation split, trains the
    given algorithm while appending the per-epoch relative reconstruction
    error to files under ``res/``, logs the final errors, and stores the
    model under ``repr_models/``.

    Args:
        args: Tuple ``(repr_dim, (alg_id, _, make_alg), seed)``;
            ``make_alg(data_dim, repr_dim)`` builds the algorithm, ``alg_id``
            is used in file names, and ``seed`` seeds NumPy and torch.
    """
    import pandas
    repr_dim, (alg_id, _, make_alg), seed = args
    logging.info("dataset = %s, algorithm = %s", data_set, alg_id)

    # read the data sets
    logging.info("Reading data...")
    data = pandas.read_hdf("data/%s.h5" % (data_set), data_type)
    logging.info(" * gene expression shape: %d x %d" % data.shape)

    # `.values` replaces `.as_matrix()`, which was removed in pandas 1.0
    x = data.values

    # normalize the input to _total_ unit variance and per-feature zero mean
    if normalize_data:
        x -= np.mean(x)
        x /= np.std(x)
        x -= np.mean(x, axis=0)

    # init rng
    np.random.seed(seed)
    import torch
    torch.manual_seed(seed)
    # only touch the CUDA RNG when a device is actually present
    if torch.cuda.is_available() and torch.cuda.device_count() > 0:
        torch.cuda.manual_seed(seed)

    # separate validation set if needed
    val_x = None
    if validation_split:
        logging.info("Splitting into training and validation sets")
        m = x.shape[0]
        perm = np.random.permutation(m)
        x = x[perm, :]
        # the first `validation_split` fraction of the shuffled rows is
        # held out for validation
        split_point = int(validation_split * m)
        (val_x, x) = (x[:split_point, :], x[split_point:, :])
        logging.info(" * training set shape: %d x %d" % x.shape)
        logging.info(" * validation set shape: %d x %d" % val_x.shape)

    data_dim = x.shape[1]
    logging.info(" * data shape after preprocessing: %d x %d" % x.shape)

    logging.info("Running the algorithm...")
    logging.info(" * learning a representation of size %d", repr_dim)
    start_time = time.time()

    # init the algorithm
    alg = make_alg(data_dim, repr_dim)

    # create output dirs if they do not exist
    ensure_dir_exists('res')
    ensure_dir_exists('log')

    full_model_id = "%s-%d-%s-s%d%s" % (data_set, repr_dim, alg_id, seed,
                                        id_suffix)

    # define the progress saving function
    progress_filename = 'res/progress-encdec-mse-%s.txt' % (full_model_id)
    val_progress_filename = (
        'res/progress-encdec-validation-mse-%s.txt' % (full_model_id))

    progress_file = open(progress_filename, 'w', encoding='utf-8')
    val_progress_file = None
    try:
        if val_x is not None:
            val_progress_file = open(val_progress_filename, 'w',
                                     encoding='utf-8')

        def save_progress():
            # per-epoch callback: record train (and validation) error
            x_pred = alg.decode(alg.encode(x))
            rel_mse = relative_mean_squared_error(x, x_pred)
            progress_file.write("%g\n" % rel_mse)
            if val_x is not None:
                val_x_pred = alg.decode(alg.encode(val_x))
                rel_mse = relative_mean_squared_error(val_x, val_x_pred)
                val_progress_file.write("%g\n" % rel_mse)

        # fit to the training data
        alg.learn(x, validation_data=val_x,
                  log_file_prefix=("log/%s" % (full_model_id)),
                  per_epoch_callback_funs=[save_progress],
                  deadline=deadline, max_duration=max_duration)
    finally:
        # always release the progress files, even if training fails
        progress_file.close()
        if val_progress_file is not None:
            val_progress_file.close()

    # test reconstruction error
    x_pred = alg.decode(alg.encode(x))
    rel_mse = relative_mean_squared_error(x, x_pred)
    if val_x is not None:
        val_x_pred = alg.decode(alg.encode(val_x))
        val_rel_mse = relative_mean_squared_error(val_x, val_x_pred)
    else:
        # no validation split configured
        val_rel_mse = np.nan
    logging.info(" * final error: rel_mse = %g, val_rel_mse = %g",
                 rel_mse, val_rel_mse)

    elapsed = time.time() - start_time
    logging.info(" * running time = %s", pretty_duration(elapsed))

    # save model
    logging.info("Saving the learned model...")
    ensure_dir_exists('repr_models')
    alg.save("repr_models/%s" % (full_model_id))