import os

# Assumes the `helper`, `model`, and `train` modules and the global
# `model_params` / `train_params` dicts are defined elsewhere in this script.
def train_model(alignment_file, output_dir, ensemble=None):
    if train_params["verbose"]:
        print("Starting training")
    # Offset the random seed so each ensemble member is initialized differently.
    if ensemble is not None:
        model_params["r_seed"] += ensemble + 1

    data_helper = helper.DataHelper(alignment_file=alignment_file,
                                    calc_weights=True)
    vae_model = model.VariationalAutoencoder(
        data_helper,
        batch_size=model_params["bs"],
        encoder_architecture=[
            model_params["encode_dim_zero"],
            model_params["encode_dim_one"]
        ],
        decoder_architecture=[
            model_params["decode_dim_zero"],
            model_params["decode_dim_one"]
        ],
        n_latent=model_params["n_latent"],
        logit_p=model_params["logit_p"],
        sparsity=model_params["sparsity"],
        encode_nonlinearity_type="relu",
        decode_nonlinearity_type="relu",
        final_decode_nonlinearity=model_params["final_decode_nonlin"],
        final_pwm_scale=model_params["final_pwm_scale"],
        conv_decoder_size=model_params["d_c_size"],
        convolve_patterns=model_params["conv_pat"],
        n_patterns=model_params["n_pat"],
        random_seed=model_params["r_seed"],
    )
    job_string = helper.gen_job_string(
        {"filename": os.path.basename(alignment_file).split(".")[0]},
        model_params)
    print(job_string)

    # Save under <output_dir>/<parent dir of the alignment>[-ensemble<i>].
    date_prefix = os.path.basename(os.path.dirname(alignment_file))
    path = os.path.join(output_dir, date_prefix)
    if ensemble is not None:
        path = path + "-ensemble" + str(ensemble)
    if not os.path.exists(path):
        os.mkdir(path)

    train.train(data_helper,
                vae_model,
                num_updates=train_params["num_updates"],
                save_progress=train_params["save_progress"],
                save_parameters=train_params["save_parameters"],
                verbose=train_params["verbose"],
                job_string=job_string)
    vae_model.save_parameters(file_prefix=job_string, path=path)
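# Usage sketch (hypothetical, not part of the original script): the alignment
# path, output directory, and three-member ensemble below are placeholders.
# Passing an `ensemble` index bumps the random seed, so each member trains
# from a different initialization and saves to its own "-ensemble<i>" folder.
if __name__ == "__main__":
    for member in range(3):
        train_model("alignments/2020-01-01/example.a2m",
                    "./trained_models",
                    ensemble=member)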
"n_pat" : 4, "r_seed" : 12345, "conv_pat" : True, "d_c_size" : 40 } train_params = { "num_updates" : 300000, "save_progress" : True, "verbose" : True, "save_parameters" : False, } if __name__ == "__main__": data_helper = helper.DataHelper(alignment_file=data_params["alignment_file"], calc_weights=True) vae_model = model.VariationalAutoencoder(data_helper, batch_size = model_params["bs"], encoder_architecture = [model_params["encode_dim_zero"], model_params["encode_dim_one"]], decoder_architecture = [model_params["decode_dim_zero"], model_params["decode_dim_one"]], n_latent = model_params["n_latent"], logit_p = model_params["logit_p"], sparsity = model_params["sparsity"], encode_nonlinearity_type = "relu", decode_nonlinearity_type = "relu", final_decode_nonlinearity = model_params["final_decode_nonlin"], final_pwm_scale = model_params["final_pwm_scale"], conv_decoder_size = model_params["d_c_size"],
import os

import pandas as pd
from scipy.stats import spearmanr

# Assumes the `helper` and `model` modules and the global `data_params` /
# `model_params` dicts are defined elsewhere in this script.
def main(args):
    if args.alignment_file != "":
        data_params["dataset"] = os.path.basename(
            args.alignment_file).split(".")[0]
    data_helper = helper.DataHelper(dataset=data_params["dataset"],
                                    calc_weights=False,
                                    alignment_file=args.alignment_file,
                                    working_dir=args.working_dir,
                                    load_all_sequences=False)
    vae_model = model.VariationalAutoencoder(
        data_helper,
        batch_size=model_params["bs"],
        encoder_architecture=[
            model_params["encode_dim_zero"],
            model_params["encode_dim_one"]
        ],
        decoder_architecture=[
            model_params["decode_dim_zero"],
            model_params["decode_dim_one"]
        ],
        n_latent=model_params["n_latent"],
        logit_p=model_params["logit_p"],
        sparsity=model_params["sparsity"],
        encode_nonlinearity_type="relu",
        decode_nonlinearity_type="relu",
        final_decode_nonlinearity=model_params["final_decode_nonlin"],
        final_pwm_scale=model_params["final_pwm_scale"],
        conv_decoder_size=model_params["d_c_size"],
        convolve_patterns=model_params["conv_pat"],
        n_patterns=model_params["n_pat"],
        random_seed=model_params["r_seed"],
        working_dir=args.working_dir)

    print("Loading model parameters...")
    vae_model.load_parameters(file_prefix=args.ckpt_path)

    print("Computing delta elbo...")
    mutant_name_list, delta_elbo_list = data_helper.single_mutant_matrix(
        vae_model, N_pred_iterations=args.n_iters)
    print("Done.")

    # Result summary
    outprefix = os.path.join(
        args.working_dir,
        os.path.basename(args.alignment_file).split(".")[0])
    preds = pd.DataFrame(list(zip(mutant_name_list, delta_elbo_list)),
                         columns=["Variant", "delta_elbo"])
    preds.to_csv(outprefix + "_preds_raw.csv", index=False)

    # Shift mutant positions from alignment coordinates to the numbering used
    # in the variant CSV.
    preds.loc[:, "Variant"] = preds["Variant"].apply(
        lambda s: s[0] + str(int(s[1:-1]) + args.start_idx - 1) + s[-1])

    # Keep only true substitutions (wild-type residue differs from mutant).
    variant_seq = pd.read_csv(args.variant_seq_csv)
    variant_seq = variant_seq.loc[variant_seq["Variant"].apply(
        lambda s: s[0] != s[-1])]
    preds = preds.loc[preds["Variant"].isin(variant_seq["Variant"])]

    org_len = len(variant_seq)
    variant_seq = variant_seq.merge(preds, on="Variant", how="left")
    assert len(variant_seq) == org_len
    variant_seq[["Variant", "scaled_effect1",
                 "delta_elbo"]].to_csv(outprefix + ".csv", index=False)
    print("Saved results to {}".format(outprefix + ".csv"))

    correlation, _ = spearmanr(variant_seq["scaled_effect1"].values,
                               variant_seq["delta_elbo"].values)
    print("Spearman's correlation: {}".format(correlation))
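# A minimal sketch of a CLI entry point for main() (not in the original
# script): the flag names mirror the attributes read above, but every default
# and required= choice here is an assumption.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--alignment_file", type=str, default="")
    parser.add_argument("--working_dir", type=str, default=".")
    parser.add_argument("--ckpt_path", type=str, required=True)
    parser.add_argument("--variant_seq_csv", type=str, required=True)
    parser.add_argument("--n_iters", type=int, default=500)
    parser.add_argument("--start_idx", type=int, default=1)
    main(parser.parse_args())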
def getVAEDataHelper(ds_attributes):
    return helper.DataHelper(
        alignment_file=ds_attributes['alignment'],
        working_dir=ds_attributes['alignment_directory'],
        calc_weights=False,
    )
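# Usage sketch (hypothetical values; only the 'alignment' and
# 'alignment_directory' keys are read by the function above):
ds_attributes = {
    'alignment': 'example.a2m',
    'alignment_directory': './alignments',
}
data_helper = getVAEDataHelper(ds_attributes)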
"n_pat": 4, "r_seed": 12345, "conv_pat": True, "d_c_size": 40 } train_params = { "num_updates": 300000, "save_progress": True, "verbose": True, "save_parameters": False, } if __name__ == "__main__": data_helper = helper.DataHelper(dataset=data_params["dataset"], calc_weights=True) vae_model = model.VariationalAutoencoder( data_helper, batch_size=model_params["bs"], encoder_architecture=[ model_params["encode_dim_zero"], model_params["encode_dim_one"] ], decoder_architecture=[ model_params["decode_dim_zero"], model_params["decode_dim_one"] ], n_latent=model_params["n_latent"], logit_p=model_params["logit_p"], sparsity=model_params["sparsity"], encode_nonlinearity_type="relu", decode_nonlinearity_type="relu",
# "save_parameters" : False, "save_parameters": 30000, } if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--alignment_file", type=str, default="") parser.add_argument("--working_dir", type=str, default=".") args = parser.parse_args() if args.alignment_file != "": data_params["dataset"] = os.path.basename( args.alignment_file).split(".")[0] data_helper = helper.DataHelper(dataset=data_params["dataset"], calc_weights=True, alignment_file=args.alignment_file, working_dir=args.working_dir) n_seqs = data_helper.x_train.shape[0] if n_seqs < model_params["bs"]: msg = "Found {} sequences less than batch size: {}.".format( n_seqs, model_params["bs"]) msg += " Setting to {}.".format(n_seqs) print(msg) model_params["bs"] = n_seqs vae_model = model.VariationalAutoencoder( data_helper, batch_size=model_params["bs"], encoder_architecture=[ model_params["encode_dim_zero"], model_params["encode_dim_one"]