示例#1
0
def train_model(alignment_file, output_dir, ensemble=None):
    """Train a VAE on one alignment and save the learned parameters.

    Hyperparameters are read from the module-level ``model_params`` and
    ``train_params`` dicts; neither dict is modified by this function.

    Args:
        alignment_file: Path to the alignment file. Its base name (up to the
            first ".") is used to build the job string, and the name of its
            parent directory names the output sub-directory.
        output_dir: Directory under which the output sub-directory is
            created (missing parent directories are created as well).
        ensemble: Optional integer index of an ensemble member. When given,
            the random seed is offset by ``ensemble + 1`` and
            ``"-ensemble<index>"`` is appended to the output directory name.
    """
    if train_params["verbose"]:
        print("Starting training")

    # Work on a copy: applying the seed offset to the shared module-level
    # dict would make it accumulate across repeated calls, so the seed of an
    # ensemble member would depend on which members were trained before it.
    params = model_params.copy()
    if ensemble is not None:
        params["r_seed"] += ensemble + 1

    data_helper = helper.DataHelper(alignment_file=alignment_file,
                                    calc_weights=True)

    vae_model = model.VariationalAutoencoder(
        data_helper,
        batch_size=params["bs"],
        encoder_architecture=[
            params["encode_dim_zero"], params["encode_dim_one"]
        ],
        decoder_architecture=[
            params["decode_dim_zero"], params["decode_dim_one"]
        ],
        n_latent=params["n_latent"],
        logit_p=params["logit_p"],
        sparsity=params["sparsity"],
        encode_nonlinearity_type="relu",
        decode_nonlinearity_type="relu",
        final_decode_nonlinearity=params["final_decode_nonlin"],
        final_pwm_scale=params["final_pwm_scale"],
        conv_decoder_size=params["d_c_size"],
        convolve_patterns=params["conv_pat"],
        n_patterns=params["n_pat"],
        random_seed=params["r_seed"],
    )

    # The job string is built from the per-call parameters so that it
    # reflects the seed actually used for this model.
    job_string = helper.gen_job_string(
        {"filename": os.path.basename(alignment_file).split(".")[0]},
        params)

    print(job_string)

    # Output goes to <output_dir>/<parent dir of alignment>[-ensemble<k>].
    date_prefix = os.path.basename(os.path.dirname(alignment_file))
    path = os.path.join(output_dir, date_prefix)
    if ensemble is not None:
        path = path + "-ensemble" + str(ensemble)
    # makedirs (rather than mkdir) so a not-yet-existing output_dir does not
    # abort the run before training starts.
    if not os.path.isdir(path):
        os.makedirs(path)

    train.train(data_helper,
                vae_model,
                num_updates=train_params["num_updates"],
                save_progress=train_params["save_progress"],
                save_parameters=train_params["save_parameters"],
                verbose=train_params["verbose"],
                job_string=job_string)

    vae_model.save_parameters(file_prefix=job_string, path=path)
    "n_pat"             :   4,
    "r_seed"            :   12345,
    "conv_pat"          :   True,
    "d_c_size"          :   40
    }

# Training-run options, looked up by key elsewhere in this file
# (e.g. train_params["verbose"]).
train_params = {
    "num_updates": 300000,
    "save_progress": True,
    "verbose": True,
    "save_parameters": False,
}

if __name__ == "__main__":

    data_helper = helper.DataHelper(alignment_file=data_params["alignment_file"],
                                    calc_weights=True)

    vae_model   = model.VariationalAutoencoder(data_helper,
        batch_size                     =   model_params["bs"],
        encoder_architecture           =   [model_params["encode_dim_zero"],
                                                model_params["encode_dim_one"]],
        decoder_architecture           =   [model_params["decode_dim_zero"],
                                                model_params["decode_dim_one"]],
        n_latent                       =   model_params["n_latent"],
        logit_p                        =   model_params["logit_p"],
        sparsity                       =   model_params["sparsity"],
        encode_nonlinearity_type       =   "relu",
        decode_nonlinearity_type       =   "relu",
        final_decode_nonlinearity      =   model_params["final_decode_nonlin"],
        final_pwm_scale                =   model_params["final_pwm_scale"],
        conv_decoder_size              =   model_params["d_c_size"],
示例#3
0
def main(args):
    """Score single mutants with a trained VAE and compare to measurements.

    Loads the checkpoint named by ``args.ckpt_path``, obtains delta-ELBO
    predictions from ``data_helper.single_mutant_matrix``, writes them to CSV
    files under ``args.working_dir`` and prints the Spearman correlation
    between the predictions and the ``scaled_effect1`` column of
    ``args.variant_seq_csv``.

    Args:
        args: Parsed command-line namespace. Attributes read here are
            ``alignment_file``, ``working_dir``, ``ckpt_path``, ``n_iters``,
            ``start_idx`` and ``variant_seq_csv``.

    Raises:
        AssertionError: If merging the predictions into the variant table
            changes its number of rows.
    """
    if args.alignment_file != "":
        data_params["dataset"] = os.path.basename(
            args.alignment_file).split(".")[0]

    data_helper = helper.DataHelper(dataset=data_params["dataset"],
                                    calc_weights=False,
                                    alignment_file=args.alignment_file,
                                    working_dir=args.working_dir,
                                    load_all_sequences=False)

    vae_model = model.VariationalAutoencoder(
        data_helper,
        batch_size=model_params["bs"],
        encoder_architecture=[
            model_params["encode_dim_zero"], model_params["encode_dim_one"]
        ],
        decoder_architecture=[
            model_params["decode_dim_zero"], model_params["decode_dim_one"]
        ],
        n_latent=model_params["n_latent"],
        logit_p=model_params["logit_p"],
        sparsity=model_params["sparsity"],
        encode_nonlinearity_type="relu",
        decode_nonlinearity_type="relu",
        final_decode_nonlinearity=model_params["final_decode_nonlin"],
        final_pwm_scale=model_params["final_pwm_scale"],
        conv_decoder_size=model_params["d_c_size"],
        convolve_patterns=model_params["conv_pat"],
        n_patterns=model_params["n_pat"],
        random_seed=model_params["r_seed"],
        working_dir=args.working_dir)

    print("Loading model parameters...")
    vae_model.load_parameters(file_prefix=args.ckpt_path)
    print("Computing delta elbo...")
    mutant_name_list, delta_elbo_list = data_helper.single_mutant_matrix(
        vae_model, N_pred_iterations=args.n_iters)
    print("Done.")

    # Result summary
    outprefix = os.path.join(
        args.working_dir,
        os.path.basename(args.alignment_file).split(".")[0])

    # Materialise the zip: older pandas releases reject a bare iterator.
    preds = pd.DataFrame(list(zip(mutant_name_list, delta_elbo_list)),
                         columns=["Variant", "delta_elbo"])
    # The raw file keeps the variant names exactly as the data helper
    # produced them, i.e. before the position shift below.
    preds.to_csv(outprefix + "_preds_raw.csv", index=False)

    # Shift the position number embedded in each variant name
    # (<first char><position><last char>) by start_idx - 1.
    preds.loc[:, "Variant"] = preds["Variant"].apply(
        lambda s: s[0] + str(int(s[1:-1]) + args.start_idx - 1) + s[-1])

    variant_seq = pd.read_csv(args.variant_seq_csv)
    # Keep only rows whose variant name starts and ends with different
    # characters (presumably dropping entries that do not change the
    # residue -- confirm against the CSV format).
    variant_seq = variant_seq.loc[variant_seq["Variant"].apply(
        lambda s: s[0] != s[-1])]
    preds = preds.loc[preds["Variant"].isin(variant_seq["Variant"])]

    org_len = len(variant_seq)
    variant_seq = variant_seq.merge(preds, on="Variant", how="left")
    # Explicit raise instead of `assert` so the check still runs under -O;
    # the exception type is unchanged.
    if len(variant_seq) != org_len:
        raise AssertionError(
            "Merging predictions changed the number of variants "
            "from {} to {}".format(org_len, len(variant_seq)))

    variant_seq[["Variant", "scaled_effect1",
                 "delta_elbo"]].to_csv(outprefix + ".csv", index=False)
    print("Saved results to {}".format(outprefix + ".csv"))

    # The left merge leaves NaN for variants without a prediction, and
    # spearmanr would then report NaN; score only the complete pairs.
    scored = variant_seq.dropna(subset=["scaled_effect1", "delta_elbo"])
    correlation, _ = spearmanr(scored["scaled_effect1"].values,
                               scored["delta_elbo"].values)
    print("Spearman's correlation: {}".format(correlation))
示例#4
0
def getVAEDataHelper(ds_attributes):
    """Build a ``helper.DataHelper`` from a dataset-attribute mapping.

    Args:
        ds_attributes: Mapping that provides the ``'alignment'`` and
            ``'alignment_directory'`` entries; they are passed on as the
            helper's ``alignment_file`` and ``working_dir`` respectively.

    Returns:
        The ``helper.DataHelper`` instance, created with
        ``calc_weights=False``.
    """
    helper_kwargs = {
        'alignment_file': ds_attributes['alignment'],
        'working_dir': ds_attributes['alignment_directory'],
        'calc_weights': False,
    }
    return helper.DataHelper(**helper_kwargs)
示例#5
0
    "n_pat": 4,
    "r_seed": 12345,
    "conv_pat": True,
    "d_c_size": 40
}

# Training-run options, kept together in a single dict.
train_params = dict(
    num_updates=300000,
    save_progress=True,
    verbose=True,
    save_parameters=False,
)

if __name__ == "__main__":

    data_helper = helper.DataHelper(dataset=data_params["dataset"],
                                    calc_weights=True)

    vae_model = model.VariationalAutoencoder(
        data_helper,
        batch_size=model_params["bs"],
        encoder_architecture=[
            model_params["encode_dim_zero"], model_params["encode_dim_one"]
        ],
        decoder_architecture=[
            model_params["decode_dim_zero"], model_params["decode_dim_one"]
        ],
        n_latent=model_params["n_latent"],
        logit_p=model_params["logit_p"],
        sparsity=model_params["sparsity"],
        encode_nonlinearity_type="relu",
        decode_nonlinearity_type="relu",
示例#6
0
    # "save_parameters"   :   False,
    "save_parameters": 30000,
}

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--alignment_file", type=str, default="")
    parser.add_argument("--working_dir", type=str, default=".")
    args = parser.parse_args()

    if args.alignment_file != "":
        data_params["dataset"] = os.path.basename(
            args.alignment_file).split(".")[0]

    data_helper = helper.DataHelper(dataset=data_params["dataset"],
                                    calc_weights=True,
                                    alignment_file=args.alignment_file,
                                    working_dir=args.working_dir)

    n_seqs = data_helper.x_train.shape[0]
    if n_seqs < model_params["bs"]:
        msg = "Found {} sequences less than batch size: {}.".format(
            n_seqs, model_params["bs"])
        msg += " Setting to {}.".format(n_seqs)
        print(msg)
        model_params["bs"] = n_seqs

    vae_model = model.VariationalAutoencoder(
        data_helper,
        batch_size=model_params["bs"],
        encoder_architecture=[
            model_params["encode_dim_zero"], model_params["encode_dim_one"]