Example #1
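Builds a SQLite model database from a weight specification: variant metadata is merged onto the weights, and "weights" and "extra" tables are written along with their indexes.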
# Imports needed by this snippet. Utilities, StudyUtilities and Models are
# project-internal modules from the surrounding source tree, as is get_weights.
import os
import logging
import sqlite3

import pyarrow.parquet as pq

def run(args):
    if os.path.exists(args.output):
        logging.info("output exists already, delete it or move it")
        return

    logging.info("Starting")
    Utilities.ensure_requisite_folders(args.output)

    logging.info("Loading data annotation")
    gene_annotation = StudyUtilities.load_gene_annotation(args.gene_annotation)
    gene_annotation = gene_annotation.rename(
        {"gene_name": "genename"},
        axis=1)[["gene_id", "genename", "gene_type"]]

    logging.info("Loading variant annotation")
    features_metadata = pq.read_table(args.features_annotation).to_pandas()

    logging.info("Loading spec")
    weights = get_weights(args.spec)

    w = weights.merge(features_metadata[["id", "allele_0", "allele_1",
                                         "rsid"]],
                      on="id",
                      how="left")
    w = w.rename(
        {
            "allele_0": "ref_allele",
            "allele_1": "eff_allele",
            "id": "varID"
        },
        axis=1)
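    # The model key concatenates gene and cluster: "<gene_id>_<cluster_id>".
    # The spec's own weight column ("w") is discarded; every variant gets weight 1.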
    w["gene"] = w.gene_id.str.cat(w.cluster_id.astype(str), sep="_")
    w = w.drop(["w", "cluster_id"], axis=1)
    w = w.sort_values(by="gene").assign(weight=1)

    logging.info("Building models")
    with sqlite3.connect(args.output) as conn:
        w.drop("gene_id", axis=1).fillna("NA")[[
            "gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"
        ]].to_sql("weights", conn, index=False)
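        # "weights" holds one row per (gene, variant) pair; the dotted-name
        # per-gene metadata below (n.snps.in.model, pred.perf.*) follows the
        # PrediXcan model-database convention.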

        e = w[["gene_id", "gene"]].merge(gene_annotation,
                                         on="gene_id").drop("gene_id", axis=1)
        e["n_snps_in_window"] = None
        e["n.snps.in.model"] = 1
        e["pred.perf.pval"] = None
        e["pred.perf.qval"] = None
        e["pred.perf.R2"] = None
        e = e[[
            "gene", "genename", "gene_type", "n_snps_in_window",
            "n.snps.in.model", "pred.perf.R2", "pred.perf.pval",
            "pred.perf.qval"
        ]]

        e.to_sql("extra", conn, index=False)

        Models.model_indexes(conn)

    logging.info("Finished")
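For context, here is a minimal sketch of how run() might be wired to a command line. The flag names are inferred from the attributes the function reads (output, gene_annotation, features_annotation, spec) and are hypothetical; the project defines its actual CLI elsewhere.

# Hypothetical entry point; flag names inferred from the attributes run() reads.
import argparse
import logging

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Build a SQLite model database from a weight spec")
    parser.add_argument("-output", help="path of the SQLite database to create")
    parser.add_argument("-gene_annotation", help="gene annotation input")
    parser.add_argument("-features_annotation", help="parquet variant metadata")
    parser.add_argument("-spec", help="weight specification file")
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    run(args)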
Example #2
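Trains per-gene prediction models (elastic net or ordinary least squares) from parquet genotype features, writing gzipped weights, summary, covariance, and run-metadata files.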
# Imports needed by this snippet. Project-internal helpers (Utilities,
# StudyUtilities, TextFileTools, get_weights, set_seed, process,
# train_elastic_net_wrapper, train_ols) come from the surrounding source tree.
import os
import gzip
import logging

import numpy
import pandas
import pyarrow.parquet as pq

def run(args):
    wp = args.output_prefix + "_weights.txt.gz"
    if os.path.exists(wp):
        logging.info("Weights output exists already, delete it or move it")
        return

    sp = args.output_prefix + "_summary.txt.gz"
    if os.path.exists(sp):
        logging.info("Summary output exists already, delete it or move it")
        return

    cp = args.output_prefix + "_covariance.txt.gz"
    if os.path.exists(cp):
        logging.info("Covariance output exists already, delete it or move it")
        return

    r = args.output_prefix + "_run.txt.gz"
    if os.path.exists(r):
        logging.info("Run output exists already, delete it or move it")
        return

    logging.info("Starting")
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Opening data")
    data = pq.ParquetFile(args.data)
    available_data = {x for x in data.metadata.schema.names}

    logging.info("Loading data annotation")
    data_annotation = StudyUtilities.load_gene_annotation(
        args.data_annotation, args.chromosome, args.sub_batches,
        args.sub_batch)
    data_annotation = data_annotation[data_annotation.gene_id.isin(
        available_data)]
    if args.gene_whitelist:
        logging.info("Applying gene whitelist")
        data_annotation = data_annotation[data_annotation.gene_id.isin(
            set(args.gene_whitelist))]
    logging.info("Kept %i entries", data_annotation.shape[0])

    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(args.features_annotation).to_pandas()
    else:
        features_metadata = pq.ParquetFile(
            args.features_annotation).read_row_group(args.chromosome -
                                                     1).to_pandas()
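        # Assumes the annotation's row groups are stored one per chromosome,
        # so chromosome N lives in row group N-1.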

    if args.chromosome and args.sub_batches:
        logging.info("Trimming variants")
        features_metadata = StudyUtilities.trim_variant_metadata_on_gene_annotation(
            features_metadata, data_annotation, args.window)

    if args.rsid_whitelist:
        logging.info("Filtering features annotation")
        whitelist = TextFileTools.load_list(args.rsid_whitelist)
        whitelist = set(whitelist)
        features_metadata = features_metadata[features_metadata.rsid.isin(
            whitelist)]

    if args.features_weights:
        logging.info("Loading weights")
        x_weights = get_weights(args.features_weights, set(features_metadata.id))
        logging.info(
            "Filtering features metadata to those available in weights")
        features_metadata = features_metadata[features_metadata.id.isin(
            x_weights.id)]
        logging.info("Kept %d entries", features_metadata.shape[0])
    else:
        x_weights = None

    logging.info("Opening features")
    features = pq.ParquetFile(args.features)

    logging.info("Setting R seed")
    s = numpy.random.randint(1e8)
    set_seed(s)
    if args.run_tag:
        d = pandas.DataFrame({
            "run": [args.run_tag],
            "cv_seed": [s]
        })[["run", "cv_seed"]]
        Utilities.save_dataframe(d, r)
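        # Persisting the seed alongside the run tag makes the sampled
        # cross-validation folds reproducible.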

    WEIGHTS_FIELDS = [
        "gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"
    ]
    SUMMARY_FIELDS = [
        "gene", "genename", "gene_type", "alpha", "n_snps_in_window",
        "n.snps.in.model", "test_R2_avg", "test_R2_sd", "cv_R2_avg",
        "cv_R2_sd", "in_sample_R2", "nested_cv_fisher_pval",
        "nested_cv_converged", "rho_avg", "rho_se", "rho_zscore",
        "pred.perf.R2", "pred.perf.pval", "pred.perf.qval"
    ]

    train = train_elastic_net_wrapper if args.mode == "elastic_net" else train_ols

    with gzip.open(wp, "w") as w:
        w.write(("\t".join(WEIGHTS_FIELDS) + "\n").encode())
        with gzip.open(sp, "w") as s:
            s.write(("\t".join(SUMMARY_FIELDS) + "\n").encode())
            with gzip.open(cp, "w") as c:
                c.write("GENE RSID1 RSID2 VALUE\n".encode())
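                # Note: the covariance file is space-delimited; the weights and
                # summary files are tab-delimited.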
                for i, data_annotation_ in enumerate(
                        data_annotation.itertuples()):
                    if args.MAX_M and i >= args.MAX_M:
                        logging.info("Early abort")
                        break
                    logging.log(9, "processing %i/%i:%s", i + 1,
                                data_annotation.shape[0],
                                data_annotation_.gene_id)
                    if args.repeat:
                        for j in range(0, args.repeat):
                            logging.log(9, "%i-th reiteration", j)
                            process(w, s, c, data, data_annotation_, features,
                                    features_metadata, x_weights,
                                    SUMMARY_FIELDS, train, j,
                                    nested_folds=args.nested_cv_folds)
                    else:
                        process(w,
                                s,
                                c,
                                data,
                                data_annotation_,
                                features,
                                features_metadata,
                                x_weights,
                                SUMMARY_FIELDS,
                                train,
                                nested_folds=args.nested_cv_folds)

    logging.info("Finished")
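Example #3
An extended variant of Example #2: the same outputs, plus variant-level filters (call rate, imputation R2, dosage variance, palindromic SNPs, optional rsid de-duplication) and a shared-individuals check. Imports as in Example #2.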
def run(args):
    wp = args.output_prefix + "_weights.txt.gz"
    if os.path.exists(wp):
        logging.info("Weights output exists already, delete it or move it")
        return

    sp = args.output_prefix + "_summary.txt.gz"
    if os.path.exists(sp):
        logging.info("Summary output exists already, delete it or move it")
        return

    cp = args.output_prefix + "_covariance.txt.gz"
    if os.path.exists(cp):
        logging.info("Covariance output exists already, delete it or move it")
        return

    r = args.output_prefix + "_run.txt.gz"
    if os.path.exists(r):
        logging.info("Run output exists already, delete it or move it")
        return

    logging.info("Starting")
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Opening data")
    data = pq.ParquetFile(args.data)
    available_data = {x for x in data.metadata.schema.names}

    logging.info("Loading data annotation")
    data_annotation = StudyUtilities.load_gene_annotation(args.data_annotation, args.chromosome, args.sub_batches, args.sub_batch, args.simplify_data_annotation)
    data_annotation = data_annotation[data_annotation.gene_id.isin(available_data)]
    if args.gene_whitelist:
        logging.info("Applying gene whitelist")
        data_annotation = data_annotation[data_annotation.gene_id.isin(set(args.gene_whitelist))]
    logging.info("Kept %i entries", data_annotation.shape[0])

    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(args.features_annotation).to_pandas()
    else:
        features_metadata = pq.ParquetFile(args.features_annotation).read_row_group(args.chromosome-1).to_pandas()

    if args.output_rsids:
        if not args.keep_highest_frequency_rsid_entry and features_metadata[(features_metadata.rsid != "NA") & features_metadata.rsid.duplicated()].shape[0]:
            logging.warning("Several variants map to the same rsid (hint: multiple INDELs?).\n"
                            "Can't proceed. Consider using the --keep_highest_frequency_rsid_entry flag, or models will be ill-defined.")
            return

    if args.chromosome and args.sub_batches:
        logging.info("Trimming variants")
        features_metadata = StudyUtilities.trim_variant_metadata_on_gene_annotation(features_metadata, data_annotation, args.window)
        logging.info("Kept %d", features_metadata.shape[0])

    if args.variant_call_filter:
        logging.info("Filtering variants by average call rate")
        features_metadata = features_metadata[features_metadata.avg_call > args.variant_call_filter]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.variant_r2_filter:
        logging.info("Filtering variants by imputation R2")
        features_metadata = features_metadata[features_metadata.r2 > args.variant_r2_filter]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.variant_variance_filter:
        logging.info("Filtering variants by (dosage/2)'s variance")
        features_metadata = features_metadata[features_metadata["std"]/2 > numpy.sqrt(args.variant_variance_filter)]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.discard_palindromic_snps:
        logging.info("Discarding palindromic snps")
        features_metadata = Genomics.discard_gtex_palindromic_variants(features_metadata)
        logging.info("Kept %d", features_metadata.shape[0])

    if args.rsid_whitelist:
        logging.info("Filtering features annotation for whitelist")
        whitelist = TextFileTools.load_list(args.rsid_whitelist)
        whitelist = set(whitelist)
        features_metadata = features_metadata[features_metadata.rsid.isin(whitelist)]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.only_rsids:
        logging.info("discarding non-rsids")
        features_metadata = StudyUtilities.trim_variant_metadata_to_rsids_only(features_metadata)
        logging.info("Kept %d", features_metadata.shape[0])

        if args.keep_highest_frequency_rsid_entry and features_metadata[(features_metadata.rsid != "NA") & features_metadata.rsid.duplicated()].shape[0]:
            logging.info("Keeping only the highest frequency entry for every rsid")
            k = features_metadata[["rsid", "allele_1_frequency", "id"]].copy()
            k.loc[k.allele_1_frequency > 0.5, "allele_1_frequency"] = 1 - k.loc[k.allele_1_frequency > 0.5, "allele_1_frequency"]
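            # Folding to minor-allele frequency lets the descending sort put
            # the highest-MAF entry first within each rsid group.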
            k = k.sort_values(by=["rsid", "allele_1_frequency"], ascending=False)
            k = k.groupby("rsid").first().reset_index()
            features_metadata = features_metadata[features_metadata.id.isin(k.id)]
            logging.info("Kept %d", features_metadata.shape[0])
        else:
            logging.info("rsids are unique, no need to restrict to highest frequency entry")

    if args.features_weights:
        logging.info("Loading weights")
        x_weights = get_weights(args.features_weights, {x for x in features_metadata.id})
        logging.info("Filtering features metadata to those available in weights")
        features_metadata = features_metadata[features_metadata.id.isin(x_weights.id)]
        logging.info("Kept %d entries", features_metadata.shape[0])
    else:
        x_weights = None

    logging.info("Opening features")
    features = pq.ParquetFile(args.features)

    logging.info("Setting R seed")
    s = numpy.random.randint(1e8)
    set_seed(s)
    if args.run_tag:
        d = pandas.DataFrame({"run":[args.run_tag], "cv_seed":[s]})[["run", "cv_seed"]]
        Utilities.save_dataframe(d, r)

    WEIGHTS_FIELDS = ["gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"]
    SUMMARY_FIELDS = ["gene", "genename", "gene_type", "alpha", "n_snps_in_window", "n.snps.in.model",
                      "test_R2_avg", "test_R2_sd", "cv_R2_avg", "cv_R2_sd", "in_sample_R2", "nested_cv_fisher_pval",
                      "nested_cv_converged", "rho_avg", "rho_se", "rho_zscore", "pred.perf.R2", "pred.perf.pval", "pred.perf.qval"]

    train = train_elastic_net_wrapper if args.mode == "elastic_net" else train_ols

    available_individuals = check_missing(args, data, features)
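    # check_missing is a project helper; presumably it reconciles the set of
    # individuals present in both the data and the features files.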

    with gzip.open(wp, "w") as w:
        w.write(("\t".join(WEIGHTS_FIELDS) + "\n").encode())
        with gzip.open(sp, "w") as s:
            s.write(("\t".join(SUMMARY_FIELDS) + "\n").encode())
            with gzip.open(cp, "w") as c:
                c.write("GENE RSID1 RSID2 VALUE\n".encode())
                for i, data_annotation_ in enumerate(data_annotation.itertuples()):
                    if args.MAX_M and i >= args.MAX_M:
                        logging.info("Early abort")
                        break
                    logging.log(9, "processing %i/%i:%s", i + 1, data_annotation.shape[0], data_annotation_.gene_id)

                    if args.repeat:
                        for j in range(0, args.repeat):
                            logging.log(9, "%i-th reiteration", j)
                            process(w, s, c, data, data_annotation_, features, features_metadata, x_weights, SUMMARY_FIELDS, train, j, nested_folds=args.nested_cv_folds, use_individuals=available_individuals)
                    else:
                        process(w, s, c, data, data_annotation_, features, features_metadata, x_weights, SUMMARY_FIELDS, train, nested_folds=args.nested_cv_folds, use_individuals=available_individuals)

    logging.info("Finished")