예제 #1
0
def get_variant_key(args):
    """Load the variant-id -> rsid lookup described by ``args.snp_annotation_file``.

    ``args.snp_annotation_file`` is a sequence holding either:

    * one entry: a path, read with the ``variant_id`` /
      ``rs_id_dbSNP150_GRCh38p7`` columns and filtered through
      ``GenotypeUtilities.is_biallelic_variant``;
    * two entries whose second is the literal ``"METADATA"``: a path read
      with the ``id`` / ``rsid`` columns.

    In both cases a value of ``"."`` is converted to ``numpy.nan``.

    Raises:
        RuntimeError: if the argument has any other shape, or if the loaded
            lookup is falsy (e.g. empty).
    """
    def _dot_to_nan(value):
        return numpy.nan if value == "." else value

    annotation = args.snp_annotation_file
    n_entries = len(annotation)

    if n_entries == 1:
        lookup = KeyedDataSource.load_data(
            annotation[0], "variant_id", "rs_id_dbSNP150_GRCh38p7",
            value_conversion=_dot_to_nan,
            key_filter=GenotypeUtilities.is_biallelic_variant)
    elif n_entries == 2 and annotation[1] == "METADATA":
        lookup = KeyedDataSource.load_data(
            annotation[0], "id", "rsid", value_conversion=_dot_to_nan)
    else:
        lookup = None

    # Truthiness check on purpose: an empty lookup is rejected as well.
    if not lookup:
        raise RuntimeError("Need right info to process snp metadata")

    return lookup
예제 #2
0
def run(args):
    """Convert a genotype folder and expression phenotypes to parquet files.

    Steps, in order:
      1. make sure the folders needed for ``args.output_prefix`` exist;
      2. load the SNP annotation as a ``varID`` -> ``rsid_dbSNP150`` lookup;
      3. load the genotype files matching ``args.input_genotype_file_pattern``
         in ``args.input_genotype_folder``;
      4. save ``<prefix>.variants.parquet`` and
         ``<prefix>.variants_metadata.parquet``;
      5. call ``process_phenotype`` for every file reported by
         ``Utilities.file_logic`` for the phenotype folder/pattern.
    """
    started_at = timer()
    prefix = args.output_prefix
    Utilities.ensure_requisite_folders(prefix)

    logging.info("Loading SNP annotation")
    snp_key = KeyedDataSource.load_data(
        args.snp_annotation_file, "varID", "rsid_dbSNP150",
        should_skip=KeyedDataSource.skip_na)

    logging.info("Loading Genotype")
    loaded = ModelTraining.load_genotype_folder(
        args.input_genotype_folder, args.input_genotype_file_pattern, snp_key)
    genotype, individual_ids = loaded

    logging.info("Saving Genotype")
    Parquet.save_variants(
        prefix + ".variants.parquet", genotype, individual_ids)
    Parquet.save_metadata(
        prefix + ".variants_metadata.parquet", genotype)

    logging.info("Processing Expression Phenotype")
    phenotype_files = Utilities.file_logic(
        args.input_phenotype_folder, args.input_phenotype_expression_pattern)
    for entry in phenotype_files.itertuples():
        logging.info("Phenotype: %s", entry.name)
        process_phenotype(entry.path, entry.name, prefix)

    elapsed = timer() - started_at
    logging.info("Finished in %s", str(elapsed))
def run(args):
    """Convert a gzipped genotype text file into genotype + annotation files.

    Writes ``<output_prefix>_genotype.txt.gz`` (one dosage row per kept
    variant) and ``<output_prefix>_annotation.txt.gz`` (one metadata row per
    kept variant). If either output already exists, nothing is written.

    A variant is kept when its ``chr_pos_ref_alt_b38`` id is present in the
    SNP reference. If it is not, but the allele-swapped id is present and both
    alleles are single characters, the variant is kept with swapped alleles,
    frequency ``1 - af`` and dosages ``2 - dosage``.

    Each variant is written at most once: the direct match takes precedence
    over the swapped one. (The original code used a bare ``next`` expression
    where ``continue`` was intended, so a variant matching both orientations
    was emitted twice.)

    NOTE(review): input columns are read positionally as chromosome (0),
    position (2), ref (3), alt (4), allele frequency (5) and dosages (6+);
    column 1 is ignored. Confirm against the producer of ``args.genotype``.
    """
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Loading snp reference")
    key = KeyedDataSource.load_data(args.snp_reference_file,
                                    "variant_id",
                                    "rs_id_dbSNP150_GRCh38p7",
                                    value_conversion=KeyedDataSource.dot_to_na)
    logging.info("Loading samples")
    samples = TextFileTools.load_list(args.samples)
    # One slot for the variant id plus one per sample.
    genotype_format_string = "\t".join(["{}"] * (len(samples) + 1)) + "\n"

    og = args.output_prefix + "_genotype.txt.gz"
    oa = args.output_prefix + "_annotation.txt.gz"
    if os.path.exists(og) or os.path.exists(oa):
        logging.info("Output exists. Nope.")
        return

    logging.info("Processing")
    with gzip.open(args.genotype) as geno, \
            gzip.open(og, "w") as _og, \
            gzip.open(oa, "w") as _oa:
        _og.write(_to_gl(["varID"] + samples, genotype_format_string))
        _oa.write(
            _to_al([
                "chromosome", "position", "id", "allele_0", "allele_1",
                "allele_1_frequency", "rsid"
            ]))

        for line in geno:
            comps = line.decode().strip().split()

            chromosome = "chr" + comps[0]
            pos = comps[2]
            ref = comps[3]
            alt = comps[4]
            af = comps[5]
            dosage = comps[6:]

            # Direct orientation: reference/alternate as given in the input.
            var_id = "{}_{}_{}_{}_b38".format(chromosome, pos, ref, alt)
            if var_id in key:
                rsid = key[var_id]
                _og.write(
                    _to_gl([var_id] + dosage, genotype_format_string))
                _oa.write(
                    _to_al([chromosome, pos, var_id, ref, alt, af, rsid]))
                # Already emitted; do not also try the swapped orientation.
                continue

            # Swapped orientation, only attempted for single-character
            # alleles: flip the frequency and the dosages accordingly.
            var_id = "{}_{}_{}_{}_b38".format(chromosome, pos, alt, ref)
            if var_id in key and len(ref) == 1 and len(alt) == 1:
                rsid = key[var_id]
                af = str(1 - float(af))
                dosage = [str(2 - int(x)) for x in dosage]
                _og.write(
                    _to_gl([var_id] + dosage, genotype_format_string))
                _oa.write(
                    _to_al([chromosome, pos, var_id, alt, ref, af, rsid]))

    logging.info("Finished conversion")
def run(args):
    """Build a sqlite model database from a streamed table of per-gene weights.

    Reads ``args.input`` in per-``gene_id`` chunks, drops zero-weight rows
    (and, when ``args.snp_zscore_threshold`` is set, rows whose
    ``|beta / se|`` does not exceed it), and appends the surviving rows to a
    ``weights`` table. One row per gene that still has variants is collected
    into an ``extra`` table, after which indices are created.

    If ``args.output`` already exists, nothing is done.

    The sqlite connection is closed explicitly when done: using a
    ``sqlite3.Connection`` as a context manager only commits or rolls back
    the transaction, it does not close the connection.

    Raises:
        KeyError: if a gene is missing from the gene annotation or a variant
            is missing from the SNP annotation lookup.
    """
    if os.path.exists(args.output):
        logging.info("Output already exists, either delete it or move it")
        return

    logging.info("Loading snp names")
    snps = KeyedDataSource.load_data(args.snp_annotation, "variant_id", args.rsid_column)

    logging.info("Loading gene annotation")
    genes, types = _gene_annotation(args.gene_annotation)

    conn = sqlite3.connect(args.output)
    try:
        # Transaction scope: commit on success, roll back on error.
        with conn:
            logging.info("Processing")

            # NOTE(review): presumably yields one data frame per gene_id
            # (sentinel_column="gene_id") -- confirm in DataFrameStreamer.
            streamer = DataFrameStreamer.data_frame_streamer(args.input,
                header=["tissue_name", "gene_id", "variant_id", "weight", "beta", "se"],
                to_numeric=["weight", "beta", "se"], sentinel_column="gene_id")
            extra = []
            for i, d in enumerate(streamer):
                g_ = d.gene_id.values[0]
                logging.log(9, "processing %i:%s", i+1, g_)
                d = d.loc[d.weight != 0]
                if args.snp_zscore_threshold:
                    d = d.assign(zscore=numpy.abs(d.beta / d.se))
                    d = d.loc[d.zscore > args.snp_zscore_threshold]

                if d.shape[0] == 0:
                    logging.log(9, "no good snps left")
                    continue

                # Performance columns are unknown here and left as NaN.
                extra.append((g_, genes[g_], types[g_], d.shape[0], numpy.nan, numpy.nan, numpy.nan))

                d = d[["gene_id", "variant_id", "weight"]].rename(columns={"gene_id":"gene", "variant_id":"varID"})
                effect, non_effect, rsid = [], [], []
                for t in d.itertuples():
                    # varID fields are "_"-separated; index 2 is used as the
                    # non-effect allele and index 3 as the effect allele.
                    c_ = t.varID.split("_")
                    effect.append(c_[3])
                    non_effect.append(c_[2])
                    r_ = snps[t.varID]
                    # "." marks a missing rsid; fall back to the variant id.
                    rsid.append(r_ if r_ != "." else t.varID)
                d = d.assign(ref_allele = non_effect, eff_allele = effect, rsid = rsid)[["gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"]]
                d.to_sql("weights", conn, index=False, if_exists="append")

            extra = pandas.DataFrame(extra, columns=["gene", "genename", "gene_type", "n.snps.in.model", "pred.perf.R2","pred.perf.pval", "pred.perf.qval"])
            extra.to_sql("extra", conn, index=False)

            logging.info("Creating indices")
            Models.model_indexes(conn)
    finally:
        conn.close()

    logging.info("Finished building model.")
def run(args):
    """Build a model database from a table of gene/variant weights.

    Reads ``args.model_input`` (columns ``gene_id``, ``gene_name``,
    ``variant``, ``weight``), optionally restricts it to the gene/variant
    pairs returned by ``Miscellaneous.dapg_signals`` when
    ``args.model_filter`` is given with ``"PIP"`` as its second entry, attaches
    rsids and alleles, and hands the resulting ``weights`` and ``extra``
    frames to ``Models.create_model_db``.

    ``args.data_annotation`` holds either one entry (a path; rows with
    ``feature_type == "gene"`` are used) or two (a path and the feature type
    to select).

    If ``args.output`` already exists, nothing is done.

    Raises:
        RuntimeError: if ``args.data_annotation`` has neither one nor two
            entries.
        KeyError: if a variant in the model input is missing from the variant
            annotation lookup.
    """
    if os.path.exists(args.output):
        logging.info("Output exists. Nope.")
        return

    Utilities.ensure_requisite_folders(args.output)

    logging.info("Loading variant annotation")
    variants = KeyedDataSource.load_data(args.variant_annotation, "variant_id", args.rsid_column)

    logging.info("Loading data annotation")
    n_annotation_args = len(args.data_annotation)
    if n_annotation_args == 1:
        feature_type = "gene"
    elif n_annotation_args == 2:
        feature_type = args.data_annotation[1]
    else:
        raise RuntimeError("Unsupported annotation length")
    data_annotation = pandas.read_table(args.data_annotation[0])
    data_annotation = data_annotation[["gene_id", "gene_name", "feature_type", "gene_type"]][
        data_annotation.feature_type == feature_type].drop_duplicates()

    logging.info("Loading model_input")
    data = pandas.read_table(args.model_input, usecols=["gene_id", "gene_name", "variant", "weight"])

    logging.info("Processing")
    if args.model_filter and args.model_filter[1] == "PIP":
        w = Miscellaneous.dapg_signals(args.model_filter[0], float(args.model_filter[2]), variants)
        w = w.rename(columns={"gene":"gene_id", "variant_id":"variant"})
        # Inner merge keeps only the gene/variant pairs present in the filter.
        data = data.merge(w[["gene_id", "variant"]], on=["gene_id", "variant"])

    v = pandas.DataFrame([(k,variants[k]) for k in data.variant.drop_duplicates()], columns=["variant", "rsid"])
    # "." marks a missing rsid; fall back to the variant id itself.
    v.loc[v.rsid == ".", "rsid"] = v.loc[v.rsid == ".", "variant"]
    weights = data.merge(v, on="variant")

    # regex=True must be explicit: since pandas 2.0 the default is
    # regex=False, and a callable replacement is rejected in that mode.
    variant_pattern = r"(.*)_(.*)_(.*)_(.*)_b38"
    weights = weights.assign(
        ref_allele=weights.variant.str.replace(variant_pattern, lambda x: x.group(3), regex=True),
        eff_allele=weights.variant.str.replace(variant_pattern, lambda x: x.group(4), regex=True))
    weights = weights.rename(columns={"variant":"varID", "gene_id":"gene"})[["gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"]]

    extra = data.groupby("gene_id").size().to_frame("n.snps.in.model").reset_index()
    extra = extra.merge(data_annotation[["gene_id", "gene_name", "gene_type"]], on="gene_id")
    # Prediction performance is not available from this input.
    extra["pred.perf.pval"] = None
    extra["pred.perf.qval"] = None
    extra["pred.perf.R2"] = None
    extra = extra[["gene_id", "gene_name", "gene_type", "n.snps.in.model", "pred.perf.R2", "pred.perf.pval", "pred.perf.qval"]].rename(columns={"gene_id":"gene", "gene_name":"genename"})

    logging.info("Saving db")
    Models.create_model_db(args.output, extra, weights)

    logging.info("Done")
예제 #6
0
def run(args):
    """Add a ``gene_name`` column to a table, looked up from a gene table.

    The input is read with ``pandas.read_csv`` when ``args.input`` contains
    ``".csv"`` and with ``pandas.read_table`` otherwise. The output separator
    is a comma when ``args.output`` contains ``".csv"`` and a tab otherwise.
    Every value of the input's ``gene`` column must be a key of the gene
    table, otherwise a ``KeyError`` propagates.
    """
    reader = pandas.read_csv if ".csv" in args.input else pandas.read_table
    if ".csv" in args.output:
        sep = ","
    else:
        sep = "\t"

    logging.info("Loading gene table")
    name_by_gene_id = KeyedDataSource.load_data(args.gene_table, "gene_id", "gene_name")

    logging.info("Loading input")
    table = reader(args.input)

    table["gene_name"] = [name_by_gene_id[row.gene] for row in table.itertuples()]

    logging.info("saving")
    Utilities.save_dataframe(table, args.output, sep=sep)

    logging.info("Done")