示例#1
0
def run(args):
    """Build a variant metadata table from a genotype file and save it.

    Returns immediately (after logging) when ``args.output`` already exists.

    Attributes read from ``args``:
        output: path handed to ``Utilities.save_dataframe``; if it already
            exists nothing is done.
        filter: optional list of filter specs. Each spec is a sequence whose
            first item is the filter name and whose remaining items are its
            arguments. ``None`` or an empty list means "no filters".
            ``MAF`` uses its first argument as a float threshold;
            ``TOP_CHR_POS_BY_FREQ`` is a flag and reads no arguments.
        annotation: sequence whose first item is passed to
            ``GTExMisc.load_gtex_variant_to_rsid``.
        genotype: passed to ``ModelTraining.dosage_generator``.
    """
    if os.path.exists(args.output):
        logging.info("Output exists. Nope")
        return

    # ``args.filter`` may be None when no filter was requested (that is what
    # argparse stores for an option that was never passed), so fall back to
    # an empty list instead of failing on iteration.
    filters = {spec[0]: spec[1:] for spec in (args.filter or [])}
    maf_filter = float(filters["MAF"][0]) if "MAF" in filters else None

    logging.info("Loading GTEX variant map")
    gtex_snp_key = GTExMisc.load_gtex_variant_to_rsid(args.annotation[0])

    logging.info("Processing genotype")
    rows = []
    variants = ModelTraining.dosage_generator(
        args.genotype,
        gtex_snp_key,
        dosage_conversion=ModelTraining._mean,
        do_none=True)
    for mean, metadata, _ids in variants:
        if maf_filter:
            # Fold the frequency so it never exceeds 0.5 before comparing.
            # NOTE(review): assumes ``mean`` is a mean dosage on a 0-2
            # scale -- confirm against ModelTraining._mean.
            frequency = mean / 2 if mean < 1 else 1 - mean / 2
            if frequency < maf_filter:
                continue
        rows.append(metadata)

    columns = [x[1] for x in Genotype.MetadataTFE.order]
    table = Utilities.to_dataframe(rows, columns)
    if "TOP_CHR_POS_BY_FREQ" in filters:
        logging.info("Simplifying multi-allelic variants")
        table = Genotype._monoallelic_by_frequency(table)

    logging.info("Saving...")
    Utilities.save_dataframe(table, args.output)
    logging.info("Finished")
示例#2
0
def _process(d, key_to_snp, how="left"):
    """Attach rsids to an association table and normalise its columns.

    Args:
        d: data frame with a ``variant_id`` column plus the columns needed to
            end up with every field of ``GTEx.GTExAllAssociations`` after the
            renames below (``gene_id``, ``pval_nominal``, ``slope``,
            ``slope_se`` are renamed to ``gene``, ``pvalue``, ``beta``,
            ``se``).
        key_to_snp: mapping from variant id to rsid; ids missing from it get
            no mapping row.
        how: join type forwarded to ``merge``. With the default ``"left"``
            rows whose variant has no rsid are kept.

    Returns:
        A data frame restricted to ``GTEx.GTExAllAssociations._fields`` (in
        that order) with ``maf``, ``beta`` and ``se`` cast to float32.
    """
    # Build one mapping row per *distinct* variant. ``d`` may list the same
    # variant on several rows; duplicated keys on the right-hand side would
    # make the merge emit every combination and multiply those rows.
    known = [(variant, key_to_snp[variant])
             for variant in d.variant_id.unique()
             if variant in key_to_snp]
    mapping = Utilities.to_dataframe(known, ["variant_id", "rsid"])

    d = d.merge(mapping, on="variant_id", how=how)
    d = d.rename(columns={"gene_id":"gene", "pval_nominal":"pvalue", "slope":"beta", "slope_se":"se"})
    d = d[list(GTEx.GTExAllAssociations._fields)]
    d = d.assign(
        maf=d.maf.astype(numpy.float32),
        beta=d.beta.astype(numpy.float32),
        se=d.se.astype(numpy.float32),
    )
    return d
示例#3
0
def run(args):
    """Count variants for each region in ``args.regions`` and save the counts.

    Opens ``args.parquet_genotype_metadata`` as a parquet file, walks the
    regions file line by line (the first line is skipped), and calls
    ``count_variants`` with the first three whitespace-separated fields of
    each line. The resulting ``chromosome``/``start``/``end``/``count`` table
    is written to ``args.output``.
    """
    logging.info("Starting process")

    vf = pq.ParquetFile(args.parquet_genotype_metadata)

    # Both values are returned by count_variants and handed back to it on the
    # next call. NOTE(review): presumably a per-chromosome cache -- confirm
    # in count_variants.
    metadata = None
    previous_chromosome = None

    rows = []
    for index, line in Utilities.iterate_file(args.regions):
        if index == 0:
            # First line is not a region (presumably a header row).
            continue

        fields = line.strip().split()
        chromosome, start, end = fields[0], fields[1], fields[2]
        count, metadata, previous_chromosome = count_variants(
            chromosome, start, end, vf, metadata, previous_chromosome, args)
        rows.append((chromosome, start, end, count))

    table = Utilities.to_dataframe(
        rows, ["chromosome", "start", "end", "count"])
    Utilities.save_dataframe(table, args.output)
    logging.info("Finished process")