示例#1
0
def by_chromosome(context, chromosome):
    """Emit one variant covariance (or correlation) matrix per region.

    Reads the variant metadata row group for ``chromosome``, optionally
    filters it by frequency, and then, for every region of
    ``context.regions`` on that chromosome, loads the genotype columns of the
    variants that fall inside the region's window and hands the resulting
    matrix to ``context.sink``.

    Args:
        context: Object providing ``vmf`` (read via ``read_row_group``),
            ``get_genotype_file``, ``regions`` (with ``chr``, ``start`` and
            ``stop`` columns), ``args.standardise_geno`` and ``sink``.
        chromosome: Chromosome number. Row group ``chromosome - 1`` is read
            and regions whose ``chr`` equals ``"chr<chromosome>"`` are used.

    Note:
        ``frequency_filter`` and ``window`` are read from the module-level
        ``args`` object, while ``standardise_geno`` comes from
        ``context.args``.
        NOTE(review): presumably both are the same parsed arguments - confirm.
    """
    vm = context.vmf.read_row_group(chromosome - 1).to_pandas()
    if args.frequency_filter:
        vm = filter_by_frequency(vm, args.frequency_filter)

    g = context.get_genotype_file(chromosome)

    regions = context.regions
    regions = regions[regions.chr == "chr{}".format(chromosome)]
    n_regions = regions.shape[0]

    for i, region in enumerate(regions.itertuples(), start=1):
        logging.log(9, "Processing region in chr %d: %d/%d", chromosome, i,
                    n_regions)
        vmw = Genomics.entries_for_window(chromosome,
                                          region.start - args.window,
                                          region.stop + args.window, vm)
        ids = vmw.id.values
        logging.log(9, "%d variants", len(ids))
        if len(ids) == 0:
            # Nothing to correlate: numpy.cov of an empty array is a
            # meaningless NaN scalar, so skip the region instead of failing.
            logging.info("No variants in region %d/%d of chr %d, skipping",
                         i, n_regions, chromosome)
            continue

        d = Parquet._read(g, columns=ids, skip_individuals=True)
        # One row per variant, in the same order as `ids`.
        d = numpy.array([d[x] for x in ids], dtype=numpy.float32)
        if context.args.standardise_geno:
            # `ddof` is deprecated in numpy.corrcoef and has no effect, so it
            # is not passed.
            cov = numpy.corrcoef(d)
        else:
            cov = numpy.cov(d)
        # A single variant makes numpy return a 0-d result; keep the matrix
        # 2-D so that `cov.shape[0]` and the sink always see (n, n).
        cov = numpy.atleast_2d(cov).astype(numpy.float32, copy=False)
        logging.log(9, "%d rows", cov.shape[0])
        context.sink(cov, ids, region)
示例#2
0
def count_variants(chromosome, start, end, vf, m, last_chromosome, args):
    """Count the variant metadata entries that fall inside a window.

    The metadata of the most recently read chromosome is passed in and
    returned again, so consecutive calls for the same chromosome reuse it
    instead of reading the row group once more.

    Args:
        chromosome: Chromosome label containing ``"chr"`` followed by the
            chromosome number, e.g. ``"chr1"``.
        start: Window start; converted with ``int()``.
        end: Window end; converted with ``int()``.
        vf: Object whose ``read_row_group(chromosome - 1).to_pandas()`` yields
            the metadata of one chromosome.
        m: Metadata kept from a previous call; replaced when ``chromosome``
            differs from ``last_chromosome``.
        last_chromosome: Chromosome number that ``m`` belongs to.
        args: Object with a ``frequency_filter`` attribute. When truthy, only
            rows whose ``allele_1_frequency`` lies strictly between the
            filter and ``1 - filter`` are kept.

    Returns:
        A ``(count, m, last_chromosome)`` tuple. ``count`` is the number of
        rows returned by ``Genomics.entries_for_window``, or the string
        ``"NA"`` when the input could not be parsed or processed.
    """
    try:
        chromosome = int(chromosome.split("chr")[1])
        start = int(start)
        end = int(end)
        if chromosome != last_chromosome:
            logging.info("Reading chromosome %d", chromosome)
            m = vf.read_row_group(chromosome - 1).to_pandas()
            last_chromosome = chromosome
            if args.frequency_filter:
                logging.log(9, "Filtering by frequency")
                m = m[(m.allele_1_frequency > args.frequency_filter)
                      & (m.allele_1_frequency < 1 - args.frequency_filter)]
        v = Genomics.entries_for_window(chromosome, start, end, m)
        count = v.shape[0]
    except Exception as e:
        # Best effort: unparseable or unreadable rows are reported as "NA".
        # Only Exception is caught so that KeyboardInterrupt and SystemExit
        # still propagate, and the reason is logged rather than lost.
        logging.log(9, "Could not count variants for %s:%s-%s (%s)",
                    chromosome, start, end, e)
        count = "NA"
    return count, m, last_chromosome