def by_chromosome(context, chromosome):
    """Compute and emit a variant covariance/correlation matrix per region.

    Loads the variant metadata row group for ``chromosome`` from
    ``context.vmf``, optionally filters it by allele frequency, and then,
    for every entry of ``context.regions`` whose ``chr`` equals
    ``"chr<chromosome>"``, reads the genotype columns of the variants that
    fall inside the region padded by ``args.window`` on each side and hands
    the resulting matrix to ``context.sink(matrix, ids, region)``.

    Args:
        context: Object providing ``vmf``, ``regions``, ``args``,
            ``get_genotype_file(chromosome)`` and ``sink(cov, ids, region)``.
        chromosome: Integer chromosome number (1-based; row group
            ``chromosome - 1`` is read from the metadata file).

    Notes:
        * Regions whose padded window contains no variants are skipped
          (nothing is sent to the sink for them).
        * The matrix passed to the sink is always 2-D, including the
          single-variant case where numpy would otherwise return a 0-d value.
    """
    # NOTE(review): frequency_filter and window are read from a module-level
    # `args`, while standardise_geno is read from `context.args` -- confirm
    # these refer to the same parsed arguments.
    vm = context.vmf.read_row_group(chromosome - 1).to_pandas()
    if args.frequency_filter:
        vm = filter_by_frequency(vm, args.frequency_filter)
    g = context.get_genotype_file(chromosome)

    regions = context.regions
    regions = regions[regions.chr == "chr{}".format(chromosome)]
    n_regions = regions.shape[0]

    for i, region in enumerate(regions.itertuples()):
        logging.log(9, "Processing region in chr %d: %d/%d", chromosome, i + 1, n_regions)
        vmw = Genomics.entries_for_window(
            chromosome, region.start - args.window, region.stop + args.window, vm
        )
        ids = vmw.id.values
        logging.log(9, "%d variants", len(ids))

        if len(ids) == 0:
            # numpy.cov on an empty matrix only yields NaN plus warnings and
            # then breaks the shape lookup below; there is nothing to emit.
            logging.log(9, "No variants in window, skipping region")
            continue

        d = Parquet._read(g, columns=ids, skip_individuals=True)
        # One row per variant, in the same order as `ids`.
        d = numpy.array([d[x] for x in ids], dtype=numpy.float32)

        if context.args.standardise_geno:
            # `ddof` is deprecated in numpy.corrcoef and has no effect, so it
            # is intentionally not passed.
            cov = numpy.corrcoef(d)
        else:
            cov = numpy.cov(d)
        # numpy squeezes the result, so a single variant gives a 0-d value;
        # restore the (n_variants, n_variants) shape before using it.
        cov = numpy.atleast_2d(cov).astype(numpy.float32, copy=False)

        logging.log(9, "%d rows", cov.shape[0])
        context.sink(cov, ids, region)
def count_variants(chromosome, start, end, vf, m, last_chromosome, args):
    """Count the variants that fall inside one genomic window.

    The variant metadata table is cached between calls: the caller passes in
    the table (``m``) and chromosome number (``last_chromosome``) returned by
    the previous call, and a new row group is only read from ``vf`` when the
    chromosome changes.

    Args:
        chromosome: Chromosome label of the form ``"chr<N>"`` with integer N.
        start: Window start; converted with ``int``.
        end: Window end; converted with ``int``.
        vf: Metadata file object exposing ``read_row_group(i).to_pandas()``.
        m: Metadata table cached from the previous call.
        last_chromosome: Chromosome number that ``m`` was loaded for.
        args: Object with a ``frequency_filter`` attribute; when truthy, only
            rows with ``allele_1_frequency`` strictly between the threshold
            and ``1 - threshold`` are kept.

    Returns:
        A tuple ``(count, m, last_chromosome)``. ``count`` is the number of
        rows returned by ``Genomics.entries_for_window`` or the string
        ``"NA"`` when the window could not be processed. ``m`` and
        ``last_chromosome`` are the (possibly refreshed) cache values to pass
        to the next call; they are left untouched if loading fails.
    """
    try:
        chrom_number = int(chromosome.split("chr")[1])
        window_start = int(start)
        window_end = int(end)

        if chrom_number != last_chromosome:
            logging.info("Reading chromosome %d", chrom_number)
            loaded = vf.read_row_group(chrom_number - 1).to_pandas()
            if args.frequency_filter:
                logging.log(9, "Filtering by frequency")
                threshold = args.frequency_filter
                frequency = loaded.allele_1_frequency
                loaded = loaded[(frequency > threshold) & (frequency < 1 - threshold)]
            # Commit the cache only once the table is fully prepared, so a
            # failure above cannot leave an unfiltered table marked current.
            m = loaded
            last_chromosome = chrom_number

        window = Genomics.entries_for_window(chrom_number, window_start, window_end, m)
        count = window.shape[0]
    except Exception:
        # Best effort: unparseable or unreadable windows are reported as "NA".
        # KeyboardInterrupt/SystemExit are deliberately allowed to propagate.
        logging.log(
            9, "Could not count variants for %s:%s-%s", chromosome, start, end, exc_info=True
        )
        count = "NA"
    return count, m, last_chromosome