def split(args):
    """Tile every reference of the position-sorted BAM into parallel chunks.

    Each reference listed in the header of ``args.possorted_bam`` is cut into
    consecutive windows of ``tenkit.constants.PARALLEL_LOCUS_SIZE`` bases; the
    final window of a reference ends at the reference length.

    Args:
        args: object exposing a ``possorted_bam`` attribute that
            ``tk_bam.create_bam_infile`` can open.

    Returns:
        A dict with two keys: ``'chunks'``, a list of
        ``{'locus': <locus info>, '__mem_gb': 4}`` dicts (one per window), and
        ``'join'``, the dict ``{'__mem_gb': 8.0}``.
    """
    input_bam = tk_bam.create_bam_infile(args.possorted_bam)

    # chromosomes should be ordered as chr1, chr11, chr12, ..., chr19, chr2
    # Iterate the sorted (name, length) pairs directly instead of transposing
    # them back into two lists: the transpose cannot be unpacked when the
    # header has no references, whereas this simply yields no chunks.
    sorted_refs = sorted(zip(input_bam.references, input_bam.lengths))

    locus_size = tenkit.constants.PARALLEL_LOCUS_SIZE
    loci = []
    for (chrom, length) in sorted_refs:
        start = 0
        # Emit full-size windows while one fits strictly inside the reference.
        while start + locus_size < length:
            stop = start + locus_size
            loci.append({
                'locus': tk_io.create_locus_info(chrom, start, stop),
                '__mem_gb': 4,
            })
            start = stop
        # Trailing (possibly shorter) window up to the end of the reference.
        loci.append({
            'locus': tk_io.create_locus_info(chrom, start, length),
            '__mem_gb': 4,
        })

    return {'chunks': loci, 'join': {'__mem_gb': 8.0}}
def get_sized_bam_chunks(bam_fn, gb_per_chunk, contig_whitelist=None, target_regions=None, extra_args=None):
    '''Divide a BAM file into disjoint loci with a max compressed size of roughly gb_per_chunk.

    Args:
        bam_fn: path of the BAM file to chunk.
        gb_per_chunk: target size of each chunk, in GB of the BAM file.
        contig_whitelist: if supplied, only contigs contained in it are
            chunked; all other contigs are skipped.
        target_regions: passed to adjust_start() for every chunk boundary
            other than position 0; per the original documentation, boundaries
            are adjusted to avoid on-target regions.
        extra_args: optional dict merged into every returned chunk dict.
            Defaults to no extra keys.

    Returns:
        A list of dicts, one per locus, each holding the key 'locus' plus the
        entries of extra_args.
    '''
    # Avoid a shared mutable default argument.
    if extra_args is None:
        extra_args = {}

    total_size = os.path.getsize(bam_fn)
    bam = pysam.Samfile(bam_fn)
    try:
        # File offset at which each contig starts. A contig for which
        # get_voffset() returns None inherits the previous known offset, so
        # its computed size below is zero.
        file_starts = []
        last_pos = 0
        for chrom in bam.references:
            offset = get_voffset(bam, chrom, 0)
            if offset is None:
                offset = last_pos
            else:
                last_pos = offset
            file_starts.append(offset)

        # Size of each contig = distance to the next contig's start; the last
        # contig runs to the end of the file.
        file_sizes = [nxt - cur for (cur, nxt) in zip(file_starts, file_starts[1:])]
        file_sizes.append(total_size - file_starts[-1])

        loci = []
        for (chrom, file_start, file_size, chrom_size) in zip(bam.references, file_starts, file_sizes, bam.lengths):
            if contig_whitelist is not None and chrom not in contig_whitelist:
                continue

            n_chunks = max(1, int(math.ceil(float(file_size) / 1e9 / gb_per_chunk)))
            chunk_size = int(file_size / n_chunks)

            chunk_starts = []
            for i in range(n_chunks):
                if i == 0:
                    pos = 0
                else:
                    voffset = file_start + chunk_size * i
                    _pos = find_pos_of_voffset(bam, chrom, voffset, err=chunk_size/20)
                    pos = adjust_start(chrom, _pos, target_regions)

                # Don't create very small chunks
                if len(chunk_starts) > 0 and pos - chunk_starts[-1] < 100:
                    continue
                chunk_starts.append(pos)

            # Each chunk ends where the next one starts; the last chunk ends
            # at the end of the contig.
            chunk_ends = chunk_starts[1:] + [chrom_size]
            for (start, end) in zip(chunk_starts, chunk_ends):
                loci.append(tk_io.create_locus_info(chrom, start, end))

        validate_loci(bam, loci, contig_whitelist)
    finally:
        # The handle is only needed while computing and validating the loci.
        bam.close()

    chunks = []
    for locus in loci:
        chunk = {'locus': locus}
        chunk.update(extra_args)
        chunks.append(chunk)
    return chunks
def generate_tiling_windows(input_bam, locus_size, overlap=0):
    '''
    Generate a list of loci that tile over all the references in the bam file.

    Each locus is built with tk_io.create_locus_info(chrom, start, stop).
    Consecutive windows on a reference start locus_size bases apart and each
    full window spans locus_size + overlap bases; the last window of a
    reference ends at the reference length.

    Args:
        input_bam: object exposing parallel ``references`` (names) and
            ``lengths`` sequences.
        locus_size: distance between consecutive window starts; must be > 0.
        overlap: number of extra bases appended to every full window.

    Returns:
        List of locus infos, in the order the references appear in input_bam.

    Raises:
        ValueError: if locus_size is not positive (the tiling loop could
            otherwise never advance past the end of a reference).
    '''
    if locus_size <= 0:
        raise ValueError('locus_size must be positive, got %r' % (locus_size,))

    loci = []
    for (chrom, length) in zip(input_bam.references, input_bam.lengths):
        start = 0
        # Emit full windows while one (including its overlap) fits strictly
        # inside the reference.
        while start + locus_size + overlap < length:
            stop = start + locus_size + overlap
            loci.append(tk_io.create_locus_info(chrom, start, stop))
            start += locus_size
        # Trailing window up to the end of the reference.
        loci.append(tk_io.create_locus_info(chrom, start, length))
    return loci
def split(args):
    """Build per-locus chunks over the primary 'chr*' references of a BAM.

    References are skipped when their name contains 'random', 'U' or 'hap',
    does not begin with 'chr', or is exactly 'chrM' or 'chrY'. Every remaining
    reference is cut into windows of ``tenkit.constants.PARALLEL_LOCUS_SIZE``
    bases, the last one ending at the reference length.

    Args:
        args: object exposing a ``bam_infile`` attribute that
            ``tk_bam.create_bam_infile`` can open.

    Returns:
        A dict with ``'chunks'`` (list of ``{'locus': <locus info>}`` dicts)
        and ``'join'`` (``{'__mem_gb': 12.0}``).
    """
    bam_in = tk_bam.create_bam_infile(args.bam_infile)
    step = tenkit.constants.PARALLEL_LOCUS_SIZE

    chunk_defs = []
    for (name, ref_len) in zip(bam_in.references, bam_in.lengths):
        if name[:3] != 'chr' or name in ('chrM', 'chrY'):
            continue
        if any(tag in name for tag in ('random', 'U', 'hap')):
            continue

        window_start = 0
        window_stop = step
        while window_stop < ref_len:
            chunk_defs.append(
                {'locus': tk_io.create_locus_info(name, window_start, window_stop)})
            window_start, window_stop = window_stop, window_stop + step
        # Final window reaches the end of the reference.
        chunk_defs.append(
            {'locus': tk_io.create_locus_info(name, window_start, ref_len)})

    return {'chunks': chunk_defs, 'join': {'__mem_gb': 12.0}}
def split(args):
    """Build per-locus chunks over the name-sorted, filtered BAM references.

    References are visited in sorted (lexicographic) name order. A reference
    is skipped when its name contains 'random', 'U' or 'hap', or is exactly
    one of 'chrM', 'chrY', 'M', 'Y'. Every remaining reference is cut into
    windows of ``tenkit.constants.PARALLEL_LOCUS_SIZE`` bases, the last one
    ending at the reference length.

    Args:
        args: object exposing a ``bam_infile`` attribute that
            ``tk_bam.create_bam_infile`` can open.

    Returns:
        A dict with ``'chunks'`` (list of ``{'locus': <locus info>}`` dicts)
        and ``'join'`` (``{'__mem_gb': 12.0}``).
    """
    input_bam = tk_bam.create_bam_infile(args.bam_infile)

    # chromosomes should be ordered as chr1, chr11, chr12, ..., chr19, chr2
    # Iterate the sorted (name, length) pairs directly instead of transposing
    # them back into two lists: the transpose cannot be unpacked when the
    # header has no references, whereas this simply yields no chunks.
    sorted_refs = sorted(zip(input_bam.references, input_bam.lengths))

    locus_size = tenkit.constants.PARALLEL_LOCUS_SIZE
    loci = []
    for (chrom, length) in sorted_refs:
        bad_chrom = ('random' in chrom or 'U' in chrom or 'hap' in chrom)
        if bad_chrom or chrom in ('chrM', 'chrY', 'M', 'Y'):
            continue

        start = 0
        # Emit full-size windows while one fits strictly inside the reference.
        while start + locus_size < length:
            stop = start + locus_size
            loci.append({'locus': tk_io.create_locus_info(chrom, start, stop)})
            start = stop
        # Trailing (possibly shorter) window up to the end of the reference.
        loci.append({'locus': tk_io.create_locus_info(chrom, start, length)})

    return {'chunks': loci, 'join': {'__mem_gb': 12.0}}
def too_many_overhang_variants(pb, vfr, max_allowable_overhang_variants):
    """Report whether any overhang region of ``pb`` holds too many variants.

    Two overhang intervals are derived from ``pb``: one spanning
    ``start_left``/``start_right`` and one spanning ``end_left``/``end_right``.
    If the second begins at or before the end of the first they are merged
    into a single interval. Each resulting interval is queried through
    ``tk_io.get_variant_iterator_pos(vfr, None, <locus>)`` and its records
    are counted.

    Args:
        pb: object with ``chrom``, ``start_left``, ``start_right``,
            ``end_left`` and ``end_right`` attributes.
        vfr: variant reader handed to ``tk_io.get_variant_iterator_pos``.
        max_allowable_overhang_variants: per-interval count threshold.

    Returns:
        True if at least one interval contains more than
        ``max_allowable_overhang_variants`` records, otherwise False.
    """
    chrom = pb.chrom
    first_lo = min(pb.start_left, pb.start_right)
    first_hi = max(pb.start_left, pb.start_right)
    second_lo = min(pb.end_left, pb.end_right)
    second_hi = max(pb.end_left, pb.end_right)

    regions = [(chrom, first_lo, first_hi), (chrom, second_lo, second_hi)]
    if second_lo <= first_hi:
        # The two overhangs touch or overlap: treat them as one region.
        regions = [(chrom, first_lo, second_hi)]

    exceeded = False
    # Every region is scanned in full, even once the limit has been exceeded.
    for (region_chrom, region_start, region_end) in regions:
        locus_string = tk_io.create_locus_info(region_chrom, region_start, region_end)
        n_variants = sum(
            1 for _ in tk_io.get_variant_iterator_pos(vfr, None, locus_string))
        if n_variants > max_allowable_overhang_variants:
            exceeded = True
    return exceeded
def get_variant_iterator(vfr, locus):
    """
    Forward to tk_io.get_variant_iterator_pos() for a locus object.

    tk_io.get_variant_iterator_pos() is given a locus built by
    tk_io.create_locus_info(), so this wrapper converts the ``chrom``,
    ``start`` and ``end`` attributes of ``locus`` before delegating. The
    second positional argument of the underlying call is always None.
    """
    return tk_io.get_variant_iterator_pos(
        vfr,
        None,
        tk_io.create_locus_info(locus.chrom, locus.start, locus.end)
    )
def generate_chrom_loci(target_regions, chrom, chrom_length, chunk_size, overlap=0):
    """Cut one chromosome into consecutive loci of roughly ``chunk_size`` bases.

    Candidate starts are taken every ``chunk_size`` bases from 0 up to (but
    excluding) ``chrom_length`` and each is passed through
    ``adjust_start(chrom, start, target_regions)``. A locus ends ``overlap``
    bases past the next adjusted start, capped at ``chrom_length``; the last
    locus ends at ``chrom_length``.

    Returns:
        List of ``tk_io.create_locus_info(chrom, start, end)`` results; empty
        when the range of candidate starts is empty.
    """
    adjusted_starts = []
    for raw_start in range(0, chrom_length, chunk_size):
        adjusted_starts.append(adjust_start(chrom, raw_start, target_regions))

    loci = []
    last_index = len(adjusted_starts) - 1
    for (index, locus_start) in enumerate(adjusted_starts):
        if index == last_index:
            locus_end = chrom_length
        else:
            locus_end = min(chrom_length, adjusted_starts[index + 1] + overlap)
        loci.append(tk_io.create_locus_info(chrom, locus_start, locus_end))
    return loci