def split(args):
    bam_in = create_bam_infile(args.input)
    chunk_defs = tk_bam.chunk_bam_records(bam_in, chunk_bound_key=None, chunk_size_gb=0.5)
    for chunk in chunk_defs:
        chunk["__mem_gb"] = 8.0
    return {'chunks': chunk_defs}
def split(args):
    # Chunk bam to get 1GB per chunk
    bam_in = create_bam_infile(args.input)
    bam_chunk_size_disk = 0.75
    chunk_defs = tk_bam.chunk_bam_records(bam_in, chunk_bound_func,
                                          chunk_size_gb=bam_chunk_size_disk)
    for chunk in chunk_defs:
        chunk['__mem_gb'] = 4
        chunk['__vmem_gb'] = 5 + int(
            np.ceil(2 * whitelist_mem_gb(args.barcode_whitelist) +
                    bam_chunk_size_disk * 10))

    lane_coord_sys = tk_lane.LaneCoordinateSystem()

    # Reopen BAM for estimating tile extents
    bam_in = create_bam_infile(args.input)
    lane_coord_sys.estimate_tile_extents(bam_in)
    for cnum, chunk in enumerate(chunk_defs):
        chunk['lane_map'] = lane_coord_sys.to_dict()
        chunk['chunk_num'] = cnum

    return {'chunks': chunk_defs, 'join': {'__mem_gb': 8, '__threads': 4}}
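# Worked example (hypothetical whitelist footprint, not from the pipeline): if
# whitelist_mem_gb(args.barcode_whitelist) were to return 1.0 GB, each chunk
# would get __vmem_gb = 5 + ceil(2 * 1.0 + 0.75 * 10) = 5 + ceil(9.5) = 15.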
def main(args, outs):
    outs.coerce_strings()

    bam_in = create_bam_infile(args.bucket[0])
    bam_out, _ = tk_bam.create_bam_outfile(
        outs.bcsorted_bam,
        None,
        None,
        template=bam_in,
        pgs=[
            tk_bam.make_pg_header(martian.get_pipelines_version(),
                                  "sort_reads_by_bc", TENX_PRODUCT_NAME)
        ])
    bam_in.close()

    outs.total_reads = merge_by_key(args.bucket, bc_sort_key, bam_out)
    bam_out.close()
def main_bucket_reads_by_bc(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    prefixes = get_seqs(args.nbases)

    bam_in = create_bam_infile(args.input)
    buckets = {prefix: [] for prefix in prefixes}

    non_bc_bam_out, _ = tk_bam.create_bam_outfile(outs.default, None, None, template=bam_in)
    non_bc_reads = []
    for r in tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end)):
        barcode = get_read_barcode(r, None)
        if barcode is None:
            non_bc_bam_out.write(r)
            non_bc_reads.append(r)
        else:
            prefix = barcode[:args.nbases]
            buckets[prefix].append(r)
    non_bc_bam_out.close()

    # Set random seed to get deterministic qname subsampling
    random.seed(0)
    sampled_non_bc_reads = random.sample(non_bc_reads,
                                         min(len(non_bc_reads), len(prefixes)))
    outs.qnames = [read.qname for read in sampled_non_bc_reads]

    outs.buckets = {}
    files_dir = os.path.dirname(outs.default)
    for prefix, bucket in buckets.iteritems():
        filename = os.path.join(files_dir, "bc_%s.bam" % prefix)
        outs.buckets[prefix] = filename
        bucket.sort(key=bc_sort_key)
        bam_out, _ = tk_bam.create_bam_outfile(filename, None, None, template=bam_in)
        try:
            for r in bucket:
                bam_out.write(r)
        finally:
            bam_out.close()
    bam_in.close()
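# Note (illustrative, not part of the pipeline): the prefix bucketing above
# assumes get_seqs(nbases) enumerates every possible barcode prefix of length
# nbases, so each barcoded read maps to exactly one bucket file. A minimal
# sketch of that enumeration using only the standard library (the name
# enumerate_prefixes is hypothetical):
def enumerate_prefixes(nbases):
    """Return all 4**nbases DNA prefixes in lexicographic order."""
    import itertools
    return [''.join(p) for p in itertools.product('ACGT', repeat=nbases)]

# Example: enumerate_prefixes(2) yields ['AA', 'AC', 'AG', 'AT', 'CA', ...],
# and a read with barcode 'ACGTACGT-1' would land in bucket 'AC' for nbases=2.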
def main(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    bam_in = create_bam_infile(args.input)
    reads = list(tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end)))

    tmp_dir = os.path.dirname(outs.default)
    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for qname in args.qnames:
        filename = os.path.join(tmp_dir, "qname_%s.bam" % qname)
        bam_out, _ = tk_bam.create_bam_outfile(filename, None, None, template=bam_in)
        bams_out[qname] = bam_out
        outs.buckets[qname] = filename
        buckets[qname] = []

    # Assign each read to the bucket whose boundary qnames bracket it; reads
    # outside every range fall back to the last bucket (a standalone sketch of
    # this assignment follows the function).
    qname_ranges = zip(args.qnames, args.qnames[1:])
    for r in reads:
        qname = None
        for qnames in qname_ranges:
            if qnames[0] <= r.qname <= qnames[1]:
                qname = qnames[0]
                break
        if qname is None:
            qname = args.qnames[-1]
        buckets[qname].append(r)

    for qname, bucket in buckets.iteritems():
        bucket.sort(key=bc_sort_key)
        bam_out = bams_out[qname]
        for r in bucket:
            bam_out.write(r)
        bam_out.close()
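# Note (illustrative, not part of the pipeline): the range-matching loop above
# can be read as a small standalone function. Factored out (the name
# assign_qname_bucket is hypothetical), the behaviour is: a read whose qname
# falls between two consecutive boundary qnames goes to the lower boundary's
# bucket, and anything outside every range falls back to the last boundary.
def assign_qname_bucket(read_qname, boundary_qnames):
    """Return the boundary qname that keys the bucket for read_qname."""
    for lo, hi in zip(boundary_qnames, boundary_qnames[1:]):
        if lo <= read_qname <= hi:
            return lo
    return boundary_qnames[-1]

# Example: with boundaries ['q01', 'q05', 'q09'], 'q03' maps to 'q01',
# 'q07' maps to 'q05', and 'q99' falls back to 'q09'.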
def join(args, outs, chunk_defs, chunk_outs):
    chunk_lists = [[], []]
    outs.total_reads = 0
    for chunk in zip(chunk_defs, chunk_outs):
        index = chunk[0].index
        chunk_lists[index].append(chunk)
        outs.total_reads += chunk[1].total_reads

    # Sanity check vs. position-sorted BAM
    with create_bam_infile(args.possorted_bam) as possorted_bam_in:
        assert possorted_bam_in.unmapped + possorted_bam_in.mapped == outs.total_reads

    buckets = []
    for chunks in chunk_lists:
        chunks = sorted(chunks, key=lambda chunk: chunk[0].prefix)
        buckets += [chunk[1].bcsorted_bam for chunk in chunks]
    tk_bam.concatenate(outs.bcsorted_bam, buckets)

    print "%s indexing BAM file" % PROCESSED_BARCODE_TAG
    index = tenkit.bam.BamBCIndex(outs.bcsorted_bam)
    index.save_index()
    outs.bcsorted_bam_index = outs.bcsorted_bam + ".bxi"
    print "Wrote bx index to %s" % outs.bcsorted_bam_index
def main(args, outs): """Mark exact duplicate reads in the output BAM file while also writing out some summary statistics. PCR duplicates have the same read1 start site and read2 start site. """ args.coerce_strings() outs.coerce_strings() # Chunk output doesn't get indexed outs.fragments_index = None outs.index = None # Pull in prior likelihoods for barcodes raw_barcode_abundance = None barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist) if args.raw_barcode_counts is not None and barcode_whitelist is not None: with open(args.raw_barcode_counts, 'r') as infile: raw_counts = json.load(infile) raw_barcode_abundance = { '{}-{}'.format(barcode, gem_group): count for gem_group, subdict in raw_counts.iteritems() for barcode, count in zip(barcode_whitelist, subdict['bc_counts']) } bam_in = create_bam_infile(args.input) bam_refs = bam_in.references bam_prefix, ext = os.path.splitext(outs.output) raw_bam_file = martian.make_path(bam_prefix + '_five_prime_pos_sorted' + ext) frag_prefix, ext = os.path.splitext(outs.fragments) raw_frag_file = martian.make_path(frag_prefix + '_raw' + ext) # only write CO line for one chunk, so we don't have duplicates after samtools merge if args.chunk_num == 0: COs = [ '10x_bam_to_fastq:R1(SEQ:QUAL,TR:TQ)', '10x_bam_to_fastq:R2(SEQ:QUAL,TR:TQ)', '10x_bam_to_fastq:I1(BC:QT)', '10x_bam_to_fastq:I2(CR:CY)', '10x_bam_to_fastq_seqnames:R1,R3,I1,R2' ] else: COs = None bam_out, _ = tk_bam.create_bam_outfile( raw_bam_file, None, None, template=bam_in, pgs=[ tk_bam.make_pg_header(martian.get_pipelines_version(), "mark_duplicates", TENX_PRODUCT_NAME) ], cos=COs) fragments_out = open(raw_frag_file, 'w') bam_in.reset() # Ensure the summary key indicates what kind of dup marking was actually performed. lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map) reference_manager = ReferenceManager(args.reference_path) summarizer = DupSummary(split_bcs=False, lane_coordinate_system=lane_coord_sys, output_bam=bam_out, output_tsv=fragments_out, ref=reference_manager, bam_refs=bam_refs, priors=raw_barcode_abundance) # Now broadcast the selected reads to the summarizers consumers = [summarizer.read_consumer()] source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end)) broadcast(source, consumers) # Close outfiles bam_out.close() fragments_out.close() # Feed the chunk barcode_counts data back to join() with open(outs.singlecell_mapping, 'w') as outfile: pickle.dump(summarizer.bc_counts, outfile) # Sort the output bam & tsv files sort_bam(raw_bam_file, outs.output, threads=martian.get_threads_allocation()) sort_bed(raw_frag_file, outs.fragments, genome=reference_manager.fasta_index, threads=martian.get_threads_allocation(), leave_key=True)
def main(args, outs): """ Attaches barcodes. Attaches raw barcode to RAW_BC tag and filters those to form set of PROCESSES_BARCODES """ chunk = args.chunk bam_in = create_bam_infile(args.align_chunk) bam_out, _ = tk_bam.create_bam_outfile( outs.output, None, None, template=bam_in, pgs=[ tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_bcs", TENX_PRODUCT_NAME) ]) gp_tagger = GlobalFivePrimePosTagger(bam_in) if args.barcode_whitelist is None or args.bc_counts is None: # If there's no whitelist or counts then all high quality BC reads get allowed. barcode_whitelist = None wl_idxs = None bc_dist = None else: barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist) # Load the bc counts for this GEM group counts = json.load(open(args.bc_counts, 'r')) counts = counts[str(chunk['gem_group'])]['bc_counts'] # Prior distribution over barcodes, with pseudo-count bc_dist = np.array(counts, dtype=np.float) + 1.0 bc_dist = bc_dist / bc_dist.sum() wl_idxs = { bc: idx for (idx, bc) in enumerate(sorted(list(barcode_whitelist))) } # set random seed to get deterministic subsampling random.seed(0) if chunk['barcode'] is not None: processed_barcode_iter = get_raw_processed_barcodes( open_maybe_gzip(chunk['barcode']), barcode_whitelist, args.bc_confidence_threshold, chunk['gem_group'], chunk['barcode_reverse_complement'], wl_idxs, bc_dist) require_barcode_for_stringent = True else: processed_barcode_iter = itertools.repeat(None) require_barcode_for_stringent = False if chunk['sample_index'] is not None: sample_index_iter = tk_fasta.read_generator_fastq( open_maybe_gzip(chunk['sample_index'])) else: sample_index_iter = itertools.repeat(None) if chunk['trim'] is not None: trim_iter = tk_fasta.read_generator_fastq(open_maybe_gzip( chunk['trim']), paired_end=True) else: trim_iter = itertools.repeat(None) iters = itertools.izip(processed_barcode_iter, sample_index_iter, trim_iter) # First read try: read = bam_in.next() except StopIteration: read = None # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates perfect_read_count = 0 # Due to secondary alignments, we must apply the tags to all # reads with the same cluster name. 
    for (barcode_info, sample_index_info, trim_info) in iters:
        tags = []
        read_name = None

        if read is None:
            break

        if barcode_info is not None:
            (bc_read_name, raw_bc_seq, processed_bc_seq, raw_bc_qual) = barcode_info
            tags.append((RAW_BARCODE_TAG, raw_bc_seq))
            tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual))
            if processed_bc_seq is not None:
                tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq))
            read_name = bc_read_name.split()[0]

        if sample_index_info is not None:
            (si_read_name, seq, qual) = sample_index_info
            tags.append((SAMPLE_INDEX_TAG, seq))
            tags.append((SAMPLE_INDEX_QUAL_TAG, qual))

            if read_name is not None:
                if si_read_name.split()[0] != read_name:
                    martian.log_info(
                        "mismatch: si_read_name: %s, bam_read_name: %s" %
                        (si_read_name, read_name))
                assert (si_read_name.split()[0] == read_name)
            else:
                read_name = si_read_name.split()[0]

        r1_tags = tags
        r2_tags = list(r1_tags)

        if trim_info is not None:
            (trim1_read_name, trim1_seq, trim1_qual,
             trim2_read_name, trim2_seq, trim2_qual) = trim_info
            if len(trim1_seq) > 0:
                r1_tags.append((TRIM_TAG, trim1_seq))
                r1_tags.append((TRIM_QUAL_TAG, trim1_qual))
            if len(trim2_seq) > 0:
                r2_tags.append((TRIM_TAG, trim2_seq))
                r2_tags.append((TRIM_QUAL_TAG, trim2_qual))

        reads_attached = 0
        reads_to_attach = []

        while read.query_name == read_name or read_name is None:
            tags = r1_tags if read.is_read1 else r2_tags
            if len(tags) > 0:
                existing_tags = read.tags
                existing_tags.extend(tags)
                read.tags = existing_tags

            if reads_to_attach and (
                    read.query_name != reads_to_attach[0].query_name
                    or reads_to_attach[0].query_name is None):
                gp_tagger.tag_reads(reads_to_attach)
                reads_attached += len(reads_to_attach)
                for r in reads_to_attach:
                    if stringent_read_filter(r, require_barcode_for_stringent):
                        perfect_read_count += 1
                    if args.exclude_non_bc_reads:
                        if not (get_read_barcode(r) is None):
                            bam_out.write(r)
                    else:
                        bam_out.write(r)
                reads_to_attach = []

            reads_to_attach.append(read)

            try:
                read = bam_in.next()
            except StopIteration:
                read = None
                break

        gp_tagger.tag_reads(reads_to_attach)
        reads_attached += len(reads_to_attach)
        for r in reads_to_attach:
            if stringent_read_filter(r, require_barcode_for_stringent):
                perfect_read_count += 1
            if args.exclude_non_bc_reads:
                if not (get_read_barcode(r) is None):
                    bam_out.write(r)
            else:
                bam_out.write(r)

        # We may have more than 2 reads if there was a
        # secondary alignment, but less than 2 means
        # something went wrong
        assert (reads_attached >= 2)

    outs.perfect_read_count = perfect_read_count
    bam_out.close()
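# Note (illustrative, not part of the pipeline): the while loop above is a
# hand-rolled grouping of consecutive BAM records that share a query name
# (read pairs plus any secondary alignments). The same grouping can be
# sketched with itertools.groupby, assuming the input iterator yields
# qname-adjacent records as an aligner-sorted BAM does (iter_qname_groups is
# a hypothetical helper):
def iter_qname_groups(records):
    """Yield lists of consecutive records sharing a query_name."""
    import itertools
    for _, group in itertools.groupby(records, key=lambda r: r.query_name):
        yield list(group)

# Each yielded group corresponds to one reads_to_attach batch: tag it, count
# perfect reads, and write the records that pass the barcode filter.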
def main(args, outs):
    bam_in = create_bam_infile(args.chunk_bam)
    references = bam_in.references

    misc_sm = compute_basic_stats(bam_in, references)
    misc_sm.save(outs.misc_sm)