def get_feature_generator_fastq(files, extractor, interleaved, read_types, r1_length=None, r2_length=None):
    '''Extract feature barcodes from FASTQs.

    Args:
        files (list of File): FASTQ file handles for R1, R2
        extractor (FeatureExtractor): Extracts feature barcodes
        interleaved (bool): Are R1,R2 interleaved in a single file
        read_types (list of str): List of read types (e.g. R1,R2) we need to inspect
        r1_length (int): Length to hard-trim R1 to
        r2_length (int): Length to hard-trim R2 to

    Returns:
        FeatureMatchResult: Yields the feature extraction result for a read pair

    Raises:
        ValueError: If read_types is not exactly ['R1'], ['R2'] or ['R1', 'R2'].
    '''
    assert len(files) == 2
    assert 'R1' in read_types or 'R2' in read_types

    # Apply hard trimming on input. A slice end of None keeps the whole
    # read, so no "infinite" sentinel value is needed.
    r1_hard_end = r1_length
    r2_hard_end = r2_length

    # Each item handed to the match function is a pair (r1_record, r2_record)
    # whose records are indexed as [1] = seq and [2] = qual.
    if read_types == ['R1']:
        def match_func(pair):
            return extractor.extract_single_end(
                pair[0][1][0:r1_hard_end],  # seq
                pair[0][2][0:r1_hard_end],  # qual
                'R1')
    elif read_types == ['R2']:
        def match_func(pair):
            return extractor.extract_single_end(
                pair[1][1][0:r2_hard_end],  # seq
                pair[1][2][0:r2_hard_end],  # qual
                'R2')
    elif read_types == ['R1', 'R2']:
        def match_func(pair):
            return extractor.extract_paired_end(
                pair[0][1][0:r1_hard_end],  # seq
                pair[0][2][0:r1_hard_end],  # qual
                pair[1][1][0:r2_hard_end],  # seq
                pair[1][2][0:r2_hard_end])  # qual
    else:
        # Fail early and clearly instead of hitting an unbound match_func
        # when the iterator is built.
        raise ValueError(
            "Unsupported read_types %r; expected ['R1'], ['R2'] or ['R1', 'R2']"
            % (read_types,))

    if interleaved:
        f = files[0]
        assert f
        # Get R1 and R2 seqs from interleaved FASTQ
        pair_iter = itertools.imap(
            lambda x: (x[0:3], x[3:6]),
            tk_fasta.read_generator_fastq(f, paired_end=True))
    else:
        # A read type that is not needed is replaced by an empty iterator;
        # izip_longest then pads its side of each pair with None.
        r1_iter = tk_fasta.read_generator_fastq(
            files[0], paired_end=False) if 'R1' in read_types else iter([])
        r2_iter = tk_fasta.read_generator_fastq(
            files[1], paired_end=False) if 'R2' in read_types else iter([])
        pair_iter = itertools.izip_longest(r1_iter, r2_iter)

    return itertools.imap(match_func, pair_iter)
def create_unaligned_bam(args, outs):
    """Write this chunk's FASTQ reads to a BAM file as unmapped records.

    The BAM header is assembled from the STAR reference's chrNameLength.txt
    (@SQ lines) and from args.read_groups (@RG lines). Every read from
    args.read_chunk, and from args.read2_chunk when present, is written to
    outs.genome_output with flag 4 and an RG tag of args.read_group.
    """
    star_ref_path = cr_utils.get_reference_star_path(args.reference_path)

    header_buf = cStringIO.StringIO()
    header_buf.write('@HD\tVN:1.4\n')

    # SQ header lines: one per "name<TAB>length" row of the STAR index
    with open(os.path.join(star_ref_path, 'chrNameLength.txt')) as chrom_sizes:
        for entry in chrom_sizes:
            chr_name, chr_len = entry.strip().split('\t')
            header_buf.write('@SQ\tSN:{}\tLN:{}\n'.format(chr_name, chr_len))

    # RG header lines: literal backslash-t sequences become real tabs
    for packed_rg in args.read_groups:
        rg_line = re.sub('\\\\t', '\t', tk_bam.make_rg_header(packed_rg))
        header_buf.write(rg_line + '\n')

    # Get read group ID for this chunk of reads
    read_group = args.read_group

    # pysam doesn't support reading SAM from a StringIO object, so the
    # header goes through a temporary file in the working directory
    with open('tmphdr', 'w') as hdr_file:
        hdr_file.write(header_buf.getvalue())
    samfile = pysam.AlignmentFile('tmphdr', 'r', check_sq=False)
    outbam = pysam.AlignmentFile(outs.genome_output, 'wb', template=samfile)

    fastq_file1 = cr_io.open_maybe_gzip(args.read_chunk)
    if args.read2_chunk:
        fastq_file2 = cr_io.open_maybe_gzip(args.read2_chunk)
    else:
        fastq_file2 = None

    read1s = tk_fasta.read_generator_fastq(fastq_file1)
    if fastq_file2:
        read2s = tk_fasta.read_generator_fastq(fastq_file2)
    else:
        read2s = []

    # A single segment object is reused for every record written
    record = pysam.AlignedSegment()
    record.flag = 4

    def write_unmapped(read):
        # Fill the shared record from one (name, seq, qual) tuple and write it
        name, seq, qual = read
        record.query_name, record.query_sequence = name.split(' ')[0], seq
        record.query_qualities = tk_fasta.get_qvs(qual)
        record.set_tag('RG', read_group, 'Z')
        outbam.write(record)

    for read1, read2 in itertools.izip_longest(read1s, read2s):
        write_unmapped(read1)
        if read2:
            write_unmapped(read2)

    samfile.close()
    fastq_file1.close()
    if fastq_file2 is not None:
        fastq_file2.close()
    outbam.close()
def main(args, outs):
    """Distribute a chunk's read pairs into per-barcode bucket FASTQ files.

    Reads args.read1s_chunk and args.read2s_chunk fully into memory, assigns
    every barcoded pair to a bucket, sorts each bucket with
    vdj_utils.fastq_barcode_sort_key and writes it to its own FASTQ file.
    outs.buckets maps bucket name to the file written.
    """
    # Martian coerces dict keys to string
    # Coerce keys back to int
    args.chunks_per_gem_group = dict(
        (int(key), value)
        for key, value in args.chunks_per_gem_group.iteritems())

    with open(args.read1s_chunk) as f1:
        read1s = list(tk_fasta.read_generator_fastq(f1))
    with open(args.read2s_chunk) as f2:
        read2s = list(tk_fasta.read_generator_fastq(f2))
    assert len(read1s) == len(read2s)

    fastqs_out = {}
    buckets = {}
    outs.buckets = {}

    # Create one output file and one in-memory list per bucket up front
    for gem_group, bucket_name in enumerate_bucket_names(args.chunks_per_gem_group):
        filename = martian.make_path("%s.fastq" % bucket_name)
        fastqs_out[bucket_name] = open(filename, 'w')
        outs.buckets[bucket_name] = filename
        buckets[bucket_name] = []

    for read1, read2 in itertools.izip(read1s, read2s):
        barcode = vdj_utils.get_fastq_read_barcode(read1)

        # Exclude unbarcoded reads
        if barcode is None:
            continue
        assert barcode == vdj_utils.get_fastq_read_barcode(read2)

        barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
        bucket_name = get_bucket_name(gem_group, barcode_seq,
                                      args.chunks_per_gem_group[gem_group])
        # R1 and R2 are stored as consecutive entries of the bucket
        buckets[bucket_name].extend([read1, read2])

    # Sort and write each bucket
    for bucket_name, bucket in buckets.iteritems():
        bucket.sort(key=vdj_utils.fastq_barcode_sort_key)
        fastq_out = fastqs_out[bucket_name]
        for read in bucket:
            tk_fasta.write_read_fastq(fastq_out, *read)
        fastq_out.close()
def split(args):
    """Define the chunks for this stage.

    Barcoded data: one chunk per (read1, read2, barcodes JSON) triple.
    Non-barcoded data: all input FASTQs are concatenated into a single
    R1 file and a single R2 file, which form the only chunk.
    """
    assert args.read1s is not None and args.read2s is not None

    chunks = []
    is_barcoded = cr_chem.get_barcode_whitelist(args.chemistry_def) is not None

    if is_barcoded:
        # Data are barcoded
        inputs = zip(args.read1s, args.read2s, args.chunk_barcodes)
        for read1_fq, read2_fq, barcodes_json in inputs:
            with open(barcodes_json) as f:
                chunk_barcodes = json.load(f)
            chunks.append({
                'read1_chunk': read1_fq,
                'read2_chunk': read2_fq,
                'barcodes_chunk': chunk_barcodes,
                '__mem_gb': 3.0,
            })
    else:
        # Most stages assume that each chunk has a single barcode.
        # So unfortunately we have to put all reads in the same chunk, otherwise
        # metric computation will break.
        read1_out_filename = martian.make_path('chunk0_1.fastq')
        read2_out_filename = martian.make_path('chunk0_2.fastq')

        with open(read1_out_filename, 'w') as read1_out, \
                open(read2_out_filename, 'w') as read2_out:
            for read1_file, read2_file in zip(args.read1s, args.read2s):
                with open(read1_file) as in1, open(read2_file) as in2:
                    fastq1_iter = tk_fasta.read_generator_fastq(in1, paired_end=False)
                    fastq2_iter = tk_fasta.read_generator_fastq(in2, paired_end=False)
                    # R1 drives the loop; its mate is pulled from R2 per read
                    for read1_tuple in fastq1_iter:
                        read2_tuple = next(fastq2_iter)
                        tk_fasta.write_read_fastq(read1_out, *read1_tuple)
                        tk_fasta.write_read_fastq(read2_out, *read2_tuple)

        chunks.append({
            'read1_chunk': read1_out_filename,
            'read2_chunk': read2_out_filename,
            'barcodes_chunk': [""],
        })

    # Martian doesn't like empty chunk lists so create a chunk w/ empty data
    if not chunks:
        return get_dummy_chunk()

    return {'chunks': chunks}
def merge_by_barcode(in_filenames, r1_out_file, r2_out_file, bcs_out_file, paired_end):
    """Merge several FASTQ files into one stream ordered by sort key.

    A heap holds the next unwritten record of every input file, ordered by
    vdj_utils.fastq_barcode_sort_key. The smallest record is written to
    r1_out_file (and r2_out_file when paired_end), then replaced by the next
    record from the same input. Element 0 of every sort key is collected and
    the resulting set is dumped as a JSON list to bcs_out_file.
    """
    seen_barcodes = set()

    # Note: The filehandle cache precludes the use of compressed files
    file_cache = tk_cache.FileHandleCache(mode='r', open_func=open)

    pending = []
    sort_key = vdj_utils.fastq_barcode_sort_key

    # Seed the heap with the first record of each input; empty inputs are skipped
    for filename in in_filenames:
        try:
            reader = tk_fasta.read_generator_fastq(file_cache.get(filename),
                                                   paired_end=paired_end)
            first_readpair = next(reader)

            key = sort_key(first_readpair[0:3])
            seen_barcodes.add(key[0])

            heapq.heappush(pending, (key, first_readpair, filename))
        except StopIteration:
            pass

    while pending:
        # Get the minimum item and write it.
        key, readpair, in_filename = heapq.heappop(pending)

        # A fresh generator is created over the cached handle for each record
        reader = tk_fasta.read_generator_fastq(file_cache.get(in_filename),
                                               paired_end=paired_end)

        tk_fasta.write_read_fastq(r1_out_file, *readpair[0:3])
        if paired_end:
            tk_fasta.write_read_fastq(r2_out_file, *readpair[3:6])

        # Get the next item from the source file we just wrote from
        # If that file is out of items, then we leave that one out
        try:
            next_readpair = next(reader)

            key = sort_key(next_readpair[0:3])
            seen_barcodes.add(key[0])

            heapq.heappush(pending, (key, next_readpair, in_filename))
        except StopIteration:
            pass

    json.dump(tk_safe_json.json_sanitize(list(seen_barcodes)), bcs_out_file)
def infer_barcode_reverse_complement(barcode_whitelist, barcode_files):
    """Decide, per barcode FASTQ, whether barcodes look reverse-complemented.

    Scans the leading reads of each file and counts how many sequences are
    in the whitelist as-is versus after reverse complementing. A file is
    flagged True when the reverse-complement share exceeds 0.75.

    Returns a list of booleans, one per entry of barcode_files. Without a
    whitelist every entry is False.
    """
    if not barcode_whitelist:
        return [False] * len(barcode_files)

    # NOTE(review): these counters are not reset between files, so each
    # file's decision also includes counts from the files before it --
    # confirm that pooling is intended.
    rc_valid_count = 0
    reg_valid_count = 0

    barcode_rc = []
    for barcode_file in barcode_files:
        if barcode_file[-3:] == ".gz":
            barcode_open_file = gzip.open(barcode_file)
        else:
            barcode_open_file = open(barcode_file, 'r')

        read_num = 0
        for (name, seq, qual) in tk_fasta.read_generator_fastq(barcode_open_file):
            if seq in barcode_whitelist:
                reg_valid_count += 1
            if tk_seq.get_rev_comp(seq) in barcode_whitelist:
                rc_valid_count += 1
            # Only the leading reads of each file are sampled
            if read_num > 1000:
                break
            read_num += 1

        rc_fraction = tk_stats.robust_divide(
            float(rc_valid_count), float(rc_valid_count + reg_valid_count))
        if rc_fraction > 0.75:
            barcode_rc.append(True)
        else:
            barcode_rc.append(False)

        barcode_open_file.close()

    return barcode_rc
def estimate_read_count_and_length(fn, num_reads=1000):
    '''
    Estimate the number of reads AND the average read length in the fastq
    file fn by only reading in the first num_reads (default 1000) reads.

    The read count is extrapolated from the number of bytes consumed while
    sampling relative to the total file size on disk.

    Returns:
        (total_reads_est, avg_read_length); (0.0, 0.0) if the file contains
        no reads.
    '''
    # Open reader
    is_gz = fn[-2:] == 'gz'
    reader = gzip.open(fn) if is_gz else open(fn, 'r')

    try:
        ## first compute the average read length
        total_read_length = 0.0
        rec_count = 0
        for (header, r, qual) in tk_fasta.read_generator_fastq(reader):
            total_read_length += len(r)
            rec_count += 1
            if rec_count == num_reads:
                break

        # Nothing to extrapolate from; avoid dividing by zero below
        if rec_count == 0:
            return (0.0, 0.0)

        avg_read_length = total_read_length / rec_count

        # Bytes of the on-disk file consumed so far. For gzip this is the
        # position in the underlying compressed file, not the decompressed
        # stream.
        if is_gz:
            file_len = reader.myfileobj.tell()
        else:
            file_len = reader.tell()
    finally:
        reader.close()

    ## total file size
    file_sz = os.path.getsize(fn)

    # Scale by the number of reads actually sampled: the file may hold
    # fewer than num_reads reads.
    total_reads_est = float(rec_count) / file_len * file_sz

    return (total_reads_est, avg_read_length)
def fastq_data_estimate(fn, num_reads = 8000):
    """Estimate the number of reads and sequence bases in a FASTQ file.

    Samples up to num_reads leading records, derives reads-per-byte and
    bases-per-byte, and scales them by the (estimated) uncompressed size.

    Returns:
        (predicted_reads, predicted_seq); (0.0, 0.0) if the file has no
        records.
    """
    # Open reader
    is_gz = fn[-2:] == 'gz'
    reader = gzip.open(fn) if is_gz else open(fn, 'r')

    try:
        gen = tk_fasta.read_generator_fastq(reader)
        rds = itertools.islice(gen, num_reads)

        # Per record: (approximate bytes in the file, sequence length).
        # The +4 is presumably per-record line overhead -- TODO confirm.
        input_lens = [(len(header) + len(r) + len(qual) + 4, len(r))
                      for (header, r, qual) in rds]
    finally:
        reader.close()

    # An empty file would otherwise divide by zero below
    if not input_lens:
        return (0.0, 0.0)

    total_seq_len = sum(x[1] for x in input_lens)
    total_data_len = sum(x[0] for x in input_lens)
    file_sz = os.path.getsize(fn)

    if is_gz:
        # STUPID FUDGE FACTOR TO GET >= REQUESTED AMT
        uncomp_size = estimate_gzip_uncompressed_size(fn) * 0.8
    else:
        uncomp_size = file_sz

    read_yield = float(len(input_lens)) / total_data_len
    seq_yield = float(total_seq_len) / total_data_len
    predicted_reads = read_yield * uncomp_size
    predicted_seq = seq_yield * uncomp_size

    return (predicted_reads, predicted_seq)
def _compute_r1_length(fastqs, reads_interleaved):
    """ Infer the length of R1 as the longest R1 sequence among the first
    cr_constants.DETECT_CHEMISTRY_INITIAL_READS reads across the FASTQs """
    limit = cr_constants.DETECT_CHEMISTRY_INITIAL_READS
    reads_seen = 0
    longest = 0

    for fastq in fastqs:
        with cr_utils.open_maybe_gzip(fastq, 'r') as fq_file:
            records = tk_fasta.read_generator_fastq(fq_file, reads_interleaved)
            for record in records:
                # Interleaved records carry both mates; R1's sequence is
                # the second field either way
                if reads_interleaved:
                    _, r1, _, _, _, _ = record
                else:
                    _, r1, _ = record

                if reads_seen == limit:
                    break
                longest = max(len(r1), longest)
                reads_seen += 1

        # Stop opening further files once enough reads were inspected
        if reads_seen == limit:
            break

    return longest
def test_align(self):
    """Run the stage on IN_FASTQ and check the output BAM record count.

    Passes when OUT_BAM contains exactly as many records as IN_FASTQ has
    FASTQ records (each record counted individually).
    """
    args = martian.Record({
        'chunk_input': IN_FASTQ,
        'aligner': 'bwa',
        'aligner_method': 'MEM',
        'reference_path': 'hg19',
        '__threads': 1,
        'reads_interleaved': True
    })
    outs = martian.Record({'default': OUT_BAM})

    main(args, outs)

    # Every input read must show up in the output BAM (count check only).
    # Both handles are closed explicitly so the test does not leak them.
    out_bam = pysam.Samfile(OUT_BAM)
    try:
        bam_reads = list(out_bam)
    finally:
        out_bam.close()

    with open(IN_FASTQ) as fq_file:
        fq_reads = list(
            tk_fasta.read_generator_fastq(fq_file, paired_end=False))

    self.assertEqual(len(bam_reads), len(fq_reads))
def split(args):
    '''We just align each chunk independently -- joining will happen in the
    join step of SORT_READS.

    Before chunking, the first 10 reads of every chunk's read1 FASTQ are
    sampled and a Martian alarm is raised (at most once per file) if any of
    them is shorter than MIN_READ_LENGTH. Each chunk requests
    args.num_threads threads and memory sized from the reference.
    '''
    # Pull some reads from fastq files -- warn if any is shorter than
    # MIN_READ_LENGTH. Every sampled read is checked, and an empty file
    # simply yields no reads to check.
    for chunk in args.chunks:
        with open(chunk['read1']) as in_file:
            reader = tk_fasta.read_generator_fastq(in_file)
            for name, read, qual in itertools.islice(reader, 10):
                if len(read) < MIN_READ_LENGTH:
                    martian.alarm(
                        "BWA-MEM can't handle reads <25bp -- reads will be unmapped."
                    )
                    break

    # estimated amount of memory needed to process genome is 2x(num gigabases)+4GB
    reference_pyfasta = tenkit.reference.open_reference(args.reference_path)
    reference_bases = sum(
        len(reference_pyfasta[contig]) for contig in reference_pyfasta)
    base_mem_in_gb = int(math.ceil(2 * reference_bases / (1024.0**3)))
    mem_in_gb = base_mem_in_gb + 4

    chunks = [{
        'chunk': x,
        '__threads': args.num_threads,
        '__mem_gb': mem_in_gb
    } for x in args.chunks]
    return {'chunks': chunks}
def get_raw_processed_barcodes(barcode_file, barcode_whitelist, bc_confidence_threshold, gem_group, barcodes_reverse_complement, wl_idxs, wl_dist):
    """ Stream the barcodes and the 'processed' barcode.

    Yields (name, raw_seq, processed_bc, qual) for every record of
    barcode_file. processed_bc is the accepted barcode followed by
    "-<gem_group>", or None when the barcode is rejected.
    """
    bc_iterator = tk_fasta.read_generator_fastq(barcode_file)
    gem_group_str = "-" + str(gem_group)

    for (name, seq, qual) in bc_iterator:
        if barcodes_reverse_complement:
            seq = tk_seq.get_rev_comp(seq)
            qual = qual[::-1]  # keep qualities aligned with the reversed bases

        if barcode_whitelist is None:
            # No whitelist case -- attach BC if there are no Ns
            accepted_seq = None if 'N' in seq else seq
        else:
            # whitelist case -- attach bc if posterior probability of best
            # BC sequence exceeds the confidence threshold
            accepted_seq = handle_10x_barcode(bc_confidence_threshold, seq,
                                              qual, wl_idxs, wl_dist)

        if accepted_seq is None:
            processed_bc = None
        else:
            processed_bc = accepted_seq + gem_group_str

        yield (name, seq, processed_bc, qual)
def get_read_generator_fastq(fastq_open_file, read_def, reads_interleaved, r1_length=None, r2_length=None):
    """Yield the read described by read_def for each record of a FASTQ.

    The file is parsed as paired-end only when the reads are interleaved and
    read_def refers to R1 or R2; every parsed tuple is passed through
    extract_read_maybe_paired.
    """
    parse_as_pairs = reads_interleaved and read_def.read_type in ['R1', 'R2']
    records = tk_fasta.read_generator_fastq(fastq_open_file,
                                            paired_end=parse_as_pairs)

    for read_tuple in records:
        yield extract_read_maybe_paired(read_tuple, read_def,
                                        reads_interleaved,
                                        r1_length, r2_length)
def get_raw_processed_barcodes(barcode_file, barcode_whitelist, bc_confidence_threshold, gem_group, barcodes_reverse_complement, wl_idxs, wl_dist):
    """Stream through the raw barcodes and generate processed barcodes
    (which may be none).

    Yields (name, raw_seq, corrected_bc, qual) per record; corrected_bc is
    "<barcode>-<gem_group>" or None.
    """
    for (name, seq, qual) in tk_fasta.read_generator_fastq(barcode_file):
        if barcodes_reverse_complement:
            seq = tk_seq.get_rev_comp(seq)
            qual = qual[::-1]

        if barcode_whitelist is not None:
            # Whitelist available: attempt correction against it
            corrected_bc = correct_barcode(bc_confidence_threshold, seq, qual,
                                           wl_idxs, wl_dist, MAXDIST_CORRECT)
        elif 'N' in seq:
            # No whitelist: a barcode containing N is rejected
            corrected_bc = None
        else:
            corrected_bc = seq

        if corrected_bc is not None:
            corrected_bc = '{}-{}'.format(corrected_bc, gem_group)

        yield (name, seq, corrected_bc, qual)
def get_run_data(fn):
    """ Parse flowcell + lane from the first FASTQ record.

    The record name is split on ':' and fields 2 and 3 are returned as
    (flowcell, lane). If the FASTQ has no records, martian.exit is called.

    NOTE: we don't check whether there are multiple FC / lanes in this file.
    """
    if fn[-2:] == 'gz':
        reader = gzip.open(fn)
    else:
        reader = open(fn, 'r')

    # Close the reader on every exit path, including the empty-file one
    try:
        gen = tk_fasta.read_generator_fastq(reader)
        try:
            (name, seq, qual) = next(gen)
            (flowcell, lane) = re.split(':', name)[2:4]
            return (flowcell, lane)
        except StopIteration:
            # empty fastq
            martian.exit("FASTQ is empty: %s" % fn)
    finally:
        reader.close()
def fastq_data_estimate(fn, num_reads=5000):
    """Estimate read count, sequence bases and read length of a FASTQ file.

    Samples up to num_reads leading records, derives reads-per-byte and
    bases-per-byte, and scales them by the (estimated) uncompressed size.
    The intermediate size estimates are logged through martian.

    Returns:
        (predicted_reads, predicted_seq, read_length); (0.0, 0.0, 0) if the
        file has no records.
    """
    # Open reader
    is_gz = fn[-2:] == 'gz'
    reader = gzip.open(fn) if is_gz else open(fn, 'r')

    try:
        gen = tk_fasta.read_generator_fastq(reader)
        rds = itertools.islice(gen, num_reads)

        # Per record: (approximate bytes in the file, sequence length).
        # The +4 is presumably per-record line overhead -- TODO confirm.
        input_lens = [(len(header) + len(r) + len(qual) + 4, len(r))
                      for (header, r, qual) in rds]
    finally:
        reader.close()

    # An empty file would otherwise divide by zero below
    if not input_lens:
        return (0.0, 0.0, 0)

    total_seq_len = sum(x[1] for x in input_lens)
    total_data_len = sum(x[0] for x in input_lens)
    file_sz = os.path.getsize(fn)

    # NOTE(review): with two ints this is floor division unless the module
    # enables true division -- confirm which is intended.
    read_length = total_seq_len / len(input_lens)

    if is_gz:
        (uncomp_size, predicted_sz) = estimate_gzip_uncompressed_size(fn)
    else:
        uncomp_size = file_sz
        predicted_sz = file_sz

    read_yield = float(len(input_lens)) / total_data_len
    seq_yield = float(total_seq_len) / total_data_len
    predicted_reads = read_yield * uncomp_size
    predicted_seq = seq_yield * uncomp_size

    # Log estimate of downsampling
    # NOTE(review): parse_gzip_sz is also called for uncompressed inputs;
    # its value is only used in this log line -- confirm that is intended.
    gzip_sz = parse_gzip_sz(fn)
    martian.log_info("Estimates for: %s" % fn)
    dbg_str = "compressed_size: %.2f, predicted_size: %.2f, predicted_size_mod: %.2f, gzip_size_mod: %.2f, gzip_predicted_size: %.2f" % (
        float(file_sz) / 1e9, float(predicted_sz) / 1e9,
        float(predicted_sz % 2**32) / 1e9, float(gzip_sz) / 1e9,
        float(uncomp_size) / 1e9)
    martian.log_info(dbg_str)

    return (predicted_reads, predicted_seq, read_length)
def fastq_data_estimate(fn, num_reads=1000000):
    """Estimate the number of reads and sequence bases in a FASTQ file.

    Samples up to num_reads leading records. For gzip input the uncompressed
    size is extrapolated from the compression ratio observed on the sampled
    prefix; for plain input the on-disk size is used directly.

    Returns:
        (predicted_reads, predicted_seq); (0.0, 0.0) if the file has no
        records.
    """
    # Open reader
    is_gz = fn[-2:] == 'gz'
    reader = gzip.open(fn) if is_gz else open(fn, 'r')

    try:
        gen = tk_fasta.read_generator_fastq(reader)
        rds = itertools.islice(gen, num_reads)

        # Per record: (approximate bytes in the file, sequence length).
        # The +4 is presumably per-record line overhead -- TODO confirm.
        input_lens = [(len(header) + len(r) + len(qual) + 4, len(r))
                      for (header, r, qual) in rds]

        # Must be read before the reader is closed: position reached in the
        # underlying compressed file while sampling.
        compressed_sz = reader.myfileobj.tell() if is_gz else None
    finally:
        reader.close()

    # An empty file would otherwise divide by zero below
    if not input_lens:
        return (0.0, 0.0)

    total_seq_len = sum(x[1] for x in input_lens)
    total_data_len = sum(x[0] for x in input_lens)
    file_sz = os.path.getsize(fn)

    # NOTE: do not try and use the gzip footer containing the length of the compressed data
    # that only reflects the length of the final gzip block. A valid gzip file may have
    # many blocks, so that field cannot be relied upon.
    #
    # All ratios use explicit float division so the result does not depend
    # on whether the module enables true division.
    if is_gz:
        predicted_sz = float(total_data_len) / compressed_sz * file_sz
    else:
        predicted_sz = file_sz

    read_yield = float(len(input_lens)) / total_data_len
    seq_yield = float(total_seq_len) / total_data_len
    predicted_reads = read_yield * predicted_sz
    predicted_seq = seq_yield * predicted_sz

    # Log estimate of downsampling
    martian.log_info("Estimates for: %s" % fn)
    dbg_str = "compressed_size: %.2f, predicted_size: %.2f" % \
              (file_sz / 1e9, predicted_sz / 1e9)
    martian.log_info(dbg_str)

    return (predicted_reads, predicted_seq)
def main(args, outs):
    """ Count whitelist barcode occurrences in this chunk's barcode FASTQ.

    Every raw barcode sequence that exactly matches a whitelist entry
    increments that entry's counter; all other sequences are tallied as
    bad. The result is written as JSON to outs.bc_counts with the keys
    'bc_counts' (ordered like the sorted whitelist) and 'bad_bc_count'.

    If there is no whitelist or no barcode file, outs.bc_counts is set to
    None and nothing is written.
    """
    # Bail out if there's no barcodes or whitelist
    if args.barcode_whitelist is None or args.chunk['barcode'] is None:
        outs.bc_counts = None
        return

    def open_maybe_gzip(fn):
        if fn[-2:] == "gz":
            return gzip.open(fn)
        else:
            return open(fn)

    barcode_whitelist = sorted(
        tk_seq.load_barcode_whitelist(args.barcode_whitelist))
    bc_idx = {bc: idx for (idx, bc) in enumerate(barcode_whitelist)}
    bc_counts = np.zeros(len(barcode_whitelist), dtype=np.int32)
    bad_count = 0

    barcode_file = open_maybe_gzip(args.chunk['barcode'])
    # Close the barcode file even if parsing fails part-way through
    try:
        bc_iterator = tk_fasta.read_generator_fastq(barcode_file)
        for (bc_read, raw_bc_seq, raw_bc_qual) in bc_iterator:
            idx = bc_idx.get(raw_bc_seq)
            if idx is not None:
                bc_counts[idx] += 1
            else:
                bad_count += 1
    finally:
        barcode_file.close()

    # Write BC count array and bad count as JSON
    result = {}
    result['bad_bc_count'] = bad_count
    result['bc_counts'] = list(bc_counts)
    with open(outs.bc_counts, 'w') as bc_counts_out:
        tenkit.safe_json.dump_numpy(result, bc_counts_out)
def main(args, outs):
    """Flag low-confidence contigs and write the filtered contig outputs.

    Contigs are grouped by (barcode, chain). Within a group, a contig is
    marked low confidence when its CDR3 differs from the group's first CDR3
    and has weak UMI support, or when its CDR3 was already seen in the group.
    The annotated contig list is written back as JSON, and the FASTA/FASTQ
    outputs keep only contigs that are in cells and not low confidence.
    """
    reporter = vdj_report.VdjReporter()

    with open(args.contig_annotations) as f:
        contigs = vdj_annot.load_contig_list_from_json(f, args.vdj_reference_path)

    def barcode_chain(contig):
        return (contig.barcode, contig.get_single_chain())

    def ranking(contig):
        # Within a (barcode, chain) group: productive first, then by
        # descending UMI count, read count and length
        return (contig.barcode, contig.get_single_chain(),
                not contig.productive, -contig.umi_count,
                -contig.read_count, -len(contig))

    contigs.sort(key=ranking)

    low_confidence_contigs = set()
    cell_contigs = set()

    for (bc, chain), group in itertools.groupby(contigs, key=barcode_chain):
        first_cdr3 = None
        first_cdr3_umis = None
        seen_cdr3s = set()

        # NOTE(review): the nesting below was reconstructed from
        # whitespace-collapsed source -- confirm that the metric updates
        # belong inside this per-contig loop.
        for contig in group:
            contig.high_confidence = True

            if contig.is_cell:
                cell_contigs.add(contig.contig_name)

            if first_cdr3 is None:
                first_cdr3 = contig.cdr3_seq
                first_cdr3_umis = contig.umi_count

            # Mark as low confidence:
            # 1) Any additional CDR3s beyond the highest-(productive,UMI,read,length) contig's CDR3
            #    with a single UMI or low UMIs relative to the first contig, or
            differs_from_first = (first_cdr3 is not None
                                  and contig.cdr3_seq != first_cdr3)
            if differs_from_first:
                extraneous_cdr3 = (
                    contig.umi_count == 1
                    or (float(contig.umi_count) / first_cdr3_umis) < EXTRA_CONTIG_MIN_UMI_RATIO)
            else:
                extraneous_cdr3 = False

            # 2) Any contigs with a repeated CDR3.
            repeat_cdr3 = contig.cdr3_seq in seen_cdr3s

            if extraneous_cdr3 or repeat_cdr3:
                contig.high_confidence = False
                low_confidence_contigs.add(contig.contig_name)

            seen_cdr3s.add(contig.cdr3_seq)

            if chain in vdj_constants.VDJ_GENES:
                reporter._get_metric_attr(
                    'vdj_high_conf_prod_contig_frac', chain).add(
                        1, filter=contig.high_confidence)
            reporter._get_metric_attr(
                'vdj_high_conf_prod_contig_frac',
                cr_constants.MULTI_REFS_PREFIX).add(
                    1, filter=contig.high_confidence)

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as f:
        vdj_annot.save_annotation_list_json(f, contigs)

    def is_kept(contig_name):
        # Keep contigs that are high confidence & in cells
        return contig_name not in low_confidence_contigs and contig_name in cell_contigs

    # Write filtered fasta
    with open(args.contig_fasta) as in_file, \
            open(outs.filtered_contig_fasta, 'w') as out_file:
        for hdr, seq in cr_utils.get_fasta_iter(in_file):
            if is_kept(hdr):
                tk_fasta.write_read_fasta(out_file, hdr, seq)

    # Write filtered fastq
    with open(args.contig_fastq) as in_file, \
            open(outs.filtered_contig_fastq, 'w') as out_file:
        for name, seq, qual in tk_fasta.read_generator_fastq(in_file):
            if is_kept(name):
                tk_fasta.write_read_fastq(out_file, name, seq, qual)

    reporter.report_summary_json(outs.summary)
def main(args, outs):
    """Annotate the contigs whose barcodes belong to this chunk.

    Walks args.contigs (FASTA), in lockstep with args.contigs_fastq when it
    is given, builds an AnnotatedContig for every contig whose barcode is in
    args.barcodes, annotates features, optional primers and the CDR3, and
    pickles the resulting list to outs.chunked_annotations.

    If args.vdj_reference_path is None, outs.chunked_annotations is set to
    None and nothing is done.
    """
    if args.vdj_reference_path is None:
        outs.chunked_annotations = None
        return

    chunk_contigs = []
    barcodes_in_chunk = set(args.barcodes)

    # Set of barcodes that were called as cells
    if args.cell_barcodes:
        cell_barcodes_set = set(vdj_utils.load_cell_barcodes_json(args.cell_barcodes))
    else:
        cell_barcodes_set = set()

    # Setup feature reference sequences
    res = vdj_annot.setup_feature_aligners(args.vdj_reference_path,
                                           args.min_score_ratios,
                                           args.min_word_sizes)
    feature_types, feature_aligners, feature_filters = res

    # Setup primer reference sequences
    if args.primers:
        primer_aligner, primer_filter = vdj_annot.setup_primer_aligner(
            args.primers, vdj_constants.VDJ_ANNOTATION_MIN_SCORE_RATIO)

    # Per-contig read and UMI support, when a summary table is available
    read_counts = {}
    umi_counts = {}
    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary, header=0, index_col=None, sep='\t')
        for _, row in contig_summary.iterrows():
            read_counts[row.contig_name] = int(row.num_reads)
            umi_counts[row.contig_name] = int(row.num_umis)

    # An empty filter summary table is treated like a missing one
    filter_summary = None
    if args.filter_summary:
        try:
            with open(args.filter_summary) as filter_file:
                filter_summary = vdj_utils.load_contig_summary_table(filter_file)
        except EmptyDataError:
            filter_summary = None

    # The FASTQ is optional; keep its handle so it can be closed afterwards
    fastq_file = None
    fq_iter = None
    if args.contigs_fastq is not None:
        fastq_file = open(args.contigs_fastq)
        fq_iter = tk_fasta.read_generator_fastq(fastq_file, paired_end=False)

    try:
        with open(args.contigs) as fasta_file:
            for header, contig_sequence in cr_utils.get_fasta_iter(fasta_file):
                if fq_iter is None:
                    contig_quals = None
                else:
                    # FASTA and FASTQ must list the same contigs in the same order
                    header_fq, contig_sequence_fq, contig_quals = next(fq_iter)
                    assert(contig_sequence_fq == contig_sequence)
                    assert(header_fq == header)

                barcode = vdj_utils.get_barcode_from_contig_name(header)
                contig_name = header.split(' ')[0]

                # Only annotate barcodes assigned to this chunk and contigs with enough read support
                if barcode in barcodes_in_chunk:
                    if filter_summary is not None:
                        filtered = vdj_utils.is_contig_filtered(filter_summary, contig_name)
                    else:
                        filtered = True

                    contig = vdj_annot.AnnotatedContig(
                        contig_name,
                        contig_sequence,
                        quals=contig_quals,
                        barcode=barcode,
                        is_cell=barcode in cell_barcodes_set,
                        filtered=filtered,
                        read_count=read_counts.get(contig_name),
                        umi_count=umi_counts.get(contig_name),
                    )

                    contig.annotations = contig.annotate_features(
                        feature_types, feature_aligners, feature_filters)

                    if args.primers:
                        contig.primer_annotations = contig.annotate_features_by_group(
                            primer_aligner, alignment_filter=primer_filter)

                    contig.annotate_cdr3()

                    chunk_contigs.append(contig)
    finally:
        if fastq_file is not None:
            fastq_file.close()

    # Close the output explicitly so the pickle is flushed to disk
    with open(outs.chunked_annotations, 'wb') as out_file:
        cPickle.dump(chunk_contigs, out_file, protocol=cPickle.HIGHEST_PROTOCOL)
def main(args, outs):
    """ Trim, subsample and re-chunk the reads of one input chunk.

    Reads are trimmed by args.read1_trim_length / args.read2_trim_length
    (plus the barcode length when the barcode is embedded in the read),
    randomly subsampled at chunk['subsample_rate'], and written to numbered
    FASTQ files of at most args.max_read_num reads each, together with
    matching barcode and sample-index files. Paired reads are written
    interleaved. outs.chunks describes the files produced.
    """
    # Set a fixed random seed to eliminate noise in metrics
    random.seed(0)

    chunk = args.chunk
    interleaved = chunk['reads_interleaved']
    have_read2 = chunk['read2'] is not None
    paired = interleaved or have_read2

    read1_trim = args.read1_trim_length
    read2_trim = args.read2_trim_length
    subsample_rate = chunk['subsample_rate']

    # BC config -- BC come from separate fastq, or are embedded in R1 or R2
    have_barcode = False
    bc_in_read1 = False
    bc_in_read2 = False
    bc_in_fastq = False

    # If we have bc in read, use that & ignore a separate BC read
    if chunk.get('bc_in_read', None) is not None and chunk.get('bc_length', 0) > 0:
        have_barcode = True
        bc_length = chunk['bc_length']
        if chunk['bc_in_read'] == 1:
            bc_in_read1 = True
            read1_trim += bc_length
        elif chunk['bc_in_read'] == 2:
            bc_in_read2 = True
            read2_trim += bc_length
        else:
            martian.exit(
                "bc_in_read configuration incorrect -- read must be 1 or 2")

    # Otherwise use the BC file
    elif chunk['barcode'] is not None:
        have_barcode = True
        bc_in_fastq = True

    have_sample_index = chunk['sample_index'] is not None

    output_directory = os.path.dirname(os.path.realpath(outs.placeholder))
    max_read_num = args.max_read_num

    # counter for sub-chunked files
    file_number = 1

    def numbered_fastq(prefix, number):
        # e.g. <output_directory>/read1.fastq
        return output_directory + "/" + prefix + str(number) + ".fastq"

    # open the available read files and make the appropriate iterators
    if interleaved:
        read_in = openfq(chunk['read1'])
        read_iter = tk_fasta.read_generator_fastq(read_in, paired_end=True)
    elif have_read2:
        read1_in = openfq(chunk['read1'])
        read1_iter = tk_fasta.read_generator_fastq(read1_in)

        read2_in = openfq(chunk['read2'])
        read2_iter = tk_fasta.read_generator_fastq(read2_in)

        # Present separate R1/R2 files as 6-field paired records
        read_iter = (
            (r1[0], r1[1], r1[2], r2[0], r2[1], r2[2])
            for r1, r2 in itertools.izip(read1_iter, read2_iter))
    else:
        read1_in = openfq(chunk['read1'])
        read_iter = tk_fasta.read_generator_fastq(read1_in)

    # open read file
    read_name = numbered_fastq("read", file_number)
    read_names = [read_name]
    out_read_fastq = open(read_name, 'w')

    # Barcode counting is only possible with a whitelist
    if args.barcode_whitelist is None:
        outs.bc_counts = None
        bc_idx = None
    else:
        barcode_whitelist = sorted(
            list(tk_seq.load_barcode_whitelist(args.barcode_whitelist)))
        bc_idx = {bc: idx for (idx, bc) in enumerate(barcode_whitelist)}
        bc_counts = np.zeros(len(barcode_whitelist), dtype=np.int32)
        bad_count = 0

    # open barcode file if there is one
    if have_barcode:
        bc_name = numbered_fastq("BC", file_number)
        out_bc_fastq = open(bc_name, 'w')
        bc_names = [bc_name]
        if bc_in_fastq:
            bc_in = openfq(chunk['barcode'])
            bc_iter = tk_fasta.read_generator_fastq(bc_in)
        elif bc_in_read1 or bc_in_read2:
            # BC in read -- have output file but no input file
            bc_iter = itertools.repeat(None)
    else:
        bc_iter = itertools.repeat(None)
        bc_names = [None]
        outs.bc_counts = None

    # open sample_index file if there is one
    if have_sample_index:
        si_name = numbered_fastq("SI", file_number)
        out_si_fastq = open(si_name, 'w')
        si_in = openfq(chunk['sample_index'])
        si_iter = tk_fasta.read_generator_fastq(si_in)
        si_names = [si_name]
    else:
        si_iter = itertools.repeat(None)
        si_names = [None]

    # loop through reads
    read_num = 0
    records = itertools.izip(read_iter, bc_iter, si_iter)
    for read, barcode_read, sample_index_read in records:
        # The first read is always kept; later reads are subsampled
        if read_num > 0 and random.random() > subsample_rate:
            continue

        if paired:
            (name1, seq1, qual1, name2, seq2, qual2) = read
        else:
            (name1, seq1, qual1) = read

        new_seq1 = seq1[read1_trim:]
        new_qual1 = qual1[read1_trim:]
        if paired:
            new_seq2 = seq2[read2_trim:]
            new_qual2 = qual2[read2_trim:]

        # Get BC sequence out of the read, for BC-in-read schemes
        if bc_in_read1:
            barcode_read = (name1, seq1[:bc_length], qual1[:bc_length])
        if bc_in_read2:
            barcode_read = (name2, seq2[:bc_length], qual2[:bc_length])

        read_num += 1
        if read_num > max_read_num:
            # Current output files are full: roll over to the next number
            read_num = 1
            file_number += 1

            read_name = numbered_fastq("read", file_number)
            out_read_fastq.close()
            out_read_fastq = open(read_name, 'w')
            read_names.append(read_name)

            if have_barcode:
                bc_name = numbered_fastq("BC", file_number)
                out_bc_fastq.close()
                out_bc_fastq = open(bc_name, 'w')
                bc_names.append(bc_name)
            else:
                bc_names.append(None)

            if have_sample_index:
                si_name = numbered_fastq("SI", file_number)
                out_si_fastq.close()
                out_si_fastq = open(si_name, 'w')
                si_names.append(si_name)
            else:
                si_names.append(None)

        if have_barcode:
            barcode_seq = barcode_read[1]
            barcode_qual = barcode_read[2]
            if chunk['barcode_reverse_complement']:
                barcode_seq = tk_seq.get_rev_comp(barcode_seq)
                barcode_qual = barcode_qual[::-1]  # reverse the quality string too

            if bc_idx is not None:
                idx = bc_idx.get(barcode_seq)
                if idx is not None:
                    bc_counts[idx] += 1
                else:
                    bad_count += 1

            tk_fasta.write_read_fastq(out_bc_fastq, barcode_read[0],
                                      barcode_seq, barcode_qual)

        if have_sample_index:
            tk_fasta.write_read_fastq(out_si_fastq, sample_index_read[0],
                                      sample_index_read[1],
                                      sample_index_read[2])

        # Paired output is interleaved: R1 followed by R2
        tk_fasta.write_read_fastq(out_read_fastq, name1, new_seq1, new_qual1)
        if paired:
            tk_fasta.write_read_fastq(out_read_fastq, name2, new_seq2,
                                      new_qual2)

    if have_barcode:
        out_bc_fastq.close()

        # Only emit BC counts if we had a whitelist
        if outs.bc_counts is not None:
            result = {}
            result['bad_bc_count'] = bad_count
            result['bc_counts'] = list(bc_counts)
            with open(outs.bc_counts, 'w') as bc_counts_out:
                tenkit.safe_json.dump_numpy(result, bc_counts_out)

    if have_sample_index:
        out_si_fastq.close()
    out_read_fastq.close()

    # Describe every produced file set as a new chunk. Barcodes were already
    # reverse complemented above, hence barcode_reverse_complement is False.
    outs.chunks = [
        {
            'read1': r,
            'read2': None,
            'barcode': bc,
            'sample_index': si,
            'barcode_reverse_complement': False,
            'reads_interleaved': have_read2 or interleaved,
            'gem_group': chunk['gem_group'],
            'read_group': chunk['read_group']
        }
        for (r, bc, si) in zip(read_names, bc_names, si_names)
    ]
def main(args, outs):
    """Trim the reads of one chunk with cutadapt and re-chunk the results.

    The R1/R2 reads are run through ``run_cutadapt_single_end``.  The trimmed
    reads are then written, interleaved, to ``read<N>.fastq`` files holding at
    most ``args.max_read_num`` read pairs each.  The trim records produced by
    cutadapt (``TRIM<N>.fastq``) and, when the chunk has them, the matching
    barcode (``BC<N>.fastq``) and sample index (``SI<N>.fastq``) reads are
    split in lockstep with the reads.

    Args:
        args: stage arguments; reads ``chunk``, ``trim_def``, ``adapters``,
            ``max_read_num`` and ``barcode_whitelist``.
        outs: stage outputs; ``chunks`` is set, ``read_counts`` is written,
            and ``bc_counts`` is written only when the chunk has a barcode
            file and a whitelist was given (it is set to None otherwise).

    Raises:
        ValueError: if cutadapt yields different read counts for R1 and R2,
            or if no barcode / sample index read matches a trimmed read.
    """
    chunk = args.chunk
    subsample_rate = chunk['subsample_rate']
    have_barcode = chunk['barcode'] is not None
    have_sample_index = chunk['sample_index'] is not None

    # STEP 1: We run the R1/R2 reads through cutadapt, writing them to a
    # temporary file with appropriate adapters trimmed, optionally filtering
    # out reads where adapters weren't found
    interleaved = chunk['read2'] is None

    # can't do discard_untrimmed because we're running cutadapt in
    # single-end mode
    if args.trim_def['discard_untrimmed']:
        martian.exit("discard_untrimmed was set in trim_def")

    if interleaved:
        trimmed_reads = martian.make_path("trimmed_reads.fastq")
        trim_info_fn = martian.make_path("trim_info.txt")
        initial_read_pairs, trimmed_read_pairs = run_cutadapt_single_end(
            chunk['read1'], trimmed_reads, trim_info_fn, args.trim_def,
            args.adapters)
    else:
        trimmed_r1 = martian.make_path("trimmed_r1.fastq")
        trimmed_r2 = martian.make_path("trimmed_r2.fastq")
        trim_info_r1_fn = martian.make_path("trim_info_r1.txt")
        trim_info_r2_fn = martian.make_path("trim_info_r2.txt")

        initial1, trimmed1 = run_cutadapt_single_end(
            chunk['read1'], trimmed_r1, trim_info_r1_fn, args.trim_def,
            args.adapters, read_id="R1")
        initial2, trimmed2 = run_cutadapt_single_end(
            chunk['read2'], trimmed_r2, trim_info_r2_fn, args.trim_def,
            args.adapters, read_id="R2")
        initial_read_pairs = initial1 + initial2
        trimmed_read_pairs = trimmed1 + trimmed2

        if initial1 != initial2:
            martian.exit(
                "Input fastq files for R1 and R2 are not the same length")
        if trimmed1 != trimmed2:
            raise ValueError(
                "Cutadapt produced differing numbers of reads for R1 and R2")

    # STEP 2: We run through the trimmed R1/R2 reads along with sample index
    # and barcode reads, chunking into files of max_read_num reads or less,
    # and skipping sample index/barcode reads that don't match the trimmed &
    # filtered R1/R2 reads
    max_read_num = args.max_read_num
    file_number = 1

    # Every input handle opened below is collected here so that all of them
    # (not only the read files) get closed once the reads are consumed.
    input_handles = []

    # open the available input read files and get the iterator over them
    if interleaved:
        reads_in = open_maybe_gzip(trimmed_reads, 'r')
        input_handles.append(reads_in)
        read_iter = tk_fasta.read_generator_fastq(reads_in, paired_end=True)
        trim_info = open_maybe_gzip(trim_info_fn, 'r')
        input_handles.append(trim_info)
        trim_iter = read_generator_trim_info(trim_info, paired_end=True)
    else:
        r1_in = open_maybe_gzip(trimmed_r1, 'r')
        r2_in = open_maybe_gzip(trimmed_r2, 'r')
        input_handles.extend([r1_in, r2_in])
        read_iter = ((r1[0], r1[1], r1[2], r2[0], r2[1], r2[2])
                     for r1, r2 in itertools.izip_longest(
                         tk_fasta.read_generator_fastq(r1_in),
                         tk_fasta.read_generator_fastq(r2_in)))
        trim_info_r1 = open_maybe_gzip(trim_info_r1_fn, 'r')
        trim_info_r2 = open_maybe_gzip(trim_info_r2_fn, 'r')
        input_handles.extend([trim_info_r1, trim_info_r2])
        trim_iter = (t1 + t2 for t1, t2 in itertools.izip(
            read_generator_trim_info(trim_info_r1),
            read_generator_trim_info(trim_info_r2)))

    # open output read file, which will be interleaved
    read_name = martian.make_path("read{}.fastq".format(file_number))
    out_readfiles = [read_name]
    out_read_fastq = open(read_name, 'w')

    # open trimmed read file, which will be interleaved
    trim_out_name = martian.make_path("TRIM{}.fastq".format(file_number))
    out_trimfiles = [trim_out_name]
    out_trim_fastq = open(trim_out_name, 'w')

    if args.barcode_whitelist is None:
        outs.bc_counts = None
        barcode_indices = None
    else:
        barcode_whitelist = sorted(
            load_barcode_whitelist(args.barcode_whitelist))
        barcode_indices = {
            bc: idx for (idx, bc) in enumerate(barcode_whitelist)
        }
        bc_counts = np.zeros(len(barcode_whitelist), dtype=np.int32)
        bad_count = 0

    # open barcode file if there is one
    if have_barcode:
        bc_name = martian.make_path("BC{}.fastq".format(file_number))
        out_bc_fastq = open(bc_name, 'w')
        out_barcodefiles = [bc_name]
        barcode_read = None
        bc_in = open_maybe_gzip(chunk['barcode'], 'r')
        input_handles.append(bc_in)
        bc_iter = tk_fasta.read_generator_fastq(bc_in)

        # Determine if barcode sequences need to be reverse complemented.
        # This uses a second, independent pass over the barcode file so the
        # main iterator above is left untouched.
        with open_maybe_gzip(chunk['barcode'], 'r') as bc_in2:
            bc_iter2 = tk_fasta.read_generator_fastq(bc_in2)
            barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist)
            barcode_rc = infer_barcode_reverse_complement(
                barcode_whitelist, bc_iter2)
    else:
        out_barcodefiles = [None]
        outs.bc_counts = None

    # open sample_index file if there is one
    if have_sample_index:
        si_name = martian.make_path("SI{}.fastq".format(file_number))
        out_si_fastq = open(si_name, 'w')
        si_in = open_maybe_gzip(chunk['sample_index'], 'r')
        input_handles.append(si_in)
        sample_index_read = None
        si_iter = tk_fasta.read_generator_fastq(si_in)
        out_sampleindex_files = [si_name]
    else:
        out_sampleindex_files = [None]

    read_num = 0
    random.seed(0)
    for (read, trim) in itertools.izip(read_iter, trim_iter):
        # Downsample (other than the first read). Note we've set a fixed
        # seed to make this deterministic.
        if read_num > 0 and random.random() > subsample_rate:
            continue

        # Now we need to step through the barcode and sample index reads to
        # find the matching reads
        if have_barcode:
            try:
                while barcode_read is None or not read_match(
                        read, barcode_read):
                    barcode_read = next(bc_iter)
                # reverse complement if all barcodes are RC-ed
                if barcode_rc:
                    barcode_read = (barcode_read[0],
                                    tk_seq.get_rev_comp(barcode_read[1]),
                                    barcode_read[2][::-1])
            except StopIteration:
                raise ValueError(
                    "Couldn't find barcode read matching {}".format(
                        get_read_name(read)))

        if have_sample_index:
            try:
                while sample_index_read is None or not read_match(
                        read, sample_index_read):
                    sample_index_read = next(si_iter)
            except StopIteration:
                raise ValueError(
                    "Couldn't find sample index read matching {}".format(
                        get_read_name(read)))

        (name1, seq1, qual1, name2, seq2, qual2) = read
        (tr_name1, tr_seq1, tr_qual1, tr_name2, tr_seq2, tr_qual2) = trim

        read_num += 1
        if read_num > max_read_num:
            # Current output files are full: roll over to the next file set.
            read_num = 1
            file_number += 1

            read_name = martian.make_path("read{}.fastq".format(file_number))
            out_read_fastq.close()
            out_read_fastq = open(read_name, 'w')
            out_readfiles.append(read_name)

            trim_out_name = martian.make_path(
                "TRIM{}.fastq".format(file_number))
            out_trim_fastq.close()
            out_trim_fastq = open(trim_out_name, 'w')
            out_trimfiles.append(trim_out_name)

            if have_barcode:
                bc_name = martian.make_path("BC{}.fastq".format(file_number))
                out_bc_fastq.close()
                out_bc_fastq = open(bc_name, 'w')
                out_barcodefiles.append(bc_name)
            else:
                out_barcodefiles.append(None)

            if have_sample_index:
                si_name = martian.make_path("SI{}.fastq".format(file_number))
                out_si_fastq.close()
                out_si_fastq = open(si_name, 'w')
                out_sampleindex_files.append(si_name)
            else:
                out_sampleindex_files.append(None)

        if have_barcode:
            barcode_seq = barcode_read[1]
            barcode_qual = barcode_read[2]
            if barcode_indices is not None:
                idx = barcode_indices.get(barcode_seq)
                if idx is not None:
                    bc_counts[idx] += 1
                else:
                    bad_count += 1

            tk_fasta.write_read_fastq(out_bc_fastq, barcode_read[0],
                                      barcode_seq, barcode_qual)

        if have_sample_index:
            tk_fasta.write_read_fastq(out_si_fastq, sample_index_read[0],
                                      sample_index_read[1],
                                      sample_index_read[2])

        tk_fasta.write_read_fastq(out_read_fastq, name1, seq1, qual1)
        tk_fasta.write_read_fastq(out_read_fastq, name2, seq2, qual2)

        tk_fasta.write_read_fastq(out_trim_fastq, tr_name1, tr_seq1, tr_qual1)
        tk_fasta.write_read_fastq(out_trim_fastq, tr_name2, tr_seq2, tr_qual2)

    for handle in input_handles:
        handle.close()

    if have_barcode:
        out_bc_fastq.close()
        # Only emit BC counts if we had a whitelist
        if outs.bc_counts is not None:
            result = {}
            result['bad_bc_count'] = bad_count
            result['bc_counts'] = list(bc_counts)
            with open(outs.bc_counts, 'w') as bc_counts_out:
                tenkit.safe_json.dump_numpy(result, bc_counts_out)

    with open(outs.read_counts, 'w') as outfile:
        read_counts = {
            'total_read_pairs': initial_read_pairs,
            'filtered_read_pairs': trimmed_read_pairs
        }
        tenkit.safe_json.dump_numpy(read_counts, outfile)

    if have_sample_index:
        out_si_fastq.close()
    out_read_fastq.close()
    out_trim_fastq.close()

    outs.chunks = [
        {
            'read1': r,  # output chunked trimmed read file
            'read2': None,
            'trim': t,  # output chunked trim file
            'barcode': bc,  # output chunked barcode file
            'sample_index': si,  # output chunked sample index file
            # we always keep BC in correct orientation
            'barcode_reverse_complement': False,
            'reads_interleaved': True,
            'gem_group': chunk['gem_group'],
            'read_group': chunk['read_group']
        }
        for (r, t, bc, si) in zip(out_readfiles, out_trimfiles,
                                  out_barcodefiles, out_sampleindex_files)
    ]
def main(args, outs):
    """Split barcoded reads into per-bucket FASTQ files.

    Reads from ``args.read1s_chunk`` (and ``args.read2s_chunk`` when paired)
    are walked in lockstep with the corrected barcodes in ``args.bcs``, one
    barcode per line.  Reads with an empty barcode or shorter than
    ``MIN_READ_LENGTH`` are dropped.  The rest get the barcode attached to
    their header and are appended to the bucket chosen by
    ``get_bucket_name``.  Each non-empty bucket is sorted with
    ``vdj_utils.fastq_barcode_sort_key`` and written to its own file.

    Args:
        args: stage arguments; reads ``chunks_per_gem_group``,
            ``read1s_chunk``, ``read2s_chunk`` and ``bcs``.
            ``chunks_per_gem_group`` is replaced by a copy with int keys.
        outs: stage outputs; ``buckets`` is set to a dict mapping bucket name
            to the FASTQ path written for it (empty buckets are omitted).
    """
    # Martian coerces dict keys to string
    # Coerce keys back to int
    args.chunks_per_gem_group = {
        int(k): v for k, v in args.chunks_per_gem_group.iteritems()
    }

    paired_end = args.read2s_chunk is not None

    buckets = {}
    bucket_filenames = {}

    # Inputs are tracked so they are closed even if bucketing fails midway.
    open_inputs = []
    try:
        # Lazy load R1
        r1_file = cr_io.open_maybe_gzip(args.read1s_chunk)
        open_inputs.append(r1_file)
        read1s = tk_fasta.read_generator_fastq(r1_file)

        # Lazy load R2
        if paired_end:
            r2_file = cr_io.open_maybe_gzip(args.read2s_chunk)
            open_inputs.append(r2_file)
            read2s = tk_fasta.read_generator_fastq(r2_file)
        else:
            read2s = []

        # Lazy load corrected BCs
        bc_file = cr_io.open_maybe_gzip(args.bcs)
        open_inputs.append(bc_file)
        bcs = (line.strip() for line in bc_file)

        for _, bucket_name in enumerate_bucket_names(
                args.chunks_per_gem_group):
            bucket_filenames[bucket_name] = martian.make_path(
                "%s.fastq" % bucket_name)
            buckets[bucket_name] = []

        for read1, read2, barcode in itertools.izip_longest(
                read1s, read2s, bcs):
            # Exclude unbarcoded reads
            if barcode == '':
                continue

            # Exclude short reads
            if len(read1[1]) < MIN_READ_LENGTH or (
                    read2 is not None and len(read2[1]) < MIN_READ_LENGTH):
                continue

            # Attach processed barcode to reads
            r1_hdr = cr_fastq.AugmentedFastqHeader(read1[0])
            r1_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
            r1_new_qname = r1_hdr.to_string()

            if paired_end:
                r2_hdr = cr_fastq.AugmentedFastqHeader(read2[0])
                r2_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
                r2_new_qname = r2_hdr.to_string()

            barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
            bucket_name = get_bucket_name(
                gem_group, barcode_seq, args.chunks_per_gem_group[gem_group])

            buckets[bucket_name].append((r1_new_qname, read1[1], read1[2]))
            if paired_end:
                buckets[bucket_name].append(
                    (r2_new_qname, read2[1], read2[2]))
    finally:
        for handle in open_inputs:
            handle.close()

    outs.buckets = {}

    # Sort and write each bucket
    for bucket_name, bucket in buckets.iteritems():
        # Don't create empty bucket files.
        # This is common when the reads are ordered by gem group
        # And a chunk sees only a single gem group.
        if not bucket:
            continue

        bucket.sort(key=vdj_utils.fastq_barcode_sort_key)

        filename = bucket_filenames[bucket_name]
        with cr_io.open_maybe_gzip(filename, 'w') as f:
            for read in bucket:
                tk_fasta.write_read_fastq(f, *read)

        outs.buckets[bucket_name] = filename
def main(args, outs):
    """Attach barcode, sample index and trim tags to the reads of a BAM chunk.

    The BAM in ``args.align_chunk`` is walked in lockstep with the chunk's
    barcode, sample index and trim FASTQs (each optional).  Every BAM record
    sharing the current FASTQ read name receives the tags built from those
    records.  Groups of same-named records are passed through
    ``GlobalFivePrimePosTagger.tag_reads`` and written to ``outs.output``.

    Args:
        args: stage arguments; reads ``chunk``, ``align_chunk``,
            ``barcode_whitelist``, ``bc_counts``, ``bc_confidence_threshold``
            and ``exclude_non_bc_reads``.
        outs: stage outputs; the tagged BAM is written to ``output`` and
            ``perfect_read_count`` is set to the number of reads passing
            ``stringent_read_filter``.
    """
    chunk = args.chunk

    bam_in = create_bam_infile(args.align_chunk)
    bam_out, _ = tk_bam.create_bam_outfile(
        outs.output,
        None,
        None,
        template=bam_in,
        pgs=[
            tk_bam.make_pg_header(martian.get_pipelines_version(),
                                  "attach_bcs", TENX_PRODUCT_NAME)
        ])

    gp_tagger = GlobalFivePrimePosTagger(bam_in)

    if args.barcode_whitelist is None or args.bc_counts is None:
        # If there's no whitelist or counts then all high quality BC reads
        # get allowed.
        barcode_whitelist = None
        wl_idxs = None
        bc_dist = None
    else:
        barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist)

        # Load the bc counts for this GEM group
        with open(args.bc_counts, 'r') as counts_file:
            counts = json.load(counts_file)
        counts = counts[str(chunk['gem_group'])]['bc_counts']

        # Prior distribution over barcodes, with pseudo-count.
        # The builtin float is used as dtype: the np.float alias was removed
        # from NumPy.
        bc_dist = np.array(counts, dtype=float) + 1.0
        bc_dist = bc_dist / bc_dist.sum()
        wl_idxs = {
            bc: idx for (idx, bc) in enumerate(sorted(barcode_whitelist))
        }

    # set random seed to get deterministic subsampling
    random.seed(0)

    # FASTQ inputs opened here are closed once the BAM has been processed.
    input_files = []

    if chunk['barcode'] is not None:
        bc_file = open_maybe_gzip(chunk['barcode'])
        input_files.append(bc_file)
        processed_barcode_iter = get_raw_processed_barcodes(
            bc_file, barcode_whitelist, args.bc_confidence_threshold,
            chunk['gem_group'], chunk['barcode_reverse_complement'], wl_idxs,
            bc_dist)
        require_barcode_for_stringent = True
    else:
        processed_barcode_iter = itertools.repeat(None)
        require_barcode_for_stringent = False

    if chunk['sample_index'] is not None:
        si_file = open_maybe_gzip(chunk['sample_index'])
        input_files.append(si_file)
        sample_index_iter = tk_fasta.read_generator_fastq(si_file)
    else:
        sample_index_iter = itertools.repeat(None)

    if chunk['trim'] is not None:
        trim_file = open_maybe_gzip(chunk['trim'])
        input_files.append(trim_file)
        trim_iter = tk_fasta.read_generator_fastq(trim_file, paired_end=True)
    else:
        trim_iter = itertools.repeat(None)

    iters = itertools.izip(processed_barcode_iter, sample_index_iter,
                           trim_iter)

    def flush(pending):
        """Tag and write one group of reads; return how many are perfect."""
        gp_tagger.tag_reads(pending)
        n_perfect = 0
        for r in pending:
            if stringent_read_filter(r, require_barcode_for_stringent):
                n_perfect += 1
            if (not args.exclude_non_bc_reads
                    or get_read_barcode(r) is not None):
                bam_out.write(r)
        return n_perfect

    # First read
    try:
        read = next(bam_in)
    except StopIteration:
        read = None

    # Number of perfect reads -- used to compute down-sampling rates in
    # mark_duplicates
    perfect_read_count = 0

    # Due to secondary alignments, we must apply the tags to all
    # reads with the same cluster name.
    for (barcode_info, sample_index_info, trim_info) in iters:
        tags = []
        read_name = None

        if read is None:
            break

        if barcode_info is not None:
            (bc_read_name, raw_bc_seq, processed_bc_seq,
             raw_bc_qual) = barcode_info
            tags.append((RAW_BARCODE_TAG, raw_bc_seq))
            tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual))
            if processed_bc_seq is not None:
                tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq))
            read_name = bc_read_name.split()[0]

        if sample_index_info is not None:
            (si_read_name, seq, qual) = sample_index_info
            tags.append((SAMPLE_INDEX_TAG, seq))
            tags.append((SAMPLE_INDEX_QUAL_TAG, qual))

            if read_name is not None:
                if si_read_name.split()[0] != read_name:
                    martian.log_info(
                        "mismatch: si_read_name: %s, bam_read_name: %s" %
                        (si_read_name, read_name))
                assert (si_read_name.split()[0] == read_name)
            else:
                read_name = si_read_name.split()[0]

        # R1 and R2 share the barcode / sample index tags but carry their
        # own trim tags, so R2 works on a copy.
        r1_tags = tags
        r2_tags = list(r1_tags)

        if trim_info is not None:
            (trim1_read_name, trim1_seq, trim1_qual, trim2_read_name,
             trim2_seq, trim2_qual) = trim_info
            if len(trim1_seq) > 0:
                r1_tags.append((TRIM_TAG, trim1_seq))
                r1_tags.append((TRIM_QUAL_TAG, trim1_qual))
            if len(trim2_seq) > 0:
                r2_tags.append((TRIM_TAG, trim2_seq))
                r2_tags.append((TRIM_QUAL_TAG, trim2_qual))

        reads_attached = 0
        reads_to_attach = []

        while read.query_name == read_name or read_name is None:
            tags = r1_tags if read.is_read1 else r2_tags
            if len(tags) > 0:
                existing_tags = read.tags
                existing_tags.extend(tags)
                read.tags = existing_tags

            if reads_to_attach and (
                    read.query_name != reads_to_attach[0].query_name
                    or reads_to_attach[0].query_name is None):
                reads_attached += len(reads_to_attach)
                perfect_read_count += flush(reads_to_attach)
                reads_to_attach = []

            reads_to_attach.append(read)

            try:
                read = next(bam_in)
            except StopIteration:
                read = None
                break

        reads_attached += len(reads_to_attach)
        perfect_read_count += flush(reads_to_attach)

        # We may have more than 2 reads if there was a
        # secondary alignment, but less than 2 means
        # something went wrong
        assert (reads_attached >= 2)

    for handle in input_files:
        handle.close()

    outs.perfect_read_count = perfect_read_count
    bam_out.close()
def main(args, outs):
    """Attach barcode and sample index tags to the reads of a BAM chunk.

    The BAM in ``args.align_chunk`` is walked in lockstep with the chunk's
    barcode and sample index FASTQs (each optional).  Every BAM record
    sharing the current FASTQ read name gets the raw barcode, barcode
    quality, processed barcode (when one was produced) and sample index tags
    appended, and is then written to ``outs.output``.

    Args:
        args: stage arguments; reads ``chunk``, ``align_chunk``,
            ``barcode_whitelist``, ``bc_counts``, ``bc_confidence_threshold``
            and ``exclude_non_bc_reads``.
        outs: stage outputs; the tagged BAM is written to ``output`` and
            ``perfect_read_count`` is set to the number of reads passing
            ``tenkit.read_filter.stringent_read_filter``.
    """
    chunk = args.chunk

    bam_in = tk_bam.create_bam_infile(args.align_chunk)
    bam_out, _ = tk_bam.create_bam_outfile(
        outs.output, None, None, template=bam_in,
        pgs=tk_bam.make_pg_header(martian.get_pipelines_version(),
                                  "attach_bcs"))

    if args.barcode_whitelist is None or args.bc_counts is None:
        # If there's no whitelist or counts then all high quality BC reads
        # get allowed.
        barcode_whitelist = None
        wl_idxs = None
        bc_dist = None
    else:
        barcode_whitelist = tk_seq.load_barcode_whitelist(
            args.barcode_whitelist)

        # Load the bc counts for this GEM group
        with open(args.bc_counts, 'r') as counts_file:
            counts = json.load(counts_file)
        counts = counts[str(chunk['gem_group'])]['bc_counts']

        # Prior distribution over barcodes, with pseudo-count.
        # The builtin float is used as dtype: the np.float alias was removed
        # from NumPy.
        bc_dist = np.array(counts, dtype=float) + 1.0
        bc_dist = bc_dist / bc_dist.sum()
        wl_idxs = {
            bc: idx for (idx, bc) in enumerate(sorted(barcode_whitelist))
        }

    # set random seed to get deterministic subsampling
    random.seed(0)

    def open_maybe_gzip(fn):
        if fn[-2:] == "gz":
            return gzip.open(fn)
        else:
            return open(fn)

    # FASTQ inputs opened here are closed once the BAM has been processed.
    input_files = []

    if chunk['barcode']:
        bc_file = open_maybe_gzip(chunk['barcode'])
        input_files.append(bc_file)
        processed_barcode_iter = get_raw_processed_barcodes(
            bc_file, barcode_whitelist, args.bc_confidence_threshold,
            chunk['gem_group'], chunk['barcode_reverse_complement'], wl_idxs,
            bc_dist)
        require_barcode_for_stringent = True
    else:
        processed_barcode_iter = itertools.repeat(None)
        require_barcode_for_stringent = False

    if chunk['sample_index']:
        si_file = open_maybe_gzip(chunk['sample_index'])
        input_files.append(si_file)
        sample_index_iter = tk_fasta.read_generator_fastq(si_file)
    else:
        sample_index_iter = itertools.repeat(None)

    iters = itertools.izip(processed_barcode_iter, sample_index_iter)

    # First read. An empty BAM leaves `read` as None, which the loop below
    # already treats as "nothing left to tag".
    try:
        read = next(bam_in)
    except StopIteration:
        read = None

    # Number of perfect reads -- used to compute down-sampling rates in
    # mark_duplicates
    perfect_read_count = 0

    # Due to secondary alignments, we must apply the tags to all
    # reads with the same cluster name.
    for (barcode_info, sample_index_info) in iters:
        tags = []
        read_name = None

        if read is None:
            break

        if barcode_info:
            (bc_read_name, raw_bc_seq, processed_bc_seq,
             raw_bc_qual) = barcode_info
            tags.append((RAW_BARCODE_TAG, raw_bc_seq))
            tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual))
            if processed_bc_seq is not None:
                tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq))
            read_name = bc_read_name.split()[0]

        if sample_index_info:
            (si_read_name, seq, qual) = sample_index_info
            tags.append((SAMPLE_INDEX_TAG, seq))
            tags.append((SAMPLE_INDEX_QUAL_TAG, qual))

            if read_name is not None:
                if si_read_name.split()[0] != read_name:
                    martian.log_info(
                        "mismatch: si_read_name: %s, bam_read_name: %s" %
                        (si_read_name, read_name))
                assert(si_read_name.split()[0] == read_name)
            else:
                read_name = si_read_name.split()[0]

        reads_attached = 0

        while read.qname == read_name or read_name is None:
            if len(tags) > 0:
                existing_tags = read.tags
                existing_tags.extend(tags)
                read.tags = existing_tags

            reads_attached += 1

            # Count the perfect reads -- will be used when subsampling in
            # dedup
            if tenkit.read_filter.stringent_read_filter(
                    read, require_barcode_for_stringent):
                perfect_read_count += 1

            if (not args.exclude_non_bc_reads
                    or tk_io.get_read_barcode(read) is not None):
                bam_out.write(read)

            try:
                read = next(bam_in)
            except StopIteration:
                read = None
                break

        # We may have more than 2 reads if there was a
        # secondary alignment, but less than 2 means
        # something went wrong
        assert(reads_attached >= 2)

    for handle in input_files:
        handle.close()

    outs.perfect_read_count = perfect_read_count
    bam_out.close()
def main(args, outs):
    """Correct the raw barcodes of a read chunk and write one per line.

    For each of the first ``args.initial_reads`` read pairs, the raw barcode
    and its quality are taken from the R1 header tags.  A barcode that is not
    in the whitelist is passed to ``cr_stats.correct_bc_error``.  A barcode
    that is in the whitelist is kept as is.  Without a whitelist, a barcode
    is kept unless it contains an ``N``.  The resulting barcode (formatted
    with the gem group), or an empty line when there is none, is written to
    ``outs.corrected_bcs``.

    Args:
        args: stage arguments; reads ``barcode_whitelist``,
            ``barcode_counts``, ``gem_group``, ``library_type``,
            ``read1_chunk``, ``read2_chunk``, ``initial_reads`` and
            ``barcode_confidence_threshold``.
        outs: stage outputs; ``corrected_bcs`` gets
            ``h5_constants.LZ4_SUFFIX`` appended and is written, and
            ``corrected_barcode_counts`` and ``chunked_reporter`` are handed
            to the barcode counter and the reporter respectively.
    """
    # Load barcode whitelist. The name must be bound even without a
    # whitelist because it is passed to load_barcode_dist below.
    # NOTE(review): presumably load_barcode_dist accepts None -- confirm.
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist = None
        barcode_whitelist_set = None

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group,
                                              args.library_type)

    in_read1_fastq = cr_io.open_maybe_gzip(args.read1_chunk)
    in_read2_fastq = cr_io.open_maybe_gzip(
        args.read2_chunk) if args.read2_chunk else []

    outs.corrected_bcs += h5_constants.LZ4_SUFFIX
    out_file = cr_io.open_maybe_gzip(outs.corrected_bcs, 'w')

    bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                         outs.corrected_barcode_counts)

    try:
        # Correct barcodes, add processed bc tag to fastq
        read_pair_iter = itertools.izip_longest(
            tk_fasta.read_generator_fastq(in_read1_fastq),
            tk_fasta.read_generator_fastq(in_read2_fastq))

        for read1, read2 in itertools.islice(read_pair_iter,
                                             args.initial_reads):
            read1_header = cr_fastq.AugmentedFastqHeader(read1[0])

            raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
            bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

            processed_bc = None

            if raw_bc:
                if (barcode_whitelist_set is not None
                        and raw_bc not in barcode_whitelist_set):
                    processed_bc = cr_stats.correct_bc_error(
                        args.barcode_confidence_threshold, raw_bc, bc_qual,
                        barcode_dist)
                elif 'N' in raw_bc:
                    # Disallow Ns in no-whitelist case
                    processed_bc = None
                else:
                    processed_bc = raw_bc

                if processed_bc:
                    bc_counter.count(None, processed_bc, None)

                    # Add gem group to barcode sequence
                    processed_bc = cr_utils.format_barcode_seq(
                        processed_bc, gem_group=args.gem_group)

                reporter.vdj_barcode_cb(raw_bc, processed_bc)

            out_file.write(
                '%s\n' % (processed_bc if processed_bc is not None else ''))
    finally:
        in_read1_fastq.close()
        # Without an R2 chunk this is an empty list, which has no close().
        if in_read2_fastq:
            in_read2_fastq.close()
        out_file.close()

    bc_counter.close()

    reporter.save(outs.chunked_reporter)
def main(args, outs):
    """Attach barcode, sample index and trim tags to the reads of a BAM chunk.

    The BAM in ``args.align_chunk`` is walked in lockstep with the chunk's
    barcode, sample index and trim FASTQs (each optional).  Every BAM record
    sharing the current FASTQ read name receives the tags built from those
    records.  Groups of same-named records are passed through
    ``GlobalFivePrimePosTagger.tag_reads`` and written to ``outs.output``.
    Only the first chunk (``args.chunk_index == 0``) gets the
    ``10x_bam_to_fastq`` comment header lines.

    Args:
        args: stage arguments; reads ``chunk``, ``chunk_index``,
            ``align_chunk``, ``barcode_whitelist``, ``bc_counts``,
            ``bc_confidence_threshold`` and ``exclude_non_bc_reads``.
        outs: stage outputs; ``outputs`` is reset to an empty list, the
            tagged BAM is written to ``output`` and ``perfect_read_count`` is
            set to the number of reads passing ``stringent_read_filter``.
    """
    # this silences a weird non-failure in --strict=error mode
    # TODO(lhepler): remove this when martian upstream handles this itself
    outs.outputs = []

    chunk = args.chunk

    bam_in = tk_bam.create_bam_infile(args.align_chunk)
    bc_spec = "{}:{}".format(RAW_BARCODE_TAG, RAW_BARCODE_QUAL_TAG)

    # only comment the first chunk, otherwise later merge will duplicate the
    # comments and could lead to:
    # samtools merge ... : '[finish_merged_header] Output header text too long'
    if args.chunk_index > 0:
        COs = None
    elif chunk['trim']:
        COs = ['10x_bam_to_fastq:R1({},TR:TQ,SEQ:QUAL)'.format(bc_spec),
               '10x_bam_to_fastq:R2(SEQ:QUAL)',
               '10x_bam_to_fastq:I1(BC:QT)']
    else:
        COs = ['10x_bam_to_fastq:R1({},SEQ:QUAL)'.format(bc_spec),
               '10x_bam_to_fastq:R2(SEQ:QUAL)',
               '10x_bam_to_fastq:I1(BC:QT)']

    bam_out, _ = tk_bam.create_bam_outfile(
        outs.output, None, None, template=bam_in,
        pgs=[tk_bam.make_pg_header(martian.get_pipelines_version(),
                                   "attach_bcs")],
        cos=COs)

    gp_tagger = GlobalFivePrimePosTagger(bam_in)

    if args.barcode_whitelist is None or args.bc_counts is None:
        # If there's no whitelist or counts then all high quality BC reads
        # get allowed.
        barcode_whitelist = None
        wl_idxs = None
        bc_dist = None
    else:
        barcode_whitelist = bc_utils.load_barcode_whitelist(
            args.barcode_whitelist)

        # Load the bc counts for this GEM group
        with open(args.bc_counts, 'r') as counts_file:
            counts = json.load(counts_file)
        counts = counts[str(chunk['gem_group'])]['bc_counts']

        # Prior distribution over barcodes, with pseudo-count.
        # The builtin float is used as dtype: the np.float alias was removed
        # from NumPy.
        bc_dist = np.array(counts, dtype=float) + 1.0
        bc_dist = bc_dist / bc_dist.sum()
        wl_idxs = {
            bc: idx for (idx, bc) in enumerate(sorted(barcode_whitelist))
        }

    # set random seed to get deterministic subsampling
    random.seed(0)

    def open_maybe_gzip(fn):
        if fn[-2:] == "gz":
            return gzip.open(fn)
        else:
            return open(fn)

    # FASTQ inputs opened here are closed once the BAM has been processed.
    input_files = []

    if chunk['barcode']:
        bc_file = open_maybe_gzip(chunk['barcode'])
        input_files.append(bc_file)
        processed_barcode_iter = get_raw_processed_barcodes(
            bc_file, barcode_whitelist, args.bc_confidence_threshold,
            chunk['gem_group'], chunk['barcode_reverse_complement'], wl_idxs,
            bc_dist)
        require_barcode_for_stringent = True
    else:
        processed_barcode_iter = itertools.repeat(None)
        require_barcode_for_stringent = False

    if chunk['trim']:
        trim_file = open_maybe_gzip(chunk['trim'])
        input_files.append(trim_file)
        trim_iter = tk_fasta.read_generator_fastq(trim_file, paired_end=True)
    else:
        trim_iter = itertools.repeat(None)

    if chunk['sample_index']:
        si_file = open_maybe_gzip(chunk['sample_index'])
        input_files.append(si_file)
        sample_index_iter = tk_fasta.read_generator_fastq(si_file)
    else:
        sample_index_iter = itertools.repeat(None)

    iters = itertools.izip(processed_barcode_iter, sample_index_iter,
                           trim_iter)

    def flush(pending):
        """Tag and write one group of reads; return how many are perfect."""
        gp_tagger.tag_reads(pending)
        n_perfect = 0
        for r in pending:
            if stringent_read_filter(r, require_barcode_for_stringent):
                n_perfect += 1
            if (not args.exclude_non_bc_reads
                    or crdna_io.get_read_barcode(r) is not None):
                bam_out.write(r)
        return n_perfect

    # First read. An empty BAM leaves `read` as None, which the loop below
    # already treats as "nothing left to tag".
    try:
        read = next(bam_in)
    except StopIteration:
        read = None

    # Number of perfect reads -- used to compute down-sampling rates in
    # mark_duplicates
    perfect_read_count = 0

    # Due to secondary alignments, we must apply the tags to all
    # reads with the same cluster name.
    for (barcode_info, sample_index_info, trim_info) in iters:
        tags = []
        read_name = None

        if read is None:
            break

        if barcode_info:
            (bc_read_name, raw_bc_seq, processed_bc_seq,
             raw_bc_qual) = barcode_info
            tags.append((RAW_BARCODE_TAG, raw_bc_seq))
            tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual))
            if processed_bc_seq is not None:
                tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq))
            read_name = bc_read_name.split()[0]

        if sample_index_info:
            (si_read_name, seq, qual) = sample_index_info
            tags.append((SAMPLE_INDEX_TAG, seq))
            tags.append((SAMPLE_INDEX_QUAL_TAG, qual))

            if read_name is not None:
                if si_read_name.split()[0] != read_name:
                    martian.log_info(
                        "mismatch: si_read_name: %s, bam_read_name: %s" %
                        (si_read_name, read_name))
                assert(si_read_name.split()[0] == read_name)
            else:
                read_name = si_read_name.split()[0]

        # R1 and R2 share the barcode / sample index tags but carry their
        # own trim tags, so R2 works on a copy.
        r1_tags = tags
        r2_tags = list(tags)

        if trim_info:
            (trim1_read_name, trim1_seq, trim1_qual, trim2_read_name,
             trim2_seq, trim2_qual) = trim_info
            if len(trim1_seq) > 0:
                r1_tags.append((TRIM_TAG, trim1_seq))
                r1_tags.append((TRIM_QUAL_TAG, trim1_qual))
            if len(trim2_seq) > 0:
                r2_tags.append((TRIM_TAG, trim2_seq))
                r2_tags.append((TRIM_QUAL_TAG, trim2_qual))

        reads_attached = 0
        reads_to_attach = []

        while read.qname == read_name or read_name is None:
            tags = r1_tags if read.is_read1 else r2_tags
            if len(tags) > 0:
                existing_tags = read.tags
                existing_tags.extend(tags)
                read.tags = existing_tags

            if reads_to_attach and (
                    read.query_name != reads_to_attach[0].query_name
                    or reads_to_attach[0].query_name is None):
                reads_attached += len(reads_to_attach)
                perfect_read_count += flush(reads_to_attach)
                reads_to_attach = []

            reads_to_attach.append(read)

            try:
                read = next(bam_in)
            except StopIteration:
                read = None
                break

        reads_attached += len(reads_to_attach)
        perfect_read_count += flush(reads_to_attach)

        # We may have more than 2 reads if there was a
        # secondary alignment, but less than 2 means
        # something went wrong
        assert(reads_attached >= 2)

    for handle in input_files:
        handle.close()

    outs.perfect_read_count = perfect_read_count
    bam_out.close()
def get_consensus_seq(clonotype_name, sel_contigs, best_contig, out_dir, args):
    """Build a consensus sequence from a set of contigs.

    The selected contigs are copied from ``args.contigs_fastq`` into a BAM
    (input to the assembler) and a FASTQ (input to the aligner).  When there
    are at least ``MIN_CONTIGS_FOR_CONSENSUS`` contigs, ``vdj_asm asm`` is run
    to produce the consensus.  The consensus, or the best contig when no
    consensus is available, is written to a FASTA, and ``vdj_asm base-quals``
    is run against it.

    Args:
        clonotype_name: Used to prefix output files.
        sel_contigs: Names of contigs to use for consensus building.
        best_contig: Name of "best" contig. Its sequence and base qualities
            are looked up in the contigs FASTQ.
        out_dir: dir used for temporary results.
        args: stage args; ``contigs_fastq`` is read.

    Returns:
        A tuple (best_contig_seq, best_contig_quals, consensus_seq,
        out_bam_name, out_fastq_name, out_fasta_name).
        - best_contig_seq/best_contig_quals: the sequence and quals of the
          best contig.
        - consensus_seq: the consensus sequence or None if no consensus
          could be built.
        - out_bam_name: Path of BAM with alignments of contigs to the
          consensus seq.
        - out_fastq_name: FASTQ with contig sequences.
        - out_fasta_name: FASTA with the consensus sequence.
    """
    prefix = clonotype_name + '_contigs'

    best_seq = None
    best_quals = None

    # Input to base quality computation - we don't really need the
    # base qualities because we will replace them by read-based qualities
    # But we need to do this to get proper alignments of contigs against
    # the consensus.
    out_fastq_name = martian.make_path(prefix + '.fastq')

    # Input to assembly
    out_bam_name = martian.make_path(prefix + '.bam')

    # The reference in the output bam doesn't really matter.
    out_bam, _ = tk_bam.create_bam_outfile(out_bam_name, ['chr1'], [1])

    # Read the entire fastq (all contigs) and write the selected contigs to
    # a bam for the assembler and a fastq for the aligner.
    with open(args.contigs_fastq, 'r') as contigs_in, \
            open(out_fastq_name, 'w') as contigs_out:
        for (name, seq, quals) in tk_fasta.read_generator_fastq(contigs_in):
            if name not in sel_contigs:
                continue

            if name == best_contig:
                best_seq = seq
                best_quals = quals

            header = cr_fastq.AugmentedFastqHeader(name)
            # Create a pseudo-UMI for each input contig
            header.set_tag(PROCESSED_UMI_TAG, name)
            # Put all reads on the same "barcode". This is important, so
            # the assembler assembles all of them together.
            header.set_tag(PROCESSED_BARCODE_TAG, clonotype_name)

            record = pysam.AlignedRead()
            record.reference_start = 0
            record.reference_id = 0
            # Wrap with str() or pysam will crash when given unicode
            record.qname = str(header.to_string())
            record.seq = seq
            record.qual = quals
            record.flag = MAPPED_UNPAIRED_FLAG
            out_bam.write(record)

            # Now change the tags. The final bam concatenation code will
            # pull the tags out of the header, so we want these to be
            # meaningful. Put the real barcode in the barcode tag. The
            # alignment-base-qual code will ignore it anyway.
            header.set_tag(PROCESSED_BARCODE_TAG, name.split('_')[0])
            tk_fasta.write_read_fastq(contigs_out, header.to_string(), seq,
                                      quals)

    out_bam.close()

    assert best_seq is not None

    out_fasta_name = martian.make_path(prefix + '.fasta')

    # Run the assembler to produce a consensus sequence. Read contig-reads
    # from out_bam_name. The resulting sequences will be in
    # out_dir/<clonotype_name>_contigs.fasta. This is the only output of the
    # assembler we care about.
    consensus = None
    if len(sel_contigs) >= MIN_CONTIGS_FOR_CONSENSUS:
        asm_cmd = [
            'vdj_asm',
            'asm',
            out_bam_name,
            out_dir,
            '--single-end',
            '--cons',  # required so we produce a single output sequence
            '--kmers=0',
            '--min-qual=0',
            '--score-factor=0.0'
        ]
        sys.stderr.write('Running ' + ' '.join(asm_cmd) + '\n')
        tk_subproc.check_call(asm_cmd, cwd=os.getcwd())

        with open(os.path.join(out_dir, prefix + '.fasta'), 'r') as asm_fasta:
            fasta_lines = asm_fasta.readlines()

        # In some rare cases (eg. input contigs have 0 quality), assembly
        # might fail and leave an empty file; the consensus then stays None.
        if fasta_lines:
            consensus = fasta_lines[1].strip()

    # Write the best contig sequence on a new fasta. We need to make sure
    # this has the right contig name because this will be the name written in
    # the bam alignments of the contigs against the consensus
    with open(out_fasta_name, 'w') as fasta_out:
        tk_fasta.write_read_fasta(fasta_out, clonotype_name,
                                  consensus or best_seq)

    # Now align the same reads that were used in vdj_asm against the
    # consensus that you just got. The output will be in
    # out_dir/<clonotype_name> + '_contigs.bam'
    qual_cmd = [
        'vdj_asm', 'base-quals',
        martian.make_path(prefix), out_dir, '--single-end'
    ]
    sys.stderr.write('Running ' + ' '.join(qual_cmd) + '\n')
    tk_subproc.check_call(qual_cmd, cwd=os.getcwd())

    # Move the BAM of the contigs aligned against the consensus out of the
    # outs (Will overwrite this bam which was already used as input to
    # assembly).
    cr_io.move(os.path.join(out_dir, prefix + '.bam'), out_bam_name)

    return (best_seq, best_quals, consensus, out_bam_name, out_fastq_name,
            out_fasta_name)
def main(args, outs):
    '''Correct barcodes for a chunk of read pairs and tag the FASTQ headers.

    Reads R1/R2 records in lockstep from args.read1_chunk and
    args.read2_chunk, attempts to produce a processed barcode for each pair
    from the raw barcode stored in the R1 header, writes the (possibly
    re-tagged) records to the corrected FASTQ outputs, and saves the
    barcode reporter.

    Args:
        args: Stage arguments. Reads barcode_whitelist, barcode_counts,
            gem_group, read1_chunk, read2_chunk, initial_reads and
            barcode_confidence_threshold.
        outs: Stage outputs. Writes corrected_read1s, corrected_read2s,
            corrected_barcode_counts and chunked_reporter.
    '''
    reporter = vdj_report.VdjReporter()

    if args.barcode_whitelist is not None:
        # Load barcode whitelist and the barcode count distribution that
        # is used when correcting barcodes against it.
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)
        barcode_dist = cr_utils.load_barcode_dist(
            args.barcode_counts, barcode_whitelist, args.gem_group)
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        # No whitelist: barcode_dist is only consumed by the correction
        # branch below, which is unreachable when the whitelist set is None.
        # (Previously this path referenced an unassigned barcode_whitelist.)
        barcode_dist = None
        barcode_whitelist_set = None

    # The with-statement guarantees all four handles are closed even if
    # correction raises part-way through the chunk.
    with open(args.read1_chunk) as in_read1_fastq, \
            open(args.read2_chunk) as in_read2_fastq, \
            open(outs.corrected_read1s, 'w') as out_read1_fastq, \
            open(outs.corrected_read2s, 'w') as out_read2_fastq:

        bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                             outs.corrected_barcode_counts)

        # Correct barcodes, add processed bc tag to fastq.
        # izip stops at the shorter input; islice caps the number of pairs
        # at args.initial_reads (None means no cap).
        read_pair_iter = itertools.izip(
            tk_fasta.read_generator_fastq(in_read1_fastq),
            tk_fasta.read_generator_fastq(in_read2_fastq))

        for read1, read2 in itertools.islice(read_pair_iter,
                                             args.initial_reads):
            read1_header = cr_fastq.AugmentedFastqHeader(read1[0])
            read2_header = cr_fastq.AugmentedFastqHeader(read2[0])

            # The raw barcode and its qualities are taken from the R1 header.
            raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
            bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

            if raw_bc:
                if (barcode_whitelist_set is not None
                        and raw_bc not in barcode_whitelist_set):
                    # Not on the whitelist: attempt error correction.
                    processed_bc = cr_stats.correct_bc_error(
                        args.barcode_confidence_threshold, raw_bc, bc_qual,
                        barcode_dist)
                elif 'N' in raw_bc:
                    # Disallow Ns in no-whitelist case
                    processed_bc = None
                else:
                    processed_bc = raw_bc

                if processed_bc:
                    bc_counter.count(None, processed_bc, None)

                    # Add gem group to barcode sequence
                    processed_bc = cr_utils.format_barcode_seq(
                        processed_bc, gem_group=args.gem_group)
                    read1_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                         processed_bc)
                    read2_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                         processed_bc)

                reporter.vdj_barcode_cb(raw_bc, processed_bc)

            # Every pair is written out, whether or not a processed barcode
            # tag was added to its headers.
            tk_fasta.write_read_fastq(out_read1_fastq,
                                      read1_header.to_string(),
                                      read1[1], read1[2])
            tk_fasta.write_read_fastq(out_read2_fastq,
                                      read2_header.to_string(),
                                      read2[1], read2[2])

    bc_counter.close()

    reporter.save(outs.chunked_reporter)