def run_assembly(fastq_pref, fasta_pref, args):
    """Assemble the ``vdj_asm asm`` command line from ``args`` and run it.

    Args:
        fastq_pref: passed through as the first positional argument after
            ``asm``.
        fasta_pref: passed through as the second positional argument.
        args: stage arguments; the attributes read here are the assembler
            thresholds listed below plus ``chemistry_def``,
            ``min_readpairs_per_umi``, ``gem_group`` and ``use_unmapped``.

    The command is echoed to stderr and then executed in the current working
    directory via ``tk_subproc.check_call``.
    """
    # (flag prefix, value) pairs, in command-line order.
    valued_options = (
        ('--kmers=', args.min_kmer_count),
        ('--min-contig=', args.min_contig_len),
        ('--min-qual=', args.min_qual),
        ('--score-factor=', args.score_factor),
        ('--qual-factor=', args.qual_factor),
        ('--min-sw-score=', args.min_sw_score),
        ('--rt-error=', args.rt_error),
    )
    cmd = ['vdj_asm', 'asm', fastq_pref, fasta_pref]
    cmd.extend([prefix + str(value) for prefix, value in valued_options])

    if not cr_chem.has_umis(args.chemistry_def):
        martian.log_info('Assembly without UMIs is not fully supported.')

    # The per-gem-group cutoff is looked up with the gem group as a string key.
    umi_cutoff = args.min_readpairs_per_umi[str(args.gem_group)]
    if not cr_chem.is_paired_end(args.chemistry_def):
        cmd.extend(['--min-umi-reads=' + str(umi_cutoff), '--single-end'])
    else:
        # Paired-end: the read-pair cutoff is doubled for the assembler flag.
        cmd.append('--min-umi-reads=' + str(2 * umi_cutoff))

    if args.use_unmapped:
        cmd.append('--use-unmapped')

    # '--mixture-filter' is deliberately not passed (it was disabled upstream).

    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')
    tk_subproc.check_call(cmd, cwd=os.getcwd())
def run_assembly(fastq_pref, fasta_pref, args):
    """Assemble the ``vdj_asm asm`` command line from ``args`` and run it.

    Args:
        fastq_pref: passed through as the first positional argument after
            ``asm``.
        fasta_pref: passed through as the second positional argument.
        args: stage arguments; the attributes read here are the assembler
            thresholds listed below plus ``subsample_rate``, ``gem_group``,
            ``chemistry_def``, ``use_sw`` and ``min_readpairs_per_umi``.

    The command is echoed to stderr and then executed in the current working
    directory via ``subprocess.check_call``.
    """
    # NOTE: Martian stores the gem_group dict keys as strings
    gem_group_key = str(args.gem_group)

    # (flag prefix, value) pairs, in command-line order.
    valued_options = (
        ('--kmers=', args.min_kmer_count),
        ('--min-contig=', args.min_contig_len),
        ('--npaths=', args.npaths),
        ('--nx=', args.nx),
        ('--min-qual=', args.min_qual),
        ('--score-factor=', args.score_factor),
        ('--qual-factor=', args.qual_factor),
        ('--min-sw-score=', args.min_sw_score),
        ('--rt-error=', args.rt_error),
        ('--subsample-rate=', args.subsample_rate[gem_group_key]),
    )
    cmd = ['vdj_asm', 'asm', fastq_pref, fasta_pref]
    cmd.extend([prefix + str(value) for prefix, value in valued_options])

    if not cr_chem.has_umis(args.chemistry_def):
        martian.log_info('Assembly without UMIs is not fully supported.')

    if not args.use_sw:
        cmd.append('--fast-align')

    # The UMI read cutoff is optional; only pass the flag when it was given.
    if args.min_readpairs_per_umi is not None:
        umi_cutoff = args.min_readpairs_per_umi[str(args.gem_group)]
        cmd.append('--min-umi-reads=' + str(umi_cutoff))

    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')
    subprocess.check_call(cmd, cwd=os.getcwd())
def main(args, outs):
    """Count confidently-mapped, deduplicated reads from one BAM chunk.

    Reads ``args.chunk_input`` grouped by query name, asks the reporter to
    classify each group, and accumulates the accepted (feature, barcode)
    pairs in a count matrix. The matrix is written to ``outs.matrices_h5``
    and the reporter state to ``outs.chunked_reporter``.
    """
    in_bam = tk_bam.create_bam_infile(args.chunk_input)

    # Library metadata carried by the BAM.
    libraries = rna_library.get_bam_library_info(in_bam)
    distinct_library_types = sorted({lib['library_type'] for lib in libraries})
    library_prefixes = [
        rna_library.get_library_type_metric_prefix(lib['library_type'])
        for lib in libraries
    ]

    chroms = in_bam.references

    # Without a whitelist, fall back to the barcodes detected upstream.
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    if barcode_whitelist:
        barcode_summary = None
    else:
        barcode_summary = cr_utils.load_barcode_tsv(args.barcodes_detected)

    # TODO: this is redundant
    genes_index_path = cr_utils.get_reference_genes_index(args.reference_path)
    gene_index = cr_reference.GeneIndex.load_pickle(genes_index_path)

    high_conf_mapq = cr_utils.get_high_conf_mapq(args.align)
    reporter = cr_report.Reporter(reference_path=args.reference_path,
                                  high_conf_mapq=high_conf_mapq,
                                  gene_index=gene_index,
                                  chroms=chroms,
                                  barcode_whitelist=barcode_whitelist,
                                  barcode_summary=barcode_summary,
                                  gem_groups=args.gem_groups,
                                  library_types=distinct_library_types)

    feature_ref = rna_feature_ref.from_transcriptome_and_csv(
        args.reference_path, args.feature_reference)

    barcode_seqs = (
        cr_utils.format_barcode_seqs(barcode_whitelist, args.gem_groups)
        if barcode_whitelist else barcode_summary)

    matrix = cr_matrix.CountMatrix.empty(feature_ref,
                                         barcode_seqs,
                                         dtype='int32')

    for _qname, reads_iter, _ in cr_utils.iter_by_qname(in_bam, None):
        counted, _genome, feature_id, barcode = reporter.count_genes_bam_cb(
            reads_iter,
            libraries,
            library_prefixes,
            use_umis=cr_chem.has_umis(args.chemistry_def))
        if not counted:
            continue
        matrix.add(feature_id, barcode)

    in_bam.close()

    reporter.store_reference_metadata(args.reference_path,
                                      cr_constants.REFERENCE_TYPE,
                                      cr_constants.REFERENCE_METRIC_PREFIX)

    matrix.save_h5_file(outs.matrices_h5)
    reporter.save(outs.chunked_reporter)
def main(args, outs):
    """Count confidently-mapped, deduplicated reads from one BAM chunk.

    Iterates over every read in ``args.chunk_input``, asks the reporter to
    classify it, and accumulates accepted (genome, gene, barcode) triples in
    per-genome gene-barcode matrices. The matrices are written to
    ``outs.matrices_h5`` and the reporter state to ``outs.chunked_reporter``.
    """
    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    chroms = in_bam.references

    # Without a whitelist, fall back to the barcode summary.
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    if barcode_whitelist:
        barcode_summary = None
    else:
        barcode_summary = cr_utils.load_barcode_summary(args.barcode_summary)

    genes_index_path = cr_utils.get_reference_genes_index(args.reference_path)
    gene_index = cr_reference.GeneIndex.load_pickle(genes_index_path)

    high_conf_mapq = cr_utils.get_high_conf_mapq(args.align)
    reporter = cr_report.Reporter(reference_path=args.reference_path,
                                  high_conf_mapq=high_conf_mapq,
                                  gene_index=gene_index,
                                  chroms=chroms,
                                  barcode_whitelist=barcode_whitelist,
                                  barcode_summary=barcode_summary,
                                  gem_groups=args.gem_groups)

    barcode_seqs = (
        cr_utils.format_barcode_seqs(barcode_whitelist, args.gem_groups)
        if barcode_whitelist else barcode_summary)

    # One matrix per reference genome, each restricted to that genome's genes.
    genomes = cr_utils.get_reference_genomes(args.reference_path)
    genes = cr_utils.split_genes_by_genomes(gene_index.get_genes(), genomes)
    matrices = cr_matrix.GeneBCMatrices(genomes, genes, barcode_seqs)

    for read in in_bam:
        counted, genome, gene_id, barcode = reporter.count_genes_bam_cb(
            read, use_umis=cr_chem.has_umis(args.chemistry_def))
        if not counted:
            continue
        matrices.add(genome, gene_id, barcode)

    in_bam.close()

    matrices.save_h5(outs.matrices_h5)
    reporter.save(outs.chunked_reporter)
def main(args, outs):
    """Extract RNA, barcode, sample-index and UMI reads from FASTQ chunks.

    For every input read (optionally downsampled) this writes the RNA read(s)
    to chunked FASTQ files with the sample index, raw barcode and raw UMI
    stored as tags in the FASTQ header, writes the trimmed-off sequence to a
    separate chunked unaligned BAM, counts barcodes, and feeds raw-read
    metrics to the reporter.

    Args:
        args: stage arguments. Attributes read here: chemistry_def, chunks,
            primers, read_chunks, reads_interleaved, barcode_whitelist,
            reads_per_file, initial_reads, subsample_rate, rna_read_length,
            gem_group, read_group, skip_metrics.
        outs: stage outputs. On entry reads, read2s, trimmed_seqs and
            barcode_counts are handed to the writers/counter; on exit
            bam_comments, reads, read2s, gem_groups, read_groups and
            trimmed_seqs are set, and chunked_reporter is written.
    """
    # Fixed seed so the random downsampling below is reproducible.
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    # read_defs and read_tags are parallel lists: entry i of read_tags is the
    # (sequence tag, quality tag) pair for read_defs[i], or None for RNA reads.
    read_defs = [
        rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def
    ]
    read_tags = [
        None,
        None,
        (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
        (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
        (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
    ]

    # Determine which trimmed sequences need to be retained
    trim_defs = compute_trim_defs(
        read_defs, read_tags,
        args.chemistry_def.get('retain_trimmed_suffix_read'))

    # De-duplicated, sorted bam_to_fastq entries of all trim definitions.
    outs.bam_comments = sorted(
        set([td.bam_to_fastq for td in trim_defs.itervalues()]))

    gem_groups = [chunk['gem_group'] for chunk in args.chunks]
    reporter = cr_report.Reporter(
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        primers=cr_utils.get_primers_from_dicts(args.primers),
        gem_groups=gem_groups)

    # Determine if barcode sequences need to be reverse complemented.
    # A dedicated reader is used for this and closed right after.
    bc_check_rc = FastqReader(args.read_chunks, bc_read_def,
                              args.reads_interleaved, None)
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_rc = infer_barcode_reverse_complement(barcode_whitelist,
                                                  bc_check_rc.in_iter)
    bc_check_rc.close()

    # Determine which read_iters need to retain trimmed sequence
    # (only one per read-type e.g., one per R1, one per R2, etc.)
    # The first reader (in the order rna, rna2, bc, si, umi) that uses a given
    # read type receives trim_defs; later readers of that type receive None.
    read_types_with_trim_def = set()
    rna_read_trim_defs = None
    rna_read2_trim_defs = None
    bc_read_trim_defs = None
    si_read_trim_defs = None
    umi_read_trim_defs = None

    if rna_read_def.read_type not in read_types_with_trim_def:
        rna_read_trim_defs = trim_defs
        read_types_with_trim_def.add(rna_read_def.read_type)
    if rna_read2_def.read_type not in read_types_with_trim_def:
        rna_read2_trim_defs = trim_defs
        read_types_with_trim_def.add(rna_read2_def.read_type)
    if bc_read_def.read_type not in read_types_with_trim_def:
        bc_read_trim_defs = trim_defs
        read_types_with_trim_def.add(bc_read_def.read_type)
    if si_read_def.read_type not in read_types_with_trim_def:
        si_read_trim_defs = trim_defs
        read_types_with_trim_def.add(si_read_def.read_type)
    if umi_read_def.read_type not in read_types_with_trim_def:
        umi_read_trim_defs = trim_defs
        read_types_with_trim_def.add(umi_read_def.read_type)

    # Setup read iterators.
    rna_reads = FastqReader(args.read_chunks, rna_read_def,
                            args.reads_interleaved, rna_read_trim_defs)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def,
                             args.reads_interleaved, rna_read2_trim_defs)
    bc_reads = FastqReader(args.read_chunks, bc_read_def,
                           args.reads_interleaved, bc_read_trim_defs)
    si_reads = FastqReader(args.read_chunks, si_read_def,
                           args.reads_interleaved, si_read_trim_defs)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def,
                                args.reads_interleaved, umi_read_trim_defs)
    else:
        # Placeholder reader with no inputs (presumably yields no reads, so
        # izip_longest below pads its slot with None -- confirm in FastqReader).
        umi_reads = FastqReader(None, None, False, None)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads)

    # Compute trim order of the readers; this is to ensure stability in the ordering
    # in which trimmed sequence is added to the TRIMMED_SEQ tags
    # NOTE(review): the argsort indices refer to the list filtered on
    # read_def, yet they index `extractions` below. That lines up only if any
    # reader with read_def None sits at the end of fastq_readers (presumably
    # just the no-UMI placeholder) -- confirm.
    trim_order = list(
        np.argsort([
            reader.read_def.read_type for reader in fastq_readers
            if reader.read_def is not None
        ]))

    read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    # Advance all readers in lockstep; exhausted readers are padded with None.
    all_read_iter = itertools.izip_longest(
        *[reader.in_iter for reader in fastq_readers])

    # Bam file to write auxiliary data to (that won't fit in a fastq hdr / QNAME)
    trimmed_seq_writer = ChunkedBamWriter(outs.trimmed_seqs,
                                          args.reads_per_file)

    # Stand-in (name, seq, qual) for a reader that produced nothing.
    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    # islice caps the number of reads examined; the cap is applied before
    # downsampling, so fewer than args.initial_reads may be kept.
    for extractions in itertools.islice(all_read_iter, args.initial_reads):
        # Downsample
        if random.random() > args.subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction = extractions

        rna_read = rna_extraction.read if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction.read if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction.read if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction.read if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction.read if umi_extraction is not None else EMPTY_READ

        # Extra trimming for internal purposes
        # (truncates sequence and quality of read 1 only)
        if args.rna_read_length is not None:
            rna_read = (rna_read[0], rna_read[1][0:args.rna_read_length],
                        rna_read[2][0:args.rna_read_length])

        # Accumulate trimmed sequence; ordering is by read-type (I1,I2,R1,R2)
        # to ensure stability
        trimmed_seq = ''
        trimmed_qual = ''
        for i in trim_order:
            if extractions[i] is None:
                continue
            trimmed_seq += extractions[i].trimmed_seq
            trimmed_qual += extractions[i].trimmed_qual

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            # (qualities are reversed to stay aligned with the bases)
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]),
                           bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        reporter.raw_fastq_cb(rna_read,
                              rna_read2,
                              bc_read,
                              si_read,
                              umi_read,
                              args.gem_group,
                              skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        fastq_header_str1 = fastq_header1.to_string()

        read1_writer.write((fastq_header_str1, rna_read[1], rna_read[2]))

        # Write trimmed sequence data to a separate, unaligned BAM file
        # Note: We assume that there is only one trimmed sequence per read-pair
        trimmed_seq_data = pysam.AlignedSegment()
        # Query name is the first word of the augmented header.
        trimmed_seq_data.query_name = fastq_header_str1.split(
            AugmentedFastqHeader.WORD_SEP)[0]
        # SAM flag 0x4: segment unmapped.
        trimmed_seq_data.flag = 4
        trimmed_seq_data.seq = trimmed_seq
        trimmed_seq_data.qual = trimmed_qual
        trimmed_seq_writer.write(trimmed_seq_data)

        if paired_end:
            # Read 2 gets the same tags as read 1.
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG,
                                  si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG,
                                  bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

            read2_writer.write(
                (fastq_header2.to_string(), rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    bc_counter.close()
    trimmed_seq_writer.close()

    # Set stage output parameters.
    # gem_groups and read_groups are repeated once per output read file so
    # that all output lists stay parallel (checked by the asserts below).
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()
        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []
        outs.gem_groups = [args.gem_group] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
        outs.trimmed_seqs = trimmed_seq_writer.get_out_paths()
    else:
        outs.reads = []
        outs.read2s = []
        outs.gem_groups = []
        outs.read_groups = []
        outs.trimmed_seqs = []

    assert len(outs.gem_groups) == len(outs.reads)
    if paired_end:
        assert len(outs.reads) == len(outs.read2s)
    assert len(outs.trimmed_seqs) == len(outs.reads)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)
def main(args, outs):
    """Extract RNA, barcode, sample-index, UMI and feature reads from FASTQs.

    For every input read (optionally downsampled, and skipping reads whose
    RNA sequence is empty) this writes the RNA read(s) to chunked FASTQ
    files, records sample index / barcode / UMI / feature-barcode tags either
    in the FASTQ headers (``args.augment_fastq``) or in a separate tag FASTQ,
    counts barcodes and single-hit feature barcodes, and feeds raw-read
    metrics to the reporter.

    Args:
        args: stage arguments. Attributes read here: chemistry_def,
            reference_path, feature_reference, library_type, library_info,
            library_id, primers, read_chunks, reads_interleaved,
            barcode_whitelist, r1_length, r2_length, reads_per_file,
            augment_fastq, chunk_initial_reads, chunk_subsample_rate,
            skip_metrics, read_group.
        outs: stage outputs. On entry reads, read2s, tags, barcode_counts and
            feature_counts are used as output locations; on exit
            bam_comments, reads, read2s, tags, gem_groups, library_types,
            library_ids and read_groups are set, and chunked_reporter is
            written.
    """
    # Fixed seed so the random downsampling below is reproducible.
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Build the feature reference
    if args.reference_path:
        feature_ref = rna_feature_ref.from_transcriptome_and_csv(
            args.reference_path, args.feature_reference)
    else:
        feature_ref = rna_feature_ref.FeatureReference.empty()

    # Setup feature barcode extraction
    # (restricted to the feature type matching this library's type)
    feature_extractor = rna_feature_ref.FeatureExtractor(
        feature_ref, use_feature_types=[args.library_type])

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    # read_defs and read_tags are parallel lists: entry i of read_tags is the
    # (sequence tag, quality tag) pair for read_defs[i], or None for RNA reads.
    read_defs = [
        rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def
    ]
    read_tags = [
        None,
        None,
        (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
        (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
        (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
    ]

    # Determine which trimmed sequences need to be retained for bamtofastq
    trim_defs = get_bamtofastq_defs(read_defs, read_tags)
    outs.bam_comments = sorted(set(trim_defs.itervalues()))

    num_libraries = len(args.library_info)
    reporter = cr_report.Reporter(
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        primers=cr_utils.get_primers_from_dicts(args.primers),
        num_libraries=num_libraries)

    # Determine if barcode sequences need to be reverse complemented.
    # A dedicated reader is used for this; the with-block closes it.
    with FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved,
                     None, None) as bc_check_rc:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist, True)
        barcode_rc = infer_barcode_reverse_complement(barcode_whitelist,
                                                      bc_check_rc.in_iter)

    # Log the untrimmed read lengths to stdout
    # A ReadDef with offset 0 and length None is used for this (presumably
    # selecting the whole read -- confirm against cr_constants.ReadDef).
    r1_read_def = cr_constants.ReadDef(rna_read_def.read_type, 0, None)
    r1_reader = FastqReader(args.read_chunks, r1_read_def,
                            args.reads_interleaved, None, None)

    # Longest read among the first DETECT_CHEMISTRY_INITIAL_READS reads.
    r1_untrimmed_len = 0
    for read in itertools.islice(r1_reader.in_iter,
                                 cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
        r1_untrimmed_len = max(r1_untrimmed_len, len(read[1]))
    print "Read 1 untrimmed length = ", r1_untrimmed_len
    print "Input arg r1_length = ", args.r1_length
    r1_reader.close()

    if paired_end:
        r2_read_def = cr_constants.ReadDef(rna_read2_def.read_type, 0, None)
        r2_reader = FastqReader(args.read_chunks, r2_read_def,
                                args.reads_interleaved, None, None)

        r2_untrimmed_len = 0
        for read in itertools.islice(
                r2_reader.in_iter,
                cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
            r2_untrimmed_len = max(r2_untrimmed_len, len(read[1]))
        print "Read 2 untrimmed length = ", r2_untrimmed_len
        print "Input arg r2_length = ", args.r2_length
        r2_reader.close()

    # Setup read iterators.
    r1_length = args.r1_length
    r2_length = args.r2_length

    rna_reads = FastqReader(args.read_chunks, rna_read_def,
                            args.reads_interleaved, r1_length, r2_length)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def,
                             args.reads_interleaved, r1_length, r2_length)
    bc_reads = FastqReader(args.read_chunks, bc_read_def,
                           args.reads_interleaved, r1_length, r2_length)
    si_reads = FastqReader(args.read_chunks, si_read_def,
                           args.reads_interleaved, r1_length, r2_length)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def,
                                args.reads_interleaved, r1_length, r2_length)
    else:
        # Placeholder reader with no inputs (presumably yields no reads, so
        # izip_longest below pads its slot with None -- confirm in FastqReader).
        umi_reads = FastqReader(None, None, False, r1_length, r2_length)

    # Record feature counts:
    # one counter per feature in the reference, indexed by feature index.
    feature_counts = np.zeros(feature_ref.get_num_features(), dtype=int)

    # If this library type has no feature barcodes, make the reader a NOOP
    if feature_extractor.has_features_to_extract():
        feature_reads = FastqFeatureReader(args.read_chunks, feature_extractor,
                                           args.reads_interleaved, r1_length,
                                           r2_length)
    else:
        feature_reads = FastqReader(None, None, None, r1_length, r2_length)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads,
                     feature_reads)

    read1_writer = ChunkedFastqWriter(outs.reads,
                                      args.reads_per_file,
                                      compression=COMPRESSION)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s,
                                          args.reads_per_file,
                                          compression=COMPRESSION)

    # When headers are not augmented, tags go to their own FASTQ instead.
    tag_writer = None
    if not args.augment_fastq:
        tag_writer = ChunkedFastqWriter(outs.tags,
                                        args.reads_per_file,
                                        compression=COMPRESSION)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    # Advance all readers in lockstep; exhausted readers are padded with None.
    all_read_iter = itertools.izip_longest(
        *[reader.in_iter for reader in fastq_readers])

    # Stand-in (name, seq, qual) for a reader that produced nothing.
    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    # islice caps the number of reads examined; the cap is applied before
    # downsampling, so fewer than args.chunk_initial_reads may be kept.
    for extractions in itertools.islice(all_read_iter,
                                        args.chunk_initial_reads):
        # Downsample
        if random.random() > args.chunk_subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction, feature_extraction = extractions

        rna_read = rna_extraction if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction if umi_extraction is not None else EMPTY_READ

        if (not rna_read[1]) or (paired_end and (not rna_read2[1])):
            # Read 1 is empty or read 2 is empty (if paired_end)
            # Empty reads causes issue with STAR aligner, so eliminate
            # them here
            continue

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            # (qualities are reversed to stay aligned with the bases)
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]),
                           bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        # NOTE(review): lib_idx depends only on args, yet it is recomputed for
        # every read; it raises IndexError if args.library_id is not present
        # in args.library_info.
        lib_idx = [
            i for i, x in enumerate(args.library_info)
            if x['library_id'] == args.library_id
        ][0]
        reporter.raw_fastq_cb(rna_read,
                              rna_read2,
                              bc_read,
                              si_read,
                              umi_read,
                              lib_idx,
                              skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        # Feature-barcode values for this read; they stay None when nothing
        # was extracted, in which case the matching tags are not set.
        feat_raw_bc = None
        feat_proc_bc = None
        feat_qual = None
        feat_ids = None

        if feature_extraction:
            if feature_extraction.barcode:
                feat_raw_bc = feature_extraction.barcode
                feat_qual = feature_extraction.qual

            if len(feature_extraction.ids) > 0:
                feat_proc_bc = feature_extraction.barcode
                # Multiple matching feature IDs are joined with ';'.
                feat_ids = ';'.join(feature_extraction.ids)

                # If hit a single feature ID, count its frequency
                if len(feature_extraction.ids) == 1:
                    feature_counts[feature_extraction.indices[0]] += 1

        if feat_raw_bc:
            fastq_header1.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG,
                                  feat_raw_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG,
                                  feat_qual)
        if feat_ids:
            fastq_header1.set_tag(cr_constants.PROCESSED_FEATURE_BARCODE_TAG,
                                  feat_proc_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

        if args.augment_fastq:
            read1_writer.write(
                (fastq_header1.to_string(), rna_read[1], rna_read[2]))
        else:
            # Keep the original read name in the read FASTQ and put the
            # tag-carrying header in the tag FASTQ with empty seq/qual.
            read1_writer.write((rna_read[0], rna_read[1], rna_read[2]))
            tag_writer.write((fastq_header1.to_string(), '', ''))

        if paired_end:
            # Read 2 gets the same tags as read 1. When headers are not
            # augmented, no separate tag record is written for read 2.
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG,
                                  si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG,
                                  bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

            if feat_raw_bc:
                fastq_header2.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG,
                                      feat_raw_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG,
                                      feat_qual)
            if feat_ids:
                fastq_header2.set_tag(
                    cr_constants.PROCESSED_FEATURE_BARCODE_TAG, feat_proc_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

            if args.augment_fastq:
                read2_writer.write(
                    (fastq_header2.to_string(), rna_read2[1], rna_read2[2]))
            else:
                read2_writer.write((rna_read2[0], rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    # NOTE(review): feature_reads is never closed here -- confirm whether
    # FastqFeatureReader holds open files that need closing.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    if not args.augment_fastq:
        tag_writer.close()
    bc_counter.close()

    # Write feature BC read counts
    with open(outs.feature_counts, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(list(feature_counts)), f)

    # Set stage output parameters.
    # Per-library values are repeated once per output read file so that all
    # output lists stay parallel (checked by the asserts below).
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()

        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []

        if args.augment_fastq:
            outs.tags = []
        else:
            # NOTE(review): the read2 writer is given len(outs.reads) here,
            # whereas this passes len(outs.tags) (the pre-existing value of
            # outs.tags) -- confirm this is intended.
            outs.tags = tag_writer.get_out_paths(len(outs.tags))

        libraries = args.library_info
        library = [
            li for li in libraries if li['library_id'] == args.library_id
        ][0]

        outs.gem_groups = [library['gem_group']] * len(outs.reads)
        outs.library_types = [library['library_type']] * len(outs.reads)
        outs.library_ids = [library['library_id']] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
    else:
        outs.reads = []
        outs.read2s = []
        outs.tags = []
        outs.gem_groups = []
        outs.library_types = []
        outs.library_ids = []
        outs.read_groups = []

    assert len(outs.gem_groups) == len(outs.reads)
    assert args.augment_fastq or len(outs.tags) == len(outs.reads)

    if paired_end:
        assert len(outs.reads) == len(outs.read2s)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)
def main(args, outs):
    """Extract RNA, barcode, sample-index and UMI reads from FASTQ chunks.

    For every input read (optionally downsampled, and skipping reads whose
    RNA sequence is empty) this writes the RNA read(s) to chunked FASTQ files
    with the sample index, raw barcode and raw UMI stored as tags in the
    FASTQ header, counts barcodes, and feeds raw-read metrics to the
    reporter.

    Args:
        args: stage arguments. Attributes read here: chemistry_def, chunks,
            primers, read_chunks, reads_interleaved, barcode_whitelist,
            r1_length, r2_length, reads_per_file, initial_reads,
            subsample_rate, gem_group, read_group, skip_metrics.
        outs: stage outputs. On entry reads, read2s and barcode_counts are
            handed to the writers/counter; on exit bam_comments, reads,
            read2s, gem_groups and read_groups are set, and chunked_reporter
            is written.
    """
    # Fixed seed so the random downsampling below is reproducible.
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    # read_defs and read_tags are parallel lists: entry i of read_tags is the
    # (sequence tag, quality tag) pair for read_defs[i], or None for RNA reads.
    read_defs = [rna_read_def, rna_read2_def,
                 bc_read_def, si_read_def, umi_read_def]
    read_tags = [None, None,
                 (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
                 (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
                 (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
                 ]

    # Determine which trimmed sequences need to be retained for bamtofastq
    trim_defs = get_bamtofastq_defs(read_defs, read_tags)
    outs.bam_comments = sorted(set(trim_defs.itervalues()))

    gem_groups = [chunk['gem_group'] for chunk in args.chunks]
    reporter = cr_report.Reporter(umi_length=cr_chem.get_umi_length(args.chemistry_def),
                                  primers=cr_utils.get_primers_from_dicts(args.primers),
                                  gem_groups=gem_groups)

    # Determine if barcode sequences need to be reverse complemented.
    # A dedicated reader is used for this and closed right after.
    bc_check_rc = FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved, None, None)
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_rc = infer_barcode_reverse_complement(barcode_whitelist, bc_check_rc.in_iter)
    bc_check_rc.close()

    # Log the untrimmed read lengths to stdout
    # A ReadDef with offset 0 and length None is used for this (presumably
    # selecting the whole read -- confirm against cr_constants.ReadDef).
    r1_read_def = cr_constants.ReadDef(rna_read_def.read_type, 0, None)
    r1_reader = FastqReader(args.read_chunks, r1_read_def, args.reads_interleaved, None, None)

    # Longest read among the first DETECT_CHEMISTRY_INITIAL_READS reads.
    r1_untrimmed_len = 0
    for read in itertools.islice(r1_reader.in_iter, cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
        r1_untrimmed_len = max(r1_untrimmed_len, len(read[1]))
    print "Read 1 untrimmed length = ", r1_untrimmed_len
    print "Input arg r1_length = ", args.r1_length
    r1_reader.close()

    if paired_end:
        r2_read_def = cr_constants.ReadDef(rna_read2_def.read_type, 0, None)
        r2_reader = FastqReader(args.read_chunks, r2_read_def, args.reads_interleaved, None, None)

        r2_untrimmed_len = 0
        for read in itertools.islice(r2_reader.in_iter, cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
            r2_untrimmed_len = max(r2_untrimmed_len, len(read[1]))
        print "Read 2 untrimmed length = ", r2_untrimmed_len
        print "Input arg r2_length = ", args.r2_length
        r2_reader.close()

    # Setup read iterators.
    r1_length = args.r1_length
    r2_length = args.r2_length

    rna_reads = FastqReader(args.read_chunks, rna_read_def, args.reads_interleaved, r1_length, r2_length)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def, args.reads_interleaved, r1_length, r2_length)
    bc_reads = FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved, r1_length, r2_length)
    si_reads = FastqReader(args.read_chunks, si_read_def, args.reads_interleaved, r1_length, r2_length)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def, args.reads_interleaved, r1_length, r2_length)
    else:
        # Placeholder reader with no inputs (presumably yields no reads, so
        # izip_longest below pads its slot with None -- confirm in FastqReader).
        umi_reads = FastqReader(None, None, False, r1_length, r2_length)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads)

    read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file, compression=COMPRESSION)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file, compression=COMPRESSION)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    # Advance all readers in lockstep; exhausted readers are padded with None.
    all_read_iter = itertools.izip_longest(*[reader.in_iter for reader in fastq_readers])

    # Stand-in (name, seq, qual) for a reader that produced nothing.
    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    # islice caps the number of reads examined; the cap is applied before
    # downsampling, so fewer than args.initial_reads may be kept.
    for extractions in itertools.islice(all_read_iter, args.initial_reads):
        # Downsample
        if random.random() > args.subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction = extractions

        rna_read = rna_extraction if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction if umi_extraction is not None else EMPTY_READ

        if (not rna_read[1]) or (paired_end and (not rna_read2[1])):
            # Read 1 is empty or read 2 is empty (if paired_end)
            # Empty reads causes issue with STAR aligner, so eliminate
            # them here
            continue

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            # (qualities are reversed to stay aligned with the bases)
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]), bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        reporter.raw_fastq_cb(rna_read, rna_read2, bc_read, si_read, umi_read, args.gem_group, skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        fastq_header_str1 = fastq_header1.to_string()

        read1_writer.write((fastq_header_str1, rna_read[1], rna_read[2]))

        if paired_end:
            # Read 2 gets the same tags as read 1.
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

            read2_writer.write((fastq_header2.to_string(), rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    bc_counter.close()

    # Set stage output parameters.
    # gem_groups and read_groups are repeated once per output read file so
    # that all output lists stay parallel (checked by the asserts below).
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()
        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []
        outs.gem_groups = [args.gem_group] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
    else:
        outs.reads = []
        outs.read2s = []
        outs.gem_groups = []
        outs.read_groups = []

    assert len(outs.gem_groups) == len(outs.reads)
    if paired_end:
        assert len(outs.reads) == len(outs.read2s)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)