def __init__(self, in_filenames, extractor, reads_interleaved, r1_length, r2_length):
    """Open the FASTQ file(s) for the extractor's read types and build the read iterator.

    When no usable FASTQ is found, ``self.in_fastqs`` stays ``None`` and
    ``self.in_iter`` stays an empty iterator.

    Args:
        in_filenames (dict of str -> str): Map of paths to fastq files
        extractor (FeatureExtractor): for extracting feature barcodes
        reads_interleaved: truthy when a single FASTQ should be opened and
            read as interleaved; forwarded to get_feature_generator_fastq.
        r1_length: forwarded unchanged to get_feature_generator_fastq.
        r2_length: forwarded unchanged to get_feature_generator_fastq.
    """
    self.in_fastqs = None
    self.in_iter = iter([])

    # Relevant read types
    read_types = extractor.get_read_types()

    if not in_filenames:
        return

    fastq_paths = get_fastqs_from_feature_ref(in_filenames, reads_interleaved, read_types)
    if fastq_paths == (None, None):
        return

    if reads_interleaved:
        # Only one file is opened in interleaved mode: whichever path is set.
        filename = fastq_paths[0] if fastq_paths[0] else fastq_paths[1]
        # Test the path itself; the previous `filename[0]` looked at its
        # first character and raised on an empty or missing path.
        self.in_fastqs = (cr_io.open_maybe_gzip(filename, 'r') if filename else None,
                          None)
    else:
        self.in_fastqs = (
            cr_io.open_maybe_gzip(fastq_paths[0], 'r') if fastq_paths[0] else None,
            cr_io.open_maybe_gzip(fastq_paths[1], 'r') if fastq_paths[1] else None,
        )

    self.in_iter = get_feature_generator_fastq(files=self.in_fastqs,
                                               extractor=extractor,
                                               interleaved=reads_interleaved,
                                               read_types=read_types,
                                               r1_length=r1_length,
                                               r2_length=r2_length)
def main(args, outs):
    """Run merge_by_barcode over ``args.fastqs`` into this chunk's output files.

    Sets ``outs.read1s`` (always) and ``outs.read2s`` (a path when the
    chemistry is paired-end, otherwise ``None``) and writes barcodes to
    ``outs.chunk_barcodes``. All opened outputs are closed even if the merge
    raises.
    """
    outs.coerce_strings()

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Register each handle as soon as it is opened so the finally block can
    # release it even when a later open or the merge itself fails.
    open_handles = []
    try:
        outs.read1s = martian.make_path('reads_1.fastq' + h5_constants.LZ4_SUFFIX)
        r1_fq_out = cr_io.open_maybe_gzip(outs.read1s, 'w')
        open_handles.append(r1_fq_out)

        if paired_end:
            outs.read2s = martian.make_path('reads_2.fastq' + h5_constants.LZ4_SUFFIX)
            r2_fq_out = cr_io.open_maybe_gzip(outs.read2s, 'w')
            open_handles.append(r2_fq_out)
        else:
            outs.read2s = None
            r2_fq_out = None

        barcodes_out = cr_io.open_maybe_gzip(outs.chunk_barcodes, 'w')
        open_handles.append(barcodes_out)

        merge_by_barcode(args.fastqs, r1_fq_out, r2_fq_out, barcodes_out, paired_end)
    finally:
        for handle in open_handles:
            handle.close()
def create_unaligned_bam(args, outs):
    """Write every read of this chunk to ``outs.genome_output`` as an unmapped BAM record.

    The BAM header is assembled from the STAR reference's chrNameLength.txt
    (@SQ lines) and ``args.read_groups`` (@RG lines). When a second read
    chunk is given, R1 and R2 records are written alternately.
    """
    star_ref_path = cr_utils.get_reference_star_path(args.reference_path)

    header_buf = cStringIO.StringIO()
    header_buf.write('@HD\tVN:1.4\n')

    # SQ header lines
    chrom_sizes_path = os.path.join(star_ref_path, 'chrNameLength.txt')
    with open(chrom_sizes_path) as sizes:
        for entry in sizes:
            chr_name, chr_len = entry.strip().split('\t')
            header_buf.write('@SQ\tSN:{}\tLN:{}\n'.format(chr_name, chr_len))

    # RG header lines
    for packed_rg in args.read_groups:
        rg_line = tk_bam.make_rg_header(packed_rg)
        # Turn literal backslash-t sequences into real tab characters.
        header_buf.write(re.sub('\\\\t', '\t', rg_line) + '\n')

    # Get read group ID for this chunk of reads
    read_group = args.read_group

    # pysam doesn't support reading SAM from a StringIO object
    with open('tmphdr', 'w') as hdr:
        hdr.write(header_buf.getvalue())
    samfile = pysam.AlignmentFile('tmphdr', 'r', check_sq=False)

    outbam = pysam.AlignmentFile(outs.genome_output, 'wb', template=samfile)

    fastq_file1 = cr_io.open_maybe_gzip(args.read_chunk)
    fastq_file2 = None
    if args.read2_chunk:
        fastq_file2 = cr_io.open_maybe_gzip(args.read2_chunk)

    read1s = tk_fasta.read_generator_fastq(fastq_file1)
    read2s = []
    if fastq_file2:
        read2s = tk_fasta.read_generator_fastq(fastq_file2)

    # A single record object is reused for every write.
    record = pysam.AlignedSegment()
    record.flag = 4  # SAM flag 0x4: segment unmapped

    def write_unaligned(read):
        # Fill the shared record from one (name, seq, qual) read and emit it.
        name, seq, qual = read
        record.query_name = name.split(' ')[0]
        record.query_sequence = seq
        # Qualities are assigned after the sequence, as in the original order.
        record.query_qualities = tk_fasta.get_qvs(qual)
        record.set_tag('RG', read_group, 'Z')
        outbam.write(record)

    for read1, read2 in itertools.izip_longest(read1s, read2s):
        write_unaligned(read1)
        if read2:
            write_unaligned(read2)

    samfile.close()
    fastq_file1.close()
    if fastq_file2 is not None:
        fastq_file2.close()
    outbam.close()
def load_barcode_translate_map(bc_whitelist):
    """Guide BC to Cell BC translate.

    Looks for ``<bc_whitelist>.txt`` and then ``<bc_whitelist>.txt.gz`` under
    cr_constants.BARCODE_WHITELIST_TRANSLATE_PATH.

    Args:
        bc_whitelist: name of the barcode whitelist, or None.

    Returns:
        dict mapping the first whitespace-separated column of each line to
        the second, or None when ``bc_whitelist`` is None or no translation
        file exists. Lines starting with '#' and blank lines are skipped.
    """
    if bc_whitelist is None:
        return None

    file_path = None
    for extension in ['.txt', '.txt.gz']:
        candidate = os.path.join(cr_constants.BARCODE_WHITELIST_TRANSLATE_PATH,
                                 bc_whitelist + extension)
        if os.path.exists(candidate):
            file_path = candidate
            break

    if file_path is None:
        return None

    translate_map = {}
    # Use a context manager so the handle is closed once the map is built.
    with cr_io.open_maybe_gzip(file_path, 'r') as translate_file:
        for line in translate_file:
            if line.startswith('#'):
                continue
            bcs = line.strip().split()
            if not bcs:
                # Blank line (e.g. a trailing newline): nothing to map.
                continue
            translate_map[bcs[0]] = bcs[1]
    return translate_map
def _compute_r1_length(fastqs, reads_interleaved): """ Infer the length of R1 """ num_reads = 0 r1_max_len = 0 def get_r1_noninterleaved(read_iter): for _, seq, _ in read_iter: yield seq def get_r1_interleaved(read_iter): for _, seq, _, _, _, _ in read_iter: yield seq get_r1 = get_r1_interleaved if reads_interleaved else get_r1_noninterleaved for fastq in fastqs: with cr_io.open_maybe_gzip(fastq, 'r') as fq_file: reads = tk_fasta.read_generator_fastq(fq_file, reads_interleaved) for r1 in get_r1(reads): if num_reads == cr_constants.DETECT_CHEMISTRY_INITIAL_READS: break r1_max_len = max(len(r1), r1_max_len) num_reads += 1 if num_reads == cr_constants.DETECT_CHEMISTRY_INITIAL_READS: break return r1_max_len
def load_barcode_tsv(filename, as_set=False):
    """Load barcodes, one per line, from a (possibly compressed) text file.

    Any line containing '#' is ignored; remaining lines are stripped of
    surrounding whitespace.

    Args:
        filename: path handed to cr_io.open_maybe_gzip.
        as_set: when true, return a set instead of a list.

    Returns:
        list of barcodes in file order, or a set of them when ``as_set``.

    Raises:
        Exception: if the same barcode appears more than once.
    """
    # Context manager so the handle is released as soon as the list is built.
    with cr_io.open_maybe_gzip(filename, 'r') as barcode_file:
        barcodes = [x.strip() for x in barcode_file if '#' not in x]

    barcode_set = set(barcodes)
    if len(barcodes) != len(barcode_set):
        raise Exception('Duplicates found in barcode whitelist: %s' % filename)
    return barcode_set if as_set else barcodes
def save_features_tsv(feature_ref, base_dir, compress):
    """Save a FeatureReference to a tsv file.

    Writes one tab-separated line (id, name, feature_type) per feature
    definition to ``<base_dir>/features.tsv``, with '.gz' appended to the
    filename when ``compress`` is truthy.
    """
    suffix = '.gz' if compress else ''
    out_features_fn = os.path.join(base_dir, 'features.tsv') + suffix

    with cr_io.open_maybe_gzip(out_features_fn, 'w') as out:
        for fdef in feature_ref.feature_defs:
            columns = (fdef.id, fdef.name, fdef.feature_type)
            out.write('\t'.join(columns) + '\n')
def __init__(self, in_filenames, read_def, reads_interleaved, r1_length, r2_length):
    """Open the FASTQ matching ``read_def`` and build the read iterator.

    If no matching file is found, ``self.in_fastq`` stays ``None`` and
    ``self.in_iter`` stays an empty iterator.

    Args:
        in_filenames - Map of paths to fastq files
        read_def - ReadDef
        reads_interleaved, r1_length, r2_length - forwarded unchanged to
            get_read_generator_fastq
    """
    self.in_fastq = None
    self.in_iter = iter([])
    self.read_def = read_def

    if not in_filenames:
        return

    fastq_path = get_fastq_from_read_type(in_filenames, read_def, reads_interleaved)
    if not fastq_path:
        return

    self.in_fastq = cr_io.open_maybe_gzip(fastq_path, 'r')
    self.in_iter = get_read_generator_fastq(self.in_fastq,
                                            read_def=read_def,
                                            reads_interleaved=reads_interleaved,
                                            r1_length=r1_length,
                                            r2_length=r2_length)
def save_mex(self, base_dir, save_features_func, metadata=None, compress=True):
    """Save in Matrix Market Exchange format.

    Writes ``matrix.mtx`` and ``barcodes.tsv`` into ``base_dir`` and
    delegates the features file to ``save_features_func``.

    Args:
        base_dir (str): Path to directory to write files in.
        save_features_func (func): Func that takes (FeatureReference, base_dir, compress) and writes
                                   a file describing the features.
        metadata (dict): Optional metadata to encode into the comments as JSON.
                         The caller's dict is not modified.
        compress (bool): When true, '.gz' is appended to the matrix and barcodes
                         filenames; the flag is also passed to save_features_func.
    """
    self.tocoo()

    cr_io.makedirs(base_dir, allow_existing=True)

    out_matrix_fn = os.path.join(base_dir, 'matrix.mtx')
    out_barcodes_fn = os.path.join(base_dir, 'barcodes.tsv')
    if compress:
        out_matrix_fn += '.gz'
        out_barcodes_fn += '.gz'

    # This method only supports an integer matrix.
    assert self.m.dtype in ['uint32', 'int32', 'uint64', 'int64']
    assert type(self.m) == sp_sparse.coo.coo_matrix

    rows, cols = self.m.shape
    # Header fields in the file
    rep = 'coordinate'
    field = 'integer'
    symmetry = 'general'

    # Work on a copy: adding 'format_version' must not leak back into the
    # dict the caller passed in.
    metadata = metadata.copy() if metadata else {}
    metadata['format_version'] = MATRIX_H5_VERSION

    metadata_str = json.dumps(metadata)
    comment = 'metadata_json: %s' % metadata_str

    with cr_io.open_maybe_gzip(out_matrix_fn, 'w') as stream:
        # write initial header line
        stream.write(np.compat.asbytes('%%MatrixMarket matrix {0} {1} {2}\n'.format(rep, field, symmetry)))

        # write comments
        for line in comment.split('\n'):
            stream.write(np.compat.asbytes('%%%s\n' % (line)))

        # write shape spec
        stream.write(np.compat.asbytes('%i %i %i\n' % (rows, cols, self.m.nnz)))

        # write row, col, val in 1-based indexing
        for r, c, d in itertools.izip(self.m.row + 1, self.m.col + 1, self.m.data):
            stream.write(np.compat.asbytes(("%i %i %i\n" % (r, c, d))))

    # both GEX and ATAC provide an implementation of this in respective feature_ref.py
    save_features_func(self.feature_ref, base_dir, compress=compress)

    with cr_io.open_maybe_gzip(out_barcodes_fn, 'w') as f:
        for bc in self.bcs:
            f.write(bc + '\n')
def open_file(self, filename):
    """Open ``filename`` for writing via cr_io.open_maybe_gzip and return the handle."""
    handle = cr_io.open_maybe_gzip(filename, 'w')
    return handle
def main(args, outs):
    """Split this chunk's barcoded reads into per-bucket, barcode-sorted FASTQ files.

    Each read header is tagged with its processed barcode and the read is
    appended to an in-memory bucket chosen by get_bucket_name. Reads with an
    empty barcode line, or with R1 (or R2, when present) shorter than
    MIN_READ_LENGTH, are dropped. Non-empty buckets are sorted with
    vdj_utils.fastq_barcode_sort_key and written out; ``outs.buckets`` maps
    each written bucket name to its file path.
    """
    # Martian coerces dict keys to string
    # Coerce keys back to int
    args.chunks_per_gem_group = {int(k): v for k, v in args.chunks_per_gem_group.iteritems()}

    paired_end = args.read2s_chunk is not None

    buckets = {}
    bucket_filenames = {}

    # Track inputs as they are opened so they are always closed, including
    # when bucketing fails part-way through.
    input_handles = []
    try:
        # Lazy load R1
        r1_file = cr_io.open_maybe_gzip(args.read1s_chunk)
        input_handles.append(r1_file)
        read1s = tk_fasta.read_generator_fastq(r1_file)

        # Lazy load R2
        if paired_end:
            r2_file = cr_io.open_maybe_gzip(args.read2s_chunk)
            input_handles.append(r2_file)
            read2s = tk_fasta.read_generator_fastq(r2_file)
        else:
            read2s = []

        # Lazy load corrected BCs
        bc_file = cr_io.open_maybe_gzip(args.bcs)
        input_handles.append(bc_file)
        bcs = (line.strip() for line in bc_file)

        for _, bucket_name in enumerate_bucket_names(args.chunks_per_gem_group):
            bucket_filenames[bucket_name] = martian.make_path("%s.fastq" % bucket_name)
            buckets[bucket_name] = []

        for read1, read2, barcode in itertools.izip_longest(read1s, read2s, bcs):
            # Exclude unbarcoded reads
            if barcode == '':
                continue

            # Exclude short reads
            if len(read1[1]) < MIN_READ_LENGTH or (read2 is not None and len(read2[1]) < MIN_READ_LENGTH):
                continue

            # Attach processed barcode to reads
            r1_hdr = cr_fastq.AugmentedFastqHeader(read1[0])
            r1_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
            r1_new_qname = r1_hdr.to_string()

            if paired_end:
                r2_hdr = cr_fastq.AugmentedFastqHeader(read2[0])
                r2_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
                r2_new_qname = r2_hdr.to_string()

            barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
            bucket_name = get_bucket_name(gem_group, barcode_seq, args.chunks_per_gem_group[gem_group])

            # R1 and R2 of a pair are stored as consecutive entries.
            buckets[bucket_name].append((r1_new_qname, read1[1], read1[2]))
            if paired_end:
                buckets[bucket_name].append((r2_new_qname, read2[1], read2[2]))
    finally:
        for handle in input_handles:
            handle.close()

    outs.buckets = {}

    # Sort and write each bucket
    for bucket_name, bucket in buckets.iteritems():
        bucket.sort(key=vdj_utils.fastq_barcode_sort_key)

        # Don't create empty bucket files.
        # This is common when the reads are ordered by gem group
        # And a chunk sees only a single gem group.
        if len(bucket) == 0:
            continue

        filename = bucket_filenames[bucket_name]
        with cr_io.open_maybe_gzip(filename, 'w') as f:
            for read in bucket:
                tk_fasta.write_read_fastq(f, *read)

        outs.buckets[bucket_name] = bucket_filenames[bucket_name]
def main(args, outs):
    """Correct raw barcodes for one chunk of reads and write one barcode per read.

    For each of the first ``args.initial_reads`` read pairs, the raw barcode
    and its quality string are read from the R1 header tags. A barcode that
    is not in the whitelist goes through cr_stats.correct_bc_error;
    otherwise it is accepted as-is unless it contains 'N'. Accepted barcodes
    are counted, formatted with the gem group and written to
    ``outs.corrected_bcs``; reads without a usable barcode get an empty line.
    """
    # Load barcode whitelist
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        # Bind the name in the no-whitelist case too; it was previously left
        # unbound and the load_barcode_dist call below raised.
        # NOTE(review): assumes load_barcode_dist accepts a None whitelist - confirm.
        barcode_whitelist = None
        barcode_whitelist_set = None

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group,
                                              args.library_type)

    # Track handles as they are opened so they are closed on every exit path.
    open_handles = []
    try:
        in_read1_fastq = cr_io.open_maybe_gzip(args.read1_chunk)
        open_handles.append(in_read1_fastq)

        if args.read2_chunk:
            in_read2_fastq = cr_io.open_maybe_gzip(args.read2_chunk)
            open_handles.append(in_read2_fastq)
        else:
            # No R2 chunk: hand the FASTQ reader an empty source.
            in_read2_fastq = []

        outs.corrected_bcs += h5_constants.LZ4_SUFFIX
        out_file = cr_io.open_maybe_gzip(outs.corrected_bcs, 'w')
        open_handles.append(out_file)

        bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist, outs.corrected_barcode_counts)

        # Correct barcodes, add processed bc tag to fastq
        read_pair_iter = itertools.izip_longest(tk_fasta.read_generator_fastq(in_read1_fastq),
                                                tk_fasta.read_generator_fastq(in_read2_fastq))
        for read1, read2 in itertools.islice(read_pair_iter, args.initial_reads):
            read1_header = cr_fastq.AugmentedFastqHeader(read1[0])

            raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
            bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

            processed_bc = None

            if raw_bc:
                if barcode_whitelist_set is not None and raw_bc not in barcode_whitelist_set:
                    processed_bc = cr_stats.correct_bc_error(args.barcode_confidence_threshold,
                                                             raw_bc, bc_qual, barcode_dist)
                else:
                    # Disallow Ns in no-whitelist case
                    if 'N' in raw_bc:
                        processed_bc = None
                    else:
                        processed_bc = raw_bc

                if processed_bc:
                    bc_counter.count(None, processed_bc, None)

                    # Add gem group to barcode sequence
                    processed_bc = cr_utils.format_barcode_seq(processed_bc, gem_group=args.gem_group)

                reporter.vdj_barcode_cb(raw_bc, processed_bc)

            # One output line per read, so line N belongs to read N.
            out_file.write('%s\n' % (processed_bc if processed_bc is not None else ''))
    finally:
        for handle in open_handles:
            handle.close()

    bc_counter.close()

    reporter.save(outs.chunked_reporter)