def main(args, outs):
    """Run the external ``barcodeqc`` tool on every barcode FASTQ file.

    Returns immediately when ``args.run_qc`` is falsy. Otherwise one JSON
    report per barcode file is written into the directory that contains the
    ``outs.qc_summary`` path, and ``outs.qc_summary`` is replaced by a dict
    whose ``'barcode'`` list collects the report paths (``'read1'`` and
    ``'read2'`` start out empty).

    A non-zero exit status from ``barcodeqc`` is reported through
    ``martian.throw``.
    """
    if not args.run_qc:
        return

    # Capture the output directory before outs.qc_summary is overwritten.
    out_base = os.path.dirname(outs.qc_summary)
    whitelist_path = tk_preflight.check_barcode_whitelist(
        args.barcode_whitelist)

    file_infos = [tk_fasta.IlmnFastqFile(path) for path in args.input_files]
    bc_file_type = args.file_read_types_map[args.bc_read_type]
    barcode_files = [info for info in file_infos if info.read == bc_file_type]

    # Note: this is Martian 3 incompatible; revert back to summary_chunk if
    # merging back into master (also applies to additional references to
    # `qc_summary` in the main function)
    #
    # see https://github.com/10XDev/tenkit/commit/2c59c9a24b0e7cd81945544f62ffde7ab632ed42
    outs.qc_summary = {'barcode': [], 'read1': [], 'read2': []}

    for file_no, barcode_fastq in enumerate(barcode_files):
        output_json_path = os.path.join(out_base,
                                        "output_%d_BC.json" % file_no)

        qc_cmd = ['barcodeqc', barcode_fastq.filename, output_json_path]
        qc_cmd += ["--whitelist", whitelist_path]
        qc_cmd += ["--bc-start-index", str(args.bc_start_index)]
        qc_cmd += ["--bc-length", str(args.bc_length)]
        # --rc is only passed for I2 barcode reads when rc_i2_read is set.
        if args.bc_read_type == "I2" and args.rc_i2_read:
            qc_cmd.append("--rc")

        try:
            tk_proc.check_call(qc_cmd)
        except subprocess.CalledProcessError as e:
            martian.throw(
                "Could not QC barcodes: return code %s" % e.returncode)

        # needs to be summary_chunk in Martian 3
        outs.qc_summary['barcode'].append(output_json_path)
def align_reads_paired(self, in_fastq_r1_fn, in_fastq_r2_fn, out_file, write_sam=False, **kwargs):
    """Align paired-end reads to the indexed reference with bowtie2.

    Output is a BAM file (bowtie2 piped through ``samtools view``) unless
    ``write_sam`` is set, in which case bowtie2 writes SAM directly.

    Args:
        in_fastq_r1_fn (str): name of fastq file with R1 to align
        in_fastq_r2_fn (str): name of fastq file with R2 to align
        out_file (str): name of BAM/SAM file to output aligned reads
        write_sam (bool): set to True to write SAM instead of BAM
        **kwargs: Any additional arguments to bowtie2 may be included. Flags
            may have value set to None. Values are not validated except for
            conflicts with index and read input arguments. Parameters with
            hyphens in name should be defined using underscores in place of
            hyphens.

    Examples:
        kwargs can be specified as such:
            myBowtie2.align_reads_paired(f1, f2, bam, p=2, very_fast=None, N=3)
    """
    assert self.indexed

    # -x / -1 / -2 / -U are filled in below, so callers may not pass them.
    extra_opts = cr_utils.kwargs_to_command_line_options(
        {'x', '1', '2', 'U'}, replace_chars={'_': '-'}, **kwargs)

    if write_sam:
        template = 'bowtie2 %s -x %s -1 %s -2 %s -S %s'
    else:
        template = 'bowtie2 %s -x %s -1 %s -2 %s | samtools view -bS - -o %s'

    fields = (extra_opts, self.index_path, in_fastq_r1_fn, in_fastq_r2_fn,
              out_file)
    # The BAM variant relies on a shell pipe, hence shell=True.
    tk_subproc.check_call(template % fields, shell=True)
def run_assembly(fastq_pref, fasta_pref, args):
    """Build and run the ``vdj_asm asm`` command line.

    ``fastq_pref`` and ``fasta_pref`` are passed straight through as the
    first two positional arguments. Numeric tuning options are read from
    ``args``. The minimum reads-per-UMI cutoff is looked up for
    ``args.gem_group`` and doubled for paired-end chemistries; single-end
    chemistries additionally get ``--single-end``. The command is echoed to
    stderr before it is executed in the current working directory.
    """
    tuning_options = [
        ('--kmers=', args.min_kmer_count),
        ('--min-contig=', args.min_contig_len),
        ('--min-qual=', args.min_qual),
        ('--score-factor=', args.score_factor),
        ('--qual-factor=', args.qual_factor),
        ('--min-sw-score=', args.min_sw_score),
        ('--rt-error=', args.rt_error),
    ]
    cmd = ['vdj_asm', 'asm', fastq_pref, fasta_pref]
    cmd.extend(flag + str(value) for flag, value in tuning_options)

    if not cr_chem.has_umis(args.chemistry_def):
        martian.log_info('Assembly without UMIs is not fully supported.')

    cutoff = args.min_readpairs_per_umi[str(args.gem_group)]
    paired = cr_chem.is_paired_end(args.chemistry_def)
    # The cutoff is expressed in read pairs, so paired-end data needs twice
    # as many individual reads.
    min_umi_reads = 2 * cutoff if paired else cutoff
    cmd.append('--min-umi-reads=' + str(min_umi_reads))
    if not paired:
        cmd.append('--single-end')

    if args.use_unmapped:
        cmd.append('--use-unmapped')

    # cmd.append('--mixture-filter')  # currently disabled

    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')
    tk_subproc.check_call(cmd, cwd=os.getcwd())
def main(args, outs):
    """Split one read chunk into smaller FASTQ chunks with ``chunk_reads``.

    ``args.read_chunk`` is serialised to ``chunk_args.json`` in the current
    directory, ``chunk_reads`` is run against it, and the
    ``read_chunks.json`` file found in the output directory afterwards is
    loaded. ``outs.out_chunks`` receives one shallow copy of
    ``args.read_chunk`` per result entry, with the entry stored under the
    ``'read_chunks'`` key.
    """
    # Write read_chunk for consumption by Rust
    with open("chunk_args.json", "w") as args_file:
        json.dump(args.read_chunk, args_file)

    output_path = martian.make_path("")
    prefix = "fastq_chunk"
    chunk_reads_args = ['chunk_reads']
    chunk_reads_args += ['--reads-per-fastq', str(args.reads_per_file)]
    chunk_reads_args += [output_path, prefix]
    chunk_reads_args += ["--martian-args", "chunk_args.json"]
    chunk_reads_args += ['--compress', 'lz4']

    print("running chunk reads: [%s]" % str(chunk_reads_args))
    tk_subproc.check_call(chunk_reads_args)

    results_path = os.path.join(output_path, "read_chunks.json")
    with open(results_path) as results_file:
        chunk_results = json.load(results_file)

    outs.out_chunks = []

    # Write out a new chunk entry for each resulting chunk
    for result in chunk_results:
        print(args.read_chunk)
        new_chunk = args.read_chunk.copy()
        print(new_chunk)
        new_chunk['read_chunks'] = result
        outs.out_chunks.append(new_chunk)
def main(args, outs):
    """Run ``barcodeqc`` on each barcode FASTQ and collect the report paths.

    No-op when ``args.run_qc`` is falsy. Reports are written into the
    directory containing the ``outs.qc_summary`` path, and their paths are
    appended to ``outs.summary_chunk['barcode']``; the ``'read1'`` and
    ``'read2'`` lists are created empty. A failing ``barcodeqc`` call is
    reported through ``martian.throw``.
    """
    if not args.run_qc:
        return

    out_base = os.path.dirname(outs.qc_summary)
    whitelist_path = tk_preflight.check_barcode_whitelist(
        args.barcode_whitelist)

    file_infos = [tk_fasta.IlmnFastqFile(path) for path in args.input_files]
    bc_file_type = args.file_read_types_map[args.bc_read_type]
    barcode_files = [info for info in file_infos if info.read == bc_file_type]

    outs.summary_chunk = {'barcode': [], 'read1': [], 'read2': []}

    for file_no, barcode_fastq in enumerate(barcode_files):
        report_name = "output_%d_BC.json" % file_no
        output_json_path = os.path.join(out_base, report_name)

        qc_cmd = [
            'barcodeqc',
            barcode_fastq.filename,
            output_json_path,
            "--whitelist", whitelist_path,
            "--bc-start-index", str(args.bc_start_index),
            "--bc-length", str(args.bc_length),
        ]
        # --rc is only passed for I2 barcode reads when rc_i2_read is set.
        if args.bc_read_type == "I2" and args.rc_i2_read:
            qc_cmd.append("--rc")

        try:
            tk_proc.check_call(qc_cmd)
        except subprocess.CalledProcessError as e:
            martian.throw(
                "Could not QC barcodes: return code %s" % e.returncode)

        outs.summary_chunk['barcode'].append(output_json_path)
def sort_bed(input_bed, output_bed, genome, threads=1, leave_key=False, has_key=False):
    """Sort a BED file with unix ``sort`` using a custom contig order.

    The contig order is the line order of ``genome`` (first whitespace
    separated field of each line). Unless ``has_key`` is set, a numeric
    contig key is joined onto every record first; unless ``leave_key`` is
    set, the key column is cut away again before writing ``output_bed``.

    Warning! sort does not have the --parallel argument on all forms of
    unix! As such, threading support is dropped and ``threads`` is ignored.
    """
    # sort_thread_args = "" if threads == 1 else " --parallel={}".format(threads)
    sort_thread_args = ""

    pipeline = []
    if has_key:
        pipeline.append("cat {}".format(input_bed))
    else:
        # The BED file has no contig key yet, so build the key file used to
        # add one. Both helper files live next to the output BED.
        work_dir = os.path.dirname(output_bed)
        tmp_chroms = os.path.join(work_dir, "chrom_order.txt")
        chroms = os.path.join(work_dir, "sorted_order.txt")

        with open(genome, "r") as infile, open(tmp_chroms, "w") as outfile:
            for rank, line in enumerate(infile):
                outfile.write("{}\t{}\n".format(line.split()[0], rank))

        # join needs its key file sorted on the join field.
        tk_subproc.check_call(
            "sort -k1b,1 -o {} {}".format(chroms, tmp_chroms), shell=True)

        # Join the key onto each record, between the contig and the
        # start/stop positions.
        pipeline.append(
            "sort -k1b,1{} {}".format(sort_thread_args, input_bed))
        pipeline.append("join -t '\t' -j1 {} -".format(chroms))

    # Sort on the contig key and the start/stop positions.
    pipeline.append("sort -k2n -k3n -k4n{}".format(sort_thread_args))

    if not leave_key:
        # Drop the key column from the final output.
        pipeline.append("cut -f 1,3-8")

    with open(output_bed, 'w') as outfile:
        tk_subproc.check_call(" | ".join(pipeline), shell=True, stdout=outfile)
def index_reference(self, in_fasta_fn, in_gtf_fn, num_threads=1, sa_sparse_d=None, sa_index_n_bases=None, chr_bin_n_bits=None, limit_ram=None):
    """Run ``STAR --runMode genomeGenerate`` into ``self.reference_star_path``.

    The target directory must not exist yet; it is created here. Each
    optional tuning parameter is forwarded to STAR only when it is not None.

    Raises:
        Exception: if ``self.reference_star_path`` already exists.
    """
    star_dir = self.reference_star_path
    if os.path.exists(star_dir):
        raise Exception('STAR reference path %s already exists' % star_dir)
    os.mkdir(star_dir)

    args = [
        'STAR',
        '--runMode', 'genomeGenerate',
        '--genomeDir', star_dir,
        '--runThreadN', str(num_threads),
        '--genomeFastaFiles', in_fasta_fn,
        '--sjdbGTFfile', in_gtf_fn,
    ]

    optional_flags = (
        ('--limitGenomeGenerateRAM', limit_ram),
        ('--genomeSAsparseD', sa_sparse_d),
        ('--genomeSAindexNbases', sa_index_n_bases),
        ('--genomeChrBinNbits', chr_bin_n_bits),
    )
    for flag, value in optional_flags:
        if value is not None:
            args.extend([flag, str(value)])

    tk_subproc.check_call(args)
def join(args, outs, chunk_defs, chunk_outs):
    """Combine per-chunk annotation results with ``annotate_reads join``.

    Collects each chunk's output path, writes every chunk's reporter path
    and library type to ``chunk_metrics.json`` in the current directory,
    runs ``annotate_reads join`` on that file, and finally gathers the
    per-chunk alignment counts into ``outs.num_alignments``.
    """
    output_paths = []
    for chunk_out in chunk_outs:
        output_paths.append(str(chunk_out.output))
    outs.output = output_paths
    outs.chunked_reporter = None
    outs.coerce_strings()

    # Write chunk info to a temporary file for the rust code to consume
    chunk_metrics = [
        {
            'metrics': chunk_out.chunked_reporter,
            'library_type': chunk_def.library_type,
        }
        for chunk_def, chunk_out in zip(chunk_defs, chunk_outs)
    ]
    with open('chunk_metrics.json', 'w') as metrics_file:
        json.dump(chunk_metrics, metrics_file)

    cmd = ['annotate_reads', 'join', 'chunk_metrics.json']
    cmd += [outs.summary, outs.barcodes_detected]

    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')
    tk_subproc.check_call(cmd, cwd=os.getcwd())

    outs.num_alignments = [
        chunk_out.num_alignments for chunk_out in chunk_outs
    ]
def run_louvain_unweighted_clustering(bin_filename, louvain_out):
    """Run the Louvain binary on an unweighted edge-list.

    The binary at ``LOUVAIN_BINPATH`` is invoked on ``bin_filename`` with
    ``-q 0 -l -1`` and its standard output is captured into ``louvain_out``.
    """
    with open(louvain_out, 'w') as out_handle:
        louvain_cmd = [LOUVAIN_BINPATH, bin_filename]
        louvain_cmd += ['-q', '0']
        louvain_cmd += ['-l', '-1']
        tk_subproc.check_call(louvain_cmd, stdout=out_handle)
def load_from_index(index_path):
    """Create a ``Bowtie2Reference`` backed by an existing bowtie2 index.

    The index is validated by running ``bowtie2-inspect -n`` on it. The
    command is passed as an argument list (no shell), so index paths that
    contain spaces or shell metacharacters are handled correctly.

    Args:
        index_path: path prefix of the bowtie2 index.

    Returns:
        A ``Bowtie2Reference`` with ``index_path`` set, ``indexed`` True and
        ``reference_fasta_path`` None.

    Raises:
        ValueError: if ``bowtie2-inspect`` fails on the index or cannot be
            executed at all.
    """
    b = Bowtie2Reference()
    b.reference_fasta_path = None

    # Check existence and validity of index
    try:
        tk_subproc.check_call(['bowtie2-inspect', '-n', '%s' % index_path])
    except (subprocess.CalledProcessError, OSError):
        # Without a shell, a missing executable surfaces as OSError instead
        # of exit status 127; keep reporting both as an invalid index.
        raise ValueError('Bowtie2 index could not be found or was invalid')

    b.index_path = index_path
    b.indexed = True
    return b
def merge_keyed_bed(input_beds, output_bed, threads=1):
    """Merge already-sorted, keyed BED files and drop the contig key.

    The inputs are merged with ``sort -m`` on the key and start/stop
    columns, then the key column is removed with ``cut`` and the result is
    written to ``output_bed``.

    Warning! sort does not have the --parallel argument on all forms of
    unix! As such, threading support is dropped and ``threads`` is ignored.
    """
    # sort_thread_args = "" if threads == 1 else " --parallel={}".format(threads)
    sort_thread_args = ""

    merge_step = "sort -m -k2n -k3n -k4n{} {}".format(
        sort_thread_args, ' '.join(input_beds))
    drop_key_step = "cut -f 1,3-8"
    pipeline = [merge_step, drop_key_step]

    with open(output_bed, 'w') as outfile:
        tk_subproc.check_call(" | ".join(pipeline), shell=True, stdout=outfile)
def main(args, outs):
    """Run ``annotate_reads main`` on one chunk and record its alignment count.

    The reference gene index is first converted into ``outs.gene_index_tab``.
    The barcode whitelist argument is passed as the literal ``'null'`` when
    unset, as-is when it is an existing path, and otherwise resolved through
    ``cr_utils.get_barcode_whitelist_path``. After the command finishes,
    ``num_alignments`` is read back from the ``outs.chunk_metadata`` JSON.
    """
    convert_pickle_to_rust_index(
        cr_utils.get_reference_genes_index(args.reference_path),
        outs.gene_index_tab)

    whitelist_arg = args.barcode_whitelist
    if whitelist_arg is None:
        barcode_whitelist = 'null'
    elif os.path.exists(whitelist_arg):
        barcode_whitelist = whitelist_arg
    else:
        # Not a path on disk: resolve it through cr_utils instead.
        barcode_whitelist = cr_utils.get_barcode_whitelist_path(whitelist_arg)

    cmd = [
        'annotate_reads', 'main',
        args.chunk_genome_input,
        args.chunk_tags,
        outs.output,
        outs.chunked_reporter,
        args.reference_path,
        outs.gene_index_tab,
        args.barcode_counts,
        barcode_whitelist,
        str(args.gem_group),
        outs.chunk_metadata,
        cr_chem.get_strandedness(args.chemistry_def),
        args.feature_counts,
        args.library_type or lib_constants.DEFAULT_LIBRARY_TYPE,
        args.library_id,
        args.library_info_json,
        '--bam-comments', args.bam_comments_json,
    ]

    if cr_chem.get_endedness(args.chemistry_def) == cr_constants.FIVE_PRIME:
        cmd.append('--fiveprime')
    if args.skip_translate:
        cmd.append('--skip-translate')
    if args.feature_reference is not None:
        cmd += ['--feature-ref', args.feature_reference]

    quoted = ["'%s'" % part for part in cmd]
    sys.stderr.write('Running ' + ' '.join(quoted) + '\n')
    tk_subproc.check_call(cmd, cwd=os.getcwd())

    with open(outs.chunk_metadata) as metadata_file:
        metadata = json.load(metadata_file)
    outs.num_alignments = metadata['num_alignments']
def run_read_match(read1_path, read2_path, fasta_path, out_bam_filename, strand, sw_params):
    """Build and run the ``vdj_asm read-match`` command line.

    ``strand`` must be ``'+'`` or ``'-'``; ``'-'`` adds ``--rev-strand``.
    ``read2_path`` is optional and only passed when truthy. ``sw_params``
    must provide the ``'seed'`` and ``'min_sw_score'`` entries. The command
    is echoed to stderr and executed in the current working directory.
    """
    assert strand in ('+', '-')

    cmd = ['vdj_asm', 'read-match']
    cmd += ['--ref', fasta_path]
    cmd += ['--r1', read1_path]
    cmd += ['--outbam', out_bam_filename]
    cmd.append('--seed=' + str(sw_params['seed']))
    cmd.append('--min-sw-score=' + str(sw_params['min_sw_score']))

    if strand == '-':
        cmd.append('--rev-strand')
    if read2_path:
        cmd += ['--r2', read2_path]

    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')
    tk_subproc.check_call(cmd, cwd=os.getcwd())
def main(args, outs):
    """Concatenate per-tile demultiplexed FASTQs into one file per barcode.

    No-op unless ``args.split_by_tile`` is set. For every read type and
    barcode, all matching files under the ``Tile*`` sub-directories of
    ``args.demultiplexed_fastq_path`` are concatenated with ``cat`` into a
    single ``...-chunk-001.fastq.gz`` file under
    ``outs.demultiplexed_fastq_path`` (which is created here).
    """
    if not args.split_by_tile:
        return

    os.makedirs(outs.demultiplexed_fastq_path)

    # covering the bases
    demux_read_types = ("RA", "I1", "I2")

    # like tenkit.fasta.find_input_fastq_files_10x_preprocess but allow Ns
    # from combined barcode list
    for read_type in demux_read_types:
        for barcode in args.bcs:
            name_fields = (read_type, barcode, args.lane)
            file_glob = "read-%s_si-%s_lane-%03d[_\\-]*.fastq*" % name_fields
            tile_pattern = os.path.join(args.demultiplexed_fastq_path,
                                        "Tile*", file_glob)
            files = glob.glob(tile_pattern)
            if not files:
                continue

            # assuming here that all files are already gzipped
            out_path = os.path.join(
                outs.demultiplexed_fastq_path,
                "read-%s_si-%s_lane-%03d-chunk-001.fastq.gz" % name_fields)
            cat_words = ["cat"] + files + [">", out_path]
            # Output redirection requires the shell.
            tk_proc.check_call(" ".join(cat_words), shell=True)
def join(args, outs, chunk_defs, chunk_outs):
    """Combine per-chunk annotation results with ``annotate_reads join``.

    Collects each chunk's output path, writes one reporter path per line to
    ``outs.metric_chunk_list``, runs ``annotate_reads join`` on that list,
    and gathers the per-chunk alignment counts into ``outs.num_alignments``.
    """
    output_paths = []
    for chunk_out in chunk_outs:
        output_paths.append(str(chunk_out.output))
    outs.output = output_paths
    outs.chunked_reporter = None
    outs.coerce_strings()

    with open(outs.metric_chunk_list, 'w') as list_file:
        list_file.writelines(
            chunk_out.chunked_reporter + '\n' for chunk_out in chunk_outs)

    cmd = ['annotate_reads', 'join', outs.metric_chunk_list]
    cmd += [outs.summary, outs.barcodes_detected]

    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')
    tk_subproc.check_call(cmd, cwd=os.getcwd())

    outs.num_alignments = [
        chunk_out.num_alignments for chunk_out in chunk_outs
    ]
def main(args, outs):
    """Sort a BAM chunk by five-prime position tag and count perfect reads.

    ``args.chunk_input`` is sorted with ``samtools sort -t
    SELF_FIVE_PRIME_POS_TAG`` into ``outs.default``. The unsorted input is
    then scanned once, and the number of reads accepted by
    ``crdna.read_filter.stringent_read_filter(read, True)`` is stored in
    ``outs.perfect_read_count``.
    """
    args.coerce_strings()

    # Sort based on the five prime position tag
    sort_args = [
        "samtools", "sort",
        "-t", SELF_FIVE_PRIME_POS_TAG,
        "-o", outs.default,
        args.chunk_input,
    ]
    check_call(sort_args)

    # NOTE(review): assumes create_bam_infile returns a pysam-style handle
    # that is iterable and has close() - confirm against tk_bam.
    bam = tk_bam.create_bam_infile(str(args.chunk_input))
    try:
        perfect_read_count = sum(
            1 for read in bam
            if crdna.read_filter.stringent_read_filter(read, True))
    finally:
        # Release the input handle even if the filter raises.
        bam.close()

    outs.perfect_read_count = perfect_read_count
def main(args, outs):
    """Run ``annotate_reads main`` on one chunk and record its alignment count.

    The reference gene index is first converted into ``outs.gene_index_tab``.
    The barcode whitelist argument is passed as the literal ``'null'`` when
    unset, as-is when it is an existing path, and otherwise resolved through
    ``cr_utils.get_barcode_whitelist_path``. After the command finishes,
    ``num_alignments`` is read back from the ``outs.chunk_metadata`` JSON.
    """
    convert_pickle_to_rust_index(
        cr_utils.get_reference_genes_index(args.reference_path),
        outs.gene_index_tab)

    whitelist_arg = args.barcode_whitelist
    if whitelist_arg is None:
        barcode_whitelist = 'null'
    elif os.path.exists(whitelist_arg):
        barcode_whitelist = whitelist_arg
    else:
        # Not a path on disk: resolve it through cr_utils instead.
        barcode_whitelist = cr_utils.get_barcode_whitelist_path(whitelist_arg)

    cmd = [
        'annotate_reads', 'main',
        args.chunk_genome_input,
        outs.output,
        outs.chunked_reporter,
        args.reference_path,
        outs.gene_index_tab,
        args.barcode_counts,
        barcode_whitelist,
        str(args.gem_group),
        outs.chunk_metadata,
        cr_chem.get_strandedness(args.chemistry_def),
        '--bam-comments', args.bam_comments_json,
    ]

    if cr_chem.get_endedness(args.chemistry_def) == cr_constants.FIVE_PRIME:
        cmd.append('--fiveprime')

    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')
    tk_subproc.check_call(cmd, cwd=os.getcwd())

    with open(outs.chunk_metadata) as metadata_file:
        metadata = json.load(metadata_file)
    outs.num_alignments = metadata['num_alignments']
def _index_reference(self, index_path, **kwargs):
    """Build a bowtie2 index for ``self.reference_fasta_path``.

    On success ``self.index_path`` is set to ``index_path`` and
    ``self.indexed`` to True.

    Args:
        index_path (str): path to index prefix
        **kwargs: Any additional arguments to bowtie2-build may be included.
            Flags may have value set to None. Values are not validated.
            Parameters with hyphens in name should be defined using
            underscores in place of hyphens.

    Notes:
        Bowtie2 generates temporary files for indexing as a side-effect.

    Examples:
        kwargs can be specified as such:
            myBowtie2._index_reference(index_path, large_index=None, bmax=4)
    """
    # No option names are reserved for bowtie2-build, hence the empty set.
    build_opts = cr_utils.kwargs_to_command_line_options(
        set(), replace_chars={'_': '-'}, **kwargs)

    build_fields = (build_opts, self.reference_fasta_path, index_path)
    tk_subproc.check_call('bowtie2-build %s %s %s' % build_fields, shell=True)

    self.index_path = index_path
    self.indexed = True
def run(args):
    """Echo the space-joined command to stdout, then execute it.

    ``args`` is the argument list handed unchanged to
    ``tk_subproc.check_call``.
    """
    command_line = ' '.join(args)
    print(command_line)
    tk_subproc.check_call(args)
def join(args, outs, chunk_defs, chunk_outs):
    """Merge per-chunk assembly outputs into the stage-level outputs.

    With no chunks at all, empty BAM/FASTA/FASTQ files are created and the
    remaining outputs are set to None. Otherwise the contig FASTA/FASTQ and
    the summary TSVs of every chunk that produced a contig FASTA are
    concatenated, the chunk BAMs are merged in batches of ``MERGE_BAMS_N``
    and indexed, and the per-chunk metric JSONs are merged into
    ``outs.metrics_summary_json``.
    """
    if len(chunk_outs) == 0:
        # No input reads: create empty BAM, contig FASTA and contig FASTQ.
        for empty_path in (outs.contig_bam, outs.contig_fasta,
                           outs.contig_fastq):
            with open(empty_path, 'w'):
                pass
        outs.contig_bam_bai = None
        outs.contig_fasta_fai = None
        outs.metrics_summary_json = None
        outs.summary_tsv = None
        outs.umi_summary_tsv = None
        return

    contigs = []
    contig_fastqs = []
    contig_bams = []
    summary_tsvs = []
    umi_summary_tsvs = []

    # Only chunks that actually wrote a contig FASTA contribute.
    for chunk_out in chunk_outs:
        if os.path.isfile(chunk_out.contig_fasta):
            contigs.append(chunk_out.contig_fasta)
            contig_fastqs.append(chunk_out.contig_fastq)
            contig_bams.append(chunk_out.contig_bam)
            summary_tsvs.append(chunk_out.summary_tsv)
            umi_summary_tsvs.append(chunk_out.umi_summary_tsv)

    cr_io.concatenate_files(outs.contig_fasta, contigs)

    # The FASTA is only indexed when it is non-empty.
    if os.path.getsize(outs.contig_fasta) > 0:
        tk_subproc.check_call('samtools faidx %s' % outs.contig_fasta,
                              shell=True)
        outs.contig_fasta_fai = outs.contig_fasta + '.fai'

    cr_io.concatenate_files(outs.contig_fastq, contig_fastqs)

    if len(summary_tsvs) > 0:
        cr_io.concatenate_headered_files(outs.summary_tsv, summary_tsvs)
    if len(umi_summary_tsvs) > 0:
        cr_io.concatenate_headered_files(outs.umi_summary_tsv,
                                         umi_summary_tsvs)

    if contig_bams:
        # Merge every N BAMs. Trying to merge them all at once
        # risks hitting the filehandle limit.
        n_merged = 0
        while len(contig_bams) > 1:
            batch = contig_bams[:MERGE_BAMS_N]
            tmp_bam = martian.make_path('merged-%04d.bam' % n_merged)
            n_merged += 1

            print("Merging %d BAMs into %s ..." % (len(batch), tmp_bam))
            tk_bam.merge(tmp_bam, batch, threads=args.__threads)

            # Delete any temporary bams that have been merged
            for merged_input in batch:
                if os.path.basename(merged_input).startswith('merged-'):
                    cr_io.remove(merged_input)

            # Pop the input bams and push the merged bam
            contig_bams = contig_bams[len(batch):] + [tmp_bam]

        final_bam = contig_bams[0]
        if os.path.basename(final_bam).startswith('merged-'):
            # We merged at least two chunks together.
            # Rename it to the output bam.
            cr_io.move(final_bam, outs.contig_bam)
        else:
            # There was only a single chunk, so copy it from the input
            cr_io.copy(final_bam, outs.contig_bam)

        tk_bam.index(outs.contig_bam)

        # Make sure the Martian out matches the actual index filename
        outs.contig_bam_bai = outs.contig_bam + '.bai'

    # Merge the assembler summary jsons
    chunk_jsons = [out.metrics_summary_json for out in chunk_outs]
    merged_summary = cr_io.merge_jsons_single_level(chunk_jsons)

    with open(outs.metrics_summary_json, 'w') as summary_file:
        json.dump(tk_safe_json.json_sanitize(merged_summary), summary_file,
                  indent=4, sort_keys=True)
def run_cutadapt(args, out_read1s, out_read2s, chemistry_def, stdout=sys.stdout):
    """Trim primer sequences from a chunk of reads with ``cutadapt``.

    Input and output files are opened here (via ``open_maybe_gzip``) and
    handed to cutadapt as ``/proc/<pid>/fd/<n>`` paths. All of them are
    closed before returning, including when opening a later file or the
    cutadapt call itself raises.

    Args:
        args: stage args; ``primers``, ``read1s_chunk`` and, for paired-end
            chemistries, ``read2s_chunk`` are read.
        out_read1s: path of the trimmed R1 output.
        out_read2s: path of the trimmed R2 output (paired-end only).
        chemistry_def: chemistry definition used to decide paired vs single
            end and, for single end, which read is present.
        stdout: file object receiving cutadapt's standard output.

    Returns:
        Whatever ``tk_subproc.check_call`` returns for the cutadapt command.
    """
    paired_end = cr_chem.is_paired_end(chemistry_def)

    # If single end, determine which read the single read is (R1 or R2)
    if paired_end:
        single_read = None
    else:
        single_read = cr_chem.get_rna_read_def(chemistry_def).read_type
        assert single_read in ('R1', 'R2')

    def fd_path(handle):
        # Path through which the child process can reach our open handle.
        return '/proc/%d/fd/%d' % (os.getpid(), handle.fileno())

    # Every handle opened below is tracked here. Holding the references
    # keeps the garbage collector from finalizing them before cutadapt runs,
    # and the finally clause closes them in opening order. Closing also
    # waits on any compression subprocess so its rusage is accounted for.
    opened = []
    try:
        out_r1_file = cr_utils.open_maybe_gzip(out_read1s, 'w')
        opened.append(out_r1_file)

        # Note: The complexity of forcing cutadapt to output a compressed
        # file means we'll have to give up on that for now.
        cmd = ['cutadapt',
               '-e', '0.12', '--times', '3', '--overlap', '5',
               '-f', 'fastq',
               '-o', fd_path(out_r1_file)]

        if paired_end:
            out_r2_file = cr_utils.open_maybe_gzip(out_read2s, 'w')
            opened.append(out_r2_file)
            cmd.extend(['-p', fd_path(out_r2_file)])

        primers = {anno['name']: anno['seq'] for anno in args.primers}

        if paired_end or single_read == 'R1':
            # R1 adapters
            for name in R1_ANCHORED_FIVE_PRIME_SEQS:
                if name in primers:
                    cmd.extend(['-g', '%s=^%s' % (name, primers[name])])

            for name in R1_THREE_PRIME_REV_COMP_SEQS:
                if name in primers:
                    cmd.extend(['-a', '%s_rc=%s' % (name, tk_seq.get_rev_comp(primers[name]))])

            for name in R1_THREE_PRIME_SEQS:
                if name in primers:
                    cmd.extend(['-a', '%s=%s' % (name, primers[name])])

        if paired_end or single_read == 'R2':
            # R2 adapters use -A when R2 is the second file of a pair and
            # -a when R2 is the only read.
            flag = '-A' if paired_end else '-a'
            for name in R2_THREE_PRIME_REV_COMP_SEQS:
                if name in primers:
                    cmd.extend([flag, '%s_rc=%s' % (name, tk_seq.get_rev_comp(primers[name]))])

            for name in R2_THREE_PRIME_SEQS:
                if name in primers:
                    cmd.extend([flag, '%s=%s' % (name, primers[name])])

        read1_file = cr_utils.open_maybe_gzip(args.read1s_chunk)
        opened.append(read1_file)
        cmd.extend([fd_path(read1_file)])

        if paired_end:
            read2_file = cr_utils.open_maybe_gzip(args.read2s_chunk)
            opened.append(read2_file)
            cmd.extend([fd_path(read2_file)])

        print(cmd)

        return tk_subproc.check_call(cmd, stdout=stdout)
    finally:
        for handle in opened:
            handle.close()
file_info.read, str(file_info.group).zfill(3) ) out_path = os.path.join(output_folder, os.path.basename(out_name)) os.rename(paths[0], out_path) else: out_file = "%s_S0%s_L%s_%s_%s.fastq.gz" % ( file_info.prefix, args.output_snum, str(file_info.lane).zfill(3), file_info.read, str(file_info.group).zfill(3)) out_path = os.path.join(output_folder, os.path.basename(out_file)) subprocess_args = ["cat"] + paths + [">", out_path] log_subprocess.check_call(" ".join(subprocess_args), shell=True) out_files.append(out_path) # need something for non-blank chunk_outs (Martian foible) outs.files_merged = True outs.merged_file_paths = out_files def join(args, outs, chunk_args, chunk_outs): if args.remove_split_fastqs: unique_folders = set([]) for chunk_arg, chunk_out in zip(chunk_args, chunk_outs): # if no files present or no files need merging, no folders necessary to remove if not chunk_out.files_merged: continue for input_file in chunk_arg.input_files:
r1_files = [ f for f in file_infos if args.file_read_types_map['R1'] == f.read ] for idx, r1f in enumerate(r1_files): output_json_path = os.path.join(out_base, "output_%d_R1.json" % idx) if bc_file_type == 'R1': start_index = args.bc_start_index else: start_index = 0 subproc_args = [ 'q30count', r1f.filename, output_json_path, '--read-start-index', str(start_index) ] try: tk_proc.check_call(subproc_args) except subprocess.CalledProcessError, e: martian.throw("Could not count Q30 reads on R1: return code %s" % e.returncode) # needs to be summary_chunk in Martian 3 outs.qc_summary['read1'].append(output_json_path) r2_files = [ f for f in file_infos if args.file_read_types_map['R2'] == f.read ] for idx, r2f in enumerate(r2_files): output_json_path = os.path.join(out_base, "output_%d_R2.json" % idx) if bc_file_type == 'R2': start_index = args.bc_start_index else:
def run_cutadapt(args, out_read1s, out_read2s, chemistry_def):
    """Trim primer sequences from a chunk of reads with ``cutadapt``.

    Input and output files are opened here (via ``open_maybe_gzip``) and
    handed to cutadapt as ``/proc/<pid>/fd/<n>`` paths. All of them are
    closed before returning, so the outputs are flushed and complete by the
    time the caller reads them, and no handle is leaked when cutadapt fails.

    Args:
        args: stage args; ``primers``, ``read1s_chunk`` and, for paired-end
            chemistries, ``read2s_chunk`` are read.
        out_read1s: path of the trimmed R1 output.
        out_read2s: path of the trimmed R2 output (paired-end only).
        chemistry_def: chemistry definition used to decide paired vs single
            end and, for single end, which read is present.

    Returns:
        Whatever ``tk_subproc.check_call`` returns for the cutadapt command.
    """
    paired_end = cr_chem.is_paired_end(chemistry_def)

    # If single end, determine which read the single read is (R1 or R2)
    if paired_end:
        single_read = None
    else:
        single_read = cr_chem.get_rna_read_def(chemistry_def).read_type
        assert single_read in ('R1', 'R2')

    def fd_path(handle):
        # Path through which the child process can reach our open handle.
        return '/proc/%d/fd/%d' % (os.getpid(), handle.fileno())

    # Track every handle so the finally clause can close them in opening
    # order; holding the references also keeps them alive while cutadapt
    # runs.
    opened = []
    try:
        out_r1_file = cr_utils.open_maybe_gzip(out_read1s, 'w')
        opened.append(out_r1_file)

        # Note: The complexity of forcing cutadapt to output a compressed
        # file means we'll have to give up on that for now.
        cmd = [
            'cutadapt',
            '-e', '0.12', '--times', '3', '--overlap', '5',
            '-f', 'fastq',
            '-o', fd_path(out_r1_file),
        ]

        if paired_end:
            out_r2_file = cr_utils.open_maybe_gzip(out_read2s, 'w')
            opened.append(out_r2_file)
            cmd.extend(['-p', fd_path(out_r2_file)])

        primers = {anno['name']: anno['seq'] for anno in args.primers}

        if paired_end or single_read == 'R1':
            # R1 adapters
            for name in R1_ANCHORED_FIVE_PRIME_SEQS:
                if name in primers:
                    cmd.extend(['-g', '%s=^%s' % (name, primers[name])])

            for name in R1_THREE_PRIME_REV_COMP_SEQS:
                if name in primers:
                    cmd.extend([
                        '-a',
                        '%s_rc=%s' % (name, tk_seq.get_rev_comp(primers[name]))
                    ])

            for name in R1_THREE_PRIME_SEQS:
                if name in primers:
                    cmd.extend(['-a', '%s=%s' % (name, primers[name])])

        if paired_end or single_read == 'R2':
            # R2 adapters use -A when R2 is the second file of a pair and
            # -a when R2 is the only read.
            flag = '-A' if paired_end else '-a'
            for name in R2_THREE_PRIME_REV_COMP_SEQS:
                if name in primers:
                    cmd.extend([
                        flag,
                        '%s_rc=%s' % (name, tk_seq.get_rev_comp(primers[name]))
                    ])

            for name in R2_THREE_PRIME_SEQS:
                if name in primers:
                    cmd.extend([flag, '%s=%s' % (name, primers[name])])

        read1_file = cr_utils.open_maybe_gzip(args.read1s_chunk)
        opened.append(read1_file)
        cmd.extend([fd_path(read1_file)])

        if paired_end:
            read2_file = cr_utils.open_maybe_gzip(args.read2s_chunk)
            opened.append(read2_file)
            cmd.extend([fd_path(read2_file)])

        print(cmd)

        return tk_subproc.check_call(cmd)
    finally:
        for handle in opened:
            handle.close()
def concatenate_and_index_fastas(out_fasta, fastas):
    """Concatenate ``fastas`` into ``out_fasta`` and index the result.

    The index is produced by ``samtools faidx``, executed in the current
    working directory.
    """
    cr_io.concatenate_files(out_fasta, fastas)

    faidx_cmd = ['samtools', 'faidx', out_fasta]
    tk_subproc.check_call(faidx_cmd, cwd=os.getcwd())
def get_consensus_quals(in_bam, clonotype_name, in_fasta, sel_contigs, contig_umis, out_dir):
    """Compute base quality scores of a sequence.

    Reads assigned to the selected contigs are collected from ``in_bam``,
    written as a FASTQ pair next to ``in_fasta`` (``<prefix>_1.fastq`` and
    ``<prefix>_2.fastq``), and ``vdj_asm base-quals`` is run on that prefix.
    The quality line is then read from ``<out_dir>/<prefix basename>.fastq``.

    Args:
        - in_bam: bam file to get the list of reads assigned to UMIs on the
            selected contigs
        - clonotype_name: Used for naming output files.
        - in_fasta: path of the FASTA holding the sequence; a trailing
            ``.fasta`` is stripped to form the prefix for the FASTQ files
            and for ``vdj_asm``.
        - sel_contigs: Contigs that led to the consensus sequence above
        - contig_umis: from contig name to list of umis assigned to that
            contig
        - out_dir: directory passed to ``vdj_asm base-quals`` and from which
            the resulting FASTQ is read.

    Return value:
        String with base qualities (in FASTQ format).

    Raises:
        AssertionError: if no read was selected.
    """
    # Strip only a literal, trailing ".fasta". An unescaped, unanchored
    # '.fasta' pattern would also rewrite things like "_fasta" anywhere in
    # the path (e.g. in a directory name).
    fasta_prefix = re.sub(r'\.fasta$', '', in_fasta)
    pref = os.path.basename(fasta_prefix)
    fastq1 = fasta_prefix + '_1.fastq'
    fastq2 = fasta_prefix + '_2.fastq'

    # read name -> [read1, read2]; a slot stays None if that mate is absent.
    sel_reads = {}

    for contig in sel_contigs:
        umi_read_count = Counter()
        barcode = contig.split('_')[0]
        contig_read_count = 0

        # Wrap contig w/ str() because pysam crashes on unicode input
        for read in in_bam.fetch(str(contig)):
            # NOTE: Assembler assumes that any tags are part of the read name
            # BUT the bam that we feed to this stage has the tags stripped out
            # of the name.
            umi = read.get_tag(PROCESSED_UMI_TAG)
            if umi in contig_umis[contig] and not read.is_secondary:
                # Cap the reads taken per UMI and per contig.
                umi_read_count[umi] += 1
                if umi_read_count[umi] >= MAX_READS_PER_UMI:
                    continue

                contig_read_count += 1
                if contig_read_count >= MAX_READS_PER_CONTIG:
                    continue

                if read.qname not in sel_reads:
                    sel_reads[read.qname] = [None, None]
                # is_read2 (False/True) indexes slot 0 or 1.
                sel_reads[read.qname][read.is_read2] = read

    with open(fastq1, 'w') as f1, open(fastq2, 'w') as f2:
        for read_name, pair in sel_reads.items():
            read1, read2 = pair[0], pair[1]

            if read1 is None:
                umi = read2.get_tag(PROCESSED_UMI_TAG)
            else:
                umi = read1.get_tag(PROCESSED_UMI_TAG)

            # Replace the UMI with <BC>_<UMI>.
            # NOTE(review): `barcode` is the one left over from the last
            # contig of the loop above - confirm this is intended when the
            # selected contigs come from different barcodes.
            header = cr_fastq.AugmentedFastqHeader(read_name)
            header.set_tag(PROCESSED_UMI_TAG, barcode + '_' + umi)
            header.set_tag(PROCESSED_BARCODE_TAG, barcode)

            # Reverse-strand alignments are flipped back, so the sequence is
            # reverse-complemented and the qualities reversed. A missing
            # mate is written as an empty record.
            if read1 is None:
                out_seq1 = ""
                out_quals1 = ""
            else:
                out_seq1 = tk_seq.get_rev_comp(read1.seq) if read1.is_reverse else read1.seq
                out_quals1 = read1.qual[::-1] if read1.is_reverse else read1.qual
            tk_fasta.write_read_fastq(f1, header.to_string(), out_seq1, out_quals1)

            if read2 is None:
                out_seq2 = ""
                out_quals2 = ""
            else:
                out_seq2 = tk_seq.get_rev_comp(read2.seq) if read2.is_reverse else read2.seq
                out_quals2 = read2.qual[::-1] if read2.is_reverse else read2.qual
            tk_fasta.write_read_fastq(f2, header.to_string(), out_seq2, out_quals2)

    assert (len(sel_reads) > 0)

    cmd = ['vdj_asm', 'base-quals', fasta_prefix, out_dir]
    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')
    tk_subproc.check_call(cmd, cwd=os.getcwd())

    with open(os.path.join(out_dir, pref + '.fastq'), 'r') as f:
        lines = f.readlines()

    # Fourth line of the first FASTQ record is its quality string.
    return lines[3].strip()
def get_consensus_seq(clonotype_name, sel_contigs, best_contig, out_dir, args):
    """Build a consensus sequence from a set of contigs.

    Args:
        - clonotype_name: Used to prefix output files.
        - sel_contigs: Names of contigs to use for consensus building.
        - best_contig: Name of "best" contig. Will search for this contig's
            sequence and base qualities.
        - out_dir: dir used for temporary results
        - args: stage args; ``contigs_fastq`` is read.

    Return value:
        A tuple (best_contig_seq, best_contig_quals, consensus_seq,
        out_bam_name, out_fastq_name, out_fasta_name).
        - best_contig_seq/best_contig_quals: the sequence and quals of the
            best contig
        - consensus_seq: the consensus sequence or None if no consensus
            could be built (too few contigs, or the assembler produced no
            output).
        - out_bam_name: Path of BAM with alignments of contigs to consensus
            seq.
        - out_fastq_name: FASTQ with contig sequences.
        - out_fasta_name: FASTA with consensus sequence (or the best contig
            sequence when there is no consensus).
    """
    best_contig_seq = None
    best_contig_quals = None

    # Input to base quality computation - we don't really need the
    # base qualities because we will replace them by read-based qualities
    # But we need to do this to get proper alignments of contigs against
    # the consensus.
    out_fastq_name = martian.make_path(clonotype_name + '_contigs.fastq')

    # Input to assembly
    out_bam_name = martian.make_path(clonotype_name + '_contigs.bam')

    # The reference in the output bam doesn't really matter.
    out_bam, _ = tk_bam.create_bam_outfile(out_bam_name, ['chr1'], [1])

    # Read the entire fastq (all contigs) and write the selected contigs to
    # a bam for the assembler and a fastq for the aligner.
    with open(args.contigs_fastq, 'r') as f, open(out_fastq_name, 'w') as out_fq:
        for name, seq, quals in tk_fasta.read_generator_fastq(f):
            if name not in sel_contigs:
                continue

            if name == best_contig:
                best_contig_seq = seq
                best_contig_quals = quals

            header = cr_fastq.AugmentedFastqHeader(name)
            # Create a pseudo-UMI for each input contig
            header.set_tag(PROCESSED_UMI_TAG, name)
            # Put all reads on the same "barcode". This is important, so
            # the assembler assembles all of them together.
            header.set_tag(PROCESSED_BARCODE_TAG, clonotype_name)

            record = pysam.AlignedRead()
            record.reference_start = 0
            record.reference_id = 0
            # Wrap with str() or pysam will crash when given unicode
            record.qname = str(header.to_string())
            record.seq = seq
            record.qual = quals
            record.flag = MAPPED_UNPAIRED_FLAG
            out_bam.write(record)

            # Now change the tags. The final bam concatenation code will pull
            # the tags out of the header, so we want these to be meaningful.
            # Put the real barcode in the barcode tag. The alignment-base-qual
            # code will ignore it anyway.
            header.set_tag(PROCESSED_BARCODE_TAG, name.split('_')[0])
            tk_fasta.write_read_fastq(out_fq, header.to_string(), seq, quals)

    out_bam.close()

    assert best_contig_seq is not None

    out_fasta_name = martian.make_path(clonotype_name + '_contigs.fasta')

    # Run the assembler to produce a consensus sequence. Read contig-reads
    # from out_bam_name. The resulting sequences will be in
    # out_dir/<clonotype_name>_contigs.fasta. This is the only output of the
    # assembler we care about.
    out_seq = None
    if len(sel_contigs) >= MIN_CONTIGS_FOR_CONSENSUS:
        asm_cmd = [
            'vdj_asm', 'asm',
            out_bam_name,
            out_dir,
            '--single-end',
            '--cons',  # required so we produce a single output sequence
            '--kmers=0',
            '--min-qual=0',
            '--score-factor=0.0',
        ]
        sys.stderr.write('Running ' + ' '.join(asm_cmd) + '\n')
        tk_subproc.check_call(asm_cmd, cwd=os.getcwd())

        asm_fasta = os.path.join(out_dir, clonotype_name + '_contigs.fasta')
        with open(asm_fasta, 'r') as contig_f:
            lines = contig_f.readlines()
            # In some rare cases (eg. input contigs have 0 quality), assembly
            # might fail and leave the file empty; out_seq then stays None.
            if lines:
                out_seq = lines[1].strip()

    # Write the best contig sequence on a new fasta. We need to make sure
    # this has the right contig name because this will be the name written
    # in the bam alignments of the contigs against the consensus
    with open(out_fasta_name, 'w') as f:
        tk_fasta.write_read_fasta(f, clonotype_name,
                                  out_seq or best_contig_seq)

    # Now align the same reads that were used in vdj_asm against the
    # consensus that you just got. The output will be in
    # out_dir/<clonotype_name> + '_contigs.bam'
    quals_cmd = [
        'vdj_asm', 'base-quals',
        martian.make_path(clonotype_name + '_contigs'),
        out_dir,
        '--single-end',
    ]
    sys.stderr.write('Running ' + ' '.join(quals_cmd) + '\n')
    tk_subproc.check_call(quals_cmd, cwd=os.getcwd())

    # Move the BAM of the contigs aligned against the consensus out of the
    # outs (Will overwrite this bam which was already used as input to
    # assembly).
    aligned_bam = os.path.join(out_dir, clonotype_name + '_contigs.bam')
    cr_io.move(aligned_bam, out_bam_name)

    return (best_contig_seq, best_contig_quals, out_seq, out_bam_name,
            out_fastq_name, out_fasta_name)
def build_reference_fasta_from_ensembl(gtf_paths, transcripts_to_remove_path,
                                       genome_fasta_path, reference_path,
                                       reference_name, ref_version,
                                       mkref_version):
    """Create cellranger-compatible vdj reference files from a list of ENSEMBL-like GTF files.

    Input files are concatenated. No attempt to merge/reconcile information
    across them is made. Providing the files in a different order might change
    the output in cases where there are multiple entries with the same
    transcript id and the same feature type (eg. V-region).

    Args:
        gtf_paths: paths of the GTF files to read, in order.
        transcripts_to_remove_path: optional path of a text file with one
            transcript id per line; entries with these ids are dropped.
            Falsy means nothing is removed.
        genome_fasta_path: genome FASTA; it is copied into reference_path,
            indexed with `samtools faidx`, and the copy is deleted at the end.
        reference_path: output reference folder. The directory holding the
            VDJ reference FASTA is created with os.makedirs.
        reference_name: stored in the metadata JSON.
        ref_version: stored in the metadata JSON.
        mkref_version: stored in the metadata JSON.

    Raises:
        ValueError: if a region type, gene name or record ID contains a space.
    """
    if transcripts_to_remove_path:
        with open(transcripts_to_remove_path) as f:
            rm_transcripts = set(line.strip() for line in f)
    else:
        rm_transcripts = set()

    # Note: We cannot symlink here because some filesystems in the wild
    #       do not support symlinks.
    print('Copying genome reference sequence...')
    os.makedirs(os.path.dirname(get_vdj_reference_fasta(reference_path)))
    tmp_genome_fa_path = os.path.join(reference_path, 'genome.fasta')
    cr_utils.copy(genome_fasta_path, tmp_genome_fa_path)
    print('...done.\n')

    print('Indexing genome reference sequence...')
    tk_subproc.check_call(['samtools', 'faidx', tmp_genome_fa_path])
    print('...done.\n')

    print('Loading genome reference sequence...')
    genome_fasta = pysam.FastaFile(tmp_genome_fa_path)
    print('...done.\n')

    # Everything that needs the indexed genome runs inside this try block so
    # the FASTA handle is released before the temporary copy is deleted.
    try:
        print('Computing hash of genome FASTA file...')
        fasta_hash = cr_utils.compute_hash_of_file(tmp_genome_fa_path)
        print('...done.\n')

        transcripts = _read_ensembl_vdj_transcripts(gtf_paths, rm_transcripts)
        print('...done.\n')

        print('Computing hash of genes GTF files...')
        digest = hashlib.sha1()
        # concatenate all the hashes into a string and then hash that string
        digest.update(''.join(
            cr_utils.compute_hash_of_file(gtf) for gtf in gtf_paths))
        gtf_hash = digest.hexdigest()
        print('...done.\n')

        print('Fetching sequences...')
        with open(get_vdj_reference_fasta(reference_path), 'w') as out_fasta:
            _write_ensembl_vdj_features(transcripts, genome_fasta, out_fasta)
        print('...done.\n')
    finally:
        genome_fasta.close()

    print('Deleting copy of genome fasta...')
    os.remove(tmp_genome_fa_path)
    os.remove(tmp_genome_fa_path + '.fai')
    print('...done.\n')

    print('Writing metadata JSON file into reference folder...')
    metadata = {
        cr_constants.REFERENCE_GENOMES_KEY: reference_name,
        cr_constants.REFERENCE_FASTA_HASH_KEY: fasta_hash,
        cr_constants.REFERENCE_GTF_HASH_KEY: gtf_hash,
        cr_constants.REFERENCE_INPUT_FASTA_KEY: os.path.basename(genome_fasta_path),
        cr_constants.REFERENCE_INPUT_GTF_KEY: ','.join(
            [os.path.basename(gtf_path) for gtf_path in gtf_paths]),
        cr_constants.REFERENCE_VERSION_KEY: ref_version,
        cr_constants.REFERENCE_MKREF_VERSION_KEY: mkref_version,
        cr_constants.REFERENCE_TYPE_KEY: vdj_constants.REFERENCE_TYPE,
    }
    metadata_path = os.path.join(reference_path,
                                 cr_constants.REFERENCE_METADATA_FILE)
    with open(metadata_path, 'w') as json_file:
        json.dump(tk_safe_json.json_sanitize(metadata),
                  json_file,
                  sort_keys=True,
                  indent=4)
    print('...done.\n')


def _read_ensembl_vdj_transcripts(gtf_paths, rm_transcripts):
    """Collect the 5'UTR/CDS GTF entries with a VDJ biotype, grouped by (transcript_id, region type)."""
    transcripts = collections.defaultdict(list)

    for gtf in gtf_paths:
        print('Reading GTF {}'.format(gtf))

        # Scope the handle so each GTF is closed once it has been consumed.
        with open(gtf) as gtf_file:
            for line_no, entry in enumerate(get_gtf_iter(gtf_file)):
                if entry.feature not in (ENSEMBL_FIVE_PRIME_UTR_FEATURE,
                                         ENSEMBL_CDS_FEATURE):
                    continue

                entry = parse_attributes(entry)
                transcript_id = entry.attributes.get('transcript_id')
                transcript_biotype = entry.attributes.get('transcript_biotype')
                gene_biotype = entry.attributes.get('gene_biotype')
                gene_name = entry.attributes.get('gene_name')

                # Skip irrelevant biotypes
                transcript_is_vdj = transcript_biotype in ENSEMBL_VDJ_BIOTYPES
                if not transcript_is_vdj and gene_biotype not in ENSEMBL_VDJ_BIOTYPES:
                    continue

                # Skip transcripts the caller asked to remove
                if transcript_id in rm_transcripts:
                    continue

                # Warn and skip if transcript_id missing
                if transcript_id is None:
                    print('Warning: Entry on row %d has no transcript_id' % line_no)
                    continue

                # Warn and skip if gene_name missing
                if gene_name is None:
                    print('Warning: Transcript %s on row %d has biotype %s but no gene_name. Skipping.' % (
                        transcript_id, line_no, transcript_biotype))
                    continue

                # Infer region type from biotype; the transcript biotype
                # wins over the gene biotype when both qualify.
                biotype = transcript_biotype if transcript_is_vdj else gene_biotype
                vdj_feature = infer_ensembl_vdj_feature_type(entry.feature, biotype)

                # Warn and skip if region type could not be inferred
                if vdj_feature is None:
                    print('Warning: Transcript %s has biotype %s. Could not infer VDJ gene type. Skipping.' % (
                        transcript_id, transcript_biotype))
                    continue

                # Features that share a transcript_id and feature type are presumably exons
                # so keep them together.
                transcripts[(transcript_id, vdj_feature)].append(entry)

    return transcripts


def _write_ensembl_vdj_features(transcripts, genome_fasta, out_fasta):
    """Fetch each grouped transcript's sequence and write it to out_fasta as a VDJ reference entry."""
    feature_id = 1
    seen_features = set()

    for (transcript_id, region_type), regions in transcripts.iteritems():
        if not all(r.chrom == regions[0].chrom for r in regions):
            chroms = sorted(list(set([r.chrom for r in regions])))
            print('Warning: Transcript %s spans multiple contigs: %s. Skipping.' % (
                transcript_id, str(chroms)))
            continue

        if not all(r.strand == regions[0].strand for r in regions):
            print('Warning: Transcript %s spans multiple strands. Skipping.' % transcript_id)
            continue

        chrom = regions[0].chrom
        strand = regions[0].strand
        ens_gene_name = standardize_ensembl_gene_name(
            regions[0].attributes['gene_name'])
        transcript_id = regions[0].attributes['transcript_id']

        if chrom not in genome_fasta:
            print('Warning: Transcript %s is on contig "%s" which is not in the provided reference fasta. Skipping.' % (
                transcript_id, chrom))
            continue

        # Build sequence.
        # Sort numerically: the coordinates are int()-converted below, which
        # suggests they may arrive as strings; a plain sort on strings would
        # order "1000" before "999" and scramble the exons.
        regions.sort(key=lambda r: int(r.start))
        # GTF coordinates are 1-based
        seq = ''.join(
            genome_fasta.fetch(chrom, int(region.start) - 1, int(region.end))
            for region in regions)

        # Revcomp if transcript on reverse strand
        if strand == '-':
            seq = tk_seq.get_rev_comp(seq)

        # Strip Ns from termini
        if 'N' in seq:
            print('Warning: Feature %s contains Ns. Stripping from the ends.' % str(
                (ens_gene_name, transcript_id, region_type)))
            seq = seq.strip('N')

        if not seq:
            print('Warning: Feature %s is all Ns. Skipping.' % str(
                (ens_gene_name, transcript_id, region_type)))
            continue

        # Infer various attributes from the Ensembl gene name
        record_id = transcript_id
        gene_name = ens_gene_name
        display_name = make_display_name(gene_name=gene_name, allele_name=None)
        chain = infer_ensembl_vdj_chain(gene_name)
        chain_type = infer_ensembl_vdj_chain_type(gene_name)
        # Ensembl doesn't encode alleles
        allele_name = '00'

        # Disallow spaces in these fields
        if ' ' in region_type:
            raise ValueError('Spaces not allowed in region type: "%s"' % region_type)
        if ' ' in gene_name:
            raise ValueError('Spaces not allowed in gene name: "%s"' % gene_name)
        if ' ' in record_id:
            raise ValueError('Spaces not allowed in record ID: "%s"' % record_id)

        # Warn on features we couldn't classify properly
        if chain_type not in vdj_constants.VDJ_CHAIN_TYPES:
            print(('Warning: Could not infer chain type for: %s. '
                   'Expected the first two characters of the gene name to be in %s. Feature skipped.') %
                  (str((gene_name, record_id, region_type)),
                   str(tuple(vdj_constants.VDJ_CHAIN_TYPES))))
            continue

        if region_type in vdj_constants.VDJ_C_FEATURE_TYPES and chain in vdj_constants.CHAINS_WITH_ISOTYPES:
            isotype = infer_ensembl_isotype(ens_gene_name)
        else:
            isotype = None

        feature = VdjAnnotationFeature(
            feature_id=feature_id,
            record_id=record_id,
            display_name=display_name,
            gene_name=gene_name,
            region_type=region_type,
            chain_type=chain_type,
            chain=chain,
            isotype=isotype,
            allele_name=allele_name,
            sequence=seq,
        )

        # Don't add duplicate entries
        feat_key = get_duplicate_feature_key(feature)
        if feat_key in seen_features:
            print('Warning: Skipping duplicate entry for %s (%s, %s).' % (
                display_name, region_type, record_id))
            continue
        seen_features.add(feat_key)

        # Only features that are actually written consume an id.
        feature_id += 1

        out_fasta.write(convert_vdj_feature_to_fasta_entry(feature) + '\n')
def main(args, outs):
    """Build, annotate and write a consensus sequence for every clonotype consensus in this chunk.

    Reads the clonotype assignments (JSON), the chunk's contig annotations,
    the assembler UMI summary (TSV) and the contig BAM named in `args`.
    For each consensus of each clonotype listed in args.chunk_clonotypes it:
      * runs get_consensus_seq and, when a consensus was assembled,
        recomputes base qualities with get_consensus_quals (otherwise the
        "best" contig's sequence and qualities are used);
      * annotates the result, falling back to the best contig when the
        annotated CDR3 does not match the clonotype's CDR3;
      * when a concatenated reference can be built, annotates it and aligns
        the contigs plus the consensus against it.

    Writes outs.consensus_fastq, outs.consensus_fasta, outs.concat_ref_fasta,
    outs.consensus_annotations_json, outs.concat_ref_annotations_json and
    outs.chunked_reporter, and fills outs.chunked_consensus_bams and
    outs.chunked_concat_ref_bams with the paths of the produced BAMs.

    If there are no clonotype assignments or the contig BAM has no sequences,
    only an empty reporter is saved.
    """
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()
    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(
            args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    # Get the clonotype-barcode assignments
    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)

    # Partition contig annotations by consensus id
    consensus_to_contigs = defaultdict(list)
    relevant_contig_ids = set()

    with open(args.chunk_annotations) as f:
        contigs = vdj_annot.load_contig_list_from_json(
            f, args.vdj_reference_path)

    clo_key = '%s_clonotype_id' % args.metric_prefix
    cons_key = '%s_consensus_id' % args.metric_prefix

    for contig in contigs:
        clo_id = contig.info_dict.get(clo_key)
        cons_id = contig.info_dict.get(cons_key)
        assert clo_id in chunk_clonotypes and cons_id is not None
        consensus_to_contigs[cons_id].append(contig)
        relevant_contig_ids.add(contig.contig_name)

    assert len(consensus_to_contigs) > 0

    n_merged_bams = 0
    consensus_contigs = []
    ref_contigs = []

    in_bam = tk_bam.create_bam_infile(args.contig_bam)
    # try/finally + with: the BAM and the three output files are released
    # even when one of the assertions or subprocess calls below fails.
    try:
        contig_umis = _load_contig_umis(args.umi_summary_tsv,
                                        relevant_contig_ids)

        with open(outs.consensus_fastq, 'w') as consensus_fastq, \
                open(outs.consensus_fasta, 'w') as consensus_fasta, \
                open(outs.concat_ref_fasta, 'w') as ref_fasta:

            assert (args.metric_prefix in reporter.vdj_clonotype_types)

            # Iterate over clonotype assignments
            for clonotype_id, clonotype in clonotypes.iteritems():
                if not clonotype_id in chunk_clonotypes:
                    continue

                for consensus_id, consensus in clonotype['consensuses'].iteritems():
                    cdr = consensus['cdr3_seq']

                    sel_contigs = consensus_to_contigs[consensus_id]
                    sel_contig_ids = [c.contig_name for c in sel_contigs]

                    # Verify that the contig annotation data are consistent
                    # with the clonotype assignment data
                    assert set(consensus['cell_contigs']) == set(sel_contig_ids)

                    # Keep track of the "best" contig. This will be used in case the
                    # merging fails.
                    best_contig = None

                    # Keep track of the set of distinct annotations of the contigs to merge.
                    # Will use to report rate of discrepancies.
                    feature_annotations = defaultdict(set)

                    for contig in sel_contigs:
                        for anno in contig.annotations:
                            feature_annotations[anno.feature.region_type].add(
                                anno.feature.gene_name)

                        # Always choose a productive over a non-productive. Between
                        # contigs with the same productivity, choose the one that had more UMIs.
                        if best_contig is None or \
                                (not best_contig.productive and contig.productive) or \
                                (best_contig.productive == contig.productive and
                                 best_contig.umi_count < contig.umi_count):
                            best_contig = contig

                    assert best_contig is not None

                    anno_count = np.max(
                        [len(feature_annotations[v]) for v in VDJ_V_FEATURE_TYPES])
                    metric = reporter._get_metric_attr(
                        'vdj_clonotype_gt1_v_annotations_contig_frac',
                        args.metric_prefix)
                    metric.add(1, filter=anno_count > 1)

                    anno_count = np.max(
                        [len(feature_annotations[v]) for v in VDJ_J_FEATURE_TYPES])
                    metric = reporter._get_metric_attr(
                        'vdj_clonotype_gt1_j_annotations_contig_frac',
                        args.metric_prefix)
                    metric.add(1, filter=anno_count > 1)

                    wrong_cdr_metric = reporter._get_metric_attr(
                        'vdj_clonotype_consensus_wrong_cdr_contig_frac',
                        args.metric_prefix)

                    tmp_dir = martian.make_path(consensus_id + '_outs')
                    cr_io.mkdir(tmp_dir, allow_existing=True)

                    res = get_consensus_seq(consensus_id, sel_contig_ids,
                                            best_contig.contig_name, tmp_dir,
                                            args)
                    (best_seq, best_quals, consensus_seq, contig_to_cons_bam,
                     contig_fastq, contig_fasta) = res

                    outs.chunked_consensus_bams.append(contig_to_cons_bam)

                    # make sure the bam file has the right header (single sequence with this consensus name)
                    # Read the header and close the handle before asserting,
                    # so a failed check does not leave the BAM open.
                    tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
                    bam_refs = tmp_bam.references
                    tmp_bam.close()
                    if list(bam_refs) != [consensus_id]:
                        # Print some info to help us debug
                        print('%s %s' % (bam_refs, consensus_id))
                    assert (list(bam_refs) == [consensus_id])

                    if consensus_seq:
                        # If this is not None, we actually built a consensus, so we have to compute the quals from scratch.
                        # Use a subset of the contigs for computing quals.
                        by_umi_count = sorted(sel_contigs,
                                              key=lambda c: c.umi_count,
                                              reverse=True)
                        qual_contig_ids = [
                            c.contig_name
                            for c in by_umi_count[0:MAX_CELLS_FOR_BASE_QUALS]
                        ]

                        consensus_quals = get_consensus_quals(
                            in_bam, consensus_id, contig_fasta,
                            qual_contig_ids, contig_umis, tmp_dir)
                    else:
                        consensus_seq = best_seq
                        consensus_quals = best_quals

                    assert (len(consensus_seq) == len(consensus_quals))

                    total_read_count = sum([c.read_count for c in sel_contigs])
                    total_umi_count = sum([c.umi_count for c in sel_contigs])

                    contig_info_dict = {
                        'cells': clonotype['barcodes'],
                        'cell_contigs': sel_contig_ids,
                        'clonotype_freq': clonotype['freq'],
                        'clonotype_prop': clonotype['prop'],
                    }

                    contig = annotate_consensus_contig(
                        args.vdj_reference_path,
                        args.min_score_ratios,
                        args.min_word_sizes,
                        consensus_id,
                        clonotype_id,
                        consensus_seq,
                        consensus_quals,
                        read_count=total_read_count,
                        umi_count=total_umi_count,
                        info_dict=contig_info_dict,
                        primers=args.primers)

                    cdr_mismatch = contig.cdr3_seq is None or contig.cdr3_seq != cdr
                    wrong_cdr_metric.add(1, filter=cdr_mismatch)

                    if cdr_mismatch:
                        # Something went wrong. Use "best" contig as the consensus.
                        consensus_seq = best_seq
                        consensus_quals = best_quals
                        contig = annotate_consensus_contig(
                            args.vdj_reference_path,
                            args.min_score_ratios,
                            args.min_word_sizes,
                            consensus_id,
                            clonotype_id,
                            consensus_seq,
                            consensus_quals,
                            read_count=total_read_count,
                            umi_count=total_umi_count,
                            info_dict=contig_info_dict,
                            primers=args.primers)

                    assert (not contig.cdr3_seq is None and contig.cdr3_seq == cdr)

                    consensus_contigs.append(contig)

                    tk_fasta.write_read_fasta(consensus_fasta, consensus_id,
                                              consensus_seq)
                    tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                              consensus_seq, consensus_quals)
                    assert (len(consensus_seq) == len(consensus_quals))

                    ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

                    # Align the contigs and consensus to a synthetic concatenated reference
                    if ref_seq_parts is not None:
                        # Trim the last segment down to the annotated length
                        #   to avoid including the entire (500nt) C-region
                        ref_seq_parts[-1] = ref_seq_parts[-1][
                            0:ref_annos[-1].annotation_match_end]

                        # Concatenate the reference VDJC segments
                        ref_seq = reduce(lambda x, y: x + y, ref_seq_parts)
                        ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                        # Reannotate the reference sequence.
                        # Restrict the annotation to the already-called segments to
                        #   reduce the risk of discordance between the consensus and
                        #   concat_ref annotations.
                        ref_contig = annotate_consensus_contig(
                            args.vdj_reference_path,
                            args.min_score_ratios,
                            args.min_word_sizes,
                            ref_name,
                            clonotype_id,
                            ref_seq,
                            'I' * len(ref_seq),
                            use_features=set(
                                [a.feature.feature_id for a in ref_annos]),
                        )
                        ref_contigs.append(ref_contig)

                        # Add the consensus sequence to the input FASTQ (next to the contigs)
                        with open(contig_fastq, 'a') as contig_fq:
                            # Create a fake UMI and barcode
                            header = cr_fastq.AugmentedFastqHeader(consensus_id)
                            header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                            header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                            tk_fasta.write_read_fastq(contig_fq,
                                                      header.to_string(),
                                                      consensus_seq,
                                                      consensus_quals)

                        # Reuse this file (this had the assembly output but we don't need it anymore)
                        ref_fasta_name = martian.make_path(
                            consensus_id + '_contigs.fasta')
                        with open(ref_fasta_name, 'w') as f:
                            tk_fasta.write_read_fasta(f, ref_name, ref_seq)

                        # Also append to the final output
                        tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                        cmd = [
                            'vdj_asm', 'base-quals',
                            martian.make_path(consensus_id + '_contigs'),
                            tmp_dir, '--single-end'
                        ]
                        sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                        tk_subproc.check_call(cmd, cwd=os.getcwd())

                        # Move out of tmp dir
                        rec_bam = martian.make_path(
                            consensus_id + '_reference.bam')
                        cr_io.move(
                            os.path.join(tmp_dir, consensus_id + '_contigs.bam'),
                            rec_bam)
                        outs.chunked_concat_ref_bams.append(rec_bam)

                    if os.path.isdir(tmp_dir):
                        shutil.rmtree(tmp_dir)

                    # Clean up unneeded files ASAP
                    rm_files([
                        consensus_id + '_contigs.fasta',
                        consensus_id + '_contigs.fastq'
                    ])

                    # Merge N most recent BAM files to avoid filesystem overload
                    if len(outs.chunked_consensus_bams) >= MERGE_BAMS_EVERY:
                        assert len(outs.chunked_consensus_bams) == len(
                            outs.chunked_concat_ref_bams)

                        new_cons_bam = martian.make_path(
                            'merged-consensus-%03d.bam' % n_merged_bams)
                        concatenate_bams(new_cons_bam,
                                         outs.chunked_consensus_bams)
                        rm_files(outs.chunked_consensus_bams)
                        outs.chunked_consensus_bams = [new_cons_bam]

                        new_ref_bam = martian.make_path(
                            'merged-ref-%03d.bam' % n_merged_bams)
                        concatenate_bams(new_ref_bam,
                                         outs.chunked_concat_ref_bams)
                        rm_files(outs.chunked_concat_ref_bams)
                        outs.chunked_concat_ref_bams = [new_ref_bam]

                        n_merged_bams += 1
    finally:
        in_bam.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)


def _load_contig_umis(umi_summary_tsv, relevant_contig_ids):
    """Map contig id -> set of good UMIs, using only TSV rows that mention a relevant contig."""
    # For all contigs relevant to this chunk,
    #   get the assembler umi data required for base qual recalculation.
    # Do not attempt to read into a pandas object because it can be huge.
    contig_umis = defaultdict(set)
    with open(umi_summary_tsv, 'r') as umi_file:
        for line in umi_file:
            fields = line.strip().split('\t')
            # Check the column count BEFORE indexing: blank or truncated rows
            # are skipped instead of raising IndexError on fields[2].
            # A row whose third column is the literal 'umi' is the header.
            if len(fields) < 7 or fields[2] == 'umi':
                continue
            umi = fields[2]
            good_umi = fields[5].lower() == 'true'
            contig_ids = set(fields[6].split(','))
            if good_umi and contig_ids & relevant_contig_ids:
                # Every contig listed on the row gets the UMI, not only the
                # relevant ones (same as the original behaviour).
                for c in contig_ids:
                    contig_umis[c].add(umi)
    return contig_umis