def assemble_amplicons(
        contigs_fa=None, ref_fa=None, ref_gtf=None, outdir='.',
        sample_id='sampleXX', padding=50, min_contig_len=200,
        keep_tmp=False, quiet=False, logfile=None, debug=False
):
    """ Pipeline step to assemble contigs using reference and amplicon regions

    For every GTF record whose feature is 'amplicon', the (padded) reference
    region is written to a temporary FASTA file, the contigs are aligned to
    it with nucmer, and the alignments of the tiling path are merged into a
    single scaffold for that amplicon.

    Args:
        contigs_fa (str): Path to fasta file with assembled contigs
        ref_fa (str): Path to reference fasta file
        ref_gtf (str): Path to reference GTF file with amplicons
        outdir (str): Path to output directory
        sample_id (str): Name to append to scaffold sequence
        padding (int): Bases to include outside reference annotation
        min_contig_len (int): Minimum contig length for tiling path
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_assembly (str): Path to assembled amplicons (FASTA)
        out_summary (str): Path to assembly summary
        out_padded (str): Path to padded output file

    """
    # Check dependencies
    sysutils.check_dependency('nucmer')
    sysutils.check_dependency('delta-filter')
    sysutils.check_dependency('show-tiling')

    # Outputs
    out_assembly = os.path.join(outdir, 'amplicon_assembly.fna')
    out_summary = os.path.join(outdir, 'amplicon_summary.txt')
    out_padded = os.path.join(outdir, 'amplicon_padded.out')
    # The padded output is opened in append mode below, so a file left over
    # from a previous run must be removed first.
    if os.path.exists(out_padded):
        os.unlink(out_padded)

    # Temporary directory
    tempdir = sysutils.create_tempdir(
        'assemble_amplicons', None, quiet, logfile
    )

    # Create fasta file with sequence IDs only (remove description)
    tmp_contigs_fa = sequtils.clean_seqnames_file(contigs_fa, tempdir)

    # Load reference sequence(s)
    refseqs = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}

    # For each amplicon, extract the sequence from the reference and scaffold
    # using nucmer
    amplicon_alignments = []
    amps = [
        gl for gl in gtfparse.gtf_parser(ref_gtf) if gl.feature == 'amplicon'
    ]
    for gl in amps:
        msg = 'Amplicon ref|%s|reg|%s\n' % (gl.chrom, gl.attrs['name'])
        sysutils.log_message(msg, quiet, logfile)

        # Extract reference amplicon, clamping the padded window to the
        # bounds of the reference sequence
        amp_s = max(0, (gl.start - 1) - padding)
        amp_e = min(len(refseqs[gl.chrom]), gl.end + padding)
        ampseq = refseqs[gl.chrom].seq[amp_s:amp_e]
        amplicon_fa = os.path.join(tempdir, 'subject.fa')
        with open(amplicon_fa, 'w') as outh:
            print('>ref|%s|reg|%s' % (gl.chrom, gl.attrs['name']), file=outh)
            print(sequtils.wrap(str(ampseq)), file=outh)

        # Align with nucmer
        fil, til = alignutils.align_nucmer(
            tmp_contigs_fa, amplicon_fa, tempdir,
            min_contig_len=min_contig_len,
            quiet=quiet, logfile=logfile, debug=debug
        )

        # Skip everything else if debugging
        if debug:
            continue

        # Parse tiling and show alignments. Plain 'r' is used instead of the
        # legacy 'rU' mode (rejected by Python 3.11+), and the handle is
        # closed deterministically.
        with open(til, 'r') as tilh:
            trows = [alignutils.TilingRow(l) for l in tilh]

        if not trows:
            # No contig tiles this amplicon; recorded as a failure below
            amplicon_alignments.append((gl.chrom, gl.attrs['name'], None))
        else:
            # Initialize alignment
            amp_seq = SeqIO.read(amplicon_fa, 'fasta')
            combined = alignutils.EmptyReferenceAlignment(
                str(amp_seq.seq).lower()
            )
            for tr in trows:
                out = alignutils.show_aligns(tr.ref, tr.qry, fil)
                for nucaln in alignutils.parse_show_aligns(out):
                    combined = combined.merge_alignments(nucaln)
                    with open(out_padded, 'a') as outh:
                        print(
                            '%s\n%s\n%s' % (
                                tr, combined.raln(), combined.qaln()
                            ),
                            file=outh
                        )
            amplicon_alignments.append(
                (gl.chrom, gl.attrs['name'], combined)
            )

        # Cleanup: the same temporary file names are reused for the next
        # amplicon
        for f in [fil, til, amplicon_fa]:
            if os.path.isfile(f):
                os.unlink(f)

    # Write to output files
    with open(out_assembly, 'w') as outseq, open(out_summary, 'w') as outsum:
        for ref_id, reg, combined in amplicon_alignments:
            amp_id = sequtils.make_seq_id(sid=sample_id, ref=ref_id, reg=reg)
            if combined is None:
                msg1 = '%s\tFAIL\t%d' % (amp_id, 0)
                msg2 = u'%s\tFAIL\t%d\t%s\n' % (amp_id, 0, u"👎🏼")
                # NOTE(review): msg2 is also passed to log_message() below;
                # if log_message writes to logfile this line is logged
                # twice -- confirm whether that is intended.
                if logfile is not None:
                    print(
                        u'%s\tFAIL\t%d\t%s' % (amp_id, 0, u"👎🏼"),
                        file=logfile
                    )
            else:
                # Only the scaffold sequence is used from the returned tuple
                scaf, _, _ = combined.scaffold2()
                msg1 = '%s\tPASS\t%d' % (amp_id, len(scaf))
                msg2 = u'%s\tPASS\t%d\t%s\n' % (amp_id, len(scaf), u"👍🏼")
                print('>%s' % (amp_id), file=outseq)
                print('%s' % sequtils.wrap(scaf), file=outseq)

            print(msg1, file=outsum)
            sysutils.log_message(msg2, quiet, logfile)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'assemble_amplicons', quiet, logfile)

    return out_assembly, out_summary, out_padded
def assemble_scaffold(
        contigs_fa=None,
        ref_fa=None,
        outdir='.',
        seqname='sample01',
        keep_tmp=False,
        quiet=False,
        logfile=None,
        debug=False
):
    """ Pipeline step to assemble contigs to reference scaffold

    Contigs are assembled against the reference with
    alignutils.assemble_to_ref, and the resulting scaffolds are written out
    in three FASTA flavours plus a padded alignment report.

    Args:
        contigs_fa (str): Path to fasta file with assembled contigs
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        seqname (str): Name to append to scaffold sequence
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_scaffold (str): Path to scaffold FASTA. Reference positions that
                            were not covered have 'n'
        out_imputed (str):  Path to imputed FASTA. Reference positions that
                            were not covered have reference base.
        out_aln (str):      Path to FASTA alignment between scaffold and
                            reference.
        out_padded (str):   Path to output with all contigs aligned to
                            reference.
    """
    # All three external tools must be available before any work starts
    for tool in ('nucmer', 'delta-filter', 'show-tiling'):
        sysutils.check_dependency(tool)

    # Output locations
    out_scaffold = os.path.join(outdir, 'scaffold_assembly.fa')
    out_imputed = os.path.join(outdir, 'scaffold_imputed.fa')
    out_aln = os.path.join(outdir, 'scaffold_aligned.fa')
    out_padded = os.path.join(outdir, 'scaffold_padded.out')

    # Scratch space for intermediate files
    tempdir = sysutils.create_tempdir(
        'assemble_scaffold', None, quiet, logfile
    )

    # Contigs with bare sequence IDs (descriptions stripped)
    tmp_contigs_fa = sequtils.clean_seqnames_file(contigs_fa, tempdir)

    with open(out_padded, 'w') as pad_fh:
        scaffolds = alignutils.assemble_to_ref(
            tmp_contigs_fa,
            ref_fa,
            tempdir,
            pad_fh=pad_fh,
            quiet=quiet,
            logfile=logfile,
            debug=debug
        )

    def _label(ref_name):
        # "<reference id up to the first '.'>.<seqname>"
        return '%s.%s' % (ref_name.split('.')[0], seqname)

    # Scaffold sequences
    with open(out_scaffold, 'w') as scaffold_fh:
        for ref_name in sorted(scaffolds.keys()):
            record = '>%s\n%s' % (
                _label(ref_name),
                sequtils.wrap(scaffolds[ref_name].scaffold()),
            )
            print(record, file=scaffold_fh)

    # Imputed sequences
    with open(out_imputed, 'w') as imputed_fh:
        for ref_name in sorted(scaffolds.keys()):
            record = '>%s\n%s' % (
                _label(ref_name),
                sequtils.wrap(scaffolds[ref_name].imputed()),
            )
            print(record, file=imputed_fh)

    # Reference/scaffold alignment pairs, consumed by other pipeline stages
    with open(out_aln, 'w') as aln_fh:
        for ref_name in sorted(scaffolds.keys()):
            label = _label(ref_name)
            aligned = scaffolds[ref_name]
            print('>REF|%s\n%s' % (label, aligned.raln()), file=aln_fh)
            print('>%s\n%s' % (label, aligned.qaln()), file=aln_fh)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'assemble_scaffold', quiet, logfile)

    return out_scaffold, out_imputed, out_aln, out_padded
def vcf_to_consensus(
        vcf=None,
        outdir='.',
        sampidx=0,
        min_dp=5,
        major=0.5,
        minor=0.2,
        keep_tmp=False,
        quiet=False,
        logfile=None,
):
    """ Pipeline step to create consensus sequence from VCF

    Contig names and lengths are taken from the '##contig' header lines.
    Each variant record is passed to call_gt(); called sites are written
    into a per-contig sequence and every position without a call is output
    as 'n'. Contigs with no called position are reported as FAIL and are not
    written to the FASTA file.

    Args:
        vcf (str): Path to variant calls (VCF); a '.gz' extension is read
            with gzip
        outdir (str): Path to output directory
        sampidx (int): Index for sample if multi-sample VCF
        min_dp (int): Minimum depth to call site
        major (float): Allele fraction to make unambiguous call
        minor (float): Allele fraction to make ambiguous call
        keep_tmp (bool): Do not delete temporary directory (unused in this
            step; no temporary directory is created)
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file

    Returns:
        out_fasta (str): Path to consensus FASTA file

    Raises:
        sysutils.PipelineStepError: If no VCF is given, the column header
            line is malformed, or `sampidx` is beyond the samples present.

    """
    # Check inputs
    if vcf is None:
        raise sysutils.PipelineStepError('VCF file is required')

    # Outputs
    out_fasta = os.path.join(outdir, 'consensus.fna')

    sysutils.log_message('[--- vcf_to_consensus ---]\n', quiet, logfile)
    sysutils.log_message('VCF: %s\n' % vcf, quiet, logfile)

    # Parse VCF
    chroms = []
    samples = []

    # Keep a reference to the handle so it can be closed when parsing is
    # done; `lines` is a lazy generator shared by the header loop and the
    # record loop below.
    if os.path.splitext(vcf)[1] == '.gz':
        vcf_fh = gzip.open(vcf, 'rb')
        lines = (l.decode('utf-8').strip('\n') for l in vcf_fh)
    else:
        vcf_fh = open(vcf, 'r')
        lines = (l.strip('\n') for l in vcf_fh)

    try:
        # Parse headers
        for l in lines:
            if l.startswith('##'):
                m = re.match(r'##contig=<ID=(\S+),length=(\d+)>', l)
                if m:
                    chroms.append((m.group(1), int(m.group(2))))
            else:
                # First non-'##' line must be the '#CHROM ...' column header
                if not l.startswith('#'):
                    raise sysutils.PipelineStepError(
                        'Expected VCF column header line, found: %s' % l
                    )
                cols = l.strip('#').split('\t')
                samples = cols[9:]
                break

        if len(samples) <= sampidx:
            msg = 'Sample index %d does not exist. Samples: %s' % (
                sampidx, str(samples)
            )
            raise sysutils.PipelineStepError(msg)

        chrom_ordered = [_[0] for _ in chroms]
        chroms = dict(chroms)
        # '.' marks a position without a call
        newseqs = {c: ['.'] * chroms[c] for c in chroms}
        # NOTE(review): `imputed` is filled in but not used for output;
        # kept for parity with the original implementation.
        imputed = {c: [''] * chroms[c] for c in chroms}

        # The remaining lines of the generator are the variant records
        for l in lines:
            chrom, start, stop, RA, AA, info, svals = parse_vcf_sample(
                l, sampidx
            )
            gt = call_gt(RA, AA, svals, min_dp, major, minor)
            if gt is None:
                imputed[chrom][start - 1] = RA[0].lower()
                continue

            if len(gt) == 1:
                call = gt[0]
            elif all(len(_) == 1 for _ in gt):
                # Several single-base alleles: use an ambiguity code
                call = sequtils.get_ambig(gt)
            else:
                # At least one multi-base allele: take the first allele
                call = ''.join(gt[0])
            newseqs[chrom][start - 1] = call
            imputed[chrom][start - 1] = call
    finally:
        vcf_fh.close()

    sysutils.log_message('Output FASTA: %s\n' % out_fasta, quiet, logfile)
    with open(out_fasta, 'w') as outh:
        for chrom in chrom_ordered:
            new_seqid = sequtils.update_seq_id(chrom, samples[sampidx])
            new_seq = ''.join(newseqs[chrom]).replace('.', 'n')
            # Span from the first to the last called base. Using strip()
            # also covers sequences with only one or two called bases.
            called = new_seq.strip('n')
            if not called:
                msg = u'%s\tFAIL\t%d\t%s\n' % (new_seqid, 0, u"👎🏼")
                # Don't output sequence if not present
            else:
                msg = u'%s\tPASS\t%d\t%s\n' % (
                    new_seqid, len(called), u"👍🏼"
                )
                print(
                    '>%s SM:%s' % (new_seqid, samples[sampidx]), file=outh
                )
                print(sequtils.wrap(new_seq), file=outh)
            sysutils.log_message(msg, quiet, logfile)

    return out_fasta
def extract_pairwise(
        align_json=None,
        outfile=None,
        outfmt=None,
        refreg=None,
        debug=False,
):
    """ Extract sequences or alignments from a pairwise alignment JSON file

    Args:
        align_json (str): Alignment JSON, read with load_slot_json()
        outfile (str): Path to output file; standard output if None
        outfmt (str): Output format, one of:
            'nuc_fa'  - query nucleotide sequence(s) as FASTA
            'prot_fa' - translated query sequence(s) as FASTA
            'aln_fa'  - reference and query alignment rows as FASTA
            'amp_gtf' - lines of the 'padded_gtf' slot
            'tsv'     - alignment columns as tab-separated values
            Any other value produces no output.
        refreg (str): Reference region; parsed with
            sequtils.region_to_tuple(). Only used by 'nuc_fa' and 'prot_fa'
            to restrict output to that region.
        debug (bool): Unused in this function

    Returns:
        None

    """
    outh = sys.stdout if outfile is None else open(outfile, 'w')

    def _write_record(header, nucstr):
        # Write one FASTA record, translating whole codons for 'prot_fa'
        print(header, file=outh)
        if outfmt == 'nuc_fa':
            print(sequtils.wrap(nucstr), file=outh)
        else:
            # Trim trailing bases that do not form a complete codon
            s = Seq(nucstr[:(len(nucstr) // 3) * 3])
            print(sequtils.wrap(str(s.translate())), file=outh)

    try:
        if outfmt == 'nuc_fa' or outfmt == 'prot_fa':
            jaln = load_slot_json(align_json, 'padded_alignments')
            if refreg is None:
                for newname, alignment in jaln.items():
                    # Columns whose 4th field is -1 are skipped
                    nucstr = ''.join(t[2] for t in alignment if t[3] != -1)
                    nucstr = nucstr.replace('*', 'N')
                    _write_record('>%s' % newname, nucstr)
            else:
                refmap = {
                    sequtils.parse_seq_id(k)['ref']: k for k in jaln
                }
                chrom, ref_s, ref_e = sequtils.region_to_tuple(refreg)
                ref_s = ref_s - 1
                alignment = jaln[refmap[chrom]]

                # Get alignment start
                for aln_s in range(len(alignment)):
                    if alignment[aln_s][0] == ref_s:
                        break
                while alignment[aln_s][3] == -1:
                    aln_s += 1

                # Get alignment end
                for aln_e in range(len(alignment) - 1, -1, -1):
                    if alignment[aln_e][0] == ref_e:
                        break
                while alignment[aln_e][3] == -1:
                    aln_e += -1

                nucstr = ''.join(
                    t[2] for t in alignment[aln_s:aln_e] if t[3] != -1
                )
                nucstr = nucstr.replace('*', 'N')
                _write_record('>%s (%s)' % (refmap[chrom], refreg), nucstr)

        elif outfmt == 'aln_fa':
            jaln = load_slot_json(align_json, 'padded_alignments')
            for newname, alignment in jaln.items():
                aid = sequtils.parse_seq_id(newname)
                rstr = ''.join(t[1] for t in alignment).replace('*', 'N')
                qstr = ''.join(t[2] for t in alignment).replace('*', 'N')
                print('>ref|%s|' % aid['ref'], file=outh)
                print(sequtils.wrap(rstr), file=outh)
                print('>sid|%s|' % aid['sid'], file=outh)
                print(sequtils.wrap(qstr), file=outh)

        elif outfmt == 'amp_gtf':
            jgtf = load_slot_json(align_json, 'padded_gtf')
            print('\n'.join(jgtf), file=outh)

        elif outfmt == 'tsv':
            jaln = load_slot_json(align_json, 'padded_alignments')
            for newname, alignment in jaln.items():
                print('# %s' % newname, file=outh)
                for l in alignment:
                    print('\t'.join(str(_) for _ in l), file=outh)
    finally:
        # Close (and flush) only a file opened here; never close sys.stdout
        if outfile is not None:
            outh.close()
def finalize_assembly(
        fq1=None,
        fq2=None,
        fqU=None,
        ref_fa=None,
        outdir='.',
        bt2_preset='very-sensitive',
        sample_id='sampleXX',
        ncpu=1,
        keep_tmp=False,
        quiet=False,
        logfile=None,
        debug=False,
):
    """ Pipeline step to finalize consensus

    Writes a renamed, upper-cased copy of the reference, aligns the reads to
    it, calls variants, copies the results to `outdir`, and indexes the BAM
    and VCF.

    Args:
        fq1 (str): Read file passed to align_reads as `fq1`
        fq2 (str): Read file passed to align_reads as `fq2`
        fqU (str): Read file passed to align_reads as `fqU`
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        bt2_preset (str): Preset name passed to align_reads
        sample_id (str): Sample ID. Unless it is the default 'sampleXX',
            sequence IDs are updated with it and tagged with 'SM:<id>'
        ncpu (int): Number of CPUs, passed to the alignment and variant
            calling steps
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_ref (str): Path to final reference FASTA
        out_aligned (str): Path to final BAM file
        out_vcf (str): Path to final VCF file
        out_bt2 (str): Path to alignment report copied from align_reads

    """
    # Final output locations
    out_ref = os.path.join(outdir, 'final.fna')
    out_aligned = os.path.join(outdir, 'final.bam')
    out_bt2 = os.path.join(outdir, 'final_bt2.out')
    out_vcf = os.path.join(outdir, 'final.vcf.gz')

    # Scratch space for the intermediate alignment and variant calls
    tempdir = sysutils.create_tempdir(
        'finalize_assembly', None, quiet, logfile
    )

    # Copy the reference, renaming sequences when a real sample ID is given
    rename = sample_id != 'sampleXX'
    with open(out_ref, 'w') as ref_fh:
        for record in SeqIO.parse(ref_fa, 'fasta'):
            header = (
                sequtils.update_seq_id(record.id, sample_id)
                + ' SM:%s' % sample_id
                if rename else record.description
            )
            print('>%s' % header, file=ref_fh)
            print(sequtils.wrap(str(record.seq).upper()), file=ref_fh)

    # Options shared by the alignment and variant calling steps
    shared = dict(
        ncpu=ncpu, keep_tmp=keep_tmp, quiet=quiet, logfile=logfile,
        debug=debug,
    )

    # Align reads to the renamed reference
    tmp_aligned, tmp_bt2 = align_reads.align_reads(
        fq1=fq1, fq2=fq2, fqU=fqU, ref_fa=out_ref, outdir=tempdir,
        bt2_preset=bt2_preset, sample_id=sample_id, **shared
    )

    # Call variants on the alignment
    tmp_vcf = call_variants.call_variants(
        aln_bam=tmp_aligned, ref_fa=out_ref, outdir=tempdir,
        emit_all=False, **shared
    )

    # Move results out of the temporary directory
    for src, dest in (
            (tmp_aligned, out_aligned),
            (tmp_bt2, out_bt2),
            (tmp_vcf, out_vcf),
    ):
        shutil.copy(src, dest)

    # Index VCF and BAM
    index_cmds = [
        ['tabix', out_vcf],
        ['samtools', 'index', out_aligned],
    ]
    sysutils.command_runner(
        index_cmds, 'finalize_assembly', quiet, logfile, debug,
    )

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'finalize_assembly', quiet, logfile)

    return out_ref, out_aligned, out_vcf, out_bt2