예제 #1
0
def assemble_amplicons(contigs_fa=None,
                       ref_fa=None,
                       ref_gtf=None,
                       outdir='.',
                       sample_id='sampleXX',
                       padding=50,
                       min_contig_len=200,
                       keep_tmp=False,
                       quiet=False,
                       logfile=None,
                       debug=False):
    """ Pipeline step to assemble contigs using reference and amplicon regions

    For every ``amplicon`` feature in ``ref_gtf`` the padded reference region
    is written to a temporary FASTA, the contigs are aligned to it with
    nucmer, and the alignments for each row of the tiling output are merged
    into a single alignment for that amplicon. One scaffold per amplicon is
    then written to the assembly FASTA, with a PASS/FAIL line per amplicon in
    the summary file.

    Args:
        contigs_fa (str): Path to fasta file with assembled contigs
        ref_fa (str): Path to reference fasta file
        ref_gtf (str): Path to reference GTF file with amplicons
        outdir (str): Path to output directory
        sample_id (str): Name to append to scaffold sequence
        padding (int): Bases to include outside reference annotation
        min_contig_len (int): Minimum contig length for tiling path
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run. Alignments are not
            parsed in this mode, so the assembly and summary files are
            created empty.

    Returns:
        out_assembly (str): Path to assembled amplicons (FASTA)
        out_summary (str): Path to assembly summary
        out_padded (str): Path to padded output file. The file is only
            created once at least one alignment has been written to it.

    """
    # Check dependencies
    sysutils.check_dependency('nucmer')
    sysutils.check_dependency('delta-filter')
    sysutils.check_dependency('show-tiling')

    # Outputs
    out_assembly = os.path.join(outdir, 'amplicon_assembly.fna')
    out_summary = os.path.join(outdir, 'amplicon_summary.txt')
    out_padded = os.path.join(outdir, 'amplicon_padded.out')
    # The padded file is opened in append mode below, so start from scratch
    if os.path.exists(out_padded):
        os.unlink(out_padded)

    # Temporary directory
    tempdir = sysutils.create_tempdir('assemble_amplicons', None, quiet,
                                      logfile)

    # Create fasta file with sequence IDs only (remove description)
    tmp_contigs_fa = sequtils.clean_seqnames_file(contigs_fa, tempdir)

    # Load reference sequence(s)
    refseqs = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}

    # For each amplicon, extract the sequence from the reference and scaffold
    # using nucmer
    amplicon_alignments = []
    amps = [
        gl for gl in gtfparse.gtf_parser(ref_gtf) if gl.feature == 'amplicon'
    ]

    for gl in amps:
        msg = 'Amplicon ref|%s|reg|%s\n' % (gl.chrom, gl.attrs['name'])
        sysutils.log_message(msg, quiet, logfile)

        # Extract reference amplicon. GTF start is converted to a 0-based
        # slice start, and the padded window is clamped to the reference.
        amp_s = max(0, (gl.start - 1) - padding)
        amp_e = min(len(refseqs[gl.chrom]), gl.end + padding)
        ampseq = refseqs[gl.chrom].seq[amp_s:amp_e]
        amplicon_fa = os.path.join(tempdir, 'subject.fa')
        with open(amplicon_fa, 'w') as outh:
            print('>ref|%s|reg|%s' % (gl.chrom, gl.attrs['name']), file=outh)
            print(sequtils.wrap(str(ampseq)), file=outh)

        # Align with nucmer
        fil, til = alignutils.align_nucmer(tmp_contigs_fa,
                                           amplicon_fa,
                                           tempdir,
                                           min_contig_len=min_contig_len,
                                           quiet=quiet,
                                           logfile=logfile,
                                           debug=debug)

        # Skip everything else if debugging
        if debug:
            continue

        # Parse tiling and show alignments. The 'U' mode flag was removed in
        # Python 3.11; plain text mode is used instead and the handle is
        # closed as soon as the rows are parsed.
        with open(til, 'r') as tilh:
            trows = [alignutils.TilingRow(l) for l in tilh]

        if not trows:
            # No contig tiled onto this amplicon: record it as a failure
            amplicon_alignments.append((gl.chrom, gl.attrs['name'], None))
        else:
            # Initialize alignment
            amp_seq = SeqIO.read(amplicon_fa, 'fasta')
            combined = alignutils.EmptyReferenceAlignment(
                str(amp_seq.seq).lower())
            for tr in trows:
                out = alignutils.show_aligns(tr.ref, tr.qry, fil)
                for nucaln in alignutils.parse_show_aligns(out):
                    combined = combined.merge_alignments(nucaln)
                    # Record the running merged alignment after each merge
                    with open(out_padded, 'a') as outh:
                        print('%s\n%s\n%s' %
                              (tr, combined.raln(), combined.qaln()),
                              file=outh)
            amplicon_alignments.append((gl.chrom, gl.attrs['name'], combined))

        # Cleanup per-amplicon files so the next iteration starts clean
        for f in [fil, til, amplicon_fa]:
            if os.path.isfile(f):
                os.unlink(f)

    # Write to output files
    with open(out_assembly, 'w') as outseq, open(out_summary, 'w') as outsum:
        for ref_id, reg, combined in amplicon_alignments:
            amp_id = sequtils.make_seq_id(sid=sample_id, ref=ref_id, reg=reg)
            if combined is None:
                msg1 = '%s\tFAIL\t%d' % (amp_id, 0)
                msg2 = u'%s\tFAIL\t%d\t%s\n' % (amp_id, 0, u"👎🏼")
                # NOTE(review): log_message below is also handed `logfile`,
                # so this may write the FAIL line twice — confirm intent.
                if logfile is not None:
                    print(u'%s\tFAIL\t%d\t%s' % (amp_id, 0, u"👎🏼"),
                          file=logfile)
            else:
                # Only the scaffold sequence is needed; the other two
                # returned values are unused here.
                scaf, _, _ = combined.scaffold2()
                msg1 = '%s\tPASS\t%d' % (amp_id, len(scaf))
                msg2 = u'%s\tPASS\t%d\t%s\n' % (amp_id, len(scaf), u"👍🏼")
                print('>%s' % (amp_id), file=outseq)
                print('%s' % sequtils.wrap(scaf), file=outseq)

            print(msg1, file=outsum)
            sysutils.log_message(msg2, quiet, logfile)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'assemble_amplicons', quiet, logfile)

    return out_assembly, out_summary, out_padded
예제 #2
0
def assemble_scaffold(
        contigs_fa=None, ref_fa=None, outdir='.',
        seqname='sample01',
        keep_tmp=False, quiet=False, logfile=None, debug=False
    ):
    """ Pipeline step to assemble contigs to reference scaffold

    The contigs are cleaned of FASTA descriptions, assembled against the
    reference, and the resulting per-reference scaffolds are written out in
    three FASTA flavours plus the padded alignment file.

    Args:
        contigs_fa (str): Path to fasta file with assembled contigs
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        seqname (str): Name to append to scaffold sequence
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_scaffold (str): Path to scaffold FASTA. Reference positions that
                            were not covered have 'n'
        out_imputed (str):  Path to imputed FASTA. Reference positions that
                            were not covered have reference base.
        out_aln (str):      Path to FASTA alignment between scaffold and
                            reference.
        out_padded (str):   Path to output with all contigs aligned to
                            reference.
    """
    # Required external tools
    for tool in ('nucmer', 'delta-filter', 'show-tiling'):
        sysutils.check_dependency(tool)

    # Output locations
    out_scaffold = os.path.join(outdir, 'scaffold_assembly.fa')
    out_imputed = os.path.join(outdir, 'scaffold_imputed.fa')
    out_aln = os.path.join(outdir, 'scaffold_aligned.fa')
    out_padded = os.path.join(outdir, 'scaffold_padded.out')

    # Scratch space for intermediate files
    workdir = sysutils.create_tempdir(
        'assemble_scaffold', None, quiet, logfile
    )

    # Contigs with sequence IDs only (descriptions removed)
    cleaned_contigs = sequtils.clean_seqnames_file(contigs_fa, workdir)

    with open(out_padded, 'w') as padded_fh:
        scaffolds = alignutils.assemble_to_ref(
            cleaned_contigs, ref_fa, workdir, pad_fh=padded_fh,
            quiet=quiet, logfile=logfile, debug=debug
        )

    def label_for(ref_id):
        # Reference ID up to the first '.', followed by the sample name
        return '%s.%s' % (ref_id.split('.')[0], seqname)

    # Scaffold sequences
    with open(out_scaffold, 'w') as scaffold_fh:
        for ref_id in sorted(scaffolds.keys()):
            label = label_for(ref_id)
            seq = scaffolds[ref_id].scaffold()
            scaffold_fh.write('>%s\n%s\n' % (label, sequtils.wrap(seq)))

    # Imputed sequences
    with open(out_imputed, 'w') as imputed_fh:
        for ref_id in sorted(scaffolds.keys()):
            label = label_for(ref_id)
            seq = scaffolds[ref_id].imputed()
            imputed_fh.write('>%s\n%s\n' % (label, sequtils.wrap(seq)))

    # Reference/scaffold alignment pairs for other pipeline stages
    with open(out_aln, 'w') as aln_fh:
        for ref_id in sorted(scaffolds.keys()):
            label = label_for(ref_id)
            entry = scaffolds[ref_id]
            aln_fh.write('>REF|%s\n%s\n' % (label, entry.raln()))
            aln_fh.write('>%s\n%s\n' % (label, entry.qaln()))

    if not keep_tmp:
        sysutils.remove_tempdir(workdir, 'assemble_scaffold', quiet, logfile)

    return out_scaffold, out_imputed, out_aln, out_padded
예제 #3
0
def vcf_to_consensus(
    vcf=None,
    outdir='.',
    sampidx=0,
    min_dp=5,
    major=0.5,
    minor=0.2,
    keep_tmp=False,
    quiet=False,
    logfile=None,
):
    """ Pipeline step to create consensus sequence from VCF

    Contig names and lengths are read from the ``##contig`` header lines. A
    sequence of that length is built per contig from the genotype called at
    each VCF record; positions without a call are written as ``n``. Contigs
    with no called position at all are reported as FAIL and are not written
    to the output FASTA.

    Args:
        vcf (str): Path to variant calls (VCF). A ``.gz`` extension selects
            gzip decompression.
        outdir (str): Path to output directory
        sampidx (int): Index for sample if multi-sample VCF
        min_dp (int): Minimum depth to call site
        major (float): Allele fraction to make unambiguous call
        minor (float): Allele fraction to make ambiguous call
        keep_tmp (bool): Accepted for consistency with other pipeline steps;
            not used by this function
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file

    Returns:
        out_fasta (str): Path to consensus sequences (FASTA)

    Raises:
        sysutils.PipelineStepError: If ``vcf`` is None, if the header block
            is not followed by a ``#``-prefixed column line, or if
            ``sampidx`` is beyond the samples listed in that line.

    """
    # Check inputs
    if vcf is None:
        raise sysutils.PipelineStepError('VCF file is required')

    # Outputs
    out_fasta = os.path.join(outdir, 'consensus.fna')

    sysutils.log_message('[--- vcf_to_consensus ---]\n', quiet, logfile)
    sysutils.log_message('VCF:          %s\n' % vcf, quiet, logfile)

    # Parse VCF
    chroms = []
    samples = []
    contig_re = re.compile(r'##contig=<ID=(\S+),length=(\d+)>')

    is_gzipped = os.path.splitext(vcf)[1] == '.gz'
    vcf_fh = gzip.open(vcf, 'rb') if is_gzipped else open(vcf, 'r')
    # The handle stays open for both passes (header, then records) because
    # `lines` is a lazy generator over it; `with` guarantees it is closed.
    with vcf_fh:
        if is_gzipped:
            lines = (l.decode('utf-8').strip('\n') for l in vcf_fh)
        else:
            lines = (l.strip('\n') for l in vcf_fh)

        # Parse headers
        for l in lines:
            if l.startswith('##'):
                m = contig_re.match(l)
                if m:
                    chroms.append((m.group(1), int(m.group(2))))
            else:
                # First non-'##' line must be the '#CHROM ...' column line.
                # Raise explicitly: an assert would vanish under `python -O`.
                if not l.startswith('#'):
                    raise sysutils.PipelineStepError(
                        'Malformed VCF: expected column header line, '
                        'found: %s' % l)
                cols = l.strip('#').split('\t')
                samples = cols[9:]
                break

        if len(samples) <= sampidx:
            msg = 'Sample index %d does not exist. Samples: %s' % (
                sampidx, str(samples))
            raise sysutils.PipelineStepError(msg)

        chrom_ordered = [_[0] for _ in chroms]
        chroms = dict(chroms)
        # '.' marks a position with no call; replaced by 'n' on output
        newseqs = {c: ['.'] * n for c, n in chroms.items()}
        # `imputed` additionally carries the lower-cased reference base at
        # uncalled sites. It is not written out by this function.
        imputed = {c: [''] * n for c, n in chroms.items()}

        # Remaining lines are variant records
        for l in lines:
            chrom, start, stop, RA, AA, info, svals = parse_vcf_sample(
                l, sampidx)
            gt = call_gt(RA, AA, svals, min_dp, major, minor)

            if gt is None:
                imputed[chrom][start - 1] = RA[0].lower()
            elif len(gt) == 1:
                newseqs[chrom][start - 1] = gt[0]
                imputed[chrom][start - 1] = gt[0]
            elif all(len(_) == 1 for _ in gt):
                # Several single-base alleles: use an ambiguity code
                ambig = sequtils.get_ambig(gt)
                newseqs[chrom][start - 1] = ambig
                imputed[chrom][start - 1] = ambig
            else:
                # At least one multi-base allele: use the first allele
                newseqs[chrom][start - 1] = ''.join(gt[0])
                imputed[chrom][start - 1] = ''.join(gt[0])

    sysutils.log_message('Output FASTA: %s\n' % out_fasta, quiet, logfile)
    with open(out_fasta, 'w') as outh:
        for chrom in chrom_ordered:
            new_seqid = sequtils.update_seq_id(chrom, samples[sampidx])
            new_seq = ''.join(newseqs[chrom]).replace('.', 'n')
            # Called span = sequence without leading/trailing 'n'. Using
            # strip() also handles spans of one or two bases, which the
            # previous regex could not match (it raised AttributeError).
            called = new_seq.strip('n')
            if not called:
                msg = u'%s\tFAIL\t%d\t%s\n' % (new_seqid, 0, u"👎🏼")
                # Don't output sequence if not present
            else:
                msg = u'%s\tPASS\t%d\t%s\n' % (new_seqid, len(called),
                                                u"👍🏼")
                print('>%s SM:%s' % (new_seqid, samples[sampidx]), file=outh)
                print(sequtils.wrap(new_seq), file=outh)

            sysutils.log_message(msg, quiet, logfile)

    return out_fasta
예제 #4
0
def _write_pairwise_fasta_record(outh, header, nucstr, outfmt):
    """Write one FASTA record; translate to protein unless outfmt is 'nuc_fa'."""
    print('>%s' % header, file=outh)
    if outfmt == 'nuc_fa':
        print(sequtils.wrap(nucstr), file=outh)
    else:
        # Drop trailing bases that do not form a complete codon
        s = Seq(nucstr[:(len(nucstr) // 3) * 3])
        print(sequtils.wrap(str(s.translate())), file=outh)


def extract_pairwise(
    align_json=None,
    outfile=None,
    outfmt=None,
    refreg=None,
    debug=False,
):
    """Extract sequences, alignments or annotations from a pairwise JSON file.

    Alignment columns are used positionally: ``t[0]`` is compared against
    reference coordinates, ``t[1]`` and ``t[2]`` are joined into the reference
    and sample strings, and ``t[3] == -1`` marks columns that are left out of
    the sample sequence. (Presumably (ref_pos, ref_base, qry_base, qry_pos);
    confirm against whatever produces ``align_json``.)

    Args:
        align_json: Passed to ``load_slot_json`` to load the
            'padded_alignments' or 'padded_gtf' slot.
        outfile (str): Path to output file. Output goes to stdout when None.
        outfmt (str): One of 'nuc_fa', 'prot_fa', 'aln_fa', 'amp_gtf' or
            'tsv'. Any other value produces no output.
        refreg (str): Region string parsed by ``sequtils.region_to_tuple``.
            Only used for 'nuc_fa' and 'prot_fa'; when None every alignment
            is written in full.
        debug (bool): Unused; kept for interface compatibility.

    Returns:
        None
    """
    outh = sys.stdout if outfile is None else open(outfile, 'w')
    try:
        if outfmt == 'nuc_fa' or outfmt == 'prot_fa':
            jaln = load_slot_json(align_json, 'padded_alignments')
            if refreg is None:
                for newname, alignment in jaln.items():
                    nucstr = ''.join(t[2] for t in alignment if t[3] != -1)
                    nucstr = nucstr.replace('*', 'N')
                    _write_pairwise_fasta_record(outh, newname, nucstr,
                                                 outfmt)
            else:
                refmap = {sequtils.parse_seq_id(k)['ref']: k for k in jaln}
                chrom, ref_s, ref_e = sequtils.region_to_tuple(refreg)
                ref_s = ref_s - 1
                alignment = jaln[refmap[chrom]]
                ncols = len(alignment)

                # Get alignment start: first column at ref_s, then move
                # forward past columns flagged -1. Falls back to the last
                # column when ref_s is not present.
                aln_s = 0
                for aln_s in range(ncols):
                    if alignment[aln_s][0] == ref_s:
                        break
                while aln_s < ncols and alignment[aln_s][3] == -1:
                    aln_s += 1

                # Get alignment end: last column at ref_e, then move back
                # past columns flagged -1. Bounded at 0 so the index can
                # never wrap around to the end of the list.
                aln_e = 0
                for aln_e in range(ncols - 1, -1, -1):
                    if alignment[aln_e][0] == ref_e:
                        break
                while aln_e > 0 and alignment[aln_e][3] == -1:
                    aln_e -= 1

                nucstr = ''.join(t[2] for t in alignment[aln_s:aln_e]
                                 if t[3] != -1)
                nucstr = nucstr.replace('*', 'N')
                header = '%s (%s)' % (refmap[chrom], refreg)
                _write_pairwise_fasta_record(outh, header, nucstr, outfmt)

        elif outfmt == 'aln_fa':
            jaln = load_slot_json(align_json, 'padded_alignments')
            for newname, alignment in jaln.items():
                aid = sequtils.parse_seq_id(newname)
                rstr = ''.join(t[1] for t in alignment).replace('*', 'N')
                qstr = ''.join(t[2] for t in alignment).replace('*', 'N')
                print('>ref|%s|' % aid['ref'], file=outh)
                print(sequtils.wrap(rstr), file=outh)
                print('>sid|%s|' % aid['sid'], file=outh)
                print(sequtils.wrap(qstr), file=outh)

        elif outfmt == 'amp_gtf':
            jgtf = load_slot_json(align_json, 'padded_gtf')
            print('\n'.join(jgtf), file=outh)

        elif outfmt == 'tsv':
            jaln = load_slot_json(align_json, 'padded_alignments')
            for newname, alignment in jaln.items():
                print('# %s' % newname, file=outh)
                for l in alignment:
                    print('\t'.join(str(_) for _ in l), file=outh)
    finally:
        # Close only what was opened here; never close the caller's stdout
        if outfile is not None:
            outh.close()
예제 #5
0
def finalize_assembly(
    fq1=None,
    fq2=None,
    fqU=None,
    ref_fa=None,
    outdir='.',
    bt2_preset='very-sensitive',
    sample_id='sampleXX',
    ncpu=1,
    keep_tmp=False,
    quiet=False,
    logfile=None,
    debug=False,
):
    """ Pipeline step to finalize consensus

    Copies the reference into the output directory (upper-cased and, unless
    ``sample_id`` is left at 'sampleXX', with sequence names updated for the
    sample), aligns the reads to that copy, calls variants, copies the
    results out of the temporary directory and indexes the VCF and BAM.

    Args:
        fq1, fq2, fqU: Read inputs, forwarded to ``align_reads.align_reads``
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        bt2_preset (str): Forwarded to ``align_reads.align_reads``
        sample_id (str): Sample name; 'sampleXX' keeps the original
            description lines
        ncpu (int): Forwarded to the alignment and variant-calling steps
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Forwarded to the sub-steps and the command runner

    Returns:
        out_ref (str): Path to the renamed reference ('final.fna')
        out_aligned (str): Path to the alignment ('final.bam')
        out_vcf (str): Path to the variant calls ('final.vcf.gz')
        out_bt2 (str): Path to the aligner output ('final_bt2.out')
    """
    # Output locations
    out_ref = os.path.join(outdir, 'final.fna')
    out_aligned = os.path.join(outdir, 'final.bam')
    out_bt2 = os.path.join(outdir, 'final_bt2.out')
    out_vcf = os.path.join(outdir, 'final.vcf.gz')

    # Scratch space for the alignment and variant-calling sub-steps
    workdir = sysutils.create_tempdir('finalize_assembly', None, quiet,
                                      logfile)

    # Write the reference copy, renaming sequences when a sample ID is given
    keep_descriptions = sample_id == 'sampleXX'
    with open(out_ref, 'w') as ref_out:
        for record in SeqIO.parse(ref_fa, 'fasta'):
            if keep_descriptions:
                header = record.description
            else:
                header = (sequtils.update_seq_id(record.id, sample_id) +
                          ' SM:%s' % sample_id)
            ref_out.write('>%s\n' % header)
            ref_out.write('%s\n' % sequtils.wrap(str(record.seq).upper()))

    # Options shared by both sub-steps
    common = dict(
        ncpu=ncpu,
        keep_tmp=keep_tmp,
        quiet=quiet,
        logfile=logfile,
        debug=debug,
    )

    # Align reads to the renamed reference
    tmp_aligned, tmp_bt2 = align_reads.align_reads(
        fq1=fq1,
        fq2=fq2,
        fqU=fqU,
        ref_fa=out_ref,
        outdir=workdir,
        bt2_preset=bt2_preset,
        sample_id=sample_id,
        **common
    )

    # Call variants against the same reference
    tmp_vcf = call_variants.call_variants(
        aln_bam=tmp_aligned,
        ref_fa=out_ref,
        outdir=workdir,
        emit_all=False,
        **common
    )

    # Move results out of the scratch directory
    for src, dest in (
        (tmp_aligned, out_aligned),
        (tmp_bt2, out_bt2),
        (tmp_vcf, out_vcf),
    ):
        shutil.copy(src, dest)

    # Index VCF and BAM
    index_cmds = [
        ['tabix', out_vcf],
        ['samtools', 'index', out_aligned],
    ]
    sysutils.command_runner(index_cmds, 'finalize_assembly', quiet, logfile,
                            debug)

    if not keep_tmp:
        sysutils.remove_tempdir(workdir, 'finalize_assembly', quiet, logfile)

    return out_ref, out_aligned, out_vcf, out_bt2