Exemplo n.º 1
0
def annotate_from_ref(
    align_json=None,
    ref_gtf=None,
    outfile=None,
    outfmt=None,
    debug=False,
):
    """ Transfer reference gene annotations onto padded alignments

    For every 'gene' row in the reference GTF, the reference start and end
    are located in the padded alignment for that chromosome and translated
    to the coordinates stored in column 3 of the alignment rows. A new GTF
    row with the translated coordinates, segment statistics, and called
    regions is printed for each gene.

    Args:
        align_json: Alignment JSON, handed to `load_slot_json` to read the
            'padded_alignments' slot
        ref_gtf: Reference GTF, handed to `gtf_parser`
        outfile (str): Path to output file. Rows are written to stdout when
            this is None
        outfmt: Not used by this function
        debug (bool): Not used by this function

    Returns:
        None

    """
    outh = sys.stdout if outfile is None else open(outfile, 'w')
    try:
        jaln = load_slot_json(align_json, 'padded_alignments')

        # Map the bare reference name to the full alignment key
        refmap = {parse_seq_id(k)['ref']: k for k in jaln.keys()}
        for gr in gtf_parser(ref_gtf):
            if gr.feature != 'gene':
                continue
            alignment = jaln[refmap[gr.chrom]]
            # GTF is 1-based inclusive; convert start to 0-based
            ref_s = gr.start - 1
            ref_e = gr.end

            # Get alignment start
            # NOTE(review): if no row matches ref_s the index is left at the
            # last row and the scan below can run off the end - confirm that
            # inputs always contain the coordinate.
            for aln_s in range(len(alignment)):
                if alignment[aln_s][0] == ref_s:
                    break
            # Move forward past rows flagged with -1 in column 3
            while alignment[aln_s][3] == -1:
                aln_s += 1

            # Get alignment end (search backwards from the last row)
            for aln_e in range(len(alignment) - 1, -1, -1):
                if alignment[aln_e][0] == ref_e:
                    break
            # Move backward past rows flagged with -1 in column 3
            while alignment[aln_e][3] == -1:
                aln_e += -1

            con_s = alignment[aln_s][3]
            con_e = alignment[aln_e][3]

            new_gr = GTFRow()
            new_gr.chrom, new_gr.source = (refmap[gr.chrom], 'haphpipe')
            new_gr.feature = gr.feature
            new_gr.start, new_gr.end = (con_s + 1, con_e)
            new_gr.score, new_gr.strand, new_gr.frame = ('.', gr.strand,
                                                         gr.frame)
            new_gr.attrs['name'] = gr.attrs['name']

            segment = alignment[aln_s:aln_e + 1]
            # Include statistics in attributes
            new_gr.attrs.update(get_seg_stats(segment))
            # Get the regions that are actually called
            creg = called_regions(segment)
            new_gr.attrs['call_reg'] = ','.join('%d-%d' % t for t in creg)
            new_gr.attrs['call_len'] = sum((t[1] - t[0] + 1) for t in creg)

            print(new_gr, file=outh)
    finally:
        # Only close a handle this function opened; never close stdout
        if outfile is not None:
            outh.close()
Exemplo n.º 2
0
def assemble_amplicons(contigs_fa=None,
                       ref_fa=None,
                       ref_gtf=None,
                       outdir='.',
                       sample_id='sampleXX',
                       padding=50,
                       min_contig_len=200,
                       keep_tmp=False,
                       quiet=False,
                       logfile=None,
                       debug=False):
    """ Pipeline step to assemble contigs using reference and amplicon regions

    Each 'amplicon' row of the reference GTF is cut out of the reference
    (with padding), the contigs are aligned to it with nucmer, and the
    tiling path is merged into one alignment per amplicon. One scaffold per
    amplicon is written to the assembly FASTA and a PASS/FAIL line to the
    summary file.

    Args:
        contigs_fa (str): Path to fasta file with assembled contigs
        ref_fa (str): Path to reference fasta file
        ref_gtf (str): Path to reference GTF file with amplicons
        outdir (str): Path to output directory
        sample_id (str): Name to append to scaffold sequence
        padding (int): Bases to include outside reference annotation
        min_contig_len (int): Minimum contig length for tiling path
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run. Parsing, merging and
            per-amplicon cleanup are skipped in this mode, so no amplicon
            is written to the assembly or summary files

    Returns:
        out_assembly (str): Path to assembled amplicons (FASTA)
        out_summary (str): Path to assembly summary
        out_padded (str): Path to padded output file

    """
    # Check dependencies
    sysutils.check_dependency('nucmer')
    sysutils.check_dependency('delta-filter')
    sysutils.check_dependency('show-tiling')

    # Outputs
    out_assembly = os.path.join(outdir, 'amplicon_assembly.fna')
    out_summary = os.path.join(outdir, 'amplicon_summary.txt')
    out_padded = os.path.join(outdir, 'amplicon_padded.out')
    # The padded file is appended to below, so start from a clean slate
    if os.path.exists(out_padded):
        os.unlink(out_padded)

    # Temporary directory
    tempdir = sysutils.create_tempdir('assemble_amplicons', None, quiet,
                                      logfile)

    # Create fasta file with sequence IDs only (remove description)
    tmp_contigs_fa = sequtils.clean_seqnames_file(contigs_fa, tempdir)

    # Load reference sequence(s)
    refseqs = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}

    # For each amplicon, extract the sequence from the reference and scaffold
    # using nucmer
    amplicon_alignments = []
    amps = [
        gl for gl in gtfparse.gtf_parser(ref_gtf) if gl.feature == 'amplicon'
    ]

    for gl in amps:
        msg = 'Amplicon ref|%s|reg|%s\n' % (gl.chrom, gl.attrs['name'])
        sysutils.log_message(msg, quiet, logfile)
        # Extract reference amplicon, clamped to the reference bounds
        amp_s = max(0, (gl.start - 1) - padding)
        amp_e = min(len(refseqs[gl.chrom]), gl.end + padding)
        ampseq = refseqs[gl.chrom].seq[amp_s:amp_e]
        amplicon_fa = os.path.join(tempdir, 'subject.fa')
        with open(amplicon_fa, 'w') as outh:
            print('>ref|%s|reg|%s' % (gl.chrom, gl.attrs['name']), file=outh)
            print(sequtils.wrap(str(ampseq)), file=outh)

        # Align with nucmer
        fil, til = alignutils.align_nucmer(tmp_contigs_fa,
                                           amplicon_fa,
                                           tempdir,
                                           min_contig_len=min_contig_len,
                                           quiet=quiet,
                                           logfile=logfile,
                                           debug=debug)

        # Skip everything else if debugging
        if debug:
            continue

        # Parse tiling and show alignments.
        # Plain 'r' is used because the 'U' mode flag was removed in
        # Python 3.11; text mode already applies universal newlines.
        with open(til, 'r') as tilh:
            trows = [alignutils.TilingRow(l) for l in tilh]
        if not trows:
            # No contig tiles this amplicon; recorded as a failure below
            amplicon_alignments.append((gl.chrom, gl.attrs['name'], None))
        else:
            # Initialize alignment
            amp_seq = SeqIO.read(amplicon_fa, 'fasta')
            combined = alignutils.EmptyReferenceAlignment(
                str(amp_seq.seq).lower())
            for tr in trows:
                out = alignutils.show_aligns(tr.ref, tr.qry, fil)
                for nucaln in alignutils.parse_show_aligns(out):
                    combined = combined.merge_alignments(nucaln)
                    # Record the merged state after every merge step
                    with open(out_padded, 'a') as outh:
                        print('%s\n%s\n%s' %
                              (tr, combined.raln(), combined.qaln()),
                              file=outh)
            amplicon_alignments.append((gl.chrom, gl.attrs['name'], combined))

        # Cleanup
        for f in [fil, til, amplicon_fa]:
            if os.path.isfile(f):
                os.unlink(f)

    # Write to output files
    with open(out_assembly, 'w') as outseq, open(out_summary, 'w') as outsum:
        for ref_id, reg, combined in amplicon_alignments:
            amp_id = sequtils.make_seq_id(sid=sample_id, ref=ref_id, reg=reg)
            if combined is None:
                msg1 = '%s\tFAIL\t%d' % (amp_id, 0)
                msg2 = u'%s\tFAIL\t%d\t%s\n' % (amp_id, 0, u"👎🏼")
                if logfile is not None:
                    print(u'%s\tFAIL\t%d\t%s' % (amp_id, 0, u"👎🏼"),
                          file=logfile)
            else:
                # Only the scaffold sequence is needed here
                scaf, _s, _e = combined.scaffold2()
                msg1 = '%s\tPASS\t%d' % (amp_id, len(scaf))
                msg2 = u'%s\tPASS\t%d\t%s\n' % (amp_id, len(scaf), u"👍🏼")
                print('>%s' % (amp_id), file=outseq)
                print('%s' % sequtils.wrap(scaf), file=outseq)

            print(msg1, file=outsum)
            sysutils.log_message(msg2, quiet, logfile)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'assemble_amplicons', quiet, logfile)

    return out_assembly, out_summary, out_padded
Exemplo n.º 3
0
def pairwise_align(
    amplicons_fa=None,
    ref_fa=None,
    ref_gtf=None,
    outdir='.',
    keep_tmp=False,
    quiet=False,
    logfile=None,
    debug=False,
):
    """ Pipeline step that aligns each amplicon to its reference

    Every amplicon record is aligned with `baln.alignAA` using the coding
    regions listed on its GTF row. The per-amplicon alignments are then
    joined, per (sample, reference) pair, into one padded alignment that
    spans the whole reference, with an annotation row for each amplicon.
    Everything is written to a single JSON file.

    Args:
        amplicons_fa (str): Path to fasta file with amplicon sequences
        ref_fa (str): Path to reference fasta file
        ref_gtf (str): Path to reference GTF file with amplicons
        outdir (str): Path to output directory
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run (not read in this
            function)

    Returns:
        out_aln (str): Path to alignment in JSON format

    """
    # External program required by this step
    sysutils.check_dependency('blastx')

    # Output location
    out_aln = os.path.join(outdir, 'alignments.json')

    # Scratch space
    tempdir = sysutils.create_tempdir('pairwise_align', None, quiet, logfile)

    # Reference records keyed by sequence ID
    refseqs = {}
    for rec in SeqIO.parse(ref_fa, 'fasta'):
        refseqs[rec.id] = rec

    # Amplicon rows of the GTF keyed by (chromosome, amplicon name)
    amps = [
        row for row in gtfparse.gtf_parser(ref_gtf)
        if row.feature == 'amplicon'
    ]
    ampdict = dict(((row.chrom, row.attrs['name']), row) for row in amps)

    # Result slots; assembled into one JSON object before writing
    aa_alignments = {}
    nuc_alignments = {}
    padded_alignments = {}
    padded_gtf = []

    # {(sid, ref): [(reg, list(alignment)), ...], ...}
    all_nuc_aln = defaultdict(list)

    for amprec in SeqIO.parse(amplicons_fa, 'fasta'):
        # Sample, reference and region are encoded in the sequence ID
        aid = sequtils.parse_seq_id(amprec.id)

        # Locate the GTF row for this amplicon; when the exact
        # (ref, reg) pair is missing, take the first row with the same
        # region name
        try:
            gl = ampdict[(aid['ref'], aid['reg'])]
        except KeyError:
            same_region = [key for key in ampdict if key[1] == aid['reg']]
            gl = ampdict[same_region[0]]

        # Primary coding region as a 0-based, end-exclusive interval
        pri_s = int(gl.attrs['primary_cds'].split('-')[0]) - 1
        pri_e = int(gl.attrs['primary_cds'].split('-')[1])

        # Any additional coding regions, converted the same way
        altcds = []
        if 'alt_cds' in gl.attrs:
            altcds = [
                (int(span.split('-')[0]) - 1, int(span.split('-')[1]))
                for span in gl.attrs['alt_cds'].split(',')
            ]

        # Align using amino acids
        refseq = matching_refseq(refseqs, aid['ref'])
        alnobj, nuc_aln = baln.alignAA(refseq, amprec, (pri_s, pri_e), altcds,
                                       tempdir, quiet)

        sample_ref = (aid['sid'], aid['ref'])
        all_nuc_aln[sample_ref].append((aid['reg'], nuc_aln))
        jid = 'sid|%s|ref|%s|reg|%s|' % (aid['sid'], aid['ref'], aid['reg'])
        aa_alignments[jid] = alnobj.aa_align
        nuc_alignments[jid] = nuc_aln

    # Build one padded, full-length alignment per (sample, reference)
    for (sid, ref), unsorted_segments in all_nuc_aln.items():
        _refseq = matching_refseq(refseqs, ref)
        newname = 'sid|%s|ref|%s|' % (sid, _refseq.id)
        padded = []

        # Order the segments by the reference position of their first row
        segments = list(unsorted_segments)
        segments.sort(key=lambda entry: entry[1][0][0])

        ref_cursor = 0  # next reference position not yet emitted
        qry_cursor = 0  # next position in the padded query
        for seg_name, seg in segments:
            # Fill the gap before this segment with '*' placeholders
            for p in range(ref_cursor, seg[0][0]):
                padded.append((p, str(_refseq.seq[p]), '*', qry_cursor))
                qry_cursor += 1

            seg_first = qry_cursor + 1
            for row in seg:
                if row[3] != -1:
                    # Renumber column 3 into padded-query coordinates
                    padded.append((row[0], row[1], row[2], qry_cursor))
                    qry_cursor += 1
                else:
                    # Rows flagged with -1 are copied through unchanged
                    padded.append(row)

            # Annotation row describing this amplicon in padded coordinates
            gr = GTFRow()
            gr.chrom, gr.source, gr.feature = (newname, 'haphpipe', 'amplicon')
            gr.score, gr.strand, gr.frame = ('.', '+', '.')
            gr.attrs['name'] = seg_name
            gr.start = seg_first
            gr.end = qry_cursor
            # Segment statistics first, then the called region and length
            gr.attrs.update(baln.get_seg_stats(seg))
            gr.attrs['call_reg'] = '%d-%d' % (gr.start, gr.end)
            gr.attrs['call_len'] = (gr.end - gr.start + 1)
            padded_gtf.append(str(gr))

            ref_cursor = seg[-1][0] + 1

        # Fill from the last segment to the end of the reference
        for p in range(ref_cursor, len(_refseq.seq)):
            padded.append((p, str(_refseq.seq[p]), '*', qry_cursor))
            qry_cursor += 1

        # Keep the padded alignment only if it validates
        vseq = ''.join(row[2] for row in padded if row[3] != -1)
        if baln.validate_alignment(padded, _refseq.seq, vseq):
            if not quiet:
                print('%s alignment validation passed' % newname,
                      file=sys.stderr)
            padded_alignments[newname] = padded

    # Echo the annotation rows to the console
    if not quiet:
        for line in padded_gtf:
            print(line, file=sys.stdout)

    out_json = {
        'aa_alignments': aa_alignments,
        'nuc_alignments': nuc_alignments,
        'padded_alignments': padded_alignments,
        'padded_gtf': padded_gtf,
    }
    with open(out_aln, 'w') as outh:
        print(json.dumps(out_json), file=outh)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'pairwise_align', quiet, logfile)

    return out_aln