Exemplo n.º 1
0
def align_to_refseq(
        reference,
        records,
        score_matrix=None,
        do_codon=True,
        reverse_complement=True,
        expected_identity=None,
        keep_insertions=False,
        **kwargs
        ):
    """Align ``records`` against ``reference`` and gather the results.

    Args:
        reference: the reference sequence; ``len(reference)`` fixes the
            width every aligned record is padded to.
        records: the records handed to ``_align_par`` for alignment.
        score_matrix: scoring matrix; ``BLOSUM62`` from
            ``BioExt.scorematrices`` is loaded when ``None``.
        do_codon: forwarded to ``_align_par``; overridden by a ``codon``
            keyword if one is supplied.
        reverse_complement: forwarded to ``_align_par``; overridden by a
            ``revcomp`` keyword if one is supplied.
        expected_identity: forwarded to ``_align_par`` unchanged.
        keep_insertions: must be falsy; insertions are always dropped.
        **kwargs: only ``codon`` and ``revcomp`` are read (hy454-style
            spellings); any other keyword is silently ignored.

    Returns:
        A ``(alignment, discards)`` tuple: a ``MultipleSeqAlignment`` of
        the records delivered to the output callback, and a list of the
        records delivered to the discard callback.

    Raises:
        ValueError: if ``keep_insertions`` is truthy.
    """
    if keep_insertions:
        raise ValueError('keeping insertions is unsupported at this time')

    if score_matrix is None:
        from BioExt.scorematrices import BLOSUM62
        score_matrix = BLOSUM62.load()

    # drop-in compatibility with hy454: the legacy keyword spellings win
    # over the modern parameters whenever they are present
    if 'codon' in kwargs:
        do_codon = kwargs['codon']
    if 'revcomp' in kwargs:
        reverse_complement = kwargs['revcomp']

    discards = []
    alignment = MultipleSeqAlignment([])
    alignment_length = len(reference)

    def discard(record):
        # callback for _align_par: remember a rejected record
        discards.append(record)

    def suffix_pad(record):
        # right-pad with gap characters up to the reference length;
        # records already long enough are returned untouched
        deficit = alignment_length - len(record)
        if deficit <= 0:
            return record
        padded_seq = Seq(str(record.seq) + '-' * deficit)
        return SeqRecord(
            padded_seq,
            id=record.id,
            name=record.name,
            dbxrefs=copy(record.dbxrefs),
            description=record.description,
            annotations=copy(record.annotations),
            )

    def output(records):
        # callback for _align_par: normalise each record, pad it, and
        # add it to the growing alignment
        for record in records:
            without_insertions = gapful(gapless(record), insertions=False)
            alignment.append(suffix_pad(without_insertions))

    _align_par(
        reference,
        records,
        score_matrix,
        do_codon,
        reverse_complement,
        expected_identity,
        discard,
        output
        )

    return alignment, discards
Exemplo n.º 2
0
def test_align():
    ''' Align the records of SHORT.FASTA to its first record, write the
    result as BAM, then sort the BAM file. '''

    here = os.path.dirname(os.path.realpath(__file__))

    # Input FASTA and the BAM file this test produces next to it
    fasta_path = os.path.join(here, "./rsrc/SHORT.FASTA")
    bam_path = os.path.join(here, "./rsrc/SHORT.FASTA.test.bam")

    # The first record is consumed as the reference; the iterator then
    # yields only the remaining records
    records = SeqIO.parse(fasta_path, 'fasta')
    reference = gapless(next(records))

    def with_reference_first(aligned):
        # emit the reference aligned to itself ahead of the real records
        yield compute_cigar(reference, reference)
        for rec in aligned:
            print(rec)
            yield rec

    def output(records):
        BamIO.write(with_reference_first(records), bam_path, reference)

    _align_par(
        reference,
        records,
        BLOSUM62.load(),
        True,    # do_codon
        False,   # reverse_complement
        None,    # expected_identity
        None,    # discard callback
        output,
        False,   # NOTE(review): meaning of this 9th argument is not visible here
        )

    # Sort the BAM file that was just written
    BamIO.sort(bam_path)
Exemplo n.º 3
0
Arquivo: uds.py Projeto: veg/BioExt
def test_align_to_refseq_suffix_pad():
    ''' Align the records of TEST.FASTA to the HXB2 prrt reference and
    check that the fourth aligned sequence equals the fourth input. '''

    # Reference sequence
    refseq = hxb2.prrt.load()
    here = os.path.dirname(os.path.realpath(__file__))
    fasta_path = os.path.join(here, "./rsrc/TEST.FASTA")

    # Input sequences
    with open(fasta_path) as handle:
        seqrecords = list(SeqIO.parse(handle, "fasta"))

    # NOTE(review): a test returning a dict looks copied from non-test
    # code; with a single record nothing is asserted at all.
    if len(seqrecords) == 1:
        refseq = seqrecords[0].format('fasta')
        return {'ref': refseq, 'alignment': refseq, 'seqs': seqrecords}

    sm = BLOSUM62.load()

    # NOTE(review): the result of this equal-length check is discarded,
    # so it never fails the test - confirm whether an assert was intended.
    all([len(seqrecord) == len(seqrecords[0]) for seqrecord in seqrecords])

    alignment_kwargs = dict(
        score_matrix=sm,
        codon=True,
        expected_identity=0.6,
        keep_insertions=False,
    )
    msa, discarded = align_to_refseq(refseq, seqrecords, **alignment_kwargs)

    assert msa[3].seq == seqrecords[3].seq
Exemplo n.º 4
0
def align_to_refseq(
        reference,
        records,
        score_matrix=None,
        do_codon=True,
        reverse_complement=True,
        expected_identity=None,
        keep_insertions=False,
        **kwargs
        ):
    """Align ``records`` against ``reference`` and gather the results.

    Args:
        reference: the reference sequence; ``len(reference)`` fixes the
            width every aligned record is padded to.
        records: the records handed to ``_align_par`` for alignment.
        score_matrix: scoring matrix; ``BLOSUM62`` from
            ``BioExt.scorematrices`` is loaded when ``None``.
        do_codon: forwarded to ``_align_par``; overridden by a ``codon``
            keyword if one is supplied.
        reverse_complement: forwarded to ``_align_par``; overridden by a
            ``revcomp`` keyword if one is supplied.
        expected_identity: forwarded to ``_align_par`` unchanged.
        keep_insertions: must be falsy; insertions are always dropped.
        **kwargs: only ``codon`` and ``revcomp`` are read (hy454-style
            spellings); any other keyword is silently ignored.

    Returns:
        A ``(alignment, discards)`` tuple: a ``MultipleSeqAlignment`` of
        the records delivered to the output callback, and a list of the
        records delivered to the discard callback.

    Raises:
        ValueError: if ``keep_insertions`` is truthy.
    """
    if keep_insertions:
        raise ValueError('keeping insertions is unsupported at this time')

    if score_matrix is None:
        from BioExt.scorematrices import BLOSUM62
        score_matrix = BLOSUM62.load()

    # drop-in compatibility with hy454
    do_codon = kwargs.get('codon', do_codon)
    reverse_complement = kwargs.get('revcomp', reverse_complement)

    discards = []

    def discard(record):
        discards.append(record)

    alignment = MultipleSeqAlignment([])

    alignment_length = len(reference)

    def suffix_pad(record):
        # Right-pad with gap characters up to the reference length;
        # records that are already long enough pass through untouched.
        deficit = alignment_length - len(record)
        if deficit <= 0:
            return record
        padded = str(record.seq) + '-' * deficit
        # Seq.alphabet (and the Seq alphabet argument) were removed in
        # Biopython 1.78; carry the alphabet over only when the
        # installed Biopython still provides one.
        alphabet = getattr(record.seq, 'alphabet', None)
        if alphabet is None:
            padded_seq = Seq(padded)
        else:
            padded_seq = Seq(padded, alphabet)
        return SeqRecord(
            padded_seq,
            id=record.id,
            name=record.name,
            dbxrefs=copy(record.dbxrefs),
            description=record.description,
            annotations=copy(record.annotations),
            )

    def output(records):
        for record in records:
            alignment.append(suffix_pad(gapful(gapless(record), insertions=False)))

    _align_par(
        reference,
        records,
        score_matrix,
        do_codon,
        reverse_complement,
        expected_identity,
        discard,
        output
        )

    return alignment, discards
Exemplo n.º 5
0
    def align(self, refseq, seqs, score_matrix=None, revcomp=False, expected_identity=0., keep_insertions=False, quiet=True):
        """Align every sequence in ``seqs`` to ``refseq`` via ``self.map``.

        The sequences are split into at most ``self.nodes`` contiguous
        batches, each batch becomes one argument list for ``self.map``,
        and the JSON strings returned by the jobs are decoded and merged.

        Args:
            refseq: reference sequence string (uppercased before use).
            seqs: sliceable collection of sequence strings.
            score_matrix: scoring matrix; when ``None`` BLOSUM62 is used
                for codon alignment and a ``DNAExpIdScoreMatrix`` for
                DNA alignment.
            revcomp: sent to the jobs as ``'Yes'``/``'No'``.
            expected_identity: forwarded to the jobs; ``0.`` makes the
                default DNA matrix assume 0.8.
            keep_insertions: sent to the jobs as ``'Yes'``/``'No'``.
            quiet: forwarded to ``self.map``.

        Returns:
            A 5-tuple of lists ``(refs, seqs, scores, overlaps,
            identities)``; all five are empty when ``seqs`` is empty.

        Raises:
            ValueError: if ``score_matrix`` does not match the alignment
                mode selected by ``self.codon``.
        """
        # With no sequences, abort early to prevent later errors. Return
        # the same 5-tuple shape as the normal path so callers that
        # unpack five values keep working on empty input.
        if not len(seqs):
            return [], [], [], [], []

        if score_matrix is None:
            if self.codon:
                score_matrix = BLOSUM62.load()
            else:
                score_matrix = DNAExpIdScoreMatrix(
                    0.8 if expected_identity == 0. else expected_identity,
                    { 'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25 }
                )

        if self.codon and not isinstance(score_matrix, ProteinScoreMatrix):
            raise ValueError('score_matrix incompatible with codon alignment')
        elif not self.codon and not isinstance(score_matrix, DNAScoreMatrix):
            raise ValueError('score_matrix incompatible with dna alignment')

        # global variables handed to the jobs: (letters name, matrix name) -> matrix
        smdef = { ('_%s_letters' % self.__smvar, '_%s_scorematrix' % self.__smvar): score_matrix }

        # uppercase the refseq to deal with bugs in HyPhy's aligner
        refseq = refseq.upper()

        numseqs = len(seqs)
        # if the # nodes exceeds the number of seqs, we just need numseqs jobs
        numnodes = min(numseqs, self.nodes)
        seqs_per_node = max(1, numseqs // numnodes)
        remainder = numseqs % numnodes

        arg1 = 'Yes' if revcomp else 'No'
        arg2 = 'Yes' if keep_insertions else 'No'

        argslist = []
        upr = 0

        for i in range(numnodes):
            # The traversal is stateful: each batch starts where the
            # previous one ended. The first `remainder` batches take one
            # extra sequence so that every sequence is assigned.
            lwr = upr
            extra = 1 if i < remainder else 0
            upr = min(numseqs, lwr + seqs_per_node + extra)
            node_seqs = [s.upper() for s in seqs[lwr:upr]]
            argslist.append( [arg1, arg2, refseq, expected_identity, len(node_seqs)] + node_seqs )

        retstrs = self.map(argslist, globalvars=smdef, quiet=quiet)

        # each job returns a JSON list of per-sequence result tuples
        seqscores = []
        for retstr in retstrs:
            seqscores.extend(json.loads(retstr))

        # transpose the per-sequence tuples into five parallel columns
        newrefstrs, newseqstrs, scores, overlaps, identities = zip(*seqscores)

        return list(newrefstrs), list(newseqstrs), list(scores), list(overlaps), list(identities)
Exemplo n.º 6
0
def run_group_alignment (sequence_group):
    """Codon-align a group of sequences to the longest member of the group.

    Each sequence string has ``'NNN'`` and ``'---'`` runs removed, any
    remaining ``'-'`` turned into ``'N'``, and is trimmed to a multiple of
    three characters. The longest cleaned sequence becomes the reference
    and the rest are aligned to it with ``align_to_refseq`` using BLOSUM62.

    Args:
        sequence_group: mapping of sequence id -> sequence string.

    Returns:
        A dict with keys ``'ref'`` (reference as FASTA text),
        ``'alignment'`` (alignment as FASTA text) and ``'seqs'`` (the
        non-reference records). For a single-sequence group both text
        fields hold that sequence and ``'seqs'`` holds its record.

    Raises:
        ValueError: if ``sequence_group`` is empty.
        Exception: if the aligner discarded any sequence.
    """
    if len (sequence_group) == 0:
        # Fail with a clear message instead of the opaque
        # "max() arg is an empty sequence" raised further down.
        raise ValueError ("sequence_group is empty; nothing to align")

    print ("%d sequences with matching JUNCTION regions" % (len (sequence_group)  - 1))
    seqrecords = []

    for seq_id in sequence_group:
        massaged_string = sequence_group[seq_id].replace ('NNN','').replace ('---','').replace ('-','N')
        # drop trailing characters that do not complete a codon
        usable_length = len (massaged_string) - len (massaged_string) % 3
        massaged_string = massaged_string [:usable_length]

        seqrecords.append(gapless(Bio.SeqRecord.SeqRecord (Bio.Seq.Seq(massaged_string), id = seq_id, name = seq_id, description = '')))

    if len (seqrecords) == 1:
        refseq = seqrecords[0].format ('fasta')
        return {'ref': refseq, 'alignment': refseq, 'seqs': seqrecords}

    # find the longest sequence and use it as the reference
    seq_lengths = [len(record.seq) for record in seqrecords]
    refseq_id = seq_lengths.index(max(seq_lengths))
    refseq = seqrecords.pop(refseq_id)

    def dump_inputs ():
        # diagnostic dump of the reference and the remaining sequences
        print (">ref\n%s" % str(refseq.seq))
        print ('\n'.join ([">%s\n%s" % (str(k.id), str(k.seq)) for k in seqrecords]))

    # a reference that is not a whole number of codons is suspicious
    if len(refseq.seq) % 3:
        dump_inputs ()

    sm = BLOSUM62.load()

    msa, discarded = align_to_refseq(
        refseq,
        seqrecords,
        score_matrix=sm,
        do_codon=True,
        reverse_complement=False,
        keep_insertions=False,
    )

    if len (discarded):
        dump_inputs ()
        print (discarded)
        raise Exception ("Non-empty discarded")

    with io.StringIO () as string_buffer:
        Bio.SeqIO.write (msa, string_buffer, "fasta")
        all_lines = string_buffer.getvalue()

    return {'ref': refseq.format ('fasta'), 'alignment': all_lines, 'seqs': seqrecords}
Exemplo n.º 7
0
def align_to_refseq(
        reference,
        records,
        score_matrix=None,
        do_codon=True,
        reverse_complement=True,
        expected_identity=None,
        keep_insertions=False,
        **kwargs
        ):
    """Align ``records`` against ``reference`` and gather the results.

    Args:
        reference: the reference sequence passed to ``_align_par``.
        records: the records handed to ``_align_par`` for alignment.
        score_matrix: scoring matrix; ``BLOSUM62`` from
            ``BioExt.scorematrices`` is loaded when ``None``.
        do_codon: forwarded to ``_align_par``; overridden by a ``codon``
            keyword if one is supplied.
        reverse_complement: forwarded to ``_align_par``; overridden by a
            ``revcomp`` keyword if one is supplied.
        expected_identity: forwarded to ``_align_par`` unchanged.
        keep_insertions: must be falsy; insertions are always dropped.
        **kwargs: only ``codon`` and ``revcomp`` are read (hy454-style
            spellings); any other keyword is silently ignored.

    Returns:
        A ``(alignment, discards)`` tuple: a ``MultipleSeqAlignment`` of
        the records delivered to the output callback, and a list of the
        records delivered to the discard callback.

    Raises:
        ValueError: if ``keep_insertions`` is truthy.
    """
    if keep_insertions:
        raise ValueError('keeping insertions is unsupported at this time')

    if score_matrix is None:
        from BioExt.scorematrices import BLOSUM62
        score_matrix = BLOSUM62.load()

    # drop-in compatibility with hy454: the legacy keyword spellings win
    # over the modern parameters whenever they are present
    if 'codon' in kwargs:
        do_codon = kwargs['codon']
    if 'revcomp' in kwargs:
        reverse_complement = kwargs['revcomp']

    discards = []
    alignment = MultipleSeqAlignment([])

    def discard(record):
        # callback for _align_par: remember a rejected record
        discards.append(record)

    def output(records):
        # callback for _align_par: strip insertions from each record and
        # add it to the growing alignment
        for record in records:
            without_insertions = gapful(gapless(record), insertions=False)
            alignment.append(without_insertions)

    _align_par(
        reference,
        records,
        score_matrix,
        do_codon,
        reverse_complement,
        expected_identity,
        discard,
        output
        )

    return alignment, discards
Exemplo n.º 8
0
def align_to_refseq(refseq, seqrecords, score_matrix=None, codon=True, revcomp=True, expected_identity=0., keep_insertions=False, quiet=False):
    """Align ``seqrecords`` to ``refseq`` and wrap the results in new records.

    Args:
        refseq: reference record; only ``str(refseq.seq)`` is used.
        seqrecords: indexable collection of records to align.
        score_matrix: scoring matrix; ``BLOSUM62.load()`` when ``None``.
        codon: passed to the ``Aligner`` constructor.
        revcomp, expected_identity, keep_insertions, quiet: passed
            through to the aligner call.

    Returns:
        ``(aligned, discarded)``. ``aligned`` is a
        ``MultipleSeqAlignment`` when ``keep_insertions`` is falsy and a
        plain list of records otherwise; ``discarded`` is a list of the
        original records that were rejected.
    """
    if score_matrix is None:
        score_matrix = BLOSUM62.load()

    aligner = Aligner(codon=codon)
    query_strings = [str(s.seq) for s in seqrecords]
    _, aligned, scores, overlaps, identities = aligner(
        str(refseq.seq),
        query_strings,
        score_matrix,
        revcomp,
        expected_identity,
        keep_insertions,
        quiet
    )

    aligned_records = []
    discarded_records = []
    for i, aln in enumerate(aligned):
        old = seqrecords[i]

        # NOTE(review): a record is rejected when its identity is
        # negative, not when it is below expected_identity - presumably
        # the aligner signals rejection with a negative value; confirm.
        if expected_identity > 0. and identities[i] < 0:
            discarded_records.append(old)
            continue

        # copy the metadata so the new record can carry extra
        # annotations without touching the original
        annotations = deepcopy(old.annotations)
        annotations['_nbpidentical'] = overlaps[i]
        annotations['_pbpscore'] = scores[i]

        # letter_annotations are deliberately not carried over: they
        # would no longer match the aligned sequence
        aligned_records.append(
            SeqRecord(
                Seq(aln, generic_nucleotide),
                id=old.id,
                name=old.name,
                description=old.description,
                dbxrefs=deepcopy(old.dbxrefs),
                features=deepcopy(old.features),
                annotations=annotations,
            )
        )

    if keep_insertions:
        return aligned_records, discarded_records

    return MultipleSeqAlignment(aligned_records), discarded_records
Exemplo n.º 9
0
#!/usr/bin/env python3

import nose

from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

from BioExt.align import Aligner
from BioExt.scorematrices import BLOSUM62

# Module-level aligner shared by every test below, built on BLOSUM62.
aln = Aligner(BLOSUM62.load())


class test_Aligner():
    """Self-alignment tests for the shared ``aln`` aligner.

    The expected values compare ``aln(ref, query)`` to a 3-tuple -
    presumably (score, aligned reference, aligned query); confirm against
    ``BioExt.align.Aligner``.
    """

    def test_align_self(self):
        """Check alignment of reference against itself"""
        assert aln('GCTAGA', 'GCTAGA') == (4.5, 'GCTAGA', 'GCTAGA')

    def test_align_self_case(self):
        """Check case is irrelevant for self-alignment"""
        assert aln('GCTAGA', 'GCTAGA') == aln('GCTAGA', 'GcTaGa')

    def test_align_self_seq_to_str(self):
        """Check alignment of Seq instance against seq-identical str"""
        ref = 'GCTAGA'
        record = Seq(ref)
        assert aln(ref, record) == (4.5, ref, record)

    def test_align_self_seqrecord_to_str(self):
        """Check alignment of SeqRecord instance against seq-identical str"""
        # NOTE(review): no SeqRecord is built and nothing is asserted
        # here - the rest of this test appears to be missing.
        ref = 'GCTAGA'
Exemplo n.º 10
0
def validate(
    refseq,
    seqs,
    dna_score_matrix=None,
    protein_score_matrix=None,
    dna_mismatch=0,
    protein_mismatch=0,
    codon=True,
    revcomp=True,
    expected_identity=0.,
    keep_insertions=True,
    quiet=False):
    """Align ``seqs`` to ``refseq`` and score each resulting alignment.

    Args:
        refseq: reference as a ``SeqRecord``, ``Seq`` or ``str``.
        seqs: iterable of queries, each a ``SeqRecord``, ``Seq`` or ``str``.
        dna_score_matrix: callable DNA scorer; ``DNA80`` when ``None``.
        protein_score_matrix: callable protein scorer;
            ``BLOSUM62.load()`` when ``None``.
        dna_mismatch: third argument passed to ``dna_score_matrix``.
        protein_mismatch: third argument passed to ``protein_score_matrix``.
        codon: align in codon space (protein matrix) when true,
            otherwise in DNA space (DNA matrix).
        revcomp, expected_identity, keep_insertions, quiet: passed
            through to the aligner call.

    Returns:
        ``(lengths, dna_scores, protein_scores)``, three parallel lists
        with one entry per alignment. Both scores are ``None`` for an
        alignment whose identity is below a positive
        ``expected_identity``; the protein score is also ``None``
        whenever ``codon`` is false.

    Raises:
        ValueError: if ``refseq`` or any query is not a ``SeqRecord``,
            ``Seq`` or ``str``.
    """
    msg = "cannot validate sequences that are not SeqRecord, Seq, or str objects"

    def as_plain_str(obj):
        # reduce any supported sequence type to its plain string
        if isinstance(obj, SeqRecord):
            return str(obj.seq)
        if isinstance(obj, Seq):
            return str(obj)
        if isinstance(obj, str):
            return obj
        raise ValueError(msg)

    r = as_plain_str(refseq)
    qs = [as_plain_str(q) for q in seqs]

    if dna_score_matrix is None:
        dna_score_matrix = DNA80

    # The default must be stored in protein_score_matrix itself: it is
    # used both to pick the aligner's matrix below and to score the
    # translated sequences. Storing it elsewhere leaves it None.
    if protein_score_matrix is None:
        protein_score_matrix = BLOSUM62.load()

    if codon:
        score_matrix = protein_score_matrix
    else:
        score_matrix = dna_score_matrix

    aligner = Aligner(codon=codon)
    refs, queries, _, _, identities = aligner(
        r,
        qs,
        score_matrix,
        revcomp,
        expected_identity,
        keep_insertions,
        quiet
    )

    lengths = []
    dna_scores = []
    protein_scores = []
    for aligned_ref, aligned_query, identity in zip(refs, queries, identities):
        assert len(aligned_ref) == len(aligned_query), 'sequences unaligned for some reason'
        lengths.append(len(aligned_ref))
        if expected_identity > 0. and identity < expected_identity:
            # below the identity threshold: record the length only
            dna_scores.append(None)
            protein_scores.append(None)
        else:
            dna_scores.append(
                dna_score_matrix(aligned_ref, aligned_query, dna_mismatch)
            )
            # we can translate codon-aligned sequences,
            # but not DNA-aligned sequences
            if codon:
                protein_scores.append(
                    protein_score_matrix(
                        translate(aligned_ref),
                        translate(aligned_query),
                        protein_mismatch
                    )
                )
            else:
                protein_scores.append(None)

    return lengths, dna_scores, protein_scores