Exemplo n.º 1
0
def write(alignments, handle, format):
    """Write complete set of alignments to a file.

    Arguments:
     - alignments - A list (or iterator) of Alignment objects (ideally the
                    new MultipleSeqAlignment objects), or a single
                    alignment object.
     - handle     - File handle object to write to, or filename as string.
     - format     - lower case string describing the file format to write.

    The format is validated and resolved to a writer before as_handle()
    is entered, so an unknown or read-only format raises ValueError
    without the output having been opened for writing.

    You should close the handle after calling this function.

    Returns the number of alignments written (as an integer).

    Raises TypeError if format is not a string, or (when delegating to
    SeqIO) if an item of alignments is not an Alignment.  Raises ValueError
    if format is empty, not lower case, read-only or unknown.
    """
    from SAP.Bio import SeqIO

    #Try and give helpful error messages:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    #Decide how to write *before* opening the output, so that a bad format
    #name cannot leave behind an opened (and possibly truncated) file.
    if format in _FormatToWriter:
        use_seqio = False
    elif format in SeqIO._FormatToWriter:
        use_seqio = True
    elif format in _FormatToIterator or format in SeqIO._FormatToIterator:
        raise ValueError(
            "Reading format '%s' is supported, but not writing" % format)
    else:
        raise ValueError("Unknown format '%s'" % format)

    if isinstance(alignments, Alignment):
        #A single alignment is treated as a list of one
        alignments = [alignments]

    with as_handle(handle, 'w') as fp:
        if use_seqio:
            #Exploit the existing SeqIO writer to do the dirty work,
            #one SeqIO.write() call per alignment so they can be counted.
            count = 0
            for alignment in alignments:
                if not isinstance(alignment, Alignment):
                    raise TypeError(
                        "Expect a list or iterator of Alignment objects.")
                SeqIO.write(alignment, fp, format)
                count += 1
        else:
            writer_class = _FormatToWriter[format]
            count = writer_class(fp).write_file(alignments)

    #Internal sanity check on the writer classes, not on user input
    assert isinstance(count, int), "Internal error - the underlying %s " \
           "writer should have returned the alignment count, not %s" \
           % (format, repr(count))

    return count
Exemplo n.º 2
0
def write(alignments, handle, format):
    """Write complete set of alignments to a file.

    Arguments:
     - alignments - A list (or iterator) of Alignment objects (ideally the
                    new MultipleSeqAlignment objects), or a single
                    alignment object.
     - handle     - File handle object to write to, or filename as string.
     - format     - lower case string describing the file format to write.

    All checks on the format happen before as_handle() is entered; a
    format that is unknown or can only be read therefore raises
    ValueError before the output is opened for writing.

    You should close the handle after calling this function.

    Returns the number of alignments written (as an integer).

    Raises TypeError for a non-string format or (on the SeqIO path) for a
    non-Alignment item; ValueError for an empty, non lower case,
    read-only or unknown format.
    """
    from SAP.Bio import SeqIO

    #Try and give helpful error messages:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    #Reject unsupported formats up front - opening the output first would
    #create/overwrite the file even though nothing can be written to it.
    has_native_writer = format in _FormatToWriter
    if not has_native_writer and format not in SeqIO._FormatToWriter:
        if format in _FormatToIterator or format in SeqIO._FormatToIterator:
            raise ValueError("Reading format '%s' is supported, but not writing"
                             % format)
        raise ValueError("Unknown format '%s'" % format)

    if isinstance(alignments, Alignment):
        #Wrap a lone alignment so the code below can always iterate
        alignments = [alignments]

    with as_handle(handle, 'w') as fp:
        if has_native_writer:
            #Map the file format to a writer class
            count = _FormatToWriter[format](fp).write_file(alignments)
        else:
            #Fall back on SeqIO, writing (and counting) one alignment
            #at a time.
            count = 0
            for alignment in alignments:
                if not isinstance(alignment, Alignment):
                    raise TypeError(
                        "Expect a list or iterator of Alignment objects.")
                SeqIO.write(alignment, fp, format)
                count += 1

    #Guards against a writer class that forgets to return its count
    assert isinstance(count, int), "Internal error - the underlying %s " \
           "writer should have returned the alignment count, not %s" \
           % (format, repr(count))

    return count
Exemplo n.º 3
0
    def __init__(self,
                 dir_path=None,
                 version=None,
                 scop=None,
                 astral_file=None,
                 db_handle=None):
        """
        Initialise the astral database.

        A Scop instance (scop) is always required.  The sequence data
        comes from one of:

        dir_path + version - dir_path is the directory that contains the
                    scopseq-<version> directory (not that directory
                    itself); the FASTA file name is derived from both.

        astral_file - string, a path to a FASTA file, which is loaded
                    into memory as a dictionary of SeqRecord objects.

        db_handle - a database handle for a MYSQL database containing a
                    table 'astral' with the astral data in it.  This can
                    be created using writeToSQL.

        Raises RuntimeError if no data source is given, if scop is
        missing, or if only one of dir_path/version is supplied when both
        are needed.
        """
        sources = (astral_file, dir_path, db_handle)
        if all(source is None for source in sources):
            raise RuntimeError(
                "Need either file handle, or (dir_path + version) "
                "or database handle to construct Astral")
        if not scop:
            raise RuntimeError("Must provide a Scop instance to construct")

        self.scop = scop
        self.db_handle = db_handle

        if not (astral_file or db_handle):
            # Neither an explicit file nor a database: locate the FASTA
            # file inside the versioned scopseq directory.
            if dir_path is None or version is None:
                raise RuntimeError("must provide dir_path and version")

            self.version = version
            self.path = os.path.join(dir_path, "scopseq-%s" % version)
            fasta_name = "astral-scopdom-seqres-all-%s.fa" % self.version
            astral_file = os.path.join(self.path, fasta_name)

        if astral_file:
            # Every record of the FASTA file is held IN MEMORY
            fasta_records = SeqIO.parse(astral_file, "fasta")
            self.fasta_dict = SeqIO.to_dict(fasta_records)

        self.astral_file = astral_file
        # Caches, all starting out empty
        self.EvDatasets, self.EvDatahash = {}, {}
        self.IdDatasets, self.IdDatahash = {}, {}
Exemplo n.º 4
0
def _handle_convert(in_handle, in_format, out_handle, out_format, alphabet=None):
    """SeqIO conversion function (PRIVATE).

    Uses the converter registered in _converter for the
    (in_format, out_format) pair when there is one; otherwise parses the
    input with SeqIO and writes the records back out.  Returns whatever
    the chosen converter or SeqIO.write() returns.
    """
    format_pair = (in_format, out_format)
    try:
        converter = _converter[format_pair]
    except KeyError:
        converter = None

    if not converter:
        # No dedicated converter: generic parse-then-write round trip
        records = SeqIO.parse(in_handle, in_format, alphabet)
        return SeqIO.write(records, out_handle, out_format)
    return converter(in_handle, out_handle, alphabet)
Exemplo n.º 5
0
def align(cmdline,
          pair,
          kbyte=None,
          force_type=None,
          dry_run=False,
          quiet=False,
          debug=False):
    """Run an alignment command on a pair of FASTA files.

    Both inputs are re-written through SeqIO into temporary FASTA files,
    then the command line built by _build_align_cmdline() is run with
    os.system().

    Arguments:
     - cmdline    - passed on to _build_align_cmdline(); also joined with
                    spaces for the error message.
     - pair       - sequence of exactly two FASTA filenames.
     - kbyte, force_type, quiet - forwarded to _build_align_cmdline().
     - dry_run    - if true, only print the command line (built from the
                    original filenames) and return None.
     - debug      - if true, echo the command line to stderr first.

    Returns a read-mode tempfile.NamedTemporaryFile whose name was given
    to the command as its output file.

    Raises ValueError if pair does not hold exactly two items.  If the
    exit status is above 1 the call is retried once with kbyte=0; if it
    was already 0, OSError is raised.
    """
    if not pair or len(pair) != 2:
        raise ValueError("Expected pair of filename, not %s" % repr(pair))

    output_file = tempfile.NamedTemporaryFile(mode='r')
    input_files = (tempfile.NamedTemporaryFile(mode="w"),
                   tempfile.NamedTemporaryFile(mode="w"))

    try:
        if dry_run:
            print(
                _build_align_cmdline(cmdline, pair, output_file.name, kbyte,
                                     force_type, quiet))
            # Nothing is run, so the output placeholder is not needed
            output_file.close()
            return

        for filename, input_file in zip(pair, input_files):
            # Pipe the file through Biopython's Fasta parser/writer
            # to make sure it conforms to the Fasta standard (in particular,
            # Wise2 may choke on long lines in the Fasta file)
            with open(filename) as source:
                SeqIO.write(SeqIO.parse(source, 'fasta'), input_file, 'fasta')
            input_file.flush()

        input_file_names = [input_file.name for input_file in input_files]

        cmdline_str = _build_align_cmdline(cmdline, input_file_names,
                                           output_file.name, kbyte, force_type,
                                           quiet)

        if debug:
            sys.stderr.write("%s\n" % cmdline_str)

        # os.system() returns a wait status; the exit code is the high byte
        status = os.system(cmdline_str) >> 8
    finally:
        # The temporary inputs are only needed while the command runs;
        # close them explicitly instead of waiting for garbage collection.
        for input_file in input_files:
            input_file.close()

    if status > 1:
        # The output of a failed run is never handed back to the caller
        output_file.close()
        if kbyte != 0:  # possible memory problem; could be None
            sys.stderr.write("INFO trying again with the linear model\n")
            return align(cmdline, pair, 0, force_type, dry_run, quiet, debug)
        raise OSError("%s returned %s" % (" ".join(cmdline), status))

    return output_file
Exemplo n.º 6
0
def align(cmdline, pair, kbyte=None, force_type=None, dry_run=False, quiet=False, debug=False):
    """Run an alignment command on two FASTA files and return its output.

    Each input is normalised by reading it with SeqIO and writing it to a
    temporary FASTA file; the command line from _build_align_cmdline() is
    then executed through os.system().

    Arguments:
     - cmdline    - handed to _build_align_cmdline(); also joined with
                    spaces when reporting a failure.
     - pair       - exactly two FASTA filenames.
     - kbyte, force_type, quiet - forwarded to _build_align_cmdline().
     - dry_run    - print the command line (using the original filenames)
                    and return None instead of running anything.
     - debug      - write the command line to stderr before running it.

    Returns the read-mode tempfile.NamedTemporaryFile named as the
    command's output file.

    Raises ValueError unless pair has exactly two items.  An exit status
    above 1 triggers one retry with kbyte=0; with kbyte already 0 it
    raises OSError.
    """
    if not pair or len(pair) != 2:
        raise ValueError("Expected pair of filename, not %s" % repr(pair))

    output_file = tempfile.NamedTemporaryFile(mode='r')
    input_files = tempfile.NamedTemporaryFile(mode="w"), tempfile.NamedTemporaryFile(mode="w")

    try:
        if dry_run:
            print(_build_align_cmdline(cmdline,
                                       pair,
                                       output_file.name,
                                       kbyte,
                                       force_type,
                                       quiet))
            # No command is executed, so release the output placeholder
            output_file.close()
            return

        for filename, input_file in zip(pair, input_files):
            # Pipe the file through Biopython's Fasta parser/writer
            # to make sure it conforms to the Fasta standard (in particular,
            # Wise2 may choke on long lines in the Fasta file)
            with open(filename) as fasta_handle:
                records = SeqIO.parse(fasta_handle, 'fasta')
                SeqIO.write(records, input_file, 'fasta')
            input_file.flush()

        input_file_names = [input_file.name for input_file in input_files]

        cmdline_str = _build_align_cmdline(cmdline,
                                           input_file_names,
                                           output_file.name,
                                           kbyte,
                                           force_type,
                                           quiet)

        if debug:
            sys.stderr.write("%s\n" % cmdline_str)

        # High byte of the wait status returned by os.system()
        status = os.system(cmdline_str) >> 8
    finally:
        # Release the temporary inputs deterministically rather than
        # relying on the garbage collector.
        for input_file in input_files:
            input_file.close()

    if status > 1:
        # A failed run's output file is not returned, so close it here
        output_file.close()
        if kbyte != 0: # possible memory problem; could be None
            sys.stderr.write("INFO trying again with the linear model\n")
            return align(cmdline, pair, 0, force_type, dry_run, quiet, debug)
        raise OSError("%s returned %s" % (" ".join(cmdline), status))

    return output_file
Exemplo n.º 7
0
    def __init__(self, dir_path=None, version=None, scop=None,
                 astral_file=None, db_handle=None):
        """
        Initialise the astral database.

        scop must be a Scop instance.  In addition, provide one of:

        dir_path and version - the location of the scopseq-x.xx directory
                   (its parent, not the directory itself) plus the
                   version number used to build the file name;

        astral_file - string, a path to a fasta file (which will be
                   loaded in memory);

        db_handle - a database handle for a MYSQL database containing a
                   table 'astral' with the astral data in it.  This can
                   be created using writeToSQL.

        Raises RuntimeError when no source is given, when scop is
        missing, or when dir_path/version are needed but incomplete.
        """
        nothing_given = (astral_file is None
                         and dir_path is None
                         and db_handle is None)
        if nothing_given:
            message = ("Need either file handle, or (dir_path + "
                       "version) or database handle to construct Astral")
            raise RuntimeError(message)
        if not scop:
            raise RuntimeError("Must provide a Scop instance to construct")

        self.scop = scop
        self.db_handle = db_handle

        need_default_file = not astral_file and not db_handle
        if need_default_file:
            if dir_path is None or version is None:
                raise RuntimeError("must provide dir_path and version")

            self.version = version
            self.path = os.path.join(dir_path, "scopseq-%s" % version)
            astral_file = os.path.join(
                self.path, "astral-scopdom-seqres-all-%s.fa" % self.version)

        if astral_file:
            #Parse the FASTA file into a dict of SeqRecord objects, IN MEMORY
            self.fasta_dict = SeqIO.to_dict(SeqIO.parse(astral_file, "fasta"))

        self.astral_file = astral_file

        #Lookup tables, initially empty
        self.EvDatasets = {}
        self.EvDatahash = {}
        self.IdDatasets = {}
        self.IdDatahash = {}
Exemplo n.º 8
0
def _handle_convert(in_handle,
                    in_format,
                    out_handle,
                    out_format,
                    alphabet=None):
    """SeqIO conversion function (PRIVATE).

    Looks up a converter for (in_format, out_format) in _converter and
    calls it if one is registered; failing that, the records are parsed
    and re-written via SeqIO.  The callee's return value is passed back.
    """
    try:
        special_case = _converter[(in_format, out_format)]
    except KeyError:
        pass
    else:
        if special_case:
            return special_case(in_handle, out_handle, alphabet)

    # Generic route: read everything with SeqIO and write it out again
    parsed = SeqIO.parse(in_handle, in_format, alphabet)
    return SeqIO.write(parsed, out_handle, out_format)
Exemplo n.º 9
0
def _embl_convert_fasta(in_handle, out_handle, alphabet=None):
    """Fast EMBL to FASTA (PRIVATE).

    Returns the value of SeqIO.write().  The alphabet argument is
    accepted but not used.
    """
    from SAP.Bio.GenBank.Scanner import EmblScanner

    scanner = EmblScanner()
    #Feature parsing is switched off - it is not needed here
    embl_records = scanner.parse_records(in_handle, do_features=False)
    #The alphabet can be ignored for FASTA output as well
    return SeqIO.write(embl_records, out_handle, "fasta")
Exemplo n.º 10
0
def _embl_convert_fasta(in_handle, out_handle, alphabet=None):
    """Fast EMBL to FASTA (PRIVATE).

    Scans the EMBL input with feature parsing disabled and writes the
    resulting records as FASTA; alphabet is unused.  Returns what
    SeqIO.write() returns.
    """
    from SAP.Bio.GenBank.Scanner import EmblScanner
    #Skip the features - they are not required...
    records_without_features = EmblScanner().parse_records(
        in_handle, do_features=False)
    #...and neither is the alphabet when writing FASTA
    return SeqIO.write(records_without_features, out_handle, "fasta")
Exemplo n.º 11
0
def _SeqIO_to_alignment_iterator(handle,
                                 format,
                                 alphabet=None,
                                 seq_count=None):
    """Uses Bio.SeqIO to create an MultipleSeqAlignment iterator (PRIVATE).

    Arguments:
     - handle    - handle to the file.
     - format    - string describing the file format.
     - alphabet  - optional Alphabet object, useful when the sequence type
                   cannot be automatically inferred from the file itself
                   (e.g. fasta, phylip, clustal)
     - seq_count - Optional integer, number of sequences expected in each
                   alignment.  Recommended for fasta format files.

    If seq_count is omitted (default) then all the sequences in the file
    are combined into a single MultipleSeqAlignment; an empty file yields
    nothing.

    Raises ValueError if seq_count is given and the number of records is
    not a multiple of it.
    """
    from SAP.Bio import SeqIO
    assert format in SeqIO._FormatToIterator

    if seq_count:
        #Use the count to split the records into batches.
        records = []
        for record in SeqIO.parse(handle, format, alphabet):
            records.append(record)
            if len(records) == seq_count:
                yield MultipleSeqAlignment(records, alphabet)
                records = []
        if records:
            #A partial batch is left over
            raise ValueError("Check seq_count argument, not enough sequences?")
    else:
        #Must assume that there is a single alignment using all
        #the SeqRecord objects:
        records = list(SeqIO.parse(handle, format, alphabet))
        if records:
            yield MultipleSeqAlignment(records, alphabet)
    #The generator simply ends here.  An explicit "raise StopIteration"
    #is turned into a RuntimeError by PEP 479 (Python 3.7+).
Exemplo n.º 12
0
def _SeqIO_to_alignment_iterator(handle, format, alphabet=None, seq_count=None):
    """Uses Bio.SeqIO to create an MultipleSeqAlignment iterator (PRIVATE).

    Arguments:
     - handle    - handle to the file.
     - format    - string describing the file format.
     - alphabet  - optional Alphabet object, useful when the sequence type
                   cannot be automatically inferred from the file itself
                   (e.g. fasta, phylip, clustal)
     - seq_count - Optional integer, number of sequences expected in each
                   alignment.  Recommended for fasta format files.

    If seq_count is omitted (default) then all the sequences in the file
    are combined into a single MultipleSeqAlignment (nothing is yielded
    for an empty file).

    Raises ValueError when seq_count is given and records are left over
    after the last complete batch.
    """
    from SAP.Bio import SeqIO
    assert format in SeqIO._FormatToIterator

    if not seq_count:
        #Must assume that there is a single alignment using all
        #the SeqRecord objects:
        records = list(SeqIO.parse(handle, format, alphabet))
        if records:
            yield MultipleSeqAlignment(records, alphabet)
        #Plain return ends the generator; "raise StopIteration" would
        #become a RuntimeError under PEP 479 (Python 3.7+).
        return

    #Use the count to split the records into batches.
    batch = []
    for record in SeqIO.parse(handle, format, alphabet):
        batch.append(record)
        if len(batch) == seq_count:
            yield MultipleSeqAlignment(batch, alphabet)
            batch = []
    if batch:
        #An incomplete final batch means seq_count does not fit the file
        raise ValueError("Check seq_count argument, not enough sequences?")
Exemplo n.º 13
0
    def __format__(self, format_spec):
        """Returns the record as a string in the specified file format.

        Hook for the python format() function.  The format_spec should
        be a lower case string supported by Bio.SeqIO as an output file
        format; an empty format_spec gives str(self).  See also the
        SeqRecord's format() method.

        For formats listed in SeqIO._BinaryFormats the output is
        collected in a BytesIO (so bytes are returned on Python 3),
        otherwise in a StringIO.
        """
        if format_spec:
            from SAP.Bio import SeqIO
            if format_spec in SeqIO._BinaryFormats:
                #Binary format, so collect bytes
                from io import BytesIO
                buffer = BytesIO()
            else:
                from SAP.Bio._py3k import StringIO
                buffer = StringIO()
            SeqIO.write(self, buffer, format_spec)
            return buffer.getvalue()
        #Python convention: an empty spec falls back on __str__
        return str(self)
Exemplo n.º 14
0
    def __format__(self, format_spec):
        """Returns the record as a string in the specified file format.

        Supports the python format() function.  format_spec should be a
        lower case string naming a Bio.SeqIO output format; if it is
        empty the result is str(self).  See also the SeqRecord's format()
        method.

        The record is written with SeqIO.write() into an in-memory
        handle: a BytesIO for formats in SeqIO._BinaryFormats (bytes on
        Python 3), a StringIO for everything else.
        """
        if not format_spec:
            #No format requested: behave like str()
            return str(self)

        from SAP.Bio import SeqIO
        if format_spec in SeqIO._BinaryFormats:
            #Bytes are needed for the binary formats
            from io import BytesIO as handle_class
        else:
            from SAP.Bio._py3k import StringIO as handle_class

        in_memory = handle_class()
        SeqIO.write(self, in_memory, format_spec)
        return in_memory.getvalue()
Exemplo n.º 15
0
    def _count_codons(self, fasta_file):
        """Count the codons of every sequence in a FASTA file.

        self.codon_count is reset to a copy of CodonsDict and then
        incremented for each consecutive three-letter chunk of every
        record.  Sequences are upper-cased first, so lower-case and
        mixed-case input is handled alike.

        Raises TypeError for any chunk that is not a key of the codon
        dictionary.
        """
        with open(fasta_file, 'r') as handle:

            # make the codon dictionary local
            self.codon_count = CodonsDict.copy()

            # iterate over sequence and count all the codons in the FastaFile.
            for cur_record in SeqIO.parse(handle, "fasta"):
                # Always normalise the case.  Upper-casing only when the
                # whole sequence is lower case would reject mixed-case
                # records as containing illegal codons.
                dna_sequence = str(cur_record.seq).upper()
                for i in range(0, len(dna_sequence), 3):
                    codon = dna_sequence[i:i+3]
                    if codon in self.codon_count:
                        self.codon_count[codon] += 1
                    else:
                        raise TypeError("illegal codon %s in gene: %s" % (codon, cur_record.id))
Exemplo n.º 16
0
        """
        outstr = [
            "\n<%s: %s %d features>" %
            (self.__class__, self.name, len(self.features))
        ]
        return "\n".join(outstr)


################################################################################
# RUN AS SCRIPT
################################################################################

if __name__ == '__main__':
    # Manual test: load a GenBank record from a fixed local path and
    # collect its CDS features in a FeatureSet.
    from SAP.Bio import SeqIO

    genbank_entry = SeqIO.read(
        '/data/Genomes/Bacteria/Nanoarchaeum_equitans/NC_005213.gbk', 'gb')

    gdfs = FeatureSet(0, 'Nanoarchaeum equitans CDS')
    for feature in genbank_entry.features:
        if feature.type != 'CDS':
            continue
        gdfs.add_feature(feature)

    # Further checks that can be enabled by hand:
    #print len(gdfs)
    #print gdfs.get_ids()
    #gdfs.del_feature(560)
    #print gdfs.get_ids()
    #print gdfs.get_features()
    #for feature in gdfs.get_features():
    #    print feature.id, feature.start, feature.end
    #print gdfs[500]
Exemplo n.º 17
0
        record = SeqRecord(Seq(''.join(res_out), generic_protein),
                id=record_id,
                description=record_id,
                )

        # The PDB header was loaded as a dictionary, so let's reuse it all
        record.annotations = struct.header.copy()
        # Plus some chain specifics:
        record.annotations["model"] = model.id
        record.annotations["chain"] = chain.id

        # Start & end
        record.annotations["start"] = int(rnumbers[0])
        record.annotations["end"] = int(rnumbers[-1])

        # ENH - add letter annotations -- per-residue info, e.g. numbers

        yield record


if __name__ == '__main__':
    # Test: run both PDB iterators over every file named on the command
    # line and print the records to stdout as FASTA.
    import sys
    from SAP.Bio import SeqIO
    for fname in sys.argv[1:]:
        for parser in (PdbSeqresIterator, PdbAtomIterator):
            # The file is re-opened for each parser so both start at the top
            with open(fname) as handle:
                SeqIO.write(parser(handle), sys.stdout, 'fasta')
Exemplo n.º 18
0
 def _parse(handle):
     """Dynamically generated parser function (PRIVATE).

     Returns the first record from the iterator ``i`` applied to handle;
     ``i`` and ``alphabet`` are taken from the enclosing scope.
     """
     try:
         first_record = next(i(handle, alphabet=alphabet))
     except TypeError:
         # Retry without the alphabet keyword and apply the alphabet
         # afterwards - presumably for iterators that do not accept it.
         first_record = next(SeqIO._force_alphabet(i(handle), alphabet))
     return first_record
Exemplo n.º 19
0
 def _parse(handle):
     """Dynamically generated parser function (PRIVATE).

     Gives back the first record that ``i`` (from the enclosing scope)
     produces for handle, using the enclosing scope's ``alphabet``.
     """
     try:
         records = i(handle, alphabet=alphabet)
         return next(records)
     except TypeError:
         # On TypeError, call the iterator without the keyword and force
         # the alphabet on its records instead.
         records = SeqIO._force_alphabet(i(handle), alphabet)
         return next(records)
Exemplo n.º 20
0
                            self.xml_generator.endElement("property")

                elif isinstance(value, (int, float, basestring)):

                    attr = {"name": key, "value": str(value)}
                    self.xml_generator.startElement(
                        "property", AttributesImpl(attr))
                    self.xml_generator.endElement("property")

if __name__ == "__main__":
    # Round-trip self test: parse the example file, write it to memory
    # and to stdout, parse the in-memory copy and print that too.
    print("Running quick self test")

    from SAP.Bio import SeqIO
    import sys

    with open("Tests/SeqXML/protein_example.xml", "r") as xml_handle:
        parsed_records = list(SeqIO.parse(xml_handle, "seqxml"))

    from SAP.Bio._py3k import StringIO
    round_trip_buffer = StringIO()

    # Same records to both destinations
    for destination in (round_trip_buffer, sys.stdout):
        SeqIO.write(parsed_records, destination, "seqxml")
    print("")

    # Rewind before reading back what was just written
    round_trip_buffer.seek(0)
    reparsed_records = list(SeqIO.parse(round_trip_buffer, "seqxml"))

    SeqIO.write(reparsed_records, sys.stdout, "seqxml")
    print("")
Exemplo n.º 21
0
                elif isinstance(value, (int, float, basestring)):

                    attr = {"name": key, "value": str(value)}
                    self.xml_generator.startElement("property",
                                                    AttributesImpl(attr))
                    self.xml_generator.endElement("property")


if __name__ == "__main__":
    # Quick self test: read the example SeqXML file, write it out, read
    # the written copy back in and write that out again.
    print("Running quick self test")

    from SAP.Bio import SeqIO
    import sys

    example_path = "Tests/SeqXML/protein_example.xml"
    with open(example_path, "r") as source:
        original = list(SeqIO.parse(source, "seqxml"))

    from SAP.Bio._py3k import StringIO
    memory_copy = StringIO()

    SeqIO.write(original, memory_copy, "seqxml")
    SeqIO.write(original, sys.stdout, "seqxml")
    print("")

    # Back to the start of the buffer so it can be parsed
    memory_copy.seek(0)
    second_pass = list(SeqIO.parse(memory_copy, "seqxml"))

    SeqIO.write(second_pass, sys.stdout, "seqxml")
    print("")
Exemplo n.º 22
0
        return "\n".join(outstr)


################################################################################
# RUN AS SCRIPT
################################################################################

if __name__ == '__main__':

    # test code
    from SAP.Bio import SeqIO
    from ._FeatureSet import FeatureSet
    from ._GraphSet import GraphSet
    from random import normalvariate

    genbank_entry = SeqIO.read('/data/genomes/Bacteria/Nanoarchaeum_equitans/NC_005213.gbk', 'gb')

    gdfs1 = FeatureSet(0, 'Nanoarchaeum equitans CDS - CDS')
    gdfs2 = FeatureSet(1, 'Nanoarchaeum equitans CDS - gene')
    for feature in genbank_entry.features:
        if feature.type == 'CDS':
            gdfs1.add_feature(feature)
        if feature.type == 'gene':
            gdfs2.add_feature(feature)

    gdt = Track()
    gdt.add_set(gdfs1)
    gdt.add_set(gdfs2)

    graphdata = []
    for pos in range(1, len(genbank_entry.seq), 1000):
Exemplo n.º 23
0
        record = SeqRecord(
            Seq(''.join(res_out), generic_protein),
            id=record_id,
            description=record_id,
        )

        # The PDB header was loaded as a dictionary, so let's reuse it all
        record.annotations = struct.header.copy()
        # Plus some chain specifics:
        record.annotations["model"] = model.id
        record.annotations["chain"] = chain.id

        # Start & end
        record.annotations["start"] = int(rnumbers[0])
        record.annotations["end"] = int(rnumbers[-1])

        # ENH - add letter annotations -- per-residue info, e.g. numbers

        yield record


if __name__ == '__main__':
    # Test: every command-line argument is treated as a PDB file and
    # dumped to stdout as FASTA, once per iterator.
    import sys
    from SAP.Bio import SeqIO

    pdb_iterators = (PdbSeqresIterator, PdbAtomIterator)
    for fname in sys.argv[1:]:
        for make_records in pdb_iterators:
            # A fresh handle per iterator
            with open(fname) as handle:
                records = make_records(handle)
                SeqIO.write(records, sys.stdout, 'fasta')
Exemplo n.º 24
0
def build(pro_align,
          nucl_seqs,
          corr_dict=None,
          gap_char='-',
          unknown='X',
          codon_table=default_codon_table,
          alphabet=None,
          complete_protein=False,
          anchor_len=10,
          max_score=10):
    """Build a codon alignment from a protein alignment and
    corresponding nucleotide sequences

    Arguments:
     - pro_align  - a protein MultipleSeqAlignment object
     - nucl_seqs  - an object returned by SeqIO.parse or SeqIO.index
                    or a collection of SeqRecord.
     - corr_dict  - a dict that maps protein id to nucleotide id
     - gap_char   - gap character, passed to the codon alphabet and to
                    the protein/nucleotide correspondence check
     - unknown    - accepted but not used by this function
     - codon_table - codon table used for the default alphabet and for
                    the correspondence check
     - alphabet   - alphabet for the returned codon alignment; built
                    from codon_table and gap_char when None
     - complete_protein - whether the sequence begins with a start
                          codon
     - anchor_len - passed to the correspondence check
     - max_score  - passed on when building each codon record

    Proteins and nucleotide sequences are paired by position (list,
    tuple or generator), by id (dict or indexed file), or through
    corr_dict when it is given.

    Return a CodonAlignment object

    Raises TypeError for arguments of the wrong type, ValueError when
    records cannot be paired or do not match, and RuntimeError when
    corr_dict has fewer entries than there are protein records.

    >>> from SAP.Bio.Alphabet import IUPAC
    >>> from SAP.Bio.Seq import Seq
    >>> from SAP.Bio.SeqRecord import SeqRecord
    >>> from SAP.Bio.Align import MultipleSeqAlignment
    >>> seq1 = SeqRecord(Seq('TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGTGGGGCGCTGGAG',
    ...     alphabet=IUPAC.IUPACUnambiguousDNA()), id='pro1')
    >>> seq2 = SeqRecord(Seq('TCAGGGACTTCGAGAACCAAGCGCTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGTGGAGCACTGGAG',
    ...     alphabet=IUPAC.IUPACUnambiguousDNA()), id='pro2')
    >>> pro1 = SeqRecord(Seq('SGTARTKLLLLLAALCAAGGALE', alphabet=IUPAC.protein),id='pro1')
    >>> pro2 = SeqRecord(Seq('SGTSRTKRLLLLAALGAAGGALE', alphabet=IUPAC.protein),id='pro2')
    >>> aln = MultipleSeqAlignment([pro1, pro2])
    >>> codon_aln = build(aln, [seq1, seq2])
    >>> print(codon_aln)
    CodonAlphabet(Standard) CodonAlignment with 2 rows and 69 columns (23 codons)
    TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGT...GAG pro1
    TCAGGGACTTCGAGAACCAAGCG-CTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGT...GAG pro2

    """
    # TODO
    # add an option to allow the user to specify the returned object?

    from SAP.Bio.Alphabet import ProteinAlphabet
    from SAP.Bio.Align import MultipleSeqAlignment

    # check the type of object of pro_align
    if not isinstance(pro_align, MultipleSeqAlignment):
        raise TypeError("the first argument should be a MultipleSeqAlignment "
                        "object")
    # check the alphabet of pro_align
    for pro in pro_align:
        if not isinstance(pro.seq.alphabet, ProteinAlphabet):
            raise TypeError("Alphabet Error!\nThe input alignment should be "
                            "a *PROTEIN* alignment")
    if alphabet is None:
        alphabet = _get_codon_alphabet(codon_table, gap_char=gap_char)
    # check whether the number of seqs in pro_align and nucl_seqs is
    # the same
    pro_num = len(pro_align)
    if corr_dict is None:
        if nucl_seqs.__class__.__name__ == "generator":
            # a generator (e.g. from SeqIO.parse()) is materialised as a
            # tuple so that it can be counted and paired by position
            nucl_seqs = tuple(nucl_seqs)
        nucl_num = len(nucl_seqs)
        if pro_num > nucl_num:
            raise ValueError("More Number of SeqRecords in Protein Alignment "
                             "({0}) than the Number of Nucleotide SeqRecords "
                             "({1}) are found!".format(pro_num, nucl_num))

        # Determine the protein sequences and nucl sequences
        # correspondance. If nucl_seqs is a list, tuple or read by
        # SeqIO.parse(), we assume the order of sequences in pro_align
        # and nucl_seqs are the same. If nucl_seqs is a dict or read by
        # SeqIO.index(), we match seqs in pro_align and those in
        # nucl_seq by their id.
        if nucl_seqs.__class__.__name__ in ("_IndexedSeqFileDict", "dict"):
            corr_method = 1
        elif nucl_seqs.__class__.__name__ in ("list", "tuple"):
            corr_method = 0
        else:
            raise TypeError("Nucl Sequences Error, Unknown type to assign "
                            "correspondance method")
    else:
        if not isinstance(corr_dict, dict):
            raise TypeError("corr_dict should be a dict that corresponds "
                            "protein id to nucleotide id!")
        if len(corr_dict) >= pro_num:
            # read by SeqIO.parse()
            if nucl_seqs.__class__.__name__ == "generator":
                from SAP.Bio import SeqIO
                nucl_seqs = SeqIO.to_dict(nucl_seqs)
            elif nucl_seqs.__class__.__name__ in ("list", "tuple"):
                nucl_seqs = dict((i.id, i) for i in nucl_seqs)
            elif nucl_seqs.__class__.__name__ in \
                    ("_IndexedSeqFileDict", "dict"):
                pass
            else:
                raise TypeError("Nucl Sequences Error, Unknown type of "
                                "Nucleotide Records!")
            corr_method = 2
        else:
            raise RuntimeError("Number of items in corr_dict ({0}) is less "
                               "than number of protein records "
                               "({1})".format(len(corr_dict), pro_num))

    # set up pro-nucl correspondance based on corr_method
    # corr_method = 0, consecutive pairing
    if corr_method == 0:
        pro_nucl_pair = izip(pro_align, nucl_seqs)
    # corr_method = 1, keyword pairing
    elif corr_method == 1:
        nucl_id = set(nucl_seqs.keys())
        pro_id = set(i.id for i in pro_align)
        # check if there is pro_id that does not have a nucleotide match
        missing = pro_id - nucl_id
        if missing:
            raise ValueError("Protein Record {0} cannot find a nucleotide "
                             "sequence match, please check the "
                             "id".format(', '.join(missing)))
        pro_nucl_pair = [(pro_rec, nucl_seqs[pro_rec.id])
                         for pro_rec in pro_align]
    # corr_method = 2, dict pairing
    elif corr_method == 2:
        pro_nucl_pair = []
        for pro_rec in pro_align:
            try:
                nucl_id = corr_dict[pro_rec.id]
            except KeyError:
                # Report the problem to the caller; library code must not
                # print and terminate the interpreter.
                raise ValueError("Protein record (%s) is not in corr_dict!"
                                 % pro_rec.id)
            pro_nucl_pair.append((pro_rec, nucl_seqs[nucl_id]))

    codon_aln = []
    shift = False
    for pair in pro_nucl_pair:
        # Beaware that the following span corresponds to an ungapped
        # nucleotide sequence.
        corr_span = _check_corr(pair[0],
                                pair[1],
                                gap_char=gap_char,
                                codon_table=codon_table,
                                complete_protein=complete_protein,
                                anchor_len=anchor_len)
        if not corr_span:
            # The two ids are separate positional arguments; passing them
            # as one tuple would make str.format() raise IndexError.
            raise ValueError("Protein Record {0} and Nucleotide Record {1} do"
                             " not match!".format(pair[0].id, pair[1].id))
        # NOTE(review): complete_protein is always False here, regardless
        # of the argument given to build() - confirm this is intended.
        codon_rec = _get_codon_rec(pair[0],
                                   pair[1],
                                   corr_span,
                                   alphabet=alphabet,
                                   complete_protein=False,
                                   max_score=max_score)
        codon_aln.append(codon_rec)
        if corr_span[1] == 2:
            # at least one record needs the shift-aware alignment below
            shift = True
    if shift:
        return CodonAlignment(_align_shift_recs(codon_aln), alphabet=alphabet)
    return CodonAlignment(codon_aln, alphabet=alphabet)
Exemplo n.º 25
0
def build(pro_align, nucl_seqs, corr_dict=None, gap_char='-', unknown='X',
          codon_table=default_codon_table, alphabet=None,
          complete_protein=False, anchor_len=10, max_score=10):
    """Build a codon alignment from a protein alignment and
    corresponding nucleotide sequences

    Arguments:
     - pro_align  - a protein MultipleSeqAlignment object
     - nucl_seqs  - an object returned by SeqIO.parse or SeqIO.index
                    or a collection of SeqRecord (list, tuple or dict).
     - corr_dict  - a dict that maps protein id to nucleotide id
     - gap_char   - gap character, forwarded to _check_corr and used
                    to build the default codon alphabet
     - unknown    - accepted for interface compatibility; not used by
                    this function
     - codon_table - codon table, forwarded to _check_corr and used
                    to build the default codon alphabet
     - alphabet   - alphabet for the returned codon alignment; when
                    None it is derived from codon_table and gap_char
     - complete_protein - whether the sequence begins with a start
                          codon (forwarded to _check_corr)
     - anchor_len - forwarded to _check_corr
     - max_score  - forwarded to _get_codon_rec

    Pairing of protein and nucleotide records:
     - corr_dict given: records are paired through corr_dict
       (protein id -> nucleotide id).
     - nucl_seqs is a dict or was read by SeqIO.index(): records are
       paired by identical id.
     - nucl_seqs is a list, a tuple or was read by SeqIO.parse():
       records are paired by position.

    Raises:
     - TypeError  - pro_align is not a protein MultipleSeqAlignment,
                    corr_dict is not a dict, or nucl_seqs is of an
                    unsupported type.
     - ValueError - fewer nucleotide records than protein records, a
                    protein id without a nucleotide match or without
                    a corr_dict entry, or a protein/nucleotide pair
                    that does not correspond.
     - RuntimeError - corr_dict has fewer items than pro_align has
                    records.

    Return a CodonAlignment object

    >>> from SAP.Bio.Alphabet import IUPAC
    >>> from SAP.Bio.Seq import Seq
    >>> from SAP.Bio.SeqRecord import SeqRecord
    >>> from SAP.Bio.Align import MultipleSeqAlignment
    >>> seq1 = SeqRecord(Seq('TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGTGGGGCGCTGGAG',
    ...     alphabet=IUPAC.IUPACUnambiguousDNA()), id='pro1')
    >>> seq2 = SeqRecord(Seq('TCAGGGACTTCGAGAACCAAGCGCTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGTGGAGCACTGGAG',
    ...     alphabet=IUPAC.IUPACUnambiguousDNA()), id='pro2')
    >>> pro1 = SeqRecord(Seq('SGTARTKLLLLLAALCAAGGALE', alphabet=IUPAC.protein),id='pro1')
    >>> pro2 = SeqRecord(Seq('SGTSRTKRLLLLAALGAAGGALE', alphabet=IUPAC.protein),id='pro2')
    >>> aln = MultipleSeqAlignment([pro1, pro2])
    >>> codon_aln = build(aln, [seq1, seq2])
    >>> print(codon_aln)
    CodonAlphabet(Standard) CodonAlignment with 2 rows and 69 columns (23 codons)
    TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGT...GAG pro1
    TCAGGGACTTCGAGAACCAAGCG-CTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGT...GAG pro2

    """
    # TODO
    # add an option to allow the user to specify the returned object?

    from SAP.Bio.Alphabet import ProteinAlphabet
    from SAP.Bio.Align import MultipleSeqAlignment

    # check the type of object of pro_align
    if not isinstance(pro_align, MultipleSeqAlignment):
        raise TypeError("the first argument should be a MultipleSeqAlignment "
                        "object")
    # check the alphabet of pro_align
    for pro in pro_align:
        if not isinstance(pro.seq.alphabet, ProteinAlphabet):
            raise TypeError("Alphabet Error!\nThe input alignment should be "
                            "a *PROTEIN* alignment")
    if alphabet is None:
        alphabet = _get_codon_alphabet(codon_table, gap_char=gap_char)

    # Container types are recognised by class name so that the private
    # _IndexedSeqFileDict class does not have to be imported here.
    dict_like = ("_IndexedSeqFileDict", "dict")
    sequence_like = ("list", "tuple")

    # check whether the number of seqs in pro_align and nucl_seqs is
    # the same
    pro_num = len(pro_align)
    if corr_dict is None:
        if nucl_seqs.__class__.__name__ == "generator":
            # SeqIO.parse() returns a generator; materialise it so that
            # it can be counted and paired by position.
            nucl_seqs = tuple(nucl_seqs)
        nucl_num = len(nucl_seqs)
        if pro_num > nucl_num:
            raise ValueError("More Number of SeqRecords in Protein Alignment "
                             "({0}) than the Number of Nucleotide SeqRecords "
                             "({1}) are found!".format(pro_num, nucl_num))

        # Determine the protein sequences and nucl sequences
        # correspondence. If nucl_seqs is a list, tuple or read by
        # SeqIO.parse(), we assume the order of sequences in pro_align
        # and nucl_seqs are the same. If nucl_seqs is a dict or read by
        # SeqIO.index(), we match seqs in pro_align and those in
        # nucl_seq by their id.
        nucl_type = nucl_seqs.__class__.__name__
        if nucl_type in dict_like:
            corr_method = 1
        elif nucl_type in sequence_like:
            corr_method = 0
        else:
            raise TypeError("Nucl Sequences Error, Unknown type to assign "
                            "correspondance method")
    else:
        if not isinstance(corr_dict, dict):
            raise TypeError("corr_dict should be a dict that corresponds "
                            "protein id to nucleotide id!")
        if len(corr_dict) < pro_num:
            raise RuntimeError("Number of items in corr_dict ({0}) is less "
                               "than number of protein records "
                               "({1})".format(len(corr_dict), pro_num))
        # Normalise nucl_seqs to an id -> record mapping so that the
        # ids stored in corr_dict can be looked up directly.
        nucl_type = nucl_seqs.__class__.__name__
        if nucl_type == "generator":
            # read by SeqIO.parse()
            from SAP.Bio import SeqIO
            nucl_seqs = SeqIO.to_dict(nucl_seqs)
        elif nucl_type in sequence_like:
            nucl_seqs = dict((i.id, i) for i in nucl_seqs)
        elif nucl_type not in dict_like:
            raise TypeError("Nucl Sequences Error, Unknown type of "
                            "Nucleotide Records!")
        corr_method = 2

    # set up pro-nucl correspondence based on corr_method
    # corr_method = 0, consecutive pairing
    if corr_method == 0:
        pro_nucl_pair = izip(pro_align, nucl_seqs)
    # corr_method = 1, keyword pairing
    elif corr_method == 1:
        nucl_id = set(nucl_seqs.keys())
        pro_id = set([i.id for i in pro_align])
        # check if there is pro_id that does not have a nucleotide match
        diff = pro_id - nucl_id
        if diff:
            raise ValueError("Protein Record {0} cannot find a nucleotide "
                             "sequence match, please check the "
                             "id".format(', '.join(diff)))
        pro_nucl_pair = [(pro_rec, nucl_seqs[pro_rec.id])
                         for pro_rec in pro_align]
    # corr_method = 2, dict pairing
    elif corr_method == 2:
        pro_nucl_pair = []
        for pro_rec in pro_align:
            try:
                nucl_id = corr_dict[pro_rec.id]
            except KeyError:
                # Raise instead of print + exit(1): a library function
                # must not terminate the caller's interpreter, and this
                # matches the error raised by the id-pairing branch.
                raise ValueError("Protein record (%s) is not in corr_dict!"
                                 % pro_rec.id)
            pro_nucl_pair.append((pro_rec, nucl_seqs[nucl_id]))

    codon_aln = []
    shift = False
    for pro_rec, nucl_rec in pro_nucl_pair:
        # Be aware that the following span corresponds to an ungapped
        # nucleotide sequence.
        corr_span = _check_corr(pro_rec, nucl_rec, gap_char=gap_char,
                                codon_table=codon_table,
                                complete_protein=complete_protein,
                                anchor_len=anchor_len)
        if not corr_span:
            # The ids are passed as two positional arguments; passing
            # them as a single tuple made "{1}" raise IndexError.
            raise ValueError("Protein Record {0} and Nucleotide Record {1} do"
                             " not match!".format(pro_rec.id, nucl_rec.id))
        # NOTE(review): the caller's complete_protein value is not
        # forwarded here (always False) -- confirm this is intended.
        codon_rec = _get_codon_rec(pro_rec, nucl_rec, corr_span,
                                   alphabet=alphabet,
                                   complete_protein=False,
                                   max_score=max_score)
        codon_aln.append(codon_rec)
        # A second span element equal to 2 switches the result to the
        # shift-aligned path below.
        if corr_span[1] == 2:
            shift = True
    if shift:
        return CodonAlignment(_align_shift_recs(codon_aln), alphabet=alphabet)
    return CodonAlignment(codon_aln, alphabet=alphabet)
Exemplo n.º 26
0
        """ __str__(self) -> ""

            Returns a formatted string with information about the feature set
        """
        outstr = ["\n<%s: %s %d features>" % (self.__class__, self.name, len(self.features))]
        return "\n".join(outstr)


################################################################################
# RUN AS SCRIPT
################################################################################

if __name__ == "__main__":
    from SAP.Bio import SeqIO

    # Ad-hoc smoke test: read one GenBank record from a hard-coded local
    # path and collect its CDS features into a FeatureSet.
    genbank_entry = SeqIO.read("/data/Genomes/Bacteria/Nanoarchaeum_equitans/NC_005213.gbk", "gb")

    # Test code
    gdfs = FeatureSet(0, "Nanoarchaeum equitans CDS")
    cds_features = (entry for entry in genbank_entry.features
                    if entry.type == "CDS")
    for feature in cds_features:
        gdfs.add_feature(feature)

    # print len(gdfs)
    # print gdfs.get_ids()
    # gdfs.del_feature(560)
    # print gdfs.get_ids()
    # print gdfs.get_features()
    # for feature in gdfs.get_features():
    #    print feature.id, feature.start, feature.end
    # print gdfs[500]