示例#1
0
def convert(in_file, in_format, out_file, out_format, alphabet=None):
    """Convert between two alignment files, returns number of alignments.

        - in_file - an input handle or filename
        - in_format - input file format, lower case string
        - out_file - an output handle or filename
        - out_format - output file format, lower case string
        - alphabet - optional alphabet to assume

    **NOTE** - If you provide an output filename, it will be opened which will
    overwrite any existing file without warning. This may happen even if the
    conversion is aborted (e.g. an invalid out_format name is given).
    """
    # TODO - Add optimised versions of important conversions
    # For now the work is simply delegated to this module's parse() and
    # write() functions.
    with as_handle(in_file, 'rU') as source:
        # The input is opened (and handed to parse) before the output file
        # is touched. The third positional argument is left as None.
        parsed = parse(source, in_format, None, alphabet)

        # write() is what validates out_format, which unfortunately only
        # happens once the output file has already been opened.
        with as_handle(out_file, 'w') as sink:
            written = write(parsed, sink, out_format)

    return written
示例#2
0
def convert(in_file, in_format, out_file, out_format, alphabet=None):
    """Convert between two sequence file formats, return number of records.

        - in_file - an input handle or filename
        - in_format - input file format, lower case string
        - out_file - an output handle or filename
        - out_format - output file format, lower case string
        - alphabet - optional alphabet to assume

    Filenames are opened in binary mode when the corresponding format is
    listed in ``_BinaryFormats`` and in text mode otherwise.

    **NOTE** - If you provide an output filename, it will be opened which will
    overwrite any existing file without warning. This may happen even if
    the conversion is aborted (e.g. an invalid out_format name is given).

    For example, going from a filename to a handle:

    >>> from anarci.Bio import SeqIO
    >>> try:
    ...     from StringIO import StringIO # Python 2
    ... except ImportError:
    ...     from io import StringIO # Python 3
    ...
    >>> handle = StringIO("")
    >>> SeqIO.convert("Quality/example.fastq", "fastq", handle, "fasta")
    3
    >>> print(handle.getvalue())
    >EAS54_6_R1_2_1_413_324
    CCCTTCTTGTCTTCAGCGTTTCTCC
    >EAS54_6_R1_2_1_540_792
    TTGGCAGGCCAAGGCCGATGGATCA
    >EAS54_6_R1_2_1_443_348
    GTTGCTTCTGGCGTGGGTGGGGGGG
    <BLANKLINE>
    """
    # Binary formats (e.g. SFF) must be read in binary mode.
    in_mode = 'rb' if in_format in _BinaryFormats else 'rU'

    # Choose the output mode from the same _BinaryFormats table used for the
    # input side (and by write()), rather than a separate hard-coded list
    # that could drift out of sync with it.
    out_mode = 'wb' if out_format in _BinaryFormats else 'w'

    # The format arguments are only checked (and error messages issued) by
    # _handle_convert, i.e. after the output file has already been opened,
    # which is a shame.
    from ._convert import _handle_convert  # Lazy import
    with as_handle(in_file, in_mode) as in_handle:
        with as_handle(out_file, out_mode) as out_handle:
            count = _handle_convert(in_handle, in_format, out_handle,
                                    out_format, alphabet)
    return count
示例#3
0
def write(alignments, handle, format):
    """Write complete set of alignments to a file.

    Arguments:
      - alignments - A list (or iterator) of Alignment objects (ideally the
        new MultipleSeqAlignment objects), or (if using Biopython
        1.54 or later) a single alignment object.
      - handle    - File handle object to write to, or filename as string
        (note older versions of Biopython only took a handle).
      - format    - lower case string describing the file format to write.

    You should close the handle after calling this function.

    Returns the number of alignments written (as an integer).

    Raises TypeError if format is not a string, or (when falling back to
    the SeqIO writers) if an item is not an Alignment. Raises ValueError
    if format is empty, not lower case, read-only or unknown.
    """
    from anarci.Bio import SeqIO

    # Validate the format name first so the caller gets a helpful message.
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format.lower() != format:
        raise ValueError("Format string '%s' should be lower case" % format)

    # A single alignment is accepted too; wrap it so the code below can
    # always iterate.
    if isinstance(alignments, Alignment):
        alignments = [alignments]

    with as_handle(handle, 'w') as out_handle:
        if format in _FormatToWriter:
            # A dedicated alignment writer exists for this format.
            count = _FormatToWriter[format](out_handle).write_file(alignments)
        elif format in SeqIO._FormatToWriter:
            # No dedicated writer: let the SeqIO writer do the dirty work,
            # one alignment at a time.
            # TODO - Can we make one call to SeqIO.write() and count the alignments?
            count = 0
            for item in alignments:
                if not isinstance(item, Alignment):
                    raise TypeError("Expect a list or iterator of Alignment "
                                    "objects, got: %r" % item)
                SeqIO.write(item, out_handle, format)
                count += 1
        else:
            # Nothing can write this format; distinguish a read-only format
            # from a completely unknown one.
            if format in _FormatToIterator or format in SeqIO._FormatToIterator:
                raise ValueError(
                    "Reading format '%s' is supported, but not writing" % format)
            raise ValueError("Unknown format '%s'" % format)

    assert isinstance(count, int), (
        "Internal error - the underlying %s "
        "writer should have returned the alignment count, not %s"
        % (format, repr(count)))

    return count
示例#4
0
def parse(handle, format=None, **kwargs):
    """Iterate over a search output file, yielding QueryResult objects.

     - handle - Handle to the file, or the filename as a string.
     - format - Lower case string denoting one of the supported formats.
     - kwargs - Format-specific keyword arguments.

    This is a generator function, so nothing (including opening the file)
    happens until iteration starts. It is used to iterate over each query
    in a given search output file:

    >>> from anarci.Bio import SearchIO
    >>> qresults = SearchIO.parse('Blast/mirna.xml', 'blast-xml')
    >>> qresults
    <generator object ...>
    >>> for qresult in qresults:
    ...     print("Search %s has %i hits" % (qresult.id, len(qresult)))
    ...
    Search 33211 has 100 hits
    Search 33212 has 44 hits
    Search 33213 has 95 hits

    Depending on the file format, `parse` may also accept additional keyword
    argument(s) that modify the behavior of the format parser. Here is a
    simple example, where the keyword argument enables parsing of a commented
    BLAST tabular output file:

    >>> from anarci.Bio import SearchIO
    >>> for qresult in SearchIO.parse('Blast/mirna.tab', 'blast-tab', comments=True):
    ...     print("Search %s has %i hits" % (qresult.id, len(qresult)))
    ...
    Search 33211 has 100 hits
    Search 33212 has 44 hits
    Search 33213 has 95 hits

    """
    # Resolve the format name to its iterator (this is also where the
    # format argument gets checked).
    make_iterator = get_processor(format, _ITERATOR_MAP)

    # HACK: force BLAST XML decoding to use utf-8 (Python 3 only)
    wants_utf8 = format == 'blast-xml' and sys.version_info[0] > 2
    open_options = {'encoding': 'utf-8'} if wants_utf8 else {}

    # Keep the file open for as long as results are being pulled from it.
    with as_handle(handle, 'rU', **open_options) as source_file:
        for qresult in make_iterator(source_file, **kwargs):
            yield qresult
示例#5
0
def write(qresults, handle, format=None, **kwargs):
    """Write QueryResult objects to a file in the given format.

     - qresults - An iterator returning QueryResult objects or a single
                  QueryResult object.
     - handle   - Handle to the file, or the filename as a string.
     - format   - Lower case string denoting one of the supported formats.
     - kwargs   - Format-specific keyword arguments.

    The `write` function writes QueryResult object(s) into the given output
    handle / filename. You can supply it with a single QueryResult object or an
    iterable returning one or more QueryResult objects. In both cases, the
    function will return a tuple of four values: the number of QueryResult, Hit,
    HSP, and HSPFragment objects it writes to the output file::

        from anarci.Bio import SearchIO
        qresults = SearchIO.parse('Blast/mirna.xml', 'blast-xml')
        SearchIO.write(qresults, 'results.tab', 'blast-tab')
        <stdout> (3, 239, 277, 277)

    The output of different formats may be adjusted using the format-specific
    keyword arguments. Here is an example that writes BLAT PSL output file with
    a header::

        from anarci.Bio import SearchIO
        qresults = SearchIO.parse('Blat/psl_34_001.psl', 'blat-psl')
        SearchIO.write(qresults, 'results.tab', 'blat-psl', header=True)
        <stdout> (2, 13, 22, 26)

    """
    # Normalise the input to an iterator; a lone QueryResult becomes a
    # one-item sequence first.
    single = isinstance(qresults, QueryResult)
    qresults = iter([qresults] if single else qresults)

    # Resolve the format name to its writer class (this is also where the
    # format argument gets checked), before the output is opened.
    make_writer = get_processor(format, _WRITER_MAP)

    with as_handle(handle, 'w') as target_file:
        # The writer reports how many objects of each level it wrote.
        counts = make_writer(target_file, **kwargs).write_file(qresults)
        n_qresults, n_hits, n_hsps, n_frags = counts

    return n_qresults, n_hits, n_hsps, n_frags
示例#6
0
def parse(handle, format='fasta', alphabet=None):
    r"""Turns a sequence file into an iterator returning SeqRecords.

        - handle   - handle to the file, or the filename as a string
          (note older versions of Biopython only took a handle).
        - format   - lower case string describing the file format.
        - alphabet - optional Alphabet object, useful when the sequence type
          cannot be automatically inferred from the file itself
          (e.g. format="fasta" or "tab")

    Typical usage, opening a file to read in, and looping over the record(s):

    >>> from anarci.Bio import SeqIO
    >>> filename = "Fasta/sweetpea.nu"
    >>> for record in SeqIO.parse(filename, "fasta"):
    ...    print("ID %s" % record.id)
    ...    print("Sequence length %i" % len(record))
    ...    print("Sequence alphabet %s" % record.seq.alphabet)
    ID gi|3176602|gb|U78617.1|LOU78617
    Sequence length 309
    Sequence alphabet SingleLetterAlphabet()

    For file formats like FASTA where the alphabet cannot be determined, it
    may be useful to specify the alphabet explicitly:

    >>> from anarci.Bio import SeqIO
    >>> from anarci.Bio.Alphabet import generic_dna
    >>> filename = "Fasta/sweetpea.nu"
    >>> for record in SeqIO.parse(filename, "fasta", generic_dna):
    ...    print("ID %s" % record.id)
    ...    print("Sequence length %i" % len(record))
    ...    print("Sequence alphabet %s" % record.seq.alphabet)
    ID gi|3176602|gb|U78617.1|LOU78617
    Sequence length 309
    Sequence alphabet DNAAlphabet()

    If you have a string 'data' containing the file contents, you must
    first turn this into a handle in order to parse it:

    >>> data = ">Alpha\nACCGGATGTA\n>Beta\nAGGCTCGGTTA\n"
    >>> from anarci.Bio import SeqIO
    >>> try:
    ...     from StringIO import StringIO # Python 2
    ... except ImportError:
    ...     from io import StringIO # Python 3
    ...
    >>> for record in SeqIO.parse(StringIO(data), "fasta"):
    ...     print("%s %s" % (record.id, record.seq))
    Alpha ACCGGATGTA
    Beta AGGCTCGGTTA

    Use the Bio.SeqIO.read(...) function when you expect a single record
    only.

    Raises TypeError if format is not a string, and ValueError if format is
    empty, not lower case or unknown, or if alphabet is not an Alphabet or
    AlphabetEncoder. As this is a generator function, these are only raised
    once iteration starts.
    """
    # NOTE - The above docstring has some raw \n characters needed
    # for the StringIO example, hence the whole docstring is in raw
    # string mode (see the leading r before the opening quote).
    from anarci.Bio import AlignIO

    # Try and give helpful error messages:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)
    if alphabet is not None and not isinstance(alphabet,
                                               (Alphabet, AlphabetEncoder)):
        raise ValueError("Invalid alphabet, %r" % alphabet)

    # Binary formats must be opened in binary mode when given as a filename,
    # matching what convert() and write() already do via _BinaryFormats.
    mode = 'rb' if format in _BinaryFormats else 'rU'

    with as_handle(handle, mode) as fp:
        # Map the file format to a sequence iterator:
        if format in _FormatToIterator:
            iterator_generator = _FormatToIterator[format]
            if alphabet is None:
                i = iterator_generator(fp)
            else:
                try:
                    # Initially assume the alphabet argument is supported
                    i = iterator_generator(fp, alphabet=alphabet)
                except TypeError:
                    # It isn't, so impose the alphabet on the records
                    i = _force_alphabet(iterator_generator(fp), alphabet)
        elif format in AlignIO._FormatToIterator:
            # Use Bio.AlignIO to read in the alignments, flattening each
            # alignment into its individual records
            i = (r
                 for alignment in AlignIO.parse(fp, format, alphabet=alphabet)
                 for r in alignment)
        else:
            raise ValueError("Unknown format '%s'" % format)
        # Re-yield inside the with block so the handle stays open while the
        # caller is still iterating.
        for r in i:
            yield r
示例#7
0
def write(sequences, handle, format):
    """Write complete set of sequences to a file.

        - sequences - A list (or iterator) of SeqRecord objects, or (if using
          Biopython 1.54 or later) a single SeqRecord.
        - handle    - File handle object to write to, or filename as string
          (note older versions of Biopython only took a handle).
        - format    - lower case string describing the file format to write.

    You should close the handle after calling this function.

    Returns the number of records written (as an integer).

    Raises TypeError if format is not a string or if handle is a SeqRecord
    or a list (a sign the arguments were swapped). Raises ValueError if
    format is empty, not lower case, read-only or unknown.
    """
    from anarci.Bio import AlignIO

    # Validate the format name first so the caller gets a helpful message.
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format.lower() != format:
        raise ValueError("Format string '%s' should be lower case" % format)

    # Catch the common mistake of passing the records where the handle
    # belongs.
    if isinstance(handle, SeqRecord):
        raise TypeError("Check arguments, handle should NOT be a SeqRecord")
    if isinstance(handle, list):
        # e.g. list of SeqRecord objects
        raise TypeError("Check arguments, handle should NOT be a list")

    # A single record is accepted too (this raised an exception in older
    # versions of Biopython); wrap it in a list.
    if isinstance(sequences, SeqRecord):
        sequences = [sequences]

    # Binary formats need the output opened in binary mode.
    mode = 'wb' if format in _BinaryFormats else 'w'

    with as_handle(handle, mode) as out_handle:
        if format in _FormatToWriter:
            # A dedicated sequence writer exists for this format.
            count = _FormatToWriter[format](out_handle).write_file(sequences)
        elif format in AlignIO._FormatToWriter:
            # Try and turn all the records into a single alignment,
            # and write that using Bio.AlignIO
            alignment = MultipleSeqAlignment(sequences)
            alignment_count = AlignIO.write([alignment], out_handle, format)
            assert alignment_count == 1, (
                "Internal error - the underlying writer "
                " should have returned 1, not %r" % alignment_count)
            count = len(alignment)
            del alignment_count, alignment
        else:
            # Nothing can write this format; distinguish a read-only format
            # from a completely unknown one.
            if format in _FormatToIterator or format in AlignIO._FormatToIterator:
                raise ValueError(
                    "Reading format '%s' is supported, but not writing" % format)
            raise ValueError("Unknown format '%s'" % format)

        assert isinstance(count, int), (
            "Internal error - the underlying %s "
            "writer should have returned the record count, not %r"
            % (format, count))

    return count
示例#8
0
def parse(handle, format, seq_count=None, alphabet=None):
    """Iterate over an alignment file as MultipleSeqAlignment objects.

    Arguments:
      - handle    - handle to the file, or the filename as a string
        (note older versions of Biopython only took a handle).
      - format    - lower case string describing the file format.
      - seq_count - Optional integer, number of sequences expected in each
        alignment.  Recommended for fasta format files.
      - alphabet  - optional Alphabet object, useful when the sequence type
        cannot be automatically inferred from the file itself
        (e.g. fasta, phylip, clustal)

    If you have the file name in a string 'filename', use:

    >>> from anarci.Bio import AlignIO
    >>> filename = "Emboss/needle.txt"
    >>> format = "emboss"
    >>> for alignment in AlignIO.parse(filename, format):
    ...     print("Alignment of length %i" % alignment.get_alignment_length())
    Alignment of length 124
    Alignment of length 119
    Alignment of length 120
    Alignment of length 118
    Alignment of length 125

    If you have a string 'data' containing the file contents, use::

      from anarci.Bio import AlignIO
      from StringIO import StringIO
      my_iterator = AlignIO.parse(StringIO(data), format)

    Use the Bio.AlignIO.read() function when you expect a single record only.

    Raises TypeError if format is not a string or seq_count is not an
    integer, and ValueError if format is empty, not lower case or unknown,
    or if alphabet is not an Alphabet or AlphabetEncoder. As this is a
    generator function, these are only raised once iteration starts.
    """
    from anarci.Bio import SeqIO

    # Try and give helpful error messages:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)
    if alphabet is not None and not isinstance(alphabet,
                                               (Alphabet, AlphabetEncoder)):
        raise ValueError("Invalid alphabet, %s" % repr(alphabet))
    if seq_count is not None and not isinstance(seq_count, int):
        raise TypeError("Need integer for seq_count (sequences per alignment)")

    with as_handle(handle, 'rU') as fp:
        # Map the file format to an alignment iterator:
        if format in _FormatToIterator:
            iterator_generator = _FormatToIterator[format]
            if alphabet is None:
                i = iterator_generator(fp, seq_count)
            else:
                try:
                    # Initially assume the optional alphabet argument is supported
                    i = iterator_generator(fp, seq_count, alphabet=alphabet)
                except TypeError:
                    # It isn't supported.
                    i = _force_alphabet(iterator_generator(fp, seq_count),
                                        alphabet)

        elif format in SeqIO._FormatToIterator:
            # Exploit the existing SeqIO parser to do the dirty work!
            # (Uses the SeqIO module imported above.)
            i = _SeqIO_to_alignment_iterator(fp,
                                             format,
                                             alphabet=alphabet,
                                             seq_count=seq_count)
        else:
            raise ValueError("Unknown format '%s'" % format)

        # Re-yield inside the with block so the handle stays open while the
        # caller is still iterating.
        for a in i:
            yield a