Пример #1
0
    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string.

        The result is made of three parts: the file header (every line read
        from the start of the file until the next line looks like a query
        start), the query result read from ``offset`` until the next query
        start or the end of the file, and a mock ``>>><<<`` end marker.
        """
        handle = self._handle
        query_mark = _as_bytes('>>>')

        def _starts_query(peeked):
            # a query start line contains the mark, but does not begin with it
            return not peeked.startswith(query_mark) and query_mark in peeked

        chunks = []

        # read header first
        handle.seek(0)
        while True:
            line = handle.readline()
            chunks.append(line)
            # also stop at EOF: without this guard a file that has no query
            # line at all would keep reading empty strings forever
            if not line or _starts_query(handle.peekline()):
                break

        # and read the qresult raw string
        handle.seek(offset)
        while True:
            # preserve whitespace, don't use read_forward
            line = handle.readline()
            chunks.append(line)
            # break when we've reached qresult end
            if not line or _starts_query(handle.peekline()):
                break

        # append mock end marker, since it's not always present in the file
        chunks.append(_as_bytes('>>><<<\n'))
        return _as_bytes('').join(chunks)
Пример #2
0
    def __iter__(self):
        """Iterate over the file; yield (query ID, start offset, 0) tuples.

        A tuple is yielded for every line starting with ``qresult_end``.
        When the first line read starts with ``hmmsearch``, one more tuple
        is yielded on reaching the end of the file.
        """
        fh = self._handle
        fh.seek(0)
        record_start = fh.tell()
        id_pattern = re.compile(_as_bytes(r'Query\s*(?:sequence|HMM)?:\s*(.*)'))

        # the first line tells whether this is hmmsearch output
        current = read_forward(fh)
        from_hmmsearch = current.startswith(_as_bytes('hmmsearch'))

        while True:
            position = fh.tell()

            if current.startswith(self.qresult_start):
                qresult_key = id_pattern.search(current).group(1).strip()
                # the record begins where this start-mark line begins
                record_start = position - len(current)
            elif current.startswith(self.qresult_end):
                yield _bytes_to_string(qresult_key), record_start, 0
                record_start = position
            elif not current:
                # HACK: since hmmsearch can only have one query result
                if from_hmmsearch:
                    yield _bytes_to_string(qresult_key), record_start, 0
                break

            current = read_forward(fh)
Пример #3
0
 def __iter__(self):
     """Iterate over the file, yielding (key, start offset, length) per record.

     The key is the first semicolon-separated value on the AC line, which
     is expected to directly follow the line matching the record marker.
     """
     fh = self._handle
     fh.seek(0)
     is_record_start = self._marker_re.match
     semicolon = _as_bytes(";")
     ac_prefix = _as_bytes("AC ")
     # Skip any header before first record
     while True:
         record_offset = fh.tell()
         current = fh.readline()
         if not current or is_record_start(current):
             break
     # Should now be at the start of a record, or end of the file
     while is_record_start(current):
         # We cannot assume the record.id is the first word after ID,
         # normally the following AC line is used.
         ac_line = fh.readline()
         size = len(current) + len(ac_line)
         assert ac_line.startswith(ac_prefix)
         key = ac_line[3:].strip().split(semicolon)[0].strip()
         # Consume the rest of the record, remembering where the next starts
         while True:
             next_offset = fh.tell()
             current = fh.readline()
             if not current or is_record_start(current):
                 break
             size += len(current)
         yield _bytes_to_string(key), record_offset, size
         record_offset = next_offset
     assert not current, repr(current)
Пример #4
0
class Hmmer3TextIndexer(_BaseHmmerTextIndexer):

    """Indexer class for HMMER plain text output."""

    _parser = Hmmer3TextParser
    qresult_start = _as_bytes('Query: ')
    qresult_end = _as_bytes('//')

    def __iter__(self):
        """Iterate over the file; yield (query ID, start offset, 0) tuples.

        One tuple is yielded for every line starting with ``qresult_end``.
        """
        fh = self._handle
        fh.seek(0)
        record_start = fh.tell()
        id_pattern = re.compile(_as_bytes(_QRE_ID_LEN_PTN))

        while True:
            current = read_forward(fh)
            position = fh.tell()

            if current.startswith(self.qresult_start):
                qresult_key = id_pattern.search(current).group(1).strip()
                # the record begins where this start-mark line begins
                record_start = position - len(current)
            elif current.startswith(self.qresult_end):
                yield _bytes_to_string(qresult_key), record_start, 0
                record_start = position
            elif not current:
                # end of file
                break
Пример #5
0
 def __iter__(self):
     """Iterate over the file, yielding (key, start offset, length) per record.

     The key starts out as the value parsed from the ID line and is replaced
     by the value of a later AC line (unless an SV value was already used)
     or of a later SV line.

     Raises ValueError if the ID line has neither six nor three semicolons
     after its first two characters.
     """
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     semi_char = _as_bytes(";")
     dot_char = _as_bytes(".")
     sv_marker = _as_bytes("SV ")
     ac_marker = _as_bytes("AC ")
     # Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         # We cannot assume the record.id is the first word after ID,
         # normally the SV line is used.
         setbysv = False  # resets sv as false
         length = len(line)
         if line[2:].count(semi_char) == 6:
             # Looks like the semi colon separated style introduced in 2006
             parts = line[3:].rstrip().split(semi_char)
             if parts[1].strip().startswith(sv_marker):
                 # The SV bit gives the version
                 key = parts[0].strip() + dot_char + \
                     parts[1].strip().split()[1]
                 setbysv = True
             else:
                 key = parts[0].strip()
         elif line[2:].count(semi_char) == 3:
             # Looks like the pre 2006 style, take first word only
             key = line[3:].strip().split(None, 1)[0]
             if key.endswith(semi_char):
                 key = key[:-1]
         else:
             # The line is a bytes string: format it into the message rather
             # than concatenating, which fails with TypeError on Python 3.
             raise ValueError('Did not recognise the ID line layout:\n%r'
                              % line)
         while True:
             end_offset = handle.tell()
             line = handle.readline()
             if marker_re.match(line) or not line:
                 end_offset = handle.tell() - len(line)
                 yield _bytes_to_string(key), start_offset, length
                 start_offset = end_offset
                 break
             elif line.startswith(ac_marker) and not setbysv:
                 key = line.rstrip().split()[1]
                 if key.endswith(semi_char):
                     key = key[:-1]
             elif line.startswith(sv_marker):
                 key = line.rstrip().split()[1]
                 setbysv = True
             length += len(line)
     assert not line, repr(line)
Пример #6
0
 def __iter__(self):
     """Iterate over the file, yielding (id, start offset, length) per record.

     The id is the first word after the "@" on the title line. Sequence
     lines are read until a line starting with "+", then quality lines are
     read until as many (stripped) characters as in the sequence were seen.

     Raises ValueError if a title line does not start with "@", if the file
     ends inside the sequence section, or if the quality section does not
     match the sequence length.
     """
     handle = self._handle
     handle.seek(0)
     start_offset = handle.tell()
     line = handle.readline()
     if not line:
         # Empty file!
         return
     at_char = _as_bytes("@")
     plus_char = _as_bytes("+")
     if line[0:1] != at_char:
         raise ValueError("Problem with FASTQ @ line:\n%r" % line)
     while line:
         # assert line[0]=="@"
         # This record seems OK (so far)
         record_id = line[1:].rstrip().split(None, 1)[0]
         # Find the seq line(s)
         seq_len = 0
         length = len(line)
         while line:
             line = handle.readline()
             length += len(line)
             if line.startswith(plus_char):
                 break
             seq_len += len(line.strip())
         if not line:
             raise ValueError("Premature end of file in seq section")
         # assert line[0]=="+"
         # Find the qual line(s)
         qual_len = 0
         while line:
             if seq_len == qual_len:
                 if seq_len == 0:
                     # Special case, quality line should be just "\n"
                     line = handle.readline()
                     if line.strip():
                         raise ValueError(
                             "Expected blank quality line, not %r" % line)
                     # The blank quality line is part of this record, so it
                     # has to be counted in the reported length as well.
                     length += len(line)
                 # Should be end of record...
                 end_offset = handle.tell()
                 line = handle.readline()
                 if line and line[0:1] != at_char:
                     raise ValueError("Problem with line %r" % line)
                 break
             else:
                 line = handle.readline()
                 qual_len += len(line.strip())
                 length += len(line)
         if seq_len != qual_len:
             raise ValueError("Problem with quality section")
         yield _bytes_to_string(record_id), start_offset, length
         start_offset = end_offset
Пример #7
0
 def get_raw(self, offset):
     """Return the raw record from the file as a bytes string.

     The record is the run of lines starting with ";" found at ``offset``,
     followed by every line up to (but not including) the next line that
     starts with ";", or up to the end of the file.
     """
     handle = self._handle
     handle.seek(offset)
     semi_char = _as_bytes(";")
     lines = []
     line = handle.readline()
     # Leading ";" lines
     while line.startswith(semi_char):
         lines.append(line)
         line = handle.readline()
     # Remaining lines, until the next ";" line or end of file
     while line and not line.startswith(semi_char):
         lines.append(line)
         line = handle.readline()
     return _as_bytes("").join(lines)
Пример #8
0
    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string.

        The result is the preamble followed by the lines read from
        ``offset`` up to and including the first line that starts with
        ``qresult_end`` (or up to the end of the file). The preamble is
        ``self._preamble`` when that is non-empty; otherwise it is read from
        the start of the file up to the first ``qresult_start`` line.
        """
        handle = self._handle
        chunks = []

        # read header first
        if not self._preamble:
            handle.seek(0)
            while True:
                line = handle.readline()
                # stop at EOF as well: a file without any query start line
                # would otherwise keep returning empty strings forever
                if not line or line.startswith(self.qresult_start):
                    break
                chunks.append(line)
        else:
            chunks.append(self._preamble)

        # and read the qresult raw string
        handle.seek(offset)
        while True:
            # preserve whitespace, don't use read_forward
            line = handle.readline()
            chunks.append(line)

            # break when we've reached qresult end
            if not line or line.startswith(self.qresult_end):
                break

        return _as_bytes('').join(chunks)
Пример #9
0
 def __iter__(self):
     """Iterate over the file, yielding (key, start offset, length) per entry.

     The key is the text of the first <accession> element found in the
     entry. The length runs from the start of the line matching the entry
     marker up to and including the closing </entry> tag.

     Raises ValueError if the next entry or the end of the file is reached
     before </entry>, or if the entry has no accession.
     """
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     start_acc_marker = _as_bytes("<accession>")
     end_acc_marker = _as_bytes("</accession>")
     end_entry_marker = _as_bytes("</entry>")
     less_than = _as_bytes("<")
     # Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         length = len(line)
         # We expect the next line to be <accession>xxx</accession>
         # (possibly with leading spaces)
         # but allow it to be later on within the <entry>
         key = None
         while True:
             line = handle.readline()
             if key is None and start_acc_marker in line:
                 assert end_acc_marker in line, line
                 # 11 == len("<accession>")
                 key = line[line.find(start_acc_marker) + 11:].split(
                     less_than, 1)[0]
                 length += len(line)
             elif end_entry_marker in line:
                 # Count the closing line only up to the end of the tag
                 # (8 == len("</entry>")), so that the reported length
                 # ends at the same place as end_offset.
                 tag_end = line.find(end_entry_marker) + 8
                 length += tag_end
                 end_offset = handle.tell() - len(line) + tag_end
                 break
             elif marker_re.match(line) or not line:
                 # Start of next record or end of file
                 raise ValueError("Didn't find end of record")
             else:
                 length += len(line)
         if not key:
             raise ValueError(
                 "Did not find <accession> line in bytes %i to %i" %
                 (start_offset, end_offset))
         yield _bytes_to_string(key), start_offset, length
         # Find start of next record
         while not marker_re.match(line) and line:
             start_offset = handle.tell()
             line = handle.readline()
     assert not line, repr(line)
Пример #10
0
 def get_raw(self, offset):
     """Return the raw record from the file as a bytes string.

     Reads the title line at ``offset``, the sequence lines up to the "+"
     line, and then quality lines until their stripped length equals that
     of the sequence.

     Raises ValueError if the title line does not start with "@", if the
     file ends inside the sequence section, if the quality length cannot be
     matched to the sequence length, or if the line after the record is
     neither empty nor an "@" line.
     """
     # TODO - Refactor this and the __init__ method to reduce code duplication?
     fh = self._handle
     fh.seek(offset)
     at_sign = _as_bytes("@")
     plus_sign = _as_bytes("+")
     current = fh.readline()
     if current[0:1] != at_sign:
         raise ValueError("Problem with FASTQ @ line:\n%r" % current)
     pieces = [current]
     # Sequence section: everything up to (and including) the "+" line
     n_seq = 0
     while current:
         current = fh.readline()
         pieces.append(current)
         if current.startswith(plus_sign):
             break
         n_seq += len(current.strip())
     if not current:
         raise ValueError("Premature end of file in seq section")
     assert current[0:1] == plus_sign
     # Quality section: keep reading until the lengths agree or EOF
     n_qual = 0
     while current and n_qual != n_seq:
         current = fh.readline()
         pieces.append(current)
         n_qual += len(current.strip())
     if n_qual != n_seq:
         raise ValueError("Problem with quality section")
     if n_seq == 0:
         # Special case, quality line should be just "\n"
         blank = fh.readline()
         if blank.strip():
             raise ValueError(
                 "Expected blank quality line, not %r" % blank)
         pieces.append(blank)
     # Should be end of record...
     following = fh.readline()
     if following and following[0:1] != at_sign:
         raise ValueError("Problem with line %r" % following)
     return _as_bytes("").join(pieces)
Пример #11
0
 def get_raw(self, offset):
     """Return the raw record from the file as a bytes string.

     The record runs from the line at ``offset`` up to and including the
     first closing </entry> tag; anything after the tag on that line is
     dropped. Raises ValueError if the next record or the end of the file
     comes first.
     """
     fh = self._handle
     is_record_start = self._marker_re.match
     closing_tag = _as_bytes("</entry>")
     fh.seek(offset)
     pieces = [fh.readline()]
     while True:
         current = fh.readline()
         cut = current.find(closing_tag)
         if cut != -1:
             # keep the tag itself (8 characters), nothing after it
             pieces.append(current[:cut + 8])
             return _as_bytes("").join(pieces)
         if not current or is_record_start(current):
             # End of file, or start of next record
             raise ValueError("Didn't find end of record")
         pieces.append(current)
Пример #12
0
    def __iter__(self):
        """Iterate over the file handle; yield key, start offset, and length.

        Leading lines starting with "#" are skipped. Consecutive rows that
        share the value in column ``self._query_id_idx`` form one result.
        """
        fh = self._handle
        fh.seek(0)
        key_column = self._query_id_idx
        active_key = None
        comment_prefix = _as_bytes('#')
        separator = _as_bytes(' ')
        # start from a mock header line so that the skipping loop runs
        row = comment_prefix

        # skip the header
        while row.startswith(comment_prefix):
            block_start = fh.tell()
            row = fh.readline()

        # index the query results
        while True:
            after_row = fh.tell()

            if not row:
                break

            fields = [item for item in row.strip().split(separator) if item]
            row_key = fields[key_column]
            if active_key is None:
                active_key = row_key
            elif row_key != active_key:
                # the previous result ends where the current row begins
                row_start = after_row - len(row)
                yield (_bytes_to_string(active_key), block_start,
                       row_start - block_start)
                active_key = row_key
                block_start = row_start

            row = fh.readline()
            if not row:
                yield (_bytes_to_string(active_key), block_start,
                       after_row - block_start)
                break
Пример #13
0
class Hmmer2TextIndexer(_BaseHmmerTextIndexer):
    """Indexer for hmmer2-text format."""

    _parser = Hmmer2TextParser
    qresult_start = _as_bytes('Query')
    # end mark of a query result; hmmsearch output is handled separately in
    # __iter__ since its query end is marked differently from hmmpfam's
    qresult_end = _as_bytes('//')

    def __iter__(self):
        """Iterate over the file; yield (query ID, start offset, 0) tuples.

        A tuple is yielded for every line starting with ``qresult_end``.
        When the first line read starts with ``hmmsearch``, one more tuple
        is yielded on reaching the end of the file.
        """
        fh = self._handle
        fh.seek(0)
        record_start = fh.tell()
        id_pattern = re.compile(_as_bytes(r'Query\s*(?:sequence|HMM)?:\s*(.*)'))

        # the first line tells whether this is hmmsearch output
        current = read_forward(fh)
        from_hmmsearch = current.startswith(_as_bytes('hmmsearch'))

        while True:
            position = fh.tell()

            if current.startswith(self.qresult_start):
                qresult_key = id_pattern.search(current).group(1).strip()
                # the record begins where this start-mark line begins
                record_start = position - len(current)
            elif current.startswith(self.qresult_end):
                yield _bytes_to_string(qresult_key), record_start, 0
                record_start = position
            elif not current:
                # HACK: since hmmsearch can only have one query result
                if from_hmmsearch:
                    yield _bytes_to_string(qresult_key), record_start, 0
                break

            current = read_forward(fh)
Пример #14
0
 def __iter__(self):
     """Iterate over the file, yielding (key, start offset, length) per record.

     The key is the second word of the ACCESSION line, replaced by the
     second word of the VERSION line when that has the form "name.digits".
     Raises ValueError if a record provides no key.
     """
     fh = self._handle
     fh.seek(0)
     is_record_start = self._marker_re.match
     dot = _as_bytes(".")
     accession_prefix = _as_bytes("ACCESSION ")
     version_prefix = _as_bytes("VERSION ")
     # Skip any header before first record
     while True:
         record_offset = fh.tell()
         current = fh.readline()
         if not current or is_record_start(current):
             break
     # Should now be at the start of a record, or end of the file
     while is_record_start(current):
         # We cannot assume the record.id is the first word after LOCUS,
         # normally the first entry on the VERSION or ACCESSION line is used.
         key = None
         size = len(current)
         while True:
             next_offset = fh.tell()
             current = fh.readline()
             if not current or is_record_start(current):
                 break
             if current.startswith(accession_prefix):
                 key = current.rstrip().split()[1]
             elif current.startswith(version_prefix):
                 candidate = current.rstrip().split()[1]
                 dotted = candidate.split(dot)
                 if len(dotted) == 2 and dotted[1].isdigit():
                     # This should mimic the GenBank parser...
                     key = candidate
             size += len(current)
         if not key:
             raise ValueError(
                 "Did not find ACCESSION/VERSION lines")
         yield _bytes_to_string(key), record_offset, size
         record_offset = next_offset
     assert not current, repr(current)
Пример #15
0
 def get_raw(self, offset):
     """Return the raw record from the file as a bytes string.

     The record is the line at ``offset`` plus every following line up to
     (but not including) the next line matching the record marker, or up
     to the end of the file.
     """
     # For non-trivial file formats this must be over-ridden in the subclass
     fh = self._handle
     is_record_start = self._marker_re.match
     fh.seek(offset)
     pieces = [fh.readline()]
     current = fh.readline()
     # End of file, or start of next record => end of this record
     while current and not is_record_start(current):
         pieces.append(current)
         current = fh.readline()
     return _as_bytes("").join(pieces)
Пример #16
0
    def get_raw(self, offset):
        """Return the raw bytes string of a QueryResult object from the given offset.

        Rows are collected from ``offset`` for as long as the value in
        column ``self._query_id_idx`` stays the same as in the first row.
        """
        fh = self._handle
        fh.seek(offset)
        key_column = self._query_id_idx
        separator = _as_bytes(' ')
        active_key = None
        rows = []

        row = fh.readline()
        while row:
            fields = [item for item in row.strip().split(separator) if item]
            row_key = fields[key_column]
            if active_key is None:
                active_key = row_key
            elif row_key != active_key:
                # first row of the next query result
                break
            rows.append(row)
            row = fh.readline()

        return _as_bytes('').join(rows)
Пример #17
0
 def write(self, data):
     """Append data to the buffer and write out every complete block.

     Data is converted with ``_as_bytes`` and added to ``self._buffer``;
     while the buffer holds at least 65536 bytes, its first 65536 bytes
     are passed to ``self._write_block`` and removed from the buffer.
     """
     # TODO - Check bytes vs unicode
     block_size = 65536  # 2**16
     self._buffer += _as_bytes(data)
     # Nothing happens here while less than a full block is cached
     while len(self._buffer) >= block_size:
         self._write_block(self._buffer[:block_size])
         self._buffer = self._buffer[block_size:]
Пример #18
0
 def __init__(self, filename, format, alphabet):
     """Initialise the parent class and set the record marker for ``format``.

     Stores the marker text as ``self._marker`` and a bytes regular
     expression anchored at the line start as ``self._marker_re``.
     A format missing from the table raises KeyError.
     """
     SeqFileRandomAccess.__init__(self, filename, format, alphabet)
     record_markers = {
         "ace": "CO ",
         "embl": "ID ",
         "fasta": ">",
         "genbank": "LOCUS ",
         "gb": "LOCUS ",
         "imgt": "ID ",
         "phd": "BEGIN_SEQUENCE",
         "pir": ">..;",
         "qual": ">",
         "swiss": "ID ",
         "uniprot-xml": "<entry ",
     }
     self._marker = record_markers[format]
     anchored = "^%s" % self._marker
     self._marker_re = re.compile(_as_bytes(anchored))
Пример #19
0
 def __iter__(self):
     """Iterate over the file, yielding (key, start offset, length) per line.

     The key is the text before the first tab character of the line and
     the length is the length of the whole line. Blank lines are skipped.
     """
     handle = self._handle
     handle.seek(0)
     tab_char = _as_bytes("\t")
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if not line:
             break  # End of file
         if not line.strip():
             # Ignore blank lines. This has to be an explicit test, as
             # taking the first item of split() never raises ValueError.
             continue
         key = line.split(tab_char)[0]
         yield _bytes_to_string(key), start_offset, len(line)
Пример #20
0
    def __iter__(self):
        """Iterate over the file; yield (query ID, start offset, 0) tuples.

        One tuple is yielded for every line starting with ``qresult_end``.
        """
        fh = self._handle
        fh.seek(0)
        record_start = fh.tell()
        id_pattern = re.compile(_as_bytes(_QRE_ID_LEN_PTN))

        while True:
            current = read_forward(fh)
            position = fh.tell()

            if current.startswith(self.qresult_start):
                qresult_key = id_pattern.search(current).group(1).strip()
                # the record begins where this start-mark line begins
                record_start = position - len(current)
            elif current.startswith(self.qresult_end):
                yield _bytes_to_string(qresult_key), record_start, 0
                record_start = position
            elif not current:
                # end of file
                break
Пример #21
0
    def __iter__(self):
        """Iterate over the file handle; yield key, start offset, and length.

        Lines are skipped until one matches ``_RE_ROW_CHECK_IDX``. From
        there, consecutive rows sharing the value in the query ID column
        form one result. Nothing is yielded if no row matches.
        """
        handle = self._handle
        handle.seek(0)
        # denotes column location for query identifier
        query_id_idx = 9
        qresult_key = None
        tab_char = _as_bytes('\t')

        start_offset = handle.tell()
        line = handle.readline()
        # read through header
        # this assumes that the result row match the regex
        while not re.search(_RE_ROW_CHECK_IDX, line.strip()):
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                # No result rows: finish the generator with a plain return.
                # Raising StopIteration inside a generator is turned into
                # RuntimeError (PEP 479).
                return

        # and index the qresults
        while True:
            end_offset = handle.tell()

            cols = [x for x in line.strip().split(tab_char) if x]
            if qresult_key is None:
                qresult_key = cols[query_id_idx]
            else:
                curr_key = cols[query_id_idx]

                if curr_key != qresult_key:
                    # end_offset lies after the current row, which already
                    # belongs to the next query; the finished result ends
                    # where this row starts.
                    adj_end = end_offset - len(line)
                    yield _bytes_to_string(qresult_key), start_offset, \
                            adj_end - start_offset
                    qresult_key = curr_key
                    start_offset = adj_end

            line = handle.readline()
            if not line:
                yield _bytes_to_string(qresult_key), start_offset, \
                        end_offset - start_offset
                break
Пример #22
0
 def __iter__(self):
     """Iterate over the file, yielding (key, start offset, 0) per record.

     A record starts at a line matching the marker. Its key is the first
     word of the first following line that is neither blank nor starts
     with ";". Raises ValueError if the file ends before such a line.
     """
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     semi_char = _as_bytes(";")
     while True:
         offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line):
             # Now look for the first line which doesn't start ";"
             while True:
                 line = handle.readline()
                 if line[0:1] != semi_char and line.strip():
                     key = line.split()[0]
                     # Only the leading comment lines have been read so
                     # far, so their size is not the size of the record.
                     # Report 0 instead of that partial count.
                     # NOTE(review): assumes consumers treat a length of 0
                     # as "unknown" and use get_raw() - confirm.
                     yield _bytes_to_string(key), offset, 0
                     break
                 if not line:
                     raise ValueError("Premature end of file?")
         elif not line:
             # End of file
             break
Пример #23
0
    def __iter__(self):
        """Iterate over the file; yield (query ID, start offset, length) tuples.

        A query starts at a line that contains ``>>>`` without beginning
        with it. Its result is yielded when the next line is such a query
        line, or when the end of the file is reached. Nothing is yielded
        for a file without any query line.
        """
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()
        qresult_key = None
        query_mark = _as_bytes('>>>')

        while True:
            line = handle.readline()
            peekline = handle.peekline()
            end_offset = handle.tell()

            if not line.startswith(query_mark) and query_mark in line:
                regx = re.search(_RE_ID_DESC_SEQLEN_IDX, line)
                qresult_key = _bytes_to_string(regx.group(1))
                start_offset = end_offset - len(line)
            # yield whenever we encounter a new query or at the end of the file
            if qresult_key is not None:
                if (not peekline.startswith(query_mark)
                        and query_mark in peekline) or not line:
                    yield qresult_key, start_offset, end_offset - start_offset
                    if not line:
                        break
                    start_offset = end_offset
            elif not line:
                # End of file reached without seeing any query line; stop
                # here, as nothing else would ever end this loop.
                break
Пример #24
0
from anarci.Bio._py3k import _as_bytes, _bytes_to_string
from anarci.Bio._py3k import zip

from anarci.Bio.Alphabet import generic_dna
from anarci.Bio.SearchIO._index import SearchIndexer
from anarci.Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment


# names exported by ``from <module> import *``
__all__ = ['BlatPslParser', 'BlatPslIndexer', 'BlatPslWriter']


# precompile regex patterns
# a line passes the row check when it starts with four integer columns
# separated by whitespace
_PTR_ROW_CHECK = r'^\d+\s+\d+\s+\d+\s+\d+'
# row check compiled from the pattern as written
_RE_ROW_CHECK = re.compile(_PTR_ROW_CHECK)
# row check compiled from the pattern converted with _as_bytes
_RE_ROW_CHECK_IDX = re.compile(_as_bytes(_PTR_ROW_CHECK))


def _list_from_csv(csv_string, caster=None):
    """Transforms the given comma-separated string into a list.

    :param csv_string: comma-separated input string
    :type csv_string: string
    :param caster: function used to cast each item in the input string
                   to its intended type
    :type caster: callable, accepts string, returns object

    """
    if caster is None:
        return [x for x in csv_string.split(',') if x]
    else:
Пример #25
0
 def __init__(self, *args, **kwargs):
     """Initialise through the parent class and start with an empty preamble."""
     parent = super(_BaseHmmerTextIndexer, self)
     parent.__init__(*args, **kwargs)
     # empty bytes string until a preamble is stored
     self._preamble = _as_bytes('')
Пример #26
0
 def __init__(self, filename, format, alphabet):
     """Initialise the parent class and use a leading ";" as record marker."""
     SeqFileRandomAccess.__init__(self, filename, format, alphabet)
     # matches lines that start with a semicolon
     marker_pattern = _as_bytes("^;")
     self._marker_re = re.compile(marker_pattern)
Пример #27
0
from anarci.Bio._py3k import _as_bytes, _bytes_to_string
from anarci.Bio.Alphabet import generic_dna, generic_protein
from anarci.Bio.File import UndoHandle
from anarci.Bio.SearchIO._index import SearchIndexer
from anarci.Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment

# names exported by ``from <module> import *``
__all__ = ['FastaM10Parser', 'FastaM10Indexer']

# precompile regex patterns
# regex for program name
_RE_FLAVS = re.compile(r't?fast[afmsxy]|pr[sf][sx]|lalign|[gs]?[glso]search')
# regex for sequence ID and length ~ deals with both \n and \r\n
# groups: (1) the ID after '>>>', (2) the description, (3) the number that
# precedes the 'aa' or 'nt' unit at the end of the line
_PTR_ID_DESC_SEQLEN = r'>>>(.+?)\s+(.*?) *- (\d+) (?:aa|nt)\s*$'
# compiled from the pattern as written
_RE_ID_DESC_SEQLEN = re.compile(_PTR_ID_DESC_SEQLEN)
# compiled from the pattern converted with _as_bytes
_RE_ID_DESC_SEQLEN_IDX = re.compile(_as_bytes(_PTR_ID_DESC_SEQLEN))
# regex for qresult, hit, or hsp attribute value
# groups: (1) the attribute name from its underscore onwards, (2) the value
_RE_ATTR = re.compile(r'^; [a-z]+(_[ \w-]+):\s+(.*)$')
# regex for capturing excess start and end sequences in alignments
# (the run of '-' characters at the start and at the end, respectively)
_RE_START_EXC = re.compile(r'^-*')
_RE_END_EXC = re.compile(r'-*$')

# attribute name mappings
_HSP_ATTR_MAP = {
    '_initn': ('initn_score', int),
    '_init1': ('init1_score', int),
    '_opt': ('opt_score', int),
    '_s-w opt': ('opt_score', int),
    '_z-score': ('z_score', float),
    '_bits': ('bitscore', float),
    '_expect': ('evalue', float),