Пример #1
0
 def __iter__(self):
     """Yield (key, start offset, length) for every record in the file.

     A record starts on a line matching ``self._marker_re``; its key is the
     first semicolon-separated field of the ``AC`` line that must follow.
     """
     handle = self._handle
     handle.seek(0)
     starts_record = self._marker_re.match
     field_sep = _as_bytes(";")
     ac_prefix = _as_bytes("AC ")
     # Advance past any header until the first record line (or end of file)
     start_offset = handle.tell()
     line = handle.readline()
     while line and not starts_record(line):
         start_offset = handle.tell()
         line = handle.readline()
     # Each pass of this loop consumes exactly one record
     while starts_record(line):
         size = len(line)
         # The word after ID is not reliably the record id, so the key is
         # taken from the AC line, which is expected to come next.
         line = handle.readline()
         size += len(line)
         assert line.startswith(ac_prefix)
         key = line[3:].strip().split(field_sep)[0].strip()
         # Swallow the remaining lines up to the next record or end of file,
         # remembering the offset at which the line just read began.
         next_offset = handle.tell()
         line = handle.readline()
         while line and not starts_record(line):
             size += len(line)
             next_offset = handle.tell()
             line = handle.readline()
         yield _bytes_to_string(key), start_offset, size
         start_offset = next_offset
     # Only end of file may terminate the record loop
     assert not line, repr(line)
Пример #2
0
    def _get_raw_qresult(self, offset):
        """Return the raw bytes of one QueryResult read from a noncommented file.

        Reading starts at ``offset`` and stops at the first line whose key
        column differs from the key column of the first line read.
        """
        handle = self._handle
        handle.seek(offset)
        empty = _as_bytes('')
        column_sep = _as_bytes('\t')
        key_column = self._key_idx

        # the first line read defines the key of this qresult
        line = handle.readline()
        wanted_key = line.split(column_sep)[key_column]
        collected = [line]

        while True:
            line = handle.readline()
            try:
                line_key = line.split(column_sep)[key_column]
            except IndexError:
                # fewer columns than key_column: treat as an empty key
                line_key = empty
            # a different key means the qresult is finished
            if line_key != wanted_key:
                break
            collected.append(line)

        return empty.join(collected)
Пример #3
0
 def __iter__(self):
     """Yield (key, start offset, length) for every record in the file.

     A record starts on a line matching ``self._marker_re``; its key is the
     first semicolon-separated field of the ``AC`` line that must follow.
     """
     handle = self._handle
     handle.seek(0)
     starts_record = self._marker_re.match
     field_sep = _as_bytes(";")
     ac_prefix = _as_bytes("AC ")
     # Advance past any header until the first record line (or end of file)
     start_offset = handle.tell()
     line = handle.readline()
     while line and not starts_record(line):
         start_offset = handle.tell()
         line = handle.readline()
     # Each pass of this loop consumes exactly one record
     while starts_record(line):
         size = len(line)
         # The word after ID is not reliably the record id, so the key is
         # taken from the AC line, which is expected to come next.
         line = handle.readline()
         size += len(line)
         assert line.startswith(ac_prefix)
         key = line[3:].strip().split(field_sep)[0].strip()
         # Swallow the remaining lines up to the next record or end of file,
         # remembering the offset at which the line just read began.
         next_offset = handle.tell()
         line = handle.readline()
         while line and not starts_record(line):
             size += len(line)
             next_offset = handle.tell()
             line = handle.readline()
         yield _bytes_to_string(key), start_offset, size
         start_offset = next_offset
     # Only end of file may terminate the record loop
     assert not line, repr(line)
Пример #4
0
    def _qresult_index_commented(self):
        """Indexer for commented BLAST tabular files.

        Yields (key, start offset, length) tuples, where the key is the
        first word after ``# Query: `` exactly as read from the handle
        (it is not decoded here). Yields nothing for an empty file.
        """
        handle = self._handle
        handle.seek(0)
        # mark of the query's ID
        qid_mark = _as_bytes('# Query: ')
        # mark of the last line
        end_mark = _as_bytes('# BLAST processed')

        # The very first line of the file is the mark of a new query: every
        # later line equal to it closes the previous query block.
        start_offset = handle.tell()
        query_mark = handle.readline()
        if not query_mark:
            # Empty file: without this guard the end-of-file read would
            # compare equal to the (empty) query mark and a key that was
            # never assigned would be yielded.
            return

        while True:
            end_offset = handle.tell()
            line = handle.readline()

            if line.startswith(qid_mark):
                qresult_key = line[len(qid_mark):].split()[0]
            elif line == query_mark or line.startswith(end_mark):
                yield qresult_key, start_offset, end_offset - start_offset
                start_offset = end_offset
            elif not line:
                break
Пример #5
0
    def _qresult_index(self):
        """Indexer for noncommented BLAST tabular files.

        Yields (key, start offset, length) for each run of consecutive
        lines sharing the same value in the key column.
        """
        handle = self._handle
        handle.seek(0)
        column_sep = _as_bytes('\t')
        key_column = self._key_idx
        block_start = 0

        # the first line fixes the key of the first qresult
        line = handle.readline()
        current_key = line.split(column_sep)[key_column]

        # keep going until the line just examined was the end-of-file read
        while line:
            # take the offset before reading: a qresult is only known to
            # have ended once the first line of the next one is seen
            line_offset = handle.tell()
            line = handle.readline()
            try:
                next_key = line.split(column_sep)[key_column]
            except IndexError:
                next_key = _as_bytes('')

            if next_key != current_key:
                yield current_key, block_start, line_offset - block_start
                current_key = next_key
                block_start = line_offset
Пример #6
0
    def _qresult_index(self):
        """Indexer for noncommented BLAST tabular files.

        Yields (key, start offset, length) for each run of consecutive
        lines sharing the same value in the key column.
        """
        handle = self._handle
        handle.seek(0)
        column_sep = _as_bytes('\t')
        key_column = self._key_idx
        block_start = 0

        # the first line fixes the key of the first qresult
        line = handle.readline()
        current_key = line.split(column_sep)[key_column]

        # keep going until the line just examined was the end-of-file read
        while line:
            # take the offset before reading: a qresult is only known to
            # have ended once the first line of the next one is seen
            line_offset = handle.tell()
            line = handle.readline()
            try:
                next_key = line.split(column_sep)[key_column]
            except IndexError:
                next_key = _as_bytes('')

            if next_key != current_key:
                yield current_key, block_start, line_offset - block_start
                current_key = next_key
                block_start = line_offset
Пример #7
0
    def _qresult_index_commented(self):
        """Indexer for commented BLAST tabular files.

        Yields (key, start offset, length) tuples, where the key is the
        first word after ``# Query: `` exactly as read from the handle
        (it is not decoded here). Yields nothing for an empty file.
        """
        handle = self._handle
        handle.seek(0)
        # mark of the query's ID
        qid_mark = _as_bytes('# Query: ')
        # mark of the last line
        end_mark = _as_bytes('# BLAST processed')

        # The very first line of the file is the mark of a new query: every
        # later line equal to it closes the previous query block.
        start_offset = handle.tell()
        query_mark = handle.readline()
        if not query_mark:
            # Empty file: without this guard the end-of-file read would
            # compare equal to the (empty) query mark and a key that was
            # never assigned would be yielded.
            return

        while True:
            end_offset = handle.tell()
            line = handle.readline()

            if line.startswith(qid_mark):
                qresult_key = line[len(qid_mark):].split()[0]
            elif line == query_mark or line.startswith(end_mark):
                yield qresult_key, start_offset, end_offset - start_offset
                start_offset = end_offset
            elif not line:
                break
Пример #8
0
    def __iter__(self):
        """Yield (key, start offset, 0) for each query result in the file.

        The length slot of every yielded tuple is always 0.
        """
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()
        id_pattern = re.compile(_as_bytes(r'Query\s*(?:sequence|HMM)?:\s*(.*)'))

        line = read_forward(handle)
        # output whose first line starts with 'hmmsearch' gets special
        # treatment at end of file (see below)
        is_hmmsearch = line.startswith(_as_bytes('hmmsearch'))

        # an empty line from read_forward ends the scan
        while line:
            end_offset = handle.tell()

            if line.startswith(self.qresult_start):
                qresult_key = id_pattern.search(line).group(1).strip()
                # the qresult begins at the start of this very line
                start_offset = end_offset - len(line)
            elif line.startswith(self.qresult_end):
                yield _bytes_to_string(qresult_key), start_offset, 0
                start_offset = end_offset

            line = read_forward(handle)

        # HACK: since hmmsearch can only have one query result
        if is_hmmsearch:
            yield _bytes_to_string(qresult_key), start_offset, 0
Пример #9
0
    def _get_raw_qresult(self, offset):
        """Return the raw bytes of one QueryResult read from a noncommented file.

        Reading starts at ``offset`` and stops at the first line whose key
        column differs from the key column of the first line read.
        """
        handle = self._handle
        handle.seek(offset)
        empty = _as_bytes('')
        column_sep = _as_bytes('\t')
        key_column = self._key_idx

        # the first line read defines the key of this qresult
        line = handle.readline()
        wanted_key = line.split(column_sep)[key_column]
        collected = [line]

        while True:
            line = handle.readline()
            try:
                line_key = line.split(column_sep)[key_column]
            except IndexError:
                # fewer columns than key_column: treat as an empty key
                line_key = empty
            # a different key means the qresult is finished
            if line_key != wanted_key:
                break
            collected.append(line)

        return empty.join(collected)
Пример #10
0
    def get_raw(self, offset):
        """Return the file header followed by the raw query result at ``offset``.

        The header is everything from the start of the file up to the first
        line that opens a query. A mock ``>>><<<`` end marker is always
        appended to the returned bytes.
        """
        handle = self._handle
        query_mark = _as_bytes('>>>')

        def opens_query(text):
            # a query opens on a line that contains the mark somewhere
            # other than at its very beginning
            return query_mark in text and not text.startswith(query_mark)

        pieces = []

        # read header first
        handle.seek(0)
        while True:
            line = handle.readline()
            pieces.append(line)
            # Stop at end of file as well: without this a file holding no
            # query line at all would keep this loop spinning forever.
            if not line or opens_query(handle.peekline()):
                break

        # and read the qresult raw string
        handle.seek(offset)
        while True:
            # preserve whitespace, don't use read_forward
            line = handle.readline()
            pieces.append(line)
            # break when the next line opens another query, or at end of file
            if not line or opens_query(handle.peekline()):
                break

        # append mock end marker, since it's not always present in the file
        return _as_bytes('').join(pieces) + _as_bytes('>>><<<\n')
Пример #11
0
class Hmmer3TextIndexer(_BaseHmmerTextIndexer):
    """Indexer class for HMMER plain text output."""

    _parser = Hmmer3TextParser
    qresult_start = _as_bytes('Query: ')
    qresult_end = _as_bytes('//')

    def __iter__(self):
        """Yield (key, start offset, 0) for each query result in the file.

        The length slot of every yielded tuple is always 0.
        """
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()
        id_pattern = re.compile(_as_bytes(_QRE_ID_LEN_PTN))

        # an empty line from read_forward ends the scan
        line = read_forward(handle)
        while line:
            end_offset = handle.tell()

            if line.startswith(self.qresult_start):
                qresult_key = id_pattern.search(line).group(1).strip()
                # the qresult begins at the start of this very line
                start_offset = end_offset - len(line)
            elif line.startswith(self.qresult_end):
                yield _bytes_to_string(qresult_key), start_offset, 0
                start_offset = end_offset

            line = read_forward(handle)
Пример #12
0
    def get_raw(self, offset):
        """Return the file header followed by the raw query result at ``offset``.

        The header is everything from the start of the file up to the first
        line that opens a query. A mock ``>>><<<`` end marker is always
        appended to the returned bytes.
        """
        handle = self._handle
        query_mark = _as_bytes('>>>')

        def opens_query(text):
            # a query opens on a line that contains the mark somewhere
            # other than at its very beginning
            return query_mark in text and not text.startswith(query_mark)

        pieces = []

        # read header first
        handle.seek(0)
        while True:
            line = handle.readline()
            pieces.append(line)
            # Stop at end of file as well: without this a file holding no
            # query line at all would keep this loop spinning forever.
            if not line or opens_query(handle.peekline()):
                break

        # and read the qresult raw string
        handle.seek(offset)
        while True:
            # preserve whitespace, don't use read_forward
            line = handle.readline()
            pieces.append(line)
            # break when the next line opens another query, or at end of file
            if not line or opens_query(handle.peekline()):
                break

        # append mock end marker, since it's not always present in the file
        return _as_bytes('').join(pieces) + _as_bytes('>>><<<\n')
Пример #13
0
def crc32(seq):
    """Return the crc32 checksum for a sequence (string or Seq object)."""
    # NOTE - On Python 2 the result is a signed int, on Python 3 it is unsigned.
    # The docs suggest using crc32(x) & 0xffffffff for consistency.
    # TODO - Should we return crc32(x) & 0xffffffff here?
    try:
        # First treat the argument as a Seq object
        text = str(seq)
        encoded = _as_bytes(text)
        checksum = _crc32(encoded)
    except AttributeError:
        # Otherwise treat it as a plain string/unicode
        checksum = _crc32(_as_bytes(seq))
    return checksum
Пример #14
0
def crc32(seq):
    """Return the crc32 checksum for a sequence (string or Seq object)."""
    # NOTE - On Python 2 the result is a signed int, on Python 3 it is unsigned.
    # The docs suggest using crc32(x) & 0xffffffff for consistency.
    # TODO - Should we return crc32(x) & 0xffffffff here?
    try:
        # First treat the argument as a Seq object
        text = str(seq)
        encoded = _as_bytes(text)
        checksum = _crc32(encoded)
    except AttributeError:
        # Otherwise treat it as a plain string/unicode
        checksum = _crc32(_as_bytes(seq))
    return checksum
Пример #15
0
 def __iter__(self):
     """Yield (record id, start offset, length in bytes) for each FASTQ record.

     Raises ValueError when a record does not start with "@", when the
     file ends inside the sequence section, or when the sequence and
     quality lengths disagree.
     """
     handle = self._handle
     handle.seek(0)
     start_offset = handle.tell()
     line = handle.readline()
     if not line:
         # Empty file!
         return
     at_char = _as_bytes("@")
     plus_char = _as_bytes("+")
     if line[0:1] != at_char:
         raise ValueError("Problem with FASTQ @ line:\n%r" % line)
     while line:
         # line is the "@" title line; the record id is its first word
         # (kept in record_id so the builtin id() is not shadowed)
         record_id = line[1:].rstrip().split(None, 1)[0]
         # Find the seq line(s)
         seq_len = 0
         length = len(line)
         while line:
             line = handle.readline()
             length += len(line)
             if line.startswith(plus_char):
                 break
             seq_len += len(line.strip())
         if not line:
             raise ValueError("Premature end of file in seq section")
         # line is now the "+" separator line
         # Find the qual line(s)
         qual_len = 0
         while line:
             if seq_len == qual_len:
                 if seq_len == 0:
                     # Special case, quality line should be just "\n"
                     line = handle.readline()
                     if line.strip():
                         raise ValueError(
                             "Expected blank quality line, not %r" % line)
                     # The blank quality line is part of this record, so it
                     # must be counted; otherwise the yielded length stops
                     # short of end_offset for zero-length reads.
                     length += len(line)
                 # Should be end of record...
                 end_offset = handle.tell()
                 line = handle.readline()
                 if line and line[0:1] != at_char:
                     raise ValueError("Problem with line %r" % line)
                 break
             else:
                 line = handle.readline()
                 qual_len += len(line.strip())
                 length += len(line)
         if seq_len != qual_len:
             raise ValueError("Problem with quality section")
         yield _bytes_to_string(record_id), start_offset, length
         start_offset = end_offset
Пример #16
0
 def __iter__(self):
     """Yield (record id, start offset, length in bytes) for each FASTQ record.

     Raises ValueError when a record does not start with "@", when the
     file ends inside the sequence section, or when the sequence and
     quality lengths disagree.
     """
     handle = self._handle
     handle.seek(0)
     start_offset = handle.tell()
     line = handle.readline()
     if not line:
         # Empty file!
         return
     at_char = _as_bytes("@")
     plus_char = _as_bytes("+")
     if line[0:1] != at_char:
         raise ValueError("Problem with FASTQ @ line:\n%r" % line)
     while line:
         # line is the "@" title line; the record id is its first word
         # (kept in record_id so the builtin id() is not shadowed)
         record_id = line[1:].rstrip().split(None, 1)[0]
         # Find the seq line(s)
         seq_len = 0
         length = len(line)
         while line:
             line = handle.readline()
             length += len(line)
             if line.startswith(plus_char):
                 break
             seq_len += len(line.strip())
         if not line:
             raise ValueError("Premature end of file in seq section")
         # line is now the "+" separator line
         # Find the qual line(s)
         qual_len = 0
         while line:
             if seq_len == qual_len:
                 if seq_len == 0:
                     # Special case, quality line should be just "\n"
                     line = handle.readline()
                     if line.strip():
                         raise ValueError("Expected blank quality line, not %r" % line)
                     # The blank quality line is part of this record, so it
                     # must be counted; otherwise the yielded length stops
                     # short of end_offset for zero-length reads.
                     length += len(line)
                 # Should be end of record...
                 end_offset = handle.tell()
                 line = handle.readline()
                 if line and line[0:1] != at_char:
                     raise ValueError("Problem with line %r" % line)
                 break
             else:
                 line = handle.readline()
                 qual_len += len(line.strip())
                 length += len(line)
         if seq_len != qual_len:
             raise ValueError("Problem with quality section")
         yield _bytes_to_string(record_id), start_offset, length
         start_offset = end_offset
Пример #17
0
 def get_raw(self, offset):
     """Return the raw bytes of the record that starts at ``offset``.

     The record is any leading run of lines starting with ";" followed
     by every line up to (not including) the next ";" line or end of file.
     """
     handle = self._handle
     handle.seek(offset)
     comment_char = _as_bytes(";")
     lines = []
     line = handle.readline()
     # Leading ";" lines open the record
     while line.startswith(comment_char):
         lines.append(line)
         line = handle.readline()
     # The rest of the record runs until the next ";" line or end of file
     while line and not line.startswith(comment_char):
         lines.append(line)
         line = handle.readline()
     return _as_bytes("").join(lines)
Пример #18
0
 def get_raw(self, offset):
     """Return the raw bytes of the record that starts at ``offset``.

     The record is any leading run of lines starting with ";" followed
     by every line up to (not including) the next ";" line or end of file.
     """
     handle = self._handle
     handle.seek(offset)
     comment_char = _as_bytes(";")
     lines = []
     line = handle.readline()
     # Leading ";" lines open the record
     while line.startswith(comment_char):
         lines.append(line)
         line = handle.readline()
     # The rest of the record runs until the next ";" line or end of file
     while line and not line.startswith(comment_char):
         lines.append(line)
         line = handle.readline()
     return _as_bytes("").join(lines)
Пример #19
0
    def get_raw(self, offset):
        """Return the file preamble followed by the raw query result at ``offset``.

        The preamble is ``self._preamble`` when that is set; otherwise it is
        re-read from the top of the file, up to the first line starting
        with ``self.qresult_start``.
        """
        handle = self._handle
        pieces = []

        # read header first
        if self._preamble:
            pieces.append(self._preamble)
        else:
            handle.seek(0)
            while True:
                line = handle.readline()
                # Stop at end of file as well: without this a file with no
                # qresult_start line would keep this loop spinning forever.
                if not line or line.startswith(self.qresult_start):
                    break
                pieces.append(line)

        # and read the qresult raw string
        handle.seek(offset)
        while True:
            # preserve whitespace, don't use read_forward
            line = handle.readline()
            pieces.append(line)

            # break when we've reached qresult end (or end of file)
            if not line or line.startswith(self.qresult_end):
                break

        return _as_bytes('').join(pieces)
Пример #20
0
    def get_raw(self, offset):
        """Return the file preamble followed by the raw query result at ``offset``.

        The preamble is ``self._preamble`` when that is set; otherwise it is
        re-read from the top of the file, up to the first line starting
        with ``self.qresult_start``.
        """
        handle = self._handle
        pieces = []

        # read header first
        if self._preamble:
            pieces.append(self._preamble)
        else:
            handle.seek(0)
            while True:
                line = handle.readline()
                # Stop at end of file as well: without this a file with no
                # qresult_start line would keep this loop spinning forever.
                if not line or line.startswith(self.qresult_start):
                    break
                pieces.append(line)

        # and read the qresult raw string
        handle.seek(offset)
        while True:
            # preserve whitespace, don't use read_forward
            line = handle.readline()
            pieces.append(line)

            # break when we've reached qresult end (or end of file)
            if not line or line.startswith(self.qresult_end):
                break

        return _as_bytes("").join(pieces)
Пример #21
0
def _open(url, post=None):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to TogoWS, will raise an IOError if it encounters an error.

    In the absence of clear guidelines, this function enforces a limit of
    "up to three queries per second" to avoid abusing the TogoWS servers.
    """
    min_interval = 0.333333333  # one third of a second
    now = time.time()
    # time still to wait before the next request is allowed; the moment of
    # the previous request is remembered on the function object itself
    pause = _open.previous + min_interval - now
    if pause > 0:
        time.sleep(pause)
    else:
        pause = 0
    _open.previous = now + pause

    # any truthy post payload is sent as the request data
    if post:
        response = _urlopen(url, _as_bytes(post))
    else:
        response = _urlopen(url)

    # TogoWS is trusted to set an HTTP error code on failure, so the start
    # of the returned data is no longer inspected here.
    return _binary_to_string_handle(response)
Пример #22
0
def seguid(seq):
    """Return the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the SEGUID string (A SEquence Globally Unique IDentifier).
    seq type = str.
    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032
    """
    import hashlib
    import base64
    hasher = hashlib.sha1()
    try:
        # Treat the argument as a Seq object first
        seq = str(seq)
    except AttributeError:
        # Otherwise use it unchanged, as a plain string
        pass
    # the hash is taken over the upper-cased sequence
    hasher.update(_as_bytes(seq.upper()))
    digest = hasher.digest()
    try:
        # For Python 3+
        encoded = base64.encodebytes(digest)
    except AttributeError:
        # For all other Pythons
        return base64.b64encode(digest).rstrip("=")
    # drop the line break and the base64 padding
    return encoded.decode().replace("\n", "").rstrip("=")
Пример #23
0
def _open(url, post=None):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to TogoWS, will raise an IOError if it encounters an error.

    In the absence of clear guidelines, this function enforces a limit of
    "up to three queries per second" to avoid abusing the TogoWS servers.
    """
    min_interval = 0.333333333  # one third of a second
    now = time.time()
    # time still to wait before the next request is allowed; the moment of
    # the previous request is remembered on the function object itself
    pause = _open.previous + min_interval - now
    if pause > 0:
        time.sleep(pause)
    else:
        pause = 0
    _open.previous = now + pause

    # any truthy post payload is sent as the request data
    if post:
        response = _urlopen(url, _as_bytes(post))
    else:
        response = _urlopen(url)

    # TogoWS is trusted to set an HTTP error code on failure, so the start
    # of the returned data is no longer inspected here.
    return _binary_to_string_handle(response)
Пример #24
0
def seguid(seq):
    """Return the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the SEGUID string (A SEquence Globally Unique IDentifier).
    seq type = str.
    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032
    """
    import hashlib
    import base64
    hasher = hashlib.sha1()
    try:
        # Treat the argument as a Seq object first
        seq = str(seq)
    except AttributeError:
        # Otherwise use it unchanged, as a plain string
        pass
    # the hash is taken over the upper-cased sequence
    hasher.update(_as_bytes(seq.upper()))
    digest = hasher.digest()
    try:
        # For Python 3+
        encoded = base64.encodebytes(digest)
    except AttributeError:
        # For all other Pythons
        return base64.b64encode(digest).rstrip("=")
    # drop the line break and the base64 padding
    return encoded.decode().replace("\n", "").rstrip("=")
Пример #25
0
 def __iter__(self):
     """Yield (accession, start offset, length in bytes) for each entry.

     An entry starts on a line matching ``self._marker_re`` and ends just
     after its ``</entry>`` tag; the key is the text of the first
     ``<accession>`` element found inside it.

     Raises ValueError if an entry has no ``</entry>`` before the next
     entry or end of file, or if no accession was found.
     """
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     start_acc_marker = _as_bytes("<accession>")
     end_acc_marker = _as_bytes("</accession>")
     end_entry_marker = _as_bytes("</entry>")
     less_than = _as_bytes("<")
     # Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         length = len(line)
         # We expect the next line to be <accession>xxx</accession>
         # (possibly with leading spaces)
         # but allow it to be later on within the <entry>
         key = None
         while True:
             line = handle.readline()
             if key is None and start_acc_marker in line:
                 assert end_acc_marker in line, line
                 # 11 == len("<accession>"); keep the text up to the next "<"
                 key = line[line.find(start_acc_marker) + 11:].split(
                     less_than, 1)[0]
                 length += len(line)
             elif end_entry_marker in line:
                 # The record ends right after the closing tag. That part
                 # of the line must be counted too, otherwise the yielded
                 # length stops short of end_offset.
                 entry_end = line.find(end_entry_marker) + len(end_entry_marker)
                 length += entry_end
                 end_offset = handle.tell() - len(line) + entry_end
                 break
             elif marker_re.match(line) or not line:
                 # Start of next record or end of file
                 raise ValueError("Didn't find end of record")
             else:
                 length += len(line)
         if not key:
             raise ValueError(
                 "Did not find <accession> line in bytes %i to %i" %
                 (start_offset, end_offset))
         yield _bytes_to_string(key), start_offset, length
         # Find start of next record
         while not marker_re.match(line) and line:
             start_offset = handle.tell()
             line = handle.readline()
     assert not line, repr(line)
Пример #26
0
 def __iter__(self):
     """Yield (key, start offset, length in bytes) for each record.

     A record starts on a line matching ``self._marker_re``. The key comes
     from that ID line and is replaced by the second word of a later
     ``SV`` line when the record has one.

     Raises ValueError if the ID line layout is not recognised.
     """
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     semi_char = _as_bytes(";")
     dot_char = _as_bytes(".")
     sv_marker = _as_bytes("SV ")
     # Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         # We cannot assume the record.id is the first word after ID,
         # normally the SV line is used.
         length = len(line)
         semi_count = line[2:].count(semi_char)
         if semi_count == 6:
             # Looks like the semi colon separated style introduced in 2006
             parts = line[3:].rstrip().split(semi_char)
             if parts[1].strip().startswith(sv_marker):
                 # The SV bit gives the version
                 key = parts[0].strip() + dot_char + \
                     parts[1].strip().split()[1]
             else:
                 key = parts[0].strip()
         elif semi_count == 3:
             # Looks like the pre 2006 style, take first word only
             key = line[3:].strip().split(None, 1)[0]
         else:
             # Format with %r instead of concatenating: line is read from
             # the handle and str + bytes raises TypeError on Python 3,
             # which would hide this ValueError.
             raise ValueError(
                 "Did not recognise the ID line layout:\n%r" % line)
         while True:
             end_offset = handle.tell()
             line = handle.readline()
             if marker_re.match(line) or not line:
                 end_offset = handle.tell() - len(line)
                 yield _bytes_to_string(key), start_offset, length
                 start_offset = end_offset
                 break
             elif line.startswith(sv_marker):
                 key = line.rstrip().split()[1]
             length += len(line)
     assert not line, repr(line)
Пример #27
0
 def get_raw(self, offset):
     """Return the FASTQ record starting at ``offset`` as a raw string.

     Raises ValueError when the record does not start with "@", when the
     file ends inside the sequence section, or when the sequence and
     quality lengths disagree.
     """
     # TODO - Refactor this and the __init__ method to reduce code duplication?
     handle = self._handle
     handle.seek(offset)
     at_char = _as_bytes("@")
     plus_char = _as_bytes("+")
     line = handle.readline()
     if line[0:1] != at_char:
         raise ValueError("Problem with FASTQ @ line:\n%r" % line)
     chunks = [line]
     # Sequence section: everything up to and including the "+" line
     seq_len = 0
     while line:
         line = handle.readline()
         chunks.append(line)
         if line.startswith(plus_char):
             break
         seq_len += len(line.strip())
     if not line:
         raise ValueError("Premature end of file in seq section")
     assert line[0:1] == plus_char
     # Quality section: read until as many characters as the sequence had
     qual_len = 0
     while line:
         if seq_len != qual_len:
             line = handle.readline()
             chunks.append(line)
             qual_len += len(line.strip())
             continue
         if seq_len == 0:
             # Special case, the quality line should be just "\n"
             line = handle.readline()
             if line.strip():
                 raise ValueError(
                     "Expected blank quality line, not %r" % line)
             chunks.append(line)
         # This should be the end of the record: peek at the next line,
         # which is checked but not made part of the result
         line = handle.readline()
         if line and line[0:1] != at_char:
             raise ValueError("Problem with line %r" % line)
         break
     if seq_len != qual_len:
         raise ValueError("Problem with quality section")
     return _as_bytes("").join(chunks)
Пример #28
0
 def get_raw(self, offset):
     """Return the record starting at ``offset`` as a raw string.

     The result runs from ``offset`` through the first ``</entry>`` tag.
     Raises ValueError if the next record or end of file comes first.
     """
     handle = self._handle
     starts_record = self._marker_re.match
     closing_tag = _as_bytes("</entry>")
     handle.seek(offset)
     # the first line is taken as is, without looking for the closing tag
     pieces = [handle.readline()]
     while True:
         line = handle.readline()
         head, tag, _tail = line.partition(closing_tag)
         if tag:
             # keep the line only up to and including the closing tag
             pieces.append(head + tag)
             break
         if starts_record(line) or not line:
             # End of file, or start of next record
             raise ValueError("Didn't find end of record")
         pieces.append(line)
     return _as_bytes("").join(pieces)
Пример #29
0
 def get_raw(self, offset):
     """Return the record starting at ``offset`` as a raw string.

     The result runs from ``offset`` through the first ``</entry>`` tag.
     Raises ValueError if the next record or end of file comes first.
     """
     handle = self._handle
     starts_record = self._marker_re.match
     closing_tag = _as_bytes("</entry>")
     handle.seek(offset)
     # the first line is taken as is, without looking for the closing tag
     pieces = [handle.readline()]
     while True:
         line = handle.readline()
         head, tag, _tail = line.partition(closing_tag)
         if tag:
             # keep the line only up to and including the closing tag
             pieces.append(head + tag)
             break
         if starts_record(line) or not line:
             # End of file, or start of next record
             raise ValueError("Didn't find end of record")
         pieces.append(line)
     return _as_bytes("").join(pieces)
Пример #30
0
 def get_raw(self, offset):
     """Return the FASTQ record starting at ``offset`` as a raw string.

     Raises ValueError when the record does not start with "@", when the
     file ends inside the sequence section, or when the sequence and
     quality lengths disagree.
     """
     # TODO - Refactor this and the __init__ method to reduce code duplication?
     handle = self._handle
     handle.seek(offset)
     at_char = _as_bytes("@")
     plus_char = _as_bytes("+")
     line = handle.readline()
     if line[0:1] != at_char:
         raise ValueError("Problem with FASTQ @ line:\n%r" % line)
     chunks = [line]
     # Sequence section: everything up to and including the "+" line
     seq_len = 0
     while line:
         line = handle.readline()
         chunks.append(line)
         if line.startswith(plus_char):
             break
         seq_len += len(line.strip())
     if not line:
         raise ValueError("Premature end of file in seq section")
     assert line[0:1] == plus_char
     # Quality section: read until as many characters as the sequence had
     qual_len = 0
     while line:
         if seq_len != qual_len:
             line = handle.readline()
             chunks.append(line)
             qual_len += len(line.strip())
             continue
         if seq_len == 0:
             # Special case, the quality line should be just "\n"
             line = handle.readline()
             if line.strip():
                 raise ValueError("Expected blank quality line, not %r" % line)
             chunks.append(line)
         # This should be the end of the record: peek at the next line,
         # which is checked but not made part of the result
         line = handle.readline()
         if line and line[0:1] != at_char:
             raise ValueError("Problem with line %r" % line)
         break
     if seq_len != qual_len:
         raise ValueError("Problem with quality section")
     return _as_bytes("").join(chunks)
Пример #31
0
 def __iter__(self):
     """Yield (key, start offset, length in bytes) for each record.

     A record starts on a line matching ``self._marker_re``. The key comes
     from that ID line and is replaced by the second word of a later
     ``SV`` line when the record has one.

     Raises ValueError if the ID line layout is not recognised.
     """
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     semi_char = _as_bytes(";")
     dot_char = _as_bytes(".")
     sv_marker = _as_bytes("SV ")
     # Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         # We cannot assume the record.id is the first word after ID,
         # normally the SV line is used.
         length = len(line)
         semi_count = line[2:].count(semi_char)
         if semi_count == 6:
             # Looks like the semi colon separated style introduced in 2006
             parts = line[3:].rstrip().split(semi_char)
             if parts[1].strip().startswith(sv_marker):
                 # The SV bit gives the version
                 key = parts[0].strip() + dot_char + parts[1].strip().split()[1]
             else:
                 key = parts[0].strip()
         elif semi_count == 3:
             # Looks like the pre 2006 style, take first word only
             key = line[3:].strip().split(None, 1)[0]
         else:
             # Format with %r instead of concatenating: line is read from
             # the handle and str + bytes raises TypeError on Python 3,
             # which would hide this ValueError.
             raise ValueError("Did not recognise the ID line layout:\n%r" % line)
         while True:
             end_offset = handle.tell()
             line = handle.readline()
             if marker_re.match(line) or not line:
                 end_offset = handle.tell() - len(line)
                 yield _bytes_to_string(key), start_offset, length
                 start_offset = end_offset
                 break
             elif line.startswith(sv_marker):
                 key = line.rstrip().split()[1]
             length += len(line)
     assert not line, repr(line)
Пример #32
0
 def __iter__(self):
     """Iterate over the file, yielding (key, start offset, length).

     Each record starts at a line matching the record marker and ends at
     the ``</entry>`` tag. The key is the text of the first
     ``<accession>`` element found inside the record. The length covers
     the record from its first line through the closing ``</entry>`` tag.

     Raises ValueError if a record has no ``</entry>`` before the next
     record or the end of the file, or if no accession was found.
     """
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     start_acc_marker = _as_bytes("<accession>")
     end_acc_marker = _as_bytes("</accession>")
     end_entry_marker = _as_bytes("</entry>")
     less_than = _as_bytes("<")
     # Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         length = len(line)
         # We expect the next line to be <accession>xxx</accession>
         # (possibly with leading spaces)
         # but allow it to be later on within the <entry>
         key = None
         while True:
             line = handle.readline()
             if key is None and start_acc_marker in line:
                 assert end_acc_marker in line, line
                 key = line[line.find(start_acc_marker) + 11 :].split(less_than, 1)[0]
                 length += len(line)
             elif end_entry_marker in line:
                 # Count the final line up to and including "</entry>"
                 # (8 bytes) so the length agrees with end_offset.
                 entry_end = line.find(end_entry_marker) + 8
                 length += entry_end
                 end_offset = handle.tell() - len(line) + entry_end
                 break
             elif marker_re.match(line) or not line:
                 # Start of next record or end of file
                 raise ValueError("Didn't find end of record")
             else:
                 length += len(line)
         if not key:
             raise ValueError("Did not find <accession> line in bytes %i to %i" % (start_offset, end_offset))
         yield _bytes_to_string(key), start_offset, length
         # Find start of next record
         while not marker_re.match(line) and line:
             start_offset = handle.tell()
             line = handle.readline()
     assert not line, repr(line)
Пример #33
0
    def __iter__(self):
        """Iterate over the file handle; yield key, start offset, and length.

        Leading lines starting with ``#`` are skipped. Consecutive lines
        whose query ID column (``self._query_id_idx``, counted over the
        space-separated non-empty fields) is equal are grouped into one
        result.
        """
        handle = self._handle
        handle.seek(0)
        query_id_idx = self._query_id_idx
        header_mark = _as_bytes('#')
        split_mark = _as_bytes(' ')
        qresult_key = None

        # read through header; stop at the first non-header line
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if not line.startswith(header_mark):
                break

        # and index the qresults
        while line:
            # offset just past the line currently held in `line`
            end_offset = handle.tell()
            fields = [x for x in line.strip().split(split_mark) if x]
            curr_key = fields[query_id_idx]

            if qresult_key is None:
                qresult_key = curr_key
            elif curr_key != qresult_key:
                # the current line opens a new qresult; the previous one
                # ends where this line begins
                adj_end = end_offset - len(line)
                yield _bytes_to_string(qresult_key), start_offset, \
                        adj_end - start_offset
                qresult_key = curr_key
                start_offset = adj_end

            line = handle.readline()
            if not line:
                # end of file: emit the qresult still being collected
                yield _bytes_to_string(qresult_key), start_offset, \
                        end_offset - start_offset
Пример #34
0
class Hmmer2TextIndexer(_BaseHmmerTextIndexer):

    """Indexer for hmmer2-text format."""

    _parser = Hmmer2TextParser
    qresult_start = _as_bytes('Query')
    # qresults_ends for hmmpfam and hmmsearch
    # need to anticipate both since hmmsearch have different query end mark
    qresult_end = _as_bytes('//')

    def __iter__(self):
        """Iterate over the file; yield key, start offset, and a length of 0.

        A result is emitted at every line starting with ``qresult_end``.
        If the first line read starts with ``hmmsearch``, one extra result
        is emitted at the end of the file.
        """
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()
        regex_id = re.compile(_as_bytes(r'Query\s*(?:sequence|HMM)?:\s*(.*)'))

        line = read_forward(handle)
        # determine flag for hmmsearch
        is_hmmsearch = line.startswith(_as_bytes('hmmsearch'))

        while line:
            end_offset = handle.tell()

            if line.startswith(self.qresult_start):
                qresult_key = regex_id.search(line).group(1).strip()
                # qresult start offset is the offset of this line
                # (starts with the start mark)
                start_offset = end_offset - len(line)
            elif line.startswith(self.qresult_end):
                yield _bytes_to_string(qresult_key), start_offset, 0
                start_offset = end_offset

            line = read_forward(handle)

        # HACK: since hmmsearch can only have one query result
        if is_hmmsearch:
            yield _bytes_to_string(qresult_key), start_offset, 0
Пример #35
0
    def __iter__(self):
        """Iterate over the file handle; yield key, start offset, and length.

        Leading lines starting with ``#`` are skipped. Consecutive lines
        whose query ID column (``self._query_id_idx``, counted over the
        space-separated non-empty fields) is equal are grouped into one
        result.
        """
        handle = self._handle
        handle.seek(0)
        query_id_idx = self._query_id_idx
        header_mark = _as_bytes('#')
        split_mark = _as_bytes(' ')
        qresult_key = None

        # read through header; stop at the first non-header line
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if not line.startswith(header_mark):
                break

        # and index the qresults
        while line:
            # offset just past the line currently held in `line`
            end_offset = handle.tell()
            fields = [x for x in line.strip().split(split_mark) if x]
            curr_key = fields[query_id_idx]

            if qresult_key is None:
                qresult_key = curr_key
            elif curr_key != qresult_key:
                # the current line opens a new qresult; the previous one
                # ends where this line begins
                adj_end = end_offset - len(line)
                yield _bytes_to_string(qresult_key), start_offset, \
                        adj_end - start_offset
                qresult_key = curr_key
                start_offset = adj_end

            line = handle.readline()
            if not line:
                # end of file: emit the qresult still being collected
                yield _bytes_to_string(qresult_key), start_offset, \
                        end_offset - start_offset
Пример #36
0
 def __iter__(self):
     """Iterate over the file, yielding (key, start offset, length).

     The key comes from the ACCESSION line, or from the VERSION line when
     its first entry has the form ``name.digits``. Raises ValueError if a
     record has neither.
     """
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     dot_char = _as_bytes(".")
     accession_marker = _as_bytes("ACCESSION ")
     version_marker = _as_bytes("VERSION ")
     # Skip any header before first record
     start_offset = handle.tell()
     line = handle.readline()
     while line and not marker_re.match(line):
         start_offset = handle.tell()
         line = handle.readline()
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         # We cannot assume the record.id is the first word after LOCUS,
         # normally the first entry on the VERSION or ACCESSION line is used.
         key = None
         length = len(line)
         while True:
             end_offset = handle.tell()
             line = handle.readline()
             if not line or marker_re.match(line):
                 # End of file or start of the next record
                 break
             if line.startswith(accession_marker):
                 key = line.rstrip().split()[1]
             elif line.startswith(version_marker):
                 version_id = line.rstrip().split()[1]
                 pieces = version_id.split(dot_char)
                 if len(pieces) == 2 and pieces[1].isdigit():
                     # This should mimic the GenBank parser...
                     key = version_id
             length += len(line)
         if not key:
             raise ValueError(
                 "Did not find ACCESSION/VERSION lines")
         yield _bytes_to_string(key), start_offset, length
         start_offset = end_offset
     assert not line, repr(line)
Пример #37
0
    def get_raw(self, offset):
        """Return the raw string of a QueryResult object from the given offset.

        Lines are collected from ``offset`` until the query ID column
        changes or the end of the file is reached.
        """
        handle = self._handle
        handle.seek(offset)
        query_id_idx = self._query_id_idx
        split_mark = _as_bytes(' ')
        qresult_key = None
        collected = []

        line = handle.readline()
        while line:
            fields = [x for x in line.strip().split(split_mark) if x]
            line_key = fields[query_id_idx]
            if qresult_key is None:
                # first line fixes the key of this qresult
                qresult_key = line_key
            elif line_key != qresult_key:
                break
            collected.append(line)
            line = handle.readline()

        return _as_bytes('').join(collected)
Пример #38
0
    def get_raw(self, offset):
        """Return the raw string of a QueryResult object from the given offset.

        Lines are collected from ``offset`` until the query ID column
        changes or the end of the file is reached.
        """
        handle = self._handle
        handle.seek(offset)
        query_id_idx = self._query_id_idx
        split_mark = _as_bytes(' ')
        qresult_key = None
        collected = []

        line = handle.readline()
        while line:
            fields = [x for x in line.strip().split(split_mark) if x]
            line_key = fields[query_id_idx]
            if qresult_key is None:
                # first line fixes the key of this qresult
                qresult_key = line_key
            elif line_key != qresult_key:
                break
            collected.append(line)
            line = handle.readline()

        return _as_bytes('').join(collected)
Пример #39
0
 def get_raw(self, offset):
     """Return the record starting at ``offset`` as a raw string.

     The first line is always taken; following lines are appended until
     one matches the record marker or the file ends.
     """
     # For non-trivial file formats this must be over-ridden in the subclass
     marker_re = self._marker_re
     handle = self._handle
     handle.seek(offset)
     chunks = [handle.readline()]
     line = handle.readline()
     # Stop at end of file, or start of next record => end of this record
     while line and not marker_re.match(line):
         chunks.append(line)
         line = handle.readline()
     return _as_bytes("").join(chunks)
Пример #40
0
 def get_raw(self, offset):
     """Return the record starting at ``offset`` as a raw string.

     The first line is always taken; following lines are appended until
     one matches the record marker or the file ends.
     """
     # For non-trivial file formats this must be over-ridden in the subclass
     marker_re = self._marker_re
     handle = self._handle
     handle.seek(offset)
     chunks = [handle.readline()]
     line = handle.readline()
     # Stop at end of file, or start of next record => end of this record
     while line and not marker_re.match(line):
         chunks.append(line)
         line = handle.readline()
     return _as_bytes("").join(chunks)
Пример #41
0
 def __iter__(self):
     """Iterate over the file, yielding (key, start offset, length).

     The key comes from the ACCESSION line, or from the VERSION line when
     its first entry has the form ``name.digits``. Raises ValueError if a
     record has neither.
     """
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     dot_char = _as_bytes(".")
     accession_marker = _as_bytes("ACCESSION ")
     version_marker = _as_bytes("VERSION ")
     # Skip any header before first record
     start_offset = handle.tell()
     line = handle.readline()
     while line and not marker_re.match(line):
         start_offset = handle.tell()
         line = handle.readline()
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         # We cannot assume the record.id is the first word after LOCUS,
         # normally the first entry on the VERSION or ACCESSION line is used.
         key = None
         length = len(line)
         while True:
             end_offset = handle.tell()
             line = handle.readline()
             if not line or marker_re.match(line):
                 # End of file or start of the next record
                 break
             if line.startswith(accession_marker):
                 key = line.rstrip().split()[1]
             elif line.startswith(version_marker):
                 version_id = line.rstrip().split()[1]
                 pieces = version_id.split(dot_char)
                 if len(pieces) == 2 and pieces[1].isdigit():
                     # This should mimic the GenBank parser...
                     key = version_id
             length += len(line)
         if not key:
             raise ValueError("Did not find ACCESSION/VERSION lines")
         yield _bytes_to_string(key), start_offset, length
         start_offset = end_offset
     assert not line, repr(line)
Пример #42
0
 def write(self, data):
     """Append ``data`` to the buffer, writing out every full 65536-byte block.

     Anything shorter than a full block stays in ``self._buffer``.
     """
     # TODO - Check bytes vs unicode
     block_size = 65536  # 2**16
     self._buffer += _as_bytes(data)
     # Nothing is written while the buffer holds less than one block
     while len(self._buffer) >= block_size:
         self._write_block(self._buffer[:block_size])
         self._buffer = self._buffer[block_size:]
Пример #43
0
    def get_qresult_id(self, pos):
        """Return the query ID from the nearest "Query:" line.

        Reads forward from ``pos``. Raises StopIteration if a line that is
        empty after stripping (including end of file) is met first.
        """
        sentinel = _as_bytes('Query:')
        handle = self._handle
        handle.seek(pos)

        line = handle.readline().strip()
        while not line.startswith(sentinel):
            if not line:
                raise StopIteration
            line = handle.readline().strip()

        # only the ID is needed; the description is discarded
        qid, _ = _parse_hit_or_query_line(_bytes_to_string(line))
        return qid
Пример #44
0
 def write(self, data):
     """Append ``data`` to the buffer, writing out every full 65536-byte block.

     Anything shorter than a full block stays in ``self._buffer``.
     """
     # TODO - Check bytes vs unicode
     block_size = 65536  # 2**16
     self._buffer += _as_bytes(data)
     # Nothing is written while the buffer holds less than one block
     while len(self._buffer) >= block_size:
         self._write_block(self._buffer[:block_size])
         self._buffer = self._buffer[block_size:]
Пример #45
0
    def get_qresult_id(self, pos):
        """Return the query ID from the nearest "Query:" line.

        Reads forward from ``pos``. Raises StopIteration if a line that is
        empty after stripping (including end of file) is met first.
        """
        sentinel = _as_bytes('Query:')
        handle = self._handle
        handle.seek(pos)

        line = handle.readline().strip()
        while not line.startswith(sentinel):
            if not line:
                raise StopIteration
            line = handle.readline().strip()

        # only the ID is needed; the description is discarded
        qid, _ = _parse_hit_or_query_line(_bytes_to_string(line))
        return qid
Пример #46
0
    def _get_raw_qresult_commented(self, offset):
        """Return the raw string of a single QueryResult from a commented file.

        The first line read at ``offset`` serves as the query mark. Reading
        stops at the next line equal to it, at a line starting with
        '# BLAST processed', or at the end of the file.
        """
        end_mark = _as_bytes('# BLAST processed')
        handle = self._handle
        handle.seek(offset)

        # query mark is the line marking a new query
        # something like '# TBLASTN 2.2.25+'
        # it depends on the BLAST search, so take it from the first line
        query_mark = handle.readline()
        collected = [query_mark]
        # only keep reading if there was a first line at all
        line = handle.readline() if query_mark else query_mark
        while line and line != query_mark and not line.startswith(end_mark):
            collected.append(line)
            line = handle.readline()

        return _as_bytes('').join(collected)
Пример #47
0
    def _get_raw_qresult_commented(self, offset):
        """Return the raw string of a single QueryResult from a commented file.

        The first line read at ``offset`` serves as the query mark. Reading
        stops at the next line equal to it, at a line starting with
        '# BLAST processed', or at the end of the file.
        """
        end_mark = _as_bytes('# BLAST processed')
        handle = self._handle
        handle.seek(offset)

        # query mark is the line marking a new query
        # something like '# TBLASTN 2.2.25+'
        # it depends on the BLAST search, so take it from the first line
        query_mark = handle.readline()
        collected = [query_mark]
        # only keep reading if there was a first line at all
        line = handle.readline() if query_mark else query_mark
        while line and line != query_mark and not line.startswith(end_mark):
            collected.append(line)
            line = handle.readline()

        return _as_bytes('').join(collected)
Пример #48
0
class ExonerateCigarIndexer(ExonerateVulgarIndexer):
    """Indexer class for exonerate cigar lines."""

    _parser = ExonerateCigarParser
    _query_mark = _as_bytes('cigar')

    def get_qresult_id(self, pos):
        """Return the query ID of the cigar line located at ``pos``."""
        self._handle.seek(pos)
        # the line at this position must be a cigar line; the query ID is
        # the first group captured by _RE_CIGAR
        line = self._handle.readline()
        assert line.startswith(self._query_mark), line
        match = re.search(_RE_CIGAR, _bytes_to_string(line))
        return match.group(1)
Пример #49
0
    def __iter__(self):
        """Iterate over the XML file; yield key, start offset, and length.

        Each result is one block from a line containing ``qstart_mark``
        through the line containing ``qend_mark``. The key is the text of
        ``<Iteration_query-ID>``; when that starts with ``Query_``, the
        first word of ``<Iteration_query-def>`` is used instead. If the
        ID/def pair is not found, the values in ``self._fallback`` are used.
        """
        qstart_mark = self.qstart_mark
        qend_mark = self.qend_mark
        blast_id_mark = _as_bytes('Query_')
        handle = self._handle
        handle.seek(0)
        # every fragment is a raw string so that \s is not treated as an
        # (invalid) string escape
        re_desc = re.compile(
            _as_bytes(r'<Iteration_query-ID>(.*?)'
                      r'</Iteration_query-ID>\s+?<Iteration_query-def>'
                      r'(.*?)</Iteration_query-def>'))
        re_desc_end = re.compile(_as_bytes(r'</Iteration_query-def>'))

        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                break
            if qstart_mark not in line:
                continue
            # The following requirements are to make supporting BGZF compressed
            # BLAST XML files simpler (avoids complex offset manipulations):
            assert line.count(qstart_mark) == 1, "XML without line breaks?"
            assert line.lstrip().startswith(qstart_mark), line
            if qend_mark in line:
                # Should cope with <Iteration>...</Iteration> on one long line
                block = line
            else:
                # Load the rest of this block up to and including </Iteration>
                block = [line]
                while line and qend_mark not in line:
                    line = handle.readline()
                    assert qstart_mark not in line, line
                    block.append(line)
                assert line.rstrip().endswith(qend_mark), line
                block = _empty_bytes_string.join(block)
            assert block.count(
                qstart_mark) == 1, "XML without line breaks? %r" % block
            assert block.count(
                qend_mark) == 1, "XML without line breaks? %r" % block
            # Now we have a full <Iteration>...</Iteration> block, find the ID
            regx = re_desc.search(block)
            if regx is not None:
                qstart_desc = regx.group(2)
                qstart_id = regx.group(1)
            else:
                # use the fallback values
                assert re_desc_end.search(block)
                qstart_desc = _as_bytes(self._fallback['description'])
                qstart_id = _as_bytes(self._fallback['id'])
            if qstart_id.startswith(blast_id_mark):
                qstart_id = qstart_desc.split(_as_bytes(' '), 1)[0]
            yield _bytes_to_string(qstart_id), start_offset, len(block)
Пример #50
0
 def read(self, handle):
     """Set up the parser and let it parse the XML results.

     Returns ``self.object`` after ``self.parser.ParseFile(handle)`` has
     run. Raises IOError for a closed handle, CorruptedXMLError or
     NotXMLError when expat reports an error, and RuntimeError or
     NotXMLError when no result object was produced.
     """
     # HACK: remove Bio._py3k handle conversion, since the Entrez XML parser
     # expects binary data
     if handle.__class__.__name__ == 'EvilHandleHack':
         handle = handle._handle
     if handle.__class__.__name__ == 'TextIOWrapper':
         handle = handle.buffer
     if hasattr(handle, "closed") and handle.closed:
         # Should avoid a possible Segmentation Fault, see:
         # http://bugs.python.org/issue4877
         raise IOError("Can't parse a closed handle")
     if sys.version_info[0] >= 3:
         # Another nasty hack to cope with a unicode StringIO handle
         # since the Entrez XML parser expects binary data (bytes)
         from io import BytesIO, StringIO
         if isinstance(handle, StringIO):
             from SAP.Bio._py3k import _as_bytes
             handle = BytesIO(_as_bytes(handle.read()))
     try:
         self.parser.ParseFile(handle)
     except expat.ExpatError as e:
         if not self.parser.StartElementHandler:
             # We have not seen the initial <!xml declaration, so probably
             # the input data is not in XML format.
             raise NotXMLError(e)
         # We saw the initial <!xml declaration, so we can be sure that
         # we are parsing XML data. Most likely, the XML file is
         # corrupted.
         raise CorruptedXMLError(e)
     try:
         return self.object
     except AttributeError:
         if not self.parser.StartElementHandler:
             # We did not see the initial <!xml declaration, so probably
             # the input data is not in XML format.
             raise NotXMLError("XML declaration not found")
         # We saw the initial <!xml declaration, and expat didn't notice
         # any errors, so self.object should be defined. If not, this is
         # a bug.
         raise RuntimeError(
             "Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at [email protected] for assistance."
         )
Пример #51
0
class ExonerateTextIndexer(_BaseExonerateIndexer):
    """Indexer class for Exonerate plain text."""

    _parser = ExonerateTextParser
    _query_mark = _as_bytes('C4 Alignment')

    def get_qresult_id(self, pos):
        """Return the query ID from the nearest "Query:" line.

        Reads forward from ``pos``. Raises StopIteration if a line that is
        empty after stripping (including end of file) is met first.
        """
        sentinel = _as_bytes('Query:')
        handle = self._handle
        handle.seek(pos)

        line = handle.readline().strip()
        while not line.startswith(sentinel):
            if not line:
                raise StopIteration
            line = handle.readline().strip()

        # only the ID is needed; the description is discarded
        qid, _ = _parse_hit_or_query_line(_bytes_to_string(line))
        return qid

    def get_raw(self, offset):
        """Return the raw string of a QueryResult object from the given offset.

        Lines are collected until an alignment header belonging to a
        different query ID is found, or the file ends.
        """
        handle = self._handle
        handle.seek(offset)
        qresult_key = None
        collected = []

        line = handle.readline()
        while line:
            if line.startswith(self._query_mark):
                # get_qresult_id moves the handle, so remember where to
                # resume reading afterwards
                resume_at = handle.tell()
                found_key = self.get_qresult_id(resume_at)
                if qresult_key is None:
                    qresult_key = found_key
                elif found_key != qresult_key:
                    break
                handle.seek(resume_at)
            collected.append(line)
            line = handle.readline()

        return _as_bytes('').join(collected)
Пример #52
0
    def __iter__(self):
        """Iterate over the XML file; yield key, start offset, and length.

        Each result is one block from a line containing ``qstart_mark``
        through the line containing ``qend_mark``. The key is the text of
        ``<Iteration_query-ID>``; when that starts with ``Query_``, the
        first word of ``<Iteration_query-def>`` is used instead. If the
        ID/def pair is not found, the values in ``self._fallback`` are used.
        """
        qstart_mark = self.qstart_mark
        qend_mark = self.qend_mark
        blast_id_mark = _as_bytes('Query_')
        handle = self._handle
        handle.seek(0)
        # every fragment is a raw string so that \s is not treated as an
        # (invalid) string escape
        re_desc = re.compile(_as_bytes(r'<Iteration_query-ID>(.*?)'
                r'</Iteration_query-ID>\s+?<Iteration_query-def>'
                r'(.*?)</Iteration_query-def>'))
        re_desc_end = re.compile(_as_bytes(r'</Iteration_query-def>'))

        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                break
            if qstart_mark not in line:
                continue
            # The following requirements are to make supporting BGZF compressed
            # BLAST XML files simpler (avoids complex offset manipulations):
            assert line.count(qstart_mark) == 1, "XML without line breaks?"
            assert line.lstrip().startswith(qstart_mark), line
            if qend_mark in line:
                # Should cope with <Iteration>...</Iteration> on one long line
                block = line
            else:
                # Load the rest of this block up to and including </Iteration>
                block = [line]
                while line and qend_mark not in line:
                    line = handle.readline()
                    assert qstart_mark not in line, line
                    block.append(line)
                assert line.rstrip().endswith(qend_mark), line
                block = _empty_bytes_string.join(block)
            assert block.count(qstart_mark) == 1, "XML without line breaks? %r" % block
            assert block.count(qend_mark) == 1, "XML without line breaks? %r" % block
            # Now we have a full <Iteration>...</Iteration> block, find the ID
            regx = re_desc.search(block)
            if regx is not None:
                qstart_desc = regx.group(2)
                qstart_id = regx.group(1)
            else:
                # use the fallback values
                assert re_desc_end.search(block)
                qstart_desc = _as_bytes(self._fallback['description'])
                qstart_id = _as_bytes(self._fallback['id'])
            if qstart_id.startswith(blast_id_mark):
                qstart_id = qstart_desc.split(_as_bytes(' '), 1)[0]
            yield _bytes_to_string(qstart_id), start_offset, len(block)
Пример #53
0
 def __init__(self, filename, format, alphabet):
     """Initialise the base class and set the record marker for ``format``.

     Stores the marker text in ``self._marker`` and a compiled bytes
     pattern anchored at the start of a line in ``self._marker_re``.
     Raises KeyError if ``format`` is not one of the keys below.
     """
     SeqFileRandomAccess.__init__(self, filename, format, alphabet)
     # The marker is used as a regular expression, e.g. the dots in the
     # "pir" entry match any character.
     marker = {
         "ace": "CO ",
         "embl": "ID ",
         "fasta": ">",
         "genbank": "LOCUS ",
         "gb": "LOCUS ",
         "imgt": "ID ",
         "phd": "BEGIN_SEQUENCE",
         "pir": ">..;",
         "qual": ">",
         "swiss": "ID ",
         "uniprot-xml": "<entry ",
     }[format]
     self._marker = marker
     self._marker_re = re.compile(_as_bytes("^%s" % marker))
Пример #54
0
 def __init__(self, filename, format, alphabet):
     """Initialise the base class and set the record marker for ``format``.

     Stores the marker text in ``self._marker`` and a compiled bytes
     pattern anchored at the start of a line in ``self._marker_re``.
     Raises KeyError if ``format`` is not one of the keys below.
     """
     SeqFileRandomAccess.__init__(self, filename, format, alphabet)
     # The marker is used as a regular expression, e.g. the dots in the
     # "pir" entry match any character.
     marker = {
         "ace": "CO ",
         "embl": "ID ",
         "fasta": ">",
         "genbank": "LOCUS ",
         "gb": "LOCUS ",
         "imgt": "ID ",
         "phd": "BEGIN_SEQUENCE",
         "pir": ">..;",
         "qual": ">",
         "swiss": "ID ",
         "uniprot-xml": "<entry ",
     }[format]
     self._marker = marker
     self._marker_re = re.compile(_as_bytes("^%s" % marker))
Пример #55
0
 def read(self, handle):
     """Set up the parser and let it parse the XML results.

     Returns ``self.object`` after ``self.parser.ParseFile(handle)`` has
     run. Raises IOError for a closed handle, CorruptedXMLError or
     NotXMLError when expat reports an error, and RuntimeError or
     NotXMLError when no result object was produced.
     """
     # HACK: remove Bio._py3k handle conversion, since the Entrez XML parser
     # expects binary data
     if handle.__class__.__name__ == 'EvilHandleHack':
         handle = handle._handle
     if handle.__class__.__name__ == 'TextIOWrapper':
         handle = handle.buffer
     if hasattr(handle, "closed") and handle.closed:
         # Should avoid a possible Segmentation Fault, see:
         # http://bugs.python.org/issue4877
         raise IOError("Can't parse a closed handle")
     if sys.version_info[0] >= 3:
         # Another nasty hack to cope with a unicode StringIO handle
         # since the Entrez XML parser expects binary data (bytes)
         from io import BytesIO, StringIO
         if isinstance(handle, StringIO):
             from SAP.Bio._py3k import _as_bytes
             handle = BytesIO(_as_bytes(handle.read()))
     try:
         self.parser.ParseFile(handle)
     except expat.ExpatError as e:
         if not self.parser.StartElementHandler:
             # We have not seen the initial <!xml declaration, so probably
             # the input data is not in XML format.
             raise NotXMLError(e)
         # We saw the initial <!xml declaration, so we can be sure that
         # we are parsing XML data. Most likely, the XML file is
         # corrupted.
         raise CorruptedXMLError(e)
     try:
         return self.object
     except AttributeError:
         if not self.parser.StartElementHandler:
             # We did not see the initial <!xml declaration, so probably
             # the input data is not in XML format.
             raise NotXMLError("XML declaration not found")
         # We saw the initial <!xml declaration, and expat didn't notice
         # any errors, so self.object should be defined. If not, this is
         # a bug.
         raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at [email protected] for assistance.")
Пример #56
0
 def __iter__(self):
     """Iterate over the file, yielding (key, start offset, length).

     Each non-blank line is one record; its key is the text before the
     first tab and its length is the length of the whole line. Blank
     lines are skipped.
     """
     handle = self._handle
     handle.seek(0)
     tab_char = _as_bytes("\t")
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if not line:
             break  # End of file
         if not line.strip():
             # Ignore blank lines. This has to be an explicit test:
             # split(...)[0] never raises, so a blank line would
             # otherwise be yielded with its line ending as the key.
             continue
         key = line.split(tab_char)[0]
         yield _bytes_to_string(key), start_offset, len(line)
Пример #57
0
def _open(cgi, params=None, post=None, ecitmatch=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it.  ecitmatch is
    passed through to the step that encodes the options.

    The argument post should be a boolean to explicitly control if an HTTP
    POST should be used rather than an HTTP GET based on the query length.
    By default (post=None), POST is used if the URL encoded parameters would
    be over 1000 characters long.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.

    Returns the opened handle wrapped by _binary_to_string_handle.  Errors
    raised while opening the URL (such as _HTTPError) are not handled here
    and propagate to the caller unchanged.
    """
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        # Record the moment the sleep was due to finish as the last query
        _open.previous = current + wait
    else:
        _open.previous = current

    params = _construct_params(params)
    options = _encode_options(ecitmatch, params)

    # By default, post is None. Set to a boolean to over-ride length choice:
    if post is None and len(options) > 1000:
        post = True
    cgi = _construct_cgi(cgi, post, options)

    # No try/except around the open: catching _HTTPError only to re-raise it
    # adds nothing, so the exception is simply allowed to propagate.
    if post:
        handle = _urlopen(cgi, data=_as_bytes(options))
    else:
        handle = _urlopen(cgi)

    return _binary_to_string_handle(handle)
Пример #58
0
    def __iter__(self):
        """Iterate over the file handle; yields key, start offset, and a zero length."""
        handle = self._handle
        handle.seek(0)
        record_start = handle.tell()
        id_pattern = re.compile(_as_bytes(_QRE_ID_LEN_PTN))

        while True:
            line = read_forward(handle)
            # handle position once this line has been consumed
            line_end = handle.tell()

            if line.startswith(self.qresult_start):
                # the line carrying the start mark also carries the key
                key_match = id_pattern.search(line)
                qresult_key = key_match.group(1).strip()
                # the record begins where this line begins
                record_start = line_end - len(line)
            elif line.startswith(self.qresult_end):
                # end mark reached: emit the record, length reported as 0
                yield _bytes_to_string(qresult_key), record_start, 0
                record_start = line_end
            elif not line:
                # an empty read ends the iteration
                break
Пример #59
0
    def __iter__(self):
        """Iterate over the file handle; yields key, start offset, and length.

        Consecutive rows sharing the same value in the query identifier
        column are grouped into one entry.  The yielded length runs from the
        first row of the group up to (not including) the first row of the
        next group, or up to the end of the last row read at end of file.
        """
        handle = self._handle
        handle.seek(0)
        # denotes column location for query identifier
        query_id_idx = 9
        qresult_key = None
        tab_char = _as_bytes('\t')

        start_offset = handle.tell()
        line = handle.readline()
        # read through header
        # this assumes that the result row match the regex
        while not re.search(_RE_ROW_CHECK_IDX, line.strip()):
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                # No result rows at all.  Use a plain return: raising
                # StopIteration inside a generator is turned into a
                # RuntimeError under PEP 479 (Python 3.7+).
                return

        # and index the qresults
        while True:
            # handle position just past `line`, the row being examined
            end_offset = handle.tell()

            cols = [x for x in line.strip().split(tab_char) if x]
            if qresult_key is None:
                qresult_key = cols[query_id_idx]
            else:
                curr_key = cols[query_id_idx]

                if curr_key != qresult_key:
                    # `line` is the first row of the next qresult, so the
                    # finished qresult stops where this row starts; using
                    # end_offset here would count this row twice.
                    next_start = end_offset - len(line)
                    yield (_bytes_to_string(qresult_key), start_offset,
                           next_start - start_offset)
                    qresult_key = curr_key
                    start_offset = next_start

            line = handle.readline()
            if not line:
                # EOF: end_offset already marks the end of the last row
                yield (_bytes_to_string(qresult_key), start_offset,
                       end_offset - start_offset)
                break