Example #1
    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()
        regex_id = re.compile(_as_bytes(r'Query\s*(?:sequence|HMM)?:\s*(.*)'))

        # determine flag for hmmsearch
        is_hmmsearch = False
        line = read_forward(handle)
        if line.startswith(_as_bytes('hmmsearch')):
            is_hmmsearch = True

        while True:
            end_offset = handle.tell()

            if line.startswith(self.qresult_start):
                regx = re.search(regex_id, line)
                qresult_key = regx.group(1).strip()
                # qresult start offset is the offset of this line
                # (starts with the start mark)
                start_offset = end_offset - len(line)
            elif line.startswith(self.qresult_end):
                yield _bytes_to_string(qresult_key), start_offset, 0
                start_offset = end_offset
            elif not line:
                # HACK: since hmmsearch can only have one query result
                if is_hmmsearch:
                    yield _bytes_to_string(qresult_key), start_offset, 0
                break

            line = read_forward(handle)
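
All of these indexer __iter__ methods follow the same contract: scan the handle once and yield (key, start_offset, length) tuples (the HMMER text indexer above yields 0 for the length and relies on the recorded offsets instead). A rough, self-contained sketch of that contract and of how the offsets are consumed afterwards — a toy record format and a toy_index helper, not Biopython's actual classes:

import io

# Two fake hmmsearch-style blocks: a "Query: <id> ..." line starts a record
# and "//" ends it (the layout is simplified for the demo).
data = (
    b"hmmsearch :: search profile(s) against a sequence database\n"
    b"Query:       globin  [M=149]\n"
    b"...alignment text...\n"
    b"//\n"
    b"Query:       kringle  [M=80]\n"
    b"...more alignment text...\n"
    b"//\n"
)
handle = io.BytesIO(data)

def toy_index(handle):
    """Yield (key, start_offset, length) for each Query ... // block."""
    handle.seek(0)
    key = start = None
    while True:
        offset = handle.tell()
        line = handle.readline()
        if not line:
            break
        if line.startswith(b"Query:"):
            key = line.split(None, 2)[1].decode()
            start = offset
        elif line.startswith(b"//") and key is not None:
            yield key, start, handle.tell() - start
            key = None

index = list(toy_index(handle))
for key, start, length in index:
    handle.seek(start)          # the stored offset lets us jump straight to the record
    print(key, handle.read(length).startswith(b"Query:"))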
Example #2
    def __iter__(self):
        """Iterate over Hmmer2TextIndexer; yields query results' key, offsets, 0."""
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()
        regex_id = re.compile(_as_bytes(r'Query\s*(?:sequence|HMM)?:\s*(.*)'))

        # determine flag for hmmsearch
        is_hmmsearch = False
        line = read_forward(handle)
        if line.startswith(_as_bytes('hmmsearch')):
            is_hmmsearch = True

        while True:
            end_offset = handle.tell()

            if line.startswith(self.qresult_start):
                regx = re.search(regex_id, line)
                qresult_key = regx.group(1).strip()
                # qresult start offset is the offset of this line
                # (starts with the start mark)
                start_offset = end_offset - len(line)
            elif line.startswith(self.qresult_end):
                yield _bytes_to_string(qresult_key), start_offset, 0
                start_offset = end_offset
            elif not line:
                # HACK: since hmmsearch can only have one query result
                if is_hmmsearch:
                    yield _bytes_to_string(qresult_key), start_offset, 0
                break

            line = read_forward(handle)
Example #3
 def get_raw_check(self, filename, format, alphabet):
     handle = open(filename, "rb")
     raw_file = handle.read()
     handle.close()
     #Also checking the key_function here
     id_list = [rec.id.lower() for rec in \
                SeqIO.parse(filename, format, alphabet)]
     rec_dict = SeqIO.index(filename, format, alphabet,
                            key_function = lambda x : x.lower())
     self.assertEqual(set(id_list), set(rec_dict.keys()))
     self.assertEqual(len(id_list), len(rec_dict))
     for key in id_list:
         self.assertTrue(key in rec_dict)
         self.assertEqual(key, rec_dict[key].id.lower())
         self.assertEqual(key, rec_dict.get(key).id.lower())
         raw = rec_dict.get_raw(key)
         self.assertTrue(raw.strip())
         self.assertTrue(raw in raw_file)
         rec1 = rec_dict[key]
         #Following isn't very elegant, but it lets me test that the
         #__getitem__ SFF code is working.
         if format in SeqIO._BinaryFormats:
             handle = BytesIO(raw)
         else:
             handle = StringIO(_bytes_to_string(raw))
         if format == "sff":
             rec2 = SeqIO.SffIO._sff_read_seq_record(handle,
                         rec_dict._proxy._flows_per_read,
                         rec_dict._proxy._flow_chars,
                         rec_dict._proxy._key_sequence,
                         rec_dict._proxy._alphabet,
                         trim=False)
         elif format == "sff-trim":
             rec2 = SeqIO.SffIO._sff_read_seq_record(handle,
                         rec_dict._proxy._flows_per_read,
                         rec_dict._proxy._flow_chars,
                         rec_dict._proxy._key_sequence,
                         rec_dict._proxy._alphabet,
                         trim=True)
         elif format == "uniprot-xml":
             self.assertTrue(raw.startswith(_as_bytes("<entry ")))
             self.assertTrue(raw.endswith(_as_bytes("</entry>")))
             #Currently the __getitem__ method uses this
             #trick too, but we hope to fix that later
             raw = """<?xml version='1.0' encoding='UTF-8'?>
             <uniprot xmlns="http://uniprot.org/uniprot"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://uniprot.org/uniprot
             http://www.uniprot.org/support/docs/uniprot.xsd">
             %s
             </uniprot>
             """ % _bytes_to_string(raw)
             handle = StringIO(raw)
             rec2 = SeqIO.read(handle, format, alphabet)
         else:
             rec2 = SeqIO.read(handle, format, alphabet)
         self.assertEqual(True, compare_record(rec1, rec2))
     rec_dict._proxy._handle.close() #TODO - Better solution
     del rec_dict
Example #4
File: _index.py Project: LyonsLab/coge
 def __iter__(self):
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     dot_char = _as_bytes(".")
     accession_marker = _as_bytes("ACCESSION ")
     version_marker = _as_bytes("VERSION ")
     #Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     #Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         #We cannot assume the record.id is the first word after LOCUS,
         #normally the first entry on the VERSION or ACCESSION line is used.
         key = None
         while True:
             line = handle.readline()
             if marker_re.match(line) or not line:
                 if not key:
                     raise ValueError("Did not find ACCESSION/VERSION lines")
                 end_offset = handle.tell() - len(line)
                 yield _bytes_to_string(key), start_offset, end_offset - start_offset
                 start_offset = end_offset
                 break
             elif line.startswith(accession_marker):
                 key = line.rstrip().split()[1]
             elif line.startswith(version_marker):
                 version_id = line.rstrip().split()[1]
                 if version_id.count(dot_char)==1 and version_id.split(dot_char)[1].isdigit():
                     #This should mimic the GenBank parser...
                     key = version_id
     assert not line, repr(line)
Example #5
File: _index.py Project: umbrr/biopython
 def __iter__(self):
     handle = self._handle
     handle.seek(0)
     # Skip any header
     offset = 0
     line = ""
     while True:
         offset += len(line)
         line = handle.readline()
         if not line:
             break  # Premature end of file, or just empty?
         if not line.startswith(b";;"):
             break
     while line:
         length = 0
         assert offset + len(line) == handle.tell()
         if not line.startswith(b";"):
             raise ValueError("Records should start with ';' and not:\n%r" % line)
         while line.startswith(b";"):
             length += len(line)
             line = handle.readline()
         key = line.rstrip()
         # Now look for the first line which starts ";"
         while line and not line.startswith(b";"):
             length += len(line)
             line = handle.readline()
         yield _bytes_to_string(key), offset, length
         offset += length
         assert offset + len(line) == handle.tell()
Example #6
 def __iter__(self):
     """Return (id, offset, length) tuples."""
     marker_offset = len(self._marker)
     marker_re = self._marker_re
     handle = self._handle
     handle.seek(0)
     # Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         # Here we can assume the record.id is the first word after the
         # marker. This is generally fine... but not for GenBank, EMBL, Swiss
         id = line[marker_offset:].strip().split(None, 1)[0]
         length = len(line)
         while True:
             end_offset = handle.tell()
             line = handle.readline()
             if marker_re.match(line) or not line:
                 yield _bytes_to_string(id), start_offset, length
                 start_offset = end_offset
                 break
             else:
                 # Track this explicitly as can't do file offset difference on BGZF
                 length += len(line)
     assert not line, repr(line)
Example #7
 def __iter__(self):
     """Iterate over the sequence records in the file."""
     handle = self._handle
     handle.seek(0)
     # Skip any header
     offset = 0
     line = ""
     while True:
         offset += len(line)
         line = handle.readline()
         if not line:
             break  # Premature end of file, or just empty?
         if not line.startswith(b";;"):
             break
     while line:
         length = 0
         assert offset + len(line) == handle.tell()
         if not line.startswith(b";"):
             raise ValueError("Records should start with ';' and not:\n%r" %
                              line)
         while line.startswith(b";"):
             length += len(line)
             line = handle.readline()
         key = line.rstrip()
         # Now look for the first line which starts ";"
         while line and not line.startswith(b";"):
             length += len(line)
             line = handle.readline()
         yield _bytes_to_string(key), offset, length
         offset += length
         assert offset + len(line) == handle.tell()
Example #8
 def __iter__(self):
     """Returns (id,offset) tuples."""
     marker_offset = len(self._marker)
     marker_re = self._marker_re
     handle = self._handle
     handle.seek(0)
     #Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     #Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         #Here we can assume the record.id is the first word after the
         #marker. This is generally fine... but not for GenBank, EMBL, Swiss
         id = line[marker_offset:].strip().split(None, 1)[0]
         length = len(line)
         while True:
             end_offset = handle.tell()
             line = handle.readline()
             if marker_re.match(line) or not line:
                 yield _bytes_to_string(id), start_offset, length
                 start_offset = end_offset
                 break
             else:
                 #Track this explicitly as can't do file offset difference on BGZF
                 length += len(line)
     assert not line, repr(line)
Example #9
 def __iter__(self):
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     semi_char = _as_bytes(";")
     #Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     #Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         length = len(line)
         #We cannot assume the record.id is the first word after ID,
         #normally the following AC line is used.
         line = handle.readline()
         length += len(line)
         assert line.startswith(_as_bytes("AC "))
         key = line[3:].strip().split(semi_char)[0].strip()
         while True:
             end_offset = handle.tell()
             line = handle.readline()
             if marker_re.match(line) or not line:
                 yield _bytes_to_string(key), start_offset, length
                 start_offset = end_offset
                 break
             length += len(line)
     assert not line, repr(line)
Example #10
 def __iter__(self):
     """Returns (id,offset) tuples."""
     marker_offset = len(self._marker)
     marker_re = self._marker_re
     handle = self._handle
     handle.seek(0)
     #Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     #Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         #Here we can assume the record.id is the first word after the
         #marker. This is generally fine... but not for GenBank, EMBL, Swiss
         id = line[marker_offset:].strip().split(None, 1)[0]
         while True:
             line = handle.readline()
             if marker_re.match(line) or not line:
                 end_offset = handle.tell() - len(line)
                 yield _bytes_to_string(
                     id), start_offset, end_offset - start_offset
                 start_offset = end_offset
                 break
     assert not line, repr(line)
Example #11
 def __iter__(self):
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     semi_char = _as_bytes(";")
     #Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     #Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         #We cannot assume the record.id is the first word after ID,
         #normally the following AC line is used.
         line = handle.readline()
         assert line.startswith(_as_bytes("AC "))
         key = line[3:].strip().split(semi_char)[0].strip()
         while True:
             line = handle.readline()
             if marker_re.match(line) or not line:
                 end_offset = handle.tell() - len(line)
                 yield _bytes_to_string(
                     key), start_offset, end_offset - start_offset
                 start_offset = end_offset
                 break
     assert not line, repr(line)
Example #12
    def __iter__(self):
        """Iterate over FastaM10Indexer; yields query results' keys, start offsets, offset lengths."""
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()
        qresult_key = None
        query_mark = b">>>"

        while True:
            line = handle.readline()
            peekline = handle.peekline()
            end_offset = handle.tell()

            if not line.startswith(query_mark) and query_mark in line:
                regx = re.search(_RE_ID_DESC_SEQLEN_IDX, line)
                qresult_key = _bytes_to_string(regx.group(1))
                start_offset = end_offset - len(line)
            # yield whenever we encounter a new query or at the end of the file
            if qresult_key is not None:
                if (not peekline.startswith(query_mark)
                        and query_mark in peekline) or not line:
                    yield qresult_key, start_offset, end_offset - start_offset
                    if not line:
                        break
                    start_offset = end_offset
Example #13
 def __iter__(self):
     """Iterate over the sequence records in the file."""
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     # Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         length = len(line)
         # We cannot assume the record.id is the first word after ID,
         # normally the following AC line is used.
         line = handle.readline()
         length += len(line)
         assert line.startswith(b"AC ")
         key = line[3:].strip().split(b";")[0].strip()
         while True:
             end_offset = handle.tell()
             line = handle.readline()
             if marker_re.match(line) or not line:
                 yield _bytes_to_string(key), start_offset, length
                 start_offset = end_offset
                 break
             length += len(line)
     assert not line, repr(line)
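
The key extraction above can be illustrated on a made-up SwissProt-style AC line; a small standalone check of the same slicing and splitting:

# Toy illustration of the AC-line key extraction (the AC line content is invented).
line = b"AC   P12345; Q99999;\n"
key = line[3:].strip().split(b";")[0].strip()
print(key)  # b'P12345'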
Example #14
    def __iter__(self):
        """Iterate over the file handle; yields key, start offset, and length."""
        handle = self._handle
        handle.seek(0)
        query_id_idx = self._query_id_idx
        qresult_key = None
        header_mark = _as_bytes("#")
        split_mark = _as_bytes(" ")
        # set line with initial mock value, to emulate header
        line = header_mark

        # read through header
        while line.startswith(header_mark):
            start_offset = handle.tell()
            line = handle.readline()

        # and index the qresults
        while True:
            end_offset = handle.tell()

            if not line:
                break

            cols = [x for x in line.strip().split(split_mark) if x]
            if qresult_key is None:
                qresult_key = cols[query_id_idx]
            else:
                curr_key = cols[query_id_idx]

                if curr_key != qresult_key:
                    adj_end = end_offset - len(line)
                    yield (
                        _bytes_to_string(qresult_key),
                        start_offset,
                        adj_end - start_offset,
                    )
                    qresult_key = curr_key
                    start_offset = adj_end

            line = handle.readline()
            if not line:
                yield (
                    _bytes_to_string(qresult_key),
                    start_offset,
                    end_offset - start_offset,
                )
                break
Example #15
    def __iter__(self):
        """Iterate over BlastXmlIndexer yields qstart_id, start_offset, block's length."""
        qstart_mark = self.qstart_mark
        qend_mark = self.qend_mark
        blast_id_mark = _as_bytes("Query_")
        block_size = self.block_size
        handle = self._handle
        handle.seek(0)
        re_desc = re.compile(
            _as_bytes(
                r"<Iteration_query-ID>(.*?)"
                r"</Iteration_query-ID>\s+?"
                "<Iteration_query-def>"
                "(.*?)</Iteration_query-def>"
            )
        )
        re_desc_end = re.compile(_as_bytes(r"</Iteration_query-def>"))
        counter = 0

        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                break
            if qstart_mark not in line:
                continue
            # The following requirements are to make supporting BGZF compressed
            # BLAST XML files simpler (avoids complex offset manipulations):
            assert line.count(qstart_mark) == 1, "XML without line breaks?"
            assert line.lstrip().startswith(qstart_mark), line
            if qend_mark in line:
                # Should cope with <Iteration>...</Iteration> on one long line
                block = line
            else:
                # Load the rest of this block up to and including </Iteration>
                block = [line]
                while line and qend_mark not in line:
                    line = handle.readline()
                    assert qstart_mark not in line, line
                    block.append(line)
                assert line.rstrip().endswith(qend_mark), line
                block = _empty_bytes_string.join(block)
            assert block.count(qstart_mark) == 1, "XML without line breaks? %r" % block
            assert block.count(qend_mark) == 1, "XML without line breaks? %r" % block
            # Now we have a full <Iteration>...</Iteration> block, find the ID
            regx = re.search(re_desc, block)
            try:
                qstart_desc = regx.group(2)
                qstart_id = regx.group(1)
            except AttributeError:
                # use the fallback values
                assert re.search(re_desc_end, block)
                qstart_desc = _as_bytes(self._fallback["description"])
                qstart_id = _as_bytes(self._fallback["id"])
            if qstart_id.startswith(blast_id_mark):
                qstart_id = qstart_desc.split(_as_bytes(" "), 1)[0]
            yield _bytes_to_string(qstart_id), start_offset, len(block)
            counter += 1
Example #16
 def get_qresult_id(self, pos):
     """Return the query ID of the nearest cigar line."""
     handle = self._handle
     handle.seek(pos)
     # get line, check if it's a cigar line, and get query ID
     line = handle.readline()
     assert line.startswith(self._query_mark), line
     id = re.search(_RE_CIGAR, _bytes_to_string(line))
     return id.group(1)
Example #17
 def __iter__(self):
     """Iterate over the sequence records in the file."""
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     semi_char = b";"
     sv_marker = b"SV "
     ac_marker = b"AC "
     # Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         # We cannot assume the record.id is the first word after ID,
         # normally the SV line is used.
         setbysv = False  # resets sv as false
         length = len(line)
         if line[2:].count(b";") in [5, 6]:
             # Looks like the semi colon separated style introduced in 2006
             # Or style from IPD-IMGT/HLA after their v3.16.0 release
             parts = line[3:].rstrip().split(b";")
             if parts[1].strip().startswith(sv_marker):
                 # The SV bit gives the version
                 key = parts[0].strip() + b"." + \
                     parts[1].strip().split()[1]
                 setbysv = True
             else:
                 key = parts[0].strip()
         elif line[2:].count(b";") in [2, 3]:
             # Looks like the pre 2006 style, take first word only
             # Or, with two colons, the KIPO patent variation
             key = line[3:].strip().split(None, 1)[0]
             if key.endswith(b";"):
                 key = key[:-1]
         else:
             raise ValueError('Did not recognise the ID line layout:\n%r' %
                              line)
         while True:
             end_offset = handle.tell()
             line = handle.readline()
             if marker_re.match(line) or not line:
                 end_offset = handle.tell() - len(line)
                 yield _bytes_to_string(key), start_offset, length
                 start_offset = end_offset
                 break
             elif line.startswith(ac_marker) and not setbysv:
                 key = line.rstrip().split()[1]
                 if key.endswith(b";"):
                     key = key[:-1]
             elif line.startswith(sv_marker):
                 key = line.rstrip().split()[1]
                 setbysv = True
             length += len(line)
     assert not line, repr(line)
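
A quick standalone walk-through of the post-2006 ID-line branch above, using the classic X56734 EMBL example line (assumed here purely for illustration); the accession and the SV field are joined into an accession.version key:

line = b"ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.\n"
parts = line[3:].rstrip().split(b";")
if parts[1].strip().startswith(b"SV "):
    # The SV field gives the version number
    key = parts[0].strip() + b"." + parts[1].strip().split()[1]
else:
    key = parts[0].strip()
print(key)  # b'X56734.1'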
Example #18
 def get_qresult_id(self, pos):
     """Return the query ID of the nearest vulgar line."""
     handle = self._handle
     handle.seek(pos)
     # get line, check if it's a vulgar line, and get query ID
     line = handle.readline()
     assert line.startswith(self._query_mark), line
     id = re.search(_RE_VULGAR, _bytes_to_string(line))
     return id.group(1)
Example #19
File: _index.py Project: umbrr/biopython
 def __iter__(self):
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     semi_char = b";"
     sv_marker = b"SV "
     ac_marker = b"AC "
     # Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         # We cannot assume the record.id is the first word after ID,
         # normally the SV line is used.
         setbysv = False  # resets sv as false
         length = len(line)
         if line[2:].count(b";") in [5, 6]:
             # Looks like the semi colon separated style introduced in 2006
             # Or style from IPD-IMGT/HLA after their v3.16.0 release
             parts = line[3:].rstrip().split(b";")
             if parts[1].strip().startswith(sv_marker):
                 # The SV bit gives the version
                 key = parts[0].strip() + b"." + \
                     parts[1].strip().split()[1]
                 setbysv = True
             else:
                 key = parts[0].strip()
         elif line[2:].count(b";") in [2, 3]:
             # Looks like the pre 2006 style, take first word only
             # Or, with two colons, the KIPO patent variation
             key = line[3:].strip().split(None, 1)[0]
             if key.endswith(b";"):
                 key = key[:-1]
         else:
             raise ValueError(
                 'Did not recognise the ID line layout:\n%r' % line)
         while True:
             end_offset = handle.tell()
             line = handle.readline()
             if marker_re.match(line) or not line:
                 end_offset = handle.tell() - len(line)
                 yield _bytes_to_string(key), start_offset, length
                 start_offset = end_offset
                 break
             elif line.startswith(ac_marker) and not setbysv:
                 key = line.rstrip().split()[1]
                 if key.endswith(b";"):
                     key = key[:-1]
             elif line.startswith(sv_marker):
                 key = line.rstrip().split()[1]
                 setbysv = True
             length += len(line)
     assert not line, repr(line)
Example #20
def gzip_open(filename, format):
    # At time of writing, under Python 3.2.2 seems gzip.open(filename, mode)
    # insists on giving byte strings (i.e. binary mode)
    # See http://bugs.python.org/issue13989
    if sys.version_info[0] < 3 or format in SeqIO._BinaryFormats:
        return gzip.open(filename)
    handle = gzip.open(filename)
    data = handle.read()  # bytes!
    handle.close()
    return StringIO(_bytes_to_string(data))
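
gzip_open works around old Python 3 releases where gzip.open always returned bytes by decoding the whole file into a StringIO. A self-contained sketch of the same decode-then-wrap pattern (the FASTA content is invented, and the in-memory GzipFile is only there so the demo needs no external file):

import gzip
import io
from io import StringIO

# Build a tiny gzipped FASTA in memory so the demo needs no external file.
compressed = io.BytesIO()
with gzip.GzipFile(fileobj=compressed, mode="wb") as gz:
    gz.write(b">seq1 demo record\nACGTACGT\n")
compressed.seek(0)

# Same pattern as gzip_open: read the bytes, decode, wrap in a StringIO.
with gzip.GzipFile(fileobj=compressed, mode="rb") as gz:
    text_handle = StringIO(gz.read().decode())

for line in text_handle:
    print(line.rstrip())

On current Python, gzip.open(filename, "rt") returns a text-mode handle directly, so the workaround only matters for the old interpreter versions the comment above refers to.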
Example #22
    def __iter__(self):
        """Iterate over the sequence records in the file."""
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        accession_marker = b"ACCESSION "
        version_marker = b"VERSION "
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            # We cannot assume the record.id is the first word after LOCUS,
            # normally the first entry on the VERSION or ACCESSION line is used.
            # However if both missing, GenBank parser falls back on LOCUS entry.
            try:
                key = line[5:].split(None, 1)[0]
            except ValueError:
                # Warning?
                # No content in LOCUS line
                key = None
            length = len(line)
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    if not key:
                        raise ValueError(
                            "Did not find usable ACCESSION/VERSION/LOCUS lines"
                        )
                    yield _bytes_to_string(key), start_offset, length
                    start_offset = end_offset
                    break
                elif line.startswith(accession_marker):
                    try:
                        key = line.rstrip().split()[1]
                    except IndexError:
                        # No content in ACCESSION line
                        pass
                elif line.startswith(version_marker):
                    try:
                        version_id = line.rstrip().split()[1]
                        if version_id.count(b".") == 1 and version_id.split(
                                b".")[1].isdigit():
                            # This should mimic the GenBank parser...
                            key = version_id
                    except IndexError:
                        # No content in VERSION line
                        pass

                length += len(line)
        assert not line, repr(line)
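
For context, these (key, offset, length) tuples are what Bio.SeqIO.index and its get_raw method are built on. A short usage sketch; the GenBank file name is a placeholder, not a file shipped with this code:

from Bio import SeqIO

gb_index = SeqIO.index("ls_orchid.gbk", "genbank")  # placeholder path; any multi-record GenBank file
keys = list(gb_index)
print(keys[:3])                  # accession.version keys produced by the indexer above
raw = gb_index.get_raw(keys[0])  # raw bytes slice, b"LOCUS ..." through the "//" terminator
print(raw[:5])                   # b'LOCUS'
gb_index.close()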
Example #23
 def __iter__(self):
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     semi_char = _as_bytes(";")
     dot_char = _as_bytes(".")
     sv_marker = _as_bytes("SV ")
     ac_marker = _as_bytes("AC ")
     # Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         # We cannot assume the record.id is the first word after ID,
         # normally the SV line is used.
         setbysv = False  # resets sv as false
         length = len(line)
         if line[2:].count(semi_char) == 6:
             # Looks like the semi colon separated style introduced in 2006
             parts = line[3:].rstrip().split(semi_char)
             if parts[1].strip().startswith(sv_marker):
                 # The SV bit gives the version
                 key = parts[0].strip() + dot_char + \
                     parts[1].strip().split()[1]
                 setbysv = True
             else:
                 key = parts[0].strip()
         elif line[2:].count(semi_char) == 3:
             # Looks like the pre 2006 style, take first word only
             key = line[3:].strip().split(None, 1)[0]
             if key.endswith(semi_char):
                 key = key[:-1]
         else:
             raise ValueError(
                 'Did not recognise the ID line layout:\n%r' % line)
         while True:
             end_offset = handle.tell()
             line = handle.readline()
             if marker_re.match(line) or not line:
                 end_offset = handle.tell() - len(line)
                 yield _bytes_to_string(key), start_offset, length
                 start_offset = end_offset
                 break
             elif line.startswith(ac_marker) and not setbysv:
                 key = line.rstrip().split()[1]
                 if key.endswith(semi_char):
                     key = key[:-1]
             elif line.startswith(sv_marker):
                 key = line.rstrip().split()[1]
                 setbysv = True
             length += len(line)
     assert not line, repr(line)
Example #24
 def __iter__(self):
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     semi_char = _as_bytes(";")
     dot_char = _as_bytes(".")
     sv_marker = _as_bytes("SV ")
     ac_marker = _as_bytes("AC ")
     # Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         # We cannot assume the record.id is the first word after ID,
         # normally the SV line is used.
         setbysv = False  # resets sv as false
         length = len(line)
         if line[2:].count(semi_char) == 6:
             # Looks like the semi colon separated style introduced in 2006
             parts = line[3:].rstrip().split(semi_char)
             if parts[1].strip().startswith(sv_marker):
                 # The SV bit gives the version
                 key = parts[0].strip() + dot_char + \
                     parts[1].strip().split()[1]
                 setbysv = True
             else:
                 key = parts[0].strip()
         elif line[2:].count(semi_char) == 3:
             # Looks like the pre 2006 style, take first word only
             key = line[3:].strip().split(None, 1)[0]
             if key.endswith(semi_char):
                 key = key[:-1]
         else:
             raise ValueError(
                 'Did not recognise the ID line layout:\n%r' % line)
         while True:
             end_offset = handle.tell()
             line = handle.readline()
             if marker_re.match(line) or not line:
                 end_offset = handle.tell() - len(line)
                 yield _bytes_to_string(key), start_offset, length
                 start_offset = end_offset
                 break
             elif line.startswith(ac_marker) and not setbysv:
                 key = line.rstrip().split()[1]
                 if key.endswith(semi_char):
                     key = key[:-1]
             elif line.startswith(sv_marker):
                 key = line.rstrip().split()[1]
                 setbysv = True
             length += len(line)
     assert not line, repr(line)
Example #25
File: AbiIO.py Project: walcob/biopython
def _parse_tag_data(elem_code, elem_num, raw_data):
    """Returns single data value.

    Arguments:
     - elem_code - What kind of data
     - elem_num - How many data points
     - raw_data - abi file object from which the tags would be unpacked

    """
    if elem_code in _BYTEFMT:
        # because '>1s' unpack differently from '>s'
        if elem_num == 1:
            num = ''
        else:
            num = str(elem_num)
        fmt = '>' + num + _BYTEFMT[elem_code]

        assert len(raw_data) == struct.calcsize(fmt)
        data = struct.unpack(fmt, raw_data)

        # no need to use tuple if len(data) == 1
        # also if data is date / time
        if elem_code not in [10, 11] and len(data) == 1:
            data = data[0]

        # account for different data types
        if elem_code == 2:
            return _bytes_to_string(data)
        elif elem_code == 10:
            return str(datetime.date(*data))
        elif elem_code == 11:
            return str(datetime.time(*data[:3]))
        elif elem_code == 13:
            return bool(data)
        elif elem_code == 18:
            return _bytes_to_string(data[1:])
        elif elem_code == 19:
            return _bytes_to_string(data[:-1])
        else:
            return data
    else:
        return None
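
The heart of _parse_tag_data is building a struct format string from the element type code and count. A self-contained sketch of just that step, with a toy two-entry format table (TOY_BYTEFMT is an assumption standing in for the real _BYTEFMT, which covers many more ABIF type codes):

import struct

# Toy subset of a type-code -> struct-format table (assumed values:
# 2 = char bytes, 4 = 16-bit signed integer).
TOY_BYTEFMT = {2: "s", 4: "h"}

def toy_parse(elem_code, elem_num, raw_data):
    num = "" if elem_num == 1 else str(elem_num)     # '>1s' unpacks differently from '>s'
    fmt = ">" + num + TOY_BYTEFMT[elem_code]
    data = struct.unpack(fmt, raw_data)
    return data[0] if len(data) == 1 else data

print(toy_parse(2, 4, b"ABIF"))                      # b'ABIF'
print(toy_parse(4, 3, struct.pack(">3h", 1, 2, 3)))  # (1, 2, 3)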
Example #27
    def __iter__(self):
        """Iterate over BlastXmlIndexer yields qstart_id, start_offset, block's length."""
        qstart_mark = self.qstart_mark
        qend_mark = self.qend_mark
        blast_id_mark = _as_bytes('Query_')
        block_size = self.block_size
        handle = self._handle
        handle.seek(0)
        re_desc = re.compile(_as_bytes(r'<Iteration_query-ID>(.*?)'
                                       r'</Iteration_query-ID>\s+?'
                                       '<Iteration_query-def>'
                                       '(.*?)</Iteration_query-def>'))
        re_desc_end = re.compile(_as_bytes(r'</Iteration_query-def>'))
        counter = 0

        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                break
            if qstart_mark not in line:
                continue
            # The following requirements are to make supporting BGZF compressed
            # BLAST XML files simpler (avoids complex offset manipulations):
            assert line.count(qstart_mark) == 1, "XML without line breaks?"
            assert line.lstrip().startswith(qstart_mark), line
            if qend_mark in line:
                # Should cope with <Iteration>...</Iteration> on one long line
                block = line
            else:
                # Load the rest of this block up to and including </Iteration>
                block = [line]
                while line and qend_mark not in line:
                    line = handle.readline()
                    assert qstart_mark not in line, line
                    block.append(line)
                assert line.rstrip().endswith(qend_mark), line
                block = _empty_bytes_string.join(block)
            assert block.count(qstart_mark) == 1, "XML without line breaks? %r" % block
            assert block.count(qend_mark) == 1, "XML without line breaks? %r" % block
            # Now we have a full <Iteration>...</Iteration> block, find the ID
            regx = re.search(re_desc, block)
            try:
                qstart_desc = regx.group(2)
                qstart_id = regx.group(1)
            except AttributeError:
                # use the fallback values
                assert re.search(re_desc_end, block)
                qstart_desc = _as_bytes(self._fallback['description'])
                qstart_id = _as_bytes(self._fallback['id'])
            if qstart_id.startswith(blast_id_mark):
                qstart_id = qstart_desc.split(_as_bytes(' '), 1)[0]
            yield _bytes_to_string(qstart_id), start_offset, len(block)
            counter += 1
Example #28
    def __iter__(self):
        """Iterate over the file handle; yields key, start offset, and length."""
        handle = self._handle
        handle.seek(0)
        query_id_idx = self._query_id_idx
        qresult_key = None
        header_mark = _as_bytes('#')
        split_mark = _as_bytes(' ')
        # set line with initial mock value, to emulate header
        line = header_mark

        # read through header
        while line.startswith(header_mark):
            start_offset = handle.tell()
            line = handle.readline()

        # and index the qresults
        while True:
            end_offset = handle.tell()

            if not line:
                break

            cols = [x for x in line.strip().split(split_mark) if x]
            if qresult_key is None:
                qresult_key = cols[query_id_idx]
            else:
                curr_key = cols[query_id_idx]

                if curr_key != qresult_key:
                    adj_end = end_offset - len(line)
                    yield (_bytes_to_string(qresult_key), start_offset,
                           adj_end - start_offset)
                    qresult_key = curr_key
                    start_offset = adj_end

            line = handle.readline()
            if not line:
                yield (_bytes_to_string(qresult_key), start_offset,
                       end_offset - start_offset)
                break
Example #29
 def __iter__(self):
     """Iterate over the sequence records in the file."""
     handle = self._handle
     handle.seek(0)
     id = None
     start_offset = handle.tell()
     line = handle.readline()
     if not line:
         # Empty file!
         return
     if line[0:1] != b"@":
         raise ValueError("Problem with FASTQ @ line:\n%r" % line)
     while line:
         # assert line[0]=="@"
         # This record seems OK (so far)
         id = line[1:].rstrip().split(None, 1)[0]
         # Find the seq line(s)
         seq_len = 0
         length = len(line)
         while line:
             line = handle.readline()
             length += len(line)
             if line.startswith(b"+"):
                 break
             seq_len += len(line.strip())
         if not line:
             raise ValueError("Premature end of file in seq section")
         # assert line[0]=="+"
         # Find the qual line(s)
         qual_len = 0
         while line:
             if seq_len == qual_len:
                 if seq_len == 0:
                     # Special case, quality line should be just "\n"
                     line = handle.readline()
                     if line.strip():
                         raise ValueError(
                             "Expected blank quality line, not %r" % line)
                     length += len(line)  # Need to include the blank line
                 # Should be end of record...
                 end_offset = handle.tell()
                 line = handle.readline()
                 if line and line[0:1] != b"@":
                     raise ValueError("Problem with line %r" % line)
                 break
             else:
                 line = handle.readline()
                 qual_len += len(line.strip())
                 length += len(line)
         if seq_len != qual_len:
             raise ValueError("Problem with quality section")
         yield _bytes_to_string(id), start_offset, length
         start_offset = end_offset
Example #30
    def __iter__(self):
        """Iterate over the file handle; yields key, start offset, and length."""
        handle = self._handle
        handle.seek(0)

        if not self._kwargs['comments']:
            iterfunc = self._qresult_index
        else:
            iterfunc = self._qresult_index_commented

        for key, offset, length in iterfunc():
            yield _bytes_to_string(key), offset, length
Example #31
    def __iter__(self):
        """Iterates over the file handle; yields key, start offset, and length."""
        handle = self._handle
        handle.seek(0)

        if not self._kwargs['comments']:
            iterfunc = self._qresult_index
        else:
            iterfunc = self._qresult_index_commented

        for key, offset, length in iterfunc():
            yield _bytes_to_string(key), offset, length
Example #32
    def __iter__(self):
        """Iterates over the file handle; yields key, start offset, and length."""
        handle = self._handle
        handle.seek(0)
        # denotes column location for query identifier
        query_id_idx = 9
        qresult_key = None
        tab_char = b"\t"

        start_offset = handle.tell()
        line = handle.readline()
        # read through header
        # this assumes that the result rows match the regex
        while not re.search(_RE_ROW_CHECK_IDX, line.strip()):
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                return

        # and index the qresults
        while True:
            end_offset = handle.tell()

            cols = [x for x in line.strip().split(tab_char) if x]
            if qresult_key is None:
                qresult_key = cols[query_id_idx]
            else:
                curr_key = cols[query_id_idx]

                if curr_key != qresult_key:
                    yield _bytes_to_string(qresult_key), start_offset, \
                            end_offset - start_offset
                    qresult_key = curr_key
                    start_offset = end_offset - len(line)

            line = handle.readline()
            if not line:
                yield _bytes_to_string(qresult_key), start_offset, \
                        end_offset - start_offset
                break
Example #33
    def __iter__(self):
        """Iterates over the file handle; yields key, start offset, and length."""
        handle = self._handle
        handle.seek(0)
        query_id_idx = self._query_id_idx
        qresult_key = None
        header_mark = _as_bytes('#')
        split_mark = _as_bytes(' ')
        # set line with initial mock value, to emulate header
        line = header_mark

        # read through header
        while line.startswith(header_mark):
            start_offset = handle.tell()
            line = handle.readline()

        # and index the qresults
        while True:
            end_offset = handle.tell()

            if not line:
                break

            cols = line.strip().split(split_mark)
            if qresult_key is None:
                qresult_key = list(filter(None, cols))[query_id_idx]
            else:
                curr_key = list(filter(None, cols))[query_id_idx]

                if curr_key != qresult_key:
                    yield _bytes_to_string(qresult_key), start_offset, \
                            end_offset - start_offset
                    qresult_key = curr_key
                    start_offset = end_offset - len(line)

            line = handle.readline()
            if not line:
                yield _bytes_to_string(qresult_key), start_offset, \
                        end_offset - start_offset
                break
Example #34
    def __iter__(self):
        """Iterates over the file handle; yields key, start offset, and length."""
        handle = self._handle
        handle.seek(0)
        # denotes column location for query identifier
        query_id_idx = 9
        qresult_key = None
        tab_char = _as_bytes('\t')

        start_offset = handle.tell()
        line = handle.readline()
        # read through header
        # this assumes that the result rows match the regex
        while not re.search(_RE_ROW_CHECK_IDX, line.strip()):
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                return  # PEP 479: use return, not raise StopIteration, to end a generator

        # and index the qresults
        while True:
            end_offset = handle.tell()

            cols = line.strip().split(tab_char)
            if qresult_key is None:
                qresult_key = list(filter(None, cols))[query_id_idx]
            else:
                curr_key = list(filter(None, cols))[query_id_idx]

                if curr_key != qresult_key:
                    yield _bytes_to_string(qresult_key), start_offset, \
                            end_offset - start_offset
                    qresult_key = curr_key
                    start_offset = end_offset - len(line)

            line = handle.readline()
            if not line:
                yield _bytes_to_string(qresult_key), start_offset, \
                        end_offset - start_offset
                break
Example #35
 def __iter__(self):
     handle = self._handle
     handle.seek(0)
     id = None
     start_offset = handle.tell()
     line = handle.readline()
     if not line:
         # Empty file!
         return
     at_char = _as_bytes("@")
     plus_char = _as_bytes("+")
     if line[0:1] != at_char:
         raise ValueError("Problem with FASTQ @ line:\n%r" % line)
     while line:
         # assert line[0]=="@"
         # This record seems OK (so far)
         id = line[1:].rstrip().split(None, 1)[0]
         # Find the seq line(s)
         seq_len = 0
         length = len(line)
         while line:
             line = handle.readline()
             length += len(line)
             if line.startswith(plus_char):
                 break
             seq_len += len(line.strip())
         if not line:
             raise ValueError("Premature end of file in seq section")
         # assert line[0]=="+"
         # Find the qual line(s)
         qual_len = 0
         while line:
             if seq_len == qual_len:
                 if seq_len == 0:
                     # Special case, quality line should be just "\n"
                     line = handle.readline()
                     if line.strip():
                         raise ValueError("Expected blank quality line, not %r" % line)
                 # Should be end of record...
                 end_offset = handle.tell()
                 line = handle.readline()
                 if line and line[0:1] != at_char:
                     raise ValueError("Problem with line %r" % line)
                 break
             else:
                 line = handle.readline()
                 qual_len += len(line.strip())
                 length += len(line)
         if seq_len != qual_len:
             raise ValueError("Problem with quality section")
         yield _bytes_to_string(id), start_offset, length
         start_offset = end_offset
Example #36
def _abi_parse_header(header, handle):
    """Generator that returns directory contents.
    """
    # header structure (after ABIF marker):
    # file version, tag name, tag number,
    # element type code, element size, number of elements
    # data size, data offset, handle (not file handle)
    head_elem_size = header[4]
    head_elem_num = header[5]
    head_offset = header[7]
    index = 0

    while index < head_elem_num:
        start = head_offset + index * head_elem_size
        # add directory offset to tuple
        # to handle directories with data size <= 4 bytes
        handle.seek(start)
        dir_entry = struct.unpack(_DIRFMT, handle.read(
            struct.calcsize(_DIRFMT))) + (start, )
        index += 1
        # only parse desired dirs
        key = _bytes_to_string(dir_entry[0])
        key += str(dir_entry[1])

        tag_name = _bytes_to_string(dir_entry[0])
        tag_number = dir_entry[1]
        elem_code = dir_entry[2]
        elem_num = dir_entry[4]
        data_size = dir_entry[5]
        data_offset = dir_entry[6]
        tag_offset = dir_entry[8]
        # if data size <= 4 bytes, data is stored inside tag
        # so offset needs to be changed
        if data_size <= 4:
            data_offset = tag_offset + 20
        handle.seek(data_offset)
        data = handle.read(data_size)
        yield tag_name, tag_number, \
            _parse_tag_data(elem_code, elem_num, data)
Example #37
def _abi_parse_header(header, handle):
    """Generator that returns directory contents.
    """
    # header structure (after ABIF marker):
    # file version, tag name, tag number,
    # element type code, element size, number of elements
    # data size, data offset, handle (not file handle)
    head_elem_size = header[4]
    head_elem_num = header[5]
    head_offset = header[7]
    index = 0

    while index < head_elem_num:
        start = head_offset + index * head_elem_size
        # add directory offset to tuple
        # to handle directories with data size <= 4 bytes
        handle.seek(start)
        dir_entry = struct.unpack(_DIRFMT,
                                  handle.read(struct.calcsize(_DIRFMT))) + (start,)
        index += 1
        # only parse desired dirs
        key = _bytes_to_string(dir_entry[0])
        key += str(dir_entry[1])

        tag_name = _bytes_to_string(dir_entry[0])
        tag_number = dir_entry[1]
        elem_code = dir_entry[2]
        elem_num = dir_entry[4]
        data_size = dir_entry[5]
        data_offset = dir_entry[6]
        tag_offset = dir_entry[8]
        # if data size <= 4 bytes, data is stored inside tag
        # so offset needs to be changed
        if data_size <= 4:
            data_offset = tag_offset + 20
        handle.seek(data_offset)
        data = handle.read(data_size)
        yield tag_name, tag_number, \
            _parse_tag_data(elem_code, elem_num, data)
Example #38
File: _index.py Project: umbrr/biopython
    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        accession_marker = b"ACCESSION "
        version_marker = b"VERSION "
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            # We cannot assume the record.id is the first word after LOCUS,
            # normally the first entry on the VERSION or ACCESSION line is used.
            # However if both missing, GenBank parser falls back on LOCUS entry.
            try:
                key = line[5:].split(None, 1)[0]
            except ValueError:
                # Warning?
                # No content in LOCUS line
                key = None
            length = len(line)
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    if not key:
                        raise ValueError("Did not find usable ACCESSION/VERSION/LOCUS lines")
                    yield _bytes_to_string(key), start_offset, length
                    start_offset = end_offset
                    break
                elif line.startswith(accession_marker):
                    try:
                        key = line.rstrip().split()[1]
                    except IndexError:
                        # No content in ACCESSION line
                        pass
                elif line.startswith(version_marker):
                    try:
                        version_id = line.rstrip().split()[1]
                        if version_id.count(b".") == 1 and version_id.split(b".")[1].isdigit():
                            # This should mimic the GenBank parser...
                            key = version_id
                    except IndexError:
                        # No content in VERSION line
                        pass

                length += len(line)
        assert not line, repr(line)
Example #39
 def get(self, offset):
     #TODO - Can we handle this directly in the parser?
     #This is a hack - use get_raw for <entry>...</entry> and wrap it with
     #the apparently required XML header and footer.
     data = """<?xml version='1.0' encoding='UTF-8'?>
     <uniprot xmlns="http://uniprot.org/uniprot"
     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
     xsi:schemaLocation="http://uniprot.org/uniprot
     http://www.uniprot.org/support/docs/uniprot.xsd">
     %s
     </uniprot>
     """ % _bytes_to_string(self.get_raw(offset))
     #TODO - For consistency, this function should not accept a string:
     return next(SeqIO.UniprotIO.UniprotIterator(data))
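
The same wrap-the-raw-<entry> trick can be shown with only the standard library; a hedged sketch using xml.etree instead of SeqIO.UniprotIO, with an invented entry fragment:

import xml.etree.ElementTree as ET
from io import BytesIO

raw_entry = b'<entry dataset="Swiss-Prot"><accession>P12345</accession></entry>'
wrapped = (
    b"<?xml version='1.0' encoding='UTF-8'?>\n"
    b'<uniprot xmlns="http://uniprot.org/uniprot">\n'
    + raw_entry +
    b"\n</uniprot>\n"
)
root = ET.parse(BytesIO(wrapped)).getroot()
ns = "{http://uniprot.org/uniprot}"
print(root.find(ns + "entry").find(ns + "accession").text)  # P12345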
Example #40
 def get(self, offset):
     #TODO - Can we handle this directly in the parser?
     #This is a hack - use get_raw for <entry>...</entry> and wrap it with
     #the apparently required XML header and footer.
     data = """<?xml version='1.0' encoding='UTF-8'?>
     <uniprot xmlns="http://uniprot.org/uniprot"
     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
     xsi:schemaLocation="http://uniprot.org/uniprot
     http://www.uniprot.org/support/docs/uniprot.xsd">
     %s
     </uniprot>
     """ % _bytes_to_string(self.get_raw(offset))
     #TODO - For consistency, this function should not accept a string:
     return next(SeqIO.UniprotIO.UniprotIterator(data))
Example #41
 def __iter__(self):
     """Iterate over the sequence records in the file."""
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     start_acc_marker = b"<accession>"
     end_acc_marker = b"</accession>"
     end_entry_marker = b"</entry>"
     # Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         length = len(line)
         # We expect the next line to be <accession>xxx</accession>
         # (possibly with leading spaces)
         # but allow it to be later on within the <entry>
         key = None
         while True:
             line = handle.readline()
             if key is None and start_acc_marker in line:
                 assert end_acc_marker in line, line
                 key = line[line.find(start_acc_marker) + 11:].split(
                     b"<", 1)[0]
                 length += len(line)
             elif end_entry_marker in line:
                 length += line.find(end_entry_marker) + 8
                 end_offset = handle.tell() - len(line) \
                     + line.find(end_entry_marker) + 8
                 assert start_offset + length == end_offset
                 break
             elif marker_re.match(line) or not line:
                 # Start of next record or end of file
                 raise ValueError("Didn't find end of record")
             else:
                 length += len(line)
         if not key:
             raise ValueError(
                 "Did not find <accession> line in bytes %i to %i" %
                 (start_offset, start_offset + length))
         yield _bytes_to_string(key), start_offset, length
         # Find start of next record
         while not marker_re.match(line) and line:
             start_offset = handle.tell()
             line = handle.readline()
     assert not line, repr(line)
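The magic numbers above are just tag lengths: 11 is len(b"<accession>") and 8 is len(b"</entry>"). A minimal sketch of the key extraction on a single hypothetical accession line, mirroring the slicing used in the loop:

line = b"  <accession>P12345</accession>\n"   # hypothetical accession line
start_acc_marker = b"<accession>"
# Skip past the opening tag (11 bytes) and keep everything up to the next "<".
key = line[line.find(start_acc_marker) + 11:].split(b"<", 1)[0]
assert key == b"P12345"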
Example #42
0
def _get_string_tag(opt_bytes_value, default=None):
    """Return the string value of the given an optional raw bytes tag value.

    If the bytes value is None, return the given default value.

    """
    if opt_bytes_value is None:
        return default
    try:
        return _bytes_to_string(opt_bytes_value)
    except UnicodeDecodeError:
        # If we are in this 'except' block, a `.decode` call must have been
        # attempted, and so we must be on Python 3, which means opt_bytes_value
        # is a byte string.
        return opt_bytes_value.decode(encoding=sys.getdefaultencoding())
Example #43
0
def _get_string_tag(opt_bytes_value, default=None):
    """Return the string value of the given an optional raw bytes tag value.

    If the bytes value is None, return the given default value.

    """
    if opt_bytes_value is None:
        return default
    try:
        return _bytes_to_string(opt_bytes_value)
    except UnicodeDecodeError:
        # If we are in this 'except' block, a `.decode` call must have been
        # attempted, and so we must be on Python 3, which means opt_bytes_value
        # is a byte string.
        return opt_bytes_value.decode(encoding=sys.getdefaultencoding())
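A quick usage sketch for the helper above (tag values hypothetical); it relies on the module-level _bytes_to_string and sys imports that this excerpt does not show:

print(_get_string_tag(b"KB-1234"))       # -> 'KB-1234' (bytes decoded to str)
print(_get_string_tag(None, "unknown"))  # -> 'unknown' (default returned when the tag is absent)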
Example #44
0
    def get_qresult_id(self, pos):
        """Returns the query ID from the nearest "Query:" line."""
        handle = self._handle
        handle.seek(pos)
        sentinel = _as_bytes('Query:')

        while True:
            line = handle.readline().strip()
            if line.startswith(sentinel):
                break
            if not line:
                raise StopIteration
        qid, desc = _parse_hit_or_query_line(_bytes_to_string(line))

        return qid
Example #45
0
    def get_qresult_id(self, pos):
        """Returns the query ID from the nearest "Query:" line."""
        handle = self._handle
        handle.seek(pos)
        sentinel = _as_bytes('Query:')

        while True:
            line = handle.readline().strip()
            if line.startswith(sentinel):
                break
            if not line:
                raise StopIteration
        qid, desc = _parse_hit_or_query_line(_bytes_to_string(line))

        return qid
Example #46
0
 def __iter__(self):
     handle = self._handle
     handle.seek(0)
     id = None
     start_offset = handle.tell()
     line = handle.readline()
     if not line:
         #Empty file!
         return
     at_char = _as_bytes("@")
     plus_char = _as_bytes("+")
     if line[0:1] != at_char:
         raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
     while line:
         #assert line[0]=="@"
         #This record seems OK (so far)
         id = line[1:].rstrip().split(None, 1)[0]
         #Find the seq line(s)
         seq_len = 0
         length = len(line)
         while line:
             line = handle.readline()
             length += len(line)
             if line.startswith(plus_char):
                 break
             seq_len += len(line.strip())
         if not line:
             raise ValueError("Premature end of file in seq section")
         #assert line[0]=="+"
         #Find the qual line(s)
         qual_len = 0
         while line:
             if seq_len == qual_len:
                 #Should be end of record...
                 end_offset = handle.tell()
                 line = handle.readline()
                 if line and line[0:1] != at_char:
                     ValueError("Problem with line %s" % repr(line))
                 break
             else:
                 line = handle.readline()
                 qual_len += len(line.strip())
                 length += len(line)
         if seq_len != qual_len:
             raise ValueError("Problem with quality section")
         yield _bytes_to_string(id), start_offset, length
         start_offset = end_offset
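The (id, start_offset, length) triples yielded above are enough to recover and re-parse a record later. A minimal sketch, assuming handle is the same binary-mode handle and (key, start_offset, length) is one of the yielded triples (the same round trip the get_raw_check test near the end of this page performs):

from io import StringIO
from Bio import SeqIO

handle.seek(start_offset)
raw = handle.read(length)                # raw bytes of one complete FASTQ record
record = SeqIO.read(StringIO(_bytes_to_string(raw)), "fastq")
print(record.id, len(record))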
Example #47
0
File: _index.py  Project: umbrr/biopython
 def __iter__(self):
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     start_acc_marker = b"<accession>"
     end_acc_marker = b"</accession>"
     end_entry_marker = b"</entry>"
     # Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         length = len(line)
         # We expect the next line to be <accession>xxx</accession>
         # (possibly with leading spaces)
         # but allow it to be later on within the <entry>
         key = None
         while True:
             line = handle.readline()
             if key is None and start_acc_marker in line:
                 assert end_acc_marker in line, line
                 key = line[line.find(start_acc_marker) + 11:].split(b"<", 1)[0]
                 length += len(line)
             elif end_entry_marker in line:
                 length += line.find(end_entry_marker) + 8
                 end_offset = handle.tell() - len(line) \
                     + line.find(end_entry_marker) + 8
                 assert start_offset + length == end_offset
                 break
             elif marker_re.match(line) or not line:
                 # Start of next record or end of file
                 raise ValueError("Didn't find end of record")
             else:
                 length += len(line)
         if not key:
             raise ValueError("Did not find <accession> line in bytes %i to %i"
                              % (start_offset, start_offset + length))
         yield _bytes_to_string(key), start_offset, length
         # Find start of next record
         while not marker_re.match(line) and line:
             start_offset = handle.tell()
             line = handle.readline()
     assert not line, repr(line)
Example #48
0
File: _index.py  Project: BingW/biopython
 def __iter__(self):
     handle = self._handle
     handle.seek(0)
     tab_char = _as_bytes("\t")
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if not line:
             break  # End of file
         try:
             key = line.split(tab_char)[0]
         except ValueError as err:
             if not line.strip():
                 #Ignore blank lines
                 continue
             else:
                 raise err
         else:
             yield _bytes_to_string(key), start_offset, len(line)
Example #49
0
 def __iter__(self):
     handle = self._handle
     handle.seek(0)
     tab_char = _as_bytes("\t")
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if not line:
             break  # End of file
         try:
             key = line.split(tab_char)[0]
         except ValueError as err:
             if not line.strip():
                 #Ignore blank lines
                 continue
             else:
                 raise err
         else:
             yield _bytes_to_string(key), start_offset, len(line)
Example #50
0
 def __iter__(self):
     handle = self._handle
     handle.seek(0)
     id = None
     start_offset = handle.tell()
     line = handle.readline()
     if not line:
         # Empty file!
         return
     at_char = _as_bytes("@")
     plus_char = _as_bytes("+")
     if line[0:1] != at_char:
         raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
     while line:
         # assert line[0]=="@"
         # This record seems OK (so far)
         id = line[1:].rstrip().split(None, 1)[0]
         # Find the seq line(s)
         seq_len = 0
         while line:
             line = handle.readline()
             if line.startswith(plus_char):
                 break
             seq_len += len(line.strip())
         if not line:
             raise ValueError("Premature end of file in seq section")
         # assert line[0]=="+"
         # Find the qual line(s)
         qual_len = 0
         while line:
             if seq_len == qual_len:
                 # Should be end of record...
                 line = handle.readline()
                 if line and line[0:1] != at_char:
                     ValueError("Problem with line %s" % repr(line))
                 break
             else:
                 line = handle.readline()
                 qual_len += len(line.strip())
         if seq_len != qual_len:
             raise ValueError("Problem with quality section")
         end_offset = handle.tell() - len(line)
         yield _bytes_to_string(id), start_offset, end_offset - start_offset
         start_offset = end_offset
Example #51
0
 def __iter__(self):
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     start_acc_marker = _as_bytes("<accession>")
     end_acc_marker = _as_bytes("</accession>")
     end_entry_marker = _as_bytes("</entry>")
     #Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     #Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         #We expect the next line to be <accession>xxx</accession>
         #(possibly with leading spaces)
         #but allow it to be later on within the <entry>
         key = None
         done = False
         while True:
             line = handle.readline()
             if key is None and start_acc_marker in line:
                 assert end_acc_marker in line, line
                 key = line[line.find(start_acc_marker) + 11:].split(
                     _as_bytes("<"))[0]
             elif end_entry_marker in line:
                 end_offset = handle.tell() - len(line) \
                            + line.find(end_entry_marker) + 8
                 break
             elif marker_re.match(line) or not line:
                 #Start of next record or end of file
                 raise ValueError("Didn't find end of record")
         if not key:
             raise ValueError("Did not find <accession> line in bytes %i to %i" \
                              % (start_offset, end_offset))
         yield _bytes_to_string(key), start_offset, end_offset - start_offset
         #Find start of next record
         while not marker_re.match(line) and line:
             start_offset = handle.tell()
             line = handle.readline()
     assert not line, repr(line)
Example #52
0
 def __iter__(self):
     """Iterate over the sequence records in the file."""
     handle = self._handle
     handle.seek(0)
     tab_char = b"\t"
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if not line:
             break  # End of file
         try:
             key = line.split(tab_char)[0]
         except ValueError as err:
             if not line.strip():
                 # Ignore blank lines
                 continue
             else:
                 raise err
         else:
             yield _bytes_to_string(key), start_offset, len(line)
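The tab format this indexer targets is simply one record per line, with the identifier in the first tab-separated column (in Biopython's "tab" format the second column is the sequence), so only the first field matters. A minimal sketch with a hypothetical record:

line = b"seq_1\tACGTACGT\n"          # hypothetical id<TAB>sequence line
key = line.split(b"\t")[0]           # the index key is everything before the first tab
assert key == b"seq_1"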
Example #53
0
    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()

        while True:
            line = _read_forward(handle)
            end_offset = handle.tell()

            if line.startswith(self.qresult_start):
                regx = re.search(_QRE_ID_LEN_IDX, line)
                qresult_key = regx.group(1).strip()
                # qresult start offset is the offset of this line
                # (starts with the start mark)
                start_offset = end_offset - len(line)
            elif line.startswith(self.qresult_end):
                yield _bytes_to_string(qresult_key), start_offset, 0
                start_offset = end_offset
            elif not line:
                break
Example #54
0
File: _index.py  Project: LyonsLab/coge
 def __iter__(self):
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     semi_char = _as_bytes(";")
     while True:
         offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line):
             #Now look for the first line which doesn't start ";"
             while True:
                 line = handle.readline()
                 if line[0:1] != semi_char and line.strip():
                     key = line.split()[0]
                     yield _bytes_to_string(key), offset, 0
                     break
                 if not line:
                     raise ValueError("Premature end of file?")
         elif not line:
             #End of file
             break
Example #55
0
 def __iter__(self):
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     semi_char = _as_bytes(";")
     while True:
         offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line):
             #Now look for the first line which doesn't start ";"
             while True:
                 line = handle.readline()
                 if line[0:1] != semi_char and line.strip():
                     key = line.split()[0]
                     yield _bytes_to_string(key), offset, 0
                     break
                 if not line:
                     raise ValueError("Premature end of file?")
         elif not line:
             #End of file
             break
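A minimal sketch of the scan performed above, with a hypothetical record in this comment-prefixed style: lines starting with ";" are comments, and the first non-comment, non-blank line carries the identifier that becomes the key:

lines = [b"; comment line one\n",
         b"; comment line two\n",
         b"seq_1 an example description\n",
         b"ACGTACGT\n"]
for line in lines:
    if line[0:1] != b";" and line.strip():
        key = line.split()[0]        # -> b"seq_1", the key that gets yielded
        break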
Example #56
0
    def __iter__(self):
        """Iterate over Hmmer3TextIndexer; yields query results' key, offsets, 0."""
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()
        regex_id = re.compile(_as_bytes(_QRE_ID_LEN_PTN))

        while True:
            line = read_forward(handle)
            end_offset = handle.tell()

            if line.startswith(self.qresult_start):
                regx = re.search(regex_id, line)
                qresult_key = regx.group(1).strip()
                # qresult start offset is the offset of this line
                # (starts with the start mark)
                start_offset = end_offset - len(line)
            elif line.startswith(self.qresult_end):
                yield _bytes_to_string(qresult_key), start_offset, 0
                start_offset = end_offset
            elif not line:
                break
Example #57
0
 def __iter__(self):
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     start_acc_marker = _as_bytes("<accession>")
     end_acc_marker = _as_bytes("</accession>")
     end_entry_marker = _as_bytes("</entry>")
     #Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     #Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         #We expect the next line to be <accession>xxx</accession>
         #but allow it to be later on within the <entry>
         key = None
         done = False
         while True:
             line = handle.readline()
             if key is None and line.startswith(start_acc_marker):
                 assert end_acc_marker in line, line
                 key = line[11:].split(_as_bytes("<"))[0]
             elif end_entry_marker in line:
                 end_offset = handle.tell() - len(line) \
                            + line.find(end_entry_marker) + 8
                 break
             elif marker_re.match(line) or not line:
                 #Start of next record or end of file
                 raise ValueError("Didn't find end of record")
         if not key:
             raise ValueError("Did not find <accession> line")
         yield _bytes_to_string(key), start_offset, end_offset - start_offset
         #Find start of next record
         while not marker_re.match(line) and line:
             start_offset = handle.tell()
             line = handle.readline()
     assert not line, repr(line)
Example #58
0
    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()
        qresult_key = None
        query_mark = b">>>"

        while True:
            line = handle.readline()
            peekline = handle.peekline()
            end_offset = handle.tell()

            if not line.startswith(query_mark) and query_mark in line:
                regx = re.search(_RE_ID_DESC_SEQLEN_IDX, line)
                qresult_key = _bytes_to_string(regx.group(1))
                start_offset = end_offset - len(line)
            # yield whenever we encounter a new query or at the end of the file
            if qresult_key is not None:
                if (not peekline.startswith(query_mark) and query_mark in peekline) or not line:
                    yield qresult_key, start_offset, end_offset - start_offset
                    if not line:
                        break
                    start_offset = end_offset
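The peekline() call above is not a plain file-object method; it comes from the wrapped handle Biopython uses here and returns the next line without consuming it. A minimal stand-in (not the actual Biopython class) showing the same behaviour for a seekable binary handle:

class PeekableHandle:
    """Tiny illustrative wrapper: readline() consumes, peekline() does not."""

    def __init__(self, handle):
        self._handle = handle

    def readline(self):
        return self._handle.readline()

    def peekline(self):
        pos = self._handle.tell()
        line = self._handle.readline()
        self._handle.seek(pos)       # rewind so the next readline() sees the same line
        return line

    def tell(self):
        return self._handle.tell()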
Example #59
0
    def get_raw_check(self, filename, format, alphabet, comp):
        # Also checking the key_function here
        if comp:
            h = gzip.open(filename, "rb")
            raw_file = h.read()
            h.close()
            h = gzip_open(filename, format)
            id_list = [rec.id.lower() for rec in
                       SeqIO.parse(h, format, alphabet)]
            h.close()
        else:
            h = open(filename, "rb")
            raw_file = h.read()
            h.close()
            id_list = [rec.id.lower() for rec in
                       SeqIO.parse(filename, format, alphabet)]

        if format in ["sff"]:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', BiopythonParserWarning)
                rec_dict = SeqIO.index(filename, format, alphabet,
                                       key_function=lambda x: x.lower())
                rec_dict_db = SeqIO.index_db(":memory:", filename, format, alphabet,
                                             key_function=lambda x: x.lower())
        else:
            rec_dict = SeqIO.index(filename, format, alphabet,
                                   key_function=lambda x: x.lower())
            rec_dict_db = SeqIO.index_db(":memory:", filename, format, alphabet,
                                         key_function=lambda x: x.lower())

        self.assertEqual(set(id_list), set(rec_dict))
        self.assertEqual(set(id_list), set(rec_dict_db))
        self.assertEqual(len(id_list), len(rec_dict))
        for key in id_list:
            self.assertIn(key, rec_dict)
            self.assertEqual(key, rec_dict[key].id.lower())
            self.assertEqual(key, rec_dict.get(key).id.lower())
            raw = rec_dict.get_raw(key)
            self.assertTrue(isinstance(raw, bytes),
                            "Didn't get bytes from %s get_raw" % format)
            self.assertTrue(raw.strip())
            self.assertIn(raw, raw_file)

            raw_db = rec_dict_db.get_raw(key)
            # Via index using format-specific get_raw which scans the file,
            # Via index_db in general using raw length found when indexing.
            self.assertEqual(raw, raw_db,
                             "index and index_db .get_raw() different for %s" % format)

            rec1 = rec_dict[key]
            # Following isn't very elegant, but it lets me test the
            # __getitem__ SFF code is working.
            if format in SeqIO._BinaryFormats:
                handle = BytesIO(raw)
            else:
                handle = StringIO(_bytes_to_string(raw))
            if format == "sff":
                rec2 = SeqIO.SffIO._sff_read_seq_record(handle,
                            rec_dict._proxy._flows_per_read,
                            rec_dict._proxy._flow_chars,
                            rec_dict._proxy._key_sequence,
                            rec_dict._proxy._alphabet,
                            trim=False)
            elif format == "sff-trim":
                rec2 = SeqIO.SffIO._sff_read_seq_record(handle,
                            rec_dict._proxy._flows_per_read,
                            rec_dict._proxy._flow_chars,
                            rec_dict._proxy._key_sequence,
                            rec_dict._proxy._alphabet,
                            trim=True)
            elif format == "uniprot-xml":
                self.assertTrue(raw.startswith(b"<entry "))
                self.assertTrue(raw.endswith(b"</entry>"))
                # Currently the __getitem__ method uses this
                # trick too, but we hope to fix that later
                raw = """<?xml version='1.0' encoding='UTF-8'?>
                <uniprot xmlns="http://uniprot.org/uniprot"
                xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                xsi:schemaLocation="http://uniprot.org/uniprot
                http://www.uniprot.org/support/docs/uniprot.xsd">
                %s
                </uniprot>
                """ % _bytes_to_string(raw)
                handle = StringIO(raw)
                rec2 = SeqIO.read(handle, format, alphabet)
            else:
                rec2 = SeqIO.read(handle, format, alphabet)
            self.assertEqual(True, compare_record(rec1, rec2))
        rec_dict.close()
        del rec_dict