def read_chunks(f: RawIOBase, buffer_size: int = 4 * 1024**2) -> Iterator[memoryview]:
    """
    Read chunks of complete FASTA or FASTQ records from a file.

    The size of a chunk is at most buffer_size.
    f needs to be a file opened in binary mode.

    The yielded memoryview objects become invalid on the next iteration
    because they all refer to one internal buffer that is re-used.

    Raises:
        ValueError: buffer_size is smaller than 1.
        OverflowError: The buffer is completely filled with data that does
            not contain a complete record.
        UnknownFileFormat: The first byte of the file is not "@", ">" or "#".
    """
    # Without this check, a zero-sized buffer makes the one-byte probe below
    # read nothing, which would be mistaken for an empty file and silently
    # drop all input.
    if buffer_size < 1:
        raise ValueError("Buffer size too small")

    # This buffer is re-used in each iteration.
    buf = bytearray(buffer_size)

    # Read one byte to determine file format.
    # If there is a comment char, we assume FASTA!
    start = f.readinto(memoryview(buf)[0:1])
    if start == 1 and buf[0:1] == b'@':
        head = _fastq_head
    elif start == 1 and (buf[0:1] == b'#' or buf[0:1] == b'>'):
        head = _fasta_head
    elif start == 0:
        # Empty file
        return
    else:
        raise UnknownFileFormat('Input file format unknown')

    # Layout of buf
    #
    # |-- complete records --|
    # +---+------------------+---------+-------+
    # |   |                  |         |       |
    # +---+------------------+---------+-------+
    # ^   ^                  ^         ^       ^
    # 0   start              end       bufend  len(buf)
    #
    # buf[0:start] is the 'leftover' data that could not be processed
    # in the previous iteration because it contained an incomplete
    # FASTA or FASTQ record.

    while True:
        if start == len(buf):
            # The leftover data alone fills the buffer, so nothing more
            # can be read to complete the record.
            raise OverflowError('FASTA/FASTQ record does not fit into buffer')
        bufend = f.readinto(memoryview(buf)[start:]) + start  # type: ignore
        if start == bufend:
            # End of file
            break
        end = head(buf, bufend)
        assert end <= bufend
        if end > 0:
            yield memoryview(buf)[0:end]
        # Move the unprocessed tail to the front of the buffer.
        start = bufend - end
        assert start >= 0
        buf[0:start] = buf[end:bufend]

    # Whatever is left over at end of file is yielded as the final chunk.
    if start > 0:
        yield memoryview(buf)[0:start]
def read_paired_chunks(
    f: RawIOBase,
    f2: RawIOBase,
    buffer_size: int = 4 * 1024**2,
) -> Iterator[Tuple[memoryview, memoryview]]:
    """
    Read chunks from two FASTQ files in lockstep.

    Each iteration yields a pair of memoryviews, one per input file. Both
    views refer to internal buffers that are re-used, so they are only
    valid until the next iteration.

    Args:
        f: First input file; must support readinto().
        f2: Second input file; must support readinto().
        buffer_size: Size of each of the two internal buffers.

    Raises:
        ValueError: buffer_size is smaller than 1, or one of the buffers is
            completely filled with leftover data.
        FileFormatError: The first byte of a non-empty input is not "@".
    """
    if buffer_size < 1:
        raise ValueError("Buffer size too small")

    r1_buf = bytearray(buffer_size)
    r2_buf = bytearray(buffer_size)

    # Probe a single byte from each file to verify that the input is FASTQ.
    r1_left = f.readinto(memoryview(r1_buf)[0:1])  # type: ignore
    r2_left = f2.readinto(memoryview(r2_buf)[0:1])  # type: ignore
    r1_not_fastq = r1_left == 1 and r1_buf[0:1] != b'@'
    r2_not_fastq = r2_left == 1 and r2_buf[0:1] != b'@'
    if r1_not_fastq or r2_not_fastq:
        raise FileFormatError(
            "Paired-end data must be in FASTQ format when using multiple cores",
            line=None)

    # r*_left counts the bytes at the front of each buffer that were carried
    # over from the previous round.
    while True:
        if r1_left == len(r1_buf) or r2_left == len(r2_buf):
            raise ValueError("FASTQ record does not fit into buffer")
        r1_filled = f.readinto(memoryview(r1_buf)[r1_left:]) + r1_left  # type: ignore
        r2_filled = f2.readinto(memoryview(r2_buf)[r2_left:]) + r2_left  # type: ignore
        if (r1_left, r2_left) == (r1_filled, r2_filled):
            # Neither file delivered any new data.
            break

        r1_end, r2_end = _paired_fastq_heads(r1_buf, r2_buf, r1_filled, r2_filled)
        assert r1_end <= r1_filled
        assert r2_end <= r2_filled

        if r1_end > 0 or r2_end > 0:
            yield (memoryview(r1_buf)[0:r1_end], memoryview(r2_buf)[0:r2_end])

        # Shift the unconsumed tail of each buffer to its front.
        r1_left = r1_filled - r1_end
        assert r1_left >= 0
        r1_buf[0:r1_left] = r1_buf[r1_end:r1_filled]
        r2_left = r2_filled - r2_end
        assert r2_left >= 0
        r2_buf[0:r2_left] = r2_buf[r2_end:r2_filled]

    # Hand out whatever remains after both inputs are exhausted.
    if r1_left > 0 or r2_left > 0:
        yield (memoryview(r1_buf)[0:r1_left], memoryview(r2_buf)[0:r2_left])
def read_intf_packets(fd: io.RawIOBase, inq: MQueue, outq: MQueue):
    """
    Endlessly read from the interface into buffers taken from inq.

    Each round pops an item from inq and reads into its `start` buffer.
    On a successful read, the item's `end` is set to the part of `start`
    that follows the bytes just read, and the item is pushed onto outq.
    On a read of zero or fewer bytes, an error is logged and the item is
    pushed back onto inq.

    This function never returns.
    """
    logger.info("read: start reading from interface")
    while True:
        packet = inq.pop()
        nbytes = fd.readinto(packet.start)

        if nbytes <= 0:
            logger.error("read: bad read %d on interface, dropping", nbytes)
            # NOTE(review): the meaning of the second push() argument is not
            # visible here -- confirm against MQueue.
            inq.push(packet, True)
            continue

        if DEBUG:
            logger.debug("read: %d bytes on interface", nbytes)
        packet.end = packet.start[nbytes:]
        outq.push(packet, False)
def cache_segment_data(input_file: io.RawIOBase, segments: List[Any], segment_id: int, base_file_offset: int=0) -> None:
    """
    Read the data of one segment from input_file and cache it on the segment.

    The result is stored in segments[segment_id][SI_CACHED_DATA]: a memoryview
    over the bytes read, or None when the segment has no data or the read did
    not return the expected number of bytes.

    input_file: file the segment data is read from.
    segments: the list of segment entries.
    segment_id: index into segments of the segment to cache.
    base_file_offset: when the input file is located within a containing file.
    """
    data = None
    file_offset = get_segment_data_file_offset(segments, segment_id)
    # No data for segments that have no data (signalled by an offset of -1).
    if file_offset != -1:
        file_length = get_segment_data_length(segments, segment_id)

        input_file.seek(base_file_offset + file_offset, os.SEEK_SET)
        file_data = bytearray(file_length)
        # Keep the count returned by readinto(): the bytearray is preallocated,
        # so len(file_data) always equals file_length and says nothing about
        # how much was actually read.
        bytes_read = input_file.readinto(file_data)
        if bytes_read == file_length:
            # NOTE(rmtew): Python 2, type(data[0]) is str. Python 3, type(data[0]) is int
            data = memoryview(file_data)
        else:
            # readinto() may return None; report that as 0 bytes so that the
            # %d placeholder can always be formatted.
            logger.error("Unable to cache segment %d data, got %d bytes, wanted %d",
                         segment_id, bytes_read or 0, file_length)
    segments[segment_id][SI_CACHED_DATA] = data
def read_file(
    f: io.RawIOBase,
    executor: futures.Executor,
    q: _result_queue,
    stop_reading: threading.Event,
):
    """
    Read f chunk by chunk and queue one hashing job per chunk.

    For every chunk, a fresh buffer of _CHUNK_SIZE bytes is filled with as
    much data as f delivers, compute_hash is submitted to the executor for
    the filled part, and the resulting future is put on q.

    Reading stops when stop_reading is set or when no more data can be read.
    In every case, including on error, None is put on q last.
    """
    try:
        while not stop_reading.is_set():
            # Allocate a new buffer and keep reading until it is full or
            # the file stops delivering data.
            chunk = memoryview(bytearray(_CHUNK_SIZE))
            filled: int = 0
            while True:
                # As of 2020-06-01: typeshed wrongly claims we can't
                # `readinto(memoryview)`, so we disable type checking.
                got: int = f.readinto(chunk[filled:]) or 0  # type: ignore
                filled += got
                if got <= 0:
                    break

            if filled == 0:
                # Nothing could be read at all: end of file.
                return

            job = executor.submit(compute_hash, chunk[:filled])
            q.put(job)
    finally:
        # Tell the consumer of the queue that nothing more will arrive.
        q.put(None)
def read_chunks(f: RawIOBase, buffer_size: int = 4 * 1024**2) -> Iterator[memoryview]:
    """
    Read chunks of complete FASTA or FASTQ records from a file.
    If the format is detected to be FASTQ, all chunks except possibly the last contain
    an even number of records such that interleaved paired-end reads remain in sync.

    The yielded memoryview objects are only valid for one iteration because the internal
    buffer is re-used in the next iteration.

    Arguments:
        f: File with FASTA or FASTQ reads; must have been opened in binary mode
        buffer_size: Largest allowed chunk size

    Yields:
        memoryview representing the chunk. This becomes invalid on the next iteration.

    Raises:
        ValueError: *buffer_size* is smaller than 1.
        OverflowError: A record was encountered that is larger than *buffer_size*.
        UnknownFileFormat: The file format could not be detected
            (the first byte must be "@", ">" or "#")
    """
    # Without this check, a zero-sized buffer makes the one-byte probe below
    # read nothing, which would be mistaken for an empty file and silently
    # drop all input.
    if buffer_size < 1:
        raise ValueError("Buffer size too small")

    # This buffer is re-used in each iteration.
    buf = bytearray(buffer_size)

    # Read one byte to determine file format.
    # If there is a comment char, we assume FASTA!
    start = f.readinto(memoryview(buf)[0:1])
    if start == 0:
        # Empty file
        return
    assert start == 1
    if buf[0:1] == b'@':
        head = _fastq_head
    elif buf[0:1] == b'#' or buf[0:1] == b'>':
        head = _fasta_head
    else:
        raise UnknownFileFormat(
            f"Cannot determine input file format: First character expected to be '>' or '@', "
            f"but found {repr(chr(buf[0]))}")

    # Layout of buf
    #
    # |-- complete records --|
    # +---+------------------+---------+-------+
    # |   |                  |         |       |
    # +---+------------------+---------+-------+
    # ^   ^                  ^         ^       ^
    # 0   start              end       bufend  len(buf)
    #
    # buf[0:start] is the 'leftover' data that could not be processed
    # in the previous iteration because it contained an incomplete
    # FASTA or FASTQ record.

    while True:
        if start == len(buf):
            # The leftover data alone fills the buffer, so nothing more
            # can be read to complete the record.
            raise OverflowError('FASTA/FASTQ record does not fit into buffer')
        bufend = f.readinto(memoryview(buf)[start:]) + start  # type: ignore
        if start == bufend:
            # End of file
            break
        end = head(buf, bufend)
        assert end <= bufend
        if end > 0:
            yield memoryview(buf)[0:end]
        # Move the unprocessed tail to the front of the buffer.
        start = bufend - end
        assert start >= 0
        buf[0:start] = buf[end:bufend]

    # Whatever is left over at end of file is yielded as the final chunk.
    if start > 0:
        yield memoryview(buf)[0:start]
def read_paired_chunks(
    f: RawIOBase,
    f2: RawIOBase,
    buffer_size: int = 4 * 1024**2,
) -> Iterator[Tuple[memoryview, memoryview]]:
    """
    Read chunks of paired-end FASTQ reads from two files.

    Every iteration yields a pair of chunks (memoryview objects). Both chunks
    of a pair are guaranteed to contain the same number of sequences, so the
    paired-end reads stay in sync.

    The memoryviews refer to internal buffers that are re-used, so they are
    only valid until the next iteration.

    This is the paired-end counterpart of `read_chunks`. Unlike `read_chunks`,
    it only works for FASTQ input.

    Args:
        f: File with R1 reads; must have been opened in binary mode
        f2: File with R2 reads; must have been opened in binary mode
        buffer_size: Largest allowed chunk size

    Yields:
        Pairs of memoryview objects.

    Raises:
        ValueError: *buffer_size* is smaller than 1, or a FASTQ record was
            encountered that is larger than *buffer_size*.
        FileFormatError: The first byte of a non-empty input is not "@".
    """
    if buffer_size < 1:
        raise ValueError("Buffer size too small")

    first_buf = bytearray(buffer_size)
    second_buf = bytearray(buffer_size)

    # Read a single byte from each input to make sure we are processing FASTQ
    carry1 = f.readinto(memoryview(first_buf)[0:1])  # type: ignore
    carry2 = f2.readinto(memoryview(second_buf)[0:1])  # type: ignore
    bad_first = carry1 == 1 and first_buf[0:1] != b'@'
    bad_second = carry2 == 1 and second_buf[0:1] != b'@'
    if bad_first or bad_second:
        raise FileFormatError(
            "Paired-end data must be in FASTQ format when using multiple cores",
            line=None)

    # carry1/carry2: number of leftover bytes sitting at the front of each
    # buffer from the previous round.
    while True:
        if carry1 == len(first_buf) or carry2 == len(second_buf):
            raise ValueError("FASTQ record does not fit into buffer")

        filled1 = f.readinto(memoryview(first_buf)[carry1:]) + carry1  # type: ignore
        filled2 = f2.readinto(memoryview(second_buf)[carry2:]) + carry2  # type: ignore
        if carry1 == filled1 and carry2 == filled2:
            # No new data from either file
            break

        cut1, cut2 = _paired_fastq_heads(first_buf, second_buf, filled1, filled2)
        assert cut1 <= filled1
        assert cut2 <= filled2

        if cut1 > 0 or cut2 > 0:
            yield (memoryview(first_buf)[0:cut1], memoryview(second_buf)[0:cut2])

        # Shift what was not yielded to the start of each buffer.
        carry1 = filled1 - cut1
        assert carry1 >= 0
        first_buf[0:carry1] = first_buf[cut1:filled1]
        carry2 = filled2 - cut2
        assert carry2 >= 0
        second_buf[0:carry2] = second_buf[cut2:filled2]

    # Yield the remainder once both inputs are exhausted.
    if carry1 > 0 or carry2 > 0:
        yield (memoryview(first_buf)[0:carry1], memoryview(second_buf)[0:carry2])