def test_limit_reader_multiple_read(self):
    reader = LimitReader(BytesIO(b'abcdefghjiklmnopqrstuvwxyz'), 10)
    string = None
    for x in [2, 2, 20]:
        string = reader.read(x)

    assert b'efghji' == string
def load(self, url, offset=0, length=-1):
    """ Load a file-like reader from the local file system
    """
    # if starting with . or /, can only be a file path
    file_only = url.startswith(('/', '.'))

    # convert to filename
    filename = from_file_url(url)
    if filename != url:
        file_only = True
        url = filename

    afile = None
    try:
        # first, try as file
        afile = open(url, 'rb')

    except IOError:
        no_except_close(afile)
        if file_only:
            raise

        return super(LocalFileLoader, self).load(url, offset, length)

    if offset > 0:
        afile.seek(offset)

    if length >= 0:
        return LimitReader(afile, length)
    else:
        return afile
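# Usage sketch (not from the source): load() returns a LimitReader whenever
# length >= 0, otherwise the raw file object. The temp file, offset, and
# length below are hypothetical; a no-argument LocalFileLoader() constructor
# is assumed, as in pywb.
import tempfile
from contextlib import closing

with tempfile.NamedTemporaryFile(suffix='.warc.gz', delete=False) as tmp:
    tmp.write(b'x' * 4096)

loader = LocalFileLoader()
with closing(loader.load(tmp.name, offset=100, length=1024)) as reader:
    chunk = reader.read()   # capped at 1024 bytes, starting 100 bytes in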
def do_upload(self, upload_key, filename, stream, user, coll, rec, offset, length):
    """Send PUT request to upload recording.

    :param str upload_key: upload Redis key
    :param str filename: WARC archive filename
    :param stream: file object
    :param User user: user
    :param str coll: collection ID
    :param str rec: record ID
    :param int offset: offset to start of stream
    :param int length: length of recording
    """
    stream.seek(offset)

    logger.debug('do_upload(): {0} offset: {1}: len: {2}'.format(rec, offset, length))

    stream = LimitReader(stream, length)
    headers = {'Content-Length': str(length)}

    upload_url = self.upload_path.format(record_host=self.record_host,
                                         user=user,
                                         coll=coll,
                                         rec=rec,
                                         upid=upload_key)

    r = requests.put(upload_url,
                     headers=headers,
                     data=stream)
def get_req_body(self):
    input_ = self.env['wsgi.input']
    len_ = self._get_content_length()
    enc = self._get_header('Transfer-Encoding')

    if len_:
        data = LimitReader(input_, int(len_))
    elif enc:
        data = input_
    else:
        data = None

    return data
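# Minimal sketch (an illustration, not from the source) of the capping behavior
# get_req_body() relies on: a LimitReader wrapped around the WSGI input stops
# reading at the declared Content-Length, even if more bytes follow.
from io import BytesIO

body = BytesIO(b'hello world -- trailing bytes past the declared length')
capped = LimitReader(body, 11)
assert capped.read() == b'hello world'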
class OffsetLimitReader(LimitReader):
    def __init__(self, stream, offset, length):
        super(OffsetLimitReader, self).__init__(stream, length)
        self.offset = offset
        if offset > 0:
            self._skip_reader = LimitReader(stream, offset)
        else:
            self._skip_reader = None

    def _skip(self):
        while self._skip_reader:
            buff = self._skip_reader.read()
            if not buff:
                self._skip_reader = None

    def read(self, length=None):
        self._skip()
        return super(OffsetLimitReader, self).read(length)

    def readline(self, length=None):
        self._skip()
        return super(OffsetLimitReader, self).readline(length)
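# Usage sketch (not from the source): OffsetLimitReader lazily skips `offset`
# bytes on the first read, then behaves like a LimitReader of `length`.
from io import BytesIO

reader = OffsetLimitReader(BytesIO(b'abcdefghij'), 3, 4)
assert reader.read() == b'defg'   # bytes 3..6 of the underlying stream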
def parse_record_stream(self, stream,
                        statusline=None,
                        known_format=None,
                        no_record_parse=False):
    """ Parse file-like stream and return an ArcWarcRecord
    encapsulating the record headers, http headers (if any),
    and a stream limited to the remainder of the record.

    Pass statusline and known_format to _detect_type_load_headers()
    to facilitate parsing.
    """
    (the_format, rec_headers) = (self._detect_type_load_headers(
        stream, statusline, known_format))

    if the_format == 'arc':
        uri = rec_headers.get_header('uri')
        length = rec_headers.get_header('length')
        content_type = rec_headers.get_header('content-type')
        sub_len = rec_headers.total_len
        if uri and uri.startswith('filedesc://'):
            rec_type = 'arc_header'
        else:
            rec_type = 'response'

    elif the_format in ('warc', 'arc2warc'):
        rec_type = rec_headers.get_header('WARC-Type')
        uri = rec_headers.get_header('WARC-Target-URI')
        length = rec_headers.get_header('Content-Length')
        content_type = rec_headers.get_header('Content-Type')
        if the_format == 'warc':
            sub_len = 0
        else:
            sub_len = rec_headers.total_len
            the_format = 'warc'

    is_err = False

    try:
        if length is not None:
            length = int(length) - sub_len
            if length < 0:
                is_err = True

    except (ValueError, TypeError):
        is_err = True

    # err condition
    if is_err:
        length = 0

    # limit stream to the length for all valid records
    if length is not None and length >= 0:
        stream = LimitReader.wrap_stream(stream, length)

    # don't parse the http record at all
    if no_record_parse:
        http_headers = None  #StatusAndHeaders('', [])

    # if empty record (error or otherwise) set status to 204
    elif length == 0:
        #if is_err:
        #    msg = '204 Possible Error'
        #else:
        #    msg = '204 No Content'
        http_headers = StatusAndHeaders('', [])

    # response record or non-empty revisit: parse HTTP status and headers!
    elif (rec_type in ('response', 'revisit')
          and uri.startswith(self.HTTP_SCHEMES)):
        http_headers = self.http_parser.parse(stream)

    # request record: parse request
    elif ((rec_type == 'request')
          and uri.startswith(self.HTTP_SCHEMES)):
        http_headers = self.http_req_parser.parse(stream)

    # everything else: create a no-status entry, set content-type
    else:
        content_type_header = [('Content-Type', content_type)]

        if length is not None and length >= 0:
            content_type_header.append(('Content-Length', str(length)))

        http_headers = StatusAndHeaders('200 OK', content_type_header)

    return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                         http_headers, content_type, length)
def parse_record_stream(self, stream,
                        statusline=None,
                        known_format=None,
                        no_record_parse=False,
                        ensure_http_headers=False):
    """ Parse file-like stream and return an ArcWarcRecord
    encapsulating the record headers, http headers (if any),
    and a stream limited to the remainder of the record.

    Pass statusline and known_format to _detect_type_load_headers()
    to facilitate parsing.
    """
    (the_format, rec_headers) = (self._detect_type_load_headers(
        stream, statusline, known_format))

    if the_format == 'arc':
        uri = rec_headers.get_header('uri')
        length = rec_headers.get_header('length')
        content_type = rec_headers.get_header('content-type')
        sub_len = rec_headers.total_len
        if uri and uri.startswith('filedesc://'):
            rec_type = 'arc_header'
        else:
            rec_type = 'response'

    elif the_format in ('warc', 'arc2warc'):
        rec_type = rec_headers.get_header('WARC-Type')
        uri = self._ensure_target_uri_format(rec_headers)
        length = rec_headers.get_header('Content-Length')
        content_type = rec_headers.get_header('Content-Type')
        if the_format == 'warc':
            sub_len = 0
        else:
            sub_len = rec_headers.total_len
            the_format = 'warc'

    is_err = False

    try:
        if length is not None:
            length = int(length) - sub_len
            if length < 0:
                is_err = True

    except (ValueError, TypeError):
        is_err = True

    # err condition
    if is_err:
        length = 0

    # limit stream to the length for all valid records
    if length is not None and length >= 0:
        stream = LimitReader.wrap_stream(stream, length)

    http_headers = None

    # load http headers if parsing
    if not no_record_parse:
        http_headers = self.load_http_headers(rec_type, uri, stream, length)

    # generate valid http headers (e.g. for replay)
    if not http_headers and ensure_http_headers:
        http_headers = self.default_http_headers(length, content_type)

    return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                         http_headers, content_type, length)
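# Minimal usage sketch (an assumption, not from the source): parse_record_stream()
# is called here as a method of a record-loader object in the pywb/warcio style.
# The `loader` argument and the `raw_stream` attribute name are assumptions based
# on that style; the returned stream is length-limited by LimitReader.wrap_stream().
def first_record_summary(loader, stream):
    record = loader.parse_record_stream(stream, ensure_http_headers=True)
    body = record.raw_stream.read()   # bounded by the record's Content-Length
    return record.rec_type, len(body)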
def test_limit_reader_1(self):
    assert b'abcdefghji' == LimitReader(BytesIO(b'abcdefghjiklmnopqrstuvwxyz'), 10).read(26)

def test_limit_reader_close(self):
    reader = LimitReader(BytesIO(b'abcdefg'), 3)
    with closing(reader):
        assert b'abc' == reader.read(10)
        assert reader.tell() == 3

def test_limit_reader_invalid_wrap(self):
    b = BytesIO(b'some data')
    assert LimitReader.wrap_stream(b, 'abc') == b

def test_limit_reader_zero(self):
    assert b'' == LimitReader(BytesIO(b'a'), 0).readline(0)

def test_limit_reader_3(self):
    reader = LimitReader(BytesIO(b'abcdefghjiklmnopqrstuvwxyz'), 8)
    new_reader = LimitReader.wrap_stream(reader, 4)
    assert reader == new_reader
    assert b'abcd' == new_reader.readline(26)

def test_limit_reader_2(self):
    assert b'abcdefgh' == LimitReader(BytesIO(b'abcdefghjiklmnopqrstuvwxyz'), 8).readline(26)