def _process_redirect(self):
    '''Build the next request for a pending redirect.'''
    _logger.debug('Handling redirect.')

    tracker = self._redirect_tracker

    if tracker.exceeded():
        raise ProtocolError('Too many redirects.')

    try:
        location = tracker.next_location()

        if not location:
            raise ProtocolError('Redirect location missing.')

        if tracker.is_repeat():
            # Presumably a 307/308-style redirect: resend the original
            # request unchanged except for the URL — TODO confirm.
            _logger.debug('Got redirect is repeat.')

            next_request = self._original_request.copy()
            next_request.url = location
        else:
            next_request = self._web_client.request_factory(location)

        next_request.prepare_for_send()
    except ValueError as error:
        raise ProtocolError('Invalid redirect location.') from error

    self._next_request = next_request

    _logger.debug('Updated next redirect request to {0}.'.format(next_request))
def read_chunk_header(self):
    '''Read a single chunk's header.

    Returns:
        tuple: 2-item tuple with the size of the content in the chunk
        and the raw header byte string.

    Coroutine.
    '''
    try:
        header_line = yield from self._connection.readline()
    except ValueError as error:
        raise ProtocolError(
            'Invalid chunk size: {0}'.format(error)) from error

    if not header_line.endswith(b'\n'):
        # A partial line means the peer closed the connection.
        raise NetworkError('Connection closed.')

    # The hex size may be followed by ';' and chunk extensions.
    size_field = header_line.split(b';', 1)[0].strip()

    try:
        size = int(size_field, 16)
    except ValueError as error:
        raise ProtocolError(
            'Invalid chunk size: {0}'.format(error)) from error

    if size < 0:
        raise ProtocolError('Chunk size cannot be negative.')

    self._chunk_size = self._bytes_left = size

    return size, header_line
def _update_redirect_request(self):
    '''Build the next request needed to follow a redirect.'''
    _logger.debug('Handling redirect.')

    tracker = self._redirect_tracker

    if tracker.exceeded():
        raise ProtocolError('Too many redirects.')

    location = tracker.next_location()

    if not location:
        raise ProtocolError('Redirect location missing.')

    try:
        next_request = self._rich_client.request_factory(location)
    except ValueError as error:
        raise ProtocolError('Invalid redirect location.') from error

    if tracker.is_repeat():
        # Repeat redirect: carry over the method, body, and any header
        # fields the fresh request does not already define.
        _logger.debug('Got redirect is repeat.')

        original = self._original_request
        next_request.method = original.method
        next_request.body = original.body

        for name, value in original.fields.items():
            if name not in next_request.fields:
                next_request.fields.add(name, value)

    self._next_request = next_request

    _logger.debug('Updated next redirect request to {0}.'.format(next_request))
def _decompress_data(self, data): '''Decompress the given data and return the uncompressed data.''' if self._decompressor: try: return self._decompressor.decompress(data) except zlib.error as error: raise ProtocolError( 'zlib error: {0}.'.format(error)) from error else: return data
def _flush_decompressor(self): '''Return any data left in the decompressor.''' if self._decompressor: try: return self._decompressor.flush() except zlib.error as error: raise ProtocolError( 'zlib flush error: {0}.'.format(error)) from error else: return b''
def read_chunk(self):
    '''Read a single chunk of the chunked transfer encoding.

    Returns:
        int: The size of the content in the chunk.

    Raises:
        ProtocolError: If the chunk size is not valid hex or the data
        after the chunk is not a bare newline.

    Tornado-gen coroutine.
    '''
    _logger.debug('Reading chunk.')
    # Chunk header line: "<hex-size>[;extensions]\r\n".
    chunk_size_hex = yield self._io_stream.read_until(b'\n')

    # data_event receives the raw wire bytes (header included).
    self.data_event.fire(chunk_size_hex)

    try:
        # Strip any ";ext" chunk extensions before parsing the hex size.
        chunk_size = int(chunk_size_hex.split(b';', 1)[0].strip(), 16)
    except ValueError as error:
        raise ProtocolError(error.args[0]) from error

    _logger.debug('Getting chunk size={0}.'.format(chunk_size))

    if not chunk_size:
        # Size 0 marks the last chunk; return without reading content.
        # NOTE(review): the trailing CRLF/trailer is presumably consumed
        # by the caller — confirm against the surrounding stream code.
        raise tornado.gen.Return(chunk_size)

    data_queue = self._io_stream.read_bytes_queue(chunk_size)

    while True:
        data = yield data_queue.get()

        if data is None:
            # Queue signals end-of-chunk with None.
            break

        # content_event gets decoded payload; data_event gets wire bytes.
        self.data_event.fire(data)
        self.content_event.fire(data)

    # Each chunk's payload is terminated by CRLF (or LF).
    newline_data = yield self._io_stream.read_until(b'\n')

    self.data_event.fire(newline_data)

    if len(newline_data) > 2:
        # Should be either CRLF or LF
        # This could be our problem or the server's problem
        raise ProtocolError('Error reading newline after chunk.')

    raise tornado.gen.Return(chunk_size)
def parse(self, data):
    '''Split a raw command line into the name and argument attributes.'''
    # Parsing is one-shot: the instance must still be blank.
    assert self.name is None
    assert not self.argument

    match = re.match(br'(\w+) ?([^\r\n]*)', data)

    if not match:
        raise ProtocolError('Failed to parse command.')

    # Decode both captures the same way, preserving undecodable bytes.
    self.name, self.argument = (
        group.decode('utf-8', errors='surrogateescape')
        for group in match.group(1, 2)
    )
def read_listing_content(self, file, duration_timeout=None):
    '''Read file listings.

    Args:
        file: Destination file object for the raw listing body.
        duration_timeout: Maximum time in seconds for the whole read.

    Returns:
        .ftp.request.ListingResponse: A Response populated with the
        file listings

    Be sure to call :meth:`fetch_file_listing` first.

    Coroutine.
    '''
    yield From(self.read_content(file=file, rewind=False,
                                 duration_timeout=duration_timeout))

    try:
        if self._response.body.tell() == 0:
            # Empty body: nothing was listed.
            listings = ()
        elif self._listing_type == 'mlsd':
            # Machine-readable MLSD listing (RFC 3659 style).
            self._response.body.seek(0)

            machine_listings = wpull.ftp.util.parse_machine_listing(
                self._response.body.read().decode(
                    'utf-8', errors='surrogateescape'),
                convert=True, strict=False)

            listings = list(
                wpull.ftp.util.machine_listings_to_file_entries(
                    machine_listings))
        else:
            # Human-readable LIST output: guess the dialect heuristically.
            self._response.body.seek(0)

            file = io.TextIOWrapper(self._response.body, encoding='utf-8',
                                    errors='surrogateescape')

            listing_parser = ListingParser(file=file)
            heuristics_result = listing_parser.run_heuristics()

            _logger.debug('Listing detected as %s', heuristics_result)

            listings = listing_parser.parse()

            # We don't want the file to be closed when exiting this function
            file.detach()

    except (ListingError, ValueError) as error:
        raise ProtocolError(*error.args) from error

    self._response.files = listings

    # Rewind so callers can re-read the raw body.
    self._response.body.seek(0)
    raise Return(self._response)
def read_response(self, response=None):
    '''Read the response's HTTP status line and header fields.

    Coroutine.
    '''
    _logger.debug('Reading header.')

    if response is None:
        response = Response()

    lines = []
    total_size = 0

    while True:
        try:
            line = yield From(self._connection.readline())
        except ValueError as error:
            raise ProtocolError(
                'Invalid header: {0}'.format(error)) from error

        self._data_observer.notify('response', line)

        if not line.endswith(b'\n'):
            # Partial line: the connection dropped mid-header.
            raise NetworkError('Connection closed.')

        if line in (b'\r\n', b'\n'):
            # Blank line terminates the header section.
            break

        lines.append(line)
        total_size += len(line)

        # Guard against abusive servers sending unbounded headers.
        if total_size > 32768:
            raise ProtocolError('Header too big.')

    if not lines:
        raise ProtocolError('No header received.')

    response.parse(b''.join(lines))

    raise Return(response)
def read_chunk_body(self): '''Read a fragment of a single chunk. Call :meth:`read_chunk_header` first. Returns: tuple: 2-item tuple with the content data and raw data. First item is empty bytes string when chunk is fully read. Coroutine. ''' # chunk_size = self._chunk_size bytes_left = self._bytes_left # _logger.debug(__('Getting chunk size={0}, remain={1}.', # chunk_size, bytes_left)) if bytes_left > 0: size = min(bytes_left, self._read_size) data = yield from self._connection.read(size) self._bytes_left -= len(data) return (data, data) elif bytes_left < 0: raise ProtocolError('Chunked-transfer overrun.') elif bytes_left: raise NetworkError('Connection closed.') newline_data = yield from self._connection.readline() if len(newline_data) > 2: # Should be either CRLF or LF # This could our problem or the server's problem raise ProtocolError('Error reading newline after chunk.') self._chunk_size = self._bytes_left = None return (b'', newline_data)
def response_callback(request):
    # Test stub: always answer with a redirect pointing back at
    # robots.txt so the client is forced into a redirect loop.
    request.prepare_for_send()
    self.assertTrue(request.url_info.url.endswith('robots.txt'))

    reply = Response(302, 'See else')
    reply.request = request
    reply.fields['Location'] = '/robots.txt'

    # Safety valve: fail loudly if the client never gives up.
    nonlocal_dict['counter'] += 1

    if nonlocal_dict['counter'] > 20:
        raise ProtocolError('Mock redirect loop error.')

    return reply
def parse_status_line(cls, string):
    '''Parse the status line bytes.

    Returns:
        tuple: A tuple representing the version, code, and reason.
    '''
    match = re.match(
        br'(HTTP/1\.[01])[ \t]+([0-9]{1,3})[ \t]*([^\r\n]*)',
        string)

    if match:
        # Pattern has exactly three capture groups.
        version, code, reason = match.groups()
        return wpull.string.to_str(
            (version, int(code), reason),
            encoding='latin-1',
        )

    raise ProtocolError("Error parsing status line '{0}'".format(string))
def parse(self, data):
    '''Accumulate FTP reply lines into the code and text attributes.'''
    for line in data.splitlines(False):
        match = re.match(br'(\d{3}|^)([ -]?)(.*)', line)

        if not match:
            raise ProtocolError('Failed to parse reply.')

        code_field, separator, text_field = match.groups()

        if code_field and separator == b' ':
            # A space after the code marks the final line of the reply.
            assert self.code is None
            self.code = int(code_field)

        decoded = text_field.decode('utf-8', errors='surrogateescape')

        if self.text is None:
            self.text = decoded
        else:
            # Subsequent lines are joined with CRLF.
            self.text += '\r\n{0}'.format(decoded)
def parse_status_line(cls, data):
    '''Parse the status line bytes.

    Returns:
        tuple: A tuple representing the version, code, and reason.

    Raises:
        ProtocolError: If the bytes do not match the status-line grammar.
    '''
    match = re.match(
        br'(HTTP/\d+\.\d+)[ \t]+([0-9]{1,3})[ \t]*([^\r\n]*)',
        data
    )
    if match:
        groups = match.groups()

        if len(groups) == 3:
            return wpull.string.to_str(
                (groups[0], int(groups[1]), groups[2]),
                encoding='latin-1',
            )

    # Fixed the error message: it previously ended '{line}".' with an
    # unbalanced quotation mark.
    raise ProtocolError(
        'Error parsing status line {line}.'.format(line=ascii(data)))
def parse_status_line(cls, string):
    '''Parse the status line bytes.

    Returns:
        tuple: A tuple representing the method, resource path, and
        version.
    '''
    match = re.match(
        br'([a-zA-Z]+)[ \t]+([^ \t]+)[ \t]+(HTTP/1\.[01])',
        string)

    if match:
        # Pattern has exactly three capture groups.
        method, resource_path, version = match.groups()
        return wpull.string.to_str(
            (method, resource_path, version),
            encoding='latin-1',
        )

    raise ProtocolError('Error parsing status line ‘{0}’'.format(string))
def parse_status_line(self, data):
    '''Parse the status line bytes.

    Returns:
        tuple: A tuple representing the method, URI, and version.
    '''
    match = re.match(
        br'([a-zA-Z]+)[ \t]+([^ \t]+)[ \t]+(HTTP/\d+\.\d+)',
        data)

    if match:
        # Pattern has exactly three capture groups.
        method, uri, version = match.groups()
        return wpull.string.to_str(
            (method, uri, version),
            encoding=self.encoding,
        )

    raise ProtocolError('Error parsing status line.')
def passive_mode(self):
    '''Enable passive mode.

    Returns:
        tuple: The address (IP address, port) of the passive port.

    Coroutine.
    '''
    yield From(self._control_stream.write_command(Command('PASV')))

    reply = yield From(self._control_stream.read_reply())

    self.raise_if_not_match(
        'Passive mode', ReplyCodes.entering_passive_mode, reply)

    # The server embeds the data-connection address in the reply text.
    try:
        address = wpull.ftp.util.parse_address(reply.text)
    except ValueError as error:
        raise ProtocolError(str(error)) from error

    raise Return(address)
def _process_request(self, request, response_factory):
    '''Fulfill a single request.

    Args:
        request: The request to send.
        response_factory: Callable producing the Response object that
            the header reader populates.

    Returns:
        Response

    Tornado-gen coroutine.
    '''
    yield self._connect()

    request.address = self._resolved_address
    self._events.pre_request(request)

    # Python < 3.3 has no ConnectionError builtin; catch the socket
    # errors it maps to instead.
    if sys.version_info < (3, 3):
        error_class = (socket.error, StreamClosedError, ssl.SSLError)
    else:
        error_class = (ConnectionError, StreamClosedError, ssl.SSLError)

    # Without keep-alive, ask the server to close after this exchange
    # unless the caller already set a Connection header.
    if not self._params.keep_alive and 'Connection' not in request.fields:
        request.fields['Connection'] = 'close'

    try:
        yield self._send_request_header(request)
        yield self._send_request_body(request)
        self._events.request.fire(request)

        response = yield self._read_response_header(response_factory)
        # TODO: handle 100 Continue

        yield self._read_response_body(request, response)
    except error_class as error:
        # Wrap transport-level failures in the project's NetworkError.
        raise NetworkError('Network error: {0}'.format(error)) from error
    except BufferFullError as error:
        raise ProtocolError(*error.args) from error

    self._events.response.fire(response)

    # Close or keep monitoring depending on protocol version and the
    # server's Connection header.
    if self.should_close(request.version, response.fields.get('Connection')):
        _logger.debug('HTTP connection close.')
        self.close()
    else:
        self._io_stream.monitor_for_close()

    raise tornado.gen.Return(response)
def _read_request_header(self):
    '''Read and parse an incoming request's header lines.

    Returns ``None`` (bare return) when the connection closes before a
    complete line arrives.

    Coroutine.
    '''
    request = Request()

    # Cap the header count so a misbehaving client cannot loop forever.
    for _ in range(100):
        line = yield From(self._reader.readline())

        _logger.debug(__('Got line {0}', line))

        if not line.endswith(b'\n'):
            # Partial line: the peer closed the connection.
            return

        if not line.strip():
            # Blank line ends the header section.
            break

        request.parse(line)
    else:
        raise ProtocolError('Request has too many headers.')

    raise Return(request)
def handle_error(self, item_session: ItemSession, error: BaseException) -> Actions:
    '''Process an error.

    Returns:
        A value from :class:`.hook.Actions`.
    '''
    is_ssl_failure = isinstance(error, SSLVerificationError)

    if is_ssl_failure and not self._ssl_verification:
        # Change it into a different error since the user doesn't care
        # about verifying certificates
        self._statistics.increment_error(ProtocolError())
    else:
        self._statistics.increment_error(error)

    self._waiter.increment()

    action = self.consult_error_hook(item_session, error)

    if action == Actions.RETRY:
        item_session.set_status(Status.error)
    elif action == Actions.FINISH:
        item_session.set_status(Status.done)
    elif action == Actions.STOP:
        raise HookStop('Script requested immediate stop.')
    elif self._ssl_verification and is_ssl_failure:
        # Certificate checking is enabled: propagate the failure.
        raise
    elif isinstance(error, ConnectionRefused) and not self.retry_connrefused:
        item_session.set_status(Status.skipped)
    elif isinstance(error, DNSNotFound) and not self.retry_dns_error:
        item_session.set_status(Status.skipped)
    else:
        item_session.set_status(Status.error)

    return action
def _stream_closed_callback(self):
    '''Handle the IOStream closing.

    Re-raises the stream's stored error, or raises
    :class:`ProtocolError` if the read buffer filled up — but only
    while a request is active.
    '''
    # Fixed: the writing flag previously reused placeholder {3} (the
    # reading flag) instead of {4}, so 'writing' always echoed
    # 'reading' even though five arguments were supplied.
    _logger.debug('Stream closed. '
                  'active={0} connected={1} '
                  'closed={2} reading={3} writing={4}'.format(
                      self._active,
                      self.connected,
                      self._io_stream.closed(),
                      self._io_stream.reading(),
                      self._io_stream.writing())
                  )

    if not self._active:
        # We are likely in a context that's already dead
        _logger.debug('Ignoring stream closed error={0}.'
                      .format(self._io_stream.error))
        return

    if self._io_stream.error:
        _logger.debug('Throwing error {0}.'.format(self._io_stream.error))
        raise self._io_stream.error

    if self._io_stream.buffer_full:
        _logger.debug('Buffer full.')
        raise ProtocolError('Buffer full.')
def download_listing(self, file: Optional[IO],
                     duration_timeout: Optional[float]=None) -> \
        ListingResponse:
    '''Read file listings.

    Args:
        file: A file object or asyncio stream.
        duration_timeout: Maximum time in seconds of which the
            entire file must be read.

    Returns:
        A Response populated the file listings

    Be sure to call :meth:`start_file_listing` first.

    Coroutine.
    '''
    # Enforce the session state machine: the directory request must
    # already have been issued.
    if self._session_state != SessionState.directory_request_sent:
        raise RuntimeError('File request not sent')

    self._session_state = SessionState.file_request_sent

    yield from self.download(file=file, rewind=False,
                             duration_timeout=duration_timeout)

    try:
        if self._response.body.tell() == 0:
            # Empty body: nothing was listed.
            listings = ()
        elif self._listing_type == 'mlsd':
            # Machine-readable MLSD listing.
            self._response.body.seek(0)

            machine_listings = wpull.protocol.ftp.util.parse_machine_listing(
                self._response.body.read().decode(
                    'utf-8', errors='surrogateescape'),
                convert=True, strict=False)

            listings = list(
                wpull.protocol.ftp.util.machine_listings_to_file_entries(
                    machine_listings))
        else:
            # Human-readable LIST output: let the parser detect the
            # dialect.
            self._response.body.seek(0)

            file = io.TextIOWrapper(self._response.body, encoding='utf-8',
                                    errors='surrogateescape')

            listing_parser = ListingParser(file=file)

            listings = list(listing_parser.parse_input())

            _logger.debug('Listing detected as %s', listing_parser.type)

            # We don't want the file to be closed when exiting this function
            file.detach()

    except (ListingError, ValueError) as error:
        raise ProtocolError(*error.args) from error

    self._response.files = listings

    # Rewind so callers can re-read the raw body.
    self._response.body.seek(0)

    self._session_state = SessionState.response_received

    return self._response