def request(self, **kwargs):
    """
    Perform network request.

    You can specify grab settings in ``**kwargs``.
    Any keyword argument will be passed to ``self.config``.

    Returns: ``Document`` objects.

    Raises:
        error.GrabError: when the transport request fails.
        error.GrabTooManyRedirectsError: when the number of followed
            redirects exceeds ``config['redirect_limit']``.
    """
    self.prepare_request(**kwargs)
    redirect_count = 0
    while True:
        self.log_request()
        try:
            self.transport.request()
        except error.GrabError as ex:
            # Remember the failure, drop one-shot request options and
            # optionally save a dump of the failed request for debugging.
            self.exception = ex
            self.reset_temporary_options()
            if self.config['log_dir']:
                self.save_failed_dump()
            raise
        else:
            with self.transport.wrap_transport_error():
                doc = self.process_request_result()
            if self.config['follow_location']:
                if doc.code in (301, 302, 303, 307, 308):
                    # Single lookup instead of the double dict access
                    location = doc.headers.get('Location')
                    if location:
                        redirect_count += 1
                        if redirect_count > self.config['redirect_limit']:
                            raise error.GrabTooManyRedirectsError()
                        self.prepare_request(
                            url=self.make_url_absolute(location),
                            referer=None)
                        continue
            if self.config['follow_refresh']:
                refresh_url = self.doc.get_meta_refresh_url()
                if refresh_url is not None:
                    redirect_count += 1
                    if redirect_count > self.config['redirect_limit']:
                        raise error.GrabTooManyRedirectsError()
                    self.prepare_request(
                        url=self.make_url_absolute(refresh_url),
                        referer=None)
                    continue
            return doc
def request(self):
    """Execute the prepared curl handle, mapping pycurl errors to Grab errors."""
    # libcurl error code -> Grab exception class
    # 28: operation timeout, 7: could not connect,
    # 67: login denied, 47: too many redirects
    curl_error_map = {
        28: error.GrabTimeoutError,
        7: error.GrabConnectionError,
        67: error.GrabAuthError,
        47: error.GrabTooManyRedirectsError,
    }
    try:
        self.curl.perform()
    except pycurl.error as ex:
        code, msg = ex.args[0], ex.args[1]
        # CURLE_WRITE_ERROR (23)
        # An error occurred when writing received data to a local file,
        # or an error was returned to libcurl from a write callback.
        # This exception should be ignored if the _callback_interrupted
        # flag is enabled (this happens when nohead or nobody options
        # are enabled).
        #
        # Also this error is raised when curl receives KeyboardInterrupt
        # while it is processing some callback function
        # (WRITEFUNCTION, HEADERFUNCTION, etc)
        if code == 23:
            if getattr(self.curl, '_callback_interrupted', None) is True:
                self.curl._callback_interrupted = False
            else:
                raise error.GrabNetworkError(code, msg)
        else:
            raise curl_error_map.get(code, error.GrabNetworkError)(code, msg)
def request(self):
    """Execute the prepared curl handle, translating failures to Grab errors.

    Also watches captured stderr for a KeyboardInterrupt that pycurl
    swallowed inside one of its callbacks.
    """
    stderr_proxy = StderrProxy()
    # libcurl error code -> Grab exception class
    curl_error_map = {
        28: error.GrabTimeoutError,
        7: error.GrabConnectionError,
        67: error.GrabAuthError,
        47: error.GrabTooManyRedirectsError,
        6: error.GrabCouldNotResolveHostError,
    }
    try:
        with stderr_proxy.record():
            self.curl.perform()
    except pycurl.error as ex:
        # pycurl may hide a KeyboardInterrupt raised inside a callback
        # and only leave a trace of it on stderr. If you think WTF then
        # see details here: https://github.com/pycurl/pycurl/issues/413
        if self.has_pycurl_hidden_sigint(stderr_proxy.get_output()):
            raise KeyboardInterrupt
        code, msg = ex.args[0], ex.args[1]
        # CURLE_WRITE_ERROR (23)
        # An error occurred when writing received data to a local file,
        # or an error was returned to libcurl from a write callback.
        # This exception should be ignored if grab_callback_interrupted
        # flag is enabled (this happens when nohead or nobody options
        # are enabled). Also raised when curl receives KeyboardInterrupt
        # while processing a callback (WRITEFUNCTION, HEADERFUNCTION, etc)
        if code == 23:
            if getattr(self.curl, 'grab_callback_interrupted', None) is True:
                # Expected error caused by the deliberately interrupted
                # execution of the body_processor callback
                # FIXME: is it set automatically?
                self.curl.grab_callback_interrupted = False
            else:
                raise error.GrabNetworkError(code, msg)
        else:
            raise curl_error_map.get(code, error.GrabNetworkError)(code, msg)
    except Exception as ex:  # pylint: disable=broad-except
        if self.has_pycurl_hidden_sigint(stderr_proxy.get_output()):
            raise KeyboardInterrupt
        six.reraise(error.GrabInternalError,
                    error.GrabInternalError(ex),
                    sys.exc_info()[2])
    else:
        if self.has_pycurl_hidden_sigint(stderr_proxy.get_output()):
            raise KeyboardInterrupt
def build_grab_exception(ex, curl):
    """
    Translate a pycurl exception into the matching Grab exception.

    Args:
        ex - the original pycurl exception
        curl - the Curl instance raised the exception

    Returns:
        A Grab exception instance, or None when the error must be
        ignored (deliberately interrupted body download).
    """
    code = ex.args[0]
    # CURLE_WRITE_ERROR (23)
    # An error occurred when writing received data to a local file,
    # or an error was returned to libcurl from a write callback.
    # This exception should be ignored if grab_callback_interrupted
    # flag is enabled (this happens when nohead or nobody options
    # are enabled).
    #
    # Also this error is raised when curl receives KeyboardInterrupt
    # while it is processing some callback function
    # (WRITEFUNCTION, HEADERFUNCTION, etc)
    # If you think WTF then see details here:
    # https://github.com/pycurl/pycurl/issues/413
    if code == 23:
        if getattr(curl, 'grab_callback_interrupted', None) is True:
            # Interrupted body_process callback (body_maxsize, nobody
            # and other options) makes pycurl raise error code 23 --
            # ignore it
            return None
        return error.GrabNetworkError(ex.args[1], ex)
    # libcurl error code -> Grab exception class
    exc_registry = {
        28: error.GrabTimeoutError,
        7: error.GrabConnectionError,
        67: error.GrabAuthError,
        47: error.GrabTooManyRedirectsError,
        6: error.GrabCouldNotResolveHostError,
        3: error.GrabInvalidUrl,
    }
    return exc_registry.get(code, error.GrabNetworkError)(ex.args[1], ex)
def process_request_result(self, prepare_response_func=None):
    """
    Process result of real request performed via transport extension.

    Args:
        prepare_response_func: optional callable ``(transport, grab)``
            used instead of the transport's own ``prepare_response``
            to build the response Document.

    Returns:
        Result of the follow-up ``request()`` call when a meta-refresh
        redirect is followed, otherwise None. The parsed document is
        stored in ``self.doc``.

    Raises:
        error.GrabTooManyRedirectsError: when following meta-refresh
            redirects exceeds ``config['redirect_limit']``.
    """
    now = datetime.now()
    if self.config['debug_post']:
        self._log_debug_post()

    # It's important to delete old POST data after request is performed.
    # If POST data is not cleared then next request will try to use it
    # again!
    old_refresh_count = self.config['refresh_redirect_count']
    self.reset_temporary_options()
    if prepare_response_func:
        self.doc = prepare_response_func(self.transport, self)
    else:
        self.doc = self.transport.prepare_response(self)

    # Workaround: link the document back to this Grab instance without
    # creating a strong reference cycle
    if self.doc.grab is None:
        self.doc.grab = weakref.proxy(self)

    if self.config['reuse_cookies']:
        self.cookies.update(self.doc.cookies)

    self.doc.timestamp = now
    self.config['charset'] = self.doc.charset

    if self.config['log_file']:
        with open(self.config['log_file'], 'wb') as out:
            out.write(self.doc.body)

    if self.config['cookiefile']:
        self.cookies.save_to_file(self.config['cookiefile'])

    if self.config['reuse_referer']:
        self.config['referer'] = self.doc.url

    self.copy_request_data()
    # Should be called after `copy_request_data`
    self.save_dumps()

    if self.config['follow_refresh']:
        url = find_refresh_url(self.doc.unicode_body())
        if url is not None:
            inc_count = old_refresh_count + 1
            if inc_count > self.config['redirect_limit']:
                raise error.GrabTooManyRedirectsError()
            return self.request(url=url, refresh_redirect_count=inc_count)
    return None

def _log_debug_post(self):
    """Log POST request data, truncating values to ``debug_post_limit``."""
    limit = self.config['debug_post_limit']
    post = self.config['post'] or self.config['multipart_post']
    if isinstance(post, dict):
        post = list(post.items())
    if not post:
        return
    if isinstance(post, basestring):
        post = post[:limit] + '...'
    else:
        items = normalize_http_values(post, charset='utf-8')
        rows = []
        for key, value in items:
            if len(value) > limit:
                value = value[:limit] + '...'
            rows.append((key, value))
        post = '\n'.join('%-25s: %s' % row for row in rows)
    if post:
        # Lazy %-args: the string is only built if DEBUG is enabled
        logger_network.debug('[%02d] POST request:\n%s\n',
                             self.request_counter, post)
def request(self, **kwargs):
    """
    Perform network request.

    You can specify grab settings in ``**kwargs``.
    Any keyword argument will be passed to ``self.config``.

    Returns: ``Document`` objects.

    Raises:
        error.GrabError: when the transport request keeps failing after
            all ``config['retries']`` attempts are exhausted.
        error.GrabTooManyRedirectsError: when the number of followed
            redirects exceeds ``config['redirect_limit']``.
    """
    self.prepare_request(**kwargs)
    refresh_count = 0
    retries = self.config["retries"]
    retry_timeout = self.config["retry_timeout"]
    while True:
        self.log_request()
        try:
            # Retry transient transport failures, sleeping
            # `retry_timeout` seconds between attempts.
            while True:
                try:
                    self.transport.request()
                    break
                except error.GrabError as ex:
                    retries -= 1
                    if retries <= 0:
                        # Bare raise keeps the original traceback
                        # (``raise ex`` would rebuild it from here)
                        raise
                    logger.debug(
                        'Request has failed with %s retrying in %s seconds',
                        ex, retry_timeout)
                    time.sleep(retry_timeout)
        except error.GrabError as ex:
            # Remember the failure, drop one-shot request options and
            # optionally save a dump of the failed request for debugging.
            self.exception = ex
            self.reset_temporary_options()
            if self.config['log_dir']:
                self.save_failed_dump()
            raise
        else:
            doc = self.process_request_result()
            if self.config['follow_location']:
                if doc.code in (301, 302, 303, 307, 308):
                    # Single lookup instead of the double dict access
                    location = doc.headers.get('Location')
                    if location:
                        refresh_count += 1
                        if refresh_count > self.config['redirect_limit']:
                            raise error.GrabTooManyRedirectsError()
                        self.prepare_request(
                            url=self.make_url_absolute(location),
                            referer=None)
                        continue
            if self.config['follow_refresh']:
                refresh_url = self.doc.get_meta_refresh_url()
                if refresh_url is not None:
                    refresh_count += 1
                    if refresh_count > self.config['redirect_limit']:
                        raise error.GrabTooManyRedirectsError()
                    self.prepare_request(
                        url=self.make_url_absolute(refresh_url),
                        referer=None)
                    continue
            return doc