Example #1
0
    def request(self, **kwargs):
        """
        Perform a network request.

        Any keyword argument is applied to ``self.config`` before the
        request is sent.  Redirects (HTTP ``Location`` headers and meta
        refresh tags) are followed up to ``redirect_limit`` times.

        Returns:
            ``Document`` object with the final response.

        Raises:
            error.GrabTooManyRedirectsError: redirect limit exceeded.
            error.GrabError: the transport request failed.
        """
        self.prepare_request(**kwargs)
        redirects_followed = 0

        while True:
            self.log_request()
            try:
                self.transport.request()
            except error.GrabError as ex:
                # Remember the failure, drop one-shot options and
                # optionally dump debug data, then propagate.
                self.exception = ex
                self.reset_temporary_options()
                if self.config['log_dir']:
                    self.save_failed_dump()
                raise

            with self.transport.wrap_transport_error():
                doc = self.process_request_result()

            # Follow HTTP redirects signalled via the Location header.
            if (self.config['follow_location']
                    and doc.code in (301, 302, 303, 307, 308)):
                location = doc.headers.get('Location')
                if location:
                    redirects_followed += 1
                    if redirects_followed > self.config['redirect_limit']:
                        raise error.GrabTooManyRedirectsError()
                    self.prepare_request(
                        url=self.make_url_absolute(location),
                        referer=None)
                    continue

            # Follow <meta refresh> redirects if enabled.
            if self.config['follow_refresh']:
                refresh_url = self.doc.get_meta_refresh_url()
                if refresh_url is not None:
                    redirects_followed += 1
                    if redirects_followed > self.config['redirect_limit']:
                        raise error.GrabTooManyRedirectsError()
                    self.prepare_request(
                        url=self.make_url_absolute(refresh_url),
                        referer=None)
                    continue

            return doc
Example #2
0
    def request(self):
        """Run the prepared pycurl transfer.

        Translates pycurl errors into the corresponding Grab exceptions.
        """
        # Curl error code -> Grab exception class for the generic cases.
        exc_map = {
            28: error.GrabTimeoutError,
            7: error.GrabConnectionError,
            67: error.GrabAuthError,
            47: error.GrabTooManyRedirectsError,
        }
        try:
            self.curl.perform()
        except pycurl.error as ex:
            code, msg = ex.args[0], ex.args[1]
            # CURLE_WRITE_ERROR (23): an error occurred when writing
            # received data to a local file, or an error was returned to
            # libcurl from a write callback.  It is expected (and
            # swallowed) when the _callback_interrupted flag is set,
            # which happens when the nohead or nobody options abort the
            # transfer on purpose.
            #
            # The same code also shows up when curl receives
            # KeyboardInterrupt while running a callback function
            # (WRITEFUNCTION, HEADERFUNCTION, etc).
            if code == 23:
                if getattr(self.curl, '_callback_interrupted', None) is True:
                    self.curl._callback_interrupted = False
                else:
                    raise error.GrabNetworkError(code, msg)
            else:
                raise exc_map.get(code, error.GrabNetworkError)(code, msg)
Example #3
0
    def request(self):
        """Run the prepared pycurl transfer.

        Performs ``self.curl.perform()`` while recording stderr output,
        translates pycurl errors into Grab exceptions, and turns a
        SIGINT that pycurl swallowed inside a callback back into a
        ``KeyboardInterrupt`` (detected via the recorded stderr output).
        """
        stderr_proxy = StderrProxy()
        try:
            with stderr_proxy.record():
                self.curl.perform()
        except pycurl.error as ex:
            # CURLE_WRITE_ERROR (23):
            # An error occurred when writing received data to a local file,
            # or an error was returned to libcurl from a write callback.
            # This error should be ignored when the
            # grab_callback_interrupted flag is set (this happens when the
            # nohead or nobody options are enabled).
            #
            # The same error is also raised when curl receives
            # KeyboardInterrupt while it is processing some callback
            # function (WRITEFUNCTION, HEADERFUNCTION, etc).
            # If you think WTF then see details here:
            # https://github.com/pycurl/pycurl/issues/413
            if self.has_pycurl_hidden_sigint(stderr_proxy.get_output()):
                raise KeyboardInterrupt
            if 23 == ex.args[0]:
                if getattr(self.curl, 'grab_callback_interrupted', None) is True:
                    # Expected error caused by deliberately interrupted
                    # execution of the body_processor callback.
                    # FIXME: is it set automatically?
                    self.curl.grab_callback_interrupted = False
                else:
                    raise error.GrabNetworkError(ex.args[0], ex.args[1])
            else:
                # Map well-known curl error codes to specific Grab
                # exceptions; everything else becomes GrabNetworkError.
                if ex.args[0] == 28:
                    raise error.GrabTimeoutError(ex.args[0], ex.args[1])
                elif ex.args[0] == 7:
                    raise error.GrabConnectionError(ex.args[0], ex.args[1])
                elif ex.args[0] == 67:
                    raise error.GrabAuthError(ex.args[0], ex.args[1])
                elif ex.args[0] == 47:
                    raise error.GrabTooManyRedirectsError(ex.args[0],
                                                          ex.args[1])
                elif ex.args[0] == 6:
                    raise error.GrabCouldNotResolveHostError(ex.args[0],
                                                             ex.args[1])
                else:
                    raise error.GrabNetworkError(ex.args[0], ex.args[1])
        except Exception as ex: # pylint: disable=broad-except
            # Wrap any unexpected failure into GrabInternalError, keeping
            # the original traceback (six.reraise for py2/py3 compat).
            if self.has_pycurl_hidden_sigint(stderr_proxy.get_output()):
                raise KeyboardInterrupt
            six.reraise(error.GrabInternalError, error.GrabInternalError(ex),
                        sys.exc_info()[2])
        else:
            # Even a successful perform() may have swallowed a SIGINT
            # inside a callback; surface it here as well.
            if self.has_pycurl_hidden_sigint(stderr_proxy.get_output()):
                raise KeyboardInterrupt
Example #4
0
def build_grab_exception(ex, curl):
    """
    Build Grab exception from the pycurl exception

    Args:
        ex - the original pycurl exception
        curl - the Curl instance raised the exception

    Returns the Grab exception instance, or None when the pycurl
    error should be ignored.
    """
    code = ex.args[0]
    # CURLE_WRITE_ERROR (23):
    # An error occurred when writing received data to a local file, or
    # an error was returned to libcurl from a write callback.
    # This error should be ignored when the grab_callback_interrupted
    # flag is enabled (this happens when nohead or nobody options
    # are enabled).
    #
    # The same error is also raised when curl receives
    # KeyboardInterrupt while it is processing some callback function
    # (WRITEFUNCTION, HEADERFUNCTION, etc).
    # If you think WTF then see details here:
    # https://github.com/pycurl/pycurl/issues/413
    if code == 23:
        if getattr(curl, 'grab_callback_interrupted', None) is True:
            # If the execution of the body_process callback was
            # interrupted (body_maxsize, nobody and other options)
            # then pycurl raises an exception with code 23.
            # We should ignore it.
            return None
        return error.GrabNetworkError(ex.args[1], ex)
    # Curl error code -> Grab exception class; unrecognized codes fall
    # back to the generic network error.
    exc_class = {
        28: error.GrabTimeoutError,
        7: error.GrabConnectionError,
        67: error.GrabAuthError,
        47: error.GrabTooManyRedirectsError,
        6: error.GrabCouldNotResolveHostError,
        3: error.GrabInvalidUrl,
    }.get(code, error.GrabNetworkError)
    return exc_class(ex.args[1], ex)
Example #5
0
    def process_request_result(self, prepare_response_func=None):
        """
        Process result of real request performed via transport extension.

        Builds ``self.doc`` from the transport response, updates cookies,
        charset and referer state, writes optional dump/log files, and
        follows meta-refresh redirects when ``follow_refresh`` is enabled.

        Args:
            prepare_response_func: optional callable used instead of
                ``self.transport.prepare_response`` to build the document.

        Returns: the result of a recursive ``self.request`` call when a
        meta-refresh redirect is followed, otherwise ``None``.

        Raises:
            error.GrabTooManyRedirectsError: ``redirect_limit`` exceeded
                while following meta-refresh redirects.
        """
        now = datetime.now()
        # TODO: move into separate method
        if self.config['debug_post']:
            post = self.config['post'] or self.config['multipart_post']
            if isinstance(post, dict):
                post = list(post.items())
            if post:
                if isinstance(post, basestring):
                    post = post[:self.config['debug_post_limit']] + '...'
                else:
                    items = normalize_http_values(post, charset='utf-8')
                    new_items = []
                    for key, value in items:
                        # Truncate long values to keep the log readable.
                        if len(value) > self.config['debug_post_limit']:
                            value = value[:self.
                                          config['debug_post_limit']] + '...'
                        new_items.append((key, value))
                    post = '\n'.join('%-25s: %s' % x for x in new_items)
            if post:
                # Lazy %-args: the message is only rendered when DEBUG
                # logging is actually enabled.
                logger_network.debug('[%02d] POST request:\n%s\n',
                                     self.request_counter, post)

        # It's important to delete old POST data after request is performed.
        # If POST data is not cleared then next request will try to use them
        # again!
        old_refresh_count = self.config['refresh_redirect_count']
        self.reset_temporary_options()

        if prepare_response_func:
            self.doc = prepare_response_func(self.transport, self)
        else:
            self.doc = self.transport.prepare_response(self)

        # Workaround: give the document a back-reference to the Grab
        # instance without creating a reference cycle.
        if self.doc.grab is None:
            self.doc.grab = weakref.proxy(self)

        if self.config['reuse_cookies']:
            self.cookies.update(self.doc.cookies)

        self.doc.timestamp = now

        self.config['charset'] = self.doc.charset

        if self.config['log_file']:
            with open(self.config['log_file'], 'wb') as out:
                out.write(self.doc.body)

        if self.config['cookiefile']:
            self.cookies.save_to_file(self.config['cookiefile'])

        if self.config['reuse_referer']:
            self.config['referer'] = self.doc.url

        self.copy_request_data()

        # Should be called after `copy_request_data`
        self.save_dumps()

        if self.config['follow_refresh']:
            url = find_refresh_url(self.doc.unicode_body())
            if url is not None:
                # FIX: removed leftover debug print() calls that polluted
                # stdout on every refresh redirect.
                inc_count = old_refresh_count + 1
                if inc_count > self.config['redirect_limit']:
                    raise error.GrabTooManyRedirectsError()
                return self.request(url=url,
                                    refresh_redirect_count=inc_count)

        return None
Example #6
0
    def request(self, **kwargs):
        """
        Perform network request.

        You can specify grab settings in ``**kwargs``.
        Any keyword argument will be passed to ``self.config``.

        Failed transport requests are retried up to ``retries`` times
        with ``retry_timeout`` seconds between attempts.  Redirects
        (``Location`` header and meta refresh) are followed up to
        ``redirect_limit`` times.

        Returns: ``Document`` objects.

        Raises:
            error.GrabTooManyRedirectsError: redirect limit exceeded.
            error.GrabError: all retries of the transport request failed.
        """
        self.prepare_request(**kwargs)
        refresh_count = 0
        retries = self.config["retries"]
        retry_timeout = self.config["retry_timeout"]

        while True:
            self.log_request()
            try:
                while True:
                    try:
                        self.transport.request()
                        break
                    except error.GrabError as ex:
                        retries -= 1
                        if retries <= 0:
                            # FIX: bare raise preserves the original
                            # traceback (instead of `raise ex`).
                            raise
                        # FIX: lazy %-args instead of eager formatting;
                        # the message is rendered only at DEBUG level.
                        logger.debug(
                            'Request has failed with %s retrying in %s seconds',
                            ex, retry_timeout)
                        time.sleep(retry_timeout)
            except error.GrabError as ex:
                # Out of retries: remember the failure, drop one-shot
                # options and optionally dump debug data, then propagate.
                self.exception = ex
                self.reset_temporary_options()
                if self.config['log_dir']:
                    self.save_failed_dump()
                raise
            else:
                doc = self.process_request_result()

                # Follow HTTP redirects via the Location header.
                if self.config['follow_location']:
                    if doc.code in (301, 302, 303, 307, 308):
                        if doc.headers.get('Location'):
                            refresh_count += 1
                            if refresh_count > self.config['redirect_limit']:
                                raise error.GrabTooManyRedirectsError()
                            else:
                                url = doc.headers.get('Location')
                                self.prepare_request(
                                    url=self.make_url_absolute(url),
                                    referer=None)
                                continue

                # Follow <meta refresh> redirects if enabled.
                if self.config['follow_refresh']:
                    refresh_url = self.doc.get_meta_refresh_url()
                    if refresh_url is not None:
                        refresh_count += 1
                        if refresh_count > self.config['redirect_limit']:
                            raise error.GrabTooManyRedirectsError()
                        else:
                            self.prepare_request(
                                url=self.make_url_absolute(refresh_url),
                                referer=None)
                            continue
                return doc