예제 #1
0
파일: base.py 프로젝트: lorien/grab
    def process_request_result(self, prepare_response_func=None):
        """
        Process result of real request performed via transport extension.

        When the ``debug_post`` option is on, the submitted POST data is
        written to the network logger (each value cut to
        ``debug_post_limit`` characters).  Then the temporary request
        options are reset, the response document is built and response
        state (cookies, charset, referer) is copied back into ``self``.

        :param prepare_response_func: optional callable invoked as
            ``prepare_response_func(self.transport, self)`` to build the
            response document; when it is not given
            ``self.transport.prepare_response(self)`` is used instead.
        :return: the response document (also stored in ``self.doc``).
        """

        now = datetime.utcnow()
        # TODO: move into separate method
        if self.config['debug_post']:
            post = self.config['post'] or self.config['multipart_post']
            if isinstance(post, dict):
                post = list(post.items())
            if post:
                limit = self.config['debug_post_limit']
                if isinstance(post, six.string_types):
                    was_cut = len(post) > limit
                    post = make_str(post[:limit], errors='ignore')
                    # Add the ellipsis only if the data was really cut,
                    # the same way it is done for key/value items below
                    if was_cut:
                        post += b'...'
                else:
                    items = normalize_http_values(
                        post, charset=self.config['charset'])
                    new_items = []
                    for key, value in items:
                        if len(value) > limit:
                            value = value[:limit] + b'...'
                        new_items.append((key, value))
                    post = '\n'.join('%-25s: %s' % x for x in new_items)
            if post:
                logger_network.debug('[%02d] POST request:\n%s\n',
                                     self.request_counter, post)

        # It's important to delete old POST data after request is performed.
        # If POST data is not cleared then next request will try to use them
        # again!
        self.reset_temporary_options()

        if prepare_response_func:
            self.doc = prepare_response_func(self.transport, self)
        else:
            self.doc = self.transport.prepare_response(self)

        self.doc.process_grab(self)

        if self.config['reuse_cookies']:
            self.cookies.update(self.doc.cookies)

        # Timestamp is taken at the start of this method, before any
        # processing of the response
        self.doc.timestamp = now

        self.config['charset'] = self.doc.charset

        # Dump the raw response body into the configured log file
        if self.config['log_file']:
            with open(self.config['log_file'], 'wb') as out:
                out.write(self.doc.body)

        if self.config['cookiefile']:
            self.cookies.save_to_file(self.config['cookiefile'])

        if self.config['reuse_referer']:
            self.config['referer'] = self.doc.url

        self.copy_request_data()

        # Should be called after `copy_request_data`
        if self.config['log_dir']:
            self.save_dumps()

        return self.doc
예제 #2
0
파일: urllib3.py 프로젝트: ad-m/grab
    def process_config(self, grab):
        """
        Build the ``Request`` object described by ``grab.config`` and
        store it in ``self._request``.

        Raises ``error.GrabInvalidUrl`` if the URL can not be normalized
        and ``GrabMisuseError`` if the options contradict each other.
        """
        config = grab.config
        request = Request(data=None)

        try:
            normalized_url = normalize_url(config['url'])
        except Exception as ex:
            raise error.GrabInvalidUrl(
                u'%s: %s' % (six.text_type(ex), config['url']))
        request.url = normalized_url

        http_method = grab.detect_request_method()
        request.method = make_str(http_method)

        # `nobody` option overrides the configured body size limit
        body_limit = config['body_maxsize']
        request.body_maxsize = 0 if config['nobody'] else body_limit

        request.timeout = config['timeout']
        request.connect_timeout = config['connect_timeout']

        headers = {}

        # Response body goes to a file unless it is kept in memory
        if not config['body_inmemory']:
            if not config['body_storage_dir']:
                raise GrabMisuseError(
                    'Option body_storage_dir is not defined')
            body_file, body_path = self.setup_body_file(
                config['body_storage_dir'],
                config['body_storage_filename'],
                create_dir=config['body_storage_create_dir'])
            request._response_file = body_file
            request._response_path = body_path

        # Request body: multipart data has priority over plain post data
        multipart = config['multipart_post']
        if multipart is not None:
            if isinstance(multipart, six.text_type):
                raise GrabMisuseError('Option multipart_post data'
                                      ' does not accept unicode.')
            if isinstance(multipart, six.binary_type):
                payload = multipart
            else:
                pairs = normalize_http_values(
                    multipart,
                    charset=config['charset'],
                    ignore_classes=(UploadFile, UploadContent),
                )
                pairs = decode_pairs(pairs, config['charset'])
                pairs = process_upload_items(pairs)
                payload, content_type = encode_multipart_formdata(pairs)
                headers['Content-Type'] = content_type
            headers['Content-Length'] = len(payload)
            request.data = payload
        elif config['post'] is not None:
            payload = normalize_post_data(config['post'], config['charset'])
            headers['Content-Length'] = len(payload)
            request.data = payload

        # POST and PUT requests must carry some data
        no_payload = (config['post'] is None and
                      config['multipart_post'] is None)
        if http_method in ('POST', 'PUT') and no_payload:
            raise GrabMisuseError('Neither `post` or `multipart_post`'
                                  ' options was specified for the %s'
                                  ' request' % http_method)

        # Proxy
        if config['proxy']:
            request.proxy = config['proxy']
        if config['proxy_userpwd']:
            request.proxy_userpwd = config['proxy_userpwd']
        request.proxy_type = config['proxy_type'] or 'http'

        # User-Agent: random line of the file or generated value
        if config['user_agent'] is None:
            ua_file = config['user_agent_file']
            if ua_file is None:
                config['user_agent'] = generate_user_agent()
            else:
                with open(ua_file) as inf:
                    candidates = inf.read().splitlines()
                config['user_agent'] = random.choice(candidates)
        headers['User-Agent'] = config['user_agent']

        # Headers: common headers first, then user-defined headers
        headers.update(config['common_headers'])
        if config['headers']:
            headers.update(config['headers'])
        request.headers = headers

        # Cookies
        self.process_cookie_options(grab, request)

        self._request = request
예제 #3
0
파일: curl.py 프로젝트: Gwill/grab
    def process_config(self, grab):
        """
        Setup curl instance with values from ``grab.config``.

        Translates the request options stored in ``grab.config`` (URL,
        timeouts, request method and data, headers, cookies, proxy,
        authentication and so on) into pycurl options of ``self.curl``.
        If no user agent is configured, one is chosen here and written
        back to ``grab.config["user_agent"]``.

        :param grab: object providing the ``config`` mapping and the
            ``request_method`` attribute read below.
        :raises error.GrabInvalidUrl: if ``normalize_url`` fails for the
            configured URL.
        :raises error.GrabMisuseError: if the options contradict each
            other: ``body_storage_dir`` is missing, POST/PUT request has
            no data, text is given where a byte string is required,
            the request method is unknown or gzip encoding is requested
            while pycurl has no zlib support.
        """

        # Copy some config for future usage
        self.config_nobody = grab.config["nobody"]
        self.config_body_maxsize = grab.config["body_maxsize"]

        try:
            request_url = normalize_url(grab.config["url"])
        except Exception as ex:
            raise error.GrabInvalidUrl(u"%s: %s" % (six.text_type(ex), grab.config["url"]))

        # py3 hack
        if not six.PY3:
            request_url = make_str(request_url)

        self.curl.setopt(pycurl.URL, request_url)

        # Actually, FOLLOWLOCATION should always be 0
        # because redirect logic takes place in Grab.request method
        # BUT in Grab.Spider this method is not invoked
        # So, in Grab.Spider we still rely on Grab internal ability
        # to follow 30X Locations
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1 if grab.config["follow_location"] else 0)
        self.curl.setopt(pycurl.MAXREDIRS, grab.config["redirect_limit"])
        self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config["connect_timeout"])
        self.curl.setopt(pycurl.TIMEOUT, grab.config["timeout"])
        self.curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
        # self.curl.setopt(pycurl.DNS_CACHE_TIMEOUT, 0)
        if not grab.config["connection_reuse"]:
            self.curl.setopt(pycurl.FRESH_CONNECT, 1)
            self.curl.setopt(pycurl.FORBID_REUSE, 1)

        self.curl.setopt(pycurl.NOSIGNAL, 1)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.header_processor)

        # The body file has to be prepared only if the response body
        # is not kept in memory; the write callback is the same
        if not grab.config["body_inmemory"]:
            if not grab.config["body_storage_dir"]:
                raise error.GrabMisuseError("Option body_storage_dir is not defined")
            self.setup_body_file(
                grab.config["body_storage_dir"],
                grab.config["body_storage_filename"],
                create_dir=grab.config["body_storage_create_dir"],
            )
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)

        if grab.config["verbose_logging"]:
            self.verbose_logging = True

        # User-Agent
        if grab.config["user_agent"] is None:
            if grab.config["user_agent_file"] is not None:
                with open(grab.config["user_agent_file"]) as inf:
                    lines = inf.read().splitlines()
                grab.config["user_agent"] = random.choice(lines)
            else:
                grab.config["user_agent"] = generate_user_agent()

        # If value is None then set empty string
        # None is not acceptable because in such case
        # pycurl will set its default user agent "PycURL/x.xx.x"
        if not grab.config["user_agent"]:
            grab.config["user_agent"] = ""

        self.curl.setopt(pycurl.USERAGENT, grab.config["user_agent"])

        if grab.config["debug"]:
            self.curl.setopt(pycurl.VERBOSE, 1)
            self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)

        # Ignore SSL errors
        self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)

        # Disabled to avoid SSL3_READ_BYTES:sslv3 alert handshake failure error
        # self.curl.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)

        # POST and PUT requests must carry some data.
        # The exception is referenced via the `error` module like
        # everywhere else in this method.
        if grab.request_method in ("POST", "PUT"):
            if grab.config["post"] is None and grab.config["multipart_post"] is None:
                raise error.GrabMisuseError(
                    "Neither `post` or `multipart_post`"
                    " options was specified for the %s"
                    " request" % grab.request_method
                )

        if grab.request_method == "POST":
            self.curl.setopt(pycurl.POST, 1)
            if grab.config["multipart_post"]:
                if isinstance(grab.config["multipart_post"], six.string_types):
                    raise error.GrabMisuseError("multipart_post option could not be a string")
                post_items = normalize_http_values(
                    grab.config["multipart_post"],
                    charset=grab.config["charset"],
                    ignore_classes=(UploadFile, UploadContent),
                )
                # py3 hack
                if six.PY3:
                    post_items = decode_pairs(post_items, grab.config["charset"])
                self.curl.setopt(pycurl.HTTPPOST, process_upload_items(post_items))
            elif grab.config["post"]:
                post_data = normalize_post_data(grab.config["post"], grab.config["charset"])
                # py3 hack
                # if six.PY3:
                #    post_data = smart_unicode(post_data,
                #                              grab.config['charset'])
                self.curl.setopt(pycurl.POSTFIELDS, post_data)
            else:
                self.curl.setopt(pycurl.POSTFIELDS, "")
        elif grab.request_method == "PUT":
            data = grab.config["post"]
            if isinstance(data, six.text_type):
                # py3 hack
                # if six.PY3:
                #    data = data.encode('utf-8')
                # else:
                raise error.GrabMisuseError("Value of post option could be only byte string if PUT method is used")
            self.curl.setopt(pycurl.UPLOAD, 1)
            self.curl.setopt(pycurl.CUSTOMREQUEST, "PUT")
            self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
            self.curl.setopt(pycurl.INFILESIZE, len(data))
        elif grab.request_method == "PATCH":
            data = grab.config["post"]
            if isinstance(data, six.text_type):
                raise error.GrabMisuseError("Value of post option could be only byte string if PATCH method is used")
            self.curl.setopt(pycurl.UPLOAD, 1)
            self.curl.setopt(pycurl.CUSTOMREQUEST, "PATCH")
            self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
            self.curl.setopt(pycurl.INFILESIZE, len(data))
        elif grab.request_method == "DELETE":
            self.curl.setopt(pycurl.CUSTOMREQUEST, "DELETE")
        elif grab.request_method == "HEAD":
            self.curl.setopt(pycurl.NOBODY, 1)
        elif grab.request_method == "UPLOAD":
            self.curl.setopt(pycurl.UPLOAD, 1)
        elif grab.request_method == "GET":
            self.curl.setopt(pycurl.HTTPGET, 1)
        elif grab.request_method == "OPTIONS":
            # Data is optional for the OPTIONS request
            data = grab.config["post"]
            if data is not None:
                if isinstance(data, six.text_type):
                    raise error.GrabMisuseError(
                        "Value of post option could be only byte string if OPTIONS method is used"
                    )
                self.curl.setopt(pycurl.UPLOAD, 1)
                self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
                self.curl.setopt(pycurl.INFILESIZE, len(data))
            self.curl.setopt(pycurl.CUSTOMREQUEST, "OPTIONS")
        else:
            raise error.GrabMisuseError("Invalid method: %s" % grab.request_method)

        # Work on a copy: updating the `common_headers` dict in place
        # would leak headers of this request into the shared config
        headers = dict(grab.config["common_headers"])
        if grab.config["headers"]:
            headers.update(grab.config["headers"])
        # This is required to avoid some problems
        headers.update({"Expect": ""})
        header_tuples = [str("%s: %s" % x) for x in headers.items()]
        self.curl.setopt(pycurl.HTTPHEADER, header_tuples)

        self.process_cookie_options(grab, request_url)

        if grab.config["referer"]:
            self.curl.setopt(pycurl.REFERER, str(grab.config["referer"]))

        # Empty string is set explicitly when no proxy is configured
        if grab.config["proxy"]:
            self.curl.setopt(pycurl.PROXY, str(grab.config["proxy"]))
        else:
            self.curl.setopt(pycurl.PROXY, "")

        if grab.config["proxy_userpwd"]:
            self.curl.setopt(pycurl.PROXYUSERPWD, str(grab.config["proxy_userpwd"]))

        if grab.config["proxy_type"]:
            # Proxy type name is mapped to the pycurl.PROXYTYPE_* constant
            key = "PROXYTYPE_%s" % grab.config["proxy_type"].upper()
            self.curl.setopt(pycurl.PROXYTYPE, getattr(pycurl, key))

        if grab.config["encoding"]:
            if "gzip" in grab.config["encoding"] and "zlib" not in pycurl.version:
                raise error.GrabMisuseError(
                    "You can not use gzip encoding because pycurl was built without zlib support"
                )
            self.curl.setopt(pycurl.ENCODING, grab.config["encoding"])

        if grab.config["userpwd"]:
            self.curl.setopt(pycurl.USERPWD, str(grab.config["userpwd"]))

        if grab.config.get("interface") is not None:
            self.curl.setopt(pycurl.INTERFACE, grab.config["interface"])

        if grab.config.get("reject_file_size") is not None:
            self.curl.setopt(pycurl.MAXFILESIZE, grab.config["reject_file_size"])
예제 #4
0
파일: curl.py 프로젝트: abael/grab
    def process_config(self, grab):
        """
        Setup curl instance with values from ``grab.config``.

        Translates the request options stored in ``grab.config`` (URL,
        timeouts, request method and data, headers, cookies, proxy,
        authentication and so on) into pycurl options of ``self.curl``.
        If no user agent is configured, one is chosen here and written
        back to ``grab.config['user_agent']``.

        :param grab: object providing the ``config`` mapping and the
            ``request_method`` attribute read below.
        :raises error.GrabInvalidUrl: if ``normalize_url`` fails for the
            configured URL.
        :raises error.GrabMisuseError: if the options contradict each
            other: ``body_storage_dir`` is missing, text is given where
            a byte string is required, the request method is unknown or
            gzip encoding is requested while pycurl has no zlib support.
        """

        # Copy some config for future usage
        self.config_nobody = grab.config['nobody']
        self.config_body_maxsize = grab.config['body_maxsize']

        try:
            request_url = normalize_url(grab.config['url'])
        except Exception as ex:
            raise error.GrabInvalidUrl(
                u'%s: %s' % (six.text_type(ex), grab.config['url']))

        # py3 hack
        if not six.PY3:
            request_url = smart_str(request_url)

        self.curl.setopt(pycurl.URL, request_url)

        self.curl.setopt(pycurl.FOLLOWLOCATION,
                         1 if grab.config['follow_location'] else 0)
        self.curl.setopt(pycurl.MAXREDIRS, grab.config['redirect_limit'])
        self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config['connect_timeout'])
        self.curl.setopt(pycurl.TIMEOUT, grab.config['timeout'])
        self.curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
        # self.curl.setopt(pycurl.DNS_CACHE_TIMEOUT, 0)
        if not grab.config['connection_reuse']:
            self.curl.setopt(pycurl.FRESH_CONNECT, 1)
            self.curl.setopt(pycurl.FORBID_REUSE, 1)

        self.curl.setopt(pycurl.NOSIGNAL, 1)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.head_processor)

        # The body file has to be prepared only if the response body
        # is not kept in memory; the write callback is the same
        if not grab.config['body_inmemory']:
            if not grab.config['body_storage_dir']:
                raise error.GrabMisuseError(
                    'Option body_storage_dir is not defined')
            self.setup_body_file(grab.config['body_storage_dir'],
                                 grab.config['body_storage_filename'])
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)

        if grab.config['verbose_logging']:
            self.verbose_logging = True

        # User-Agent
        if grab.config['user_agent'] is None:
            if grab.config['user_agent_file'] is not None:
                with open(grab.config['user_agent_file']) as inf:
                    lines = inf.read().splitlines()
                grab.config['user_agent'] = random.choice(lines)
            else:
                grab.config['user_agent'] = random_user_agent()

        # If value is None then set empty string
        # None is not acceptable because in such case
        # pycurl will set its default user agent "PycURL/x.xx.x"
        if not grab.config['user_agent']:
            grab.config['user_agent'] = ''

        self.curl.setopt(pycurl.USERAGENT, grab.config['user_agent'])

        if grab.config['debug']:
            self.curl.setopt(pycurl.VERBOSE, 1)
            self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)

        # Ignore SSL errors
        self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)

        # Disabled to avoid SSL3_READ_BYTES:sslv3 alert handshake failure error
        # self.curl.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)

        if grab.request_method == 'POST':
            self.curl.setopt(pycurl.POST, 1)
            if grab.config['multipart_post']:
                if isinstance(grab.config['multipart_post'], six.string_types):
                    raise error.GrabMisuseError(
                        'multipart_post option could not be a string')
                post_items = normalize_http_values(
                    grab.config['multipart_post'],
                    charset=grab.config['charset'],
                    ignore_classes=(UploadFile, UploadContent),
                )
                # py3 hack
                if six.PY3:
                    post_items = decode_pairs(post_items,
                                              grab.config['charset'])
                self.curl.setopt(pycurl.HTTPPOST,
                                 process_upload_items(post_items))
            elif grab.config['post']:
                post_data = normalize_post_data(grab.config['post'],
                                                grab.config['charset'])
                # py3 hack
                # if six.PY3:
                #    post_data = smart_unicode(post_data,
                #                              grab.config['charset'])
                self.curl.setopt(pycurl.POSTFIELDS, post_data)
            else:
                self.curl.setopt(pycurl.POSTFIELDS, '')
        elif grab.request_method == 'PUT':
            data = grab.config['post']
            if isinstance(data, six.text_type):
                # py3 hack
                # if six.PY3:
                #    data = data.encode('utf-8')
                # else:
                raise error.GrabMisuseError(
                    'Value of post option could be only '
                    'byte string if PUT method is used')
            self.curl.setopt(pycurl.UPLOAD, 1)
            self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
            self.curl.setopt(pycurl.INFILESIZE, len(data))
        elif grab.request_method == 'PATCH':
            data = grab.config['post']
            if isinstance(data, six.text_type):
                raise error.GrabMisuseError(
                    'Value of post option could be only byte '
                    'string if PATCH method is used')
            self.curl.setopt(pycurl.UPLOAD, 1)
            self.curl.setopt(pycurl.CUSTOMREQUEST, 'PATCH')
            self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
            self.curl.setopt(pycurl.INFILESIZE, len(data))
        elif grab.request_method == 'DELETE':
            self.curl.setopt(pycurl.CUSTOMREQUEST, 'DELETE')
        elif grab.request_method == 'HEAD':
            self.curl.setopt(pycurl.NOBODY, 1)
        elif grab.request_method == 'UPLOAD':
            self.curl.setopt(pycurl.UPLOAD, 1)
        elif grab.request_method == 'GET':
            self.curl.setopt(pycurl.HTTPGET, 1)
        else:
            raise error.GrabMisuseError('Invalid method: %s' %
                                        grab.request_method)

        # Work on a copy: updating the `common_headers` dict in place
        # would leak headers of this request into the shared config
        headers = dict(grab.config['common_headers'])
        if grab.config['headers']:
            headers.update(grab.config['headers'])
        header_tuples = [str('%s: %s' % x) for x
                         in headers.items()]
        self.curl.setopt(pycurl.HTTPHEADER, header_tuples)

        self.process_cookie_options(grab, request_url)

        if grab.config['referer']:
            self.curl.setopt(pycurl.REFERER, str(grab.config['referer']))

        # Empty string is set explicitly when no proxy is configured
        if grab.config['proxy']:
            self.curl.setopt(pycurl.PROXY, str(grab.config['proxy']))
        else:
            self.curl.setopt(pycurl.PROXY, '')

        if grab.config['proxy_userpwd']:
            self.curl.setopt(pycurl.PROXYUSERPWD,
                             str(grab.config['proxy_userpwd']))

        if grab.config['proxy_type']:
            # Proxy type name is mapped to the pycurl.PROXYTYPE_* constant
            key = 'PROXYTYPE_%s' % grab.config['proxy_type'].upper()
            self.curl.setopt(pycurl.PROXYTYPE, getattr(pycurl, key))

        if grab.config['encoding']:
            if ('gzip' in grab.config['encoding'] and
                    'zlib' not in pycurl.version):
                raise error.GrabMisuseError(
                    'You can not use gzip encoding because '
                    'pycurl was built without zlib support')
            self.curl.setopt(pycurl.ENCODING, grab.config['encoding'])

        if grab.config['userpwd']:
            self.curl.setopt(pycurl.USERPWD, str(grab.config['userpwd']))

        if grab.config.get('interface') is not None:
            self.curl.setopt(pycurl.INTERFACE, grab.config['interface'])

        if grab.config.get('reject_file_size') is not None:
            self.curl.setopt(pycurl.MAXFILESIZE,
                             grab.config['reject_file_size'])
예제 #5
0
파일: base.py 프로젝트: abael/grab
    def process_request_result(self, prepare_response_func=None):
        """
        Process result of real request performed via transport extension.

        When the ``debug_post`` option is on, the submitted POST data is
        written to the network logger (each value cut to
        ``debug_post_limit`` characters).  Then the temporary request
        options are reset, the response document is built and response
        state (cookies, charset, referer) is copied back into ``self``.
        If the ``follow_refresh`` option is on and ``find_refresh_url``
        finds a URL in the document, a new request to that URL is made.

        :param prepare_response_func: optional callable invoked as
            ``prepare_response_func(self.transport, self)`` to build the
            response document; when it is not given
            ``self.transport.prepare_response(self)`` is used instead.
        :return: result of the follow-up ``self.request`` call if
            a refresh URL was followed, otherwise ``None``.
        :raises error.GrabTooManyRedirectsError: if following the refresh
            URL would exceed the ``redirect_limit`` option.
        """

        now = datetime.utcnow()
        # TODO: move into separate method
        if self.config['debug_post']:
            post = self.config['post'] or self.config['multipart_post']
            if isinstance(post, dict):
                post = list(post.items())
            if post:
                limit = self.config['debug_post_limit']
                if isinstance(post, six.string_types):
                    # Add the ellipsis only if the data was really cut,
                    # the same way it is done for key/value items below
                    if len(post) > limit:
                        post = post[:limit] + '...'
                else:
                    items = normalize_http_values(post, charset='utf-8')
                    new_items = []
                    for key, value in items:
                        if len(value) > limit:
                            value = value[:limit] + '...'
                        new_items.append((key, value))
                    post = '\n'.join('%-25s: %s' % x for x in new_items)
            if post:
                # Arguments are passed to the logger so the message is
                # built only if the debug level is enabled
                logger_network.debug('[%02d] POST request:\n%s\n',
                                     self.request_counter, post)

        # Remember the refresh counter before the temporary options are
        # reset (presumably the reset clears it -- verify)
        old_refresh_count = self.config['refresh_redirect_count']

        # It's important to delete old POST data after request is performed.
        # If POST data is not cleared then next request will try to use them
        # again!
        self.reset_temporary_options()

        if prepare_response_func:
            self.doc = prepare_response_func(self.transport, self)
        else:
            self.doc = self.transport.prepare_response(self)

        # Workaround
        if self.doc.grab is None:
            self.doc.grab = weakref.proxy(self)

        if self.config['reuse_cookies']:
            self.cookies.update(self.doc.cookies)

        # Timestamp is taken at the start of this method, before any
        # processing of the response
        self.doc.timestamp = now

        self.config['charset'] = self.doc.charset

        # Dump the raw response body into the configured log file
        if self.config['log_file']:
            with open(self.config['log_file'], 'wb') as out:
                out.write(self.doc.body)

        if self.config['cookiefile']:
            self.cookies.save_to_file(self.config['cookiefile'])

        if self.config['reuse_referer']:
            self.config['referer'] = self.doc.url

        self.copy_request_data()

        # Should be called after `copy_request_data`
        self.save_dumps()

        # TODO: check max redirect count
        if self.config['follow_refresh']:
            url = find_refresh_url(self.doc.unicode_body())
            if url is not None:
                inc_count = old_refresh_count + 1
                if inc_count > self.config['redirect_limit']:
                    raise error.GrabTooManyRedirectsError()
                return self.request(url=url,
                                    refresh_redirect_count=inc_count)

        return None
예제 #6
0
파일: urllib3.py 프로젝트: mombakkes/grab
    def process_config(self, grab):
        """
        Convert options of ``grab.config`` into a ``Request`` object
        and save that object in ``self._request``.

        Raises ``error.GrabInvalidUrl`` if the URL can not be normalized
        and ``GrabMisuseError`` if the options contradict each other.
        """
        options = grab.config
        new_request = Request(data=None)

        try:
            clean_url = normalize_url(options['url'])
        except Exception as ex:
            raise error.GrabInvalidUrl(
                u'%s: %s' % (six.text_type(ex), options['url']))
        new_request.url = clean_url

        verb = grab.detect_request_method()
        new_request.method = make_str(verb)

        # `nobody` option overrides the configured body size limit
        max_body = options['body_maxsize']
        new_request.body_maxsize = 0 if options['nobody'] else max_body

        new_request.timeout = options['timeout']
        new_request.connect_timeout = options['connect_timeout']

        headers = {}

        # Response body goes to a file unless it is kept in memory
        if not options['body_inmemory']:
            if not options['body_storage_dir']:
                raise GrabMisuseError(
                    'Option body_storage_dir is not defined')
            storage_file, storage_path = self.setup_body_file(
                options['body_storage_dir'],
                options['body_storage_filename'],
                create_dir=options['body_storage_create_dir'])
            new_request._response_file = storage_file
            new_request._response_path = storage_path

        # Request body: multipart data has priority over plain post data
        multipart = options['multipart_post']
        if multipart is not None:
            if isinstance(multipart, six.text_type):
                raise GrabMisuseError('Option multipart_post data'
                                      ' does not accept unicode.')
            if isinstance(multipart, six.binary_type):
                body = multipart
            else:
                fields = normalize_http_values(
                    multipart,
                    charset=options['charset'],
                    ignore_classes=(UploadFile, UploadContent),
                )
                fields = decode_pairs(fields, options['charset'])
                fields = process_upload_items(fields)
                body, content_type = encode_multipart_formdata(fields)
                headers['Content-Type'] = content_type
            headers['Content-Length'] = len(body)
            new_request.data = body
        elif options['post'] is not None:
            body = normalize_post_data(options['post'], options['charset'])
            headers['Content-Length'] = len(body)
            new_request.data = body

        # POST and PUT requests must carry some data
        data_missing = (options['post'] is None and
                        options['multipart_post'] is None)
        if verb in ('POST', 'PUT') and data_missing:
            raise GrabMisuseError('Neither `post` or `multipart_post`'
                                  ' options was specified for the %s'
                                  ' request' % verb)

        # Proxy
        if options['proxy']:
            new_request.proxy = options['proxy']
        if options['proxy_userpwd']:
            new_request.proxy_userpwd = options['proxy_userpwd']
        new_request.proxy_type = options['proxy_type'] or 'http'

        # User-Agent: random line of the file or generated value
        if options['user_agent'] is None:
            agents_path = options['user_agent_file']
            if agents_path is None:
                options['user_agent'] = generate_user_agent()
            else:
                with open(agents_path) as inf:
                    agents = inf.read().splitlines()
                options['user_agent'] = random.choice(agents)
        headers['User-Agent'] = options['user_agent']

        # Headers: common headers first, then user-defined headers
        headers.update(options['common_headers'])
        if options['headers']:
            headers.update(options['headers'])
        new_request.headers = headers

        # Cookies
        self.process_cookie_options(grab, new_request)

        self._request = new_request
예제 #7
0
    def process_config(self, grab):
        """
        Setup curl instance with values from ``grab.config``.

        Translates the URL, timeouts, response body handling, user agent,
        request method and payload, headers, cookies, referer, proxy,
        encoding and credential options into ``pycurl`` options set on
        ``self.curl``.

        Side effects besides configuring ``self.curl``:
        ``self.config_nobody`` and ``self.config_body_maxsize`` are stored,
        ``self.verbose_logging`` may be switched on, and
        ``grab.config['user_agent']`` is filled in when it was unset.

        Raises ``error.GrabInvalidUrl`` when the URL can not be normalized
        and ``error.GrabMisuseError`` when options are inconsistent
        (missing ``body_storage_dir``, missing or wrongly typed payload,
        unknown request method, gzip requested without zlib support).
        """

        # Copy some config for future usage
        self.config_nobody = grab.config['nobody']
        self.config_body_maxsize = grab.config['body_maxsize']

        try:
            request_url = normalize_url(grab.config['url'])
        except Exception as ex:
            raise error.GrabInvalidUrl(u'%s: %s' %
                                       (six.text_type(ex), grab.config['url']))

        # On Python 2 the URL is converted with make_str before being
        # handed to pycurl
        if not six.PY3:
            request_url = make_str(request_url)

        self.curl.setopt(pycurl.URL, request_url)

        # 30* redirects are handled by Grab
        self.curl.setopt(pycurl.FOLLOWLOCATION, 0)
        self.curl.setopt(pycurl.MAXREDIRS, grab.config['redirect_limit'])
        self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config['connect_timeout'])
        self.curl.setopt(pycurl.TIMEOUT, grab.config['timeout'])
        if not grab.config['connection_reuse']:
            self.curl.setopt(pycurl.FRESH_CONNECT, 1)
            self.curl.setopt(pycurl.FORBID_REUSE, 1)

        self.curl.setopt(pycurl.NOSIGNAL, 1)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.header_processor)

        # When the body is not kept in memory a body file has to be set up
        # first; the write callback is the same in both modes.
        if not grab.config['body_inmemory']:
            if not grab.config['body_storage_dir']:
                raise error.GrabMisuseError(
                    'Option body_storage_dir is not defined')
            self.setup_body_file(
                grab.config['body_storage_dir'],
                grab.config['body_storage_filename'],
                create_dir=grab.config['body_storage_create_dir'])
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)

        if grab.config['verbose_logging']:
            self.verbose_logging = True

        # User-Agent: pick a random line from the configured file or
        # generate one, and remember the choice in the config.
        if grab.config['user_agent'] is None:
            if grab.config['user_agent_file'] is not None:
                with open(grab.config['user_agent_file']) as inf:
                    lines = inf.read().splitlines()
                grab.config['user_agent'] = random.choice(lines)
            else:
                grab.config['user_agent'] = generate_user_agent()

        # If value is None then set empty string
        # None is not acceptable because in such case
        # pycurl will set its default user agent "PycURL/x.xx.x"
        if not grab.config['user_agent']:
            grab.config['user_agent'] = ''

        self.curl.setopt(pycurl.USERAGENT, grab.config['user_agent'])

        if grab.config['debug']:
            self.curl.setopt(pycurl.VERBOSE, 1)
            self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)

        # Ignore SSL errors
        self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)

        # Disabled to avoid SSL3_READ_BYTES:sslv3 alert handshake failure error
        # self.curl.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)

        method = grab.request_method

        if method in ('POST', 'PUT'):
            if (grab.config['post'] is None
                    and grab.config['multipart_post'] is None):
                raise error.GrabMisuseError(
                    'Neither `post` or `multipart_post`'
                    ' options was specified for the %s'
                    ' request' % method)

        if method == 'POST':
            self.curl.setopt(pycurl.POST, 1)
            if grab.config['multipart_post']:
                if isinstance(grab.config['multipart_post'], six.string_types):
                    raise error.GrabMisuseError(
                        'multipart_post option could not be a string')
                post_items = normalize_http_values(
                    grab.config['multipart_post'],
                    charset=grab.config['charset'],
                    ignore_classes=(UploadFile, UploadContent),
                )
                self.curl.setopt(pycurl.HTTPPOST,
                                 process_upload_items(post_items))
            elif grab.config['post']:
                post_data = normalize_post_data(grab.config['post'],
                                                grab.config['charset'])
                self.curl.setopt(pycurl.POSTFIELDS, post_data)
            else:
                # Payload option is set but empty: send an empty body
                self.curl.setopt(pycurl.POSTFIELDS, '')
        elif method in ('PUT', 'PATCH'):
            # Both methods upload the raw ``post`` value through a read
            # callback; only the custom request name differs.
            data = grab.config['post']
            if isinstance(data, six.text_type):
                raise error.GrabMisuseError(
                    'Value of post option could be only '
                    'byte string if %s method is used' % method)
            self.curl.setopt(pycurl.UPLOAD, 1)
            self.curl.setopt(pycurl.CUSTOMREQUEST, method)
            self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
            self.curl.setopt(pycurl.INFILESIZE, len(data))
        elif method == 'DELETE':
            self.curl.setopt(pycurl.CUSTOMREQUEST, 'DELETE')
        elif method == 'HEAD':
            self.curl.setopt(pycurl.NOBODY, 1)
        elif method == 'UPLOAD':
            self.curl.setopt(pycurl.UPLOAD, 1)
        elif method == 'GET':
            self.curl.setopt(pycurl.HTTPGET, 1)
        elif method == 'OPTIONS':
            # The payload is optional for OPTIONS requests
            data = grab.config['post']
            if data is not None:
                if isinstance(data, six.text_type):
                    raise error.GrabMisuseError(
                        'Value of post option could be only byte '
                        'string if OPTIONS method is used')
                self.curl.setopt(pycurl.UPLOAD, 1)
                self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
                self.curl.setopt(pycurl.INFILESIZE, len(data))
            self.curl.setopt(pycurl.CUSTOMREQUEST, 'OPTIONS')
        else:
            raise error.GrabMisuseError('Invalid method: %s' % method)

        # Work on a copy: updating the dict stored in the config in place
        # would leak the per-request headers (and the ``Expect`` override
        # below) into ``common_headers`` for every subsequent request.
        headers = dict(grab.config['common_headers'] or {})
        if grab.config['headers']:
            headers.update(grab.config['headers'])
        # This is required to avoid some problems
        headers.update({'Expect': ''})
        header_lines = [str('%s: %s' % x) for x in headers.items()]
        self.curl.setopt(pycurl.HTTPHEADER, header_lines)

        self.process_cookie_options(grab, request_url)

        if grab.config['referer']:
            self.curl.setopt(pycurl.REFERER, str(grab.config['referer']))

        # An empty string is set explicitly when no proxy is configured
        if grab.config['proxy']:
            self.curl.setopt(pycurl.PROXY, str(grab.config['proxy']))
        else:
            self.curl.setopt(pycurl.PROXY, '')

        if grab.config['proxy_userpwd']:
            self.curl.setopt(pycurl.PROXYUSERPWD,
                             str(grab.config['proxy_userpwd']))

        if grab.config['proxy_type']:
            # Map e.g. "socks5" to the pycurl.PROXYTYPE_SOCKS5 constant
            key = 'PROXYTYPE_%s' % grab.config['proxy_type'].upper()
            self.curl.setopt(pycurl.PROXYTYPE, getattr(pycurl, key))

        if grab.config['encoding']:
            if ('gzip' in grab.config['encoding']
                    and 'zlib' not in pycurl.version):
                raise error.GrabMisuseError(
                    'You can not use gzip encoding because '
                    'pycurl was built without zlib support')
            self.curl.setopt(pycurl.ENCODING, grab.config['encoding'])

        if grab.config['userpwd']:
            self.curl.setopt(pycurl.USERPWD, str(grab.config['userpwd']))

        if grab.config.get('interface') is not None:
            self.curl.setopt(pycurl.INTERFACE, grab.config['interface'])

        if grab.config.get('reject_file_size') is not None:
            self.curl.setopt(pycurl.MAXFILESIZE,
                             grab.config['reject_file_size'])
예제 #8
0
파일: base.py 프로젝트: zhenxingdev/grab
    def process_request_result(self, prepare_response_func=None):
        """
        Process result of real request performed via transport extension.

        Builds the response document (through ``prepare_response_func``
        when given, otherwise through ``self.transport.prepare_response``),
        stores it in ``self.doc``, updates cookies, charset, referer and the
        optional log/cookie files according to ``self.config`` and returns
        the document.
        """

        now = datetime.utcnow()
        # TODO: move into separate method
        if self.config['debug_post']:
            post = self.config['post'] or self.config['multipart_post']
            if isinstance(post, dict):
                post = list(post.items())
            if post:
                # Every logged value is cut to this many characters
                limit = self.config['debug_post_limit']
                if isinstance(post, six.string_types):
                    post = post[:limit] + '...'
                else:
                    items = normalize_http_values(post, charset='utf-8')
                    new_items = []
                    for key, value in items:
                        if len(value) > limit:
                            value = value[:limit] + '...'
                        new_items.append((key, value))
                    post = '\n'.join('%-25s: %s' % x for x in new_items)
            if post:
                # Arguments are passed separately so that the message is
                # only formatted when the debug record is actually emitted
                logger_network.debug('[%02d] POST request:\n%s\n',
                                     self.request_counter, post)

        # It's important to delete old POST data after request is performed.
        # If POST data is not cleared then next request will try to use them
        # again!
        self.reset_temporary_options()

        if prepare_response_func:
            self.doc = prepare_response_func(self.transport, self)
        else:
            self.doc = self.transport.prepare_response(self)

        # Workaround: attach this object to the document through a weak
        # proxy when the transport did not set ``doc.grab`` itself
        if self.doc.grab is None:
            self.doc.grab = weakref.proxy(self)

        if self.config['reuse_cookies']:
            self.cookies.update(self.doc.cookies)

        self.doc.timestamp = now

        self.config['charset'] = self.doc.charset

        # The log file is opened in 'wb' mode, i.e. rewritten on every call
        if self.config['log_file']:
            with open(self.config['log_file'], 'wb') as out:
                out.write(self.doc.body)

        if self.config['cookiefile']:
            self.cookies.save_to_file(self.config['cookiefile'])

        if self.config['reuse_referer']:
            self.config['referer'] = self.doc.url

        self.copy_request_data()

        # Should be called after `copy_request_data`
        if self.config['log_dir']:
            self.save_dumps()

        return self.doc
예제 #9
0
    def process_config(self, grab):
        """
        Build the ``Request`` object described by ``grab.config``.

        URL, method, timeouts, response body storage, request payload,
        proxy settings, user agent, headers and cookies are copied from the
        config onto a fresh ``Request`` which is then saved as
        ``self._request``.

        Raises ``error.GrabInvalidUrl`` when the URL can not be normalized
        and ``GrabMisuseError`` on inconsistent options.
        """
        cfg = grab.config
        request = Request(data=None)

        try:
            normalized_url = normalize_url(cfg["url"])
        except Exception as ex:
            raise error.GrabInvalidUrl(
                u"%s: %s" % (six.text_type(ex), cfg["url"]))
        request.url = normalized_url

        method = grab.detect_request_method()
        request.method = make_str(method)

        # The ``nobody`` option forces the body size limit down to zero
        max_size = cfg["body_maxsize"]
        request.body_maxsize = 0 if cfg["nobody"] else max_size

        request.timeout = cfg["timeout"]
        request.connect_timeout = cfg["connect_timeout"]

        # Headers produced by this method; config headers are merged on top
        # of them further below.
        generated_headers = {}

        # Response body goes to a file unless it is kept in memory
        if not cfg["body_inmemory"]:
            storage_dir = cfg["body_storage_dir"]
            if not storage_dir:
                raise GrabMisuseError("Option body_storage_dir is not defined")
            body_file, body_path = self.setup_body_file(
                storage_dir,
                cfg["body_storage_filename"],
                create_dir=cfg["body_storage_create_dir"],
            )
            request._response_file = body_file
            request._response_path = body_path

        # Request payload: ``multipart_post`` takes precedence over ``post``
        multipart = cfg["multipart_post"]
        if multipart is not None:
            if isinstance(multipart, six.text_type):
                raise GrabMisuseError(
                    "Option multipart_post data does not accept unicode.")
            if isinstance(multipart, six.binary_type):
                # Byte strings are sent as is
                payload = multipart
            else:
                pairs = normalize_http_values(
                    multipart,
                    charset=cfg["charset"],
                    ignore_classes=(UploadFile, UploadContent),
                )
                pairs = decode_pairs(pairs, cfg["charset"])
                pairs = process_upload_items(pairs)
                payload, content_type = encode_multipart_formdata(pairs)
                generated_headers["Content-Type"] = content_type
            generated_headers["Content-Length"] = len(payload)
            request.data = payload
        elif cfg["post"] is not None:
            payload = normalize_post_data(cfg["post"], cfg["charset"])
            generated_headers["Content-Length"] = len(payload)
            request.data = payload

        payload_missing = (cfg["post"] is None
                           and cfg["multipart_post"] is None)
        if method in ("POST", "PUT") and payload_missing:
            raise GrabMisuseError(
                "Neither `post` or `multipart_post` options was specified"
                " for the %s request" % method
            )

        # Proxy
        if cfg["proxy"]:
            request.proxy = cfg["proxy"]
        if cfg["proxy_userpwd"]:
            request.proxy_userpwd = cfg["proxy_userpwd"]
        request.proxy_type = cfg["proxy_type"] or "http"

        # User-Agent: chosen once and remembered in the config
        if cfg["user_agent"] is None:
            agents_file = cfg["user_agent_file"]
            if agents_file is None:
                cfg["user_agent"] = generate_user_agent()
            else:
                with open(agents_file) as inf:
                    candidates = inf.read().splitlines()
                cfg["user_agent"] = random.choice(candidates)
        generated_headers["User-Agent"] = cfg["user_agent"]

        # Headers: common headers first, then the explicit ones
        generated_headers.update(cfg["common_headers"])
        if cfg["headers"]:
            generated_headers.update(cfg["headers"])
        request.headers = generated_headers

        # Cookies
        self.process_cookie_options(grab, request)

        self._request = request