Пример #1
0
    def text_search(self, anchor, byte=False):
        """
        Search the substring in response body.

        :param anchor: string to search
        :param byte: if False then `anchor` should be the
            unicode string, and search will be performed in
            `response.unicode_body()` else `anchor` should be the byte-string
            and search will be performed in `response.body`

        If substring is found return True else False.
        """

        if isinstance(anchor, six.text_type):
            if byte:
                raise GrabMisuseError('The anchor should be bytes string in '
                                      'byte mode')
            else:
                return anchor in self.unicode_body()

        if not isinstance(anchor, six.text_type):
            if byte:
                # if six.PY3:
                # return anchor in self.body_as_bytes()
                return anchor in self.body
            else:
                raise GrabMisuseError('The anchor should be byte string in '
                                      'non-byte mode')
Пример #2
0
    def xpath_number(self, path, default=NULL, filter=None, ignore_spaces=False,
                     smart=False, make_int=True):

        if filter is not None:
            raise GrabMisuseError('Argument `filter` is not supported anymore')
        return self.doc.select(path).number(default=default, smart=smart,
                                            ignore_spaces=ignore_spaces, make_int=make_int)
Пример #3
0
def create_cookie(name, value, domain, httponly=None, **kwargs):
    """Creates `cookielib.Cookie` instance"""

    if domain == 'localhost':
        domain = ''
    config = dict(
        name=name,
        value=value,
        version=0,
        port=None,
        domain=domain,
        path='/',
        secure=False,
        expires=None,
        discard=True,
        comment=None,
        comment_url=None,
        rfc2109=False,
        rest={'HttpOnly': httponly},
    )

    for key in kwargs:
        if key not in config:
            raise GrabMisuseError('Function `create_cookie` does not accept '
                                  '`%s` argument' % key)

    config.update(**kwargs)
    config['rest']['HttpOnly'] = httponly

    config['port_specified'] = bool(config['port'])
    config['domain_specified'] = bool(config['domain'])
    config['domain_initial_dot'] = (config['domain'] or '').startswith('.')
    config['path_specified'] = bool(config['path'])

    return Cookie(**config)
Пример #4
0
    def __init__(self, source, source_type, proxy_type='http', **kwargs):
        """
        Create `ProxyList` object and load proxies from the specified source.

        You should specify type of source in second argument to let ProxyList
        instance know how to handle proxy source.

        :param source: source of the project (file name, string or some object)
        :param source_type: type of proxy source
        :param proxy_type: default type of proxy (if proxy source does not
        provide this information)
        :param **kwargs: any additional arguments goes to specific proxy load
        method
        """

        self.init_kwargs = deepcopy(kwargs)

        try:
            source_class = SOURCE_LIST[source_type]
        except AttributeError:
            raise GrabMisuseError('Unknown proxy source type: %s' %
                                  source_type)
        self.source = source_class(source, proxy_type=proxy_type, **kwargs)
        self.source.load()
        self.filter_config = {}
        self.geoip_resolver = None
Пример #5
0
 def _build_selector(cls, tree, selector_type):
     if selector_type == 'xpath':
         return XpathSelector(tree)
     elif selector_type == 'json':
         return JsonSelector(tree)
     else:
         raise GrabMisuseError('Unknown selector type: %s' % selector_type)
Пример #6
0
    def choose_form(self, number=None, xpath=None, name=None, **kwargs):
        """
        Set the default form.

        :param number: number of form (starting from zero)
        :param id: value of "id" attribute
        :param name: value of "name" attribute
        :param xpath: XPath query
        :raises: :class:`DataNotFound` if form not found
        :raises: :class:`GrabMisuseError`
            if method is called without parameters

        Selected form will be available via `form` attribute of `Grab`
        instance. All form methods will work with default form.

        Examples::

            # Select second form
            g.choose_form(1)

            # Select by id
            g.choose_form(id="register")

            # Select by name
            g.choose_form(name="signup")

            # Select by xpath
            g.choose_form(xpath='//form[contains(@action, "/submit")]')
        """

        id_ = kwargs.pop('id', None)
        if id_ is not None:
            try:
                self._lxml_form = self.select('//form[@id="%s"]' % id_).node()
            except IndexError:
                raise DataNotFound("There is no form with id: %s" % id_)
        elif name is not None:
            try:
                self._lxml_form = self.select(
                    '//form[@name="%s"]' % name).node()
            except IndexError:
                raise DataNotFound('There is no form with name: %s' % name)
        elif number is not None:
            try:
                self._lxml_form = self.tree.forms[number]
            except IndexError:
                raise DataNotFound('There is no form with number: %s' % number)
        elif xpath is not None:
            try:
                self._lxml_form = self.select(xpath).node()
            except IndexError:
                raise DataNotFound(
                    'Could not find form with xpath: %s' % xpath)
        else:
            raise GrabMisuseError('choose_form methods requires one of '
                                  '[number, id, name, xpath] arguments')
Пример #7
0
def remove_bom(text):
    """
    Remove BOM-sequence from the start of byte string.
    """
    if isinstance(text, unicode):
        raise GrabMisuseError('remove_bom function accepts only byte strings')
    if text.startswith(BOM_TOKEN):
        return text[3:]
    else:
        return text
Пример #8
0
 def _write_body(self, body):
     if isinstance(body, six.text_type):
         raise GrabMisuseError('Document.body could be only byte string.')
     elif self.body_path:
         with open(self.body_path, 'wb') as out:
             out.write(body)
         self._bytes_body = None
     else:
         self._bytes_body = body
     self._unicode_body = None
Пример #9
0
 def update(self, cookies):
     if isinstance(cookies, CookieJar):
         for cookie in cookies:
             self.cookiejar.set_cookie(cookie)
     elif isinstance(cookies, CookieManager):
         for cookie in cookies.cookiejar:
             self.cookiejar.set_cookie(cookie)
     else:
         raise GrabMisuseError('Unknown type of cookies argument: %s' %
                               type(cookies))
Пример #10
0
 def xpath_text(self,
                path,
                default=NULL,
                filter=None,
                smart=False,
                normalize_space=True):
     if filter is not None:
         raise GrabMisuseError('Argument `filter` is not supported anymore')
     return self.doc.select(path).text(default=default,
                                       smart=smart,
                                       normalize_space=normalize_space)
Пример #11
0
def func_field(*args, **kwargs):
    if not kwargs and len(args) == 1 \
            and isinstance(args[0], collections.Callable):
        raise GrabMisuseError('It seems that you forgot to "call" the '
                              'func_field decorator. Use "@func_field()" '
                              'instead "func_field".')

    def wrapper(func):
        kwargs['pass_item'] = True
        return FuncField(func=func, *args, **kwargs)

    return wrapper
Пример #12
0
    def request(self):
        req = self._request

        if req.proxy:
            if req.proxy_userpwd:
                headers = make_headers(proxy_basic_auth=req.proxy_userpwd)
            else:
                headers = None
            proxy_url = '%s://%s' % (req.proxy_type, req.proxy)
            try:
                pool = ProxyManager(proxy_url, proxy_headers=headers)
            except ProxySchemeUnknown:
                raise GrabMisuseError('Urllib3 transport does '
                                      'not support %s proxies' %
                                      req.proxy_type)
        else:
            pool = self.pool
        try:
            retry = Retry(redirect=False, connect=False, read=False)
            # The read timeout is not total response time timeout
            # It is the timeout on read of next data chunk from the server
            # Total response timeout is handled by Grab
            timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
            #req_headers = dict((make_unicode(x), make_unicode(y))
            #                   for (x, y) in req.headers.items())
            if six.PY3:
                req_url = make_unicode(req.url)
                req_method = make_unicode(req.method)
            else:
                req_url = make_str(req.url)
                req_method = req.method
            req.op_started = time.time()
            res = pool.urlopen(req_method,
                               req_url,
                               body=req.data,
                               timeout=timeout,
                               retries=retry,
                               headers=req.headers,
                               preload_content=False)
        except exceptions.ReadTimeoutError as ex:
            raise error.GrabTimeoutError('Read timeout')
        except exceptions.ConnectTimeoutError as ex:
            raise error.GrabConnectionError('Could not create connection')
        except exceptions.ProtocolError as ex:
            raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])

        # WTF?
        self.request_head = b''
        self.request_body = b''
        self.request_log = b''

        self._response = res
Пример #13
0
    def load(self):
        """
        Load proxies from given list.

        Each proxy server could be in two forms:
        * simple: "server:port"
        * complex: "server:port:user:pwd"
        """

        if not isinstance(self.source, list):
            raise GrabMisuseError("Given proxy list isn't a list type")
        self.server_list = self.get_server_list(self.source)
        self.server_list_iterator = itertools.cycle(self.server_list)
Пример #14
0
    def load(self):
        """
        Load proxies from given string. String can be multiline.

        Each proxy server could be in two forms:
        * simple: "server:port"
        * complex: "server:port:user:pwd"
        """

        if not isinstance(self.source, (str, unicode)):
            raise GrabMisuseError("Given proxy list isn't a string or unicode type")
        self.server_list = self.get_server_list(self.source)
        self.server_list_iterator = itertools.cycle(self.server_list)
Пример #15
0
    def extract_document_data(cls, grab):
        """
        Extract document data from grab object in format that is
        suitable to pass to `cls` Item constructor.
        """

        sel_type = cls._get_selector_type()
        if sel_type == 'xpath':
            return grab.tree
        elif sel_type == 'json':
            return grab.response.json
        else:
            raise GrabMisuseError('Unknown selector type: %s' % sel_type)
Пример #16
0
    def find_link(self, href_pattern, make_absolute=True):
        """
        Find link in response body which href value matches ``href_pattern``.

        Returns found url or None.
        """

        if make_absolute:
            self.tree.make_links_absolute(self.response.url)

        if isinstance(href_pattern, unicode):
            raise GrabMisuseError('find_link method accepts only '\
                                  'byte-string argument')
        for elem, attr, link, pos in self.tree.iterlinks():
            if elem.tag == 'a' and href_pattern in link:
                return link
        return None
Пример #17
0
    def find_link(self, href_pattern, make_absolute=True):
        """
        Find link in response body which href value matches ``href_pattern``.

        Returns found url or None.
        """

        if make_absolute:
            self.tree.make_links_absolute(self.doc.url)

        if isinstance(href_pattern, six.text_type):
            raise GrabMisuseError('Method `find_link` accepts only '
                                  'byte-string argument')
        href_pattern = make_unicode(href_pattern)
        for elem, _, link, _ in self.tree.iterlinks():
            if elem.tag == 'a' and href_pattern in link:
                return link
        return None
Пример #18
0
 def xpath_list(self, path, filter=None):
     if filter is not None:
         raise GrabMisuseError('Argument `filter` is not supported anymore')
     return self.doc.select(path).node_list()
Пример #19
0
 def xpath_one(self, path, default=NULL, filter=None):
     if filter is not None:
         raise GrabMisuseError('Argument `filter` is not supported anymore')
     return self.doc.select(path).node(default=default)
Пример #20
0
 def select(self, xpath=None):
     raise GrabMisuseError('TextSelector does not allow select method')
Пример #21
0
 def attr(self, key, default=NULL):
     raise GrabMisuseError('TextSelector does not allow attr method')
Пример #22
0
Файл: curl.py Проект: lyicy/grab
    def process_config(self, grab):
        """
        Setup curl instance with values from ``self.config``.
        """

        # Copy some config for future usage
        self.config_nobody = grab.config['nobody']
        self.config_body_maxsize = grab.config['body_maxsize']

        try:
            request_url = normalize_url(grab.config['url'])
        except Exception as ex:
            raise error.GrabInvalidUrl(u'%s: %s' %
                                       (six.text_type(ex), grab.config['url']))

        # py3 hack
        if not six.PY3:
            request_url = make_str(request_url)

        self.curl.setopt(pycurl.URL, request_url)

        # 30* redirects are handled by Grab
        self.curl.setopt(pycurl.FOLLOWLOCATION, 0)
        self.curl.setopt(pycurl.MAXREDIRS, grab.config['redirect_limit'])
        self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config['connect_timeout'])
        self.curl.setopt(pycurl.TIMEOUT, grab.config['timeout'])
        #self.curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
        # self.curl.setopt(pycurl.DNS_CACHE_TIMEOUT, 0)
        if not grab.config['connection_reuse']:
            self.curl.setopt(pycurl.FRESH_CONNECT, 1)
            self.curl.setopt(pycurl.FORBID_REUSE, 1)

        self.curl.setopt(pycurl.NOSIGNAL, 1)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.header_processor)

        if grab.config['body_inmemory']:
            self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)
        else:
            if not grab.config['body_storage_dir']:
                raise error.GrabMisuseError(
                    'Option body_storage_dir is not defined')
            self.setup_body_file(
                grab.config['body_storage_dir'],
                grab.config['body_storage_filename'],
                create_dir=grab.config['body_storage_create_dir'])
            self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)

        if grab.config['verbose_logging']:
            self.verbose_logging = True

        # User-Agent
        if grab.config['user_agent'] is None:
            if grab.config['user_agent_file'] is not None:
                with open(grab.config['user_agent_file']) as inf:
                    lines = inf.read().splitlines()
                grab.config['user_agent'] = random.choice(lines)
            else:
                grab.config['user_agent'] = generate_user_agent()

        # If value is None then set empty string
        # None is not acceptable because in such case
        # pycurl will set its default user agent "PycURL/x.xx.x"
        if not grab.config['user_agent']:
            grab.config['user_agent'] = ''

        self.curl.setopt(pycurl.USERAGENT, grab.config['user_agent'])

        if grab.config['debug']:
            self.curl.setopt(pycurl.VERBOSE, 1)
            self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)

        # Ignore SSL errors
        self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)

        # Disabled to avoid SSL3_READ_BYTES:sslv3 alert handshake failure error
        # self.curl.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)

        if grab.request_method in ('POST', 'PUT'):
            if (grab.config['post'] is None
                    and grab.config['multipart_post'] is None):
                raise GrabMisuseError('Neither `post` or `multipart_post`'
                                      ' options was specified for the %s'
                                      ' request' % grab.request_method)

        if grab.request_method == 'POST':
            self.curl.setopt(pycurl.POST, 1)
            if grab.config['multipart_post']:
                if isinstance(grab.config['multipart_post'], six.string_types):
                    raise error.GrabMisuseError(
                        'multipart_post option could not be a string')
                post_items = normalize_http_values(
                    grab.config['multipart_post'],
                    charset=grab.config['charset'],
                    ignore_classes=(UploadFile, UploadContent),
                )
                # py3 hack
                #if six.PY3:
                #    post_items = decode_pairs(post_items,
                #                              grab.config['charset'])
                self.curl.setopt(pycurl.HTTPPOST,
                                 process_upload_items(post_items))
            elif grab.config['post']:
                post_data = normalize_post_data(grab.config['post'],
                                                grab.config['charset'])
                # py3 hack
                # if six.PY3:
                #    post_data = smart_unicode(post_data,
                #                              grab.config['charset'])
                self.curl.setopt(pycurl.POSTFIELDS, post_data)
            else:
                self.curl.setopt(pycurl.POSTFIELDS, '')
        elif grab.request_method == 'PUT':
            data = grab.config['post']
            if isinstance(data, six.text_type):
                # py3 hack
                # if six.PY3:
                #    data = data.encode('utf-8')
                # else:
                raise error.GrabMisuseError(
                    'Value of post option could be only '
                    'byte string if PUT method is used')
            self.curl.setopt(pycurl.UPLOAD, 1)
            self.curl.setopt(pycurl.CUSTOMREQUEST, 'PUT')
            self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
            self.curl.setopt(pycurl.INFILESIZE, len(data))
        elif grab.request_method == 'PATCH':
            data = grab.config['post']
            if isinstance(data, six.text_type):
                raise error.GrabMisuseError(
                    'Value of post option could be only byte '
                    'string if PATCH method is used')
            self.curl.setopt(pycurl.UPLOAD, 1)
            self.curl.setopt(pycurl.CUSTOMREQUEST, 'PATCH')
            self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
            self.curl.setopt(pycurl.INFILESIZE, len(data))
        elif grab.request_method == 'DELETE':
            self.curl.setopt(pycurl.CUSTOMREQUEST, 'DELETE')
        elif grab.request_method == 'HEAD':
            self.curl.setopt(pycurl.NOBODY, 1)
        elif grab.request_method == 'UPLOAD':
            self.curl.setopt(pycurl.UPLOAD, 1)
        elif grab.request_method == 'GET':
            self.curl.setopt(pycurl.HTTPGET, 1)
        elif grab.request_method == 'OPTIONS':
            data = grab.config['post']
            if data is not None:
                if isinstance(data, six.text_type):
                    raise error.GrabMisuseError(
                        'Value of post option could be only byte '
                        'string if PATCH method is used')
                self.curl.setopt(pycurl.UPLOAD, 1)
                self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
                self.curl.setopt(pycurl.INFILESIZE, len(data))
            self.curl.setopt(pycurl.CUSTOMREQUEST, 'OPTIONS')
        else:
            raise error.GrabMisuseError('Invalid method: %s' %
                                        grab.request_method)

        headers = grab.config['common_headers']
        if grab.config['headers']:
            headers.update(grab.config['headers'])
        # This is required to avoid some problems
        headers.update({'Expect': ''})
        header_tuples = [str('%s: %s' % x) for x in headers.items()]
        self.curl.setopt(pycurl.HTTPHEADER, header_tuples)

        self.process_cookie_options(grab, request_url)

        if grab.config['referer']:
            self.curl.setopt(pycurl.REFERER, str(grab.config['referer']))

        if grab.config['proxy']:
            self.curl.setopt(pycurl.PROXY, str(grab.config['proxy']))
        else:
            self.curl.setopt(pycurl.PROXY, '')

        if grab.config['proxy_userpwd']:
            self.curl.setopt(pycurl.PROXYUSERPWD,
                             str(grab.config['proxy_userpwd']))

        if grab.config['proxy_type']:
            key = 'PROXYTYPE_%s' % grab.config['proxy_type'].upper()
            self.curl.setopt(pycurl.PROXYTYPE, getattr(pycurl, key))

        if grab.config['encoding']:
            if ('gzip' in grab.config['encoding']
                    and 'zlib' not in pycurl.version):
                raise error.GrabMisuseError(
                    'You can not use gzip encoding because '
                    'pycurl was built without zlib support')
            self.curl.setopt(pycurl.ENCODING, grab.config['encoding'])

        if grab.config['userpwd']:
            self.curl.setopt(pycurl.USERPWD, str(grab.config['userpwd']))

        if grab.config.get('interface') is not None:
            self.curl.setopt(pycurl.INTERFACE, grab.config['interface'])

        if grab.config.get('reject_file_size') is not None:
            self.curl.setopt(pycurl.MAXFILESIZE,
                             grab.config['reject_file_size'])
Пример #23
0
    def process_config(self, grab):
        req = Request(data=None)

        try:
            request_url = normalize_url(grab.config['url'])
        except Exception as ex:
            raise error.GrabInvalidUrl(
                u'%s: %s' %
                (six.text_type(ex),
                 make_unicode(grab.config['url'], errors='ignore')))
        req.url = request_url

        method = grab.detect_request_method()
        req.method = make_str(method)

        req.config_body_maxsize = grab.config['body_maxsize']
        req.config_nobody = grab.config['nobody']

        req.timeout = grab.config['timeout']
        req.connect_timeout = grab.config['connect_timeout']

        extra_headers = {}

        # Body processing
        if grab.config['body_inmemory']:
            pass
        else:
            if not grab.config['body_storage_dir']:
                raise GrabMisuseError('Option body_storage_dir is not defined')
            file_, path_ = self.setup_body_file(
                grab.config['body_storage_dir'],
                grab.config['body_storage_filename'],
                create_dir=grab.config['body_storage_create_dir'])
            req.response_file = file_
            req.response_path = path_

        if grab.config['multipart_post'] is not None:
            post_data = grab.config['multipart_post']
            if isinstance(post_data, six.binary_type):
                pass
            elif isinstance(post_data, six.text_type):
                raise GrabMisuseError('Option multipart_post data'
                                      ' does not accept unicode.')
            else:
                post_items = normalize_http_values(
                    grab.config['multipart_post'],
                    charset=grab.config['charset'],
                    ignore_classes=(UploadFile, UploadContent),
                )
                post_items = decode_pairs(post_items, grab.config['charset'])
                post_items = process_upload_items(post_items)
                post_data, content_type = encode_multipart_formdata(post_items)
                extra_headers['Content-Type'] = content_type
            extra_headers['Content-Length'] = len(post_data)
            req.data = post_data
        elif grab.config['post'] is not None:
            post_data = normalize_post_data(grab.config['post'],
                                            grab.config['charset'])
            # py3 hack
            # if six.PY3:
            #    post_data = smart_unicode(post_data,
            #                              grab.config['charset'])
            extra_headers['Content-Length'] = len(post_data)
            req.data = post_data

        if method in ('POST', 'PUT'):
            if (grab.config['post'] is None
                    and grab.config['multipart_post'] is None):
                raise GrabMisuseError('Neither `post` or `multipart_post`'
                                      ' options was specified for the %s'
                                      ' request' % method)
        # Proxy
        if grab.config['proxy']:
            req.proxy = grab.config['proxy']

        if grab.config['proxy_userpwd']:
            req.proxy_userpwd = grab.config['proxy_userpwd']

        if grab.config['proxy_type']:
            req.proxy_type = grab.config['proxy_type']
        else:
            req.proxy_type = 'http'

        # User-Agent
        if grab.config['user_agent'] is None:
            if grab.config['user_agent_file'] is not None:
                with open(grab.config['user_agent_file']) as inf:
                    lines = inf.read().splitlines()
                grab.config['user_agent'] = random.choice(lines)
            else:
                grab.config['user_agent'] = generate_user_agent()

        extra_headers['User-Agent'] = grab.config['user_agent']

        # Headers
        headers = extra_headers
        headers.update(grab.config['common_headers'])

        if grab.config['headers']:
            headers.update(grab.config['headers'])
        req.headers = headers

        # Cookies
        self.process_cookie_options(grab, req)

        self._request = req