Example #1
import urllib
from urllib import urlencode


def build_search_url(query, page=1, per_page=None, lang='en', filter=True,
                     **kwargs):
    """
    Build a Google search URL for the specified query and pagination options.

    :param per_page: results per page; one of 10, 20, 30, 50, 100
    Supported kwargs (Google's time-range filter):
        tbs=qdr:h  -- past hour
        tbs=qdr:d  -- past day
        tbs=qdr:w  -- past week
        tbs=qdr:m  -- past month
        tbs=qdr:y  -- past year
    """

    if per_page is None:
        per_page = 10
    if isinstance(query, unicode):
        query = query.encode('utf-8')
    start = per_page * (page - 1)
    url = 'http://google.com./search?hl=%s&q=%s&start=%s' % (
        lang, urllib.quote(query), start)
    if per_page != 10:
        url += '&num=%d' % per_page
    if not filter:
        url += '&filter=0'
    if kwargs:
        url += '&' + urlencode(kwargs)
    return url
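
A quick usage sketch (Python 2, given the ``unicode`` check and ``urllib.quote``); the expected URL follows directly from the code above:

>>> build_search_url('python grab', page=2, per_page=20)
'http://google.com./search?hl=en&q=python%20grab&start=20&num=20'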
Example #2
File: google.py  Project: Scaurus/grab
from urllib import quote, urlencode

# smart_str converts unicode to a utf-8 byte string; in the grab source
# tree it lives among the encoding helpers (exact path varies by version).
from grab.tools.encoding import smart_str


def build_search_url(query, page=None, per_page=None, lang=None,
                     filter=None, **kwargs):
    """
    Build a Google search URL for the specified query and pagination options.

    :param per_page: results per page; one of 10, 20, 30, 50, 100
    Supported kwargs (Google's time-range filter):
        tbs=qdr:h  -- past hour
        tbs=qdr:d  -- past day
        tbs=qdr:w  -- past week
        tbs=qdr:m  -- past month
        tbs=qdr:y  -- past year
    """

    if per_page is None:
        per_page = 10
    if page is None:
        page = 1
    if lang is None:
        lang = 'en'
    if filter is None:
        filter = True
    start = per_page * (page - 1)

    if 'hl' not in kwargs:
        kwargs['hl'] = lang
    if 'num' not in kwargs:
        kwargs['num'] = per_page
    if 'start' not in kwargs:
        kwargs['start'] = start
    if 'filter' not in kwargs and not filter:
        kwargs['filter'] = '0'

    url = 'http://google.com/search?q=%s' % quote(smart_str(query))
    if kwargs:
        url += '&' + urlencode(kwargs)
    return url
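
A usage sketch for this variant; any standard parameter can also be overridden directly through kwargs, and a tbs value restricts results to a time range:

# Past-week results, German interface language, duplicate filtering off:
url = build_search_url('python grab', lang='de', filter=False, tbs='qdr:w')
# -> 'http://google.com/search?q=python%20grab&...' where the query string
#    also carries hl=de, num=10, start=0, filter=0 and tbs=qdr:w (in
#    whatever order urlencode yields them)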
Example #3
    def process_config(self, grab):
        """
        Set up the requests transport using values from ``grab.config``.

        This is a method of grab's requests transport class; it assumes
        module-level imports of ``random``, ``urlencode`` and the grab
        helpers ``normalize_http_values``, ``normalize_unicode`` and
        ``GrabMisuseError`` (exact import paths depend on the grab version).
        """

        # Accumulate all request options into `self.requests_config`
        self.requests_config = {
            'headers': {},
            'payload': None,
            'cookies': None,
            'proxy': None,
        }

        if isinstance(grab.config['url'], unicode):
            grab.config['url'] = grab.config['url'].encode('utf-8')

        self.requests_config['url'] = grab.config['url']

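        # The commented-out pycurl calls below appear to be leftovers from
        # the curl transport this class was ported from; they are kept as a
        # record of the equivalent curl options.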
        #self.curl.setopt(pycurl.URL, url)
        #self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
        #self.curl.setopt(pycurl.MAXREDIRS, 5)
        #self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config['connect_timeout'])
        #self.curl.setopt(pycurl.TIMEOUT, grab.config['timeout'])
        #self.curl.setopt(pycurl.NOSIGNAL, 1)
        #self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)
        #self.curl.setopt(pycurl.HEADERFUNCTION, self.head_processor)

        # User-Agent
        # TODO: move to base class
        if grab.config['user_agent'] is None:
            if grab.config['user_agent_file'] is not None:
                with open(grab.config['user_agent_file']) as inp:
                    lines = inp.read().splitlines()
                grab.config['user_agent'] = random.choice(lines)

        # If the value is None, send an empty string instead:
        # pycurl would otherwise substitute its own default user agent
        # ("PycURL/x.xx.x"), so for consistency every transport sends
        # an empty User-Agent header in that case.
        if not grab.config['user_agent']:
            grab.config['user_agent'] = ''
        self.requests_config['headers']['User-Agent'] = grab.config['user_agent']

        #if grab.config['debug']:
            #self.curl.setopt(pycurl.VERBOSE, 1)
            #self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)

        ## Ignore SSL errors
        #self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        #self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)

        self.requests_config['method'] = grab.request_method.lower()

        if grab.request_method in ('POST', 'PUT'):
            if grab.config['multipart_post']:
                raise NotImplementedError
                # if isinstance(grab.config['multipart_post'], basestring):
                #     raise GrabMisuseError('multipart_post option could not be a string')
                # post_items = normalize_http_values(grab.config['multipart_post'],
                #                                    charset=grab.config['charset'])
                # self.curl.setopt(pycurl.HTTPPOST, post_items)
            elif grab.config['post']:
                if isinstance(grab.config['post'], basestring):
                    # bytes-string should be posted as-is
                    # unicode should be converted into byte-string
                    if isinstance(grab.config['post'], unicode):
                        post_data = normalize_unicode(grab.config['post'])
                    else:
                        post_data = grab.config['post']
                else:
                    # dict, tuple, list should be serialized into byte-string
                    post_data = urlencode(grab.config['post'])
                self.requests_config['payload'] = post_data
                #self.curl.setopt(pycurl.POSTFIELDS, post_data)
        #elif grab.request_method == 'PUT':
            #self.curl.setopt(pycurl.PUT, 1)
            #self.curl.setopt(pycurl.READFUNCTION, StringIO(grab.config['post']).read) 
        elif grab.request_method == 'DELETE':
            pass
            #self.curl.setopt(pycurl.CUSTOMREQUEST, 'delete')
        elif grab.request_method == 'HEAD':
            pass
            #self.curl.setopt(pycurl.NOBODY, 1)
        else:
            pass
            #self.curl.setopt(pycurl.HTTPGET, 1)

        # Copy so that per-request headers do not leak into the shared
        # common_headers config between requests
        headers = dict(grab.config['common_headers'])
        if grab.config['headers']:
            headers.update(grab.config['headers'])
        # header_tuples = [str('%s: %s' % x)
        #                  for x in headers.iteritems()]
        # self.curl.setopt(pycurl.HTTPHEADER, header_tuples)
        self.requests_config['headers'].update(headers)

        # The `cookiefile` option should be processed before the `cookies`
        # option, because `load_cookies` updates the `cookies` option.
        if grab.config['cookiefile']:
            grab.load_cookies(grab.config['cookiefile'])

        if grab.config['cookies']:
            items = normalize_http_values(grab.config['cookies'])
            self.requests_config['cookies'] = dict(items)

        #if not grab.config['reuse_cookies'] and not grab.config['cookies']:
            #self.curl.setopt(pycurl.COOKIELIST, 'ALL')

        #if grab.config['referer']:
            #self.curl.setopt(pycurl.REFERER, str(grab.config['referer']))

        #if grab.config['proxy']:
            #self.curl.setopt(pycurl.PROXY, str(grab.config['proxy'])) 
        #else:
            #self.curl.setopt(pycurl.PROXY, '')

        #if grab.config['proxy_userpwd']:
            #self.curl.setopt(pycurl.PROXYUSERPWD, grab.config['proxy_userpwd'])

        if grab.config['proxy']:
            self.requests_config['proxy'] = grab.config['proxy']

        if grab.config['proxy_userpwd']:
            raise GrabMisuseError('requests transport does not support proxy authentication')

        if grab.config['proxy_type']:
            if grab.config['proxy_type'] != 'http':
                raise GrabMisuseError('requests transport supports only proxies of http type')
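
For context, a minimal sketch of how a dispatcher could consume the accumulated ``requests_config`` via the requests library. The function name ``make_request`` and the 'host:port' proxy format are assumptions for illustration; the keyword mapping itself (``headers``, ``data``, ``cookies``, ``proxies``) is standard requests API:

import requests


def make_request(cfg):
    # Hypothetical dispatcher: maps the keys accumulated by
    # process_config() onto requests.request() keyword arguments.
    proxies = None
    if cfg['proxy']:
        # assumes grab stores the proxy as a bare 'host:port' string
        proxies = {'http': 'http://%s' % cfg['proxy']}
    return requests.request(
        cfg['method'],        # 'get', 'post', 'put', ...
        cfg['url'],
        headers=cfg['headers'],
        data=cfg['payload'],  # urlencoded byte string or None
        cookies=cfg['cookies'],
        proxies=proxies,
    )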