def build_search_url(query, page=1, per_page=None, lang='en', filter=True, **kwargs):
    """
    Build google search url with specified query and pagination options.

    :param query: search phrase; text is encoded to UTF-8 and percent-escaped
    :param page: 1-based page number, used to compute the ``start`` offset
    :param per_page: 10, 20, 30, 50, 100 (``None`` means 10); ``num`` is
        added to the url only when the value differs from 10
    :param lang: value of the ``hl`` argument, percent-escaped
    :param filter: when false, ``filter=0`` is appended to the url
    :param kwargs: extra arguments appended to the url via ``urlencode``, e.g.:

        tbs=qdr:h
        tbs=qdr:d
        tbs=qdr:w
        tbs=qdr:m
        tbs=qdr:y

    :returns: the search url as a string
    """

    # `quote` is `urllib.quote` on Python 2 and `urllib.parse.quote` on
    # Python 3; import it locally so the function works on both.
    try:
        from urllib import quote
    except ImportError:
        from urllib.parse import quote

    text_type = type(u'')

    def escape(value):
        # Anything that is not a string is rendered with str() first,
        # matching what the `%s` interpolation did before.
        if not isinstance(value, (bytes, text_type)):
            value = str(value)
        # Text must become UTF-8 bytes before quoting: Python 2 `quote`
        # fails on non-ASCII unicode objects.
        if isinstance(value, text_type):
            value = value.encode('utf-8')
        return quote(value)

    if per_page is None:
        per_page = 10

    start = per_page * (page - 1)

    # `lang` is escaped as well, otherwise a value containing `&` or `=`
    # would inject additional arguments into the url.
    url = 'http://google.com./search?hl=%s&q=%s&start=%s' % (
        escape(lang), escape(query), start)

    if per_page != 10:
        url += '&num=%d' % per_page
    if not filter:
        url += '&filter=0'
    if kwargs:
        url += '&' + urlencode(kwargs)
    return url
def build_search_url(query, page=None, per_page=None, lang=None,
                     filter=None, **kwargs):
    """
    Compose a google search url for ``query``.

    Arguments left as ``None`` fall back to: ``page`` = 1, ``per_page`` = 10,
    ``lang`` = 'en', ``filter`` = True.

    :param per_page: 10, 20, 30, 50, 100
    :param kwargs: extra url arguments. The keys ``hl``, ``num``, ``start``
        and ``filter`` given here take precedence over the values computed
        from ``lang``, ``per_page``, ``page`` and ``filter``. Examples:

        tbs=qdr:h
        tbs=qdr:d
        tbs=qdr:w
        tbs=qdr:m
        tbs=qdr:y
    """

    page = 1 if page is None else page
    per_page = 10 if per_page is None else per_page
    lang = 'en' if lang is None else lang
    filter = True if filter is None else filter

    offset = per_page * (page - 1)

    # Explicit kwargs win; missing keys are appended in this fixed order.
    kwargs.setdefault('hl', lang)
    kwargs.setdefault('num', per_page)
    kwargs.setdefault('start', offset)
    if 'filter' not in kwargs and not filter:
        kwargs['filter'] = '0'

    # `kwargs` always holds at least `hl` here, so the argument tail is
    # never empty.
    base = 'http://google.com/search?q=%s' % quote(smart_str(query))
    return base + '&' + urlencode(kwargs)
def build_search_url(
    query, page=None, per_page=None, lang=None, filter=None, **kwargs
):
    """
    Return a google search url for the given query and pagination options.

    ``None`` values are replaced with defaults: first page, 10 results
    per page, "en" language, filtering enabled.

    :param per_page: 10, 20, 30, 50, 100
    :param kwargs: additional url arguments; ``hl``, ``num``, ``start`` and
        ``filter`` passed here are kept instead of the computed values.
        Sample values:

        tbs=qdr:h
        tbs=qdr:d
        tbs=qdr:w
        tbs=qdr:m
        tbs=qdr:y
    """

    if page is None:
        page = 1
    if per_page is None:
        per_page = 10
    if lang is None:
        lang = "en"
    if filter is None:
        filter = True

    first_result = per_page * (page - 1)

    # Fill in only the arguments the caller did not pass explicitly.
    computed = (("hl", lang), ("num", per_page), ("start", first_result))
    for name, value in computed:
        if name not in kwargs:
            kwargs[name] = value
    if "filter" not in kwargs and not filter:
        kwargs["filter"] = "0"

    # `hl` is always present at this point, so there is always a tail to add.
    head = "http://google.com/search?q=%s" % quote(smart_str(query))
    return "&".join((head, urlencode(kwargs)))
def process_config(self, grab):
    """
    Build request options for the requests transport from ``grab.config``.

    The options are collected into ``self.requests_config``, a dict with
    the keys ``url``, ``method``, ``headers``, ``payload``, ``cookies``
    and ``proxy``.

    Side effects on ``grab``: a unicode ``grab.config['url']`` is replaced
    with its UTF-8 encoded form, ``grab.config['user_agent']`` is filled in
    (random line of ``user_agent_file`` or empty string) and
    ``grab.load_cookies`` is called when ``cookiefile`` is set.

    :param grab: object providing ``config`` (dict of options),
        ``request_method`` and ``load_cookies``
    :raises NotImplementedError: ``multipart_post`` is set for a POST or
        PUT request
    :raises GrabMisuseError: ``proxy_userpwd`` is set, or ``proxy_type``
        is set to something other than ``http``
    """

    # NOTE(review): the options `connect_timeout`, `timeout`, `debug`,
    # `referer` and `reuse_cookies` were referenced only by commented-out
    # pycurl code and are not applied by this method.

    # Accumulate all request options into `self.requests_config`
    self.requests_config = {
        'headers': {},
        'payload': None,
        'cookies': None,
        'proxy': None,
    }

    if isinstance(grab.config['url'], unicode):
        grab.config['url'] = grab.config['url'].encode('utf-8')

    self.requests_config['url'] = grab.config['url']

    # User-Agent
    # TODO: move to base class
    if grab.config['user_agent'] is None:
        if grab.config['user_agent_file'] is not None:
            # Context manager closes the file even if reading fails
            with open(grab.config['user_agent_file']) as ua_file:
                lines = ua_file.read().splitlines()
            # An empty file leaves the option as None, which becomes
            # the empty string below instead of crashing `random.choice`
            if lines:
                grab.config['user_agent'] = random.choice(lines)

    # If value is None then set empty string
    # None is not acceptable because in such case
    # pycurl will set its default user agent "PycURL/x.xx.x"
    # For consistency we send empty User-Agent in case of None value
    # in all other transports too
    if not grab.config['user_agent']:
        grab.config['user_agent'] = ''

    request_headers = self.requests_config['headers']
    request_headers['User-Agent'] = grab.config['user_agent']

    self.requests_config['method'] = grab.request_method.lower()

    # Only POST and PUT carry a payload; other methods need no extra setup
    if grab.request_method in ('POST', 'PUT'):
        if grab.config['multipart_post']:
            # Multipart bodies are not implemented in this transport
            raise NotImplementedError
        elif grab.config['post']:
            post = grab.config['post']
            if isinstance(post, basestring):
                # bytes-string should be posted as-is
                # unicode should be converted into byte-string
                if isinstance(post, unicode):
                    post_data = normalize_unicode(post)
                else:
                    post_data = post
            else:
                # dict, tuple, list should be serialized into byte-string
                post_data = urlencode(post)
            self.requests_config['payload'] = post_data

    # Merge into the per-request dict instead of updating
    # `grab.config['common_headers']` in place, so that request-specific
    # headers do not leak into the shared defaults.
    # `headers` entries override `common_headers` entries.
    request_headers.update(grab.config['common_headers'])
    if grab.config['headers']:
        request_headers.update(grab.config['headers'])

    # `cookiefile` option should be processed before `cookies` option
    # because `load_cookies` updates `cookies` option
    if grab.config['cookiefile']:
        grab.load_cookies(grab.config['cookiefile'])

    if grab.config['cookies']:
        items = normalize_http_values(grab.config['cookies'])
        self.requests_config['cookies'] = dict(items)

    if grab.config['proxy']:
        self.requests_config['proxy'] = grab.config['proxy']

    if grab.config['proxy_userpwd']:
        raise GrabMisuseError('requests transport does not support proxy authentication')

    if grab.config['proxy_type']:
        if grab.config['proxy_type'] != 'http':
            raise GrabMisuseError('requests transport supports only proxies of http type')