def ping(name, url, grab, thread_number=10):
    """ Do XMLRPC ping of given site.

    Posts the PING_XML payload to every server in SERVER_LIST using
    ``thread_number`` worker threads and returns a list of
    ``(rpc_url, ok)`` tuples, where ``ok`` is True when the server
    replied without an error flag.
    """
    name = smart_str(name)
    url = smart_str(url)

    def worker(rpc_url):
        # Build the XMLRPC request body; values are escaped for XML.
        payload = PING_XML % {
            'url': html.escape(url),
            'name': html.escape(name),
        }
        success = False
        try:
            grab.go(rpc_url, post=payload)
        except Exception as ex:
            logging.error(unicode(ex))
        else:
            # '<boolean>0' in the response means the flerror flag is
            # unset, i.e. the ping was accepted.
            if '<boolean>0' in grab.response.body:
                success = True
            else:
                logging.error('%s : FAIL' % rpc_url)
                logging.error(grab.response.body[:1000])
        return rpc_url, success

    # make_work already yields (rpc_url, ok) pairs from the workers.
    return list(make_work(worker, SERVER_LIST, thread_number))
def build_query_url(query, lang='ru'):
    """Build a suggest URL for *query*, rotating over google domains."""
    params = {
        'hostname': GOOGLE_DOMAINS_ITER.next(),
        'query': urllib.quote_plus(smart_str(query)),
        'lang': lang,
    }
    return BASE_SUGGEST_URL % params
def sanitize_html(html, encoding='utf-8', return_unicode=False):
    """Normalize *html* by parsing and re-rendering it when it contains tags.

    Returns a byte string by default, or unicode when ``return_unicode``
    is true.
    """
    raw = smart_str(html, encoding=encoding)
    if RE_TAG_START.search(raw):
        # Only pay the parse/render cost when markup is actually present.
        raw = render_html(parse_html(raw))
    return raw.decode('utf-8') if return_unicode else raw
def main(lookFor, jobTitle, company, tag): employerHeaderPageId = 1 questionTextPageId = 0 g = Grab() g.go(p(lookFor, jobTitle, company, tag, employerHeaderPageId)) employerHeader = g.xpath('//h1').text_content() f = open('Glassdoor.com ' + employerHeader + '.txt', 'w') f.write(smart_str(employerHeader) + ':\n') while True: g = Grab() questionTextPageId += 1 g.go(p(lookFor, jobTitle, company, tag, questionTextPageId)) if int(g.xpath('//li[@class="currPage"]').text) <= (questionTextPageId - 1): print 'Finished at page: ' + g.xpath('//li[@class="currPage"]').text + '!' break for questionText in g.xpath_list('//p[@class="questionText"]'): f.write(smart_str(questionText.text_content().strip()) + '\n') print 'Page # ' + g.xpath('//li[@class="currPage"]').text + ' parsed!'
def ping(name, url, grab, thread_number=10):
    """ Do XMLRPC ping of given site.

    Posts PING_XML to every server in SERVER_LIST using
    ``thread_number`` threads; returns a list of ``(rpc_url, ok)``
    tuples.
    """
    name = smart_str(name)
    url = smart_str(url)

    def worker(rpc_url):
        post = PING_XML % {
            'url': html.escape(url),
            'name': html.escape(name),
        }
        ok = False
        try:
            grab.go(rpc_url, post=post)
        # FIX: was the python-2-only ``except Exception, ex`` form;
        # use ``as`` like the other copy of this function in the file.
        except Exception as ex:
            logging.error(unicode(ex))
        else:
            # '<boolean>0' means the XMLRPC error flag is unset.
            if not '<boolean>0' in grab.response.body:
                logging.error('%s : FAIL' % rpc_url)
                logging.error(grab.response.body[:1000])
            else:
                ok = True
        return rpc_url, ok

    # FIX: the body was truncated after the except handler; restored
    # the result-collection loop from the full implementation above.
    results = []
    for rpc_url, ok in make_work(worker, SERVER_LIST, thread_number):
        results.append((rpc_url, ok))
    return results
def main(lookFor, jobTitle, company, tag): employerHeaderPageId = 1 questionTextPageId = 0 g = Grab() g.go(p(lookFor, jobTitle, company, tag, employerHeaderPageId)) employerHeader = g.xpath('//h1').text_content() f = open('Glassdoor.com ' + employerHeader + '.txt', 'w') f.write(smart_str(employerHeader) + ':\n') while True: g = Grab() questionTextPageId += 1 g.go(p(lookFor, jobTitle, company, tag, questionTextPageId)) if int(g.xpath('//li[@class="currPage"]').text) <= ( questionTextPageId - 1): print 'Finished at page: ' + g.xpath( '//li[@class="currPage"]').text + '!' break for questionText in g.xpath_list('//p[@class="questionText"]'): f.write(smart_str(questionText.text_content().strip()) + '\n') print 'Page # ' + g.xpath('//li[@class="currPage"]').text + ' parsed!'
def save_list(self, list_name, path):
    """ Save items from list to the file.

    Each item is written on its own line: plain strings are
    byte-encoded as-is, anything else is serialized to JSON.
    """
    with open(path, 'w') as out:
        serialized = [
            smart_str(entry) if isinstance(entry, basestring)
            else json.dumps(entry)
            for entry in self.items.get(list_name, [])
        ]
        out.write('\n'.join(serialized))
def save_list(self, list_name, path):
    """ Save items from list to the file.

    Writes the named list newline-separated: string items go through
    smart_str, non-string items are stored as JSON documents.
    """
    def encode(value):
        # Strings are written verbatim (byte-encoded); everything
        # else becomes a JSON line.
        if isinstance(value, basestring):
            return smart_str(value)
        return json.dumps(value)

    with open(path, "w") as handle:
        handle.write("\n".join(
            encode(v) for v in self.items.get(list_name, [])))
def main(tag): pageId = 0 f = open(tag + '.txt', 'w') f.write(tag + ':\n' ) while True: g = Grab() g.setup(timeout=60, connect_timeout=60) pageId += 1 g.go(p(tag, pageId)) v1 = g.xpath_text('//title') v2 = unicode("Хабрахабр — страница не найдена (404)", 'utf-8') if v1 == v2: print 'Finished at page: ' + str(pageId) + '!' break for questionText in g.xpath_list('//a[@class="post_title"]'): f.write(smart_str(questionText.text_content().strip()) + '\n') print 'Page # ' + str(pageId) + ' parsed!'
def build_search_url(query, page=1, per_page=None, lang='en', filter=True,
                     region=213, **kwargs):
    """ Build yandex search url with specified query and pagination options.

    :param per_page: 10, 20, 30, 50, 100

    213 region is Moscow
    """
    encoded = quote(smart_str(query))
    pieces = ['http://yandex.ru/yandsearch?text=%s&lr=%s' % (encoded, region)]
    if kwargs:
        pieces.append('&' + urlencode(kwargs))
    # Yandex pagination is zero-based.
    pieces.append('&p=%d' % (page - 1))
    return ''.join(pieces)
def main(tag): pageId = 0 f = open(tag + '.txt', 'w') f.write(tag + ':\n') while True: g = Grab() g.setup(timeout=60, connect_timeout=60) pageId += 1 g.go(p(tag, pageId)) v1 = g.xpath_text('//title') v2 = unicode("Хабрахабр — страница не найдена (404)", 'utf-8') if v1 == v2: print 'Finished at page: ' + str(pageId) + '!' break for questionText in g.xpath_list('//a[@class="post_title"]'): f.write(smart_str(questionText.text_content().strip()) + '\n') print 'Page # ' + str(pageId) + ' parsed!'
def build_search_url(query, page=None, per_page=None, lang=None,
                     filter=None, **kwargs):
    """ Build google search url with specified query and pagination options.

    :param per_page: 10, 20, 30, 50, 100

    kwargs:
        tbs=qdr:h
        tbs=qdr:d
        tbs=qdr:w
        tbs=qdr:m
        tbs=qdr:y
    """
    # Resolve defaults for omitted options.
    per_page = 10 if per_page is None else per_page
    page = 1 if page is None else page
    lang = 'en' if lang is None else lang
    filter = True if filter is None else filter

    # Explicit kwargs win over the derived values.
    kwargs.setdefault('hl', lang)
    kwargs.setdefault('num', per_page)
    kwargs.setdefault('start', per_page * (page - 1))
    if 'filter' not in kwargs and not filter:
        kwargs['filter'] = '0'

    url = 'http://google.com/search?q=%s' % quote(smart_str(query))
    if kwargs:
        url += '&' + urlencode(kwargs)
    return url
def build_search_url(query, page=1, per_page=None, lang='en', filter=True,
                     region=213, **kwargs):
    """ Build yandex search url with specified query and pagination options.

    :param per_page: 10, 20, 30, 50, 100

    213 region is Moscow
    """
    url = 'http://yandex.ru/yandsearch?text=%s&lr=%s' % (
        quote(smart_str(query)), region)
    if kwargs:
        url = url + '&' + urlencode(kwargs)
    # The 'p' parameter counts pages from zero.
    return url + '&p=%d' % (page - 1)
def build_search_url(query, page=None, per_page=None, lang=None,
                     filter=None, **kwargs):
    """ Build google search url with specified query and pagination options.

    :param per_page: 10, 20, 30, 50, 100

    kwargs:
        tbs=qdr:h
        tbs=qdr:d
        tbs=qdr:w
        tbs=qdr:m
        tbs=qdr:y
    """
    if page is None:
        page = 1
    if per_page is None:
        per_page = 10
    if lang is None:
        lang = "en"
    if filter is None:
        filter = True

    # Derived query parameters; caller-supplied kwargs take precedence.
    derived = {
        "hl": lang,
        "num": per_page,
        "start": per_page * (page - 1),
    }
    for key, value in derived.items():
        if key not in kwargs:
            kwargs[key] = value
    if "filter" not in kwargs and not filter:
        kwargs["filter"] = "0"

    url = "http://google.com/search?q=%s" % quote(smart_str(query))
    if kwargs:
        url += "&" + urlencode(kwargs)
    return url
def get_url(url):
    """Shorten *url* via the clck.ru service and return the response body."""
    shortener = Grab()
    shortener.go('http://clck.ru/--?url=%s' % quote(smart_str(url)))
    return shortener.response.body
def build_query_url(query):
    """Return the suggest URL for *query* (URL-encoded byte string)."""
    return BASE_SUGGEST_URL % {
        'query': urllib.quote_plus(smart_str(query)),
    }
def quote(data):
    """URL-quote *data* after byte-encoding it (spaces become ``+``)."""
    encoded = smart_str(data)
    return urllib.quote_plus(encoded)
def process_config(self, grab):
    """ Setup curl instance with values from ``self.config``.

    Translates the grab config dict into pycurl ``setopt`` calls:
    URL, redirects, timeouts, body destination, user agent, SSL,
    request method/payload, headers, cookies, proxy and auth.
    """
    # Copy some config for future usage
    # (read later by the head/body processor callbacks).
    self.config_nobody = grab.config['nobody']
    self.config_body_maxsize = grab.config['body_maxsize']

    try:
        request_url = normalize_url(grab.config['url'])
    except Exception as ex:
        raise error.GrabInvalidUrl(
            u'%s: %s' % (unicode(ex), grab.config['url']))

    # py3 hack: on python 2 pycurl needs a byte-string URL
    if not PY3K:
        request_url = smart_str(request_url)
    self.curl.setopt(pycurl.URL, request_url)

    # Redirects, timeouts and DNS resolution.
    self.curl.setopt(pycurl.FOLLOWLOCATION,
                     1 if grab.config['follow_location'] else 0)
    self.curl.setopt(pycurl.MAXREDIRS, grab.config['redirect_limit'])
    self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config['connect_timeout'])
    self.curl.setopt(pycurl.TIMEOUT, grab.config['timeout'])
    self.curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
    #self.curl.setopt(pycurl.DNS_CACHE_TIMEOUT, 0)

    if not grab.config['connection_reuse']:
        self.curl.setopt(pycurl.FRESH_CONNECT, 1)
        self.curl.setopt(pycurl.FORBID_REUSE, 1)

    self.curl.setopt(pycurl.NOSIGNAL, 1)
    self.curl.setopt(pycurl.HEADERFUNCTION, self.head_processor)

    # Response body destination: in-memory buffer or a file on disk.
    if grab.config['body_inmemory']:
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)
    else:
        if not grab.config['body_storage_dir']:
            raise error.GrabMisuseError('Option body_storage_dir is not defined')
        self.setup_body_file(grab.config['body_storage_dir'],
                             grab.config['body_storage_filename'])
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)

    if grab.config['verbose_logging']:
        self.verbose_logging = True

    # User-Agent
    if grab.config['user_agent'] is None:
        if grab.config['user_agent_file'] is not None:
            with open(grab.config['user_agent_file']) as inf:
                lines = inf.read().splitlines()
            grab.config['user_agent'] = random.choice(lines)
        else:
            grab.config['user_agent'] = random_user_agent()

    # If value is None then set empty string
    # None is not acceptable because in such case
    # pycurl will set its default user agent "PycURL/x.xx.x"
    if not grab.config['user_agent']:
        grab.config['user_agent'] = ''
    self.curl.setopt(pycurl.USERAGENT, grab.config['user_agent'])

    if grab.config['debug']:
        self.curl.setopt(pycurl.VERBOSE, 1)
        self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)

    # Ignore SSL errors
    self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
    self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)

    # Disabled to avoid SSL3_READ_BYTES:sslv3 alert handshake failure error
    #self.curl.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)

    # Request method and payload.
    if grab.request_method == 'POST':
        self.curl.setopt(pycurl.POST, 1)
        if grab.config['multipart_post']:
            if isinstance(grab.config['multipart_post'], basestring):
                raise error.GrabMisuseError('multipart_post option could not be a string')
            post_items = normalize_http_values(grab.config['multipart_post'],
                                               charset=grab.config['charset'])
            # py3 hack
            if PY3K:
                post_items = decode_pairs(post_items, grab.config['charset'])
            #import pdb; pdb.set_trace()
            self.curl.setopt(pycurl.HTTPPOST, post_items)
        elif grab.config['post']:
            post_data = normalize_post_data(grab.config['post'],
                                            grab.config['charset'])
            # py3 hack
            #if PY3K:
            #    post_data = smart_unicode(post_data, grab.config['charset'])
            self.curl.setopt(pycurl.COPYPOSTFIELDS, post_data)
        else:
            self.curl.setopt(pycurl.POSTFIELDS, '')
    elif grab.request_method == 'PUT':
        data = grab.config['post']
        if isinstance(data, unicode) or (not PY3K and
                                         not isinstance(data, basestring)):
            # py3 hack
            #if PY3K:
            #    data = data.encode('utf-8')
            #else:
            raise error.GrabMisuseError('Value of post option could be only '
                                        'byte string if PUT method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'PATCH':
        data = grab.config['post']
        # NOTE(review): unlike the PUT branch this encodes unicode on
        # py3 and only raises on py2 -- confirm this asymmetry is
        # intended.
        if isinstance(data, unicode) or not isinstance(data, basestring):
            # py3 hack
            if PY3K:
                data = data.encode('utf-8')
            else:
                raise error.GrabMisuseError('Value of post option could be only byte '
                                            'string if PATCH method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'PATCH')
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'DELETE':
        # NOTE(review): lowercase 'delete' while PATCH is uppercase;
        # HTTP methods are case-sensitive -- confirm intended.
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'delete')
    elif grab.request_method == 'HEAD':
        self.curl.setopt(pycurl.NOBODY, 1)
    elif grab.request_method == 'UPLOAD':
        self.curl.setopt(pycurl.UPLOAD, 1)
    elif grab.request_method == 'GET':
        self.curl.setopt(pycurl.HTTPGET, 1)
    else:
        raise error.GrabMisuseError('Invalid method: %s' % grab.request_method)

    # Merge per-request headers over the common set.
    # NOTE(review): this mutates grab.config['common_headers'] in place.
    headers = grab.config['common_headers']
    if grab.config['headers']:
        headers.update(grab.config['headers'])
    header_tuples = [str('%s: %s' % x) for x
                     in headers.items()]
    self.curl.setopt(pycurl.HTTPHEADER, header_tuples)

    self.process_cookie_options(grab, request_url)

    if grab.config['referer']:
        self.curl.setopt(pycurl.REFERER, str(grab.config['referer']))

    # Proxy setup; the empty string clears any previously set proxy on
    # a reused curl handle.
    if grab.config['proxy']:
        self.curl.setopt(pycurl.PROXY, str(grab.config['proxy']))
    else:
        self.curl.setopt(pycurl.PROXY, '')

    if grab.config['proxy_userpwd']:
        self.curl.setopt(pycurl.PROXYUSERPWD, str(grab.config['proxy_userpwd']))

    if grab.config['proxy_type']:
        ptype = getattr(pycurl,
                        'PROXYTYPE_%s' % grab.config['proxy_type'].upper())
        self.curl.setopt(pycurl.PROXYTYPE, ptype)

    if grab.config['encoding']:
        if 'gzip' in grab.config['encoding'] and not 'zlib' in pycurl.version:
            raise error.GrabMisuseError('You can not use gzip encoding because '
                                        'pycurl was built without zlib support')
        self.curl.setopt(pycurl.ENCODING, grab.config['encoding'])

    if grab.config['userpwd']:
        self.curl.setopt(pycurl.USERPWD, str(grab.config['userpwd']))

    if grab.config.get('interface') is not None:
        self.curl.setopt(pycurl.INTERFACE, grab.config['interface'])

    if grab.config.get('reject_file_size') is not None:
        self.curl.setopt(pycurl.MAXFILESIZE, grab.config['reject_file_size'])
def process_config(self, grab):
    """ Setup curl instance with values from ``self.config``.

    Applies every transport-level option from the grab config to the
    underlying pycurl handle: URL, redirect/timeout limits, body
    sink, user agent, SSL, method + payload, headers, cookies, proxy
    and authentication.
    """
    # Copy some config for future usage; the processor callbacks read
    # these during the transfer.
    self.config_nobody = grab.config['nobody']
    self.config_body_maxsize = grab.config['body_maxsize']

    try:
        request_url = normalize_url(grab.config['url'])
    except Exception as ex:
        raise error.GrabInvalidUrl(
            u'%s: %s' % (unicode(ex), grab.config['url']))

    # py3 hack: python 2 pycurl wants bytes for the URL
    if not PY3K:
        request_url = smart_str(request_url)
    self.curl.setopt(pycurl.URL, request_url)

    # Redirect, timeout and DNS behaviour.
    self.curl.setopt(pycurl.FOLLOWLOCATION,
                     1 if grab.config['follow_location'] else 0)
    self.curl.setopt(pycurl.MAXREDIRS, grab.config['redirect_limit'])
    self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config['connect_timeout'])
    self.curl.setopt(pycurl.TIMEOUT, grab.config['timeout'])
    self.curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
    #self.curl.setopt(pycurl.DNS_CACHE_TIMEOUT, 0)

    if not grab.config['connection_reuse']:
        self.curl.setopt(pycurl.FRESH_CONNECT, 1)
        self.curl.setopt(pycurl.FORBID_REUSE, 1)

    self.curl.setopt(pycurl.NOSIGNAL, 1)
    self.curl.setopt(pycurl.HEADERFUNCTION, self.head_processor)

    # Where the response body goes: memory buffer or file storage.
    if grab.config['body_inmemory']:
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)
    else:
        if not grab.config['body_storage_dir']:
            raise error.GrabMisuseError(
                'Option body_storage_dir is not defined')
        self.setup_body_file(grab.config['body_storage_dir'],
                             grab.config['body_storage_filename'])
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)

    if grab.config['verbose_logging']:
        self.verbose_logging = True

    # User-Agent
    if grab.config['user_agent'] is None:
        if grab.config['user_agent_file'] is not None:
            with open(grab.config['user_agent_file']) as inf:
                lines = inf.read().splitlines()
            grab.config['user_agent'] = random.choice(lines)
        else:
            grab.config['user_agent'] = random_user_agent()

    # If value is None then set empty string
    # None is not acceptable because in such case
    # pycurl will set its default user agent "PycURL/x.xx.x"
    if not grab.config['user_agent']:
        grab.config['user_agent'] = ''
    self.curl.setopt(pycurl.USERAGENT, grab.config['user_agent'])

    if grab.config['debug']:
        self.curl.setopt(pycurl.VERBOSE, 1)
        self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)

    # Ignore SSL errors
    self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
    self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)

    # Disabled to avoid SSL3_READ_BYTES:sslv3 alert handshake failure error
    #self.curl.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)

    # Configure the HTTP method and attach the payload when present.
    if grab.request_method == 'POST':
        self.curl.setopt(pycurl.POST, 1)
        if grab.config['multipart_post']:
            if isinstance(grab.config['multipart_post'], basestring):
                raise error.GrabMisuseError(
                    'multipart_post option could not be a string')
            post_items = normalize_http_values(
                grab.config['multipart_post'],
                charset=grab.config['charset'])
            # py3 hack
            if PY3K:
                post_items = decode_pairs(post_items,
                                          grab.config['charset'])
            #import pdb; pdb.set_trace()
            self.curl.setopt(pycurl.HTTPPOST, post_items)
        elif grab.config['post']:
            post_data = normalize_post_data(grab.config['post'],
                                            grab.config['charset'])
            # py3 hack
            #if PY3K:
            #    post_data = smart_unicode(post_data, grab.config['charset'])
            self.curl.setopt(pycurl.COPYPOSTFIELDS, post_data)
        else:
            self.curl.setopt(pycurl.POSTFIELDS, '')
    elif grab.request_method == 'PUT':
        data = grab.config['post']
        if isinstance(data, unicode) or (not PY3K and
                                         not isinstance(data, basestring)):
            # py3 hack
            #if PY3K:
            #    data = data.encode('utf-8')
            #else:
            raise error.GrabMisuseError('Value of post option could be only '
                                        'byte string if PUT method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'PATCH':
        data = grab.config['post']
        # NOTE(review): PATCH encodes unicode on py3 but raises on py2,
        # the opposite of the PUT branch above -- verify intended.
        if isinstance(data, unicode) or not isinstance(data, basestring):
            # py3 hack
            if PY3K:
                data = data.encode('utf-8')
            else:
                raise error.GrabMisuseError('Value of post option could be only byte '
                                            'string if PATCH method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'PATCH')
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'DELETE':
        # NOTE(review): method string is lowercase here; HTTP method
        # names are case-sensitive -- confirm servers accept it.
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'delete')
    elif grab.request_method == 'HEAD':
        self.curl.setopt(pycurl.NOBODY, 1)
    elif grab.request_method == 'UPLOAD':
        self.curl.setopt(pycurl.UPLOAD, 1)
    elif grab.request_method == 'GET':
        self.curl.setopt(pycurl.HTTPGET, 1)
    else:
        raise error.GrabMisuseError('Invalid method: %s' % grab.request_method)

    # Per-request headers override the common header set.
    # NOTE(review): update() mutates the shared common_headers dict.
    headers = grab.config['common_headers']
    if grab.config['headers']:
        headers.update(grab.config['headers'])
    header_tuples = [str('%s: %s' % x) for x
                     in headers.items()]
    self.curl.setopt(pycurl.HTTPHEADER, header_tuples)

    self.process_cookie_options(grab, request_url)

    if grab.config['referer']:
        self.curl.setopt(pycurl.REFERER, str(grab.config['referer']))

    # Proxy configuration; empty string resets a proxy left over from
    # a previous request on a reused handle.
    if grab.config['proxy']:
        self.curl.setopt(pycurl.PROXY, str(grab.config['proxy']))
    else:
        self.curl.setopt(pycurl.PROXY, '')

    if grab.config['proxy_userpwd']:
        self.curl.setopt(pycurl.PROXYUSERPWD, str(grab.config['proxy_userpwd']))

    if grab.config['proxy_type']:
        ptype = getattr(pycurl,
                        'PROXYTYPE_%s' % grab.config['proxy_type'].upper())
        self.curl.setopt(pycurl.PROXYTYPE, ptype)

    if grab.config['encoding']:
        if 'gzip' in grab.config[
                'encoding'] and not 'zlib' in pycurl.version:
            raise error.GrabMisuseError('You can not use gzip encoding because '
                                        'pycurl was built without zlib support')
        self.curl.setopt(pycurl.ENCODING, grab.config['encoding'])

    if grab.config['userpwd']:
        self.curl.setopt(pycurl.USERPWD, str(grab.config['userpwd']))

    if grab.config.get('interface') is not None:
        self.curl.setopt(pycurl.INTERFACE, grab.config['interface'])

    if grab.config.get('reject_file_size') is not None:
        self.curl.setopt(pycurl.MAXFILESIZE, grab.config['reject_file_size'])