def run(self):
    while self.alive:
        time.sleep(1)
        walk_url = TaskQueue.getInstance().get()
        if not walk_url:
            self.emptycount = self.emptycount - 1
            if self.emptycount == 0:
                return
            continue
        self.emptycount = self.initempty
        _level = walk_url[1]
        _url = walk_url[0]
        if _level > self.level:
            continue
        if UrlPool.getInstance().exist(_url):
            continue
        UrlPool.getInstance().put_url(_url)
        try:
            html = self.down_load_html(_url, coding="gb2312")
        except Exception, e:
            print e
            continue
        proxydata = self.proxy_parser(html)
        _level = _level + 1
        if len(proxydata):
            ProxyData.getInstance().put(proxydata)
            _level = 0
        link_list = self.get_link(html, _level, _url)
        for link in link_list:
            if not UrlPool.getInstance().exist(link[0]):
                TaskQueue.getInstance().put(link)
class NewsCrawlerAsync:
    def __init__(self, name):
        self._workers = 0
        self._workers_max = 5
        self.logger = fn.init_file_logger(name + '.log')
        self.urlpool = UrlPool(name)
        self.loop = asyncio.get_event_loop()
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.db = motor.motor_asyncio.AsyncIOMotorClient(config.MONGO_URI)["haodaifu"]

    async def load_hubs(self,):
        data = self.db.doctor_para.find({"status": {"$ne": True}})
        urls = []
        async for d in data:
            urls.append("{}&{}".format(d['faculty'], d["page"]))
        self.urlpool.addmany(urls)

    async def process(self, url):
        para = {"faculty": url.split("&")[0], "page": url.split("&")[1]}
        status, html = await fn.fetch(self.session, para)
        self.urlpool.set_status(url, status)
        if status != 200:
            self._workers -= 1  # release the worker slot on failed fetches too
            return
        await self.db.doctors.insert_one(html)
        await self.db.doctor_para.update_one(para, {"$set": {"status": True}})
        self._workers -= 1

    async def loop_crawl(self):
        await self.load_hubs()
        last_rating_time = time.time()
        counter = 0
        while True:
            tasks = self.urlpool.db.pop_from_redis(self._workers_max)
            if not tasks:
                print('no url to crawl, sleep 10S')
                await asyncio.sleep(10)
                continue
            for url in tasks:
                self._workers += 1
                counter += 1
                print('crawl:', url, self._workers, counter)
                asyncio.ensure_future(self.process(url))
            gap = time.time() - last_rating_time
            if gap > 5:
                rate = counter / gap
                print('\tloop_crawl() rate:%s, counter: %s, workers: %s' % (
                    round(rate, 2), counter, self._workers))
                last_rating_time = time.time()
                counter = 0
            if self._workers >= self._workers_max:
                print('====== got workers_max, sleep 3 sec to next worker =====')
                await asyncio.sleep(1)

    def run(self):
        try:
            self.loop.run_until_complete(self.loop_crawl())
        except KeyboardInterrupt:
            print('stopped by yourself!')
            del self.urlpool
            pass
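# Usage sketch for the class above (an assumption, not from the source):
# it relies on the same module-level names -- fn, config, UrlPool -- that the
# class itself imports, and 'haodaifu' is just an example crawler name.
if __name__ == '__main__':
    crawler = NewsCrawlerAsync('haodaifu')
    crawler.run()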
def __init__(self):
    self._workers = 0
    self._workers_max = 5
    self.urlpool = UrlPool()
    self.loop = asyncio.get_event_loop()
    self.session = aiohttp.ClientSession(loop=self.loop)
    self.db = motor.motor_asyncio.AsyncIOMotorClient(
        config.MONGO_URI)["cninfo"]
def __init__(self, keyword=None):
    # validate the argument before touching the shared keyword list,
    # and avoid a mutable default argument
    if keyword is None:
        keyword = []
    if not isinstance(keyword, list):
        raise TypeError("KEYWORD_MUST_BE_LIST")
    self.keyword.extend(keyword)
    for _kw in keyword:
        searchword = self.keyword.pop()
        url = self.baidu_search(searchword)
        if not UrlPool.getInstance().exist(url):
            TaskQueue.getInstance().put((url, -10))
class XCrawler(object):
    '''index key-value: {url: state}, state:
        'task': the url is pending as a task
        'done': the url has been downloaded successfully
    '''

    def __init__(self, max_working=20, common_gap=20,
                 urlindex_file="", proxies_file=None,
                 span_of_host=3,
                 worker_conf_file='',
                 load_bad_url=None, logfile=''):
        self.proxypool = ProxyPool(common_gap, proxies_file)
        self.urlpool = UrlPool(urlindex_file,
                               load_bad_url=load_bad_url,
                               span_of_host=span_of_host,
                               is_good_link=self.is_good_link)
        self.max_working = max_working
        self.worker_conf_file = worker_conf_file
        self._workers = 0
        # you can customize your http header in init_urlpool()
        self.headers = None
        self._http_exception_code = 900
        if logfile:
            self.logger = init_file_logger(logfile)
        else:
            self.logger = logging.getLogger('xcrawler')

    def _worker(self, url):
        '''do a task'''
        try:
            self.logger.info('start a worker: [%s]' % self._workers)
            proxy, status_code, html, url_real = self.downloader(url)
            if not proxy and status_code == self._http_exception_code:
                status_code, html = self.special_downloader(url)
            if status_code == 200:
                new_urls = self.processor(url_real, html)
                self.urlpool.set_url_done(url)
                self.urlpool.set_url_done(url_real)
                if new_urls:
                    self.urlpool.addmany(new_urls, self.is_good_link)
            else:
                self.logger.error('%sfailed download: %s, [%s]%s' % (
                    RED, url, status_code, NOR,
                ))
                if proxy:
                    self.urlpool.set_url_404(url)
                    self.urlpool.add(url)
                elif (status_code == self._http_exception_code or
                        status_code >= 400):
                    # don't try more if no proxy
                    self.urlpool.set_url_bad(url)
                else:
                    self.urlpool.add(url)
        except:
            traceback.print_exc()
        self._workers -= 1

    def dynamic_max_working(self,):
        try:
            ns = open(self.worker_conf_file).read()
            ns = int(ns)
            self.max_working = ns
        except:
            pass
        msg = '%sset max_working to [%s]. %sworkers:[%s]%s' % (
            BRO, self.max_working, GRE, self._workers, NOR)
        print msg

    def start(self):
        self.init_urlpool()
        spawn(self.main_parallel_task_loop)
        self.dynamic_max_working()
        while 1:
            print '%sworkers left: %s%s' % (GRE, self._workers, NOR)
            self.dynamic_max_working()
            #if self._workers >= self.max_working:
            #    gevent.sleep(2)
            #    continue
            for i in xrange(self.max_working):
                url = self.urlpool.pop()
                if not url:
                    break
                spawn(self._worker, url)
                self._workers += 1
                #print 'start worker: ', self._workers
                if self._workers >= self.max_working:
                    break
            # wait for workers to start
            gevent.sleep(3)

    def main_parallel_task_loop(self,):
        '''define the task to do in a main-parallel loop'''
        return

    def is_ip_blocked(self, url, html):
        '''find ip-blocked info in redirected url or html'''
        return False

    def is_good_link(self, url):
        '''filter urls which you don't want to download; re-implement if needed'''
        return True

    def init_urlpool(self, urls=None):
        '''init url pool with urls; re-implement your own if needed'''
        pass

    def special_downloader(self, url, timeout=20):
        '''define a supplement to self.downloader(),
        e.g. use a special proxy to retry on Exception in self.downloader()
        '''
        return (self._http_exception_code, '')

    def downloader(self, url, timeout=20):
        '''download url to get html; re-implement your own if needed'''
        if not self.headers:
            headers = {
                'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; '
                               'Windows NT 6.1; Win64; x64; Trident/5.0)'),
            }
        else:
            headers = self.headers
        proxy, to_sleep = self.proxypool.get(url)
        if to_sleep > 10:
            print ('url: %s, proxy: %s, to_sleep: %s' % (url, proxy, to_sleep))
        status_code = self._http_exception_code
        html = ''
        url_real = url
        try:
            self.logger.debug('%scrawl @[%s]-[%s]%s' % (BLU, time.ctime(), url, NOR))
            if to_sleep:
                gevent.sleep(to_sleep)
            if proxy:
                timeout = 25
                r = requests.get(url, headers=headers, timeout=timeout,
                                 proxies=proxy)
            else:
                r = requests.get(url, headers=headers, timeout=timeout)
            html = r.content
            url_real = r.url.encode('utf8')  # get the redirected url
            status_code = r.status_code
            if self.is_ip_blocked(r.url, html):
                html = ''
                status_code = 400
                self.proxypool._pool.remove(proxy)
                print '%sremove proxy: %s, pool size: %s%s' % (
                    BRO, str(proxy), len(self.proxypool._pool), NOR)
        except:
            traceback.print_exc()
            html = ''
        #if status_code == 200:
        #    self.proxypool.record_proxy_state(proxy, self.proxypool.SUCCESS)
        #else:
        #    #print status_code, url, len(html)
        #    self.proxypool.record_proxy_state(proxy, self.proxypool.FAILED)
        return (proxy, status_code, html, url_real)

    def processor(self, url, html):
        '''process the html from downloader,
        e.g. extract URL, title, content and other info,
        and save the extracted info to DB
        '''
        new_urls = []
        return new_urls
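# XCrawler is meant to be specialized by overriding the hook methods its
# docstrings point at (is_good_link, init_urlpool, processor, ...). A hedged
# sketch only; the subclass name, seed url and host filter are hypothetical:
class ExampleSiteCrawler(XCrawler):
    def is_good_link(self, url):
        # only keep links on the target host
        return 'www.example.com' in url

    def init_urlpool(self, urls=None):
        # seed the pool with a start page
        self.urlpool.addmany(['http://www.example.com/'], self.is_good_link)

    def processor(self, url, html):
        # parse html here (extract title/content, save to DB) and
        # return any newly discovered links
        new_urls = []
        return new_urls

#ExampleSiteCrawler(max_working=10).start()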
class NewsCrawlerSync:
    def __init__(self, name):
        self.db = Connection(config.db_host, config.db_db,
                             config.db_user, config.db_password)
        self.logger = fn.init_file_logger(name + '.log')
        self.urlpool = UrlPool(name)
        self.hub_hosts = None
        self.load_hubs()

    def load_hubs(self,):
        sql = 'select url from crawler_hub'
        data = self.db.query(sql)
        self.hub_hosts = set()
        hubs = []
        for d in data:
            host = urlparse.urlparse(d['url']).netloc
            self.hub_hosts.add(host)
            hubs.append(d['url'])
        self.urlpool.set_hubs(hubs, 300)

    def save_to_db(self, url, html):
        urlhash = farmhash.hash64(url)
        sql = 'select url from crawler_html where urlhash=%s'
        d = self.db.get(sql, urlhash)
        if d:
            if d['url'] != url:
                msg = 'farmhash collision: %s <=> %s' % (url, d['url'])
                self.logger.error(msg)
            return True
        if isinstance(html, str):
            html = html.encode('utf8')
        html_lzma = lzma.compress(html)
        sql = ('insert into crawler_html(urlhash, url, html_lzma) '
               'values(%s, %s, %s)')
        good = False
        try:
            self.db.execute(sql, urlhash, url, html_lzma)
            good = True
        except Exception as e:
            if e.args[0] == 1062:
                # Duplicate entry
                good = True
                pass
            else:
                traceback.print_exc()
                raise e
        return good

    def filter_good(self, urls):
        goodlinks = []
        for url in urls:
            host = urlparse.urlparse(url).netloc
            if host in self.hub_hosts:
                goodlinks.append(url)
        return goodlinks

    def process(self, url, ishub):
        status, html, redirected_url = fn.downloader(url)
        self.urlpool.set_status(url, status)
        if redirected_url != url:
            self.urlpool.set_status(redirected_url, status)
        # extract the links from hub pages; news pages also carry
        # "related news" links, extract those as needed
        if status != 200:
            return
        if ishub:
            newlinks = fn.extract_links_re(redirected_url, html)
            goodlinks = self.filter_good(newlinks)
            print("%s/%s, goodlinks/newlinks" % (len(goodlinks), len(newlinks)))
            self.urlpool.addmany(goodlinks)
        else:
            self.save_to_db(redirected_url, html)

    def run(self,):
        while 1:
            urls = self.urlpool.pop(5)
            for url, ishub in urls.items():
                self.process(url, ishub)
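# save_to_db() above keys the crawler_html table by the 64-bit farmhash of the
# url and relies on MySQL error 1062 (duplicate entry) for deduplication. A
# possible schema matching those statements -- only the column names come from
# the SQL above, the column types and engine are assumptions:
CREATE_CRAWLER_HTML = '''
CREATE TABLE IF NOT EXISTS crawler_html (
    urlhash BIGINT UNSIGNED NOT NULL PRIMARY KEY,
    url VARCHAR(1024) NOT NULL,
    html_lzma LONGBLOB NOT NULL
) ENGINE=InnoDB;
'''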
#!/usr/bin/env python3
# encoding: utf8
# author: veelion
# file: bee_server.py

from sanic import Sanic
from sanic import response

from urlpool import UrlPool

urlpool = UrlPool(__file__)

# initialize the urlpool; adjust to your own needs
hub_urls = []
urlpool.set_hubs(hub_urls, 300)
urlpool.add('https://news.sina.com.cn/')

# init
app = Sanic(__name__)


@app.listener('after_server_stop')
async def cache_urlpool(app, loop):
    global urlpool
    print('caching urlpool after_server_stop')
    del urlpool
    print('bye!')


@app.route('/task')
async def task_get(request):
    count = request.args.get('count', 10)
    # the source snippet breaks off at the `try:` below; the rest of the
    # handler is a minimal reconstruction, assuming UrlPool.pop(count)
    # returns the urls to hand out
    try:
        count = int(count)
    except ValueError:
        count = 10
    urls = urlpool.pop(count)
    return response.json(urls)
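# A distributed setup also needs the workers to report results back to the
# server. This is only a hedged sketch, not taken from the source: the POST
# route, the JSON field names ('url', 'status_code', 'newurls') and the port
# are assumptions; it merely reuses UrlPool.set_status() and UrlPool.add(),
# which the crawler classes in this section already call.
@app.route('/task', methods=['POST'])
async def task_post(request):
    result = request.json or {}
    urlpool.set_status(result['url'], result['status_code'])
    for url in result.get('newurls', []):
        urlpool.add(url)
    return response.text('ok')


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)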
class NewsCrawlerAsync:
    def __init__(self, name):
        self._workers = 0
        self._workers_max = 30
        self.logger = fn.init_file_logger(name + '.log')
        self.urlpool = UrlPool(name)
        self.loop = asyncio.get_event_loop()
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.db = sanicdb.SanicDB(config.db_host, config.db_db,
                                  config.db_user, config.db_password,
                                  loop=self.loop)

    async def load_hubs(self,):
        sql = 'select url from crawler_hub'
        data = await self.db.query(sql)
        self.hub_hosts = set()
        hubs = []
        for d in data:
            host = urlparse.urlparse(d['url']).netloc
            self.hub_hosts.add(host)
            hubs.append(d['url'])
        self.urlpool.set_hubs(hubs, 300)

    async def save_to_db(self, url, html):
        urlhash = farmhash.hash64(url)
        sql = 'select url from crawler_html where urlhash=%s'
        d = await self.db.get(sql, urlhash)
        if d:
            if d['url'] != url:
                msg = 'farmhash collision: %s <=> %s' % (url, d['url'])
                self.logger.error(msg)
            return True
        if isinstance(html, str):
            html = html.encode('utf8')
        html_lzma = lzma.compress(html)
        sql = ('insert into crawler_html(urlhash, url, html_lzma) '
               'values(%s, %s, %s)')
        good = False
        try:
            await self.db.execute(sql, urlhash, url, html_lzma)
            good = True
        except Exception as e:
            if e.args[0] == 1062:
                # Duplicate entry
                good = True
                pass
            else:
                traceback.print_exc()
                raise e
        return good

    def filter_good(self, urls):
        goodlinks = []
        for url in urls:
            host = urlparse.urlparse(url).netloc
            if host in self.hub_hosts:
                goodlinks.append(url)
        return goodlinks

    async def process(self, url, ishub):
        status, html, redirected_url = await fn.fetch(self.session, url)
        self.urlpool.set_status(url, status)
        if redirected_url != url:
            self.urlpool.set_status(redirected_url, status)
        # extract the links from hub pages; news pages also carry
        # "related news" links, extract those as needed
        if status != 200:
            self._workers -= 1  # release the worker slot on failed fetches too
            return
        if ishub:
            newlinks = fn.extract_links_re(redirected_url, html)
            goodlinks = self.filter_good(newlinks)
            print("%s/%s, goodlinks/newlinks" % (len(goodlinks), len(newlinks)))
            self.urlpool.addmany(goodlinks)
        else:
            await self.save_to_db(redirected_url, html)
        self._workers -= 1

    async def loop_crawl(self,):
        await self.load_hubs()
        last_rating_time = time.time()
        counter = 0
        while 1:
            tasks = self.urlpool.pop(self._workers_max)
            if not tasks:
                print('no url to crawl, sleep')
                await asyncio.sleep(3)
                continue
            for url, ishub in tasks.items():
                self._workers += 1
                counter += 1
                print('crawl:', url)
                asyncio.ensure_future(self.process(url, ishub))
            gap = time.time() - last_rating_time
            if gap > 5:
                rate = counter / gap
                print('\tloop_crawl() rate:%s, counter: %s, workers: %s' % (
                    round(rate, 2), counter, self._workers))
                last_rating_time = time.time()
                counter = 0
            if self._workers > self._workers_max:
                print('====== got workers_max, sleep 3 sec to next worker =====')
                await asyncio.sleep(3)

    def run(self):
        try:
            self.loop.run_until_complete(self.loop_crawl())
        except KeyboardInterrupt:
            print('stopped by yourself!')
            del self.urlpool
            pass
class XCrawler(object):
    '''index key-value: {url: state}, state:
        'task': the url is pending as a task
        'done': the url has been downloaded successfully
    '''

    def __init__(self, max_working=20, common_gap=20,
                 urlindex_file="", proxies_file=None,
                 span_of_host=3,
                 worker_conf_file='xworkers.conf',
                 load_bad_url=None, logfile=''):
        self.proxypool = ProxyPool(common_gap, proxies_file)
        self.urlpool = UrlPool(urlindex_file,
                               load_bad_url=load_bad_url,
                               span_of_host=span_of_host,
                               is_good_link=self.is_good_link)
        self.max_working = max_working
        self.worker_conf_file = worker_conf_file
        self._workers = 0
        # you can customize your http header in init_urlpool()
        self.headers = None
        self._http_exception_code = 900
        if logfile:
            self.logger = init_file_logger(logfile)
        else:
            self.logger = logging.getLogger('xcrawler')

    def _worker(self, url):
        '''do a task'''
        try:
            self.logger.info('start a worker: [%s]' % self._workers)
            proxy, status_code, html, url_real = self.downloader(url)
            if not proxy and status_code == self._http_exception_code:
                status_code, html = self.special_downloader(url)
            if status_code == 200:
                new_urls = self.processor(url_real, html)
                self.urlpool.set_url_done(url)
                self.urlpool.set_url_done(url_real)
                if new_urls:
                    self.urlpool.addmany(new_urls, self.is_good_link)
            else:
                self.logger.error('%sfailed download: %s, [%s]%s' % (
                    RED, url, status_code, NOR,
                ))
                if proxy:
                    self.urlpool.set_url_404(url)
                    self.urlpool.add(url)
                elif (status_code == self._http_exception_code or
                        status_code >= 400):
                    # don't try more if no proxy
                    self.urlpool.set_url_bad(url)
                else:
                    self.urlpool.add(url)
        except:
            traceback.print_exc()
        self._workers -= 1

    def dynamic_max_working(self,):
        try:
            ns = open(self.worker_conf_file).read()
            ns = int(ns)
            self.max_working = ns
        except:
            import os
            cmd = 'echo %s > %s' % (self.max_working, self.worker_conf_file)
            print '!!!!!! ', cmd
            os.system(cmd)
            pass
        msg = '%sset max_working to [%s]. %sworkers:[%s]%s' % (
            BRO, self.max_working, GRE, self._workers, NOR)
        print msg

    def start(self):
        self.init_urlpool()
        spawn(self.main_parallel_task_loop)
        self.dynamic_max_working()
        while 1:
            print '%sworkers left: %s%s' % (GRE, self._workers, NOR)
            self.dynamic_max_working()
            #if self._workers >= self.max_working:
            #    gevent.sleep(2)
            #    continue
            for i in xrange(self.max_working):
                if self._workers >= self.max_working:
                    break
                url = self.urlpool.pop()
                if not url:
                    break
                spawn(self._worker, url)
                self._workers += 1
                #print 'start worker: ', self._workers
            # wait for workers to start
            gevent.sleep(3)

    def main_parallel_task_loop(self,):
        '''define the task to do in a main-parallel loop'''
        return

    def is_ip_blocked(self, url, html):
        '''find ip-blocked info in redirected url or html'''
        return False

    def is_good_link(self, url):
        '''filter urls which you don't want to download; re-implement if needed'''
        return True

    def init_urlpool(self, urls=None):
        '''init url pool with urls; re-implement your own if needed'''
        pass

    def special_downloader(self, url, timeout=20):
        '''define a supplement to self.downloader(),
        e.g. use a special proxy to retry on Exception in self.downloader()
        '''
        return (self._http_exception_code, '')

    def downloader(self, url, timeout=20):
        '''download url to get html; re-implement your own if needed'''
        if not self.headers:
            headers = {
                'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; '
                               'Windows NT 6.1; Win64; x64; Trident/5.0)'),
            }
        else:
            headers = self.headers
        proxy, to_sleep = self.proxypool.get(url)
        if to_sleep > 10:
            print ('url: %s, proxy: %s, to_sleep: %s' % (url, proxy, to_sleep))
        status_code = self._http_exception_code
        html = ''
        url_real = url
        try:
            self.logger.debug('%scrawl @[%s]-[%s]%s' % (BLU, time.ctime(), url, NOR))
            if to_sleep:
                gevent.sleep(to_sleep)
            if proxy:
                timeout = 25
                r = requests.get(url, headers=headers, timeout=timeout,
                                 proxies=proxy)
            else:
                r = requests.get(url, headers=headers, timeout=timeout)
            html = r.content
            url_real = r.url.encode('utf8')  # get the redirected url
            status_code = r.status_code
            if self.is_ip_blocked(r.url, html):
                html = ''
                status_code = 400
                self.proxypool._pool.remove(proxy)
                print '%sremove proxy: %s, pool size: %s%s' % (
                    BRO, str(proxy), len(self.proxypool._pool), NOR)
        except:
            traceback.print_exc()
            html = ''
        #if status_code == 200:
        #    self.proxypool.record_proxy_state(proxy, self.proxypool.SUCCESS)
        #else:
        #    #print status_code, url, len(html)
        #    self.proxypool.record_proxy_state(proxy, self.proxypool.FAILED)
        return (proxy, status_code, html, url_real)

    def processor(self, url, html):
        '''process the html from downloader,
        e.g. extract URL, title, content and other info,
        and save the extracted info to DB
        '''
        new_urls = []
        return new_urls
class XCrawler(object):
    '''index key-value: {url: state}, state:
        'task': the url is pending as a task
        'done': the url has been downloaded successfully
    '''

    def __init__(self, max_working=20, common_gap=20,
                 urlindex_file="", proxies_file=None,
                 span_of_host=3,
                 max_in_mem=100000,
                 worker_conf_file='xworkers.conf',
                 load_bad_url=None, logfile=''):
        self.proxypool = ProxyPool(common_gap, proxies_file)
        self.urlpool = UrlPool(urlindex_file,
                               load_bad_url=load_bad_url,
                               span_of_host=span_of_host,
                               max_in_mem=max_in_mem,
                               is_good_link=self.is_good_link)
        self.max_working = max_working
        self.worker_conf_file = worker_conf_file
        self._workers = 0
        # you can customize your http header in init_urlpool()
        self.headers = None
        self._http_exception_code = 900
        if logfile:
            self.logger = init_file_logger(logfile)
        else:
            self.logger = logging.getLogger('xcrawler')
        self.failed_urls = {}
        # queue for high-priority urls to download;
        # start() will take urls from this queue first
        self.urlqueue = gevent.queue.Queue()

    def _worker(self, url):
        '''do a task'''
        try:
            self.logger.info('start a worker: [%s]' % self._workers)
            proxy, status_code, html, url_real = self.downloader(url)
            if not proxy and status_code == self._http_exception_code:
                status_code, html = self.special_downloader(url)
            if status_code == 200:
                new_urls = self.processor(url_real, html)
                self.urlpool.set_url_done(url)
                self.urlpool.set_url_done(url_real)
                if new_urls:
                    self.urlpool.addmany(new_urls)
            else:
                self.logger.info('%sfailed download: %s, [%s]%s' % (
                    RED, url, status_code, NOR,
                ))
                if proxy:
                    self.urlpool.set_url_404(url)
                    self.urlpool.add(url)
                elif (status_code == self._http_exception_code or
                        status_code >= 400):
                    # don't try more if no proxy
                    self.urlpool.set_url_bad(url)
                else:
                    t = self.failed_urls.get(url, 0)
                    if t == 0:
                        self.failed_urls[url] = 1
                        self.urlpool.add(url)
                    if t < 3:
                        self.failed_urls[url] += 1
                        self.urlpool.add(url)
                    if t >= 3:
                        self.urlpool.set_url_bad(url)
        except:
            traceback.print_exc()
        self._workers -= 1

    def dynamic_max_working(self,):
        changed = False
        try:
            ns = open(self.worker_conf_file).read()
            ns = int(ns)
            if ns != self.max_working:
                changed = True
                self.max_working = ns
            else:
                changed = False
        except:
            import os
            cmd = 'echo %s > %s' % (self.max_working, self.worker_conf_file)
            print '!!!!!! ', cmd
            os.system(cmd)
            pass
        if changed:
            msg = '%sset max_working to [%s]. %sworkers:[%s]%s' % (
                BRO, self.max_working, GRE, self._workers, NOR)
            print msg

    def start(self):
        self.init_urlpool()
        spawn(self.main_parallel_task_loop)
        self.dynamic_max_working()
        self.last_special_crawl = 0
        while 1:
            print '%sworkers left: %s%s' % (GRE, self._workers, NOR)
            self.dynamic_max_working()
            for i in xrange(self.max_working):
                if self._workers >= self.max_working:
                    gevent.sleep(10)
                    break
                try:
                    url = self.urlqueue.get_nowait()
                except:
                    url = self.urlpool.pop()
                gap = self.special_crawl_gap(url)
                skip_special = False
                if gap > 0:
                    to_sleep = gap - (time.time() - self.last_special_crawl)
                    if to_sleep > 0:
                        print '\tskip special:'
                        time.sleep(1)
                        self.urlpool.add(url, always=True)
                        skip_special = True
                    else:
                        self.last_special_crawl = time.time()
                if skip_special:
                    continue
                if not url:
                    break
                spawn(self._worker, url)
                self._workers += 1
            # wait for workers to start
            gevent.sleep(3)

    def main_parallel_task_loop(self,):
        '''define the task to do in a main-parallel loop'''
        return

    def special_crawl_gap(self, url):
        '''re-define if some urls need to sleep between crawls'''
        return 0

    def is_ip_blocked(self, url, html):
        '''find ip-blocked info in redirected url or html'''
        return False

    def is_good_link(self, url):
        '''filter urls which you don't want to download; re-implement if needed'''
        return True

    def init_urlpool(self, urls=None):
        '''init url pool with urls; re-implement your own if needed'''
        pass

    def special_downloader(self, url, timeout=20):
        '''define a supplement to self.downloader(),
        e.g. use a special proxy to retry on Exception in self.downloader()
        '''
        return (self._http_exception_code, '')

    def downloader(self, url, timeout=20):
        '''download url to get html; re-implement your own if needed'''
        if not self.headers:
            ua = ('Mozilla/5.0 (compatible; MSIE 9.0; '
                  'Windows NT 6.1; Win64; x64; Trident/5.0)')
            headers = {
                'User-Agent': ua,
            }
        else:
            headers = self.headers
        proxy, to_sleep = self.proxypool.get(url)
        if to_sleep > 10:
            print ('url: %s, proxy: %s, to_sleep: %s' % (url, proxy, to_sleep))
        status_code = self._http_exception_code
        html = ''
        url_real = url
        try:
            msg = '%scrawl @[%s]-[%s]%s' % (BLU, time.ctime(), url, NOR)
            self.logger.debug(msg)
            if to_sleep:
                gevent.sleep(to_sleep)
            if proxy:
                timeout = 25
                r = requests.get(url, headers=headers, timeout=timeout,
                                 proxies=proxy)
            else:
                r = requests.get(url, headers=headers, timeout=timeout)
            html = r.content
            url_real = r.url.encode('utf8')  # get the redirected url
            status_code = r.status_code
            if self.is_ip_blocked(r.url, html):
                html = ''
                status_code = 400
                self.proxypool._pool.remove(proxy)
                print '%sremove proxy: %s, pool size: %s%s' % (
                    BRO, str(proxy), len(self.proxypool._pool), NOR)
        except:
            # traceback.print_exc()
            html = ''
        return (proxy, status_code, html, url_real)

    def processor(self, url, html):
        '''process the html from downloader,
        e.g. extract URL, title, content and other info,
        and save the extracted info to DB
        '''
        new_urls = []
        return new_urls
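# This variant adds two hooks the earlier XCrawler versions lack: a
# special_crawl_gap() override for per-url throttling and self.urlqueue for
# high-priority urls. A hedged sketch of using both; the subclass name, host
# and 30-second gap are only examples, not from the source:
class ThrottledCrawler(XCrawler):
    def special_crawl_gap(self, url):
        # crawl urls of this (hypothetical) slow host at most once every 30s
        if url and 'slow.example.com' in url:
            return 30
        return 0

#crawler = ThrottledCrawler(max_working=10)
#crawler.urlqueue.put('http://slow.example.com/priority-page')  # jumps ahead of the pool
#crawler.start()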