import logging
import queue

import gevent
import redis
import requests
from gevent.event import Event
from gevent.pool import Pool

# Project-local names used below (settings, SpiderManager, Logger, Queue,
# Dupefilter, Schedule, Request, class_import) are imported from elsewhere
# in this repository; their module paths are not shown here.


class Crawler(object):
    def __init__(self):
        self.concurrent_num = settings.CONCURRENT_NUM
        # Pool of greenlets that run the spiders concurrently.
        self.spider_pool = Pool(self.concurrent_num)
        self.queue_key = ''
        self.session = requests.Session()
        self.session.headers = {
            # The original dict leaked part of the Accept value into the key
            # ("Accept:text/html"); merged back into a single header.
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip,deflate,sdch",
            "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "User-Agent": ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                           "(KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 "
                           "Chrome/36.0.1985.125 Safari/537.36"),
        }
        self.redis = redis.Redis(host=settings.REDIS_HOST,
                                 port=settings.REDIS_PORT,
                                 db=settings.REDIS_DB)

    def install(self, options):
        self.event = Event()
        sm = SpiderManager()
        self.spider_class = sm.create(options.spider)
        spider_qk = "spider:q:%s" % self.spider_class.name
        schedule_qk = "schedule:q:%s" % self.spider_class.name
        self.log = Logger().getlog(level=logging.getLevelName(options.level))
        # Redis-backed queues shared between the scheduler and the spiders.
        self.spider_queue = Queue(spider_qk, self.redis)
        self.schedule_queue = Queue(schedule_qk, self.redis)
        # In-process queue feeding scraped items into the pipelines.
        self.pipeline_queue = queue.Queue()
        self.fp = Dupefilter.from_crawler(self)
        self.fp.open(self.spider_class.name)
        self.forever = options.forever
        self.running = False
        if options.urls:
            self.addition_urls = options.urls.split(',')
        else:
            self.addition_urls = []
        self.schedule = Schedule(self)

    def do_pipeline(self, spider):
        pipelines = [class_import(ppclass).from_crawler(self)
                     for ppclass in spider.pipelines]
        while True:
            try:
                # A blocking get() never raises queue.Empty, so the original
                # except branch was dead code; use get_nowait() and yield to
                # other greenlets while the queue is empty.
                obj = self.pipeline_queue.get_nowait()
            except queue.Empty:
                gevent.sleep(1)
            else:
                # Each pipeline stage receives the previous stage's output.
                for ppl in pipelines:
                    obj = ppl.process(obj)

    def run(self):
        if len(self.schedule_queue) == 0 and len(self.spider_queue) == 0:
            # Fresh crawl: seed the scheduler with the start URLs.
            for url in set(self.spider_class.start_urls + self.addition_urls):
                req = Request(url)
                self.schedule_queue.put(req)
        else:
            self.log.info("Continuing crawl; schedule queue has %d pending requests"
                          % len(self.schedule_queue))
        for i in range(self.concurrent_num):  # xrange is Python 2 only
            spd = self.spider_class.from_crawler(self)
            self.spider_pool.start(spd)
        gevent.spawn(self.schedule.start).join()
        self.log.info("Crawl finished.")

    def _even(self):
        # Keep going while running forever, while either queue still holds
        # work, or while a spider is mid-request.
        return self.forever \
            or (self.spider_queue.qlen() > 0 or self.schedule_queue.qlen() > 0) \
            or self.running

    def stop(self):
        self.event.set()
        self.fp.close()
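# Usage sketch (an assumption, not part of the original module): one plausible
# command-line entry point wiring the class above together. The option names
# (spider, level, forever, urls) mirror the attributes install() reads;
# argparse here is purely illustrative.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Run a spider")
    parser.add_argument("--spider", required=True,
                        help="name of the spider registered with SpiderManager")
    parser.add_argument("--level", default="INFO",
                        help="logging level name, e.g. DEBUG or INFO")
    parser.add_argument("--forever", action="store_true",
                        help="keep the scheduler alive after the queues drain")
    parser.add_argument("--urls", default="",
                        help="comma-separated extra start URLs")
    options = parser.parse_args()

    crawler = Crawler()
    crawler.install(options)
    try:
        crawler.run()
    except KeyboardInterrupt:
        crawler.stop()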