Example #2

import logging
import queue

import gevent
import redis
import requests
from gevent.event import Event   # gevent primitives, matching gevent.spawn/sleep below
from gevent.pool import Pool

# Project-internal dependencies used below (their module paths are not
# shown in this snippet): settings, SpiderManager, Logger, Queue
# (Redis-backed), Dupefilter, Schedule, Request, class_import.


class Crawler(object):
    def __init__(self):
        self.concurrent_num = settings.CONCURRENT_NUM
        self.spider_pool = Pool(self.concurrent_num)
        self.queue_key = ''

        self.session = requests.Session()
        self.session.headers = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                                "Accept-Encoding": "gzip,deflate,sdch",
                                "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4",
                                "Cache-Control": "max-age=0",
                                "Connection": "keep-alive",
                                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"}

        self.redis = redis.Redis(host=settings.REDIS_HOST,
                                 port=settings.REDIS_PORT,
                                 db=settings.REDIS_DB)

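    # The Redis-backed Queue used below is project-internal; from its usage
    # in install() and run() it needs roughly this interface (a sketch of
    # the assumed contract, not the actual implementation):
    #
    #     class Queue(object):
    #         def __init__(self, key, server): ...  # Redis key + connection
    #         def put(self, request): ...           # enqueue a Request
    #         def qlen(self): ...                   # pending count, see _even()
    #         def __len__(self): ...                # pending count, see run()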
    def install(self, options):
        self.event = Event()
        sm = SpiderManager()
        self.spider_class = sm.create(options.spider)
        # Redis keys for the spider's request queue and schedule queue.
        spider_qk = "spider:q:%s" % self.spider_class.name
        schedule_qk = "schedule:q:%s" % self.spider_class.name

        self.log = Logger().getlog(level=logging.getLevelName(options.level))
        self.spider_queue = Queue(spider_qk, self.redis)
        self.schedule_queue = Queue(schedule_qk, self.redis)
        # Scraped items travel through an in-process queue to the pipelines.
        self.pipeline_queue = queue.Queue()

        # Request fingerprint filter, used to drop duplicate URLs.
        self.fp = Dupefilter.from_crawler(self)
        self.fp.open(self.spider_class.name)

        self.forever = options.forever
        self.running = False
        self.addition_urls = options.urls.split(',') if options.urls else []
        self.schedule = Schedule(self)

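    # do_pipeline() below assumes each entry in spider.pipelines is a dotted
    # class path whose class exposes a from_crawler() constructor and a
    # process() method returning the (possibly transformed) item. A sketch
    # of that contract (the name LogPipeline is hypothetical):
    #
    #     class LogPipeline(object):
    #         @classmethod
    #         def from_crawler(cls, crawler):
    #             return cls()
    #
    #         def process(self, obj):
    #             return obj  # must return an item for the next pipeline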
    def do_pipeline(self, spider):
        # Instantiate the spider's pipelines from their dotted class paths.
        pipelines = [class_import(ppclass).from_crawler(self)
                     for ppclass in spider.pipelines]
        while True:
            try:
                # A blocking get() would never raise queue.Empty, so poll
                # with get_nowait() and yield to other greenlets when idle.
                obj = self.pipeline_queue.get_nowait()
            except queue.Empty:
                gevent.sleep(1)
            else:
                # Each pipeline receives the previous pipeline's output.
                for ppl in pipelines:
                    obj = ppl.process(obj)

    def run(self):
        # Fresh start: seed the schedule queue with the start URLs.
        # Otherwise resume from whatever is still pending in Redis.
        if len(self.schedule_queue) == 0 and len(self.spider_queue) == 0:
            for url in set(self.spider_class.start_urls + self.addition_urls):
                req = Request(url)
                self.schedule_queue.put(req)
        else:
            self.log.info("Resuming crawl, schedule queue holds %d requests"
                          % len(self.schedule_queue))

        # Start one spider greenlet per concurrency slot.
        for i in range(self.concurrent_num):
            spd = self.spider_class.from_crawler(self)
            self.spider_pool.start(spd)

        gevent.spawn(self.schedule.start).join()
        self.log.info("Crawl finished.")

    def _even(self):
        # True while the crawl should continue: forever mode is on, either
        # queue still has pending requests, or the crawler is marked running.
        return (self.forever
                or self.spider_queue.qlen() > 0
                or self.schedule_queue.qlen() > 0
                or self.running)

    def stop(self):
        # Signal the spiders to shut down and close the duplicate filter.
        self.event.set()
        self.fp.close()
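
For context, here is a minimal driver sketch showing how the class above might be wired up. It assumes an argparse front end with the option names install() reads (spider, level, forever, urls) and that the project-internal settings module is importable; the defaults are illustrative only:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--spider", required=True)         # name passed to SpiderManager.create()
parser.add_argument("--level", default="INFO")         # logging level name
parser.add_argument("--forever", action="store_true")  # keep crawling when queues drain
parser.add_argument("--urls", default="")              # comma-separated extra start URLs

options = parser.parse_args()
crawler = Crawler()
crawler.install(options)
try:
    crawler.run()
finally:
    crawler.stop()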