Example #1
    def from_crawler(cls, crawler):
        settings = crawler.settings
        connection_url = settings.get("RABBITMQ_CONNECTION_PARAMETERS")
        queue_class = load_object(settings.get("SCHEDULER_QUEUE_CLASS"))
        dupefilter_cls = load_object(settings["DUPEFILTER_CLASS"])
        dupefilter = create_instance(dupefilter_cls, settings, crawler)
        pqclass = load_object(settings["SCHEDULER_PRIORITY_QUEUE"])
        if pqclass is PriorityQueue:
            warnings.warn(
                "SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
                " is no longer supported because of API changes; "
                "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
                ScrapyDeprecationWarning,
            )
            from scrapy.pqueues import ScrapyPriorityQueue

            pqclass = ScrapyPriorityQueue

        dqclass = load_object(settings["SCHEDULER_DISK_QUEUE"])
        mqclass = load_object(settings["SCHEDULER_MEMORY_QUEUE"])
        logunser = settings.getbool("SCHEDULER_DEBUG")
        return cls(
            dupefilter,
            connection_url,
            jobdir=job_dir(settings),
            logunser=logunser,
            stats=crawler.stats,
            pqclass=pqclass,
            dqclass=dqclass,
            mqclass=mqclass,
            crawler=crawler,
            queue_class=queue_class,
        )
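Every example on this page routes its persistence directory through the job_dir helper from scrapy.utils.job. For reference, here is a minimal sketch of that helper, written from memory rather than copied from the source, so treat the details as approximate: it reads the JOBDIR setting and creates the directory on first use.

    import os

    def job_dir(settings):
        # JOBDIR is unset or empty when job persistence is disabled
        path = settings['JOBDIR']
        if path and not os.path.exists(path):
            os.makedirs(path)
        return path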
Example #2
    def from_crawler(cls, crawler):
        settings = crawler.settings
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = create_instance(dupefilter_cls, settings, crawler)
        pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
        if pqclass is PriorityQueue:
            warnings.warn(
                "SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
                " is no longer supported because of API changes; "
                "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
                ScrapyDeprecationWarning)
            from scrapy.pqueues import ScrapyPriorityQueue
            pqclass = ScrapyPriorityQueue

        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        logunser = settings.getbool('SCHEDULER_DEBUG')
        return cls(dupefilter,
                   jobdir=job_dir(settings),
                   logunser=logunser,
                   stats=crawler.stats,
                   pqclass=pqclass,
                   dqclass=dqclass,
                   mqclass=mqclass,
                   crawler=crawler)
Example #3
    def from_crawler(cls, crawler, **spider_kwargs):
        settings = crawler.settings
        kwargs = {
            'filter_storage_path': settings.get('FILTER_STORAGE_PATH', ''),
            'item_storage_path': settings.get('ITEM_STORAGE_PATH', ''),
        }
        kwargs.update(spider_kwargs)
        spider_kwargs = kwargs
        spider = super(EndpointSpider,
                       cls).from_crawler(crawler, **spider_kwargs)
        spider.stats = crawler.stats

        jobdir = job_dir(settings)
        generated = False
        if jobdir:
            queuecls = load_object(settings['SCHEDULER_DISK_QUEUE'])
            queuedir = os.path.join(jobdir, 'startrequests.queue')
            if os.path.exists(queuedir):
                generated = True
            spider.requestqueue = queuecls(os.path.join(queuedir, '0'))
        else:
            queuecls = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
            spider.requestqueue = queuecls()
        if not generated:
            for x in spider.generate_start_requests():
                spider.enqueue_start_request(x)

        crawler.signals.connect(spider.spider_closed,
                                signal=signals.spider_closed)
        return spider
Example #4
 def from_crawler(cls, crawler):
     obj = cls(job_dir(crawler.settings))
     crawler.signals.connect(obj.spider_closed,
                             signal=signals.spider_closed)
     crawler.signals.connect(obj.spider_opened,
                             signal=signals.spider_opened)
     return obj
Example #5
 def from_crawler(cls, crawler):
     if not crawler.spider.islinkgenerator:
         settings = crawler.settings
         persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
         queue_key = "%s:requests" % crawler.spider.name
         queue_cls = queue.SpiderQueue
         idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE',
                                          IDLE_BEFORE_CLOSE)
         server = connection.from_settings(settings, crawler.spider.name)
         stats = crawler.stats
         return cls(server, persist, queue_key, queue_cls,
                    idle_before_close, stats)
     else:
         settings = crawler.settings
         dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
         dupefilter = dupefilter_cls.from_settings(settings)
         pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
         dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
         mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
         logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                     settings.getbool('SCHEDULER_DEBUG'))
         core_scheduler = load_object('scrapy.core.scheduler.Scheduler')
         return core_scheduler(dupefilter,
                               jobdir=job_dir(settings),
                               logunser=logunser,
                               stats=crawler.stats,
                               pqclass=pqclass,
                               dqclass=dqclass,
                               mqclass=mqclass)
Example #6
 def from_crawler(cls, crawler):
     """
     类方法,按照配置文件settings中的配置项,生成调度器实例。
     :param crawler: 爬虫类
     :return: 实例化调度器(调用__init__方法)
     """
     settings = crawler.settings
     # 链接去重器 scrapy.dupefilters.RFPDupeFilter
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
     dupefilter = dupefilter_cls.from_settings(settings)
     # 优先队列类 queuelib.PriorityQueue
     pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
     # 磁盘队列类 scrapy.squeues.PickleLifoDiskQueue
     dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
     # 内存队列类 scrapy.squeues.LifoMemoryQueue
     mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
     # 日志
     logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                 settings.getbool('SCHEDULER_DEBUG'))
     return cls(dupefilter,
                jobdir=job_dir(settings),
                logunser=logunser,
                stats=crawler.stats,
                pqclass=pqclass,
                dqclass=dqclass,
                mqclass=mqclass)
Example #7
 def from_settings(cls, settings):
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
     dupefilter = dupefilter_cls.from_settings(settings)
     dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
     mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
     logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
     return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser)
Example #8
 def from_crawler(cls, crawler):
     settings = crawler.settings
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
     dupefilter = dupefilter_cls.from_settings(settings)
     rqclass = load_object(settings['SCHEDULER_RABBIT_QUEUE'])
     logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
     return cls(dupefilter, job_dir(settings), rqclass, logunser, crawler.stats)
Example #9
    def from_crawler(cls, crawler, **spider_kwargs):
        settings = crawler.settings
        kwargs = {
            'filter_storage_path': settings.get('FILTER_STORAGE_PATH', ''),
            'item_storage_path': settings.get('ITEM_STORAGE_PATH', ''),
        }
        kwargs.update(spider_kwargs)
        spider_kwargs = kwargs
        spider = super(EndpointSpider, cls).from_crawler(crawler, **spider_kwargs)
        spider.stats = crawler.stats
        
        jobdir = job_dir(settings)
        generated = False
        if jobdir:
            queuecls = load_object(settings['SCHEDULER_DISK_QUEUE'])
            queuedir = os.path.join(jobdir, 'startrequests.queue')
            if os.path.exists(queuedir):
                generated = True
            spider.requestqueue = queuecls(os.path.join(queuedir, '0'))
        else:
            queuecls = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
            spider.requestqueue = queuecls()
        if not generated:
            for x in spider.generate_start_requests():
                spider.enqueue_start_request(x)

        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider
Example #10
    def from_crawler(cls, crawler):
        settings = crawler.settings
        # Load the fingerprint dupe filter class from the settings; see scrapy/dupefilters.py
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        # Instantiate the dupe filter; it is eventually assigned to self.df
        dupefilter = create_instance(dupefilter_cls, settings, crawler)
        # Task queues; see scrapy/squeues.py. The disk queue persists pending
        # requests to disk between runs; the memory queue is lost on restart.
        # If JOBDIR is set, it affects both the dupe filter and the disk queue.
        pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
        if pqclass is PriorityQueue:
            warnings.warn(
                "SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
                " is no longer supported because of API changes; "
                "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
                ScrapyDeprecationWarning)
            from scrapy.pqueues import ScrapyPriorityQueue
            pqclass = ScrapyPriorityQueue

        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        # Whether to log unserializable requests
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                    settings.getbool('SCHEDULER_DEBUG'))
        return cls(dupefilter,
                   jobdir=job_dir(settings),
                   logunser=logunser,
                   stats=crawler.stats,
                   pqclass=pqclass,
                   dqclass=dqclass,
                   mqclass=mqclass,
                   crawler=crawler)  # instantiate the scheduler
Example #11
 def from_settings(cls: Type[RFPDupeFilterTV],
                   settings: BaseSettings,
                   *,
                   fingerprinter=None) -> RFPDupeFilterTV:
     debug = settings.getbool('DUPEFILTER_DEBUG')
     try:
         return cls(job_dir(settings), debug, fingerprinter=fingerprinter)
     except TypeError:
         warn(
             "RFPDupeFilter subclasses must either modify their '__init__' "
             "method to support a 'fingerprinter' parameter or reimplement "
             "the 'from_settings' class method.",
             ScrapyDeprecationWarning,
         )
         result = cls(job_dir(settings), debug)
         result.fingerprinter = fingerprinter
         return result
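The try/except above keeps backward compatibility with RFPDupeFilter subclasses whose __init__ predates the keyword-only fingerprinter parameter. A hypothetical subclass that accepts the new signature directly, so the fallback branch is never taken:

    from scrapy.dupefilters import RFPDupeFilter

    class FingerprinterAwareDupeFilter(RFPDupeFilter):
        # Hypothetical subclass; forwards the fingerprinter keyword upstream
        def __init__(self, path=None, debug=False, *, fingerprinter=None):
            super().__init__(path, debug, fingerprinter=fingerprinter)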
Example #12
 def from_settings(cls, settings):
     """
     获得爬虫中的两个参数:JOBDIR缓存磁盘目录,和DUPEFILTER_DEBUG是否开启debug模式
     :param settings: 爬虫配置
     :return: 调用__init__方法,获得实例
     """
     debug = settings.getbool('DUPEFILTER_DEBUG')
     return cls(job_dir(settings), debug)
Example #13
    def from_crawler(cls, crawler):
        jobdir = job_dir(crawler.settings)
        if not jobdir:
            raise NotConfigured

        obj = cls(jobdir)
        crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
        return obj
Example #14
    def from_crawler(cls, crawler):
        jobdir = job_dir(crawler.settings)
        if not jobdir:
            raise NotConfigured

        obj = cls(jobdir)
        crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
        return obj
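Extensions like the two above (Examples #13 and #14) only activate when a job directory is configured; with no JOBDIR they raise NotConfigured and Scrapy skips them. A minimal way to enable them, with an illustrative path:

    # settings.py -- the directory name is illustrative
    JOBDIR = 'crawls/myspider-1'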
Example #15
 def from_crawler(cls, crawler):
     settings = crawler.settings
     run_as_daemon = settings.get('DAEMON')
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
     dupefilter = dupefilter_cls.from_settings(settings)
     dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
     mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
     logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
     return cls(crawler, dupefilter, job_dir(settings), dqclass, mqclass,
                logunser, crawler.stats, run_as_daemon)
Example #16
 def from_crawler(cls, crawler):
     settings = crawler.settings
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
     dupefilter = create_instance(dupefilter_cls, settings, crawler)
     pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
     dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
     mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
     logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS', settings.getbool('SCHEDULER_DEBUG'))
     return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
                stats=crawler.stats, pqclass=pqclass, dqclass=dqclass, mqclass=mqclass)
Example #17
 def from_crawler(cls, crawler):
     settings = crawler.settings
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
     dupefilter = dupefilter_cls.from_settings(settings)
     pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
     dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
     mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
     logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS', settings.getbool('SCHEDULER_DEBUG'))
     return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
                stats=crawler.stats, pqclass=pqclass, dqclass=dqclass, mqclass=mqclass)
Example #18
 def from_settings(cls, settings):
     check_settings(settings)
     debug = settings.getbool('DUPEFILTER_DEBUG')
     config = settings.getdict('REQUEST_DUPEFILTER_CONFIG', {})
     mongo_uri = settings.get('MONGO_URI')
     mongo_db = settings.get('MONGO_DATABASE')
     return cls(mongo_uri,
                mongo_db,
                config,
                path=job_dir(settings),
                debug=debug)
Example #19
    def from_settings(cls, global_settings, global_stats):
        settings = global_settings
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
        total_concurrency = settings.getint('CONCURRENT_REQUESTS')
        domain_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        ip_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_IP')

        return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser,
                   global_stats, total_concurrency, domain_concurrency,
                   ip_concurrency)
Example #20
 def from_crawler(cls, crawler):
     settings = crawler.settings
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
     if hasattr(dupefilter_cls, 'from_crawler'):
         dupefilter = dupefilter_cls.from_crawler(crawler)
     elif hasattr(dupefilter_cls, 'from_settings'):
         dupefilter = dupefilter_cls.from_settings(crawler.settings)
     else:
         dupefilter = dupefilter_cls()
     logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS', settings.getbool('SCHEDULER_DEBUG'))
     return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
                stats=crawler.stats)
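The fallback chain hand-rolled above (from_crawler, then from_settings, then the bare constructor) is the same contract that scrapy.utils.misc.create_instance encapsulates in several other examples on this page. A simplified sketch of that helper, omitting its error handling for the case where both settings and crawler are None:

    def create_instance(objcls, settings, crawler, *args, **kwargs):
        # Prefer the richest construction hook the class offers
        if crawler and hasattr(objcls, 'from_crawler'):
            return objcls.from_crawler(crawler, *args, **kwargs)
        if settings is None:
            settings = crawler.settings
        if hasattr(objcls, 'from_settings'):
            return objcls.from_settings(settings, *args, **kwargs)
        return objcls(*args, **kwargs)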
Example #21
 def from_crawler(cls, crawler):  # the real instantiation entry point
     settings = crawler.settings
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])  # e.g. 'scrapy.dupefilters.RFPDupeFilter'
     # create_instance tries objcls.from_crawler / objcls.from_settings to build it
     dupefilter = create_instance(dupefilter_cls, settings, crawler)
     pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])  # e.g. 'queuelib.PriorityQueue'
     # e.g. 'scrapy.squeues.PickleLifoDiskQueue': last in, first out, pickle-serialized
     dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
     mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])  # e.g. 'scrapy.squeues.LifoMemoryQueue'
     # LOG_UNSERIALIZABLE_REQUESTS is deprecated; SCHEDULER_DEBUG is the fallback
     logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                 settings.getbool('SCHEDULER_DEBUG'))
     return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
                stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
                mqclass=mqclass)  # the queues start empty; nothing is pushed here
Example #22
 def from_settings(cls, settings):
     from elasticsearch import Elasticsearch
     check_settings(settings)
     debug = settings.getbool('DUPEFILTER_DEBUG')
     config = settings.getdict('REQUEST_DUPEFILTER_CONFIG', {})
     obj = cls(path=job_dir(settings), debug=debug)
     obj.settings = settings
     es_servers = obj.settings['ELASTICSEARCH_SERVERS']
     es_servers = es_servers if isinstance(es_servers,
                                           list) else [es_servers]
     obj.items = get_item_dict(config.get('items'), settings)
     obj.es = Elasticsearch(hosts=es_servers,
                            timeout=obj.settings.get(
                                'ELASTICSEARCH_TIMEOUT', 60))
     return obj
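The Elasticsearch-backed filter above reads its connection details from the project settings. Illustrative values for the two settings it consumes:

    # settings.py -- illustrative values for Example #22's filter
    ELASTICSEARCH_SERVERS = ['http://localhost:9200']
    ELASTICSEARCH_TIMEOUT = 60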
Example #23
 def from_crawler(cls, crawler):
     settings = crawler.settings
     pqcls = load_object(settings["SCHEDULER_PRIORITY_QUEUE"])
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
     dupefilter = dupefilter_cls.from_settings(settings)
     dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
     mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
     logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
     return cls(dupefilter,
                job_dir(settings),
                dqclass,
                mqclass,
                logunser,
                crawler.stats,
                pqcls)
Example #24
    def from_crawler(cls, crawler):
        settings = crawler.settings
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)
        pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS', settings.getbool('SCHEDULER_DEBUG'))

        rabbitmq_queue_name = settings.get('RABBITMQ_INPUT_QUEUE_NAME')
        rabbitmq_url = settings.get('RABBITMQ_URL')

        return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
                   stats=crawler.stats, pqclass=pqclass, dqclass=dqclass, mqclass=mqclass,
                   rabbitmq_queue_name=rabbitmq_queue_name, rabbitmq_url=rabbitmq_url)
Example #25
 def from_crawler(cls: Type[SchedulerTV], crawler) -> SchedulerTV:
     """
     Factory method, initializes the scheduler with arguments taken from the crawl settings
     """
     dupefilter_cls = load_object(crawler.settings['DUPEFILTER_CLASS'])
     return cls(
         dupefilter=create_instance(dupefilter_cls, crawler.settings, crawler),
         jobdir=job_dir(crawler.settings),
         dqclass=load_object(crawler.settings['SCHEDULER_DISK_QUEUE']),
         mqclass=load_object(crawler.settings['SCHEDULER_MEMORY_QUEUE']),
         logunser=crawler.settings.getbool('SCHEDULER_DEBUG'),
         stats=crawler.stats,
         pqclass=load_object(crawler.settings['SCHEDULER_PRIORITY_QUEUE']),
         crawler=crawler,
     )
Example #26
    def from_crawler(cls, crawler):
        settings = crawler.settings

        # DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)

        # SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleLifoDiskQueue'
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])

        # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.LifoMemoryQueue'
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])

        # Whether to log unserializable requests
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
        return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser, crawler.stats)
Example #27
 def from_crawler(cls, crawler):  # built from the crawler
     settings = crawler.settings
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
     dupefilter = create_instance(dupefilter_cls, settings, crawler)
     pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
     dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
     mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
     logunser = settings.getbool('SCHEDULER_DEBUG')
     return cls(dupefilter,
                jobdir=job_dir(settings),
                logunser=logunser,
                stats=crawler.stats,
                pqclass=pqclass,
                dqclass=dqclass,
                mqclass=mqclass,
                crawler=crawler)
Example #28
    def from_settings(cls, settings):
        debug = settings.getbool('DUPEFILTER_DEBUG')
        m_length = settings['MAX_LENGTH']
        error_rate = settings['ERROR_RATE']
        mongo_host = settings['DB_HOST']
        mongo_port = settings['DB_PORT']

        try:
            dbname = settings['DB_NAME']
            dbcollections = settings['DB_COLLECTIONS_NAME']
        except Exception:
            dbname = None
            dbcollections = None

        return cls(m_length, error_rate, mongo_host, mongo_port,
                   dbname, dbcollections, job_dir(settings), debug)
Example #29
    def from_crawler(cls, crawler):
        """ init from crawler """

        jobdir = job_dir(crawler.settings)

        if not jobdir:
            raise NotConfigured

        state_file = crawler.settings.get("STATE_TAG_FILE") or ".state"
        pid_file = crawler.settings.get("PID_TAG_FILE") or ".pid"

        obj = cls(jobdir, state_file, pid_file)

        crawler.signals.connect(obj._spider_opened, signals.spider_opened)
        crawler.signals.connect(obj._spider_closed, signals.spider_closed)

        return obj
Example #30
 def from_crawler(cls, crawler):
     print "initialize scheduler from crawler    <-- wangyf"
     settings = crawler.settings
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
     dupefilter = dupefilter_cls.from_settings(settings)
     db_host = settings['DB_HOST']
     db_port = settings['DB_PORT']
     try:
         db_name = settings['DB_NAME']
         db_collections = settings['DB_COLLECTIONS_NAME']
     except Exception:
         db_name = None
         db_collections = None
     logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                 settings.getbool('SCHEDULER_DEBUG'))
     return cls(dupefilter, db_host, db_port, db_name, db_collections,
                jobdir=job_dir(settings), logunser=logunser, stats=crawler.stats)
Example #31
    def from_crawler(cls, crawler):
        settings = crawler.settings

        #DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)

        #SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])

        #SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])

        #LOG_UNSERIALIZABLE_REQUESTS:False
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')

        return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser,
                   crawler.stats)
Example #32
 def from_crawler(cls, crawler):
     settings = crawler.settings
     host = settings.get('REDIS_HOST', 'localhost')
     port = settings.get('REDIS_PORT', 6379)
     server = redis.Redis(host, port)
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
     dupefilter = dupefilter_cls.from_settings(settings)
     dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
     mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
     rqclass = load_object(settings['SCHEDULER_REDIS_QUEUE'])
     next_urls_queue_key = settings.get('NEXT_URLS_QUEUE_KEY',
                                        '%(spider)s:next_urls')
     crawled_urls_queue_key = settings.get('CRAWLED_URLS_QUEUE_KEY',
                                           '%(spider)s:crawled_urls')
     logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
     return cls(server, rqclass, next_urls_queue_key, crawled_urls_queue_key,
                dupefilter, job_dir(settings), dqclass, mqclass, logunser,
                crawler.stats)
Example #33
    def from_crawler(cls, crawler):
        settings = crawler.settings
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = create_instance(dupefilter_cls, settings, crawler)
        pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
        if pqclass is PriorityQueue:
            warnings.warn("SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
                          " is no longer supported because of API changes; "
                          "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
                          ScrapyDeprecationWarning)
            from scrapy.pqueues import ScrapyPriorityQueue
            pqclass = ScrapyPriorityQueue

        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                    settings.getbool('SCHEDULER_DEBUG'))
        return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
                   stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
                   mqclass=mqclass, crawler=crawler)
Example #34
    def from_crawler(cls, crawler):
        """
        调度器的初始化主要做了2件事:
        实例化请求指纹过滤器:用来过滤重复请求,可自己重写替换之;
        定义各种不同类型的任务队列:优先级任务队列、基于磁盘的任务队列、基于内存的任务队列;
        """
        settings = crawler.settings
        dupefilter_cls = load_object(
            settings['DUPEFILTER_CLASS'])  # 从配置文件中获取指纹过滤器类
        dupefilter = create_instance(dupefilter_cls, settings,
                                     crawler)  # 实例化指纹过滤器
        pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE']
                              )  # 基于优先级的任务队列类(priority queue)
        if pqclass is PriorityQueue:  # 子类is父类???
            warnings.warn(
                "SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
                " is no longer supported because of API changes; "
                "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
                ScrapyDeprecationWarning)
            from scrapy.pqueues import ScrapyPriorityQueue
            pqclass = ScrapyPriorityQueue

        dqclass = load_object(
            settings['SCHEDULER_DISK_QUEUE'])  # 基于磁盘的任务队列类(disk queue)
        mqclass = load_object(
            settings['SCHEDULER_MEMORY_QUEUE'])  # 基于内存的任务队列类(memory queue)
        logunser = settings.getbool(
            'LOG_UNSERIALIZABLE_REQUESTS',
            settings.getbool('SCHEDULER_DEBUG'))  # 请求日志序列化开关
        return cls(dupefilter,
                   jobdir=job_dir(settings),
                   logunser=logunser,
                   stats=crawler.stats,
                   pqclass=pqclass,
                   dqclass=dqclass,
                   mqclass=mqclass,
                   crawler=crawler)
Example #35
    def from_crawler(cls, crawler):
        ## Build a scheduler instance from a crawler object

        ## The crawler settings
        settings = crawler.settings
        ## Load the fingerprint dupe filter class from the settings
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        ## Create a dupe filter from the settings and crawler (drops duplicate requests)
        dupefilter = create_instance(dupefilter_cls, settings, crawler)
        ## Load the priority-, disk- and memory-based task queue classes in turn
        pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        ## Whether to log unserializable requests
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                    settings.getbool('SCHEDULER_DEBUG'))
        ## Return a scheduler instance
        return cls(dupefilter,
                   jobdir=job_dir(settings),
                   logunser=logunser,
                   stats=crawler.stats,
                   pqclass=pqclass,
                   dqclass=dqclass,
                   mqclass=mqclass)
Example #36
 def from_crawler(cls, crawler):
     debug = crawler.settings.getbool('DUPEFILTER_DEBUG')
     df = cls(job_dir(crawler.settings), debug)
     df.method = 'from_crawler'
     return df
Example #37
 def from_settings(cls, settings):
     return cls(job_dir(settings))
Example #38
 def from_settings(cls, settings):
     debug = settings.getbool('DUPEFILTER_DEBUG')
     return cls(job_dir(settings), debug)
Example #39
 def from_settings(cls, settings):
     debug = settings.getbool('DUPEFILTER_DEBUG')
     return cls(job_dir(settings), debug)
Example #40
 def from_settings(cls, settings):
     debug = settings.getbool('DUPEFILTER_DEBUG')
     use_anchors = settings.getbool('DUPEFILTER_USE_ANCHORS')
     return cls(job_dir(settings), debug, use_anchors)
Example #41
 def from_crawler(cls, crawler):
     obj = cls(job_dir(crawler.settings))
     crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
     crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
     return obj
Example #42
 def from_settings(cls, settings):
     debug = settings.getbool('DUPEFILTER_DEBUG')
     df = cls(job_dir(settings), debug)
     df.method = 'from_settings'
     return df
Example #43
 def from_settings(cls, settings):
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
     dupefilter = dupefilter_cls.from_settings(settings)
     dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
     return cls(dupefilter, job_dir(settings), dqclass)
Example #44
 def from_settings(cls, settings):
     debug = settings.getbool('DUPEFILTER_DEBUG')
     df = cls(job_dir(settings), debug)
     df.method = 'from_settings'
     return df
Example #45
 def from_settings(cls, settings):
     return cls(job_dir(settings))
Example #46
 def from_settings(cls, settings):
     verbose_log = settings.getbool('DUPEFILTER_DEBUG')
     return cls(job_dir(settings), verbose_log)