Example #1
    def __init__(self, spidercls, settings=None):
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)  # normalize to a Settings object

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings: %(settings)r", {'settings': d})

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        # reset?
        self.settings.freeze()  # settings can no longer be modified
        self.crawling = False
        self.spider = None
        self.engine = None
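
A minimal usage sketch for the constructor above (MySpider and the settings values are hypothetical; in practice CrawlerRunner or CrawlerProcess builds Crawler objects for you):

from scrapy import Spider
from scrapy.crawler import Crawler

class MySpider(Spider):
    name = 'my_spider'  # hypothetical spider, for illustration only

# a plain dict is accepted and normalized to a Settings object inside __init__
crawler = Crawler(MySpider, settings={'LOG_LEVEL': 'INFO'})
# settings are frozen at the end of __init__, so per-spider overrides must come
# from MySpider.custom_settings rather than later assignments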
Example #2
class EventedStatsCollector(StatsCollector):
    """
    Stats Collector which allows to subscribe to value changes.
    Update notifications are throttled: interval between updates is no shorter
    than ``accumulate_time``.

    It is assumed that stat keys are never deleted.
    """
    accumulate_time = 0.1  # value is in seconds

    def __init__(self, crawler):
        super(EventedStatsCollector, self).__init__(crawler)
        self.signals = SignalManager(self)
        self._changes = {}
        self._task = PeriodicCallback(self.emit_changes,
                                      self.accumulate_time * 1000)
        self._task.start()

        # FIXME: this is ugly
        self.crawler = crawler  # used by ArachnadoCrawlerProcess

    def emit_changes(self):
        if self._changes:
            changes, self._changes = self._changes, {}
            self.signals.send_catch_log(stats_changed, changes=changes)

    def open_spider(self, spider):
        super(EventedStatsCollector, self).open_spider(spider)
        self._task.start()

    def close_spider(self, spider, reason):
        super(EventedStatsCollector, self).close_spider(spider, reason)
        self._task.stop()
Example #3
    def __init__(self,**kwargs):
        
        if 'config' not in kwargs:
            err = 'failed to find seed file (config=*.conf)'
            print(err)
#         if not 'keywords' in kwargs:
#             err =  'failed to find seed file (keywords=*.dat)'
#             print err
        config = kwargs['config']
#         self.keywords = kwargs['keywords']
        self.load_conf(config)
        if self.Sleep_Flag=='SEARCH_ENGINE_SLEEP' or self.Sleep_Flag=='true' or not self.Sleep_Flag:
            settings.set('RANDOMIZE_DOWNLOAD_DELAY', True, priority='cmdline')
            settings.set('DOWNLOAD_DELAY', float(self.SE_Sleep_Base), priority='cmdline')
        else:
            settings.set('RANDOMIZE_DOWNLOAD_DELAY', False, priority='cmdline')
        
        log_filename = self.conf_name.replace('.conf','')+'.log'
        settings.set('LOG_FILE', log_filename, priority='cmdline')
        #redis key
        self.meta_next_url = meta_redis_key()
        # initialize redis
        self.init_redis()
        self.redis_keyword = get_redis_key(self.conf_name)
        # register signals
        sig = SignalManager(dispatcher.Any)
        sig.connect(self.idle,signal=signals.spider_idle)
        self.metatime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # remember the urls fetched in this run, used to check whether this crawl duplicates the previous one: {keyword: md5(url)}
        self.urlmd5 = dict()
        self.log_writer = open('log.dat','a+') 
        self.date_from_url_re = re.compile("[-_/][a-zA-Z]*[-_]?(?P<year>(20)?([0-1][0-9]))([-_/])?(?P<m>(10|11|12|(0?[1-9])){1})([-_/])?(?P<day>(10|20|30|31|([0-2]?[1-9])){1})([-_/])")
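
For reference, a standalone check of the date-extraction pattern compiled above (the input URL is made up):

import re

date_re = re.compile(
    "[-_/][a-zA-Z]*[-_]?(?P<year>(20)?([0-1][0-9]))([-_/])?"
    "(?P<m>(10|11|12|(0?[1-9])){1})([-_/])?"
    "(?P<day>(10|20|30|31|([0-2]?[1-9])){1})([-_/])")
m = date_re.search('/news/post-2015_03_18/')
if m:
    print(m.group('year'), m.group('m'), m.group('day'))  # 2015 03 18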
Example #4
    def __init__(self, spidercls, settings=None):
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None
Example #5
    def __init__(self, spidercls, settings):
        if isinstance(settings, dict):
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        self.signals.connect(lambda: logging.root.removeHandler(handler),
                             signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.spidercls.update_settings(self.settings)
        self.settings.freeze()

        self.crawling = False
        self.spider = None
        self.engine = None
Example #6
class EventedStatsCollector(StatsCollector):
    """
    Stats Collector which allows to subscribe to value changes.
    Update notifications are throttled: interval between updates is no shorter
    than ``accumulate_time``.

    It is assumed that stat keys are never deleted.
    """
    accumulate_time = 0.1  # value is in seconds

    def __init__(self, crawler):
        super(EventedStatsCollector, self).__init__(crawler)
        self.signals = SignalManager(self)
        self._changes = {}
        self._task = PeriodicCallback(self.emit_changes, self.accumulate_time*1000)
        self._task.start()

        # FIXME: this is ugly
        self.crawler = crawler  # used by ArachnadoCrawlerProcess

    def emit_changes(self):
        if self._changes:
            changes, self._changes = self._changes, {}
            self.signals.send_catch_log(stats_changed, changes=changes)

    def open_spider(self, spider):
        super(EventedStatsCollector, self).open_spider(spider)
        self._task.start()

    def close_spider(self, spider, reason):
        super(EventedStatsCollector, self).close_spider(spider, reason)
        self._task.stop()
Example #7
    def __init__(self, spidercls, settings=None, init_reactor: bool = False):
        if isinstance(spidercls, Spider):
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)

        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings:\n%(settings)s",
                    {'settings': pprint.pformat(d)})

        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)

        self.request_fingerprinter = create_instance(
            load_object(self.settings['REQUEST_FINGERPRINTER_CLASS']),
            settings=self.settings,
            crawler=self,
        )

        reactor_class = self.settings.get("TWISTED_REACTOR")
        if init_reactor:
            # this needs to be done after the spider settings are merged,
            # but before something imports twisted.internet.reactor
            if reactor_class:
                install_reactor(reactor_class,
                                self.settings["ASYNCIO_EVENT_LOOP"])
            else:
                from twisted.internet import reactor  # noqa: F401
            log_reactor_info()
        if reactor_class:
            verify_installed_reactor(reactor_class)

        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None
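
A hedged sketch of constructing this variant (MySpider and the reactor setting are hypothetical; in recent Scrapy versions it is normally CrawlerProcess that passes init_reactor=True, so the reactor gets installed before anything imports twisted.internet.reactor):

crawler = Crawler(
    MySpider,
    settings={'TWISTED_REACTOR':
              'twisted.internet.asyncioreactor.AsyncioSelectorReactor'},
    init_reactor=True,
)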
Example #8
    def __init__(self, crawler):
        super(EventedStatsCollector, self).__init__(crawler)
        self.signals = SignalManager(self)
        self._changes = {}
        self._task = PeriodicCallback(self.emit_changes,
                                      self.accumulate_time * 1000)
        self._task.start()

        # FIXME: this is ugly
        self.crawler = crawler  # used by ArachnadoCrawlerProcess
Example #9
    def __init__(self, spidercls, settings=None):
        ## A Crawler object must be instantiated with a scrapy.spiders.Spider subclass
        ## and a scrapy.settings.Settings object

        if isinstance(spidercls, Spider):
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        ## the user-defined spider class
        self.spidercls = spidercls
        ## the crawler's settings manager, which gives extensions and middlewares access to this crawler's Scrapy settings
        self.settings = settings.copy()
        ## update the settings from the custom_settings attribute the spider class may define,
        ## at 'spider' priority
        self.spidercls.update_settings(self.settings)

        ## collect only the settings that were overridden, converted to a dict
        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings: %(settings)r", {'settings': d})

        ## the crawler's signal manager, used by extensions and middlewares to hook themselves into Scrapy functionality
        self.signals = SignalManager(self)
        ## the crawler's stats collector, used by extensions and middlewares to record their behaviour and to access data collected by other extensions
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        ## counts log records per level emitted while the spider runs
        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        ## register __remove_handler for the engine_stopped signal;
        ## when the engine-stopped signal fires, __remove_handler is invoked
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        ## instantiate the log formatter
        self.logformatter = lf_cls.from_crawler(self)
        ## the extension manager that keeps track of enabled extensions
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        ## flag for whether the crawler is running
        self.crawling = False
        ## the spider currently being crawled
        self.spider = None
        ## the execution engine, which coordinates crawling between the scheduler, downloader and spiders
        self.engine = None
Example #10
    def __init__(self, settings=None):
        self.signals = SignalManager(self)
        self.signals.connect(self.on_spider_closed,
                             CrawlerProcessSignals.spider_closed)
        self._finished_jobs = []
        self._paused_jobs = set()
        self.procmon = ProcessStatsMonitor()
        self.procmon.start()

        super(MyselfCrawlerProcess, self).__init__(settings or {})
        logging.getLogger('scrapy.spidermiddlewares.depth').setLevel(
            logging.INFO)
Example #11
class ProcessStatsMonitor(object):
    """ A class which emits process stats periodically """

    signal_updated = object()

    def __init__(self, interval=1.0):
        self.signals = SignalManager(self)
        self.process = psutil.Process(os.getpid())
        self.interval = interval
        self._task = PeriodicCallback(self._emit, self.interval*1000)
        self._recent = {}

    def start(self):
        # yappi.start()
        self._task.start()

    def stop(self):
        self._task.stop()
        # stats = yappi.get_func_stats()
        # stats.sort('tsub', 'desc')
        # with open("func-stats.txt", 'wt') as f:
        #     stats.print_all(f, columns={
        #         0: ("name", 80),
        #         1: ("ncall", 10),
        #         2: ("tsub", 8),
        #         3: ("ttot", 8),
        #         4: ("tavg",8)
        #     })
        #
        # pstats = yappi.convert2pstats(stats)
        # pstats.dump_stats("func-stats.prof")

    def get_recent(self):
        return self._recent

    def _emit(self):
        cpu_times = self.process.cpu_times()
        ram_usage = self.process.memory_info()
        stats = {
            'ram_percent': self.process.memory_percent(),
            'ram_rss': ram_usage.rss,
            'ram_vms': ram_usage.vms,
            'cpu_percent': self.process.cpu_percent(),
            'cpu_time_user': cpu_times.user,
            'cpu_time_system': cpu_times.system,
            'num_fds': self.process.num_fds(),
            'context_switches': self.process.num_ctx_switches(),
            'num_threads': self.process.num_threads(),
            'server_time': int(time.time()*1000),
        }
        self._recent = stats
        self.signals.send_catch_log(self.signal_updated, stats=stats)
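
A hedged usage sketch (the PeriodicCallback above is scheduled on a Tornado IOLoop, so this is assumed to run inside one):

monitor = ProcessStatsMonitor(interval=2.0)

def on_update(stats, **kwargs):
    print("RSS bytes:", stats['ram_rss'])

monitor.signals.connect(on_update, monitor.signal_updated)
monitor.start()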
Example #12
class ProcessStatsMonitor(object):
    """ A class which emits process stats periodically """

    signal_updated = object()

    def __init__(self, interval=1.0):
        self.signals = SignalManager(self)
        self.process = psutil.Process(os.getpid())
        self.interval = interval
        self._task = PeriodicCallback(self._emit, self.interval * 1000)
        self._recent = {}

    def start(self):
        # yappi.start()
        self._task.start()

    def stop(self):
        self._task.stop()
        # stats = yappi.get_func_stats()
        # stats.sort('tsub', 'desc')
        # with open("func-stats.txt", 'wt') as f:
        #     stats.print_all(f, columns={
        #         0: ("name", 80),
        #         1: ("ncall", 10),
        #         2: ("tsub", 8),
        #         3: ("ttot", 8),
        #         4: ("tavg",8)
        #     })
        #
        # pstats = yappi.convert2pstats(stats)
        # pstats.dump_stats("func-stats.prof")

    def get_recent(self):
        return self._recent

    def _emit(self):
        cpu_times = self.process.cpu_times()
        ram_usage = self.process.memory_info()
        stats = {
            'ram_percent': self.process.memory_percent(),
            'ram_rss': ram_usage.rss,
            'ram_vms': ram_usage.vms,
            'cpu_percent': self.process.cpu_percent(),
            'cpu_time_user': cpu_times.user,
            'cpu_time_system': cpu_times.system,
            'num_fds': self.process.num_fds(),
            'context_switches': self.process.num_ctx_switches(),
            'num_threads': self.process.num_threads(),
            'server_time': int(time.time() * 1000),
        }
        self._recent = stats
        self.signals.send_catch_log(self.signal_updated, stats=stats)
Example #13
class ProcessStatsMonitor(object):
    """查看进程状态, 每秒发布一次"""
    signal_updated = object()

    def __init__(self, interval=1.0):
        self.signals = SignalManager(self)
        self.process = psutil.Process(os.getpid())
        self.interval = interval
        self._task = PeriodicCallback(self._emit, self.interval * 1000)
        self._recent = {}

    def start(self):
        """启动进程"""
        self._task.start()

    def stop(self):
        """停止进程"""
        self._task.stop()

    def get_recent(self):
        """当前进程信息"""
        return self._recent

    def _emit(self):
        """进程属性"""
        cpu_times = self.process.cpu_times()
        ram_usage = self.process.memory_info()
        stats = {
            # memory usage percent
            'ram_percent': self.process.memory_percent(),
            # resident set size
            'ram_rss': ram_usage.rss,
            # virtual memory size
            'ram_vms': ram_usage.vms,
            # cpu percent
            'cpu_percent': self.process.cpu_percent(),
            # user cpu time
            'cpu_time_user': cpu_times.user,
            # system cpu time
            'cpu_time_system': cpu_times.system,
            # context switches
            'context_switches': self.process.num_ctx_switches(),
            # number of threads
            'num_threads': self.process.num_threads(),
            # current server time in milliseconds
            'server_time': int(time.time() * 1000)
        }
        # most recent stats snapshot
        self._recent = stats
        self.signals.send_catch_log(self.signal_updated, stats=stats)
Example #14
    def __init__(self, spidercls, settings=None):
        if isinstance(spidercls, Spider):
            # the spidercls argument must be a class, not an instance
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        # take a copy of the settings
        self.settings = settings.copy()
        # update_settings is called here (applies the spider's custom_settings)
        self.spidercls.update_settings(self.settings)

        # initialize the signal manager
        self.signals = SignalManager(self)
        # initialize the stats collector
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings:\n%(settings)s",
                    {'settings': pprint.pformat(d)})

        # this part deals with the root log handler
        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        # the lambda is kept as an attribute to avoid garbage collection (see comment above)
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        # load the log formatter
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        # the extension manager
        self.extensions = ExtensionManager.from_crawler(self)

        # settings are fully initialized; further changes are forbidden
        self.settings.freeze()
        # crawling has not started yet
        self.crawling = False
        # placeholders; these are assigned later in crawl()
        self.spider = None
        self.engine = None
Example #15
    def __init__(self, settings=None):
        self.signals = SignalManager(self)
        self.signals.connect(self.on_spider_closed,
                             CrawlerProcessSignals.spider_closed)
        self._finished_jobs = []
        self._paused_jobs = set()
        self.procmon = ProcessStatsMonitor()
        self.procmon.start()

        super(ArachnadoCrawlerProcess, self).__init__(settings or {})

        # don't log DepthMiddleware messages
        # see https://github.com/scrapy/scrapy/issues/1308
        logging.getLogger("scrapy.spidermiddlewares.depth").setLevel(
            logging.INFO)
Example #16
    def __init__(self):
        from twisted.internet import reactor  # imported here, inside __init__

        self.reactor = reactor

        engine = get_engine()
        create_schema(engine)

        self.read_pool = ThreadPool(
            minthreads=1, maxthreads=16, name="ReadPool")

        self.write_pool = ThreadPool(
            minthreads=1, maxthreads=1, name="WritePool")

        self.read_pool.start()
        self.write_pool.start()

        self.signals = SignalManager(dispatcher.Any).connect(
            self.stop_threadpools, spider_closed)

        self.counters = defaultdict(lambda: Counter())

        self.cache = defaultdict(
            lambda: dict())

        self.write_queue = Queue()
        self.writelock = False  # Write queue mutex
Example #17
    def __init__(self, spidercls, settings=None):
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None
Example #18
 def __init__(self,
              url=None,
              out=None,
              base=None,
              only=None,
              *args,
              **kwargs):
     super(PlayGrabberSpider, self).__init__(*args, **kwargs)
     SignalManager(dispatcher.Any).connect(self.closed_handler,
                                           signal=signals.spider_closed)
     if out == None and base == None:
         raise Exception(
             'Must provide argument "-a out=..." or "-a base=..."')
     if out != None and base != None:
         raise Exception(
             'Cannot provide both argument "-a out=..." and "-a base=..."')
     if url == None and base == None:
         raise Exception(
             'Must provide argument "-a url=..." or "-a base=..."')
     if url:
         self.start_urls = [url]
     else:
         self.start_urls = self.find_shows_in_base(base)
     if only:
         self.dont_crawl = True
     else:
         self.dont_crawl = False
     self.output_dir = out
     self.output_base_dir = base
Example #19
    def __init__(self, spidercls, settings):
        if isinstance(settings, dict):
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        self.signals.connect(lambda: logging.root.removeHandler(handler),
                             signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.spidercls.update_settings(self.settings)
        self.settings.freeze()

        self.crawling = False
        self.spider = None
        self.engine = None
Example #20
    def __init__(self, spider):
        from twisted.internet import reactor  # imported here, inside __init__

        self.spider = spider
        ''' Used for logging for now '''

        self.reactor = reactor
        ''' Used for thread pools '''

        engine = get_engine()
        create_schema(engine)

        self.thread_pool = ThreadPool(
            minthreads=1, maxthreads=13, name="ReadPool")

        # There should be only one thread in the write_pool
        # Never increase the maxthreads value
        self.write_pool = ProfiledThreadPool(
            minthreads=1, maxthreads=1, name="WritePool")

        self.thread_pool.start()
        self.write_pool.start()

        self.signals = SignalManager(dispatcher.Any).connect(
            self.stop_threadpools, spider_closed)

        self.reporter = Reporter()
        ''' Reporter is used for statistics collection '''
        self.counters = self.reporter.counters

        self.cache = defaultdict(
            lambda: dict())

        self.write_queue = Queue()
        self.writelock = False  # Write queue mutex
Example #21
    def __init__(self, settings):
        # self.options = settings.get('SELENIUM_OPTIONS', {})

        max_run = settings.get('SELENIUM_MAXRUN', 10)
        self.sem = defer.DeferredSemaphore(max_run)
        self.queue = queue.LifoQueue(max_run)

        SignalManager(dispatcher.Any).connect(self._close, signal=signals.spider_closed)
Example #22
 def __init__(self, settings):
     self.options = settings.get('PHANTOMJS_OPTIONS', {})  # defaults to empty
     max_run = settings.get('PHANTOMJS_MAXRUN', 10)  # maximum number of PhantomJS instances running at once, default 10
     self.sem = defer.DeferredSemaphore(max_run)
     self.queue = Queue.LifoQueue(maxsize=max_run)  # LifoQueue: last in, first out
     SignalManager(dispatcher.Any).connect(receiver=self._close,
                                           signal=signals.spider_closed)
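
Both handlers above gate browser use behind a DeferredSemaphore plus a LifoQueue of idle drivers. A minimal standalone sketch of the semaphore part (task names are made up):

from twisted.internet import defer

sem = defer.DeferredSemaphore(2)  # at most two tasks run at once

def work(n):
    print("running task", n)
    return defer.succeed(n)

for i in range(5):
    # run() acquires the semaphore, calls work(i) and releases when its
    # Deferred fires
    sem.run(work, i)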
Example #23
 def __init__(self):
     self.browser = webdriver.Chrome(
         executable_path='C:\\Users\\23607\\Desktop\\py\\chromedriver.exe')
     super(LagouSpider, self).__init__()
     # dispatcher.connect(self.spider_closed(),signals.spider_closed)
     # crawler.signals.connect(self.spider_closed, signals.spider_closed)
     SignalManager(dispatcher.Any).connect(self.spider_closed,
                                           signal=signals.spider_closed)
Example #24
 def __init__(self, settings):
     self.options = settings.get('PHANTOMJS_OPTIONS', {})
     max_run = settings.get('PHANTOMJS_MAXRUN', 10)
     self.sem = defer.DeferredSemaphore(max_run)
     self.queue = Queue.LifoQueue(max_run)
     self.create_phantomjs_count = 0
     self._fallback_handler = load_object(FALLBACK_HANDLER)(settings)
     SignalManager(dispatcher.Any).connect(self._close,
                                           signal=signals.spider_closed)
Example #25
    def __init__(self, crawler):
        super(EventedStatsCollector, self).__init__(crawler)
        self.signals = SignalManager(self)
        self._changes = {}
        self._task = PeriodicCallback(self.emit_changes, self.accumulate_time*1000)
        self._task.start()

        # FIXME: this is ugly
        self.crawler = crawler  # used by ArachnadoCrawlerProcess
Example #26
    def __init__(self, spidercls, settings=None):
        if isinstance(settings, dict) or settings is None:  # in the usual call path this is already False here
            settings = Settings(settings)

        self.spidercls = spidercls  # store the spider class, still not instantiated; before instantiation, update_settings is run once to apply its custom_settings
        self.settings = settings.copy()
        self.spidercls.update_settings(
            self.settings
        )  # custom_settings is applied here: the spider class updates the shared settings object directly
        # Before the spider is instantiated only the settings are touched, so assigning
        # custom_settings inside the spider's __init__ has no effect.
        # TODO summary: setting custom_settings in __init__ did not work because
        # self.spidercls has not been instantiated yet; update_settings runs first.
        d = dict(overridden_settings(self.settings))  # find the settings whose defaults were overridden and keep the overridden values
        logger.info(
            "Overridden settings: %(settings)r",
            {'settings': d})  # this is where the "Overridden settings" log line is emitted:
        # the default settings are walked to report which values were changed

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(
            self
        )  # STATS_CLASS = 'scrapy.statscollectors.MemoryStatsCollector' -- the stats collection mechanism

        handler = LogCounterHandler(
            self, level=self.settings.get('LOG_LEVEL'))  # LOG_LEVEL = 'DEBUG'
        logging.root.addHandler(handler)
        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(
            self.settings['LOG_FORMATTER']
        )  # LOG_FORMATTER = 'scrapy.logformatter.LogFormatter'
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(
            self)  # the extension manager; these extensions may not even be needed here

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None
Example #27
    def __init__(self,**kwargs):
        
        if 'config' not in kwargs:
            err = 'failed to find seed file (config=*.conf)'
            print(err)
        if 'startdate' in kwargs:
            self.startdate = kwargs['startdate']
        else:
            self.startdate = (datetime.datetime.now()-datetime.timedelta(days=2)).strftime('%Y-%m-%d %H:%M:%S')
        if 'enddate' in kwargs:
            self.enddate = kwargs['enddate']
        else:
            self.enddate = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
#         if not 'keywords' in kwargs:
#             err =  'failed to find seed file (keywords=*.dat)'
#             print err
        config = kwargs['config']
        self.load_conf(config)
        if self.Sleep_Flag=='SEARCH_ENGINE_SLEEP' or self.Sleep_Flag=='true' or not self.Sleep_Flag:
            settings.set('RANDOMIZE_DOWNLOAD_DELAY', True, priority='cmdline')
            settings.set('DOWNLOAD_DELAY', float(self.SE_Sleep_Base), priority='cmdline')
        else:
            settings.set('RANDOMIZE_DOWNLOAD_DELAY', False, priority='cmdline')
        
        log_filename = self.conf_name.replace('.conf','')+'.log'
        settings.set('LOG_FILE', log_filename, priority='cmdline')
        # initialize redis
        self.init_redis()
        self.redis_keyword = get_redis_key()
        # register signals
        sig = SignalManager(dispatcher.Any)
        sig.connect(self.idle,signal=signals.spider_idle)
        sig.connect(self.close,signal=signals.spider_closed)
        self.metatime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        
        self.conn_local = mysql.connect('meta',host='localhost')
        self.conn_local_cursor = self.conn_local.cursor()
#        self.conn_local_cursor.execute('set global autocommit=1')
        try:
            self.meta_ip = get_meta_ip(network_card='enp7s0')
        except:
            self.meta_ip = get_meta_ip(network_card='eth0')
        # initialize the state in the meta database
        self.init_state()
Example #28
    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)

        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)

        self.scheduled = {}
Example #29
 def __init__(self, settings):
     self.configured = False
     self.settings = settings
     self.signals = SignalManager(self)
     self.stats = load_object(settings['STATS_CLASS'])(self)
     self._start_requests = lambda: ()
     self._spider = None
     # TODO: move SpiderManager to CrawlerProcess
     spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
     self.spiders = spman_cls.from_crawler(self)
Example #30
    def __init__(self, *args, **kwargs):
        super(GeneralUrls, self).__init__(*args, **kwargs)
        SignalManager(dispatcher.Any).connect(self.spiderClosed,
                                              signals.spider_closed)

        self.splitOptions.append(os.linesep)

        for c in printable:
            if c.isalpha() == False and c != '\'':
                self.splitOptions.append(c)
Example #31
    def __init__(self, settings):
        self.settings = settings
        self.options = settings.get('PHANTOMJS_OPTIONS', {})

        max_run = settings.get('PHANTOMJS_MAXRUN', 5)
        self.sem = defer.DeferredSemaphore(
            max_run)  # as a means of limiting parallelism
        self.queue = queue.LifoQueue(
            max_run)  # last in first out, the content is driver not request
        SignalManager(dispatcher.Any).connect(self._close,
                                              signal=signals.spider_closed)
Example #32
 def __init__(self, mongo_uri, cache=False):
     self.mongo_uri = mongo_uri
     _, _, _, _, self.col = motor_from_uri(mongo_uri)
     self.signal_manager = SignalManager()
     # Used for unsubscribe
     # disconnect() requires reference to original callback
     self._callbacks = {}
     self.fetching = False
     self.signals = {
         'created': object(),
         'updated': object(),
         'deleted': object(),
     }
     # XXX: cache is used in arachnado.cron and arachnado.site_checker.
     # Is it needed?
     self.cache_flag = cache
     if cache:
         self.cache = defaultdict(dict)
     else:
         self.cache = None
Example #33
 def __init__(self, domain=None):
     self.con = mdb.connect('localhost', 'root', 'admin', 'huiben')
     self.cur = self.con.cursor()
     self.start_urls = [
         "http://www.amazon.cn/s?ie=UTF8&page=1&rh=n%3A658409051%2Cp_n_fulfilled_by_amazon%3A326314071"
     ]
     filename = str(uuid.uuid1())
     print(filename)
     self.savefile = codecs.open(filename, 'w', 'utf-8')
     SignalManager(dispatcher.Any).connect(
         self.closed_handler, signal=scrapy.signals.spider_closed)
Example #34
    def __init__(self, settings):
        self.options = settings.get('SELENIUM_OPTIONS', {})
        self.domain_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        self.ip_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_IP')

        max_run = self.ip_concurrency if self.ip_concurrency else self.domain_concurrency
        logging.info("Download workers: %s", max_run)
        self.sem = defer.DeferredSemaphore(max_run)
        self.queue = queue.LifoQueue(max_run)

        SignalManager(dispatcher.Any).connect(self._close, signal=signals.spider_closed)
Example #35
    def __init__(self, spidercls, settings):
        self.spidercls = spidercls
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.crawling = False
        self.spider = None
        self.engine = None
Example #36
    def __init__(self, settings=None):
        self.signals = SignalManager(self)
        self.signals.connect(self.on_spider_closed, CrawlerProcessSignals.spider_closed)
        self._finished_jobs = []
        self._paused_jobs = set()
        self.procmon = ProcessStatsMonitor()
        self.procmon.start()

        super(ArachnadoCrawlerProcess, self).__init__(settings or {})

        # don't log DepthMiddleware messages
        # see https://github.com/scrapy/scrapy/issues/1308
        logging.getLogger("scrapy.spidermiddlewares.depth").setLevel(logging.INFO)
Example #37
    def __init__(self, spidercls, settings=None):
        if isinstance(spidercls, Spider):
            raise ValueError(
                "The spidercls argument must be a class, not an object")

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings["STATS_CLASS"])(self)

        handler = LogCounterHandler(self, level=self.settings.get("LOG_LEVEL"))
        logging.root.addHandler(handler)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings:\n%(settings)s",
                    {"settings": pprint.pformat(d)})

        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings["LOG_FORMATTER"])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None
Example #38
    def __init__(self, spidercls, settings=None):
        if isinstance(spidercls, Spider):
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings: %(settings)r", {'settings': d})

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None
Example #39
class Crawler(object):

    def __init__(self, spidercls, settings=None):
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        if not hasattr(self, '_spiders'):
            warnings.warn("Crawler.spiders is deprecated, use "
                          "CrawlerRunner.spider_loader or instantiate "
                          "scrapy.spiderloader.SpiderLoader with your "
                          "settings.",
                          category=ScrapyDeprecationWarning, stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True

        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            self.crawling = False
            raise

    def _create_spider(self, *args, **kwargs):
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
Example #40
 def __init__(self):
     super(StoreToMongoDB, self).__init__()
     manager = SignalManager()
     manager.connect(self.initialize, scrapy.signals.spider_opened)
     manager.connect(self.finalize, scrapy.signals.spider_idle)
Example #41
 def __init__(self):
     self.conn = None
     sig = SignalManager(sender=dispatcher.Any)
     sig.connect(self.initialize, signals.engine_started)
     sig.connect(self.finalize, signals.engine_stopped)
Example #42
 def __init__(self, interval=1.0):
     self.signals = SignalManager(self)
     self.process = psutil.Process(os.getpid())
     self.interval = interval
     self._task = PeriodicCallback(self._emit, self.interval*1000)
     self._recent = {}
Example #43
class ArachnadoCrawlerProcess(CrawlerProcess):
    """
    CrawlerProcess which sets up a global signals manager,
    assigns unique ids to each spider job, works around some Scrapy
    issues and provides extra stats.
    """

    crawl_ids = itertools.count(start=1)

    def __init__(self, settings=None):
        self.signals = SignalManager(self)
        self.signals.connect(self.on_spider_closed, CrawlerProcessSignals.spider_closed)
        self._finished_jobs = []
        self._paused_jobs = set()
        self.procmon = ProcessStatsMonitor()
        self.procmon.start()

        super(ArachnadoCrawlerProcess, self).__init__(settings or {})

        # don't log DepthMiddleware messages
        # see https://github.com/scrapy/scrapy/issues/1308
        logging.getLogger("scrapy.spidermiddlewares.depth").setLevel(logging.INFO)

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        kwargs["crawl_id"] = next(self.crawl_ids)

        crawler = crawler_or_spidercls
        if not isinstance(crawler_or_spidercls, Crawler):
            crawler = self._create_crawler(crawler_or_spidercls)

        # aggregate all crawler signals
        for name in SCRAPY_SIGNAL_NAMES:
            crawler.signals.connect(self._resend_signal, getattr(signals, name))

        # aggregate signals from crawler EventedStatsCollectors
        if hasattr(crawler.stats, "signals"):
            crawler.stats.signals.connect(self._resend_signal, stats.stats_changed)

        d = super(ArachnadoCrawlerProcess, self).crawl(crawler_or_spidercls, *args, **kwargs)
        return d

    def _create_crawler(self, spidercls):
        if isinstance(spidercls, six.string_types):
            spidercls = self.spider_loader.load(spidercls)
        return ArachnadoCrawler(spidercls, self.settings)

    def stop_job(self, crawl_id):
        """ Stop a single crawl job """
        self.get_crawler(crawl_id).stop()

    def pause_job(self, crawl_id):
        """ Pause a crawling job """
        self._paused_jobs.add(crawl_id)
        self.get_crawler(crawl_id).engine.pause()

    def resume_job(self, crawl_id):
        """ Resume a crawling job """
        self._paused_jobs.remove(crawl_id)
        self.get_crawler(crawl_id).engine.unpause()

    def get_crawler(self, crawl_id):
        for crawler in self.crawlers:
            if getattr(crawler.spider, "crawl_id") == crawl_id:
                return crawler
        raise KeyError("Job is not known: %s" % crawl_id)

    def _resend_signal(self, **kwargs):
        # FIXME: this is a mess. Signal handling should be unified somehow:
        # there shouldn't be two separate code paths
        # for CrawlerProcessSignals and STAT_SIGNALS.
        signal = kwargs["signal"]
        if signal in STAT_SIGNALS:
            signal = STAT_SIGNALS[signal]
            kwargs["crawler"] = kwargs.pop("sender").crawler
        else:
            signal = CrawlerProcessSignals.signal(signal)
            kwargs["crawler"] = kwargs.pop("sender")

        kwargs["signal"] = signal
        if signal.supports_defer:
            return self.signals.send_catch_log_deferred(**kwargs)
        else:
            return self.signals.send_catch_log(**kwargs)

    def stop(self):
        """ Terminate the process (exit from application). """
        self.procmon.stop()
        return super(ArachnadoCrawlerProcess, self).stop()

    def on_spider_closed(self, spider, reason):
        # spiders are closed not that often, insert(0,...) should be fine
        self._finished_jobs.insert(
            0,
            {
                "id": spider.crawl_id,
                "job_id": getattr(spider, "motor_job_id"),
                "seed": spider.domain,
                "status": reason,
                "stats": spider.crawler.stats.get_stats(spider),
                "downloads": self._downloader_stats(spider.crawler),
            },
        )

    # FIXME: methods below are ugly for two reasons:
    # 1. they assume spiders have certain attributes;
    # 2. they try to get crawling status based on auxiliary information.

    def get_jobs(self):
        """ Return a list of active jobs """
        crawlers = [crawler for crawler in self.crawlers if crawler.spider is not None]
        return [
            {
                "id": crawler.spider.crawl_id,
                "job_id": getattr(crawler.spider, "motor_job_id"),
                "seed": crawler.spider.domain,
                "status": self._get_crawler_status(crawler),
                "stats": crawler.spider.crawler.stats.get_stats(crawler.spider),
                "downloads": self._downloader_stats(crawler)
                # 'engine_info': dict(get_engine_status(crawler.engine))
            }
            for crawler in crawlers
        ]

    @classmethod
    def _downloader_stats(cls, crawler):
        downloader = crawler.engine.downloader
        return {
            "active": [cls._request_info(req) for req in downloader.active],
            "slots": sorted(
                [cls._slot_info(key, slot) for key, slot in downloader.slots.items()], key=operator.itemgetter("key")
            ),
        }

    @classmethod
    def _request_info(cls, request):
        return {"url": request.url, "method": request.method}

    @classmethod
    def _slot_info(cls, key, slot):
        return {
            "key": key,
            "concurrency": slot.concurrency,
            "delay": slot.delay,
            "lastseen": slot.lastseen,
            "len(queue)": len(slot.queue),
            "transferring": [cls._request_info(req) for req in slot.transferring],
            "active": [cls._request_info(req) for req in slot.active],
        }

    def _get_crawler_status(self, crawler):
        if crawler.spider is None:
            return "unknown"
        if not crawler.crawling:
            return "stopping"
        if int(crawler.spider.crawl_id) in self._paused_jobs:
            return "suspended"
        return "crawling"

    @property
    def jobs(self):
        """ Current crawl state """
        # filter out active jobs which are in fact finished
        finished_ids = {job["id"] for job in self._finished_jobs}
        active_jobs = [job for job in self.get_jobs() if job["id"] not in finished_ids]

        return active_jobs + self._finished_jobs
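
A hedged sketch of driving the process above (the spider name and settings are hypothetical; Arachnado itself wires this up from its Tornado application):

process = ArachnadoCrawlerProcess({'LOG_LEVEL': 'INFO'})
process.crawl('example_spider', domain='example.com')  # gets crawl_id=1
process.start()

# while the reactor is running, jobs can be inspected and controlled by id:
#   process.jobs         -> active + finished job dicts
#   process.pause_job(1), process.resume_job(1), process.stop_job(1)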
Example #44
class Crawler(object):

    def __init__(self, spidercls, settings=None):
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings: %(settings)r", {'settings': d})

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        if not hasattr(self, '_spiders'):
            warnings.warn("Crawler.spiders is deprecated, use "
                          "CrawlerRunner.spider_loader or instantiate "
                          "scrapy.spiderloader.SpiderLoader with your "
                          "settings.",
                          category=ScrapyDeprecationWarning, stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True

        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            # In Python 2 reraising an exception after yield discards
            # the original traceback (see https://bugs.python.org/issue7563),
            # so sys.exc_info() workaround is used.
            # This workaround also works in Python 3, but it is not needed,
            # and it is slower, so in Python 3 we use native `raise`.
            if six.PY2:
                exc_info = sys.exc_info()

            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()

            if six.PY2:
                six.reraise(*exc_info)
            raise

    def _create_spider(self, *args, **kwargs):
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
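
A hedged sketch of calling this Deferred-returning crawl() directly under a manually managed reactor (MySpider is hypothetical; normally CrawlerRunner or CrawlerProcess does this for you):

from twisted.internet import reactor

crawler = Crawler(MySpider, {'LOG_LEVEL': 'INFO'})
d = crawler.crawl()
d.addBoth(lambda _: reactor.stop())
reactor.run()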