import os
import time
import logging
import operator
import itertools
from copy import deepcopy
from collections import defaultdict

import six
import psutil
from bson.objectid import ObjectId
from tornado.gen import coroutine, Return
from tornado.ioloop import PeriodicCallback
from scrapy import signals
from scrapy.crawler import Crawler, CrawlerProcess
from scrapy.signalmanager import SignalManager
from scrapy.statscollectors import StatsCollector

# Names used below but not defined in this listing (motor_from_uri,
# replace_dots, stats_changed, stats, SCRAPY_SIGNAL_NAMES, STAT_SIGNALS,
# CrawlerProcessSignals, ArachnadoCrawler) are assumed to come from elsewhere
# in the Arachnado project.


class EventedStatsCollector(StatsCollector):
    """
    Stats Collector which allows subscribing to value changes.
    Update notifications are throttled: the interval between updates is no
    shorter than ``accumulate_time``.

    It is assumed that stat keys are never deleted.
    """
    accumulate_time = 0.1  # value is in seconds

    def __init__(self, crawler):
        super(EventedStatsCollector, self).__init__(crawler)
        self.signals = SignalManager(self)
        self._changes = {}
        self._task = PeriodicCallback(self.emit_changes,
                                      self.accumulate_time * 1000)
        self._task.start()

        # FIXME: this is ugly
        self.crawler = crawler  # used by ArachnadoCrawlerProcess

    def emit_changes(self):
        if self._changes:
            changes, self._changes = self._changes, {}
            self.signals.send_catch_log(stats_changed, changes=changes)

    def open_spider(self, spider):
        super(EventedStatsCollector, self).open_spider(spider)
        self._task.start()

    def close_spider(self, spider, reason):
        super(EventedStatsCollector, self).close_spider(spider, reason)
        self._task.stop()
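
# The listing above never shows how ``_changes`` gets populated.  A minimal
# sketch of the likely missing piece (an assumption, not part of the listing):
# override the StatsCollector mutators so every write is recorded and later
# flushed by emit_changes().  Signatures follow Scrapy's StatsCollector API.
class EventedStatsCollectorSketch(EventedStatsCollector):

    def set_value(self, key, value, spider=None):
        super(EventedStatsCollectorSketch, self).set_value(key, value, spider)
        self._changes[key] = value  # remember the write for the next flush

    def inc_value(self, key, count=1, start=0, spider=None):
        super(EventedStatsCollectorSketch, self).inc_value(key, count, start,
                                                           spider)
        self._changes[key] = self._stats[key]

    def max_value(self, key, value, spider=None):
        super(EventedStatsCollectorSketch, self).max_value(key, value, spider)
        self._changes[key] = self._stats[key]

    def min_value(self, key, value, spider=None):
        super(EventedStatsCollectorSketch, self).min_value(key, value, spider)
        self._changes[key] = self._stats[key]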

class ProcessStatsMonitor(object):
    """ A class which emits process stats periodically """

    signal_updated = object()

    def __init__(self, interval=1.0):
        self.signals = SignalManager(self)
        self.process = psutil.Process(os.getpid())
        self.interval = interval
        self._task = PeriodicCallback(self._emit, self.interval * 1000)
        self._recent = {}

    def start(self):
        # yappi.start()
        self._task.start()

    def stop(self):
        self._task.stop()
        # stats = yappi.get_func_stats()
        # stats.sort('tsub', 'desc')
        # with open("func-stats.txt", 'wt') as f:
        #     stats.print_all(f, columns={
        #         0: ("name", 80),
        #         1: ("ncall", 10),
        #         2: ("tsub", 8),
        #         3: ("ttot", 8),
        #         4: ("tavg", 8),
        #     })
        #
        # pstats = yappi.convert2pstats(stats)
        # pstats.dump_stats("func-stats.prof")

    def get_recent(self):
        return self._recent

    def _emit(self):
        cpu_times = self.process.cpu_times()
        ram_usage = self.process.memory_info()
        stats = {
            'ram_percent': self.process.memory_percent(),
            'ram_rss': ram_usage.rss,
            'ram_vms': ram_usage.vms,
            'cpu_percent': self.process.cpu_percent(),
            'cpu_time_user': cpu_times.user,
            'cpu_time_system': cpu_times.system,
            'num_fds': self.process.num_fds(),
            'context_switches': self.process.num_ctx_switches(),
            'num_threads': self.process.num_threads(),
            'server_time': int(time.time() * 1000),
        }
        self._recent = stats
        self.signals.send_catch_log(self.signal_updated, stats=stats)
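
# A minimal usage sketch (not part of the listing): connect a handler to
# ``signal_updated`` and run a Tornado IOLoop so the PeriodicCallback can
# fire.  The handler name and the 5-second run time are illustrative
# assumptions.
from tornado.ioloop import IOLoop


def log_process_stats(stats, **kwargs):
    # send_catch_log() also passes ``signal`` and ``sender``; accept them
    # via **kwargs and read only the stats dict.
    print("rss=%s bytes, cpu=%s%%" % (stats['ram_rss'], stats['cpu_percent']))


monitor = ProcessStatsMonitor(interval=1.0)
monitor.signals.connect(log_process_stats, monitor.signal_updated, weak=False)
monitor.start()
IOLoop.current().call_later(5, IOLoop.current().stop)  # stop after ~5 seconds
IOLoop.current().start()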

class ProcessStatsMonitor(object):
    """Monitor the current process and publish its stats periodically."""

    signal_updated = object()

    def __init__(self, interval=1.0):
        self.signals = SignalManager(self)
        self.process = psutil.Process(os.getpid())
        self.interval = interval
        self._task = PeriodicCallback(self._emit, self.interval * 1000)
        self._recent = {}

    def start(self):
        """Start the periodic task."""
        self._task.start()

    def stop(self):
        """Stop the periodic task."""
        self._task.stop()

    def get_recent(self):
        """Return the most recent stats snapshot."""
        return self._recent

    def _emit(self):
        """Collect process stats and send them to subscribers."""
        cpu_times = self.process.cpu_times()
        ram_usage = self.process.memory_info()
        stats = {
            # RAM usage, percent of total
            'ram_percent': self.process.memory_percent(),
            # resident set size, bytes
            'ram_rss': ram_usage.rss,
            # virtual memory size, bytes
            'ram_vms': ram_usage.vms,
            # CPU usage, percent
            'cpu_percent': self.process.cpu_percent(),
            # CPU time spent in user mode
            'cpu_time_user': cpu_times.user,
            # CPU time spent in system mode
            'cpu_time_system': cpu_times.system,
            # context switches
            'context_switches': self.process.num_ctx_switches(),
            # number of threads
            'num_threads': self.process.num_threads(),
            # server timestamp, milliseconds
            'server_time': int(time.time() * 1000)
        }
        # most recent snapshot
        self._recent = stats
        self.signals.send_catch_log(self.signal_updated, stats=stats)

class MongoStorage(object):
    """
    Utility class for working with MongoDB data. It supports CRUD operations
    and allows subscribing to created/updated/deleted events.
    """
    def __init__(self, mongo_uri, cache=False):
        self.mongo_uri = mongo_uri
        _, _, _, _, self.col = motor_from_uri(mongo_uri)
        self.signal_manager = SignalManager()

        # Used for unsubscribe: disconnect() requires a reference
        # to the original callback.
        self._callbacks = {}

        self.fetching = False
        self.signals = {
            'created': object(),
            'updated': object(),
            'deleted': object(),
        }
        # XXX: cache is used in arachnado.cron and arachnado.site_checker.
        # Is it needed?
        self.cache_flag = cache
        if cache:
            self.cache = defaultdict(dict)
        else:
            self.cache = None

    def subscribe(self, events=None, callback=None):
        if events is None:
            events = self.available_events
        if not isinstance(events, list):
            events = [events]
        for event_name in events:
            if event_name not in self.signals:
                raise ValueError('Invalid event name: {}'.format(event_name))
            self.signal_manager.connect(callback, self.signals[event_name],
                                        weak=False)
            self._callbacks[event_name] = callback

    def unsubscribe(self, events=None):
        if events is None:
            events = self.available_events
        if not isinstance(events, list):
            events = [events]
        for event_name in events:
            try:
                self.signal_manager.disconnect(
                    self._callbacks[event_name],
                    self.signals[event_name],
                    weak=False
                )
                self._callbacks.pop(event_name, None)
            except KeyError:
                # FIXME: when can it happen?
                pass

    @property
    def available_events(self):
        return list(self.signals.keys())

    @coroutine
    def fetch(self, query=None):
        if self.fetching:
            return
        self.fetching = True
        docs = []
        cursor = self.col.find(query)
        while (yield cursor.fetch_next):
            doc = cursor.next_object()
            docs.append(doc)
            # if self.cache is not None:
            #     self.cache[str(doc['_id'])] = doc
            #     if str(doc['_id']) not in self.cache:
            #         self.signal_manager.send_catch_log(
            #             self.signals['created'], data=doc
            #         )
        self.fetching = False
        raise Return(docs)

    @coroutine
    def create(self, doc):
        doc = replace_dots(doc)
        result = yield self.col.insert(doc)
        if self.cache is not None:
            self.cache[str(doc['_id'])] = doc
        self.signal_manager.send_catch_log(self.signals['created'], data=doc)
        raise Return(result)

    @coroutine
    def ensure_index(self, key_or_list):
        result = yield self.col.ensure_index(key_or_list)
        raise Return(result)

    @coroutine
    def update(self, doc):
        doc = replace_dots(doc)
        doc_copy = deepcopy(doc)
        doc_copy.pop('_id')
        result = yield self.col.update(
            {'_id': ObjectId(doc['_id'])},
            {'$set': doc_copy}
        )
        if self.cache is not None:
            self.cache[str(doc['_id'])].update(doc)
        self.signal_manager.send_catch_log(self.signals['updated'], data=doc)
        raise Return(result)

    @coroutine
    def delete(self, doc):
        result = yield self.col.remove({'_id': ObjectId(doc['_id'])})
        if self.cache is not None:
            self.cache.pop(str(doc['_id']), None)
        self.signal_manager.send_catch_log(self.signals['deleted'], data=doc)
        raise Return(result)
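
# A minimal usage sketch (not part of the listing): subscribe to 'created'
# events and insert a document from a Tornado coroutine.  The MongoDB URI and
# the collection it points at are illustrative assumptions; a reachable
# MongoDB instance that ``motor_from_uri`` can resolve to a collection is
# required.
from tornado import gen
from tornado.ioloop import IOLoop


def on_created(data, **kwargs):
    # Receives the inserted document via send_catch_log(..., data=doc).
    print("created: %s" % data.get('_id'))


@gen.coroutine
def demo():
    storage = MongoStorage("mongodb://localhost:27017/arachnado/items")
    storage.subscribe('created', on_created)
    yield storage.create({'url': 'http://example.com', 'status': 'new'})
    docs = yield storage.fetch({'status': 'new'})
    print("fetched %d docs" % len(docs))


IOLoop.current().run_sync(demo)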

class ArachnadoCrawlerProcess(CrawlerProcess):
    """
    CrawlerProcess which sets up a global signals manager, assigns unique ids
    to each spider job, works around some Scrapy issues and provides extra
    stats.
    """
    crawl_ids = itertools.count(start=1)

    def __init__(self, settings=None):
        self.signals = SignalManager(self)
        self.signals.connect(self.on_spider_closed,
                             CrawlerProcessSignals.spider_closed)
        self._finished_jobs = []
        self._paused_jobs = set()
        self.procmon = ProcessStatsMonitor()
        self.procmon.start()
        super(ArachnadoCrawlerProcess, self).__init__(settings or {})

        # don't log DepthMiddleware messages
        # see https://github.com/scrapy/scrapy/issues/1308
        logging.getLogger("scrapy.spidermiddlewares.depth").setLevel(
            logging.INFO)

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        kwargs['crawl_id'] = next(self.crawl_ids)
        crawler = crawler_or_spidercls
        if not isinstance(crawler_or_spidercls, Crawler):
            crawler = self._create_crawler(crawler_or_spidercls)

        # aggregate all crawler signals
        for name in SCRAPY_SIGNAL_NAMES:
            crawler.signals.connect(self._resend_signal,
                                    getattr(signals, name))

        # aggregate signals from crawler EventedStatsCollectors
        if hasattr(crawler.stats, "signals"):
            crawler.stats.signals.connect(self._resend_signal,
                                          stats.stats_changed)

        d = super(ArachnadoCrawlerProcess, self).crawl(crawler_or_spidercls,
                                                       *args, **kwargs)
        return d

    def _create_crawler(self, spidercls):
        if isinstance(spidercls, six.string_types):
            spidercls = self.spider_loader.load(spidercls)
        return ArachnadoCrawler(spidercls, self.settings)

    def stop_job(self, crawl_id):
        """ Stop a single crawl job """
        self.get_crawler(crawl_id).stop()

    def pause_job(self, crawl_id):
        """ Pause a crawling job """
        self._paused_jobs.add(crawl_id)
        self.get_crawler(crawl_id).engine.pause()

    def resume_job(self, crawl_id):
        """ Resume a crawling job """
        self._paused_jobs.remove(crawl_id)
        self.get_crawler(crawl_id).engine.unpause()

    def get_crawler(self, crawl_id):
        for crawler in self.crawlers:
            if getattr(crawler.spider, "crawl_id") == crawl_id:
                return crawler
        raise KeyError("Job is not known: %s" % crawl_id)

    def _resend_signal(self, **kwargs):
        # FIXME: this is a mess. Signal handling should be unified somehow:
        # there shouldn't be two separate code paths
        # for CrawlerProcessSignals and STAT_SIGNALS.
        signal = kwargs['signal']
        if signal in STAT_SIGNALS:
            signal = STAT_SIGNALS[signal]
            kwargs['crawler'] = kwargs.pop('sender').crawler
        else:
            signal = CrawlerProcessSignals.signal(signal)
            kwargs['crawler'] = kwargs.pop('sender')

        kwargs['signal'] = signal
        if signal.supports_defer:
            return self.signals.send_catch_log_deferred(**kwargs)
        else:
            return self.signals.send_catch_log(**kwargs)

    def stop(self):
        """ Terminate the process (exit from application). """
        self.procmon.stop()
        return super(ArachnadoCrawlerProcess, self).stop()

    def on_spider_closed(self, spider, reason):
        # spiders are closed not that often, insert(0, ...) should be fine
        self._finished_jobs.insert(0, {
            'id': spider.crawl_id,
            'job_id': getattr(spider, 'motor_job_id'),
            'seed': spider.domain,
            'status': reason,
            'stats': spider.crawler.stats.get_stats(spider),
            'downloads': self._downloader_stats(spider.crawler)
        })

    # FIXME: methods below are ugly for two reasons:
    # 1. they assume spiders have certain attributes;
    # 2. they try to get crawling status based on auxiliary information.

    def get_jobs(self):
        """ Return a list of active jobs """
        crawlers = [crawler for crawler in self.crawlers
                    if crawler.spider is not None]
        return [{
            'id': crawler.spider.crawl_id,
            'job_id': getattr(crawler.spider, 'motor_job_id'),
            'seed': crawler.spider.domain,
            'status': self._get_crawler_status(crawler),
            'stats': crawler.spider.crawler.stats.get_stats(crawler.spider),
            'downloads': self._downloader_stats(crawler)
            # 'engine_info': dict(get_engine_status(crawler.engine))
        } for crawler in crawlers]

    @classmethod
    def _downloader_stats(cls, crawler):
        downloader = crawler.engine.downloader
        return {
            'active': [cls._request_info(req) for req in downloader.active],
            'slots': sorted([
                cls._slot_info(key, slot)
                for key, slot in downloader.slots.items()
            ], key=operator.itemgetter('key'))
        }

    @classmethod
    def _request_info(cls, request):
        return {'url': request.url, 'method': request.method}

    @classmethod
    def _slot_info(cls, key, slot):
        return {
            'key': key,
            'concurrency': slot.concurrency,
            'delay': slot.delay,
            'lastseen': slot.lastseen,
            'len(queue)': len(slot.queue),
            'transferring': [cls._request_info(req)
                             for req in slot.transferring],
            'active': [cls._request_info(req) for req in slot.active],
        }

    def _get_crawler_status(self, crawler):
        if crawler.spider is None:
            return "unknown"
        if not crawler.crawling:
            return "stopping"
        if int(crawler.spider.crawl_id) in self._paused_jobs:
            return "suspended"
        return "crawling"

    @property
    def jobs(self):
        """ Current crawl state """
        # filter out active jobs which are in fact finished
        finished_ids = {job['id'] for job in self._finished_jobs}
        active_jobs = [job for job in self.get_jobs()
                       if job['id'] not in finished_ids]
        return active_jobs + self._finished_jobs
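
# A minimal usage sketch (not part of the listing): run one spider through the
# process and inspect ``jobs`` afterwards.  ``ExampleSpider`` and the settings
# dict are illustrative assumptions, and the project-specific pieces the class
# relies on (ArachnadoCrawler, CrawlerProcessSignals, SCRAPY_SIGNAL_NAMES,
# STAT_SIGNALS) must be importable for this to run.
import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'
    domain = 'example.com'   # get_jobs()/on_spider_closed() read ``domain``
    motor_job_id = None      # ... and ``motor_job_id`` from the spider
    start_urls = ['http://example.com']

    def parse(self, response):
        yield {'title': response.css('title::text').extract_first()}


process = ArachnadoCrawlerProcess({'LOG_LEVEL': 'INFO'})
process.crawl(ExampleSpider)     # assigns crawl_id=1 and wires up signals
process.start()                  # blocks until the crawl finishes
for job in process.jobs:         # finished jobs recorded by on_spider_closed
    print("%s %s %s" % (job['id'], job['seed'], job['status']))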

class MyselfCrawlerProcess(CrawlerProcess):
    """Manages crawler signals and spider jobs."""

    crawl_ids = itertools.count(start=1)

    def __init__(self, settings=None):
        self.signals = SignalManager(self)
        self.signals.connect(self.on_spider_closed,
                             CrawlerProcessSignals.spider_closed)
        self._finished_jobs = []
        self._paused_jobs = set()
        self.procmon = ProcessStatsMonitor()
        self.procmon.start()
        super(MyselfCrawlerProcess, self).__init__(settings or {})
        logging.getLogger('scrapy.spidermiddlewares.depth').setLevel(
            logging.INFO)

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        kwargs['crawl_id'] = next(self.crawl_ids)
        crawler = crawler_or_spidercls
        if not isinstance(crawler_or_spidercls, Crawler):
            crawler = self._create_crawler(crawler_or_spidercls)

        # aggregate all crawler signals
        for name in SCRAPY_SIGNAL_NAMES:
            crawler.signals.connect(self._resend_signal,
                                    getattr(signals, name))
        if hasattr(crawler.stats, "signals"):
            crawler.stats.signals.connect(self._resend_signal,
                                          stats.stats_changed)

        d = super(MyselfCrawlerProcess, self).crawl(crawler_or_spidercls,
                                                    *args, **kwargs)
        return d

    def _create_crawler(self, spidercls):
        """Create a new crawler."""
        if isinstance(spidercls, six.string_types):
            spidercls = self.spider_loader.load(spidercls)
        # NOTE: this must return a Crawler instance; returning the process
        # class itself here would break crawl().
        return Crawler(spidercls, self.settings)

    def stop_job(self, crawl_id):
        """Stop a crawl job."""
        self.get_crawler(crawl_id).stop()

    def pause_job(self, crawl_id):
        """Pause a crawl job."""
        self._paused_jobs.add(crawl_id)
        self.get_crawler(crawl_id).engine.pause()

    def resume_job(self, crawl_id):
        """Resume a crawl job."""
        self._paused_jobs.remove(crawl_id)
        self.get_crawler(crawl_id).engine.unpause()

    def get_crawler(self, crawl_id):
        """Return the crawler for the given crawl id."""
        for crawler in self.crawlers:
            if getattr(crawler.spider, "crawl_id") == crawl_id:
                return crawler
        raise KeyError("Job is not known: %s" % crawl_id)

    def _resend_signal(self, **kwargs):
        # FIXME: signal and crawler handling is a mess.
        signal = kwargs['signal']
        if signal in STAT_SIGNALS:
            signal = STAT_SIGNALS[signal]
            kwargs['crawler'] = kwargs.pop('sender').crawler
        else:
            signal = CrawlerProcessSignals.signal(signal)
            kwargs['crawler'] = kwargs.pop('sender')

        kwargs['signal'] = signal
        if signal.supports_defer:
            return self.signals.send_catch_log_deferred(**kwargs)
        else:
            return self.signals.send_catch_log(**kwargs)

    def stop(self):
        """Stop the crawl process."""
        self.procmon.stop()
        return super(MyselfCrawlerProcess, self).stop()

    def on_spider_closed(self, spider, reason):
        """Record the job when a spider closes."""
        self._finished_jobs.insert(0, {
            'id': spider.crawl_id,
            'job_id': getattr(spider, 'motor_job_id'),
            'seed': spider.domain,
            'status': reason,
            'stats': spider.crawler.stats.get_stats(spider),
            'downloads': self._downloader_stats(spider.crawler)
        })

    def get_jobs(self):
        """Return the list of running jobs."""
        crawlers = [crawler for crawler in self.crawlers
                    if crawler.spider is not None]
        return [{
            'id': crawler.spider.crawl_id,
            'job_id': getattr(crawler.spider, 'motor_job_id'),
            'seed': crawler.spider.domain,
            'status': self._get_crawler_status(crawler),
            'stats': crawler.spider.crawler.stats.get_stats(crawler.spider),
            'downloads': self._downloader_stats(crawler)
        } for crawler in crawlers]

    @classmethod
    def _downloader_stats(cls, crawler):
        """Downloader stats."""
        downloader = crawler.engine.downloader
        return {
            'active': [cls._request_info(req) for req in downloader.active],
            'slots': sorted([
                cls._slot_info(key, slot)
                for key, slot in downloader.slots.items()
            ], key=operator.itemgetter('key'))
        }

    @classmethod
    def _request_info(cls, request):
        """Request info."""
        return {'url': request.url, 'method': request.method}

    @classmethod
    def _slot_info(cls, key, slot):
        """Slot info."""
        return {
            'key': key,
            'concurrency': slot.concurrency,
            'delay': slot.delay,
            'lastseen': slot.lastseen,
            'len(queue)': len(slot.queue),
            'transferring': [cls._request_info(req)
                             for req in slot.transferring],
            'active': [cls._request_info(req) for req in slot.active]
        }

    def _get_crawler_status(self, crawler):
        """Return the crawler's running status."""
        if crawler.spider is None:
            return "unknown"
        if not crawler.crawling:
            return "stopping"
        if int(crawler.spider.crawl_id) in self._paused_jobs:
            return "suspended"
        return "crawling"

    @property
    def jobs(self):
        """State of finished and active crawls."""
        finished_ids = {job['id'] for job in self._finished_jobs}
        active_jobs = [job for job in self.get_jobs()
                       if job['id'] not in finished_ids]
        return active_jobs + self._finished_jobs