def stats_spider_opened(self, spider):
    stats.set_value('start_time', datetime.datetime.utcnow(), spider=spider)
    stats.set_value('envinfo/host', stats.get_value('envinfo/host'), spider=spider)
    stats.inc_value('spider_count/opened')

def stats_spider_closing(self, spider, reason):
    stats.set_value('finish_time', datetime.datetime.utcnow(), spider=spider)
    stats.set_value('finish_status', 'OK' if reason == 'finished' else reason, spider=spider)
    stats.inc_value('spider_count/%s' % reason, spider=spider)

def process_item(self, spider, item):
    sampled = stats.get_value("items_sampled", 0, spider=spider)
    if sampled < self.items_per_spider:
        self.items[item.guid] = item
        sampled += 1
        stats.set_value("items_sampled", sampled, spider=spider)
        log.msg("Sampled %s" % item, spider=spider, level=log.INFO)
        if self.close_spider and sampled == self.items_per_spider:
            scrapyengine.close_spider(spider)
    return item

def _check_limit(self):
    if self.get_virtual_size() > self.limit:
        stats.set_value('memusage/limit_reached', 1)
        mem = self.limit / 1024 / 1024
        log.msg("Memory usage exceeded %dM. Shutting down Scrapy..." % mem, level=log.ERROR)
        if self.notify_mails:
            subj = "%s terminated: memory usage exceeded %dM at %s" % \
                (self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
            self._send_report(self.notify_mails, subj)
            stats.set_value('memusage/limit_notified', 1)
        self.crawler.stop()

def process_response(self, request, response, spider):
    """
    A response is leaving the Downloader. It was either retrieved from the
    web or from another middleware. Decide if we would like to store it in
    the history.
    """
    if self.store_if(spider, request, response):
        self.storage.store_response(spider, request, response)
        stats.set_value("history/cached", True, spider=spider)
    return response

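# store_if() is not shown in this snippet; a minimal sketch of what such a
# policy might look like, inferred from the call site above (the name and
# signature come from the snippet, the body is an assumption):
def store_if(self, spider, request, response):
    # e.g. only keep successful responses in the history storage
    return response.status == 200
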
def _check_warning(self):
    if self.warned:  # warn only once
        return
    if self.get_virtual_size() > self.warning:
        stats.set_value("memusage/warning_reached", 1)
        mem = self.warning / 1024 / 1024
        log.msg("Memory usage reached %dM" % mem, level=log.WARNING)
        if self.notify_mails:
            subj = "%s warning: memory usage reached %dM at %s" % \
                (settings["BOT_NAME"], mem, socket.gethostname())
            self._send_report(self.notify_mails, subj)
            stats.set_value("memusage/warning_notified", 1)
        self.warned = True

def _check_limit(self):
    if self.get_virtual_size() > self.limit:
        stats.set_value('memusage/limit_reached', 1)
        mem = self.limit / 1024 / 1024
        log.msg("Memory usage exceeded %dM. Shutting down Scrapy..." % mem, level=log.ERROR)
        if self.notify_mails:
            subj = "%s terminated: memory usage exceeded %dM at %s" % \
                (settings['BOT_NAME'], mem, socket.gethostname())
            self._send_report(self.notify_mails, subj)
            stats.set_value('memusage/limit_notified', 1)
        crawler.stop()

def _filter(request):
    if isinstance(request, Request):
        depth = response.request.meta['depth'] + 1
        request.meta['depth'] = depth
        if self.maxdepth and depth > self.maxdepth:
            log.msg("Ignoring link (depth > %d): %s" % (self.maxdepth, request.url),
                    level=log.DEBUG, spider=spider)
            return False
        elif self.stats:
            stats.inc_value('request_depth_count/%s' % depth, spider=spider)
            if depth > stats.get_value('request_depth_max', 0, spider=spider):
                stats.set_value('request_depth_max', depth, spider=spider)
    return True

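# `_filter` above is a closure: `response`, `spider`, and `self` come from an
# enclosing scope. In Scrapy's DepthMiddleware it is defined inside
# process_spider_output, roughly like this (a minimal sketch of the wiring):
def process_spider_output(self, response, result, spider):
    def _filter(request):
        ...  # body as in the snippet above
    return (r for r in result or () if _filter(r))
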
def _check_warning(self):
    if self.warned:  # warn only once
        return
    if self.get_virtual_size() > self.warning:
        stats.set_value('memusage/warning_reached', 1)
        mem = self.warning / 1024 / 1024
        log.msg("Memory usage reached %dM" % mem, level=log.WARNING)
        if self.notify_mails:
            subj = "%s warning: memory usage reached %dM at %s" % \
                (self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
            self._send_report(self.notify_mails, subj)
            stats.set_value('memusage/warning_notified', 1)
        self.warned = True

def engine_started(self):
    stats.set_value('memusage/startup', self.get_virtual_size())
    self.tasks = []
    tsk = task.LoopingCall(self.update)
    self.tasks.append(tsk)
    tsk.start(60.0, now=True)
    if self.limit:
        tsk = task.LoopingCall(self._check_limit)
        self.tasks.append(tsk)
        tsk.start(60.0, now=True)
    if self.warning:
        tsk = task.LoopingCall(self._check_warning)
        self.tasks.append(tsk)
        tsk.start(60.0, now=True)

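# get_virtual_size() is assumed to report the process's memory footprint.
# A minimal sketch using the stdlib resource module, mirroring what later
# Scrapy versions do (the snippet's own implementation is not shown):
import resource
import sys

def get_virtual_size(self):
    size = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    if sys.platform != 'darwin':
        size *= 1024  # ru_maxrss is KiB on Linux, bytes on macOS
    return size
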
def __init__(self): stats.set_value("envinfo/user", getpass.getuser()) stats.set_value("envinfo/host", socket.gethostname()) stats.set_value("envinfo/logfile", settings["LOG_FILE"]) stats.set_value("envinfo/pid", os.getpid()) dispatcher.connect(self.stats_spider_opened, signal=stats_spider_opened) dispatcher.connect(self.stats_spider_closing, signal=stats_spider_closing) dispatcher.connect(self.item_scraped, signal=signals.item_scraped) dispatcher.connect(self.item_passed, signal=signals.item_passed) dispatcher.connect(self.item_dropped, signal=signals.item_dropped)
def __init__(self):
    stats.set_value('envinfo/user', getpass.getuser())
    stats.set_value('envinfo/host', socket.gethostname())
    stats.set_value('envinfo/logfile', settings['LOG_FILE'])
    stats.set_value('envinfo/pid', os.getpid())
    dispatcher.connect(self.stats_spider_opened, signal=signals.stats_spider_opened)
    dispatcher.connect(self.stats_spider_closing, signal=signals.stats_spider_closing)
    dispatcher.connect(self.item_scraped, signal=signals.item_scraped)
    dispatcher.connect(self.item_passed, signal=signals.item_passed)
    dispatcher.connect(self.item_dropped, signal=signals.item_dropped)

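# The item_* handlers connected above are not part of this snippet. Minimal
# sketches of what they might look like (the handler signatures and counter
# names are assumptions, not taken from the original code):
def item_scraped(self, item, spider):
    stats.inc_value('item_scraped_count', spider=spider)

def item_passed(self, item, spider):
    stats.inc_value('item_passed_count', spider=spider)

def item_dropped(self, item, spider, exception):
    stats.inc_value('item_dropped_count', spider=spider)
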
def engine_stopped(self):
    if self.libxml2:
        self.libxml2.cleanupParser()
        stats.set_value('memdebug/libxml2_leaked_bytes', self.libxml2.debugMemory(1))
    gc.collect()
    stats.set_value('memdebug/gc_garbage_count', len(gc.garbage))
    if self.trackrefs:
        for cls, wdict in live_refs.iteritems():
            if not wdict:
                continue
            stats.set_value('memdebug/live_refs/%s' % cls.__name__, len(wdict))

def new_callback(*args, **kwargs):
    tbefore = time()
    mbefore = self._memusage()
    r = function(*args, **kwargs)
    mafter = self._memusage()
    ct = time() - tbefore
    tcc = stats.get_value('profiling/total_callback_time', 0, spider=spider)
    sct = stats.get_value('profiling/slowest_callback_time', 0, spider=spider)
    stats.set_value('profiling/total_callback_time', tcc + ct, spider=spider)
    if ct > sct:
        stats.set_value('profiling/slowest_callback_time', ct, spider=spider)
        stats.set_value('profiling/slowest_callback_name', function.__name__,
                        spider=spider)
        stats.set_value('profiling/slowest_callback_url', args[0].url,
                        spider=spider)
    if self._memusage:
        stats.inc_value('profiling/total_mem_allocated_in_callbacks',
                        count=mafter - mbefore, spider=spider)
    return r

def engine_stopped(self):
    if self.libxml2:
        self.libxml2.cleanupParser()
        stats.set_value('memdebug/libxml2_leaked_bytes', self.libxml2.debugMemory(1))
    gc.collect()
    stats.set_value('memdebug/gc_garbage_count', len(gc.garbage))
    if settings.getbool('TRACK_REFS'):
        for cls, wdict in live_refs.iteritems():
            if not wdict:
                continue
            stats.set_value('memdebug/live_refs/%s' % cls.__name__, len(wdict))

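# live_refs comes from scrapy.utils.trackref: a defaultdict mapping each
# tracked class to a WeakKeyDictionary of its live instances. A condensed
# sketch of how it is populated:
from collections import defaultdict
from time import time
from weakref import WeakKeyDictionary

live_refs = defaultdict(WeakKeyDictionary)

class object_ref(object):
    """Classes inheriting from this get their live instances tracked."""
    __slots__ = ()

    def __new__(cls, *args, **kwargs):
        obj = object.__new__(cls)
        live_refs[cls][obj] = time()  # weak keys, so entries vanish on GC
        return obj
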
def stats_spider_opened(self, spider):
    stats.set_value('start_time', datetime.datetime.utcnow(), spider=spider)

def stats_spider_closing(self, spider, reason): stats.set_value("finish_time", datetime.datetime.utcnow(), spider=spider) stats.set_value("finish_status", "OK" if reason == "finished" else reason, spider=spider) stats.inc_value("spider_count/%s" % reason, spider=spider)
def stats_spider_opened(self, spider): stats.set_value("start_time", datetime.datetime.utcnow(), spider=spider) stats.set_value("envinfo/host", stats.get_value("envinfo/host"), spider=spider) stats.inc_value("spider_count/opened")
from scrapy.spider import BaseSpider
from scrapy.stats import stats
from scrapy.shell import inspect_response as inspect

# python stdlib modules
from random import random
from urllib import urlencode

# local modules
from crange import Crange

# michbar modules
import michbar.settings
from michbar.items import MichbarItem

stats.set_value('prefixes', {})


class MichBarSpider(BaseSpider):
    name = 'MichBar'
    base_url = 'http://www.michbar.org/memberdirectory'
    index_url = base_url + '/results.cfm'
    detail_url = base_url + '/detail.cfm'
    allowed_domains = (
        'michbar.org',
    )

    def start_requests(self):
        prefix_stats = stats.get_value('prefixes')
        alpha = Crange()

def stats_spider_closing(self, spider, reason):
    stats.set_value('finish_time', datetime.datetime.utcnow(), spider=spider)
    stats.set_value('finish_reason', reason, spider=spider)

def __init__(self):
    self.maxdepth = settings.getint('DEPTH_LIMIT')
    self.stats = settings.getbool('DEPTH_STATS')
    if self.stats and self.maxdepth:
        stats.set_value('envinfo/request_depth_limit', self.maxdepth)

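# The two settings read above are standard Scrapy settings; in a project's
# settings.py they would look like:
DEPTH_LIMIT = 5     # drop requests nested more than 5 links deep
DEPTH_STATS = True  # record request_depth_count/* and request_depth_max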