def __init__(self, spidercls, settings=None):
    """Wire up a crawler for *spidercls*: settings, signals, stats,
    logging, log formatter and extensions.

    :param spidercls: the spider class to crawl (a class, not an instance)
    :param settings: a ``Settings`` instance, a plain dict, or ``None``
    """
    # Normalise dict / None into a Settings object.
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)
    self.spidercls = spidercls
    # Work on a private copy so the caller's settings are untouched.
    self.settings = settings.copy()
    # Merge the spider class's custom_settings into the copy.
    self.spidercls.update_settings(self.settings)
    # Log only the settings that differ from the defaults.
    d = dict(overridden_settings(self.settings))
    logger.info("Overridden settings: %(settings)r", {'settings': d})
    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)
    # Counts log records per level emitted during the crawl.
    handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)
    if get_scrapy_root_handler() is not None:
        # scrapy root handler already installed: update it with new settings
        install_scrapy_root_handler(self.settings)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    # Instantiate all enabled extensions.
    self.extensions = ExtensionManager.from_crawler(self)
    # Settings are immutable from this point on.
    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None
class EventedStatsCollector(StatsCollector):
    """Stats Collector which allows to subscribe to value changes.

    Update notifications are throttled: the interval between updates is
    no shorter than ``accumulate_time``. It is assumed that stat keys
    are never deleted.
    """
    accumulate_time = 0.1  # value is in seconds

    def __init__(self, crawler):
        super(EventedStatsCollector, self).__init__(crawler)
        self.signals = SignalManager(self)
        self._changes = {}
        interval_ms = self.accumulate_time * 1000
        self._task = PeriodicCallback(self.emit_changes, interval_ms)
        self._task.start()
        # FIXME: this is ugly
        self.crawler = crawler  # used by ArachnadoCrawlerProcess

    def emit_changes(self):
        # Swap out the accumulated changes atomically, then notify.
        if not self._changes:
            return
        pending = self._changes
        self._changes = {}
        self.signals.send_catch_log(stats_changed, changes=pending)

    def open_spider(self, spider):
        super(EventedStatsCollector, self).open_spider(spider)
        self._task.start()

    def close_spider(self, spider, reason):
        super(EventedStatsCollector, self).close_spider(spider, reason)
        self._task.stop()
def __init__(self, **kwargs):
    """Configure the spider from a ``config=*.conf`` seed file.

    NOTE(review): on a missing 'config' the error is only printed and
    execution continues, so ``kwargs['config']`` below raises KeyError.
    """
    if not 'config' in kwargs:
        err = 'failed to find seed file (config=*.conf)'
        print err
    # if not 'keywords' in kwargs:
    #     err = 'failed to find seed file (keywords=*.dat)'
    #     print err
    config = kwargs['config']
    # self.keywords = kwargs['keywords']
    self.load_conf(config)
    # Throttle requests according to the sleep flag read from the conf.
    if self.Sleep_Flag == 'SEARCH_ENGINE_SLEEP' or self.Sleep_Flag == 'true' or not self.Sleep_Flag:
        settings.set('RANDOMIZE_DOWNLOAD_DELAY', True, priority='cmdline')
        settings.set('DOWNLOAD_DELAY', float(self.SE_Sleep_Base), priority='cmdline')
    else:
        settings.set('RANDOMIZE_DOWNLOAD_DELAY', False, priority='cmdline')
    # Log to "<conf name>.log".
    log_filename = self.conf_name.replace('.conf', '') + '.log'
    settings.set('LOG_FILE', log_filename, priority='cmdline')
    # redis key
    self.meta_next_url = meta_redis_key()
    # initialise redis
    self.init_redis()
    self.redis_keyword = get_redis_key(self.conf_name)
    # register signal handlers
    sig = SignalManager(dispatcher.Any)
    sig.connect(self.idle, signal=signals.spider_idle)
    self.metatime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # Remember the urls fetched this run, to detect duplicates against the
    # previous run: {keyword: md5(url)}
    self.urlmd5 = dict()
    self.log_writer = open('log.dat', 'a+')
    # Extracts year/month/day fragments embedded in article URLs.
    self.date_from_url_re = re.compile("[-_/][a-zA-Z]*[-_]?(?P<year>(20)?([0-1][0-9]))([-_/])?(?P<m>(10|11|12|(0?[1-9])){1})([-_/])?(?P<day>(10|20|30|31|([0-2]?[1-9])){1})([-_/])")
def __init__(self, spidercls, settings=None):
    """Wire up a crawler for *spidercls*: settings, signals, stats,
    logging, log formatter and extensions.

    :param spidercls: the spider class to crawl (a class, not an instance)
    :param settings: a ``Settings`` instance, a plain dict, or ``None``
    """
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)
    self.spidercls = spidercls
    self.settings = settings.copy()
    self.spidercls.update_settings(self.settings)
    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)
    # BUG FIX: read LOG_LEVEL from self.settings (which has the spider's
    # custom_settings merged in by update_settings above), not from the
    # pre-merge `settings` argument -- otherwise a LOG_LEVEL set in the
    # spider's custom_settings was silently ignored here.
    handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)
    # Settings become immutable from here on.
    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None
def __init__(self, spidercls, settings):
    """Wire up a crawler for *spidercls* with the given *settings*.

    :param spidercls: the spider class to crawl
    :param settings: a ``Settings`` instance or a plain dict
    """
    if isinstance(settings, dict):
        settings = Settings(settings)
    self.spidercls = spidercls
    self.settings = settings.copy()
    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)
    handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)
    # BUG FIX: keep a reference to the remover on the crawler instance.
    # Signal connections hold weak references to their receivers, so a
    # bare lambda passed straight into connect() can be garbage collected
    # immediately and the log handler would then never be removed when
    # engine_stopped fires.
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)
    # Merge the spider's custom_settings, then freeze.
    self.spidercls.update_settings(self.settings)
    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None
class EventedStatsCollector(StatsCollector):
    """A StatsCollector whose value changes can be subscribed to.

    Notifications are throttled: at most one update per
    ``accumulate_time`` seconds. Stat keys are assumed never deleted.
    """
    accumulate_time = 0.1  # seconds between change notifications

    def __init__(self, crawler):
        super(EventedStatsCollector, self).__init__(crawler)
        # FIXME: this is ugly
        self.crawler = crawler  # used by ArachnadoCrawlerProcess
        self.signals = SignalManager(self)
        self._changes = {}
        self._task = PeriodicCallback(self.emit_changes,
                                      self.accumulate_time * 1000)
        self._task.start()

    def emit_changes(self):
        if self._changes:
            # Grab the accumulated batch and reset the buffer in one step.
            changes, self._changes = self._changes, {}
            self.signals.send_catch_log(stats_changed, changes=changes)

    def open_spider(self, spider):
        super(EventedStatsCollector, self).open_spider(spider)
        self._task.start()

    def close_spider(self, spider, reason):
        super(EventedStatsCollector, self).close_spider(spider, reason)
        self._task.stop()
def __init__(self, spidercls, settings=None, init_reactor: bool = False):
    """Wire up a crawler for *spidercls*.

    :param spidercls: the spider class to crawl (a class, not an instance)
    :param settings: a ``Settings`` instance, a plain dict, or ``None``
    :param init_reactor: when True, install/verify the Twisted reactor
        configured by the TWISTED_REACTOR setting
    :raises ValueError: if *spidercls* is a Spider instance
    """
    if isinstance(spidercls, Spider):
        raise ValueError(
            'The spidercls argument must be a class, not an object')
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)
    self.spidercls = spidercls
    # Private copy, with the spider class's custom_settings merged in.
    self.settings = settings.copy()
    self.spidercls.update_settings(self.settings)
    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)
    # Counts log records per level emitted during the crawl.
    handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)
    # Log only the settings that differ from the defaults.
    d = dict(overridden_settings(self.settings))
    logger.info("Overridden settings:\n%(settings)s",
                {'settings': pprint.pformat(d)})
    if get_scrapy_root_handler() is not None:
        # scrapy root handler already installed: update it with new settings
        install_scrapy_root_handler(self.settings)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.request_fingerprinter = create_instance(
        load_object(self.settings['REQUEST_FINGERPRINTER_CLASS']),
        settings=self.settings,
        crawler=self,
    )
    reactor_class = self.settings.get("TWISTED_REACTOR")
    if init_reactor:
        # this needs to be done after the spider settings are merged,
        # but before something imports twisted.internet.reactor
        if reactor_class:
            install_reactor(reactor_class, self.settings["ASYNCIO_EVENT_LOOP"])
        else:
            # Importing the reactor installs Twisted's default one.
            from twisted.internet import reactor  # noqa: F401
        log_reactor_info()
    if reactor_class:
        # Fail fast if a different reactor was installed elsewhere.
        verify_installed_reactor(reactor_class)
    self.extensions = ExtensionManager.from_crawler(self)
    # Settings become immutable from this point on.
    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None
def __init__(self, crawler):
    """Set up the signal manager and the periodic change-emitting task."""
    super(EventedStatsCollector, self).__init__(crawler)
    # FIXME: this is ugly
    self.crawler = crawler  # used by ArachnadoCrawlerProcess
    self.signals = SignalManager(self)
    self._changes = {}
    period_ms = self.accumulate_time * 1000
    self._task = PeriodicCallback(self.emit_changes, period_ms)
    self._task.start()
def __init__(self, spidercls, settings=None):
    ## A Crawler must be instantiated with a scrapy.spiders.Spider subclass
    ## and a scrapy.settings.Settings object
    if isinstance(spidercls, Spider):
        raise ValueError(
            'The spidercls argument must be a class, not an object')
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)
    ## The user-defined spider class
    self.spidercls = spidercls
    ## The crawler's settings manager: the entry point through which
    ## extensions and middlewares access this crawler's Scrapy settings
    self.settings = settings.copy()
    ## Merge the spider class's optional custom_settings into the copy,
    ## at 'spider' priority
    self.spidercls.update_settings(self.settings)
    ## Only the overridden settings, converted to a dict for logging
    d = dict(overridden_settings(self.settings))
    logger.info("Overridden settings: %(settings)r", {'settings': d})
    ## The crawler's signal manager, used by extensions and middlewares to
    ## hook themselves into Scrapy functionality
    self.signals = SignalManager(self)
    ## The crawler's stats collector, used to record extension/middleware
    ## behaviour and to access data collected by other extensions
    self.stats = load_object(self.settings['STATS_CLASS'])(self)
    ## Counts log records per level emitted while the crawler runs
    handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)
    if get_scrapy_root_handler() is not None:
        # scrapy root handler already installed: update it with new settings
        install_scrapy_root_handler(self.settings)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    ## Register __remove_handler for the engine_stopped signal: when the
    ## engine stops, the counter handler is removed from the root logger
    self.signals.connect(self.__remove_handler, signals.engine_stopped)
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    ## Instantiate the log formatter
    self.logformatter = lf_cls.from_crawler(self)
    ## Extension manager that tracks the enabled extensions
    self.extensions = ExtensionManager.from_crawler(self)
    self.settings.freeze()
    ## Crawl-state flag
    self.crawling = False
    ## The spider currently being crawled
    self.spider = None
    ## The execution engine, which coordinates crawling logic between the
    ## scheduler, the downloader and the spiders
    self.engine = None
def __init__(self, settings=None):
    """Set up signals and process monitoring, then delegate to the base
    CrawlerProcess."""
    self.signals = SignalManager(self)
    self.signals.connect(self.on_spider_closed,
                         CrawlerProcessSignals.spider_closed)
    self._finished_jobs = []
    self._paused_jobs = set()
    # Start emitting process resource stats right away.
    self.procmon = ProcessStatsMonitor()
    self.procmon.start()
    super(MyselfCrawlerProcess, self).__init__(settings or {})
    # Silence per-request DepthMiddleware debug chatter.
    depth_logger = logging.getLogger('scrapy.spidermiddlewares.depth')
    depth_logger.setLevel(logging.INFO)
class ProcessStatsMonitor(object):
    """Emits resource-usage stats for the current process periodically."""

    signal_updated = object()

    def __init__(self, interval=1.0):
        self.signals = SignalManager(self)
        self.process = psutil.Process(os.getpid())
        self.interval = interval
        self._task = PeriodicCallback(self._emit, self.interval * 1000)
        self._recent = {}

    def start(self):
        # (yappi profiling hooks used to live here; see VCS history)
        self._task.start()

    def stop(self):
        self._task.stop()

    def get_recent(self):
        """Return the most recent stats snapshot."""
        return self._recent

    def _emit(self):
        proc = self.process
        cpu = proc.cpu_times()
        mem = proc.memory_info()
        snapshot = {
            'ram_percent': proc.memory_percent(),
            'ram_rss': mem.rss,
            'ram_vms': mem.vms,
            'cpu_percent': proc.cpu_percent(),
            'cpu_time_user': cpu.user,
            'cpu_time_system': cpu.system,
            'num_fds': proc.num_fds(),
            'context_switches': proc.num_ctx_switches(),
            'num_threads': proc.num_threads(),
            'server_time': int(time.time() * 1000),
        }
        self._recent = snapshot
        self.signals.send_catch_log(self.signal_updated, stats=snapshot)
class ProcessStatsMonitor(object):
    """Publishes CPU/RAM/thread stats for this process at a fixed rate."""

    signal_updated = object()

    def __init__(self, interval=1.0):
        self.signals = SignalManager(self)
        self.interval = interval
        self.process = psutil.Process(os.getpid())
        self._recent = {}
        self._task = PeriodicCallback(self._emit, self.interval * 1000)

    def start(self):
        # (commented-out yappi profiling removed; see VCS history)
        self._task.start()

    def stop(self):
        self._task.stop()

    def get_recent(self):
        """Latest stats snapshot produced by :meth:`_emit`."""
        return self._recent

    def _emit(self):
        cpu_times = self.process.cpu_times()
        ram_usage = self.process.memory_info()
        stats = dict(
            ram_percent=self.process.memory_percent(),
            ram_rss=ram_usage.rss,
            ram_vms=ram_usage.vms,
            cpu_percent=self.process.cpu_percent(),
            cpu_time_user=cpu_times.user,
            cpu_time_system=cpu_times.system,
            num_fds=self.process.num_fds(),
            context_switches=self.process.num_ctx_switches(),
            num_threads=self.process.num_threads(),
            server_time=int(time.time() * 1000),
        )
        self._recent = stats
        self.signals.send_catch_log(self.signal_updated, stats=stats)
class ProcessStatsMonitor(object):
    """Monitor process status; publish a stats snapshot once per interval."""

    signal_updated = object()

    def __init__(self, interval=1.0):
        self.signals = SignalManager(self)
        self.process = psutil.Process(os.getpid())
        self.interval = interval
        self._task = PeriodicCallback(self._emit, self.interval * 1000)
        self._recent = {}

    def start(self):
        """Start the periodic stats-emitting task."""
        self._task.start()

    def stop(self):
        """Stop the periodic stats-emitting task."""
        self._task.stop()

    def get_recent(self):
        """Return the most recent stats snapshot."""
        return self._recent

    def _emit(self):
        """Collect process attributes and publish them via signal_updated."""
        cpu_times = self.process.cpu_times()
        ram_usage = self.process.memory_info()
        stats = {
            # RAM usage percentage
            'ram_percent': self.process.memory_percent(),
            # resident set size
            'ram_rss': ram_usage.rss,
            # virtual memory size
            'ram_vms': ram_usage.vms,
            # CPU usage percentage
            'cpu_percent': self.process.cpu_percent(),
            # user-mode CPU time
            'cpu_time_user': cpu_times.user,
            # system-mode CPU time
            'cpu_time_system': cpu_times.system,
            # context switches
            'context_switches': self.process.num_ctx_switches(),
            # thread count
            'num_threads': self.process.num_threads(),
            # timestamp in milliseconds
            'server_time': int(time.time() * 1000)
        }
        # Most recent system-status snapshot.
        self._recent = stats
        self.signals.send_catch_log(self.signal_updated, stats=stats)
def __init__(self, spidercls, settings=None):
    # The spidercls argument must be a class, not an instance.
    if isinstance(spidercls, Spider):
        raise ValueError(
            'The spidercls argument must be a class, not an object')
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)
    self.spidercls = spidercls
    # Work on a private copy of the settings.
    self.settings = settings.copy()
    # Merge the spider class's custom_settings into the copy.
    self.spidercls.update_settings(self.settings)
    # Set up the signal manager.
    self.signals = SignalManager(self)
    # Set up the stats collector.
    self.stats = load_object(self.settings['STATS_CLASS'])(self)
    handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)
    # Log only the settings that differ from the defaults.
    d = dict(overridden_settings(self.settings))
    logger.info("Overridden settings:\n%(settings)s",
                {'settings': pprint.pformat(d)})
    # Update the already-installed scrapy root log handler, if any.
    if get_scrapy_root_handler() is not None:
        # scrapy root handler already installed: update it with new settings
        install_scrapy_root_handler(self.settings)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    # (signal connections alone would not keep the lambda alive).
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)
    # Load the log formatter class and instantiate it.
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    # Instantiate the enabled extensions.
    self.extensions = ExtensionManager.from_crawler(self)
    # Settings are fully initialised: freeze them against modification.
    self.settings.freeze()
    # Crawl has not started yet.
    self.crawling = False
    # Populated later by crawl().
    self.spider = None
    self.engine = None
def __init__(self, settings=None):
    """Set up signals and process monitoring before delegating to the
    base CrawlerProcess."""
    self.signals = SignalManager(self)
    self.signals.connect(self.on_spider_closed,
                         CrawlerProcessSignals.spider_closed)
    self._finished_jobs = []
    self._paused_jobs = set()
    self.procmon = ProcessStatsMonitor()
    self.procmon.start()
    super(ArachnadoCrawlerProcess, self).__init__(settings or {})
    # don't log DepthMiddleware messages
    # see https://github.com/scrapy/scrapy/issues/1308
    depth_logger = logging.getLogger("scrapy.spidermiddlewares.depth")
    depth_logger.setLevel(logging.INFO)
def __init__(self):
    """Set up DB schema, read/write thread pools, signal handling and the
    write queue."""
    from twisted.internet import reactor  # imported here, inside __init__
    self.reactor = reactor
    engine = get_engine()
    create_schema(engine)
    self.read_pool = ThreadPool(
        minthreads=1, maxthreads=16, name="ReadPool")
    self.write_pool = ThreadPool(
        minthreads=1, maxthreads=1, name="WritePool")
    self.read_pool.start()
    self.write_pool.start()
    # BUG FIX: the original assigned the return value of connect() to
    # self.signals, not the SignalManager itself, so self.signals did not
    # hold the manager. Keep the manager, then connect.
    self.signals = SignalManager(dispatcher.Any)
    self.signals.connect(self.stop_threadpools, spider_closed)
    # defaultdict accepts the factory directly; no lambda wrapper needed.
    self.counters = defaultdict(Counter)
    self.cache = defaultdict(dict)
    self.write_queue = Queue()
    self.writelock = False  # Write queue mutex
def __init__(self, url=None, out=None, base=None, only=None, *args, **kwargs):
    """Validate the -a url/out/base/only spider arguments and derive
    start_urls and output directories.

    :raises Exception: when neither/both of out and base are given, or
        when neither url nor base is given.
    """
    super(PlayGrabberSpider, self).__init__(*args, **kwargs)
    SignalManager(dispatcher.Any).connect(self.closed_handler,
                                          signal=signals.spider_closed)
    # PEP 8: compare against None with `is` / `is not`, never ==.
    if out is None and base is None:
        raise Exception(
            'Must provide argument "-a out=..." or "-a base=..."')
    if out is not None and base is not None:
        raise Exception(
            'Cannot provide both argument "-a out=..." and "-a base=..."')
    if url is None and base is None:
        raise Exception(
            'Must provide argument "-a url=..." or "-a base=..."')
    if url:
        self.start_urls = [url]
    else:
        self.start_urls = self.find_shows_in_base(base)
    # 'only' means: handle just the given url, do not crawl further.
    if only:
        self.dont_crawl = True
    else:
        self.dont_crawl = False
    self.output_dir = out
    self.output_base_dir = base
def __init__(self, spider):
    """Set up DB schema, thread pools, reporter and the write queue for
    *spider*."""
    from twisted.internet import reactor  # imported here, inside __init__
    self.spider = spider  # used for logging for now
    self.reactor = reactor  # used for thread pools
    engine = get_engine()
    create_schema(engine)
    self.thread_pool = ThreadPool(
        minthreads=1, maxthreads=13, name="ReadPool")
    # There should be only one thread in the write pool.
    # Never increase the maxthreads value.
    self.write_pool = ProfiledThreadPool(
        minthreads=1, maxthreads=1, name="WritePool")
    self.thread_pool.start()
    self.write_pool.start()
    # BUG FIX: the original stored the return value of connect() (not the
    # SignalManager) in self.signals. Keep the manager, then connect.
    self.signals = SignalManager(dispatcher.Any)
    self.signals.connect(self.stop_threadpools, spider_closed)
    self.reporter = Reporter()  # Reporter is used for statistics collection
    self.counters = self.reporter.counters
    # defaultdict accepts the factory directly; no lambda wrapper needed.
    self.cache = defaultdict(dict)
    self.write_queue = Queue()
    self.writelock = False  # Write queue mutex
def __init__(self, settings):
    """Bound the number of concurrent Selenium drivers via a semaphore
    and keep idle drivers in a LIFO pool."""
    # self.options = settings.get('SELENIUM_OPTIONS', {})
    limit = settings.get('SELENIUM_MAXRUN', 10)
    self.sem = defer.DeferredSemaphore(limit)
    self.queue = queue.LifoQueue(limit)
    SignalManager(dispatcher.Any).connect(self._close,
                                          signal=signals.spider_closed)
def __init__(self, settings): self.options = settings.get('PHANTOMJS_OPTIONS', {}) # 默认空 max_run = settings.get('PHANTOMJS_MAXRUN', 10) # PhantomJS 可以同时运行最大的个数, 默认10 self.sem = defer.DeferredSemaphore(max_run) self.queue = Queue.LifoQueue(maxsize=max_run) # LifoQueue 后进先出队列 SignalManager(dispatcher.Any).connect(receiver=self._close, signal=signals.spider_closed)
def __init__(self, driver_path='C:\\Users\\23607\\Desktop\\py\\chromedriver.exe'):
    """Start a Chrome webdriver and hook spider_closed.

    The chromedriver location was hard-coded; it is now a parameter with
    the old path as its default, so existing callers are unaffected.
    """
    self.browser = webdriver.Chrome(executable_path=driver_path)
    super(LagouSpider, self).__init__()
    # Close the browser via self.spider_closed when the spider closes.
    SignalManager(dispatcher.Any).connect(self.spider_closed,
                                          signal=signals.spider_closed)
def __init__(self, settings):
    """Pool of PhantomJS drivers with a fallback download handler."""
    self.options = settings.get('PHANTOMJS_OPTIONS', {})
    limit = settings.get('PHANTOMJS_MAXRUN', 10)
    self.sem = defer.DeferredSemaphore(limit)
    self.queue = Queue.LifoQueue(limit)
    self.create_phantomjs_count = 0
    # Non-PhantomJS requests are delegated to this handler.
    self._fallback_handler = load_object(FALLBACK_HANDLER)(settings)
    SignalManager(dispatcher.Any).connect(self._close,
                                          signal=signals.spider_closed)
def __init__(self, crawler):
    """Initialise base collector plus the throttled change-notification
    machinery."""
    super(EventedStatsCollector, self).__init__(crawler)
    self.signals = SignalManager(self)
    self._changes = {}
    self._task = PeriodicCallback(self.emit_changes,
                                  self.accumulate_time * 1000)
    self._task.start()
    # FIXME: this is ugly
    self.crawler = crawler  # used by ArachnadoCrawlerProcess
def __init__(self, spidercls, settings=None):
    # Normalise dict / None into a Settings instance.
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)
    # The spider CLASS -- it has not been instantiated yet.
    self.spidercls = spidercls
    self.settings = settings.copy()
    # update_settings() merges the spider class's custom_settings into the
    # crawler's settings copy BEFORE the spider is instantiated. This is
    # why assigning custom_settings inside the spider's __init__ has no
    # effect: by then the merge has already happened.
    self.spidercls.update_settings(self.settings)
    # Only the settings whose values differ from the defaults, used for
    # the "Overridden settings" log line below.
    d = dict(overridden_settings(self.settings))
    logger.info("Overridden settings: %(settings)r", {'settings': d})
    self.signals = SignalManager(self)
    # STATS_CLASS default: 'scrapy.statscollectors.MemoryStatsCollector'.
    self.stats = load_object(self.settings['STATS_CLASS'])(self)
    # Counts emitted log records; LOG_LEVEL default is 'DEBUG'.
    handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)
    if get_scrapy_root_handler() is not None:
        # scrapy root handler already installed: update it with new settings
        install_scrapy_root_handler(self.settings)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)
    # LOG_FORMATTER default: 'scrapy.logformatter.LogFormatter'.
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    # Manager for the enabled extensions.
    self.extensions = ExtensionManager.from_crawler(self)
    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None
def __init__(self, **kwargs):
    """Configure the spider from a ``config=*.conf`` seed file and an
    optional startdate/enddate crawl window.

    NOTE(review): on a missing 'config' the error is only printed and
    execution continues, so ``kwargs['config']`` below raises KeyError.
    """
    if not 'config' in kwargs:
        err = 'failed to find seed file (config=*.conf)'
        print err
    # Crawl window: defaults to [now - 2 days, now].
    if 'startdate' in kwargs:
        self.startdate = kwargs['startdate']
    else:
        self.startdate = (datetime.datetime.now() - datetime.timedelta(days=2)).strftime('%Y-%m-%d %H:%M:%S')
    if 'enddate' in kwargs:
        self.enddate = kwargs['enddate']
    else:
        self.enddate = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # if not 'keywords' in kwargs:
    #     err = 'failed to find seed file (keywords=*.dat)'
    #     print err
    config = kwargs['config']
    self.load_conf(config)
    # Throttle requests according to the configured sleep flag.
    if self.Sleep_Flag == 'SEARCH_ENGINE_SLEEP' or self.Sleep_Flag == 'true' or not self.Sleep_Flag:
        settings.set('RANDOMIZE_DOWNLOAD_DELAY', True, priority='cmdline')
        settings.set('DOWNLOAD_DELAY', float(self.SE_Sleep_Base), priority='cmdline')
    else:
        settings.set('RANDOMIZE_DOWNLOAD_DELAY', False, priority='cmdline')
    # Log to "<conf name>.log".
    log_filename = self.conf_name.replace('.conf', '') + '.log'
    settings.set('LOG_FILE', log_filename, priority='cmdline')
    # initialise redis
    self.init_redis()
    self.redis_keyword = get_redis_key()
    # register signal handlers
    sig = SignalManager(dispatcher.Any)
    sig.connect(self.idle, signal=signals.spider_idle)
    sig.connect(self.close, signal=signals.spider_closed)
    self.metatime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # Local MySQL 'meta' database connection.
    self.conn_local = mysql.connect('meta', host='localhost')
    self.conn_local_cursor = self.conn_local.cursor()
    # self.conn_local_cursor.execute('set global autocommit=1')
    # Determine this host's IP; the network card name differs per host.
    try:
        self.meta_ip = get_meta_ip(network_card='enp7s0')
    except:
        self.meta_ip = get_meta_ip(network_card='eth0')
    # initialise the state in the meta database
    self.init_state()
def __init__(self, settings):
    """Pre-configuration crawler wiring: signals, stats, spider manager."""
    self.configured = False
    self.settings = settings
    self.signals = SignalManager(self)
    self.stats = load_object(settings['STATS_CLASS'])(self)
    manager_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
    self.spiders = manager_cls.from_crawler(self)
    self.scheduled = {}
def __init__(self, settings):
    """Pre-configuration crawler wiring: signals, stats, spider manager."""
    self.configured = False
    self.settings = settings
    self.signals = SignalManager(self)
    self.stats = load_object(settings['STATS_CLASS'])(self)
    # No start requests / spider until configure() runs.
    self._start_requests = lambda: ()
    self._spider = None
    # TODO: move SpiderManager to CrawlerProcess
    manager_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
    self.spiders = manager_cls.from_crawler(self)
def __init__(self, *args, **kwargs):
    """Extend splitOptions with the OS line separator and every printable
    non-alphabetic character except the single quote."""
    super(GeneralUrls, self).__init__(*args, **kwargs)
    SignalManager(dispatcher.Any).connect(self.spiderClosed,
                                          signals.spider_closed)
    self.splitOptions.append(os.linesep)
    for c in printable:
        # Idiom fix: `not c.isalpha()` instead of `c.isalpha() == False`.
        if not c.isalpha() and c != '\'':
            self.splitOptions.append(c)
def __init__(self, settings):
    """Pool of PhantomJS drivers bounded by PHANTOMJS_MAXRUN."""
    self.settings = settings
    # BUG FIX: the original line ended with a stray backslash
    # (`... {})\`), a line continuation that fused this statement with
    # the next one and made the module a SyntaxError.
    self.options = settings.get('PHANTOMJS_OPTIONS', {})
    max_run = settings.get('PHANTOMJS_MAXRUN', 5)
    self.sem = defer.DeferredSemaphore(
        max_run)  # as a means of limiting parallelism
    self.queue = queue.LifoQueue(
        max_run)  # last in first out; the content is drivers, not requests
    SignalManager(dispatcher.Any).connect(self._close,
                                          signal=signals.spider_closed)
def __init__(self, mongo_uri, cache=False):
    """Connect to the Mongo collection behind *mongo_uri* and set up
    per-event signals."""
    self.mongo_uri = mongo_uri
    _, _, _, _, self.col = motor_from_uri(mongo_uri)
    self.signal_manager = SignalManager()
    # Used for unsubscribe: disconnect() requires a reference to the
    # original callback object, so remember them here.
    self._callbacks = {}
    self.fetching = False
    self.signals = {name: object() for name in ('created', 'updated', 'deleted')}
    # XXX: cache is used in arachnado.cron and arachnado.site_checker.
    # Is it needed?
    self.cache_flag = cache
    self.cache = defaultdict(dict) if cache else None
def __init__(self, domain=None):
    # NOTE(review): hard-coded MySQL credentials ('root'/'admin') and host;
    # consider moving these to settings.
    self.con = mdb.connect('localhost', 'root', 'admin', 'huiben')
    self.cur = self.con.cursor()
    # Seed URL: Amazon.cn category listing, page 1.
    self.start_urls = [
        "http://www.amazon.cn/s?ie=UTF8&page=1&rh=n%3A658409051%2Cp_n_fulfilled_by_amazon%3A326314071"
    ]
    # Random per-run output file name (UUID1).
    filename = str(uuid.uuid1())
    print filename
    self.savefile = codecs.open(filename, 'w', 'utf-8')
    # Flush/close resources via closed_handler when the spider closes.
    SignalManager(dispatcher.Any).connect(
        self.closed_handler, signal=scrapy.signals.spider_closed)
def __init__(self, settings):
    """Size the Selenium worker pool from Scrapy's concurrency settings."""
    self.options = settings.get('SELENIUM_OPTIONS', {})
    self.domain_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
    self.ip_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_IP')
    # Per-IP limit wins when set; otherwise fall back to per-domain.
    if self.ip_concurrency:
        workers = self.ip_concurrency
    else:
        workers = self.domain_concurrency
    logging.info("Download workers: %s", workers)
    self.sem = defer.DeferredSemaphore(workers)
    self.queue = queue.LifoQueue(workers)
    SignalManager(dispatcher.Any).connect(self._close,
                                          signal=signals.spider_closed)
def __init__(self, spidercls, settings):
    """Minimal crawler wiring: signals, stats, log formatter, extensions."""
    self.spidercls = spidercls
    self.settings = settings
    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)
    formatter_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = formatter_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)
    # Populated later when crawling actually starts.
    self.crawling = False
    self.spider = None
    self.engine = None
def __init__(self, settings=None):
    """Connect job-tracking signals and start process monitoring, then
    initialise the underlying CrawlerProcess."""
    self.signals = SignalManager(self)
    self.signals.connect(self.on_spider_closed,
                         CrawlerProcessSignals.spider_closed)
    self._finished_jobs = []
    self._paused_jobs = set()
    self.procmon = ProcessStatsMonitor()
    self.procmon.start()
    super(ArachnadoCrawlerProcess, self).__init__(settings or {})
    # don't log DepthMiddleware messages
    # see https://github.com/scrapy/scrapy/issues/1308
    logging.getLogger("scrapy.spidermiddlewares.depth") \
        .setLevel(logging.INFO)
def __init__(self, spidercls, settings=None):
    """Wire up a crawler for *spidercls*, normalising *settings* to a
    frozen Settings instance with the spider's custom_settings merged."""
    if isinstance(spidercls, Spider):
        raise ValueError(
            "The spidercls argument must be a class, not an object")
    if settings is None or isinstance(settings, dict):
        settings = Settings(settings)
    self.spidercls = spidercls
    self.settings = settings.copy()
    self.spidercls.update_settings(self.settings)

    self.signals = SignalManager(self)
    self.stats = load_object(self.settings["STATS_CLASS"])(self)

    log_handler = LogCounterHandler(self, level=self.settings.get("LOG_LEVEL"))
    logging.root.addHandler(log_handler)

    overridden = dict(overridden_settings(self.settings))
    logger.info("Overridden settings:\n%(settings)s",
                {"settings": pprint.pformat(overridden)})

    if get_scrapy_root_handler() is not None:
        # scrapy root handler already installed: update it with new settings
        install_scrapy_root_handler(self.settings)
    # Keep the remover on the instance so it is not garbage collected
    # once __init__ returns (the signal connection alone is weak).
    self.__remove_handler = lambda: logging.root.removeHandler(log_handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)

    formatter_cls = load_object(self.settings["LOG_FORMATTER"])
    self.logformatter = formatter_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)

    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None
def __init__(self, spidercls, settings=None):
    """Build a crawler around *spidercls*: settings, signals, stats,
    logging and extensions."""
    if isinstance(spidercls, Spider):
        raise ValueError(
            'The spidercls argument must be a class, not an object')
    if settings is None or isinstance(settings, dict):
        settings = Settings(settings)
    self.spidercls = spidercls
    self.settings = settings.copy()
    self.spidercls.update_settings(self.settings)

    overridden = dict(overridden_settings(self.settings))
    logger.info("Overridden settings: %(settings)r", {'settings': overridden})

    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)

    counter_handler = LogCounterHandler(self,
                                        level=self.settings.get('LOG_LEVEL'))
    logging.root.addHandler(counter_handler)
    if get_scrapy_root_handler() is not None:
        # scrapy root handler already installed: update it with new settings
        install_scrapy_root_handler(self.settings)
    # Keep the remover on the instance so it is not garbage collected
    # after __init__ returns (the signal connection alone is weak).
    self.__remove_handler = lambda: logging.root.removeHandler(counter_handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)

    formatter_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = formatter_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)

    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None
class Crawler(object):
    """Ties a spider class together with its settings, signal manager,
    stats collector, extensions and execution engine for a single crawl.
    """

    def __init__(self, spidercls, settings=None):
        """Build the crawler for *spidercls*.

        :param spidercls: the spider *class* to crawl.
        :param settings: a ``Settings`` object, a plain ``dict``, or
            ``None``; dicts/None are wrapped into a ``Settings`` instance.
        """
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)
        self.spidercls = spidercls
        # Private copy so the spider's custom settings can be merged in
        # without mutating the caller's object.
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)
        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)
        # FIX: read LOG_LEVEL from self.settings (which includes the
        # spider's custom_settings applied by update_settings above), not
        # from the original `settings` object — otherwise a spider-level
        # LOG_LEVEL override was ignored by the log counter.
        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        # Every component has seen the settings; freeze them now.
        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        """Deprecated access to the spider loader (use
        ``CrawlerRunner.spider_loader`` instead)."""
        if not hasattr(self, '_spiders'):
            warnings.warn("Crawler.spiders is deprecated, use "
                          "CrawlerRunner.spider_loader or instantiate "
                          "scrapy.spiderloader.SpiderLoader with your "
                          "settings.",
                          category=ScrapyDeprecationWarning, stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        """Create the spider and engine and start crawling.

        Extra arguments are forwarded to the spider constructor.
        Resets ``self.crawling`` and re-raises on failure.
        """
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True
        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            self.crawling = False
            raise

    def _create_spider(self, *args, **kwargs):
        # Spiders are built via from_crawler so they get a crawler reference.
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        # The engine calls back into stop() when the spider is closed.
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        """Gracefully stop the engine if a crawl is in progress."""
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
def __init__(self):
    """Subscribe the pipeline's setup/teardown hooks to spider signals."""
    super(StoreToMongoDB, self).__init__()
    signal_manager = SignalManager()
    # initialize() fires on spider_opened, finalize() on spider_idle
    # (presumably MongoDB connect/flush — confirm against the handlers).
    hookups = (
        (self.initialize, scrapy.signals.spider_opened),
        (self.finalize, scrapy.signals.spider_idle),
    )
    for receiver, signal in hookups:
        signal_manager.connect(receiver, signal)
def __init__(self):
    """Register engine lifecycle callbacks; the connection starts unset."""
    # No connection yet; initialize() is expected to set it up once the
    # engine starts — confirm against the handler implementations.
    self.conn = None
    signal_manager = SignalManager(sender=dispatcher.Any)
    for receiver, signal in ((self.initialize, signals.engine_started),
                             (self.finalize, signals.engine_stopped)):
        signal_manager.connect(receiver, signal)
def __init__(self, interval=1.0):
    """Prepare the monitor; samples are emitted every *interval* seconds."""
    self.interval = interval
    self.signals = SignalManager(self)
    self._recent = {}
    # Monitor the currently running process.
    self.process = psutil.Process(os.getpid())
    # PeriodicCallback takes its period in milliseconds.
    self._task = PeriodicCallback(self._emit, self.interval * 1000)
class ArachnadoCrawlerProcess(CrawlerProcess):
    """
    CrawlerProcess which sets up a global signals manager,
    assigns unique ids to each spider job, works around some Scrapy
    issues and provides extra stats.
    """
    # Class-level counter: job ids are unique for the process lifetime.
    crawl_ids = itertools.count(start=1)

    def __init__(self, settings=None):
        self.signals = SignalManager(self)
        self.signals.connect(self.on_spider_closed,
                             CrawlerProcessSignals.spider_closed)
        self._finished_jobs = []
        self._paused_jobs = set()
        self.procmon = ProcessStatsMonitor()
        self.procmon.start()

        super(ArachnadoCrawlerProcess, self).__init__(settings or {})

        # don't log DepthMiddleware messages
        # see https://github.com/scrapy/scrapy/issues/1308
        logging.getLogger("scrapy.spidermiddlewares.depth").setLevel(
            logging.INFO)

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        """Start a crawl job, tagging it with a fresh ``crawl_id`` and
        forwarding all of the crawler's signals to the global manager."""
        kwargs["crawl_id"] = next(self.crawl_ids)

        crawler = crawler_or_spidercls
        if not isinstance(crawler_or_spidercls, Crawler):
            crawler = self._create_crawler(crawler_or_spidercls)

        # aggregate all crawler signals
        for name in SCRAPY_SIGNAL_NAMES:
            crawler.signals.connect(self._resend_signal,
                                    getattr(signals, name))

        # aggregate signals from crawler EventedStatsCollectors
        if hasattr(crawler.stats, "signals"):
            crawler.stats.signals.connect(self._resend_signal,
                                          stats.stats_changed)

        # FIX: pass the crawler we just wired up. Passing the original
        # spider class would make the parent build a second crawler, and
        # the signal connections above would be attached to a crawler
        # that never runs.
        d = super(ArachnadoCrawlerProcess, self).crawl(
            crawler, *args, **kwargs)
        return d

    def _create_crawler(self, spidercls):
        # Spider names are resolved through the loader; classes pass through.
        if isinstance(spidercls, six.string_types):
            spidercls = self.spider_loader.load(spidercls)
        return ArachnadoCrawler(spidercls, self.settings)

    def stop_job(self, crawl_id):
        """ Stop a single crawl job """
        self.get_crawler(crawl_id).stop()

    def pause_job(self, crawl_id):
        """ Pause a crawling job """
        self._paused_jobs.add(crawl_id)
        self.get_crawler(crawl_id).engine.pause()

    def resume_job(self, crawl_id):
        """ Resume a crawling job """
        self._paused_jobs.remove(crawl_id)
        self.get_crawler(crawl_id).engine.unpause()

    def get_crawler(self, crawl_id):
        """Return the crawler running job *crawl_id*.

        :raises KeyError: when no running job has that id.
        """
        for crawler in self.crawlers:
            # FIX: a crawler whose spider has not been created yet (or
            # which has no crawl_id) cannot match; skip it instead of
            # raising AttributeError on ``None``.
            if crawler.spider is None:
                continue
            if getattr(crawler.spider, "crawl_id", None) == crawl_id:
                return crawler
        raise KeyError("Job is not known: %s" % crawl_id)

    def _resend_signal(self, **kwargs):
        # FIXME: this is a mess. Signal handling should be unified somehow:
        # there shouldn't be two separate code paths
        # for CrawlerProcessSignals and STAT_SIGNALS.
        signal = kwargs["signal"]
        if signal in STAT_SIGNALS:
            signal = STAT_SIGNALS[signal]
            kwargs["crawler"] = kwargs.pop("sender").crawler
        else:
            signal = CrawlerProcessSignals.signal(signal)
            kwargs["crawler"] = kwargs.pop("sender")

        kwargs["signal"] = signal
        if signal.supports_defer:
            return self.signals.send_catch_log_deferred(**kwargs)
        else:
            return self.signals.send_catch_log(**kwargs)

    def stop(self):
        """ Terminate the process (exit from application). """
        self.procmon.stop()
        return super(ArachnadoCrawlerProcess, self).stop()

    def on_spider_closed(self, spider, reason):
        # spiders are closed not that often, insert(0,...) should be fine
        self._finished_jobs.insert(0, {
            "id": spider.crawl_id,
            "job_id": getattr(spider, "motor_job_id"),
            "seed": spider.domain,
            "status": reason,
            "stats": spider.crawler.stats.get_stats(spider),
            "downloads": self._downloader_stats(spider.crawler),
        })

    # FIXME: methods below are ugly for two reasons:
    # 1. they assume spiders have certain attributes;
    # 2. they try to get crawling status based on auxilary information.

    def get_jobs(self):
        """ Return a list of active jobs """
        crawlers = [crawler for crawler in self.crawlers
                    if crawler.spider is not None]
        return [{
            "id": crawler.spider.crawl_id,
            "job_id": getattr(crawler.spider, "motor_job_id"),
            "seed": crawler.spider.domain,
            "status": self._get_crawler_status(crawler),
            "stats": crawler.spider.crawler.stats.get_stats(crawler.spider),
            "downloads": self._downloader_stats(crawler)
            # 'engine_info': dict(get_engine_status(crawler.engine))
        } for crawler in crawlers]

    @classmethod
    def _downloader_stats(cls, crawler):
        # Snapshot of the downloader: in-flight requests plus per-slot info.
        downloader = crawler.engine.downloader
        return {
            "active": [cls._request_info(req) for req in downloader.active],
            "slots": sorted([
                cls._slot_info(key, slot)
                for key, slot in downloader.slots.items()
            ], key=operator.itemgetter("key")),
        }

    @classmethod
    def _request_info(cls, request):
        return {"url": request.url, "method": request.method}

    @classmethod
    def _slot_info(cls, key, slot):
        return {
            "key": key,
            "concurrency": slot.concurrency,
            "delay": slot.delay,
            "lastseen": slot.lastseen,
            "len(queue)": len(slot.queue),
            "transferring": [cls._request_info(req)
                             for req in slot.transferring],
            "active": [cls._request_info(req) for req in slot.active],
        }

    def _get_crawler_status(self, crawler):
        # Derived status, best-effort (see FIXME above).
        if crawler.spider is None:
            return "unknown"
        if not crawler.crawling:
            return "stopping"
        if int(crawler.spider.crawl_id) in self._paused_jobs:
            return "suspended"
        return "crawling"

    @property
    def jobs(self):
        """ Current crawl state """
        # filter out active jobs which are in fact finished
        finished_ids = {job["id"] for job in self._finished_jobs}
        active_jobs = [job for job in self.get_jobs()
                       if job["id"] not in finished_ids]
        return active_jobs + self._finished_jobs
class Crawler(object):
    """Ties a spider class together with its settings, signal manager,
    stats collector, extensions and execution engine for one crawl."""

    def __init__(self, spidercls, settings=None):
        """Build the crawler for *spidercls*.

        :param spidercls: the spider *class* to crawl.
        :param settings: a ``Settings`` object, a plain ``dict``, or
            ``None``; dicts/None are wrapped into a ``Settings`` instance.
        """
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)
        self.spidercls = spidercls
        # Private copy so the spider's custom settings can be merged in
        # without mutating the caller's object.
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)
        # Log which settings differ from the defaults, after the
        # spider's overrides have been applied.
        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings: %(settings)r", {'settings': d})
        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)
        # Count log records per level for this crawler's stats.
        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        # All components have seen the settings; freeze them now.
        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        """Deprecated access to the spider loader; emits a
        ScrapyDeprecationWarning on first use."""
        if not hasattr(self, '_spiders'):
            warnings.warn("Crawler.spiders is deprecated, use "
                          "CrawlerRunner.spider_loader or instantiate "
                          "scrapy.spiderloader.SpiderLoader with your "
                          "settings.",
                          category=ScrapyDeprecationWarning, stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        """Create the spider and engine and start crawling.

        Extra arguments go to the spider constructor. On failure the
        engine (if created) is closed and the exception is re-raised
        with its original traceback.
        """
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True

        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            # In Python 2 reraising an exception after yield discards
            # the original traceback (see https://bugs.python.org/issue7563),
            # so sys.exc_info() workaround is used.
            # This workaround also works in Python 3, but it is not needed,
            # and it is slower, so in Python 3 we use native `raise`.
            if six.PY2:
                exc_info = sys.exc_info()

            self.crawling = False
            if self.engine is not None:
                # Yielding here may run further reactor turns, which is
                # why exc_info was captured *before* the cleanup above.
                yield self.engine.close()

            if six.PY2:
                six.reraise(*exc_info)
            raise

    def _create_spider(self, *args, **kwargs):
        # Spiders are built via from_crawler so they get a crawler reference.
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        # The engine calls back into stop() when the spider is closed.
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        """Gracefully stop the engine if a crawl is in progress."""
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)