def __init__(self, thread_count=None):
    """
    Memory-queue based spider; distributed crawling is not supported.

    :param thread_count: number of worker threads; falls back to
        ``setting.SPIDER_THREAD_COUNT`` when not provided
    """
    super(AirSpider, self).__init__()

    # Overlay class-level custom settings onto the global settings module,
    # then reload the logger so it picks up any log-related overrides.
    for key, value in self.__class__.__custom_setting__.items():
        setattr(setting, key, value)
    log.reload()

    self._thread_count = thread_count or setting.SPIDER_THREAD_COUNT
    self._memory_db = MemoryDB()
    self._parser_controls = []
def __init__(
    self,
    redis_key=None,
    thread_count=None,
    begin_callback=None,
    end_callback=None,
    delete_keys=(),
    auto_stop_when_spider_done=None,
    auto_start_requests=None,
    send_run_time=True,
    batch_interval=0,
    wait_lock=True,
    task_table=None,
):
    """
    @summary: Scheduler
    ---------
    @param redis_key: redis folder under which the spider's requests and items are stored
    @param thread_count: thread count; defaults to the value from the settings file
    @param begin_callback: callback invoked when the spider starts
    @param end_callback: callback invoked when the spider finishes
    @param delete_keys: keys deleted on spider startup; tuple/bool/string, regex supported
    @param auto_stop_when_spider_done: whether to stop automatically once crawling is
        done, or keep waiting for tasks; defaults to stopping automatically
    @param auto_start_requests: whether the spider adds its start requests automatically
    @param send_run_time: whether to report the running time
    @param batch_interval: crawl interval in days, default 0. On repeated launches the
        spider only starts when the time elapsed since the first batch finished exceeds
        this interval
    @param wait_lock: whether to wait for the lock when distributing tasks. Without
        waiting, multiple processes may distribute identical tasks, so set this to True
        in distributed deployments
    @param task_table: task table, passed by batch spiders
    ---------
    @result:
    """
    super(Scheduler, self).__init__()

    # Overlay class-level custom settings onto the global settings module,
    # then reload the logger so it picks up any log-related overrides.
    for key, value in self.__class__.__custom_setting__.items():
        setattr(setting, key, value)
    log.reload()

    self._redis_key = redis_key or setting.REDIS_KEY
    if not self._redis_key:
        raise Exception(
            """
            redis_key 为redis中存放request与item的目录。不能为空,
            可在setting中配置,如 REDIS_KEY = 'test'
            或spider初始化时传参, 如 TestSpider(redis_key='test')
            """
        )

    # BUG FIX: downstream components previously received the raw ``redis_key``
    # argument, which is None when the key comes from setting.REDIS_KEY.
    # Use the resolved ``self._redis_key`` consistently from here on.
    self._request_buffer = RequestBuffer(self._redis_key)
    self._item_buffer = ItemBuffer(self._redis_key, task_table)
    self._collector = Collector(self._redis_key)

    self._parsers = []
    self._parser_controls = []
    self._parser_control_obj = PaserControl

    self._auto_stop_when_spider_done = (
        auto_stop_when_spider_done
        if auto_stop_when_spider_done is not None
        else setting.AUTO_STOP_WHEN_SPIDER_DONE
    )
    self._auto_start_requests = (
        auto_start_requests
        if auto_start_requests is not None
        else setting.SPIDER_AUTO_START_REQUESTS
    )
    self._send_run_time = send_run_time
    self._batch_interval = batch_interval

    self._begin_callback = (
        begin_callback
        if begin_callback
        else lambda: log.info("\n********** feapder begin **********")
    )
    self._end_callback = (
        end_callback
        if end_callback
        else lambda: log.info("\n********** feapder end **********")
    )

    self._thread_count = (
        setting.SPIDER_THREAD_COUNT if not thread_count else thread_count
    )

    self._spider_name = self._redis_key
    self._project_name = self._redis_key.split(":")[0]

    self._tab_spider_time = setting.TAB_SPIDER_TIME.format(
        redis_key=self._redis_key
    )
    self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(
        redis_key=self._redis_key
    )
    self._tab_requests = setting.TAB_REQUSETS.format(redis_key=self._redis_key)
    self._tab_failed_requests = setting.TAB_FAILED_REQUSETS.format(
        redis_key=self._redis_key
    )

    self._is_notify_end = False  # whether the end notification was already sent
    self._last_task_count = 0  # task count observed on the last check
    self._redisdb = RedisDB()

    self._project_total_state_table = "{}_total_state".format(self._project_name)
    self._is_exist_project_total_state_table = False

    # Request cache settings
    Request.cached_redis_key = self._redis_key
    Request.cached_expire_time = setting.RESPONSE_CACHED_EXPIRE_TIME

    delete_keys = delete_keys or setting.DELETE_KEYS
    if delete_keys:
        self.delete_tables(delete_keys)

    self._last_check_task_status_time = 0
    self.wait_lock = wait_lock