def init(self):
    """Initialize the detection module.

    Crawls the target URL with sinbot, then stores the normalized,
    de-duplicated set of discovered URLs on ``self.urlList``.
    """
    def get_url(requests):
        """Extract, normalize, and de-duplicate the URLs from the
        crawler's request list.

        A single trailing '/' is stripped so equivalent URLs compare
        equal.

        :param requests: iterable of request objects exposing a ``.url``
        :return: set of normalized URL strings
        """
        # NOTE: falsy urls (e.g. None/'') are kept as-is, matching the
        # original behavior of appending them unconditionally.
        return {
            item.url[:-1] if item.url and item.url.endswith('/') else item.url
            for item in requests
        }

    self.detectTM.setMaxThreads(10)  # number of tasks allowed to run concurrently
    from sinbot import sinbot_start  # crawler entry point
    from settings.settings import settings as st  # crawler-side settings object
    # Crawl depth limit: a value of 2 means 3 levels, counted from 0.
    st.set('DEPTH_LIMIT', settings.getint('DEPTH_LIMIT'))
    reqList = sinbot_start(self.url)  # crawl and collect request results
    self.urlList = get_url(reqList)   # keep only the normalized URL set
    logger.info('Detect modules complete initialization...')
def _initPool(self):
    """Lazily build the shared thread pool on first use; later calls are no-ops."""
    if self._initialized:
        return
    self._maxThreads = settings.getint('THREAD_MAX') or 5
    self._queueSize = settings.getint('QUEUE_SIZE') or 200
    self._threadPool = ThreadPool(self._queueSize, self._maxThreads)
    self._initialized = True
def _initPool(self):
    """Create the worker thread pool unless it has already been set up.

    NOTE(review): this definition appears twice in this chunk; the later
    one shadows the earlier if both live in the same class — confirm intent.
    """
    if not self._initialized:
        # Fall back to defaults when the settings values are missing/zero.
        worker_count = settings.getint('THREAD_MAX') or 5
        queue_capacity = settings.getint('QUEUE_SIZE') or 200
        self._maxThreads = worker_count
        self._queueSize = queue_capacity
        self._threadPool = ThreadPool(queue_capacity, worker_count)
        self._initialized = True