def __init__(self, name="Simpyder", gen_url=None, parse=None, save=None, config=SimpyderConfig()): # 配置Session,复用TCP连接 self.session = requests.session() self.session.mount('http://', HTTPAdapter(max_retries=3)) self.session.mount('https://', HTTPAdapter(max_retries=3)) # 载入配置 self.config = config # 载入主线程日志记录 self.logger = _get_logger("{} - 主线程".format(name), self.config.LOG_LEVEL) # 构造函数组装 self.assemble(gen_url, parse, save) self.QUEUE_LEN = self.config.PARSE_THREAD_NUMER * 2 self.url_queue = queue.Queue(self.QUEUE_LEN) self.item_queue = queue.Queue(self.QUEUE_LEN) self.except_queue = queue.Queue(1) self.queueLock = threading.Lock() self.threads = [] self.name = name self._saving = False
def run(self):
    self.__apply_config()
    print("""
=======================================================
   _____ _                           __
  / ___/(_)___ ___  ____  __  ______/ /__  _____
  \__ \/ / __ `__ \/ __ \/ / / / __  / _ \/ ___/
 ___/ / / / / / / / /_/ / /_/ / /_/ /  __/ /
/____/_/_/ /_/ /_/ .___/\__, /\__,_/\___/_/
                /_/    /____/  version: {}
=======================================================
""".format(__VERSION__))
    self.logger.critical("Simpyder ver.{}".format(__VERSION__))
    self.logger.critical("Starting crawl job")
    meta = {'link_count': 0, 'item_count': 0}
    start_time = datetime.datetime.now()
    meta['start_time'] = start_time
    self.meta = meta
    info_thread = threading.Thread(target=self.__get_info, name="status printer thread")
    info_thread.daemon = True
    info_thread.start()
    save_thread = threading.Thread(target=self.__run_save, name="item saver thread")
    save_thread.daemon = True
    save_thread.start()
    for i in range(self.PARSE_THREAD_NUMER):
        self.threads.append(
            self.ParseThread('{} - worker thread - No.{}'.format(self.name, i),
                             self.url_queue, self.queueLock, self.get_response,
                             self.parse, self.save, self.except_queue,
                             self.item_queue, meta))
    for each_thread in self.threads:
        each_thread.daemon = True
        each_thread.start()
    url_gener = self.gen_url()
    for each_url in url_gener:
        # Block until there is room; the original if/else advanced the
        # generator while sleeping, silently dropping a URL whenever the
        # queue happened to be full.
        while self.url_queue.full():
            sleep(0.1)
        self.url_queue.put(each_url)
    while not self.url_queue.empty():
        if not self.except_queue.empty():
            except_info = self.except_queue.get()
            self.logger = _get_logger(self.name)  # was self.NAME, which does not exist
            self.logger.error(except_info)
            break
        sleep(1)
    self.logger.critical("Crawl finished")
    self.logger.critical("Total items crawled: {}".format(meta["item_count"]))
    self.logger.critical("Total links crawled: {}".format(meta["link_count"]))
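# A minimal usage sketch for the threaded spider (illustrative; it assumes the
# assemble() contract implied above: gen_url yields URLs, parse maps a response
# to an item, save persists one item). The Spider class name and the
# example.com URLs are assumptions, not confirmed by this excerpt.
def gen_url():
    for i in range(3):
        yield "https://example.com/page/{}".format(i)

def parse(response):
    # Return anything; whatever this returns is queued for save().
    return len(response.text)

def save(item):
    print(item)

# spider = Spider(name="demo", gen_url=gen_url, parse=parse, save=save)
# spider.run()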
def __run_save(self):
    logger = _get_logger("{} - worker thread - SAVE".format(self.name), 'INFO')
    while True:
        if not self.item_queue.empty():
            self.save(self.item_queue.get())
            self.meta['item_count'] += 1
        else:
            sleep(0.1)
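# Design note: the loop above polls item_queue with empty() plus sleep(0.1).
# A blocking get() with a timeout is the more idiomatic consumer pattern and
# avoids the poll entirely. A hedged sketch, where q and save_fn stand in for
# the spider's queue and save callback:
import queue

def consume(q, save_fn):
    while True:
        try:
            item = q.get(timeout=1)  # block up to 1s instead of spinning
        except queue.Empty:
            continue  # nothing to save yet; a real worker could check a stop flag here
        save_fn(item)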
def __init__(self, name, url_queue, queueLock, get_response, parse, save,
             except_queue, item_queue, meta):
    threading.Thread.__init__(self, target=self.run)
    self.name = name
    self.url_queue = url_queue
    self.queueLock = queueLock
    self.get_response = get_response
    self.parse = parse
    self.save = save
    self.item_queue = item_queue
    self.except_queue = except_queue
    self.logger = _get_logger(self.name)
    self.meta = meta
def run(self):
    self.logger = _get_logger("{}".format(self.name), self.log_level)
    print("""\033[0;32m
   _____ _                Author: Jannchie   __
  / ___/(_)___ ___  ____  __  ______/ /__  _____
  \__ \/ / __ `__ \/ __ \/ / / / __  / _ \/ ___/
 ___/ / / / / / / / /_/ / /_/ / /_/ /  __/ /
/____/_/_/ /_/ /_/ .___/\__, /\__,_/\___/_/
                /_/    /____/  version: {}\033[0m
""".format(__VERSION__))
    self.logger.critical("user_agent: %s" % self.user_agent)
    self.logger.critical("concurrency: %s" % self.concurrency)
    self.logger.critical("interval: %s" % self.interval)
    self.proxy_gener = self.gen_proxy()
    self.loop = asyncio.get_event_loop()
    self.loop.run_until_complete(self._run())
    self.loop.close()
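# The three loop calls above (get_event_loop / run_until_complete / close) are
# the classic pre-3.7 asyncio driving pattern; on Python 3.7+ asyncio.run()
# performs the same create/run/close sequence in one call. A minimal sketch,
# where _run_demo is a stand-in coroutine, not the class's actual _run:
import asyncio

async def _run_demo():
    await asyncio.sleep(0.1)  # placeholder for the crawl coroutine
    return "done"

print(asyncio.run(_run_demo()))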
def __init__(self, gen_url=None, parse=None, save=None, config=None,
             name="Simpyder"):
    # Avoid a shared mutable default argument: build a fresh config per instance.
    if config is None:
        config = SimpyderConfig()
    self.logger = _get_logger("{} - main thread".format(name))
    self.assemble(gen_url, parse, save)
    self.config = config
    self.QUEUE_LEN = 1000
    self.url_queue = queue.Queue(self.QUEUE_LEN)
    self.item_queue = queue.Queue(self.QUEUE_LEN)
    self.except_queue = queue.Queue(1)
    self.queueLock = threading.Lock()
    self.threads = []
    self.name = name
def __run_save(self):
    logger = _get_logger("{} - worker thread - SAVE".format(self.name),
                         self.config.LOG_LEVEL)
    while True:
        if not self.item_queue.empty():
            try:
                item = self.item_queue.get()
                self._saving = True
                # parse() may return None/False for pages with nothing to keep.
                if item is None or item is False:
                    continue
                item = self.save(item)
            except Exception as e:
                self.logger.exception(e)
                continue  # do not count items whose save failed
            logger.debug(item)
            self.meta['item_count'] += 1
        else:
            self._saving = False
            # Exit once the main thread signals completion, so that
            # save_thread.join() in run() can actually return.
            if getattr(self, '_finish', False):
                break
            sleep(1)
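# Design note: the _finish flag above is one way to let save_thread.join()
# return. A common alternative is a sentinel object ("poison pill") that the
# producer enqueues after the last real item; a minimal sketch with
# illustrative names:
_STOP = object()

def save_worker(q, save_fn):
    while True:
        item = q.get()
        if item is _STOP:
            break  # producer signalled completion
        save_fn(item)

# q.put(_STOP)  # producer calls this once all real items are enqueued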
def __get_info(self):
    log = _get_logger("{} - worker thread - INFO".format(self.name), 'INFO')
    history = []
    interval = 5
    while True:
        c_time = datetime.datetime.now()
        history.append(
            (c_time, self.meta['link_count'], self.meta['item_count']))
        # Keep roughly one minute of samples (60 / interval == 12).
        if len(history) > 60 / interval:
            history = history[-12:]
        if (c_time - self.meta['start_time']
                ).total_seconds() % interval < 1 and len(history) > 1:
            delta_link = (history[-1][1] - history[0][1]) * 60 / \
                (history[-1][0] - history[0][0]).total_seconds()
            delta_item = (history[-1][2] - history[0][2]) * 60 / \
                (history[-1][0] - history[0][0]).total_seconds()
            log.info("Crawling link #{} ({}/min), {} items produced ({}/min)".format(
                self.meta['link_count'], int(delta_link),
                self.meta['item_count'], int(delta_item)))
        sleep(1)
def __get_info(self):
    log = _get_logger("{} - worker thread - INFO".format(self.name),
                      self.config.LOG_LEVEL)
    history = []
    interval = 5
    while True:
        c_time = datetime.datetime.now()
        history.append(
            (c_time, self.meta['link_count'], self.meta['item_count']))
        if len(history) > 60:
            history = history[-60:]
        if (c_time - self.meta['start_time']
                ).total_seconds() % interval < 1 and len(history) > 1:
            delta_link = (history[-interval + 1][1] - history[0][1]) * 60 / \
                ((history[-interval + 1][0] - history[0][0]).total_seconds() + 1)
            delta_item = (history[-interval + 1][2] - history[0][2]) * 60 / \
                ((history[-interval + 1][0] - history[0][0]).total_seconds() + 1)
            if self.config.DOWNLOAD_INTERVAL == 0:
                load = 100
            else:
                load = int(
                    (history[-1][1] - history[0][1]) * 60
                    / (history[-1][0] - history[0][0]).total_seconds()
                    / (60 / (self.config.DOWNLOAD_INTERVAL
                             / self.config.PARSE_THREAD_NUMER)) * 100)
            result = {
                'computer_name': socket.gethostname(),
                'spider_name': self.name,  # was self.start_time, a copy-paste slip
                'start_time': self.start_time,
                'update_time': datetime.datetime.now(),
                'load': load,
                'delta_link': delta_link,
                'delta_item': delta_item
            }  # trailing comma removed: it turned the dict into a 1-tuple
            log.info(
                "Crawling link #{} ({}/min, load {}%), {} items produced ({}/min)".format(
                    self.meta['link_count'], int(delta_link), load,
                    self.meta['item_count'], int(delta_item)))
        sleep(1)
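# The load figure above compares the measured link rate against the configured
# ceiling: with PARSE_THREAD_NUMER workers sharing a DOWNLOAD_INTERVAL delay,
# the expected throughput is 60 / (interval / threads) links per minute. A
# hedged sketch of that arithmetic with made-up numbers:
def load_percent(measured_per_min, download_interval, thread_number):
    if download_interval == 0:
        return 100  # no throttling configured; report full load
    expected_per_min = 60 / (download_interval / thread_number)
    return int(measured_per_min / expected_per_min * 100)

# e.g. 240 links/min measured with a 1s interval across 8 threads
# gives a 480/min ceiling: print(load_percent(240, 1, 8))  # -> 50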
def run(self):
    self.start_time = datetime.datetime.now()
    self._finish = False
    print("""
   _____ _                Author: Jannchie   __
  / ___/(_)___ ___  ____  __  ______/ /__  _____
  \__ \/ / __ `__ \/ __ \/ / / / __  / _ \/ ___/
 ___/ / / / / / / / /_/ / /_/ / /_/ /  __/ /
/____/_/_/ /_/ /_/ .___/\__, /\__,_/\___/_/
                /_/    /____/  version: {}
""".format(__VERSION__))
    self.__apply_config()
    self.logger.critical("Simpyder ver.{}".format(__VERSION__))
    self.logger.critical("Starting crawl job")
    meta = {
        'link_count': 0,
        'item_count': 0,
        'thread_number': self.config.PARSE_THREAD_NUMER,
        'download_interval': self.config.DOWNLOAD_INTERVAL
    }
    meta['start_time'] = self.start_time
    self.meta = meta
    info_thread = threading.Thread(target=self.__get_info, name="status printer thread")
    info_thread.daemon = True
    info_thread.start()
    save_thread = threading.Thread(target=self.__run_save, name="item saver thread")
    save_thread.daemon = True
    save_thread.start()
    for i in range(self.PARSE_THREAD_NUMER):
        self.threads.append(
            self.ParseThread('{} - worker thread - No.{}'.format(self.name, i),
                             self.url_queue, self.queueLock, self.get_response,
                             self.parse, self.save, self.except_queue,
                             self.item_queue, meta, self.config))
    for each_thread in self.threads:
        each_thread.daemon = True
        each_thread.start()
    url_gener = self.gen_url()
    for each_url in url_gener:
        # queue.Queue is already thread-safe, so the external lock the
        # original acquired and released around these calls was unnecessary,
        # and its locked() checks were racy (locked() reports any holder,
        # not the current thread).
        while self.url_queue.full():
            self.logger.debug("Queue full: {}".format(each_url))
            sleep(0.1)
        self.logger.debug("Enqueuing: {}".format(each_url))
        self.url_queue.put(each_url)
    self.logger.info("All requests enqueued; waiting for parse threads")
    while (not self.url_queue.empty() or not self.item_queue.empty()
           or self._saving):
        if not self.except_queue.empty():
            except_info = self.except_queue.get()
            self.logger = _get_logger(self.name, self.config.LOG_LEVEL)
            self.logger.error(except_info)
            break
        sleep(0.1)  # the original spun without sleeping, pegging a core
    self.logger.critical("All parsing done; waiting for the save thread")
    self._finish = True
    save_thread.join()
    self.logger.critical("Total items crawled: {}".format(meta["item_count"]))
    self.logger.critical("Total links crawled: {}".format(meta["link_count"]))