def fix_mode(self):
    """Fix mode: compare the temp folder contents against the novel
    catalog page, re-download only the missing chapters, then merge
    the novel if the download set is complete.

    :return: None
    """
    start_time = time.time()
    self.logger.info("------------------Fix Mode------------------")
    # Collect every chapter detail link from the catalog page.
    detail_urls = self._parse_catalog()
    # Only re-download chapters that are not already stored in temp/.
    redownload_urls = CommonTool.get_not_downloaded_chapters(detail_urls)
    self.logger.debug("redownload: " + str(redownload_urls))
    self.logger.info("Get novel chapters: " + str(len(redownload_urls)))
    self.all_chapter_num = len(redownload_urls)
    # Use threadpool to bound the number of concurrent downloads.
    requests = threadpool.makeRequests(self._get_detail, redownload_urls)
    # Plain loop: putRequest is called for its side effect, so a list
    # comprehension that builds a throwaway list is the wrong idiom.
    for req in requests:
        self.pool.putRequest(req)
    # Block until every chapter fetch has finished.
    self.pool.wait()
    self.logger.info("Checking download completeness...")
    if CommonTool.check_completion(detail_urls):
        # All chapters present: merge them into the final output file.
        self.logger.info("All chapters are downloaded successfully. Start merging ...")
        CommonTool.merge_all_chapters(self.output_name)
        self.logger.info("Merged. Enjoy reading!")
    else:
        self.logger.error("Some chapters download failed.")
        self.logger.error("Try: python novel_download.py -url URL -t THREAD_LIMIT --fix=true")
    self.logger.info("Total cost %.2fs" % (time.time() - start_time))
def _get_detail(self, detail_url):
    """Fetch one chapter page, parse it, and stash the result in the
    temp folder.  Failures are recorded in self.failed_set for retry.

    :param detail_url: chapter page URL
    :return: None
    """
    time.sleep(0.5)  # throttle so we don't hammer the server
    try:
        # this will raise FetchFailedException
        content = CommonTool.fetch_page(detail_url)
        # this will raise EmptyContentException
        result = self._check_parse_detail(content)
        # Use the last path segment of the chapter URL as the temp filename.
        filename = detail_url.split('/')[-1]
        # Stash the chapter in the temp folder.
        CommonTool.save_chapter(filename, result)
        # "with" guarantees the lock is released even if the counter
        # update raises; the original acquire()/release() pair would
        # leak the lock on an exception, deadlocking other workers.
        with self.lock:
            self.progress_cnt += 1
        self._print_progress()
    except FetchFailedException as e:
        self.failed_set.add(detail_url)
        self.logger.debug("Fetch failed: " + detail_url + ". " + str(e))
    except EmptyContentException:
        self.failed_set.add(detail_url)
        self.logger.debug("Empty content: " + detail_url)
def _parse_catalog(self):
    """Fetch self.catalog_url and collect every chapter detail link.

    :return: set of absolute detail-page URLs
    """
    result = CommonTool.fetch_page(self.catalog_url)
    doc = pq.PyQuery(result)
    # Two known catalog layouts share identical link-extraction logic,
    # so iterate over the selectors instead of copy-pasting the loop:
    #   pattern 1: https://www.kanunu8.com/book3/8257/
    #   pattern 2: https://www.kanunu8.com/book2/10946/index.html
    selectors = (
        'table:nth-child(2) > tbody > tr > td > a',
        'div.col-left > div > dl > dd > a',
    )
    # In-memory dedup of links that appear in both layouts.
    detail_urls = set()
    for selector in selectors:
        for a in doc(selector).items():
            detail_url = urllib.request.urljoin(self.catalog_url, a.attr.href)
            if detail_url in detail_urls:
                # Already collected.
                continue
            if self.HOST not in detail_url:
                # Off-site link; skip.
                continue
            detail_urls.add(detail_url)
    return detail_urls
def start(self):
    """Parse the catalog page, crawl every chapter into the temp folder,
    verify the download is complete, and merge into the final novel
    file.  If chapters are still missing after the retries, exit and
    suggest the --fix=true repair mode.

    :return: None
    """
    start_time = time.time()
    # Drop leftovers from any previous run first.
    CommonTool.clean_temp()
    # Collect every chapter detail link from the catalog page.
    detail_urls = self._parse_catalog()
    self.logger.info("Get novel chapters: " + str(len(detail_urls)))
    self.all_chapter_num = len(detail_urls)
    # Use threadpool to bound the number of concurrent downloads.
    requests = threadpool.makeRequests(self._get_detail, detail_urls)
    # Plain loop: putRequest is called for its side effect only.
    for req in requests:
        self.pool.putRequest(req)
    # Block until every chapter fetch has finished.
    self.pool.wait()
    retry_max = 3
    retry_cnt = 0
    # Retry failed chapters up to 3 times; --fix mode is the fallback
    # if the novel still cannot be downloaded completely.
    while (self.progress_cnt < self.all_chapter_num) and (retry_cnt < retry_max):
        retry_cnt += 1
        self.logger.info("Retry failed set. Len: " + str(len(self.failed_set)))
        # Swap out the failed set so this round's failures are tracked fresh.
        retry, self.failed_set = self.failed_set, set()
        requests = threadpool.makeRequests(self._get_detail, retry)
        for req in requests:
            self.pool.putRequest(req)
        # Wait for this retry round to finish.
        self.pool.wait()
    self.logger.info("Checking download completeness...")
    if CommonTool.check_completion(detail_urls):
        # All chapters present: merge them into the final output file.
        self.logger.info("All chapters are downloaded successfully. Start merging ...")
        CommonTool.merge_all_chapters(self.output_name)
        self.logger.info("Merged. Enjoy reading!")
    else:
        self.logger.error("Some chapters download failed.")
        # Hint aligned with fix_mode(): the actual CLI flag is --fix=true.
        self.logger.error("Try: python novel_download.py -url URL -t THREAD_LIMIT --fix=true")
    self.logger.info("Total cost %.2fs" % (time.time() - start_time))
def _parse_detail(content):
    """Extract the chapter title and body text from a detail page.

    :param content: raw HTML of the chapter page
    :return: (title, body text) tuple
    """
    page = pq.PyQuery(content)
    # Title lives in the bookname heading; normalize it via the shared helper.
    raw_title = page('#wrapper > div.content_read > div > div.bookname > h1').text()
    title = CommonTool.fix_title(raw_title)
    # Body text is everything under the #content node.
    body = page('#content').text()
    return title, body
def _parse_detail(content):
    """Extract the chapter title and body text from a detail page.

    :param content: raw HTML of the chapter page
    :return: (title, body text) tuple
    """
    page = pq.PyQuery(content)
    # Heading carries a "正文" (main text) prefix that is not part of
    # the title proper; strip it before normalizing.
    heading = page('#directs > div.bookInfo > h1 > strong').text()
    title = CommonTool.fix_title(heading.replace("正文", "").strip())
    # Body text sometimes contains leftover inline-script calls; remove
    # them in the same order the original chained .replace() did.
    body = page('#content').text()
    for junk in ('style6();', 'style5();'):
        body = body.replace(junk, '')
    return title, body
def _parse_catalog(self):
    """Fetch self.catalog_url and collect every chapter detail link.

    :return: set of absolute detail-page URLs
    """
    result = CommonTool.fetch_page(self.catalog_url)
    doc = pq.PyQuery(result)
    # In-memory dedup of chapter links.
    detail_urls = set()
    for a in doc('#list > dl > dd > a').items():
        # Normalize to an absolute URL BEFORE the dedup test: the set
        # stores absolute URLs, so testing the raw (possibly relative)
        # href against it could never match and the dedup was a no-op.
        detail_url = urllib.request.urljoin(self.HOST, a.attr.href)
        if detail_url in detail_urls:
            # Already collected.
            continue
        detail_urls.add(detail_url)
    return detail_urls
def _parse_catalog(self):
    """Fetch self.catalog_url and collect every chapter detail link.

    :return: set of detail page URLs (hrefs as found on the page)
    """
    html = CommonTool.fetch_page(self.catalog_url)
    doc = pq.PyQuery(html)
    # In-memory dedup of chapter links.
    detail_urls = set()
    anchors = doc(
        '#chapter > div.chapterSo > div.chapterNum > ul > div.clearfix.dirconone li > a'
    ).items()
    for anchor in anchors:
        href = anchor.attr.href
        # Keep only unseen links that belong to this site.
        if href not in detail_urls and self.HOST in href:
            detail_urls.add(href)
    return detail_urls