def checkBooks(libName, libUrl):
    """Update the chapters of every searched book that one library knows about.

    Reads ``NOVEL_LIB_PATH + libName + '.txt'``, which holds one ``name=index``
    entry per line, then walks ``SearchNovels``: books with a non-zero index
    are handed to ``checkChapters``; the others are logged as missing.
    ``checkNextUrl()`` is called once after all books have been processed.

    Args:
        libName: Library name; used as the index file stem and in log output.
        libUrl: URL template of the library, passed through to checkChapters.

    Raises:
        OSError: If the library index file cannot be opened.
    """
    Helper.print("Check " + libName + " chapter")

    # Load the library's books into a dict as name -> index.
    Novel_Book = {}
    # Read-only mode is sufficient here, and the context manager closes the
    # file even when parsing fails part-way through.
    with open(NOVEL_LIB_PATH + libName + '.txt', 'r', encoding='utf-8') as Lib:
        for line in Lib:
            # Split on the LAST '=' so a title that itself contains '=' is
            # kept intact (entries are written as name + "=" + index).
            name, separator, index = line.rstrip('\n').rpartition('=')
            if not separator:
                # Blank or malformed line: nothing to index.
                continue
            Novel_Book[name] = index

    # Check whether the library stores an index for each searched book.
    for bookName in SearchNovels:
        Helper.print("Update {} chapters".format(bookName))
        try:
            libIndex = int(Novel_Book.get(str(bookName), "0"))
        except ValueError:
            # A corrupt (non-numeric) index is treated like a missing entry.
            libIndex = 0

        if libIndex != 0:
            checkChapters(libName, libUrl, libIndex, bookName)
        else:
            Helper.print("Can't find {} in {} lib".format(bookName, libName))

        # Random pause between books.
        time.sleep(Helper.randomFloat())

    checkNextUrl()
def getChapterHtml(libName, libUrl, libIndex, chapterIdx):
    """Fetch and decode one chapter page of a book.

    The book URL is built from ``libUrl`` according to
    ``URL_LIMIT[libName]['count']``: with 2 the template receives
    ``floor(libIndex / 1000)`` and ``libIndex``; with 1 it receives only
    ``libIndex``. ``chapterIdx`` is appended to that URL and the request is
    attempted up to three times.

    Args:
        libName: Key into URL_LIMIT.
        libUrl: URL template; must contain a ``www.../`` part, which is used
            as the Host header.
        libIndex: Numeric index of the book inside the library.
        chapterIdx: String suffix appended to the book URL.

    Returns:
        ``Helper.decodeHtml(response)`` on success, or None when all attempts
        fail or the configured placeholder count is unsupported.
    """
    # Host header: the "www.../" part of the template without the slash.
    baseUrl = re.search("www(.*?)/", libUrl).group()
    baseUrl = baseUrl.replace("/", "")

    placeholderCount = URL_LIMIT[libName]['count']
    if placeholderCount == 2:
        url = libUrl.format(math.floor(libIndex / 1000), libIndex)
    elif placeholderCount == 1:
        url = libUrl.format(libIndex)
    else:
        # Previously `url` stayed unbound here; the resulting error was raised
        # inside the retry loop and swallowed, costing three useless retries.
        Helper.printError(string="unsupported url count {} for {}".format(placeholderCount, libName))
        return None

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        'Host': baseUrl,
        # 'Referer': url,
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': User_Agent,
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
    }

    # NOTE(review): verify=False presumably disables TLS certificate checks
    # (SESSION looks like a requests-style session) - confirm this is intended.
    for _ in range(3):
        try:
            response = SESSION.get(url + chapterIdx, headers=headers, params={}, verify=False, timeout=3)
        except Exception:
            # Best-effort fetch: log, pause, and try again.
            Helper.printError(string="request {} again".format(libIndex))
            time.sleep(Helper.randomFloat())
        else:
            # Decoding happens outside the try, so decode errors propagate.
            return Helper.decodeHtml(response)

    return None
def checkLib(libName, libUrl):
    """Extend a library's index file with books found by probing indices.

    Loads ``NOVEL_LIB_PATH + libName + '.txt'`` (``name=index`` per line) into
    an index -> name map, then probes indices from ``len(map) + 1`` up to
    ``Lib_Max_Count``. A newly seen name is appended to the file; the index
    is advanced on a later pass, once the fetched name matches the stored
    one. An index that fails ``Repeat_Max_Count`` times in a row is skipped.
    ``checkNextUrl()`` is called after the file has been closed.

    Args:
        libName: Library name; used as the index file stem and in log output.
        libUrl: URL template of the library, passed to getBookChapterHtml.

    Raises:
        OSError: If the library index file cannot be opened.
    """
    Helper.print("check " + libName + " lib")

    Novel_Lib = {}
    # The context manager guarantees the file is flushed and closed even if
    # the crawl is interrupted, so already-written entries are not lost.
    with open(NOVEL_LIB_PATH + libName + '.txt', 'r+', encoding='utf-8') as Lib:
        # readlines() consumes the file to EOF, so later writes append.
        for line in Lib.readlines():
            # Split on the LAST '=' so titles containing '=' parse correctly.
            name, separator, index = line.rstrip('\n').rpartition('=')
            if not separator:
                # Blank or malformed line: nothing to index.
                continue
            Novel_Lib[index] = name

        curIndex = len(Novel_Lib) + 1
        ErrorCount = 0
        while curIndex <= Lib_Max_Count:
            failed = False
            try:
                html = getBookChapterHtml(curIndex, libName, libUrl)
                novelName = Html.getBookName(html)
                if novelName:
                    if Novel_Lib.get(str(curIndex), "") != novelName:
                        # New or changed title: record it and keep curIndex,
                        # so the same index is fetched once more.
                        ErrorCount = 0
                        Novel_Lib[str(curIndex)] = novelName
                        Lib.write(novelName + "=" + str(curIndex) + "\n")
                        Helper.print("{} add {} {}".format(libName, curIndex, novelName))
                    else:
                        curIndex = curIndex + 1
                else:
                    Helper.printError(string="request {} error".format(curIndex))
                    failed = True
            except Exception:
                # Best-effort crawl: log and carry on, but count the failure
                # so a persistently failing index cannot loop forever.
                Helper.printError()
                failed = True

            if failed:
                ErrorCount = ErrorCount + 1
                if ErrorCount >= Repeat_Max_Count:
                    # Give up on this index and move to the next one.
                    curIndex = curIndex + 1
                    ErrorCount = 0

            # Random pause between requests.
            time.sleep(Helper.randomFloat())

    checkNextUrl()
def getBookChapterHtml(libIndex, libName, libUrl):
    """Fetch and decode the chapter-list page of one book.

    The URL is built from ``libUrl`` according to
    ``URL_LIMIT[libName]['count']``: with 2 the template receives
    ``floor(libIndex / 1000)`` and ``libIndex``; with 1 it receives only
    ``libIndex``. The request (including decoding) is attempted up to twice.

    Args:
        libIndex: Numeric index of the book inside the library.
        libName: Key into URL_LIMIT.
        libUrl: URL template; must contain a ``www.../`` part, which is used
            for the Host and Referer headers.

    Returns:
        ``Helper.decodeHtml(response)`` on success, or ``""`` when both
        attempts fail or the configured placeholder count is unsupported.
    """
    # Host header: the "www.../" part of the template without the slash.
    baseUrl = re.search("www(.*?)/", libUrl).group()
    baseUrl = baseUrl.replace("/", "")

    placeholderCount = URL_LIMIT[libName]['count']
    if placeholderCount == 2:
        url = libUrl.format(math.floor(libIndex / 1000), libIndex)
    elif placeholderCount == 1:
        url = libUrl.format(libIndex)
    else:
        # Previously `url` stayed unbound here; the resulting error was raised
        # inside the retry loop and swallowed, costing two useless retries.
        Helper.printError(string="unsupported url count {} for {}".format(placeholderCount, libName))
        return ""

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': baseUrl,
        'Referer': 'https://' + baseUrl,
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': User_Agent,
    }

    # NOTE(review): verify=False presumably disables TLS certificate checks
    # (SESSION looks like a requests-style session) - confirm this is intended.
    for _ in range(2):
        try:
            response = SESSION.get(url, headers=headers, params={}, verify=False, timeout=3)
            # Decoding stays inside the try so a decode failure is retried too.
            return Helper.decodeHtml(response)
        except Exception:
            # Log the failure (same message getChapterHtml uses) instead of
            # swallowing it silently, then pause before the next attempt.
            Helper.printError(string="request {} again".format(libIndex))
            time.sleep(Helper.randomFloat())

    return ""