def get_story_urls(urls):
    """Scrape index pages for story ids and register new stories in MySQL.

    For each index page in *urls*, extracts up to ``config.DOWNLOADNUM``
    story ids (all of them when DOWNLOADNUM is falsy), then fetches each
    story page to pull out its title and download link.  Stories not yet
    in the database are inserted via ``db.inertStoryUrl``; known ones are
    only logged.

    Parameters
    ----------
    urls : iterable of str
        Index page URLs on m.xsqishu.com to scrape.

    Returns
    -------
    list of str
        The story ids (numeric strings) selected for download.
    """
    downloadnum = config.DOWNLOADNUM
    db = MySQL()
    story_urls = {}
    downloadstoryno = []

    # One shared session so keep_alive/retry settings actually apply
    # (the original created a session but issued requests.get(), bypassing it).
    requests.adapters.DEFAULT_RETRIES = 5
    session = requests.session()
    session.keep_alive = False

    for url in urls:
        res = None
        # Bounded retries: the original `while flag` loop retried forever
        # on a persistently failing URL.
        for _ in range(5):
            try:
                res = session.get(url, headers=user_Agent())
                break
            except Exception as e:
                logging.error(e)
        if res is None:
            logging.error("giving up on index page %s", url)
            continue

        all_ids = re.findall(r'<a href="/txt/(\d+)\.html">', res.text)
        # DOWNLOADNUM of False/0 means "take every story on the page".
        limit = downloadnum if downloadnum else len(all_ids)
        for story_id in all_ids[:limit]:
            story_urls[story_id] = "http://m.xsqishu.com/txt/" + story_id + ".html"
            downloadstoryno.append(story_id)

    for num, page_url in story_urls.items():
        res = None
        for _ in range(5):
            try:
                res = session.get(page_url, headers=user_Agent())
                res.encoding = "gbk"  # the site serves GBK-encoded pages
                break
            except Exception as e:
                logging.error(e)
        if res is None:
            logging.error("giving up on story page %s", page_url)
            continue

        book_ids = re.findall(r'<a href="/book/(.+)\.html" class="bdbtn greenBtn">', res.text)
        titles = re.findall(r'<h1 class="title">(.+)</h1>', res.text)
        if not book_ids or not titles:
            # Layout changed or an error page came back; skip instead of
            # crashing with IndexError as the original would.
            logging.error("could not parse story page %s", page_url)
            continue

        title = titles[0]
        download_url = "http://m.xsqishu.com/book/" + book_ids[0] + ".html"
        if db.isExistStory(num):
            msg = "小说---" + title + "---已入库"
            logging.info(msg)
        else:
            db.inertStoryUrl(num, title, download_url)

    return downloadstoryno
def get_story_urlsnew(url):
    """Scrape one index page and record every story found on it.

    Single-page variant of ``get_story_urls``: no download-count limit,
    and it returns the download URLs rather than the story ids.  Stories
    not yet in the database are inserted via ``db.inertStoryUrl``.

    Parameters
    ----------
    url : str
        Index page URL on m.xsqishu.com to scrape.

    Returns
    -------
    dict
        Mapping of story id -> download URL for every story parsed from
        the page (empty if the index page could not be fetched).
    """
    db = MySQL()
    story_urls = {}
    download_urls = {}

    # One shared session so keep_alive/retry settings actually apply
    # (the original created a session but issued requests.get(), bypassing it).
    requests.adapters.DEFAULT_RETRIES = 5
    session = requests.session()
    session.keep_alive = False

    res = None
    # Bounded retries: the original `while flag` loop retried forever
    # on a persistently failing URL.
    for _ in range(5):
        try:
            res = session.get(url, headers=user_Agent())
            break
        except Exception as e:
            logging.error(e)
    if res is None:
        logging.error("giving up on index page %s", url)
        return download_urls

    for story_id in re.findall(r'<a href="/txt/(\d+)\.html">', res.text):
        story_urls[story_id] = "http://m.xsqishu.com/txt/" + story_id + ".html"
    logging.info(story_urls)

    for num, page_url in story_urls.items():
        res = None
        for _ in range(5):
            try:
                res = session.get(page_url, headers=user_Agent())
                res.encoding = "gbk"  # the site serves GBK-encoded pages
                break
            except Exception as e:
                logging.error(e)
        if res is None:
            logging.error("giving up on story page %s", page_url)
            continue

        book_ids = re.findall(r'<a href="/book/(.+)\.html" class="bdbtn greenBtn">', res.text)
        logging.info("-----------")
        logging.info(book_ids)
        titles = re.findall(r'<h1 class="title">(.+)</h1>', res.text)
        if not book_ids or not titles:
            # Layout changed or an error page came back; skip instead of
            # crashing with IndexError as the original would.
            logging.error("could not parse story page %s", page_url)
            continue

        title = titles[0]
        download_url = "http://m.xsqishu.com/book/" + book_ids[0] + ".html"
        download_urls[num] = download_url
        if db.isExistStory(num):
            msg = "小说" + title + "已入库"
            logging.info(msg)
        else:
            db.inertStoryUrl(num, title, download_url)

    return download_urls