def download(self, name_list, url_list):
    for i in range(0, len(name_list)):
        # Pause between requests so the crawler does not hammer the site.
        time.sleep(1.5)
        base_url = "http://www.sbkk88.com"
        file_name = name_list[i]
        url = url_list[i]
        options = (r'(<p>.*</p>)')
        options2 = (r'(.*</p>)<p>')
        html = requests.get(
            base_url + url, headers=sth.get_agent_pc()).content.decode("gbk")
        temp = (re.findall(options, html))
        print(file_name + ":" + base_url + url)
        if temp == []:
            # The article body on the site comes in two formats; if the first
            # pattern finds nothing, fall back to the second one.
            temp = (re.findall(options2, html))
        # Use a separate loop variable here so the outer chapter index i is
        # not overwritten.
        for j in range(0, len(temp)):
            lines = temp[j]
            lines = lines.replace('<u>一</u>', '')
            lines = lines.replace('<p>', '')
            lines = lines.replace('</p>', '\n')
            file_name = file_name.replace('\r\n', '')
            with open('./' + file_name, 'a') as f:
                f.write(lines)
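# Every snippet in this section calls sth.get_agent_pc() for request headers,
# but that helper is never shown here. The sketch below is only an assumption
# about its shape: a module named sth exposing get_agent_pc(), which returns a
# headers dict with a User-Agent picked at random from a small desktop pool.
import random

_PC_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
    "(KHTML, like Gecko) Version/14.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
]


def get_agent_pc():
    # Return a headers dict with one randomly chosen desktop User-Agent.
    return {"User-Agent": random.choice(_PC_AGENTS)}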
def get_books_list(book_url):
    # Collect the <li> tags on the current page that contain novels, then
    # return two parallel lists: the book names and their URLs.
    html = requests.get(book_url,
                        headers=sth.get_agent_pc()).content.decode('gbk')
    options = (r'<li>.*class="ablum".*</li>')
    each_book_list = re.findall(options, html)
    name_list1 = []
    url_list = []
    # print(each_book_list)
    for url in each_book_list:
        # Build a Fenxi object for each <li> tag, then call its get_name_url
        # method to get the book's name and link, appending each to its list.
        TEMP_chapter = Fenxi(url)
        name_list1.append(TEMP_chapter.get_name_url()[0])
        url_list.append(TEMP_chapter.get_name_url()[1])
    return (name_list1, url_list)
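# get_books_list above calls get_name_url() twice per entry. If that method
# does any non-trivial work, calling it once and unpacking the tuple is
# cheaper. A small hypothetical variant, assuming the same Fenxi API and the
# same sth.get_agent_pc() helper used above:
def get_books_list_once(book_url):
    html = requests.get(book_url,
                        headers=sth.get_agent_pc()).content.decode('gbk')
    each_book_list = re.findall(r'<li>.*class="ablum".*</li>', html)
    name_list1 = []
    url_list = []
    for li_tag in each_book_list:
        # One call to get_name_url(), then unpack both fields at once.
        name, link = Fenxi(li_tag).get_name_url()
        name_list1.append(name)
        url_list.append(link)
    return (name_list1, url_list)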
def download(self, name, url):
    print("Start downloading %s" % name)
    base_url = "http://www.sbkk88.com"
    file_name = name
    if os.path.exists("./" + file_name):
        print(file_name + " already downloaded, skipping")
    else:
        options = (r'(<p>.*</p>)')
        options2 = (r'(.*</p>)<p>')
        options3 = (r'(.+)<br>')
        options4 = (r'(<br>)')
        html = requests.get(
            base_url + url, headers=sth.get_agent_pc()).content.decode("gbk")
        temp = (re.findall(options, html))
        # print(file_name + ":" + base_url + url)
        # options matches body text like:
        #   <p>“渔家女孩告诉我,他们中下阶层的百姓有个更妙的比喻:国王吃席,首相”</p>
        # options2 matches body text like:
        #   诸般买卖无商旅,各样生涯不见人。殿上君王归内院,阶前文武转衙门。</p><p>
        # options3 matches body text like:
        #   一局输赢料不真,香销茶尽尚逡巡。<br>
        # options4 matches a bare:
        #   <br>
        if temp == []:
            # One really annoying thing: the site serves the body text in at
            # least two formats, possibly more. Two have been found so far,
            # but some pages still slip through the crawler, so this part
            # needs more work. The site also drops <u>一</u> into the middle
            # of pages, and there may be other junk like it.
            temp = (re.findall(options2, html))
        for i in range(0, len(temp)):
            # Walk each matched line, strip the special markup and turn </p>
            # into a newline.
            lines = temp[i]
            lines = lines.replace('<u>一</u>', '')
            lines = lines.replace('<p>', '')
            lines = lines.replace('</p>', '\n')
            file_name = file_name.replace('\r\n', '')
            with open('./' + file_name, 'a') as f:
                f.write(lines)
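# options3 and options4 above describe the <br>-delimited page format, but the
# function only ever falls back to options2. A possible third fallback, not
# part of the original code, is sketched here: when neither <p> pattern
# matches, split the body on <br> instead and keep the text runs as lines.
import re


def _extract_br_lines(html):
    # Grab every text run that ends in a <br> tag (the options3 shape) and
    # drop the empty runs left behind by consecutive <br><br> tags.
    matches = re.findall(r'(.+?)<br>', html)
    return [line.strip() for line in matches if line.strip()]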
def get_page_url(self):
    name_list = []
    url_list = []
    # print(self.__url)
    html = requests.get(self.__url,
                        headers=sth.get_agent_pc()).content.decode("gbk")
    self.each_name_R = (r' <li> <a href=(.*).*</a> </li>')
    each_url_name = re.findall(self.each_name_R, html)
    each_url_name = each_url_name[0].replace('<li> <a href=', '\r\n')
    each_url_name = each_url_name.replace(
        ' class="articleTitle" target="_blank">', '')
    each_url_name = each_url_name.replace('</a> </li>', '')
    each_url_names = each_url_name.split('"')
    # print(len(each_url_name))
    for i in range(1, len(each_url_names)):
        if (i % 2) == 0:
            name_list.append(each_url_names[i])
        else:
            url_list.append(each_url_names[i])
    self.download(name_list, url_list)
def get_page_url(self):
    # Takes the page URL of a single book (held on the instance as self.__url).
    name_list = []
    url_list = []
    html = requests.get(self.__url,
                        headers=sth.get_agent_pc()).content.decode("gbk")
    self.each_name_R = (r' <li> <a href=(.*).*</a> </li>')
    each_url_name = re.findall(self.each_name_R, html)
    each_url_name = each_url_name[0].replace('<li> <a href=', '\r\n')
    each_url_name = each_url_name.replace(
        ' class="articleTitle" target="_blank">', '')
    each_url_name = each_url_name.replace('</a> </li>', '')
    each_url_names = each_url_name.split('"')
    # The long chain of replace calls rewrites each raw tag from the shape of
    # URL1 below into URL2, and the final split('"') then cuts it into fields
    # on the quote character:
    # URL1: <li> <a href="/mingzhu/gudaicn/shuihuchuan/41951.html" class="articleTitle" target="_blank">读后感——江湖好汉为何爱吃酱牛肉、不喜红烧肉之小考
    # URL2: "/mingzhu/gudaicn/shuihuchuan/41951.html"读后感——江湖好汉为何爱吃酱牛肉、不喜红烧肉之小考
    for i in range(1, len(each_url_names)):
        # Walk the fields: even indexes hold the chapter names, odd indexes
        # hold the URLs. Append each to its own list.
        if (i % 2) == 0:
            name_list.append(each_url_names[i])
        else:
            url_list.append(each_url_names[i])
    Multi = Pool(4)
    for i in range(0, len(name_list)):
        # Strip characters that are not allowed in file names, then hand the
        # chapter to download(), which pulls the text off the page and saves
        # it locally under that name.
        file_name = re.sub(r"[/\\:*?\"<>|]+", '', name_list[i])
        Multi.apply(self.download, args=(file_name, url_list[i],))
    # The site almost certainly has anti-crawler measures, so the crawler is
    # set to pause 1.5 seconds after every few pages before continuing, and
    # the User-Agent in the headers is picked at random, which makes it a
    # little less conspicuous.
    Multi.close()
    Multi.join()
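# Note on the pool above: Pool.apply() blocks until the submitted call has
# finished, so the loop in get_page_url still downloads one chapter at a time
# even though four worker processes exist. A hypothetical, method-shaped
# variant (meant to live on the same class) that lets up to four downloads run
# at once would submit with apply_async instead; it assumes the same re and
# multiprocessing.Pool imports as the surrounding code.
def get_page_url_parallel(self, name_list, url_list):
    Multi = Pool(4)
    pending = []
    for i in range(0, len(name_list)):
        file_name = re.sub(r"[/\\:*?\"<>|]+", '', name_list[i])
        # apply_async returns immediately; close()/join() below wait for the
        # outstanding downloads to finish.
        pending.append(Multi.apply_async(self.download,
                                         args=(file_name, url_list[i],)))
    Multi.close()
    Multi.join()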
def get_book_list(book_url):
    # Return a list of the <li> tags on the current page that contain novels.
    html = requests.get(book_url,
                        headers=sth.get_agent_pc()).content.decode('gbk')
    options = (r'<li>.*class="ablum".*</li>')
    return re.findall(options, html)
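# A hypothetical usage sketch, not code from the original project: feed a
# listing page to get_book_list, hand each raw <li> tag to a Fenxi object (the
# same way get_books_list does above), and print the book name and link it
# extracts. The listing URL below is made up for illustration; the imports are
# the ones the snippets in this section rely on, gathered here for
# completeness.
import os
import re
import time
import requests
from multiprocessing import Pool

if __name__ == '__main__':
    book_url = "http://www.sbkk88.com/mingzhu/gudaicn/"  # assumed listing page
    for li_tag in get_book_list(book_url):
        book = Fenxi(li_tag)              # Fenxi as used in get_books_list above
        name, link = book.get_name_url()  # book title and its relative URL
        print(name + " -> " + link)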