def switchPage(url, doubanUrl, userAgents, agent_index):
    """Check whether the next page still lists books of this tag.

    Returns the absolute URL of the next page when it exists and still
    contains books, otherwise an empty string.

    Parameters:
        url: current tag-listing page URL.
        doubanUrl: site root prepended to the relative "next page" link.
        userAgents: pool of proxy/user-agent entries to rotate through.
        agent_index: [current_agent, index] pair; rebound locally when the
            current proxy turns out to be useless.
    """

    def fetch(pageUrl):
        # Fetch pageUrl, switching to the next proxy each time the current
        # one returns an empty body; give up after len(userAgents) switches.
        # (The original duplicated this retry loop twice in this function.)
        nonlocal agent_index
        html = getHtml.getHtml(pageUrl, agent_index[0])
        for _ in range(len(userAgents)):
            if html != "":
                break
            agent_index = switchUserAgent.switchUserAgent(
                userAgents, agent_index)
            html = getHtml.getHtml(pageUrl, agent_index[0])
            print(str(666) + "...没用的代理")
        return html

    soup = BeautifulSoup(fetch(url), "lxml")
    # Roughly locate the "next page" anchor first, then narrow it down.
    newPage = re.findall(r'thispage(.*?)\<\/a\>', str(soup), re.S)
    if newPage:
        newPage = re.findall(r'href="(.*?)">', newPage[0], re.S)
    if not newPage or not re.findall(r'start(.*)T', newPage[0]):
        return ""
    # A next-page link exists: fetch it and check that it still lists books.
    newPage = doubanUrl + newPage[0]
    soup = BeautifulSoup(fetch(newPage), "lxml")
    hasBook = re.findall(r'thispage(.*)\<\/a\>', str(soup), re.S)
    return newPage if hasBook else ""
def getHotBooks(url, userAgents, agent_index):
    """Scrape the "hot books" panel of the page at `url`.

    Returns a list of detail records produced by getBookMessage, one per
    book (at most 8).
    """
    # Fetch the page, rotating proxies while the current one returns nothing.
    html = getHtml.getHtml(url, agent_index[0])
    agentNumber = len(userAgents)
    k = 0
    while True:
        if k == agentNumber:
            break
        if html == "":
            agent_index = switchUserAgent.switchUserAgent(
                userAgents, agent_index)
            html = getHtml.getHtml(url, agent_index[0])
            k = k + 1
            print(str(666) + "...没用的代理")
        else:
            break
    soup = BeautifulSoup(html, "lxml")
    # Extract link and title of each book (first 8 entries only).
    contents = soup.find_all(
        "ul", attrs={"class": "list-col list-col2 list-summary s"})[0]
    contents = contents.find_all("li", attrs={"class": ""})
    link_titles = []
    for number, content in enumerate(contents, start=1):
        fragment = re.findall(r'class\=\"\"\shref(.*?)a\>', str(content))
        link = re.findall(r'\=\"(.*?)\"\s', fragment[0])[0]
        title = re.findall(r'\"\>(.*?)\<', fragment[0])[0]
        link_titles.append([link, title])
        if number == 8:
            break
    # Resolve each link to the book's full details.
    hotBookMessages = []
    for link, title in link_titles:
        # BUG FIX: getBookMessage takes (bookLink, bookName, tag, userAgents,
        # agent_index); the original call omitted the tag argument and raised
        # TypeError.  The "hot" panel has no tag, so pass "".
        hotBook = getBookMessage(link, title, "", userAgents, agent_index)
        hotBookMessages.append(hotBook)
    return hotBookMessages
def getNewBooks(url, userAgents, agent_index):
    """Scrape the "newest books" carousel of the page at `url`.

    Returns a list of detail records produced by getBookMessage, one per
    book (at most 10).
    """
    # Fetch the page, rotating proxies while the current one returns nothing.
    html = getHtml.getHtml(url, agent_index[0])
    agentNumber = len(userAgents)
    k = 0
    while True:
        if k == agentNumber:
            break
        if html == "":
            agent_index = switchUserAgent.switchUserAgent(
                userAgents, agent_index)
            html = getHtml.getHtml(url, agent_index[0])
            k = k + 1
            print(str(666) + "...没用的代理")
        else:
            break
    soup = BeautifulSoup(html, "lxml")
    # Extract link and title of each book (first 10 entries only).
    contents = soup.find_all(
        "ul",
        attrs={"class": "list-col list-col5 list-express slide-item"})[0]
    contents = contents.find_all("li", attrs={"class": ""})
    link_titles = []
    for number, content in enumerate(contents, start=1):
        fragment = re.findall(r'href(.*?)\>', str(content))
        link = re.findall(r'\=\"(.*?)\"', fragment[0])
        title = re.findall(r'title\=\"(.*?)\"', fragment[0])
        link_titles.append([link[0], title[0]])
        if number == 10:
            break
    # Resolve each link to the book's full details.
    newBookMessages = []
    for link, title in link_titles:
        # BUG FIX: getBookMessage takes (bookLink, bookName, tag, userAgents,
        # agent_index); the original call omitted the tag argument and raised
        # TypeError.  The "new" carousel has no tag, so pass "".
        newBook = getBookMessage(link, title, "", userAgents, agent_index)
        newBookMessages.append(newBook)
    return newBookMessages
def getTags(url, userAgents, agent_index):
    """Collect the six top-level book categories and map each one to the
    list of sub-category labels listed under it."""
    poolSize = len(userAgents)
    html = getHtml.getHtml(url, agent_index[0])
    # Rotate proxies while the current one returns an empty page; stop
    # after every entry in the pool has been tried once.
    attempts = 0
    while attempts < poolSize and html == "":
        agent_index = switchUserAgent.switchUserAgent(
            userAgents, agent_index)
        html = getHtml.getHtml(url, agent_index[0])
        attempts = attempts + 1
        print(str(666) + "...没用的代理")
    soup = BeautifulSoup(html, "lxml")
    # Coarsely split the page into one chunk per top-level category.
    chunks = re.findall(r'style\=\"padding\-top\:10px(.*?)\<\/tbody\>',
                        str(soup), re.S)
    tags = {}
    for chunk in chunks:
        # The category name sits before the " ·" separator.
        category = re.findall(r'\"\>(.*?)\s\·', chunk, re.S)[0]
        # Each anchor fragment carries one sub-category label.
        fragments = re.findall(r'href(.*?)a\>\<b\>', chunk, re.S)
        matches = (re.findall(r'\"\>(.*?)\<\/', frag, re.S)
                   for frag in fragments)
        tags[category] = [m[0] for m in matches if m]
    return tags
userAgents = getUserAgents.getUserAgent()
# Start from pool entry 200, falling back to the last entry when the pool
# is smaller.  BUG FIX: the original indexed userAgents[200] unconditionally
# and raised IndexError for any pool of 200 entries or fewer.  (An empty
# pool still cannot work and will raise here.)
startIndex = min(200, len(userAgents) - 1)
agent_index = []
agent_index.append(userAgents[startIndex])
agent_index.append(startIndex)
agentNumber = len(userAgents)
# entry URL
url = "https://book.douban.com/tag/"
html = getHtml.getHtml(url, agent_index[0])
# Rotate proxies while the current one returns an empty page; give up after
# every entry in the pool has been tried once.
k = 0
while True:
    if k == agentNumber:
        break
    if html == "":
        agent_index = switchUserAgent.switchUserAgent(userAgents, agent_index)
        html = getHtml.getHtml(url, agent_index[0])
        k = k + 1
        print(str(666) + "...没用的代理")
    else:
        break
tags = getTagUrls.getTags(url, userAgents, agent_index)
urls = getTagUrls.getTagUrls(html, url, tags)
url = "https://book.douban.com"
# crawl state for the per-tag loop (presumably continued below this chunk)
#bookMessages=[]
pageNumber = 1
j = 0
def getBookMessage(bookLink, bookName, tag, userAgents, agent_index):
    """Fetch a book's detail page and collect its metadata.

    Returns a dict keyed by 书名 / 豆瓣链接 / 类别 / 作者 / 图片链接 /
    作者简介 / 内容简介 / 评分 / ISBN, or "" when any field could not be
    extracted (incomplete records are dropped by design).

    Parameters:
        bookLink: URL of the book's detail page.
        bookName: display title, stored verbatim.
        tag: category label, stored verbatim.
        userAgents: pool of proxy/user-agent entries to rotate through.
        agent_index: [current_agent, index] pair; rebound locally when the
            current proxy turns out to be useless.
    """
    bookMessage = {"书名": bookName, "豆瓣链接": bookLink, "类别": tag}
    flag = 1  # cleared as soon as any field is missing

    # Fetch the page, rotating proxies while the current one returns nothing.
    html = getHtml.getHtml(bookLink, agent_index[0])
    agentNumber = len(userAgents)
    k = 0
    while True:
        if k == agentNumber:
            break
        if html == "":
            agent_index = switchUserAgent.switchUserAgent(
                userAgents, agent_index)
            html = getHtml.getHtml(bookLink, agent_index[0])
            k = k + 1
            print(str(666) + "...没用的代理")
        else:
            break
    soup = BeautifulSoup(html, 'lxml')

    def extractIntro(nodes):
        # Pull the first <p>...</div> run out of `nodes`, join the
        # paragraphs, and strip every whitespace character.  Returns None
        # when any step finds nothing.  (Shared by the author-intro and
        # content-intro sections, which the original duplicated.)
        if not nodes:
            return None
        run = re.findall(r'\<p\>(.*?)\<\/div\>', str(nodes[0]), re.S)
        if not run:
            return None
        merged = "".join(run[0].split(r"</p> <p>"))
        body = re.findall(r'(.*?)\<\/p\>', merged)
        if not body:
            # BUG FIX: the original indexed [0] unconditionally here and
            # raised IndexError whenever the closing </p> was missing.
            return None
        return re.sub(r'\s*', "", body[0])

    # author: first anchor inside the #info block, whitespace collapsed
    author = None
    info = soup.find_all("div", attrs={"class": "", "id": "info"})
    if info:
        anchors = re.findall(r'href\=\"(.*?)a\>', str(info[0]), re.S)
        if anchors:
            texts = re.findall(r'\>(.*?)\<', anchors[0], re.S)
            if texts:
                author = "".join(part.strip()
                                 for part in texts[0].split("\n"))
    if author is None:
        author = ""
        flag = 0
    bookMessage["作者"] = author

    # cover image link
    picture = None
    mainpic = soup.find_all('div', attrs={"class": "", "id": "mainpic"})
    if mainpic:
        links = re.findall(r'href\=\"(.*?)\"', str(mainpic[0]), re.S)
        if links:
            picture = links[0].strip()
    if picture is None:
        picture = ''
        flag = 0
    bookMessage["图片链接"] = picture

    # author introduction
    authorIntro = extractIntro(
        soup.find_all("div", attrs={"class": "indent "}))
    if authorIntro is None:
        authorIntro = ''
        flag = 0
    else:
        authorIntro = authorIntro.strip()
    bookMessage["作者简介"] = authorIntro

    # content introduction
    contentIntro = extractIntro(
        soup.find_all("div", attrs={"class": "related_info"}))
    if contentIntro is None:
        contentIntro = ''
        flag = 0
    bookMessage["内容简介"] = contentIntro

    # rating
    score = None
    rating = soup.find_all('strong', attrs={
        'class': 'll rating_num ',
        'property': 'v:average'
    })
    if rating:
        values = re.findall(r"\>(.*?)\<\/", str(rating[0]), re.S)
        if values:
            score = values[0].strip()
    if score is None:
        score = ''
        flag = 0
    bookMessage["评分"] = score

    # ISBN
    isbn = re.findall(r'\<span\sclass\=\"pl\"\>ISBN\:\<\/span\>(.*?)\<br',
                      str(soup), re.S)
    if isbn:
        bookMessage["ISBN"] = isbn[0].strip()
    else:
        bookMessage["ISBN"] = ''
        flag = 0

    return bookMessage if flag == 1 else ""