Пример #1
0
def switchPage(url, doubanUrl, userAgents, agent_index):
    """Check whether the next result page still lists books of this tag.

    Fetches *url*, extracts the "next page" link from the pager, then probes
    that page.  Returns the absolute URL of the next page when it still
    contains books, otherwise "".
    """
    agentNumber = len(userAgents)

    def _fetch(pageUrl):
        # Fetch a page, rotating through proxies until one returns content
        # or all agentNumber candidates have been tried.  (The original had
        # this loop copy-pasted twice.)
        nonlocal agent_index
        html = getHtml.getHtml(pageUrl, agent_index[0])
        for _ in range(agentNumber):
            if html != "":
                break
            agent_index = switchUserAgent.switchUserAgent(
                userAgents, agent_index)
            html = getHtml.getHtml(pageUrl, agent_index[0])
            print(str(666) + "...没用的代理")
        return html

    soup = BeautifulSoup(_fetch(url), "lxml")

    # Roughly locate the pager block first, then narrow down to the href.
    newPage = re.findall(r'thispage(.*?)\<\/a\>', str(soup), re.S)
    if newPage:
        newPage = re.findall(r'href="(.*?)">', newPage[0], re.S)

    # No next-page link (or a malformed one) means we are on the last page.
    if not newPage or not re.findall(r'start(.*)T', newPage[0]):
        return ""

    # A next page exists: fetch it and check whether it still lists books.
    newPage = doubanUrl + newPage[0]
    soup = BeautifulSoup(_fetch(newPage), "lxml")
    if re.findall(r'thispage(.*)\<\/a\>', str(soup), re.S):
        return newPage
    return ""
Пример #2
0
def getHotBooks(url, userAgents, agent_index):
    """Scrape the "hot books" panel and return detail records for up to 8 books."""

    # Fetch the page, rotating proxies until one works or the pool is exhausted.
    html = getHtml.getHtml(url, agent_index[0])
    for _ in range(len(userAgents)):
        if html != "":
            break
        agent_index = switchUserAgent.switchUserAgent(userAgents, agent_index)
        html = getHtml.getHtml(url, agent_index[0])
        print(str(666) + "...没用的代理")
    soup = BeautifulSoup(html, "lxml")

    # Extract the link and title of each book entry (at most 8).
    contents = soup.find_all(
        "ul", attrs={"class": "list-col list-col2 list-summary s"})[0]
    contents = contents.find_all("li", attrs={"class": ""})
    link_titles = []
    for content in contents[:8]:
        anchor = re.findall(r'class\=\"\"\shref(.*?)a\>', str(content))
        link = re.findall(r'\=\"(.*?)\"\s', anchor[0])[0]
        title = re.findall(r'\"\>(.*?)\<', anchor[0])[0]
        link_titles.append([link, title])

    # Follow each link to collect the full record for the book.
    # (Removed an unused `number = 0` counter from the original.)
    hotBookMessages = [
        getBookMessage(link, title, userAgents, agent_index)
        for link, title in link_titles
    ]
    return hotBookMessages
Пример #3
0
def getNewBooks(url, userAgents, agent_index):
    """Scrape the "new books" carousel and return detail records for up to 10 books."""

    # Fetch the page, rotating proxies until one works or the pool is exhausted.
    html = getHtml.getHtml(url, agent_index[0])
    for _ in range(len(userAgents)):
        if html != "":
            break
        agent_index = switchUserAgent.switchUserAgent(userAgents, agent_index)
        html = getHtml.getHtml(url, agent_index[0])
        print(str(666) + "...没用的代理")
    soup = BeautifulSoup(html, "lxml")

    # Extract the link and title of each book entry (at most 10).
    contents = soup.find_all(
        "ul", attrs={"class": "list-col list-col5 list-express slide-item"})[0]
    contents = contents.find_all("li", attrs={"class": ""})
    link_titles = []
    for content in contents[:10]:
        anchor = re.findall(r'href(.*?)\>', str(content))
        link = re.findall(r'\=\"(.*?)\"', anchor[0])[0]
        title = re.findall(r'title\=\"(.*?)\"', anchor[0])[0]
        link_titles.append([link, title])

    # Follow each link to collect the full record for the book.
    newBookMessages = [
        getBookMessage(link, title, userAgents, agent_index)
        for link, title in link_titles
    ]
    return newBookMessages
Пример #4
0
def getTags(url, userAgents, agent_index):
    """Collect the six top-level book categories and map each to its sub-tags.

    Returns a dict mapping the category heading to a list of sub-tag labels.
    """
    # Fetch the page, rotating proxies until one works or the pool is exhausted.
    html = getHtml.getHtml(url, agent_index[0])
    for _ in range(len(userAgents)):
        if html != "":
            break
        agent_index = switchUserAgent.switchUserAgent(userAgents, agent_index)
        html = getHtml.getHtml(url, agent_index[0])
        print(str(666) + "...没用的代理")

    soup = BeautifulSoup(html, "lxml")

    # Roughly split the page into one chunk per top-level category.
    content = re.findall(r'style\=\"padding\-top\:10px(.*?)\<\/tbody\>',
                         str(soup), re.S)

    tags = {}
    for chunk in content:
        # Category heading, e.g. the text before the " ·" separator.
        tag = re.findall(r'\"\>(.*?)\s\·', chunk, re.S)[0]

        # One entry per sub-tag anchor within this category's chunk.
        anchors = re.findall(r'href(.*?)a\>\<b\>', chunk, re.S)

        # Keep only anchors whose label text could be extracted.
        labels = []
        for anchor in anchors:
            label = re.findall(r'\"\>(.*?)\<\/', anchor, re.S)
            if label:
                labels.append(label[0])

        tags[tag] = labels

    return tags
Пример #5
0
# --- crawler entry script ---
# Build the proxy/user-agent pool and pick a starting agent.
# NOTE(review): index 200 is hard-coded — assumes the pool has >200 entries;
# confirm against getUserAgents.getUserAgent().
userAgents = getUserAgents.getUserAgent()
agent_index = []
agent_index.append(userAgents[200])
agent_index.append(200)
agentNumber = len(userAgents)

# Seed URL: Douban's book-tag index page.
url = "https://book.douban.com/tag/"
html = getHtml.getHtml(url, agent_index[0])
# Check whether the current proxy works; if not, rotate to the next one
# (at most agentNumber attempts).
k = 0
while True:
    if k == agentNumber:
        break
    if html == "":
        agent_index = switchUserAgent.switchUserAgent(userAgents, agent_index)
        html = getHtml.getHtml(url, agent_index[0])
        k = k + 1
        print(str(666) + "...没用的代理")
    else:
        break

tags = getTagUrls.getTags(url, userAgents, agent_index)
urls = getTagUrls.getTagUrls(html, url, tags)

url = "https://book.douban.com"

# Counters for the per-tag crawl loop (the loop itself continues
# beyond this chunk).
pageNumber = 1
j = 0
Пример #6
0
def _first_paragraph(divs):
    """Return the cleaned first <p> paragraph of divs[0], or None if absent.

    Shared by the author-bio and content-summary extraction in
    getBookMessage, which were copy-pasted in the original.
    """
    if not divs:
        return None
    paragraphs = re.findall(r'\<p\>(.*?)\<\/div\>', str(divs[0]), re.S)
    if not paragraphs:
        return None
    # Glue the <p> fragments back together, then keep only the first one.
    context = "".join(paragraphs[0].split(r"</p> <p>"))
    first = re.findall(r'(.*?)\<\/p\>', context)
    if not first:
        # BUG FIX: the original indexed [0] unconditionally here and raised
        # IndexError when the text contained no closing </p>.
        return None
    # Strip all whitespace from the paragraph.
    return re.sub(r'\s*', "", first[0]).strip()


def getBookMessage(bookLink, bookName, tag, userAgents, agent_index):
    """Fetch a book's detail page and assemble its record.

    Returns a dict keyed by 书名/豆瓣链接/类别/作者/图片链接/作者简介/内容简介/评分/ISBN,
    or "" when any field could not be extracted (incomplete records are
    discarded, matching the original contract).
    """
    # Record of the book's fields.
    bookMessage = {}
    # 1 while every field was found; 0 as soon as one is missing.
    flag = 1

    # Carry over the caller-supplied name, link and category.
    bookMessage["书名"] = bookName
    bookMessage["豆瓣链接"] = bookLink
    bookMessage["类别"] = tag

    # Fetch the detail page, rotating proxies until one works or the
    # pool is exhausted.
    html = getHtml.getHtml(bookLink, agent_index[0])
    for _ in range(len(userAgents)):
        if html != "":
            break
        agent_index = switchUserAgent.switchUserAgent(userAgents, agent_index)
        html = getHtml.getHtml(bookLink, agent_index[0])
        print(str(666) + "...没用的代理")

    soup = BeautifulSoup(html, 'lxml')

    # --- author ----------------------------------------------------------
    info = soup.find_all("div", attrs={"class": "", "id": "info"})
    anchors = re.findall(r'href\=\"(.*?)a\>', str(info[0]), re.S) if info else []
    names = re.findall(r'\>(.*?)\<', anchors[0], re.S) if anchors else []
    if names:
        # The name may wrap over several lines; strip and rejoin the pieces.
        bookMessage["作者"] = "".join(part.strip() for part in names[0].split("\n"))
    else:
        bookMessage["作者"] = ""
        flag = 0

    # --- cover link ------------------------------------------------------
    mainpic = soup.find_all('div', attrs={"class": "", "id": "mainpic"})
    links = re.findall(r'href\=\"(.*?)\"', str(mainpic[0]), re.S) if mainpic else []
    if links:
        bookMessage["图片链接"] = links[0].strip()
    else:
        bookMessage["图片链接"] = ''
        flag = 0

    # --- author biography ------------------------------------------------
    authorIntro = _first_paragraph(
        soup.find_all("div", attrs={"class": "indent "}))
    if authorIntro is not None:
        bookMessage["作者简介"] = authorIntro
    else:
        bookMessage["作者简介"] = ''
        flag = 0

    # --- content summary -------------------------------------------------
    contextIntro = _first_paragraph(
        soup.find_all("div", attrs={"class": "related_info"}))
    if contextIntro is not None:
        bookMessage["内容简介"] = contextIntro
    else:
        bookMessage["内容简介"] = ''
        flag = 0

    # --- rating ----------------------------------------------------------
    strongs = soup.find_all('strong',
                            attrs={
                                'class': 'll rating_num ',
                                'property': 'v:average'
                            })
    scores = re.findall("\>(.*?)\<\/", str(strongs[0]), re.S) if strongs else []
    if scores:
        bookMessage["评分"] = scores[0].strip()
    else:
        bookMessage["评分"] = ''
        flag = 0

    # --- ISBN ------------------------------------------------------------
    isbn = re.findall(r'\<span\sclass\=\"pl\"\>ISBN\:\<\/span\>(.*?)\<br',
                      str(soup), re.S)
    if isbn:
        bookMessage["ISBN"] = isbn[0].strip()
    else:
        bookMessage["ISBN"] = ''
        flag = 0

    # Discard incomplete records.
    return bookMessage if flag == 1 else ""