Example #1
    def fetchItemList(self):
        # Parse the cached front page HTML and build one CnbetaArticle per list entry.
        htmlSoup = BeautifulSoup(self.mainHtml, "html.parser")
        listDiv = htmlSoup.find(attrs={"class": "alllist"}).find(attrs={"class": "items_area"})
        itemRawList = listDiv.find_all(attrs={"class": "item"})
        for item in itemRawList:
            try:
                article = CnbetaArticle()
                titleTag = item.find(attrs={"class": "title"}).find("a")
                contentTag = item.find("span", attrs={"class": "newsinfo"}).find("p")

                article.title = "".join(titleTag.contents)
                article.url = self.targetMainUrl + titleTag["href"]
                article.cover = item.find("div", attrs={"class": "pic"}).find("a").find("img")['src']
                briefStr = contentTag.decode_contents()

                article.brief = briefStr
                self.itemList.append(article)
            except Exception:
                # Skip entries whose markup does not match the expected structure.
                pass

        logutil.log("CnbetaSpider", "getItemList finished")
Example #2
def on_get_news_item():
    # Return a single article, looked up in the database by the URL query parameter.
    url = request.args.get("url")
    return success(CnbetaArticle.load_from_db(url, True))
Example #3
def on_get_news_list():
    # Return the full list of stored articles from the database.
    return success(CnbetaArticle.load_list_from_db())
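
Examples #2 and #3 read query parameters from Flask's request object and wrap the result in a success() helper, neither of which is defined in the excerpts. Below is a minimal sketch of how such handlers might be wired up, assuming Flask, a JSON envelope for success(), and load_* methods that return JSON-serialisable data; the route paths, the success() body and the CnbetaArticle import path are all assumptions.

from flask import Flask, jsonify, request

# from model import CnbetaArticle   # import path is an assumption; adjust to the project layout

app = Flask(__name__)

def success(data):
    # Hypothetical stand-in for the success() helper used above: wrap the payload in a JSON envelope.
    return jsonify({"code": 0, "data": data})

@app.route("/news/item")           # route path is an assumption
def on_get_news_item():
    url = request.args.get("url")
    return success(CnbetaArticle.load_from_db(url, True))

@app.route("/news/list")           # route path is an assumption
def on_get_news_list():
    return success(CnbetaArticle.load_list_from_db())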
Example #4
    def fetchItemList(self):
        # Parse the cached front page HTML and build one CnbetaArticle per list entry.
        htmlSoup = BeautifulSoup(self.mainHtml, "html.parser")
        listDiv = htmlSoup.find(attrs={"class": "alllist"}).find(attrs={"class": "items_area"})
        itemRawList = listDiv.find_all(attrs={"class": "item"})
        for item in itemRawList:
            try:
                article = CnbetaArticle()
                titleTag = item.find(attrs={"class": "title"}).find("a")
                contentTag = item.find("span", attrs={"class": "newsinfo"}).find("p")

                article.title = "".join(titleTag.contents)
                article.url = self.targetMainUrl + titleTag["href"]
                article.cover = item.find("div", attrs={"class": "pic"}).find("a").find("img")['src']
                briefStr = contentTag.decode_contents()

                article.brief = briefStr
                self.itemList.append(article)
            except Exception:
                # Skip entries whose markup does not match the expected structure.
                pass

        logutil.log("CnbetaSpider", "getItemList finished")

    def fetchItemDetail(self):
        # Download each article page and extract the full content block.
        for item in self.itemList:
            html = self.scratch(item.url)
            htmlSoup = BeautifulSoup(html, "html.parser")
            content = htmlSoup.find("div", attrs={"class": "content"})
            item.content = content.decode_contents()


if __name__ == '__main__':
    spider = CnbetaSpider()
    spider.startSpider()
    article_list = CnbetaArticle.load_list_from_db()
    print(article_list)
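
Example #4 also relies on a self.scratch(url) download helper that is not shown in the excerpt. Below is a minimal sketch of what such a helper might do, assuming a plain HTTP GET with requests; the real CnbetaSpider may add headers, retries or its own encoding handling.

import requests  # assumption: any HTTP client would do; the original helper is not shown

def scratch(url, timeout=10):
    # Fetch a page and return its decoded HTML text.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    response.encoding = response.apparent_encoding  # guard against pages that mis-declare their charset
    return response.text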