def fetchItemList(self):
    """Parse ``self.mainHtml`` and populate ``self.itemList`` with articles.

    Walks the ``alllist > items_area`` container of the cached main page and
    builds one ``CnbetaArticle`` per ``.item`` entry: title text, absolute
    url, cover image ``src``, and the brief's inner HTML. Entries that fail
    to parse (ad slots, missing tags) are logged and skipped rather than
    aborting the whole run.
    """
    htmlSoup = BeautifulSoup(self.mainHtml, "html.parser")
    listDiv = htmlSoup.find(attrs={"class": "alllist"}).find(attrs={"class": "items_area"})
    itemRawList = listDiv.find_all(attrs={"class": "item"})
    for item in itemRawList:
        try:
            article = CnbetaArticle()
            titleTag = item.find(attrs={"class": "title"}).find("a")
            contentTag = item.find("span", attrs={"class": "newsinfo"}).find("p")
            article.title = "".join(titleTag.contents)
            # hrefs on the list page are site-relative; prefix the site root.
            article.url = self.targetMainUrl + titleTag["href"]
            article.cover = item.find("div", attrs={"class": "pic"}).find("a").find("img")["src"]
            article.brief = contentTag.renderContents().decode("utf-8")
            self.itemList.append(article)
        except (AttributeError, KeyError, TypeError) as e:
            # `find` returns None on a miss (-> AttributeError) and missing
            # attributes raise KeyError; log the skip instead of the original
            # silent bare `except Exception: pass`.
            logutil.log("CnbetaSpider", "skipped malformed item: %s" % e)
    logutil.log("CnbetaSpider", "getItemList finished")
def on_get_news_item():
    """HTTP handler: return a single stored article, looked up by its URL.

    Reads the ``url`` query parameter and loads the matching article
    (with full content) from the database, wrapped in a success response.
    """
    article_url = request.args.get("url")
    article = CnbetaArticle.load_from_db(article_url, True)
    return success(article)
def on_get_news_list():
    """HTTP handler: return every article currently stored in the database."""
    articles = CnbetaArticle.load_list_from_db()
    return success(articles)
# NOTE(review): this chunk originally began with a torn, duplicated copy of
# the tail of fetchItemList (chunk-overlap residue; the full method appears
# earlier in the file). The duplicate fragment was removed in reformatting.

def fetchItemDetail(self):
    """Fetch each listed article's page and attach its body HTML.

    For every ``CnbetaArticle`` in ``self.itemList``, downloads ``item.url``
    via ``self.scratch`` and stores the inner HTML of the page's ``.content``
    div on ``item.content``.
    """
    for item in self.itemList:
        html = self.scratch(item.url)
        htmlSoup = BeautifulSoup(html, "html.parser")
        content = htmlSoup.find("div", attrs={"class": "content"})
        item.content = content.renderContents().decode("utf-8")


if __name__ == '__main__':
    # Manual smoke test: run the spider end to end, then dump what was
    # persisted so the result can be eyeballed.
    spider = CnbetaSpider()
    spider.startSpider()
    article_list = CnbetaArticle.load_list_from_db()
    print(article_list)