def scratch(self, url):
    """Download ``url`` and return its body decoded as UTF-8 text.

    Bytes that are not valid UTF-8 are dropped (``errors="ignore"``)
    instead of raising.

    Args:
        url: Address to fetch; handed unchanged to
            ``urllib.request.Request``.

    Returns:
        The decoded response body, or ``None`` when the request, read or
        decode raised. In that case the exception is logged under the
        "CnbetaSpider" tag.
    """
    try:
        req = urllib.request.Request(url)
        # Close the response deterministically; without the context
        # manager the connection stays open until garbage collection.
        with urllib.request.urlopen(req) as response:
            result = response.read()
        return result.decode("utf-8", "ignore")
    except Exception as e:
        # Best-effort fetch: report the failure and signal it with None
        # rather than propagating to the caller.
        logutil.log("CnbetaSpider", e)
        return None
def fetchItemList(self):
    """Parse ``self.mainHtml`` and append the articles found to ``self.itemList``.

    Looks up the element with class "items_area" inside the element with
    class "alllist", then builds one ``CnbetaArticle`` for every
    descendant with class "item", filling in ``title``, ``url``
    (``self.targetMainUrl`` + the title link's ``href``), ``cover`` and
    ``brief``.

    An item whose markup does not match the expected structure is logged
    and skipped; the remaining items are still processed. A missing
    "alllist" / "items_area" container is not handled here, so the
    resulting exception propagates to the caller.

    Returns:
        None. Results are appended to ``self.itemList``.
    """
    htmlSoup = BeautifulSoup(self.mainHtml, "html.parser")
    listDiv = htmlSoup.find(attrs={"class": "alllist"}).find(
        attrs={"class": "items_area"}
    )
    itemRawList = listDiv.find_all(attrs={"class": "item"})

    for item in itemRawList:
        try:
            titleTag = item.find(attrs={"class": "title"}).find("a")
            contentTag = item.find("span", attrs={"class": "newsinfo"}).find("p")
            coverTag = item.find("div", attrs={"class": "pic"}).find("a").find("img")

            article = CnbetaArticle()
            article.title = "".join(titleTag.contents)
            article.url = self.targetMainUrl + titleTag["href"]
            article.cover = coverTag["src"]
            # Inner markup of the summary paragraph as text; same output as
            # the deprecated renderContents().decode("utf-8") round-trip.
            article.brief = contentTag.decode_contents()
        except Exception as e:
            # Best-effort scraping: one malformed item must not abort the
            # whole list, but the failure is reported instead of being
            # silently discarded.
            logutil.log("CnbetaSpider", e)
            continue
        self.itemList.append(article)

    logutil.log("CnbetaSpider", "getItemList finished")