def parse_page(self, response):
    # Extract the usable HTML fragment from the raw response body.
    validHtmlDoc = parsePage.getContent(response.body)
    if validHtmlDoc is None:
        return
    self.logger.debug(response.url)
    soup = BeautifulSoup(validHtmlDoc)
    # Each result card on the search page is one entry to store.
    persons = soup.find_all('div', attrs={"class": "WB_cardwrap S_bg2 clearfix"})
    for person in persons:
        item = ScrapyWeiboItem()
        item['html'] = str(person)
        item['keyword'] = response.meta['keyword']
        item['keywordId'] = response.meta['keywordId']
        item['downDate'] = response.meta['start']
        yield item
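# A minimal sketch of the ScrapyWeiboItem populated in parse_page above.
# The field names (html, keyword, keywordId, downDate) are taken from the
# assignments in parse_page; the project's actual items.py may define more
# fields. The import would normally sit at the top of items.py.
import scrapy

class ScrapyWeiboItem(scrapy.Item):
    html = scrapy.Field()       # raw HTML of one result card, to be parsed downstream
    keyword = scrapy.Field()    # search keyword this card was found under
    keywordId = scrapy.Field()  # internal id of the keyword
    downDate = scrapy.Field()   # start of the search time window passed via meta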
def parse_weibo(self, response):
    keyword = response.meta['keyword']
    keywordId = response.meta['keywordId']
    start = datetime.strptime(response.meta['start'], "%Y-%m-%d %H:%M:%S")
    end = datetime.strptime(response.meta['end'], "%Y-%m-%d %H:%M:%S")
    # open_in_browser(response)
    validHtmlDoc = parsePage.getContent(response.body)
    if validHtmlDoc is None:
        # TODO: an empty validHtmlDoc means the structure of weibo has changed,
        # or the account has been blocked; this response should be recorded.
        self.logger.warning(' ---> [%-30s] cannot find the feed list, maybe the structure of weibo has changed' % keyword)
        return
    soup = BeautifulSoup(validHtmlDoc)
    # There is no weibo about this topic in the time range.
    if not parsePage.isThereResult(soup):
        self.logger.warning(' ---> [%-30s] no result for this keyword in the time range' % keyword)
        return
    pageNode = soup.find('div', attrs={"node-type": "feed_list_page_morelist"})
    searchPage = SearchPage.wrap(pageNode)
    # Request the remaining result pages, then parse the first page directly.
    for i in range(len(searchPage)):
        url = searchPage[i]
        request = Request(url=url, callback=self.parse_page)
        request.meta['keyword'] = keyword
        request.meta['keywordId'] = keywordId
        request.meta['start'] = start
        request.meta['end'] = end
        yield request
    for item in self.parse_page(response):
        yield item
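# A hypothetical sketch of how a search request feeding parse_weibo could be
# built. The URL pattern and the helper name build_search_request are
# assumptions; only the meta keys (keyword, keywordId, start, end) and the
# "%Y-%m-%d %H:%M:%S" datetime format are taken from parse_weibo above.
def build_search_request(self, keyword, keywordId, start, end):
    url = 'http://s.weibo.com/weibo/%s' % keyword  # assumed search URL pattern
    request = Request(url=url, callback=self.parse_weibo)
    request.meta['keyword'] = keyword
    request.meta['keywordId'] = keywordId
    # parse_weibo parses these two with datetime.strptime, so they must be
    # strings such as "2014-01-01 00:00:00".
    request.meta['start'] = start
    request.meta['end'] = end
    return request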