def parse_content(self, response):
    """Build an item from an article page and yield it if it passes the keyword filter.

    The title is read from ``.biaoti h1 span font`` and falls back to the
    response URL when that node is missing. The body is the ``.zuo_nr``
    container with its ``biaoti`` element (if any) removed, re-serialized
    with ``prettify()`` and then passed through ``tools.reshape_content``.

    Args:
        response: Response object exposing ``css()`` and ``url``.

    Yields:
        At most one ``items.HedespiderItem`` with ``title``, ``content``,
        ``path`` and ``userid`` populated. The item is yielded when
        ``self.keywords`` is empty, or when any keyword occurs in
        ``str(item)``. Nothing is yielded when the page has no ``.zuo_nr``
        container.
    """
    import logging

    title = response.css('.biaoti h1 span font::text').get()
    if title is None:
        title = response.url
    # Per the original note: keep illegal characters out of the article title.
    title = tools.reshape_title(title)

    content = response.css('.zuo_nr').get()
    if content is None:
        # Without the content container there is nothing to parse; handing
        # None to BeautifulSoup would fail, so skip the page explicitly.
        logging.getLogger(__name__).warning(
            "No '.zuo_nr' container found at %s; skipping", response.url
        )
        return

    soup = bs(content, 'lxml')
    # find() returns None when the title block is absent; only detach it
    # from the tree when it is actually there.
    title_block = soup.find(class_='biaoti')
    if title_block is not None:
        title_block.extract()
    content = soup.prettify()
    # Per the original note: strip font formatting and images.
    content = tools.reshape_content(content)

    path = tools.reshape_path(self.name)

    item = items.HedespiderItem()
    item['title'] = title
    item['content'] = content
    item['path'] = path
    item['userid'] = self.userid

    if not self.keywords:
        # No filter configured: every parsed page is emitted.
        yield item
        return

    # The filter matches against the item's string form, as the original
    # did; compute it once rather than once per keyword.
    item_text = str(item)
    if any(keyword in item_text for keyword in self.keywords):
        yield item
def parse_content(self, response):
    """Turn an article page into an item and emit it when the keyword filter allows.

    Reads the title from ``.tit`` (using the response URL when that node is
    missing) and the body from ``.content``, runs both through the
    ``tools`` reshaping helpers, and fills an ``items.HedespiderItem``.

    Args:
        response: Response object exposing ``css()`` and ``url``.

    Yields:
        At most one item: always when ``self.keywords`` is empty, otherwise
        only when some keyword occurs in ``str(item)``.
    """
    raw_title = response.css('.tit::text').get()
    if raw_title is None:
        raw_title = response.url
    # Per the original note: keep illegal characters out of the article title.
    clean_title = tools.reshape_title(raw_title)

    raw_body = response.css('.content').get()
    # Per the original note: strip font formatting and images.
    clean_body = tools.reshape_content(raw_body)

    target_path = tools.reshape_path(self.name)

    item = items.HedespiderItem()
    field_values = (
        ('title', clean_title),
        ('content', clean_body),
        ('path', target_path),
        ('userid', self.userid),
    )
    for field_name, field_value in field_values:
        item[field_name] = field_value

    # Empty keyword list means "no filter"; otherwise a single match against
    # the item's string form is enough to emit it once.
    unfiltered = len(self.keywords) == 0
    if unfiltered or any(kw in str(item) for kw in self.keywords):
        yield item