def parse_item(self, response):
    """Extract the video URL and poster image from a page and save them.

    Reads the metadata dict attached to the originating request
    (``response.request.info``), adds ``videoUrl`` and ``img`` to it, wraps
    it together with the page HTML in a ``MissionBean`` and hands that bean
    to ``self.client.save``.

    Pages for which ``self.get_addr`` yields no video address (an empty
    result or ``None``) are skipped: nothing is added to the metadata and
    nothing is saved.

    Args:
        response: The downloaded page. It must expose ``request.info``,
            ``body``, ``text`` and ``url``.

    Returns:
        None.
    """
    info = response.request.info
    # NOTE(review): decode() assumes UTF-8, while the soup below is built
    # from response.text -- confirm both always describe the same text.
    html = response.body.decode()

    match = self.get_addr(html)
    # Guard clause: truthiness covers an empty result as well as None, so a
    # page without a video address is skipped instead of failing in len().
    if not match:
        return
    info['videoUrl'] = match[0]

    soup = BeautifulSoup(response.text, 'html.parser')
    # NOTE(review): this chain fails if the page has no <div id="poster">
    # or that div holds no <img>; decide whether such pages should be
    # skipped or saved without an image.
    info['img'] = soup.select_one('div[id="poster"]').select_one('img')['src']

    mission = MissionBean(response.url, 3, ['fishing_new'])
    mission.html = html
    mission.title = info['title']
    mission.info = info
    self.client.save(mission)
def parse_item(self, response):
    """Build the beans for one article page and pass them to the Mongo save step.

    The metadata dict carried on the request (``response.request.info``) is
    enriched with the prettified ``div[class="content"]`` markup, copied
    field by field into a ``NewsBean``, and the bean's attribute dict is
    attached to a ``MissionBean`` that is handed to
    ``daoFilterAndSave.MongoFilterSave``.

    Args:
        response: The downloaded article page. It must expose
            ``request.info``, ``text`` and ``url``.

    Returns:
        None.
    """
    info = response.request.info
    page_html = response.text

    soup = BeautifulSoup(page_html, "html.parser")
    info['content'] = soup.select_one('div[class="content"]').prettify()

    mission = MissionBean(response.url, 1001, ['qutoutiao'])
    mission.info = info  # replaced further down by the NewsBean field dict
    mission.html = page_html
    mission.title = info['title']

    # Assemble the production-format bean. The pairs are listed in the
    # order in which the attributes are assigned.
    news = NewsBean()
    news_type = info['type']
    field_values = (
        ('titleInfo', info['title']),
        ('content', info['content']),
        ('url', response.url),
        ('newsId', info['id']),
        ('tags', info['tag']),
        ('etc', {'news_type': news_type}),
        ('fromChannel', self.TYPE_DICT.get(int(news_type), '其他')),
        ('fromSpider', '推荐流'),
        ('fromType', 8),
        ('goodNum', int(info['like_num'])),
        ('commentNum', int(info['comment_count'])),
        ('readNum', int(info['read_count'])),
        ('mediaName', info['source_name']),
        ('mediaId', info['source_name']),
        ('introduction', info['introduction']),
        ('imgUrls', info['cover']),
        ('shareNum', info['share_count']),
    )
    for attr_name, attr_value in field_values:
        setattr(news, attr_name, attr_value)

    # news.__dict__ is the bean's live attribute dict; assuming MissionBean
    # keeps the reference, the two fields assigned below show up in
    # mission.info as well.
    mission.info = news.__dict__

    # publishDate and createTime: because of the Redis format issue,
    # TODO only timestamps can be passed.
    # publish_time is divided by 1000 before conversion (presumably it is
    # epoch milliseconds -- confirm against the feed).
    publish_raw = int(info['publish_time'])
    published_at = datetime.datetime.fromtimestamp(publish_raw / 1000)
    news.publishDate = published_at.timestamp()
    news.createTime = news.createTime.timestamp()

    daoFilterAndSave.MongoFilterSave(mission)