def _parse_page_from_wx(self, **kwargs):
    '''
    Parse the info of a single article returned by the wx endpoint.

    :param kwargs:
        article_link  -- URL of the article; stored as ``gather_url`` and
                         appended to validation error messages.
        article_info  -- response dict; the article is read from its
                         ``'data'`` key.
        article_likes -- like count; when not supplied, the value of
                         ``get_random_int_number()`` is used instead.
    :return: a WellRecommendArticle object, or ``{}`` when a required field
        is missing or any other exception is raised while parsing (the
        error is logged after sleeping ``self.CRAWL_ARTICLE_SLEEP_TIME``).
    '''
    def _require(condition, message):
        # Explicit raise instead of ``assert`` so that validation still
        # runs when Python is started with -O. AssertionError is kept so
        # the logged exception type is unchanged.
        if not condition:
            raise AssertionError(message)

    article_link = kwargs.get('article_link', '')
    article_info = kwargs.get('article_info', {}).get('data', {})
    # Only compute the random fallback when the caller gave no like count.
    if 'article_likes' in kwargs:
        article_likes = kwargs['article_likes']
    else:
        article_likes = get_random_int_number()
    error_msg = '出错article_url: {0}'.format(article_link)

    try:
        user = article_info.get('user', {})
        nick_name = user.get('nickname', '')
        _require(nick_name != '', '获取到的nick_name为空值!请检查!' + error_msg)
        head_url = user.get('images', '')
        _require(head_url != '', '获取到的head_url为空值!请检查!' + error_msg)
        # Personal profile / signature (left empty).
        profile = ''
        share_id = article_info.get('id', '')
        _require(share_id != '', '获取到的share_id为空值!请检查!' + error_msg)
        # The title may legitimately be empty.
        title = self.wash_sensitive_info(article_info.get('title', ''))
        comment_content = self.wash_sensitive_info(
            article_info.get('desc', ''))
        _require(
            comment_content != '',
            '获取到的comment_content为空!请检查!' + error_msg)

        # If the article is a video, the first image is the video's
        # first frame.
        share_img_url_list = [
            {
                'img_url': item.get('original', ''),
                # Image height and width.
                'height': item.get('height'),
                'width': item.get('width'),
            }
            for item in article_info.get('images_list', [])
        ]
        _require(
            share_img_url_list != [],
            '获取到的share_img_url_list为空list!请检查!' + error_msg)

        # Left empty by default.
        div_body = ''
        gather_url = article_link

        # Original creation date of the article.
        tmp_create_time = article_info.get('time', '')
        _require(
            tmp_create_time != '',
            '获取到的create_time为空值!请检查!' + error_msg)
        create_time = string_to_datetime(tmp_create_time + ':00')

        # Xiaohongshu.
        site_id = 3
        # Product URLs still to be crawled for this article.
        goods_url_list = []
        share_goods_base_info = []

        # The wx endpoint returns no tags.
        tags = self._get_tags_from_wx(article_info=article_info)

        # Video playback URL: drop the query string, then switch the
        # "//sa." host prefix to "//v.". A None value is treated as
        # "no video" instead of making the regex call raise.
        tmp_video_url = article_info.get('video') or ''
        tmp_video_url = re.sub(r'\?.*', '', tmp_video_url)
        video_url = re.sub(r'//sa\.', '//v.', tmp_video_url)

        likes = article_likes
        collects = article_info.get('fav_count', None)
        _require(
            collects is not None,
            '获取到的collects为None!请检查!' + error_msg)
    except Exception:
        sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
        self.lg.error('遇到错误:', exc_info=True)
        return {}

    article = WellRecommendArticle()
    article['nick_name'] = nick_name
    article['head_url'] = head_url
    article['profile'] = profile
    article['share_id'] = share_id
    article['title'] = title
    article['comment_content'] = comment_content
    article['share_img_url_list'] = share_img_url_list
    article['div_body'] = div_body
    article['gather_url'] = gather_url
    article['create_time'] = create_time
    article['site_id'] = site_id
    article['goods_url_list'] = goods_url_list
    article['tags'] = tags
    article['share_goods_base_info'] = share_goods_base_info
    article['video_url'] = video_url
    article['likes'] = likes
    article['collects'] = collects

    return article
async def _get_article(self, data, taobao_short_url):
    '''
    Extract the required information of the article.

    :param data: response dict; the article is read from
        ``data['data']['models']`` (``account`` and ``content`` sub-dicts).
    :param taobao_short_url: short link of the article; passed to
        ``self._get_target_url_and_content_id_and_csid`` and written to the
        log when parsing fails.
    :return: a WellRecommendArticle object, or ``{}`` when a required field
        is missing or any other exception is raised while parsing (the
        short link and the exception are logged).
    '''
    def _require(condition, message):
        # Explicit raise instead of ``assert`` so that validation still
        # runs when Python is started with -O. AssertionError is kept so
        # the logged exception type is unchanged.
        if not condition:
            raise AssertionError(message)

    try:
        models = data.get('data', {}).get('models', {})
        account = models.get('account', {})
        content = models.get('content', {})

        nick_name = account.get('name', '')
        _require(nick_name != '', '获取到的nick_name为空值!')

        head_url = await self._get_head_url(data=data)

        # Profile or personal signature of the recommender.
        tmp_profile = account.get('accountDesc', '')
        profile = tmp_profile if tmp_profile is not None else ''

        title = self._wash_sensitive_info(content.get('title', ''))
        _require(title != '', '获取到的title为空值!请检查!')

        # The influencer's comment; usable as the text shown on the
        # recommendation home page.
        comment_content = self._wash_sensitive_info(
            content.get('summary', ''))

        # Captured Weitao API: images and products correspond in order.
        tmp_goods_list = content.get('drawerList', [])
        _require(
            tmp_goods_list != [],
            '获取到的goods_id_list为空list! 请检查! 可能该文章推荐商品为空[]!')

        # Compute each (image URL, goods id) pair once and reuse it for
        # all three derived lists below.
        goods_pairs = [
            (
                'https:' + item.get('itemImages', [])[0].get('picUrl', ''),
                item.get('itemId', ''),
            )
            for item in tmp_goods_list
        ]
        share_img_url_list = [
            {'img_url': img_url} for img_url, _goods_id in goods_pairs
        ]
        goods_id_list = [
            {'goods_id': goods_id} for _img_url, goods_id in goods_pairs
        ]
        # Weitao images map one-to-one to products, so both are stored in
        # a single field; duplicated recommended products are removed via
        # list_duplicate_remove (per the original note, order is kept).
        share_goods_base_info = list_duplicate_remove([
            {'img_url': img_url, 'goods_id': goods_id}
            for img_url, goods_id in goods_pairs
        ])

        div_body = self._wash_sensitive_info(
            await self._get_div_body(rich_text=content.get('richText', [])))

        # Product URLs to crawl, always in Taobao format; per the original
        # note, a Tmall product is redirected to Tmall by the browser.
        goods_url_list = [
            {
                'goods_url': 'https://item.taobao.com/item.htm?id='
                             + item.get('goods_id', ''),
            }
            for item in goods_id_list
        ]

        target = await self._get_target_url_and_content_id_and_csid(
            taobao_short_url)
        gather_url = target[0]
        # share_id is the content_id.
        share_id = target[1]

        create_time = get_shanghai_time()
        # Taobao Weitao.
        site_id = 2

        # tags: additional article addresses.
        tags = await self._get_tags(data=data)
    except Exception as e:
        self.my_lg.error('出错短链接地址:{0}'.format(taobao_short_url))
        self.my_lg.exception(e)
        return {}

    article = WellRecommendArticle()
    article['nick_name'] = nick_name
    article['head_url'] = head_url
    article['profile'] = profile
    article['share_id'] = share_id
    article['title'] = title
    article['comment_content'] = comment_content
    article['share_img_url_list'] = share_img_url_list
    article['goods_id_list'] = goods_id_list
    article['div_body'] = div_body
    article['gather_url'] = gather_url
    article['create_time'] = create_time
    article['site_id'] = site_id
    article['goods_url_list'] = goods_url_list
    article['tags'] = tags
    article['share_goods_base_info'] = share_goods_base_info

    return article