Example No. 1
    def _parse_page_from_wx(self, **kwargs):
        '''
        Parse the info of a single article from the WX (WeChat) endpoint.
        :param kwargs:
        :return: a WellRecommendArticle object
        '''
        article_link = kwargs.get('article_link', '')
        article_info = kwargs.get('article_info', {}).get('data', {})
        article_likes = kwargs.get('article_likes', get_random_int_number())

        error_msg = 'failing article_url: {0}'.format(article_link)
        try:
            nick_name = article_info.get('user', {}).get('nickname', '')
            assert nick_name != '', 'Got an empty nick_name! Please check! ' + error_msg

            head_url = article_info.get('user', {}).get('images', '')
            assert head_url != '', 'Got an empty head_url! Please check! ' + error_msg

            profile = ''  # personal bio or signature (left empty)

            share_id = article_info.get('id', '')
            assert share_id != '', 'Got an empty share_id! Please check! ' + error_msg

            title = self.wash_sensitive_info(article_info.get('title', ''))  # title defaults to empty
            comment_content = self.wash_sensitive_info(
                article_info.get('desc', ''))
            assert comment_content != '', 'Got an empty comment_content! Please check! ' + error_msg

            share_img_url_list = [{  # for a video, the first image in the list is the video's first frame
                'img_url': item.get('original', ''),
                'height': item.get('height'),  # image height / width
                'width': item.get('width'),
            } for item in article_info.get('images_list', [])]
            assert share_img_url_list != [], 'Got an empty share_img_url_list! Please check! ' + error_msg

            div_body = ''  # left empty by default
            gather_url = article_link

            # original creation date of the source article
            tmp_create_time = article_info.get('time', '')
            assert tmp_create_time != '', 'Got an empty create_time! Please check!'
            create_time = string_to_datetime(tmp_create_time + ':00')

            site_id = 3  # Xiaohongshu
            goods_url_list = []  # goods URLs from this article still to be crawled
            share_goods_base_info = []

            # the WX endpoint does not return tags
            tags = self._get_tags_from_wx(article_info=article_info)

            # video playback URL
            tmp_video_url = article_info.get('video', '')
            tmp_video_url = re.compile(r'\?.*').sub('', tmp_video_url)  # strip the query string
            video_url = re.compile(r'//sa\.').sub(r'//v.', tmp_video_url)  # rewrite the host prefix sa. -> v.

            likes = article_likes
            collects = article_info.get('fav_count', None)
            assert collects is not None, 'Got collects as None! Please check! ' + error_msg

        except Exception:
            sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
            self.lg.error('Encountered an error:', exc_info=True)
            return {}

        article = WellRecommendArticle()
        article['nick_name'] = nick_name
        article['head_url'] = head_url
        article['profile'] = profile
        article['share_id'] = share_id
        article['title'] = title
        article['comment_content'] = comment_content
        article['share_img_url_list'] = share_img_url_list
        article['div_body'] = div_body
        article['gather_url'] = gather_url
        article['create_time'] = create_time
        article['site_id'] = site_id
        article['goods_url_list'] = goods_url_list
        article['tags'] = tags
        article['share_goods_base_info'] = share_goods_base_info
        article['video_url'] = video_url
        article['likes'] = likes
        article['collects'] = collects

        return article
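
The video_url cleanup in Example No. 1 first strips the query string and then rewrites the host prefix. Here is a minimal sketch of that transformation in isolation, using a made-up sample URL (the real host names come from the scraped payload):

import re

# hypothetical sample URL, for illustration only
sample = 'https://sa.example-cdn.com/video/abc123.mp4?sign=xyz&t=456'
no_query = re.sub(r'\?.*', '', sample)            # drop everything from '?' onward
video_url = re.sub(r'//sa\.', r'//v.', no_query)  # swap the 'sa.' host prefix for 'v.'
print(video_url)  # https://v.example-cdn.com/video/abc123.mp4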
Example No. 2
    async def _get_article(self, data, taobao_short_url):
        '''
        Get the required info of this article.
        :param data:
        :return:
        '''
        try:
            nick_name = data.get('data', {}).get('models', {}).get('account', {}).get('name', '')
            assert nick_name != '', 'Got an empty nick_name!'

            head_url = await self._get_head_url(data=data)

            # bio or signature of the recommender
            tmp_profile = data.get('data', {}).get('models', {}).get('account', {}).get('accountDesc', '')
            profile = tmp_profile if tmp_profile is not None else ''

            title = self._wash_sensitive_info(data.get('data', {}).get('models', {}).get('content', {}).get('title', ''))
            # self.my_lg.info(title)
            assert title != '', 'Got an empty title! Please check!'

            # the influencer's comment; can be used as the text on the recommendation home page
            comment_content = self._wash_sensitive_info(data.get('data', {}).get('models', {}).get('content', {}).get('summary', ''))

            '''WeiTao sniffed API: images and goods correspond one-to-one, in order'''
            tmp_goods_list = data.get('data', {}).get('models', {}).get('content', {}).get('drawerList', [])
            assert tmp_goods_list != [], 'Got an empty goods_id_list! Please check! The article may recommend no goods ([])!'

            share_img_url_list = [{'img_url': 'https:' + item.get('itemImages', [])[0].get('picUrl', '')} for item in tmp_goods_list]
            goods_id_list = [{'goods_id': item.get('itemId', '')} for item in tmp_goods_list]

            # WeiTao images and goods correspond one-to-one, so store them in a single field;
            # drop duplicate recommended goods (dedupe the list while keeping the original order)
            share_goods_base_info = list_duplicate_remove([{
                'img_url': 'https:' + item.get('itemImages', [])[0].get('picUrl', ''),
                'goods_id': item.get('itemId', ''),
            } for item in tmp_goods_list])

            # div_body
            div_body = self._wash_sensitive_info(await self._get_div_body(rich_text=data.get('data', {}).get('models', {}).get('content', {}).get('richText', [])))
            # print(div_body)

            # goods URLs still to be crawled, normalized to the Taobao format; a tmall URL gets redirected to Tmall by the browser
            goods_url_list = [{'goods_url': 'https://item.taobao.com/item.htm?id=' + item.get('goods_id', '')} for item in goods_id_list]

            _ = (await self._get_target_url_and_content_id_and_csid(taobao_short_url))
            gather_url = _[0]
            share_id = _[1]  # i.e. the content_id

            create_time = get_shanghai_time()

            site_id = 2  # Taobao WeiTao

            # tags: extra article URLs
            tags = await self._get_tags(data=data)
            # pprint(tags)

        except Exception as e:
            self.my_lg.error('Failing short URL: {0}'.format(taobao_short_url))
            self.my_lg.exception(e)
            return {}

        article = WellRecommendArticle()
        article['nick_name'] = nick_name
        article['head_url'] = head_url
        article['profile'] = profile
        article['share_id'] = share_id
        article['title'] = title
        article['comment_content'] = comment_content
        article['share_img_url_list'] = share_img_url_list
        article['goods_id_list'] = goods_id_list
        article['div_body'] = div_body
        article['gather_url'] = gather_url
        article['create_time'] = create_time
        article['site_id'] = site_id
        article['goods_url_list'] = goods_url_list
        article['tags'] = tags
        article['share_goods_base_info'] = share_goods_base_info

        return article
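
list_duplicate_remove is a project helper whose implementation is not shown in these examples; per the comment in Example No. 2 it removes duplicates from a list while preserving the original order. A minimal sketch of what such a helper might look like for a list of dicts (assumption: elements are keyed by their sorted items, since dicts themselves are unhashable):

def list_duplicate_remove(target_list):
    '''Order-preserving dedup; a sketch only, the real project helper may differ.'''
    seen = set()
    result = []
    for item in target_list:
        # dicts are unhashable, so key each element by a sorted tuple of its items
        key = tuple(sorted(item.items())) if isinstance(item, dict) else item
        if key not in seen:
            seen.add(key)
            result.append(item)
    return result

# e.g. two entries pointing at the same goods_id collapse into one
print(list_duplicate_remove([{'goods_id': '1'}, {'goods_id': '1'}, {'goods_id': '2'}]))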