Python WechatSogouStructuring.get_article_detail示例

编程语言: Python

命名空间/包名称: wechatsogou.structuring

方法/功能: get_article_detail

hotexamples.com的示例: 4

Python WechatSogouStructuring.get_article_detail - 已找到4个示例。以下是从开源项目中提取的、评价最高的 wechatsogou.structuring.WechatSogouStructuring.get_article_detail 真实 Python 用法示例。您可以为这些示例评分，以帮助我们提高示例质量。

常用方法

显示隐藏

get_article_by_search(5)

get_gzh_by_search(3)

get_gzh_info_and_article_by_history(3)

get_article_detail(2)

get_gzh_article_by_hot(2)

get_gzh_artilce_by_hot(2)

WechatSogouStructuring(1)

get_article_by_history_json(1)

get_article_by_search_wap(1)

get_gzh_info_by_history(1)

示例#1

显示文件

文件： test_structuring.py 项目： Chyroc/WechatSogou

    def test_get_article_detail(self):
        """Check ``get_article_detail`` against four saved article pages.

        Each fixture under ``fake_data_path`` is parsed with
        ``WechatSogouStructuring.get_article_detail``. The test asserts the
        length of ``content_img_list`` and that ``content_html`` no longer
        contains the ``data-wxurl`` / ``qqmusic`` / ``mpvoice`` markers listed
        for that fixture. For the first and last fixtures ``content_html`` is
        re-parsed to check that the tags with an ``http`` ``src`` carry no
        ``data-src`` attribute.
        """

        def parse_fixture(name):
            # Read one saved page and run it through the parser under test.
            with io.open(os.path.join(fake_data_path, name), encoding='utf-8') as f:
                return WechatSogouStructuring.get_article_detail(f.read())

        # --- 'backgroud-image' fixture (file name spelling is intentional) ---
        article_detail = parse_fixture('article_detail_backgroud-image.html')
        html = article_detail['content_html']
        assert_equal(len(article_detail['content_img_list']), 29, article_detail)
        assert_not_in('data-wxurl', html, html)
        assert_not_in('qqmusic', html, html)
        # The 29 <img> tags with an http src must not keep a data-src attribute.
        imgs = BeautifulSoup(html, 'lxml').find_all("img", src=re.compile(r'http'))
        assert_equal(len(imgs), 29, imgs)
        for img in imgs:
            assert_is_none(img.attrs.get('data-src'))

        # --- 'mpvoice' fixture ---
        article_detail = parse_fixture('article_detail_mpvoice.html')
        html = article_detail['content_html']
        assert_equal(len(article_detail['content_img_list']), 9, article_detail)
        assert_not_in('data-wxurl', html, html)
        assert_not_in('qqmusic', html, html)
        assert_not_in('mpvoice', html, html)

        # --- 'qqmusic' fixture ---
        article_detail = parse_fixture('article_detail_qqmusic.html')
        html = article_detail['content_html']
        assert_equal(len(article_detail['content_img_list']), 2, article_detail)
        assert_not_in('data-wxurl', html, html)
        assert_not_in('qqmusic', html, html)
        assert_not_in('mpvoice', html, html)

        # --- 'iframe' fixture ---
        article_detail = parse_fixture('article_detail_iframe.html')
        html = article_detail['content_html']
        assert_equal(len(article_detail['content_img_list']), 6, article_detail)
        assert_not_in('data-wxurl', html, html)
        assert_not_in('qqmusic', html, html)
        assert_not_in('mpvoice', html, html)
        # The single <iframe> with an http src must not keep a data-src attribute.
        iframes = BeautifulSoup(html, 'lxml').find_all("iframe", src=re.compile(r'http'))
        assert_equal(len(iframes), 1, iframes)
        for iframe in iframes:
            assert_is_none(iframe.attrs.get('data-src'))

示例#2

显示文件

文件： api.py 项目： yunsite/WechatSogou

    def get_article_content(self,
                            url,
                            del_qqmusic=True,
                            del_mpvoice=True,
                            unlock_callback=None,
                            identify_image_callback=None,
                            hosting_callback=None,
                            raw=False):
        """Fetch the body of an article so it outlives its temporary link.

        Parameters
        ----------
        url : str or unicode
            Link to the article (the temporary link).
        del_qqmusic : bool
            True: remove QQ Music embedded in the article.
            False: keep it.
        del_mpvoice : bool
            True: remove voice messages embedded in the article.
            False: keep them.
        unlock_callback : callable
            Invoked when a captcha appears while fetching the article
            detail; see unlock_callback_example.
        identify_image_callback : callable
            Captcha recogniser used while fetching the article detail:
            takes the captcha image as binary data and returns its text;
            see identify_image_callback_example.
        hosting_callback : callable
            Callback for re-hosting collected article images (e.g. on Qiniu
            or Aliyun): takes the original WeChat image URL and returns the
            hosted URL.
        raw : bool
            True: return the raw html.
            False: return the processed html.

        Returns
        -------
        When ``raw`` is true, the response text unchanged. Otherwise the
        value produced by ``WechatSogouStructuring.get_article_detail``
        (passed through the hosting step when ``hosting_callback`` is
        given), documented as providing:

        content_html
            Body of the article.
        content_img_list
            List of the images in the article.

        Raises
        ------
        WechatSogouException
            The response text contains the link-expired marker.
        WechatSogouRequestsException
            Not raised directly in this method; presumably propagated from
            the fetch helper -- confirm against its implementation.
        """

        response = self.__get_by_unlock(
            url,
            unlock_platform=self.__unlock_wechat,
            unlock_callback=unlock_callback,
            identify_image_callback=identify_image_callback)
        # Force the decoding used for the response text.
        response.encoding = 'utf-8'

        html = response.text
        # The expiry check runs before the raw shortcut, so an expired link
        # raises even when the caller asked for raw html.
        if '链接已过期' in html:
            message = 'get_article_content 链接 [{}] 已过期'.format(url)
            raise WechatSogouException(message)
        if raw:
            return html

        article = WechatSogouStructuring.get_article_detail(
            html, del_qqmusic=del_qqmusic, del_voice=del_mpvoice)
        if not hosting_callback:
            return article
        return self.__hosting_wechat_img(article, hosting_callback)

示例#3

显示文件

文件： api.py 项目： Chyroc/WechatSogou

    def get_article_content(self, url, del_qqmusic=True, del_mpvoice=True, unlock_callback=None,
                            identify_image_callback=None, hosting_callback=None, raw=False):
        """Retrieve an article's content before its temporary link stops working.

        Parameters
        ----------
        url : str or unicode
            Article link (temporary link).
        del_qqmusic : bool
            True: strip QQ Music inserted in the article; False: leave it in.
        del_mpvoice : bool
            True: strip voice messages inserted in the article; False: leave them in.
        unlock_callback : callable
            Function called when a captcha shows up while handling the article
            detail; see unlock_callback_example.
        identify_image_callback : callable
            Function that solves the captcha while handling the article detail:
            input is the captcha's binary data, output is the text;
            see identify_image_callback_example.
        hosting_callback : callable
            Callback used to host the collected article's images elsewhere
            (e.g. Qiniu or Aliyun): input is the original WeChat image URL,
            output is the hosted URL.
        raw : bool
            True: return the raw html; False: return the processed html.

        Returns
        -------
        The unmodified response text when ``raw`` is true. Otherwise the result
        of ``WechatSogouStructuring.get_article_detail``, run through the
        hosting step if ``hosting_callback`` is supplied, documented as holding:

        content_html
            The article content.
        content_img_list
            The images found in the article.

        Raises
        ------
        WechatSogouException
            The fetched page contains the link-expired marker.
        WechatSogouRequestsException
            Not raised by the code in this method; presumably comes from the
            request helper -- TODO confirm.
        """

        page = self.__get_by_unlock(url,
                                    unlock_platform=self.__unlock_wechat,
                                    unlock_callback=unlock_callback,
                                    identify_image_callback=identify_image_callback)
        page.encoding = 'utf-8'  # decode the body as UTF-8

        body = page.text
        link_expired = '链接已过期' in body
        if link_expired:
            # Checked ahead of ``raw`` so an expired link never yields html.
            raise WechatSogouException('get_article_content 链接 [{}] 已过期'.format(url))
        if raw:
            return body

        parsed = WechatSogouStructuring.get_article_detail(body, del_qqmusic=del_qqmusic,
                                                           del_voice=del_mpvoice)
        return self.__hosting_wechat_img(parsed, hosting_callback) if hosting_callback else parsed

示例#4

显示文件

文件： test_structuring.py 项目： ymf-930/Python

    def test_get_article_detail(self):
        """Run ``get_article_detail`` over four saved pages and check output.

        For every fixture the test asserts how many entries
        ``content_img_list`` has and that none of the listed markers
        (``data-wxurl``, ``qqmusic``, ``mpvoice``) appears in
        ``content_html``. The first and last fixtures additionally have
        their ``content_html`` re-parsed to verify that tags with an
        ``http`` ``src`` have no ``data-src`` attribute.
        """

        def check_fixture(fixture, expected_images, forbidden):
            # Parse one saved page, run the shared assertions and hand back
            # the cleaned html for any fixture-specific follow-up checks.
            path = os.path.join(fake_data_path, fixture)
            with io.open(path, encoding='utf-8') as f:
                text = f.read()
            detail = WechatSogouStructuring.get_article_detail(text)
            html = detail['content_html']
            assert_equal(len(detail['content_img_list']), expected_images,
                         detail)
            for marker in forbidden:
                assert_not_in(marker, html, html)
            return html

        def check_src_only(html, tag, expected_count):
            # Tags of this kind with an http src must not keep data-src.
            nodes = BeautifulSoup(html, 'lxml').find_all(
                tag, src=re.compile(r'http'))
            assert_equal(len(nodes), expected_count, nodes)
            for node in nodes:
                assert_is_none(node.attrs.get('data-src'))

        # 'mpvoice' is deliberately not checked for the first fixture.
        html = check_fixture('article_detail_backgroud-image.html', 29,
                             ('data-wxurl', 'qqmusic'))
        check_src_only(html, "img", 29)

        check_fixture('article_detail_mpvoice.html', 9,
                      ('data-wxurl', 'qqmusic', 'mpvoice'))

        check_fixture('article_detail_qqmusic.html', 2,
                      ('data-wxurl', 'qqmusic', 'mpvoice'))

        html = check_fixture('article_detail_iframe.html', 6,
                             ('data-wxurl', 'qqmusic', 'mpvoice'))
        # Here the src/data-src check applies to <iframe>, not <img>.
        check_src_only(html, "iframe", 1)