def extract(self,
            html,
            title_xpath='',
            author_xpath='',
            publish_time_xpath='',
            host='',
            noise_node_list=None,
            with_body_html=False):
    """Extract title, author, publish time, text and images from a page.

    The HTML is pre-parsed and the caller-supplied noise nodes are removed
    first; every extractor then runs on that cleaned tree.

    Args:
        html: Page source handed to ``pre_parse``.
        title_xpath: Forwarded to ``TitleExtractor.extract``.
        author_xpath: Forwarded to ``AuthorExtractor.extractor``.
        publish_time_xpath: Forwarded to ``TimeExtractor.extractor``.
        host: Forwarded positionally to ``ContentExtractor.extract``.
        noise_node_list: Forwarded to ``remove_noise_node``.
        with_body_html: Forwarded positionally to
            ``ContentExtractor.extract``. When it is truthy, or when
            ``config['with_body_html']`` is truthy, the result also
            carries a ``body_html`` entry.

    Returns:
        A dict with the keys ``title``, ``author``, ``publish_time``,
        ``content`` and ``images``, plus ``body_html`` when requested.
    """
    tree = pre_parse(html)
    remove_noise_node(tree, noise_node_list)

    # The extractors run in this fixed order: content, title, time, author.
    candidates = ContentExtractor().extract(tree, host, with_body_html)
    title = TitleExtractor().extract(tree, title_xpath=title_xpath)
    publish_time = TimeExtractor().extractor(
        tree, publish_time_xpath=publish_time_xpath)
    author = AuthorExtractor().extractor(tree, author_xpath=author_xpath)

    # Only the first candidate is used; its second element is the mapping
    # holding the extracted fields (presumably the top-ranked node -- to be
    # confirmed against ContentExtractor).
    top = candidates[0][1]
    article = {
        'title': title,
        'author': author,
        'publish_time': publish_time,
        'content': top['text'],
        'images': top['images'],
    }

    # The body HTML can be requested per call or globally through config.
    wants_body_html = with_body_html or config.get('with_body_html', False)
    if wants_body_html:
        article['body_html'] = top['body_html']
    return article
    def extract(self,
                html,
                title_xpath='',
                author_xpath='',
                publish_time_xpath='',
                host='',
                body_xpath='',
                noise_node_list=None,
                with_body_html=False):
        """Extract title, author, publish time, text and images from a page.

        Title, author and publish time are read from the normalized but not
        yet pre-processed tree; the body content is read after
        pre-processing and noise removal.

        Args:
            html: Page source handed to ``normalize_text``.
            title_xpath: Forwarded to ``TitleExtractor.extract``.
            author_xpath: Forwarded to ``AuthorExtractor.extractor``.
            publish_time_xpath: Forwarded to ``TimeExtractor.extractor``.
            host: Forwarded to ``ContentExtractor.extract``.
            body_xpath: Forwarded to ``ContentExtractor.extract``.
            noise_node_list: Forwarded to ``remove_noise_node``.
            with_body_html: Forwarded to ``ContentExtractor.extract``. When
                it is truthy, or when ``config['with_body_html']`` is
                truthy, the result also carries a ``body_html`` entry.

        Returns:
            A dict with the keys ``title``, ``author``, ``publish_time``,
            ``content`` and ``images``, plus ``body_html`` when requested.
        """
        # Pre-processing the HTML may break its original structure, which
        # would make XPaths written against the raw HTML unusable. So the
        # fields that accept a user-supplied XPath (title, author, publish
        # time) are extracted first, and pre-processing happens afterwards.
        raw_tree = html2element(normalize_text(html))
        title = TitleExtractor().extract(raw_tree, title_xpath=title_xpath)
        publish_time = TimeExtractor().extractor(
            raw_tree, publish_time_xpath=publish_time_xpath)
        author = AuthorExtractor().extractor(
            raw_tree, author_xpath=author_xpath)

        clean_tree = pre_parse(raw_tree)
        remove_noise_node(clean_tree, noise_node_list)
        candidates = ContentExtractor().extract(
            clean_tree,
            host=host,
            with_body_html=with_body_html,
            body_xpath=body_xpath,
        )

        # Only the first candidate is used; its second element is the
        # mapping that holds the extracted fields.
        top = candidates[0][1]
        article = {
            'title': title,
            'author': author,
            'publish_time': publish_time,
            'content': top['text'],
            'images': top['images'],
        }

        # The body HTML can be requested per call or globally via config.
        wants_body_html = with_body_html or config.get('with_body_html', False)
        if wants_body_html:
            article['body_html'] = top['body_html']
        return article
示例#3
0
class GeneralNewsExtractor:
    """Facade that runs the content, title, author and time extractors."""

    def __init__(self):
        """Create one instance of each field extractor for reuse."""
        self.content_extractor = ContentExtractor()
        self.title_extractor = TitleExtractor()
        self.author_extractor = AuthorExtractor()
        self.time_extractor = TimeExtractor()

    def extract(self, html, title_xpath='', noise_node_list=None):
        """Extract title, author, publish time and body text from a page.

        Args:
            html: Page source handed to ``pre_parse``.
            title_xpath: Forwarded to the title extractor.
            noise_node_list: Forwarded to ``remove_noise_node``.

        Returns:
            A dict with the keys ``title``, ``author``, ``publish_time``
            and ``content``.
        """
        tree = pre_parse(html)
        remove_noise_node(tree, noise_node_list)

        # The extractors run in this fixed order: content, title, time,
        # author.
        candidates = self.content_extractor.extract(tree)
        title = self.title_extractor.extract(tree, title_xpath=title_xpath)
        publish_time = self.time_extractor.extractor(tree)
        author = self.author_extractor.extractor(tree)

        # Only the first candidate's info mapping supplies the body text.
        body_text = candidates[0][1]['text']
        article = {
            'title': title,
            'author': author,
            'publish_time': publish_time,
            'content': body_text,
        }
        return article
 def extract(self,
             html,
             title_xpath='',
             author_xpath='',
             publish_time_xpath='',
             host='',
             noise_node_list=None,
             with_body_html=False):
     """Extract title, author, publish time, text and images from a page.

     Args:
         html: Source code of the target page.
         title_xpath: XPath of the news title, for targeted title
             extraction.
         author_xpath: XPath of the article author, for targeted author
             extraction.
         publish_time_xpath: XPath of the publish time, for targeted
             publish-time extraction.
         host: Domain the images live on, e.g.
             ``https://www.kingname.info``. When a relative image link
             such as ``/images/123.png`` is extracted, the host is
             prepended, giving
             ``https://www.kingname.info/images/123.png``.
         noise_node_list: List of XPaths for tags that would interfere
             with extraction. The matching tags are removed during
             pre-processing so they cannot affect body extraction.
         with_body_html: When True, the result contains a ``body_html``
             field holding the HTML source of the tag that contains the
             news body. Defaults to False.

     Returns:
         A dict with the keys ``title``, ``author``, ``publish_time``,
         ``content`` and ``images``, plus ``body_html`` when requested
         per call or through ``config['with_body_html']``.
     """
     # Pre-parse the HTML (strips line breaks and useless nodes), then
     # remove the noise nodes specified by the caller.
     tree = pre_parse(html)
     remove_noise_node(tree, noise_node_list)

     # Run the extractors in order: body, title, publish time, author.
     candidates = ContentExtractor().extract(tree, host, with_body_html)
     title = TitleExtractor().extract(tree, title_xpath=title_xpath)
     publish_time = TimeExtractor().extractor(
         tree, publish_time_xpath=publish_time_xpath)
     author = AuthorExtractor().extractor(tree, author_xpath=author_xpath)

     # Assemble the result from the first candidate's info mapping.
     top = candidates[0][1]
     article = {
         'title': title,
         'author': author,
         'publish_time': publish_time,
         'content': top['text'],
         'images': top['images'],
     }

     # Add the body source when asked for per call or via global config.
     wants_body_html = with_body_html or config.get('with_body_html', False)
     if wants_body_html:
         article['body_html'] = top['body_html']
     return article
示例#5
0
 def __init__(self):
     """Create one instance of each field extractor and keep it on self.

     The extractors are constructed in the order content, title, author,
     time, and are stored so that later calls can reuse them.
     """
     self.content_extractor = ContentExtractor()
     self.title_extractor = TitleExtractor()
     self.author_extractor = AuthorExtractor()
     self.time_extractor = TimeExtractor()