Exemplos de AuthorExtractor em Python, exemplos de gne.extractor.AuthorExtractor em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: __init__.py Projeto: yuanbao2004/GeneralNewsExtractor

 def extract(self,
             html,
             title_xpath='',
             author_xpath='',
             publish_time_xpath='',
             host='',
             noise_node_list=None,
             with_body_html=False):
     element = pre_parse(html)
     remove_noise_node(element, noise_node_list)
     content = ContentExtractor().extract(element, host, with_body_html)
     title = TitleExtractor().extract(element, title_xpath=title_xpath)
     publish_time = TimeExtractor().extractor(
         element, publish_time_xpath=publish_time_xpath)
     author = AuthorExtractor().extractor(element,
                                          author_xpath=author_xpath)
     result = {
         'title': title,
         'author': author,
         'publish_time': publish_time,
         'content': content[0][1]['text'],
         'images': content[0][1]['images']
     }
     if with_body_html or config.get('with_body_html', False):
         result['body_html'] = content[0][1]['body_html']
     return result

Exemplo n.º 2

0

Exibir arquivo

Arquivo: __init__.py Projeto: nareshpendyala/GeneralNewsExtractor-1

    def extract(self,
                html,
                title_xpath='',
                author_xpath='',
                publish_time_xpath='',
                host='',
                body_xpath='',
                noise_node_list=None,
                with_body_html=False):

        # 对 HTML 进行预处理可能会破坏 HTML 原有的结构，导致根据原始 HTML 编写的 XPath 不可用
        # 因此，如果指定了 title_xpath/author_xpath/publish_time_xpath，那么需要先提取再进行
        # 预处理
        normal_html = normalize_text(html)
        element = html2element(normal_html)
        title = TitleExtractor().extract(element, title_xpath=title_xpath)
        publish_time = TimeExtractor().extractor(element, publish_time_xpath=publish_time_xpath)
        author = AuthorExtractor().extractor(element, author_xpath=author_xpath)
        element = pre_parse(element)
        remove_noise_node(element, noise_node_list)
        content = ContentExtractor().extract(element,
                                             host=host,
                                             with_body_html=with_body_html,
                                             body_xpath=body_xpath)
        result = {'title': title,
                  'author': author,
                  'publish_time': publish_time,
                  'content': content[0][1]['text'],
                  'images': content[0][1]['images']
                  }
        if with_body_html or config.get('with_body_html', False):
            result['body_html'] = content[0][1]['body_html']
        return result

Exemplo n.º 3

0

Exibir arquivo

Arquivo: __init__.py Projeto: zshipu/GeneralNewsExtractor

class GeneralNewsExtractor:
    def __init__(self):
        self.content_extractor = ContentExtractor()
        self.title_extractor = TitleExtractor()
        self.author_extractor = AuthorExtractor()
        self.time_extractor = TimeExtractor()

    def extract(self, html, title_xpath='', noise_node_list=None):
        element = pre_parse(html)
        remove_noise_node(element, noise_node_list)
        content = self.content_extractor.extract(element)
        title = self.title_extractor.extract(element, title_xpath=title_xpath)
        publish_time = self.time_extractor.extractor(element)
        author = self.author_extractor.extractor(element)
        return {'title': title,
                'author': author,
                'publish_time': publish_time,
                'content': content[0][1]['text']}

Exemplo n.º 4

0

Exibir arquivo

Arquivo: __init__.py Projeto: Jokertion/Deconstruction-Source-Code

 def extract(
     self,
     html,  # 目标网站的源代码
     title_xpath='',  # 新闻标题的 XPath，用于定向提取标题
     author_xpath='',  # 文章作者的 XPath，用于定向提取文章作者
     publish_time_xpath='',  # 文章发布时间的 XPath，用于定向提取文章发布时间
     host='',  # 图片所在的域名，例如 https://www.kingname.info, 那么，当GNE 从新闻网站提取到图片的相对连接``/images/123.png``时，会把 host 拼接上去，变成``https://www.kingname.info/images/123.png``
     noise_node_list=None,  # 移除会导致干扰的标签。(XPath 的列表,列表中的 XPath 对应的标签，会在预处理时被直接删除掉，从而避免他们影响新闻正文的提取)
     with_body_html=False
 ):  # 为 True时，返回的结果会包含字段 body_html，内容是新闻正文所在标签的 HTML 源代码，默认为False
     # 预解析html(剔除换行和无用节点)
     element = pre_parse(html)
     # 剔除用户自定义的干扰节点
     remove_noise_node(element, noise_node_list)
     # 正文抽取
     content = ContentExtractor().extract(element, host, with_body_html)
     # 标题抽取
     title = TitleExtractor().extract(element, title_xpath=title_xpath)
     # 发表时间抽取
     publish_time = TimeExtractor().extractor(
         element, publish_time_xpath=publish_time_xpath)
     # 作者抽取
     author = AuthorExtractor().extractor(element,
                                          author_xpath=author_xpath)
     # 汇总结果字典
     result = {
         'title': title,
         'author': author,
         'publish_time': publish_time,
         'content': content[0][1]['text'],
         'images': content[0][1]['images']
     }
     # 判断结果字典是否加入源码
     if with_body_html or config.get('with_body_html', False):
         result['body_html'] = content[0][1]['body_html']
     return result

Exemplo n.º 5

0

Exibir arquivo

Arquivo: __init__.py Projeto: zshipu/GeneralNewsExtractor

 def __init__(self):
     self.content_extractor = ContentExtractor()
     self.title_extractor = TitleExtractor()
     self.author_extractor = AuthorExtractor()
     self.time_extractor = TimeExtractor()