示例#1
0
    def __init__(self,
                 html_tree,
                 display_images=False,
                 deduplicate_captions=False,
                 display_links=False,
                 css=None):
        '''
        Configures the converter and immediately renders ``html_tree``.

        Args:
          html_tree: the HTML tree that is passed to ``crawl_tree``.
          display_images: register the ``img`` start handler.
          deduplicate_captions: stored as ``cfg_deduplicate_captions``.
          display_links: register the ``a`` start and end handlers.
          css: the CSS definition to use; the default profile is used if
             this value is falsy.
        '''
        # configuration
        self.css = css or CSS_PROFILES[DEFAULT_CSS_PROFILE_NAME]
        self.cfg_deduplicate_captions = deduplicate_captions

        # optional handlers are registered as None if the feature is disabled
        if display_links:
            link_start, link_end = self.start_a, self.end_a
        else:
            link_start = link_end = None
        image_start = self.start_img if display_images else None

        # call tables that map tag names to their start/end handlers
        self.start_tag_handler_dict = dict(
            table=self.start_table,
            tr=self.start_tr,
            td=self.start_td,
            th=self.start_td,
            ul=self.start_ul,
            ol=self.start_ol,
            li=self.start_li,
            br=self.newline,
            a=link_start,
            img=image_start,
        )
        self.end_tag_handler_dict = dict(
            table=self.end_table,
            ul=self.end_ul,
            ol=self.end_ol,
            td=self.end_td,
            th=self.end_td,
            a=link_end,
        )

        # parser state: stacks of open tags and of line buffers
        self.current_tag = [HtmlElement()]
        self.current_line, self.next_line = [Line()], [Line()]

        # canvases receiving the rendered lines: index 0 is the root canvas,
        # table cells push child canvases which are merged back into their
        # parent once the table is closed
        self.clean_text_lines = [[]]

        # table and list bookkeeping
        self.current_table = []
        self.li_counter = []
        self.li_level = 0

        # stack of elements that are currently considered invisible
        self.invisible = []
        self.last_caption = None

        # target of the most recently opened link (display_links only)
        self.link_target = ''

        # render the tree and flush the last pending line
        self.crawl_tree(html_tree)
        pending_line = self.current_line[-1]
        if pending_line:
            self.write_line()
示例#2
0
def test_html_element_str():
    '''
    Checks that ``str()`` of an HtmlElement yields the expected
    ``<tag key=value, ...>`` summary of its properties.
    '''
    expected = ('<div prefix=, suffix=, '
                'display=Display.inline, margin_before=0, '
                'margin_after=0, padding=0, '
                'whitespace=WhiteSpace.pre>')
    element = HtmlElement('div', '', '', Display.inline, 0, 0, 0,
                          WhiteSpace.pre)
    assert str(element) == expected
示例#3
0
def test_style_unit_parsing():
    '''
    Checks that fractional ``em`` margins given in a style attribute are
    reported as a margin of 3 before and after the element.
    '''
    style = "margin-top:2.666666667em;margin-bottom: 2.666666667em"
    parsed = CssParse.get_style_attribute(style, html_element=HtmlElement())
    assert parsed.margin_before == 3
    assert parsed.margin_after == 3
示例#4
0
class Inscriptis(object):
    '''
    The Inscriptis class translates an lxml HTML tree to the corresponding
    text representation.

    The conversion is performed by the constructor; call ``get_text``
    afterwards to obtain the result.

    Args:
      html_tree: the lxml HTML tree to convert.
      display_images: whether to include image titles/alt texts.
      deduplicate_captions: whether to deduplicate captions such as image
         titles (many newspapers include images and video previews with
         identical titles).
      display_links: whether to display link targets
         (e.g. `[Python](https://www.python.org)`).
      css: an optional custom CSS definition, i.e. a mapping that is queried
         with the tag name. The default CSS profile is used if this value
         is falsy.

    Example::

      from lxml.html import fromstring
      from inscriptis.html_engine import Inscriptis

      html_content = "<html><body><h1>Test</h1></body></html>"

      # create an HTML tree from the HTML content.
      html_tree = fromstring(html_content)

      # transform the HTML tree to text.
      parser = Inscriptis(html_tree)
      text = parser.get_text()
    '''

    # bullets used for unordered lists; get_bullet() cycles through them
    # based on the nesting level of the list
    UL_COUNTER = ('* ', '+ ', 'o ', '- ')
    UL_COUNTER_LEN = len(UL_COUNTER)

    # fallback element used by handle_starttag for tags that are not
    # covered by the CSS definition
    DEFAULT_ELEMENT = HtmlElement()

    def __init__(self,
                 html_tree,
                 display_images=False,
                 deduplicate_captions=False,
                 display_links=False,
                 css=None):
        '''
        Sets up the configuration, the tag handler tables and the parser
        state, and then immediately converts ``html_tree``.

        See the class docstring for a description of the arguments.
        '''
        # setup config
        self.cfg_deduplicate_captions = deduplicate_captions
        self.css = css if css else CSS_PROFILES[DEFAULT_CSS_PROFILE_NAME]

        # setup start and end tag call tables
        # (handlers of disabled features are registered as None and are,
        # therefore, skipped by handle_starttag and handle_endtag)
        self.start_tag_handler_dict = {
            'table': self.start_table,
            'tr': self.start_tr,
            'td': self.start_td,
            'th': self.start_td,
            'ul': self.start_ul,
            'ol': self.start_ol,
            'li': self.start_li,
            'br': self.newline,
            'a': self.start_a if display_links else None,
            'img': self.start_img if display_images else None,
        }
        self.end_tag_handler_dict = {
            'table': self.end_table,
            'ul': self.end_ul,
            'ol': self.end_ol,
            'td': self.end_td,
            'th': self.end_td,
            'a': self.end_a if display_links else None,
        }

        # instance variables
        # current_tag: stack of the elements that are currently open
        # current_line/next_line: stacks of line buffers; start_td pushes a
        # new pair for every table cell and end_td pops it again
        self.current_tag = [HtmlElement()]
        self.current_line = [Line()]
        self.next_line = [Line()]

        # the canvases used for displaying text
        # clean_text_lines[0] refers to the root canvas; tables write into
        # child canvases that are created for every table cell and merged
        # with the parent canvas at the end of a table
        self.clean_text_lines = [[]]

        self.current_table = []  # stack of the tables that are currently open
        self.li_counter = []     # per list: bullet (ul) or next number (ol)
        self.li_level = 0        # nesting depth of the current list
        self.invisible = []  # attributes that are considered invisible
        self.last_caption = None  # last image caption written by start_img

        # used if display_links is enabled
        self.link_target = ''

        # crawl the html tree
        self.crawl_tree(html_tree)
        # flush the last line, if the Line object evaluates to True
        if self.current_line[-1]:
            self.write_line()

    def crawl_tree(self, tree):
        '''
        Recursively walks the given tree and feeds its tags and texts to
        the handle_starttag, handle_data and handle_endtag methods.

        Args:
          tree: the (sub)tree to process; its ``tag``, ``attrib``, ``text``
             and ``tail`` attributes are read and its children are
             obtained by iterating over it.
        '''
        # nodes with a non-string tag are skipped together with their
        # children (presumably comments and processing instructions -
        # verify against the lxml documentation)
        if isinstance(tree.tag, str):
            self.handle_starttag(tree.tag, tree.attrib)
            if tree.text:
                self.handle_data(tree.text)

            for node in tree:
                self.crawl_tree(node)

            self.handle_endtag(tree.tag)

        # the tail (i.e. the text following the node) is always processed,
        # even if the node itself has been skipped
        if tree.tail:
            self.handle_data(tree.tail)

    def get_text(self):
        '''
        Joins the lines of all canvases with newlines, passes the result to
        ``unescape`` and strips trailing whitespace.

        Returns:
          str -- A text representation of the parsed content.
        '''
        return unescape('\n'.join(chain(*self.clean_text_lines))).rstrip()

    def write_line(self, force=False):
        '''
        Writes the current line to the buffer, provided that there is any
        data to write.

        Args:
          force (bool): write the line even if it is empty or contains
             only whitespace.

        Returns:
          bool -- True, if a line has been written, otherwise False.
        '''
        # only break the line if there is any relevant content
        if not force and (not self.current_line[-1].content
                          or self.current_line[-1].content.isspace()):
            # nothing to write: only ensure that the pending line keeps the
            # largest margin_before that has been requested so far
            self.current_line[-1].margin_before = \
                max(self.current_line[-1].margin_before,
                    self.current_tag[-1].margin_before)
            return False

        # append the line to the current canvas; the prepared next line
        # becomes the current line and a fresh Line is queued after it
        line = self.current_line[-1].get_text()
        self.clean_text_lines[-1].append(line)
        self.current_line[-1] = self.next_line[-1]
        self.next_line[-1] = Line()
        return True

    def write_line_verbatim(self, text):
        '''
        Appends the given text to the current canvas without any
        modifications.

        Args:
          text (str): the text to write.
        '''
        self.clean_text_lines[-1].append(text)

    def handle_starttag(self, tag, attrs):
        '''
        Handles HTML start tags.

        Args:
          tag (str): the HTML start tag to process.
          attrs (dict): a dictionary of HTML attributes and their respective
             values.
        '''
        # use the css to handle tags known to it :)

        cur = self.css.get(tag, Inscriptis.DEFAULT_ELEMENT)
        # an inline style attribute refines the element obtained from the css
        if 'style' in attrs:
            cur = CssParse.get_style_attribute(attrs['style'],
                                               html_element=cur)
        self.current_tag.append(cur)
        # elements with display:none and all tags nested within them are
        # ignored; they are recorded on the invisible stack so that
        # handle_endtag can unwind them again
        if cur.display == Display.none or self.invisible:
            self.invisible.append(cur)
            return

        # lines following the current one are indented by the additional
        # padding of the element
        self.next_line[-1].padding = self.current_line[-1].padding \
            + cur.padding
        # flush text before display:block elements
        if cur.display == Display.block:
            if not self.write_line():
                # no line has been written: the (empty) current line takes
                # over the margin and the padding of the block element
                self.current_line[-1].margin_before = max(
                    self.current_line[-1].margin_before, cur.margin_before)
                self.current_line[-1].padding = self.next_line[-1].padding
            else:
                # a line has been written: update margin_after of the line
                # that is now current
                self.current_line[-1].margin_after = max(
                    self.current_line[-1].margin_after, cur.margin_after)

        # call the tag specific handler, if one has been registered
        handler = self.start_tag_handler_dict.get(tag, None)
        if handler:
            handler(attrs)

    def handle_endtag(self, tag):
        '''
        Handles HTML end tags.

        Args:
          tag(str): the HTML end tag to process.
        '''
        cur = self.current_tag.pop()
        # end tags within invisible areas only unwind the invisible stack
        if self.invisible:
            self.invisible.pop()
            return

        # revert the padding that has been added by the element
        self.next_line[-1].padding = self.current_line[-1].padding \
            - cur.padding
        self.current_line[-1].margin_after = max(
            self.current_line[-1].margin_after, cur.margin_after)
        # flush text after display:block elements
        if cur.display == Display.block:
            # propagate the new padding to the current line, if nothing has
            # been written
            if not self.write_line():
                self.current_line[-1].padding = self.next_line[-1].padding

        # call the tag specific handler, if one has been registered
        handler = self.end_tag_handler_dict.get(tag, None)
        if handler:
            handler()

    def handle_data(self, data):
        '''
        Handles text belonging to HTML tags.

        The text is ignored within invisible areas; otherwise it is
        appended to the content of the current line.

        Args:
          data (str): The text to process.
        '''
        if self.invisible:
            return

        # protect pre areas
        # (the text is enclosed in NUL characters; presumably these markers
        # are evaluated by Line - verify against its implementation)
        if self.current_tag[-1].whitespace == WhiteSpace.pre:
            data = '\0' + data + '\0'

        # add prefix and suffix, if present
        data = self.current_tag[-1].prefix + data + self.current_tag[-1].suffix

        # append the text to the current line; within a table cell this is
        # the line buffer that has been pushed by start_td
        self.current_line[-1].content += data

    def start_ul(self, attrs):
        '''
        Opens an unordered list: increases the nesting level and registers
        the bullet used for this level.

        Args:
          attrs (dict): the attributes of the tag (unused).
        '''
        self.li_level += 1
        self.li_counter.append(Inscriptis.get_bullet(self.li_level - 1))

    def end_ul(self):
        '''
        Closes an unordered list: decreases the nesting level and removes
        the bullet of the list.
        '''
        self.li_level -= 1
        self.li_counter.pop()

    def start_img(self, attrs):
        '''
        Writes the caption of an image as ``[caption]`` to the current line.

        The ``alt`` attribute is used as caption; ``title`` serves as
        fallback, if ``alt`` is missing or empty. Nothing is written for
        empty captions, or if caption deduplication is enabled and the
        caption equals the last caption that has been written.

        Args:
          attrs (dict): the attributes of the img tag.
        '''
        image_text = attrs.get('alt', '') or attrs.get('title', '')
        if image_text and not (self.cfg_deduplicate_captions
                               and image_text == self.last_caption):
            self.current_line[-1].content += '[{}]'.format(image_text)
            self.last_caption = image_text

    def start_a(self, attrs):
        '''
        Remembers the link target (``href``; empty, if not present) and
        opens the link text with ``[``.

        Args:
          attrs (dict): the attributes of the a tag.
        '''
        self.link_target = attrs.get('href', '')
        self.current_line[-1].content += '['

    def end_a(self):
        '''
        Closes the link text and appends the link target that has been
        stored by start_a, i.e. ``](target)``.
        '''
        self.current_line[-1].content += ']({})'.format(self.link_target)

    def start_ol(self, attrs):
        '''
        Opens an ordered list: registers a counter starting with 1 and
        increases the nesting level.

        Args:
          attrs (dict): the attributes of the tag (unused).
        '''
        self.li_counter.append(1)
        self.li_level += 1

    def end_ol(self):
        '''
        Closes an ordered list: decreases the nesting level and removes the
        counter of the list.
        '''
        self.li_level -= 1
        self.li_counter.pop()

    def start_li(self, attrs):
        '''
        Starts a list item and sets the bullet of the current line.

        Args:
          attrs (dict): the attributes of the tag (unused).
        '''
        # write the pending line, if it contains any relevant content
        self.write_line()
        if self.li_level > 0:
            bullet = self.li_counter[-1]
        else:
            # list item outside of any list
            bullet = "* "
        # integer bullets are the counters registered by start_ol; they
        # are incremented for the next item of the list
        if isinstance(bullet, int):
            self.li_counter[-1] += 1
            self.current_line[-1].list_bullet = "{}. ".format(bullet)
        else:
            self.current_line[-1].list_bullet = bullet

    def start_table(self, attrs):
        '''
        Opens a new table and pushes it onto the table stack.

        Args:
          attrs (dict): the attributes of the tag (unused).
        '''
        self.current_table.append(Table())

    def start_tr(self, attrs):
        '''
        Adds a new row to the current table. Rows outside of a table are
        ignored.

        Args:
          attrs (dict): the attributes of the tag (unused).
        '''
        if self.current_table:
            # check whether we need to cleanup a <td> tag that has not been
            # closed yet
            if self.current_table[-1].td_is_open:
                self.end_td()

            self.current_table[-1].add_row()

    def start_td(self, attrs):
        '''
        Opens a new table cell (used for td and th tags). Cells outside of
        a table are ignored.

        Args:
          attrs (dict): the attributes of the tag (unused).
        '''
        if self.current_table:
            # check whether we need to cleanup a <td> tag that has not been
            # closed yet
            if self.current_table[-1].td_is_open:
                self.end_td()

            # open td tag: the cell obtains its own canvas and line buffers;
            # the canvas is registered as a cell of the current table
            self.clean_text_lines.append([])
            self.current_line.append(Line())
            self.next_line.append(Line())
            self.current_table[-1].add_cell(self.clean_text_lines[-1])
            self.current_table[-1].td_is_open = True

    def end_td(self):
        '''
        Closes the currently open table cell (used for td and th tags).
        Does nothing, if no cell is open.
        '''
        if self.current_table and self.current_table[-1].td_is_open:
            self.current_table[-1].td_is_open = False
            # write the pending line to the canvas of the cell and remove
            # the canvas and the line buffers of the cell from the stacks
            self.write_line(force=True)
            self.clean_text_lines.pop()
            self.current_line.pop()
            self.next_line.pop()

    def end_tr(self):
        '''
        Does nothing. Note that this method is not registered in the
        end_tag_handler_dict.
        '''
        pass

    def end_table(self):
        '''
        Closes the current table and writes its text representation to the
        canvas that is current after the cells of the table are closed.
        '''
        # close a cell that is still open
        if self.current_table and self.current_table[-1].td_is_open:
            self.end_td()
        self.write_line()
        table = self.current_table.pop()
        self.write_line_verbatim(table.get_text())

    def newline(self, attrs):
        '''
        Handles br tags by writing the current line, even if it is empty.

        Args:
          attrs (dict): the attributes of the tag (unused).
        '''
        self.write_line(force=True)

    @staticmethod
    def get_bullet(index):
        '''
        Args:
          index (int): the index of the bullet; indices beyond the number
             of available bullets wrap around.

        Returns:
          str -- The bullet that corresponds to the given index.
        '''
        return Inscriptis.UL_COUNTER[index % Inscriptis.UL_COUNTER_LEN]
示例#5
0
# coding: utf-8
'''
Standard CSS profiles shipped with inscriptis.

- `strict`: this profile corresponds to the defaults used by Firefox
- `relaxed`: this profile is more suited for text analytics, since it ensures
             that whitespaces are inserted between span and div elements
             preventing cases where two words stick together.
'''

from inscriptis.model.css import HtmlElement
from inscriptis.html_properties import Display, WhiteSpace

STRICT_CSS_PROFILE = {
    'body':
    HtmlElement('body', display=Display.inline, whitespace=WhiteSpace.normal),
    'head':
    HtmlElement('head', display=Display.none),
    'link':
    HtmlElement('link', display=Display.none),
    'meta':
    HtmlElement('meta', display=Display.none),
    'script':
    HtmlElement('script', display=Display.none),
    'title':
    HtmlElement('title', display=Display.none),
    'style':
    HtmlElement('style', display=Display.none),
    'p':
    HtmlElement('p', display=Display.block, margin_before=1, margin_after=1),
    'figure':
示例#6
0
# coding: utf-8
'''
Standard CSS profiles shipped with inscriptis.

- `strict`: this profile corresponds to the defaults used by Firefox
- `relaxed`: this profile is more suited for text analytics, since it ensures
             that whitespaces are inserted between span and div elements
             preventing cases where two words stick together.
'''

from inscriptis.model.css import HtmlElement
from inscriptis.html_properties import Display

STRICT_CSS_PROFILE = {
    'head':
    HtmlElement('head', display=Display.none),
    'link':
    HtmlElement('link', display=Display.none),
    'meta':
    HtmlElement('meta', display=Display.none),
    'script':
    HtmlElement('script', display=Display.none),
    'title':
    HtmlElement('title', display=Display.none),
    'style':
    HtmlElement('style', display=Display.none),
    'p':
    HtmlElement('p', display=Display.block, margin_before=1, margin_after=1),
    'figure':
    HtmlElement('figure',
                display=Display.block,
def test_html_element_refinement():
    '''
    Checks how ``get_refined_html_element`` treats the prefix and suffix of
    an element with ``limit_whitespace_affixes`` enabled: whitespace-only
    affixes are expected to be dropped below a ``pre`` element but kept
    below ``code``, while other affixes are expected to be kept in both
    cases.
    '''
    refinement = HtmlElement('span', display=Display.inline, prefix=' ',
                             suffix=' ', limit_whitespace_affixes=True)
    pre_element = HtmlElement('pre', display=Display.block,
                              whitespace=WhiteSpace.pre)
    code_element = HtmlElement('code')

    # whitespace-only affixes: removed for pre, preserved for code
    for parent, expected_affix in ((pre_element, ''), (code_element, ' ')):
        result = parent.get_refined_html_element(refinement)
        assert result.prefix == expected_affix
        assert result.suffix == expected_affix

    # affixes containing non-whitespace characters: always preserved
    refinement.prefix = ' 1. '
    refinement.suffix = '<'
    for parent in (pre_element, code_element):
        result = parent.get_refined_html_element(refinement)
        assert result.prefix == ' 1. '
        assert result.suffix == '<'