def test_html_element_refinement():
    """Check the affixes produced by ``get_refined_html_element``.

    A ``span`` with ``limit_whitespace_affixes=True`` is refined against a
    ``pre`` parent (``WhiteSpace.pre``) and a plain ``code`` parent, first
    with whitespace-only affixes and then with affixes that contain
    non-whitespace characters.
    """
    span = HtmlElement('span', display=Display.inline, prefix=' ', suffix=' ',
                       limit_whitespace_affixes=True)
    pre = HtmlElement('pre', display=Display.block, whitespace=WhiteSpace.pre)
    code = HtmlElement('code')

    # whitespace-only affixes below ``pre``: both are expected to be emptied
    below_pre = pre.get_refined_html_element(copy(span))
    assert (below_pre.prefix, below_pre.suffix) == ('', '')

    # whitespace-only affixes below ``code``: both are expected to survive
    below_code = code.get_refined_html_element(copy(span))
    assert (below_code.prefix, below_code.suffix) == (' ', ' ')

    # from here on the affixes contain non-whitespace characters
    span.prefix = ' 1. '
    span.suffix = '<'

    # non-whitespace affixes below ``pre``: kept unchanged
    below_pre = pre.get_refined_html_element(copy(span))
    assert (below_pre.prefix, below_pre.suffix) == (' 1. ', '<')

    # non-whitespace affixes below ``code``: kept unchanged
    below_code = code.get_refined_html_element(copy(span))
    assert (below_code.prefix, below_code.suffix) == (' 1. ', '<')
def test_formatting():
    """Check how display, margins, bullet, padding and affixes shape output.

    The same element is mutated step by step and re-rendered through
    ``_get_text`` after every change, so each assertion builds on the
    settings applied by the previous steps.
    """
    # standard line
    h = HtmlElement()
    assert _get_text(h) == 'firstEhre sei Gott!last'

    # block display with a margin before and after the element
    h.display = Display.block
    h.margin_before = 1
    h.margin_after = 2
    assert _get_text(h) == 'first\n\nEhre sei Gott!\n\n\nlast'

    # list bullet without padding_inline
    h.list_bullet = "* "
    assert _get_text(h) == 'first\n\n* Ehre sei Gott!\n\n\nlast'

    # add a padding_inline
    h.padding_inline = 3
    assert _get_text(h) == 'first\n\n * Ehre sei Gott!\n\n\nlast'

    # and prefixes + suffixes
    h.prefix = '>>'
    h.suffix = '<<'
    assert _get_text(h) == 'first\n\n * >>Ehre sei Gott!<<\n\n\nlast'
def _get_text(html_element):
    """
    Render a fixed sample through ``html_element`` on a fresh canvas.

    The text "first" and "last" is written with default ``HtmlElement``
    instances around "Ehre sei Gott!", which is written by ``html_element``
    between ``open_tag`` and ``close_tag``.

    Returns
        the canvas blocks joined with newlines.
    """
    canvas = Canvas()
    html_element.canvas = canvas

    # text written before the element under test
    leading = HtmlElement().set_canvas(canvas)
    leading.write("first")

    # the element under test, wrapped in its open/close tag handling
    canvas.open_tag(html_element)
    html_element.write("Ehre sei Gott!")
    canvas.close_tag(html_element)

    # text written after the element under test
    trailing = HtmlElement().set_canvas(canvas)
    trailing.write("last")

    # flush the inline content before the blocks are read
    canvas._flush_inline()
    rendered_blocks = canvas.blocks
    return '\n'.join(rendered_blocks)
示例#4
0
def test_parse_horizontal_align():
    """
    A valid horizontal alignment is applied; an unknown value changes nothing.
    """
    element = HtmlElement()

    CssParse.attr_horizontal_align('center', element)
    assert element.align == HorizontalAlignment.center

    # an unrecognised value must leave the alignment set above in place
    CssParse.attr_horizontal_align('unknown', element)
    assert element.align == HorizontalAlignment.center
示例#5
0
def test_parse_vertical_align():
    """
    A valid vertical alignment is applied; an unknown value changes nothing.
    """
    element = HtmlElement()

    CssParse.attr_vertical_align('top', element)
    assert element.valign == VerticalAlignment.top

    # an unrecognised value must leave the alignment set above in place
    CssParse.attr_vertical_align('unknown', element)
    assert element.valign == VerticalAlignment.top
示例#6
0
def test_html_element_str():
    """
    Compare ``str()`` of an HtmlElement against its expected representation.
    """
    element = HtmlElement('div', '', '', Display.inline, 0, 0, 0, '',
                          WhiteSpace.pre)
    expected = ('<div prefix=, suffix=, '
                'display=Display.inline, margin_before=0, '
                'margin_after=0, padding_inline=0, '
                'list_bullet=, '
                'whitespace=WhiteSpace.pre, '
                'align=HorizontalAlignment.left, '
                'valign=VerticalAlignment.middle, '
                'annotation=()>')
    assert str(element) == expected
def test_parse():
    """
    Basic rule parsing: tag rules and attribute rules are returned separately.
    """
    annotation_rules = {
        'table#border=1': ['table'],
        'hr': ['horizontal-line'],
    }
    tag_rules, attribute_rules = AnnotationModel._parse(annotation_rules)

    # the plain tag rule ends up in the tag mapping
    assert tag_rules == {'hr': ['horizontal-line']}

    # the ``tag#attr=value`` rule is the first attribute rule
    border_rule = attribute_rules[0]
    assert border_rule.match_tag == 'table'
    assert border_rule.match_value == '1'
    assert border_rule.attr == 'border'

    # applying the rule with the matching value annotates the element
    table_element = HtmlElement(tag='table')
    border_rule.apply('1', table_element)
    assert table_element.annotation == ('table', )
def test_merged_attribute():
    """
    Test multiple rules per attribute.

    Two rules target the ``color`` attribute; the handler is invoked three
    times on the same element and the annotations collected so far are
    checked after every call.
    """
    rules = {'#color=white': ['white'], '#color=yellow': ['yellow']}
    css = deepcopy(CSS_PROFILES['strict'])
    model = AnnotationModel(css, rules)

    handler = Attribute()
    handler.merge_attribute_map(model.css_attr)

    element = HtmlElement()
    # (attribute value, annotations expected on the element afterwards)
    steps = (
        ('green', ()),
        ('yellow', ('yellow', )),
        ('white', ('yellow', 'white')),
    )
    for color, expected_annotation in steps:
        handler.attribute_mapping['color'](color, element)
        assert element.annotation == expected_annotation
示例#9
0
#!/usr/bin/env python3
# coding: utf-8
"""Standard CSS profiles shipped with inscriptis.

- `strict`: this profile corresponds to the defaults used by Firefox
- `relaxed`: this profile is more suited for text analytics, since it ensures
             that whitespaces are inserted between span and div elements
             preventing cases where two words stick together.
"""

from inscriptis.model.html_element import HtmlElement
from inscriptis.html_properties import Display, WhiteSpace

STRICT_CSS_PROFILE = {
    'body':
    HtmlElement(display=Display.inline, whitespace=WhiteSpace.normal),
    'head':
    HtmlElement(display=Display.none),
    'link':
    HtmlElement(display=Display.none),
    'meta':
    HtmlElement(display=Display.none),
    'script':
    HtmlElement(display=Display.none),
    'title':
    HtmlElement(display=Display.none),
    'style':
    HtmlElement(display=Display.none),
    'p':
    HtmlElement(display=Display.block, margin_before=1, margin_after=1),
    'figure':
示例#10
0
def test_style_unit_parsing():
    """
    Margins given in fractional ``em`` units end up as integer margins.
    """
    element = HtmlElement()
    style = "margin-top:2.666666667em;margin-bottom: 2.666666667em"

    CssParse.attr_style(style, element)

    assert element.margin_before == 3
    assert element.margin_after == 3