def test_html_element_refinement():
    """Check the affixes of a span after refinement by ``pre`` and ``code``.

    The span is created with ``limit_whitespace_affixes=True``. A copy of it
    is refined by each parent, first with whitespace-only affixes and then
    with affixes that contain non-whitespace characters.
    """
    child = HtmlElement('span', display=Display.inline, prefix=' ',
                        suffix=' ', limit_whitespace_affixes=True)
    pre_parent = HtmlElement('pre', display=Display.block,
                             whitespace=WhiteSpace.pre)
    code_parent = HtmlElement('code')

    # Whitespace-only affixes: the expected (prefix, suffix) differs between
    # the ``pre`` parent (both empty) and the ``code`` parent (both kept).
    whitespace_cases = (
        (pre_parent, '', ''),
        (code_parent, ' ', ' '),
    )
    for parent, expected_prefix, expected_suffix in whitespace_cases:
        refined = parent.get_refined_html_element(copy(child))
        assert refined.prefix == expected_prefix
        assert refined.suffix == expected_suffix

    # Affixes with non-whitespace content are expected to be left untouched
    # by both parents.
    child.prefix = ' 1. '
    child.suffix = '<'
    for parent in (pre_parent, code_parent):
        refined = parent.get_refined_html_element(copy(child))
        assert refined.prefix == ' 1. '
        assert refined.suffix == '<'
def test_formatting():
    """Check the text rendered for one element as its formatting changes.

    The same element is re-rendered through ``_get_text`` after each change:
    block display with margins, a list bullet, inline padding, and finally
    a prefix and suffix.
    """
    element = HtmlElement()

    # A default element is written on the same line as its neighbours.
    assert _get_text(element) == 'firstEhre sei Gott!last'

    # Block display with one line of margin before and two after.
    element.display = Display.block
    element.margin_before = 1
    element.margin_after = 2
    print(element)
    print(_get_text(element))
    block_text = _get_text(element)
    assert block_text == 'first\n\nEhre sei Gott!\n\n\nlast'

    # A list bullet while no inline padding is set.
    element.list_bullet = "* "
    bullet_text = _get_text(element)
    assert bullet_text == 'first\n\n* Ehre sei Gott!\n\n\nlast'

    # The same bullet once inline padding is added.
    element.padding_inline = 3
    padded_text = _get_text(element)
    assert padded_text == 'first\n\n * Ehre sei Gott!\n\n\nlast'

    # Prefix and suffix wrap the text, after the bullet.
    element.prefix = '>>'
    element.suffix = '<<'
    affixed_text = _get_text(element)
    assert affixed_text == 'first\n\n * >>Ehre sei Gott!<<\n\n\nlast'
def _get_text(html_element):
    """Render a sample sentence through ``html_element`` and return the text.

    A new ``Canvas`` is created and assigned to ``html_element.canvas``.
    The word "first" is written with a default element, then the sample
    sentence is written inside the opened tag of ``html_element``, then
    "last" is written with another default element.

    Returns:
        The canvas blocks joined by newlines.
    """
    canvas = Canvas()
    html_element.canvas = canvas

    # Text before the element under test.
    leading = HtmlElement().set_canvas(canvas)
    leading.write("first")

    # The element under test, wrapped in its open/close tag calls.
    canvas.open_tag(html_element)
    html_element.write("Ehre sei Gott!")
    canvas.close_tag(html_element)

    # Text after the element under test.
    trailing = HtmlElement().set_canvas(canvas)
    trailing.write("last")

    canvas._flush_inline()
    rendered_blocks = canvas.blocks
    return '\n'.join(rendered_blocks)
def test_parse_horizontal_align():
    """Check horizontal alignment parsing for a valid and an invalid value.

    After parsing 'center' the element is expected to be centered, and the
    subsequent invalid value 'unknown' is expected to leave that unchanged.
    """
    element = HtmlElement()
    expected_alignment = HorizontalAlignment.center

    # First a valid value, then an invalid one; same expectation for both.
    for value in ('center', 'unknown'):
        CssParse.attr_horizontal_align(value, element)
        assert element.align == expected_alignment
def test_parse_vertical_align():
    """Check vertical alignment parsing for a valid and an invalid value.

    After parsing 'top' the element is expected to be top-aligned, and the
    subsequent invalid value 'unknown' is expected to leave that unchanged.
    """
    element = HtmlElement()
    expected_alignment = VerticalAlignment.top

    # First a valid value, then an invalid one; same expectation for both.
    for value in ('top', 'unknown'):
        CssParse.attr_vertical_align(value, element)
        assert element.valign == expected_alignment
def test_html_element_str():
    """Check the string representation of an ``HtmlElement``."""
    element = HtmlElement('div', '', '', Display.inline, 0, 0, 0, '',
                          WhiteSpace.pre)
    # The expected text also lists align, valign and annotation, which are
    # not passed to the constructor above.
    expected = ('<div prefix=, suffix=, '
                'display=Display.inline, margin_before=0, '
                'margin_after=0, padding_inline=0, '
                'list_bullet=, '
                'whitespace=WhiteSpace.pre, '
                'align=HorizontalAlignment.left, '
                'valign=VerticalAlignment.middle, '
                'annotation=()>')
    actual = str(element)
    assert actual == expected
def test_parse():
    """Check basic parsing of annotation rules.

    A plain tag rule is expected in the returned tag mapping, while the
    ``tag#attribute=value`` rule is expected as the first attribute rule.
    """
    rules = {'table#border=1': ['table'],
             'hr': ['horizontal-line']}
    tag_rules, attribute_rules = AnnotationModel._parse(rules)

    # Only the plain tag rule ends up in the tag mapping.
    expected_tag_rules = {'hr': ['horizontal-line']}
    assert tag_rules == expected_tag_rules

    # The attribute rule is split into tag, value and attribute name.
    border_rule = attribute_rules[0]
    assert border_rule.match_tag == 'table'
    assert border_rule.match_value == '1'
    assert border_rule.attr == 'border'

    # Applying the rule with the matching value annotates a table element.
    table = HtmlElement(tag='table')
    border_rule.apply('1', table)
    assert table.annotation == ('table', )
def test_merged_attribute():
    """Check that several rules for the same attribute are all honoured.

    Two rules are registered for the ``color`` attribute. The merged
    handler is then called with a value no rule covers, followed by both
    covered values, on the same element.
    """
    rules = {'#color=white': ['white'],
             '#color=yellow': ['yellow']}
    profile = deepcopy(CSS_PROFILES['strict'])
    model = AnnotationModel(profile, rules)

    handler = Attribute()
    handler.merge_attribute_map(model.css_attr)

    element = HtmlElement()
    # (attribute value, annotation expected on the element afterwards);
    # annotations accumulate in the order the values are applied.
    steps = (
        ('green', ()),
        ('yellow', ('yellow', )),
        ('white', ('yellow', 'white')),
    )
    for value, expected_annotation in steps:
        handler.attribute_mapping['color'](value, element)
        assert element.annotation == expected_annotation
#!/usr/bin/env python3 # coding: utf-8 """Standard CSS profiles shipped with inscriptis. - `strict`: this profile corresponds to the defaults used by Firefox - `relaxed`: this profile is more suited for text analytics, since it ensures that whitespaces are inserted between span and div elements preventing cases where two words stick together. """ from inscriptis.model.html_element import HtmlElement from inscriptis.html_properties import Display, WhiteSpace STRICT_CSS_PROFILE = { 'body': HtmlElement(display=Display.inline, whitespace=WhiteSpace.normal), 'head': HtmlElement(display=Display.none), 'link': HtmlElement(display=Display.none), 'meta': HtmlElement(display=Display.none), 'script': HtmlElement(display=Display.none), 'title': HtmlElement(display=Display.none), 'style': HtmlElement(display=Display.none), 'p': HtmlElement(display=Display.block, margin_before=1, margin_after=1), 'figure':
def test_style_unit_parsing():
    """Check that fractional ``em`` margins in a style become whole numbers.

    Both margins are given as 2.666666667em and are expected to be 3 on
    the element after parsing.
    """
    element = HtmlElement()
    style = "margin-top:2.666666667em;margin-bottom: 2.666666667em"
    expected_margin = 3

    CssParse.attr_style(style, element)

    assert element.margin_before == expected_margin
    assert element.margin_after == expected_margin