def test_should_return_one_page(self):
    """A document holding a single empty PAGE yields exactly one SVG page."""
    document = E.DOCUMENT(E.PAGE())
    pages = list(iter_svg_pages_for_lxml(document))
    assert len(pages) == 1
def test_should_add_background_rect(self):
    """The converted page contains exactly one rect with class "background"."""
    document = E.DOCUMENT(E.PAGE(width='600', height='800'))
    pages = list(iter_svg_pages_for_lxml(document))
    assert len(pages) == 1

    # Look up the background rect among the direct children of the page root.
    svg_namespaces = {'svg': SVG_NS}
    matching_rects = pages[0].xpath(
        'svg:rect[@class="background"]',
        namespaces=svg_namespaces
    )
    assert len(matching_rects) == 1
def test_should_create_text_node_with_common_attributes(self):
    """A TOKEN's text, position, font and bounding box are carried to the SVG text node."""
    token = E.TOKEN(SOME_TEXT, COMMON_LXML_TOKEN_ATTRIBS)
    document = E.DOCUMENT(E.PAGE(E.TEXT(token)))

    pages = list(iter_svg_pages_for_lxml(document))
    assert len(pages) == 1

    text_node = pages[0].find('.//' + SVG_TEXT)
    assert text_node is not None
    assert text_node.text == SOME_TEXT

    # Numeric attributes are compared as floats rather than as raw strings.
    expected_numeric_attribs = [
        (SVG.X, SOME_X),
        (SVG.Y, SOME_Y),
        (SVG.FONT_SIZE, SOME_FONT_SIZE),
    ]
    for attrib_name, expected_value in expected_numeric_attribs:
        assert float(text_node.attrib[attrib_name]) == float(expected_value)

    assert text_node.attrib[SVG.FONT_FAMILY] == SOME_FONT_FAMILY
    assert text_node.attrib[SVG.FILL] == SOME_FONT_COLOR

    actual_bounding_box = parse_bounding_box(
        text_node.attrib.get(SVG.BOUNDING_BOX)
    )
    expected_bounding_box = BoundingBox(*[
        float(COMMON_LXML_TOKEN_ATTRIBS[key])
        for key in (LXML.X, LXML.Y, LXML.WIDTH, LXML.HEIGHT)
    ])
    assert actual_bounding_box == expected_bounding_box
def test_should_set_svg_dimensions(self):
    """The PAGE width and height end up in the SVG root's viewBox attribute."""
    page = E.PAGE(width='600', height='800')
    pages = list(iter_svg_pages_for_lxml(E.DOCUMENT(page)))
    assert len(pages) == 1

    view_box = pages[0].attrib.get('viewBox')
    assert view_box == '0 0 600 800'
def test_should_use_base_as_y_in_svg_if_available(self):
    """When the TOKEN carries a base attribute, the SVG text node's y equals it."""
    token_attribs = dict_extend(
        COMMON_LXML_TOKEN_ATTRIBS, {LXML.BASE: SOME_BASE}
    )
    lxml_root = E.DOCUMENT(
        E.PAGE(
            E.TEXT(
                E.TOKEN(SOME_TEXT, token_attribs)
            )
        )
    )
    svg_pages = list(iter_svg_pages_for_lxml(lxml_root))
    assert len(svg_pages) == 1
    first_page = svg_pages[0]
    svg_text = first_page.find('.//' + SVG_TEXT)
    # Fail with a clear assertion (as the sibling tests do) instead of an
    # AttributeError on None if no text node was generated.
    assert svg_text is not None
    assert float(svg_text.attrib[SVG.Y]) == float(SOME_BASE)
def test_should_keep_text_block_structure_without_block(self):
    """The SVG text node sits in a group that is a direct child of the SVG root."""
    token_attribs = dict_extend(
        COMMON_LXML_TOKEN_ATTRIBS, {LXML.BASE: SOME_BASE}
    )
    token = E.TOKEN(SOME_TEXT, token_attribs)
    document = E.DOCUMENT(E.PAGE(E.TEXT(token)))

    pages = list(iter_svg_pages_for_lxml(document))
    assert len(pages) == 1

    text_node = pages[0].find('.//' + SVG_TEXT)
    assert text_node is not None

    # Walk up the tree: text -> group -> svg document root.
    parent = text_node.getparent()
    assert parent.tag == SVG_G
    grandparent = parent.getparent()
    assert grandparent.tag == SVG_DOC
def convert_and_annotate_lxml_content(lxml_content, xml_content, xml_mapping, name=None):
    """Convert LXML content to SVG pages, annotate them and add visualisation.

    The LXML content is parsed and converted to one SVG root per page. Target
    annotations are extracted from the XML content using ``xml_mapping`` and
    applied to the SVG pages via the default annotators plus a
    ``MatchingAnnotator``. Each page is then passed through
    ``visualize_svg_annotations``. Timings of the individual steps are
    recorded and logged.

    Args:
        lxml_content: serialized LXML document to convert.
        xml_content: serialized target XML the annotations are extracted from.
        xml_mapping: mapping passed to ``xml_root_to_target_annotations``.
        name: optional label, only used in the log message.

    Returns:
        A list with the result of ``visualize_svg_annotations`` for each page.
    """
    timings = StopWatchRecorder()

    timings.start('parse lxml')
    lxml_root = etree.fromstring(lxml_content)

    # The target XML is parsed leniently, as XML errors are not uncommon.
    timings.start('parse xml')
    xml_root = xml_from_string_with_recover(xml_content)

    timings.start('extract target annotations')
    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
    timings.stop()

    # Building the annotator happens outside of any timed section.
    annotator = Annotator(
        DEFAULT_ANNOTATORS + [
            MatchingAnnotator(target_annotations, use_tag_begin_prefix=True)
        ]
    )

    timings.start('convert to svg')
    svg_pages = list(iter_svg_pages_for_lxml(lxml_root))

    timings.start('annotate svg')
    annotator.annotate(SvgStructuredDocument(svg_pages))

    timings.start('add visualisation')
    visualized_pages = list(map(visualize_svg_annotations, svg_pages))
    timings.stop()

    lxml_size = '{:,}'.format(len(lxml_content))
    xml_size = '{:,}'.format(len(xml_content))
    get_logger().info(
        'processed: name=%s, lxml size=%s, xml size=%s, timings=[%s] (native align impl=%s)',
        name, lxml_size, xml_size, timings, align_native_enabled
    )
    return visualized_pages