def test_should_return_samefont_if_font_size_is_the_same(self):
    """Two consecutive tokens with equal font size give ``SAMEFONTSIZE``."""
    previous = LayoutToken(
        '', font=LayoutFont(font_id='dummy', font_size=1))
    current = LayoutToken(
        '', font=LayoutFont(font_id='dummy', font_size=1))
    feature_value = get_token_font_size_feature(
        previous_token=previous, current_token=current)
    assert feature_value == 'SAMEFONTSIZE'
def test_should_return_lowerfont_if_font_size_is_smaller(self):
    """A current token with a smaller font size gives ``LOWERFONT``."""
    previous = LayoutToken(
        '', font=LayoutFont(font_id='dummy', font_size=2))
    current = LayoutToken(
        '', font=LayoutFont(font_id='dummy', font_size=1))
    feature_value = get_token_font_size_feature(
        previous_token=previous, current_token=current)
    assert feature_value == 'LOWERFONT'
def test_should_return_higherfont_if_new_font_has_no_size(self):
    """A current token whose font size is ``None`` gives ``HIGHERFONT``."""
    previous = LayoutToken(
        '', font=LayoutFont(font_id='dummy', font_size=1))
    current = LayoutToken(
        '', font=LayoutFont(font_id='dummy', font_size=None))
    feature_value = get_token_font_size_feature(
        previous_token=previous, current_token=current)
    assert feature_value == 'HIGHERFONT'
def test_should_return_higherfont_if_font_size_is_larger(self):
    """A current token with a larger font size gives ``HIGHERFONT``."""
    previous = LayoutToken(
        '', font=LayoutFont(font_id='dummy', font_size=1))
    current = LayoutToken(
        '', font=LayoutFont(font_id='dummy', font_size=2))
    feature_value = get_token_font_size_feature(
        previous_token=previous, current_token=current)
    assert feature_value == 'HIGHERFONT'
def test_should_add_superscript_text(self):
    """The superscript-font token ends up in a ``hi[@rend="superscript"]``."""
    layout_tokens = [
        LayoutToken(TOKEN_1),
        LayoutToken(TOKEN_2, font=SUPERSCRIPT_FONT_1),
        LayoutToken(TOKEN_3),
    ]
    block = LayoutBlock.for_tokens(layout_tokens)
    tei_children = iter_layout_block_tei_children(block)
    node = TEI_E.node(*tei_children)

    superscript_texts = get_tei_xpath_text_content_list(
        node, './tei:hi[@rend="superscript"]')
    assert superscript_texts == [TOKEN_2]

    # The full text content must still contain all three tokens in order.
    expected_text = ' '.join([TOKEN_1, TOKEN_2, TOKEN_3])
    assert get_text_content(node) == expected_text
def test_should_replace_text_and_graphics_within_bounding_box_of_semantic_graphics(
        self):
    """Check which tokens and graphics survive replacement by semantic graphics.

    The page holds one token and one graphic outside the semantic graphic's
    bounding box ("keep") and one of each inside it ("remove"). A second
    semantic graphic with zero-size coordinates is passed in as well; the
    assertions expect only the kept graphic plus the new graphic on the page.
    """
    def coordinates_for(*bounding_box_args):
        # All coordinates in this test live on page 1.
        return LayoutPageCoordinates.from_bounding_box(
            BoundingBox(*bounding_box_args), page_number=1)

    page_coordinates = coordinates_for(0, 0, 200, 200)
    semantic_graphic_coordinates = coordinates_for(10, 90, 100, 50)
    keep_coordinates = coordinates_for(10, 10, 100, 20)
    remove_coordinates = coordinates_for(10, 100, 100, 20)
    empty_coordinates = coordinates_for(10, 100, 0, 0)

    keep_token = LayoutToken('keep', coordinates=keep_coordinates)
    remove_token = LayoutToken('remove', coordinates=remove_coordinates)
    keep_graphic = LayoutGraphic(
        coordinates=keep_coordinates, graphic_type='keep-graphic')
    remove_graphic = LayoutGraphic(
        coordinates=remove_coordinates, graphic_type='remove-graphic')

    token_line = LayoutLine(tokens=[keep_token, remove_token])
    page_meta = LayoutPageMeta(
        page_number=page_coordinates.page_number,
        coordinates=page_coordinates)
    page = LayoutPage(
        blocks=[LayoutBlock(lines=[token_line])],
        graphics=[keep_graphic, remove_graphic],
        meta=page_meta)
    layout_document = LayoutDocument(pages=[page])

    layout_graphic = LayoutGraphic(
        coordinates=semantic_graphic_coordinates,
        graphic_type='new-graphic')
    no_coords_layout_graphic = LayoutGraphic(
        coordinates=empty_coordinates,
        graphic_type='empty-coords-graphic')
    semantic_graphics = [
        SemanticGraphic(layout_graphic=layout_graphic),
        SemanticGraphic(layout_graphic=no_coords_layout_graphic),
    ]

    result = get_layout_document_with_text_and_graphics_replaced_by_graphics(
        layout_document, semantic_graphics=semantic_graphics)

    LOGGER.debug('result.pages[0].graphics: %r', result.pages[0].graphics)
    assert result.pages[0].graphics[:-1] == [keep_graphic]

    LOGGER.debug('result.pages[0].graphics[-1]: %r',
                 result.pages[0].graphics[-1])
    last_graphic = result.pages[0].graphics[-1]
    assert last_graphic.graphic_type == layout_graphic.graphic_type
    assert last_graphic.coordinates == layout_graphic.coordinates

    remaining_tokens = list(result.pages[0].blocks[0].iter_all_tokens())
    assert remaining_tokens == [keep_token]

    related_tokens = list(
        result.pages[0].graphics[-1].related_block.iter_all_tokens())
    assert related_tokens == [keep_token, remove_token]
def test_should_detect_indented_blocks(self):
    """A second line starting further right (x=50 vs x=10) is indented."""
    feature = LineIndentationStatusFeature()
    feature.on_new_block()

    feature.on_new_line()
    first_line_token = LayoutToken(
        'x',
        coordinates=LayoutPageCoordinates(x=10, y=10, width=10, height=10))
    assert feature.get_is_indented_and_update(first_line_token) is False

    feature.on_new_line()
    second_line_token = LayoutToken(
        'x',
        coordinates=LayoutPageCoordinates(x=50, y=10, width=10, height=10))
    assert feature.get_is_indented_and_update(second_line_token) is True
def test_should_add_bold_and_italics_text(self):
    """A bold-italic token is found under both a bold and an italic ``hi``."""
    layout_tokens = [
        LayoutToken(TOKEN_1),
        LayoutToken(TOKEN_2, font=BOLD_ITALICS_FONT_1),
        LayoutToken(TOKEN_3),
    ]
    block = LayoutBlock.for_tokens(layout_tokens)
    node = TEI_E.node(*iter_layout_block_tei_children(block))
    LOGGER.debug('xml: %r', etree.tostring(node))

    bold_texts = get_tei_xpath_text_content_list(
        node, './/tei:hi[@rend="bold"]')
    assert bold_texts == [TOKEN_2]

    italic_texts = get_tei_xpath_text_content_list(
        node, './/tei:hi[@rend="italic"]')
    assert italic_texts == [TOKEN_2]

    expected_text = ' '.join([TOKEN_1, TOKEN_2, TOKEN_3])
    assert get_text_content(node) == expected_text
def test_should_create_lines_based_on_line_descriptor(self):
    """Tokens with different line descriptors are grouped into separate lines."""
    line_descriptor_1 = LayoutLineDescriptor(line_id=1)
    line_descriptor_2 = LayoutLineDescriptor(line_id=2)

    line_tokens_1 = []
    for token_text in ['token1.1', 'token1.2']:
        line_tokens_1.append(
            LayoutToken(token_text, line_descriptor=line_descriptor_1))

    line_tokens_2 = []
    for token_text in ['token2.1', 'token2.2']:
        line_tokens_2.append(
            LayoutToken(token_text, line_descriptor=line_descriptor_2))

    layout_block = LayoutBlock.for_tokens(line_tokens_1 + line_tokens_2)

    assert len(layout_block.lines) == 2
    first_line, second_line = layout_block.lines[0], layout_block.lines[1]
    assert first_line.tokens == line_tokens_1
    assert second_line.tokens == line_tokens_2
def test_should_retokenize_document_with_placeholders(self):
    """A token containing a space is split into two tokens.

    The expected x offset and width of each resulting token are derived from
    the original width (100) in proportion to character counts.
    """
    text = 'token1 token2'
    original_token = LayoutToken(
        text,
        whitespace='\n',
        coordinates=LayoutPageCoordinates(x=10, y=10, width=100, height=50))
    page = LayoutPage(
        blocks=[LayoutBlock.for_tokens([original_token])],
        graphics=[])
    layout_document = LayoutDocument(pages=[page])

    retokenized_layout_document = retokenize_layout_document(layout_document)
    line = retokenized_layout_document.pages[0].blocks[0].lines[0]

    assert [t.text for t in line.tokens] == ['token1', 'token2']
    # The original trailing whitespace stays on the last token.
    assert [t.whitespace for t in line.tokens] == [' ', '\n']

    first_coordinates = line.tokens[0].coordinates
    assert first_coordinates.x == 10.0
    assert first_coordinates.width == 100 * len('token1') / len(text)

    second_coordinates = line.tokens[1].coordinates
    expected_second_x = 10.0 + 100 * len('token1 ') / len(text)
    assert second_coordinates.x == expected_second_x
    assert line.tokens[1].coordinates.width == (
        100 * len('token2') / len(text))
def test_should_be_able_to_set_title_with_italic_layout_tokens(self):
    """Setting a title block with an italic token yields an italic ``hi``."""
    title_tokens = [
        LayoutToken('rend'),
        LayoutToken('italic1', font=ITALICS_FONT_1),
        LayoutToken('test'),
    ]
    title_block = LayoutBlock.for_tokens(title_tokens)

    document = TeiDocument()
    document.set_title_layout_block(title_block)
    LOGGER.debug('xml: %r', etree.tostring(document.root))

    title_xpath = (
        '//tei:fileDesc/tei:titleStmt/tei:title[@level="a"][@type="main"]')
    nodes = document.root.xpath(title_xpath, namespaces=TEI_NS_MAP)
    assert len(nodes) == 1

    title_node = nodes[0]
    italic_texts = get_tei_xpath_text_content_list(
        title_node, './tei:hi[@rend="italic"]')
    assert italic_texts == ['italic1']
    assert document.get_title() == 'rend italic1 test'
def test_should_remove_blank_token(self):
    """A token consisting only of a space leaves an empty line of tokens."""
    blank_block = LayoutBlock.for_tokens([LayoutToken(' ')])
    page = LayoutPage(blocks=[blank_block], graphics=[])
    layout_document = LayoutDocument(pages=[page])

    retokenized_layout_document = retokenize_layout_document(layout_document)

    line = retokenized_layout_document.pages[0].blocks[0].lines[0]
    assert line.tokens == []
def parse_token(
    self,
    token_node: etree.ElementBase,
    page_number: int,
    layout_line_descriptor: LayoutLineDescriptor
) -> LayoutToken:
    """Build a ``LayoutToken`` from a token element.

    Args:
        token_node: Element whose ``CONTENT`` and ``STYLEREFS`` attributes
            supply the text and the font id.
        page_number: Forwarded to ``parse_page_coordinates``.
        layout_line_descriptor: Stored on the token as its line descriptor.

    Returns:
        The token; the text falls back to ``''`` when ``CONTENT`` is missing
        or empty, and the font falls back to ``EMPTY_FONT`` when the
        ``STYLEREFS`` value is not in ``font_by_id_map``.
    """
    token_text = token_node.attrib.get('CONTENT') or ''
    font_id = token_node.attrib.get('STYLEREFS')
    token_font = self.font_by_id_map.get(font_id, EMPTY_FONT)
    token_coordinates = self.parse_page_coordinates(
        token_node, page_number=page_number)
    return LayoutToken(
        text=token_text,
        font=token_font,
        coordinates=token_coordinates,
        line_descriptor=layout_line_descriptor)
def test_should_not_retokenize_document_with_valid_tokens(self):
    """A single token without whitespace keeps its text after retokenizing."""
    token_block = LayoutBlock.for_tokens([LayoutToken('token1')])
    page = LayoutPage(blocks=[token_block], graphics=[])
    layout_document = LayoutDocument(pages=[page])

    retokenized_layout_document = retokenize_layout_document(layout_document)

    line = retokenized_layout_document.pages[0].blocks[0].lines[0]
    token_texts = [token.text for token in line.tokens]
    assert token_texts == ['token1']
def test_should_return_false_if_no_font_size_available(self):
    """With every font size ``None``, all three predicates return ``False``."""
    layout_tokens = [
        LayoutToken('', font=LayoutFont(font_name, font_size=None))
        for font_name in ('font1', 'font2', 'font3', 'font4')
    ]
    feature = RelativeFontSizeFeature(layout_tokens)
    all_false = [False, False, False, False]

    smallest_flags = [
        feature.is_smallest_font_size(token) for token in layout_tokens
    ]
    assert smallest_flags == all_false

    largest_flags = [
        feature.is_largest_font_size(token) for token in layout_tokens
    ]
    assert largest_flags == all_false

    larger_than_average_flags = [
        feature.is_larger_than_average_font_size(token)
        for token in layout_tokens
    ]
    assert larger_than_average_flags == all_false
def test_should_return_is_smallest_largest_and_larger_than_avg(self):
    """Font sizes 1..4: size 1 is smallest, 4 is largest, 3 and 4 exceed avg."""
    layout_tokens = [
        LayoutToken('', font=LayoutFont(font_name, font_size=size))
        for font_name, size in (
            ('font1', 1), ('font2', 2), ('font3', 3), ('font4', 4))
    ]
    feature = RelativeFontSizeFeature(layout_tokens)

    smallest_flags = [
        feature.is_smallest_font_size(token) for token in layout_tokens
    ]
    assert smallest_flags == [True, False, False, False]

    largest_flags = [
        feature.is_largest_font_size(token) for token in layout_tokens
    ]
    assert largest_flags == [False, False, False, True]

    larger_than_average_flags = [
        feature.is_larger_than_average_font_size(token)
        for token in layout_tokens
    ]
    assert larger_than_average_flags == [False, False, True, True]
def test_should_select_tokens_based_on_index(self):
    """Character ranges map to the token they fall in; gaps map to nothing.

    The rendered text is ``'token1 token2'``: ranges inside a token's text
    return that token, while the separating space and the position after the
    last token return no tokens.
    """
    token_1 = LayoutToken(text='token1', whitespace=' ')
    token_2 = LayoutToken(text='token2', whitespace=' ')
    layout_tokens_text = LayoutTokensText(
        LayoutBlock.for_tokens([token_1, token_2]))
    assert str(layout_tokens_text) == 'token1 token2'

    between = layout_tokens_text.get_layout_tokens_between
    token_1_end = len(token_1.text)
    token_2_start = token_1_end + 1  # skip the single separating space
    token_2_end = token_2_start + len(token_2.text)

    # First and last character of token 1.
    assert between(0, 1) == [token_1]
    assert between(token_1_end - 1, token_1_end) == [token_1]
    # The whitespace between the two tokens.
    assert not between(token_1_end, token_1_end + 1)
    # First and last character of token 2.
    assert between(token_2_start, token_2_start + 1) == [token_2]
    assert between(token_2_end - 1, token_2_end) == [token_2]
    # Past the end of token 2.
    assert not between(token_2_end, token_2_end + 1)
def test_should_preserve_empty_pages_if_requested(self):
    """With ``preserve_empty_pages=True`` the token-less page is kept."""
    page_with_token = LayoutPage(
        blocks=[
            LayoutBlock(lines=[LayoutLine(tokens=[LayoutToken('token1')])])
        ],
        graphics=[])
    page_without_tokens = LayoutPage(
        blocks=[LayoutBlock(lines=[LayoutLine(tokens=[])])],
        graphics=[])
    layout_document = LayoutDocument(
        pages=[page_with_token, page_without_tokens])

    cleaned_layout_document = remove_empty_blocks(
        layout_document, preserve_empty_pages=True)

    assert len(cleaned_layout_document.pages) == 2
def iter_semantic_markers_for_layout_block(
    layout_block: LayoutBlock
) -> Iterable[Union[SemanticMarker, SemanticContentWrapper]]:
    """Yield one semantic item per piece of the block's text.

    The text is split on every non-digit character, with the separators kept
    as pieces of their own. Empty pieces are skipped. A comma or whitespace
    piece becomes a ``SemanticNote`` of type ``marker_delimiter``; every
    other piece becomes a ``SemanticMarker``. Each item wraps a new
    single-token block without trailing whitespace.
    """
    pieces = re.split(r'(\D)', layout_block.text)
    for piece in pieces:
        if piece:
            piece_block = LayoutBlock.for_tokens(
                [LayoutToken(piece, whitespace='')])
            is_delimiter = piece == ',' or piece.isspace()
            if is_delimiter:
                yield SemanticNote(
                    layout_block=piece_block, note_type='marker_delimiter')
            else:
                yield SemanticMarker(layout_block=piece_block)
def test_should_preserve_meta(self):
    """The page meta of the retokenized page equals the original page meta."""
    page_meta = LayoutPageMeta(COORDINATES_1)
    token_block = LayoutBlock.for_tokens([LayoutToken('token1 token2')])
    source_page = LayoutPage(
        blocks=[token_block], graphics=[], meta=page_meta)
    layout_document = LayoutDocument(pages=[source_page])

    retokenized_layout_document = retokenize_layout_document(layout_document)

    page = retokenized_layout_document.pages[0]
    assert page.meta == page_meta
def test_should_remove_empty_line_block_and_page(self):
    """By default the page whose only line has no tokens is dropped."""
    page_with_token = LayoutPage(
        blocks=[
            LayoutBlock(lines=[LayoutLine(tokens=[LayoutToken('token1')])])
        ],
        graphics=[])
    page_without_tokens = LayoutPage(
        blocks=[LayoutBlock(lines=[LayoutLine(tokens=[])])],
        graphics=[])
    layout_document = LayoutDocument(
        pages=[page_with_token, page_without_tokens])

    cleaned_layout_document = remove_empty_blocks(layout_document)

    assert len(cleaned_layout_document.pages) == 1
    line = cleaned_layout_document.pages[0].blocks[0].lines[0]
    token_texts = [token.text for token in line.tokens]
    assert token_texts == ['token1']
def _get_semantic_content_for_page_coordinates(
    coordinates: LayoutPageCoordinates
) -> SemanticContentWrapper:
    """Return a ``SemanticFigure`` wrapping one ``'dummy'`` token.

    The token carries the given coordinates.
    """
    dummy_token = LayoutToken(text='dummy', coordinates=coordinates)
    dummy_block = LayoutBlock.for_tokens([dummy_token])
    return SemanticFigure(layout_block=dummy_block)
def test_should_return_higherfont_without_previous_token(self):
    """With no previous token the feature value is ``HIGHERFONT``."""
    current = LayoutToken('', font=LayoutFont(font_id='dummy'))
    feature_value = get_token_font_size_feature(
        previous_token=None, current_token=current)
    assert feature_value == 'HIGHERFONT'
ContextAwareLayoutTokenFeatures, DocumentFeaturesContext, ModelDataGenerator, LayoutModelData, feature_linear_scaling_int, _LINESCALE, get_str_bool_feature_value)

LOGGER = logging.getLogger(__name__)

# Unicode non-breaking space, used as the replacement character in
# ``format_feature_text``.
NBSP = '\u00A0'


def format_feature_text(text: str) -> str:
    """Strip *text* and replace each remaining space or tab with ``NBSP``."""
    return re.sub(" |\t", NBSP, text.strip())


# NOTE(review): presumably the number of bins for position features; the
# usage is not visible in this chunk - confirm.
NBBINS_POSITION = 12
# Placeholder token with empty text and placeholder line without tokens.
EMPTY_LAYOUT_TOKEN = LayoutToken('')
EMPTY_LAYOUT_LINE = LayoutLine([])


def get_block_status(line_index: int, line_count: int) -> str:
    """Return the block status label for the line at *line_index*.

    Returns ``'BLOCKSTART'`` for index 0, ``'BLOCKEND'`` for the last index
    (``line_count - 1``), and ``'BLOCKIN'`` otherwise. The start check comes
    first, so a single-line block yields ``'BLOCKSTART'``.
    """
    return ('BLOCKSTART' if line_index == 0 else ('BLOCKEND' if line_index == line_count - 1 else 'BLOCKIN'))


def get_page_status(block_index: int, block_count: int, is_first_block_token: bool, is_last_block_token: bool) -> str:
    """Return the page status label for a token.

    Returns ``'PAGESTART'`` when the token is the first token of the first
    block, ``'PAGEEND'`` when it is the last token of the last block
    (``block_count - 1``), and ``'PAGEIN'`` otherwise. The start check comes
    first, so a token satisfying both conditions yields ``'PAGESTART'``.
    """
    return ('PAGESTART' if block_index == 0 and is_first_block_token else ('PAGEEND' if block_index == block_count - 1 and is_last_block_token else 'PAGEIN'))