def convert_two_letter_uppercase_given_name_to_given_middle_name(
    name: T_SemanticName
):
    """Split a two-letter uppercase given name into given and middle names.

    The name is modified in place. Nothing is changed when the name already
    contains a middle name, when it does not contain exactly one given name,
    or when the given name text is not two characters long and uppercase
    (as reported by `str.isupper`).

    Otherwise the given name's blocks are retokenized with
    `tokenize_individual_characters`; the first resulting token becomes the
    given name and every following token becomes a middle name.
    """
    given_names = list(name.iter_by_type(SemanticGivenName))
    middle_names = list(name.iter_by_type(SemanticMiddleName))
    if middle_names:
        LOGGER.debug('already has a middle name: %r', middle_names)
        return
    if len(given_names) != 1:
        LOGGER.debug('no or too many given names: %r', given_names)
        return
    only_given_name = given_names[0]
    given_name_text = only_given_name.get_text()
    if not (len(given_name_text) == 2 and given_name_text.isupper()):
        LOGGER.debug('not two uppercase characters: %r', given_name_text)
        return
    given_name_blocks = list(only_given_name.iter_blocks())
    retokenized_layout_document = LayoutDocument.for_blocks(
        given_name_blocks
    ).retokenize(tokenize_fn=tokenize_individual_characters)
    LOGGER.debug('retokenized_layout_document: %r', retokenized_layout_document)

    split_name_parts = []
    for index, token in enumerate(retokenized_layout_document.iter_all_tokens()):
        token_block = LayoutBlock.for_tokens([token])
        if index == 0:
            split_name_parts.append(SemanticGivenName(layout_block=token_block))
        else:
            split_name_parts.append(SemanticMiddleName(layout_block=token_block))
    LOGGER.debug('split_name_parts: %r', split_name_parts)

    def _replace_with_split_parts(_):
        # Every given name is replaced by the same list of split parts.
        return split_name_parts

    name.flat_map_inplace_by_type(SemanticGivenName, _replace_with_split_parts)
def test_should_retokenize_document_with_placeholders(self):
    """Check that a token containing a space is split into two tokens.

    The whitespace of the original token ends up on the last token and the
    coordinates are divided proportionally to the character positions.
    """
    text = 'token1 token2'
    source_coordinates = LayoutPageCoordinates(
        x=10, y=10, width=100, height=50
    )
    source_token = LayoutToken(
        text, whitespace='\n', coordinates=source_coordinates
    )
    source_page = LayoutPage(
        blocks=[LayoutBlock.for_tokens([source_token])],
        graphics=[]
    )
    layout_document = LayoutDocument(pages=[source_page])

    retokenized_layout_document = retokenize_layout_document(layout_document)

    line = retokenized_layout_document.pages[0].blocks[0].lines[0]
    assert [t.text for t in line.tokens] == ['token1', 'token2']
    assert [t.whitespace for t in line.tokens] == [' ', '\n']

    # Expected values use the exact same arithmetic as the implementation
    # is expected to produce (proportional to the character offsets).
    expected_first_width = 100 * len('token1') / len(text)
    expected_second_x = 10.0 + 100 * len('token1 ') / len(text)
    expected_second_width = 100 * len('token2') / len(text)
    assert line.tokens[0].coordinates.x == 10.0
    assert line.tokens[0].coordinates.width == expected_first_width
    assert line.tokens[1].coordinates.x == expected_second_x
    assert line.tokens[1].coordinates.width == expected_second_width
def iter_entity_layout_blocks_for_labeled_layout_tokens(
    labeled_layout_tokens: Iterable[LabeledLayoutToken]
) -> Iterable[Tuple[str, LayoutBlock]]:
    """Yield `(tag, layout_block)` pairs for each entity in the labeled tokens.

    The labels are passed to `get_entities_including_other`, which returns
    `(tag, start, end)` triples; `end` is treated as inclusive when slicing
    the corresponding layout tokens into a `LayoutBlock`.

    Args:
        labeled_layout_tokens: Any iterable of labeled layout tokens,
            including one-shot iterators such as generators.

    Yields:
        Tuples of the entity tag and a `LayoutBlock` of the entity's tokens.
    """
    # Materialize once: the argument may be a one-shot iterator, in which
    # case iterating it a second time for the labels would yield nothing.
    labeled_layout_token_list = list(labeled_layout_tokens)
    layout_tokens = [
        labeled_layout_token.layout_token
        for labeled_layout_token in labeled_layout_token_list
    ]
    labels = [
        labeled_layout_token.label
        for labeled_layout_token in labeled_layout_token_list
    ]
    LOGGER.debug('layout_tokens: %s', layout_tokens)
    LOGGER.debug('labels: %s', labels)
    for tag, start, end in get_entities_including_other(labels):
        yield tag, LayoutBlock.for_tokens(layout_tokens[start:end + 1])
def test_should_remove_blank_token(self):
    """Check that a token consisting only of a space is dropped."""
    blank_block = LayoutBlock.for_tokens([LayoutToken(' ')])
    source_page = LayoutPage(blocks=[blank_block], graphics=[])
    layout_document = LayoutDocument(pages=[source_page])

    retokenized_layout_document = retokenize_layout_document(layout_document)

    first_block = retokenized_layout_document.pages[0].blocks[0]
    first_line = first_block.lines[0]
    assert first_line.tokens == []
def test_should_not_retokenize_document_with_valid_tokens(self):
    """Check that a single token without whitespace is kept as is."""
    valid_block = LayoutBlock.for_tokens([LayoutToken('token1')])
    source_page = LayoutPage(blocks=[valid_block], graphics=[])
    layout_document = LayoutDocument(pages=[source_page])

    retokenized_layout_document = retokenize_layout_document(layout_document)

    first_block = retokenized_layout_document.pages[0].blocks[0]
    first_line = first_block.lines[0]
    token_texts = [t.text for t in first_line.tokens]
    assert token_texts == ['token1']
def test_should_add_superscript_text(self):
    """Check that a superscript token is wrapped in `hi[@rend="superscript"]`."""
    layout_tokens = [
        LayoutToken(TOKEN_1),
        LayoutToken(TOKEN_2, font=SUPERSCRIPT_FONT_1),
        LayoutToken(TOKEN_3),
    ]
    block = LayoutBlock.for_tokens(layout_tokens)
    tei_children = iter_layout_block_tei_children(block)
    node = TEI_E.node(*tei_children)

    superscript_texts = get_tei_xpath_text_content_list(
        node, './tei:hi[@rend="superscript"]'
    )
    assert superscript_texts == [TOKEN_2]

    # The overall text must still contain all tokens in order.
    expected_text = ' '.join([TOKEN_1, TOKEN_2, TOKEN_3])
    assert get_text_content(node) == expected_text
def get_section_label_and_title_from_layout_block(
    layout_block: LayoutBlock
) -> Tuple[Optional[LayoutBlock], LayoutBlock]:
    """Split a section heading block into its label and title blocks.

    The block text is matched (case-insensitively) against
    `HEADER_LABEL_REGEX`. Group 1 of the match marks the end of the label
    and group 2 the start of the title.

    Returns:
        A tuple `(label_block, title_block)`. When the block is falsy or the
        text does not match, the label is `None` and the original block is
        returned as the title.
    """
    if not layout_block:
        return None, layout_block

    layout_tokens_text = LayoutTokensText(layout_block)
    text = str(layout_tokens_text)
    match = re.match(HEADER_LABEL_REGEX, text, re.IGNORECASE)
    if not match:
        return None, layout_block

    label_end = match.end(1)
    title_start = match.start(2)
    LOGGER.debug(
        'label_end: %d, title_start: %d (text: %r)',
        label_end, title_start, text
    )

    label_tokens = list(
        layout_tokens_text.iter_layout_tokens_between(0, label_end)
    )
    section_label_layout_block = LayoutBlock.for_tokens(label_tokens)

    title_tokens = list(
        layout_tokens_text.iter_layout_tokens_between(title_start, len(text))
    )
    section_title_layout_block = LayoutBlock.for_tokens(title_tokens)

    return section_label_layout_block, section_title_layout_block
def test_should_preserve_meta(self):
    """Check that retokenizing a document keeps the page meta."""
    page_meta = LayoutPageMeta(COORDINATES_1)
    source_block = LayoutBlock.for_tokens([LayoutToken('token1 token2')])
    source_page = LayoutPage(
        blocks=[source_block],
        graphics=[],
        meta=page_meta
    )
    layout_document = LayoutDocument(pages=[source_page])

    retokenized_layout_document = retokenize_layout_document(layout_document)

    retokenized_page = retokenized_layout_document.pages[0]
    assert retokenized_page.meta == page_meta
def test_should_add_bold_and_italics_text(self):
    """Check that a bold-italic token appears in both bold and italic `hi`."""
    layout_tokens = [
        LayoutToken(TOKEN_1),
        LayoutToken(TOKEN_2, font=BOLD_ITALICS_FONT_1),
        LayoutToken(TOKEN_3),
    ]
    block = LayoutBlock.for_tokens(layout_tokens)
    tei_children = iter_layout_block_tei_children(block)
    node = TEI_E.node(*tei_children)
    LOGGER.debug('xml: %r', etree.tostring(node))

    bold_texts = get_tei_xpath_text_content_list(
        node, './/tei:hi[@rend="bold"]'
    )
    assert bold_texts == [TOKEN_2]

    italic_texts = get_tei_xpath_text_content_list(
        node, './/tei:hi[@rend="italic"]'
    )
    assert italic_texts == [TOKEN_2]

    # The overall text must still contain all tokens in order.
    expected_text = ' '.join([TOKEN_1, TOKEN_2, TOKEN_3])
    assert get_text_content(node) == expected_text
def iter_semantic_markers_for_layout_block(
    layout_block: LayoutBlock
) -> Iterable[Union[SemanticMarker, SemanticContentWrapper]]:
    """Yield semantic markers and delimiter notes for the block's text.

    The text is split at every non-digit character, keeping each such
    character as its own piece. Empty pieces are skipped. A comma or a
    whitespace piece is yielded as a `SemanticNote` with the note type
    `marker_delimiter`; every other piece is yielded as a `SemanticMarker`.
    Each yielded item wraps a new single-token block without whitespace.
    """
    text_pieces = re.split(r'(\D)', layout_block.text)
    for piece in text_pieces:
        if piece:
            piece_block = LayoutBlock.for_tokens(
                [LayoutToken(piece, whitespace='')]
            )
            is_delimiter = piece == ',' or piece.isspace()
            if is_delimiter:
                yield SemanticNote(
                    layout_block=piece_block,
                    note_type='marker_delimiter'
                )
            else:
                yield SemanticMarker(layout_block=piece_block)
def get_cleaned_abstract_layout_block(layout_block: LayoutBlock) -> LayoutBlock:
    """Return the block starting at group 1 of the `ABSTRACT_REGEX` match.

    The block text is matched case-insensitively against `ABSTRACT_REGEX`.
    The original block is returned unchanged when it is falsy, has no lines,
    or its text does not match.
    """
    has_content = layout_block and layout_block.lines
    if not has_content:
        return layout_block

    layout_tokens_text = LayoutTokensText(layout_block)
    text = str(layout_tokens_text)
    match = re.match(ABSTRACT_REGEX, text, re.IGNORECASE)
    if not match:
        LOGGER.debug('text does not match regex: %r', text)
        return layout_block

    start = match.start(1)
    LOGGER.debug('start: %d (text: %r)', start, text)
    remaining_tokens = list(
        layout_tokens_text.iter_layout_tokens_between(start, len(text))
    )
    return LayoutBlock.for_tokens(remaining_tokens)
def test_should_create_lines_based_on_line_descriptor(self):
    """Check that tokens are grouped into lines by their line descriptor."""
    line_descriptor_1 = LayoutLineDescriptor(line_id=1)
    line_descriptor_2 = LayoutLineDescriptor(line_id=2)
    line_tokens_1 = [
        LayoutToken(token_text, line_descriptor=line_descriptor_1)
        for token_text in ('token1.1', 'token1.2')
    ]
    line_tokens_2 = [
        LayoutToken(token_text, line_descriptor=line_descriptor_2)
        for token_text in ('token2.1', 'token2.2')
    ]
    all_tokens = line_tokens_1 + line_tokens_2

    layout_block = LayoutBlock.for_tokens(all_tokens)

    assert len(layout_block.lines) == 2
    first_line = layout_block.lines[0]
    assert first_line.tokens == line_tokens_1
    second_line = layout_block.lines[1]
    assert second_line.tokens == line_tokens_2
def get_regex_cleaned_layout_block_with_prefix_suffix(
    layout_block: LayoutBlock,
    regex_pattern: Optional[str]
) -> Tuple[LayoutBlock, LayoutBlock, LayoutBlock]:
    """Split a block into prefix, matched and suffix blocks using a regex.

    The block text is matched case-insensitively against `regex_pattern`;
    group 1 of the match defines the middle (cleaned) block.

    Returns:
        A tuple `(prefix_block, cleaned_block, suffix_block)`. When the block
        is falsy, has no lines, no pattern is given, or the text does not
        match, the result is `(EMPTY_BLOCK, layout_block, EMPTY_BLOCK)`.
    """
    if not (layout_block and layout_block.lines and regex_pattern):
        return EMPTY_BLOCK, layout_block, EMPTY_BLOCK

    layout_tokens_text = LayoutTokensText(layout_block)
    text = str(layout_tokens_text)
    match = re.match(regex_pattern, text, re.IGNORECASE)
    if not match:
        LOGGER.debug('text does not match regex: %r', text)
        return EMPTY_BLOCK, layout_block, EMPTY_BLOCK

    start = match.start(1)
    end = match.end(1)
    LOGGER.debug(
        'start: %d, end: %d, len: %d (text: %r)',
        start, end, len(text), text
    )

    prefix_block = LayoutBlock.for_tokens(
        list(layout_tokens_text.iter_layout_tokens_between(0, start))
    )
    cleaned_block = LayoutBlock.for_tokens(
        list(layout_tokens_text.iter_layout_tokens_between(start, end))
    )
    suffix_block = LayoutBlock.for_tokens(
        list(layout_tokens_text.iter_layout_tokens_between(end, len(text)))
    )
    return prefix_block, cleaned_block, suffix_block
def test_should_be_able_to_set_title_with_italic_layout_tokens(self):
    """Check that an italic token in the title is rendered as italic `hi`."""
    title_tokens = [
        LayoutToken('rend'),
        LayoutToken('italic1', font=ITALICS_FONT_1),
        LayoutToken('test'),
    ]
    title_block = LayoutBlock.for_tokens(title_tokens)
    document = TeiDocument()
    document.set_title_layout_block(title_block)
    LOGGER.debug('xml: %r', etree.tostring(document.root))

    title_nodes = document.root.xpath(
        '//tei:fileDesc/tei:titleStmt/tei:title[@level="a"][@type="main"]',
        namespaces=TEI_NS_MAP
    )
    assert len(title_nodes) == 1

    italic_texts = get_tei_xpath_text_content_list(
        title_nodes[0], './tei:hi[@rend="italic"]'
    )
    assert italic_texts == ['italic1']
    assert document.get_title() == 'rend italic1 test'
def test_should_select_tokens_based_on_index(self):
    """Check that character ranges map to the token covering them.

    Ranges inside a token's text select that token, whereas ranges covering
    only the whitespace after a token select nothing.
    """
    token_1 = LayoutToken(text='token1', whitespace=' ')
    token_2 = LayoutToken(text='token2', whitespace=' ')
    layout_tokens_text = LayoutTokensText(
        LayoutBlock.for_tokens([token_1, token_2])
    )
    assert str(layout_tokens_text) == 'token1 token2'

    # Character offsets within the combined text.
    token_1_end = len(token_1.text)
    token_2_start = token_1_end + 1
    token_2_end = token_2_start + len(token_2.text)
    select = layout_tokens_text.get_layout_tokens_between

    # First and last character of the first token.
    assert select(0, 1) == [token_1]
    assert select(token_1_end - 1, token_1_end) == [token_1]
    # Whitespace between the two tokens.
    assert not select(token_1_end, token_1_end + 1)
    # First and last character of the second token.
    assert select(token_2_start, token_2_start + 1) == [token_2]
    assert select(token_2_end - 1, token_2_end) == [token_2]
    # Position after the second token's text.
    assert not select(token_2_end, token_2_end + 1)
def get_layout_page_with_text_or_graphic_replaced_by_graphic(
    layout_page: LayoutPage,
    semantic_graphic: SemanticGraphic,
    is_only_semantic_graphic_on_page: bool,
    is_replace_overlapping_text: bool
) -> LayoutPage:
    """Return a page where the semantic graphic replaces overlapping content.

    Existing page graphics that `is_layout_graphic_within_bounding_box`
    reports as within the semantic graphic's bounding box are dropped and
    the semantic graphic's layout graphic is appended instead.

    Args:
        layout_page: The page to derive the modified page from.
        semantic_graphic: The graphic to add; it must have a layout graphic
            with coordinates.
        is_only_semantic_graphic_on_page: When true, all tokens of the page
            are attached to the layout graphic as its related block.
        is_replace_overlapping_text: When true, tokens are additionally
            passed through `_remove_tokens_within_bounding_box_flatmap_fn`
            and empty blocks are removed.

    Returns:
        The modified layout page.
    """
    layout_graphic = semantic_graphic.layout_graphic
    assert layout_graphic
    assert layout_graphic.coordinates
    graphic_bounding_box = layout_graphic.coordinates.bounding_box

    if is_only_semantic_graphic_on_page:
        all_page_tokens = list(layout_page.iter_all_tokens())
        layout_graphic = layout_graphic._replace(
            related_block=LayoutBlock.for_tokens(all_page_tokens)
        )

    # Keep only graphics that are not within the new graphic's bounding box.
    remaining_graphics = [
        existing_graphic
        for existing_graphic in layout_page.graphics
        if not is_layout_graphic_within_bounding_box(
            existing_graphic, bounding_box=graphic_bounding_box
        )
    ]
    modified_layout_page = layout_page.replace(
        graphics=remaining_graphics + [layout_graphic]
    )
    if not is_replace_overlapping_text:
        return modified_layout_page

    remove_tokens_fn = functools.partial(
        _remove_tokens_within_bounding_box_flatmap_fn,
        bounding_box=graphic_bounding_box
    )
    return (
        modified_layout_page
        .flat_map_layout_tokens(remove_tokens_fn)
        .remove_empty_blocks()
    )
def _get_semantic_content_for_page_coordinates(
    coordinates: LayoutPageCoordinates
) -> SemanticContentWrapper:
    """Create a `SemanticFigure` with one `dummy` token at the coordinates."""
    dummy_token = LayoutToken(text='dummy', coordinates=coordinates)
    dummy_block = LayoutBlock.for_tokens([dummy_token])
    return SemanticFigure(layout_block=dummy_block)