def test_should_provide_page_and_block_status_for_multi_line_blocks(
    self,
    features_provider: SegmentationLineFeaturesProvider
):
    """A single page with one three-line block yields start / in / end statuses."""
    block = LayoutBlock(lines=[
        LayoutLine.for_text('line1'),
        LayoutLine.for_text('line2'),
        LayoutLine.for_text('line3')
    ])
    layout_document = LayoutDocument(pages=[LayoutPage(blocks=[block])])
    feature_values = [
        {
            'page_status': features.get_page_status(),
            'block_status': features.get_block_status()
        }
        for features in _iter_line_features(features_provider, layout_document)
    ]
    LOGGER.debug('feature_values: %r', feature_values)
    expected_feature_values = [
        {'page_status': 'PAGESTART', 'block_status': 'BLOCKSTART'},
        {'page_status': 'PAGEIN', 'block_status': 'BLOCKIN'},
        {'page_status': 'PAGEEND', 'block_status': 'BLOCKEND'}
    ]
    assert feature_values == expected_feature_values
def test_should_provide_block_relative_line_length(
    self,
    features_provider: SegmentationLineFeaturesProvider
):
    """Lines of length 1, 2 and 10 in one block map to features '1', '2' and '10'."""
    block = LayoutBlock(lines=[
        LayoutLine.for_text('1'),
        LayoutLine.for_text('12'),
        LayoutLine.for_text('1234567890'),
    ])
    layout_document = LayoutDocument(pages=[LayoutPage(blocks=[block])])
    feature_values = [
        {
            'str_block_relative_line_length_feature': (
                features.get_str_block_relative_line_length_feature()
            )
        }
        for features in _iter_line_features(features_provider, layout_document)
    ]
    LOGGER.debug('feature_values: %r', feature_values)
    expected_feature_values = [
        {'str_block_relative_line_length_feature': '1'},  # 1 * 10 / 10
        {'str_block_relative_line_length_feature': '2'},  # 2 * 10 / 10
        {'str_block_relative_line_length_feature': '10'},  # 10 * 10 / 10
    ]
    assert feature_values == expected_feature_values
def test_should_provide_line_text(
    self,
    features_provider: SegmentationLineFeaturesProvider
):
    """Each line exposes its full text plus its first and second token text."""
    block = LayoutBlock(lines=[
        LayoutLine.for_text('first1 second1 this is a line'),
        LayoutLine.for_text('first2 second2 this is a line')
    ])
    layout_document = LayoutDocument(pages=[LayoutPage(blocks=[block])])
    feature_values = [
        {
            'line_text': features.line_text,
            'token_text': features.token_text,
            'second_token_text': features.second_token_text
        }
        for features in _iter_line_features(features_provider, layout_document)
    ]
    LOGGER.debug('feature_values: %r', feature_values)
    expected_feature_values = [
        {
            'line_text': 'first1 second1 this is a line',
            'token_text': 'first1',
            'second_token_text': 'second1'
        },
        {
            'line_text': 'first2 second2 this is a line',
            'token_text': 'first2',
            'second_token_text': 'second2'
        },
    ]
    assert feature_values == expected_feature_values
def test_should_add_line_feeds(self):
    """Two newline-terminated lines end up joined by a line feed in the author node."""
    lines = [
        LayoutLine.for_text(TEXT_1, tail_whitespace='\n'),
        LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
    ]
    layout_document = LayoutDocument.for_blocks([LayoutBlock(lines=lines)])
    xml_root = get_training_tei_xml_for_layout_document(layout_document)
    nodes = tei_xpath(xml_root, AUTHOR_XPATH)
    assert len(nodes) == 1
    expected_text = '\n'.join([TEXT_1, TEXT_2])
    assert get_text_content(nodes[0]).rstrip() == expected_text
def test_should_preserve_empty_pages_if_requested(self):
    """With preserve_empty_pages=True both pages remain, including the empty one."""
    page_with_token = LayoutPage(
        blocks=[
            LayoutBlock(lines=[LayoutLine(tokens=[LayoutToken('token1')])])
        ],
        graphics=[]
    )
    page_without_tokens = LayoutPage(
        blocks=[LayoutBlock(lines=[LayoutLine(tokens=[])])],
        graphics=[]
    )
    layout_document = LayoutDocument(pages=[
        page_with_token,
        page_without_tokens,
    ])
    cleaned_layout_document = remove_empty_blocks(
        layout_document,
        preserve_empty_pages=True
    )
    assert len(cleaned_layout_document.pages) == 2
def test_should_remove_empty_line_block_and_page(self):
    """By default the page holding only an empty line is dropped from the result."""
    page_with_token = LayoutPage(
        blocks=[
            LayoutBlock(lines=[LayoutLine(tokens=[LayoutToken('token1')])])
        ],
        graphics=[]
    )
    page_without_tokens = LayoutPage(
        blocks=[LayoutBlock(lines=[LayoutLine(tokens=[])])],
        graphics=[]
    )
    layout_document = LayoutDocument(pages=[
        page_with_token,
        page_without_tokens,
    ])
    cleaned_layout_document = remove_empty_blocks(layout_document)
    assert len(cleaned_layout_document.pages) == 1
    remaining_line = cleaned_layout_document.pages[0].blocks[0].lines[0]
    remaining_token_texts = [t.text for t in remaining_line.tokens]
    assert remaining_token_texts == ['token1']
def test_should_lb_elements_before_line_feeds(self):
    """Each of the two lines gets a `tei:lb`, placed before the line feed."""
    lines = [
        LayoutLine.for_text(TEXT_1, tail_whitespace='\n'),
        LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
    ]
    layout_document = LayoutDocument.for_blocks([LayoutBlock(lines=lines)])
    xml_root = get_training_tei_xml_for_layout_document(layout_document)
    nodes = tei_xpath(xml_root, AUTHOR_XPATH)
    assert len(nodes) == 1
    lb_nodes = tei_xpath(nodes[0], 'tei:lb')
    assert len(lb_nodes) == 2
    first_lb_node = lb_nodes[0]
    # The first line's text precedes the first <lb/>; the line feed follows it.
    assert first_lb_node.getparent().text == TEXT_1
    assert first_lb_node.tail == '\n' + TEXT_2
def test_should_add_line_feeds(self):
    """Two newline-terminated lines end up joined by a line feed in `listBibl`."""
    training_data_generator = get_tei_training_data_generator()
    lines = [
        LayoutLine.for_text(TEXT_1, tail_whitespace='\n'),
        LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
    ]
    layout_document = LayoutDocument.for_blocks([LayoutBlock(lines=lines)])
    model_data_list = get_model_data_list_for_layout_document(
        layout_document,
        data_generator=get_data_generator()
    )
    xml_root = (
        training_data_generator
        .get_training_tei_xml_for_model_data_iterable(model_data_list)
    )
    text_nodes = xml_root.xpath('./text/listBibl')
    assert len(text_nodes) == 1
    expected_text = '\n'.join([TEXT_1, TEXT_2])
    assert get_text_content(text_nodes[0]).rstrip() == expected_text
def test_should_add_line_feeds(self):
    """Two newline-terminated lines end up joined by a line feed in the affiliation."""
    training_data_generator = AffiliationAddressTeiTrainingDataGenerator()
    lines = [
        LayoutLine.for_text(TEXT_1, tail_whitespace='\n'),
        LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
    ]
    layout_document = LayoutDocument.for_blocks([LayoutBlock(lines=lines)])
    model_data_list = get_model_data_list_for_layout_document(
        layout_document,
        data_generator=get_data_generator()
    )
    xml_root = (
        training_data_generator
        .get_training_tei_xml_for_model_data_iterable(model_data_list)
    )
    aff_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
    assert len(aff_nodes) == 1
    expected_text = '\n'.join([TEXT_1, TEXT_2])
    assert get_text_content(aff_nodes[0]).rstrip() == expected_text
def test_should_lb_elements_before_line_feeds(self):
    """Each affiliation line gets a `tei:lb`, placed before the line feed."""
    training_data_generator = AffiliationAddressTeiTrainingDataGenerator()
    lines = [
        LayoutLine.for_text(TEXT_1, tail_whitespace='\n'),
        LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
    ]
    layout_document = LayoutDocument.for_blocks([LayoutBlock(lines=lines)])
    model_data_list = get_model_data_list_for_layout_document(
        layout_document,
        data_generator=get_data_generator()
    )
    xml_root = (
        training_data_generator
        .get_training_tei_xml_for_model_data_iterable(model_data_list)
    )
    aff_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
    assert len(aff_nodes) == 1
    lb_nodes = tei_xpath(aff_nodes[0], 'tei:lb')
    assert len(lb_nodes) == 2
    first_lb_node = lb_nodes[0]
    # The first line's text precedes the first <lb/>; the line feed follows it.
    assert first_lb_node.getparent().text == TEXT_1
    assert first_lb_node.tail == '\n' + TEXT_2
def test_should_lb_elements_before_line_feeds(self):
    """Each `listBibl` line gets an `lb` element, placed before the line feed."""
    training_data_generator = get_tei_training_data_generator()
    lines = [
        LayoutLine.for_text(TEXT_1, tail_whitespace='\n'),
        LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
    ]
    layout_document = LayoutDocument.for_blocks([LayoutBlock(lines=lines)])
    model_data_list = get_model_data_list_for_layout_document(
        layout_document,
        data_generator=get_data_generator()
    )
    xml_root = (
        training_data_generator
        .get_training_tei_xml_for_model_data_iterable(model_data_list)
    )
    text_nodes = xml_root.xpath('./text/listBibl')
    assert len(text_nodes) == 1
    lb_nodes = text_nodes[0].xpath('lb')
    assert len(lb_nodes) == 2
    first_lb_node = lb_nodes[0]
    # The first line's text precedes the first <lb/>; the line feed follows it.
    assert first_lb_node.getparent().text == TEXT_1
    assert first_lb_node.tail == '\n' + TEXT_2
def iter_layout_lines_from_layout_tokens(
    layout_tokens: Iterable[LayoutToken]
) -> Iterable[LayoutLine]:
    """Regroup a flat token stream into lines.

    Consecutive tokens whose `line_descriptor.line_id` equals that of the
    first token of the current run are collected into one `LayoutLine`.
    A token with a different line id closes the current run and starts a
    new one. Nothing is yielded for an empty input.
    """
    pending_tokens: List[LayoutToken] = []
    for layout_token in layout_tokens:
        if pending_tokens:
            first_token = pending_tokens[0]
            is_same_line = (
                layout_token.line_descriptor.line_id
                == first_token.line_descriptor.line_id
            )
            if not is_same_line:
                # Line id changed: emit the finished line, start the next one.
                yield LayoutLine(tokens=pending_tokens)
                pending_tokens = [layout_token]
                continue
            LOGGER.debug(
                'line id matching: %r - %r', layout_token, first_token
            )
        pending_tokens.append(layout_token)
    if pending_tokens:
        # Flush the trailing run.
        yield LayoutLine(tokens=pending_tokens)
def parse_line(
    self,
    line_node: etree.ElementBase,
    page_number: int
) -> LayoutLine:
    """Build a `LayoutLine` from the `alto:String` descendants of `line_node`.

    Every token is parsed via `self.parse_token` and receives its own
    `LayoutLineDescriptor` whose `line_id` is `id(line_node)`.
    """
    # NOTE(review): id() is only guaranteed unique while `line_node` is
    # alive - confirm the node outlives every consumer of these line ids.
    tokens = []
    for token_node in alto_xpath(line_node, './/alto:String'):
        line_descriptor = LayoutLineDescriptor(line_id=id(line_node))
        tokens.append(self.parse_token(
            token_node,
            page_number=page_number,
            layout_line_descriptor=line_descriptor
        ))
    return LayoutLine(tokens=tokens)
def get_filtered_document_by_labels(  # pylint: disable=too-many-branches
    self,
    labels: List[str]
) -> LayoutDocument:
    """Return a new document containing only content tagged with `labels`.

    The labels are resolved via `get_layout_document_labels_by_labels`.
    If any resolved label references a layout token, filtering happens at
    token level: each line keeps only the referenced tokens. Otherwise
    filtering happens at line level using the referenced layout lines.
    Tokens and lines are matched by object identity (`id`).

    The page / block nesting of `self.layout_document` is mirrored, but
    pages and blocks are only created once they receive an accepted line.

    Args:
        labels: the labels to include.

    Returns:
        The filtered `LayoutDocument`; it has no pages when no labels match.
    """
    layout_document = LayoutDocument(pages=[])
    layout_document_labels = self.get_layout_document_labels_by_labels(
        labels
    )
    if not layout_document_labels:
        LOGGER.warning(
            'no layout_lines_to_include found for: %r, available keys=%r',
            labels, self.layout_document_labels_by_label.keys()
        )
        return layout_document
    layout_token_ids_to_include = {
        id(layout_document_label.layout_token)
        for layout_document_label in layout_document_labels
        if layout_document_label.layout_token
    }
    LOGGER.debug(
        'layout_tokens_to_include: %s', layout_token_ids_to_include
    )
    layout_line_ids_to_include: Set[int] = set()
    if not layout_token_ids_to_include:
        # No token-level labels available: fall back to line-level filtering.
        layout_line_ids_to_include = {
            id(layout_document_label.layout_line)
            for layout_document_label in layout_document_labels
            if layout_document_label.layout_line
        }
    LOGGER.debug(
        'layout_line_ids_to_include: %s', layout_line_ids_to_include
    )
    result_page: Optional[LayoutPage] = None
    for page in self.layout_document.pages:  # pylint: disable=too-many-nested-blocks
        result_page = None
        result_block: Optional[LayoutBlock] = None
        for block in page.blocks:
            result_block = None
            for line in block.lines:
                accepted_line: Optional[LayoutLine] = None
                if layout_token_ids_to_include:
                    accepted_tokens: List[LayoutToken] = [
                        token
                        for token in line.tokens
                        if id(token) in layout_token_ids_to_include
                    ]
                    if not accepted_tokens:
                        continue
                    # Compare the counts. The previous int == list comparison
                    # was always False, so the original line was never reused.
                    if len(line.tokens) == len(accepted_tokens):
                        accepted_line = line
                    else:
                        accepted_line = LayoutLine(tokens=accepted_tokens)
                else:
                    if id(line) not in layout_line_ids_to_include:
                        continue
                    accepted_line = line
                # Create the enclosing page / block lazily so that the result
                # contains no empty pages or blocks.
                if result_page is None:
                    result_page = LayoutPage(blocks=[])
                    layout_document.pages.append(result_page)
                if result_block is None:
                    result_block = LayoutBlock(lines=[])
                    result_page.blocks.append(result_block)
                result_block.lines.append(accepted_line)
    return layout_document
def test_should_keep_original_whitespace(self):
    """Irregular spacing around commas survives the round trip to TEI."""
    text = 'Token1, Token2  ,Token3'
    line = LayoutLine.for_text(text, tail_whitespace='\n')
    layout_document = LayoutDocument.for_blocks([LayoutBlock(lines=[line])])
    xml_root = get_training_tei_xml_for_layout_document(layout_document)
    nodes = tei_xpath(xml_root, AUTHOR_XPATH)
    assert len(nodes) == 1
    actual_text = get_text_content(nodes[0]).rstrip()
    assert actual_text == text
def test_should_filter_by_line_without_token(self):
    """Labels carrying only a layout line (no token) filter at line level."""
    tagged_lines = [
        (TAG_1, LayoutLine.for_text('this is line 1')),
        (TAG_2, LayoutLine.for_text('this is line 2'))
    ]
    # One label per token of each line, but none of them references a token.
    layout_model_labels = []
    for tag, line in tagged_lines:
        for _ in line.tokens:
            layout_model_labels.append(LayoutModelLabel(
                label=tag,
                label_token_text=line.text,
                layout_line=line,
                layout_token=None
            ))
    all_lines = [line for _, line in tagged_lines]
    layout_document = LayoutDocument(pages=[
        LayoutPage(blocks=[LayoutBlock(lines=all_lines)])
    ])
    layout_document_label_result = LayoutDocumentLabelResult(
        layout_document,
        layout_model_labels
    )
    for tag, line in tagged_lines:
        filtered_document = (
            layout_document_label_result.get_filtered_document_by_label(tag)
        )
        actual_text = join_layout_tokens(filtered_document.iter_all_tokens())
        assert actual_text == join_layout_tokens(line.tokens)
def test_should_replace_text_and_graphics_within_bounding_box_of_semantic_graphics(
    self
):
    """Page content is checked after applying two semantic graphics.

    Asserted outcome: the page graphics become `[keep_graphic]` followed by
    one graphic with the type and coordinates of the semantic graphic, the
    page block keeps only `keep_token`, and the trailing graphic's related
    block iterates `keep_token` and `remove_token`.
    """
    def coordinates_for(x, y, width, height):
        return LayoutPageCoordinates.from_bounding_box(
            BoundingBox(x, y, width, height), page_number=1
        )

    page_coordinates = coordinates_for(0, 0, 200, 200)
    semantic_graphic_coordinates = coordinates_for(10, 90, 100, 50)
    keep_coordinates = coordinates_for(10, 10, 100, 20)
    remove_coordinates = coordinates_for(10, 100, 100, 20)
    empty_coordinates = coordinates_for(10, 100, 0, 0)

    keep_token = LayoutToken('keep', coordinates=keep_coordinates)
    remove_token = LayoutToken('remove', coordinates=remove_coordinates)
    keep_graphic = LayoutGraphic(
        coordinates=keep_coordinates,
        graphic_type='keep-graphic'
    )
    remove_graphic = LayoutGraphic(
        coordinates=remove_coordinates,
        graphic_type='remove-graphic'
    )
    page = LayoutPage(
        blocks=[
            LayoutBlock(lines=[LayoutLine(tokens=[keep_token, remove_token])])
        ],
        graphics=[keep_graphic, remove_graphic],
        meta=LayoutPageMeta(
            page_number=page_coordinates.page_number,
            coordinates=page_coordinates
        )
    )
    layout_document = LayoutDocument(pages=[page])
    layout_graphic = LayoutGraphic(
        coordinates=semantic_graphic_coordinates,
        graphic_type='new-graphic'
    )
    no_coords_layout_graphic = LayoutGraphic(
        coordinates=empty_coordinates,
        graphic_type='empty-coords-graphic'
    )
    semantic_graphics = [
        SemanticGraphic(layout_graphic=layout_graphic),
        SemanticGraphic(layout_graphic=no_coords_layout_graphic)
    ]
    result = get_layout_document_with_text_and_graphics_replaced_by_graphics(
        layout_document,
        semantic_graphics=semantic_graphics
    )
    result_page = result.pages[0]
    LOGGER.debug('result.pages[0].graphics: %r', result_page.graphics)
    assert result_page.graphics[:-1] == [keep_graphic]
    last_graphic = result_page.graphics[-1]
    LOGGER.debug('result.pages[0].graphics[-1]: %r', last_graphic)
    assert last_graphic.graphic_type == layout_graphic.graphic_type
    assert last_graphic.coordinates == layout_graphic.coordinates
    remaining_tokens = list(result_page.blocks[0].iter_all_tokens())
    assert remaining_tokens == [keep_token]
    related_tokens = list(
        result_page.graphics[-1].related_block.iter_all_tokens()
    )
    assert related_tokens == [keep_token, remove_token]
def test_should_keep_original_whitespace(self):
    """Irregular spacing around commas survives the round trip to `listBibl`."""
    training_data_generator = get_tei_training_data_generator()
    text = 'Token1, Token2  ,Token3'
    line = LayoutLine.for_text(text, tail_whitespace='\n')
    layout_document = LayoutDocument.for_blocks([LayoutBlock(lines=[line])])
    model_data_list = get_model_data_list_for_layout_document(
        layout_document,
        data_generator=get_data_generator()
    )
    xml_root = (
        training_data_generator
        .get_training_tei_xml_for_model_data_iterable(model_data_list)
    )
    text_nodes = xml_root.xpath('./text/listBibl')
    assert len(text_nodes) == 1
    actual_text = get_text_content(text_nodes[0]).rstrip()
    assert actual_text == text
def test_should_keep_original_whitespace(self):
    """Irregular spacing around commas survives the round trip to the affiliation."""
    training_data_generator = AffiliationAddressTeiTrainingDataGenerator()
    text = 'Token1, Token2  ,Token3'
    line = LayoutLine.for_text(text, tail_whitespace='\n')
    layout_document = LayoutDocument.for_blocks([LayoutBlock(lines=[line])])
    model_data_list = get_model_data_list_for_layout_document(
        layout_document,
        data_generator=get_data_generator()
    )
    xml_root = (
        training_data_generator
        .get_training_tei_xml_for_model_data_iterable(model_data_list)
    )
    aff_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
    assert len(aff_nodes) == 1
    actual_text = get_text_content(aff_nodes[0]).rstrip()
    assert actual_text == text
def test_should_filter_by_token_label(self):
    """Filtering by one tag returns only that tag's tokens from a shared line."""
    tagged_tokens = [
        (TAG_1, get_layout_tokens_for_text('this is line 1')),
        (TAG_2, get_layout_tokens_for_text('this is line 2'))
    ]
    # All tokens of both tags live on one and the same line.
    all_tokens = [token for _, tokens in tagged_tokens for token in tokens]
    line = LayoutLine(all_tokens)
    layout_model_labels = []
    for tag, tokens in tagged_tokens:
        for token in tokens:
            layout_model_labels.append(LayoutModelLabel(
                label=tag,
                label_token_text=token.text,
                layout_line=line,
                layout_token=token
            ))
    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[LayoutBlock(lines=[line])])]
    )
    layout_document_label_result = LayoutDocumentLabelResult(
        layout_document,
        layout_model_labels
    )
    for tag, tokens in tagged_tokens:
        filtered_document = (
            layout_document_label_result.get_filtered_document_by_label(tag)
        )
        actual_text = join_layout_tokens(filtered_document.iter_all_tokens())
        assert actual_text == join_layout_tokens(tokens)
def test_should_provide_block_relative_document_token_position(
    self,
    features_provider: SegmentationLineFeaturesProvider
):
    """Ten lines yield positions equal to `feature_linear_scaling_int(i, 10, ...)`."""
    lines = [LayoutLine.for_text(f'line{i}') for i in range(10)]
    layout_document = LayoutDocument(pages=[
        LayoutPage(blocks=[LayoutBlock(lines=lines)])
    ])
    feature_values = [
        {
            'str_relative_document_position': (
                features.get_str_relative_document_position()
            )
        }
        for features in _iter_line_features(features_provider, layout_document)
    ]
    LOGGER.debug('feature_values: %r', feature_values)
    expected_feature_values = []
    for i in range(10):
        expected_position = feature_linear_scaling_int(i, 10, NBBINS_POSITION)
        expected_feature_values.append({
            'str_relative_document_position': str(expected_position),
        })
    assert feature_values == expected_feature_values
def test_should_filter_by_token_multiple_labels(self):
    """Filtering by two of three tags returns the tokens of exactly those tags."""
    tagged_tokens = [
        (TAG_1, get_layout_tokens_for_text('tokens tag 1')),
        (TAG_2, get_layout_tokens_for_text('tokens tag 2')),
        (TAG_3, get_layout_tokens_for_text('tokens tag 3'))
    ]
    # All tokens of the three tags live on one and the same line.
    all_tokens = [token for _, tokens in tagged_tokens for token in tokens]
    line = LayoutLine(all_tokens)
    layout_model_labels = []
    for tag, tokens in tagged_tokens:
        for token in tokens:
            layout_model_labels.append(LayoutModelLabel(
                label=tag,
                label_token_text=token.text,
                layout_line=line,
                layout_token=token
            ))
    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[LayoutBlock(lines=[line])])]
    )
    layout_document_label_result = LayoutDocumentLabelResult(
        layout_document,
        layout_model_labels
    )
    filtered_document = (
        layout_document_label_result.get_filtered_document_by_labels([
            TAG_1, TAG_3
        ])
    )
    actual_text = join_layout_tokens(filtered_document.iter_all_tokens())
    expected_tokens = tagged_tokens[0][1] + tagged_tokens[2][1]
    assert actual_text == join_layout_tokens(expected_tokens)
def test_should_provide_punctuation_profile(
    self,
    features_provider: SegmentationLineFeaturesProvider
):
    """The line 'a .: b' has punctuation profile '.:' with length feature '2'."""
    block = LayoutBlock(lines=[
        LayoutLine.for_text('a .: b'),
    ])
    layout_document = LayoutDocument(pages=[LayoutPage(blocks=[block])])
    feature_values = [
        {
            'line_punctuation_profile': (
                features.get_line_punctuation_profile()
            ),
            'line_punctuation_profile_length_feature': (
                features.get_line_punctuation_profile_length_feature()
            ),
        }
        for features in _iter_line_features(features_provider, layout_document)
    ]
    LOGGER.debug('feature_values: %r', feature_values)
    expected_feature_values = [
        {
            'line_punctuation_profile': '.:',
            'line_punctuation_profile_length_feature': '2'
        },
    ]
    assert feature_values == expected_feature_values
ModelDataGenerator, LayoutModelData, feature_linear_scaling_int, _LINESCALE, get_str_bool_feature_value) LOGGER = logging.getLogger(__name__) NBSP = '\u00A0' def format_feature_text(text: str) -> str: return re.sub(" |\t", NBSP, text.strip()) NBBINS_POSITION = 12 EMPTY_LAYOUT_TOKEN = LayoutToken('') EMPTY_LAYOUT_LINE = LayoutLine([]) def get_block_status(line_index: int, line_count: int) -> str: return ('BLOCKSTART' if line_index == 0 else ('BLOCKEND' if line_index == line_count - 1 else 'BLOCKIN')) def get_page_status(block_index: int, block_count: int, is_first_block_token: bool, is_last_block_token: bool) -> str: return ('PAGESTART' if block_index == 0 and is_first_block_token else ('PAGEEND' if block_index == block_count - 1 and is_last_block_token else 'PAGEIN'))
def get_layout_line_for_text(text: str, line_id: int) -> LayoutLine:
    """Create a newline-terminated `LayoutLine` for `text` tagged with `line_id`."""
    line_descriptor = LayoutLineDescriptor(line_id=line_id)
    return LayoutLine.for_text(
        text,
        tail_whitespace='\n',
        line_descriptor=line_descriptor
    )