def test_should_preserve_empty_pages_if_requested(self):
    """Keep a page holding only an empty line when ``preserve_empty_pages=True``."""
    page_with_token = LayoutPage(
        blocks=[
            LayoutBlock(lines=[LayoutLine(tokens=[LayoutToken('token1')])])
        ],
        graphics=[]
    )
    page_without_tokens = LayoutPage(
        blocks=[LayoutBlock(lines=[LayoutLine(tokens=[])])],
        graphics=[]
    )
    layout_document = LayoutDocument(
        pages=[page_with_token, page_without_tokens]
    )
    cleaned_layout_document = remove_empty_blocks(
        layout_document,
        preserve_empty_pages=True
    )
    # Both pages are expected to remain, including the one without tokens.
    assert len(cleaned_layout_document.pages) == 2
def test_should_remove_empty_line_block_and_page(self):
    """Drop a page whose only line has no tokens and keep the populated page."""
    page_with_token = LayoutPage(
        blocks=[
            LayoutBlock(lines=[LayoutLine(tokens=[LayoutToken('token1')])])
        ],
        graphics=[]
    )
    page_without_tokens = LayoutPage(
        blocks=[LayoutBlock(lines=[LayoutLine(tokens=[])])],
        graphics=[]
    )
    cleaned_layout_document = remove_empty_blocks(
        LayoutDocument(pages=[page_with_token, page_without_tokens])
    )
    remaining_pages = cleaned_layout_document.pages
    assert len(remaining_pages) == 1
    # The surviving page must still carry the original token.
    first_line = remaining_pages[0].blocks[0].lines[0]
    token_texts = [token.text for token in first_line.tokens]
    assert token_texts == ['token1']
def test_should_retokenize_document_with_placeholders(self):
    """Split one multi-word token in two and check text, whitespace and x/width."""
    text = 'token1 token2'
    text_length = len(text)
    combined_token = LayoutToken(
        text,
        whitespace='\n',
        coordinates=LayoutPageCoordinates(x=10, y=10, width=100, height=50)
    )
    layout_document = LayoutDocument(pages=[
        LayoutPage(
            blocks=[LayoutBlock.for_tokens([combined_token])],
            graphics=[]
        )
    ])
    retokenized_layout_document = retokenize_layout_document(layout_document)
    line = retokenized_layout_document.pages[0].blocks[0].lines[0]
    assert [token.text for token in line.tokens] == ['token1', 'token2']
    assert [token.whitespace for token in line.tokens] == [' ', '\n']
    # Expected x and width are proportional to each part's share of the characters.
    first_coordinates = line.tokens[0].coordinates
    assert first_coordinates.x == 10.0
    assert first_coordinates.width == 100 * len('token1') / text_length
    second_coordinates = line.tokens[1].coordinates
    assert second_coordinates.x == 10.0 + 100 * len('token1 ') / text_length
    assert second_coordinates.width == 100 * len('token2') / text_length
def test_should_provide_line_text(
        self, features_provider: SegmentationLineFeaturesProvider):
    """Check line text plus first and second token text for each of two lines."""
    lines = [
        LayoutLine.for_text('first1 second1 this is a line'),
        LayoutLine.for_text('first2 second2 this is a line')
    ]
    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[LayoutBlock(lines=lines)])]
    )
    feature_values = [
        {
            'line_text': features.line_text,
            'token_text': features.token_text,
            'second_token_text': features.second_token_text
        }
        for features in _iter_line_features(features_provider, layout_document)
    ]
    LOGGER.debug('feature_values: %r', feature_values)
    expected_feature_values = [
        {
            'line_text': 'first1 second1 this is a line',
            'token_text': 'first1',
            'second_token_text': 'second1'
        },
        {
            'line_text': 'first2 second2 this is a line',
            'token_text': 'first2',
            'second_token_text': 'second2'
        },
    ]
    assert feature_values == expected_feature_values
def test_should_provide_block_relative_line_length(
        self, features_provider: SegmentationLineFeaturesProvider):
    """Check the line length feature for lines of 1, 2 and 10 characters in one block."""
    block = LayoutBlock(lines=[
        LayoutLine.for_text(line_text)
        for line_text in ('1', '12', '1234567890')
    ])
    layout_document = LayoutDocument(pages=[LayoutPage(blocks=[block])])
    feature_values = [
        {
            'str_block_relative_line_length_feature': (
                features.get_str_block_relative_line_length_feature()
            )
        }
        for features in _iter_line_features(features_provider, layout_document)
    ]
    LOGGER.debug('feature_values: %r', feature_values)
    expected_values = (
        '1',  # 1 * 10 / 10
        '2',  # 2 * 10 / 10
        '10',  # 10 * 10 / 10
    )
    assert feature_values == [
        {'str_block_relative_line_length_feature': expected_value}
        for expected_value in expected_values
    ]
def test_should_provide_page_and_block_status_for_single_token_blocks(
        self, features_provider: SegmentationLineFeaturesProvider):
    """Check page and block status for three single-line blocks on one page."""
    blocks = [
        LayoutBlock.for_text(block_text)
        for block_text in ('line1', 'line2', 'line3')
    ]
    layout_document = LayoutDocument(pages=[LayoutPage(blocks=blocks)])
    feature_values = [
        {
            'page_status': features.get_page_status(),
            'block_status': features.get_block_status()
        }
        for features in _iter_line_features(features_provider, layout_document)
    ]
    LOGGER.debug('feature_values: %r', feature_values)
    # Every line is expected to start a block; only the page status varies.
    assert feature_values == [
        {'page_status': expected_page_status, 'block_status': 'BLOCKSTART'}
        for expected_page_status in ('PAGESTART', 'PAGEIN', 'PAGEEND')
    ]
def test_should_extract_references_fields_from_document(  # pylint: disable=too-many-locals
        self, fulltext_models_mock: MockFullTextModels):
    """Check a labelled reference gets title, label, raw text and a linked citation."""
    other_body = LayoutBlock.for_text('the body')
    citation_block = LayoutBlock.for_text('1')
    body_block = LayoutBlock.merge_blocks([other_body, citation_block])
    label_block = LayoutBlock.for_text('1')
    ref_title_block = LayoutBlock.for_text('Reference Title 1')
    ref_text_block = LayoutBlock.merge_blocks([ref_title_block])
    ref_block = LayoutBlock.merge_blocks([label_block, ref_text_block])
    fulltext_processor = FullTextProcessor(
        fulltext_models_mock,
        FullTextProcessorConfig(extract_citation_fields=True)
    )
    segmentation_model_mock = fulltext_models_mock.segmentation_model_mock
    fulltext_model_mock = fulltext_models_mock.fulltext_model_mock
    reference_segmenter_model_mock = fulltext_models_mock.reference_segmenter_model_mock
    citation_model_mock = fulltext_models_mock.citation_model_mock

    # (model mock, layout block, label) in the order the labels are registered.
    label_assignments = [
        (segmentation_model_mock, body_block, '<body>'),
        (segmentation_model_mock, ref_block, '<references>'),
        (fulltext_model_mock, other_body, '<section>'),
        (fulltext_model_mock, citation_block, '<citation_marker>'),
        (reference_segmenter_model_mock, label_block, '<label>'),
        (reference_segmenter_model_mock, ref_text_block, '<reference>'),
        (citation_model_mock, ref_title_block, '<title>'),
    ]
    for model_mock, layout_block, label in label_assignments:
        model_mock.update_label_by_layout_block(layout_block, label)

    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[body_block, ref_block])]
    )
    semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
        layout_document=layout_document
    )
    LOGGER.debug('semantic_document: %s', semantic_document)
    assert semantic_document is not None

    reference_list = list(
        semantic_document.back_section.iter_by_type(SemanticReferenceList)
    )
    assert len(reference_list) == 1
    references = list(reference_list[0].iter_by_type(SemanticReference))
    assert len(references) == 1
    ref = references[0]
    assert ref.get_text_by_type(SemanticTitle) == ref_title_block.text
    assert ref.get_text_by_type(SemanticLabel) == label_block.text
    assert ref.get_text_by_type(SemanticRawReferenceText) == ref_text_block.text
    assert ref.content_id == 'b0'

    # The citation marker in the body must point at the extracted reference.
    ref_citations = list(
        semantic_document.iter_by_type_recursively(SemanticReferenceCitation)
    )
    assert len(ref_citations) == 1
    assert ref_citations[0].target_content_id == 'b0'
def test_should_remove_blank_token(self):
    """Expect a token consisting of a single space to be gone after retokenizing."""
    blank_block = LayoutBlock.for_tokens([LayoutToken(' ')])
    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[blank_block], graphics=[])]
    )
    retokenized_layout_document = retokenize_layout_document(layout_document)
    first_line = retokenized_layout_document.pages[0].blocks[0].lines[0]
    assert first_line.tokens == []
def get_filtered_document_by_labels(self, labels: List[str]):  # pylint: disable=too-many-branches
    """Return a new layout document restricted to content tagged with ``labels``.

    Filtering is token based when at least one matching label carries a
    ``layout_token``; otherwise it falls back to whole lines, matched via
    each label's ``layout_line``. Tokens and lines are matched by object
    identity (``id``). Result pages and blocks are created lazily, so source
    pages and blocks without any accepted line do not appear in the result.

    Args:
        labels: The labels whose tokens or lines should be kept.

    Returns:
        A new ``LayoutDocument``. It has no pages when no layout document
        labels are found for ``labels`` (a warning is logged in that case).
    """
    layout_document = LayoutDocument(pages=[])
    layout_document_labels = self.get_layout_document_labels_by_labels(labels)
    if not layout_document_labels:
        LOGGER.warning(
            'no layout_lines_to_include found for: %r, available keys=%r',
            labels, self.layout_document_labels_by_label.keys()
        )
        return layout_document
    layout_token_ids_to_include = {
        id(layout_document_label.layout_token)
        for layout_document_label in layout_document_labels
        if layout_document_label.layout_token
    }
    LOGGER.debug('layout_tokens_to_include: %s', layout_token_ids_to_include)
    layout_line_ids_to_include: Set[int] = set()
    if not layout_token_ids_to_include:
        # No token level labels available: fall back to line level filtering.
        layout_line_ids_to_include = {
            id(layout_document_label.layout_line)
            for layout_document_label in layout_document_labels
            if layout_document_label.layout_line
        }
        LOGGER.debug('layout_line_ids_to_include: %s', layout_line_ids_to_include)
    result_page: Optional[LayoutPage] = None
    for page in self.layout_document.pages:  # pylint: disable=too-many-nested-blocks
        result_page = None
        result_block: Optional[LayoutBlock] = None
        for block in page.blocks:
            result_block = None
            for line in block.lines:
                accepted_line: Optional[LayoutLine] = None
                if layout_token_ids_to_include:
                    accepted_tokens: List[LayoutToken] = [
                        token
                        for token in line.tokens
                        if id(token) in layout_token_ids_to_include
                    ]
                    if not accepted_tokens:
                        continue
                    # Fix: compare the two lengths. The previous code compared
                    # an int with the list itself, which is never equal, so the
                    # original line was never reused.
                    if len(line.tokens) == len(accepted_tokens):
                        accepted_line = line
                    else:
                        accepted_line = LayoutLine(tokens=accepted_tokens)
                else:
                    if id(line) not in layout_line_ids_to_include:
                        continue
                    accepted_line = line
                # Only materialise the page / block once there is a line to hold.
                if result_page is None:
                    result_page = LayoutPage(blocks=[])
                    layout_document.pages.append(result_page)
                if result_block is None:
                    result_block = LayoutBlock(lines=[])
                    result_page.blocks.append(result_block)
                result_block.lines.append(accepted_line)
    return layout_document
def test_should_not_retokenize_document_with_valid_tokens(self):
    """Expect a single-word token to come through retokenizing unchanged."""
    token_block = LayoutBlock.for_tokens([LayoutToken('token1')])
    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[token_block], graphics=[])]
    )
    retokenized_layout_document = retokenize_layout_document(layout_document)
    first_line = retokenized_layout_document.pages[0].blocks[0].lines[0]
    token_texts = [token.text for token in first_line.tokens]
    assert token_texts == ['token1']
def test_should_provide_empty_list_for_empty_pages(self):
    """Expect no page numbers for a document whose only page has no blocks."""
    page_meta = LayoutPageMeta(
        page_number=1,
        coordinates=LAYOUT_PAGE_COORDINATES_1._replace(page_number=1)
    )
    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[], meta=page_meta)]
    )
    page_numbers = get_page_numbers_with_mostly_bitmap_graphics(layout_document)
    assert page_numbers == []
def test_should_preserve_meta(self):
    """Expect the page meta to be unchanged after removing empty blocks."""
    page_meta = LayoutPageMeta(COORDINATES_1)
    page = LayoutPage(
        blocks=[LayoutBlock.for_text('token1'), EMPTY_BLOCK],
        graphics=[],
        meta=page_meta
    )
    cleaned_layout_document = remove_empty_blocks(LayoutDocument(pages=[page]))
    cleaned_page = cleaned_layout_document.pages[0]
    assert cleaned_page.meta == page_meta
def test_should_provide_page_number_with_uncomment_page_dimension(self):
    """Expect page 2 to be reported when it alone uses the second coordinates."""
    # NOTE(review): "uncomment" in the test name is presumably a typo for "uncommon".
    coordinates_by_page_number = [
        (1, LAYOUT_PAGE_COORDINATES_1),
        (2, LAYOUT_PAGE_COORDINATES_2),
        (3, LAYOUT_PAGE_COORDINATES_1),
    ]
    pages = [
        LayoutPage(
            blocks=[],
            meta=LayoutPageMeta(
                page_number=page_number,
                coordinates=coordinates._replace(page_number=page_number)
            )
        )
        for page_number, coordinates in coordinates_by_page_number
    ]
    layout_document = LayoutDocument(pages=pages)
    page_numbers = get_page_numbers_with_uncommon_page_dimension(layout_document)
    assert page_numbers == [2]
def test_should_replace_text_and_graphics_within_bounding_box_of_semantic_graphics(
        self):
    """Check that content inside the new graphic's bounding box is replaced.

    The token and graphic placed inside the box are expected to disappear
    from the page, the new graphic to be the last graphic, and its
    ``related_block`` to contain both original tokens.
    """
    page_coordinates = LayoutPageCoordinates.from_bounding_box(
        BoundingBox(0, 0, 200, 200), page_number=1
    )
    semantic_graphic_coordinates = LayoutPageCoordinates.from_bounding_box(
        BoundingBox(10, 90, 100, 50), page_number=1
    )
    keep_coordinates = LayoutPageCoordinates.from_bounding_box(
        BoundingBox(10, 10, 100, 20), page_number=1
    )
    remove_coordinates = LayoutPageCoordinates.from_bounding_box(
        BoundingBox(10, 100, 100, 20), page_number=1
    )
    empty_coordinates = LayoutPageCoordinates.from_bounding_box(
        BoundingBox(10, 100, 0, 0), page_number=1
    )
    keep_token = LayoutToken('keep', coordinates=keep_coordinates)
    remove_token = LayoutToken('remove', coordinates=remove_coordinates)
    keep_graphic = LayoutGraphic(
        coordinates=keep_coordinates, graphic_type='keep-graphic'
    )
    remove_graphic = LayoutGraphic(
        coordinates=remove_coordinates, graphic_type='remove-graphic'
    )
    layout_document = LayoutDocument(pages=[
        LayoutPage(
            blocks=[
                LayoutBlock(lines=[LayoutLine(tokens=[keep_token, remove_token])])
            ],
            graphics=[keep_graphic, remove_graphic],
            meta=LayoutPageMeta(
                page_number=page_coordinates.page_number,
                coordinates=page_coordinates
            )
        )
    ])
    layout_graphic = LayoutGraphic(
        coordinates=semantic_graphic_coordinates, graphic_type='new-graphic'
    )
    no_coords_layout_graphic = LayoutGraphic(
        coordinates=empty_coordinates, graphic_type='empty-coords-graphic'
    )
    result = get_layout_document_with_text_and_graphics_replaced_by_graphics(
        layout_document,
        semantic_graphics=[
            SemanticGraphic(layout_graphic=layout_graphic),
            SemanticGraphic(layout_graphic=no_coords_layout_graphic)
        ]
    )
    result_page = result.pages[0]
    LOGGER.debug('result.pages[0].graphics: %r', result_page.graphics)
    assert result_page.graphics[:-1] == [keep_graphic]
    LOGGER.debug('result.pages[0].graphics[-1]: %r', result_page.graphics[-1])
    assert result_page.graphics[-1].graphic_type == layout_graphic.graphic_type
    assert result_page.graphics[-1].coordinates == layout_graphic.coordinates
    assert list(result_page.blocks[0].iter_all_tokens()) == [keep_token]
    assert list(result_page.graphics[-1].related_block.iter_all_tokens()) == [
        keep_token, remove_token
    ]
def _create_page(
    coordinates: LayoutPageCoordinates,
    graphics: Optional[Sequence[LayoutGraphic]] = None
) -> LayoutPage:
    """Create a page without blocks whose meta is derived from ``coordinates``.

    Args:
        coordinates: Page coordinates; also supplies the meta's page number.
        graphics: Optional graphics for the page. A falsy value results in
            an empty list.

    Returns:
        The newly created ``LayoutPage``.
    """
    page_meta = LayoutPageMeta(
        page_number=coordinates.page_number,
        coordinates=coordinates
    )
    page_graphics = graphics if graphics else []
    return LayoutPage(meta=page_meta, blocks=[], graphics=page_graphics)
def test_should_provide_repetitive_pattern_feature(
        self, features_provider: SegmentationLineFeaturesProvider):
    """Check the repetitive pattern flags for a line repeated on two pages."""
    first_page = LayoutPage(blocks=[
        LayoutBlock.for_text('this is repetitive'),
        LayoutBlock.for_text('this is not')
    ])
    second_page = LayoutPage(blocks=[
        LayoutBlock.for_text('this is repetitive'),
        LayoutBlock.for_text('it is different')
    ])
    layout_document = LayoutDocument(pages=[first_page, second_page])
    feature_values = [
        {
            'get_str_is_repetitive_pattern': (
                features.get_str_is_repetitive_pattern()
            ),
            'get_str_is_first_repetitive_pattern': (
                features.get_str_is_first_repetitive_pattern()
            )
        }
        for features in _iter_line_features(features_provider, layout_document)
    ]
    LOGGER.debug('feature_values: %r', feature_values)
    # (is repetitive, is first repetitive) per line, in document order.
    expected_flags = [('1', '1'), ('0', '0'), ('1', '0'), ('0', '0')]
    assert feature_values == [
        {
            'get_str_is_repetitive_pattern': is_repetitive,
            'get_str_is_first_repetitive_pattern': is_first_repetitive
        }
        for is_repetitive, is_first_repetitive in expected_flags
    ]
def test_should_preserve_meta(self):
    """Expect the page meta to be unchanged after retokenizing the document."""
    page_meta = LayoutPageMeta(COORDINATES_1)
    page = LayoutPage(
        blocks=[LayoutBlock.for_tokens([LayoutToken('token1 token2')])],
        graphics=[],
        meta=page_meta
    )
    retokenized_layout_document = retokenize_layout_document(
        LayoutDocument(pages=[page])
    )
    retokenized_page = retokenized_layout_document.pages[0]
    assert retokenized_page.meta == page_meta
def test_should_extract_table_label_caption_from_body(  # pylint: disable=too-many-locals
        self, fulltext_models_mock: MockFullTextModels,
        segmentation_label: str):
    """Check a table's label, caption and id plus the table citation linking to it."""
    citation_block = LayoutBlock.for_text('Table 1')
    label_block = LayoutBlock.for_text('Table 1')
    caption_block = LayoutBlock.for_text('Caption 1')
    other_block = LayoutBlock.for_text('Other')
    figure_block = LayoutBlock.merge_blocks(
        [label_block, other_block, caption_block]
    )
    fulltext_block = LayoutBlock.merge_blocks([citation_block, figure_block])
    fulltext_processor = FullTextProcessor(
        fulltext_models_mock,
        FullTextProcessorConfig(extract_table_fields=True)
    )
    segmentation_model_mock = fulltext_models_mock.segmentation_model_mock
    fulltext_model_mock = fulltext_models_mock.fulltext_model_mock
    table_model_mock = fulltext_models_mock.table_model_mock

    # (model mock, layout block, label) in the order the labels are registered.
    label_assignments = [
        (segmentation_model_mock, fulltext_block, segmentation_label),
        (fulltext_model_mock, citation_block, '<table_marker>'),
        (fulltext_model_mock, figure_block, '<table>'),
        (table_model_mock, label_block, '<label>'),
        (table_model_mock, caption_block, '<figDesc>'),
    ]
    for model_mock, layout_block, label in label_assignments:
        model_mock.update_label_by_layout_block(layout_block, label)

    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[fulltext_block])]
    )
    semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
        layout_document=layout_document
    )
    LOGGER.debug('semantic_document: %s', semantic_document)
    assert semantic_document is not None

    # The table is searched for in both the body and the back section.
    sections_to_search = [
        semantic_document.body_section,
        semantic_document.back_section
    ]
    table_list = list(
        iter_by_semantic_type_recursively(sections_to_search, SemanticTable)
    )
    assert len(table_list) == 1
    table = table_list[0]
    assert table.get_text_by_type(SemanticLabel) == label_block.text
    assert table.get_text_by_type(SemanticCaption) == caption_block.text
    assert table.content_id == 'tab_0'

    table_citation_list = list(
        semantic_document.iter_by_type_recursively(SemanticTableCitation)
    )
    assert len(table_citation_list) == 1
    table_citation = table_citation_list[0]
    assert table_citation.get_text() == citation_block.text
    assert table_citation.target_content_id == 'tab_0'
def get_layout_page_with_text_or_graphic_replaced_by_graphic(
    layout_page: LayoutPage,
    semantic_graphic: SemanticGraphic,
    is_only_semantic_graphic_on_page: bool,
    is_replace_overlapping_text: bool
) -> LayoutPage:
    """Return a copy of ``layout_page`` with the semantic graphic put in place.

    Existing page graphics for which
    ``is_layout_graphic_within_bounding_box`` is true (relative to the
    semantic graphic's bounding box) are dropped and the semantic graphic's
    layout graphic is appended to the remaining ones.

    Args:
        layout_page: The page to derive the modified page from.
        semantic_graphic: Supplies the layout graphic; both the layout
            graphic and its coordinates are asserted to be truthy.
        is_only_semantic_graphic_on_page: When true, the appended graphic
            gets a ``related_block`` built from all tokens of the page.
        is_replace_overlapping_text: When true, tokens are additionally
            passed through ``_remove_tokens_within_bounding_box_flatmap_fn``
            and empty blocks are removed afterwards.

    Returns:
        The modified ``LayoutPage``.
    """
    layout_graphic = semantic_graphic.layout_graphic
    assert layout_graphic
    assert layout_graphic.coordinates
    graphic_bounding_box = layout_graphic.coordinates.bounding_box

    if is_only_semantic_graphic_on_page:
        all_page_tokens = list(layout_page.iter_all_tokens())
        layout_graphic = layout_graphic._replace(
            related_block=LayoutBlock.for_tokens(all_page_tokens)
        )

    graphics_to_keep = [
        existing_graphic
        for existing_graphic in layout_page.graphics
        if not is_layout_graphic_within_bounding_box(
            existing_graphic, bounding_box=graphic_bounding_box
        )
    ]
    modified_layout_page = layout_page.replace(
        graphics=graphics_to_keep + [layout_graphic]
    )
    if not is_replace_overlapping_text:
        return modified_layout_page

    remove_tokens_fn = functools.partial(
        _remove_tokens_within_bounding_box_flatmap_fn,
        bounding_box=graphic_bounding_box
    )
    page_without_overlapping_tokens = (
        modified_layout_page.flat_map_layout_tokens(remove_tokens_fn)
    )
    return page_without_overlapping_tokens.remove_empty_blocks()
def test_should_extract_editor_names_from_references_fields(  # pylint: disable=too-many-locals
        self, fulltext_models_mock: MockFullTextModels):
    """Check that a reference's editor gets its given name and surname extracted."""
    given_name_block = LayoutBlock.for_text('Given name')
    surname_block = LayoutBlock.for_text('Surname')
    other_block = LayoutBlock.for_text('Other')
    editors_block = LayoutBlock.merge_blocks(
        [given_name_block, other_block, surname_block]
    )
    ref_text_block = LayoutBlock.merge_blocks([editors_block])
    ref_block = LayoutBlock.merge_blocks([ref_text_block])
    # Authors are switched off so that only editors are extracted.
    processor_config = FullTextProcessorConfig(
        extract_citation_fields=True,
        extract_citation_authors=False,
        extract_citation_editors=True
    )
    fulltext_processor = FullTextProcessor(fulltext_models_mock, processor_config)
    segmentation_model_mock = fulltext_models_mock.segmentation_model_mock
    reference_segmenter_model_mock = fulltext_models_mock.reference_segmenter_model_mock
    citation_model_mock = fulltext_models_mock.citation_model_mock
    name_citation_model_mock = fulltext_models_mock.name_citation_model_mock

    # (model mock, layout block, label) in the order the labels are registered.
    label_assignments = [
        (segmentation_model_mock, ref_block, '<references>'),
        (reference_segmenter_model_mock, ref_text_block, '<reference>'),
        (citation_model_mock, editors_block, '<editor>'),
        (name_citation_model_mock, given_name_block, '<forename>'),
        (name_citation_model_mock, surname_block, '<surname>'),
    ]
    for model_mock, layout_block, label in label_assignments:
        model_mock.update_label_by_layout_block(layout_block, label)

    layout_document = LayoutDocument(pages=[LayoutPage(blocks=[ref_block])])
    semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
        layout_document=layout_document
    )
    LOGGER.debug('semantic_document: %s', semantic_document)
    assert semantic_document is not None

    reference_list = list(
        semantic_document.back_section.iter_by_type(SemanticReferenceList)
    )
    assert len(reference_list) == 1
    references = list(reference_list[0].iter_by_type(SemanticReference))
    assert len(references) == 1
    ref = references[0]
    editors = list(ref.iter_by_type(SemanticEditor))
    assert len(editors) == 1
    editor = editors[0]
    assert editor.given_name_text == given_name_block.text
    assert editor.surname_text == surname_block.text
def test_should_extract_affiliation_address_from_document(  # pylint: disable=too-many-locals
        self, fulltext_models_mock: MockFullTextModels):
    """Check marker, institution, country and id of the extracted affiliation."""
    marker_block = LayoutBlock.for_text('1')
    institution_block = LayoutBlock.for_text('Institution1')
    country_block = LayoutBlock.for_text('Country1')
    aff_block = LayoutBlock.merge_blocks([marker_block, institution_block])
    address_block = LayoutBlock.merge_blocks([country_block])
    aff_address_block = LayoutBlock.merge_blocks([aff_block, address_block])
    fulltext_processor = FullTextProcessor(fulltext_models_mock)
    # The whole header consists of the affiliation followed by its address.
    header_block = aff_address_block
    segmentation_model_mock = fulltext_models_mock.segmentation_model_mock
    header_model_mock = fulltext_models_mock.header_model_mock
    affiliation_address_model_mock = fulltext_models_mock.affiliation_address_model_mock

    # (model mock, layout block, label) in the order the labels are registered.
    label_assignments = [
        (segmentation_model_mock, header_block, '<header>'),
        (header_model_mock, aff_block, '<affiliation>'),
        (header_model_mock, address_block, '<address>'),
        (affiliation_address_model_mock, marker_block, '<marker>'),
        (affiliation_address_model_mock, institution_block, '<institution>'),
        (affiliation_address_model_mock, country_block, '<country>'),
    ]
    for model_mock, layout_block, label in label_assignments:
        model_mock.update_label_by_layout_block(layout_block, label)

    layout_document = LayoutDocument(pages=[LayoutPage(blocks=[header_block])])
    semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
        layout_document=layout_document
    )
    assert semantic_document is not None
    assert semantic_document.front.get_text() == aff_address_block.text
    affiliation_view_text = semantic_document.front.view_by_type(
        SemanticAffiliationAddress
    ).get_text()
    assert affiliation_view_text == aff_address_block.text

    affiliations = list(
        semantic_document.front.iter_by_type(SemanticAffiliationAddress)
    )
    assert len(affiliations) == 1
    affiliation = affiliations[0]
    assert affiliation.get_text_by_type(SemanticMarker) == marker_block.text
    assert affiliation.get_text_by_type(SemanticInstitution) == institution_block.text
    assert affiliation.get_text_by_type(SemanticCountry) == country_block.text
    assert affiliation.content_id == 'aff0'
def test_should_ignore_small_bitmap(self):
    """Expect no page numbers when the page's only image measures 1 x 1."""
    small_image = LayoutGraphic(
        graphic_type='image',
        coordinates=LAYOUT_PAGE_COORDINATES_1._replace(
            page_number=1, width=1, height=1
        )
    )
    page_meta = LayoutPageMeta(
        page_number=1,
        coordinates=LAYOUT_PAGE_COORDINATES_1._replace(page_number=1)
    )
    layout_document = LayoutDocument(pages=[
        LayoutPage(blocks=[], graphics=[small_image], meta=page_meta)
    ])
    page_numbers = get_page_numbers_with_mostly_bitmap_graphics(layout_document)
    assert page_numbers == []
def test_should_extract_invalid_reference_from_document(  # pylint: disable=too-many-locals
        self, fulltext_models_mock: MockFullTextModels):
    """Check a reference labelled only 'O' by the citation model becomes an invalid reference."""
    other_body = LayoutBlock.for_text('the body')
    citation_block = LayoutBlock.for_text('1')
    body_block = LayoutBlock.merge_blocks([other_body, citation_block])
    invalid_reference_block = LayoutBlock.for_text(
        'This is an invalid reference 1'
    )
    # The same block serves as reference text and as the references section.
    ref_text_block = invalid_reference_block
    ref_block = ref_text_block
    fulltext_processor = FullTextProcessor(
        fulltext_models_mock,
        FullTextProcessorConfig(extract_citation_fields=True)
    )
    segmentation_model_mock = fulltext_models_mock.segmentation_model_mock
    fulltext_model_mock = fulltext_models_mock.fulltext_model_mock
    reference_segmenter_model_mock = fulltext_models_mock.reference_segmenter_model_mock
    citation_model_mock = fulltext_models_mock.citation_model_mock

    # (model mock, layout block, label) in the order the labels are registered.
    label_assignments = [
        (segmentation_model_mock, body_block, '<body>'),
        (segmentation_model_mock, ref_block, '<references>'),
        (fulltext_model_mock, other_body, '<section>'),
        (fulltext_model_mock, citation_block, '<citation_marker>'),
        (reference_segmenter_model_mock, ref_text_block, '<reference>'),
        (citation_model_mock, invalid_reference_block, 'O'),
    ]
    for model_mock, layout_block, label in label_assignments:
        model_mock.update_label_by_layout_block(layout_block, label)

    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[body_block, ref_block])]
    )
    semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
        layout_document=layout_document
    )
    LOGGER.debug('semantic_document: %s', semantic_document)
    assert semantic_document is not None

    reference_list = list(
        semantic_document.back_section.iter_by_type(SemanticReferenceList)
    )
    assert len(reference_list) == 1
    invalid_references = list(
        reference_list[0].iter_by_type(SemanticInvalidReference)
    )
    assert len(invalid_references) == 1
    assert invalid_references[0].get_text() == invalid_reference_block.text
def test_should_extract_from_document(
        self, fulltext_models_mock: MockFullTextModels):
    """Check header, body, acknowledgement and annex text land in front, body and back."""
    fulltext_processor = FullTextProcessor(fulltext_models_mock)
    header_block = LayoutBlock.for_text('This is the header')
    body_block = LayoutBlock.for_text('This is the body')
    acknowledgment_block = LayoutBlock.for_text('Some acknowledgement')
    back_block = LayoutBlock.for_text('This is the back')
    segmentation_model_mock = fulltext_models_mock.segmentation_model_mock
    header_model_mock = fulltext_models_mock.header_model_mock
    fulltext_model_mock = fulltext_models_mock.fulltext_model_mock

    # (model mock, layout block, label) in the order the labels are registered.
    for model_mock, layout_block, label in (
        (segmentation_model_mock, header_block, '<header>'),
        (segmentation_model_mock, body_block, '<body>'),
        (segmentation_model_mock, acknowledgment_block, '<acknowledgement>'),
        (segmentation_model_mock, back_block, '<annex>'),
        (header_model_mock, header_block, '<title>'),
        (fulltext_model_mock, body_block, '<paragraph>'),
        (fulltext_model_mock, acknowledgment_block, '<paragraph>'),
        (fulltext_model_mock, back_block, '<paragraph>'),
    ):
        model_mock.update_label_by_layout_block(layout_block, label)

    layout_document = LayoutDocument(pages=[
        LayoutPage(blocks=[
            header_block, body_block, acknowledgment_block, back_block
        ])
    ])
    semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
        layout_document=layout_document
    )
    assert semantic_document is not None
    assert semantic_document.front.get_text() == header_block.text
    assert (
        semantic_document.front.get_text_by_type(SemanticTitle)
        == header_block.text
    )
    assert semantic_document.body_section.get_text() == body_block.text
    # The annex is expected under OTHER, the acknowledgement under its own type.
    assert (
        semantic_document.back_section.view_by_section_type(
            SemanticSectionTypes.OTHER
        ).get_text()
        == back_block.text
    )
    assert (
        semantic_document.back_section.view_by_section_type(
            SemanticSectionTypes.ACKNOWLEDGEMENT
        ).get_text()
        == acknowledgment_block.text
    )
def __init__(self) -> None:
    """Build sample header, body and reference blocks and the document holding them."""
    # Header: title, author and affiliation.
    title_block = LayoutBlock.for_text('This is the title')
    author_surname_block = LayoutBlock.for_text('Author Surname 1')
    author_block = LayoutBlock.merge_blocks([author_surname_block])
    institution_block = LayoutBlock.for_text('Institution 1')
    affiliation_block = LayoutBlock.merge_blocks([institution_block])
    header_block = LayoutBlock.merge_blocks(
        [title_block, author_block, affiliation_block]
    )
    self.title_block = title_block
    self.author_surname_block = author_surname_block
    self.author_block = author_block
    self.institution_block = institution_block
    self.affiliation_block = affiliation_block
    self.header_block = header_block

    # Body: section title, paragraph, figure and table.
    figure_head_block = LayoutBlock.for_text('Figure 1')
    figure_block = LayoutBlock.merge_blocks([figure_head_block])
    table_head_block = LayoutBlock.for_text('Table 1')
    table_block = LayoutBlock.merge_blocks([table_head_block])
    body_section_title_block = LayoutBlock.for_text('Section 1')
    body_section_paragraph_block = LayoutBlock.for_text('Paragraph 1')
    body_block = LayoutBlock.merge_blocks([
        body_section_title_block,
        body_section_paragraph_block,
        figure_block,
        table_block
    ])
    self.figure_head_block = figure_head_block
    self.figure_block = figure_block
    self.table_head_block = table_head_block
    self.table_block = table_block
    self.body_section_title_block = body_section_title_block
    self.body_section_paragraph_block = body_section_paragraph_block
    self.body_block = body_block

    # Reference: label followed by the reference text (title and author).
    ref_author_surname_block = LayoutBlock.for_text('Ref Author Surname 1')
    ref_author_block = LayoutBlock.merge_blocks([ref_author_surname_block])
    ref_label_block = LayoutBlock.for_text('1')
    ref_title_block = LayoutBlock.for_text('Reference 1')
    ref_text_block = LayoutBlock.merge_blocks(
        [ref_title_block, ref_author_block]
    )
    ref_ref_block = LayoutBlock.merge_blocks([ref_label_block, ref_text_block])
    self.ref_author_surname_block = ref_author_surname_block
    self.ref_author_block = ref_author_block
    self.ref_label_block = ref_label_block
    self.ref_title_block = ref_title_block
    self.ref_text_block = ref_text_block
    self.ref_ref_block = ref_ref_block

    # A single page containing header, body and reference, in that order.
    self.layout_document = LayoutDocument(pages=[
        LayoutPage(blocks=[header_block, body_block, ref_ref_block])
    ])
def test_should_filter_by_line_without_token(self):
    """Check filtering by label when the labels reference lines but no tokens."""
    tagged_lines = [
        (TAG_1, LayoutLine.for_text('this is line 1')),
        (TAG_2, LayoutLine.for_text('this is line 2'))
    ]
    layout_model_labels = []
    for tag, line in tagged_lines:
        # One label per token of the line, each without a token reference.
        for _ in line.tokens:
            layout_model_labels.append(LayoutModelLabel(
                label=tag,
                label_token_text=line.text,
                layout_line=line,
                layout_token=None
            ))
    all_lines = [line for _, line in tagged_lines]
    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[LayoutBlock(lines=all_lines)])]
    )
    layout_document_label_result = LayoutDocumentLabelResult(
        layout_document, layout_model_labels
    )
    for tag, line in tagged_lines:
        filtered_document = (
            layout_document_label_result.get_filtered_document_by_label(tag)
        )
        filtered_text = join_layout_tokens(filtered_document.iter_all_tokens())
        assert filtered_text == join_layout_tokens(line.tokens)
def test_should_provide_block_relative_document_token_position(
        self, features_provider: SegmentationLineFeaturesProvider):
    """Check the relative document position feature for ten lines in one block."""
    lines = [LayoutLine.for_text(f'line{i}') for i in range(10)]
    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[LayoutBlock(lines=lines)])]
    )
    feature_values = [
        {
            'str_relative_document_position': (
                features.get_str_relative_document_position()
            )
        }
        for features in _iter_line_features(features_provider, layout_document)
    ]
    LOGGER.debug('feature_values: %r', feature_values)
    # Each expected value is the line index scaled into NBBINS_POSITION bins.
    expected_feature_values = [
        {
            'str_relative_document_position': str(
                feature_linear_scaling_int(i, 10, NBBINS_POSITION)
            ),
        }
        for i in range(10)
    ]
    assert feature_values == expected_feature_values
def test_should_filter_by_token_label(self):
    """Check filtering by label when two tags share the tokens of a single line."""
    tagged_tokens = [
        (TAG_1, get_layout_tokens_for_text('this is line 1')),
        (TAG_2, get_layout_tokens_for_text('this is line 2'))
    ]
    # All tokens, regardless of tag, live on the same line.
    all_tokens = [token for _, tokens in tagged_tokens for token in tokens]
    line = LayoutLine(all_tokens)
    layout_model_labels = []
    for tag, tokens in tagged_tokens:
        for token in tokens:
            layout_model_labels.append(LayoutModelLabel(
                label=tag,
                label_token_text=token.text,
                layout_line=line,
                layout_token=token
            ))
    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[LayoutBlock(lines=[line])])]
    )
    layout_document_label_result = LayoutDocumentLabelResult(
        layout_document, layout_model_labels
    )
    for tag, tokens in tagged_tokens:
        filtered_document = (
            layout_document_label_result.get_filtered_document_by_label(tag)
        )
        filtered_text = join_layout_tokens(filtered_document.iter_all_tokens())
        assert filtered_text == join_layout_tokens(tokens)
def test_should_extract_acknowledgement_only(
        self, fulltext_models_mock: MockFullTextModels):
    """Check a document holding only an acknowledgement block exposes its text in the back section."""
    fulltext_processor = FullTextProcessor(fulltext_models_mock)
    acknowledgment_block = LayoutBlock.for_text('Some acknowledgement')
    segmentation_model_mock = fulltext_models_mock.segmentation_model_mock
    fulltext_model_mock = fulltext_models_mock.fulltext_model_mock
    for model_mock, label in (
        (segmentation_model_mock, '<acknowledgement>'),
        (fulltext_model_mock, '<paragraph>'),
    ):
        model_mock.update_label_by_layout_block(acknowledgment_block, label)
    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[acknowledgment_block])]
    )
    semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
        layout_document=layout_document
    )
    assert semantic_document is not None
    acknowledgement_view = semantic_document.back_section.view_by_section_type(
        SemanticSectionTypes.ACKNOWLEDGEMENT
    )
    assert acknowledgement_view.get_text() == acknowledgment_block.text
def test_should_filter_by_token_multiple_labels(self):
    """Check filtering by two of three tags returns only the tokens of those two tags."""
    tagged_tokens = [
        (TAG_1, get_layout_tokens_for_text('tokens tag 1')),
        (TAG_2, get_layout_tokens_for_text('tokens tag 2')),
        (TAG_3, get_layout_tokens_for_text('tokens tag 3'))
    ]
    # All tokens, regardless of tag, live on the same line.
    all_tokens = [token for _, tokens in tagged_tokens for token in tokens]
    line = LayoutLine(all_tokens)
    layout_model_labels = []
    for tag, tokens in tagged_tokens:
        for token in tokens:
            layout_model_labels.append(LayoutModelLabel(
                label=tag,
                label_token_text=token.text,
                layout_line=line,
                layout_token=token
            ))
    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[LayoutBlock(lines=[line])])]
    )
    layout_document_label_result = LayoutDocumentLabelResult(
        layout_document, layout_model_labels
    )
    filtered_document = (
        layout_document_label_result.get_filtered_document_by_labels(
            [TAG_1, TAG_3]
        )
    )
    filtered_text = join_layout_tokens(filtered_document.iter_all_tokens())
    # Expect the tokens of the first and third tag, in that order.
    expected_tokens = tagged_tokens[0][1] + tagged_tokens[2][1]
    assert filtered_text == join_layout_tokens(expected_tokens)