def test_should_extract_references_fields_from_document(  # pylint: disable=too-many-locals
        self, fulltext_models_mock: MockFullTextModels):
    """Check that a labelled reference is split into fields and linked to its citation.

    The layout document holds a body block (text plus a citation marker) and a
    references block (a label plus a reference text made of a title only).
    With ``extract_citation_fields=True`` the test expects exactly one
    ``SemanticReference`` exposing the title, label and raw reference text with
    content id ``'b0'``, and exactly one ``SemanticReferenceCitation`` whose
    target content id is ``'b0'``.
    """
    # Body: free text followed by the citation marker.
    body_text_block = LayoutBlock.for_text('the body')
    marker_block = LayoutBlock.for_text('1')
    body_block = LayoutBlock.merge_blocks([body_text_block, marker_block])

    # Reference entry: a label followed by the reference text (title only).
    ref_label_block = LayoutBlock.for_text('1')
    title_block = LayoutBlock.for_text('Reference Title 1')
    raw_ref_block = LayoutBlock.merge_blocks([title_block])
    references_block = LayoutBlock.merge_blocks(
        [ref_label_block, raw_ref_block]
    )

    processor = FullTextProcessor(
        fulltext_models_mock,
        config=FullTextProcessorConfig(extract_citation_fields=True)
    )

    # Tell each mocked model which label to emit for which block.
    segmentation_mock = fulltext_models_mock.segmentation_model_mock
    fulltext_mock = fulltext_models_mock.fulltext_model_mock
    ref_segmenter_mock = fulltext_models_mock.reference_segmenter_model_mock
    citation_mock = fulltext_models_mock.citation_model_mock
    segmentation_mock.update_label_by_layout_block(body_block, '<body>')
    segmentation_mock.update_label_by_layout_block(
        references_block, '<references>'
    )
    fulltext_mock.update_label_by_layout_block(body_text_block, '<section>')
    fulltext_mock.update_label_by_layout_block(
        marker_block, '<citation_marker>'
    )
    ref_segmenter_mock.update_label_by_layout_block(
        ref_label_block, '<label>'
    )
    ref_segmenter_mock.update_label_by_layout_block(
        raw_ref_block, '<reference>'
    )
    citation_mock.update_label_by_layout_block(title_block, '<title>')

    page = LayoutPage(blocks=[body_block, references_block])
    document = LayoutDocument(pages=[page])
    semantic_document = processor.get_semantic_document_for_layout_document(
        layout_document=document
    )
    LOGGER.debug('semantic_document: %s', semantic_document)
    assert semantic_document is not None

    # Exactly one reference list holding exactly one parsed reference.
    back_section = semantic_document.back_section
    reference_lists = list(back_section.iter_by_type(SemanticReferenceList))
    assert len(reference_lists) == 1
    parsed_references = list(
        reference_lists[0].iter_by_type(SemanticReference)
    )
    assert len(parsed_references) == 1
    reference = parsed_references[0]
    assert reference.get_text_by_type(SemanticTitle) == title_block.text
    assert reference.get_text_by_type(SemanticLabel) == ref_label_block.text
    assert (
        reference.get_text_by_type(SemanticRawReferenceText)
        == raw_ref_block.text
    )
    assert reference.content_id == 'b0'

    # The citation marker in the body must point at that reference.
    citations = list(
        semantic_document.iter_by_type_recursively(SemanticReferenceCitation)
    )
    assert len(citations) == 1
    assert citations[0].target_content_id == 'b0'
def test_should_extract_table_label_caption_from_body(  # pylint: disable=too-many-locals
        self, fulltext_models_mock: MockFullTextModels,
        segmentation_label: str):
    """Check that a table's label and caption are extracted and cited.

    A single block containing a table marker followed by a table (label,
    unlabelled text, caption) is tagged with ``segmentation_label`` by the
    segmentation mock. With ``extract_table_fields=True`` the test expects
    exactly one ``SemanticTable`` across the body and back sections, with the
    label, the caption and content id ``'tab_0'``, and exactly one
    ``SemanticTableCitation`` with the marker text targeting ``'tab_0'``.

    Args:
        fulltext_models_mock: mocked full text models whose labels are preset.
        segmentation_label: label applied to the whole block (presumably
            supplied by a parametrization outside this view).
    """
    marker_block = LayoutBlock.for_text('Table 1')
    table_label_block = LayoutBlock.for_text('Table 1')
    table_caption_block = LayoutBlock.for_text('Caption 1')
    unlabelled_block = LayoutBlock.for_text('Other')
    table_block = LayoutBlock.merge_blocks(
        [table_label_block, unlabelled_block, table_caption_block]
    )
    whole_block = LayoutBlock.merge_blocks([marker_block, table_block])

    processor = FullTextProcessor(
        fulltext_models_mock,
        config=FullTextProcessorConfig(extract_table_fields=True)
    )

    # Tell each mocked model which label to emit for which block.
    segmentation_mock = fulltext_models_mock.segmentation_model_mock
    fulltext_mock = fulltext_models_mock.fulltext_model_mock
    table_mock = fulltext_models_mock.table_model_mock
    segmentation_mock.update_label_by_layout_block(
        whole_block, segmentation_label
    )
    fulltext_mock.update_label_by_layout_block(marker_block, '<table_marker>')
    fulltext_mock.update_label_by_layout_block(table_block, '<table>')
    table_mock.update_label_by_layout_block(table_label_block, '<label>')
    table_mock.update_label_by_layout_block(table_caption_block, '<figDesc>')

    page = LayoutPage(blocks=[whole_block])
    document = LayoutDocument(pages=[page])
    semantic_document = processor.get_semantic_document_for_layout_document(
        layout_document=document
    )
    LOGGER.debug('semantic_document: %s', semantic_document)
    assert semantic_document is not None

    # The table may end up in either section, so both are searched.
    searched_sections = [
        semantic_document.body_section,
        semantic_document.back_section
    ]
    tables = list(
        iter_by_semantic_type_recursively(searched_sections, SemanticTable)
    )
    assert len(tables) == 1
    table = tables[0]
    assert table.get_text_by_type(SemanticLabel) == table_label_block.text
    assert table.get_text_by_type(SemanticCaption) == table_caption_block.text
    assert table.content_id == 'tab_0'

    # The marker must become a citation pointing at the table.
    table_citations = list(
        semantic_document.iter_by_type_recursively(SemanticTableCitation)
    )
    assert len(table_citations) == 1
    table_citation = table_citations[0]
    assert table_citation.get_text() == marker_block.text
    assert table_citation.target_content_id == 'tab_0'
def test_should_extract_editor_names_from_references_fields(  # pylint: disable=too-many-locals
        self, fulltext_models_mock: MockFullTextModels):
    """Check that editor names inside a reference are parsed into name parts.

    The reference text consists of an editors span (given name, an unlabelled
    token, surname). Citation fields and editors are enabled while citation
    authors are disabled. The test expects exactly one ``SemanticEditor`` in
    the single parsed reference, carrying the given name and the surname.
    """
    forename_block = LayoutBlock.for_text('Given name')
    family_name_block = LayoutBlock.for_text('Surname')
    unlabelled_block = LayoutBlock.for_text('Other')
    editors_block = LayoutBlock.merge_blocks(
        [forename_block, unlabelled_block, family_name_block]
    )
    raw_ref_block = LayoutBlock.merge_blocks([editors_block])
    references_block = LayoutBlock.merge_blocks([raw_ref_block])

    processor_config = FullTextProcessorConfig(
        extract_citation_fields=True,
        extract_citation_authors=False,
        extract_citation_editors=True
    )
    processor = FullTextProcessor(
        fulltext_models_mock, config=processor_config
    )

    # Tell each mocked model which label to emit for which block.
    segmentation_mock = fulltext_models_mock.segmentation_model_mock
    ref_segmenter_mock = fulltext_models_mock.reference_segmenter_model_mock
    citation_mock = fulltext_models_mock.citation_model_mock
    name_citation_mock = fulltext_models_mock.name_citation_model_mock
    segmentation_mock.update_label_by_layout_block(
        references_block, '<references>'
    )
    ref_segmenter_mock.update_label_by_layout_block(
        raw_ref_block, '<reference>'
    )
    citation_mock.update_label_by_layout_block(editors_block, '<editor>')
    name_citation_mock.update_label_by_layout_block(
        forename_block, '<forename>'
    )
    name_citation_mock.update_label_by_layout_block(
        family_name_block, '<surname>'
    )

    page = LayoutPage(blocks=[references_block])
    document = LayoutDocument(pages=[page])
    semantic_document = processor.get_semantic_document_for_layout_document(
        layout_document=document
    )
    LOGGER.debug('semantic_document: %s', semantic_document)
    assert semantic_document is not None

    # Exactly one reference list holding exactly one parsed reference.
    back_section = semantic_document.back_section
    reference_lists = list(back_section.iter_by_type(SemanticReferenceList))
    assert len(reference_lists) == 1
    parsed_references = list(
        reference_lists[0].iter_by_type(SemanticReference)
    )
    assert len(parsed_references) == 1

    # Both name parts must land on a single editor.
    editors = list(parsed_references[0].iter_by_type(SemanticEditor))
    assert len(editors) == 1
    editor = editors[0]
    assert editor.given_name_text == forename_block.text
    assert editor.surname_text == family_name_block.text
def test_should_extract_invalid_reference_from_document(  # pylint: disable=too-many-locals
        self, fulltext_models_mock: MockFullTextModels):
    """Check that a reference with no recognised fields is kept as invalid.

    The citation mock labels the whole reference text as ``'O'``. With
    ``extract_citation_fields=True`` the test expects the single reference
    list to contain exactly one ``SemanticInvalidReference`` whose text equals
    the original reference text.
    """
    # Body: free text followed by the citation marker.
    body_text_block = LayoutBlock.for_text('the body')
    marker_block = LayoutBlock.for_text('1')
    body_block = LayoutBlock.merge_blocks([body_text_block, marker_block])

    # One block plays three roles: the references area, the single
    # reference within it, and the text handed to the citation model.
    invalid_ref_block = LayoutBlock.for_text('This is an invalid reference 1')

    processor = FullTextProcessor(
        fulltext_models_mock,
        config=FullTextProcessorConfig(extract_citation_fields=True)
    )

    # Tell each mocked model which label to emit for which block.
    segmentation_mock = fulltext_models_mock.segmentation_model_mock
    fulltext_mock = fulltext_models_mock.fulltext_model_mock
    ref_segmenter_mock = fulltext_models_mock.reference_segmenter_model_mock
    citation_mock = fulltext_models_mock.citation_model_mock
    segmentation_mock.update_label_by_layout_block(body_block, '<body>')
    segmentation_mock.update_label_by_layout_block(
        invalid_ref_block, '<references>'
    )
    fulltext_mock.update_label_by_layout_block(body_text_block, '<section>')
    fulltext_mock.update_label_by_layout_block(
        marker_block, '<citation_marker>'
    )
    ref_segmenter_mock.update_label_by_layout_block(
        invalid_ref_block, '<reference>'
    )
    citation_mock.update_label_by_layout_block(invalid_ref_block, 'O')

    page = LayoutPage(blocks=[body_block, invalid_ref_block])
    document = LayoutDocument(pages=[page])
    semantic_document = processor.get_semantic_document_for_layout_document(
        layout_document=document
    )
    LOGGER.debug('semantic_document: %s', semantic_document)
    assert semantic_document is not None

    back_section = semantic_document.back_section
    reference_lists = list(back_section.iter_by_type(SemanticReferenceList))
    assert len(reference_lists) == 1
    invalid_references = list(
        reference_lists[0].iter_by_type(SemanticInvalidReference)
    )
    assert len(invalid_references) == 1
    assert invalid_references[0].get_text() == invalid_ref_block.text
def test_should_extract_author_names_separated_by_another_tag(
        self, fulltext_models_mock: MockFullTextModels):
    """Check that author name parts split by an unlabelled token form one author.

    The header mock labels the given name and the surname as ``'<author>'``
    but leaves the token between them unlabelled. With
    ``merge_raw_authors=True`` the test expects a single front-matter author
    carrying both the given name and the surname.
    """
    forename_block = LayoutBlock.for_text('Given name')
    family_name_block = LayoutBlock.for_text('Surname')
    separator_block = LayoutBlock.for_text('Other')
    # The merged authors block also serves as the whole header.
    authors_block = LayoutBlock.merge_blocks(
        [forename_block, separator_block, family_name_block]
    )

    processor = FullTextProcessor(
        fulltext_models_mock,
        config=FullTextProcessorConfig(merge_raw_authors=True)
    )

    # Tell each mocked model which label to emit for which block.
    segmentation_mock = fulltext_models_mock.segmentation_model_mock
    header_mock = fulltext_models_mock.header_model_mock
    name_header_mock = fulltext_models_mock.name_header_model_mock
    segmentation_mock.update_label_by_layout_block(authors_block, '<header>')
    header_mock.update_label_by_layout_block(forename_block, '<author>')
    header_mock.update_label_by_layout_block(family_name_block, '<author>')
    name_header_mock.update_label_by_layout_block(
        forename_block, '<forename>'
    )
    name_header_mock.update_label_by_layout_block(
        family_name_block, '<surname>'
    )

    document = LayoutDocument(pages=[LayoutPage(blocks=[authors_block])])
    semantic_document = processor.get_semantic_document_for_layout_document(
        layout_document=document
    )
    assert semantic_document is not None

    authors = semantic_document.front.authors
    assert len(authors) == 1
    author = authors[0]
    assert author.given_name_text == forename_block.text
    assert author.surname_text == family_name_block.text
def test_should_extract_raw_references_from_document(  # pylint: disable=too-many-locals
        self, fulltext_models_mock: MockFullTextModels):
    """Check that references stay raw when citation field extraction is off.

    The references block holds a label followed by a reference text. With
    ``extract_citation_fields=False`` the test expects the back section text
    to equal the references block text, and the single reference list to
    contain exactly one ``SemanticRawReference`` exposing the label and raw
    reference text with content id ``'b0'``.
    """
    ref_label_block = LayoutBlock.for_text('1')
    raw_ref_block = LayoutBlock.for_text('Reference 1')
    references_block = LayoutBlock.merge_blocks(
        [ref_label_block, raw_ref_block]
    )

    processor = FullTextProcessor(
        fulltext_models_mock,
        config=FullTextProcessorConfig(extract_citation_fields=False)
    )

    # Tell each mocked model which label to emit for which block.
    segmentation_mock = fulltext_models_mock.segmentation_model_mock
    ref_segmenter_mock = fulltext_models_mock.reference_segmenter_model_mock
    segmentation_mock.update_label_by_layout_block(
        references_block, '<references>'
    )
    ref_segmenter_mock.update_label_by_layout_block(
        ref_label_block, '<label>'
    )
    ref_segmenter_mock.update_label_by_layout_block(
        raw_ref_block, '<reference>'
    )

    page = LayoutPage(blocks=[references_block])
    document = LayoutDocument(pages=[page])
    semantic_document = processor.get_semantic_document_for_layout_document(
        layout_document=document
    )
    LOGGER.debug('semantic_document: %s', semantic_document)
    assert semantic_document is not None
    assert semantic_document.back_section.get_text() == references_block.text

    back_section = semantic_document.back_section
    reference_lists = list(back_section.iter_by_type(SemanticReferenceList))
    assert len(reference_lists) == 1
    raw_references = list(
        reference_lists[0].iter_by_type(SemanticRawReference)
    )
    assert len(raw_references) == 1
    raw_reference = raw_references[0]
    assert (
        raw_reference.get_text_by_type(SemanticLabel) == ref_label_block.text
    )
    assert (
        raw_reference.get_text_by_type(SemanticRawReferenceText)
        == raw_ref_block.text
    )
    assert raw_reference.content_id == 'b0'
def test_should_extract_figure_label_caption_from_body(  # pylint: disable=too-many-locals
        self, fulltext_models_mock: MockFullTextModels,
        segmentation_label: str):
    """Check that a figure's label, caption and graphic are extracted and cited.

    A single block containing a figure marker followed by a figure (label,
    unlabelled text, caption) is tagged with ``segmentation_label`` by the
    segmentation mock, and the page also carries one graphic. With figure
    fields, graphic bounding boxes and graphic assets enabled, the test
    expects exactly one ``SemanticFigure`` across the body and back sections,
    with the label, the caption and content id ``'fig_0'``; exactly one
    ``SemanticFigureCitation`` with the marker text targeting ``'fig_0'``; and
    a first ``SemanticGraphic`` in the figure that references the page graphic
    with the file's base name as relative path.

    Args:
        fulltext_models_mock: mocked full text models whose labels are preset.
        segmentation_label: label applied to the whole block (presumably
            supplied by a parametrization outside this view).
    """
    marker_block = LayoutBlock.for_text('Figure 1')

    # Graphic, label and caption get coordinates, each derived from the
    # previous one via ``move_by(dy=10)``.
    graphic_coordinates = LayoutPageCoordinates(
        x=10, y=10, width=100, height=10
    )
    graphic_path = '/path/to/graphic1.svg'
    graphic = LayoutGraphic(
        coordinates=graphic_coordinates,
        local_file_path=graphic_path
    )
    label_coordinates = graphic_coordinates.move_by(dy=10)
    figure_label_block = LayoutBlock.for_text(
        'Figure 1', coordinates=label_coordinates
    )
    caption_coordinates = label_coordinates.move_by(dy=10)
    figure_caption_block = LayoutBlock.for_text(
        'Caption 1', coordinates=caption_coordinates
    )
    unlabelled_block = LayoutBlock.for_text('Other')
    figure_block = LayoutBlock.merge_blocks(
        [figure_label_block, unlabelled_block, figure_caption_block]
    )
    whole_block = LayoutBlock.merge_blocks([marker_block, figure_block])

    processor_config = FullTextProcessorConfig(
        extract_figure_fields=True,
        extract_graphic_bounding_boxes=True,
        extract_graphic_assets=True
    )
    processor = FullTextProcessor(
        fulltext_models_mock, config=processor_config
    )

    # Tell each mocked model which label to emit for which block.
    segmentation_mock = fulltext_models_mock.segmentation_model_mock
    fulltext_mock = fulltext_models_mock.fulltext_model_mock
    figure_mock = fulltext_models_mock.figure_model_mock
    segmentation_mock.update_label_by_layout_block(
        whole_block, segmentation_label
    )
    fulltext_mock.update_label_by_layout_block(
        marker_block, '<figure_marker>'
    )
    fulltext_mock.update_label_by_layout_block(figure_block, '<figure>')
    figure_mock.update_label_by_layout_block(figure_label_block, '<label>')
    figure_mock.update_label_by_layout_block(
        figure_caption_block, '<figDesc>'
    )

    page = LayoutPage(blocks=[whole_block], graphics=[graphic])
    document = LayoutDocument(pages=[page])
    semantic_document = processor.get_semantic_document_for_layout_document(
        layout_document=document
    )
    LOGGER.debug('semantic_document: %s', semantic_document)
    assert semantic_document is not None

    # The figure may end up in either section, so both are searched.
    searched_sections = [
        semantic_document.body_section,
        semantic_document.back_section
    ]
    figures = list(
        iter_by_semantic_type_recursively(searched_sections, SemanticFigure)
    )
    assert len(figures) == 1
    figure = figures[0]
    assert figure.get_text_by_type(SemanticLabel) == figure_label_block.text
    assert (
        figure.get_text_by_type(SemanticCaption) == figure_caption_block.text
    )
    assert figure.content_id == 'fig_0'

    # The marker must become a citation pointing at the figure.
    figure_citations = list(
        semantic_document.iter_by_type_recursively(SemanticFigureCitation)
    )
    assert len(figure_citations) == 1
    figure_citation = figure_citations[0]
    assert figure_citation.get_text() == marker_block.text
    assert figure_citation.target_content_id == 'fig_0'

    # The page graphic must be attached to the figure.
    semantic_graphics = list(figure.iter_by_type(SemanticGraphic))
    assert semantic_graphics
    first_graphic = semantic_graphics[0]
    assert first_graphic.layout_graphic == graphic
    assert first_graphic.relative_path == os.path.basename(graphic_path)