def test_should_add_parsed_references(self):
    """A reference list in the back section is rendered as a TEI listBibl.

    Checks the list heading, the reference's main title, its raw reference
    note and the ``xml:id`` taken from the reference's content id.
    """
    semantic_document = SemanticDocument()
    reference = SemanticReference([
        SemanticTitle(layout_block=LayoutBlock.for_text('Reference Title 1')),
        SemanticRawReferenceText(layout_block=LayoutBlock.for_text('Reference 1'))
    ])
    reference.content_id = 'b0'
    reference_list = SemanticReferenceList([
        SemanticHeading(layout_block=LayoutBlock.for_text('References')),
        reference
    ])
    semantic_document.back_section.add_content(reference_list)

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    list_bibl_xpath = '//tei:back/tei:div[@type="references"]/tei:listBibl'
    bibl_struct_xpath = f'{list_bibl_xpath}/tei:biblStruct'
    expected_values_by_xpath = [
        (f'{list_bibl_xpath}/tei:head', ['References']),
        (f'{bibl_struct_xpath}/tei:analytic/tei:title[@type="main"]', ['Reference Title 1']),
        (f'{bibl_struct_xpath}/tei:note[@type="raw_reference"]', ['Reference 1']),
        (f'{bibl_struct_xpath}/@xml:id', ['b0'])
    ]
    for xpath, expected_values in expected_values_by_xpath:
        assert tei_document.get_xpath_text_content_list(xpath) == expected_values
def test_should_add_section_figures_to_back(self):
    """A figure inside a back section is emitted directly under the annex div.

    Also checks that no nested ``tei:div`` is created for the section that
    only wraps the figure.
    """
    semantic_document = SemanticDocument()
    figure = SemanticFigure([
        SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
        SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1'))
    ], content_id='fig_0')
    semantic_document.back_section.add_content(SemanticSection([figure]))

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    annex_xpath = '//tei:back/tei:div[@type="annex"]'
    # exclude figures of type "table" so that only real figures are matched
    figure_xpath = f'{annex_xpath}/tei:figure[not(contains(@type, "table"))]'
    expected_values_by_child_xpath = [
        ('tei:head', ['Label 1']),
        ('tei:label', ['Label 1']),
        ('tei:figDesc', ['Caption 1']),
        ('@xml:id', ['fig_0'])
    ]
    for child_xpath, expected_values in expected_values_by_child_xpath:
        actual_values = tei_document.get_xpath_text_content_list(
            f'{figure_xpath}/{child_xpath}'
        )
        assert actual_values == expected_values
    assert not tei_document.xpath(f'{annex_xpath}/tei:div')
def test_should_add_asset_citation_for_resolved_reference(self):
    """A reference citation with a target content id becomes a ``bibr`` ref.

    The paragraph text must contain the citation text, and the ``tei:ref``
    must point to the reference via ``#<content id>``.
    """
    semantic_document = SemanticDocument()
    paragraph = SemanticParagraph([
        SemanticTextContentWrapper(layout_block=LayoutBlock.for_text('See')),
        SemanticReferenceCitation(
            layout_block=LayoutBlock.for_text('Ref 1'),
            target_content_id='b0'
        )
    ])
    reference_list = SemanticReferenceList([
        SemanticReference(
            [SemanticLabel(layout_block=LayoutBlock.for_text('1'))],
            content_id='b0'
        )
    ])
    semantic_document.body_section.add_content(
        SemanticSection([paragraph, reference_list])
    )

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    paragraph_xpath = '//tei:body/tei:div/tei:p'
    ref_xpath = f'{paragraph_xpath}/tei:ref[@type="bibr"]'
    assert tei_document.get_xpath_text_content_list(paragraph_xpath) == ['See Ref 1']
    assert tei_document.get_xpath_text_content_list(ref_xpath) == ['Ref 1']
    assert tei_document.get_xpath_text_content_list(f'{ref_xpath}/@target') == ['#b0']
def test_should_add_single_author(self):
    """Each name part of a front-matter author maps to its TEI element."""
    semantic_document = SemanticDocument()
    author = SemanticAuthor([
        SemanticNameTitle(layout_block=LayoutBlock.for_text('Title1')),
        SemanticGivenName(layout_block=LayoutBlock.for_text('Given1')),
        SemanticMiddleName(layout_block=LayoutBlock.for_text('Middle1')),
        SemanticSurname(layout_block=LayoutBlock.for_text('Surname1')),
        SemanticNameSuffix(layout_block=LayoutBlock.for_text('Suffix1'))
    ])
    semantic_document.front.add_content(author)

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    expected_values_by_xpath = [
        ('//tei:author//tei:roleName', ['Title1']),
        ('//tei:author//tei:forename[@type="first"]', ['Given1']),
        ('//tei:author//tei:forename[@type="middle"]', ['Middle1']),
        ('//tei:author//tei:surname', ['Surname1']),
        ('//tei:author//tei:genName', ['Suffix1'])
    ]
    for xpath, expected_values in expected_values_by_xpath:
        assert tei_document.get_xpath_text_content_list(xpath) == expected_values
def _process_graphics(
    self,
    document: SemanticDocument,
    layout_document: LayoutDocument,
    context: FullTextProcessorDocumentContext
):
    """Match the layout document's graphics against the document's figures.

    All ``SemanticFigure`` items of ``document`` are used as match candidates.
    The graphics come from the document graphic provider, which is created
    for the page numbers of those figures. A ``SemanticMixedNote`` of type
    ``unmatched_graphics`` is handed to the matcher; if it is not empty
    afterwards, it is appended to the back section of ``document``.
    """
    unmatched_graphics_container = SemanticMixedNote(note_type='unmatched_graphics')
    figure_candidates = list(document.iter_by_type_recursively(SemanticFigure))
    graphic_provider = self._get_document_graphic_provider(
        context=context,
        page_numbers=get_page_numbers_for_semantic_content_list(figure_candidates)
    )
    semantic_graphics = list(
        graphic_provider.iter_semantic_graphic_for_layout_document(
            layout_document,
            extract_graphic_assets=self.config.extract_graphic_assets
        )
    )
    self._match_graphic_elements(
        semantic_graphic_list=semantic_graphics,
        candidate_semantic_content_list=figure_candidates,
        unmatched_graphics_container=unmatched_graphics_container
    )
    if unmatched_graphics_container.is_empty():
        LOGGER.debug('no unmatched graphics')
        return
    LOGGER.debug('unmatched_graphics_container: %r', unmatched_graphics_container)
    document.back_section.add_content(unmatched_graphics_container)
def test_should_set_abstract(self):
    """An abstract in the front matter is rendered as ``tei:abstract/tei:p``."""
    semantic_document = SemanticDocument()
    abstract = SemanticAbstract(LayoutBlock.for_text(TOKEN_1))
    semantic_document.front.add_content(abstract)

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    abstract_paragraphs = tei_document.get_xpath_text_content_list(
        '//tei:abstract/tei:p'
    )
    assert abstract_paragraphs == [TOKEN_1]
def test_should_add_notes_to_body(self):
    """A note added to the body section becomes a typed ``tei:note`` in the body."""
    semantic_document = SemanticDocument()
    note_block = LayoutBlock.for_text(TOKEN_1)
    semantic_document.body_section.add_note(note_block, 'other')

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    note_texts = tei_document.get_xpath_text_content_list(
        '//tei:body/tei:note[@type="other"]'
    )
    assert note_texts == [TOKEN_1]
def test_should_set_manuscript_title(self):
    """A title in the front matter becomes the main title of the titleStmt."""
    semantic_document = SemanticDocument()
    title = SemanticTitle(layout_block=LayoutBlock.for_text(TOKEN_1))
    semantic_document.front.add_content(title)

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    title_xpath = '//tei:fileDesc/tei:titleStmt/tei:title[@level="a"][@type="main"]'
    assert tei_document.get_xpath_text_content_list(title_xpath) == [TOKEN_1]
def test_should_create_back_section(self):
    """A back section with heading and paragraph is nested under the annex div."""
    semantic_document = SemanticDocument()
    back_subsection = semantic_document.back_section.add_new_section()
    back_subsection.add_heading_block(LayoutBlock.for_text(TOKEN_1))
    back_subsection.add_new_paragraph().add_block_content(
        LayoutBlock.for_text(TOKEN_2)
    )

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    section_xpath = '//tei:back/tei:div[@type="annex"]/tei:div'
    heading_texts = tei_document.get_xpath_text_content_list(f'{section_xpath}/tei:head')
    paragraph_texts = tei_document.get_xpath_text_content_list(f'{section_xpath}/tei:p')
    assert heading_texts == [TOKEN_1]
    assert paragraph_texts == [TOKEN_2]
def test_should_not_convert_pdf_to_jats_zip(  # pylint: disable=too-many-locals
    self,
    sciencebeam_parser_session: ScienceBeamParserSession,
    get_tei_for_semantic_document_mock: MagicMock,
    full_text_processor_class_mock: MagicMock,
    full_text_processor_mock: MagicMock,
    xslt_transformer_wrapper_mock: MagicMock,
    request_temp_path: Path
):
    """Request a JATS zip for a PDF source and check the archive contents.

    The zip must contain ``jats.xml`` with the mocked JATS content and the
    graphic file under its relative path. The full text processor must have
    been configured with ``extract_graphic_assets`` enabled.

    NOTE(review): the test name says "not convert", yet the assertions expect
    a converted JATS zip - confirm the intended name.
    """
    pdf_path = request_temp_path / 'test.pdf'
    alto_xml_path = request_temp_path / TEMP_ALTO_XML_FILENAME
    image_path = request_temp_path / 'image1.png'
    image_relative_path = image_path.name
    alto_xml_path.write_bytes(XML_CONTENT_1)
    image_path.write_bytes(IMAGE_DATA_1)

    tei_root = etree.fromstring(TEI_XML_CONTENT_1)
    get_tei_for_semantic_document_mock.return_value = TeiDocument(tei_root)
    xslt_transformer_wrapper_mock.return_value = etree.fromstring(JATS_XML_CONTENT_1)

    semantic_document = SemanticDocument()
    graphic = SemanticGraphic(
        layout_graphic=LayoutGraphic(local_file_path=str(image_path)),
        relative_path=image_relative_path
    )
    semantic_document.back_section.add_content(graphic)
    full_text_processor_mock.get_semantic_document_for_layout_document.return_value = (
        semantic_document
    )

    source = sciencebeam_parser_session.get_source(str(pdf_path), MediaTypes.PDF)
    result_file = source.get_local_file_for_response_media_type(MediaTypes.JATS_ZIP)

    with ZipFile(result_file, 'r') as zip_file:
        assert zip_file.read('jats.xml') == JATS_XML_CONTENT_1
        assert zip_file.read(image_relative_path) == IMAGE_DATA_1

    # call_args[1] holds the keyword arguments of the processor constructor call
    processor_config = full_text_processor_class_mock.call_args[1]['config']
    assert processor_config.extract_graphic_assets is True
def test_should_unmatched_graphics_to_back(self):
    """Graphics in an ``unmatched_graphics`` note are emitted with their url."""
    semantic_document = SemanticDocument()
    graphic = SemanticGraphic(
        layout_graphic=LayoutGraphic(coordinates=COORDINATES_1),
        relative_path='image1.svg'
    )
    unmatched_note = SemanticMixedNote([graphic], note_type='unmatched_graphics')
    semantic_document.back_section.add_content(unmatched_note)

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    graphics_xpath = '//tei:note[@type="unmatched_graphics"]//tei:graphic'
    graphic_nodes = tei_document.xpath_nodes(graphics_xpath)
    assert graphic_nodes
    graphic_urls = tei_document.get_xpath_text_content_list(f'{graphics_xpath}/@url')
    assert graphic_urls == ['image1.svg']
def test_should_add_orphan_affiliation(self):
    """An affiliation without any author is still emitted under ``tei:author``."""
    semantic_document = SemanticDocument()
    aff_marker = SemanticMarker(layout_block=LayoutBlock.for_text('1'))
    aff = SemanticAffiliationAddress(
        [
            aff_marker,
            SemanticInstitution(layout_block=LayoutBlock.for_text('Institution1'))
        ],
        content_id='aff0'
    )
    semantic_document.front.add_content(aff)

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    affiliation_xpath = '//tei:author/tei:affiliation'
    raw_affiliation_xpath = f'{affiliation_xpath}/tei:note[@type="raw_affiliation"]'
    assert tei_document.get_xpath_text_content_list(
        raw_affiliation_xpath
    ) == [aff.get_text()]
    assert tei_document.get_xpath_text_content_list(
        f'{raw_affiliation_xpath}/tei:label'
    ) == [aff_marker.get_text()]
    assert tei_document.get_xpath_text_content_list(
        f'{affiliation_xpath}/@key'
    ) == ['aff0']
def test_should_add_single_author_with_affiliation(self):
    """An author and an affiliation sharing the marker ``1`` are emitted together.

    Checks the author's name parts as well as the raw affiliation note, its
    label, the affiliation key and the institution name.
    """
    semantic_document = SemanticDocument()
    author = SemanticAuthor([
        SemanticNameTitle(layout_block=LayoutBlock.for_text('Title1')),
        SemanticGivenName(layout_block=LayoutBlock.for_text('Given1')),
        SemanticMiddleName(layout_block=LayoutBlock.for_text('Middle1')),
        SemanticSurname(layout_block=LayoutBlock.for_text('Surname1')),
        SemanticNameSuffix(layout_block=LayoutBlock.for_text('Suffix1')),
        SemanticMarker(layout_block=LayoutBlock.for_text('1'))
    ])
    aff_marker = SemanticMarker(layout_block=LayoutBlock.for_text('1'))
    institution = SemanticInstitution(
        layout_block=LayoutBlock.for_text('Institution1')
    )
    aff = SemanticAffiliationAddress([aff_marker, institution], content_id='aff0')
    semantic_document.front.add_content(author)
    semantic_document.front.add_content(aff)

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    expected_name_parts_by_xpath = [
        ('//tei:author//tei:roleName', ['Title1']),
        ('//tei:author//tei:forename[@type="first"]', ['Given1']),
        ('//tei:author//tei:forename[@type="middle"]', ['Middle1']),
        ('//tei:author//tei:surname', ['Surname1']),
        ('//tei:author//tei:genName', ['Suffix1'])
    ]
    for xpath, expected_values in expected_name_parts_by_xpath:
        assert tei_document.get_xpath_text_content_list(xpath) == expected_values

    affiliation_xpath = '//tei:author/tei:affiliation'
    raw_affiliation_xpath = f'{affiliation_xpath}/tei:note[@type="raw_affiliation"]'
    assert tei_document.get_xpath_text_content_list(
        raw_affiliation_xpath
    ) == [aff.get_text()]
    assert tei_document.get_xpath_text_content_list(
        f'{raw_affiliation_xpath}/tei:label'
    ) == [aff_marker.get_text()]
    assert tei_document.get_xpath_text_content_list(
        f'{affiliation_xpath}/@key'
    ) == ['aff0']
    assert tei_document.get_xpath_text_content_list(
        f'{affiliation_xpath}/tei:orgName[@type="institution"]'
    ) == [institution.get_text()]
def create_asset_zip_for_semantic_document(
    zip_filename: str,
    semantic_document: SemanticDocument,
    relative_xml_filename: str,
    local_xml_filename: str
):
    """Create a zip containing the XML file and the document's graphic assets.

    The XML file at ``local_xml_filename`` is stored as
    ``relative_xml_filename``. Every ``SemanticGraphic`` found recursively in
    ``semantic_document`` is stored under its ``relative_path``, read from the
    ``local_file_path`` of its layout graphic.

    Args:
        zip_filename: Path of the zip file to create (opened in ``'w'`` mode).
        semantic_document: Document whose graphics are added to the zip.
        relative_xml_filename: Name of the XML file inside the zip.
        local_xml_filename: Path of the XML file on disk.

    Raises:
        AssertionError: If a graphic has no ``relative_path``, no layout
            graphic or no ``local_file_path``.
    """
    semantic_graphic_list = list(
        semantic_document.iter_by_type_recursively(SemanticGraphic))
    LOGGER.debug('semantic_graphic_list: %r', semantic_graphic_list)
    with ZipFile(zip_filename, 'w') as zip_file:
        zip_file.write(local_xml_filename, relative_xml_filename)
        for semantic_graphic in semantic_graphic_list:
            # Raise explicitly rather than using `assert`: assert statements are
            # removed under `python -O`, and a missing relative_path would then
            # silently archive the graphic under its local file path.
            if not semantic_graphic.relative_path:
                raise AssertionError(
                    "graphic relative_path missing, ensure extract_graphic_assets was enabled"
                )
            layout_graphic = semantic_graphic.layout_graphic
            if not layout_graphic:
                raise AssertionError('graphic layout_graphic missing')
            if not layout_graphic.local_file_path:
                raise AssertionError('graphic local_file_path missing')
            zip_file.write(layout_graphic.local_file_path, semantic_graphic.relative_path)
    # the size is read after the zip has been closed
    LOGGER.debug('response_content (bytes): %d', Path(zip_filename).stat().st_size)
def test_should_use_author_name_part_values(self):
    """The explicit ``value`` of a name part is used instead of its layout text."""
    semantic_document = SemanticDocument()
    given_name = SemanticGivenName(layout_block=LayoutBlock.for_text('GIVEN1'))
    middle_name = SemanticMiddleName(layout_block=LayoutBlock.for_text('MIDDLE1'))
    surname = SemanticSurname(layout_block=LayoutBlock.for_text('SURNAME1'))
    # the values differ in case from the layout text so the source is detectable
    given_name.value = 'Given1'
    middle_name.value = 'Middle1'
    surname.value = 'Surname1'
    semantic_document.front.add_content(
        SemanticAuthor([given_name, middle_name, surname])
    )

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    expected_values_by_xpath = [
        ('//tei:author//tei:forename[@type="first"]', ['Given1']),
        ('//tei:author//tei:forename[@type="middle"]', ['Middle1']),
        ('//tei:author//tei:surname', ['Surname1'])
    ]
    for xpath, expected_values in expected_values_by_xpath:
        assert tei_document.get_xpath_text_content_list(xpath) == expected_values
def test_should_add_raw_equation_with_label_to_paragraph(self):
    """A raw equation inside a paragraph is emitted as a sibling ``tei:formula``.

    The formula holds the equation text and its label, while the paragraph
    keeps only its own text.
    """
    # to be consistent with Java GROBID
    semantic_document = SemanticDocument()
    raw_equation = SemanticRawEquation([
        SemanticRawEquationContent(layout_block=LayoutBlock.for_text('Equation 1')),
        SemanticLabel(layout_block=LayoutBlock.for_text('(1)'))
    ])
    paragraph = SemanticParagraph([
        SemanticTextContentWrapper(layout_block=LayoutBlock.for_text('Next')),
        raw_equation
    ])
    semantic_document.body_section.add_content(SemanticSection([paragraph]))

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    section_xpath = '//tei:body/tei:div'
    formula_xpath = f'{section_xpath}/tei:formula'
    assert tei_document.get_xpath_text_content_list(formula_xpath) == ['Equation 1 (1)']
    assert tei_document.get_xpath_text_content_list(
        f'{formula_xpath}/tei:label'
    ) == ['(1)']
    assert tei_document.get_xpath_text_content_list(f'{section_xpath}/tei:p') == ['Next']
def test_should_add_section_tables_to_body(self):
    """A table inside a body section is emitted as a ``table`` figure in the body.

    Also checks that no ``tei:div`` is created for the section that only
    wraps the table.
    """
    semantic_document = SemanticDocument()
    table = SemanticTable([
        SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
        SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1'))
    ], content_id='tab_0')
    semantic_document.body_section.add_content(SemanticSection([table]))

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    table_xpath = '//tei:body/tei:figure[@type="table"]'
    expected_values_by_child_xpath = [
        ('tei:head', ['Label 1']),
        ('tei:label', ['Label 1']),
        ('tei:figDesc', ['Caption 1']),
        ('@xml:id', ['tab_0'])
    ]
    for child_xpath, expected_values in expected_values_by_child_xpath:
        actual_values = tei_document.get_xpath_text_content_list(
            f'{table_xpath}/{child_xpath}'
        )
        assert actual_values == expected_values
    assert not tei_document.xpath('//tei:body/tei:div')
def get_semantic_document_for_layout_document(
    self,
    layout_document: LayoutDocument,
    context: Optional[FullTextProcessorDocumentContext] = None
) -> SemanticDocument:
    """Build a ``SemanticDocument`` from a layout document.

    The layout document is first passed through ``_preprocess_layout_graphics``
    and then labelled by the segmentation model. Which parts of the document
    are populated is controlled by the ``extract_*`` flags on ``self.config``.

    Args:
        layout_document: The layout document to process.
        context: Optional per-document context; a new
            ``FullTextProcessorDocumentContext`` is created when omitted.

    Returns:
        The populated semantic document.

    NOTE(review): the nesting of the conditional blocks below was reconstructed
    from a source without reliable indentation - confirm that the content id
    assignments belong inside their respective ``extract_*`` blocks.
    """
    if context is None:
        context = FullTextProcessorDocumentContext()
    layout_document = self._preprocess_layout_graphics(
        layout_document,
        context=context
    )
    segmentation_label_result = self.segmentation_model.get_label_layout_document_result(
        layout_document,
        app_features_context=self.app_features_context
    )
    # the header is extracted up-front, regardless of the extract_front flag
    header_layout_document = segmentation_label_result.get_filtered_document_by_label(
        '<header>'
    ).remove_empty_blocks()
    document = SemanticDocument()
    if self.config.extract_front:
        self._process_header_layout_document(
            header_layout_document=header_layout_document,
            semantic_document=document
        )
    if self.config.extract_body_sections:
        self._update_semantic_section_using_segmentation_result_and_fulltext_model(
            document.body_section,
            segmentation_label_result,
            '<body>',
            SemanticSectionTypes.OTHER
        )
    # acknowledgements and annex content both go into the back section
    if self.config.extract_acknowledgements:
        self._update_semantic_section_using_segmentation_result_and_fulltext_model(
            document.back_section,
            segmentation_label_result,
            '<acknowledgement>',
            SemanticSectionTypes.ACKNOWLEDGEMENT
        )
    if self.config.extract_back_sections:
        self._update_semantic_section_using_segmentation_result_and_fulltext_model(
            document.back_section,
            segmentation_label_result,
            '<annex>',
            SemanticSectionTypes.OTHER
        )
    if self.config.extract_references:
        self._extract_raw_references_from_segmentation(
            semantic_document=document,
            segmentation_label_result=segmentation_label_result
        )
    if self.config.extract_citation_fields:
        self._extract_reference_fields_from_raw_references(
            semantic_document=document
        )
        if self.config.extract_citation_authors or self.config.extract_citation_editors:
            self._extract_reference_name_lists_from_raw_references(
                semantic_document=document
            )
        references = list(document.iter_by_type_recursively(SemanticReference))
        ref_citations = list(document.iter_by_type_recursively(SemanticReferenceCitation))
        # reference content ids use the prefix 'b'
        self._assign_content_ids(references, iter(iter_ids('b')))
        # citations are matched using the reference label texts and the
        # raw reference texts, via the two chained matchers
        self._assign_target_content_ids(ref_citations, ChainedContentIdMatcher([
            SimpleContentIdMatcher(
                self._get_semantic_content_text_by_content_id(references, SemanticLabel)
            ),
            PartialContentIdMatcher(
                self._get_semantic_content_text_by_content_id(
                    references, SemanticRawReferenceText
                )
            )
        ]))
    if self.config.extract_figure_fields:
        self._extract_figure_fields_from_raw_figures(semantic_document=document)
        figures = list(document.iter_by_type_recursively(SemanticFigure))
        figure_citations = list(document.iter_by_type_recursively(SemanticFigureCitation))
        # figure content ids use the prefix 'fig_'; citations match on the label
        self._assign_content_ids(figures, iter(iter_ids('fig_')))
        self._assign_target_content_ids(figure_citations, SimpleContentIdMatcher(
            self._get_semantic_content_text_by_content_id(figures, SemanticLabel)
        ))
    if self.config.extract_table_fields:
        self._extract_table_fields_from_raw_tables(semantic_document=document)
        tables = list(document.iter_by_type_recursively(SemanticTable))
        table_citations = list(document.iter_by_type_recursively(SemanticTableCitation))
        # table content ids use the prefix 'tab_'; citations match on the label
        self._assign_content_ids(tables, iter(iter_ids('tab_')))
        self._assign_target_content_ids(table_citations, SimpleContentIdMatcher(
            self._get_semantic_content_text_by_content_id(tables, SemanticLabel)
        ))
    if self.config.extract_graphic_bounding_boxes:
        self._process_graphics(
            document=document,
            layout_document=layout_document,
            context=context
        )
    return document
def test_should_return_empty_document(self):
    """An empty semantic document produces a TEI document without any div."""
    tei_document = get_tei_for_semantic_document(SemanticDocument())
    div_nodes = tei_document.xpath('//tei:div')
    assert not div_nodes