Exemplo n.º 1
0
 def get_tei_element_for_semantic_content(
         self, semantic_content: SemanticContentWrapper,
         context: TeiElementFactoryContext) -> etree.ElementBase:
     """Build a `figure` element with `type="table"` from a `SemanticTable`.

     Every `SemanticLabel` child is emitted twice, once as `head` and once
     as `label`, both built from the label's merged block. Every
     `SemanticCaption` child becomes a `figDesc`. Any other child is
     converted via `context.get_tei_child_elements_for_semantic_content`.
     """
     LOGGER.debug('semantic_content: %s', semantic_content)
     assert isinstance(semantic_content, SemanticTable)
     table_children = [
         context.get_default_attributes_for_semantic_content(semantic_content)
     ]
     for table_item in semantic_content:
         if isinstance(table_item, SemanticLabel):
             label_block = table_item.merged_block
             # the same label block is rendered under both tag names
             for tag_name in ('head', 'label'):
                 table_children.append(TEI_E(
                     tag_name,
                     *context.iter_layout_block_tei_children(label_block)))
         elif isinstance(table_item, SemanticCaption):
             table_children.append(TEI_E(
                 'figDesc',
                 *context.iter_layout_block_tei_children(
                     table_item.merged_block)))
         else:
             table_children.extend(
                 context.get_tei_child_elements_for_semantic_content(
                     table_item))
     return TEI_E('figure', {'type': 'table'}, *table_children)
Exemplo n.º 2
0
def get_tei_author_for_semantic_author_element(
    semantic_author: SemanticAuthor,
    context: TeiElementFactoryContext,
    affiliations_by_marker: Optional[Mapping[str, Sequence[SemanticAffiliationAddress]]] = None
) -> etree.ElementBase:
    """Build an `author` element for a semantic author.

    The element starts with a `persName` holding the TEI children of every
    item in `semantic_author`. For each marker text of the author, the
    affiliations registered under that marker in `affiliations_by_marker`
    are appended as affiliation elements. A marker with no (or an empty)
    entry is logged as a warning and skipped.

    Args:
        semantic_author: the author to convert.
        context: factory context used to convert child content.
        affiliations_by_marker: optional lookup from marker text to
            affiliations; treated as empty when `None`.

    Returns:
        The `author` element.
    """
    marker_lookup = (
        affiliations_by_marker if affiliations_by_marker is not None else {}
    )
    LOGGER.debug('semantic_author: %s', semantic_author)
    name_parts = [
        tei_child
        for name_content in semantic_author
        for tei_child in context.get_tei_child_elements_for_semantic_content(
            name_content
        )
    ]
    pers_name = TEI_E(
        'persName',
        context.get_default_attributes_for_semantic_content(semantic_author),
        *name_parts
    )
    author_children = [pers_name]
    marker_texts = semantic_author.view_by_type(SemanticMarker).get_text_list()
    for marker in marker_texts:
        matched_affiliations = marker_lookup.get(marker)
        if matched_affiliations:
            author_children.extend(
                get_tei_affiliation_for_semantic_affiliation_address_element(
                    matched_affiliation,
                    context=context
                )
                for matched_affiliation in matched_affiliations
            )
        else:
            LOGGER.warning('affiliation not found for marker: %r', marker)
    return TEI_E('author', *author_children)
Exemplo n.º 3
0
def get_tei_reference_element(  # pylint: disable=too-many-branches
        semantic_ref: SemanticReference,
        context: TeiElementFactoryContext) -> etree.ElementBase:
    """Build a `biblStruct` element from a semantic reference.

    Each item of `semantic_ref` is added beneath the parent obtained from
    `context.get_parent_path_for_semantic_content`, except dates, which are
    always placed under `monogr/imprint`:

    - `SemanticLabel` -> note element of type `label`
    - `SemanticRawReferenceText` -> note element of type `raw_reference`
    - `SemanticTitle` -> `title` with `level="a"` and `type="main"`
    - `SemanticAuthor` -> author element
    - `SemanticDate` -> `date`; the first date gets `type="published"`,
      and `when` is set if the date has a year
    - anything else -> converted via `context`
    """
    LOGGER.debug('semantic_ref: %s', semantic_ref)
    root_attributes = context.get_default_attributes_for_semantic_content(
        semantic_ref)
    ref_builder = TeiElementBuilder(TEI_E('biblStruct', root_attributes))
    date_seen = False
    for ref_item in semantic_ref:
        # The default parent is resolved for every item, dates included,
        # before the type-specific handling below.
        default_parent_path = context.get_parent_path_for_semantic_content(
            ref_item)
        target_parent = ref_builder.get_or_create(default_parent_path)
        if isinstance(ref_item, SemanticLabel):
            target_parent.append(
                create_tei_note_element('label', ref_item.merged_block))
        elif isinstance(ref_item, SemanticRawReferenceText):
            target_parent.append(
                create_tei_note_element('raw_reference',
                                        ref_item.merged_block))
        elif isinstance(ref_item, SemanticTitle):
            title_attributes = {'level': 'a', 'type': 'main'}
            target_parent.append(
                TEI_E(
                    'title', title_attributes,
                    *context.iter_layout_block_tei_children(
                        ref_item.merged_block)))
        elif isinstance(ref_item, SemanticAuthor):
            target_parent.append(
                get_tei_author_for_semantic_author_element(ref_item,
                                                           context=context))
        elif isinstance(ref_item, SemanticDate):
            imprint_parent = ref_builder.get_or_create(['monogr', 'imprint'])
            date_attributes = {}
            if not date_seen:
                # assume first date is published date (more or less matches GROBID)
                date_attributes['type'] = 'published'
            if ref_item.year:
                date_attributes['when'] = str(ref_item.year)
            imprint_parent.append(
                TEI_E(
                    'date', date_attributes,
                    *context.iter_layout_block_tei_children(
                        layout_block=ref_item.merged_block)))
            date_seen = True
        else:
            target_parent.extend(
                context.get_tei_child_elements_for_semantic_content(
                    ref_item))
    return ref_builder.element
Exemplo n.º 4
0
 def get_tei_element_for_semantic_content(
         self, semantic_content: SemanticContentWrapper,
         context: TeiElementFactoryContext) -> etree.ElementBase:
     """Build a `biblScope` element with `unit="page"` for a page range.

     When both `from_page` and `to_page` are truthy they are emitted as
     the `from` / `to` attributes and the element carries no text.
     Otherwise the element carries the page range's text instead.
     """
     assert isinstance(semantic_content, SemanticPageRange)
     if semantic_content.from_page and semantic_content.to_page:
         range_attributes = {
             'unit': 'page',
             'from': semantic_content.from_page,
             'to': semantic_content.to_page
         }
         return TEI_E('biblScope', range_attributes)
     return TEI_E('biblScope', {'unit': 'page'}, semantic_content.get_text())
Exemplo n.º 5
0
def get_dummy_tei_author_for_semantic_affiliations_element(
    semantic_affiliations: Sequence[SemanticAffiliationAddress],
    context: TeiElementFactoryContext
) -> etree.ElementBase:
    """Build an `author` element that only carries the given affiliations.

    The element starts with a `note` of type `dummy_author`, followed by
    one affiliation element per entry of `semantic_affiliations`.
    """
    dummy_note = TEI_E(
        'note', {'type': 'dummy_author'}, 'Dummy author for orphan affiliations'
    )
    affiliation_elements = [
        get_tei_affiliation_for_semantic_affiliation_address_element(
            orphan_affiliation,
            context=context
        )
        for orphan_affiliation in semantic_affiliations
    ]
    return TEI_E('author', dummy_note, *affiliation_elements)
Exemplo n.º 6
0
 def get_tei_children_for_semantic_content(
         self, semantic_content: SemanticContentWrapper,
         context: TeiElementFactoryContext) -> List[etree.ElementBase]:
     """Build a single-item list holding a `formula` for a raw equation.

     `SemanticRawEquationContent` children contribute the TEI children of
     their merged block. All other children are appended via
     `context.append_tei_children_list_and_get_whitespace`. The whitespace
     that follows one child is only emitted once another child follows it.
     """
     LOGGER.debug('semantic_content: %s', semantic_content)
     assert isinstance(semantic_content, SemanticRawEquation)
     formula_children: T_ElementChildrenList = [
         context.get_default_attributes_for_semantic_content(
             semantic_content)
     ]
     trailing_whitespace = ''
     for equation_item in semantic_content:
         if not isinstance(equation_item, SemanticRawEquationContent):
             trailing_whitespace = (
                 context.append_tei_children_list_and_get_whitespace(
                     formula_children,
                     equation_item,
                     pending_whitespace=trailing_whitespace))
             continue
         content_block = equation_item.merged_block
         if trailing_whitespace:
             formula_children.append(trailing_whitespace)
         formula_children.extend(
             context.iter_layout_block_tei_children(content_block))
         trailing_whitespace = content_block.whitespace
     formula_element = TEI_E('formula', *formula_children)
     return [formula_element]
def _get_wrapped_figure_tei_element(
        semantic_figure: SemanticFigure) -> TeiElementWrapper:
    """Wrap the figure factory's TEI children in a `root` element.

    The children are produced by `FigureTeiElementFactory` using the
    default factory context.
    """
    figure_factory = FigureTeiElementFactory()
    figure_children = figure_factory.get_tei_children_for_semantic_content(
        semantic_figure, context=DEFAULT_TEI_ELEMENT_FACTORY_CONTEXT)
    root_element = TEI_E('root', *figure_children)
    return TeiElementWrapper(root_element)
Exemplo n.º 8
0
 def get_tei_children_for_semantic_content(
         self, semantic_content: SemanticContentWrapper,
         context: TeiElementFactoryContext) -> List[etree.ElementBase]:
     """Convert a semantic paragraph into a list of TEI elements.

     The paragraph is iterated via `iter_flat_paragraph_formula`. Each
     yielded `SemanticParagraph` becomes one `p` element; every other
     yielded item is converted via `context` and added to the result.
     """
     LOGGER.debug('semantic_content: %s', semantic_content)
     assert isinstance(semantic_content, SemanticParagraph)
     tei_elements: List[etree.ElementBase] = []
     for flat_item in iter_flat_paragraph_formula(semantic_content):
         if isinstance(flat_item, SemanticParagraph):
             paragraph_children: T_ElementChildrenList = [
                 context.get_default_attributes_for_semantic_content(
                     flat_item)
             ]
             trailing_whitespace = ''
             for paragraph_item in flat_item:
                 trailing_whitespace = (
                     context.append_tei_children_list_and_get_whitespace(
                         paragraph_children,
                         paragraph_item,
                         pending_whitespace=trailing_whitespace))
             tei_elements.append(TEI_E('p', *paragraph_children))
         else:
             tei_elements.extend(
                 context.get_tei_child_elements_for_semantic_content(
                     flat_item))
     return tei_elements
Exemplo n.º 9
0
 def __init__(self, root: Optional[etree.ElementBase] = None):
     """Initialise with the given root, or a new `TEI` element if omitted.

     Args:
         root: existing root element to use; a fresh `TEI` element is
             created when this is `None`.
     """
     self.root = TEI_E('TEI') if root is None else root
     self._reference_element: Optional[etree.ElementBase] = None
     super().__init__(self.root)
Exemplo n.º 10
0
 def set_title_layout_block(self, title_block: LayoutBlock):
     """Pass a main `title` element built from a layout block to
     `set_child_element_at`, using the path teiHeader/fileDesc/titleStmt.

     The title has `level="a"` and `type="main"`.
     """
     title_attributes = {
         'level': 'a',
         'type': 'main'
     }
     title_element = TEI_E(
         'title', title_attributes,
         *iter_layout_block_tei_children(title_block))
     self.set_child_element_at(
         ['teiHeader', 'fileDesc', 'titleStmt'], title_element)
Exemplo n.º 11
0
 def test_should_add_superscript_text(self):
     """A token with the superscript font is wrapped in a superscript `hi`."""
     layout_tokens = [
         LayoutToken(TOKEN_1),
         LayoutToken(TOKEN_2, font=SUPERSCRIPT_FONT_1),
         LayoutToken(TOKEN_3)
     ]
     layout_block = LayoutBlock.for_tokens(layout_tokens)
     tei_node = TEI_E.node(*iter_layout_block_tei_children(layout_block))
     superscript_texts = get_tei_xpath_text_content_list(
         tei_node, './tei:hi[@rend="superscript"]')
     assert superscript_texts == [TOKEN_2]
     # the full text still contains all three tokens, space separated
     assert get_text_content(tei_node) == ' '.join([TOKEN_1, TOKEN_2, TOKEN_3])
Exemplo n.º 12
0
def _get_wrapped_graphic_tei_element(
    semantic_graphic: SemanticGraphic
) -> TeiElementWrapper:
    """Wrap the graphic factory's TEI children in a `root` element.

    The children are produced by `GraphicTeiElementFactory` using the
    default factory context.
    """
    graphic_factory = GraphicTeiElementFactory()
    graphic_children = graphic_factory.get_tei_children_for_semantic_content(
        semantic_graphic,
        context=DEFAULT_TEI_ELEMENT_FACTORY_CONTEXT
    )
    root_element = TEI_E('root', *graphic_children)
    return TeiElementWrapper(root_element)
Exemplo n.º 13
0
 def test_should_add_bold_and_italics_text(self):
     """A bold-italic token is found under both a bold and an italic `hi`."""
     layout_tokens = [
         LayoutToken(TOKEN_1),
         LayoutToken(TOKEN_2, font=BOLD_ITALICS_FONT_1),
         LayoutToken(TOKEN_3)
     ]
     layout_block = LayoutBlock.for_tokens(layout_tokens)
     tei_node = TEI_E.node(*iter_layout_block_tei_children(layout_block))
     LOGGER.debug('xml: %r', etree.tostring(tei_node))
     bold_texts = get_tei_xpath_text_content_list(
         tei_node, './/tei:hi[@rend="bold"]')
     assert bold_texts == [TOKEN_2]
     italic_texts = get_tei_xpath_text_content_list(
         tei_node, './/tei:hi[@rend="italic"]')
     assert italic_texts == [TOKEN_2]
     # the full text still contains all three tokens, space separated
     assert get_text_content(tei_node) == ' '.join([TOKEN_1, TOKEN_2, TOKEN_3])
Exemplo n.º 14
0
def get_tei_affiliation_for_semantic_affiliation_address_element(
    semantic_affiliation_address: SemanticAffiliationAddress,
    context: TeiElementFactoryContext
) -> etree.ElementBase:
    """Build an `affiliation` element from a semantic affiliation address.

    The element contains, in order: the raw affiliation note, the TEI
    children of every non-address item, and (if there are any
    `SemanticAddressField` items) one `address` element grouping their
    TEI children.

    When the affiliation has a `content_id`, it is emitted as the `key`
    attribute and the `XML_ID` attribute is dropped from the defaults.
    """
    LOGGER.debug('semantic_affiliation_address: %s', semantic_affiliation_address)
    raw_affiliation = _get_tei_raw_affiliation_element_for_semantic_affiliation_address(
        semantic_affiliation_address,
        context=context
    )
    attributes = context.get_default_attributes_for_semantic_content(
        semantic_affiliation_address
    )
    if semantic_affiliation_address.content_id:
        # work on a copy so the mapping returned by the context is untouched
        attributes = {**attributes, 'key': semantic_affiliation_address.content_id}
        attributes.pop(XML_ID, None)
    affiliation_children = [attributes, raw_affiliation]
    address_fields = []
    for affiliation_item in semantic_affiliation_address:
        if isinstance(affiliation_item, SemanticAddressField):
            # address fields are collected and grouped at the end
            address_fields.append(affiliation_item)
        else:
            affiliation_children.extend(
                context.get_tei_child_elements_for_semantic_content(
                    affiliation_item
                )
            )
    LOGGER.debug('address_semantic_content_list: %r', address_fields)
    if address_fields:
        address_children = []
        for address_field in address_fields:
            address_children.extend(
                context.get_tei_child_elements_for_semantic_content(
                    address_field
                )
            )
        affiliation_children.append(TEI_E('address', *address_children))
    return TEI_E('affiliation', *affiliation_children)
Exemplo n.º 15
0
 def get_tei_element_for_semantic_content(
         self, semantic_content: SemanticContentWrapper,
         context: TeiElementFactoryContext) -> etree.ElementBase:
     """Build a `ref` element for a semantic citation.

     `type` is looked up in `CITATION_TYPE_BY_SEMANTIC_CLASS` by the
     citation's class and omitted when the lookup is empty. `target` is
     `#` followed by the target content id, and omitted when there is no
     target content id.
     """
     assert isinstance(semantic_content, SemanticCitation)
     ref_attributes = {}
     ref_type = CITATION_TYPE_BY_SEMANTIC_CLASS.get(type(semantic_content))
     if ref_type:
         ref_attributes['type'] = ref_type
     if semantic_content.target_content_id:
         ref_attributes['target'] = '#' + semantic_content.target_content_id
     ref_children = context.iter_layout_block_tei_children(
         semantic_content.merged_block)
     return TEI_E('ref', ref_attributes, *ref_children)
Exemplo n.º 16
0
def _get_tei_raw_affiliation_element_for_semantic_affiliation_address(
    semantic_affiliation_address: SemanticAffiliationAddress,
    context: TeiElementFactoryContext
) -> etree.ElementBase:
    """Build a `note` of type `raw_affiliation` for an affiliation address.

    Each item's merged block is rendered with coordinates disabled.
    `SemanticMarker` items are wrapped in a `label` element; all other
    items are added inline. The whitespace that follows one item is only
    emitted once another item follows it.
    """
    note_children: List[Union[str, dict, etree.ElementBase]] = [
        {'type': 'raw_affiliation'}
    ]
    trailing_whitespace: str = ''
    for affiliation_item in semantic_affiliation_address:
        item_block = affiliation_item.merged_block
        if trailing_whitespace:
            note_children.append(trailing_whitespace)
        block_children = context.iter_layout_block_tei_children(
            item_block, enable_coordinates=False
        )
        if isinstance(affiliation_item, SemanticMarker):
            note_children.append(TEI_E('label', *block_children))
        else:
            note_children.extend(block_children)
        trailing_whitespace = item_block.whitespace
    return TEI_E('note', *note_children)
Exemplo n.º 17
0
 def get_tei_element_for_semantic_content(
         self, semantic_content: SemanticContentWrapper,
         context: TeiElementFactoryContext) -> etree.ElementBase:
     """Build a `note` element for a semantic mixed note.

     The `type` attribute is the note's `note_type`, or `other` when that
     is falsy. Every child is converted via `context`.
     """
     LOGGER.debug('semantic_content: %s', semantic_content)
     assert isinstance(semantic_content, SemanticMixedNote)
     resolved_note_type = semantic_content.note_type or 'other'
     note_children = [
         context.get_default_attributes_for_semantic_content(
             semantic_content),
         {'type': resolved_note_type}
     ]
     for note_item in semantic_content:
         note_children.extend(
             context.get_tei_child_elements_for_semantic_content(note_item))
     return TEI_E('note', *note_children)
Exemplo n.º 18
0
def get_tei_raw_reference_list_element(
        semantic_reference_list: SemanticReferenceList,
        context: TeiElementFactoryContext) -> etree.ElementBase:
    """Build a `listBibl` element from a semantic reference list.

    `SemanticRawReference` and `SemanticReference` items are converted by
    their dedicated functions; any other item is converted via `context`.
    """
    list_builder = TeiElementBuilder(TEI_E('listBibl'))
    for list_item in semantic_reference_list:
        if isinstance(list_item, SemanticRawReference):
            raw_reference_element = _get_tei_raw_reference_element(
                list_item, context=context)
            list_builder.append(raw_reference_element)
        elif isinstance(list_item, SemanticReference):
            reference_element = get_tei_reference_element(
                list_item, context=context)
            list_builder.append(reference_element)
        else:
            list_builder.extend(
                context.get_tei_child_elements_for_semantic_content(
                    list_item))
    return list_builder.element
Exemplo n.º 19
0
 def get_tei_element_for_semantic_content(
     self,
     semantic_content: SemanticContentWrapper,
     context: TeiElementFactoryContext
 ) -> etree.ElementBase:
     """Build an `idno` element for a semantic external identifier.

     The element text is the identifier's `value`. A `type` attribute is
     added when the identifier has an `external_identifier_type`.
     """
     assert isinstance(semantic_content, SemanticExternalIdentifier)
     # only gather the debug details (including get_text()) when needed
     if LOGGER.isEnabledFor(logging.DEBUG):
         LOGGER.debug(
             'external_identifier: type=%r, value=%r, text=%r, content=%r',
             semantic_content.external_identifier_type,
             semantic_content.value,
             semantic_content.get_text(),
             semantic_content
         )
     idno_attributes = context.get_default_attributes_for_semantic_content(
         semantic_content
     )
     if semantic_content.external_identifier_type:
         idno_attributes = {
             **idno_attributes,
             'type': semantic_content.external_identifier_type
         }
     return TEI_E('idno', idno_attributes, semantic_content.value)
Exemplo n.º 20
0
def _get_tei_raw_reference_element(
        semantic_raw_ref: SemanticRawReference,
        context: TeiElementFactoryContext) -> etree.ElementBase:
    """Build a `biblStruct` element for a raw (unparsed) reference.

    `SemanticRawReferenceText` items become note elements of type
    `raw_reference`; any other item is converted via `context`.
    """
    LOGGER.debug('semantic_raw_ref: %s', semantic_raw_ref)
    ref_children = []
    for ref_item in semantic_raw_ref:
        if isinstance(ref_item, SemanticRawReferenceText):
            ref_children.append(
                create_tei_note_element('raw_reference',
                                        ref_item.merged_block))
        else:
            ref_children.extend(
                context.get_tei_child_elements_for_semantic_content(
                    ref_item))
    ref_attributes = context.get_default_attributes_for_semantic_content(
        semantic_raw_ref)
    return TEI_E('biblStruct', ref_attributes, *ref_children)
Exemplo n.º 21
0
 def get_tei_element_for_semantic_content(
         self, semantic_content: SemanticContentWrapper,
         context: TeiElementFactoryContext) -> etree.ElementBase:
     """Build a `graphic` element for a semantic graphic.

     The default attributes are requested with coordinates disabled.
     Optional attributes, each added only when its value is truthy:
     `url` from the relative path, and `coords` / `type` from the layout
     graphic (when there is one).
     """
     LOGGER.debug('semantic_content: %s', semantic_content)
     assert isinstance(semantic_content, SemanticGraphic)
     layout_graphic = semantic_content.layout_graphic
     graphic_attributes = [
         context.get_default_attributes_for_semantic_content(
             semantic_content, enable_coordinates=False)
     ]
     if semantic_content.relative_path:
         graphic_attributes.append({'url': semantic_content.relative_path})
     if layout_graphic and layout_graphic.coordinates:
         formatted_coordinates = format_coordinates(layout_graphic.coordinates)
         graphic_attributes.append({'coords': formatted_coordinates})
     if layout_graphic and layout_graphic.graphic_type:
         graphic_attributes.append({'type': layout_graphic.graphic_type})
     return TEI_E('graphic', *graphic_attributes)
Exemplo n.º 22
0
 def get_tei_children_for_semantic_content(
         self, semantic_content: SemanticContentWrapper,
         context: TeiElementFactoryContext) -> List[etree.ElementBase]:
     """Convert a semantic section into at most one `div` element.

     Figures and tables are skipped here. Acknowledgement sections get
     `type="acknowledgement"`. An empty list is returned when the `div`
     ends up without child elements.
     """
     LOGGER.debug('semantic_content: %s', semantic_content)
     assert isinstance(semantic_content, SemanticSection)
     section_builder = TeiElementBuilder(TEI_E('div'))
     # rendered at parent level
     skipped_types = (
         SemanticFigure,
         SemanticTable,
     )
     for section_item in semantic_content:
         if not isinstance(section_item, skipped_types):
             section_builder.extend(
                 context.get_tei_child_elements_for_semantic_content(
                     section_item))
     if semantic_content.section_type == SemanticSectionTypes.ACKNOWLEDGEMENT:
         section_builder.element.attrib['type'] = 'acknowledgement'
     if list(section_builder.element):
         return [section_builder.element]
     return []
Exemplo n.º 23
0
 def get_tei_element_for_semantic_content(
         self, semantic_content: SemanticContentWrapper,
         context: TeiElementFactoryContext) -> etree.ElementBase:
     """Build a `head` element for a semantic heading.

     `SemanticLabel` children are emitted as the `n` attribute rather than
     as text. All other children contribute the TEI children of their
     merged block, with coordinates disabled. The whitespace that follows
     one block is only emitted once another block follows it.
     """
     LOGGER.debug('semantic_content: %s', semantic_content)
     assert isinstance(semantic_content, SemanticHeading)
     head_children: T_ElementChildrenList = [
         context.get_default_attributes_for_semantic_content(
             semantic_content)
     ]
     trailing_whitespace = ''
     for heading_item in semantic_content:
         if not isinstance(heading_item, SemanticLabel):
             heading_block = heading_item.merged_block
             if trailing_whitespace:
                 head_children.append(trailing_whitespace)
             head_children.extend(
                 context.iter_layout_block_tei_children(
                     layout_block=heading_block, enable_coordinates=False))
             trailing_whitespace = heading_block.whitespace
         else:
             # labels do not affect the pending whitespace
             head_children.append({'n': heading_item.get_text()})
     return TEI_E('head', *head_children)
Exemplo n.º 24
0
 def set_abstract_layout_block(self, abstract_block: LayoutBlock):
     """Pass a `p` element built from a layout block to
     `set_child_element_at`, using the path teiHeader/profileDesc/abstract.
     """
     abstract_paragraph = TEI_E(
         'p', *iter_layout_block_tei_children(abstract_block))
     self.set_child_element_at(
         ['teiHeader', 'profileDesc', 'abstract'], abstract_paragraph)
Exemplo n.º 25
0
 def create_section(self) -> TeiSection:
     """Return a new `TeiSection` wrapping a fresh `div` element."""
     section_element = TEI_E('div')
     return TeiSection(section_element)
Exemplo n.º 26
0
 def create_paragraph(self) -> TeiSectionParagraph:
     """Return a new `TeiSectionParagraph` wrapping a fresh `p` element."""
     paragraph_element = TEI_E('p')
     return TeiSectionParagraph(paragraph_element)
Exemplo n.º 27
0
 def set_title(self, title: str):
     """Pass a main `title` element holding `title` to
     `set_child_element_at`, using the path teiHeader/fileDesc/titleStmt.

     The title has `level="a"` and `type="main"`.
     """
     title_element = TEI_E('title', title, level="a", type="main")
     self.set_child_element_at(
         ['teiHeader', 'fileDesc', 'titleStmt'], title_element)
Exemplo n.º 28
0
 def set_abstract(self, abstract: str):
     """Pass a `p` element holding `abstract` to `set_child_element_at`,
     using the path teiHeader/profileDesc/abstract.
     """
     abstract_paragraph = TEI_E('p', abstract)
     self.set_child_element_at(
         ['teiHeader', 'profileDesc', 'abstract'], abstract_paragraph)