示例#1
0
 def format_person_names(self, person_names, reverse=False):
     """
     Builds a display name for each tei:name/tei:persName/... element in person_names.

     For every element, the first applicable rule is used:
     1. If there is an @key, its value is used as the full name.
     2. If there are both a tei:surname and a tei:forename, the name is assembled as
        'surname, forename[ name link][ & (additional name)]' or, if reverse=True, as
        'forename[ name link] surname[ <additional name>]'.
     3. Otherwise, the complete string value of the element is used.

     :param person_names: iterable of name elements
     :param reverse: if True, the forename (and name link) precede the surname
     :return: list with one name string per input element, in input order
     """
     names = []
     for person in person_names:
         if exists(person, '@key'):
             name = person.xpath('@key')[0]
         elif exists(person, 'tei:surname and tei:forename'):
             surname = person.xpath('tei:surname/text()', namespaces=xml_ns)[0]
             forename = person.xpath('tei:forename/text()', namespaces=xml_ns)[0]
             name_link = ''
             if exists(person, 'tei:nameLink'):
                 name_link = ' ' + person.xpath('tei:nameLink/text()', namespaces=xml_ns)[0]
             add_name = ''
             if exists(person, 'tei:addName'):
                 add_name_text = person.xpath('tei:addName/text()', namespaces=xml_ns)[0]
                 if reverse:
                     add_name = ' <' + add_name_text + '>'
                 else:
                     add_name = ' & (' + add_name_text + ')'
             if reverse:
                 # the name link belongs between forename and surname ('forename [name link] surname')
                 name = forename + name_link + ' ' + surname + add_name
             else:
                 name = surname + ', ' + forename + name_link + add_name
         else:
             # XPath string() evaluates to a single string rather than a node list, so it must not be
             # indexed: [0] would keep only the first character and fail on an empty string
             name = str(person.xpath('string(.)'))
         names.append(name)
     return names
示例#2
0
 def get_publish_date_range(self, date: etree._Element):
     """
     Extracts a publication date or date range from a date element.

     :param date: the date element; either carries @from (and optionally @to) or has a text value
     :return: a dict {'start': <@from>[, 'to': <@to>]} if the element has an @from, otherwise the first text
         node of the element
     """
     if exists(date, '@from'):
         # the range is collected in its own dict: rebinding `date` here would make the @to lookup below
         # operate on the dict instead of the element
         date_range = {'start': date.xpath('@from')[0]}
         if exists(date, '@to'):
             date_range['to'] = date.xpath('@to')[0]
         return date_range
     else:
         return date.xpath('text()')[0]
示例#3
0
 def transform_p(self, node, mode):
     """
     Renders a paragraph from the passthru output of its content.

     Paragraphs outside of notes are wrapped in a leading and a trailing newline. Paragraphs within a
     tei:note get only a trailing newline, and only if another tei:p sibling follows.
     """
     content = self.passthru(node, mode)
     if not exists(node, 'ancestor::tei:note'):
         return '\n' + content + '\n'
     # inside a note: separate from a following paragraph, but add nothing after the last one
     separator = '\n' if exists(node, 'following-sibling::tei:p') else ''
     return content + separator
示例#4
0
 def make_title(self, node: etree._Element) -> str:
     """
     Makes a title for a tei:charDecl or tei:char node.

     :param node: the node to make a title for
     :return: for tei:charDecl, the text of the English title in the titleStmt of the enclosing teiHeader;
         for a tei:char with a tei:desc child, the text of that tei:desc; None for any other node
     """
     if exists(node, 'self::tei:charDecl'):
         # element names are case-sensitive ('teiHeader'); select the text node rather than the element,
         # since str() of an element yields its object representation instead of its content
         return str(
             node.xpath(
                 'ancestor::tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title[@xml:lang = "en"]/text()',
                 namespaces=xml_ns)[0])  # TODO i18n
     elif exists(node, 'self::tei:char/tei:desc'):
         return str(node.xpath('tei:desc/text()', namespaces=xml_ns)[0])
     else:
         return None
示例#5
0
 def is_passagetrail_node(self, node):
     """
     Determines if a node constitutes a 'passagetrail' part.
     Note: assumes that get_elem_type(node) == True.

     A node qualifies if it is a tei:text of @type "work_volume", a tei:pb without @sameAs/@corresp, or if
     the citation_labels entry for its div/@type, milestone/@unit or element name has a truthy 'isCiteRef'.

     :return: True if the node is a passagetrail part, False otherwise
     """
     name = etree.QName(node).localname

     def is_cite_ref(label_key):
         # look up leniently: a missing @type/@unit (None) or a value without an entry in citation_labels
         # simply does not qualify, instead of raising a KeyError
         return (citation_labels.get(label_key) or {}).get('isCiteRef')

     return bool(exists(node, 'self::tei:text[@type = "work_volume"]')
                 or (exists(node, 'self::tei:div') and is_cite_ref(node.get('type')))
                 or (exists(node, 'self::tei:milestone') and is_cite_ref(node.get('unit')))
                 or exists(node, 'self::tei:pb[not(@sameAs or @corresp)]')
                 or is_cite_ref(name))
示例#6
0
 def transform_item(self, node, mode):  # TODO test this, esp. with more complicated/nested lists
     """
     Renders a list item as a single line: a marker, the item content and a newline.

     The content is the passthru output if the node is a basic node or has a basic ancestor (according to
     self.analysis), otherwise it is empty. The marker is '# ' for items of "numbered" lists, ' ' for items
     of "simple" lists and '- ' for all others.
     """
     analysis = self.analysis
     if analysis.is_basic_node(node) or analysis.has_basic_ancestor(node):
         content = self.passthru(node, mode)
     else:
         content = ''
     if exists(node, 'parent::tei:list/@type = "numbered"'):
         marker = '# '
     elif exists(node, 'parent::tei:list/@type = "simple"'):
         marker = ' '
     else:
         marker = '- '
     return marker + content + '\n'
示例#7
0
 def get_citable_ancestors(self, node: etree._Element, node_type: str, mode: str):
     """
     Collects the citetrail or passagetrail ancestors of a node, in reverse order of the 'ancestor::*'
     result.

     :param node: the node whose ancestors are inspected
     :param node_type: type of the node; 'marginal', 'anchor' and 'page' get special treatment
     :param mode: 'citetrail' (every ancestor is a candidate) or 'passagetrail' (only ancestors accepted by
         is_passagetrail_node are candidates); any other mode yields an empty list
     :return: list of the matching ancestor elements
     """
     def admitted_by_mode(anc):
         return mode == 'citetrail' or (mode == 'passagetrail' and self.is_passagetrail_node(anc))

     if node_type in ('marginal', 'anchor'):
         # marginals and anchors must not have p (or some other "main" node) as their parent
         def is_citable(anc):
             return self.is_structural_node(anc)
     elif node_type == 'page':
         # within front, back, and single volumes, citable parent resolves to one of those elements for avoiding
         # collisions with identically named pb in other parts
         # note: this makes all other pb appear outside of any structural hierarchy, but this should be fine
         def is_citable(anc):
             return exists(anc, 'self::tei:front or self::tei:back'
                                + ' or self::tei:text[1][not(@xml:id = "completeWork" or @type = "work_part")]')
     else:
         def is_citable(anc):
             return self.get_node_type(anc)

     selected = [anc for anc in node.xpath('ancestor::*') if admitted_by_mode(anc) and is_citable(anc)]
     # list.reverse() works in place and returns None, hence it is called as a separate statement
     selected.reverse()
     return selected
示例#8
0
 def transform_bibl(self, node, mode):
     """
     Renders a bibl element from the passthru output of its content.

     In 'edit' mode, a bibl with a @sortKey additionally gets the sort key appended in square brackets,
     with every underscore replaced by ', '.
     """
     if mode != 'edit' or not exists(node, '@sortKey'):
         return self.passthru(node, mode)
     content = self.passthru(node, mode)
     readable_key = re.sub(r'_', ', ', node.get('sortKey'))
     return content + ' [' + readable_key + ']'  # TODO revision of bibl/@sortKey
示例#9
0
 def transform_orig_elem(self, node, mode):
     """
     Renders an element of the original text.

     Outside of 'orig' mode, the element is suppressed (empty string) if its parent tei:choice also offers
     a tei:expan, tei:corr or tei:reg; in every other case its content is passed through.
     """
     has_edited_alternative = 'parent::tei:choice/*[self::tei:expan or self::tei:corr or self::tei:reg]'
     if mode != 'orig' and exists(node, has_edited_alternative):
         return ''
     return self.passthru(node, mode)
示例#10
0
 def get_citetrail_prefix(self, node: etree._Element, node_type: str):
     """
     Citetrails for certain node types are always prefixed with a 'categorical' keyword/string.

     :param node: the node to get the prefix for
     :param node_type: the type of the node ('page', 'marginal', 'anchor', 'structural', 'main', 'list')
     :return: the prefix, or an empty string if no prefix applies to the node
     """
     name = etree.QName(node).localname
     if node_type == 'page':
         return 'p'
     if node_type == 'marginal':
         return 'n'
     if node_type == 'anchor':
         # only milestones with a @unit are prefixed (with the value of that @unit)
         if exists(node, 'self::tei:milestone[@unit]'):
             return node.get('unit')
         return ''
     if node_type == 'structural':
         matter_prefixes = {'front': 'frontmatter', 'back': 'backmatter'}
         if name in matter_prefixes:
             return matter_prefixes[name]
         if exists(node, 'self::tei:text[@type = "work_volume"]'):
             return 'vol'
         return ''
     if node_type == 'main':
         if exists(node, 'self::tei:head'):
             return 'heading'
         if exists(node, 'self::tei:titlePage'):
             return 'titlepage'
         return ''
     if node_type == 'list':
         if exists(node, 'self::tei:list[@type = "dict" or @type = "index"]'):
             return node.get('type')
         if exists(node, 'self::tei:item[ancestor::tei:list[@type = "dict"]]'):
             return 'entry'
     return ''
示例#11
0
    def make_resource_metadata(self, tei_header: etree._Element, config, wid: str):
        """
        Translates data from the teiHeader of a work to DTS+DC metadata for a DTS textual Resource.

        :param tei_header: the teiHeader element of the work
        :param config: configuration object; provides get_wid() (used for the resource @id) and
            get_cite_depth()
        :param wid: id of the work, passed on to make_bibliographic_citations
        :return: the metadata of the resource as a (JSON-LD-style) dict
        """

        # 1.) gather metadata
        # a) digital edition
        resource_id = id_server + '/texts/' + config.get_wid()
        # TODO @id shouldn't be the same as the @id of the parent collection, but sth more specific
        title = tei_header.xpath('tei:fileDesc/tei:titleStmt/tei:title[@type = "short"]/text()', namespaces=xml_ns)[0]
        alt_title = tei_header.xpath('tei:fileDesc/tei:titleStmt/tei:title[@type = "main"]/text()', namespaces=xml_ns)[0]  # or short title here?
        author = '; '.join(self.format_person_names(tei_header.xpath('tei:fileDesc/tei:titleStmt/tei:author/tei:persName',
                                                                     namespaces=xml_ns), reverse=False))
        scholarly_editors = self.format_person_names(tei_header.xpath('tei:fileDesc/tei:titleStmt/' +
                                                                      'tei:editor[contains(@role, "#scholarly")]/tei:persName',
                                                                      namespaces=xml_ns))
        technical_editors = self.format_person_names(tei_header.xpath('tei:fileDesc/tei:titleStmt/' +
                                                                      'tei:editor[contains(@role, "#technical")]/tei:persName',
                                                                      namespaces=xml_ns))
        # remove duplicates (editors with both roles) while keeping the order of first occurrence, so that
        # the output is the same on every run (the iteration order of a set of strings is not stable)
        editors = list(dict.fromkeys(scholarly_editors + technical_editors))
        pub_date = self.get_publish_date(tei_header)
        version = tei_header.xpath('tei:fileDesc/tei:editionStmt/tei:edition/@n', namespaces=xml_ns)[0]
        series_volume = tei_header.xpath('tei:fileDesc/tei:seriesStmt/tei:biblScope[@unit = "volume"]/@n',
                                         namespaces=xml_ns)[0]
        rights_holder = {
            '@id': 'https://id.salamanca.school',
            'name': {
                '@language': 'en',
                '@value': 'The School of Salamanca'
            }
        }  # TODO provisional values
        bibliographic_citation = self.make_bibliographic_citations(tei_header, wid)

        # b) print source
        source_title = tei_header.xpath('tei:fileDesc/tei:sourceDesc/tei:biblStruct/tei:monogr/tei:title[@type = "main"]/text()',
                                        namespaces=xml_ns)[0]
        source_publishers = self.format_person_names(self.get_source_publishers(tei_header), reverse=False)
        source_extents = tei_header.xpath('tei:fileDesc/tei:sourceDesc/tei:biblStruct/tei:monogr/tei:extent',
                                          namespaces=xml_ns)
        # extents with a language tag become language-tagged values, all others plain strings
        source_extents_i18n = []
        for extent in source_extents:
            if exists(extent, '@xml:lang'):
                extent_i18n = {
                    '@language': extent.xpath('@xml:lang', namespaces=xml_ns)[0],
                    '@value': extent.xpath('text()')[0]
                }
                source_extents_i18n.append(extent_i18n)
            else:
                source_extents_i18n.append(extent.xpath('text()')[0])
        source_lang = list(tei_header.xpath('tei:profileDesc/tei:langUsage/tei:language/@ident', namespaces=xml_ns))
        source_pub_date = self.get_source_publish_date(tei_header)
        source_pub_place = self.get_source_publish_place(tei_header)
        source_repositories = self.get_source_repositories(tei_header)

        # c) other dts metadata
        total_items = 0  # TODO
        dts_total_children = 0  # TODO
        dts_total_parents = 1  # resource is part only of the parent collection that represents the work
        dts_cite_depth = config.get_cite_depth()  # TODO multivol vs singlevol?

        # 2.) construct metadata object
        resource_metadata = {
            '@context': context,
            '@id': resource_id,
            '@type': 'Resource',
            'title': title,
            'totalItems': total_items,
            'dts:totalParents': dts_total_parents,
            'dts:totalChildren': dts_total_children,
            'dts:citeDepth': dts_cite_depth,
            'dts:dublincore': {
                'dc:title': title,
                'dc:alternative': alt_title,
                'dc:contributor': editors,  # TODO editors as "contributors"? information can be found also in sal:...Editors
                'dc:type': [
                    'http://purl.org/spar/fabio/work',
                    'dc:Text'
                ],
                'dc:created': pub_date,
                'dc:bibliographicCitation': bibliographic_citation,
                'dc:rightsHolder': rights_holder,
                'dc:license': 'http://creativecommons.org/licenses/by/4.0/',
                'dc:source': {
                    'dc:title': source_title,
                    'dc:creator': author,
                    'dc:publisher': source_publishers,
                    'dc:format': source_extents_i18n,
                    'dc:language': source_lang,
                    'dc:created': source_pub_date
                }
            },
            'dts:extensions': {  # supplementary information that doesn't easily fit into dts/dc elements
                'sal:version': version,  # the version of this edition of the text
                'sal:scholarlyEditors': scholarly_editors,
                'sal:technicalEditors': technical_editors,
                # omitting "#additional" editors here
                'sal:seriesVolume': series_volume,
                'sal:sourcePublishPlace': source_pub_place,  # there seem to be no dc elements for this type of information...
                'sal:sourceRepositories': source_repositories
            }
        }
        return resource_metadata
示例#12
0
 def get_place_name(self, place_name: etree._Element):
     """
     Gets the name of a place: the value of @key if the element has one, otherwise the text of the element.
     """
     has_key = exists(place_name, '@key')
     return place_name.get('key') if has_key else place_name.text
示例#13
0
 def get_node_title(self, node):
     """
     Makes a title for a node, depending on the name of the element.

     For div, item, list and milestone elements, the first available source is used: an @n that does not
     consist solely of digits and square brackets (put in quotation marks), a teaser of a head or label
     child (not for milestones), a plain @n (for div: only if there is also a @type), a teaser of the first
     tei:ref within tei:text that targets the node and, for div only, a teaser of the head or label of a
     child list. Further elements: lg (teaser of its head or of itself), note (quoted @n), pb (@n, prefixed
     with 'p. ' unless it starts with 'fol.'), text of @type "work_volume" (@n) and head, label, p, signed
     and titlePart (teaser of the node itself).

     :param node: the node to make a title for; must have an @xml:id (IndexError otherwise)
     :return: the title, or an empty string if none could be determined
     """
     name = etree.QName(node).localname
     xml_id = node.xpath('@xml:id', namespaces=xml_ns)[0]
     n = node.get('n')
     # an @n that consists solely of digits and/or square brackets is not used as a quoted title
     has_quotable_n = bool(n) and not re.match(r'^[\d\[\]]+$', n)
     # selects the references (within the text) that point to this node
     refs_to_node = 'ancestor::tei:TEI//tei:text//tei:ref[@target = "#' + xml_id + '"]'

     def teaser_of(xpath):
         # teaser of the first node selected by xpath; the namespace bindings are always passed, since
         # lxml cannot resolve the 'tei:' prefix without them
         return self.make_node_teaser(node.xpath(xpath, namespaces=xml_ns)[0])

     title = ''
     if name == 'div':
         if has_quotable_n:
             title = '"' + n + '"'
         elif exists(node, 'tei:head'):
             title = teaser_of('tei:head[1]')
         elif exists(node, 'tei:label'):
             title = teaser_of('tei:label[1]')
         elif n and node.get('type'):
             title = n
         elif exists(node, refs_to_node):
             title = teaser_of(refs_to_node + '[1]')
         elif exists(node, 'tei:list/tei:head'):
             title = teaser_of('tei:list/tei:head[1]')
         elif exists(node, 'tei:list/tei:label'):
             title = teaser_of('tei:list/tei:label[1]')
     elif name == 'item' or name == 'list':
         # TODO items of dict. lists (parent::tei:list[@type="dict"]) could use the @key of their first
         #  descendant tei:term as title; this needs revision when we really have such dict. lists
         if has_quotable_n:
             title = '"' + n + '"'
         elif exists(node, 'tei:head'):
             title = teaser_of('tei:head[1]')
         elif exists(node, 'tei:label'):
             title = teaser_of('tei:label[1]')
         elif n:
             title = n
         elif exists(node, refs_to_node):
             title = teaser_of(refs_to_node + '[1]')
     elif name == 'lg':
         if exists(node, 'tei:head'):
             title = teaser_of('tei:head[1]')
         else:
             title = self.make_node_teaser(node)
     elif name == 'milestone':
         if has_quotable_n:
             title = '"' + n + '"'
         elif n:
             title = n
         elif exists(node, refs_to_node):
             title = teaser_of(refs_to_node + '[1]')
     elif name == 'note':
         if n:
             title = '"' + n + '"'
     elif name == 'pb':
         # a pb without @n gets no title (rather than failing on the string concatenation)
         if n and re.match(r'fol\.', n):
             title = n
         elif n:
             title = 'p. ' + n
         # one could also prepend a 'Vol. ' prefix here in case of a multivolume work
     elif name == 'text':
         if node.get('type') == 'work_volume':
             title = n
     elif name == 'head' or name == 'label' or name == 'p' or name == 'signed' or name == 'titlePart':
         title = self.make_node_teaser(node)
     return title