def format_person_names(self, person_names, reverse=False):
    """
    From each tei:name/tei:persName/... in person_names, extracts a full name of the form
    'surname, forename [name link] [& (additional name)]' or (if reverse=True)
    'forename [name link] surname [<additional name>]'.
    However, if there is an @key, this will be used as the full name. Elements that have neither an @key
    nor both tei:surname and tei:forename fall back to their complete string value.

    Returns a list with one name per input element, in input order.
    """
    def first_text(element, xpath):
        # first text node matched by xpath; raises IndexError if there is none
        return element.xpath(xpath, namespaces=xml_ns)[0]

    names = []
    for person in person_names:
        if exists(person, '@key'):
            name = person.xpath('@key')[0]
        elif exists(person, 'tei:surname and tei:forename'):
            surname = first_text(person, 'tei:surname/text()')
            forename = first_text(person, 'tei:forename/text()')
            name_link = ''
            if exists(person, 'tei:nameLink'):
                name_link = ' ' + first_text(person, 'tei:nameLink/text()')
            add_name = ''
            if exists(person, 'tei:addName'):
                additional = first_text(person, 'tei:addName/text()')
                add_name = ' <' + additional + '>' if reverse else ' & (' + additional + ')'
            if reverse:
                # the name link belongs between forename and surname in this form
                name = forename + name_link + ' ' + surname + add_name
            else:
                name = surname + ', ' + forename + name_link + add_name
        else:
            # string(.) evaluates to a single string (not a list), so it must not be indexed:
            # indexing would only yield its first character
            name = str(person.xpath('string(.)'))
        names.append(name)
    return names
def get_publish_date_range(self, date: etree._Element):
    """
    Extracts the publication date (range) from a date element.

    If the element has an @from attribute, returns a dict {'start': <@from>} that additionally gets a
    'to' entry if an @to attribute is present. Otherwise, returns the first text node of the element
    (raising IndexError if the element has no text).
    """
    if exists(date, '@from'):
        # keep the element and the result in separate variables: the element is still needed for @to
        date_range = {'start': date.xpath('@from')[0]}
        if exists(date, '@to'):
            date_range['to'] = date.xpath('@to')[0]
        return date_range
    return date.xpath('text()')[0]
def transform_p(self, node, mode):
    """
    Renders a paragraph: outside of notes, the passed-through text is wrapped in newlines; within a
    tei:note, a trailing newline is added only if another tei:p follows as a sibling.
    """
    text = self.passthru(node, mode)
    if not exists(node, 'ancestor::tei:note'):
        return '\n' + text + '\n'
    if exists(node, 'following-sibling::tei:p'):
        return text + '\n'
    return text
def make_title(self, node: etree._Element) -> str:
    """
    Makes a title for a tei:charDecl or tei:char node.

    For tei:charDecl, this is the text of the English title in the titleStmt of the enclosing teiHeader;
    for a tei:char that has a tei:desc, it is the text of that description. Returns None for any other
    node. Raises IndexError if the expected title/description text is missing.
    """
    if exists(node, 'self::tei:charDecl'):
        # element names are case-sensitive ('teiHeader'); select the text rather than the element itself
        return str(
            node.xpath(
                'ancestor::tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title[@xml:lang = "en"]/text()',
                namespaces=xml_ns)[0])  # TODO i18n
    if exists(node, 'self::tei:char/tei:desc'):
        return str(node.xpath('tei:desc/text()', namespaces=xml_ns)[0])
    return None
def is_passagetrail_node(self, node):
    """
    Tells whether a node forms a part of a 'passagetrail'.
    Note (from the original author): assumes that get_elem_type(node) == True.

    A node qualifies if it is a tei:text of type "work_volume", a tei:div or tei:milestone whose
    @type/@unit entry in citation_labels has a truthy 'isCiteRef', a tei:pb without @sameAs/@corresp,
    or any element whose local name has a citation_labels entry with a truthy 'isCiteRef'.
    """
    name = etree.QName(node).localname
    if exists(node, 'self::tei:text[@type = "work_volume"]'):
        return True
    # direct subscripting: an unknown @type/@unit raises KeyError here
    if exists(node, 'self::tei:div') and citation_labels[node.get('type')].get('isCiteRef'):
        return True
    if exists(node, 'self::tei:milestone') and citation_labels[node.get('unit')].get('isCiteRef'):
        return True
    if exists(node, 'self::tei:pb[not(@sameAs or @corresp)]'):
        return True
    label = citation_labels.get(name)
    return bool(label and label.get('isCiteRef'))
def transform_item(self, node, mode):
    """
    Renders a list item as one line: a marker depending on the parent list's @type ('# ' for
    "numbered", ' ' for "simple", '- ' otherwise), the item text, and a newline. The text is only
    passed through for basic nodes or nodes with a basic ancestor; otherwise it stays empty.
    """
    # TODO test this, esp. with more complicated/nested lists
    if self.analysis.is_basic_node(node) or self.analysis.has_basic_ancestor(node):
        text = self.passthru(node, mode)
    else:
        text = ''
    if exists(node, 'parent::tei:list/@type = "numbered"'):
        leading = '# '
    elif exists(node, 'parent::tei:list/@type = "simple"'):
        leading = ' '
    else:
        leading = '- '
    return leading + text + '\n'
def get_citable_ancestors(self, node: etree._Element, node_type: str, mode: str):
    """
    Collects all citetrail or passagetrail ancestors of a node (mode: 'citetrail' vs 'passagetrail').
    The result is the reverse of the order in which 'ancestor::*' yields the ancestors.
    """
    def admitted_by_mode(anc):
        # in citetrail mode every ancestor is a candidate; in passagetrail mode only passagetrail nodes are
        return mode == 'citetrail' or (mode == 'passagetrail' and self.is_passagetrail_node(anc))

    if node_type in ('marginal', 'anchor'):
        # marginals and anchors must not have p (or some other "main" node) as their parent
        def qualifies(anc):
            return self.is_structural_node(anc)
    elif node_type == 'page':
        # within front, back, and single volumes, citable parent resolves to one of those elements for
        # avoiding collisions with identically named pb in other parts
        # note: this makes all other pb appear outside of any structural hierarchy, but this should be fine
        def qualifies(anc):
            return exists(anc, 'self::tei:front or self::tei:back'
                               ' or self::tei:text[1][not(@xml:id = "completeWork" or @type = "work_part")]')
    else:
        def qualifies(anc):
            return self.get_node_type(anc)

    selected = [anc for anc in node.xpath('ancestor::*') if admitted_by_mode(anc) and qualifies(anc)]
    return list(reversed(selected))
def transform_bibl(self, node, mode):
    """
    Passes a bibl element through; in 'edit' mode, a bibl with an @sortKey additionally gets the sort key
    appended in square brackets, with underscores replaced by ', '.
    """
    if not (mode == 'edit' and exists(node, '@sortKey')):
        return self.passthru(node, mode)
    text = self.passthru(node, mode)
    sort_key = re.sub(r'_', ', ', node.get('sortKey'))  # TODO revision of bibl/@sortKey
    return text + ' [' + sort_key + ']'
def transform_orig_elem(self, node, mode):
    """
    Passes an 'original' element through in 'orig' mode, or whenever its parent tei:choice offers no
    tei:expan, tei:corr or tei:reg child; otherwise the element is suppressed (empty string).
    """
    if mode != 'orig' and exists(
            node,
            'parent::tei:choice/*[self::tei:expan or self::tei:corr or self::tei:reg]'):
        return ''
    return self.passthru(node, mode)
def get_citetrail_prefix(self, node: etree._Element, node_type: str):
    """
    Returns the 'categorical' keyword/string that citetrails of certain node types are always prefixed
    with, or '' if the node does not get such a prefix.
    """
    name = etree.QName(node).localname
    if node_type == 'page':
        return 'p'
    if node_type == 'marginal':
        return 'n'
    if node_type == 'anchor':
        # only milestones with a @unit are prefixed (by that unit)
        if exists(node, 'self::tei:milestone[@unit]'):
            return node.get('unit')
        return ''
    if node_type == 'structural':
        if name == 'front':
            return 'frontmatter'
        if name == 'back':
            return 'backmatter'
        if exists(node, 'self::tei:text[@type = "work_volume"]'):
            return 'vol'
        return ''
    if node_type == 'main':
        if exists(node, 'self::tei:head'):
            return 'heading'
        if exists(node, 'self::tei:titlePage'):
            return 'titlepage'
        return ''
    if node_type == 'list':
        if exists(node, 'self::tei:list[@type = "dict" or @type = "index"]'):
            return node.get('type')
        if exists(node, 'self::tei:item[ancestor::tei:list[@type = "dict"]]'):
            return 'entry'
    return ''
def make_resource_metadata(self, tei_header: etree._Element, config, wid: str):
    """
    Translates data from the teiHeader of a work to DTS+DC metadata for a DTS textual Resource.

    tei_header is the header element that all XPaths below are evaluated against, config supplies the
    work id (get_wid()) and the cite depth (get_cite_depth()), and wid is passed on to
    make_bibliographic_citations(). Returns the metadata as a (JSON-LD style) dict.
    Raises IndexError if one of the single-valued fields (short/main title, edition/@n, series volume,
    source main title, text of an extent) is missing from the header.
    """
    def first(xpath):
        # first result of an XPath that is expected to match at least once
        return tei_header.xpath(xpath, namespaces=xml_ns)[0]

    # 1.) gather metadata
    # a) digital edition
    # TODO @id shouldn't be the same as the @id of the parent collection, but sth more specific
    resource_id = id_server + '/texts/' + config.get_wid()
    title = first('tei:fileDesc/tei:titleStmt/tei:title[@type = "short"]/text()')
    alt_title = first('tei:fileDesc/tei:titleStmt/tei:title[@type = "main"]/text()')  # or short title here?
    author = '; '.join(
        self.format_person_names(
            tei_header.xpath('tei:fileDesc/tei:titleStmt/tei:author/tei:persName', namespaces=xml_ns),
            reverse=False))
    scholarly_editors = self.format_person_names(
        tei_header.xpath('tei:fileDesc/tei:titleStmt/' + 'tei:editor[contains(@role, "#scholarly")]/tei:persName',
                         namespaces=xml_ns))
    technical_editors = self.format_person_names(
        tei_header.xpath('tei:fileDesc/tei:titleStmt/' + 'tei:editor[contains(@role, "#technical")]/tei:persName',
                         namespaces=xml_ns))
    # de-duplicate while keeping first-occurrence order, so that the output is stable between runs
    editors = list(dict.fromkeys(scholarly_editors + technical_editors))
    pub_date = self.get_publish_date(tei_header)
    version = first('tei:fileDesc/tei:editionStmt/tei:edition/@n')
    series_volume = first('tei:fileDesc/tei:seriesStmt/tei:biblScope[@unit = "volume"]/@n')
    rights_holder = {
        '@id': 'https://id.salamanca.school',
        'name': {
            '@language': 'en',
            '@value': 'The School of Salamanca'
        }
    }  # TODO provisional values
    bibliographic_citation = self.make_bibliographic_citations(tei_header, wid)

    # b) print source
    source_title = first('tei:fileDesc/tei:sourceDesc/tei:biblStruct/tei:monogr/tei:title[@type = "main"]/text()')
    source_publishers = self.format_person_names(self.get_source_publishers(tei_header), reverse=False)
    source_extents = tei_header.xpath('tei:fileDesc/tei:sourceDesc/tei:biblStruct/tei:monogr/tei:extent',
                                      namespaces=xml_ns)
    source_extents_i18n = []
    for extent in source_extents:
        if exists(extent, '@xml:lang'):
            # language-tagged extents become language/value objects, others stay plain strings
            source_extents_i18n.append({
                '@language': extent.xpath('@xml:lang', namespaces=xml_ns)[0],
                '@value': extent.xpath('text()')[0]
            })
        else:
            source_extents_i18n.append(extent.xpath('text()')[0])
    source_lang = list(tei_header.xpath('tei:profileDesc/tei:langUsage/tei:language/@ident', namespaces=xml_ns))
    source_pub_date = self.get_source_publish_date(tei_header)
    source_pub_place = self.get_source_publish_place(tei_header)
    source_repositories = self.get_source_repositories(tei_header)

    # c) other dts metadata
    total_items = 0  # TODO
    dts_total_children = 0  # TODO
    dts_total_parents = 1  # resource is part only of the parent collection that represents the work
    dts_cite_depth = config.get_cite_depth()  # TODO multivol vs singlevol?

    # 2.) construct metadata object
    resource_metadata = {
        '@context': context,
        '@id': resource_id,
        '@type': 'Resource',
        'title': title,
        'totalItems': total_items,
        'dts:totalParents': dts_total_parents,
        'dts:totalChildren': dts_total_children,
        'dts:citeDepth': dts_cite_depth,
        'dts:dublincore': {
            'dc:title': title,
            'dc:alternative': alt_title,
            # TODO editors as "contributors"? information can be found also in sal:...Editors
            'dc:contributor': editors,
            'dc:type': [
                'http://purl.org/spar/fabio/work',
                'dc:Text'
            ],
            'dc:created': pub_date,
            'dc:bibliographicCitation': bibliographic_citation,
            'dc:rightsHolder': rights_holder,
            'dc:license': 'http://creativecommons.org/licenses/by/4.0/',
            'dc:source': {
                'dc:title': source_title,
                'dc:creator': author,
                'dc:publisher': source_publishers,
                'dc:format': source_extents_i18n,
                'dc:language': source_lang,
                'dc:created': source_pub_date
            }
        },
        'dts:extensions': {  # supplementary information that doesn't easily fit into dts/dc elements
            'sal:version': version,  # the version of this edition of the text
            'sal:scholarlyEditors': scholarly_editors,
            'sal:technicalEditors': technical_editors,  # omitting "#additional" editors here
            'sal:seriesVolume': series_volume,
            # there seem to be no dc elements for this type of information...
            'sal:sourcePublishPlace': source_pub_place,
            'sal:sourceRepositories': source_repositories
        }
    }
    return resource_metadata
def get_place_name(self, place_name: etree._Element):
    """
    Returns the name of a place: the value of the element's @key if it has one, its text otherwise.
    """
    if not exists(place_name, '@key'):
        return place_name.text
    return place_name.get('key')
def get_node_title(self, node):
    """
    Makes a title for a node, depending on its element name.

    For div, item, list and milestone elements, a non-numeric @n (i.e., one that does not consist solely
    of digits and square brackets) is preferred and returned in double quotes; further fallbacks are a
    teaser of a child tei:head or tei:label (not for milestone), the plain @n, a teaser of a tei:ref in
    the text that targets this node and, for div only, a teaser of the head/label of a child list.
    lg uses its head or its own content, note its quoted @n, pb its @n (prefixed with 'p. ' unless it
    starts with 'fol.'), a "work_volume" text its @n, and head/label/p/signed/titlePart a teaser of
    themselves. If nothing applies, the title is ''.

    Raises IndexError if the node has no @xml:id.
    """
    name = etree.QName(node).localname
    xml_id = node.xpath('@xml:id', namespaces=xml_ns)[0]
    n = node.get('n')
    # an @n that is more than a mere number (digits and square brackets) can serve as a title by itself
    has_verbal_n = bool(n) and not re.match(r'^[\d\[\]]+$', n)
    # references within the text that point to this node
    ref_xpath = 'ancestor::tei:TEI//tei:text//tei:ref[@target = "#' + xml_id + '"]'

    def teaser_of(xpath):
        # teaser of the first node matched by xpath (relative to the current node)
        return self.make_node_teaser(node.xpath(xpath, namespaces=xml_ns)[0])

    title = ''
    if name == 'div':
        if has_verbal_n:
            title = '"' + n + '"'
        elif exists(node, 'tei:head'):
            title = teaser_of('tei:head[1]')
        elif exists(node, 'tei:label'):
            title = teaser_of('tei:label[1]')
        elif n and node.get('type'):
            title = n
        elif exists(node, ref_xpath):
            title = teaser_of(ref_xpath + '[1]')
        elif exists(node, 'tei:list/tei:head'):
            title = teaser_of('tei:list/tei:head[1]')
        elif exists(node, 'tei:list/tei:label'):
            title = teaser_of('tei:list/tei:label[1]')
    elif name == 'item':
        # TODO items of dict. lists might use the @key of their first tei:term as title; this needs
        #  revision when we really have such dict. lists
        if has_verbal_n:
            title = '"' + n + '"'
        elif exists(node, 'tei:head'):
            title = teaser_of('tei:head[1]')
        elif exists(node, 'tei:label'):
            title = teaser_of('tei:label[1]')
        elif n:
            title = n
        elif exists(node, ref_xpath):
            title = teaser_of(ref_xpath + '[1]')
    elif name == 'lg':
        if exists(node, 'tei:head'):
            title = teaser_of('tei:head[1]')
        else:
            title = self.make_node_teaser(node)
    elif name == 'list':
        if has_verbal_n:
            title = '"' + n + '"'
        elif exists(node, 'tei:head'):
            title = teaser_of('tei:head[1]')
        elif exists(node, 'tei:label'):
            title = teaser_of('tei:label[1]')
        elif n:
            title = n
        elif exists(node, ref_xpath):
            title = teaser_of(ref_xpath + '[1]')
    elif name == 'milestone':
        if has_verbal_n:
            title = '"' + n + '"'
        elif n:
            title = n
        elif exists(node, ref_xpath):
            title = teaser_of(ref_xpath + '[1]')
    elif name == 'note':
        if n:
            title = '"' + n + '"'
    elif name == 'pb':
        if n and re.match(r'fol\.', n):
            title = n
        else:
            # one could also prepend a 'Vol. ' prefix here in case of a multivolume work
            title = 'p. ' + n
    elif name == 'text':
        if node.get('type') == 'work_volume':
            title = n
    elif name in ('head', 'label', 'p', 'signed', 'titlePart'):
        title = self.make_node_teaser(node)
    return title