folders.remove(("verschiedenes", )) if len(folders) > 1 and ("paralipomena", ) in folders: folders.remove(("paralipomena", )) if len(folders) > 1: folders.sort() folders.reverse() for folder in folders: if "faust" == folder[0]: documents_struct[gsa_ident] = [folder] break if len(documents_struct[gsa_ident]) > 1: documents_struct[gsa_ident] = [folders[-1]] for gsa_ident in gsa_documents: gathering_path = "/".join(("transcript", "gsa", gsa_ident)) transcript_dir = faust.absolute_path(gathering_path) document_xml = gsa_documents[gsa_ident] text = None pages = list() for f in os.listdir(transcript_dir): if not f.endswith(".xml"): continue f_ident = re.search(r'[0-9]+', f).group(0) if f_ident == gsa_ident: text = f continue if int(f_ident) == 1: continue pages.append(f) if len(pages) > 0: document_xml.set(xml_ns + "base", "faust://xml/" + gathering_path + "/") if text is not None:
# Updates the all TEI headers based on a template # import copy import sys import lxml.etree import faust # XPath expressions for extraction of templated header contents handNotes_xp = faust.xpath("//tei:teiHeader/tei:profileDesc/tei:handNotes") charDecl_xp = faust.xpath("//tei:teiHeader/tei:encodingDesc/tei:charDecl") # Get the template and parse it tei_template = faust.absolute_path("template/tei.xml") template = lxml.etree.parse(tei_template) # extract relevant header fragments from template template_hand_notes = handNotes_xp(template)[0] template_char_decl = charDecl_xp(template)[0] def replace(node, with_node): '''Replaces a node with a deep copy of a node (from another document)''' node.getparent().replace(node, copy.deepcopy(with_node)) # iterate over TEI files (excluding the template) for xml_file in faust.xml_files(): try: if (xml_file != tei_template) and faust.is_tei_document(xml_file):
def to_convert_unencoded():
    """Return the statically-convertible files that contain no encoded transcript.

    Resolves every relative path yielded by static_to_convert() to an
    absolute one, then keeps only the files that do NOT match the
    project's "encoded" XPath test.
    """
    absolute_paths = []
    for relative_path in static_to_convert():
        absolute_paths.append(faust.absolute_path(relative_path))
    negated_test = "not(" + query.encoded_xp + ")"
    return query.matches(absolute_paths, negated_test)
def convert(): tei_text_xp = faust.xpath("//tei:text") xml_id_cnt = 0 for xml_file in [faust.absolute_path(rel) for rel in static_to_convert()]: print xml_file xml = lxml.etree.parse(xml_file) # ***** Documentary transcript ***** # prepare <ge:document/> context root = lxml.etree.Element(faust.ns("ge") + "document", nsmap=faust.namespaces) root.set(faust.ns("xml") + "id", "converted") for child in xml.getroot(): if child.tag not in (faust.ns("tei") + "teiHeader", faust.ns("tei") + "facsimile"): child.addprevious(root) break surface = lxml.etree.SubElement(root, faust.ns("tei") + "surface") for body in faust.xpath(".//tei:body", xml): body_zone = copy.deepcopy(body) body_zone.tag = faust.ns("tei") + "zone" surface.append(body_zone) root = surface # let <add/>/<del/> inherit @hand from <subst/>/<restore/> for container_with_hand in faust.xpath(".//tei:subst[@hand]|./tei:restore[@hand]", root): hand = container_with_hand.get("hand") for add_xml in faust.xpath("./tei:add[count(@hand) = 0]", container_with_hand): add_xml.set("hand", hand) for del_xml in faust.xpath("./tei:del[count(@hand) = 0]", container_with_hand): del_xml.set("hand", hand) del container_with_hand.attrib["hand"] # convert @hand into <handShift/> for hand_annotated in faust.xpath(".//*[@hand]", root): if hand_annotated.tag not in (faust.ns("tei") + "add", faust.ns("tei") + "fw"): continue handShifts = faust.xpath("./preceding::tei:handShift", hand_annotated) last_hand = (len(handShifts) > 0) and handShifts[-1].get("new") or "#i_have_no_idea" # start of new hand hs = lxml.etree.Element(faust.ns("tei") + "handShift") hs.set("new", hand_annotated.get("hand")) hs.tail = hand_annotated.text hand_annotated.text = None hand_annotated.insert(0, hs) # reset to last hand hs = lxml.etree.Element(faust.ns("tei") + "handShift") hs.set("new", last_hand) hand_annotated.append(hs) del hand_annotated.attrib["hand"] # convert <div/> with @type == "zone" for div in root.iter(faust.ns("tei") + "div"): if "zone" == 
div.get("type", ""): div.tag = faust.ns("tei") + "zone" del div.attrib["type"] # convert overwritten parts for subst in root.iter(faust.ns("tei") + "subst"): att_vals = ["overwrite", "overwriting", "overwritiung"] def type_overwr_in_attributes(element): return element.get("type", "") in att_vals def rend_overwr_in_attributes(element): return element.get("rend", "") in att_vals children_with_type = filter(type_overwr_in_attributes, subst) children_with_rend = filter(rend_overwr_in_attributes, subst) # type attribute in substitution if type_overwr_in_attributes(subst): del subst.attrib["type"] # rend attribute in substitution elif rend_overwr_in_attributes(subst): del subst.attrib["rend"] # type attribute in a child (in add or del) elif children_with_type: for child in children_with_type: del child.attrib["type"] # rend attribute in a child elif children_with_rend: for child in children_with_rend: del child.attrib["rend"] else: continue subst.tag = faust.ns("f") + "overw" for del_xml in subst.findall(faust.ns("tei") + "del"): del_xml.tag = faust.ns("f") + "under" for add in subst.findall(faust.ns("tei") + "add"): add.tag = faust.ns("f") + "over" # <div type="cleared"/> becomes <ge:used spanTo="#..."> for div in root.iter(faust.ns("tei") + "div"): if "type" in div.attrib: if div.attrib["type"] == "cleared": used = lxml.etree.Element(faust.ns("ge") + "used") div.addprevious(used) xml_id_cnt += 1 anchor_id = str(xml_id_cnt) used.set("spanTo", "#" + anchor_id) for child in div.getchildren(): div.remove(child) div.addprevious(child) div.tag = faust.ns("tei") + "anchor" div.set(faust.ns("xml") + "id", anchor_id) # throw away text structure tagging lxml.etree.strip_tags(root,\ faust.ns("tei") + "div", faust.ns("tei") + "lg",\ faust.ns("tei") + "sp", faust.ns("tei") + "subst",\ faust.ns("tei") + "name", faust.ns("tei") + "addSpan") # remove Schroer numbers for l in root.iter(faust.ns("tei") + "l"): if "n" in l.attrib: del l.attrib["n"] # create simple lines for line_element 
in ("speaker", "l", "p", "stage", "head", "ab"): line_elements = list(root.iter(faust.ns("tei") + line_element)) for le in line_elements: if le.get("rend", "") in ["underline", "underlined", "centered unterline"]: hi = copy.deepcopy(le) hi.tag = faust.ns("tei") + "hi" le.clear() for attr in list(hi.attrib.keys()): if attr == "rend": continue le.set(attr, hi.get(attr)) del hi.attrib[attr] le.append(hi) le.tag = faust.ns("ge") + "line" # turn deletions into <f:st/> by default for del_xml in root.iter(faust.ns("tei") + "del"): del_xml.tag = faust.ns("f") + "st" del_type = del_xml.get("rend", "") if del_type == "strikethrough" or del_type == "strikedthrough": del del_xml.attrib["rend"] # rename tags for fixations for rewrite_tag in ("fix", "repetition"): for rewrite in root.iter(faust.ns("tei") + rewrite_tag): rewrite.tag = faust.ns("ge") + "rewrite" # rename semantic tags with @rend="underline" for sem_hi_tag in ("emph", "name"): for sem_hi in root.iter(faust.ns("tei") + sem_hi_tag): if sem_hi.get("rend", "") == "underline": sem_hi.tag = faust.ns("tei") + "hi" # convert umlaut corrections umlaut_mapping = { u"ä":u"a", u"Ä":u"A", u"ö":u"o", u"Ö":u"O", u"ü":u"u", u"Ü":u"U" } corr_or_reg = itertools.chain(root.iter(faust.ns("tei") + "corr"), root.iter(faust.ns("tei") + "reg")) for element in corr_or_reg: for umlaut in umlaut_mapping: if element.text == umlaut: element.text = umlaut_mapping[umlaut] element.tag = faust.ns("tei") + "orig" # join lines with @rend='inline' for inline_line in list(faust.xpath(".//ge:line[@rend='inline']", root)): prev_lines = faust.xpath("./preceding::ge:line", inline_line) if len(prev_lines) == 0: continue prev_line = prev_lines[-1] if inline_line.text is None: inline_line.text = " " else: inline_line.text += " " inline_line.getparent().remove(inline_line) prev_line.append(inline_line) lxml.etree.strip_tags(prev_line, faust.ns("ge") + "line") # convert inline <lb/> to <ge:line/> for lb in list(root.iter(faust.ns("tei") + "lb")): parent = 
lb.getparent() if parent.tag != (faust.ns("ge") + "line"): continue lb.tag = faust.ns("ge") + "line" lb.text = lb.tail lb.tail = None sibling = lb.getnext() while sibling is not None: next_sibling = sibling.getnext() parent.remove(sibling) lb.append(sibling) sibling = next_sibling parent.remove(lb) parent.addnext(lb) # put <note/> in zones for note in list(root.iter(faust.ns("tei") + "note")): parent = surface if len(faust.xpath(".//ge:line", note)) == 0: parent = lxml.etree.SubElement(parent, faust.ns("tei") + "zone") note.tag = faust.ns("ge") + "line" else: note.tag = faust.ns("tei") + "zone" note.getparent().remove(note) parent.append(note) if "place" in note.attrib: del note.attrib["place"] # u<ex>nd</ex> becomes "und" for ex in root.iter(faust.ns("tei") + "ex"): try: pre_sibling = ex.itersiblings(preceding=True).next() except StopIteration: continue if pre_sibling.text: if re.split("\s+", pre_sibling.text).pop() == "u": # only in two files, do nothing pass # <abbr>u</abbr> becomes "und" for abbr in root.iter(faust.ns("tei") + "abbr"): if abbr.text == "u": tail = abbr.tail if tail: abbr.tail = "und" + tail else: abbr.tail = "und" remove_keep_tail(abbr) #abbr.tag = None # </ex> outside of <abbr/> becomes <supplied/> for ex in root.iter(faust.ns("tei") + "ex"): pass if not list(ex.iterancestors(faust.ns("tei") + "abbr")): ex.tag = faust.ns("tei") + "supplied" # <delSpan/> becomes <f:st/> for delSpan in root.iter(faust.ns("tei") + "delSpan"): delSpan.tag = faust.ns("f") + "st" # detach marginal elements for margin in list(faust.xpath(".//*[@place]", root)): place = margin.get("place") if place not in ("margin",\ "top", "top-left", "topleft", "top-right", "topright",\ "bottom", "bottom-left", "bottomleft", "bottom-right", "bottomright"): continue del margin.attrib["place"] parent = margin.getparent() margin_zone = lxml.etree.Element(faust.ns("tei") + "zone") if place.startswith("top"): surface.insert(0, margin_zone) else: surface.append(margin_zone) margin_parent = 
margin_zone if margin.tag != faust.ns("ge") + "line": margin_parent = lxml.etree.SubElement(margin_parent, faust.ns("ge") + "line") for ancestor in margin.iterancestors(faust.ns("ge") + "line"): line_id = ancestor.get(faust.ns("xml") + "id", None) if line_id is None: xml_id_cnt += 1 line_id = "line_" + str(xml_id_cnt) ancestor.set(faust.ns("xml") + "id", line_id) margin_zone.set(faust.ns("f") + "top", "#" + line_id) break parent.remove(margin) margin_parent.append(margin) # detach interlinear additions for inter_add in list(faust.xpath(".//tei:add[@place='above' or @place='below']", root)): line = None for ancestor in inter_add.iterancestors(faust.ns("ge") + "line"): line = ancestor break if line is None: raise Exception(lxml.etree.tostring(inter_add)) adjunct_line = None if inter_add.get("place") == "above": adjunct_line = line.getprevious() else: adjunct_line = line.getnext() if (adjunct_line is None) or (adjunct_line.tag != (faust.ns("ge") + "line")) or\ (adjunct_line.get("type", "") != "inter"): adjunct_line = lxml.etree.Element(faust.ns("ge") + "line") adjunct_line.set("type", "inter") if inter_add.get("place") == "above": line.addprevious(adjunct_line) else: line.addnext(adjunct_line) xml_id_cnt += 1 anchor_id = "anchor_" + str(xml_id_cnt) ins_mark = lxml.etree.SubElement(adjunct_line, faust.ns("f") + "ins") ins_mark.set(faust.ns("f") + "at", "#" + anchor_id) ins_mark.tail = inter_add.text inter_add.text = None inter_add.tag = faust.ns("tei") + "anchor" inter_add.set(faust.ns("xml") + "id", anchor_id) for child in inter_add.getchildren(): inter_add.remove(child) adjunct_line.append(child) del inter_add.attrib["place"] # remove remaining <add/> elements lxml.etree.strip_tags(root, faust.ns("tei") + "add") # remove <lb/>s, which are located in zones after conversion for lb in list(root.iter(faust.ns("tei") + "lb")): parent = lb.getparent() if parent.tag == (faust.ns("tei") + "zone"): parent.remove(lb) # convert some attribute values for typed in 
faust.xpath(".//*[@type='foliation']" , root): typed.set("type", "folioNum") for typed in faust.xpath(".//*[@type='sigel']" , root): typed.set("type", "sig") for typed in faust.xpath(".//*[@type='sigil']" , root): typed.set("type", "sig") # "#_bl", "#_t" u.ä. → "#sc_bl" etc. for any_elem in root.iter(tag=lxml.etree.Element): for attrib in any_elem.attrib: val = any_elem.get(attrib) if val.startswith("#_"): new_val ="#sc" + val[1:] any_elem.set(attrib, new_val) #remove type attributes for certain values for typed in faust.xpath(".//*[@type='instant' or @type='inst' or @type='instantrevision'\ or @type='late' or @type='soon']" , root): del typed.attrib["type"] # ***** Textual transcript ***** for text in faust.xpath(".//tei:text", xml): # remove hand attribute for hand_attributed in faust.xpath(".//*[@hand]", text): del hand_attributed.attrib["hand"] #remove handShifts lxml.etree.strip_tags(text, faust.ns("tei") + "handShift") # convert umlaut corrections corr_or_reg = itertools.chain(text.iter(faust.ns("tei") + "corr"), root.iter(faust.ns("tei") + "reg")) for element in corr_or_reg: for umlaut in umlaut_mapping: if element.text == umlaut: element.text = umlaut_mapping[umlaut] element.tag = faust.ns("tei") + "orig" # write the converted file path = ("conversion_test/" + faust.relative_path(xml_file)).split("/") path[-1] = "conv_" + path[-1] dir_path = "/".join(path[:-1]) if not os.path.isdir(dir_path): os.makedirs(dir_path) xml.write("/".join(path), encoding="UTF-8")
folders.remove(("verschiedenes", )) if len(folders) > 1 and ("paralipomena", ) in folders: folders.remove(("paralipomena", )) if len(folders) > 1: folders.sort() folders.reverse() for folder in folders: if "faust" == folder[0]: documents_struct[gsa_ident] = [ folder ] break if len(documents_struct[gsa_ident]) > 1: documents_struct[gsa_ident] = [ folders[-1] ] for gsa_ident in gsa_documents: gathering_path = "/".join(("transcript", "gsa", gsa_ident)) transcript_dir = faust.absolute_path(gathering_path) document_xml = gsa_documents[gsa_ident] text = None pages = list() for f in os.listdir(transcript_dir): if not f.endswith(".xml"): continue f_ident = re.search(r'[0-9]+', f).group(0) if f_ident == gsa_ident: text = f continue if int(f_ident) == 1: continue pages.append(f) if len(pages) > 0: document_xml.set(xml_ns + "base", "faust://xml/" + gathering_path + "/") if text is not None: document_xml.set("transcript", text)