def to_convert():
    """Returns the transcripts that have textual content in <tei:text/>
    but no documentary transcript in <ge:document/> yet."""
    text_content_xp = faust.xpath("normalize-space(//tei:text)")
    ge_document_content_xp = faust.xpath("normalize-space(//ge:document)")
    has_text_in = lambda xp, xml: (len(" ".join(xp(xml)).strip()) > 0)
    to_convert = list()
    for xml_file in faust.xml_files():
        path = faust.relative_path(xml_file).split("/")
        if path[0] != "transcript":
            continue
        file_name = path[-1]
        # skip the folder-level document whose basename equals its folder name
        if file_name[:-len(".xml")] == path[-2]:
            continue
        # skip documents whose first numeric component is 1
        if int(re.search(r'[0-9]+', file_name).group(0)) == 1:
            continue
        if not faust.is_tei_document(xml_file):
            continue
        xml = lxml.etree.parse(xml_file)
        if has_text_in(text_content_xp, xml) and not has_text_in(ge_document_content_xp, xml):
            to_convert.append(xml_file)
    return to_convert
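# A small convenience sketch (hypothetical helper, not part of the original
# tooling): print the conversion backlog reported by to_convert().
def report_backlog():
    backlog = to_convert()
    print "%d transcript(s) without a documentary transcript:" % len(backlog)
    for xml_file in backlog:
        print "\t" + faust.relative_path(xml_file)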
def validate(last=False):
    """Validates the content of the queue by calling Jing and parsing its output"""
    global validation_queue
    if not last and len(validation_queue) < validation_queue_max_length:
        return
    validation = subprocess.Popen(shlex.split(validation_command) + validation_queue,
                                  stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    validation_result = validation.communicate()[0] or ""
    for msg_match in validation_msg_re.finditer(validation_result):
        xml_file, line, column, msg = msg_match.groups()
        xml_file = faust.relative_path(xml_file)
        error_msg = "[%s:%s] %s" % (line, column, msg)
        if xml_file in validation_report:
            validation_report[xml_file].append(error_msg)
        else:
            validation_report[xml_file] = [error_msg]
    validation_queue = []
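# A minimal driver sketch for the batch validation above (hypothetical helper,
# not part of the original module). It assumes the surrounding script defines
# validation_queue, validation_queue_max_length and validation_report as used
# in validate(): files are queued until a full batch has accumulated, Jing is
# run once per batch, and a final validate(last=True) flushes the remainder.
def validate_all():
    for xml_file in faust.xml_files():
        validation_queue.append(xml_file)
        validate()
    validate(last=True)
    return validation_report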
"used", "zone") ignored_empty_elems = ( "addSpan", "anchor", "cb", "certainty", "damageSpan", "delSpan", "gap", "grBrace", "grLine", "handShift", "ins", "join", "lb", "pb", "space", "st", "undo", "p") element_selector_xp = faust.xpath("//*[(ancestor::tei:text or ancestor::ge:document) and not(@xml:space)]") text_content_xp = faust.xpath("normalize-space()") candidates = dict() for xml_file in faust.xml_files(): try: if faust.is_tei_document(xml_file): xml = lxml.etree.parse(xml_file) xml_key = faust.relative_path(xml_file) candidates[xml_key] = [] for elem in element_selector_xp(xml): if elem.tag.startswith(faust.ns("svg")): continue local_name = elem.tag[elem.tag.rfind("}") + 1:] if local_name in ignored_tags: continue empty_elem = elem.text is None and len(elem) == 0 if empty_elem and local_name in ignored_empty_elems: continue text_content = text_content_xp(elem) if empty_elem or (len(text_content) > 0 and len(text_content.strip()) == 0): candidates[xml_key].append(lxml.etree.tostring(elem)) except IOError:
def d_b_n(file):
    # a transcript counts as documentary if its relative path does not
    # match the `textual` pattern
    rel = faust.relative_path(file)
    return not textual.match(rel)
import sys
import lxml.etree
import faust

# XPath expression for extracting the text content of the documentary transcript
ge_doc_xp = faust.xpath("normalize-space(//ge:document)")

# XPath expression for extracting the revision history from TEI documents
change_xp = faust.xpath("//tei:teiHeader//tei:revisionDesc/tei:change")

# iterate over all TEI documents
for xml_file in faust.xml_files():
    status = None
    try:
        if faust.is_tei_document(xml_file):
            xml = lxml.etree.parse(xml_file)
            # skip documents without a documentary transcript
            if len(ge_doc_xp(xml).strip()) == 0:
                continue
            # report documents whose revision history lacks an "encoded" entry
            encoded = False
            for change in change_xp(xml):
                change_str = lxml.etree.tostring(change).lower()
                if "encoded" in change_str:
                    encoded = True
            if not encoded:
                print faust.relative_path(xml_file)
    except IOError:
        sys.stderr.write("I/O error while extracting status from " + xml_file + "\n")
#!/usr/bin/env python

import faust
import query
import lxml.etree
import os.path

text_xp = faust.xpath("//tei:text")

for f in query.matches(query.documentary_by_name(), "//tei:text and not(//ge:document)"):
    relpath = faust.relative_path(f)
    xml = lxml.etree.parse(f)
    text = text_xp(xml)[0]

    # build an empty documentary transcript skeleton:
    # <ge:document><surface><zone type="main"/></surface></ge:document>
    gedocument = lxml.etree.Element(faust.ns("ge") + "document", nsmap=faust.namespaces)
    surface = lxml.etree.Element(faust.ns("tei") + "surface")
    gedocument.append(surface)
    zone = lxml.etree.Element(faust.ns("tei") + "zone")
    zone.set("type", "main")
    surface.append(zone)

    # insert the skeleton immediately before <tei:text/>
    text.addprevious(gedocument)

    # write the result below /tmp/faust, mirroring the repository layout
    out = os.path.join("/tmp/faust", relpath)
    outdir = os.path.dirname(out)
    try:
        os.makedirs(outdir)
    except OSError:
        # the directory may already exist
        pass
    xml.write(out, encoding="UTF-8")
def convert():
    tei_text_xp = faust.xpath("//tei:text")
    xml_id_cnt = 0
    for xml_file in [faust.absolute_path(rel) for rel in static_to_convert()]:
        print xml_file
        xml = lxml.etree.parse(xml_file)

        # ***** Documentary transcript *****

        # prepare <ge:document/> context
        root = lxml.etree.Element(faust.ns("ge") + "document", nsmap=faust.namespaces)
        root.set(faust.ns("xml") + "id", "converted")
        for child in xml.getroot():
            if child.tag not in (faust.ns("tei") + "teiHeader", faust.ns("tei") + "facsimile"):
                child.addprevious(root)
                break
        surface = lxml.etree.SubElement(root, faust.ns("tei") + "surface")
        for body in faust.xpath(".//tei:body", xml):
            body_zone = copy.deepcopy(body)
            body_zone.tag = faust.ns("tei") + "zone"
            surface.append(body_zone)
        root = surface

        # let <add/>/<del/> inherit @hand from <subst/>/<restore/>
        for container_with_hand in faust.xpath(".//tei:subst[@hand]|./tei:restore[@hand]", root):
            hand = container_with_hand.get("hand")
            for add_xml in faust.xpath("./tei:add[count(@hand) = 0]", container_with_hand):
                add_xml.set("hand", hand)
            for del_xml in faust.xpath("./tei:del[count(@hand) = 0]", container_with_hand):
                del_xml.set("hand", hand)
            del container_with_hand.attrib["hand"]

        # convert @hand into <handShift/>
        for hand_annotated in faust.xpath(".//*[@hand]", root):
            if hand_annotated.tag not in (faust.ns("tei") + "add", faust.ns("tei") + "fw"):
                continue
            handShifts = faust.xpath("./preceding::tei:handShift", hand_annotated)
            last_hand = (len(handShifts) > 0) and handShifts[-1].get("new") or "#i_have_no_idea"
            # start of new hand
            hs = lxml.etree.Element(faust.ns("tei") + "handShift")
            hs.set("new", hand_annotated.get("hand"))
            hs.tail = hand_annotated.text
            hand_annotated.text = None
            hand_annotated.insert(0, hs)
            # reset to last hand
            hs = lxml.etree.Element(faust.ns("tei") + "handShift")
            hs.set("new", last_hand)
            hand_annotated.append(hs)
            del hand_annotated.attrib["hand"]

        # convert <div/> with @type == "zone"
        for div in root.iter(faust.ns("tei") + "div"):
            if "zone" == div.get("type", ""):
                div.tag = faust.ns("tei") + "zone"
                del div.attrib["type"]

        # convert overwritten parts
        for subst in root.iter(faust.ns("tei") + "subst"):
            att_vals = ["overwrite", "overwriting", "overwritiung"]

            def type_overwr_in_attributes(element):
                return element.get("type", "") in att_vals

            def rend_overwr_in_attributes(element):
                return element.get("rend", "") in att_vals

            children_with_type = filter(type_overwr_in_attributes, subst)
            children_with_rend = filter(rend_overwr_in_attributes, subst)
            # type attribute in substitution
            if type_overwr_in_attributes(subst):
                del subst.attrib["type"]
            # rend attribute in substitution
            elif rend_overwr_in_attributes(subst):
                del subst.attrib["rend"]
            # type attribute in a child (in add or del)
            elif children_with_type:
                for child in children_with_type:
                    del child.attrib["type"]
            # rend attribute in a child
            elif children_with_rend:
                for child in children_with_rend:
                    del child.attrib["rend"]
            else:
                continue
            subst.tag = faust.ns("f") + "overw"
            for del_xml in subst.findall(faust.ns("tei") + "del"):
                del_xml.tag = faust.ns("f") + "under"
            for add in subst.findall(faust.ns("tei") + "add"):
                add.tag = faust.ns("f") + "over"

        # <div type="cleared"/> becomes <ge:used spanTo="#...">
        for div in root.iter(faust.ns("tei") + "div"):
            if "type" in div.attrib:
                if div.attrib["type"] == "cleared":
                    used = lxml.etree.Element(faust.ns("ge") + "used")
                    div.addprevious(used)
                    xml_id_cnt += 1
                    anchor_id = str(xml_id_cnt)
                    used.set("spanTo", "#" + anchor_id)
                    for child in div.getchildren():
                        div.remove(child)
                        div.addprevious(child)
                    div.tag = faust.ns("tei") + "anchor"
faust.ns("tei") + "anchor" div.set(faust.ns("xml") + "id", anchor_id) # throw away text structure tagging lxml.etree.strip_tags(root,\ faust.ns("tei") + "div", faust.ns("tei") + "lg",\ faust.ns("tei") + "sp", faust.ns("tei") + "subst",\ faust.ns("tei") + "name", faust.ns("tei") + "addSpan") # remove Schroer numbers for l in root.iter(faust.ns("tei") + "l"): if "n" in l.attrib: del l.attrib["n"] # create simple lines for line_element in ("speaker", "l", "p", "stage", "head", "ab"): line_elements = list(root.iter(faust.ns("tei") + line_element)) for le in line_elements: if le.get("rend", "") in ["underline", "underlined", "centered unterline"]: hi = copy.deepcopy(le) hi.tag = faust.ns("tei") + "hi" le.clear() for attr in list(hi.attrib.keys()): if attr == "rend": continue le.set(attr, hi.get(attr)) del hi.attrib[attr] le.append(hi) le.tag = faust.ns("ge") + "line" # turn deletions into <f:st/> by default for del_xml in root.iter(faust.ns("tei") + "del"): del_xml.tag = faust.ns("f") + "st" del_type = del_xml.get("rend", "") if del_type == "strikethrough" or del_type == "strikedthrough": del del_xml.attrib["rend"] # rename tags for fixations for rewrite_tag in ("fix", "repetition"): for rewrite in root.iter(faust.ns("tei") + rewrite_tag): rewrite.tag = faust.ns("ge") + "rewrite" # rename semantic tags with @rend="underline" for sem_hi_tag in ("emph", "name"): for sem_hi in root.iter(faust.ns("tei") + sem_hi_tag): if sem_hi.get("rend", "") == "underline": sem_hi.tag = faust.ns("tei") + "hi" # convert umlaut corrections umlaut_mapping = { u"ä":u"a", u"Ä":u"A", u"ö":u"o", u"Ö":u"O", u"ü":u"u", u"Ü":u"U" } corr_or_reg = itertools.chain(root.iter(faust.ns("tei") + "corr"), root.iter(faust.ns("tei") + "reg")) for element in corr_or_reg: for umlaut in umlaut_mapping: if element.text == umlaut: element.text = umlaut_mapping[umlaut] element.tag = faust.ns("tei") + "orig" # join lines with @rend='inline' for inline_line in list(faust.xpath(".//ge:line[@rend='inline']", root)): prev_lines = faust.xpath("./preceding::ge:line", inline_line) if len(prev_lines) == 0: continue prev_line = prev_lines[-1] if inline_line.text is None: inline_line.text = " " else: inline_line.text += " " inline_line.getparent().remove(inline_line) prev_line.append(inline_line) lxml.etree.strip_tags(prev_line, faust.ns("ge") + "line") # convert inline <lb/> to <ge:line/> for lb in list(root.iter(faust.ns("tei") + "lb")): parent = lb.getparent() if parent.tag != (faust.ns("ge") + "line"): continue lb.tag = faust.ns("ge") + "line" lb.text = lb.tail lb.tail = None sibling = lb.getnext() while sibling is not None: next_sibling = sibling.getnext() parent.remove(sibling) lb.append(sibling) sibling = next_sibling parent.remove(lb) parent.addnext(lb) # put <note/> in zones for note in list(root.iter(faust.ns("tei") + "note")): parent = surface if len(faust.xpath(".//ge:line", note)) == 0: parent = lxml.etree.SubElement(parent, faust.ns("tei") + "zone") note.tag = faust.ns("ge") + "line" else: note.tag = faust.ns("tei") + "zone" note.getparent().remove(note) parent.append(note) if "place" in note.attrib: del note.attrib["place"] # u<ex>nd</ex> becomes "und" for ex in root.iter(faust.ns("tei") + "ex"): try: pre_sibling = ex.itersiblings(preceding=True).next() except StopIteration: continue if pre_sibling.text: if re.split("\s+", pre_sibling.text).pop() == "u": # only in two files, do nothing pass # <abbr>u</abbr> becomes "und" for abbr in root.iter(faust.ns("tei") + "abbr"): if abbr.text == "u": tail = abbr.tail if tail: abbr.tail = "und" + 
                else:
                    abbr.tail = "und"
                remove_keep_tail(abbr)
                #abbr.tag = None

        # <ex/> outside of <abbr/> becomes <supplied/>
        for ex in root.iter(faust.ns("tei") + "ex"):
            if not list(ex.iterancestors(faust.ns("tei") + "abbr")):
                ex.tag = faust.ns("tei") + "supplied"

        # <delSpan/> becomes <f:st/>
        for delSpan in root.iter(faust.ns("tei") + "delSpan"):
            delSpan.tag = faust.ns("f") + "st"

        # detach marginal elements
        for margin in list(faust.xpath(".//*[@place]", root)):
            place = margin.get("place")
            if place not in ("margin",
                             "top", "top-left", "topleft", "top-right", "topright",
                             "bottom", "bottom-left", "bottomleft", "bottom-right", "bottomright"):
                continue
            del margin.attrib["place"]
            parent = margin.getparent()
            margin_zone = lxml.etree.Element(faust.ns("tei") + "zone")
            if place.startswith("top"):
                surface.insert(0, margin_zone)
            else:
                surface.append(margin_zone)
            margin_parent = margin_zone
            if margin.tag != faust.ns("ge") + "line":
                margin_parent = lxml.etree.SubElement(margin_parent, faust.ns("ge") + "line")
            for ancestor in margin.iterancestors(faust.ns("ge") + "line"):
                line_id = ancestor.get(faust.ns("xml") + "id", None)
                if line_id is None:
                    xml_id_cnt += 1
                    line_id = "line_" + str(xml_id_cnt)
                    ancestor.set(faust.ns("xml") + "id", line_id)
                margin_zone.set(faust.ns("f") + "top", "#" + line_id)
                break
            parent.remove(margin)
            margin_parent.append(margin)

        # detach interlinear additions
        for inter_add in list(faust.xpath(".//tei:add[@place='above' or @place='below']", root)):
            line = None
            for ancestor in inter_add.iterancestors(faust.ns("ge") + "line"):
                line = ancestor
                break
            if line is None:
                raise Exception(lxml.etree.tostring(inter_add))
            adjunct_line = None
            if inter_add.get("place") == "above":
                adjunct_line = line.getprevious()
            else:
                adjunct_line = line.getnext()
            if (adjunct_line is None) or (adjunct_line.tag != (faust.ns("ge") + "line")) or \
                    (adjunct_line.get("type", "") != "inter"):
                adjunct_line = lxml.etree.Element(faust.ns("ge") + "line")
                adjunct_line.set("type", "inter")
                if inter_add.get("place") == "above":
                    line.addprevious(adjunct_line)
                else:
                    line.addnext(adjunct_line)
            xml_id_cnt += 1
            anchor_id = "anchor_" + str(xml_id_cnt)
            ins_mark = lxml.etree.SubElement(adjunct_line, faust.ns("f") + "ins")
            ins_mark.set(faust.ns("f") + "at", "#" + anchor_id)
            ins_mark.tail = inter_add.text
            inter_add.text = None
            inter_add.tag = faust.ns("tei") + "anchor"
            inter_add.set(faust.ns("xml") + "id", anchor_id)
            for child in inter_add.getchildren():
                inter_add.remove(child)
                adjunct_line.append(child)
            del inter_add.attrib["place"]

        # remove remaining <add/> elements
        lxml.etree.strip_tags(root, faust.ns("tei") + "add")

        # remove <lb/>s, which are located in zones after conversion
        for lb in list(root.iter(faust.ns("tei") + "lb")):
            parent = lb.getparent()
            if parent.tag == (faust.ns("tei") + "zone"):
                parent.remove(lb)

        # convert some attribute values
        for typed in faust.xpath(".//*[@type='foliation']", root):
            typed.set("type", "folioNum")
        for typed in faust.xpath(".//*[@type='sigel']", root):
            typed.set("type", "sig")
        for typed in faust.xpath(".//*[@type='sigil']", root):
            typed.set("type", "sig")

        # "#_bl", "#_t" and the like become "#sc_bl" etc.
        for any_elem in root.iter(tag=lxml.etree.Element):
            for attrib in any_elem.attrib:
                val = any_elem.get(attrib)
                if val.startswith("#_"):
                    new_val = "#sc" + val[1:]
                    any_elem.set(attrib, new_val)

        # remove type attributes for certain values
        for typed in faust.xpath(".//*[@type='instant' or @type='inst' or @type='instantrevision'"
                                 " or @type='late' or @type='soon']", root):
            del typed.attrib["type"]

        # ***** Textual transcript *****

        for text in faust.xpath(".//tei:text", xml):
            # remove hand attributes
            for hand_attributed in faust.xpath(".//*[@hand]", text):
                del hand_attributed.attrib["hand"]
            # remove handShifts
            lxml.etree.strip_tags(text, faust.ns("tei") + "handShift")
            # convert umlaut corrections
            corr_or_reg = itertools.chain(text.iter(faust.ns("tei") + "corr"),
                                          text.iter(faust.ns("tei") + "reg"))
            for element in corr_or_reg:
                for umlaut in umlaut_mapping:
                    if element.text == umlaut:
                        element.text = umlaut_mapping[umlaut]
                        element.tag = faust.ns("tei") + "orig"

        # write the converted file
        path = ("conversion_test/" + faust.relative_path(xml_file)).split("/")
        path[-1] = "conv_" + path[-1]
        dir_path = "/".join(path[:-1])
        if not os.path.isdir(dir_path):
            os.makedirs(dir_path)
        xml.write("/".join(path), encoding="UTF-8")
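# A minimal entry-point sketch, assuming the conversion script is meant to be
# run directly; convert() as defined above already writes its results under
# conversion_test/.
if __name__ == "__main__":
    convert()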