def to_convert():
    """Collect the transcript files that still have to be converted.

    Walks every entry yielded by ``faust.xml_files()`` and keeps those that

    * have ``transcript`` as the first component of their relative path,
    * are not named after their parent directory (``<dir>/<dir>.xml``),
    * do not have ``1`` as the first number in their file name,
    * are TEI documents according to ``faust.is_tei_document()``, and
    * yield non-blank text for ``//tei:text`` but none for
      ``//ge:document``.

    Returns:
        A list with the matching entries, in the order in which
        ``faust.xml_files()`` produced them.
    """
    text_content_xp = faust.xpath("normalize-space(//tei:text)")
    ge_document_content_xp = faust.xpath("normalize-space(//ge:document)")

    def has_text_in(xp, xml):
        # str.join() accepts a plain string as well as a list of strings, so
        # the check works for either shape of XPath result.
        return len(" ".join(xp(xml)).strip()) > 0

    # Named differently from the function so the local does not shadow it.
    convertible = []
    for xml_file in faust.xml_files():
        path = faust.relative_path(xml_file).split("/")
        if path[0] != "transcript":
            continue

        file_name = path[-1]
        # Skip the file that carries the name of its own directory.
        if file_name[:-len(".xml")] == path[-2]:
            continue

        # re.search() returns None for a name without any digit.  Such a file
        # is simply not "number 1", so it must not abort the whole scan with
        # an AttributeError.
        number = re.search(r'[0-9]+', file_name)
        if number is not None and int(number.group(0)) == 1:
            continue

        if not faust.is_tei_document(xml_file):
            continue

        xml = lxml.etree.parse(xml_file)
        if (has_text_in(text_content_xp, xml)
                and not has_text_in(ge_document_content_xp, xml)):
            convertible.append(xml_file)

    return convertible
import faust ignored_tags = ( "app", "back", "body", "choice", "div", "docTitle", "fix", "front", "fw", "g", "group", "lg", "overw", "patch", "sp", "subst", "surface", "text", "titlePage", "titlePart", "used", "zone") ignored_empty_elems = ( "addSpan", "anchor", "cb", "certainty", "damageSpan", "delSpan", "gap", "grBrace", "grLine", "handShift", "ins", "join", "lb", "pb", "space", "st", "undo", "p") element_selector_xp = faust.xpath("//*[(ancestor::tei:text or ancestor::ge:document) and not(@xml:space)]") text_content_xp = faust.xpath("normalize-space()") candidates = dict() for xml_file in faust.xml_files(): try: if faust.is_tei_document(xml_file): xml = lxml.etree.parse(xml_file) xml_key = faust.relative_path(xml_file) candidates[xml_key] = [] for elem in element_selector_xp(xml): if elem.tag.startswith(faust.ns("svg")): continue local_name = elem.tag[elem.tag.rfind("}") + 1:] if local_name in ignored_tags: continue empty_elem = elem.text is None and len(elem) == 0 if empty_elem and local_name in ignored_empty_elems: continue
# import sys import lxml.etree import faust # XPath expression for extracting the revision history from TEI documents ge_doc_xp = faust.xpath("normalize-space(//ge:document)") # XPath expression for extracting the revision history from TEI documents change_xp = faust.xpath("//tei:teiHeader//tei:revisionDesc/tei:change") # iterate over all TEI documents for xml_file in faust.xml_files(): status = None try: if faust.is_tei_document(xml_file): xml = lxml.etree.parse(xml_file) if len(ge_doc_xp(xml).strip()) == 0: continue encoded = False for change in change_xp(xml): change_str = lxml.etree.tostring(change).lower() if "encoded" in change_str: encoded = True if not encoded: print faust.relative_path(xml_file) except IOError: sys.stderr.write("I/O error while extracting status from " + xml_file +