def compare_dirs(one, two):
    """Compare the XML files below directory *one* with those below *two*.

    Every file under ``one`` whose name ends in ``.xml`` is looked up under
    the same relative path in ``two``.  When the counterpart exists, the
    tei:text and ge:document elements of both files are compared with
    ``compare_elements`` (whitespace-insensitive according to the original
    description) and, if the module-level ``replace_new`` flag is truthy,
    equal elements in the file from ``one`` are replaced by those from
    ``two`` and the file is rewritten in place.

    One status line per XML file is printed to standard output.

    Args:
        one: root directory whose XML files are checked (and possibly rewritten).
        two: root directory providing the reference files.
    """
    # Unpack the os.walk() triple instead of indexing it, and avoid shadowing
    # the builtin ``dir`` with the loop variable.
    for dirpath, _dirnames, filenames in os.walk(one):
        for filename in filenames:
            if not filename.endswith('.xml'):
                continue
            path_1 = os.path.join(dirpath, filename)
            relpath = os.path.relpath(path_1, one)
            path_2 = os.path.join(two, relpath)
            print(relpath, '\t', end='')
            if os.path.isfile(path_2):
                _compare_and_restore(path_1, path_2)
            else:
                print("not in dir2", end='')
            # Terminate the status line started above.
            print()


def _compare_and_restore(path_1, path_2):
    """Compare one XML file with its counterpart and restore equal elements."""
    try:
        xml_1 = lxml.etree.parse(path_1)
        xml_2 = lxml.etree.parse(path_2)

        # Remove the first template from the new XML before comparing.
        templates = template_xp(xml_1)
        if templates:
            templates[0].getparent().remove(templates[0])

        print("tei:text ", end='')
        txt_equal = compare_elements(txt_xp(xml_1), txt_xp(xml_2))
        print("\t", end='')
        print("ge:document ", end='')
        doc_equal = compare_elements(doc_xp(xml_1), doc_xp(xml_2))
        print("\t", end='')

        if replace_new:
            if txt_equal:
                replace(txt_xp(xml_1)[0], txt_xp(xml_2)[0])
            if doc_equal:
                replace(doc_xp(xml_1)[0], doc_xp(xml_2)[0])
            if templates:
                # Put a copy of the removed template back into the body.
                body_xp(xml_1)[0].append(copy.deepcopy(templates[0]))
            if txt_equal or doc_equal:
                rev_desc.add_change(xml_1, "system", "whitespace-restored")
                faust.tei_serialize(xml_1).write(path_1, encoding="UTF-8")

        # NOTE(review): the source this was reconstructed from had lost its
        # indentation; the two status markers below are assumed to sit at
        # this level (printed regardless of ``replace_new``) -- confirm.
        if templates:
            print(" *t* ", end='')
        if txt_equal or doc_equal:
            print(" ***MODIFIED***", end='')
    except lxml.etree.XMLSyntaxError:
        print("XML syntax error", end='')
    except ValueError as e:
        print(e, end='')
def compare_dirs(one, two):
    """Compare the XML files below directory *one* with those below *two*.

    Every file under ``one`` whose name ends in ``.xml`` is looked up under
    the same relative path in ``two``.  When the counterpart exists, the
    tei:text and ge:document elements of both files are compared with
    ``compare_elements`` (whitespace-insensitive according to the original
    description) and, if the module-level ``replace_new`` flag is truthy,
    equal elements in the file from ``one`` are replaced by those from
    ``two`` and the file is rewritten in place.

    One status line per XML file is printed to standard output.

    Args:
        one: root directory whose XML files are checked (and possibly rewritten).
        two: root directory providing the reference files.
    """
    # Unpack the os.walk() triple instead of indexing it, and avoid shadowing
    # the builtin ``dir`` with the loop variable.
    for dirpath, _dirnames, filenames in os.walk(one):
        for filename in filenames:
            if not filename.endswith(".xml"):
                continue
            path_1 = os.path.join(dirpath, filename)
            relpath = os.path.relpath(path_1, one)
            path_2 = os.path.join(two, relpath)
            print(relpath, "\t", end="")
            if os.path.isfile(path_2):
                _compare_and_restore(path_1, path_2)
            else:
                print("not in dir2", end="")
            # Terminate the status line started above.
            print()


def _compare_and_restore(path_1, path_2):
    """Compare one XML file with its counterpart and restore equal elements."""
    try:
        xml_1 = lxml.etree.parse(path_1)
        xml_2 = lxml.etree.parse(path_2)

        # Remove the first template from the new XML before comparing.
        templates = template_xp(xml_1)
        if templates:
            templates[0].getparent().remove(templates[0])

        print("tei:text ", end="")
        txt_equal = compare_elements(txt_xp(xml_1), txt_xp(xml_2))
        print("\t", end="")
        print("ge:document ", end="")
        doc_equal = compare_elements(doc_xp(xml_1), doc_xp(xml_2))
        print("\t", end="")

        if replace_new:
            if txt_equal:
                replace(txt_xp(xml_1)[0], txt_xp(xml_2)[0])
            if doc_equal:
                replace(doc_xp(xml_1)[0], doc_xp(xml_2)[0])
            if templates:
                # Put a copy of the removed template back into the body.
                body_xp(xml_1)[0].append(copy.deepcopy(templates[0]))
            if txt_equal or doc_equal:
                rev_desc.add_change(xml_1, "system", "whitespace-restored")
                faust.tei_serialize(xml_1).write(path_1, encoding="UTF-8")

        # NOTE(review): the source this was reconstructed from had lost its
        # indentation; the two status markers below are assumed to sit at
        # this level (printed regardless of ``replace_new``) -- confirm.
        if templates:
            print(" *t* ", end="")
        if txt_equal or doc_equal:
            print(" ***MODIFIED***", end="")
    except lxml.etree.XMLSyntaxError:
        print("XML syntax error", end="")
    except ValueError as e:
        print(e, end="")
def correct_graphic_uris():
    """Correct the ``url`` attribute of the graphic elements in transcripts.

    The files processed are the transcript files whose path contains
    ``/gsa/`` plus the names returned by ``xml_names_from_facsimiles()``.
    For each file that can be parsed, every element found by ``graphic_xp``
    gets its ``url`` attribute replaced by the result of ``correct_uri``.
    If no such element exists, one is first added through
    ``append_facsimile_element``.  A file is rewritten (with a
    "facsimile_adapted" revision entry) only when at least one URL changed.

    Progress is reported on standard output.  The messages use the print
    function, matching the ``print(..., end='')`` calls of ``compare_dirs``
    in this file; the former print statements do not compile on Python 3.
    """
    # take into account old GSA files
    files = [f for f in faust.transcript_files() if '/gsa/' in f]
    files.extend(xml_names_from_facsimiles())

    for f in files:
        try:
            xml = lxml.etree.parse(f)
        except IOError:
            # these should only be GSA files
            print("(", f, " doesn't exist)")
            continue
        print(f)

        graphics = graphic_xp(xml)
        if not graphics:
            append_facsimile_element(xml)
            # find the newly appended element
            graphics = graphic_xp(xml)

        # The flag is passed to correct_uri() only when the file has exactly
        # one graphic element.
        brutal = len(graphics) == 1

        rewrite_file = False
        for graphic in graphics:
            old = graphic.attrib["url"]
            new = correct_uri(old, brutal, f)
            graphic.attrib["url"] = new
            if new != old:
                print(" correcting: ", old, " -> ", new)
                rewrite_file = True

        if rewrite_file:
            rev_desc.add_change(xml, "system", "facsimile_adapted")
            print(" writing")
            faust.tei_serialize(xml).write(f, encoding='UTF-8')
        else:
            print(" not writing")