Пример #1
0
def compare_dirs(one, two):
    '''Compares two directories, ignoring whitespace.

    Walks ``one`` and, for every ``*.xml`` file, looks for a file at the
    same relative path below ``two``. One status line per XML file is
    printed to stdout.

    When the module-level flag ``replace_new`` is truthy, the tei:text
    and ge:document elements of the file in ``one`` are replaced by those
    of the file in ``two`` whenever ``compare_elements`` reports them as
    equal, and the file in ``one`` is rewritten in place.

    XML syntax errors and ``ValueError`` are reported on the status line
    instead of being raised.
    '''
    for root, _subdirs, names in os.walk(one):
        for name in names:
            if not name.endswith('.xml'):
                continue

            path_1 = os.path.join(root, name)
            relpath = os.path.relpath(path_1, one)
            path_2 = os.path.join(two, relpath)
            print(relpath, '\t', end='')

            # Nothing to compare against: report and move on.
            if not os.path.isfile(path_2):
                print("not in dir2", end='')
                print()
                continue

            try:
                xml_1 = lxml.etree.parse(path_1)
                xml_2 = lxml.etree.parse(path_2)

                # Detach the first template match from the new XML so it
                # does not take part in the comparison below.
                templates = template_xp(xml_1)
                if templates:
                    first_template = templates[0]
                    first_template.getparent().remove(first_template)

                print("tei:text   ", end='')
                txt_equal = compare_elements(txt_xp(xml_1), txt_xp(xml_2))
                print("\t", end='')

                print("ge:document   ", end='')
                doc_equal = compare_elements(doc_xp(xml_1), doc_xp(xml_2))
                print("\t", end='')

                any_equal = txt_equal or doc_equal

                if replace_new:
                    if txt_equal:
                        replace(txt_xp(xml_1)[0], txt_xp(xml_2)[0])
                    if doc_equal:
                        replace(doc_xp(xml_1)[0], doc_xp(xml_2)[0])
                    if templates:
                        # Put a copy of the detached template back into
                        # the first body match.
                        body_xp(xml_1)[0].append(copy.deepcopy(templates[0]))
                    if any_equal:
                        rev_desc.add_change(xml_1, "system", "whitespace-restored")
                        faust.tei_serialize(xml_1).write(path_1, encoding="UTF-8")

                if templates:
                    print(" *t* ", end='')
                if any_equal:
                    print(" ***MODIFIED***", end='')

            except lxml.etree.XMLSyntaxError:
                print("XML syntax error", end='')
            except ValueError as e:
                print(e, end='')

            # Terminate the status line for this file.
            print()
Пример #2
0
def compare_dirs(one, two):
    """Compares two directories, ignoring whitespace.

    Every ``.xml`` file found while walking ``one`` is matched against the
    file with the same relative path under ``two``, and a one-line report
    is printed for it. If the module-level ``replace_new`` flag is truthy,
    the tei:text and ge:document elements of the file in ``one`` are
    replaced by those of the file in ``two`` when ``compare_elements``
    reports them equal, and the file in ``one`` is then written back.
    """
    # Lazy on purpose: files are processed while the walk is in progress.
    xml_paths = (
        os.path.join(folder, filename)
        for folder, _subfolders, filenames in os.walk(one)
        for filename in filenames
        if filename.endswith(".xml")
    )

    for path_1 in xml_paths:
        relpath = os.path.relpath(path_1, one)
        path_2 = os.path.join(two, relpath)
        print(relpath, "\t", end="")

        if os.path.isfile(path_2):
            try:
                xml_1 = lxml.etree.parse(path_1)
                xml_2 = lxml.etree.parse(path_2)

                # remove templates from new xml (first match only)
                templates = template_xp(xml_1)
                has_template = bool(templates)
                if has_template:
                    templates[0].getparent().remove(templates[0])

                print("tei:text   ", end="")
                txt_equal = compare_elements(txt_xp(xml_1), txt_xp(xml_2))
                print("\t", end="")

                print("ge:document   ", end="")
                doc_equal = compare_elements(doc_xp(xml_1), doc_xp(xml_2))
                print("\t", end="")

                if replace_new:
                    if txt_equal:
                        old_txt = txt_xp(xml_1)[0]
                        new_txt = txt_xp(xml_2)[0]
                        replace(old_txt, new_txt)
                    if doc_equal:
                        old_doc = doc_xp(xml_1)[0]
                        new_doc = doc_xp(xml_2)[0]
                        replace(old_doc, new_doc)
                    if has_template:
                        # re-attach a copy of the template removed above
                        body = body_xp(xml_1)[0]
                        body.append(copy.deepcopy(templates[0]))

                    if txt_equal or doc_equal:
                        rev_desc.add_change(xml_1, "system", "whitespace-restored")
                        serialized = faust.tei_serialize(xml_1)
                        serialized.write(path_1, encoding="UTF-8")

                if has_template:
                    print(" *t* ", end="")
                if txt_equal or doc_equal:
                    print(" ***MODIFIED***", end="")

            except lxml.etree.XMLSyntaxError:
                print("XML syntax error", end="")
            except ValueError as e:
                print(e, end="")
        else:
            print("not in dir2", end="")

        # end of the report line for this file
        print()
Пример #3
0
def tei_transform(tei_file, transform_etree):
    """Apply a tree transformation to a TEI file and write the result back.

    ``tei_file`` is the path of the file to transform; it is overwritten
    with the serialized result. ``transform_etree`` is a callable that
    receives the parsed lxml tree and returns the tree to serialize.

    Files that ``faust.is_tei_document`` rejects are skipped with a
    message on stderr. I/O errors and XML syntax errors are likewise
    reported on stderr rather than raised. Always returns ``None``.
    """
    try:
        if not faust.is_tei_document(tei_file):
            # Report the path that was actually passed in. The previous
            # code concatenated the unrelated name ``file`` here, which
            # made this branch crash instead of printing the message.
            sys.stderr.write("Not a TEI file: " + tei_file + "\n")
            return
        xml = lxml.etree.parse(tei_file)
        result = transform_etree(xml)
        faust.tei_serialize(result).write(tei_file, encoding="UTF-8")
    except IOError:
        sys.stderr.write("I/O error while transforming " + tei_file + "\n")
    except lxml.etree.XMLSyntaxError:
        sys.stderr.write("XML syntax error while transforming " + tei_file + "\n")
Пример #4
0
def correct_graphic_uris():
    # take into account old GSA files
    files = [f for f in faust.transcript_files() if '/gsa/' in f]
    files.extend(xml_names_from_facsimiles())
    for f in files:
        rewrite_file = False

        try:
            xml = lxml.etree.parse(f)
        except IOError:
            # these should only be GSA files
            print "(", f, " doesn't exist)"
            continue
        print f
        graphics = graphic_xp(xml)

        if len(graphics) == 0:
            append_facsimile_element(xml)
            # find the newly appended element
            graphics = graphic_xp(xml)

        brutal = False
        if len(graphics) == 1:
            brutal = True

        for graphic in graphics:
            old = graphic.attrib["url"]
            new = correct_uri(old, brutal, f)
            graphic.attrib["url"] = new
            if new != old:
                print "   correcting: ", old, " -> ", new
                rewrite_file = True
        if rewrite_file:
            rev_desc.add_change(xml, "system", "facsimile_adapted")
            print "   writing"
            faust.tei_serialize(xml).write(f, encoding='UTF-8')
        else:
            print "   not writing"
def correct_graphic_uris():
	# take into account old GSA files
	files = [f for f in faust.transcript_files() if '/gsa/' in f]
	files.extend(xml_names_from_facsimiles())
	for f in files:
		rewrite_file = False
	
		try:
			xml = lxml.etree.parse(f)
		except IOError:
			# these should only be GSA files
			print "(", f, " doesn't exist)"
			continue
		print f
		graphics = graphic_xp(xml)

		if len(graphics) == 0:
			append_facsimile_element(xml)
			# find the newly appended element
			graphics = graphic_xp(xml)

		brutal = False
		if len(graphics) == 1:
			brutal = True

		for graphic in graphics:
			old = graphic.attrib["url"]
			new = correct_uri(old, brutal, f)
			graphic.attrib["url"] = new
			if new != old:
				print "   correcting: ", old, " -> ", new
				rewrite_file = True
		if rewrite_file:
			rev_desc.add_change(xml, "system", "facsimile_adapted")
			print "   writing"
			faust.tei_serialize(xml).write(f, encoding='UTF-8')
		else:
			print "   not writing"
Пример #6
0
def make_template(path):
    print "creating: ", path
    faust.tei_serialize(doc_template).write(path, encoding='UTF-8')
Пример #7
0
# Get the template and parse it.
# The path is resolved through faust.absolute_path; tei_template is also
# used further down to exclude the template itself from the update loop.
tei_template = faust.absolute_path("template/tei.xml")
template = lxml.etree.parse(tei_template)

# Extract the relevant header fragments from the template.
# Only the first match of each query is kept.
# NOTE(review): the [0] presumably raises IndexError when the template has
# no matching element — confirm the template always contains both.
template_hand_notes = handNotes_xp(template)[0]
template_char_decl = charDecl_xp(template)[0]


def replace(node, with_node):
    '''Swap ``node`` for a deep copy of ``with_node`` in ``node``'s parent.

    ``with_node`` itself is left untouched (it may belong to another
    document); only its copy is inserted.
    '''
    parent = node.getparent()
    clone = copy.deepcopy(with_node)
    parent.replace(node, clone)

# Visit every XML file reported by faust; the template itself and files
# that are not TEI documents are skipped. Errors are reported on stderr
# and do not stop the loop.
for xml_file in faust.xml_files():
    try:
        if xml_file == tei_template or not faust.is_tei_document(xml_file):
            continue

        xml = lxml.etree.parse(xml_file)

        # swap each header fragment for a copy of the template's fragment
        for hand_notes in handNotes_xp(xml):
            replace(hand_notes, template_hand_notes)
        for char_decl in charDecl_xp(xml):
            replace(char_decl, template_char_decl)

        # write the updated document back over the original file
        faust.tei_serialize(xml).write(xml_file, encoding="UTF-8")
    except IOError:
        sys.stderr.write("I/O error while updating " + xml_file + "\n")
    except lxml.etree.XMLSyntaxError:
        sys.stderr.write("XML syntax error while updating " + xml_file + "\n")
def make_template(path):
	print "creating: " , path
	faust.tei_serialize(doc_template).write(path, encoding='UTF-8')