示例#1
0
        folders.remove(("verschiedenes", ))
    if len(folders) > 1 and ("paralipomena", ) in folders:
        folders.remove(("paralipomena", ))
    if len(folders) > 1:
        folders.sort()
        folders.reverse()
        for folder in folders:
            if "faust" == folder[0]:
                documents_struct[gsa_ident] = [folder]
                break
    if len(documents_struct[gsa_ident]) > 1:
        documents_struct[gsa_ident] = [folders[-1]]

for gsa_ident in gsa_documents:
    gathering_path = "/".join(("transcript", "gsa", gsa_ident))
    transcript_dir = faust.absolute_path(gathering_path)
    document_xml = gsa_documents[gsa_ident]
    text = None
    pages = list()
    for f in os.listdir(transcript_dir):
        if not f.endswith(".xml"): continue
        f_ident = re.search(r'[0-9]+', f).group(0)
        if f_ident == gsa_ident:
            text = f
            continue
        if int(f_ident) == 1: continue
        pages.append(f)
    if len(pages) > 0:
        document_xml.set(xml_ns + "base",
                         "faust://xml/" + gathering_path + "/")
        if text is not None:
示例#2
0
# Updates all TEI headers based on a template
#

import copy
import sys

import lxml.etree

import faust

# XPath expressions for extraction of templated header contents.
# They are built once here and then applied to the parsed template below.
handNotes_xp = faust.xpath("//tei:teiHeader/tei:profileDesc/tei:handNotes")
charDecl_xp = faust.xpath("//tei:teiHeader/tei:encodingDesc/tei:charDecl")

# Get the template and parse it
tei_template = faust.absolute_path("template/tei.xml")
template = lxml.etree.parse(tei_template)

# Extract relevant header fragments from the template.
# Only the first match of each expression is kept; indexing with [0] fails
# if the template has no such element (assumes the XPath result is a
# list-like sequence -- TODO confirm against faust.xpath).
template_hand_notes = handNotes_xp(template)[0]
template_char_decl = charDecl_xp(template)[0]


def replace(node, with_node):
	'''Swap ``node`` for an independent deep copy of ``with_node``.

	``node`` is replaced in place within its parent (obtained through
	``node.getparent()``). ``with_node`` itself is never inserted, so it may
	belong to another document and is left untouched. Returns ``None``.
	'''
	parent = node.getparent()
	clone = copy.deepcopy(with_node)
	parent.replace(node, clone)

# iterate over TEI files (excluding the template)
for xml_file in faust.xml_files():
	try:
		if (xml_file != tei_template) and faust.is_tei_document(xml_file):
示例#3
0
def to_convert_unencoded():
	'''Return what ``query.matches`` yields for the files to convert when
	tested against the negation of ``query.encoded_xp``.

	The relative paths from ``static_to_convert()`` are made absolute with
	``faust.absolute_path`` before being handed to the query.
	'''
	absolute_files = []
	for rel_file in static_to_convert():
		absolute_files.append(faust.absolute_path(rel_file))
	not_encoded_xp = "not(" + query.encoded_xp + ")"
	return query.matches(absolute_files, not_encoded_xp)
示例#4
0
def convert():
	'''Convert every file listed by ``static_to_convert()`` and write the result.

	For each file the TEI document is parsed and processed in two passes:

	* Documentary transcript: a ``<ge:document xml:id="converted">`` holding a
	  single ``<tei:surface>`` is inserted before the first root child that is
	  neither ``teiHeader`` nor ``facsimile``. Every ``<tei:body>`` is
	  deep-copied into that surface as a ``<tei:zone>`` and then rewritten
	  step by step (hands, zones, overwritings, lines, deletions, marginal and
	  interlinear additions, attribute clean-up). The order of the steps
	  matters, because later steps rely on tags produced by earlier ones.
	* Textual transcript: inside every original ``<tei:text>`` the ``@hand``
	  attributes and ``<handShift/>`` elements are removed and umlaut
	  corrections are turned into ``<orig/>``.

	The outcome is written as UTF-8 to ``conversion_test/<relative dir>/conv_<name>``;
	missing directories are created. Nothing is returned.

	Raises:
		Exception: if an interlinear ``<add/>`` has no ``<ge:line/>`` ancestor;
			the message is the serialized element.
	'''
	# one counter for all generated xml:ids, shared across files
	xml_id_cnt = 0

	for xml_file in [faust.absolute_path(rel) for rel in static_to_convert()]:
		print(xml_file)

		xml = lxml.etree.parse(xml_file)

		# ***** Documentary transcript *****

		# prepare <ge:document/> context
		root = lxml.etree.Element(faust.ns("ge") + "document", nsmap=faust.namespaces)
		root.set(faust.ns("xml") + "id", "converted")
		# place it in front of the first child that is neither header nor facsimile;
		# if there is no such child, the new element stays detached
		for child in xml.getroot():
			if child.tag not in (faust.ns("tei") + "teiHeader", faust.ns("tei") + "facsimile"):
				child.addprevious(root)
				break

		surface = lxml.etree.SubElement(root, faust.ns("tei") + "surface")
		for body in faust.xpath(".//tei:body", xml):
			body_zone = copy.deepcopy(body)
			body_zone.tag = faust.ns("tei") + "zone"
			surface.append(body_zone)

		# from here on all documentary processing is relative to the surface
		root = surface

		# let <add/>/<del/> inherit @hand from <subst/>/<restore/>
		# (both containers are looked up at any depth below the surface)
		for container_with_hand in faust.xpath(".//tei:subst[@hand]|.//tei:restore[@hand]", root):
			hand = container_with_hand.get("hand")
			for add_xml in faust.xpath("./tei:add[count(@hand) = 0]", container_with_hand):
				add_xml.set("hand", hand)
			for del_xml in faust.xpath("./tei:del[count(@hand) = 0]", container_with_hand):
				del_xml.set("hand", hand)
			del container_with_hand.attrib["hand"]

		# convert @hand into <handShift/>
		for hand_annotated in faust.xpath(".//*[@hand]", root):
			if hand_annotated.tag not in (faust.ns("tei") + "add", faust.ns("tei") + "fw"): continue

			# the hand to return to is the one of the closest preceding <handShift/>;
			# a missing or empty @new falls back to the placeholder as well
			handShifts = faust.xpath("./preceding::tei:handShift", hand_annotated)
			last_hand = (handShifts[-1].get("new") if handShifts else None) or "#i_have_no_idea"

			# start of new hand
			hs = lxml.etree.Element(faust.ns("tei") + "handShift")
			hs.set("new", hand_annotated.get("hand"))
			hs.tail = hand_annotated.text

			hand_annotated.text = None
			hand_annotated.insert(0, hs)

			# reset to last hand
			hs = lxml.etree.Element(faust.ns("tei") + "handShift")
			hs.set("new", last_hand)
			hand_annotated.append(hs)

			del hand_annotated.attrib["hand"]

		# convert <div/> with @type == "zone"
		for div in root.iter(faust.ns("tei") + "div"):
			if "zone" == div.get("type", ""):
				div.tag = faust.ns("tei") + "zone"
				del div.attrib["type"]

		# convert overwritten parts
		# (the last value is a misspelling that is matched on purpose)
		att_vals = ["overwrite", "overwriting", "overwritiung"]
		def type_overwr_in_attributes(element): return element.get("type", "") in att_vals
		def rend_overwr_in_attributes(element): return element.get("rend", "") in att_vals
		for subst in root.iter(faust.ns("tei") + "subst"):
			# real lists, so that the emptiness tests below are meaningful
			children_with_type = [child for child in subst if type_overwr_in_attributes(child)]
			children_with_rend = [child for child in subst if rend_overwr_in_attributes(child)]
			# type attribute in substitution
			if type_overwr_in_attributes(subst):
				del subst.attrib["type"]
			# rend attribute in substitution
			elif rend_overwr_in_attributes(subst):
				del subst.attrib["rend"]
			# type attribute in a child (in add or del)
			elif children_with_type:
				for child in children_with_type:
					del child.attrib["type"]
			# rend attribute in a child
			elif children_with_rend:
				for child in children_with_rend:
					del child.attrib["rend"]
			else:
				continue
			subst.tag = faust.ns("f") + "overw"
			for del_xml in subst.findall(faust.ns("tei") + "del"):
				del_xml.tag = faust.ns("f") + "under"
			for add in subst.findall(faust.ns("tei") + "add"):
				add.tag = faust.ns("f") + "over"

		# <div type="cleared"/> becomes <ge:used spanTo="#...">
		for div in root.iter(faust.ns("tei") + "div"):
			if "type" in div.attrib:
				if div.attrib["type"] == "cleared":
					used = lxml.etree.Element(faust.ns("ge") + "used")
					div.addprevious(used)
					xml_id_cnt += 1
					# prefixed like the other generated ids: an xml:id has to be
					# an NCName and therefore must not start with a digit
					anchor_id = "anchor_" + str(xml_id_cnt)
					used.set("spanTo", "#" + anchor_id)
					# move the content in front of the div, which then only marks the end
					for child in div.getchildren():
						div.remove(child)
						div.addprevious(child)
					div.tag = faust.ns("tei") + "anchor"
					div.set(faust.ns("xml") + "id", anchor_id)

		# throw away text structure tagging
		lxml.etree.strip_tags(root,
			faust.ns("tei") + "div", faust.ns("tei") + "lg",
			faust.ns("tei") + "sp", faust.ns("tei") + "subst",
			faust.ns("tei") + "name", faust.ns("tei") + "addSpan")

		# remove Schroer numbers
		for l in root.iter(faust.ns("tei") + "l"):
			if "n" in l.attrib: del l.attrib["n"]

		# create simple lines
		for line_element in ("speaker", "l", "p", "stage", "head", "ab"):
			line_elements = list(root.iter(faust.ns("tei") + line_element))
			for le in line_elements:
				if le.get("rend", "") in ["underline", "underlined", "centered unterline"]:
					# wrap the whole content in a <hi/> that keeps only @rend;
					# all other attributes stay on the line itself
					hi = copy.deepcopy(le)
					hi.tag = faust.ns("tei") + "hi"
					le.clear()
					for attr in list(hi.attrib.keys()):
						if attr == "rend": continue
						le.set(attr, hi.get(attr))
						del hi.attrib[attr]
					le.append(hi)
				le.tag = faust.ns("ge") + "line"

		# turn deletions into <f:st/> by default
		for del_xml in root.iter(faust.ns("tei") + "del"):
			del_xml.tag = faust.ns("f") + "st"
			del_type = del_xml.get("rend", "")
			if del_type == "strikethrough" or del_type == "strikedthrough":
				del del_xml.attrib["rend"]

		# rename tags for fixations
		for rewrite_tag in ("fix", "repetition"):
			for rewrite in root.iter(faust.ns("tei") + rewrite_tag):
				rewrite.tag = faust.ns("ge") + "rewrite"

		# rename semantic tags with @rend="underline"
		for sem_hi_tag in ("emph", "name"):
			for sem_hi in root.iter(faust.ns("tei") + sem_hi_tag):
				if sem_hi.get("rend", "") == "underline":
					sem_hi.tag = faust.ns("tei") + "hi"

		# convert umlaut corrections
		umlaut_mapping = {
			u"ä":u"a", u"Ä":u"A",
			u"ö":u"o", u"Ö":u"O",
			u"ü":u"u", u"Ü":u"U"
			}
		corr_or_reg = itertools.chain(root.iter(faust.ns("tei") + "corr"), root.iter(faust.ns("tei") + "reg"))
		for element in corr_or_reg:
			for umlaut in umlaut_mapping:
				if element.text == umlaut:
					element.text = umlaut_mapping[umlaut]
					element.tag = faust.ns("tei") + "orig"

		# join lines with @rend='inline'
		for inline_line in list(faust.xpath(".//ge:line[@rend='inline']", root)):
			prev_lines = faust.xpath("./preceding::ge:line", inline_line)
			if len(prev_lines) == 0: continue
			prev_line = prev_lines[-1]

			if inline_line.text is None:
				inline_line.text = " "
			else:
				inline_line.text += " "
			inline_line.getparent().remove(inline_line)
			prev_line.append(inline_line)
			# dissolve the appended line so that its content becomes part of prev_line
			lxml.etree.strip_tags(prev_line, faust.ns("ge") + "line")

		# convert inline <lb/> to <ge:line/>
		for lb in list(root.iter(faust.ns("tei") + "lb")):
			parent = lb.getparent()
			if parent.tag != (faust.ns("ge") + "line"): continue

			lb.tag = faust.ns("ge") + "line"
			lb.text = lb.tail
			lb.tail = None
			# everything following the break moves into the new line ...
			sibling = lb.getnext()
			while sibling is not None:
				next_sibling = sibling.getnext()
				parent.remove(sibling)
				lb.append(sibling)
				sibling = next_sibling
			# ... which then becomes the next sibling of the line it was in
			parent.remove(lb)
			parent.addnext(lb)

		# put <note/> in zones
		for note in list(root.iter(faust.ns("tei") + "note")):
			parent = surface
			if len(faust.xpath(".//ge:line", note)) == 0:
				# a note without lines becomes a single line in a zone of its own
				parent = lxml.etree.SubElement(parent, faust.ns("tei") + "zone")
				note.tag = faust.ns("ge") + "line"
			else:
				note.tag = faust.ns("tei") + "zone"
			note.getparent().remove(note)
			parent.append(note)
			if "place" in note.attrib: del note.attrib["place"]

		# u<ex>nd</ex> becomes "und"
		for ex in root.iter(faust.ns("tei") + "ex"):
			try: pre_sibling = next(ex.itersiblings(preceding=True))
			except StopIteration: continue
			if pre_sibling.text:
				if re.split(r"\s+", pre_sibling.text).pop() == "u":
					# only in two files, do nothing
					pass

		# <abbr>u</abbr> becomes "und"
		for abbr in root.iter(faust.ns("tei") + "abbr"):
			if abbr.text == "u":
				tail = abbr.tail
				if tail: abbr.tail = "und" + tail
				else: abbr.tail = "und"
				remove_keep_tail(abbr)

		# </ex> outside of <abbr/> becomes <supplied/>
		for ex in root.iter(faust.ns("tei") + "ex"):
			if not list(ex.iterancestors(faust.ns("tei") + "abbr")):
				ex.tag = faust.ns("tei") + "supplied"

		# <delSpan/> becomes <f:st/>
		for delSpan in root.iter(faust.ns("tei") + "delSpan"):
			delSpan.tag = faust.ns("f") + "st"

		# detach marginal elements
		for margin in list(faust.xpath(".//*[@place]", root)):
			place = margin.get("place")
			if place not in ("margin",
				"top", "top-left", "topleft", "top-right", "topright",
				"bottom", "bottom-left", "bottomleft", "bottom-right", "bottomright"):
				continue

			del margin.attrib["place"]
			parent = margin.getparent()

			# top marginalia go to the start of the surface, everything else to the end
			margin_zone = lxml.etree.Element(faust.ns("tei") + "zone")
			if place.startswith("top"):
				surface.insert(0, margin_zone)
			else:
				surface.append(margin_zone)

			margin_parent = margin_zone
			if margin.tag != faust.ns("ge") + "line":
				margin_parent = lxml.etree.SubElement(margin_parent, faust.ns("ge") + "line")

			# link the zone to the closest enclosing line, giving that line an id if needed
			for ancestor in margin.iterancestors(faust.ns("ge") + "line"):
				line_id = ancestor.get(faust.ns("xml") + "id", None)
				if line_id is None:
					xml_id_cnt += 1
					line_id = "line_" + str(xml_id_cnt)
					ancestor.set(faust.ns("xml") + "id", line_id)
				margin_zone.set(faust.ns("f") + "top", "#" + line_id)
				break

			parent.remove(margin)
			margin_parent.append(margin)

		# detach interlinear additions
		for inter_add in list(faust.xpath(".//tei:add[@place='above' or @place='below']", root)):
			line = None
			for ancestor in inter_add.iterancestors(faust.ns("ge") + "line"):
				line = ancestor
				break
			if line is None: raise Exception(lxml.etree.tostring(inter_add))

			# reuse an interlinear line that already sits on the wanted side,
			# otherwise create one there
			adjunct_line = None
			if inter_add.get("place") == "above":
				adjunct_line = line.getprevious()
			else:
				adjunct_line = line.getnext()
			if ((adjunct_line is None) or (adjunct_line.tag != (faust.ns("ge") + "line")) or
				(adjunct_line.get("type", "") != "inter")):
				adjunct_line = lxml.etree.Element(faust.ns("ge") + "line")
				adjunct_line.set("type", "inter")
				if inter_add.get("place") == "above":
					line.addprevious(adjunct_line)
				else:
					line.addnext(adjunct_line)

			xml_id_cnt += 1
			anchor_id = "anchor_" + str(xml_id_cnt)

			ins_mark = lxml.etree.SubElement(adjunct_line, faust.ns("f") + "ins")
			ins_mark.set(faust.ns("f") + "at", "#" + anchor_id)

			# the addition's content moves to the interlinear line; the emptied
			# <add/> stays behind as the anchor the insertion mark points to
			ins_mark.tail = inter_add.text
			inter_add.text = None
			inter_add.tag = faust.ns("tei") + "anchor"
			inter_add.set(faust.ns("xml") + "id", anchor_id)
			for child in inter_add.getchildren():
				inter_add.remove(child)
				adjunct_line.append(child)
			del inter_add.attrib["place"]

		# remove remaining <add/> elements
		lxml.etree.strip_tags(root, faust.ns("tei") + "add")

		# remove <lb/>s, which are located in zones after conversion
		for lb in list(root.iter(faust.ns("tei") + "lb")):
			parent = lb.getparent()
			if parent.tag == (faust.ns("tei") + "zone"):
				parent.remove(lb)

		# convert some attribute values
		for typed in faust.xpath(".//*[@type='foliation']" , root):
			typed.set("type", "folioNum")
		for typed in faust.xpath(".//*[@type='sigel']" , root):
			typed.set("type", "sig")
		for typed in faust.xpath(".//*[@type='sigil']" , root):
			typed.set("type", "sig")

		# "#_bl", "#_t" and the like become "#sc_bl" etc.
		for any_elem in root.iter(tag=lxml.etree.Element):
			for attrib in any_elem.attrib:
				val = any_elem.get(attrib)
				if val.startswith("#_"):
					new_val ="#sc" + val[1:]
					any_elem.set(attrib, new_val)

		# remove type attributes for certain values
		for typed in faust.xpath(".//*[@type='instant' or @type='inst' or @type='instantrevision'\
		or @type='late' or @type='soon']" , root):
			del typed.attrib["type"]

		# ***** Textual transcript *****

		for text in faust.xpath(".//tei:text", xml):

			# remove hand attribute
			for hand_attributed in faust.xpath(".//*[@hand]", text):
				del hand_attributed.attrib["hand"]
			# remove handShifts
			lxml.etree.strip_tags(text, faust.ns("tei") + "handShift")

			# convert umlaut corrections; both tags are searched within this
			# <text/>, the documentary surface has been handled above
			corr_or_reg = itertools.chain(text.iter(faust.ns("tei") + "corr"), text.iter(faust.ns("tei") + "reg"))
			for element in corr_or_reg:
				for umlaut in umlaut_mapping:
					if element.text == umlaut:
						element.text = umlaut_mapping[umlaut]
						element.tag = faust.ns("tei") + "orig"

		# write the converted file
		path = ("conversion_test/" + faust.relative_path(xml_file)).split("/")
		path[-1] = "conv_" + path[-1]
		dir_path = "/".join(path[:-1])
		if not os.path.isdir(dir_path): os.makedirs(dir_path)

		xml.write("/".join(path), encoding="UTF-8")
示例#5
0
		folders.remove(("verschiedenes", ))
	if len(folders) > 1 and ("paralipomena", ) in folders:	
		folders.remove(("paralipomena", ))
	if len(folders) > 1:
		folders.sort()
		folders.reverse()		
		for folder in folders:
			if "faust" == folder[0]:
				documents_struct[gsa_ident] = [ folder ]
				break
	if len(documents_struct[gsa_ident]) > 1:
		documents_struct[gsa_ident] = [ folders[-1] ]

# For every GSA document, look at the XML files in its transcript directory
# and record where the transcripts live.
for gsa_ident in gsa_documents:
	gathering_path = "/".join(("transcript", "gsa", gsa_ident))
	transcript_dir = faust.absolute_path(gathering_path)	
	document_xml = gsa_documents[gsa_ident]
	# file whose number equals the document identifier, if there is one
	text = None
	# all other numbered XML files, except those numbered 1
	pages = list()
	for f in os.listdir(transcript_dir):
		if not f.endswith(".xml"): continue
		# first run of digits in the file name; a name without any digit makes
		# re.search return None, so the .group(0) call fails
		f_ident = re.search(r'[0-9]+', f).group(0)
		if f_ident == gsa_ident: 
			text = f
			continue
		# files whose number is 1 (leading zeros ignored) are skipped
		if int(f_ident) == 1: continue
		pages.append(f)
	# the base URI and the transcript reference are only set if page files exist
	if len(pages) > 0:
		document_xml.set(xml_ns + "base", "faust://xml/" + gathering_path + "/")		
		if text is not None:
			document_xml.set("transcript", text)