def main(): # for line in open('lahu_writing.txt','rU'): # for word in line.split(): # print word, # print transduce(word,decompose), # print transduce(transduce(word,decompose),chinese), # print transduce(transduce(word,decompose),baptist) f = open('4testLahuTexts.xml','rt') tree = ET.parse(f) f.close() for node in tree.findall('.//word/words/word/item'): if node.attrib['type'] == 'txt': form = codecs.encode(node.text,'utf8') print form, print [form], print transduce(form,decompose), print transduce(transduce(form,decompose),chinese), print transduce(transduce(form,decompose),baptist)
phrases = text.findall('.//phrases') print >> OutLaTeX, '\\begin{examples}' sentences = [] for p in phrases: for s in p.findall('.//word/words'): # sentencenumber = s.find(".//item[@type='segnum']") print >> OutLaTeX, "\\item\n" print >> OutLaTeX, "\glll ", BaptistSentence = '' ChineseSentence = '' for w in s.findall('.//word'): for i in w.findall('.//item'): if i.attrib['type'] in ['txt','punct']: #form = codecs.encode(i.text,'utf8') form = i.text form = transduce(form,decompose) form = escape(form) BaptistSentence += ' %s' % transduce(form,baptist) ChineseSentence += ' %s' % transduce(form,chinese) for level in ['txt', 'msa', 'gls']: for w in s.findall('.//word'): itemToOutput = ' {}' for i in w.findall('.//item'): if i.attrib['type'] == 'punct': if level in ['msa']: form = '' elif level in ['gls']: form = '' else: form = str(i.text) elif i.attrib['type'] == level: