def evaluate(goldstandard, macronizedtext):
    """Score macronizedtext against goldstandard and render it as HTML.

    The two texts are walked in parallel, one character at a time; the
    walk stops at the end of the shorter text.

    Returns a tuple (accuracy, html):
      accuracy -- fraction of vowel positions (gold character is one of
                  "AEIOUYaeiouy" once postags.removemacrons has been
                  applied) at which both characters are identical.
                  0.0 when no vowel position was seen.
      html     -- the characters of macronizedtext passed through escape();
                  a character that differs from the gold one after
                  toascii(touiorthography(...)) is wrapped in
                  <span class="wrong">...</span>.

    Raises Exception("Error: Text mismatch.") when two aligned characters
    still differ after macrons are removed and both are normalized with
    toascii/touiorthography.
    """
    vowels = "AEIOUYaeiouy"
    vowelcount = 0
    lengthcorrect = 0
    outtext = []
    for gold, guess in zip(goldstandard, macronizedtext):
        plaingold = postags.removemacrons(gold)
        plainguess = postags.removemacrons(guess)
        if touiorthography(toascii(plaingold)) != touiorthography(toascii(plainguess)):
            raise Exception("Error: Text mismatch.")
        if plaingold in vowels:
            vowelcount += 1
            if gold == guess:
                lengthcorrect += 1
        if toascii(touiorthography(gold)) == toascii(touiorthography(guess)):
            outtext.append(escape(guess))
        else:
            # Escape here as well, so nothing reaches the markup unescaped.
            outtext.append('<span class="wrong">%s</span>' % escape(guess))
    # Avoid ZeroDivisionError on empty or vowel-free input.
    if vowelcount == 0:
        accuracy = 0.0
    else:
        accuracy = lengthcorrect / float(vowelcount)
    return accuracy, "".join(outtext)
def __init__(self, token):
    """Initialise a token from its surface string.

    token -- the raw text of the token. It is stored in self.token after
             postags.removemacrons() has been applied to it.

    The analysis fields (tag, lemma, accented, macronized) start out as
    empty strings and every status flag starts out False.
    """
    self.tag = ""
    self.lemma = ""
    self.accented = ""
    self.macronized = ""
    self.token = postags.removemacrons(token)
    # Both attributes hold the result of re.match: a match object, or None.
    # Raw strings keep \W, \d and \s from being read as (invalid) string
    # escape sequences; the patterns themselves are unchanged.
    # isword: the token starts with a character that is a word character
    # but neither a digit nor an underscore.
    self.isword = re.match(r"[^\W\d_]", token, flags=re.UNICODE)
    # isspace: the token starts with a whitespace character.
    self.isspace = re.match(r"\s", token, flags=re.UNICODE)
    self.isreordered = False
    self.startssentence = False
    self.endssentence = False
    self.isunknown = False
    self.isambiguous = False
def __init__(self, text):
    """Initialise a token from its surface string.

    text -- the raw text of the token. It is stored in self.text after
            postags.removemacrons() has been applied to it.

    tag, lemma and macronized start out as empty strings, accented as a
    list holding one empty string, and every status flag except
    isword/isspace starts out False.
    """
    self.tag = ""
    self.lemma = ""
    self.accented = [""]
    self.macronized = ""
    self.text = postags.removemacrons(text)
    # Raw strings keep \W, \d and \s from being read as (invalid) string
    # escape sequences; the patterns themselves are unchanged.
    # isword: True when the text starts with a character that is a word
    # character but neither a digit nor an underscore.
    self.isword = bool(re.match(r"[^\W\d_]", text, flags=re.UNICODE))
    # isspace: True when the text starts with a whitespace character.
    self.isspace = bool(re.match(r"\s", text, flags=re.UNICODE))
    self.hasenclitic = False
    self.isenclitic = False
    self.startssentence = False
    self.endssentence = False
    self.isunknown = False
tagtoaccents[tag] = tagtoaccents.get(tag,[]) + [postags.unicodeaccents(accented)] if accented[0].isupper(): wordform = wordform.title() tag = '.'.join(list(tag)) lexicon.write(wordform + '\t' + tag + '\t' + lemma + '\n') def escapedaccents(txt): for replacement, source in [("a_",u"ā"),("e_",u"ē"),("i_",u"ī"),("o_",u"ō"),("u_",u"ū"),("y_",u"ȳ"), ("A_",u"Ā"),("E_",u"Ē"),("I_",u"Ī"),("O_",u"Ō"),("U_",u"Ū"),("Y_",u"Ȳ")]: txt = txt.replace(source,replacement) return txt endingsfile = codecs.open("macronized-endings.txt","w","utf8") for tag in tagtoaccents: endingfreqs = {} for accented in tagtoaccents[tag]: for i in range(1,min(len(accented)-3, 12)): ending = accented[-i:] endingfreqs[ending] = endingfreqs.get(ending,0) + 1 endingsfile.write(tag) relevantendings = [] for ending in endingfreqs: endingwithoutmacrons = postags.removemacrons(ending) if ending[0] != endingwithoutmacrons[0] and endingfreqs[ending] > endingfreqs.get(endingwithoutmacrons, 1): relevantendings.append(ending) relevantendings.sort(lambda x,y: cmp(len(y), len(x))) for ending in relevantendings: endingsfile.write('\t' + escapedaccents(ending)) endingsfile.write('\n')
wordform = wordform.title() tag = '.'.join(list(tag)) lexicon_file.write("%s\t%s\t%s\n" % (wordform, tag, lemma)) with codecs.open('macronized_endings.py', 'w', 'utf8') as endings_file: endings_file.write('tag_to_endings = {\n') for tag in sorted(tag_to_accents): ending_freqs = defaultdict(int) for accented in tag_to_accents[tag]: for i in range(1, min(len(accented)-3, 12)): ending = accented[-i:] ending_freqs[ending] += 1 relevant_endings = [] for ending in ending_freqs: ending_without_macrons = postags.removemacrons(ending) if ending[0] != ending_without_macrons[0] and ending_freqs[ending] > ending_freqs.get(ending_without_macrons, 1): relevant_endings.append(ending) cleaned_list = [str(postags.escape_macrons(ending)) for ending in sorted(relevant_endings, key=lambda x: (-len(x), x))] endings_file.write(" '%s': %s,\n" % (str(tag), cleaned_list)) endings_file.write('}\n') with codecs.open('ldt-corpus.txt', 'w', 'utf8') as pos_corpus_file: xsegment = '' xsegmentbehind = '' for f in ['1999.02.0010', '2008.01.0002', '2007.01.0001', '1999.02.0060', 'phi0448.phi001.perseus-lat1',
# NOTE(review): this section was reconstructed from whitespace-collapsed
# source. The block nesting below (two sibling `if` blocks, with the
# `a == b` test nested under the vowel test) is the assumed layout — confirm
# against the original file.

# Remaining form controls: the "i to j" checkbox, rendered pre-checked when
# performitoj is truthy, followed by the submit button and the closing tags.
print '<input type="checkbox" name="itoj" value="on" %s> Convert i to j.<br>' % ("checked" if performitoj else "")
print '<input type="submit" value="Submit"> (Please be patient!)<br>'
print '</p></form>'

# Result section: only emitted when macronizedtext is non-empty.
if macronizedtext != "":
    print '<h2>Result</h2>'
    print '<p>(Ambiguous forms are marked <span class="ambig">yellow</span>; unknown forms are <span class="unknown">orange</span>. You may click on a vowel to add or remove a macron.)</p>'
    # Newlines in the detokenized text are turned into <br> for HTML output.
    print tokenization.detokenize(True).replace("\n","<br>")

# Evaluation section: only emitted when domacronize is set and the submitted
# text contains at least one of the characters ā ē ī ō ū.
# (Presumably the submitted text then serves as the gold standard — confirm.)
if domacronize and any(i in texttomacronize for i in u"āēīōū"):
    print '<h2>Evaluation</h2>'
    sys.stdout.write('<div style="white-space: pre-wrap;">')
    vowelcount = 0
    lengthcorrect = 0
    # Walk both texts in parallel, one character at a time; zip stops at the
    # end of the shorter text.
    for (a,b) in zip(list(texttomacronize),list(macronizedtext)):
        clean = postags.removemacrons(b)
        # With macrons removed and both sides normalized, the characters must
        # be the same; otherwise the two texts are not aligned.
        if touiorthography(toascii(clean)) != touiorthography(toascii(postags.removemacrons(a))):
            raise Exception("Error: Text mismatch.")
        # Only vowel positions count towards the accuracy figure.
        if clean in "AEIOUYaeiouy":
            vowelcount += 1
            if a == b:
                lengthcorrect += 1
        # Write the output character escaped; wrap it in a "wrong" span when
        # it differs from the input character after normalization.
        if toascii(touiorthography(a)) == toascii(touiorthography(b)):
            sys.stdout.write(escape(b))
        else:
            sys.stdout.write('<span class="wrong">'+escape(b)+'</span>')
    print '</div>'
    # The trailing commas keep the three prints on one output line (Python 2).
    print '<p>Accuracy:',
    # Percentage of vowel positions that are identical, with two decimals.
    print "{0:.2f}".format(100 * lengthcorrect / float(vowelcount)),
    print '</p>'
def escapedaccents(txt):
    """Return txt with every macronized vowel written as vowel + "_".

    The twelve characters ā ē ī ō ū ȳ and their upper-case forms are
    replaced by "a_", "e_", ... "Y_"; all other characters are kept.
    """
    for replacement, source in [("a_", u"ā"), ("e_", u"ē"), ("i_", u"ī"),
                                ("o_", u"ō"), ("u_", u"ū"), ("y_", u"ȳ"),
                                ("A_", u"Ā"), ("E_", u"Ē"), ("I_", u"Ī"),
                                ("O_", u"Ō"), ("U_", u"Ū"), ("Y_", u"Ȳ")]:
        txt = txt.replace(source, replacement)
    return txt
#enddef

# Write "macronized-endings.txt": one line per tag, holding the tag followed
# by its tab-separated relevant endings, longest first. The `with` block
# makes sure the file is flushed and closed once the table is written.
with codecs.open("macronized-endings.txt", "w", "utf8") as endingsfile:
    for tag in tagtoaccents:
        # Count every suffix of 1..11 characters of each accented form,
        # always leaving at least four leading characters out of the suffix
        # (forms shorter than five characters contribute nothing).
        endingfreqs = {}
        for accented in tagtoaccents[tag]:
            for i in range(1, min(len(accented) - 3, 12)):
                ending = accented[-i:]
                endingfreqs[ending] = endingfreqs.get(ending, 0) + 1
        endingsfile.write(tag)
        # An ending is relevant when removing macrons changes its first
        # character and it occurs more often than its macron-less
        # counterpart (counted as 1 when that counterpart never occurs).
        relevantendings = []
        for ending in endingfreqs:
            endingwithoutmacrons = postags.removemacrons(ending)
            if (ending[0] != endingwithoutmacrons[0]
                    and endingfreqs[ending] > endingfreqs.get(endingwithoutmacrons, 1)):
                relevantendings.append(ending)
        # Longest endings first. key/reverse works on Python 2 and 3 and is
        # stable, so equal-length endings keep their relative order exactly
        # as with the former cmp-based sort.
        relevantendings.sort(key=len, reverse=True)
        for ending in relevantendings:
            endingsfile.write('\t' + escapedaccents(ending))
        endingsfile.write('\n')