def __call__(self, words):
    """Print `words` paired with the tags from the next tags line.

    Lines are read from `tagsfile` until one is not a bare newline; that
    line is parsed with `get_tags`.  Each word/tag pair is rendered as
    ``word|||tag`` and the pairs are printed space-separated via `uprint`.

    If the number of tags differs from the number of words, nothing is
    printed: `self.ignored` is incremented and `self.prevline` is left
    unchanged.  Otherwise `self.prevline` is set to the tags line used.
    """
    # Skip lines that are exactly '\n'.  Any other value ends the scan
    # (presumably including the '' a file object returns at end of file).
    while True:
        tagsline = tagsfile.readline()
        if tagsline != '\n':
            break

    tags = get_tags(tagsline)
    if len(words) != len(tags):
        # Tag/word count mismatch: count the sentence as ignored and emit
        # nothing (the diagnostic output that used to live here is disabled).
        self.ignored += 1
        return

    pairs = ['|||'.join(pair) for pair in zip(words, tags)]
    uprint(' '.join(pairs))
    self.prevline = tagsline
def get_edits(line, options):
    """Return one edit label per raw character for a ``raw<TAB>seg`` line.

    The raw and segmented strings are consumed from the end.  On every loop
    iteration exactly one raw character is removed and exactly one label is
    appended, so the returned list (reversed at the end to read left to
    right) has the same length as the raw text.

    Args:
        line: text containing exactly one tab; the part before the tab is
            the raw text and the part after it is the segmented text.
        options: object with a boolean ``verbose`` attribute; when true, a
            diagnostic is printed with `uprint` for every ignored line.

    Returns:
        A list of edit labels, or None when the line is ignored (it does
        not have exactly two tab-separated parts, the raw side is empty
        while the segmented side contains a segment marker, an unknown edit
        is encountered, or segmented text is left over once the raw text is
        exhausted).

    Raises:
        AssertionError: if one of the internal sanity checks on the
            shadda / normalization rules fails.
    """
    # Splitting first (instead of only testing for the presence of a tab)
    # also rejects lines with more than one tab, which would otherwise blow
    # up with ValueError when unpacked into two names.
    parts = line.split('\t')
    if len(parts) != 2:
        if options.verbose:
            uprint("ignoring line that doesn't have two parts:")
            uprint(' ' + repr(line))
        return
    raw, seg = parts

    # Special cases:
    # - token deleted.  This must be tested before the "odd edit" case
    #   below: an empty seg never contains SEG_MARKER, so that test would
    #   otherwise claim every deleted token and label it <other>.
    if seg == '':
        return [u' <del> '] * len(raw)
    # - an odd edit with no segmentations [e.g. ع -> على]
    if raw != seg and SEG_MARKER not in seg:
        return [u'<other>'] * len(raw)
    # - nothing on the raw side
    if raw == '':
        if options.verbose:
            uprint("ignoring line with empty raw text:")
            uprint(' ' + repr(line))
        return

    edits = []
    last_raw = ''
    while raw:
        # Every branch leaves `seg` in a state where the unconditional
        # `seg = seg[:-1]` at the bottom of the loop finishes consuming the
        # segmented characters that belong to raw[-1].  Branches that must
        # not consume anything from seg append a one-character pad first.
        #
        # Possible edits, in order that they are searched for:
        # :+Al // li + definite article + word starting with l
        if raw.endswith(u'لل') and seg.endswith(u'ل%sالل' % SEG_MARKER):
            edits.append(u' %s+ال' % SEG_MARKER)
            seg = seg[:-3]
        # +A:+A // mA + A... verbal negation spelled as just m
        elif is_ma_alif(seg, raw):
            edits.append(u' +ا%s+ا ' % SEG_MARKER)
            seg = seg[:-3]
        # x:x // shadda breaking: character duplicated on either side of
        #        segmentation
        # x>xx // shadda breaking: character duplicated, no segmentation
        elif is_shadda(seg, raw):
            if seg.endswith(SEG_MARKER + raw[-1]):
                edits.append(u' x:x ')
                seg = seg[:-2]
            else:
                assert seg.endswith(raw[-1] * 2), repr(seg + '\t' + raw)
                edits.append(u' x>xx ')
                seg = seg[:-1]
        # :+x // added a letter after segmentation (alif for
        #        li + definite article, noon for recovered first person
        #        prefix or y -> ny in dialect)
        elif is_seg_plus(seg, raw):
            edits.append(u' %s+%s ' % (SEG_MARKER, seg[-2]))
            seg = seg[:-2]
        # +x: // added a letter before segmentation (usually noon, for
        #        plurals, mim~A, Al~A, etc.)
        elif is_plus_seg(seg, raw):
            edits.append(u' +%s%s ' % (seg[-3], SEG_MARKER))
            seg = seg[:-2]
        # <del> // deleted lengthening effect (yAAAAAA -> yA)
        elif is_lengthening(seg, raw, last_raw):
            edits.append(u' <del> ')
            seg += u' '  # pad: nothing in seg corresponds to raw[-1]
        # : // ordinary segmentation boundary
        elif seg.endswith(SEG_MARKER + raw[-1]):
            edits.append(SEG)
            seg = seg[:-1]
        # <noseg> // character doesn't change, no segmentation added
        elif seg and seg[-1] == raw[-1]:
            edits.append(NOSEG)
        # <other> // normalized E or El to ElY
        elif is_alaa_normalization(seg, raw):
            edits.append(u'<other>')
            seg = seg[:-2]
            if raw[-1] != u'ع':
                assert raw[-2] == u'ع'
                seg = seg + ' '  # pad so the trailing trim removes nothing
        # +V: // added a long vowel (verbal or dialect -wA ending, jussive
        #        normalization)
        elif len(seg) >= 2 and seg[-2] == raw[-1] and seg[-1] in LONG_VOWELS:
            if len(seg) >= 3 and seg[-3] == SEG_MARKER:
                edits.append(u' %s+%s ' % (SEG_MARKER, seg[-1]))
                seg = seg[:-2]
            else:
                edits.append(u' +%s ' % seg[-1])
                seg = seg[:-1]
        # y:+h // recover dialectal silent haa after segmentation
        elif seg.endswith(u'ي' + SEG_MARKER + u'ه') and raw.endswith(u'ي'):
            edits.append(u' ي%s+ه ' % SEG_MARKER)
            seg = seg[:-2]
        # <del> // deleted a long vowel (dialect ending normalization: mostly
        #          -kwA -> -kw and -kY -> -k) or dialectal silent haa
        elif (len(raw) >= 2 and norm_endswith(seg, raw[-2], HAAS) and
              raw[-1] in LONG_VOWELS + u'ه'):
            edits.append(u' <del> ')
            seg += u' '  # pad: nothing in seg corresponds to raw[-1]
        # <del> // deleted diacritic
        elif is_diacritic(raw[-1]):
            edits.append(u' <del> ')
            seg += u' '  # pad: nothing in seg corresponds to raw[-1]
        # x>y: // change x to y after a segment boundary
        elif (len(seg) >= 2 and seg[-2] == SEG_MARKER and
              is_common_rewrite(seg, raw)):
            edits.append(u' %s%s>%s ' % (SEG_MARKER, raw[-1], seg[-1]))
            seg = seg[:-1]
        # x>y // change x to y without a segmentation (orthography
        #        normalization)
        elif is_common_rewrite(seg, raw):
            edits.append(u' %s>%s ' % (raw[-1], seg[-1]))
        else:
            if options.verbose:
                uprint('ignoring line with unknown edit:')
                uprint(' ' + line)
                uprint('(seg = %s; raw = %s)' % (seg, raw))
                uprint('(edits = %s)' % edits)
            return

        # Remember the character just handled (is_lengthening looks at it
        # on the next iteration), then drop it from both strings.
        last_raw = raw[-1]
        seg = seg[:-1]
        raw = raw[:-1]

    # Raw text is exhausted; leftover segmented text means the two sides
    # could not be aligned.
    if seg:
        if options.verbose:
            uprint('ignoring line with unknown edit:')
            uprint(' ' + line)
            uprint('(extra seg: %s)' % seg)
            uprint('(edits = %s)' % edits)
        return

    # Labels were collected right-to-left; return them in raw order.
    edits.reverse()
    return edits