import unicodedata
from syllables import syllabify

def chunking(token):
    'wordform -> [NFD-normalized syllables]'
    # Take the first syllabification variant and decompose each
    # syllable into base characters plus combining marks (NFD).
    chunks = []
    for chunk in syllabify(token)[0]:
        chunks.append(unicodedata.normalize('NFD', chunk))
    return chunks
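# For reference: a minimal stdlib illustration of what NFD does to a
# precomposed character (independent of syllabify; toy input only).
# u'\xe1' (a-acute) decomposes into base 'a' plus the combining acute.
example = unicodedata.normalize('NFD', u'b\xe1')
print [hex(ord(c)) for c in example]   # ['0x62', '0x61', '0x301']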
def deaffix(word, affix):
    'wordform, affix -> (stem, affixlist)'
    # affix is a triple: (allomorphs, syllable offset, label); a negative
    # offset marks a suffix, a non-negative one a prefix.
    stemmed = ()
    syls = syllabify(word)
    for v in syls:
        for allomorph in affix[0]:
            if match_affix(v, allomorph, affix[1]):
                if affix[1] < 0:
                    # suffix: stem is everything before the final syllable(s)
                    lemma = ''.join(v[:affix[1]])
                    aff = (), (''.join(v[affix[1]:]), affix[2])
                else:
                    # prefix: stem is everything after the initial syllable(s)
                    lemma = ''.join(v[affix[1]:])
                    aff = (''.join(v[:affix[1]]), affix[2]), ()
                stemmed = (lemma, aff)
    if stemmed:
        return stemmed
    else:
        return (word, ((), ()))
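# A hypothetical usage sketch of deaffix. The stub syllabify and
# match_affix below, and the toy suffix entry, are illustrative
# assumptions only; they stand in for the real helpers in this demo.
def syllabify(word):
    # toy: every two characters form one syllable; one variant only
    return [[word[i:i + 2] for i in range(0, len(word), 2)]]

def match_affix(variant, allomorph, offset):
    # toy: compare the syllables at the word edge with the allomorph
    edge = variant[offset:] if offset < 0 else variant[:offset]
    return ''.join(edge) == allomorph

suffix = (['lu'], -1, 'PL')            # allomorphs, offset, label
print deaffix('babalu', suffix)        # -> ('baba', ((), ('lu', 'PL')))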
from itertools import product
from operator import and_, truth

def parse_composite(gloss, glossary):
    'gloss -> [gloss]'
    result = []
    stem, afflist, pslist, ge = gloss
    # syllabify stem
    syls = syllabify(stem)
    for scheme in composite_order:
        ps = psmatch(pslist, scheme['pslist'])
        if ps:
            for variant in syls:
                if len(variant) > 2:
                    splits = split_composite(variant)
                    for split in splits:
                        # look up every part of the split and combine the
                        # candidate glosses cartesian-product style
                        possible_gloss = product(*[lookup_lemma(strtolemma(part), glossary)
                                                   for part in split])
                        # keep only combinations where every part has a
                        # non-empty gloss field
                        possible_gloss = [gl for gl in possible_gloss
                                          if reduce(and_, map(truth, zip(*gl)[3]))]
                        for glosses in possible_gloss:
                            for template in scheme['templates']:
                                if match_template(glosses, template):
                                    result.append(('.'.join(zip(*glosses)[0]),
                                                   afflist,
                                                   ps,
                                                   '.'.join(zip(*glosses)[3])))
    return result
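# Illustration of the zip(*...) transposition used above, with toy
# gloss tuples of the assumed shape (stem, afflist, pslist, ge):
glosses = (('ba', (), 'n', 'arm'), ('lu', (), 'n', 'water'))
print zip(*glosses)[0]             # ('ba', 'lu')  -- stem column
print '.'.join(zip(*glosses)[3])   # 'arm.water'   -- joined gloss column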
#!/usr/bin/python
# -*- encoding: utf-8 -*-
from morphology import wl
from syllables import syllabify
import unicodedata

# Print every noun lemma that carries a combining mark (category 'Mn',
# e.g. a diacritic) on a non-initial syllable, together with its gloss.
for i, lx in wl.items():
    for lemma in lx:
        if 'n' in lemma[1].split('/'):
            # join all syllables after the first one
            t = u''.join(syllabify(lemma[0])[0][1:])
            for c in unicodedata.normalize('NFD', t):
                if unicodedata.category(c) == 'Mn':
                    print lemma[0].encode('utf-8'), "\t", "‘" + lemma[2].encode('utf-8') + "’"
                    break
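# For reference: the stdlib category check used above, on toy characters.
print unicodedata.category(u'\u0301')   # 'Mn' -- combining acute accent
print unicodedata.category(u'a')        # 'Ll' -- plain lowercase letter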