예제 #1
0
def chunking(token):
    """Return the chunks of *token*'s first syllabification variant,
    each decomposed to Unicode NFD form.

    Parameters:
        token -- the wordform string to syllabify.

    Returns:
        list of NFD-normalized chunk strings.
    """
    # Only the first syllabification variant is used; each chunk is
    # canonically decomposed (NFD) so combining marks become separate
    # codepoints.
    return [unicodedata.normalize('NFD', chunk)
            for chunk in syllabify(token)[0]]
예제 #2
0
def deaffix(word, affix):
    """wordform, affix -> (stem, affixlist)

    Strip *affix* from *word* if any syllabification variant matches one
    of the affix's allomorphs.  A negative cut point (affix[1]) means a
    suffix, a positive one a prefix.  When nothing matches, *word* is
    returned unchanged with an empty affix pair.
    """
    result = ()
    cut = affix[1]
    # Try every syllabification variant; a later match overwrites an
    # earlier one, so the last matching variant/allomorph wins.
    for variant in syllabify(word):
        for allomorph in affix[0]:
            if not match_affix(variant, allomorph, cut):
                continue
            if cut < 0:
                # Suffix: stem is everything before the cut point.
                stem = ''.join(variant[:cut])
                pair = (), (''.join(variant[cut:]), affix[2])
            else:
                # Prefix: stem is everything after the cut point.
                stem = ''.join(variant[cut:])
                pair = (''.join(variant[:cut]), affix[2]), ()
            result = (stem, pair)
    return result if result else (word, ((), ()))
예제 #3
0
def parse_composite(gloss, glossary):
    """gloss -> [gloss]

    Try to analyse the stem of *gloss* as a composite of several lemmas,
    using the splitting schemes in `composite_order` and *glossary* for
    lemma lookup.  Returns a list of candidate glosses (possibly empty).

    NOTE(review): Python 2 only — `zip(*gl)[3]` / `zip(*glosses)[0]`
    subscript the result of zip(), which is a list in Py2 but an
    iterator in Py3.
    """
    result = []
    # A gloss is a 4-tuple: (stem, affix list, part-of-speech list, gloss text).
    stem, afflist, pslist, ge = gloss
    # syllabify stem
    syls = syllabify(stem)
    for scheme in composite_order:
        # Only schemes whose part-of-speech pattern matches this gloss apply.
        if psmatch(pslist, scheme['pslist']):
            for variant in syls:
                # Only variants longer than two units are split — presumably
                # a composite needs at least three syllables; verify intent.
                if len(variant) > 2:
                    splits = split_composite(variant)
                    for split in splits:
                        # Cartesian product of glossary entries for each part.
                        # NOTE(review): `stem` in this list comprehension shadows
                        # the outer `stem` (Py2 comprehension scope leaks) —
                        # harmless here since `stem` is not read afterwards,
                        # but fragile under refactoring.
                        possible_gloss = product(*[lookup_lemma(strtolemma(stem), glossary) for stem in split])
                        # Keep only combinations where every part has a
                        # non-empty gloss-text field (index 3).
                        possible_gloss = [gl for gl in possible_gloss if reduce(and_, map(truth, zip(*gl)[3]))]
                        for glosses in possible_gloss:
                            for template in scheme['templates']:
                                if match_template(glosses, template):
                                    # Join the parts' stems and gloss texts with
                                    # '.' into a single candidate gloss tuple.
                                    result.append(('.'.join(zip(*glosses)[0]), afflist, psmatch(pslist,scheme['pslist']), '.'.join(zip(*glosses)[3])))

    return result
예제 #4
0
#!/usr/bin/python
# -*- encoding: utf-8 -*-

from morphology import wl
from syllables import syllabify
import unicodedata

for i, lx in wl.items():
    for lemma in lx:
        if 'n' in lemma[1].split('/'):
            t = u''.join(syllabify(lemma[0])[0][1:])
            for c in unicodedata.normalize('NFD', t):
                if unicodedata.category(c) == 'Mn':
                    print lemma[0].encode('utf-8'), "\t", "‘" + lemma[2].encode('utf-8') + "’"
                    break