예제 #1
0
파일: digrams.py 프로젝트: israaar/daba
def __main__():
    """Count n-gram (n = 1..3) frequencies over the test corpus.

    Tokens containing digits or punctuation are skipped; each remaining
    token is normalised with convertw() before counting.  Results are
    printed one per line as "<ngram> <frequency>".
    """
    from bamana import test
    from orthograph import convertw
    # NOTE(review): `collections`, `re` and `ngrams` are assumed to be
    # imported at module level -- they are not visible in this snippet.
    ng = collections.defaultdict(int)

    for token in test.words():
        # Skip tokens that contain digits or punctuation.
        if not re.search(r'[0-9.,;:!?]', token):
            for word in convertw(token):
                for n in range(1, 4):  # n-gram sizes 1, 2, 3
                    for gram in ngrams(word, ngram=n):
                        ng[gram] += 1

    # .items() works on both Python 2 and 3; the original mixed
    # py2-only .iteritems() with a py3-style print() call.
    for gram, freq in ng.items():
        print(u'{0} {1}'.format(gram, freq))
예제 #2
0
파일: digrams.py 프로젝트: Mompolice/daba
def __main__():
    """Count n-gram (n = 1..3) frequencies over the test corpus.

    Tokens containing digits or punctuation are skipped; each remaining
    token is normalised with convertw() before counting.  Results are
    printed one per line as "<ngram> <frequency>".
    """
    from bamana import test
    from orthograph import convertw
    # NOTE(review): `collections`, `re` and `ngrams` are assumed to be
    # imported at module level -- they are not visible in this snippet.
    ng = collections.defaultdict(int)

    for token in test.words():
        # Skip tokens that contain digits or punctuation.
        if not re.search(r'[0-9.,;:!?]', token):
            for word in convertw(token):
                for n in range(1, 4):  # n-gram sizes 1, 2, 3
                    for gram in ngrams(word, ngram=n):
                        ng[gram] += 1

    # .items() + print() replace the py2-only .iteritems() and
    # `print u'...'` statement, keeping the same output.
    for gram, freq in ng.items():
        print(u'{0} {1}'.format(gram, freq))
예제 #3
0
#!/usr/bin/python
# -*- encoding: utf-8 -*-

from __future__ import division
from bamana import test,wl,wl_detone
from orthograph import convertw,detone
from morphology import lemmatize, dict_disambiguate, print_gloss
import re

# Sorted list of distinct lower-cased word types from the corpus.
types = sorted({s.lower() for s in test.words()})

# Normalised forms for each alphabetic type and for each corpus token,
# excluding anything containing a digit or punctuation mark.
wlist = [convertw(w) for w in types if not re.search(r'[0-9.,;:!?]', w)]
wtest = [
    convertw(w)
    for w in test.words()
    if not re.search(r'[0-9.,;:!?]', w)
]

def counts(wordlist):
    """Yield a (best_stage, n_glosses, word) tuple per word in *wordlist*.

    Each *word* is an iterable of alternative orthographic forms.  A form
    carrying tone marks (i.e. changed by detone()) is lemmatized against
    the tonal dictionary `wl`, otherwise against `wl_detone`.  The maximum
    lemmatization stage over all forms and the number of glosses remaining
    after dict_disambiguate() are reported.
    """
    for word in wordlist:
        # -2 acts as "no stage seen yet"; lemmatize() stages are assumed
        # to be >= -1 -- TODO confirm against morphology.lemmatize.
        best_stage = -2
        glosses = []
        for form in word:
            if form != detone(form):
                # Tone-marked form: look it up in the tonal dictionary.
                stage, gl = lemmatize(form, wl)
            else:
                stage, gl = lemmatize(form, wl_detone)
            best_stage = max(best_stage, stage)
            glosses.extend(gl)
        # Dropped the dead `length = []` initialization from the original
        # (it was immediately overwritten with an int).
        yield (best_stage, len(dict_disambiguate(glosses)), word)
예제 #4
0
from bamana import test, wl, wl_detone
from nltk.text import ConcordanceIndex
from orthograph import convertw, detone
from morphology import lemmatize, dict_disambiguate, print_gloss
import re

# Case-insensitive concordance index over the whole corpus.
ci = ConcordanceIndex(test.words(), key=lambda s: s.lower())
# Sorted list of distinct lower-cased word types.
types = list(set([s.lower() for s in set(test.words())]))
types.sort()

# For every alphabetic type: print a 15-line concordance, then lemmatize
# each normalised form and collect its disambiguated glosses.
for word in types:
    if not re.search(r'[0-9.,;:!?]', word):
        ci.print_concordance(word, lines=15)
        print
        nw = convertw(word)
        # Prefer normalised forms that are already dictionary headwords.
        nwl = [w for w in nw if w in wl]
        if nwl:
            formlist = nwl
        else:
            formlist = nw
        result = []
        for form in formlist:
            if form != detone(form):
                # Tone-marked form: use the tonal dictionary.
                stage, gl = lemmatize(form, wl)
            else:
                stage, gl = lemmatize(form, wl_detone)
            result.extend(gl)

        glstr = [print_gloss(g) for g in dict_disambiguate(result)]
        for gs in glstr:
            # NOTE(review): this snippet is truncated here -- the body of
            # this loop is missing from the excerpt.
예제 #5
0
#!/usr/bin/python
# -*- encoding: utf-8 -*-

from bamana import wl, test
from nltk import FreqDist
from orthograph import convertw, detone

from functools import reduce  # builtin on py2; moved to functools on py3

# Frequency distribution over detoned, normalised corpus tokens.
words = []
for token in test.words():
    words.extend(convertw(detone(token)))
fd = FreqDist(words)

# For every ambiguous headword (more than one lemma) whose lemmas span
# more than one part of speech, print its corpus frequency followed by
# each lemma's POS tags and gloss on a single line.
for w, lems in wl.items():
    if len(lems) > 1:
        # Union of all lemmas' POS-tag sets; lem[1] is assumed to be a
        # set of POS tags -- TODO confirm against bamana.wl.
        pslist = reduce(lambda x, y: x.union(y), [lem[1] for lem in lems])
        if len(pslist) > 1:
            # print(..., end=' ') replaces the py2 trailing-comma form;
            # the dead commented-out polysemy condition was removed.
            print(fd[detone(w)], w, end=' ')
            for lem in lems:
                print('|', '/'.join(lem[1]), u"‘" + lem[2] + u"’", end=' ')
            print()
예제 #6
0
from bamana import test,wl,wl_detone
from nltk.text import ConcordanceIndex
from orthograph import convertw,detone
from morphology import lemmatize, dict_disambiguate, print_gloss
import re

# Case-insensitive concordance index over the whole corpus.
ci = ConcordanceIndex(test.words(), key=lambda s:s.lower())
# Sorted list of distinct lower-cased word types.
types = list(set([s.lower() for s in set(test.words())]))
types.sort()

# For every alphabetic type: print a 15-line concordance, then lemmatize
# each normalised form and collect its disambiguated glosses.
for word in types:
    if not re.search(r'[0-9.,;:!?]', word):
        ci.print_concordance(word, lines=15)
        print 
        nw = convertw(word)
        # Prefer normalised forms that are already dictionary headwords.
        nwl = [w for w in nw if w in wl]
        if nwl:
            formlist = nwl
        else:
            formlist = nw
        result = []
        for form in formlist:
            if form != detone(form):
                # Tone-marked form: use the tonal dictionary.
                stage, gl = lemmatize(form, wl)
            else:
                stage, gl = lemmatize(form,wl_detone)
            result.extend(gl)

        glstr = [print_gloss(g) for g in dict_disambiguate(result)]
        for gs in glstr:
            # NOTE(review): this snippet is truncated here -- the body of
            # this loop is missing from the excerpt.
예제 #7
0
#!/usr/bin/python
# -*- encoding: utf-8 -*-

from __future__ import division
from bamana import test,wl,wl_detone
from orthograph import convertw,detone
from morphology import lemmatize, dict_disambiguate, print_gloss
from nltk import FreqDist
import re

# Sorted list of distinct lower-cased word types from the corpus.
types = sorted({s.lower() for s in test.words()})

# Normalised forms for each alphabetic type, plus a frequency
# distribution keyed by the tuple of normalised forms of each token.
wlist = [convertw(w) for w in types if not re.search(r'[0-9.,;:!?]', w)]
fdist = FreqDist(
    tuple(convertw(w))
    for w in test.words()
    if not re.search(r'[0-9.,;:!?]', w)
)

def counts(wordlist, fd):
    """Yield (best_stage, n_glosses, joined_word, frequency) per word.

    Each *word* in *wordlist* is an iterable of alternative orthographic
    forms.  A form carrying tone marks (changed by detone()) is
    lemmatized against the tonal dictionary `wl`, otherwise against
    `wl_detone`.  *fd* is a frequency distribution keyed by the tuple of
    the word's forms.
    """
    for word in wordlist:
        # -2 acts as "no stage seen yet"; lemmatize() stages are assumed
        # to be >= -1 -- TODO confirm against morphology.lemmatize.
        best_stage = -2
        glosses = []
        for form in word:
            if form != detone(form):
                # Tone-marked form: look it up in the tonal dictionary.
                stage, gl = lemmatize(form, wl)
            else:
                stage, gl = lemmatize(form, wl_detone)
            best_stage = max(best_stage, stage)
            glosses.extend(gl)
        # Dropped the dead `length = []` initialization from the original
        # (it was immediately overwritten with an int).
        yield (best_stage, len(dict_disambiguate(glosses)),
               u' '.join(word), fd[tuple(word)])
예제 #8
0
#!/usr/bin/python
# -*- encoding: utf-8 -*-

from bamana import wl,test
from nltk import FreqDist
from orthograph import convertw, detone

from functools import reduce  # builtin on py2; moved to functools on py3

# Frequency distribution over detoned, normalised corpus tokens.
words = []
for token in test.words():
    words.extend(convertw(detone(token)))
fd = FreqDist(words)

# For every ambiguous headword (more than one lemma) whose lemmas span
# more than one part of speech, print its corpus frequency followed by
# each lemma's POS tags and gloss on a single line.
for w, lems in wl.items():
    if len(lems) > 1:
        # Union of all lemmas' POS-tag sets; lem[1] is assumed to be a
        # set of POS tags -- TODO confirm against bamana.wl.
        pslist = reduce(lambda x, y: x.union(y), [lem[1] for lem in lems])
        if len(pslist) > 1:
            # print(..., end=' ') replaces the py2 trailing-comma form;
            # the dead commented-out polysemy condition was removed.
            print(fd[detone(w)], w, end=' ')
            for lem in lems:
                print('|', '/'.join(lem[1]), u"‘" + lem[2] + u"’", end=' ')
            print()