import collections
import re

from bamana import test
from orthograph import convertw
# ngrams() is not imported here; it is assumed to be a project helper that
# yields n-grams of length `ngram` from a word (see the sketch after this script).

def main():
    ng = collections.defaultdict(int)
    for i in test.words():
        # skip tokens containing digits or punctuation
        if not re.search(r'[0-9.,;:!?]', i):
            for w in convertw(i):
                # count n-grams of length 1, 2 and 3 for each converted form
                for n in range(1, 4):
                    for g in ngrams(w, ngram=n):
                        ng[g] += 1
    for g, f in ng.iteritems():
        print u'{0} {1}'.format(g, f)

if __name__ == '__main__':
    main()
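# The ngrams() helper used above is not shown in this section. A minimal
# sketch, under the assumption that it yields character n-grams of length
# `ngram` from a string (the name and signature follow the call above):
def ngrams(seq, ngram=2):
    """Yield successive n-grams of length `ngram` from `seq` (hypothetical helper)."""
    for start in range(len(seq) - ngram + 1):
        yield seq[start:start + ngram]

# Example: list(ngrams(u'bamana', ngram=2)) -> [u'ba', u'am', u'ma', u'an', u'na']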
#!/usr/bin/python
# -*- encoding: utf-8 -*-
from __future__ import division
import re

from bamana import test, wl, wl_detone
from orthograph import convertw, detone
from morphology import lemmatize, dict_disambiguate, print_gloss

# unique lower-cased word types from the test corpus
types = sorted(set(s.lower() for s in test.words()))
# orthographically converted forms, with digits/punctuation filtered out
wlist = [convertw(w) for w in types if not re.search(r'[0-9.,;:!?]', w)]
wtest = [convertw(w) for w in test.words() if not re.search(r'[0-9.,;:!?]', w)]

def counts(wordlist):
    """Yield (lemmatization stage, number of glosses, word) for each word."""
    for word in wordlist:
        stages = -2
        result = []
        for form in word:
            # tonal forms are looked up in wl, detoned forms in wl_detone
            if form != detone(form):
                stage, gl = lemmatize(form, wl)
            else:
                stage, gl = lemmatize(form, wl_detone)
            if stages < stage:
                stages = stage
            result.extend(gl)
        length = len(dict_disambiguate(result))
        yield (stages, length, word)
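# A short usage sketch for counts() (this aggregation is illustrative, not
# part of the original script): tally how many word types reach each
# lemmatization stage.
if __name__ == '__main__':
    from collections import defaultdict
    by_stage = defaultdict(int)
    for stage, nglosses, word in counts(wlist):
        by_stage[stage] += 1
    for stage in sorted(by_stage):
        print stage, by_stage[stage]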
import re

from bamana import test, wl, wl_detone
from nltk.text import ConcordanceIndex
from orthograph import convertw, detone
from morphology import lemmatize, dict_disambiguate, print_gloss

ci = ConcordanceIndex(test.words(), key=lambda s: s.lower())
types = sorted(set(s.lower() for s in test.words()))

for word in types:
    if not re.search(r'[0-9.,;:!?]', word):
        # show up to 15 concordance lines for the word type
        ci.print_concordance(word, lines=15)
        print
        nw = convertw(word)
        # prefer converted forms that are attested in the dictionary
        nwl = [w for w in nw if w in wl]
        if nwl:
            formlist = nwl
        else:
            formlist = nw
        result = []
        for form in formlist:
            if form != detone(form):
                stage, gl = lemmatize(form, wl)
            else:
                stage, gl = lemmatize(form, wl_detone)
            result.extend(gl)
        glstr = [print_gloss(g) for g in dict_disambiguate(result)]
        for gs in glstr:
            # print each disambiguated gloss line
            print gs
#!/usr/bin/python
# -*- encoding: utf-8 -*-
from bamana import wl, test
from nltk import FreqDist
from orthograph import convertw, detone

# frequency distribution over converted, detoned tokens of the test corpus
words = []
for i in test.words():
    words.extend(convertw(detone(i)))
fd = FreqDist(words)

# print dictionary entries whose lemmas cover more than one part of speech,
# together with the corpus frequency of the (detoned) headword
for w, lems in wl.items():
    if len(lems) > 1:
        pslist = reduce(lambda x, y: x.union(y), [lem[1] for lem in lems])
        if len(pslist) > 1:
        #if len(pslist) == 1:  # polysemy case
            print fd[detone(w)], w,
            for l in lems:
                print '|', '/'.join(l[1]), u"‘" + l[2] + u"’",
            print
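# The loop above assumes each value in wl is a list of lemma entries in which
# entry[1] is a set of part-of-speech tags (it supports .union()) and entry[2]
# is a gloss string. A purely hypothetical entry illustrating that assumed shape:
#
#   wl[u'xxx'] = [(u'xxx', set([u'n']), u'gloss one'),
#                 (u'xxx', set([u'v']), u'gloss two')]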
#!/usr/bin/python
# -*- encoding: utf-8 -*-
from __future__ import division
import re

from bamana import test, wl, wl_detone
from orthograph import convertw, detone
from morphology import lemmatize, dict_disambiguate, print_gloss
from nltk import FreqDist

types = sorted(set(s.lower() for s in test.words()))
wlist = [convertw(w) for w in types if not re.search(r'[0-9.,;:!?]', w)]
# token frequencies keyed by the tuple of converted forms
fdist = FreqDist(tuple(convertw(w)) for w in test.words()
                 if not re.search(r'[0-9.,;:!?]', w))

def counts(wordlist, fd):
    """Yield (stage, number of glosses, joined word, corpus frequency)."""
    for word in wordlist:
        stages = -2
        result = []
        for form in word:
            if form != detone(form):
                stage, gl = lemmatize(form, wl)
            else:
                stage, gl = lemmatize(form, wl_detone)
            if stages < stage:
                stages = stage
            result.extend(gl)
        length = len(dict_disambiguate(result))
        yield (stages, length, u' '.join(word), fd[tuple(word)])
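# Usage sketch (not part of the original script): print a tab-separated report
# of the lemmatization results, weighted by corpus frequency.
if __name__ == '__main__':
    for stage, nglosses, word, freq in counts(wlist, fdist):
        print u'{0}\t{1}\t{2}\t{3}'.format(stage, nglosses, word, freq)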