# coding=utf-8
from collections import Counter
from plp import PLP

# Single module-level PLP instance used by basic_form().
p = PLP()


def basic_form(word):
    """Return the base form of ``word`` as reported by the PLP instance.

    The word is looked up with ``p.rec``. When the lookup yields at least
    one id, the result of ``p.bform`` for the first id is returned;
    otherwise ``word`` itself is returned unchanged.
    """
    recognised = p.rec(word)
    if len(recognised) == 0:
        return word
    return p.bform(recognised[0])


def stats_sorted(stats):
    """Return the pairs in ``stats`` as a list, largest second element first."""
    ordered = list(stats)
    ordered.sort(key=lambda pair: pair[1], reverse=True)
    return ordered


def ranking(words):
    """Count the base forms of ``words`` and rank them by frequency.

    Returns a 2-tuple ``(full, top)``: ``full`` holds every
    ``(base_form, count)`` pair and ``top`` holds the 100 most common
    pairs, both ordered by descending count.
    """
    counts = Counter(basic_form(word) for word in words)
    full = stats_sorted(counts.items())
    top = stats_sorted(counts.most_common(100))
    return full, top
def getForms(bodziec):
    """Return ``PLP().forms`` for the first id that ``orec`` yields for ``bodziec``.

    The previous implementation indexed the result of ``map(...)``, which
    only works where ``map`` returns a list (Python 2); on Python 3 ``map``
    returns an iterator and indexing it raises ``TypeError``. It also built
    a new ``PLP`` instance for every id and computed forms for all ids even
    though only the first result was used.

    Raises:
        IndexError: if ``orec`` returns no ids for ``bodziec`` (same as the
            original list-indexing behaviour).
    """
    plp = PLP()
    # Materialise so that indexing works whatever iterable orec() returns.
    ids = list(plp.orec(bodziec))
    return plp.forms(ids[0])
# coding: utf-8 import codecs from collections import defaultdict from plp import PLP __author__ = "Michał Ciołczyk" _FILENAME = "data/odm.txt" _ENCODING = "windows-1250" _basic_forms = defaultdict(list) _initialized = False _plp = PLP() _SIE = ' się' def _load_flection_map(): global _initialized if not _initialized: with codecs.open(_FILENAME, 'r', encoding=_ENCODING) as f: for line in f: forms = line.rstrip('\n').split(', ') bform = forms[0] for form in forms: _basic_forms[form].append(bform) for form, bforms in _basic_forms.items(): _basic_forms[form] = list(set(bforms)) _initialized = True def _strip_sie(form):
def setUp(self):
    """Build a ``PLP`` instance from the hard-coded library path and keep it on ``self.plp``."""
    library_path = '/usr/local/clp/lib/libclp_2.6.so'
    self.plp = PLP(library_path)