def load_dict(path, gramtab_format='opencorpora-int'):
    """
    Load pymorphy2 dictionary.

    ``path`` is a folder name with dictionary data; ``gramtab_format``
    selects both the Tag class and the gramtab file to load.

    Returns a ``LoadedDictionary`` built from the files found in ``path``.
    """
    def in_dict_dir(name):
        # Every dictionary file lives directly inside ``path``.
        return os.path.join(path, name)

    meta = _load_meta(in_dict_dir('meta.json'))
    _assert_format_is_compatible(meta, path)

    Tag = _load_tag_class(gramtab_format, in_dict_dir('grammemes.json'))

    # The gramtab file stores tags as strings; wrap each one in the Tag class.
    gramtab = [
        Tag(tag_str)
        for tag_str in _load_gramtab(meta, gramtab_format, path)
    ]

    suffixes = json_read(in_dict_dir('suffixes.json'))
    paradigm_prefixes = json_read(in_dict_dir('paradigm-prefixes.json'))
    paradigms = _load_paradigms(in_dict_dir('paradigms.array'))
    words = dawg.WordsDawg().load(in_dict_dir('words.dawg'))

    prediction_prefixes = dawg.DAWG().load(
        in_dict_dir('prediction-prefixes.dawg')
    )

    # One prediction-suffixes DAWG file is expected per paradigm prefix,
    # numbered by the prefix position.
    prediction_suffixes_dawgs = []
    for prefix_id in range(len(paradigm_prefixes)):
        suffixes_dawg_path = in_dict_dir(
            'prediction-suffixes-%s.dawg' % prefix_id
        )
        # NOTE(review): this check disappears under ``python -O``;
        # consider raising an explicit exception instead.
        assert os.path.exists(suffixes_dawg_path)
        suffixes_dawg = dawg.PredictionSuffixesDAWG().load(suffixes_dawg_path)
        prediction_suffixes_dawgs.append(suffixes_dawg)

    return LoadedDictionary(
        meta,
        gramtab,
        suffixes,
        paradigms,
        words,
        prediction_prefixes,
        prediction_suffixes_dawgs,
        Tag,
        paradigm_prefixes,
    )
def estimate_tag_cpd(corpus_filename, out_path, min_word_freq, update_meta=True):
    """
    Estimate P(t|w) from a corpus and store it next to the dictionary.

    A ``MorphAnalyzer`` is created for the dictionary in ``out_path``
    (with no probability estimator), the conditional tag probabilities are
    estimated from ``corpus_filename``, encoded as a DAWG and saved to
    ``p_t_given_w.intdawg`` inside ``out_path``.

    ``min_word_freq`` is converted with ``int()`` before use. When
    ``update_meta`` is true, the ``P(t|w)`` entries are appended to
    ``meta.json`` in ``out_path``.
    """
    # Imported here rather than at module level, as in the original code.
    from pymorphy2.opencorpora_dict.probability import (
        estimate_conditional_tag_probability, build_cpd_dawg)

    morph = pymorphy2.MorphAnalyzer(out_path, probability_estimator_cls=None)

    logger.info("Estimating P(t|w) from %s" % corpus_filename)
    cpd, cfd = estimate_conditional_tag_probability(morph, corpus_filename)

    logger.info("Encoding P(t|w) as DAWG")
    min_freq = int(min_word_freq)
    cpd_dawg = build_cpd_dawg(morph, cpd, min_freq)
    cpd_dawg.save(os.path.join(out_path, 'p_t_given_w.intdawg'))

    if update_meta:
        logger.info("Updating meta information")
        meta_filename = os.path.join(out_path, 'meta.json')
        meta = json_read(meta_filename)

        # The stored meta is extended in place with (key, value) pairs.
        probability_entries = [
            ('P(t|w)', True),
            ('P(t|w)_unique_words', len(cpd.conditions())),
            ('P(t|w)_outcomes', cfd.N()),
            ('P(t|w)_min_word_freq', min_freq),
        ]
        meta.extend(probability_entries)
        json_write(meta_filename, meta)

    logger.info('\nDone.')
def _load_gramtab(meta, gramtab_format, path): """ Load gramtab (a list of tags) """ gramtab_formats = meta.get('gramtab_formats', {}) if gramtab_format not in gramtab_formats: raise ValueError("This gramtab format (%s) is unavailable; available formats: %s" % (gramtab_format, gramtab_formats.keys())) gramtab_filename = os.path.join(path, gramtab_formats[gramtab_format]) return json_read(gramtab_filename)
def _load_tag_class(gramtab_format, grammemes_filename):
    """
    Load and initialize Tag class (according to ``gramtab_format``).

    The class is looked up in ``tagset.registry`` and initialized with the
    grammemes read from ``grammemes_filename``.

    Raises ``ValueError`` if ``gramtab_format`` is not in the registry.
    """
    if gramtab_format not in tagset.registry:
        message = "This gramtab format ('%s') is unsupported." % gramtab_format
        raise ValueError(message)

    # FIXME: clone the class (a ``_clone_class()`` call was sketched here);
    # as written, the registered class itself is initialized and returned.
    tag_cls = tagset.registry[gramtab_format]
    tag_cls._init_grammemes(json_read(grammemes_filename))
    return tag_cls
def _load_tag_class(gramtab_format, grammemes_filename):
    """
    Load and initialize Tag class (according to ``gramtab_format``).

    Grammemes are read from ``grammemes_filename`` and passed to the
    ``_init_grammemes`` method of the class registered for
    ``gramtab_format`` in ``tagset.registry``; that class is returned.

    Raises ``ValueError`` if ``gramtab_format`` is not in the registry.
    """
    if gramtab_format not in tagset.registry:
        message = "This gramtab format ('%s') is unsupported." % gramtab_format
        raise ValueError(message)

    grammeme_data = json_read(grammemes_filename)

    # FIXME: clone the class instead of initializing the registered one,
    # e.g. by subclassing it with its own copy of KNOWN_GRAMMEMES:
    #   type(Tag.__name__, (Tag,),
    #        {'KNOWN_GRAMMEMES': Tag.KNOWN_GRAMMEMES.copy()})
    tag_cls = tagset.registry[gramtab_format]
    tag_cls._init_grammemes(grammeme_data)
    return tag_cls
def _load_meta(filename):
    """
    Load metadata.

    The file is read with ``json_read`` (``parse_float=str`` is passed
    through) and the result is converted to an ``OrderedDict`` when
    ``collections`` provides one, otherwise to a plain ``dict``.
    """
    raw_meta = json_read(filename, parse_float=str)

    # Fall back to ``dict`` on interpreters without ``OrderedDict``.
    mapping_cls = getattr(collections, 'OrderedDict', dict)
    return mapping_cls(raw_meta)