def estimate_tag_cpd(corpus_filename, out_path, min_word_freq, update_meta=True):
    """
    Estimate P(t|w) from a corpus and store it next to the dictionary.

    A ``MorphAnalyzer`` is created from ``out_path`` (with
    ``probability_estimator_cls=None``), the conditional tag probabilities
    are estimated from ``corpus_filename``, encoded as a DAWG and saved as
    ``p_t_given_w.intdawg`` inside ``out_path``.

    ``min_word_freq`` is converted with ``int()`` before use.

    When ``update_meta`` is true, ``meta.json`` in ``out_path`` is read,
    extended with the ``P(t|w)*`` entries and written back.
    """
    # Imported here, as in the original, rather than at module level.
    from pymorphy2.opencorpora_dict.probability import (
        estimate_conditional_tag_probability,
        build_cpd_dawg,
    )

    morph = pymorphy2.MorphAnalyzer(out_path, probability_estimator_cls=None)

    logger.info("Estimating P(t|w) from %s" % corpus_filename)
    cpd, cfd = estimate_conditional_tag_probability(morph, corpus_filename)

    logger.info("Encoding P(t|w) as DAWG")
    freq_threshold = int(min_word_freq)
    cpd_dawg = build_cpd_dawg(morph, cpd, freq_threshold)
    cpd_dawg.save(os.path.join(out_path, 'p_t_given_w.intdawg'))

    if update_meta:
        logger.info("Updating meta information")
        meta_path = os.path.join(out_path, 'meta.json')
        meta = json_read(meta_path)
        # ``meta`` is extended in place, so it is expected to be list-like.
        probability_entries = [
            ('P(t|w)', True),
            ('P(t|w)_unique_words', len(cpd.conditions())),
            ('P(t|w)_outcomes', cfd.N()),
            ('P(t|w)_min_word_freq', freq_threshold),
        ]
        meta.extend(probability_entries)
        json_write(meta_path, meta)

    logger.info('\nDone.')
def save_compiled_dict(compiled_dict, out_path):
    """
    Write every part of ``compiled_dict`` into the folder ``out_path``.

    The folder receives the grammemes, one gramtab file per registered tag
    format, the binary paradigms array, the suffixes, the words DAWG, the
    prediction suffixes/prefixes DAWGs, the paradigm prefixes and, last,
    ``meta.json`` describing what has been written.
    """
    logger.info("Saving...")

    def _target(name):
        # Every output file is placed directly inside ``out_path``.
        return os.path.join(out_path, name)

    def _count_keys(dawg_obj):
        # The number of keys is obtained by walking ``iterkeys()``.
        total = 0
        for _ in dawg_obj.iterkeys():
            total += 1
        return total

    json_write(_target('grammemes.json'), compiled_dict.parsed_dict.grammemes)

    # One gramtab file per registered tag format; the format -> file name
    # mapping is recorded in the metadata below.
    gramtab_formats = {}
    for fmt_name, tag_cls in tagset.registry.items():
        tag_cls._init_grammemes(compiled_dict.parsed_dict.grammemes)
        converted_gramtab = [
            tag_cls._from_internal_tag(internal_tag)
            for internal_tag in compiled_dict.gramtab
        ]
        gramtab_file = "gramtab-%s.json" % fmt_name
        gramtab_formats[fmt_name] = gramtab_file
        json_write(_target(gramtab_file), converted_gramtab)

    # File layout: the number of paradigms, then for each paradigm its
    # length followed by its contents (``tofile``). Counts are packed as
    # little-endian unsigned shorts.
    ushort_fmt = str("<H")
    with open(_target('paradigms.array'), 'wb') as out:
        out.write(struct.pack(ushort_fmt, len(compiled_dict.paradigms)))
        for paradigm in compiled_dict.paradigms:
            out.write(struct.pack(ushort_fmt, len(paradigm)))
            paradigm.tofile(out)

    json_write(_target('suffixes.json'), compiled_dict.suffixes)
    compiled_dict.words_dawg.save(_target('words.dawg'))

    for index, suffixes_dawg in enumerate(compiled_dict.prediction_suffixes_dawgs):
        suffixes_dawg.save(_target('prediction-suffixes-%s.dawg' % index))

    dawg.DAWG(PREDICTION_PREFIXES).save(_target('prediction-prefixes.dawg'))
    json_write(_target('paradigm-prefixes.json'), PARADIGM_PREFIXES)

    logger.debug("computing metadata..")

    logger.debug(' words_dawg_len')
    words_dawg_len = _count_keys(compiled_dict.words_dawg)

    logger.debug(' prediction_suffixes_dawgs_len')
    prediction_suffixes_dawg_lengths = [
        _count_keys(suffixes_dawg)
        for suffixes_dawg in compiled_dict.prediction_suffixes_dawgs
    ]

    # Kept as an ordered list of [key, value] pairs.
    meta = [
        ['format_version', CURRENT_FORMAT_VERSION],
        ['pymorphy2_version', pymorphy2.__version__],
        ['compiled_at', datetime.datetime.utcnow().isoformat()],

        ['source', 'opencorpora.org'],
        ['source_version', compiled_dict.parsed_dict.version],
        ['source_revision', compiled_dict.parsed_dict.revision],
        ['source_lexemes_count', len(compiled_dict.parsed_dict.lexemes)],
        ['source_links_count', len(compiled_dict.parsed_dict.links)],

        ['gramtab_length', len(compiled_dict.gramtab)],
        ['gramtab_formats', gramtab_formats],
        ['paradigms_length', len(compiled_dict.paradigms)],
        ['suffixes_length', len(compiled_dict.suffixes)],

        ['words_dawg_length', words_dawg_len],
        ['prediction_options', compiled_dict.prediction_options],
        ['prediction_suffixes_dawg_lengths', prediction_suffixes_dawg_lengths],
        ['prediction_prefixes_dawg_length', len(PREDICTION_PREFIXES)],
        ['paradigm_prefixes_length', len(PARADIGM_PREFIXES)],
    ]

    json_write(_target('meta.json'), meta, indent=4)
def write_meta(filename, meta):
    """
    Store ``meta`` in the file ``filename`` via ``json_write``.

    A ``dict`` is converted to a list of its ``(key, value)`` items first;
    any other value is written as given.
    """
    payload = list(meta.items()) if isinstance(meta, dict) else meta
    json_write(filename, payload)