示例#1
0
def estimate_tag_cpd(corpus_filename,
                     out_path,
                     min_word_freq,
                     update_meta=True):
    """
    Estimate P(t|w) from a corpus and save it into a dictionary folder.

    The dictionary found in ``out_path`` is loaded without a probability
    estimator, conditional tag probabilities are estimated from
    ``corpus_filename``, encoded as a DAWG and written to
    ``<out_path>/p_t_given_w.intdawg``.

    :param corpus_filename: corpus handed to
        ``estimate_conditional_tag_probability``.
    :param out_path: dictionary folder; it is read by ``MorphAnalyzer``
        and receives the resulting DAWG (and the updated ``meta.json``).
    :param min_word_freq: anything ``int()`` accepts; the converted value
        is passed to ``build_cpd_dawg`` and recorded in the metadata.
    :param update_meta: when true, P(t|w) entries are appended to
        ``<out_path>/meta.json``.
    :raises ValueError, TypeError: if ``min_word_freq`` can't be converted
        to ``int``.
    """
    from pymorphy2.opencorpora_dict.probability import (
        estimate_conditional_tag_probability, build_cpd_dawg)

    # Convert once and up front: a bad value now fails before the
    # dictionary is loaded and the corpus is processed, not after.
    min_word_freq = int(min_word_freq)

    m = pymorphy2.MorphAnalyzer(out_path, probability_estimator_cls=None)

    logger.info("Estimating P(t|w) from %s", corpus_filename)
    cpd, cfd = estimate_conditional_tag_probability(m, corpus_filename)

    logger.info("Encoding P(t|w) as DAWG")
    d = build_cpd_dawg(m, cpd, min_word_freq)
    dawg_filename = os.path.join(out_path, 'p_t_given_w.intdawg')
    d.save(dawg_filename)

    if update_meta:
        logger.info("Updating meta information")
        meta_filename = os.path.join(out_path, 'meta.json')
        # The existing metadata is extended in place with (key, value)
        # pairs and written back to the same file.
        meta = json_read(meta_filename)
        meta.extend([
            ('P(t|w)', True),
            ('P(t|w)_unique_words', len(cpd.conditions())),
            ('P(t|w)_outcomes', cfd.N()),
            ('P(t|w)_min_word_freq', min_word_freq),
        ])
        json_write(meta_filename, meta)

    logger.info('\nDone.')
示例#2
0
文件: cli.py 项目: alafin/pymorphy2
def estimate_tag_cpd(corpus_filename, out_path, min_word_freq, update_meta=True):
    """
    Estimate P(t|w) from a corpus and save it into a dictionary folder.

    The dictionary found in ``out_path`` is loaded without a probability
    estimator, conditional tag probabilities are estimated from
    ``corpus_filename``, encoded as a DAWG and written to
    ``<out_path>/p_t_given_w.intdawg``.

    :param corpus_filename: corpus handed to
        ``estimate_conditional_tag_probability``.
    :param out_path: dictionary folder; it is read by ``MorphAnalyzer``
        and receives the resulting DAWG (and the updated ``meta.json``).
    :param min_word_freq: anything ``int()`` accepts; the converted value
        is passed to ``build_cpd_dawg`` and recorded in the metadata.
    :param update_meta: when true, P(t|w) entries are appended to
        ``<out_path>/meta.json``.
    :raises ValueError, TypeError: if ``min_word_freq`` can't be converted
        to ``int``.
    """
    from pymorphy2.opencorpora_dict.probability import (
        estimate_conditional_tag_probability, build_cpd_dawg)

    # Convert once and up front: a bad value now fails before the
    # dictionary is loaded and the corpus is processed, not after.
    min_word_freq = int(min_word_freq)

    m = pymorphy2.MorphAnalyzer(out_path, probability_estimator_cls=None)

    logger.info("Estimating P(t|w) from %s", corpus_filename)
    cpd, cfd = estimate_conditional_tag_probability(m, corpus_filename)

    logger.info("Encoding P(t|w) as DAWG")
    d = build_cpd_dawg(m, cpd, min_word_freq)
    dawg_filename = os.path.join(out_path, 'p_t_given_w.intdawg')
    d.save(dawg_filename)

    if update_meta:
        logger.info("Updating meta information")
        meta_filename = os.path.join(out_path, 'meta.json')
        # The existing metadata is extended in place with (key, value)
        # pairs and written back to the same file.
        meta = json_read(meta_filename)
        meta.extend([
            ('P(t|w)', True),
            ('P(t|w)_unique_words', len(cpd.conditions())),
            ('P(t|w)_outcomes', cfd.N()),
            ('P(t|w)_min_word_freq', min_word_freq),
        ])
        json_write(meta_filename, meta)

    logger.info('\nDone.')
示例#3
0
def save_compiled_dict(compiled_dict, out_path):
    """
    Write all parts of ``compiled_dict`` into the ``out_path`` folder.

    ``out_path`` is the name of the folder that receives the dictionary
    files: JSON tables, the paradigms array, the DAWGs and ``meta.json``.
    """
    logger.info("Saving...")

    def _f(path):
        # Resolve a file name relative to the output folder.
        return os.path.join(out_path, path)

    parsed = compiled_dict.parsed_dict

    json_write(_f('grammemes.json'), parsed.grammemes)

    # One gramtab file per registered tag format; the format -> file name
    # mapping is recorded in the metadata below.
    gramtab_formats = {}
    for fmt_name, tag_cls in tagset.registry.items():
        tag_cls._init_grammemes(parsed.grammemes)
        converted_gramtab = [
            tag_cls._from_internal_tag(tag) for tag in compiled_dict.gramtab
        ]

        gramtab_name = "gramtab-%s.json" % fmt_name
        gramtab_formats[fmt_name] = gramtab_name

        json_write(_f(gramtab_name), converted_gramtab)

    # Layout: number of paradigms, then for each paradigm its length
    # followed by its raw contents; every count is packed with "<H".
    count_format = str("<H")
    with open(_f('paradigms.array'), 'wb') as out:
        out.write(struct.pack(count_format, len(compiled_dict.paradigms)))
        for paradigm in compiled_dict.paradigms:
            out.write(struct.pack(count_format, len(paradigm)))
            paradigm.tofile(out)

    json_write(_f('suffixes.json'), compiled_dict.suffixes)
    compiled_dict.words_dawg.save(_f('words.dawg'))

    for prefix_id, suffixes_dawg in enumerate(compiled_dict.prediction_suffixes_dawgs):
        suffixes_dawg.save(_f('prediction-suffixes-%s.dawg' % prefix_id))

    dawg.DAWG(PREDICTION_PREFIXES).save(_f('prediction-prefixes.dawg'))
    json_write(_f('paradigm-prefixes.json'), PARADIGM_PREFIXES)

    logger.debug("computing metadata..")

    def _count_keys(d):
        # The length is obtained by iterating over all keys.
        return sum(1 for _ in d.iterkeys())

    logger.debug('  words_dawg_len')
    words_dawg_len = _count_keys(compiled_dict.words_dawg)
    logger.debug('  prediction_suffixes_dawgs_len')

    prediction_suffixes_dawg_lengths = [
        _count_keys(suffixes_dawg)
        for suffixes_dawg in compiled_dict.prediction_suffixes_dawgs
    ]

    # Metadata is an ordered list of [key, value] pairs.
    meta = [
        ['format_version', CURRENT_FORMAT_VERSION],
        ['pymorphy2_version', pymorphy2.__version__],
        ['compiled_at', datetime.datetime.utcnow().isoformat()],

        ['source', 'opencorpora.org'],
        ['source_version', parsed.version],
        ['source_revision', parsed.revision],
        ['source_lexemes_count', len(parsed.lexemes)],
        ['source_links_count', len(parsed.links)],

        ['gramtab_length', len(compiled_dict.gramtab)],
        ['gramtab_formats', gramtab_formats],
        ['paradigms_length', len(compiled_dict.paradigms)],
        ['suffixes_length', len(compiled_dict.suffixes)],

        ['words_dawg_length', words_dawg_len],
        ['prediction_options', compiled_dict.prediction_options],
        ['prediction_suffixes_dawg_lengths', prediction_suffixes_dawg_lengths],
        ['prediction_prefixes_dawg_length', len(PREDICTION_PREFIXES)],
        ['paradigm_prefixes_length', len(PARADIGM_PREFIXES)],
    ]

    json_write(_f('meta.json'), meta, indent=4)
示例#4
0
def write_meta(filename, meta):
    """ Write metadata to ``filename``; a dict is stored as a list of its items. """
    pairs = list(meta.items()) if isinstance(meta, dict) else meta
    json_write(filename, pairs)
示例#5
0
def write_meta(filename, meta):
    """ Store metadata in a file, converting a dict to a list of its items first. """
    if not isinstance(meta, dict):
        # Anything that is not a dict is written as-is.
        json_write(filename, meta)
        return
    json_write(filename, list(meta.items()))