Пример #1
0
def add_conditional_tag_probability(corpus_filename, out_path, min_word_freq,
                                    logger=None, morph=None):
    """ Add P(t|w) estimates to a compiled dictionary.

    The estimates are computed from ``corpus_filename``, encoded as a DAWG
    saved to ``<out_path>/p_t_given_w.intdawg``, and summarised in
    ``<out_path>/meta.json``.

    :param corpus_filename: corpus handed to
        ``estimate_conditional_tag_probability``.
    :param out_path: directory of the compiled dictionary; the DAWG file is
        written here and ``meta.json`` in this directory is updated.
    :param min_word_freq: threshold forwarded to ``build_cpd_dawg`` and
        recorded in the meta file; any value accepted by ``int()``.
    :param logger: logger used for progress messages; defaults to
        ``logging.getLogger(__name__)``.
    :param morph: analyzer to use; when omitted, a ``MorphAnalyzer`` is
        created for ``out_path`` with ``probability_estimator_cls=None``.
    :raises ValueError: if ``min_word_freq`` cannot be converted to ``int``
        (``TypeError`` for unsupported types).
    """
    # Convert once, and before any other work, so that an invalid threshold
    # is reported immediately instead of after the estimation step has run.
    min_word_freq = int(min_word_freq)

    if morph is None:
        morph = MorphAnalyzer(out_path, probability_estimator_cls=None)

    if logger is None:
        logger = logging.getLogger(__name__)

    # Pass the argument to the logger instead of pre-formatting the message:
    # interpolation is then skipped when INFO records are filtered out.
    logger.info("Estimating P(t|w) from %s", corpus_filename)
    cpd, cfd = estimate_conditional_tag_probability(morph, corpus_filename, logger)

    logger.info("Encoding P(t|w) as DAWG")
    d = build_cpd_dawg(morph, cpd, min_word_freq)
    dawg_filename = os.path.join(out_path, 'p_t_given_w.intdawg')
    d.save(dawg_filename)

    logger.info("Updating meta information")
    meta_filename = os.path.join(out_path, 'meta.json')
    update_meta(meta_filename, [
        ('P(t|w)', True),
        ('P(t|w)_unique_words', len(cpd.conditions())),
        ('P(t|w)_outcomes', cfd.N()),
        ('P(t|w)_min_word_freq', min_word_freq),
    ])
    logger.info('\nDone.')
Пример #2
0
def add_conditional_tag_probability(corpus_filename,
                                    out_path,
                                    min_word_freq,
                                    logger=None,
                                    morph=None):
    """ Add P(t|w) estimates to a compiled dictionary.

    Steps performed, in order:

    1. estimate the conditional tag probabilities from ``corpus_filename``;
    2. encode them with ``build_cpd_dawg`` and save the result as
       ``p_t_given_w.intdawg`` inside ``out_path``;
    3. update ``meta.json`` inside ``out_path`` with a ``P(t|w)`` flag, the
       number of conditions, the outcome count and the frequency threshold.

    :param corpus_filename: corpus passed on to
        ``estimate_conditional_tag_probability``.
    :param out_path: compiled dictionary directory that receives the DAWG
        file and whose ``meta.json`` is updated.
    :param min_word_freq: threshold given to ``build_cpd_dawg``; converted
        with ``int()``.
    :param logger: progress logger; ``logging.getLogger(__name__)`` is used
        when none is supplied.
    :param morph: analyzer instance; when ``None`` a ``MorphAnalyzer`` is
        built for ``out_path`` with ``probability_estimator_cls=None``.
    :raises ValueError: if ``min_word_freq`` is not convertible to ``int``
        (``TypeError`` for unsupported types).
    """
    # Validate/convert the threshold a single time and up front: a bad value
    # should fail before the estimation step, not after it.
    min_word_freq = int(min_word_freq)

    if morph is None:
        morph = MorphAnalyzer(out_path, probability_estimator_cls=None)

    if logger is None:
        logger = logging.getLogger(__name__)

    # Lazy %-style arguments: the logger formats the message only if the
    # record is actually emitted.
    logger.info("Estimating P(t|w) from %s", corpus_filename)
    cpd, cfd = estimate_conditional_tag_probability(morph, corpus_filename,
                                                    logger)

    logger.info("Encoding P(t|w) as DAWG")
    d = build_cpd_dawg(morph, cpd, min_word_freq)
    dawg_filename = os.path.join(out_path, 'p_t_given_w.intdawg')
    d.save(dawg_filename)

    logger.info("Updating meta information")
    meta_filename = os.path.join(out_path, 'meta.json')
    update_meta(meta_filename, [
        ('P(t|w)', True),
        ('P(t|w)_unique_words', len(cpd.conditions())),
        ('P(t|w)_outcomes', cfd.N()),
        ('P(t|w)_min_word_freq', min_word_freq),
    ])
    logger.info('\nDone.')
Пример #3
0
        if args['--clear']:
            shutil.rmtree(out_path)
        else:
            logger.error("Output path exists: %r", out_path)
            sys.exit(1)

    compile_options = dict(
        (key.replace('-', '_'), int(args['--' + key]))
        for key in ('min-ending-freq', 'min-paradigm-popularity', 'max-suffix-length')
    )
    compile_options["paradigm_prefixes"] = lang.PARADIGM_PREFIXES

    opencorpora_dict.convert_to_pymorphy2(
        opencorpora_dict_path=dict_xml,
        out_path=out_path,
        source_name=args['--source-name'],
        language_code=args['--lang'],
        compile_options=compile_options,
    )

    if args["--corpus"]:
        add_conditional_tag_probability(
            corpus_filename=args["--corpus"],
            out_path=out_path,
            min_word_freq=int(args['--min-word-freq']),
            logger=logger,
        )
        rev = get_corpus_revision(args["--corpus"])
        meta_filename = os.path.join(out_path, "meta.json")
        update_meta(meta_filename, {"corpus_revision": rev})
Пример #4
0
            shutil.rmtree(out_path)
        else:
            logger.error("Output path exists: %r", out_path)
            sys.exit(1)

    compile_options = dict(
        (key.replace('-', '_'), int(args['--' + key]))
        for key in ('min-ending-freq', 'min-paradigm-popularity', 'max-suffix-length')
    )
    # compile_options["paradigm_prefixes"] = lang.PARADIGM_PREFIXES
    compile_options["paradigm_prefixes"] = ['', 'naj']  # not sure, maybe just ['']?

    opencorpora_dict.convert_to_pymorphy2(
        opencorpora_dict_path=dict_xml,
        out_path=out_path,
        source_name=args['--source-name'],
        language_code=args['--lang'],
        compile_options=compile_options,
    )

    if args["--corpus"]:
        add_conditional_tag_probability(
            corpus_filename=args["--corpus"],
            out_path=out_path,
            min_word_freq=int(args['--min-word-freq']),
            logger=logger,
        )
        rev = get_corpus_revision(args["--corpus"])
        meta_filename = os.path.join(out_path, "meta.json")
        update_meta(meta_filename, {"corpus_revision": rev})