def add_conditional_tag_probability(corpus_filename, out_path, min_word_freq,
                                    logger=None, morph=None):
    """
    Add P(t|w) estimates to a compiled dictionary.

    The estimates computed from ``corpus_filename`` are encoded as a DAWG,
    saved to ``<out_path>/p_t_given_w.intdawg``, and ``<out_path>/meta.json``
    is then updated with the ``P(t|w)*`` entries describing the result.

    :param corpus_filename: corpus to estimate the probabilities from; it is
        passed as-is to ``estimate_conditional_tag_probability``.
    :param out_path: directory of the compiled dictionary; the DAWG file is
        written there and ``meta.json`` is looked up there.
    :param min_word_freq: frequency threshold handed to ``build_cpd_dawg``
        and recorded in the meta file; anything accepted by ``int()``.
    :param logger: logger used for progress messages; defaults to
        ``logging.getLogger(__name__)``.
    :param morph: analyzer to use; when omitted, a ``MorphAnalyzer`` is
        created from ``out_path`` with ``probability_estimator_cls=None``.
    :raises ValueError, TypeError: if ``min_word_freq`` is not convertible
        to ``int``.
    :return: ``None``.
    """
    # Convert the threshold once, before any expensive work, so that a bad
    # value fails immediately instead of after the whole corpus has been
    # processed, and so that the DAWG and the meta entry always agree.
    min_word_freq = int(min_word_freq)

    if morph is None:
        # NOTE(review): the probability estimator is disabled here,
        # presumably because the P(t|w) data is only being created by this
        # function - confirm against MorphAnalyzer.
        morph = MorphAnalyzer(out_path, probability_estimator_cls=None)

    if logger is None:
        logger = logging.getLogger(__name__)

    # Pass the filename as a logging argument rather than pre-formatting it
    # with ``%``: formatting is deferred to the logging machinery and a
    # tuple-valued argument can no longer break the ``%`` operator.
    logger.info("Estimating P(t|w) from %s", corpus_filename)
    cpd, cfd = estimate_conditional_tag_probability(
        morph, corpus_filename, logger
    )

    logger.info("Encoding P(t|w) as DAWG")
    cpd_dawg = build_cpd_dawg(morph, cpd, min_word_freq)
    cpd_dawg.save(os.path.join(out_path, 'p_t_given_w.intdawg'))

    logger.info("Updating meta information")
    meta_filename = os.path.join(out_path, 'meta.json')
    update_meta(meta_filename, [
        ('P(t|w)', True),
        ('P(t|w)_unique_words', len(cpd.conditions())),
        ('P(t|w)_outcomes', cfd.N()),
        ('P(t|w)_min_word_freq', min_word_freq),
    ])
    logger.info('\nDone.')
if args['--clear']: shutil.rmtree(out_path) else: logger.error("Output path exists: %r", out_path) sys.exit(1) compile_options = dict( (key.replace('-', '_'), int(args['--' + key])) for key in ('min-ending-freq', 'min-paradigm-popularity', 'max-suffix-length') ) compile_options["paradigm_prefixes"] = lang.PARADIGM_PREFIXES opencorpora_dict.convert_to_pymorphy2( opencorpora_dict_path=dict_xml, out_path=out_path, source_name=args['--source-name'], language_code=args['--lang'], compile_options=compile_options, ) if args["--corpus"]: add_conditional_tag_probability( corpus_filename=args["--corpus"], out_path=out_path, min_word_freq=int(args['--min-word-freq']), logger=logger, ) rev = get_corpus_revision(args["--corpus"]) meta_filename = os.path.join(out_path, "meta.json") update_meta(meta_filename, {"corpus_revision": rev})
shutil.rmtree(out_path) else: logger.error("Output path exists: %r", out_path) sys.exit(1) compile_options = dict( (key.replace('-', '_'), int(args['--' + key])) for key in ('min-ending-freq', 'min-paradigm-popularity', 'max-suffix-length') ) # compile_options["paradigm_prefixes"] = lang.PARADIGM_PREFIXES compile_options["paradigm_prefixes"] = ['', 'naj'] # not sure, maybe just ['']? opencorpora_dict.convert_to_pymorphy2( opencorpora_dict_path=dict_xml, out_path=out_path, source_name=args['--source-name'], language_code=args['--lang'], compile_options=compile_options, ) if args["--corpus"]: add_conditional_tag_probability( corpus_filename=args["--corpus"], out_path=out_path, min_word_freq=int(args['--min-word-freq']), logger=logger, ) rev = get_corpus_revision(args["--corpus"]) meta_filename = os.path.join(out_path, "meta.json") update_meta(meta_filename, {"corpus_revision": rev})