def eval_lm_main(data_dir): eval_lm_parser = options.get_eval_lm_parser() eval_lm_args = options.parse_args_and_arch( eval_lm_parser, [ data_dir, '--path', os.path.join(data_dir, 'checkpoint_last.pt'), '--no-progress-bar', ], ) eval_lm.main(eval_lm_args)
def lm_scoring( preprocess_directory, bpe_status, gen_output, pre_gen, cur_lm_dict, cur_lm_name, cur_language_model, cur_lm_bpe_code, batch_size, lm_score_file, target_lang, source_lang, prefix_len=None, ): if prefix_len is not None: assert (bpe_status == "different" ), "bpe status must be different to use prefix len" if bpe_status == "no bpe": # run lm on output without bpe write_reprocessed( gen_output.no_bpe_source, gen_output.no_bpe_hypo, gen_output.no_bpe_target, pre_gen + "/rescore_data_no_bpe.de", pre_gen + "/rescore_data_no_bpe.en", pre_gen + "/reference_file_no_bpe", ) preprocess_lm_param = [ "--only-source", "--trainpref", pre_gen + "/rescore_data_no_bpe." + target_lang, "--srcdict", cur_lm_dict, "--destdir", preprocess_directory, ] preprocess_parser = options.get_preprocessing_parser() input_args = preprocess_parser.parse_args(preprocess_lm_param) preprocess.main(input_args) eval_lm_param = [ preprocess_directory, "--path", cur_language_model, "--output-word-probs", "--batch-size", str(batch_size), "--max-tokens", "1024", "--sample-break-mode", "eos", "--gen-subset", "train", ] eval_lm_parser = options.get_eval_lm_parser() input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param) with open(lm_score_file, "w") as f: with redirect_stdout(f): eval_lm.main(input_args) elif bpe_status == "shared": preprocess_lm_param = [ "--only-source", "--trainpref", pre_gen + "/rescore_data." + target_lang, "--srcdict", cur_lm_dict, "--destdir", preprocess_directory, ] preprocess_parser = options.get_preprocessing_parser() input_args = preprocess_parser.parse_args(preprocess_lm_param) preprocess.main(input_args) eval_lm_param = [ preprocess_directory, "--path", cur_language_model, "--output-word-probs", "--batch-size", str(batch_size), "--sample-break-mode", "eos", "--gen-subset", "train", ] eval_lm_parser = options.get_eval_lm_parser() input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param) with open(lm_score_file, "w") as f: with redirect_stdout(f): eval_lm.main(input_args) elif bpe_status == "different": rescore_file = pre_gen + "/rescore_data_no_bpe" rescore_bpe = pre_gen + "/rescore_data_new_bpe" rescore_file += "." rescore_bpe += "." write_reprocessed( gen_output.no_bpe_source, gen_output.no_bpe_hypo, gen_output.no_bpe_target, rescore_file + source_lang, rescore_file + target_lang, pre_gen + "/reference_file_no_bpe", bpe_symbol=None, ) # apply LM bpe to nbest list bpe_src_param = [ "-c", cur_lm_bpe_code, "--input", rescore_file + target_lang, "--output", rescore_bpe + target_lang, ] subprocess.call( [ "python", os.path.join(os.path.dirname(__file__), "subword-nmt/subword_nmt/apply_bpe.py"), ] + bpe_src_param, shell=False, ) # uncomment to use fastbpe instead of subword-nmt bpe # bpe_src_param = [rescore_bpe+target_lang, rescore_file+target_lang, cur_lm_bpe_code] # subprocess.call(["/private/home/edunov/fastBPE/fast", "applybpe"] + bpe_src_param, shell=False) preprocess_dir = preprocess_directory preprocess_lm_param = [ "--only-source", "--trainpref", rescore_bpe + target_lang, "--srcdict", cur_lm_dict, "--destdir", preprocess_dir, ] preprocess_parser = options.get_preprocessing_parser() input_args = preprocess_parser.parse_args(preprocess_lm_param) preprocess.main(input_args) eval_lm_param = [ preprocess_dir, "--path", cur_language_model, "--output-word-probs", "--batch-size", str(batch_size), "--max-tokens", "1024", "--sample-break-mode", "eos", "--gen-subset", "train", ] eval_lm_parser = options.get_eval_lm_parser() input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param) with open(lm_score_file, "w") as f: with redirect_stdout(f): eval_lm.main(input_args)