def score_lm(args): using_nbest = args.nbest_list is not None pre_gen, left_to_right_preprocessed_dir, right_to_left_preprocessed_dir, \ backwards_preprocessed_dir, lm_preprocessed_dir = \ rerank_utils.get_directories(args.data_dir_name, args.num_rescore, args.gen_subset, args.gen_model_name, args.shard_id, args.num_shards, args.sampling, args.prefix_len, args.target_prefix_frac, args.source_prefix_frac) predictions_bpe_file = pre_gen+"/generate_output_bpe.txt" if using_nbest: print("Using predefined n-best list from interactive.py") predictions_bpe_file = args.nbest_list gen_output = rerank_utils.BitextOutputFromGen(predictions_bpe_file, bpe_symbol=args.bitext_remove_bpe, nbest=using_nbest) if args.language_model is not None: lm_score_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.lm_name, lm_file=True) if args.language_model is not None and not os.path.isfile(lm_score_file): print("STEP 4.5: language modeling for P(T)") if args.lm_bpe_code is None: bpe_status = "no bpe" elif args.lm_bpe_code == "shared": bpe_status = "shared" else: bpe_status = "different" rerank_utils.lm_scoring(lm_preprocessed_dir, bpe_status, gen_output, pre_gen, args.lm_dict, args.lm_name, args.language_model, args.lm_bpe_code, 128, lm_score_file, args.target_lang, args.source_lang, prefix_len=args.prefix_len)
def score_bw(args): if args.backwards1: scorer1_src = args.target_lang scorer1_tgt = args.source_lang else: scorer1_src = args.source_lang scorer1_tgt = args.target_lang if args.score_model2 is not None: if args.backwards2: scorer2_src = args.target_lang scorer2_tgt = args.source_lang else: scorer2_src = args.source_lang scorer2_tgt = args.target_lang rerank1_is_gen = args.gen_model == args.score_model1 and args.source_prefix_frac is None rerank2_is_gen = args.gen_model == args.score_model2 and args.source_prefix_frac is None pre_gen, left_to_right_preprocessed_dir, right_to_left_preprocessed_dir, \ backwards_preprocessed_dir, lm_preprocessed_dir = \ rerank_utils.get_directories(args.data_dir_name, args.num_rescore, args.gen_subset, args.gen_model_name, args.shard_id, args.num_shards, args.sampling, args.prefix_len, args.target_prefix_frac, args.source_prefix_frac) score1_file = rerank_utils.rescore_file_name( pre_gen, args.prefix_len, args.model1_name, target_prefix_frac=args.target_prefix_frac, source_prefix_frac=args.source_prefix_frac, backwards=args.backwards1) if args.score_model2 is not None: score2_file = rerank_utils.rescore_file_name( pre_gen, args.prefix_len, args.model2_name, target_prefix_frac=args.target_prefix_frac, source_prefix_frac=args.source_prefix_frac, backwards=args.backwards2) if args.right_to_left1: rerank_data1 = right_to_left_preprocessed_dir elif args.backwards1: rerank_data1 = backwards_preprocessed_dir else: rerank_data1 = left_to_right_preprocessed_dir gen_param = [ "--batch-size", str(128), "--score-reference", "--gen-subset", "train" ] if not rerank1_is_gen and not os.path.isfile(score1_file): print("STEP 4: score the translations for model 1") model_param1 = [ "--path", args.score_model1, "--source-lang", scorer1_src, "--target-lang", scorer1_tgt ] gen_model1_param = [rerank_data1] + gen_param + model_param1 gen_parser = options.get_generation_parser() input_args = options.parse_args_and_arch(gen_parser, gen_model1_param) with open(score1_file, 'w') as f: with redirect_stdout(f): generate.main(input_args) if args.score_model2 is not None and not os.path.isfile( score2_file) and not rerank2_is_gen: print("STEP 4: score the translations for model 2") if args.right_to_left2: rerank_data2 = right_to_left_preprocessed_dir elif args.backwards2: rerank_data2 = backwards_preprocessed_dir else: rerank_data2 = left_to_right_preprocessed_dir model_param2 = [ "--path", args.score_model2, "--source-lang", scorer2_src, "--target-lang", scorer2_tgt ] gen_model2_param = [rerank_data2] + gen_param + model_param2 gen_parser = options.get_generation_parser() input_args = options.parse_args_and_arch(gen_parser, gen_model2_param) with open(score2_file, 'w') as f: with redirect_stdout(f): generate.main(input_args)
def load_score_files(args): if args.all_shards: shard_ids = list(range(args.num_shards)) else: shard_ids = [args.shard_id] gen_output_lst = [] bitext1_lst = [] bitext2_lst = [] lm_res1_lst = [] for shard_id in shard_ids: using_nbest = args.nbest_list is not None pre_gen, left_to_right_preprocessed_dir, right_to_left_preprocessed_dir, \ backwards_preprocessed_dir, lm_preprocessed_dir = \ rerank_utils.get_directories(args.data_dir_name, args.num_rescore, args.gen_subset, args.gen_model_name, shard_id, args.num_shards, args.sampling, args.prefix_len, args.target_prefix_frac, args.source_prefix_frac) rerank1_is_gen = args.gen_model == args.score_model1 and args.source_prefix_frac is None rerank2_is_gen = args.gen_model == args.score_model2 and args.source_prefix_frac is None score1_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.model1_name, target_prefix_frac=args.target_prefix_frac, source_prefix_frac=args.source_prefix_frac, backwards=args.backwards1) if args.score_model2 is not None: score2_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.model2_name, target_prefix_frac=args.target_prefix_frac, source_prefix_frac=args.source_prefix_frac, backwards=args.backwards2) if args.language_model is not None: lm_score_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.lm_name, lm_file=True) # get gen output predictions_bpe_file = pre_gen+"/generate_output_bpe.txt" if using_nbest: print("Using predefined n-best list from interactive.py") predictions_bpe_file = args.nbest_list gen_output = rerank_utils.BitextOutputFromGen(predictions_bpe_file, bpe_symbol=args.remove_bpe, nbest=using_nbest, prefix_len=args.prefix_len, target_prefix_frac=args.target_prefix_frac) if rerank1_is_gen: bitext1 = gen_output else: bitext1 = rerank_utils.BitextOutput(score1_file, args.backwards1, args.right_to_left1, args.remove_bpe, args.prefix_len, args.target_prefix_frac, args.source_prefix_frac) if args.score_model2 is not None or args.nbest_list is not None: if rerank2_is_gen: bitext2 = gen_output else: bitext2 = rerank_utils.BitextOutput(score2_file, args.backwards2, args.right_to_left2, args.remove_bpe, args.prefix_len, args.target_prefix_frac, args.source_prefix_frac) assert bitext2.source_lengths == bitext1.source_lengths, \ "source lengths for rescoring models do not match" assert bitext2.target_lengths == bitext1.target_lengths, \ "target lengths for rescoring models do not match" else: if args.diff_bpe: assert args.score_model2 is None bitext2 = gen_output else: bitext2 = None if args.language_model is not None: lm_res1 = rerank_utils.LMOutput(lm_score_file, args.lm_dict, args.prefix_len, args.remove_bpe, args.target_prefix_frac) else: lm_res1 = None gen_output_lst.append(gen_output) bitext1_lst.append(bitext1) bitext2_lst.append(bitext2) lm_res1_lst.append(lm_res1) return gen_output_lst, bitext1_lst, bitext2_lst, lm_res1_lst
def gen_and_reprocess_nbest(args): if args.score_dict_dir is None: args.score_dict_dir = args.data if args.prefix_len is not None: assert args.right_to_left1 is False, "prefix length not compatible with right to left models" assert args.right_to_left2 is False, "prefix length not compatible with right to left models" if args.nbest_list is not None: assert args.score_model2 is None if args.backwards1: scorer1_src = args.target_lang scorer1_tgt = args.source_lang else: scorer1_src = args.source_lang scorer1_tgt = args.target_lang store_data = os.path.join( os.path.dirname(__file__)) + "/rerank_data/" + args.data_dir_name if not os.path.exists(store_data): os.makedirs(store_data) pre_gen, left_to_right_preprocessed_dir, right_to_left_preprocessed_dir, \ backwards_preprocessed_dir, lm_preprocessed_dir = \ rerank_utils.get_directories(args.data_dir_name, args.num_rescore, args.gen_subset, args.gen_model_name, args.shard_id, args.num_shards, args.sampling, args.prefix_len, args.target_prefix_frac, args.source_prefix_frac) assert not (args.right_to_left1 and args.backwards1), "backwards right to left not supported" assert not (args.right_to_left2 and args.backwards2), "backwards right to left not supported" assert not (args.prefix_len is not None and args.target_prefix_frac is not None), \ "target prefix frac and target prefix len incompatible" # make directory to store generation results if not os.path.exists(pre_gen): os.makedirs(pre_gen) rerank1_is_gen = args.gen_model == args.score_model1 and args.source_prefix_frac is None rerank2_is_gen = args.gen_model == args.score_model2 and args.source_prefix_frac is None if args.nbest_list is not None: rerank2_is_gen = True # make directories to store preprossed nbest list for reranking if not os.path.exists(left_to_right_preprocessed_dir): os.makedirs(left_to_right_preprocessed_dir) if not os.path.exists(right_to_left_preprocessed_dir): os.makedirs(right_to_left_preprocessed_dir) if not os.path.exists(lm_preprocessed_dir): os.makedirs(lm_preprocessed_dir) if not os.path.exists(backwards_preprocessed_dir): os.makedirs(backwards_preprocessed_dir) score1_file = rerank_utils.rescore_file_name( pre_gen, args.prefix_len, args.model1_name, target_prefix_frac=args.target_prefix_frac, source_prefix_frac=args.source_prefix_frac, backwards=args.backwards1) if args.score_model2 is not None: score2_file = rerank_utils.rescore_file_name( pre_gen, args.prefix_len, args.model2_name, target_prefix_frac=args.target_prefix_frac, source_prefix_frac=args.source_prefix_frac, backwards=args.backwards2) predictions_bpe_file = pre_gen + "/generate_output_bpe.txt" using_nbest = args.nbest_list is not None if using_nbest: print("Using predefined n-best list from interactive.py") predictions_bpe_file = args.nbest_list else: if not os.path.isfile(predictions_bpe_file): print( "STEP 1: generate predictions using the p(T|S) model with bpe") print(args.data) param1 = [ args.data, "--path", args.gen_model, "--shard-id", str(args.shard_id), "--num-shards", str(args.num_shards), "--nbest", str(args.num_rescore), "--batch-size", str(args.batch_size), "--beam", str(args.num_rescore), "--max-sentences", str(args.num_rescore), "--gen-subset", args.gen_subset, "--source-lang", args.source_lang, "--target-lang", args.target_lang ] if args.sampling: param1 += ["--sampling"] gen_parser = options.get_generation_parser() input_args = options.parse_args_and_arch(gen_parser, param1) print(input_args) with open(predictions_bpe_file, 'w') as f: with redirect_stdout(f): generate.main(input_args) gen_output = rerank_utils.BitextOutputFromGen( predictions_bpe_file, bpe_symbol=args.remove_bpe, nbest=using_nbest, prefix_len=args.prefix_len, target_prefix_frac=args.target_prefix_frac) if args.diff_bpe: rerank_utils.write_reprocessed( gen_output.no_bpe_source, gen_output.no_bpe_hypo, gen_output.no_bpe_target, pre_gen + "/source_gen_bpe." + args.source_lang, pre_gen + "/target_gen_bpe." + args.target_lang, pre_gen + "/reference_gen_bpe." + args.target_lang) bitext_bpe = args.rescore_bpe_code bpe_src_param = [ "-c", bitext_bpe, "--input", pre_gen + "/source_gen_bpe." + args.source_lang, "--output", pre_gen + "/rescore_data." + args.source_lang ] bpe_tgt_param = [ "-c", bitext_bpe, "--input", pre_gen + "/target_gen_bpe." + args.target_lang, "--output", pre_gen + "/rescore_data." + args.target_lang ] subprocess.call([ "python", os.path.join(os.path.dirname(__file__), "subword-nmt/subword_nmt/apply_bpe.py") ] + bpe_src_param, shell=False) subprocess.call([ "python", os.path.join(os.path.dirname(__file__), "subword-nmt/subword_nmt/apply_bpe.py") ] + bpe_tgt_param, shell=False) if (not os.path.isfile(score1_file) and not rerank1_is_gen) or \ (args.score_model2 is not None and not os.path.isfile(score2_file) and not rerank2_is_gen): print( "STEP 2: process the output of generate.py so we have clean text files with the translations" ) rescore_file = "/rescore_data" if args.prefix_len is not None: prefix_len_rescore_file = rescore_file + "prefix" + str( args.prefix_len) if args.target_prefix_frac is not None: target_prefix_frac_rescore_file = rescore_file + "target_prefix_frac" + str( args.target_prefix_frac) if args.source_prefix_frac is not None: source_prefix_frac_rescore_file = rescore_file + "source_prefix_frac" + str( args.source_prefix_frac) if not args.right_to_left1 or not args.right_to_left2: if not args.diff_bpe: rerank_utils.write_reprocessed( gen_output.source, gen_output.hypo, gen_output.target, pre_gen + rescore_file + "." + args.source_lang, pre_gen + rescore_file + "." + args.target_lang, pre_gen + "/reference_file", bpe_symbol=args.remove_bpe) if args.prefix_len is not None: bw_rescore_file = prefix_len_rescore_file rerank_utils.write_reprocessed( gen_output.source, gen_output.hypo, gen_output.target, pre_gen + prefix_len_rescore_file + "." + args.source_lang, pre_gen + prefix_len_rescore_file + "." + args.target_lang, pre_gen + "/reference_file", prefix_len=args.prefix_len, bpe_symbol=args.remove_bpe) elif args.target_prefix_frac is not None: bw_rescore_file = target_prefix_frac_rescore_file rerank_utils.write_reprocessed( gen_output.source, gen_output.hypo, gen_output.target, pre_gen + target_prefix_frac_rescore_file + "." + args.source_lang, pre_gen + target_prefix_frac_rescore_file + "." + args.target_lang, pre_gen + "/reference_file", bpe_symbol=args.remove_bpe, target_prefix_frac=args.target_prefix_frac) else: bw_rescore_file = rescore_file if args.source_prefix_frac is not None: fw_rescore_file = source_prefix_frac_rescore_file rerank_utils.write_reprocessed( gen_output.source, gen_output.hypo, gen_output.target, pre_gen + source_prefix_frac_rescore_file + "." + args.source_lang, pre_gen + source_prefix_frac_rescore_file + "." + args.target_lang, pre_gen + "/reference_file", bpe_symbol=args.remove_bpe, source_prefix_frac=args.source_prefix_frac) else: fw_rescore_file = rescore_file if args.right_to_left1 or args.right_to_left2: rerank_utils.write_reprocessed( gen_output.source, gen_output.hypo, gen_output.target, pre_gen + "/right_to_left_rescore_data." + args.source_lang, pre_gen + "/right_to_left_rescore_data." + args.target_lang, pre_gen + "/right_to_left_reference_file", right_to_left=True, bpe_symbol=args.remove_bpe) print("STEP 3: binarize the translations") if not args.right_to_left1 or args.score_model2 is not None and not args.right_to_left2 or not rerank1_is_gen: if args.backwards1 or args.backwards2: if args.backwards_score_dict_dir is not None: bw_dict = args.backwards_score_dict_dir else: bw_dict = args.score_dict_dir bw_preprocess_param = [ "--source-lang", scorer1_src, "--target-lang", scorer1_tgt, "--trainpref", pre_gen + bw_rescore_file, "--srcdict", bw_dict + "/dict." + scorer1_src + ".txt", "--tgtdict", bw_dict + "/dict." + scorer1_tgt + ".txt", "--destdir", backwards_preprocessed_dir ] preprocess_parser = options.get_preprocessing_parser() input_args = preprocess_parser.parse_args(bw_preprocess_param) preprocess.main(input_args) preprocess_param = [ "--source-lang", scorer1_src, "--target-lang", scorer1_tgt, "--trainpref", pre_gen + fw_rescore_file, "--srcdict", args.score_dict_dir + "/dict." + scorer1_src + ".txt", "--tgtdict", args.score_dict_dir + "/dict." + scorer1_tgt + ".txt", "--destdir", left_to_right_preprocessed_dir ] preprocess_parser = options.get_preprocessing_parser() input_args = preprocess_parser.parse_args(preprocess_param) preprocess.main(input_args) if args.right_to_left1 or args.right_to_left2: preprocess_param = [ "--source-lang", scorer1_src, "--target-lang", scorer1_tgt, "--trainpref", pre_gen + "/right_to_left_rescore_data", "--srcdict", args.score_dict_dir + "/dict." + scorer1_src + ".txt", "--tgtdict", args.score_dict_dir + "/dict." + scorer1_tgt + ".txt", "--destdir", right_to_left_preprocessed_dir ] preprocess_parser = options.get_preprocessing_parser() input_args = preprocess_parser.parse_args(preprocess_param) preprocess.main(input_args) return gen_output