def score_lm(args):
    using_nbest = args.nbest_list is not None
    pre_gen, left_to_right_preprocessed_dir, right_to_left_preprocessed_dir, \
        backwards_preprocessed_dir, lm_preprocessed_dir = \
        rerank_utils.get_directories(args.data_dir_name, args.num_rescore, args.gen_subset,
                                     args.gen_model_name, args.shard_id, args.num_shards,
                                     args.sampling, args.prefix_len, args.target_prefix_frac,
                                     args.source_prefix_frac)

    predictions_bpe_file = pre_gen+"/generate_output_bpe.txt"
    if using_nbest:
        print("Using predefined n-best list from interactive.py")
        predictions_bpe_file = args.nbest_list

    gen_output = rerank_utils.BitextOutputFromGen(predictions_bpe_file, bpe_symbol=args.bitext_remove_bpe, nbest=using_nbest)

    if args.language_model is not None:
        lm_score_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.lm_name, lm_file=True)

    if args.language_model is not None and not os.path.isfile(lm_score_file):
        print("STEP 4.5: language modeling for P(T)")
        if args.lm_bpe_code is None:
            bpe_status = "no bpe"
        elif args.lm_bpe_code == "shared":
            bpe_status = "shared"
        else:
            bpe_status = "different"

        rerank_utils.lm_scoring(lm_preprocessed_dir, bpe_status, gen_output, pre_gen,
                                args.lm_dict, args.lm_name, args.language_model,
                                args.lm_bpe_code, 128, lm_score_file, args.target_lang,
                                args.source_lang, prefix_len=args.prefix_len)
示例#2
0
def rerank(args):
    if type(args.lenpen) is not list:
        args.lenpen = [args.lenpen]
    if type(args.weight1) is not list:
        args.weight1 = [args.weight1]
    if type(args.weight2) is not list:
        args.weight2 = [args.weight2]
    if type(args.weight3) is not list:
        args.weight3 = [args.weight3]
    if args.all_shards:
        shard_ids = list(range(args.num_shards))
    else:
        shard_ids = [args.shard_id]

    for shard_id in shard_ids:
        (
            pre_gen,
            left_to_right_preprocessed_dir,
            right_to_left_preprocessed_dir,
            backwards_preprocessed_dir,
            lm_preprocessed_dir,
        ) = rerank_utils.get_directories(
            args.data_dir_name,
            args.num_rescore,
            args.gen_subset,
            args.gen_model_name,
            shard_id,
            args.num_shards,
            args.sampling,
            args.prefix_len,
            args.target_prefix_frac,
            args.source_prefix_frac,
        )
        rerank_generate.gen_and_reprocess_nbest(args)
        rerank_score_bw.score_bw(args)
        rerank_score_lm.score_lm(args)

        if args.write_hypos is None:
            write_targets = pre_gen + "/matched_targets"
            write_hypos = pre_gen + "/matched_hypos"
        else:
            write_targets = args.write_hypos + "_targets" + args.gen_subset
            write_hypos = args.write_hypos + "_hypos" + args.gen_subset

    if args.all_shards:
        write_targets += "_all_shards"
        write_hypos += "_all_shards"

    (
        best_lenpen,
        best_weight1,
        best_weight2,
        best_weight3,
        best_score,
    ) = match_target_hypo(args, write_targets, write_hypos)

    return best_lenpen, best_weight1, best_weight2, best_weight3, best_score
示例#3
0
def score_bw(args):
    if args.backwards1:
        scorer1_src = args.target_lang
        scorer1_tgt = args.source_lang
    else:
        scorer1_src = args.source_lang
        scorer1_tgt = args.target_lang

    if args.score_model2 is not None:
        if args.backwards2:
            scorer2_src = args.target_lang
            scorer2_tgt = args.source_lang
        else:
            scorer2_src = args.source_lang
            scorer2_tgt = args.target_lang

    rerank1_is_gen = args.gen_model == args.score_model1 and args.source_prefix_frac is None
    rerank2_is_gen = args.gen_model == args.score_model2 and args.source_prefix_frac is None

    pre_gen, left_to_right_preprocessed_dir, right_to_left_preprocessed_dir, \
        backwards_preprocessed_dir, lm_preprocessed_dir = \
        rerank_utils.get_directories(args.data_dir_name, args.num_rescore, args.gen_subset,
                                     args.gen_model_name, args.shard_id, args.num_shards,
                                     args.sampling, args.prefix_len, args.target_prefix_frac,
                                     args.source_prefix_frac)

    score1_file = rerank_utils.rescore_file_name(
        pre_gen,
        args.prefix_len,
        args.model1_name,
        target_prefix_frac=args.target_prefix_frac,
        source_prefix_frac=args.source_prefix_frac,
        backwards=args.backwards1)

    if args.score_model2 is not None:
        score2_file = rerank_utils.rescore_file_name(
            pre_gen,
            args.prefix_len,
            args.model2_name,
            target_prefix_frac=args.target_prefix_frac,
            source_prefix_frac=args.source_prefix_frac,
            backwards=args.backwards2)

    if args.right_to_left1:
        rerank_data1 = right_to_left_preprocessed_dir
    elif args.backwards1:
        rerank_data1 = backwards_preprocessed_dir
    else:
        rerank_data1 = left_to_right_preprocessed_dir

    gen_param = [
        "--batch-size",
        str(128), "--score-reference", "--gen-subset", "train"
    ]
    if not rerank1_is_gen and not os.path.isfile(score1_file):
        print("STEP 4: score the translations for model 1")

        model_param1 = [
            "--path", args.score_model1, "--source-lang", scorer1_src,
            "--target-lang", scorer1_tgt
        ]
        gen_model1_param = [rerank_data1] + gen_param + model_param1

        gen_parser = options.get_generation_parser()
        input_args = options.parse_args_and_arch(gen_parser, gen_model1_param)

        with open(score1_file, 'w') as f:
            with redirect_stdout(f):
                generate.main(input_args)

    if args.score_model2 is not None and not os.path.isfile(
            score2_file) and not rerank2_is_gen:
        print("STEP 4: score the translations for model 2")

        if args.right_to_left2:
            rerank_data2 = right_to_left_preprocessed_dir
        elif args.backwards2:
            rerank_data2 = backwards_preprocessed_dir
        else:
            rerank_data2 = left_to_right_preprocessed_dir

        model_param2 = [
            "--path", args.score_model2, "--source-lang", scorer2_src,
            "--target-lang", scorer2_tgt
        ]
        gen_model2_param = [rerank_data2] + gen_param + model_param2

        gen_parser = options.get_generation_parser()
        input_args = options.parse_args_and_arch(gen_parser, gen_model2_param)

        with open(score2_file, 'w') as f:
            with redirect_stdout(f):
                generate.main(input_args)
示例#4
0
def load_score_files(args):
    if args.all_shards:
        shard_ids = list(range(args.num_shards))
    else:
        shard_ids = [args.shard_id]

    gen_output_lst = []
    bitext1_lst = []
    bitext2_lst = []
    lm_res1_lst = []

    for shard_id in shard_ids:
        using_nbest = args.nbest_list is not None
        pre_gen, left_to_right_preprocessed_dir, right_to_left_preprocessed_dir, \
            backwards_preprocessed_dir, lm_preprocessed_dir = \
            rerank_utils.get_directories(args.data_dir_name, args.num_rescore, args.gen_subset,
                                         args.gen_model_name, shard_id, args.num_shards, args.sampling,
                                         args.prefix_len, args.target_prefix_frac, args.source_prefix_frac)

        rerank1_is_gen = args.gen_model == args.score_model1 and args.source_prefix_frac is None
        rerank2_is_gen = args.gen_model == args.score_model2 and args.source_prefix_frac is None

        score1_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.model1_name,
                                                     target_prefix_frac=args.target_prefix_frac,
                                                     source_prefix_frac=args.source_prefix_frac,
                                                     backwards=args.backwards1)
        if args.score_model2 is not None:
            score2_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.model2_name,
                                                         target_prefix_frac=args.target_prefix_frac,
                                                         source_prefix_frac=args.source_prefix_frac,
                                                         backwards=args.backwards2)
        if args.language_model is not None:
            lm_score_file = rerank_utils.rescore_file_name(pre_gen, args.prefix_len, args.lm_name, lm_file=True)

        # get gen output
        predictions_bpe_file = pre_gen+"/generate_output_bpe.txt"
        if using_nbest:
            print("Using predefined n-best list from interactive.py")
            predictions_bpe_file = args.nbest_list
        gen_output = rerank_utils.BitextOutputFromGen(predictions_bpe_file, bpe_symbol=args.remove_bpe,
                                                      nbest=using_nbest, prefix_len=args.prefix_len,
                                                      target_prefix_frac=args.target_prefix_frac)

        if rerank1_is_gen:
            bitext1 = gen_output
        else:
            bitext1 = rerank_utils.BitextOutput(score1_file, args.backwards1, args.right_to_left1,
                                                args.remove_bpe, args.prefix_len, args.target_prefix_frac,
                                                args.source_prefix_frac)

        if args.score_model2 is not None or args.nbest_list is not None:
            if rerank2_is_gen:
                bitext2 = gen_output
            else:
                bitext2 = rerank_utils.BitextOutput(score2_file, args.backwards2, args.right_to_left2,
                                                    args.remove_bpe, args.prefix_len, args.target_prefix_frac,
                                                    args.source_prefix_frac)

                assert bitext2.source_lengths == bitext1.source_lengths, \
                    "source lengths for rescoring models do not match"
                assert bitext2.target_lengths == bitext1.target_lengths, \
                    "target lengths for rescoring models do not match"
        else:
            if args.diff_bpe:
                assert args.score_model2 is None
                bitext2 = gen_output
            else:
                bitext2 = None

        if args.language_model is not None:
            lm_res1 = rerank_utils.LMOutput(lm_score_file, args.lm_dict, args.prefix_len,
                                            args.remove_bpe, args.target_prefix_frac)
        else:
            lm_res1 = None

        gen_output_lst.append(gen_output)
        bitext1_lst.append(bitext1)
        bitext2_lst.append(bitext2)
        lm_res1_lst.append(lm_res1)
    return gen_output_lst, bitext1_lst, bitext2_lst, lm_res1_lst
示例#5
0
def gen_and_reprocess_nbest(args):
    if args.score_dict_dir is None:
        args.score_dict_dir = args.data
    if args.prefix_len is not None:
        assert args.right_to_left1 is False, "prefix length not compatible with right to left models"
        assert args.right_to_left2 is False, "prefix length not compatible with right to left models"

    if args.nbest_list is not None:
        assert args.score_model2 is None

    if args.backwards1:
        scorer1_src = args.target_lang
        scorer1_tgt = args.source_lang
    else:
        scorer1_src = args.source_lang
        scorer1_tgt = args.target_lang

    store_data = os.path.join(
        os.path.dirname(__file__)) + "/rerank_data/" + args.data_dir_name
    if not os.path.exists(store_data):
        os.makedirs(store_data)

    pre_gen, left_to_right_preprocessed_dir, right_to_left_preprocessed_dir, \
        backwards_preprocessed_dir, lm_preprocessed_dir = \
        rerank_utils.get_directories(args.data_dir_name, args.num_rescore, args.gen_subset,
                                     args.gen_model_name, args.shard_id, args.num_shards,
                                     args.sampling, args.prefix_len, args.target_prefix_frac,
                                     args.source_prefix_frac)
    assert not (args.right_to_left1
                and args.backwards1), "backwards right to left not supported"
    assert not (args.right_to_left2
                and args.backwards2), "backwards right to left not supported"
    assert not (args.prefix_len is not None and args.target_prefix_frac is not None), \
        "target prefix frac and target prefix len incompatible"

    # make directory to store generation results
    if not os.path.exists(pre_gen):
        os.makedirs(pre_gen)

    rerank1_is_gen = args.gen_model == args.score_model1 and args.source_prefix_frac is None
    rerank2_is_gen = args.gen_model == args.score_model2 and args.source_prefix_frac is None

    if args.nbest_list is not None:
        rerank2_is_gen = True

    # make directories to store preprossed nbest list for reranking
    if not os.path.exists(left_to_right_preprocessed_dir):
        os.makedirs(left_to_right_preprocessed_dir)
    if not os.path.exists(right_to_left_preprocessed_dir):
        os.makedirs(right_to_left_preprocessed_dir)
    if not os.path.exists(lm_preprocessed_dir):
        os.makedirs(lm_preprocessed_dir)
    if not os.path.exists(backwards_preprocessed_dir):
        os.makedirs(backwards_preprocessed_dir)

    score1_file = rerank_utils.rescore_file_name(
        pre_gen,
        args.prefix_len,
        args.model1_name,
        target_prefix_frac=args.target_prefix_frac,
        source_prefix_frac=args.source_prefix_frac,
        backwards=args.backwards1)
    if args.score_model2 is not None:
        score2_file = rerank_utils.rescore_file_name(
            pre_gen,
            args.prefix_len,
            args.model2_name,
            target_prefix_frac=args.target_prefix_frac,
            source_prefix_frac=args.source_prefix_frac,
            backwards=args.backwards2)

    predictions_bpe_file = pre_gen + "/generate_output_bpe.txt"

    using_nbest = args.nbest_list is not None

    if using_nbest:
        print("Using predefined n-best list from interactive.py")
        predictions_bpe_file = args.nbest_list

    else:
        if not os.path.isfile(predictions_bpe_file):
            print(
                "STEP 1: generate predictions using the p(T|S) model with bpe")
            print(args.data)
            param1 = [
                args.data, "--path", args.gen_model, "--shard-id",
                str(args.shard_id), "--num-shards",
                str(args.num_shards), "--nbest",
                str(args.num_rescore), "--batch-size",
                str(args.batch_size), "--beam",
                str(args.num_rescore), "--max-sentences",
                str(args.num_rescore), "--gen-subset", args.gen_subset,
                "--source-lang", args.source_lang, "--target-lang",
                args.target_lang
            ]
            if args.sampling:
                param1 += ["--sampling"]

            gen_parser = options.get_generation_parser()
            input_args = options.parse_args_and_arch(gen_parser, param1)

            print(input_args)
            with open(predictions_bpe_file, 'w') as f:
                with redirect_stdout(f):
                    generate.main(input_args)

    gen_output = rerank_utils.BitextOutputFromGen(
        predictions_bpe_file,
        bpe_symbol=args.remove_bpe,
        nbest=using_nbest,
        prefix_len=args.prefix_len,
        target_prefix_frac=args.target_prefix_frac)

    if args.diff_bpe:
        rerank_utils.write_reprocessed(
            gen_output.no_bpe_source, gen_output.no_bpe_hypo,
            gen_output.no_bpe_target,
            pre_gen + "/source_gen_bpe." + args.source_lang,
            pre_gen + "/target_gen_bpe." + args.target_lang,
            pre_gen + "/reference_gen_bpe." + args.target_lang)
        bitext_bpe = args.rescore_bpe_code
        bpe_src_param = [
            "-c", bitext_bpe, "--input",
            pre_gen + "/source_gen_bpe." + args.source_lang, "--output",
            pre_gen + "/rescore_data." + args.source_lang
        ]
        bpe_tgt_param = [
            "-c", bitext_bpe, "--input",
            pre_gen + "/target_gen_bpe." + args.target_lang, "--output",
            pre_gen + "/rescore_data." + args.target_lang
        ]

        subprocess.call([
            "python",
            os.path.join(os.path.dirname(__file__),
                         "subword-nmt/subword_nmt/apply_bpe.py")
        ] + bpe_src_param,
                        shell=False)

        subprocess.call([
            "python",
            os.path.join(os.path.dirname(__file__),
                         "subword-nmt/subword_nmt/apply_bpe.py")
        ] + bpe_tgt_param,
                        shell=False)

    if (not os.path.isfile(score1_file) and not rerank1_is_gen) or \
            (args.score_model2 is not None and not os.path.isfile(score2_file) and not rerank2_is_gen):
        print(
            "STEP 2: process the output of generate.py so we have clean text files with the translations"
        )

        rescore_file = "/rescore_data"
        if args.prefix_len is not None:
            prefix_len_rescore_file = rescore_file + "prefix" + str(
                args.prefix_len)
        if args.target_prefix_frac is not None:
            target_prefix_frac_rescore_file = rescore_file + "target_prefix_frac" + str(
                args.target_prefix_frac)
        if args.source_prefix_frac is not None:
            source_prefix_frac_rescore_file = rescore_file + "source_prefix_frac" + str(
                args.source_prefix_frac)

        if not args.right_to_left1 or not args.right_to_left2:
            if not args.diff_bpe:
                rerank_utils.write_reprocessed(
                    gen_output.source,
                    gen_output.hypo,
                    gen_output.target,
                    pre_gen + rescore_file + "." + args.source_lang,
                    pre_gen + rescore_file + "." + args.target_lang,
                    pre_gen + "/reference_file",
                    bpe_symbol=args.remove_bpe)
                if args.prefix_len is not None:
                    bw_rescore_file = prefix_len_rescore_file
                    rerank_utils.write_reprocessed(
                        gen_output.source,
                        gen_output.hypo,
                        gen_output.target,
                        pre_gen + prefix_len_rescore_file + "." +
                        args.source_lang,
                        pre_gen + prefix_len_rescore_file + "." +
                        args.target_lang,
                        pre_gen + "/reference_file",
                        prefix_len=args.prefix_len,
                        bpe_symbol=args.remove_bpe)
                elif args.target_prefix_frac is not None:
                    bw_rescore_file = target_prefix_frac_rescore_file
                    rerank_utils.write_reprocessed(
                        gen_output.source,
                        gen_output.hypo,
                        gen_output.target,
                        pre_gen + target_prefix_frac_rescore_file + "." +
                        args.source_lang,
                        pre_gen + target_prefix_frac_rescore_file + "." +
                        args.target_lang,
                        pre_gen + "/reference_file",
                        bpe_symbol=args.remove_bpe,
                        target_prefix_frac=args.target_prefix_frac)
                else:
                    bw_rescore_file = rescore_file

                if args.source_prefix_frac is not None:
                    fw_rescore_file = source_prefix_frac_rescore_file
                    rerank_utils.write_reprocessed(
                        gen_output.source,
                        gen_output.hypo,
                        gen_output.target,
                        pre_gen + source_prefix_frac_rescore_file + "." +
                        args.source_lang,
                        pre_gen + source_prefix_frac_rescore_file + "." +
                        args.target_lang,
                        pre_gen + "/reference_file",
                        bpe_symbol=args.remove_bpe,
                        source_prefix_frac=args.source_prefix_frac)
                else:
                    fw_rescore_file = rescore_file

        if args.right_to_left1 or args.right_to_left2:
            rerank_utils.write_reprocessed(
                gen_output.source,
                gen_output.hypo,
                gen_output.target,
                pre_gen + "/right_to_left_rescore_data." + args.source_lang,
                pre_gen + "/right_to_left_rescore_data." + args.target_lang,
                pre_gen + "/right_to_left_reference_file",
                right_to_left=True,
                bpe_symbol=args.remove_bpe)

        print("STEP 3: binarize the translations")
        if not args.right_to_left1 or args.score_model2 is not None and not args.right_to_left2 or not rerank1_is_gen:

            if args.backwards1 or args.backwards2:
                if args.backwards_score_dict_dir is not None:
                    bw_dict = args.backwards_score_dict_dir
                else:
                    bw_dict = args.score_dict_dir
                bw_preprocess_param = [
                    "--source-lang", scorer1_src, "--target-lang", scorer1_tgt,
                    "--trainpref", pre_gen + bw_rescore_file, "--srcdict",
                    bw_dict + "/dict." + scorer1_src + ".txt", "--tgtdict",
                    bw_dict + "/dict." + scorer1_tgt + ".txt", "--destdir",
                    backwards_preprocessed_dir
                ]
                preprocess_parser = options.get_preprocessing_parser()
                input_args = preprocess_parser.parse_args(bw_preprocess_param)
                preprocess.main(input_args)

            preprocess_param = [
                "--source-lang", scorer1_src, "--target-lang", scorer1_tgt,
                "--trainpref", pre_gen + fw_rescore_file, "--srcdict",
                args.score_dict_dir + "/dict." + scorer1_src + ".txt",
                "--tgtdict",
                args.score_dict_dir + "/dict." + scorer1_tgt + ".txt",
                "--destdir", left_to_right_preprocessed_dir
            ]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

        if args.right_to_left1 or args.right_to_left2:
            preprocess_param = [
                "--source-lang", scorer1_src, "--target-lang", scorer1_tgt,
                "--trainpref", pre_gen + "/right_to_left_rescore_data",
                "--srcdict",
                args.score_dict_dir + "/dict." + scorer1_src + ".txt",
                "--tgtdict",
                args.score_dict_dir + "/dict." + scorer1_tgt + ".txt",
                "--destdir", right_to_left_preprocessed_dir
            ]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

    return gen_output