Example #1
def cli_main():
    parser = options.get_preprocessing_parser()
    parser.add_argument("--label_schema", type=str, default=None)
    parser.add_argument("--nonterm_suffix", type=str, default=None)
    parser.add_argument("--term_suffix", type=str, default=None)
    args = parser.parse_args()
    main(args)
Example #2
def preprocess_summarization_data(data_dir, extra_flags=None):
    preprocess_parser = options.get_preprocessing_parser()
    preprocess_args = preprocess_parser.parse_args(
        [
            "--source-lang",
            "in",
            "--target-lang",
            "out",
            "--trainpref",
            os.path.join(data_dir, "train"),
            "--validpref",
            os.path.join(data_dir, "valid"),
            "--testpref",
            os.path.join(data_dir, "test"),
            "--thresholdtgt",
            "0",
            "--thresholdsrc",
            "0",
            "--joined-dictionary",
            "--destdir",
            data_dir,
        ]
        + (extra_flags or []),
    )
    preprocess.main(preprocess_args)
Example #3
File: preprocess.py Project: yydxhn/KEPLER
def cli_main():
    parser = options.get_preprocessing_parser()
    parser.add_argument('--bert',
                        action='store_true',
                        help='Use bert dictionary')
    args = parser.parse_args()
    main(args)
Example #4
def preprocess_lm_data(data_dir, languages=None):
    preprocess_parser = options.get_preprocessing_parser()
    if languages is None:
        preprocess_args = preprocess_parser.parse_args([
            "--only-source",
            "--trainpref",
            os.path.join(data_dir, "train.out"),
            "--validpref",
            os.path.join(data_dir, "valid.out"),
            "--testpref",
            os.path.join(data_dir, "test.out"),
            "--destdir",
            data_dir,
        ])
        preprocess.main(preprocess_args)
    else:
        for lang in languages:
            lang_dir = os.path.join(data_dir, lang)
            assert os.path.exists(lang_dir)
            preprocess_args = preprocess_parser.parse_args([
                "--only-source",
                "--trainpref",
                os.path.join(lang_dir, "train.out"),
                "--validpref",
                os.path.join(lang_dir, "valid.out"),
                "--testpref",
                os.path.join(lang_dir, "test.out"),
                "--destdir",
                lang_dir,
            ])
            preprocess.main(preprocess_args)
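        # surface the first language's dictionary at the top level of data_dir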
        shutil.copyfile(
            os.path.join(data_dir, languages[0], "dict.txt"),
            os.path.join(data_dir, "dict.txt"),
        )
Example #5
def preprocess_lm_data(data_dir):
    preprocess_parser = options.get_preprocessing_parser()
    preprocess_args = preprocess_parser.parse_args([
        '--only-source',
        '--trainpref', os.path.join(data_dir, 'train.out'),
        '--validpref', os.path.join(data_dir, 'valid.out'),
        '--testpref', os.path.join(data_dir, 'test.out'),
        '--destdir', data_dir,
    ])
    preprocess.main(preprocess_args)
Example #6
def __init__(self, pretrain="auxiliary_data/PhoBERT_base_fairseq"):
    self.phoBERT = RobertaModel.from_pretrained(pretrain,
                                                checkpoint_file='model.pt')
    self.phoBERT.eval()
    parser = options.get_preprocessing_parser()
    parser.add_argument('--bpe-codes',
                        type=str,
                        help='path to fastBPE BPE',
                        default=pretrain + "/bpe.codes")
    args, unknown = parser.parse_known_args()
    self.phoBERT.bpe = fastBPE(args)  # Incorporate the BPE encoder into PhoBERT
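The preprocessing parser here is only a vehicle for handing a --bpe-codes value to fairseq's fastBPE wrapper, which reads nothing else from the namespace. A minimal sketch of the same idea, assuming an older fairseq whose fastBPE accepts any object with a bpe_codes attribute (the path is a placeholder):

import argparse

from fairseq.data.encoders.fastbpe import fastBPE

# Build only the attribute fastBPE consumes; no full preprocessing parser needed.
args = argparse.Namespace(bpe_codes="auxiliary_data/PhoBERT_base_fairseq/bpe.codes")
bpe = fastBPE(args)
print(bpe.encode("xin chào"))  # prints the BPE-segmented string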
Example #7
def cli_main():
    parser = options.get_preprocessing_parser()

    # Modified
    parser.add_argument('--extra-features', nargs='*', help="List of files which have the same number of lines as the src and the tgt files. Each file contains extra labels including the information of the example's domains, speakers, etc.")


    args = parser.parse_args()
    if not args.extra_features:
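        # nargs='*' leaves a list (or None); normalize the unset case to an empty container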
        args.extra_features = {}
    
    main(args)
Example #8
def cli_main():
    parser = options.get_preprocessing_parser()
    parser.add_argument('--seqtag-data-dir',
                        default=None,
                        type=str,
                        help='directory for IOB formatted data')
    parser.add_argument('--sentencepiece-model',
                        type=str,
                        default=None,
                        help='directory for the sentencepiece model')

    args = parser.parse_args()
    main(args)
Example #9
File: utils.py Project: scheiblr/fairseq
def preprocess_lm_data(data_dir):
    preprocess_parser = options.get_preprocessing_parser()
    preprocess_args = preprocess_parser.parse_args([
        "--only-source",
        "--trainpref",
        os.path.join(data_dir, "train.out"),
        "--validpref",
        os.path.join(data_dir, "valid.out"),
        "--testpref",
        os.path.join(data_dir, "test.out"),
        "--destdir",
        data_dir,
    ])
    preprocess.main(preprocess_args)
Example #10
def preprocess_translation_data(data_dir, extra_flags=None):
    preprocess_parser = options.get_preprocessing_parser()
    preprocess_args = preprocess_parser.parse_args(
        [
            '--source-lang', 'in',
            '--target-lang', 'out',
            '--trainpref', os.path.join(data_dir, 'train'),
            '--validpref', os.path.join(data_dir, 'valid'),
            '--testpref', os.path.join(data_dir, 'test'),
            '--thresholdtgt', '0',
            '--thresholdsrc', '0',
            '--destdir', data_dir,
        ] + (extra_flags or []),
    )
    preprocess.main(preprocess_args)
Example #11
def gen_fairseq_data(source_lang, target_lang, trainpref, validpref, nwordssrc,
                     nwordstgt, destdir):
    from fairseq import options
    from fairseq_cli import preprocess

    parser = options.get_preprocessing_parser()
    args = parser.parse_args()

    args.source_lang = source_lang
    args.target_lang = target_lang
    args.trainpref = trainpref
    args.validpref = validpref
    args.nwordssrc = nwordssrc
    args.nwordstgt = nwordstgt
    args.destdir = destdir
    preprocess.main(args)
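A caveat with this pattern: parse_args() with no argument list reads sys.argv, so gen_fairseq_data will fail if the host program was launched with flags the preprocessing parser does not recognize. A sketch of the same call with the flags passed explicitly instead of mutated afterwards (paths and vocabulary sizes are placeholders):

from fairseq import options
from fairseq_cli import preprocess

parser = options.get_preprocessing_parser()
args = parser.parse_args([
    "--source-lang", "de",
    "--target-lang", "en",
    "--trainpref", "data/train",
    "--validpref", "data/valid",
    "--nwordssrc", "32000",
    "--nwordstgt", "32000",
    "--destdir", "data-bin",
])
preprocess.main(args)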
Example #12
def _init_model(pretrain_model):
    bpe_path = os.path.join(pretrain_model, "bpe.codes")

    BERTweet = RobertaModel.from_pretrained(pretrain_model,
                                            checkpoint_file='model.pt')
    BERTweet.eval()  # disable dropout (or leave in train mode to finetune)

    # Incorporate the BPE encoder into BERTweet-base

    parser = options.get_preprocessing_parser()
    parser.add_argument('--bpe-codes',
                        type=str,
                        help='path to fastBPE BPE',
                        default=bpe_path)
    args = parser.parse_args()
    BERTweet.bpe = fastBPE(args)  # Incorporate the BPE encoder into BERTweet
    return BERTweet
Example #13
def cli_main():
    parser = options.get_preprocessing_parser()
    group = parser.add_argument_group('Preprocessing')

    group.add_argument("--convert_raw",
                       action="store_true",
                       help="convert_raw")
    group.add_argument("--convert_raw_only",
                       action="store_true",
                       help="convert_raw")
    group.add_argument("--convert_with_bpe",
                       action="store_true",
                       help="convert_with_bpe")
    # group.add_argument("--bpe_code", action="store_true", help="convert_with_bpe")
    group.add_argument('--bpe_code', metavar='FILE', help='bpe_code')

    group.add_argument("--no_remove_root",
                       action="store_true",
                       help="no_remove_root")
    group.add_argument("--no_take_pos_tag",
                       action="store_true",
                       help="no_take_pos_tag")
    group.add_argument("--no_take_nodes",
                       action="store_true",
                       help="no_take_nodes")
    group.add_argument("--no_reverse_node",
                       action="store_true",
                       help="no_reverse_node")
    group.add_argument("--no_collapse",
                       action="store_true",
                       help="no_collapse")

    group.add_argument("--raw_workers",
                       metavar="N",
                       default=0,
                       type=int,
                       help="number of parallel workers")
    group.add_argument("--eval_workers",
                       metavar="N",
                       default=0,
                       type=int,
                       help="number of parallel workers")

    args = parser.parse_args()
    main(args)
Example #14
def cli_main():
    parser = options.get_preprocessing_parser()
    args = parser.parse_args()
    #set arguments
    """
    args.source_lang = "src"
    args.target_lang = "tgt"
    args.trainpref = "/home/v-jiaya/fairseq-master/data/test/train"
    args.validpref = "/home/v-jiaya/fairseq-master/data/test/valid"
    args.testpref =  "/home/v-jiaya/fairseq-master/data/test/test"
    args.destdir="/home/v-jiaya/fairseq-master/data/test/data-bin/"
    args.workers=1
    args.src_mask=True
    args.joined_dictionary = True
    """

    #end
    main(args)
Example #15
def fairseq_preprocess(src_lang,
                       tgt_lang,
                       destdir,
                       traindir=None,
                       validdir=None,
                       testdir=None,
                       nworkers=5):
    """
    Helper function to do pre-processing using fairseq-preprocess
    """
    def preprocessing_done():
        if os.path.exists(destdir):
            # TODO : more extensive checks
            print("Warning: Check processed dir manually")
            return True
        else:
            return False

    if not preprocessing_done():
        # TODO : to use FseqArgs
        args = []
        args.append(f"--source-lang={src_lang}")
        #src_dict_path = os.path.join(destdir, f"dict.{src_lang}.txt")
        #args.append(f"--srcdict={src_dict_path}")
        args.append(f"--target-lang={tgt_lang}")
        #tgt_dict_path = os.path.join(destdir, f"dict.{tgt_lang}.txt")
        #args.append(f"--tgtdict={tgt_dict_path}")
        if traindir:
            args.append(f"--trainpref={traindir}/train.tok")
        if validdir:
            args.append(f"--validpref={validdir}/valid.tok")
        if testdir:
            args.append(f"--testpref={testdir}/test.tok")
        args.append(f"--destdir={destdir}")
        args.append(f"--workers={nworkers}")

        # fairseq preprocessing argument parser
        parser = options.get_preprocessing_parser()
        pargs = parser.parse_args(args)
        preprocess.main(pargs)
    else:
        print("Probably, preprocessing is already done. Check dirs.")
Example #16
def fairseq_preprocess(dataset):
    dataset_dir = get_dataset_dir(dataset)
    with lock_directory(dataset_dir):
        preprocessed_dir = dataset_dir / 'fairseq_preprocessed'
        with create_directory_or_skip(preprocessed_dir):
            preprocessing_parser = options.get_preprocessing_parser()
            preprocess_args = preprocessing_parser.parse_args([
                '--source-lang',
                'complex',
                '--target-lang',
                'simple',
                '--trainpref',
                os.path.join(dataset_dir, f'{dataset}.train'),
                '--validpref',
                os.path.join(dataset_dir, f'{dataset}.valid'),
                '--testpref',
                os.path.join(dataset_dir, f'{dataset}.test'),
                '--destdir',
                str(preprocessed_dir),
                '--output-format',
                'raw',
            ])
            preprocess.main(preprocess_args)
        return preprocessed_dir
Example #17
def cli_main():
    parser = options.get_preprocessing_parser()
    args = parser.parse_args()
    main(args)
Example #18
import argparse

from numpy import dot
from numpy.linalg import norm

from fairseq.data import Dictionary
from transformers import RobertaConfig
config = RobertaConfig.from_pretrained("PhoBERT_base_transformers/config.json")
# Load BPE encoder
parser_w2v = argparse.ArgumentParser()
parser_w2v.add_argument('--bpe-codes',
                        default="PhoBERT_base_transformers/bpe.codes",
                        required=False,
                        type=str,
                        help='path to fastBPE BPE')

from fairseq import options
parser_mask = options.get_preprocessing_parser()
parser_mask.add_argument('--bpe-codes',
                         type=str,
                         help='',
                         default="PhoBERT_base_fairseq/bpe.codes")

from vncorenlp import VnCoreNLP
rdrsegmenter = VnCoreNLP("VnCoreNLP-1.1.1.jar",
                         annotators="wseg",
                         max_heap_size='-Xmx500m')
vocab = Dictionary()
vocab.add_from_file("PhoBERT_base_transformers/dict.txt")
from transformers import RobertaModel
phobert_w2v = RobertaModel.from_pretrained(
    "PhoBERT_base_transformers/model.bin", config=config)
import fasttext
model = fasttext.load_model("cc.vi.300.bin")
import torch
from collections import Counter
from typing import List
# Load PhoBERT-base in fairseq
from fairseq.models.roberta import RobertaModel
phobert = RobertaModel.from_pretrained('PhoBERT_base_fairseq',
                                       checkpoint_file='model.pt')
phobert.eval()  # disable dropout (or leave in train mode to finetune)

# Incorporate the BPE encoder into PhoBERT-base
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq import options

parser = options.get_preprocessing_parser()
parser.add_argument('--bpe-codes',
                    type=str,
                    help='path to fastBPE BPE',
                    default="PhoBERT_base_fairseq/bpe.codes")
args = parser.parse_args()
phobert.bpe = fastBPE(args)


def align_bpe_to_words(roberta, bpe_tokens: torch.LongTensor,
                       other_tokens: List[str]):
    """
    Helper to align GPT-2 BPE to other tokenization formats (e.g., spaCy).
    Args:
        roberta (RobertaHubInterface): RoBERTa instance
        bpe_tokens (torch.LongTensor): GPT-2 BPE tokens of shape `(T_bpe)`
Example #20
def lm_scoring(preprocess_directory, bpe_status, gen_output, pre_gen,
               cur_lm_dict, cur_lm_name, cur_language_model, cur_lm_bpe_code,
               batch_size, lm_score_file, target_lang, source_lang, prefix_len=None):
    if prefix_len is not None:
        assert bpe_status == "different", "bpe status must be different to use prefix len"
    if bpe_status == "no bpe":
        # run lm on output without bpe
        write_reprocessed(gen_output.no_bpe_source, gen_output.no_bpe_hypo,
                          gen_output.no_bpe_target, pre_gen+"/rescore_data_no_bpe.de",
                          pre_gen+"/rescore_data_no_bpe.en", pre_gen+"/reference_file_no_bpe")

        preprocess_lm_param = ["--only-source",
                               "--trainpref", pre_gen+"/rescore_data_no_bpe."+target_lang,
                               "--srcdict", cur_lm_dict,
                               "--destdir", preprocess_directory]
        preprocess_parser = options.get_preprocessing_parser()
        input_args = preprocess_parser.parse_args(preprocess_lm_param)
        preprocess.main(input_args)

        eval_lm_param = [preprocess_directory,
                         "--path", cur_language_model,
                         "--output-word-probs",
                         "--batch-size", str(batch_size),
                         "--max-tokens", "1024",
                         "--sample-break-mode", "eos",
                         "--gen-subset", "train"]

        eval_lm_parser = options.get_eval_lm_parser()
        input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param)

        with open(lm_score_file, 'w') as f:
            with redirect_stdout(f):
                eval_lm.main(input_args)

    elif bpe_status == "shared":
            preprocess_lm_param = ["--only-source",
                                   "--trainpref", pre_gen+"/rescore_data."+target_lang,
                                   "--srcdict", cur_lm_dict,
                                   "--destdir", preprocess_directory]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_lm_param)
            preprocess.main(input_args)

            eval_lm_param = [preprocess_directory,
                             "--path", cur_language_model,
                             "--output-word-probs",
                             "--batch-size", str(batch_size),
                             "--sample-break-mode", "eos",
                             "--gen-subset", "train"]

            eval_lm_parser = options.get_eval_lm_parser()
            input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param)

            with open(lm_score_file, 'w') as f:
                with redirect_stdout(f):
                    eval_lm.main(input_args)

    elif bpe_status == "different":
        rescore_file = pre_gen+"/rescore_data_no_bpe"
        rescore_bpe = pre_gen+"/rescore_data_new_bpe"

        rescore_file += "."
        rescore_bpe += "."

        write_reprocessed(gen_output.no_bpe_source, gen_output.no_bpe_hypo,
                          gen_output.no_bpe_target, rescore_file+source_lang,
                          rescore_file+target_lang, pre_gen+"/reference_file_no_bpe",
                          bpe_symbol=None)

        # apply LM bpe to nbest list
        bpe_src_param = ["-c", cur_lm_bpe_code,
                         "--input", rescore_file+target_lang,
                         "--output", rescore_bpe+target_lang]
        subprocess.call(["python",
                         os.path.join(os.path.dirname(__file__),
                                      "subword-nmt/subword_nmt/apply_bpe.py")] + bpe_src_param,
                        shell=False)
        # uncomment to use fastbpe instead of subword-nmt bpe
        # bpe_src_param = [rescore_bpe+target_lang, rescore_file+target_lang, cur_lm_bpe_code]
        # subprocess.call(["/private/home/edunov/fastBPE/fast", "applybpe"] + bpe_src_param, shell=False)

        preprocess_dir = preprocess_directory

        preprocess_lm_param = ["--only-source",
                               "--trainpref", rescore_bpe+target_lang,
                               "--srcdict", cur_lm_dict,
                               "--destdir", preprocess_dir]
        preprocess_parser = options.get_preprocessing_parser()
        input_args = preprocess_parser.parse_args(preprocess_lm_param)
        preprocess.main(input_args)

        eval_lm_param = [preprocess_dir,
                         "--path", cur_language_model,
                         "--output-word-probs",
                         "--batch-size", str(batch_size),
                         "--max-tokens", "1024",
                         "--sample-break-mode", "eos",
                         "--gen-subset", "train"]

        eval_lm_parser = options.get_eval_lm_parser()
        input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param)

        with open(lm_score_file, 'w') as f:
            with redirect_stdout(f):
                eval_lm.main(input_args)
Example #21
def gen_and_reprocess_nbest(args):
    if args.score_dict_dir is None:
        args.score_dict_dir = args.data
    if args.prefix_len is not None:
        assert (args.right_to_left1 is False
                ), "prefix length not compatible with right to left models"
        assert (args.right_to_left2 is False
                ), "prefix length not compatible with right to left models"

    if args.nbest_list is not None:
        assert args.score_model2 is None

    if args.backwards1:
        scorer1_src = args.target_lang
        scorer1_tgt = args.source_lang
    else:
        scorer1_src = args.source_lang
        scorer1_tgt = args.target_lang

    store_data = (os.path.join(os.path.dirname(__file__)) + "/rerank_data/" +
                  args.data_dir_name)
    if not os.path.exists(store_data):
        os.makedirs(store_data)

    (
        pre_gen,
        left_to_right_preprocessed_dir,
        right_to_left_preprocessed_dir,
        backwards_preprocessed_dir,
        lm_preprocessed_dir,
    ) = rerank_utils.get_directories(
        args.data_dir_name,
        args.num_rescore,
        args.gen_subset,
        args.gen_model_name,
        args.shard_id,
        args.num_shards,
        args.sampling,
        args.prefix_len,
        args.target_prefix_frac,
        args.source_prefix_frac,
    )
    assert not (args.right_to_left1
                and args.backwards1), "backwards right to left not supported"
    assert not (args.right_to_left2
                and args.backwards2), "backwards right to left not supported"
    assert not (args.prefix_len is not None
                and args.target_prefix_frac is not None
                ), "target prefix frac and target prefix len incompatible"

    # make directory to store generation results
    if not os.path.exists(pre_gen):
        os.makedirs(pre_gen)

    rerank1_is_gen = (args.gen_model == args.score_model1
                      and args.source_prefix_frac is None)
    rerank2_is_gen = (args.gen_model == args.score_model2
                      and args.source_prefix_frac is None)

    if args.nbest_list is not None:
        rerank2_is_gen = True

    # make directories to store preprocessed nbest list for reranking
    if not os.path.exists(left_to_right_preprocessed_dir):
        os.makedirs(left_to_right_preprocessed_dir)
    if not os.path.exists(right_to_left_preprocessed_dir):
        os.makedirs(right_to_left_preprocessed_dir)
    if not os.path.exists(lm_preprocessed_dir):
        os.makedirs(lm_preprocessed_dir)
    if not os.path.exists(backwards_preprocessed_dir):
        os.makedirs(backwards_preprocessed_dir)

    score1_file = rerank_utils.rescore_file_name(
        pre_gen,
        args.prefix_len,
        args.model1_name,
        target_prefix_frac=args.target_prefix_frac,
        source_prefix_frac=args.source_prefix_frac,
        backwards=args.backwards1,
    )
    if args.score_model2 is not None:
        score2_file = rerank_utils.rescore_file_name(
            pre_gen,
            args.prefix_len,
            args.model2_name,
            target_prefix_frac=args.target_prefix_frac,
            source_prefix_frac=args.source_prefix_frac,
            backwards=args.backwards2,
        )

    predictions_bpe_file = pre_gen + "/generate_output_bpe.txt"

    using_nbest = args.nbest_list is not None

    if using_nbest:
        print("Using predefined n-best list from interactive.py")
        predictions_bpe_file = args.nbest_list

    else:
        if not os.path.isfile(predictions_bpe_file):
            print(
                "STEP 1: generate predictions using the p(T|S) model with bpe")
            print(args.data)
            param1 = [
                args.data,
                "--path",
                args.gen_model,
                "--shard-id",
                str(args.shard_id),
                "--num-shards",
                str(args.num_shards),
                "--nbest",
                str(args.num_rescore),
                "--batch-size",
                str(args.batch_size),
                "--beam",
                str(args.num_rescore),
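                # note: "--batch-size" was already set above; argparse keeps the later value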
                "--batch-size",
                str(args.num_rescore),
                "--gen-subset",
                args.gen_subset,
                "--source-lang",
                args.source_lang,
                "--target-lang",
                args.target_lang,
            ]
            if args.sampling:
                param1 += ["--sampling"]

            gen_parser = options.get_generation_parser()
            input_args = options.parse_args_and_arch(gen_parser, param1)

            print(input_args)
            with open(predictions_bpe_file, "w") as f:
                with redirect_stdout(f):
                    generate.main(input_args)

    gen_output = rerank_utils.BitextOutputFromGen(
        predictions_bpe_file,
        bpe_symbol=args.post_process,
        nbest=using_nbest,
        prefix_len=args.prefix_len,
        target_prefix_frac=args.target_prefix_frac,
    )

    if args.diff_bpe:
        rerank_utils.write_reprocessed(
            gen_output.no_bpe_source,
            gen_output.no_bpe_hypo,
            gen_output.no_bpe_target,
            pre_gen + "/source_gen_bpe." + args.source_lang,
            pre_gen + "/target_gen_bpe." + args.target_lang,
            pre_gen + "/reference_gen_bpe." + args.target_lang,
        )
        bitext_bpe = args.rescore_bpe_code
        bpe_src_param = [
            "-c",
            bitext_bpe,
            "--input",
            pre_gen + "/source_gen_bpe." + args.source_lang,
            "--output",
            pre_gen + "/rescore_data." + args.source_lang,
        ]
        bpe_tgt_param = [
            "-c",
            bitext_bpe,
            "--input",
            pre_gen + "/target_gen_bpe." + args.target_lang,
            "--output",
            pre_gen + "/rescore_data." + args.target_lang,
        ]

        subprocess.call(
            [
                "python",
                os.path.join(os.path.dirname(__file__),
                             "subword-nmt/subword_nmt/apply_bpe.py"),
            ] + bpe_src_param,
            shell=False,
        )

        subprocess.call(
            [
                "python",
                os.path.join(os.path.dirname(__file__),
                             "subword-nmt/subword_nmt/apply_bpe.py"),
            ] + bpe_tgt_param,
            shell=False,
        )

    if (not os.path.isfile(score1_file)
            and not rerank1_is_gen) or (args.score_model2 is not None
                                        and not os.path.isfile(score2_file)
                                        and not rerank2_is_gen):
        print(
            "STEP 2: process the output of generate.py so we have clean text files with the translations"
        )

        rescore_file = "/rescore_data"
        if args.prefix_len is not None:
            prefix_len_rescore_file = rescore_file + "prefix" + str(
                args.prefix_len)
        if args.target_prefix_frac is not None:
            target_prefix_frac_rescore_file = (rescore_file +
                                               "target_prefix_frac" +
                                               str(args.target_prefix_frac))
        if args.source_prefix_frac is not None:
            source_prefix_frac_rescore_file = (rescore_file +
                                               "source_prefix_frac" +
                                               str(args.source_prefix_frac))

        if not args.right_to_left1 or not args.right_to_left2:
            if not args.diff_bpe:
                rerank_utils.write_reprocessed(
                    gen_output.source,
                    gen_output.hypo,
                    gen_output.target,
                    pre_gen + rescore_file + "." + args.source_lang,
                    pre_gen + rescore_file + "." + args.target_lang,
                    pre_gen + "/reference_file",
                    bpe_symbol=args.post_process,
                )
                if args.prefix_len is not None:
                    bw_rescore_file = prefix_len_rescore_file
                    rerank_utils.write_reprocessed(
                        gen_output.source,
                        gen_output.hypo,
                        gen_output.target,
                        pre_gen + prefix_len_rescore_file + "." +
                        args.source_lang,
                        pre_gen + prefix_len_rescore_file + "." +
                        args.target_lang,
                        pre_gen + "/reference_file",
                        prefix_len=args.prefix_len,
                        bpe_symbol=args.post_process,
                    )
                elif args.target_prefix_frac is not None:
                    bw_rescore_file = target_prefix_frac_rescore_file
                    rerank_utils.write_reprocessed(
                        gen_output.source,
                        gen_output.hypo,
                        gen_output.target,
                        pre_gen + target_prefix_frac_rescore_file + "." +
                        args.source_lang,
                        pre_gen + target_prefix_frac_rescore_file + "." +
                        args.target_lang,
                        pre_gen + "/reference_file",
                        bpe_symbol=args.post_process,
                        target_prefix_frac=args.target_prefix_frac,
                    )
                else:
                    bw_rescore_file = rescore_file

                if args.source_prefix_frac is not None:
                    fw_rescore_file = source_prefix_frac_rescore_file
                    rerank_utils.write_reprocessed(
                        gen_output.source,
                        gen_output.hypo,
                        gen_output.target,
                        pre_gen + source_prefix_frac_rescore_file + "." +
                        args.source_lang,
                        pre_gen + source_prefix_frac_rescore_file + "." +
                        args.target_lang,
                        pre_gen + "/reference_file",
                        bpe_symbol=args.post_process,
                        source_prefix_frac=args.source_prefix_frac,
                    )
                else:
                    fw_rescore_file = rescore_file

        if args.right_to_left1 or args.right_to_left2:
            rerank_utils.write_reprocessed(
                gen_output.source,
                gen_output.hypo,
                gen_output.target,
                pre_gen + "/right_to_left_rescore_data." + args.source_lang,
                pre_gen + "/right_to_left_rescore_data." + args.target_lang,
                pre_gen + "/right_to_left_reference_file",
                right_to_left=True,
                bpe_symbol=args.post_process,
            )

        print("STEP 3: binarize the translations")
        if (not args.right_to_left1
                or args.score_model2 is not None and not args.right_to_left2
                or not rerank1_is_gen):

            if args.backwards1 or args.backwards2:
                if args.backwards_score_dict_dir is not None:
                    bw_dict = args.backwards_score_dict_dir
                else:
                    bw_dict = args.score_dict_dir
                bw_preprocess_param = [
                    "--source-lang",
                    scorer1_src,
                    "--target-lang",
                    scorer1_tgt,
                    "--trainpref",
                    pre_gen + bw_rescore_file,
                    "--srcdict",
                    bw_dict + "/dict." + scorer1_src + ".txt",
                    "--tgtdict",
                    bw_dict + "/dict." + scorer1_tgt + ".txt",
                    "--destdir",
                    backwards_preprocessed_dir,
                ]
                preprocess_parser = options.get_preprocessing_parser()
                input_args = preprocess_parser.parse_args(bw_preprocess_param)
                preprocess.main(input_args)

            preprocess_param = [
                "--source-lang",
                scorer1_src,
                "--target-lang",
                scorer1_tgt,
                "--trainpref",
                pre_gen + fw_rescore_file,
                "--srcdict",
                args.score_dict_dir + "/dict." + scorer1_src + ".txt",
                "--tgtdict",
                args.score_dict_dir + "/dict." + scorer1_tgt + ".txt",
                "--destdir",
                left_to_right_preprocessed_dir,
            ]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

        if args.right_to_left1 or args.right_to_left2:
            preprocess_param = [
                "--source-lang",
                scorer1_src,
                "--target-lang",
                scorer1_tgt,
                "--trainpref",
                pre_gen + "/right_to_left_rescore_data",
                "--srcdict",
                args.score_dict_dir + "/dict." + scorer1_src + ".txt",
                "--tgtdict",
                args.score_dict_dir + "/dict." + scorer1_tgt + ".txt",
                "--destdir",
                right_to_left_preprocessed_dir,
            ]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

    return gen_output
Example #22
def binarize(arguments):
    parser = options.get_preprocessing_parser()
    args = parser.parse_args(arguments)
    binarize_func(args)
Example #23
def cli_main():
    parser = options.get_preprocessing_parser()
    args = parser.parse_args()
    src_dict, tgt_dict, char_dict = prepare_dict(args)
    prepare_raw_data(args, src_dict, tgt_dict, char_dict)