Example #1
def preprocess_summarization_data(data_dir, extra_flags=None):
    preprocess_parser = options.get_preprocessing_parser()
    preprocess_args = preprocess_parser.parse_args(
        [
            "--source-lang",
            "in",
            "--target-lang",
            "out",
            "--trainpref",
            os.path.join(data_dir, "train"),
            "--validpref",
            os.path.join(data_dir, "valid"),
            "--testpref",
            os.path.join(data_dir, "test"),
            "--thresholdtgt",
            "0",
            "--thresholdsrc",
            "0",
            "--joined-dictionary",
            "--destdir",
            data_dir,
        ]
        + (extra_flags or []),
    )
    preprocess.main(preprocess_args)
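A minimal driver for the helper above might look like the following sketch; the directory path and the extra "--workers" flag are placeholders, and data_dir is assumed to already contain train/valid/test files with .in and .out extensions:

import os

from fairseq import options
from fairseq_cli import preprocess

data_dir = "checkpoints/sum_data"  # hypothetical; must hold {train,valid,test}.{in,out}
preprocess_summarization_data(data_dir, extra_flags=["--workers", "4"])
# The binarized dataset and a joined dictionary are written back into data_dir.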
Example #2
def preprocess_lm_data(data_dir, languages=None):
    preprocess_parser = options.get_preprocessing_parser()
    if languages is None:
        preprocess_args = preprocess_parser.parse_args([
            "--only-source",
            "--trainpref",
            os.path.join(data_dir, "train.out"),
            "--validpref",
            os.path.join(data_dir, "valid.out"),
            "--testpref",
            os.path.join(data_dir, "test.out"),
            "--destdir",
            data_dir,
        ])
        preprocess.main(preprocess_args)
    else:
        for lang in languages:
            lang_dir = os.path.join(data_dir, lang)
            assert os.path.exists(lang_dir)
            preprocess_args = preprocess_parser.parse_args([
                "--only-source",
                "--trainpref",
                os.path.join(lang_dir, "train.out"),
                "--validpref",
                os.path.join(lang_dir, "valid.out"),
                "--testpref",
                os.path.join(lang_dir, "test.out"),
                "--destdir",
                lang_dir,
            ])
            preprocess.main(preprocess_args)
        shutil.copyfile(
            os.path.join(data_dir, languages[0], "dict.txt"),
            os.path.join(data_dir, "dict.txt"),
        )
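A possible driver for the multilingual branch, assuming per-language subdirectories data_dir/<lang>/{train,valid,test}.out and module-level imports of os, shutil, fairseq's options, and fairseq_cli's preprocess (the path and language codes are placeholders):

preprocess_lm_data("checkpoints/multilingual_lm", languages=["en", "de"])
# Each language is binarized into its own subdirectory; afterwards the dict.txt
# of the first language ("en" here) is copied up to data_dir/dict.txt.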
Example #3
def preprocess_lm_data(data_dir):
    preprocess_parser = options.get_preprocessing_parser()
    preprocess_args = preprocess_parser.parse_args([
        '--only-source',
        '--trainpref', os.path.join(data_dir, 'train.out'),
        '--validpref', os.path.join(data_dir, 'valid.out'),
        '--testpref', os.path.join(data_dir, 'test.out'),
        '--destdir', data_dir,
    ])
    preprocess.main(preprocess_args)
Example #4
def main(args):
    # Convert raw text into a character-sequence / boundary-label parallel
    # corpus, then binarize it with fairseq's preprocess.main().

    os.makedirs(args.destdir, exist_ok=True)

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, suffix):
        return os.path.join(args.destdir, file_name(prefix, suffix))

    def dest_prefix(prefix):
        return os.path.join(args.destdir, prefix)

    def prepare_label(input_prefix, output_prefix):
        with open(file_name(input_prefix, args.source_lang),
                  "r",
                  encoding="utf-8") as src_file:
            with open(dest_path(output_prefix, "char"), "w",
                      encoding="utf-8") as char_file:
                with open(dest_path(output_prefix, "label"),
                          "w",
                          encoding="utf-8") as label_file:
                    for line in src_file:
                        char_file.write(
                            segmenter_utils.to_character_sequence(line))
                        label_file.write(
                            segmenter_utils.to_boundary_tag(line) + "\n")

    def prepare_dataset():
        if args.trainpref:
            prepare_label(args.trainpref, "train")
        if args.validpref:
            prepare_label(args.validpref, "valid")
        if args.testpref:
            prepare_label(args.testpref, "test")

    def override_args():
        args.source_lang = "char"
        args.target_lang = "label"
        args.only_source = False
        args.joined_dictionary = False

        if args.trainpref:
            args.trainpref = dest_prefix("train")
        if args.validpref:
            args.validpref = dest_prefix("valid")
        if args.testpref:
            args.testpref = dest_prefix("test")

    prepare_dataset()
    override_args()
    preprocess.main(args)
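One way to drive this entry point is to reuse fairseq's preprocessing parser, which provides every field the function touches (source_lang, trainpref, validpref, testpref, destdir, ...); the flag values below are placeholders for a hypothetical dataset:

from fairseq import options

parser = options.get_preprocessing_parser()
args = parser.parse_args([
    "--source-lang", "txt",                  # extension of the raw-text input files
    "--trainpref", "data/segmenter/train",   # reads data/segmenter/train.txt
    "--validpref", "data/segmenter/valid",
    "--destdir", "data-bin/segmenter",
])
main(args)  # writes <split>.char / <split>.label files, then binarizes them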
Example #5
def preprocess_lm_data(data_dir):
    preprocess_parser = options.get_preprocessing_parser()
    preprocess_args = preprocess_parser.parse_args([
        "--only-source",
        "--trainpref",
        os.path.join(data_dir, "train.out"),
        "--validpref",
        os.path.join(data_dir, "valid.out"),
        "--testpref",
        os.path.join(data_dir, "test.out"),
        "--destdir",
        data_dir,
    ])
    preprocess.main(preprocess_args)
Example #6
def preprocess_translation_data(data_dir, extra_flags=None):
    preprocess_parser = options.get_preprocessing_parser()
    preprocess_args = preprocess_parser.parse_args(
        [
            '--source-lang', 'in',
            '--target-lang', 'out',
            '--trainpref', os.path.join(data_dir, 'train'),
            '--validpref', os.path.join(data_dir, 'valid'),
            '--testpref', os.path.join(data_dir, 'test'),
            '--thresholdtgt', '0',
            '--thresholdsrc', '0',
            '--destdir', data_dir,
        ] + (extra_flags or []),
    )
    preprocess.main(preprocess_args)
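Because extra_flags is appended verbatim to the argument list, dictionary and worker options can be passed straight through; for example (the directory is a placeholder, and --joined-dictionary is the same flag already used in Example #1):

preprocess_translation_data(
    "checkpoints/translation_data",
    extra_flags=["--joined-dictionary", "--workers", "4"],
)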
Example #7
def gen_fairseq_data(source_lang, target_lang, trainpref, validpref, nwordssrc,
                     nwordstgt, destdir):
    from fairseq import options
    from fairseq_cli import preprocess

    parser = options.get_preprocessing_parser()
    # parse_args() without an explicit list reads sys.argv; pass [] to avoid
    # picking up the caller's own command-line flags.
    args = parser.parse_args()

    args.source_lang = source_lang
    args.target_lang = target_lang
    args.trainpref = trainpref
    args.validpref = validpref
    args.nwordssrc = nwordssrc
    args.nwordstgt = nwordstgt
    args.destdir = destdir
    preprocess.main(args)
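A hypothetical call, with prefixes pointing at tokenized parallel text (all names and sizes below are placeholders):

gen_fairseq_data(
    source_lang="src",
    target_lang="tgt",
    trainpref="data/tok/train",   # expects train.src / train.tgt
    validpref="data/tok/valid",
    nwordssrc=32000,              # source/target vocabulary size limits
    nwordstgt=32000,
    destdir="data-bin/src-tgt",
)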
Example #8
def fairseq_preprocess(src_lang,
                       tgt_lang,
                       destdir,
                       traindir=None,
                       validdir=None,
                       testdir=None,
                       nworkers=5):
    """
    Helper function to run pre-processing with fairseq-preprocess.
    """
    def preprocessing_done():
        if os.path.exists(destdir):
            # TODO : more extensive checks
            print("Warning: Check processed dir manually")
            return True
        else:
            return False

    if not preprocessing_done():
        # TODO : to use FseqArgs
        args = []
        args.append(f"--source-lang={src_lang}")
        #src_dict_path = os.path.join(destdir, f"dict.{src_lang}.txt")
        #args.append(f"--srcdict={src_dict_path}")
        args.append(f"--target-lang={tgt_lang}")
        #tgt_dict_path = os.path.join(destdir, f"dict.{tgt_lang}.txt")
        #args.append(f"--tgtdict={tgt_dict_path}")
        if traindir:
            args.append(f"--trainpref={traindir}/train.tok")
        if validdir:
            args.append(f"--validpref={validdir}/valid.tok")
        if testdir:
            args.append(f"--testpref={testdir}/test.tok")
        args.append(f"--destdir={destdir}")
        args.append(f"--workers={nworkers}")

        # fairseq preprocessing argument parser
        parser = options.get_preprocessing_parser()
        pargs = parser.parse_args(args)
        preprocess.main(pargs)
    else:
        print("Probably, preprocessing is already done. Check dirs.")
Example #9
def fairseq_preprocess(dataset):
    dataset_dir = get_dataset_dir(dataset)
    with lock_directory(dataset_dir):
        preprocessed_dir = dataset_dir / 'fairseq_preprocessed'
        with create_directory_or_skip(preprocessed_dir):
            preprocessing_parser = options.get_preprocessing_parser()
            preprocess_args = preprocessing_parser.parse_args([
                '--source-lang',
                'complex',
                '--target-lang',
                'simple',
                '--trainpref',
                os.path.join(dataset_dir, f'{dataset}.train'),
                '--validpref',
                os.path.join(dataset_dir, f'{dataset}.valid'),
                '--testpref',
                os.path.join(dataset_dir, f'{dataset}.test'),
                '--destdir',
                str(preprocessed_dir),
                '--output-format',
                'raw',
            ])
            preprocess.main(preprocess_args)
        return preprocessed_dir
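A possible call, assuming get_dataset_dir resolves to a directory containing <dataset>.{train,valid,test}.{complex,simple} files (the dataset name is a placeholder):

preprocessed_dir = fairseq_preprocess("my_simplification_dataset")
# Output is written once in raw format (--output-format raw); on subsequent calls
# create_directory_or_skip() presumably skips the work and the cached directory
# path is simply returned.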
Example #10
def lm_scoring(
    preprocess_directory,
    bpe_status,
    gen_output,
    pre_gen,
    cur_lm_dict,
    cur_lm_name,
    cur_language_model,
    cur_lm_bpe_code,
    batch_size,
    lm_score_file,
    target_lang,
    source_lang,
    prefix_len=None,
):
    if prefix_len is not None:
        assert (bpe_status == "different"
                ), "bpe status must be different to use prefix len"
    if bpe_status == "no bpe":
        # run lm on output without bpe
        write_reprocessed(
            gen_output.no_bpe_source,
            gen_output.no_bpe_hypo,
            gen_output.no_bpe_target,
            pre_gen + "/rescore_data_no_bpe.de",
            pre_gen + "/rescore_data_no_bpe.en",
            pre_gen + "/reference_file_no_bpe",
        )

        preprocess_lm_param = [
            "--only-source",
            "--trainpref",
            pre_gen + "/rescore_data_no_bpe." + target_lang,
            "--srcdict",
            cur_lm_dict,
            "--destdir",
            preprocess_directory,
        ]
        preprocess_parser = options.get_preprocessing_parser()
        input_args = preprocess_parser.parse_args(preprocess_lm_param)
        preprocess.main(input_args)

        eval_lm_param = [
            preprocess_directory,
            "--path",
            cur_language_model,
            "--output-word-probs",
            "--batch-size",
            str(batch_size),
            "--max-tokens",
            "1024",
            "--sample-break-mode",
            "eos",
            "--gen-subset",
            "train",
        ]

        eval_lm_parser = options.get_eval_lm_parser()
        input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param)

        with open(lm_score_file, "w") as f:
            with redirect_stdout(f):
                eval_lm.main(input_args)

    elif bpe_status == "shared":
        preprocess_lm_param = [
            "--only-source",
            "--trainpref",
            pre_gen + "/rescore_data." + target_lang,
            "--srcdict",
            cur_lm_dict,
            "--destdir",
            preprocess_directory,
        ]
        preprocess_parser = options.get_preprocessing_parser()
        input_args = preprocess_parser.parse_args(preprocess_lm_param)
        preprocess.main(input_args)

        eval_lm_param = [
            preprocess_directory,
            "--path",
            cur_language_model,
            "--output-word-probs",
            "--batch-size",
            str(batch_size),
            "--sample-break-mode",
            "eos",
            "--gen-subset",
            "train",
        ]

        eval_lm_parser = options.get_eval_lm_parser()
        input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param)

        with open(lm_score_file, "w") as f:
            with redirect_stdout(f):
                eval_lm.main(input_args)

    elif bpe_status == "different":
        rescore_file = pre_gen + "/rescore_data_no_bpe"
        rescore_bpe = pre_gen + "/rescore_data_new_bpe"

        rescore_file += "."
        rescore_bpe += "."

        write_reprocessed(
            gen_output.no_bpe_source,
            gen_output.no_bpe_hypo,
            gen_output.no_bpe_target,
            rescore_file + source_lang,
            rescore_file + target_lang,
            pre_gen + "/reference_file_no_bpe",
            bpe_symbol=None,
        )

        # apply LM bpe to nbest list
        bpe_src_param = [
            "-c",
            cur_lm_bpe_code,
            "--input",
            rescore_file + target_lang,
            "--output",
            rescore_bpe + target_lang,
        ]
        subprocess.call(
            [
                "python",
                os.path.join(os.path.dirname(__file__),
                             "subword-nmt/subword_nmt/apply_bpe.py"),
            ] + bpe_src_param,
            shell=False,
        )
        # uncomment to use fastbpe instead of subword-nmt bpe
        # bpe_src_param = [rescore_bpe+target_lang, rescore_file+target_lang, cur_lm_bpe_code]
        # subprocess.call(["/private/home/edunov/fastBPE/fast", "applybpe"] + bpe_src_param, shell=False)

        preprocess_dir = preprocess_directory

        preprocess_lm_param = [
            "--only-source",
            "--trainpref",
            rescore_bpe + target_lang,
            "--srcdict",
            cur_lm_dict,
            "--destdir",
            preprocess_dir,
        ]
        preprocess_parser = options.get_preprocessing_parser()
        input_args = preprocess_parser.parse_args(preprocess_lm_param)
        preprocess.main(input_args)

        eval_lm_param = [
            preprocess_dir,
            "--path",
            cur_language_model,
            "--output-word-probs",
            "--batch-size",
            str(batch_size),
            "--max-tokens",
            "1024",
            "--sample-break-mode",
            "eos",
            "--gen-subset",
            "train",
        ]

        eval_lm_parser = options.get_eval_lm_parser()
        input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param)

        with open(lm_score_file, "w") as f:
            with redirect_stdout(f):
                eval_lm.main(input_args)
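The eval-lm invocation pattern above recurs in every branch, so a stripped-down sketch of it may be useful on its own; the data directory and checkpoint path are placeholders, and eval_lm is assumed to be fairseq_cli.eval_lm:

from contextlib import redirect_stdout

from fairseq import options
from fairseq_cli import eval_lm

eval_lm_args = options.parse_args_and_arch(
    options.get_eval_lm_parser(),
    [
        "data-bin/lm",                  # directory produced by the preprocessing step
        "--path", "checkpoints/lm.pt",  # trained language model
        "--output-word-probs",
        "--sample-break-mode", "eos",
        "--gen-subset", "train",
    ],
)

# Word-level probabilities are printed to stdout, so capture them in a file.
with open("lm_scores.txt", "w") as f:
    with redirect_stdout(f):
        eval_lm.main(eval_lm_args)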
Example #11
def gen_and_reprocess_nbest(args):
    if args.score_dict_dir is None:
        args.score_dict_dir = args.data
    if args.prefix_len is not None:
        assert (args.right_to_left1 is False
                ), "prefix length not compatible with right to left models"
        assert (args.right_to_left2 is False
                ), "prefix length not compatible with right to left models"

    if args.nbest_list is not None:
        assert args.score_model2 is None

    if args.backwards1:
        scorer1_src = args.target_lang
        scorer1_tgt = args.source_lang
    else:
        scorer1_src = args.source_lang
        scorer1_tgt = args.target_lang

    store_data = (os.path.join(os.path.dirname(__file__)) + "/rerank_data/" +
                  args.data_dir_name)
    if not os.path.exists(store_data):
        os.makedirs(store_data)

    (
        pre_gen,
        left_to_right_preprocessed_dir,
        right_to_left_preprocessed_dir,
        backwards_preprocessed_dir,
        lm_preprocessed_dir,
    ) = rerank_utils.get_directories(
        args.data_dir_name,
        args.num_rescore,
        args.gen_subset,
        args.gen_model_name,
        args.shard_id,
        args.num_shards,
        args.sampling,
        args.prefix_len,
        args.target_prefix_frac,
        args.source_prefix_frac,
    )
    assert not (args.right_to_left1
                and args.backwards1), "backwards right to left not supported"
    assert not (args.right_to_left2
                and args.backwards2), "backwards right to left not supported"
    assert not (args.prefix_len is not None
                and args.target_prefix_frac is not None
                ), "target prefix frac and target prefix len incompatible"

    # make directory to store generation results
    if not os.path.exists(pre_gen):
        os.makedirs(pre_gen)

    rerank1_is_gen = (args.gen_model == args.score_model1
                      and args.source_prefix_frac is None)
    rerank2_is_gen = (args.gen_model == args.score_model2
                      and args.source_prefix_frac is None)

    if args.nbest_list is not None:
        rerank2_is_gen = True

    # make directories to store the preprocessed nbest list for reranking
    if not os.path.exists(left_to_right_preprocessed_dir):
        os.makedirs(left_to_right_preprocessed_dir)
    if not os.path.exists(right_to_left_preprocessed_dir):
        os.makedirs(right_to_left_preprocessed_dir)
    if not os.path.exists(lm_preprocessed_dir):
        os.makedirs(lm_preprocessed_dir)
    if not os.path.exists(backwards_preprocessed_dir):
        os.makedirs(backwards_preprocessed_dir)

    score1_file = rerank_utils.rescore_file_name(
        pre_gen,
        args.prefix_len,
        args.model1_name,
        target_prefix_frac=args.target_prefix_frac,
        source_prefix_frac=args.source_prefix_frac,
        backwards=args.backwards1,
    )
    if args.score_model2 is not None:
        score2_file = rerank_utils.rescore_file_name(
            pre_gen,
            args.prefix_len,
            args.model2_name,
            target_prefix_frac=args.target_prefix_frac,
            source_prefix_frac=args.source_prefix_frac,
            backwards=args.backwards2,
        )

    predictions_bpe_file = pre_gen + "/generate_output_bpe.txt"

    using_nbest = args.nbest_list is not None

    if using_nbest:
        print("Using predefined n-best list from interactive.py")
        predictions_bpe_file = args.nbest_list

    else:
        if not os.path.isfile(predictions_bpe_file):
            print(
                "STEP 1: generate predictions using the p(T|S) model with bpe")
            print(args.data)
            param1 = [
                args.data,
                "--path",
                args.gen_model,
                "--shard-id",
                str(args.shard_id),
                "--num-shards",
                str(args.num_shards),
                "--nbest",
                str(args.num_rescore),
                "--batch-size",
                str(args.batch_size),
                "--beam",
                str(args.num_rescore),
                "--batch-size",
                str(args.num_rescore),
                "--gen-subset",
                args.gen_subset,
                "--source-lang",
                args.source_lang,
                "--target-lang",
                args.target_lang,
            ]
            if args.sampling:
                param1 += ["--sampling"]

            gen_parser = options.get_generation_parser()
            input_args = options.parse_args_and_arch(gen_parser, param1)

            print(input_args)
            with open(predictions_bpe_file, "w") as f:
                with redirect_stdout(f):
                    generate.main(input_args)

    gen_output = rerank_utils.BitextOutputFromGen(
        predictions_bpe_file,
        bpe_symbol=args.post_process,
        nbest=using_nbest,
        prefix_len=args.prefix_len,
        target_prefix_frac=args.target_prefix_frac,
    )

    if args.diff_bpe:
        rerank_utils.write_reprocessed(
            gen_output.no_bpe_source,
            gen_output.no_bpe_hypo,
            gen_output.no_bpe_target,
            pre_gen + "/source_gen_bpe." + args.source_lang,
            pre_gen + "/target_gen_bpe." + args.target_lang,
            pre_gen + "/reference_gen_bpe." + args.target_lang,
        )
        bitext_bpe = args.rescore_bpe_code
        bpe_src_param = [
            "-c",
            bitext_bpe,
            "--input",
            pre_gen + "/source_gen_bpe." + args.source_lang,
            "--output",
            pre_gen + "/rescore_data." + args.source_lang,
        ]
        bpe_tgt_param = [
            "-c",
            bitext_bpe,
            "--input",
            pre_gen + "/target_gen_bpe." + args.target_lang,
            "--output",
            pre_gen + "/rescore_data." + args.target_lang,
        ]

        subprocess.call(
            [
                "python",
                os.path.join(os.path.dirname(__file__),
                             "subword-nmt/subword_nmt/apply_bpe.py"),
            ] + bpe_src_param,
            shell=False,
        )

        subprocess.call(
            [
                "python",
                os.path.join(os.path.dirname(__file__),
                             "subword-nmt/subword_nmt/apply_bpe.py"),
            ] + bpe_tgt_param,
            shell=False,
        )

    if (not os.path.isfile(score1_file)
            and not rerank1_is_gen) or (args.score_model2 is not None
                                        and not os.path.isfile(score2_file)
                                        and not rerank2_is_gen):
        print(
            "STEP 2: process the output of generate.py so we have clean text files with the translations"
        )

        rescore_file = "/rescore_data"
        if args.prefix_len is not None:
            prefix_len_rescore_file = rescore_file + "prefix" + str(
                args.prefix_len)
        if args.target_prefix_frac is not None:
            target_prefix_frac_rescore_file = (rescore_file +
                                               "target_prefix_frac" +
                                               str(args.target_prefix_frac))
        if args.source_prefix_frac is not None:
            source_prefix_frac_rescore_file = (rescore_file +
                                               "source_prefix_frac" +
                                               str(args.source_prefix_frac))

        if not args.right_to_left1 or not args.right_to_left2:
            if not args.diff_bpe:
                rerank_utils.write_reprocessed(
                    gen_output.source,
                    gen_output.hypo,
                    gen_output.target,
                    pre_gen + rescore_file + "." + args.source_lang,
                    pre_gen + rescore_file + "." + args.target_lang,
                    pre_gen + "/reference_file",
                    bpe_symbol=args.post_process,
                )
                if args.prefix_len is not None:
                    bw_rescore_file = prefix_len_rescore_file
                    rerank_utils.write_reprocessed(
                        gen_output.source,
                        gen_output.hypo,
                        gen_output.target,
                        pre_gen + prefix_len_rescore_file + "." +
                        args.source_lang,
                        pre_gen + prefix_len_rescore_file + "." +
                        args.target_lang,
                        pre_gen + "/reference_file",
                        prefix_len=args.prefix_len,
                        bpe_symbol=args.post_process,
                    )
                elif args.target_prefix_frac is not None:
                    bw_rescore_file = target_prefix_frac_rescore_file
                    rerank_utils.write_reprocessed(
                        gen_output.source,
                        gen_output.hypo,
                        gen_output.target,
                        pre_gen + target_prefix_frac_rescore_file + "." +
                        args.source_lang,
                        pre_gen + target_prefix_frac_rescore_file + "." +
                        args.target_lang,
                        pre_gen + "/reference_file",
                        bpe_symbol=args.post_process,
                        target_prefix_frac=args.target_prefix_frac,
                    )
                else:
                    bw_rescore_file = rescore_file

                if args.source_prefix_frac is not None:
                    fw_rescore_file = source_prefix_frac_rescore_file
                    rerank_utils.write_reprocessed(
                        gen_output.source,
                        gen_output.hypo,
                        gen_output.target,
                        pre_gen + source_prefix_frac_rescore_file + "." +
                        args.source_lang,
                        pre_gen + source_prefix_frac_rescore_file + "." +
                        args.target_lang,
                        pre_gen + "/reference_file",
                        bpe_symbol=args.post_process,
                        source_prefix_frac=args.source_prefix_frac,
                    )
                else:
                    fw_rescore_file = rescore_file

        if args.right_to_left1 or args.right_to_left2:
            rerank_utils.write_reprocessed(
                gen_output.source,
                gen_output.hypo,
                gen_output.target,
                pre_gen + "/right_to_left_rescore_data." + args.source_lang,
                pre_gen + "/right_to_left_rescore_data." + args.target_lang,
                pre_gen + "/right_to_left_reference_file",
                right_to_left=True,
                bpe_symbol=args.post_process,
            )

        print("STEP 3: binarize the translations")
        if (not args.right_to_left1
                or args.score_model2 is not None and not args.right_to_left2
                or not rerank1_is_gen):

            if args.backwards1 or args.backwards2:
                if args.backwards_score_dict_dir is not None:
                    bw_dict = args.backwards_score_dict_dir
                else:
                    bw_dict = args.score_dict_dir
                bw_preprocess_param = [
                    "--source-lang",
                    scorer1_src,
                    "--target-lang",
                    scorer1_tgt,
                    "--trainpref",
                    pre_gen + bw_rescore_file,
                    "--srcdict",
                    bw_dict + "/dict." + scorer1_src + ".txt",
                    "--tgtdict",
                    bw_dict + "/dict." + scorer1_tgt + ".txt",
                    "--destdir",
                    backwards_preprocessed_dir,
                ]
                preprocess_parser = options.get_preprocessing_parser()
                input_args = preprocess_parser.parse_args(bw_preprocess_param)
                preprocess.main(input_args)

            preprocess_param = [
                "--source-lang",
                scorer1_src,
                "--target-lang",
                scorer1_tgt,
                "--trainpref",
                pre_gen + fw_rescore_file,
                "--srcdict",
                args.score_dict_dir + "/dict." + scorer1_src + ".txt",
                "--tgtdict",
                args.score_dict_dir + "/dict." + scorer1_tgt + ".txt",
                "--destdir",
                left_to_right_preprocessed_dir,
            ]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

        if args.right_to_left1 or args.right_to_left2:
            preprocess_param = [
                "--source-lang",
                scorer1_src,
                "--target-lang",
                scorer1_tgt,
                "--trainpref",
                pre_gen + "/right_to_left_rescore_data",
                "--srcdict",
                args.score_dict_dir + "/dict." + scorer1_src + ".txt",
                "--tgtdict",
                args.score_dict_dir + "/dict." + scorer1_tgt + ".txt",
                "--destdir",
                right_to_left_preprocessed_dir,
            ]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

    return gen_output