def preprocess_summarization_data(data_dir, extra_flags=None): preprocess_parser = options.get_preprocessing_parser() preprocess_args = preprocess_parser.parse_args( [ "--source-lang", "in", "--target-lang", "out", "--trainpref", os.path.join(data_dir, "train"), "--validpref", os.path.join(data_dir, "valid"), "--testpref", os.path.join(data_dir, "test"), "--thresholdtgt", "0", "--thresholdsrc", "0", "--joined-dictionary", "--destdir", data_dir, ] + (extra_flags or []), ) preprocess.main(preprocess_args)
def preprocess_lm_data(data_dir, languages=None): preprocess_parser = options.get_preprocessing_parser() if languages is None: preprocess_args = preprocess_parser.parse_args([ "--only-source", "--trainpref", os.path.join(data_dir, "train.out"), "--validpref", os.path.join(data_dir, "valid.out"), "--testpref", os.path.join(data_dir, "test.out"), "--destdir", data_dir, ]) preprocess.main(preprocess_args) else: for lang in languages: lang_dir = os.path.join(data_dir, lang) assert os.path.exists(lang_dir) preprocess_args = preprocess_parser.parse_args([ "--only-source", "--trainpref", os.path.join(lang_dir, "train.out"), "--validpref", os.path.join(lang_dir, "valid.out"), "--testpref", os.path.join(lang_dir, "test.out"), "--destdir", lang_dir, ]) preprocess.main(preprocess_args) shutil.copyfile( os.path.join(data_dir, languages[0], "dict.txt"), os.path.join(data_dir, "dict.txt"), )
def preprocess_lm_data(data_dir): preprocess_parser = options.get_preprocessing_parser() preprocess_args = preprocess_parser.parse_args([ '--only-source', '--trainpref', os.path.join(data_dir, 'train.out'), '--validpref', os.path.join(data_dir, 'valid.out'), '--testpref', os.path.join(data_dir, 'test.out'), '--destdir', data_dir, ]) preprocess.main(preprocess_args)
def main(args): os.makedirs(args.destdir, exist_ok=True) def file_name(prefix, lang): fname = prefix if lang is not None: fname += ".{lang}".format(lang=lang) return fname def dest_path(prefix, suffix): return os.path.join(args.destdir, file_name(prefix, suffix)) def dest_prefix(prefix): return os.path.join(args.destdir, prefix) def prepare_label(input_prefix, output_prefix): with open(file_name(input_prefix, args.source_lang), "r", encoding="utf-8") as src_file: with open(dest_path(output_prefix, "char"), "w", encoding="utf-8") as char_file: with open(dest_path(output_prefix, "label"), "w", encoding="utf-8") as label_file: for line in src_file: char_file.write( segmenter_utils.to_character_sequence(line)) label_file.write( segmenter_utils.to_boundary_tag(line) + "\n") def prepare_dataset(): if args.trainpref: prepare_label(args.trainpref, "train") if args.validpref: prepare_label(args.validpref, "valid") if args.testpref: prepare_label(args.testpref, "test") def override_args(): args.source_lang = "char" args.target_lang = "label" args.only_source = False args.joined_dictionary = False if args.trainpref: args.trainpref = dest_prefix("train") if args.validpref: args.validpref = dest_prefix("valid") if args.testpref: args.testpref = dest_prefix("test") prepare_dataset() override_args() preprocess.main(args)
def preprocess_lm_data(data_dir): preprocess_parser = options.get_preprocessing_parser() preprocess_args = preprocess_parser.parse_args([ "--only-source", "--trainpref", os.path.join(data_dir, "train.out"), "--validpref", os.path.join(data_dir, "valid.out"), "--testpref", os.path.join(data_dir, "test.out"), "--destdir", data_dir, ]) preprocess.main(preprocess_args)
def preprocess_translation_data(data_dir, extra_flags=None): preprocess_parser = options.get_preprocessing_parser() preprocess_args = preprocess_parser.parse_args( [ '--source-lang', 'in', '--target-lang', 'out', '--trainpref', os.path.join(data_dir, 'train'), '--validpref', os.path.join(data_dir, 'valid'), '--testpref', os.path.join(data_dir, 'test'), '--thresholdtgt', '0', '--thresholdsrc', '0', '--destdir', data_dir, ] + (extra_flags or []), ) preprocess.main(preprocess_args)
def gen_fairseq_data(source_lang, target_lang, trainpref, validpref, nwordssrc, nwordstgt, destdir): from fairseq import options from fairseq_cli import preprocess parser = options.get_preprocessing_parser() args = parser.parse_args() args.source_lang = source_lang args.target_lang = target_lang args.trainpref = trainpref args.validpref = validpref args.nwordssrc = nwordssrc args.nwordstgt = nwordstgt args.destdir = destdir preprocess.main(args)
def fairseq_preprocess(src_lang, tgt_lang, destdir, traindir=None, validdir=None, testdir=None, nworkers=5): """ Helper function to do pre-processing using fairseq-preprocess """ def preprocessing_done(): if os.path.exists(destdir): # TODO : more extensive checks print("Warning: Check processed dir manually") return True else: return False if not preprocessing_done(): # TODO : to use FseqArgs args = [] args.append(f"--source-lang={src_lang}") #src_dict_path = os.path.join(destdir, f"dict.{src_lang}.txt") #args.append(f"--srcdict={src_dict_path}") args.append(f"--target-lang={tgt_lang}") #tgt_dict_path = os.path.join(destdir, f"dict.{tgt_lang}.txt") #args.append(f"--tgtdict={tgt_dict_path}") if traindir: args.append(f"--trainpref={traindir}/train.tok") if validdir: args.append(f"--validpref={validdir}/valid.tok") if testdir: args.append(f"--testpref={testdir}/test.tok") args.append(f"--destdir={destdir}") args.append(f"--workers={nworkers}") # fairseq preprocessing argument parser parser = options.get_preprocessing_parser() pargs = parser.parse_args(args) preprocess.main(pargs) else: print("Probably, preprocessing is already done. Check dirs.")
def fairseq_preprocess(dataset): dataset_dir = get_dataset_dir(dataset) with lock_directory(dataset_dir): preprocessed_dir = dataset_dir / 'fairseq_preprocessed' with create_directory_or_skip(preprocessed_dir): preprocessing_parser = options.get_preprocessing_parser() preprocess_args = preprocessing_parser.parse_args([ '--source-lang', 'complex', '--target-lang', 'simple', '--trainpref', os.path.join(dataset_dir, f'{dataset}.train'), '--validpref', os.path.join(dataset_dir, f'{dataset}.valid'), '--testpref', os.path.join(dataset_dir, f'{dataset}.test'), '--destdir', str(preprocessed_dir), '--output-format', 'raw', ]) preprocess.main(preprocess_args) return preprocessed_dir
def lm_scoring( preprocess_directory, bpe_status, gen_output, pre_gen, cur_lm_dict, cur_lm_name, cur_language_model, cur_lm_bpe_code, batch_size, lm_score_file, target_lang, source_lang, prefix_len=None, ): if prefix_len is not None: assert (bpe_status == "different" ), "bpe status must be different to use prefix len" if bpe_status == "no bpe": # run lm on output without bpe write_reprocessed( gen_output.no_bpe_source, gen_output.no_bpe_hypo, gen_output.no_bpe_target, pre_gen + "/rescore_data_no_bpe.de", pre_gen + "/rescore_data_no_bpe.en", pre_gen + "/reference_file_no_bpe", ) preprocess_lm_param = [ "--only-source", "--trainpref", pre_gen + "/rescore_data_no_bpe." + target_lang, "--srcdict", cur_lm_dict, "--destdir", preprocess_directory, ] preprocess_parser = options.get_preprocessing_parser() input_args = preprocess_parser.parse_args(preprocess_lm_param) preprocess.main(input_args) eval_lm_param = [ preprocess_directory, "--path", cur_language_model, "--output-word-probs", "--batch-size", str(batch_size), "--max-tokens", "1024", "--sample-break-mode", "eos", "--gen-subset", "train", ] eval_lm_parser = options.get_eval_lm_parser() input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param) with open(lm_score_file, "w") as f: with redirect_stdout(f): eval_lm.main(input_args) elif bpe_status == "shared": preprocess_lm_param = [ "--only-source", "--trainpref", pre_gen + "/rescore_data." + target_lang, "--srcdict", cur_lm_dict, "--destdir", preprocess_directory, ] preprocess_parser = options.get_preprocessing_parser() input_args = preprocess_parser.parse_args(preprocess_lm_param) preprocess.main(input_args) eval_lm_param = [ preprocess_directory, "--path", cur_language_model, "--output-word-probs", "--batch-size", str(batch_size), "--sample-break-mode", "eos", "--gen-subset", "train", ] eval_lm_parser = options.get_eval_lm_parser() input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param) with open(lm_score_file, "w") as f: with redirect_stdout(f): eval_lm.main(input_args) elif bpe_status == "different": rescore_file = pre_gen + "/rescore_data_no_bpe" rescore_bpe = pre_gen + "/rescore_data_new_bpe" rescore_file += "." rescore_bpe += "." write_reprocessed( gen_output.no_bpe_source, gen_output.no_bpe_hypo, gen_output.no_bpe_target, rescore_file + source_lang, rescore_file + target_lang, pre_gen + "/reference_file_no_bpe", bpe_symbol=None, ) # apply LM bpe to nbest list bpe_src_param = [ "-c", cur_lm_bpe_code, "--input", rescore_file + target_lang, "--output", rescore_bpe + target_lang, ] subprocess.call( [ "python", os.path.join(os.path.dirname(__file__), "subword-nmt/subword_nmt/apply_bpe.py"), ] + bpe_src_param, shell=False, ) # uncomment to use fastbpe instead of subword-nmt bpe # bpe_src_param = [rescore_bpe+target_lang, rescore_file+target_lang, cur_lm_bpe_code] # subprocess.call(["/private/home/edunov/fastBPE/fast", "applybpe"] + bpe_src_param, shell=False) preprocess_dir = preprocess_directory preprocess_lm_param = [ "--only-source", "--trainpref", rescore_bpe + target_lang, "--srcdict", cur_lm_dict, "--destdir", preprocess_dir, ] preprocess_parser = options.get_preprocessing_parser() input_args = preprocess_parser.parse_args(preprocess_lm_param) preprocess.main(input_args) eval_lm_param = [ preprocess_dir, "--path", cur_language_model, "--output-word-probs", "--batch-size", str(batch_size), "--max-tokens", "1024", "--sample-break-mode", "eos", "--gen-subset", "train", ] eval_lm_parser = options.get_eval_lm_parser() input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param) with open(lm_score_file, "w") as f: with redirect_stdout(f): eval_lm.main(input_args)
def gen_and_reprocess_nbest(args): if args.score_dict_dir is None: args.score_dict_dir = args.data if args.prefix_len is not None: assert (args.right_to_left1 is False ), "prefix length not compatible with right to left models" assert (args.right_to_left2 is False ), "prefix length not compatible with right to left models" if args.nbest_list is not None: assert args.score_model2 is None if args.backwards1: scorer1_src = args.target_lang scorer1_tgt = args.source_lang else: scorer1_src = args.source_lang scorer1_tgt = args.target_lang store_data = (os.path.join(os.path.dirname(__file__)) + "/rerank_data/" + args.data_dir_name) if not os.path.exists(store_data): os.makedirs(store_data) ( pre_gen, left_to_right_preprocessed_dir, right_to_left_preprocessed_dir, backwards_preprocessed_dir, lm_preprocessed_dir, ) = rerank_utils.get_directories( args.data_dir_name, args.num_rescore, args.gen_subset, args.gen_model_name, args.shard_id, args.num_shards, args.sampling, args.prefix_len, args.target_prefix_frac, args.source_prefix_frac, ) assert not (args.right_to_left1 and args.backwards1), "backwards right to left not supported" assert not (args.right_to_left2 and args.backwards2), "backwards right to left not supported" assert not (args.prefix_len is not None and args.target_prefix_frac is not None ), "target prefix frac and target prefix len incompatible" # make directory to store generation results if not os.path.exists(pre_gen): os.makedirs(pre_gen) rerank1_is_gen = (args.gen_model == args.score_model1 and args.source_prefix_frac is None) rerank2_is_gen = (args.gen_model == args.score_model2 and args.source_prefix_frac is None) if args.nbest_list is not None: rerank2_is_gen = True # make directories to store preprossed nbest list for reranking if not os.path.exists(left_to_right_preprocessed_dir): os.makedirs(left_to_right_preprocessed_dir) if not os.path.exists(right_to_left_preprocessed_dir): os.makedirs(right_to_left_preprocessed_dir) if not os.path.exists(lm_preprocessed_dir): os.makedirs(lm_preprocessed_dir) if not os.path.exists(backwards_preprocessed_dir): os.makedirs(backwards_preprocessed_dir) score1_file = rerank_utils.rescore_file_name( pre_gen, args.prefix_len, args.model1_name, target_prefix_frac=args.target_prefix_frac, source_prefix_frac=args.source_prefix_frac, backwards=args.backwards1, ) if args.score_model2 is not None: score2_file = rerank_utils.rescore_file_name( pre_gen, args.prefix_len, args.model2_name, target_prefix_frac=args.target_prefix_frac, source_prefix_frac=args.source_prefix_frac, backwards=args.backwards2, ) predictions_bpe_file = pre_gen + "/generate_output_bpe.txt" using_nbest = args.nbest_list is not None if using_nbest: print("Using predefined n-best list from interactive.py") predictions_bpe_file = args.nbest_list else: if not os.path.isfile(predictions_bpe_file): print( "STEP 1: generate predictions using the p(T|S) model with bpe") print(args.data) param1 = [ args.data, "--path", args.gen_model, "--shard-id", str(args.shard_id), "--num-shards", str(args.num_shards), "--nbest", str(args.num_rescore), "--batch-size", str(args.batch_size), "--beam", str(args.num_rescore), "--batch-size", str(args.num_rescore), "--gen-subset", args.gen_subset, "--source-lang", args.source_lang, "--target-lang", args.target_lang, ] if args.sampling: param1 += ["--sampling"] gen_parser = options.get_generation_parser() input_args = options.parse_args_and_arch(gen_parser, param1) print(input_args) with open(predictions_bpe_file, "w") as f: with redirect_stdout(f): generate.main(input_args) gen_output = rerank_utils.BitextOutputFromGen( predictions_bpe_file, bpe_symbol=args.post_process, nbest=using_nbest, prefix_len=args.prefix_len, target_prefix_frac=args.target_prefix_frac, ) if args.diff_bpe: rerank_utils.write_reprocessed( gen_output.no_bpe_source, gen_output.no_bpe_hypo, gen_output.no_bpe_target, pre_gen + "/source_gen_bpe." + args.source_lang, pre_gen + "/target_gen_bpe." + args.target_lang, pre_gen + "/reference_gen_bpe." + args.target_lang, ) bitext_bpe = args.rescore_bpe_code bpe_src_param = [ "-c", bitext_bpe, "--input", pre_gen + "/source_gen_bpe." + args.source_lang, "--output", pre_gen + "/rescore_data." + args.source_lang, ] bpe_tgt_param = [ "-c", bitext_bpe, "--input", pre_gen + "/target_gen_bpe." + args.target_lang, "--output", pre_gen + "/rescore_data." + args.target_lang, ] subprocess.call( [ "python", os.path.join(os.path.dirname(__file__), "subword-nmt/subword_nmt/apply_bpe.py"), ] + bpe_src_param, shell=False, ) subprocess.call( [ "python", os.path.join(os.path.dirname(__file__), "subword-nmt/subword_nmt/apply_bpe.py"), ] + bpe_tgt_param, shell=False, ) if (not os.path.isfile(score1_file) and not rerank1_is_gen) or (args.score_model2 is not None and not os.path.isfile(score2_file) and not rerank2_is_gen): print( "STEP 2: process the output of generate.py so we have clean text files with the translations" ) rescore_file = "/rescore_data" if args.prefix_len is not None: prefix_len_rescore_file = rescore_file + "prefix" + str( args.prefix_len) if args.target_prefix_frac is not None: target_prefix_frac_rescore_file = (rescore_file + "target_prefix_frac" + str(args.target_prefix_frac)) if args.source_prefix_frac is not None: source_prefix_frac_rescore_file = (rescore_file + "source_prefix_frac" + str(args.source_prefix_frac)) if not args.right_to_left1 or not args.right_to_left2: if not args.diff_bpe: rerank_utils.write_reprocessed( gen_output.source, gen_output.hypo, gen_output.target, pre_gen + rescore_file + "." + args.source_lang, pre_gen + rescore_file + "." + args.target_lang, pre_gen + "/reference_file", bpe_symbol=args.post_process, ) if args.prefix_len is not None: bw_rescore_file = prefix_len_rescore_file rerank_utils.write_reprocessed( gen_output.source, gen_output.hypo, gen_output.target, pre_gen + prefix_len_rescore_file + "." + args.source_lang, pre_gen + prefix_len_rescore_file + "." + args.target_lang, pre_gen + "/reference_file", prefix_len=args.prefix_len, bpe_symbol=args.post_process, ) elif args.target_prefix_frac is not None: bw_rescore_file = target_prefix_frac_rescore_file rerank_utils.write_reprocessed( gen_output.source, gen_output.hypo, gen_output.target, pre_gen + target_prefix_frac_rescore_file + "." + args.source_lang, pre_gen + target_prefix_frac_rescore_file + "." + args.target_lang, pre_gen + "/reference_file", bpe_symbol=args.post_process, target_prefix_frac=args.target_prefix_frac, ) else: bw_rescore_file = rescore_file if args.source_prefix_frac is not None: fw_rescore_file = source_prefix_frac_rescore_file rerank_utils.write_reprocessed( gen_output.source, gen_output.hypo, gen_output.target, pre_gen + source_prefix_frac_rescore_file + "." + args.source_lang, pre_gen + source_prefix_frac_rescore_file + "." + args.target_lang, pre_gen + "/reference_file", bpe_symbol=args.post_process, source_prefix_frac=args.source_prefix_frac, ) else: fw_rescore_file = rescore_file if args.right_to_left1 or args.right_to_left2: rerank_utils.write_reprocessed( gen_output.source, gen_output.hypo, gen_output.target, pre_gen + "/right_to_left_rescore_data." + args.source_lang, pre_gen + "/right_to_left_rescore_data." + args.target_lang, pre_gen + "/right_to_left_reference_file", right_to_left=True, bpe_symbol=args.post_process, ) print("STEP 3: binarize the translations") if (not args.right_to_left1 or args.score_model2 is not None and not args.right_to_left2 or not rerank1_is_gen): if args.backwards1 or args.backwards2: if args.backwards_score_dict_dir is not None: bw_dict = args.backwards_score_dict_dir else: bw_dict = args.score_dict_dir bw_preprocess_param = [ "--source-lang", scorer1_src, "--target-lang", scorer1_tgt, "--trainpref", pre_gen + bw_rescore_file, "--srcdict", bw_dict + "/dict." + scorer1_src + ".txt", "--tgtdict", bw_dict + "/dict." + scorer1_tgt + ".txt", "--destdir", backwards_preprocessed_dir, ] preprocess_parser = options.get_preprocessing_parser() input_args = preprocess_parser.parse_args(bw_preprocess_param) preprocess.main(input_args) preprocess_param = [ "--source-lang", scorer1_src, "--target-lang", scorer1_tgt, "--trainpref", pre_gen + fw_rescore_file, "--srcdict", args.score_dict_dir + "/dict." + scorer1_src + ".txt", "--tgtdict", args.score_dict_dir + "/dict." + scorer1_tgt + ".txt", "--destdir", left_to_right_preprocessed_dir, ] preprocess_parser = options.get_preprocessing_parser() input_args = preprocess_parser.parse_args(preprocess_param) preprocess.main(input_args) if args.right_to_left1 or args.right_to_left2: preprocess_param = [ "--source-lang", scorer1_src, "--target-lang", scorer1_tgt, "--trainpref", pre_gen + "/right_to_left_rescore_data", "--srcdict", args.score_dict_dir + "/dict." + scorer1_src + ".txt", "--tgtdict", args.score_dict_dir + "/dict." + scorer1_tgt + ".txt", "--destdir", right_to_left_preprocessed_dir, ] preprocess_parser = options.get_preprocessing_parser() input_args = preprocess_parser.parse_args(preprocess_param) preprocess.main(input_args) return gen_output