def cli_main():
    """Entry point: fairseq preprocessing CLI extended with label-schema flags."""
    parser = options.get_preprocessing_parser()
    # All three extra options are plain optional strings defaulting to None.
    for flag in ("--label_schema", "--nonterm_suffix", "--term_suffix"):
        parser.add_argument(flag, type=str, default=None)
    main(parser.parse_args())
def preprocess_summarization_data(data_dir, extra_flags=None):
    """Binarize summarization data (in -> out) in *data_dir*, sharing one dictionary.

    *extra_flags* is an optional list of additional CLI flags appended verbatim.
    """
    def split_path(split):
        return os.path.join(data_dir, split)

    argv = [
        "--source-lang", "in",
        "--target-lang", "out",
        "--trainpref", split_path("train"),
        "--validpref", split_path("valid"),
        "--testpref", split_path("test"),
        "--thresholdtgt", "0",
        "--thresholdsrc", "0",
        "--joined-dictionary",
        "--destdir", data_dir,
    ]
    if extra_flags:
        argv.extend(extra_flags)
    parser = options.get_preprocessing_parser()
    preprocess.main(parser.parse_args(argv))
def cli_main():
    """Entry point: preprocessing CLI with an optional BERT-dictionary switch."""
    parser = options.get_preprocessing_parser()
    parser.add_argument('--bert', action='store_true',
                        help='Use bert dictionary')
    parsed = parser.parse_args()
    main(parsed)
def preprocess_lm_data(data_dir, languages=None):
    """Binarize monolingual LM data ({train,valid,test}.out).

    Without *languages*, *data_dir* itself is binarized in place. With
    *languages*, each per-language subdirectory is binarized separately and the
    first language's dictionary is copied up to *data_dir* as the shared dict.
    """
    parser = options.get_preprocessing_parser()

    def _binarize_dir(directory):
        # Binarize {train,valid,test}.out found in *directory*, in place.
        parsed = parser.parse_args([
            "--only-source",
            "--trainpref", os.path.join(directory, "train.out"),
            "--validpref", os.path.join(directory, "valid.out"),
            "--testpref", os.path.join(directory, "test.out"),
            "--destdir", directory,
        ])
        preprocess.main(parsed)

    if languages is None:
        _binarize_dir(data_dir)
        return

    for lang in languages:
        lang_dir = os.path.join(data_dir, lang)
        assert os.path.exists(lang_dir)
        _binarize_dir(lang_dir)
    # Promote the first language's dictionary as the shared top-level dict.
    shutil.copyfile(
        os.path.join(data_dir, languages[0], "dict.txt"),
        os.path.join(data_dir, "dict.txt"),
    )
def preprocess_lm_data(data_dir):
    """Binarize monolingual LM data ({train,valid,test}.out) into *data_dir*."""
    parser = options.get_preprocessing_parser()
    argv = ['--only-source']
    for flag, split in (('--trainpref', 'train'),
                        ('--validpref', 'valid'),
                        ('--testpref', 'test')):
        argv += [flag, os.path.join(data_dir, split + '.out')]
    argv += ['--destdir', data_dir]
    preprocess.main(parser.parse_args(argv))
def __init__(self, pretrain="auxiliary_data/PhoBERT_base_fairseq"):
    """Load a pretrained PhoBERT checkpoint and attach its fastBPE encoder."""
    self.phoBERT = RobertaModel.from_pretrained(
        pretrain, checkpoint_file='model.pt')
    # Inference only: disable dropout.
    self.phoBERT.eval()
    parser = options.get_preprocessing_parser()
    parser.add_argument('--bpe-codes', type=str,
                        help='path to fastBPE BPE',
                        default=pretrain + "/bpe.codes")
    # parse_known_args so unrelated flags on the host script's argv are ignored.
    args, unknown = parser.parse_known_args()
    # Incorporate the BPE encoder into PhoBERT.
    self.phoBERT.bpe = fastBPE(args)
def cli_main():
    """Entry point: preprocessing CLI with optional per-example feature files.

    Each file given via --extra-features must be line-aligned with the src/tgt
    files and carries extra labels (domain, speaker, ...) per example.
    """
    parser = options.get_preprocessing_parser()  # Modified
    parser.add_argument(
        '--extra-features', nargs='*',
        help="List of files which have the same number of lines as the src "
             "and the tgt files. Each file contains extra labels including "
             "the information of the example's domains, speakers, etc.")
    args = parser.parse_args()
    # BUG FIX: nargs='*' produces a list when the flag is given, so normalize
    # the missing-flag default to an empty list (was an empty dict) to keep
    # args.extra_features a consistent type for downstream code.
    if not args.extra_features:
        args.extra_features = []
    main(args)
def cli_main():
    """Entry point: preprocessing CLI with seq-tagging data and sentencepiece options."""
    parser = options.get_preprocessing_parser()
    parser.add_argument('--seqtag-data-dir', default=None, type=str,
                        help='directory for IOB formatted data')
    # FIX: corrected help-text typo ("directorty" -> "directory").
    parser.add_argument('--sentencepiece-model', type=str, default=None,
                        help='directory for the sentencepiece model')
    args = parser.parse_args()
    main(args)
def preprocess_lm_data(data_dir):
    """Binarize monolingual LM data ({train,valid,test}.out) found in *data_dir*."""
    def pref(split):
        return os.path.join(data_dir, split + ".out")

    parsed = options.get_preprocessing_parser().parse_args([
        "--only-source",
        "--trainpref", pref("train"),
        "--validpref", pref("valid"),
        "--testpref", pref("test"),
        "--destdir", data_dir,
    ])
    preprocess.main(parsed)
def preprocess_translation_data(data_dir, extra_flags=None):
    """Binarize translation data (in -> out) living in *data_dir*.

    *extra_flags* is an optional list of additional CLI flags appended verbatim.
    """
    argv = ['--source-lang', 'in', '--target-lang', 'out']
    for flag, split in (('--trainpref', 'train'),
                        ('--validpref', 'valid'),
                        ('--testpref', 'test')):
        argv += [flag, os.path.join(data_dir, split)]
    argv += ['--thresholdtgt', '0', '--thresholdsrc', '0',
             '--destdir', data_dir]
    if extra_flags:
        argv.extend(extra_flags)
    parser = options.get_preprocessing_parser()
    preprocess.main(parser.parse_args(argv))
def gen_fairseq_data(source_lang, target_lang, trainpref, validpref,
                     nwordssrc, nwordstgt, destdir):
    """Binarize a parallel corpus with fairseq-preprocess.

    All configuration comes from this function's parameters; the resulting
    binarized data and dictionaries are written to *destdir*.
    """
    from fairseq import options
    from fairseq_cli import preprocess

    parser = options.get_preprocessing_parser()
    # BUG FIX: parse an empty argv instead of the host process's sys.argv.
    # parse_args() with no argument would try to consume the caller's own
    # command-line flags and crash (or silently misconfigure) when this is
    # invoked from a script with its own CLI. Every field is set explicitly
    # below, so defaults-only parsing is what was intended.
    args = parser.parse_args([])
    args.source_lang = source_lang
    args.target_lang = target_lang
    args.trainpref = trainpref
    args.validpref = validpref
    args.nwordssrc = nwordssrc
    args.nwordstgt = nwordstgt
    args.destdir = destdir
    preprocess.main(args)
def _init_model(pretrain_model):
    """Load a pretrained BERTweet checkpoint and attach its fastBPE encoder.

    *pretrain_model* is the checkpoint directory containing model.pt and
    bpe.codes. Returns the ready-to-use (eval-mode) model.
    """
    bpe_path = os.path.join(pretrain_model, "bpe.codes")
    BERTweet = RobertaModel.from_pretrained(pretrain_model,
                                            checkpoint_file='model.pt')
    BERTweet.eval()  # disable dropout (or leave in train mode to finetune)
    # Incorporate the BPE encoder into BERTweet-base
    parser = options.get_preprocessing_parser()
    parser.add_argument('--bpe-codes', type=str,
                        help='path to fastBPE BPE', default=bpe_path)
    # BUG FIX: parse an empty argv instead of sys.argv. parse_args() with no
    # argument would choke on the host script's own CLI flags; the BPE path is
    # fully determined by *pretrain_model*, so defaults-only parsing suffices.
    args = parser.parse_args([])
    BERTweet.bpe = fastBPE(args)  # Incorporate the BPE encoder into BERTweet
    return BERTweet
def cli_main():
    """Entry point: preprocessing CLI extended with raw-conversion options."""
    parser = options.get_preprocessing_parser()
    group = parser.add_argument_group('Preprocessing')
    # Boolean switches controlling the raw conversion stage.
    for flag, text in (("--convert_raw", "convert_raw"),
                       ("--convert_raw_only", "convert_raw"),
                       ("--convert_with_bpe", "convert_with_bpe")):
        group.add_argument(flag, action="store_true", help=text)
    group.add_argument('--bpe_code', metavar='FILE', help='bpe_code')
    # Negative switches disabling individual tree-processing steps.
    for flag in ("--no_remove_root", "--no_take_pos_tag", "--no_take_nodes",
                 "--no_reverse_node", "--no_collapse"):
        group.add_argument(flag, action="store_true", help=flag.lstrip('-'))
    # Worker counts for the raw-conversion and evaluation stages.
    for flag in ("--raw_workers", "--eval_workers"):
        group.add_argument(flag, metavar="N", default=0, type=int,
                           help="number of parallel workers")
    main(parser.parse_args())
def cli_main():
    """Entry point: parse fairseq preprocessing args from the command line and run main().

    CLEANUP: removed a dead bare-string block of hard-coded personal debug
    paths (trainpref/validpref/testpref/destdir overrides); pass the
    corresponding flags on the command line instead.
    """
    parser = options.get_preprocessing_parser()
    args = parser.parse_args()
    main(args)
def fairseq_preprocess(src_lang, tgt_lang, destdir,
                       traindir=None, validdir=None, testdir=None, nworkers=5):
    """ Helper function to do pre-processing using fairseq-preprocess """
    # Guard: an existing destdir is taken as "already processed".
    # TODO : more extensive checks
    if os.path.exists(destdir):
        print("Warning: Check processed dir manually")
        print("Probably, preprocessing is already done. Check dirs.")
        return

    # TODO : to use FseqArgs
    argv = [f"--source-lang={src_lang}", f"--target-lang={tgt_lang}"]
    # Only add split prefixes for the directories that were supplied.
    for directory, flag, fname in ((traindir, "--trainpref", "train.tok"),
                                   (validdir, "--validpref", "valid.tok"),
                                   (testdir, "--testpref", "test.tok")):
        if directory:
            argv.append(f"{flag}={directory}/{fname}")
    argv.append(f"--destdir={destdir}")
    argv.append(f"--workers={nworkers}")

    # fairseq preprocessing argument parser
    parser = options.get_preprocessing_parser()
    preprocess.main(parser.parse_args(argv))
def fairseq_preprocess(dataset):
    """Binarize *dataset* (complex -> simple, raw output format).

    Work happens inside the dataset directory under a lock; the inner context
    manager skips the binarization when the output directory already exists.
    Returns the path of the preprocessed directory.
    """
    dataset_dir = get_dataset_dir(dataset)
    with lock_directory(dataset_dir):
        preprocessed_dir = dataset_dir / 'fairseq_preprocessed'
        with create_directory_or_skip(preprocessed_dir):
            argv = ['--source-lang', 'complex', '--target-lang', 'simple']
            for flag, split in (('--trainpref', 'train'),
                                ('--validpref', 'valid'),
                                ('--testpref', 'test')):
                argv += [flag, os.path.join(dataset_dir, f'{dataset}.{split}')]
            argv += ['--destdir', str(preprocessed_dir),
                     '--output-format', 'raw']
            parser = options.get_preprocessing_parser()
            preprocess.main(parser.parse_args(argv))
    return preprocessed_dir
def cli_main():
    """Entry point: parse fairseq preprocessing arguments and hand off to main()."""
    main(options.get_preprocessing_parser().parse_args())
# --- Script-level setup: PhoBERT (transformers format) plus Vietnamese NLP helpers ---
from numpy import dot
from numpy.linalg import norm
from fairseq.data import Dictionary

# Model configuration for the transformers-format PhoBERT checkpoint.
config = RobertaConfig.from_pretrained("PhoBERT_base_transformers/config.json")

# Load BPE encoder
# Stand-alone argparse parser carrying the BPE-codes path for the transformers checkpoint.
parser_w2v = argparse.ArgumentParser()
parser_w2v.add_argument('--bpe-codes', default="PhoBERT_base_transformers/bpe.codes",
                        required=False, type=str, help='path to fastBPE BPE')

from fairseq import options

# fairseq preprocessing parser, here used only as a holder for the
# fairseq-format BPE-codes path (neither parser is parsed in this snippet).
parser_mask = options.get_preprocessing_parser()
parser_mask.add_argument('--bpe-codes', type=str, help='',
                         default="PhoBERT_base_fairseq/bpe.codes")

from vncorenlp import VnCoreNLP

# Vietnamese word segmenter (word-segmentation annotator only, 500 MB JVM heap).
rdrsegmenter = VnCoreNLP("VnCoreNLP-1.1.1.jar", annotators="wseg",
                         max_heap_size='-Xmx500m')

# fairseq dictionary built from the transformers checkpoint's vocabulary file.
vocab = Dictionary()
vocab.add_from_file("PhoBERT_base_transformers/dict.txt")

from transformers import RobertaModel

# PhoBERT weights loaded through transformers (presumably for word-vector
# extraction alongside parser_w2v above -- confirm against later usage).
phobert_w2v = RobertaModel.from_pretrained(
    "PhoBERT_base_transformers/model.bin", config=config)
import fasttext model = fasttext.load_model("cc.vi.300.bin") import torch from collections import Counter from typing import List # Load PhoBERT-base in fairseq from fairseq.models.roberta import RobertaModel phobert = RobertaModel.from_pretrained('PhoBERT_base_fairseq', checkpoint_file='model.pt') phobert.eval() # disable dropout (or leave in train mode to finetune) # Incorporate the BPE encoder into PhoBERT-base from fairseq.data.encoders.fastbpe import fastBPE from fairseq import options parser = options.get_preprocessing_parser() parser.add_argument('--bpe-codes', type=str, help='path to fastBPE BPE', default="PhoBERT_base_fairseq/bpe.codes") args = parser.parse_args() phobert.bpe = fastBPE(args) def align_bpe_to_words(roberta, bpe_tokens: torch.LongTensor, other_tokens: List[str]): """ Helper to align GPT-2 BPE to other tokenization formats (e.g., spaCy). Args: roberta (RobertaHubInterface): RoBERTa instance bpe_tokens (torch.LongTensor): GPT-2 BPE tokens of shape `(T_bpe)`
def lm_scoring(preprocess_directory, bpe_status, gen_output, pre_gen,
               cur_lm_dict, cur_lm_name, cur_language_model, cur_lm_bpe_code,
               batch_size, lm_score_file, target_lang, source_lang,
               prefix_len=None):
    """Score generated hypotheses with a language model.

    Writes the hypotheses to disk, binarizes them against the LM dictionary
    with fairseq-preprocess, then runs eval_lm and captures its stdout
    (per-word probabilities) into *lm_score_file*.

    *bpe_status* selects how BPE is reconciled between generator and LM:
    "no bpe" (score de-BPE'd text), "shared" (same BPE codes), or
    "different" (re-apply the LM's own BPE codes first).
    """
    if prefix_len is not None:
        assert bpe_status == "different", "bpe status must be different to use prefix len"
    if bpe_status == "no bpe":
        # run lm on output without bpe
        # NOTE(review): the ".de"/".en" suffixes below are hard-coded rather
        # than derived from source_lang/target_lang -- confirm intended.
        write_reprocessed(gen_output.no_bpe_source, gen_output.no_bpe_hypo,
                          gen_output.no_bpe_target,
                          pre_gen+"/rescore_data_no_bpe.de",
                          pre_gen+"/rescore_data_no_bpe.en",
                          pre_gen+"/reference_file_no_bpe")
        # Binarize the target-side hypotheses against the LM's dictionary.
        preprocess_lm_param = ["--only-source",
                               "--trainpref", pre_gen+"/rescore_data_no_bpe."+target_lang,
                               "--srcdict", cur_lm_dict,
                               "--destdir", preprocess_directory]
        preprocess_parser = options.get_preprocessing_parser()
        input_args = preprocess_parser.parse_args(preprocess_lm_param)
        preprocess.main(input_args)
        # "--sample-break-mode eos" scores one sentence per sample;
        # "--gen-subset train" points at the data just binarized above.
        eval_lm_param = [preprocess_directory,
                         "--path", cur_language_model,
                         "--output-word-probs",
                         "--batch-size", str(batch_size),
                         "--max-tokens", "1024",
                         "--sample-break-mode", "eos",
                         "--gen-subset", "train"]
        eval_lm_parser = options.get_eval_lm_parser()
        input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param)
        # Capture eval_lm's stdout (the word-level scores) into lm_score_file.
        with open(lm_score_file, 'w') as f:
            with redirect_stdout(f):
                eval_lm.main(input_args)
    elif bpe_status == "shared":
        # Generator and LM share BPE codes: binarize the rescore data directly.
        preprocess_lm_param = ["--only-source",
                               "--trainpref", pre_gen+"/rescore_data."+target_lang,
                               "--srcdict", cur_lm_dict,
                               "--destdir", preprocess_directory]
        preprocess_parser = options.get_preprocessing_parser()
        input_args = preprocess_parser.parse_args(preprocess_lm_param)
        preprocess.main(input_args)
        eval_lm_param = [preprocess_directory,
                         "--path", cur_language_model,
                         "--output-word-probs",
                         "--batch-size", str(batch_size),
                         "--sample-break-mode", "eos",
                         "--gen-subset", "train"]
        eval_lm_parser = options.get_eval_lm_parser()
        input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param)
        with open(lm_score_file, 'w') as f:
            with redirect_stdout(f):
                eval_lm.main(input_args)
    elif bpe_status == "different":
        # De-BPE the generator output, then re-encode it with the LM's codes.
        rescore_file = pre_gen+"/rescore_data_no_bpe"
        rescore_bpe = pre_gen+"/rescore_data_new_bpe"
        rescore_file += "."
        rescore_bpe += "."
        write_reprocessed(gen_output.no_bpe_source, gen_output.no_bpe_hypo,
                          gen_output.no_bpe_target,
                          rescore_file+source_lang,
                          rescore_file+target_lang,
                          pre_gen+"/reference_file_no_bpe",
                          bpe_symbol=None)
        # apply LM bpe to nbest list
        bpe_src_param = ["-c", cur_lm_bpe_code,
                         "--input", rescore_file+target_lang,
                         "--output", rescore_bpe+target_lang]
        subprocess.call(["python",
                         os.path.join(os.path.dirname(__file__),
                                      "subword-nmt/subword_nmt/apply_bpe.py")]
                        + bpe_src_param,
                        shell=False)
        # uncomment to use fastbpe instead of subword-nmt bpe
        # bpe_src_param = [rescore_bpe+target_lang, rescore_file+target_lang, cur_lm_bpe_code]
        # subprocess.call(["/private/home/edunov/fastBPE/fast", "applybpe"] + bpe_src_param, shell=False)
        preprocess_dir = preprocess_directory
        # Binarize the re-BPE'd hypotheses against the LM dictionary.
        preprocess_lm_param = ["--only-source",
                               "--trainpref", rescore_bpe+target_lang,
                               "--srcdict", cur_lm_dict,
                               "--destdir", preprocess_dir]
        preprocess_parser = options.get_preprocessing_parser()
        input_args = preprocess_parser.parse_args(preprocess_lm_param)
        preprocess.main(input_args)
        eval_lm_param = [preprocess_dir,
                         "--path", cur_language_model,
                         "--output-word-probs",
                         "--batch-size", str(batch_size),
                         "--max-tokens", "1024",
                         "--sample-break-mode", "eos",
                         "--gen-subset", "train"]
        eval_lm_parser = options.get_eval_lm_parser()
        input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param)
        with open(lm_score_file, 'w') as f:
            with redirect_stdout(f):
                eval_lm.main(input_args)
def gen_and_reprocess_nbest(args):
    """Generate an n-best list with the forward model and binarize it for rescoring.

    Runs generate.py (unless a pre-made n-best list is supplied via
    args.nbest_list), rewrites the output into clean text files (optionally
    re-applying a different BPE), and binarizes left-to-right /
    right-to-left / backwards variants with fairseq-preprocess.
    Returns the parsed generation output object.
    """
    if args.score_dict_dir is None:
        args.score_dict_dir = args.data
    if args.prefix_len is not None:
        assert (args.right_to_left1 is False
                ), "prefix length not compatible with right to left models"
        assert (args.right_to_left2 is False
                ), "prefix length not compatible with right to left models"
    if args.nbest_list is not None:
        assert args.score_model2 is None

    # Scorer-1 direction: a backwards model scores target -> source.
    if args.backwards1:
        scorer1_src = args.target_lang
        scorer1_tgt = args.source_lang
    else:
        scorer1_src = args.source_lang
        scorer1_tgt = args.target_lang

    store_data = (os.path.join(os.path.dirname(__file__)) + "/rerank_data/"
                  + args.data_dir_name)
    if not os.path.exists(store_data):
        os.makedirs(store_data)

    (
        pre_gen,
        left_to_right_preprocessed_dir,
        right_to_left_preprocessed_dir,
        backwards_preprocessed_dir,
        lm_preprocessed_dir,
    ) = rerank_utils.get_directories(
        args.data_dir_name,
        args.num_rescore,
        args.gen_subset,
        args.gen_model_name,
        args.shard_id,
        args.num_shards,
        args.sampling,
        args.prefix_len,
        args.target_prefix_frac,
        args.source_prefix_frac,
    )

    assert not (args.right_to_left1
                and args.backwards1), "backwards right to left not supported"
    assert not (args.right_to_left2
                and args.backwards2), "backwards right to left not supported"
    assert not (args.prefix_len is not None
                and args.target_prefix_frac is not None
                ), "target prefix frac and target prefix len incompatible"

    # make directory to store generation results
    if not os.path.exists(pre_gen):
        os.makedirs(pre_gen)

    # When a rescoring model is the generation model itself (and no source
    # prefix is used), its scores can be reused without rescoring.
    rerank1_is_gen = (args.gen_model == args.score_model1
                      and args.source_prefix_frac is None)
    rerank2_is_gen = (args.gen_model == args.score_model2
                      and args.source_prefix_frac is None)

    if args.nbest_list is not None:
        rerank2_is_gen = True

    # make directories to store preprossed nbest list for reranking
    if not os.path.exists(left_to_right_preprocessed_dir):
        os.makedirs(left_to_right_preprocessed_dir)
    if not os.path.exists(right_to_left_preprocessed_dir):
        os.makedirs(right_to_left_preprocessed_dir)
    if not os.path.exists(lm_preprocessed_dir):
        os.makedirs(lm_preprocessed_dir)
    if not os.path.exists(backwards_preprocessed_dir):
        os.makedirs(backwards_preprocessed_dir)

    score1_file = rerank_utils.rescore_file_name(
        pre_gen,
        args.prefix_len,
        args.model1_name,
        target_prefix_frac=args.target_prefix_frac,
        source_prefix_frac=args.source_prefix_frac,
        backwards=args.backwards1,
    )
    if args.score_model2 is not None:
        score2_file = rerank_utils.rescore_file_name(
            pre_gen,
            args.prefix_len,
            args.model2_name,
            target_prefix_frac=args.target_prefix_frac,
            source_prefix_frac=args.source_prefix_frac,
            backwards=args.backwards2,
        )

    predictions_bpe_file = pre_gen + "/generate_output_bpe.txt"

    using_nbest = args.nbest_list is not None

    if using_nbest:
        print("Using predefined n-best list from interactive.py")
        predictions_bpe_file = args.nbest_list
    else:
        # Reuse an existing generation output if present.
        if not os.path.isfile(predictions_bpe_file):
            print(
                "STEP 1: generate predictions using the p(T|S) model with bpe")
            print(args.data)
            # NOTE(review): "--batch-size" appears twice below; argparse keeps
            # the last value, so num_rescore overrides batch_size -- confirm
            # which value is intended.
            param1 = [
                args.data,
                "--path", args.gen_model,
                "--shard-id", str(args.shard_id),
                "--num-shards", str(args.num_shards),
                "--nbest", str(args.num_rescore),
                "--batch-size", str(args.batch_size),
                "--beam", str(args.num_rescore),
                "--batch-size", str(args.num_rescore),
                "--gen-subset", args.gen_subset,
                "--source-lang", args.source_lang,
                "--target-lang", args.target_lang,
            ]
            if args.sampling:
                param1 += ["--sampling"]

            gen_parser = options.get_generation_parser()
            input_args = options.parse_args_and_arch(gen_parser, param1)

            print(input_args)
            # Capture generate.py's stdout into the predictions file.
            with open(predictions_bpe_file, "w") as f:
                with redirect_stdout(f):
                    generate.main(input_args)

    gen_output = rerank_utils.BitextOutputFromGen(
        predictions_bpe_file,
        bpe_symbol=args.post_process,
        nbest=using_nbest,
        prefix_len=args.prefix_len,
        target_prefix_frac=args.target_prefix_frac,
    )

    if args.diff_bpe:
        # Rescoring models use a different BPE: dump de-BPE'd text, then
        # re-apply the rescoring BPE codes with subword-nmt.
        rerank_utils.write_reprocessed(
            gen_output.no_bpe_source,
            gen_output.no_bpe_hypo,
            gen_output.no_bpe_target,
            pre_gen + "/source_gen_bpe." + args.source_lang,
            pre_gen + "/target_gen_bpe." + args.target_lang,
            pre_gen + "/reference_gen_bpe." + args.target_lang,
        )
        bitext_bpe = args.rescore_bpe_code
        bpe_src_param = [
            "-c", bitext_bpe,
            "--input", pre_gen + "/source_gen_bpe." + args.source_lang,
            "--output", pre_gen + "/rescore_data." + args.source_lang,
        ]
        bpe_tgt_param = [
            "-c", bitext_bpe,
            "--input", pre_gen + "/target_gen_bpe." + args.target_lang,
            "--output", pre_gen + "/rescore_data." + args.target_lang,
        ]
        subprocess.call(
            [
                "python",
                os.path.join(os.path.dirname(__file__),
                             "subword-nmt/subword_nmt/apply_bpe.py"),
            ] + bpe_src_param,
            shell=False,
        )
        subprocess.call(
            [
                "python",
                os.path.join(os.path.dirname(__file__),
                             "subword-nmt/subword_nmt/apply_bpe.py"),
            ] + bpe_tgt_param,
            shell=False,
        )

    # Only rewrite/binarize when at least one rescoring model still needs scores.
    if (not os.path.isfile(score1_file) and not rerank1_is_gen) or (
            args.score_model2 is not None and not os.path.isfile(score2_file)
            and not rerank2_is_gen):
        print(
            "STEP 2: process the output of generate.py so we have clean text files with the translations"
        )

        rescore_file = "/rescore_data"
        if args.prefix_len is not None:
            prefix_len_rescore_file = rescore_file + "prefix" + str(
                args.prefix_len)
        if args.target_prefix_frac is not None:
            target_prefix_frac_rescore_file = (rescore_file
                                               + "target_prefix_frac"
                                               + str(args.target_prefix_frac))
        if args.source_prefix_frac is not None:
            source_prefix_frac_rescore_file = (rescore_file
                                               + "source_prefix_frac"
                                               + str(args.source_prefix_frac))

        if not args.right_to_left1 or not args.right_to_left2:
            if not args.diff_bpe:
                rerank_utils.write_reprocessed(
                    gen_output.source,
                    gen_output.hypo,
                    gen_output.target,
                    pre_gen + rescore_file + "." + args.source_lang,
                    pre_gen + rescore_file + "." + args.target_lang,
                    pre_gen + "/reference_file",
                    bpe_symbol=args.post_process,
                )
            # Pick the backward-scorer's input file: prefix-len variant takes
            # precedence over target-prefix-frac, else the plain rescore data.
            if args.prefix_len is not None:
                bw_rescore_file = prefix_len_rescore_file
                rerank_utils.write_reprocessed(
                    gen_output.source,
                    gen_output.hypo,
                    gen_output.target,
                    pre_gen + prefix_len_rescore_file + "." + args.source_lang,
                    pre_gen + prefix_len_rescore_file + "." + args.target_lang,
                    pre_gen + "/reference_file",
                    prefix_len=args.prefix_len,
                    bpe_symbol=args.post_process,
                )
            elif args.target_prefix_frac is not None:
                bw_rescore_file = target_prefix_frac_rescore_file
                rerank_utils.write_reprocessed(
                    gen_output.source,
                    gen_output.hypo,
                    gen_output.target,
                    pre_gen + target_prefix_frac_rescore_file + "."
                    + args.source_lang,
                    pre_gen + target_prefix_frac_rescore_file + "."
                    + args.target_lang,
                    pre_gen + "/reference_file",
                    bpe_symbol=args.post_process,
                    target_prefix_frac=args.target_prefix_frac,
                )
            else:
                bw_rescore_file = rescore_file

            # Forward-scorer's input file, optionally with a source prefix.
            if args.source_prefix_frac is not None:
                fw_rescore_file = source_prefix_frac_rescore_file
                rerank_utils.write_reprocessed(
                    gen_output.source,
                    gen_output.hypo,
                    gen_output.target,
                    pre_gen + source_prefix_frac_rescore_file + "."
                    + args.source_lang,
                    pre_gen + source_prefix_frac_rescore_file + "."
                    + args.target_lang,
                    pre_gen + "/reference_file",
                    bpe_symbol=args.post_process,
                    source_prefix_frac=args.source_prefix_frac,
                )
            else:
                fw_rescore_file = rescore_file

        if args.right_to_left1 or args.right_to_left2:
            rerank_utils.write_reprocessed(
                gen_output.source,
                gen_output.hypo,
                gen_output.target,
                pre_gen + "/right_to_left_rescore_data." + args.source_lang,
                pre_gen + "/right_to_left_rescore_data." + args.target_lang,
                pre_gen + "/right_to_left_reference_file",
                right_to_left=True,
                bpe_symbol=args.post_process,
            )

        print("STEP 3: binarize the translations")
        if (not args.right_to_left1 or args.score_model2 is not None
                and not args.right_to_left2 or not rerank1_is_gen):
            if args.backwards1 or args.backwards2:
                if args.backwards_score_dict_dir is not None:
                    bw_dict = args.backwards_score_dict_dir
                else:
                    bw_dict = args.score_dict_dir
                bw_preprocess_param = [
                    "--source-lang", scorer1_src,
                    "--target-lang", scorer1_tgt,
                    "--trainpref", pre_gen + bw_rescore_file,
                    "--srcdict", bw_dict + "/dict." + scorer1_src + ".txt",
                    "--tgtdict", bw_dict + "/dict." + scorer1_tgt + ".txt",
                    "--destdir", backwards_preprocessed_dir,
                ]
                preprocess_parser = options.get_preprocessing_parser()
                input_args = preprocess_parser.parse_args(bw_preprocess_param)
                preprocess.main(input_args)

            preprocess_param = [
                "--source-lang", scorer1_src,
                "--target-lang", scorer1_tgt,
                "--trainpref", pre_gen + fw_rescore_file,
                "--srcdict",
                args.score_dict_dir + "/dict." + scorer1_src + ".txt",
                "--tgtdict",
                args.score_dict_dir + "/dict." + scorer1_tgt + ".txt",
                "--destdir", left_to_right_preprocessed_dir,
            ]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

        if args.right_to_left1 or args.right_to_left2:
            preprocess_param = [
                "--source-lang", scorer1_src,
                "--target-lang", scorer1_tgt,
                "--trainpref", pre_gen + "/right_to_left_rescore_data",
                "--srcdict",
                args.score_dict_dir + "/dict." + scorer1_src + ".txt",
                "--tgtdict",
                args.score_dict_dir + "/dict." + scorer1_tgt + ".txt",
                "--destdir", right_to_left_preprocessed_dir,
            ]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

    return gen_output
def binarize(arguments):
    """Parse *arguments* with the fairseq preprocessing parser and run binarization."""
    parsed = options.get_preprocessing_parser().parse_args(arguments)
    binarize_func(parsed)
def cli_main():
    """Entry point: parse preprocessing args, build dictionaries, prepare raw data."""
    args = options.get_preprocessing_parser().parse_args()
    src_dict, tgt_dict, char_dict = prepare_dict(args)
    prepare_raw_data(args, src_dict, tgt_dict, char_dict)