def main():
    """Preprocess raw data and produce pickled files.

    All settings are read from the module-level ``args`` object:
    ``data_dir``, ``output_dir``, ``pretrained_model_name``,
    ``config_train`` and ``max_seq_length``.
    """
    source_dir = args.data_dir
    # Pickles land next to the raw data unless an output directory is given.
    target_dir = source_dir if args.output_dir is None else args.output_dir
    tx.utils.maybe_create_dir(target_dir)

    # Resolve the directory of the requested pre-trained model; it is handed
    # to the encoder factory below.
    checkpoint_dir = tx.modules.PretrainedGPT2Mixin.download_checkpoint(
        pretrained_model_name=args.pretrained_model_name)

    # Data pre-processor for, e.g., BPE encoding.
    encoder = processor.get_encoder(checkpoint_dir)

    # ``args.config_train`` names an importable module that exposes
    # ``feature_original_types``.
    train_config = importlib.import_module(args.config_train)

    # Write the pickle files.
    data_utils.prepare_pickle_data(
        data_dir=source_dir,
        max_seq_length=args.max_seq_length,
        encoder=encoder,
        output_dir=target_dir,
        feature_original_types=train_config.feature_original_types,
    )
def prepare_data():
    r"""Preprocess raw data and produce pickle files.

    All settings are read from the module-level ``args`` object:
    ``data_dir``, ``output_dir``, ``pretrained_model_name`` and
    ``max_seq_length``. The feature types come from
    ``configs.config_train``.
    """
    source_dir = args.data_dir
    # Pickles land next to the raw data unless an output directory is given.
    target_dir = source_dir if args.output_dir is None else args.output_dir
    tx.utils.maybe_create_dir(target_dir)

    # Resolve the directory of the requested pre-trained model, using a
    # fixed local cache directory.
    model_dir = tx.modules.load_pretrained_gpt2(
        pretrained_model_name=args.pretrained_model_name,
        cache_dir='gpt2_pretrained_models',
    )

    # Data pre-processor for, e.g., BPE encoding.
    encoder = processor.get_encoder(model_dir)

    # Imported here, at the same point as before, so the config module is
    # only loaded when this function runs.
    from configs.config_train import feature_original_types

    # Write the pickle files.
    data_utils.prepare_pickle_data(
        data_dir=source_dir,
        max_seq_length=args.max_seq_length,
        encoder=encoder,
        output_dir=target_dir,
        feature_original_types=feature_original_types,
    )
def main() -> None:
    """Preprocess raw data and produce pickled files.

    All settings are read from the module-level ``args`` object:
    ``data_dir``, ``output_dir``, ``pretrained_model_name``,
    ``config_train`` and ``max_seq_length``.
    """
    source_dir = args.data_dir
    # Pickles land next to the raw data unless an output directory is given.
    target_dir = source_dir if args.output_dir is None else args.output_dir
    tx.utils.maybe_create_dir(target_dir)

    # GPT-2 tokenizer (BPE encoding) for the requested pre-trained model.
    bpe_tokenizer = tx.data.GPT2Tokenizer(
        pretrained_model_name=args.pretrained_model_name)

    # ``args.config_train`` names an importable module that exposes
    # ``feature_types``; typed as ``Any`` because its attributes are only
    # known at runtime.
    train_config: Any = importlib.import_module(args.config_train)

    # Write the pickle files.
    data_utils.prepare_pickle_data(
        data_dir=source_dir,
        max_seq_length=args.max_seq_length,
        tokenizer=bpe_tokenizer,
        output_dir=target_dir,
        feature_types=train_config.feature_types,
    )