def prepare_tokenizer(args):
    """Build the tokenizer and pad its vocabulary for model parallelism."""
    tokenizer_args = {
        'tokenizer_type': args.tokenizer_type,
        'corpus': None,
        'model_path': args.tokenizer_path,
        'vocab_size': args.vocab_size,
        'model_type': args.tokenizer_model_type,
        'cache_dir': args.cache_dir}
    tokenizer = make_tokenizer(**tokenizer_args)

    # Pad the vocabulary with dummy tokens until its size is divisible by
    # make_vocab_size_divisible_by * model-parallel world size.
    num_tokens = tokenizer.num_tokens
    before = num_tokens
    after = before
    multiple = args.make_vocab_size_divisible_by * \
               mpu.get_model_parallel_world_size()
    while (after % multiple) != 0:
        after += 1
    print_rank_0('> padded vocab (size: {}) with {} dummy '
                 'tokens (new size: {})'.format(
                     before, after - before, after))

    args.tokenizer_num_tokens = after
    args.tokenizer_num_type_tokens = tokenizer.num_type_tokens
    args.eod_token = tokenizer.get_command('eos').Id
    # after = tokenizer.num_tokens
    # while after % mpu.get_model_parallel_world_size() != 0:
    #     after += 1
    args.vocab_size = after
    print("prepare tokenizer done", flush=True)

    return tokenizer
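
# Illustrative note: the padding loop above rounds the vocabulary size up to
# the next multiple; the same value can be computed in closed form. A minimal
# sketch, using a hypothetical helper that is not part of this repo:
def _padded_vocab_size(before, multiple):
    """Round ``before`` up to the nearest multiple of ``multiple`` (> 0)."""
    return ((before + multiple - 1) // multiple) * multiple

# e.g. a 50257-token vocab with make_vocab_size_divisible_by=128 and 8-way
# model parallelism: _padded_vocab_size(50257, 128 * 8) == 51200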
def prepare_tokenizer(args):
    """Variant of prepare_tokenizer that guards against a zero padding multiple."""
    tokenizer_args = {
        'tokenizer_type': args.tokenizer_type,
        'corpus': None,
        'model_path': args.tokenizer_path,
        'vocab_size': args.vocab_size,
        'model_type': args.tokenizer_model_type,
        'cache_dir': args.cache_dir}
    tokenizer = make_tokenizer(**tokenizer_args)

    args.tokenizer_num_tokens = tokenizer.num_tokens
    args.tokenizer_num_type_tokens = tokenizer.num_type_tokens
    args.eod_token = tokenizer.get_command('eos').Id

    # Pad the vocabulary so it divides evenly across model-parallel ranks;
    # skip padding entirely when the multiple is zero.
    after = tokenizer.num_tokens
    multiple = args.make_vocab_size_divisible_by * \
               mpu.get_model_parallel_world_size()
    if multiple != 0:
        while (after % multiple) != 0:
            after += 1
    args.vocab_size = after
    print("prepare tokenizer done", flush=True)
    return tokenizer
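
# Hedged usage sketch for prepare_tokenizer(). The argument values below are
# hypothetical stand-ins for what get_args() would supply, and the call
# assumes torch.distributed and mpu have already been initialized.
def _example_prepare_tokenizer():
    from argparse import Namespace
    example_args = Namespace(
        tokenizer_type='GPT2BPETokenizer', tokenizer_path=None,
        vocab_size=None, tokenizer_model_type=None, cache_dir='cache',
        make_vocab_size_divisible_by=128)
    tokenizer = prepare_tokenizer(example_args)
    # prepare_tokenizer() writes the padded sizes back onto the namespace.
    print(example_args.vocab_size, example_args.eod_token)
    return tokenizer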
def make_tfrecord_loaders(args):
    """Load train/val/test datasets from shuffled TFRecords."""
    import data_utils.tf_dl
    data_set_args = {'batch_size': args.batch_size,
                     'max_seq_len': args.seq_length,
                     'max_preds_per_seq': args.max_preds_per_seq,
                     'train': True,
                     'num_workers': max(args.num_workers, 1),
                     'seed': args.seed + args.rank + 1,
                     'threaded_dl': args.num_workers > 0}
    train = data_utils.tf_dl.TFRecordDataLoader(args.train_data,
                                                **data_set_args)
    # Evaluation loaders may override the sequence length and the
    # predictions-per-sequence settings.
    data_set_args['train'] = False
    if args.eval_seq_length is not None:
        data_set_args['max_seq_len'] = args.eval_seq_length
    if args.eval_max_preds_per_seq is not None:
        data_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
    valid = None
    if args.valid_data is not None:
        valid = data_utils.tf_dl.TFRecordDataLoader(args.valid_data,
                                                    **data_set_args)
    test = None
    if args.test_data is not None:
        test = data_utils.tf_dl.TFRecordDataLoader(args.test_data,
                                                   **data_set_args)
    tokenizer = data_utils.make_tokenizer(args.tokenizer_type,
                                          train,
                                          args.tokenizer_path,
                                          args.vocab_size,
                                          args.tokenizer_model_type,
                                          cache_dir=args.cache_dir)
    return (train, valid, test), tokenizer
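
# Hedged usage sketch for make_tfrecord_loaders(). Every value below,
# including the data path, is a hypothetical stand-in; in practice the
# namespace comes from get_args() and the TFRecord files must exist on disk.
def _example_tfrecord_loaders():
    from argparse import Namespace
    example_args = Namespace(
        batch_size=32, seq_length=512, max_preds_per_seq=80,
        num_workers=2, seed=1234, rank=0,
        eval_seq_length=None, eval_max_preds_per_seq=None,
        train_data='data/train.tfrecord',  # hypothetical path
        valid_data=None, test_data=None,
        tokenizer_type='BertWordPieceTokenizer', tokenizer_path=None,
        vocab_size=30522, tokenizer_model_type='bert-base-uncased',
        cache_dir='cache')
    (train, valid, test), tokenizer = make_tfrecord_loaders(example_args)
    return train, tokenizer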
def get_tokenizer(args):
    tokenizer_args = {
        'tokenizer_type': args.tokenizer_type,
        'corpus': None,
        'model_path': args.tokenizer_path,
        'vocab_size': args.vocab_size,
        'model_type': args.tokenizer_model_type,
        'cache_dir': args.cache_dir}
    return make_tokenizer(**tokenizer_args)
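
# Hedged usage sketch: get_tokenizer() only reads the tokenizer-related fields
# on args, so a minimal namespace suffices. Values are hypothetical.
def _example_get_tokenizer():
    from argparse import Namespace
    return get_tokenizer(Namespace(
        tokenizer_type='GPT2BPETokenizer', tokenizer_path=None,
        vocab_size=None, tokenizer_model_type=None, cache_dir='cache'))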
def main():
    """Main program for generating samples."""
    print('Generate Samples')

    # Disable CuDNN.
    torch.backends.cudnn.enabled = False

    # Timer.
    timers = Timers()

    # Arguments.
    args = get_args()

    # Pytorch distributed.
    initialize_distributed(args)

    # Random seeds for reproducibility.
    set_random_seed(args.seed)

    # Get the tokenizer.
    tokenizer = make_tokenizer(args.tokenizer_type, None, args.tokenizer_path,
                               args.vocab_size, args.tokenizer_model_type,
                               pad_token=0, character_coverage=1.0)

    # Model, optimizer, and learning rate.
    model = setup_model(args)

    # Sampling uses a default batch size of 1.
    args.batch_size = 1

    # Generate samples.
    generate_samples(model, tokenizer, args, torch.cuda.current_device())
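
# Standard script entry point; the original file presumably ends with a guard
# like this so that main() runs when the module is executed directly.
if __name__ == "__main__":
    main()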