def main():
    """Main training program."""

    # Disable CuDNN.
    torch.backends.cudnn.enabled = False

    # Timer.
    timers = Timers()

    # Arguments.
    args = get_args()
    args.mem_length = args.mem_length if args.transformer_xl else 0
    if args.load and not args.new_save_directory:
        args.experiment_name = os.path.basename(os.path.normpath(args.load))
    else:
        args.experiment_name = args.experiment_name + datetime.now().strftime("%m-%d-%H-%M")
    if args.save:
        args.save = os.path.join(args.save, args.experiment_name)

    # Pytorch distributed.
    initialize_distributed(args)

    # Random seeds for reproducibility.
    set_random_seed(args.seed)

    # Data stuff.
    global tokenizer
    tokenizer = prepare_tokenizer(args)
    train_data, val_data, test_data = get_train_val_test_data(args, tokenizer)
    multi_train_data, multi_val_data = None, None
    if args.multi_task_ratio > 0.0:
        multi_train_data, multi_val_data = build_multi_task_dataset(args, tokenizer)

    # Model, optimizer, and learning rate.
    model, optimizer, lr_scheduler = setup_model_and_optimizer(args)
    if args.load is not None:
        with FileLock(os.path.join(pathlib.Path.home(), "checkpoint_lock"), timeout=-1):
            args.iteration = load_checkpoint(model, optimizer, lr_scheduler, args)
    else:
        args.iteration = 0
    torch.distributed.barrier()
    if args.switch_linear:
        lr_scheduler.switch_linear(args)

    summary_writer = None
    if torch.distributed.get_rank() == 0:
        print('Pretrain GPT2 model')
        args.log_dir = None
        if args.train_iters > 0:
            args.log_dir = get_log_dir(base=args.summary_dir, name=args.experiment_name)
            summary_writer = get_sample_writer(log_dir=args.log_dir, iteration=args.iteration)
        print_and_save_args(args, verbose=True, log_dir=args.log_dir)

    # Resume data loader if necessary.
    if args.resume_dataloader:
        print_rank_0("Resume dataloader")
        if train_data is not None:
            train_data.batch_sampler.start_iter = args.iteration % len(train_data)
        if val_data is not None:
            start_iter_val = (args.iteration // args.eval_interval) * args.eval_iters
            val_data.batch_sampler.start_iter = start_iter_val % len(val_data)
        if multi_train_data is not None:
            multi_train_data.batch_sampler.start_iter = int(
                args.iteration * args.multi_task_ratio) % len(multi_train_data)
        if multi_val_data is not None:
            start_iter_val = (args.iteration // args.eval_interval) * args.eval_iters * args.multi_task_ratio
            multi_val_data.batch_sampler.start_iter = start_iter_val % len(multi_val_data)
    if train_data is not None:
        train_data_iterator = iter(train_data)
    else:
        train_data_iterator = None
    if multi_train_data is not None:
        multi_train_iterator = iter(multi_train_data)
    else:
        multi_train_iterator = None
    if val_data is not None:
        val_data_iterator = iter(val_data)
    else:
        val_data_iterator = None
    if multi_val_data is not None:
        multi_val_iterator = iter(multi_val_data)
    else:
        multi_val_iterator = None

    # TODO: figure out how to properly set this especially when resuming training
    iteration = 0
    if args.train_iters > 0:
        if args.do_train:
            with ExitStack() as stack:
                def save_on_exit(args_, model_, optimizer_, lr_scheduler_):
                    save_checkpoint(args_.iteration, model_, optimizer_, lr_scheduler_, args_)

                # stack.callback(save_on_exit, args, model, optimizer, lr_scheduler)
                iteration, skipped = train(model, optimizer, lr_scheduler,
                                           (train_data_iterator, multi_train_iterator),
                                           (val_data_iterator, multi_val_iterator),
                                           timers, args, summary_writer=summary_writer)

        if args.do_valid:
            prefix = 'the end of training for val data'
            val_loss = evaluate_and_print_results(prefix, val_data_iterator,
                                                  model, args, timers, verbose=False,
                                                  forward_step_func=forward_step)

    if args.save and iteration != 0:
        save_checkpoint(iteration, model, optimizer, lr_scheduler, args)

    if test_data is not None:
        test_data_iterator = iter(test_data)
    else:
        test_data_iterator = None

    if args.do_test:
        # Run on test data.
        prefix = 'the end of training for test data'
        evaluate_and_print_results(prefix, (test_data_iterator, None),
                                   model, args, timers, verbose=True,
                                   forward_step_func=forward_step)
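
# A minimal sketch (not part of the training code above) of the start_iter
# arithmetic used when resuming the data loaders in main(): the train sampler
# restarts at iteration % len(train_data), and the validation sampler restarts
# at (iteration // eval_interval) * eval_iters % len(val_data). The helper name
# `resume_start_iters` and the plain-int arguments are hypothetical stand-ins
# for the args/dataloader objects used above.
def resume_start_iters(iteration, train_len, val_len, eval_interval, eval_iters):
    """Return (train_start_iter, val_start_iter) for a resumed run."""
    train_start = iteration % train_len
    # One evaluation pass of eval_iters batches runs every eval_interval
    # training iterations, so the validation sampler skips ahead accordingly.
    val_start = (iteration // eval_interval) * eval_iters % val_len
    return train_start, val_start


# Example: resuming at iteration 12500 with 10000 train batches, 2000 val
# batches, evaluation every 1000 iterations, 50 eval batches per pass:
# resume_start_iters(12500, 10000, 2000, 1000, 50) == (2500, 600)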
# Create an outputs/ folder in the blob storage
if args.output_dir is None:
    parent_dir = os.path.join('outputs', str(experiment_name))
    output_dir = os.path.join(parent_dir, str(run_id))
    os.makedirs(output_dir, exist_ok=True)
    saved_model_path = os.path.join(output_dir, "saved_models", job_name)
    os.makedirs(saved_model_path, exist_ok=True)
else:
    saved_model_path = args.output_dir

summary_writer = None
# Prepare Summary Writer and saved_models path
if check_write_log():
    # azureml.tensorboard only streams from the /logs directory, therefore hardcoded
    summary_writer = get_sample_writer(name=job_name + str(local_rank), base='./logs')

# Loading Tokenizer (vocabulary from blob storage, if it exists)
logger.info("Extracting the vocabulary")
if args.tokenizer_path:
    logger.info(f'Loading tokenizer from {args.tokenizer_path}')
    tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path, cache_dir=args.output_dir)
else:
    tokenizer = BertTokenizer.from_pretrained(job_config.get_token_file_type(),
                                              cache_dir=args.output_dir)
logger.info("Vocabulary contains {} tokens".format(len(list(tokenizer.vocab.keys()))))

# Loading Model
logger.info("Initializing BertMultiTask model")
# Create an outputs/ folder in the blob storage
if args.output_dir is None:
    parent_dir = os.path.join('outputs', str(run.experiment.name))
    output_dir = os.path.join(parent_dir, str(run.id))
    os.makedirs(output_dir, exist_ok=True)
    saved_model_path = os.path.join(output_dir, "saved_models", job_name)
    os.makedirs(saved_model_path, exist_ok=True)
else:
    saved_model_path = args.output_dir

summary_writer = None
# Prepare Summary Writer and saved_models path
if check_write_log():
    # azureml.tensorboard only streams from the /logs directory, therefore hardcoded
    summary_writer = get_sample_writer(name=job_name, base='./logs')

# Loading Tokenizer (vocabulary from blob storage, if it exists)
logger.info("Extracting the vocabulary")
if args.tokenizer_path:
    logger.info(f'Loading tokenizer from {args.tokenizer_path}')
    tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path, cache_dir=args.output_dir)
else:
    tokenizer = BertTokenizer.from_pretrained(job_config.get_token_file_type(),
                                              cache_dir=args.output_dir)
logger.info("Vocabulary contains {} tokens".format(len(list(tokenizer.vocab.keys()))))

# Loading Model
logger.info("Initializing BertMultiTask model")
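
# A minimal sketch of the tokenizer-loading fallback used in the two snippets
# above, assuming the Hugging Face `transformers` package provides the
# BertTokenizer (the original code may load it from a different package).
# The helper name `load_tokenizer` and the default vocabulary name
# 'bert-base-uncased' are illustrative, not taken from the code above.
from transformers import BertTokenizer


def load_tokenizer(tokenizer_path=None, cache_dir=None):
    """Prefer a local tokenizer path; otherwise fall back to a published vocabulary."""
    name_or_path = tokenizer_path if tokenizer_path else 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(name_or_path, cache_dir=cache_dir)
    # The vocabulary size is simply the number of entries in tokenizer.vocab.
    return tokenizer, len(tokenizer.vocab)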
def main():
    """Main training program."""

    # Disable CuDNN.
    torch.backends.cudnn.enabled = False

    # Timer.
    timers = Timers()

    # Arguments.
    args = get_args()
    args.mem_length = args.mem_length if args.transformer_xl else 0
    if args.load:
        args.experiment_name = os.path.basename(os.path.normpath(args.load))
    else:
        args.experiment_name = args.experiment_name + datetime.now().strftime("%m-%d-%H-%M")
    if args.save:
        args.save = os.path.join(args.save, args.experiment_name)

    # Pytorch distributed.
    initialize_distributed(args)

    # Random seeds for reproducibility.
    set_random_seed(args.seed)

    # Data stuff.
    train_data, val_data, test_data, args.vocab_size, \
        args.eod_token = get_train_val_test_data(args)

    # Model, optimizer, and learning rate.
    model, optimizer, lr_scheduler = setup_model_and_optimizer(args)
    if args.load is not None:
        with FileLock("/root/checkpoint_lock", timeout=-1):
            args.iteration = load_checkpoint(model, optimizer, lr_scheduler, args)
    else:
        args.iteration = 0
    torch.distributed.barrier()

    summary_writer = None
    if torch.distributed.get_rank() == 0:
        print('Pretrain GPT2 model')
        print_args(args)
        summary_writer = get_sample_writer(base=args.summary_dir,
                                           name=args.experiment_name,
                                           iteration=args.iteration)

    # Resume data loader if necessary.
    if args.resume_dataloader:
        if train_data is not None:
            train_data.batch_sampler.start_iter = args.iteration % len(train_data)
        if val_data is not None:
            start_iter_val = (args.train_iters // args.save_interval) * args.eval_interval
            val_data.batch_sampler.start_iter = start_iter_val % len(val_data)
    if train_data is not None:
        train_data_iterator = iter(train_data)
    else:
        train_data_iterator = None
    if val_data is not None:
        val_data_iterator = iter(val_data)
    else:
        val_data_iterator = None

    # TODO: figure out how to properly set this especially when resuming training
    iteration = 0
    if args.train_iters > 0:
        if args.do_train:
            with ExitStack() as stack:
                def save_on_exit(args_, model_, optimizer_, lr_scheduler_):
                    save_checkpoint(args_.iteration, model_, optimizer_, lr_scheduler_, args_)

                # stack.callback(save_on_exit, args, model, optimizer, lr_scheduler)
                iteration, skipped = train(model, optimizer, lr_scheduler,
                                           train_data_iterator, val_data_iterator,
                                           timers, args, summary_writer=summary_writer)

        if args.do_valid:
            prefix = 'the end of training for val data'
            val_loss = evaluate_and_print_results(prefix, val_data_iterator,
                                                  model, args, timers, False)

    if args.save and iteration != 0:
        save_checkpoint(iteration, model, optimizer, lr_scheduler, args)

    if test_data is not None:
        test_data_iterator = iter(test_data)
    else:
        test_data_iterator = None

    if args.do_test:
        # Run on test data.
        prefix = 'the end of training for test data'
        evaluate_and_print_results(prefix, test_data_iterator,
                                   model, args, timers, True)
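
# A minimal sketch of the FileLock pattern used when loading checkpoints above:
# a lock file (here in the home directory) serializes checkpoint reads so that
# concurrent processes on the same node do not race on the same files. Assumes
# the `filelock` package; `load_checkpoint_locked` and its arguments are
# hypothetical wrappers around the project's own load_checkpoint call.
import os
import pathlib

from filelock import FileLock


def load_checkpoint_locked(load_fn, *load_args, lock_name="checkpoint_lock"):
    lock_path = os.path.join(pathlib.Path.home(), lock_name)
    # timeout=-1 blocks until the lock is acquired.
    with FileLock(lock_path, timeout=-1):
        return load_fn(*load_args)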
def finetune(args,
             train_valid_datasets_provider,
             model_kwargs,
             forward_step=finetune_forward_step,
             end_of_epoch_callback_provider=None):
    """Main finetune function used across all tasks."""
    global tokenizer
    timers = Timers()
    tokenizer = prepare_tokenizer(args)
    pretrain_glm.tokenizer = tokenizer
    if args.save:
        args.save = os.path.join(args.save, args.experiment_name)

    # Train and validation data loaders.
    timers('train/valid/test dataset/dataloder').start()
    train_dataloader, valid_dataloader = None, None
    train_block_dataloader, valid_block_dataloader = None, None
    if train_valid_datasets_provider is not None and args.epochs > 0:
        if mpu.get_model_parallel_rank() == 0:
            train_dataset, valid_dataset = train_valid_datasets_provider(args, tokenizer)
            train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
                train_dataset, valid_dataset, args)
            if args.no_validation:
                valid_dataloader = None
            train_iters = torch.cuda.LongTensor([len(train_dataloader)])
        else:
            train_iters = torch.cuda.LongTensor([0])
        torch.distributed.broadcast(train_iters,
                                    mpu.get_model_parallel_src_rank(),
                                    group=mpu.get_model_parallel_group())
        if mpu.get_model_parallel_rank() != 0:
            args.train_iters_per_epoch = train_iters[0].item()
            args.train_iters = args.epochs * args.train_iters_per_epoch
            train_dataloader = FakeDataloader(args.train_iters_per_epoch)
            if args.no_validation:
                valid_dataloader = None
            else:
                valid_dataloader = FakeDataloader(None)
        if args.block_lm_ratio > 0.0:
            if mpu.get_model_parallel_rank() == 0:
                train_block_dataset, valid_block_dataset = train_valid_datasets_provider(
                    args, tokenizer, pattern_text=True)
                train_block_dataloader = make_data_loader(
                    train_block_dataset, tokenizer,
                    args.batch_size * mpu.get_data_parallel_world_size(),
                    args.train_iters, args, shuffle=True, block_collate=True)
                valid_block_dataloader = make_data_loader(
                    valid_block_dataset, tokenizer,
                    args.batch_size * mpu.get_data_parallel_world_size(),
                    (args.train_iters // args.eval_interval + 1) * args.eval_iters,
                    args, shuffle=True, block_collate=True)
            else:
                train_block_dataloader = FakeDataloader(args.train_iters)
                valid_block_dataloader = FakeDataloader(None)
            train_block_dataloader, valid_block_dataloader = iter(train_block_dataloader), iter(valid_block_dataloader)
    timers('train/valid/test dataset/dataloder').stop()

    # Build callback function.
    timers('callback function').start()
    end_of_epoch_callback, end_of_train_callback = None, None
    if end_of_epoch_callback_provider is not None:
        if train_valid_datasets_provider is not None and args.epochs > 0 and not args.no_validation:
            end_of_epoch_callback = end_of_epoch_callback_provider(args, tokenizer, is_test=False)
        end_of_train_callback = end_of_epoch_callback_provider(args, tokenizer, is_test=True)
    timers('callback function').stop()

    # Build model, optimizer and learning rate scheduler.
    timers('model and optimizer').start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(args, **model_kwargs)
    timers('model and optimizer').stop()

    # If a pretrained checkpoint is provided and we have not trained for
    # any iteration (i.e., iteration is zero), then load the pretrained
    # checkpoint.
    timers('pretrained checkpoint').start()
    if args.load_pretrained is not None and not args.pretrained_bert:
        task_tokens = None
        if args.continuous_prompt and args.prompt_init:
            if mpu.get_model_parallel_rank() == 0:
                dataset = train_dataloader.dataset
                processor, pvp = dataset.processor, dataset.pvp
                task_tokens = []
                for label in processor.get_labels():
                    verbalizer = pvp.verbalize(label)[0]
                    verbalizer_ids = tokenizer.EncodeAsIds(verbalizer).tokenization
                    task_tokens += verbalizer_ids
                print_rank_0("Task tokens: " + tokenizer.DecodeIds(task_tokens))
                num_task_tokens = len(task_tokens)
            else:
                num_task_tokens, task_tokens = 0, []
            num_task_tokens = torch.cuda.LongTensor([num_task_tokens])
            torch.distributed.broadcast(num_task_tokens,
                                        mpu.get_model_parallel_src_rank(),
                                        group=mpu.get_model_parallel_group())
            num_task_tokens = num_task_tokens.item()
            if num_task_tokens > 0:
                if mpu.get_model_parallel_rank() == 0:
                    task_tokens = torch.cuda.LongTensor(task_tokens)
                else:
                    task_tokens = torch.empty(num_task_tokens,
                                              device=torch.cuda.current_device(),
                                              dtype=torch.long)
                torch.distributed.broadcast(task_tokens,
                                            mpu.get_model_parallel_src_rank(),
                                            group=mpu.get_model_parallel_group())
                task_tokens = task_tokens.tolist()
        with FileLock(os.path.join(pathlib.Path.home(), "checkpoint_lock"), timeout=-1):
            load_pretrained(model, args.load_pretrained, args, task_tokens=task_tokens)
        # This is critical when only model is loaded. We should make sure
        # master parameters are also updated.
        if args.fp16 and optimizer is not None:
            if args.deepspeed:
                optimizer.refresh_fp32_params()
            else:
                optimizer._model_params_to_master_params()
    if args.load is not None:
        with FileLock(os.path.join(pathlib.Path.home(), "checkpoint_lock"), timeout=-1):
            load_checkpoint(model, optimizer, lr_scheduler, args, no_deepspeed=args.no_deepspeed_load)
        # This is critical when only model is loaded. We should make sure
        # master parameters are also updated.
        if args.fp16 and optimizer is not None:
            if args.deepspeed:
                optimizer.refresh_fp32_params()
            else:
                optimizer._model_params_to_master_params()
    torch.distributed.barrier()
    timers('pretrained checkpoint').stop()
    args.iteration = 0
    summary_writer = None
    if torch.distributed.get_rank() == 0:
        args.log_dir = get_log_dir(base=args.summary_dir, name=args.experiment_name)
        if os.path.exists(os.path.join(args.log_dir, "test_results.json")) \
                and args.load is None and not args.overwrite:
            raise ValueError("Output directory ({}) already exists and is not empty.".format(args.log_dir))
        summary_writer = get_sample_writer(log_dir=args.log_dir, iteration=args.iteration)
        print_and_save_args(args, verbose=True, log_dir=args.log_dir)

    # Print setup timing.
    print_rank_0('done with setups ...')
    timers.log(['train/valid/test dataset/dataloder', 'callback function',
                'model and optimizer', 'pretrained checkpoint'])
    print_rank_0('training ...')

    # Finetune the model.
    score_dict = None
    if train_dataloader is not None and args.epochs > 0:
        if args.block_lm_ratio > 0.0:
            forward_step = mix_forward_step
        best_iteration = _train(model, optimizer, lr_scheduler, forward_step,
                                (train_dataloader, train_block_dataloader),
                                (valid_dataloader, valid_block_dataloader),
                                end_of_epoch_callback, args, timers,
                                summary_writer=summary_writer)
        if end_of_train_callback is not None and best_iteration is not None:
            with FileLock(os.path.join(pathlib.Path.home(), "checkpoint_lock"), timeout=-1):
                args.load = os.path.join(args.save, "best")
                load_checkpoint(model, optimizer, lr_scheduler, args,
                                no_load_optim=True, no_deepspeed=True)
                args.load = None
        torch.distributed.barrier()
        if end_of_train_callback is not None:
            score_dict = end_of_train_callback(model, epoch=-1, output_predictions=True)
    # Or just evaluate.
    else:
        if end_of_train_callback is not None:
            print_rank_0('evaluation only mode, setting epoch to -1')
            score_dict = end_of_train_callback(model, epoch=-1, output_predictions=True)
    if score_dict is not None and torch.distributed.get_rank() == 0:
        score_dict.update({"type": "test"})
        with open(os.path.join(args.log_dir, "test_results.json"), "w") as output:
            output.write(json.dumps(score_dict) + "\n")
    print_rank_0('done :-)')
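
# A minimal sketch of the broadcast pattern used in finetune() above: only
# model-parallel rank 0 builds the real dataloader, so the number of training
# iterations per epoch is wrapped in a CUDA LongTensor and broadcast from the
# source rank to the other ranks in the group. Assumes torch.distributed is
# initialized and CUDA is available; `broadcast_num_iters` is illustrative.
import torch


def broadcast_num_iters(num_iters, src_rank, group=None):
    """Broadcast an int from src_rank to every rank in `group`; return the int."""
    value = num_iters if torch.distributed.get_rank() == src_rank else 0
    tensor = torch.cuda.LongTensor([value])
    torch.distributed.broadcast(tensor, src_rank, group=group)
    return tensor[0].item()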
def finetune(args,
             train_valid_datasets_provider,
             model_kwargs,
             forward_step=finetune_forward_step,
             end_of_epoch_callback_provider=None):
    """Main finetune function used across all tasks."""
    global tokenizer
    timers = Timers()
    tokenizer = prepare_tokenizer(args)
    if args.save:
        args.save = os.path.join(args.save, args.experiment_name)

    # Train and validation data loaders.
    timers('train/valid/test dataset/dataloder').start()
    train_dataloader, valid_dataloader = None, None
    if train_valid_datasets_provider is not None and args.epochs > 0:
        train_dataset, valid_dataset = train_valid_datasets_provider(args, tokenizer)
        train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
            train_dataset, valid_dataset, args)
    timers('train/valid/test dataset/dataloder').stop()

    # Build callback function.
    timers('callback function').start()
    end_of_epoch_callback, end_of_train_callback = None, None
    if end_of_epoch_callback_provider is not None:
        if train_valid_datasets_provider is not None and args.epochs > 0:
            end_of_epoch_callback = end_of_epoch_callback_provider(args, tokenizer, is_test=False)
        end_of_train_callback = end_of_epoch_callback_provider(args, tokenizer, is_test=True)
    timers('callback function').stop()

    # Build model, optimizer and learning rate scheduler.
    timers('model and optimizer').start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(args, **model_kwargs)
    timers('model and optimizer').stop()

    # If a pretrained checkpoint is provided and we have not trained for
    # any iteration (i.e., iteration is zero), then load the pretrained
    # checkpoint.
    timers('pretrained checkpoint').start()
    if args.load_pretrained is not None and not args.pretrained_bert and not args.load:
        module = model
        if isinstance(module, (LocalDDP, TorchDDP)):
            module = module.module
        if isinstance(module, FP16_Module):
            module = module.module
        if not isinstance(module, GLMModel):
            module = module.model
        args.load = args.load_pretrained
        load_checkpoint(module, optimizer, lr_scheduler, args)
        args.load = None
        # This is critical when only model is loaded. We should make sure
        # master parameters are also updated.
        if args.fp16:
            optimizer._model_params_to_master_params()
    if args.load is not None:
        load_checkpoint(model, optimizer, lr_scheduler, args)
        # This is critical when only model is loaded. We should make sure
        # master parameters are also updated.
        if args.fp16:
            optimizer._model_params_to_master_params()
    timers('pretrained checkpoint').stop()
    args.iteration = 0
    summary_writer = None
    if torch.distributed.get_rank() == 0:
        args.log_dir = get_log_dir(base=args.summary_dir, name=args.experiment_name)
        if os.path.exists(os.path.join(args.log_dir, "test_results.json")) \
                and args.load is None and not args.overwrite:
            raise ValueError("Output directory ({}) already exists and is not empty.".format(args.log_dir))
        summary_writer = get_sample_writer(log_dir=args.log_dir, iteration=args.iteration)
        print_and_save_args(args, verbose=False, log_dir=args.log_dir)

    # Print setup timing.
    print_rank_0('done with setups ...')
    timers.log(['train/valid/test dataset/dataloder', 'callback function',
                'model and optimizer', 'pretrained checkpoint'])
    print_rank_0('training ...')

    # Finetune the model.
    score_dict = None
    if train_dataloader is not None and args.epochs > 0:
        best_iteration = _train(model, optimizer, lr_scheduler, forward_step,
                                train_dataloader, valid_dataloader,
                                end_of_epoch_callback, args, timers,
                                summary_writer=summary_writer)
        if best_iteration is not None and end_of_train_callback is not None:
            args.load = os.path.join(args.save, "best")
            load_checkpoint(model, optimizer, lr_scheduler, args)
            args.load = None
        if end_of_train_callback is not None:
            score_dict = end_of_train_callback(model, epoch=-1, output_predictions=True)
    # Or just evaluate.
    else:
        if end_of_train_callback is not None:
            print_rank_0('evaluation only mode, setting epoch to -1')
            score_dict = end_of_train_callback(model, epoch=-1, output_predictions=True)
    if score_dict is not None and torch.distributed.get_rank() == 0:
        score_dict.update({"type": "test"})
        with open(os.path.join(args.log_dir, "test_results.json"), "w") as output:
            output.write(json.dumps(score_dict) + "\n")
    print_rank_0('done :-)')
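
# A minimal sketch of the rank-0 result dump both finetune() variants end with:
# only global rank 0 tags the score dictionary as a test result and writes it
# as a single JSON line under the log directory. Assumes torch.distributed is
# initialized; the helper name `write_test_results` is hypothetical.
import json
import os

import torch


def write_test_results(score_dict, log_dir):
    if score_dict is None or torch.distributed.get_rank() != 0:
        return
    score_dict.update({"type": "test"})
    with open(os.path.join(log_dir, "test_results.json"), "w") as output:
        output.write(json.dumps(score_dict) + "\n")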