def start_training():
    cfg = shared_configs.get_pretraining_args()
    set_random_seed(cfg.seed)

    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    if hvd.rank() != 0:
        LOGGER.disabled = True
    LOGGER.info(f"device: {device} n_gpu: {n_gpu}, "
                f"rank: {hvd.rank()}, 16-bits training: {cfg.fp16}")

    model = setup_model(cfg, device=device)
    model.train()
    optimizer = setup_e2e_optimizer(model, cfg)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.none
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters(),
        compression=compression)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    model, optimizer = amp.initialize(
        model, optimizer, enabled=cfg.fp16, opt_level='O2',
        keep_batchnorm_fp32=True)

    # prepare data
    tokenizer = BertTokenizerFast.from_pretrained(cfg.tokenizer_dir)
    train_loaders, val_loaders = setup_dataloaders(cfg, tokenizer)
    train_loader = MetaLoader(train_loaders,
                              accum_steps=cfg.gradient_accumulation_steps,
                              distributed=n_gpu > 1)
    img_norm = ImageNorm(mean=cfg.img_pixel_mean, std=cfg.img_pixel_std)
    train_loader = PrefetchLoader(train_loader, img_norm)
    val_loaders = {k: PrefetchLoader(v, img_norm)
                   for k, v in val_loaders.items()}

    # compute the number of steps and update cfg
    total_train_batch_size = int(
        n_gpu * cfg.train_batch_size *
        cfg.gradient_accumulation_steps * cfg.max_n_example_per_group)
    total_n_epochs = cfg.num_train_epochs
    cfg.num_train_steps = int(math.ceil(
        1. * train_loader.n_batches_in_epoch * total_n_epochs /
        (n_gpu * cfg.gradient_accumulation_steps)))
    cfg.valid_steps = int(math.ceil(
        1. * cfg.num_train_steps / cfg.num_valid /
        cfg.min_valid_steps)) * cfg.min_valid_steps
    actual_num_valid = int(math.floor(
        1. * cfg.num_train_steps / cfg.valid_steps)) + 1

    # restore
    restorer = TrainingRestorer(cfg, model, optimizer)
    global_step = restorer.global_step
    TB_LOGGER.global_step = global_step
    if hvd.rank() == 0:
        LOGGER.info("Saving training meta...")
        save_training_meta(cfg)
        path = join(cfg.output_dir, 'log', "detectron2_model_cfg.yaml")
        with open(path, "w") as f:
            f.write(model.cnn.config_file)
        LOGGER.info("Saving training done...")
        TB_LOGGER.create(join(cfg.output_dir, 'log'))
        pbar = tqdm(total=cfg.num_train_steps)
        model_saver = ModelSaver(join(cfg.output_dir, "ckpt"))
        add_log_to_file(join(cfg.output_dir, "log", "log.txt"))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()
        restorer = NoOp()

    if global_step > 0:
        pbar.update(global_step)

    LOGGER.info(cfg)
    LOGGER.info("Starting training...")
    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info(f"  Single-GPU Non-Accumulated batch size = {cfg.train_batch_size}")
    LOGGER.info(f"  max_n_example_per_group = {cfg.max_n_example_per_group}")
    LOGGER.info(f"  Accumulate steps = {cfg.gradient_accumulation_steps}")
    LOGGER.info(f"  Total batch size = #GPUs * Single-GPU batch size * "
                f"max_n_example_per_group * Accumulate steps [Image] = {total_train_batch_size}")
    LOGGER.info(f"  Total #batches - single epoch = {train_loader.n_batches_in_epoch}.")
    LOGGER.info(f"  Total #steps = {cfg.num_train_steps}")
    LOGGER.info(f"  Total #epochs = {total_n_epochs}.")
    LOGGER.info(f"  Validate every {cfg.valid_steps} steps, in total {actual_num_valid} times")

    # quick hack for amp delay_unscale bug
    with optimizer.skip_synchronize():
        optimizer.zero_grad()
        if global_step == 0:
            optimizer.step()

    debug_step = 5
    tasks = []
    for name, flag in zip(["mlm", "itm"], [cfg.use_mlm, cfg.use_itm]):
        if flag:
            tasks.append(name)
    task2loss = {t: RunningMeter(f'train_loss/{t}') for t in tasks}
    task2loss["loss"] = RunningMeter('train_loss/loss')

    for step, (task, batch) in enumerate(train_loader):
        # forward pass
        outputs = forward_step(cfg, model, batch)
        mlm_loss, itm_loss = 0, 0
        if cfg.use_mlm:
            mlm_loss = outputs["mlm_loss"].mean()
            task2loss["mlm"](mlm_loss.item())
        if cfg.use_itm:
            itm_loss = outputs["itm_loss"].mean()
            task2loss["itm"](itm_loss.item())

        loss = mlm_loss + itm_loss
        task2loss["loss"](loss.item())

        delay_unscale = (step + 1) % cfg.gradient_accumulation_steps != 0
        with amp.scale_loss(
                loss, optimizer, delay_unscale=delay_unscale) as scaled_loss:
            scaled_loss.backward()
            zero_none_grad(model)
            optimizer.synchronize()

        # optimizer
        if (step + 1) % cfg.gradient_accumulation_steps == 0:
            global_step += 1
            TB_LOGGER.log_scalar_dict({l.name: l.val
                                       for l in task2loss.values()
                                       if l.val is not None})
            n_epoch = int(1. * n_gpu * cfg.gradient_accumulation_steps *
                          global_step / train_loader.n_batches_in_epoch)
            # learning rate scheduling transformer
            lr_this_step_transformer = get_lr_sched(
                global_step, cfg.decay, cfg.learning_rate,
                cfg.num_train_steps, warmup_ratio=cfg.warmup_ratio,
                decay_epochs=cfg.step_decay_epochs,
                multi_step_epoch=n_epoch)

            # learning rate scheduling cnn
            lr_this_step_cnn = get_lr_sched(
                global_step, cfg.cnn_lr_decay, cfg.cnn_learning_rate,
                cfg.num_train_steps, warmup_ratio=cfg.warmup_ratio,
                decay_epochs=cfg.cnn_step_decay_epochs,
                multi_step_epoch=n_epoch)

            # Hardcoded param group length
            assert len(optimizer.param_groups) == 8
            for pg_n, param_group in enumerate(optimizer.param_groups):
                if pg_n in [0, 1]:
                    param_group['lr'] = (
                        cfg.transformer_lr_mul * lr_this_step_transformer)
                elif pg_n in [2, 3]:
                    param_group['lr'] = lr_this_step_transformer
                elif pg_n in [4, 5]:
                    param_group['lr'] = (
                        cfg.cnn_lr_mul * lr_this_step_cnn)
                else:
                    param_group['lr'] = lr_this_step_cnn
            TB_LOGGER.add_scalar(
                "train/lr_transformer", lr_this_step_transformer, global_step)
            TB_LOGGER.add_scalar(
                "train/lr_cnn", lr_this_step_cnn, global_step)

            # update model params
            if cfg.grad_norm != -1:
                grad_norm = clip_grad_norm_(
                    amp.master_params(optimizer), cfg.grad_norm)
                TB_LOGGER.add_scalar("train/grad_norm", grad_norm, global_step)
            TB_LOGGER.step()

            # Check if there is None grad
            none_grads = [
                p[0] for p in model.named_parameters()
                if p[1].requires_grad and p[1].grad is None]
            assert len(none_grads) == 0, f"{none_grads}"

            with optimizer.skip_synchronize():
                optimizer.step()
                optimizer.zero_grad()
            restorer.step()
            pbar.update(1)

            # checkpoint
            if global_step % cfg.valid_steps == 0:
                LOGGER.info(f'Step {global_step}: start validation')
                validate(model, val_loaders, cfg)
                model_saver.save(step=global_step, model=model)
        if global_step >= cfg.num_train_steps:
            break
        if cfg.debug and global_step >= debug_step:
            break

    if global_step % cfg.valid_steps != 0:
        LOGGER.info(f'Step {global_step}: start validation')
        validate(model, val_loaders, cfg)
        model_saver.save(step=global_step, model=model)
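# --- Illustrative sketch (not part of the original code) --------------------
# The step bookkeeping above is easy to misread inline. The helper below
# re-derives num_train_steps / valid_steps / actual_num_valid with the same
# formulas used in the pretraining loop, on made-up default numbers, so the
# rounding behaviour can be checked in isolation. The function name and all
# default values are hypothetical; `math` is assumed imported as in the
# surrounding file.
def _sketch_step_bookkeeping(n_batches_in_epoch=10000, num_train_epochs=10,
                             n_gpu=8, gradient_accumulation_steps=2,
                             num_valid=20, min_valid_steps=100):
    # optimizer updates for the whole run, spread over GPUs and accumulation
    num_train_steps = int(math.ceil(
        1. * n_batches_in_epoch * num_train_epochs /
        (n_gpu * gradient_accumulation_steps)))
    # validate roughly `num_valid` times, rounded up to a multiple of
    # `min_valid_steps`
    valid_steps = int(math.ceil(
        1. * num_train_steps / num_valid / min_valid_steps)) * min_valid_steps
    # +1 accounts for the extra validation after the loop exits
    actual_num_valid = int(math.floor(1. * num_train_steps / valid_steps)) + 1
    return num_train_steps, valid_steps, actual_num_valid

# e.g. _sketch_step_bookkeeping() -> (6250, 400, 16) with the defaults above:
# validation fires at steps 400, 800, ..., 6000 (15 times) plus once more
# after training stops at step 6250.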
def start_training(cfg):
    set_random_seed(cfg.seed)

    n_gpu = hvd.size()
    cfg.n_gpu = n_gpu
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    if hvd.rank() != 0:
        LOGGER.disabled = True
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(
                    device, n_gpu, hvd.rank(), bool(cfg.fp16)))

    model = setup_model(cfg, device=device)
    model.train()
    optimizer = setup_e2e_optimizer(model, cfg)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.none
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters(),
        compression=compression)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    model, optimizer = amp.initialize(
        model, optimizer, enabled=cfg.fp16, opt_level='O2',
        keep_batchnorm_fp32=True)

    # prepare data
    tokenizer = BertTokenizerFast.from_pretrained(cfg.tokenizer_dir)
    train_loader, val_loader = setup_dataloaders(cfg, tokenizer)
    eval_loader = mk_video_ret_eval_dataloader(
        anno_path=cfg.val_datasets[0].txt,
        lmdb_dir=cfg.val_datasets[0].img,
        cfg=cfg, tokenizer=tokenizer,
    )

    # compute the number of steps and update cfg
    total_n_examples = len(train_loader.dataset) * cfg.max_n_example_per_group
    total_train_batch_size = int(
        n_gpu * cfg.train_batch_size *
        cfg.gradient_accumulation_steps * cfg.max_n_example_per_group)
    cfg.num_train_steps = int(math.ceil(
        1. * cfg.num_train_epochs * total_n_examples / total_train_batch_size))
    cfg.valid_steps = int(math.ceil(
        1. * cfg.num_train_steps / cfg.num_valid /
        cfg.min_valid_steps)) * cfg.min_valid_steps
    actual_num_valid = int(math.floor(
        1. * cfg.num_train_steps / cfg.valid_steps)) + 1

    # restore
    restorer = TrainingRestorer(cfg, model, optimizer)
    global_step = restorer.global_step
    TB_LOGGER.global_step = global_step
    if hvd.rank() == 0:
        LOGGER.info("Saving training meta...")
        save_training_meta(cfg)
        path = join(cfg.output_dir, 'log', "detectron2_model_cfg.yaml")
        with open(path, "w") as f:
            f.write(model.cnn.config_file)
        LOGGER.info("Saving training done...")
        TB_LOGGER.create(join(cfg.output_dir, 'log'))
        pbar = tqdm(total=cfg.num_train_steps)
        model_saver = ModelSaver(join(cfg.output_dir, "ckpt"))
        add_log_to_file(join(cfg.output_dir, "log", "log.txt"))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()
        restorer = NoOp()

    if global_step > 0:
        pbar.update(global_step)

    LOGGER.info(cfg)
    LOGGER.info("Starting training...")
    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info(f"  Single-GPU Non-Accumulated batch size = {cfg.train_batch_size}")
    LOGGER.info(f"  max_n_example_per_group = {cfg.max_n_example_per_group}")
    LOGGER.info(f"  Accumulate steps = {cfg.gradient_accumulation_steps}")
    LOGGER.info(f"  Total batch size = #GPUs * Single-GPU batch size * "
                f"max_n_example_per_group * Accumulate steps [Image] = {total_train_batch_size}")
    LOGGER.info(f"  Total #epochs = {cfg.num_train_epochs}")
    LOGGER.info(f"  Total #steps = {cfg.num_train_steps}")
    LOGGER.info(f"  Validate every {cfg.valid_steps} steps, in total {actual_num_valid} times")

    # quick hack for amp delay_unscale bug
    with optimizer.skip_synchronize():
        optimizer.zero_grad()
        if global_step == 0:
            optimizer.step()

    debug_step = 3
    running_loss = RunningMeter('train_loss')
    for step, batch in enumerate(InfiniteIterator(train_loader)):
        # forward pass
        del batch["caption_ids"]
        mini_batch = dict()
        for k, v in batch.items():
            if k != "visual_inputs":
                mini_batch[k] = v

        pool_method = cfg.score_agg_func
        # could be 1, where only a single clip is used
        num_clips = cfg.train_n_clips
        num_frm = cfg.num_frm
        # (B, T=num_clips*num_frm, C, H, W) --> (B, num_clips, num_frm, C, H, W)
        bsz = batch["visual_inputs"].shape[0]
        new_visual_shape = (bsz, num_clips, num_frm) + batch["visual_inputs"].shape[2:]
        visual_inputs = batch["visual_inputs"].view(*new_visual_shape)
        logits = []
        for clip_idx in range(num_clips):
            # (B, num_frm, C, H, W)
            mini_batch["visual_inputs"] = visual_inputs[:, clip_idx]
            mini_batch["n_examples_list"] = batch["n_examples_list"]
            outputs = forward_step(model, mini_batch, cfg)
            logits.append(outputs["logits"])
            # the losses are cross entropy and mse, no need to * num_labels

        logits = torch.stack(logits)  # (num_frm, B, 5)
        if pool_method == "mean":
            logits = logits.mean(0)  # (B, 5)
        elif pool_method == "max":
            logits = logits.max(0)[0]  # (B, 5)
        elif pool_method == "lse":
            logits = logits.permute(1, 0, 2).contiguous()  # (B, num_frm, 5), pooling will be done in CE
        else:
            raise ValueError(
                f"Invalid value for pool_method, "
                f"got {pool_method}, expect one of [`mean`, `max`, `lse`]")

        if pool_method == "lse":
            out = torch.logsumexp(logits.view(logits.shape[0], -1), dim=-1, keepdim=True) \
                - torch.logsumexp(logits, dim=1)
            loss = torch.gather(out, -1, batch["labels"].view(-1, 1))
        else:
            _, loss = model.transformer.calc_loss(
                logits, batch["labels"],
                sample_size=len(batch["n_examples_list"]))
        loss = loss.mean()
        running_loss(loss.item())

        # backward pass
        delay_unscale = (step + 1) % cfg.gradient_accumulation_steps != 0
        with amp.scale_loss(
                loss, optimizer, delay_unscale=delay_unscale) as scaled_loss:
            scaled_loss.backward()
            zero_none_grad(model)
            optimizer.synchronize()

        # optimizer
        if (step + 1) % cfg.gradient_accumulation_steps == 0:
            global_step += 1
            # learning rate scheduling
            n_epoch = int(1. * total_train_batch_size * global_step
                          / total_n_examples)
            # learning rate scheduling transformer
            lr_this_step_transformer = get_lr_sched(
                global_step, cfg.decay, cfg.learning_rate,
                cfg.num_train_steps, warmup_ratio=cfg.warmup_ratio,
                decay_epochs=cfg.step_decay_epochs,
                multi_step_epoch=n_epoch)

            # learning rate scheduling cnn
            lr_this_step_cnn = get_lr_sched(
                global_step, cfg.cnn_lr_decay, cfg.cnn_learning_rate,
                cfg.num_train_steps, warmup_ratio=cfg.warmup_ratio,
                decay_epochs=cfg.cnn_step_decay_epochs,
                multi_step_epoch=n_epoch)

            # Hardcoded param group length
            assert len(optimizer.param_groups) == 8
            for pg_n, param_group in enumerate(optimizer.param_groups):
                if pg_n in [0, 1]:
                    param_group['lr'] = (
                        cfg.transformer_lr_mul * lr_this_step_transformer)
                elif pg_n in [2, 3]:
                    param_group['lr'] = lr_this_step_transformer
                elif pg_n in [4, 5]:
                    param_group['lr'] = (
                        cfg.cnn_lr_mul * lr_this_step_cnn)
                else:
                    param_group['lr'] = lr_this_step_cnn
            TB_LOGGER.add_scalar(
                "train/lr_transformer", lr_this_step_transformer, global_step)
            TB_LOGGER.add_scalar(
                "train/lr_cnn", lr_this_step_cnn, global_step)
            TB_LOGGER.add_scalar('train/loss', running_loss.val, global_step)

            # update model params
            if cfg.grad_norm != -1:
                grad_norm = clip_grad_norm_(
                    amp.master_params(optimizer), cfg.grad_norm)
                TB_LOGGER.add_scalar("train/grad_norm", grad_norm, global_step)
            TB_LOGGER.step()

            # Check if there is None grad
            none_grads = [
                p[0] for p in model.named_parameters()
                if p[1].requires_grad and p[1].grad is None]
            assert len(none_grads) == 0, f"{none_grads}"

            with optimizer.skip_synchronize():
                optimizer.step()
                optimizer.zero_grad()
            restorer.step()
            pbar.update(1)

            # checkpoint
            if global_step % cfg.valid_steps == 0:
                LOGGER.info(f'Step {global_step}: start validation')
                validate(model, val_loader, eval_loader, cfg, global_step,
                         eval_filepath=cfg.val_datasets[0].txt)
                model_saver.save(step=global_step, model=model)
        if global_step >= cfg.num_train_steps:
            break
        if cfg.debug and global_step >= debug_step:
            break

    if global_step % cfg.valid_steps != 0:
        LOGGER.info(f'Step {global_step}: start validation')
        validate(model, val_loader, eval_loader, cfg, global_step,
                 eval_filepath=cfg.val_datasets[0].txt)
        model_saver.save(step=global_step, model=model)
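# --- Illustrative sketch (not part of the original code) --------------------
# The "lse" branch above computes cross-entropy with log-sum-exp pooling over
# clips by hand. The helper below is a minimal, self-contained check (random
# data, hypothetical shapes and name) that the manual formula matches plain
# F.cross_entropy applied to logsumexp-pooled logits; it is an illustration
# of the math, not part of the training code.
def _sketch_check_lse_pooling(bsz=4, num_clips=3, num_labels=5):
    import torch
    import torch.nn.functional as F
    logits = torch.randn(bsz, num_clips, num_labels)  # (B, num_clips, num_labels)
    labels = torch.randint(num_labels, (bsz,))        # (B,)

    # Manual formula, as in the training loop above: per-label negative
    # log-likelihood under log-sum-exp pooling over clips.
    out = torch.logsumexp(logits.view(bsz, -1), dim=-1, keepdim=True) \
        - torch.logsumexp(logits, dim=1)              # (B, num_labels)
    manual_loss = torch.gather(out, -1, labels.view(-1, 1)).squeeze(-1)

    # Equivalent: pool clip logits with logsumexp, then standard cross-entropy.
    pooled = torch.logsumexp(logits, dim=1)           # (B, num_labels)
    reference_loss = F.cross_entropy(pooled, labels, reduction='none')

    assert torch.allclose(manual_loss, reference_loss, atol=1e-5)
    return manual_loss.mean()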
def start_training(cfg):
    set_random_seed(cfg.seed)

    n_gpu = hvd.size()
    cfg.n_gpu = n_gpu
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    if hvd.rank() != 0:
        LOGGER.disabled = True
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(
                    device, n_gpu, hvd.rank(), bool(cfg.fp16)))

    model = setup_model(cfg, device=device)
    model.train()
    optimizer = setup_e2e_optimizer(model, cfg)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.none
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters(),
        compression=compression)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    model, optimizer = amp.initialize(
        model, optimizer, enabled=cfg.fp16, opt_level='O2',
        keep_batchnorm_fp32=True)

    # prepare data
    tokenizer = BertTokenizerFast.from_pretrained(cfg.tokenizer_dir)
    train_loader, val_loader = setup_dataloaders(cfg, tokenizer)

    # compute the number of steps and update cfg
    total_n_examples = len(train_loader.dataset) * cfg.max_n_example_per_group
    total_train_batch_size = int(
        n_gpu * cfg.train_batch_size *
        cfg.gradient_accumulation_steps * cfg.max_n_example_per_group)
    cfg.num_train_steps = int(math.ceil(
        1. * cfg.num_train_epochs * total_n_examples / total_train_batch_size))
    cfg.valid_steps = int(math.ceil(
        1. * cfg.num_train_steps / cfg.num_valid /
        cfg.min_valid_steps)) * cfg.min_valid_steps
    actual_num_valid = int(math.floor(
        1. * cfg.num_train_steps / cfg.valid_steps)) + 1

    # restore
    restorer = TrainingRestorer(cfg, model, optimizer)
    global_step = restorer.global_step
    TB_LOGGER.global_step = global_step
    if hvd.rank() == 0:
        LOGGER.info("Saving training meta...")
        save_training_meta(cfg)
        path = join(cfg.output_dir, 'log', "detectron2_model_cfg.yaml")
        with open(path, "w") as f:
            f.write(model.cnn.config_file)
        LOGGER.info("Saving training done...")
        TB_LOGGER.create(join(cfg.output_dir, 'log'))
        model_saver = ModelSaver(join(cfg.output_dir, "ckpt"))
        add_log_to_file(join(cfg.output_dir, "log", "log.txt"))
        pbar = tqdm(total=cfg.num_train_steps)
    else:
        LOGGER.disabled = True
        model_saver = NoOp()
        restorer = NoOp()
        pbar = NoOp()

    if global_step > 0:
        pbar.update(global_step)

    LOGGER.info(cfg)
    LOGGER.info("Starting training...")
    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info(f"  Single-GPU Non-Accumulated batch size = {cfg.train_batch_size}")
    LOGGER.info(f"  max_n_example_per_group = {cfg.max_n_example_per_group}")
    LOGGER.info(f"  Accumulate steps = {cfg.gradient_accumulation_steps}")
    LOGGER.info(f"  Total batch size = #GPUs * Single-GPU batch size * "
                f"max_n_example_per_group * Accumulate steps [Image] = {total_train_batch_size}")
    LOGGER.info(f"  Total #epochs = {cfg.num_train_epochs}")
    LOGGER.info(f"  Total #steps = {cfg.num_train_steps}")
    LOGGER.info(f"  Validate every {cfg.valid_steps} steps, in total {actual_num_valid} times")

    # quick hack for amp delay_unscale bug
    with optimizer.skip_synchronize():
        optimizer.zero_grad()
        if global_step == 0:
            optimizer.step()

    debug_step = 3
    running_loss = RunningMeter('train_loss')
    for step, batch in enumerate(InfiniteIterator(train_loader)):
        # forward pass
        outputs, question_ids = forward_step(model, batch)
        loss = outputs["loss"].mean()
        loss = loss.float() * cfg.num_labels
        running_loss(loss.item())

        # backward pass
        delay_unscale = (step + 1) % cfg.gradient_accumulation_steps != 0
        with amp.scale_loss(
                loss, optimizer, delay_unscale=delay_unscale) as scaled_loss:
            scaled_loss.backward()
            zero_none_grad(model)
            optimizer.synchronize()

        # optimizer
        if (step + 1) % cfg.gradient_accumulation_steps == 0:
            global_step += 1
            TB_LOGGER.add_scalar('train/loss', running_loss.val, global_step)
            n_epoch = int(1. * total_train_batch_size * global_step
                          / total_n_examples)
            # learning rate scheduling transformer
            lr_this_step_transformer = get_lr_sched(
                global_step, cfg.decay, cfg.learning_rate,
                cfg.num_train_steps, warmup_ratio=cfg.warmup_ratio,
                decay_epochs=cfg.step_decay_epochs,
                multi_step_epoch=n_epoch)

            # learning rate scheduling cnn
            lr_this_step_cnn = get_lr_sched(
                global_step, cfg.cnn_lr_decay, cfg.cnn_learning_rate,
                cfg.num_train_steps, warmup_ratio=cfg.warmup_ratio,
                decay_epochs=cfg.cnn_step_decay_epochs,
                multi_step_epoch=n_epoch)

            # Hardcoded param group length
            assert len(optimizer.param_groups) == 8
            for pg_n, param_group in enumerate(optimizer.param_groups):
                if pg_n in [0, 1]:
                    param_group['lr'] = (
                        cfg.transformer_lr_mul * lr_this_step_transformer)
                elif pg_n in [2, 3]:
                    param_group['lr'] = lr_this_step_transformer
                elif pg_n in [4, 5]:
                    param_group['lr'] = (
                        cfg.cnn_lr_mul * lr_this_step_cnn)
                else:
                    param_group['lr'] = lr_this_step_cnn
            TB_LOGGER.add_scalar(
                "train/lr_transformer", lr_this_step_transformer, global_step)
            TB_LOGGER.add_scalar(
                "train/lr_cnn", lr_this_step_cnn, global_step)

            # update model params
            if cfg.grad_norm != -1:
                grad_norm = clip_grad_norm_(
                    amp.master_params(optimizer), cfg.grad_norm)
                TB_LOGGER.add_scalar("train/grad_norm", grad_norm, global_step)
            TB_LOGGER.step()

            # Check if there is None grad
            none_grads = [
                p[0] for p in model.named_parameters()
                if p[1].requires_grad and p[1].grad is None]
            assert len(none_grads) == 0, f"{none_grads}"

            with optimizer.skip_synchronize():
                optimizer.step()
                optimizer.zero_grad()
            restorer.step()
            pbar.update(1)

            # checkpoint
            if global_step % cfg.valid_steps == 0:
                LOGGER.info(f'Step {global_step}: start validation')
                vqa_results = validate(model, val_loader, cfg, global_step)
                model_saver.save(step=global_step, model=model)
        if global_step >= cfg.num_train_steps:
            break
        if cfg.debug and global_step >= debug_step:
            break

    if global_step % cfg.valid_steps != 0:
        LOGGER.info(f'Step {global_step}: start validation')
        vqa_results = validate(model, val_loader, cfg, global_step)
        model_saver.save(step=global_step, model=model)
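# --- Illustrative sketch (not part of the original code) --------------------
# All three training loops above share the same gradient-accumulation control
# flow: backward on every micro-batch, but only step the optimizer (and
# advance global_step) every cfg.gradient_accumulation_steps micro-batches.
# The plain-PyTorch analogue below shows just that control flow, without
# Horovod or apex/amp; the function name, model, and data are hypothetical,
# and loss scaling conventions are deliberately left out.
def _sketch_gradient_accumulation(accum_steps=4, num_micro_batches=12):
    import torch
    model = torch.nn.Linear(8, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    global_step = 0
    optimizer.zero_grad()
    for step in range(num_micro_batches):
        x = torch.randn(16, 8)
        y = torch.randint(2, (16,))
        loss = torch.nn.functional.cross_entropy(model(x), y)
        loss.backward()                 # gradients accumulate in .grad
        if (step + 1) % accum_steps == 0:
            optimizer.step()            # one update per accumulation window
            optimizer.zero_grad()
            global_step += 1            # mirrors the global_step bookkeeping above
    return global_step                  # 3 updates for 12 micro-batches / 4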