import math
import os
from os.path import join, exists

import torch
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

import horovod.torch as hvd
from apex import amp
from easydict import EasyDict as edict
from tqdm import tqdm
from transformers import BertTokenizerFast

# Project-local helpers used below (LOGGER, TB_LOGGER, ImageNorm, PrefetchLoader,
# MetaLoader, NoOp, RunningMeter, ModelSaver, TrainingRestorer, the mk_*_dataloader
# factories, setup_model, setup_e2e_optimizer, get_lr_sched, validate, forward_step,
# zero_none_grad, save_json, load_json, load_jsonl, all_gather_list,
# save_training_meta, add_log_to_file, set_random_seed, shared_configs) are assumed
# to be provided by this repo's own modules; their imports are omitted here to
# avoid guessing module paths.


def setup_dataloaders(cfg, tokenizer):
    LOGGER.info("Init. train_loader and val_loader...")
    train_loader = mk_video_ret_dataloader(
        anno_path=cfg.train_datasets[0].txt,
        lmdb_dir=cfg.train_datasets[0].img,
        cfg=cfg, tokenizer=tokenizer, is_train=True)
    val_loader = mk_video_ret_dataloader(
        anno_path=cfg.val_datasets[0].txt,
        lmdb_dir=cfg.val_datasets[0].img,
        cfg=cfg, tokenizer=tokenizer, is_train=False)
    img_norm = ImageNorm(mean=cfg.img_pixel_mean, std=cfg.img_pixel_std)
    train_loader = PrefetchLoader(train_loader, img_norm)
    val_loader = PrefetchLoader(val_loader, img_norm)
    return train_loader, val_loader
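# The ImageNorm/PrefetchLoader pair above applies per-channel mean/std
# normalization to the frames after the host-side dataloader. Below is a
# minimal sketch of what such a normalization module might look like; the
# class name ImageNormSketch is hypothetical and the repo's real ImageNorm
# implementation may differ.
class ImageNormSketch(torch.nn.Module):
    """Normalize a float image tensor with per-channel mean/std (illustrative)."""

    def __init__(self, mean, std):
        super().__init__()
        # shape (1, C, 1, 1) so it broadcasts over (B, C, H, W) batches
        self.register_buffer("mean", torch.tensor(mean).view(1, -1, 1, 1))
        self.register_buffer("std", torch.tensor(std).view(1, -1, 1, 1))

    def forward(self, images):
        # images: (B, C, H, W) or (B, T, C, H, W); broadcasting covers both,
        # with an extra leading dim added for the 5-D (video) case.
        if images.dim() == 5:
            return (images - self.mean.unsqueeze(0)) / self.std.unsqueeze(0)
        return (images - self.mean) / self.std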
def mk_video_ret_eval_dataloader(anno_path, lmdb_dir, cfg, tokenizer):
    """Build the evaluation dataloader for video retrieval.

    Each dataset item already contains one video paired with multiple text
    candidates, so the DataLoader runs with batch_size=1.

    Returns:
        PrefetchLoader wrapping a DataLoader whose batches each hold a single
        video and its candidate texts.
    """
    raw_datalist = load_jsonl(anno_path)
    datalist = mk_video_ret_datalist(raw_datalist, cfg)
    frm_sampling_strategy = cfg.frm_sampling_strategy
    if frm_sampling_strategy == "rand":
        frm_sampling_strategy = "middle"
    dataset = ClipBertVideoRetrievalEvalDataset(
        datalist=datalist,
        tokenizer=tokenizer,
        img_lmdb_dir=lmdb_dir,
        max_img_size=cfg.max_img_size,
        max_txt_len=cfg.max_txt_len,
        fps=cfg.fps,
        num_frm=cfg.num_frm,
        frm_sampling_strategy=frm_sampling_strategy,
        ensemble_n_clips=cfg.inference_n_clips,
    )
    sampler = DistributedSampler(
        dataset, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=False)
    retrieval_collator = VideoRetrievalCollator(
        tokenizer=tokenizer, max_length=cfg.max_txt_len)
    dataloader = DataLoader(
        dataset,
        batch_size=1,  # already batched in dataset
        shuffle=False,
        sampler=sampler,
        num_workers=cfg.n_workers,
        pin_memory=cfg.pin_mem,
        collate_fn=retrieval_collator.collate_batch)
    img_norm = ImageNorm(mean=cfg.img_pixel_mean, std=cfg.img_pixel_std)
    dataloader = PrefetchLoader(dataloader, img_norm)
    return dataloader
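# A self-contained sketch of how DistributedSampler shards an evaluation set
# across ranks, as used above. It substitutes a toy dataset for
# ClipBertVideoRetrievalEvalDataset and plain integers for Horovod ranks; it is
# illustrative only, not part of the repo.
from torch.utils.data import Dataset


class _ToyEvalDataset(Dataset):
    """Stands in for an already-batched eval dataset (one video per item)."""

    def __init__(self, n_items=10):
        self.n_items = n_items

    def __len__(self):
        return self.n_items

    def __getitem__(self, idx):
        return idx  # the real dataset returns a dict of tensors


def _shard_demo(num_replicas=2):
    # With shuffle=False each rank sees a disjoint, interleaved slice of the
    # dataset, which is why per-rank result files must be merged afterwards.
    dataset = _ToyEvalDataset()
    for rank in range(num_replicas):
        sampler = DistributedSampler(
            dataset, num_replicas=num_replicas, rank=rank, shuffle=False)
        loader = DataLoader(dataset, batch_size=1, sampler=sampler)
        print(f"rank {rank} sees items:", [b.item() for b in loader])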
def start_training():
    cfg = shared_configs.get_pretraining_args()
    set_random_seed(cfg.seed)

    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    if hvd.rank() != 0:
        LOGGER.disabled = True
    LOGGER.info(f"device: {device} n_gpu: {n_gpu}, "
                f"rank: {hvd.rank()}, 16-bits training: {cfg.fp16}")

    model = setup_model(cfg, device=device)
    model.train()
    optimizer = setup_e2e_optimizer(model, cfg)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.none
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters(),
        compression=compression)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    model, optimizer = amp.initialize(
        model, optimizer, enabled=cfg.fp16, opt_level='O2',
        keep_batchnorm_fp32=True)

    # prepare data
    tokenizer = BertTokenizerFast.from_pretrained(cfg.tokenizer_dir)
    train_loaders, val_loaders = setup_dataloaders(cfg, tokenizer)
    train_loader = MetaLoader(train_loaders,
                              accum_steps=cfg.gradient_accumulation_steps,
                              distributed=n_gpu > 1)
    img_norm = ImageNorm(mean=cfg.img_pixel_mean, std=cfg.img_pixel_std)
    train_loader = PrefetchLoader(train_loader, img_norm)
    val_loaders = {k: PrefetchLoader(v, img_norm)
                   for k, v in val_loaders.items()}

    # compute the number of steps and update cfg
    total_train_batch_size = int(
        n_gpu * cfg.train_batch_size *
        cfg.gradient_accumulation_steps * cfg.max_n_example_per_group)
    total_n_epochs = cfg.num_train_epochs
    cfg.num_train_steps = int(math.ceil(
        1. * train_loader.n_batches_in_epoch * total_n_epochs /
        (n_gpu * cfg.gradient_accumulation_steps)))
    cfg.valid_steps = int(math.ceil(
        1. * cfg.num_train_steps / cfg.num_valid /
        cfg.min_valid_steps)) * cfg.min_valid_steps
    actual_num_valid = int(math.floor(
        1. * cfg.num_train_steps / cfg.valid_steps)) + 1

    # restore
    restorer = TrainingRestorer(cfg, model, optimizer)
    global_step = restorer.global_step
    TB_LOGGER.global_step = global_step
    if hvd.rank() == 0:
        LOGGER.info("Saving training meta...")
        save_training_meta(cfg)
        path = join(cfg.output_dir, 'log', "detectron2_model_cfg.yaml")
        with open(path, "w") as f:
            f.write(model.cnn.config_file)
        LOGGER.info("Saving training done...")
        TB_LOGGER.create(join(cfg.output_dir, 'log'))
        pbar = tqdm(total=cfg.num_train_steps)
        model_saver = ModelSaver(join(cfg.output_dir, "ckpt"))
        add_log_to_file(join(cfg.output_dir, "log", "log.txt"))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()
        restorer = NoOp()

    if global_step > 0:
        pbar.update(global_step)

    LOGGER.info(cfg)
    LOGGER.info("Starting training...")
    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info(
        f"  Single-GPU Non-Accumulated batch size = {cfg.train_batch_size}")
    LOGGER.info(f"  max_n_example_per_group = {cfg.max_n_example_per_group}")
    LOGGER.info(f"  Accumulate steps = {cfg.gradient_accumulation_steps}")
    LOGGER.info(
        f"  Total batch size = #GPUs * Single-GPU batch size * "
        f"max_n_example_per_group * Accumulate steps [Image] = {total_train_batch_size}")
    LOGGER.info(
        f"  Total #batches - single epoch = {train_loader.n_batches_in_epoch}.")
    LOGGER.info(f"  Total #steps = {cfg.num_train_steps}")
    LOGGER.info(f"  Total #epochs = {total_n_epochs}.")
    LOGGER.info(
        f"  Validate every {cfg.valid_steps} steps, in total {actual_num_valid} times")

    # quick hack for amp delay_unscale bug
    with optimizer.skip_synchronize():
        optimizer.zero_grad()
        if global_step == 0:
            optimizer.step()
    debug_step = 5

    tasks = []
    for name, flag in zip(["mlm", "itm"], [cfg.use_mlm, cfg.use_itm]):
        if flag:
            tasks.append(name)
    task2loss = {t: RunningMeter(f'train_loss/{t}') for t in tasks}
    task2loss["loss"] = RunningMeter('train_loss/loss')

    for step, (task, batch) in enumerate(train_loader):
        # forward pass
        outputs = forward_step(cfg, model, batch)
        mlm_loss, itm_loss = 0, 0
        if cfg.use_mlm:
            mlm_loss = outputs["mlm_loss"].mean()
            task2loss["mlm"](mlm_loss.item())
        if cfg.use_itm:
            itm_loss = outputs["itm_loss"].mean()
            task2loss["itm"](itm_loss.item())

        loss = mlm_loss + itm_loss
        task2loss["loss"](loss.item())

        # keep gradients unscaled until the last micro-batch of an
        # accumulation window
        delay_unscale = (step + 1) % cfg.gradient_accumulation_steps != 0
        with amp.scale_loss(
                loss, optimizer, delay_unscale=delay_unscale) as scaled_loss:
            scaled_loss.backward()
            zero_none_grad(model)
            optimizer.synchronize()

        # optimizer update once per accumulation window
        if (step + 1) % cfg.gradient_accumulation_steps == 0:
            global_step += 1
            TB_LOGGER.log_scalar_dict({l.name: l.val
                                       for l in task2loss.values()
                                       if l.val is not None})
            n_epoch = int(1. * n_gpu * cfg.gradient_accumulation_steps *
                          global_step / train_loader.n_batches_in_epoch)
            # learning rate scheduling for the transformer
            lr_this_step_transformer = get_lr_sched(
                global_step, cfg.decay, cfg.learning_rate,
                cfg.num_train_steps, warmup_ratio=cfg.warmup_ratio,
                decay_epochs=cfg.step_decay_epochs,
                multi_step_epoch=n_epoch)

            # learning rate scheduling for the cnn
            lr_this_step_cnn = get_lr_sched(
                global_step, cfg.cnn_lr_decay, cfg.cnn_learning_rate,
                cfg.num_train_steps, warmup_ratio=cfg.warmup_ratio,
                decay_epochs=cfg.cnn_step_decay_epochs,
                multi_step_epoch=n_epoch)

            # Hardcoded param group length
            assert len(optimizer.param_groups) == 8
            for pg_n, param_group in enumerate(optimizer.param_groups):
                if pg_n in [0, 1]:
                    param_group['lr'] = (
                        cfg.transformer_lr_mul * lr_this_step_transformer)
                elif pg_n in [2, 3]:
                    param_group['lr'] = lr_this_step_transformer
                elif pg_n in [4, 5]:
                    param_group['lr'] = (
                        cfg.cnn_lr_mul * lr_this_step_cnn)
                else:
                    param_group['lr'] = lr_this_step_cnn
            TB_LOGGER.add_scalar(
                "train/lr_transformer", lr_this_step_transformer, global_step)
            TB_LOGGER.add_scalar(
                "train/lr_cnn", lr_this_step_cnn, global_step)

            # update model params
            if cfg.grad_norm != -1:
                grad_norm = clip_grad_norm_(
                    amp.master_params(optimizer), cfg.grad_norm)
                TB_LOGGER.add_scalar(
                    "train/grad_norm", grad_norm, global_step)
            TB_LOGGER.step()

            # Check if there is None grad
            none_grads = [
                p[0] for p in model.named_parameters()
                if p[1].requires_grad and p[1].grad is None]
            assert len(none_grads) == 0, f"{none_grads}"

            with optimizer.skip_synchronize():
                optimizer.step()
                optimizer.zero_grad()
            restorer.step()
            pbar.update(1)

            # checkpoint
            if global_step % cfg.valid_steps == 0:
                LOGGER.info(f'Step {global_step}: start validation')
                validate(model, val_loaders, cfg)
                model_saver.save(step=global_step, model=model)

        if global_step >= cfg.num_train_steps:
            break

        if cfg.debug and global_step >= debug_step:
            break

    if global_step % cfg.valid_steps != 0:
        LOGGER.info(f'Step {global_step}: start validation')
        validate(model, val_loaders, cfg)
        model_saver.save(step=global_step, model=model)
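# The training loop above recomputes the learning rate at every optimizer step
# through the project's get_lr_sched helper. The sketch below only illustrates
# the common "linear warmup, then linear decay" shape such a helper typically
# produces; the function name and formula are assumptions, not the repo's exact
# logic (which also supports step/multi-step decay via decay_epochs and
# multi_step_epoch).
def lr_warmup_linear_decay(global_step, base_lr, num_train_steps,
                           warmup_ratio=0.1):
    """Illustrative schedule: ramp up for warmup_ratio of training, then
    decay linearly to zero at num_train_steps."""
    warmup_steps = max(1, int(warmup_ratio * num_train_steps))
    if global_step < warmup_steps:
        return base_lr * global_step / warmup_steps
    remaining = max(0, num_train_steps - global_step)
    return base_lr * remaining / max(1, num_train_steps - warmup_steps)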
def start_inference(cfg):
    set_random_seed(cfg.seed)
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    if hvd.rank() != 0:
        LOGGER.disabled = True

    inference_res_dir = join(
        cfg.output_dir,
        f"results_{os.path.splitext(os.path.basename(cfg.inference_txt_db))[0]}/"
        f"step_{cfg.inference_model_step}_{cfg.inference_n_clips}_{cfg.score_agg_func}")

    if hvd.rank() == 0:
        os.makedirs(inference_res_dir, exist_ok=True)
        save_json(cfg, join(inference_res_dir, "raw_args.json"),
                  save_pretty=True)

    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(
                    device, n_gpu, hvd.rank(), bool(cfg.fp16)))

    # overwrite cfg with stored_cfg,
    # but skip keys containing the keyword 'inference'
    stored_cfg_path = join(cfg.output_dir, "log/args.json")
    stored_cfg = edict(load_json(stored_cfg_path))
    for k, v in cfg.items():
        if k in stored_cfg and "inference" not in k:
            setattr(cfg, k, stored_cfg[k])

    # setup models
    cfg.model_config = join(cfg.output_dir, "log/model_config.json")
    e2e_weights_path = join(
        cfg.output_dir, f"ckpt/model_step_{cfg.inference_model_step}.pt")
    cfg.e2e_weights_path = e2e_weights_path
    model = setup_model(cfg, device=device)
    model.eval()

    # FIXME separate scaling for each loss
    model = amp.initialize(model, enabled=cfg.fp16, opt_level='O2')

    global_step = 0
    # prepare data
    tokenizer = BertTokenizerFast.from_pretrained(cfg.tokenizer_dir)
    cfg.data_ratio = 1.
    val_loader = mk_tgif_qa_dataloader(
        task_type=cfg.task,
        anno_path=cfg.inference_txt_db,
        lmdb_dir=cfg.inference_img_db,
        cfg=cfg, tokenizer=tokenizer,
        is_train=False,
        return_label=False)
    img_norm = ImageNorm(mean=cfg.img_pixel_mean, std=cfg.img_pixel_std)
    val_loader = PrefetchLoader(val_loader, img_norm)

    LOGGER.info(cfg)
    LOGGER.info("Starting inference...")
    LOGGER.info(f"***** Running inference with {n_gpu} GPUs *****")
    LOGGER.info(f"  Batch size = {cfg.inference_batch_size}")

    LOGGER.info(f'Step {global_step}: start validation')
    qa_results, qa_scores = validate(
        model, val_loader, cfg, global_step,
        eval_score=True)  # cfg.inference_split == "val"

    if hvd.rank() == 0:
        save_json(cfg, join(inference_res_dir, "merged_args.json"),
                  save_pretty=True)
        save_json(qa_scores, join(inference_res_dir, "scores.json"),
                  save_pretty=True)

    # ###### Saving with Horovod ####################
    # dummy sync
    _ = None
    all_gather_list(_)
    if n_gpu > 1:
        # with retrial, as azure blob fails occasionally.
        max_save_load_trial = 10
        save_trial = 0
        while save_trial < max_save_load_trial:
            try:
                LOGGER.info(f"Save results trial NO. {save_trial}")
                save_json(
                    qa_results,
                    join(inference_res_dir, f"results_rank{hvd.rank()}.json"))
                break
            except Exception:
                save_trial += 1
    # dummy sync
    _ = None
    all_gather_list(_)
    # join results
    if n_gpu > 1 and hvd.rank() == 0:
        qa_results = []
        for rk in range(n_gpu):
            qa_results.extend(load_json(
                join(inference_res_dir, f"results_rank{rk}.json")))
        LOGGER.info('results joined')

    if hvd.rank() == 0:
        save_json(qa_results, join(inference_res_dir, "results_all.json"))
        LOGGER.info('all results written')
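# Both inference entry points repeat the same "each rank writes
# results_rank{r}.json, then rank 0 merges them into results_all.json"
# pattern. The hypothetical helper below factors that pattern out; it reuses
# the project's save_json/load_json/all_gather_list and the retry behaviour
# from the code above, but the helper itself is a sketch, not part of the repo.
def gather_and_merge_results(results, res_dir, n_gpu, rank,
                             max_save_load_trial=10):
    # dummy sync so every rank reaches this point before writing
    all_gather_list(None)
    if n_gpu > 1:
        # with retrial, as remote storage fails occasionally
        save_trial = 0
        while save_trial < max_save_load_trial:
            try:
                save_json(results,
                          join(res_dir, f"results_rank{rank}.json"))
                break
            except Exception:
                save_trial += 1
    # dummy sync so rank 0 only merges after all ranks have written
    all_gather_list(None)
    if n_gpu > 1 and rank == 0:
        results = []
        for rk in range(n_gpu):
            results.extend(
                load_json(join(res_dir, f"results_rank{rk}.json")))
    if rank == 0:
        save_json(results, join(res_dir, "results_all.json"))
    return results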
def start_inference(cfg):
    set_random_seed(cfg.seed)
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    if hvd.rank() != 0:
        LOGGER.disabled = True

    inference_res_dir = join(
        cfg.output_dir,
        f"results_{cfg.inference_split}"
        f"step_{cfg.inference_model_step}")

    if hvd.rank() == 0:
        os.makedirs(inference_res_dir, exist_ok=True)
        save_json(cfg, join(inference_res_dir, "raw_args.json"),
                  save_pretty=True)

    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(
                    device, n_gpu, hvd.rank(), bool(cfg.fp16)))

    # overwrite cfg with stored_cfg,
    # but skip keys containing the keyword 'inference'
    stored_cfg_path = join(cfg.output_dir, "log/args.json")
    stored_cfg = edict(load_json(stored_cfg_path))
    for k, v in cfg.items():
        if (k in stored_cfg and "inference" not in k
                and k != "output_dir"):
            value = stored_cfg[k]
            # FIXME hardcode changes
            if isinstance(value, str) and value.startswith("/data"):
                value = value.replace("/data", "/storage")
            setattr(cfg, k, value)

    # setup models
    cfg.model_config = join(cfg.output_dir, "log/model_config.json")
    cfg.detectron2_model_cfg = join(
        cfg.output_dir, "log/detectron2_model_cfg.yaml")
    e2e_weights_path = join(
        cfg.output_dir, f"ckpt/model_step_{cfg.inference_model_step}.pt")
    if exists(e2e_weights_path):
        cfg.e2e_weights_path = e2e_weights_path
    else:
        cfg.bert_weights_path = join(
            f"{cfg.output_dir}/ckpt",
            f"transformer_step_{cfg.inference_model_step}.pt")
        cfg.cnn_weights_path = join(
            cfg.output_dir, f"ckpt/cnn_step_{cfg.inference_model_step}.pt")
    model = setup_model(cfg, device=device)
    model.eval()

    # FIXME separate scaling for each loss
    model = amp.initialize(model, enabled=cfg.fp16, opt_level='O2')

    global_step = 0
    # prepare data
    tokenizer = BertTokenizerFast.from_pretrained(cfg.tokenizer_dir)
    cfg.data_ratio = 1.
    val_loader = mk_vqa_dataloader(
        anno_path=cfg.inference_txt_db,
        img_lmdb_dir=cfg.inference_img_db,
        cfg=cfg, tokenizer=tokenizer, is_train=False)
    img_norm = ImageNorm(mean=cfg.img_pixel_mean, std=cfg.img_pixel_std)
    val_loader = PrefetchLoader(val_loader, img_norm)

    LOGGER.info(cfg)
    LOGGER.info("Starting inference...")
    LOGGER.info(f"***** Running inference with {n_gpu} GPUs *****")
    LOGGER.info(f"  Batch size = {cfg.inference_batch_size}")

    LOGGER.info(f'Step {global_step}: start validation')
    vqa_results = validate(
        model, val_loader, cfg, global_step,
        eval_score=cfg.inference_split == "val")

    if hvd.rank() == 0:
        save_json(cfg, join(inference_res_dir, "merged_args.json"),
                  save_pretty=True)

    # ###### Saving with Horovod ####################
    # dummy sync
    _ = None
    all_gather_list(_)
    if n_gpu > 1:
        # with retrial, as azure blob fails occasionally.
        max_save_load_trial = 10
        save_trial = 0
        while save_trial < max_save_load_trial:
            try:
                LOGGER.info(f"Save results trial NO. {save_trial}")
                save_json(
                    vqa_results,
                    join(inference_res_dir, f"results_rank{hvd.rank()}.json"))
                break
            except Exception:
                save_trial += 1
    # dummy sync
    _ = None
    all_gather_list(_)
    # join results
    if n_gpu > 1 and hvd.rank() == 0:
        vqa_results = []
        for rk in range(n_gpu):
            vqa_results.extend(
                load_json(join(inference_res_dir, f"results_rank{rk}.json")))
        LOGGER.info('results joined')

    if hvd.rank() == 0:
        save_json(vqa_results, join(inference_res_dir, "results_all.json"))
        LOGGER.info('all results written')
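# The cfg-overwriting block inside both start_inference variants (reload the
# training-time args.json, keep 'inference*' keys and output_dir untouched,
# and remap any "/data" prefix to "/storage") could be factored into a small
# helper like the hypothetical one below. It mirrors the logic above rather
# than adding new behaviour.
def overwrite_cfg_from_stored(cfg, stored_cfg, remap=("/data", "/storage")):
    for k in list(cfg.keys()):
        if k not in stored_cfg or "inference" in k or k == "output_dir":
            continue
        value = stored_cfg[k]
        if isinstance(value, str) and value.startswith(remap[0]):
            value = value.replace(remap[0], remap[1])
        setattr(cfg, k, value)
    return cfg


# Usage sketch at the call site above:
#   stored_cfg = edict(load_json(join(cfg.output_dir, "log/args.json")))
#   cfg = overwrite_cfg_from_stored(cfg, stored_cfg)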