def main(opts):
    """Horovod-distributed training entry point for the Meme classifier.

    Sets up per-rank CUDA devices, builds train/val loaders over the meme
    JSON + Faster-RCNN feature data, trains with apex amp (O2) and
    gradient accumulation, logs ROC-AUC/accuracy per epoch, and
    checkpoints via ``model_saver`` (rank 0 only).

    Args:
        opts: parsed argparse namespace; fields used here include
            fp16, gradient_accumulation_steps, seed, output_dir,
            conf_th, max_bb, min_bb, num_bb, compressed_db,
            train_batch_size, inf_minibatch_size, n_workers, checkpoint,
            model_config, num_train_steps, grad_norm and epoch.
    """
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(
                    device, n_gpu, hvd.rank(), opts.fp16))
    # BUG FIX: removed stray `device = torch.device("cuda:1")` which
    # clobbered the per-rank device above and pinned every worker to GPU 1.

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                             opts.gradient_accumulation_steps))

    set_random_seed(opts.seed)

    if hvd.rank() == 0:
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        os.makedirs(join(opts.output_dir, 'ckpt'))
        save_training_meta(opts)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
        # store ITM predictions
        os.makedirs(join(opts.output_dir, 'results_val'))
        os.makedirs(join(opts.output_dir, 'results_test'))
        os.makedirs(join(opts.output_dir, 'results_train'))
    else:
        # non-zero ranks neither log nor save checkpoints
        LOGGER.disabled = True
        model_saver = NoOp()

    # load DBs and image dirs
    all_img_dbs = ImageLmdbGroup(opts.conf_th, opts.max_bb, opts.min_bb,
                                 opts.num_bb, opts.compressed_db)

    # train
    train_dataset = MemeAIDataset(json_path='/home/data/meme_json/train.json',
                                  npz_folder='/home/data/faster_cnn_feature/',
                                  mode='train')
    train_loader = DataLoader(train_dataset,
                              batch_size=opts.train_batch_size,
                              shuffle=True,
                              num_workers=opts.n_workers,
                              collate_fn=collate_fn)
    train_loader = PrefetchLoader(train_loader)

    # val
    val_dataset = MemeAIDataset(json_path='/home/data/meme_json/dev.json',
                                npz_folder='/home/data/faster_cnn_feature/',
                                mode='val')
    val_loader = DataLoader(val_dataset,
                            batch_size=opts.inf_minibatch_size,
                            shuffle=False,
                            num_workers=opts.n_workers,
                            collate_fn=collate_fn)
    val_loader = PrefetchLoader(val_loader)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    model = Meme.from_pretrained(
        opts.model_config, state_dict=checkpoint, img_dim=IMG_DIM)
    model.init_output()  # pretrain ITM head is different from ranking head
    model.to(device)
    # make sure every process has same model parameters in the beginning
    # broadcast_tensors([p.data for p in model.parameters()], 0)
    # set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    model, optimizer = amp.initialize(model, optimizer,
                                      enabled=opts.fp16, opt_level='O2')
    global_step = 0
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    running_loss = RunningMeter('loss')
    model.train()

    n_examples = 0
    n_epoch = 0
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()

    for epoch in range(opts.epoch):
        print('epoch {}/ {}'.format(epoch, opts.epoch))
        pbar = tqdm(total=len(train_loader))
        model.train()
        preds = None
        gt = None
        for step, batch in enumerate(train_loader):
            x = batch[0]
            y = batch[1]
            n_examples += x['input_ids'].size(0)

            pred = model(x)
            # accumulate sigmoid scores / labels across the epoch for the
            # epoch-level ROC-AUC and accuracy below
            if preds is None:
                preds = torch.sigmoid(pred)
                gt = y
            else:
                preds = torch.cat((preds, torch.sigmoid(pred)), dim=0)
                gt = torch.cat((gt, y), dim=0)

            loss = F.binary_cross_entropy(torch.sigmoid(pred), y)
            # only unscale (and all-reduce) on real optimizer steps
            delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
            with amp.scale_loss(loss, optimizer,
                                delay_unscale=delay_unscale) as scaled_loss:
                scaled_loss.backward()
                if not delay_unscale:
                    # gather gradients from every processes
                    # do this before unscaling to make sure every process
                    # uses the same gradient scale
                    grads = [p.grad.data for p in model.parameters()
                             if p.requires_grad and p.grad is not None]
                    all_reduce_and_rescale_tensors(grads, float(1))
            running_loss(loss.item())

            # BUG FIX: this update block used to appear a second time,
            # unconditionally, after the `if` — stepping the optimizer on
            # every iteration and double-counting global_step, which
            # defeated gradient accumulation.  Keep only the gated copy.
            if (step + 1) % opts.gradient_accumulation_steps == 0:
                global_step += 1
                # learning rate scheduling
                lr_this_step = get_lr_sched(global_step, opts)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                TB_LOGGER.add_scalar('lr', lr_this_step, global_step)
                # log loss
                # NOTE: not gathered across GPUs for efficiency
                TB_LOGGER.add_scalar('loss', running_loss.val, global_step)
                TB_LOGGER.step()
                # update model params
                if opts.grad_norm != -1:
                    grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                                opts.grad_norm)
                    TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
                optimizer.step()
                optimizer.zero_grad()
            pbar.update(1)  # BUG FIX: the bar was created but never advanced

        with torch.no_grad():
            # BUG FIX: .detach() is required — `preds` carries grad history
            # and .numpy() on a grad-requiring tensor raises.
            preds = preds.detach().cpu().numpy().reshape(len(preds), )
            gt = gt.cpu().numpy()
            roc = roc_auc_score(gt, preds)
            acc = accuracy_score(gt, np.around(preds))
        # BUG FIX: keys were already 'train/...' and got prefixed again
        # below, producing 'train/train/roc' in TensorBoard.
        train_log = {'roc': roc, 'acc': acc}
        TB_LOGGER.log_scaler_dict(
            {f"train/{k}": v for k, v in train_log.items()})
        # monitor training throughput
        val_log = validate(model, val_loader)
        TB_LOGGER.log_scaler_dict(
            {f"valid/{k}": v for k, v in val_log.items()})
        LOGGER.info(train_log)
        LOGGER.info(val_log)
        model_saver.save(model, global_step)
        pbar.close()
def main(opts):
    """Single-process training entry point for the Meme classifier.

    Trains with plain Adam + BCE on sigmoid outputs, evaluates on the
    dev set after every epoch, and checkpoints the model once per epoch.

    NOTE(review): this definition shadows the earlier Horovod-based
    ``main`` in the same file — confirm which entry point is intended
    and delete the other.

    Args:
        opts: parsed argparse namespace; fields used here include
            output_dir, train_batch_size, inf_minibatch_size, n_workers,
            checkpoint, model_config, learning_rate and epoch.
    """
    # BUG FIX: `device` was used below (`model.to(device)`, `.to(device)`)
    # but never defined in this function; default to the first GPU when
    # one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # exist_ok avoids crashing when re-running into the same output dir
    os.makedirs(opts.output_dir, exist_ok=True)
    os.makedirs(join(opts.output_dir, 'ckpt'), exist_ok=True)
    model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))

    # train
    train_dataset = MemeAIDataset(json_path='/home/data/meme_json/train.json',
                                  npz_folder='/home/data/faster_cnn_feature/',
                                  mode='train')
    train_loader = DataLoader(train_dataset,
                              batch_size=opts.train_batch_size,
                              shuffle=True,
                              num_workers=opts.n_workers,
                              collate_fn=collate_fn)
    train_loader = PrefetchLoader(train_loader)

    # val
    val_dataset = MemeAIDataset(json_path='/home/data/meme_json/dev.json',
                                npz_folder='/home/data/faster_cnn_feature/',
                                mode='val')
    val_loader = DataLoader(val_dataset,
                            batch_size=opts.inf_minibatch_size,
                            shuffle=False,
                            num_workers=opts.n_workers,
                            collate_fn=collate_fn)
    val_loader = PrefetchLoader(val_loader)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    model = Meme.from_pretrained(opts.model_config,
                                 state_dict=checkpoint,
                                 img_dim=IMG_DIM)
    model.init_output()  # pretrain ITM head is different from ranking head
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=opts.learning_rate)

    for epoch in range(opts.epoch):
        print('epoch {}/ {}'.format(epoch, opts.epoch))
        pbar = tqdm(total=len(train_loader))
        model.train()
        preds = None
        gt = None
        for step, batch in enumerate(train_loader):
            x = batch[0]
            # move every model input onto the training device
            for key in ('input_ids', 'position_ids', 'img_feat',
                        'img_pos_feat', 'attn_masks', 'gather_index'):
                x[key] = x[key].to(device)
            y = batch[1].to(device)

            pred = model(x)
            # accumulate sigmoid scores / labels across the epoch for the
            # epoch-level ROC-AUC and accuracy below
            if preds is None:
                preds = torch.sigmoid(pred)
                gt = y
            else:
                preds = torch.cat((preds, torch.sigmoid(pred)), dim=0)
                gt = torch.cat((gt, y), dim=0)

            loss = F.binary_cross_entropy(torch.sigmoid(pred), y)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            pbar.update(1)

        model.eval()
        with torch.no_grad():
            # detach: preds carries grad history from the forward passes
            preds = preds.detach().cpu().numpy().reshape(len(preds), )
            gt = gt.cpu().numpy()
            roc = roc_auc_score(gt, preds)
            acc = accuracy_score(gt, np.around(preds))
        train_log = {'train/roc': roc, 'train/acc': acc}
        val_log = validate(model, val_loader)
        LOGGER.info(train_log)
        LOGGER.info(val_log)
        model_saver.save(model, epoch)
        pbar.close()