def train(self, train_tuple: DataTuple, eval_tuple: DataTuple):
    train_ld = train_tuple.loader

    # Optimizer
    batch_per_epoch = len(train_ld)
    t_total = int(batch_per_epoch * args.epochs // args.acc)
    warmup_ratio = 0.05
    warmup_iters = int(t_total * warmup_ratio)
    print("Batch per epoch: %d" % batch_per_epoch)
    print("Total Iters: %d" % t_total)
    print("Warm up Iters: %d" % warmup_iters)

    optim = AdamW(self.model.parameters(), lr=args.lr)
    scheduler = get_linear_schedule_with_warmup(optim, warmup_iters, t_total)
    optim.zero_grad()

    # Tracking updates for gradient accumulation
    ups = 0

    # Train
    best_eval_loss = 9595.
    for epoch in range(args.epochs):
        # Train
        self.model.train()
        total_loss = 0.
        total_losses = 0.
        uid2ans = {}
        for batch in tqdm(train_ld, total=len(train_ld)):
            loss, losses, logit = self.train_batch(optim, scheduler, batch, ups)
            total_loss += loss
            total_losses += losses
            ups += 1

            if args.task_qa:
                score, label = logit.max(1)
                for datum, l in zip(batch, label.cpu().numpy()):
                    uid = datum.uid
                    ans = train_tuple.dataset.answer_table.id2ans(l)
                    uid2ans[uid] = ans

        print("The training loss for Epoch %d is %0.4f" % (epoch, total_loss / (batch_per_epoch * args.acc)))

        losses_str = "The losses are "
        # Somehow had to add [0] here, which is not in the original repo
        for name, loss in zip(LOSSES_NAME, total_losses[0]):
            losses_str += "%s: %0.4f " % (name, loss / batch_per_epoch)
        print(losses_str)

        # Save once halfway through training
        if epoch == 5:
            self.save("Epoch%02d" % (epoch + 1))

    self.save("LAST")
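The `train_batch` method called above is not shown in this section. Below is a minimal, hypothetical sketch of how it could implement the gradient accumulation implied by the `ups` counter and `args.acc`; the model interface (`self.model(*batch)` returning a scalar loss, the per-task losses, and the QA logits) and the gradient clipping are assumptions, not the repository's actual code.

# Hypothetical sketch of train_batch for the accumulating variant above.
# The self.model(*batch) call and its return signature are assumptions.
def train_batch(self, optim, scheduler, batch, ups):
    loss, losses, logit = self.model(*batch)      # assumed model interface
    (loss / args.acc).backward()                  # scale so accumulated grads match a full batch
    if (ups + 1) % args.acc == 0:                 # update weights only every args.acc batches
        nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
        optim.step()
        scheduler.step()
        optim.zero_grad()
    return loss.item(), losses.detach().cpu().numpy(), logit.detach()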
def train(self, train_tuple: DataTuple, eval_tuple: DataTuple):
    train_ld = train_tuple.loader

    # Optimizer
    batch_per_epoch = len(train_ld)
    t_total = int(batch_per_epoch * args.epochs)
    warmup_ratio = 0.05
    warmup_iters = int(t_total * warmup_ratio)
    print("Batch per epoch: %d" % batch_per_epoch)
    print("Total Iters: %d" % t_total)
    print("Warm up Iters: %d" % warmup_iters)

    optim = AdamW(self.model.parameters(), lr=args.lr)
    # scheduler = get_linear_schedule_with_warmup(optim, warmup_iters, t_total)
    # We use a cosine scheduler here, as it ends more smoothly than the linear one & we take the LAST model.
    scheduler = get_cosine_schedule_with_warmup(optim, warmup_iters, t_total)

    # Train
    best_eval_loss = 9595.
    for epoch in range(args.epochs):
        # Train
        self.model.train()
        total_loss = 0.
        total_losses = 0.
        uid2ans = {}
        for batch in tqdm(train_ld, total=len(train_ld)):
            loss, losses, logit = self.train_batch(optim, scheduler, batch)
            total_loss += loss
            total_losses += losses

            if args.task_qa:
                score, label = logit.max(1)
                for datum, l in zip(batch, label.cpu().numpy()):
                    uid = datum.uid
                    ans = train_tuple.dataset.answer_table.id2ans(l)
                    uid2ans[uid] = ans

        print("The training loss for Epoch %d is %0.4f" % (epoch, total_loss / batch_per_epoch))

        losses_str = "The losses are "
        # Somehow had to add [0] here, which is not in the original repo
        for name, loss in zip(LOSSES_NAME, total_losses[0]):
            losses_str += "%s: %0.4f " % (name, loss / batch_per_epoch)
        print(losses_str)

        if args.task_qa:
            train_tuple.evaluator.evaluate(uid2ans, pprint=True)

        # Save once halfway through training
        if epoch == 5:
            self.save("Epoch%02d" % (epoch + 1))

    self.save("LAST")
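The only optimization change in this variant is the switch from the linear to the cosine schedule. The standalone snippet below (with assumed step counts, not the values used in training) illustrates the point made in the inline comment: both schedules decay to zero after warmup, but the cosine curve flattens towards the end, so the LAST checkpoint comes from a region where the learning rate is barely changing.

# Standalone illustration with assumed values; not part of the training code.
import torch
from transformers import (get_linear_schedule_with_warmup,
                          get_cosine_schedule_with_warmup)

t_total, warmup_iters = 1000, 50          # assumed, for illustration only
param = torch.nn.Parameter(torch.zeros(1))

for name, factory in [("linear", get_linear_schedule_with_warmup),
                      ("cosine", get_cosine_schedule_with_warmup)]:
    optim = torch.optim.AdamW([param], lr=1e-5)
    sched = factory(optim, warmup_iters, t_total)
    lrs = []
    for _ in range(t_total):
        optim.step()
        sched.step()
        lrs.append(sched.get_last_lr()[0])
    # The tail of the cosine schedule is nearly flat, while the linear
    # schedule keeps decreasing at a constant rate until the final step.
    print(name, ["%.2e" % lr for lr in lrs[-5:]])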
def __init__(self):
    if args.train is not None:
        self.train_tuple = get_tuple(args.train, bs=args.batch_size, shuffle=True, drop_last=False)

    if args.valid is not None:
        valid_bsize = 2048 if args.multiGPU else 50
        self.valid_tuple = get_tuple(args.valid, bs=valid_bsize, shuffle=False, drop_last=False)
    else:
        self.valid_tuple = None

    # Select Model, X is default
    if args.model == "X":
        self.model = ModelX(args)
    elif args.model == "V":
        self.model = ModelV(args)
    elif args.model == "U":
        self.model = ModelU(args)
    elif args.model == "D":
        self.model = ModelD(args)
    elif args.model == 'O':
        self.model = ModelO(args)
    else:
        print(args.model, " is not implemented.")

    # Load pre-trained weights from paths
    if args.loadpre is not None:
        self.model.load(args.loadpre)

    # GPU options
    if args.multiGPU:
        self.model.lxrt_encoder.multi_gpu()
    self.model = self.model.cuda()

    # Losses and optimizer
    self.logsoftmax = nn.LogSoftmax(dim=1)
    self.nllloss = nn.NLLLoss()

    if args.train is not None:
        batch_per_epoch = len(self.train_tuple.loader)
        self.t_total = int(batch_per_epoch * args.epochs // args.acc)
        print("Total Iters: %d" % self.t_total)

    def is_backbone(n):
        if "encoder" in n:
            return True
        elif "embeddings" in n:
            return True
        elif "pooler" in n:
            return True
        print("F: ", n)
        return False

    no_decay = ['bias', 'LayerNorm.weight']
    params = list(self.model.named_parameters())
    if args.reg:
        optimizer_grouped_parameters = [
            {"params": [p for n, p in params if is_backbone(n)], "lr": args.lr},
            {"params": [p for n, p in params if not is_backbone(n)], "lr": args.lr * 500},
        ]
        for n, p in self.model.named_parameters():
            print(n)
        self.optim = AdamW(optimizer_grouped_parameters, lr=args.lr)
    else:
        optimizer_grouped_parameters = [
            {'params': [p for n, p in params if not any(nd in n for nd in no_decay)], 'weight_decay': args.wd},
            {'params': [p for n, p in params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        ]
        self.optim = AdamW(optimizer_grouped_parameters, lr=args.lr)

    if args.train is not None:
        self.scheduler = get_linear_schedule_with_warmup(self.optim, self.t_total * 0.1, self.t_total)

    self.output = args.output
    os.makedirs(self.output, exist_ok=True)

    # SWA Method:
    if args.contrib:
        self.optim = SWA(self.optim, swa_start=self.t_total * 0.75, swa_freq=5, swa_lr=args.lr)
    if args.swa:
        self.swa_model = AveragedModel(self.model)
        self.swa_start = self.t_total * 0.75
        self.swa_scheduler = SWALR(self.optim, swa_lr=args.lr)
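`__init__` only constructs the SWA objects; how they are consumed is not shown in this section. The sketch below follows the standard torch.optim.swa_utils and torchcontrib recipes under the assumption that the (unshown) training loop tracks a step counter; `global_step` and `train_loader` are illustrative names, not identifiers from the repository.

# Hypothetical use of the SWA objects created above; global_step and
# train_loader are assumed names from the (unshown) training loop.
if args.swa:
    if global_step >= self.swa_start:
        self.swa_model.update_parameters(self.model)   # fold current weights into the running average
        self.swa_scheduler.step()                      # hold the constant SWA learning rate
    else:
        self.scheduler.step()                          # regular warmup + linear decay

# After training:
if args.contrib:
    self.optim.swap_swa_sgd()                          # torchcontrib: swap the averaged weights into the model
if args.swa:
    # Recompute BatchNorm statistics for the averaged model before evaluating/saving it.
    torch.optim.swa_utils.update_bn(train_loader, self.swa_model)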