def validate(val_loader, model, criterion, epoch, log_freq=1, print_sum=True, device=None, stereo=True):
    """Run one validation epoch and report per-batch / per-epoch loss.

    Args:
        val_loader: yields (batch_images, batch_poses); when ``stereo`` is
            True both are lists of per-view tensors, otherwise plain tensors.
        model: network mapping batch_images to pose predictions.
        criterion: loss comparing predictions against batch_poses.
        epoch: current epoch index (used in log messages only).
        log_freq: print every ``log_freq`` batches; 0 disables batch logs.
        print_sum: print an end-of-epoch summary line.
        device: torch device the batches are moved to.
        stereo: whether batches are per-view lists (see above).
    """
    losses = AverageMeter()

    # set model to evaluation
    model.eval()

    with torch.no_grad():
        epoch_time = time.time()
        end = time.time()
        for idx, (batch_images, batch_poses) in enumerate(val_loader):
            data_time = time.time() - end

            if stereo:
                batch_images = [x.to(device) for x in batch_images]
                batch_poses = [x.to(device) for x in batch_poses]
            else:
                batch_images = batch_images.to(device)
                batch_poses = batch_poses.to(device)

            # compute model output
            out = model(batch_images)
            loss = criterion(out, batch_poses)

            # BUG FIX: `loss.data[0]` indexes a 0-dim tensor, which raises
            # IndexError on PyTorch >= 0.5; `.item()` is the supported scalar
            # accessor (and what the sibling eval/train loops already use).
            losses.update(
                loss.item(),
                len(batch_images) * batch_images[0].size(0)
                if stereo else batch_images.size(0))

            batch_time = time.time() - end
            end = time.time()

            if log_freq != 0 and idx % log_freq == 0:
                print('Val Epoch: {}\t'
                      'Time: {batch_time:.3f}\t'
                      'Data Time: {data_time:.3f}\t'
                      'Loss: {losses.val:.3f}\t'
                      'Avg Loss: {losses.avg:.3f}'.format(
                          epoch,
                          batch_time=batch_time,
                          data_time=data_time,
                          losses=losses))

    if print_sum:
        print(
            'Epoch: [{}]\tValidation Loss: {:.3f}\tEpoch time: {:.3f}'.format(
                epoch, losses.avg, (time.time() - epoch_time)))
def eval_model(val_loader, model, criterion, eval_metric, epoch, use_cuda):
    """Evaluate `model` over `val_loader`.

    Collects sigmoid scores and labels across the whole loader, then
    returns (average loss, eval_metric(labels, predictions)).
    """
    loss_meter = AverageMeter()
    all_preds = []
    all_labels = []

    # Inference mode: no parameter updates, no autograd bookkeeping.
    model.train(False)
    torch.set_grad_enabled(False)

    for batch in val_loader:
        feat_idx, feat_val, target = batch[0], batch[1], batch[2]
        if use_cuda:
            feat_idx = feat_idx.cuda()
            feat_val = feat_val.cuda()
            target = target.cuda()

        logits = model(feat_idx, feat_val)
        batch_loss = criterion(logits, target)

        # Accumulate probabilities/labels on the CPU for the final metric.
        probs = torch.sigmoid(logits).cpu()
        all_preds.extend(probs.data.numpy())
        all_labels.extend(target.data.numpy())

        loss_meter.update(batch_loss.item(), target.shape[0])

    return loss_meter.avg, eval_metric(all_labels, all_preds)
def train_epoch(train_loader, model, criterion, optimizer, epoch, use_cuda):
    """Train `model` for one epoch over `train_loader`.

    Shows a per-batch progress bar with the running average loss and logs
    one summary line at the end of the epoch.
    """
    loss_meter = AverageMeter()

    # Training mode with autograd enabled.
    model.train(True)
    torch.set_grad_enabled(True)

    for batch_idx, batch in enumerate(train_loader):
        feat_idx, feat_val, target = batch[0], batch[1], batch[2]
        if use_cuda:
            feat_idx = feat_idx.cuda()
            feat_val = feat_val.cuda()
            target = target.cuda()

        optimizer.zero_grad()
        logits = model(feat_idx, feat_val)
        batch_loss = criterion(logits, target)
        batch_loss.backward()
        optimizer.step()

        loss_meter.update(batch_loss.item(), target.shape[0])
        progress_bar(batch_idx, len(train_loader),
                     'batch {}, train loss {:.5f}'.format(batch_idx, loss_meter.avg))

    logging.info('Epoch: [{0}]\t Loss {loss.avg:.4f}\t'.format(epoch, loss=loss_meter))
def train(args):
    """Full ReID training loop: data, model, losses, optimizers, eval, checkpoints.

    Reads every hyper-parameter from ``args``; trains for ``args.max_epoch``
    epochs, periodically evaluates with ReIDEvaluator, and saves model /
    optimizer (and optional center-loss) checkpoints under
    ``args.save_dir/ckpts``.
    """
    # Round the batch size down to a multiple of num_instance — presumably
    # PK-style identity sampling needs whole groups per batch (TODO confirm).
    if args.batch_size % args.num_instance != 0:
        new_batch_size = (args.batch_size // args.num_instance) * args.num_instance
        print(
            f"given batch size is {args.batch_size} and num_instances is {args.num_instance}." +
            f"Batch size must be divided into {args.num_instance}. Batch size will be replaced into {new_batch_size}"
        )
        args.batch_size = new_batch_size

    # prepare dataset
    train_loader, val_loader, num_query, train_data_len, num_classes = make_data_loader(
        args)

    model = build_model(args, num_classes)
    print("model size: {:.5f}M".format(
        sum(p.numel() for p in model.parameters()) / 1e6))

    loss_fn, center_criterion = make_loss(args, num_classes)
    optimizer, optimizer_center = make_optimizer(args, model, center_criterion)

    if args.cuda:
        model = model.cuda()
        if args.amp:
            # NVIDIA apex mixed precision; with center loss both optimizers
            # must be registered with amp so their grads are scaled.
            if args.center_loss:
                model, [optimizer, optimizer_center] = \
                    amp.initialize(model, [optimizer, optimizer_center], opt_level="O1")
            else:
                model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        # Move any tensors already present in the optimizer state to the GPU.
        for state in optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.cuda()
        if args.center_loss:
            center_criterion = center_criterion.cuda()
            for state in optimizer_center.state.values():
                for k, v in state.items():
                    if isinstance(v, torch.Tensor):
                        state[k] = v.cuda()

    # Snapshots used later by save_weights().
    # NOTE(review): model.state_dict() tensors alias the live parameters, so
    # saved weights do track training, but optimizer.state_dict() taken here
    # is a one-time snapshot — verify the saved optimizer state is not stale.
    model_state_dict = model.state_dict()
    optim_state_dict = optimizer.state_dict()
    if args.center_loss:
        optim_center_state_dict = optimizer_center.state_dict()
        center_state_dict = center_criterion.state_dict()

    reid_evaluator = ReIDEvaluator(args, model, num_query)

    start_epoch = 0
    global_step = 0
    if args.pretrain != '':
        # load pre-trained model
        weights = torch.load(args.pretrain)
        model_state_dict = weights["state_dict"]
        model.load_state_dict(model_state_dict)
        if args.center_loss:
            center_criterion.load_state_dict(
                torch.load(args.pretrain.replace(
                    'model', 'center_param'))["state_dict"])
        if args.resume:
            # Resume epoch/step counters and optimizer state from the
            # sibling checkpoint files ('model' -> 'optimizer', ...).
            start_epoch = weights["epoch"]
            global_step = weights["global_step"]
            optimizer.load_state_dict(
                torch.load(args.pretrain.replace('model', 'optimizer'))["state_dict"])
            if args.center_loss:
                optimizer_center.load_state_dict(
                    torch.load(
                        args.pretrain.replace(
                            'model', 'optimizer_center'))["state_dict"])
    print(f'Start epoch: {start_epoch}, Start step: {global_step}')

    # Warmup + multi-step LR schedule; -1 means "fresh run" to the scheduler.
    scheduler = WarmupMultiStepLR(optimizer, args.steps, args.gamma,
                                  args.warmup_factor, args.warmup_step, "linear",
                                  -1 if start_epoch == 0 else start_epoch)

    current_epoch = start_epoch
    best_epoch = 0
    best_rank1 = 0
    best_mAP = 0
    if args.resume:
        # Establish the baseline metrics of the resumed checkpoint so
        # "best" tracking continues from where the previous run left off.
        rank, mAP = reid_evaluator.evaluate(val_loader)
        best_rank1 = rank[0]
        best_mAP = mAP
        best_epoch = current_epoch + 1

    batch_time = AverageMeter()
    total_losses = AverageMeter()

    model_save_dir = os.path.join(args.save_dir, 'ckpts')
    os.makedirs(model_save_dir, exist_ok=True)

    summary_writer = SummaryWriter(log_dir=os.path.join(
        args.save_dir, "tensorboard_log"),
                                   purge_step=global_step)

    def summary_loss(score, feat, labels, top_name='global'):
        # Sum the individual loss terms returned by loss_fn and log each to
        # TensorBoard; "accuracy" and "*dist*" entries are logged but
        # excluded from the summed training loss.
        loss = 0.0
        losses = loss_fn(score, feat, labels)
        for loss_name, loss_val in losses.items():
            if loss_name.lower() == "accuracy":
                # NOTE(review): the accuracy entry is logged under a
                # ".../triplet" tag — confirm this naming is intended.
                summary_writer.add_scalar(f"Score/{top_name}/triplet",
                                          loss_val, global_step)
                continue
            if "dist" in loss_name.lower():
                summary_writer.add_histogram(f"Distance/{loss_name}",
                                             loss_val, global_step)
                continue
            loss += loss_val
            summary_writer.add_scalar(f"losses/{top_name}/{loss_name}",
                                      loss_val, global_step)

        # Mean softmax probability assigned to the ground-truth class.
        ohe_labels = torch.zeros_like(score)
        ohe_labels.scatter_(1, labels.unsqueeze(1), 1.0)
        cls_score = torch.softmax(score, dim=1)
        cls_score = torch.sum(cls_score * ohe_labels, dim=1).mean()
        summary_writer.add_scalar(f"Score/{top_name}/X-entropy", cls_score,
                                  global_step)
        return loss

    def save_weights(file_name, eph, steps):
        # Persist model plus optimizer (and center-loss) state as sibling
        # files derived by replacing 'model' in the file name.
        torch.save(
            {
                "state_dict": model_state_dict,
                "epoch": eph + 1,
                "global_step": steps
            }, file_name)
        torch.save({"state_dict": optim_state_dict},
                   file_name.replace("model", "optimizer"))
        if args.center_loss:
            # NOTE(review): these two destinations look swapped — the resume
            # path above reads center params from '*center_param*' and the
            # center optimizer from '*optimizer_center*', but here
            # center_state_dict goes to 'optimizer_center' and
            # optim_center_state_dict to 'center_param'. Verify.
            torch.save({"state_dict": center_state_dict},
                       file_name.replace("model", "optimizer_center"))
            torch.save({"state_dict": optim_center_state_dict},
                       file_name.replace("model", "center_param"))

    # training start
    for epoch in range(start_epoch, args.max_epoch):
        model.train()
        t0 = time.time()
        for i, (inputs, labels, _, _) in enumerate(train_loader):
            if args.cuda:
                inputs = inputs.cuda()
                labels = labels.cuda()

            cls_scores, features = model(inputs, labels)

            # losses: global branch always; local branch optionally.
            total_loss = summary_loss(cls_scores[0], features[0], labels, 'global')
            if args.use_local_feat:
                total_loss += summary_loss(cls_scores[1], features[1], labels, 'local')

            optimizer.zero_grad()
            if args.center_loss:
                optimizer_center.zero_grad()

            # backward with global loss
            if args.amp:
                optimizers = [optimizer]
                if args.center_loss:
                    optimizers.append(optimizer_center)
                with amp.scale_loss(total_loss, optimizers) as scaled_loss:
                    scaled_loss.backward()
            else:
                # NOTE(review): detect_anomaly slows training noticeably —
                # presumably left enabled for debugging; confirm.
                with torch.autograd.detect_anomaly():
                    total_loss.backward()

            # optimization
            optimizer.step()
            if args.center_loss:
                # Undo the center-loss weighting on the center gradients so
                # the centers themselves are updated at full magnitude.
                for name, param in center_criterion.named_parameters():
                    try:
                        param.grad.data *= (1. / args.center_loss_weight)
                    except AttributeError:
                        # Parameter received no gradient this step.
                        continue
                optimizer_center.step()

            batch_time.update(time.time() - t0)
            total_losses.update(total_loss.item())

            # learning_rate
            current_lr = optimizer.param_groups[0]['lr']
            summary_writer.add_scalar("lr", current_lr, global_step)

            t0 = time.time()

            if (i + 1) % args.log_period == 0:
                print(
                    f"Epoch: [{epoch}][{i+1}/{train_data_len}] " +
                    f"Batch Time {batch_time.val:.3f} ({batch_time.mean:.3f}) " +
                    f"Total_loss {total_losses.val:.3f} ({total_losses.mean:.3f})"
                )
            global_step += 1

        print(
            f"Epoch: [{epoch}]\tEpoch Time {batch_time.sum:.3f} s\tLoss {total_losses.mean:.3f}\tLr {current_lr:.2e}"
        )

        # Evaluate every eval_period epochs and always on the final epoch.
        # NOTE(review): `and` binds tighter than `or`, so the final-epoch
        # evaluation runs even when eval_period <= 0 — confirm intended.
        if args.eval_period > 0 and (epoch + 1) % args.eval_period == 0 or (
                epoch + 1) == args.max_epoch:
            rank, mAP = reid_evaluator.evaluate(
                val_loader,
                mode="retrieval" if args.dataset_name == "cub200" else "reid")
            rank_string = ""
            for r in (1, 2, 4, 5, 8, 10, 16, 20):
                rank_string += f"Rank-{r:<3}: {rank[r-1]:.1%}"
                if r != 20:
                    rank_string += "  "
            summary_writer.add_text("Recall@K", rank_string, global_step)
            summary_writer.add_scalar("Rank-1", rank[0], (epoch + 1))

            rank1 = rank[0]
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_mAP = mAP
                best_epoch = epoch + 1

        if (epoch + 1) % args.save_period == 0 or (epoch + 1) == args.max_epoch:
            pth_file_name = os.path.join(
                model_save_dir, f"{args.backbone}_model_{epoch + 1}.pth.tar")
            save_weights(pth_file_name, eph=epoch, steps=global_step)

            # NOTE(review): `is_best` is only assigned inside the evaluation
            # branch above; a save epoch that is not an eval epoch would hit
            # NameError here — verify the period settings guarantee overlap.
            if is_best:
                pth_file_name = os.path.join(
                    model_save_dir, f"{args.backbone}_model_best.pth.tar")
                save_weights(pth_file_name, eph=epoch, steps=global_step)

        # end epoch
        current_epoch += 1
        batch_time.reset()
        total_losses.reset()
        torch.cuda.empty_cache()

        # update learning rate
        scheduler.step()

    print(f"Best rank-1 {best_rank1:.1%}, achived at epoch {best_epoch}")
    summary_writer.add_hparams(
        {
            "dataset_name": args.dataset_name,
            "triplet_dim": args.triplet_dim,
            "margin": args.margin,
            "base_lr": args.base_lr,
            "use_attn": args.use_attn,
            "use_mask": args.use_mask,
            "use_local_feat": args.use_local_feat
        }, {
            "mAP": best_mAP,
            "Rank1": best_rank1
        })
def test(model, loader_test, data_length, device, criterion, batch_size, print_logger, step, use_top5=False, verbose=False):
    """Evaluate `model` on a DALI-style test loader and return the top-1 average.

    The loader yields dicts with "data"/"label" keys and is reset after the
    pass so it can be iterated again.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    t1 = time.time()
    with torch.no_grad():
        # switch to evaluate mode
        model.eval()

        tick = time.time()
        for batch in loader_test:
            images = batch[0]["data"].to(device)
            labels = batch[0]["label"].squeeze().long().to(device)

            # forward pass and loss
            logits = model(images)
            batch_loss = criterion(logits, labels)

            # accuracy and loss bookkeeping
            acc1, acc5 = accuracy(logits, labels, topk=(1, 5))
            losses.update(batch_loss.item(), batch_size)
            top1.update(acc1[0], batch_size)
            top5.update(acc5[0], batch_size)

            # elapsed time per batch
            batch_time.update(time.time() - tick)
            tick = time.time()

        t2 = time.time()
        print_logger.info('Test Step [{0}]: '
                          'Loss {loss.avg:.4f} '
                          'Prec@1(1,5) {top1.avg:.2f}, {top5.avg:.2f} '
                          'Time {time}'.format(step,
                                               loss=losses,
                                               top1=top1,
                                               top5=top5,
                                               time=t2 - t1))

    loader_test.reset()
    return top1.avg
def finetune(model,loader_train,data_length,device,criterion,optimizer,scheduler,\
    print_freq, print_logger,step,batch_size,epochs=1,use_top5=False,verbose=True):
    """Fine-tune `model` for `epochs` passes over a DALI-style loader.

    Clips gradients to CLIP_VALUE each step, clears the custom
    `optimizer.moment` buffer after every update, and returns
    (model, final top-1 average).
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    best_acc = 0.

    # switch to train mode
    model.train()

    tick = time.time()
    t1 = time.time()
    num_iterations = int(data_length / batch_size)

    for epoch in range(epochs):
        scheduler.step(epoch)
        for batch_idx, batch in enumerate(loader_train):
            images = batch[0]["data"].to(device)
            labels = batch[0]["label"].squeeze().long().to(device)

            # measure data loading time
            data_time.update(time.time() - tick)

            optimizer.zero_grad()

            # forward pass and loss
            logits = model(images)
            batch_loss = criterion(logits, labels)

            # backward pass with gradient clipping
            batch_loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), CLIP_VALUE)
            optimizer.step()
            optimizer.moment = []

            # accuracy and loss bookkeeping
            acc1, acc5 = accuracy(logits.data, labels, topk=(1, 5))
            losses.update(batch_loss.item(), batch_size)
            top1.update(acc1.item(), batch_size)
            top5.update(acc5.item(), batch_size)

            # elapsed time per batch
            batch_time.update(time.time() - tick)
            tick = time.time()

            if batch_idx % print_freq == 0:
                print_logger.info(
                    'Finetune Step [{0}] Epoch [{1}|{2}] ({3}/{4}): '
                    'Loss {loss.avg:.4f} '
                    'Prec@1(1,5) {top1.avg:.2f}, {top5.avg:.2f} '.format(
                        step, epoch, epochs, batch_idx, num_iterations,
                        loss=losses, top1=top1, top5=top5))

        # track the best running accuracy seen so far
        if use_top5:
            if top5.avg > best_acc:
                best_acc = top5.avg
        else:
            if top1.avg > best_acc:
                best_acc = top1.avg

        loader_train.reset()

    return model, top1.avg
def finetune_one_batch(model, pre_params, loader_train, data_length, device,
                       criterion, optimizer, scheduler, print_freq, print_logger,
                       step, batch_size, epochs=1, use_top5=False, verbose=True):
    """Fine-tune on a (typically single-batch) loader and score parameter importance.

    After the update(s), each parameter's importance is computed as
    moment[i] * (pre_params[i] - params[i])**2 — the optimizer's second-moment
    estimate times the squared parameter displacement — and summed into
    `suminfo`. Relies on a custom optimizer exposing a `moment` attribute
    (one tensor per parameter, ordered like model.named_parameters());
    presumably not a stock torch.optim optimizer — TODO confirm.

    Returns:
        (model, suminfo, top1.avg): the updated model, the total importance
        score, and the final top-1 accuracy average.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    best_acc = 0.
    informance = 0.0  # becomes a list of per-parameter importance tensors below
    params = []       # post-update parameter snapshot, filled after each pass

    model.train()
    end = time.time()
    t1 = time.time()
    for epoch in range(epochs):
        if scheduler is not None:
            scheduler.step(epoch)
        for batch_idx, data in enumerate(loader_train, 0):
            # for i,(inputs,targets) in enumerate(loader_train,0):
            # pdb.set_trace()
            inputs, targets = data
            inputs = inputs.to(device)
            targets = targets.to(device)

            # measure data loading time
            data_time.update(time.time() - end)

            optimizer.zero_grad()

            # compute output
            output = model(inputs)
            loss = criterion(output, targets)

            # compute gradient (clipped to CLIP_VALUE)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), CLIP_VALUE)
            # reset the snapshot so it reflects only the latest update
            params = []
            optimizer.step()

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output.data, targets, topk=(1, 5))
            losses.update(loss.item(), batch_size)
            top1.update(prec1.item(), batch_size)
            top5.update(prec5.item(), batch_size)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            print_logger.info(
                'Finetune One Batch Step [{0}]: '
                'Loss {loss.avg:.4f} '
                'Prec@1(1,5) {top1.avg:.2f}, {top5.avg:.2f} '.format(
                    step, loss=losses, top1=top1, top5=top5))

            # Snapshot post-update parameters in declaration order, matching
            # the ordering assumed for pre_params and optimizer.moment.
            for _, p in model.named_parameters():
                params.append(p)
            moment = optimizer.moment
            informance = [0.0 for i in range(len(moment))]
            suminfo = 0.0
            # per-parameter importance: moment-weighted squared displacement
            for i in range(len(moment)):
                informance[i] = moment[i] * torch.pow(
                    (pre_params[i] - params[i]), 2)
            suminfo = 0.0
            for info in informance:
                suminfo += torch.sum(info).item()

        # track the best running accuracy seen so far
        if use_top5:
            if top5.avg > best_acc:
                best_acc = top5.avg
        else:
            if top1.avg > best_acc:
                best_acc = top1.avg

    # clear the custom moment buffer before handing the optimizer back
    optimizer.moment = []
    return model, suminfo, top1.avg
def train(train_loader, model, criterion, optimizer, epoch, max_epoch,
          log_freq=1, print_sum=True, poses_mean=None, poses_std=None,
          device=None, stereo=True):
    """Train a pose-regression model for one epoch.

    Args:
        train_loader: yields (batch_images, batch_poses); lists of per-view
            tensors when ``stereo`` is True, plain tensors otherwise.
        model: network mapping images to 7-D poses (xyz + quaternion).
        criterion: pose criterion; must expose learnable ``sx``/``sq``
            weights (their values are printed in the epoch summary).
        optimizer: optimizer stepped once per batch.
        epoch, max_epoch: current / total epoch indices (logging only).
        log_freq: print every ``log_freq`` batches; 0 disables batch logs.
        print_sum: print the end-of-epoch summary line.
        poses_mean, poses_std: optional normalization of the translation
            part; when both are given, predictions and targets are
            un-normalized before the error statistics are computed.
        device: torch device the batches are moved to.
        stereo: whether batches are per-view lists.
    """
    # switch model to training
    model.train()

    losses = AverageMeter()

    epoch_time = time.time()

    # accumulated ground-truth / predicted poses for epoch-level error stats
    gt_poses = np.empty((0, 7))
    pred_poses = np.empty((0, 7))

    end = time.time()
    for idx, (batch_images, batch_poses) in enumerate(train_loader):
        data_time = (time.time() - end)

        if stereo:
            batch_images = [x.to(device) for x in batch_images]
            batch_poses = [x.to(device) for x in batch_poses]
        else:
            batch_images = batch_images.to(device)
            batch_poses = batch_poses.to(device)

        out = model(batch_images)
        loss = criterion(out, batch_poses)

        # Make an optimization step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # BUG FIX: `loss.data[0]` indexes a 0-dim tensor, which raises
        # IndexError on PyTorch >= 0.5; `.item()` is the supported accessor.
        losses.update(
            loss.item(),
            len(batch_images) * batch_images[0].size(0)
            if stereo else batch_images.size(0))

        # move data to cpu & numpy for the epoch-level error statistics
        if stereo:
            bp = [x.detach().cpu().numpy() for x in batch_poses]
            outp = [x.detach().cpu().numpy() for x in out]
            gt_poses = np.vstack((gt_poses, *bp))
            pred_poses = np.vstack((pred_poses, *outp))
        else:
            bp = batch_poses.detach().cpu().numpy()
            outp = out.detach().cpu().numpy()
            gt_poses = np.vstack((gt_poses, bp))
            pred_poses = np.vstack((pred_poses, outp))

        batch_time = (time.time() - end)
        end = time.time()

        if log_freq != 0 and idx % log_freq == 0:
            print('Epoch: [{}/{}]\tBatch: [{}/{}]\t'
                  'Time: {batch_time:.3f}\t'
                  'Data Time: {data_time:.3f}\t'
                  'Loss: {losses.val:.3f}\t'
                  'Avg Loss: {losses.avg:.3f}\t'.format(epoch,
                                                        max_epoch - 1,
                                                        idx,
                                                        len(train_loader) - 1,
                                                        batch_time=batch_time,
                                                        data_time=data_time,
                                                        losses=losses))

    # un-normalize translation
    unnorm = (poses_mean is not None) and (poses_std is not None)
    if unnorm:
        gt_poses[:, :3] = gt_poses[:, :3] * poses_std + poses_mean
        pred_poses[:, :3] = pred_poses[:, :3] * poses_std + poses_mean

    # translation (Euclidean) and rotation (quaternion angle) errors per sample
    t_loss = np.asarray([
        np.linalg.norm(p - t)
        for p, t in zip(pred_poses[:, :3], gt_poses[:, :3])
    ])
    q_loss = np.asarray([
        quaternion_angular_error(p, t)
        for p, t in zip(pred_poses[:, 3:], gt_poses[:, 3:])
    ])

    if print_sum:
        # BUG FIX: `criterion.sx.data[0]` / `criterion.sq.data[0]` index
        # 0-dim tensors (IndexError on PyTorch >= 0.5); use `.item()`.
        print(
            'Ep: [{}/{}]\tTrain Loss: {:.3f}\tTe: {:.3f}\tRe: {:.3f}\t Et: {:.2f}s\t\
{criterion_sx:.5f}:{criterion_sq:.5f}'.format(
                epoch, max_epoch - 1, losses.avg, np.mean(t_loss),
                np.mean(q_loss), (time.time() - epoch_time),
                criterion_sx=criterion.sx.item(),
                criterion_sq=criterion.sq.item()))