def evaluate(dataloader, model, dev, topk=(1,)):
    """
    :param dataloader: test dataloader
    :param model: model to be evaluated
    :param dev: device, gpu or cpu
    :param topk: [tuple] output the top-k accuracy
    :return: [list[float]] top-k accuracy
    """
    model.eval()
    test_accuracy = AverageMeter()
    test_accuracy.reset()
    with torch.no_grad():
        for _, sample in enumerate(tqdm(dataloader, ncols=100, ascii=' >')):
            x = sample['data'].to(dev)
            y = sample['label'].to(dev)
            output = model(x)
            logits = output['logits']
            acc = accuracy(logits, y, topk)
            test_accuracy.update(acc[0], x.size(0))
    return test_accuracy.avg
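# evaluate() relies on two helpers defined elsewhere in this repo. A minimal
# sketch of plausible implementations, assuming torchvision-style top-k
# semantics with accuracies reported in percent; the repo's own versions may
# differ in detail.

class AverageMeter:
    """Tracks a running average of a scalar metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val, self.sum, self.count, self.avg = 0.0, 0.0, 0, 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def accuracy(logits, target, topk=(1,)):
    """Top-k accuracy (in percent) for each k in `topk`."""
    maxk = max(topk)
    _, pred = logits.topk(maxk, dim=1, largest=True, sorted=True)  # (N, maxk)
    correct = pred.t().eq(target.view(1, -1))                      # (maxk, N)
    return [correct[:k].reshape(-1).float().mean().item() * 100.0 for k in topk]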
def do_train(cfg, model, train_loader, optimizer, scheduler, loss_fn):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')
    if device:
        model.to(device)
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(torch.cuda.device_count()))
            model = nn.DataParallel(model)

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    # train
    scaler = GradScaler()
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        model.train()
        for n_iter, (img, vid) in enumerate(train_loader):
            optimizer.zero_grad()
            if cfg.INPUT.AUGMIX:
                bs = img[0].size(0)
                images_cat = torch.cat(img, dim=0).to(device)  # [3 * batch, 3, 32, 32]
                target = vid.to(device)
                with autocast():
                    logits, feat = model(images_cat, target)
                    logits_orig, logits_augmix1, logits_augmix2 = (
                        logits[:bs], logits[bs:2 * bs], logits[2 * bs:])
                    loss = loss_fn(logits_orig, feat, target)
                    p_orig = F.softmax(logits_orig, dim=-1)
                    p_augmix1 = F.softmax(logits_augmix1, dim=-1)
                    p_augmix2 = F.softmax(logits_augmix2, dim=-1)
                    # Clamp mixture distribution to avoid exploding KL divergence
                    p_mixture = torch.clamp((p_orig + p_augmix1 + p_augmix2) / 3., 1e-7, 1).log()
                    loss += 12 * (F.kl_div(p_mixture, p_orig, reduction='batchmean') +
                                  F.kl_div(p_mixture, p_augmix1, reduction='batchmean') +
                                  F.kl_div(p_mixture, p_augmix2, reduction='batchmean')) / 3.
                # use the clean-view logits for the running accuracy
                # (the original referenced an undefined `score` here)
                score, batch = logits_orig, bs
            else:
                img = img.to(device)
                target = vid.to(device)
                with autocast():
                    if cfg.MODEL.CHANNEL_HEAD:
                        score, feat, channel_head_feature = model(img, target)
                        loss = loss_fn(score, feat, channel_head_feature, target)
                    else:
                        score, feat = model(img, target)
                        loss = loss_fn(score, feat, target)
                batch = img.shape[0]
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            acc = (score.max(1)[1] == target).float().mean()
            loss_meter.update(loss.item(), batch)
            acc_meter.update(acc, 1)

            if (n_iter + 1) % log_period == 0:
                logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                            .format(epoch, (n_iter + 1), len(train_loader),
                                    loss_meter.avg, acc_meter.avg, scheduler.get_lr()[0]))
        scheduler.step()
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info("Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
                    .format(epoch, time_per_batch, train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(model.state_dict(),
                       os.path.join(cfg.OUTPUT_DIR, cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
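# The AugMix branch above adds a Jensen-Shannon consistency term across the
# clean view and two augmented views (the weight of 12 follows the AugMix
# paper's lambda setting). A self-contained sketch of the same computation,
# assuming three logits tensors of shape (N, C):

import torch
import torch.nn.functional as F


def jensen_shannon_consistency(logits_clean, logits_aug1, logits_aug2, weight=12.0):
    p_clean = F.softmax(logits_clean, dim=-1)
    p_aug1 = F.softmax(logits_aug1, dim=-1)
    p_aug2 = F.softmax(logits_aug2, dim=-1)
    # clamp so log() of the mixture stays finite
    p_mix = torch.clamp((p_clean + p_aug1 + p_aug2) / 3., 1e-7, 1).log()
    return weight * (F.kl_div(p_mix, p_clean, reduction='batchmean') +
                     F.kl_div(p_mix, p_aug1, reduction='batchmean') +
                     F.kl_div(p_mix, p_aug2, reduction='batchmean')) / 3.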
def main(cfg, device):
    init_seeds()
    cfg.use_fp16 = False if device.type == 'cpu' else cfg.use_fp16

    # logging ------------------------------------------------------------------------------------
    logger_root = f'Results/{cfg.dataset}'
    if not os.path.isdir(logger_root):
        os.makedirs(logger_root, exist_ok=True)
    logtime = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    result_dir = os.path.join(logger_root, f'{logtime}-{cfg.log}')
    # result_dir = os.path.join(logger_root, f'ablation_study-{cfg.log}')  # TODO
    logger = Logger(logging_dir=result_dir, DEBUG=False)
    logger.set_logfile(logfile_name='log.txt')
    save_params(cfg, f'{result_dir}/params.json', json_format=True)
    logger.debug(f'Result Path: {result_dir}')

    # model, optimizer, scheduler ----------------------------------------------------------------
    opt_lvl = 'O1' if cfg.use_fp16 else 'O0'
    n_classes = cfg.n_classes
    net1 = ResNet(arch=cfg.net1, num_classes=n_classes, pretrained=True)
    optimizer1 = build_sgd_optimizer(net1.parameters(), cfg.lr, cfg.weight_decay)
    net1, optimizer1 = amp.initialize(net1.to(device), optimizer1, opt_level=opt_lvl,
                                      keep_batchnorm_fp32=None, loss_scale=None, verbosity=0)
    net2 = ResNet(arch=cfg.net2, num_classes=n_classes, pretrained=True)
    optimizer2 = build_sgd_optimizer(net2.parameters(), cfg.lr, cfg.weight_decay)
    net2, optimizer2 = amp.initialize(net2.to(device), optimizer2, opt_level=opt_lvl,
                                      keep_batchnorm_fp32=None, loss_scale=None, verbosity=0)
    lr_plan = make_lr_plan(cfg.lr, cfg.stage1, cfg.epochs)
    with open(f'{result_dir}/network.txt', 'w') as f:
        f.writelines(net1.__repr__())
        f.write('\n\n---------------------------\n\n')
        f.writelines(net2.__repr__())  # fixed: originally wrote net1's repr twice

    # drop rate scheduler ------------------------------------------------------------------------
    T_k = cfg.stage1
    final_drop_rate = 0.25
    final_ldl_rate = cfg.ldl_rate
    drop_rate_scheduler = np.ones(cfg.epochs) * final_drop_rate
    drop_rate_scheduler[:T_k] = np.linspace(0, final_drop_rate, T_k)
    drop_rate_scheduler[T_k:cfg.epochs] = np.linspace(final_drop_rate, final_ldl_rate,
                                                      cfg.epochs - T_k)

    # dataset, dataloader ------------------------------------------------------------------------
    transform = build_transform(rescale_size=cfg.rescale_size, crop_size=cfg.crop_size)
    dataset = build_webfg_dataset(os.path.join(cfg.database, cfg.dataset),
                                  CLDataTransform(transform['train']), transform['test'])
    logger.debug(f"Number of Training Samples: {dataset['n_train_samples']}")
    logger.debug(f"Number of Testing Samples: {dataset['n_test_samples']}")
    train_loader = DataLoader(dataset['train'], batch_size=cfg.batch_size, shuffle=True,
                              num_workers=8, pin_memory=True)
    test_loader = DataLoader(dataset['test'], batch_size=16, shuffle=False,
                             num_workers=8, pin_memory=True)

    # meters -------------------------------------------------------------------------------------
    train_loss1, train_loss2 = AverageMeter(), AverageMeter()
    train_accuracy1, train_accuracy2 = AverageMeter(), AverageMeter()
    iter_time = AverageMeter()

    # training -----------------------------------------------------------------------------------
    start_epoch = 0
    best_accuracy1, best_accuracy2 = 0.0, 0.0
    best_epoch1, best_epoch2 = None, None
    if cfg.dataset == 'cifar100' and cfg.noise_type != 'clean':
        t = torch.tensor(dataset['train'].noisy_labels)
    else:
        t = torch.tensor(dataset['train'].targets)
    labels2learn1 = torch.full(size=(dataset['n_train_samples'], n_classes), fill_value=0.0)
    labels2learn1.scatter_(dim=1, index=torch.unsqueeze(t, dim=1), value=1.0 * 10)
    # clone so the two networks keep independent label stores
    # (the original aliased the same tensor for both)
    labels2learn2 = labels2learn1.clone()
    flag = [0, 0, 0]

    for epoch in range(start_epoch, cfg.epochs):
        start_time = time.time()
        train_loss1.reset()
        train_accuracy1.reset()
        train_loss2.reset()
        train_accuracy2.reset()
        net1.train()
        net2.train()
        adjust_lr(optimizer1, lr_plan[epoch])
        adjust_lr(optimizer2, lr_plan[epoch])
        optimizer1.zero_grad()
        optimizer2.zero_grad()

        # train this epoch
        for it, sample in enumerate(train_loader):
            s = time.time()
            indices = sample['index']
            x1, x2 = sample['data']
            x1, x2 = x1.to(device), x2.to(device)
            y0 = sample['label'].to(device)
            y = get_smoothed_label_distribution(y0, nc=n_classes, epsilon=cfg.epsilon)
            output1 = net1(x1)
            output2 = net2(x2)
            logits1 = output1['logits']
            logits2 = output2['logits']
            if epoch < cfg.stage1:
                # warmup
                if flag[0] == 0:
                    step_flagging('stage 1')
                    flag[0] += 1
                loss1 = cross_entropy(logits1, y)
                loss2 = cross_entropy(logits2, y)
            else:
                # learn label distributions
                if flag[1] == 0:
                    step_flagging('stage 2')
                    flag[1] += 1
                with torch.no_grad():
                    cce_losses1 = cross_entropy(logits1, y, reduction='none')
                    cce_losses2 = cross_entropy(logits2, y, reduction='none')
                    losses1 = cce_losses1
                    losses2 = cce_losses2
                    # ent_losses1 = entropy_loss(logits1, reduction='none')
                    # ent_losses2 = entropy_loss(logits2, reduction='none')
                    # losses1 = cce_losses1 + ent_losses1  # (N)
                    # losses2 = cce_losses2 + ent_losses2  # (N)
                    sample_selection = sample_selector(losses1, losses2,
                                                       drop_rate_scheduler[epoch])
                # for selected "clean" samples, train in a co-teaching manner
                logits_clean1 = logits1[sample_selection['clean2']]
                logits_clean2 = logits2[sample_selection['clean1']]
                y_clean1 = y[sample_selection['clean2']]
                y_clean2 = y[sample_selection['clean1']]
                losses_clean1 = cross_entropy(logits_clean1, y_clean1, reduction='none') + \
                    entropy_loss(logits_clean1, reduction='none')  # (Nc1)
                losses_clean2 = cross_entropy(logits_clean2, y_clean2, reduction='none') + \
                    entropy_loss(logits_clean2, reduction='none')  # (Nc2)
                loss_c1_1 = losses_clean1.mean()
                loss_c2_1 = losses_clean2.mean()
                # for selected "unclean" samples, train in a label distribution
                # learning manner (exchange again)
                y_t1 = labels2learn1[indices, :].clone().to(device)
                y_t2 = labels2learn2[indices, :].clone().to(device)
                y_t1.requires_grad = True
                y_t2.requires_grad = True
                y_d1 = F.softmax(y_t1, dim=1) + 1e-8
                y_d2 = F.softmax(y_t2, dim=1) + 1e-8
                logits_unclean1 = logits1[sample_selection['unclean2']]
                logits_unclean2 = logits2[sample_selection['unclean1']]
                y_d_unclean1 = y_d1[sample_selection['unclean2']]
                y_d_unclean2 = y_d2[sample_selection['unclean1']]
                w1 = np.random.beta(cfg.phi, cfg.phi, logits_unclean1.size(0))
                w2 = np.random.beta(cfg.phi, cfg.phi, logits_unclean2.size(0))
                w1 = x1.new(w1).view(logits_unclean1.size(0), 1, 1, 1)
                w2 = x2.new(w2).view(logits_unclean2.size(0), 1, 1, 1)
                idx1 = np.random.choice(
                    sample_selection['clean2'].cpu().numpy(), logits_unclean1.size(0),
                    replace=False if sample_selection['clean2'].size(0) >= logits_unclean1.size(0) else True)
                idx1 = torch.tensor(idx1).to(device)
                idx2 = np.random.choice(
                    sample_selection['clean1'].cpu().numpy(), logits_unclean2.size(0),
                    replace=False if sample_selection['clean1'].size(0) >= logits_unclean2.size(0) else True)
                idx2 = torch.tensor(idx2).to(device)
                mixed_x1 = w1 * x1[sample_selection['unclean2']] + (1 - w1) * x1[idx1]
                mixed_x2 = w2 * x2[sample_selection['unclean1']] + (1 - w2) * x2[idx2]
                # reshape the per-sample weights to (N, 1) for label mixing
                # (the (N, 1, 1, 1) image weights do not broadcast against (N, C) labels)
                w1_y = w1.view(-1, 1)
                w2_y = w2.view(-1, 1)
                mixed_y1 = w1_y * y_d_unclean1 + (1 - w1_y) * y_d1[idx1]
                mixed_y2 = w2_y * y_d_unclean2 + (1 - w2_y) * y_d2[idx2]
                mixed_output1 = net1(mixed_x1)
                mixed_output2 = net2(mixed_x2)
                mixed_logits1 = mixed_output1['logits']
                mixed_logits2 = mixed_output2['logits']
                loss_c1_2 = kl_div(F.softmax(mixed_logits1, dim=1) + 1e-8, mixed_y1).mean()
                loss_c2_2 = kl_div(F.softmax(mixed_logits2, dim=1) + 1e-8, mixed_y2).mean()
                loss_c1 = loss_c1_1 + loss_c1_2 * cfg.beta
                loss_c2 = loss_c2_1 + loss_c2_2 * cfg.beta
                # consistency loss
                loss_o1 = cross_entropy(F.softmax(y_t1[sample_selection['clean2']], dim=1),
                                        y[sample_selection['clean2']])
                loss_o2 = cross_entropy(F.softmax(y_t2[sample_selection['clean1']], dim=1),
                                        y[sample_selection['clean1']])
                # final loss
                loss1 = (1 - cfg.alpha) * loss_c1 + cfg.alpha * loss_o1
                loss2 = (1 - cfg.alpha) * loss_c2 + cfg.alpha * loss_o2

            train_acc1 = accuracy(logits1, y0, topk=(1,))
            train_acc2 = accuracy(logits2, y0, topk=(1,))
            train_loss1.update(loss1.item(), x1.size(0))
            train_loss2.update(loss2.item(), x2.size(0))
            train_accuracy1.update(train_acc1[0], x1.size(0))
            train_accuracy2.update(train_acc2[0], x2.size(0))

            if cfg.use_fp16:
                with amp.scale_loss(loss1, optimizer1) as scaled_loss1:
                    scaled_loss1.backward()
                with amp.scale_loss(loss2, optimizer2) as scaled_loss2:
                    scaled_loss2.backward()
            else:
                loss1.backward()
                loss2.backward()
            optimizer1.step()
            optimizer2.step()
            optimizer1.zero_grad()
            optimizer2.zero_grad()

            if epoch >= cfg.stage1:
                # gradient step on the learnable label distributions
                y_t1.data.sub_(cfg.lmd * y_t1.grad.data)
                y_t2.data.sub_(cfg.lmd * y_t2.grad.data)
                labels2learn1[indices, :] = y_t1.detach().clone().cpu().data
                labels2learn2[indices, :] = y_t2.detach().clone().cpu().data
                del y_t1, y_t2

            iter_time.update(time.time() - s, 1)
            if (cfg.log_freq is not None and (it + 1) % cfg.log_freq == 0) or \
                    (it + 1 == len(train_loader)):
                total_mem = torch.cuda.get_device_properties(0).total_memory / 2**30
                mem = torch.cuda.memory_reserved() / 2**30
                console_content = f"Epoch:[{epoch + 1:>3d}/{cfg.epochs:>3d}] " \
                                  f"Iter:[{it + 1:>4d}/{len(train_loader):>4d}] " \
                                  f"Train Accuracy 1:[{train_accuracy1.avg:6.2f}] " \
                                  f"Train Accuracy 2:[{train_accuracy2.avg:6.2f}] " \
                                  f"Loss 1:[{train_loss1.avg:4.4f}] " \
                                  f"Loss 2:[{train_loss2.avg:4.4f}] " \
                                  f"GPU-MEM:[{mem:6.3f}/{total_mem:6.3f} Gb] " \
                                  f"{iter_time.avg:6.2f} sec/iter"
                logger.debug(console_content)

        # evaluate this epoch
        test_accuracy1 = evaluate(test_loader, net1, device)
        test_accuracy2 = evaluate(test_loader, net2, device)
        if test_accuracy1 > best_accuracy1:
            best_accuracy1 = test_accuracy1
            best_epoch1 = epoch + 1
            torch.save(net1.state_dict(), f'{result_dir}/net1_best_epoch.pth')
        if test_accuracy2 > best_accuracy2:
            best_accuracy2 = test_accuracy2
            best_epoch2 = epoch + 1
            torch.save(net2.state_dict(), f'{result_dir}/net2_best_epoch.pth')

        # logging this epoch
        runtime = time.time() - start_time
        logger.info(f'epoch: {epoch + 1:>3d} | '
                    f'train loss(1/2): ({train_loss1.avg:>6.4f}/{train_loss2.avg:>6.4f}) | '
                    f'train accuracy(1/2): ({train_accuracy1.avg:>6.3f}/{train_accuracy2.avg:>6.3f}) | '
                    f'test accuracy(1/2): ({test_accuracy1:>6.3f}/{test_accuracy2:>6.3f}) | '
                    f'epoch runtime: {runtime:6.2f} sec | '
                    f'best accuracy(1/2): ({best_accuracy1:6.3f}/{best_accuracy2:6.3f}) '
                    f'@ epoch: ({best_epoch1:03d}/{best_epoch2:03d})')
        plot_results_cotraining(result_file=f'{result_dir}/log.txt')
        torch.save(labels2learn1, f'{result_dir}/labels_learned.pt')

    # rename results dir -------------------------------------------------------------------------
    best_accuracy = max(best_accuracy1, best_accuracy2)
    os.rename(result_dir, f'{result_dir}-bestAcc_{best_accuracy:.4f}')
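# main() above assumes several helpers defined elsewhere in the repo. A minimal
# sketch of one of them, get_smoothed_label_distribution(), assuming standard
# label smoothing over hard integer labels; the repo's version may differ:

import torch


def get_smoothed_label_distribution(labels, nc, epsilon):
    # (N,) int64 labels -> (N, nc) smoothed one-hot distributions
    smoothed = torch.full((labels.size(0), nc), epsilon / (nc - 1), device=labels.device)
    smoothed.scatter_(1, labels.unsqueeze(1), 1.0 - epsilon)
    return smoothed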
def do_train(cfg, model, center_criterion, train_loader, val_loader, optimizer,
             optimizer_center, scheduler, loss_fn, num_query):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    eval_period = cfg.SOLVER.EVAL_PERIOD
    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')
    print("torch.cuda.device_count()", torch.cuda.device_count())
    if device:
        model.to(device)
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(torch.cuda.device_count()))
            print("multi-GPU training")
            # model = DDP(model, delay_allreduce=True)  # must come after amp.initialize
            # model, optimizer = amp.initialize(model, optimizer, opt_level="O1")  # letter O, not zero
            torch.distributed.init_process_group('gloo', init_method='file:///tmp/somefile',
                                                 rank=0, world_size=1)
            # model = convert_syncbn_model(model)
            model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
            # model = DistributedDataParallel(model, delay_allreduce=True)
            # model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True)
            model = nn.DataParallel(model)
        else:
            print("single-GPU training")
            model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
            model.to(device=0)

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    # evaluator = R1_mAP_eval(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)
    # model.base._freeze_stages()
    logger.info('Freezing the stages number:{}'.format(cfg.MODEL.FROZEN))

    for epoch in range(1, epochs + 1):
        if epoch == 5:
            print("switching to the balanced training data")
            # cfg.DATASETS.ROOT_DIR = '/home/lab3/bi/0716/Veri/ai_city/tools/mix_train_balance_flip.pkl'
            cfg.DATASETS.ROOT_DIR = 'datasets/mix_train_balance.pkl'
            train_loader, val_loader, num_query, num_classes = make_dataloader(cfg)
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        scheduler.step()
        model.train()
        for n_iter, (img, vid) in enumerate(tqdm(train_loader)):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target = vid.to(device)
            # grid mask
            # img = grid(img)
            # score, feat, score_f1, score_f2, score_f3, f4, f4_score = model(img, target)
            # score, feat, score_f1, score_f2, feat1, score_layer2 = model(img, target)
            score, feat, score_f1, score_f2, feat1 = model(img, target)
            loss = loss_fn(score, feat, target, score_f1, score_f2, feat1)
            # loss = loss_fn(score, feat, target, score_f1, score_f2, feat1, score_layer2)
            if cfg.SOLVER.FP16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()
            if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                optimizer_center.step()

            acc = (score.max(1)[1] == target).float().mean()
            loss_meter.update(loss.item(), img.shape[0])
            acc_meter.update(acc, 1)

            if (n_iter + 1) % log_period == 0:
                logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                            .format(epoch, (n_iter + 1), len(train_loader),
                                    loss_meter.avg, acc_meter.avg, scheduler.get_lr()[0]))
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info("Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
                    .format(epoch, time_per_batch, train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(model.state_dict(),
                       os.path.join(cfg.OUTPUT_DIR, cfg.MODEL.NAME + '_epoch{}.pth'.format(epoch)))
        if epoch == 10:
            # save a reduced half-precision copy without the classifier heads
            reduce_model_dict = model.half().state_dict()
            del_keys = []
            for key in reduce_model_dict.keys():
                if 'class' in key or 'sub1' in key or 'sub2' in key or 'base.fc' in key:
                    del_keys.append(key)
            for key in del_keys:
                del reduce_model_dict[key]
            torch.save(reduce_model_dict,
                       os.path.join(cfg.OUTPUT_DIR,
                                    cfg.MODEL.NAME + str(cfg.INPUT.SIZE_TRAIN[0]) + 'half.pth'))
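# The apex amp.initialize(..., opt_level='O1') + amp.scale_loss pattern above is
# deprecated upstream; other trainers in this file use the native equivalent.
# A minimal sketch of the same loop body with torch.cuda.amp, assuming the same
# model/loss_fn signatures as above:

from torch.cuda.amp import GradScaler, autocast

scaler = GradScaler()


def fp16_step(model, loss_fn, optimizer, img, target):
    optimizer.zero_grad()
    with autocast():
        score, feat, score_f1, score_f2, feat1 = model(img, target)
        loss = loss_fn(score, feat, target, score_f1, score_f2, feat1)
    scaler.scale(loss).backward()  # scaled backward replaces amp.scale_loss
    scaler.step(optimizer)         # unscales, then steps if grads are finite
    scaler.update()
    return loss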
def do_train(Cfg, model_G, model_Dip, model_Dii, model_D_reid, train_loader, val_loader,
             optimizerG, optimizerDip, optimizerDii, GAN_loss, L1_loss, ReID_loss,
             schedulerG, schedulerDip, schedulerDii):
    log_period = Cfg.SOLVER.LOG_PERIOD
    checkpoint_period = Cfg.SOLVER.CHECKPOINT_PERIOD
    eval_period = Cfg.SOLVER.EVAL_PERIOD
    output_dir = Cfg.DATALOADER.LOG_DIR
    # TODO: move the following two constants into the config
    epsilon = 0.00001
    margin = 0.4
    device = "cuda"
    epochs = Cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger('pose-transfer-gan.train')
    logger.info('Start training')
    if device:
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(torch.cuda.device_count()))
            model_G = nn.DataParallel(model_G)
            model_Dii = nn.DataParallel(model_Dii)
            model_Dip = nn.DataParallel(model_Dip)
        model_G.to(device)
        model_Dip.to(device)
        model_Dii.to(device)
        model_D_reid.to(device)

    lossG_meter = AverageMeter()
    lossDip_meter = AverageMeter()
    lossDii_meter = AverageMeter()
    distDreid_meter = AverageMeter()
    fake_ii_pool = ImagePool(50)
    fake_ip_pool = ImagePool(50)
    # evaluator = R1_mAP(num_query, max_rank=50, feat_norm=Cfg.TEST.FEAT_NORM)

    # train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        lossG_meter.reset()
        lossDip_meter.reset()
        lossDii_meter.reset()
        distDreid_meter.reset()
        schedulerG.step()
        schedulerDip.step()
        schedulerDii.step()
        model_G.train()
        model_Dip.train()
        model_Dii.train()
        model_D_reid.eval()
        for n_iter, batch in enumerate(train_loader):
            img1 = batch['img1'].to(device)
            pose1 = batch['pose1'].to(device)
            img2 = batch['img2'].to(device)
            pose2 = batch['pose2'].to(device)
            input_G = (img1, pose2)

            # forward
            fake_img2 = model_G(input_G)

            # train G
            optimizerG.zero_grad()
            input_Dip = torch.cat((fake_img2, pose2), 1)
            pred_fake_ip = model_Dip(input_Dip)
            loss_G_ip = GAN_loss(pred_fake_ip, True)
            input_Dii = torch.cat((fake_img2, img1), 1)
            pred_fake_ii = model_Dii(input_Dii)
            loss_G_ii = GAN_loss(pred_fake_ii, True)
            loss_L1, _, _ = L1_loss(fake_img2, img2)
            feats_real = model_D_reid(img2)
            feats_fake = model_D_reid(fake_img2)
            dist_cos = torch.acos(torch.clamp(torch.sum(feats_real * feats_fake, 1),
                                              -1 + epsilon, 1 - epsilon))
            same_id_tensor = torch.FloatTensor(dist_cos.size()).fill_(1).to(device)
            dist_cos_margin = torch.max(dist_cos - margin, torch.zeros_like(dist_cos))
            loss_reid = ReID_loss(dist_cos_margin, same_id_tensor)
            factor = loss_reid_factor(epoch)
            loss_G = (0.5 * loss_G_ii * Cfg.LOSS.GAN_WEIGHT
                      + 0.5 * loss_G_ip * Cfg.LOSS.GAN_WEIGHT
                      + loss_L1
                      + loss_reid * Cfg.LOSS.REID_WEIGHT * factor)
            loss_G.backward()
            optimizerG.step()

            # train Dip
            for i in range(Cfg.SOLVER.DG_RATIO):
                optimizerDip.zero_grad()
                real_input_ip = torch.cat((img2, pose2), 1)
                fake_input_ip = fake_ip_pool.query(torch.cat((fake_img2, pose2), 1).data)
                pred_real_ip = model_Dip(real_input_ip)
                loss_Dip_real = GAN_loss(pred_real_ip, True)
                pred_fake_ip = model_Dip(fake_input_ip)
                loss_Dip_fake = GAN_loss(pred_fake_ip, False)
                loss_Dip = 0.5 * Cfg.LOSS.GAN_WEIGHT * (loss_Dip_real + loss_Dip_fake)
                loss_Dip.backward()
                optimizerDip.step()

            # train Dii
            for i in range(Cfg.SOLVER.DG_RATIO):
                optimizerDii.zero_grad()
                real_input_ii = torch.cat((img2, img1), 1)
                fake_input_ii = fake_ii_pool.query(torch.cat((fake_img2, img1), 1).data)
                pred_real_ii = model_Dii(real_input_ii)
                loss_Dii_real = GAN_loss(pred_real_ii, True)
                pred_fake_ii = model_Dii(fake_input_ii)
                loss_Dii_fake = GAN_loss(pred_fake_ii, False)
                loss_Dii = 0.5 * Cfg.LOSS.GAN_WEIGHT * (loss_Dii_real + loss_Dii_fake)
                loss_Dii.backward()
                optimizerDii.step()

            lossG_meter.update(loss_G.item(), 1)
            lossDip_meter.update(loss_Dip.item(), 1)
            lossDii_meter.update(loss_Dii.item(), 1)
            distDreid_meter.update(dist_cos.mean().item(), 1)

            if (n_iter + 1) % log_period == 0:
                logger.info("Epoch[{}] Iteration[{}/{}] G Loss: {:.3f}, Dip Loss: {:.3f}, "
                            "Dii Loss: {:.3f}, Base G_Lr: {:.2e}, Base Dip_Lr: {:.2e}, "
                            "Base Dii_Lr: {:.2e}"
                            .format(epoch, (n_iter + 1), len(train_loader),
                                    lossG_meter.avg, lossDip_meter.avg, lossDii_meter.avg,
                                    schedulerG.get_lr()[0], schedulerDip.get_lr()[0],
                                    schedulerDii.get_lr()[0]))
                logger.info("ReID Cos Distance: {:.3f}".format(distDreid_meter.avg))

        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info("Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
                    .format(epoch, time_per_batch, train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(model_G.state_dict(), output_dir + 'model_G_{}.pth'.format(epoch))
            torch.save(model_Dip.state_dict(), output_dir + 'model_Dip_{}.pth'.format(epoch))
            torch.save(model_Dii.state_dict(), output_dir + 'model_Dii_{}.pth'.format(epoch))
            # if epoch % eval_period == 0:
            np.save(output_dir + 'train_Bx6x128x64_epoch{}.npy'.format(epoch),
                    fake_ii_pool.images[0].cpu().numpy())
            logger.info('Entering Evaluation...')
            tmp_results = []
            model_G.eval()
            for n_iter, batch in enumerate(val_loader):
                with torch.no_grad():
                    img1 = batch['img1'].to(device)
                    pose1 = batch['pose1'].to(device)
                    img2 = batch['img2'].to(device)
                    pose2 = batch['pose2'].to(device)
                    input_G = (img1, pose2)
                    fake_img2 = model_G(input_G)
                    tmp_result = torch.cat((img1, img2, fake_img2), 1).cpu().numpy()
                    tmp_results.append(tmp_result)
            np.save(output_dir + 'test_Bx6x128x64_epoch{}.npy'.format(epoch), tmp_results[0])
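# fake_ip_pool / fake_ii_pool above use an ImagePool history buffer (the
# pix2pix/CycleGAN trick): the discriminators see a mix of current and past
# generated images, which stabilises adversarial training. A minimal sketch,
# assuming query() returns half old / half new samples once the pool is full;
# the repo's version may differ:

import random
import torch


class ImagePool:
    def __init__(self, pool_size):
        self.pool_size = pool_size
        self.images = []

    def query(self, images):
        if self.pool_size == 0:
            return images
        out = []
        for img in images:
            img = img.unsqueeze(0)
            if len(self.images) < self.pool_size:
                # fill the pool first, returning the incoming image
                self.images.append(img)
                out.append(img)
            elif random.random() > 0.5:
                # swap a stored image for the new one, return the old
                idx = random.randrange(self.pool_size)
                out.append(self.images[idx].clone())
                self.images[idx] = img
            else:
                out.append(img)
        return torch.cat(out, 0)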
def do_train(Cfg, model, center_criterion, train_loader, val_loader, optimizer,
             optimizer_center, scheduler, loss_fn, num_query):
    log_period = Cfg.LOG_PERIOD
    checkpoint_period = Cfg.CHECKPOINT_PERIOD
    eval_period = Cfg.EVAL_PERIOD
    output_dir = Cfg.LOG_DIR
    device = "cuda"
    epochs = Cfg.MAX_EPOCHS

    logger = logging.getLogger('{}.train'.format(Cfg.PROJECT_NAME))
    logger.info('start training')
    if device:
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(torch.cuda.device_count()))
            model = nn.DataParallel(model)
        model.to(device)

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    evaluator = R1_mAP(num_query, max_rank=50, feat_norm=Cfg.FEAT_NORM)

    # train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        evaluator.reset()
        model.train()
        for n_iter, (img, vid) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target = vid.to(device)
            score, feat = model(img, target)
            loss = loss_fn(score, feat, target)
            loss.backward()
            optimizer.step()
            if 'center' in Cfg.LOSS_TYPE:
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / Cfg.CENTER_LOSS_WEIGHT)
                optimizer_center.step()

            acc = (score.max(1)[1] == target).float().mean()
            loss_meter.update(loss.item(), img.shape[0])
            acc_meter.update(acc, 1)

            if (n_iter + 1) % log_period == 0:
                logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                            .format(epoch, (n_iter + 1), len(train_loader),
                                    loss_meter.avg, acc_meter.avg, scheduler.get_lr()[0]))
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info("Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
                    .format(epoch, time_per_batch, train_loader.batch_size / time_per_batch))
        scheduler.step()

        if epoch % checkpoint_period == 0:
            torch.save(model.state_dict(),
                       os.path.join(output_dir, Cfg.MODEL_NAME + '_{}.pth'.format(epoch)))

        if epoch % eval_period == 0:
            model.eval()
            for n_iter, (img, vid, camid) in enumerate(val_loader):
                with torch.no_grad():
                    img = img.to(device)
                    feat = model(img)
                    evaluator.update((feat, vid, camid))
            cmc, mAP, _, _, _, _ = evaluator.compute()
            logger.info("Validation Results - Epoch: {}".format(epoch))
            logger.info("mAP: {:.1%}".format(mAP))
            for r in [1, 5, 10]:
                logger.info("CMC curve, Rank-{:<3}:{:.1%}".format(r, cmc[r - 1]))
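# Several trainers in this file rescale the center-loss gradients by
# 1 / CENTER_LOSS_WEIGHT before stepping optimizer_center. The total loss adds
# the centers term as weight * center_loss, so undoing the weight on the center
# parameters' gradients lets the class centers move at their own learning rate.
# A minimal sketch of a center-loss module this trick would apply to, assuming
# the standard formulation; the repo's center_criterion may differ:

import torch
import torch.nn as nn


class CenterLoss(nn.Module):
    """Pulls each feature toward the learnable center of its class."""

    def __init__(self, num_classes, feat_dim):
        super().__init__()
        self.centers = nn.Parameter(torch.randn(num_classes, feat_dim))

    def forward(self, feat, target):
        # squared distance between each feature and its class center
        return ((feat - self.centers[target]) ** 2).sum(dim=1).mean()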
def do_train(cfg, model, center_criterion, train_loader, val_loader, optimizer,
             optimizer_center, scheduler, loss_fn, num_query, last_epoch):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')
    if device:
        model.to(device)
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(torch.cuda.device_count()))
            model = nn.DataParallel(model)
        else:
            if cfg.SOLVER.FP16:
                model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    # train
    for epoch in range(last_epoch, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        model.train()
        try:
            for n_iter, (img, vid) in enumerate(train_loader):
                optimizer.zero_grad()
                optimizer_center.zero_grad()
                img = img.to(device)
                target = vid.to(device)
                score, feat = model(img, target)
                loss = loss_fn(score, feat, target)
                if cfg.SOLVER.FP16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                optimizer.step()
                if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                    for param in center_criterion.parameters():
                        param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                    optimizer_center.step()

                acc = (score.max(1)[1] == target).float().mean()
                loss_meter.update(loss.item(), img.shape[0])
                acc_meter.update(acc, 1)

                if (n_iter + 1) % log_period == 0:
                    logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                                .format(epoch, (n_iter + 1), len(train_loader),
                                        loss_meter.avg, acc_meter.avg, scheduler.get_lr()[0]))
            scheduler.step()
            end_time = time.time()
            time_per_batch = (end_time - start_time) / (n_iter + 1)
            logger.info("Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
                        .format(epoch, time_per_batch, train_loader.batch_size / time_per_batch))
            if epoch % checkpoint_period == 0:
                torch.save({'epoch': epoch,
                            'model_state_dict': model.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict()},
                           os.path.join(cfg.OUTPUT_DIR, cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
        except Exception:
            # save an emergency checkpoint if the epoch is interrupted
            torch.save({'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict()},
                       os.path.join(cfg.OUTPUT_DIR, cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
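# The trainer above checkpoints a dict with 'epoch', 'model_state_dict' and
# 'optimizer_state_dict' (also on exceptions, so interrupted runs can resume).
# A minimal resume sketch matching that format; the helper name is illustrative:

import torch


def resume_from_checkpoint(model, optimizer, ckpt_path):
    ckpt = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt['model_state_dict'])
    optimizer.load_state_dict(ckpt['optimizer_state_dict'])
    last_epoch = ckpt['epoch'] + 1  # continue from the next epoch
    return last_epoch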
def do_train(cfg, model, center_criterion, train_loader, val_loader, optimizer,
             optimizer_center, scheduler, loss_fn, num_query):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')
    if device:
        # dist.init_process_group(backend='nccl', init_method='env://')
        model.to(device)
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(torch.cuda.device_count()))
            model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
            model = nn.DataParallel(model)
            # model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True)
        else:
            if cfg.SOLVER.FP16:
                model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    loss_meter = AverageMeter()
    all_loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    pcb_losses = AverageMeter()
    pcb_merge_losses = AverageMeter()
    pcb_optimizer = get_pcb_optimizer(model)
    pcb_scheduler = WarmupMultiStepLR(pcb_optimizer, cfg.SOLVER.STEPS, cfg.SOLVER.GAMMA,
                                      cfg.SOLVER.WARMUP_FACTOR, cfg.SOLVER.WARMUP_EPOCHS,
                                      cfg.SOLVER.WARMUP_METHOD)

    # train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        all_loss_meter.reset()
        acc_meter.reset()
        pcb_losses.reset()
        pcb_merge_losses.reset()
        model.train()
        for n_iter, (img, vid) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target = vid.to(device)
            if cfg.MODEL.IF_USE_PCB:
                score, feat, pcb_out = model(img, target)
                loss = loss_fn(score, feat, target)
                loss0, loss1, loss2, loss3, loss4, loss5, loss_merge = pcb_loss_forward(
                    pcb_feat=pcb_out, targets=target)
                pcb_loss = (loss0 + loss1 + loss2 + loss3 + loss4 + loss5) / 6
                all_loss = loss + 0.5 * pcb_loss + 0.5 * loss_merge
                if cfg.SOLVER.FP16:
                    with amp.scale_loss(all_loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    all_loss.backward()
                optimizer.step()
                if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                    for param in center_criterion.parameters():
                        param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                    optimizer_center.step()

                acc = (score.max(1)[1] == target).float().mean()
                loss_meter.update(loss.item(), img.shape[0])
                all_loss_meter.update(all_loss.item(), img.shape[0])
                pcb_losses.update(pcb_loss.item(), img.shape[0])
                pcb_merge_losses.update(loss_merge.item(), img.shape[0])
                acc_meter.update(acc, 1)

                if (n_iter + 1) % log_period == 0:
                    logger.info("Epoch[{}] Iteration[{}/{}] All_Loss: {:.3f}, Global_Loss: {:.3f}, "
                                "PCB_Loss: {:.3f}, Merge_Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                                .format(epoch, (n_iter + 1), len(train_loader),
                                        all_loss_meter.avg, loss_meter.avg, pcb_losses.avg,
                                        pcb_merge_losses.avg, acc_meter.avg,
                                        scheduler.get_lr()[0]))
            else:
                score, feat = model(img, target)
                loss = loss_fn(score, feat, target)
                if cfg.SOLVER.FP16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                optimizer.step()
                if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                    for param in center_criterion.parameters():
                        param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                    optimizer_center.step()

                acc = (score.max(1)[1] == target).float().mean()
                loss_meter.update(loss.item(), img.shape[0])
                acc_meter.update(acc, 1)

                if (n_iter + 1) % log_period == 0:
                    logger.info("Epoch[{}] Iteration[{}/{}] Global_Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                                .format(epoch, (n_iter + 1), len(train_loader),
                                        loss_meter.avg, acc_meter.avg, scheduler.get_lr()[0]))
        # A deprecated multi-task variant backpropagated the six PCB part losses
        # separately through pcb_optimizer (torch.autograd.backward over
        # [loss0..loss5]); per the original author's note ("wenli"), it tended to
        # overfit and was removed in favour of the weighted sum above.
        # pcb_scheduler.step()
        scheduler.step()
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info("Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
                    .format(epoch, time_per_batch, train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(model.state_dict(),
                       os.path.join(cfg.OUTPUT_DIR, cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
def do_train(Cfg, model, train_loader, test_loader, optimizer, scheduler, loss_fn):
    log_period = Cfg.LOG_PERIOD
    checkpoint_period = Cfg.CHECKPOINT_PERIOD
    output_dir = Cfg.LOG_DIR
    device = "cuda"
    epochs = Cfg.MAX_EPOCHS

    logger = logging.getLogger('{}'.format(Cfg.PROJECT_NAME))
    logger.info('start training')
    if device:
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(torch.cuda.device_count()))
            model = nn.DataParallel(model)
        model.to(device)

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    precision_meter = AverageMeter()
    recall_meter = AverageMeter()

    # train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        precision_meter.reset()
        recall_meter.reset()
        scheduler.step()
        model.train()
        for n_iter, ((feat, adj, cid, h1id), gtmat) in enumerate(train_loader):
            optimizer.zero_grad()
            feat, adj, cid, h1id, gtmat = map(lambda x: x.cuda(),
                                              (feat, adj, cid, h1id, gtmat))
            pred = model(feat, adj, h1id)
            labels = make_labels(gtmat).long()
            loss = loss_fn(pred, labels)
            p, r, acc = accuracy(pred, labels)
            loss.backward()
            optimizer.step()

            loss_meter.update(loss.item(), feat.size(0))
            acc_meter.update(acc.item(), feat.size(0))
            precision_meter.update(p, feat.size(0))
            recall_meter.update(r, feat.size(0))

            if (n_iter + 1) % log_period == 0:
                logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, P:{:.3f}, R:{:.3f}, Base Lr: {:.2e}"
                            .format(epoch, (n_iter + 1), len(train_loader),
                                    loss_meter.avg, acc_meter.avg, precision_meter.avg,
                                    recall_meter.avg, scheduler.get_lr()[0]))
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info("Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
                    .format(epoch, time_per_batch, train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(model.state_dict(),
                       os.path.join(output_dir, Cfg.MODEL_NAME + '_{}.pth'.format(epoch)))

        # test
        model.eval()
        acc_meter.reset()
        precision_meter.reset()
        recall_meter.reset()
        for n_iter, ((feat, adj, cid, h1id, unique_nodes_list), gtmat) in enumerate(test_loader):
            feat, adj, cid, h1id, gtmat = map(lambda x: x.cuda(),
                                              (feat, adj, cid, h1id, gtmat))
            with torch.no_grad():
                pred = model(feat, adj, h1id)
            labels = make_labels(gtmat).long()
            p, r, acc = accuracy(pred, labels)
            acc_meter.update(acc.item(), feat.size(0))
            precision_meter.update(p, feat.size(0))
            recall_meter.update(r, feat.size(0))
        logger.info("Test Result: Acc: {:.3f}, P:{:.3f}, R:{:.3f}"
                    .format(acc_meter.avg, precision_meter.avg, recall_meter.avg))
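# The GCN trainer above uses an accuracy() variant that returns
# (precision, recall, accuracy) for binary link predictions; this differs from
# the top-k accuracy() used by the classifier trainers in this file. A
# plausible minimal sketch, assuming pred holds two-class logits and labels
# holds {0, 1} targets; the repo's version may differ:

def accuracy(pred, labels):
    pred_cls = pred.max(1)[1]
    tp = ((pred_cls == 1) & (labels == 1)).sum().float()
    p = tp / max(float((pred_cls == 1).sum()), 1.0)  # precision over predicted positives
    r = tp / max(float((labels == 1).sum()), 1.0)    # recall over actual positives
    acc = (pred_cls == labels).float().mean()
    return p, r, acc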
def do_train(cfg, model, center_criterion, train_loader, val_loader, optimizer,
             optimizer_center, scheduler, loss_fn, num_query, local_rank):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    eval_period = cfg.SOLVER.EVAL_PERIOD
    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("transreid.train")
    logger.info('start training')
    _LOCAL_PROCESS_GROUP = None
    if device:
        model.to(local_rank)
        if torch.cuda.device_count() > 1 and cfg.MODEL.DIST_TRAIN:
            print('Using {} GPUs for training'.format(torch.cuda.device_count()))
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], find_unused_parameters=True)

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    evaluator = R1_mAP_eval(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)
    scaler = amp.GradScaler()

    # train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        evaluator.reset()
        scheduler.step(epoch)
        model.train()
        for n_iter, (img, vid, target_cam, target_view) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target = vid.to(device)
            target_cam = target_cam.to(device)
            target_view = target_view.to(device)
            with amp.autocast(enabled=True):
                score, feat = model(img, target, cam_label=target_cam, view_label=target_view)
                loss = loss_fn(score, feat, target, target_cam)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                scaler.step(optimizer_center)
                scaler.update()

            if isinstance(score, list):
                acc = (score[0].max(1)[1] == target).float().mean()
            else:
                acc = (score.max(1)[1] == target).float().mean()

            loss_meter.update(loss.item(), img.shape[0])
            acc_meter.update(acc, 1)
            torch.cuda.synchronize()

            if (n_iter + 1) % log_period == 0:
                logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                            .format(epoch, (n_iter + 1), len(train_loader),
                                    loss_meter.avg, acc_meter.avg,
                                    scheduler._get_lr(epoch)[0]))
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        if cfg.MODEL.DIST_TRAIN:
            pass
        else:
            logger.info("Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
                        .format(epoch, time_per_batch, train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            if cfg.MODEL.DIST_TRAIN:
                if dist.get_rank() == 0:
                    torch.save(model.state_dict(),
                               os.path.join(cfg.OUTPUT_DIR,
                                            cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
            else:
                torch.save(model.state_dict(),
                           os.path.join(cfg.OUTPUT_DIR,
                                        cfg.MODEL.NAME + '_{}.pth'.format(epoch)))

        if epoch % eval_period == 0:
            if cfg.MODEL.DIST_TRAIN:
                if dist.get_rank() == 0:
                    model.eval()
                    for n_iter, (img, vid, camid, camids, target_view, _) in enumerate(val_loader):
                        with torch.no_grad():
                            img = img.to(device)
                            camids = camids.to(device)
                            target_view = target_view.to(device)
                            feat = model(img, cam_label=camids, view_label=target_view)
                            evaluator.update((feat, vid, camid))
                    cmc, mAP, _, _, _, _, _ = evaluator.compute()
                    logger.info("Validation Results - Epoch: {}".format(epoch))
                    logger.info("mAP: {:.1%}".format(mAP))
                    for r in [1, 5, 10]:
                        logger.info("CMC curve, Rank-{:<3}:{:.1%}".format(r, cmc[r - 1]))
                    torch.cuda.empty_cache()
            else:
                model.eval()
                for n_iter, (img, vid, camid, camids, target_view, _) in enumerate(val_loader):
                    with torch.no_grad():
                        img = img.to(device)
                        camids = camids.to(device)
                        target_view = target_view.to(device)
                        feat = model(img, cam_label=camids, view_label=target_view)
                        evaluator.update((feat, vid, camid))
                cmc, mAP, _, _, _, _, _ = evaluator.compute()
                logger.info("Validation Results - Epoch: {}".format(epoch))
                logger.info("mAP: {:.1%}".format(mAP))
                for r in [1, 5, 10]:
                    logger.info("CMC curve, Rank-{:<3}:{:.1%}".format(r, cmc[r - 1]))
                torch.cuda.empty_cache()
def do_train(cfg, model, center_criterion, train_loader, val_loader, optimizer,
             optimizer_center, scheduler, loss_fn, num_query, local_rank):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')
    _LOCAL_PROCESS_GROUP = None
    if device:
        model.to(local_rank)
        if torch.cuda.device_count() > 1 and cfg.MODEL.DIST_TRAIN:
            print('Using {} GPUs for training'.format(torch.cuda.device_count()))
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], find_unused_parameters=True)

    scaler = amp.GradScaler()
    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    # train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        scheduler.step(epoch)
        model.train()
        for n_iter, (img, vid, target_cam) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target_cam = target_cam.to(device)
            if cfg.SOLVER.FP16_ENABLED:
                # FP16 training
                with amp.autocast(enabled=True):
                    score, feat = model(img, target_cam, cam_label=None)
                    # keep the loss signature consistent with the FP32 path below
                    # (the original passed one fewer argument here)
                    loss = loss_fn(score, feat, target_cam, target_cam)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                score, feat = model(img, target_cam, cam_label=None)
                loss = loss_fn(score, feat, target_cam, target_cam)
                loss.backward()
                optimizer.step()

            if isinstance(score, list):
                acc = (score[0].max(1)[1] == target_cam).float().mean()
            else:
                acc = (score.max(1)[1] == target_cam).float().mean()

            loss_meter.update(loss.item(), img.shape[0])
            acc_meter.update(acc, 1)
            torch.cuda.synchronize()

            if (n_iter + 1) % log_period == 0:
                base_lr = scheduler._get_lr(epoch)[0] \
                    if cfg.SOLVER.WARMUP_METHOD == 'cosine' else scheduler.get_lr()[0]
                logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                            .format(epoch, (n_iter + 1), len(train_loader),
                                    loss_meter.avg, acc_meter.avg, base_lr))
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        if cfg.MODEL.DIST_TRAIN:
            pass
        else:
            logger.info("Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
                        .format(epoch, time_per_batch, train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            if cfg.MODEL.DIST_TRAIN:
                if dist.get_rank() == 0:
                    torch.save(model.module.state_dict(),
                               os.path.join(cfg.OUTPUT_DIR,
                                            cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
            else:
                torch.save(model.state_dict(),
                           os.path.join(cfg.OUTPUT_DIR,
                                        cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
def main():
    torch.backends.cudnn.deterministic = True
    cudnn.benchmark = True

    # parser = argparse.ArgumentParser(description="ReID Baseline Training")
    # parser.add_argument("--config_file", default="", help="path to config file", type=str)
    # parser.add_argument("opts", help="Modify config options using the command-line",
    #                     default=None, nargs=argparse.REMAINDER)
    # args = parser.parse_args()
    config_file = 'configs/baseline_veri_r101_a.yml'
    if config_file != "":
        cfg.merge_from_file(config_file)
    # cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    logger = setup_logger("reid_baseline", output_dir, if_train=True)
    logger.info("Saving model in the path :{}".format(cfg.OUTPUT_DIR))
    logger.info(config_file)
    if config_file != "":
        logger.info("Loaded configuration file {}".format(config_file))
        with open(config_file, 'r') as cf:
            config_str = "\n" + cf.read()
            logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))
    os.environ['CUDA_VISIBLE_DEVICES'] = cfg.MODEL.DEVICE_ID

    # load the precomputed GMS feature-matching pickles
    path = 'D:/Python_SMU/Veri/verigms/gms/'
    pkl = {}
    entries = os.listdir(path)
    for name in entries:
        with open(path + name, 'rb') as f:  # fixed: original called f.close without ()
            if name == 'featureMatrix.pkl':
                s = name[0:13]
            else:
                s = name[0:3]
            pkl[s] = pickle.load(f)
    with open('cids.pkl', 'rb') as handle:
        b = pickle.load(handle)
    with open('index.pkl', 'rb') as handle:
        c = pickle.load(handle)

    train_transforms, val_transforms, dataset, train_set, val_set = make_dataset(
        cfg, pkl_file='index.pkl')
    num_workers = cfg.DATALOADER.NUM_WORKERS
    num_classes = dataset.num_train_pids

    # map the 3-digit folder id of each image to its training pid
    pidx = {}
    for img_path, pid, _, _ in dataset.train:
        img_name = img_path.split('\\')[-1]
        folder = img_name[1:4]
        pidx[folder] = pid

    if 'triplet' in cfg.DATALOADER.SAMPLER:
        train_loader = DataLoader(train_set, batch_size=cfg.SOLVER.IMS_PER_BATCH,
                                  sampler=RandomIdentitySampler(dataset.train,
                                                                cfg.SOLVER.IMS_PER_BATCH,
                                                                cfg.DATALOADER.NUM_INSTANCE),
                                  num_workers=num_workers, pin_memory=True,
                                  collate_fn=train_collate_fn)
    elif cfg.DATALOADER.SAMPLER == 'softmax':
        print('using softmax sampler')
        train_loader = DataLoader(train_set, batch_size=cfg.SOLVER.IMS_PER_BATCH, shuffle=True,
                                  num_workers=num_workers, pin_memory=True,
                                  collate_fn=train_collate_fn)
    else:
        print('unsupported sampler! expected softmax or triplet but got {}'
              .format(cfg.DATALOADER.SAMPLER))
    print("train loader loaded successfully")

    val_loader = DataLoader(val_set, batch_size=cfg.TEST.IMS_PER_BATCH, shuffle=False,
                            num_workers=num_workers, pin_memory=True,
                            collate_fn=train_collate_fn)
    print("val loader loaded successfully")

    if cfg.MODEL.PRETRAIN_CHOICE == 'finetune':
        model = make_model(cfg, num_class=576)
        model.load_param_finetune(cfg.MODEL.PRETRAIN_PATH)
        print('Loading pretrained model for finetuning......')
    else:
        model = make_model(cfg, num_class=num_classes)

    loss_func, center_criterion = make_loss(cfg, num_classes=num_classes)
    optimizer, optimizer_center = make_optimizer(cfg, model, center_criterion)
    scheduler = WarmupMultiStepLR(optimizer, cfg.SOLVER.STEPS, cfg.SOLVER.GAMMA,
                                  cfg.SOLVER.WARMUP_FACTOR, cfg.SOLVER.WARMUP_EPOCHS,
                                  cfg.SOLVER.WARMUP_METHOD)
    print("model, optimizer, loss, scheduler loaded successfully")

    height, width = cfg.INPUT.SIZE_TRAIN
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    eval_period = cfg.SOLVER.EVAL_PERIOD
    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')
    if device:
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(torch.cuda.device_count()))
            model = nn.DataParallel(model)
        model.to(device)

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    evaluator = R1_mAP_eval(len(dataset.query), max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)
    model.base._freeze_stages()
    logger.info('Freezing the stages number:{}'.format(cfg.MODEL.FROZEN))
    data_index = search(pkl)
    print("Ready for training")

    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        evaluator.reset()
        scheduler.step()
        model.train()
        for n_iter, (img, label, index, pid, cid) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            # build an (anchor, GMS-positive, random-negative) triplet batch
            trainX = torch.zeros((train_loader.batch_size * 3, 3, height, width),
                                 dtype=torch.float32)
            trainY = torch.zeros((train_loader.batch_size * 3), dtype=torch.int64)
            for i in range(train_loader.batch_size):
                labelx = label[i]
                indexx = index[i]
                cidx = pid[i]
                if indexx > len(pkl[labelx]) - 1:
                    indexx = len(pkl[labelx]) - 1
                # positive: the image with the most GMS matches to the anchor
                a = pkl[labelx][indexx]
                minpos = np.argmin(ma.masked_where(a == 0, a))
                pos_dic = train_set[data_index[cidx][1] + minpos]
                # negative: a random image from a different identity
                neg_label = int(labelx)
                while True:
                    neg_label = random.choice(range(1, 770))
                    # fixed: original compared ints with `is not`
                    if neg_label != int(labelx) and os.path.isdir(
                            os.path.join('D:/datasets/veri-split/train', strint(neg_label))):
                        break
                negative_label = strint(neg_label)
                neg_cid = pidx[negative_label]
                neg_index = random.choice(range(0, len(pkl[negative_label])))
                neg_dic = train_set[data_index[neg_cid][1] + neg_index]
                trainX[i] = img[i]
                trainX[i + train_loader.batch_size] = pos_dic[0]
                trainX[i + (train_loader.batch_size * 2)] = neg_dic[0]
                trainY[i] = cidx
                trainY[i + train_loader.batch_size] = pos_dic[3]
                trainY[i + (train_loader.batch_size * 2)] = neg_dic[3]
            trainX = trainX.cuda()
            trainY = trainY.cuda()
            score, feat = model(trainX, trainY)
            loss = loss_func(score, feat, trainY)
            loss.backward()
            optimizer.step()
            if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                optimizer_center.step()

            acc = (score.max(1)[1] == trainY).float().mean()
            loss_meter.update(loss.item(), img.shape[0])
            acc_meter.update(acc, 1)

            if (n_iter + 1) % log_period == 0:
                logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                            .format(epoch, (n_iter + 1), len(train_loader),
                                    loss_meter.avg, acc_meter.avg, scheduler.get_lr()[0]))
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info("Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
                    .format(epoch, time_per_batch, train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(model.state_dict(),
                       os.path.join(cfg.OUTPUT_DIR, cfg.MODEL.NAME + '_{}.pth'.format(epoch)))

        if epoch % eval_period == 0:
            model.eval()
            for n_iter, (img, vid, camid, _, _) in enumerate(val_loader):
                with torch.no_grad():
                    img = img.to(device)
                    feat = model(img)
                    evaluator.update((feat, vid, camid))
            cmc, mAP, _, _, _, _, _ = evaluator.compute()
            logger.info("Validation Results - Epoch: {}".format(epoch))
            logger.info("mAP: {:.1%}".format(mAP))
            for r in [1, 5, 10]:
                logger.info("CMC curve, Rank-{:<3}:{:.1%}".format(r, cmc[r - 1]))
def do_train(cfg, model, center_criterion, train_loader, train_loader_b, val_loader,
             optimizer, optimizer_center, scheduler, loss_fn, num_query):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    eval_period = cfg.SOLVER.EVAL_PERIOD
    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')
    if device:
        model.to(device)
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(torch.cuda.device_count()))
            model = nn.DataParallel(model)

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    evaluator = R1_mAP_eval(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)
    # model.base._freeze_stages()
    logger.info('Freezing the stages number:{}'.format(cfg.MODEL.FROZEN))

    # train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        evaluator.reset()
        scheduler.step()
        model.train()
        # switch to the balanced loader for the late epochs
        loader = train_loader if epoch <= 80 else train_loader_b
        for n_iter, (img, vid) in enumerate(loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target = vid.to(device)
            if 'bdb' in cfg.MODEL.NAME:
                score, score2, feat1, feat2 = model(img, target)
                loss = loss_fn([score, score2], [feat1, feat2], target)
            else:
                score, feat = model(img, target)
                if cfg.DATALOADER.SAMPLER == 'softmax':
                    loss = F.cross_entropy(score, target)
                else:
                    loss = loss_fn(score, feat, target, model)
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()
            if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                optimizer_center.step()

            acc = (score.max(1)[1] == target).float().mean()
            loss_meter.update(loss.item(), img.shape[0])
            acc_meter.update(acc, 1)

            if (n_iter + 1) % log_period == 0:
                logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                            .format(epoch, (n_iter + 1), len(loader),
                                    loss_meter.avg, acc_meter.avg, scheduler.get_lr()[0]))
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info("Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
                    .format(epoch, time_per_batch, train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(model.state_dict(),
                       os.path.join(cfg.OUTPUT_DIR, cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
    if device:
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(torch.cuda.device_count()))
            model = nn.DataParallel(model)
        model.to(device)

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    evaluator = R1_mAP_eval(num_query, max_rank=50, feat_norm='yes')
    model.base._freeze_stages()
    logger.info('Freezing the stages number:{}'.format(-1))

    # train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        evaluator.reset()
        scheduler.step()
        model.train()
        for n_iter, (img, vid) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target = vid.to(device)
            feat = model(img, target)
            loss, score = loss_func(feat, target)
            loss.backward()
            optimizer.step()
def do_train(cfg, model, center_criterion, train_loader, val_loader, optimizer,
             optimizer_center, scheduler, loss_fn, num_query):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    eval_period = cfg.SOLVER.EVAL_PERIOD
    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS
    start_epoch = cfg.SOLVER.START_EPOCH

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')
    if device:
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(torch.cuda.device_count()))
            model = nn.DataParallel(model)
        model.to(device)

    loss_meter = AverageMeter()
    acc_meter1 = AverageMeter()
    acc_meter2 = AverageMeter()
    # acc_cam = AverageMeter()
    evaluator = R1_mAP_eval(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)
    if torch.cuda.device_count() > 1:
        model.module.base._freeze_stages()
    else:
        model.base._freeze_stages()
    logger.info('Freezing the stages number:{}'.format(cfg.MODEL.FROZEN))

    # train
    for epoch in range(start_epoch, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter1.reset()
        acc_meter2.reset()
        # acc_cam.reset()
        evaluator.reset()
        scheduler.step()
        model.train()
        for n_iter, (img, vid, _) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target = vid.to(device)
            # camid = camid.to(device)
            scores, feat = model(img, target)
            loss = loss_fn(scores, feat, target)
            loss.backward()
            optimizer.step()
            if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                optimizer_center.step()

            acc = [(score.max(1)[1] == target).float().mean() for score in scores]
            # cam_acc = (cam_score.max(1)[1] == camid).float().mean()
            loss_meter.update(loss.item(), img.shape[0])
            acc_meter1.update(acc[0].item(), 1)
            acc_meter2.update(acc[1].item(), 1)
            # acc_cam.update(cam_acc.item(), 1)

            if (n_iter + 1) % log_period == 0:
                logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc1: {:.3f}, Acc2: {:.3f}, Base Lr: {:.2e}"
                            .format(epoch, (n_iter + 1), len(train_loader),
                                    loss_meter.avg, acc_meter1.avg, acc_meter2.avg,
                                    scheduler.get_lr()[0]))
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info("Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
                    .format(epoch, time_per_batch, train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            state_dict = model.module.state_dict() if torch.cuda.device_count() > 1 \
                else model.state_dict()
            torch.save({'static_dict': state_dict,
                        'optimizer_static_dict': optimizer.state_dict()},
                       os.path.join(cfg.OUTPUT_DIR, cfg.MODEL.NAME + '_{}.pth'.format(epoch)))

        if epoch % eval_period == 0:
            model.eval()
            for n_iter, (img, vid, camid, _, _) in enumerate(val_loader):
                with torch.no_grad():
                    img = img.to(device)
                    feat = model(img)
                    evaluator.update((feat, vid, camid))
            cmc, mAP, _, _, _, _, _ = evaluator.compute()
            logger.info("Validation Results - Epoch: {}".format(epoch))
            logger.info("mAP: {:.1%}".format(mAP))
            for r in [1, 5, 10]:
                logger.info("CMC curve, Rank-{:<3}:{:.1%}".format(r, cmc[r - 1]))
def do_train(cfg, model, center_criterion, train_loader, val_loader, optimizer,
             optimizer_center, scheduler, loss_fn, num_query):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')
    if device:
        model.to(device)
        # print("number of CUDA devices:", torch.cuda.device_count())
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(torch.cuda.device_count()))
            model = nn.DataParallel(model)

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    # train
    scaler = GradScaler()
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        model.train()
        for n_iter, (img, vid) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.cuda(non_blocking=True)
            target = vid.cuda(non_blocking=True)
            with autocast():
                score, feat = model(img, target)
                loss = loss_fn(score, feat, target)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                scaler.step(optimizer_center)
            # update once per iteration, after all optimizers have stepped
            # (the original only called update() inside the center-loss branch)
            scaler.update()

            acc = (score.max(1)[1] == target).float().mean()
            loss_meter.update(loss.item(), img.shape[0])
            acc_meter.update(acc, 1)

            if (n_iter + 1) % log_period == 0:
                logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                            .format(epoch, (n_iter + 1), len(train_loader),
                                    loss_meter.avg, acc_meter.avg, scheduler.get_lr()[0]))
        scheduler.step()
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info("Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
                    .format(epoch, time_per_batch, train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(model.state_dict(),
                       os.path.join(cfg.OUTPUT_DIR, cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
def do_train(cfg, model, center_criterion, train_loader, val_loader, optimizer,
             optimizer_center, scheduler, loss_fn, num_query, writer):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    # Trace the model graph for TensorBoard with a dummy batch (on CPU,
    # before the model is moved to the GPU).
    tmp_input_data = torch.rand(
        (10, 3, cfg.INPUT.SIZE_TRAIN[0], cfg.INPUT.SIZE_TRAIN[1]))
    writer.add_graph(model, (tmp_input_data, ))

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')

    if device:
        model.to(device)
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(
                torch.cuda.device_count()))
            model = nn.DataParallel(model).cuda()
        elif cfg.SOLVER.FP16:
            model, optimizer = amp.initialize(model, optimizer,
                                              opt_level='O1')

    loss_meter = AverageMeter()
    id_loss_meter = AverageMeter()
    tri_loss_meter = AverageMeter()
    cen_loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    lr_meter = AverageMeter()

    if cfg.SOLVER.SWA:
        swa_model = torch.optim.swa_utils.AveragedModel(model)

    # train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        id_loss_meter.reset()
        tri_loss_meter.reset()
        cen_loss_meter.reset()
        lr_meter.reset()
        model.train()

        if cfg.SOLVER.GRADUAL_UNLOCK:
            model.base.gradual_unlock(cfg.SOLVER.MAX_EPOCHS, epoch)

        for n_iter, (img, vid) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target = vid.to(device)

            if cfg.DATASETS.MIXUP:
                img, target_a, target_b, lam = mixup_data(img, target)

            score, feat = model(img, target)

            if cfg.DATASETS.MIXUP:
                all_loss = mixup_criterion(loss_fn, score, feat, target_a,
                                           target_b, lam)
            else:
                all_loss = loss_fn(score, feat, target)
            loss, id_loss, tri_loss, cen_loss = all_loss

            if cfg.SOLVER.FP16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()

            if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                optimizer_center.step()

            loss_meter.update(loss.item(), img.shape[0])
            id_loss_meter.update(id_loss.item(), img.shape[0])
            if torch.is_tensor(tri_loss):
                tri_loss_meter.update(tri_loss.item(), img.shape[0])
            else:
                tri_loss_meter.update(tri_loss, 1)
            if torch.is_tensor(cen_loss):
                cen_loss_meter.update(cen_loss.item(), img.shape[0])
            else:
                cen_loss_meter.update(cen_loss, 1)
            acc = (score.max(1)[1] == target).float().mean()
            acc_meter.update(acc.item(), 1)
            lr_meter.update(scheduler.get_last_lr()[0])

            global_step = (epoch - 1) * len(train_loader) + n_iter
            writer.add_scalar('data/total_loss', loss_meter.avg, global_step)
            writer.add_scalar('data/id_loss', id_loss_meter.avg, global_step)
            writer.add_scalar('data/tri_loss', tri_loss_meter.avg, global_step)
            writer.add_scalar('data/cen_loss', cen_loss_meter.avg, global_step)
            writer.add_scalar('data/learning_rate', lr_meter.avg, global_step)

            if (n_iter + 1) % log_period == 0:
                logger.info(
                    "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                    .format(epoch, (n_iter + 1), len(train_loader),
                            loss_meter.avg, acc_meter.avg,
                            scheduler.get_last_lr()[0]))

        scheduler.step()
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info(
            "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
            .format(epoch, time_per_batch,
                    train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            src_path = os.path.join(cfg.OUTPUT_DIR,
                                    cfg.MODEL.NAME + '_{}.pth'.format(epoch))
            torch.save(model.state_dict(), src_path)
            # Best-effort backup of the checkpoint to shared storage.
            try:
                dest_root = os.path.join(
                    '/mnt/nfs-internstorage/user/zjf/NAIC2020/models',
                    cfg.SAVE_FLAG)
                if not os.path.exists(dest_root):
                    os.mkdir(dest_root)
                dst_path = os.path.join(
                    dest_root, cfg.MODEL.NAME + '_{}.pth'.format(epoch))
                shutil.copy(src_path, dst_path)
            except OSError:
                print('No backup models...')

        if cfg.SOLVER.SWA and epoch in cfg.SOLVER.SWA_START:
            swa_model.update_parameters(model)
            logger.info('swa combine the {} epoch model'.format(epoch))

    if cfg.SOLVER.SWA:
        try:
            # Recompute BatchNorm statistics for the averaged weights
            # before saving.
            swa_model.cpu()
            torch.optim.swa_utils.update_bn(train_loader, swa_model)
            swa_model.cuda()
            src_path = os.path.join(cfg.OUTPUT_DIR,
                                    cfg.MODEL.NAME + '_swa.pth')
            torch.save(swa_model.state_dict(), src_path)
            logger.info('swa model is successfully saved.')
        except Exception:
            logger.info('swa model save failed.')
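# The MIXUP branch above assumes mixup_data / mixup_criterion helpers in the
# style of Zhang et al. (2018), defined elsewhere in the repository. A
# minimal sketch matching the call sites above; the alpha default is an
# assumption, and feat is passed through to loss_fn unchanged:
import numpy as np
import torch

def mixup_data(x, y, alpha=1.0):
    """Convexly combine a batch with a shuffled copy of itself."""
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0
    index = torch.randperm(x.size(0), device=x.device)
    mixed_x = lam * x + (1 - lam) * x[index, :]
    return mixed_x, y, y[index], lam

def mixup_criterion(loss_fn, score, feat, target_a, target_b, lam):
    """Mix the two losses with the same coefficient used to mix the inputs."""
    loss_a = loss_fn(score, feat, target_a)
    loss_b = loss_fn(score, feat, target_b)
    # loss_fn returns (loss, id_loss, tri_loss, cen_loss); mix elementwise.
    return tuple(lam * a + (1 - lam) * b for a, b in zip(loss_a, loss_b))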
def do_train_xbm(cfg, model, center_criterion, train_loader, val_loader,
                 optimizer, optimizer_center, scheduler, loss_fn, num_query):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    eval_period = cfg.SOLVER.EVAL_PERIOD
    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')

    if device:
        model.to(device)
        # amp must be initialized before wrapping in DataParallel, since
        # amp.scale_loss is used unconditionally in the loop below.
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(
                torch.cuda.device_count()))
            model = nn.DataParallel(model)

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    # Cross-batch memory: a queue of 6000 features of dimension 2048.
    xbm1 = XBM(6000, 2048)

    evaluator = R1_mAP_eval(num_query,
                            max_rank=50,
                            feat_norm=cfg.TEST.FEAT_NORM)

    # model.base._freeze_stages()
    logger.info('Freezing the stages number:{}'.format(cfg.MODEL.FROZEN))

    # train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        evaluator.reset()
        scheduler.step()
        model.train()

        for n_iter, (img, vid, cps) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            cps = cps.to(device)
            target = vid.to(device)

            score, feat = model(img, target)

            if epoch >= 10:
                # Warm up for 10 epochs before mining across batches.
                xbm1.enqueue_dequeue(feat.detach(), target.detach(),
                                     cps.detach())
                xbm1_feats, xbm1_targets, xbm1_cps = xbm1.get()
                loss = (torch.nn.functional.cross_entropy(score, target) +
                        TripletLoss()(feat, (target, cps))[0] +
                        TripletLoss_XBM()(feat, xbm1_feats, (target, cps),
                                          (xbm1_targets, xbm1_cps))[0])
            else:
                loss = (torch.nn.functional.cross_entropy(score, target) +
                        TripletLoss()(feat, (target, cps))[0])
            # xbm_loss = TripletLoss()(xbm_feats, xbm_targets)
            # loss = (loss + xbm_loss[0])
            # loss = loss_fn(score, feat, target)

            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            # loss.backward()
            optimizer.step()

            if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                optimizer_center.step()

            if isinstance(score, list):
                acc = (score[0].max(1)[1] == target).float().mean()
            else:
                acc = (score.max(1)[1] == target).float().mean()
            loss_meter.update(loss.item(), img.shape[0])
            acc_meter.update(acc.item(), 1)

            if (n_iter + 1) % log_period == 0:
                logger.info(
                    "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                    .format(epoch, (n_iter + 1), len(train_loader),
                            loss_meter.avg, acc_meter.avg,
                            scheduler.get_lr()[0]))

        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info(
            "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
            .format(epoch, time_per_batch,
                    train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(
                model.state_dict(),
                os.path.join(cfg.OUTPUT_DIR,
                             cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
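# XBM above is a cross-batch memory in the spirit of Wang et al. (CVPR 2020):
# a fixed-size buffer of detached features from recent batches that enlarges
# the pool of negatives for the triplet loss. A minimal sketch matching the
# enqueue_dequeue / get interface used above; the cps slot mirrors this
# repository's extra per-sample label, whose exact meaning is assumed:
import torch

class XBM:
    def __init__(self, size, feat_dim):
        self.size = size
        self.feats = torch.zeros(size, feat_dim).cuda()
        self.targets = torch.zeros(size, dtype=torch.long).cuda()
        self.cps = torch.zeros(size, dtype=torch.long).cuda()
        self.ptr = 0
        self.is_full = False

    def enqueue_dequeue(self, feats, targets, cps):
        n = feats.size(0)
        if self.ptr + n > self.size:
            # Wrap around, overwriting the oldest entries.
            self.ptr = 0
            self.is_full = True
        self.feats[self.ptr:self.ptr + n] = feats
        self.targets[self.ptr:self.ptr + n] = targets
        self.cps[self.ptr:self.ptr + n] = cps
        self.ptr += n

    def get(self):
        end = self.size if self.is_full else self.ptr
        return self.feats[:end], self.targets[:end], self.cps[:end]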