def train(self, epochs):
    """Train on SICK for up to ``epochs`` epochs.

    Runs one training pass per epoch, evaluates on the dev set, checkpoints
    whenever dev accuracy improves, and stops early once the dev loss
    plateaus (changes by <= 0.0002 between epochs). The LR scheduler steps
    on dev accuracy (mode='max').
    """
    scheduler = ReduceLROnPlateau(self.optimizer, mode='max',
                                  factor=self.lr_reduce_factor,
                                  patience=self.patience)
    durations = []
    last_loss = -1
    best_dev_score = -1
    for epoch in range(1, epochs + 1):
        epoch_start = time.time()
        self.logger.info('Epoch {} started...'.format(epoch))
        self.train_epoch(epoch)
        accuracy, f1, dev_loss = self.evaluate(self.dev_evaluator, 'dev')
        if self.use_tensorboard:
            current_lr = self.optimizer.param_groups[0]['lr']
            for tag, value in (('sick/lr', current_lr),
                               ('sick/dev/accuracy', accuracy),
                               ('sick/dev/f1', f1),
                               ('sick/dev/cross_entropy_loss', dev_loss)):
                self.writer.add_scalar(tag, value, epoch)
        elapsed = time.time() - epoch_start
        self.logger.info('Epoch {} finished in {:.2f} minutes'.format(epoch, elapsed / 60))
        durations.append(elapsed)
        if accuracy > best_dev_score:
            best_dev_score = accuracy
            save_checkpoint(epoch, self.model.arch, self.model.state_dict(),
                            self.optimizer.state_dict(), best_dev_score,
                            self.model_outfile)
        if abs(last_loss - dev_loss) <= 0.0002:
            self.logger.info('Early stopping. Loss changed by less than 0.0002.')
            break
        last_loss = dev_loss
        scheduler.step(accuracy)
    self.logger.info('Training took {:.2f} minutes overall...'.format(sum(durations) / 60))
def train(self, epochs):
    """Train on STS for up to ``epochs`` epochs.

    Each epoch runs one training pass, evaluates once on the dev set,
    checkpoints when dev Pearson r improves, and stops early the first time
    the dev loss increases. The LR scheduler steps on dev Pearson r
    (mode='max').
    """
    scheduler = ReduceLROnPlateau(self.optimizer, mode='max',
                                  factor=self.lr_reduce_factor,
                                  patience=self.patience)
    epoch_times = []
    # FIX: was initialized to -1, which made the early-stop test below
    # (`prev_loss < new_loss and prev_loss != 0`) fire after the very first
    # epoch for any positive loss. 0 is the sentinel the `!= 0` guard expects.
    prev_loss = 0
    best_dev_score = -1
    for epoch in range(1, epochs + 1):
        start = time.time()
        self.logger.info('Epoch {} started...'.format(epoch))
        self.train_epoch(epoch)
        # FIX: evaluate the dev set once. The original called self.evaluate()
        # twice per epoch (once for `dev_scores`, once for the unpacked
        # tuple), doubling evaluation cost for identical results.
        pearson, spearman, new_loss = self.evaluate(self.dev_evaluator, 'dev')
        if self.use_tensorboard:
            self.writer.add_scalar('sts/lr', self.optimizer.param_groups[0]['lr'], epoch)
            self.writer.add_scalar('sts/dev/pearson_r', pearson, epoch)
            self.writer.add_scalar('sts/dev/kl_div_loss', new_loss, epoch)
        end = time.time()
        duration = end - start
        self.logger.info('Epoch {} finished in {:.2f} minutes'.format(epoch, duration / 60))
        epoch_times.append(duration)
        if pearson > best_dev_score:
            best_dev_score = pearson
            save_checkpoint(epoch, self.model.arch, self.model.state_dict(),
                            self.optimizer.state_dict(), best_dev_score,
                            self.model_outfile)
        # stop as soon as the dev loss goes up (skipped on the first epoch,
        # when prev_loss is still the 0 sentinel)
        if prev_loss < new_loss and prev_loss != 0:
            self.logger.info('Early stopping.')
            break
        prev_loss = new_loss
        scheduler.step(pearson)
    self.logger.info('Training took {:.2f} minutes overall...'.format(sum(epoch_times) / 60))
def fit(self, train_data, test_data, optimizer, criterion, lr_scheduler):
    """Train ``self.net`` for ``self.opt.train.num_epochs`` epochs.

    Per epoch: reset the loss/accuracy meters, set the learning rate from
    ``lr_scheduler``, run one pass over ``train_data``, optionally evaluate
    on ``test_data`` every ``eval_step`` epochs, and checkpoint every
    ``save_step`` epochs (flagging the checkpoint as best when the test
    accuracy improved).
    """
    best_test_acc = -np.inf
    for epoch in range(self.opt.train.num_epochs):
        self.loss.reset()
        self.acc.reset()
        self.net.train()
        # scheduler computes the LR for this epoch; apply it to every group
        lr = lr_scheduler.update(epoch)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        logging.info('Epoch [%d] learning rate update to %.3e' % (epoch, lr))
        tic = time.time()   # epoch timer
        btic = time.time()  # per-log-interval timer (for samples/sec)
        for i, data in enumerate(train_data):
            imgs, labels = data
            labels = labels.cuda()
            scores = self.net(imgs)
            loss = criterion(scores, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            self.loss.add(loss.item())
            # batch accuracy: fraction of argmax predictions matching labels
            acc = (scores.max(1)[1] == labels.long()).float().mean()
            self.acc.add(acc.item())
            if self.opt.misc.log_interval and not (i + 1) % self.opt.misc.log_interval:
                loss_mean = self.loss.value()[0]
                acc_mean = self.acc.value()[0]
                logging.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\tloss=%f\t'
                             'acc=%f' % (
                                 epoch, i + 1,
                                 train_data.batch_size * self.opt.misc.log_interval /
                                 (time.time() - btic), loss_mean, acc_mean))
                btic = time.time()
        # end-of-epoch summary from the running meters
        loss_mean = self.loss.value()[0]
        acc_mean = self.acc.value()[0]
        throughput = int(train_data.batch_size * len(train_data) / (time.time() - tic))
        logging.info('[Epoch %d] training: loss=%f\tacc=%f' % (
            epoch, loss_mean, acc_mean))
        logging.info('[Epoch %d] speed: %d samples/sec\ttime cost: %f' %
                     (epoch, throughput, time.time() - tic))
        is_best = False
        if test_data is not None and self.opt.misc.eval_step and not (epoch + 1) % self.opt.misc.eval_step:
            test_acc = self.test_func(test_data)
            logging.info('[Epoch %d] test acc: %f' % (epoch, test_acc))
            is_best = test_acc > best_test_acc
            if is_best:
                best_test_acc = test_acc
        # NOTE(review): assumes self.net is wrapped (e.g. DataParallel) so
        # .module exists — confirm against the trainer's setup.
        state_dict = self.net.module.state_dict()
        if not (epoch + 1) % self.opt.misc.save_step:
            save_checkpoint({
                'state_dict': state_dict,
                'epoch': epoch + 1,
            }, is_best=is_best, save_dir=self.opt.misc.save_dir,
                filename='model' + '.pth.tar')
def train(self, epochs):
    """Train for up to ``epochs`` epochs, tracking dev MAP/MRR.

    Checkpoints whenever dev mean average precision improves and stops
    early once the dev loss plateaus (changes by <= 0.0002). An LR
    scheduler (stepping on dev MAP, mode='max') is used only when a real
    reduce factor was configured.
    """
    scheduler = None
    if self.lr_reduce_factor not in (1, None):
        scheduler = ReduceLROnPlateau(self.optimizer, mode='max',
                                      factor=self.lr_reduce_factor,
                                      patience=self.patience)
    durations = []
    last_loss = -1
    best_dev_score = -1
    for epoch in range(1, epochs + 1):
        started = time.time()
        self.logger.info('Epoch {} started...'.format(epoch))
        self.train_epoch(epoch)
        dev_map, dev_mrr, dev_loss = self.evaluate(
            self.dev_evaluator, 'dev', epoch, self.logOutput)
        if self.use_tensorboard:
            prefix = self.train_loader.dataset.NAME
            for tag, value in (('{}/lr'.format(prefix),
                                self.optimizer.param_groups[0]['lr']),
                               ('{}/dev/cross_entropy_loss'.format(prefix), dev_loss),
                               ('{}/dev/map'.format(prefix), dev_map),
                               ('{}/dev/mrr'.format(prefix), dev_mrr)):
                self.writer.add_scalar(tag, value, epoch)
        elapsed = time.time() - started
        self.logger.info('Epoch {} finished in {:.2f} minutes'.format(
            epoch, elapsed / 60))
        durations.append(elapsed)
        if dev_map > best_dev_score:
            best_dev_score = dev_map
            save_checkpoint(epoch, self.model.arch, self.model.state_dict(),
                            self.optimizer.state_dict(), best_dev_score,
                            self.model_outfile)
        if abs(last_loss - dev_loss) <= 0.0002:
            self.logger.info('Early stopping. Loss changed by less than 0.0002.')
            break
        last_loss = dev_loss
        if scheduler is not None:
            scheduler.step(dev_map)
    self.logger.info('Training took {:.2f} minutes overall...'.format(
        sum(durations) / 60))
def main(args):
    """Train a metric-learning model for 50 epochs and plot the loss curve.

    Builds the model (DataParallel, on GPU), an Adam optimizer and a
    contrastive criterion, trains epoch by epoch, checkpoints every
    ``args.save_step`` epochs (and after epoch 0), then plots the per-epoch
    losses with matplotlib.
    """
    # s_ = time.time()
    save_dir = args.save_dir  # directory for checkpoints and the log file
    mkdir_if_missing(save_dir)  # ensure the save directory exists (utils)
    sys.stdout = logging.Logger(os.path.join(save_dir, 'log.txt'))
    display(args)  # print the current training configuration
    start = 0
    # build the model; pretrained=True would load an existing pretrained
    # model instead (see models.create)
    model = models.create(args.net, pretrained=False, model_path=None,
                          normalized=True)
    model = torch.nn.DataParallel(model)  # data-parallel training
    model = model.cuda()
    # FIX: use_gpu was referenced in the save branch below but never defined
    use_gpu = torch.cuda.is_available()
    print('initial model is save at %s' % save_dir)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.weight_decay)
    # contrastive loss with separate margins for same / different pairs
    criterion = losses.create(args.loss, margin_same=args.margin_same,
                              margin_diff=args.margin_diff).cuda()
    # dataset; set_name selects the "test" or "train" split
    data = DataSet.create(name=args.data, root=args.data_root,
                          set_name=args.set_name)
    train_loader = torch.utils.data.DataLoader(
        data.train, batch_size=args.batch_size, shuffle=True,
        drop_last=True, pin_memory=True, num_workers=args.nThreads)
    # FIX: losses_ was appended to but never initialized (NameError)
    losses_ = []
    for epoch in range(start, 50):  # args.epochs
        L = train(epoch=epoch, model=model, criterion=criterion,
                  optimizer=optimizer, train_loader=train_loader, args=args)
        losses_.append(L)
        if (epoch + 1) % args.save_step == 0 or epoch == 0:
            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint({
                'state_dict': state_dict,
                'epoch': (epoch + 1),
            }, is_best=False,
                fpath=osp.join(args.save_dir,
                               'ckp_ep' + str(epoch + 1) + '.pth.tar'))
    # plot the per-epoch loss curve
    batch_nums = range(1, len(losses_) + 1)
    import matplotlib.pyplot as plt
    plt.plot(batch_nums, losses_)
    plt.show()
def train(self, epochs):
    """Train on STS-2014 for up to ``epochs`` epochs.

    Checkpoints whenever dev Pearson r improves and stops early once the
    dev loss plateaus (changes by <= 0.0002). An LR scheduler (stepping on
    the dev loss, mode='min') is used only when a real reduce factor was
    configured.
    """
    scheduler = None
    if self.lr_reduce_factor != 1 and self.lr_reduce_factor is not None:
        scheduler = ReduceLROnPlateau(self.optimizer, mode='min',
                                      factor=self.lr_reduce_factor,
                                      patience=self.patience)
    epoch_times = []
    prev_loss = -1
    best_dev_score = -1
    for epoch in range(1, epochs + 1):
        # FIX: removed the bare `scheduler.step()` that ran here. It raised
        # AttributeError whenever scheduler is None (the guard above allows
        # that), and ReduceLROnPlateau.step() requires a metrics argument
        # anyway; the guarded scheduler.step(new_loss) at the end of the
        # loop body is the correct (and only needed) call.
        start = time.time()
        self.logger.info('Epoch {} started...'.format(epoch))
        self.train_epoch(epoch)
        pearson, spearman, mse, new_loss = self.evaluate(
            self.dev_evaluator, 'dev', epoch, self.logOutput)
        if self.use_tensorboard:
            self.writer.add_scalar('sts2014/lr',
                                   self.optimizer.param_groups[0]['lr'], epoch)
            self.writer.add_scalar('sts2014/dev/pearson_r', pearson, epoch)
            self.writer.add_scalar('sts2014/dev/kl_div_loss', new_loss, epoch)
        end = time.time()
        duration = end - start
        self.logger.info('Epoch {} finished in {:.2f} minutes'.format(
            epoch, duration / 60))
        epoch_times.append(duration)
        if pearson > best_dev_score:
            best_dev_score = pearson
            save_checkpoint(epoch, self.model.arch, self.model.state_dict(),
                            self.optimizer.state_dict(), best_dev_score,
                            self.model_outfile)
        if abs(prev_loss - new_loss) <= 0.0002:
            self.logger.info(
                'Early stopping. Loss changed by less than 0.0002.')
            break
        prev_loss = new_loss
        if scheduler is not None:
            scheduler.step(new_loss)
    self.logger.info('Training took {:.2f} minutes overall...'.format(
        sum(epoch_times) / 60))
def main(args):
    """Train the GCN link-prediction model, checkpointing after every epoch.

    Seeds numpy/torch, redirects stdout to a log file, builds the feeder
    dataset and loader, then trains for ``args.epochs`` epochs with SGD and
    cross-entropy. Weights are saved before training (epoch 0) and after
    each completed epoch as ``epoch_<n>.ckpt``.
    """
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    cudnn.benchmark = True
    sys.stdout = Logger(osp.join(args.logs_dir, 'log.txt'))

    dataset = Feeder(args.feat_path, args.knn_graph_path, args.label_path,
                     args.seed, args.k_at_hop, args.active_connection)
    loader = DataLoader(dataset, batch_size=args.batch_size,
                        num_workers=args.workers, shuffle=True,
                        pin_memory=True)

    net = model.gcn().cuda()
    opt = torch.optim.SGD(net.parameters(), args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    criterion = nn.CrossEntropyLoss().cuda()

    def snapshot(completed_epochs):
        # persist current weights as epoch_<n>.ckpt
        save_checkpoint({
            'state_dict': net.state_dict(),
            'epoch': completed_epochs,
        }, False, fpath=osp.join(args.logs_dir,
                                 'epoch_{}.ckpt'.format(completed_epochs)))

    snapshot(0)
    for epoch in range(args.epochs):
        adjust_lr(opt, epoch)
        train(loader, net, criterion, opt, epoch)
        snapshot(epoch + 1)
def main(args):
    """Fine-tune a pretrained embedding model with identity-based sampling.

    Optionally resumes from ``args.resume``, wraps the model in
    DataParallel on GPU, optionally freezes BatchNorm, builds two optimizer
    parameter groups (pretrained base vs. fresh classifier), then trains
    for ``args.epochs`` epochs, checkpointing every ``args.save_step``
    epochs (and after epoch 0).
    """
    # s_ = time.time()
    save_dir = args.save_dir
    mkdir_if_missing(save_dir)
    sys.stdout = logging.Logger(os.path.join(save_dir, 'log.txt'))
    display(args)
    start = 0
    model = models.create(args.net, pretrained=True, dim=args.dim)
    # resume from a checkpoint if one was given (the original also computed
    # an unused model_dict in the non-resume branch; removed)
    if args.resume is not None:
        print('load model from {}'.format(args.resume))
        chk_pt = load_checkpoint(args.resume)
        weight = chk_pt['state_dict']
        start = chk_pt['epoch']
        model.load_state_dict(weight)
    model = torch.nn.DataParallel(model)
    model = model.cuda()
    # FIX: use_gpu was referenced in the save branch below but never defined
    use_gpu = torch.cuda.is_available()
    # freeze BN
    if args.freeze_BN is True:
        print(40 * '#', '\n BatchNorm frozen')
        model.apply(set_bn_eval)
    else:
        print(40 * '#', 'BatchNorm NOT frozen')
    # Fine-tune the model: the learning rate for pre-trained parameter is 1/10
    new_param_ids = set(map(id, model.module.classifier.parameters()))
    new_params = [
        p for p in model.module.parameters() if id(p) in new_param_ids
    ]
    base_params = [
        p for p in model.module.parameters() if id(p) not in new_param_ids
    ]
    param_groups = [{
        'params': base_params,
        'lr_mult': 0.0
    }, {
        'params': new_params,
        'lr_mult': 1.0
    }]
    print('initial model is save at %s' % save_dir)
    optimizer = torch.optim.Adam(param_groups, lr=args.lr,
                                 weight_decay=args.weight_decay)
    criterion = losses.create(args.loss, margin=args.margin,
                              alpha=args.alpha, base=args.loss_base).cuda()
    # Decor_loss = losses.create('decor').cuda()
    data = DataSet.create(args.data, ratio=args.ratio, width=args.width,
                          origin_width=args.origin_width, root=args.data_root)
    train_loader = torch.utils.data.DataLoader(
        data.train,
        batch_size=args.batch_size,
        sampler=FastRandomIdentitySampler(data.train,
                                          num_instances=args.num_instances),
        drop_last=True,
        pin_memory=True,
        num_workers=args.nThreads)
    # save the train information
    for epoch in range(start, args.epochs):
        train(epoch=epoch, model=model, criterion=criterion,
              optimizer=optimizer, train_loader=train_loader, args=args)
        if epoch == 1:
            # FIX: the key was misspelled 'lr_mul', so the base (pretrained)
            # parameters' multiplier was never actually raised; 'lr_mult' is
            # the key defined in param_groups above.
            optimizer.param_groups[0]['lr_mult'] = 0.1
        if (epoch + 1) % args.save_step == 0 or epoch == 0:
            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint({
                'state_dict': state_dict,
                'epoch': (epoch + 1),
            }, is_best=False,
                fpath=osp.join(args.save_dir,
                               'ckp_ep' + str(epoch + 1) + '.pth.tar'))
def train(**kwargs):
    """Train a re-id model on a combined / unreal / single dataset.

    Parses CLI overrides into the global ``opt``, seeds all RNGs, builds the
    dataset, identity-balanced sampler and loader, constructs the model
    (optionally loading MoCo or previous weights), picks a softmax or
    triplet criterion per ``opt.loss``, then trains for ``opt.max_epoch``
    epochs, checkpointing every ``opt.save_step`` epochs and once at the end.
    """
    opt._parse(kwargs)
    # torch.backends.cudnn.deterministic = True  # I think this line may slow down the training process
    # set random seed and cudnn benchmark
    torch.manual_seed(opt.seed)
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    use_gpu = torch.cuda.is_available()
    sys.stdout = Logger(
        os.path.join('./pytorch-ckpt/current', opt.save_dir, 'log_train.txt'))
    if use_gpu:
        print('currently using GPU')
        cudnn.benchmark = True
    else:
        print('currently using cpu')
    print(opt._state_dict())
    print('initializing dataset {}'.format(opt.trainset_name))
    if opt.trainset_name == 'combine':
        # input dataset name as 'datasets'
        train_dataset = data_manager.init_combine_dataset(
            name=opt.trainset_name,
            options=opt,
            datasets=opt.datasets,
            num_bn_sample=opt.batch_num_bn_estimatation * opt.test_batch,
            share_cam=opt.share_cam,
            num_pids=opt.num_pids)
    elif opt.trainset_name == 'unreal':
        # input dataset dir in 'datasets'
        train_dataset = data_manager.init_unreal_dataset(
            name=opt.trainset_name,
            datasets=opt.datasets,
            num_pids=opt.num_pids,
            num_cams=opt.num_cams,
            img_per_person=opt.img_per_person)
    else:
        train_dataset = data_manager.init_dataset(
            name=opt.trainset_name,
            num_bn_sample=opt.batch_num_bn_estimatation * opt.test_batch,
            num_pids=opt.num_pids)
    pin_memory = True if use_gpu else False
    summary_writer = SummaryWriter(
        os.path.join('./pytorch-ckpt/current', opt.save_dir,
                     'tensorboard_log'))
    # choose a camera-balanced or plain identity sampler
    if opt.cam_bal:
        IDSampler = IdentityCameraSampler
    else:
        IDSampler = IdentitySampler
    if opt.trainset_name == 'combine':
        samp = IDSampler(train_dataset.train, opt.train_batch,
                         opt.num_instances, train_dataset.cams_of_dataset,
                         train_dataset.len_of_real_dataset)
    else:
        samp = IDSampler(train_dataset.train, opt.train_batch,
                         opt.num_instances)
    trainloader = DataLoader(data_manager.init_datafolder(
        opt.trainset_name, train_dataset.train,
        TrainTransform(opt.height, opt.width)),
        sampler=samp,
        batch_size=opt.train_batch,
        num_workers=opt.workers,
        pin_memory=pin_memory,
        drop_last=True,
        collate_fn=NormalCollateFn())
    print('initializing model ...')
    # softmax training needs a classifier head sized to the number of ids
    num_pid = train_dataset.num_train_pids if opt.loss == 'softmax' else None
    model = ResNetBuilder(num_pid)
    if opt.model_path is not None and 'moco' in opt.model_path:
        model = load_moco_model(model, opt.model_path)
    elif opt.model_path is not None:
        model = load_previous_model(model, opt.model_path,
                                    load_fc_layers=False)
    optim_policy = model.get_optim_policy()
    print('model size: {:.5f}M'.format(
        sum(p.numel() for p in model.parameters()) / 1e6))
    if use_gpu:
        model = CamDataParallel(model).cuda()
    xent = nn.CrossEntropyLoss()
    triplet = TripletLoss()

    def standard_cls_criterion(feat, preditions, targets, global_step,
                               summary_writer):
        # cross-entropy on id predictions; logs loss and accuracy
        identity_loss = xent(preditions, targets)
        identity_accuracy = torch.mean(
            (torch.argmax(preditions, dim=1) == targets).float())
        summary_writer.add_scalar('cls_loss', identity_loss.item(),
                                  global_step)
        summary_writer.add_scalar('cls_accuracy', identity_accuracy.item(),
                                  global_step)
        return identity_loss

    def triplet_criterion(feat, preditons, targets, global_step,
                          summary_writer):
        # triplet loss on embeddings; logs loss and accuracy
        triplet_loss, acc = triplet(feat, targets)
        summary_writer.add_scalar('loss', triplet_loss.item(), global_step)
        print(np.mean(acc.item()))
        summary_writer.add_scalar('accuracy', acc.item(), global_step)
        return triplet_loss

    # get trainer and evaluator
    optimizer, adjust_lr = get_our_optimizer_strategy(opt, optim_policy)
    # NOTE(review): crit is unbound if opt.loss is neither 'softmax' nor
    # 'triplet' — presumably those are the only supported values; confirm.
    if opt.loss == 'softmax':
        crit = standard_cls_criterion
    elif opt.loss == 'triplet':
        crit = triplet_criterion
    reid_trainer = CameraClsTrainer(opt, model, optimizer, crit,
                                    summary_writer)
    print('Start training')
    for epoch in range(opt.max_epoch):
        adjust_lr(optimizer, epoch)
        reid_trainer.train(epoch, trainloader)
        if (epoch + 1) % opt.save_step == 0:
            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint({
                'state_dict': state_dict,
                'epoch': epoch + 1,
            }, save_dir=os.path.join('./pytorch-ckpt/current', opt.save_dir),
                ep=epoch + 1)
        # if (epoch+1)%15==0:
        #     save_checkpoint({
        #         'state_dict': state_dict,
        #         'epoch': epoch + 1,
        #     }, save_dir=os.path.join('./pytorch-ckpt/current', opt.save_dir))
    # final save after the last epoch (epoch keeps its last loop value)
    if use_gpu:
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    save_checkpoint({
        'state_dict': state_dict,
        'epoch': epoch + 1,
    }, save_dir=os.path.join('./pytorch-ckpt/current', opt.save_dir))
def main():
    """Train the BagReID_IBN model with label-smoothed CE + triplet losses.

    Recreates the log directory, builds data loaders, model, criteria,
    optimizer(s) and LR schedulers, then trains for the configured number of
    epochs, stepping the schedulers each epoch and checkpointing every
    ``EVAL_STEP`` epochs and at the final epoch.
    """
    global xent_criterion, triplet_criterion, ment_criterion
    logger.info("init done")
    # start each run with a fresh log directory
    if os.path.exists(cfg.TRAIN.LOG_DIR):
        shutil.rmtree(cfg.TRAIN.LOG_DIR)
    os.makedirs(cfg.TRAIN.LOG_DIR)
    init_log('global', logging.INFO)
    if cfg.TRAIN.LOG_DIR:
        add_file_handler('global',
                         os.path.join(cfg.TRAIN.LOG_DIR, 'logs.txt'),
                         logging.INFO)
    dataset, train_loader, _, _ = build_data_loader()
    model = BagReID_IBN(dataset.num_train_pids, dataset.num_train_mates)
    # the three global criteria used during training
    xent_criterion = CrossEntropyLabelSmooth(dataset.num_train_pids)
    triplet_criterion = TripletLoss(margin=cfg.TRAIN.TRI_MARGIN)
    ment_criterion = CrossEntropyMate(cfg.TRAIN.MATE_LOSS_WEIGHT)
    if cfg.TRAIN.OPTIM == "sgd":
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=cfg.SOLVER.LEARNING_RATE,
                                    momentum=cfg.SOLVER.MOMENTUM,
                                    weight_decay=cfg.SOLVER.WEIGHT_DECAY)
    else:
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=cfg.SOLVER.LEARNING_RATE,
                                     weight_decay=cfg.SOLVER.WEIGHT_DECAY)
    optimizers = [optimizer]
    schedulers = build_lr_schedulers(optimizers)
    if cfg.CUDA:
        model.cuda()
        if torch.cuda.device_count() > 1:
            model = DataParallel(model)
    if cfg.TRAIN.LOG_DIR:
        summary_writer = SummaryWriter(cfg.TRAIN.LOG_DIR)
    else:
        summary_writer = None
    logger.info("model prepare done")
    start_epoch = cfg.TRAIN.START_EPOCH
    # start training
    for epoch in range(start_epoch, cfg.TRAIN.NUM_EPOCHS):
        # NOTE(review): `criterion` is not defined anywhere in this function
        # or visible at module scope — the globals declared above are
        # xent_criterion / triplet_criterion / ment_criterion. This looks
        # like a NameError at runtime; confirm what train() expects here.
        train(epoch, train_loader, model, criterion, optimizers,
              summary_writer)
        for scheduler in schedulers:
            scheduler.step()
        # skip if not save model
        if cfg.TRAIN.EVAL_STEP > 0 and (epoch + 1) % cfg.TRAIN.EVAL_STEP == 0 \
                or (epoch + 1) == cfg.TRAIN.NUM_EPOCHS:
            if cfg.CUDA and torch.cuda.device_count() > 1:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint({
                'state_dict': state_dict,
                'epoch': epoch + 1
            }, is_best=False, save_dir=cfg.TRAIN.SNAPSHOT_DIR,
                filename='checkpoint_ep' + str(epoch + 1) + '.pth.tar')
def fit(self, train_data, test_data, num_query, optimizer, criterion,
        lr_scheduler):
    """Train ``self.net`` (re-id model) for the configured number of epochs.

    Per epoch: reset the loss/accuracy meters, set the learning rate from
    ``lr_scheduler``, run one pass over ``train_data`` (criterion takes
    classification scores, features and labels), optionally evaluate rank-1
    on ``test_data`` every ``eval_step`` epochs, and checkpoint every
    ``save_step`` epochs, flagging the best rank-1 so far.
    """
    best_rank1 = -np.inf
    for epoch in range(self.opt.train.num_epochs):
        self.loss.reset()
        self.acc.reset()
        self.net.train()
        # update learning rate
        lr = lr_scheduler.update(epoch)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        logging.info('learning rate update to {:.3e}'.format(lr))
        tic = time.time()   # epoch timer
        btic = time.time()  # per-log-interval timer (for samples/sec)
        for i, inputs in enumerate(train_data):
            data, pids, _ = inputs
            label = pids.cuda()
            score, feat = self.net(data)
            loss = criterion(score, feat, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            self.loss.update(loss.item())
            # batch accuracy from the classification head
            acc = (score.max(1)[1] == label.long()).float().mean().item()
            self.acc.update(acc)
            log_interval = self.opt.misc.log_interval
            if log_interval and not (i + 1) % log_interval:
                loss_name, loss_value = self.loss.get()
                metric_name, metric_value = self.acc.get()
                logging.info(
                    'Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\t'
                    '%s=%f' % (epoch, i + 1,
                               train_data.batch_size * log_interval /
                               (time.time() - btic), loss_name, loss_value,
                               metric_name, metric_value))
                btic = time.time()
        # end-of-epoch summary from the running meters
        loss_name, loss_value = self.loss.get()
        metric_name, metric_value = self.acc.get()
        throughput = int(train_data.batch_size * len(train_data) /
                         (time.time() - tic))
        logging.info(
            '[Epoch %d] training: %s=%f\t%s=%f' %
            (epoch, loss_name, loss_value, metric_name, metric_value))
        logging.info('[Epoch %d] speed: %d samples/sec\ttime cost: %f' %
                     (epoch, throughput, time.time() - tic))
        is_best = False
        if test_data is not None and self.opt.misc.eval_step and not (
                epoch + 1) % self.opt.misc.eval_step:
            rank1 = self.test_func(test_data, num_query)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
        # NOTE(review): assumes self.net is wrapped (e.g. DataParallel) so
        # .module exists — confirm against the trainer's setup.
        state_dict = self.net.module.state_dict()
        if not (epoch + 1) % self.opt.misc.save_step:
            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'epoch': epoch + 1,
                },
                is_best=is_best,
                save_dir=self.opt.misc.save_dir,
                filename=self.opt.network.name + str(epoch + 1) +
                '.pth.tar')
def train(args, network, train_data, valid_data, optimizer, criterion,
          device, log_path, label2name):
    """Train ``network`` for 120 epochs, logging to stdout and ``log_path``.

    Uses a fixed step LR schedule (decay at epochs 30 and 60), evaluates on
    ``valid_data`` each epoch when provided, and checkpoints every
    ``args.save_step`` epochs; ``label2name`` is stored in the checkpoint so
    predictions can be mapped back to class names.
    """
    lr_scheduler = LRScheduler(base_lr=0.01, step=(30, 60), factor=0.1)
    network = network.to(device)
    best_test_acc = -np.inf
    losses = AverageValueMeter()
    acces = AverageValueMeter()
    for epoch in range(120):
        losses.reset()
        acces.reset()
        network.train()
        # apply this epoch's learning rate to every parameter group
        lr = lr_scheduler.update(epoch)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        # print_str = 'Epoch [%d] learning rate update to %.3e' % (epoch, lr)
        # print(print_str)
        # with open(log_path, 'a') as f: f.write(print_str + '\n')
        tic = time.time()
        for i, data in enumerate(train_data):
            imgs, labels = data
            imgs = imgs.to(device)
            labels = labels.to(device)
            scores = network(imgs)
            loss = criterion(scores, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.add(loss.item())
            # batch accuracy: argmax predictions vs labels
            acc = (scores.max(1)[1] == labels.long()).float().mean()
            acces.add(acc.item())
            if (i + 1) % args.log_interval == 0:
                loss_mean = losses.value()[0]
                acc_mean = acces.value()[0]
                print_str = 'Epoch[%d] Batch [%d]\tloss=%f\tacc=%f' % (
                    epoch, i + 1, loss_mean, acc_mean)
                print(print_str)
                with open(log_path, 'a') as f:
                    f.write(print_str + '\n')
                # btic is assigned but no longer read — the samples/sec
                # message that used it appears to have been removed
                btic = time.time()
        # end-of-epoch summary from the running meters
        loss_mean = losses.value()[0]
        acc_mean = acces.value()[0]
        print_str = '[Epoch %d] Training: loss=%f\tacc=%f\ttime cost: %.3f' % (
            epoch, loss_mean, acc_mean, time.time() - tic)
        print(print_str)
        with open(log_path, 'a') as f:
            f.write(print_str + '\n')
        is_best = False
        if valid_data is not None:
            test_acc = test(network, valid_data, device)
            print_str = '[Epoch %d] test acc: %f' % (epoch, test_acc)
            print(print_str)
            with open(log_path, 'a') as f:
                f.write(print_str + '\n')
            is_best = test_acc > best_test_acc
            if is_best:
                best_test_acc = test_acc
        state_dict = network.state_dict()
        if (epoch + 1) % args.save_step == 0:
            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'epoch': epoch + 1,
                    'label2name': label2name,
                },
                is_best=is_best,
                save_dir=os.path.join(args.save_dir, 'models'),
                filename='model' + '.pth')
def train(**kwargs):
    """Train a softmax re-id classifier with identity-balanced batches.

    Parses CLI overrides into the global ``opt``, seeds all RNGs, builds the
    dataset/loader and model, then trains for ``opt.max_epoch`` epochs and
    saves the final weights.
    """
    opt._parse(kwargs)
    # torch.backends.cudnn.deterministic = True  # I think this line may slow down the training process
    # set random seed and cudnn benchmark
    torch.manual_seed(opt.seed)
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    use_gpu = torch.cuda.is_available()
    sys.stdout = Logger(
        os.path.join('./pytorch-ckpt/current', opt.save_dir,
                     'log_train.txt'))
    if use_gpu:
        print('currently using GPU')
        cudnn.benchmark = True
    else:
        print('currently using cpu')
    print('initializing dataset {}'.format(opt.trainset_name))
    train_dataset = data_manager.init_dataset(
        name=opt.trainset_name,
        num_bn_sample=opt.batch_num_bn_estimatation * opt.test_batch)
    pin_memory = True if use_gpu else False
    summary_writer = SummaryWriter(
        os.path.join('./pytorch-ckpt/current', opt.save_dir,
                     'tensorboard_log'))
    trainloader = DataLoader(
        data_manager.init_datafolder(opt.trainset_name, train_dataset.train,
                                     TrainTransform(opt.height, opt.width)),
        sampler=IdentitySampler(train_dataset.train, opt.train_batch,
                                opt.num_instances),
        batch_size=opt.train_batch,
        num_workers=opt.workers,
        pin_memory=pin_memory,
        drop_last=True,
        collate_fn=NormalCollateFn()
    )
    print('initializing model ...')
    model = ResNetBuilder(train_dataset.num_train_pids)
    optim_policy = model.get_optim_policy()
    print('model size: {:.5f}M'.format(
        sum(p.numel() for p in model.parameters()) / 1e6))
    if use_gpu:
        model = CamDataParallel(model).cuda()
    xent = nn.CrossEntropyLoss()

    def standard_cls_criterion(preditions, targets, global_step,
                               summary_writer):
        # cross-entropy on id predictions; logs loss and accuracy
        identity_loss = xent(preditions, targets)
        identity_accuracy = torch.mean(
            (torch.argmax(preditions, dim=1) == targets).float())
        summary_writer.add_scalar('cls_loss', identity_loss.item(),
                                  global_step)
        summary_writer.add_scalar('cls_accuracy', identity_accuracy.item(),
                                  global_step)
        return identity_loss

    # get trainer and evaluator
    optimizer, adjust_lr = get_optimizer_strategy(opt, optim_policy)
    reid_trainer = CameraClsTrainer(opt, model, optimizer,
                                    standard_cls_criterion, summary_writer)
    print('Start training')
    for epoch in range(opt.max_epoch):
        adjust_lr(optimizer, epoch)
        reid_trainer.train(epoch, trainloader)
    # NOTE(review): as reconstructed, only the final model is saved (epoch
    # retains its last loop value) — confirm no per-epoch save was intended.
    if use_gpu:
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    save_checkpoint({
        'state_dict': state_dict,
        'epoch': epoch + 1,
    }, save_dir=os.path.join('./pytorch-ckpt/current', opt.save_dir))
def train(**kwargs):
    """Train a re-id baseline (softmax, triplet or combined).

    Builds train/query/gallery loaders, selects the criterion and trainer
    matching ``opt.model_name``, trains for ``opt.max_epoch`` epochs with a
    hand-written LR schedule, evaluates rank-1 every ``opt.eval_step``
    epochs (and at the last epoch), checkpointing each evaluation and
    tracking the best rank-1.
    """
    opt._parse(kwargs)
    # set random seed and cudnn benchmark
    torch.manual_seed(opt.seed)
    use_gpu = torch.cuda.is_available()
    sys.stdout = Logger(osp.join(opt.save_dir, 'log_train.txt'))
    print('=========user config==========')
    pprint(opt._state_dict())
    print('============end===============')
    if use_gpu:
        print('currently using GPU')
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(opt.seed)
    else:
        print('currently using cpu')
    print('initializing dataset {}'.format(opt.dataset))
    dataset = data_manager.init_dataset(name=opt.dataset)
    pin_memory = True if use_gpu else False
    summary_writer = SummaryWriter(osp.join(opt.save_dir, 'tensorboard_log'))
    # triplet-style training needs identity-balanced batches
    if 'triplet' in opt.model_name:
        trainloader = DataLoader(
            ImageData(dataset.train, TrainTransform(opt.height, opt.width)),
            sampler=RandomIdentitySampler(dataset.train, opt.num_instances),
            batch_size=opt.train_batch,
            num_workers=opt.workers,
            pin_memory=pin_memory,
            drop_last=True)
    else:
        trainloader = DataLoader(ImageData(
            dataset.train, TrainTransform(opt.height, opt.width)),
            batch_size=opt.train_batch,
            shuffle=True,
            num_workers=opt.workers,
            pin_memory=pin_memory)
    queryloader = DataLoader(ImageData(dataset.query,
                                       TestTransform(opt.height, opt.width)),
                             batch_size=opt.test_batch,
                             num_workers=opt.workers,
                             pin_memory=pin_memory)
    galleryloader = DataLoader(ImageData(dataset.gallery,
                                         TestTransform(opt.height, opt.width)),
                               batch_size=opt.test_batch,
                               num_workers=opt.workers,
                               pin_memory=pin_memory)
    print('initializing model ...')
    # pure-triplet training has no classifier head
    if opt.model_name == 'softmax' or opt.model_name == 'softmax_triplet':
        model, optim_policy = get_baseline_model(dataset.num_train_pids)
    elif opt.model_name == 'triplet':
        model, optim_policy = get_baseline_model(num_classes=None)
    print('model size: {:.5f}M'.format(
        sum(p.numel() for p in model.parameters()) / 1e6))
    # xent_criterion = nn.CrossEntropyLoss()
    xent_criterion = CrossEntropyLabelSmooth(dataset.num_train_pids)
    tri_criterion = TripletLoss(opt.margin)

    def cls_criterion(cls_scores, targets):
        # classification-only loss
        cls_loss = xent_criterion(cls_scores, targets)
        return cls_loss

    def triplet_criterion(feat, targets):
        # triplet-only loss (distances/accuracy extras discarded)
        triplet_loss, _, _ = tri_criterion(feat, targets)
        return triplet_loss

    def cls_tri_criterion(cls_scores, feat, targets):
        # combined classification + triplet loss
        cls_loss = xent_criterion(cls_scores, targets)
        triplet_loss, _, _ = tri_criterion(feat, targets)
        loss = cls_loss + triplet_loss
        return loss

    # get optimizer
    optimizer = torch.optim.Adam(optim_policy, lr=opt.lr,
                                 weight_decay=opt.weight_decay)

    def adjust_lr(optimizer, ep):
        # hand-written piecewise LR schedule (warmup, then staged decays)
        if ep < 20:
            lr = 1e-4 * (ep + 1) / 2
        elif ep < 80:
            lr = 1e-3 * opt.num_gpu
        elif ep < 180:
            lr = 1e-4 * opt.num_gpu
        elif ep < 300:
            lr = 1e-5 * opt.num_gpu
        elif ep < 320:
            # NOTE(review): for ep in [300, 320) the exponent (ep-320)/80 is
            # negative, so 0.1**(...) > 1 and the LR *rises* above 1e-5 here
            # — looks like a schedule bug; confirm intent.
            lr = 1e-5 * 0.1**((ep - 320) / 80) * opt.num_gpu
        elif ep < 400:
            lr = 1e-6
        elif ep < 480:
            lr = 1e-4 * opt.num_gpu
        else:
            lr = 1e-5 * opt.num_gpu
        for p in optimizer.param_groups:
            p['lr'] = lr

    start_epoch = opt.start_epoch
    if use_gpu:
        model = nn.DataParallel(model).cuda()
    # get trainer and evaluator
    if opt.model_name == 'softmax':
        reid_trainer = clsTrainer(opt, model, optimizer, cls_criterion,
                                  summary_writer)
    elif opt.model_name == 'softmax_triplet':
        reid_trainer = cls_tripletTrainer(opt, model, optimizer,
                                          cls_tri_criterion, summary_writer)
    elif opt.model_name == 'triplet':
        reid_trainer = tripletTrainer(opt, model, optimizer,
                                      triplet_criterion, summary_writer)
    reid_evaluator = ResNetEvaluator(model)
    # start training
    best_rank1 = -np.inf
    best_epoch = 0
    for epoch in range(start_epoch, opt.max_epoch):
        if opt.step_size > 0:
            adjust_lr(optimizer, epoch + 1)
        reid_trainer.train(epoch, trainloader)
        # skip if not save model
        if opt.eval_step > 0 and (epoch + 1) % opt.eval_step == 0 or (
                epoch + 1) == opt.max_epoch:
            rank1 = reid_evaluator.evaluate(queryloader, galleryloader)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1
            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint({
                'state_dict': state_dict,
                'epoch': epoch + 1,
            }, is_best=is_best,
                save_dir=opt.save_dir,
                filename='checkpoint_ep' + str(epoch + 1) + '.pth.tar')
    print('Best rank-1 {:.1%}, achived at epoch {}'.format(
        best_rank1, best_epoch))
def metric_training(self, balance_testset):
    """Run metric-learning training with accuracy-based early stopping.

    Each epoch: optionally step the LR, draw a fresh training list, train
    with the triplet criterion, evaluate on seen (and, when configured,
    unseen) classes, log results, and checkpoint (flagged best when the
    epoch's seen accuracy equals the running max). Stops early after 10
    consecutive accuracy drops or 20 epochs without a new max.

    Returns:
        (save_dir, max_acc): checkpoint directory and best seen accuracy.
    """
    self._train_prepare()
    # running best / previous accuracy and the two early-stop counters
    max_acc, last_acc, drop_count, fail_max_count = .0, .0, 0, 0
    for epoch in range(self.start_epoch,
                       self.start_epoch + self.train_epoches):
        if self.step_size > 0:
            self.optimizer = _adjust_learning_rate(self.optimizer, epoch + 1)
        next_margin = self.margin
        """ get a brand new training set """
        train_pictures = get_training_set_list_practice(
            self.train_root, train_limit=100, random_training_set=False)
        train_using_metriclearning(self.model,
                                   self.optimizer,
                                   self.tri_criterion,
                                   epoch,
                                   self.train_root,
                                   train_pictures=train_pictures,
                                   prefix=self.prefix,
                                   WIDTH=self.w,
                                   HEIGHT=self.h,
                                   batch_size=self.batch_size)
        # true testing on seen classes
        acc = evaluate_with_models([self.model],
                                   _range=self.num_train_pids + 1,
                                   test_dir=self.test_root,
                                   sample_file_dir=self.sample_file_dir,
                                   seen='seen',
                                   temp_prefix=self.prefix,
                                   balance_testset=balance_testset)
        print(
            'Margin: {}, Epoch: {}, Acc: {:.4}%(on seen pictures)'.format(
                self.margin, epoch, acc * 100))
        log(log_path=os.path.join(self.save_dir, 'readme.txt'),
            epoch=epoch,
            accuracy=acc,
            train_cls_count=self.num_train_pids,
            test_cls_count=self.num_test_pids,
            method='metric',
            note='update:%.2f, on seen' % self.update_conv_layers)
        if self.test_unseen_root is not None:
            # true testing on unseen classes
            acc_unseen = evaluate_with_models(
                [self.model],
                _range=self.num_train_pids,
                test_dir=self.test_unseen_root,
                sample_file_dir=self.sample_file_dir,
                seen='unseen',
                temp_prefix=self.prefix,
                balance_testset=balance_testset)
            log(log_path=os.path.join(self.save_dir, 'readme.txt'),
                epoch=epoch,
                accuracy=acc_unseen,
                train_cls_count=self.num_train_pids,
                test_cls_count=self.num_test_unseen_pids,
                method='metric',
                note='update:%.2f, on unseen' % self.update_conv_layers)
            print('Margin: {}, Epoch: {}, Acc: {:.4}%(on unseen pictures)'.
                  format(self.margin, epoch, acc_unseen * 100))
        else:
            acc_unseen = -1
        max_acc = max(acc, max_acc)
        # track consecutive drops relative to the previous epoch's accuracy
        if last_acc == .0:
            last_acc = acc
        else:
            if acc < last_acc:
                drop_count += 1
            else:
                drop_count = 0
            last_acc = acc
        # count epochs since the running max was last matched
        if max_acc == acc:
            fail_max_count = 0
        else:
            fail_max_count += 1
        if 'inception3' == self.model_type:
            save_model_name = 'inception_v3_metric_conv%.2f.tar' % (
                self.update_conv_layers)
        else:
            save_model_name = 'resnet_metric_conv%.2f.tar' % (
                self.update_conv_layers)
        state_dict = self.model.module.state_dict(
        ) if self.use_gpu else self.model.state_dict()
        # save model, and check if its the best model. save as the best model if positive
        save_checkpoint({
            'state_dict': state_dict,
            'epoch': epoch,
        }, is_best=acc == max_acc,
            save_dir=self.save_dir,
            filename=save_model_name,
            acc=acc,
            method='metric',
            prefix=self.prefix)
        # if the accuracy keep dropping, stop training
        if drop_count == 10 or fail_max_count == 20:
            print(
                'Accuracy dropping for %d times or smaller the max for %d times, stop in epoch %d\n'
                % (drop_count, fail_max_count, epoch))
            break
        self.margin = next_margin
    return self.save_dir, max_acc
def train_cycle_gan(**kwargs):
    """Train a CycleGAN (two generators + two discriminators).

    Config comes from the module-level ``opt`` object, updated with
    ``kwargs``. Trains for ``opt.max_epoch`` epochs, stepping both LR
    schedulers per epoch and checkpointing every ``opt.save_freq`` epochs.

    Fix: ``get_scheduler`` previously *returned* a ``NotImplementedError``
    instance for unknown ``opt.lr_policy`` values instead of raising it,
    so the bad policy surfaced later as a confusing attribute error on the
    "scheduler". It now raises immediately.
    """
    opt._parse(kwargs)
    torch.manual_seed(opt.seed)
    # Write standard output into file
    sys.stdout = Logger(os.path.join(opt.save_dir, 'log_train.txt'))
    print('========user config========')
    pprint(opt._state_dict())
    print('===========end=============')
    if opt.use_gpu:
        print('currently using GPU')
        torch.cuda.manual_seed_all(opt.seed)
    else:
        print('currently using cpu')
    pin_memory = True if opt.use_gpu else False

    print('initializing dataset {}'.format(opt.dataset_mode))
    dataset = UnalignedDataset(opt)
    trainloader = DataLoader(dataset, opt.batchSize, True,
                             num_workers=opt.workers, pin_memory=pin_memory)

    summaryWriter = SummaryWriter(os.path.join(opt.save_dir, 'tensorboard_log'))

    print('initializing model ... ')
    use_dropout = not opt.no_dropout
    # A->B and B->A generators share the architecture, mirrored channels
    netG_A = define_G(opt.input_nc, opt.output_nc, opt.ndf,
                      opt.which_model_netG, opt.norm, use_dropout)
    netG_B = define_G(opt.output_nc, opt.input_nc, opt.ndf,
                      opt.which_model_netG, opt.norm, use_dropout)
    # vanilla GAN loss needs a sigmoid output; LSGAN does not
    use_sigmoid = opt.no_lsgan
    netD_A = define_D(opt.output_nc, opt.ndf, opt.which_model_netD,
                      opt.n_layers_D, opt.norm, use_sigmoid)
    netD_B = define_D(opt.input_nc, opt.ndf, opt.which_model_netD,
                      opt.n_layers_D, opt.norm, use_sigmoid)

    # one optimizer over both generators, one over both discriminators
    optimizer_G = torch.optim.Adam(itertools.chain(netG_A.parameters(),
                                                   netG_B.parameters()),
                                   lr=opt.lr, betas=(opt.beta1, 0.999))
    optimizer_D = torch.optim.Adam(itertools.chain(netD_A.parameters(),
                                                   netD_B.parameters()),
                                   lr=opt.lr, betas=(opt.beta1, 0.999))

    def get_scheduler(optimizer, opt):
        """Build the LR scheduler selected by ``opt.lr_policy``.

        Raises:
            NotImplementedError: when the policy name is unknown.
        """
        if opt.lr_policy == 'lambda':
            def lambda_rule(epoch):
                # linear decay towards zero after opt.niter epochs
                lr_l = 1.0 - max(0, epoch + 1 + opt.start_epoch - opt.niter) / float(opt.lr_decay_iters + 1)
                return lr_l
            scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
        elif opt.lr_policy == 'step':
            scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.lr_decay_iters, gamma=0.1)
        elif opt.lr_policy == 'plateau':
            scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                       factor=0.2, threshold=0.01,
                                                       patience=5)
        else:
            # BUG FIX: was `return NotImplementedError(...)` — raise instead
            raise NotImplementedError(
                'learning rate policy [{}] is not implemented'.format(
                    opt.lr_policy))
        return scheduler

    scheduler_G = get_scheduler(optimizer_G, opt)
    scheduler_D = get_scheduler(optimizer_D, opt)

    start_epoch = opt.start_epoch
    if opt.use_gpu:
        netG_A = torch.nn.DataParallel(netG_A).cuda()
        netG_B = torch.nn.DataParallel(netG_B).cuda()
        netD_A = torch.nn.DataParallel(netD_A).cuda()
        netD_B = torch.nn.DataParallel(netD_B).cuda()

    # get trainer
    cycleganTrainer = Trainer(opt, netG_A, netG_B, netD_A, netD_B,
                              optimizer_G, optimizer_D, summaryWriter)

    # start training
    for epoch in range(start_epoch, opt.max_epoch):
        # NOTE(review): schedulers are stepped before training, matching the
        # original code's (pre-PyTorch-1.1) ordering — kept for identical LRs.
        scheduler_G.step()
        scheduler_D.step()
        # train over whole dataset
        cycleganTrainer.train(epoch, trainloader)

        if (epoch + 1) % opt.save_freq == 0 or (epoch + 1) == opt.max_epoch:
            # unwrap DataParallel before saving when on GPU
            if opt.use_gpu:
                state_dict_netG_A = netG_A.module.state_dict()
                state_dict_netG_B = netG_B.module.state_dict()
                state_dict_netD_A = netD_A.module.state_dict()
                state_dict_netD_B = netD_B.module.state_dict()
            else:
                state_dict_netG_A = netG_A.state_dict()
                state_dict_netG_B = netG_B.state_dict()
                state_dict_netD_A = netD_A.state_dict()
                state_dict_netD_B = netD_B.state_dict()
            save_checkpoint(
                {
                    'netG_A': state_dict_netG_A,
                    'netG_B': state_dict_netG_B,
                    'netD_A': state_dict_netD_A,
                    'netD_B': state_dict_netD_B,
                    'epoch': epoch + 1,
                },
                False,
                save_dir=opt.save_dir,
                filename='checkpoint_ep' + str(epoch + 1))
def train(self, epochs):
    """Train for ``epochs`` epochs, validating on the held-out split that
    ``train_epoch`` returns, with LR-on-plateau scheduling, best-model
    checkpointing by Pearson's r, and loss-plateau early stopping.

    Args:
        epochs: number of training epochs to run.
    """
    scheduler = ReduceLROnPlateau(self.optimizer, mode='max', factor=self.lr_reduce_factor, patience=self.patience)
    epoch_times = []
    # -1 sentinels: guaranteed not to trigger early stop / best-score
    # comparisons on the first epoch
    prev_loss = -1
    best_dev_score = -1
    for epoch in range(1, epochs + 1):
        start = time.time()
        self.logger.info('Epoch {} started...'.format(epoch))
        # train_epoch returns the batches it held out for validation
        left_out_a, left_out_b, left_out_ext_feats, left_out_label = self.train_epoch(epoch)

        # manually evaluating the validating set
        all_predictions, all_true_labels = [], []
        val_kl_div_loss = 0
        for i in range(len(left_out_a)):
            # Select embedding
            sent1 = self.embedding(left_out_a[i]).transpose(1, 2)
            sent2 = self.embedding(left_out_b[i]).transpose(1, 2)
            sent1_nonstatic, sent2_nonstatic = None, None
            if self.nonstatic_embedding is not None:
                sent1_nonstatic = self.nonstatic_embedding(left_out_a[i]).transpose(1, 2)
                sent2_nonstatic = self.nonstatic_embedding(left_out_b[i]).transpose(1, 2)
            # model outputs log-probabilities over similarity classes
            output = self.model(sent1, sent2, left_out_ext_feats[i], sent1_nonstatic=sent1_nonstatic, sent2_nonstatic=sent2_nonstatic)
            # accumulate summed KL divergence; normalized per example below
            val_kl_div_loss += F.kl_div(output, left_out_label[i], reduction='sum').item()
            # expected similarity score = sum_k k * P(class k)
            predict_classes = torch.arange(0, self.train_loader.dataset.NUM_CLASSES, device=left_out_a[i].device)\
                .float().expand(len(left_out_a[i]), self.train_loader.dataset.NUM_CLASSES)
            predictions = (predict_classes * output.detach().exp()).sum(dim=1)
            true_labels = (predict_classes * left_out_label[i].detach()).sum(dim=1)
            all_predictions.append(predictions)
            all_true_labels.append(true_labels)
        # NOTE(review): assumes train_epoch held out at least one batch;
        # otherwise torch.cat and `output` below would fail — confirm upstream.
        predictions = torch.cat(all_predictions).cpu().numpy()
        true_labels = torch.cat(all_true_labels).cpu().numpy()
        pearson_r = pearsonr(predictions, true_labels)[0]
        val_kl_div_loss /= len(predictions)

        if self.use_tensorboard:
            self.writer.add_scalar('msrvid/dev/pearson_r', pearson_r, epoch)

        # logs only the FIRST param group (note the unconditional break);
        # 'Validation size' reports the size of the last batch only
        for param_group in self.optimizer.param_groups:
            self.logger.info('Validation size: %s Pearson\'s r: %s', output.size(0), pearson_r)
            self.logger.info('Learning rate: %s', param_group['lr'])
            if self.use_tensorboard:
                self.writer.add_scalar('msrvid/lr', param_group['lr'], epoch)
                self.writer.add_scalar('msrvid/dev/kl_div_loss', val_kl_div_loss, epoch)
            break

        # plateau scheduler keyed on validation Pearson's r (mode='max')
        scheduler.step(pearson_r)
        end = time.time()
        duration = end - start
        self.logger.info('Epoch {} finished in {:.2f} minutes'.format(epoch, duration / 60))
        epoch_times.append(duration)

        if pearson_r > best_dev_score:
            best_dev_score = pearson_r
            save_checkpoint(epoch, self.model.arch, self.model.state_dict(), self.optimizer.state_dict(), best_dev_score, self.model_outfile)
        # early stop once the validation loss plateaus
        if abs(prev_loss - val_kl_div_loss) <= 0.0005:
            self.logger.info('Early stopping. Loss changed by less than 0.0005.')
            break
        prev_loss = val_kl_div_loss
        # test-set evaluation runs every epoch (side effect: logging only)
        self.evaluate(self.test_evaluator, 'test')
    self.logger.info('Training took {:.2f} minutes overall...'.format(sum(epoch_times) / 60))
def train(**kwargs):
    """Self-training domain adaptation for PCB re-identification.

    Each outer iteration: extract source/target features, re-rank
    distances, cluster the target set with DBSCAN to pseudo-label it,
    then train PCB on the pseudo-labeled set and periodically checkpoint
    and evaluate.

    Fixes over the original:
    * the pretrained-model load failure previously did
      ``except: RuntimeError(...)`` — the exception was constructed but
      never raised, and the bare ``except`` swallowed the real error.
      It now chains and raises.
    * bare ``except`` around ``gc.collect`` narrowed to ``Exception``.
    """
    opt._parse(kwargs)
    opt.model_name = 'PCB'
    # set random seed and cudnn benchmark
    torch.manual_seed(opt.seed)
    os.makedirs(opt.save_dir, exist_ok=True)
    use_gpu = torch.cuda.is_available()
    sys.stdout = Logger(osp.join(opt.save_dir, 'log_train.txt'))

    print('=========user config==========')
    pprint(opt._state_dict())
    print('============end===============')

    if use_gpu:
        print('currently using GPU')
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(opt.seed)
    else:
        print('currently using cpu')

    print('initializing dataset {}'.format(opt.dataset))
    dataset = data_manager.init_dataset(name=opt.dataset, mode=opt.mode)
    tgt_dataset = data_manager.init_dataset(name=opt.tgt_dataset, mode=opt.mode)

    pin_memory = True if use_gpu else False
    summary_writer = SummaryWriter(osp.join(opt.save_dir, 'tensorboard_log'))

    trainloader = DataLoader(
        ImageData(dataset.train, TrainTransform(opt.datatype)),
        batch_size=opt.train_batch, num_workers=opt.workers,
        pin_memory=pin_memory, drop_last=True)
    tgt_trainloader = DataLoader(
        ImageData(tgt_dataset.train, TrainTransform(opt.datatype)),
        batch_size=opt.train_batch, num_workers=opt.workers,
        pin_memory=pin_memory, drop_last=True)
    tgt_queryloader = DataLoader(
        ImageData(tgt_dataset.query, TestTransform(opt.datatype)),
        batch_size=opt.test_batch, num_workers=opt.workers,
        pin_memory=pin_memory)
    tgt_galleryloader = DataLoader(
        ImageData(tgt_dataset.gallery, TestTransform(opt.datatype)),
        batch_size=opt.test_batch, num_workers=opt.workers,
        pin_memory=pin_memory)
    # horizontally-flipped copies for flip-augmented evaluation
    tgt_queryFliploader = DataLoader(
        ImageData(tgt_dataset.query, TestTransform(opt.datatype, True)),
        batch_size=opt.test_batch, num_workers=opt.workers,
        pin_memory=pin_memory)
    tgt_galleryFliploader = DataLoader(
        ImageData(tgt_dataset.gallery, TestTransform(opt.datatype, True)),
        batch_size=opt.test_batch, num_workers=opt.workers,
        pin_memory=pin_memory)

    print('initializing model ...')
    model = PCB(dataset.num_train_pids)
    optim_policy = model.get_optim_policy()
    start_epoch = opt.start_epoch

    # self-training requires a source-pretrained model
    if opt.pretrained_model:
        checkpoint = torch.load(opt.pretrained_model)
        state_dict = checkpoint['state_dict']
        try:
            # strict=False: classifier heads may differ between domains
            model.load_state_dict(state_dict, False)
            print('load pretrained model ' + opt.pretrained_model)
        except Exception as e:
            # BUG FIX: previously the RuntimeError was built but not raised
            raise RuntimeError(
                'please keep the same size with source dataset..') from e
    else:
        raise RuntimeError('please load a pre-trained model...')

    print('model size: {:.5f}M'.format(
        sum(p.numel() for p in model.parameters()) / 1e6))

    if use_gpu:
        model = nn.DataParallel(model).cuda()
    reid_evaluator = ResNetEvaluator(model)

    if opt.evaluate:
        print('transfer directly....... ')
        reid_evaluator.evaluate(tgt_queryloader, tgt_galleryloader,
                                tgt_queryFliploader, tgt_galleryFliploader,
                                re_ranking=opt.re_ranking,
                                savefig=opt.savefig)
        return

    embedding_criterion = SelfTraining_TripletLoss(margin=0.5, num_instances=4)

    def criterion(triplet_y, softmax_y, labels):
        """Triplet-only objective; softmax_y kept for interface parity."""
        losses = [torch.sum(torch.stack(
            [embedding_criterion(output, labels) for output in triplet_y]))]
        loss = sum(losses)
        return loss

    # get optimizer
    if opt.optim == "sgd":
        optimizer = torch.optim.SGD(optim_policy, lr=opt.lr,
                                    momentum=0.9,
                                    weight_decay=opt.weight_decay)
    else:
        optimizer = torch.optim.Adam(optim_policy, lr=opt.lr,
                                     weight_decay=opt.weight_decay)

    # get trainer and evaluator
    reid_trainer = PCBTrainer(opt, model, optimizer, criterion, summary_writer)

    def adjust_lr(optimizer, ep):
        # staged LR schedule; NOTE(review): defined but not invoked below —
        # kept for interface compatibility
        if ep < 50:
            lr = opt.lr * (ep // 5 + 1)
        elif ep < 200:
            lr = opt.lr * 10
        elif ep < 300:
            lr = opt.lr
        else:
            lr = opt.lr * 0.1
        for p in optimizer.param_groups:
            p['lr'] = lr

    # start training
    best_rank1 = opt.best_rank
    best_epoch = 0
    print('transfer directly.....')
    reid_evaluator.evaluate(tgt_queryloader, tgt_galleryloader,
                            tgt_queryFliploader, tgt_galleryFliploader,
                            re_ranking=opt.re_ranking, savefig=opt.savefig)

    for iter_n in range(start_epoch, opt.max_epoch):
        if opt.lambda_value == 0:
            source_features = 0
        else:
            # get source datas' feature
            print('Iteration {}: Extracting Source Dataset Features...'.format(iter_n + 1))
            source_features, _ = extract_pcb_features(model, trainloader)

        # extract training images' features
        print('Iteration {}: Extracting Target Dataset Features...'.format(iter_n + 1))
        target_features, _ = extract_pcb_features(model, tgt_trainloader)

        # calculate distance and rerank result
        print('Calculating feature distances...')
        target_features = target_features.numpy()
        rerank_dist = re_ranking(
            source_features, target_features, lambda_value=opt.lambda_value)

        if iter_n == 0:
            # estimate the DBSCAN radius from the smallest rho-fraction of
            # the (upper-triangular) pairwise distances; the cluster object
            # built here is reused on every later iteration
            tri_mat = np.triu(rerank_dist, 1)
            tri_mat = tri_mat[np.nonzero(tri_mat)]
            tri_mat = np.sort(tri_mat, axis=None)
            top_num = np.round(opt.rho * tri_mat.size).astype(int)
            eps = tri_mat[:top_num].mean()
            print('eps in cluster: {:.3f}'.format(eps))
            cluster = DBSCAN(eps=eps, min_samples=4,
                             metric='precomputed', n_jobs=8)

        # select & cluster images as training set of this epochs
        print('Clustering and labeling...')
        labels = cluster.fit_predict(rerank_dist)
        # free the big intermediates before building the new dataset
        del rerank_dist
        del source_features
        del target_features
        try:
            gc.collect()
        except Exception:
            print('cannot collect')
        num_ids = len(set(labels)) - 1  # -1 excludes the DBSCAN noise label
        print('Iteration {} have {} training ids'.format(iter_n + 1, num_ids))

        # generate new dataset from clustered (non-noise) samples
        new_dataset = []
        for (fname, _, _), label in zip(tgt_dataset.train, labels):
            if label == -1:
                continue
            # camid fixed to 0 so trainer/sampler code needs no changes
            new_dataset.append((fname, label, 0))
        print('Iteration {} have {} training images'.format(iter_n + 1, len(new_dataset)))

        selftrain_loader = DataLoader(
            ImageData(new_dataset, TrainTransform(opt.datatype)),
            sampler=RandomIdentitySampler(new_dataset, opt.num_instances),
            batch_size=opt.train_batch, num_workers=opt.workers,
            pin_memory=pin_memory, drop_last=True)

        # train model with new generated dataset
        trainer = PCBTrainer(opt, model, optimizer, criterion, summary_writer)
        reid_evaluator = ResNetEvaluator(model)
        for epoch in range(opt.selftrain_iterations):
            trainer.train(epoch, selftrain_loader)

        # skip if not save model
        if opt.eval_step > 0 and (iter_n + 1) % opt.eval_step == 0 or (iter_n + 1) == opt.max_epoch:
            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint({'state_dict': state_dict, 'epoch': iter_n + 1},
                            is_best=0, save_dir=opt.save_dir,
                            filename='checkpoint_ep' + str(iter_n + 1) + '.pth.tar')

            # full evaluation only every eval_step*4 iterations (expensive)
            if (iter_n + 1) % (opt.eval_step * 4) == 0:
                if opt.mode == 'class':
                    rank1 = test(model, tgt_queryloader)
                else:
                    rank1 = reid_evaluator.evaluate(
                        tgt_queryloader, tgt_galleryloader,
                        tgt_queryFliploader, tgt_galleryFliploader)
                is_best = rank1 > best_rank1
                if is_best:
                    best_rank1 = rank1
                    best_epoch = iter_n + 1
                if use_gpu:
                    state_dict = model.module.state_dict()
                else:
                    state_dict = model.state_dict()
                if is_best:
                    save_checkpoint({'state_dict': state_dict, 'epoch': iter_n + 1},
                                    is_best=is_best, save_dir=opt.save_dir,
                                    filename='checkpoint_ep' + str(iter_n + 1) + '.pth.tar')

    print('Best rank-1 {:.1%}, achived at epoch {}'.format(best_rank1, best_epoch))
def main(args):
    """Train (or only evaluate) a WideResNet image classifier.

    Seeds all RNGs, builds data loaders and the model, optionally resumes
    from a checkpoint, then trains with a stepped LR schedule, evaluating
    and checkpointing after every epoch.
    """
    # Force CPU mode when CUDA is absent, whatever the flag says.
    if not torch.cuda.is_available():
        args.cpu_only = True
    use_cuda = not args.cpu_only

    # Reproducibility.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if use_cuda:
        torch.cuda.manual_seed_all(args.seed)
        cudnn.benchmark = True

    # Log directory; tee stdout to a file unless we are only evaluating.
    mkdir_if_missing(args.logs_dir)
    if not args.eval_only:
        sys.stdout = Logger(osp.join(args.logs_dir, 'log.txt'))

    # Data pipeline.
    ds_train, ds_test, n_classes = get_datasets(args.dataset, args.data_dir)
    loader_kwargs = dict(batch_size=args.batch_size,
                         num_workers=args.workers,
                         pin_memory=True)
    train_loader = DataLoader(ds_train, shuffle=True, **loader_kwargs)
    test_loader = DataLoader(ds_test, shuffle=False, **loader_kwargs)

    # Model and loss.
    model = WideResNet(args.depth, args.width, n_classes,
                       dropout_rate=args.dropout)
    criterion = nn.CrossEntropyLoss()

    # Optionally resume training state from a checkpoint.
    start_epoch, best_prec1 = 0, 0
    if args.resume:
        ckpt = load_checkpoint(args.resume)
        model.load_state_dict(ckpt['model'])
        start_epoch = ckpt['epoch'] + 1
        best_prec1 = ckpt['best_prec1']
        print("=> Load from {}, start epoch {}, best prec1 {:.2%}".format(
            args.resume, start_epoch, best_prec1))

    if use_cuda:
        model = DataParallel(model).cuda()
        criterion = criterion.cuda()

    # Optimizer selection.
    if args.optim_method == 'sgd':
        optimizer = SGD(model.parameters(), lr=args.lr, nesterov=True,
                        momentum=0.9, weight_decay=args.weight_decay)
    else:
        optimizer = Adam(model.parameters(), lr=args.lr)

    # Evaluation-only mode: score the loaded checkpoint and exit.
    if args.eval_only:
        evaluate(start_epoch - 1, test_loader, model, criterion, args.cpu_only)
        return

    # Milestones arrive as a JSON list; iterate them reversed, letting the
    # last satisfied milestone fix the decay power.
    milestones = json.loads(args.epoch_steps)[::-1]
    for epoch in range(start_epoch, args.epochs):
        power = 0
        for idx, boundary in enumerate(milestones):
            if epoch >= boundary:
                power = len(milestones) - idx
        new_lr = args.lr * (args.lr_decay_ratio**power)
        for group in optimizer.param_groups:
            group['lr'] = new_lr

        # One epoch of training, then a test-set evaluation.
        train(epoch, train_loader, model, criterion, optimizer, args.cpu_only)
        prec1 = evaluate(epoch, test_loader, model, criterion, args.cpu_only)

        is_best = prec1 > best_prec1
        best_prec1 = max(best_prec1, prec1)

        # Persist state, unwrapping DataParallel when running on GPU.
        state = {'epoch': epoch, 'best_prec1': best_prec1}
        state['model'] = (model.state_dict() if args.cpu_only
                          else model.module.state_dict())
        save_checkpoint(state, is_best,
                        osp.join(args.logs_dir, 'checkpoint.pth.tar'))
        print('\n * Finished epoch {} Prec1: {:.2%} Best: {:.2%}{}\n'.format(
            epoch, prec1, best_prec1, ' *' if is_best else ''))
lr = 1e-5 for p in optimizer.param_groups: p['lr'] = lr # start training best_rank1 = opt.best_rank best_epoch = 0 for epoch in range(start_epoch, opt.max_epoch): if opt.adjust_lr: adjust_lr(optimizer, epoch + 1) plate_trainer.train(epoch, trainloader) # skip if not save model if opt.eval_step > 0 and (epoch + 1) % opt.eval_step == 0 or ( epoch + 1) == opt.max_epoch: if use_gpu: state_dict = model.module.state_dict() else: state_dict = model.state_dict() save_checkpoint({ 'state_dict': state_dict, 'epoch': epoch + 1 }, is_best=False, save_dir=opt.save_dir, filename='checkpoint_ep' + str(epoch + 1) + '.pth.tar') print('Best rank-1 {:.1%}, achived at epoch {}'.format( best_rank1, best_epoch))
def main(args):
    """Train a two-branch (image + difference) classifier.

    Builds twin backbones sharing one architecture, optionally resumes
    from a checkpoint, trains with a step-decayed LR, validates each
    epoch, keeps the best model, and finally re-tests with it.

    NOTE(review): both branches are moved to CUDA unconditionally — this
    entry point assumes a GPU is available.
    """
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # cudnn.benchmark = True
    # Redirect print to both console and log file
    if not args.evaluate:
        sys.stdout = Logger(osp.join(args.logs_dir, 'log.txt'))

    # Create data loaders
    if args.height is None or args.width is None:
        # default input size depends on the backbone architecture
        args.height, args.width = (144, 56) if args.arch == 'inception' else \
            (240, 240)
    dataset, num_classes, train_loader, val_loader, test_loader = \
        get_data(args.dataset, args.split, args.data_dir,
                 args.height, args.width, args.batch_size,
                 args.workers, args.combine_trainval)

    # Create model: two branches with identical architecture/hyper-params
    img_branch = models.create(args.arch,
                               cut_layer=args.cut_layer,
                               num_classes=num_classes,
                               num_features=args.features)
    diff_branch = models.create(args.arch,
                                cut_layer=args.cut_layer,
                                num_classes=num_classes,
                                num_features=args.features)

    # Load from checkpoint
    start_epoch = best_top1 = 0
    if args.resume:
        checkpoint = load_checkpoint(args.resume)
        img_branch.load_state_dict(checkpoint['state_dict_img'])
        diff_branch.load_state_dict(checkpoint['state_dict_diff'])
        start_epoch = checkpoint['epoch']
        best_top1 = checkpoint['best_top1']
        print("=> Start epoch {} best top1 {:.1%}".format(
            start_epoch, best_top1))

    img_branch = nn.DataParallel(img_branch).cuda()
    diff_branch = nn.DataParallel(diff_branch).cuda()
    # img_branch = nn.DataParallel(img_branch)
    # diff_branch = nn.DataParallel(diff_branch)

    # Criterion
    criterion = nn.CrossEntropyLoss().cuda()
    # criterion = nn.CrossEntropyLoss()

    # Evaluator
    evaluator = Evaluator(img_branch, diff_branch, criterion)
    if args.evaluate:
        # evaluation-only mode: score the test set and plot the confusion matrix
        # print("Validation:")
        # top1, _ = evaluator.evaluate(val_loader)
        # print("Validation acc: {:.1%}".format(top1))
        print("Test:")
        top1, (gt, pred) = evaluator.evaluate(test_loader)
        print("Test acc: {:.1%}".format(top1))
        from confusion_matrix import plot_confusion_matrix
        plot_confusion_matrix(gt, pred, dataset.classes, args.logs_dir)
        return

    # Backbone layers train at 0.1x LR; the classifier heads at full LR.
    img_param_groups = [
        {
            'params': img_branch.module.low_level_modules.parameters(),
            'lr_mult': 0.1
        },
        {
            'params': img_branch.module.high_level_modules.parameters(),
            'lr_mult': 0.1
        },
        {
            'params': img_branch.module.classifier.parameters(),
            'lr_mult': 1
        },
    ]
    diff_param_groups = [
        {
            'params': diff_branch.module.low_level_modules.parameters(),
            'lr_mult': 0.1
        },
        {
            'params': diff_branch.module.high_level_modules.parameters(),
            'lr_mult': 0.1
        },
        {
            'params': diff_branch.module.classifier.parameters(),
            'lr_mult': 1
        },
    ]
    img_optimizer = torch.optim.SGD(img_param_groups,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=True)
    diff_optimizer = torch.optim.SGD(diff_param_groups,
                                     lr=args.lr,
                                     momentum=args.momentum,
                                     weight_decay=args.weight_decay,
                                     nesterov=True)

    # Trainer
    trainer = Trainer(img_branch, diff_branch, criterion)

    # Schedule learning rate: decay by 10x every step_size epochs,
    # scaled per-group by its lr_mult
    def adjust_lr(epoch):
        step_size = args.step_size
        lr = args.lr * (0.1**(epoch // step_size))
        for g in img_optimizer.param_groups:
            g['lr'] = lr * g.get('lr_mult', 1)
        for g in diff_optimizer.param_groups:
            g['lr'] = lr * g.get('lr_mult', 1)

    # Start training
    for epoch in range(start_epoch, args.epochs):
        adjust_lr(epoch)
        trainer.train(epoch, train_loader, img_optimizer, diff_optimizer)
        # skip validation/checkpointing during the warm-up epochs
        if epoch < args.start_save:
            continue
        top1, _ = evaluator.evaluate(val_loader)

        is_best = top1 > best_top1
        best_top1 = max(top1, best_top1)
        save_checkpoint(
            {
                'state_dict_img': img_branch.module.state_dict(),
                'state_dict_diff': diff_branch.module.state_dict(),
                'epoch': epoch + 1,
                'best_top1': best_top1,
            },
            is_best,
            fpath=osp.join(args.logs_dir, 'checkpoint.pth.tar'))

        print('\n * Finished epoch {:3d} top1: {:5.1%} best: {:5.1%}{}\n'.
              format(epoch, top1, best_top1, ' *' if is_best else ''))

    # Final test with the best validation model
    print('Test with best model:')
    checkpoint = load_checkpoint(osp.join(args.logs_dir, 'model_best.pth.tar'))
    img_branch.module.load_state_dict(checkpoint['state_dict_img'])
    diff_branch.module.load_state_dict(checkpoint['state_dict_diff'])
    top1, (gt, pred) = evaluator.evaluate(test_loader)
    from confusion_matrix import plot_confusion_matrix
    plot_confusion_matrix(gt, pred, dataset.classes, args.logs_dir)
    print('\n * Test Accuarcy: {:5.1%}\n'.format(top1))
def train(**kwargs):
    """Train a ResNet re-identification model.

    Config comes from the module-level ``opt``, updated with ``kwargs``.
    Supports softmax / softmax_triplet / triplet losses, warmup LR
    scheduling, periodic evaluation, and best-model checkpointing.

    Fix: an unsupported ``opt.loss`` previously left ``model`` unbound and
    crashed later with a NameError; it now raises a clear ValueError.
    """
    opt._parse(kwargs)
    os.makedirs(opt.save_dir, exist_ok=True)
    use_gpu = torch.cuda.is_available()
    sys.stdout = Logger(osp.join(opt.save_dir, 'log_train.txt'))

    print('=========user config==========')
    pprint(opt._state_dict())
    print('============end===============')

    if use_gpu:
        print('currently using GPU')
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(opt.seed)
    else:
        print('currently using cpu')

    print('initializing dataset {}'.format(opt.dataset))
    dataset = data_manager.init_dataset(name=opt.dataset, use_all=opt.use_all)

    summary_writer = SummaryWriter(osp.join(opt.save_dir, 'tensorboard_log'))

    # load data
    pin_memory = True if use_gpu else False
    dataloader = load_data(dataset, pin_memory)

    print('initializing model ...')
    if opt.loss == 'softmax' or opt.loss == 'softmax_triplet':
        model = ResNetBuilder(dataset.num_train_pids, opt.last_stride, True)
    elif opt.loss == 'triplet':
        # pure triplet training needs no classification head
        model = ResNetBuilder(None, opt.last_stride, True)
    else:
        # BUG FIX: previously fell through with `model` undefined
        raise ValueError('unsupported loss: {}'.format(opt.loss))

    if opt.pretrained_model:
        if use_gpu:
            state_dict = torch.load(opt.pretrained_model)['state_dict']
        else:
            state_dict = torch.load(opt.pretrained_model,
                                    map_location='cpu')['state_dict']
        # strict=False: allow head-size mismatches with the checkpoint
        model.load_state_dict(state_dict, False)
        print('load pretrained model ' + opt.pretrained_model)
    print('model size: {:.5f}M'.format(
        sum(p.numel() for p in model.parameters()) / 1e6))

    optim_policy = model.get_optim_policy()

    if use_gpu:
        model = nn.DataParallel(model).cuda()
    reid_evaluator = ResNetEvaluator(model)

    if opt.evaluate:
        reid_evaluator.evaluate(dataloader['query'], dataloader['gallery'],
                                dataloader['queryFlip'],
                                dataloader['galleryFlip'],
                                savefig=opt.savefig)
        return

    criterion = get_loss()

    # optimizer
    if opt.optim == "sgd":
        optimizer = torch.optim.SGD(optim_policy, lr=opt.lr,
                                    momentum=0.9, weight_decay=5e-4)
    else:
        optimizer = torch.optim.Adam(optim_policy, lr=opt.lr,
                                     weight_decay=5e-4)

    # linear warmup for 10 epochs, then 0.1x decay at epochs 40 and 70
    scheduler = WarmupMultiStepLR(optimizer, [40, 70], 0.1, 0.01, 10, 'linear')

    start_epoch = opt.start_epoch
    # get trainer and evaluator
    reid_trainer = Trainer(opt, model, optimizer, criterion, summary_writer)

    # start training
    best_rank1 = opt.best_rank
    best_epoch = 0
    for epoch in range(start_epoch, opt.max_epoch):
        # NOTE(review): stepped before training, matching the original
        # (pre-PyTorch-1.1) ordering — kept so the LR schedule is identical
        scheduler.step()
        reid_trainer.train(epoch, dataloader['train'])

        # skip if not save model
        if opt.eval_step > 0 and (epoch + 1) % opt.eval_step == 0 or (
                epoch + 1) == opt.max_epoch:
            rank1 = reid_evaluator.evaluate(dataloader['query'],
                                            dataloader['gallery'],
                                            dataloader['queryFlip'],
                                            dataloader['galleryFlip'])
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1
            # NOTE(review): saved from the (possibly DataParallel-wrapped)
            # model, so GPU checkpoints carry 'module.'-prefixed keys —
            # confirm consumers expect that before changing
            state_dict = model.state_dict()
            save_checkpoint({
                'state_dict': state_dict,
                'epoch': epoch + 1
            },
                            is_best=is_best,
                            save_dir=opt.save_dir,
                            filename='checkpoint_ep' + str(epoch + 1) +
                            '.pth.tar')

    print('Best rank-1 {:.1%}, achived at epoch {}'.format(
        best_rank1, best_epoch))
def main():
    """Config-driven re-identification training entry point.

    Reads hyper-parameters from ``config.yml``, builds loaders, model,
    losses and optimizer, then trains with periodic rank-1 evaluation and
    best-model checkpointing.

    Fix: ``criterion_xent`` / ``criterion_trihard`` may legitimately be
    ``None`` when their loss is disabled in the config, but both were
    moved to CUDA unconditionally, crashing with an AttributeError. They
    are now guarded. The dead ``use_cpu`` local was also removed.
    """
    # load the hyper-parameter
    with open('config.yml', encoding='utf-8') as f:
        # CONFIG_DICT is a dict that involves train_param, test_param and save_dir
        CONFIG_DICT = yaml.safe_load(f)
    TRAIN_PARAM = CONFIG_DICT['train']
    TEST_PARAM = CONFIG_DICT['test']
    SAVA_DIR = CONFIG_DICT['save_path']

    os.environ['CUDA_VISIBLE_DEVICES'] = TRAIN_PARAM['gpu_device']
    torch.manual_seed(TRAIN_PARAM['seed'])

    # tee stdout into a train or test log file
    if not TRAIN_PARAM['evaluate']:
        sys.stdout = Logging(osp.join(SAVA_DIR['log_dir'], 'log_train.txt'))
    else:
        sys.stdout = Logging(osp.join(SAVA_DIR['log_dir'], 'log_test.txt'))
    print("==========\nArgs:{}\n==========".format(TRAIN_PARAM))

    # GPU use Y/N
    use_gpu = torch.cuda.is_available()
    if use_gpu:
        print("Currently using GPU {}".format([TRAIN_PARAM['gpu_device']]))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(TRAIN_PARAM['seed'])

    print("Initializing dataset {}".format(TRAIN_PARAM['dataset']))
    # load data
    dataset = Datasets.init_dataset(name=TRAIN_PARAM['dataset'],
                                    root=TRAIN_PARAM['root'])
    pin_memory = True if use_gpu else False

    # define the tranform method
    train_transform = T.Compose([
        T.RandomSizedRectCrop(width=TRAIN_PARAM['width'],
                              height=TRAIN_PARAM['height']),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    test_transform = T.Compose([
        T.RectScale(width=TRAIN_PARAM['width'], height=TRAIN_PARAM['height']),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    train_loader = DataLoader(dataset=ImageDataset(dataset.train,
                                                   transform=train_transform),
                              sampler=RandomIdentitySampler(
                                  dataset.train,
                                  num_instances=TRAIN_PARAM['num_instances']),
                              batch_size=TRAIN_PARAM['train_batch'],
                              num_workers=TRAIN_PARAM['workers'],
                              pin_memory=pin_memory,
                              drop_last=True)
    query_loader = DataLoader(dataset=ImageDataset(dataset=dataset.query,
                                                   transform=test_transform),
                              batch_size=TEST_PARAM['test_batch'],
                              shuffle=False,
                              num_workers=TEST_PARAM['test_workers'],
                              pin_memory=pin_memory,
                              drop_last=False)
    gallery_loader = DataLoader(dataset=ImageDataset(dataset=dataset.gallery,
                                                     transform=test_transform),
                                batch_size=TEST_PARAM['test_batch'],
                                shuffle=False,
                                num_workers=TEST_PARAM['test_workers'],
                                pin_memory=pin_memory,
                                drop_last=False)

    # load model
    print("Initializing model: {}".format(TRAIN_PARAM['arch']))
    model = models.init_model(name=TRAIN_PARAM['arch'],
                              num_classes=dataset.num_train_pids,
                              loss={'xent', 'htri'})
    print("Model size: {:.5f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))

    # load loss_fuc
    # we judge if the softmax / triHard is in the TRAIN_PARAM['losses'], or else setting None
    criterion_xent = loss_fuc.init_losses(
        name='softmax', num_classes=dataset.num_train_pids,
        use_gpu=use_gpu) if 'softmax' in TRAIN_PARAM['losses'] else None
    criterion_trihard = loss_fuc.init_losses(
        name='trihard', margin=TRAIN_PARAM['margin']
    ) if 'trihard' in TRAIN_PARAM['losses'] else None

    # load optim
    optim = optimizer.init_optim(optim=TRAIN_PARAM['optim'],
                                 params=model.parameters(),
                                 lr=TRAIN_PARAM['lr'],
                                 weight_decay=TRAIN_PARAM['weight_decay'])
    if TRAIN_PARAM['step_size'] > 0:
        # NOTE(review): scheduler is created but .step() is never called in
        # this function — the LR never actually decays; confirm intent
        scheduler = lr_scheduler.StepLR(optimizer=optim,
                                        step_size=TRAIN_PARAM['step_size'],
                                        gamma=TRAIN_PARAM['gamma'])
    start_epoch = TRAIN_PARAM['start_epoch']

    # resume or not
    if TRAIN_PARAM['resume']:
        checkpoint = load_checkpoint(TRAIN_PARAM['resume'])
        model.load_state_dict(checkpoint['state_dict'])
        start_epoch = checkpoint['epoch']
        best_top1 = checkpoint['best_top1']
        print("=> Start epoch {} best top1 {:.1%}".format(
            start_epoch, best_top1))

    if use_gpu:
        model = nn.DataParallel(model).cuda()
        # BUG FIX: either criterion may be None when its loss is disabled
        # in the config; the unguarded .cuda() calls crashed here
        if criterion_trihard is not None:
            criterion_trihard.cuda()
        if criterion_xent is not None:
            criterion_xent.cuda()

    # test or not
    if TRAIN_PARAM['evaluate']:
        print("Evaluate only")
        # test(model, query_loader, gallery_loader, use_gpu)
        return

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")

    # instance the class Trainer
    trainer = Trainer(model=model,
                      criterion_xent=criterion_xent,
                      criterion_trihard=criterion_trihard,
                      eval=TRAIN_PARAM['triHard_only'])

    # start train
    for epoch in range(TRAIN_PARAM['start_epoch'], TRAIN_PARAM['max_epoch']):
        start_train_time = time.time()
        trainer.train(epoch=epoch,
                      optimizer=optim,
                      data_loader=train_loader,
                      use_gpu=use_gpu,
                      print_freq=TRAIN_PARAM['print_freq'])
        train_time += round(time.time() - start_train_time)

        # evaluate periodically after the warm-up, and always on the last epoch
        if (epoch + 1) > TEST_PARAM['start_eval'] \
                and TEST_PARAM['eval_step'] > 0 \
                and (epoch + 1) % TEST_PARAM['eval_step'] == 0 or (
                    epoch + 1) == TRAIN_PARAM['max_epoch']:
            print("==> Test")
            rank1 = test(model, query_loader, gallery_loader, use_gpu)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1
            # unwrap DataParallel before saving when on GPU
            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'rank1': rank1,
                    'epoch': epoch,
                }, is_best,
                osp.join(TRAIN_PARAM['checkpoint_dir'],
                         'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(
        best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print(
        "Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".
        format(elapsed, train_time))
def trainer(data_pth, a, b, _time=0, layers=18):
    """Train a tableware metric-learning model with triplet loss.

    Args:
        data_pth: root path of the Tableware dataset.
        a: triplet-loss margin used for training.
        b: test margin (held in ``test_margin``; evaluation that would use
           it is currently commented out).
        _time: run index, used in the checkpoint directory/file names.
        layers: backbone depth; 18 selects ResNet-18, anything else ResNet-50.

    Returns:
        Tuple ``(save_model_path, inner_dist, outer_dist, max_outer,
        min_outer, max_iner, min_iner)`` — the distance statistics are only
        updated by the commented-out evaluation and are otherwise the
        initial zeros.
    """
    seed = 0
    # dataset options
    height = 128
    width = 128
    # optimization options
    optim = 'Adam'
    max_epoch = 20
    train_batch = 64
    test_batch = 64
    lr = 0.1
    step_size = 40
    gamma = 0.1
    weight_decay = 5e-4
    momentum = 0.9
    test_margin = b
    margin = a
    num_instances = 4
    num_gpu = 1
    # model options
    last_stride = 1
    pretrained_model_18 = 'model/resnet18-5c106cde.pth'
    pretrained_model_50 = 'model/resnet50-19c8e357.pth'
    pretrained_model_34 = 'model/resnet34-333f7ec4.pth'
    pretrained_model_101 = 'model/resnet101-5d3b4d8f.pth'
    pretrained_model_152 = 'model/resnet152-b121ed2d.pth'
    # miscs
    print_freq = 20
    eval_step = 1
    save_dir = 'model/pytorch-ckpt/time%d' % _time
    workers = 1
    start_epoch = 0

    torch.manual_seed(seed)
    use_gpu = torch.cuda.is_available()
    if use_gpu:
        print('currently using GPU')
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(seed)
    else:
        print('currently using cpu')

    pin_memory = True if use_gpu else False

    print('initializing dataset {}'.format('Tableware'))
    dataset = Tableware(data_pth)

    trainloader = DataLoader(
        ImageData(dataset.train, TrainTransform(height, width)),
        batch_size=train_batch, num_workers=workers,
        pin_memory=pin_memory, drop_last=True
    )
    # testloader = DataLoader(
    #     ImageData(dataset.test, TestTransform(height, width)),
    #     batch_size=test_batch, num_workers=workers,
    #     pin_memory=pin_memory, drop_last=True
    # )

    # backbone selection: only 18 vs 50 are reachable via the parameter
    # model, optim_policy = get_baseline_model(model_path=pretrained_model)
    if layers == 18:
        model, optim_policy = get_baseline_model(model_path=pretrained_model_18, layers=18)
    else:
        model, optim_policy = get_baseline_model(model_path=pretrained_model_50, layers=50)
    # model, optim_policy = get_baseline_model(model_path=pretrained_model_18, layers=18)
    # model, optim_policy = get_baseline_model(model_path=pretrained_model_34, layers=34)
    # model, optim_policy = get_baseline_model(model_path=pretrained_model_101, layers=101)
    # model = load_model(model, model_path='./model/pytorch-ckpt/87_layers18_margin20_epoch87.tar')
    print('model\'s parameters size: {:.5f} M'.format(
        sum(p.numel() for p in model.parameters()) / 1e6))

    # distance statistics returned at the end; only the commented-out
    # evaluator call would populate them
    inner_dist = 0
    outer_dist = 0
    max_outer = 0
    min_outer = 0
    max_iner = 0
    min_iner = 0

    tri_criterion = TripletLoss(margin)

    # get optimizer
    optimizer = torch.optim.Adam(
        optim_policy, lr=lr, weight_decay=weight_decay
    )

    def adjust_lr(optimizer, ep):
        # staged schedule: warm-up, plateaus, then decays (epochs beyond
        # max_epoch are unreachable with the current settings)
        if ep < 20:
            lr = 1e-4 * (ep + 1) / 2
        elif ep < 80:
            lr = 1e-3 * num_gpu
        elif ep < 180:
            lr = 1e-4 * num_gpu
        elif ep < 300:
            lr = 1e-5 * num_gpu
        elif ep < 320:
            lr = 1e-5 * 0.1 ** ((ep - 320) / 80) * num_gpu
        elif ep < 400:
            lr = 1e-6
        elif ep < 480:
            lr = 1e-4 * num_gpu
        else:
            lr = 1e-5 * num_gpu
        for p in optimizer.param_groups:
            p['lr'] = lr

    if use_gpu:
        model = nn.DataParallel(model).cuda()
    evaluator = Evaluator(model)

    for epoch in range(start_epoch, max_epoch):
        if step_size > 0:
            adjust_lr(optimizer, epoch + 1)
        # next_margin round-trips margin unchanged; placeholder for a
        # per-epoch margin schedule
        next_margin = margin

        # skip if not save model
        if eval_step > 0 and (epoch + 1) % eval_step == 0 or (epoch + 1) == max_epoch:
            save_record_path = 'margin_' + str(margin) + '_epoch_' + str(epoch + 1) + '.txt'
            _t1 = time.time()
            train(model, optimizer, tri_criterion, epoch, print_freq, trainloader, data_pth=data_pth)
            _t2 = time.time()
            print('time for training:', '%.2f' % (_t2 - _t1), 's')

            """
            acc, inner_dist, outer_dist, max_outer, min_outer, max_iner, min_iner = evaluator.evaluate(testloader, test_margin, save_record_path)
            print('margin:{}, epoch:{}, acc:{}'.format(margin, epoch+1, acc))
            f = open('record.txt', 'a')
            f.write('margin:{}, epoch:{}, acc:{}\n'.format(margin, epoch+1, acc))
            f.close()
            """
            # is_best stays False: every epoch is saved as a regular checkpoint
            is_best = False
            # save_model_path = 'new_margin({})_epoch({}).pth.tar'.format(margin, epoch+1)
            save_model_path = 'time{}_layers{}_margin{}_epoch{}.tar'.format(_time, layers, margin, epoch + 1)
            # save_model_path = 'layers34_margin{}_epoch{}.tar'.format(margin, epoch+1)
            # save_model_path = 'layers101_margin{}_epoch{}.tar'.format(margin, epoch+1)
            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint({
                'state_dict': state_dict,
                'epoch': epoch + 1,
            }, is_best=is_best, save_dir=save_dir, filename=save_model_path)

            # feature extraction/accuracy probe; note the model is left in
            # eval() mode until train() is called next epoch
            model.eval()
            acc = do_get_feature_and_t(model, margin=20, epoch=1)
        margin = next_margin
    return save_model_path, inner_dist, outer_dist, max_outer, min_outer, max_iner, min_iner
def train(self, balance_testset):
    """Metric-learning training loop using hard-sample mining plus sample
    re-weighting (proposed by keke).

    For each epoch starting at ``self.start_epoch``: optionally rebuild the
    training picture list (biased toward the most-overlapping classes),
    periodically refresh hard-sample statistics, run one metric-learning
    epoch, evaluate on seen (and optionally unseen) classes, append results
    to ``readme.txt`` and checkpoint the model.  Early-stops when accuracy
    keeps dropping or stops improving.

    :param balance_testset: not referenced in this body — TODO confirm
        whether it should influence test-set construction.
    :return: tuple ``(max_acc, epoch)`` — best seen-class accuracy and the
        last epoch index reached.
    """
    self._train_prepare()
    # Analyzer produces per-class distance statistics used for hard-sample
    # mining and inter-class overlap estimation.
    analyzer = Analyzer(sample_file_dir=self.sample_file_dir,
                        test_dir=self.train_root,
                        prefix=self.prefix,
                        WIDTH=self.w,
                        HEIGHT=self.h)
    # Early-stop bookkeeping: accuracy trend and best-accuracy streak.
    max_acc, last_acc, drop_count, fail_max_count = .0, .0, 0, 0
    max_acc_unseen = .0
    overlap_dict = {}  # epoch -> overlap-rate list; pickled at the end
    epoch = self.start_epoch  # keeps `epoch` bound even if the loop never runs
    for epoch in range(self.start_epoch, self.start_epoch + self.train_epochs):
        s_time = time.time()
        if self.step_size > 0:
            self.optimizer = _adjust_learning_rate(self.optimizer, epoch)
        next_margin = self.margin
        # Get a brand new training set for a new epoch — only when the image
        # pool is large (more than ~100 images per class on average).
        if self.num_train_imgs > self.num_train_pids * 100:
            if epoch == self.start_epoch:
                true_exter_class_top = None
            elif epoch % 5 == 0 or epoch == (self.start_epoch + 1):
                # `exter_class_top` was produced by the previous iteration's
                # analysis_for_exter_class_overlap() call below.
                true_exter_class_top = exter_class_top
            else:
                pass  # keep the value chosen in an earlier epoch
            if true_exter_class_top is not None:
                print('length of true_exter_class_top:', len(true_exter_class_top), ', and:', true_exter_class_top)
            train_pictures = get_training_set_list(
                self.train_root,
                train_limit=70,
                random_training_set=False,
                special_classes=true_exter_class_top)
        else:
            # NOTE(review): reconstructed from mangled layout — presumably
            # None means "use the full training set downstream"; confirm.
            train_pictures = None
        # Go through the training set to refresh data needed for hard-sample
        # mining (every 5 epochs and on the first epoch); other epochs reuse
        # the previous `distance_dict` / `class_to_nearest_class`.
        if epoch % 5 == 0 or epoch == self.start_epoch:
            distance_dict, class_to_nearest_class = analyzer.analysis_for_hard_sample(
                self.model, test_pictures=train_pictures)
        train_using_metriclearning(
            self.model,
            self.optimizer,
            self.tri_criterion,
            epoch,
            self.train_root,
            train_pictures=train_pictures,
            batch_size=self.batch_size,
            distance_dict=distance_dict,
            class_to_nearest_class=class_to_nearest_class)
        # Classes whose features overlap the most; feeds the special_classes
        # selection in later epochs.
        exter_class_top, overlap_rate_dict_ls = analyzer.analysis_for_exter_class_overlap(
            model_path=None, model=self.model, WIDTH=self.w, HEIGHT=self.h)
        e_time = time.time()
        if epoch % 5 == 0 or epoch == 2:
            overlap_dict[epoch] = overlap_rate_dict_ls
        # True testing on seen classes.
        acc = self.tester.evaluate_with_models(seen='seen')
        print(
            'Margin: {}, Epoch: {}, Acc: {:.3}%, Top overlap rate: {:.4} (on seen pictures)[Hard Sample + Sample Re-weighting]'
            .format(self.margin, epoch, acc * 100, overlap_rate_dict_ls[0][1]))
        if self.test_unseen_root is not None:
            # True testing on unseen classes.
            acc_unseen = self.tester.evaluate_with_models(seen='unseen')
            max_acc_unseen = max(max_acc_unseen, acc_unseen)
            note = 'update:%.2f, on unseen%s' % (
                self.update_conv_layers,
                ' - New Unseen Accuracy' if max_acc_unseen == acc_unseen else '')
            log(log_path=os.path.join(self.save_dir, 'readme.txt'),
                epoch=epoch,
                accuracy=acc_unseen,
                train_cls_count=self.num_train_pids,
                test_cls_count=self.num_test_unseen_pids,
                method='metric',
                note=note)
            print(
                'Margin: {}, Epoch: {}, Acc: {:.3}% (on unseen pictures)[Hard Sample + Sample Re-weighting]'
                .format(self.margin, epoch, acc_unseen * 100))
        else:
            acc_unseen = -1
        max_acc = max(acc, max_acc)
        note = 'update:%.2f, on seen%s' % (
            self.update_conv_layers,
            ' - New Seen Accuracy' if max_acc == acc else '')
        log(log_path=os.path.join(self.save_dir, 'readme.txt'),
            epoch=epoch,
            accuracy=acc,
            train_cls_count=self.num_train_pids,
            test_cls_count=self.num_test_pids,
            method='metric',
            epoch_time=(e_time - s_time),
            note=note)
        # Track consecutive drops and epochs without matching the best.
        if epoch == self.start_epoch:
            last_acc = acc
        else:
            if acc < last_acc:
                drop_count += 1
            else:
                drop_count = 0
            last_acc = acc
        if max_acc == acc:
            fail_max_count = 0
        else:
            fail_max_count += 1
        if 'inception3' == self.model_type:
            save_model_name = 'inception_v3_metric.tmpmodel.tar'
        else:
            save_model_name = 'resnet_metric.tmpmodel.tar'
        # DataParallel wraps the real model in `.module` when on GPU.
        state_dict = self.model.module.state_dict(
        ) if self.use_gpu else self.model.state_dict()
        # Save the model; when it ties the best accuracy so far it is also
        # saved as the best model (is_best flag).
        save_checkpoint(
            {
                'state_dict': state_dict,
                'epoch': epoch,
            },
            is_best=acc == max_acc,
            save_dir=self.save_dir,
            filename=save_model_name,
            acc=acc,
        )
        # If accuracy keeps dropping (12 in a row) or stays below the best
        # too long (24 epochs), stop training.
        if (drop_count == 12 or fail_max_count == 24) and self.enable_stop_machanism:
            print(
                'Accuracy dropping for %d times or smaller the max for %d times, stop in epoch %d\n'
                % (drop_count, fail_max_count, epoch))
            break
        # if overlap_rate_dict_ls[0][1] < .1:
        #     print('Top exter class overlap rate reach a smaller value than threshold, stop in epoch %d\n' % epoch)
        #     break
        self.margin = next_margin
    # Append a final summary line to the log (binary append, CRLF prefix).
    with open(os.path.join(self.save_dir, 'readme.txt'), 'ab+') as f:
        c = '\r\n[Hard Sample + Sample Re-weighting] Training finished with: %d epoch, %.2f%% accuracy.' % (
            epoch, max_acc * 100)
        f.write(c.encode())
    self._clean_tmp_model(epoch)
    pickle_write(
        './results/temp/%s_v5_overlap_rate_dict.pkl' % self.prefix,
        overlap_dict)
    return max_acc, epoch
def trainer(data_pth, a, b, _time=0, layers=18):
    """Train a triplet-loss ResNet baseline and checkpoint it each epoch.

    :param data_pth: training-data path, forwarded to ``train()``.
    :param a: triplet-loss margin used for training.
    :param b: stored in ``test_margin`` but not referenced below —
        TODO confirm whether it is still needed.
    :param _time: run id, used in the checkpoint directory name.
    :param layers: backbone depth; 18 selects ResNet-18, any other value
        selects ResNet-50.
    :return: filename of the last checkpoint saved.
    """
    seed = 0
    # dataset options
    # NOTE(review): several of these config locals (height/width, optim,
    # train_batch/test_batch, gamma, momentum, test_margin, num_instances,
    # last_stride, workers, pin_memory, the 34/101/152 model paths) are not
    # used in this body — kept for reference / future use, presumably.
    height, width = 128, 128
    # optimization options
    optim = 'Adam'
    max_epoch = 20
    train_batch = 64
    test_batch = 64
    lr = 0.1
    step_size = 40
    gamma = 0.1
    weight_decay = 5e-4
    momentum = 0.9
    test_margin = b
    margin = a
    num_instances = 4
    num_gpu = 1
    # model options
    last_stride = 1
    pretrained_model_18 = 'model/resnet18-5c106cde.pth'
    pretrained_model_50 = 'model/resnet50-19c8e357.pth'
    pretrained_model_34 = 'model/resnet34-333f7ec4.pth'
    pretrained_model_101 = 'model/resnet101-5d3b4d8f.pth'
    pretrained_model_152 = 'model/resnet152-b121ed2d.pth'
    # miscs
    print_freq = 10
    eval_step = 1
    save_dir = 'model/pytorch-ckpt/time%d' % _time
    workers = 1
    torch.manual_seed(seed)
    use_gpu = torch.cuda.is_available()
    if use_gpu:
        print('currently using GPU')
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(seed)
    else:
        print('currently using cpu')
    pin_memory = True if use_gpu else False
    # model, optim_policy = get_baseline_model(model_path=pretrained_model)
    if layers == 18:
        model, optim_policy = get_baseline_model(
            model_path=pretrained_model_18, layers=18)
    else:
        model, optim_policy = get_baseline_model(
            model_path=pretrained_model_50, layers=50)
    # model, optim_policy = get_baseline_model(model_path=pretrained_model_18, layers=18)
    # model, optim_policy = get_baseline_model(model_path=pretrained_model_34, layers=34)
    # model, optim_policy = get_baseline_model(model_path=pretrained_model_101, layers=101)
    # model = load_model(model, model_path='./model/pytorch-ckpt/87_layers18_margin20_epoch87.tar')
    print('model\'s parameters size: {:.5f} M'.format(
        sum(p.numel() for p in model.parameters()) / 1e6))
    tri_criterion = TripletLoss(margin)
    # get optimizer
    optimizer = torch.optim.Adam(optim_policy, lr=lr, weight_decay=weight_decay)

    def adjust_lr(optimizer, ep):
        # Stepped LR schedule by (1-based) epoch number.
        # NOTE(review): with max_epoch = 20 only the first branch ever runs;
        # also the `ep < 320` branch computes 0.1 ** (negative) which is a
        # multiplier > 1 — confirm the intended schedule.
        if ep < 20:
            lr = 1e-4 * (ep + 1) / 2
        elif ep < 80:
            lr = 1e-3 * num_gpu
        elif ep < 180:
            lr = 1e-4 * num_gpu
        elif ep < 300:
            lr = 1e-5 * num_gpu
        elif ep < 320:
            lr = 1e-5 * 0.1**((ep - 320) / 80) * num_gpu
        elif ep < 400:
            lr = 1e-6
        elif ep < 480:
            lr = 1e-4 * num_gpu
        else:
            lr = 1e-5 * num_gpu
        for p in optimizer.param_groups:
            p['lr'] = lr

    if use_gpu:
        model = nn.DataParallel(model).cuda()
    max_acc = .0
    for epoch in range(max_epoch):
        if step_size > 0:
            adjust_lr(optimizer, epoch + 1)
        next_margin = margin
        # skip if not save model (eval_step = 1 -> runs every epoch)
        if eval_step > 0 and (epoch + 1) % eval_step == 0 or (epoch + 1) == max_epoch:
            _t1 = time.time()
            train(model, optimizer, tri_criterion, epoch + 1,
                  print_freq, None, data_pth=data_pth)
            _t2 = time.time()
            print('time for training:', '%.2f' % (_t2 - _t1), 's')
            acc = evaluate_model(model, margin=20, epoch=1)
            if acc > max_acc:
                max_acc = acc
                print('max acc:', max_acc, ', epoch:', epoch + 1)
            # DataParallel stores the real model under `.module`.
            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_model_name = 'layers{}_margin{}_epoch{}.tar'.format(
                layers, margin, epoch + 1)
            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'epoch': epoch + 1,
                },
                is_best=False,
                save_dir=save_dir,
                filename=save_model_name)
        margin = next_margin
    return save_model_name
def main(): logger = logging.getLogger('global') global criterion_xent, criterion_triplet, criterion_center if os.path.exists(cfg.TRAIN.LOG_DIR): shutil.rmtree(cfg.TRAIN.LOG_DIR) os.makedirs(cfg.TRAIN.LOG_DIR) init_log('global', logging.INFO) # log add_file_handler('global', os.path.join(cfg.TRAIN.LOG_DIR, 'logs.txt'), logging.INFO) summary_writer = SummaryWriter(cfg.TRAIN.LOG_DIR) # visualise dataset, train_loader, _, _ = build_data_loader() model = BagReID_RESNET(dataset.num_train_bags) criterion_xent = CrossEntropyLabelSmooth(dataset.num_train_bags, use_gpu=cfg.CUDA) criterion_triplet = TripletLoss(margin=cfg.TRAIN.MARGIN) criterion_center = CenterLoss(dataset.num_train_bags, cfg.MODEL.GLOBAL_FEATS + cfg.MODEL.PART_FEATS, use_gpu=cfg.CUDA) if cfg.TRAIN.OPTIM == "sgd": optimizer = torch.optim.SGD(model.parameters(), lr=cfg.SOLVER.LEARNING_RATE, momentum=cfg.SOLVER.MOMENTUM, weight_decay=cfg.SOLVER.WEIGHT_DECAY) else: optimizer = torch.optim.Adam(model.parameters(), lr=cfg.SOLVER.LEARNING_RATE, weight_decay=cfg.SOLVER.WEIGHT_DECAY) center_optimizer = torch.optim.SGD(criterion_center.parameters(), lr=cfg.SOLVER.LEARNING_RATE_CENTER) optimizers = [optimizer, center_optimizer] schedulers = build_lr_schedulers(optimizers) if cfg.CUDA: model.cuda() if torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model, device_ids=cfg.DEVICES) logger.info("model prepare done") # start training for epoch in range(cfg.TRAIN.NUM_EPOCHS): train(epoch, train_loader, model, criterion, optimizers, summary_writer) for scheduler in schedulers: scheduler.step() # skip if not save model if cfg.TRAIN.EVAL_STEP > 0 and (epoch + 1) % cfg.TRAIN.EVAL_STEP == 0 \ or (epoch + 1) == cfg.TRAIN.NUM_EPOCHS: if cfg.CUDA and torch.cuda.device_count() > 1: state_dict = model.module.state_dict() else: state_dict = model.state_dict() save_checkpoint({ 'state_dict': state_dict, 'epoch': epoch + 1 }, is_best=False, save_dir=cfg.TRAIN.SNAPSHOT_DIR, filename='checkpoint_ep' + str(epoch + 1) + '.pth')
def train(**kwargs): opt._parse(kwargs) #opt.lr=0.00002 opt.model_name = 'AlignedReid' # set random seed and cudnn benchmark torch.manual_seed(opt.seed) os.makedirs(opt.save_dir, exist_ok=True) use_gpu = torch.cuda.is_available() sys.stdout = Logger(osp.join(opt.save_dir, 'log_train.txt')) print('=========user config==========') pprint(opt._state_dict()) print('============end===============') if use_gpu: print('currently using GPU') cudnn.benchmark = True torch.cuda.manual_seed_all(opt.seed) else: print('currently using cpu') print('initializing dataset {}'.format(opt.dataset)) dataset = data_manager.init_dataset(name=opt.dataset, mode=opt.mode) pin_memory = True if use_gpu else False summary_writer = SummaryWriter(osp.join(opt.save_dir, 'tensorboard_log')) trainloader = DataLoader(ImageData(dataset.train, TrainTransform(opt.datatype)), sampler=RandomIdentitySampler( dataset.train, opt.num_instances), batch_size=opt.train_batch, num_workers=opt.workers, pin_memory=pin_memory, drop_last=True) queryloader = DataLoader(ImageData(dataset.query, TestTransform(opt.datatype)), batch_size=opt.test_batch, num_workers=opt.workers, pin_memory=pin_memory) galleryloader = DataLoader(ImageData(dataset.gallery, TestTransform(opt.datatype)), batch_size=opt.test_batch, num_workers=opt.workers, pin_memory=pin_memory) queryFliploader = queryloader galleryFliploader = galleryloader # queryFliploader = DataLoader( # ImageData(dataset.query, TestTransform(opt.datatype, True)), # batch_size=opt.test_batch, num_workers=opt.workers, # pin_memory=pin_memory # ) # # galleryFliploader = DataLoader( # ImageData(dataset.gallery, TestTransform(opt.datatype, True)), # batch_size=opt.test_batch, num_workers=opt.workers, # pin_memory=pin_memory # ) print('initializing model ...') model = AlignedResNet50(num_classes=dataset.num_train_pids, loss={'softmax', 'metric'}, aligned=True, use_gpu=use_gpu) optim_policy = model.get_optim_policy() if opt.pretrained_model: state_dict = 
torch.load(opt.pretrained_model)['state_dict'] # state_dict = {k: v for k, v in state_dict.items() \ # if not ('reduction' in k or 'softmax' in k)} model.load_state_dict(state_dict, False) print('load pretrained model ' + opt.pretrained_model) print('model size: {:.5f}M'.format( sum(p.numel() for p in model.parameters()) / 1e6)) if use_gpu: model = nn.DataParallel(model).cuda() reid_evaluator = AlignedEvaluator(model) if opt.evaluate: #rank1 = test(model, queryloader, galleryloader, use_gpu) reid_evaluator.evaluate(queryloader, galleryloader, queryFliploader, galleryFliploader, re_ranking=opt.re_ranking, savefig=opt.savefig, test_distance='global') return # xent_criterion = nn.CrossEntropyLoss() xent_criterion = CrossEntropyLabelSmooth(dataset.num_train_pids, use_gpu=use_gpu) embedding_criterion = TripletLossAlignedReID(margin=opt.margin) # def criterion(triplet_y, softmax_y, labels): # losses = [embedding_criterion(output, labels)[0] for output in triplet_y] + \ # [xent_criterion(output, labels) for output in softmax_y] # loss = sum(losses) # return loss def criterion(outputs, features, local_features, labels): if opt.htri_only: if isinstance(features, tuple): global_loss, local_loss = DeepSupervision( embedding_criterion, features, labels) else: global_loss, local_loss = embedding_criterion( features, labels, local_features) else: if isinstance(outputs, tuple): xent_loss = DeepSupervision(xent_criterion, outputs, labels) else: xent_loss = xent_criterion(outputs, labels) if isinstance(features, tuple): global_loss, local_loss = DeepSupervision( embedding_criterion, features, labels) else: global_loss, local_loss = embedding_criterion( features, labels, local_features) loss = xent_loss + global_loss + local_loss return loss # get optimizer if opt.optim == "sgd": optimizer = torch.optim.SGD(optim_policy, lr=opt.lr, momentum=0.9, weight_decay=opt.weight_decay) else: optimizer = torch.optim.Adam(optim_policy, lr=opt.lr, weight_decay=opt.weight_decay) start_epoch = 
opt.start_epoch # get trainer and evaluator reid_trainer = AlignedTrainer(opt, model, optimizer, criterion, summary_writer) def adjust_lr(optimizer, ep): if ep < 50: lr = opt.lr * (ep // 5 + 1) elif ep < 200: lr = opt.lr * 10 elif ep < 300: lr = opt.lr else: lr = opt.lr * 0.1 for p in optimizer.param_groups: p['lr'] = lr # start training best_rank1 = opt.best_rank best_epoch = 0 print('start train......') for epoch in range(start_epoch, opt.max_epoch): if opt.adjust_lr: adjust_lr(optimizer, epoch + 1) reid_trainer.train(epoch, trainloader) # skip if not save model if opt.eval_step > 0 and (epoch + 1) % opt.eval_step == 0 or ( epoch + 1) == opt.max_epoch: # just avoid out of memory during eval,and can't save the model if use_gpu: state_dict = model.module.state_dict() else: state_dict = model.state_dict() save_checkpoint({ 'state_dict': state_dict, 'epoch': epoch + 1 }, is_best=0, save_dir=opt.save_dir, filename='checkpoint_ep' + str(epoch + 1) + '.pth.tar') if opt.mode == 'class': rank1 = test(model, queryloader) else: rank1 = reid_evaluator.evaluate(queryloader, galleryloader, queryFliploader, galleryFliploader) #rank1 = test(model, queryloader, galleryloader, use_gpu) is_best = rank1 > best_rank1 if is_best: best_rank1 = rank1 best_epoch = epoch + 1 if use_gpu: state_dict = model.module.state_dict() else: state_dict = model.state_dict() if is_best: save_checkpoint({ 'state_dict': state_dict, 'epoch': epoch + 1 }, is_best=is_best, save_dir=opt.save_dir, filename='checkpoint_ep' + str(epoch + 1) + '.pth.tar') print('Best rank-1 {:.1%}, achived at epoch {}'.format( best_rank1, best_epoch))
def train(**kwargs):
    """BFE (Batch Feature Erasing) training entry point.

    Parses CLI overrides into ``opt``, builds dataset/loaders (including
    horizontally-flipped query/gallery loaders), constructs the BFE model
    with a softmax + embedding criterion chosen by ``opt.loss``, then either
    evaluates once (``opt.evaluate``) or trains for ``opt.max_epoch``
    epochs, checkpointing every ``opt.eval_step`` epochs.

    :param kwargs: option overrides applied via ``opt._parse``.
    """
    opt._parse(kwargs)
    opt.model_name = 'bfe_test'
    # set random seed and cudnn benchmark
    torch.manual_seed(opt.seed)
    os.makedirs(opt.save_dir, exist_ok=True)
    use_gpu = torch.cuda.is_available()
    # Redirect stdout so every print below also lands in log_train.txt.
    sys.stdout = Logger(osp.join(opt.save_dir, 'log_train.txt'))
    print('=========user config==========')
    pprint(opt._state_dict())
    print('============end===============')
    if use_gpu:
        print('currently using GPU')
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(opt.seed)
    else:
        print('currently using cpu')
    print('initializing dataset {}'.format(opt.dataset))
    dataset = data_manager.init_dataset(name=opt.dataset, mode=opt.mode)
    pin_memory = True if use_gpu else False
    summary_writer = SummaryWriter(osp.join(opt.save_dir, 'tensorboard_log'))
    # P-K sampling: num_instances images per identity per batch.
    trainloader = DataLoader(ImageData(dataset.train,
                                       TrainTransform(opt.datatype)),
                             sampler=RandomIdentitySampler(
                                 dataset.train, opt.num_instances),
                             batch_size=opt.train_batch,
                             num_workers=opt.workers,
                             pin_memory=pin_memory,
                             drop_last=True)
    queryloader = DataLoader(ImageData(dataset.query,
                                       TestTransform(opt.datatype)),
                             batch_size=opt.test_batch,
                             num_workers=opt.workers,
                             pin_memory=pin_memory)
    galleryloader = DataLoader(ImageData(dataset.gallery,
                                         TestTransform(opt.datatype)),
                               batch_size=opt.test_batch,
                               num_workers=opt.workers,
                               pin_memory=pin_memory)
    # Flip-augmented loaders (TestTransform(..., True) flips the images).
    queryFliploader = DataLoader(ImageData(dataset.query,
                                           TestTransform(opt.datatype, True)),
                                 batch_size=opt.test_batch,
                                 num_workers=opt.workers,
                                 pin_memory=pin_memory)
    galleryFliploader = DataLoader(ImageData(dataset.gallery,
                                             TestTransform(opt.datatype, True)),
                                   batch_size=opt.test_batch,
                                   num_workers=opt.workers,
                                   pin_memory=pin_memory)
    print('initializing model ...')
    model = BFE(dataset.num_train_pids, 1.0, 0.33)
    optim_policy = model.get_optim_policy()
    if opt.pretrained_model:
        state_dict = torch.load(opt.pretrained_model)['state_dict']
        # state_dict = {k: v for k, v in state_dict.items() \
        #        if not ('reduction' in k or 'softmax' in k)}
        # Non-strict load: tolerates missing/unexpected keys.
        model.load_state_dict(state_dict, False)
        print('load pretrained model ' + opt.pretrained_model)
    print('model size: {:.5f}M'.format(
        sum(p.numel() for p in model.parameters()) / 1e6))
    if use_gpu:
        model = nn.DataParallel(model).cuda()
    reid_evaluator = ResNetEvaluator(model)
    if opt.evaluate:
        reid_evaluator.evaluate(queryloader, galleryloader,
                                queryFliploader, galleryFliploader,
                                re_ranking=opt.re_ranking,
                                savefig=opt.savefig)
        return
    # xent_criterion = nn.CrossEntropyLoss()
    xent_criterion = CrossEntropyLabelSmooth(dataset.num_train_pids)
    # NOTE(review): no fallback branch — if opt.loss is not one of
    # triplet/lifted/weight, `embedding_criterion` stays unbound and the
    # first criterion() call raises NameError; confirm valid opt.loss values.
    if opt.loss == 'triplet':
        embedding_criterion = TripletLoss(opt.margin)
    elif opt.loss == 'lifted':
        embedding_criterion = LiftedStructureLoss(hard_mining=True)
    elif opt.loss == 'weight':
        embedding_criterion = Margin()

    def criterion(triplet_y, softmax_y, labels):
        # Sum of embedding losses over triplet heads plus cross-entropy
        # losses over softmax heads.
        losses = [embedding_criterion(output, labels)[0] for output in triplet_y] + \
                 [xent_criterion(output, labels) for output in softmax_y]
        loss = sum(losses)
        return loss

    # get optimizer
    if opt.optim == "sgd":
        optimizer = torch.optim.SGD(optim_policy, lr=opt.lr,
                                    momentum=0.9,
                                    weight_decay=opt.weight_decay)
    else:
        optimizer = torch.optim.Adam(optim_policy, lr=opt.lr,
                                     weight_decay=opt.weight_decay)
    start_epoch = opt.start_epoch
    # get trainer and evaluator
    reid_trainer = cls_tripletTrainer(opt, model, optimizer, criterion,
                                      summary_writer)

    def adjust_lr(optimizer, ep):
        # Stepped LR schedule (1-based epoch) with a linear warm-up.
        if ep < 10:
            lr = opt.lr * 0.1 * (ep / 10.0)  # warm_up
        elif ep < 50:
            lr = opt.lr * (ep // 5 + 1)
        elif ep < 200:
            lr = opt.lr * 10.0
        elif ep < 300:
            lr = opt.lr
        else:
            lr = opt.lr * 0.1
        for p in optimizer.param_groups:
            p['lr'] = lr

    # start training
    best_rank1 = opt.best_rank
    best_epoch = 0
    for epoch in range(start_epoch, opt.max_epoch):
        if opt.adjust_lr:
            adjust_lr(optimizer, epoch + 1)
        reid_trainer.train(epoch, trainloader)
        # skip if not save model
        if opt.eval_step > 0 and (epoch + 1) % opt.eval_step == 0 or (
                epoch + 1) == opt.max_epoch:
            if opt.mode == 'class':
                rank1 = test(model, queryloader)
            else:
                rank1 = reid_evaluator.evaluate(queryloader, galleryloader,
                                                queryFliploader,
                                                galleryFliploader)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1
            # DataParallel stores the real model under `.module`.
            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint({
                'state_dict': state_dict,
                'epoch': epoch + 1
            }, is_best=is_best, save_dir=opt.save_dir,
                filename='checkpoint_ep' + str(epoch + 1) + '.pth.tar')
    print('Best rank-1 {:.1%}, achived at epoch {}'.format(
        best_rank1, best_epoch))
def train(**kwargs):
    """MXNet/Gluon ReID training entry point.

    Parses CLI overrides into ``opt``, builds dataset/loaders, constructs a
    baseline model on ``mx.gpu(0)``, picks the criterion matching
    ``opt.model_name`` (softmax / triplet / softmax_triplet), then trains
    for ``opt.max_epoch`` epochs, evaluating and checkpointing every
    ``opt.eval_step`` epochs and tracking the best rank-1.

    :param kwargs: option overrides applied via ``opt._parse``.
    """
    opt._parse(kwargs)
    # set random seed and cudnn benchmark
    # Redirect stdout so every print below also lands in log_train.txt.
    sys.stdout = Logger(osp.join(opt.save_dir, 'log_train.txt'))
    print('=========user config==========')
    pprint(opt._state_dict())
    print('============end===============')
    print('initializing dataset {}'.format(opt.dataset))
    dataset = data_manager.init_dataset(name=opt.dataset)
    summary_writer = SummaryWriter(osp.join(opt.save_dir, 'tensorboard_log'))
    if 'triplet' in opt.model_name:
        # P-K identity sampling; drop the ragged last batch.
        trainloader = DataLoader(
            ImageData(dataset.train, TrainTransform(opt.height, opt.width)),
            sampler=RandomIdentitySampler(dataset.train, opt.num_instances),
            batch_size=opt.train_batch,
            num_workers=opt.workers,
            last_batch='discard')
    else:
        trainloader = DataLoader(
            ImageData(dataset.train, TrainTransform(opt.height, opt.width)),
            batch_size=opt.train_batch,
            shuffle=True,
            num_workers=opt.workers,
        )
    queryloader = DataLoader(
        ImageData(dataset.query, TestTransform(opt.height, opt.width)),
        batch_size=opt.test_batch,
        num_workers=opt.workers,
    )
    galleryloader = DataLoader(
        ImageData(dataset.gallery, TestTransform(opt.height, opt.width)),
        batch_size=opt.test_batch,
        num_workers=opt.workers,
    )
    print('initializing model ...')
    model = get_baseline_model(dataset.num_train_pids, mx.gpu(0),
                               opt.pretrained_model)
    print('model size: {:.5f}M'.format(
        sum(p.data().size for p in model.collect_params().values()) / 1e6))
    xent_criterion = gluon.loss.SoftmaxCrossEntropyLoss()
    tri_criterion = TripletLoss(opt.margin)

    def cls_criterion(cls_scores, feat, targets):
        # Classification-only loss.
        cls_loss = xent_criterion(cls_scores, targets)
        return cls_loss

    def triplet_criterion(cls_scores, feat, targets):
        # Triplet-only loss on the embedding features.
        triplet_loss, dist_ap, dist_an = tri_criterion(feat, targets)
        return triplet_loss

    def cls_tri_criterion(cls_scores, feat, targets):
        # Combined classification + triplet loss.
        cls_loss = xent_criterion(cls_scores, targets)
        triplet_loss, dist_ap, dist_an = tri_criterion(feat, targets)
        loss = cls_loss + triplet_loss
        return loss

    # get optimizer
    optimizer = gluon.Trainer(model.collect_params(), opt.optim, {
        'learning_rate': opt.lr,
        'wd': opt.weight_decay
    })

    def adjust_lr(optimizer, ep):
        # Stepped LR schedule by (1-based) epoch number.
        # NOTE(review): the `ep < 320` branch computes 0.1 ** (negative),
        # i.e. a multiplier > 1 — confirm the intended schedule.
        if ep < 20:
            lr = 1e-4 * (ep + 1) / 2
        elif ep < 80:
            lr = 1e-3 * opt.num_gpu
        elif ep < 180:
            lr = 1e-4 * opt.num_gpu
        elif ep < 300:
            lr = 1e-5 * opt.num_gpu
        elif ep < 320:
            lr = 1e-5 * 0.1**((ep - 320) / 80) * opt.num_gpu
        elif ep < 400:
            lr = 1e-6
        elif ep < 480:
            lr = 1e-4 * opt.num_gpu
        else:
            lr = 1e-5 * opt.num_gpu
        optimizer.set_learning_rate(lr)

    start_epoch = opt.start_epoch
    # get trainer and evaluator
    use_criterion = None
    if opt.model_name == 'softmax':
        use_criterion = cls_criterion
    elif opt.model_name == 'softmax_triplet':
        use_criterion = cls_tri_criterion
    elif opt.model_name == 'triplet':
        use_criterion = triplet_criterion
    reid_trainer = reidTrainer(opt, model, optimizer, use_criterion,
                               summary_writer, mx.gpu(0))
    reid_evaluator = reidEvaluator(model, mx.gpu(0))
    # start training
    best_rank1 = -np.inf
    best_epoch = 0
    for epoch in range(start_epoch, opt.max_epoch):
        if opt.step_size > 0:
            adjust_lr(optimizer, epoch + 1)
        reid_trainer.train(epoch, trainloader)
        # skip if not save model
        if opt.eval_step > 0 and (epoch + 1) % opt.eval_step == 0 or (
                epoch + 1) == opt.max_epoch:
            rank1 = reid_evaluator.evaluate(queryloader, galleryloader)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1
            state_dict = {'model': model, 'epoch': epoch}
            save_checkpoint(state_dict,
                            is_best=is_best,
                            save_dir=opt.save_dir,
                            filename='checkpoint_ep' + str(epoch + 1) + '.params')
    print('Best rank-1 {:.1%}, achived at epoch {}'.format(
        best_rank1, best_epoch))