def init_log(self,
             data_bunch: db.VideoDataBunch = None,
             model: nn.Module = None,
             criterion: nn.Module = None,
             optimizer: optim.Adam = None,
             lr_scheduler: lrs.MultiStepLR = None):
    self.print_options()
    if self.opts.debug:
        self.log(str(data_bunch))
        self.log(str(model))
        self.log(str(criterion))
        self.log(str(optimizer))
        self.log(str(lr_scheduler.state_dict() if lr_scheduler else None))
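# --- A minimal, self-contained sketch (not from the snippet above) of what the
# --- MultiStepLR.state_dict() logged by init_log() typically contains. The
# --- stand-in model, optimizer, and milestone values are illustrative assumptions.
import torch
from torch import nn, optim
from torch.optim.lr_scheduler import MultiStepLR

_model = nn.Linear(4, 2)                     # hypothetical stand-in model
_optimizer = optim.Adam(_model.parameters(), lr=1e-3)
_scheduler = MultiStepLR(_optimizer, milestones=[30, 60], gamma=0.1)

# state_dict() holds the scheduler's own attributes (milestones, gamma,
# last_epoch, ...) but not the optimizer state, which is why the checkpoints
# in the snippets below save the optimizer and scheduler dicts separately.
print(_scheduler.state_dict())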
def train_general(args):
    args.optimizer = 'Adam'
    args.n_classes = 2
    args.batch_size = 8
    # os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
    # print(args.model_name)
    # print(args.test)
    if args.model_name == 'FCNet':
        model = FCNet(args).cuda()
        model = torch.nn.DataParallel(model)
        if args.optimizer == 'SGD':
            optimizer = SGD(model.parameters(), .1, weight_decay=5e-4, momentum=.99)
        elif args.optimizer == 'Adam':
            optimizer = Adam(model.parameters(), .1, weight_decay=5e-4)
        criterion = cross_entropy2d
        scheduler = MultiStepLR(optimizer, [100, 200, 400, 800, 3200], .1)
    elif args.model_name == 'CENet':
        model = CE_Net_(args).cuda()
        model = torch.nn.DataParallel(model)
        if args.optimizer == 'SGD':
            optimizer = SGD(model.parameters(), .1, weight_decay=5e-4, momentum=.99)
            scheduler = MultiStepLR(optimizer, [100, 200, 400, 800, 3200], .1)
        elif args.optimizer == 'Adam':
            optimizer = Adam(model.parameters(), .001, weight_decay=5e-4)
            scheduler = MultiStepLR(optimizer, [400, 3200], .1)
        # criterion = cross_entropy2d
        criterion = DiceLoss()
        # scheduler = MultiStepLR(optimizer, [100, 200, 400, 800, 3200], .1)

    start_iter = 0
    if args.model_path is not None:
        if os.path.isfile(args.model_path):
            checkpoint = torch.load(args.model_path)
            model.load_state_dict(checkpoint["model_state"])
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            scheduler.load_state_dict(checkpoint["scheduler_state"])
            start_iter = checkpoint["epoch"]
        else:
            print('Unable to load {}'.format(args.model_path))

    train_loader, valid_loader = get_loaders(args)

    os.makedirs('logs/', exist_ok=True)
    os.makedirs('results/' + args.model_name, exist_ok=True)

    writer = SummaryWriter(log_dir='logs/')
    best = -100.0
    i = start_iter
    flag = True
    running_metrics_val = Acc_Meter()
    val_loss_meter = averageMeter()
    time_meter = averageMeter()

    # while i <= args.niter and flag:
    while i <= 300000 and flag:
        for (images, labels) in train_loader:
            i += 1
            start_ts = time.time()
            scheduler.step()
            model.train()
            images = images.cuda()
            labels = labels.cuda()
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(input=outputs, target=labels)
            loss.backward()
            optimizer.step()
            time_meter.update(time.time() - start_ts)

            # if (i + 1) % cfg["training"]["print_interval"] == 0:
            if (i + 1) % 50 == 0:
                fmt_str = "Iter [{:d}/{:d}] Loss: {:.4f} Time/Image: {:.4f}"
                print_str = fmt_str.format(
                    i + 1,
                    300000,
                    loss.item(),
                    time_meter.avg / args.batch_size,
                )
                print(print_str)
                # logger.info(print_str)
                # writer.add_scalar("loss/train_loss", loss.item(), i + 1)
                # time_meter.reset()

            # if (i + 1) % cfg["training"]["val_interval"] == 0 or (i + 1) == cfg["training"]["train_iters"]:
            if (i + 1) % 500 == 0 or (i + 1) == 300000:
                model.eval()
                with torch.no_grad():
                    for i_val, (images_val, labels_val) in tqdm(enumerate(valid_loader)):
                        images_val = images_val.cuda()  # to(device)
                        labels_val = labels_val.cuda()  # to(device)
                        outputs = model(images_val)
                        # val_loss = loss_fn(input=outputs, target=labels_val)
                        val_loss = criterion(input=outputs, target=labels_val)
                        pred = outputs.data.max(1)[1].cpu().numpy()
                        gt = labels_val.data.cpu().numpy()
                        running_metrics_val.update(gt, pred)
                        val_loss_meter.update(val_loss.item())

                # writer.add_scalar("loss/val_loss", val_loss_meter.avg, i + 1)
                print("Iter %d Loss: %.4f" % (i + 1, val_loss_meter.avg))
                results = running_metrics_val.get_acc()
                for k, v in results.items():
                    writer.add_scalar(k, v, i + 1)
                print(results)
                val_loss_meter.reset()
                running_metrics_val.reset()

                if results['cls_acc'] >= best:
                    best = results['cls_acc']
                    state = {
                        "epoch": i + 1,
                        "model_state": model.state_dict(),
                        "optimizer_state": optimizer.state_dict(),
                        "scheduler_state": scheduler.state_dict(),
                        "best": best,
                    }
                    save_path = os.path.join(
                        "results/{}/results_{}_best_model.pkl".format(args.model_name, i + 1),
                    )
                    torch.save(state, save_path)

            if (i + 1) == 300000:
                flag = False
                break

    writer.close()
def main():
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    cudnn.benchmark = True
    start_epoch = args.start_epoch
    lr_decay_step = list(map(int, args.lr_decay_step.split(',')))

    # Data loading
    print_logger.info('=> Preparing data..')
    loader = import_module('data.' + args.dataset).Data(args)

    num_classes = 0
    if args.dataset in ['cifar10']:
        num_classes = 10

    model = eval(args.block_type + 'ResNet56_od')(
        groups=args.group_num,
        expansion=args.expansion,
        num_stu=args.num_stu,
        num_classes=num_classes).cuda()

    if len(args.gpu) > 1:
        device_id = []
        for i in range((len(args.gpu) + 1) // 2):
            device_id.append(i)
        model = torch.nn.DataParallel(model, device_ids=device_id)

    best_prec = 0.0

    if not model:
        print_logger.info("Model arch Error")
        return
    print_logger.info(model)

    # Define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    scheduler = MultiStepLR(optimizer,
                            milestones=lr_decay_step,
                            gamma=args.lr_decay_factor)

    # Optionally resume from a checkpoint
    resume = args.resume
    if resume:
        print('=> Loading checkpoint {}'.format(resume))
        checkpoint = torch.load(resume)
        state_dict = checkpoint['state_dict']
        if args.adjust_ckpt:
            new_state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
        else:
            new_state_dict = state_dict
        if args.start_epoch == 0:
            start_epoch = checkpoint['epoch']
        best_prec = checkpoint['best_prec']
        model.load_state_dict(new_state_dict)
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        print('=> Continue from epoch {}...'.format(start_epoch))

    if args.test_only:
        test_prec = test(args, loader.loader_test, model)
        print('=> Test Prec@1: {:.2f}'.format(test_prec[0]))
        return

    record_top5 = 0.
    for epoch in range(start_epoch, args.epochs):
        scheduler.step(epoch)
        train_loss, train_prec = train(args, loader.loader_train, model,
                                       criterion, optimizer, epoch)
        test_prec = test(args, loader.loader_test, model, epoch)

        is_best = best_prec < test_prec[0]
        if is_best:
            record_top5 = test_prec[1]
        best_prec = max(test_prec[0], best_prec)

        state = {
            'state_dict': model.state_dict(),
            'test_prec': test_prec[0],
            'best_prec': best_prec,
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
            'epoch': epoch + 1
        }
        if epoch % args.save_freq == 0 or is_best:
            ckpt.save_model(state, epoch + 1, is_best)

    print_logger.info("=> Best accuracy {:.3f}, {:.3f}".format(best_prec, record_top5))
def train(train_loop_func, logger, args):
    if args.amp:
        amp_handle = amp.init(enabled=args.fp16)
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    if args.seed is None:
        args.seed = np.random.randint(1e4)
    if args.distributed:
        args.seed = (args.seed + torch.distributed.get_rank()) % 2**32
    print("Using seed = {}".format(args.seed))
    torch.manual_seed(args.seed)
    np.random.seed(seed=args.seed)

    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)

    train_loader = get_train_loader(args, args.seed - 2**31)
    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)

    ssd300 = SSD300(backbone=args.backbone)
    args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
    start_epoch = 0
    iteration = 0
    loss_func = Loss(dboxes)

    if use_cuda:
        ssd300.cuda()
        loss_func.cuda()

    if args.fp16 and not args.amp:
        ssd300 = network_to_half(ssd300)

    if args.distributed:
        ssd300 = DDP(ssd300)

    optimizer = torch.optim.SGD(tencent_trick(ssd300),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep, gamma=0.1)

    if args.fp16:
        if args.amp:
            optimizer = amp_handle.wrap_optimizer(optimizer)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.)

    if args.checkpoint is not None:
        if os.path.isfile(args.checkpoint):
            load_checkpoint(ssd300, args.checkpoint)
            checkpoint = torch.load(
                args.checkpoint,
                map_location=lambda storage, loc: storage.cuda(torch.cuda.current_device()))
            start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            scheduler.load_state_dict(checkpoint['scheduler'])
            ssd300.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print('Provided checkpoint is not path to a file')
            return

    inv_map = {v: k for k, v in val_dataset.label_map.items()}
    total_time = 0

    if args.mode == 'evaluation':
        acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
        if args.local_rank == 0:
            print('Model precision {} mAP'.format(acc))
        return

    mean, std = generate_mean_std(args)

    for epoch in range(start_epoch, args.epochs):
        start_epoch_time = time.time()
        scheduler.step()
        iteration = train_loop_func(ssd300, loss_func, epoch, optimizer, train_loader,
                                    val_dataloader, encoder, iteration, logger, args,
                                    mean, std)
        end_epoch_time = time.time() - start_epoch_time
        total_time += end_epoch_time

        if args.local_rank == 0:
            logger.update_epoch_time(epoch, end_epoch_time)

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
            if args.local_rank == 0:
                logger.update_epoch(epoch, acc)

        if args.save and args.local_rank == 0:
            print("saving model...")
            obj = {
                'epoch': epoch + 1,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                'label_map': val_dataset.label_info
            }
            if args.distributed:
                obj['model'] = ssd300.module.state_dict()
            else:
                obj['model'] = ssd300.state_dict()
            torch.save(obj, './models/epoch_{}.pt'.format(epoch))
        train_loader.reset()
    print('total training time: {}'.format(total_time))
def train(train_loop_func, logger, args):
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda
    train_samples = 118287

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='smddp', init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    if args.seed is None:
        args.seed = np.random.randint(1e4)
    if args.distributed:
        args.seed = (args.seed + torch.distributed.get_rank()) % 2**32
    print("Using seed = {}".format(args.seed))
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    np.random.seed(seed=args.seed)

    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)

    train_loader = get_train_loader(args, args.seed - 2**31)
    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)

    ssd300 = SSD300(backbone=ResNet(args.backbone, args.backbone_path))
    args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
    start_epoch = 0
    iteration = 0
    loss_func = Loss(dboxes)

    if use_cuda:
        ssd300.cuda()
        loss_func.cuda()

    optimizer = torch.optim.SGD(tencent_trick(ssd300),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep, gamma=0.1)

    if args.amp:
        ssd300, optimizer = amp.initialize(ssd300, optimizer, opt_level='O2')

    if args.distributed:
        ssd300 = DDP(ssd300)

    if args.checkpoint is not None:
        if os.path.isfile(args.checkpoint):
            load_checkpoint(ssd300.module if args.distributed else ssd300, args.checkpoint)
            checkpoint = torch.load(
                args.checkpoint,
                map_location=lambda storage, loc: storage.cuda(torch.cuda.current_device()))
            start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            scheduler.load_state_dict(checkpoint['scheduler'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print('Provided checkpoint is not path to a file')
            return

    inv_map = {v: k for k, v in val_dataset.label_map.items()}
    total_time = 0

    if args.mode == 'evaluation':
        acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
        if args.local_rank == 0:
            print('Model precision {} mAP'.format(acc))
        return

    mean, std = generate_mean_std(args)

    for epoch in range(start_epoch, args.epochs):
        start_epoch_time = time.time()
        scheduler.step()
        iteration = train_loop_func(ssd300, loss_func, epoch, optimizer, train_loader,
                                    val_dataloader, encoder, iteration, logger, args,
                                    mean, std)
        end_epoch_time = time.time() - start_epoch_time
        total_time += end_epoch_time

        if torch.distributed.get_rank() == 0:
            throughput = train_samples / end_epoch_time
            logger.update_epoch_time(epoch, end_epoch_time)
            logger.update_throughput_speed(epoch, throughput)

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)

        if args.save and args.local_rank == 0:
            print("saving model...")
            obj = {
                'epoch': epoch + 1,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                'label_map': val_dataset.label_info
            }
            if args.distributed:
                obj['model'] = ssd300.module.state_dict()
            else:
                obj['model'] = ssd300.state_dict()
            save_path = os.path.join(args.save, f'epoch_{epoch}.pt')
            torch.save(obj, save_path)
            logger.log('model path', save_path)
        train_loader.reset()

    if torch.distributed.get_rank() == 0:
        DLLogger.log((), {'Total training time': '%.2f' % total_time + ' secs'})
        logger.log_summary()
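# --- Hedged side note (assumption about PyTorch >= 1.1, not taken from the SSD
# --- code above): both train() variants call scheduler.step() at the top of each
# --- epoch, the pre-1.1 convention. Current PyTorch expects the weight updates
# --- first, then one scheduler step per epoch, as in this minimal sketch:
from torch import nn, optim
from torch.optim.lr_scheduler import MultiStepLR

_model = nn.Linear(4, 2)
_optimizer = optim.SGD(_model.parameters(), lr=0.1)
_scheduler = MultiStepLR(_optimizer, milestones=[2, 4], gamma=0.1)

for epoch in range(6):
    _optimizer.step()        # stands in for the epoch's weight updates
    _scheduler.step()        # then advance the LR schedule once per epoch
    print(epoch, _scheduler.get_last_lr())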
model_state_path = os.path.join(
    Path.MODEL_DIR, 'densenet161_noisy_gpu_{}.tar'.format(args.gpu))

for epoch in range(MAX_ITERATIONS):
    model.train()
    for idx, ((x1, y1), (x2, y2)) in enumerate(zip(focus_dl, train_dl)):
        optimizer.zero_grad()
        x1, y1 = x1.float().cuda(), y1.float().cuda()
        x2, y2 = x2.float().cuda(), y2.float().cuda()
        x2, y2 = mixup_data(x2, y2, x2, y2, alpha=alpha)
        out2 = model(x2)
        out1 = model(x1)
        loss2 = criterion(out2, y2)
        loss1 = criterion(out1, y1)
        loss = w1[epoch] * loss1 + w2[epoch] * loss2
        loss.backward()
        optimizer.step()
    scheduler.step()

    model_state = {
        "model_name": 'freesound',
        "optimizer": optimizer.state_dict(),
        "scheduler": scheduler.state_dict(),
        "state_dict": model.state_dict()
    }
    torch.save(model_state, model_state_path)

model_state = torch.load(model_state_path)
model.load_state_dict(model_state["state_dict"])
def main():
    if not torch.cuda.is_available():
        raise Exception("need gpu to train network!")

    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    cudnn.benchmark = True
    cudnn.enabled = True

    logger = get_logger(__name__, Config.log)
    Config.gpus = torch.cuda.device_count()
    logger.info("use {} gpus".format(Config.gpus))
    config = {
        key: value
        for key, value in Config.__dict__.items() if not key.startswith("__")
    }
    logger.info(f"args: {config}")

    start_time = time.time()

    # dataset and dataloader
    logger.info("start loading data")
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    train_dataset = ImageFolder(Config.train_dataset_path, train_transform)
    train_loader = DataLoader(
        train_dataset,
        batch_size=Config.batch_size,
        shuffle=True,
        num_workers=Config.num_workers,
        pin_memory=True,
    )
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    val_dataset = ImageFolder(Config.val_dataset_path, val_transform)
    val_loader = DataLoader(
        val_dataset,
        batch_size=Config.batch_size,
        num_workers=Config.num_workers,
        pin_memory=True,
    )
    logger.info("finish loading data")

    # network
    net = ChannelDistillResNet1834(Config.num_classes, Config.dataset_type)
    net = nn.DataParallel(net).cuda()

    # loss and optimizer
    criterion = []
    for loss_item in Config.loss_list:
        loss_name = loss_item["loss_name"]
        loss_type = loss_item["loss_type"]
        if "kd" in loss_type:
            criterion.append(losses.__dict__[loss_name](loss_item["T"]).cuda())
        else:
            criterion.append(losses.__dict__[loss_name]().cuda())

    optimizer = SGD(net.parameters(), lr=Config.lr, momentum=0.9, weight_decay=1e-4)
    scheduler = MultiStepLR(optimizer, milestones=[30, 60, 90], gamma=0.1)

    # only evaluate
    if Config.evaluate:
        # load best model
        if not os.path.isfile(Config.evaluate):
            raise Exception(f"{Config.evaluate} is not a file, please check it again")
        logger.info("start evaluating")
        logger.info(f"start resuming model from {Config.evaluate}")
        checkpoint = torch.load(Config.evaluate, map_location=torch.device("cpu"))
        net.load_state_dict(checkpoint["model_state_dict"])
        prec1, prec5 = validate(val_loader, net)
        logger.info(
            f"epoch {checkpoint['epoch']:0>3d}, top1 acc: {prec1:.2f}%, top5 acc: {prec5:.2f}%"
        )
        return

    start_epoch = 1
    # resume training
    if os.path.exists(Config.resume):
        logger.info(f"start resuming model from {Config.resume}")
        checkpoint = torch.load(Config.resume, map_location=torch.device("cpu"))
        start_epoch += checkpoint["epoch"]
        net.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
        logger.info(
            f"finish resuming model from {Config.resume}, epoch {checkpoint['epoch']}, "
            f"loss: {checkpoint['loss']:3f}, lr: {checkpoint['lr']:.6f}, "
            f"top1_acc: {checkpoint['acc']}%")

    if not os.path.exists(Config.checkpoints):
        os.makedirs(Config.checkpoints)

    logger.info("start training")
    best_acc = 0.
    for epoch in range(start_epoch, Config.epochs + 1):
        prec1, prec5, loss = train(train_loader, net, criterion, optimizer,
                                   scheduler, epoch, logger)
        logger.info(
            f"train: epoch {epoch:0>3d}, top1 acc: {prec1:.2f}%, top5 acc: {prec5:.2f}%"
        )
        prec1, prec5 = validate(val_loader, net)
        logger.info(
            f"val: epoch {epoch:0>3d}, top1 acc: {prec1:.2f}%, top5 acc: {prec5:.2f}%"
        )

        # remember best prec@1 and save checkpoint
        torch.save(
            {
                "epoch": epoch,
                "acc": prec1,
                "loss": loss,
                "lr": scheduler.get_lr()[0],
                "model_state_dict": net.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "scheduler_state_dict": scheduler.state_dict(),
            }, os.path.join(Config.checkpoints, "latest.pth"))
        if prec1 > best_acc:
            shutil.copyfile(os.path.join(Config.checkpoints, "latest.pth"),
                            os.path.join(Config.checkpoints, "best.pth"))
            best_acc = prec1

    training_time = (time.time() - start_time) / 3600
    logger.info(
        f"finish training, best acc: {best_acc:.2f}%, total training time: {training_time:.2f} hours"
    )
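# --- Hedged side note (assumption about PyTorch >= 1.4, not part of the code
# --- above): the checkpoint dict above reads the LR via scheduler.get_lr()[0].
# --- get_lr() is meant to be called internally by step(), and calling it from
# --- user code can warn or return a milestone-adjusted value; get_last_lr() is
# --- the documented accessor. A minimal sketch:
from torch import nn, optim
from torch.optim.lr_scheduler import MultiStepLR

_net = nn.Linear(4, 2)                       # hypothetical stand-in network
_opt = optim.SGD(_net.parameters(), lr=0.1)
_sched = MultiStepLR(_opt, milestones=[30, 60, 90], gamma=0.1)

current_lr = _sched.get_last_lr()[0]         # last LR computed by the scheduler
same_lr = _opt.param_groups[0]['lr']         # equivalent direct read
assert current_lr == same_lr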
def main():
    args = get_cli_args()

    print('Will save to ' + args.output_dir)
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    with open(os.path.join(args.output_dir, 'cmdline.txt'), 'w') as f:
        f.write(" ".join([
            "'" + a + "'" if (len(a) == 0 or a[0] != '-') else a
            for a in sys.argv
        ]))

    set_seed(args.seed)
    device = torch.device(args.device)
    writer = SummaryWriter(args.output_dir)

    train_dataset, test_dataset = create_tough_dataset(
        args,
        fold_nr=args.cvfold,
        n_folds=args.num_folds,
        seed=args.seed,
        exclude_Vertex_from_train=args.db_exclude_vertex,
        exclude_Prospeccts_from_train=args.db_exclude_prospeccts)
    logger.info('Train set size: %d, test set size: %d', len(train_dataset),
                len(test_dataset))

    # Create model and optimizer (or resume pre-existing)
    if args.resume != '':
        if args.resume == 'RESUME':
            args.resume = args.output_dir + '/model.pth.tar'
        model, optimizer, scheduler = resume(args, train_dataset, device)
    else:
        model = create_model(args, train_dataset, device)
        if args.input_normalization:
            model.set_input_scaler(estimate_scaler(args, train_dataset, nsamples=200))
        optimizer = create_optimizer(args, model)
        scheduler = MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_decay)

    ############

    def train():
        model.train()
        loader = torch.utils.data.DataLoader(train_dataset,
                                             batch_size=args.batch_size // args.batch_parts,
                                             num_workers=args.nworkers,
                                             shuffle=True,
                                             drop_last=True,
                                             worker_init_fn=set_worker_seed)
        if logging.getLogger().getEffectiveLevel() > logging.DEBUG:
            loader = tqdm(loader, ncols=100)

        loss_buffer, loss_stabil_buffer, pos_dist_buffer, neg_dist_buffer = [], [], [], []
        t0 = time.time()

        for bidx, batch in enumerate(loader):
            if 0 < args.max_train_samples < bidx * args.batch_size // args.batch_parts:
                break
            t_loader = 1000 * (time.time() - t0)

            inputs = batch['inputs'].to(device)  # dimensions: batch_size x (4 or 2) x 24 x 24 x 24
            targets = batch['targets'].to(device)
            if bidx % args.batch_parts == 0:
                optimizer.zero_grad()
            t0 = time.time()

            outputs = model(inputs.view(-1, *inputs.shape[2:]))
            outputs = outputs.view(*inputs.shape[:2], -1)

            loss_joint, loss_match, loss_stabil, pos_dist, neg_dist = compute_loss(
                args, outputs, targets, True)
            loss_joint.backward()

            if bidx % args.batch_parts == args.batch_parts - 1:
                if args.batch_parts > 1:
                    for p in model.parameters():
                        p.grad.data.div_(args.batch_parts)
                optimizer.step()

            t_trainer = 1000 * (time.time() - t0)
            loss_buffer.append(loss_match.item())
            loss_stabil_buffer.append(
                loss_stabil.item() if isinstance(loss_stabil, torch.Tensor) else loss_stabil)
            pos_dist_buffer.extend(pos_dist.cpu().numpy().tolist())
            neg_dist_buffer.extend(neg_dist.cpu().numpy().tolist())
            logger.debug('Batch loss %f, Loader time %f ms, Trainer time %f ms.',
                         loss_buffer[-1], t_loader, t_trainer)
            t0 = time.time()

        ret = {
            'loss': np.mean(loss_buffer),
            'loss_stabil': np.mean(loss_stabil_buffer),
            'pos_dist': np.mean(pos_dist_buffer),
            'neg_dist': np.mean(neg_dist_buffer)
        }
        return ret

    ############

    def test():
        model.eval()
        loader = torch.utils.data.DataLoader(test_dataset,
                                             batch_size=args.batch_size // args.batch_parts,
                                             num_workers=args.nworkers,
                                             worker_init_fn=set_worker_seed)
        if logging.getLogger().getEffectiveLevel() > logging.DEBUG:
            loader = tqdm(loader, ncols=100)

        loss_buffer, loss_stabil_buffer, pos_dist_buffer, neg_dist_buffer = [], [], [], []
        with torch.no_grad():
            for bidx, batch in enumerate(loader):
                if 0 < args.max_test_samples < bidx * args.batch_size // args.batch_parts:
                    break
                inputs = batch['inputs'].to(device)
                targets = batch['targets'].to(device)

                outputs = model(inputs.view(-1, *inputs.shape[2:]))
                outputs = outputs.view(*inputs.shape[:2], -1)

                loss_joint, loss_match, loss_stabil, pos_dist, neg_dist = compute_loss(
                    args, outputs, targets, False)

                loss_buffer.append(loss_match.item())
                loss_stabil_buffer.append(
                    loss_stabil.item() if isinstance(loss_stabil, torch.Tensor) else loss_stabil)
                pos_dist_buffer.extend(pos_dist.cpu().numpy().tolist())
                neg_dist_buffer.extend(neg_dist.cpu().numpy().tolist())

        return {
            'loss': np.mean(loss_buffer),
            'loss_stabil': np.mean(loss_stabil_buffer),
            'pos_dist': np.mean(pos_dist_buffer),
            'neg_dist': np.mean(neg_dist_buffer)
        }

    ############

    # Training loop
    for epoch in range(args.start_epoch, args.epochs):
        print(f'Epoch {epoch}/{args.epochs} ({args.output_dir}):')
        scheduler.step()

        train_stats = train()
        for k, v in train_stats.items():
            writer.add_scalar('train/' + k, v, epoch)
        print(f"-> Train distances: p {train_stats['pos_dist']}, n {train_stats['neg_dist']}, "
              f"\tLoss: {train_stats['loss']}")

        if (epoch + 1) % args.test_nth_epoch == 0 or epoch + 1 == args.epochs:
            test_stats = test()
            for k, v in test_stats.items():
                writer.add_scalar('test/' + k, v, epoch)
            print(f"-> Test distances: p {test_stats['pos_dist']}, n {test_stats['neg_dist']}, "
                  f"\tLoss: {test_stats['loss']}")

        torch.save(
            {
                'epoch': epoch + 1,
                'args': args,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()
            }, os.path.join(args.output_dir, 'model.pth.tar'))

        if math.isnan(train_stats['loss']):
            break
def main():
    # random.seed(0)
    # torch.manual_seed(0)
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False
    # np.random.seed(0)
    # torch.cuda.manual_seed(0)

    # set all hyperparameters
    network_name = 'WRN_40_2'
    num_epochs = 35
    batch_size = 1
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    n_retrain_epochs = 40
    trials = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9]
    lr = 3e-4
    opt = "Adam"
    use_temp = False
    use_steps = False

    # set paths
    checkpointPath = './GNN_model/CIFAR10_checkpoints/CP__num_e_{}__retrain_e_{}__lr_{}__opt_{}__useTemp_{}__useSteps_{}__epoch_{}.pt'.format(
        num_epochs, n_retrain_epochs, lr, opt, use_temp, use_steps, '{}')
    continue_train = False
    checkpointLoadPath = './GNN_model/CIFAR10_checkpoints/CP__num_e_{}__retrain_e_{}__lr_{}__opt_{}__useTemp_{}__useSteps_{}__epoch_{}.pt'.format(
        num_epochs, n_retrain_epochs, lr, opt, use_temp, use_steps, '20')

    # get GNN path
    info = networks_data.get(network_name)
    trained_model_path = info.get('trained_GNN_path').replace(
        '.pt', '___num_e_{}__retrain_e_{}__lr_{}__opt_{}__useTemp_{}__useSteps_{}.pt'.format(
            num_epochs, n_retrain_epochs, lr, opt, use_temp, use_steps))

    # declare GNN model
    model = GNNPrunningNet(in_channels=6, out_channels=128).to(device)
    if opt == "Adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    else:
        # lr = 0.1
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9,
                                    nesterov=True, weight_decay=5e-4)
    scheduler = MultiStepLR(optimizer,
                            milestones=[int(elem * num_epochs) for elem in [0.3, 0.6, 0.8]],
                            gamma=0.2)
    crit = GNN_prune_loss

    # declare TensorBoard writer
    summary_path = '{}-num_e_{}__retrain_e_{}__lr_{}__opt_{}__useTemp_{}__useSteps_{}/training'.format(
        network_name, num_epochs, n_retrain_epochs, lr, opt, use_temp, use_steps)
    writer = SummaryWriter(summary_path)

    root = info.get('root')
    net_graph_path = info.get('graph_path')
    sd_path = info.get('sd_path')
    net = info.get('network')
    orig_net_loss = info.get('orig_net_loss')
    isWRN = (network_name == "WRN_40_2")

    train_dataset = GraphDataset(root, network_name, isWRN, net_graph_path)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)

    orig_net = net().to(device)
    orig_net.load_state_dict(torch.load(sd_path, map_location=device))

    model.train()
    dataset_name = info.get('dataset_name')
    network_train_data = datasets_train.get(dataset_name)

    print("Start training")
    if continue_train:
        cp = torch.load(checkpointLoadPath, map_location=device)
        trained_epochs = cp['epoch'] + 1
        model.load_state_dict(cp['model_state_dict'])
        optimizer.load_state_dict(cp['optimizer_state_dict'])
    else:
        trained_epochs = 0

    loss_all = 0.0
    data_all = 0.0
    sparse_all = 0.0
    if use_temp:
        T = 1.0
        if trained_epochs > 0:
            T = np.power(2, np.floor(trained_epochs / int(num_epochs / 3)))

    for epoch in range(trained_epochs, num_epochs):
        for data in train_loader:
            data = data.to(device)
            optimizer.zero_grad()
            output = model(data)
            if use_temp:
                # Use temperature
                nom = torch.pow((torch.exp(torch.tensor(T, device=device))), output)
                dom = torch.pow((torch.exp(torch.tensor(T, device=device))), output) + \
                      torch.pow((torch.exp(torch.tensor(T, device=device))), (1 - output))
                output = nom / dom
            # continue as usual
            sparse_term, data_term, data_grad = crit(output, orig_net, orig_net_loss,
                                                     network_name, network_train_data,
                                                     device, gamma1=10, gamma2=0.1)
            if use_steps:
                if epoch % 3 == 0:
                    # do 2 steps in data direction then 1 in sparsity
                    sparse_term.backward()
                else:
                    output.backward(data_grad)
            else:
                sparse_term.backward(retain_graph=True)
                output.backward(data_grad)

            data_all += data.num_graphs * data_term.item()
            sparse_all += data.num_graphs * sparse_term.item()
            loss_all += data_all + sparse_all
            optimizer.step()

        print("epoch {}. total loss is: {}".format(
            epoch + 1, (data_term.item() + sparse_term.item()) / len(train_dataset)))

        if opt != "Adam":
            scheduler.step()

        if use_temp:
            # increase temperature 3 times
            if (epoch + 1) % int(num_epochs / 3) == 0:
                T *= 2

        if epoch % 10 == 9:
            writer.add_scalars('Learning curve', {
                'loss data term': data_all / 10,
                'loss sparsity term': sparse_all / 10,
                'training loss': loss_all / 10
            }, epoch + 1)

            # save checkpoint
            if opt == "Adam":
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': loss_all,
                }, checkpointPath.format(epoch + 1))
            else:
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': loss_all,
                    'scheduler_state_dict': scheduler.state_dict(),
                }, checkpointPath.format(epoch + 1))

            loss_all = 0.0
            data_all = 0.0
            sparse_all = 0.0

    torch.save(model.state_dict(), trained_model_path)

    print("Start evaluating")
    model.load_state_dict(torch.load(trained_model_path, map_location=device))
    model.eval()

    network_val_data = datasets_test.get(dataset_name)
    val_data_loader = torch.utils.data.DataLoader(network_val_data, batch_size=1024,
                                                  shuffle=False, num_workers=8)

    for trial, p_factor in enumerate(trials):
        with torch.no_grad():
            for data in train_loader:
                data = data.to(device)
                pred = model(data)
                prunedNet = getPrunedNet(pred, orig_net, network_name,
                                         prune_factor=p_factor).to(device)

        # Train the pruned network
        prunedNet.train()
        data_train_loader = torch.utils.data.DataLoader(network_train_data, batch_size=256,
                                                        shuffle=False, num_workers=8)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(prunedNet.parameters(), lr=0.1, momentum=0.9,
                                    nesterov=True, weight_decay=5e-4)
        scheduler = MultiStepLR(optimizer,
                                milestones=[int(elem * n_retrain_epochs) for elem in [0.3, 0.6, 0.8]],
                                gamma=0.2)
        for epoch in range(n_retrain_epochs):
            for i, (images, labels) in enumerate(data_train_loader):
                images, labels = images.to(device), labels.to(device)
                optimizer.zero_grad()
                output = prunedNet(images)
                loss = criterion(output, labels)
                if i % 30 == 0:
                    print('Train - Epoch %d, Batch: %d, Loss: %f' %
                          (epoch + 1, i, loss.detach().cpu().item()))
                loss.backward()
                optimizer.step()
            scheduler.step()

        # Evaluate the pruned net
        with torch.no_grad():
            total_correct = 0
            cuda_time = 0.0
            cpu_time = 0.0
            for i, (images, labels) in enumerate(val_data_loader):
                images, labels = images.to(device), labels.to(device)
                with torch.autograd.profiler.profile(use_cuda=True) as prof:
                    output = prunedNet(images)
                cuda_time += sum([item.cuda_time for item in prof.function_events])
                cpu_time += sum([item.cpu_time for item in prof.function_events])
                pred = output.detach().max(1)[1]
                total_correct += pred.eq(labels.view_as(pred)).sum()
            p_acc = float(total_correct) / len(network_val_data)
            p_num_params = gnp(prunedNet)
            p_cuda_time = cuda_time / len(network_val_data)
            p_cpu_time = cpu_time / len(network_val_data)
            print("The pruned network for prune factor {} accuracy is: {}".format(p_factor, p_acc))
            print("The pruned network number of parameters is: {}".format(p_num_params))
            print("The pruned network cuda time is: {}".format(p_cuda_time))
            print("The pruned network cpu time is: {}".format(p_cpu_time))

        # Evaluate the original net
        with torch.no_grad():
            total_correct = 0
            cuda_time = 0.0
            cpu_time = 0.0
            for i, (images, labels) in enumerate(val_data_loader):
                images, labels = images.to(device), labels.to(device)
                with torch.autograd.profiler.profile(use_cuda=True) as prof:
                    output = orig_net(images)
                cuda_time += sum([item.cuda_time for item in prof.function_events])
                cpu_time += sum([item.cpu_time for item in prof.function_events])
                pred = output.detach().max(1)[1]
                total_correct += pred.eq(labels.view_as(pred)).sum()
            o_acc = float(total_correct) / len(network_val_data)
            o_num_params = gnp(orig_net)
            o_cuda_time = cuda_time / len(network_val_data)
            o_cpu_time = cpu_time / len(network_val_data)
            print("The original network accuracy is: {}".format(o_acc))
            print("The original network number of parameters is: {}".format(o_num_params))
            print("The original network cuda time is: {}".format(o_cuda_time))
            print("The original network cpu time is: {}".format(o_cpu_time))

        writer.add_scalars('Network accuracy',
                           {'original': o_acc, 'pruned': p_acc}, 100 * p_factor)
        writer.add_scalars('Network number of parameters',
                           {'original': o_num_params, 'pruned': p_num_params}, 100 * p_factor)
        writer.add_scalars('Network GPU time',
                           {'original': o_cuda_time, 'pruned': p_cuda_time}, 100 * p_factor)
        writer.add_scalars('Network CPU time',
                           {'original': o_cpu_time, 'pruned': p_cpu_time}, 100 * p_factor)

    writer.close()
class ImageNetAgent:

    def __init__(self, config, rank=-1):
        self.rank = rank
        self.config = config

        # Training environment
        if config['train']['mode'] == 'parallel':
            gpu_id = config['train']['gpus'][rank]
            self.device = "cuda:{}".format(gpu_id)
        else:
            self.device = config['train']['device'] if torch.cuda.is_available() else "cpu"

        # Dataset
        train_transform = T.Compose([
            T.RandomResizedCrop((config['dataset']['size'], config['dataset']['size'])),
            T.RandomHorizontalFlip(),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        valid_transform = T.Compose([
            T.Resize(256),
            T.CenterCrop((config['dataset']['size'], config['dataset']['size'])),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        train_dataset = ImageFolder(config['dataset']['train']['root'],
                                    transform=train_transform)
        valid_dataset = ImageFolder(config['dataset']['valid']['root'],
                                    transform=valid_transform)

        # Dataloader
        if config['train']['mode'] == 'parallel':
            self.sampler = DistributedSampler(train_dataset)
            self.train_loader = DataLoader(
                train_dataset,
                sampler=self.sampler,
                batch_size=config['dataloader']['batch_size'],
                num_workers=config['dataloader']['num_workers'],
                pin_memory=True,
                shuffle=False)
        else:
            self.train_loader = DataLoader(
                train_dataset,
                batch_size=config['dataloader']['batch_size'],
                num_workers=config['dataloader']['num_workers'],
                pin_memory=True,
                shuffle=True)
        self.valid_loader = DataLoader(
            valid_dataset,
            batch_size=config['dataloader']['batch_size'],
            num_workers=config['dataloader']['num_workers'],
            pin_memory=True,
            shuffle=False)

        # Model
        if config['model']['name'] == "resnet18":
            model_cls = resnet18
        else:
            model_cls = get_model_cls(config['model']['name'])
        model = model_cls(**config['model']['kwargs'])
        if config['train']['mode'] == 'parallel':
            model = model.to(self.device)
            self.model = DDP(model, device_ids=[config['train']['gpus'][rank]])
            # checkpoint = torch.load("run/darknet53_dist/best.pth")
            # self.model.load_state_dict(checkpoint['model'])
        else:
            self.model = model.to(self.device)

        # Optimizer
        self.optimizer = optim.SGD(
            self.model.parameters(),
            lr=config['optimizer']['lr'],
            momentum=config['optimizer']['momentum'],
            weight_decay=config['optimizer']['weight_decay'])

        # Scheduler
        self.scheduler = MultiStepLR(
            self.optimizer,
            milestones=config['scheduler']['milestones'],
            gamma=config['scheduler']['gamma'])

        # Loss function
        self.criterion = nn.CrossEntropyLoss().to(self.device)

        # Tensorboard
        self.log_dir = osp.join(config['train']['log_dir'],
                                config['train']['exp_name'])
        if ((self.rank == 0 and config['train']['mode'] == 'parallel')
                or self.rank < 0):
            self.writer = SummaryWriter(logdir=self.log_dir)

        # Dynamic state
        self.current_epoch = -1
        self.current_loss = 10000

    def resume(self):
        checkpoint_path = osp.join(self.log_dir, 'best.pth')
        if self.config['train']['mode'] == 'parallel':
            master_gpu_id = self.config['train']['gpus'][0]
            map_location = {'cuda:{}'.format(master_gpu_id): self.device}
            checkpoint = torch.load(checkpoint_path, map_location=map_location)
        else:
            checkpoint = torch.load(checkpoint_path)

        # Load pretrained model
        self.model.load_state_dict(checkpoint['model'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.scheduler.load_state_dict(checkpoint['scheduler'])

        # Resume to training state
        self.current_loss = checkpoint['current_loss']
        self.current_epoch = checkpoint['current_epoch']
        print("Resume Training at epoch {}".format(self.current_epoch))

    def train(self):
        for epoch in range(self.current_epoch + 1, self.config['train']['n_epochs']):
            self.current_epoch = epoch
            if self.config['train']['mode'] == 'parallel':
                self.sampler.set_epoch(self.current_epoch)
                self.train_one_epoch()
                self.validate()
                self.scheduler.step()
            else:
                self.train_one_epoch()
                self.validate()
                self.scheduler.step()

    def train_one_epoch(self):
        losses = []
        running_samples = 0
        running_corrects = 0
        self.model.train()
        loop = tqdm(
            self.train_loader,
            desc=(f"[{self.rank}] Train Epoch {self.current_epoch}/{self.config['train']['n_epochs']}"
                  f" - LR: {self.optimizer.param_groups[0]['lr']:.3f}"),
            leave=True)
        for batch_idx, (imgs, labels) in enumerate(loop):
            imgs = imgs.to(self.device, non_blocking=True)
            labels = labels.to(self.device, non_blocking=True)

            outputs = self.model(imgs)
            loss = self.criterion(outputs, labels)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            preds = torch.max(outputs.data, 1)[1]
            corrects = float(torch.sum(preds == labels.data))
            running_samples += imgs.size(0)
            running_corrects += corrects
            losses.append(loss.item())
            loop.set_postfix(loss=sum(losses) / len(losses))

        if self.rank <= 0:
            epoch_loss = sum(losses) / len(losses)
            epoch_acc = running_corrects / running_samples
            self.writer.add_scalar("Train Loss", epoch_loss, self.current_epoch)
            self.writer.add_scalar("Train Acc", epoch_acc, self.current_epoch)
            print("Epoch {}:{}, Train Loss: {:.2f}, Train Acc: {:.2f}".format(
                self.current_epoch, self.config['train']['n_epochs'],
                epoch_loss, epoch_acc))

    def validate(self):
        losses = []
        running_samples = 0
        running_corrects = 0
        self.model.eval()
        loop = tqdm(
            self.valid_loader,
            desc=(f"Valid Epoch {self.current_epoch}/{self.config['train']['n_epochs']}"
                  f" - LR: {self.optimizer.param_groups[0]['lr']:.3f}"),
            leave=True)
        with torch.no_grad():
            for batch_idx, (imgs, labels) in enumerate(loop):
                imgs = imgs.to(self.device, non_blocking=True)
                labels = labels.to(self.device, non_blocking=True)

                outputs = self.model(imgs)
                loss = self.criterion(outputs, labels)

                preds = torch.max(outputs.data, 1)[1]
                corrects = float(torch.sum(preds == labels.data))
                running_samples += imgs.size(0)
                running_corrects += corrects
                losses.append(loss.item())
                loop.set_postfix(loss=sum(losses) / len(losses))

        if self.rank <= 0:
            epoch_loss = sum(losses) / len(losses)
            epoch_acc = running_corrects / running_samples
            print("Epoch {}:{}, Valid Loss: {:.2f}, Valid Acc: {:.2f}".format(
                self.current_epoch, self.config['train']['n_epochs'],
                epoch_loss, epoch_acc))
            self.writer.add_scalar("Valid Loss", epoch_loss, self.current_epoch)
            self.writer.add_scalar("Valid Acc", epoch_acc, self.current_epoch)
            if epoch_loss < self.current_loss:
                self.current_loss = epoch_loss
                self._save_checkpoint()

    def finalize(self):
        pass

    def _save_checkpoint(self):
        checkpoints = {
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'scheduler': self.scheduler.state_dict(),
            'current_epoch': self.current_epoch,
            'current_loss': self.current_loss
        }
        checkpoint_path = osp.join(self.log_dir, 'best.pth')
        torch.save(checkpoints, checkpoint_path)
        print("Save checkpoint to '{}'".format(checkpoint_path))
def main():
    # Views the training images and displays the distance on anchor-negative and anchor-positive
    # print the experiment configuration
    print('\nCurrent time is \33[91m{}\33[0m'.format(str(time.asctime())))
    print('Parsed options: {}'.format(vars(args)))
    print('Number of Classes: {}\n'.format(len(train_dir.speakers)))

    # instantiate model and initialize weights
    model = ExporingResNet(layers=[3, 4, 6, 3], num_classes=1211)

    assert os.path.isfile(args.resume)
    print('=> loading checkpoint {}'.format(args.resume))
    checkpoint = torch.load(args.resume)
    filtered = {
        k: v
        for k, v in checkpoint['state_dict'].items()
        if 'num_batches_tracked' not in k
    }
    model.load_state_dict(filtered)
    model.fc2 = nn.Linear(args.embedding_size, len(train_dir.speakers))

    # criterion = AngularSoftmax(in_feats=args.embedding_size,
    #                            num_classes=len(train_dir.classes))
    if args.cuda:
        model.cuda()

    fc2_params = list(map(id, model.fc2.parameters()))
    base_params = filter(lambda p: id(p) not in fc2_params, model.parameters())
    optimizer = torch.optim.SGD(
        [{'params': base_params},
         {'params': model.fc2.parameters(), 'lr': args.lr * 10}],
        lr=args.lr,
        momentum=args.momentum,
    )
    # optimizer2 = create_optimizer(model.fc2.parameters(), args.optimizer, **opt_kwargs)
    scheduler = MultiStepLR(optimizer, milestones=[8], gamma=0.1)

    start = 0
    if args.save_init:
        check_path = '{}/checkpoint_{}.pth'.format(args.check_path, start)
        torch.save(
            {
                'epoch': start,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()
            }, check_path)

    # optionally resume from a checkpoint
    start += args.start_epoch
    print('Start epoch is : ' + str(start))
    end = start + args.epochs

    # pdb.set_trace()
    train_loader = torch.utils.data.DataLoader(train_dir,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               **kwargs)
    valid_loader = torch.utils.data.DataLoader(valid_dir,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               **kwargs)
    test_loader = torch.utils.data.DataLoader(test_part,
                                              batch_size=args.test_batch_size,
                                              shuffle=False,
                                              **kwargs)
    criterion = nn.CrossEntropyLoss().cuda()

    for epoch in range(start, end):
        # pdb.set_trace()
        for param_group in optimizer.param_groups:
            print('\n\33[1;34m Current \'{}\' learning rate is {}.\33[0m'.format(
                args.optimizer, param_group['lr']))
        train(train_loader, model, optimizer, criterion, scheduler, epoch)
        test(test_loader, valid_loader, model, epoch)
        scheduler.step()
        # break

    writer.close()
end = time.time()
log_2 = 'Using {} sec for epoch {}'.format(end - start, i + 1)
cu.write_log(log_dir, log_2)

## validation
vali_loss, recall, precision, accuracy = validation(model, validation_dataloader,
                                                    loss, gpu, optimizer,
                                                    validation_dataset)
log_3 = '----------------Recall for epoch {} is : {}'.format(i + 1, recall)
log_4 = '----------------Precision for epoch {} is : {}'.format(i + 1, precision)
log_5 = '----------------Accuracy for epoch {} is {}'.format(i + 1, accuracy)
log_6 = '----------------Validation loss for epoch {} is {}'.format(i + 1, vali_loss)
cu.write_log(log_dir, log_3)
cu.write_log(log_dir, log_4)
cu.write_log(log_dir, log_5)
cu.write_log(log_dir, log_6)

if vali_loss < vali_loss_p:
    vali_loss_p = vali_loss  # track the new best validation loss
    checkpoint = {
        'model_state': model.state_dict(),
        'criterion_state': loss.state_dict(),
        'optimizer_state': optimizer.state_dict(),
        'scheduler_state': scheduler.state_dict(),
        'epochs': i + 1
    }
    torch.save(checkpoint, checkpoint_dir + 'model' + '.pth')
class Trainner(object):

    def __init__(self, opt, saver, summary):
        self.opt = opt
        self.saver = saver
        self.summary = summary
        self.global_steps = 0
        self.setup_models()
        print('initialize trainner')

    def setup_models(self):
        self.criteriasMSE = nn.MSELoss()
        self.model = EDSeg(self.opt).cuda(self.opt.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.opt.lr)
        self.schedule = MultiStepLR(self.optimizer, milestones=[1600], gamma=self.opt.gamma)
        if self.opt.device_count > 1:
            self.model = nn.DataParallel(self.model, device_ids=self.opt.devices)

    def train_iter(self, images, labels):
        self.model.train()
        output = self.model(images)
        loss = self.criteriasMSE(output, images)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.schedule.step()
        if self.global_steps % self.opt.v_freq == 0:
            lr = self.optimizer.param_groups[0]['lr']
            self.summary.add_scalar('loss', loss.item(), self.global_steps)
            self.summary.add_scalar('lr', lr, self.global_steps)
            self.summary.train_image(images, output, labels, self.global_steps)

    def train_epoch(self, dataloader):
        iterator = tqdm(dataloader, leave=True, dynamic_ncols=True)
        self.dataset_len = len(dataloader)
        for i, data in enumerate(iterator):
            iterator.set_description(
                f'Epoch[{self.epoch}/{self.opt.epochs}|{self.global_steps}]')
            self.global_steps = self.epoch * self.dataset_len + i
            if isinstance(data, tuple):
                images = data[0]
                labels = data[1]
            else:
                labels = None
                images = data
            images = images.to(self.opt.device)
            # `labels` may be None; a tensor's truth value is ambiguous, so test identity
            labels = labels.to(self.opt.device) if labels is not None else None
            self.train_iter(images, labels)

    def train(self, train_dataloader, valid_dataloader):
        self.epoch = 0
        while self.epoch < self.opt.epochs:
            self.train_epoch(train_dataloader)
            self.validate(valid_dataloader)
            self.epoch += 1

    def validate(self, dataloader):
        self.model.eval()
        errs = []
        output = None
        for data in dataloader:
            if isinstance(data, tuple):
                images = data[0]
                labels = data[1]
            else:
                labels = None
                images = data
            images = images.to(self.opt.device)
            labels = labels.to(self.opt.device) if labels is not None else None
            output = self.model(images)
            err = self.criteriasMSE(output, images)
            errs.append(err.item())
        mean_err = torch.tensor(errs).mean()
        self.summary.add_scalar('validate_error', mean_err, self.global_steps)
        self.summary.val_image(images, output, labels, self.global_steps)
        self.save_checkpoint(mean_err)

    def save_checkpoint(self, err):
        state = {
            'epoch': self.epoch,
            'err': err,
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'schedule': self.schedule.state_dict()
        }
        self.saver.save_checkpoint(state)

    def load_checkpoint(self, state):
        self.model.load_state_dict(state['model'])
        self.optimizer.load_state_dict(state['optimizer'])
        self.schedule.load_state_dict(state['schedule'])
        self.epoch = state['epoch']

    def resume(self):
        if self.opt.resume:
            if self.opt.resume_best:
                state = self.saver.load_best()
            elif self.opt.resume_latest:
                state = self.saver.load_latest()
            elif self.opt.resume_epoch is not None:
                state = self.saver.load_epoch(self.opt.resume_epoch)
            else:
                raise RuntimeError('resume settings error, please check your config file.')
            self.load_checkpoint(state)
        else:
            print('resume not enabled, pass')
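# --- Hedged side note on the Trainner above: it calls schedule.step() once per
# --- training iteration, so milestones=[1600] is counted in iterations, whereas
# --- most of the other snippets here step once per epoch, making milestones
# --- epoch indices. A minimal sketch (milestone value is an illustrative assumption):
import torch
from torch import nn, optim
from torch.optim.lr_scheduler import MultiStepLR

_m = nn.Linear(4, 2)
_o = optim.SGD(_m.parameters(), lr=0.1)
# Milestones are in units of "number of times step() has been called".
_s = MultiStepLR(_o, milestones=[3], gamma=0.1)
for it in range(5):              # pretend these are training iterations
    _o.step()
    _s.step()
    print(it, _s.get_last_lr())  # LR drops to 0.01 after the 3rd step()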
class Processor(): """Processor for Skeleton-based Action Recgnition""" def __init__(self, arg): self.arg = arg self.save_arg() self._init_dist_pytorch(backend='nccl', world_size=torch.cuda.device_count()) self.load_model() self.load_param_groups() self.load_optimizer() self.load_lr_scheduler() self.load_data() self.global_step = 0 self.lr = self.arg.base_lr self.best_acc = 0 self.best_acc_epoch = 0 if self.arg.half: self.print_log('*************************************') self.print_log('*** Using Half Precision Training ***') self.print_log('*************************************') self.model, self.optimizer = apex.amp.initialize( self.model, self.optimizer, opt_level=f'O{self.arg.amp_opt_level}') if self.arg.amp_opt_level != 1: self.print_log( '[WARN] nn.DataParallel is not yet supported by amp_opt_level != "O1"' ) self.runner() def _init_dist_pytorch(self, backend, **kwargs): rank = int(os.environ['RANK']) num_gpus = torch.cuda.device_count() torch.cuda.set_device(rank % num_gpus) dist.init_process_group(backend=backend, **kwargs) def load_model(self): Model = import_class(self.arg.model) # Copy model file and main shutil.copy2(inspect.getfile(Model), self.arg.work_dir) shutil.copy2(os.path.join('.', __file__), self.arg.work_dir) self._model = Model(**self.arg.model_args).cuda() self.loss = nn.CrossEntropyLoss().cuda() self.print_log( f'Model total number of params: {count_params(self._model)}') if self.arg.weights: try: self.global_step = int(arg.weights[:-3].split('-')[-1]) except: print('Cannot parse global_step from model weights filename') self.global_step = 0 self.print_log(f'Loading weights from {self.arg.weights}') if '.pkl' in self.arg.weights: with open(self.arg.weights, 'r') as f: weights = pickle.load(f) elif '.pth' in self.arg.weights: weights = torch.load(self.arg.weights)["state_dict"] weights = OrderedDict([[k.split('network.')[-1], v.cuda()] for k, v in weights.items()]) else: weights = torch.load(self.arg.weights) weights = OrderedDict([[k.split('module.')[-1], v.cuda()] for k, v in weights.items()]) for w in self.arg.ignore_weights: if weights.pop(w, None) is not None: self.print_log(f'Sucessfully Remove Weights: {w}') else: self.print_log(f'Can Not Remove Weights: {w}') if '.pth' in self.arg.weights: try: self._model.load_state_dict(weights) except: state = self._model.state_dict() diff = list( set(state.keys()).difference(set(weights.keys()))) self.print_log('Can not find these weights:') for d in diff: self.print_log(' ' + d) state.update(weights) self._model.load_state_dict(state) elif self.arg.weights.endswith(".pt") or self.arg.weights.endswith( ".pkl"): model_params = self._model.state_dict() weights['data_bn.weight'] = model_params['data_bn.weight'] weights['data_bn.bias'] = model_params['data_bn.bias'] weights['data_bn.running_mean'] = model_params[ 'data_bn.running_mean'] weights['data_bn.running_var'] = model_params[ 'data_bn.running_var'] weights['fc.weight'] = model_params['fc.weight'] weights['fc.bias'] = model_params['fc.bias'] weights['gcn3d1.gcn3d.0.gcn3d.1.A_res'] = model_params[ 'gcn3d1.gcn3d.0.gcn3d.1.A_res'] weights['gcn3d1.gcn3d.1.gcn3d.1.A_res'] = model_params[ 'gcn3d1.gcn3d.1.gcn3d.1.A_res'] weights['sgcn1.0.A_res'] = model_params['sgcn1.0.A_res'] weights['gcn3d2.gcn3d.0.gcn3d.1.A_res'] = model_params[ 'gcn3d2.gcn3d.0.gcn3d.1.A_res'] weights['gcn3d2.gcn3d.1.gcn3d.1.A_res'] = model_params[ 'gcn3d2.gcn3d.1.gcn3d.1.A_res'] weights['sgcn2.0.A_res'] = model_params['sgcn2.0.A_res'] weights['gcn3d3.gcn3d.0.gcn3d.1.A_res'] = model_params[ 
'gcn3d3.gcn3d.0.gcn3d.1.A_res'] weights['gcn3d3.gcn3d.1.gcn3d.1.A_res'] = model_params[ 'gcn3d3.gcn3d.1.gcn3d.1.A_res'] weights['sgcn3.0.A_res'] = model_params['sgcn3.0.A_res'] weights[ 'gcn3d1.gcn3d.0.gcn3d.1.mlp.layers.0.weight'] = model_params[ 'gcn3d1.gcn3d.0.gcn3d.1.mlp.layers.0.weight'] weights[ 'gcn3d1.gcn3d.1.gcn3d.1.mlp.layers.0.weight'] = model_params[ 'gcn3d1.gcn3d.1.gcn3d.1.mlp.layers.0.weight'] weights['sgcn1.0.mlp.layers.0.weight'] = model_params[ 'sgcn1.0.mlp.layers.0.weight'] weights[ 'gcn3d1.gcn3d.0.gcn3d.1.mlp.layers.0.bias'] = model_params[ 'gcn3d1.gcn3d.0.gcn3d.1.mlp.layers.0.bias'] weights[ 'gcn3d1.gcn3d.0.gcn3d.1.mlp.layers.1.weight'] = model_params[ 'gcn3d1.gcn3d.0.gcn3d.1.mlp.layers.1.weight'] weights[ 'gcn3d1.gcn3d.0.gcn3d.1.mlp.layers.1.bias'] = model_params[ 'gcn3d1.gcn3d.0.gcn3d.1.mlp.layers.1.bias'] weights[ 'gcn3d1.gcn3d.0.gcn3d.1.mlp.layers.1.running_mean'] = model_params[ 'gcn3d1.gcn3d.0.gcn3d.1.mlp.layers.1.running_mean'] weights[ 'gcn3d1.gcn3d.0.gcn3d.1.mlp.layers.1.running_var'] = model_params[ 'gcn3d1.gcn3d.0.gcn3d.1.mlp.layers.1.running_var'] weights['gcn3d1.gcn3d.0.out_conv.weight'] = model_params[ 'gcn3d1.gcn3d.0.out_conv.weight'] weights[ 'gcn3d1.gcn3d.1.gcn3d.1.mlp.layers.0.bias'] = model_params[ 'gcn3d1.gcn3d.1.gcn3d.1.mlp.layers.0.bias'] weights[ 'gcn3d1.gcn3d.1.gcn3d.1.mlp.layers.1.weight'] = model_params[ 'gcn3d1.gcn3d.1.gcn3d.1.mlp.layers.1.weight'] weights[ 'gcn3d1.gcn3d.1.gcn3d.1.mlp.layers.1.bias'] = model_params[ 'gcn3d1.gcn3d.1.gcn3d.1.mlp.layers.1.bias'] weights[ 'gcn3d1.gcn3d.1.gcn3d.1.mlp.layers.1.running_mean'] = model_params[ 'gcn3d1.gcn3d.1.gcn3d.1.mlp.layers.1.running_mean'] weights[ 'gcn3d1.gcn3d.1.gcn3d.1.mlp.layers.1.running_var'] = model_params[ 'gcn3d1.gcn3d.1.gcn3d.1.mlp.layers.1.running_var'] weights['gcn3d1.gcn3d.1.out_conv.weight'] = model_params[ 'gcn3d1.gcn3d.1.out_conv.weight'] model_params.update(weights) self._model.load_state_dict(model_params, strict=False) else: raise "Support *.pth or *.pkl or *.pt pretrain" print(self.arg.center_loss) self._model_full = msg3d_with_loss(self._model, self.loss, self.arg.center_loss) rank = int(os.environ['RANK']) # self._model.to(rank) self._model_full.to(rank) self.model = MMDistributedDataParallel(self._model_full.cuda()) def load_param_groups(self): """ Template function for setting different learning behaviour (e.g. 
LR, weight decay) of different groups of parameters """ self.param_groups = defaultdict(list) for name, params in self.model.named_parameters(): self.param_groups['other'].append(params) self.optim_param_groups = { 'other': { 'params': self.param_groups['other'] } } def load_optimizer(self): params = list(self.optim_param_groups.values()) if self.arg.optimizer == 'SGD': self.optimizer = optim.SGD(params, lr=self.arg.base_lr, momentum=0.9, nesterov=self.arg.nesterov, weight_decay=self.arg.weight_decay) elif self.arg.optimizer == 'Adam': self.optimizer = optim.Adam(params, lr=self.arg.base_lr, weight_decay=self.arg.weight_decay) else: raise ValueError('Unsupported optimizer: {}'.format( self.arg.optimizer)) # Load optimizer states if any if self.arg.checkpoint is not None: self.print_log( f'Loading optimizer states from: {self.arg.checkpoint}') self.optimizer.load_state_dict( torch.load(self.arg.checkpoint)['optimizer_states']) current_lr = self.optimizer.param_groups[0]['lr'] self.print_log(f'Starting LR: {current_lr}') self.print_log( f'Starting WD1: {self.optimizer.param_groups[0]["weight_decay"]}' ) if len(self.optimizer.param_groups) >= 2: self.print_log( f'Starting WD2: {self.optimizer.param_groups[1]["weight_decay"]}' ) def load_lr_scheduler(self): self.lr_scheduler = MultiStepLR(self.optimizer, milestones=self.arg.step, gamma=0.1) if self.arg.checkpoint is not None: scheduler_states = torch.load( self.arg.checkpoint)['lr_scheduler_states'] self.print_log( f'Loading LR scheduler states from: {self.arg.checkpoint}') self.lr_scheduler.load_state_dict(scheduler_states) self.print_log( f'Starting last epoch: {scheduler_states["last_epoch"]}') self.print_log( f'Loaded milestones: {scheduler_states["last_epoch"]}') def load_data(self): Feeder = import_class(self.arg.feeder) self.data_loader = dict() def worker_seed_fn(worker_id): # give workers different seeds return init_seed(self.arg.seed + worker_id + 1) rank = int(os.environ['RANK']) world_size = torch.cuda.device_count() if self.arg.phase == 'train': dataset_train = Feeder(**self.arg.train_feeder_args) sampler_train = DistributedSampler(dataset_train, world_size, rank, shuffle=True) self.data_loader['train'] = torch.utils.data.DataLoader( dataset=dataset_train, batch_size=self.arg.batch_size // world_size, sampler=sampler_train, shuffle=False, num_workers=self.arg.num_worker // world_size, drop_last=True, worker_init_fn=worker_seed_fn) dataset_test = Feeder(**self.arg.test_feeder_args) # sampler_test = DistributedSampler(dataset_test, world_size, rank, shuffle=False) self.data_loader['test'] = torch.utils.data.DataLoader( dataset=dataset_test, batch_size=self.arg.test_batch_size // world_size, # sampler=sampler_test, shuffle=False, num_workers=self.arg.num_worker // world_size, drop_last=False, worker_init_fn=worker_seed_fn) def save_arg(self): # save arg arg_dict = vars(self.arg) if not os.path.exists(self.arg.work_dir): os.makedirs(self.arg.work_dir, exist_ok=True) with open(os.path.join(self.arg.work_dir, 'config.yaml'), 'w') as f: yaml.dump(arg_dict, f) def print_time(self): localtime = time.asctime(time.localtime(time.time())) self.print_log(f'Local current time: {localtime}') def print_log(self, s, print_time=True): if print_time: localtime = time.asctime(time.localtime(time.time())) s = f'[ {localtime} ] {s}' print(s) if self.arg.print_log: with open(os.path.join(self.arg.work_dir, 'log.txt'), 'a') as f: print(s, file=f) def record_time(self): self.cur_time = time.time() return self.cur_time def split_time(self): split_time = 
time.time() - self.cur_time self.record_time() return split_time def save_states(self, epoch, states, out_folder, out_name): out_folder_path = os.path.join(self.arg.work_dir, out_folder) out_path = os.path.join(out_folder_path, out_name) os.makedirs(out_folder_path, exist_ok=True) torch.save(states, out_path) def save_checkpoint(self, epoch, out_folder='checkpoints'): state_dict = { 'epoch': epoch, 'optimizer_states': self.optimizer.state_dict(), 'lr_scheduler_states': self.lr_scheduler.state_dict(), } checkpoint_name = f'checkpoint-{epoch}-fwbz{self.arg.forward_batch_size}-{int(self.global_step)}.pt' self.save_states(epoch, state_dict, out_folder, checkpoint_name) def save_weights(self, epoch, out_folder='weights'): state_dict = self.model.state_dict() weights = OrderedDict([[k.split('module.')[-1], v.cpu()] for k, v in state_dict.items()]) weights_name = f'weights-{epoch}-{int(self.global_step)}.pt' self.save_states(epoch, weights, out_folder, weights_name) def runner(self): def parse_losses(losses): log_vars = OrderedDict() for loss_name, loss_value in losses.items(): if isinstance(loss_value, torch.Tensor): log_vars[loss_name] = loss_value.mean() elif isinstance(loss_value, list): log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) else: raise TypeError( '{} is not a tensor or list of tensors'.format( loss_name)) loss = sum(_value for _key, _value in log_vars.items() if 'loss' in _key) log_vars['loss'] = loss for name in log_vars: log_vars[name] = log_vars[name].item() return loss, log_vars def batch_processor(model, data, train_mode): losses = model(**data) # losses = model(data) loss, log_vars = parse_losses(losses) outputs = dict(loss=loss, log_vars=log_vars, num_samples=len(data['batchdata'].data)) return outputs self.runner = Runner(self.model, batch_processor, self.optimizer, self.arg.work_dir) optimizer_config = DistOptimizerHook( grad_clip=dict(max_norm=20, norm_type=2)) if not "policy" in self.arg.policy: lr_config = dict(policy='step', step=self.arg.step) else: lr_config = dict(**self.arg.policy) checkpoint_config = dict(interval=5) log_config = dict(interval=20, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) self.runner.register_training_hooks(lr_config, optimizer_config, checkpoint_config, log_config) self.runner.register_hook(DistSamplerSeedHook()) Feeder = import_class(self.arg.feeder) self.runner.register_hook( DistEvalTopKAccuracyHook(Feeder(**self.arg.test_feeder_args), interval=self.arg.test_interval, k=(1, 5))) def eval(self, epoch, save_score=False, loader_name=['test'], wrong_file=None, result_file=None): # Skip evaluation if too early if epoch + 1 < self.arg.eval_start: return if wrong_file is not None: f_w = open(wrong_file, 'w') if result_file is not None: f_r = open(result_file, 'w') with torch.no_grad(): self.model = self.model.cuda() self.model.eval() self.print_log(f'Eval epoch: {epoch + 1}') for ln in loader_name: loss_values = [] score_batches = [] step = 0 process = tqdm(self.data_loader[ln], dynamic_ncols=True) for batch_idx, (data, label, index) in enumerate(process): data = data.float().cuda() label = label.long().cuda() output = self.model(data) if isinstance(output, tuple): output, l1 = output l1 = l1.mean() else: l1 = 0 loss = self.loss(output, label) score_batches.append(output.data.cpu().numpy()) loss_values.append(loss.item()) _, predict_label = torch.max(output.data, 1) step += 1 if wrong_file is not None or result_file is not None: predict = list(predict_label.cpu().numpy()) true = 
list(label.data.cpu().numpy()) for i, x in enumerate(predict): if result_file is not None: f_r.write(str(x) + ',' + str(true[i]) + '\n') if x != true[i] and wrong_file is not None: f_w.write( str(index[i]) + ',' + str(x) + ',' + str(true[i]) + '\n') score = np.concatenate(score_batches) loss = np.mean(loss_values) accuracy = self.data_loader[ln].dataset.top_k(score, 1) if accuracy > self.best_acc: self.best_acc = accuracy self.best_acc_epoch = epoch + 1 print('Accuracy: ', accuracy, ' model: ', self.arg.work_dir) if self.arg.phase == 'train' and not self.arg.debug: self.val_writer.add_scalar('loss', loss, self.global_step) self.val_writer.add_scalar('loss_l1', l1, self.global_step) self.val_writer.add_scalar('acc', accuracy, self.global_step) score_dict = dict( zip(self.data_loader[ln].dataset.sample_name, score)) self.print_log( f'\tMean {ln} loss of {len(self.data_loader[ln])} batches: {np.mean(loss_values)}.' ) for k in self.arg.show_topk: self.print_log( f'\tTop {k}: {100 * self.data_loader[ln].dataset.top_k(score, k):.2f}%' ) if save_score: with open( '{}/epoch{}_{}_score.pkl'.format( self.arg.work_dir, epoch + 1, ln), 'wb') as f: pickle.dump(score_dict, f) # Empty cache after evaluation torch.cuda.empty_cache() def start(self): if self.arg.phase == 'train': self.runner.run([self.data_loader['train']], workflow=[('train', 1)], max_epochs=self.arg.num_epoch) elif self.arg.phase == 'test': wf = rf = None if self.arg.weights is None: raise ValueError('Please specify --weights.') self.print_log(f'Model: {self.arg.model}') self.print_log(f'Weights: {self.arg.weights}') self.eval(epoch=0, save_score=self.arg.save_score, loader_name=['test'], wrong_file=wf, result_file=rf) self.print_log('Done.\n')
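# A minimal sketch of the save/load contract used by save_checkpoint() and the
# resume paths above: whatever keys the writer emits ('optimizer_states',
# 'lr_scheduler_states', 'epoch'), the loader must read back under the same
# names, or resume silently starts fresh. Names here are illustrative.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR

model = nn.Linear(10, 2)
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
scheduler = MultiStepLR(optimizer, milestones=[30, 60], gamma=0.1)

state = {
    'epoch': 5,
    'optimizer_states': optimizer.state_dict(),
    'lr_scheduler_states': scheduler.state_dict(),
}
torch.save(state, 'checkpoint.pt')

ckpt = torch.load('checkpoint.pt')
optimizer.load_state_dict(ckpt['optimizer_states'])
scheduler.load_state_dict(ckpt['lr_scheduler_states'])
print(scheduler.state_dict()['last_epoch'])  # 0: no scheduler steps taken yet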
def main_worker(train_loader, val_loader, num_classes, args, cifar=False): global best_acc1 scale_lr_and_momentum(args, cifar=cifar) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') norm_kwargs = {'mode': args.norm_mode, 'alpha_fwd': args.afwd, 'alpha_bkw': args.abkw, 'ecm': args.ecm, 'gn_num_groups': args.gn_num_groups} model_kwargs = {'num_classes': num_classes, 'norm_layer': norm_layer, 'norm_kwargs': norm_kwargs, 'cifar': cifar, 'kernel_size': 3 if cifar else 7, 'stride': 1 if cifar else 2, 'padding': 1 if cifar else 3, 'inplanes': 16 if cifar else 64} if cifar: model_kwargs['depth'] = args.depth args.arch = 'resnetD' # create model if args.pretrained: print("=> using pre-trained model '{}'".format(args.arch)) model = models.__dict__[args.arch](pretrained=True, **model_kwargs).to(device) else: print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch](**model_kwargs).to(device) print(model) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().to(device) optimizer = torch.optim.SGD(get_parameter_groups(model, cifar=cifar), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) scheduler = MultiStepLR(optimizer, milestones=args.lr_milestones, gamma=args.lr_multiplier) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] best_acc1 = checkpoint['best_acc1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) scheduler.load_state_dict(checkpoint['scheduler']) print("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) cudnn.benchmark = False if args.seed else True if args.evaluate: validate(val_loader, model, criterion, device, args) return for epoch in range(args.start_epoch, args.epochs): if epoch: scheduler.step() # train for one epoch train(train_loader, model, criterion, optimizer, epoch, device, args) # evaluate on validation set acc1 = validate(val_loader, model, criterion, device, args) # remember best acc@1 and save checkpoint is_best = acc1 > best_acc1 best_acc1 = max(acc1, best_acc1) save_checkpoint({ 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_acc1': best_acc1, 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), }, is_best, args)
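# The `if epoch: scheduler.step()` idiom above reproduces the pre-1.1 PyTorch
# schedule by stepping at the start of every epoch except the first; in current
# PyTorch the equivalent (and recommended) form is to step once at the end of
# each epoch, after optimizer.step(). A small self-contained check of when
# MultiStepLR actually drops the learning rate:
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR

model = nn.Linear(4, 4)
optimizer = optim.SGD(model.parameters(), lr=0.1)
scheduler = MultiStepLR(optimizer, milestones=[2, 4], gamma=0.1)
for epoch in range(6):
    optimizer.step()  # training for one epoch would happen here
    print(epoch, optimizer.param_groups[0]['lr'])
    # prints 0.1, 0.1, 0.01, 0.01, 0.001, 0.001: drops take effect at epochs 2 and 4
    scheduler.step()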
def train(train_loop_func, logger, args): # Check that GPUs are actually available use_cuda = not args.no_cuda # Setup multi-GPU if necessary args.distributed = False if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') args.N_gpu = torch.distributed.get_world_size() else: args.N_gpu = 1 if args.seed is None: args.seed = np.random.randint(1e4) if args.distributed: args.seed = (args.seed + torch.distributed.get_rank()) % 2**32 print("Using seed = {}".format(args.seed)) torch.manual_seed(args.seed) np.random.seed(seed=args.seed) torch.multiprocessing.set_sharing_strategy('file_system') # Setup data, defaults dboxes = dboxes300_coco() encoder = Encoder(dboxes) cocoGt = get_coco_ground_truth(args) #82783 # train_loader = get_train_loader(args, args.seed - 2**31, 118287) # target_loader = get_target_loader(args, args.seed - 2**31, 118287) train_loader = get_train_loader(args, args.seed - 2**31, 5000) target_loader = get_target_loader(args, args.seed - 2**31, 5000) val_dataset = get_val_dataset(args) val_dataloader = get_val_dataloader(val_dataset, args) ssd300 = DASSD300(backbone=ResNet(args.backbone, args.backbone_path)) # earlier variant: args.learning_rate = args.learning_rate * args.N_gpu * ((args.batch_size + args.batch_size // 2) / 32) args.learning_rate = args.learning_rate * args.N_gpu * ( (args.batch_size + args.batch_size) / 32) start_epoch = 0 iteration = 0 loss_func = DALoss(dboxes) da_loss_func = ImageLevelAdaptationLoss() if use_cuda: ssd300.cuda() loss_func.cuda() da_loss_func.cuda() optimizer = torch.optim.SGD(tencent_trick(ssd300), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep, gamma=0.1) if args.amp: ssd300, optimizer = amp.initialize(ssd300, optimizer, opt_level='O2') if args.distributed: ssd300 = DDP(ssd300) if args.checkpoint is not None: if os.path.isfile(args.checkpoint): load_checkpoint(ssd300.module if args.distributed else ssd300, args.checkpoint) checkpoint = torch.load(args.checkpoint, map_location=lambda storage, loc: storage.
cuda(torch.cuda.current_device())) start_epoch = checkpoint['epoch'] iteration = checkpoint['iteration'] scheduler.load_state_dict(checkpoint['scheduler']) optimizer.load_state_dict(checkpoint['optimizer']) else: print('Provided checkpoint is not a path to a file') return inv_map = {v: k for k, v in val_dataset.label_map.items()} total_time = 0 if args.mode == 'evaluation': acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args) if args.local_rank == 0: print('Model precision {} mAP'.format(acc)) return mean, std = generate_mean_std(args) meters = { 'total': AverageValueMeter(), 'ssd': AverageValueMeter(), 'da': AverageValueMeter() } vis = Visualizer(env='da ssd', port=6006) for epoch in range(start_epoch, args.epochs): start_epoch_time = time.time() scheduler.step() iteration = train_loop_func(ssd300, loss_func, da_loss_func, epoch, optimizer, train_loader, target_loader, encoder, iteration, logger, args, mean, std, meters, vis) end_epoch_time = time.time() - start_epoch_time total_time += end_epoch_time if args.local_rank == 0: logger.update_epoch_time(epoch, end_epoch_time) if epoch in args.evaluation: acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args) if args.local_rank == 0: logger.update_epoch(epoch, acc) vis.log(acc, win='Evaluation') if args.save and args.local_rank == 0: print("saving model...") obj = { 'epoch': epoch + 1, 'iteration': iteration, 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), 'label_map': val_dataset.label_info } if args.distributed: obj['model'] = ssd300.module.state_dict() else: obj['model'] = ssd300.state_dict() torch.save(obj, './models/epoch_{}.pt'.format(epoch)) train_loader.reset() target_loader.reset() print('total training time: {}'.format(total_time))
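# The SSD scripts in this collection scale the base learning rate linearly with
# the global batch size (GPU count times per-GPU batch, referenced to a batch
# of 32), following the linear scaling rule of Goyal et al.; the domain
# adaptation variant above counts the source and target batches together. The
# plain rule in isolation (a sketch, names are illustrative):
def scaled_lr(base_lr: float, n_gpu: int, batch_size: int, ref_batch: int = 32) -> float:
    """Grow the learning rate in proportion to the effective batch size."""
    return base_lr * n_gpu * batch_size / ref_batch

# e.g. 4 GPUs x per-GPU batch 32 -> 4x the base learning rate
assert scaled_lr(2.6e-3, n_gpu=4, batch_size=32) == 4 * 2.6e-3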
def main(opt): if torch.cuda.is_available(): torch.cuda.manual_seed(123) else: torch.manual_seed(123) train_params = { "batch_size": opt.batch_size, "shuffle": True, "drop_last": False, "num_workers": opt.num_workers, "collate_fn": collate_fn } eval_params = { "batch_size": opt.batch_size, "shuffle": False, "drop_last": False, "num_workers": opt.num_workers, "collate_fn": collate_fn } dboxes = generate_dboxes() model = SSD() train_set = OIDataset(SimpleTransformer(dboxes), train=True) train_loader = DataLoader(train_set, **train_params) val_set = OIDataset(SimpleTransformer(dboxes, eval=True), validation=True) val_loader = DataLoader(val_set, **eval_params) encoder = Encoder(dboxes) opt.lr = opt.lr * (opt.batch_size / 32) criterion = Loss(dboxes) optimizer = torch.optim.SGD(model.parameters(), lr=opt.lr, momentum=opt.momentum, weight_decay=opt.weight_decay, nesterov=True) scheduler = MultiStepLR(optimizer=optimizer, milestones=opt.multistep, gamma=0.1) if torch.cuda.is_available(): model.cuda() criterion.cuda() model = torch.nn.DataParallel(model) if os.path.isdir(opt.log_path): shutil.rmtree(opt.log_path) os.makedirs(opt.log_path) if not os.path.isdir(opt.save_folder): os.makedirs(opt.save_folder) checkpoint_path = os.path.join(opt.save_folder, "SSD.pth") writer = SummaryWriter(opt.log_path) if os.path.isfile(checkpoint_path): checkpoint = torch.load(checkpoint_path) first_epoch = checkpoint["epoch"] + 1 model.module.load_state_dict(checkpoint["model_state_dict"]) scheduler.load_state_dict(checkpoint["scheduler"]) optimizer.load_state_dict(checkpoint["optimizer"]) else: first_epoch = 0 for epoch in range(first_epoch, opt.epochs): train(model, train_loader, epoch, writer, criterion, optimizer, scheduler) evaluate(model, val_loader, encoder, opt.nms_threshold) checkpoint = { "epoch": epoch, "model_state_dict": model.module.state_dict(), "optimizer": optimizer.state_dict(), "scheduler": scheduler.state_dict() } torch.save(checkpoint, checkpoint_path)
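# Evaluation loaders are kept in a fixed order (shuffle=False, as corrected
# above) so that metrics and score files are reproducible run to run; only the
# training loader benefits from shuffling. A tiny demonstration:
import torch
from torch.utils.data import DataLoader, TensorDataset

ds = TensorDataset(torch.arange(8).float())
val_loader = DataLoader(ds, batch_size=4, shuffle=False)
print([batch[0].tolist() for batch in val_loader])
# always [[0.0, 1.0, 2.0, 3.0], [4.0, 5.0, 6.0, 7.0]]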
def run_resnet(args): # The Resnet paper applies the following transforms on the train set; note that ToTensor() must precede Normalize(), which operates on tensors train_set = datasets.CIFAR100( "./data/", train=True, download=True, transform=transforms.Compose([ transforms.Pad(4), transforms.RandomHorizontalFlip(), transforms.RandomCrop(32), transforms.ToTensor(), transforms.Normalize( # pre-computed (0.5071, 0.4865, 0.4409), (0.2673, 0.2564, 0.2762)) ])) test_set = datasets.CIFAR100( "./data/", train=False, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize( # pre-computed (0.5088, 0.4874, 0.4419), (0.2683, 0.2574, 0.2771)) ])) train_loader = torch.utils.data.DataLoader(train_set, batch_size=args.b, shuffle=True) test_loader = torch.utils.data.DataLoader(test_set, batch_size=args.b, shuffle=False) checkpoints_dir = "checkpoints/resnet/" final_dir = 'models/' device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') model = Resnet(3).to(device) optimizer = optim.SGD(model.parameters(), lr=0.1, weight_decay=0.0001, momentum=0.9) loss = nn.CrossEntropyLoss() # the paper decays the learning rate at 32k and 64k iterations; approximated here as epoch milestones scheduler = MultiStepLR(optimizer, milestones=[320, 480], gamma=0.1) for epoch in range(0, 640): loss_train = 0.0 for images, labels in train_loader: images = images.to(device) labels = labels.to(device) predictions = model(images) batch_loss = loss(predictions, labels) optimizer.zero_grad() batch_loss.backward() optimizer.step() loss_train += batch_loss.item() print('{} Epoch {}, Training loss {}'.format( datetime.datetime.now(), epoch + 1, loss_train / len(train_loader))) scheduler.step() if epoch % 100 == 0: checkpoint_path = os.path.join(checkpoints_dir, 'epoch_' + str(epoch) + '.pt') torch.save( { 'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'scheduler_state_dict': scheduler.state_dict() }, checkpoint_path) model_path = os.path.join(final_dir, 'resnet.pth') torch.save(model.state_dict(), model_path) model.eval() with torch.no_grad(): correct = 0 total = 0 for images, labels in test_loader: images = images.to(device) labels = labels.to(device) outputs = model(images) _, predicted = torch.max(outputs, dim=1) total += labels.shape[0] correct += (predicted == labels).sum().item() print("Accuracy = {}".format(100 * (correct / total)))
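# torchvision's Normalize operates on tensors, so ToTensor() must come before
# it, while the geometric augmentations (Pad / flip / RandomCrop) act on PIL
# images and belong before ToTensor(), which is the ordering used in the
# corrected pipeline above. A self-contained check:
from PIL import Image
import torchvision.transforms as transforms

train_tf = transforms.Compose([
    transforms.Pad(4),
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32),
    transforms.ToTensor(),
    transforms.Normalize((0.5071, 0.4865, 0.4409), (0.2673, 0.2564, 0.2762)),
])
x = train_tf(Image.new('RGB', (32, 32)))
print(x.shape)  # torch.Size([3, 32, 32])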
class Learner: def __init__(self): self.args = self.parse_command_line() self.checkpoint_dir, self.logfile, self.checkpoint_path_validation, self.checkpoint_path_final \ = get_log_files(self.args.checkpoint_dir, self.args.resume_from_checkpoint, False) print_and_log(self.logfile, "Options: %s\n" % self.args) print_and_log(self.logfile, "Checkpoint Directory: %s\n" % self.checkpoint_dir) self.writer = SummaryWriter() #gpu_device = 'cuda:0' gpu_device = 'cuda' self.device = torch.device( gpu_device if torch.cuda.is_available() else 'cpu') self.model = self.init_model() self.train_set, self.validation_set, self.test_set = self.init_data() self.vd = video_reader.VideoDataset(self.args) self.video_loader = torch.utils.data.DataLoader( self.vd, batch_size=1, num_workers=self.args.num_workers) self.loss = loss self.accuracy_fn = aggregate_accuracy if self.args.opt == "adam": self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.args.learning_rate) elif self.args.opt == "sgd": self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.args.learning_rate) self.test_accuracies = TestAccuracies(self.test_set) self.scheduler = MultiStepLR(self.optimizer, milestones=self.args.sch, gamma=0.1) self.start_iteration = 0 if self.args.resume_from_checkpoint: self.load_checkpoint() self.optimizer.zero_grad() def init_model(self): model = CNN_TRX(self.args) model = model.to(self.device) if self.args.num_gpus > 1: model.distribute_model() return model def init_data(self): train_set = [self.args.dataset] validation_set = [self.args.dataset] test_set = [self.args.dataset] return train_set, validation_set, test_set """ Command line parser """ def parse_command_line(self): parser = argparse.ArgumentParser() parser.add_argument("--dataset", choices=["ssv2", "kinetics"], default="ssv2", help="Dataset to use.") parser.add_argument("--learning_rate", "-lr", type=float, default=0.001, help="Learning rate.") parser.add_argument( "--tasks_per_batch", type=int, default=16, help="Number of tasks between parameter optimizations.") parser.add_argument("--checkpoint_dir", "-c", default=None, help="Directory to save checkpoint to.") parser.add_argument("--test_model_path", "-m", default=None, help="Path to model to load and test.") parser.add_argument("--training_iterations", "-i", type=int, default=50020, help="Number of meta-training iterations.") parser.add_argument("--resume_from_checkpoint", "-r", dest="resume_from_checkpoint", default=False, action="store_true", help="Restart from latest checkpoint.") parser.add_argument("--way", type=int, default=5, help="Way of single dataset task.") parser.add_argument( "--shot", type=int, default=1, help="Shots per class for context of single dataset task.") parser.add_argument("--query_per_class", type=int, default=5, help="Target samples (i.e. 
queries) per class.") parser.add_argument("--seq_len", type=int, default=8, help="Frames per video.") parser.add_argument("--num_workers", type=int, default=10, help="Num dataloader workers.") parser.add_argument("--method", choices=["resnet18", "resnet34", "resnet50"], default="resnet50", help="method") parser.add_argument("--trans_linear_out_dim", type=int, default=1152, help="Transformer linear_out_dim") parser.add_argument("--opt", choices=["adam", "sgd"], default="sgd", help="Optimizer") parser.add_argument("--trans_dropout", type=float, default=0.1, help="Transformer dropout") parser.add_argument( "--save_freq", type=int, default=5000, help="Number of iterations between checkpoint saves.") parser.add_argument("--img_size", type=int, default=224, help="Input image size to the CNN after cropping.") parser.add_argument('--temp_set', nargs='+', type=int, help='cardinalities e.g. 2,3 is pairs and triples', default=[2, 3]) parser.add_argument("--scratch", choices=["bc", "bp"], default="bp", help="Computer to run on") parser.add_argument("--num_gpus", type=int, default=1, help="Number of GPUs to split the ResNet over") parser.add_argument("--debug_loader", default=False, action="store_true", help="Load 1 vid per class for debugging") parser.add_argument("--split", type=int, default=3, help="Dataset split.") parser.add_argument('--sch', nargs='+', type=int, help='iters to drop learning rate', default=[1000000]) args = parser.parse_args() if args.scratch == "bc": args.scratch = "/mnt/storage/home/tp8961/scratch" elif args.scratch == "bp": args.num_gpus = 4 args.num_workers = 5 args.scratch = "/work/tp8961" if args.checkpoint_dir is None: print("need to specify a checkpoint dir") exit(1) if (args.method == "resnet50") or (args.method == "resnet34"): args.img_size = 224 if args.method == "resnet50": args.trans_linear_in_dim = 2048 else: args.trans_linear_in_dim = 512 if args.dataset == "ssv2": args.traintestlist = os.path.join( args.scratch, "video_datasets/splits/somethingsomethingv2TrainTestlist") args.path = os.path.join( args.scratch, "video_datasets/data/somethingsomethingv2_256x256q5_1.zip") elif args.dataset == "kinetics": args.traintestlist = os.path.join( args.scratch, "video_datasets/splits/kineticsTrainTestlist") args.path = os.path.join( args.scratch, "video_datasets/data/kinetics_256q5_1.zip") return args def run(self): config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True with tf.compat.v1.Session(config=config) as session: train_accuracies = [] losses = [] total_iterations = self.args.training_iterations iteration = self.start_iteration for task_dict in self.video_loader: if iteration >= total_iterations: break iteration += 1 torch.set_grad_enabled(True) task_loss, task_accuracy = self.train_task(task_dict) train_accuracies.append(task_accuracy) losses.append(task_loss) # optimize if ((iteration + 1) % self.args.tasks_per_batch == 0) or (iteration == (total_iterations - 1)): self.optimizer.step() self.optimizer.zero_grad() self.scheduler.step() if (iteration + 1) % PRINT_FREQUENCY == 0: # print training stats print_and_log( self.logfile, 'Task [{}/{}], Train Loss: {:.7f}, Train Accuracy: {:.7f}' .format(iteration + 1, total_iterations, torch.Tensor(losses).mean().item(), torch.Tensor(train_accuracies).mean().item())) train_accuracies = [] losses = [] if ((iteration + 1) % self.args.save_freq == 0) and (iteration + 1) != total_iterations: self.save_checkpoint(iteration + 1) if ((iteration + 1) in TEST_ITERS) and (iteration + 1) != total_iterations: accuracy_dict
= self.test(session) self.test_accuracies.print(self.logfile, accuracy_dict) # save the final model torch.save(self.model.state_dict(), self.checkpoint_path_final) self.logfile.close() def train_task(self, task_dict): context_images, target_images, context_labels, target_labels, real_target_labels, batch_class_list = self.prepare_task( task_dict) model_dict = self.model(context_images, context_labels, target_images) target_logits = model_dict['logits'] task_loss = self.loss(target_logits, target_labels, self.device) / self.args.tasks_per_batch task_accuracy = self.accuracy_fn(target_logits, target_labels) task_loss.backward(retain_graph=False) return task_loss, task_accuracy def test(self, session): self.model.eval() with torch.no_grad(): self.video_loader.dataset.train = False accuracy_dict = {} accuracies = [] iteration = 0 item = self.args.dataset for task_dict in self.video_loader: if iteration >= NUM_TEST_TASKS: break iteration += 1 context_images, target_images, context_labels, target_labels, real_target_labels, batch_class_list = self.prepare_task( task_dict) model_dict = self.model(context_images, context_labels, target_images) target_logits = model_dict['logits'] accuracy = self.accuracy_fn(target_logits, target_labels) accuracies.append(accuracy.item()) del target_logits accuracy = np.array(accuracies).mean() * 100.0 confidence = (196.0 * np.array(accuracies).std()) / np.sqrt( len(accuracies)) accuracy_dict[item] = { "accuracy": accuracy, "confidence": confidence } self.video_loader.dataset.train = True self.model.train() return accuracy_dict def prepare_task(self, task_dict, images_to_device=True): context_images, context_labels = task_dict['support_set'][ 0], task_dict['support_labels'][0] target_images, target_labels = task_dict['target_set'][0], task_dict[ 'target_labels'][0] real_target_labels = task_dict['real_target_labels'][0] batch_class_list = task_dict['batch_class_list'][0] if images_to_device: context_images = context_images.to(self.device) target_images = target_images.to(self.device) context_labels = context_labels.to(self.device) target_labels = target_labels.type(torch.LongTensor).to(self.device) return context_images, target_images, context_labels, target_labels, real_target_labels, batch_class_list def shuffle(self, images, labels): """ Return shuffled data. """ permutation = np.random.permutation(images.shape[0]) return images[permutation], labels[permutation] def save_checkpoint(self, iteration): d = { 'iteration': iteration, 'model_state_dict': self.model.state_dict(), 'optimizer_state_dict': self.optimizer.state_dict(), 'scheduler': self.scheduler.state_dict() } torch.save( d, os.path.join(self.checkpoint_dir, 'checkpoint{}.pt'.format(iteration))) torch.save(d, os.path.join(self.checkpoint_dir, 'checkpoint.pt')) def load_checkpoint(self): checkpoint = torch.load( os.path.join(self.checkpoint_dir, 'checkpoint.pt')) self.start_iteration = checkpoint['iteration'] self.model.load_state_dict(checkpoint['model_state_dict']) self.optimizer.load_state_dict(checkpoint['optimizer_state_dict']) self.scheduler.load_state_dict(checkpoint['scheduler'])
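# Learner.run() accumulates gradients: train_task() divides each task loss by
# tasks_per_batch and calls backward() immediately, while optimizer.step() runs
# only every tasks_per_batch iterations. The same pattern in miniature
# (illustrative model and data, not from the class above):
import torch
import torch.nn as nn
import torch.nn.functional as F

model = nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
tasks_per_batch = 4

optimizer.zero_grad()
for it in range(16):
    x, y = torch.randn(5, 8), torch.randint(0, 2, (5,))
    loss = F.cross_entropy(model(x), y) / tasks_per_batch
    loss.backward()  # gradients accumulate across tasks
    if (it + 1) % tasks_per_batch == 0:
        optimizer.step()
        optimizer.zero_grad()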
def main(args: argparse.Namespace): logger = CompleteLogger(args.log, args.phase) print(args) if args.seed is not None: random.seed(args.seed) torch.manual_seed(args.seed) cudnn.deterministic = True warnings.warn('You have chosen to seed training. ' 'This will turn on the CUDNN deterministic setting, ' 'which can slow down your training considerably! ' 'You may see unexpected behavior when restarting ' 'from checkpoints.') cudnn.benchmark = True # Data loading code normalize = T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) train_transform = T.Compose([ T.RandomRotation(args.rotation), T.RandomResizedCrop(size=args.image_size, scale=args.resize_scale), T.ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25), T.GaussianBlur(), T.ToTensor(), normalize ]) val_transform = T.Compose( [T.Resize(args.image_size), T.ToTensor(), normalize]) image_size = (args.image_size, args.image_size) heatmap_size = (args.heatmap_size, args.heatmap_size) source_dataset = datasets.__dict__[args.source] train_source_dataset = source_dataset(root=args.source_root, transforms=train_transform, image_size=image_size, heatmap_size=heatmap_size) train_source_loader = DataLoader(train_source_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True, drop_last=True) val_source_dataset = source_dataset(root=args.source_root, split='test', transforms=val_transform, image_size=image_size, heatmap_size=heatmap_size) val_source_loader = DataLoader(val_source_dataset, batch_size=args.batch_size, shuffle=False, pin_memory=True) target_dataset = datasets.__dict__[args.target] train_target_dataset = target_dataset(root=args.target_root, transforms=train_transform, image_size=image_size, heatmap_size=heatmap_size) train_target_loader = DataLoader(train_target_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True, drop_last=True) val_target_dataset = target_dataset(root=args.target_root, split='test', transforms=val_transform, image_size=image_size, heatmap_size=heatmap_size) val_target_loader = DataLoader(val_target_dataset, batch_size=args.batch_size, shuffle=False, pin_memory=True) print("Source train:", len(train_source_loader)) print("Target train:", len(train_target_loader)) print("Source test:", len(val_source_loader)) print("Target test:", len(val_target_loader)) train_source_iter = ForeverDataIterator(train_source_loader) train_target_iter = ForeverDataIterator(train_target_loader) # create model model = models.__dict__[args.arch]( num_keypoints=train_source_dataset.num_keypoints).to(device) criterion = JointsMSELoss() # define optimizer and lr scheduler optimizer = Adam(model.get_parameters(lr=args.lr)) lr_scheduler = MultiStepLR(optimizer, args.lr_step, args.lr_factor) # optionally resume from a checkpoint start_epoch = 0 if args.resume: checkpoint = torch.load(args.resume, map_location='cpu') model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) start_epoch = checkpoint['epoch'] + 1 # define visualization function tensor_to_image = Compose([ Denormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), ToPILImage() ]) def visualize(image, keypoint2d, name): """ Args: image (tensor): image in shape 3 x H x W keypoint2d (tensor): keypoints in shape K x 2 name: name of the saving image """ train_source_dataset.visualize( tensor_to_image(image), keypoint2d, logger.get_image_path("{}.jpg".format(name))) if args.phase == 'test': # evaluate on 
validation set source_val_acc = validate(val_source_loader, model, criterion, None, args) target_val_acc = validate(val_target_loader, model, criterion, visualize, args) print("Source: {:4.3f} Target: {:4.3f}".format(source_val_acc['all'], target_val_acc['all'])) for name, acc in target_val_acc.items(): print("{}: {:4.3f}".format(name, acc)) return # start training best_acc = 0 for epoch in range(start_epoch, args.epochs): logger.set_epoch(epoch) lr_scheduler.step() # train for one epoch train(train_source_iter, train_target_iter, model, criterion, optimizer, epoch, visualize if args.debug else None, args) # evaluate on validation set source_val_acc = validate(val_source_loader, model, criterion, None, args) target_val_acc = validate(val_target_loader, model, criterion, visualize if args.debug else None, args) # remember best acc and save checkpoint torch.save( { 'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'args': args }, logger.get_checkpoint_path(epoch)) if target_val_acc['all'] > best_acc: shutil.copy(logger.get_checkpoint_path(epoch), logger.get_checkpoint_path('best')) best_acc = target_val_acc['all'] print("Source: {:4.3f} Target: {:4.3f} Target(best): {:4.3f}".format( source_val_acc['all'], target_val_acc['all'], best_acc)) for name, acc in target_val_acc.items(): print("{}: {:4.3f}".format(name, acc)) logger.close()
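# ForeverDataIterator above wraps a DataLoader so that next() can be called
# indefinitely across epoch boundaries, which is what lets the training loop
# draw source and target batches in lockstep. A minimal re-implementation
# consistent with how it is used here (the library helper may differ in detail):
from torch.utils.data import DataLoader

class ForeverDataIterator:
    def __init__(self, data_loader: DataLoader):
        self.data_loader = data_loader
        self.iter = iter(data_loader)

    def __next__(self):
        try:
            return next(self.iter)
        except StopIteration:
            # loader exhausted: transparently start a new epoch
            self.iter = iter(self.data_loader)
            return next(self.iter)

    def __len__(self):
        return len(self.data_loader)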
def main(): start_epoch = 0 best_prec1 = 0.0 seed = np.random.randint(10000) if seed is not None: np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed(seed) if args.gpus is not None: device = torch.device("cuda:{}".format(args.gpus[0])) cudnn.benchmark = False # cudnn.deterministic = True cudnn.enabled = True else: device = torch.device("cpu") now = datetime.now().strftime('%Y-%m-%d-%H:%M:%S') if args.mission is not None: if 'vgg' == args.arch and args.batchnorm: args.job_dir = f'{args.job_dir}/{args.dataset}/{args.arch}{args.num_layers}_bn/{args.mission}/{now}' elif 'resnet20' == args.arch: args.job_dir = f'{args.job_dir}/{args.dataset}/{args.arch}/{args.mission}/{now}' else: args.job_dir = f'{args.job_dir}/{args.dataset}/{args.arch}{args.num_layers}/{args.mission}/{now}' else: if 'vgg' == args.arch and args.batchnorm: args.job_dir = f'{args.job_dir}/{args.dataset}/{args.arch}{args.num_layers}_bn/{now}' else: args.job_dir = f'{args.job_dir}/{args.dataset}/{args.arch}{args.num_layers}/{now}' _make_dir(args.job_dir) ckpt = utils.checkpoint(args) print_logger = utils.get_logger(os.path.join(args.job_dir, "logger.log")) utils.print_params(vars(args), print_logger.info) writer_train = SummaryWriter(args.job_dir + '/run/train') writer_test = SummaryWriter(args.job_dir + '/run/test') ## hyperparameters settings ## n_layers = (args.num_layers - 2) * 2 unit_k_bits = int(args.k_bits) kbits_list = [unit_k_bits for i in range(n_layers)] print_logger.info(f'k_bits_list {kbits_list}') # Data loading print('=> Preparing data..') if args.dataset in ['cifar10', 'cifar100', 'mnist']: IMAGE_SIZE = 32 elif args.dataset == 'tinyimagenet': IMAGE_SIZE = 64 else: IMAGE_SIZE = 224 if args.dataset == 'imagenet': train_loader = get_imagenet_iter_dali(type='train', image_dir=args.data_dir, batch_size=args.train_batch_size, num_threads=args.workers, crop=IMAGE_SIZE, device_id=0, num_gpus=1) val_loader = get_imagenet_iter_dali(type='val', image_dir=args.data_dir, batch_size=args.eval_batch_size, num_threads=args.workers, crop=IMAGE_SIZE, device_id=0, num_gpus=1) elif args.dataset == 'tinyimagenet': train_loader = get_imagenet_iter_dali(type='train', image_dir=args.data_dir, batch_size=args.train_batch_size, num_threads=args.workers, crop=IMAGE_SIZE, device_id=0, num_gpus=1) val_loader = get_imagenet_iter_dali(type='val', image_dir=args.data_dir, batch_size=args.eval_batch_size, num_threads=args.workers, crop=IMAGE_SIZE, device_id=0, num_gpus=1) elif args.dataset == 'cifar10': train_loader = get_cifar_iter_dali(type='train', image_dir=args.data_dir, batch_size=args.train_batch_size, num_threads=args.workers) val_loader = get_cifar_iter_dali(type='val', image_dir=args.data_dir, batch_size=args.eval_batch_size, num_threads=args.workers) # Create model print('=> Building model...') if args.dataset == 'cifar10': num_classes = 10 train_data_length = 50000 eval_data_length = 10000 elif args.dataset == 'imagenet': num_classes = 1000 train_data_length = 1281167 eval_data_length = 50000 # arch = args.arch # model = models.__dict__[arch] model_config = {'k_bits': kbits_list, 'num_layers': args.num_layers, 'pre_k_bits': args.pre_k_bits, 'ratio': args.ratio} if args.arch == 'mobilenetv2': model_config = {'k_bits': kbits_list, 'num_layers': args.num_layers, 'pre_k_bits': args.pre_k_bits, 'ratio': args.ratio, 'width_mult': args.width_mult} if 'vgg' == args.arch and args.batchnorm: model, model_k_bits = import_module(f"models.{args.dataset}.{args.archtype}.{args.arch}").__dict__[f'{args.arch}{args.num_layers}_bn'](model_config) elif 'resnet20' == args.arch:
model, model_k_bits = import_module(f"models.{args.dataset}.{args.archtype}.{args.arch}").__dict__[f'{args.arch}'](model_config) else: model, model_k_bits = import_module(f"models.{args.dataset}.{args.archtype}.{args.arch}").__dict__[f'{args.arch}{args.num_layers}'](model_config) model = model.to(device) print_logger.info(f'model_k_bits_list {model_k_bits}') # Define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) scheduler = MultiStepLR(optimizer, milestones=[0.5 * args.train_epochs, 0.75 * args.train_epochs], gamma=0.1) # Optionally resume from a checkpoint resume = args.resume if resume: print('=> Loading checkpoint {}'.format(resume)) checkpoint = torch.load(resume, map_location=device) state_dict = checkpoint['state_dict'] start_epoch = checkpoint['epoch'] pre_train_best_prec1 = checkpoint['best_prec1'] model_check = load_check(state_dict, model) model.load_state_dict(model_check) print('Prec@1:', pre_train_best_prec1) if args.test_only: test_prec1 = test(args, device, val_loader, eval_data_length, model, criterion, writer_test, print_logger, start_epoch) print('=> Test Prec@1: {:.2f}'.format(test_prec1)) print(f'sample k_bits {kbits_list}') return for epoch in range(0, args.train_epochs): scheduler.step(epoch) train_loss, train_prec1 = train(args, device, train_loader, train_data_length, model, criterion, optimizer, writer_train, print_logger, epoch) test_prec1 = test(args, device, val_loader, eval_data_length, model, criterion, writer_test, print_logger, epoch) is_best = best_prec1 < test_prec1 best_prec1 = max(test_prec1, best_prec1) state = { 'state_dict': model.state_dict(), 'test_prec1': test_prec1, 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), 'epoch': epoch + 1 } ckpt.save_model(state, epoch + 1, is_best, mode='train') print_logger.info('==> BEST ACC {:.3f}'.format(best_prec1.item()))
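# The scheduler above receives milestones=[0.5 * epochs, 0.75 * epochs], which
# are floats. In recent PyTorch versions MultiStepLR checks milestone
# membership against the integer epoch counter, so a fractional milestone such
# as 0.5 * 75 = 37.5 may never fire (older versions used bisect and tolerated
# floats). Rounding to ints up front avoids the silent miss; a defensive
# sketch, not the original code:
def milestone_epochs(total_epochs, fractions=(0.5, 0.75)):
    return sorted({int(round(f * total_epochs)) for f in fractions})

print(milestone_epochs(100))  # [50, 75]
print(milestone_epochs(75))   # [38, 56]: 37.5 would have been skipped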
def main(): # Views the training images and displays the distance on anchor-negative and anchor-positive # print the experiment configuration print('\nCurrent time is \33[91m{}\33[0m'.format(str(time.asctime()))) print('Parsed options: {}'.format(vars(args))) print('Number of Classes: {}\n'.format(len(train_dir.speakers))) # instantiate model and initialize weights model = AttenSiResNet(layers=[3, 4, 6, 3], num_classes=len(train_dir.speakers)) if args.cuda: model.cuda() optimizer = create_optimizer(model.parameters(), args.optimizer, **opt_kwargs) scheduler = MultiStepLR(optimizer, milestones=[18, 24], gamma=0.1) # criterion = AngularSoftmax(in_feats=args.embedding_size, # num_classes=len(train_dir.classes)) start = 0 # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print('=> loading checkpoint {}'.format(args.resume)) checkpoint = torch.load(args.resume) start = checkpoint['epoch'] filtered = {k: v for k, v in checkpoint['state_dict'].items() if 'num_batches_tracked' not in k} model.load_state_dict(filtered) optimizer.load_state_dict(checkpoint['optimizer']) scheduler.load_state_dict(checkpoint['scheduler']) # criterion.load_state_dict(checkpoint['criterion']) else: print('=> no checkpoint found at {}'.format(args.resume)) start += args.start_epoch print('Start epoch is : ' + str(start)) end = start + args.epochs # pdb.set_trace() train_loader = torch.utils.data.DataLoader(train_dir, batch_size=args.batch_size, # collate_fn=PadCollate(dim=2, fix_len=True), shuffle=True, **kwargs) valid_loader = torch.utils.data.DataLoader(valid_dir, batch_size=args.batch_size, # collate_fn=PadCollate(dim=2, fix_len=True), shuffle=False, **kwargs) test_loader = torch.utils.data.DataLoader(test_part, batch_size=args.test_batch_size, shuffle=False, **kwargs) criterion = nn.CrossEntropyLoss().cuda() check_path = '{}/checkpoint_{}.pth'.format(args.check_path, -1) torch.save({'epoch': -1, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict()}, # 'criterion': criterion.state_dict() check_path) for epoch in range(start, end): # pdb.set_trace() for param_group in optimizer.param_groups: print('\n\33[1;34m Current \'{}\' learning rate is {}.\33[0m'.format(args.optimizer, param_group['lr'])) train(train_loader, model, optimizer, criterion, scheduler, epoch) test(test_loader, valid_loader, model, epoch) scheduler.step() # break writer.close()
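# The resume path above drops BatchNorm's num_batches_tracked entries, which
# exist in checkpoints from newer PyTorch versions but not in models saved by
# older ones (or vice versa). A sketch of loading across that mismatch;
# strict=False tolerates the keys that the filtering removed:
import torch.nn as nn

model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8))
state = model.state_dict()
filtered = {k: v for k, v in state.items() if 'num_batches_tracked' not in k}
model.load_state_dict(filtered, strict=False)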
def main(opt): if torch.cuda.is_available(): torch.distributed.init_process_group(backend='nccl', init_method='env://') num_gpus = torch.distributed.get_world_size() torch.cuda.manual_seed(123) else: torch.manual_seed(123) num_gpus = 1 train_params = { "batch_size": opt.batch_size * num_gpus, "shuffle": True, "drop_last": False, "num_workers": opt.num_workers, "collate_fn": collate_fn } test_params = { "batch_size": opt.batch_size * num_gpus, "shuffle": False, "drop_last": False, "num_workers": opt.num_workers, "collate_fn": collate_fn } if opt.model == "ssd": dboxes = generate_dboxes(model="ssd") model = SSD(backbone=ResNet(), num_classes=len(coco_classes)) else: dboxes = generate_dboxes(model="ssdlite") model = SSDLite(backbone=MobileNetV2(), num_classes=len(coco_classes)) train_set = CocoDataset(opt.data_path, 2017, "train", SSDTransformer(dboxes, (300, 300), val=False)) train_loader = DataLoader(train_set, **train_params) test_set = CocoDataset(opt.data_path, 2017, "val", SSDTransformer(dboxes, (300, 300), val=True)) test_loader = DataLoader(test_set, **test_params) encoder = Encoder(dboxes) opt.lr = opt.lr * num_gpus * (opt.batch_size / 32) criterion = Loss(dboxes) optimizer = torch.optim.SGD(model.parameters(), lr=opt.lr, momentum=opt.momentum, weight_decay=opt.weight_decay, nesterov=True) scheduler = MultiStepLR(optimizer=optimizer, milestones=opt.multistep, gamma=0.1) if torch.cuda.is_available(): model.cuda() criterion.cuda() if opt.amp: from apex import amp from apex.parallel import DistributedDataParallel as DDP model, optimizer = amp.initialize(model, optimizer, opt_level='O1') else: from torch.nn.parallel import DistributedDataParallel as DDP # It is recommended to use DistributedDataParallel, instead of DataParallel # to do multi-GPU training, even if there is only a single node. model = DDP(model) if os.path.isdir(opt.log_path): shutil.rmtree(opt.log_path) os.makedirs(opt.log_path) if not os.path.isdir(opt.save_folder): os.makedirs(opt.save_folder) checkpoint_path = os.path.join(opt.save_folder, "SSD.pth") writer = SummaryWriter(opt.log_path) if os.path.isfile(checkpoint_path): checkpoint = torch.load(checkpoint_path) first_epoch = checkpoint["epoch"] + 1 model.module.load_state_dict(checkpoint["model_state_dict"]) scheduler.load_state_dict(checkpoint["scheduler"]) optimizer.load_state_dict(checkpoint["optimizer"]) else: first_epoch = 0 for epoch in range(first_epoch, opt.epochs): train(model, train_loader, epoch, writer, criterion, optimizer, scheduler, opt.amp) evaluate(model, test_loader, epoch, writer, encoder, opt.nms_threshold) checkpoint = { "epoch": epoch, "model_state_dict": model.module.state_dict(), "optimizer": optimizer.state_dict(), "scheduler": scheduler.state_dict() } torch.save(checkpoint, checkpoint_path)
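# Note that the training DataLoader above uses shuffle=True with no sampler, so
# under DistributedDataParallel every rank iterates the full dataset. The usual
# per-rank sharding looks like this (a sketch assuming the process group has
# been initialized, as at the top of main()):
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

def make_ddp_loader(dataset, batch_size, epoch, rank, world_size):
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank,
                                 shuffle=True)
    sampler.set_epoch(epoch)  # reshuffle differently each epoch
    # shuffle must be False (or omitted) when a sampler is supplied
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler,
                      shuffle=False)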
mean_generator_adversarial_loss / len(train_dataloader), mean_generator_content_loss / len(train_dataloader), mean_generator_total_loss / len(train_dataloader))) log_value('generator_perceptual_loss', mean_generator_perceptual_loss / len(train_dataloader), epoch) log_value('generator_adversarial_loss', mean_generator_adversarial_loss / len(train_dataloader), epoch) log_value('generator_content_loss', mean_generator_content_loss / len(train_dataloader), epoch) log_value('generator_total_loss', mean_generator_total_loss / len(train_dataloader), epoch) log_value('discriminator_loss', mean_discriminator_loss / len(train_dataloader), epoch) scheduler_generator.step() scheduler_discriminator.step() # Do checkpointing: save generator and discriminator states generator_state = {'generator_model': generator.state_dict(), 'generator_optimizer': optim_generator.state_dict(), 'scheduler_generator': scheduler_generator.state_dict(), 'epoch': epoch} discriminator_state = {'discriminator_model': discriminator.state_dict(), 'discriminator_optimizer': optim_discriminator.state_dict(), 'scheduler_discriminator': scheduler_discriminator.state_dict(), 'epoch': epoch} # save model torch.save(generator_state, opt.generatorWeights) torch.save(discriminator_state, opt.discriminatorWeights) if epoch % 5 == 0: # validation set out_path = 'pretraining_results/SRF_' + str(opt.upSampling) + '/' if not os.path.exists(out_path): os.makedirs(out_path) with torch.no_grad():
def main_train(args): # parse command-line arguments if args.resume_training is not None: if not os.path.isfile(args.resume_training): print(f"{args.resume_training} is not a valid file!") return else: print(f"Loading checkpoint: {args.resume_training}") cuda = args.cuda resume = args.resume_training batch_size = args.batch_size milestones = args.milestones lr = args.lr total_epoch = args.epochs resume_checkpoint_filename = args.resume_training best_model_name = args.best_model_name checkpoint_name = args.best_model_name data_path = args.data_path start_epoch = 1 print("Loading data...") dataset = ISONetData(data_path=data_path) dataset_test = ISONetData(data_path=data_path, train=False) data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=6, pin_memory=True) data_loader_test = DataLoader(dataset=dataset_test, batch_size=batch_size, shuffle=False) print("Data loaded successfully...") print(f"Training set size: {len(dataset)}") print(f"Validation set size: {len(dataset_test)}") model_path = Path("models") checkpoint_path = model_path.joinpath("checkpoint") if not model_path.exists(): model_path.mkdir() if not checkpoint_path.exists(): checkpoint_path.mkdir() if torch.cuda.is_available(): device = torch.cuda.current_device() else: print("CUDA is not available!") cuda = False net = ISONet() criterion = nn.MSELoss(reduction="mean") optimizer = optim.Adam(net.parameters(), lr=lr) if cuda: net = net.to(device=device) criterion = criterion.to(device=device) scheduler = MultiStepLR(optimizer=optimizer, milestones=milestones, gamma=0.1) writer = SummaryWriter() # resume training if resume: print("Resuming training...") checkpoint = torch.load( checkpoint_path.joinpath(resume_checkpoint_filename)) net.load_state_dict(checkpoint["net"]) optimizer.load_state_dict(checkpoint["optimizer"]) scheduler.load_state_dict(checkpoint["scheduler"]) resume_epoch = checkpoint["epoch"] best_test_loss = checkpoint["best_test_loss"] start_epoch = resume_epoch + 1 print(f"Resuming from epoch [{start_epoch}]...") print(f"Previous best loss: [{best_test_loss}]...") else: # initialize weights for m in net.modules(): if isinstance(m, nn.Conv2d): nn.init.kaiming_normal_(m.weight) elif isinstance(m, nn.Linear): nn.init.constant_(m.bias, 0) if not locals().get("best_test_loss"): best_test_loss = 0 record = 0 for epoch in range(start_epoch, total_epoch): print(f"Starting epoch [{epoch}]...") net.train() writer.add_scalar("Train/Learning Rate", scheduler.get_last_lr()[0], epoch) for i, (data, label) in enumerate(data_loader, 0): if i == 0: start_time = int(time.time()) if cuda: data = data.to(device=device) label = label.to(device=device) label = label.unsqueeze(1) optimizer.zero_grad() output = net(data) loss = criterion(output, label) loss.backward() optimizer.step() if i % 500 == 499: end_time = int(time.time()) use_time = end_time - start_time print( f">>> epoch[{epoch}] loss[{loss:.4f}] {i * batch_size}/{len(dataset)} lr{scheduler.get_last_lr()} ", end="") left_time = ((len(dataset) - i * batch_size) / 500 / batch_size) * (end_time - start_time) print( f"elapsed: [{use_time:.2f}]s, estimated time left: [{left_time:.2f}]s" ) start_time = end_time # log to tensorboard if i % 128 == 127: writer.add_scalar("Train/loss", loss, record) record += 1 # validate print("Evaluating model...") net.eval() test_loss = 0 with torch.no_grad(): loss_t = nn.MSELoss(reduction="mean") if cuda: loss_t = loss_t.to(device) for data, label in data_loader_test: if cuda: data = data.to(device) label = label.to(device) # expand dim label = label.unsqueeze_(1) predict = net(data) # sum up batch loss test_loss += loss_t(predict, label).item() test_loss /= len(dataset_test) test_loss *= batch_size print(
f'\nTest Data: Average batch[{batch_size}] loss: {test_loss:.4f}\n' ) scheduler.step() writer.add_scalar("Test/Loss", test_loss, epoch) checkpoint = { "net": net.state_dict(), "optimizer": optimizer.state_dict(), "epoch": epoch, "scheduler": scheduler.state_dict(), "best_test_loss": best_test_loss } if best_test_loss == 0: print("Saving model...") torch.save(net.state_dict(), model_path.joinpath(best_model_name)) best_test_loss = test_loss else: # save the improved model if test_loss < best_test_loss: print("Found a better model, saving...") torch.save(net.state_dict(), model_path.joinpath(best_model_name)) best_test_loss = test_loss # save a checkpoint periodically if epoch % args.save_every_epochs == 0: c_time = time2str() torch.save( checkpoint, checkpoint_path.joinpath( f"{checkpoint_name}_{epoch}_{c_time}.cpth")) print(f"Saved checkpoint: [{checkpoint_name}_{epoch}_{c_time}.cpth]...\n")
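# The best-model bookkeeping above special-cases best_test_loss == 0 for the
# first epoch; seeding the tracker with +inf collapses "first epoch" and
# "improved epoch" into one code path (a sketch, not the original logic):
import math

best_test_loss = math.inf
for test_loss in (0.52, 0.48, 0.51, 0.40):
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        print(f'new best: {best_test_loss:.2f}')  # save the model here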
class AuxModel: def __init__(self, config, logger, wandb): self.config = config self.logger = logger self.writer = SummaryWriter(config.log_dir) self.wandb = wandb cudnn.enabled = True # set up model self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.model = get_model(config) if len(config.gpus) > 1: self.model = nn.DataParallel(self.model) self.model = self.model.to(self.device) self.best_acc = 0 self.best_AUC = 0 self.class_loss_func = nn.CrossEntropyLoss() self.pixel_loss = nn.L1Loss() if config.mode == 'train': # set up optimizer, lr scheduler and loss functions lr = config.lr self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr, betas=(.5, .999)) self.scheduler = MultiStepLR(self.optimizer, milestones=[50, 150], gamma=0.1) self.wandb.watch(self.model) self.start_iter = 0 # resume if config.training_resume: self.load(config.model_dir + '/' + config.training_resume) cudnn.benchmark = True elif config.mode == 'val': self.load(os.path.join(config.testing_model)) else: self.load(os.path.join(config.testing_model)) def entropy_loss(self, x): return torch.sum(-F.softmax(x, 1) * F.log_softmax(x, 1), 1).mean() def train_epoch_main_task(self, src_loader, tar_loader, epoch, print_freq): self.model.train() batch_time = AverageMeter() losses = AverageMeter() main_loss = AverageMeter() top1 = AverageMeter() for it, src_batch in enumerate(src_loader['main_task']): t = time.time() self.optimizer.zero_grad() src = src_batch src = to_device(src, self.device) src_imgs, src_cls_lbls = src self.optimizer.zero_grad() src_main_logits = self.model(src_imgs, 'main_task') src_main_loss = self.class_loss_func(src_main_logits, src_cls_lbls) loss = src_main_loss * self.config.loss_weight['main_task'] main_loss.update(loss.item(), src_imgs.size(0)) precision1_train, precision2_train = accuracy(src_main_logits, src_cls_lbls, topk=(1, 2)) top1.update(precision1_train[0], src_imgs.size(0)) loss.backward() self.optimizer.step() losses.update(loss.item(), src_imgs.size(0)) # measure elapsed time batch_time.update(time.time() - t) self.start_iter += 1 if self.start_iter % print_freq == 0: print_string = 'Epoch {:>2} | iter {:>4} | loss:{:.3f}| acc:{:.3f}| src_main: {:.3f} |' + '|{:4.2f} s/it' self.logger.info( print_string.format(epoch, self.start_iter, losses.avg, top1.avg, main_loss.avg, batch_time.avg)) self.writer.add_scalar('losses/all_loss', losses.avg, self.start_iter) self.writer.add_scalar('losses/src_main_loss', src_main_loss, self.start_iter) self.scheduler.step() self.wandb.log({"Train Loss": main_loss.avg}) # del loss, src_class_loss, src_aux_loss, tar_aux_loss, tar_entropy_loss # del src_aux_logits, src_class_logits # del tar_aux_logits, tar_class_logits def train_epoch_all_tasks(self, src_loader, tar_loader, epoch, print_freq): self.model.train() batch_time = AverageMeter() losses = AverageMeter() main_loss = AverageMeter() top1 = AverageMeter() start_steps = epoch * len(tar_loader['main_task']) total_steps = self.config.num_epochs * len(tar_loader['main_task']) max_num_iter_src = max([ len(src_loader[task_name]) for task_name in self.config.task_names ]) for it in range(max_num_iter_src): t = time.time() # this is based on DANN paper p = float(it + start_steps) / total_steps alpha = 2. / (1. 
+ np.exp(-10 * p)) - 1 self.optimizer.zero_grad() src = next(iter(src_loader['main_task'])) tar = next(iter(tar_loader['main_task'])) src = to_device(src, self.device) tar = to_device(tar, self.device) src_imgs, src_cls_lbls = src tar_imgs, _ = tar src_main_logits = self.model(src_imgs, 'main_task') src_main_loss = self.class_loss_func(src_main_logits, src_cls_lbls) loss = src_main_loss * self.config.loss_weight['main_task'] main_loss.update(loss.item(), src_imgs.size(0)) tar_main_logits = self.model(tar_imgs, 'main_task') tar_main_loss = self.entropy_loss(tar_main_logits) loss += tar_main_loss tar_aux_loss = {} src_aux_loss = {} #TO DO: separating dataloaders and iterate over tasks for task in self.config.task_names: if self.config.tasks[task]['type'] == 'classification_adapt': r = torch.randperm(src_imgs.size()[0] + tar_imgs.size()[0]) src_tar_imgs = torch.cat((src_imgs, tar_imgs), dim=0) src_tar_imgs = src_tar_imgs[r, :, :, :] src_tar_img = src_tar_imgs[:src_imgs.size()[0], :, :, :] src_tar_lbls = torch.cat((torch.zeros( (src_imgs.size()[0])), torch.ones( (tar_imgs.size()[0]))), dim=0) src_tar_lbls = src_tar_lbls[r] src_tar_lbls = src_tar_lbls[:src_imgs.size()[0]] src_tar_lbls = src_tar_lbls.long().cuda() src_tar_logits = self.model(src_tar_img, 'domain_classifier', alpha) tar_aux_loss['domain_classifier'] = self.class_loss_func( src_tar_logits, src_tar_lbls) loss += tar_aux_loss[ 'domain_classifier'] * self.config.loss_weight[ 'domain_classifier'] if self.config.tasks[task]['type'] == 'classification_self': src = next(iter(src_loader[task])) tar = next(iter(tar_loader[task])) src = to_device(src, self.device) tar = to_device(tar, self.device) src_aux_imgs, src_aux_lbls = src tar_aux_imgs, tar_aux_lbls = tar tar_aux_logits = self.model(tar_aux_imgs, task) src_aux_logits = self.model(src_aux_imgs, task) tar_aux_loss[task] = self.class_loss_func( tar_aux_logits, tar_aux_lbls) src_aux_loss[task] = self.class_loss_func( src_aux_logits, src_aux_lbls) loss += src_aux_loss[task] * self.config.loss_weight[ task] # todo: magnification weight loss += tar_aux_loss[task] * self.config.loss_weight[ task] # todo: main task weight if self.config.tasks[task]['type'] == 'pixel_self': src = next(iter(src_loader[task])) tar = next(iter(tar_loader[task])) src = to_device(src, self.device) tar = to_device(tar, self.device) src_aux_imgs, src_aux_lbls = src tar_aux_imgs, tar_aux_lbls = tar tar_aux_mag_logits = self.model(tar_aux_imgs, task) src_aux_mag_logits = self.model(src_aux_imgs, task) tar_aux_loss[task] = self.pixel_loss( tar_aux_mag_logits, tar_aux_lbls) src_aux_loss[task] = self.pixel_loss( src_aux_mag_logits, src_aux_lbls) loss += src_aux_loss[task] * self.config.loss_weight[ task] # todo: magnification weight loss += tar_aux_loss[task] * self.config.loss_weight[task] precision1_train, precision2_train = accuracy(src_main_logits, src_cls_lbls, topk=(1, 2)) top1.update(precision1_train[0], src_imgs.size(0)) loss.backward() self.optimizer.step() losses.update(loss.item(), src_imgs.size(0)) # measure elapsed time batch_time.update(time.time() - t) self.start_iter += 1 if self.start_iter % print_freq == 0: printt = '' for task_name in self.config.aux_task_names: if task_name == 'domain_classifier': printt = printt + ' | tar_aux_' + task_name + ': {:.3f} |' else: printt = printt + 'src_aux_' + task_name + ': {:.3f} | tar_aux_' + task_name + ': {:.3f}' print_string = 'Epoch {:>2} | iter {:>4} | loss:{:.3f} | acc: {:.3f} | src_main: {:.3f} |' + printt + '{:4.2f} s/it' src_aux_loss_all = [ loss.item() for 
loss in src_aux_loss.values() ] tar_aux_loss_all = [ loss.item() for loss in tar_aux_loss.values() ] self.logger.info( print_string.format(epoch, self.start_iter, losses.avg, top1.avg, main_loss.avg, *src_aux_loss_all, *tar_aux_loss_all, batch_time.avg)) self.writer.add_scalar('losses/all_loss', losses.avg, self.start_iter) self.writer.add_scalar('losses/src_main_loss', src_main_loss, self.start_iter) for task_name in self.config.aux_task_names: if task_name == 'domain_classifier': # self.writer.add_scalar('losses/src_aux_loss_'+task_name, src_aux_loss[task_name], i_iter) self.writer.add_scalar( 'losses/tar_aux_loss_' + task_name, tar_aux_loss[task_name], self.start_iter) else: self.writer.add_scalar( 'losses/src_aux_loss_' + task_name, src_aux_loss[task_name], self.start_iter) self.writer.add_scalar( 'losses/tar_aux_loss_' + task_name, tar_aux_loss[task_name], self.start_iter) self.scheduler.step() self.wandb.log({"Train Loss": main_loss.avg}) # del loss, src_class_loss, src_aux_loss, tar_aux_loss, tar_entropy_loss # del src_aux_logits, src_class_logits # del tar_aux_logits, tar_class_logits def train(self, src_loader, tar_loader, val_loader, test_loader): num_batches = len(src_loader['main_task']) print_freq = max(num_batches // self.config.training_num_print_epoch, 1) start_epoch = self.start_iter // num_batches num_epochs = self.config.num_epochs for epoch in range(start_epoch, num_epochs): if len(self.config.task_names) == 1: self.train_epoch_main_task(src_loader, tar_loader, epoch, print_freq) else: self.train_epoch_all_tasks(src_loader, tar_loader, epoch, print_freq) self.logger.info('learning rate: %f ' % get_lr(self.optimizer)) # validation self.save(self.config.model_dir, 'last') if val_loader is not None: self.logger.info('validating...') class_acc, AUC = self.test(val_loader) # self.writer.add_scalar('val/aux_acc', class_acc, i_iter) self.writer.add_scalar('val/class_acc', class_acc, self.start_iter) if class_acc > self.best_acc: self.best_acc = class_acc self.save(self.config.best_model_dir, 'best_acc') if AUC > self.best_AUC: self.best_AUC = AUC self.save(self.config.best_model_dir, 'best_AUC') # todo copy current model to best model self.logger.info('Best validation accuracy: {:.2f} %'.format( self.best_acc)) if test_loader is not None: self.logger.info('testing...') class_acc = self.test(test_loader) # self.writer.add_scalar('test/aux_acc', class_acc, i_iter) self.writer.add_scalar('test/class_acc', class_acc, self.start_iter) # if class_acc > self.best_acc: # self.best_acc = class_acc # todo copy current model to best model self.logger.info( 'Best testing accuracy: {:.2f} %'.format(class_acc)) self.logger.info('Best validation accuracy: {:.2f} %'.format( self.best_acc)) self.logger.info('Finished Training.') def save(self, path, ext): state = { "iter": self.start_iter + 1, "model_state": self.model.state_dict(), "optimizer_state": self.optimizer.state_dict(), "scheduler_state": self.scheduler.state_dict(), "best_acc": self.best_acc, } save_path = os.path.join(path, f'model_{ext}.pth') self.logger.info('Saving model to %s' % save_path) torch.save(state, save_path) def load(self, path): checkpoint = torch.load(path) self.model.load_state_dict(checkpoint['model_state']) self.logger.info('Loaded model from: ' + path) if self.config.mode == 'train': self.model.load_state_dict(checkpoint['model_state']) self.optimizer.load_state_dict(checkpoint['optimizer_state']) self.scheduler.load_state_dict(checkpoint['scheduler_state']) self.start_iter = checkpoint['iter'] self.best_acc = 
checkpoint['best_acc'] self.logger.info('Start iter: %d ' % self.start_iter) def test(self, val_loader): val_loader_iterator = iter(val_loader) num_val_iters = len(val_loader) tt = tqdm(range(num_val_iters), total=num_val_iters, desc="Validating") loss = AverageMeter() kk = 1 aux_correct = 0 class_correct = 0 total = 0 if self.config.dataset == 'kather': soft_labels = np.zeros((1, 9)) if self.config.dataset == 'oscc' or self.config.dataset == 'cam': soft_labels = np.zeros((1, 2)) true_labels = [] self.model.eval() with torch.no_grad(): for cur_it in tt: data = next(val_loader_iterator) data = to_device(data, self.device) imgs, cls_lbls = data # Get the inputs logits = self.model(imgs, 'main_task') test_loss = self.class_loss_func(logits, cls_lbls) loss.update(test_loss.item(), imgs.size(0)) if self.config.save_output == True: smax = nn.Softmax(dim=1) smax_out = smax(logits) soft_labels = np.concatenate( (soft_labels, smax_out.cpu().numpy()), axis=0) true_labels = np.append(true_labels, cls_lbls.cpu().numpy()) pred_trh = smax_out.cpu().numpy()[:, 1] pred_trh[pred_trh >= 0.5] = 1 pred_trh[pred_trh < 0.5] = 0 compare = cls_lbls.cpu().numpy() - pred_trh kk += 1 _, cls_pred = logits.max(dim=1) class_correct += torch.sum(cls_pred == cls_lbls) total += imgs.size(0) tt.close() self.wandb.log({"Test Loss": loss.avg}) # if self.config.save_output == True: soft_labels = soft_labels[1:, :] if self.config.dataset == 'oscc' or self.config.dataset == 'cam': AUC = calculate_stat(soft_labels, true_labels, 2, self.config.class_names, type='binary', thresh=0.5) if self.config.dataset == 'kather': AUC = calculate_stat(soft_labels, true_labels, 9, self.config.class_names, type='multi', thresh=0.5) class_acc = 100 * float(class_correct) / total self.logger.info('class_acc: {:.2f} %'.format(class_acc)) self.wandb.log({"Test acc": class_acc, "Test AUC": 100 * AUC}) return class_acc, AUC
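# train_epoch_all_tasks() ramps the gradient-reversal coefficient with the
# DANN schedule alpha = 2 / (1 + exp(-10 p)) - 1, where p is the fraction of
# training completed. Isolated as a function:
import numpy as np

def dann_alpha(step: int, total_steps: int, gamma: float = 10.0) -> float:
    """Smoothly increases from 0 at the start of training towards 1."""
    p = step / total_steps
    return 2.0 / (1.0 + np.exp(-gamma * p)) - 1.0

print(dann_alpha(0, 1000))     # 0.0
print(dann_alpha(500, 1000))   # ~0.987
print(dann_alpha(1000, 1000))  # ~0.9999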
def main(): # Views the training images and displays the distance on anchor-negative and anchor-positive # print the experiment configuration print('\nCurrent time is \33[91m{}\33[0m'.format(str(time.asctime()))) print('Parsed options: {}'.format(vars(args))) print('Number of Classes: {}\n'.format(len(train_dir.speakers))) # instantiate model and initialize weights model_kwargs = { 'input_dim': args.feat_dim, 'embedding_size': args.embedding_size, 'num_classes': len(train_dir.speakers), 'dropout_p': args.dropout_p } print('Model options: {}'.format(model_kwargs)) model = create_model(args.model, **model_kwargs) if args.cuda: model.cuda() start = 0 # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print('=> loading checkpoint {}'.format(args.resume)) checkpoint = torch.load(args.resume) start = checkpoint['epoch'] filtered = { k: v for k, v in checkpoint['state_dict'].items() if 'num_batches_tracked' not in k } model.load_state_dict(filtered) # optimizer.load_state_dict(checkpoint['optimizer']) # scheduler.load_state_dict(checkpoint['scheduler']) # criterion.load_state_dict(checkpoint['criterion']) else: print('=> no checkpoint found at {}'.format(args.resume)) ce_criterion = nn.CrossEntropyLoss() if args.loss_type == 'soft': xe_criterion = None elif args.loss_type == 'asoft': ce_criterion = None model.classifier = AngleLinear(in_features=args.embedding_size, out_features=train_dir.num_spks, m=args.m) xe_criterion = AngleSoftmaxLoss(lambda_min=args.lambda_min, lambda_max=args.lambda_max) elif args.loss_type == 'center': xe_criterion = CenterLoss(num_classes=train_dir.num_spks, feat_dim=args.embedding_size) elif args.loss_type == 'amsoft': model.classifier = AdditiveMarginLinear(feat_dim=args.embedding_size, n_classes=train_dir.num_spks) xe_criterion = AMSoftmaxLoss(margin=args.margin, s=args.s) optimizer = create_optimizer(model.parameters(), args.optimizer, **opt_kwargs) if args.loss_type == 'center': optimizer = torch.optim.SGD([{ 'params': xe_criterion.parameters(), 'lr': args.lr * 5 }, { 'params': model.parameters() }], lr=args.lr, weight_decay=args.weight_decay, momentum=args.momentum) if args.finetune: if args.loss_type == 'asoft' or args.loss_type == 'amsoft': classifier_params = list(map(id, model.classifier.parameters())) rest_params = filter(lambda p: id(p) not in classifier_params, model.parameters()) optimizer = torch.optim.SGD( [{ 'params': model.classifier.parameters(), 'lr': args.lr * 5 }, { 'params': rest_params }], lr=args.lr, weight_decay=args.weight_decay, momentum=args.momentum) milestones = args.milestones.split(',') milestones = [int(x) for x in milestones] milestones.sort() # print('Scheduler options: {}'.format(milestones)) scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=0.1) if args.save_init and not args.finetune: check_path = '{}/checkpoint_{}.pth'.format(args.check_path, start) torch.save( { 'epoch': start, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict() }, check_path) start += args.start_epoch print('Start epoch is : ' + str(start)) end = args.epochs + 1 # pdb.set_trace() train_loader = torch.utils.data.DataLoader(train_dir, batch_size=args.batch_size, shuffle=True, **kwargs) valid_loader = torch.utils.data.DataLoader(valid_dir, batch_size=args.batch_size, shuffle=False, **kwargs) test_loader = torch.utils.data.DataLoader(test_part, batch_size=args.test_batch_size,
shuffle=False, **kwargs) ce = [ce_criterion, xe_criterion] if args.cuda: model = model.cuda() for i in range(len(ce)): if ce[i] is not None: ce[i] = ce[i].cuda() for epoch in range(start, end): print('\n\33[1;34m Current \'{}\' learning rate is '.format(args.optimizer), end='') for param_group in optimizer.param_groups: print('{:.5f} '.format(param_group['lr']), end='') print(' \33[0m') train(train_loader, model, optimizer, ce, epoch) test(test_loader, valid_loader, model, epoch) scheduler.step() writer.close()
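# The AdditiveMarginLinear / AMSoftmaxLoss pair used above comes from this
# project; as a self-contained sketch of the additive-margin softmax idea
# (cosine logits, margin m subtracted from the target class, scaled by s,
# then cross-entropy). This is an illustration, not the project's exact code.
import torch
import torch.nn as nn
import torch.nn.functional as F

class AdditiveMarginSoftmax(nn.Module):
    def __init__(self, feat_dim, n_classes, m=0.35, s=30.0):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(n_classes, feat_dim) * 0.01)
        self.m, self.s = m, s

    def forward(self, embeddings, labels):
        # Cosine similarity between L2-normalized embeddings and class weights
        cos = F.linear(F.normalize(embeddings), F.normalize(self.weight))
        margin = F.one_hot(labels, cos.size(1)).float() * self.m
        return F.cross_entropy(self.s * (cos - margin), labels)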
class Processor(): """Processor for Skeleton-based Action Recognition""" def __init__(self, arg): self.arg = arg self.save_arg() if arg.phase == 'train': # Allow the command line to override the feeder's debug flag arg.train_feeder_args['debug'] = arg.train_feeder_args['debug'] or self.arg.debug logdir = os.path.join(arg.work_dir, 'trainlogs') if not arg.train_feeder_args['debug']: if os.path.isdir(logdir): print(f'log_dir {logdir} already exists') if arg.assume_yes: answer = 'y' else: answer = input('delete it? [y]/n:') if answer.lower() in ('y', ''): shutil.rmtree(logdir) print('Dir removed:', logdir) else: print('Dir not removed:', logdir) self.train_writer = SummaryWriter(os.path.join(logdir, 'train'), 'train') self.val_writer = SummaryWriter(os.path.join(logdir, 'val'), 'val') else: self.train_writer = SummaryWriter(os.path.join(logdir, 'debug'), 'debug') self.load_model() self.load_param_groups() self.load_optimizer() self.load_lr_scheduler() self.load_data() self.global_step = 0 self.lr = self.arg.base_lr self.best_acc = 0 self.best_acc_epoch = 0 if self.arg.half: self.print_log('*************************************') self.print_log('*** Using Half Precision Training ***') self.print_log('*************************************') self.model, self.optimizer = apex.amp.initialize(self.model, self.optimizer, opt_level=f'O{self.arg.amp_opt_level}') if self.arg.amp_opt_level != 1: self.print_log('[WARN] nn.DataParallel is not yet supported by amp_opt_level != "O1"') if isinstance(self.arg.device, list) and len(self.arg.device) > 1: self.print_log(f'{len(self.arg.device)} GPUs available, using DataParallel') self.model = nn.DataParallel(self.model, device_ids=self.arg.device, output_device=self.output_device) def load_model(self): output_device = self.arg.device[0] if isinstance(self.arg.device, list) else self.arg.device self.output_device = output_device Model = import_class(self.arg.model) # Copy model file and main script for reproducibility shutil.copy2(inspect.getfile(Model), self.arg.work_dir) shutil.copy2(os.path.join('.', __file__), self.arg.work_dir) self.model = Model(**self.arg.model_args).cuda(output_device) self.loss = nn.CrossEntropyLoss().cuda(output_device) self.print_log(f'Model total number of params: {count_params(self.model)}') if self.arg.weights: try: self.global_step = int(self.arg.weights[:-3].split('-')[-1]) except ValueError: print('Cannot parse global_step from model weights filename') self.global_step = 0 self.print_log(f'Loading weights from {self.arg.weights}') if '.pkl' in self.arg.weights: with open(self.arg.weights, 'rb') as f: weights = pickle.load(f) else: weights = torch.load(self.arg.weights) weights = OrderedDict([[k.split('module.')[-1], v.cuda(output_device)] for k, v in weights.items()]) for w in self.arg.ignore_weights: if weights.pop(w, None) is not None: self.print_log(f'Successfully removed weights: {w}') else: self.print_log(f'Could not remove weights: {w}') try: self.model.load_state_dict(weights) except RuntimeError: state = self.model.state_dict() diff = list(set(state.keys()).difference(set(weights.keys()))) self.print_log('Cannot find these weights:') for d in diff: self.print_log(' ' + d) state.update(weights) self.model.load_state_dict(state) def load_param_groups(self): """ Template function for setting different learning behaviour (e.g.
LR, weight decay) of different groups of parameters """ self.param_groups = defaultdict(list) for name, params in self.model.named_parameters(): self.param_groups['other'].append(params) self.optim_param_groups = { 'other': { 'params': self.param_groups['other'] } } def load_optimizer(self): params = list(self.optim_param_groups.values()) if self.arg.optimizer == 'SGD': self.optimizer = optim.SGD(params, lr=self.arg.base_lr, momentum=0.9, nesterov=self.arg.nesterov, weight_decay=self.arg.weight_decay) elif self.arg.optimizer == 'Adam': self.optimizer = optim.Adam(params, lr=self.arg.base_lr, weight_decay=self.arg.weight_decay) else: raise ValueError('Unsupported optimizer: {}'.format(self.arg.optimizer)) # Load optimizer states if any if self.arg.checkpoint is not None: self.print_log(f'Loading optimizer states from: {self.arg.checkpoint}') self.optimizer.load_state_dict(torch.load(self.arg.checkpoint)['optimizer_states']) current_lr = self.optimizer.param_groups[0]['lr'] self.print_log(f'Starting LR: {current_lr}') self.print_log(f'Starting WD1: {self.optimizer.param_groups[0]["weight_decay"]}') if len(self.optimizer.param_groups) >= 2: self.print_log(f'Starting WD2: {self.optimizer.param_groups[1]["weight_decay"]}') def load_lr_scheduler(self): self.lr_scheduler = MultiStepLR(self.optimizer, milestones=self.arg.step, gamma=0.1) if self.arg.checkpoint is not None: self.print_log(f'Loading LR scheduler states from: {self.arg.checkpoint}') scheduler_states = torch.load(self.arg.checkpoint)['lr_scheduler_states'] self.lr_scheduler.load_state_dict(scheduler_states) self.print_log(f'Starting last epoch: {scheduler_states["last_epoch"]}') self.print_log(f'Loaded milestones: {scheduler_states["milestones"]}') def load_data(self): Feeder = import_class(self.arg.feeder) self.data_loader = dict() def worker_seed_fn(worker_id): # give workers different seeds return init_seed(self.arg.seed + worker_id + 1) if self.arg.phase == 'train': self.data_loader['train'] = torch.utils.data.DataLoader(dataset=Feeder(**self.arg.train_feeder_args), batch_size=self.arg.batch_size, shuffle=True, num_workers=self.arg.num_worker, drop_last=True, worker_init_fn=worker_seed_fn) self.data_loader['test'] = torch.utils.data.DataLoader(dataset=Feeder(**self.arg.test_feeder_args), batch_size=self.arg.test_batch_size, shuffle=False, num_workers=self.arg.num_worker, drop_last=False, worker_init_fn=worker_seed_fn) def save_arg(self): # save arg arg_dict = vars(self.arg) if not os.path.exists(self.arg.work_dir): os.makedirs(self.arg.work_dir) with open(os.path.join(self.arg.work_dir, 'config.yaml'), 'w') as f: yaml.dump(arg_dict, f) def print_time(self): localtime = time.asctime(time.localtime(time.time())) self.print_log(f'Local current time: {localtime}') def print_log(self, s, print_time=True): if print_time: localtime = time.asctime(time.localtime(time.time())) s = f'[ {localtime} ] {s}' print(s) if self.arg.print_log: with open(os.path.join(self.arg.work_dir, 'log.txt'), 'a') as f: print(s, file=f) def record_time(self): self.cur_time = time.time() return self.cur_time def split_time(self): split_time = time.time() - self.cur_time self.record_time() return split_time def save_states(self, epoch, states, out_folder, out_name): out_folder_path = os.path.join(self.arg.work_dir, out_folder) out_path = os.path.join(out_folder_path, out_name) os.makedirs(out_folder_path, exist_ok=True) torch.save(states, out_path) def save_checkpoint(self, epoch, out_folder='checkpoints'): state_dict = { 'epoch': epoch,
'optimizer_states': self.optimizer.state_dict(), 'lr_scheduler_states': self.lr_scheduler.state_dict(), } checkpoint_name = f'checkpoint-{epoch}-fwbz{self.arg.forward_batch_size}-{int(self.global_step)}.pt' self.save_states(epoch, state_dict, out_folder, checkpoint_name) def save_weights(self, epoch, out_folder='weights'): state_dict = self.model.state_dict() weights = OrderedDict([[k.split('module.')[-1], v.cpu()] for k, v in state_dict.items()]) weights_name = f'weights-{epoch}-{int(self.global_step)}.pt' self.save_states(epoch, weights, out_folder, weights_name) def train(self, epoch, save_model=False): self.model.train() loader = self.data_loader['train'] loss_values = [] self.train_writer.add_scalar('epoch', epoch + 1, self.global_step) self.record_time() timer = dict(dataloader=0.001, model=0.001, statistics=0.001) current_lr = self.optimizer.param_groups[0]['lr'] self.print_log(f'Training epoch: {epoch + 1}, LR: {current_lr:.4f}') process = tqdm(loader, dynamic_ncols=True) for batch_idx, (data, label, index) in enumerate(process): self.global_step += 1 # get data with torch.no_grad(): data = data.float().cuda(self.output_device) label = label.long().cuda(self.output_device) timer['dataloader'] += self.split_time() # backward self.optimizer.zero_grad() ############## Gradient Accumulation for Smaller Batches ############## real_batch_size = self.arg.forward_batch_size splits = len(data) // real_batch_size assert len(data) % real_batch_size == 0, \ 'Real batch size should be a factor of arg.batch_size!' for i in range(splits): left = i * real_batch_size right = left + real_batch_size batch_data, batch_label = data[left:right], label[left:right] # forward output = self.model(batch_data) if isinstance(output, tuple): output, l1 = output l1 = l1.mean() else: l1 = 0 loss = self.loss(output, batch_label) / splits if self.arg.half: with apex.amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() loss_values.append(loss.item()) timer['model'] += self.split_time() # Display loss process.set_description( f'(BS {real_batch_size}) loss: {loss.item():.4f}') value, predict_label = torch.max(output, 1) acc = torch.mean((predict_label == batch_label).float()) self.train_writer.add_scalar('acc', acc, self.global_step) self.train_writer.add_scalar('loss', loss.item() * splits, self.global_step) self.train_writer.add_scalar('loss_l1', l1, self.global_step) ##################################### # torch.nn.utils.clip_grad_norm_(self.model.parameters(), 2) self.optimizer.step() # statistics self.lr = self.optimizer.param_groups[0]['lr'] self.train_writer.add_scalar('lr', self.lr, self.global_step) timer['statistics'] += self.split_time() # Delete output/loss after each batch since it may introduce extra mem during scoping # https://discuss.pytorch.org/t/gpu-memory-consumption-increases-while-training/2770/3 del output del loss # statistics of time consumption and loss proportion = { k: f'{int(round(v * 100 / sum(timer.values()))):02d}%' for k, v in timer.items() } mean_loss = np.mean(loss_values) num_splits = self.arg.batch_size // self.arg.forward_batch_size self.print_log( f'\tMean training loss: {mean_loss:.4f} (BS {self.arg.batch_size}: {mean_loss * num_splits:.4f}).' 
) self.print_log( '\tTime consumption: [Data]{dataloader}, [Network]{model}'.format( **proportion)) # PyTorch > 1.2.0: update LR scheduler here with `.step()` # and make sure to save the `lr_scheduler.state_dict()` as part of checkpoint self.lr_scheduler.step() if save_model: # save training checkpoint & weights self.save_weights(epoch + 1) self.save_checkpoint(epoch + 1) def eval(self, epoch, save_score=False, loader_name=['test'], wrong_file=None, result_file=None): # Skip evaluation if too early if epoch + 1 < self.arg.eval_start: return if wrong_file is not None: f_w = open(wrong_file, 'w') if result_file is not None: f_r = open(result_file, 'w') with torch.no_grad(): self.model = self.model.cuda(self.output_device) self.model.eval() self.print_log(f'Eval epoch: {epoch + 1}') for ln in loader_name: loss_values = [] score_batches = [] step = 0 process = tqdm(self.data_loader[ln], dynamic_ncols=True) for batch_idx, (data, label, index) in enumerate(process): data = data.float().cuda(self.output_device) label = label.long().cuda(self.output_device) output = self.model(data) if isinstance(output, tuple): output, l1 = output l1 = l1.mean() else: l1 = 0 loss = self.loss(output, label) score_batches.append(output.data.cpu().numpy()) loss_values.append(loss.item()) _, predict_label = torch.max(output.data, 1) step += 1 if wrong_file is not None or result_file is not None: predict = list(predict_label.cpu().numpy()) true = list(label.data.cpu().numpy()) for i, x in enumerate(predict): if result_file is not None: f_r.write(str(x) + ',' + str(true[i]) + '\n') if x != true[i] and wrong_file is not None: f_w.write( str(index[i]) + ',' + str(x) + ',' + str(true[i]) + '\n') score = np.concatenate(score_batches) loss = np.mean(loss_values) accuracy = self.data_loader[ln].dataset.top_k(score, 1) if accuracy > self.best_acc: self.best_acc = accuracy self.best_acc_epoch = epoch + 1 print('Accuracy: ', accuracy, ' model: ', self.arg.work_dir) if self.arg.phase == 'train' and not self.arg.debug: self.val_writer.add_scalar('loss', loss, self.global_step) self.val_writer.add_scalar('loss_l1', l1, self.global_step) self.val_writer.add_scalar('acc', accuracy, self.global_step) score_dict = dict( zip(self.data_loader[ln].dataset.sample_name, score)) self.print_log( f'\tMean {ln} loss of {len(self.data_loader[ln])} batches: {np.mean(loss_values)}.' 
) for k in self.arg.show_topk: self.print_log(f'\tTop {k}: {100 * self.data_loader[ln].dataset.top_k(score, k):.2f}%') if save_score: with open('{}/epoch{}_{}_score.pkl'.format(self.arg.work_dir, epoch + 1, ln), 'wb') as f: pickle.dump(score_dict, f) # Close the per-sample output files opened at the top of eval() if wrong_file is not None: f_w.close() if result_file is not None: f_r.close() # Empty cache after evaluation torch.cuda.empty_cache() def start(self): if self.arg.phase == 'train': self.print_log(f'Parameters:\n{pprint.pformat(vars(self.arg))}\n') self.print_log(f'Model total number of params: {count_params(self.model)}') self.global_step = self.arg.start_epoch * len(self.data_loader['train']) / self.arg.batch_size for epoch in range(self.arg.start_epoch, self.arg.num_epoch): save_model = ((epoch + 1) % self.arg.save_interval == 0) or (epoch + 1 == self.arg.num_epoch) self.train(epoch, save_model=save_model) self.eval(epoch, save_score=self.arg.save_score, loader_name=['test']) num_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad) self.print_log(f'Best accuracy: {self.best_acc}') self.print_log(f'Epoch number: {self.best_acc_epoch}') self.print_log(f'Model name: {self.arg.work_dir}') self.print_log(f'Model total number of params: {num_params}') self.print_log(f'Weight decay: {self.arg.weight_decay}') self.print_log(f'Base LR: {self.arg.base_lr}') self.print_log(f'Batch Size: {self.arg.batch_size}') self.print_log(f'Forward Batch Size: {self.arg.forward_batch_size}') self.print_log(f'Test Batch Size: {self.arg.test_batch_size}') elif self.arg.phase == 'test': if not self.arg.test_feeder_args['debug']: wf = os.path.join(self.arg.work_dir, 'wrong-samples.txt') rf = os.path.join(self.arg.work_dir, 'right-samples.txt') else: wf = rf = None if self.arg.weights is None: raise ValueError('Please specify --weights for testing.') self.print_log(f'Model: {self.arg.model}') self.print_log(f'Weights: {self.arg.weights}') self.eval(epoch=0, save_score=self.arg.save_score, loader_name=['test'], wrong_file=wf, result_file=rf) self.print_log('Done.\n')
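# Processor.train() above accumulates gradients over forward_batch_size-sized
# chunks so a large logical batch fits in memory. A minimal, self-contained
# sketch of that pattern (the function name is ours, not the project's):
def accumulate_step(model, criterion, optimizer, data, labels, forward_bs):
    assert len(data) % forward_bs == 0, 'forward_bs must divide the batch size'
    splits = len(data) // forward_bs
    optimizer.zero_grad()
    for i in range(splits):
        sl = slice(i * forward_bs, (i + 1) * forward_bs)
        # Normalize by the number of chunks so the summed gradient matches
        # a single full-batch backward pass
        (criterion(model(data[sl]), labels[sl]) / splits).backward()
    optimizer.step()  # one update per logical batch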
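# The dataset's top_k() scorer used in eval() is not shown here; a plausible
# minimal implementation (an assumption, not the project's code): the fraction
# of samples whose true label is among the k highest-scoring classes.
import numpy as np

def top_k(score, labels, k):
    topk_idx = score.argsort(axis=1)[:, -k:]  # ascending sort: last k columns
    hits = [labels[i] in topk_idx[i] for i in range(len(labels))]
    return float(np.mean(hits))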
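# load_model() above tolerates DataParallel-prefixed and partially matching
# checkpoints; the same logic as a small reusable helper (a sketch):
from collections import OrderedDict
import torch

def load_tolerant(model, weights_path, map_location='cpu'):
    weights = torch.load(weights_path, map_location=map_location)
    weights = OrderedDict((k.split('module.')[-1], v) for k, v in weights.items())
    try:
        model.load_state_dict(weights)
    except RuntimeError:  # missing/unexpected keys: fall back to partial update
        state = model.state_dict()
        state.update({k: v for k, v in weights.items() if k in state})
        model.load_state_dict(state)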
def main(): global best_RMSE lw = utils_func.LossWise(args.api_key, args.losswise_tag, args.epochs - 1) # set logger log = logger.setup_logger(os.path.join(args.save_path, 'training.log')) for key, value in sorted(vars(args).items()): log.info(str(key) + ': ' + str(value)) # set tensorboard writer = SummaryWriter(args.save_path + '/tensorboardx') # Data Loader if args.generate_depth_map: TrainImgLoader = None import dataloader.KITTI_submission_loader as KITTI_submission_loader TestImgLoader = torch.utils.data.DataLoader(KITTI_submission_loader.SubmiteDataset(args.datapath, args.data_list, args.dynamic_bs), batch_size=args.bval, shuffle=False, num_workers=args.workers, drop_last=False) elif args.dataset == 'kitti': train_data, val_data = KITTILoader3D.dataloader(args.datapath, args.split_train, args.split_val, kitti2015=args.kitti2015) TrainImgLoader = torch.utils.data.DataLoader(KITTILoader_dataset3d.myImageFloder(train_data, True, kitti2015=args.kitti2015, dynamic_bs=args.dynamic_bs), batch_size=args.btrain, shuffle=True, num_workers=8, drop_last=False, pin_memory=True) TestImgLoader = torch.utils.data.DataLoader(KITTILoader_dataset3d.myImageFloder(val_data, False, kitti2015=args.kitti2015, dynamic_bs=args.dynamic_bs), batch_size=args.bval, shuffle=False, num_workers=8, drop_last=False, pin_memory=True) else: train_data, val_data = listflowfile.dataloader(args.datapath) TrainImgLoader = torch.utils.data.DataLoader(SceneFlowLoader.myImageFloder(train_data, True, calib=args.calib_value), batch_size=args.btrain, shuffle=True, num_workers=8, drop_last=False) TestImgLoader = torch.utils.data.DataLoader(SceneFlowLoader.myImageFloder(val_data, False, calib=args.calib_value), batch_size=args.bval, shuffle=False, num_workers=8, drop_last=False) # Load Model if args.data_type == 'disparity': model = disp_models.__dict__[args.arch](maxdisp=args.maxdisp) elif args.data_type == 'depth': model = models.__dict__[args.arch](maxdepth=args.maxdepth, maxdisp=args.maxdisp, down=args.down, scale=args.scale) else: log.info('Model is not implemented') raise NotImplementedError(args.data_type) # Number of parameters log.info('Number of model parameters: {}'.format(sum([p.data.nelement() for p in model.parameters()]))) model = nn.DataParallel(model).cuda() torch.backends.cudnn.benchmark = True # Optimizer optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.999)) scheduler = MultiStepLR(optimizer, milestones=args.lr_stepsize, gamma=args.lr_gamma) if args.pretrain: if os.path.isfile(args.pretrain): log.info("=> loading pretrain '{}'".format(args.pretrain)) checkpoint = torch.load(args.pretrain) model.load_state_dict(checkpoint['state_dict'], strict=False) else: log.info('[Attention]: Could not find checkpoint {}'.format(args.pretrain)) if args.resume: if os.path.isfile(args.resume): log.info("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) model.load_state_dict(checkpoint['state_dict']) args.start_epoch = checkpoint['epoch'] optimizer.load_state_dict(checkpoint['optimizer']) best_RMSE = checkpoint['best_RMSE'] scheduler.load_state_dict(checkpoint['scheduler']) log.info("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch'])) else: log.info('[Attention]: Could not find checkpoint {}'.format(args.resume)) if args.generate_depth_map: os.makedirs(args.save_path + '/depth_maps/' + args.data_tag, exist_ok=True) tqdm_eval_loader = tqdm(TestImgLoader, total=len(TestImgLoader)) for batch_idx, (imgL_crop, imgR_crop, calib, H, W, filename) in enumerate(tqdm_eval_loader):
pred_disp = inference(imgL_crop, imgR_crop, calib, model) for idx, name in enumerate(filename): np.save(args.save_path + '/depth_maps/' + args.data_tag + '/' + name, pred_disp[idx][-H[idx]:, :W[idx]]) import sys sys.exit() # evaluation if args.evaluate: evaluate_metric = utils_func.Metric() ## evaluation ## for batch_idx, (imgL_crop, imgR_crop, disp_crop_L, calib) in enumerate(TestImgLoader): start_time = time.time() test(imgL_crop, imgR_crop, disp_crop_L, calib, evaluate_metric, optimizer, model) log.info(evaluate_metric.print(batch_idx, 'EVALUATE') + ' Time:{:.3f}'.format(time.time() - start_time)) import sys sys.exit() for epoch in range(args.start_epoch, args.epochs): scheduler.step() ## training ## train_metric = utils_func.Metric() tqdm_train_loader = tqdm(TrainImgLoader, total=len(TrainImgLoader)) for batch_idx, (imgL_crop, imgR_crop, disp_crop_L, calib) in enumerate(tqdm_train_loader): train(imgL_crop, imgR_crop, disp_crop_L, calib, train_metric, optimizer, model, epoch) log.info(train_metric.print(0, 'TRAIN Epoch' + str(epoch))) train_metric.tensorboard(writer, epoch, token='TRAIN') lw.update(train_metric.get_info(), epoch, 'Train') ## testing ## is_best = False if epoch == 0 or ((epoch + 1) % args.eval_interval) == 0: test_metric = utils_func.Metric() tqdm_test_loader = tqdm(TestImgLoader, total=len(TestImgLoader)) for batch_idx, (imgL_crop, imgR_crop, disp_crop_L, calib) in enumerate(tqdm_test_loader): test(imgL_crop, imgR_crop, disp_crop_L, calib, test_metric, optimizer, model) log.info(test_metric.print(0, 'TEST Epoch' + str(epoch))) test_metric.tensorboard(writer, epoch, token='TEST') lw.update(test_metric.get_info(), epoch, 'Test') # SAVE is_best = test_metric.RMSELIs.avg < best_RMSE best_RMSE = min(test_metric.RMSELIs.avg, best_RMSE) save_checkpoint({ 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_RMSE': best_RMSE, 'scheduler': scheduler.state_dict(), 'optimizer': optimizer.state_dict(), }, is_best, epoch, folder=args.save_path) lw.done()
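# save_checkpoint() above is imported from elsewhere; a common minimal
# implementation consistent with the call site (state, is_best, epoch, folder)
# is sketched below. It keeps a per-epoch file and copies the best-RMSE model
# aside; the filenames are assumptions, not the project's actual ones.
import os
import shutil
import torch

def save_checkpoint(state, is_best, epoch, folder='.'):
    path = os.path.join(folder, 'checkpoint_{}.pth.tar'.format(epoch))
    torch.save(state, path)
    if is_best:  # keep a stable pointer to the best model so far
        shutil.copyfile(path, os.path.join(folder, 'model_best.pth.tar'))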
def train(args): use_gpu = torch.cuda.is_available() num_gpu = list(range(torch.cuda.device_count())) assert use_gpu, "CUDA is required: please run with GPUs available." logger = get_logger(name=args.shortname) display_args(args, logger) # create dir for saving args.saverootpath = osp.abspath(args.saverootpath) savepath = osp.join(args.saverootpath, args.run_name) if not osp.exists(savepath): os.makedirs(savepath) train_file = os.path.join(args.image_sets, "{}.txt".format(args.train_dataset)) n_features = 35 if args.no_reflex else 36 if args.pixor_fusion: if args.e2e: train_data = KittiDataset_Fusion_stereo(txt_file=train_file, flip_rate=args.flip_rate, lidar_dir=args.eval_lidar_dir, label_dir=args.eval_label_dir, calib_dir=args.eval_calib_dir, image_dir=args.eval_image_dir, root_dir=args.root_dir, only_feature=args.no_cal_loss, split=args.split, image_downscale=args.image_downscale, crop_height=args.crop_height, random_shift_scale=args.random_shift_scale) else: train_data = KittiDataset_Fusion(txt_file=train_file, flip_rate=args.flip_rate, lidar_dir=args.train_lidar_dir, label_dir=args.train_label_dir, calib_dir=args.train_calib_dir, n_features=n_features, random_shift_scale=args.random_shift_scale, root_dir=args.root_dir, image_downscale=args.image_downscale) else: train_data = KittiDataset(txt_file=train_file, flip_rate=args.flip_rate, lidar_dir=args.train_lidar_dir, label_dir=args.train_label_dir, calib_dir=args.train_calib_dir, image_dir=args.train_image_dir, n_features=n_features, random_shift_scale=args.random_shift_scale, root_dir=args.root_dir) train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=8) eval_data, eval_loader = get_eval_dataset(args) if args.pixor_fusion: pixor = PixorNet_Fusion(n_features, groupnorm=args.groupnorm, resnet_type=args.resnet_type, image_downscale=args.image_downscale, resnet_chls=args.resnet_chls) else: pixor = PixorNet(n_features, groupnorm=args.groupnorm) pixor = pixor.cuda() pixor = nn.DataParallel(pixor, device_ids=num_gpu) class_criterion = nn.BCELoss(reduction='none') reg_criterion = nn.SmoothL1Loss(reduction='none') if args.opt_method == 'RMSprop': optimizer = optim.RMSprop(pixor.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) else: raise NotImplementedError() depth_model = PSMNet(maxdepth=80, maxdisp=192, down=args.depth_down) depth_model = nn.DataParallel(depth_model).cuda() # torch.backends.cudnn.benchmark = True depth_optimizer = optim.Adam(depth_model.parameters(), lr=args.depth_lr, betas=(0.9, 0.999)) grid_3D_extended = get_3D_global_grid_extended(700, 800, 35).cuda().float() if args.depth_pretrain: if os.path.isfile(args.depth_pretrain): logger.info("=> loading depth pretrain '{}'".format(args.depth_pretrain)) checkpoint = torch.load(args.depth_pretrain) depth_model.load_state_dict(checkpoint['state_dict']) depth_optimizer.load_state_dict(checkpoint['optimizer']) else: logger.info('[Attention]: Could not find checkpoint {}'.format(args.depth_pretrain)) depth_scheduler = MultiStepLR(depth_optimizer, milestones=args.depth_lr_stepsize, gamma=args.depth_lr_gamma) if args.pixor_pretrain: if os.path.isfile(args.pixor_pretrain): logger.info("=> loading pixor pretrain '{}'".format(args.pixor_pretrain)) checkpoint = torch.load(args.pixor_pretrain) pixor.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) optimizer.param_groups[0]['lr'] *= 10 else: logger.info('[Attention]: Could not find checkpoint {}'.format(args.pixor_pretrain)) scheduler =
lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_milestones, gamma=args.gamma) if args.resume: logger.info("Resuming...") checkpoint_path = osp.join(savepath, args.checkpoint) if os.path.isfile(checkpoint_path): logger.info("Loading checkpoint '{}'".format(checkpoint_path)) checkpoint = torch.load(checkpoint_path) pixor.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) scheduler.load_state_dict(checkpoint['scheduler']) depth_model.load_state_dict(checkpoint['depth_state_dict']) depth_optimizer.load_state_dict(checkpoint['depth_optimizer']) depth_scheduler.load_state_dict(checkpoint['depth_scheduler']) start_epoch = checkpoint['epoch'] + 1 logger.info( "Resumed successfully from epoch {}.".format(start_epoch)) else: logger.warning("Model {} not found. " "Train from scratch".format(checkpoint_path)) start_epoch = 0 else: start_epoch = 0 class_criterion = class_criterion.cuda() reg_criterion = reg_criterion.cuda() processes = [] last_eval_epoches = [] for epoch in range(start_epoch, args.epochs): pixor.train() depth_model.train() scheduler.step() depth_scheduler.step() ts = time.time() logger.info("Start epoch {}, depth lr {:.6f} pixor lr {:.7f}".format( epoch, depth_optimizer.param_groups[0]['lr'], optimizer.param_groups[0]['lr'])) avg_class_loss = AverageMeter() avg_reg_loss = AverageMeter() avg_total_loss = AverageMeter() train_metric = utils_func.Metric() for iteration, batch in enumerate(train_loader): if args.pixor_fusion: if not args.e2e: inputs = batch['X'].cuda() else: imgL = batch['imgL'].cuda() imgR = batch['imgR'].cuda() f = batch['f'] depth_map = batch['depth_map'].cuda() idxx = batch['idx'] h_shift = batch['h_shift'] ori_shape = batch['ori_shape'] a_shift = batch['a_shift'] flip = batch['flip'] images = batch['image'].cuda() img_index = batch['img_index'].cuda() bev_index = batch['bev_index'].cuda() else: inputs = batch['X'].cuda() class_labels = batch['cl'].cuda() reg_labels = batch['rl'].cuda() if args.pixor_fusion: if not args.e2e: class_outs, reg_outs = pixor(inputs, images, img_index, bev_index) else: depth_loss, depth_map = forward_depth_model( imgL, imgR, depth_map, f, train_metric, depth_model) inputs = [] for i in range(depth_map.shape[0]): calib = utils_func.torchCalib( train_data.dataset.get_calibration(idxx[i]), h_shift[i]) H, W = ori_shape[0][i], ori_shape[1][i] depth = depth_map[i][-H:, :W] ptc = depth_to_pcl(calib, depth, max_high=1.) 
ptc = calib.lidar_to_rect(ptc[:, 0:3]) if torch.abs(a_shift[i]).item() > 1e-6: roty = utils_func.roty_pth(a_shift[i]).cuda() ptc = torch.mm(ptc, roty.t()) voxel = gen_feature_diffused_tensor(ptc, 700, 800, grid_3D_extended, diffused=args.diffused) if flip[i] > 0: voxel = torch.flip(voxel, [2]) inputs.append(voxel) inputs = torch.stack(inputs) class_outs, reg_outs = pixor(inputs, images, img_index, bev_index) else: class_outs, reg_outs = pixor(inputs) class_outs = class_outs.squeeze(1) class_loss, reg_loss, loss = compute_loss(epoch, class_outs, reg_outs, class_labels, reg_labels, class_criterion, reg_criterion, args) avg_class_loss.update(class_loss.item()) avg_reg_loss.update(reg_loss.item() if not isinstance(reg_loss, int) else reg_loss) avg_total_loss.update(loss.item()) optimizer.zero_grad() depth_optimizer.zero_grad() if args.pixor_fusion and args.e2e: # depth_loss only exists on the end-to-end path loss = depth_loss + 0.1 * loss loss.backward() optimizer.step() depth_optimizer.step() if not isinstance(reg_loss, int): reg_loss = reg_loss.item() if iteration % args.logevery == 0: logger.info("epoch {:d}, iter {:d}, class_loss: {:.5f}, reg_loss: {:.5f}, loss: {:.5f}".format(epoch, iteration, avg_class_loss.avg, avg_reg_loss.avg, avg_total_loss.avg)) logger.info(train_metric.print(epoch, iteration)) logger.info("Finish epoch {}, time elapsed {:.3f} s".format(epoch, time.time() - ts)) if epoch % args.eval_every_epoch == 0 and epoch >= args.start_eval: logger.info("Evaluation begins at epoch {}".format(epoch)) evaluate(eval_data, eval_loader, pixor, depth_model, args.batch_size, gpu=use_gpu, logger=logger, args=args, epoch=epoch, processes=processes, grid_3D_extended=grid_3D_extended) if args.run_official_evaluate: last_eval_epoches.append((epoch, 7)) last_eval_epoches.append((epoch, 5)) if len(last_eval_epoches) > 0: for e, iou in last_eval_epoches[:]: predicted_results = osp.join(args.saverootpath, args.run_name, 'predicted_label_{}'.format(e), 'outputs_{:02d}.txt'.format(iou)) if osp.exists(predicted_results): with open(predicted_results, 'r') as f: for line in f.readlines(): if line.startswith('car_detection_ground AP'): results = [float(num) for num in line.strip('\n').split(' ')[-3:]] last_eval_epoches.remove((e, iou)) if epoch % args.save_every == 0: saveto = osp.join(savepath, "checkpoint_{}.pth.tar".format(epoch)) torch.save({ 'state_dict': pixor.state_dict(), 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), 'depth_state_dict': depth_model.state_dict(), 'depth_optimizer': depth_optimizer.state_dict(), 'depth_scheduler': depth_scheduler.state_dict(), 'epoch': epoch }, saveto) logger.info("model saved to {}".format(saveto)) symlink_force(saveto, osp.join(savepath, "checkpoint.pth.tar")) for p in processes: if p.wait() != 0: logger.warning("An evaluation subprocess exited with a non-zero status") if len(last_eval_epoches) > 0: for e, iou in last_eval_epoches[:]: predicted_results = osp.join(args.saverootpath, args.run_name, 'predicted_label_{}'.format(e), 'outputs_{:02d}.txt'.format(iou)) if osp.exists(predicted_results): with open(predicted_results, 'r') as f: for line in f.readlines(): if line.startswith('car_detection_ground AP'): results = [float(num) for num in line.strip('\n').split(' ')[-3:]] last_eval_epoches.remove((e, iou))
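# The AP-parsing block above appears twice (inside the epoch loop and again
# after it); a hedged refactor into one helper. The field layout of the
# 'car_detection_ground AP' line is taken from the code above; the function
# name and the returned dict are our own suggestion, not the project's API.
import os.path as osp

def drain_eval_results(saverootpath, run_name, pending):
    """Parse finished (epoch, iou) result files and drop them from `pending`."""
    parsed = {}
    for e, iou in pending[:]:
        path = osp.join(saverootpath, run_name, 'predicted_label_{}'.format(e),
                        'outputs_{:02d}.txt'.format(iou))
        if not osp.exists(path):
            continue
        with open(path) as f:
            for line in f:
                if line.startswith('car_detection_ground AP'):
                    parsed[(e, iou)] = [float(x) for x in line.split(' ')[-3:]]
        pending.remove((e, iou))
    return parsed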