def main(args):
    utils.init_distributed_mode(args)
    print(args)

    # device = torch.device(args.device)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print("Loading data")
    # dataset, num_classes = get_dataset(args.dataset, "train", get_transform(train=True), args.data_path)
    # dataset_test, _ = get_dataset(args.dataset, "val", get_transform(train=False), args.data_path)
    dataset = CustomDataset(args.img_data_path, args.anno_data_path,
                            transforms=get_transform(train=True))

    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(.2 * dataset_size))
    random_seed = 42
    np.random.seed(random_seed)
    np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    train_sampler = SubsetRandomSampler(train_indices)
    test_sampler = SubsetRandomSampler(val_indices)

    print("Creating data loaders")
    # if args.distributed:
    #     train_sampler = torch.utils.data.distributed.DistributedSampler(dataset_train)
    #     test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
    # else:
    #     train_sampler = torch.utils.data.RandomSampler(dataset_train)
    #     test_sampler = torch.utils.data.SequentialSampler(dataset_test)
    #
    # if args.aspect_ratio_group_factor >= 0:
    #     group_ids = create_aspect_ratio_groups(dataset_train, k=args.aspect_ratio_group_factor)
    #     train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
    # else:
    #     train_batch_sampler = torch.utils.data.BatchSampler(
    #         train_sampler, args.batch_size, drop_last=True)

    data_loader = torch.utils.data.DataLoader(dataset, batch_size=2,
                                              sampler=train_sampler,
                                              num_workers=args.workers,
                                              collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(dataset, batch_size=1,
                                                   sampler=test_sampler,
                                                   num_workers=args.workers,
                                                   collate_fn=utils.collate_fn)

    print("Creating model")
    num_classes = 2  # 1 class (Car) + background
    model = torchvision.models.detection.__dict__[args.model](
        num_classes=num_classes, pretrained=args.pretrained)
    model.to(device)

    model_without_ddp = model
    # if args.distributed:
    #     model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
    #     model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        evaluate(model, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        # if args.distributed:
        #     train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq)
        lr_scheduler.step()
        if args.output_dir:
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'args': args,
                    'epoch': epoch
                },
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))

        # evaluate after every epoch
        evaluate(model, data_loader_test, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
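# ---------------------------------------------------------------------------
# Hypothetical entry point (not part of the original script): a minimal
# argparse sketch listing only the attributes the main() above reads.
# Defaults are assumptions; arguments expected by utils.init_distributed_mode
# (e.g. world size / dist URL) are omitted here.
# ---------------------------------------------------------------------------
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Detection training on a custom dataset")
    parser.add_argument('--img-data-path', required=True, help='image folder for CustomDataset')
    parser.add_argument('--anno-data-path', required=True, help='annotation file/folder for CustomDataset')
    parser.add_argument('--model', default='fasterrcnn_resnet50_fpn',
                        help='name looked up in torchvision.models.detection')
    parser.add_argument('--pretrained', action='store_true')
    parser.add_argument('--workers', type=int, default=4)
    parser.add_argument('--lr', type=float, default=0.005)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--weight-decay', type=float, default=1e-4)
    parser.add_argument('--lr-steps', nargs='+', type=int, default=[8, 11])
    parser.add_argument('--lr-gamma', type=float, default=0.1)
    parser.add_argument('--start-epoch', type=int, default=0)
    parser.add_argument('--epochs', type=int, default=13)
    parser.add_argument('--print-freq', type=int, default=20)
    parser.add_argument('--resume', default='', help='checkpoint path to resume from')
    parser.add_argument('--test-only', action='store_true')
    parser.add_argument('--output-dir', default='.')
    return parser.parse_args()

if __name__ == "__main__":
    main(parse_args())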
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # Data loading code
    print("Loading data")
    dataset, num_classes = get_dataset(args.dataset, "train",
                                       get_transform(train=True), args.data_path)
    dataset_test, _ = get_dataset(args.dataset, "val",
                                  get_transform(train=False), args.data_path)

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    if args.aspect_ratio_group_factor >= 0:
        group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(train_sampler,
                                                            args.batch_size,
                                                            drop_last=True)

    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_sampler=train_batch_sampler,
                                              num_workers=args.workers,
                                              collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=1,
                                                   sampler=test_sampler,
                                                   num_workers=args.workers,
                                                   collate_fn=utils.collate_fn)

    print("Creating model")
    # model = torchvision.models.detection.__dict__[args.model](num_classes=num_classes,
    #                                                           pretrained=args.pretrained)
    model = get_model(num_classes=num_classes)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    if args.resume:
        print("----------------------Resume--------------")
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        evaluate(model, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq)
        lr_scheduler.step()
        if args.output_dir:
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'args': args,
                    'epoch': epoch
                },
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))

        # evaluate after every epoch
        evaluate(model, data_loader_test, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def run(self):
    dataset = Datalayer(self.config, transform=tf.Compose([
        tf.ToTensor(),
    ]))
    data_loader = DataLoader(dataset=dataset,
                             shuffle=True,
                             batch_size=self.config['Dataset']['BatchSize'],
                             num_workers=self.config['Dataset']['NumWorkers'],
                             pin_memory=True)

    # Create the model
    model = getModels(self.config['Model']['Name'],
                      num_classes=self.config['Model']['NumClass'],
                      pretrained=self.config['Model']['IsPretrained']).to(self.device)

    # Create the loss function
    criterion = getLossFuns()

    # Create the optimizer and LR scheduler
    optimizer = make_optimizer(cfg=self.config, model=model)
    lr_scheduler = GeneralLR_Scheduler(optimizer, self.config,
                                       max_iter=len(data_loader) * self.config['Dataset']['Epochs'])

    start_epoch = 0
    # Resume training
    if self.config['Model']['IsResume']:
        checkpoint = torch.load(os.path.join(self.output_model_path,
                                             self.config['Misc']['BestModelName']),
                                map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        start_epoch = checkpoint['epoch'] + 1

    # Start training
    print("Start training")
    for epoch in range(start_epoch, self.config['Dataset']['Epochs']):
        self.train_one_epoch(model, criterion, optimizer, lr_scheduler,
                             data_loader, self.device, epoch, print_freq=10)
        if self.output_model_path:
            checkpoint = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch
            }
            utils.save_on_master(
                checkpoint,
                os.path.join(self.output_model_path,
                             self.config['Misc']['BestModelName']))
            if epoch % self.config['Model']['OutputFreq'] == 0:
                utils.save_on_master(
                    checkpoint,
                    os.path.join(self.output_model_path,
                                 'model_{}.pth'.format(epoch)))
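# ---------------------------------------------------------------------------
# A hypothetical config dict (an assumption, not from the original project)
# showing only the keys run() above accesses directly; values are placeholders.
# make_optimizer() and GeneralLR_Scheduler() may require additional keys
# that are not listed here.
# ---------------------------------------------------------------------------
example_config = {
    'Dataset': {
        'BatchSize': 8,
        'NumWorkers': 4,
        'Epochs': 50,
    },
    'Model': {
        'Name': 'resnet18',          # passed to getModels()
        'NumClass': 2,
        'IsPretrained': True,
        'IsResume': False,
        'OutputFreq': 5,             # save a numbered checkpoint every N epochs
    },
    'Misc': {
        'BestModelName': 'model_best.pth',
    },
}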
def main(args):
    cfg = get_default_cfg()
    cfg.merge_from_file('/home/lh/project/SeqNet/exp_cuhk/config.yaml')
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    device = torch.device(cfg.DEVICE)
    if cfg.SEED >= 0:
        set_random_seed(cfg.SEED)

    print("Creating model")
    model = SeqNet(cfg)
    model.to(device)

    print("Loading data")
    train_loader = build_train_loader(cfg)
    gallery_loader, query_loader = build_test_loader(cfg)

    if args.eval:
        assert args.ckpt, "--ckpt must be specified when --eval enabled"
        resume_from_ckpt(args.ckpt, model)
        evaluate_performance(
            model,
            gallery_loader,
            query_loader,
            device,
            use_gt=cfg.EVAL_USE_GT,
            use_cache=cfg.EVAL_USE_CACHE,
            use_cbgm=cfg.EVAL_USE_CBGM,
        )
        exit(0)

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        params,
        lr=cfg.SOLVER.BASE_LR,
        momentum=cfg.SOLVER.SGD_MOMENTUM,
        weight_decay=cfg.SOLVER.WEIGHT_DECAY,
    )
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=cfg.SOLVER.LR_DECAY_MILESTONES, gamma=0.1)

    start_epoch = 0
    if args.resume:
        assert args.ckpt, "--ckpt must be specified when --resume enabled"
        start_epoch = resume_from_ckpt(args.ckpt, model, optimizer, lr_scheduler) + 1

    print("Creating output folder")
    output_dir = cfg.OUTPUT_DIR
    mkdir(output_dir)
    path = osp.join(output_dir, "config.yaml")
    with open(path, "w") as f:
        f.write(cfg.dump())
    print(f"Full config is saved to {path}")

    tfboard = None
    if cfg.TF_BOARD:
        from torch.utils.tensorboard import SummaryWriter
        tf_log_path = osp.join(output_dir, "tf_log")
        mkdir(tf_log_path)
        tfboard = SummaryWriter(log_dir=tf_log_path)
        print(f"TensorBoard files are saved to {tf_log_path}")

    print("Start training")
    start_time = time.time()
    for epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCHS):
        train_one_epoch(cfg, model, optimizer, train_loader, device, epoch, tfboard)
        lr_scheduler.step()

        if (epoch + 1) % cfg.EVAL_PERIOD == 0 or epoch == cfg.SOLVER.MAX_EPOCHS - 1:
            evaluate_performance(
                model,
                gallery_loader,
                query_loader,
                device,
                use_gt=cfg.EVAL_USE_GT,
                use_cache=cfg.EVAL_USE_CACHE,
                use_cbgm=cfg.EVAL_USE_CBGM,
            )

        if (epoch + 1) % cfg.CKPT_PERIOD == 0 or epoch == cfg.SOLVER.MAX_EPOCHS - 1:
            save_on_master(
                {
                    "model": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "lr_scheduler": lr_scheduler.state_dict(),
                    "epoch": epoch,
                },
                osp.join(output_dir, f"epoch_{epoch}.pth"),
            )

    if tfboard:
        tfboard.close()
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print(f"Total training time {total_time_str}")
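# ---------------------------------------------------------------------------
# Hypothetical CLI sketch (an assumption, not confirmed by the original file)
# covering only the attributes the SeqNet main() above reads: opts, eval,
# ckpt, and resume. The config file path itself is hardcoded in the function.
# ---------------------------------------------------------------------------
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Train SeqNet for person search")
    parser.add_argument('--eval', action='store_true',
                        help='only evaluate the checkpoint given by --ckpt')
    parser.add_argument('--resume', action='store_true',
                        help='resume training from the checkpoint given by --ckpt')
    parser.add_argument('--ckpt', default='', help='path to a checkpoint file')
    parser.add_argument('opts', nargs=argparse.REMAINDER,
                        help='KEY VALUE pairs merged into the config via merge_from_list')
    return parser.parse_args()

if __name__ == "__main__":
    main(parse_args())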
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # Data loading code
    print("Loading data")
    dataset, num_classes = get_dataset(args.dataset,
                                       "train" if args.dataset == 'coco' else 'trainval',
                                       get_transform(train=True), args.data_path)
    dataset_test, _ = get_dataset(args.dataset,
                                  "val" if args.dataset == 'coco' else 'test',
                                  get_transform(train=False), args.data_path)

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    if args.aspect_ratio_group_factor >= 0:
        group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(train_sampler,
                                                            args.batch_size,
                                                            drop_last=True)

    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_sampler=train_batch_sampler,
                                              num_workers=args.workers,
                                              collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=1,
                                                   sampler=test_sampler,
                                                   num_workers=args.workers,
                                                   collate_fn=utils.collate_fn)

    print("Creating model")
    # model = torchvision.models.detection.__dict__[args.model](num_classes=num_classes,
    #                                                           pretrained=args.pretrained)
    model = get_model(num_classes=num_classes)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)

    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=args.lr_step_size,
                                                   gamma=args.lr_gamma)
    # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])

    if args.test_only:
        voc_evaluate(model, data_loader_test, device=device)
        return

    # Report parameter count and approximate FLOPs for a single input
    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print("model params", pytorch_total_params)
    temp = torch.randn(1, 3, 500, 353, device='cuda')
    model.eval()
    macs, params = flop_count(model, (temp, ))
    # macs, params = clever_format([macs, params], "%.3f")
    print("macs", macs.items())
    # print("macs", params.items())

    print("Start training")
    start_time = time.time()
    best_map = 0
    for epoch in range(args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq)
        lr_scheduler.step()
        if args.output_dir and epoch % 9 == 0:
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'args': args
                },
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))

        # evaluate after every epoch
        if 'coco' in args.dataset:
            coco_evaluate(model, data_loader_test, device=device)
        elif 'voc' in args.dataset:
            map = voc_evaluate(model, data_loader_test, device=device)
            if map > best_map:
                best_map = map
                print("Best Mean AP")
                print(best_map)
        else:
            print(f'No evaluation method available for the dataset {args.dataset}')

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))

    if args.output_dir:
        utils.save_on_master(
            {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args
            },
            os.path.join(args.output_dir, 'model_{}.pth'.format("final")))