def main(args):
    """Run the deprecated single-process test / validation entry point.

    Loads the config file named by ``args.config``, redirects ``cfg.save_dir``
    to a timestamped sub-directory, builds the model and the validation
    dataset, restores the weights given by ``args.model`` and runs one epoch
    over the validation data with gradients disabled.

    Depending on ``args.task``:

    * ``'test'`` -- the evaluator converts the results with ``results2json``
      and they are written to ``results<timestamp>.json`` in the save dir.
    * ``'val'`` -- the evaluator is run on the results and, when
      ``args.save_result`` is truthy, each metric is appended as a
      ``key: value`` line to ``eval_results<timestamp>.txt``.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``config``, ``task``, ``model`` and (for ``'val'``)
            ``save_result``.
    """
    warnings.warn(
        'Warning! Old testing code is deprecated and will be deleted '
        'in next version. Please use tools/test.py')
    load_config(cfg, args.config)
    # Fixed rank: this entry point performs no distributed initialisation.
    local_rank = -1
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    # Give every run its own timestamped output directory.
    cfg.defrost()
    timestr = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    cfg.save_dir = os.path.join(cfg.save_dir, timestr)
    cfg.freeze()
    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)

    logger.log('Creating model...')
    model = build_model(cfg.model)

    logger.log('Setting up data...')
    val_dataset = build_dataset(cfg.data.val, args.task)
    # NOTE(review): drop_last=True discards a trailing incomplete batch, so
    # some samples may never be evaluated -- confirm this is intended.
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True)

    trainer = build_trainer(local_rank, cfg, model, logger)
    # NOTE(review): cfg was frozen above; this relies on the dict-style
    # update() still being permitted on the schedule node -- confirm.
    cfg.schedule.update({'load_model': args.model})
    trainer.load_model(cfg)
    evaluator = build_evaluator(cfg, val_dataset)

    logger.log('Starting testing...')
    with torch.no_grad():
        # The second return value (loss dict) is not used by this script.
        results, _ = trainer.run_epoch(0, val_dataloader, mode=args.task)

    if args.task == 'test':
        res_json = evaluator.results2json(results)
        json_path = os.path.join(cfg.save_dir,
                                 'results{}.json'.format(timestr))
        # Context manager guarantees the file is flushed and closed.
        with open(json_path, 'w') as json_file:
            json.dump(res_json, json_file)
    elif args.task == 'val':
        eval_results = evaluator.evaluate(results, cfg.save_dir,
                                          rank=local_rank)
        if args.save_result:
            txt_path = os.path.join(cfg.save_dir,
                                    "eval_results{}.txt".format(timestr))
            with open(txt_path, "a") as f:
                for k, v in eval_results.items():
                    f.write("{}: {}\n".format(k, v))
def main(args):
    """Run the deprecated training entry point.

    Loads the config named by ``args.config``, optionally seeds the random
    number generators, builds the model together with the train / validation
    datasets and loaders, and hands everything to the trainer.

    When ``cfg.device.gpu_ids`` lists more than one GPU, the NCCL process
    group is initialised and the training loader draws from a
    ``DistributedSampler``; otherwise the training loader simply shuffles.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``config``, ``local_rank`` and ``seed``.
    """
    warnings.warn('Warning! Old training code is deprecated and will be deleted '
                  'in next version. Please use tools/train.py')
    load_config(cfg, args.config)
    local_rank = int(args.local_rank)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    # Original note (translated): mkdir is wrapped with @rank_filter, so the
    # main process is the one that creates save_dir.
    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)

    seed = args.seed
    if seed is not None:
        logger.log('Set random seed to {}'.format(seed))
        init_seeds(seed)

    logger.log('Creating model...')
    model = build_model(cfg.model)

    logger.log('Setting up data...')
    train_dataset = build_dataset(cfg.data.train, 'train')
    val_dataset = build_dataset(cfg.data.val, 'test')

    # Decide how training samples are drawn: a distributed sampler when
    # several GPUs are configured, plain shuffling otherwise.
    if len(cfg.device.gpu_ids) > 1:
        print('rank = ', local_rank)
        num_gpus = torch.cuda.device_count()
        torch.cuda.set_device(local_rank % num_gpus)
        dist.init_process_group(backend='nccl')
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        train_sampling = {'sampler': train_sampler}
    else:
        train_sampling = {'shuffle': True}

    # Keyword arguments shared by the training and validation loaders.
    shared_loader_kwargs = {
        'batch_size': cfg.device.batchsize_per_gpu,
        'num_workers': cfg.device.workers_per_gpu,
        'pin_memory': True,
        'collate_fn': collate_function,
        'drop_last': True,
    }
    make_loader = torch.utils.data.DataLoader
    train_dataloader = make_loader(
        train_dataset, **shared_loader_kwargs, **train_sampling)
    val_dataloader = make_loader(
        val_dataset, shuffle=False, **shared_loader_kwargs)

    trainer = build_trainer(local_rank, cfg, model, logger)

    # Optional warm start and/or resume, driven by the schedule config.
    schedule = cfg.schedule
    if 'load_model' in schedule:
        trainer.load_model(cfg)
    if 'resume' in schedule:
        trainer.resume(cfg)

    evaluator = build_evaluator(cfg, val_dataset)

    logger.log('Starting training...')
    trainer.run(train_dataloader, val_dataloader, evaluator)
def startNanodetTrain(self):
    """Build the model, data loaders and trainer, then start training.

    The config file path, local rank, save directory and optional seed are
    read from ``self.nanoTrainConfig`` (keys ``'cfg'``, ``'local_rank'``,
    ``'save_dir'`` and ``'seed'``). The same mapping is forwarded to
    ``build_dataset`` and ``trainer.run``.

    When ``cfg.device.gpu_ids`` lists more than one GPU, the NCCL process
    group is initialised and the training loader draws from a
    ``DistributedSampler``; otherwise the training loader shuffles.
    """
    train_config = self.nanoTrainConfig

    # Load the configuration file.
    load_config(cfg, train_config['cfg'])
    # Rank of this process in distributed training.
    local_rank = int(train_config["local_rank"])
    # NOTE(review): torch.backends.cudnn.enabled / benchmark are not touched
    # here (the assignments were commented out) -- confirm that is intended.

    mkdir(local_rank, train_config["save_dir"])
    logger = Logger(local_rank, train_config["save_dir"])

    # Membership test instead of calling ``keys().__contains__`` directly.
    if "seed" in train_config:
        logger.log('Set random seed to {}'.format(train_config['seed']))
        self.init_seeds(train_config['seed'])

    # 1. Create the model (moved to CPU right after construction).
    model = build_model(cfg.model)
    model = model.cpu()

    # 2. Load the data.
    logger.log('Setting up data...')
    train_dataset = build_dataset(cfg.data.train, 'train', train_config)
    val_dataset = build_dataset(cfg.data.val, 'test', train_config)

    if len(cfg.device.gpu_ids) > 1:
        print('rank = ', local_rank)
        num_gpus = torch.cuda.device_count()
        torch.cuda.set_device(local_rank % num_gpus)
        dist.init_process_group(backend='nccl')
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        train_sampling = {'sampler': train_sampler}
    else:
        print("加载数据...")
        train_sampling = {'shuffle': True}

    # Both branches build the training loader with the same base settings;
    # only the sampling strategy chosen above differs.
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True,
        **train_sampling)

    # Validation runs one sample at a time with a single worker.
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True)

    trainer = build_trainer(local_rank, cfg, model, logger)

    # Optional warm start and/or resume, driven by the schedule config.
    if 'load_model' in cfg.schedule:
        trainer.load_model(cfg)
    if 'resume' in cfg.schedule:
        trainer.resume(cfg)

    evaluator = build_evaluator(cfg, val_dataset)

    logger.log('Starting training...')
    trainer.run(train_dataloader, val_dataloader, evaluator, train_config)
def run(args):
    """Set up and launch a training run.

    Loads the config named by ``args.config``, optionally seeds the random
    number generators, builds the model, datasets, data loaders, trainer and
    evaluator, and finally calls ``trainer.run``.

    When ``args.is_debug`` is truthy every data loader is created with
    ``num_workers=0``; otherwise the training loader uses
    ``cfg.device.workers_per_gpu`` workers and the validation loader uses one.

    :param args: parsed command-line namespace; the attributes read here are
        ``config``, ``local_rank``, ``seed`` and ``is_debug``
    :return: None
    """
    load_config(cfg, args.config)

    # local_rank is handed to mkdir, Logger and build_trainer, and selects
    # the CUDA device when more than one GPU is configured.
    local_rank = int(args.local_rank)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)

    seed = args.seed
    if seed is not None:
        logger.log('Set random seed to {}'.format(seed))
        init_seeds(seed)

    logger.log('Creating model...')
    model = build_model(cfg.model)

    logger.log('Setting up data...')
    train_dataset = build_dataset(cfg.data.train, 'train')
    val_dataset = build_dataset(cfg.data.val, 'test')

    make_loader = torch.utils.data.DataLoader

    # More than one GPU: distributed training with a DistributedSampler.
    # Otherwise the training loader shuffles on its own.
    if len(cfg.device.gpu_ids) > 1:
        print('rank = ', local_rank)
        num_gpus = torch.cuda.device_count()
        torch.cuda.set_device(local_rank % num_gpus)
        dist.init_process_group(backend='nccl')
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        train_sampling = {'sampler': train_sampler}
    else:
        train_sampling = {'shuffle': True}

    # Debug mode keeps data loading in the main process (zero workers).
    if args.is_debug:
        train_workers = 0
    else:
        train_workers = cfg.device.workers_per_gpu
    train_data_loader = make_loader(
        train_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        num_workers=train_workers,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True,
        **train_sampling)

    # Validation always runs one sample per batch.
    val_workers = 0 if args.is_debug else 1
    val_data_loader = make_loader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=val_workers,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True)

    # ----- Build the trainer and optionally restore weights / state
    trainer = build_trainer(local_rank, cfg, model, logger)
    schedule = cfg.schedule
    if 'load_model' in schedule:
        trainer.load_model(cfg)
    if 'resume' in schedule:
        trainer.resume(cfg)

    # ----- Build an evaluator
    evaluator = build_evaluator(cfg, val_dataset)

    logger.log('Starting training...')
    trainer.run(train_data_loader, val_data_loader, evaluator)