def run(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test

    print('Setting up data...')
    Dataset = get_dataset(opt.dataset, opt.task, opt.multi_scale)  # if opt.task == 'mot' -> JointDataset

    f = open(opt.data_cfg)  # choose which dataset to train on, e.g. '../src/lib/cfg/mot15.json'
    data_config = json.load(f)
    trainset_paths = data_config['train']  # training set paths
    dataset_root = data_config['root']     # dataset root directory
    print("Dataset root: %s" % dataset_root)
    f.close()

    # Image data transformations
    transforms = T.Compose([T.ToTensor()])

    # Dataset
    dataset = Dataset(opt=opt,
                      root=dataset_root,
                      paths=trainset_paths,
                      img_size=opt.input_wh,
                      augment=True,
                      transforms=transforms)
    opt = opts().update_dataset_info_and_set_heads(opt, dataset)
    print("opt:\n", opt)

    logger = Logger(opt)

    # os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str  # multi-GPU training
    opt.device = torch.device('cuda:0' if opt.gpus[0] >= 0 else 'cpu')  # select device
    # opt.device = device  # NC UPDATE - fallback to original FairMOT
    # opt.gpus = my_visible_devs  # NC UPDATE

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)

    # Initialize the optimizer
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)

    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model, optimizer,
                                                   opt.resume, opt.lr, opt.lr_step)

    # Get dataloader
    if opt.is_debug:
        # In debug mode, leave num_workers at its default of 0 (load in the main process).
        if opt.multi_scale:
            train_loader = torch.utils.data.DataLoader(dataset=dataset,
                                                       batch_size=opt.batch_size,
                                                       shuffle=False,
                                                       pin_memory=True,
                                                       drop_last=True)
        else:
            train_loader = torch.utils.data.DataLoader(dataset=dataset,
                                                       batch_size=opt.batch_size,
                                                       shuffle=True,
                                                       pin_memory=True,
                                                       drop_last=True)
    else:
        if opt.multi_scale:
            train_loader = torch.utils.data.DataLoader(dataset=dataset,
                                                       batch_size=opt.batch_size,
                                                       shuffle=False,
                                                       num_workers=opt.num_workers,
                                                       pin_memory=True,
                                                       drop_last=True)
        else:
            train_loader = torch.utils.data.DataLoader(dataset=dataset,
                                                       batch_size=opt.batch_size,
                                                       shuffle=True,
                                                       num_workers=opt.num_workers,  # was missing; outside debug mode, use worker processes
                                                       pin_memory=True,
                                                       drop_last=True)

    print('Starting training...')
    Trainer = train_factory[opt.task]
    trainer = Trainer(opt=opt, model=model, optimizer=optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)
    # trainer.set_device(opt.gpus, opt.chunk_sizes, device)  # NC UPDATE

    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'

        # Train one epoch
        log_dict_train, _ = trainer.train(epoch, train_loader)

        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))

        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
        else:  # mcmot_last_track or mcmot_last_det
            if opt.id_weight > 0:  # do tracking (detection and re-id)
                save_model(os.path.join(opt.save_dir, 'mcmot_last_track_' + opt.arch + '.pth'),
                           epoch, model, optimizer)
            else:  # only do detection
                save_model(os.path.join(opt.save_dir, 'mcmot_last_det_' + opt.arch + '.pth'),
                           epoch, model, optimizer)
        logger.write('\n')

        if epoch in opt.lr_step:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        if epoch % 10 == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)

    logger.close()
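# A minimal sketch (not a file from the repo) of the cfg JSON that run() reads
# above via opt.data_cfg. Only the two keys the script actually uses ('root' and
# 'train') are shown; the paths and the sequence name are illustrative
# assumptions. Written out with json.dump so the expected shape is unambiguous:
import json

mot15_cfg = {
    'root': '/data/MOT15',                     # dataset root directory
    'train': {'mot15': './data/mot15.train'},  # dataset name -> image-list file
}
with open('mot15.json', 'w') as f:
    json.dump(mot15_cfg, f, indent=4)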

def main(opt, opt_t):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test

    if opt.target_dataset:
        Dataset_target = get_dataset(opt_t.target_dataset, opt_t.task)
        opt_t = opts().update_dataset_info_and_set_heads(opt_t, Dataset_target)  # target dataset
    Dataset_source = get_dataset(opt.source_dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset_source)  # source dataset
    print(opt)

    logger = Logger(opt)  # record training logs

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)  # create model
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)  # create optimizer
    start_epoch = 0
    if opt.load_model != '':  # load model
        model, optimizer, start_epoch = load_model(model, opt.load_model, optimizer,
                                                   opt.resume, opt.lr, opt.lr_step)

    Trainer = train_factory[opt.task]  # select the trainer class
    trainer = Trainer(opt, model, optimizer)  # initialize the trainer
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up source_val data...')
    val_source_loader = torch.utils.data.DataLoader(Dataset_source(opt, 'val'),
                                                    batch_size=1,
                                                    shuffle=False,
                                                    num_workers=0,
                                                    pin_memory=True)

    if opt.test:
        _, preds = trainer.val(0, val_source_loader)
        val_source_loader.dataset.run_eval(preds, opt.save_dir)
        return

    # Source loader
    print('Setting up source_train data...')
    train_source_loader = torch.utils.data.DataLoader(
        Dataset_source(opt, 'train'),  # SOURCE dataset parameters
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.num_workers,
        pin_memory=True,
        drop_last=True)

    if opt.target_dataset:
        # Target loader
        print('Setting up target_train data...')
        train_target_loader = torch.utils.data.DataLoader(
            Dataset_target(opt_t, 'train'),  # TARGET dataset parameters
            batch_size=opt_t.batch_size,
            shuffle=True,
            num_workers=opt_t.num_workers,
            pin_memory=True,
            drop_last=True)
        print('DA MODE')
    else:
        train_target_loader = None

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_source_loader, train_target_loader)  # train one epoch
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():  # log training metrics
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))

        # if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
        #     save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),  # save last model
        #                epoch, model, optimizer)
        #     with torch.no_grad():
        #         log_dict_val, preds = trainer.val(epoch, val_source_loader)  # compute val-set loss
        #     for k, v in log_dict_val.items():
        #         logger.scalar_summary('val_{}'.format(k), v, epoch)
        #         logger.write('{} {:8f} | '.format(k, v))
        #     if log_dict_val[opt.metric] < best:
        #         best = log_dict_val[opt.metric]
        #         save_model(os.path.join(opt.save_dir, 'model_best.pth'),  # save best model
        #                    epoch, model)
        # else:
        save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                   epoch, model, optimizer)
        logger.write('\n')

        if epoch in opt.lr_step:  # update learning rate
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),  # save lr_step model
                       epoch, model, optimizer)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
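# Hedged sketch (not from the original file) of one plausible entry point for the
# domain-adaptation variant above: main(opt, opt_t) only reads a handful of fields
# from opt_t (target_dataset, task, batch_size, num_workers), so a deep copy of
# the parsed options suffices. The use of copy.deepcopy is an illustrative
# assumption, not code from the repo.
if __name__ == '__main__':
    import copy
    opt = opts().parse()
    opt_t = copy.deepcopy(opt)  # target options start as a copy of the source options
    main(opt, opt_t)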

def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model, optimizer,
                                                   opt.resume, opt.lr, opt.lr_step)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=0,
                                             pin_memory=True)

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                           epoch, model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
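# The step schedule shared by the training loops above drops the learning rate by
# 10x at each epoch listed in opt.lr_step (lr = opt.lr * 0.1 ** (index + 1)) and
# keeps it there until the next milestone. An equivalent standalone sketch; the
# base_lr value and milestone list below are illustrative assumptions, not values
# taken from the repo:
def step_lr(base_lr, lr_step, epoch):
    drops = sum(1 for s in lr_step if epoch >= s)  # milestones already passed
    return base_lr * (0.1 ** drops)

for epoch in (19, 20, 26, 27, 30):
    print(epoch, step_lr(1e-4, [20, 27], epoch))
# -> 1e-4 at epoch 19, 1e-5 from epoch 20, 1e-6 from epoch 27 onward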

def main(opt):
    torch.manual_seed(opt.seed)
    # benchmark=True lets cuDNN auto-select the most efficient algorithms for the
    # current configuration, improving runtime performance.
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)

    logger = Logger(opt)

    def adjust_learning_rate(optimizer, epoch):
        if epoch < 5:  # linear warmup over the first 5 epochs
            lr = opt.lr * ((epoch + 1) / 5)
        else:  # cosine decay afterwards, reaching 0 at epoch 20
            lr = opt.lr * 0.5 * (1 + math.cos(epoch * math.pi / 20))
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    # optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    optimizer = torch.optim.SGD(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model, optimizer,
                                                   opt.resume, opt.lr, opt.lr_step)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    # num_workers=0 loads data in the main process; with num_workers > 0, worker
    # subprocesses read the data and can hang, leaving GPU utilization at 0.
    val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=1,
                                             pin_memory=True)

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        adjust_learning_rate(optimizer, epoch)
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        # Validate every 2 epochs once past epoch 10
        if opt.val_intervals > 0 and epoch % 2 == 0 and epoch > 10:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                           epoch, model)
        if epoch > 20:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
        logger.write('\n')
        # The original fixed lr_step decay is disabled here in favor of
        # adjust_learning_rate() above.
    print('Epoch is Finished')
    logger.close()
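# A standalone sanity check (not part of the original script) of the warmup +
# cosine schedule implemented by adjust_learning_rate() above. The formulas are
# copied from that function; base_lr is an illustrative assumption.
import math

base_lr = 1.25e-4
for epoch in range(21):
    if epoch < 5:
        lr = base_lr * ((epoch + 1) / 5)                           # linear warmup
    else:
        lr = base_lr * 0.5 * (1 + math.cos(epoch * math.pi / 20))  # cosine decay, 0 at epoch 20
    print('epoch %2d  lr %.6e' % (epoch, lr))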

def main(opt):
    # opt.seed defaults to 317; with torch.manual_seed set, the random numbers
    # drawn on each run are reproducible.
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    # opt.dataset = coco; opt.task = ctdet (| ddd | multi_pose | exdet)
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model, optimizer,
                                                   opt.resume, opt.lr, opt.lr_step)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=1,
                                             pin_memory=True)
    # modified by zy
    # val_loader = torch.utils.data.DataLoader(Dataset(opt, 'test'), batch_size=1,
    #                                          shuffle=False, num_workers=1, pin_memory=True)

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    output_choice_log = '/home/zy/zy/2new_network/CenterNet-master/output_choice.log'
    if os.path.exists(output_choice_log):
        os.remove(output_choice_log)

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        try:
            # train = self.run_epoch('train', epoch, data_loader)
            log_dict_train, _ = trainer.train(epoch, train_loader)
        except Exception as e:
            # On an exception, log the traceback and skip to the next epoch.
            print('Error_train!!!', e)
            print(traceback.format_exc())
            continue
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                try:
                    log_dict_val, preds = trainer.val(epoch, val_loader)
                except Exception as e:
                    # Likewise, skip the epoch if validation fails.
                    print('Error_val!!!', e)
                    print(traceback.format_exc())
                    continue
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                           epoch, model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
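# The try/except pattern above, isolated as a sketch: keep a long run alive when
# a single epoch raises (e.g. a corrupt sample), logging the full traceback
# instead of crashing. train_one_epoch is a hypothetical stand-in for
# trainer.train(epoch, train_loader).
import traceback

def run_guarded(num_epochs, train_one_epoch):
    for epoch in range(1, num_epochs + 1):
        try:
            train_one_epoch(epoch)
        except Exception as e:
            print('Error in epoch {}: {}'.format(epoch, e))
            print(traceback.format_exc())
            continue  # skip the failed epoch, move on to the next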

def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test

    print('Setting up data...')
    Dataset = get_dataset(opt.task)
    f = open(opt.data_cfg)
    data_config = json.load(f)
    trainset_paths = data_config['train']
    dataset_root = data_config['root']
    f.close()

    transforms = T.Compose([T.ToTensor()])
    dataset = Dataset(opt, dataset_root, trainset_paths, (1088, 608),
                      augment=True, transforms=transforms)
    opt = opts().update_dataset_info_and_set_heads(opt, dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model, optimizer,
                                                   opt.resume, opt.lr, opt.lr_step)

    # Get dataloader
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    print('Starting training...')
    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        if epoch % 5 == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
    logger.close()
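# The functions above assume a shared import block along these lines, following
# the FairMOT/CenterNet src/lib layout. Treat this as a sketch rather than a
# verbatim header from any one of the files; module paths may differ per fork.
import json
import os

import torch
import torch.utils.data
import torchvision.transforms as T

from opts import opts                      # argument parsing / head setup
from models.model import create_model, load_model, save_model
from logger import Logger
from datasets.dataset_factory import get_dataset
from trains.train_factory import train_factory

if __name__ == '__main__':
    opt = opts().parse()
    main(opt)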