def train(model, training_parameters, model_choice, target, model_name_suffix=''):
    optimizer = tf.keras.optimizers.Adam(1e-2)
    log_joint_pdf = get_log_joint_pdf(training_parameters['name'])

    # Early stopping
    best_loss = 1e20
    last_improvement = 0
    max_consecutive_no_improvement = 15000
    min_epoch_checkpoint = 1
    checkpoint_tol = 0.02
    saved_checkpoint = False

    # Monitor training loss for visualisation
    loss_monitor = []

    for epoch in range(1, training_parameters['epochs']):
        loss = compute_apply_gradients(model, optimizer, log_joint_pdf)

        if loss < best_loss:
            if ((best_loss - loss) / np.abs(best_loss) > checkpoint_tol) & (epoch > min_epoch_checkpoint):
                print(f" - CHECKPOINT for epoch {epoch + 1}, current best loss {loss}")
                save_model(model, model_choice, target, model_name_suffix=model_name_suffix)
                best_loss = loss
                last_improvement = 0
                saved_checkpoint = True
        else:
            last_improvement += 1
            if last_improvement >= max_consecutive_no_improvement:
                print(f" - STOPPED after {epoch} epochs")
                break

        if epoch % 100 == 0:
            print(f"Epoch {epoch}, loss: {loss}")
            loss_monitor.append(loss)

    plt.figure()
    plt.plot(loss_monitor, color='slategrey')
    plt.xlabel('Epochs (x100)')
    plt.ylabel('-ELBO(q)')

    if saved_checkpoint:
        model = load_model(model_choice, training_parameters, model_name_suffix='')
    return model
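# compute_apply_gradients is called above but not defined in this snippet. The sketch
# below is only an assumption of what such a TF2 training step typically looks like for
# a variational model: model.sample() returning samples and their log-density is a
# hypothetical API, and the loss is assumed to be a Monte Carlo estimate of the -ELBO.
import tensorflow as tf

def compute_apply_gradients(model, optimizer, log_joint_pdf):
    with tf.GradientTape() as tape:
        z, log_q = model.sample()  # hypothetical API: samples and their log q(z)
        loss = tf.reduce_mean(log_q - log_joint_pdf(z))  # negative ELBO estimate
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss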
def train(model, criterion_softmax, criterion_binary, train_set, val_set, opt):
    # define web visualizer using visdom
    # webvis = WebVisualizer(opt)

    # modify learning rate of last layer
    finetune_params = modify_last_layer_lr(model.named_parameters(),
                                           opt.lr, opt.lr_mult_w, opt.lr_mult_b)
    # define optimizer
    optimizer = optim.SGD(finetune_params,
                          opt.lr,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)
    # define learning rate scheduler
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=opt.lr_decay_in_epoch,
                                          gamma=opt.gamma)
    # record forward and backward times
    train_batch_num = len(train_set)
    total_batch_iter = 0
    logging.info("####################Train Model###################")
    for epoch in range(opt.sum_epoch):
        # epoch_start_t = time.time()
        epoch_batch_iter = 0
        logging.info('Begin of epoch %d' % (epoch))
        for i, data in enumerate(train_set):
            # iter_start_t = time.time()
            # train
            inputs, target_softmax, target_binary = data
            output, loss, loss_list = forward_batch(model, criterion_softmax, criterion_binary,
                                                    inputs, target_softmax, target_binary,
                                                    opt, "Train")
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # webvis.reset()
            epoch_batch_iter += 1
            total_batch_iter += 1
        # logging.info('End of epoch %d / %d \t Time Taken: %d sec' %
        #              (epoch, opt.sum_epoch, time.time() - epoch_start_t))
        if epoch % opt.save_epoch_freq == 0:
            logging.info('saving the model at the end of epoch %d, iters %d' %
                         (epoch + 1, total_batch_iter))
            save_model(model, opt, epoch + 1)
        # adjust learning rate
        scheduler.step()
        lr = optimizer.param_groups[0]['lr']
        logging.info('learning rate = %.7f epoch = %d' % (lr, epoch))
    logging.info("--------Optimization Done--------")
def do_eval(opt, epoch, model, DatasetFactory, logger, best):
    # This code is based on test.py's non-prefetched code path:
    Detector = detector_factory[opt.task]
    dataset = DatasetFactory(opt, "val")
    detector = Detector(opt, model)

    results = {}
    num_iters = len(dataset)
    bar = Bar('{}'.format(opt.exp_id), max=num_iters)
    time_stats = ['tot', 'load', 'pre', 'net', 'dec', 'post', 'merge']
    avg_time_stats = {t: AverageMeter() for t in time_stats}
    for ind in range(num_iters):
        img_id = dataset.images[ind]
        img_info = dataset.coco.loadImgs(ids=[img_id])[0]
        img_path = os.path.join(dataset.img_dir, img_info['file_name'])

        if opt.task == 'ddd':
            ret = detector.run(img_path, img_info['calib'])
        else:
            ret = detector.run(img_path)

        results[img_id] = ret['results']

        Bar.suffix = '[{0}/{1}]|Tot: {total:} |ETA: {eta:} '.format(
            ind, num_iters, total=bar.elapsed_td, eta=bar.eta_td)
        for t in avg_time_stats:
            avg_time_stats[t].update(ret[t])
            Bar.suffix = Bar.suffix + '|{} {:.3f} '.format(t, avg_time_stats[t].avg)
        bar.next()
    bar.finish()

    metric = float("-inf")
    # Capture metric of interest, e.g., for COCO eval, something like AP50:
    eval_stats = dataset.run_eval(results, opt.save_dir, logger)
    if uses_coco_eval(opt):
        ap50 = eval_stats[1]
        ap25 = eval_stats[12]
        metric = ap25
        # Log results to log.txt and/or tensorboard:
        logger.scalar_summary("val_ap50", ap50, epoch)
        logger.scalar_summary("val_ap25", ap25, epoch)
    else:
        # Pascal VOC:
        metric = eval_stats["Mean AP"]
        # Log results to log.txt and/or tensorboard:
        logger.scalar_summary("mean_AP", metric, epoch)

    # Best model checkpointing:
    if metric > best:
        best = metric
        save_model(os.path.join(opt.save_dir, "model_best.pth"), epoch, model)

    return best
def teacher_train(cfg, start_epoch):
    torch.manual_seed(cfg.SEED)
    device = torch.device('cuda' if cfg.GPU[0] >= 0 else 'cpu')

    if start_epoch == 1:
        train_log = open(os.path.join(cfg.LOG_DIR, "train_log.csv"), 'w')
        train_log_title = "epoch,total_loss,hm_loss,wh_loss"
        val_log = open(os.path.join(cfg.LOG_DIR, "val_log.csv"), 'w')
        val_log_title = "epoch,precision,recall\n"
        if cfg.USE_OFFSET:
            train_log_title += ",offset_loss\n"
        else:
            train_log_title += "\n"
        train_log.write(train_log_title)
        train_log.flush()
        val_log.write(val_log_title)
        val_log.flush()
    else:
        # append to the existing logs when resuming
        train_log = open(os.path.join(cfg.LOG_DIR, "train_log.csv"), 'a')
        val_log = open(os.path.join(cfg.LOG_DIR, "val_log.csv"), 'a')

    print('Creating model...')
    teacher = create_model(cfg, 'res_18')
    teacher = load_model(teacher, 'log/weights/model_last_res.pth')
    model = create_model(cfg, 'litnet')
    if start_epoch != 1:
        model = load_model(model,
                           'log/weights/model_epoch_{}.pth'.format(start_epoch - 1))
    optimizer = torch.optim.Adam(model.parameters(), cfg.LR)
    trainer = TeacherTrainer(cfg, teacher, model, optimizer)
    trainer.set_device(cfg.GPU, device)

    print('Setting up data...')
    train_loader = DataLoader(TrainCircleDataset(cfg),
                              batch_size=cfg.BATCH_SIZE,
                              shuffle=True,
                              num_workers=cfg.NUM_WORKERS,
                              pin_memory=True,
                              drop_last=True)
    val_loader = ValCircleDataset()

    print('Starting training...')
    epoch = start_epoch
    for epoch in range(start_epoch, start_epoch + cfg.NUM_EPOCHS):
        trainer.train(epoch, train_loader, train_log)
        model_path = os.path.join(cfg.WEIGHTS_DIR, 'model_epoch_{}.pth'.format(epoch))
        save_model(model_path, epoch, model, optimizer)
        trainer.val(epoch, model_path, val_loader, val_log, cfg)
    save_model(os.path.join(cfg.WEIGHTS_DIR, 'model_last.pth'), epoch, model, optimizer)
def train(model, criterion, train_set, val_set, optimizer, scheduler, opt):
    logging.info("####################Train Model###################")
    # loss_avg =
    for epoch in range(opt.sum_epoch):
        epoch_start_t = time.time()
        epoch_batch_iter = 0
        logging.info('Begin of epoch %d' % (epoch))
        for i, data in enumerate(train_set):
            inputs, targets, targets_weight, meta = data
            if torch.cuda.is_available():
                inputs = inputs.cuda()
                targets = targets.cuda()
                targets_weight = targets_weight.cuda()
            # compute CNN output
            outputs = model(inputs)
            # compute loss
            loss = criterion(outputs, targets, targets_weight)
            # compute gradients and update parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_batch_iter += 1
            # display train loss
            if epoch_batch_iter % opt.display_train_freq == 0:
                util.print_loss(loss, epoch, epoch_batch_iter, opt)
            # display validation accuracy
            if epoch_batch_iter % opt.display_validate_freq == 0:
                logging.info('Validate of epoch %d' % (epoch))
                test(model, val_set, opt)
        # adjust learning rate
        scheduler.step()
        lr = optimizer.param_groups[0]['lr']
        logging.info('learning rate = %.7f epoch = %d' % (lr, epoch))
        # save model
        if epoch % opt.save_epoch_freq == 0 or epoch == opt.sum_epoch - 1:
            logging.info('saving the model at the end of epoch %d' % (epoch))
            save_model(model, opt, epoch)
    logging.info("--------Optimization Done--------")
def train(self, epoch):
    mark = epoch if self.opt.save_all else 'last'
    log_dict_train, _ = self.trainer.train(epoch, self.train_loader)
    self.logger.write('epoch: {} |'.format(epoch))
    for k, v in log_dict_train.items():
        self.logger.scalar_summary('train_{}'.format(k), v, epoch)
        self.logger.write('{} {:8f} | '.format(k, v))
    if self.opt.val_intervals > 0 and epoch % self.opt.val_intervals == 0:
        save_model(os.path.join(self.opt.save_dir, 'model_{}.pth'.format(mark)),
                   epoch, self.model, self.optimizer)
        with torch.no_grad():
            log_dict_val, preds = self.trainer.val(epoch, self.val_loader)
        for k, v in log_dict_val.items():
            self.logger.scalar_summary('val_{}'.format(k), v, epoch)
            self.logger.write('{} {:8f} | '.format(k, v))
        if log_dict_val[self.opt.metric] < self.best:
            self.best = log_dict_val[self.opt.metric]
            save_model(os.path.join(self.opt.save_dir, 'model_best.pth'),
                       epoch, self.model)
    else:
        save_model(os.path.join(self.opt.save_dir, 'model_last.pth'),
                   epoch, self.model, self.optimizer)
    self.logger.write('\n')
    if epoch in self.opt.lr_step:
        lr = self.opt.lr * (0.1 ** (self.opt.lr_step.index(epoch) + 1))
        print('Drop LR to', lr)
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
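# save_model is called throughout these training loops with both
# (path, epoch, model, optimizer) and (path, epoch, model), but its definition is not
# shown in this snippet. A typical CenterNet-style implementation, given as a sketch
# under the assumption that the checkpoint stores the epoch, the state_dict, and
# optionally the optimizer state, looks like this:
import torch

def save_model(path, epoch, model, optimizer=None):
    # unwrap DataParallel so the checkpoint can be loaded on a single GPU or CPU
    if isinstance(model, torch.nn.DataParallel):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    data = {'epoch': epoch, 'state_dict': state_dict}
    if optimizer is not None:
        data['optimizer'] = optimizer.state_dict()
    torch.save(data, path)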
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model, optimizer,
                                                   opt.resume, opt.lr, opt.lr_step)

    output_file_name = os.path.splitext(os.path.basename(opt.load_model))[0]
    output_file_name = 'inference_{}.pth'.format(output_file_name)
    output_file_name = os.path.join(opt.save_dir, output_file_name)
    save_model(output_file_name, start_epoch, model)
    print("Model Saved at {} ".format(output_file_name))
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model, optimizer,
                                                   opt.resume, opt.lr, opt.lr_step)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    print('Starting training...')
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if epoch > 100:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test

    print('Setting up data...')
    Dataset = get_dataset(opt.dataset, opt.task)
    f = open(opt.data_cfg)
    data_config = json.load(f)
    trainset_paths = data_config['train']
    dataset_root = data_config['root']
    f.close()
    transforms = T.Compose([T.ToTensor()])
    dataset = Dataset(opt, dataset_root, trainset_paths, (1088, 608),
                      augment=True, transforms=transforms)
    opt = opts().update_dataset_info_and_set_heads(opt, dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model, optimizer,
                                                   opt.resume, opt.lr, opt.lr_step)

    # Get dataloader
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    print('Starting training...')
    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        if epoch % 5 == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
    logger.close()
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv, opt.nbr_frames)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(
            model, opt.load_model, optimizer, opt.resume, opt.lr, opt.lr_step,
            opt.nbr_frames)
    # save_model('/usagers2/huper/dev/SpotNet2/exp/uav/ctdetSpotNetVid/fromCOCOB/fromSN2.pth', 0, model, optimizer)
    # exit()

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    val_loader = torch.utils.data.DataLoader(
        Dataset(opt, 'val'),
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True
    )

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_loader = torch.utils.data.DataLoader(
        Dataset(opt, 'train'),
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.num_workers,
        pin_memory=True,
        drop_last=True
    )

    # logger.write_model(model)

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                           epoch, model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model, optimizer,
                                                   opt.resume, opt.lr, opt.lr_step)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print(get_parameter_number(model))
    # summary(model, (3, 416, 416))

    # # Count network parameters and FLOPs (works on GPU)
    # from thop import profile
    # input = torch.randn(1, 3, 416, 416).cuda()
    # flops, params = profile(model, inputs=(input,))
    # print(flops)
    # print(params)

    # # Count network parameters and FLOPs (apparently CPU only)
    # from torchstat import stat
    # stat(model, (3, 416, 416))

    print('Setting up data...')
    val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=0,
                                             pin_memory=True)

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                           epoch, model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
def main(cfg, local_rank):
    torch.manual_seed(cfg.SEED)
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
    Dataset = get_dataset(cfg.SAMPLE_METHOD, cfg.TASK)

    print('Creating model...')
    model = create_model(cfg.MODEL.NAME, cfg.MODEL.HEAD_CONV, cfg)

    num_gpus = torch.cuda.device_count()
    if cfg.TRAIN.DISTRIBUTE:
        device = torch.device('cuda:%d' % local_rank)
        torch.cuda.set_device(local_rank)
        dist.init_process_group(backend='nccl', init_method='env://',
                                world_size=num_gpus, rank=local_rank)
    else:
        device = torch.device('cuda')

    logger = Logger(cfg)

    if cfg.TRAIN.OPTIMIZER == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), cfg.TRAIN.LR)
    elif cfg.TRAIN.OPTIMIZER == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=cfg.TRAIN.LR, momentum=0.9)
    else:
        raise NotImplementedError

    start_epoch = 0
    if cfg.MODEL.INIT_WEIGHTS:
        model, optimizer, start_epoch = load_model(
            model, cfg.MODEL.PRETRAINED, optimizer, cfg.TRAIN.RESUME,
            cfg.TRAIN.LR, cfg.TRAIN.LR_STEP)

    Trainer = train_factory[cfg.TASK]
    trainer = Trainer(cfg, local_rank, model, optimizer)

    # split the batch across GPUs: the master GPU gets master_batch_size and the
    # rest is distributed as evenly as possible over the remaining GPUs
    if cfg.TRAIN.MASTER_BATCH_SIZE == -1:
        master_batch_size = cfg.TRAIN.BATCH_SIZE // len(cfg.GPUS)
    else:
        master_batch_size = cfg.TRAIN.MASTER_BATCH_SIZE
    rest_batch_size = (cfg.TRAIN.BATCH_SIZE - master_batch_size)
    chunk_sizes = [master_batch_size]
    for i in range(len(cfg.GPUS) - 1):
        slave_chunk_size = rest_batch_size // (len(cfg.GPUS) - 1)
        if i < rest_batch_size % (len(cfg.GPUS) - 1):
            slave_chunk_size += 1
        chunk_sizes.append(slave_chunk_size)
    trainer.set_device(cfg.GPUS, chunk_sizes, device)

    print('Setting up data...')
    val_dataset = Dataset(cfg, 'val')
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True
    )

    train_dataset = Dataset(cfg, 'train')
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=num_gpus, rank=local_rank)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE // num_gpus if cfg.TRAIN.DISTRIBUTE
        else cfg.TRAIN.BATCH_SIZE,
        shuffle=not cfg.TRAIN.DISTRIBUTE,
        num_workers=cfg.WORKERS,
        pin_memory=True,
        drop_last=True,
        sampler=train_sampler if cfg.TRAIN.DISTRIBUTE else None
    )

    print('Starting training...')
    best = 0.
    for epoch in range(start_epoch + 1, cfg.TRAIN.EPOCHS + 1):
        mark = epoch if cfg.TRAIN.SAVE_ALL_MODEL else 'last'
        train_sampler.set_epoch(epoch)
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if cfg.TRAIN.VAL_INTERVALS > 0 and epoch % cfg.TRAIN.VAL_INTERVALS == 0:
            save_model(os.path.join(cfg.OUTPUT_DIR, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
                mAP = val_dataset.run_eval(preds, cfg.OUTPUT_DIR)
                print('mAP is: ', mAP)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if mAP > best:
                best = mAP
                save_model(os.path.join(cfg.OUTPUT_DIR, 'model_best.pth'),
                           epoch, model)
        else:
            save_model(os.path.join(cfg.OUTPUT_DIR, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        if epoch in cfg.TRAIN.LR_STEP:
            save_model(os.path.join(cfg.OUTPUT_DIR, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
            lr = cfg.TRAIN.LR * (0.1 ** (cfg.TRAIN.LR_STEP.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
def main(opt): torch.manual_seed(opt.seed) # if add --not_cuda_benchmark, opt.not_cuda_benchmark=True torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test # return Dataset class by dataset and task name # one dataset can do multiple tasks by different annotation settings Dataset = get_dataset(opt.dataset, opt.task) # update opt [ input|ouput res, opt.heads ] with Dataset info opt = opts().update_dataset_info_and_set_heads(opt, Dataset) pprint(vars(opt)) logger = Logger(opt) os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu') print('Creating model...') # opt.arch: --arch dla_34 # opt.heads: set heads by task in opts().update_dataset_info_and_set_heads() # opt.head_conv: 256, one more layer btw features and final_class, number defined by opt.arch model = create_model(opt.arch, opt.heads, opt.head_conv) optimizer = torch.optim.Adam(model.parameters(), opt.lr) # optimize all params start_epoch = 0 # load pretrain model if opt.load_model != '': model, optimizer, start_epoch = load_model(model, opt.load_model, optimizer, opt.resume, opt.lr, opt.lr_step) # choose trainer by opt.task Trainer = train_factory[opt.task] # define trainer trainer = Trainer(opt, model, optimizer) trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device) print('Setting up data...') # val dataset val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'), batch_size=1, shuffle=False, num_workers=1, pin_memory=True) if opt.test: # test on val dataset _, preds = trainer.val(0, val_loader) val_loader.dataset.run_eval(preds, opt.save_dir) return # end program here train_loader = torch.utils.data.DataLoader( Dataset(opt, 'train'), # split, load json batch_size=opt.batch_size, shuffle=True, num_workers=opt.num_workers, # multi-process read data, wrt batch_size pin_memory=True, drop_last=True) print('Starting training...') best = 1e10 for epoch in range(start_epoch + 1, opt.num_epochs + 1): mark = epoch if opt.save_all else 'last' # save all middle model or last log_dict_train, _ = trainer.train(epoch, train_loader) logger.write('epoch: {} |'.format(epoch)) for k, v in log_dict_train.items(): # default will USE_TENSORBOARD to log scalars logger.scalar_summary('train_{}'.format(k), v, epoch) logger.write('{} {:8f} | '.format(k, v)) # default val/save intervals = 5 if opt.val_intervals > 0 and epoch % opt.val_intervals == 0: save_model( os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)), # path epoch, model, optimizer) # save model dict keys with torch.no_grad(): log_dict_val, preds = trainer.val(epoch, val_loader) for k, v in log_dict_val.items(): logger.scalar_summary('val_{}'.format(k), v, epoch) logger.write('{} {:8f} | '.format(k, v)) if log_dict_val[opt.metric] < best: # metric: best = log_dict_val[opt.metric] save_model(os.path.join(opt.save_dir, 'model_best.pth'), epoch, model) else: save_model(os.path.join(opt.save_dir, 'model_last.pth'), epoch, model, optimizer) logger.write('\n') if epoch in opt.lr_step: save_model( os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)), epoch, model, optimizer) lr = opt.lr * (0.1**(opt.lr_step.index(epoch) + 1)) print('Drop LR to', lr) for param_group in optimizer.param_groups: param_group['lr'] = lr logger.close()
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    opt.lr = 5e-3
    optimizer = torch.optim.Adam(model.parameters(), opt.lr, weight_decay=0)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=310, eta_min=0)
    scheduler_warmup = GradualWarmupScheduler(
        optimizer, multiplier=1, total_epoch=10, after_scheduler=scheduler_cosine)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model, optimizer,
                                                   opt.resume, opt.lr, opt.lr_step)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=1,
                                             pin_memory=True)

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                           epoch, model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        # warmup for the first epochs, then cosine annealing
        scheduler_warmup.step(epoch)
        # if epoch in opt.lr_step:
        #     save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
        #                epoch, model, optimizer)
        #     lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
        #     print('Drop LR to', lr)
        #     for param_group in optimizer.param_groups:
        #         param_group['lr'] = lr
    logger.close()
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test

    print('Setting up data...')
    Dataset = get_dataset(opt.dataset, opt.task, opt.multi_scale)
    f = open(opt.data_cfg)
    data_config = json.load(f)
    trainset_paths = data_config['train']
    dataset_root = data_config['root']
    f.close()
    transforms = T.Compose([T.ToTensor()])
    dataset = Dataset(opt, dataset_root, trainset_paths, (640, 480),
                      augment=True, transforms=transforms)
    opt = opts().update_dataset_info_and_set_heads(opt, dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    # optimizer = torch.optim.SGD(model.parameters(), lr=opt.lr, momentum=0.9, weight_decay=5e-4)
    warmup_epoch = 5
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(opt.num_epochs - warmup_epoch))
    lr1 = lr_scheduler.get_lr()[0]
    print("Learn_rate:%s" % lr1)
    iter_per_epoch = len(dataset)
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * warmup_epoch)
    start_epoch = 0

    # Get dataloader
    if opt.multi_scale:
        train_loader = torch.utils.data.DataLoader(dataset=dataset,
                                                   batch_size=opt.batch_size,
                                                   shuffle=False,
                                                   num_workers=opt.num_workers,
                                                   pin_memory=True,
                                                   drop_last=True)
    else:
        train_loader = torch.utils.data.DataLoader(dataset,
                                                   batch_size=opt.batch_size,
                                                   shuffle=True,
                                                   num_workers=opt.num_workers,
                                                   pin_memory=True,
                                                   drop_last=True)

    print('Starting training...')
    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model,
                                                   trainer.optimizer, opt.resume,
                                                   opt.lr, opt.lr_step)

    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        # lr_scheduler.step()
        if epoch >= warmup_epoch:
            lr_scheduler.step()
            lr = lr_scheduler.get_lr()[0]
            print("Learn_rate:%s" % lr)
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        # if epoch in opt.lr_step:
        #     save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
        #                epoch, model, optimizer)
        #     lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
        #     print('Drop LR to', lr)
        #     for param_group in optimizer.param_groups:
        #         param_group['lr'] = lr
        if epoch % 5 == 0 or epoch >= 30:
            print('Drop LR to', lr)
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
    logger.close()
def main(opt):
    torch.manual_seed(opt.seed)  # seed the RNG for all devices (CPU and CUDA)
    # enable the built-in cudnn auto-tuner to find the best algorithm for the
    # hardware, unless either option disables it
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0
    # load model if specified
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(
            model, opt.load_model, optimizer, opt.resume, opt.lr, opt.lr_step)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    val_loader = torch.utils.data.DataLoader(
        Dataset(opt, 'val'),
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True
    )

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_loader = torch.utils.data.DataLoader(
        Dataset(opt, 'train'),
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.num_workers,
        pin_memory=True,
        drop_last=True
    )

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)  # training
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                           epoch, model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)  # save model
        logger.write('\n')
        # adjust lr at the configured epochs
        if epoch in opt.lr_step:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)
    task = 1

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    model1 = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)  # , weight_decay=0.1)
    start_epoch = 0
    # print(model)
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model, optimizer,
                                                   opt.resume, opt.lr, opt.lr_step)
        model1, _, _ = load_model(model1, opt.load_model, optimizer,
                                  opt.resume, opt.lr, opt.lr_step)
    else:
        task = -1  # indicates that this is the first task
    set_requires_grad(model1, requires_grad=False)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, model1, task, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=1,
                                             pin_memory=True)

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    old_loader = None
    if os.path.exists('./exemplar_dataset'):
        N = len(os.listdir('./exemplar_dataset')) - 1
        N = min(N, opt.batch_size)
        if task != -1:
            old_loader = torch.utils.data.DataLoader(
                Dataset(opt, 'exemplar'),
                batch_size=N,
                shuffle=True,
                num_workers=opt.num_workers,
                pin_memory=True,
                drop_last=True,
            )

    params = {n: p for n, p in model.named_parameters() if p.requires_grad}
    _means = {}
    for n, p in params.items():
        _means[n] = p.clone().detach()
    precision_matrices = {}  # parameter importance weights
    for n, p in params.items():
        precision_matrices[n] = p.clone().detach().fill_(0)  # initialise to zeros

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader, old_loader,
                                          _means, precision_matrices)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader, old_loader,
                                                  _means, precision_matrices)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                           epoch, model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
# logistic and knn are assumed to be defined earlier in the script, e.g.:
# logistic = LogisticRegression(); knn = KNeighborsClassifier()
rfc = RandomForestClassifier()
sgd = SGDClassifier()
scorer = make_scorer(accuracy_score)
param_grid = [
    # {'alpha': np.linspace(0.00001, 1, 40)},
    {
        'penalty': ['l2'],
        'C': [0.1, 1, 5, 10],
        'solver': ['lbfgs', 'liblinear']
    },
    {
        'n_neighbors': [1, 3, 5, 10]
    },
    {
        'n_estimators': list(range(10, 101, 10)),
        'max_features': list(range(6, 32, 5))
    },
    {
        'average': [True, False],
        'alpha': np.linspace(0.001, 1, 40)
    }
]
model_list = [logistic, knn, rfc, sgd]

grid_search, grid_results, results = random_search_best_estimator(
    scorer, param_grid, model_list, X_train, X_test, y_train, y_test)

results = pd.DataFrame(results)
best_estimator = results['best_estimator'][results['best_score'] ==
                                           results['best_score'].max()].iloc[0]

final_accuracy_test, final_accuracy_train, pred_test, pred_train = final_model(
    X_train, X_test, y_train, y_test, best_estimator)

save_model('../models', 'best_estimator.sav', best_estimator)
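# save_model in the scikit-learn snippet above is not shown. A minimal sketch of what
# such a helper might look like, assuming it simply persists the fitted estimator to
# disk (joblib is one common choice for scikit-learn models; the signature is inferred
# from the call save_model('../models', 'best_estimator.sav', best_estimator)):
import os
import joblib

def save_model(directory, filename, estimator):
    """Persist a fitted scikit-learn estimator to <directory>/<filename>."""
    os.makedirs(directory, exist_ok=True)
    path = os.path.join(directory, filename)
    joblib.dump(estimator, path)
    return path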
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = False
    # torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')
    print(opt.device)

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv, opt.deform_conv,
                         w2=opt.w2, maxpool=opt.maxpool)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model, optimizer,
                                                   opt.resume, opt.lr, opt.lr_step)

    # quantization-aware fine-tuning
    quantize_shufflenetv2_dcn(model, quant_conv=4, quant_bn=None, quant_act=8,
                              wt_quant_mode='symmetric', act_quant_mode='asymmetric',
                              wt_per_channel=True, wt_percentile=True,
                              act_percentile=False, deform_backbone=False,
                              w2=opt.w2, maxpool=opt.maxpool)
    # quantized_model = quantize_sfl_dcn(model, quant_conv=4, quant_bn=None, quant_act=4,
    #                                    quant_mode='symmetric', wt_per_channel=True,
    #                                    wt_percentile=False, act_percentile=False)
    # print(quantized_model)

    # if opt.load_model != '':
    #     model, optimizer, start_epoch = load_model(
    #         model, opt.load_model, optimizer, opt.resume, opt.lr, opt.lr_step)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=1,
                                             pin_memory=True)

    # if opt.test:
    # # if True:
    #     _, preds = trainer.val(0, val_loader)
    #     val_loader.dataset.run_eval(preds, opt.save_dir)
    #     return

    train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                           epoch, model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

    opt.test = True
    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
    logger.close()
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv,
                         number_stacks=opt.number_stacks, fsm=opt.fsm,
                         drmc=opt.drmc, drmr=opt.drmr, only_ls=opt.only_ls)
    optimizer = None
    # optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    optimizer = torch.optim.SGD(model.parameters(), lr=opt.lr, momentum=0.9,
                                weight_decay=0.0005, nesterov=True)
    # optimizer = torch.optim.AdamW(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model, optimizer,
                                                   opt.resume, opt.lr, opt.lr_step,
                                                   opt.finetune)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    if not opt.trainval:
        val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                                 batch_size=1,
                                                 shuffle=False,
                                                 num_workers=1,
                                                 pin_memory=True)

    if not opt.trainval and opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    split = 'trainval' if opt.trainval else 'train'
    train_loader = torch.utils.data.DataLoader(Dataset(opt, split),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                           epoch, model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
            if epoch % opt.cache_model == 0:
                save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                           epoch, model, optimizer)
            else:
                pass
        logger.write('\n')
        if epoch in opt.lr_step:
            if not os.path.exists(os.path.join(opt.save_dir,
                                               'model_{}.pth'.format(epoch))):
                save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                           epoch, model, optimizer)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
def main(opt):
    torch.manual_seed(opt.seed)  # make the random numbers reproducible across runs
    # speeds things up when the network structure and input shapes are fixed
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    # (coco, ctdet) returns the Dataset class (COCO + CTDetDataset)
    Dataset = get_dataset(opt.dataset, opt.task)
    # set the detection-head parameters from the dataset and architecture
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    # model architecture (DLA, hourglass), detection heads, and head conv settings
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    # set up the optimizer over all model parameters
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(
            model, opt.load_model, optimizer, opt.resume, opt.lr, opt.lr_step)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)  # create a trainer instance
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    val_loader = torch.utils.data.DataLoader(
        Dataset(opt, 'val'),
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True
    )

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_loader = torch.utils.data.DataLoader(
        Dataset(opt, 'train'),
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.num_workers,
        pin_memory=True,
        drop_last=True
    )

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'  # mark for model saving
        log_dict_train, _ = trainer.train(epoch, train_loader)  # train for one epoch
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                           epoch, model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
def main(opt):
    patient = PATIENT
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '' and opt.load_model != '_':
        model, optimizer, start_epoch = load_model(model, opt.load_model, optimizer,
                                                   opt.resume, opt.lr, opt.lr_step)
    # model, optimizer = amp.initialize(model.cuda(), optimizer, opt_level="O1")

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=1,
                                             pin_memory=True)

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    def recording(log_dict, prefix, epoch):
        for k, v in log_dict.items():
            logger.scalar_summary(f'{prefix}_{k}', v, epoch)
            logger.write('{} {:8f} | '.format(k, v))

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        recording(log_dict_train, 'train', epoch)
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            recording(log_dict_val, 'val', epoch)
            # breakpoint()
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                           epoch, model)
                patient = PATIENT
            else:
                # early stopping: stop once the metric has not improved for PATIENT validations
                patient -= 1
                # print(colored(f'patient {patient}', 'red'))
                if patient < 0:
                    print(colored(f'{opt.input_h} done', 'green'))
                    break
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
def main(opt):
    # Completely reproducible results are not guaranteed across PyTorch releases,
    # individual commits, or different platforms. Furthermore, results may not be
    # reproducible between CPU and GPU executions, even when using identical seeds.
    # We can use torch.manual_seed() to seed the RNG for all devices (both CPU and CUDA):
    torch.manual_seed(opt.seed)

    # Setting torch.backends.cudnn.benchmark=True makes the program spend a little
    # extra time at startup searching for the fastest convolution algorithm for each
    # conv layer, which speeds up the network afterwards. This is suitable when the
    # network structure is fixed (not dynamic) and the input shapes (batch size, image
    # size, channels) do not change, which covers most common cases. Conversely, if the
    # convolution configuration keeps changing, the program keeps re-tuning and can end
    # up slower instead.
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test

    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.model_name)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model, optimizer,
                                                   opt.resume, opt.lr, opt.lr_step)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print("Setting up data...")
    val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=1,
                                             pin_memory=True)

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        # run evaluation code
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    print("Starting training")
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                           epoch, model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
import mxnet as mx

print('Creating model...')
opt = opts().init()
print(opt.arch)
ctx = [mx.gpu(int(i)) for i in opt.gpus_str.split(',') if i.strip()]
ctx = ctx if ctx else [mx.cpu()]
model = create_model(opt.arch, opt.heads, opt.head_conv, ctx)
model.collect_params().initialize(init=init.Xavier())

X = nd.random.uniform(shape=(16, 3, 512, 512))
print("\t Input shape: ", X.shape)
Y = model(X)
print("output: heatmaps", Y[0]["hm"].shape)
print("output: wh_scale", Y[0]["wh"].shape)
print("output: xy_offset", Y[0]["reg"].shape)

param = model.collect_params()
param_keys = param.keys()
param_keys_residual_1 = [param[param_key] for param_key in param_keys
                         if "hourglassnet0_residual1_conv1_weight" in param_key]
# print(param_keys_residual_1)

flag_save_model = False
if flag_save_model is True:
    print("\n\nSaving model...")
    save_model(model, "./init_params.params")

# call:
# python train.py ctdet --arch hourglass
def main():
    def to_ncwh(x):
        return np.transpose(x, [2, 0, 1])

    def to_tensor(x):
        x = torch.from_numpy(x)
        return x

    def transform_by_keys(x, transform, keys):
        for k, v in x.items():
            if k in keys:
                x[k] = transform(v)
        return x

    import torchvision.transforms as transforms
    data_transform_composed = transforms.Compose([
        lambda x: transform_by_keys(x, to_ncwh, ["needle", "stack"]),
        lambda x: transform_by_keys(x, to_tensor, x.keys())
    ])

    Dataset = CTNumberDataset
    opt = opts().parse()
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    cv2.setNumThreads(0)

    logger = Logger(opt)

    val_loader = torch.utils.data.DataLoader(
        CTNumberDataset(start=100000, length=10000,
                        transform=data_transform_composed,
                        font=get_font(opt.font)),
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True
    )

    train_loader = torch.utils.data.DataLoader(
        CTNumberDataset(start=100000, length=10000,
                        transform=data_transform_composed,
                        font=get_font(opt.font)),
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.num_workers,
        pin_memory=True,
        drop_last=True,
    )

    best = 1e10
    start_epoch = -1
    model = GeneralizedDetector()
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    trainer = CtdetTrainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(
            model, opt.load_model, optimizer, opt.resume, opt.lr, opt.lr_step)

    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                           epoch, model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
def train(model, criterion, train_set, val_set, opt, labels=None):
    # define web visualizer using visdom
    webvis = WebVisualizer(opt)

    # modify learning rate of last layer
    finetune_params = modify_last_layer_lr(model.named_parameters(),
                                           opt.lr, opt.lr_mult_w, opt.lr_mult_b)
    # define optimizer (learning rate hard-coded; the opt.lr variant is kept commented out)
    # optimizer = optim.Adam(finetune_params, opt.lr)
    optimizer = optim.Adam(finetune_params, 0.000001)
    # define learning rate scheduler
    # scheduler = optim.lr_scheduler.StepLR(optimizer,
    #                                       step_size=opt.lr_decay_in_epoch,
    #                                       gamma=opt.gamma)

    if labels is not None:
        rid2name, id2rid = labels

    # record forward and backward times
    train_batch_num = len(train_set)
    total_batch_iter = 0
    logging.info("####################Train Model###################")
    for epoch in range(opt.sum_epoch):
        epoch_start_t = time.time()
        epoch_batch_iter = 0
        logging.info('Begin of epoch %d' % (epoch))
        for i, data in enumerate(train_set):
            iter_start_t = time.time()
            # train
            inputs, targets = data
            # print(i, targets)
            if opt.mode == 'Train':
                output, loss, loss_list = forward_batch(
                    model, criterion, inputs, targets, opt, "Train")
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            elif opt.mode == 'Test-Train':
                # use batchsize == 1; only step when the batch accuracy
                # reaches THRES (assumed to be defined at module level)
                output, loss, loss_list = forward_batch(
                    model, criterion, inputs, targets, opt, "Test-Train")
                batch_accuracy = calc_accuracy(output, targets,
                                               opt.score_thres, opt, opt.top_k)
                if batch_accuracy[1] >= THRES:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            webvis.reset()
            epoch_batch_iter += 1
            total_batch_iter += 1

            # display train loss and accuracy
            if total_batch_iter % opt.display_train_freq == 0:
                # accuracy
                batch_accuracy = calc_accuracy(output, targets,
                                               opt.score_thres, opt, opt.top_k)
                util.print_loss(loss_list, "Train", epoch, total_batch_iter)
                util.print_accuracy(batch_accuracy, "Train", epoch, total_batch_iter)
                if opt.display_id > 0:
                    x_axis = epoch + float(epoch_batch_iter) / train_batch_num
                    # TODO support accuracy visualization of multiple top_k
                    plot_accuracy = [batch_accuracy[i][opt.top_k[0]]
                                     for i in range(len(batch_accuracy))]
                    accuracy_list = [item["ratio"] for item in plot_accuracy]
                    webvis.plot_points(x_axis, loss_list, "Loss", "Train")
                    webvis.plot_points(x_axis, accuracy_list, "Accuracy", "Train")

            # display train data
            if total_batch_iter % opt.display_data_freq == 0:
                image_list = list()
                show_image_num = int(np.ceil(opt.display_image_ratio * inputs.size()[0]))
                for index in range(show_image_num):
                    input_im = util.tensor2im(inputs[index], opt.mean, opt.std)
                    class_label = "Image_" + str(index)
                    if labels is not None:
                        target_ids = [targets[i][index] for i in range(opt.class_num)]
                        rids = [id2rid[j][k] for j, k in enumerate(target_ids)]
                        class_label += "_"
                        class_label += "#".join(
                            [rid2name[j][k] for j, k in enumerate(rids)])
                    image_list.append((class_label, input_im))
                image_dict = OrderedDict(image_list)
                save_result = total_batch_iter % opt.update_html_freq
                webvis.plot_images(image_dict, opt.display_id + 2 * opt.class_num,
                                   epoch, save_result)

            # validate and display validation loss and accuracy
            if len(val_set) > 0 and total_batch_iter % opt.display_validate_freq == 0:
                val_accuracy, val_loss = validate(model, criterion, val_set, opt)
                x_axis = epoch + float(epoch_batch_iter) / train_batch_num
                accuracy_list = [val_accuracy[i][opt.top_k[0]]["ratio"]
                                 for i in range(len(val_accuracy))]
                util.print_loss(val_loss, "Validate", epoch, total_batch_iter)
                util.print_accuracy(val_accuracy, "Validate", epoch, total_batch_iter)
                if opt.display_id > 0:
                    webvis.plot_points(x_axis, val_loss, "Loss", "Validate")
                    webvis.plot_points(x_axis, accuracy_list, "Accuracy", "Validate")

            # save snapshot
            if total_batch_iter % opt.save_batch_iter_freq == 0:
                logging.info("saving the latest model (epoch %d, total_batch_iter %d)"
                             % (epoch, total_batch_iter))
                save_model(model, opt, epoch)
                # TODO snapshot loss and accuracy

        logging.info('End of epoch %d / %d \t Time Taken: %d sec'
                     % (epoch, opt.sum_epoch, time.time() - epoch_start_t))

        if epoch % opt.save_epoch_freq == 0:
            logging.info('saving the model at the end of epoch %d, iters %d'
                         % (epoch + 1, total_batch_iter))
            save_model(model, opt, epoch + 1)

        # adjust learning rate
        # scheduler.step()
        # lr = optimizer.param_groups[0]['lr']
        # logging.info('learning rate = %.7f epoch = %d' % (lr, epoch))

    logging.info("--------Optimization Done--------")
def train(args, config, model):
    tokenizer = pickle.load(open(config.filename_idx2word, 'rb'))
    max_score = 0.0

    # optimizer, wrapped by the custom Optim scheduler
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr,
                                 betas=(0.9, 0.999), eps=1e-9)
    optim = Optim(optimizer, config)

    # KLDivLoss with label smoothing
    loss_func = LabelSmothingLoss(config)

    # data
    train_loader = data_load(config.filename_trimmed_train, config.batch_size, True)

    # # display the result
    # f = open('data/clean/data_char/src_index2word.pkl', 'rb')
    # idx2word = pickle.load(f)

    for e in range(args.checkpoint, args.epoch):
        model.train()
        all_loss = 0
        num = 0
        for step, batch in enumerate(tqdm(train_loader)):
            x, y = batch
            word = y.ne(config.pad).sum().item()
            num += word
            if torch.cuda.is_available():
                x = x.cuda()
                y = y.cuda()
            out = model(x, y)
            loss = loss_func(out, y)
            all_loss += loss.item()
            if step % 200 == 0:
                print('epoch:', e, '|step:', step,
                      '|train_loss: %.4f' % (loss.item() / word))

            # scale the loss for gradient accumulation
            loss = loss / config.accumulation_steps
            loss.backward()
            if ((step + 1) % config.accumulation_steps) == 0:
                optim.updata()   # update step of the custom Optim wrapper
                optim.zero_grad()

            # ###########################
            # if step == 2:
            #     break
            # ###########################

            # if step % 500 == 0:
            #     test(e, config, model, loss_func)

            if step != 0 and step % 5000 == 0:
                filename = config.filename_model + 'model_' + str(step) + '.pkl'
                save_model(model, filename)
                # test(e, config, model, loss_func)

        # train loss
        loss = all_loss / num
        print('epoch:', e, '|train_loss: %.4f' % loss)

        # test
        score = test(e, config, model, loss_func, tokenizer)
        if score > max_score:
            max_score = score
            filename = config.filename_model + 'model.pkl'
            save_model(model, filename)
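The loop above relies on the project-specific Optim wrapper for the update and on gradient accumulation over config.accumulation_steps mini-batches. A minimal sketch of the same accumulation pattern with a plain torch.optim optimizer (the accumulation_steps value and the toy model are assumptions for illustration) is:

import torch

# Minimal sketch (assumption): gradient accumulation with a plain optimizer.
# Each mini-batch loss is divided by accumulation_steps so the accumulated
# gradient matches one large batch; the optimizer only steps every
# accumulation_steps iterations.
accumulation_steps = 4
model = torch.nn.Linear(128, 10)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = torch.nn.CrossEntropyLoss()

for step in range(100):
    x = torch.randn(8, 128)                     # toy mini-batch
    y = torch.randint(0, 10, (8,))
    loss = loss_fn(model(x), y) / accumulation_steps
    loss.backward()                             # gradients accumulate
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()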
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark

    Dataset = get_dataset()
    Dataset.default_resolution = [512, 512]

    if opt.resume_labels:
        train_labels, valid_labels, test_labels, class_name = read_data(
            opt.data_dir, opt.resume_labels)
        Dataset.num_classes = len(class_name)
        opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
        logger = Logger(opt)
        np.random.shuffle(train_labels)
    else:
        gt_labels, class_name = read_data(opt.data_dir, opt.resume_labels)
        Dataset.num_classes = len(class_name)
        opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
        np.random.shuffle(gt_labels)
        logger = Logger(opt)
        train_labels, valid_labels, test_labels = selective_folding(
            gt_labels, class_name, logger)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.model_name, Dataset.num_classes)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    print(model)

    start_epoch = 0
    if opt.load_model != '':
        # load a checkpoint only when a path is provided
        model, optimizer, start_epoch = load_model(
            model, opt.load_model, optimizer, opt.resume, opt.lr, opt.lr_step)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    train_set = Dataset(opt, train_labels, class_name)
    train_loader = torch.utils.data.DataLoader(
        train_set,
        sampler=ImbalancedDatasetSampler(train_set),
        batch_size=int(opt.batch_size / opt.subdivision),
        shuffle=False,
        num_workers=opt.num_workers,
        pin_memory=True,
        drop_last=True
    )

    test_opt = copy.deepcopy(opt)
    test_opt.phase = "test"
    valid_set = Dataset(test_opt, valid_labels, class_name)
    valid_loader = torch.utils.data.DataLoader(
        valid_set, batch_size=1, shuffle=False,
        num_workers=test_opt.num_workers, pin_memory=True, drop_last=True
    )
    test_set = Dataset(test_opt, test_labels, class_name)
    test_loader = torch.utils.data.DataLoader(
        test_set, batch_size=1, shuffle=False,
        num_workers=test_opt.num_workers, pin_memory=True, drop_last=True
    )

    # train
    print('Starting training...')
    max_acc_epoch_dir = os.path.join(opt.save_dir, 'model_max_acc.pth')
    if os.path.exists(max_acc_epoch_dir):
        checkpoint = torch.load(max_acc_epoch_dir,
                                map_location=lambda storage, loc: storage)
        max_acc_epoch = checkpoint['epoch']
        max_acc = checkpoint['valid_acc']
    else:
        max_acc_epoch = -1
        max_acc = 0

    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        log_dict_valid, _ = trainer.test(epoch, valid_loader)
        logger.write('epoch: {} |'.format(epoch))
        logger.write('train loss {:8f} | '.format(log_dict_train['loss']))
        logger.write('valid loss {:8f} | '.format(log_dict_valid['loss']))
        logger.write('valid acc {:8f} | '.format(log_dict_valid['acc']))
        logger.write('\n')

        if max_acc < log_dict_valid['acc']:
            max_acc_epoch = epoch
            max_acc = log_dict_valid['acc']
            save_model(max_acc_epoch_dir, epoch, model, optimizer, max_acc)
        save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                   epoch, model, optimizer)

        if epoch in opt.lr_step:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()

    # test
    print('Final testing...')
    logger.open("log(test).txt")
    model, optimizer, start_epoch = load_model(
        model, max_acc_epoch_dir, optimizer, True, opt.lr, opt.lr_step)
    Tester = train_factory[opt.task]
    tester = Tester(opt, model, optimizer)
    tester.set_device(opt.gpus, opt.chunk_sizes, opt.device)
    log_dict_test, _ = tester.test(start_epoch, test_loader)
    logger.write('test model: {}, epoch: {}\n'.format(max_acc_epoch_dir, start_epoch))
    for k, v in log_dict_test.items():
        logger.write('{} {} | '.format(k, v))
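The train loader above uses an ImbalancedDatasetSampler to oversample rare classes. A roughly equivalent sketch with PyTorch's built-in WeightedRandomSampler (the label-extraction helper and the toy labels are assumptions; the real dataset may expose labels differently) is:

from collections import Counter
from torch.utils.data import WeightedRandomSampler

# Minimal sketch (assumption): weight each sample by the inverse frequency of
# its class so that mini-batches are approximately class-balanced.
def make_balanced_sampler(labels):
    counts = Counter(labels)
    weights = [1.0 / counts[y] for y in labels]
    return WeightedRandomSampler(weights, num_samples=len(labels), replacement=True)

# Example with toy labels; in the snippet above the labels would come from train_set.
labels = [0, 0, 0, 0, 1, 1, 2]
sampler = make_balanced_sampler(labels)
# loader = torch.utils.data.DataLoader(train_set, sampler=sampler,
#                                      batch_size=..., drop_last=True)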
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)

    logger = Logger(opt)
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = create_optimizer(model, opt)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    if opt.mixed_precision:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=opt.opt_level,
                                          max_loss_scale=opt.max_loss_scale)
        print('Using amp with opt level %s...' % opt.opt_level)
    else:
        amp = None

    meta = {'it': 0, 'epoch': 0}
    if opt.load_model != '':
        model, optimizer, amp, meta = load_model(model, opt.load_model, optimizer,
                                                 amp, opt.resume, opt.lr, opt.lr_step)
    start_it = meta['it']
    start_epoch = meta['epoch']

    print('Setting up data...')
    val_dataset = Dataset(opt, 'val')
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1,
                                             shuffle=False, num_workers=1,
                                             pin_memory=True)

    if opt.task == 'car_pose_6dof':
        # pass loaded 3D models for debug visualisations
        trainer.set_models(val_dataset.models)

    if opt.use_swa and start_it > opt.swa_start:
        if opt.test or opt.save_avg_weights:
            optimizer.swap_swa_sgd()
            train_dataset = Dataset(opt, 'train')
            train_loader = create_train_loader(train_dataset, opt)
            trainer.bn_update(train_loader)
        if opt.save_avg_weights:
            path = os.path.join(opt.save_dir, 'model_%d_avg.pth' % start_epoch)
            save_model(path, meta, model)

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_dataset = Dataset(opt, 'train')
    train_loader = create_train_loader(train_dataset, opt)

    print('Starting training from {} epoch ({} global step)...'.format(
        start_epoch, start_it))
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        meta['it'] += len(train_loader)
        meta['epoch'] = epoch
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))

        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       meta, model, optimizer, amp)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                           meta, model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       meta, model, optimizer, amp)
        logger.write('\n')

        if epoch in opt.lr_step:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       meta, model, optimizer, amp)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
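The snippet above wires up NVIDIA's apex amp for mixed precision. A minimal sketch of the same idea with PyTorch's built-in torch.cuda.amp (autocast plus GradScaler), shown as an alternative rather than this project's code, is:

import torch

# Minimal sketch (assumption): a native mixed-precision training step, the
# torch.cuda.amp counterpart of apex's amp.initialize / scaled-loss pattern.
# Requires a CUDA device.
model = torch.nn.Linear(64, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scaler = torch.cuda.amp.GradScaler()

for _ in range(10):
    x = torch.randn(32, 64, device='cuda')
    y = torch.randint(0, 10, (32,), device='cuda')
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():          # run the forward pass in fp16 where safe
        loss = torch.nn.functional.cross_entropy(model(x), y)
    scaler.scale(loss).backward()            # scale the loss to avoid fp16 underflow
    scaler.step(optimizer)                   # unscale gradients, then optimizer.step()
    scaler.update()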
def train(self, cfg):
    # set up the GPU environment, covering both single- and multi-GPU cases
    gpus_str = ''
    if isinstance(cfg.gpus, (list, tuple)):
        cfg.gpus = [int(i) for i in cfg.gpus]
        for s in cfg.gpus:
            gpus_str += str(s) + ','
        gpus_str = gpus_str[:-1]
    else:
        gpus_str = str(int(cfg.gpus))
        cfg.gpus = [int(cfg.gpus)]
    os.environ['CUDA_VISIBLE_DEVICES'] = gpus_str
    cfg.gpus = [i for i in range(len(cfg.gpus))] if cfg.gpus[0] >= 0 else [-1]

    # set up logging
    model_dir = os.path.join(cfg.save_dir, cfg.id)
    debug_dir = os.path.join(model_dir, 'debug')
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    if not os.path.exists(debug_dir):
        os.makedirs(debug_dir)
    logger = setup_logger(cfg.id, os.path.join(model_dir, 'log'))
    if USE_TENSORBOARD:
        writer = tensorboardX.SummaryWriter(log_dir=os.path.join(model_dir, 'log'))
    logger.info(cfg)

    gpus = cfg.gpus
    device = torch.device('cpu' if gpus[0] < 0 else 'cuda')
    lr = cfg.lr
    lr_step = cfg.lr_step
    num_epochs = cfg.num_epochs
    val_step = cfg.val_step
    sample_size = cfg.sample_size

    # set up the dataset
    dataset = YOLO(cfg.data_dir, cfg.hflip, cfg.vflip, cfg.rotation, cfg.scale,
                   cfg.shear, opt=cfg, split='train')
    names = dataset.class_name
    std = dataset.std
    mean = dataset.mean
    # configure the prediction heads from the number of dataset classes
    cfg.setup_head(dataset)
    trainloader = DataLoader(dataset, batch_size=cfg.batch_size, shuffle=True,
                             num_workers=cfg.num_workers, pin_memory=True,
                             drop_last=True)
    # val_dataset = YOLO(cfg.data_dir, cfg.hflip, cfg.vflip, cfg.rotation,
    #                    cfg.scale, cfg.shear, opt=cfg, split='val')
    # valloader = DataLoader(val_dataset, batch_size=1, shuffle=True,
    #                        num_workers=1, pin_memory=True)
    valid_file = cfg.val_dir if not cfg.val_dir == '' else os.path.join(
        cfg.data_dir, 'valid.txt')
    with open(valid_file, 'r') as f:
        val_list = [l.rstrip() for l in f.readlines()]

    net = create_model(cfg.arch, cfg.heads, cfg.head_conv, cfg.down_ratio,
                       cfg.filters)
    optimizer = optim.Adam(net.parameters(), lr=lr)
    start_epoch = 0
    if cfg.resume:
        pretrain = os.path.join(model_dir, 'model_last.pth')
        if os.path.exists(pretrain):
            print('resume model from %s' % pretrain)
            try:
                net, optimizer, start_epoch = load_model(
                    net, pretrain, optimizer, True, lr, lr_step)
            except Exception:
                print('\t... loading model error: ckpt may not be compatible')

    model = ModleWithLoss(net, CtdetLoss(cfg))
    if len(gpus) > 1:
        model = nn.DataParallel(model, device_ids=gpus).to(device)
    else:
        model = model.to(device)

    step = 0
    best = 1e10
    log_loss_stats = ['loss', 'hm_loss', 'wh_loss']
    if cfg.reg_offset:
        log_loss_stats += ['off_loss']
    if cfg.reg_obj:
        log_loss_stats += ['obj_loss']

    for epoch in range(start_epoch + 1, num_epochs + 1):
        avg_loss_stats = {l: AverageMeter() for l in log_loss_stats}
        model.train()
        with tqdm(trainloader) as loader:
            for _, batch in enumerate(loader):
                for k in batch:
                    if k != 'meta':
                        batch[k] = batch[k].to(device=device, non_blocking=True)
                output, loss, loss_stats = model(batch)
                loss = loss.mean()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # update the tqdm progress display
                lr = optimizer.param_groups[0]['lr']
                poststr = ''
                for l in avg_loss_stats:
                    avg_loss_stats[l].update(loss_stats[l].mean().item(),
                                             batch['input'].size(0))
                    poststr += '{}: {:.4f}; '.format(l, avg_loss_stats[l].avg)
                loader.set_description('Epoch %d' % (epoch))
                poststr += 'lr: {:.4f}'.format(lr)
                loader.set_postfix_str(poststr)
                step += 1
                # self.lossSignal.emit(loss.item(), step)
                del output, loss, loss_stats

                # valid
                if step % val_step == 0:
                    if len(cfg.gpus) > 1:
                        val_model = model.module
                    else:
                        val_model = model
                    val_model.eval()
                    torch.cuda.empty_cache()

                    # randomly sample validation images
                    idx = np.arange(len(val_list))
                    idx = np.random.permutation(idx)[:sample_size]
                    for j, id in enumerate(idx):
                        image = cv2.imread(val_list[id])
                        image = self.preprocess(image, cfg.input_h, cfg.input_w,
                                                mean, std)
                        image = image.to(device)
                        with torch.no_grad():
                            output = val_model.model(image)[-1]

                        # draw and save debug visualisations
                        debugger = Debugger(dataset=names, down_ratio=cfg.down_ratio)
                        reg = output['reg'] if cfg.reg_offset else None
                        obj = output['obj'] if cfg.reg_obj else None
                        dets = ctdet_decode(output['hm'].sigmoid_(), output['wh'],
                                            reg=reg, obj=obj,
                                            cat_spec_wh=cfg.cat_spec_wh, K=cfg.K)
                        dets = dets.detach().cpu().numpy().reshape(-1, dets.shape[2])
                        dets[:, :4] *= cfg.down_ratio
                        image = image[0].detach().cpu().numpy().transpose(1, 2, 0)
                        image = np.clip(((image * std + mean) * 255.), 0,
                                        255).astype(np.uint8)
                        pred = debugger.gen_colormap(
                            output['hm'][0].detach().cpu().numpy())
                        debugger.add_blend_img(image, pred, 'pred_hm')
                        debugger.add_img(image, img_id='out_pred')
                        for k in range(len(dets)):
                            if dets[k, 4] > cfg.vis_thresh:
                                debugger.add_coco_bbox(dets[k, :4], dets[k, -1],
                                                       dets[k, 4], img_id='out_pred')
                        debugger.save_all_imgs(debug_dir,
                                               prefix='{}.{}_'.format(step, j))
                        del output, image, dets

                    # save model weights
                    save_model(os.path.join(model_dir, 'model_best.pth'), epoch, net)
                    model.train()

        logstr = 'epoch {}'.format(epoch)
        for k, v in avg_loss_stats.items():
            logstr += ' {}: {:.4f};'.format(k, v.avg)
            if USE_TENSORBOARD:
                writer.add_scalar('train_{}'.format(k), v.avg, epoch)
        logger.info(logstr)

        # if epoch % val_step == 0:
        #     if len(cfg.gpus) > 1:
        #         val_model = model.module
        #     else:
        #         val_model = model
        #     val_model.eval()
        #     torch.cuda.empty_cache()
        #
        #     val_loss_stats = {l: AverageMeter() for l in log_loss_stats}
        #
        #     with tqdm(valloader) as loader:
        #         for j, batch in enumerate(loader):
        #             for k in batch:
        #                 if k != 'meta':
        #                     batch[k] = batch[k].to(device=device, non_blocking=True)
        #             with torch.no_grad():
        #                 output, loss, loss_stats = val_model(batch)
        #
        #             poststr = ''
        #             for l in val_loss_stats:
        #                 val_loss_stats[l].update(
        #                     loss_stats[l].mean().item(), batch['input'].size(0))
        #                 poststr += '{}: {:.4f}; '.format(l, val_loss_stats[l].avg)
        #             loader.set_description('Epoch %d valid' % (epoch))
        #             poststr += 'lr: {:.4f}'.format(lr)
        #             loader.set_postfix_str(poststr)
        #
        #             if j < sample_size:
        #                 # draw predictions and save them as jpg images
        #                 debugger = Debugger(dataset=names, down_ratio=cfg.down_ratio)
        #                 reg = output['reg'] if cfg.reg_offset else None
        #                 obj = output['obj'] if cfg.reg_obj else None
        #                 dets = ctdet_decode(
        #                     output['hm'], output['wh'], reg=reg, obj=obj,
        #                     cat_spec_wh=cfg.cat_spec_wh, K=cfg.K)
        #                 dets = dets.detach().cpu().numpy().reshape(1, -1, dets.shape[2])
        #                 dets[:, :, :4] *= cfg.down_ratio
        #                 dets_gt = batch['meta']['gt_det'].numpy().reshape(1, -1, dets.shape[2])
        #                 dets_gt[:, :, :4] *= cfg.down_ratio
        #                 for i in range(1):
        #                     img = batch['input'][i].detach().cpu().numpy().transpose(1, 2, 0)
        #                     img = np.clip(((img * std + mean) * 255.), 0, 255).astype(np.uint8)
        #                     pred = debugger.gen_colormap(output['hm'][i].detach().cpu().numpy())
        #                     gt = debugger.gen_colormap(batch['hm'][i].detach().cpu().numpy())
        #                     debugger.add_blend_img(img, pred, 'pred_hm')
        #                     debugger.add_blend_img(img, gt, 'gt_hm')
        #                     debugger.add_img(img, img_id='out_pred')
        #                     for k in range(len(dets[i])):
        #                         if dets[i, k, 4] > cfg.vis_thresh:
        #                             debugger.add_coco_bbox(dets[i, k, :4], dets[i, k, -1],
        #                                                    dets[i, k, 4], img_id='out_pred')
        #
        #                     debugger.add_img(img, img_id='out_gt')
        #                     for k in range(len(dets_gt[i])):
        #                         if dets_gt[i, k, 4] > cfg.vis_thresh:
        #                             debugger.add_coco_bbox(dets_gt[i, k, :4], dets_gt[i, k, -1],
        #                                                    dets_gt[i, k, 4], img_id='out_gt')
        #
        #                     debugger.save_all_imgs(debug_dir, prefix='{}.{}_'.format(epoch, j))
        #             del output, loss, loss_stats
        #     model.train()
        #     logstr = 'epoch {} valid'.format(epoch)
        #     for k, v in val_loss_stats.items():
        #         logstr += ' {}: {:.4f};'.format(k, v.avg)
        #         if USE_TENSORBOARD:
        #             writer.add_scalar('val_{}'.format(k), v.avg, epoch)
        #     logger.info(logstr)
        #     if val_loss_stats['loss'].avg < best:
        #         best = val_loss_stats['loss'].avg
        #         save_model(os.path.join(model_dir, 'model_best.pth'), epoch, net)

        save_model(os.path.join(model_dir, 'model_last.pth'), epoch, net, optimizer)
        if epoch in cfg.lr_step:
            save_model(os.path.join(model_dir, 'model_{}.pth'.format(epoch)),
                       epoch, net, optimizer)
            lr = cfg.lr * (0.1 ** (cfg.lr_step.index(epoch) + 1))
            logger.info('Drop LR to {}'.format(lr))
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
def train(model, criterion, train_set, val_set, opt, labels=None):
    # define web visualizer using visdom
    webvis = WebVisualizer(opt)

    # modify learning rate of last layer
    finetune_params = modify_last_layer_lr(model.named_parameters(),
                                           opt.lr, opt.lr_mult_w, opt.lr_mult_b)
    # define optimizer
    optimizer = optim.SGD(finetune_params,
                          opt.lr,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)
    # define learning rate scheduler
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=opt.lr_decay_in_epoch,
                                          gamma=opt.gamma)

    if labels is not None:
        rid2name, id2rid = labels

    # record forward and backward times
    train_batch_num = len(train_set)
    total_batch_iter = 0
    logging.info("####################Train Model###################")
    for epoch in range(opt.sum_epoch):
        epoch_start_t = time.time()
        epoch_batch_iter = 0
        logging.info('Begin of epoch %d' % (epoch))
        for i, data in enumerate(train_set):
            iter_start_t = time.time()
            # train
            inputs, targets = data
            output, loss, loss_list = forward_batch(model, criterion, inputs,
                                                    targets, opt, "Train")
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            webvis.reset()
            epoch_batch_iter += 1
            total_batch_iter += 1

            # display train loss and accuracy
            if total_batch_iter % opt.display_train_freq == 0:
                # accuracy
                batch_accuracy = calc_accuracy(output, targets,
                                               opt.score_thres, opt.top_k)
                util.print_loss(loss_list, "Train", epoch, total_batch_iter)
                util.print_accuracy(batch_accuracy, "Train", epoch, total_batch_iter)
                if opt.display_id > 0:
                    x_axis = epoch + float(epoch_batch_iter) / train_batch_num
                    # TODO support accuracy visualization of multiple top_k
                    plot_accuracy = [batch_accuracy[i][opt.top_k[0]]
                                     for i in range(len(batch_accuracy))]
                    accuracy_list = [item["ratio"] for item in plot_accuracy]
                    webvis.plot_points(x_axis, loss_list, "Loss", "Train")
                    webvis.plot_points(x_axis, accuracy_list, "Accuracy", "Train")

            # display train data
            if total_batch_iter % opt.display_data_freq == 0:
                image_list = list()
                show_image_num = int(np.ceil(opt.display_image_ratio * inputs.size()[0]))
                for index in range(show_image_num):
                    input_im = util.tensor2im(inputs[index], opt.mean, opt.std)
                    class_label = "Image_" + str(index)
                    if labels is not None:
                        target_ids = [targets[i][index] for i in range(opt.class_num)]
                        rids = [id2rid[j][k] for j, k in enumerate(target_ids)]
                        class_label += "_"
                        class_label += "#".join([rid2name[j][k]
                                                 for j, k in enumerate(rids)])
                    image_list.append((class_label, input_im))
                image_dict = OrderedDict(image_list)
                save_result = total_batch_iter % opt.update_html_freq
                webvis.plot_images(image_dict, opt.display_id + 2 * opt.class_num,
                                   epoch, save_result)

            # validate and display validation loss and accuracy
            if len(val_set) > 0 and total_batch_iter % opt.display_validate_freq == 0:
                val_accuracy, val_loss = validate(model, criterion, val_set, opt)
                x_axis = epoch + float(epoch_batch_iter) / train_batch_num
                accuracy_list = [val_accuracy[i][opt.top_k[0]]["ratio"]
                                 for i in range(len(val_accuracy))]
                util.print_loss(val_loss, "Validate", epoch, total_batch_iter)
                util.print_accuracy(val_accuracy, "Validate", epoch, total_batch_iter)
                if opt.display_id > 0:
                    webvis.plot_points(x_axis, val_loss, "Loss", "Validate")
                    webvis.plot_points(x_axis, accuracy_list, "Accuracy", "Validate")

            # save snapshot
            if total_batch_iter % opt.save_batch_iter_freq == 0:
                logging.info("saving the latest model (epoch %d, total_batch_iter %d)"
                             % (epoch, total_batch_iter))
                save_model(model, opt, epoch)
                # TODO snapshot loss and accuracy

        logging.info('End of epoch %d / %d \t Time Taken: %d sec'
                     % (epoch, opt.sum_epoch, time.time() - epoch_start_t))

        if epoch % opt.save_epoch_freq == 0:
            logging.info('saving the model at the end of epoch %d, iters %d'
                         % (epoch + 1, total_batch_iter))
            save_model(model, opt, epoch + 1)

        # adjust learning rate
        scheduler.step()
        lr = optimizer.param_groups[0]['lr']
        logging.info('learning rate = %.7f epoch = %d' % (lr, epoch))

    logging.info("--------Optimization Done--------")
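Most of the PyTorch loops above call save_model(path, epoch, model, optimizer) when checkpointing and a matching load_model when resuming, but those helpers are defined elsewhere. A minimal sketch with the same call shapes (an assumption about the real helpers, which may store additional metadata) is:

import torch

# Minimal sketch (assumption): checkpoint helpers matching the calls above --
# save_model(path, epoch, model, optimizer=None) and a load that restores the
# weights and, when resuming, the optimizer state and starting epoch.
def save_model(path, epoch, model, optimizer=None):
    state_dict = (model.module if hasattr(model, 'module') else model).state_dict()
    data = {'epoch': epoch, 'state_dict': state_dict}
    if optimizer is not None:
        data['optimizer'] = optimizer.state_dict()
    torch.save(data, path)

def load_model(model, path, optimizer=None, resume=False):
    checkpoint = torch.load(path, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'], strict=False)
    start_epoch = 0
    if resume and optimizer is not None and 'optimizer' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
    return model, optimizer, start_epoch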