Example #1
def train(model,
          training_parameters,
          model_choice,
          target,
          model_name_suffix=''):
    optimizer = tf.keras.optimizers.Adam(1e-2)
    log_joint_pdf = get_log_joint_pdf(training_parameters['name'])

    # Early stopping
    best_loss = 1e20
    last_improvement = 0
    max_consecutive_no_improvement = 15000
    min_epoch_checkpoint = 1
    checkpoint_tol = 0.02
    saved_checkpoint = False

    # Monitor training loss for visualisation
    loss_monitor = []
    for epoch in range(1, training_parameters['epochs']):
        loss = compute_apply_gradients(model, optimizer, log_joint_pdf)

        # Checkpoint only when the loss beats the best seen so far by more than
        # checkpoint_tol (relative) and we are past min_epoch_checkpoint.
        if loss < best_loss:
            if ((best_loss - loss) / np.abs(best_loss) >
                    checkpoint_tol) and (epoch > min_epoch_checkpoint):
                print(
                    f"    - CHECKPOINT for epoch {epoch + 1}, current best loss {loss}"
                )
                save_model(model,
                           model_choice,
                           target,
                           model_name_suffix=model_name_suffix)
                best_loss = loss
                last_improvement = 0
                saved_checkpoint = True

        else:
            last_improvement += 1
        if last_improvement >= max_consecutive_no_improvement:
            print(f"    - STOPPED after {epoch} epochs")
            break

        if epoch % 100 == 0:
            print(f"Epoch {epoch}, loss: {loss}")
            loss_monitor.append(loss)

    plt.figure()
    plt.plot(loss_monitor, color='slategrey')
    plt.xlabel('Epochs (x100)')
    plt.ylabel('-ELBO(q)')

    if saved_checkpoint:
        model = load_model(model_choice,
                           training_parameters,
                           model_name_suffix=model_name_suffix)

    return model
Example #2
def train(model, criterion_softmax, criterion_binary, train_set, val_set, opt):
    # define web visualizer using visdom
    #webvis = WebVisualizer(opt)

    # modify learning rate of last layer
    finetune_params = modify_last_layer_lr(model.named_parameters(),
                                           opt.lr, opt.lr_mult_w, opt.lr_mult_b)
    # define optimizer
    optimizer = optim.SGD(finetune_params,
                          opt.lr,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)
    # define learning rate scheduler
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=opt.lr_decay_in_epoch,
                                          gamma=opt.gamma)


    # record forward and backward times
    train_batch_num = len(train_set)
    total_batch_iter = 0
    logging.info("####################Train Model###################")
    for epoch in range(opt.sum_epoch):
       # epoch_start_t = time.time()
        epoch_batch_iter = 0
        logging.info('Begin of epoch %d' % (epoch))
        for i, data in enumerate(train_set):
           # iter_start_t = time.time()
            # train
            inputs, target_softmax, target_binary = data
            output, loss, loss_list = forward_batch(model, criterion_softmax, criterion_binary,
                                                    inputs, target_softmax, target_binary, opt, "Train")
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

           # webvis.reset()
            epoch_batch_iter += 1
            total_batch_iter += 1

        # logging.info('End of epoch %d / %d \t Time Taken: %d sec' %
        #              (epoch, opt.sum_epoch, time.time() - epoch_start_t))

        if epoch % opt.save_epoch_freq == 0:
            logging.info('saving the model at the end of epoch %d, iters %d' % (epoch + 1, total_batch_iter))
            save_model(model, opt, epoch + 1)

        # adjust learning rate
        scheduler.step()
        lr = optimizer.param_groups[0]['lr']
        logging.info('learning rate = %.7f epoch = %d' % (lr, epoch))
    logging.info("--------Optimization Done--------")
Example #3
def do_eval(opt, epoch, model, DatasetFactory, logger, best):
    # Based on test.py's non-prefetched code path:
    Detector = detector_factory[opt.task]
    dataset = DatasetFactory(opt, "val")
    detector = Detector(opt, model)
    results = {}
    num_iters = len(dataset)
    bar = Bar('{}'.format(opt.exp_id), max=num_iters)
    time_stats = ['tot', 'load', 'pre', 'net', 'dec', 'post', 'merge']
    avg_time_stats = {t: AverageMeter() for t in time_stats}
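    # Running averages of the detector's per-stage timings (total, load, pre, net, decode, post, merge).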
    for ind in range(num_iters):
        img_id = dataset.images[ind]
        img_info = dataset.coco.loadImgs(ids=[img_id])[0]
        img_path = os.path.join(dataset.img_dir, img_info['file_name'])

        if opt.task == 'ddd':
            ret = detector.run(img_path, img_info['calib'])
        else:
            ret = detector.run(img_path)

        results[img_id] = ret['results']
        Bar.suffix = '[{0}/{1}]|Tot: {total:} |ETA: {eta:} '.format(
                        ind, num_iters, total=bar.elapsed_td, eta=bar.eta_td)
        for t in avg_time_stats:
            avg_time_stats[t].update(ret[t])
            Bar.suffix = Bar.suffix + '|{} {:.3f} '.format(t, avg_time_stats[t].avg)
        bar.next()
    bar.finish()
    metric = float("-inf")
    # Capture metric of interest, e.g., for COCO eval, something like AP50:
    eval_stats = dataset.run_eval(results, opt.save_dir, logger)
    if uses_coco_eval(opt):
        ap50 = eval_stats[1]
        ap25 = eval_stats[12]
        metric = ap25
        # Log results to log.txt and/or tensorboard:
        logger.scalar_summary("val_ap50", ap50, epoch)
        logger.scalar_summary("val_ap25", ap25, epoch)
    else:
        # Pascal VOC:
        metric = eval_stats["Mean AP"]
        # Log results to log.txt and/or tensorboard:
        logger.scalar_summary("mean_AP", metric, epoch)

    # Best model checkpointing:
    if metric > best:
        best = metric
        save_model(
            os.path.join(opt.save_dir, "model_best.pth"), epoch, model
        )
    return best
Example #4
def teacher_train(cfg, start_epoch):
    torch.manual_seed(cfg.SEED)
    device = torch.device('cuda' if cfg.GPU[0] >= 0 else 'cpu')
    if start_epoch == 1:
        train_log = open(os.path.join(cfg.LOG_DIR, "train_log.csv"), 'w')
        train_log_title = "epoch,total_loss,hm_loss,wh_loss"
        val_log = open(os.path.join(cfg.LOG_DIR, "val_log.csv"), 'w')
        val_log_title = "epoch,precision,recall\n"
        if cfg.USE_OFFSET:
            train_log_title += ",offset_loss\n"
        else:
            train_log_title += "\n"
        train_log.write(train_log_title)
        train_log.flush()
        val_log.write(val_log_title)
        val_log.flush()
    else:
        train_log = open(os.path.join(cfg.LOG_DIR, "train_log.csv"), 'a')
        val_log = open(os.path.join(cfg.LOG_DIR, "val_log.csv"), 'a')

    print('Creating model...')
    teacher = create_model(cfg, 'res_18')
    teacher = load_model(teacher, 'log/weights/model_last_res.pth')
    model = create_model(cfg, 'litnet')
    if start_epoch != 1:
        model = load_model(
            model, 'log/weights/model_epoch_{}.pth'.format(start_epoch - 1))
    optimizer = torch.optim.Adam(model.parameters(), cfg.LR)

    trainer = TeacherTrainer(cfg, teacher, model, optimizer)
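    # Distillation-style setup: the pretrained res_18 acts as teacher for the litnet model being trained.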
    trainer.set_device(cfg.GPU, device)
    print('Setting up data...')
    train_loader = DataLoader(TrainCircleDataset(cfg),
                              batch_size=cfg.BATCH_SIZE,
                              shuffle=True,
                              num_workers=cfg.NUM_WORKERS,
                              pin_memory=True,
                              drop_last=True)
    val_loader = ValCircleDataset()
    print('Starting training...')
    epoch = start_epoch
    for epoch in range(start_epoch, start_epoch + cfg.NUM_EPOCHS):
        trainer.train(epoch, train_loader, train_log)
        model_path = os.path.join(cfg.WEIGHTS_DIR,
                                  'model_epoch_{}.pth'.format(epoch))
        save_model(model_path, epoch, model, optimizer)
        trainer.val(epoch, model_path, val_loader, val_log, cfg)

    save_model(os.path.join(cfg.WEIGHTS_DIR, 'model_last.pth'), epoch, model,
               optimizer)
Example #5
def train(model, criterion, train_set, val_set, optimizer, scheduler, opt):
    logging.info("####################Train Model###################")
    # loss_avg =
    for epoch in range(opt.sum_epoch):
        epoch_start_t = time.time()
        epoch_batch_iter = 0
        logging.info('Begin of epoch %d' % (epoch))
        for i, data in enumerate(train_set):
            inputs, targets, targets_weight, meta = data
            if torch.cuda.is_available():
                inputs = inputs.cuda()
                targets = targets.cuda()
                targets_weight = targets_weight.cuda()
            # cal output of CNN
            outputs = model(inputs)
            # cal loss
            loss = criterion(outputs, targets, targets_weight)
            # cal gradient
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_batch_iter += 1

            # display train loss
            if epoch_batch_iter % opt.display_train_freq == 0:
                util.print_loss(loss, epoch, epoch_batch_iter, opt)

        # display validate accuracy
        if epoch_batch_iter % opt.display_validate_freq == 0:
            logging.info('Validate of epoch %d' % (epoch))
            test(model, val_set, opt)

        # adjust learning rate
        scheduler.step()
        lr = optimizer.param_groups[0]['lr']
        logging.info('learning rate = %.7f epoch = %d' % (lr, epoch))

        # save model
        if epoch % opt.save_epoch_freq == 0 or epoch == opt.sum_epoch - 1:
            logging.info('saving the model at the end of epoch %d' % (epoch))
            save_model(model, opt, epoch)
    logging.info("--------Optimization Done--------")
Example #6
 def train(self, epoch):
     mark = epoch if self.opt.save_all else 'last'
     log_dict_train, _ = self.trainer.train(epoch, self.train_loader)
     self.logger.write('epoch: {} |'.format(epoch))
     for k, v in log_dict_train.items():
         self.logger.scalar_summary('train_{}'.format(k), v, epoch)
         self.logger.write('{} {:8f} | '.format(k, v))
     if self.opt.val_intervals > 0 and epoch % self.opt.val_intervals == 0:
         save_model(
             os.path.join(self.opt.save_dir, 'model_{}.pth'.format(mark)),
             epoch, self.model, self.optimizer)
     with torch.no_grad():
         log_dict_val, preds = self.trainer.val(epoch, self.val_loader)
     for k, v in log_dict_val.items():
         self.logger.scalar_summary('val_{}'.format(k), v, epoch)
         self.logger.write('{} {:8f} | '.format(k, v))
     if log_dict_val[self.opt.metric] < self.best:
         self.best = log_dict_val[self.opt.metric]
         save_model(os.path.join(self.opt.save_dir, 'model_best.pth'),
                    epoch, self.model)
     else:
         save_model(os.path.join(self.opt.save_dir, 'model_last.pth'),
                    epoch, self.model, self.optimizer)
     self.logger.write('\n')
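     # At each lr_step milestone, decay the base LR by a factor of 10 per milestone already passed.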
     if epoch in self.opt.lr_step:
         lr = self.opt.lr * (0.1**(self.opt.lr_step.index(epoch) + 1))
         print('Drop LR to', lr)
         for param_group in self.optimizer.param_groups:
             param_group['lr'] = lr
Example #7
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model,
                                                   optimizer, opt.resume,
                                                   opt.lr, opt.lr_step)

    output_file_name = os.path.splitext(os.path.basename(opt.load_model))[0]
    output_file_name = 'inference_{}.pth'.format(output_file_name)
    output_file_name = os.path.join(opt.save_dir, output_file_name)
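    # Re-save the loaded weights as a standalone inference checkpoint (no optimizer state).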
    save_model(output_file_name, start_epoch, model)
    print("Model Saved at {} ".format(output_file_name))
Example #8
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model,
                                                   optimizer, opt.resume,
                                                   opt.lr, opt.lr_step)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    print('Starting training...')
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
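        # After epoch 100, keep a separate checkpoint for every epoch; earlier epochs just overwrite model_last.pth.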
        if epoch > 100:
            save_model(
                os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                epoch, model, optimizer)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'), epoch,
                       model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(
                os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                epoch, model, optimizer)
            lr = opt.lr * (0.1**(opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
Example #9
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test

    print('Setting up data...')
    Dataset = get_dataset(opt.dataset, opt.task)
    f = open(opt.data_cfg)
    data_config = json.load(f)
    trainset_paths = data_config['train']
    dataset_root = data_config['root']
    f.close()
    transforms = T.Compose([T.ToTensor()])
    dataset = Dataset(opt,
                      dataset_root,
                      trainset_paths, (1088, 608),
                      augment=True,
                      transforms=transforms)
    opt = opts().update_dataset_info_and_set_heads(opt, dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model,
                                                   optimizer, opt.resume,
                                                   opt.lr, opt.lr_step)

    # Get dataloader

    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    print('Starting training...')
    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))

        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'), epoch,
                       model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(
                os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                epoch, model, optimizer)
            lr = opt.lr * (0.1**(opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        if epoch % 5 == 0:
            save_model(
                os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                epoch, model, optimizer)
    logger.close()
Example #10
def main(opt):
  torch.manual_seed(opt.seed)
  torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
  Dataset = get_dataset(opt.dataset, opt.task)
  opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
  print(opt)

  logger = Logger(opt)

  os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
  opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')
  
  print('Creating model...')
  model = create_model(opt.arch, opt.heads, opt.head_conv, opt.nbr_frames)
  optimizer = torch.optim.Adam(model.parameters(), opt.lr)
  start_epoch = 0
  if opt.load_model != '':
    model, optimizer, start_epoch = load_model(
      model, opt.load_model, optimizer, opt.resume, opt.lr, opt.lr_step, opt.nbr_frames)

  # save_model('/usagers2/huper/dev/SpotNet2/exp/uav/ctdetSpotNetVid/fromCOCOB/fromSN2.pth', 0, model, optimizer)
  # exit()

  Trainer = train_factory[opt.task]
  trainer = Trainer(opt, model, optimizer)
  trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

  print('Setting up data...')
  val_loader = torch.utils.data.DataLoader(
      Dataset(opt, 'val'), 
      batch_size=1, 
      shuffle=False,
      num_workers=1,
      pin_memory=True
  )

  if opt.test:
    _, preds = trainer.val(0, val_loader)
    val_loader.dataset.run_eval(preds, opt.save_dir)
    return

  train_loader = torch.utils.data.DataLoader(
      Dataset(opt, 'train'), 
      batch_size=opt.batch_size, 
      shuffle=True,
      num_workers=opt.num_workers,
      pin_memory=True,
      drop_last=True
  )

  # logger.write_model(model)
  print('Starting training...')
  best = 1e10
  for epoch in range(start_epoch + 1, opt.num_epochs + 1):
    mark = epoch if opt.save_all else 'last'
    log_dict_train, _ = trainer.train(epoch, train_loader)
    logger.write('epoch: {} |'.format(epoch))
    for k, v in log_dict_train.items():
      logger.scalar_summary('train_{}'.format(k), v, epoch)
      logger.write('{} {:8f} | '.format(k, v))
    if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
      save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)), 
                 epoch, model, optimizer)
      with torch.no_grad():
        log_dict_val, preds = trainer.val(epoch, val_loader)
      for k, v in log_dict_val.items():
        logger.scalar_summary('val_{}'.format(k), v, epoch)
        logger.write('{} {:8f} | '.format(k, v))
      if log_dict_val[opt.metric] < best:
        best = log_dict_val[opt.metric]
        save_model(os.path.join(opt.save_dir, 'model_best.pth'), 
                   epoch, model)
    else:
      save_model(os.path.join(opt.save_dir, 'model_last.pth'), 
                 epoch, model, optimizer)
    logger.write('\n')
    if epoch in opt.lr_step:
      save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)), 
                 epoch, model, optimizer)
      lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
      print('Drop LR to', lr)
      for param_group in optimizer.param_groups:
          param_group['lr'] = lr
  logger.close()
Example #11
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model,
                                                   optimizer, opt.resume,
                                                   opt.lr, opt.lr_step)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print(get_parameter_number(model))
    #summary(model, (3, 416, 416))

    # # Compute network parameter count and FLOPs (can run on GPU)
    # from thop import profile
    # input = torch.randn(1, 3, 416, 416).cuda()
    # flops, params = profile(model, inputs=(input,))
    # print(flops)
    # print(params)
    #
    # # Compute network parameter count and FLOPs (apparently CPU-only)
    # from torchstat import stat
    # stat(model, (3, 416, 416))

    print('Setting up data...')
    val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=0,
                                             pin_memory=True)

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'), epoch,
                           model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'), epoch,
                       model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(
                os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                epoch, model, optimizer)
            lr = opt.lr * (0.1**(opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
Example #12
def main(cfg, local_rank):
    torch.manual_seed(cfg.SEED)
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
    Dataset = get_dataset(cfg.SAMPLE_METHOD, cfg.TASK)


    print('Creating model...')
    model = create_model(cfg.MODEL.NAME, cfg.MODEL.HEAD_CONV, cfg)
    
    num_gpus = torch.cuda.device_count()

    if cfg.TRAIN.DISTRIBUTE:
        device = torch.device('cuda:%d'%local_rank)
        torch.cuda.set_device(local_rank)
        dist.init_process_group(backend='nccl', init_method='env://',
                                world_size=num_gpus, rank=local_rank)
    else:
        device = torch.device('cuda')
             
    logger = Logger(cfg)

        
    if cfg.TRAIN.OPTIMIZER == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), cfg.TRAIN.LR)
    elif cfg.TRAIN.OPTIMIZER == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=cfg.TRAIN.LR, momentum=0.9)
    else:
        raise NotImplementedError
        
    start_epoch = 0
    if cfg.MODEL.INIT_WEIGHTS:
        model, optimizer, start_epoch = load_model(
          model, cfg.MODEL.PRETRAINED, optimizer, cfg.TRAIN.RESUME, cfg.TRAIN.LR, cfg.TRAIN.LR_STEP)

    Trainer = train_factory[cfg.TASK]
    trainer = Trainer(cfg, local_rank, model, optimizer)

    if cfg.TRAIN.MASTER_BATCH_SIZE == -1:
        master_batch_size = cfg.TRAIN.BATCH_SIZE // len(cfg.GPUS)
    else:
        master_batch_size = cfg.TRAIN.MASTER_BATCH_SIZE
    rest_batch_size = (cfg.TRAIN.BATCH_SIZE - master_batch_size)
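    # Split each batch across GPUs: the master GPU takes master_batch_size and
    # the remaining GPUs share the rest as evenly as possible.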
    chunk_sizes = [master_batch_size]
    for i in range(len(cfg.GPUS) - 1):
        slave_chunk_size = rest_batch_size // (len(cfg.GPUS) - 1)
        if i < rest_batch_size % (len(cfg.GPUS) - 1):
            slave_chunk_size += 1
        chunk_sizes.append(slave_chunk_size)
    trainer.set_device(cfg.GPUS, chunk_sizes, device)

    print('Setting up data...')
    val_dataset = Dataset(cfg, 'val')
    val_loader = torch.utils.data.DataLoader(
      val_dataset, 
      batch_size=1, 
      shuffle=False,
      num_workers=1,
      pin_memory=True
    )
    
    train_dataset = Dataset(cfg, 'train')
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset,
                                                                  num_replicas=num_gpus,
                                                                  rank=local_rank)
    train_loader = torch.utils.data.DataLoader(
      train_dataset, 
      batch_size=cfg.TRAIN.BATCH_SIZE//num_gpus if cfg.TRAIN.DISTRIBUTE else cfg.TRAIN.BATCH_SIZE, 
      shuffle=not cfg.TRAIN.DISTRIBUTE,
      num_workers=cfg.WORKERS,
      pin_memory=True,
      drop_last=True,
      sampler = train_sampler if cfg.TRAIN.DISTRIBUTE else None
    )

    print('Starting training...')
    best = 0.
    for epoch in range(start_epoch + 1, cfg.TRAIN.EPOCHS + 1):
        mark = epoch if cfg.TRAIN.SAVE_ALL_MODEL else 'last'
        train_sampler.set_epoch(epoch)
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if cfg.TRAIN.VAL_INTERVALS > 0 and epoch % cfg.TRAIN.VAL_INTERVALS == 0:
            save_model(os.path.join(cfg.OUTPUT_DIR, 'model_{}.pth'.format(mark)), 
                 epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
                mAP = val_dataset.run_eval(preds, cfg.OUTPUT_DIR)
                print('mAP is: ', mAP)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if mAP > best:
                best = mAP
                save_model(os.path.join(cfg.OUTPUT_DIR, 'model_best.pth'), 
                           epoch, model)
        else:
            save_model(os.path.join(cfg.OUTPUT_DIR, 'model_last.pth'), 
                     epoch, model, optimizer)
        logger.write('\n')
        if epoch in cfg.TRAIN.LR_STEP:
            save_model(os.path.join(cfg.OUTPUT_DIR, 'model_{}.pth'.format(epoch)), 
                     epoch, model, optimizer)
            lr = cfg.TRAIN.LR * (0.1 ** (cfg.TRAIN.LR_STEP.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
              param_group['lr'] = lr
    logger.close()
Example #13
def main(opt):
    torch.manual_seed(opt.seed)
    # if add --not_cuda_benchmark, opt.not_cuda_benchmark=True
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test

    # return Dataset class by dataset and task name
    # one dataset can do multiple tasks by different annotation settings
    Dataset = get_dataset(opt.dataset, opt.task)
    # update opt [input/output res, opt.heads] with Dataset info
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    pprint(vars(opt))

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    # opt.arch: --arch dla_34
    # opt.heads: set heads by task in opts().update_dataset_info_and_set_heads()
    # opt.head_conv: 256; one extra layer between features and final_class, number defined by opt.arch
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(),
                                 opt.lr)  # optimize all params
    start_epoch = 0

    # load pretrain model
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model,
                                                   optimizer, opt.resume,
                                                   opt.lr, opt.lr_step)

    # choose trainer by opt.task
    Trainer = train_factory[opt.task]
    # define trainer
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')

    # val dataset
    val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=1,
                                             pin_memory=True)
    if opt.test:  # test on val dataset
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return  # end program here

    train_loader = torch.utils.data.DataLoader(
        Dataset(opt, 'train'),  # split, load json
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.num_workers,  # multi-process read data, wrt batch_size
        pin_memory=True,
        drop_last=True)

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'  # save all middle model or last
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            # default will USE_TENSORBOARD to log scalars
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))

        # default val/save intervals = 5
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(
                os.path.join(opt.save_dir,
                             'model_{}.pth'.format(mark)),  # path
                epoch,
                model,
                optimizer)  # save model dict keys
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                # metric:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'), epoch,
                           model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'), epoch,
                       model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(
                os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                epoch, model, optimizer)
            lr = opt.lr * (0.1**(opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
Example #14
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    opt.lr = 5e-3
    optimizer = torch.optim.Adam(model.parameters(), opt.lr, weight_decay=0)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                                  T_max=310,
                                                                  eta_min=0)
    scheduler_warmup = GradualWarmupScheduler(
        optimizer,
        multiplier=1,
        total_epoch=10,
        after_scheduler=scheduler_cosine)
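    # Warm up the LR over the first 10 epochs, then hand off to cosine annealing (T_max=310).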
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model,
                                                   optimizer, opt.resume,
                                                   opt.lr, opt.lr_step)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=1,
                                             pin_memory=True)

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'), epoch,
                           model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'), epoch,
                       model, optimizer)
        logger.write('\n')
        scheduler_warmup.step(epoch)
        # if epoch in opt.lr_step:
        #  save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
        #             epoch, model, optimizer)
        #  lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
        #  print('Drop LR to', lr)
        #  for param_group in optimizer.param_groups:
        #      param_group['lr'] = lr
    logger.close()
Example #15
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test

    print('Setting up data...')
    Dataset = get_dataset(opt.dataset, opt.task, opt.multi_scale)
    f = open(opt.data_cfg)
    data_config = json.load(f)
    trainset_paths = data_config['train']
    dataset_root = data_config['root']
    f.close()
    transforms = T.Compose([T.ToTensor()])
    dataset = Dataset(opt,
                      dataset_root,
                      trainset_paths, (640, 480),
                      augment=True,
                      transforms=transforms)
    opt = opts().update_dataset_info_and_set_heads(opt, dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    # optimizer = torch.optim.SGD(model.parameters(), lr=opt.lr, momentum=0.9, weight_decay=5e-4)
    warmup_epoch = 5
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(opt.num_epochs - warmup_epoch))
    lr1 = lr_scheduler.get_lr()[0]
    print("Learn_rate:%s" % lr1)

    iter_per_epoch = len(dataset)
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * warmup_epoch)
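    # WarmUpLR covers the first warmup_epoch epochs; the cosine schedule is only
    # stepped once epoch >= warmup_epoch (see the epoch loop below).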
    start_epoch = 0

    # Get dataloader
    if opt.multi_scale:
        train_loader = torch.utils.data.DataLoader(dataset=dataset,
                                                   batch_size=opt.batch_size,
                                                   shuffle=False,
                                                   num_workers=opt.num_workers,
                                                   pin_memory=True,
                                                   drop_last=True)
    else:
        train_loader = torch.utils.data.DataLoader(dataset,
                                                   batch_size=opt.batch_size,
                                                   shuffle=True,
                                                   num_workers=opt.num_workers,
                                                   pin_memory=True,
                                                   drop_last=True)

    print('Starting training...')
    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model,
                                                   trainer.optimizer,
                                                   opt.resume, opt.lr,
                                                   opt.lr_step)

    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))

        # lr_scheduler.step()
        if epoch >= warmup_epoch:
            lr_scheduler.step()
            lr = lr_scheduler.get_lr()[0]
            print("Learn_rate:%s" % lr)
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))

        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'), epoch,
                       model, optimizer)
        logger.write('\n')
        # if epoch in opt.lr_step:
        #     save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
        #                epoch, model, optimizer)
        #     lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
        #     print('Drop LR to', lr)
        #     for param_group in optimizer.param_groups:
        #         param_group['lr'] = lr
        if epoch % 5 == 0 or epoch >= 30:
            print('Drop LR to', lr)
            save_model(
                os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                epoch, model, optimizer)
    logger.close()
Example #16
def main(opt):
  torch.manual_seed(opt.seed)     # sets random seed for pytorch random number generator
  torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test      # enable inbuilt cudnn auto-tuner to find the best algorithm for hardware, if both opts not set.
  Dataset = get_dataset(opt.dataset, opt.task)
  opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
  print(opt)

  logger = Logger(opt)

  os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
  opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')
  
  print('Creating model...')
  model = create_model(opt.arch, opt.heads, opt.head_conv)
  optimizer = torch.optim.Adam(model.parameters(), opt.lr)
  start_epoch = 0

  # load model if specified
  if opt.load_model != '':
    model, optimizer, start_epoch = load_model(
      model, opt.load_model, optimizer, opt.resume, opt.lr, opt.lr_step)

  Trainer = train_factory[opt.task]
  trainer = Trainer(opt, model, optimizer)
  trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

  print('Setting up data...')
  val_loader = torch.utils.data.DataLoader(
      Dataset(opt, 'val'), 
      batch_size=1, 
      shuffle=False,
      num_workers=1,
      pin_memory=True
  )

  if opt.test:
    _, preds = trainer.val(0, val_loader)
    val_loader.dataset.run_eval(preds, opt.save_dir)
    return

  train_loader = torch.utils.data.DataLoader(
      Dataset(opt, 'train'), 
      batch_size=opt.batch_size, 
      shuffle=True,
      num_workers=opt.num_workers,
      pin_memory=True,
      drop_last=True
  )

  print('Starting training...')
  best = 1e10
  for epoch in range(start_epoch + 1, opt.num_epochs + 1):
    mark = epoch if opt.save_all else 'last'
    log_dict_train, _ = trainer.train(epoch, train_loader)      ###### training ######
    logger.write('epoch: {} |'.format(epoch))

    for k, v in log_dict_train.items():
      logger.scalar_summary('train_{}'.format(k), v, epoch)
      logger.write('{} {:8f} | '.format(k, v))

    if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
      save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)), 
                 epoch, model, optimizer)
      with torch.no_grad():
        log_dict_val, preds = trainer.val(epoch, val_loader)
      for k, v in log_dict_val.items():
        logger.scalar_summary('val_{}'.format(k), v, epoch)
        logger.write('{} {:8f} | '.format(k, v))
      if log_dict_val[opt.metric] < best:
        best = log_dict_val[opt.metric]
        save_model(os.path.join(opt.save_dir, 'model_best.pth'), 
                   epoch, model)
    else:
      save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                 epoch, model, optimizer)                         # save model
    logger.write('\n')

    # adjust lr every certain epochs
    if epoch in opt.lr_step:
      save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)), 
                 epoch, model, optimizer)
      lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
      print('Drop LR to', lr)
      for param_group in optimizer.param_groups:
          param_group['lr'] = lr
  logger.close()
Example #17
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)
    task = 1
    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    model1 = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(),
                                 opt.lr)  #,weight_decay=0.1)
    start_epoch = 0
    #print(model)

    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model,
                                                   optimizer, opt.resume,
                                                   opt.lr, opt.lr_step)
        model1, _, _ = load_model(model1, opt.load_model, optimizer,
                                  opt.resume, opt.lr, opt.lr_step)
    else:
        task = -1  # indicates this is the first task
    set_requires_grad(model1, requires_grad=False)
    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, model1, task, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=1,
                                             pin_memory=True)

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)
    old_loader = None

    if os.path.exists('./exemplar_dataset'):
        N = len(os.listdir('./exemplar_dataset')) - 1
        N = min(N, opt.batch_size)
    if (task != -1):
        old_loader = torch.utils.data.DataLoader(
            Dataset(opt, 'exemplar'),
            batch_size=N,
            shuffle=True,
            num_workers=opt.num_workers,
            pin_memory=True,
            drop_last=True,
        )

    params = {n: p for n, p in model.named_parameters() if p.requires_grad}
    _means = {}
    for n, p in params.items():
        _means[n] = p.clone().detach()
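    # _means snapshots the current parameters; together with the importance (precision)
    # matrices below they are passed to the trainer, apparently an EWC-style regularization setup.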

    precision_matrices = {}  # per-parameter importance
    for n, p in params.items():
        precision_matrices[n] = p.clone().detach().fill_(0)  # initialize to zeros (zeros_like)

    print('Starting training...')

    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader, old_loader,
                                          _means, precision_matrices)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader,
                                                  old_loader, _means,
                                                  precision_matrices)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'), epoch,
                           model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'), epoch,
                       model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(
                os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                epoch, model, optimizer)
            lr = opt.lr * (0.1**(opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
Example #18
# Assumed imports for this snippet; logistic and knn are not shown in the original
# excerpt but are referenced in model_list below. The helper functions and the
# train/test splits are assumed to be defined elsewhere in the project.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, accuracy_score
import numpy as np
import pandas as pd

logistic = LogisticRegression()
knn = KNeighborsClassifier()
rfc = RandomForestClassifier()
sgd = SGDClassifier()

scorer = make_scorer(accuracy_score)

param_grid = [  # {'alpha': np.linspace(0.00001, 1, 40)},
    {
        'penalty': ['l2'],
        'C': [0.1, 1, 5, 10],
        'solver': ['lbfgs', 'liblinear']
    }, {
        'n_neighbors': [1, 3, 5, 10]
    }, {
        'n_estimators': list(range(10, 101, 10)),
        'max_features': list(range(6, 32, 5))
    }, {
        'average': [True, False],
        'alpha': np.linspace(0.001, 1, 40)
    }
]
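# The four param_grid entries correspond positionally to model_list below.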
model_list = [logistic, knn, rfc, sgd]

grid_search, grid_results, results = random_search_best_estimator(
    scorer, param_grid, model_list, X_train, X_test, y_train, y_test)
results = pd.DataFrame(results)
best_estimator = results['best_estimator'][results['best_score'] ==
                                           results['best_score'].max()].iloc[0]
final_accuracy_test, final_accuracy_train, pred_test, pred_train = final_model(
    X_train, X_test, y_train, y_test, best_estimator)
save_model('../models', 'best_estimator.sav', best_estimator)
Example #19
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = False
    # torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')
    print(opt.device)

    print('Creating model...')
    model = create_model(opt.arch,
                         opt.heads,
                         opt.head_conv,
                         opt.deform_conv,
                         w2=opt.w2,
                         maxpool=opt.maxpool)

    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0

    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model,
                                                   optimizer, opt.resume,
                                                   opt.lr, opt.lr_step)

    # quantization-aware fine-tuning
    quantize_shufflenetv2_dcn(model,
                              quant_conv=4,
                              quant_bn=None,
                              quant_act=8,
                              wt_quant_mode='symmetric',
                              act_quant_mode='asymmetric',
                              wt_per_channel=True,
                              wt_percentile=True,
                              act_percentile=False,
                              deform_backbone=False,
                              w2=opt.w2,
                              maxpool=opt.maxpool)
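    # Quantization config above: 4-bit conv weights, 8-bit activations; symmetric
    # per-channel weight quantization, asymmetric activation quantization.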
    # quantized_model = quantize_sfl_dcn(model, quant_conv=4, quant_bn=None, quant_act=4,
    #                           quant_mode='symmetric', wt_per_channel=True, wt_percentile=False, act_percentile=False)
    # print(quantized_model)

    # if opt.load_model != '':
    #   model, optimizer, start_epoch = load_model(
    #     model, opt.load_model, optimizer, opt.resume, opt.lr, opt.lr_step)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=1,
                                             pin_memory=True)

    # if opt.test:
    # # if True:
    #   _, preds = trainer.val(0, val_loader)
    #   val_loader.dataset.run_eval(preds, opt.save_dir)
    #   return

    train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'), epoch,
                           model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'), epoch,
                       model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(
                os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                epoch, model, optimizer)
            lr = opt.lr * (0.1**(opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

    opt.test = True
    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)

    logger.close()
Example #20
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch,
                         opt.heads,
                         opt.head_conv,
                         number_stacks=opt.number_stacks,
                         fsm=opt.fsm,
                         drmc=opt.drmc,
                         drmr=opt.drmr,
                         only_ls=opt.only_ls)
    # optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=opt.lr,
                                momentum=0.9,
                                weight_decay=0.0005,
                                nesterov=True)
    # optimizer = torch.optim.AdamW(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model,
                                                   optimizer, opt.resume,
                                                   opt.lr, opt.lr_step,
                                                   opt.finetune)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    if not opt.trainval:
        val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                                 batch_size=1,
                                                 shuffle=False,
                                                 num_workers=1,
                                                 pin_memory=True)

    if not opt.trainval and opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return
    split = 'trainval' if opt.trainval else 'train'
    train_loader = torch.utils.data.DataLoader(Dataset(opt, split),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'), epoch,
                           model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'), epoch,
                       model, optimizer)
            if epoch % opt.cache_model == 0:
                save_model(
                    os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                    epoch, model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            if not os.path.exists(
                    os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch))):
                save_model(
                    os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                    epoch, model, optimizer)
            lr = opt.lr * (0.1**(opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
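
Both loops above drop the learning rate by hand whenever epoch is in opt.lr_step, computing lr = opt.lr * 0.1 ** (step_index + 1). The same schedule can be expressed with PyTorch's built-in MultiStepLR; a small self-contained sketch follows (the linear model, data and milestone epochs are placeholders, not values from the repo):

import torch
import torch.nn.functional as F

model = torch.nn.Linear(10, 2)                      # stand-in model
optimizer = torch.optim.SGD(model.parameters(), lr=1.25e-4, momentum=0.9)
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=[90, 120], gamma=0.1)     # 0.1x drop at each milestone

x, y = torch.randn(4, 10), torch.randn(4, 2)        # stand-in data
for epoch in range(1, 141):
    optimizer.zero_grad()
    loss = F.mse_loss(model(x), y)
    loss.backward()
    optimizer.step()
    scheduler.step()  # since PyTorch 1.1, step the scheduler after the optimizer
    if epoch in (90, 120):
        print('epoch', epoch, 'lr', optimizer.param_groups[0]['lr'])
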
Example #21
def main(opt):
  torch.manual_seed(opt.seed)  # make random number generation reproducible across runs
  torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test  # speeds things up when the network structure and input shapes are fixed
  Dataset = get_dataset(opt.dataset, opt.task)   # (coco, ctdet) -> Dataset class built from COCO and CTDetDataset
  opt = opts().update_dataset_info_and_set_heads(opt, Dataset)  # derive the detection-head parameters from the dataset and arch
  print(opt)

  logger = Logger(opt)

  os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
  opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')
  
  print('Creating model...')
  model = create_model(opt.arch, opt.heads, opt.head_conv)  # build the model: backbone arch (DLA, hourglass), detection heads and head-conv settings
  optimizer = torch.optim.Adam(model.parameters(), opt.lr)  # set up the optimizer over the model parameters
  start_epoch = 0
  if opt.load_model != '':
    model, optimizer, start_epoch = load_model(
      model, opt.load_model, optimizer, opt.resume, opt.lr, opt.lr_step)

  Trainer = train_factory[opt.task]
  trainer = Trainer(opt, model, optimizer)  # build a trainer instance
  trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

  print('Setting up data...')
  val_loader = torch.utils.data.DataLoader(
      Dataset(opt, 'val'), 
      batch_size=1, 
      shuffle=False,
      num_workers=1,
      pin_memory=True
  )

  if opt.test:
    _, preds = trainer.val(0, val_loader)
    val_loader.dataset.run_eval(preds, opt.save_dir)
    return

  train_loader = torch.utils.data.DataLoader(
      Dataset(opt, 'train'), 
      batch_size=opt.batch_size, 
      shuffle=True,
      num_workers=opt.num_workers,
      pin_memory=True,
      drop_last=True
  )

  print('Starting training...')
  best = 1e10
  for epoch in range(start_epoch + 1, opt.num_epochs + 1):
    mark = epoch if opt.save_all else 'last'   # checkpoint naming: per-epoch or overwrite 'last'

    log_dict_train, _ = trainer.train(epoch, train_loader)  # train for one epoch

    logger.write('epoch: {} |'.format(epoch))
    for k, v in log_dict_train.items():
      logger.scalar_summary('train_{}'.format(k), v, epoch)
      logger.write('{} {:8f} | '.format(k, v))
    if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
      save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)), 
                 epoch, model, optimizer)
      with torch.no_grad():
        log_dict_val, preds = trainer.val(epoch, val_loader)
      for k, v in log_dict_val.items():
        logger.scalar_summary('val_{}'.format(k), v, epoch)
        logger.write('{} {:8f} | '.format(k, v))
      if log_dict_val[opt.metric] < best:
        best = log_dict_val[opt.metric]
        save_model(os.path.join(opt.save_dir, 'model_best.pth'), 
                   epoch, model)
    else:
      save_model(os.path.join(opt.save_dir, 'model_last.pth'), 
                 epoch, model, optimizer)
    logger.write('\n')
    if epoch in opt.lr_step:
      save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)), 
                 epoch, model, optimizer)
      lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
      print('Drop LR to', lr)
      for param_group in optimizer.param_groups:
          param_group['lr'] = lr
  logger.close()
Example #22
def main(opt):
    patient = PATIENT
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '' and opt.load_model != '_':
        model, optimizer, start_epoch = load_model(model, opt.load_model,
                                                   optimizer, opt.resume,
                                                   opt.lr, opt.lr_step)
    # model, optimizer = amp.initialize(model.cuda(), optimizer, opt_level="O1")

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=1,
                                             pin_memory=True)

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    def recording(log_dict, prefix, epoch):
        for k, v in log_dict.items():
            logger.scalar_summary(f'{prefix}_{k}', v, epoch)
            logger.write('{} {:8f} | '.format(k, v))

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        recording(log_dict_train, 'train', epoch)
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            recording(log_dict_val, 'val', epoch)
            # breakpoint()
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, f'model_best.pth'),
                           epoch, model)
                patient = PATIENT
            else:
                patient -= 1
                # print(colored(f'patient {patient}', 'red'))
                if patient < 0:
                    print(colored(f'{opt.input_h} done', 'green'))
                    break
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'), epoch,
                       model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(
                os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                epoch, model, optimizer)
            lr = opt.lr * (0.1**(opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
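
The example above adds a patience counter (PATIENT) on top of the usual loop so training stops once the validation metric has not improved for a fixed number of validations. A self-contained sketch of that early-stopping pattern, factored into a small helper (the class and its names are illustrative, not taken from the repo):

class EarlyStopping:
    """Stop training after `patience` validations without improvement."""

    def __init__(self, patience=10, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float('inf')
        self.bad_count = 0

    def step(self, metric):
        """Return True when training should stop (lower metric is better)."""
        if metric < self.best - self.min_delta:
            self.best = metric
            self.bad_count = 0
            return False
        self.bad_count += 1
        return self.bad_count > self.patience


# usage (hypothetical), inside the validation branch of the loop above:
# stopper = EarlyStopping(patience=PATIENT)
# if stopper.step(log_dict_val[opt.metric]):
#     break
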
Example #23
def main(opt):
    # Completely reproducible results are not guaranteed across PyTorch releases,
    # individual commits, or different platforms. Furthermore, results may not be reproducible
    # between CPU and GPU executions, even when using identical seeds.
    # We can use torch.manual_seed() to seed the RNG for all devices (both CPU and CUDA);
    # a fuller seeding sketch follows this example.
    torch.manual_seed(opt.seed)

    # Setting torch.backends.cudnn.benchmark=True makes the program spend a little extra time at
    # startup searching for the fastest convolution implementation for each conv layer, which then
    # speeds up the whole network. This suits the common case where the network structure is fixed
    # (not dynamic) and the input shape (batch size, image size, channels) does not change.
    # Conversely, if the convolution configuration keeps changing, the search is re-run over and
    # over and ends up costing more time.
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test

    Dataset = get_dataset(opt.dataset, opt.task)

    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')

    model = create_model(opt.model_name)

    optimizer = torch.optim.Adam(model.parameters(), opt.lr)

    start_epoch = 0

    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model,
                                                   optimizer, opt.resume,
                                                   opt.lr, opt.lr_step)

    Trainer = train_factory[opt.task]

    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print("Setting up data...")
    val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=1,
                                             pin_memory=True)

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        # run evaluation code
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    print("Starting training")
    best = 1e10

    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))

        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'), epoch,
                           model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'), epoch,
                       model, optimizer)

        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(
                os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                epoch, model, optimizer)
            lr = opt.lr * (0.1**(opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
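
The comments at the top of the example above note that torch.manual_seed() seeds the RNG for all devices but that full reproducibility is still not guaranteed. A minimal seeding sketch that also covers the Python and NumPy RNGs these training scripts touch (a general recipe, not code from the repo):

import random

import numpy as np
import torch


def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs (CPU and all CUDA devices)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)           # seeds CPU and CUDA generators
    torch.cuda.manual_seed_all(seed)  # explicit, for multi-GPU setups
    # for stricter determinism one can also trade speed for repeatability:
    # torch.backends.cudnn.benchmark = False
    # torch.backends.cudnn.deterministic = True


seed_everything(42)  # any fixed integer, e.g. opt.seed
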
Example #24
import mxnet as mx
from mxnet import init, nd

print('Creating model...')
opt = opts().init()
print(opt.arch)
ctx = [mx.gpu(int(i)) for i in opt.gpus_str.split(',') if i.strip()]
ctx = ctx if ctx else [mx.cpu()]
model = create_model(opt.arch, opt.heads, opt.head_conv, ctx)
model.collect_params().initialize(init=init.Xavier())

X   = nd.random.uniform(shape=(16, 3, 512, 512))
print("\t Input shape: ", X.shape)
Y   = model(X)
print("output: heatmaps", Y[0]["hm"].shape)
print("output: wh_scale", Y[0]["wh"].shape)
print("output: xy_offset", Y[0]["reg"].shape)

param = model.collect_params()
param_keys = param.keys()
param_keys_residual_1 = [param[param_key] for param_key in param_keys if "hourglassnet0_residual1_conv1_weight" in param_key]
#print(param_keys_residual_1)

flag_save_model = False
if flag_save_model:
    print("\n\nSaving model...")
    save_model(model, "./init_params.params")


# call:
# python train.py ctdet --arch hourglass
Example #25
def main():
    def to_ncwh(x):
        return np.transpose(x, [2, 0, 1])

    def to_tensor(x):
        x = torch.from_numpy(x)
        return x

    def transform_by_keys(x, transform, keys):
        for k, v in x.items():
            if k in keys:
                x[k] = transform(v)
        return x

    import torchvision.transforms as transforms
    data_transform_composed = transforms.Compose([
        lambda x: transform_by_keys(x, to_ncwh, ["needle", "stack"])
        , lambda x: transform_by_keys(x, to_tensor, x.keys())
    ])

    Dataset = CTNumberDataset
    opt = opts().parse()
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    cv2.setNumThreads(0)
    logger = Logger(opt)
    val_loader = torch.utils.data.DataLoader(
        CTNumberDataset(start=100000, length=10000, transform=data_transform_composed, font=get_font(opt.font)),
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True
    )

    train_loader = torch.utils.data.DataLoader(
        CTNumberDataset(start=100000, length=10000, transform=data_transform_composed, font=get_font(opt.font)),
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.num_workers,
        pin_memory=True,
        drop_last=True,
    )

    best = 1e10
    start_epoch = -1

    model = GeneralizedDetector()

    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    trainer = CtdetTrainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(
            model, opt.load_model, optimizer, opt.resume, opt.lr, opt.lr_step)

    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                           epoch, model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
Example #26
def train(model, criterion, train_set, val_set, opt, labels=None):
    # define web visualizer using visdom
    webvis = WebVisualizer(opt)

    # modify learning rate of last layer
    finetune_params = modify_last_layer_lr(model.named_parameters(), opt.lr,
                                           opt.lr_mult_w, opt.lr_mult_b)
    # define optimizer
    #optimizer = optim.Adam(finetune_params,
    #                      opt.lr)
    # define learning rate scheduler
    optimizer = optim.Adam(finetune_params, 0.000001)
    #scheduler = optim.lr_scheduler.StepLR(optimizer,
    #                                      step_size=opt.lr_decay_in_epoch,
    #                                      gamma=opt.gamma)

    if labels is not None:
        rid2name, id2rid = labels

    # record forward and backward times
    train_batch_num = len(train_set)
    total_batch_iter = 0
    logging.info("####################Train Model###################")

    for epoch in range(opt.sum_epoch):
        epoch_start_t = time.time()
        epoch_batch_iter = 0
        logging.info('Begin of epoch %d' % (epoch))
        for i, data in enumerate(train_set):
            iter_start_t = time.time()
            # train

            inputs, targets = data
            #print(i,targets)
            if opt.mode == 'Train':
                output, loss, loss_list = forward_batch(
                    model, criterion, inputs, targets, opt, "Train")
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            elif opt.mode == 'Test-Train':
                #use batchsize==1
                output, loss, loss_list = forward_batch(
                    model, criterion, inputs, targets, opt, "Test-Train")
                batch_accuracy = calc_accuracy(output, targets,
                                               opt.score_thres, opt, opt.top_k)
                if batch_accuracy[1] >= THRES:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            webvis.reset()
            epoch_batch_iter += 1
            total_batch_iter += 1

            # display train loss and accuracy
            if total_batch_iter % opt.display_train_freq == 0:
                # accuracy
                batch_accuracy = calc_accuracy(output, targets,
                                               opt.score_thres, opt, opt.top_k)
                util.print_loss(loss_list, "Train", epoch, total_batch_iter)
                util.print_accuracy(batch_accuracy, "Train", epoch,
                                    total_batch_iter)
                if opt.display_id > 0:
                    x_axis = epoch + float(epoch_batch_iter) / train_batch_num
                    # TODO support accuracy visualization of multiple top_k
                    plot_accuracy = [
                        batch_accuracy[i][opt.top_k[0]]
                        for i in range(len(batch_accuracy))
                    ]
                    accuracy_list = [item["ratio"] for item in plot_accuracy]
                    webvis.plot_points(x_axis, loss_list, "Loss", "Train")
                    webvis.plot_points(x_axis, accuracy_list, "Accuracy",
                                       "Train")

            # display train data
            if total_batch_iter % opt.display_data_freq == 0:
                image_list = list()
                show_image_num = int(
                    np.ceil(opt.display_image_ratio * inputs.size()[0]))
                for index in range(show_image_num):
                    input_im = util.tensor2im(inputs[index], opt.mean, opt.std)
                    class_label = "Image_" + str(index)
                    if labels is not None:
                        target_ids = [
                            targets[i][index] for i in range(opt.class_num)
                        ]
                        rids = [id2rid[j][k] for j, k in enumerate(target_ids)]
                        class_label += "_"
                        class_label += "#".join(
                            [rid2name[j][k] for j, k in enumerate(rids)])
                    image_list.append((class_label, input_im))
                image_dict = OrderedDict(image_list)
                save_result = total_batch_iter % opt.update_html_freq
                webvis.plot_images(image_dict,
                                   opt.display_id + 2 * opt.class_num, epoch,
                                   save_result)

            # validate and display validate loss and accuracy
            if len(val_set
                   ) > 0 and total_batch_iter % opt.display_validate_freq == 0:
                val_accuracy, val_loss = validate(model, criterion, val_set,
                                                  opt)
                x_axis = epoch + float(epoch_batch_iter) / train_batch_num
                accuracy_list = [
                    val_accuracy[i][opt.top_k[0]]["ratio"]
                    for i in range(len(val_accuracy))
                ]
                util.print_loss(val_loss, "Validate", epoch, total_batch_iter)
                util.print_accuracy(val_accuracy, "Validate", epoch,
                                    total_batch_iter)
                if opt.display_id > 0:
                    webvis.plot_points(x_axis, val_loss, "Loss", "Validate")
                    webvis.plot_points(x_axis, accuracy_list, "Accuracy",
                                       "Validate")

            # save snapshot
            if total_batch_iter % opt.save_batch_iter_freq == 0:
                logging.info(
                    "saving the latest model (epoch %d, total_batch_iter %d)" %
                    (epoch, total_batch_iter))
                save_model(model, opt, epoch)
                # TODO snapshot loss and accuracy

        logging.info('End of epoch %d / %d \t Time Taken: %d sec' %
                     (epoch, opt.sum_epoch, time.time() - epoch_start_t))

        if epoch % opt.save_epoch_freq == 0:
            logging.info('saving the model at the end of epoch %d, iters %d' %
                         (epoch + 1, total_batch_iter))
            save_model(model, opt, epoch + 1)

        # adjust learning rate
        #scheduler.step()
        #lr = optimizer.param_groups[0]['lr']
        #logging.info('learning rate = %.7f epoch = %d' %(lr,epoch))
    logging.info("--------Optimization Done--------")
Example #27
def train(args, config, model):
    tokenizer = pickle.load(open(config.filename_idx2word, 'rb'))
    max_score = 0.0
    # optim
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=config.lr,
                                 betas=(0.9, 0.999),
                                 eps=1e-9)
    optim = Optim(optimizer, config)
    # KLDivLoss
    loss_func = LabelSmothingLoss(config)

    # data
    train_loader = data_load(config.filename_trimmed_train, config.batch_size,
                             True)

    # # display the result
    # f = open('data/clean/data_char/src_index2word.pkl', 'rb')
    # idx2word = pickle.load(f)

    for e in range(args.checkpoint, args.epoch):
        model.train()
        all_loss = 0
        num = 0
        for step, batch in enumerate(tqdm(train_loader)):
            x, y = batch
            word = y.ne(config.pad).sum().item()
            num += word
            if torch.cuda.is_available():
                x = x.cuda()
                y = y.cuda()
            out = model(x, y)
            loss = loss_func(out, y)
            all_loss += loss.item()
            if step % 200 == 0:
                print('epoch:', e, '|step:', step,
                      '|train_loss: %.4f' % (loss.item() / word))

            # loss regularization
            loss = loss / config.accumulation_steps
            loss.backward()
            if ((step + 1) % config.accumulation_steps) == 0:
                optim.updata()
                optim.zero_grad()

            # ###########################
            # if step == 2:
            #     break
            # ###########################

            # if step % 500 == 0:
            #     test(e, config, model, loss_func)

            if step != 0 and step % 5000 == 0:
                filename = config.filename_model + 'model_' + str(
                    step) + '.pkl'
                save_model(model, filename)
                # test(e, config, model, loss_func)
        # train loss
        loss = all_loss / num
        print('epoch:', e, '|train_loss: %.4f' % loss)

        # test
        score = test(e, config, model, loss_func, tokenizer)
        if score > max_score:
            max_score = score
            filename = config.filename_model + 'model.pkl'
            save_model(model, filename)
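
The example above accumulates gradients over config.accumulation_steps mini-batches, dividing the loss so the accumulated gradient matches one larger batch, and only then steps the optimizer through the custom Optim wrapper. A stripped-down sketch of the same pattern with a plain torch.optim optimizer (model, data and step counts are placeholders):

import torch

accumulation_steps = 4                                # update every 4 mini-batches
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = torch.nn.MSELoss()

optimizer.zero_grad()
for step in range(16):                                # stand-in for the data loader
    x, y = torch.randn(8, 10), torch.randn(8, 2)
    loss = loss_fn(model(x), y) / accumulation_steps  # scale so gradients average out
    loss.backward()                                   # gradients accumulate in .grad
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
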
Example #28
def main(opt):
	torch.manual_seed(opt.seed)
	torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark
	
	Dataset = get_dataset()
	Dataset.default_resolution=[512,512]
	if(opt.resume_labels is True):
		train_labels,valid_labels,test_labels,class_name=read_data(opt.data_dir,opt.resume_labels)	
		Dataset.num_classes=len(class_name)
		opt = opts().update_dataset_info_and_set_heads(opt, Dataset)	
		logger = Logger(opt)	
		np.random.shuffle(train_labels)	
	else:
		gt_labels,class_name=read_data(opt.data_dir,opt.resume_labels)
		Dataset.num_classes=len(class_name)
		opt = opts().update_dataset_info_and_set_heads(opt, Dataset)		
		np.random.shuffle(gt_labels)		
		logger = Logger(opt)
		train_labels,valid_labels,test_labels=selective_folding(gt_labels,class_name,logger) 
		
			
	
	os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
	opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

	print('Creating model...')
	model = create_model(opt.model_name,Dataset.num_classes)
	optimizer = torch.optim.Adam(model.parameters(), opt.lr)
	print(model)
	
	start_epoch = 0
	if opt.load_model != '':
		model, optimizer, start_epoch = load_model(
		  model, opt.load_model, optimizer, opt.resume, opt.lr, opt.lr_step)	  
	Trainer = train_factory[opt.task]
	trainer = Trainer(opt, model, optimizer)
	trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)


	print('Setting up data...')
	
	train_set=Dataset(opt,train_labels,class_name)
	train_loader = torch.utils.data.DataLoader(
		train_set, 
		sampler=ImbalancedDatasetSampler(train_set),
		batch_size=int(opt.batch_size/opt.subdivision),
		shuffle=False,
		num_workers=opt.num_workers,
		pin_memory=True,
		drop_last=True
	)
	test_opt=copy.deepcopy(opt)
	test_opt.phase="test"
	valid_set=Dataset(test_opt,valid_labels,class_name)
	valid_loader = torch.utils.data.DataLoader(
		valid_set, 
		batch_size=1,
		shuffle=False,
		num_workers=test_opt.num_workers,
		pin_memory=True,
		drop_last=True
	)
	test_set=Dataset(test_opt,test_labels,class_name)
	test_loader = torch.utils.data.DataLoader(
		test_set, 
		batch_size=1,
		shuffle=False,
		num_workers=test_opt.num_workers,
		pin_memory=True,
		drop_last=True
	)

	
	

	#train
	print('Starting training...')
	
	max_acc_epoch_dir=os.path.join(opt.save_dir, 'model_max_acc.pth')
	if(os.path.exists(max_acc_epoch_dir)):
		checkpoint=torch.load(max_acc_epoch_dir, map_location=lambda storage, loc: storage)
		max_acc_epoch = checkpoint['epoch']
		max_acc=checkpoint['valid_acc']
	else:
		max_acc_epoch=-1
		max_acc=0
	for epoch in range(start_epoch + 1, opt.num_epochs + 1):
		mark = epoch if opt.save_all else 'last'
		log_dict_train, _ = trainer.train(epoch, train_loader)
		log_dict_valid, _ = trainer.test(epoch, valid_loader)
		logger.write('epoch: {} |'.format(epoch))
		logger.write('train loss {:8f} | '.format(log_dict_train['loss']))		
		logger.write('valid loss {:8f} | '.format(log_dict_valid['loss']))
		logger.write('valid acc {:8f} | '.format(log_dict_valid['acc']))
		logger.write('\n')
		if(max_acc < log_dict_valid['acc']):
			max_acc_epoch=epoch
			max_acc=log_dict_valid['acc']
			save_model(max_acc_epoch_dir,epoch, model, optimizer,max_acc)	
			
		save_model(os.path.join(opt.save_dir, 'model_last.pth'),epoch, model, optimizer)
		if epoch in opt.lr_step:
			save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)), 
					 epoch, model, optimizer)
			lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
			print('Drop LR to', lr)
			for param_group in optimizer.param_groups:
				param_group['lr'] = lr					
	logger.close()
	
	#test
	print('Final testing...')
	logger.open("log(test).txt")
	model, optimizer, start_epoch = load_model(
	model, max_acc_epoch_dir, optimizer, True, opt.lr, opt.lr_step)	  
	Tester = train_factory[opt.task]
	tester = Tester(opt, model, optimizer)
	tester.set_device(opt.gpus, opt.chunk_sizes, opt.device)
	log_dict_test, _ = tester.test(start_epoch, test_loader)
	logger.write('test model: {}, epoch: {}\n'.format(max_acc_epoch_dir,start_epoch))
	for k, v in log_dict_test.items():
		logger.write('{} {} | '.format(k, v))
Example #29
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = create_optimizer(model, opt)
    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)
    if opt.mixed_precision:
        from apex import amp
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=opt.opt_level,
                                          max_loss_scale=opt.max_loss_scale)
        print('Using amp with opt level %s...' % opt.opt_level)
    else:
        amp = None

    meta = {'it': 0, 'epoch': 0}
    if opt.load_model != '':
        model, optimizer, amp, meta = load_model(model, opt.load_model,
                                                 optimizer, amp, opt.resume,
                                                 opt.lr, opt.lr_step)
    start_it = meta['it']
    start_epoch = meta['epoch']

    print('Setting up data...')
    val_dataset = Dataset(opt, 'val')
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=1,
                                             pin_memory=True)

    if opt.task == 'car_pose_6dof':
        # pass loaded 3D models for debug visualisations
        trainer.set_models(val_dataset.models)

    if opt.use_swa and start_it > opt.swa_start:
        if opt.test or opt.save_avg_weights:
            optimizer.swap_swa_sgd()
            train_dataset = Dataset(opt, 'train')
            train_loader = create_train_loader(train_dataset, opt)
            trainer.bn_update(train_loader)
        if opt.save_avg_weights:
            path = os.path.join(opt.save_dir, 'model_%d_avg.pth' % start_epoch)
            save_model(path, meta, model)

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_dataset = Dataset(opt, 'train')
    train_loader = create_train_loader(train_dataset, opt)

    print('Starting training from {} epoch ({} global step)...'.format(
        start_epoch, start_it))
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        meta['it'] += len(train_loader)
        meta['epoch'] = epoch
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       meta, model, optimizer, amp)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'), meta,
                           model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'), meta,
                       model, optimizer, amp)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(
                os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)), meta,
                model, optimizer, amp)
            lr = opt.lr * (0.1**(opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
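
The example above wires mixed precision through NVIDIA apex (amp.initialize), optionally combined with SWA. For reference, a minimal sketch of the same mixed-precision idea using PyTorch's native torch.cuda.amp autocast/GradScaler API, which has largely replaced apex (a CUDA device is assumed; model and data are placeholders):

import torch
from torch.cuda.amp import GradScaler, autocast

device = torch.device('cuda')
model = torch.nn.Linear(10, 2).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
scaler = GradScaler()                        # scales the loss to avoid fp16 underflow

for _ in range(10):                          # stand-in for the train loader
    x = torch.randn(8, 10, device=device)
    y = torch.randn(8, 2, device=device)
    optimizer.zero_grad()
    with autocast():                         # forward pass in mixed precision
        loss = torch.nn.functional.mse_loss(model(x), y)
    scaler.scale(loss).backward()            # backward on the scaled loss
    scaler.step(optimizer)                   # unscales grads, then optimizer.step()
    scaler.update()
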
Example #30
    def train(self, cfg):
        # set up the GPU environment, handling both single- and multi-GPU cases
        gpus_str = ''
        if isinstance(cfg.gpus, (list, tuple)):
            cfg.gpus = [int(i) for i in cfg.gpus]
            for s in cfg.gpus:
                gpus_str += str(s) + ','
            gpus_str = gpus_str[:-1]
        else:
            gpus_str = str(int(cfg.gpus))
            cfg.gpus = [int(cfg.gpus)]
        os.environ['CUDA_VISIBLE_DEVICES'] = gpus_str
        cfg.gpus = [i for i in range(len(cfg.gpus))
                    ] if cfg.gpus[0] >= 0 else [-1]

        # set up logging
        model_dir = os.path.join(cfg.save_dir, cfg.id)
        debug_dir = os.path.join(model_dir, 'debug')
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        if not os.path.exists(debug_dir):
            os.makedirs(debug_dir)
        logger = setup_logger(cfg.id, os.path.join(model_dir, 'log'))
        if USE_TENSORBOARD:
            writer = tensorboardX.SummaryWriter(
                log_dir=os.path.join(model_dir, 'log'))
        logger.info(cfg)

        gpus = cfg.gpus
        device = torch.device('cpu' if gpus[0] < 0 else 'cuda')
        lr = cfg.lr
        lr_step = cfg.lr_step
        num_epochs = cfg.num_epochs
        val_step = cfg.val_step
        sample_size = cfg.sample_size

        # set up the dataset
        dataset = YOLO(cfg.data_dir,
                       cfg.hflip,
                       cfg.vflip,
                       cfg.rotation,
                       cfg.scale,
                       cfg.shear,
                       opt=cfg,
                       split='train')
        names = dataset.class_name
        std = dataset.std
        mean = dataset.mean
        # configure the prediction heads from the number of dataset classes
        cfg.setup_head(dataset)
        trainloader = DataLoader(dataset,
                                 batch_size=cfg.batch_size,
                                 shuffle=True,
                                 num_workers=cfg.num_workers,
                                 pin_memory=True,
                                 drop_last=True)

        # val_dataset = YOLO(cfg.data_dir, cfg.hflip, cfg.vflip, cfg.rotation, cfg.scale, cfg.shear, opt=cfg, split='val')
        # valloader = DataLoader(val_dataset, batch_size=1, shuffle=True, num_workers=1, pin_memory=True)
        valid_file = cfg.val_dir if not cfg.val_dir == '' else os.path.join(
            cfg.data_dir, 'valid.txt')
        with open(valid_file, 'r') as f:
            val_list = [l.rstrip() for l in f.readlines()]

        net = create_model(cfg.arch, cfg.heads, cfg.head_conv, cfg.down_ratio,
                           cfg.filters)
        optimizer = optim.Adam(net.parameters(), lr=lr)
        start_epoch = 0

        if cfg.resume:
            pretrain = os.path.join(model_dir, 'model_last.pth')
            if os.path.exists(pretrain):
                print('resume model from %s' % pretrain)
                try:
                    net, optimizer, start_epoch = load_model(
                        net, pretrain, optimizer, True, lr, lr_step)
                except:
                    print('\t... loading model error: ckpt may not compatible')
        model = ModleWithLoss(net, CtdetLoss(cfg))
        if len(gpus) > 1:
            model = nn.DataParallel(model, device_ids=gpus).to(device)
        else:
            model = model.to(device)

        step = 0
        best = 1e10
        log_loss_stats = ['loss', 'hm_loss', 'wh_loss']
        if cfg.reg_offset:
            log_loss_stats += ['off_loss']
        if cfg.reg_obj:
            log_loss_stats += ['obj_loss']
        for epoch in range(start_epoch + 1, num_epochs + 1):
            avg_loss_stats = {l: AverageMeter() for l in log_loss_stats}
            model.train()
            with tqdm(trainloader) as loader:
                for _, batch in enumerate(loader):
                    for k in batch:
                        if k != 'meta':
                            batch[k] = batch[k].to(device=device,
                                                   non_blocking=True)
                    output, loss, loss_stats = model(batch)
                    loss = loss.mean()
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    # update the tqdm progress-bar display
                    lr = optimizer.param_groups[0]['lr']
                    poststr = ''
                    for l in avg_loss_stats:
                        avg_loss_stats[l].update(loss_stats[l].mean().item(),
                                                 batch['input'].size(0))
                        poststr += '{}: {:.4f}; '.format(
                            l, avg_loss_stats[l].avg)
                    loader.set_description('Epoch %d' % (epoch))
                    poststr += 'lr: {:.4f}'.format(lr)
                    loader.set_postfix_str(poststr)

                    step += 1
                    # self.lossSignal.emit(loss.item(), step)
                    del output, loss, loss_stats

                    # valid
                    if step % val_step == 0:
                        if len(cfg.gpus) > 1:
                            val_model = model.module
                        else:
                            val_model = model
                        val_model.eval()
                        torch.cuda.empty_cache()

                        # randomly sample validation images
                        idx = np.arange(len(val_list))
                        idx = np.random.permutation(idx)[:sample_size]

                        for j, id in enumerate(idx):
                            image = cv2.imread(val_list[id])
                            image = self.preprocess(image, cfg.input_h,
                                                    cfg.input_w, mean, std)
                            image = image.to(device)

                            with torch.no_grad():
                                output = val_model.model(image)[-1]

                            # draw and save debug visualisations
                            debugger = Debugger(dataset=names,
                                                down_ratio=cfg.down_ratio)
                            reg = output['reg'] if cfg.reg_offset else None
                            obj = output['obj'] if cfg.reg_obj else None
                            dets = ctdet_decode(output['hm'].sigmoid_(),
                                                output['wh'],
                                                reg=reg,
                                                obj=obj,
                                                cat_spec_wh=cfg.cat_spec_wh,
                                                K=cfg.K)
                            dets = dets.detach().cpu().numpy().reshape(
                                -1, dets.shape[2])
                            dets[:, :4] *= cfg.down_ratio
                            image = image[0].detach().cpu().numpy().transpose(
                                1, 2, 0)
                            image = np.clip(((image * std + mean) * 255.), 0,
                                            255).astype(np.uint8)
                            pred = debugger.gen_colormap(
                                output['hm'][0].detach().cpu().numpy())
                            debugger.add_blend_img(image, pred, 'pred_hm')
                            debugger.add_img(image, img_id='out_pred')
                            for k in range(len(dets)):
                                if dets[k, 4] > cfg.vis_thresh:
                                    debugger.add_coco_bbox(dets[k, :4],
                                                           dets[k, -1],
                                                           dets[k, 4],
                                                           img_id='out_pred')

                            debugger.save_all_imgs(debug_dir,
                                                   prefix='{}.{}_'.format(
                                                       step, j))
                            del output, image, dets
                        # save model weights
                        save_model(os.path.join(model_dir, 'model_best.pth'),
                                   epoch, net)
                        model.train()

            logstr = 'epoch {}'.format(epoch)
            for k, v in avg_loss_stats.items():
                logstr += ' {}: {:.4f};'.format(k, v.avg)
                if USE_TENSORBOARD:
                    writer.add_scalar('train_{}'.format(k), v.avg, epoch)
            logger.info(logstr)

            # if epoch % val_step == 0:
            #     if len(cfg.gpus) > 1:
            #         val_model = model.module
            #     else:
            #         val_model = model
            #     val_model.eval()
            #     torch.cuda.empty_cache()
            #
            #     val_loss_stats = {l: AverageMeter() for l in log_loss_stats}
            #
            #     with tqdm(valloader) as loader:
            #         for j, batch in enumerate(loader):
            #             for k in batch:
            #                 if k != 'meta':
            #                     batch[k] = batch[k].to(device=device, non_blocking=True)
            #             with torch.no_grad():
            #                 output, loss, loss_stats = val_model(batch)
            #
            #             poststr = ''
            #             for l in val_loss_stats:
            #                 val_loss_stats[l].update(
            #                     loss_stats[l].mean().item(), batch['input'].size(0))
            #                 poststr += '{}: {:.4f}; '.format(l, val_loss_stats[l].avg)
            #             loader.set_description('Epoch %d valid' % (epoch))
            #             poststr += 'lr: {:.4f}'.format(lr)
            #             loader.set_postfix_str(poststr)
            #
            #             if j < sample_size:
            #                 # draw the predictions and save them as jpg images
            #                 debugger = Debugger(dataset=names, down_ratio=cfg.down_ratio)
            #                 reg = output['reg'] if cfg.reg_offset else None
            #                 obj = output['obj'] if cfg.reg_obj else None
            #                 dets = ctdet_decode(
            #                     output['hm'], output['wh'], reg=reg, obj=obj,
            #                     cat_spec_wh=cfg.cat_spec_wh, K=cfg.K)
            #                 dets = dets.detach().cpu().numpy().reshape(1, -1, dets.shape[2])
            #                 dets[:, :, :4] *= cfg.down_ratio
            #                 dets_gt = batch['meta']['gt_det'].numpy().reshape(1, -1, dets.shape[2])
            #                 dets_gt[:, :, :4] *= cfg.down_ratio
            #                 for i in range(1):
            #                     img = batch['input'][i].detach().cpu().numpy().transpose(1, 2, 0)
            #                     img = np.clip(((img * std + mean) * 255.), 0, 255).astype(np.uint8)
            #                     pred = debugger.gen_colormap(output['hm'][i].detach().cpu().numpy())
            #                     gt = debugger.gen_colormap(batch['hm'][i].detach().cpu().numpy())
            #                     debugger.add_blend_img(img, pred, 'pred_hm')
            #                     debugger.add_blend_img(img, gt, 'gt_hm')
            #                     debugger.add_img(img, img_id='out_pred')
            #                     for k in range(len(dets[i])):
            #                         if dets[i, k, 4] > cfg.vis_thresh:
            #                             debugger.add_coco_bbox(dets[i, k, :4], dets[i, k, -1],
            #                                                    dets[i, k, 4], img_id='out_pred')
            #
            #                     debugger.add_img(img, img_id='out_gt')
            #                     for k in range(len(dets_gt[i])):
            #                         if dets_gt[i, k, 4] > cfg.vis_thresh:
            #                             debugger.add_coco_bbox(dets_gt[i, k, :4], dets_gt[i, k, -1],
            #                                                    dets_gt[i, k, 4], img_id='out_gt')
            #
            #                     debugger.save_all_imgs(debug_dir, prefix='{}.{}_'.format(epoch, j))
            #             del output, loss, loss_stats
            #     model.train()
            #     logstr = 'epoch {} valid'.format(epoch)
            #     for k, v in val_loss_stats.items():
            #         logstr += ' {}: {:.4f};'.format(k, v.avg)
            #         if USE_TENSORBOARD:
            #             writer.add_scalar('val_{}'.format(k), v.avg, epoch)
            #     logger.info(logstr)
            #     if val_loss_stats['loss'].avg < best:
            #         best = val_loss_stats['loss'].avg
            #         save_model(os.path.join(model_dir, 'model_best.pth'), epoch, net)
            save_model(os.path.join(model_dir, 'model_last.pth'), epoch, net,
                       optimizer)
            if epoch in cfg.lr_step:
                save_model(
                    os.path.join(model_dir, 'model_{}.pth'.format(epoch)),
                    epoch, net, optimizer)
                lr = cfg.lr * (0.1**(cfg.lr_step.index(epoch) + 1))
                logger.info('Drop LR to {}'.format(lr))
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
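
The example above tracks each loss term with an AverageMeter, which is imported from elsewhere in that repo. For reference, a typical minimal implementation of that helper (the conventional definition, not necessarily the repo's exact one):

class AverageMeter:
    """Keep a running sum and count and expose the current average."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count if self.count > 0 else 0.0


# usage, mirroring the loop above:
# meter = AverageMeter()
# meter.update(loss_stats['loss'].mean().item(), batch['input'].size(0))
# print(meter.avg)
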
def train(model, criterion, train_set, val_set, opt, labels=None):
    # define web visualizer using visdom
    webvis = WebVisualizer(opt)
    
    # modify learning rate of last layer
    finetune_params = modify_last_layer_lr(model.named_parameters(), 
                                            opt.lr, opt.lr_mult_w, opt.lr_mult_b)
    # define optimizer
    optimizer = optim.SGD(finetune_params, 
                          opt.lr, 
                          momentum=opt.momentum, 
                          weight_decay=opt.weight_decay)
    # define learning rate scheduler
    scheduler = optim.lr_scheduler.StepLR(optimizer, 
                                          step_size=opt.lr_decay_in_epoch,
                                          gamma=opt.gamma)
    if labels is not None:
        rid2name, id2rid = labels
    
    # record forward and backward times 
    train_batch_num = len(train_set)
    total_batch_iter = 0
    logging.info("####################Train Model###################")
    for epoch in range(opt.sum_epoch):
        epoch_start_t = time.time()
        epoch_batch_iter = 0
        logging.info('Begin of epoch %d' %(epoch))
        for i, data in enumerate(train_set):
            iter_start_t = time.time()
            # train 
            inputs, targets = data
            output, loss, loss_list = forward_batch(model, criterion, inputs, targets, opt, "Train")
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
           
            webvis.reset()
            epoch_batch_iter += 1
            total_batch_iter += 1

            # display train loss and accuracy
            if total_batch_iter % opt.display_train_freq == 0:
                # accuracy
                batch_accuracy = calc_accuracy(output, targets, opt.score_thres, opt.top_k) 
                util.print_loss(loss_list, "Train", epoch, total_batch_iter)
                util.print_accuracy(batch_accuracy, "Train", epoch, total_batch_iter)
                if opt.display_id > 0:
                    x_axis = epoch + float(epoch_batch_iter)/train_batch_num
                    # TODO support accuracy visualization of multiple top_k
                    plot_accuracy = [batch_accuracy[i][opt.top_k[0]] for i in range(len(batch_accuracy)) ]
                    accuracy_list = [item["ratio"] for item in plot_accuracy]
                    webvis.plot_points(x_axis, loss_list, "Loss", "Train")
                    webvis.plot_points(x_axis, accuracy_list, "Accuracy", "Train")
            
            # display train data 
            if total_batch_iter % opt.display_data_freq == 0:
                image_list = list()
                show_image_num = int(np.ceil(opt.display_image_ratio * inputs.size()[0]))
                for index in range(show_image_num): 
                    input_im = util.tensor2im(inputs[index], opt.mean, opt.std)
                    class_label = "Image_" + str(index) 
                    if labels is not None:
                        target_ids = [targets[i][index] for i in range(opt.class_num)]
                        rids = [id2rid[j][k] for j,k in enumerate(target_ids)]
                        class_label += "_"
                        class_label += "#".join([rid2name[j][k] for j,k in enumerate(rids)])
                    image_list.append((class_label, input_im))
                image_dict = OrderedDict(image_list)
                save_result = total_batch_iter % opt.update_html_freq
                webvis.plot_images(image_dict, opt.display_id + 2*opt.class_num, epoch, save_result)
            
            # validate and display validate loss and accuracy
            if len(val_set) > 0  and total_batch_iter % opt.display_validate_freq == 0:
                val_accuracy, val_loss = validate(model, criterion, val_set, opt)
                x_axis = epoch + float(epoch_batch_iter)/train_batch_num
                accuracy_list = [val_accuracy[i][opt.top_k[0]]["ratio"] for i in range(len(val_accuracy))]
                util.print_loss(val_loss, "Validate", epoch, total_batch_iter)
                util.print_accuracy(val_accuracy, "Validate", epoch, total_batch_iter)
                if opt.display_id > 0:
                    webvis.plot_points(x_axis, val_loss, "Loss", "Validate")
                    webvis.plot_points(x_axis, accuracy_list, "Accuracy", "Validate")

            # save snapshot 
            if total_batch_iter % opt.save_batch_iter_freq == 0:
                logging.info("saving the latest model (epoch %d, total_batch_iter %d)" %(epoch, total_batch_iter))
                save_model(model, opt, epoch)
                # TODO snapshot loss and accuracy
            
        logging.info('End of epoch %d / %d \t Time Taken: %d sec' %
              (epoch, opt.sum_epoch, time.time() - epoch_start_t))
        
        if epoch % opt.save_epoch_freq == 0:
            logging.info('saving the model at the end of epoch %d, iters %d' %(epoch+1, total_batch_iter))
            save_model(model, opt, epoch+1) 

        # adjust learning rate 
        scheduler.step()
        lr = optimizer.param_groups[0]['lr'] 
        logging.info('learning rate = %.7f epoch = %d' %(lr,epoch)) 
    logging.info("--------Optimization Done--------")