Example #1
def run_one_experiment():
    t_exp_start = time.time()

    # Save all print-out to a logger file
    logger = Logger(FLAGS.log_file)

    # Print experiment setup
    for k in sorted(FLAGS.keys()):
        print('{}: {}'.format(k, FLAGS[k]))

    # Init torch
    if FLAGS.seed is None:
        torch.backends.cudnn.deterministic = False
        torch.backends.cudnn.benchmark = True
    else:
        random.seed(FLAGS.seed)
        np.random.seed(FLAGS.seed)
        torch.manual_seed(FLAGS.seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    # Init model
    model = importlib.import_module(FLAGS.module_name).get_model(FLAGS)
    model = torch.nn.DataParallel(model).cuda()

    if FLAGS.pretrained:
        checkpoint = torch.load(FLAGS.pretrained)
        model.module.load_state_dict(checkpoint['model'])
        print('Loaded model {}.'.format(FLAGS.pretrained))

    if FLAGS.model_profiling and len(FLAGS.model_profiling) > 0:
        print(model)
        profiling(model, FLAGS.model_profiling, FLAGS.image_size,
                  FLAGS.image_channels, FLAGS.train_width_mults,
                  FLAGS.model_profiling_verbose)
    logger.flush()

    # Init data loaders
    train_loader, val_loader, _, train_set = prepare_data(
        FLAGS.dataset, FLAGS.data_dir, FLAGS.data_transforms,
        FLAGS.data_loader, FLAGS.data_loader_workers, FLAGS.train_batch_size,
        FLAGS.val_batch_size, FLAGS.drop_last, FLAGS.test_only)
    class_labels = train_set.classes

    # Perform inference/test only
    if FLAGS.test_only:
        print('Start testing...')
        min_wm = min(FLAGS.train_width_mults)
        max_wm = max(FLAGS.train_width_mults)
        if FLAGS.test_num_width_mults == 1:
            test_width_mults = []
        else:
            step = (max_wm - min_wm) / (FLAGS.test_num_width_mults - 1)
            test_width_mults = np.arange(min_wm, max_wm, step).tolist()
        test_width_mults += [max_wm]
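        # Worked example (hypothetical values): with train_width_mults spanning
        # 0.25-1.0 and test_num_width_mults=4, step = 0.25, and the arange plus
        # the appended max give test_width_mults = [0.25, 0.5, 0.75, 1.0],
        # i.e. evenly spaced widths from the smallest to the largest trained
        # width multiplier.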

        criterion = torch.nn.CrossEntropyLoss(reduction='none').cuda()
        test_meters = get_meters('val', FLAGS.topk, test_width_mults)
        epoch = -1

        avg_error1, _ = test(epoch,
                             val_loader,
                             model,
                             criterion,
                             test_meters,
                             test_width_mults,
                             topk=FLAGS.topk)
        print('==> Epoch avg accuracy {:.2f}%'.format((1 - avg_error1) * 100))

        logger.close()
        plot_acc_width(FLAGS.log_file)
        return

    # Init training devices
    criterion = torch.nn.CrossEntropyLoss(reduction='none').cuda()
    optimizer = get_optimizer(model,
                              FLAGS.optimizer,
                              FLAGS.weight_decay,
                              FLAGS.lr,
                              FLAGS.momentum,
                              FLAGS.nesterov,
                              depthwise=FLAGS.depthwise)
    lr_scheduler = get_lr_scheduler(optimizer, FLAGS.lr_scheduler,
                                    FLAGS.lr_scheduler_params)

    train_meters = get_meters('train', FLAGS.topk, FLAGS.train_width_mults)
    val_meters = get_meters('val', FLAGS.topk, FLAGS.train_width_mults)
    val_meters['best_val_error1'] = ScalarMeter('best_val_error1')

    time_meter = ScalarMeter('runtime')

    # Perform training
    print('Start training...')
    last_epoch = -1
    best_val_error1 = 1.
    for epoch in range(last_epoch + 1, FLAGS.num_epochs):
        t_epoch_start = time.time()
        print('\nEpoch {}/{}.'.format(epoch + 1, FLAGS.num_epochs) +
              ' Print format: [width factor, loss, accuracy].' +
              ' Learning rate: {}'.format(optimizer.param_groups[0]['lr']))

        # Train one epoch
        steps_per_epoch = len(train_loader.dataset) / FLAGS.train_batch_size
        total_steps = FLAGS.num_epochs * steps_per_epoch
        # The 'linear_decaying' scheduler lowers the learning rate by a fixed
        # amount after every training step; other schedulers step per epoch.
        lr_decay_per_step = (FLAGS.lr / total_steps
                             if FLAGS.lr_scheduler == 'linear_decaying' else None)
        train_results = train(epoch, FLAGS.num_epochs, train_loader, model,
                              criterion, optimizer, train_meters,
                              FLAGS.train_width_mults, FLAGS.log_interval,
                              FLAGS.topk, FLAGS.rand_width_mult_args,
                              lr_decay_per_step)

        # Validate
        avg_error1, val_results = test(epoch,
                                       val_loader,
                                       model,
                                       criterion,
                                       val_meters,
                                       FLAGS.train_width_mults,
                                       topk=FLAGS.topk)

        # Update best result
        is_best = avg_error1 < best_val_error1
        if is_best:
            best_val_error1 = avg_error1
        val_meters['best_val_error1'].cache(best_val_error1)

        # Save checkpoint
        print()
        if FLAGS.saving_checkpoint:
            save_model(model, optimizer, epoch, FLAGS.train_width_mults,
                       FLAGS.rand_width_mult_args, train_meters, val_meters,
                       1 - avg_error1, 1 - best_val_error1,
                       FLAGS.epoch_checkpoint, is_best, FLAGS.best_checkpoint)
        print('==> Epoch avg accuracy {:.2f}%,'.format((1 - avg_error1) * 100),
              'Best accuracy: {:.2f}%\n'.format((1 - best_val_error1) * 100))

        logger.flush()

        if lr_scheduler is not None and epoch != FLAGS.num_epochs - 1:
            lr_scheduler.step()
        print('Epoch time: {:.4f} mins'.format(
            (time.time() - t_epoch_start) / 60))

    print('Total time: {:.4f} mins'.format((time.time() - t_exp_start) / 60))
    logger.close()
    return
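
The meter helpers used above (get_meters, ScalarMeter) are not shown in this listing. Below is a minimal sketch of what they could look like, assuming each meter simply caches scalar values (a loss and one top-k error per width multiplier); only the call signatures come from the code above, the key format and internals are hypothetical.

class ScalarMeter:
    """Caches scalar values (e.g. losses or error rates) under a name."""

    def __init__(self, name):
        self.name = name
        self.values = []

    def cache(self, value):
        self.values.append(value)

    def flush(self):
        self.values = []


def get_meters(phase, topk, width_mults):
    """Build one loss meter and one top-k error meter per width multiplier."""
    meters = {}
    for width_mult in width_mults:
        loss_key = '{}_loss_{}'.format(phase, width_mult)
        meters[loss_key] = ScalarMeter(loss_key)
        for k in topk:
            err_key = '{}_top{}_error_{}'.format(phase, k, width_mult)
            meters[err_key] = ScalarMeter(err_key)
    return meters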
Example #2
    def train(self, 
             model_dir=constant.train_config['trained_model_dir'], 
             model_name=constant.predict_config['best_model_name']):

        iteration_step = 0
        logger = Logger(self.model_name)

        start_idx_epoch = 0
        for epoch in range(start_idx_epoch, start_idx_epoch+self.num_epochs): 
            print('Executing Epoch: {}'.format(epoch))
            
            #execute each batch
            for sample in iter(self.train_batch):
                #extract data and label
                data = sample['feature']
                label = sample['target']

                #clear gradient
                self.optimizer.zero_grad()
                
                #forward propagation
                batch_output = self.classifier_model.nn_model(data)

                #calculate loss
                loss = self.error(batch_output, label[:, 0, :])

                #calculate gradient and update weights
                loss.backward()
                self.optimizer.step()
                                                        
                # Track the number of training samples processed so far
                iteration_step += self.batch_size


            eval_metric = EvaluationMetric(self.target_num_classes)

            # Loss on the training and validation sets after this epoch
            training_loss = eval_metric.calculateLoss(self.train_batch, self.batch_size, self.classifier_model.nn_model, self.error)
            test_loss = eval_metric.calculateLoss(self.valid_batch, self.batch_size, self.classifier_model.nn_model, self.error)

            # Precision / recall / F1 on the training and validation sets
            precision_train, recall_train, f1_train = eval_metric.calculateEvaluationMetric(self.train_batch, self.batch_size, self.classifier_model.nn_model)
            precision_valid, recall_valid, f1_valid = eval_metric.calculateEvaluationMetric(self.valid_batch, self.batch_size, self.classifier_model.nn_model)

            print('Epoch: {}, F1-Score (Training Dataset): {}, F1-Score (Validation Dataset): {}, Training Loss: {}, Validation Loss: {}'
                  .format(epoch, f1_train, f1_valid, training_loss, test_loss))
            print('Precision (Training Dataset): {}, Precision (Validation Dataset): {}, Recall (Training Dataset): {}, Recall (Validation Dataset): {}'
                  .format(precision_train, precision_valid, recall_train, recall_valid))
            

            #log the metric in graph with tensorboard
            logger.log(f1_train, f1_valid, training_loss, test_loss, iteration_step)

                
            #save the model weights for this epoch
            model_filepath = os.path.join(model_dir, 'weight_epoch-{}_loss-{}'.format(epoch, training_loss))
            torch.save(self.classifier_model.nn_model.state_dict(), model_filepath)
        
        logger.close()
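
The Logger in this example is only used through log() and close(). A minimal sketch of such a wrapper around torch.utils.tensorboard.SummaryWriter is shown below, assuming the five arguments of log() map to four scalar curves indexed by the running sample count; the tag names and constructor argument are hypothetical.

from torch.utils.tensorboard import SummaryWriter


class Logger:
    """Minimal TensorBoard logger matching the log()/close() calls above."""

    def __init__(self, run_name):
        self.writer = SummaryWriter(comment='_' + run_name)

    def log(self, f1_train, f1_valid, train_loss, valid_loss, step):
        self.writer.add_scalar('f1/train', f1_train, step)
        self.writer.add_scalar('f1/valid', f1_valid, step)
        self.writer.add_scalar('loss/train', train_loss, step)
        self.writer.add_scalar('loss/valid', valid_loss, step)

    def close(self):
        self.writer.close()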
Example #3
def main():
    args = parse_args()

    cfg = from_file(args.config)
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    if args.load_from is not None:
        cfg.load_from = args.load_from
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    if args.seed is not None:
        cfg.seed = args.seed
    if args.gpus is not None:
        cfg.gpus = args.gpus
    # set random seeds
    if cfg.seed is not None:
        print('Set random seed to {}'.format(cfg.seed))
        set_random_seed(cfg.seed)

    if not os.path.exists(cfg.work_dir):
        os.makedirs(cfg.work_dir)

    ################ 1 DATA ###################
    print('Training model on {} dataset...'.format(cfg.data['dataset']))
    batch_size = cfg.data['batch_size'] * cfg.gpus
    train_dataset = UCF101Dataset(data_file=cfg.data['train_file'], img_tmpl=cfg.data['train_img_tmp'],
                                  clip_len=cfg.data['train_clip_len'], size=cfg.data['size'], mode='train', shuffle=True)
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=8)
    val_dataset = UCF101Dataset(data_file=cfg.data['val_file'], img_tmpl=cfg.data['val_img_tmp'],
                                clip_len=cfg.data['val_clip_len'], size=cfg.data['size'], mode='val', shuffle=False)
    val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, num_workers=8)

    ################ 2 MODEL ##################
    model = S3DG(num_class=cfg.model['num_class'])
    if cfg.load_from is not None:
        print('Init the model from pretrained weight {}.'.format(cfg.load_from))
        load_pretrained_model(model, pretrained_path=cfg.load_from)
    else:
        print('Init the model from scratch.')

    # NOTE: training and resumed training must use the same number of GPUs,
    # because nn.DataParallel prefixes parameter names with 'module.'
    if cfg.resume_from is not None:
        load_checkpoint_model(model, checkpoint_path=cfg.resume_from)

    if torch.cuda.device_count() > 1:  
        print('use %d gpus' % (torch.cuda.device_count()))
        model = nn.DataParallel(model, device_ids=range(cfg.gpus))
    else:
        print('use 1 gpu')

    print('Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0))
    model.to(device)

    # ################### 3 CRITERION and OPTIMIZER #########################
    criterion = nn.CrossEntropyLoss().to(device)  # standard crossentropy loss for classification
    # criterion = nn.BCEWithLogitsLoss().to(device)
    optimizer = optim.SGD(model.parameters(), lr=cfg.lr, momentum=0.9, weight_decay=5e-4)
    # set lr scheduler (stays None if no scheduler is configured)
    scheduler = None
    if cfg.lr_scheduler is not None:
        if cfg.lr_scheduler['type'] == 'step':
            scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=cfg.lr_scheduler['step'], gamma=cfg.lr_scheduler['gamma'])
        elif cfg.lr_scheduler['type'] == 'multistep':
            scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=cfg.lr_scheduler['step'], gamma=cfg.lr_scheduler['gamma'])
        elif cfg.lr_scheduler['type'] == 'exponent':
            scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=cfg.lr_scheduler['gamma'])
    
    log_path = cfg.work_dir
    # IF RESUME
    if cfg.resume_from is not None:
        checkpoint = torch.load(cfg.resume_from)
        print("Resume training from checkpoint: {}...".format(cfg.resume_from))
        optimizer.load_state_dict(checkpoint['opt_dict'])
        if scheduler is not None:
            scheduler.load_state_dict(checkpoint['lr_dict'])
        resume_epoch = checkpoint['epoch'] + 1
        logger = Logger(os.path.join(log_path, 'log.txt'), resume=True)
    else:
        print("Training model from start...")
        resume_epoch = 0
        logger = Logger(os.path.join(log_path, 'log.txt'))
        logger.set_names(['Learning Rate', 'Train Loss', 'Val Loss', 'Train Acc.', 'Val Acc.'])

    # tensorboard 
    log_dir = os.path.join(cfg.work_dir, datetime.now().strftime('%b%d_%H-%M-%S'))
    writer = SummaryWriter(log_dir=log_dir)

    ################## 4 BEGIN TRAINING #########################
    num_epochs = cfg.num_epochs
    save_epoch = cfg.interval
    save_dir = cfg.work_dir
    display = cfg.display

    best_acc = 0.0
    best_epoch = 0

    for epoch in tqdm(range(resume_epoch, num_epochs)):
        print('\n----------------- Training -------------------')
        print('Epoch: {}/{}'.format(epoch, num_epochs-1))
        train_loss, train_acc = train(train_dataloader, model, criterion, optimizer, epoch, writer, display)
        if args.validate:
            print('\n----------------- Validation -------------------')
            print('Epoch: {}/{}'.format(epoch, num_epochs-1))
            val_loss, val_acc = validation(val_dataloader, model, criterion, optimizer, epoch, writer, display)
            if val_acc >= best_acc:
                best_acc = val_acc
                best_epoch = epoch
            print("\nThe best validation top1-accuracy: {:.3f}%, the best epoch: {}".format(best_acc,best_epoch))

        # EPOCH
        lr = optimizer.state_dict()['param_groups'][0]['lr']
        if args.validate:
            logger.append([lr, train_loss, val_loss, train_acc, val_acc])
        else:
            logger.append([lr, train_loss, 0.0, train_acc, 0.0]) # no valid
        writer.add_scalar('train/learning_rate', lr, epoch)

        if cfg.lr_scheduler is not None:
            scheduler.step()

        if epoch % save_epoch == 0:
            torch.save({
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'opt_dict': optimizer.state_dict(),
                'lr_dict': scheduler.state_dict() if scheduler is not None else None
            }, os.path.join(save_dir, 'epoch-' + str(epoch) + '.pth'))

    writer.close()
    logger.close()
    logger.plot()
    savefig(os.path.join(log_path, 'log.eps'))
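
The helpers load_pretrained_model and load_checkpoint_model are not shown in this listing. The NOTE about needing the same number of GPUs exists because nn.DataParallel saves parameters with a 'module.' prefix; a minimal, hypothetical sketch of a checkpoint loader that strips this prefix (assuming the checkpoint layout used in the torch.save call above) could look like this.

def load_checkpoint_model(model, checkpoint_path):
    """Load 'state_dict' from a checkpoint, stripping DataParallel's 'module.' prefix."""
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    state_dict = checkpoint['state_dict']
    cleaned = {}
    for key, value in state_dict.items():
        cleaned[key[len('module.'):] if key.startswith('module.') else key] = value
    model.load_state_dict(cleaned)
    return model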