def run_one_experiment():
    t_exp_start = time.time()
    # Save all print-out to a logger file
    logger = Logger(FLAGS.log_file)
    # Print experiment setup
    for k in sorted(FLAGS.keys()):
        print('{}: {}'.format(k, FLAGS[k]))

    # Init torch
    if FLAGS.seed is None:
        torch.backends.cudnn.deterministic = False
        torch.backends.cudnn.benchmark = True
    else:
        random.seed(FLAGS.seed)
        np.random.seed(FLAGS.seed)
        torch.manual_seed(FLAGS.seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    # Init model
    model = importlib.import_module(FLAGS.module_name).get_model(FLAGS)
    model = torch.nn.DataParallel(model).cuda()
    if FLAGS.pretrained:
        checkpoint = torch.load(FLAGS.pretrained)
        model.module.load_state_dict(checkpoint['model'])
        print('Loaded model {}.'.format(FLAGS.pretrained))
    if FLAGS.model_profiling and len(FLAGS.model_profiling) > 0:
        print(model)
        profiling(model, FLAGS.model_profiling, FLAGS.image_size,
                  FLAGS.image_channels, FLAGS.train_width_mults,
                  FLAGS.model_profiling_verbose)
    logger.flush()

    # Init data loaders
    train_loader, val_loader, _, train_set = prepare_data(
        FLAGS.dataset, FLAGS.data_dir, FLAGS.data_transforms,
        FLAGS.data_loader, FLAGS.data_loader_workers,
        FLAGS.train_batch_size, FLAGS.val_batch_size, FLAGS.drop_last,
        FLAGS.test_only)
    class_labels = train_set.classes

    # Perform inference/test only
    if FLAGS.test_only:
        print('Start testing...')
        min_wm = min(FLAGS.train_width_mults)
        max_wm = max(FLAGS.train_width_mults)
        if FLAGS.test_num_width_mults == 1:
            test_width_mults = []
        else:
            step = (max_wm - min_wm) / (FLAGS.test_num_width_mults - 1)
            test_width_mults = np.arange(min_wm, max_wm, step).tolist()
        test_width_mults += [max_wm]
        criterion = torch.nn.CrossEntropyLoss(reduction='none').cuda()
        test_meters = get_meters('val', FLAGS.topk, test_width_mults)
        epoch = -1
        avg_error1, _ = test(epoch, val_loader, model, criterion, test_meters,
                             test_width_mults, topk=FLAGS.topk)
        print('==> Epoch avg accuracy {:.2f}%'.format((1 - avg_error1) * 100))
        logger.close()
        plot_acc_width(FLAGS.log_file)
        return

    # Init training devices
    criterion = torch.nn.CrossEntropyLoss(reduction='none').cuda()
    optimizer = get_optimizer(model, FLAGS.optimizer, FLAGS.weight_decay,
                              FLAGS.lr, FLAGS.momentum, FLAGS.nesterov,
                              depthwise=FLAGS.depthwise)
    lr_scheduler = get_lr_scheduler(optimizer, FLAGS.lr_scheduler,
                                    FLAGS.lr_scheduler_params)
    train_meters = get_meters('train', FLAGS.topk, FLAGS.train_width_mults)
    val_meters = get_meters('val', FLAGS.topk, FLAGS.train_width_mults)
    val_meters['best_val_error1'] = ScalarMeter('best_val_error1')
    time_meter = ScalarMeter('runtime')

    # A per-step decay is only needed for the 'linear_decaying' scheduler:
    # spread the initial learning rate evenly over all training steps.
    steps_per_epoch = len(train_loader.dataset) / FLAGS.train_batch_size
    total_steps = FLAGS.num_epochs * steps_per_epoch
    lr_decay_per_step = (FLAGS.lr / total_steps
                         if FLAGS.lr_scheduler == 'linear_decaying' else None)

    # Perform training
    print('Start training...')
    last_epoch = -1
    best_val_error1 = 1.
    for epoch in range(last_epoch + 1, FLAGS.num_epochs):
        t_epoch_start = time.time()
        print('\nEpoch {}/{}.'.format(epoch + 1, FLAGS.num_epochs)
              + ' Print format: [width factor, loss, accuracy].'
              + ' Learning rate: {}'.format(optimizer.param_groups[0]['lr']))

        # Train one epoch
        train_results = train(epoch, FLAGS.num_epochs, train_loader, model,
                              criterion, optimizer, train_meters,
                              FLAGS.train_width_mults, FLAGS.log_interval,
                              FLAGS.topk, FLAGS.rand_width_mult_args,
                              lr_decay_per_step)

        # Validate
        avg_error1, val_results = test(epoch, val_loader, model, criterion,
                                       val_meters, FLAGS.train_width_mults,
                                       topk=FLAGS.topk)

        # Update best result
        is_best = avg_error1 < best_val_error1
        if is_best:
            best_val_error1 = avg_error1
        val_meters['best_val_error1'].cache(best_val_error1)

        # Save checkpoint
        print()
        if FLAGS.saving_checkpoint:
            save_model(model, optimizer, epoch, FLAGS.train_width_mults,
                       FLAGS.rand_width_mult_args, train_meters, val_meters,
                       1 - avg_error1, 1 - best_val_error1,
                       FLAGS.epoch_checkpoint, is_best, FLAGS.best_checkpoint)
        print('==> Epoch avg accuracy {:.2f}%,'.format((1 - avg_error1) * 100),
              'Best accuracy: {:.2f}%\n'.format((1 - best_val_error1) * 100))
        logger.flush()

        if lr_scheduler is not None and epoch != FLAGS.num_epochs - 1:
            lr_scheduler.step()
        print('Epoch time: {:.4f} mins'.format(
            (time.time() - t_epoch_start) / 60))

    print('Total time: {:.4f} mins'.format((time.time() - t_exp_start) / 60))
    logger.close()
    return
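# --- Hedged sketch (assumption): ScalarMeter and get_meters are project-local
# helpers that are not shown in this excerpt. A minimal version consistent with
# the calls above could look like the following; the real implementations may
# carry extra logic (averaging, flushing to the logger, etc.).
class ScalarMeter:
    """Caches scalar values between log flushes."""

    def __init__(self, name):
        self.name = name
        self.values = []

    def cache(self, value):
        self.values.append(value)

    def flush(self):
        self.values = []


def get_meters(phase, topk, width_mults):
    """Build one loss meter and one top-k error meter per width multiplier."""
    meters = {}
    for width_mult in width_mults:
        loss_name = '{}_loss_{}'.format(phase, width_mult)
        meters[loss_name] = ScalarMeter(loss_name)
        for k in topk:
            err_name = '{}_top{}_error_{}'.format(phase, k, width_mult)
            meters[err_name] = ScalarMeter(err_name)
    return meters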
def train(self, model_dir=constant.train_config['trained_model_dir'],
          model_name=constant.predict_config['best_model_name']):
    iteration_step = 0
    logger = Logger(self.model_name)
    start_idx_epoch = 0
    for epoch in range(start_idx_epoch, start_idx_epoch + self.num_epochs):
        print('Executing Epoch: {}'.format(epoch))
        # Execute each batch
        for sample in iter(self.train_batch):
            # Extract data and label
            data = sample['feature']
            label = sample['target']
            # Clear gradients
            self.optimizer.zero_grad()
            # Forward propagation
            batch_output = self.classifier_model.nn_model(data)
            # Calculate loss
            loss = self.error(batch_output, label[:, 0, :])
            # Calculate gradients and update weights
            loss.backward()
            self.optimizer.step()
            iteration_step += self.batch_size

        # Find metrics on the training and validation datasets
        eval_metric = EvaluationMetric(self.target_num_classes)
        training_loss = eval_metric.calculateLoss(
            self.train_batch, self.batch_size,
            self.classifier_model.nn_model, self.error)
        validation_loss = eval_metric.calculateLoss(
            self.valid_batch, self.batch_size,
            self.classifier_model.nn_model, self.error)
        precision_train, recall_train, f1_train = eval_metric.calculateEvaluationMetric(
            self.train_batch, self.batch_size, self.classifier_model.nn_model)
        precision_valid, recall_valid, f1_valid = eval_metric.calculateEvaluationMetric(
            self.valid_batch, self.batch_size, self.classifier_model.nn_model)
        print('Epoch: {}, F1-Score (Training Dataset): {}, F1-Score (Validation Dataset): {}, '
              'Training Loss: {}, Validation Loss: {}'
              .format(epoch, f1_train, f1_valid, training_loss, validation_loss))
        print('Precision (Training Dataset): {}, Precision (Validation Dataset): {}, '
              'Recall (Training Dataset): {}, Recall (Validation Dataset): {}'
              .format(precision_train, precision_valid, recall_train, recall_valid))

        # Log the metrics with TensorBoard
        logger.log(f1_train, f1_valid, training_loss, validation_loss, iteration_step)

        # Save the model weights after each epoch
        model_filepath = os.path.join(
            model_dir, 'weight_epoch-{}_loss-{}'.format(epoch, training_loss))
        torch.save(self.classifier_model.nn_model.state_dict(), model_filepath)

    logger.close()
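# --- Hedged sketch (assumption): EvaluationMetric is project-local and not shown
# here. Its calculateLoss is assumed to average the criterion over a loader
# without updating weights; the actual signature and behaviour may differ.
def calculate_loss_sketch(loader, model, criterion):
    # Assumes `torch` is imported and the loader yields the same
    # {'feature': ..., 'target': ...} dicts consumed in train() above.
    model.eval()
    total_loss, num_batches = 0.0, 0
    with torch.no_grad():
        for sample in loader:
            output = model(sample['feature'])
            total_loss += criterion(output, sample['target'][:, 0, :]).item()
            num_batches += 1
    model.train()
    return total_loss / max(num_batches, 1)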
def main():
    args = parse_args()
    cfg = from_file(args.config)
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    if args.load_from is not None:
        cfg.load_from = args.load_from
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    if args.seed is not None:
        cfg.seed = args.seed
    if args.gpus is not None:
        cfg.gpus = args.gpus

    # Set random seeds
    if cfg.seed is not None:
        print('Set random seed to {}'.format(cfg.seed))
        set_random_seed(cfg.seed)

    if not os.path.exists(cfg.work_dir):
        os.makedirs(cfg.work_dir)

    ################ 1 DATA ###################
    print('Training model on {} dataset...'.format(cfg.data['dataset']))
    batch_size = cfg.data['batch_size'] * cfg.gpus
    train_dataset = UCF101Dataset(data_file=cfg.data['train_file'],
                                  img_tmpl=cfg.data['train_img_tmp'],
                                  clip_len=cfg.data['train_clip_len'],
                                  size=cfg.data['size'],
                                  mode='train', shuffle=True)
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size,
                                  shuffle=True, num_workers=8)
    val_dataset = UCF101Dataset(data_file=cfg.data['val_file'],
                                img_tmpl=cfg.data['val_img_tmp'],
                                clip_len=cfg.data['val_clip_len'],
                                size=cfg.data['size'],
                                mode='val', shuffle=False)
    val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size,
                                shuffle=False, num_workers=8)

    ################ 2 MODEL ##################
    if cfg.load_from is not None:
        print('Init the model from pretrained weight {}.'.format(cfg.load_from))
        model = S3DG(num_class=cfg.model['num_class'])
        load_pretrained_model(model, pretrained_path=cfg.load_from)
    else:
        print('Init the model from scratch.')
        model = S3DG(num_class=cfg.model['num_class'])

    # NOTE: training and resumed training must use the same number of GPUs,
    # since nn.DataParallel prefixes parameter names with 'module'.
    if cfg.resume_from is not None:
        load_checkpoint_model(model, checkpoint_path=cfg.resume_from)
    if torch.cuda.device_count() > 1:
        print('use %d gpus' % torch.cuda.device_count())
        model = nn.DataParallel(model, device_ids=range(cfg.gpus))
    else:
        print('use 1 gpu')
    print('Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0))
    model.to(device)

    ################ 3 CRITERION and OPTIMIZER ################
    criterion = nn.CrossEntropyLoss().to(device)  # standard cross-entropy loss for classification
    # criterion = nn.BCEWithLogitsLoss().to(device)
    optimizer = optim.SGD(model.parameters(), lr=cfg.lr, momentum=0.9,
                          weight_decay=5e-4)

    # Set the learning-rate scheduler
    scheduler = None
    if cfg.lr_scheduler is not None:
        if cfg.lr_scheduler['type'] == 'step':
            scheduler = optim.lr_scheduler.StepLR(
                optimizer, step_size=cfg.lr_scheduler['step'],
                gamma=cfg.lr_scheduler['gamma'])
        elif cfg.lr_scheduler['type'] == 'multistep':
            scheduler = optim.lr_scheduler.MultiStepLR(
                optimizer, milestones=cfg.lr_scheduler['step'],
                gamma=cfg.lr_scheduler['gamma'])
        elif cfg.lr_scheduler['type'] == 'exponent':
            scheduler = optim.lr_scheduler.ExponentialLR(
                optimizer, gamma=cfg.lr_scheduler['gamma'])

    log_path = cfg.work_dir

    # Resume optimizer/scheduler state if a checkpoint is given
    if cfg.resume_from is not None:
        checkpoint = torch.load(cfg.resume_from)
        print('Resume training from checkpoint: {}...'.format(cfg.resume_from))
        optimizer.load_state_dict(checkpoint['opt_dict'])
        if scheduler is not None:
            scheduler.load_state_dict(checkpoint['lr_dict'])
        resume_epoch = checkpoint['epoch'] + 1
        logger = Logger(os.path.join(log_path, 'log.txt'), resume=True)
    else:
        print('Training model from start...')
        resume_epoch = 0
        logger = Logger(os.path.join(log_path, 'log.txt'))
        logger.set_names(['Learning Rate', 'Train Loss', 'Val Loss',
                          'Train Acc.', 'Val Acc.'])

    # TensorBoard writer
    log_dir = os.path.join(cfg.work_dir,
                           datetime.now().strftime('%b%d_%H-%M-%S'))
    writer = SummaryWriter(log_dir=log_dir)

    ################ 4 BEGIN TRAINING ################
    num_epochs = cfg.num_epochs
    save_epoch = cfg.interval
    save_dir = cfg.work_dir
    display = cfg.display
    best_acc = 0.0
    best_epoch = 0
    for epoch in tqdm(range(resume_epoch, num_epochs)):
        print('\n----------------- Training -------------------')
        print('Epoch: {}/{}'.format(epoch, num_epochs - 1))
        train_loss, train_acc = train(train_dataloader, model, criterion,
                                      optimizer, epoch, writer, display)

        if args.validate:
            print('\n----------------- Validation -------------------')
            print('Epoch: {}/{}'.format(epoch, num_epochs - 1))
            val_loss, val_acc = validation(val_dataloader, model, criterion,
                                           optimizer, epoch, writer, display)
            if val_acc >= best_acc:
                best_acc = val_acc
                best_epoch = epoch
            print('\nThe best validation top1-accuracy: {:.3f}%, the best epoch: {}'
                  .format(best_acc, best_epoch))

        # Log per-epoch results
        lr = optimizer.state_dict()['param_groups'][0]['lr']
        if args.validate:
            logger.append([lr, train_loss, val_loss, train_acc, val_acc])
        else:
            logger.append([lr, train_loss, 0.0, train_acc, 0.0])  # no validation
        writer.add_scalar('train/learning_rate', lr, epoch)

        if cfg.lr_scheduler is not None:
            scheduler.step()
        if epoch % save_epoch == 0:
            torch.save({
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'opt_dict': optimizer.state_dict(),
                'lr_dict': scheduler.state_dict() if scheduler is not None else None,
            }, os.path.join(save_dir, 'epoch-' + str(epoch) + '.pth'))

    writer.close()
    logger.close()
    logger.plot()
    savefig(os.path.join(log_path, 'log.eps'))
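# --- Hedged sketch (assumption): the train()/validation() helpers called in
# main() are defined elsewhere in the repo. A minimal epoch loop that matches
# their (loss, accuracy) return values and the writer/display arguments might
# look like this; the exact batch format yielded by UCF101Dataset is an assumption.
def train_epoch_sketch(dataloader, model, criterion, optimizer, epoch, writer, display):
    model.train()
    running_loss, correct, total = 0.0, 0, 0
    for step, (clips, labels) in enumerate(dataloader):
        clips, labels = clips.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(clips)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * labels.size(0)
        correct += (outputs.argmax(dim=1) == labels).sum().item()
        total += labels.size(0)
        if step % display == 0:
            writer.add_scalar('train/step_loss', loss.item(),
                              epoch * len(dataloader) + step)
    avg_loss = running_loss / total
    avg_acc = 100.0 * correct / total
    writer.add_scalar('train/epoch_loss', avg_loss, epoch)
    writer.add_scalar('train/epoch_acc', avg_acc, epoch)
    return avg_loss, avg_acc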