def train():
    datafile = DATA('train', dataset_dir)
    dataloader = DataLoader(datafile, batch_size=batch_size, shuffle=True,
                            num_workers=workers, drop_last=True)
    print('-------------train-----------------')
    print('Length of train set is {0}'.format(len(datafile)))
    model = Net()
    model = model.cuda()
    model = nn.DataParallel(model)
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()
    cnt = 0
    count = 0
    for epoch in range(nepoch):
        for img, label in dataloader:
            # Variable is deprecated; plain CUDA tensors suffice
            img, label = img.cuda(), label.cuda()
            out = model(img)
            loss = criterion(out, label.squeeze())
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            cnt += 1
            print('Epoch:{0}, Frame:{1}, train_loss {2}'.format(
                epoch, cnt * batch_size, loss.item() / batch_size))
        # Save and validate once per epoch
        torch.save(model.state_dict(), '{0}/{1}model.pth'.format(model_cp, count))
        val(count)
        count += 1
def test_template(template):
    if 'troposphere/EMR_Cluster' in template:
        pytest.skip('troposphere/EMR_Cluster uses undocumented '
                    'AWS::EMR::Cluster.EbsConfiguration')
    if 'OpenStack' in template:
        pytest.skip('OpenStack is not supported')
    with open(template) as f:
        instance = json.load(f)
    val.val(instance, schema)
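# The `template` argument above is typically supplied by pytest
# parametrization over all template files on disk. A minimal sketch of such a
# harness; the glob pattern and hook wiring here are assumptions for
# illustration, not necessarily this repo's actual conftest.
import glob
import pytest

def pytest_generate_tests(metafunc):
    if 'template' in metafunc.fixturenames:
        metafunc.parametrize('template',
                             glob.glob('templates/**/*.json', recursive=True))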
def train(model, optimizer, scheduler, dataset, cfg, val_dataset=None, vis=True):
    training_loss_list = []
    test_loss_list = []
    for epoch in range(cfg['max_epoch']):
        true_nums = 0
        for index, (inputs, label) in enumerate(dataset):
            inputs = img_preprocess(inputs)
            outputs = model.forward(inputs)
            loss, reg_loss = model.compute_loss(label)
            grads = model.backward()
            optimizer.step(grads)
            true_num, precision = cal_precision(outputs, label)
            true_nums += true_num
            logging.info(
                "[epoch %d][batch %d] train loss: %.2f, reg loss: %.2f, "
                "total loss: %.4f, precision %.4f || lr: %.6f"
                % (epoch, index, loss, reg_loss, (loss + reg_loss), precision,
                   optimizer.lr))
        scheduler.step()
        params_path = save_weights(model.params, cfg['workspace'], model.name,
                                   epoch)
        logging.info("save model at: %s, training precision %.4f"
                     % (params_path, true_nums / dataset.total))
        # Note: despite the name, this list tracks per-epoch training
        # precision, not loss.
        training_loss_list.append(true_nums / dataset.total)
        if val_dataset is not None:
            loss = val(model, model.name, params_path, val_dataset)
            test_loss_list.append(loss)
    if vis:
        draw_loss_graph(cfg['workspace'] + "/loss.png",
                        training_loss_list, test_loss_list)
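# cal_precision is assumed by the loop above but not shown in this excerpt.
# A minimal NumPy sketch of top-1 batch accuracy, consistent with the
# (true_num, precision) return pair; illustrative only.
import numpy as np

def cal_precision(outputs, label):
    """Return (number of correct top-1 predictions, batch precision).

    outputs: (N, C) class scores; label: (N,) integer class ids.
    """
    pred = np.argmax(outputs, axis=1)
    true_num = int(np.sum(pred == np.asarray(label)))
    return true_num, true_num / len(label)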
def test_fn_findinmap_valid(instance):
    val.val(instance, basic_types_schema,
            definition="#/definitions/functions/Fn::FindInMap")


def test_fn_if_invalid(instance):
    with pytest.raises(jsonschema.ValidationError):
        val.val(instance, basic_types_schema,
                definition="#/definitions/condition_functions/Fn::If")


def test_lenientISO8601_valid(instance):
    val.val(instance, basic_types_schema,
            definition="#/definitions/timestamp")


def test_fn_base64_valid(instance):
    val.val(instance, basic_types_schema,
            definition="#/definitions/functions/Fn::Base64")
#!/usr/bin/env python
import sys

import val
import tools

schema = tools.load('schema.json')
template = tools.load(sys.argv[1])
val.val(template, schema)
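# tools.load is not shown in this excerpt. A minimal sketch under the
# assumption that it simply parses a JSON document from disk; this is an
# illustration, not the project's actual helper.
import json

def load(path):
    """Parse a JSON document from `path` (assumed behavior of tools.load)."""
    with open(path) as f:
        return json.load(f)

# Example invocation of the script above: ./validate.py my-template.json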
def test_string_list_valid(instance):
    val.val(instance, basic_types_schema,
            definition="#/definitions/list<string>")
def main(args):
    # Step0 ====================================================================
    # Set GPU ids
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids
    # Set the file name format
    FILE_NAME_FORMAT = "{0}_{1}_{2}_{3:d}{4}".format(
        args.model, args.dataset, args.loss, args.epochs, args.flag)
    # Set the results file path
    RESULT_FILE_NAME = FILE_NAME_FORMAT + '_results.pkl'
    RESULT_FILE_PATH = os.path.join(RESULT_PATH, RESULT_FILE_NAME)
    # Set the checkpoint file path
    CHECKPOINT_FILE_NAME = FILE_NAME_FORMAT + '.ckpt'
    CHECKPOINT_FILE_PATH = os.path.join(CHECKPOINT_PATH, CHECKPOINT_FILE_NAME)
    BEST_CHECKPOINT_FILE_NAME = FILE_NAME_FORMAT + '_best.ckpt'
    BEST_CHECKPOINT_FILE_PATH = os.path.join(CHECKPOINT_PATH,
                                             BEST_CHECKPOINT_FILE_NAME)
    # Set the random seed for reproducibility
    random.seed(190811)
    torch.manual_seed(190811)
    torch.cuda.manual_seed_all(190811)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Step1 ====================================================================
    # Load dataset
    train_dataloader = CycleGAN_Dataloader(name=args.dataset,
                                           num_workers=args.num_workers)
    test_dataloader = CycleGAN_Dataloader(name=args.dataset, train=False,
                                          num_workers=args.num_workers)
    print('==> DataLoader ready.')

    # Step2 ====================================================================
    # Make the model (cityscapes uses the shallower 6-resblock generator)
    if args.dataset == 'cityscapes':
        A_generator = Generator(num_resblock=6)
        B_generator = Generator(num_resblock=6)
    else:
        A_generator = Generator(num_resblock=9)
        B_generator = Generator(num_resblock=9)
    A_discriminator = Discriminator()
    B_discriminator = Discriminator()
    # Check DataParallel available
    if torch.cuda.device_count() > 1:
        A_generator = nn.DataParallel(A_generator)
        B_generator = nn.DataParallel(B_generator)
        A_discriminator = nn.DataParallel(A_discriminator)
        B_discriminator = nn.DataParallel(B_discriminator)
    # Check CUDA available
    if torch.cuda.is_available():
        A_generator.cuda()
        B_generator.cuda()
        A_discriminator.cuda()
        B_discriminator.cuda()
    print('==> Model ready.')

    # Step3 ====================================================================
    # Set each loss function
    criterion_GAN = nn.MSELoss()
    criterion_cycle = nn.L1Loss()
    criterion_identity = nn.L1Loss()
    criterion_feature = nn.L1Loss()
    # Set each optimizer
    optimizer_G = optim.Adam(itertools.chain(A_generator.parameters(),
                                             B_generator.parameters()),
                             lr=args.lr, betas=(0.5, 0.999))
    optimizer_D = optim.Adam(itertools.chain(A_discriminator.parameters(),
                                             B_discriminator.parameters()),
                             lr=args.lr, betas=(0.5, 0.999))

    # Set learning rate scheduler: constant lr for the first half of training,
    # then a linear decay to zero.
    def lambda_rule(epoch):
        epoch_decay = args.epochs / 2
        lr_linear_scale = 1.0 - max(0, epoch + 1 - epoch_decay) \
                          / float(epoch_decay + 1)
        return lr_linear_scale

    scheduler_G = lr_scheduler.LambdaLR(optimizer_G, lr_lambda=lambda_rule)
    scheduler_D = lr_scheduler.LambdaLR(optimizer_D, lr_lambda=lambda_rule)
    print('==> Criterion and optimizer ready.')

    # Step4 ====================================================================
    # Train and validate the model
    start_epoch = 0
    best_metric = float("inf")
    # Initialize the result lists
    train_loss_G = []
    train_loss_D_A = []
    train_loss_D_B = []
    # Set image buffers
    A_buffer = ImageBuffer(args.buffer_size)
    B_buffer = ImageBuffer(args.buffer_size)

    if args.resume:
        assert os.path.exists(CHECKPOINT_FILE_PATH), 'No checkpoint file!'
        checkpoint = torch.load(CHECKPOINT_FILE_PATH)
        A_generator.load_state_dict(checkpoint['A_generator_state_dict'])
        B_generator.load_state_dict(checkpoint['B_generator_state_dict'])
        A_discriminator.load_state_dict(checkpoint['A_discriminator_state_dict'])
        B_discriminator.load_state_dict(checkpoint['B_discriminator_state_dict'])
        optimizer_G.load_state_dict(checkpoint['optimizer_G_state_dict'])
        optimizer_D.load_state_dict(checkpoint['optimizer_D_state_dict'])
        scheduler_G.load_state_dict(checkpoint['scheduler_G_state_dict'])
        scheduler_D.load_state_dict(checkpoint['scheduler_D_state_dict'])
        start_epoch = checkpoint['epoch']
        train_loss_G = checkpoint['train_loss_G']
        train_loss_D_A = checkpoint['train_loss_D_A']
        train_loss_D_B = checkpoint['train_loss_D_B']
        best_metric = checkpoint['best_metric']

    # Save the training information
    result_data = {}
    result_data['model'] = args.model
    result_data['dataset'] = args.dataset
    result_data['loss'] = args.loss
    result_data['target_epoch'] = args.epochs
    result_data['batch_size'] = args.batch_size

    # Check the directory of the file path
    if not os.path.exists(os.path.dirname(RESULT_FILE_PATH)):
        os.makedirs(os.path.dirname(RESULT_FILE_PATH))
    if not os.path.exists(os.path.dirname(CHECKPOINT_FILE_PATH)):
        os.makedirs(os.path.dirname(CHECKPOINT_FILE_PATH))
    print('==> Train ready.')

    for epoch in range(args.epochs):
        # Start after the checkpoint epoch
        if epoch < start_epoch:
            continue
        print("\n[Epoch: {:3d}/{:3d}]".format(epoch + 1, args.epochs))
        epoch_time = time.time()
        #=======================================================================
        # Train and validate the model
        tloss_G, tloss_D = train(
            train_dataloader, A_generator, B_generator, A_discriminator,
            B_discriminator, criterion_GAN, criterion_cycle, criterion_identity,
            optimizer_G, optimizer_D, A_buffer, B_buffer, args.loss,
            args.lambda_cycle, args.lambda_identity, criterion_feature,
            args.lambda_feature, args.attention)
        train_loss_G.append(tloss_G)
        train_loss_D_A.append(tloss_D['A'])
        train_loss_D_B.append(tloss_D['B'])
        if (epoch + 1) % 10 == 0:
            val(test_dataloader, A_generator, B_generator, A_discriminator,
                B_discriminator, epoch + 1, FILE_NAME_FORMAT, args.attention)
        # Update the optimizer's learning rate
        current_lr = optimizer_G.param_groups[0]['lr']
        scheduler_G.step()
        scheduler_D.step()
        #=======================================================================
        current = time.time()

        # Save the current result
        result_data['current_epoch'] = epoch
        result_data['train_loss_G'] = train_loss_G
        result_data['train_loss_D_A'] = train_loss_D_A
        result_data['train_loss_D_B'] = train_loss_D_B
        # Save result_data as pkl file
        with open(RESULT_FILE_PATH, 'wb') as pkl_file:
            pickle.dump(result_data, pkl_file,
                        protocol=pickle.HIGHEST_PROTOCOL)

        # A separate best-checkpoint save (keyed on the generator loss) is left
        # disabled in this code; only the rolling checkpoint below and the
        # periodic per-epoch snapshots are kept.
        # Save the current checkpoint
        checkpoint = {
            'epoch': epoch + 1,
            'A_generator_state_dict': A_generator.state_dict(),
            'B_generator_state_dict': B_generator.state_dict(),
            'A_discriminator_state_dict': A_discriminator.state_dict(),
            'B_discriminator_state_dict': B_discriminator.state_dict(),
            'optimizer_G_state_dict': optimizer_G.state_dict(),
            'optimizer_D_state_dict': optimizer_D.state_dict(),
            'scheduler_G_state_dict': scheduler_G.state_dict(),
            'scheduler_D_state_dict': scheduler_D.state_dict(),
            'train_loss_G': train_loss_G,
            'train_loss_D_A': train_loss_D_A,
            'train_loss_D_B': train_loss_D_B,
            'best_metric': best_metric,
        }
        torch.save(checkpoint, CHECKPOINT_FILE_PATH)

        # Keep a per-epoch snapshot every 10 epochs
        if (epoch + 1) % 10 == 0:
            CHECKPOINT_FILE_NAME_epoch = \
                FILE_NAME_FORMAT + '_{0}.ckpt'.format(epoch + 1)
            CHECKPOINT_FILE_PATH_epoch = os.path.join(
                CHECKPOINT_PATH, FILE_NAME_FORMAT, CHECKPOINT_FILE_NAME_epoch)
            if not os.path.exists(os.path.dirname(CHECKPOINT_FILE_PATH_epoch)):
                os.makedirs(os.path.dirname(CHECKPOINT_FILE_PATH_epoch))
            torch.save(checkpoint, CHECKPOINT_FILE_PATH_epoch)

        # Print the information on the console
        print("model                : {}".format(args.model))
        print("dataset              : {}".format(args.dataset))
        print("loss                 : {}".format(args.loss))
        print("batch_size           : {}".format(args.batch_size))
        print("current lrate        : {:f}".format(current_lr))
        print("G loss               : {:f}".format(tloss_G))
        print("D A/B loss           : {:f}/{:f}".format(tloss_D['A'], tloss_D['B']))
        print("epoch time           : {0:.3f} sec".format(current - epoch_time))
        print("Current elapsed time : {0:.3f} sec".format(current - start))

    print('==> Train done.')
    print(' '.join(['Results have been saved at', RESULT_FILE_PATH]))
    print(' '.join(['Checkpoints have been saved at', CHECKPOINT_FILE_PATH]))
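# ImageBuffer is referenced above but not defined in this excerpt. A minimal
# sketch of the standard CycleGAN image pool: with probability 0.5 the
# discriminator sees a previously generated image instead of the newest one,
# which reduces oscillation. Illustrative only, not this repo's exact class.
import random
import torch

class ImageBuffer:
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.images = []  # history of generated images

    def push_and_pop(self, batch):
        """Store new fakes and return a batch mixing new and stored fakes."""
        out = []
        for image in batch.detach():
            image = image.unsqueeze(0)
            if len(self.images) < self.buffer_size:
                self.images.append(image)
                out.append(image)
            elif random.random() < 0.5:
                idx = random.randrange(self.buffer_size)
                out.append(self.images[idx].clone())
                self.images[idx] = image
            else:
                out.append(image)
        return torch.cat(out, dim=0)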
def main(data_root, config):
    log_dir = 'logs'
    if os.path.exists(log_dir):
        import shutil
        shutil.rmtree(log_dir)
    logger = SummaryWriter(log_dir)

    print_freq = config['print_freq']
    save_freq = config['save_freq']
    data_save_dir = config['data_save_dir']
    model_save_dir = config['model_save_dir']
    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)

    print('===== preparing =====')
    hoi_db = prepare_hico(data_root, data_save_dir)
    test_dataset = HICODatasetSpa(hoi_db['val'])
    test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)
    train_dataset = HICODatasetSpa(hoi_db['train'])
    train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    hoi_classes_path = os.path.join(data_root, 'hoi_categories.pkl')
    hoi_classes, _, _, hoi2int = load_hoi_classes(hoi_classes_path)
    print('===== done =====')

    model = SpaLan(config['spa_feature_dim'], config['num_hoi_classes'],
                   config['num_obj_classes'], config['num_key_points'])
    model = model.cuda()

    # Optimizer hyper-parameters
    lr = config['learning_rate']
    lr_adjust_freq = config['lr_adjust_freq']
    wd = config['weight_decay']
    mt = config['momentum']

    batch_count = 0
    last_print_time = time.time()
    for epoch in range(config['n_epochs']):
        model.train()
        # The optimizer is rebuilt every epoch so the manually decayed
        # learning rate below takes effect.
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=mt,
                                    weight_decay=wd, nesterov=True)
        for data in train_dataloader:
            batch_count += 1
            spa_maps = data[0].cuda()
            obj_vecs = data[1].cuda()
            hoi_cates = data[2].cuda()
            bin_cates = data[3].cuda()
            obj_cates = data[4].cuda()
            pose_feats = data[5].cuda()

            pos_mask = torch.eq(bin_cates, 0)
            if pos_mask.sum().item() == 0:
                continue

            optimizer.zero_grad()
            bin_prob, hoi_prob, \
                loss_bin, loss_hoi, \
                error_bin, error_hoi = model(spa_maps, obj_cates, pose_feats,
                                             hoi_cates, bin_cates, pos_mask)
            # loss = loss_bin + loss_hoi
            loss = loss_hoi
            loss.backward()
            optimizer.step()

            logger.add_scalars('loss', {
                'all': loss.item(),
                'bin': loss_bin.item(),
                'hoi': loss_hoi.item()
            }, batch_count)
            logger.add_scalars('error', {
                'bin': error_bin.item(),
                'hoi': error_hoi.item()
            }, batch_count)

            if batch_count % print_freq == 0:
                curr_time = time.time()
                print('[Epoch %d][Batch %d] loss: %.4f time: %.2fs'
                      % (epoch, batch_count, loss.item(),
                         curr_time - last_print_time))
                print('\t\tloss_bin: %.4f\t\tloss_cls: %.4f'
                      % (loss_bin.item(), loss_hoi.item()))
                print('\t\terror_bin: %.4f\t\terror_hoi: %.4f'
                      % (error_bin.item(), error_hoi.item()))
                last_print_time = curr_time

        model.eval()
        error_bin_avg, error_hoi_avg = val(model, test_dataloader, hoi_classes,
                                           hoi2int, show=False)
        logger.add_scalars('error_val', {
            'bin': error_bin_avg,
            'hoi': error_hoi_avg
        }, epoch)

        if (epoch + 1) % save_freq == 0:
            model_name = model.__class__.__name__
            model_file = os.path.join(model_save_dir,
                                      '%s_%d_weights.pkl' % (model_name, epoch))
            torch.save(model.state_dict(), model_file)
            np.save(os.path.join(model_save_dir,
                                 '%s_%d_lr.pkl' % (model_name, epoch)), lr)
        if (epoch + 1) % lr_adjust_freq == 0:
            lr = lr * 0.6
    logger.close()
def main():
    # Define task.
    parser = argparse.ArgumentParser(
        description='Large-scale deep learning framework.')
    parser.add_argument('--task', metavar='NAME', type=str, required=True,
                        help='specify a task name that is defined in $ROOT/task/')
    arg = parser.parse_args(sys.argv[1:3])
    task = importlib.import_module(arg.task)

    # Get task-specific options and print them.
    task_opt = task.Option()
    opt = task_opt.opt
    print('Options.')
    for k in sorted(vars(opt)):
        if not k.startswith('dst_dir'):
            print('  {0}: {1}'.format(k, opt.__dict__[k]))

    # Build db.
    dst_dir_db = os.path.join(opt.dst_dir, opt.db)
    dst_path_db = os.path.join(dst_dir_db, 'db.pth')
    try:
        db = torch.load(dst_path_db)
        print('DB loaded.')
    except Exception:
        db_module = importlib.import_module(opt.db)
        print('Make train DB.')
        db_train = db_module.make_dataset_train(opt.db_root)
        print('Make val DB.')
        db_val = db_module.make_dataset_val(opt.db_root)
        print('Save DB.')
        db = {'train': db_train, 'val': db_val}
        os.makedirs(dst_dir_db, exist_ok=True)
        torch.save(db, dst_path_db)
    db_train = db['train']
    db_val = db['val']

    # Estimate input statistics.
    dst_path_input_stats = os.path.join(dst_dir_db, 'db_stats.pth')
    try:
        input_stats = torch.load(dst_path_input_stats)
        print('DB input stats loaded.')
    except Exception:
        print('Estimate DB input stats.')
        batch_manager_train = task.BatchManagerTrain(db_train, opt)
        input_stats = batch_manager_train.estimate_input_stats()
        os.makedirs(dst_dir_db, exist_ok=True)
        torch.save(input_stats, dst_path_input_stats)

    # Set destination model directory.
    dst_dir_model = os.path.join(dst_dir_db, opt.arch)
    if opt.start_from:
        assert opt.start_from.endswith('.pth.tar')
        dst_dir_model = opt.start_from[:-8]
    if task_opt.changes:
        dst_dir_model += ',' + task_opt.changes

    # Apply active learning stages to the source model path and the
    # destination model directory/path.
    start_from = opt.start_from
    start_from_db = None
    for stage in range(opt.stage):
        start_from = os.path.join(dst_dir_model,
                                  '{:03d}.pth.tar'.format(opt.num_epoch))
        start_from_db = os.path.join(dst_dir_model, 'db_active.pth')
        assert opt.num_epoch == len(utils.Logger(
            os.path.join(dst_dir_model, 'val.log'))), \
            'Finish training before the next active learning stage.'
        dst_dir_model = os.path.join(
            dst_dir_model,
            '{:03d},sampler={},stage={}'.format(opt.num_epoch, opt.sampler,
                                                stage + 1))
    dst_path_model = os.path.join(dst_dir_model, '{:03d}.pth.tar')
    print('Active learning stage {}.'.format(opt.stage))

    # Initialize model, criterion, optimizer.
    model = task.Model(opt)

    # Create loggers.
    logger_train = utils.Logger(os.path.join(dst_dir_model, 'train.log'))
    logger_val = utils.Logger(os.path.join(dst_dir_model, 'val.log'))
    assert len(logger_train) == len(logger_val)

    # If models were trained before, update information to resume training.
    best_perform = 0
    start_epoch = len(logger_train)
    if start_epoch > 0:
        best_perform = logger_val.max()
        start_from = dst_path_model.format(start_epoch)
        start_from_db = os.path.join(dst_dir_model, 'db_active.pth')
        if start_epoch == opt.num_epoch:
            print('All done.')
            return

    # Fetch previous parameters to resume training.
    dst_path_db_active = os.path.join(dst_dir_model, 'db_active.pth')
    os.makedirs(dst_dir_model, exist_ok=True)
    if start_from:
        print('Load a model from which to resume training.\n'
              '({})'.format(start_from))
        checkpoint = torch.load(start_from)
        model.model.load_state_dict(checkpoint['state_dict'])
        model.optimizer.load_state_dict(checkpoint['optimizer'])
        print('Load active DB.')
        data = torch.load(start_from_db)
        db_active = ActiveDB(data, db_val, task.BatchManagerTrain,
                             task.BatchManagerVal, input_stats, model, opt,
                             dst_dir_model)
        if start_epoch == 0:
            print('Increase active DB labels.')
            db_active.increase_labels()
            print('Save increased active DB.')
            torch.save(db_active.db, dst_path_db_active)
    else:
        print('Make initial active DB.')
        data = {'pairs': [], 'pool': db_train['pairs'], 'log': [[]]}
        db_active = ActiveDB(data, db_val, task.BatchManagerTrain,
                             task.BatchManagerVal, input_stats, model, opt,
                             dst_dir_model)
        print('Save initial active DB.')
        torch.save(db_active.db, dst_path_db_active)

    # Set training db.
    db_train = db_active.db

    # Create batch managers.
    batch_manager_train = task.BatchManagerTrain(db_train, opt)
    batch_manager_train.set_input_stats(input_stats)
    batch_manager_val = task.BatchManagerVal(db_val, opt)
    batch_manager_val.set_input_stats(input_stats)

    # Cache input data if necessary.
    if opt.cache_train_data:
        batch_manager_train.cache_data()
    if opt.cache_val_data:
        batch_manager_val.cache_data()

    # In evaluation mode, evaluate the model and exit.
    if opt.evaluate:
        return val.val(batch_manager_val, model)

    # Do the job.
    cudnn.benchmark = True
    os.makedirs(dst_dir_model, exist_ok=True)
    for epoch in range(start_epoch, opt.num_epoch):
        # Adjust the learning rate before training (step decay).
        learn_rate = opt.learn_rate * (0.1 ** (epoch // opt.decay_epoch))
        for param_group in model.optimizer.param_groups:
            param_group['lr'] = learn_rate

        # Train.
        print('\nStart training at epoch {}.'.format(epoch + 1))
        train.train(batch_manager_train, model, logger_train, epoch + 1)

        # Validate.
        print('\nStart validation at epoch {}.'.format(epoch + 1))
        perform = val.val(batch_manager_val, model, logger_val, epoch + 1)

        # Save the model.
        print('\nSave this model.')
        data = {
            'opt': opt,
            'log_train': logger_train.read(),
            'log_val': logger_val.read(),
            'state_dict': model.model.state_dict(),
            'optimizer': model.optimizer.state_dict()
        }
        torch.save(data, dst_path_model.format(epoch + 1))

        # Remove the previous model.
        if epoch > 0:
            print('Remove the previous model.')
            os.remove(dst_path_model.format(epoch))

        # Back up the best model.
        if perform > best_perform:
            print('Backup this model as the best.')
            shutil.copy(dst_path_model.format(epoch + 1),
                        os.path.join(dst_dir_model, 'best.pth.tar'))
            best_perform = perform
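# utils.Logger is not included in this excerpt. From its usage above (len(),
# max(), read(), one scalar entry per epoch), a minimal file-backed sketch
# could look like this; illustrative only, not the project's implementation.
import os

class Logger:
    def __init__(self, path):
        self.path = path
        self.values = []
        if os.path.isfile(path):
            with open(path) as f:
                self.values = [float(line.split()[-1])
                               for line in f if line.strip()]

    def __len__(self):
        return len(self.values)

    def max(self):
        return max(self.values)

    def read(self):
        return list(self.values)

    def write(self, epoch, value):
        self.values.append(value)
        with open(self.path, 'a') as f:
            f.write('{} {}\n'.format(epoch, value))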
def main(args):
    #===========================================================================
    # Set the file name format
    FILE_NAME_FORMAT = "{0}_{1}_{2:d}_{3:d}_{4}_{5:.4f}_{6:.6f}{7}".format(
        args.model, args.dataset, args.epochs, args.batch_size,
        args.optimizer, args.weight_decay, args.lr, args.flag)
    # Set the results file path
    RESULT_FILE_NAME = FILE_NAME_FORMAT + '_results.pkl'
    RESULT_FILE_PATH = os.path.join(RESULTS_PATH, RESULT_FILE_NAME)
    # Set the checkpoint file path
    CHECKPOINT_FILE_NAME = FILE_NAME_FORMAT + '.ckpt'
    CHECKPOINT_FILE_PATH = os.path.join(CHECKPOINT_PATH, CHECKPOINT_FILE_NAME)
    BEST_CHECKPOINT_FILE_NAME = FILE_NAME_FORMAT + '_best.ckpt'
    BEST_CHECKPOINT_FILE_PATH = os.path.join(CHECKPOINT_PATH,
                                             BEST_CHECKPOINT_FILE_NAME)

    # Set the random seed for reproducibility
    torch.manual_seed(190811)
    torch.cuda.manual_seed(190811)
    torch.cuda.manual_seed_all(190811)
    random.seed(190811)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Step1 ====================================================================
    # Load dataset
    if args.dataset == 'VOC2011':
        voc = VOC2011_Dataloader()
    elif args.dataset == 'VOC2012':
        voc = VOC2012_Dataloader()
    else:
        assert False, "Please select the proper dataset!"
    train_loader = voc.get_train_loader(batch_size=args.batch_size,
                                        num_workers=args.num_workers)
    val_loader = voc.get_val_loader(batch_size=args.batch_size,
                                    num_workers=args.num_workers)
    print('==> DataLoader ready.')

    # Step2 ====================================================================
    # Make the FCN model; each finer-stride model starts from the pretrained
    # weights of the preceding coarser one (FCN-32s -> FCN-16s -> FCN-8s -> ...).
    def load_pretrained(model, prev_name):
        model.load_state_dict(
            torch.load('./model/pretrained/' + prev_name + '_' + args.dataset
                       + '_' + args.optimizer)['model_state_dict'],
            strict=False)
        return model

    if args.model == 'FCN_AlexNet':
        model = FCN_AlexNet()
    elif args.model == 'FCN_VGG16':
        model = FCN_VGG16()
    elif args.model == 'FCN_GoogLeNet':
        model = FCN_GoogLeNet()
    elif args.model == 'FCN_32s':
        model = FCN_32s()
    elif args.model == 'FCN_32s_fixed':
        model = FCN_32s_fixed()
    elif args.model == 'FCN_16s':
        model = load_pretrained(FCN_16s(), 'FCN32s')
    elif args.model == 'FCN_8s':
        model = load_pretrained(FCN_8s(), 'FCN16s')
    elif args.model == 'FCN_4s':
        model = load_pretrained(FCN_4s(), 'FCN8s')
    elif args.model == 'FCN_2s':
        model = load_pretrained(FCN_2s(), 'FCN4s')
    elif args.model == 'FCN_1s':
        model = load_pretrained(FCN_1s(), 'FCN2s')
    elif args.model == 'DeconvNet':
        model = DeconvNet()
    else:
        assert False, "Please select the FCN model"

    # Check DataParallel available
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    # Check CUDA available
    if torch.cuda.is_available():
        model.cuda()
    print('==> Model ready.')

    # Step3 ====================================================================
    # Set loss function and optimizer
    criterion = nn.CrossEntropyLoss()

    # Separate parameters so biases get double learning rate and no weight decay
    normal_parameters = []
    double_parameters = []
    for name, parameter in model.named_parameters():
        if 'bias' in name:
            double_parameters.append(parameter)
        else:
            normal_parameters.append(parameter)

    # Select the optimizer
    if args.optimizer == 'SGD':
        optimizer = optim.SGD([
            {'params': normal_parameters},
            {'params': double_parameters, 'lr': args.lr * 2, 'weight_decay': 0},
        ], lr=args.lr, momentum=0.9, weight_decay=args.weight_decay)
    elif args.optimizer == 'Adam':
        optimizer = optim.Adam([
            {'params': normal_parameters},
            {'params': double_parameters, 'lr': args.lr * 2, 'weight_decay': 0},
        ], lr=args.lr, betas=(0.9, 0.999), weight_decay=args.weight_decay)
    else:
        assert False, "Please select the proper optimizer."
    # Set the learning rate scheduler
    # scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=10,
    #                               verbose=True, threshold=1e-4)
    print('==> Criterion and optimizer ready.')

    # Step4 ====================================================================
    # Train and validate the model
    start_epoch = 0
    best_val_mean_IoU = 0
    if args.resume:
        assert os.path.exists(CHECKPOINT_FILE_PATH), 'No checkpoint file!'
        checkpoint = torch.load(CHECKPOINT_FILE_PATH)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']

    # Save the training information
    result_data = {}
    result_data['model'] = args.model
    result_data['dataset'] = args.dataset
    result_data['target epoch'] = args.epochs
    result_data['batch_size'] = args.batch_size
    result_data['optimizer'] = args.optimizer
    result_data['weight_decay'] = args.weight_decay
    result_data['lr'] = args.lr

    # Initialize the result lists
    train_loss = []
    train_pixel_acc = []
    train_mean_acc = []
    train_mean_IoU = []
    train_freq_IoU = []
    val_loss = []
    val_pixel_acc = []
    val_mean_acc = []
    val_mean_IoU = []
    val_freq_IoU = []

    # Check the directory of the file path
    if not os.path.exists(os.path.dirname(RESULT_FILE_PATH)):
        os.makedirs(os.path.dirname(RESULT_FILE_PATH))
    if not os.path.exists(os.path.dirname(CHECKPOINT_FILE_PATH)):
        os.makedirs(os.path.dirname(CHECKPOINT_FILE_PATH))
    print('==> Train ready.')

    for epoch in range(args.epochs):
        # Start after the checkpoint epoch
        if epoch < start_epoch:
            continue
        print("\n[Epoch: {:3d}/{:3d}]".format(epoch + 1, args.epochs))
        epoch_time = time.time()
        #=======================================================================
        # Train the model
        tloss, tmetric = train(model, train_loader, criterion, optimizer)
        train_loss.append(tloss)
        train_pixel_acc.append(tmetric[0])
        train_mean_acc.append(tmetric[1])
        train_mean_IoU.append(tmetric[2])
        train_freq_IoU.append(tmetric[3])
        # Validate the model
        vloss, vmetric = val(model, val_loader, criterion)
        val_loss.append(vloss)
        val_pixel_acc.append(vmetric[0])
        val_mean_acc.append(vmetric[1])
        val_mean_IoU.append(vmetric[2])
        val_freq_IoU.append(vmetric[3])
        # Update learning rate
        # scheduler.step(vloss)
        #=======================================================================
        current = time.time()

        # Save the current result
        result_data['current epoch'] = epoch
        result_data['train_loss'] = train_loss
        result_data['train_pixel_acc'] = train_pixel_acc
        result_data['train_mean_acc'] = train_mean_acc
        result_data['train_mean_IoU'] = train_mean_IoU
        result_data['train_freq_IoU'] = train_freq_IoU
        result_data['val_loss'] = val_loss
        result_data['val_pixel_acc'] = val_pixel_acc
        result_data['val_mean_acc'] = val_mean_acc
        result_data['val_mean_IoU'] = val_mean_IoU
        result_data['val_freq_IoU'] = val_freq_IoU
        # Save result_data as pkl file
        with open(RESULT_FILE_PATH, 'wb') as pkl_file:
            pickle.dump(result_data, pkl_file,
                        protocol=pickle.HIGHEST_PROTOCOL)

        # Save the best checkpoint
        if vmetric[2] > best_val_mean_IoU:
            best_val_mean_IoU = vmetric[2]
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'best_val_mean_IoU': best_val_mean_IoU,
            }, BEST_CHECKPOINT_FILE_PATH)

        # Save the current checkpoint
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_mean_IoU': vmetric[2]
        }, CHECKPOINT_FILE_PATH)

        # Print the information on the console
        print("model                : {}".format(args.model))
        print("dataset              : {}".format(args.dataset))
        print("batch_size           : {}".format(args.batch_size))
        print("optimizer            : {}".format(args.optimizer))
        print("learning rate        : {:f}".format(optimizer.param_groups[0]['lr']))
        print("weight decay         : {:f}".format(optimizer.param_groups[0]['weight_decay']))
        print("train/val loss       : {:f}/{:f}".format(tloss, vloss))
        print("train/val pixel acc  : {:f}/{:f}".format(tmetric[0], vmetric[0]))
        print("train/val mean acc   : {:f}/{:f}".format(tmetric[1], vmetric[1]))
        print("train/val mean IoU   : {:f}/{:f}".format(tmetric[2], vmetric[2]))
        print("train/val freq IoU   : {:f}/{:f}".format(tmetric[3], vmetric[3]))
        print("epoch time           : {0:.3f} sec".format(current - epoch_time))
        print("Current elapsed time : {0:.3f} sec".format(current - start))

    print('==> Train done.')
    print(' '.join(['Results have been saved at', RESULT_FILE_PATH]))
    print(' '.join(['Checkpoints have been saved at', CHECKPOINT_FILE_PATH]))
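# The four metrics reported above (pixel acc, mean acc, mean IoU, frequency
# weighted IoU) follow the standard definitions from the FCN paper. A minimal
# NumPy sketch computing them from a class confusion matrix; illustrative,
# not necessarily this repo's own implementation.
import numpy as np

def seg_metrics(hist):
    """hist: (C, C) confusion matrix, hist[i, j] = pixels of class i predicted as j."""
    pixel_acc = np.diag(hist).sum() / hist.sum()
    per_class_acc = np.diag(hist) / hist.sum(axis=1)
    mean_acc = np.nanmean(per_class_acc)
    iou = np.diag(hist) / (hist.sum(axis=1) + hist.sum(axis=0) - np.diag(hist))
    mean_iou = np.nanmean(iou)
    freq = hist.sum(axis=1) / hist.sum()          # class pixel frequencies
    freq_iou = (freq[freq > 0] * iou[freq > 0]).sum()
    return pixel_acc, mean_acc, mean_iou, freq_iou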
model = model.to(device)

# == Load Data ==
root = 'D:/data/dogs_cats'
dataset_train = DogCatData(root, mode='train')
dataset_val = DogCatData(root, mode='val')
train_data = DataLoader(dataset_train, shuffle=True, batch_size=32, num_workers=4)
val_data = DataLoader(dataset_val, shuffle=True, batch_size=32, num_workers=4)

# == Optimizer ==
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters())

# == Main Loop ==
max_acc = 0
max_epoch = 1
for epoch in range(max_epoch):
    train(model, train_data, epoch, criterion, optimizer)
    acc = val(model, val_data)
    if acc > max_acc:
        max_acc = acc
        torch.save(model, 'checkpoints/lenet_max.pt')
print('==========Max Acc: {}=========='.format(max_acc))
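# The val helper used above is not shown in this excerpt. A minimal sketch of
# a standard top-1 accuracy loop (assumed behavior, illustrative only;
# `device` is taken from the surrounding script).
import torch

def val(model, val_data):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for img, label in val_data:
            img, label = img.to(device), label.to(device)
            pred = model(img).argmax(dim=1)
            correct += (pred == label).sum().item()
            total += label.size(0)
    return correct / total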
def train(generator, discriminator, train_loader, criterion, optimizer_G,
          optimizer_D, clipping, num_critic, step_counter, validate_noise,
          FILE_NAME_FORMAT):
    generator.train()
    discriminator.train()
    device = next(generator.parameters()).device.index
    losses_G = []
    losses_D = []
    distances = []
    total_iter = len(train_loader)
    for i, (images, _) in enumerate(train_loader):
        real_images = images.cuda(device)
        #=======================================================================
        # For WGAN
        if criterion is None:
            #-------------------------------------------------------------------
            ''' Train Discriminator (Critic) Network '''
            #-------------------------------------------------------------------
            # Make real & fake labels and noise for the generator
            batch_size = real_images.size(0)
            real_label = torch.full((batch_size,), 1, device=device)
            fake_label = torch.full((batch_size,), 0, device=device)
            noise = torch.randn(batch_size, 100, 1, 1, device=device)
            # Empty discriminator's gradients
            discriminator.zero_grad()
            # Generate fake images from noise
            fake_images = generator(noise)
            # Discriminate the images
            output_real = discriminator(real_images)
            output_fake = discriminator(fake_images.detach())
            # Calculate loss
            loss_D = -output_real.mean() + output_fake.mean()
            losses_D.append(loss_D.item())
            # Calculate gradients (backpropagation)
            loss_D.backward()
            # Calculate Earth Mover (EM) distance
            distance = output_real.mean() - output_fake.mean()
            distances.append(distance.item())
            # Update discriminator's parameters
            optimizer_D.step()
            # Clip the discriminator's parameters
            for parameter in discriminator.parameters():
                parameter.data.clamp_(-clipping, clipping)

            #-------------------------------------------------------------------
            ''' Train Generator Network '''
            #-------------------------------------------------------------------
            # Placeholder value for logging; overwritten below whenever the
            # generator is actually updated.
            loss_G = -output_fake.mean()
            # Update the generator once every num_critic iterations
            if step_counter.current_step % num_critic == 0:
                # Empty generator's gradients
                generator.zero_grad()
                # Generate fake images from noise
                fake_images = generator(noise)
                # Discriminate the images
                output_fake = discriminator(fake_images)
                # Calculate loss
                loss_G = -output_fake.mean()
                losses_G.extend([loss_G.item()] * num_critic)
                # Calculate gradients (backpropagation)
                loss_G.backward()
                # Update generator's parameters
                optimizer_G.step()
        #=======================================================================
        # For DCGAN
        else:
            #-------------------------------------------------------------------
            ''' Train Discriminator Network '''
            # maximize log(D(x)) + log(1 - D(G(z)))
            #-------------------------------------------------------------------
            # Make real & fake labels and noise for the generator
            batch_size = real_images.size(0)
            real_label = torch.full((batch_size,), 1, device=device)
            fake_label = torch.full((batch_size,), 0, device=device)
            noise = torch.randn(batch_size, 100, 1, 1, device=device)

            # <For real images> ------------------------------------------------
            # Empty discriminator's gradients
            discriminator.zero_grad()
            # Predict targets (forward propagation)
            output = discriminator(real_images)
            pred_label = torch.sigmoid(output).view(-1)
            # Calculate loss (for real images)
            loss_D_real = criterion(pred_label, real_label)
            # Calculate gradients (backpropagation)
            loss_D_real.backward()

            # <For fake images> ------------------------------------------------
            # Generate fake images from noise
            fake_images = generator(noise)
            # Discriminate the images
            output = discriminator(fake_images.detach())
            pred_label = torch.sigmoid(output).view(-1)
            # Calculate loss (for fake images)
            loss_D_fake = criterion(pred_label, fake_label)
            # Calculate gradients (backpropagation)
            loss_D_fake.backward()
            # Combine both losses (their gradients are already accumulated)
            loss_D = loss_D_real + loss_D_fake
            losses_D.append(loss_D.item())
            # Update discriminator's parameters
            optimizer_D.step()

            #-------------------------------------------------------------------
            ''' Train Generator Network '''
            # maximize log(D(G(z)))
            #-------------------------------------------------------------------
            # Empty generator's gradients
            generator.zero_grad()
            # Discriminate the fake images again (this time without detach)
            output = discriminator(fake_images)
            pred_label = torch.sigmoid(output).view(-1)
            # Calculate loss (fake images scored against the real label)
            loss_G = criterion(pred_label, real_label)
            losses_G.append(loss_G.item())
            # Calculate gradients (backpropagation)
            loss_G.backward()
            # Update generator's parameters
            optimizer_G.step()
            #-------------------------------------------------------------------
            # Calculate Jensen-Shannon divergence
            # Ref: Jensen-Shannon distance = sqrt(Jensen-Shannon divergence)
            target_label = torch.full((batch_size,), 0.5)
            distance = jensenshannon(pred_label.detach().cpu(), target_label) ** 2
            distances.append(distance)
        #=======================================================================
        # Count the step
        step_counter.step()
        # Display current status
        print("[{:5d}/{:5d}]".format(i + 1, total_iter), end='')
        print(" loss_G: {:f} loss_D: {:f} dist: {:f} step: {:d} \r".format(
            loss_G, loss_D, distance, step_counter.current_step), end='')
        # Validate the model
        if (step_counter.current_step % 500 == 0
                or step_counter.current_step == step_counter.objective_step):
            val(generator, validate_noise, step_counter, FILE_NAME_FORMAT)
        # Check the current step
        if step_counter.current_step >= step_counter.objective_step:
            step_counter.exit_signal = True
            break
    #===========================================================================
    return losses_G, losses_D, distances
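# StepCounter is referenced above (current_step, objective_step, step(),
# exit_signal) but not defined in this excerpt. A minimal sketch consistent
# with that usage; illustrative only.
class StepCounter:
    def __init__(self, objective_step):
        self.objective_step = objective_step  # total iteration steps to run
        self.current_step = 0
        self.exit_signal = False  # set by the train loop once the objective is reached

    def step(self):
        self.current_step += 1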
def test_valid():
    val.val(valid, resource_schema)
print("{} model chosen.\n".format(opt.model)) vae = Model(vae_model,z_dim=opt.z_dim) best_loss = float("inf") best_epoch = -1 for epoch in range(opt.epochs): for m in metrics: m.reset() print("====== Epoch {} ======".format(epoch)) train(epoch, vae, t_generator, compute_vae, metrics, (models_folder, maps_folder), opt, train_logger) vae_loss,log_p_x = val(epoch, vae, v_generator, compute_vae, metrics, (models_folder, maps_folder), opt, val_logger) is_best = False if vae_loss < best_loss: best_loss = vae_loss best_epoch = epoch is_best = True internal_state = { 'model':opt.model, 'dataset': opt.dataset, 'z_dim': opt.z_dim, 'current_epoch': epoch, 'best_epoch': best_epoch, 'best_loss': best_loss, 'model_vae_state_dict': vae.vae.state_dict(),
def test_invalid():
    with pytest.raises(jsonschema.ValidationError):
        val.val(invalid, resource_schema)
def _agent_sampler(self, mode):
    # Define functions.
    if mode == 'reg':
        def _gen_agent_target(posteriors, targets):
            return [
                0.5 - posterior[0, targets[i][0]]
                for i, posterior in enumerate(posteriors)
            ]

        def is_best(eval_best, eval_current):
            return eval_best > eval_current

        num_out_dim = 1
        type_fun = float
        criterion = nn.MSELoss()
        evaluator = metric.mse
        learn_rate = 0.001
    elif mode == 'cls':
        def _gen_agent_target(posteriors, targets):
            return [
                posterior[0, targets[i][0]] != posterior.max()
                for i, posterior in enumerate(posteriors)
            ]

        def ap(outputs, targets):
            return metric.ap(F.softmax(outputs)[:, [1]].data, targets)

        def is_best(eval_best, eval_current):
            return eval_best < eval_current

        num_out_dim = 2
        type_fun = int
        criterion = nn.CrossEntropyLoss()
        evaluator = ap
        learn_rate = 0.01

    def _make_agent_dataset(db, db_indices, agent_targets, targets):
        pairs = []
        for i, agent_target in enumerate(agent_targets):
            image, target = db['pairs'][db_indices[i]]
            assert target == targets[i][0]
            pairs.append((image, type_fun(agent_target)))
        return {'pairs': pairs}

    # Predict class posteriors to compute agent targets.
    posteriors_train, targets_train, db_indices_train = self._compute_posteriors(
        self._db['pairs'], self._model)
    posteriors_val, targets_val, db_indices_val = self._compute_posteriors(
        self._db_val['pairs'], self._model)

    # Generate agent targets.
    agent_targets_train = _gen_agent_target(posteriors_train, targets_train)
    agent_targets_val = _gen_agent_target(posteriors_val, targets_val)

    # Create input-target pairs to learn an agent.
    agent_db_train = _make_agent_dataset(self._db, db_indices_train,
                                         agent_targets_train, targets_train)
    agent_db_val = _make_agent_dataset(self._db_val, db_indices_val,
                                       agent_targets_val, targets_val)

    # Create batch managers to learn an agent.
    batch_manager_train = self._BatchManagerTrain(agent_db_train, self._opt,
                                                  self._input_stats)
    batch_manager_train._evaluator = evaluator
    batch_manager_val = self._BatchManagerVal(agent_db_val, self._opt,
                                              self._input_stats)
    batch_manager_val._evaluator = evaluator

    # Cache input data if necessary.
    if self._opt.cache_train_data:
        batch_manager_train.cache_data()
    if self._opt.cache_val_data:
        batch_manager_val.cache_data()

    # Create loggers.
    logger_train = utils.Logger(
        os.path.join(self._dst_dir_agent, 'agent-train.log'))
    logger_val = utils.Logger(
        os.path.join(self._dst_dir_agent, 'agent-val.log'))
    assert len(logger_train) == 0 and len(logger_val) == 0

    # Initialize an agent from the model.
    model = deepcopy(self._model.model.module)
    model.fc = nn.Linear(model.fc.in_features, num_out_dim)
    model = torch.nn.DataParallel(model)
    model = model.cuda()
    optimizer = torch.optim.SGD(model.parameters(), self._opt.learn_rate,
                                momentum=self._opt.momentum,
                                weight_decay=self._opt.weight_decay)

    class Model(object):
        def __init__(self):
            self.model = model
            self.criterion = criterion.cuda()
            self.optimizer = optimizer

    agent = Model()

    # Learn the agent.
    best_perform = 0
    for param_group in agent.optimizer.param_groups:
        param_group['lr'] = learn_rate
    for epoch in range(3):
        print('\nStart agent training at epoch {}.'.format(epoch + 1))
        train.train(batch_manager_train, agent, logger_train, epoch + 1)
        print('\nStart agent validation at epoch {}.'.format(epoch + 1))
        perform = val.val(batch_manager_val, agent, logger_val, epoch + 1)
        if is_best(best_perform, perform):
            best_perform = perform
            best_agent = deepcopy(agent)

    # Predict uncertainty with the agent over the unlabeled pool.
    posteriors, _, db_indices = self._compute_posteriors(
        self._db['pool'], best_agent)

    # Compute uncertainties.
    uncertainties = []
    for i, posterior in enumerate(posteriors):
        uncertainties.append(posterior[0, 1])
    # Select the most uncertain samples.
    _, indices = torch.sort(torch.Tensor(uncertainties), descending=True)
    return torch.LongTensor(db_indices)[indices[:self._opt.sampling_size]]
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
):
    logger = logging.getLogger("rcnn.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    max_iou = 0
    for iteration, (images, targets, _, __) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration
        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # Reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)
        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 100 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                )
            )
        if iteration % checkpoint_period == 0:
            # checkpointer.save("model_{:07d}".format(iteration), **arguments)
            MODEL_PATH = os.path.join(sys.path[0], 'data', 'output', 'model')
            if not os.path.exists(MODEL_PATH):
                os.makedirs(MODEL_PATH)
            torch.save(model, os.path.join(MODEL_PATH, 'last.pkl'))
            print('Save !')
            val()
        if iteration == max_iter:
            # checkpointer.save("model_final", **arguments)
            pass

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info(
        "Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / max_iter
        )
    )
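# reduce_loss_dict follows the maskrcnn-benchmark-style utilities: it
# averages each loss across distributed workers so the logged values agree on
# every rank. A minimal sketch with torch.distributed; illustrative, the
# project's actual helper may differ in details.
import torch
import torch.distributed as dist

def reduce_loss_dict(loss_dict):
    world_size = dist.get_world_size() if dist.is_initialized() else 1
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        names = sorted(loss_dict.keys())   # fixed key order across ranks
        values = torch.stack([loss_dict[k] for k in names])
        dist.reduce(values, dst=0)         # sum onto rank 0
        if dist.get_rank() == 0:
            values /= world_size           # average for logging
        return {k: v for k, v in zip(names, values)}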
    # (Fragment: continues a resume/pretrain branch from earlier in the file.)
    model_dict = model.state_dict()
    # Skip these layers when loading a pretrained checkpoint
    passed_dict = ['conv9.weight', 'conv10.weight', 'conv11.weight']
    new_state_dict = {k: v for k, v in checkpoint['state_dict'].items()
                      if k not in passed_dict}
    model_dict.update(new_state_dict)
    model.load_state_dict(model_dict)
else:
    model.load_state_dict(checkpoint['state_dict'])
opt.begin_epoch = checkpoint['epoch']
model = model.to(opt.device)
if not opt.no_train and not opt.pretrained:
    optimizer.load_state_dict(checkpoint['optimizer'])
    best_mAP = checkpoint["best_mAP"]

########################################
#          Train, Val, Test            #
########################################
if opt.test:
    test(model, test_dataloader, opt.begin_epoch, opt)
else:
    for epoch in range(opt.begin_epoch, opt.num_epochs + 1):
        if not opt.no_train:
            print("\n---- Training Model ----")
            train(model, optimizer, train_dataloader, epoch, opt, train_logger,
                  best_mAP=best_mAP)
        if not opt.no_val and (epoch + 1) % opt.val_interval == 0:
            print("\n---- Evaluating Model ----")
            best_mAP = val(model, optimizer, val_dataloader, epoch, opt,
                           val_logger, best_mAP=best_mAP)
            # (Fragment: inner training loop; `iteration` and `epoch` come
            # from the enclosing loops.)
            sys.exit()
        if iteration % DC.save_iter == 0:
            t.save(model.state_dict(),
                   'ResNet152-iter' + str(iteration) + '.pth')
        # Save on demand when a sentinel file named SAVENOW appears
        if os.path.isfile('SAVENOW'):
            t.save(model.state_dict(),
                   'ResNet152-iter' + str(iteration) + '.pth')
            os.remove('SAVENOW')
        if DC.val_in_train and iteration % DC.val_iter == 0:
            # print('model.training: ', model.training)
            time1 = time.time()
            val_out = val.val(True, model, val_transform, val_data,
                              val_dataloader)
            model.train()
            val_elps_time = time.time() - time1
            print('Validate now, ',
                  ' epoch:', epoch + 1,
                  ' iter:', iteration,
                  ' avg loss: {:.8f}'.format(val_out[0]),
                  ' accuracy:({}/{}) {:.4f}% '.format(val_out[1], val_out[2],
                                                      val_out[3]),
                  ' time: {:.3f}'.format(val_elps_time))
    if epoch + 1 == DC.max_epoch:
        t.save(model.state_dict(), 'ResNet152-iter' + str(iteration) + '.pth')
def test_fn_getazs_valid(instance):
    val.val(instance, basic_types_schema,
            definition="#/definitions/functions/Fn::GetAZs")


def test_fn_join_valid(instance):
    val.val(instance, basic_types_schema,
            definition="#/definitions/functions/Fn::Join")


def test_string_function_valid(instance, definition):
    val.val(instance, basic_types_schema, definition=definition)


def test_lenientISO8601_invalid(instance):
    with pytest.raises(jsonschema.ValidationError):
        val.val(instance, basic_types_schema,
                definition="#/definitions/timestamp")


def test_fn_base64_invalid(instance):
    with pytest.raises(jsonschema.ValidationError):
        val.val(instance, basic_types_schema,
                definition="#/definitions/functions/Fn::Base64")


def test_fn_if_valid(instance):
    val.val(instance, basic_types_schema,
            definition="#/definitions/condition_functions/Fn::If")
def main(args):
    #===========================================================================
    # Set the file name format
    FILE_NAME_FORMAT = "{0}_{1}_{2:d}_{3:d}_{4:d}_{5:f}{6}".format(
        args.model, args.dataset, args.epochs, args.obj_step, args.batch_size,
        args.lr, args.flag)
    # Set the results file path
    RESULT_FILE_NAME = FILE_NAME_FORMAT + '_results.pkl'
    RESULT_FILE_PATH = os.path.join(RESULTS_PATH, RESULT_FILE_NAME)
    # Set the checkpoint file path
    CHECKPOINT_FILE_NAME = FILE_NAME_FORMAT + '.ckpt'
    CHECKPOINT_FILE_PATH = os.path.join(CHECKPOINT_PATH, CHECKPOINT_FILE_NAME)
    BEST_CHECKPOINT_FILE_NAME = FILE_NAME_FORMAT + '_best.ckpt'
    BEST_CHECKPOINT_FILE_PATH = os.path.join(CHECKPOINT_PATH,
                                             BEST_CHECKPOINT_FILE_NAME)
    # Set the random seed for reproducibility
    random.seed(190811)
    torch.manual_seed(190811)
    torch.cuda.manual_seed_all(190811)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Step1 ====================================================================
    # Load dataset
    if args.dataset == 'CelebA':
        dataloader = CelebA_Dataloader()
    else:
        assert False, "Please select the proper dataset."
    train_loader = dataloader.get_train_loader(batch_size=args.batch_size,
                                               num_workers=args.num_workers)
    print('==> DataLoader ready.')

    # Step2 ====================================================================
    # Make the model
    if args.model in ['WGAN', 'DCGAN']:
        generator = Generator(BN=True)
        discriminator = Discriminator(BN=True)
    elif args.model in ['WGAN_noBN', 'DCGAN_noBN']:
        generator = Generator(BN=False)
        discriminator = Discriminator(BN=False)
    else:
        assert False, "Please select the proper model."
    # Check DataParallel available
    if torch.cuda.device_count() > 1:
        generator = nn.DataParallel(generator)
        discriminator = nn.DataParallel(discriminator)
    # Check CUDA available
    if torch.cuda.is_available():
        generator.cuda()
        discriminator.cuda()
    print('==> Model ready.')

    # Step3 ====================================================================
    # Set loss function and optimizer. WGAN has no fixed criterion (the critic
    # loss is built inside train()); DCGAN uses BCE.
    if args.model in ['DCGAN', 'DCGAN_noBN']:
        criterion = nn.BCELoss()
    else:
        criterion = None
    optimizer_G = torch.optim.RMSprop(generator.parameters(), lr=args.lr)
    optimizer_D = torch.optim.RMSprop(discriminator.parameters(), lr=args.lr)
    step_counter = StepCounter(args.obj_step)
    print('==> Criterion and optimizer ready.')

    # Step4 ====================================================================
    # Train and validate the model
    start_epoch = 0
    best_metric = float("inf")
    validate_noise = torch.randn(args.batch_size, 100, 1, 1)
    # Initialize the result lists
    train_loss_G = []
    train_loss_D = []
    train_distance = []

    if args.resume:
        assert os.path.exists(CHECKPOINT_FILE_PATH), 'No checkpoint file!'
        checkpoint = torch.load(CHECKPOINT_FILE_PATH)
        generator.load_state_dict(checkpoint['generator_state_dict'])
        discriminator.load_state_dict(checkpoint['discriminator_state_dict'])
        optimizer_G.load_state_dict(checkpoint['optimizer_G_state_dict'])
        optimizer_D.load_state_dict(checkpoint['optimizer_D_state_dict'])
        start_epoch = checkpoint['epoch']
        step_counter.current_step = checkpoint['current_step']
        train_loss_G = checkpoint['train_loss_G']
        train_loss_D = checkpoint['train_loss_D']
        train_distance = checkpoint['train_distance']
        best_metric = checkpoint['best_metric']

    # Save the training information
    result_data = {}
    result_data['model'] = args.model
    result_data['dataset'] = args.dataset
    result_data['target_epoch'] = args.epochs
    result_data['batch_size'] = args.batch_size

    # Check the directory of the file path
    if not os.path.exists(os.path.dirname(RESULT_FILE_PATH)):
        os.makedirs(os.path.dirname(RESULT_FILE_PATH))
    if not os.path.exists(os.path.dirname(CHECKPOINT_FILE_PATH)):
        os.makedirs(os.path.dirname(CHECKPOINT_FILE_PATH))
    print('==> Train ready.')

    # Validate before training (step 0)
    val(generator, validate_noise, step_counter, FILE_NAME_FORMAT)

    for epoch in range(args.epochs):
        # Start after the checkpoint epoch
        if epoch < start_epoch:
            continue
        print("\n[Epoch: {:3d}/{:3d}]".format(epoch + 1, args.epochs))
        epoch_time = time.time()
        #=======================================================================
        # Train the model (and validate it periodically)
        tloss_G, tloss_D, tdist = train(generator, discriminator, train_loader,
                                        criterion, optimizer_G, optimizer_D,
                                        args.clipping, args.num_critic,
                                        step_counter, validate_noise,
                                        FILE_NAME_FORMAT)
        train_loss_G.extend(tloss_G)
        train_loss_D.extend(tloss_D)
        train_distance.extend(tdist)
        #=======================================================================
        current = time.time()

        # Calculate average losses
        avg_loss_G = sum(tloss_G) / len(tloss_G)
        avg_loss_D = sum(tloss_D) / len(tloss_D)
        avg_distance = sum(tdist) / len(tdist)

        # Save the current result
        result_data['current_epoch'] = epoch
        result_data['train_loss_G'] = train_loss_G
        result_data['train_loss_D'] = train_loss_D
        result_data['train_distance'] = train_distance
        # Save result_data as pkl file
        with open(RESULT_FILE_PATH, 'wb') as pkl_file:
            pickle.dump(result_data, pkl_file,
                        protocol=pickle.HIGHEST_PROTOCOL)

        # A separate best-checkpoint save (keyed on avg_distance) is left
        # disabled in this code; only the rolling checkpoint below is kept.
        # Save the current checkpoint
        torch.save(
            {
                'epoch': epoch + 1,
                'generator_state_dict': generator.state_dict(),
                'discriminator_state_dict': discriminator.state_dict(),
                'optimizer_G_state_dict': optimizer_G.state_dict(),
                'optimizer_D_state_dict': optimizer_D.state_dict(),
                'current_step': step_counter.current_step,
                'train_loss_G': train_loss_G,
                'train_loss_D': train_loss_D,
                'train_distance': train_distance,
                'best_metric': best_metric,
            }, CHECKPOINT_FILE_PATH)

        # Print the information on the console
        print("model                : {}".format(args.model))
        print("dataset              : {}".format(args.dataset))
        print("batch_size           : {}".format(args.batch_size))
        print("current step         : {:d}".format(step_counter.current_step))
        print("current lrate        : {:f}".format(args.lr))
        print("gen/disc loss        : {:f}/{:f}".format(avg_loss_G, avg_loss_D))
        print("distance metric      : {:f}".format(avg_distance))
        print("epoch time           : {0:.3f} sec".format(current - epoch_time))
        print("Current elapsed time : {0:.3f} sec".format(current - start))

        # Stop once the objective number of iteration steps has been reached
        if step_counter.exit_signal:
            break

    print('==> Train done.')
    print(' '.join(['Results have been saved at', RESULT_FILE_PATH]))
    print(' '.join(['Checkpoints have been saved at', CHECKPOINT_FILE_PATH]))
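# The val routine called with (generator, noise, step_counter, name) above is
# not shown. A common pattern is to render the fixed noise batch into an image
# grid for visual inspection; a minimal sketch with torchvision (assumed
# behavior and file naming, illustrative only).
import torch
from torchvision.utils import save_image

def val(generator, validate_noise, step_counter, file_name_format):
    generator.eval()
    with torch.no_grad():
        fake = generator(validate_noise.cuda()).cpu()
    save_image(fake,
               '{}_step{:07d}.png'.format(file_name_format,
                                          step_counter.current_step),
               normalize=True)
    generator.train()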
def main():
    parser = argparse.ArgumentParser(description='SegTransformer training')
    parser.add_argument('--config', type=str, required=True)
    args = parser.parse_args()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    config = load_config_yaml(args.config)
    data_config = load_config_yaml(config['data_config'])

    now = datetime.datetime.now()
    date_time = now.strftime("%Y-%m-%d-%H-%M")
    os.makedirs(config['logging_dir'], exist_ok=True)
    logging_path = os.path.join(config['logging_dir'],
                                f'logging_train_{date_time}.txt')
    logger = create_logger(logging_path, stdout=False)

    ###################################################################################
    # construct net
    ###################################################################################
    n_channel = data_config['dataset']['2d']['n_slice']
    n_class = len(data_config['dataset']['3d']['roi_names'])
    if data_config['dataset']['3d']['with_issue_air_mask']:
        n_class += 2
    start_channel = int(config['start_channel'])
    logger.info(f'create model with n_channel={n_channel}, '
                f'start_channel={start_channel}, n_class={n_class}')
    model = SegTransformer(n_channel=n_channel,
                           start_channel=start_channel,
                           n_class=n_class,
                           deep_supervision=config["deep_supervision"]).to(device)
    logger.info(f"model_dir: {config['ckpt_dir']}")

    ###################################################################################
    # criterion, optimizer, scheduler
    ###################################################################################
    criterion = Criterion(config)
    optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'],
                                  weight_decay=config['weight_decay'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=config['step_size'])
    if config['deep_supervision']:
        logger.info('Train model using deep supervision')
    else:
        logger.info('Train model without deep supervision')

    ###################################################################################
    # SummaryWriter
    ###################################################################################
    logger.info("Creating writer")
    writer = SummaryWriter(comment=f"LR_{config['lr']}_BS_{config['n_epoch']}")

    ###################################################################################
    # train setup
    ###################################################################################
    global_step = 0
    best_loss = np.inf
    epoch_start = 0

    ###################################################################################
    # load previous model
    ###################################################################################
    if config['load_checkpoint']:
        logger.info(f'Loading model from '
                    f'{os.path.join(config["ckpt_dir"], config["ckpt_fn"])}...')
        model, optimizer, scheduler, epoch_start, global_step = load_checkpoint(
            model, optimizer, scheduler, config['ckpt_dir'], config['ckpt_fn'],
            device)
    elif config['load_checkpoint_encoder']:
        logger.info(f'Loading encoder from '
                    f'{os.path.join(config["ckpt_dir"], config["ckpt_fn"])}...')
        model.encoder = load_checkpoint_encoder(model.encoder,
                                                ckpt_dir=config['ckpt_dir'],
                                                ckpt_fn=config['ckpt_fn'],
                                                device=device)
        if config['freeze_encoder']:
            logger.info('Freeze encoder')
            freeze(model.encoder)
    elif config['load_checkpoint_decoder']:
        logger.info(f'Loading decoder from '
                    f'{os.path.join(config["ckpt_dir"], config["ckpt_fn"])}...')
        model.decoder = load_checkpoint_decoder(model.decoder,
                                                ckpt_dir=config['ckpt_dir'],
                                                ckpt_fn=config['ckpt_fn'],
                                                device=device)
        if config['freeze_decoder']:
            logger.info('Freeze decoder')
            freeze(model.decoder)

    ###################################################################################
    # parallel model and data
    ###################################################################################
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
        model = torch.nn.DataParallel(model)

    ###################################################################################
    # Dataset
    ###################################################################################
    dataloader_3d = create_loader_3d(data_config, 'train')

    ###################################################################################
    # train
    ###################################################################################
    logger.info(f'Starting training from epoch: {epoch_start}')
    for epoch in range(epoch_start, config['n_epoch']):
        logger.info(f"Epoch: {epoch}/{config['n_epoch']}")
        epoch_loss = 0
        epoch_loss_focal = 0
        epoch_loss_dice = 0
        n_batch_3d = len(dataloader_3d)
        with tqdm(total=n_batch_3d,
                  desc=f"Epoch {epoch + 1}/{config['n_epoch']}",
                  unit='batch') as pbar:
            for batch_3d in dataloader_3d:
                # Each 3D volume is re-sliced into a 2D loader
                dataloader_2d = create_loader_2d(batch_3d, data_config, 'train')
                n_batch_2d = len(dataloader_2d)
                for idx, batch_2d in enumerate(dataloader_2d):
                    img = batch_2d['img'].to(
                        device=device, dtype=torch.float32)  # [N, n_channel, H, W]
                    mask_gt = batch_2d['mask'].to(
                        device=device, dtype=torch.float32)  # [N, H, W]
                    mask_pred = model(img)
                    mask_flag = batch_2d['mask_flag'].to(device=device,
                                                         dtype=torch.float32)
                    loss, loss_dict = criterion(pred=mask_pred,
                                                target=mask_gt,
                                                target_roi_weight=mask_flag)
                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_value_(model.parameters(), 0.01)
                    optimizer.step()
                    global_step += 1

                    loss_scalar = loss_dict["loss"]
                    loss_focal_scalar = loss_dict["focal_loss"]
                    loss_dice_scalar = loss_dict["dice_loss"]
                    epoch_loss += loss_scalar
                    epoch_loss_focal += loss_focal_scalar
                    epoch_loss_dice += loss_dice_scalar
                    pbar.set_postfix(**{
                        'loss (batch)': loss_scalar,
                        'loss_focal': loss_focal_scalar,
                        'loss_dice': loss_dice_scalar,
                        'global_step': global_step
                    })

                    if (global_step + 1) % config['write_summary_loss_batch_step'] == 0:
                        logger.info(
                            f"\tBatch: {idx}/{n_batch_2d}, Loss: {loss_scalar}, "
                            f"Focal_loss: {loss_focal_scalar}, "
                            f"Dice_loss: {loss_dice_scalar}")
                        writer.add_scalar('Loss/train', loss_scalar, global_step)
                        writer.add_scalar('Loss/train_focal', loss_focal_scalar,
                                          global_step)
                        writer.add_scalar('Loss/train_dice', loss_dice_scalar,
                                          global_step)
                    if (global_step + 1) % config['write_summary_2d_batch_step'] == 0:
                        writer.add_images(
                            'train/images',
                            torch.unsqueeze(img[:, n_channel // 2], 1),
                            global_step)
                        writer.add_images(
                            'train/gt_masks',
                            torch.sum(mask_gt, dim=1, keepdim=True),
                            global_step)
                        writer.add_images(
                            'train/pred_masks',
                            torch.sum(mask_pred[0] > 0, dim=1, keepdim=True) >= 1,
                            global_step)
                        writer.add_images(
                            'train/pred_masks_raw',
                            torch.sum(mask_pred[0], dim=1, keepdim=True),
                            global_step)
                pbar.update()
        scheduler.step()

        # log epoch loss
        if (epoch + 1) % config['logging_epoch_step'] == 0:
            writer.add_scalar('lr', optimizer.param_groups[0]['lr'], epoch)
            writer.add_scalar('Loss_epoch/train', epoch_loss, epoch)
            writer.add_scalar('Loss_epoch/train_focal', epoch_loss_focal, epoch)
writer.add_scalar('Loss_epoch/train_dice', epoch_loss_dice, epoch) logger.info( f"Epoch: {epoch}/{config['n_epoch']}, Train Loss: {epoch_loss}, Train Loss BCE: {epoch_loss_focal}, Train Loss DSC: {epoch_loss_dice}" ) # validation and save model if (epoch + 1) % config['val_model_epoch_step'] == 0: val_loss, val_focal_loss, val_dice_loss = val( model, criterion, data_config, n_channel, logger, writer, global_step, device) writer.add_scalar('Loss_epoch/val', val_loss, epoch) writer.add_scalar('Loss_epoch/val_focal', val_focal_loss, epoch) writer.add_scalar('Loss_epoch/val_dice', val_dice_loss, epoch) logger.info( f"Epoch: {epoch}/{config['n_epoch']}, Validation Loss: {val_loss}, Validation Loss Focal: {val_focal_loss}, Validation Loss Dice: {val_dice_loss}" ) os.makedirs(config['ckpt_dir'], exist_ok=True) save_checkpoint(model=model, optimizer=optimizer, scheduler=scheduler, epoch=epoch, global_step=global_step, ckpt_dir=config['ckpt_dir'], ckpt_fn=f'ckpt_{date_time}_Epoch_{epoch}.ckpt') if best_loss > val_loss: best_loss = val_loss for filename in glob.glob( os.path.join(config['ckpt_dir'], "best_ckpt*")): os.remove(filename) save_checkpoint( model=model, optimizer=optimizer, scheduler=scheduler, epoch=epoch, global_step=global_step, ckpt_dir=config['ckpt_dir'], ckpt_fn=f'best_ckpt_{date_time}_epoch_{epoch}.ckpt') if config['freeze_encoder'] and config[ 'unfreeze_encoder_epoch'] is not None: if epoch >= int(config['unfreeze_encoder_epoch']): unfreeze(model.module.encoder) config['unfreeze_encoder_epoch'] = None logger.info(f'Unfreeze encoder at {epoch}') if config['freeze_decoder'] and config[ 'unfreeze_decoder_epoch'] is not None: if epoch >= int(config['unfreeze_decoder_epoch']): unfreeze(model.module.decoder) config['unfreeze_decoder_epoch'] = None logger.info(f'Unfreeze decoder at {epoch}') writer.close()
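# The training loop above calls freeze()/unfreeze() helpers that are defined
# elsewhere in the project. A minimal sketch, assuming they simply toggle
# requires_grad on a submodule's parameters; the project's actual helpers may
# differ:
import torch.nn as nn


def freeze(module: nn.Module) -> None:
    # Stop gradient updates for `module` (e.g. a pretrained encoder).
    for param in module.parameters():
        param.requires_grad = False


def unfreeze(module: nn.Module) -> None:
    # Re-enable gradient updates (e.g. after the configured warm-up epoch).
    for param in module.parameters():
        param.requires_grad = True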
import os
import pickle
import random
import time

import torch
import torch.nn as nn
import torch.optim as optim

# Project-local helpers assumed to be importable from this repo:
# WMT2014_Dataloader, Multi30k_Dataloader, Transformer,
# Criterion_LabelSmoothing, Warmup_scheduler, train, val, plus the
# module-level constants RESULTS_PATH and CHECKPOINT_PATH.


def main(args):
    #===========================================================================
    # Set the file name format
    FILE_NAME_FORMAT = "{0}_{1}_{2:d}_{3:d}_{4:d}_{5:d}_{6:d}_{7:d}{8}".format(
        args.model, args.dataset, args.batch_size, args.dim_model,
        args.dim_ff, args.dim_KV, args.num_layers, args.num_heads, args.flag)
    # Set the results file path
    RESULT_FILE_NAME = FILE_NAME_FORMAT + '_results.pkl'
    RESULT_FILE_PATH = os.path.join(RESULTS_PATH, RESULT_FILE_NAME)
    # Set the checkpoint file path
    CHECKPOINT_FILE_NAME = FILE_NAME_FORMAT + '.ckpt'
    CHECKPOINT_FILE_PATH = os.path.join(CHECKPOINT_PATH, CHECKPOINT_FILE_NAME)
    BEST_CHECKPOINT_FILE_NAME = FILE_NAME_FORMAT + '_best.ckpt'
    BEST_CHECKPOINT_FILE_PATH = os.path.join(CHECKPOINT_PATH,
                                             BEST_CHECKPOINT_FILE_NAME)

    # Set the random seed the same for reproducibility
    random.seed(190811)
    torch.manual_seed(190811)
    torch.cuda.manual_seed_all(190811)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Setting constants
    dim_model = args.dim_model    # Dimension of model (= embedding size)
    dim_ff = args.dim_ff          # Dimension of the feed-forward layer
    dim_K = args.dim_KV           # Dimension of Key (= Query)
    dim_V = args.dim_KV           # Dimension of Value
    num_layers = args.num_layers  # Number of encoder/decoder layers
    num_heads = args.num_heads    # Number of heads in multi-head attention
    dropout_p = args.dropout_p    # Dropout probability
    warmup_steps = 4000           # Learning-rate warm-up steps
    label_smoothing_eps = 0.1     # Label smoothing epsilon
    max_src_len = 46              # Maximum source input length (Multi30k)
    max_trg_len = 45              # Maximum target input length (Multi30k)

    # Step1 ====================================================================
    # Load dataset
    if args.dataset == 'WMT2014':
        dataloader = WMT2014_Dataloader()
    elif args.dataset == 'Multi30k':
        dataloader = Multi30k_Dataloader()
    else:
        assert False, "Please select the proper dataset."
    train_loader = dataloader.get_train_loader(batch_size=args.batch_size)
    val_loader = dataloader.get_val_loader(batch_size=args.batch_size)
    print('==> DataLoader ready.')

    # Step2 ====================================================================
    # Make the translation model
    if args.model == 'Transformer':
        src_vocab_size = len(dataloader.SRC.vocab)
        trg_vocab_size = len(dataloader.TRG.vocab)
        model = Transformer(src_vocab_size, trg_vocab_size, max_src_len,
                            max_trg_len, dim_model, dim_K, num_layers,
                            num_heads, dim_ff, dropout_p)
    else:
        assert False, "Please select the proper model."

    # Check DataParallel available
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    # Check CUDA available
    if torch.cuda.is_available():
        model.cuda()
    print('==> Model ready.')

    # Step3 ====================================================================
    # Set the loss function and optimizer (+ lr scheduler)
    if args.smoothing:
        criterion = Criterion_LabelSmoothing(vocab_size=trg_vocab_size,
                                             padding_idx=dataloader.pad_idx,
                                             smoothing_eps=label_smoothing_eps)
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=dataloader.pad_idx)
    optimizer = optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9)
    lr_scheduler = Warmup_scheduler(optimizer, dim_model, warmup_steps)
    print('==> Criterion and optimizer ready.')

    # Step4 ====================================================================
    # Train and validate the model
    start_epoch = 0
    best_val_metric = 0
    if args.resume:
        assert os.path.exists(CHECKPOINT_FILE_PATH), 'No checkpoint file!'
        checkpoint = torch.load(CHECKPOINT_FILE_PATH)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']
        lr_scheduler.current_step = checkpoint['current_step']
        best_val_metric = checkpoint['best_val_metric']

    # Save the training information
    result_data = {}
    result_data['model'] = args.model
    result_data['dataset'] = args.dataset
    result_data['target epoch'] = args.epochs
    result_data['batch_size'] = args.batch_size

    # Initialize the result lists
    train_loss = []
    train_ppl = []
    train_bleu = []
    val_loss = []
    val_ppl = []
    val_bleu = []

    # Check the directory of the file path
    if not os.path.exists(os.path.dirname(RESULT_FILE_PATH)):
        os.makedirs(os.path.dirname(RESULT_FILE_PATH))
    if not os.path.exists(os.path.dirname(CHECKPOINT_FILE_PATH)):
        os.makedirs(os.path.dirname(CHECKPOINT_FILE_PATH))
    print('==> Train ready.')

    start = time.time()  # overall training start time for the elapsed-time log
    for epoch in range(args.epochs):
        # start after the checkpoint epoch
        if epoch < start_epoch:
            continue
        print("\n[Epoch: {:3d}/{:3d}]".format(epoch + 1, args.epochs))
        epoch_time = time.time()
        #=======================================================================
        # train the model
        tloss, tmetric = train(model, train_loader, criterion, optimizer,
                               lr_scheduler, dataloader)
        train_loss.append(tloss)
        train_ppl.append(tmetric[0])
        train_bleu.append(tmetric[1])
        # validate the model
        vloss, vmetric = val(model, val_loader, criterion, dataloader)
        val_loss.append(vloss)
        val_ppl.append(vmetric[0])
        val_bleu.append(vmetric[1])
        #=======================================================================
        current = time.time()

        # Save the current result
        result_data['current epoch'] = epoch
        result_data['train_loss'] = train_loss
        result_data['train_ppl'] = train_ppl
        result_data['train_bleu'] = train_bleu
        result_data['val_loss'] = val_loss
        result_data['val_ppl'] = val_ppl
        result_data['val_bleu'] = val_bleu
        # Save result_data as a pkl file
        with open(RESULT_FILE_PATH, 'wb') as pkl_file:
            pickle.dump(result_data, pkl_file,
                        protocol=pickle.HIGHEST_PROTOCOL)

        # Save the best checkpoint (highest validation BLEU)
        if vmetric[1] > best_val_metric:
            best_val_metric = vmetric[1]
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'current_step': lr_scheduler.current_step,
                'best_val_metric': best_val_metric,
            }, BEST_CHECKPOINT_FILE_PATH)

        # Save the current checkpoint
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'current_step': lr_scheduler.current_step,
            'val_metric': vmetric[0],
            'best_val_metric': best_val_metric,  # required by the --resume path
        }, CHECKPOINT_FILE_PATH)

        # Print the information on the console
        print("model                : {}".format(args.model))
        print("dataset              : {}".format(args.dataset))
        print("batch_size           : {}".format(args.batch_size))
        print("current step         : {:d}".format(lr_scheduler.current_step))
        print("current lrate        : {:f}".format(
            optimizer.param_groups[0]['lr']))
        print("train/val loss       : {:f}/{:f}".format(tloss, vloss))
        print("train/val PPL        : {:f}/{:f}".format(tmetric[0], vmetric[0]))
        print("train/val BLEU       : {:f}/{:f}".format(tmetric[1], vmetric[1]))
        print("epoch time           : {0:.3f} sec".format(current - epoch_time))
        print("Current elapsed time : {0:.3f} sec".format(current - start))

    print('==> Train done.')
    print(' '.join(['Results have been saved at', RESULT_FILE_PATH]))
    print(' '.join(['Checkpoints have been saved at', CHECKPOINT_FILE_PATH]))
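# Warmup_scheduler is only referenced above (the script saves and restores its
# current_step attribute). A minimal sketch, assuming it implements the "Noam"
# schedule from the original Transformer paper,
#   lr = dim_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5),
# which matches the warmup_steps = 4000 and dim_model arguments used here; the
# project's actual scheduler may differ:
class Warmup_scheduler:

    def __init__(self, optimizer, dim_model, warmup_steps):
        self.optimizer = optimizer
        self.dim_model = dim_model
        self.warmup_steps = warmup_steps
        self.current_step = 0  # restored from the checkpoint on --resume

    def step(self):
        # Advance one optimization step and set the new learning rate.
        # train() is assumed to call this once per optimizer update.
        self.current_step += 1
        lr = self.dim_model ** -0.5 * min(
            self.current_step ** -0.5,
            self.current_step * self.warmup_steps ** -1.5)
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr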