def main(args):
    if args.checkpoint == '':
        args.checkpoint = "checkpoints/ctw1500_%s_bs_%d_ep_%d" % (
            args.arch, args.batch_size, args.n_epoch)
        if args.pretrain:
            if 'synth' in args.pretrain:
                args.checkpoint += "_pretrain_synth"
            else:
                args.checkpoint += "_pretrain_ic17"

    print('checkpoint path: %s' % args.checkpoint)
    print('init lr: %.8f' % args.lr)
    print('schedule: ', args.schedule)
    sys.stdout.flush()

    if not os.path.isdir(args.checkpoint):
        os.makedirs(args.checkpoint)

    kernel_num = 7
    min_scale = 0.4
    start_epoch = 0

    data_loader = CTW1500Loader(is_transform=True,
                                img_size=args.img_size,
                                kernel_num=kernel_num,
                                min_scale=min_scale)
    # train_loader = ctw_train_loader(data_loader, batch_size=args.batch_size)

    if args.arch == "resnet50":
        model = models.resnet50(pretrained=True, num_classes=kernel_num)
    elif args.arch == "resnet101":
        model = models.resnet101(pretrained=True, num_classes=kernel_num)
    elif args.arch == "resnet152":
        model = models.resnet152(pretrained=True, num_classes=kernel_num)
    # resnet18 and resnet34 do not implement pretrained weights
    elif args.arch == "resnet18":
        model = models.resnet18(pretrained=False, num_classes=kernel_num)
    elif args.arch == "resnet34":
        model = models.resnet34(pretrained=False, num_classes=kernel_num)
    elif args.arch == "mobilenetv2":
        model = models.mobilenetv2(pretrained=False, num_classes=kernel_num)
    elif args.arch == "mobilenetv3large":
        model = models.mobilenetv3_large(pretrained=False, num_classes=kernel_num)
    elif args.arch == "mobilenetv3small":
        model = models.mobilenetv3_small(pretrained=False, num_classes=kernel_num)

    optimizer = tf.keras.optimizers.SGD(learning_rate=args.lr,
                                        momentum=0.99,
                                        decay=5e-4)

    title = 'CTW1500'
    if args.pretrain:
        print('Using pretrained model.')
        # NOTE: unlike the PyTorch scripts below, this TF port only checks the
        # path; it does not load the pretrained weights here.
        assert os.path.isfile(args.pretrain), 'Error: no checkpoint directory found!'
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(['Learning Rate', 'Train Loss', 'Train Acc.', 'Train IOU.'])
    elif args.resume:
        print('Resuming from checkpoint.')
        model.load_weights(args.resume)
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True)
    else:
        print('Training from scratch.')
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(['Learning Rate', 'Train Loss', 'Train Acc.', 'Train IOU.'])

    for epoch in range(start_epoch, args.n_epoch):
        optimizer = get_new_optimizer(args, optimizer, epoch)
        print('\nEpoch: [%d | %d] LR: %f' %
              (epoch + 1, args.n_epoch, optimizer.get_config()['learning_rate']))

        train_loader = ctw_train_loader(data_loader, batch_size=args.batch_size)
        train_loss, train_te_acc, train_ke_acc, train_te_iou, train_ke_iou = train(
            train_loader, model, dice_loss, optimizer, epoch)

        model.save_weights('%s%s' % (args.checkpoint, '/model_tf/weights'))
        logger.append([
            optimizer.get_config()['learning_rate'], train_loss, train_te_acc,
            train_te_iou
        ])
    logger.close()
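# get_new_optimizer is referenced above but not defined in this file. A minimal
# sketch of what it plausibly does, assuming the same step decay the PyTorch
# scripts use (x0.1 at each milestone in args.schedule); since Keras optimizers
# do not expose param_groups, the sketch rebuilds the optimizer instead. The
# real implementation may differ.
def get_new_optimizer(args, optimizer, epoch):
    # Count how many schedule milestones have passed and decay accordingly.
    lr = args.lr * (0.1 ** sum(1 for milestone in args.schedule if epoch >= milestone))
    return tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.99, decay=5e-4)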
def main(args):
    if args.checkpoint == '':
        args.checkpoint = "checkpoints/ctw1500_%s_bs_%d_ep_%d" % (
            args.arch, args.batch_size, args.n_epoch)
        if args.pretrain:
            if 'synth' in args.pretrain:
                args.checkpoint += "_pretrain_synth"
            else:
                args.checkpoint += "_pretrain_ic17"

    print('checkpoint path: %s' % args.checkpoint)
    print('init lr: %.8f' % args.lr)
    print('schedule: ', args.schedule)
    sys.stdout.flush()

    if not os.path.isdir(args.checkpoint):
        os.makedirs(args.checkpoint)

    kernel_num = 7
    min_scale = 0.4
    start_epoch = 0

    data_loader = CTW1500Loader(is_transform=True,
                                img_size=args.img_size,
                                kernel_num=kernel_num,
                                min_scale=min_scale)
    train_loader = torch.utils.data.DataLoader(data_loader,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=3,
                                               drop_last=True,
                                               pin_memory=True)

    if args.arch == "resnet50":
        model = models.resnet50(pretrained=True, num_classes=kernel_num)
    elif args.arch == "resnet101":
        model = models.resnet101(pretrained=True, num_classes=kernel_num)
    elif args.arch == "resnet152":
        model = models.resnet152(pretrained=True, num_classes=kernel_num)

    model = torch.nn.DataParallel(model).cuda()

    if hasattr(model.module, 'optimizer'):
        optimizer = model.module.optimizer
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.lr,
                                    momentum=0.99,
                                    weight_decay=5e-4)

    title = 'CTW1500'
    if args.pretrain:
        print('Using pretrained model.')
        assert os.path.isfile(args.pretrain), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(args.pretrain)
        model.load_state_dict(checkpoint['state_dict'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(['Learning Rate', 'Train Loss', 'Train Acc.', 'Train IOU.'])
    elif args.resume:
        print('Resuming from checkpoint.')
        assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True)
    else:
        print('Training from scratch.')
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(['Learning Rate', 'Train Loss', 'Train Acc.', 'Train IOU.'])

    for epoch in range(start_epoch, args.n_epoch):
        adjust_learning_rate(args, optimizer, epoch)
        print('\nEpoch: [%d | %d] LR: %f' %
              (epoch + 1, args.n_epoch, optimizer.param_groups[0]['lr']))

        train_loss, train_te_acc, train_ke_acc, train_te_iou, train_ke_iou = train(
            train_loader, model, dice_loss, optimizer, epoch)

        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'lr': args.lr,
                'optimizer': optimizer.state_dict(),
            },
            checkpoint=args.checkpoint)

        logger.append([
            optimizer.param_groups[0]['lr'], train_loss, train_te_acc,
            train_te_iou
        ])
    logger.close()
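# adjust_learning_rate and save_checkpoint are imported helpers in the original
# repo. A sketch consistent with the call sites above and with the copies
# defined inside train_psenet further down in this file:
def adjust_learning_rate(args, optimizer, epoch):
    # Step decay: multiply the LR by 0.1 at each milestone in args.schedule.
    if epoch in args.schedule:
        args.lr = args.lr * 0.1
        for param_group in optimizer.param_groups:
            param_group['lr'] = args.lr


def save_checkpoint(state, checkpoint='checkpoint', filename='checkpoint.pth.tar'):
    # Serialize the training state dict into the checkpoint directory.
    torch.save(state, os.path.join(checkpoint, filename))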
def main(args):
    print('checkpoint path: %s' % args.checkpoint)
    print('init lr: %.8f' % args.lr)
    print('schedule: ', args.schedule)
    print("useMultiGPUS?:" + args.multiGPU + ' running on device', end=' ')
    print(device)
    sys.stdout.flush()

    if not os.path.isdir(args.checkpoint):
        os.makedirs(args.checkpoint)

    kernel_num = args.kernelnum
    min_scale = args.min_scale
    start_epoch = 0

    # Pick dataset paths based on the machine this runs on.
    hostname = socket.gethostname()
    img_dir, label_dir = '', ''
    if hostname == 'DESKTOP-JBG1JGC':
        img_dir_root = 'C:/Users/xiangpu/Downloads/icdar2017rctw_train_v1.2/train/'
        img_dir = [img_dir_root + 'part' + str(i + 1) + '/' for i in range(3)]
        label_dir = 'labels.txt'
    elif hostname == 'zxp':
        img_dir_root = '/root/myDataSet/SceneText/'
        img_dir = [img_dir_root + 'part' + str(i + 1) + '/' for i in range(3)]
        label_dir = 'labels.txt'

    dataLoader = IC15Loader(img_dir, label_dir, False, args.img_size,
                            kernel_num=kernel_num, min_scale=min_scale)
    train_loader = DataLoader(dataLoader, args.batch_size, shuffle=True,
                              num_workers=3, drop_last=True, pin_memory=True)

    if args.arch == "resnet18":
        model = pseNets.resnet18(pretrained=True, num_classes=kernel_num)
    elif args.arch == "resnet34":
        model = pseNets.resnet34(pretrained=True, num_classes=kernel_num)
    elif args.arch == "resnet50":
        model = pseNets.resnet50(pretrained=True, num_classes=kernel_num)
    elif args.arch == "resnet101":
        model = pseNets.resnet101(pretrained=True, num_classes=kernel_num)
    elif args.arch == "resnet152":
        model = pseNets.resnet152(pretrained=True, num_classes=kernel_num)

    if args.multiGPU == 'true' and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    model = model.to(device)

    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=0.99, weight_decay=5e-4)

    title = 'ocrSegmentation'
    if args.pretrain:
        print('Using pretrained model.')
        assert os.path.isfile(args.pretrain), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(args.pretrain, map_location='cpu')
        model.load_state_dict(checkpoint['state_dict'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(['Learning Rate', 'Train Loss', 'Train Acc.', 'Train IOU.'])
    elif args.resume:
        print('Resuming from checkpoint.')
        assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True)
    else:
        print('Training from scratch.')
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(['Learning Rate', 'Train Loss', 'Train Acc.', 'Train IOU.'])

    model = model.to(device)
    for epoch in range(start_epoch, args.n_epoch):
        adjust_learning_rate(args, optimizer, epoch)
        print('\nEpoch: [%d | %d] LR: %f' %
              (epoch + 1, args.n_epoch, optimizer.param_groups[0]['lr']))

        train_loss, train_te_acc, train_ke_acc, train_te_iou, train_ke_iou = train(
            train_loader, model, dice_loss, optimizer, epoch)

        logger.append([
            optimizer.param_groups[0]['lr'], train_loss, train_te_acc,
            train_te_iou
        ])
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'lr': args.lr,
                'optimizer': optimizer.state_dict(),
            },
            checkpoint=args.checkpoint,
            filename="pseNet")

    # Timestamp the final artifacts so successive runs do not overwrite each other.
    os.remove('training_model')
    t = time.strftime('%Y_%m_%d_%H_%M', time.localtime())
    os.rename('trained_models/pseNet', 'trained_models/pseNet_' + t)
    logger.close()
    os.rename(os.path.join(args.checkpoint, 'log.txt'),
              os.path.join(args.checkpoint, 'log_' + t + '.txt'))
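# `device` is used as a module-level global in the script above but is not
# defined in this excerpt; it is presumably created along these lines before
# main() runs:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')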
def main(args):
    start_epoch = 0
    if args.checkpoint == '':
        args.checkpoint = "finetune_lista_checkpoint/n%d_s%d_p%d_snr%d/%s_bs_%d_ep_%d/measurements%d" \
            % (args.sample_nums, args.antenna_x * args.antenna_y,
               args.fault_prob * 100, args.SNR, args.arch,
               args.batch_size, args.n_epoch, args.measurements)

    print('checkpoint path: %s' % args.checkpoint)
    print('init lr: %.8f' % args.lr)
    # print('schedule: ', args.schedule)
    sys.stdout.flush()

    if not os.path.isdir(args.checkpoint):
        os.makedirs(args.checkpoint)

    data_loader = ListaDataLoader(args)
    train_loader = torch.utils.data.DataLoader(data_loader,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=0,
                                               drop_last=True,
                                               pin_memory=True)

    # sensing matrix
    A = ArrayResposeLoad(measurements=args.measurements,
                         antenna_x=args.antenna_x,
                         antenna_y=args.antenna_y)

    if args.arch == "LISTA":
        model = models.LISTA(A=A, T=args.T, lam=args.lam,
                             untied=args.untied, coord=args.coord)
    model = torch.nn.DataParallel(model).cuda()

    # optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    if args.pretrain:
        print('Using pretrained model.')
        assert os.path.isfile(args.pretrain), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(args.pretrain)
        model.load_state_dict(checkpoint['state_dict'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'Learning Rate', 'nmse'])
    elif args.resume:
        print('Resuming from checkpoint.')
        assert os.path.isfile(args.resume), 'Error: no resume checkpoint directory found!'
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), resume=True)
    else:
        print('Training from scratch.')
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'Learning Rate', 'nmse'])

    bestResult = np.inf
    lr_scheduler = CosineAnnealingLR(optimizer, T_max=70, eta_min=5e-6)
    for epoch in range(start_epoch, args.n_epoch):
        # NOTE: both the manual step decay and the cosine scheduler touch the
        # learning rate in this loop.
        adjust_learning_rate(args, optimizer, epoch)
        print('\nEpoch: [%d | %d] LR: %f' %
              (epoch + 1, args.n_epoch, optimizer.param_groups[0]['lr']))

        nmse = train(train_loader, model, optimizer, epoch)
        lr_scheduler.step()

        # save_checkpoint({'epoch': epoch + 1, 'state_dict': model.state_dict(),
        #                  'lr': args.lr, 'optimizer': optimizer.state_dict()},
        #                 epoch + 1, checkpoint=args.checkpoint)

        if args.need_validate and (epoch + 1) % 5 == 0:
            print('Validating the model')
            avgNmse = validate(model, args)
            print('The normalized mse in val set is: {nmse:.6f}'.format(nmse=avgNmse))
            if avgNmse < bestResult:
                print('Save the best model!')
                bestResult = avgNmse
                save_best_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'lr': args.lr,
                        'optimizer': optimizer.state_dict(),
                    },
                    checkpoint=args.checkpoint)

        logger.append([epoch + 1, optimizer.param_groups[0]['lr'], nmse])
    logger.close()
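# train() and validate() above report NMSE; the exact definition lives
# elsewhere in the repo. A common choice, shown here only as an assumption, is
# the reconstruction error energy normalized by the signal energy:
def nmse(x_hat, x):
    # normalized mean squared error ||x_hat - x||^2 / ||x||^2, batch-averaged
    err = torch.sum((x_hat - x) ** 2, dim=-1)
    ref = torch.sum(x ** 2, dim=-1)
    return torch.mean(err / ref)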
def main():
    # Hyperparameters were previously parsed with argparse (--arch, --img_size,
    # --n_epoch, --schedule, --batch_size, --lr, --resume, --checkpoint); they
    # are now hard-coded below.
    lr = 1e-3
    schedule = [200, 400]
    batch_size = 16
    # batch_size = 1
    n_epoch = 100
    image_size = 640
    checkpoint_path = ''
    # arch = 'resnet50'
    arch = 'mobilenetV2'
    resume = "checkpoints/ReCTS_%s_bs_%d_ep_%d" % (arch, batch_size, 5)
    # resume = None

    if checkpoint_path == '':
        checkpoint_path = "checkpoints/ReCTS_%s_bs_%d_ep_%d" % (
            arch, batch_size, n_epoch)

    print('checkpoint path: %s' % checkpoint_path)
    print('init lr: %.8f' % lr)
    print('schedule: ', schedule)
    sys.stdout.flush()

    if not os.path.isdir(checkpoint_path):
        os.makedirs(checkpoint_path)

    kernel_num = 7
    min_scale = 0.4
    start_epoch = 0

    data_loader = ReCTSDataLoader(
        need_transform=True,
        img_size=image_size,
        kernel_num=kernel_num,
        min_scale=min_scale,
        train_data_dir='../ocr_data/ReCTS/img/',
        train_gt_dir='../ocr_data/ReCTS/gt/'
        # train_data_dir='/kaggle/input/rects-ocr/img/',
        # train_gt_dir='/kaggle/input/rects-ocr/gt/'
    )
    ctw_root_dir = 'data/'

    train_loader = torch.utils.data.DataLoader(data_loader,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=3,
                                               drop_last=True,
                                               pin_memory=True)

    if arch == "resnet50":
        model = models.resnet50(pretrained=False, num_classes=kernel_num)
    elif arch == "resnet101":
        model = models.resnet101(pretrained=False, num_classes=kernel_num)
    elif arch == "resnet152":
        model = models.resnet152(pretrained=False, num_classes=kernel_num)
    elif arch == "mobilenetV2":
        model = PSENet(backbone="mobilenetv2", pretrained=False,
                       result_num=kernel_num, scale=1)

    if torch.cuda.is_available():
        model = torch.nn.DataParallel(model).cuda()
        device = 'cuda'
    else:
        model = torch.nn.DataParallel(model)
        device = 'cpu'

    optimizer = torch.optim.SGD(model.parameters(), lr=lr,
                                momentum=0.99, weight_decay=5e-4)

    title = 'ReCTS'
    if resume:
        print('Resuming from checkpoint.')
        checkpoint_file_path = os.path.join(resume, "checkpoint.pth.tar")
        assert os.path.isfile(checkpoint_file_path), \
            'Error: no checkpoint directory: %s found!' % checkpoint_file_path
        checkpoint = torch.load(checkpoint_file_path,
                                map_location=torch.device(device))
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        shutil.copy(os.path.join(resume, 'log.txt'),
                    os.path.join(checkpoint_path, 'log.txt'))
        logger = Logger(os.path.join(checkpoint_path, 'log.txt'), title=title, resume=True)
    else:
        print('Training from scratch.')
        logger = Logger(os.path.join(checkpoint_path, 'log.txt'), title=title)
        logger.set_names(['Learning Rate', 'Train Loss', 'Train Acc.', 'Train IOU.'])

    for epoch in range(start_epoch, n_epoch):
        lr = adjust_learning_rate(schedule, lr, optimizer, epoch)
        print('\nEpoch: [%d | %d] LR: %f' %
              (epoch + 1, n_epoch, optimizer.param_groups[0]['lr']))

        # print model statistics (torchstat)
        stat(model, (3, image_size, image_size))

        train_loss, train_te_acc, train_ke_acc, train_te_iou, train_ke_iou = train(
            train_loader, model, dice_loss, optimizer, epoch, lr, checkpoint_path)

        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'lr': lr,
                'optimizer': optimizer.state_dict(),
            },
            checkpoint=checkpoint_path)

        logger.append([
            optimizer.param_groups[0]['lr'], train_loss, train_te_acc,
            train_te_iou
        ])
    logger.close()
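# This script's adjust_learning_rate takes the schedule and the current lr
# explicitly and returns the (possibly decayed) lr, matching the call
# `lr = adjust_learning_rate(schedule, lr, optimizer, epoch)` above. A sketch:
def adjust_learning_rate(schedule, lr, optimizer, epoch):
    # Step decay: x0.1 at each milestone epoch, applied to all param groups.
    if epoch in schedule:
        lr = lr * 0.1
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
    return lr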
def main(args):
    best_acc = 0

    # Use CUDA
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus
    use_cuda = torch.cuda.is_available()

    # Random seed
    random.seed(time.time())
    if args.manual_seed is None:
        args.manual_seed = random.randint(1, 10000)

    if os.path.exists(args.out):
        shutil.rmtree(args.out)
    mkdir_p(args.out)

    args.n_gpus = len(args.gpus.split(','))
    state = {k: v for k, v in args._get_kwargs()}
    with open(os.path.join(args.out, 'args.json'), 'w', encoding='utf8') as f:
        json.dump(state, f)
    print('==> saved arguments')
    print(json.dumps(state, indent=4))

    set_seed(args)

    # Data
    print('==> Preparing IMDB')
    train_labeled_set, train_unlabeled_set, valid_set, test_set, \
        text_field, label_field = get_imdb('./data/aclImdb/')
    text_field.build_vocab(train_unlabeled_set, max_size=args.vocab_size,
                           vectors=GloVe(name='6B', dim=300, cache='./data/'))
    label_field.build_vocab(train_unlabeled_set)
    text_vocab, label_vocab = text_field.vocab, label_field.vocab
    print(f"Unique tokens in TEXT vocabulary: {len(text_vocab)}")
    print(f"Unique tokens in LABEL vocabulary: {len(label_vocab)}")
    embedding_matrix = text_vocab.vectors

    train_labeled_set = MyIMDB(train_labeled_set, text_vocab, label_vocab)
    train_unlabeled_set = MyIMDB(train_unlabeled_set, text_vocab, label_vocab,
                                 unlabeled=True)
    valid_set = MyIMDB(valid_set, text_vocab, label_vocab)
    test_set = MyIMDB(test_set, text_vocab, label_vocab)

    train_labeled_loader = DataLoader(train_labeled_set, batch_size=args.batch_size,
                                      shuffle=True, num_workers=0, drop_last=True)
    train_unlabeled_loader = DataLoader(train_unlabeled_set, batch_size=args.batch_size,
                                        shuffle=True, num_workers=0, drop_last=True)
    valid_loader = DataLoader(valid_set, batch_size=args.batch_size,
                              shuffle=False, num_workers=0)
    test_loader = DataLoader(test_set, batch_size=args.batch_size,
                             shuffle=False, num_workers=0)

    # Model
    print("==> creating TextCNN")

    def create_model(config, model=MixTextCNN, use_cuda=False, ema=False):
        model = model(config)
        if use_cuda:
            model = model.cuda()
        if ema:
            # EMA weights are updated manually, never by backprop.
            for param in model.parameters():
                param.detach_()
        return model

    config = Config(text_field, label_field, embedding=embedding_matrix)
    model = create_model(config, use_cuda=use_cuda)
    ema_model = create_model(config, use_cuda=use_cuda, ema=True)

    cudnn.benchmark = True
    print('    Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))

    train_criterion = SemiLoss()
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=2)
    ema_optimizer = WeightEMA(model, ema_model, args.lr, alpha=args.ema_decay)
    start_epoch = 0

    # Resume
    title = 'noisy-imdb'
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!'
        args.out = os.path.dirname(args.resume)
        checkpoint = torch.load(args.resume)
        best_acc = checkpoint['best_acc']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        ema_model.load_state_dict(checkpoint['ema_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger = Logger(os.path.join(args.out, 'log.txt'), title=title, resume=True)
    else:
        logger = Logger(os.path.join(args.out, 'log.txt'), title=title)
        logger.set_names(['Train Loss', 'Train Loss X', 'Train Loss U',
                          'Valid Loss', 'Valid Acc.', 'Test Loss', 'Test Acc.'])

    writer = SummaryWriter(args.out)
    step = 0
    test_accs = []

    # Train and val
    for epoch in range(start_epoch, args.epochs):
        print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr']))

        train_loss, train_loss_x, train_loss_u = train(
            train_labeled_loader, train_unlabeled_loader, text_vocab, model,
            optimizer, ema_optimizer, train_criterion, epoch, use_cuda)
        _, train_acc = validate(train_labeled_loader, ema_model, criterion,
                                use_cuda, mode='Train Stats')
        val_loss, val_acc = validate(valid_loader, ema_model, criterion,
                                     use_cuda, mode='Valid Stats')
        test_loss, test_acc = validate(test_loader, ema_model, criterion,
                                       use_cuda, mode='Test Stats ')
        lr_scheduler.step(test_acc)

        step = args.val_iteration * (epoch + 1)
        writer.add_scalar('losses/train_loss', train_loss, step)
        writer.add_scalar('losses/valid_loss', val_loss, step)
        writer.add_scalar('losses/test_loss', test_loss, step)
        writer.add_scalar('accuracy/train_acc', train_acc, step)
        writer.add_scalar('accuracy/val_acc', val_acc, step)
        writer.add_scalar('accuracy/test_acc', test_acc, step)

        # append logger file
        logger.append([train_loss, train_loss_x, train_loss_u,
                       val_loss, val_acc, test_loss, test_acc])

        # save model (key name must match the one read back on resume)
        is_best = val_acc > best_acc
        best_acc = max(val_acc, best_acc)
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'ema_state_dict': ema_model.state_dict(),
            'acc': val_acc,
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        }, is_best, args.out)
        test_accs.append(test_acc)

    logger.close()
    writer.close()

    print('Best val acc:')
    print(best_acc)
    print('Mean test acc:')
    print(np.mean(test_accs[-20:]))
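# WeightEMA keeps ema_model as an exponential moving average of model. A sketch
# in the MixMatch style this code appears to follow; the lr-scaled implicit
# weight decay applied to the live model is the detail most likely to differ
# in the real implementation:
class WeightEMA(object):
    def __init__(self, model, ema_model, lr, alpha=0.999):
        self.params = list(model.state_dict().values())
        self.ema_params = list(ema_model.state_dict().values())
        self.alpha = alpha
        self.wd = 0.02 * lr  # implicit weight decay on the live model

    def step(self):
        one_minus_alpha = 1.0 - self.alpha
        for param, ema_param in zip(self.params, self.ema_params):
            if ema_param.dtype == torch.float32:
                # ema <- alpha * ema + (1 - alpha) * param
                ema_param.mul_(self.alpha)
                ema_param.add_(param * one_minus_alpha)
                # decay the raw weights slightly each step
                param.mul_(1 - self.wd)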
def main(args):
    if args.checkpoint == '':
        args.checkpoint = "checkpoints/ic15_%s_bs_%d_ep_%d" % (
            args.arch, args.batch_size, args.n_epoch)
        if args.pretrain:
            if 'synth' in args.pretrain:
                args.checkpoint += "_pretrain_synth"
            else:
                args.checkpoint += "_pretrain_s1280"

    print('checkpoint path: %s' % args.checkpoint)
    print('init lr: %.8f' % args.lr)
    print('schedule: ', args.schedule)
    sys.stdout.flush()

    if not os.path.isdir(args.checkpoint):
        os.makedirs(args.checkpoint)
    writer = SummaryWriter(args.checkpoint)

    kernel_num = 18
    start_epoch = 0

    data_loader = IC15Loader(is_transform=True, img_size=args.img_size)
    train_loader = torch.utils.data.DataLoader(data_loader,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=3,
                                               drop_last=False,
                                               pin_memory=True)

    if args.arch == "resnet50":
        model = models.resnet50(pretrained=True, num_classes=kernel_num)
    elif args.arch == "resnet101":
        model = models.resnet101(pretrained=True, num_classes=kernel_num)
    elif args.arch == "resnet152":
        model = models.resnet152(pretrained=True, num_classes=kernel_num)
    elif args.arch == "vgg16":
        model = models.vgg16(pretrained=False, num_classes=kernel_num)

    model = torch.nn.DataParallel(model).cuda()
    model.train()

    if hasattr(model.module, 'optimizer'):
        optimizer = model.module.optimizer
    else:
        # NOTE: the momentum here has a large effect on training; with 0.99
        # the cross-entropy loss fails to converge.
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    momentum=0.9, weight_decay=5e-4)

    title = 'icdar2015'
    if args.pretrain:
        print('Using pretrained model.')
        assert os.path.isfile(args.pretrain), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(args.pretrain)
        model.load_state_dict(checkpoint['state_dict'])
        start_epoch = checkpoint['epoch']
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(['Learning Rate', 'Train Loss', 'Train Acc.', 'Train IOU.'])
    elif args.resume:
        print('Resuming from checkpoint.')
        assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        # optimizer.load_state_dict(checkpoint['optimizer'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True)
    else:
        print('Training from scratch.')
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(['Learning Rate', 'Train Loss', 'Train Acc.', 'Train IOU.'])

    # Per-image loss history, filled in by train() for hard-example analysis.
    images_loss = {}
    # data_plot = images_loss.values()
    # import matplotlib.pyplot as plt
    # plt.plot(data_plot)
    # plt.ylabel('Loss plot')
    # plt.show()

    for epoch in range(start_epoch, args.n_epoch):
        adjust_learning_rate(args, optimizer, epoch)
        print('\nEpoch: [%d | %d] LR: %f' %
              (epoch + 1, args.n_epoch, optimizer.param_groups[0]['lr']))

        train_loss, train_te_acc, train_te_iou = train(
            train_loader, images_loss, model, dice_loss, optimizer, epoch, writer)

        if epoch % 5 == 0 and epoch != 0:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'lr': args.lr,
                    'optimizer': optimizer.state_dict(),
                },
                checkpoint=args.checkpoint,
                filename='checkpoint_%d.pth' % epoch)

        logger.append([optimizer.param_groups[0]['lr'], train_loss,
                       train_te_acc, train_te_iou])
    logger.close()
    writer.flush()
    writer.close()
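# The commented-out plotting snippet above hints at how images_loss is meant to
# be inspected after training: one loss value per image, sorted so the hardest
# examples stand out. A runnable sketch of that idea (matplotlib assumed
# available; plot_image_losses is a hypothetical helper, not part of the repo):
import matplotlib.pyplot as plt

def plot_image_losses(images_loss):
    # images_loss maps image identifiers to their latest training loss
    losses = sorted(images_loss.values())
    plt.plot(losses)
    plt.ylabel('Loss plot')
    plt.show()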
def main(args):
    # torch.backends.cudnn.benchmark = True
    title = args.title
    if args.checkpoint == '':
        args.checkpoint = "checkpoints/%s_%s_bs_%d_ep_%d" % (
            title, args.arch, args.batch_size, args.n_epoch)
        if args.pretrain:
            if 'synth' in args.pretrain:
                args.checkpoint += "_pretrain_synth"
            else:
                args.checkpoint += "_pretrain_ic17"

    print('checkpoint path: %s' % args.checkpoint)
    print('init lr: %.8f' % args.lr)
    print('schedule: ', args.schedule)
    args.vals = args.vals.split(';') if args.vals else []
    print('vals:', args.vals)
    sys.stdout.flush()

    if not os.path.isdir(args.checkpoint):
        os.makedirs(args.checkpoint)

    kernel_num = 7
    min_scale = 0.4
    start_epoch = 0

    # data_loader = CTW1500Loader(is_transform=True, img_size=args.img_size, kernel_num=kernel_num, min_scale=min_scale)
    # data_loader = IC15Loader(is_transform=True, img_size=args.img_size, kernel_num=kernel_num, min_scale=min_scale)
    data_loader = OcrDataLoader(args, is_transform=True, img_size=args.img_size,
                                kernel_num=kernel_num, min_scale=min_scale)
    train_loader = torch.utils.data.DataLoader(data_loader,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.num_workers,
                                               drop_last=True,
                                               pin_memory=True)

    if args.arch == "resnet50":
        model = models.resnet50(pretrained=True, num_classes=kernel_num)
    elif args.arch == "resnet101":
        model = models.resnet101(pretrained=True, num_classes=kernel_num)
    elif args.arch == "resnet152":
        model = models.resnet152(pretrained=True, num_classes=kernel_num)

    if len(args.gpus) > 1:
        # Custom DataParallel that balances uneven chunk sizes across GPUs.
        model = DataParallel(model, device_ids=args.gpus,
                             chunk_sizes=args.chunk_sizes).cuda()
        optimizer = model.module.optimizer
    else:
        model = model.cuda()
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    momentum=0.99, weight_decay=5e-4)

    if args.pretrain:
        print('Using pretrained model.')
        assert os.path.isfile(args.pretrain), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(args.pretrain)
        model.load_state_dict(checkpoint['state_dict'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(['Learning Rate', 'Train Loss', 'Train Acc.', 'Train IOU.'])
    elif args.resume:
        print('Resuming from checkpoint.')
        assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True)
    else:
        print('Training from scratch.')
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(['Learning Rate', 'Train Loss', 'Train Acc.', 'Train IOU.'])

    best_target = {'epoch': 0, 'val': 0}
    for epoch in range(start_epoch, args.n_epoch):
        adjust_learning_rate(args, optimizer, epoch)
        print('\nEpoch: [%d | %d] LR: %f' %
              (epoch + 1, args.n_epoch, optimizer.param_groups[0]['lr']))

        train_loss, train_te_acc, train_ke_acc, train_te_iou, train_ke_iou = train(
            train_loader, model, dice_loss, optimizer, epoch)

        # validate
        if args.vals:
            target = run_tests(args, model, epoch)
            # save best model
            if target > best_target['val']:
                best_target['val'] = target
                best_target['epoch'] = epoch + 1
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'lr': args.lr,
                        'optimizer': optimizer.state_dict(),
                    },
                    checkpoint=args.checkpoint,
                    filename='best.pth.tar')
            print('best_target: epoch: %d, val: %.4f' %
                  (best_target['epoch'], best_target['val']))

        # save latest model
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'lr': args.lr,
                'optimizer': optimizer.state_dict(),
            },
            checkpoint=args.checkpoint)

        logger.append([
            optimizer.param_groups[0]['lr'], train_loss, train_te_acc,
            train_te_iou
        ])
    logger.close()
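# run_tests(args, model, epoch) is defined elsewhere; the loop above only
# requires that it return one scalar to maximize (e.g. an hmean averaged over
# the validation sets listed in args.vals). A sketch of that contract;
# evaluate_on(...) is hypothetical, substitute the repo's own per-dataset
# evaluation entry point:
def run_tests(args, model, epoch):
    model.eval()
    scores = []
    with torch.no_grad():
        for val_name in args.vals:
            scores.append(evaluate_on(val_name, model, args))
    model.train()
    return sum(scores) / max(len(scores), 1)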
def main(args):
    if args.checkpoint == '':
        # args.checkpoint = "checkpointsfuns/funs19_%s_bs_%d_ep_%d" % (args.arch, args.batch_size, args.n_epoch)
        args.checkpoint = "checkpoints/model_funs_pretrain_ic15_frozen_dense_layers"
        if args.pretrain:
            if 'synth' in args.pretrain:
                args.checkpoint += "_pretrain_synth"
            else:
                args.checkpoint += "_pretrain_ic17"

    print('checkpoint path: %s' % args.checkpoint)
    print('init lr: %.8f' % args.lr)
    print('schedule: ', args.schedule)
    sys.stdout.flush()

    if not os.path.isdir(args.checkpoint):
        os.makedirs(args.checkpoint)

    kernel_num = 7
    min_scale = 0.4
    start_epoch = 0

    data_loader = IC15Loader(is_transform=True, img_size=args.img_size,
                             kernel_num=kernel_num, min_scale=min_scale)
    train_loader = torch.utils.data.DataLoader(data_loader,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=3,
                                               drop_last=True,
                                               pin_memory=True)

    if args.arch == "resnet50":
        model = models.resnet50(pretrained=True, num_classes=kernel_num)
    elif args.arch == "resnet101":
        model = models.resnet101(pretrained=True, num_classes=kernel_num)
    elif args.arch == "resnet152":
        model = models.resnet152(pretrained=True, num_classes=kernel_num)
    elif args.arch == "pvanet":
        model = models.pvanet(inputsize=args.img_size, num_classes=kernel_num)

    model = torch.nn.DataParallel(model).cuda()

    if hasattr(model.module, 'optimizer'):
        optimizer = model.module.optimizer
    else:
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    momentum=0.99, weight_decay=5e-4)

    title = 'icdar2015'
    if args.pretrain:
        print('Using pretrained model.')
        assert os.path.isfile(args.pretrain), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(args.pretrain)
        model.load_state_dict(checkpoint['state_dict'])
        # fine-tune only the output layers:
        # grad = [
        #     'module.conv2.weight',
        #     'module.conv2.bias',
        #     'module.bn2.weight',
        #     'module.bn2.bias',
        #     'module.conv3.weight',
        #     'module.conv3.bias',
        # ]
        # for name, value in model.named_parameters():
        #     if name in grad:
        #         value.requires_grad = True
        #     else:
        #         value.requires_grad = False
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(['Learning Rate', 'Train Loss', 'Train Acc.', 'Train IOU.'])
    elif args.resume:
        print('Resuming from checkpoint.')
        assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True)
    else:
        print('Training from scratch.')
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(['Learning Rate', 'Train Loss', 'Train Acc.', 'Train IOU.'])

    writer = SummaryWriter(args.summary_path)
    for epoch in range(start_epoch, args.n_epoch):
        adjust_learning_rate(args, optimizer, epoch)
        print('\nEpoch: [%d | %d] LR: %f' %
              (epoch + 1, args.n_epoch, optimizer.param_groups[0]['lr']))

        train_loss, train_te_acc, train_ke_acc, train_te_iou, train_ke_iou = train(
            train_loader, model, dice_loss, optimizer, epoch)

        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'lr': args.lr,
                'optimizer': optimizer.state_dict(),
            },
            checkpoint=args.checkpoint)

        writer.add_scalar('Loss', train_loss, epoch)
        writer.add_scalar('train_te_acc', train_te_acc, epoch)
        writer.add_scalar('train_te_iou', train_te_iou, epoch)
        writer.flush()

        logger.append([
            optimizer.param_groups[0]['lr'], train_loss, train_te_acc,
            train_te_iou
        ])
    logger.close()
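# If the commented-out freezing block above is enabled, a cleaner variant is to
# hand the optimizer only the trainable parameters. SGD already skips
# parameters whose grad stays None, but filtering keeps the optimizer state
# free of frozen tensors. A sketch:
optimizer = torch.optim.SGD(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=args.lr, momentum=0.99, weight_decay=5e-4)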
def train_psenet(config_file):
    import sys
    sys.path.append('./detection_model/PSENet')
    # sys.path.append('/home/cjy/PSENet-master')
    import torch
    import argparse
    import numpy as np
    import torch.nn as nn
    import torch.nn.functional as F
    import shutil
    from torch.autograd import Variable
    from torch.utils import data
    import os
    from dataset import IC15Loader
    from metrics import runningScore
    import models
    from util import Logger, AverageMeter
    import time
    from tensorboardX import SummaryWriter
    import util
    from yacs.config import CfgNode as CN

    writer = SummaryWriter()

    def read_config_file(config_file):
        # Parse the YAML config file with yacs.
        f = open(config_file)
        opt = CN.load_cfg(f)
        return opt

    args = read_config_file(config_file)

    def ohem_single(score, gt_text, training_mask):
        # Positive pixels that are not masked out.
        pos_num = int(np.sum(gt_text > 0.5)) - int(
            np.sum((gt_text > 0.5) & (training_mask <= 0.5)))

        if pos_num == 0:
            # selected_mask = gt_text.copy() * 0  # may be not good
            selected_mask = training_mask
            selected_mask = selected_mask.reshape(
                1, selected_mask.shape[0], selected_mask.shape[1]).astype('float32')
            return selected_mask

        neg_num = int(np.sum(gt_text <= 0.5))
        neg_num = int(min(pos_num * 3, neg_num))

        if neg_num == 0:
            selected_mask = training_mask
            selected_mask = selected_mask.reshape(
                1, selected_mask.shape[0], selected_mask.shape[1]).astype('float32')
            return selected_mask

        # Keep the hardest negatives: those with the highest scores.
        neg_score = score[gt_text <= 0.5]
        neg_score_sorted = np.sort(-neg_score)
        threshold = -neg_score_sorted[neg_num - 1]

        selected_mask = ((score >= threshold) |
                         (gt_text > 0.5)) & (training_mask > 0.5)
        selected_mask = selected_mask.reshape(
            1, selected_mask.shape[0], selected_mask.shape[1]).astype('float32')
        return selected_mask

    def ohem_batch(scores, gt_texts, training_masks):
        scores = scores.data.cpu().numpy()
        gt_texts = gt_texts.data.cpu().numpy()
        training_masks = training_masks.data.cpu().numpy()

        selected_masks = []
        for i in range(scores.shape[0]):
            selected_masks.append(
                ohem_single(scores[i, :, :], gt_texts[i, :, :],
                            training_masks[i, :, :]))

        selected_masks = np.concatenate(selected_masks, 0)
        selected_masks = torch.from_numpy(selected_masks).float()
        return selected_masks

    def dice_loss(input, target, mask):
        input = torch.sigmoid(input)

        input = input.contiguous().view(input.size()[0], -1)
        target = target.contiguous().view(target.size()[0], -1)
        mask = mask.contiguous().view(mask.size()[0], -1)

        input = input * mask
        target = target * mask

        a = torch.sum(input * target, 1)
        b = torch.sum(input * input, 1) + 0.001
        c = torch.sum(target * target, 1) + 0.001
        d = (2 * a) / (b + c)
        dice_loss = torch.mean(d)
        return 1 - dice_loss

    def cal_text_score(texts, gt_texts, training_masks, running_metric_text):
        training_masks = training_masks.data.cpu().numpy()
        pred_text = torch.sigmoid(texts).data.cpu().numpy() * training_masks
        pred_text[pred_text <= 0.5] = 0
        pred_text[pred_text > 0.5] = 1
        pred_text = pred_text.astype(np.int32)
        gt_text = gt_texts.data.cpu().numpy() * training_masks
        gt_text = gt_text.astype(np.int32)
        running_metric_text.update(gt_text, pred_text)
        score_text, _ = running_metric_text.get_scores()
        return score_text

    def cal_kernel_score(kernels, gt_kernels, gt_texts, training_masks,
                         running_metric_kernel):
        mask = (gt_texts * training_masks).data.cpu().numpy()
        kernel = kernels[:, -1, :, :]
        gt_kernel = gt_kernels[:, -1, :, :]
        pred_kernel = torch.sigmoid(kernel).data.cpu().numpy()
        pred_kernel[pred_kernel <= 0.5] = 0
        pred_kernel[pred_kernel > 0.5] = 1
        pred_kernel = (pred_kernel * mask).astype(np.int32)
        gt_kernel = gt_kernel.data.cpu().numpy()
        gt_kernel = (gt_kernel * mask).astype(np.int32)
        running_metric_kernel.update(gt_kernel, pred_kernel)
        score_kernel, _ = running_metric_kernel.get_scores()
        return score_kernel

    def train(train_loader, model, criterion, optimizer, epoch):
        model.train()

        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        running_metric_text = runningScore(2)
        running_metric_kernel = runningScore(2)

        end = time.time()
        for batch_idx, (imgs, gt_texts, gt_kernels,
                        training_masks) in enumerate(train_loader):
            data_time.update(time.time() - end)

            imgs = Variable(imgs.cuda())
            gt_texts = Variable(gt_texts.cuda())
            gt_kernels = Variable(gt_kernels.cuda())
            training_masks = Variable(training_masks.cuda())

            outputs = model(imgs)
            texts = outputs[:, 0, :, :]
            kernels = outputs[:, 1:, :, :]

            # OHEM mask for the full-text map.
            selected_masks = ohem_batch(texts, gt_texts, training_masks)
            selected_masks = Variable(selected_masks.cuda())

            loss_text = criterion(texts, gt_texts, selected_masks)

            # Kernel losses are only computed inside the predicted text region.
            loss_kernels = []
            mask0 = torch.sigmoid(texts).data.cpu().numpy()
            mask1 = training_masks.data.cpu().numpy()
            selected_masks = ((mask0 > 0.5) & (mask1 > 0.5)).astype('float32')
            selected_masks = torch.from_numpy(selected_masks).float()
            selected_masks = Variable(selected_masks.cuda())
            for i in range(6):
                kernel_i = kernels[:, i, :, :]
                gt_kernel_i = gt_kernels[:, i, :, :]
                loss_kernel_i = criterion(kernel_i, gt_kernel_i, selected_masks)
                loss_kernels.append(loss_kernel_i)
            loss_kernel = sum(loss_kernels) / len(loss_kernels)

            loss = 0.7 * loss_text + 0.3 * loss_kernel
            losses.update(loss.item(), imgs.size(0))

            if batch_idx % 100 == 0:
                writer.add_scalar('loss_text', loss_text,
                                  batch_idx + epoch * len(train_loader))
                writer.add_scalar('loss_kernel', loss_kernel,
                                  batch_idx + epoch * len(train_loader))
                writer.add_scalar('total_loss', loss,
                                  batch_idx + epoch * len(train_loader))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            score_text = cal_text_score(texts, gt_texts, training_masks,
                                        running_metric_text)
            score_kernel = cal_kernel_score(kernels, gt_kernels, gt_texts,
                                            training_masks,
                                            running_metric_kernel)

            batch_time.update(time.time() - end)
            end = time.time()

            if batch_idx % 20 == 0:
                output_log = ('({batch}/{size}) Batch: {bt:.3f}s | '
                              'TOTAL: {total:.0f}min | ETA: {eta:.0f}min | '
                              'Loss: {loss:.4f} | Acc_t: {acc:.4f} | '
                              'IOU_t: {iou_t:.4f} | IOU_k: {iou_k:.4f}').format(
                    batch=batch_idx + 1,
                    size=len(train_loader),
                    bt=batch_time.avg,
                    total=batch_time.avg * batch_idx / 60.0,
                    eta=batch_time.avg * (len(train_loader) - batch_idx) / 60.0,
                    loss=losses.avg,
                    acc=score_text['Mean Acc'],
                    iou_t=score_text['Mean IoU'],
                    iou_k=score_kernel['Mean IoU'])
                print(output_log)
                sys.stdout.flush()

        return (losses.avg, score_text['Mean Acc'], score_kernel['Mean Acc'],
                score_text['Mean IoU'], score_kernel['Mean IoU'])

    def adjust_learning_rate(args, optimizer, epoch):
        if epoch in args.schedule:
            args.lr = args.lr * 0.1
            for param_group in optimizer.param_groups:
                param_group['lr'] = args.lr

    def save_checkpoint(state, checkpoint='checkpoint',
                        filename='_checkpoint.pth.tar', epoch=0):
        filepath = os.path.join(checkpoint, 'epoch_' + str(epoch) + filename)
        torch.save(state, filepath)

    if args.checkpoint == '':
        args.checkpoint = "checkpoints/ic15_%s_bs_%d_ep_%d" % (
            args.arch, args.batch_size, args.n_epoch)
        if args.pretrain:
            if 'synth' in args.pretrain:
                args.checkpoint += "_pretrain_synth"
            else:
                args.checkpoint += "_pretrain_LSVT"

    print('checkpoint path: %s' % args.checkpoint)
    print('init lr: %.8f' % args.lr)
    print('schedule: ', args.schedule)
    sys.stdout.flush()
    # if not os.path.isdir(args.checkpoint):
    #     os.makedirs(args.checkpoint)

    kernel_num = 7
    min_scale = 0.4
    start_epoch = 0

    data_loader = IC15Loader(is_transform=True, img_size=args.img_size,
                             kernel_num=kernel_num, min_scale=min_scale)
    train_loader = torch.utils.data.DataLoader(data_loader,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=3,
                                               drop_last=True,
                                               pin_memory=True)

    if args.arch == "resnet50":
        model = models.resnet50(pretrained=True, num_classes=kernel_num)
    elif args.arch == "resnet101":
        model = models.resnet101(pretrained=True, num_classes=kernel_num)
    elif args.arch == "resnet152":
        model = models.resnet152(pretrained=True, num_classes=kernel_num)

    model = torch.nn.DataParallel(model).cuda()

    if hasattr(model.module, 'optimizer'):
        optimizer = model.module.optimizer
    else:
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    momentum=0.99, weight_decay=5e-4)

    title = 'icdar2015'
    if args.pretrain:
        print('Using pretrained model.')
        assert os.path.isfile(args.pretrain), 'Error: no checkpoint directory found!'
        print(args.pretrain)
        checkpoint = torch.load(args.pretrain)
        # Copy over only the pretrained weights whose keys match the model.
        state = model.state_dict()
        for key in state.keys():
            if key in checkpoint.keys():
                state[key] = checkpoint[key]
        model.load_state_dict(state)
        # model.load_state_dict(checkpoint['state_dict'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(['Learning Rate', 'Train Loss', 'Train Acc.', 'Train IOU.'])
    elif args.resume:
        print('Resuming from checkpoint.')
        assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True)
    else:
        print('Training from scratch.')
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(['Learning Rate', 'Train Loss', 'Train Acc.', 'Train IOU.'])

    for epoch in range(start_epoch, args.n_epoch):
        adjust_learning_rate(args, optimizer, epoch)
        print('\nEpoch: [%d | %d] LR: %f' %
              (epoch + 1, args.n_epoch, optimizer.param_groups[0]['lr']))

        train_loss, train_te_acc, train_ke_acc, train_te_iou, train_ke_iou = train(
            train_loader, model, dice_loss, optimizer, epoch)

        if (epoch + 1) % 5 == 0:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'lr': args.lr,
                    'optimizer': optimizer.state_dict(),
                },
                checkpoint=args.checkpoint,
                epoch=epoch)

        logger.append([
            optimizer.param_groups[0]['lr'], train_loss, train_te_acc,
            train_te_iou
        ])
    logger.close()
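# train_psenet takes a yacs-style YAML config whose keys mirror the argparse
# options used by the other scripts in this file. A hypothetical example
# (field names inferred from the attribute accesses above, not from the repo):
#
#   # config/psenet_ic15.yaml
#   arch: resnet50
#   batch_size: 16
#   n_epoch: 600
#   lr: 0.001
#   schedule: [200, 400]
#   img_size: 640
#   pretrain: ''
#   resume: ''
#   checkpoint: ''
#
# train_psenet('config/psenet_ic15.yaml')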
def main(args):
    start_epoch = 0
    start_layer = 1
    if args.checkpoint == '':
        args.checkpoint = "lista_checkpoint/n%d_s%d_p%d_snr%d/%s_bs_%d_ep_%d/measurements%d" \
            % (args.sample_nums, args.antenna_x * args.antenna_y,
               args.fault_prob * 100, args.SNR, args.arch,
               args.batch_size, args.n_epoch, args.measurements)

    print('checkpoint path: %s' % args.checkpoint)
    print('init lr: %.8f' % args.lr)
    # print('schedule: ', args.schedule)
    sys.stdout.flush()

    if not os.path.isdir(args.checkpoint):
        os.makedirs(args.checkpoint)

    data_loader = ListaDataLoader(args)
    train_loader = torch.utils.data.DataLoader(data_loader,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=0,
                                               drop_last=True,
                                               pin_memory=True)

    # sensing matrix
    A = ArrayResposeLoad(measurements=args.measurements,
                         antenna_x=args.antenna_x,
                         antenna_y=args.antenna_y)

    if args.arch == "LISTA":
        model = models.LISTA(A=A, T=args.T, lam=args.lam,
                             untied=args.untied, coord=args.coord)
    model = torch.nn.DataParallel(model).cuda()

    if args.pretrain:
        print('Using pretrained model.')
        assert os.path.isfile(args.pretrain), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(args.pretrain)
        # Keep only the pretrained tensors whose names and shapes match the
        # current model, and load them without requiring full coverage.
        d = collections.OrderedDict()
        keys = list(checkpoint['state_dict'].keys())
        for pname, para in model.named_parameters():
            if pname in keys and checkpoint['state_dict'][pname].shape == para.shape:
                d[pname] = checkpoint['state_dict'][pname]
        model.load_state_dict(d, strict=False)
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(['Layer', 'Epoch', 'Learning Rate', 'nmse'])
    elif args.resume:
        print('Resuming from checkpoint.')
        assert os.path.isfile(args.resume), 'Error: no resume checkpoint directory found!'
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint['epoch'] + 1
        start_layer = checkpoint['layer']
        model.load_state_dict(checkpoint['state_dict'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), resume=True)
    else:
        print('Training from scratch.')
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(['Layer', 'Epoch', 'Learning Rate', 'nmse'])

    bestResult = np.inf
    for layer in range(start_layer, args.T + 1):
        print('Start training layer: {}'.format(layer))
        # Freeze everything except the parameters of the layer being trained
        # (plus the shared W/B weights in the tied configuration).
        if args.untied:
            for name, para in model.named_parameters():
                para.requires_grad = name.endswith('_{}'.format(layer))
        else:
            for name, para in model.named_parameters():
                if name.endswith('W') or name.endswith('B'):
                    para.requires_grad = True
                    continue
                para.requires_grad = name.endswith('theta_{}'.format(layer))

        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
        lr_scheduler = CosineAnnealingLR(optimizer, T_max=args.n_epoch, eta_min=5e-6)
        if args.resume:
            checkpoint = torch.load(args.resume)
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])

        for epoch in range(start_epoch, args.n_epoch):
            start_epoch = 0  # only the resumed layer starts at a nonzero epoch
            # adjust_learning_rate(args, optimizer, epoch)
            print('\nEpoch: [%d | %d] LR: %f' %
                  (epoch + 1, args.n_epoch, optimizer.param_groups[0]['lr']))

            nmse = train(train_loader, model, optimizer, epoch, layer)
            lr_scheduler.step()

            save_checkpoint(
                {
                    'layer': layer,
                    'epoch': epoch,
                    'state_dict': model.state_dict(),
                    'lr': args.lr,
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'optimizer': optimizer.state_dict(),
                },
                layer, epoch + 1, checkpoint=args.checkpoint)

            if args.need_validate and (epoch + 1) % 5 == 0:
                print('Validating the model')
                avgNmse = validate(model, args, layer)
                print('The normalized mse in val set is: {nmse:.6f}'.format(nmse=avgNmse))
                if avgNmse < bestResult:
                    print('Save the best model!')
                    bestResult = avgNmse
                    save_best_checkpoint(
                        {
                            'layer': layer,
                            'epoch': epoch,
                            'state_dict': model.state_dict(),
                            'lr': args.lr,
                            'lr_scheduler': lr_scheduler.state_dict(),
                            'optimizer': optimizer.state_dict(),
                        },
                        checkpoint=args.checkpoint)

            logger.append([layer, epoch + 1, optimizer.param_groups[0]['lr'], nmse])

        # Clear the resume state so later layers start fresh.
        args.resume = None

    # lastly finetune the model
    finetune(args, model, train_loader, start_epoch, logger, bestResult)
    logger.close()
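# finetune() is defined elsewhere; given the layer-wise loop above, it
# presumably unfreezes every parameter and trains the whole unrolled network
# end-to-end for a final pass. A sketch of that contract, not the repo's
# actual implementation (the train() signature and the final layer index
# args.T are assumptions):
def finetune(args, model, train_loader, start_epoch, logger, bestResult):
    # Unfreeze all layers for joint end-to-end training.
    for para in model.parameters():
        para.requires_grad = True
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    for epoch in range(start_epoch, args.n_epoch):
        nmse = train(train_loader, model, optimizer, epoch, args.T)
        logger.append([args.T, epoch + 1, optimizer.param_groups[0]['lr'], nmse])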