def main():
    # os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu_list
    FLAGS.checkpoint_path = osp.join(FLAGS.checkpoint_path, str(datetime.date.today()))

    # check if checkpoint path exists
    if not os.path.exists(FLAGS.checkpoint_path):
        os.makedirs(FLAGS.checkpoint_path)
    else:
        shutil.rmtree(FLAGS.checkpoint_path)
        os.makedirs(FLAGS.checkpoint_path)

    train_dataset_size = 1125
    val_dataset_size = 471
    train_data_generator = data_processor.generator(
        'train_1125_13_train_15_train_8_8_64_no###.h5',
        dataset_size=train_dataset_size, batch_size=8)
    val_data_generator = data_processor.generator(
        'val_471_15_test_8_8_64_no###.h5',
        dataset_size=val_dataset_size, batch_size=8)

    east = EAST_model(FLAGS.input_size)
    model = east.model
    # model = multi_gpu_model(east.model, gpus=2)

    score_map_loss_weight = K.variable(0.01, name='score_map_loss_weight')

    csv_logger = CSVLogger(filename='icdar_2015_2013.csv', separator=',', append=True)
    checkpoint = ModelCheckpoint(
        filepath='icdar_2015_2013_{epoch:02d}_{loss:.4f}_{val_loss:.4f}.h5',
        monitor='val_loss', verbose=1, save_best_only=False,
        save_weights_only=False, mode='auto', period=1)
    terminate_on_nan = TerminateOnNaN()
    callbacks = [csv_logger, checkpoint, terminate_on_nan]

    opt = AdamW(FLAGS.init_learning_rate)
    model.compile(
        loss=[dice_loss(east.text_region_boundary_mask, score_map_loss_weight),
              rbox_loss(east.target_score_map)],
        loss_weights=[1., 1.],
        optimizer=opt)

    initial_epoch = 0
    history = model.fit_generator(
        train_data_generator,
        epochs=FLAGS.max_epochs,
        steps_per_epoch=train_dataset_size // FLAGS.batch_size,
        validation_data=val_data_generator,
        validation_steps=val_dataset_size // FLAGS.batch_size,
        callbacks=callbacks,
        initial_epoch=initial_epoch,
        verbose=1)
def main(argv=None):
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu_list

    # check if checkpoint path exists
    if not os.path.exists(FLAGS.checkpoint_path):
        os.mkdir(FLAGS.checkpoint_path)
    else:
        # if not FLAGS.restore:
        #     shutil.rmtree(FLAGS.checkpoint_path)
        #     os.mkdir(FLAGS.checkpoint_path)
        shutil.rmtree(FLAGS.checkpoint_path)
        os.mkdir(FLAGS.checkpoint_path)

    train_data_generator = data_processor.generator(FLAGS)
    train_samples_count = data_processor.count_samples(FLAGS)
    val_data = data_processor.load_data(FLAGS)

    if len(gpus) <= 1:
        print('Training with 1 GPU')
        if FLAGS.drn:
            east = EAST_DRN_model(input_size=FLAGS.input_size)
        else:
            east = EAST_model(FLAGS.input_size)
        parallel_model = east.model
    else:
        print('Training with %d GPUs' % len(gpus))
        with tf.device("/cpu:0"):
            east = EAST_model(FLAGS.input_size)
            if FLAGS.restore_model != '':
                east.model.load_weights(FLAGS.restore_model)
        parallel_model = multi_gpu_model(east.model, gpus=len(gpus))

    score_map_loss_weight = K.variable(0.01, name='score_map_loss_weight')
    small_text_weight = K.variable(0., name='small_text_weight')

    lr_scheduler = LearningRateScheduler(lr_decay)
    ckpt = CustomModelCheckpoint(model=east.model,
                                 path=FLAGS.checkpoint_path + '/model-{epoch:02d}.h5',
                                 period=FLAGS.save_checkpoint_epochs,
                                 save_weights_only=True)
    tb = CustomTensorBoard(log_dir=FLAGS.checkpoint_path + '/train',
                           score_map_loss_weight=score_map_loss_weight,
                           small_text_weight=small_text_weight,
                           data_generator=train_data_generator,
                           write_graph=True)
    small_text_weight_callback = SmallTextWeight(small_text_weight)
    validation_evaluator = ValidationEvaluator(val_data,
                                               validation_log_dir=FLAGS.checkpoint_path + '/val')
    callbacks = [lr_scheduler, ckpt, tb, small_text_weight_callback, validation_evaluator]

    opt = AdamW(FLAGS.init_learning_rate)
    parallel_model.compile(
        loss=[dice_loss(east.overly_small_text_region_training_mask,
                        east.text_region_boundary_training_mask,
                        score_map_loss_weight, small_text_weight),
              rbox_loss(east.overly_small_text_region_training_mask,
                        east.text_region_boundary_training_mask,
                        small_text_weight, east.target_score_map)],
        loss_weights=[1., 1.],
        optimizer=opt)

    east.model.summary()

    model_json = east.model.to_json()
    with open(FLAGS.checkpoint_path + '/model.json', 'w') as json_file:
        json_file.write(model_json)

    history = parallel_model.fit_generator(
        train_data_generator,
        epochs=FLAGS.max_epochs,
        steps_per_epoch=train_samples_count // FLAGS.batch_size,
        workers=FLAGS.nb_workers,
        use_multiprocessing=True,
        callbacks=callbacks,
        verbose=1)
def pretrain_generator(netG, module, param, batch_size):
    outel = list(netG._modules.values())[-1].weight.shape[0]
    dwidth = min(1024, outel)
    netD = make_netD(dwidth, batch_size)
    print(f"Layer size: {outel}, G params: {param_count(netG)}, D params: {param_count(netD)}")
    optimG = AdamW(netG.parameters(), lr=5e-4, weight_decay=1e-4)
    optimD = AdamW(netD.parameters(), lr=5e-5, weight_decay=1e-4)
    i = 0
    d_adv_meter = AverageMeter()
    while True:
        netG.zero_grad()
        netD.zero_grad()
        z = fast_randn((batch_size, 256), requires_grad=True, device=device)
        q = netG(z)

        # Critic update: real noise codes vs. generated codes, plus gradient penalty
        free_params([netD])
        freeze_params([netG])
        noise = codes_with_dropout(generate_noise(module, param, batch_size), dwidth)
        codes = codes_with_dropout(q, dwidth)
        d_real = netD(noise)
        d_fake = netD(codes)
        interp = random_interpolate(noise, codes, device=device)
        gp = calc_gradient_penalty(netD, interp, device=device)
        d_adv = d_fake.mean() - d_real.mean()
        d_loss = d_adv + 10 * gp
        d_adv_meter.update(d_adv.item())
        d_loss.backward(retain_graph=True)
        optimD.step()

        # Generator update: push fake codes toward the critic's "real" region
        freeze_params([netD])
        free_params([netG])
        d_fake_loss = -d_fake.mean()
        d_fake_loss.backward()
        optimG.step()

        if i % 50 == 0:
            print(d_adv_meter.avg, gp.item())
            if i > 2000 and d_adv_meter.avg > 0:
                break
            d_adv_meter.reset()
        i += 1
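# calc_gradient_penalty() and random_interpolate() above are project helpers that are not
# shown in this snippet. As a point of reference, a minimal sketch of the textbook WGAN-GP
# penalty is given below; it assumes 2-D code tensors of shape (batch, dim) and a critic
# that returns one score per sample, and it is not the project's actual implementation.
import torch

def wgan_gradient_penalty(critic, real, fake, device):
    """Penalize the critic's gradient norm on random interpolates deviating from 1."""
    alpha = torch.rand(real.size(0), 1, device=device)          # per-sample mixing weight
    x_hat = (alpha * real + (1 - alpha) * fake).detach().requires_grad_(True)
    d_out = critic(x_hat)
    grads = torch.autograd.grad(outputs=d_out, inputs=x_hat,
                                grad_outputs=torch.ones_like(d_out),
                                create_graph=True)[0]
    grads = grads.view(grads.size(0), -1)
    return ((grads.norm(2, dim=1) - 1.0) ** 2).mean()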
def main():
    train_data_generator = DataGenerator(input_size=FLAGS.input_size,
                                         batch_size=FLAGS.batch_size,
                                         data_path=FLAGS.training_data_path,
                                         FLAGS=FLAGS,
                                         is_train=True)
    train_samples_count = len(train_data_generator.image_paths)
    validation_data_generator = DataGenerator(input_size=FLAGS.input_size,
                                              batch_size=FLAGS.batch_size,
                                              data_path=FLAGS.validation_data_path,
                                              FLAGS=FLAGS,
                                              is_train=False)

    east = EastModel(FLAGS.input_size)
    if FLAGS.pretrained_weights_path != '':
        print(f'Loading pre-trained model at {FLAGS.pretrained_weights_path}')
        east.model.load_weights(FLAGS.pretrained_weights_path)

    score_map_loss_weight = K.variable(0.01, name='score_map_loss_weight')
    small_text_weight = K.variable(0., name='small_text_weight')

    opt = AdamW(FLAGS.init_learning_rate)
    east.model.compile(
        loss=[
            dice_loss(east.overly_small_text_region_training_mask,
                      east.text_region_boundary_training_mask,
                      score_map_loss_weight, small_text_weight),
            rbox_loss(east.overly_small_text_region_training_mask,
                      east.text_region_boundary_training_mask,
                      small_text_weight, east.target_score_map)
        ],
        loss_weights=[1., 1.],
        optimizer=opt,
    )

    tb_callback = tensorboard_callback()
    cp_callback = checkpoint_callback()

    with open(os.path.join(FLAGS.checkpoint_path, 'model.json'), 'w') as json_file:
        json_file.write(east.model.to_json())

    east.model.fit_generator(
        generator=train_data_generator,
        epochs=FLAGS.max_epochs,
        steps_per_epoch=train_samples_count // FLAGS.batch_size,
        validation_data=validation_data_generator,
        callbacks=[cp_callback, tb_callback],
        workers=FLAGS.nb_workers,
        use_multiprocessing=True,
        max_queue_size=10,
        verbose=1,
    )
                        batch_size=1, shuffle=False)
test_set = RealDataset(opt.real_path, opt.channels, split='test')
test_loader = DataLoader(dataset=test_set, num_workers=0,
                         batch_size=1, shuffle=False)

opt.n_classes = train_set.n_classes
net = PowderNet(opt.arch, opt.n_channels, train_set.n_classes)
net = net.cuda()
optimizer = AdamW([{'params': get_1x_lr_params(net)},
                   {'params': get_10x_lr_params(net), 'lr': opt.lr * 10}],
                  lr=opt.lr, weight_decay=opt.decay)
scheduler = CosineLRWithRestarts(optimizer, opt.batch_size, len(train_set),
                                 opt.period, opt.t_mult)
vis = Visualizer(server=opt.server, env=opt.env)

start_epoch = 0
if opt.resume is not None:
    checkpoint = torch.load(opt.resume)
    old_opt = checkpoint['opt']
    assert (old_opt.channels == opt.channels)
    assert (old_opt.bands == opt.bands)
    assert (old_opt.arch == opt.arch)
    assert (old_opt.blend == opt.blend)
    assert (old_opt.lr == opt.lr)
def main(config): seed_all() os.makedirs('cache', exist_ok=True) os.makedirs(config.logdir, exist_ok=True) print("Logging to: %s" % config.logdir) src_files = sorted(glob('*.py')) for src_fn in src_files: dst_fn = os.path.join(config.logdir, src_fn) copyfile(src_fn, dst_fn) train_image_fns = sorted(glob(os.path.join(config.train_dir, '*.jpg'))) test_image_fns = sorted(glob(os.path.join(config.test_dir, '*.jpg'))) assert len(train_image_fns) == 3881 assert len(test_image_fns) == 4150 gt, label_to_int = load_gt(config.train_rle) int_to_label = {v: k for k, v in label_to_int.items()} # create folds np.random.shuffle(train_image_fns) if config.subset > 0: train_image_fns = train_image_fns[:config.subset] folds = np.arange(len(train_image_fns)) % config.num_folds val_image_fns = [ fn for k, fn in enumerate(train_image_fns) if folds[k] == config.fold ] train_image_fns = [ fn for k, fn in enumerate(train_image_fns) if folds[k] != config.fold ] if config.add_val: print("Training on validation set") train_image_fns = train_image_fns + val_image_fns[:] print(len(val_image_fns), len(train_image_fns)) # TODO: drop empty images <- is this helpful? train_image_fns = [ fn for fn in train_image_fns if KuzushijiDataset.fn_to_id(fn) in gt ] val_image_fns = [ fn for fn in val_image_fns if KuzushijiDataset.fn_to_id(fn) in gt ] print("VAL: ", len(val_image_fns), val_image_fns[123]) print("TRAIN: ", len(train_image_fns), train_image_fns[456]) train_ds = KuzushijiDataset(train_image_fns, gt_boxes=gt, label_to_int=label_to_int, augment=True) val_ds = KuzushijiDataset(val_image_fns, gt_boxes=gt, label_to_int=label_to_int) if config.cache: train_ds.cache() val_ds.cache() val_loader = data.DataLoader(val_ds, batch_size=config.batch_size // 8, shuffle=False, num_workers=config.num_workers, pin_memory=config.pin, drop_last=False) model = FPNSegmentation(config.slug) if config.weight is not None: print("Loading: %s" % config.weight) model.load_state_dict(th.load(config.weight)) model = model.to(config.device) no_decay = ['mean', 'std', 'bias'] + ['.bn%d.' 
% i for i in range(100)] grouped_parameters = [{ 'params': [], 'weight_decay': config.weight_decay }, { 'params': [], 'weight_decay': 0.0 }] for n, p in model.named_parameters(): if not any(nd in n for nd in no_decay): # print("Decay: %s" % n) grouped_parameters[0]['params'].append(p) else: # print("No Decay: %s" % n) grouped_parameters[1]['params'].append(p) optimizer = AdamW(grouped_parameters, lr=config.lr) if config.apex: model, optimizer = apex.amp.initialize(model, optimizer, opt_level="O1", verbosity=0) updates_per_epoch = len(train_ds) // config.batch_size num_updates = int(config.epochs * updates_per_epoch) scheduler = WarmupLinearSchedule(warmup=config.warmup, t_total=num_updates) # training loop smooth = 0.1 best_acc = 0.0 best_fn = None global_step = 0 for epoch in range(1, config.epochs + 1): smooth_loss = None smooth_accuracy = None model.train() train_loader = data.DataLoader(train_ds, batch_size=config.batch_size, shuffle=True, num_workers=config.num_workers, pin_memory=config.pin, drop_last=True) progress = tqdm(total=len(train_ds), smoothing=0.01) if True: for i, (X, fns, hm, centers, classes) in enumerate(train_loader): X = X.to(config.device).float() hm = hm.to(config.device) centers = centers.to(config.device) classes = classes.to(config.device) hm_pred, classes_pred = model(X, centers=centers) loss = kuzushiji_loss(hm, centers, classes, hm_pred, classes_pred) if config.apex: with apex.amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() lr_this_step = None if (i + 1) % config.accumulation_step == 0: optimizer.step() optimizer.zero_grad() lr_this_step = config.lr * scheduler.get_lr( global_step, config.warmup) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step global_step += 1 smooth_loss = loss.item() if smooth_loss is None else \ smooth * loss.item() + (1. - smooth) * smooth_loss # print((y_true >= 0.5).sum().item()) accuracy = th.mean( ((th.sigmoid(hm_pred) >= 0.5) == (hm == 1)).to( th.float)).item() smooth_accuracy = accuracy if smooth_accuracy is None else \ smooth * accuracy + (1. 
- smooth) * smooth_accuracy progress.set_postfix( ep='%d/%d' % (epoch, config.epochs), loss='%.4f' % smooth_loss, accuracy='%.4f' % (smooth_accuracy), lr='%.6f' % (config.lr if lr_this_step is None else lr_this_step)) progress.update(len(X)) # skip validation if epoch not in [10, 20, 30, 40, 50]: if 1 < epoch <= 65: continue # validation loop model.eval() progress = tqdm(enumerate(val_loader), total=len(val_loader)) hm_correct, classes_correct = 0, 0 num_hm, num_classes = 0, 0 with th.no_grad(): for i, (X, fns, hm, centers, classes) in progress: X = X.to(config.device).float() hm = hm.cuda() centers = centers.cuda() classes = classes.cuda() hm_pred, classes_pred = model(X) hm_pred = th.sigmoid(hm_pred) classes_pred = th.nn.functional.softmax(classes_pred, 1) hm_cuda = hm.cuda() # PyTorch 1.2 has `bool` if hasattr(hm_cuda, 'bool'): hm_cuda = hm_cuda.bool() hm_correct += (hm_cuda == (hm_pred >= 0.5)).float().sum().item() num_hm += np.prod(hm.shape) num_samples = len(X) for sample_ind in range(num_samples): center_mask = centers[sample_ind, :, 0] != -1 per_image_letters = center_mask.sum().item() if per_image_letters == 0: continue num_classes += per_image_letters centers_per_img = centers[sample_ind][center_mask] classes_per_img = classes[sample_ind][center_mask] classes_per_img_pred = classes_pred[ sample_ind][:, centers_per_img[:, 1], centers_per_img[:, 0]].argmax(0) classes_correct += ( classes_per_img_pred == classes_per_img).sum().item() num_classes += per_image_letters val_hm_acc = hm_correct / num_hm val_classes_acc = classes_correct / num_classes summary_str = 'f%02d-ep-%04d-val_hm_acc-%.4f-val_classes_acc-%.4f' % ( config.fold, epoch, val_hm_acc, val_classes_acc) progress.write(summary_str) if val_classes_acc >= best_acc: weight_fn = os.path.join(config.logdir, summary_str + '.pth') progress.write("New best: %s" % weight_fn) th.save(model.state_dict(), weight_fn) best_acc = val_classes_acc best_fn = weight_fn fns = sorted( glob(os.path.join(config.logdir, 'f%02d-*.pth' % config.fold))) for fn in fns[:-config.n_keep]: os.remove(fn) # create submission test_ds = KuzushijiDataset(test_image_fns) test_loader = data.DataLoader(test_ds, batch_size=config.batch_size // 8, shuffle=False, num_workers=config.num_workers, pin_memory=False, drop_last=False) if best_fn is not None: model.load_state_dict(th.load(best_fn)) model.eval() sub = create_submission(model, test_loader, int_to_label, config, pred_zip=config.pred_zip) sub.to_csv(config.submission_fn, index=False) print("Wrote to: %s" % config.submission_fn) # create val submission val_fn = config.submission_fn.replace('.csv', '_VAL.csv') model.eval() sub = [] sub = create_submission(model, val_loader, int_to_label, config, pred_zip=config.pred_zip.replace( '.zip', '_VAL.zip')) sub.to_csv(val_fn, index=False) print("Wrote to: %s" % val_fn)
def main_worker(gpu, parallel, args, result_dir): if parallel: args.rank = args.rank + gpu torch.distributed.init_process_group(backend='nccl', init_method=args.dist_url, world_size=args.world_size, rank=args.rank) torch.backends.cudnn.benchmark = True random_seed(args.seed + args.rank) # make data aug different for different processes torch.cuda.set_device(gpu) assert args.batch_size % args.world_size == 0 from dataset import load_data, get_statistics, default_eps, input_dim train_loader, test_loader = load_data(args.dataset, 'data/', args.batch_size // args.world_size, parallel, augmentation=True, classes=None) mean, std = get_statistics(args.dataset) num_classes = len(train_loader.dataset.classes) from model.bound_module import Predictor, BoundFinalIdentity from model.mlp import MLPFeature, MLP from model.conv import ConvFeature, Conv model_name, params = parse_function_call(args.model) if args.predictor_hidden_size > 0: model = locals()[model_name](input_dim=input_dim[args.dataset], **params) predictor = Predictor(model.out_features, args.predictor_hidden_size, num_classes) else: model = locals()[model_name](input_dim=input_dim[args.dataset], num_classes=num_classes, **params) predictor = BoundFinalIdentity() model = Model(model, predictor, eps=0) model = model.cuda(gpu) if parallel: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu]) loss_name, params = parse_function_call(args.loss) loss = Loss(globals()[loss_name](**params), args.kappa) output_flag = not parallel or gpu == 0 if output_flag: logger = Logger(os.path.join(result_dir, 'log.txt')) for arg in vars(args): logger.print(arg, '=', getattr(args, arg)) logger.print(train_loader.dataset.transform) logger.print(model) logger.print('number of params: ', sum([p.numel() for p in model.parameters()])) logger.print('Using loss', loss) train_logger = TableLogger(os.path.join(result_dir, 'train.log'), ['epoch', 'loss', 'acc']) test_logger = TableLogger(os.path.join(result_dir, 'test.log'), ['epoch', 'loss', 'acc']) else: logger = train_logger = test_logger = None optimizer = AdamW(model, lr=args.lr, weight_decay=args.wd, betas=(args.beta1, args.beta2), eps=args.epsilon) if args.checkpoint: assert os.path.isfile(args.checkpoint) if parallel: torch.distributed.barrier() checkpoint = torch.load( args.checkpoint, map_location=lambda storage, loc: storage.cuda(gpu)) state_dict = checkpoint['state_dict'] if next(iter(state_dict))[0:7] == 'module.' and not parallel: new_state_dict = OrderedDict([(k[7:], v) for k, v in state_dict.items()]) state_dict = new_state_dict elif next(iter(state_dict))[0:7] != 'module.' and parallel: new_state_dict = OrderedDict([('module.' 
+ k, v) for k, v in state_dict.items()]) state_dict = new_state_dict model.load_state_dict(state_dict) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded '{}'".format(args.checkpoint)) if parallel: torch.distributed.barrier() if args.eps_test is None: args.eps_test = default_eps[args.dataset] if args.eps_train is None: args.eps_train = args.eps_test args.eps_train /= std args.eps_test /= std up = torch.FloatTensor((1 - mean) / std).view(-1, 1, 1).cuda(gpu) down = torch.FloatTensor((0 - mean) / std).view(-1, 1, 1).cuda(gpu) attacker = AttackPGD(model, args.eps_test, step_size=args.eps_test / 4, num_steps=20, up=up, down=down) args.epochs = [int(epoch) for epoch in args.epochs.split(',')] schedule = create_schedule(args, len(train_loader), model, loss, optimizer) if args.visualize and output_flag: from torch.utils.tensorboard import SummaryWriter writer = SummaryWriter(result_dir) else: writer = None for epoch in range(args.start_epoch, args.epochs[-1]): if parallel: train_loader.sampler.set_epoch(epoch) train_loss, train_acc = train(model, loss, epoch, train_loader, optimizer, schedule, logger, train_logger, gpu, parallel, args.print_freq) test_loss, test_acc = test(model, loss, epoch, test_loader, logger, test_logger, gpu, parallel, args.print_freq) if writer is not None: writer.add_scalar('curve/p', get_p_norm(model), epoch) writer.add_scalar('curve/train loss', train_loss, epoch) writer.add_scalar('curve/test loss', test_loss, epoch) writer.add_scalar('curve/train acc', train_acc, epoch) writer.add_scalar('curve/test acc', test_acc, epoch) if epoch % 50 == 49: if logger is not None: logger.print( 'Generate adversarial examples on training dataset and test dataset (fast, inaccurate)' ) robust_train_acc = gen_adv_examples(model, attacker, train_loader, gpu, parallel, logger, fast=True) robust_test_acc = gen_adv_examples(model, attacker, test_loader, gpu, parallel, logger, fast=True) if writer is not None: writer.add_scalar('curve/robust train acc', robust_train_acc, epoch) writer.add_scalar('curve/robust test acc', robust_test_acc, epoch) if epoch % 5 == 4: certified_acc = certified_test(model, args.eps_test, up, down, epoch, test_loader, logger, gpu, parallel) if writer is not None: writer.add_scalar('curve/certified acc', certified_acc, epoch) if epoch > args.epochs[-1] - 3: if logger is not None: logger.print("Generate adversarial examples on test dataset") gen_adv_examples(model, attacker, test_loader, gpu, parallel, logger) certified_test(model, args.eps_test, up, down, epoch, test_loader, logger, gpu, parallel) schedule(args.epochs[-1], 0) if output_flag: logger.print( "Calculate certified accuracy on training dataset and test dataset" ) certified_test(model, args.eps_test, up, down, args.epochs[-1], train_loader, logger, gpu, parallel) certified_test(model, args.eps_test, up, down, args.epochs[-1], test_loader, logger, gpu, parallel) if output_flag: torch.save( { 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), }, os.path.join(result_dir, 'model.pth')) if writer is not None: writer.close()
Path(opt.out_path).mkdir(parents=True, exist_ok=True)

train_set = HalfHalfDataset(opt.real_path, opt.syn_path, opt.params_path,
                            opt.blend, opt.channels, opt.split)
train_loader = DataLoader(dataset=train_set, num_workers=opt.threads,
                          batch_size=opt.batch_size, shuffle=True, pin_memory=True)
val_set = RealDataset(opt.real_path, opt.channels, split='val')
val_loader = DataLoader(dataset=val_set, num_workers=0, batch_size=1, shuffle=False)
test_set = RealDataset(opt.real_path, opt.channels, split='test')
test_loader = DataLoader(dataset=test_set, num_workers=0, batch_size=1, shuffle=False)

opt.n_classes = train_set.n_classes
net = PowderNet(opt.arch, opt.n_channels, train_set.n_classes)
net = net.cuda()
optimizer = AdamW([{'params': get_1x_lr_params(net)},
                   {'params': get_10x_lr_params(net), 'lr': opt.lr * 10}],
                  lr=opt.lr, weight_decay=opt.decay)
scheduler = CosineLRWithRestarts(optimizer, opt.batch_size, len(train_set),
                                 opt.period, opt.t_mult)
vis = Visualizer(server=opt.server, env=opt.env)

start_epoch = 0
if opt.resume is not None:
    checkpoint = torch.load(opt.resume)
    old_opt = checkpoint['opt']
    assert (old_opt.channels == opt.channels)
    assert (old_opt.bands == opt.bands)
    assert (old_opt.arch == opt.arch)
    assert (old_opt.blend == opt.blend)
    assert (old_opt.lr == opt.lr)
    assert (old_opt.decay == opt.decay)
    assert (old_opt.period == opt.period)
    assert (old_opt.t_mult == opt.t_mult)
    net.load_state_dict(checkpoint['state_dict'])
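# The AdamW call above builds two parameter groups so backbone and head layers can use
# different learning rates. A minimal, self-contained sketch of the same pattern follows;
# TinyNet, the module names, and the 10x factor are illustrative assumptions, not part of
# the original script, and torch.optim.AdamW stands in for whatever AdamW class it imports.
import torch.nn as nn
from torch.optim import AdamW as TorchAdamW

class TinyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = nn.Linear(16, 8)  # pretrained-style part: keeps the base lr
        self.head = nn.Linear(8, 2)       # freshly initialized part: larger lr

    def forward(self, x):
        return self.head(self.backbone(x))

toy_net = TinyNet()
base_lr = 1e-4
toy_optimizer = TorchAdamW(
    [{'params': toy_net.backbone.parameters()},                   # uses the default lr
     {'params': toy_net.head.parameters(), 'lr': base_lr * 10}],  # per-group override
    lr=base_lr, weight_decay=1e-4)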
def main(): print(args) if not osp.exists(args.dir): os.makedirs(args.dir) if args.use_gpu: torch.cuda.set_device(args.gpu) cudnn.enabled = True cudnn.benchmark = True if args.manualSeed is None: args.manualSeed = random.randint(1, 10000) np.random.seed(args.manualSeed) labeled_size = args.label_num + args.val_num num_classes = 10 data_dir = '../cifar10_data/' normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2470, 0.2435, 0.2616]) # transform is implemented inside zca dataloader dataloader = cifar.CIFAR10 if args.auto: transform_train = transforms.Compose([ transforms.RandomCrop( 32, padding=4, fill=128 ), # fill parameter needs torchvision installed from source transforms.RandomHorizontalFlip(), CIFAR10Policy(), transforms.ToTensor(), Cutout( n_holes=1, length=16 ), # (https://github.com/uoguelph-mlrg/Cutout/blob/master/util/cutout.py) normalize ]) else: transform_train = transforms.Compose([ transforms.RandomCrop( 32, padding=4, fill=128 ), # fill parameter needs torchvision installed from source transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize ]) transform_test = transforms.Compose([transforms.ToTensor(), normalize]) base_dataset = datasets.CIFAR10(data_dir, train=True, download=True) train_labeled_idxs, train_unlabeled_idxs, val_idxs = train_val_split( base_dataset.targets, int(args.label_num / 10)) labelset = CIFAR10_labeled(data_dir, train_labeled_idxs, train=True, transform=transform_train) labelset2 = CIFAR10_labeled(data_dir, train_labeled_idxs, train=True, transform=transform_test) unlabelset = CIFAR10_labeled(data_dir, train_unlabeled_idxs, train=True, transform=transform_train) unlabelset2 = CIFAR10_labeled(data_dir, train_unlabeled_idxs, train=True, transform=transform_test) validset = CIFAR10_labeled(data_dir, val_idxs, train=True, transform=transform_test) testset = CIFAR10_labeled(data_dir, train=False, transform=transform_test) label_y = np.array(labelset.targets).astype(np.int32) unlabel_y = np.array(unlabelset.targets).astype(np.int32) unlabel_num = unlabel_y.shape[0] label_loader = torch.utils.data.DataLoader(labelset, batch_size=args.batch_size, num_workers=args.num_workers, pin_memory=True, drop_last=True) label_loader2 = torch.utils.data.DataLoader( labelset2, batch_size=args.eval_batch_size, num_workers=args.num_workers, pin_memory=True) unlabel_loader = torch.utils.data.DataLoader( unlabelset, batch_size=args.eval_batch_size, num_workers=args.num_workers, pin_memory=True) unlabel_loader2 = torch.utils.data.DataLoader( unlabelset2, batch_size=args.eval_batch_size, num_workers=args.num_workers, pin_memory=True) validloader = torch.utils.data.DataLoader(validset, batch_size=args.eval_batch_size, num_workers=args.num_workers, pin_memory=True) testloader = torch.utils.data.DataLoader(testset, batch_size=args.eval_batch_size, num_workers=args.num_workers, pin_memory=True) #initialize models model1 = create_model(args.num_classes, args.model) model2 = create_model(args.num_classes, args.model) ema_model = create_model(args.num_classes, args.model) if args.use_gpu: model1 = model1.cuda() model2 = model2.cuda() ema_model = ema_model.cuda() for param in ema_model.parameters(): param.detach_() df = pd.DataFrame() stats_path = osp.join(args.dir, 'stats.txt') '''if prop > args.scale: prop = args.scale''' optimizer1 = AdamW(model1.parameters(), lr=args.lr) if args.init1 and osp.exists(args.init1): model1.load_state_dict( torch.load(args.init1, map_location='cuda:{}'.format(args.gpu))) ema_optimizer = WeightEMA(model1, ema_model, 
alpha=args.ema_decay) if args.init and osp.exists(args.init): model1.load_state_dict( torch.load(args.init, map_location='cuda:{}'.format(args.gpu))) _, best_acc = evaluate(validloader, ema_model, prefix='val') best_ema_path = osp.join(args.dir, 'best_ema.pth') best_model1_path = osp.join(args.dir, 'best_model1.pth') best_model2_path = osp.join(args.dir, 'best_model2.pth') init_path = osp.join(args.dir, 'init_ema.pth') init_path1 = osp.join(args.dir, 'init1.pth') init_path2 = osp.join(args.dir, 'init2.pth') torch.save(ema_model.state_dict(), init_path) torch.save(model1.state_dict(), init_path1) torch.save(model2.state_dict(), init_path2) torch.save(ema_model.state_dict(), best_ema_path) torch.save(model1.state_dict(), best_model1_path) skip_model2 = False end_iter = False confident_indices = np.array([], dtype=np.int64) all_indices = np.arange(unlabel_num).astype(np.int64) #no_help_indices = np.array([]).astype(np.int64) pseudo_labels = np.zeros(all_indices.shape, dtype=np.int32) steps_per_epoch = len(iter(label_loader)) max_epoch = args.steps // steps_per_epoch logger = logging.getLogger('init') file_handler = logging.FileHandler(osp.join(args.dir, 'init.txt')) logger.addHandler(file_handler) logger.setLevel(logging.INFO) for epoch in range(max_epoch * 4 // 5): if args.mix: train_init_mix(label_loader, model1, optimizer1, ema_optimizer, steps_per_epoch, epoch, logger=logger) else: train_init(label_loader, model1, optimizer1, ema_optimizer, steps_per_epoch, epoch, logger=logger) if epoch % 10 == 0: val_loss, val_acc = evaluate(validloader, ema_model, logger, 'valid') if val_acc >= best_acc: best_acc = val_acc evaluate(testloader, ema_model, logger, 'test') torch.save(ema_model.state_dict(), init_path) torch.save(model1.state_dict(), init_path1) adjust_learning_rate_adam(optimizer1, args.lr * 0.2) for epoch in range(max_epoch // 5): if args.mix: train_init_mix(label_loader, model1, optimizer1, ema_optimizer, steps_per_epoch, epoch, logger=logger) else: train_init(label_loader, model1, optimizer1, ema_optimizer, steps_per_epoch, epoch, logger=logger) if epoch % 10 == 0: val_loss, val_acc = evaluate(validloader, ema_model, logger, 'valid') if val_acc >= best_acc: best_acc = val_acc evaluate(testloader, ema_model, logger, 'test') torch.save(ema_model.state_dict(), init_path) torch.save(model1.state_dict(), init_path1) ema_model.load_state_dict(torch.load(init_path)) model1.load_state_dict(torch.load(init_path1)) logger.info('init train finished') evaluate(validloader, ema_model, logger, 'valid') evaluate(testloader, ema_model, logger, 'test') for i_round in range(args.round): mask = np.zeros(all_indices.shape, dtype=bool) mask[confident_indices] = True other_indices = all_indices[~mask] optimizer2 = AdamW(model2.parameters(), lr=args.lr) logger = logging.getLogger('model2_round_{}'.format(i_round)) file_handler = logging.FileHandler( osp.join(args.dir, 'model2_round_{}.txt'.format(i_round))) logger.addHandler(file_handler) logger.setLevel(logging.INFO) if args.auto: probs = predict_probs(ema_model, unlabel_loader2) else: probs = np.zeros((unlabel_num, args.num_classes)) for i in range(args.K): probs += predict_probs(ema_model, unlabel_loader) probs /= args.K pseudo_labels[other_indices] = probs.argmax(axis=1).astype( np.int32)[other_indices] #pseudo_labels = probs.argmax(axis=1).astype(np.int32) df2 = create_basic_stats_dataframe() df2['iter'] = i_round df2['train_acc'] = accuracy_score(unlabel_y, pseudo_labels) df = df.append(df2, ignore_index=True) df.to_csv(stats_path, index=False) 
#phase2: train model2 unlabelset.targets = pseudo_labels.copy() trainset = ConcatDataset([labelset, unlabelset]) trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size2, num_workers=args.num_workers, pin_memory=True, shuffle=True) model2.load_state_dict(torch.load(init_path2)) best_val_epoch = 0 best_model2_acc = 0 steps_per_epoch = len(iter(trainloader)) max_epoch2 = args.steps2 // steps_per_epoch for epoch in range(max_epoch2): train_model2(trainloader, model2, optimizer2, epoch, logger=logger) val_loss, val_acc = evaluate(validloader, model2, logger, 'val') if val_acc >= best_model2_acc: best_model2_acc = val_acc best_val_epoch = epoch torch.save(model2.state_dict(), best_model2_path) evaluate(testloader, model2, logger, 'test') if (epoch - best_val_epoch) * steps_per_epoch > args.stop_steps2: break df.loc[df['iter'] == i_round, 'valid_acc'] = best_model2_acc df.loc[df['iter'] == i_round, 'valid_epoch'] = best_val_epoch df.to_csv(stats_path, index=False) model2.load_state_dict(torch.load(best_model2_path)) logger.info('model2 train finished') evaluate(trainloader, model2, logger, 'train') evaluate(validloader, model2, logger, 'val') evaluate(label_loader2, model2, logger, 'reward') evaluate(testloader, model2, logger, 'test') #phase3: get confidence of unlabeled data by labeled data, split confident and unconfident data '''if args.auto: probs = predict_probs(model2,unlabel_loader2) else: probs = np.zeros((unlabel_num,args.num_classes)) for i in range(args.K): probs += predict_probs(model2, unlabel_loader) probs /= args.K''' probs = predict_probs(model2, unlabel_loader2) new_pseudo_labels = probs.argmax(axis=1) confidences = probs[all_indices, pseudo_labels] if args.schedule == 'exp': confident_num = int((len(confident_indices) + args.label_num) * (1 + args.scale)) - args.label_num elif args.schedule == 'linear': confident_num = len(confident_indices) + int( unlabel_num * args.scale) old_confident_indices = confident_indices.copy() confident_indices = np.array([], dtype=np.int64) for j in range(args.num_classes): j_cands = (pseudo_labels == j) k_size = int(min(confident_num // args.num_classes, j_cands.sum())) logger.info('class: {}, confident size: {}'.format(j, k_size)) if k_size > 0: j_idx_top = all_indices[j_cands][ confidences[j_cands].argsort()[-k_size:]] confident_indices = np.concatenate( (confident_indices, all_indices[j_idx_top])) '''new_confident_indices = np.intersect1d(new_confident_indices, np.setdiff1d(new_confident_indices, no_help_indices)) new_confident_indices = new_confident_indices[(-confidences[new_confident_indices]).argsort()] confident_indices = np.concatenate((old_confident_indices, new_confident_indices))''' acc = accuracy_score(unlabel_y[confident_indices], pseudo_labels[confident_indices]) logger.info('confident data num:{}, prop: {:4f}, acc: {:4f}'.format( len(confident_indices), len(confident_indices) / len(unlabel_y), acc)) '''if len(old_confident_indices) > 0: acc = accuracy_score(unlabel_y[old_confident_indices],pseudo_labels[old_confident_indices]) logger.info('old confident data prop: {:4f}, acc: {:4f}'.format(len(old_confident_indices)/len(unlabel_y), acc)) acc = accuracy_score(unlabel_y[new_confident_indices],pseudo_labels[new_confident_indices]) logger.info('new confident data prop: {:4f}, acc: {:4f}'.format(len(new_confident_indices)/len(unlabel_y), acc))''' #unlabelset.train_labels_ul = pseudo_labels.copy() confident_dataset = torch.utils.data.Subset(unlabelset, confident_indices) #phase4: refine model1 by confident data 
and reward data #train_dataset = torch.utils.data.ConcatDataset([confident_dataset,labelset]) logger = logging.getLogger('model1_round_{}'.format(i_round)) file_handler = logging.FileHandler( osp.join(args.dir, 'model1_round_{}.txt'.format(i_round))) logger.addHandler(file_handler) logger.setLevel(logging.INFO) best_val_epoch = 0 evaluate(validloader, ema_model, logger, 'valid') evaluate(testloader, ema_model, logger, 'test') optimizer1 = AdamW(model1.parameters(), lr=args.lr) confident_dataset = torch.utils.data.Subset(unlabelset, confident_indices) trainloader = torch.utils.data.DataLoader(confident_dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True, drop_last=True) #steps_per_epoch = len(iter(trainloader)) steps_per_epoch = 200 max_epoch1 = args.steps1 // steps_per_epoch for epoch in range(max_epoch1): '''current_num = int(cal_consistency_weight( (epoch + 1) * steps_per_epoch, init_ep=0, end_ep=args.stop_steps1//2, init_w=start_num, end_w=end_num)) current_confident_indices = confident_indices[:current_num] logger.info('current num: {}'.format(current_num))''' if args.mix: train_model1_mix(label_loader, trainloader, model1, optimizer1, ema_model, ema_optimizer, steps_per_epoch, epoch, logger=logger) else: train_model1(label_loader, trainloader, model1, optimizer1, ema_model, ema_optimizer, steps_per_epoch, epoch, logger=logger) val_loss, val_acc = evaluate(validloader, ema_model, logger, 'valid') if val_acc >= best_acc: best_acc = val_acc best_val_epoch = epoch evaluate(testloader, ema_model, logger, 'test') torch.save(model1.state_dict(), best_model1_path) torch.save(ema_model.state_dict(), best_ema_path) if (epoch - best_val_epoch) * steps_per_epoch > args.stop_steps1: break ema_model.load_state_dict(torch.load(best_ema_path)) model1.load_state_dict(torch.load(best_model1_path)) logger.info('model1 train finished') evaluate(validloader, ema_model, logger, 'valid') evaluate(testloader, ema_model, logger, 'test') '''no_help_indices = np.concatenate((no_help_indices,confident_indices[current_num:])) confident_indices = confident_indices[:current_num]''' if len(confident_indices) >= len(all_indices): break
                               batch_size=batch_size, num_workers=12, shuffle=True,
                               pin_memory=True, drop_last=True)
val_data_loader = DataLoader(val_train, batch_size=val_batch_size, num_workers=12,
                             shuffle=False, pin_memory=False)

model = SeResNext50_Unet_9ch(pretrained=None)  # .cuda()
params = model.parameters()
optimizer = AdamW(params, lr=0.0002, weight_decay=4e-6)
scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[4, 8, 10], gamma=0.5)
model = nn.DataParallel(model).cuda()

snap_to_load = 'res50_9ch_{}_0_best'.format(seed)
print("=> loading checkpoint '{}'".format(snap_to_load))
checkpoint = torch.load(path.join(models_folder, snap_to_load), map_location='cpu')
loaded_dict = checkpoint['state_dict']
sd = model.state_dict()
for k in model.state_dict():
    if k in loaded_dict and sd[k].size() == loaded_dict[k].size():
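# The loop above is truncated, but it is the common "partial checkpoint" pattern: copy only
# the tensors whose names and shapes match the current model. A minimal sketch of that idea,
# assuming a generic nn.Module and an already-loaded `loaded_dict`, looks like this:
import torch.nn as nn

def load_matching_weights(model: nn.Module, loaded_dict: dict) -> int:
    """Copy checkpoint tensors into `model` where name and shape agree; return how many matched."""
    sd = model.state_dict()
    matched = 0
    for k, v in sd.items():
        if k in loaded_dict and loaded_dict[k].size() == v.size():
            sd[k] = loaded_dict[k]
            matched += 1
    model.load_state_dict(sd)  # strict load is fine: every key of `sd` is still present
    return matched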
Path(opt.out_path).mkdir(parents=True, exist_ok=True)

train_set = SyntheticDataset(opt.syn_path, opt.params_path, opt.blend, opt.channels)
train_loader = DataLoader(dataset=train_set, num_workers=opt.threads,
                          batch_size=opt.batch_size, shuffle=True, pin_memory=True)
val_set = RealDataset(opt.real_path, opt.channels, split='val')
val_loader = DataLoader(dataset=val_set, num_workers=0, batch_size=1, shuffle=False)
test_set = RealDataset(opt.real_path, opt.channels, split='test')
test_loader = DataLoader(dataset=test_set, num_workers=0, batch_size=1, shuffle=False)

opt.n_classes = train_set.n_classes
net = PowderNet(opt.arch, opt.n_channels, train_set.n_classes)
net = net.cuda()
optimizer = AdamW(net.parameters(), lr=opt.lr, weight_decay=opt.decay)
scheduler = CosineLRWithRestarts(optimizer, opt.batch_size, len(train_set),
                                 opt.period, opt.t_mult)
vis = Visualizer(server=opt.server, env=opt.env)

start_epoch = 0
if opt.resume is not None:
    checkpoint = torch.load(opt.resume)
    old_opt = checkpoint['opt']
    assert (old_opt.channels == opt.channels)
    assert (old_opt.bands == opt.bands)
    assert (old_opt.arch == opt.arch)
    assert (old_opt.blend == opt.blend)
    assert (old_opt.lr == opt.lr)
    assert (old_opt.decay == opt.decay)
    assert (old_opt.period == opt.period)
    assert (old_opt.t_mult == opt.t_mult)
def main():
    args = cfg.parse_args()
    torch.cuda.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    torch.backends.cudnn.deterministic = True

    # import network
    # args.gen_model is TransGAN_8_8_1 for example
    gen_net = eval('models.' + args.gen_model + '.Generator')(args=args).cuda()
    dis_net = eval('models.' + args.dis_model + '.Discriminator')(args=args).cuda()
    gen_net.set_arch(args.arch, cur_stage=2)

    # weight init: Xavier Uniform
    def weights_init(m):
        classname = m.__class__.__name__
        if classname.find('Conv2d') != -1:
            if args.init_type == 'normal':
                nn.init.normal_(m.weight.data, 0.0, 0.02)
            elif args.init_type == 'orth':
                nn.init.orthogonal_(m.weight.data)
            elif args.init_type == 'xavier_uniform':
                nn.init.xavier_uniform_(m.weight.data, 1.)
            else:
                raise NotImplementedError('{} unknown inital type'.format(args.init_type))
        elif classname.find('BatchNorm2d') != -1:
            nn.init.normal_(m.weight.data, 1.0, 0.02)
            nn.init.constant_(m.bias.data, 0.0)

    gen_net.apply(weights_init)
    dis_net.apply(weights_init)

    gpu_ids = [i for i in range(int(torch.cuda.device_count()))]
    gen_net = torch.nn.DataParallel(gen_net.to("cuda:0"), device_ids=gpu_ids)
    dis_net = torch.nn.DataParallel(dis_net.to("cuda:0"), device_ids=gpu_ids)
    # print(gen_net.module.cur_stage)

    if args.optimizer == "adam":
        gen_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, gen_net.parameters()),
                                         args.g_lr, (args.beta1, args.beta2))
        dis_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, dis_net.parameters()),
                                         args.d_lr, (args.beta1, args.beta2))
    elif args.optimizer == "adamw":
        gen_optimizer = AdamW(filter(lambda p: p.requires_grad, gen_net.parameters()),
                              args.g_lr, weight_decay=args.wd)
        dis_optimizer = AdamW(filter(lambda p: p.requires_grad, dis_net.parameters()),
                              args.g_lr, weight_decay=args.wd)
    gen_scheduler = LinearLrDecay(gen_optimizer, args.g_lr, 0.0, 0, args.max_iter * args.n_critic)
    dis_scheduler = LinearLrDecay(dis_optimizer, args.d_lr, 0.0, 0, args.max_iter * args.n_critic)

    # fid stat
    if args.dataset.lower() == 'cifar10':
        fid_stat = 'fid_stat/fid_stats_cifar10_train.npz'
    elif args.dataset.lower() == 'stl10':
        fid_stat = 'fid_stat/stl10_train_unlabeled_fid_stats_48.npz'
    elif args.fid_stat is not None:
        fid_stat = args.fid_stat
    else:
        raise NotImplementedError('no fid stat for {}'.format(args.dataset.lower()))
    assert os.path.exists(fid_stat)

    dataset = datasets.ImageDataset(args, cur_img_size=8)
    train_loader = dataset.train

    writer = SummaryWriter()
    writer_dict = {'writer': writer}
    writer_dict["train_global_steps"] = 0
    writer_dict["valid_global_steps"] = 0

    best = 1e4
    for epoch in range(args.max_epoch):
        train(args, gen_net=gen_net, dis_net=dis_net, gen_optimizer=gen_optimizer,
              dis_optimizer=dis_optimizer, gen_avg_param=None, train_loader=train_loader,
              epoch=epoch, writer_dict=writer_dict, fixed_z=None,
              schedulers=[gen_scheduler, dis_scheduler])

        checkpoint = {'epoch': epoch, 'best_fid': best}
        checkpoint['gen_state_dict'] = gen_net.state_dict()
        checkpoint['dis_state_dict'] = dis_net.state_dict()

        score = validate(args, None, fid_stat, epoch, gen_net, writer_dict, clean_dir=True)
        # print these scores, is it really the latest
        print(f'FID score: {score} - best FID score: {best} || @ epoch {epoch}.')
        if epoch == 0 or epoch > 30:
            if score < best:
                save_checkpoint(checkpoint, is_best=(score < best), output_dir=args.output_dir)
                print("Saved Latest Model!")
                best = score

    # save the final state once training ends
    checkpoint = {'epoch': epoch, 'best_fid': best}
    checkpoint['gen_state_dict'] = gen_net.state_dict()
    checkpoint['dis_state_dict'] = dis_net.state_dict()
    score = validate(args, None, fid_stat, epoch, gen_net, writer_dict, clean_dir=True)
    save_checkpoint(checkpoint, is_best=(score < best), output_dir=args.output_dir)
def train_gan(zq=256, ze=512, batch_size=32, outdir=".", name="tmp", dry=False, **kwargs): if not dry: tensorboard_path = Path(outdir) / 'tensorboard' / name model_path = Path(outdir) / 'models' / name tensorboard_path.mkdir(exist_ok=True, parents=True) model_path.mkdir(exist_ok=True, parents=True) sw = SummaryWriter(str(tensorboard_path)) netT = resnet20().to(device) # netT = SimpleConvNet(bias=False).to(device) netH = HyperNet(netT, ze, zq).to(device) print("Loading pretrained generators...") pretrain = torch.load('pretrained.pt') netH.load_state_dict(pretrain['netH']) netD = SimpleLinearNet( [zq * batch_size, zq * batch_size // 2, zq * batch_size // 4, 1024, 1], final_sigmoid=True, batchnorm=False).to(device) print(netT, netH, netD) print(f"netT params: {param_count(netT)}") print(f"netH params: {param_count(netH)}") print(f"netD params: {param_count(netD)}") generator_count = param_layer_count(netT) optimH = AdamW(netH.parameters(), lr=1e-4, betas=(0.5, 0.9), weight_decay=1e-4) optimD = AdamW(netD.parameters(), lr=5e-5, betas=(0.5, 0.9), weight_decay=1e-4) g_loss_meter, d_loss_meter = AverageMeter(), AverageMeter() d_acc_meter = AverageMeter() gp_meter = AverageMeter() dgrad_meter = AverageMeter() adversarial_loss = nn.BCELoss() real_label, fake_label = 0, 1 label = torch.zeros((generator_count, 1), device=device) ops = 0 start_time = time.time() minibatch_count = 1562 for epoch in range(100000): d_loss_meter.reset() g_loss_meter.reset() d_acc_meter.reset() gp_meter.reset() dgrad_meter.reset() # schedH.step() # schedD.step() for batch_idx in range(minibatch_count): n_iter = epoch * minibatch_count + batch_idx netH.zero_grad() netD.zero_grad() z = fast_randn((batch_size, ze), device=device, requires_grad=True) q = netH.encoder(z).view(-1, generator_count, zq) # Z Adversary free_params([netD]) freeze_params([netH]) codes = q.permute((1, 0, 2)).contiguous().view(generator_count, -1) noise = fast_randn((generator_count, zq * batch_size), device=device, requires_grad=True) d_real = netD(noise) d_fake = netD(codes) d_real_loss = adversarial_loss(d_real, label.fill_(real_label)) d_real_loss.backward(retain_graph=True) d_fake_loss = adversarial_loss(d_fake, label.fill_(fake_label)) d_fake_loss.backward(retain_graph=True) d_loss = d_real_loss + d_fake_loss # gp = calc_gradient_penalty(netD, noise, codes, device=device) # d_loss = d_fake.mean() - d_real.mean() + 10 * gp # d_loss.backward(retain_graph=True) dgrad_meter.update(model_grad_norm(netD)) d_loss_meter.update(d_loss.item()) d_acc_meter.update((sum(d_real < 0.5) + sum(d_fake > 0.5)).item() / (generator_count * 2)) # gp_meter.update(gp.item()) optimD.step() # schedD.batch_step() # Train the generator freeze_params([netD]) free_params([netH]) # fool the discriminator # d_fake_loss = -d_fake.mean() # d_fake_loss.backward() d_fake_loss = adversarial_loss(d_fake, label.fill_(real_label)) d_fake_loss.backward(retain_graph=True) optimH.step() with torch.no_grad(): """ Update Statistics """ if batch_idx % 50 == 0: current_time = time.time() ops_per_sec = ops // (current_time - start_time) start_time = current_time ops = 0 print("*" * 70 + " " + name) print("{}/{} D Loss: {}".format(epoch, batch_idx, d_loss.item())) print("{} ops/s".format(ops_per_sec)) ops += batch_size if batch_idx > 1 and batch_idx % 199 == 0: if not dry: sw.add_scalar('G/loss', g_loss_meter.avg, n_iter) sw.add_scalar('D/loss', d_loss_meter.avg, n_iter) sw.add_scalar('D/acc', d_acc_meter.avg, n_iter) sw.add_scalar('D/gp', gp_meter.avg, n_iter) sw.add_scalar('D/gradnorm', 
dgrad_meter.avg, n_iter) netH.eval() netH_samples = [ netH(fast_randn((batch_size, ze)).cuda()) for _ in range(10) ] netH.train() sw.add_scalar( 'G/g_var', sum( x.std(0).mean() for v in netH_samples for x in v[1].values()) / (generator_count * 10), n_iter) sw.add_scalar( 'G/q_var', torch.cat([ s[0].view(-1, zq) for s in netH_samples ]).var(0).mean(), n_iter) if kwargs['embeddings']: sw.add_embedding( q.view(-1, zq), global_step=n_iter, tag="q", metadata=list(range(generator_count)) * batch_size) torch.save( { 'netH': netH.state_dict(), 'netD': netD.state_dict() }, str(model_path / 'pretrain.pt'))
def optimization_algorithms(SCI_optimizer, cnn, LR, SCI_SGD_MOMENTUM, REGULARIZATION):
    if type(SCI_optimizer) is str:
        if SCI_optimizer == 'Adam':
            optimizer = optim.Adam(cnn.parameters(), lr=LR, betas=(0.01, 0.999),
                                   weight_decay=REGULARIZATION)
        if SCI_optimizer == 'AMSGrad':
            optimizer = optim.Adam(cnn.parameters(), lr=LR, betas=(0.01, 0.999),
                                   weight_decay=REGULARIZATION, amsgrad=True)
        if SCI_optimizer == 'AdamW':
            optimizer = AdamW(cnn.parameters(), lr=LR, betas=(0.01, 0.999),
                              weight_decay=REGULARIZATION)
        if SCI_optimizer == 'RMSprop':
            optimizer = optim.RMSprop(cnn.parameters(), lr=LR)
        # if (SCI_optimizer == 'SparseAdam') or (int(SCI_optimizer) == 4):
        #     optimizer = optim.SparseAdam(cnn.parameters(), lr=LR)
        if SCI_optimizer == 'SGD':
            optimizer = optim.SGD(cnn.parameters(), lr=LR, momentum=SCI_SGD_MOMENTUM,
                                  weight_decay=REGULARIZATION)
        if SCI_optimizer == 'Adadelta':
            optimizer = optim.Adadelta(cnn.parameters(), lr=LR, weight_decay=REGULARIZATION)
        if SCI_optimizer == 'Rprop':
            optimizer = optim.Rprop(cnn.parameters(), lr=LR)
        # if (SCI_optimizer == 'Adagrad') or (int(SCI_optimizer) == 7):
        #     optimizer = optim.Adagrad(cnn.parameters(), lr=LR, weight_decay=REGULARIZATION)
        if SCI_optimizer == 'Adamax':
            optimizer = optim.Adamax(cnn.parameters(), lr=LR, weight_decay=REGULARIZATION)
        if SCI_optimizer == 'ASGD':
            optimizer = optim.ASGD(cnn.parameters(), lr=LR, weight_decay=REGULARIZATION)
        # if (SCI_optimizer == 'LBFGS') or (int(SCI_optimizer) == 10):
        #     optimizer = optim.LBFGS(cnn.parameters(), lr=LR)
    else:
        if int(SCI_optimizer) == 1:
            optimizer = optim.Adam(cnn.parameters(), lr=LR, betas=(0.01, 0.999),
                                   weight_decay=REGULARIZATION)
        if int(SCI_optimizer) == 2:
            optimizer = optim.Adam(cnn.parameters(), lr=LR, betas=(0.01, 0.999),
                                   weight_decay=REGULARIZATION, amsgrad=True)
        if int(SCI_optimizer) == 3:
            optimizer = AdamW(cnn.parameters(), lr=LR, betas=(0.01, 0.999),
                              weight_decay=REGULARIZATION)
        if int(SCI_optimizer) == 4:
            optimizer = optim.RMSprop(cnn.parameters(), lr=LR)
        # if (SCI_optimizer == 'SparseAdam') or (int(SCI_optimizer) == 4):
        #     optimizer = optim.SparseAdam(cnn.parameters(), lr=LR)
        if int(SCI_optimizer) == 5:
            optimizer = optim.SGD(cnn.parameters(), lr=LR, momentum=SCI_SGD_MOMENTUM,
                                  weight_decay=REGULARIZATION)
        if int(SCI_optimizer) == 6:
            optimizer = optim.Adadelta(cnn.parameters(), lr=LR, weight_decay=REGULARIZATION)
        if int(SCI_optimizer) == 7:
            optimizer = optim.Rprop(cnn.parameters(), lr=LR)
        # if (SCI_optimizer == 'Adagrad') or (int(SCI_optimizer) == 7):
        #     optimizer = optim.Adagrad(cnn.parameters(), lr=LR, weight_decay=REGULARIZATION)
        if int(SCI_optimizer) == 8:
            optimizer = optim.Adamax(cnn.parameters(), lr=LR, weight_decay=REGULARIZATION)
        if int(SCI_optimizer) == 9:
            optimizer = optim.ASGD(cnn.parameters(), lr=LR, weight_decay=REGULARIZATION)
        # if (SCI_optimizer == 'LBFGS') or (int(SCI_optimizer) == 10):
        #     optimizer = optim.LBFGS(cnn.parameters(), lr=LR)
    return optimizer
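# A compact alternative to the if-chains above is a name-to-constructor table. The sketch
# below is illustrative only: it covers a subset of the options, assumes torch.optim (where
# AdamW is built in), and is not a drop-in replacement for the function above.
import torch.optim as optim

def build_optimizer(name, params, lr, momentum=0.9, weight_decay=0.0):
    """Look up an optimizer constructor by name and build it with shared settings."""
    table = {
        'Adam':    lambda: optim.Adam(params, lr=lr, weight_decay=weight_decay),
        'AMSGrad': lambda: optim.Adam(params, lr=lr, weight_decay=weight_decay, amsgrad=True),
        'AdamW':   lambda: optim.AdamW(params, lr=lr, weight_decay=weight_decay),
        'SGD':     lambda: optim.SGD(params, lr=lr, momentum=momentum, weight_decay=weight_decay),
        'RMSprop': lambda: optim.RMSprop(params, lr=lr),
    }
    if name not in table:
        raise ValueError('unknown optimizer: {}'.format(name))
    return table[name]()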
else:
    print('Received: {}'.format(architecture))
    raise Exception('Model must be one of: Transformer_features, LSTM_raw, '
                    'LSTM_features, CNN_raw, CNN_features, SincNet_raw')

if cuda:
    cost = cost.cuda()
    model = model.cuda()

print('FunTimes: {:d} parameters'.format(sum(p.numel() for p in model.parameters())))

# Instantiate optimizer and learning rate scheduler
if optimizer_to_use == 'AMSGrad':
    optimizer = optim.Adam(model.parameters(), lr, weight_decay=weight_decay, amsgrad=True)
elif optimizer_to_use == 'AdamW':
    optimizer = AdamW(model.parameters(), lr, weight_decay=weight_decay)
elif optimizer_to_use == 'Adam':
    optimizer = optim.Adam(model.parameters(), lr, weight_decay=weight_decay)
elif optimizer_to_use == 'RMSProp':
    optimizer = optim.RMSprop(model.parameters(), lr, alpha=0.95, eps=1e-8,
                              weight_decay=weight_decay)
else:
    print('Optimizer selected: {}'.format(optimizer_to_use))
    raise Exception('Optimizer must be one of: AMSGrad, AdamW, Adam, RMSProp')

# Load last checkpoint if one exists
subprocess.call(['gsutil', 'cp', ''])
state_dict = None
state_dict = load_checkpoint(save_dir, restore_file, model, optimizer)
last_epoch = state_dict['last_epoch'] if state_dict is not None else -1

# Track validation performance for early stopping
def main(): args = cfg.parse_args() torch.cuda.manual_seed(args.random_seed) torch.cuda.manual_seed_all(args.random_seed) np.random.seed(args.random_seed) random.seed(args.random_seed) torch.backends.cudnn.deterministic = True # set tf env _init_inception() inception_path = check_or_download_inception(None) create_inception_graph(inception_path) # import network gen_net = eval('models.' + args.gen_model + '.Generator')(args=args).cuda() dis_net = eval('models.' + args.dis_model + '.Discriminator')(args=args).cuda() gen_net.set_arch(args.arch, cur_stage=2) # weight init def weights_init(m): classname = m.__class__.__name__ if classname.find('Conv2d') != -1: if args.init_type == 'normal': nn.init.normal_(m.weight.data, 0.0, 0.02) elif args.init_type == 'orth': nn.init.orthogonal_(m.weight.data) elif args.init_type == 'xavier_uniform': nn.init.xavier_uniform_(m.weight.data, 1.) else: raise NotImplementedError('{} unknown inital type'.format( args.init_type)) elif classname.find('BatchNorm2d') != -1: nn.init.normal_(m.weight.data, 1.0, 0.02) nn.init.constant_(m.bias.data, 0.0) gen_net.apply(weights_init) dis_net.apply(weights_init) gpu_ids = [i for i in range(int(torch.cuda.device_count()))] gen_net = torch.nn.DataParallel(gen_net.to("cuda:0"), device_ids=gpu_ids) dis_net = torch.nn.DataParallel(dis_net.to("cuda:0"), device_ids=gpu_ids) gen_net.module.cur_stage = 0 dis_net.module.cur_stage = 0 gen_net.module.alpha = 1. dis_net.module.alpha = 1. # set optimizer if args.optimizer == "adam": gen_optimizer = torch.optim.Adam( filter(lambda p: p.requires_grad, gen_net.parameters()), args.g_lr, (args.beta1, args.beta2)) dis_optimizer = torch.optim.Adam( filter(lambda p: p.requires_grad, dis_net.parameters()), args.d_lr, (args.beta1, args.beta2)) elif args.optimizer == "adamw": gen_optimizer = AdamW(filter(lambda p: p.requires_grad, gen_net.parameters()), args.g_lr, weight_decay=args.wd) dis_optimizer = AdamW(filter(lambda p: p.requires_grad, dis_net.parameters()), args.g_lr, weight_decay=args.wd) gen_scheduler = LinearLrDecay(gen_optimizer, args.g_lr, 0.0, 0, args.max_iter * args.n_critic) dis_scheduler = LinearLrDecay(dis_optimizer, args.d_lr, 0.0, 0, args.max_iter * args.n_critic) # fid stat if args.dataset.lower() == 'cifar10': fid_stat = 'fid_stat/fid_stats_cifar10_train.npz' elif args.dataset.lower() == 'stl10': fid_stat = 'fid_stat/stl10_train_unlabeled_fid_stats_48.npz' elif args.fid_stat is not None: fid_stat = args.fid_stat else: raise NotImplementedError(f'no fid stat for {args.dataset.lower()}') assert os.path.exists(fid_stat) # epoch number for dis_net args.max_epoch = args.max_epoch * args.n_critic dataset = datasets.ImageDataset(args, cur_img_size=8) train_loader = dataset.train if args.max_iter: args.max_epoch = np.ceil(args.max_iter * args.n_critic / len(train_loader)) # initial fixed_z = torch.cuda.FloatTensor( np.random.normal(0, 1, (64, args.latent_dim))) gen_avg_param = copy_params(gen_net) start_epoch = 0 best_fid = 1e4 # set writer if args.load_path: print(f'=> resuming from {args.load_path}') assert os.path.exists(args.load_path) checkpoint_file = os.path.join(args.load_path) assert os.path.exists(checkpoint_file) checkpoint = torch.load(checkpoint_file) start_epoch = checkpoint['epoch'] best_fid = checkpoint['best_fid'] gen_net.load_state_dict(checkpoint['gen_state_dict']) dis_net.load_state_dict(checkpoint['dis_state_dict']) gen_optimizer.load_state_dict(checkpoint['gen_optimizer']) dis_optimizer.load_state_dict(checkpoint['dis_optimizer']) avg_gen_net = deepcopy(gen_net) 
avg_gen_net.load_state_dict(checkpoint['avg_gen_state_dict']) gen_avg_param = copy_params(avg_gen_net) del avg_gen_net cur_stage = cur_stages(start_epoch, args) gen_net.module.cur_stage = cur_stage dis_net.module.cur_stage = cur_stage gen_net.module.alpha = 1. dis_net.module.alpha = 1. # args.path_helper = checkpoint['path_helper'] else: # create new log dir assert args.exp_name args.path_helper = set_log_dir('logs', args.exp_name) logger = create_logger(args.path_helper['log_path']) logger.info(args) writer_dict = { 'writer': SummaryWriter(args.path_helper['log_path']), 'train_global_steps': start_epoch * len(train_loader), 'valid_global_steps': start_epoch // args.val_freq, } # train loop for epoch in tqdm(range(int(start_epoch), int(args.max_epoch)), desc='total progress'): lr_schedulers = (gen_scheduler, dis_scheduler) if args.lr_decay else None train(args, gen_net, dis_net, gen_optimizer, dis_optimizer, gen_avg_param, train_loader, epoch, writer_dict, lr_schedulers) if epoch and epoch % args.val_freq == 0 or epoch == int( args.max_epoch) - 1: backup_param = copy_params(gen_net) load_params(gen_net, gen_avg_param) inception_score, fid_score = validate(args, fixed_z, fid_stat, epoch, gen_net, writer_dict) logger.info( f'Inception score: {inception_score}, FID score: {fid_score} || @ epoch {epoch}.' ) load_params(gen_net, backup_param) if fid_score < best_fid: best_fid = fid_score is_best = True else: is_best = False else: is_best = False avg_gen_net = deepcopy(gen_net) load_params(avg_gen_net, gen_avg_param) save_checkpoint( { 'epoch': epoch + 1, 'gen_model': args.gen_model, 'dis_model': args.dis_model, 'gen_state_dict': gen_net.state_dict(), 'dis_state_dict': dis_net.state_dict(), 'avg_gen_state_dict': avg_gen_net.state_dict(), 'gen_optimizer': gen_optimizer.state_dict(), 'dis_optimizer': dis_optimizer.state_dict(), 'best_fid': best_fid, 'path_helper': args.path_helper }, is_best, args.path_helper['ckpt_path']) del avg_gen_net
def objective(SCI_RELU, SCI_BIAS, SCI_loss_type, SCI_optimizer, SCI_BATCH_SIZE, SCI_MM, SCI_REGULARIZATION, SCI_LR, SCI_DROPOUT, SCI_L_SECOND, SCI_EPOCHS, SCI_BN_MOMENTUM, SCI_SGD_MOMENTUM, SCI_LINEARITY):
    global device, MaxCredit
    global count, CreditVector, CreditVec

    SCI_BATCH_SIZE = int(SCI_BATCH_SIZE)                # integer between 4 and 256
    SCI_MM = round(SCI_MM, 3)                           # real with three decimals between (0.001, 0.999)
    SCI_REGULARIZATION = round(SCI_REGULARIZATION, 3)   # real with three decimals between (0.001, 0.7)
    SCI_LR = round(SCI_LR, 5)                           # real with five decimals between (1e-4, 7e-1)
    SCI_DROPOUT = round(SCI_DROPOUT, 2)                 # real with two decimals between (0, 0.4)
    SCI_L_SECOND = int(SCI_L_SECOND)                    # integer between 2 and 64
    SCI_EPOCHS = int(SCI_EPOCHS)                        # integer between (100, 500)
    SCI_BN_MOMENTUM = round(SCI_BN_MOMENTUM, 2)         # real with two decimals between (0, 0.99)
    SCI_SGD_MOMENTUM = round(SCI_SGD_MOMENTUM, 2)       # real with two decimals between (0, 0.99)
    SCI_optimizer = int(SCI_optimizer)                  # integer between 1 and 4
    SCI_loss_type = int(SCI_loss_type)                  # integer between 1 and 3 ('CrossEntropyLoss', 'MultiMarginLoss', 'NLLLoss')
    SCI_LINEARITY = int(SCI_LINEARITY)
    if int(SCI_RELU) == 1:                              # integer between 1 and 2 ('True', 'False')
        SCI_RELU = True
    else:
        SCI_RELU = False
    if int(SCI_BIAS) == 1:                              # integer between 1 and 2 ('True', 'False')
        SCI_BIAS = True
    else:
        SCI_BIAS = False

    from cnn_model import CNN6
    cnn = CNN6(L_FIRST, SCI_L_SECOND, KERNEL_X, SCI_BIAS, SCI_BN_MOMENTUM, SCI_RELU, SCI_DROPOUT, dataset.CLASSES, SCI_LINEARITY)
    if GPU_SELECT == 2:
        if torch.cuda.device_count() > 1:
            cnn = nn.DataParallel(cnn, device_ids=[0, 1], dim=0)
        cnn = cnn.cuda()
    if GPU_SELECT == 1:
        cnn.to(device)
    if GPU_SELECT == 0:
        cnn.to(device)
    # next(cnn.parameters()).is_cuda
    # print(cnn)  # net architecture
    # list(cnn.parameters())
    cnn.apply(CNN6.weights_reset)
    cnn.share_memory()

    loss_func = nn.CrossEntropyLoss()  # default, replaced below by the loss selected through SCI_loss_type

    def create_loss(LOSS):
        # map the integer hyperparameter to the corresponding loss function
        if LOSS == 1:
            loss_func = nn.CrossEntropyLoss()
        elif LOSS == 2:
            loss_func = nn.NLLLoss()
        else:
            loss_func = nn.MultiMarginLoss()
        return loss_func

    MM = float(str(SCI_MM))
    REGULARIZATION = float(str(SCI_REGULARIZATION))
    # optimizer = str(SCI_optimizer)
    LR = float(str(SCI_LR))
    train_losses = []  # to track the training loss as the model trains
    output = 0
    loss = 0
    accuracy = 0
    early_stopping.counter = 0
    early_stopping.best_score = None
    early_stopping.early_stop = False
    early_stopping.verbose = False
    TEST_RESULTS = torch.zeros(1, 2)
    loss_func = create_loss(SCI_loss_type)  # use the selected loss for training and validation

    from adamw import AdamW
    if SCI_optimizer == 1:
        optimizer = optim.Adam(cnn.parameters(), lr=LR, betas=(0.9, 0.99), weight_decay=REGULARIZATION)
    if SCI_optimizer == 2:
        optimizer = optim.Adam(cnn.parameters(), lr=LR, betas=(0.9, 0.99), weight_decay=REGULARIZATION, amsgrad=True)
    if SCI_optimizer == 3:
        optimizer = AdamW(cnn.parameters(), lr=LR, betas=(0.9, 0.99), weight_decay=REGULARIZATION)
    if SCI_optimizer == 4:
        optimizer = optim.SGD(cnn.parameters(), lr=LR, momentum=SCI_SGD_MOMENTUM, weight_decay=REGULARIZATION)
    if SCI_optimizer == 5:
        optimizer = optim.Adadelta(cnn.parameters(), lr=LR, weight_decay=REGULARIZATION)
    if SCI_optimizer == 6:
        optimizer = optim.Adagrad(cnn.parameters(), lr=LR, weight_decay=REGULARIZATION)

    from Utillities import Utillities
    Utillities.listing(optimizer, SCI_SGD_MOMENTUM, SCI_BN_MOMENTUM, SCI_L_SECOND, SCI_LR, SCI_RELU, SCI_BIAS, SCI_loss_type, REGULARIZATION, SCI_BATCH_SIZE, SCI_DROPOUT, SCI_LINEARITY)

    # Data loaders for easy mini-batch return in training
    SCI_BATCH_SIZE = int(SCI_BATCH_SIZE)
    train_loader = Data.DataLoader(dataset=dataset.train_dataset, batch_size=SCI_BATCH_SIZE, shuffle=True, num_workers=0, drop_last=True, pin_memory=True)
    validation_loader = Data.DataLoader(dataset=dataset.validation_dataset, batch_size=144, shuffle=True, num_workers=0, drop_last=True, pin_memory=True)
    test_loader = Data.DataLoader(dataset=dataset.test_dataset, batch_size=599, shuffle=True, num_workers=0, drop_last=True, pin_memory=True)

    for epoch in range(SCI_EPOCHS):
        loss = None
        cnn.train().cuda()
        for step, (train_data, train_target) in enumerate(train_loader):
            train_data, train_target = train_data.to(device), train_target.to(device)
            output, temp = cnn(train_data)    # forward pass: compute predicted outputs by passing inputs to the model
            loss = loss_func(output, train_target)
            train_losses.append(loss.item())  # record training loss
            optimizer.zero_grad()             # clear gradients left over from the previous step
            loss.backward()                   # backward pass: compute gradient of the loss with respect to model parameters
            optimizer.step()                  # perform a single optimization step (parameter update)

        cnn.eval().cuda()  # switch to evaluation (no change) mode
        valid_loss = 0
        accuracy = 0
        running_loss = 0.0
        with torch.no_grad():
            for step, (validation_data, validation_target) in enumerate(validation_loader):
                validation_data, validation_target = validation_data.to(device), validation_target.to(device)
                output, temp = cnn(validation_data)  # forward pass on the validation set
                valid_loss += loss_func(output, validation_target).item()
                # ps = torch.exp(output)
                # equality = (validation_target[0].data == ps.max(dim=1)[1])
                # accuracy += equality.type(torch.FloatTensor).mean()
                # print('valid_loss: ', valid_loss)

        # print statistics
        running_loss += valid_loss
        if epoch % 100 == 0:
            print('average loss: %.6f' % running_loss)
            running_loss = 0.0
        train_losses = []

        early_stopping(valid_loss, cnn)
        if early_stopping.early_stop:
            if os.path.exists('checkpoint.pt'):
                # cnn = TheModelClass(*args, **kwargs)
                print("Loaded the model with the lowest Validation Loss!")
                cnn.load_state_dict(torch.load('checkpoint.pt'))  # choose whatever GPU device number you want
                cnn.to(device)
            break

    cnn.eval()
    class_correct = list(0. for i in range(1000))
    class_total = list(0. for i in range(1000))
    with torch.no_grad():
        for (test_data, test_target) in test_loader:
            test_data, test_target = test_data.to(device), test_target.to(device)
            outputs, temp = cnn(test_data)
            _, predicted = torch.max(outputs, 1)
            c = (predicted == test_target).squeeze()
            for i in range(test_target.size(0)):
                label = test_target[i]
                class_correct[label] += c[i].item()
                class_total[label] += 1

    for i in range(dataset.CLASSES):
        TEST_RESULTS[0, i] = class_correct[i] / dataset.TESTED_ELEMENTS[i]
        print('Class: ', i, ' accuracy: ', TEST_RESULTS[0, i])
        print('Class: ', i, ' correct: ', class_correct[i], ' of ', dataset.TESTED_ELEMENTS[i])
    percent = (TEST_RESULTS[0, 0] + TEST_RESULTS[0, 1]) / 2
    print('Final percentage: ', percent)

    CreditCost = int((1 - TEST_RESULTS[0, 0]) * dataset.TESTED_ELEMENTS[0] + (1 - TEST_RESULTS[0, 1]) * dataset.TESTED_ELEMENTS[1] * 5)
    if TEST_RESULTS[0, 0] == 0 or TEST_RESULTS[0, 1] == 0:
        CreditCost = CreditCost + 300
    print('Last epoch: ', epoch)
    print('Credit Cost: ', -CreditCost)
    # list(cnn.parameters())
    if os.path.exists('checkpoint.pt'):
        os.remove('checkpoint.pt')
    print()
    print()
    if -CreditCost > MaxCredit:
        MaxCredit = -CreditCost
    print('Best Score So Far: ', MaxCredit)
    CreditVector[count] = MaxCredit
    CreditVec[count] = count
    # plot the running best score
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(CreditVec, -CreditVector, color='tab:blue')
    # print(CreditVec, -CreditVector)
    count = count + 1
    # display the plot
    plt.show()
    return -CreditCost
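This objective returns -CreditCost, so it is meant to be maximized by whatever hyperparameter search driver calls it. The driver itself is not shown; a minimal sketch, assuming the bayes_opt package and taking the bounds from the inline comments above (the SCI_LINEARITY range is a guess), could look like this:

from bayes_opt import BayesianOptimization

pbounds = {
    'SCI_RELU': (1, 2), 'SCI_BIAS': (1, 2), 'SCI_loss_type': (1, 3),
    'SCI_optimizer': (1, 4), 'SCI_BATCH_SIZE': (4, 256), 'SCI_MM': (0.001, 0.999),
    'SCI_REGULARIZATION': (0.001, 0.7), 'SCI_LR': (1e-4, 7e-1),
    'SCI_DROPOUT': (0.0, 0.4), 'SCI_L_SECOND': (2, 64), 'SCI_EPOCHS': (100, 500),
    'SCI_BN_MOMENTUM': (0.0, 0.99), 'SCI_SGD_MOMENTUM': (0.0, 0.99),
    'SCI_LINEARITY': (1, 2),  # assumed range, not documented above
}

bo = BayesianOptimization(f=objective, pbounds=pbounds, random_state=1)
bo.maximize(init_points=5, n_iter=50)  # objective returns -CreditCost, so maximizing it minimizes the cost
print(bo.max)                          # best -CreditCost and the hyperparameters that produced it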
batch_size=batch_size, num_workers=6, shuffle=True, pin_memory=False, drop_last=True) val_data_loader = DataLoader(val_train, batch_size=val_batch_size, num_workers=6, shuffle=False, pin_memory=False) model = SeResNext50_Unet_Double().cuda() params = model.parameters() optimizer = AdamW(params, lr=0.00001, weight_decay=1e-6) model, optimizer = amp.initialize(model, optimizer, opt_level="O1") scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[ 1, 2, 3, 4, 5, 7, 9, 11, 17, 23, 29, 33, 47, 50, 60, 70, 90, 110, 130, 150, 170, 180, 190 ], gamma=0.5) model = nn.DataParallel(model).cuda() snap_to_load = 'res50_cls_cce_{}_0_best'.format(seed) print("=> loading checkpoint '{}'".format(snap_to_load))
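AdamW here applies decoupled weight decay (Loshchilov & Hutter) rather than folding the decay into the gradient the way plain Adam with weight_decay does. A simplified, illustrative single-parameter update step (not the imported implementation) to show the difference:

import torch

@torch.no_grad()
def adamw_step(p, exp_avg, exp_avg_sq, step, lr=1e-5, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-6):
    # p: parameter tensor with .grad populated; exp_avg / exp_avg_sq: state tensors
    # of the same shape, initialised to zeros; step: 1-based update counter.
    grad = p.grad
    p.mul_(1.0 - lr * weight_decay)                                       # decoupled decay: w <- w * (1 - lr * wd)
    exp_avg.mul_(betas[0]).add_(grad, alpha=1.0 - betas[0])               # biased first moment estimate
    exp_avg_sq.mul_(betas[1]).addcmul_(grad, grad, value=1.0 - betas[1])  # biased second moment estimate
    bias1 = 1.0 - betas[0] ** step
    bias2 = 1.0 - betas[1] ** step
    denom = (exp_avg_sq / bias2).sqrt().add_(eps)
    p.addcdiv_(exp_avg, denom, value=-lr / bias1)                         # Adam step with bias correction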
def objective(SCI_SGD_MOMENTUM, SCI_DROPOUT, SCI_BATCH_SIZE, SCI_L_SECOND, SCI_optimizer, LINEARITY):
    global SCI_REGULARIZATION, SCI_EPOCHS, SCI_loss_type, SCI_RELU
    global SCI_BIAS, SCI_BN_MOMENTUM, device, SCI_LR, MaxCredit, count, CreditVector, CreditVec

    SCI_SGD_MOMENTUM = SCI_SGD_MOMENTUM / 10
    DROPOUT = (SCI_DROPOUT / 2).item()
    if SCI_DROPOUT < 0:
        DROPOUT = 0
    BATCH_SIZE = int(SCI_BATCH_SIZE)
    if SCI_L_SECOND < 4:
        SCI_L_SECOND = 4
    if SCI_optimizer < 1:
        SCI_optimizer = 1
    L_SECOND = int(SCI_L_SECOND)

    loss_func = nn.CrossEntropyLoss()  # default, replaced below by the loss selected through SCI_loss_type

    def create_loss(LOSS):
        # map the loss-type name to the corresponding loss function
        if LOSS == 'CrossEntropyLoss':
            loss_func = nn.CrossEntropyLoss()
        elif LOSS == 'NLLLoss':
            loss_func = nn.NLLLoss()
        else:
            loss_func = nn.MultiMarginLoss()
        return loss_func

    REGULARIZATION = float(str(SCI_REGULARIZATION))
    optimizer1 = str(SCI_optimizer)
    from cnn_model import CNN6
    cnn = CNN6(L_FIRST, L_SECOND, KERNEL_X, SCI_BIAS, SCI_BN_MOMENTUM, SCI_RELU, DROPOUT, dataset.CLASSES, LINEARITY)
    if GPU_SELECT == 2:
        if torch.cuda.device_count() > 1:
            cnn = nn.DataParallel(cnn, device_ids=[0, 1], dim=0)
        cnn = cnn.cuda()
    if GPU_SELECT == 1:
        cnn.to(device)
    if GPU_SELECT == 0:
        cnn.to(device)
    cnn.apply(CNN6.weights_reset)
    cnn.share_memory()

    train_losses = []  # to track the training loss as the model trains
    output = 0
    loss = 0
    accuracy = 0
    early_stopping.counter = 0
    early_stopping.best_score = None
    early_stopping.early_stop = False
    early_stopping.verbose = False
    TEST_RESULTS = torch.zeros(1, 2)
    loss_func = create_loss(SCI_loss_type)  # use the selected loss for training and validation

    from adamw import AdamW
    if optimizer1 == '1':
        optimizer = optim.Adam(cnn.parameters(), lr=SCI_LR, betas=(0.9, 0.99), weight_decay=REGULARIZATION)
    if optimizer1 == '2':
        optimizer = optim.Adam(cnn.parameters(), lr=SCI_LR, betas=(0.9, 0.99), weight_decay=REGULARIZATION, amsgrad=True)
    if optimizer1 == '3':
        optimizer = AdamW(cnn.parameters(), lr=SCI_LR, betas=(0.9, 0.99), weight_decay=REGULARIZATION)
    if optimizer1 == '4':
        optimizer = optim.SGD(cnn.parameters(), lr=SCI_LR, momentum=SCI_SGD_MOMENTUM, weight_decay=REGULARIZATION)
    if optimizer1 == '5':
        optimizer = optim.Adadelta(cnn.parameters(), lr=SCI_LR, weight_decay=REGULARIZATION)
    if optimizer1 == '6':
        optimizer = optim.Adagrad(cnn.parameters(), lr=SCI_LR, weight_decay=REGULARIZATION)
    if optimizer1 > '6':
        optimizer = optim.Adam(cnn.parameters(), lr=SCI_LR, betas=(0.9, 0.99), weight_decay=REGULARIZATION)

    from Utillities import Utillities
    Utillities.listing(optimizer, SCI_SGD_MOMENTUM, SCI_BN_MOMENTUM, L_SECOND, SCI_LR, SCI_RELU, SCI_BIAS, SCI_loss_type, REGULARIZATION, BATCH_SIZE, DROPOUT, LINEARITY)

    train_loader = Data.DataLoader(dataset=dataset.train_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, drop_last=True, pin_memory=True)
    validation_loader = Data.DataLoader(dataset=dataset.validation_dataset, batch_size=144, shuffle=False, num_workers=0, drop_last=True, pin_memory=True)
    test_loader = Data.DataLoader(dataset=dataset.test_dataset, batch_size=599, shuffle=False, num_workers=0, pin_memory=True, drop_last=True)

    for epoch in range(SCI_EPOCHS):
        loss = None
        cnn.train().cuda()
        for step, (train_data, train_target) in enumerate(train_loader):
            train_data, train_target = train_data.to(device), train_target.to(device)
            output, temp = cnn(train_data)    # forward pass: compute predicted outputs by passing inputs to the model
            loss = loss_func(output, train_target)
            train_losses.append(loss.item())  # record training loss
            optimizer.zero_grad()             # clear gradients left over from the previous step
            loss.backward()                   # backward pass: compute gradient of the loss with respect to model parameters
            optimizer.step()                  # perform a single optimization step (parameter update)

        cnn.eval().cuda()  # switch to evaluation (no change) mode
        valid_loss = 0
        accuracy = 0
        with torch.no_grad():
            for step, (validation_data, validation_target) in enumerate(validation_loader):
                validation_data, validation_target = validation_data.to(device), validation_target.to(device)
                output, temp = cnn(validation_data)  # forward pass on the validation set
                valid_loss += loss_func(output, validation_target).item()
                ps = torch.exp(output)
                equality = (validation_target[0].data == ps.max(dim=1)[1])
                accuracy += equality.type(torch.FloatTensor).mean()
        train_losses = []

        early_stopping(valid_loss, cnn)
        if early_stopping.early_stop:
            if os.path.exists('checkpoint.pt'):
                print("Loaded the model with the lowest Validation Loss!")
                cnn.load_state_dict(torch.load('checkpoint.pt', map_location="cuda:1"))  # choose whatever GPU device number you want
                cnn.to(device)
            break

    cnn.eval()
    class_correct = list(0. for i in range(1000))
    class_total = list(0. for i in range(1000))
    with torch.no_grad():
        for (test_data, test_target) in test_loader:
            test_data, test_target = test_data.to(device), test_target.to(device)
            outputs, temp = cnn(test_data)
            _, predicted = torch.max(outputs, 1)
            c = (predicted == test_target).squeeze()
            for i in range(test_target.size(0)):
                label = test_target[i]
                class_correct[label] += c[i].item()
                class_total[label] += 1

    for i in range(dataset.CLASSES):
        TEST_RESULTS[0, i] = class_correct[i] / dataset.TESTED_ELEMENTS[i]
        print('Class: ', i, ' accuracy: ', TEST_RESULTS[0, i])
        print('Class: ', i, ' correct: ', class_correct[i])
    percent = (TEST_RESULTS[0, 0] + TEST_RESULTS[0, 1]) / 2
    print('Final percentage: ', percent)

    CreditCost = (1 - TEST_RESULTS[0, 0]) * dataset.TESTED_ELEMENTS[0] + (1 - TEST_RESULTS[0, 1]) * dataset.TESTED_ELEMENTS[1] * 5
    if TEST_RESULTS[0, 0] == 0 or TEST_RESULTS[0, 1] == 0:
        CreditCost = CreditCost + 300
    print('Last epoch: ', epoch)
    if os.path.exists('checkpoint.pt'):
        os.remove('checkpoint.pt')
    print()
    torch.cuda.empty_cache()
    print()
    CreditCost = CreditCost + (SCI_SGD_MOMENTUM + SCI_DROPOUT + SCI_BATCH_SIZE + SCI_L_SECOND + SCI_optimizer) / 1000
    print('Credit Cost: ', CreditCost)
    if -CreditCost > MaxCredit:
        MaxCredit = -CreditCost
    print('Best Score So Far: ', MaxCredit)
    CreditVector[count] = MaxCredit
    CreditVec[count] = count
    # plot the running best score
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(CreditVec, -CreditVector, color='tab:orange')
    # print(CreditVec, -CreditVector)
    count = count + 1
    # display the plot
    plt.show()
    return CreditCost
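Both objectives drive an early_stopping callable that tracks the best validation loss, writes 'checkpoint.pt' on improvement, and sets early_stop after a run of non-improving epochs; its implementation is not shown here. A minimal sketch compatible with the attributes used above:

import torch

class EarlyStopping:
    def __init__(self, patience=10, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, val_loss, model):
        score = -val_loss  # higher is better
        if self.best_score is None or score > self.best_score:
            self.best_score = score
            torch.save(model.state_dict(), 'checkpoint.pt')  # keep the best weights on disk
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

early_stopping = EarlyStopping(patience=10)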
def main(config): seed_all() os.makedirs('cache', exist_ok=True) os.makedirs(config.logdir, exist_ok=True) print("Logging to: %s" % config.logdir) src_files = sorted(glob('*.py')) for src_fn in src_files: dst_fn = os.path.join(config.logdir, src_fn) copyfile(src_fn, dst_fn) train_image_fns = sorted(glob(os.path.join(config.train_dir, '*/*/*.dcm'))) test_image_fns = sorted(glob(os.path.join(config.test_dir, '*/*/*.dcm'))) # assert len(train_image_fns) == 10712 # assert len(test_image_fns) == 1377 gt = load_gt(config.train_rle) # create folds np.random.shuffle(train_image_fns) if config.subset > 0: train_image_fns = train_image_fns[:config.subset] folds = np.arange(len(train_image_fns)) % config.num_folds val_image_fns = [fn for k, fn in enumerate(train_image_fns) if folds[k] == config.fold] train_image_fns = [fn for k, fn in enumerate(train_image_fns) if folds[k] != config.fold] # remove not-used files: # https://www.kaggle.com/c/siim-acr-pneumothorax-segmentation/discussion/98478#latest-572385 # noqa train_image_fns = [fn for fn in train_image_fns if DicomDataset.fn_to_id(fn) in gt] val_image_fns = [fn for fn in val_image_fns if DicomDataset.fn_to_id(fn) in gt] print("VAL: ", len(val_image_fns), os.path.basename(val_image_fns[0])) print("TRAIN: ", len(train_image_fns), os.path.basename(train_image_fns[0])) train_ds = DicomDataset(train_image_fns, gt_rles=gt, augment=True) val_ds = DicomDataset(val_image_fns, gt_rles=gt) if config.cache: train_ds.cache() val_ds.cache() val_loader = data.DataLoader(val_ds, batch_size=config.batch_size, shuffle=False, num_workers=config.num_workers, pin_memory=config.pin, drop_last=False) model = FPNSegmentation(config.slug, ema=config.ema) if config.weight is not None: print("Loading: %s" % config.weight) model.load_state_dict(th.load(config.weight)) model = model.to(config.device) no_decay = ['mean', 'std', 'bias'] + ['.bn%d.' 
% i for i in range(100)] grouped_parameters = [{'params': [], 'weight_decay': config.weight_decay}, {'params': [], 'weight_decay': 0.0}] for n, p in model.named_parameters(): if not any(nd in n for nd in no_decay): print("Decay: %s" % n) grouped_parameters[0]['params'].append(p) else: print("No Decay: %s" % n) grouped_parameters[1]['params'].append(p) optimizer = AdamW(grouped_parameters, lr=config.lr) if config.apex: model, optimizer = apex.amp.initialize(model, optimizer, opt_level="O1", verbosity=0) updates_per_epoch = len(train_ds) // config.batch_size num_updates = int(config.epochs * updates_per_epoch) scheduler = WarmupLinearSchedule(warmup=config.warmup, t_total=num_updates) # training loop smooth = 0.1 best_dice = 0.0 best_fn = None global_step = 0 for epoch in range(1, config.epochs + 1): smooth_loss = None smooth_accuracy = None model.train() train_loader = data.DataLoader(train_ds, batch_size=config.batch_size, shuffle=True, num_workers=config.num_workers, pin_memory=config.pin, drop_last=True) progress = tqdm(total=len(train_ds), smoothing=0.01) for i, (X, _, y_true) in enumerate(train_loader): X = X.to(config.device).float() y_true = y_true.to(config.device) y_pred = model(X) loss = siim_loss(y_true, y_pred, weights=None) if config.apex: with apex.amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() lr_this_step = None if (i + 1) % config.accumulation_step == 0: optimizer.step() optimizer.zero_grad() lr_this_step = config.lr * scheduler.get_lr(global_step, config.warmup) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step global_step += 1 smooth_loss = loss.item() if smooth_loss is None else \ smooth * loss.item() + (1. - smooth) * smooth_loss # print((y_true >= 0.5).sum().item()) accuracy = th.mean(((y_pred >= 0.5) == (y_true == 1)).to( th.float)).item() smooth_accuracy = accuracy if smooth_accuracy is None else \ smooth * accuracy + (1. 
- smooth) * smooth_accuracy progress.set_postfix(ep='%d/%d' % (epoch, config.epochs), loss='%.4f' % smooth_loss, accuracy='%.4f' % (smooth_accuracy), lr='%.6f' % (config.lr if lr_this_step is None else lr_this_step)) progress.update(len(X)) if epoch <= 12: continue # validation loop model.eval() thresholds = [0.1, 0.2] dice_coeffs = [[] for _ in range(len(thresholds))] progress = tqdm(enumerate(val_loader), total=len(val_loader)) with th.no_grad(): for i, (X, _, y_trues) in progress: X = X.to(config.device).float() y_trues = y_trues.to(config.device) y_preds = model(X) y_preds_flip = th.flip(model(th.flip(X, (-1, ))), (-1, )) y_preds = 0.5 * (y_preds + y_preds_flip) y_trues = y_trues.cpu().numpy() y_preds = y_preds.cpu().numpy() for yt, yp in zip(y_trues, y_preds): yt = (yt.squeeze() >= 0.5).astype('uint8') yp = yp.squeeze() for dind, threshold in enumerate(thresholds): yp_ = (yp >= threshold).astype(np.uint8) sc = score(yt, yp_) dice_coeffs[dind].append(sc) best_threshold_ind = -1 dice_coeff = -1 for dind, threshold in enumerate(thresholds): dc = np.mean([x[0] for x in dice_coeffs[dind] if x[1] == 'non-empty']) # progress.write("Dice @%.2f: %.4f" % (threshold, dc)) if dc > dice_coeff: dice_coeff = dc best_threshold_ind = dind dice_coeffs = dice_coeffs[best_threshold_ind] num_empty = sum(1 for x in dice_coeffs if x[1] == 'empty') num_total = len(dice_coeffs) num_non_empty = num_total - num_empty empty_sum = np.sum([d[0] for d in dice_coeffs if d[1] == 'empty']) non_empty_sum = np.sum([d[0] for d in dice_coeffs if d[1] == 'non-empty']) dice_coeff_empty = empty_sum / num_empty dice_coeff_non_empty = non_empty_sum / num_non_empty progress.write('[Empty: %d]: %.3f | %.3f, [Non-Empty: %d]: %.3f | %.3f' % ( num_empty, dice_coeff_empty, empty_sum / num_total, num_non_empty, dice_coeff_non_empty, non_empty_sum / num_total)) dice_coeff = float(dice_coeff) summary_str = 'f%02d-ep-%04d-val_dice-%.4f@%.2f' % (config.fold, epoch, dice_coeff, thresholds[best_threshold_ind]) progress.write(summary_str) if dice_coeff > best_dice: weight_fn = os.path.join(config.logdir, summary_str + '.pth') th.save(model.state_dict(), weight_fn) best_dice = dice_coeff best_fn = weight_fn fns = sorted(glob(os.path.join(config.logdir, 'f%02d-*.pth' % config.fold))) for fn in fns[:-config.n_keep]: os.remove(fn) # create submission test_ds = DicomDataset(test_image_fns) test_loader = data.DataLoader(test_ds, batch_size=config.batch_size, shuffle=False, num_workers=0, pin_memory=False, drop_last=False) if best_fn is not None: model.load_state_dict(th.load(best_fn)) model.eval() sub = create_submission(model, test_loader, config, pred_zip=config.pred_zip) sub.to_csv(config.submission_fn, index=False) print("Wrote to: %s" % config.submission_fn) # create val submission val_fn = config.submission_fn.replace('.csv', '_VAL.csv') model.eval() sub = [] sub = create_submission(model, val_loader, config, pred_zip=config.pred_zip.replace('.zip', '_VAL.zip')) sub.to_csv(val_fn, index=False) print("Wrote to: %s" % val_fn)
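The training loop above sets the learning rate by hand from scheduler.get_lr(global_step, config.warmup) scaled by config.lr. The WarmupLinearSchedule class itself is not shown; a plausible minimal version with the same call signature (an assumption, not the repo's code) is:

class WarmupLinearSchedule:
    def __init__(self, warmup, t_total):
        self.warmup = warmup    # fraction of t_total spent warming up
        self.t_total = t_total  # total number of optimizer updates

    def get_lr(self, step, warmup=None):
        # returns a multiplier in [0, 1] to be scaled by the base learning rate
        warmup = self.warmup if warmup is None else warmup
        progress = step / max(1, self.t_total)
        if progress < warmup:
            return progress / max(warmup, 1e-8)                           # linear warmup 0 -> 1
        return max(0.0, (1.0 - progress) / max(1.0 - warmup, 1e-8))       # linear decay 1 -> 0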
def train(name, loader, checkpoint, num_rep, lr, beta1, gamma_gan, num_epochs, wd, device): discriminator = Discriminator().to(device) generator = Generator(num_rep).to(device) losses = {'D': [], 'G': []} optimizer_D = AdamW(discriminator.parameters(), lr=lr, weight_decay=wd, betas=(beta1, 0.99)) optimizer_G = AdamW(generator.parameters(), lr=lr, weight_decay=wd, betas=(beta1, 0.99)) bce = nn.BCELoss() mse = nn.MSELoss() normalizer = Normalizer(cfg.mean, cfg.std, device) if torch.cuda.device_count() > 1: generator = nn.DataParallel(generator) discriminator = nn.DataParallel(discriminator) save_path = Path('.') / 'save' / name if not save_path.is_dir(): save_path.mkdir(parents=True) if checkpoint: losses = load_checkpoint(save_path, discriminator, generator, optimizer_D, optimizer_G) last_epoch = len(losses['D']) - 1 logging.info('Last epoch={}'.format(last_epoch)) for epoch in range(last_epoch + 1, num_epochs): losses_G = 0.0 losses_D = 0.0 loss_G_gan_acc = 0.0 loss_G_M_acc = 0.0 iter_count = 0 for image, gt, _ in loader: batchsize = image.size(0) image, gt = image.to(device), gt.to(device) # Phrase 1: train the D discriminator.zero_grad() labels = torch.full((batchsize, 1), 1, device=device) output = discriminator(gt) D_x = output.mean().item() loss_D_real = bce(output, labels) loss_D_real.backward() fake = generator(image) fake = normalizer(fake) labels.fill_(0) output = discriminator(fake.detach()) D_G_z1 = output.mean().item() loss_D_fake = bce(output, labels) loss_D_fake.backward() loss_D = loss_D_real.item() + loss_D_fake.item() optimizer_D.step() # Phrase 2: train the G generator.zero_grad() output = discriminator(fake) D_G_z2 = output.mean().item() labels.fill_(1) loss_G_gan = bce(output, labels) loss_G_gan_acc += loss_G_gan.item() loss_G_M = mse(fake, gt) loss_G_M_acc += loss_G_M.item() loss_G = gamma_gan * loss_G_gan + loss_G_M loss_G.backward() optimizer_G.step() losses_D += loss_D losses_G += loss_G.item() if iter_count % 20 == 0: logging.info( "Iteration {} loss -- Loss D {:.4f}, " "Loss G {:.4f}, D(x) {:.4f} D(g(z)) {:.4f} / {:.4f}". format(iter_count, loss_D, loss_G, D_x, D_G_z1, D_G_z2)) iter_count += 1 logging.info("D Loss: {:.4f}, G Loss: {:.4f} at epoch {}.".format( losses_D, losses_G, epoch)) logging.info('loss_G_gan_acc={:.4f}, loss_G_M_acc={:.4f}'.format( loss_G_gan_acc, loss_G_M_acc)) losses['D'].append(losses_D) losses['G'].append(losses_G) if checkpoint: save_checkpoint(save_path, discriminator, generator, optimizer_D, optimizer_G, losses)
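save_checkpoint and load_checkpoint are referenced in the GAN trainer above but not defined in this excerpt. Hypothetical versions that match how they are called, bundling both networks, both optimizers, and the loss history into one file under save_path:

import torch
from pathlib import Path

def save_checkpoint(save_path, discriminator, generator, optimizer_D, optimizer_G, losses):
    torch.save({
        'discriminator': discriminator.state_dict(),
        'generator': generator.state_dict(),
        'optimizer_D': optimizer_D.state_dict(),
        'optimizer_G': optimizer_G.state_dict(),
        'losses': losses,
    }, Path(save_path) / 'checkpoint.pth')

def load_checkpoint(save_path, discriminator, generator, optimizer_D, optimizer_G):
    state = torch.load(Path(save_path) / 'checkpoint.pth', map_location='cpu')
    discriminator.load_state_dict(state['discriminator'])
    generator.load_state_dict(state['generator'])
    optimizer_D.load_state_dict(state['optimizer_D'])
    optimizer_G.load_state_dict(state['optimizer_G'])
    return state['losses']  # the trainer resumes from len(losses['D'])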
def build_model(self):
    x1, x2, s1, s2 = self.x1_in, self.x2_in, self.s1_in, self.s2_in
    self.x_mask = Lambda(
        lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(x1)
    x = self.bert_pretrained_model([x1, x2])

    # BiGRU + DNN
    # from https://github.com/hecongqing/CCKS2019EventEntityExtraction_Rank5/blob/master/src/SEBERT_model.py
    # l = Lambda(lambda t: t[:, -1])(x)
    # x = Add()([x, l])
    # x = Dropout(0.1)(x)
    # x = Lambda(lambda x: x[0] * x[1])([x, x_mask])
    # x = SpatialDropout1D(0.1)(x)
    # x = Bidirectional(CuDNNGRU(200, return_sequences=True))(x)
    # x = Lambda(lambda x: x[0] * x[1])([x, x_mask])
    # x = Bidirectional(CuDNNGRU(200, return_sequences=True))(x)
    # x = Lambda(lambda x: x[0] * x[1])([x, x_mask])
    # x = Dense(1024, use_bias=False, activation='tanh')(x)
    # x = Dropout(0.2)(x)
    # x = Dense(64, use_bias=False, activation='tanh')(x)
    # x = Dropout(0.2)(x)
    # x = Dense(8, use_bias=False, activation='tanh')(x)

    ps1 = Dense(1, use_bias=False)(x)
    ps1 = Lambda(lambda x: x[0][..., 0] - (1 - x[1][..., 0]) * 1e10)(
        [ps1, self.x_mask])
    ps2 = Dense(1, use_bias=False)(x)
    ps2 = Lambda(lambda x: x[0][..., 0] - (1 - x[1][..., 0]) * 1e10)(
        [ps2, self.x_mask])

    self.predict_model = Model([self.x1_in, self.x2_in], [ps1, ps2])
    train_model = Model([self.x1_in, self.x2_in, self.s1_in, self.s2_in],
                        [ps1, ps2])
    if Config.gpus > 1:
        train_model = multi_gpu_model(train_model, gpus=Config.gpus)

    def get_loss(y_true, y_pred, with_weights=False):
        weights = 0.0
        if with_weights:
            # weight each sample by the distance between the true and predicted label positions
            i_true = K.argmax(y_true, axis=1)
            i_pred = K.argmax(y_pred, axis=1)
            distance = K.abs(i_true - i_pred)
            weights = K.cast(distance, dtype='float32')
            # length = K.int_shape(y_true)[1] - 1
            # weights = K.cast(distance / length, dtype='float32')
        losses = (1.0 + weights) * K.categorical_crossentropy(
            y_true, y_pred, from_logits=True)
        # losses = (
        #     (1.0 + weights) *
        #     K.categorical_crossentropy(y_true, y_pred, from_logits=True))
        loss = K.mean(losses)
        return loss

    # loss1 = K.mean(
    #     K.categorical_crossentropy(self.s1_in, ps1, from_logits=True))
    # ps2 -= (1 - K.cumsum(s1, 1)) * 1e10
    # loss2 = K.mean(
    #     K.categorical_crossentropy(self.s2_in, ps2, from_logits=True))
    # self.loss = loss1 + loss2
    loss1 = get_loss(self.s1_in, ps1, with_weights=True)
    ps2 -= (1 - K.cumsum(s1, 1)) * 1e10
    loss2 = get_loss(self.s2_in, ps2, with_weights=True)
    self.loss = loss1 + loss2
    train_model.add_loss(self.loss)

    if 'COLAB_TPU_ADDR' in os.environ:
        train_model.compile(
            # optimizer=tf.train.RMSPropOptimizer(self.learning_rate))
            optimizer=RMSprop())
    else:
        # from accum_optimizer import AccumOptimizer
        # train_model.compile(optimizer=AccumOptimizer(
        #     Adam(self.learning_rate), steps_per_update))
        train_model.compile(optimizer=AdamW(self.learning_rate))
    train_model.summary()
    self.train_model = train_model
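At inference time predict_model emits masked start and end logits per token, with padded positions already pushed to -1e10. An illustrative decoding step (the helper name and shapes are assumptions, not part of the model code) that picks the most likely start and then the most likely end at or after it:

import numpy as np

def decode_span(ps1_logits, ps2_logits):
    # ps1_logits, ps2_logits: 1-D arrays of per-token logits for one sample
    start = int(np.argmax(ps1_logits))
    end = start + int(np.argmax(ps2_logits[start:]))  # constrain the end to lie at or after the start
    return start, end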
if args.distributed: model = nn.SyncBatchNorm.convert_sync_batchnorm(model, pg) model = model.cuda() params = model.parameters() # param_optimizer = list(model.named_parameters()) # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] # optimizer_grouped_parameters = [ # {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.0001}, # {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} # ] optimizer = AdamW(params, lr=2e-5) # SGD(params, lr=0.01, momentum=0.9) #, weight_decay=1e-4) #AdamW(params, lr=5e-4) #SGD(params, lr=0.04, momentum=0.9, weight_decay=1e-4) #params) #, nesterov=True #AdamW(params, lr=1e-4) #SGD(params, lr=0.001, momentum=0.9, weight_decay=1e-7, nesterov=True) #AdamW(params, lr=1e-3, weight_decay=0.1) #Novograd(params, lr=4e-4, weight_decay=2e-5) #AdamW(params, lr=1e-4, weight_decay=0.15) # model, optimizer = amp.initialize(model, optimizer, opt_level="O2") if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) loss_scaler = torch.cuda.amp.GradScaler() snap_to_load = 'eff6_4k_{0}_best_full2_0'.format(fold) if args.local_rank == 0: print("=> loading checkpoint '{}'".format(snap_to_load)) checkpoint = torch.load(path.join(models_folder, snap_to_load), map_location='cpu') loaded_dict = checkpoint['state_dict']
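loss_scaler is created here, but the training step that uses it lives elsewhere in the file. For reference, the usual torch.cuda.amp pattern with a GradScaler looks roughly like this (a generic sketch, not this repo's loop):

import torch

def train_step(model, batch, targets, criterion, optimizer, loss_scaler):
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():      # run the forward pass in mixed precision
        out = model(batch)
        loss = criterion(out, targets)
    loss_scaler.scale(loss).backward()   # scale the loss to avoid fp16 gradient underflow
    loss_scaler.step(optimizer)          # unscales gradients, then calls optimizer.step()
    loss_scaler.update()                 # adjust the scale factor for the next iteration
    return loss.item()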
train_data_loader = DataLoader(data_train, batch_size=batch_size, num_workers=4, shuffle=True, pin_memory=True) val_data_loader = DataLoader(val_train, batch_size=batch_size, num_workers=2, shuffle=False, pin_memory=True) model = nn.DataParallel(Dpn92_9ch_Unet()).cuda() params = model.parameters() optimizer = AdamW(params, lr=1e-4, weight_decay=1e-4) scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[4, 12, 22], gamma=0.5) loss_function = ComboLoss({ 'dice': 1.0, 'focal': 10.0 }, per_image=True).cuda() l1_loss = torch.nn.SmoothL1Loss().cuda() best_score = 0 for epoch in range(25): train_epoch(epoch, loss_function, l1_loss, model, optimizer, scheduler, train_data_loader) torch.save(
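ComboLoss mixes a dice term and a focal term with the weights given above; its implementation is not included in this excerpt. As a rough illustration, a per-image soft dice component of the kind such a loss typically contains:

import torch

def soft_dice_loss(logits, targets, eps=1e-6):
    # logits, targets: (N, 1, H, W); dice computed per image, then averaged
    probs = torch.sigmoid(logits)
    dims = (1, 2, 3)
    intersection = (probs * targets).sum(dims)
    union = probs.sum(dims) + targets.sum(dims)
    dice = (2 * intersection + eps) / (union + eps)
    return 1.0 - dice.mean()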
def train(args): model = models.myecgnet() if args.ckpt and not args.resume: state = torch.load(args.ckpt, map_location='cpu') model.load_state_dict(state['state_dict']) print('train with pretrained weight val_f1', state['f1']) model = model.to(device) train_dataset = ECGDataset(data_path=config.train_data, train=True) train_dataloader = DataLoader(train_dataset, collate_fn=my_collate_fn, batch_size=config.batch_size, shuffle=True, num_workers=8) val_dataset = ECGDataset(data_path=config.train_data, train=False) val_dataloader = DataLoader(val_dataset, batch_size=config.batch_size, num_workers=8) print("train_datasize", len(train_dataset), "val_datasize", len(val_dataset)) optimizer = AdamW(model.parameters(), lr=config.lr) w = torch.tensor(train_dataset.wc, dtype=torch.float).to(device) criterion = utils.WeightedMultilabel(w) model_save_dir = '%s/%s_%s' % (config.ckpt, config.model_name, time.strftime("%Y%m%d%H%M")) os.mkdir(model_save_dir) if args.ex: model_save_dir += args.ex best_f1 = -1 lr = config.lr start_epoch = 1 stage = 1 if args.resume: if os.path.exists(args.ckpt): model_save_dir = args.ckpt current_w = torch.load(os.path.join(args.ckpt, config.current_w)) best_w = torch.load(os.path.join(model_save_dir, config.best_w)) best_f1 = best_w['loss'] start_epoch = current_w['epoch'] + 1 lr = current_w['lr'] stage = current_w['stage'] model.load_state_dict(current_w['state_dict']) if start_epoch - 1 in config.stage_epoch: stage += 1 lr /= config.lr_decay utils.adjust_learning_rate(optimizer, lr) model.load_state_dict(best_w['state_dict']) print("=> loaded checkpoint (epoch {})".format(start_epoch - 1)) for epoch in range(start_epoch, config.max_epoch + 1): since = time.time() train_loss, train_f1 = train_epoch(model, optimizer, criterion, train_dataloader, show_interval=10) val_loss, val_f1 = val_epoch(model, criterion, val_dataloader) print( '#epoch:%03d\tstage:%d\ttrain_loss:%.4f\ttrain_f1:%.3f\tval_loss:%0.4f\tval_f1:%.3f\ttime:%s\n' % (epoch, stage, train_loss, train_f1, val_loss, val_f1, utils.print_time_cost(since))) state = { "state_dict": model.state_dict(), "epoch": epoch, "loss": val_loss, 'f1': val_f1, 'lr': lr, 'stage': stage } save_ckpt(state, best_f1 < val_f1, model_save_dir) best_f1 = max(best_f1, val_f1) if epoch in config.stage_epoch: stage += 1 lr /= config.lr_decay best_w = os.path.join(model_save_dir, config.best_w) model.load_state_dict(torch.load(best_w)['state_dict']) print("*" * 10, "step into stage%02d lr %.3ef" % (stage, lr)) utils.adjust_learning_rate(optimizer, lr)
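utils.adjust_learning_rate is called at each stage boundary but is not shown here; a minimal version consistent with how it is used above would simply write the new rate into every parameter group:

def adjust_learning_rate(optimizer, lr):
    # apply the stage-decayed learning rate to all parameter groups
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr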