                         lr=args.lr, betas=(args.momentum, args.beta),
                         weight_decay=args.wd, constraints=True)
elif args.optimizer == 'madam':
    print("using madam!")
    optimizer = Madam(net.parameters(), lr=args.lr)
elif args.optimizer == 'madamcs':
    print("using madamcs!")
    optimizer = MadamCS(net.parameters(), lr=args.lr, constraints=True)
elif args.optimizer == 'nero':
    print("using nero!")
    optimizer = Nero(net.parameters(), lr=args.lr, constraints=True)
elif args.optimizer == 'neroabl':
    print("using nero ablated!")
    optimizer = Nero_abl(net.parameters(), lr=args.lr, c1=args.c1, c2=args.c2)

train_scheduler = optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=settings.MILESTONES, gamma=args.gamma)  # learning rate decay
iter_per_epoch = len(cifar_training_loader)
warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm)

args.prefix = "seed" + str(args.seed) + args.prefix
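# Illustrative sketch (not part of the original script), showing how the two schedulers
# above are usually interleaved: `warmup_scheduler` is stepped once per batch during the
# first `args.warm` epochs, and `train_scheduler` once per epoch afterwards.  The helper
# name `example_cifar_epoch`, the `loss_function` argument, and the loop body are
# assumptions made for this example (device transfer is omitted for brevity).
def example_cifar_epoch(epoch, loss_function):
    if epoch > args.warm:
        train_scheduler.step()  # milestone decay only after warmup has finished
    for images, labels in cifar_training_loader:
        optimizer.zero_grad()
        loss = loss_function(net(images), labels)
        loss.backward()
        optimizer.step()
        if epoch <= args.warm:
            warmup_scheduler.step()  # linear warmup, stepped per batch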
class PPO():
    def __init__(self,
                 actor_critic,
                 clip_param,
                 ppo_epoch,
                 num_mini_batch,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 eps=None,
                 max_grad_norm=None,
                 use_clipped_value_loss=True,
                 optimizer='adam',
                 beta1=0.0,
                 beta2=0.999):  # betas not passed to optimizers

        self.actor_critic = actor_critic

        self.clip_param = clip_param
        self.ppo_epoch = ppo_epoch
        self.num_mini_batch = num_mini_batch

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm
        self.use_clipped_value_loss = use_clipped_value_loss

        if optimizer == 'adam':
            print("using adam optimizer!")
            self.optimizer = optim.Adam(actor_critic.parameters(), lr=lr, eps=eps, betas=(0.0, 0.999))
        elif optimizer == 'lamb':
            print("using lamb optimizer!")
            self.optimizer = Lamb(actor_critic.parameters(), lr=lr, eps=eps, betas=(0.0, 0.999))
        elif optimizer == 'sgd':
            print("using SGD optimizer!")
            self.optimizer = optim.SGD(actor_critic.parameters(), lr=lr, momentum=0.0)
        elif optimizer == 'nero':
            print("using nero optimizer!")
            self.optimizer = Nero(actor_critic.parameters(), lr=lr)

    def update(self, rollouts):
        advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

        value_loss_epoch = 0
        action_loss_epoch = 0
        dist_entropy_epoch = 0

        for e in range(self.ppo_epoch):
            if self.actor_critic.is_recurrent:
                data_generator = rollouts.recurrent_generator(
                    advantages, self.num_mini_batch)
            else:
                data_generator = rollouts.feed_forward_generator(
                    advantages, self.num_mini_batch)

            for sample in data_generator:
                obs_batch, recurrent_hidden_states_batch, actions_batch, \
                    value_preds_batch, return_batch, masks_batch, old_action_log_probs_batch, \
                    adv_targ = sample

                # Reshape to do in a single forward pass for all steps
                values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions(
                    obs_batch, recurrent_hidden_states_batch, masks_batch,
                    actions_batch)

                ratio = torch.exp(action_log_probs - old_action_log_probs_batch)
                surr1 = ratio * adv_targ
                surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                    1.0 + self.clip_param) * adv_targ
                action_loss = -torch.min(surr1, surr2).mean()

                if self.use_clipped_value_loss:
                    value_pred_clipped = value_preds_batch + \
                        (values - value_preds_batch).clamp(-self.clip_param, self.clip_param)
                    value_losses = (values - return_batch).pow(2)
                    value_losses_clipped = (value_pred_clipped - return_batch).pow(2)
                    value_loss = 0.5 * torch.max(value_losses,
                                                 value_losses_clipped).mean()
                else:
                    value_loss = 0.5 * (return_batch - values).pow(2).mean()

                self.optimizer.zero_grad()
                (value_loss * self.value_loss_coef + action_loss -
                 dist_entropy * self.entropy_coef).backward()
                nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                         self.max_grad_norm)
                self.optimizer.step()

                value_loss_epoch += value_loss.item()
                action_loss_epoch += action_loss.item()
                dist_entropy_epoch += dist_entropy.item()

        num_updates = self.ppo_epoch * self.num_mini_batch

        value_loss_epoch /= num_updates
        action_loss_epoch /= num_updates
        dist_entropy_epoch /= num_updates

        return value_loss_epoch, action_loss_epoch, dist_entropy_epoch
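# Illustrative sketch (not part of the original file): one way the PPO class above is
# typically driven from an outer training loop.  `actor_critic` and `rollouts` stand for
# the policy network and rollout storage of the surrounding codebase; their construction,
# the `rollouts.after_update()` helper, and the attribute names on `args` are assumptions
# made for this example.
def example_ppo_loop(actor_critic, rollouts, num_updates, args):
    agent = PPO(actor_critic,
                clip_param=args.clip_param,
                ppo_epoch=args.ppo_epoch,
                num_mini_batch=args.num_mini_batch,
                value_loss_coef=args.value_loss_coef,
                entropy_coef=args.entropy_coef,
                lr=args.lr,
                eps=args.eps,
                max_grad_norm=args.max_grad_norm,
                optimizer=args.optimizer)
    for j in range(num_updates):
        # ... collect a rollout with the current policy into `rollouts` and
        # compute returns, then take several clipped PPO epochs over it ...
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()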
criterion = nn.NLLLoss()

if args.optim == 'sgd':
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
elif args.optim == 'adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, betas=(args.momentum, args.beta))
elif args.optim == 'lamb':
    optimizer = Lamb(model.parameters(), lr=args.lr, betas=(args.momentum, args.beta))
elif args.optim == 'nero':
    optimizer = Nero(model.parameters(), lr=args.lr)

###############################################################################
# Training code
###############################################################################

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
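# Illustrative sketch (not part of the original file): how `repackage_hidden` is typically
# used in a truncated-BPTT language-model training step.  Detaching the hidden state stops
# gradients from flowing back beyond the current `bptt` window.  The names `get_batch`,
# `model.init_hidden`, and the model's (output, hidden) return signature are assumptions
# made for this example.
def example_lm_epoch(train_data, bptt, batch_size):
    model.train()
    hidden = model.init_hidden(batch_size)
    for i in range(0, train_data.size(0) - 1, bptt):
        data, targets = get_batch(train_data, i)
        hidden = repackage_hidden(hidden)  # detach from earlier windows
        optimizer.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()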
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu
    lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    logname = args.prefix
    if args.optimizer == 'sgd':
        logname += "SGD_"
        print("sgd")
        optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    elif args.optimizer == 'nero':
        logname += "Nero_"
        print("Nero")
        optimizer = Nero(model.parameters(), lr=args.lr, constraints=True)

    cos_sch = False
    scheduler = None
    T_max = math.ceil(1281167.0 / float(args.batch_size)) * (args.epochs)
    if args.sch == 'cos' or args.sch == 'cosine':
        print("cosine scheduler")
        cos_sch = True
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=T_max, eta_min=0.0)

    logname += args.arch + "_sch_" + str(args.sch) + "_lr" + str(args.lr) + \
               '_epoch' + str(args.epochs) + \
               "_opt_" + args.optimizer + \
               "_b" + str(args.batch_size) + \
               '_momentum' + str(args.momentum) + "_beta" + str(args.beta) + \
               '_wd' + str(args.weight_decay)

    writer = SummaryWriter(args.logdir + '/' + logname)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'], strict=False)
            optimizer.load_state_dict(checkpoint['optimizer'])
            for group in optimizer.param_groups:
                group["lr"] = args.lr
            if args.sch == 'cos':
                for i in range(checkpoint['epoch'] * math.ceil(1281167.0 / float(args.batch_size))):
                    scheduler.step()
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    lr = args.lr
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        current_lr = optimizer.param_groups[0]['lr']
        print("current learning rate: {}".format(current_lr))
        writer.add_scalar('lr', current_lr, epoch)

        # train for one epoch
        top1_train, top5_train, losses_train, batch_time_train, scheduler = train(
            train_loader, model, criterion, optimizer, epoch, args, writer, scheduler=scheduler)
        if not cos_sch:
            lr = adjust_learning_rate(optimizer, epoch, lr, lr_decay_epoch, args.lr_decay)

        writer.add_scalar('train/batch_time_mean', batch_time_train, epoch)
        writer.add_scalar('train/loss_mean', losses_train, epoch)
        writer.add_scalar('train/top1_mean', top1_train, epoch)
        writer.add_scalar('train/top5_mean', top5_train, epoch)

        # evaluate on validation set
        top1_val, top5_val, losses_val, batch_time_val = validate(
            val_loader, model, criterion, args, epoch, writer)

        writer.add_scalar('val/batch_time_mean', batch_time_val, epoch)
        writer.add_scalar('val/loss_mean', losses_val, epoch)
        writer.add_scalar('val/top1_mean', top1_val, epoch)
        writer.add_scalar('val/top5_mean', top5_val, epoch)

        # remember best acc@1 and save checkpoint
        is_best = top1_val > best_acc1
        best_acc1 = max(top1_val, best_acc1)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % ngpus_per_node == 0):
            if (epoch + 1) % 5 == 0:
                save_checkpoint({
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best, args.logdir + '/' + logname + '/epoch' + str(epoch + 1) + '_checkpoint.pth.tar')

    writer.close()
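# Illustrative sketch (not part of the original file): the `adjust_learning_rate` helper
# called in the epoch loop above is not shown in this excerpt.  A step-decay rule that is
# consistent with its call signature might look as follows; the exact rule used by the
# original script is an assumption here.
def example_adjust_learning_rate(optimizer, epoch, lr, lr_decay_epoch, lr_decay):
    # multiply the learning rate by `lr_decay` at each milestone epoch
    if epoch in lr_decay_epoch:
        lr = lr * lr_decay
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
    return lr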
                     lr=args.initial_lr, betas=(0.0, 0.999))
    optD = optim.Adam(netD.parameters(), lr=args.initial_lr, betas=(0.9, 0.999))
elif args.optim == 'lamb':
    optG = Lamb(netG.parameters(), lr=args.initial_lr, betas=(0.0, 0.999))
    optD = Lamb(netD.parameters(), lr=args.initial_lr, betas=(0.0, 0.999))
elif args.optim == 'sgd':
    optG = optim.SGD(netG.parameters(), lr=args.initial_lr, momentum=0.0)
    optD = optim.SGD(netD.parameters(), lr=args.initial_lr, momentum=0.0)
elif args.optim == 'nero':
    optG = Nero(netG.parameters(), lr=args.initial_lr)
    optD = Nero(netD.parameters(), lr=args.initial_lr)
else:
    raise Exception("Unsupported optim")

#########################################
#### Train ##############################
#########################################

def train():
    print("Training...")

    netG.train()
    netD.train()
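# Illustrative sketch (not part of the original file; the remainder of train() is not
# shown in this excerpt).  It shows a typical alternating discriminator/generator update
# driven by the optimizer pairs created above.  The names `d_criterion`, `g_criterion`,
# and `sample_noise` are assumptions made for this example.
def example_gan_step(real, d_criterion, g_criterion, sample_noise):
    # discriminator step: score real data against detached fake data
    optD.zero_grad()
    fake = netG(sample_noise()).detach()
    d_loss = d_criterion(netD(real), netD(fake))
    d_loss.backward()
    optD.step()

    # generator step: push the discriminator's score for fresh fake data up
    optG.zero_grad()
    g_loss = g_criterion(netD(netG(sample_noise())))
    g_loss.backward()
    optG.step()

    return d_loss.item(), g_loss.item()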