def forward(self, x):
    b = x.ndimension() > 1
    if not b:
        x = x.unsqueeze(0)
    assert x.ndimension() == 2
    nbatch = x.size(0)

    y_init = torch.zeros(nbatch, self.Enet.n_out, device=x.device,
                         requires_grad=True)
    if self.init_scheme == 'gt':
        y_init = (x * torch.sin(x)).clone()
    # Re-wrap the initial guess as a leaf tensor so the inner optimizer treats
    # it as the variable to optimize (modern replacement for the deprecated
    # Variable wrapper used previously).
    y = y_init.detach().clone().requires_grad_(True)

    inner_opt = higher.get_diff_optim(torch.optim.SGD([y], lr=self.inner_lr),
                                      [y], device=x.device)
    for _ in range(self.n_inner_iter):
        E = self.Enet(x, y)
        E = torch.square(E)
        y, = inner_opt.step(E.sum(), params=[y])
    return y
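# A minimal, self-contained sketch (not repository code) of the pattern used in
# forward() above: higher.get_diff_optim wraps a plain SGD optimizer, and each
# differentiable step() returns a *new* tensor for y, so the unrolled inner
# loop stays connected to the energy network's parameters and an outer loss can
# backpropagate through it. The toy Enet, x, and shapes below are assumptions.
import torch
import higher

Enet = torch.nn.Sequential(torch.nn.Linear(2, 16), torch.nn.Tanh(),
                           torch.nn.Linear(16, 1))
x = torch.randn(4, 1)
y = torch.zeros(4, 1, requires_grad=True)
inner_opt = higher.get_diff_optim(torch.optim.SGD([y], lr=0.1), [y],
                                  device=x.device)
for _ in range(5):
    E = Enet(torch.cat([x, y], dim=-1))       # energy of the current guess
    y, = inner_opt.step(E.sum(), params=[y])  # differentiable SGD step on y
outer_loss = (y - x.sin()).pow(2).mean()      # any downstream loss on the argmin
outer_loss.backward()                         # reaches Enet through the unroll
assert Enet[0].weight.grad is not None        # second-order path exists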
def ddpg_update(config, fnet_actor, diff_act_opt, models, optimizers,
                memory_cache, update_type='meta'):
    summed_policy_loss = torch.zeros(1)
    summed_value_loss = torch.zeros(1)

    # Differentiable critic optimizer; no higher-order gradients are tracked
    # through the critic updates.
    diff_crit_opt = higher.get_diff_optim(optimizers['critic_opt'],
                                          models['critic'].parameters(),
                                          track_higher_grads=False)
    # Note: this internal actor optimizer is not used below; the passed-in
    # diff_act_opt performs the actor step.
    diff_act_opt_internal = higher.create_diff_optim(
        torch.optim.SGD, fmodel=fnet_actor, track_higher_grads=True,
        opt_kwargs={'lr': config.actor_rl_learning_rate, 'momentum': 0.01})

    for it in range(config.offpol_num_iterations_update):
        states, next_states, actions_init, rewards, dones, _ = \
            get_shaped_memory_sample(config, memory_cache)
        inverted_dones = 1 - dones
        rewards = rewards.view(-1, 1)
        inverted_dones = inverted_dones.view(-1, 1)

        target_Q = models['critic_target'](next_states,
                                           models['actor_target'](next_states))
        target_Q = rewards + (inverted_dones * config.discount_factor * target_Q).detach()
        current_Q = models['critic'](states, actions_init)

        # Anomaly detection left enabled as a debugging aid.
        torch.autograd.set_detect_anomaly(True)
        critic_loss = F.mse_loss(current_Q, target_Q)
        diff_crit_opt.step(critic_loss, models['critic'].parameters())
        summed_value_loss += critic_loss

        actor_loss = -models['critic'](states, fnet_actor(states)).mean()
        # Optimize the actor functionally.
        diff_act_opt.step(actor_loss)
        summed_policy_loss += actor_loss

        # Update the frozen target models.
        # Critic is as before.
        for param, target_param in zip(models['critic'].parameters(),
                                       models['critic_target'].parameters()):
            target_param.data.copy_(
                config.offpol_target_update_rate * param.data
                + (1 - config.offpol_target_update_rate) * target_param.data)

        if fnet_actor.dim != models['actor_target'].dim:
            # print('Changing target actor to resemble fnet_actor')
            models['actor_target'] = SequentialActor(
                config, fnet_actor.state_scaler, fnet_actor.state_normalizer,
                dim=fnet_actor.dim,
                num_layers=fnet_actor.num_layers).reset_weights()

        # For the actor, a slightly more unwieldy approach: copy the functional
        # actor's fast weights into the target network.
        for param, target_param in zip(fnet_actor.parameters(),
                                       models['actor_target'].parameters()):
            target_param.data.copy_(
                config.offpol_target_update_rate * param.data
                + (1 - config.offpol_target_update_rate) * target_param.data)

    return summed_policy_loss, summed_value_loss.item()
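# Hypothetical caller sketch (an assumption, not repository code): ddpg_update()
# expects the functional actor and its differentiable optimizer to be built
# outside, e.g. with higher.innerloop_ctx, so that the returned policy loss can
# be backpropagated to the actor's initial (meta) parameters. The names
# models['actor'], optimizers['actor_opt'], and meta_opt are placeholders.
with higher.innerloop_ctx(models['actor'], optimizers['actor_opt'],
                          copy_initial_weights=False) as (fnet_actor, diff_act_opt):
    policy_loss, value_loss = ddpg_update(config, fnet_actor, diff_act_opt,
                                          models, optimizers, memory_cache)
    meta_opt.zero_grad()
    policy_loss.backward()  # differentiates through the unrolled actor updates
    meta_opt.step()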
def forward(self, x):
    b = x.ndimension() > 1
    if not b:
        x = x.unsqueeze(0)
    assert x.ndimension() == 2
    nbatch = x.size(0)

    y = torch.zeros(nbatch, self.Enet.n_out, device=x.device,
                    requires_grad=True)
    inner_opt = higher.get_diff_optim(torch.optim.SGD([y], lr=self.inner_lr),
                                      [y], device=x.device)
    for _ in range(self.n_inner_iter):
        E = self.Enet(x, y)
        y, = inner_opt.step(E.sum(), params=[y])
    return y
def solve(self, xinit):
    assert xinit.ndimension() == 2
    nbatch = xinit.size(0)

    z = torch.zeros(nbatch, self.latent_size, device=xinit.device,
                    requires_grad=True)
    inner_opt = higher.get_diff_optim(
        torch.optim.SGD([z], lr=self.inner_optim_opts.lr),
        [z], device=xinit.device)

    f_emb = self.get_cost_f(xinit)
    for _ in range(self.inner_optim_opts.n_iter):
        cost = f_emb(z)
        z, = inner_opt.step(cost.sum(), params=[z])

    us = self.decode(z)
    rews, xs = rew_nominal(self.dx, xinit, us)
    cost = -rews
    return z, cost
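# Hypothetical outer-loop sketch for solve() above (an assumption, not
# repository code): because z is refined with a differentiable optimizer, the
# returned cost remains a function of the modules behind get_cost_f() and
# decode(), so it can be trained on directly, assuming rew_nominal is itself
# differentiable. `model`, `outer_opt`, and `xinit_batch` are placeholders.
z, cost = model.solve(xinit_batch)
outer_opt.zero_grad()
cost.sum().backward()  # gradients reach the cost embedding and decoder weights
outer_opt.step()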
def forward(self, x):
    assert x.ndimension() == 4
    nbatch = x.size(0)

    # Make an initial guess of the labels.
    # For more sophisticated tasks this could also be learned.
    y = torch.zeros(nbatch, self.n_cls, device=x.device, requires_grad=True)

    # Define a differentiable optimizer to update the label with.
    inner_opt = higher.get_diff_optim(torch.optim.SGD([y], lr=1e-1),
                                      [y], device=x.device)

    # Take a few gradient steps to find the labels that
    # optimize the energy function.
    for _ in range(self.n_inner_iter):
        E = self.Enet(x, y)
        y, = inner_opt.step(E.sum(), params=[y])
    return y
def td3_update(config, fnet_actor, actor_rl_opt, models, optimizers,
               memory_cache, update_type='meta'):
    summed_policy_loss = torch.zeros(1)
    summed_value_loss = torch.zeros(1)

    optimizers['critic_opt'].zero_grad()
    # Differentiable optimizers for the two critics; no higher-order gradients
    # are tracked through the critic updates.
    diff_crit_opt = higher.get_diff_optim(optimizers['critic_opt'],
                                          models['critic'].parameters(),
                                          track_higher_grads=False)
    diff_crit_opt_2 = higher.get_diff_optim(optimizers['critic_opt_2'],
                                            models['critic_2'].parameters(),
                                            track_higher_grads=False)
    # Differentiable actor optimizer operating on the functional actor's fast
    # weights; higher-order gradients are tracked for the meta-update.
    diff_act_opt_internal = higher.create_diff_optim(
        torch.optim.SGD, fmodel=fnet_actor, track_higher_grads=True,
        opt_kwargs={'lr': config.actor_rl_learning_rate, 'momentum': 0.01})

    # Initially attempted an alternate structure with functional critics;
    # abandoned as it was found to be unnecessary.
    # with higher.innerloop_ctx(models['critic'], optimizers['critic_opt'],
    #                           copy_initial_weights=False) as (fnet_critic, diff_crit_opt):
    #     with higher.innerloop_ctx(models['critic_2'], optimizers['critic_opt_2'],
    #                               copy_initial_weights=False) as (fnet_critic_2, diff_crit_opt_2):

    for it in range(config.offpol_num_iterations_update):
        states, next_states, actions_init, rewards, dones, _ = \
            get_shaped_memory_sample(config, memory_cache)
        inverted_dones = 1 - dones
        rewards = rewards.view(-1, 1)
        inverted_dones = inverted_dones.view(-1, 1)

        # Target policy smoothing: clipped Gaussian noise on the target action.
        noise = torch.FloatTensor(actions_init).data.normal_(0, 0.2)
        noise = noise.clamp(-0.5, 0.5)
        next_action = (models['actor_target'](next_states) + noise).clamp(
            -config.action_space_high[0], config.action_space_high[0])

        # Clipped double-Q target.
        target_Q1 = models['critic_target'](next_states, next_action)
        target_Q2 = models['critic_target_2'](next_states, next_action)
        target_Q = torch.min(target_Q1, target_Q2)
        target_Q = rewards + (inverted_dones * config.discount_factor * target_Q).detach()

        current_Q1 = models['critic'](states, actions_init)
        current_Q2 = models['critic_2'](states, actions_init)

        # Anomaly detection left enabled as a debugging aid.
        torch.autograd.set_detect_anomaly(True)
        critic_loss_1 = F.mse_loss(current_Q1, target_Q)
        critic_loss_2 = F.mse_loss(current_Q2, target_Q)
        diff_crit_opt.step(critic_loss_1, models['critic'].parameters())
        diff_crit_opt_2.step(critic_loss_2, models['critic_2'].parameters())
        summed_value_loss += critic_loss_1 + critic_loss_2

        # Delayed policy update: the actor and targets are updated every other
        # critic update.
        if it % 2 == 0:
            actor_loss = -models['critic'](states, fnet_actor(states)).mean()
            # Optimize the actor functionally.
            diff_act_opt_internal.step(actor_loss)
            summed_policy_loss += actor_loss

            # Update the frozen target models.
            # Critic is as before.
            for param, target_param in zip(models['critic'].parameters(),
                                           models['critic_target'].parameters()):
                target_param.data.copy_(
                    config.offpol_target_update_rate * param.data
                    + (1 - config.offpol_target_update_rate) * target_param.data)
            for param, target_param in zip(models['critic_2'].parameters(),
                                           models['critic_target_2'].parameters()):
                target_param.data.copy_(
                    config.offpol_target_update_rate * param.data
                    + (1 - config.offpol_target_update_rate) * target_param.data)

            if fnet_actor.dim != models['actor_target'].dim:
                # print('Changing target actor to resemble fnet_actor')
                models['actor_target'] = SequentialActor(
                    config, fnet_actor.state_scaler, fnet_actor.state_normalizer,
                    dim=fnet_actor.dim,
                    num_layers=fnet_actor.num_layers).reset_weights()

            # For the actor, a slightly more unwieldy approach: copy the
            # functional actor's fast weights into the target network.
            for param, target_param in zip(fnet_actor.parameters(),
                                           models['actor_target'].parameters()):
                target_param.data.copy_(
                    config.offpol_target_update_rate * param.data
                    + (1 - config.offpol_target_update_rate) * target_param.data)

    return summed_policy_loss, summed_value_loss.item()