def test_dynamics_gradients(self):
    cfg = get_cfg_defaults()
    cfg.merge_from_file("/home/aleksi/Workspace/Model-Based-RL/configs/swimmer.yaml")
    agent = build_agent(cfg)
    mj_dynamics_forward_fn = agent.forward_factory("dynamics")
    mj_dynamics_gradients_fn = agent.gradient_factory("dynamics")

    # Warm up the simulation for a few steps with random actions
    nwarmup = 5
    agent.reset()
    nv = agent.sim.model.nv
    nu = agent.sim.model.nu
    for _ in range(nwarmup):
        action = torch.Tensor(agent.action_space.sample())
        ob, r, _, _ = agent.step(action)

    state = ob.detach().numpy()
    action = agent.action_space.sample()
    state_action = np.concatenate([state, action], axis=0)

    # Analytic Jacobian of the next state w.r.t. [state, action]
    dsdsa = mj_dynamics_gradients_fn(state_action)
    dsds = dsdsa[:, :nv * 2]
    dsda = dsdsa[:, -nu:]

    # Perturb every input dimension by eps and compare the true next state against
    # a first-order Taylor expansion built from the analytic Jacobian
    eps = 1e-6
    state_action_prime = state_action + eps

    s = mj_dynamics_forward_fn(state_action)
    s_prime = mj_dynamics_forward_fn(state_action_prime)
    s_prime_estimate = s + \
        np.squeeze(np.matmul(dsds, np.array([eps] * 2 * nv).reshape([-1, 1]))) + \
        np.squeeze(np.matmul(dsda, np.array([eps] * nu).reshape([-1, 1])))

    print(s)
    print(s_prime)
    print(s_prime_estimate)
    self.assertTrue(np.allclose(s_prime, s_prime_estimate))
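
# Illustration (not part of the test suite): the check above is a first-order Taylor
# expansion, f(x + eps) ~= f(x) + J(x) @ (eps * 1). Below is a minimal self-contained
# sketch of the same idea with a toy quadratic function standing in for the MuJoCo
# dynamics; _toy_forward and _toy_jacobian are hypothetical names, not repository code.
def _finite_difference_check_example():
    import numpy as np

    def _toy_forward(x):
        # stand-in for mj_dynamics_forward_fn
        return np.array([x[0] ** 2 + x[1], 3.0 * x[1] * x[2]])

    def _toy_jacobian(x):
        # analytic Jacobian of _toy_forward, stand-in for mj_dynamics_gradients_fn
        return np.array([[2.0 * x[0], 1.0, 0.0],
                         [0.0, 3.0 * x[2], 3.0 * x[1]]])

    x = np.array([0.5, -1.0, 2.0])
    eps = 1e-6
    lhs = _toy_forward(x + eps)
    rhs = _toy_forward(x) + _toy_jacobian(x) @ (eps * np.ones_like(x))
    assert np.allclose(lhs, rhs)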
def test_reward_gradients(self):
    cfg = get_cfg_defaults()
    cfg.MUJOCO.ENV = "InvertedPendulumEnv"
    agent = build_agent(cfg)
    mj_reward_forward_fn = agent.forward_factory("reward")
    mj_reward_gradients_fn = agent.gradient_factory("reward")

    # Warm up the simulation for a few steps with random actions
    nwarmup = 5
    agent.reset()
    nv = agent.sim.model.nv
    nu = agent.sim.model.nu
    for _ in range(nwarmup):
        action = torch.Tensor(agent.action_space.sample())
        ob, r, _, _ = agent.step(action)

    state = ob.detach().numpy()
    action = agent.action_space.sample()
    state_action = np.concatenate([state, action], axis=0)

    # Analytic gradient of the reward w.r.t. [state, action]
    drdsa = mj_reward_gradients_fn(state_action)
    drds = drdsa[:, :nv * 2]
    drda = drdsa[:, -nu:]

    # Perturb every input dimension by eps and compare the true reward against
    # a first-order Taylor expansion built from the analytic gradient
    eps = 1e-6
    state_action_prime = state_action + eps

    r = mj_reward_forward_fn(state_action)
    r_prime = mj_reward_forward_fn(state_action_prime)
    r_prime_estimate = r + \
        np.squeeze(np.matmul(drds, np.array([eps] * 2 * nv).reshape([-1, 1]))) + \
        np.squeeze(np.matmul(drda, np.array([eps] * nu).reshape([-1, 1])))

    self.assertAlmostEqual(r_prime[0], r_prime_estimate[0], places=5)
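
# A stricter variant of the checks above (a sketch, not used by the tests): perturb one
# input dimension at a time with central differences and compare each column of the
# analytic Jacobian separately, instead of adding eps to every dimension at once.
# numerical_jacobian is a hypothetical helper, not part of this repository.
def numerical_jacobian(f, x, eps=1e-6):
    """Central-difference Jacobian of f at x, where f maps R^n -> R^m."""
    import numpy as np
    f0 = np.atleast_1d(f(x))
    jac = np.zeros((f0.shape[0], x.shape[0]))
    for i in range(x.shape[0]):
        dx = np.zeros_like(x)
        dx[i] = eps
        jac[:, i] = (np.atleast_1d(f(x + dx)) - np.atleast_1d(f(x - dx))) / (2.0 * eps)
    return jac

# Possible usage, assuming the forward functions accept arbitrary state-action vectors:
#   np.allclose(numerical_jacobian(mj_reward_forward_fn, state_action), drdsa, atol=1e-4)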
def do_training(cfg, logger, output_results_dir, output_rec_dir, output_weights_dir):

    # Build the agent
    agent = build_agent(cfg)

    # Build a forward dynamics model
    dynamics_model = DynamicsModel(agent)

    # Set mode to training (aside from policy output, matters for Dropout, BatchNorm, etc.)
    dynamics_model.train()

    # Set up visdom
    if cfg.LOG.PLOT.ENABLED:
        visdom = VisdomLogger(cfg.LOG.PLOT.DISPLAY_PORT)
        visdom.register_keys([
            'total_loss', 'average_sd', 'average_action', "reinforce_loss",
            "objective_loss", "sd", "action_grad", "sd_grad", "actions"
        ])

    # wrap screen recorder if testing mode is on
    if cfg.LOG.TESTING.ENABLED:
        if cfg.LOG.PLOT.ENABLED:
            visdom.register_keys(['test_reward'])

    # Collect losses here
    output = {"epoch": [], "objective_loss": []}

    # Start training
    for epoch_idx in range(cfg.MODEL.EPOCHS):

        batch_loss = torch.empty(cfg.MODEL.BATCH_SIZE,
                                 cfg.MODEL.POLICY.MAX_HORIZON_STEPS,
                                 dtype=torch.float64)
        batch_loss.fill_(np.nan)

        for episode_idx in range(cfg.MODEL.BATCH_SIZE):

            # Generate "random walk" set of actions (separately for each dimension)
            action = torch.zeros(agent.action_space.shape, dtype=torch.float64)
            #actions = np.zeros((agent.action_space.shape[0], cfg.MODEL.POLICY.MAX_HORIZON_STEPS))
            actions = []

            initial_state = torch.Tensor(agent.reset())
            predicted_states = []
            real_states = []
            corrections = []

            for step_idx in range(cfg.MODEL.POLICY.MAX_HORIZON_STEPS):

                # Generate random actions
                action = action + 0.1 * (2 * torch.rand(agent.action_space.shape) - 1)

                # Clamp to [-1, 1]
                action.clamp_(-1, 1)

                # Save action
                actions.append(action)

                previous_state = torch.from_numpy(agent.unwrapped._get_obs())

                # Advance the actual simulation
                next_state, _, _, _ = agent.step(action)
                next_state = torch.from_numpy(next_state)
                real_states.append(next_state)

                # Advance with learned dynamics simulation
                pred_next_state = dynamics_model(previous_state.float(), action.float()).double()

                batch_loss[episode_idx, step_idx] = torch.pow(next_state - pred_next_state, 2).mean()

                #if agent.is_done:
                #    break

        #dot = torchviz.make_dot(pred_next_state, params=dict(dynamics_model.named_parameters()))

        loss = torch.sum(batch_loss)
        dynamics_model.optimizer.zero_grad()
        loss.backward()
        dynamics_model.optimizer.step()

        output["objective_loss"].append(loss.detach().numpy())
        output["epoch"].append(epoch_idx)

        if epoch_idx % cfg.LOG.PERIOD == 0:
            if cfg.LOG.PLOT.ENABLED:
                visdom.update({"total_loss": loss.detach().numpy()})
                visdom.set({'actions': torch.stack(actions).detach().numpy()})
                #visdom.set({'total_loss': loss["total_loss"].transpose()})
                #visdom.update({'average_grad': np.log(torch.mean(model.policy_net.mean._layers["linear_layer_0"].weight.grad.abs()).detach().numpy())})

            logger.info("REWARD: \t\t{} (iteration {})".format(
                loss.detach().numpy(), epoch_idx))

        if cfg.LOG.PLOT.ENABLED and epoch_idx % cfg.LOG.PLOT.ITER_PERIOD == 0:
            visdom.do_plotting()

        # if epoch_idx % cfg.LOG.CHECKPOINT_PERIOD == 0:
        #     torch.save(model.state_dict(),
        #                os.path.join(output_weights_dir, 'iter_{}.pth'.format(epoch_idx)))

        if False:  #cfg.LOG.TESTING.ENABLED:
            if epoch_idx % cfg.LOG.TESTING.ITER_PERIOD == 0:

                # Record if required
                agent.start_recording(
                    os.path.join(output_rec_dir, "iter_{}.mp4".format(epoch_idx)))

                test_rewards = []
                for _ in range(cfg.LOG.TESTING.COUNT_PER_ITER):
                    test_reward = do_testing(
                        cfg, model, agent,
                        # first_state=state_xr.get_item(),
                    )
                    test_rewards.append(test_reward)

                # Set training mode on again
                model.train()

                # Close the recorder
                agent.stop_recording()

    # Save outputs into log folder
    lg.save_dict_into_csv(output_results_dir, "output", output)

    # Save model
    torch.save(dynamics_model.state_dict(),
               os.path.join(output_weights_dir, "final_weights.pt"))
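
# A minimal driver sketch for the dynamics-model trainer above. The config path, logger
# setup, and output directory names are assumptions for illustration; the repository's
# real entry point may wire these up differently.
def _train_dynamics_example():
    import logging
    import os

    cfg = get_cfg_defaults()
    cfg.merge_from_file("configs/swimmer.yaml")  # hypothetical relative config path

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("dynamics_training")

    output_root = "outputs/dynamics_example"  # assumed layout
    results_dir = os.path.join(output_root, "results")
    rec_dir = os.path.join(output_root, "recordings")
    weights_dir = os.path.join(output_root, "weights")
    for d in (results_dir, rec_dir, weights_dir):
        os.makedirs(d, exist_ok=True)

    do_training(cfg, logger, results_dir, rec_dir, weights_dir)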
def do_training(cfg, logger, output_results_dir, output_rec_dir, output_weights_dir, iter):

    if cfg.MODEL.RANDOM_SEED > 0:
        np.random.seed(cfg.MODEL.RANDOM_SEED + iter)
        torch.manual_seed(cfg.MODEL.RANDOM_SEED + iter)

    # Build the agent
    agent = build_agent(cfg)

    # Build the model
    model = build_model(cfg, agent)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    # Set mode to training (aside from policy output, matters for Dropout, BatchNorm, etc.)
    model.train()

    # Set up visdom
    if cfg.LOG.PLOT.ENABLED:
        visdom = VisdomLogger(cfg.LOG.PLOT.DISPLAY_PORT)
        visdom.register_keys([
            'total_loss', 'average_sd', 'average_action', "reinforce_loss",
            "objective_loss", "sd", "action_grad", "sd_grad", "average_grad"
        ])
        for action_idx in range(model.policy_net.action_dim):
            visdom.register_keys(["action_" + str(action_idx)])

    # wrap screen recorder if testing mode is on
    if cfg.LOG.TESTING.ENABLED:
        if cfg.LOG.PLOT.ENABLED:
            visdom.register_keys(['test_reward'])

    # Collect losses here
    output = {"epoch": [], "objective_loss": [], "average_sd": []}

    # Start training
    for epoch_idx in range(cfg.MODEL.EPOCHS):

        batch_loss = torch.empty(cfg.MODEL.BATCH_SIZE,
                                 cfg.MODEL.POLICY.MAX_HORIZON_STEPS,
                                 dtype=torch.float64)
        batch_loss.fill_(np.nan)

        for episode_idx in range(cfg.MODEL.BATCH_SIZE):

            initial_state = torch.DoubleTensor(agent.reset())
            states = []
            states.append(initial_state)
            #grads = np.zeros((cfg.MODEL.POLICY.MAX_HORIZON_STEPS, 120))

            for step_idx in range(cfg.MODEL.POLICY.MAX_HORIZON_STEPS):

                state, reward = model(states[step_idx])
                batch_loss[episode_idx, step_idx] = -reward
                #(-reward).backward(retain_graph=True)
                #grads[step_idx, :] = model.policy_net.optimizer.mean.grad.detach().numpy()
                #grads[step_idx, step_idx+1:40] = np.nan
                #grads[step_idx, 40+step_idx+1:80] = np.nan
                #grads[step_idx, 80+step_idx+1:] = np.nan
                #model.policy_net.optimizer.optimizer.zero_grad()
                states.append(state)

                if agent.is_done:
                    break

            agent.running_sum = 0

        loss = model.policy_net.optimize(batch_loss)
        #zero = np.abs(grads) < 1e-9
        #grads[zero] = np.nan
        #medians = np.nanmedian(grads, axis=0)
        #model.policy_net.optimizer.mean.grad.data = torch.from_numpy(medians)
        #torch.nn.utils.clip_grad_value_([model.policy_net.optimizer.mean, model.policy_net.optimizer.sd], 1)
        #model.policy_net.optimizer.optimizer.step()
        #model.policy_net.optimizer.optimizer.zero_grad()
        #loss = {'objective_loss': torch.sum(batch_loss, dim=1).mean().detach().numpy()}

        output["objective_loss"].append(loss["objective_loss"])
        output["epoch"].append(epoch_idx)
        output["average_sd"].append(np.mean(model.policy_net.get_clamped_sd()))

        if epoch_idx % cfg.LOG.PERIOD == 0:
            if cfg.LOG.PLOT.ENABLED:
                visdom.update(loss)
                #visdom.set({'total_loss': loss["total_loss"].transpose()})
                clamped_sd = model.policy_net.get_clamped_sd()
                clamped_action = model.policy_net.get_clamped_action()
                #visdom.update({'average_grad': np.log(torch.mean(model.policy_net.mean._layers["linear_layer_0"].weight.grad.abs()).detach().numpy())})
                if len(clamped_sd) > 0:
                    visdom.update({'average_sd': np.mean(clamped_sd, axis=1)})
                visdom.update({
                    'average_action': np.mean(clamped_action, axis=(1, 2)).squeeze()
                })
                for action_idx in range(model.policy_net.action_dim):
                    visdom.set({
                        'action_' + str(action_idx): clamped_action[action_idx, :, :]
                    })
                if clamped_sd is not None:
                    visdom.set({'sd': clamped_sd.transpose()})
                # visdom.set({'action_grad': model.policy_net.mean.grad.detach().numpy().transpose()})

            logger.info("REWARD: \t\t{} (iteration {})".format(
                loss["objective_loss"], epoch_idx))

        if cfg.LOG.PLOT.ENABLED and epoch_idx % cfg.LOG.PLOT.ITER_PERIOD == 0:
            visdom.do_plotting()

        if epoch_idx % cfg.LOG.CHECKPOINT_PERIOD == 0:
            torch.save(
                model.state_dict(),
                os.path.join(output_weights_dir, 'iter_{}.pth'.format(epoch_idx)))

        if cfg.LOG.TESTING.ENABLED:
            if epoch_idx % cfg.LOG.TESTING.ITER_PERIOD == 0:

                # Record if required
                agent.start_recording(
                    os.path.join(output_rec_dir, "iter_{}_{}.mp4".format(iter, epoch_idx)))

                test_rewards = []
                for _ in range(cfg.LOG.TESTING.COUNT_PER_ITER):
                    test_reward = do_testing(
                        cfg, model, agent,
                        # first_state=state_xr.get_item(),
                    )
                    test_rewards.append(test_reward)

                # Set training mode on again
                model.train()

                # Close the recorder
                agent.stop_recording()

    # Save outputs into log folder
    lg.save_dict_into_csv(output_results_dir, "output_{}".format(iter), output)

    # Return the agent
    return agent
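
# Sketch of how the seeded trainer above might be looped over repetitions; the helper
# name and the num_repetitions parameter are assumptions for illustration. Each
# repetition re-seeds numpy/torch with cfg.MODEL.RANDOM_SEED + rep (see the top of
# do_training) and writes its own "output_{rep}" results file.
def _run_repetitions_example(cfg, logger, results_dir, rec_dir, weights_dir, num_repetitions=5):
    agents = []
    for rep in range(num_repetitions):
        agents.append(do_training(cfg, logger, results_dir, rec_dir, weights_dir, rep))
    return agents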
def run_reward_test(self, cfg_file, sigma):
    cfg = get_cfg_defaults()
    cfg.merge_from_file(cfg_file)
    agent = build_agent(cfg)
    mj_forward_fn = agent.forward_factory("dynamics")
    mj_gradients_fn = agent.gradient_factory("reward")
    model = build_model(cfg, agent)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    # Start from the same state with constant action, make sure reward is equal in both repetitions

    # Drive both simulations forward 5 steps
    nwarmup = 5

    # Reset and get initial state
    agent.reset()
    init_qpos = agent.data.qpos.copy()
    init_qvel = agent.data.qvel.copy()

    # Set constant action
    na = agent.model.actuator_acc0.shape[0]
    action = torch.DoubleTensor(np.random.randn(na) * sigma)

    # Do first simulation
    for _ in range(nwarmup):
        mj_forward_fn(action)

    # Take a snapshot of this state so we can use it in gradient calculations
    agent.data.ctrl[:] = action.detach().numpy().copy()
    data = agent.get_snapshot()

    # Advance simulation with one step and get the reward
    mj_forward_fn(action)
    reward = agent.reward.copy()
    next_state = np.concatenate((agent.data.qpos.copy(), agent.data.qvel.copy()))

    # Reset and set to initial state, then do the second simulation; this time call mj_forward_fn without args
    agent.reset()
    agent.data.qpos[:] = init_qpos
    agent.data.qvel[:] = init_qvel
    for _ in range(nwarmup):
        agent.data.ctrl[:] = action.detach().numpy()
        mj_forward_fn()

    # Advance simulation with one step and get reward
    agent.data.ctrl[:] = action.detach().numpy()
    mj_forward_fn()
    reward2 = agent.reward.copy()

    # reward and reward2 should be equal
    self.assertEqual(reward, reward2, "Simulations from same initial state diverged")

    # Then make sure simulation from snapshot doesn't diverge from original simulation
    agent.set_snapshot(data)
    mj_forward_fn()
    reward_snapshot = agent.reward.copy()
    self.assertEqual(reward, reward_snapshot, "Simulation from snapshot diverged")

    # Make sure simulations are correct in the gradient calculations as well
    mj_gradients_fn(data, next_state, reward, test=True)
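
# Hypothetical concrete test methods built on the helper above; the config paths and
# sigma values are illustrative assumptions, not taken from the repository.
def test_swimmer_reward(self):
    self.run_reward_test("configs/swimmer.yaml", sigma=0.1)

def test_inverted_pendulum_reward(self):
    self.run_reward_test("configs/inverted_pendulum.yaml", sigma=0.5)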