    def test_dynamics_gradients(self):
        cfg = get_cfg_defaults()
        cfg.merge_from_file("/home/aleksi/Workspace/Model-Based-RL/configs/swimmer.yaml")
        agent = build_agent(cfg)
        mj_dynamics_forward_fn = agent.forward_factory("dynamics")
        mj_dynamics_gradients_fn = agent.gradient_factory("dynamics")

        nwarmup = 5
        agent.reset()
        nv = agent.sim.model.nv
        nu = agent.sim.model.nu
        for _ in range(nwarmup):
            action = torch.Tensor(agent.action_space.sample())
            ob, r, _, _ = agent.step(action)
        state = ob.detach().numpy()
        action = agent.action_space.sample()
        state_action = np.concatenate([state, action], axis=0)
        dsdsa = mj_dynamics_gradients_fn(state_action)
        dsds = dsdsa[:, :nv * 2]
        dsda = dsdsa[:, -nu:]

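        # First-order (Taylor) check of the MuJoCo gradients: perturb every
        # state and action component by eps and compare the simulated next
        # state against s + dsds @ (eps * ones(2*nv)) + dsda @ (eps * ones(nu)).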
        eps = 1e-6
        state_action_prime = state_action + eps
        s = mj_dynamics_forward_fn(state_action)
        s_prime = mj_dynamics_forward_fn(state_action_prime)

        s_prime_estimate = s + \
                           np.squeeze(np.matmul(dsds, np.array([eps] * 2 * nv).reshape([-1, 1]))) + \
                           np.squeeze(np.matmul(dsda, np.array([eps] * nu).reshape([-1, 1])))
        print(s)
        print(s_prime)
        print(s_prime_estimate)
        self.assertTrue(np.allclose(s_prime, s_prime_estimate))

    def test_reward_gradients(self):
        cfg = get_cfg_defaults()
        cfg.MUJOCO.ENV = "InvertedPendulumEnv"
        agent = build_agent(cfg)
        mj_reward_forward_fn = agent.forward_factory("reward")
        mj_reward_gradients_fn = agent.gradient_factory("reward")

        nwarmup = 5
        agent.reset()
        nv = agent.sim.model.nv
        nu = agent.sim.model.nu
        for _ in range(nwarmup):
            action = torch.Tensor(agent.action_space.sample())
            ob, r, _, _ = agent.step(action)
        state = ob.detach().numpy()
        action = agent.action_space.sample()
        state_action = np.concatenate([state, action], axis=0)
        drdsa = mj_reward_gradients_fn(state_action)
        drds = drdsa[:, :nv * 2]
        drda = drdsa[:, -nu:]

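        # Same first-order check as in test_dynamics_gradients, but for the
        # reward: r_prime should match
        # r + drds @ (eps * ones(2*nv)) + drda @ (eps * ones(nu)).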
        eps = 1e-6
        state_action_prime = state_action + eps
        r = mj_reward_forward_fn(state_action)
        r_prime = mj_reward_forward_fn(state_action_prime)

        r_prime_estimate = r + \
                           np.squeeze(np.matmul(drds, np.array([eps] * 2 * nv).reshape([-1, 1]))) + \
                           np.squeeze(np.matmul(drda, np.array([eps] * nu).reshape([-1, 1])))
        self.assertAlmostEqual(r_prime[0], r_prime_estimate[0], places=5)
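
    # The two tests above perform the same first-order finite-difference check;
    # the helper below is only a sketch of how it could be shared. The helper
    # name and the assumption that the Jacobian has exactly 2*nv + nu columns
    # are illustrative, not part of the project.
    def _check_first_order(self, forward_fn, gradients_fn, state_action, eps=1e-6):
        jac = gradients_fn(state_action)
        y = forward_fn(state_action)
        y_prime = forward_fn(state_action + eps)
        # For a uniform eps perturbation, the predicted change is jac @ (eps * ones)
        y_estimate = y + np.squeeze(jac @ np.full((jac.shape[1], 1), eps))
        self.assertTrue(np.allclose(y_prime, y_estimate))
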
def do_training(cfg, logger, output_results_dir, output_rec_dir,
                output_weights_dir):
    # Build the agent
    agent = build_agent(cfg)

    # Build a forward dynamics model
    dynamics_model = DynamicsModel(agent)

    # Set mode to training (aside from policy output, matters for Dropout, BatchNorm, etc.)
    dynamics_model.train()

    # Set up visdom
    if cfg.LOG.PLOT.ENABLED:
        visdom = VisdomLogger(cfg.LOG.PLOT.DISPLAY_PORT)
        visdom.register_keys([
            'total_loss', 'average_sd', 'average_action', "reinforce_loss",
            "objective_loss", "sd", "action_grad", "sd_grad", "actions"
        ])

    # wrap screen recorder if testing mode is on
    if cfg.LOG.TESTING.ENABLED:
        if cfg.LOG.PLOT.ENABLED:
            visdom.register_keys(['test_reward'])

    # Collect losses here
    output = {"epoch": [], "objective_loss": []}

    # Start training
    for epoch_idx in range(cfg.MODEL.EPOCHS):
        batch_loss = torch.empty(cfg.MODEL.BATCH_SIZE,
                                 cfg.MODEL.POLICY.MAX_HORIZON_STEPS,
                                 dtype=torch.float64)
        batch_loss.fill_(np.nan)

        for episode_idx in range(cfg.MODEL.BATCH_SIZE):

            # Generate "random walk" set of actions (separately for each dimension)
            action = torch.zeros(agent.action_space.shape, dtype=torch.float64)
            #actions = np.zeros((agent.action_space.shape[0], cfg.MODEL.POLICY.MAX_HORIZON_STEPS))
            actions = []

            initial_state = torch.Tensor(agent.reset())
            predicted_states = []
            real_states = []
            corrections = []
            for step_idx in range(cfg.MODEL.POLICY.MAX_HORIZON_STEPS):

                # Generate random actions
                action = action + 0.1 * (
                    2 * torch.rand(agent.action_space.shape) - 1)
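                # Each action is the previous action plus uniform noise in
                # [-0.1, 0.1] per dimension, i.e. a clamped random walk that
                # gives temporally correlated exploration.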

                # Clamp to [-1, 1]
                action.clamp_(-1, 1)

                # Save action
                actions.append(action)

                previous_state = torch.from_numpy(agent.unwrapped._get_obs())

                # Advance the actual simulation
                next_state, _, _, _ = agent.step(action)
                next_state = torch.from_numpy(next_state)
                real_states.append(next_state)

                # Advance with learned dynamics simulation
                pred_next_state = dynamics_model(previous_state.float(),
                                                 action.float()).double()

                batch_loss[episode_idx,
                           step_idx] = torch.pow(next_state - pred_next_state,
                                                 2).mean()
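                # Per-step training signal: mean squared error between the
                # simulated next state and the learned model's prediction.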
                #if agent.is_done:
                #    break

        #dot = torchviz.make_dot(pred_next_state, params=dict(dynamics_model.named_parameters()))

        loss = torch.sum(batch_loss)
        dynamics_model.optimizer.zero_grad()
        loss.backward()
        dynamics_model.optimizer.step()

        output["objective_loss"].append(loss.detach().numpy())
        output["epoch"].append(epoch_idx)

        if epoch_idx % cfg.LOG.PERIOD == 0:

            if cfg.LOG.PLOT.ENABLED:
                visdom.update({"total_loss": loss.detach().numpy()})
                visdom.set({'actions': torch.stack(actions).detach().numpy()})
                #visdom.set({'total_loss': loss["total_loss"].transpose()})
                #visdom.update({'average_grad': np.log(torch.mean(model.policy_net.mean._layers["linear_layer_0"].weight.grad.abs()).detach().numpy())})

            logger.info("REWARD: \t\t{} (iteration {})".format(
                loss.detach().numpy(), epoch_idx))

        if cfg.LOG.PLOT.ENABLED and epoch_idx % cfg.LOG.PLOT.ITER_PERIOD == 0:
            visdom.do_plotting()

#        if epoch_idx % cfg.LOG.CHECKPOINT_PERIOD == 0:
#            torch.save(model.state_dict(),
#                       os.path.join(output_weights_dir, 'iter_{}.pth'.format(epoch_idx)))

        if False:  #cfg.LOG.TESTING.ENABLED:
            if epoch_idx % cfg.LOG.TESTING.ITER_PERIOD == 0:

                # Record if required
                agent.start_recording(
                    os.path.join(output_rec_dir,
                                 "iter_{}.mp4".format(epoch_idx)))

                test_rewards = []
                for _ in range(cfg.LOG.TESTING.COUNT_PER_ITER):
                    test_reward = do_testing(
                        cfg,
                        dynamics_model,
                        agent,
                        # first_state=state_xr.get_item(),
                    )
                    test_rewards.append(test_reward)

                # Set training mode on again
                dynamics_model.train()

                # Close the recorder
                agent.stop_recording()

    # Save outputs into log folder
    lg.save_dict_into_csv(output_results_dir, "output", output)

    # Save model
    torch.save(dynamics_model.state_dict(),
               os.path.join(output_weights_dir, "final_weights.pt"))
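

# NOTE: DynamicsModel is defined elsewhere in the project. The class below is
# only a minimal sketch of one plausible implementation matching the way it is
# used above: a small MLP that maps (state, action) to the next state and owns
# its optimizer. Layer sizes and the learning rate are assumptions.
import torch
import torch.nn as nn


class DynamicsModelSketch(nn.Module):
    def __init__(self, agent, hidden_size=64, lr=1e-3):
        super().__init__()
        state_dim = agent.observation_space.shape[0]
        action_dim = agent.action_space.shape[0]
        self.net = nn.Sequential(
            nn.Linear(state_dim + action_dim, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, state_dim),
        )
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)

    def forward(self, state, action):
        # Predict the next state from the current state and action
        return self.net(torch.cat([state, action], dim=-1))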
def do_training(cfg, logger, output_results_dir, output_rec_dir,
                output_weights_dir, iter):

    if cfg.MODEL.RANDOM_SEED > 0:
        np.random.seed(cfg.MODEL.RANDOM_SEED + iter)
        torch.manual_seed(cfg.MODEL.RANDOM_SEED + iter)

    # Build the agent
    agent = build_agent(cfg)

    # Build the model
    model = build_model(cfg, agent)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    # Set mode to training (aside from policy output, matters for Dropout, BatchNorm, etc.)
    model.train()

    # Set up visdom
    if cfg.LOG.PLOT.ENABLED:
        visdom = VisdomLogger(cfg.LOG.PLOT.DISPLAY_PORT)
        visdom.register_keys([
            'total_loss', 'average_sd', 'average_action', "reinforce_loss",
            "objective_loss", "sd", "action_grad", "sd_grad", "average_grad"
        ])
        for action_idx in range(model.policy_net.action_dim):
            visdom.register_keys(["action_" + str(action_idx)])

    # wrap screen recorder if testing mode is on
    if cfg.LOG.TESTING.ENABLED:
        if cfg.LOG.PLOT.ENABLED:
            visdom.register_keys(['test_reward'])

    # Collect losses here
    output = {"epoch": [], "objective_loss": [], "average_sd": []}

    # Start training
    for epoch_idx in range(cfg.MODEL.EPOCHS):
        batch_loss = torch.empty(cfg.MODEL.BATCH_SIZE,
                                 cfg.MODEL.POLICY.MAX_HORIZON_STEPS,
                                 dtype=torch.float64)
        batch_loss.fill_(np.nan)

        for episode_idx in range(cfg.MODEL.BATCH_SIZE):

            initial_state = torch.DoubleTensor(agent.reset())
            states = []
            states.append(initial_state)
            #grads = np.zeros((cfg.MODEL.POLICY.MAX_HORIZON_STEPS, 120))
            for step_idx in range(cfg.MODEL.POLICY.MAX_HORIZON_STEPS):
                state, reward = model(states[step_idx])
                batch_loss[episode_idx, step_idx] = -reward
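                # The negative reward is stored as the per-step loss, so
                # minimizing the batch loss maximizes the accumulated reward.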
                #(-reward).backward(retain_graph=True)
                #grads[step_idx, :] = model.policy_net.optimizer.mean.grad.detach().numpy()
                #grads[step_idx, step_idx+1:40] = np.nan
                #grads[step_idx, 40+step_idx+1:80] = np.nan
                #grads[step_idx, 80+step_idx+1:] = np.nan
                #model.policy_net.optimizer.optimizer.zero_grad()
                states.append(state)
                if agent.is_done:
                    break

        agent.running_sum = 0
        loss = model.policy_net.optimize(batch_loss)
        #zero = np.abs(grads) < 1e-9
        #grads[zero] = np.nan
        #medians = np.nanmedian(grads, axis=0)
        #model.policy_net.optimizer.mean.grad.data = torch.from_numpy(medians)
        #torch.nn.utils.clip_grad_value_([model.policy_net.optimizer.mean, model.policy_net.optimizer.sd], 1)
        #model.policy_net.optimizer.optimizer.step()
        #model.policy_net.optimizer.optimizer.zero_grad()
        #loss = {'objective_loss': torch.sum(batch_loss, dim=1).mean().detach().numpy()}

        output["objective_loss"].append(loss["objective_loss"])
        output["epoch"].append(epoch_idx)
        output["average_sd"].append(np.mean(model.policy_net.get_clamped_sd()))

        if epoch_idx % cfg.LOG.PERIOD == 0:

            if cfg.LOG.PLOT.ENABLED:
                visdom.update(loss)
                #visdom.set({'total_loss': loss["total_loss"].transpose()})

                clamped_sd = model.policy_net.get_clamped_sd()
                clamped_action = model.policy_net.get_clamped_action()

                #visdom.update({'average_grad': np.log(torch.mean(model.policy_net.mean._layers["linear_layer_0"].weight.grad.abs()).detach().numpy())})

                if len(clamped_sd) > 0:
                    visdom.update({'average_sd': np.mean(clamped_sd, axis=1)})
                visdom.update({
                    'average_action':
                    np.mean(clamped_action, axis=(1, 2)).squeeze()
                })

                for action_idx in range(model.policy_net.action_dim):
                    visdom.set({
                        'action_' + str(action_idx):
                        clamped_action[action_idx, :, :]
                    })
                if clamped_sd is not None:
                    visdom.set({'sd': clamped_sd.transpose()})
#                visdom.set({'action_grad': model.policy_net.mean.grad.detach().numpy().transpose()})

            logger.info("REWARD: \t\t{} (iteration {})".format(
                loss["objective_loss"], epoch_idx))

        if cfg.LOG.PLOT.ENABLED and epoch_idx % cfg.LOG.PLOT.ITER_PERIOD == 0:
            visdom.do_plotting()

        if epoch_idx % cfg.LOG.CHECKPOINT_PERIOD == 0:
            torch.save(
                model.state_dict(),
                os.path.join(output_weights_dir,
                             'iter_{}.pth'.format(epoch_idx)))

        if cfg.LOG.TESTING.ENABLED:
            if epoch_idx % cfg.LOG.TESTING.ITER_PERIOD == 0:

                # Record if required
                agent.start_recording(
                    os.path.join(output_rec_dir,
                                 "iter_{}_{}.mp4".format(iter, epoch_idx)))

                test_rewards = []
                for _ in range(cfg.LOG.TESTING.COUNT_PER_ITER):
                    test_reward = do_testing(
                        cfg,
                        model,
                        agent,
                        # first_state=state_xr.get_item(),
                    )
                    test_rewards.append(test_reward)

                # Set training mode on again
                model.train()

                # Close the recorder
                agent.stop_recording()

    # Save outputs into log folder
    lg.save_dict_into_csv(output_results_dir, "output_{}".format(iter), output)

    # Return actions
    return agent
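
# A hedged sketch of how this entry point might be driven; the config file,
# logger name, and output directories are illustrative values, not project code.
if __name__ == "__main__":
    import logging
    cfg = get_cfg_defaults()
    cfg.merge_from_file("configs/swimmer.yaml")
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("trainer")
    do_training(cfg, logger,
                output_results_dir="results",
                output_rec_dir="recordings",
                output_weights_dir="weights",
                iter=0)
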
    def run_reward_test(self, cfg_file, sigma):

        cfg = get_cfg_defaults()
        cfg.merge_from_file(cfg_file)
        agent = build_agent(cfg)
        mj_forward_fn = agent.forward_factory("dynamics")
        mj_gradients_fn = agent.gradient_factory("reward")

        model = build_model(cfg, agent)
        device = torch.device(cfg.MODEL.DEVICE)
        model.to(device)

        # Start from the same state with constant action, make sure reward is equal in both repetitions

        # Drive both simulations forward 5 steps
        nwarmup = 5

        # Reset and get initial state
        agent.reset()
        init_qpos = agent.data.qpos.copy()
        init_qvel = agent.data.qvel.copy()

        # Set constant action
        na = agent.model.actuator_acc0.shape[0]
        action = torch.DoubleTensor(np.random.randn(na)*sigma)

        # Do first simulation
        for _ in range(nwarmup):
            mj_forward_fn(action)

        # Take a snapshot of this state so we can use it in gradient calculations
        agent.data.ctrl[:] = action.detach().numpy().copy()
        data = agent.get_snapshot()

        # Advance simulation with one step and get the reward
        mj_forward_fn(action)
        reward = agent.reward.copy()
        next_state = np.concatenate((agent.data.qpos.copy(), agent.data.qvel.copy()))

        # Reset and set to initial state, then do the second simulation; this time call mj_forward_fn without args
        agent.reset()
        agent.data.qpos[:] = init_qpos
        agent.data.qvel[:] = init_qvel
        for _ in range(nwarmup):
            agent.data.ctrl[:] = action.detach().numpy()
            mj_forward_fn()

        # Advance simulation with one step and get reward
        agent.data.ctrl[:] = action.detach().numpy()
        mj_forward_fn()
        reward2 = agent.reward.copy()

        # reward and reward2 should be equal
        self.assertEqual(reward, reward2, "Simulations from same initial state diverged")

        # Then make sure simulation from snapshot doesn't diverge from original simulation
        agent.set_snapshot(data)
        mj_forward_fn()
        reward_snapshot = agent.reward.copy()
        self.assertEqual(reward, reward_snapshot, "Simulation from snapshot diverged")

        # Make sure simulations are correct in the gradient calculations as well
        mj_gradients_fn(data, next_state, reward, test=True)
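
    # A concrete test case would typically call the shared check above once per
    # environment; the config path and sigma here are illustrative values only:
    def test_reward_swimmer_example(self):
        self.run_reward_test("configs/swimmer.yaml", sigma=0.1)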