Example #1
    def predict_batch(self, states):
        batch_size = states.shape[0]
        actions = self.action_sampler.sample((self.horizon, batch_size, self.num_random_action_selection))
        states = np.expand_dims(states, axis=1)
        states = np.tile(states, (1, self.num_random_action_selection, 1))  # (b, 1, ob_dim) -> (b, K, ob_dim)
        states = convert_to_tensor(states)
        actions = convert_to_tensor(actions)

        cost = torch.zeros(size=(batch_size, self.num_random_action_selection)).type(FloatTensor)
        for i in range(self.horizon):
            # reshape states and actions
            states = states.view(-1, *states.shape[2:])  # (b, K, ob_dim) -> (b * K, ob_dim)
            current_action = actions[i].view(-1, *actions[i].shape[2:])  # (b, K, ac_dim) -> (b * K, ac_dim)
            # predict next states and rewards
            next_states, rewards = self.model.predict_next_states_rewards(states, current_action)
            # reshape back
            rewards = rewards.view(batch_size, self.num_random_action_selection)
            next_states = next_states.view(batch_size, self.num_random_action_selection, *next_states.shape[1:])  # (b * K, ob_dim) -> (b, K, ob_dim)

            cost += -rewards * self.gamma_inverse
            states = next_states

        best_action = torch.gather(actions[0], dim=1, index=torch.argmin(cost, dim=1, keepdim=True))
        best_action = best_action.cpu().numpy()
        return best_action
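All of these snippets call a convert_to_tensor helper that is not shown on this page. A minimal sketch of what such a helper might look like, assuming it only wraps numpy data (or tuples of arrays, as in the replay-buffer transition of Example #4) as torch tensors and optionally moves them to the GPU; the name and the location argument come from the calls above, everything else is an assumption:

import numpy as np
import torch

def convert_to_tensor(data, location='cpu'):
    # Hypothetical helper: wrap numpy data as torch tensors on the requested device.
    # Tuples and lists are converted element-wise, matching the unpacking in Example #4.
    device = torch.device('cuda') if location == 'gpu' and torch.cuda.is_available() else torch.device('cpu')
    if isinstance(data, (tuple, list)):
        return tuple(convert_to_tensor(x, location) for x in data)
    return torch.as_tensor(np.asarray(data)).to(device)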
Example #2
    def predict_next_state(self, state, action):
        states = np.expand_dims(state, axis=0)
        actions = np.expand_dims(action, axis=0)
        states = convert_to_tensor(states)
        actions = convert_to_tensor(actions)
        next_state = self.predict_next_states(states, actions).cpu().numpy()[0]
        return next_state
Example #3
    def predict(self, state):
        state = np.expand_dims(state, axis=0)
        with torch.no_grad():
            state = convert_to_tensor(state)
            state = (state - self.state_mean) / self.state_std
            action = self.model.forward(state)
        return action.cpu().numpy()[0]
Example #4
    def update(self,
               replay_buffer,
               num_updates,
               action_limit,
               policy_freq=2,
               batch_size=128,
               target_noise=0.2,
               clip_noise=0.5,
               tau=5e-3,
               gamma=0.99):
        for i in range(num_updates):
            transition = replay_buffer.sample(batch_size)
            s_batch, a_batch, s2_batch, r_batch, t_batch = convert_to_tensor(
                transition, location='gpu')

            r_batch = r_batch.type(FloatTensor)
            t_batch = t_batch.type(FloatTensor)

            # get ground truth q value
            with torch.no_grad():
                target_action_noise = torch.clamp(torch.randn_like(a_batch) *
                                                  target_noise,
                                                  min=-clip_noise,
                                                  max=clip_noise)
                target_action = torch.clamp(
                    self.target_actor_module.forward(s2_batch) +
                    target_action_noise,
                    min=-action_limit,
                    max=action_limit)
                target_q = self.target_critic_module.forward(
                    state=s2_batch, action=target_action, minimum=True)

                q_target = r_batch + gamma * target_q * (1 - t_batch)

            # critic loss
            q_values, q_values2 = self.critic_module.forward(s_batch,
                                                             a_batch,
                                                             minimum=False)
            q_values_loss = F.mse_loss(q_values, q_target) + F.mse_loss(
                q_values2, q_target)

            self.critic_optimizer.zero_grad()
            q_values_loss.backward()
            self.critic_optimizer.step()

            if i % policy_freq == 0:
                action = self.actor_module.forward(s_batch)
                q_values = self.critic_module.forward(s_batch,
                                                      action,
                                                      minimum=False)[0]
                loss = -torch.mean(q_values)
                self.actor_optimizer.zero_grad()
                loss.backward()
                self.actor_optimizer.step()

                soft_update(self.target_critic_module, self.critic_module, tau)
                soft_update(self.target_actor_module, self.actor_module, tau)
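Example #4 also relies on a soft_update helper that is not defined here. A minimal sketch, assuming it performs the usual Polyak averaging with the argument order soft_update(target, source, tau) used above:

import torch

def soft_update(target_module, source_module, tau):
    # Hypothetical Polyak-averaging helper: target <- (1 - tau) * target + tau * source.
    with torch.no_grad():
        for target_param, source_param in zip(target_module.parameters(),
                                              source_module.parameters()):
            target_param.mul_(1.0 - tau).add_(source_param, alpha=tau)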
Example #5
    def set_statistics(self, dataset: ReplayBuffer):
        state_mean, state_std = dataset.state_mean_std
        self.state_mean = convert_to_tensor(state_mean).unsqueeze(dim=0)
        self.state_std = convert_to_tensor(state_std).unsqueeze(dim=0)
        if self.dynamics_model.discrete:
            self.action_mean = None
            self.action_std = None
        else:
            action_mean, action_std = dataset.action_mean_std
            self.action_mean = convert_to_tensor(action_mean).unsqueeze(dim=0)
            self.action_std = convert_to_tensor(action_std).unsqueeze(dim=0)
        delta_state_mean, delta_state_std = dataset.delta_state_mean_std
        self.delta_state_mean = convert_to_tensor(delta_state_mean).unsqueeze(dim=0)
        self.delta_state_std = convert_to_tensor(delta_state_std).unsqueeze(dim=0)
        if self.cost_fn_batch is None:
            reward_mean, reward_std = dataset.reward_mean_std
            self.reward_mean = reward_mean
            self.reward_std = reward_std
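The statistics stored by set_statistics are presumably used to normalize model inputs and de-normalize predicted state deltas, in the same spirit as the normalization in Example #3. A minimal standalone sketch of such a use; the function name mirrors the predict_next_states call in Example #2, but this body is an assumption, not the library's actual implementation:

def predict_next_states(dynamics_model, states, actions,
                        state_mean, state_std,
                        delta_state_mean, delta_state_std, eps=1e-8):
    # Hypothetical usage of the stored statistics: normalize the inputs,
    # let the model predict a normalized state delta, then de-normalize it.
    # (Action normalization with action_mean/action_std is omitted for brevity.)
    normalized_states = (states - state_mean) / (state_std + eps)
    normalized_delta = dynamics_model.forward(normalized_states, actions)
    delta = normalized_delta * delta_state_std + delta_state_mean
    return states + delta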
Example #6
    def update(self, obs, actions, next_obs, done, reward):
        """ Sample a mini-batch from replay buffer and update the network

        Args:
            obs: (batch_size, ob_dim)
            actions: (batch_size, action_dim)
            next_obs: (batch_size, ob_dim)
            done: (batch_size,)
            reward: (batch_size,)

        Returns: None

        """
        obs = convert_to_tensor(obs)
        actions = convert_to_tensor(actions)
        next_obs = convert_to_tensor(next_obs)
        done = convert_to_tensor(done).type(FloatTensor)
        reward = convert_to_tensor(reward)

        # q loss
        q_values, q_values2 = self.q_network.forward(obs, actions, False)

        with torch.no_grad():
            next_action_distribution = self.policy_net.forward_action(next_obs)
            next_action = next_action_distribution.sample()
            next_action_log_prob = next_action_distribution.log_prob(
                next_action)
            target_q_values = self.target_q_network.forward(
                next_obs, next_action,
                True) - self.alpha * next_action_log_prob
            q_target = reward + self.gamma * (1.0 - done) * target_q_values

        q_values_loss = F.mse_loss(q_values, q_target) + F.mse_loss(
            q_values2, q_target)

        # policy loss
        if self.discrete:
            # for a discrete action space, the KL divergence can be computed analytically without sampling
            action_distribution = self.policy_net.forward_action(obs)
            q_values_min = self.q_network.forward(obs, None,
                                                  True)  # (batch_size, ac_dim)
            probs = F.softmax(q_values_min, dim=-1)
            target_distribution = torch.distributions.Categorical(probs=probs)
            policy_loss = torch.distributions.kl_divergence(
                action_distribution, target_distribution).mean()
            log_prob = -action_distribution.entropy()

        else:
            action_distribution = self.policy_net.forward_action(obs)
            pi = action_distribution.rsample()
            log_prob = action_distribution.log_prob(
                pi)  # should be shape (batch_size,)
            q_values_pi_min = self.q_network.forward(obs, pi, True)
            policy_loss = torch.mean(log_prob * self.alpha - q_values_pi_min)

        # alpha loss
        if self.log_alpha_tensor is not None:
            alpha_loss = -(self.log_alpha_tensor *
                           (log_prob + self.target_entropy).detach()).mean()

            self.alpha_optimizer.zero_grad()
            alpha_loss.backward()
            self.alpha_optimizer.step()

            self.alpha = self.log_alpha_tensor.exp().item()

        self.q_optimizer.zero_grad()
        q_values_loss.backward()
        self.q_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
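The temperature-tuning branch in Example #6 assumes that self.log_alpha_tensor, self.alpha_optimizer, and self.target_entropy were created elsewhere. A minimal sketch of that setup, assuming the log of alpha is learned so that alpha stays positive, and using the common heuristic of a target entropy equal to the negative action dimension; the action dimension and learning rate below are placeholders:

import torch

action_dim = 6                                           # placeholder: use the environment's action dimension
log_alpha_tensor = torch.zeros(1, requires_grad=True)    # alpha = exp(0) = 1.0 initially
alpha_optimizer = torch.optim.Adam([log_alpha_tensor], lr=3e-4)
target_entropy = -float(action_dim)                      # common heuristic for continuous action spaces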
Example #7
    def predict_batch(self, states):
        states = convert_to_tensor(states.astype(np.float32))
        action_distribution = self.policy_net.forward_action(states)
        return action_distribution.sample().cpu().numpy()
Example #8
    def predict_batch(self, state):
        state = convert_to_tensor(state.astype(np.float32))
        return self.actor_module.forward(state).cpu().numpy()
Example #9
    def set_state_stats(self, state_mean, state_std):
        self.state_mean = convert_to_tensor(state_mean).unsqueeze(dim=0)
        self.state_std = convert_to_tensor(state_std).unsqueeze(dim=0)