Example #1
    def _optimize_td_loss1(self):
        if self._step < self.batch_size or self._step < self.initial_memory_threshold:
            return
        # Sample a batch from replay memory
        states, actions, rewards, next_states, terminals = self.replay_memory.sample(self.batch_size, random_machine=self.np_random)
        states = torch.from_numpy(states).to(self.device)    # convert numpy arrays to torch tensors
        actions_combined = torch.from_numpy(actions).to(self.device)  # make sure to separate actions and parameters
        d_actions = actions_combined[:, 0].long()
        actions = actions_combined[:, 1:4]
        action_parameters = actions_combined[:, 4:8]
        rewards = torch.from_numpy(rewards).to(self.device).squeeze()
        next_states = torch.from_numpy(next_states).to(self.device)
        terminals = torch.from_numpy(terminals).to(self.device).squeeze()


        # The actor network produces the discrete actions, the actor_param network produces the
        # continuous action-parameters, and actor_param_critic serves as the critic for actor_param.
        # ---------------------- optimize Q-network high level (actor) ----------------------
        # actor is the high-level Q-network
        with torch.no_grad():
            pred_Q_a = self.actor_target(next_states)
            Qprime = torch.max(pred_Q_a, 1, keepdim=True)[0].squeeze()
            # Compute the TD target
            target = rewards + (1 - terminals) * self.gamma * Qprime

        # Compute current Q-values using policy network
        q_values = self.actor(states)
        y_predicted = q_values.gather(1, d_actions.view(-1, 1)).squeeze()
        y_expected = target
        loss_Q = self.loss_func(y_predicted, y_expected)

        self.actor_optimiser.zero_grad()
        loss_Q.backward()
        if self.clip_grad > 0:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(), self.clip_grad)
        self.actor_optimiser.step()

        soft_update_target_network(self.actor, self.actor_target, self.tau_actor)
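
All of these examples finish by calling soft_update_target_network, which is not shown in the snippets. A minimal sketch, assuming standard Polyak averaging of the parameters (the argument order follows the calls above; the body is an assumption, not the original helper):

import torch

def soft_update_target_network(source, target, tau):
    # Polyak averaging: target_params <- tau * source_params + (1 - tau) * target_params
    with torch.no_grad():
        for target_param, source_param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)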
Example #2
    def _optimize_td_loss(self):
        if self._step < self.batch_size or self._step < self.initial_memory_threshold:
            return
        # Sample a batch from replay memory
        states, actions, rewards, next_states, terminals = self.replay_memory.sample(
            self.batch_size, random_machine=self.np_random)

        states = torch.from_numpy(states).to(self.device)
        actions_combined = torch.from_numpy(actions).to(
            self.device)  # make sure to separate actions and parameters
        actions = actions_combined[:, 0].long()
        action_parameters = actions_combined[:, 1:]
        rewards = torch.from_numpy(rewards).to(self.device).squeeze()
        next_states = torch.from_numpy(next_states).to(self.device)
        terminals = torch.from_numpy(terminals).to(self.device).squeeze()

        # ---------------------- optimize Q-network ----------------------
        with torch.no_grad():
            pred_next_action_parameters = self.actor_param_target.forward(
                next_states)
            pred_Q_a = self.actor_target(next_states,
                                         pred_next_action_parameters)
            Qprime = torch.max(pred_Q_a, 1, keepdim=True)[0].squeeze()

            # Compute the TD target
            target = rewards + (1 - terminals) * self.gamma * Qprime

        # Compute current Q-values using policy network
        q_values = self.actor(states, action_parameters)
        y_predicted = q_values.gather(1, actions.view(-1, 1)).squeeze()
        y_expected = target
        loss_Q = self.loss_func(y_predicted, y_expected)

        self.actor_optimiser.zero_grad()
        loss_Q.backward()
        if self.clip_grad > 0:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(),
                                           self.clip_grad)
        self.actor_optimiser.step()

        # ---------------------- optimize actor ----------------------
        with torch.no_grad():
            action_params = self.actor_param(states)
        action_params.requires_grad = True
        assert (self.weighted ^ self.average ^ self.random_weighted) or \
               not (self.weighted or self.average or self.random_weighted)
        Q = self.actor(states, action_params)
        Q_val = Q
        if self.weighted:
            # approximate categorical probability density (i.e. counting)
            counts = Counter(actions.cpu().numpy())
            weights = torch.from_numpy(
                np.array([
                    counts[a] / actions.shape[0]
                    for a in range(self.num_actions)
                ])).float().to(self.device)
            Q_val = weights * Q
        elif self.average:
            Q_val = Q / self.num_actions
        elif self.random_weighted:
            weights = np.random.uniform(0, 1., self.num_actions)
            weights /= np.linalg.norm(weights)
            weights = torch.from_numpy(weights).float().to(self.device)
            Q_val = weights * Q
        if self.indexed:
            Q_indexed = Q_val.gather(1, actions.unsqueeze(1))
            Q_loss = torch.mean(Q_indexed)
        else:
            Q_loss = torch.mean(torch.sum(Q_val, 1))
        self.actor.zero_grad()
        Q_loss.backward()
        from copy import deepcopy
        delta_a = deepcopy(action_params.grad.data)
        # step 2
        action_params = self.actor_param(Variable(states))
        delta_a[:] = self._invert_gradients(delta_a,
                                            action_params,
                                            grad_type="action_parameters",
                                            inplace=True)
        if self.zero_index_gradients:
            delta_a[:] = self._zero_index_gradients(
                delta_a, batch_action_indices=actions, inplace=True)

        out = -torch.mul(delta_a, action_params)
        self.actor_param.zero_grad()
        out.backward(torch.ones(out.shape).to(self.device))
        if self.clip_grad > 0:
            torch.nn.utils.clip_grad_norm_(self.actor_param.parameters(),
                                           self.clip_grad)

        self.actor_param_optimiser.step()

        soft_update_target_network(self.actor, self.actor_target,
                                   self.tau_actor)
        soft_update_target_network(self.actor_param, self.actor_param_target,
                                   self.tau_actor_param)
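
Examples #2 through #4 also rely on an _invert_gradients helper that is not shown. A minimal free-function sketch of the inverting-gradients rule it applies (Hausknecht & Stone style); the per-dimension bounds max_p / min_p are assumed inputs here, whereas the actual method reads them from the agent:

import torch

def invert_gradients(grad, vals, max_p, min_p, inplace=True):
    # grad: dQ/dvals for the network outputs; vals: the outputs themselves.
    # max_p / min_p: per-dimension upper and lower bounds, broadcastable to vals.
    rnge = max_p - min_p
    if not inplace:
        grad = grad.clone()
    with torch.no_grad():
        # Positive gradients shrink as the output approaches its upper bound,
        # negative gradients shrink as it approaches its lower bound.
        index = grad > 0
        grad[index] *= ((max_p - vals) / rnge)[index]
        grad[~index] *= ((vals - min_p) / rnge)[~index]
    return grad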
Example #3
    def _optimize_td_loss(self):

        if self.replay_memory.nb_entries < self.batch_size or \
                self.replay_memory.nb_entries < self.initial_memory_threshold:
            return

        # Sample a batch from replay memory
        if self.n_step_returns:
            states, actions, rewards, next_states, terminals, n_step_returns = self.replay_memory.sample(
                self.batch_size, random_machine=self.np_random)
        else:
            states, actions, rewards, next_states, terminals = self.replay_memory.sample(
                self.batch_size, random_machine=self.np_random)
            n_step_returns = None

        states = torch.from_numpy(states).to(device)
        actions_combined = torch.from_numpy(actions).to(
            device)  # make sure to separate actions and action-parameters
        actions = actions_combined[:, 1:self.num_actions + 1]
        action_parameters = actions_combined[:, self.num_actions + 1:]
        rewards = torch.from_numpy(rewards).to(device)
        next_states = torch.from_numpy(next_states).to(device)
        terminals = torch.from_numpy(terminals).to(device)
        if self.n_step_returns:
            n_step_returns = torch.from_numpy(n_step_returns).to(device)

        # ---------------------- optimize critic ----------------------
        with torch.no_grad():
            pred_next_actions, pred_next_action_parameters = self.actor_target.forward(
                next_states)
            off_policy_next_val = self.critic_target.forward(
                next_states, pred_next_actions, pred_next_action_parameters)
            off_policy_target = rewards + (
                1 - terminals) * self.gamma * off_policy_next_val
            if self.n_step_returns:
                on_policy_target = n_step_returns
                target = self.beta * on_policy_target + (
                    1. - self.beta) * off_policy_target
            else:
                target = off_policy_target

        y_expected = target
        y_predicted = self.critic.forward(states, actions, action_parameters)
        loss_critic = self.loss_func(y_predicted, y_expected)

        self.critic_optimiser.zero_grad()
        loss_critic.backward()
        if self.clip_grad > 0:
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(),
                                           self.clip_grad)
        self.critic_optimiser.step()

        # ---------------------- optimise actor ----------------------
        # 1 - calculate gradients from critic
        with torch.no_grad():
            actions, action_params = self.actor(states)
            action_params = torch.cat((actions, action_params), dim=1)
        action_params.requires_grad = True
        Q_val = self.critic(states, action_params[:, :self.num_actions],
                            action_params[:, self.num_actions:]).mean()
        self.critic.zero_grad()
        Q_val.backward()

        from copy import deepcopy
        delta_a = deepcopy(action_params.grad.data)
        # 2 - apply inverting gradients and combine with gradients from actor
        actions, action_params = self.actor(Variable(states))
        action_params = torch.cat((actions, action_params), dim=1)
        delta_a[:, self.num_actions:] = self._invert_gradients(
            delta_a[:, self.num_actions:].cpu(),
            action_params[:, self.num_actions:].cpu(),
            grad_type="action_parameters",
            inplace=True)
        delta_a[:, :self.num_actions] = self._invert_gradients(
            delta_a[:, :self.num_actions].cpu(),
            action_params[:, :self.num_actions].cpu(),
            grad_type="actions",
            inplace=True)
        out = -torch.mul(delta_a, action_params)
        self.actor.zero_grad()
        out.backward(torch.ones(out.shape).to(device))

        if self.clip_grad > 0:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(),
                                           self.clip_grad)
        self.actor_optimiser.step()

        soft_update_target_network(self.actor, self.actor_target,
                                   self.tau_actor)
        soft_update_target_network(self.critic, self.critic_target,
                                   self.tau_critic)
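
Example #3 is the only variant that blends an on-policy n-step return with the usual off-policy bootstrap target, weighted by beta. A small self-contained illustration of that target computation with made-up toy values:

import torch

# Toy illustration of the mixed on/off-policy target used in Example #3
# (all values below are invented for demonstration):
gamma, beta = 0.99, 0.5
rewards = torch.tensor([[1.0], [0.0]])
terminals = torch.tensor([[0.0], [1.0]])                 # second transition is terminal
off_policy_next_val = torch.tensor([[2.0], [3.0]])       # stand-in for critic_target output
n_step_returns = torch.tensor([[2.5], [0.0]])            # stand-in for stored n-step returns

off_policy_target = rewards + (1 - terminals) * gamma * off_policy_next_val
target = beta * n_step_returns + (1. - beta) * off_policy_target
# First row: 0.5 * 2.5 + 0.5 * (1.0 + 0.99 * 2.0) = 2.74
# Second row: the terminal flag zeroes the bootstrap, so the target is 0.0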
Example #4
    def _optimize_td_loss(self):
        if self._step < self.batch_size or self._step < self.initial_memory_threshold:
            return
        # Sample a batch from replay memory
        states, actions, rewards, next_states, terminals = self.replay_memory.sample(
            self.batch_size, random_machine=self.np_random)
        states = torch.from_numpy(states).to(self.device)  # convert numpy arrays to torch tensors
        actions_combined = torch.from_numpy(actions).to(
            self.device)  # make sure to separate actions and parameters
        d_actions = actions_combined[:, 0].long()
        actions = actions_combined[:, 1:4]
        action_parameters = actions_combined[:, 4:8]
        rewards = torch.from_numpy(rewards).to(self.device).squeeze()
        next_states = torch.from_numpy(next_states).to(self.device)
        terminals = torch.from_numpy(terminals).to(self.device).squeeze()

        # The actor network produces the discrete actions, the actor_param network produces the
        # continuous action-parameters, and actor_param_critic serves as the critic for actor_param.
        # ---------------------- optimize Q-network high level (actor) ----------------------
        # actor is the high-level Q-network
        with torch.no_grad():
            pred_Q_a = self.actor_target(next_states)
            Qprime = torch.max(pred_Q_a, 1, keepdim=True)[0].squeeze()
            # Compute the TD target
            target = rewards + (1 - terminals) * self.gamma * Qprime

        # Compute current Q-values using policy network
        q_values = self.actor(states)
        y_predicted = q_values.gather(1, d_actions.view(-1, 1)).squeeze()
        y_expected = target
        loss_Q = self.loss_func(y_predicted, y_expected)

        self.actor_optimiser.zero_grad()
        loss_Q.backward()
        if self.clip_grad > 0:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(),
                                           self.clip_grad)
        self.actor_optimiser.step()

        # ---------------------- The low level can be viewed as a DDPG-style network ----------------------
        # ---------------------- optimize actor_param (low-level continuous action-parameter network) ----------------------
        with torch.no_grad():
            pred_action = self.actor.forward(states)
            action_params = self.actor_param(states, pred_action)
        action_params.requires_grad = True
        assert (self.weighted ^ self.average ^ self.random_weighted) or \
               not (self.weighted or self.average or self.random_weighted)
        Q = self.actor_param_critic(states, pred_action, action_params)
        Q_val = Q
        # The actor-parameter loss is computed here
        Q_loss = torch.mean(torch.sum(Q_val, 1))  # sum over actions, then average over the batch

        self.actor_param_critic.zero_grad()
        Q_loss.backward()

        # If the inverting-gradients step below is removed from P-DQN, performance drops noticeably, even below HHQN.

        from copy import deepcopy
        delta_a = deepcopy(action_params.grad.data)
        # step 2
        pred_action = self.actor.forward(Variable(states))
        action_params = self.actor_param(Variable(states),
                                         Variable(pred_action))
        delta_a[:] = self._invert_gradients(delta_a,
                                            action_params,
                                            grad_type="action_parameters",
                                            inplace=True)
        if self.zero_index_gradients:
            delta_a[:] = self._zero_index_gradients(
                delta_a, batch_action_indices=d_actions, inplace=True)
        out = -torch.mul(delta_a, action_params)
        # print(out)
        self.actor_param.zero_grad()
        out.backward(torch.ones(out.shape).to(self.device))

        if self.clip_grad > 0:
            torch.nn.utils.clip_grad_norm_(self.actor_param.parameters(),
                                           self.clip_grad)

        self.actor_param_optimiser.step()

        # ---------------------- optimize Q-network low level  (critic)----------------------
        with torch.no_grad():
            pred_next_action = self.actor_target.forward(next_states)
            next_action_params = self.actor_param_target(
                next_states, pred_next_action)
            pred_Q_a = self.actor_param_target_critic(next_states,
                                                      pred_next_action,
                                                      next_action_params)
            Qprime = torch.max(pred_Q_a, 1, keepdim=True)[0].squeeze()

            # Compute the TD target
            target = rewards + (1 - terminals) * self.gamma * Qprime

        # Compute current Q-values using policy network
        q_values = self.actor_param_critic(states, actions, action_parameters)
        y_expected = target.unsqueeze(1)
        y_predicted = q_values
        loss_Q = self.loss_func(y_predicted, y_expected)
        self.actor_param_critic_optimiser.zero_grad()
        loss_Q.backward()
        if self.clip_grad > 0:
            torch.nn.utils.clip_grad_norm_(
                self.actor_param_critic.parameters(), self.clip_grad)
        self.actor_param_critic_optimiser.step()

        soft_update_target_network(self.actor, self.actor_target,
                                   self.tau_actor)
        soft_update_target_network(self.actor_param, self.actor_param_target,
                                   self.tau_actor_param)
        soft_update_target_network(self.actor_param_critic,
                                   self.actor_param_target_critic,
                                   self.tau_actor_param_critic)
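
Examples #2 and #4 optionally call _zero_index_gradients, which is likewise not shown. A minimal sketch of the idea, assuming every discrete action owns an equal-width slice of the action-parameter vector (the action_parameter_size argument is a hypothetical stand-in for however the agent actually stores the slice layout):

import torch

def zero_index_gradients(grad, batch_action_indices, action_parameter_size, num_actions, inplace=True):
    # Zero the gradient of every action-parameter slice that does not belong to
    # the discrete action actually taken in that transition.
    if not inplace:
        grad = grad.clone()
    with torch.no_grad():
        for a in range(num_actions):
            not_taken = (batch_action_indices != a)        # boolean mask over the batch
            start = a * action_parameter_size
            grad[not_taken, start:start + action_parameter_size] = 0.0
    return grad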