Example #1
    def select_action(self, state, test=False):
        # Forward pass through the actor: continuous action values plus the
        # logits of the discrete action head (inference only, hence volatile).
        value_c, value_d = self.actor.forward(to_variable(state, volatile=True))

        # Discrete head: sample one action index from the softmax distribution.
        action_d = F.softmax(value_d)
        action_d = to_numpy(action_d.multinomial())

        # Continuous head: add exploration noise scaled by epsilon unless evaluating.
        action_c = to_numpy(value_c)
        if not test:
            action_c += max(self.epsilon, 0) * self.random_process.sample()
        action_c = action_c[0]
        return action_c, action_d
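
These snippets target the pre-0.4 PyTorch Variable API (volatile=True, argument-less multinomial()). For reference, a minimal sketch of the same hybrid action selection in current PyTorch could look as follows; the actor, epsilon and noise_sample arguments are illustrative stand-ins for the agent's attributes, not part of the original code:

import torch
import torch.nn.functional as F

def select_action_modern(actor, state, epsilon, noise_sample, test=False):
    # Sketch only: modern equivalent of Example #1's hybrid action selection.
    with torch.no_grad():                        # replaces volatile=True
        value_c, value_d = actor(state)

    # Discrete head: sample one action index from the softmax distribution.
    probs = F.softmax(value_d, dim=1)
    action_d = torch.multinomial(probs, num_samples=1).cpu().numpy()

    # Continuous head: add exploration noise, scaled by epsilon, unless evaluating.
    action_c = value_c.cpu().numpy()
    if not test:
        action_c = action_c + max(epsilon, 0) * noise_sample
    return action_c[0], action_d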
Example #2
    def select_action(self, state, test=False):
        # Deterministic actor output for the current state (inference only).
        value = to_numpy(self.actor.forward(to_variable(state, volatile=True)))

        cur_episode = len(self.experience_replay)

        # Add episode-dependent exploration noise and keep the action in [-1, 1].
        action = np.clip(value[0] + self.noise.generate(cur_episode), -1, 1)

        return action
Example #3
    def select_action(self, state, test=False):
        on_state = to_variable(state, volatile=True)

        # Epsilon-greedy policy: explore with probability epsilon while training.
        greedy = np.random.rand()
        if greedy < self.epsilon and not test:  # explore
            action = np.random.randint(self.action_num)
        else:  # exploit
            action = np.argmax(to_numpy(self.net.forward(on_state)))

        return action
Example #4
    def update(self, state, action, reward, new_state, done):

        # Add the new transition to the replay buffer.
        self.experience_replay.append((state, action, reward, new_state,
                                       done))

        # Linearly anneal epsilon from initial_epsilon towards final_epsilon.
        self.epsilon = max(
            self.epsilon -
            (self.initial_epsilon - self.final_epsilon) / self.epsilon_decay,
            0)

        # Start learning only once enough experience has been collected.
        if len(self.experience_replay) >= self.observation:

            # Sample a random minibatch and split it into its components.
            mini_batch = random.sample(self.experience_replay, self.batch_size)
            states = torch.cat([
                mini_batch[k][0].unsqueeze(0) for k in range(self.batch_size)
            ])
            actions = [mini_batch[k][1] for k in range(self.batch_size)]
            rewards = [mini_batch[k][2] for k in range(self.batch_size)]
            new_states = torch.cat([
                mini_batch[k][3].unsqueeze(0) for k in range(self.batch_size)
            ])
            dones = [mini_batch[k][4] for k in range(self.batch_size)]

            # Q-values of the successor states, used for the bootstrap targets.
            new_states = to_variable(new_states)
            q_prime = to_numpy(self.net.forward(new_states))

            # Q-values of the sampled states under the current network.
            states = to_variable(states)
            out = self.net.forward(states)

            # TD targets: r for terminal transitions, r + gamma * max_a' Q(s', a') otherwise.
            action_input = to_variable(actions, dtype='long')
            y_label = to_variable([
                rewards[i] if dones[i] else rewards[i] +
                self.gamma * np.max(q_prime[i]) for i in range(self.batch_size)
            ])

            # Q-value of the action that was actually taken in each transition.
            y_out = out.gather(1, action_input.view(-1, 1))

            # Perform a gradient descent step on the TD error.
            self.optimizer.zero_grad()
            loss = self.loss(y_out, y_label)
            loss.backward()
            self.optimizer.step()
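
A hypothetical driver loop showing how select_action() and update() from the examples above are typically wired together. The environment name, the classic Gym reset()/step() interface and the assumption that states arrive already preprocessed into the form the agent expects are illustrative, not part of the original code:

import gym  # assumption: any environment with a Gym-style reset()/step() API

def train(agent, env_name='CartPole-v1', num_episodes=100):
    # Sketch only: run episodes, act with select_action(), learn with update().
    env = gym.make(env_name)
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.select_action(state)                   # explore/exploit
            new_state, reward, done, _ = env.step(action)         # classic 4-tuple step
            agent.update(state, action, reward, new_state, done)  # store and learn
            state = new_state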
Example #5
    def select_action(self, state, test=False):
        value = to_numpy(self.actor.forward(to_variable(state, volatile=True)))

        cur_episode = len(self.experience_replay)

        if self.action_type == 'continuous':
            # Add episode-dependent exploration noise and clip to the valid range.
            action = np.clip(value[0] + self.noise.generate(cur_episode), -1, 1)
        else:
            # Discrete case: the noise process returns either an action index,
            # which is converted to a one-hot vector, or a probability vector,
            # which is used as-is.
            action = self.noise.generate(value[0], cur_episode)
            if isinstance(action, int):
                action = np.array([1., 0.] if action == 0 else [0., 1.])

        return action
Example #6
    def select_action(self, state, test=False):
        state = to_variable(state, volatile=test)

        # The LSTM variant also consumes and returns the recurrent hidden state.
        if self.config['name'] == 'LSTM':
            on_state = state, (self.hx, self.cx)
            value, logit, (hx, cx) = self.net.forward(on_state)
        else:
            on_state = state
            value, logit = self.net.forward(on_state)

        # Policy distribution, its log-probabilities and the entropy bonus term.
        prob = F.softmax(logit)
        log_prob = F.log_softmax(logit)
        entropy = -(log_prob * prob).sum(1)

        # Sample an action and keep the log-probability of the chosen action.
        action = to_numpy(prob.multinomial())
        log_prob = log_prob.gather(1, to_variable(action, dtype='long'))

        action = action[0, 0]

        # Carry the recurrent state over to the next step.
        if self.config['name'] == 'LSTM':
            self.hx, self.cx = hx, cx

        return action, value, log_prob, entropy
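
The value, log_prob and entropy returned here are normally accumulated over a rollout and combined into an actor-critic loss. A hypothetical sketch of that step; gamma and the entropy weight beta are chosen purely for illustration:

import torch

def a2c_loss(rewards, values, log_probs, entropies, gamma=0.99, beta=0.01):
    # Sketch only: combine per-step outputs of select_action() into a loss.
    R = torch.zeros(1, 1)                       # bootstrap value (0 for a finished episode)
    policy_loss, value_loss = 0.0, 0.0
    for t in reversed(range(len(rewards))):
        R = rewards[t] + gamma * R              # discounted return
        advantage = R - values[t]
        value_loss = value_loss + 0.5 * advantage.pow(2)
        # Detach the advantage so the policy gradient does not flow into the critic.
        policy_loss = policy_loss - log_probs[t] * advantage.detach() - beta * entropies[t]
    return policy_loss + 0.5 * value_loss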