Example #1
    def train(self):
        epsilon = 1.00
        episode_rewards = []
        for episode in range(1, self.num_episodes + 1):
            state, reward = self.tsm.initialize()
            rewards = []
            for _ in tqdm(range(self.tsm.episode_length)):
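                # Epsilon-greedy exploration: act randomly with probability epsilon,
                # otherwise query the actor network for a deterministic action.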
                if random.random() < epsilon:
                    action = self.random_action()
                else:
                    action = self.actor_trainer.select_action(
                        inputs=np.array([state.features]))[0][0]
                trans_state, reward = self.tsm.step(action)
                rewards.append(reward)
                self.rpb.store(old_state=state,
                               action=action,
                               reward=reward,
                               new_state=trans_state)
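                # Once the replay buffer holds at least one batch of transitions,
                # sample a minibatch and run a DDPG update step.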
                if self.rpb.ready(self.batch_size):
                    transitions = self.rpb.sample(batch_size=self.batch_size,
                                                  recurrent=False)
                    batch_states = []  # [batch_size, num_assets, num_features]
                    batch_actions = []  # [batch_size, 1]
                    batch_y = []  # [batch_size, 1]
                    for transition in transitions:
                        old_state, action, reward, new_state = transition
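                        # DDPG target: y = r + gamma * Q'(s', mu'(s')), computed
                        # with the target actor and target critic networks.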
                        target_action = self.actor_target.select_action(
                            inputs=np.array([new_state.features]))
                        target_q = self.critic_target.get_q_value(
                            inputs=np.array([new_state.features]),
                            actions=target_action)[0]
                        y = reward + self.gamma * target_q
                        batch_y.append(y)
                        batch_states.append(old_state.features)
                        batch_actions.append([action])
                    self.critic_trainer.train_step(
                        inputs=np.array(batch_states),
                        actions=np.array(batch_actions),
                        predicted_q_value=np.array(batch_y))
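                    # Actor update: back-propagate the critic's action gradients
                    # through the actor (deterministic policy gradient).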
                    policy_actions = self.actor_trainer.select_action(
                        inputs=np.array(
                            batch_states))  # [batch_size, num_assets]
                    action_grads = self.critic_trainer.get_action_gradients(
                        inputs=np.array(batch_states),
                        actions=policy_actions)[0]
                    self.actor_trainer.train_step(
                        inputs=np.array(batch_states),
                        action_gradient=np.array(action_grads))
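                    # Soft-update the target networks toward the trained networks
                    # (Polyak averaging with rate tau).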
                    ActorNetwork.update_actor(self.sess, self.tau)
                    CriticNetwork.update_critic(self.sess, self.tau)
                state = trans_state

            episode_rewards.append(np.sum(rewards))
            if epsilon > 0.1:
                epsilon -= 2.0 / self.num_episodes

            if (episode % 1) == 0:  # interval of 1: evaluate after every episode
                self.infer(train=False, episode=episode)

        plt.plot(episode_rewards)
        plt.savefig("./episode_rewards.png")

        self.infer(train=False, episode=episode)
Example #2
    def train(self):
        global_step = 0
        training_rewards = []
        for episode in range(1, self.num_episodes + 1):
            state = self.env.reset()
            state = np.reshape(state, (self.actor_trainer.state_dimension, ))
            episode_rewards = 0
            episode_ave_max_q = 0
            for _ in range(self.episode_length):
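                # Deterministic action from the actor plus exploration noise
                # (self.actor_noise, typically an Ornstein-Uhlenbeck process in DDPG).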
                action = self.actor_trainer.select_action(inputs=np.array(
                    [state]))[0] + self.actor_noise()  # [action_dim]
                trans_state, reward, terminal, info = self.env.step(action)
                trans_state = np.reshape(
                    trans_state, (self.actor_trainer.state_dimension, ))
                episode_rewards += reward

                self.rpb.store_w_terminal(old_state=state,
                                          action=action,
                                          reward=reward,
                                          terminal=terminal,
                                          new_state=trans_state)
                if self.rpb.ready(self.batch_size):
                    batch_states, batch_actions, batch_rewards, batch_terminal, batch_trans_state \
                        = self.rpb.sample_batch(batch_size=self.batch_size)

                    target_actions = self.actor_target.select_action(
                        inputs=batch_trans_state)  # [batch_size, action_dim]
                    target_q = self.critic_target.get_q_value(
                        inputs=batch_trans_state,
                        actions=target_actions)  # [batch_size, 1]
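                    # Bellman targets: bootstrap with the target critic only for
                    # non-terminal transitions.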
                    batch_y = []
                    for ind in range(self.batch_size):
                        if batch_terminal[ind]:
                            batch_y.append([batch_rewards[ind]])
                        else:
                            batch_y.append(batch_rewards[ind] +
                                           self.gamma * target_q[ind])
                    batch_y = np.array(batch_y)  # [batch_size, 1]
                    self.critic_trainer.train_step(inputs=batch_states,
                                                   actions=batch_actions,
                                                   predicted_q_value=batch_y)
                    policy_actions = self.actor_trainer.select_action(
                        inputs=batch_states)  # [batch_size, action_dim]
                    action_grads = self.critic_trainer.get_action_gradients(
                        inputs=batch_states, actions=policy_actions)[0]
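                    # Apply the sampled deterministic policy gradient to the actor.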
                    self.actor_trainer.train_step(
                        inputs=batch_states,
                        action_gradient=np.array(action_grads))
                    ActorNetwork.update_actor(self.sess, self.tau)
                    CriticNetwork.update_critic(self.sess, self.tau)

                global_step += 1
                state = trans_state

                if terminal:
                    print("Episode number:", episode)
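                    # Log the episode reward to TensorBoard via the pre-built summary ops.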
                    summary = self.sess.run(
                        self.summary_ops,
                        feed_dict={self.episode_reward: episode_rewards})
                    self.writer.add_summary(summary, episode)
                    print("Reward:", episode_rewards)
                    break