Example no. 1
import os

import numpy as np
import tensorflow as tf  # TF 1.x API (tf.summary.FileWriter)

# SyntheticData, ReplayBuffer, create_model, build_summaries and get_reward
# are assumed to be defined elsewhere in the project.


def train_model(context, data, training_batch):
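    """Train the DDPG actor-critic on synthetic market trajectories.

    Samples trajectories from SyntheticData, stores transitions in a replay
    buffer, alternates critic/actor updates, and logs episode summaries to
    TensorBoard.
    """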
    # Create the synthetic data generator
    context.synthetic_data = SyntheticData(context=context,
                                           data=data,
                                           window=10000,
                                           frequency=30)

    # Build the model configuration along with the neural networks
    create_model(context)

    # Set up the operation summaries (TensorBoard)
    summary_ops, summary_vars = build_summaries()
    writer = tf.summary.FileWriter(
        "/home/enzo/PycharmProjects/DDPGPorfolioOptimization/summaries",
        context.sess.graph)

    if os.path.exists(context.model_path):
        context.saver.restore(context.sess, context.model_path)

    # Initialize the replay buffer
    replay_buffer = ReplayBuffer(context.buffer_size)
    for episode in range(context.max_episodes):
        data, close_prices = context.synthetic_data.get_trayectory(
            t_intervals=context.max_ep_steps + context.n)

        # Reset the portfolio values at the start of each episode
        context.portfolio_value_memory = []
        context.portfolio_value_memory.append(context.init_train_portfolio)
        context.train_invested_quantity = 0.0
        context.assets_quantity_invested = []
        context.portfolio_w_memory = []
        context.init_portfolio_w = []
        for i in range(len(context.assets) + 1):
            context.init_portfolio_w.append(0.0)
        context.portfolio_w_memory.append(context.init_portfolio_w)
        for i in range(len(context.assets)):
            context.assets_quantity_invested.append(0.0)
        context.train_cash = context.init_train_portfolio
        context.last_train_operation = 2
        context.open_trade = False

        ep_reward = 0
        ep_ave_max_q = 0
        ep_loss = 0

        # Subtract one to account for fetching the next state
        for i in range(context.max_ep_steps - 1):
            # Get the current state
            s = data[:, i:i + context.n, :]

            # Perturb the action to balance the exploration/exploitation trade-off
            random = np.random.rand()
            if random > context.epsilon:
                if s.shape == (len(context.assets), context.n,
                               len(context.features)):
                    a = context.actor.predict([s])[0]
                else:
                    print("Episodio:", episode, "Paso:", i,
                          "La forma del estado actual es incorrecta")
                    continue
            else:
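                # Random exploratory action: a softmax over random values
                # yields valid portfolio weights that sum to one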
                rand_array = np.random.rand(len(context.assets) + 1)
                a = np.exp(rand_array) / np.sum(np.exp(rand_array))
            context.epsilon = context.epsilon * context.epsilon_decay

            # Next state
            s2 = data[:, i + 1:i + 1 + context.n, :]
            if not s2.shape == (len(
                    context.assets), context.n, len(context.features)):
                print("Episodio:", episode, "Paso:", i,
                      "La forma del siguiente estado es incorrecta")
                continue

            # Reward
            this_closes = close_prices[:, i + context.n]
            previous_closes = close_prices[:, i + context.n - 1]

            r = get_reward(context, this_closes, previous_closes, a)

            # Terminal flag: last usable step of the episode
            t = i == (context.max_ep_steps - context.n - 2)

            replay_buffer.add(s, a, r, t, s2)

            if replay_buffer.size() > context.minibatch_size:
                s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                    context.minibatch_size)
                # Compute the targets
                target_q = context.critic.predict_target(
                    s2_batch, context.actor.predict_target(s2_batch))
                y_i = []

                for k in range(context.minibatch_size):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + context.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value_batch = np.reshape(
                    y_i, (context.minibatch_size, 1))
                predicted_q_value, losses, _ = context.critic.train(
                    s_batch, a_batch, predicted_q_value_batch)

                ep_loss += np.mean(losses)
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled action gradient
                a_outs = context.actor.predict(s_batch)
                grads = context.critic.action_gradients(s_batch, a_outs)
                context.actor.train(s_batch, grads[0])

                # Update the target networks
                context.actor.update_target_network()
                context.critic.update_target_network()

            ep_reward += r

            if i == (context.max_ep_steps - 2):
                summary_str = context.sess.run(summary_ops,
                                               feed_dict={
                                                   summary_vars[0]:
                                                   ep_reward,
                                                   summary_vars[1]:
                                                   ep_ave_max_q / float(i),
                                                   summary_vars[2]:
                                                   ep_loss / float(i)
                                               })

                writer.add_summary(summary_str, episode)
                writer.flush()

                print(
                    '| Reward: {:.5f} | Episode: {:d} | Qmax: {:.4f} | Portfolio value: {:.4f} | Epsilon: {:.5f} '
                    .format(ep_reward, episode, (ep_ave_max_q / float(i)),
                            context.portfolio_value_memory[-1],
                            context.epsilon))

        _ = context.saver.save(context.sess, context.model_path)
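
For reference, below is a minimal sketch of what build_summaries() might look like, assuming TF 1.x scalar summaries for the three values fed above (episode reward, average max Q, and episode loss). The variable names are illustrative, not taken from the original project.

def build_summaries():
    # Dummy variables that the training loop fills in through feed_dict
    episode_reward = tf.Variable(0.)
    tf.summary.scalar("Reward", episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.summary.scalar("Qmax", episode_ave_max_q)
    episode_loss = tf.Variable(0.)
    tf.summary.scalar("Loss", episode_loss)

    summary_vars = [episode_reward, episode_ave_max_q, episode_loss]
    summary_ops = tf.summary.merge_all()

    return summary_ops, summary_vars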
Example no. 2
import numpy as np
import torch
import torch.nn.functional as F

# DDPG (the per-agent actor-critic) and ReplayBuffer are assumed to be
# defined elsewhere in the project.


class MultiAgent:
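    """MADDPG-style wrapper that trains one DDPG agent per player.

    Each agent keeps its own actor and critic; the critics are trained on the
    concatenated (full) states and actions of all agents, and experiences are
    shared through a single replay buffer.
    """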
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents=2,
                 eps_before_train=500,
                 gamma=0.99,
                 batch_size=128,
                 buffer_size=int(1e5),
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 weight_decay=0,
                 tau=1e-3,
                 noise_weight=1.0,
                 noise_decay=0.999998,
                 noise_min=1e-3,
                 seed=0,
                 device="cuda:0"):

        torch.manual_seed(seed)
        np.random.seed(seed)

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.action_dim = action_size * num_agents

        self.eps_before_train = eps_before_train
        self.gamma = gamma
        self.batch_size = batch_size
        self.buffer_size = buffer_size

        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay
        self.tau = tau

        self.noise_weight = noise_weight
        self.noise_decay = noise_decay
        self.noise_min = noise_min
        self.device = device
        self.i_episode = 0

        self.agents = [
            DDPG(self.state_size,
                 self.action_size,
                 self.num_agents,
                 random_seed=2 * i * seed,
                 lr_actor=self.lr_actor,
                 lr_critic=self.lr_critic,
                 weight_decay=self.weight_decay,
                 tau=self.tau,
                 device=self.device) for i in range(self.num_agents)
        ]
        self.memory = ReplayBuffer(self.action_size, self.buffer_size, seed)

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def act(self, states, add_noise=True):
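        """Return each agent's local-actor action, with optional exploration noise."""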
        noise_weight = self.noise_weight if add_noise else 0.0

        # Decay the exploration noise once training has started
        if (self.i_episode >= self.eps_before_train) and (self.noise_weight >
                                                          self.noise_min):
            self.noise_weight *= self.noise_decay
            if add_noise:
                noise_weight = self.noise_weight

        actions = [
            agent.act(s, noise_weight=noise_weight)
            for s, agent in zip(states, self.agents)
        ]
        return np.array(actions)

    def step(self, states, actions, rewards, next_states, dones, t, i_episode):
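        """Store an experience and trigger learning once the warm-up episodes have passed."""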

        full_state = states.reshape(-1)
        full_next_state = next_states.reshape(-1)
        self.i_episode = i_episode

        self.memory.add(state=states,
                        full_state=full_state,
                        action=actions,
                        reward=rewards,
                        next_state=next_states,
                        full_next_state=full_next_state,
                        done=dones)

        if (i_episode >= self.eps_before_train) and (self.memory.size() >=
                                                     self.batch_size):
            for agent_id in range(self.num_agents):
                experiences = self.memory.sample(self.batch_size)
                self.learn(agent_id, experiences)
            self.soft_update_all()

    def soft_update_all(self):
        for agent in self.agents:
            agent.soft_update_all()

    def learn(self, agent_id, experiences):
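        """Update one agent's critic and actor from a sampled minibatch."""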
        agent = self.agents[agent_id]

        states_e, full_states_e, actions_e, rewards_e, next_states_e, full_next_states_e, dones_e = experiences
        rewards = rewards_e[:, agent_id].view(-1, 1)
        dones = dones_e[:, agent_id].view(-1, 1)

        # Update critic
        target_actions = self.target_act(next_states_e)
        Q_target_next = agent.critic_target(
            full_next_states_e, target_actions.view(-1, self.action_dim))
        Q_target = rewards + self.gamma * Q_target_next * (1.0 - dones)
        Q_local = agent.critic_local(full_states_e,
                                     actions_e.view(-1, self.action_dim))

        critic_loss = F.mse_loss(input=Q_local, target=Q_target.detach())
        agent.critic_local.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), 2)
        agent.critic_optimizer.step()

        # Update the actor policy
        agent_states = states_e[:, agent_id]
        agent_actions = agent.actor_local(agent_states)
        actions = actions_e.clone()
        actions[:, agent_id] = agent_actions

        actor_loss = -agent.critic_local(
            full_states_e, actions.view(-1, self.action_dim)).mean()
        agent.actor_local.zero_grad()
        actor_loss.backward()
        # Clip gradients only after backward() has populated them
        torch.nn.utils.clip_grad_norm_(agent.actor_local.parameters(), 2)
        agent.actor_optimizer.step()

        actor_loss_value = actor_loss.cpu().detach().item()
        critic_loss_value = critic_loss.cpu().detach().item()
        return actor_loss_value, critic_loss_value

    def target_act(self, states):
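        """Compute target-network actions for every agent (used for the critic targets)."""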
        actions = torch.zeros(states.shape[:2] + (self.action_size, ),
                              dtype=torch.float,
                              device=self.device)
        for i in range(self.num_agents):
            actions[:, i, :] = self.agents[i].actor_target(states[:, i])
        return actions

    def local_act(self, states):
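        """Compute local-network actions for every agent."""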
        actions = torch.zeros(states.shape[:2] + (self.action_size, ),
                              dtype=torch.float,
                              device=self.device)
        for i in range(self.num_agents):
            actions[:, i, :] = self.agents[i].actor_local(states[:, i])
        return actions
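
As a usage illustration, here is a minimal sketch of a training loop driving MultiAgent. The environment interface (env.reset() and env.step() returning per-agent states, rewards, and dones as numpy arrays) is a hypothetical stand-in, not part of the original code.

def run_training(env, n_episodes=2000, max_t=1000):
    # env is assumed to expose state_size, action_size and num_agents
    multi_agent = MultiAgent(state_size=env.state_size,
                             action_size=env.action_size,
                             num_agents=env.num_agents,
                             device="cpu")  # use "cuda:0" if a GPU is available

    for i_episode in range(1, n_episodes + 1):
        states = env.reset()                  # shape: (num_agents, state_size)
        multi_agent.reset()
        scores = np.zeros(env.num_agents)

        for t in range(max_t):
            actions = multi_agent.act(states)  # shape: (num_agents, action_size)
            next_states, rewards, dones = env.step(actions)
            multi_agent.step(states, actions, rewards, next_states, dones,
                             t, i_episode)
            states = next_states
            scores += rewards
            if np.any(dones):
                break

        print("Episode {:d} | max score: {:.3f}".format(i_episode, scores.max()))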