import os

import numpy as np
import tensorflow as tf  # TF 1.x API (sessions, tf.summary.FileWriter)


def train_model(context, data, training_batch):
    # Create the synthetic data generator
    context.synthetic_data = SyntheticData(context=context, data=data,
                                           window=10000, frequency=30)

    # Build the model configuration together with the actor/critic networks
    create_model(context)

    # Set up the operation summaries for TensorBoard
    summary_ops, summary_vars = build_summaries()
    writer = tf.summary.FileWriter(
        "/home/enzo/PycharmProjects/DDPGPorfolioOptimization/summaries",
        context.sess.graph)

    if os.path.exists(context.model_path):
        context.saver.restore(context.sess, context.model_path)

    # Initialize the replay buffer
    replay_buffer = ReplayBuffer(context.buffer_size)

    for episode in range(context.max_episodes):
        data, close_prices = context.synthetic_data.get_trayectory(
            t_intervals=context.max_ep_steps + context.n)

        # Reset the portfolio values at the start of every episode
        context.portfolio_value_memory = [context.init_train_portfolio]
        context.train_invested_quantity = 0.0
        context.assets_quantity_invested = [0.0] * len(context.assets)
        context.init_portfolio_w = [0.0] * (len(context.assets) + 1)
        context.portfolio_w_memory = [context.init_portfolio_w]
        context.train_cash = context.init_train_portfolio
        context.last_train_operation = 2
        context.open_trade = False

        ep_reward = 0
        ep_ave_max_q = 0
        ep_loss = 0

        # One step is subtracted to account for fetching the next state
        for i in range(context.max_ep_steps - 1):
            # Current state
            s = data[:, i:i + context.n, :]

            # Perturb the action to balance the exploration/exploitation trade-off
            random = np.random.rand()
            if random > context.epsilon:
                if s.shape == (len(context.assets), context.n, len(context.features)):
                    a = context.actor.predict([s])[0]
                else:
                    print("Episode:", episode, "Step:", i,
                          "The current state has the wrong shape")
                    continue
            else:
                # Random portfolio weights: a softmax over uniform noise keeps
                # them positive and summing to one
                rand_array = np.random.rand(len(context.assets) + 1)
                a = np.exp(rand_array) / np.sum(np.exp(rand_array))

            context.epsilon = context.epsilon * context.epsilon_decay

            # Next state
            s2 = data[:, i + 1:i + 1 + context.n, :]
            if not s2.shape == (len(context.assets), context.n, len(context.features)):
                print("Episode:", episode, "Step:", i,
                      "The next state has the wrong shape")
                continue

            # Reward
            this_closes = close_prices[:, i + context.n]
            previous_closes = close_prices[:, i + context.n - 1]
            r = get_reward(context, this_closes, previous_closes, a)

            # Terminal flag
            t = i == (context.max_ep_steps - context.n - 2)

            replay_buffer.add(s, a, r, t, s2)

            if replay_buffer.size() > context.minibatch_size:
                s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                    context.minibatch_size)

                # Compute the targets
                target_q = context.critic.predict_target(
                    s2_batch, context.actor.predict_target(s2_batch))

                y_i = []
                for k in range(context.minibatch_size):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + context.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value_batch = np.reshape(y_i, (context.minibatch_size, 1))
                predicted_q_value, losses, _ = context.critic.train(
                    s_batch, a_batch, predicted_q_value_batch)
                ep_loss += np.mean(losses)
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled action gradient
                a_outs = context.actor.predict(s_batch)
                grads = context.critic.action_gradients(s_batch, a_outs)
                context.actor.train(s_batch, grads[0])

                # Update the target networks
                context.actor.update_target_network()
                context.critic.update_target_network()

            ep_reward += r

            if i == (context.max_ep_steps - 2):
                summary_str = context.sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(i),
                    summary_vars[2]: ep_loss / float(i)
                })
                writer.add_summary(summary_str, episode)
                writer.flush()

                print('| Reward: {:.5f} | Episode: {:d} | Qmax: {:.4f} | '
                      'Portfolio value: {:.4f} | Epsilon: {:.5f} '
                      .format(ep_reward, episode, (ep_ave_max_q / float(i)),
                              context.portfolio_value_memory[-1], context.epsilon))

                _ = context.saver.save(context.sess, context.model_path)
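
# The training loop above relies on a build_summaries() helper that is not
# defined in this file. The sketch below is an assumption, not the original
# implementation: it follows the common TF 1.x pattern of one scalar summary
# per logged quantity (episode reward, average max Q, loss), matching how
# summary_vars[0..2] are fed through the feed_dict above.
def build_summaries():
    episode_reward = tf.Variable(0.)
    tf.summary.scalar("Reward", episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.summary.scalar("Qmax", episode_ave_max_q)
    episode_loss = tf.Variable(0.)
    tf.summary.scalar("Loss", episode_loss)

    summary_vars = [episode_reward, episode_ave_max_q, episode_loss]
    summary_ops = tf.summary.merge_all()
    return summary_ops, summary_vars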

import torch
import torch.nn.functional as F

# DDPG (the per-agent actor/critic wrapper) and ReplayBuffer are provided by
# companion modules of this project.


class MultiAgent:
    def __init__(self, state_size, action_size, num_agents=2, eps_before_train=500,
                 gamma=0.99, batch_size=128, buffer_size=int(1e5), lr_actor=1e-4,
                 lr_critic=1e-3, weight_decay=0, tau=1e-3, noise_weight=1.0,
                 noise_decay=0.999998, noise_min=1e-3, seed=0, device="cuda:0"):
        torch.manual_seed(seed)
        np.random.seed(seed)

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        # Joint action dimension seen by the centralized critics
        self.action_dim = action_size * num_agents
        self.eps_before_train = eps_before_train
        self.gamma = gamma
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay
        self.tau = tau
        self.noise_weight = noise_weight
        self.noise_decay = noise_decay
        self.noise_min = noise_min
        self.device = device
        self.i_episode = 0

        # One DDPG agent per player; all agents share a single replay buffer
        self.agents = [
            DDPG(self.state_size, self.action_size, self.num_agents,
                 random_seed=2 * i * seed, lr_actor=self.lr_actor,
                 lr_critic=self.lr_critic, weight_decay=self.weight_decay,
                 tau=self.tau, device=self.device)
            for i in range(self.num_agents)
        ]
        self.memory = ReplayBuffer(self.action_size, self.buffer_size, seed)

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def act(self, states, add_noise=True):
        noise_weight = self.noise_weight if add_noise else 0.0
        if (self.i_episode >= self.eps_before_train) and (self.noise_weight > self.noise_min):
            self.noise_weight *= self.noise_decay
            noise_weight = self.noise_weight
        actions = [
            agent.act(s, noise_weight=noise_weight)
            for s, agent in zip(states, self.agents)
        ]
        return np.array(actions)

    def step(self, states, actions, rewards, next_states, dones, t, i_episode):
        # Flatten the per-agent observations into the full (joint) state
        full_state = states.reshape(-1)
        full_next_state = next_states.reshape(-1)
        self.i_episode = i_episode
        self.memory.add(state=states, full_state=full_state, action=actions,
                        reward=rewards, next_state=next_states,
                        full_next_state=full_next_state, done=dones)
        if (i_episode >= self.eps_before_train) and (self.memory.size() >= self.batch_size):
            for agent_id in range(self.num_agents):
                experiences = self.memory.sample(self.batch_size)
                self.learn(agent_id, experiences)
            self.soft_update_all()

    def soft_update_all(self):
        for agent in self.agents:
            agent.soft_update_all()

    def learn(self, agent_id, experiences):
        agent = self.agents[agent_id]
        (states_e, full_states_e, actions_e, rewards_e,
         next_states_e, full_next_states_e, dones_e) = experiences
        rewards = rewards_e[:, agent_id].view(-1, 1)
        dones = dones_e[:, agent_id].view(-1, 1)

        # Update the centralized critic
        target_actions = self.target_act(next_states_e)
        Q_target_next = agent.critic_target(
            full_next_states_e, target_actions.view(-1, self.action_dim))
        Q_target = rewards + self.gamma * Q_target_next * (1.0 - dones)
        Q_local = agent.critic_local(full_states_e,
                                     actions_e.view(-1, self.action_dim))
        critic_loss = F.mse_loss(input=Q_local, target=Q_target.detach())
        agent.critic_local.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), 2)
        agent.critic_optimizer.step()

        # Update the actor policy: replace this agent's action with the one
        # proposed by its local actor and maximize the critic's estimate
        agent_states = states_e[:, agent_id]
        agent_actions = agent.actor_local(agent_states)
        actions = actions_e.clone()
        actions[:, agent_id] = agent_actions
        actor_loss = -agent.critic_local(
            full_states_e, actions.view(-1, self.action_dim)).mean()
        agent.actor_local.zero_grad()
        actor_loss.backward()
        # Clip after backward() so the clipping is applied to the fresh gradients
        torch.nn.utils.clip_grad_norm_(agent.actor_local.parameters(), 2)
        agent.actor_optimizer.step()

        actor_loss_value = actor_loss.cpu().detach().item()
        critic_loss_value = critic_loss.cpu().detach().item()
        return actor_loss_value, critic_loss_value

    def target_act(self, states):
        # Actions proposed by every agent's target actor, kept per-agent
        actions = torch.zeros(states.shape[:2] + (self.action_size,),
                              dtype=torch.float, device=self.device)
        for i in range(self.num_agents):
            actions[:, i, :] = self.agents[i].actor_target(states[:, i])
        return actions

    def local_act(self, states):
        # Actions proposed by every agent's local (online) actor
        actions = torch.zeros(states.shape[:2] + (self.action_size,),
                              dtype=torch.float, device=self.device)
        for i in range(self.num_agents):
            actions[:, i, :] = self.agents[i].actor_local(states[:, i])
        return actions
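
# Hedged usage sketch (not part of the original class): one way to drive
# MultiAgent against a multi-agent environment. The `env` object and its
# reset()/step() API are assumptions for illustration; reset() is assumed to
# return an array of shape (num_agents, state_size) and step(actions) to
# return (next_states, rewards, dones) with one entry per agent.
def run_training(env, state_size, action_size, num_agents=2,
                 n_episodes=3000, max_t=1000, device="cpu"):
    agent = MultiAgent(state_size, action_size, num_agents=num_agents,
                       device=device)
    for i_episode in range(1, n_episodes + 1):
        states = env.reset()
        agent.reset()  # reset each agent's exploration noise process
        for t in range(max_t):
            actions = agent.act(states)
            next_states, rewards, dones = env.step(actions)
            agent.step(states, actions, rewards, next_states, dones, t, i_episode)
            states = next_states
            if np.any(dones):
                break
    return agent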