import os

import numpy as np
import tensorflow as tf  # TF 1.x API (sessions, tf.summary.FileWriter)


def train_model(context, data, training_batch):
    # Create the synthetic data generator
    context.synthetic_data = SyntheticData(context=context, data=data,
                                           window=10000, frequency=30)

    # Build the model configuration together with the actor/critic networks
    create_model(context)

    # Set up the operation summaries for TensorBoard
    summary_ops, summary_vars = build_summaries()
    writer = tf.summary.FileWriter(
        "/home/enzo/PycharmProjects/DDPGPorfolioOptimization/summaries",
        context.sess.graph)

    if os.path.exists(context.model_path):
        context.saver.restore(context.sess, context.model_path)

    # Initialize the replay buffer
    replay_buffer = ReplayBuffer(context.buffer_size)

    for episode in range(context.max_episodes):
        data, close_prices = context.synthetic_data.get_trayectory(
            t_intervals=context.max_ep_steps + context.n)

        # Reset the portfolio values at the start of every episode
        context.portfolio_value_memory = [context.init_train_portfolio]
        context.train_invested_quantity = 0.0
        context.assets_quantity_invested = [0.0] * len(context.assets)
        context.init_portfolio_w = [0.0] * (len(context.assets) + 1)
        context.portfolio_w_memory = [context.init_portfolio_w]
        context.train_cash = context.init_train_portfolio
        context.last_train_operation = 2
        context.open_trade = False

        ep_reward = 0
        ep_ave_max_q = 0
        ep_loss = 0

        # One step is subtracted to account for fetching the next state
        for i in range(context.max_ep_steps - 1):
            # Current state
            s = data[:, i:i + context.n, :]

            # Perturb the action to balance the exploration/exploitation trade-off
            random = np.random.rand()
            if random > context.epsilon:
                if s.shape == (len(context.assets), context.n, len(context.features)):
                    a = context.actor.predict([s])[0]
                else:
                    print("Episode:", episode, "Step:", i,
                          "The current state has the wrong shape")
                    continue
            else:
                # Random portfolio weights: a softmax over uniform noise keeps
                # them positive and summing to one
                rand_array = np.random.rand(len(context.assets) + 1)
                a = np.exp(rand_array) / np.sum(np.exp(rand_array))

            context.epsilon = context.epsilon * context.epsilon_decay

            # Next state
            s2 = data[:, i + 1:i + 1 + context.n, :]
            if not s2.shape == (len(context.assets), context.n, len(context.features)):
                print("Episode:", episode, "Step:", i,
                      "The next state has the wrong shape")
                continue

            # Reward
            this_closes = close_prices[:, i + context.n]
            previous_closes = close_prices[:, i + context.n - 1]
            r = get_reward(context, this_closes, previous_closes, a)

            # Terminal flag
            t = i == (context.max_ep_steps - context.n - 2)

            replay_buffer.add(s, a, r, t, s2)

            if replay_buffer.size() > context.minibatch_size:
                s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                    context.minibatch_size)

                # Compute the targets
                target_q = context.critic.predict_target(
                    s2_batch, context.actor.predict_target(s2_batch))

                y_i = []
                for k in range(context.minibatch_size):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + context.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value_batch = np.reshape(y_i, (context.minibatch_size, 1))
                predicted_q_value, losses, _ = context.critic.train(
                    s_batch, a_batch, predicted_q_value_batch)
                ep_loss += np.mean(losses)
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled action gradient
                a_outs = context.actor.predict(s_batch)
                grads = context.critic.action_gradients(s_batch, a_outs)
                context.actor.train(s_batch, grads[0])

                # Update the target networks
                context.actor.update_target_network()
                context.critic.update_target_network()

            ep_reward += r

            if i == (context.max_ep_steps - 2):
                summary_str = context.sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(i),
                    summary_vars[2]: ep_loss / float(i)
                })
                writer.add_summary(summary_str, episode)
                writer.flush()

                print('| Reward: {:.5f} | Episode: {:d} | Qmax: {:.4f} | '
                      'Portfolio value: {:.4f} | Epsilon: {:.5f} '
                      .format(ep_reward, episode, (ep_ave_max_q / float(i)),
                              context.portfolio_value_memory[-1], context.epsilon))

                _ = context.saver.save(context.sess, context.model_path)
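
# The training loop above relies on a build_summaries() helper that is not
# defined in this file. The sketch below is an assumption, not the original
# implementation: it follows the common TF 1.x pattern of one scalar summary
# per logged quantity (episode reward, average max Q, loss), matching how
# summary_vars[0..2] are fed through the feed_dict above.
def build_summaries():
    episode_reward = tf.Variable(0.)
    tf.summary.scalar("Reward", episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.summary.scalar("Qmax", episode_ave_max_q)
    episode_loss = tf.Variable(0.)
    tf.summary.scalar("Loss", episode_loss)

    summary_vars = [episode_reward, episode_ave_max_q, episode_loss]
    summary_ops = tf.summary.merge_all()
    return summary_ops, summary_vars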

import torch
import torch.nn.functional as F

# DDPG (the per-agent actor/critic wrapper) and ReplayBuffer are provided by
# companion modules of this project.


class MultiAgent:
    def __init__(self, state_size, action_size, num_agents=2, eps_before_train=500,
                 gamma=0.99, batch_size=128, buffer_size=int(1e5), lr_actor=1e-4,
                 lr_critic=1e-3, weight_decay=0, tau=1e-3, noise_weight=1.0,
                 noise_decay=0.999998, noise_min=1e-3, seed=0, device="cuda:0"):
        torch.manual_seed(seed)
        np.random.seed(seed)

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        # Joint action dimension seen by the centralized critics
        self.action_dim = action_size * num_agents
        self.eps_before_train = eps_before_train
        self.gamma = gamma
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay
        self.tau = tau
        self.noise_weight = noise_weight
        self.noise_decay = noise_decay
        self.noise_min = noise_min
        self.device = device
        self.i_episode = 0

        # One DDPG agent per player; all agents share a single replay buffer
        self.agents = [
            DDPG(self.state_size, self.action_size, self.num_agents,
                 random_seed=2 * i * seed, lr_actor=self.lr_actor,
                 lr_critic=self.lr_critic, weight_decay=self.weight_decay,
                 tau=self.tau, device=self.device)
            for i in range(self.num_agents)
        ]
        self.memory = ReplayBuffer(self.action_size, self.buffer_size, seed)

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def act(self, states, add_noise=True):
        noise_weight = self.noise_weight if add_noise else 0.0
        if (self.i_episode >= self.eps_before_train) and (self.noise_weight > self.noise_min):
            self.noise_weight *= self.noise_decay
            noise_weight = self.noise_weight
        actions = [
            agent.act(s, noise_weight=noise_weight)
            for s, agent in zip(states, self.agents)
        ]
        return np.array(actions)

    def step(self, states, actions, rewards, next_states, dones, t, i_episode):
        # Flatten the per-agent observations into the full (joint) state
        full_state = states.reshape(-1)
        full_next_state = next_states.reshape(-1)
        self.i_episode = i_episode
        self.memory.add(state=states, full_state=full_state, action=actions,
                        reward=rewards, next_state=next_states,
                        full_next_state=full_next_state, done=dones)
        if (i_episode >= self.eps_before_train) and (self.memory.size() >= self.batch_size):
            for agent_id in range(self.num_agents):
                experiences = self.memory.sample(self.batch_size)
                self.learn(agent_id, experiences)
            self.soft_update_all()

    def soft_update_all(self):
        for agent in self.agents:
            agent.soft_update_all()

    def learn(self, agent_id, experiences):
        agent = self.agents[agent_id]
        (states_e, full_states_e, actions_e, rewards_e,
         next_states_e, full_next_states_e, dones_e) = experiences
        rewards = rewards_e[:, agent_id].view(-1, 1)
        dones = dones_e[:, agent_id].view(-1, 1)

        # Update the centralized critic
        target_actions = self.target_act(next_states_e)
        Q_target_next = agent.critic_target(
            full_next_states_e, target_actions.view(-1, self.action_dim))
        Q_target = rewards + self.gamma * Q_target_next * (1.0 - dones)
        Q_local = agent.critic_local(full_states_e,
                                     actions_e.view(-1, self.action_dim))
        critic_loss = F.mse_loss(input=Q_local, target=Q_target.detach())
        agent.critic_local.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), 2)
        agent.critic_optimizer.step()

        # Update the actor policy: replace this agent's action with the one
        # proposed by its local actor and maximize the critic's estimate
        agent_states = states_e[:, agent_id]
        agent_actions = agent.actor_local(agent_states)
        actions = actions_e.clone()
        actions[:, agent_id] = agent_actions
        actor_loss = -agent.critic_local(
            full_states_e, actions.view(-1, self.action_dim)).mean()
        agent.actor_local.zero_grad()
        actor_loss.backward()
        # Clip after backward() so the clipping is applied to the fresh gradients
        torch.nn.utils.clip_grad_norm_(agent.actor_local.parameters(), 2)
        agent.actor_optimizer.step()

        actor_loss_value = actor_loss.cpu().detach().item()
        critic_loss_value = critic_loss.cpu().detach().item()
        return actor_loss_value, critic_loss_value

    def target_act(self, states):
        # Actions proposed by every agent's target actor, kept per-agent
        actions = torch.zeros(states.shape[:2] + (self.action_size,),
                              dtype=torch.float, device=self.device)
        for i in range(self.num_agents):
            actions[:, i, :] = self.agents[i].actor_target(states[:, i])
        return actions

    def local_act(self, states):
        # Actions proposed by every agent's local (online) actor
        actions = torch.zeros(states.shape[:2] + (self.action_size,),
                              dtype=torch.float, device=self.device)
        for i in range(self.num_agents):
            actions[:, i, :] = self.agents[i].actor_local(states[:, i])
        return actions
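
# Hedged usage sketch (not part of the original class): one way to drive
# MultiAgent against a multi-agent environment. The `env` object and its
# reset()/step() API are assumptions for illustration; reset() is assumed to
# return an array of shape (num_agents, state_size) and step(actions) to
# return (next_states, rewards, dones) with one entry per agent.
def run_training(env, state_size, action_size, num_agents=2,
                 n_episodes=3000, max_t=1000, device="cpu"):
    agent = MultiAgent(state_size, action_size, num_agents=num_agents,
                       device=device)
    for i_episode in range(1, n_episodes + 1):
        states = env.reset()
        agent.reset()  # reset each agent's exploration noise process
        for t in range(max_t):
            actions = agent.act(states)
            next_states, rewards, dones = env.step(actions)
            agent.step(states, actions, rewards, next_states, dones, t, i_episode)
            states = next_states
            if np.any(dones):
                break
    return agent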