Example #1

import copy

import numpy as np
import torch
import torch.nn as nn

# Project utilities assumed to come from the surrounding repository:
# ReplayBuffer, EpisodeStats, soft_update, hard_update, print_stats and
# tn (a tensor-to-numpy conversion used when stepping the environment).

class TD3:
	def __init__(self, actor, critic, reward_fun, gamma=0.99, tau=0.005, # policy_noise=0.2, noise_clip=0.5,
				 policy_freq=2, max_buffer_size=1e6, batch_size=64, lr=3e-4
				 ):

		self._actor = actor
		self._actor_target = copy.deepcopy(self._actor)
		self._actor_optimizer = torch.optim.Adam(self._actor.parameters(), lr=lr)

		self._critic = critic
		self._critic_target = copy.deepcopy(self._critic)
		self._critic_loss = nn.MSELoss()
		self._critic_optimizer = torch.optim.Adam(self._critic.parameters(), lr=lr)

		self.reward_fun = reward_fun

		self._gamma = gamma
		self._tau = tau
		self._policy_freq = policy_freq

		self._rbuffer_max_size = max_buffer_size
		self._replay_buffer = ReplayBuffer(self._rbuffer_max_size)
		self._batch_size = batch_size

		self._steps = 0
		self._run = 0

	def get_action(self, s, deterministic=False):
		return self._actor.get_action(s, deterministic=deterministic)

	def train(self, env, episodes, time_steps, initial_state=None, initial_noise=0.5):

		stats = EpisodeStats(episode_lengths=np.zeros(episodes), episode_rewards=np.zeros(episodes),
							 episode_loss=np.zeros(episodes))

		self._run += 1

		for e in range(episodes):

			s = env.reset(initial_state=initial_state, noise_amplitude=initial_noise)

			for t in range(time_steps):

				a = self._actor.get_action(s, deterministic=False)
				ns, r, d, _ = env.step(tn(a))

				stats.episode_rewards[e] += r
				stats.episode_lengths[e] = t

				self._steps += 1
				self._replay_buffer.add_transition(s, a, ns, r, d)

				# Sample replay buffer
				b_states, b_actions, b_nstates, b_rewards, b_terminal = self._replay_buffer.random_next_batch(self._batch_size)

				# Get next actions from the target actor; deterministic=False stands in for
				# TD3's explicit target-policy smoothing (policy_noise/noise_clip are commented out above)
				b_nactions = self._actor_target.get_action(b_nstates, deterministic=False)

				# Compute the target Q value from target critic
				target_Q1, target_Q2 = self._critic_target(b_nstates, b_nactions)
				target_Q = torch.min(target_Q1, target_Q2).reshape((-1))
				target_Q = b_rewards + (1 - b_terminal) * self._gamma * target_Q
				target_Q = target_Q.reshape((-1, 1)).detach()

				# Get current Q estimates from critic
				current_Q1, current_Q2 = self._critic(b_states, b_actions)

				# Compute critic loss
				critic_loss = self._critic_loss(current_Q1, target_Q) + self._critic_loss(current_Q2, target_Q)

				stats.episode_loss[e] += critic_loss.item()

				# Optimize the critic
				self._critic_optimizer.zero_grad()
				critic_loss.backward()
				self._critic_optimizer.step()

				# Delayed policy updates
				if self._steps % self._policy_freq == 0:

					# Compute the actor loss via the deterministic policy gradient (maximize Q1)
					actor_loss = -self._critic.Q1(b_states, self._actor.get_action(b_states, deterministic=True)).mean()

					# Optimize the actor
					self._actor_optimizer.zero_grad()
					actor_loss.backward()
					self._actor_optimizer.step()

					# Soft-Update the target models
					soft_update(self._critic_target, self._critic, self._tau)
					soft_update(self._actor_target, self._actor, self._tau)

				if d:
					break
				s = ns

			pr_stats = {'run': self._run, 'steps': int(stats.episode_lengths[e] + 1),
						'episode': e + 1, 'episodes': episodes,
						'reward': stats.episode_rewards[e], 'loss': stats.episode_loss[e]}
			print_stats(pr_stats)

		return stats

	def reset_parameters(self):
		self._actor.reset_parameters()
		self._actor_target.reset_parameters()
		self._critic.reset_parameters()
		self._critic_target.reset_parameters()

		hard_update(self._actor_target, self._actor)
		hard_update(self._critic_target, self._critic)

		self._steps = 0
		self._replay_buffer = ReplayBuffer(self._rbuffer_max_size)
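
Both examples call soft_update and hard_update helpers that are not shown on this page. The sketch below is an assumption about what they do, following the standard Polyak-averaging scheme used in TD3/DDPG-style implementations; the names target, source and tau mirror how the methods above call them.

def soft_update(target, source, tau):
	# Polyak averaging: target <- tau * source + (1 - tau) * target
	for target_param, source_param in zip(target.parameters(), source.parameters()):
		target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)


def hard_update(target, source):
	# Copy the source parameters into the target network verbatim
	for target_param, source_param in zip(target.parameters(), source.parameters()):
		target_param.data.copy_(source_param.data)
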
Example #2

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Project utilities assumed to come from the surrounding repository:
# ReplayBuffer, EpisodeStats, soft_update, hard_update, print_stats.

class DQN:
    def __init__(self,
                 policy,
                 action_fun,
                 q,
                 q_target,
                 state_dim,
                 action_dim,
                 gamma,
                 double_q=True,
                 reward_fun=None,
                 replay_buffer=False,
                 max_buffer_size=1e6,
                 batch_size=64,
                 tau=0.01,
                 lr=1e-4):

        self._q = q
        self._q_target = q_target

        self._pi = policy
        self._action_fun = action_fun

        self.reward_fun = reward_fun

        self._doubleQ = double_q

        if torch.cuda.is_available():
            self._q.cuda()
            self._q_target.cuda()

        self._gamma = gamma
        self._tau = tau

        self._state_dim = state_dim
        self._action_dim = action_dim

        self._use_rbuffer = replay_buffer
        if self._use_rbuffer:
            self._rbuffer_max_size = max_buffer_size
            self._replay_buffer = ReplayBuffer(self._rbuffer_max_size)
            self._batch_size = batch_size

        self._learning_rate = lr

        self._loss_function = nn.MSELoss()
        self._q_optimizer = optim.Adam(self._q.parameters(),
                                       lr=self._learning_rate)

        self._run = 0

    def _get_action(self, s, deterministic=False):
        return self._pi.get_action(s, deterministic=deterministic)

    def get_action(self, s, deterministic=False):
        return self._action_fun.act2env(
            self._get_action(s, deterministic=deterministic))

    def train(self,
              env,
              episodes,
              time_steps,
              initial_state=None,
              initial_noise=0.5):

        stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                             episode_rewards=np.zeros(episodes),
                             episode_loss=np.zeros(episodes))

        self._run += 1

        for e in range(episodes):

            s = env.reset(initial_state=initial_state,
                          noise_amplitude=initial_noise)
            total_r = 0

            # Advance the policy's epsilon scheduler once per episode
            epsilon = self._pi.epsilon()
            self._pi.step()

            for t in range(time_steps):

                a = self._get_action(s)
                ns, r, d, _ = env.step(self._action_fun.act2env(a))

                stats.episode_rewards[e] += r
                stats.episode_lengths[e] = t

                total_r += r

                if self._use_rbuffer:
                    self._replay_buffer.add_transition(s, a, ns, r, d)
                    b_states, b_actions, b_nstates, b_rewards, b_terminal = self._replay_buffer.random_next_batch(
                        self._batch_size)
                    dim = 1
                else:
                    b_states = s
                    b_actions = a
                    b_nstates = ns
                    b_rewards = r
                    b_terminal = d
                    dim = 0

                if self._doubleQ:

                    # Q-values of the next states from [Q], used only to select the best next actions
                    q_nstates = self._q(b_nstates)
                    # Greedy next-action indices predicted by [Q]
                    nactions = torch.argmax(q_nstates, dim=dim)
                    if self._use_rbuffer:
                        nactions = [
                            torch.arange(self._batch_size).long(), nactions
                        ]

                    # Q-Values from [Q_target] function using the action indices from [Q] function
                    q_target_nstates = self._q_target(b_nstates)[nactions]

                else:
                    q_target_nstates = self._q_target(b_nstates)
                    # torch.max with a dim argument returns (values, indices); keep only the values
                    q_target_nstates = torch.max(q_target_nstates, dim=dim)[0]

                target_prediction = b_rewards + (
                    1 - b_terminal) * self._gamma * q_target_nstates

                if self._use_rbuffer:
                    q_actions = [
                        torch.arange(self._batch_size).long(),
                        b_actions.long()
                    ]
                else:
                    q_actions = b_actions

                current_prediction = self._q(b_states)[q_actions]

                loss = self._loss_function(current_prediction,
                                           target_prediction.detach())

                stats.episode_loss[e] += loss.item()

                self._q_optimizer.zero_grad()
                loss.backward()
                self._q_optimizer.step()

                soft_update(self._q_target, self._q, self._tau)

                if d:
                    break
                s = ns

            pr_stats = {
                'run': self._run,
                'steps': int(stats.episode_lengths[e] + 1),
                'episode': e + 1,
                'episodes': episodes,
                'reward': stats.episode_rewards[e],
                'loss': stats.episode_loss[e]
            }
            print_stats(pr_stats, ', Epsilon: {:6.5f}'.format(epsilon))

        return stats

    def reset_parameters(self):
        self._q.reset_parameters()
        self._q_target.reset_parameters()

        hard_update(self._q_target, self._q)

        self._pi.reset_parameters()
        if self._use_rbuffer:
            self._replay_buffer = ReplayBuffer(self._rbuffer_max_size)
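
Both classes also rely on a ReplayBuffer exposing add_transition and random_next_batch. The sketch below is an assumed minimal implementation: a bounded FIFO that stores numpy-compatible transitions and samples with replacement, so a batch can be drawn even before batch_size transitions have been collected (matching how the train() loops above sample from the very first step). The buffer in the original repository may differ.

import collections
import random

import numpy as np
import torch


class ReplayBuffer:
    def __init__(self, max_size):
        # Bounded FIFO: the oldest transitions are dropped once max_size is reached
        self._data = collections.deque(maxlen=int(max_size))

    def add_transition(self, state, action, next_state, reward, done):
        self._data.append((state, action, next_state, reward, done))

    def random_next_batch(self, batch_size):
        # Sample with replacement so a batch can be drawn even from a small buffer
        batch = random.choices(self._data, k=batch_size)
        states, actions, next_states, rewards, dones = zip(*batch)

        def to_tensor(xs):
            return torch.as_tensor(np.asarray(xs), dtype=torch.float32)

        return (to_tensor(states), to_tensor(actions), to_tensor(next_states),
                to_tensor(rewards), to_tensor(dones))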