Python ReplayBuffer примеры использования

Язык программирования: Python

Пространство имен/Пакет: buffers.ReplayBuffer

Класс/Тип: ReplayBuffer

Примеров на hotexamples.com: 8

Python ReplayBuffer - 8 примеров найдено. Это лучшие примеры Python кода для buffers.ReplayBuffer.ReplayBuffer, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать / Скрыть

ReplayBuffer(4)

add(3)

sample(3)

add_new_experience(1)

update_priorities(1)

Пример #1

Показать файл

class MADDPG:
    """Drive two agents that share a single replay buffer.

    Experiences from every agent are pushed into one ``ReplayBuffer``; on
    every ``update_every``-th call to ``step`` each agent learns
    ``num_updates`` times from freshly sampled batches.
    """

    def __init__(self, state_size, action_size, seed):
        """Create the two agents, the shared memory and the hyperparameters.

        Params
        ======
            state_size: forwarded unchanged to each ``Agent``
            action_size: forwarded unchanged to each ``Agent``
            seed: forwarded unchanged to each ``Agent``
        """
        super(MADDPG, self).__init__()

        self.maddpg_agents = [
            Agent(state_size, action_size, seed),
            Agent(state_size, action_size, seed)
        ]

        # Counts calls to step() modulo update_every.
        self.t_step = 0

        # NOTE(review): the first argument (2) and the seed (0) are hard-coded
        # instead of using action_size / seed -- confirm this is intended.
        self.memory = ReplayBuffer(2, BUFFER_SIZE, BATCH_SIZE, 0)
        self.batch_size = BATCH_SIZE
        self.gamma = GAMMA
        self.update_every = UPDATE_EVERY
        self.num_updates = NUM_UPDATES

    def save(self):
        """Write every agent's local actor/critic weights under ``models/``."""
        # Imported here so the method does not rely on a module-level import.
        import os

        # torch.save does not create missing parent directories.
        os.makedirs('models', exist_ok=True)
        for i, agent in enumerate(self.maddpg_agents):
            torch.save(agent.actor_local.state_dict(),
                       'models/checkpoint_actor_{}_final.pth'.format(i))
            torch.save(agent.critic_local.state_dict(),
                       'models/checkpoint_critic_{}_final.pth'.format(i))

    def load(self):
        """Restore every agent's local actor/critic weights from ``models/``."""
        for i, agent in enumerate(self.maddpg_agents):
            actor_file = 'models/checkpoint_actor_{}_final.pth'.format(i)
            critic_file = 'models/checkpoint_critic_{}_final.pth'.format(i)
            agent.actor_local.load_state_dict(torch.load(actor_file))
            agent.critic_local.load_state_dict(torch.load(critic_file))

    def reset(self):
        """Call ``reset()`` on every agent."""
        for agent in self.maddpg_agents:
            agent.reset()

    def act(self, all_states):
        """Return one action per agent.

        Params
        ======
            all_states: iterable of per-agent states, paired with the agents
                in order; each state gets a leading axis of size 1 before it
                is handed to ``agent.act``
        """
        return [
            agent.act(np.expand_dims(states, axis=0))
            for agent, states in zip(self.maddpg_agents, all_states)
        ]

    def step(self, states, actions, rewards, next_states, dones):
        """Store one transition per entry and learn when an update is due.

        Params
        ======
            states, actions, rewards, next_states, dones: parallel iterables;
                the i-th elements together form one stored experience
        """
        for s, a, r, ns, d in zip(states, actions, rewards, next_states,
                                  dones):
            self.memory.add(s, a, r, ns, d)

        self.t_step = (self.t_step + 1) % self.update_every
        # Learn only every update_every-th step and only once the buffer
        # holds more than one batch worth of experiences.
        if self.t_step == 0 and len(self.memory) > self.batch_size:
            for _ in range(self.num_updates):
                for agent in self.maddpg_agents:
                    # Each agent gets its own freshly sampled batch.
                    experiences = self.memory.sample()
                    agent.learn(experiences, self.gamma)

Пример #2

Показать файл

    def __init__(self, state_size, action_size, seed):
        """Set up the agent pair, the shared replay memory and hyperparameters.

        Params
        ======
            state_size: passed through to every ``Agent``
            action_size: passed through to every ``Agent``
            seed: passed through to every ``Agent``
        """
        super(MADDPG, self).__init__()

        # Two agents built from identical constructor arguments.
        self.maddpg_agents = [
            Agent(state_size, action_size, seed) for _ in range(2)
        ]

        # Shared buffer; its first argument and seed are literal values here.
        self.memory = ReplayBuffer(2, BUFFER_SIZE, BATCH_SIZE, 0)

        # Step counter, starts at zero.
        self.t_step = 0

        # Hyperparameters copied from module-level constants.
        self.batch_size, self.gamma = BATCH_SIZE, GAMMA
        self.update_every, self.num_updates = UPDATE_EVERY, NUM_UPDATES

Пример #3

Показать файл

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the generator first and then
        # keep the actual seed value on the instance.
        random.seed(seed)
        self.seed = seed

        # Q-Network: local and target networks are built from the same
        # arguments; only the local one is handed to the optimizer.
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

Пример #4

Показать файл

Файл: ddpg_agent.py Проект: monusurana/reinforcement-learning-continuous-control

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the generator first and then
        # keep the actual seed value on the instance.
        random.seed(random_seed)
        self.seed = random_seed

        # Actor Network (w/ Target Network); only the local network is
        # handed to the optimizer.
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network); only the local network is
        # handed to the optimizer.
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

Пример #5

Показать файл

Файл: dqn_agent.py Проект: gtuzi/Reinforcement-Learning-Navigation

    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 sample_method='uniform',
                 method='dqn',
                 device=None,
                 **kwargs):
        """Initialize an Agent object.

        Params
        ======
            state_size (c:int x h:int x w:int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            sample_method (str): 'uniform', 'prioritized' or
                'minority_resampled'; selects the replay buffer class
            method (str): stored as ``self.dqnmethod``
            device: torch device; ``VisualAgent.device`` is used when None
            **kwargs: optional 'alpha' and 'beta0', forwarded to
                ``PrioritizedReplayBuffer`` (None when absent)

        Raises
        ======
            ValueError: if ``sample_method`` is not one of the values above
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the generator first and then
        # keep the actual seed value on the instance.
        random.seed(seed)
        self.seed = seed
        self.seqlen = SEQ_LEN
        self.device = VisualAgent.device if device is None else device

        # Q-Network: the networks receive the sequence length, not
        # state_size, as their first argument.
        self.qnetwork_local = VizQNet(self.seqlen, action_size,
                                      seed).to(self.device)
        self.qnetwork_target = VizQNet(self.seqlen, action_size,
                                       seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.dqnmethod = method
        self.sample_method = sample_method

        # Replay memory
        if sample_method == 'minority_resampled':
            self.memory = MinorityResampledReplayBuffer(action_size,
                                                        BUFFER_SIZE,
                                                        BATCH_SIZE,
                                                        seed,
                                                        device=self.device)
        elif sample_method == 'prioritized':
            # Missing keys are forwarded as None.
            self.memory = PrioritizedReplayBuffer(action_size,
                                                  BUFFER_SIZE,
                                                  BATCH_SIZE,
                                                  seed,
                                                  device=self.device,
                                                  alpha=kwargs.get('alpha'),
                                                  beta0=kwargs.get('beta0'))
        elif sample_method == 'uniform':
            self.memory = ReplayBuffer(action_size,
                                       BUFFER_SIZE,
                                       BATCH_SIZE,
                                       seed,
                                       device=self.device)
        else:
            # ValueError is still caught by callers handling Exception.
            raise ValueError('Unrecognized sampling method')

        # Step counters (modulo counters are advanced elsewhere)
        self.update_step = 0
        self.replay_step = 0
        self.episode_count = 0

Пример #6

Показать файл

Файл: dqn_agent.py Проект: gtuzi/Reinforcement-Learning-Navigation

class VisualAgent():
    """Q-learning agent whose state is a fixed-length sequence of image frames.

    ``on_new_episode`` creates ``self.sequence``; ``step`` and ``act`` read
    it, so ``on_new_episode`` has to be called before either of them.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 sample_method='uniform',
                 method='dqn',
                 device=None,
                 **kwargs):
        """Initialize an Agent object.

        Params
        ======
            state_size (c:int x h:int x w:int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            sample_method (str): 'uniform', 'prioritized' or
                'minority_resampled'; selects the replay buffer class
            method (str): target computation handed to ``learn``
                ('dqn' or 'doubledqn')
            device: torch device; ``VisualAgent.device`` is used when None
            **kwargs: optional 'alpha' and 'beta0', forwarded to
                ``PrioritizedReplayBuffer`` (None when absent)

        Raises
        ======
            ValueError: if ``sample_method`` is not one of the values above
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the generator first and then
        # keep the actual seed value on the instance.
        random.seed(seed)
        self.seed = seed
        self.seqlen = SEQ_LEN
        self.device = VisualAgent.device if device is None else device

        # Q-Network: the networks receive the sequence length, not
        # state_size, as their first argument.
        self.qnetwork_local = VizQNet(self.seqlen, action_size,
                                      seed).to(self.device)
        self.qnetwork_target = VizQNet(self.seqlen, action_size,
                                       seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.dqnmethod = method
        self.sample_method = sample_method

        # Replay memory
        if sample_method == 'minority_resampled':
            self.memory = MinorityResampledReplayBuffer(action_size,
                                                        BUFFER_SIZE,
                                                        BATCH_SIZE,
                                                        seed,
                                                        device=self.device)
        elif sample_method == 'prioritized':
            # Missing keys are forwarded as None.
            self.memory = PrioritizedReplayBuffer(action_size,
                                                  BUFFER_SIZE,
                                                  BATCH_SIZE,
                                                  seed,
                                                  device=self.device,
                                                  alpha=kwargs.get('alpha'),
                                                  beta0=kwargs.get('beta0'))
        elif sample_method == 'uniform':
            self.memory = ReplayBuffer(action_size,
                                       BUFFER_SIZE,
                                       BATCH_SIZE,
                                       seed,
                                       device=self.device)
        else:
            # ValueError is still caught by callers handling Exception.
            raise ValueError('Unrecognized sampling method')

        # Step counters, advanced in step()
        self.update_step = 0
        self.replay_step = 0
        self.episode_count = 0

    def step(self, x, action, reward, next_x, done):
        """Record one transition and trigger learning when it is due.

        Params
        ======
            x: current image; not read here because the current state is
                built from the frames already held in ``self.sequence``
            action, reward, done: stored in the replay memory as given
            next_x: next image; preprocessed and appended to the sequence
        """
        # State before the new frame is appended (leading axis of size 1).
        state = np.array(self.sequence)[None, ...]

        # Append the new image, and create the next state
        self._preprocess_(next_x)
        next_state = np.array(self.sequence)[None, ...]

        # Save experience in replay memory
        self.memory.add_new_experience(state, action, reward, next_state, done)

        # update_step is checked in learn() to decide on a target update.
        self.update_step = (self.update_step + 1) % UPDATE_EVERY
        self.replay_step = (self.replay_step + 1) % REPLAY_EVERY
        if done:
            self.episode_count += 1

        if len(self.memory) > BATCH_SIZE:
            # Prioritized sampling learns only every REPLAY_EVERY-th step;
            # every other sampling method learns on each step.
            if self.sample_method != 'prioritized' or self.replay_step == 0:
                self.learn(GAMMA, method=self.dqnmethod)

    def on_new_episode(self, x1):
        """Reset the frame sequence and fill it with copies of ``x1``."""
        self.sequence = deque(maxlen=self.seqlen)  # Reset the sequence
        # On new episode, just repeat the first image in the sequence
        for _ in range(self.seqlen):
            self._preprocess_(np.copy(x1))

    def _preprocess_(self, xi):
        """Convert one image to a single channel and append it to the sequence.

        The sequence is a bounded deque, so appending on the right drops the
        oldest frame once ``seqlen`` frames are stored.
        """
        xi = xi.squeeze()
        # Un-normalize -- assumes xi holds values in [0, 1]; TODO confirm.
        xi = np.round((xi * 255.)).astype(np.uint8)
        # Keep channel 0 of the YCbCr result, i.e. Y (luma), not chroma.
        x = convert_colorspace(xi, fromspace='rgb',
                               tospace='YCbCr')[..., 0]
        self.sequence.append(x)

    def act(self, x, eps=0.):
        """Return an action chosen epsilon-greedily from the local Q-network.

        Params
        ======
            x: current image; not read here, the state is built from
                ``self.sequence``
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # Generate the state
        state = np.array(self.sequence)[None, ...]
        state = torch.from_numpy(state).float().to(self.device)

        # Evaluate without gradients, then switch back to training mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def dqn(self, rewards, next_states, dones, gamma):
        """Compute DQN targets from the target network.

        Terminal rows (``dones == 1``) get the reward itself; the other rows
        get ``reward + gamma * max_a Q_target(next_state, a)``.

        Returns
        =======
            tensor shaped like ``rewards``
        """
        terminal = (dones == 1).squeeze()
        running = (dones == 0).squeeze()

        y = torch.zeros_like(rewards)

        # For end of episode, the return is just the final reward
        y[terminal, ...] = rewards[terminal, ...]

        # Compute the approximation of the optimal target values (Q*)
        with torch.no_grad():
            logits = self.qnetwork_target(next_states[running, ...])
            next_values, _ = torch.max(logits, 1)  # Values of next max actions
            y[running, ...] = (rewards[running, ...] +
                               gamma * next_values.unsqueeze(-1))

        return y

    def doubledqn(self, rewards, next_states, dones, gamma):
        """Compute Double-DQN targets.

        Terminal rows (``dones == 1``) get the reward itself; for the other
        rows the local network picks the next action and the target network
        supplies that action's value.

        Returns
        =======
            tensor shaped like ``rewards``
        """
        terminal = (dones == 1).squeeze()
        running = (dones == 0).squeeze()

        y = torch.zeros_like(rewards)

        # For end of episode, the return is just the final reward
        y[terminal, ...] = rewards[terminal, ...]

        with torch.no_grad():
            running_next = next_states[running, ...]

            # 1 - Index of the best next action according to the local net
            next_local_logits = self.qnetwork_local(running_next)
            _, max_next_local_act = torch.max(next_local_logits, 1)

            # 2 - Get target network's value of the local's max next action
            next_target_logits = self.qnetwork_target(running_next)
            values = next_target_logits.gather(
                1, max_next_local_act.unsqueeze(-1))

            # 3 - Bootstrapped target for the non-terminal rows
            y[running, ...] = rewards[running, ...] + gamma * values

        return y

    def learn(self, gamma, method='doubledqn'):
        """Sample a batch from memory and update the local network.

        Params
        ======
            gamma (float): discount factor
            method (str): 'dqn' or 'doubledqn'; selects how targets are built

        Raises
        ======
            ValueError: if ``method`` is not recognized
        """
        states, actions, rewards, next_states, dones, idc, weights = (
            self.memory.sample(self.episode_count))
        dones = torch.round(dones).int()

        if method == 'dqn':
            y = self.dqn(rewards=rewards,
                         next_states=next_states,
                         dones=dones,
                         gamma=gamma)
        elif method == 'doubledqn':
            y = self.doubledqn(rewards=rewards,
                               next_states=next_states,
                               dones=dones,
                               gamma=gamma)
        else:
            # ValueError is still caught by callers handling Exception.
            raise ValueError('Unrecognized method')

        # Only the local network is trained here; the target network is
        # updated by soft_update() below.
        self.qnetwork_local.train()

        # 1 - Clear out gradients from the local network
        self.optimizer.zero_grad()

        # 2 - Local estimates for the actions that were actually taken
        local_q = self.qnetwork_local(states).gather(1, actions)

        # 3 - Temporal Difference (TD) error
        td_error = y - local_q

        # Update the priorities; the small constant keeps them non-zero.
        self.memory.update_priorities(
            np.abs(td_error.data.clone().cpu().numpy()) + 1.0e-5, idc)

        # 4 - Back-propagate through the local network
        if self.sample_method == 'prioritized':
            # -weights * td_error is the gradient of
            # 0.5 * weights * td_error**2 with respect to local_q.
            local_q.backward(-weights * td_error.detach())
        else:
            # reduction='none' replaces the deprecated reduce=False and
            # keeps one loss value per element so it can be weighted.
            loss = torch.nn.MSELoss(reduction='none')(local_q, y)
            loss.backward(weights)

        # 5 - Gradient update
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        if self.update_step == 0:
            self.soft_update(TAU)

    def soft_update(self, tau):
        """Soft update the target network from the local network.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(self.qnetwork_target.parameters(),
                                             self.qnetwork_local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

Пример #7

Показать файл

class DQNAgent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the generator first and then
        # keep the actual seed value on the instance.
        random.seed(seed)
        self.seed = seed

        # Q-Network: local and target networks are built from the same
        # arguments; only the local one is handed to the optimizer.
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one experience and learn on every UPDATE_EVERY-th call."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # A leading axis of size 1 is added before the forward pass.
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Evaluate without gradients, then switch back to training mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model;
        # detach() keeps the target network out of the gradient graph.
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)

        # Compute Q targets for current states; (1 - dones) removes the
        # bootstrapped term for terminal transitions.
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

Пример #8

Показать файл

Файл: ddpg_agent.py Проект: monusurana/reinforcement-learning-continuous-control

class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the generator first and then
        # keep the actual seed value on the instance.
        random.seed(random_seed)
        self.seed = random_seed

        # Actor Network (w/ Target Network); only the local network is
        # handed to the optimizer.
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network); only the local network is
        # handed to the optimizer.
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy.

        Params
        ======
            state: numpy array handed to ``torch.from_numpy``
            add_noise (bool): add a sample from the noise process when True

        The returned action is clipped to the range [-1, 1].
        """
        state = torch.from_numpy(state).float().to(device)
        # Evaluate without gradients, then switch back to training mode.
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # The targets depend only on the target networks, which are never
        # optimized directly, so no autograd graph is needed for them.
        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            # Compute Q targets for current states (y_i); (1 - dones) removes
            # the bootstrapped term for terminal transitions.
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip the critic gradient norm to 1 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss: the negated mean critic value of the actor's
        # own actions, so minimizing it raises that value.
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)