class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 # self.exploration_theta = 0.085 # self.exploration_sigma = 0.15 self.exploration_theta = 0.070 self.exploration_sigma = 0.20 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.70 # discount factor self.tau = 0.01 # for soft update of target parameters def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, states): """Returns actions for given state(s) as per current policy.""" state = np.reshape(states, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
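# The OUNoise process used above is defined elsewhere in the project. For
# reference, here is a minimal sketch of a standard Ornstein-Uhlenbeck process
# matching the constructor call OUNoise(size, mu, theta, sigma) used in
# __init__ (an assumed implementation, not necessarily the project's exact one):
import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta  # pull strength toward the mean
        self.sigma = sigma  # scale of the random perturbations
        self.reset()

    def reset(self):
        """Reset the internal state to the long-running mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process: dx = theta * (mu - x) + sigma * N(0, 1)."""
        dx = (self.theta * (self.mu - self.state)
              + self.sigma * np.random.randn(len(self.state)))
        self.state = self.state + dx
        return self.state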
class DQLagent(): """Reinforcement Learning agent that learns using a DQL network.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size # Exploration parameters self.decay_max = 1.0 # exploration probability at start self.decay_min = 0.01 # minimum exploration probability self.decay_rate = 0.0001 # exponential decay rate for exploration prob self.decay_step = np.exp(-self.decay_rate) self.decay_range = self.decay_max - self.decay_min self.decay_factor = 1. self.explore_p = self.decay_max # Network parameters self.learning_rate = 0.0001 # Q-network learning rate #self.learning_rate = 0.001 # Q-network learning rate # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor # Score tracker and learning parameters self.best_score = -np.inf self.score = -np.inf self.loss = 0 self.qnet = QNetwork(self.state_size, self.action_size, name='main', learning_rate=self.learning_rate) # Episode variables self.reset_episode() def reset_episode(self,new_tgt_pos=None): self.total_reward = 0.0 self.count = 0 state = self.task.reset(new_tgt_pos) self.last_state = state return state def act(self, tfsess, state): """Returns actions for given state(s) as per current policy.""" # Explore or Exploit if len(self.memory) > self.batch_size: # epsilon-greedy policy: self.decay_factor *= self.decay_step self.explore_p = self.decay_min + (self.decay_range*self.decay_factor) if self.explore_p > np.random.rand(): # Make a random action actions = np.random.randint(0,self.action_size) else: # Get actions from Q-network feed = {self.qnet.inputs_: state.reshape((1, *state.shape))} Qs = tfsess.run(self.qnet.output, feed_dict=feed) actions = np.argmax(Qs) else: # pick actions equi-probablistically actions = np.random.randint(0,self.action_size) return actions def step(self, tfsess, action, # int reward, # np.ndarray (action_repeat,) next_state, # np.ndarray (state_size*action_repeat,) done): # bool # Save experience / reward self.memory.add(self.last_state, action, np.mean(reward), next_state, done) # Save experience / reward self.total_reward += np.mean(reward) self.count += 1 # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(tfsess,experiences) # Roll over last state and action self.last_state = next_state def learn(self, tfsess, expbatch): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.array([e.state for e in expbatch if e is not None]) actions = np.array([e.action for e in expbatch if e is not None]).astype(np.float32) rewards = np.array([e.reward for e in expbatch if e is not None]).astype(np.float32) dones = np.array([e.done for e in expbatch if e is not None]).astype(np.uint8) next_states = np.array([e.next_state for e in expbatch if e is not None]) # Train network target_Qs = tfsess.run(self.qnet.output, feed_dict={self.qnet.inputs_: next_states}) # Set target_Qs to 0 for states where episode ends target_Qs[dones] = np.zeros(self.action_size) targets = rewards + self.gamma * np.max(target_Qs, axis=1) self.loss, _ = tfsess.run([self.qnet.loss, self.qnet.opt], feed_dict={self.qnet.inputs_: states, self.qnet.targetQs_: targets, self.qnet.actions_: actions}) self.score = self.total_reward / float(self.count) if self.count else 0.0 if self.score > self.best_score: self.best_score = self.score
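# Both agents above construct ReplayBuffer(buffer_size, batch_size) and rely on
# add/sample/__len__ with namedtuple-style experiences (e.state, e.action, ...).
# A minimal sketch consistent with that interface (an assumption, not the
# project's exact code; the cfg-driven Agent further below uses a different,
# dict-based buffer):
import random
from collections import deque, namedtuple

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])


class ReplayBuffer:
    """Fixed-size buffer that stores and uniformly samples experience tuples."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest experiences roll off
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        """Uniformly sample a batch of experiences without replacement."""
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)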
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" #name: is a name to use to save the netural Network models #load: load data from existing models or cretae an entirly new model def __init__(self, task, name, loadfile=False): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) self.name = name if loadfile: self.actor_local.model.load_weights("./weights/" + name + "_actor.h5") self.critic_local.model.load_weights("./weights/" + name + "_critic.h5") # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 #0.3 #original 0.15 self.exploration_sigma = 0.3 #0.3 #original 0.3 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 1000000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.001 # for soft update of target parameters def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, states): """Returns actions for given state(s) as per current policy.""" state = np.reshape(states, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) #rewards = np.interp(rewards, (rewards.min(), rewards.max()), (-1, +1)) #TESTING to scale rewards to a small number. 
dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def save_weights(self): self.actor_local.model.save_weights("./weights/" + self.name + "_actor.h5") self.critic_local.model.save_weights("./weights/" + self.name + "_critic.h5") #Notice that after training over a batch of experiences, we could just copy our newly learned weights (from the local model) to the target model. #However, individual batches can introduce a lot of variance into the process, so it's better to perform a soft update, controlled by the parameter tau. def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
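# A hypothetical training loop for the DDPG agent above, assuming `task` is the
# project's task/environment instance and that task.step(action) returns
# (next_state, reward, done); num_episodes is a placeholder budget:
num_episodes = 1000  # placeholder
agent = DDPG(task, name="ddpg_run1", loadfile=False)

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)  # assumed task API
        agent.step(action, reward, next_state, done)
        state = next_state
    agent.save_weights()  # persist actor/critic weights after each episode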
class Agent():
    def __init__(self, cfg):
        # Environment configuration
        self.action_shape = cfg['env']['action_shape']

        # Replay memory
        cfg['agent']['memory']['action_shape'] = self.action_shape
        self.memory = ReplayBuffer(**cfg['agent']['memory'])

        # Algorithm parameters
        self.exploration_mu, self.exploration_sigma = cfg['agent']['noise']
        self.gamma = cfg['agent']['gamma']
        self.tau = cfg['agent']['tau']

        state_flatten_shape = [np.prod(self.memory.flatten_state_shape)]

        # Actor Model
        self.actor = Actor(state_flatten_shape, self.action_shape,
                           cfg['env']['action_range'], self.tau,
                           self.memory.batch_size, cfg['actor'])

        # Critic Model
        self.critic = Critic(state_flatten_shape, self.action_shape,
                             self.tau, cfg['critic'])

        # Flags & counters
        self.training = True
        self.episode = 0
        self.max_episode_explore = cfg['agent']['explore']

    def init_actor_critic(self):
        # Initialize target models with local model parameters
        self.critic.copy_local_in_target()
        self.actor.copy_local_in_target()

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done,
                        self.training)

    def act(self, state):
        self.last_state = state
        window_states = state.reshape(1, -1)
        action = self.actor.predict(window_states)

        if self.training and self.episode < self.max_episode_explore:
            # Anneal exploration: blend the policy's action with Gaussian noise,
            # shifting weight from noise to policy as episodes progress
            p = self.episode / self.max_episode_explore
            action = p * action + (1 - p) * np.random.normal(
                self.exploration_mu, self.exploration_sigma)

        return np.clip(action.ravel(), a_max=900, a_min=0)

    def learn(self):
        if self.memory.is_sufficient():
            experiences = self.memory.sample()

            states = experiences['state'][:, 0].reshape(self.memory.batch_size, -1)
            actions = experiences['action'][:, 0].reshape(self.memory.batch_size, -1)
            rewards = experiences['reward']
            dones = experiences['done']
            next_states = experiences['next_state'][:, 0].reshape(
                self.memory.batch_size, -1)

            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor.get_targets(next_states)
            Q_targets_next = self.critic.get_targets(next_states, actions_next)

            # Compute Q targets for current states and train critic model
            Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
            self.critic.fit(states, actions, Q_targets)

            # Train actor model
            action_gradients = self.critic.get_actions_grad(states, actions)[0]
            self.actor.fit(states, action_gradients)

            # Soft-update target models
            self.critic.soft_update()
            self.actor.soft_update()
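# A hypothetical cfg dict covering every key read in Agent.__init__ above; the
# real schema lives in the project's configuration, so all values here are
# illustrative placeholders:
cfg = {
    'env': {
        'action_shape': (4,),      # e.g. four rotor speeds
        'action_range': (0, 900),  # consistent with the np.clip bounds in act()
    },
    'agent': {
        'memory': {'buffer_size': 100000, 'batch_size': 64},  # ReplayBuffer kwargs
        'noise': (0.0, 0.2),       # (exploration_mu, exploration_sigma)
        'gamma': 0.99,
        'tau': 0.001,
        'explore': 500,            # episodes over which noise is annealed
    },
    'actor': {},                   # Actor hyperparameters (project-specific)
    'critic': {},                  # Critic hyperparameters (project-specific)
}

agent = Agent(cfg)
agent.init_actor_critic()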