def __init__(self, state_size, action_size, actor_lr, critic_lr,
                 random_seed, mu, theta, sigma, buffer_size, batch_size,
                 epsilon_start, epsilon_min, epsilon_decay, gamma, tau,
                 n_time_steps, n_learn_updates, device):

        self.state_size = state_size
        self.action_size = action_size

        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, name="Actor_local")
        self.actor_target = Actor(state_size, action_size, name="Actor_target")
        self.actor_optimizer = Adam(learning_rate=self.actor_lr)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size,
                                   action_size,
                                   name="Critic_local")
        self.critic_target = Critic(state_size,
                                    action_size,
                                    name="Critic_target")
        self.critic_optimizer = Adam(learning_rate=self.critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.noise = OUNoise(action_size, random_seed, mu, theta, sigma)
        self.epsilon = epsilon_start
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

        # Replay memory
        self.batch_size = int(batch_size)
        self.buffer_size = int(buffer_size)
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size,
                                   random_seed)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau  # for soft update of target parameters
        self.n_time_steps = n_time_steps  # number of time steps before updating network parameters
        self.n_learn_updates = n_learn_updates  # number of updates per learning step

        # Device
        self.device = device

        # Clear Keras' global state (name counters, cached graphs); usually called before building new models
        tf.keras.backend.clear_session()
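
The constructors above and below rely on an OUNoise helper that none of the examples define. Below is a minimal sketch of a typical Ornstein-Uhlenbeck noise process, assuming the (size, seed, mu, theta, sigma) signature used in the example above (later examples drop the seed or the distribution parameters):

import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.array(
            [random.gauss(0, 1) for _ in range(len(self.state))])
        self.state = self.state + dx
        return self.state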
Example #4
import numpy as np

# Actor, Critic, OUNoise and ReplayBuffer are project-local helper classes (not shown here).


class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        
        self.actor_lr = 0.0001
        self.critic_lr = 0.001

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_lr)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_lr)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, self.critic_lr)
        self.critic_target = Critic(self.state_size, self.action_size, self.critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters
        
        # Score tracker and learning parameters
        self.best_score = -np.inf
        self.score = 0

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        
        self.total_reward += reward
        self.count += 1
        
        if done:
            # Average reward per time step over the finished episode
            self.score = self.total_reward / float(self.count) if self.count else 0.0
            if self.score > self.best_score:
                self.best_score = self.score

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local); the trailing 0/1 is the Keras backend
        # learning-phase flag (0 = inference, 1 = training)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)   

    def soft_update(self, local_model, target_model):
        """Soft update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target."""
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        # Interpolate weight tensor by weight tensor; avoids building a ragged NumPy object array
        new_weights = [self.tau * lw + (1.0 - self.tau) * tw
                       for lw, tw in zip(local_weights, target_weights)]
        target_model.set_weights(new_weights)
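
Example #4's step and learn assume a ReplayBuffer whose sample() returns a list of experience namedtuples with fields state, action, reward, next_state and done. Below is a minimal sketch under that assumption (Example #5 would instead need a buffer whose sample() returns pre-batched arrays or tensors):

import random
from collections import deque, namedtuple


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples."""

    def __init__(self, buffer_size, batch_size, seed=None):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(
            self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        """Return the current number of stored experiences."""
        return len(self.memory)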
Example #5
import numpy as np
import tensorflow as tf
from tensorflow.keras.losses import MSE
from tensorflow.keras.optimizers import Adam

# Actor, Critic, OUNoise and ReplayBuffer are project-local helper classes (not shown here).


class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, state_size, action_size, actor_lr, critic_lr,
                 random_seed, mu, theta, sigma, buffer_size, batch_size, gamma,
                 tau, n_time_steps, n_learn_updates, device):

        self.state_size = state_size
        self.action_size = action_size

        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, name="Actor_local")
        self.actor_target = Actor(state_size, action_size, name="Actor_target")
        self.actor_optimizer = Adam(learning_rate=self.actor_lr)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size,
                                   action_size,
                                   name="Critic_local")
        self.critic_target = Critic(state_size,
                                    action_size,
                                    name="Critic_target")
        self.critic_optimizer = Adam(learning_rate=self.critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.noise = OUNoise(self.action_size, random_seed, mu, theta, sigma)

        # Replay memory
        self.batch_size = int(batch_size)
        self.buffer_size = int(buffer_size)
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size,
                                   random_seed)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau  # for soft update of target parameters
        self.n_time_steps = n_time_steps  # number of time steps before updating network parameters
        self.n_learn_updates = n_learn_updates  # number of updates per learning step

        # Device
        self.device = device

    def reset(self):
        """Reset the agent."""
        self.noise.reset()

    def step(self, time_step, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state[:], action[:], reward, next_state[:], done)

        if time_step % self.n_time_steps != 0:
            return

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:

            # Train the network for a number of epochs specified by the parameter
            for i in range(self.n_learn_updates):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = np.expand_dims(state, axis=0)
        action = self._act_tf(tf.constant(state))
        action = action.numpy()[0]

        if add_noise:
            action += self.noise.sample()

        action = action.clip(-1, 1)

        return action

    @tf.function
    def _act_tf(self, state):
        return self.actor_local.model(state)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences : tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        self._learn_tf(experiences, tf.constant(gamma, dtype=tf.float64))

    @tf.function
    def _learn_tf(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        with tf.GradientTape() as tape:
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target.model(next_states)
            Q_targets_next = self.critic_target.model(
                [next_states, actions_next])
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
            # Compute critic loss
            Q_expected = self.critic_local.model([states, actions])
            critic_loss = MSE(Q_expected, Q_targets)

        # Minimize the loss
        critic_grad = tape.gradient(
            critic_loss, self.critic_local.model.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic_local.model.trainable_variables))

        # ---------------------------- update actor ---------------------------- #
        with tf.GradientTape() as tape:
            # Compute actor loss
            actions_pred = self.actor_local.model(states)
            actor_loss = -tf.reduce_mean(
                self.critic_local.model([states, actions_pred]))

        # Minimize the loss
        actor_grad = tape.gradient(actor_loss,
                                   self.actor_local.model.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor_local.model.trainable_variables))

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local.model, self.critic_target.model,
                         self.tau)
        self.soft_update(self.actor_local.model, self.actor_target.model,
                         self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: TF2 model
            target_model: TF2 model
            tau (float): interpolation parameter 
        """
        for target_var, local_var in zip(target_model.weights,
                                         local_model.weights):
            target_var.assign(tau * local_var + (1.0 - tau) * target_var)
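
None of the examples show how an agent is actually driven. Below is a rough training-loop sketch for the Example #5 agent, assuming the classic Gym API (reset() returns an observation, step() returns four values); the environment name and hyperparameter values are illustrative, and rescaling the [-1, 1] action to the environment's range is omitted:

import gym

env = gym.make("Pendulum-v1")  # any continuous-control task
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]

agent = DDPG(state_size, action_size,
             actor_lr=1e-4, critic_lr=1e-3, random_seed=0,
             mu=0.0, theta=0.15, sigma=0.2,
             buffer_size=int(1e5), batch_size=64,
             gamma=0.99, tau=1e-3,
             n_time_steps=1, n_learn_updates=1, device="cpu")

for episode in range(200):
    state = env.reset()
    agent.reset()
    episode_reward = 0.0
    for t in range(1, 1001):
        action = agent.act(state, add_noise=True)
        next_state, reward, done, _ = env.step(action)
        agent.step(t, state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward
        if done:
            break
    print(f"Episode {episode}: reward {episode_reward:.1f}")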
Example #6
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# `device` (a torch.device) and the OUNoise helper are assumed to be defined elsewhere in the module.


class DDPGAgent():
    """Single DDPG Agent with basic functionality."""
    def __init__(self,
                 agent_id,
                 model,
                 action_size=2,
                 seed=42,
                 tau=1e-3,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 weight_decay=0.0):
        """Initialize parameters and build single DDPG Agent.
        Params
        ======
            agent_id (int):       ID of the agent
            model (object):       model object
            action_size (int):    dimension of each action
            seed (int):           random seed
            tau (float):          param for soft update of target parameters
            lr_actor (float):     learning rate for actor
            lr_critic (float):    learning rate for critic
            weight_decay (float): L2 weight decay
        """
        random.seed(seed)

        self.id = agent_id
        self.action_size = action_size
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic

        # Actor Network
        self.actor_local = model.actor_local
        self.actor_target = model.actor_target
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network
        self.critic_local = model.critic_local
        self.critic_target = model.critic_target
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Initialize target network weights from the local networks (actor and critic)
        self.hard_copy_init(self.actor_target, self.actor_local)
        self.hard_copy_init(self.critic_target, self.critic_local)

        self.noise = OUNoise(action_size, seed)

    def act(self, state, noise_weight=1.0, add_noise=True):
        """Return actions for given state as per current policy.
        Params
        ======
            state (array):        current state per agent
            noise_weight (float): decay coefficient for action noise
            add_noise (bool):     flag to add noise to actions
        """

        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            self.noise_val = self.noise.sample() * noise_weight
            action += self.noise_val
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, agent_id, experiences, gamma, all_next_actions,
              all_actions):
        """Update policy and value parameters using given batch of experience tuples.
        Params
        ======
            agent_id (int):                    ID of an agent               
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float):                     discount factor
            all_next_actions (list):           next action of each agent, computed by its target actor
            all_actions (list):                current action of each agent, computed by its local actor
        """

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # get predicted next-state actions and Q values from target models
        self.critic_optimizer.zero_grad()

        agent_id = torch.tensor([agent_id]).to(device)
        actions_next = torch.cat(all_next_actions, dim=1).to(device)

        with torch.no_grad():
            q_targets_next = self.critic_target(next_states, actions_next)
        # q_targets = reward of this timestep + discount * Q(st+1,at+1) from target network
        q_targets = rewards.index_select(
            1, agent_id) + (gamma * q_targets_next *
                            (1 - dones.index_select(1, agent_id)))

        # compute expected Q values from the local critic
        q_expected = self.critic_local(states, actions)

        # compute critic loss
        critic_loss = F.mse_loss(q_expected, q_targets.detach())

        # minimize loss
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # compute actor loss
        self.actor_optimizer.zero_grad()

        # detach actions from other agents
        actions_pred = [
            actions if i == self.id else actions.detach()
            for i, actions in enumerate(all_actions)
        ]
        actions_pred = torch.cat(actions_pred, dim=1).to(device)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # minimize loss
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_copy_init(self, target, source):
        """
        Init network parameters from source to target
        Inputs:
            target (torch.nn.Module): Net to copy parameters to
            source (torch.nn.Module): Net whose parameters to copy
        """
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
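
The learn signature above (an agent_id plus the actions of every agent) implies this DDPGAgent is meant to be driven by a multi-agent, MADDPG-style wrapper. Below is a hedged sketch of that coordination step, assuming a batch layout where states/actions/next_states are joint tensors, rewards/dones have one column per agent, and per-agent observations are supplied separately:

def maddpg_learn_step(agents, samples, gamma):
    """Illustrative MADDPG-style update driving DDPGAgent.learn().

    Assumed sample layout (not defined by the example itself):
      states, actions, next_states : joint tensors, shape (batch, n_agents * dim)
      rewards, dones               : tensors of shape (batch, n_agents)
      obs, next_obs                : lists with one (batch, obs_dim) tensor per agent
    """
    states, actions, rewards, next_states, dones, obs, next_obs = samples

    # Each agent's target actor proposes its next action from its own observation.
    all_next_actions = [agent.actor_target(next_obs[i])
                        for i, agent in enumerate(agents)]
    # Each agent's local actor re-evaluates its current action for the policy update.
    all_actions = [agent.actor_local(obs[i])
                   for i, agent in enumerate(agents)]

    experiences = (states, actions, rewards, next_states, dones)
    for i, agent in enumerate(agents):
        agent.learn(i, experiences, gamma, all_next_actions, all_actions)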