def __init__(self, state_size, action_size, actor_lr, critic_lr,
                 random_seed, mu, theta, sigma, buffer_size, batch_size,
                 epsilon_start, epsilon_min, epsilon_decay, gamma, tau,
                 n_time_steps, n_learn_updates, device):

        self.state_size = state_size
        self.action_size = action_size

        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, name="Actor_local")
        self.actor_target = Actor(state_size, action_size, name="Actor_target")
        self.actor_optimizer = Adam(learning_rate=self.actor_lr)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size,
                                   action_size,
                                   name="Critic_local")
        self.critic_target = Critic(state_size,
                                    action_size,
                                    name="Critic_target")
        self.critic_optimizer = Adam(learning_rate=self.critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.noise = OUNoise(action_size, random_seed, mu, theta, sigma)
        self.epsilon = epsilon_start
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

        # Replay memory
        self.batch_size = int(batch_size)
        self.buffer_size = int(buffer_size)
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size,
                                   random_seed)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau  # for soft update of target parameters
        self.n_time_steps = n_time_steps  # number of time steps before updating network parameters
        self.n_learn_updates = n_learn_updates  # number of updates per learning step

        # Device
        self.device = device

        # Clear Keras' global state (name counters, cached graphs); usually called before building new models
        tf.keras.backend.clear_session()
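
The constructors above and below rely on an OUNoise helper that none of the examples define. Below is a minimal sketch of a typical Ornstein-Uhlenbeck noise process, assuming the (size, seed, mu, theta, sigma) signature used in the example above (later examples drop the seed or the distribution parameters):

import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.array(
            [random.gauss(0, 1) for _ in range(len(self.state))])
        self.state = self.state + dx
        return self.state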
Example #4
import numpy as np

# Actor, Critic, OUNoise and ReplayBuffer are project-local helper classes (not shown here).


class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        
        self.actor_lr = 0.0001
        self.critic_lr = 0.001

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_lr)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_lr)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, self.critic_lr)
        self.critic_target = Critic(self.state_size, self.action_size, self.critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters
        
        # Score tracker and learning parameters
        self.best_score = -np.inf
        self.score = 0

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        
        self.total_reward += reward
        self.count += 1
        
        if done:
            # Average reward per time step over the finished episode
            self.score = self.total_reward / float(self.count) if self.count else 0.0
            if self.score > self.best_score:
                self.best_score = self.score

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local); the trailing 0/1 is the Keras backend
        # learning-phase flag (0 = inference, 1 = training)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)   

    def soft_update(self, local_model, target_model):
        """Soft update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target."""
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        # Interpolate weight tensor by weight tensor; avoids building a ragged NumPy object array
        new_weights = [self.tau * lw + (1.0 - self.tau) * tw
                       for lw, tw in zip(local_weights, target_weights)]
        target_model.set_weights(new_weights)
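
Example #4's step and learn assume a ReplayBuffer whose sample() returns a list of experience namedtuples with fields state, action, reward, next_state and done. Below is a minimal sketch under that assumption (Example #5 would instead need a buffer whose sample() returns pre-batched arrays or tensors):

import random
from collections import deque, namedtuple


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples."""

    def __init__(self, buffer_size, batch_size, seed=None):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(
            self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        """Return the current number of stored experiences."""
        return len(self.memory)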
Example #5
import numpy as np
import tensorflow as tf
from tensorflow.keras.losses import MSE
from tensorflow.keras.optimizers import Adam

# Actor, Critic, OUNoise and ReplayBuffer are project-local helper classes (not shown here).


class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, state_size, action_size, actor_lr, critic_lr,
                 random_seed, mu, theta, sigma, buffer_size, batch_size, gamma,
                 tau, n_time_steps, n_learn_updates, device):

        self.state_size = state_size
        self.action_size = action_size

        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, name="Actor_local")
        self.actor_target = Actor(state_size, action_size, name="Actor_target")
        self.actor_optimizer = Adam(learning_rate=self.actor_lr)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size,
                                   action_size,
                                   name="Critic_local")
        self.critic_target = Critic(state_size,
                                    action_size,
                                    name="Critic_target")
        self.critic_optimizer = Adam(learning_rate=self.critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.noise = OUNoise(self.action_size, random_seed, mu, theta, sigma)

        # Replay memory
        self.batch_size = int(batch_size)
        self.buffer_size = int(buffer_size)
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size,
                                   random_seed)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau  # for soft update of target parameters
        self.n_time_steps = n_time_steps  # number of time steps before updating network parameters
        self.n_learn_updates = n_learn_updates  # number of updates per learning step

        # Device
        self.device = device

    def reset(self):
        """Reset the agent."""
        self.noise.reset()

    def step(self, time_step, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state[:], action[:], reward, next_state[:], done)

        if time_step % self.n_time_steps != 0:
            return

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:

            # Train the network for a number of epochs specified by the parameter
            for i in range(self.n_learn_updates):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = np.expand_dims(state, axis=0)
        action = self._act_tf(tf.constant(state))
        action = action.numpy()[0]

        if add_noise:
            action += self.noise.sample()

        action = action.clip(-1, 1)

        return action

    @tf.function
    def _act_tf(self, state):
        return self.actor_local.model(state)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences : tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        self._learn_tf(experiences, tf.constant(gamma, dtype=tf.float64))

    @tf.function
    def _learn_tf(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        with tf.GradientTape() as tape:
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target.model(next_states)
            Q_targets_next = self.critic_target.model(
                [next_states, actions_next])
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
            # Compute critic loss
            Q_expected = self.critic_local.model([states, actions])
            critic_loss = MSE(Q_expected, Q_targets)

        # Minimize the loss
        critic_grad = tape.gradient(
            critic_loss, self.critic_local.model.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic_local.model.trainable_variables))

        # ---------------------------- update actor ---------------------------- #
        with tf.GradientTape() as tape:
            # Compute actor loss
            actions_pred = self.actor_local.model(states)
            actor_loss = -tf.reduce_mean(
                self.critic_local.model([states, actions_pred]))

        # Minimize the loss
        actor_grad = tape.gradient(actor_loss,
                                   self.actor_local.model.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor_local.model.trainable_variables))

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local.model, self.critic_target.model,
                         self.tau)
        self.soft_update(self.actor_local.model, self.actor_target.model,
                         self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: TF2 model
            target_model: TF2 model
            tau (float): interpolation parameter 
        """
        for target_var, local_var in zip(target_model.weights,
                                         local_model.weights):
            target_var.assign(tau * local_var + (1.0 - tau) * target_var)
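
None of the examples show how an agent is actually driven. Below is a rough training-loop sketch for the Example #5 agent, assuming the classic Gym API (reset() returns an observation, step() returns four values); the environment name and hyperparameter values are illustrative, and rescaling the [-1, 1] action to the environment's range is omitted:

import gym

env = gym.make("Pendulum-v1")  # any continuous-control task
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]

agent = DDPG(state_size, action_size,
             actor_lr=1e-4, critic_lr=1e-3, random_seed=0,
             mu=0.0, theta=0.15, sigma=0.2,
             buffer_size=int(1e5), batch_size=64,
             gamma=0.99, tau=1e-3,
             n_time_steps=1, n_learn_updates=1, device="cpu")

for episode in range(200):
    state = env.reset()
    agent.reset()
    episode_reward = 0.0
    for t in range(1, 1001):
        action = agent.act(state, add_noise=True)
        next_state, reward, done, _ = env.step(action)
        agent.step(t, state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward
        if done:
            break
    print(f"Episode {episode}: reward {episode_reward:.1f}")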
Example #6
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# `device` (a torch.device) and the OUNoise helper are assumed to be defined elsewhere in the module.


class DDPGAgent():
    """Single DDPG Agent with basic functionality."""
    def __init__(self,
                 agent_id,
                 model,
                 action_size=2,
                 seed=42,
                 tau=1e-3,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 weight_decay=0.0):
        """Initialize parameters and build single DDPG Agent.
        Params
        ======
            agent_id (int):       ID of the agent
            model (object):       model object
            action_size (int):    dimension of each action
            seed (int):           random seed
            tau (float):          param for soft update of target parameters
            lr_actor (float):     learning rate for actor
            lr_critic (float):    learning rate for critic
            weight_decay (float): L2 weight decay
        """
        random.seed(seed)

        self.id = agent_id
        self.action_size = action_size
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic

        # Actor Network
        self.actor_local = model.actor_local
        self.actor_target = model.actor_target
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network
        self.critic_local = model.critic_local
        self.critic_target = model.critic_target
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Initialize target network weights from the local networks (actor and critic)
        self.hard_copy_init(self.actor_target, self.actor_local)
        self.hard_copy_init(self.critic_target, self.critic_local)

        self.noise = OUNoise(action_size, seed)

    def act(self, state, noise_weight=1.0, add_noise=True):
        """Return actions for given state as per current policy.
        Params
        ======
            state (array):        current state per agent
            noise_weight (float): decay coefficient for action noise
            add_noise (bool):     flag to add noise to actions
        """

        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            self.noise_val = self.noise.sample() * noise_weight
            action += self.noise_val
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, agent_id, experiences, gamma, all_next_actions,
              all_actions):
        """Update policy and value parameters using given batch of experience tuples.
        Params
        ======
            agent_id (int):                    ID of an agent               
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float):                     discount factor
            all_next_actions (list):           next action of each agent, computed by its target actor
            all_actions (list):                current action of each agent, computed by its local actor
        """

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # get predicted next-state actions and Q values from target models
        self.critic_optimizer.zero_grad()

        agent_id = torch.tensor([agent_id]).to(device)
        actions_next = torch.cat(all_next_actions, dim=1).to(device)

        with torch.no_grad():
            q_targets_next = self.critic_target(next_states, actions_next)
        # q_targets = reward of this timestep + discount * Q(st+1,at+1) from target network
        q_targets = rewards.index_select(
            1, agent_id) + (gamma * q_targets_next *
                            (1 - dones.index_select(1, agent_id)))

        # compute expected Q values from the local critic
        q_expected = self.critic_local(states, actions)

        # compute critic loss
        critic_loss = F.mse_loss(q_expected, q_targets.detach())

        # minimize loss
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # compute actor loss
        self.actor_optimizer.zero_grad()

        # detach actions from other agents
        actions_pred = [
            actions if i == self.id else actions.detach()
            for i, actions in enumerate(all_actions)
        ]
        actions_pred = torch.cat(actions_pred, dim=1).to(device)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # minimize loss
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_copy_init(self, target, source):
        """
        Init network parameters from source to target
        Inputs:
            target (torch.nn.Module): Net to copy parameters to
            source (torch.nn.Module): Net whose parameters to copy
        """
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
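
The learn signature above (an agent_id plus the actions of every agent) implies this DDPGAgent is meant to be driven by a multi-agent, MADDPG-style wrapper. Below is a hedged sketch of that coordination step, assuming a batch layout where states/actions/next_states are joint tensors, rewards/dones have one column per agent, and per-agent observations are supplied separately:

def maddpg_learn_step(agents, samples, gamma):
    """Illustrative MADDPG-style update driving DDPGAgent.learn().

    Assumed sample layout (not defined by the example itself):
      states, actions, next_states : joint tensors, shape (batch, n_agents * dim)
      rewards, dones               : tensors of shape (batch, n_agents)
      obs, next_obs                : lists with one (batch, obs_dim) tensor per agent
    """
    states, actions, rewards, next_states, dones, obs, next_obs = samples

    # Each agent's target actor proposes its next action from its own observation.
    all_next_actions = [agent.actor_target(next_obs[i])
                        for i, agent in enumerate(agents)]
    # Each agent's local actor re-evaluates its current action for the policy update.
    all_actions = [agent.actor_local(obs[i])
                   for i, agent in enumerate(agents)]

    experiences = (states, actions, rewards, next_states, dones)
    for i, agent in enumerate(agents):
        agent.learn(i, experiences, gamma, all_next_actions, all_actions)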