Example No. 1
class DDPG():
    def __init__(self, task, sess):
        self.sess = sess
        self.env = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.actor_lr = 0.0001
        self.tau = 0.001
        self.minibatch_size = 64
        self.critic_lr = 0.001
        self.gamma = 0.99
        self.buffer_size = 1000000
        self.random_seed = 1234
        self.summary_dir = "/"
        #self.max_episode = 100
        #self.max_episode_len = 100
        self.mu = 0

        self.actor = ActorNetwork(self.sess, self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_lr, self.tau, self.minibatch_size)

        self.critic = CriticNetwork(self.sess, self.state_size,
                                    self.action_size, self.critic_lr, self.tau,
                                    self.gamma,
                                    self.actor.get_num_trainable_vars())

        # Initialize replay memory
        self.replay_buffer = ReplayBuffer(self.buffer_size, self.random_seed)
        self.sess.run(tf.global_variables_initializer())
        self.actor.update_target_network()
        self.critic.update_target_network()

        self.noise = OUNoise(self.action_size, self.mu)

    def reset_episode(self):
        #self.actor_noise.reset()
        state = self.env.reset()
        self.last_state = state
        self.ep_ave_max_q = 0
        self.ep_reward = 0
        return state

    def step(self, s, a, r, terminal, s2):
        # Save experience / reward
        #self.memory.add(self.last_state, action, reward, next_state, done)
        #summary_ops, summary_vars = self.build_summaries()
        self.replay_buffer.add(np.reshape(s, (self.actor.s_dim, )),
                               np.reshape(a, (self.actor.a_dim, )), r,
                               terminal, np.reshape(s2, (self.actor.s_dim, )))
        # Learn, if enough samples are available in memory
        if self.replay_buffer.size() > self.minibatch_size:

            s_batch, a_batch, r_batch, t_batch, s2_batch = self.replay_buffer.sample_batch(
                self.minibatch_size)
            #self.train(s_batch, a_batch, r_batch, t_batch, s2_batch)
            target_q = self.critic.predict_target(
                s2_batch, self.actor.predict_target(s2_batch))

            y_i = []
            for k in range(self.minibatch_size):
                if t_batch[k]:
                    y_i.append(r_batch[k])
                else:
                    y_i.append(r_batch[k] + self.critic.gamma * target_q[k])

            # Update the critic given the targets
            predicted_q_value, _ = self.critic.train(
                s_batch, a_batch, np.reshape(y_i, (self.minibatch_size, 1)))

            #self.ep_ave_max_q += np.amax(predicted_q_value)

            # Update the actor policy using the sampled gradient
            a_outs = self.actor.predict(s_batch)
            grads = self.critic.action_gradients(s_batch, a_outs)
            self.actor.train(s_batch, grads[0])

            # Update target networks
            self.actor.update_target_network()
            self.critic.update_target_network()

        # Roll over last state and action
        self.last_state = s2
        '''
        self.ep_reward +=r
        
        if terminal:
            
            summary_str = self.sess.run(
            , feed_dict={summary_vars[0]: self.ep_reward, summary_vars[1]: self.ep_ave_max_q / float(j)})

            writer.add_summary(summary_str, i)
            #writer.flush()
            
            print('| Reward: {:d} |Qmax: {:.4f}'.format(int(self.ep_reward), \
                             (self.ep_ave_max_q / float(j))))
             '''

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])

        actions = self.actor.predict(states)[0]
        #actornoises = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.action_size))
        #print(actions)

        return actions + self.noise.sample()  # add some noise for exploration

    def train(self, s_batch, a_batch, r_batch, t_batch, s2_batch):

        target_q = self.critic.predict_target(
            s2_batch, self.actor.predict_target(s2_batch))

        y_i = []
        for k in range(self.minibatch_size):
            if t_batch[k]:
                y_i.append(r_batch[k])
            else:
                y_i.append(r_batch[k] + self.critic.gamma * target_q[k])

        # Update the critic given the targets
        predicted_q_value, _ = self.critic.train(
            s_batch, a_batch, np.reshape(y_i, (self.minibatch_size, 1)))

        #self.ep_ave_max_q += np.amax(predicted_q_value)

        # Update the actor policy using the sampled gradient
        a_outs = self.actor.predict(s_batch)
        grads = self.critic.action_gradients(s_batch, a_outs)
        self.actor.train(s_batch, grads[0])

        # Update target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

    def build_summaries(self):
        episode_reward = tf.Variable(0.)
        tf.summary.scalar("Reward", episode_reward)
        episode_ave_max_q = tf.Variable(0.)
        tf.summary.scalar("Qmax Value", episode_ave_max_q)

        summary_vars = [episode_reward, episode_ave_max_q]
        summary_ops = tf.summary.merge_all()

        return summary_ops, summary_vars
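
The TensorFlow-based agent above relies on a ReplayBuffer with an add(s, a, r, terminal, s2) / size() / sample_batch(batch_size) interface that is not part of the listing. A minimal sketch of such a buffer, assuming plain uniform sampling (an illustration, not the original implementation):

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """Fixed-size buffer of (s, a, r, terminal, s2) tuples with uniform sampling."""
    def __init__(self, buffer_size, random_seed=1234):
        self.buffer = deque(maxlen=buffer_size)
        random.seed(random_seed)

    def add(self, s, a, r, terminal, s2):
        self.buffer.append((s, a, r, terminal, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        s_batch = np.array([e[0] for e in batch])
        a_batch = np.array([e[1] for e in batch])
        r_batch = np.array([e[2] for e in batch])
        t_batch = np.array([e[3] for e in batch])
        s2_batch = np.array([e[4] for e in batch])
        return s_batch, a_batch, r_batch, t_batch, s2_batch
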
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        self.score = 0
        self.best_score = -np.inf
        self.noise_scale = 0.1

    def reset_episode(self):
        self.noise.reset()
        self.total_reward = 0.0
        self.count = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward
        self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        self.score = self.total_reward / float(
            self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score
            self.noise_scale = max(0.5 * self.noise_scale, 0.01)
        else:
            self.noise_scale = min(2.0 * self.noise_scale, 3.2)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
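
A typical driver for the Keras-based agent above wires reset_episode, act, and step into an episode loop. A rough sketch, assuming the task object exposes reset() and step(action) -> (next_state, reward, done); the loop and episode count are illustrative, not part of the original example:

num_episodes = 500
agent = DDPG(task)

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)   # assumed task API
        agent.step(action, reward, next_state, done)
        state = next_state
    print("Episode {:4d} | avg reward: {:7.3f} | best: {:7.3f}".format(
        i_episode, agent.score, agent.best_score))
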
Example No. 3
    # Noise and action-space parameters (same values as in the second test block below)
    exploration_mu = 0
    exploration_theta = 0.15
    exploration_sigma = 0.2
    action_low = np.array([1, 0, 1])
    action_high = np.array([10, 359, 2000])
    action_range = action_high - action_low

    # Start with a random action
    action = np.array([np.random.uniform() for _ in action_low])
    noise = OUNoise(action.shape[0], exploration_mu, exploration_theta,
                    exploration_sigma)

    time_limit = 10
    for i in range(10):
        start_time = time.time()
        env.reset()
        done = False
        j = 0
        while not done:
            j += 1
            ns = noise.sample()
            action = action + ns

            v_size, angle, speed = np.array(transform_action(
                action, action_range, action_low),
                                            dtype='uint8')
            ns, rw, done = env.step((v_size, angle, speed))
            if time.time() - start_time > time_limit:
                print("Time limit, will reset")
                done = True
                #env.reset()

            #cv2.imshow('frame', frame[28: 112, :])
            if cv2.waitKey(25) & 0xFF == ord('q'):
                cv2.destroyAllWindows()
                #asb.stop()
    #h, w = buff.get_device_screen_shape()
    exploration_mu = 0
    exploration_theta = 0.15
    exploration_sigma = 0.2
    action_size = 3
    action_low = np.array([1, 0, 1])
    action_high = np.array([10, 359, 2000])
    action_range = action_high - action_low

    action = np.array([np.random.uniform() for _ in action_low])
    noise = OUNoise(action.shape[0], exploration_mu, exploration_theta,
                    exploration_sigma)
    values = np.zeros((100, 3))
    iis = [i for i in range(100)]
    for i in iis:
        action = action + noise.sample()
        action = np.array(transform_action(action, action_range, action_low),
                          dtype='uint8')
        values[i] = action

    print(values.shape)

    fig, ax = plt.subplots(3, sharex='col', sharey='row')
    labels = ['Size', 'Angle', 'Duration']
    for i in range(3):
        ax[i].plot(iis, values[:, i])
        ax[i].set_xlabel(labels[i])

    plt.grid()
    plt.show()
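
Example No. 3 samples and plots an Ornstein-Uhlenbeck process, but the OUNoise class itself is not included in the listing. A minimal sketch consistent with the OUNoise(size, mu, theta, sigma) constructor used above (other examples in this listing pass a random seed instead of explicit theta/sigma, but the process is the same):

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated, mean-reverting noise."""
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process by one step and return the new state as a sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state
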
Example No. 5
class Agent:
    """Interacts and learns from the environment"""
    def __init__(self,
                 device,
                 state_size,
                 action_size,
                 random_seed,
                 fc1=128,
                 fc2=128,
                 lr_actor=1e-04,
                 lr_critic=1e-04,
                 weight_decay=0,
                 buffer_size=100000,
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3):
        """
        Parameters
        ----------
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            fc1 (int): 1st fully connected layer size for model (actor & critic)
            fc2 (int): 2nd fully connected layer size for model (actor & critic)
            device: CPU/GPU

            lr_actor (float): learning rate for Actor
            lr_critic (float): learning rate for Critic
            weight_decay (float): weight decay used in model optimizer
            buffer_size (int): replay buffer size
            batch_size (int): batch size to sample from buffer
            gamma (float): parameter used to calculate Q target
            tau (float): soft update interpolation parameter
        """
        self.device = device

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

        # Actor network (with target)
        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 random_seed,
                                 fc1_units=fc1,
                                 fc2_units=fc2).to(device)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  random_seed,
                                  fc1_units=fc1,
                                  fc2_units=fc2).to(device)
        self.actor_optimizer = optim.Adam(params=self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic network (with target)
        self.critic_local = Critic(self.state_size,
                                   self.action_size,
                                   random_seed,
                                   fc1_units=fc1,
                                   fc2_units=fc2).to(device)
        self.critic_target = Critic(self.state_size,
                                    self.action_size,
                                    random_seed,
                                    fc1_units=fc1,
                                    fc2_units=fc2).to(device)
        self.critic_optimizer = optim.Adam(
            params=self.critic_local.parameters(),
            lr=lr_critic,
            weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   self.device, random_seed)

        self.make_copy(self.critic_local, self.critic_target)
        self.make_copy(self.actor_local, self.actor_target)

        print("Initilized agent with state size = {} and action size = {}".
              format(self.state_size, self.action_size))

    def make_copy(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def step(self, state, action, reward, next_state, done):
        """
        Save experience in replay memory, and use random sample from buffer to learn
        """

        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.batch_size:
            batch = self.memory.sample()
            self.learn(batch)

    def act(self, state, add_noise=True):
        """
        Returns actions for given state as per current policy.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()

        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, batch):
        """
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Parameters
        ----------
            batch (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = batch

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_target_next = self.critic_target(next_states, actions_next)
        # compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_target_next * (1.0 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimise loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimise loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Parameters
        ----------
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
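
A rough driver loop for the PyTorch agent of Example No. 5, assuming a classic Gym-style environment (env, its observation/action spaces, and the episode limits below are assumptions for illustration):

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent = Agent(device,
              state_size=env.observation_space.shape[0],   # env is assumed to follow the
              action_size=env.action_space.shape[0],       # classic Gym API
              random_seed=0)

for i_episode in range(1, 1001):
    state = env.reset()
    agent.reset()
    score = 0.0
    for t in range(1000):
        action = agent.act(state).squeeze(0)               # drop the batch dimension
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
        if done:
            break
    print("Episode {} | score: {:.2f}".format(i_episode, score))
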
Example No. 6
class DDPG():
    """DDPG agent with own actor and critic."""
    def __init__(self, agent_id, model, action_size=2, seed=0):
        """Initialize an Agent object.
        """
        self.seed = random.seed(seed)
        self.id = agent_id
        self.action_size = action_size

        # Actor Network
        self.actor_local = model.actor_local
        self.actor_target = model.actor_target
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network
        self.critic_local = model.critic_local
        self.critic_target = model.critic_target
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Initialize the target actor and critic with the same weights as the local networks
        self.hard_copy_weights(self.actor_target, self.actor_local)
        self.hard_copy_weights(self.critic_target, self.critic_local)

        # Noise process
        self.noise = OUNoise(action_size, seed)

    def hard_copy_weights(self, target, source):
        """ copy weights from source to target network (part of initialization)"""
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)

    def act(self, state, noise_weight=1.0, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            self.noise_val = self.noise.sample() * noise_weight
            action += self.noise_val
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, agent_id, experiences, gamma, all_next_actions,
              all_actions):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # get predicted next-state actions and Q values from target models
        self.critic_optimizer.zero_grad()
        agent_id = torch.tensor([agent_id]).to(device)
        actions_next = torch.cat(all_next_actions, dim=1).to(device)
        with torch.no_grad():
            q_targets_next = self.critic_target(next_states, actions_next)
        q_expected = self.critic_local(states, actions)
        # compute Q targets for current states (y_i):
        # reward of this timestep + discount * Q(s_t+1, a_t+1) from the target network
        q_targets = rewards.index_select(
            1, agent_id) + (gamma * q_targets_next *
                            (1 - dones.index_select(1, agent_id)))
        # compute critic loss
        critic_loss = F.mse_loss(q_expected, q_targets.detach())
        # minimize loss
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # compute actor loss
        self.actor_optimizer.zero_grad()
        # detach actions from other agents
        actions_pred = [
            actions if i == self.id else actions.detach()
            for i, actions in enumerate(all_actions)
        ]
        actions_pred = torch.cat(actions_pred, dim=1).to(device)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # minimize loss
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example No. 7
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 device,
                 gamma=GAMMA,
                 tau=TAU,
                 lr_actor=LR_ACTOR,
                 lr_critic=LR_CRITIC,
                 random_seed=0):
        """
            Initialize an Agent object.
        :param state_size: size of state
        :param action_size: size of action
        :param num_agents: number of agents
        :param gamma: discount factor
        :param tau: factor for soft update of target parameters
        :param lr_actor: Learning rate of actor
        :param lr_critic: Learning rate of critic
        :param random_seed: Random seed
        :param device: cuda or cpu
        """

        self.device = device
        self.gamma = gamma
        self.tau = tau

        self.num_agents = num_agents

        self.state_size = state_size
        self.action_size = action_size
        self.full_state_size = state_size * num_agents
        self.full_action_size = action_size * num_agents
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, device,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, device,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.full_state_size,
                                   self.full_action_size,
                                   device=device,
                                   random_seed=random_seed).to(device)
        self.critic_target = Critic(self.full_state_size,
                                    self.full_action_size,
                                    device=device,
                                    random_seed=random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=0)

        self.noise = OUNoise(action_size, random_seed)

    def save_model(self, agent_number):
        torch.save(self.actor_local.state_dict(),
                   f'models/checkpoint_actor_{agent_number}.pth')
        torch.save(self.critic_local.state_dict(),
                   f'models/checkpoint_critic_{agent_number}.pth')

    def load_model(self, agent_number):
        checkpoint = torch.load(f'models/checkpoint_actor_{agent_number}.pth',
                                map_location=torch.device('cpu'))
        self.actor_local.load_state_dict(checkpoint)

        checkpoint = torch.load(f'models/checkpoint_critic_{agent_number}.pth',
                                map_location=torch.device('cpu'))
        self.critic_local.load_state_dict(checkpoint)

    def act(self, state, noise=0., train=False):
        """Returns actions for given state as per current policy.
        :param state: state as seen from single agent
        """

        if train is True:
            self.actor_local.train()
        else:
            self.actor_local.eval()

        action = self.actor_local(state)
        if noise > 0:
            noise = torch.tensor(noise * self.noise.sample(),
                                 dtype=state.dtype,
                                 device=state.device)
        return action + noise

    def target_act(self, state, noise=0.):
        # move the target action to the CPU, since the noise is generated on the CPU
        self.actor_target.eval()
        action = self.actor_target(state).cpu()
        if noise > 0.:
            noise = torch.tensor(noise * self.noise.sample(),
                                 dtype=state.dtype,
                                 device=state.device)
        return action + noise

    def update_critic(self, rewards, dones, all_states, all_actions,
                      all_next_states, all_next_actions):
        with torch.no_grad():
            Q_targets_next = self.critic_target(all_next_states,
                                                all_next_actions)
        # Compute Q targets for current states (y_i)
        q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        q_expected = self.critic_local(all_states, all_actions)
        # critic_loss = F.mse_loss(q_expected, q_targets)
        critic_loss = ((q_expected - q_targets.detach())**2).mean()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

    def update_actor(self, all_states, all_predicted_actions):
        """Update actor network

        :param all_states: all states
        :param all_predicted_actions: all predicted actions
        """
        actor_loss = -self.critic_local(all_states,
                                        all_predicted_actions).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor_optimizer.step()

    def update_targets(self):
        self.soft_update(self.actor_local, self.actor_target, self.tau)
        self.soft_update(self.critic_local, self.critic_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def reset(self):
        self.noise.reset()
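
The save_model / load_model helpers above write one checkpoint per agent under a models/ directory. A short usage sketch (the directory handling and the agents list are assumptions):

import os

os.makedirs('models', exist_ok=True)            # save_model expects this folder to exist
for agent_number, agent in enumerate(agents):   # agents: a list of the Agent instances above
    agent.save_model(agent_number)

# ... later, e.g. when evaluating on a CPU-only machine ...
for agent_number, agent in enumerate(agents):
    agent.load_model(agent_number)
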
Example No. 8
class Agent():
    """ DDPG Agent, interacts with environment and learns from environment """
    def __init__(self, device, state_size, n_agents, action_size, random_seed, \
                         buffer_size, batch_size, gamma, TAU, lr_actor, lr_critic, weight_decay,  \
                         learn_interval, learn_num, ou_sigma, ou_theta, checkpoint_folder = './'):

        # Set Computational device
        self.DEVICE = device

        # Init State, action and agent dimensions
        self.state_size = state_size
        self.n_agents = n_agents
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.l_step = 0
        self.log_interval = 200

        # Init Hyperparameters
        self.BUFFER_SIZE = buffer_size
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.TAU = TAU
        self.LR_ACTOR = lr_actor
        self.LR_CRITIC = lr_critic
        self.WEIGHT_DECAY = weight_decay
        self.LEARN_INTERVAL = learn_interval
        self.LEARN_NUM = learn_num

        # Init Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Init Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Init Noise Process
        self.noise = OUNoise((n_agents, action_size),
                             random_seed,
                             mu=0.,
                             theta=ou_theta,
                             sigma=ou_sigma)

        # Init Replay Memory
        self.memory = ReplayBuffer(device, action_size, buffer_size,
                                   batch_size, random_seed)

    # think
    def act(self, states, add_noise=True):
        """ Decide what action to take next """

        # evaluate state through actor_local
        states = torch.from_numpy(states).float().to(self.DEVICE)
        actions = np.zeros((self.n_agents, self.action_size))

        self.actor_local.eval()  # put actor_local network in "evaluation" mode
        with torch.no_grad():
            for n, state in enumerate(states):
                actions[n, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()  # put actor_local back into "training" mode

        # add noise for better performance
        if add_noise:
            actions += self.noise.sample()

        return np.clip(actions, -1, 1)

    # embody
    def step(self, t, s, a, r, s_, done):
        """ Commit step into the brain """

        # Save each agent's SARS' tuple (state-action-reward-next_state-done)
        # to the replay buffer
        for n in range(self.n_agents):
            self.memory.add(s[n], a[n], r[n], s_[n], done[n])

        if t % self.LEARN_INTERVAL != 0:
            return

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.BATCH_SIZE:
            # print ("going to learn 10 times")
            for _ in range(self.LEARN_NUM):
                experiences = self.memory.sample()  # get a memory sample
                self.learn(experiences, self.GAMMA)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """ Learn from experiences, with discount factor gamma
        
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        
        Params:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # ------ Update Critic ------ #

        # get predicted next-state actions and Q values from target networks
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        #         torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ------ Update Actor ------ #

        # compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ------ Update Target Networks ------ #
        self.soft_update(self.critic_local, self.critic_target, self.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.TAU)

        # keep count of steps taken
        # self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
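
The agent above expects stacked per-agent arrays (shape (n_agents, state_size) for states) in act and step. A rough episode loop, assuming an environment that returns stacked observations, rewards, and done flags for all agents (the env API below is an assumption):

import numpy as np

for i_episode in range(1, 301):
    states = env.reset()                                  # assumed shape: (n_agents, state_size)
    agent.reset()
    scores = np.zeros(agent.n_agents)
    for t in range(1000):
        actions = agent.act(states)                       # shape: (n_agents, action_size)
        next_states, rewards, dones = env.step(actions)   # assumed return signature
        agent.step(t, states, actions, rewards, next_states, dones)
        states = next_states
        scores += rewards
        if np.any(dones):
            break
    print("Episode {} | mean score: {:.2f}".format(i_episode, scores.mean()))
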
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 num_all_agents,
                 seed,
                 batch_size,
                 buffer_size=int(1e6),
                 gamma=0.99,
                 tau=1e-3,
                 lr_actor=4e-4,
                 lr_critic=4e-4,
                 weight_decay=0,
                 discrete_actions=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            num_all_agents (int): total number of agents in the environment
            seed (int): random seed
            batch_size (int): minibatch size
            buffer_size (int): replay buffer size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr_actor (float): learning rate of the actor
            lr_critic (float): learning rate of the critic
            weight_decay (float): L2 weight decay
        """
        random.seed(seed)

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.num_all_agents = num_all_agents
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

        self.noise = OUNoise(action_size, seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size,
                                 action_size,
                                 seed,
                                 use_batch_norm_layers=False).to(device)
        self.actor_target = Actor(state_size,
                                  action_size,
                                  seed,
                                  use_batch_norm_layers=False).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        if discrete_actions:
            action_size = 1
        self.critic_local = Critic(state_size * num_all_agents,
                                   action_size * num_all_agents,
                                   seed).to(device)
        self.critic_target = Critic(state_size * num_all_agents,
                                    action_size * num_all_agents,
                                    seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, tau, agent_index):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states_all, actions_all, rewards_all, next_states_all, dones, actions_next_target_all, actions_next_local_all = experiences

        rewards_self = rewards_all[:, agent_index]
        states_self = states_all.view(-1, self.num_all_agents,
                                      self.state_size)[:, agent_index, :]
        del rewards_all

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        Q_targets_next = self.critic_target(next_states_all,
                                            actions_next_target_all)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards_self + (gamma * Q_targets_next) * (1 - dones)
        # Compute critic loss
        Q_expected = self.critic_local(states_all, actions_all)
        critic_loss = F.mse_loss(Q_expected.view(-1, self.batch_size),
                                 Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actor_loss = -self.critic_local(states_all,
                                        actions_next_local_all).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, tau)
        self.soft_update(self.actor_local, self.actor_target, tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
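
The soft update θ_target = τ·θ_local + (1 − τ)·θ_target used throughout these examples moves the target parameters only a small fraction τ toward the local parameters on each call. A tiny standalone check with made-up values:

import torch

tau = 1e-3
theta_local = torch.tensor([1.0, 2.0, 3.0])
theta_target = torch.zeros(3)

theta_target = tau * theta_local + (1.0 - tau) * theta_target
print(theta_target)   # tensor([0.0010, 0.0020, 0.0030]) -- a 0.1% step toward the local weights
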
Example No. 10
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(
        self,
        num_agents,
        state_size,
        action_size,
        buffer_size=int(1e5),
        batch_size=128,
        gamma=0.99,
        tau=1e-3,
        lr_actor=1e-4,
        lr_critic=1e-3,
        weight_decay=0,
        random_seed=2,
    ):
        """Initialize an Agent object.

        Params
        ======
            num_agents (int): number of agents
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(
            action_size=action_size,
            buffer_size=buffer_size,
            batch_size=batch_size,
            seed=random_seed,
        )

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                       1)  # clip gradients at 1
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class DDPGAgent:
    '''
    DDPG Agent implementation
    '''
    def __init__(self, agent_id, state_size, action_size, rand_seed,
                 meta_agent):
        """ Creates a new DDPG Agent """

        self.agent_id = agent_id
        self.action_size = action_size

        # Defines the Actor Networks
        self.actor_local = Actor(state_size, action_size, rand_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  rand_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Defines the Critic Networks
        self.critic_local = Critic(state_size, action_size,
                                   meta_agent.agents_qty, rand_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    meta_agent.agents_qty,
                                    rand_seed).to(device)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=LR_CRITIC)  #, weight_decay=WEIGHT_DECAY)

        self.noise = OUNoise(action_size, rand_seed)

        # Refers to the MA agent memory
        self.memory = meta_agent.memory

        self.t_step = 0

    def step(self):

        # Advance the step counter and learn every UPDATE_EVERY steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        (states_list, actions_list, rewards, next_states_list,
         dones) = experiences

        # Get the target actions for all the states
        l_all_next_actions = []
        for states in states_list:
            l_all_next_actions.append(self.actor_target(states))

        # Convert the experiences into Torch tensors
        all_next_actions = torch.cat(l_all_next_actions, dim=1).to(device)
        all_next_states = torch.cat(next_states_list, dim=1).to(device)
        all_states = torch.cat(states_list, dim=1).to(device)
        all_actions = torch.cat(actions_list, dim=1).to(device)

        Q_targets_next = self.critic_target(all_next_states, all_next_actions)

        # Calculates the Q function using all the next states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # --------------------------- update actor ---------------------------
        actions_pred = []
        for states in states_list:
            actions_pred.append(self.actor_local(states))

        actions_pred = torch.cat(actions_pred, dim=1).to(device)

        actor_loss = -self.critic_local(all_states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ---------------------- update target networks ----------------------
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def act(self, states, add_noise=True):
        """ Returns the actions to take by the agent"""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """ Performs the softupdate """
        iter_params = zip(target_model.parameters(), local_model.parameters())
        for target_param, local_param in iter_params:
            tensor_aux = tau * local_param.data + (1.0 -
                                                   tau) * target_param.data
            target_param.data.copy_(tensor_aux)

class DDPG:
    def __init__(self, task):
        # Hyperparameters
        self.learning_rate_actor = 1e-4
        self.learning_rate_critic = 1e-3
        self.gamma = 0.99
        self.tau = 0.001

        # Define net
        self.sess = tf.Session()
        self.task = task
        self.actor = ActorNet(self.sess, self.task.state_size, self.task.action_size, self.learning_rate_actor, \
                     self.task.action_low, self.task.action_high, self.tau)
        self.critic = CriticNet(self.sess, self.task.state_size, self.task.action_size, self.learning_rate_critic, self.tau)

        # Define noise
        self.mu = 0
        self.theta = 0.15
        self.sigma = 0.20
        self.noise = OUNoise(self.task.action_size, self.mu, self.theta, self.sigma)

        # Define replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = Replay(self.buffer_size, self.batch_size)

        # Score
        self.best_score = -np.inf
        self.best_reward = -np.inf

    def reset(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.total_reward = 0.0
        self.count = 0
        return state

    def learn(self, experience):
        # Turn into different np arrays
        state_batch = np.vstack([e[0] for e in experience])
        action_batch = np.vstack([e[1] for e in experience])
        reward_batch = np.vstack([e[2] for e in experience])
        next_state_batch = np.vstack([e[3] for e in experience])
        done_batch = np.vstack([e[4] for e in experience])

        # Calculate next_state q value
        next_action_batch = self.actor.target_actions(next_state_batch)
        next_q_targets = self.critic.targetQ(next_state_batch, next_action_batch)

        # Train critic net
        q_targets = reward_batch + self.gamma * next_q_targets * (1 - done_batch)
        self.critic.train(state_batch, action_batch, q_targets)

        # Train actor net
        action_gradients = self.critic.gradients(state_batch, action_batch)
        self.actor.train(action_gradients, state_batch)

        # Update target network
        self.actor.update_target(False)
        self.critic.update_target(False)

    def step(self, action, reward, next_state, done):
        self.memory.add([self.last_state, action, reward, next_state, done])
        self.total_reward += reward
        self.count += 1
        if done:
            self.score = self.total_reward / float(self.count) if self.count else 0.0
            self.best_score = max(self.best_score, self.score)
            self.best_reward = max(self.total_reward, self.best_reward)

        if len(self.memory.buffer) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.last_state = next_state

    def act(self, states):
        states = np.reshape(states, [-1, self.task.state_size])
        action = self.actor.actions(states)[0]
        return list(action + self.noise.sample())
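
The bootstrapped target in learn() above, q_targets = r + gamma * Q'(s', mu'(s')) * (1 - done), can be checked with a tiny hand-made batch (the values below are made up purely for illustration):

import numpy as np

gamma = 0.99
reward_batch = np.array([[1.0], [0.5], [2.0]])
next_q_targets = np.array([[10.0], [8.0], [6.0]])
done_batch = np.array([[0], [0], [1]])  # 1 marks a terminal transition

# Terminal transitions contribute only their immediate reward.
q_targets = reward_batch + gamma * next_q_targets * (1 - done_batch)
print(q_targets)  # [[10.9], [8.42], [2.0]]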
class Agent(object):
    """
    The Agent interacts with and learns from the environment.
    """
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 random_seed=0,
                 params=params):
        """
        Initialize an Agent object.
        Params
        ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents
        random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.params = params

        # Actor (Policy) Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(self.params['DEVICE'])
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(self.params['DEVICE'])
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.params['LR_ACTOR'])

        # Critic (Value) Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(self.params['DEVICE'])
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(self.params['DEVICE'])
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.params['LR_CRITIC'],
            weight_decay=self.params['WEIGHT_DECAY'])

        # Initialize target and local to same weights
        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.params['BUFFER_SIZE'],
                                   self.params['BATCH_SIZE'], random_seed)

    def hard_update(self, local_model, target_model):
        """
        Hard update model parameters.
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def step(self, states, actions, rewards, next_states, dones):
        """
        Save experiences in replay memory and use random sample from buffer to learn.
        """

        # Save experience / reward, catering for multiple agents
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn if enough samples are available in memory
        if len(self.memory) > self.params['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.learn(experiences, self.params['GAMMA'])

    def act(self, states, add_noise=True):
        """
        Returns actions for a given state as per current policy.
        """
        states = torch.from_numpy(states).float().to(self.params['DEVICE'])
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                actions[i, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma=params['GAMMA']):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Update Critic (Value)
        # Get predicted next-state actions and Q-Values from target Network
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimise the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(
            self.critic_local.parameters(),
            1)  # Stabilize learning per benchmark guidelines
        self.critic_optimizer.step()

        # Update Actor (Policy)
        # Compute Actor Loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update(self.critic_local,
                         self.critic_target,
                         tau=self.params['TAU'])
        self.soft_update(self.actor_local,
                         self.actor_target,
                         tau=self.params['TAU'])

    def soft_update(self, local_model, target_model, tau=params['TAU']):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
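
The learn() method above reduces to two gradient steps per minibatch. A minimal self-contained sketch of one such update on random data, with single nn.Linear layers standing in for the Actor/Critic networks and a random minibatch standing in for a ReplayBuffer sample (assumptions for illustration, not the repository's code):

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

state_size, action_size, batch = 8, 2, 32
actor_local, actor_target = nn.Linear(state_size, action_size), nn.Linear(state_size, action_size)
critic_local = nn.Linear(state_size + action_size, 1)
critic_target = nn.Linear(state_size + action_size, 1)
actor_opt = optim.Adam(actor_local.parameters(), lr=1e-4)
critic_opt = optim.Adam(critic_local.parameters(), lr=1e-3)
gamma = 0.99

# Fake minibatch standing in for a ReplayBuffer sample.
states, actions = torch.randn(batch, state_size), torch.randn(batch, action_size)
rewards, dones = torch.randn(batch, 1), torch.zeros(batch, 1)
next_states = torch.randn(batch, state_size)

# Critic update: y = r + gamma * Q'(s', mu'(s')) computed on the target networks.
with torch.no_grad():
    actions_next = torch.tanh(actor_target(next_states))
    Q_targets = rewards + gamma * critic_target(torch.cat([next_states, actions_next], 1)) * (1 - dones)
Q_expected = critic_local(torch.cat([states, actions], 1))
critic_loss = F.mse_loss(Q_expected, Q_targets)
critic_opt.zero_grad(); critic_loss.backward(); critic_opt.step()

# Actor update: maximise Q(s, mu(s)) by minimising its negative mean.
actions_pred = torch.tanh(actor_local(states))
actor_loss = -critic_local(torch.cat([states, actions_pred], 1)).mean()
actor_opt.zero_grad(); actor_loss.backward(); actor_opt.step()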
Exemplo n.º 14
0
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.session = K.get_session()
        init = tf.global_variables_initializer()
        self.session.run(init)
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.score = -math.inf
        self.best_score = -math.inf
        self.last_loss = math.inf

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        self.noise_scale = (self.exploration_mu, self.exploration_theta,
                            self.exploration_sigma)
        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 16
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        self.total_reward = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward

        # Learn, if enough samples are available in memory
        print("Memory Size: {}, Batch Size: {}".format(len(self.memory),
                                                       self.batch_size))
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        #state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(np.array([state]))[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        print("Fitting model iteration ...")
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.array([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.array(
            [e.next_state for e in experiences if e is not None])
        print("Next states shape: {}".format(next_states.shape))
        self.score = rewards.mean()
        self.best_score = max(self.score, self.best_score)

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        self.last_loss = np.mean(-action_gradients * actions)

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)


if __name__ == "__main__":
    state_size = (84, 296, 9)
    action_low = np.array([1, 0, 1])
    action_high = np.array([10, 359, 2000])
    net = Actor(state_size, 3, action_low, action_high)
    #net = Critic(state_size, 3)
    net.model.summary()
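
A note on the soft_update() above: wrapping the Keras weight lists in np.array builds a ragged object array (layer kernels and biases have different shapes), which recent NumPy releases reject. A per-layer interpolation is equivalent and avoids that quirk; the weight lists below are hypothetical stand-ins for model.get_weights():

import numpy as np

tau = 0.001
local_weights = [np.ones((2, 3)), np.zeros(3)]    # e.g. a kernel and a bias
target_weights = [np.zeros((2, 3)), np.ones(3)]

new_weights = [tau * lw + (1.0 - tau) * tw
               for lw, tw in zip(local_weights, target_weights)]
# target_model.set_weights(new_weights) would then apply the blended weights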
Exemplo n.º 15
0
class Agent():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_high = task.action_high
        self.action_low = task.action_low

        # actor policy model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_high, self.action_low)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_high, self.action_low)

        # critic value model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.25
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # replay buffer
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # algorithm parameters
        self.gamma = 0.9  # discount rate
        self.tau = 0.1  # soft update parameter

        self.total_reward = 0
        self.count = 0
        self.score = 0
        self.best_score = -np.inf

        self.reset_episode()

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # keep track of rewards
        self.total_reward += reward
        self.count += 1
        # save experience/reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        # if there are enough experiences, learn from them
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.last_state = next_state

    def act(self, states):
        # returns action for a given state(s) as per the current policy
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self, experiences):
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0

        # update the policy and value parameters given batch of experience tuples
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # get predicted next-state actions and Q values from target models
        next_actions = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, next_actions])

        # compute Q targets for current state and train local critic model
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # train local actor model
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom train function

        # soft update target models
        self.soft_update(self.actor_local.model, self.actor_target.model)
        self.soft_update(self.critic_local.model, self.critic_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
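
Both Keras agents above drive exploration with OUNoise(size, mu, theta, sigma). A minimal Ornstein-Uhlenbeck process written here for illustration (not the repository's OUNoise class):

import copy
import numpy as np

class OUNoiseSketch:
    """Ornstein-Uhlenbeck process: dx = theta * (mu - x) + sigma * N(0, 1)."""
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Start each episode back at the long-run mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state

noise = OUNoiseSketch(size=4, mu=0, theta=0.25, sigma=0.3)
print(noise.sample())  # temporally correlated noise, one value per action dimension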
Exemplo n.º 16
0
class Agent:
    def __init__(self, state_size, action_size, random_seed):
        """ Creates a new DDPG agent initilizing the networks """
        self.state_size = state_size
        self.action_size = action_size

        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)

        self.critic = Critic(state_size, action_size, 17).to(device)
        self.critic_target = Critic(state_size, action_size, 17).to(device)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        self.actor = Actor(state_size, action_size, 17).to(device)
        self.actor_target = Actor(state_size, action_size, 17).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=LR_ACTOR)

        self.seed = random.seed(random_seed)
        # Noise process
        self.noise = OUNoise(action_size, random_seed)

    def next_action(self, states, add_noise=True):
        """ Returns the next action to take """
        states = torch.from_numpy(states).float().to(device)

        self.actor.eval()
        with torch.no_grad():
            action_values = self.actor(states).cpu().data.numpy()
        self.actor.train()

        actions = action_values

        if add_noise:
            actions += self.noise.sample()

        actions = np.clip(actions, -1, 1)

        return actions

    def step(self, state, action, reward, next_state, done):
        """ Takes a next step saving the data in the replay buffer and learning new experiences """

        ## Save in experience replay buffer"""

        for s, a, r, ns, d in zip(state, action, reward, next_state, done):
            self.memory.add(s, a, r, ns, d)

        ## If there are enough experiences in the replay buffer
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()

            self._learn(experiences, GAMMA)

    def _learn(self, experiences, gamma):
        """ Learns new experiences """

        state, action, reward, next_state, dones = experiences
        ## Update Critic
        # Get the next action from the target actor
        next_action = self.actor_target(next_state)

        # Bootstrap the Q-target from the target critic
        Q_target_next = self.critic_target(next_state, next_action)

        Q_target = reward + (gamma * Q_target_next * (1 - dones))

        Q_expected = self.critic(state, action)

        critic_loss = F.mse_loss(Q_expected, Q_target)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optimizer.step()

        ## Update Actor
        new_action = self.actor(state)
        actor_loss = -self.critic(state, new_action).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 1)
        self.actor_optimizer.step()

        self.soft_update(self.critic, self.critic_target, TAU)
        self.soft_update(self.actor, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
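
Every agent above also assumes a ReplayBuffer exposing add, sample and len. A minimal stand-in under those assumptions (named ReplayBufferSketch to make clear it is not the original class):

import random
from collections import deque, namedtuple

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])

class ReplayBufferSketch:
    """Fixed-size buffer with uniform random sampling."""
    def __init__(self, buffer_size, batch_size):
        self.buffer = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.buffer.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        # Uniformly sample a minibatch of stored experiences.
        return random.sample(self.buffer, k=self.batch_size)

    def __len__(self):
        return len(self.buffer)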