class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        hard_update(self.actor_target, self.actor_local)
        hard_update(self.critic_target, self.critic_local)


        # Noise process
        self.noise = OUNoise(action_size, random_seed)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)
    
    def target_act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_target.eval()
        with torch.no_grad():
            action = self.actor_target(state).cpu().data.numpy()
        self.actor_target.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)
    

    def reset(self):
        self.noise.reset()
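The hard_update helper and the OUNoise class referenced above (along with device, LR_ACTOR, LR_CRITIC and WEIGHT_DECAY) are defined elsewhere in the source repository. A minimal sketch of the two that carry logic, assuming the conventional Ornstein-Uhlenbeck defaults (mu=0, theta=0.15, sigma=0.2):

import copy
import random

import numpy as np


def hard_update(target, source):
    """Copy every parameter of `source` into `target` (used once, at initialization)."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the long-running mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state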
Example #2
class DDPGAgent():

    def __init__(self, index, config, filenames=None):
        random.seed(config.general.seed)
        np.random.seed(config.general.seed)

        self.noise = OUNoise(config)
        self.index = index
        self.action_size = config.environment.action_size
        self.tau = config.hyperparameters.tau

        self.actor_local = Network(config.actor, config.general.seed)
        self.actor_target = Network(config.actor, config.general.seed)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=config.actor.lr)
        self.critic_local = Network(config.critic, config.general.seed)
        self.critic_target = Network(config.critic, config.general.seed)
        self.critic_optimizer = Adam(self.critic_local.parameters(), lr=config.critic.lr, weight_decay=config.hyperparameters.weight_decay)

    def act(self, state, noise, random):
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(torch.from_numpy(state).float().to(device)).cpu().data.numpy()
        self.actor_local.train()
        if noise is not None:
            action += self.noise.sample() * noise
        if random is not None:
            action = (1 - random) * action + random * (np.random.rand(self.action_size) - 0.5) * 2.0
        return np.clip(action, -1, 1)

    def learn(self, index, experiences, gamma, all_next_actions, all_actions):
        states, actions, rewards, next_states, dones = experiences

        self.critic_optimizer.zero_grad()

        index = torch.tensor([index]).to(device)
        actions_next = torch.cat(all_next_actions, dim=1).to(device)
        with torch.no_grad():
            q_next = self.critic_target(critic_input(next_states, actions_next))
        q_exp = self.critic_local(critic_input(states, actions))
        q_t = rewards.index_select(1, index) + (gamma * q_next * (1 - dones.index_select(1, index)))
        F.mse_loss(q_exp, q_t.detach()).backward()
        self.critic_optimizer.step()

        self.actor_optimizer.zero_grad()

        actions_pred = [actions if i == self.index else actions.detach() for i, actions in enumerate(all_actions)]
        actions_pred = torch.cat(actions_pred, dim=1).to(device)
        actor_loss = -self.critic_local(critic_input(states, actions_pred)).mean()
        actor_loss.backward()

        self.actor_optimizer.step()

        self.actor_target.soft_update(self.actor_local, self.tau)
        self.critic_target.soft_update(self.critic_local, self.tau)
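The critic_input helper used in learn() is not shown here; a plausible minimal version, assuming the centralized critic simply consumes the joint observations and joint actions concatenated along the feature dimension:

import torch


def critic_input(states, actions):
    # Assumed helper: concatenate joint states and joint actions so the
    # centralized critic sees one flat (batch, state_dim + action_dim) tensor.
    return torch.cat((states, actions), dim=1)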
Example #3
class DDPG_agent(nn.Module):
    def __init__(self, in_actor, in_critic, action_size, num_agents,
                 random_seed):
        """Initialize the agent."""
        super(DDPG_agent, self).__init__()

        self.action_size = action_size
        self.seed = random_seed

        # Fully connected actor network
        self.actor_local = Actor(in_actor, self.action_size,
                                 self.seed).to(device)
        self.actor_target = Actor(in_actor, self.action_size,
                                  self.seed).to(device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Fully connected critic network
        self.critic_local = Critic(in_critic, num_agents * self.action_size,
                                   self.seed).to(device)
        self.critic_target = Critic(in_critic, num_agents * self.action_size,
                                    self.seed).to(device)
        self.critic_optimizer = Adam(self.critic_local.parameters(),
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck noise process for exploration
        self.noise = OUNoise(action_size, random_seed)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def target_act(self, state, add_noise=True):
        """Returns actions for the given state as per the target policy.
        Note: `add_noise` is accepted for interface symmetry but is unused here."""
        action = self.actor_target(state)
        return action

    def reset(self):
        """ Resets noise """
        self.noise.reset()
Example #4
class Agent():
    """Main DDPG agent that extracts experiences and learns from them"""
    def __init__(self, state_size, action_size, random_seed):
        """
        Initializes Agent object.
        @Param:
        1. state_size: dimension of each state.
        2. action_size: number of actions.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        #Actor network
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        #Critic network
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        #Perform hard copy
        self.hard_copy_weights(self.actor_target, self.actor_local)
        self.hard_copy_weights(self.critic_target, self.critic_local)

        #Noise proccess
        self.noise = OUNoise(action_size,
                             random_seed)  #define Ornstein-Uhlenbeck process

    def reset(self):
        """Resets the noise process to mean"""
        self.noise.reset()

    def act(self, state, add_noise=True):
        """
        Returns a deterministic action given current state.
        @Param:
        1. state: current state, S.
        2. add_noise: (bool) add bias to agent, default = True (training mode)
        """
        state = torch.from_numpy(state).float().to(
            device)  #typecast to torch.Tensor
        self.actor_local.eval()  #set in evaluation mode
        with torch.no_grad():  #reset gradients
            action = self.actor_local(state).cpu().data.numpy(
            )  #deterministic action based on Actor's forward pass.
        self.actor_local.train()  #set training mode

        #If training mode, i.e. add_noise = True, add noise to the model to learn a more accurate policy for current state.
        if (add_noise):
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def learn(self, experiences, gamma):
        """
        Learn from a set of experiences picked up from a random sampling of even frequency (not prioritized)
        of experiences when buffer_size = MINI_BATCH.
        Updates policy and value parameters accordingly
        @Param:
        1. experiences: (Tuple[torch.Tensor]) set of experiences, trajectory, tau. tuple of (s, a, r, s', done)
        2. gamma: immediate reward hyper-parameter, 0.99 by default.
        """
        #Extrapolate experience into (state, action, reward, next_state, done) tuples
        states, actions, rewards, next_states, dones = experiences

        #Update Critic network
        actions_next = self.actor_target(
            next_states
        )  # Get predicted next-state actions and Q values from target models
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next *
                               (1 - dones))  # r + γ * Q(s', a') for non-terminal s'

        # Compute critic loss using MSE
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                 1)  #clip gradients
        self.critic_optimizer.step()

        #Update Actor Network

        # Compute actor loss
        actions_pred = self.actor_local(states)  # μ(s)
        actor_loss = -self.critic_local(states,
                                        actions_pred).mean()  # -Q(s, μ(s))
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters. Copies model τ every experience.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_copy_weights(self, target, source):
        """
        Copy weights from source to target network
        @Params:
        1. target: copy weights into (destination).
        2. source: copy weights from (source).
        """
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
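This Agent owns no replay buffer and no step() method, so a driver loop has to collect experience and call learn() itself. A sketch of such a loop, where env is a Gym-style environment and ReplayBuffer, BUFFER_SIZE, BATCH_SIZE and GAMMA stand in for whatever the surrounding project defines:

# Illustrative driver only; names other than Agent are assumptions about the host project.
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0],
              random_seed=0)
memory = ReplayBuffer(action_size=env.action_space.shape[0],
                      buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE, seed=0)

for episode in range(1000):
    state = env.reset()
    agent.reset()
    done = False
    while not done:
        action = agent.act(state, add_noise=True)
        next_state, reward, done, _ = env.step(action)
        memory.add(state, action, reward, next_state, done)
        if len(memory) > BATCH_SIZE:
            agent.learn(memory.sample(), GAMMA)
        state = next_state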
Example #5
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # Variables to store best score and scores
        self.best_score = -np.inf
        self.score_list = []

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0

        self.noise.reset()
        state = self.task.reset()
        self.last_state = state

        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state
        self.last_state = next_state

        # Track rewards
        self.total_reward += reward
        self.count += 1
        if done:
            # Average total reward by step counts
            self.score = self.total_reward / float(
                self.count) if self.count else 0.0
            # Store scores and update the best score
            self.score_list.append(self.score)
            if self.score > self.best_score:
                self.best_score = self.score

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
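A note on the get_action_gradients / train_fn pattern above: it implements the deterministic policy gradient. The critic supplies ∇_a Q(s, a) evaluated at a = μ(s), and the actor's custom training function ascends

    ∇_θ J ≈ mean over the batch of [ ∇_a Q(s, a)|_{a=μ_θ(s)} · ∇_θ μ_θ(s) ]

which is why [states, action_gradients, 1] (the trailing 1 most likely being the Keras learning-phase flag) is fed to the actor instead of an explicit loss.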
Example #6
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.timestep = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done, agent_num):
        """Save experience in replay memory,
        and use random sample from buffer to learn."""
        self.timestep += 1

        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA, agent_num)

    def act(self, states, eps, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        if add_noise:
            actions += eps * self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_num):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]):
                tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # -------------------------- update critic -------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
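        # NOTE: the fixed slice actions[:, :2] below appears to assume exactly two
        # agents, each with a 2-dimensional action (e.g. the Tennis environment).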
        if agent_num == 0:
            actions_next = torch.cat((actions_next, actions[:, :2]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)

        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # -------------------------- update actor -------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        if agent_num == 0:
            actions_pred = torch.cat((actions_pred, actions[:, :2]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # --------------------- update target networks --------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #7
class AgentDDPG():
    def __init__(self, env):
        """

        :param task: (class instance) Instructions about the goal and reward
        """

        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        self.action_low = env.action_space.low
        self.action_high = env.action_space.high
        self.score = 0.0
        self.best = 0.0

        # Instances of the policy function or actor and the value function or critic
        # Actor critic with Advantage

        # Actor local and target
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)

        # Save actor model for future use
        actor_local_model_yaml = self.actor_local.model.to_yaml()
        with open("actor_local_model.yaml", "w") as yaml_file:
            yaml_file.write(actor_local_model_yaml)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic local and target
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model with local model
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Initialize the Ornstein-Uhlenbeck noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Initialize the Replay Memory
        self.buffer_size = 100000
        self.batch_size = 64  # original 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Parameters for the Algorithm
        self.gamma = 0.99  # Discount factor
        self.tau = 0.01  # Soft update for target parameters Actor Critic with Advantage

    # Actor can reset the episode
    def reset_episode(self):
        # Reset the running reward total and the step count
        self.total_reward = 0.0
        self.count = 0
        # Reset the OU noise process
        self.noise.reset()
        # Get a fresh initial state from the environment
        state = self.env.reset()
        # Keep it as the last observed state
        self.last_state = state
        # Return the initial state
        return state

    # Actor interact with the environment
    def step(self, action, reward, next_state, done):
        # Add this time step's reward to the running total
        self.total_reward += reward
        # Increment the episode's step count
        self.count += 1
        # Store the transition (from the previous state) in the replay buffer
        self.memory.add(self.last_state, action, reward, next_state, done)
        # Check to see if you have enough to produce a batch
        # and learn from it
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            # Train the networks using the experiences
            self.learn(experiences)

        # Roll over the last state
        self.last_state = next_state

    # Actor determines what to do based on the policy
    def act(self, state):
        # Given a state return the action recommended by the policy
        # Reshape the state to fit the keras model input
        state = np.reshape(state, newshape=[-1, self.state_size])
        # Pass the state to the actor local model to get an action
        # recommend for the policy in a state
        action = self.actor_local.model.predict(state)[0]
        # Because we are exploring we add some noise to the
        # action vector
        return list(action + self.noise.sample())

    # This is the Actor learning logic called when the agent
    # take a step to learn
    def learn(self, experiences):
        """
        Learning means that the networks parameters needs to be updated
        Using the experineces batch.
        Network learns from experiences not form interaction with the
        environment
        """

        # Reshape the experience tuples in separate arrays of states, actions
        # rewards, next_state, done
        # Your are converting every memeber of the tuple in a column or vector
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # First we pass a batch of next states to the actor so it tells us which
        # actions to execute; we use the target actor instead of the local actor
        # because the slowly updated target network keeps the bootstrapped targets stable
        actions_next = self.actor_target.model.predict_on_batch(next_states)

        # The target critic evaluates the actions chosen by the target actor and
        # produces the Q(s, a) value of those actions. The next states come from the
        # ReplayBuffer, not from interacting with the environment.
        # Remember: the critic (value function) takes (states, actions) as input
        Q_targets_next = self.critic_target.model.predict_on_batch(
            ([next_states, actions_next]))

        # Q_targets_next is a vector of action values Q(s', a') for next_states
        # randomly sampled from the replay buffer. From it we compute the target Q(s, a)
        # with the one-step TD target: r + γ * Q(s', a'),
        # where the bootstrap term is zeroed out for terminal states.
        # This lets us train the critic in a supervised-learning fashion.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train the actor
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # Custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights)

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    def get_episode_score(self):
        """
        Calculate the episode scores
        :return: None
        """
        # Update score and best score
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0
        if self.score > self.best:
            self.best = self.score

    def save_model_weights(self, actor_model):
        actor_model.model.save_weights('weights.h5')
Example #8
class DdpgAgent:
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 seed,
                 batch_size,
                 buffer_size,
                 actor_lr,
                 critic_lr,
                 weight_decay,
                 tau,
                 update_every,
                 gamma,
                 device,
                 hidden_1_size=256,
                 hidden_2_size=128,
                 checkpoint_dir="."):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): how many agents are running in each step
            seed (int): random seed
            batch_size (int): how many experience tuples to process at once
            buffer_size (int): size of experience buffer
            actor_lr (float): actor learning rate alpha
            critic_lr (float): critic learning rate alpha
            weight_decay (float): rate of nn weight decay for critic network
            tau (float): soft update rate for synchronizing target and train network weights 
            update_every (int): how many env steps to train agent
            gamma (float): reward discount factor
            device (string): device to run PyTorch computation on (CPU, GPU)
            checkpoint_dir (string) : where to save checkpoints (trained weights)
        """
        self.num_agents = num_agents
        self.seed = torch.manual_seed(seed)
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.tau = tau
        self.action_size = action_size

        self.update_every = update_every
        self.gamma = gamma
        self.device = device

        # NN models for
        # network size

        # Critic
        self.critic_train = CriticQNetwork(state_size, action_size, seed,
                                           hidden_1_size,
                                           hidden_2_size).to(device)
        self.critic_target = CriticQNetwork(state_size, action_size, seed,
                                            hidden_1_size,
                                            hidden_2_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_train.parameters(),
                                           lr=critic_lr,
                                           weight_decay=weight_decay)
        # Actor
        self.actor_train = ActorQNetwork(state_size, action_size, seed,
                                         hidden_1_size,
                                         hidden_2_size).to(device)
        self.actor_target = ActorQNetwork(state_size, action_size, seed,
                                          hidden_1_size,
                                          hidden_2_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_train.parameters(),
                                          lr=actor_lr)

        # init Noise process
        self.noise = OUNoise((num_agents, action_size),
                             seed,
                             theta=0.15,
                             sigma=0.2)

        # init Replay Buffer
        self.memory = ReplayBuffer(action_size=action_size,
                                   buffer_size=buffer_size,
                                   batch_size=batch_size,
                                   seed=seed,
                                   device=device)

        self.step_counter = 0
        self.actor_loss = 0
        self.critic_loss = 0

        #checkpointing
        self.checkpoint_dir = checkpoint_dir

        self.actor_weights = self.checkpoint_dir + "/" + "actor.pth"
        self.critic_weights = self.checkpoint_dir + "/" + "critic.pth"

    def load_checkpoint(self, file_prefix=None):
        actor_weights = file_prefix + "_actor.pth" if file_prefix else self.actor_weights
        critic_weights = file_prefix + "_critic.pth" if file_prefix else self.critic_weights

        if os.path.isfile(actor_weights) and os.path.isfile(critic_weights):
            self.actor_target.load_state_dict(torch.load(actor_weights))
            self.actor_train.load_state_dict(torch.load(actor_weights))
            self.critic_target.load_state_dict(torch.load(critic_weights))
            self.critic_train.load_state_dict(torch.load(critic_weights))

    def save_checkpoint(self, file_name=None):
        actor_weights = file_name + "_actor.pth" if file_name else self.actor_weights
        critic_weights = file_name + "_critic.pth" if file_name else self.critic_weights
        torch.save(self.actor_train.state_dict(), actor_weights)
        torch.save(self.critic_train.state_dict(), critic_weights)

    def act(self, states, add_noise=True, noise_decay=1.0):
        states = torch.from_numpy(states).float().to(self.device)
        self.actor_train.eval()
        with torch.no_grad():
            actions = self.actor_train(states).cpu().data.numpy()
        self.actor_train.train()

        if add_noise:
            actions += self.noise.sample() * noise_decay
        return np.clip(actions, -1, 1)

    def step(self, states, actions, rewards, next_states, dones):
        for s, a, r, s_next, d in zip(states, actions, rewards, next_states, dones):
            self.memory.add(s, a, r, s_next, d)

        self.step_counter += 1
        if len(self.memory) >= self.batch_size:
            self.learn(self.memory.sample())

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## DDPG  implementation
        #### Critic network training

        # Calculate Q_Targets
        # first use target Actor to predict best next actions for next states S'
        with torch.no_grad():
            target_actions_pred = self.actor_target(next_states).to(
                self.device)
        # Then use the target critic to assess the Q value of this (S', predicted action) pair
        target_pred = self.critic_target(next_states, target_actions_pred)
        # calculate the Q_target using TD error formula
        Q_target = rewards + (self.gamma * target_pred * (1 - dones))

        # Find the Q value the training critic assigns to the (current state, action actually taken) pair
        Q_pred = self.critic_train(states, actions).to(self.device)

        # Minimize critic loss
        # do Gradient Descent step on Critic train network by minimizing diff between (Q_pred, Q_target)
        self.critic_optimizer.zero_grad()
        critic_loss = F.smooth_l1_loss(Q_pred, Q_target.detach())
        self.critic_loss = critic_loss.cpu().detach().item()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_train.parameters(), 1)
        self.critic_optimizer.step()

        #### Actor network training
        # Find which action the training actor predicts for these states
        actions_pred = self.actor_train(states)
        # Loss is the negative of the training critic's Q estimate of (S, actions_pred),
        # i.e. we maximize (minimize the negative of) the action-value Q predicted by
        # critic_train for the current state and the action predicted by actor_train
        actor_loss = -self.critic_train(states, actions_pred).mean()

        self.actor_loss = actor_loss.cpu().detach().item()
        # minimize Actor loss
        # do Gradient Descent step on Actor train network
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.critic_train, self.critic_target, self.tau)
        self.soft_update(self.actor_train, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
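Typical construction of this agent; every value below is illustrative only and not taken from the original configuration:

import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
agent = DdpgAgent(state_size=24, action_size=2, num_agents=2, seed=0,
                  batch_size=256, buffer_size=int(1e6),
                  actor_lr=1e-4, critic_lr=1e-3, weight_decay=0.0,
                  tau=1e-3, update_every=1, gamma=0.99,
                  device=device)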
Example #9
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, env):
        """Class initialization."""
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        self.action_low = env.action_space.low[0]
        self.action_high = env.action_space.high[0]

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

    def reset(self):
        """Start a new episode."""
        self.noise.reset()
        state = self.env.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Save in experience buffer and batch learn from buffer step.

        Save the action, reward, next_state in the experience buffer and if the
        buffer has enough samples to satisfy the batch size then make a learning
        step.

        """
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        # if len(self.memory) > self.batch_size:
        if len(self.memory) > self.batch_size * 50:
            experiences = self.memory.sample()
            loss_critic = self.learn(experiences)
        else:
            loss_critic = None

        # Roll over last state and action
        self.last_state = next_state

        return loss_critic

    def act(self, state):
        """Return actions for given state(s) as per current policy.

        Also add some noise to the action (control-command) to explore the
        space.

        """
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]

        # add some noise for exploration
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters.

        Use given batch of experience tuples from the experience buffer.

        """

        # Convert experience tuples to separate arrays for each element (states,
        # actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from (target) models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        loss_critic = self.critic_local.model.train_on_batch(
            x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        # Customized actor training function
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        return loss_critic

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        Update the target model with the local model weights. Do so gradually
        by using a soft update parameter, tau.

        Note
        ----
        After training over a batch of experiences, we could just copy our newly
        learned weights (from the local model) to the target model. However,
        individual batches can introduce a lot of variance into the process, so
        it's better to perform a soft update, controlled by the parameter tau.

        """
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + \
            (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
Example #10
class MADDPGAgent(object):
    """Multi Agent DDPG Implementation

    Paper: https://arxiv.org/abs/1706.02275
    I used their code to understand how the agents were implemented
    https://github.com/openai/maddpg
    """
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 agent_index,
                 writer,
                 random_seed,
                 dirname,
                 print_every=1000,
                 model_path=None,
                 saved_config=None,
                 eval_mode=False):
        """Initialize an Agent object.
        
        Parameters:    
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            agent_index (int): index (id) of current agent
            writer (object): visdom visualiser for realtime visualisations            
            random_seed (int): random seed
            dirname (string): output directory to store config, losses
            print_every (int): how often to print progress
            model_path (string): if defined, load saved model to resume training
            eval_mode (bool): whether to use eval mode
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.agent_index = agent_index
        self.writer = writer
        self.dirname = dirname
        self.print_every = print_every
        # save config params
        if not saved_config:
            self.config = CONFIG
            save_to_json(self.config,
                         '{}/hyperparams.json'.format(self.dirname))
        else:
            self.config = json.load(open(saved_config, 'r'))
            logger.info(
                'Loading config from saved location {}'.format(saved_config))

        # Create Critic network
        self.local_critic = Critic(self.state_size * num_agents,
                                   self.action_size * num_agents,
                                   random_seed,
                                   fc1_units=self.config['FC1'],
                                   fc2_units=self.config['FC2']).to(device)
        self.target_critic = Critic(self.state_size * num_agents,
                                    self.action_size * num_agents,
                                    random_seed,
                                    fc1_units=self.config['FC1'],
                                    fc2_units=self.config['FC2']).to(device)
        # Optimizer
        self.critic_optimizer = optim.Adam(
            self.local_critic.parameters(),
            lr=self.config['LR_CRITIC'],
            weight_decay=self.config['WEIGHT_DECAY'])

        # Create Actor network
        self.local_actor = Actor(self.state_size,
                                 self.action_size,
                                 random_seed,
                                 fc1_units=self.config['FC1'],
                                 fc2_units=self.config['FC2']).to(device)
        self.target_actor = Actor(self.state_size,
                                  self.action_size,
                                  random_seed,
                                  fc1_units=self.config['FC1'],
                                  fc2_units=self.config['FC2']).to(device)
        self.actor_optimizer = optim.Adam(self.local_actor.parameters(),
                                          lr=self.config['LR_ACTOR'])

        # Load saved model (if available)
        if model_path:
            logger.info('Loading model from {}'.format(model_path))
            self.local_actor.load_state_dict(
                torch.load('{}/checkpoint_actor_{}.pth'.format(
                    model_path, self.agent_index)))
            self.target_actor.load_state_dict(
                torch.load('{}/checkpoint_actor_{}.pth'.format(
                    model_path, self.agent_index)))
            self.local_critic.load_state_dict(
                torch.load('{}/checkpoint_critic_{}.pth'.format(
                    model_path, self.agent_index)))
            self.target_critic.load_state_dict(
                torch.load('{}/checkpoint_critic_{}.pth'.format(
                    model_path, self.agent_index)))
            if eval_mode:
                logger.info('agent {} set to eval mode'.format(self.agent_index))
                self.local_actor.eval()

        self.noise = OUNoise(self.action_size,
                             random_seed,
                             sigma=self.config['SIGMA'])
        self.learn_step = 0

    def act(self, state, add_noise=True, noise_weight=1):
        """Get the actions to take under the supplied states

        Parameters:
            state (array_like): Game state provided by the environment
            add_noise (bool): Whether we should apply the noise
            noise_weight (int): How much weight should be applied to the noise
        """
        state = torch.from_numpy(state).float().to(device)
        # Run inference in eval mode
        self.local_actor.eval()
        with torch.no_grad():
            action = self.local_actor(state).cpu().data.numpy()
        self.local_actor.train()
        # add noise if true
        if add_noise:
            action += self.noise.sample() * noise_weight
        return np.clip(action, -1, 1)

    def reset(self):
        """Resets the noise"""
        self.noise.reset()

    def learn(self, agents, experience, gamma):
        """Use the experience to allow agents to learn. 
        The critic of each agent can see the actions taken by all agents 
        and incorporate that in the learning.

        Parameters:
            agents (MADDPGAgent): instance of all the agents
            experience (Tuple[torch.Tensor]):  tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        num_agents = len(agents)
        states, actions, rewards, next_states, dones = experience
        # ---------------central critic-------------------
        # use target actor to get action, here we get target actors from
        # all agents to predict the next action
        next_actions = torch.zeros(
            (len(states), num_agents, self.action_size)).to(device)
        for i, agent in enumerate(agents):
            next_actions[:, i] = agent.target_actor(next_states[:, i, :])

        # Flatten state and action
        # e.g from state (100,2,24) --> (100, 48)
        critic_states = flatten(next_states)
        next_actions = flatten(next_actions)

        # calculate target and expected
        Q_targets_next = self.target_critic(critic_states, next_actions)
        Q_targets = rewards[:, self.agent_index, :] + (
            gamma * Q_targets_next * (1 - dones[:, self.agent_index, :]))
        Q_expected = self.local_critic(flatten(states), flatten(actions))

        # use mse loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        critic_loss_value = critic_loss.item()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if self.config['CLIP_GRADS']:
            for param in self.local_critic.parameters():
                param.grad.data.clamp_(-1 * self.config['CLAMP_VALUE'],
                                       self.config['CLAMP_VALUE'])
        self.critic_optimizer.step()

        # ---------------actor---------------------
        # Only update the predicted action of current agent
        predicted_actions = torch.zeros(
            (len(states), num_agents, self.action_size)).to(device)
        predicted_actions.data.copy_(actions.data)
        predicted_actions[:, self.agent_index] = self.local_actor(
            states[:, self.agent_index])
        actor_loss = -self.local_critic(flatten(states),
                                        flatten(predicted_actions)).mean()
        # Kept to remind myself of a mistake that took several hours of investigation
        # and was only found when I looked at the grads from self.local_actor.parameters():
        # actor_loss = -self.local_critic(flatten(states), flatten(actions)).mean()

        actor_loss_value = actor_loss.item()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        if self.config['CLIP_GRADS']:
            for param in self.local_actor.parameters():
                param.grad.data.clamp_(-1 * self.config['CLAMP_VALUE'],
                                       self.config['CLAMP_VALUE'])
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        if self.learn_step == 0:
            # One time only, start local and target with same parameters
            self._copy_weights(self.local_critic, self.target_critic)
            self._copy_weights(self.local_actor, self.target_actor)
        else:
            self.soft_update(self.local_critic, self.target_critic,
                             self.config["TAU"])
            self.soft_update(self.local_actor, self.target_actor,
                             self.config["TAU"])

        self.learn_step += 1
        return actor_loss_value, critic_loss_value

    def _copy_weights(self, source_network, target_network):
        """Copy source network weights to target"""
        for target_param, source_param in zip(target_network.parameters(),
                                              source_network.parameters()):
            target_param.data.copy_(source_param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def checkpoint(self):
        """Checkpoint actor and critic models"""
        if not os.path.exists('{}/multi'.format(self.dirname)):
            os.makedirs('{}/multi'.format(self.dirname))
        torch.save(
            self.local_critic.state_dict(),
            '{}/multi/checkpoint_critic_{}.pth'.format(self.dirname,
                                                       self.agent_index))
        torch.save(
            self.local_actor.state_dict(),
            '{}/multi/checkpoint_actor_{}.pth'.format(self.dirname,
                                                      self.agent_index))
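
    # Hedged sketch, not part of the original class: a restore() helper mirroring the
    # checkpoint() layout above. It assumes the same
    # '{dirname}/multi/checkpoint_*_{agent_index}.pth' file names and loads the saved
    # weights back into the local networks before copying them into the targets.
    def restore(self):
        """Load actor and critic weights saved by checkpoint() (assumed helper)."""
        critic_path = '{}/multi/checkpoint_critic_{}.pth'.format(
            self.dirname, self.agent_index)
        actor_path = '{}/multi/checkpoint_actor_{}.pth'.format(
            self.dirname, self.agent_index)
        self.local_critic.load_state_dict(torch.load(critic_path))
        self.local_actor.load_state_dict(torch.load(actor_path))
        self._copy_weights(self.local_critic, self.target_critic)
        self._copy_weights(self.local_actor, self.target_actor)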
Example #11
class Agent:
    """Interacts with and learns from the environment.
    (see the README for an explanation of the various hyperparameters)
    """
    def __init__(self, state_size: int, action_size: int, agent_no: int,
                 params: dict):
        """Initialize an Agent object.

        Args:
            state_size: dimension of each state
            action_size: dimension of each action
            agent_no: agent id
            params: architecture and hyperparameters
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = params['agent_seed']
        self.batch_size = params['batch_size']
        self.lr_actor = params['lr_actor']
        self.lr_critic = params['lr_critic']
        self.critic_weight_decay = params['critic_weight_decay']
        self.gamma = params['gamma']
        self.tau = params['tau']
        self.update_step = params['update_step']
        self.num_agents = params['num_agents']

        random.seed(self.seed)
        self.t_step = 0
        self.agent_no = agent_no

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 params['first_hidden_units'],
                                 params['second_hidden_units'],
                                 self.seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  params['first_hidden_units'],
                                  params['second_hidden_units'],
                                  self.seed).to(device)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size * self.num_agents,
                                   action_size * self.num_agents,
                                   params['first_hidden_units'],
                                   params['second_hidden_units'],
                                   self.seed).to(device)
        self.critic_target = Critic(state_size * self.num_agents,
                                    action_size * self.num_agents,
                                    params['first_hidden_units'],
                                    params['second_hidden_units'],
                                    self.seed).to(device)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.lr_critic,
            weight_decay=self.critic_weight_decay)

        # Noise process
        self.noise = OUNoise(action_size,
                             self.seed,
                             sigma=params['noise_sigma'])

    def step(self, memory: object, agents: Dict[int, object]):
        """Save experience in replay memory, and use random sample
        from buffer to learn every `update_step` if there are enough samples
        in the buffer to form a batch

        Args:
            memory: fixed-size buffer to store experience tuples
            agents: object references to Agent instances within the environment
        """
        self.t_step += 1

        if (len(memory) >= self.batch_size) & (self.t_step % self.update_step
                                               == 0):
            agents_experiences = memory.sample()
            self.learn(agents_experiences, agents)

    def act(self,
            state: np.array,
            add_noise: bool = True,
            scale: float = 1.0) -> np.array:
        """Returns actions for given state as per current policy.

        Args:
            state: current environment state observed by the agent
            add_noise: whether to add noise to actions for exploration during training
                or not (for evaluation)
            scale: noise scaling parameter

        Returns:
            action: clipped action of the agent
        """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample() * scale
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences: Dict[int, Tuple[torch.tensor, torch.tensor,
                                                 torch.tensor, torch.tensor,
                                                 torch.tensor]],
              agents: Dict[int, object]):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Args:
            experiences: dictionary with agent-specific tuples
                with five tensors each that comprise states, actions, rewards,
                next_states, and dones batch_wise following exactly that order,
                i.e. tensor objects of size
                (`batch_size`, dim) where dim is `state_size` for states
                and next_states, `action_size` for actions, and 1 for rewards and dones
            agents: object references to Agent instances within the environment
        """
        self_rewards = experiences[self.agent_no][2]
        self_dones = experiences[self.agent_no][4]

        joint_next_states = torch.cat(
            [experiences[no][3] for no in range(self.num_agents)], dim=1)

        # compute actions_next by applying each agent's target policy
        # to its next_states observations
        joint_actions_next = torch.cat([
            agents[no].actor_target(experiences[no][3])
            for no in range(self.num_agents)
        ],
                                       dim=1)

        # --------------------------- update critic ---------------------------- #
        # compute the Q_targets (y) using the agent's target critic network
        # on the next_states observations of all agents and joint_actions_next
        Q_targets_next = self.critic_target(joint_next_states,
                                            joint_actions_next)
        Q_targets = self_rewards + (self.gamma * Q_targets_next *
                                    (1 - self_dones))

        joint_states = torch.cat(
            [experiences[no][0] for no in range(self.num_agents)], dim=1)
        joint_actions = torch.cat(
            [experiences[no][1] for no in range(self.num_agents)], dim=1)

        # compute Q_expected applying the local critic to joint state observations
        # and all agents' actions
        Q_expected = self.critic_local(joint_states, joint_actions)

        # Compute critic loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        joint_actions_pred = torch.cat([
            agents[no].actor_local(experiences[no][0])
            for no in range(self.num_agents)
        ],
                                       dim=1)

        # Compute actor loss
        actor_loss = -self.critic_local(joint_states,
                                        joint_actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
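
# Hedged usage sketch (not from the original source): how the Agent above is typically
# wired into a two-agent MADDPG loop. The hyperparameter values, the shared replay
# buffer `memory`, and the per-agent `states` array are assumptions; only the
# act()/step() call pattern is illustrated.
#
# params = {'agent_seed': 0, 'batch_size': 256, 'lr_actor': 1e-4, 'lr_critic': 1e-3,
#           'critic_weight_decay': 0.0, 'gamma': 0.99, 'tau': 1e-3, 'update_step': 1,
#           'num_agents': 2, 'first_hidden_units': 256, 'second_hidden_units': 128,
#           'noise_sigma': 0.2}
# agents = {no: Agent(state_size=24, action_size=2, agent_no=no, params=params)
#           for no in range(params['num_agents'])}
# actions = np.vstack([agents[no].act(states[no]) for no in agents])
# # ...store the joint transition in the shared replay buffer, then...
# for no in agents:
#     agents[no].step(memory, agents)  # samples a batch and calls learn(experiences, agents)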
Example #12
class Agent():
    """Main DDPG agent that extracts experiences and learns from them"""
    def __init__(self, state_size=24, action_size=2, random_seed=0):
        """
        Initializes Agent object.
        @Param:
        1. state_size: dimension of each state.
        2. action_size: number of actions.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        #Actor network
        self.actor_local = Actor(self.state_size, self.action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        #Critic network
        self.critic_local = Critic(self.state_size, self.action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC)

        #Noise process
        self.noise = OUNoise(action_size,
                             random_seed)  #define Ornstein-Uhlenbeck process

        #Replay memory
        self.memory = ReplayBuffer(
            self.action_size, BUFFER_SIZE, MINI_BATCH,
            random_seed)  #define experience replay buffer object

    def step(self, time_step, state, action, reward, next_state, done):
        """
        Saves an experience in the replay memory to learn from using random sampling.
        @Param:
        1. state: current state, S.
        2. action: action taken based on current state.
        3. reward: immediate reward from state, action.
        4. next_state: next state, S', from action, a.
        5. done: (bool) has the episode terminated?
        Exracted version for trajectory used in calculating the value for an action, a."""

        self.memory.add(state, action, reward, next_state,
                        done)  #append to memory buffer

        # only learn every n_time_steps
        if time_step % N_TIME_STEPS != 0:
            return

        #check if enough samples in buffer. if so, learn from experiences, otherwise, keep collecting samples.
        if (len(self.memory) > MINI_BATCH):
            for _ in range(N_LEARN_UPDATES):
                experience = self.memory.sample()
                self.learn(experience)

    def reset(self):
        """Resets the noise process to mean"""
        self.noise.reset()

    def act(self, state, add_noise=True):
        """
        Returns a deterministic action given current state.
        @Param:
        1. state: current state, S.
        2. add_noise: (bool) add exploration noise, default = True (training mode)
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(
            device)  #typecast to torch.Tensor
        self.actor_local.eval()  #set in evaluation mode
        with torch.no_grad():  #disable gradient tracking
            action = self.actor_local(state).cpu().data.numpy(
            )  #deterministic action based on Actor's forward pass.
        self.actor_local.train()  #set training mode

        #If training mode, i.e. add_noise = True, add noise to the model to learn a more accurate policy for current state.
        if (add_noise):
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def learn(self, experiences, gamma=GAMMA):
        """
        Learn from a set of experiences picked up from a random sampling of even frequency (not prioritized)
        of experiences when buffer_size = MINI_BATCH.
        Updates policy and value parameters accordingly
        @Param:
        1. experiences: (Tuple[torch.Tensor]) set of experiences, trajectory, tau. tuple of (s, a, r, s', done)
        2. gamma: immediate reward hyper-parameter, 0.99 by default.
        """
        #Extrapolate experience into (state, action, reward, next_state, done) tuples
        states, actions, rewards, next_states, dones = experiences

        #Update Critic network
        actions_next = self.actor_target(
            next_states
        )  # Get predicted next-state actions and Q values from target models
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next *
                               (1 - dones))  #  r + γ * Q-values(a,s)

        # Compute critic loss using MSE
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                 1)  #clip gradients
        self.critic_optimizer.step()

        #Update Actor Network

        # Compute actor loss
        actions_pred = self.actor_local(states)  #gets mu(s)
        actor_loss = -self.critic_local(states,
                                        actions_pred).mean()  #gets Q(s,a)
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters. Copies model τ every experience.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
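
# The Agent above references module-level hyperparameters that are defined elsewhere in
# its source file. The values below are only a plausible sketch (typical DDPG settings),
# not the author's actual configuration.
#
# BUFFER_SIZE = int(1e6)   # replay buffer size
# MINI_BATCH = 128         # minibatch size
# GAMMA = 0.99             # discount factor
# TAU = 1e-3               # soft-update interpolation parameter
# LR_ACTOR = 1e-4          # actor learning rate
# LR_CRITIC = 1e-3         # critic learning rate
# N_TIME_STEPS = 20        # learn every N environment steps
# N_LEARN_UPDATES = 10     # gradient updates per learning step
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
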
class DDPG():
    """ Deep Deterministic Policy Gradients Agent used to interaction with and learn from an environment """
    def __init__(self, state_size: int, action_size: int, num_agents: int,
                 epsilon, random_seed: int):
        """ Initialize a DDPG Agent Object

        :param state_size: dimension of state (input)
        :param action_size: dimension of action (output)
        :param num_agents: number of concurrent agents in the environment
        :param epsilon: initial value of epsilon for exploration
        :param random_seed: random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.t_step = 0

        # Hyperparameters
        self.buffer_size = 1000000
        self.batch_size = 128
        self.update_every = 10
        self.num_updates = 10
        self.gamma = 0.99
        self.tau = 0.001
        self.lr_actor = 0.0001
        self.lr_critic = 0.001
        self.weight_decay = 0
        self.epsilon = epsilon
        self.epsilon_decay = 0.97
        self.epsilon_min = 0.005

        # Networks (Actor: State -> Action, Critic: (State,Action) -> Value)
        self.actor_local = Actor(self.state_size, self.action_size,
                                 random_seed).to(self.device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)
        self.critic_local = Critic(self.state_size, self.action_size,
                                   random_seed).to(self.device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)
        # Initialize actor and critic networks to start with same parameters
        self.soft_update(self.actor_local, self.actor_target, tau=1)
        self.soft_update(self.critic_local, self.critic_target, tau=1)

        # Noise Setup
        self.noise = OUNoise(self.action_size, random_seed)

        # Replay Buffer Setup
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    def __str__(self):
        return "DDPG_Agent"

    def train(self,
              env,
              brain_name,
              num_episodes=200,
              max_time=1000,
              print_every=10):
        """ Interacts with and learns from a given Unity Environment

        :param env: Unity Environment the agents is trying to learn
        :param brain_name: Brain for Environment
        :param num_episodes: Number of episodes to train
        :param max_time: How long each episode runs for
        :param print_every: How often in episodes to print a running average
        :return: episode scores and running averages over the last print_every episodes, as lists
        """
        # --------- Set Everything up --------#
        scores = []
        avg_scores = []
        scores_deque = deque(maxlen=print_every)

        # -------- Simulation Loop --------#
        for episode_num in range(1, num_episodes + 1):
            # Reset everything
            env_info = env.reset(train_mode=True)[brain_name]
            states = env_info.vector_observations
            episode_scores = np.zeros(self.num_agents)
            self.reset_noise()
            # Run the episode
            for t in range(max_time):
                actions = self.act(states, self.epsilon)
                env_info = env.step(actions)[brain_name]
                next_states, rewards, dones = env_info.vector_observations, env_info.rewards, env_info.local_done
                self.step(states, actions, rewards, next_states, dones)
                episode_scores += rewards
                states = next_states
                if np.any(dones):
                    break

            # -------- Episode Finished ---------#
            self.epsilon *= self.epsilon_decay
            self.epsilon = max(self.epsilon, self.epsilon_min)
            scores.append(np.mean(episode_scores))
            scores_deque.append(np.mean(episode_scores))
            avg_scores.append(np.mean(scores_deque))
            if episode_num % print_every == 0:
                print(
                    f'Episode: {episode_num} \tAverage Score: {round(np.mean(scores_deque), 2)}'
                )
                torch.save(
                    self.actor_local.state_dict(),
                    f'{PATH}/checkpoints/{self.__str__()}_Actor_Multiple.pth')
                torch.save(
                    self.critic_local.state_dict(),
                    f'{PATH}/checkpoints/{self.__str__()}_Critic_Multiple.pth')

        # -------- All Episodes finished Save parameters and scores --------#
        # Save Model Parameters
        torch.save(self.actor_local.state_dict(),
                   f'{PATH}/checkpoints/{self.__str__()}_Actor_Multiple.pth')
        torch.save(self.critic_local.state_dict(),
                   f'{PATH}/checkpoints/{self.__str__()}_Critic_Multiple.pth')
        # Save mean score per episode (of the 20 agents)
        f = open(f'{PATH}/scores/{self.__str__()}_Multiple_Scores.txt', 'w')
        scores_string = "\n".join([str(score) for score in scores])
        f.write(scores_string)
        f.close()
        # Save running-average scores (window of print_every episodes)
        f = open(f'{PATH}/scores/{self.__str__()}_Multiple_AvgScores.txt', 'w')
        avgScores_string = "\n".join([str(score) for score in avg_scores])
        f.write(avgScores_string)
        f.close()
        return scores, avg_scores

    def step(self, states, actions, rewards, next_states, dones):
        """ what the agent needs to do for every time step that occurs in the environment. Takes
        in a (s,a,r,s',d) tuple and saves it to memeory and learns from experiences. Note: this is not
        the same as a step in the environment. Step is only called once per environment time step.

        :param states: array of states agent used to select actions
        :param actions: array of actions taken by agents
        :param rewards: array of rewards for last action taken in environment
        :param next_states: array of next states after actions were taken
        :param dones: array of bools representing if environment is finished or not
        """
        # Save experiences in replay memory
        for agent_num in range(self.num_agents):
            self.memory.add(states[agent_num], actions[agent_num],
                            rewards[agent_num], next_states[agent_num],
                            dones[agent_num])

        # Learn "num_updates" times every "update_every" time step
        self.t_step += 1
        if (len(self.memory) > self.batch_size
                and self.t_step % self.update_every == 0):
            self.t_step = 0
            for _ in range(self.num_updates):
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, states, epsilon, add_noise=True):
        """ Returns actions for given states as per current policy. Policy comes from the actor network.

        :param states: array of states from the environment
        :param epsilon: probability of exploration
        :param add_noise: whether to potentially add exploration noise to the action
        :return: clipped actions
        """
        states = torch.from_numpy(states).float().to(self.device)
        self.actor_local.eval()  # Sets to eval mode (no gradients)
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()  # Sets to train mode (gradients back on)
        if add_noise and epsilon > np.random.random():
            actions += [self.noise.sample() for _ in range(self.num_agents)]
        return np.clip(actions, -1, 1)

    def reset_noise(self):
        """ resets to noise parameters """
        self.noise.reset()

    def learn(self, experiences):
        """ Update actor and critic networks using a given batch of experiences
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(states) -> actions
            critic_target(states, actions) -> Q-value
        :param experiences: tuple of arrays (states, actions, rewards, next_states, dones)  sampled from the replay buffer
        """

        states, actions, rewards, next_states, dones = experiences
        # -------------------- Update Critic -------------------- #
        # Use target networks for getting next actions and q values and calculate q_targets
        next_actions = self.actor_target(next_states)
        next_q_targets = self.critic_target(next_states, next_actions)
        q_targets = rewards + (self.gamma * next_q_targets * (1 - dones))
        # Compute critic loss (Same as DQN Loss)
        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)
        # Minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # -------------------- Update Actor --------------------- #
        # Compute actor loss (maximize mean of Q(states,actions))
        action_preds = self.actor_local(states)
        # Optimizer minimizes and we want to maximize so multiply by -1
        actor_loss = -1 * self.critic_local(states, action_preds).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        #---------------- Update Target Networks ---------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_network, target_network, tau):
        """ soft update newtwork parametes
        θ_target = τ*θ_local + (1 - τ)*θ_target

        :param local_network: PyTorch Network that is always up to date
        :param target_network: PyTorch Network that is not up to date
        :param tau: update (interpolation) parameter
        """
        for target_param, local_param in zip(target_network.parameters(),
                                             local_network.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
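
# Hedged usage sketch (assumptions only): driving the DDPG class above with a
# Udacity-style Unity environment through the unityagents wrapper. The environment file
# name and the hyperparameter choices are placeholders, not taken from the source.
#
# from unityagents import UnityEnvironment
#
# env = UnityEnvironment(file_name='Reacher.app')  # assumed path
# brain_name = env.brain_names[0]
# brain = env.brains[brain_name]
# env_info = env.reset(train_mode=True)[brain_name]
# agent = DDPG(state_size=env_info.vector_observations.shape[1],
#              action_size=brain.vector_action_space_size,
#              num_agents=len(env_info.agents),
#              epsilon=1.0,
#              random_seed=0)
# scores, avg_scores = agent.train(env, brain_name, num_episodes=200)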
Example #14
class Agent():
    def __init__(self, model_name, state_size, action_size, random_seed=0):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.model_name = model_name
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.rewards = list()
        self.losses = deque(maxlen=100)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def sense(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        self.rewards.append(reward)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset_episode(self):
        self.rewards = list()
        self.noise.reset()

    def ave_loss(self):
        return sum(self.losses) / max(1, len(self.losses))

    def cum_rewards(self):
        return sum(self.rewards)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        self.losses.append(actor_loss.item())

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def load(self):
        afn = "{}_actor.mdl".format(self.model_name)
        cfn = "{}_critic.mdl".format(self.model_name)
        state_dict = torch.load(afn)
        self.actor_local.load_state_dict(state_dict)
        state_dict = torch.load(cfn)
        self.critic_local.load_state_dict(state_dict)
        log.info("loaded {}, {}".format(afn, cfn))
        return self

    def save(self):
        afn = "{}_actor.mdl".format(self.model_name)
        cfn = "{}_critic.mdl".format(self.model_name)
        torch.save(self.actor_local.state_dict(), afn)
        torch.save(self.critic_local.state_dict(), cfn)
        log.info("saved to {}, {}".format(afn, cfn))
        return self
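
# The load()/save() methods above use a module-level `log` object defined elsewhere in
# the source; a minimal sketch of the assumed logging setup:
#
# import logging
# logging.basicConfig(level=logging.INFO)
# log = logging.getLogger(__name__)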
Example #15
class DdpgActor:
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 batch_size,
                 actor_lr,
                 experience_buffer,
                 critic,
                 weight_decay,
                 tau,
                 update_every,
                 gamma,
                 device,
                 hidden_1_size=256,
                 hidden_2_size=128,
                 checkpoint_dir="."):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            hidden_1_size (int): units in the first hidden layer
            hidden_2_size (int): units in the second hidden layer
            seed (int): random seed
            batch_size (int): how many experience tuples to process at once
            actor_lr (float): actor learning rate alpha
            experience_buffer (ReplayBuffer): experience replay buffer
            critic (DdpgCritic): Critic instance
            weight_decay (float): rate of nn weight decay for critic network
            tau (float): soft update rate for synchronizing target and train network weights 
            update_every (int): how many env steps to train agent
            gamma (float): reward discount factor
            device (string): device to run PyTorch computation on (CPU, GPU)
            checkpoint_dir (string) : where to save checkpoints (trained weights)
        """
        self.seed = torch.manual_seed(seed)
        self.batch_size = batch_size
        self.tau = tau
        self.update_every = update_every
        self.gamma = gamma
        self.device = device

        # NN models
        # (the critic is injected via the `critic` argument; the commented-out block
        # below shows what an internally-owned critic would look like)

        # Critic
        # self.critic_train     = CriticQNetwork(state_size, action_size, seed, hidden_1_size, hidden_2_size).to(device)
        # self.critic_target    = CriticQNetwork(state_size, action_size, seed, hidden_1_size, hidden_2_size).to(device)
        # self.critic_optimizer = optim.Adam(self.critic_train.parameters(), lr=critic_lr, weight_decay=weight_decay)
        # Actor
        self.actor_train = ActorQNetwork(state_size, action_size, seed,
                                         hidden_1_size,
                                         hidden_2_size).to(device)
        self.actor_target = ActorQNetwork(state_size, action_size, seed,
                                          hidden_1_size,
                                          hidden_2_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_train.parameters(),
                                          lr=actor_lr)

        # init Noise process
        self.noise = OUNoise(action_size, seed, theta=0.15, sigma=0.2)

        # # init Replay Buffer
        # self.memory = ReplayBuffer(action_size= action_size,
        #                            buffer_size=buffer_size,
        #                            batch_size=batch_size,
        #                            seed=seed,
        #                           device=device)

        self.memory = experience_buffer
        self.critic = critic
        self.step_counter = 0
        self.critic_loss = 0
        self.actor_loss = 0

        #checkpointing
        self.checkpoint_dir = checkpoint_dir

        self.actor_weights = self.checkpoint_dir + "/" + "actor.pth"

    def load_checkpoint(self, file_prefix=None):
        actor_weights = file_prefix + "_actor.pth" if file_prefix else self.actor_weights

        if os.path.isfile(actor_weights):
            self.actor_target.load_state_dict(torch.load(actor_weights))
            self.actor_train.load_state_dict(torch.load(actor_weights))

    def save_checkpoint(self, file_name=None):
        actor_weights = file_name + "_actor.pth" if file_name else self.actor_weights
        torch.save(self.actor_train.state_dict(), actor_weights)

    def act(self, state, add_noise=True, noise_decay=1.0):
        state = torch.from_numpy(state).unsqueeze(0).float().to(self.device)
        self.actor_train.eval()
        with torch.no_grad():
            actions = self.actor_train(state).cpu().data.numpy()
        self.actor_train.train()

        if add_noise:
            if self.step_counter < 5000:
                actions += np.random.standard_normal(self.noise.size)
            else:
                actions += self.noise.sample() * noise_decay
        return np.clip(actions, -1, 1)

    def step(self):
        self.step_counter += 1
        if len(self.memory) >= self.batch_size:
            for _ in range(5):
                self.critic.learn(self.memory.sample(), self)
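
# DdpgActor.step() above defers all learning to self.critic.learn(experiences, actor).
# A hedged outline of the interface that implies (the real DdpgCritic lives elsewhere in
# the source; everything below is an assumption, not the author's implementation):
#
# class DdpgCritic:
#     def learn(self, experiences, actor):
#         states, actions, rewards, next_states, dones = experiences
#         # 1. critic update: y = r + gamma * Q_target(s', actor.actor_target(s'))
#         # 2. actor update: maximize Q_local(s, actor.actor_train(s))
#         # 3. soft-update both target networks using tau
#         ...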
Example #16
class SingleDDPGAgent:
    """
        Single agent DDPG.
        Interacts with and learns from the environment.
    """
    def __init__(self, state_size, action_size, cfg, num_agents=1, agent_id=0):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            cfg (config object): main configuration with other passed settings
            num_agents (int): optional (default: 1). If >1, the critic's state and action
                            input sizes are multiplied accordingly. Used with MADDPG.
            agent_id (int): optional (default: 0). Set agent id for MADDPG.
        """
        print("Initializing single DDPG agent!")

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(cfg.random_seed)
        self.n_agents = num_agents
        self.agent_id = agent_id

        self.cfg = cfg

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, cfg.random_seed,
                                 cfg.dense_layers_actor).to(device)
        self.actor_target = Actor(state_size, action_size, cfg.random_seed,
                                  cfg.dense_layers_actor).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=cfg.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size * num_agents,
                                   action_size * num_agents, cfg.random_seed,
                                   cfg.dense_layers_critic).to(device)
        self.critic_target = Critic(state_size * num_agents,
                                    action_size * num_agents, cfg.random_seed,
                                    cfg.dense_layers_critic).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=cfg.lr_critic,
                                           weight_decay=cfg.weight_decay)

        self.hard_copy_weights(self.critic_local, self.critic_target)
        self.hard_copy_weights(self.actor_local, self.actor_target)

        self.t_step = 0

        # Noise process
        self.noise = OUNoise(action_size,
                             cfg.random_seed,
                             theta=cfg.theta_ou,
                             sigma=cfg.sigma_ou)

        # Replay memory
        self.memory = ReplayBuffer(action_size, cfg.buffer_size,
                                   cfg.batch_size, cfg.random_seed, cfg)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        max_prio = self.memory.get_max_priority()
        self.memory.add(state, action, reward, next_state, max_prio, done)

        # Learn, if enough samples are available in memory
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.cfg.update_every
        if self.t_step == 0:
            if len(self.memory) > self.cfg.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.cfg.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state.view(
                1, -1)).squeeze().cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def target_act(self, state):
        """ Let target network return action."""
        self.actor_target.eval()
        with torch.no_grad():
            action_target = self.actor_target(state)

        return torch.clamp(action_target, -1, 1)

    def reset(self):
        self.t_step = 0
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', prio, done, indices) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, priorities, dones, indices = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)

        if self.cfg.prioritized_replay:
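            # Importance-sampling correction for prioritized replay, as implemented
            # below: weights are computed as w_i = (batch_size * P(i)) ** (-beta) and
            # normalized by their maximum, the transition priorities are refreshed with
            # the absolute TD-error |Q_targets - Q_expected|, and the weights are folded
            # into the MSE by scaling both Q_expected and Q_targets.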
            weights = 1. / (
                (self.cfg.batch_size * priorities)**self.cfg.priority_beta)
            weights /= max(weights)
            # calculating new transition priorities based on residuals
            # between target and local network predictions
            diffs = Q_targets - Q_expected  # TD-error
            diffs = np.abs(np.squeeze(diffs.tolist()))
            self.memory.update_prios(indices, diffs)
            # bias-annealing weights
            Q_expected *= weights
            Q_targets *= weights

        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.cfg.tau)
        self.soft_update(self.actor_local, self.actor_target, self.cfg.tau)

    @staticmethod
    def hard_copy_weights(local_model, target_model):
        """Update model parameters.

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_weights(self, model_save_path, suffix=""):
        """
        Simple method to save network weights.
        """
        # actors
        torch.save(
            self.actor_local.state_dict(),
            os.path.join(model_save_path,
                         "weights_actor_local{:s}.pth".format(suffix)))
        torch.save(
            self.actor_target.state_dict(),
            os.path.join(model_save_path,
                         "weights_actor_target{:s}.pth".format(suffix)))
        # critics
        torch.save(
            self.critic_local.state_dict(),
            os.path.join(model_save_path,
                         "weights_critic_local{:s}.pth".format(suffix)))
        torch.save(
            self.critic_target.state_dict(),
            os.path.join(model_save_path,
                         "weights_critic_target{:s}.pth".format(suffix)))

    def load_weights(self, model_save_path, suffix=""):
        """
        Method to load network weights from saved files.
        """
        self.actor_local.load_state_dict(
            torch.load(
                os.path.join(model_save_path,
                             "weights_actor_local{:s}.pth".format(suffix))))
        self.actor_target.load_state_dict(
            torch.load(
                os.path.join(model_save_path,
                             "weights_actor_target{:s}.pth".format(suffix))))

        self.critic_local.load_state_dict(
            torch.load(
                os.path.join(model_save_path,
                             "weights_critic_local{:s}.pth".format(suffix))))
        self.critic_target.load_state_dict(
            torch.load(
                os.path.join(model_save_path,
                             "weights_critic_target{:s}.pth".format(suffix))))
Example #17
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 replay_memory,
                 random_seed=0,
                 nb_agent=20,
                 bs=128,
                 gamma=0.99,
                 tau=1e-3,
                 lr_actor=1e-4,
                 lr_critic=1e-4,
                 wd_actor=0,
                 wd_critic=0,
                 clip_actor=None,
                 clip_critic=None,
                 update_interval=20,
                 update_times=10):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.nb_agent = nb_agent
        self.bs = bs
        self.update_interval = update_interval
        self.update_times = update_times
        self.timestep = 0

        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.wd_critic = wd_critic
        self.wd_actor = wd_actor
        self.clip_critic = clip_critic
        self.clip_actor = clip_actor
        self.actor_losses = []
        self.critic_losses = []

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor,
                                          weight_decay=self.wd_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.wd_critic)

        # Noise process
        self.noise = OUNoise((self.nb_agent, action_size), random_seed)

        # Replay memory
        self.memory = replay_memory

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        #increment timestep
        self.timestep += 1

        # Save experience / reward
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if self.timestep % self.update_interval == 0:
            for i in range(self.update_times):
                if len(self.memory) > self.bs:
                    experiences = self.memory.sample(self.bs)
                    self.learn(experiences)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset_noise(self):
        self.noise.reset()

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if self.clip_critic:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                           self.clip_critic)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        if self.clip_actor:
            torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(),
                                           self.clip_actor)
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

        self.actor_losses.append(actor_loss.cpu().data.numpy())
        self.critic_losses.append(critic_loss.cpu().data.numpy())

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            (the interpolation parameter tau is taken from self.tau)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
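
# Hedged usage sketch (assumptions only): the Agent above receives an externally created
# replay buffer, so one buffer can be reused across experiments. The ReplayBuffer
# constructor shown here is a placeholder for whatever implementation the source uses;
# only the call pattern is illustrated.
#
# memory = ReplayBuffer(buffer_size=int(1e6), seed=0)  # assumed constructor
# agent = Agent(state_size=33, action_size=4, replay_memory=memory, nb_agent=20)
# actions = agent.act(states)  # states: np.ndarray of shape (nb_agent, state_size)
# agent.step(states, actions, rewards, next_states, dones)  # stores and (maybe) learns
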
class DDPG():
    """
    Reinforcement Learning agent that learns using DDPG.
    """

    def __init__(
            self,
            task,
            actor_params={},
            critic_params={},
            noise_params={},
            replay_memory_params={},
            algo_params = {}
            ):

        # Default Params
        default_actor_params = {'lr': .001}
        default_critic_params= {'lr': .001}
        default_noise_params= {'mu': 0, 'theta': .15, 'sigma': .2}
        default_replay_memory_params= {'buffer_size': 100000, 'batch_size': 64}
        default_algo_params = {'gamma': .99, 'tau': .1}

        # Final Params
        final_actor_params= {**default_actor_params, **actor_params}
        final_critic_params={**default_critic_params, **critic_params}
        final_noise_params={**default_noise_params, **noise_params}
        final_replay_memory_params={**default_replay_memory_params, **replay_memory_params, }
        final_algo_params = {**default_algo_params, **algo_params}

        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, final_actor_params)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, final_actor_params)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, final_critic_params)
        self.critic_target = Critic(self.state_size, self.action_size, final_critic_params)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.noise = OUNoise(
                self.action_size,
                final_noise_params['mu'],
                final_noise_params['theta'],
                final_noise_params['sigma']
                )

        # Replay memory
        self.batch_size = final_replay_memory_params['batch_size']
        self.memory = ReplayBuffer(
                final_replay_memory_params['buffer_size'],
                final_replay_memory_params['batch_size']
                )

        # Algorithm parameters
        self.gamma = final_algo_params['gamma']  # discount factor
        self.tau = final_algo_params['tau']      # for soft update of target parameters


    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state


    def step(self, action, reward, next_state, done):
         # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state


    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration


    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)


    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
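# --- Hedged usage sketch (not part of the original example) ---
# How the task-based agent above is typically driven, assuming a `task` object
# with the Udacity-style API `next_state, reward, done = task.step(action)`
# and an already-constructed `agent` instance of the class above; both names
# and the episode count are assumptions for illustration only.
num_episodes = 500
for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()                     # resets noise and task
    episode_reward = 0.0
    while True:
        action = agent.act(state)                     # noisy action from actor_local
        next_state, reward, done = task.step(action)  # assumed task API
        agent.step(action, reward, next_state, done)  # store + learn once buffer is warm
        state = next_state
        episode_reward += reward
        if done:
            break
    print('\rEpisode {:4d}  reward: {:8.3f}'.format(i_episode, episode_reward), end='')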
Example #19
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, seed=0, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC, gamma=GAMMA, checkpoint_path='./checkpoints/', pretrained=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.gamma = gamma
        self.checkpoint_path = checkpoint_path

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)
        
        # If pretrained, load weights
        if pretrained:
            actor_dict = torch.load(os.path.join(self.checkpoint_path,'checkpoint_actor.pth'))
            critic_dict = torch.load(os.path.join(self.checkpoint_path,'checkpoint_critic.pth'))
            self.actor_local.load_state_dict(actor_dict)
            self.actor_target.load_state_dict(actor_dict)
            self.critic_local.load_state_dict(critic_dict)
            self.critic_target.load_state_dict(critic_dict)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device)
    
    def step(self, state, action, reward, next_state, done, tstep=LEARN_EVERY+1):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and tstep % LEARN_EVERY == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences)
            
    def train(self, env, n_episodes=1000):
        """Deep Deterministic Policy Gradient (DDPG) Learning.

        Params
        ======
            env (UnityEnvironment): Unity environment
            n_episodes (int): maximum number of training episodes
        """
        # create checkpoints folder if necessary
        if not os.path.exists(self.checkpoint_path): os.makedirs(self.checkpoint_path)
        # get the default brain
        brain_name = env.brain_names[0]
        env_info = env.reset(train_mode=True)[brain_name]
        num_agents = len(env_info.agents)
        # last 100 scores
        scores_deque = deque(maxlen=100)
        # list containing scores from each episode
        all_scores = []
        # list containing window averaged scores
        avg_scores = []
        # for each episode
        for i_episode in range(1, n_episodes+1):
            # reset environment
            env_info = env.reset(train_mode=True)[brain_name]
            states = env_info.vector_observations
            # reset noise
            self.reset()
            scores = np.zeros(num_agents) 
            # for each timepoint
            t=0
            while True:
                # agent action
                actions = self.act(states)
                # get the next state
                env_info = env.step(actions)[brain_name]
                next_states = env_info.vector_observations
                # get the reward
                rewards = env_info.rewards
                # see if episode has ended
                dones = env_info.local_done
                # step
                for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                    self.step(state, action, reward, next_state, done, t)
                states = next_states
                scores += rewards
                t+=1
                if np.any(dones):
                    break 
            # save most recent score
            max_score = np.max(scores)
            scores_deque.append(max_score)
            all_scores.append(max_score)
            avg_scores.append(np.mean(scores_deque))
            print('\rEpisode {}\tMax Score: {:.2f}\tAverage Score: {:.2f}'.format(i_episode, max_score, np.mean(scores_deque)), end="")
            if i_episode % 50 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
            if np.mean(scores_deque)>=0.5:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_deque)))
                torch.save(self.actor_local.state_dict(), self.checkpoint_path+'checkpoint_actor.pth')
                torch.save(self.critic_local.state_dict(), self.checkpoint_path+'checkpoint_critic.pth')
                break
            
        return all_scores, avg_scores

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)                     

        self.reset()
        
    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
            
    def play(self, env, n_episodes=5):
        """Play a few episodes with trained agents.

        Params
        ======
            env (UnityEnvironment): Unity environment
            n_episodes (int): maximum number of training episodes
        """
        # get the default brain
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]

        # reset the environment
        env_info = env.reset(train_mode=False)[brain_name]
        num_agents = len(env_info.agents)
        action_size = brain.vector_action_space_size
        state_size = env_info.vector_observations.shape[1]

        # for each episode
        for i_episode in range(1, n_episodes+1):
            env_info = env.reset(train_mode=False)[brain_name]
            states = env_info.vector_observations
            self.reset() # set the noise to zero
            score = np.zeros(num_agents)
            while(True):
                actions = self.act(states, add_noise=False)
                env_info = env.step(actions)[brain_name]
                # get the next states
                next_states = env_info.vector_observations             
                # get the rewards
                rewards = env_info.rewards                             
                # see if the episode has finished for any agent
                dones = env_info.local_done                            

                self.step(states, actions, rewards, next_states, dones)
                states = next_states
                score += rewards
                if np.any(dones):
                    break

            print('Best Score:', np.max(score))    
        env.close()
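# --- Hedged usage sketch (not part of the original example) ---
# Driving the Agent above end-to-end.  The unityagents import, the Tennis
# environment file name, and the 24/2 state/action sizes are assumptions for
# illustration; adjust them to the actual environment build.
from unityagents import UnityEnvironment

env = UnityEnvironment(file_name='Tennis.app')        # assumed path
agent = Agent(state_size=24, action_size=2, seed=0)   # assumed Tennis sizes
all_scores, avg_scores = agent.train(env, n_episodes=2000)
agent.play(env, n_episodes=3)                         # greedy rollout of the trained policy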
Example #20
class Agent(object):
    def __init__(self, task, hp):

        self.task = task
        self.nb_states = task.state_size
        self.nb_actions = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # why not use bits to represent a continuous action space as discrete actions :)
        self.action_bits = 8  # np.floor( np.log2(self.action_range) + 1 )
        self.action_size = (self.nb_actions * self.action_bits
                            )  #.astype(np.int)
        self.action_factor = self.action_high / 2**self.action_bits

        self.use_cuda = 1 if hp['USE_CUDA'] is True else 0

        if int(hp['SEED']) > 0:
            self.seed(hp['SEED'])

        self.buffer_size = hp['EXP_BUFFER_SIZE']
        self.batch_size = hp['EXP_BATCH_SIZE']

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': hp['HIDDEN1'],
            'hidden2': hp['HIDDEN2'],
            'init_w': hp['INIT_W']
        }

        self.actor = Actor(self.nb_states, self.action_size, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.action_size, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=hp['ACTOR_LR'])

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=hp['CRITIC_LR'])

        self.hard_copy(self.actor, self.actor_target)
        self.hard_copy(self.critic, self.critic_target)

        # Create experience memory buffer
        self.memory = ExperienceMemory(self.buffer_size, self.batch_size)

        # Initialize the Ornstein-Uhlenbeck exploration noise process
        self.random_process = OUNoise(size=self.nb_actions,
                                      theta=hp['OU_THETA'],
                                      mu=hp['OU_MU'],
                                      sigma=hp['OU_SIGMA'])
        self.ou_decay = hp['OU_DECAY']

        # Hyper-parameters
        #self.batch_size = hp.BATCH_SIZE
        self.tau = hp['TAU']
        self.discount = hp['DISCOUNT']
        #self.depsilon = 1.0 / args.epsilon

        #
        #self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        # nvidia
        if hp['USE_CUDA']:
            self.cuda()

    def hard_copy(self, source, target):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)

    def soft_update(self, source, target):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                    param.data * self.tau)

    def update_policy(self):

        # Get Sample batches
        state_batch, action_batch, reward_batch, \
        next_state_batch, terminal_batch = self.memory.batch_samples(self.batch_size)

        #state_batch = state_batch / 360
        #next_state_batch = next_state_batch / 360
        #print(action_batch)

        ###########################################
        # Prepare for the target q batch
        with torch.no_grad():  # no grad calc
            next_actions = []
            for action in self.actor_target(to_tensor(next_state_batch)):
                #print(action)
                action = to_numpy(action)
                next_actions.append(np.array(self.action_transform(action)))

            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                to_tensor(np.array(next_actions))
            ])

            # Q_targets = (rewards + self.gamma * Q_targets_next.reshape(len(experiences)) * (1 - dones))
            target_q_batch = to_tensor(reward_batch) + \
                self.discount*to_tensor(1 - terminal_batch.astype(np.float32))*next_q_values

        ############################################
        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        #print("vloss:",value_loss)
        value_loss.backward()
        self.critic_optim.step()

        ##############################################
        # Actor update
        self.actor.zero_grad()

        next_actions = []
        for action in self.actor_target(to_tensor(state_batch)):
            #print(action)
            action = to_numpy(action)
            next_actions.append(np.array(self.action_transform(action)))

        policy_loss = -self.critic(
            [to_tensor(state_batch),
             to_tensor(np.array(next_actions))])

        policy_loss = policy_loss.mean()
        #print("ploss:",policy_loss)
        policy_loss.backward()
        self.actor_optim.step()

        ###############################################
        # Target update
        self.soft_update(self.actor, self.actor_target)
        self.soft_update(self.critic, self.critic_target)

        return None, None  #value_loss.detach().squeeze().numpy() ,policy_loss.detach().squeeze().numpy()

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        self.is_training = False

    def action_transform(self, action):
        # this depends on our output activation function
        action[action <= 0.] = 0
        action[action > 0.] = 1
        action = np.array(np.split(action, self.nb_actions)).astype(bool)
        action = np.packbits(action).astype(float)  #, axis=-1)
        action = action * self.action_factor
        return action

    def cuda(self):
        self.use_cuda = 1
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.a_t = action
        self.observe(reward, next_state, done)

        # If we got our minibatch of experience memories..
        # learn from them and slowly change belief :)
        aloss = None
        ploss = None
        if len(self.memory) > self.batch_size:
            aloss, ploss = self.update_policy()
        return aloss, ploss

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.add(self.s_t, self.a_t, r_t, s_t1, done)
            self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(0., 900., self.nb_actions)
        self.a_t = action
        return action

    def act(self, s_t, i_episode=0, decay_epsilon=True):

        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)

        #action = (action * self.action_range) + self.action_low
        action = self.action_transform(action)
        #np.packbits(a, axis=-1)

        if (self.ou_decay != 0):
            decay = 1 - (i_episode * self.ou_decay)
            #print(action, decay)
            action += self.is_training * decay * self.random_process.sample()

        self.a_t = action
        return action

    def reset(self):
        self.s_t = self.task.reset()
        self.random_process.reset()
        return self.s_t

    def load_weights(self, output):
        if output is None: return

        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))

        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if self.use_cuda:
            torch.cuda.manual_seed(s)
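# --- Hedged illustration (not part of the original example) ---
# What action_transform above does, shown on a toy vector: the actor emits
# nb_actions * action_bits real values, each value is thresholded into a bit,
# the bits of each action dimension are packed into one unsigned integer, and
# the integer is scaled by action_factor into the continuous action range.
# The numbers below are made up for illustration.
import numpy as np

nb_actions, action_bits, action_high = 2, 8, 900.0
action_factor = action_high / 2 ** action_bits             # 900 / 256 = 3.515625

raw = np.random.randn(nb_actions * action_bits)            # stand-in for the actor output
bits = (raw > 0.0)                                         # threshold -> one bit per unit
groups = np.array(np.split(bits, nb_actions))              # one row of bits per action
ints = np.packbits(groups, axis=-1).astype(float).ravel()  # one value in 0..255 per action
action = ints * action_factor                              # continuous values in [0, ~900)
print(action.shape, action)                                # (2,) and two scaled values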
Example #21
class DDPGAgent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """ Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # for MADDPG
        self.num_agents = num_agents

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)
        self.eps = EPS_START
        self.eps_decay = 1 / (EPS_EP_END * LEARN_NUM)
        self.timestep = 0

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done, agent_number):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.timestep += 1
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        # Learn, if enough samples are available in memory and at learning interval settings
        if len(self.memory) > BATCH_SIZE and self.timestep % LEARN_EVERY == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, agent_number)

    def act(self, states, add_noise):
        """Returns actions for both agents as per current policy, given their respective states."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            # For MADDPG: get action for each agent and concatenate them
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using given batch of experience tuples.
            Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
            where:
                actor_target(state) -> action
                critic_target(state, action) -> Q-value
            Params
            ======
                experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        # Construct next actions vector relative to the agent
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
        # Compute Q targets for current states (y_i)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # For MADDPG: Construct action vector for each agent
        actions_pred = self.actor_local(states)
        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        # Compute actor loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # update noise decay parameter
        #self.eps -= self.eps_decay
        #self.eps = max(self.eps, EPS_FINAL)
        #self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters."""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_checkpoint(self, agent_number, filename='checkpoint'):
        checkpoint = {
            'action_size': self.action_size,
            'state_size': self.state_size,
            'actor_state_dict': self.actor_local.state_dict(),
            'critic_state_dict': self.critic_local.state_dict()
        }
        filepath = filename + '_' + str(agent_number) + '.pth'
        torch.save(checkpoint, filepath)
        print(filepath + ' successfully saved.')

    def load_checkpoint(self, agent_number, filename='checkpoint'):
        filepath = filename + '_' + str(agent_number) + '.pth'
        checkpoint = torch.load(filepath)
        state_size = checkpoint['state_size']
        action_size = checkpoint['action_size']
        self.actor_local = Actor(state_size, action_size, seed=42).to(device)
        self.critic_local = Critic(state_size, action_size, seed=42).to(device)
        self.actor_local.load_state_dict(checkpoint['actor_state_dict'])
        self.critic_local.load_state_dict(checkpoint['critic_state_dict'])
        print(filepath + ' successfully loaded.')
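# --- Hedged illustration (not part of the original example) ---
# How DDPGAgent.learn() above assembles the joint action vector for the shared
# critic, assuming the two-agent Tennis setup its slicing implies (2 action
# columns per agent, 4 in total).  The tensors below are dummies.
import torch

batch_size = 3
actions = torch.zeros(batch_size, 4)        # stored joint actions [agent0 | agent1]
actions_next = torch.ones(batch_size, 2)    # this agent's fresh target-policy actions

agent_number = 0
if agent_number == 0:
    joint = torch.cat((actions_next, actions[:, 2:]), dim=1)  # replace agent 0's slice
else:
    joint = torch.cat((actions[:, :2], actions_next), dim=1)  # replace agent 1's slice
print(joint.shape)                          # torch.Size([3, 4])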
class DDPG():
    """ This is an Individual DDPG Agent """

    def __init__(self, state_size, action_size, seed):
        """ Initialize a DDPG Agent Object
        :param state_size: dimension of state (input) for this decentralized actor
        :param action_size: dimension of action (output) for this decentralized actor
        :param seed: random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Hyperparameters
        self.buffer_size = 100000
        self.batch_size = 256
        self.gamma = 0.99
        self.tau = 0.01
        self.lr_actor = 0.0001
        self.lr_critic = 0.001

        # Setup Networks (Actor: State -> Action, Critic: (States for all agents, Actions for all agents) -> Value)
        self.actor_local = Actor(self.state_size, self.action_size,  self.seed).to(self.device)
        self.actor_target = Actor(self.state_size, self.action_size, self.seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr = self.lr_actor)
        self.critic_local = Critic(self.state_size, self.action_size, self.seed).to(self.device)
        self.critic_target = Critic(self.state_size, self.action_size, self.seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr = self.lr_critic)

        # Initialize local and target networks to start with the same parameters
        self.soft_update(self.actor_local, self.actor_target, tau=1)
        self.soft_update(self.critic_local, self.critic_target, tau=1)

        # Noise Setup
        self.noise = OUNoise(self.action_size, self.seed)

        # Replay Buffer Setup
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    def __str__(self):
        return "DDPG_Agent"

    def reset_noise(self):
        """ resets to noise parameters """
        self.noise.reset()

    def act(self, state, epsilon, add_noise=True):
        """ Returns actions for given states as per current policy. Policy comes from the actor network.
        :param state: observations for this individual agent
        :param epsilon: probability of exploration
        :param add_noise: bool on whether or not to potentially have exploration for action
        :return: clipped actions
        """
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise and epsilon > np.random.random():
            actions += self.noise.sample()
        return np.clip(actions, -1,1)

    def step(self):
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

    def learn(self, experiences):
        """ Update actor and critic networks using a given batch of experiences
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(states) -> actions
            critic_target(states, actions) -> Q-value
        :param experiences: tuple of arrays (states, actions, rewards, next_states, dones)  sampled from the replay buffer
        """

        states, actions, rewards, next_states, dones = experiences
        # -------------------- Update Critic -------------------- #
        # Use target networks for getting next actions and q values and calculate q_targets
        next_actions = self.actor_target(next_states)
        next_q_targets = self.critic_target(next_states, next_actions)
        q_targets = rewards + (self.gamma * next_q_targets * (1 - dones))
        # Compute critic loss (Same as DQN Loss)
        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)
        # Minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # -------------------- Update Actor --------------------- #
        # Compute actor loss (maximize mean of Q(states,actions))
        action_preds = self.actor_local(states)
        # Optimizer minimizes and we want to maximize so multiply by -1
        actor_loss = -1 * self.critic_local(states, action_preds).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ---------------- Update Target Networks ---------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_network, target_network, tau):
        """ soft update newtwork parametes
        θ_target = τ*θ_local + (1 - τ)*θ_target
        :param local_network: PyTorch Network that is always up to date
        :param target_network: PyTorch Network that is not up to date
        :param tau: update (interpolation) parameter
        """
        for target_param, local_param in zip(target_network.parameters(), local_network.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
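# --- Hedged sketch (not part of the original examples) ---
# OUNoise is used throughout this file but not defined in this excerpt.  A
# typical Ornstein-Uhlenbeck process compatible with the PyTorch-style
# constructor calls above (OUNoise(size, seed) with optional mu/theta/sigma)
# might look like the following; the Keras-style examples pass
# (size, mu, theta, sigma) positionally, so treat the exact signature as an
# assumption, not the authors' original helper.
import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed=0, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.RandomState(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as noise."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * self.rng.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state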
Example #23
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.name = "DDPG"
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 'actor_local')
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  'actor_target')

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   'critic_local')
        self.critic_target = Critic(self.state_size, self.action_size,
                                    'critic_target')

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # Reward counter
        self.total_reward = 0
        self.n_steps = 0

    def load(self):
        self.actor_local.load()
        self.actor_target.load()
        self.critic_local.load()
        self.critic_target.load()
        print("Agent's weights loaded from disk.")

    def save(self):
        self.actor_local.save()
        self.actor_target.save()
        self.critic_local.save()
        self.critic_target.save()
        print("Agent's weights saved to disk.")

    def reset_episode(self):
        self.total_reward = 0
        self.n_steps = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        # Add reward to total
        self.total_reward += reward
        self.n_steps += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, add_noise=True):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # Hack, rescale rotor revs to +-5 range from average
        # rev_mean = np.mean(action)
        # action = (action-450)/450
        # action *= 50
        # action += rev_mean

        if add_noise:
            action += self.noise.sample()  # additive noise for exploration
        return list(action)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
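# --- Hedged sketch (not part of the original example) ---
# The get_action_gradients / train_fn hooks used by the two Keras-style agents
# above are conventionally wired with keras.backend under the Keras 2.x /
# TensorFlow 1.x graph backend these examples appear to target.  The layer
# sizes and toy state/action dimensions below are assumptions for illustration;
# the actual Actor/Critic model code is not part of this excerpt.
from keras import layers, models, optimizers
from keras import backend as K

state_size, action_size = 3, 2   # toy sizes, illustration only

# Minimal critic mapping (states, actions) -> Q
states_in = layers.Input(shape=(state_size,), name='states')
actions_in = layers.Input(shape=(action_size,), name='actions')
net = layers.Concatenate()([states_in, actions_in])
net = layers.Dense(16, activation='relu')(net)
Q_values = layers.Dense(1, name='q_values')(net)
critic_model = models.Model(inputs=[states_in, actions_in], outputs=Q_values)
critic_model.compile(optimizer=optimizers.Adam(), loss='mse')

# dQ/da, exposed the way the agents call it: get_action_gradients([states, actions, 0])
action_gradients = K.gradients(Q_values, actions_in)
get_action_gradients = K.function(
    inputs=[*critic_model.input, K.learning_phase()],
    outputs=action_gradients)

# Minimal actor mapping states -> actions, trained by following dQ/da:
# train_fn([states, action_gradients, 1])
actor_states_in = layers.Input(shape=(state_size,))
actor_net = layers.Dense(16, activation='relu')(actor_states_in)
actions_out = layers.Dense(action_size, activation='tanh')(actor_net)
actor_model = models.Model(inputs=actor_states_in, outputs=actions_out)

action_grads_in = layers.Input(shape=(action_size,))
loss = K.mean(-action_grads_in * actions_out)
updates_op = optimizers.Adam().get_updates(
    params=actor_model.trainable_weights, loss=loss)
train_fn = K.function(
    inputs=[actor_model.input, action_grads_in, K.learning_phase()],
    outputs=[],
    updates=updates_op)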
Example #24
class DDPG_Agent:
    def __init__(self, ob_sp, act_sp, alow, ahigh, writer, args):
        self.args = args
        self.alow = alow
        self.ahigh = ahigh
        self.policy = Policy_net(ob_sp, act_sp)
        self.policy_targ = Policy_net(ob_sp, act_sp)
        self.qnet = Q_net(ob_sp, act_sp)
        self.qnet_targ = Q_net(ob_sp, act_sp)

        self.policy.to(device)
        self.qnet.to(device)
        self.policy_targ.to(device)
        self.qnet_targ.to(device)
        self.MSE_loss = nn.MSELoss()
        self.noise = OUNoise(1, 1)

        hard_update(self.policy_targ, self.policy)
        hard_update(self.qnet_targ, self.qnet)

        self.p_optimizer = optim.Adam(self.policy.parameters(), lr=LR)
        self.q_optimizer = optim.Adam(self.qnet.parameters(), lr=LR)
        self.memory = ReplayMemory(int(1e6))
        self.epsilon_scheduler = LinearSchedule(E_GREEDY_STEPS,
                                                FINAL_STD,
                                                INITIAL_STD,
                                                warmup_steps=WARMUP_STEPS)
        self.n_steps = 0
        self.n_updates = 0
        self.writer = writer

    def get_action(self, state):
        if self.args.use_ounoise:
            noise = self.noise.sample()[0]
        else:
            noise = np.random.normal(
                0, self.epsilon_scheduler.value(self.n_steps))
        st = torch.from_numpy(state).view(1, -1).float()
        action = self.policy(st)
        action_with_noise = np.clip(action.item() + noise, self.alow,
                                    self.ahigh)
        if self.args.use_writer:
            self.writer.add_scalar("action mean", action.item(), self.n_steps)
            self.writer.add_scalar("action noise", noise, self.n_steps)
            self.writer.add_scalar("epsilon",
                                   self.epsilon_scheduler.value(self.n_steps),
                                   self.n_steps)
            self.writer.add_scalar("action", action_with_noise, self.n_steps)
        self.n_steps += 1
        return action_with_noise

    def store_transition(self, state, action, reward, next_state, done):

        self.memory.push(torch.from_numpy(state), torch.tensor(action),
                         torch.tensor(reward), torch.from_numpy(next_state),
                         torch.tensor(done))

    def reset(self):
        self.noise.reset()

    def train(self):
        batch = self.memory.sample(min(BATCH_SIZE, len(self.memory)))
        b_dict = [torch.stack(elem) for elem in Transition(*zip(*batch))]
        states, actions, rewards, next_states, dones = \
            b_dict[0], b_dict[1].view(-1, 1), \
            b_dict[2].view(-1, 1).float().to(device), b_dict[3], \
            b_dict[4].view(-1, 1).float().to(device)

        #  CRITIC LOSS: Q(s, a) += (r + gamma*Q'(s, π'(s)) - Q(s, a))
        # inputs computation
        inputs_critic = self.qnet(states, actions)
        # targets
        with torch.no_grad():
            policy_acts = self.policy_targ(next_states)
        targ_values = self.qnet_targ(next_states, policy_acts)
        targets_critics = rewards + GAMMA * (1 - dones) * targ_values
        loss_critic = self.MSE_loss(inputs_critic, targets_critics)
        self.q_optimizer.zero_grad()
        loss_critic.backward()
        # nn.utils.clip_grad_norm_(self.qnet.parameters(), GRAD_CLIP)
        self.q_optimizer.step()

        # ACTOR objective: derivative of Q(s, π(s | ø)) with respect to ø
        actor_loss = -self.qnet(states, self.policy(states)).mean()
        self.p_optimizer.zero_grad()
        actor_loss.backward()
        # nn.utils.clip_grad_norm_(self.policy.parameters(), GRAD_CLIP)
        self.p_optimizer.step()
        soft_update(self.policy_targ, self.policy, TAU)
        soft_update(self.qnet_targ, self.qnet, TAU)
        if self.args.use_writer:
            self.writer.add_scalar("critic_loss", loss_critic.item(),
                                   self.n_updates)
            self.writer.add_scalar("actor_loss", actor_loss.item(),
                                   self.n_updates)
        self.n_updates += 1
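# --- Hedged sketch (not part of the original example) ---
# Example #24 above calls module-level hard_update(target, source) and
# soft_update(target, source, tau) helpers that are not defined in this
# excerpt.  A conventional definition (target-first argument order, matching
# the calls in Example #24) is sketched here; Example #25 appears to call
# soft_update with the source network first, so the actual helper in that
# project may use the opposite order.
def hard_update(target, source):
    """Copy parameters from the source network into the target network."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update(target, source, tau):
    """Polyak-average source parameters into target: θ_t ← τ·θ_s + (1-τ)·θ_t."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)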
Example #25
File: ddpg.py  Project: juandd18/Tenis
class DDPGAgent(object):
    """
    General class for DDPG agents (policy, critic, target policy, target
    critic, exploration noise)
    """
    def __init__(self, num_in_pol, num_out_pol, num_in_critic,
                 hidden_dim_actor=120, hidden_dim_critic=64,
                 lr_actor=0.01, lr_critic=0.01, batch_size=64,
                 max_episode_len=100, tau=0.02, gamma=0.99,
                 agent_name='one', discrete_action=False):
        """
        Inputs:
            num_in_pol (int): number of dimensions for policy input
            num_out_pol (int): number of dimensions for policy output
            num_in_critic (int): number of dimensions for critic input
        """
        self.policy = Actor(num_in_pol, num_out_pol,
                                 hidden_dim=hidden_dim_actor,
                                 discrete_action=discrete_action)
        self.critic = Critic(num_in_pol, 1,num_out_pol,
                                 hidden_dim=hidden_dim_critic)
        self.target_policy = Actor(num_in_pol, num_out_pol,
                                        hidden_dim=hidden_dim_actor,
                                        discrete_action=discrete_action)
        self.target_critic = Critic(num_in_pol, 1,num_out_pol,
                                        hidden_dim=hidden_dim_critic)
        hard_update(self.target_policy, self.policy)
        hard_update(self.target_critic, self.critic)
        self.policy_optimizer = Adam(self.policy.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic,weight_decay=0)
        
        self.policy = self.policy.float()
        self.critic = self.critic.float()
        self.target_policy = self.target_policy.float()
        self.target_critic = self.target_critic.float()

        self.agent_name = agent_name
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        #self.replay_buffer = ReplayBuffer(1e7)
        self.replay_buffer = ReplayBufferOption(500000,self.batch_size,12)
        self.max_replay_buffer_len = batch_size * max_episode_len
        self.replay_sample_index = None
        self.niter = 0
        self.eps = 5.0
        self.eps_decay = 1/(250*5)

        self.exploration = OUNoise(num_out_pol)
        self.discrete_action = discrete_action

        self.num_history = 2
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        self.dones = []

    def reset_noise(self):
        if not self.discrete_action:
            self.exploration.reset()

    def scale_noise(self, scale):
        if self.discrete_action:
            self.exploration = scale
        else:
            self.exploration.scale = scale

    def act(self, obs, explore=False):
        """
        Take a step forward in environment for a minibatch of observations
        Inputs:
            obs : Observations for this agent
            explore (boolean): Whether or not to add exploration noise
        Outputs:
            action (PyTorch Variable): Actions for this agent
        """
        #obs = obs.reshape(1,48)
        state = Variable(torch.Tensor(obs),requires_grad=False)

        self.policy.eval()
        with torch.no_grad():
            action = self.policy(state)
        self.policy.train()
        # continuous action
        if explore:
            action += Variable(Tensor(self.eps * self.exploration.sample()),requires_grad=False)
            action = torch.clamp(action, min=-1, max=1)
        return action

    def step(self, agent_id, state, action, reward, next_state, done,t_step):
        
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.next_states.append(next_state)
        self.dones.append(done)

        #self.replay_buffer.add(state, action, reward, next_state, done)
        if t_step % self.num_history == 0:
            # Save experience / reward
            
            self.replay_buffer.add(self.states, self.actions, self.rewards, self.next_states, self.dones)
            self.states = []
            self.actions = []
            self.rewards = []
            self.next_states = []
            self.dones = []

        # Learn, if enough samples are available in memory
        if len(self.replay_buffer) > self.batch_size:
            
            obs, acs, rews, next_obs, don = self.replay_buffer.sample()     
            self.update(agent_id ,obs,  acs, rews, next_obs, don,t_step)
        


    def update(self, agent_id, obs, acs, rews, next_obs, dones ,t_step, logger=None):
    
        obs = torch.from_numpy(obs).float()
        acs = torch.from_numpy(acs).float()
        rews = torch.from_numpy(rews[:,agent_id]).float()
        next_obs = torch.from_numpy(next_obs).float()
        dones = torch.from_numpy(dones[:,agent_id]).float()

        acs = acs.view(-1,2)
                
        # --------- update critic ------------ #        
        self.critic_optimizer.zero_grad()
        
        all_trgt_acs = self.target_policy(next_obs) 
    
        target_value = (rews + self.gamma *
                        self.target_critic(next_obs,all_trgt_acs) *
                        (1 - dones)) 
        
        actual_value = self.critic(obs,acs)
        vf_loss = MSELoss(actual_value, target_value.detach())

        # Minimize the loss
        vf_loss.backward()
        #torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optimizer.step()

        # --------- update actor --------------- #
        self.policy_optimizer.zero_grad()

        if self.discrete_action:
            curr_pol_out = self.policy(obs)
            curr_pol_vf_in = gumbel_softmax(curr_pol_out, hard=True)
        else:
            curr_pol_out = self.policy(obs)
            curr_pol_vf_in = curr_pol_out


        pol_loss = -self.critic(obs,curr_pol_vf_in).mean()
        #pol_loss += (curr_pol_out**2).mean() * 1e-3
        pol_loss.backward()

        torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 1)
        self.policy_optimizer.step()

        self.update_all_targets()
        self.eps -= self.eps_decay
        self.eps = max(self.eps, 0)
        

        if logger is not None:
            logger.add_scalars('agent%s/losses' % self.agent_name,
                               {'vf_loss': vf_loss,
                                'pol_loss': pol_loss},
                               self.niter)

    def update_all_targets(self):
        """
        Update all target networks (called after normal updates have been
        performed for each agent)
        """
        
        soft_update(self.critic, self.target_critic, self.tau)
        soft_update(self.policy, self.target_policy, self.tau)
   
    def get_params(self):
        return {'policy': self.policy.state_dict(),
                'critic': self.critic.state_dict(),
                'target_policy': self.target_policy.state_dict(),
                'target_critic': self.target_critic.state_dict(),
                'policy_optimizer': self.policy_optimizer.state_dict(),
                'critic_optimizer': self.critic_optimizer.state_dict()}

    def load_params(self, params):
        self.policy.load_state_dict(params['policy'])
        self.critic.load_state_dict(params['critic'])
        self.target_policy.load_state_dict(params['target_policy'])
        self.target_critic.load_state_dict(params['target_critic'])
        self.policy_optimizer.load_state_dict(params['policy_optimizer'])
        self.critic_optimizer.load_state_dict(params['critic_optimizer'])
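# --- Hedged sketch (not part of the original examples) ---
# The Keras-style agents above and AgentDDPG below construct
# ReplayBuffer(buffer_size, batch_size) and read namedtuple fields
# (e.state, e.action, ...) from sample().  A minimal buffer compatible with
# that usage is sketched here as an assumption; the PyTorch agents use a
# differently parameterised buffer that is not shown in this excerpt.
import random
from collections import deque, namedtuple

class ReplayBuffer:
    """Fixed-size buffer of experience tuples."""

    Experience = namedtuple('Experience',
                            ['state', 'action', 'reward', 'next_state', 'done'])

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        """Store one experience tuple."""
        self.memory.append(self.Experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        """Return a random batch of experience namedtuples."""
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)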
class AgentDDPG:
    def __init__(self, state_size, action_size, seed):
        """

        :state_size: size of the state vector
        :action_size: size of the action vector
        """

        self.state_size = state_size
        self.action_size = action_size
        self.t_step = 0
        self.score = 0.0
        self.best = 0.0
        self.seed = seed
        self.total_reward = 0.0
        self.count = 0
        self.learning_rate_actor = 0.0001
        self.learning_rate_critic = 0.001
        self.batch_size = 128
        self.update_every = 1

        # Instances of the policy function or actor and the value function or critic
        # Actor critic with Advantage

        # Actor local and target network definitions
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.seed).to(device)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.seed).to(device)

        # Critic local and target
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.seed).to(device)

        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.seed).to(device)
        # Actor Optimizer
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.learning_rate_actor)

        # Critic Optimizer
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.learning_rate_critic)

        # Make sure local and target start with the same weights
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Initialize the Ornstein-Uhlenbeck noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Initialize the Replay Memory
        self.buffer_size = 1000000
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Parameters for the Algorithm
        self.gamma = 0.99  # Discount factor
        self.tau = 0.001  # Soft update for target parameters Actor Critic with Advantage

    # The actor interacts with the environment through the step method
    def step(self, state, action, reward, next_state, done):
        # Add to the total reward the reward of this time step
        self.total_reward += reward
        # Increase your count based on the number of rewards
        # received in the episode
        self.count += 1
        # Stored experience tuple in the replay buffer
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_times time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:

            # Check to see if you have enough to produce a batch
            # and learn from it

            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                # Train the networks using the experiences
                self.learn(experiences)

        # Roll over last state action (not needed)
        # self.last_state = next_state

    # Actor determines what to do based on the policy
    def act(self, state):
        # Given a state return the action recommended by the policy
        # Reshape the state to fit the torch tensor input
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Pass the state to the actor local model to get the action
        # recommended by the policy for this state
        # set the actor_local model to predict, not to train
        self.actor_local.eval()
        # set the model so this operation is not counted in the
        # gradient calculation.
        with torch.no_grad():
            actions = self.actor_local(state)
        # set the model back to training mode
        self.actor_local.train()

        # Because we are exploring we add some noise to the
        # action vector
        return list(actions.detach().numpy().reshape(4, ) +
                    self.noise.sample())

    # This is the Actor learning logic called when the agent
    # takes a step to learn
    def learn(self, experiences):
        """
        Learning means that the network parameters need to be updated
        using the experiences batch.
        The network learns from experiences, not from interaction with the
        environment.
        """

        # Reshape the experience tuples into separate arrays of states, actions,
        # rewards, next_states, dones
        # You are converting every member of the tuple into a column or vector
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Now reshape the numpy arrays for states, actions and next_states to torch tensors
        # rewards and dones do not need to be tensors.
        states = torch.from_numpy(states).float().unsqueeze(0).to(device)
        actions = torch.from_numpy(actions).float().unsqueeze(0).to(device)
        next_states = torch.from_numpy(next_states).float().unsqueeze(0).to(
            device)

        # First we pass a batch of next states to the actor so it tells us what actions
        # to execute; we use the actor target network instead of the actor local network
        # because of the advantage principle

        # Set the target network to evaluation mode because this is not part of training;
        # this model's weights are altered by a soft update, not by an optimizer
        self.actor_target.eval()
        with torch.no_grad():
            next_state_actions = self.actor_target(next_states).detach()
        self.actor_target.train()

        # The critic evaluates the actions taken by the actor in the next state and generates the
        # Q(s,a) value of the next state taking those actions. These (action, next_state) pairs come from the
        # ReplayBuffer, not from interacting with the environment.
        # Remember that the Critic (or Q-value function) takes states and actions as inputs.
        # We calculate the q_targets of the next state. We will use this to calculate the current
        # state q_value using the Bellman equation.

        # Set the target network to eval mode because this is not part of the training; this
        # model's weights are altered by a soft update, not by an optimizer.
        self.critic_target.eval()
        with torch.no_grad():
            q_targets_next_state_action_values = self.critic_target(
                next_states, next_state_actions).detach()
        self.critic_target.train()

        # q_targets_next_state_action_values holds the Q(s',a') values for the randomly sampled
        # next_states from the replay buffer. From them we calculate the CURRENT state target Q(s,a)
        # using the one-step TD equation and the next-state values from the critic_target net.
        # For terminal states the bootstrapped term is zeroed, so the target is just the reward.
        # These are the target values used to train the critic_local model in a supervised-learning fashion.
        q_targets = torch.from_numpy(
            rewards +
            self.gamma * q_targets_next_state_action_values.cpu().numpy() *
            (1 - dones)).float().to(device)

        # --- Optimize the local Critic Model ----#

        # Here we start the supervised training of the critic_local network:
        # we pass a batch of (state, action) samples and it produces the expected
        # q_value of each pair.
        q_expected = self.critic_local(states, actions)

        # Clear grad buffer values in preparation.
        self.critic_optimizer.zero_grad()

        # Loss function for the critic_local model: smooth L1 (Huber) loss between
        # the q_expected values and the q_target values.
        critic_loss = F.smooth_l1_loss(q_expected, q_targets)
        critic_loss.backward(retain_graph=True)

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)

        # optimize the critic_local model using the optimizer defined for the critic
        # In the init function of this class
        self.critic_optimizer.step()

        # --- Optimize the local Actor Model ---#

        # Get the actor actions using the experience buffer states
        actor_actions = self.actor_local(states)

        # Use as the loss the negative sum of the q_values produced by the critic_local model for the
        # actions the actor_local model outputs on the sampled buffer states.
        loss_actor = -1 * torch.sum(
            self.critic_local(states, actor_actions))

        # Set the model gradients to zero in preparation
        self.actor_optimizer.zero_grad()

        # Back propagate
        loss_actor.backward()

        # optimize the actor_local model using the optimizer defined for the actor
        # In the init function of this class
        self.actor_optimizer.step()

        # Soft-update target models
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
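        """Soft-update the target model parameters:
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """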

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)

    def get_episode_score(self):
        """
        Calculate the episode scores
        :return: None
        """
        # Update score and best score
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0
        if self.score > self.best:
            self.best = self.score

    def save_model_weights(self):
        torch.save(self.actor_local.state_dict(), './checkpoints.pkl')
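All of the agents in these examples draw exploration noise from an OUNoise helper that is not defined in this document. Below is a minimal sketch of such an Ornstein-Uhlenbeck process; the constructor signature and the default mu/theta/sigma values are assumptions based on how OUNoise(...) is called in the surrounding examples, not code taken from them.

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated, zero-mean exploration noise."""

    def __init__(self, size, seed=0, mu=0.0, theta=0.15, sigma=0.2):
        self.size = size
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        """Reset the internal noise state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as the noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * self.rng.standard_normal(self.size)
        self.state = x + dx
        return self.state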
Example #27
0
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size=int(1e5),
                 batch_size=256,
                 learn_every=1,
                 update_every=1,
                 gamma=0.99,
                 tau=0.02,
                 lr_actor=2e-4,
                 lr_critic=2e-3,
                 random_seed=None,
                 use_asn=True,
                 asn_kwargs={},
                 use_psn=False,
                 psn_kwargs={},
                 use_per=False,
                 restore=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.update_every = update_every
        self.learn_every = learn_every
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        # Keep track of how many times we've updated weights
        self.i_updates = 0
        self.i_step = 0
        self.use_asn = use_asn
        self.use_psn = use_psn
        self.use_per = use_per

        if random_seed is not None:
            random.seed(random_seed)

        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        if self.use_psn:
            self.actor_perturbed = Actor(state_size, action_size).to(device)
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)

        # restore networks if needed
        if restore is not None:
            checkpoint = torch.load(restore, map_location=device)
            self.actor_local.load_state_dict(checkpoint[0]['actor'])
            self.actor_target.load_state_dict(checkpoint[0]['actor'])
            if self.use_psn:
                self.actor_perturbed.load_state_dict(checkpoint[0]['actor'])
            self.critic_local.load_state_dict(checkpoint[0]['critic'])
            self.critic_target.load_state_dict(checkpoint[0]['critic'])

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic)

        # Hard copy weights from local to target networks
        policy_update(self.actor_local, self.actor_target, 1.0)
        policy_update(self.critic_local, self.critic_target, 1.0)

        # Noise process
        if self.use_asn:
            self.action_noise = OUNoise(action_size, **asn_kwargs)

        if self.use_psn:
            self.param_noise = ParameterSpaceNoise(**psn_kwargs)

        if self.use_per:
            self.buffer = PrioritizedExperienceReplay(buffer_size, batch_size,
                                                      random_seed)
        else:
            self.buffer = ExperienceReplay(buffer_size, batch_size,
                                           random_seed)

    def act(self, states, perturb_mode=True, train_mode=True):
        """Returns actions for given state as per current policy."""
        if not train_mode:
            self.actor_local.eval()
            if self.use_psn:
                self.actor_perturbed.eval()

        with torch.no_grad():
            states = torch.from_numpy(states).float().to(device)
            actor = self.actor_perturbed if (
                self.use_psn and perturb_mode) else self.actor_local
            actions = actor(states).cpu().numpy()[0]

        if train_mode:
            actions += self.action_noise.sample()

        self.actor_local.train()
        if self.use_psn:
            self.actor_perturbed.train()

        return np.clip(actions, -1, 1)

    def perturb_actor_parameters(self):
        """Apply parameter space noise to the perturbed actor model, for exploration"""
        policy_update(self.actor_local, self.actor_perturbed, 1.0)
        params = self.actor_perturbed.state_dict()
        for name in params:
            if 'ln' in name:
                # do not perturb layer-norm parameters
                continue
            param = params[name]
            noise = torch.randn(param.shape)
            if use_cuda:
                noise = noise.cuda()
            param += noise * self.param_noise.current_stddev

    def reset(self):
        self.action_noise.reset()
        if self.use_psn:
            self.perturb_actor_parameters()

    def step(self, experience, priority=0.0):
        self.buffer.push(experience)
        self.i_step += 1
        if len(self.buffer) > self.batch_size:
            if self.i_step % self.learn_every == 0:
                self.learn(priority)
            if self.i_step % self.update_every == 0:
                # soft update the target networks towards the local networks
                self.update()

    def learn(self, priority=0.0):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        if self.use_per:
            (states, actions, rewards, states_next,
             dones), batch_idx = self.buffer.sample(priority)
        else:
            states, actions, rewards, states_next, dones = self.buffer.sample()

        # Get predicted next-state actions and Q values from target models
        with torch.no_grad():
            actions_next = self.actor_target(states_next)
            Q_targets_next = self.critic_target(states_next, actions_next)
            Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # ---------------------------- update critic ---------------------------- #
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.smooth_l1_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_local.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_local.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        if self.use_per:
            Q_error = Q_expected - Q_targets
            new_deltas = torch.abs(Q_error.detach().squeeze(1)).cpu().numpy()
            self.buffer.update_deltas(batch_idx, new_deltas)

    def update(self):
        """soft update targets"""
        self.i_updates += 1
        policy_update(self.actor_local, self.actor_target, self.tau)
        policy_update(self.critic_local, self.critic_target, self.tau)

    def save_model(self, model_dir, session_name, i_episode, best):

        filename = os.path.join(
            model_dir,
            f'ddpg_{session_name}-EP_{i_episode}-score_{best:.3f}.pt')
        filename_best = os.path.join(model_dir, f'ddpg_{session_name}-best.pt')
        save_dict_list = []
        save_dict = {
            'actor': self.actor_local.state_dict(),
            'actor_optim_params': self.actor_optimizer.state_dict(),
            'critic': self.critic_local.state_dict(),
            'critic_optim_params': self.critic_optimizer.state_dict()
        }
        save_dict_list.append(save_dict)
        torch.save(save_dict_list, filename)
        copyfile(filename, filename_best)

    def postprocess(self, t_step):
        if self.use_psn and t_step > 0:
            perturbed_states, perturbed_actions, _, _, _ = self.buffer.tail(
                t_step)
            unperturbed_actions = self.act(np.array(perturbed_states), False,
                                           False)
            diff = np.array(perturbed_actions) - unperturbed_actions
            mean_diff = np.mean(np.square(diff), axis=0)
            dist = sqrt(np.mean(mean_diff))
            self.param_noise.adapt(dist)
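perturb_actor_parameters and postprocess above assume a ParameterSpaceNoise object exposing a current_stddev attribute and an adapt(distance) method, which is not defined in this example. A minimal sketch of the adaptive scheme from Plappert et al., "Parameter Space Noise for Exploration", that would satisfy this interface (the default values here are illustrative assumptions):

class ParameterSpaceNoise:
    """Adaptive stddev for the Gaussian perturbation of the actor weights."""

    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.2,
                 adaptation_coefficient=1.01):
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient
        self.current_stddev = initial_stddev

    def adapt(self, distance):
        """Grow or shrink the weight noise so the perturbed policy stays roughly
        desired_action_stddev away (in action space) from the unperturbed one."""
        if distance > self.desired_action_stddev:
            # Perturbed policy drifted too far: shrink the weight noise.
            self.current_stddev /= self.adaptation_coefficient
        else:
            # Perturbed policy is too close to the unperturbed one: grow the noise.
            self.current_stddev *= self.adaptation_coefficient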
Example #28
0
class DDPG:
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self,
                 state_shape,
                 action_shape,
                 batch_size=128,
                 gamma=0.995,
                 tau=0.005,
                 actor_lr=0.0001,
                 critic_lr=0.001,
                 use_layer_norm=True):

        self.state_shape = state_shape
        self.action_shape = action_shape
        self.num_actions = np.prod(self.action_shape)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = batch_size
        self.memory = ReplayBuffer(self.buffer_size, self.action_shape,
                                   self.state_shape)

        # Noise process
        self.noise = OUNoise(self.num_actions)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau  #soft update
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        #initialize
        self.models = Models(self.state_shape,
                             self.action_shape,
                             actor_lr=self.actor_lr,
                             critic_lr=self.critic_lr,
                             gamma=self.gamma,
                             use_layer_norm=use_layer_norm)
        self.initialize()
        self.saver = tf.train.Saver()
        self.current_path = os.getcwd()

        #initial episode vars
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0
        self.episode_num = 0

    def reset_episode_vars(self):
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0

    def step(self, state, reward, done):

        action = self.act(state)
        self.count += 1
        if self.last_state is not None and self.last_action is not None:
            self.total_reward += reward
            self.memory.add(self.last_state, self.last_action, reward, state,
                            done)
        if (len(self.memory) > self.batch_size):
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)
        self.last_state = state
        self.last_action = action
        if done:
            self.episode_num += 1
            eps_reward = self.total_reward
            print('Episode {}: total reward={:7.4f}, count={}'.format(
                self.episode_num, self.total_reward, self.count))
            # summary_str = self.sess.run(self.summary_ops, feed_dict={
            #     self.summary_vars[0]: eps_reward,
            #     self.summary_vars[1]: eps_reward / float(self.count)  # need to change to average Q
            # })
            # self.writer.add_summary(summary_str, self.episode_num)
            # self.writer.flush()
            self.reset_episode_vars()
            return action, eps_reward
        else:
            return action

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        actions = self.sess.run(self.models.actor,
                                feed_dict={self.models.input_state: states})
        noise = self.noise.sample()
        print('Noise : {:0.2f}     {:0.2f}     {:0.2f}'.format(
            noise[0], noise[1], noise[2]))  # action : [ vel , arm , bucket ]
        return np.clip(actions + noise, a_min=-1.,
                       a_max=1.).reshape(self.action_shape)

    def act_without_noise(self, states):
        """Returns actions for given state(s) as per current policy."""
        actions = self.sess.run(self.models.actor,
                                feed_dict={self.models.input_state: states})
        return np.array(actions).reshape(self.action_shape)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        states = experiences['states']
        actions = experiences['actions']
        rewards = experiences['rewards']
        next_states = experiences['next_states']
        dones = experiences['dones']

        #actor critic update
        self.sess.run(
            [self.models.actor_opt, self.models.critic_opt],
            feed_dict={
                self.models.input_state: states,
                self.models.input_action: actions,
                self.models.input_state_target: next_states,
                self.models.rewards: rewards,
                self.models.dones: dones
            })
        #target soft update
        self.sess.run(self.soft_update_ops)

    def initialize(self):
        """Create the TF session, initialize variables, hard-copy the local
        networks into the target networks, and build the soft update ops:
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        self.sess = tf.Session()
        # self.summary_ops, self.summary_vars = build_summaries()
        self.sess.run(tf.global_variables_initializer())

        # self.writer = tf.summary.FileWriter('./graphs', self.sess.graph)

        actor_var = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      scope='actor')
        actor_target_var = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                             scope='target_actor')
        critic_var = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       scope='critic')
        critic_target_var = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                              scope='target_critic')
        target_init_ops = []
        soft_update_ops = []
        for var, target_var in zip(actor_var, actor_target_var):
            target_init_ops.append(tf.assign(target_var, var))
            soft_update_ops.append(
                tf.assign(target_var,
                          (1. - self.tau) * target_var + self.tau * var))
        for var, target_var in zip(critic_var, critic_target_var):
            target_init_ops.append(tf.assign(target_var, var))
            soft_update_ops.append(
                tf.assign(target_var,
                          (1. - self.tau) * target_var + self.tau * var))
        self.soft_update_ops = soft_update_ops
        self.sess.run(target_init_ops)

    def save_model(self):
        self.saver.save(self.sess, self.current_path + '/model/model.ckpt')

    def load_model(self, path):
        self.saver.restore(self.sess, path)

    def save_memory(self):
        # pickle requires binary file mode
        mem_file = open(self.current_path + '/replay_buffer_memory.p', 'wb')
        pickle.dump(self.memory, mem_file)
        mem_file.close()

    def load_memory(self, path):
        mem_file = open(path, 'rb')
        mem = pickle.load(mem_file)
        self.memory = mem
        mem_file.close()
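The TensorFlow agent above relies on a ReplayBuffer whose sample(batch_size) returns a dict with 'states', 'actions', 'rewards', 'next_states' and 'dones' keys, and whose add(...) stores one transition; that class is not shown here. A minimal sketch matching how step() and learn() use it follows; the constructor arguments mirror the call above, and the column shapes for rewards/dones are an assumption.

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """Fixed-size buffer that stores transitions and samples them as a dict of arrays."""

    def __init__(self, buffer_size, action_shape, state_shape):
        self.memory = deque(maxlen=buffer_size)
        self.action_shape = action_shape
        self.state_shape = state_shape

    def add(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.memory, k=batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return {'states': states,
                'actions': actions,
                'rewards': rewards.reshape(-1, 1),
                'next_states': next_states,
                'dones': dones.reshape(-1, 1)}

    def __len__(self):
        return len(self.memory)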
Example #29
0
class MultiDDPGAgent:
    """ Multi-agent DDPG implementation."""
    def __init__(self, state_size, action_size, num_agents, cfg):
        """Initialize a MADDPG Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): Number of agents in environment
            cfg (config object): main configuration with other settings
        """
        print("Initializing MADDPG agent with {:d} agents!".format(num_agents))

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(cfg.random_seed)

        self.cfg = cfg

        # initializing list of single agents (2 for tennis)
        self.agents = []
        for aid in range(num_agents):
            agent = SingleDDPGAgent(state_size,
                                    action_size,
                                    cfg,
                                    num_agents=num_agents,
                                    agent_id=aid)
            self.agents.append(agent)

        self.t_step = 0

        # Noise process
        self.noise_scale = self.cfg.noise_scale
        self.noise = OUNoise(action_size,
                             cfg.random_seed,
                             theta=cfg.theta_ou,
                             sigma=cfg.sigma_ou)

        # as long as active, will fill replay buffer with random memories, no learning
        self.prefetching = True

        # Replay memory for shared experiences (all agents)
        self.memory = ReplayBuffer(action_size, cfg.buffer_size,
                                   cfg.batch_size, cfg.random_seed, cfg)

    def add_noise(self):
        if self.cfg.use_ou:
            return self.noise_scale * self.noise.sample()
        else:
            # Gaussian noise
            return self.noise_scale * np.random.normal(0, 1.0,
                                                       self.action_size)

    def reset(self):
        self.t_step = 0
        self.noise.reset()

    def act(self, state_all, add_noise=True):
        """
        Let all agents act.
        Receives full state tensor of all agents
        and outputs all actions (num_agents x action_size).
        """
        actions = []
        for aid in range(self.num_agents):
            # only add noise after pre-loading memories
            noise = 0
            if not self.prefetching and add_noise:
                noise = self.add_noise()
            actions.append(
                self.agents[aid].act(state_all[aid], add_noise=False) + noise)

        return actions

    def _target_act(self, states_all):
        """
        Internal function used by learn function.
        Gets target network actions for all agents.
        """
        target_actions = []
        for aid in range(self.num_agents):
            # states_all format (batch size, num_agents, state size)
            target_actions.append(self.agents[aid].target_act(
                states_all[:, aid, :]))

        return target_actions

    def step(self, states, actions, rewards, next_states, dones):
        """ Save experiences in global memory.
            If memory large enough, use it to learn each agent.
        """
        max_prio = self.memory.get_max_priority()
        self.memory.add(states, actions, rewards, next_states, max_prio, dones)

        # start training if memory size large enough.
        if len(self.memory) >= max(self.cfg.batch_size, self.cfg.init_replay):
            if self.prefetching:
                self.prefetching = False
                print("Pre-loading of memories complete, starting training!")
        else:
            return

        self.t_step = (self.t_step + 1) % self.cfg.learn_every
        if self.t_step == 0:
            for _ in range(self.cfg.learn_steps):
                self.learn_all()

        self.noise_scale = max(self.noise_scale * self.cfg.noise_decay,
                               self.cfg.noise_scale_min)
        self.t_step += 1

    def learn_all(self):
        """Generates full batch input and performs individual learning steps."""
        samples = self.memory.sample()
        for aid in range(self.num_agents):
            self.learn(samples, aid)
            self.soft_update_all()

    def learn(self, samples, agent_number):
        """
            Update critic and actor networks of given agent using provided
            samples from replay memory.
        """
        # from memory
        states, actions, rewards, next_states, priorities, dones, indices = samples

        # creating full states and next_states with shape (batch_size, -1)
        batch_size = self.cfg.batch_size
        full_states = states.view(batch_size, -1)
        full_next_states = next_states.view(batch_size, -1)

        # selecting the correct agent
        agent = self.agents[agent_number]

        # 1. Update of critic
        agent.critic_optimizer.zero_grad()

        # critic loss = TD-error, so batch mean of (y- Q*(s,a))^2
        # y = current reward + discount * Q*(st+1,at+1) from target network Q*

        # _target_act expects next_states with shape (batch_size, num_agents, state_size)
        # and returns a list of per-agent actions; concatenating along dim=1 gives
        # target_actions with shape (batch_size, num_agents * action_size)
        target_actions = torch.cat(self._target_act(
            next_states.view(batch_size, self.num_agents, -1)),
                                   dim=1)

        # get next q values from target critic
        q_next = agent.critic_target(full_next_states,
                                     target_actions.to(device))

        y = rewards[:, agent_number].view(-1, 1) + \
            self.cfg.gamma * q_next * (1 - dones[:, agent_number].view(-1, 1))

        q = agent.critic_local(full_states, actions.view(batch_size, -1))

        critic_loss = None
        if self.cfg.loss_l == 1:
            huber_loss = torch.nn.SmoothL1Loss()
            critic_loss = huber_loss(q, y.detach())
        elif self.cfg.loss_l == 2:
            critic_loss = F.mse_loss(q, y.detach())
        else:
            raise AssertionError("L{:d} loss is not supported!".format(
                self.cfg.loss_l))

        # optimization of critic (local) loss
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), 1)
        agent.critic_optimizer.step()

        # 2. Update of actor network using policy gradient
        agent.actor_optimizer.zero_grad()

        # build the critic input from all agents' current policies;
        # detach the other agents' actions so gradients only flow through
        # this agent's actor, which saves computation on the backward pass
        q_input = [
            self.agents[i].actor_local(
                states.view(batch_size, self.num_agents, -1)[:, i, :])
            if i == agent_number else self.agents[i].actor_local(
                states.view(batch_size, self.num_agents, -1)[:,
                                                             i, :]).detach()
            for i in range(self.num_agents)
        ]

        q_input = torch.cat(q_input, dim=1)

        # combine all the actions and observations for input to critic
        # many of the obs are redundant, and obs[1] contains all useful information already

        # get the actual policy gradient here
        actor_loss = -agent.critic_local(full_states, q_input).mean()

        # optimize
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.actor_local.parameters(), 1)
        agent.actor_optimizer.step()

        # soft update the models
        agent.soft_update(agent.critic_local, agent.critic_target,
                          self.cfg.tau)
        agent.soft_update(agent.actor_local, agent.actor_target, self.cfg.tau)

    def soft_update_all(self):
        """soft update targets"""
        for agent in self.agents:
            agent.soft_update(agent.critic_local, agent.critic_target,
                              self.cfg.tau)
            agent.soft_update(agent.actor_local, agent.actor_target,
                              self.cfg.tau)

    def save_weights(self, model_save_path):
        """
        Simple method to save network weights.
        """
        for aid, agent in enumerate(self.agents):
            agent.save_weights(model_save_path, suffix="_{:d}".format(aid))

    def load_weights(self, model_save_path):
        """
        Method to load network weights from saved files.
        """
        for aid, agent in enumerate(self.agents):
            agent.load_weights(model_save_path, suffix="_{:d}".format(aid))
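To make the reshaping in learn() above concrete, here is a short standalone shape walkthrough for the centralized critic inputs. The dimensions assume a Tennis-like setup (num_agents=2, state_size=24, action_size=2) with a batch size of 128; these numbers are illustrative assumptions, not values taken from the code above.

import torch

batch_size, num_agents, state_size, action_size = 128, 2, 24, 2
states = torch.randn(batch_size, num_agents, state_size)
actions = torch.randn(batch_size, num_agents, action_size)

full_states = states.view(batch_size, -1)    # (128, 48): both observations concatenated
full_actions = actions.view(batch_size, -1)  # (128, 4): both agents' actions concatenated
agent_0_states = states[:, 0, :]             # (128, 24): slice fed to a single agent's actor

print(full_states.shape, full_actions.shape, agent_0_states.shape)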
Example #30
0
class MADDPGAgent:
    def __init__(self,
                 agent_index,
                 state_size,
                 action_size,
                 seed,
                 actor_lr,
                 critic_lr,
                 weight_decay,
                 tau,
                 update_every,
                 gamma,
                 device,
                 hidden_1_size=256,
                 hidden_2_size=128,
                 checkpoint_dir="."):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): how many agents are running in each step
            seed (int): random seed
            actor_lr (float): actor learning rate alpha
            critic_lr (float): critic learning rate alpha
            weight_decay (float): rate of nn weight decay for critic network
            tau (float): soft update rate for synchronizing target and train network weights 
            update_every (int): how many env steps to train agent
            gamma (float): reward discount factor
            device (string): device to run PyTorch computation on (CPU, GPU)
            checkpoint_dir (string) : where to save checkpoints (trained weights)
        """
        self.agent_index = agent_index
        self.seed = torch.manual_seed(seed)
        self.tau = tau
        self.action_size = action_size

        self.update_every = update_every
        self.gamma = gamma
        self.device = device

        # NN models for
        #         network size

        # Critic
        self.critic_train = CriticQNetwork(state_size, action_size, seed,
                                           hidden_1_size,
                                           hidden_2_size).to(device)
        self.critic_target = CriticQNetwork(state_size, action_size, seed,
                                            hidden_1_size,
                                            hidden_2_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_train.parameters(),
                                           lr=critic_lr,
                                           weight_decay=weight_decay)
        # Actor
        self.actor_train = ActorQNetwork(state_size, action_size, seed,
                                         hidden_1_size,
                                         hidden_2_size).to(device)
        self.actor_target = ActorQNetwork(state_size, action_size, seed,
                                          hidden_1_size,
                                          hidden_2_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_train.parameters(),
                                          lr=actor_lr)

        # init Noise process
        self.noise = OUNoise(action_size, seed, theta=0.15, sigma=0.2)

        self.actor_loss = 0
        self.critic_loss = 0

        #checkpointing
        self.checkpoint_dir = checkpoint_dir

        self.actor_weights = "{}/actor_{}.pth".format(self.checkpoint_dir,
                                                      self.agent_index)
        self.critic_weights = "{}/critic_{}.pth".format(
            self.checkpoint_dir, self.agent_index)

    def load_checkpoint(self, file_prefix=None):
        actor_weights = "{}actor_{}.pth".format(
            file_prefix,
            self.agent_index) if file_prefix else self.actor_weights
        critic_weights = "{}critic_{}.pth".format(
            file_prefix,
            self.agent_index) if file_prefix else self.critic_weights

        if os.path.isfile(actor_weights) and os.path.isfile(critic_weights):
            self.actor_target.load_state_dict(torch.load(actor_weights))
            self.actor_train.load_state_dict(torch.load(actor_weights))
            self.critic_target.load_state_dict(torch.load(critic_weights))
            self.critic_train.load_state_dict(torch.load(critic_weights))

    def save_checkpoint(self, file_prefix=None):
        actor_weights = "{}actor_{}.pth".format(
            file_prefix,
            self.agent_index) if file_prefix else self.actor_weights
        critic_weights = "{}critic_{}.pth".format(
            file_prefix,
            self.agent_index) if file_prefix else self.critic_weights
        torch.save(self.actor_train.state_dict(), actor_weights)
        torch.save(self.critic_train.state_dict(), critic_weights)

    def act(self, states, add_noise=True, epsilon=1.0):
        states = torch.from_numpy(states).float().to(self.device)
        self.actor_train.eval()
        with torch.no_grad():
            actions = self.actor_train(states).cpu().data.numpy()
        self.actor_train.train()

        if add_noise:
            actions += self.noise.sample() * epsilon
            # actions += 0.5*np.random.standard_normal(self.action_size) *epsilon
        return np.clip(actions, -1, 1)

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, target_actions_pred, actions_pred = experiences

        ## DDPG  implementation
        #### Critic network training

        # Calculate Q_Targets
        # first use target Actor to predict best next actions for next states S'
        # target_actions_pred = self.actor_target(next_states)
        # Then use the target critic to assess the Q value of this (S', pred_action) tuple
        with torch.no_grad():
            target_pred = self.critic_target(
                next_states, target_actions_pred).to(self.device)
        # calculate the Q_target using TD error formula
        Q_target = rewards[:, self.agent_index].view(
            -1, 1) + (self.gamma * target_pred *
                      (1 - dones[:, self.agent_index].view(-1, 1)))

        # find what Q value the train Critic network assigns to this (state, action) pair: the current state and the action actually performed
        Q_pred = self.critic_train(states, actions).to(self.device)

        # Minimize critic loss
        # do Gradient Descent step on Critic train network by minimizing diff between (Q_pred, Q_target)
        self.critic_optimizer.zero_grad()
        critic_loss = F.smooth_l1_loss(Q_pred, Q_target.detach())
        self.critic_loss = critic_loss.cpu().detach().item()
        critic_loss.backward()
        self.critic_optimizer.step()

        #### Actor network training
        # find which actions the train Actor predicts
        # actions_pred = self.actor_train(states)
        # The loss is the negative of the critic_train Q estimate of (S, actions_pred),
        # i.e. we want to maximize (minimize the negative of) the action-value function (Q)
        # predicted by critic_train for the current state and the action predicted by actor_train
        actor_loss = -self.critic_train(states, actions_pred).mean()

        self.actor_loss = actor_loss.cpu().detach().item()
        # minimize Actor loss
        # do Gradient Descent step on Actor train network
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ------------------- update target network ------------------- #
        soft_update(self.critic_train, self.critic_target, self.tau)
        soft_update(self.actor_train, self.actor_target, self.tau)
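MADDPGAgent.learn above calls a module-level soft_update(train_model, target_model, tau) helper that is not defined in this example. A minimal sketch, consistent with the soft updates used by the other agents in this document (θ_target = τ*θ_local + (1 - τ)*θ_target):

def soft_update(train_model, target_model, tau):
    """Blend train-network weights into the target network."""
    for target_param, train_param in zip(target_model.parameters(),
                                         train_model.parameters()):
        target_param.data.copy_(tau * train_param.data +
                                (1.0 - tau) * target_param.data)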