Example #1
class ActorAgent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, num_agents, noise,
                 learning_rate, memory, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents in the environment
            noise: noise process shared by the agents
            learning_rate (float): learning rate for the actor optimizer
            memory: shared replay buffer
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.noise = noise
        self.learning_rate = learning_rate
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.learning_rate)

        # Noise process (note: this replaces the noise object passed in via the constructor)
        self.noise = OUNoise(action_size, seed=random_seed)

        # Replay memory
        #self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        self.memory = memory

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()
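These snippets are taken from larger projects and omit their module preamble. A minimal sketch of what they assume (standard imports plus a shared `device`; the torch-based examples also need their own `Actor`, `Critic`, `OUNoise`, and `ReplayBuffer` definitions, and the Keras-based Example #6 additionally needs `import csv` and its Keras model classes):

# Assumed preamble (not part of the original examples)
import copy
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")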
Example #2
class DDPG:
    def __init__(self,
                 in_actor,
                 out_actor,
                 in_critic,  # e.g. = n_agent * (state_size + action_size)
                 lr_actor=1e-4,
                 lr_critic=1e-3,  # the critic should learn faster than the actor, hence the larger lr
                 random_seed=2):
        self.state_size = in_actor
        self.action_size = out_actor
        self.seed = random.seed(random_seed)

        self.params = {"lr_actor": lr_actor,
                       "lr_critic": lr_critic,
                       "optimizer": "adam"}

        self.local_actor = Actor(in_shape=in_actor, out_shape=out_actor).to(device)
        self.target_actor = Actor(in_shape=in_actor, out_shape=out_actor).to(device)
        self.actor_optimizer = optim.Adam(self.local_actor.parameters(), lr=lr_actor)

        # For each agent, the critic takes the global observations and actions of all agents as input
        # and outputs the action-value Q, e.g. in_critic = n_agent * (state_size + action_size)
        self.local_critic = Critic(in_shape=in_critic).to(device)
        self.target_critic = Critic(in_shape=in_critic).to(device)
        self.critic_optimizer = optim.Adam(self.local_critic.parameters(), lr=lr_critic)

        # Initialize the target networks with the same weights as the local networks (hard copy)
        hard_update_A_from_B(self.target_actor, self.local_actor)
        hard_update_A_from_B(self.target_critic, self.local_critic)

        # Noise process
        self.noise = OUNoise(out_actor, scale=1.0)

    def act(self, obs, noise_scale=0.0):
        obs = obs.to(device)
        # debug noise
        # noise = torch.from_numpy(noise_scale*0.5*np.random.randn(1, self.action_size)).float().to(device)
        # action = self.local_actor(obs) + noise
        action = self.local_actor(obs) + noise_scale * self.noise.noise().to(device)
        return action

    def target_act(self, obs, noise_scale=0.0):
        obs = obs.to(device)
        # noise = torch.from_numpy(noise_scale*0.5 * np.random.randn(1, self.action_size)).float().to(device)
        # action = self.target_actor(obs) + noise_scale * noise
        action = self.target_actor(obs) + noise_scale * self.noise.noise().to(device)
        return action

    def reset(self):
        self.noise.reset()
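`hard_update_A_from_B` is called above but not shown. A minimal sketch consistent with its usage here (copy network B's weights into network A):

def hard_update_A_from_B(A, B):
    """Hard copy: overwrite A's parameters with B's (sketch, not the original helper)."""
    for a_param, b_param in zip(A.parameters(), B.parameters()):
        a_param.data.copy_(b_param.data)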
Example #3
class AgentCommon():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents in the environment
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        # Noise process
        #self.noise = OUNoise(action_size, random_seed)
        self.noise = OUNoise((self.num_agents, action_size), seed = random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        
        self.actorL = ActorAgent(state_size, action_size, num_agents, self.noise,
                                 LR_ACTOR, self.memory, random_seed)
        self.actorR = ActorAgent(state_size, action_size, num_agents, self.noise,
                                 LR_ACTOR, self.memory, random_seed)
        self.sharedcritic = CriticAgent(state_size, action_size, num_agents,
                                        LR_CRITIC, WEIGHT_DECAY, TAU, random_seed)
    
    def step(self, state, action, reward, next_state, done):
        self.actorL.step(state[0], action[0], reward[0], next_state[0], done[0])
        self.actorR.step(state[1], action[1], reward[1], next_state[1], done[1])
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences1 = self.memory.sample()
            experiences2 = self.memory.sample()
            self.sharedcritic.learn(self.actorL, experiences1, GAMMA)
            self.sharedcritic.learn(self.actorR, experiences2, GAMMA)

    def act(self, state, add_noise=True):
        actionL = self.actorL.act(state[0], add_noise=add_noise)
        actionR = self.actorR.act(state[1], add_noise=add_noise)
        return [actionL, actionR]
    
    def reset(self):
        self.noise.reset()
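The capitalized names used in this and later examples (`BUFFER_SIZE`, `BATCH_SIZE`, `GAMMA`, `TAU`, `LR_ACTOR`, `LR_CRITIC`, `WEIGHT_DECAY`, `UPDATE_EVERY`, `UPDATE_RATE`) are module-level hyperparameters defined elsewhere in each project. Typical DDPG values, shown only as a placeholder; the original projects may use different numbers:

# Hyperparameters assumed by the snippets (values are illustrative)
BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # L2 weight decay for the critic optimizer
UPDATE_EVERY = 1         # learn every N environment steps
UPDATE_RATE = 1          # gradient updates per learning step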
Example #4
class SharedCritic():
    def __init__(self, state_size, action_size, random_seed, num_agents):
        self.num_agents = num_agents
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
        self.noise = OUNoise(action_size, random_seed)
        self.actors = [
            ActorAgent(i, state_size, action_size, random_seed, LR_ACTOR,
                       self.noise, self.memory) for i in range(num_agents)
        ]
        self.critic = CriticAgent(state_size, action_size, random_seed,
                                  LR_CRITIC, WEIGHT_DECAY, TAU)
        self.count = 0

    def act(self, states, add_noise=True):
        actions = []
        for actor, state in zip(self.actors, states):
            action = actor.act(state, add_noise=add_noise)
            actions.append(action)
        #return np.array(actions).reshape(1, -1) # reshape 2x2 into 1x4 dim vector
        return actions

    def reset(self):
        self.noise.reset()

    def step(self, states, actions, rewards, next_states, dones):
        for actor, state, action, reward, next_state, done in zip(
                self.actors, states, actions, rewards, next_states, dones):
            actor.step(state, action, reward, next_state, done)

        self.count = (self.count + 1) % UPDATE_EVERY
        if len(self.memory) > BATCH_SIZE:
            if self.count == 0:
                for actor in self.actors:
                    experiences = self.memory.sample()
                    self.critic.learn(actor, experiences, GAMMA)
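`ReplayBuffer` is used throughout but never shown. A minimal sketch matching the `(action_size, buffer_size, batch_size, seed)` constructor and the `add`/`sample`/`__len__` calls above, reusing the assumed preamble; the field names are assumptions:

from collections import deque, namedtuple

class ReplayBuffer:
    """Fixed-size buffer of experience tuples (sketch, not the original class)."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", field_names=["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences and return them as torch tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)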
Example #5
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 num_agents,
                 state_size,
                 action_size,
                 gamma,
                 tau,
                 learning_rate_actor,
                 learning_rate_critic,
                 weight_decay,
                 device,
                 random_seed=42):
        """Initialize an Agent object (used my MultiAgent for MADDPG).

        Params
        ======
            num_agents (int): number of agents acting in the environment
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            gamma (float): discount factor
            tau (float): used for soft update of target parameters
            learning_rate_actor (float): learning rate for the actor
            learning_rate_critic (float): learning rate for the critic
            weight_decay (float): weight decay for the optimizers
            device (torch.device): PyTorch device
            random_seed (int): random seed
        """

        self.gamma = gamma
        self.tau = tau
        self.device = device
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=learning_rate_actor,
                                          weight_decay=weight_decay)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(num_agents, state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(num_agents, state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=learning_rate_critic,
                                           weight_decay=weight_decay)  # e.g. 0.0001

        # Noise process
        self.noise = OUNoise(size=action_size, seed=random_seed)

        self.timestep = 0

    def act(self, state, epsilon=1, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample() * epsilon
        return np.clip(action, -1, 1)

    def reset(self):
        """Resets the noise"""
        self.noise.reset()

    def learn(self, index, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            index (int): Index of the current agent
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        all_states = torch.cat(states, dim=1).to(self.device)
        all_next_states = torch.cat(next_states, dim=1).to(self.device)
        all_actions = torch.cat(actions, dim=1).to(self.device)

        actions_next = actions.copy()

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next[index] = self.actor_target(next_states[index])
        all_actions_next = torch.cat(actions_next, dim=1).to(self.device)
        Q_targets_next = self.critic_target(all_next_states, all_actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards[index] + (gamma * Q_targets_next *
                                      (1 - dones[index]))
        # Compute critic loss
        Q_expected = self.critic_local(all_states, all_actions)

        huber_loss = torch.nn.SmoothL1Loss()
        critic_loss = huber_loss(Q_expected, Q_targets.detach())
        #critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = actions.copy()
        actions_pred[index] = self.actor_local(states[index])
        all_actions_pred = torch.cat(actions_pred, dim=1).to(self.device)
        actor_loss = -self.critic_local(all_states, all_actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
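`OUNoise` appears with several different constructor signatures across these examples (`OUNoise(size, seed)`, `OUNoise((num_agents, action_size), seed)`, a Keras variant with explicit `mu/theta/sigma`, and one variant exposing `.noise()` instead of `.sample()`). A minimal Ornstein-Uhlenbeck sketch for the `(size, seed)` form; the parameter values are assumptions:

class OUNoise:
    """Ornstein-Uhlenbeck noise process (sketch, not the original class)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state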
Example #6
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, engine):
        self.task = engine
        self.width = engine.width
        self.height = engine.height
        self.state_size = engine.state_size
        self.action_size = engine.action_size
        self.action_low = engine.action_low
        self.action_high = engine.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high,self.width,self.height)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high,self.width,self.height)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,self.width,self.height)
        self.critic_target = Critic(self.state_size, self.action_size,self.width,self.height)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)
        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        action = self.actor_local.model.predict(state.reshape(1, self.state_size))[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def act1(self, state):
        """Returns the greedy (argmax) action for the given state, without exploration noise."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state.reshape(1, self.state_size))[0]
        return np.argmax(action)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None]).reshape(-1, self.state_size)
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None]).reshape(-1, self.state_size)

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Debug output: dump the target Q values to a file
        file_output = 'data1.txt'
        with open(file_output, 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(np.array(Q_targets_next))

        # Compute Q targets for current states and train the local critic
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def reset_episode(self):
        self.noise.reset()
        self.task.clear()
        self.last_state = self.task.board
        return self.task.board
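Example #7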
class TD3MultiAgent:
    def __init__(self):
        self.max_action = 1
        self.policy_freq = 2
        self.policy_freq_it = 0
        self.batch_size = 512
        self.discount = 0.99

        self.device = 'cuda'

        self.state_dim = 24
        self.action_dim = 2
        self.policy_noise = 0.1
        self.agents = 1

        # Number of transitions to collect before the learned policy is used for acting
        self.random_period = 1e4

        self.tau = 5e-3

        self.replay_buffer = ReplayBuffer(int(1e5))

        self.actor = Actor(self.state_dim, self.action_dim, self.max_action).to(self.device)
        self.actor_target = Actor(self.state_dim, self.action_dim, self.max_action).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-4)
#         self.actor.load_state_dict(torch.load('actor2.pth'))
#         self.actor_target.load_state_dict(torch.load('actor2.pth'))

        self.noise = OUNoise(2, 32)

        self.critic = Critic(48, self.action_dim).to(self.device)
        self.critic_target = Critic(48, self.action_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)

    
    def select_action_with_noise(self, state, i):
        # Act with the learned policy only after an initial period of random exploration
        if len(self.replay_buffer) > self.random_period:
            state = torch.FloatTensor(state[i, :]).to(self.device)
            action = self.actor(state).cpu().data.numpy()

            if self.policy_noise != 0:
                action = action + self.noise.sample()
            return action.clip(-self.max_action, self.max_action)
        else:
            return self.noise.sample()
   
    
    def step(self, i):
        if len(self.replay_buffer) > self.random_period / 2:
            # Sample a mini batch
            s, a, r, s_, d = self.replay_buffer.sample(self.batch_size)

            # Per-agent views of the batch
            state = torch.FloatTensor(s[:, i, :]).to(self.device)
            action = torch.FloatTensor(a[:, i, :]).to(self.device)
            next_state = torch.FloatTensor(s_[:, i, :]).to(self.device)

            # Joint (all-agent) views for the centralized critic
            a_state = torch.FloatTensor(s).to(self.device).reshape(-1, 48)
            a_action = torch.FloatTensor(a).to(self.device).reshape(-1, 4)
            a_next_state = torch.FloatTensor(s_).to(self.device).reshape(-1, 48)

            done = torch.FloatTensor(1 - d[:, i]).to(self.device)
            reward = torch.FloatTensor(r[:, i]).to(self.device)

            # Select action with the actor target and apply clipped noise (TD3 target policy smoothing)
            noise = torch.FloatTensor(a[:, i, :]).data.normal_(0, self.policy_noise).to(self.device)
            noise = noise.clamp(-0.1, 0.1)
            next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)
            # Compute the target Q value

            target_Q1, target_Q2 = self.critic_target(a_next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward.reshape(-1,1) + (done.reshape(-1,1) * self.discount * target_Q).detach()
            
            # Get current Q estimates
            current_Q1, current_Q2 = self.critic(a_state, action)

            # Compute critic loss
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Delayed policy updates
            if self.policy_freq_it % self.policy_freq == 0:
                # Compute actor loss
                actor_loss = -self.critic.Q1(a_state, self.actor(state)).mean()
                # Optimize the actor 
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Update the frozen target models
                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)


            self.policy_freq_it += 1
        
        return True
        
    
    def reset(self):
        self.policy_freq_it = 0
        self.noise.reset()
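The TD3 agent above expects a twin-headed critic that returns two Q estimates from `forward` and exposes a `Q1` helper for the actor update. A minimal sketch of such a critic; the hidden-layer sizes are assumptions:

import torch.nn as nn

class Critic(nn.Module):
    """Twin Q-network for TD3 (sketch, not the original class)."""

    def __init__(self, state_dim, action_dim, hidden=256):
        super().__init__()
        self.q1 = nn.Sequential(
            nn.Linear(state_dim + action_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1))
        self.q2 = nn.Sequential(
            nn.Linear(state_dim + action_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1))

    def forward(self, state, action):
        """Return both Q estimates for the clipped double-Q target."""
        sa = torch.cat([state, action], dim=1)
        return self.q1(sa), self.q2(sa)

    def Q1(self, state, action):
        """Return only the first Q estimate (used for the actor loss)."""
        sa = torch.cat([state, action], dim=1)
        return self.q1(sa)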
Example #8
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 agents=2,
                 every=4,
                 updates=4):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            agents (int): number of agents in the environment
            every (int): learn every `every` environment steps
            updates (int): number of gradient updates per learning step
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        np.random.seed(random_seed)
        self.agents = agents
        self.every = every
        self.updates = updates
        self.steps = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noises = OUNoise((agents, action_size))

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   device)

    def load_actor(self, model_file: str):
        self.actor_local.load_state_dict(
            torch.load(model_file, map_location=device))
        self.actor_local.to(device)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.steps += 1
        for i in range(self.agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and self.steps % self.every == 0:
            self.steps = 0
            for _ in range(self.updates):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noises.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noises.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)  # Gradient clipping
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
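`Actor(state_size, action_size, seed)` is the deterministic policy network shared by most of the torch examples. A minimal sketch with a tanh output in [-1, 1]; the layer sizes and seeding are assumptions:

import torch.nn as nn

class Actor(nn.Module):
    """Deterministic policy network (sketch, not the original class)."""

    def __init__(self, state_size, action_size, seed, fc1_units=256, fc2_units=128):
        super().__init__()
        torch.manual_seed(seed)
        self.net = nn.Sequential(
            nn.Linear(state_size, fc1_units), nn.ReLU(),
            nn.Linear(fc1_units, fc2_units), nn.ReLU(),
            nn.Linear(fc2_units, action_size), nn.Tanh())

    def forward(self, state):
        """Map a state (or batch of states) to an action in [-1, 1]."""
        return self.net(state)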
Example #9
class Agent():
    def __init__(self, state_size, action_size):

        # Constants
        self.buffer_size = int(1e6)
        self.batch_size = 128
        self.learning_rate = 1e-4
        self.learn_every = 2
        self.learning_rounds = 4

        self.gamma = 0.99
        self.tau = 1e-3

        self.t = 0
        self.state_size = state_size
        self.action_size = action_size
        self.eps = 5.0
        self.eps_decay = 1 / (300 * self.learning_rounds)

        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.learning_rate)

        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.learning_rate)

        self.noise = OUNoise((1, action_size))
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size)

    def step(self, state, action, reward, next_state, done, agent_number):
        self.t += 1

        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.batch_size and self.t % self.learn_every == 0:
            for _ in range(self.learning_rounds):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma, agent_number)

    def act(self, states, add_noise):
        states = torch.from_numpy(states).to(device).float()

        # Get the actions for this agent
        with torch.no_grad():
            actions = self.actor_local(
                states.squeeze()).unsqueeze(0).cpu().data.numpy()

        if add_noise:
            actions += self.eps * self.noise.sample()

        actions = np.clip(actions, -1, 1)
        return actions

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        states, actions, rewards, next_states, dones = experiences

        # Find the best action according to target network
        actions_next = self.actor_target(next_states)
        if agent_number == 0:
            # This agent owns the first two action columns; keep the other agent's actions on the right
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            # This agent owns the last two action columns; keep the other agent's actions on the left
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)

        # Compute Q targets for current states
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        # Compute loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip the gradients to avoid exploding gradients
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # Find the best action according to local network
        actions_pred = self.actor_local(states)
        if agent_number == 0:
            # Replace this agent's (first two) action columns with the locally predicted actions
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            # Replace this agent's (last two) action columns with the locally predicted actions
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)

        # Compute loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target network ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # Update noise param eps
        self.eps -= self.eps_decay
        self.eps = max(self.eps, 0)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
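The slicing in `learn` above assumes the replay buffer stores the joint action of both agents as one 4-dimensional vector (two action components per agent). A small worked check of that layout; the batch size and values are arbitrary:

# Joint action batch layout: [batch, 4] = [agent0_a0, agent0_a1, agent1_a0, agent1_a1]
batch_size = 3
actions = torch.zeros(batch_size, 4)      # joint actions as stored in the buffer
actions_next = torch.ones(batch_size, 2)  # this agent's re-computed actions

# agent_number == 0: keep the other agent's half on the right
joint_0 = torch.cat((actions_next, actions[:, 2:]), dim=1)
# agent_number == 1: keep the other agent's half on the left
joint_1 = torch.cat((actions[:, :2], actions_next), dim=1)
assert joint_0.shape == (batch_size, 4) and joint_1.shape == (batch_size, 4)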
Example #10
class Agent:
    """Initeracts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed, cfg):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """

        buffer_size = cfg["Agent"]["Buffer_size"]
        batch_size = cfg["Agent"]["Batch_size"]
        gamma = cfg["Agent"]["Gamma"]
        tau = cfg["Agent"]["Tau"]
        lr_actor = cfg["Agent"]["Lr_actor"]
        lr_critic = cfg["Agent"]["Lr_critic"]
        noise_decay = cfg["Agent"]["Noise_decay"]
        weight_decay = cfg["Agent"]["Weight_decay"]
        update_every = cfg["Agent"]["Update_every"]
        noise_min = cfg["Agent"]["Noise_min"]
        noise_initial = cfg["Agent"]["Noise_initial"]
        action_clip = cfg["Agent"]["Action_clip"]

        # Attach some configuration parameters
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every
        self.action_clip = action_clip

        # Actor Networks both Local and Target.
        self.actor_local = Actor(state_size, action_size, random_seed,
                                 cfg).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed,
                                  cfg).to(device)
        self.actor_noise = ActorNoise(state_size, action_size, random_seed,
                                      cfg).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Networks both Local and Target.
        self.critic_local = Critic(state_size, action_size, random_seed,
                                   cfg).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed,
                                    cfg).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, cfg)
        self.noise_modulation = noise_initial
        self.noise_decay = noise_decay
        self.noise_min = noise_min

        # Replay memory
        # self._memory = Memory(capacity=buffer_size, seed=random_seed)
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   random_seed)

        # Count number of steps
        self.n_steps = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer
        to learn."""
        self.memory.add(state, action, reward, next_state, done)

        # Learn if enough samples are available in memory
        if len(self.memory) > self.batch_size and self.n_steps % self.update_every == 0:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.noise_modulation *= self.noise_decay
        self.noise_modulation = max(self.noise_modulation, self.noise_min)
        self.n_steps += 1

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        action = self.actor_local(state).cpu().data.numpy()
        if add_noise:
            # action += self.noise_modulation * self.noise.sample()
            self.actor_noise.reset_parameters()
            self.actor_noise.eval()
            self.hard_update(self.actor_local, self.actor_noise,
                             self.noise_modulation)
            action = self.actor_noise(state).cpu().data.numpy()
            self.actor_noise.train()
        self.actor_local.train()
        return np.clip(action, -self.action_clip, self.action_clip)

    def reset(self):
        self.n_steps = 0
        self.noise.reset()

    def learn(self, experiences):
        """Update policy and value parameters given batch of experience tuples.
        Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Update critic
        # Get predicted next-state actions and Q-values from target models.
        self.actor_target.eval()
        self.critic_target.eval()

        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # We don't want actor_target or critic_target showing up in the graph.
        self.actor_target.train()
        self.critic_target.train()

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)

        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()  # Clear gradient
        critic_loss.backward()  # Backpropagation
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()  # Update parameters

        # Update actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()  # Clear gradient
        actor_loss.backward()  # Backpropagation
        # torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimizer.step()  # Update parameters

        # Now we update the target networks
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        theta_target = tau * theta_local + (1 - tau) * theta_target

        Params
        ======
            local_model: PyTorch model (weight source)
            target_model: PyTorch model (weight destination)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, local_model, noise_model, noise_modulation):
        """Hard update model parameters.
        theta_noise = theta_local + self.noise_modulation * theta_noise

        Params
        ======
            local_model: PyTorch model (weight source)
            noise_model: PyTorch model (weight destination)
        """
        for noise_param, local_param in zip(noise_model.parameters(),
                                            local_model.parameters()):
            noise_param.data.copy_(local_param.data +
                                   noise_modulation * noise_param.data)
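Example #11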
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 mnoise=True,
                 split_state=True):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.mnoise = mnoise
        self.split_state = split_state

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # initialize targets same as original networks
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

        # Noise process
        if self.mnoise:
            self.noise = OUNoise((2, action_size), random_seed)
        else:
            self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, states, actions, rewards, next_states, dones, step):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        if self.split_state:
            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                self.memory.add(state, action, reward, next_state, done)
        else:
            self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""

        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        """
        Copy network parameters from source to target
        Inputs:
            target (torch.nn.Module): Net to copy parameters to
            source (torch.nn.Module): Net whose parameters to copy
        """
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
Example #12
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, random_seed, add_noise=True, PER=False, PSN=True):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            add_noise (bool): whether to add exploration noise
            PER (bool): use prioritized experience replay
            PSN (bool): use parameter-space noise instead of action-space noise
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.add_noise = add_noise

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.PSN = PSN
        if self.add_noise:
          if self.PSN:
            self.noise = PSNoise(state_size, action_size, random_seed)
          else:
            self.noise = OUNoise(action_size, random_seed)
          
        # Replay memory
        self.PER = PER
        if self.PER:
          self.memory = ReplayBufferPE(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, alpha = ALPHA)
          self.beta = BETA_INITIAL
        else:
          self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        
        # Initialize learning steps 
        self.learn_step = 0  
    
    def reset(self):
        if self.add_noise:
          if self.PSN:
            self.noise.reset(self.actor_local)
          else:
            self.noise.reset()    
    
    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        
        if len(self.memory) > BATCH_SIZE:
            # Learn, if enough samples are available in memory for number of timesteps
            for _ in range(STEPS_UPDATE):
              experiences = self.memory.sample()
              self.learn(experiences, GAMMA)
        
        # LEARN_EVERY time steps.
        '''
        self.learn_step = (self.learn_step + 1) % LEARN_EVERY
        if self.learn_step == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                for _ in range(STEPS_UPDATE):
                  experiences = self.memory.sample()
                  self.learn(experiences, GAMMA)            
        '''
    def act(self, state, epsilon = 1, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        
        #If add_noise = True:
        if self.add_noise:
          #Add AS or PS noise:
          if self.PSN:
            # Parameter Space Noise
            if len(self.memory) > BATCH_SIZE:
              # PS noise needs to sample from memory to perturbate actor weights
              self.noise.update_noise(self.actor_local, states_batch = self.memory.sample()[0])
            with torch.no_grad():
              action = self.actor_local(state).cpu().data.numpy()
            self.actor_local.train()
          # Action Space Noise
          else:
            with torch.no_grad():
              action = self.actor_local(state).cpu().data.numpy()
            self.actor_local.train()
            action += self.noise.sample()
        #If add_noise = False, no noise is added
        else:
          with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()

        #For all cases, return clipped action      
        return np.clip(action, -1, 1)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples."""    
        
        states, actions, rewards, next_states, dones = experiences
        
        #Clip rewards
        #rewards_ = torch.clamp(rewards, min=-1., max=1.)
        
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute Q expected
        Q_expected = self.critic_local(states, actions)
        
        # Compute critic loss        
        if self.PER:
          # Update Beta
          self.beta += BETA_INCREMENT
          # Get RB weights
          weights = self.memory.get_weights(self.beta)
          # Clip abs(TD_errors)
          TD_errors = torch.clamp(torch.abs(Q_targets - Q_expected), min=0., max=1.)
          # Update replay buffer with proportional probs
          self.memory.update_priorities(TD_errors)
          #compute weighted mse loss     
          critic_loss = torch.mean(weights * (Q_expected - Q_targets) ** 2)
        else:
          #compute mse loss  
          critic_loss = F.mse_loss(Q_expected, Q_targets)
        
        # Minimize critic loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip critic gradient
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()  # negative sign: gradient ascent
        
        # Minimize actor loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft updates for target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
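`ReplayBufferPE.get_weights` and `update_priorities` are not shown. A minimal sketch of the usual prioritized-replay importance-sampling weights that `get_weights` would return for a sampled batch; the function name and arguments are assumptions:

def importance_sampling_weights(priorities, sampled_idx, beta, alpha=0.6):
    """Compute normalized IS weights w_i = (N * P(i)) ** (-beta) for a sampled minibatch.

    `priorities` are the stored |TD error|-based priorities and `sampled_idx`
    the indices drawn for the current batch (sketch, not the original code).
    """
    scaled = np.asarray(priorities, dtype=np.float64) ** alpha
    probs = scaled / scaled.sum()                    # P(i)
    n = len(scaled)
    weights = (n * probs[sampled_idx]) ** (-beta)    # (N * P(i)) ^ -beta
    weights /= weights.max()                         # normalize for stability
    return torch.from_numpy(weights).float().unsqueeze(1)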
Example #13
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, number_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            number_agents (int): number of agents
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.number_agents = number_agents

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise processes
        self.noise = OUNoise((number_agents, action_size), random_seed)
        #self.noise = GaussianNoise(size=[number_agents,action_size], seed = 0,sigma=2e-1)
        #self.noise = GeometricBrownianNoise(size=[number_agents,action_size], seed = 0,sigma=2e-1)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experiences in replay memory, and use random sample from buffer to learn."""

        # We save experience tuples in the memory for each agent.
        for i in range(self.number_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        # Learn, if enough samples are available in memory (threshold value: BATCH_SIZE) and at learning interval settings
        if len(self.memory) > BATCH_SIZE:
            for _ in range(UPDATE_RATE):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

#     def act(self, states, add_noise=True):
#         """Returns actions for given state as per current policy."""
#                                                                   # The code has been adapted to implement batch normalization.
#         actions = np.zeros((self.number_agents, self.action_size))
#         self.actor_local.eval()
#         with torch.no_grad():
#             for agent_number, state in enumerate(states):
#                 state = torch.from_numpy(state).float().unsqueeze(0).to(device)   # The code has been adapted to implement batch normalization.
#                 action = self.actor_local(state).cpu().data.numpy()
#                 actions[agent_number, :] = action
#         self.actor_local.train()
#         if add_noise:
#             actions += self.noise.sample()
#         return np.clip(actions, -1, 1)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.number_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_number, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_number, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
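
The Agent above references several module-level hyperparameters that are defined elsewhere in the project. A hedged set of typical values follows; the names match the references in the code, but the numbers are placeholders, not the original author's settings.

BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # L2 weight decay for the critic optimizer
UPDATE_RATE = 10         # gradient updates performed per environment step
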
Example #14
class DDPG:
    def __init__(self, config):
        self.config = config
        self.state_size = config.state_size
        self.action_size = config.action_size

        self.actor_local = Actor(self.state_size, self.action_size,
                                 2).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  2).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config.LR_ACTOR)

        self.critic_local = Critic(self.state_size, self.action_size,
                                   2).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    2).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=config.LR_CRITIC,
        )

        self.memory = ReplayBuffer(config.random_seed, config.BUFFER_SIZE)
        self.noise = OUNoise(self.action_size, config.random_seed)

        self.t_step = 0

        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)

    def step(self, states, actions, rewards, next_states, dones):

        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % self.config.UPDATE_EVERY

        if len(self.memory) > self.config.BATCH_SIZE and (self.t_step == 0):

            for i in range(self.config.EPOCH):
                experiences = self.memory.sample(self.config.BATCH_SIZE)
                self.learn(experiences)

    def reset(self):
        self.noise.reset()

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        Q_targets_next = self.critic_target(next_states,
                                            self.actor_target(next_states))
        Q_targets = rewards + (self.config.GAMMA * Q_targets_next *
                               (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        actor_loss = -self.critic_local(states,
                                        self.actor_local(states)).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target,
                         self.config.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.config.TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
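
DDPG above reads every hyperparameter from a single config object. Below is a minimal sketch of such a config as a dataclass; the attribute names follow the accesses in __init__, step and learn, while the values are placeholders.

from dataclasses import dataclass

@dataclass
class Config:
    state_size: int = 33
    action_size: int = 4
    random_seed: int = 2
    LR_ACTOR: float = 1e-4
    LR_CRITIC: float = 1e-3
    BUFFER_SIZE: int = int(1e6)
    BATCH_SIZE: int = 128
    UPDATE_EVERY: int = 20     # learn every UPDATE_EVERY calls to step()
    EPOCH: int = 10            # gradient updates per learning trigger
    GAMMA: float = 0.99
    TAU: float = 1e-3

# agent = DDPG(Config())
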
Example #15
class Agent(object):
    """DDPG Agent that interacts and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 device,
                 actor_args={},
                 critic_args={}):
        """Initializes the DQN agent.

        Args:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            device (torch.device): Device to use for calculations
            actor_args (dict): Arguments describing the actor network
            critic_args (dict): Arguments describing the critic network
        """
        self.state_size = state_size
        """Dimension of each state"""

        self.action_size = action_size
        """Dimension of each action"""

        self.device = device
        """Device to use for calculations"""

        self.t_step = 0
        """Timestep between training updates"""

        # Parameters

        # Actor network
        self.actor_local = Actor(state_size, action_size,
                                 **actor_args).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  **actor_args).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network

        self.critic_local = Critic(state_size, action_size,
                                   **critic_args).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    **critic_args).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process for exploration
        self.noise = OUNoise(action_size, sigma=NOISE_SD)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.device)

    def reset(self):
        """Reset state of agent."""
        self.noise.reset()

    def save_weights(self, path):
        """Save local network weights.

        Args:
            path (string): File to save to"""
        torch.save(
            {
                'actor_local': self.actor_local.state_dict(),
                'actor_target': self.actor_target.state_dict(),
                'critic_local': self.critic_local.state_dict(),
                'critic_target': self.critic_target.state_dict()
            }, path)

    def load_weights(self, path):
        """Load local network weights.

        Args:
            path (string): File to load weights from"""
        checkpoint = torch.load(path)
        self.actor_local.load_state_dict(checkpoint['actor_local'])
        self.actor_target.load_state_dict(checkpoint['actor_target'])
        self.critic_local.load_state_dict(checkpoint['critic_local'])
        self.critic_target.load_state_dict(checkpoint['critic_target'])

    def act(self, state, add_noise=True):
        """Returns action for given state according to the current policy
            
        Args:
            state (np.ndarray): Current state

        Returns:
            action (np.ndarray): Action tuple
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Temporarily set evaluation mode (no dropout &c) & turn off autograd
        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().detach().numpy()

        # Resume training mode
        self.actor_local.train()

        # Add noise if exploring
        if add_noise:
            action += self.noise.sample()
            # The noise might take us out of range
            action = np.clip(action, -1, 1)

        return action

    def step(self, state, action, reward, next_state, done):
        """Save experience and learn if due.
        Args:
            state (Tensor): Current state
            action (int): Chosen action
            reward (float): Resulting reward
            next_state (Tensor): State after action
            done (bool): True if terminal state
        """
        self.memory.add(state, action, reward, next_state, done)

        # Learn as soon as we have enough stored experiences
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            for _ in range(NUM_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        """Learn from batch of experiences."""
        states, actions, rewards, next_states, dones = experiences

        # region Update Critic
        actions_next = self.actor_target(next_states)
        q_targets_next = self.critic_target(next_states, actions_next)

        q_targets = rewards + (GAMMA * q_targets_next * (1 - dones))

        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)

        # Minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1.0)
        self.critic_optimizer.step()
        # endregion

        # region Update Actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # endregion

        # Update target networks
        soft_update(self.critic_local, self.critic_target, TAU)
        soft_update(self.actor_local, self.actor_target, TAU)
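
A short usage sketch for this agent, assuming its module constants (LR_ACTOR, BUFFER_SIZE, and so on) are defined and a classic OpenAI Gym environment with the 4-tuple step API is available; the environment name and episode count are illustrative.

import gym
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = gym.make("Pendulum-v0")
agent = Agent(env.observation_space.shape[0], env.action_space.shape[0], device)

for episode in range(200):
    state, done = env.reset(), False
    agent.reset()
    while not done:
        action = agent.act(state, add_noise=True).squeeze(0)   # act() returns shape (1, action_size)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state

agent.save_weights("checkpoint.pth")   # later: agent.load_weights("checkpoint.pth")
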
Example #16
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, agent_id, random_seed):
        """Initialize a ddpg_agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            agent_id (int): identifier for this agent
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.agent_id = agent_id

        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Make sure that the target-local model pairs are initialized to the
        # same weights
        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)

        self.noise = OUNoise(action_size, random_seed)

        self.noise_amplification = NOISE_AMPLIFICATION
        self.noise_amplification_decay = NOISE_AMPLIFICATION_DECAY

        ### self._print_network()

    def act(self, state, add_noise=False):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()
            self._decay_noise_amplification()

        return np.clip(action, -1, 1)

    def reset(self):
        """Resets the OU Noise for this agent."""
        self.noise.reset()

    def learn(self, experiences, next_actions, actions_pred):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(next_state) -> action
            critic_target(next_state, next_action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            next_actions (list): next actions computed from each agent
            actions_pred (list): prediction for actions for current states from each agent
        """
        states, actions, rewards, next_states, dones = experiences
        agent_id_tensor = torch.tensor([self.agent_id - 1]).to(device)

        ### Update critic
        self.critic_optimizer.zero_grad()
        Q_targets_next = self.critic_target(next_states, next_actions)
        Q_targets = rewards.index_select(1, agent_id_tensor) + \
               (GAMMA * Q_targets_next *  (1 - dones.index_select(1, agent_id_tensor)))
        Q_expected = self.critic_local(states, actions)
        # Minimize the loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        critic_loss.backward()
        self.critic_optimizer.step()

        ### Update actor
        self.actor_optimizer.zero_grad()
        # Minimize the loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        actor_loss.backward()
        self.actor_optimizer.step()

        ### Update target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def hard_update(self, local_model, target_model):
        """Hard update model parameters.
        θ_target = θ_local
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ * θ_local + (1 - τ) * θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def _decay_noise_amplification(self):
        """Helper for decaying exploration noise amplification."""
        self.noise_amplification *= self.noise_amplification_decay
Example #17
class Actor():
    def __init__(self, action_size, state_size, buffer_size, batch_size, actor_lr,
                 critic_lr, device, weight_decay, tau, shared_memory, noise,
                 share_memory_flag, seed=0):
        self.state_size  = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size  = batch_size
        self.actor_lr = actor_lr
        self.weight_decay = weight_decay
        self.device = device
        self.seed = seed
        self.actor_loss = []
        #self.critic_loss = []
        torch.manual_seed(seed)
        np.random.seed(seed)
        self.tau = tau
        self.noise = OUNoise(self.action_size, self.seed)
        #self.noise = noise
        self.share_memory_flag = share_memory_flag
        if self.share_memory_flag:
            self.memory = shared_memory
        else:
            self.memory = ReplayBuffer(action_size, buffer_size, batch_size, self.device)

        ## Actor
        self.actor_local = ActorNN(self.state_size,self.action_size).to(self.device)
        self.actor_target = ActorNN(self.state_size,self.action_size).to(self.device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr = self.actor_lr)
        ## Critic
        #self.critic_local = Critic(self.state_size,self.action_size).to(self.device)
        #self.critic_target = Critic(self.state_size,self.action_size).to(self.device)
        #self.critic_optimizer = Adam(self.critic_local.parameters(), lr = self.critic_lr,  weight_decay=self.weight_decay)
        # initialize targets same as original networks
        self.hard_update(self.actor_target, self.actor_local)
        #self.hard_update(self.critic_target, self.critic_local)

    def reset(self):
        self.noise.reset()

    def act(self, state, noise=True, sd=1e-4):
        state = torch.from_numpy(state).float().to(self.device)

        self.actor_local.eval()
        with torch.no_grad():
            #print(state.shape)
            action = self.actor_local(state).cpu().data.numpy()
            ##action.cpu().detach().numpy()
        self.actor_local.train()

        if noise:
            #print(type(action))
            #action += np.random.normal(loc=0.0, scale=sd, size=action.size)
            action += self.noise.sample()
        action = np.clip(action, -1, 1).reshape(1, -1)
        return action

    def hard_update(self, target, source):
        """
        Copy network parameters from source to target
        Inputs:
            target (torch.nn.Module): Net to copy parameters to
            source (torch.nn.Module): Net whose parameters to copy
        """
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

    def step(self, state, action, rewards, next_state, done, GAMMA=1.0):
        ## Per the task description, no discount factor is applied (GAMMA defaults to 1.0)
        self.memory.add(state, action, rewards, next_state, done)
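
This wrapper keeps only the actor side of DDPG and can share one replay buffer across several actors via share_memory_flag (the critic code is commented out). Below is a hedged sketch of wiring two such actors to a shared buffer; the ReplayBuffer signature and all numeric values are assumptions.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

shared_memory = ReplayBuffer(2, int(1e5), 128, device)   # one buffer used by every actor

actors = [
    Actor(action_size=2, state_size=24, buffer_size=int(1e5), batch_size=128,
          actor_lr=1e-4, critic_lr=1e-3, device=device, weight_decay=0.0,
          tau=1e-3, shared_memory=shared_memory, noise=None,
          share_memory_flag=True, seed=i)
    for i in range(2)
]
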
Example #18
class Agent():
    '''Interact with and learn from environment.'''
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0  # counter for activating learning every few steps
        self.running_c_loss = 0
        self.running_a_loss = 0
        self.training_cnt = 0

        # Actor network (w/ target network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network (w/ target network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Prioritized replay memory
        self.prioritized_memory = PrioritizedMemory(BATCH_SIZE, BUFFER_SIZE,
                                                    seed)

    def act(self, state, mode):
        '''Returns actions for given state as per current policy.

        Params
        ======
            state (array): current state
            mode (string): train or test
        '''
        state = torch.from_numpy(state).unsqueeze(0).float().to(
            device)  # shape of state (1, state_size)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if mode == 'test':
            return np.clip(action, -1, 1)

        elif mode == 'train':  # if train, then add OUNoise in action
            action += self.noise.sample()
            return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        # add new experience in memory
        self.prioritized_memory.add(state, action, reward, next_state, done)

        # activate learning every few steps
        self.t_step = self.t_step + 1
        if self.t_step % LEARN_EVERY_STEP == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.prioritized_memory) >= BUFFER_SIZE:
                for _ in range(10):  # update 10 times per learning
                    idxes, experiences, is_weights = self.prioritized_memory.sample(
                        device)
                    self.learn(experiences,
                               GAMMA,
                               is_weights=is_weights,
                               leaf_idxes=idxes)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, is_weights, leaf_idxes):
        """
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Policy loss = (1/n)*Q_local(s,a) -> for deterministic policy (no log prob)

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            is_weights (tensor array): importance-sampling weights for prioritized experience replay
            leaf_idxes (numpy array): indexes for update priorities in SumTree
        """

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # TODO: clip rewards to [-1, 1] (currently left unchanged)

        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss

        Q_expected = self.critic_local(states, actions)
        td_errors = (Q_targets - Q_expected).tanh()  # squash TD errors into (-1, 1)
        abs_errors = td_errors.abs().cpu().data.numpy()  # pull back to cpu
        self.prioritized_memory.batch_update(
            leaf_idxes, abs_errors)  # update priorities in SumTree

        c_loss = (is_weights * (td_errors**2)).mean(
        )  # adjust squared TD loss by Importance-Sampling Weights
        self.running_c_loss += float(c_loss.cpu().data.numpy())
        self.training_cnt += 1

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        c_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                       1)  # clip gradient to max 1
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        a_loss = self.critic_local(states, actions_pred)
        a_loss = -a_loss.mean()
        self.running_a_loss += float(a_loss.cpu().data.numpy())

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        a_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(),
                                       1)  # clip gradient to max 1
        self.actor_optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
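
Two details of this agent are easy to miss: learning only starts once the prioritized buffer holds BUFFER_SIZE transitions (see step()), and exploration noise is controlled by the mode string passed to act(). Below is an illustrative set of the module-level settings it references; the values are assumptions.

BUFFER_SIZE = int(1e5)     # learning begins only after the buffer is full (see step())
BATCH_SIZE = 128
GAMMA = 0.99
TAU = 1e-3
LR_ACTOR = 1e-4
LR_CRITIC = 1e-3
WEIGHT_DECAY = 0.0
LEARN_EVERY_STEP = 20      # trigger 10 gradient updates every 20 environment steps

# agent.act(state, mode='train')  -> adds OU noise for exploration
# agent.act(state, mode='test')   -> deterministic policy output
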
Example #19
def train(cfg):
    print('Starting training!\n')
    env = NormalizedActions(gym.make("Pendulum-v0"))

    # Add action-space noise
    ou_noise = OUNoise(env.action_space)

    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent = DDPG(n_states,
                 n_actions,
                 device="cpu",
                 critic_lr=1e-3,
                 actor_lr=1e-4,
                 gamma=0.99,
                 soft_tau=1e-2,
                 memory_capacity=100000,
                 batch_size=128)
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    log_dir = os.path.split(
        os.path.abspath(__file__))[0] + "/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.train_eps + 1):
        state = env.reset()
        ou_noise.reset()
        ep_reward = 0
        for i_step in range(1, cfg.train_steps + 1):
            action = agent.select_action(state)
            action = ou_noise.get_action(action,
                                         i_step)  # the random process from the DDPG paper
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)
            agent.update()
            state = next_state
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' % int(ep_reward),
              'n_steps:', i_step)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        if i_episode == 1:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(0.9 * moving_average_rewards[-1] +
                                          0.1 * ep_reward)
        writer.add_scalars('rewards', {
            'raw': rewards[-1],
            'moving_average': moving_average_rewards[-1]
        }, i_episode)
        writer.add_scalar('steps_of_each_episode', ep_steps[-1], i_episode)
    writer.close()
    print('Training complete!')
    ''' Save the model '''
    if not os.path.exists(SAVED_MODEL_PATH):  # check whether the folder exists
        os.mkdir(SAVED_MODEL_PATH)
    agent.save_model(SAVED_MODEL_PATH + 'checkpoint.pth')
    '''Save rewards and related results'''
    if not os.path.exists(RESULT_PATH):  # check whether the folder exists
        os.mkdir(RESULT_PATH)
    np.save(RESULT_PATH + 'rewards_train.npy', rewards)
    np.save(RESULT_PATH + 'moving_average_rewards_train.npy',
            moving_average_rewards)
    np.save(RESULT_PATH + 'steps_train.npy', ep_steps)
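
train() also depends on a run identifier and two output folders defined at module level, plus a cfg object carrying the episode and step budgets. Below is a hedged sketch of those globals; the exact values and formats are assumptions.

import os
from datetime import datetime
from types import SimpleNamespace

SEQUENCE = datetime.now().strftime("%Y%m%d-%H%M%S")            # run id used in the TensorBoard log path
SAVED_MODEL_PATH = os.path.join(os.getcwd(), "saved_model/")   # checkpoint output folder
RESULT_PATH = os.path.join(os.getcwd(), "results/")            # .npy output folder for rewards/steps

cfg = SimpleNamespace(train_eps=300, train_steps=200)
# train(cfg)
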
Example #20
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.best_score = -np.inf
        self.score = 0
        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.11  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state
        self.score += reward
        if done:
            if self.score > self.best_score:
                self.best_score = self.score

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
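
This Keras-based agent delegates the environment to a task object exposing state_size, action_size, action bounds, reset() and step(). Below is a hedged sketch of the episode loop such a task-based agent is usually driven by; the task API shown here is an assumption.

agent = DDPG(task)   # task is assumed to follow the interface read in __init__ above

for episode in range(500):
    state = agent.reset_episode()      # resets the noise process, the task, and the score
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)   # assumed 3-tuple task API
        agent.step(action, reward, next_state, done)
        state = next_state
    print("episode %4d  score %8.2f  best %8.2f" % (episode, agent.score, agent.best_score))
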
Example #21
class Agent:
    """Initeracts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size

        # Actor Networks both Local and Target.
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Networks both Local and Target.
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)
        self.noise_modulation = 1
        self.noise_decay = NOISE_DECAY

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

        # Count number of steps
        self.n_steps = 0
        self.update_every = UPDATE_EVERY

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer
        to learn."""
        self.memory.add(state, action, reward, next_state, done)

        # Learn if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and self.n_steps % self.update_every == 0:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

        self.noise_modulation *= self.noise_decay
        self.n_steps += 1

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise_modulation * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.n_steps = 0
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value paramters given batch of experience tuples.
        Q_targets = r + gamma * cirtic_target(next_state, actor_state(next)state)
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Update critic
        # Get predicted next-state actions and Q-values from target models.
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)

        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad() # Clear gradient
        critic_loss.backward()            # Backpropagation
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()      # Update parameters

        # Update actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad() # Clear gradient
        actor_loss.backward()            # Backpropagation
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimizer.step()      # Update parameters

        # Now we update the target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)



    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        theta_target = tau * theta_local + (1 - tau) * theta_target

        Params
        ======
            local_model: PyTorch model (weight source)
            target_model: PyTorch model (weight destination)
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
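
Exploration here is scaled by noise_modulation, which is multiplied by NOISE_DECAY once per environment step. A quick illustration of how fast an assumed decay rate shrinks the noise scale:

NOISE_DECAY = 0.999                       # assumed value
for n in (1_000, 5_000, 10_000):
    print(n, round(NOISE_DECAY ** n, 4))  # ~0.3677, ~0.0067, ~0.0 after n steps
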
Example #22
class DDPGAgent:
    '''Class representing the DDPG algorithm'''
    def __init__(self, state_size, action_size, config):
        '''Class constructor and parameters initialization'''
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        print(f'Using {self.device}')

        self.timestep = 0

        seed = config['seed']

        self.gamma = config['gamma']
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = config['number_agents']

        # Learns argmax_a[Q(s, a); theta_mu] = mu(s, a; theta_mu)
        self.learnt_actor = Actor(seed, state_size,
                                  action_size).to(self.device)  # learnt
        self.target_actor = Actor(seed, state_size, action_size).to(
            self.device)  # soft-update tracking
        self.actor_optim = optim.Adam(self.learnt_actor.parameters(),
                                      lr=config['actor_lr'])

        # Learns to evaluate Q(s, mu(s, a); theta_q)
        self.learnt_critic = Critic(seed, state_size, action_size,
                                    1).to(self.device)  # learnt
        self.target_critic = Critic(seed, state_size, action_size,
                                    1).to(self.device)  # soft-update tracking
        self.critic_optim = optim.Adam(self.learnt_critic.parameters(),
                                       lr=config['critic_lr'])

        print(
            f'Summary:\nActor network:\n{self.learnt_actor}\nCritic network:\n{self.learnt_critic}'
        )

        # Note: Could be replaced by parallel env batching
        self.batch_size = config['batch_size']
        self.memory = Memory(config['memory_size'], self.batch_size, seed)
        self.memory.to_device(self.device)

        # Soft-update
        self.tau = config['tau']

        # Noise
        self.noise = OUNoise(action_size, seed)
        self.noise_decay = config['noise_decay']

    def reset(self):
        '''Reset the noise state'''
        self.noise.reset()

    def act(self, states):
        '''Sample an action from the policy'''
        states = torch.tensor(states, dtype=torch.float32, device=self.device)

        self.learnt_actor.eval()
        with torch.no_grad():
            actions = self.learnt_actor(states).cpu().data.numpy()
        self.learnt_actor.train()

        actions += self.noise_decay * self.noise.sample()

        return np.clip(actions, -1, 1)

    def remember(self, states, actions, rewards, next_states, dones):
        '''Populates the replay memory with new batch of data'''
        n = len(states)

        assert (n == len(actions))
        assert (n == len(rewards))
        assert (n == len(next_states))
        assert (n == len(dones))

        for (state, action, reward, next_state,
             done) in zip(states, actions, rewards, next_states, dones):
            self.memory.add(Experience(state, action, reward, next_state,
                                       done))

    def step(self, timestep):
        '''Wraps and controls the training of the function approximators using soft-updating'''
        if len(self.memory
               ) > self.batch_size and self.timestep % LEARN_EVERY == 0:
            for _ in range(ITERS):
                states, actions, rewards, next_states, dones = self.memory.sample(
                )
                self.__learn(states, actions, rewards, next_states, dones)

    def __learn(self, states, actions, rewards, next_states, dones):
        '''Optimizes the function approximators and soft-updates'''

        self.__optimize_critic(states, actions, rewards, next_states, dones)

        self.__optimize_actor(states)

        self.__soft_update(self.learnt_actor, self.target_actor, self.tau)
        self.__soft_update(self.learnt_critic, self.target_critic, self.tau)

        self.noise_decay *= self.noise_decay
        self.reset()

    def __optimize_critic(self, states, actions, rewards, next_states, dones):
        '''Optimizes the critic approximator'''
        best_next_actions = self.target_actor(next_states)
        q_targets = rewards + self.gamma * self.target_critic(
            next_states, best_next_actions) * (1 - dones)

        q_predictions = self.learnt_critic(states, actions)

        self.critic_optim.zero_grad()
        critic_loss = F.mse_loss(q_predictions, q_targets)
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.learnt_critic.parameters(), 1)
        self.critic_optim.step()

    def __optimize_actor(self, states):
        '''Optimizes the actor approximator'''
        best_current_actions = self.learnt_actor(states)
        advantage = -self.learnt_critic(states, best_current_actions).mean()

        self.actor_optim.zero_grad()
        advantage.backward()
        self.actor_optim.step()

    def __soft_update(self, learnt, target, tau):
        '''Soft-updates the target parameters'''
        for learnt_param, target_param in zip(learnt.parameters(),
                                              target.parameters()):
            target_param.data.copy_(tau * learnt_param.data +
                                    (1.0 - tau) * target_param.data)
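
DDPGAgent pulls its hyperparameters from a plain config dict and two module constants, LEARN_EVERY and ITERS. A hedged example of that configuration follows; the keys mirror the lookups in __init__, and the values are placeholders.

LEARN_EVERY = 20   # assumed: learn every 20 calls to step()
ITERS = 10         # assumed: gradient updates per learning trigger

config = {
    'seed': 0,
    'gamma': 0.99,
    'number_agents': 20,
    'actor_lr': 1e-4,
    'critic_lr': 1e-3,
    'batch_size': 128,
    'memory_size': int(1e6),
    'tau': 1e-3,
    'noise_decay': 0.999,
}
# agent = DDPGAgent(state_size=33, action_size=4, config=config)
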
Example #23
class Agent():
    def __init__(self,
                 device,
                 state_size,
                 action_size,
                 actor,
                 critic,
                 action_low=-1.0,
                 action_high=1.0,
                 lrate_critic=10e-3,
                 lrate_actor=10e-4,
                 tau=0.001,
                 gamma=0.99,
                 exploration_mu=0.0,
                 exploration_theta=0.15,
                 noise_decay=1.,
                 exploration_sigma=0.20,
                 restore_path=None,
                 weight_decay=0.,
                 seed=None):

        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_high = action_high
        self.seed = seed if seed else np.random.randint(100)
        self.lrate_critic = lrate_critic
        self.lrate_actor = lrate_actor
        self.tau = tau
        self.gamma = gamma
        self.restore_path = restore_path
        self.device = device
        self.weight_decay = weight_decay
        self.noise_decay = noise_decay

        # actors networks
        self.actor = actor(device,
                           state_size,
                           action_size,
                           low=action_low,
                           high=action_high,
                           seed=self.seed)
        self.actor_target = actor(device,
                                  state_size,
                                  action_size,
                                  low=action_low,
                                  high=action_high,
                                  seed=self.seed)

        # critic networks
        self.critic = critic(device, state_size, action_size, seed=self.seed)
        self.critic_target = critic(device,
                                    state_size,
                                    action_size,
                                    seed=self.seed)

        # restore networks if needed
        if restore_path is not None:
            self.restore(restore_path, True)

        # optimizer
        self.actor_opt = optim.Adam(self.actor.parameters(),
                                    lr=lrate_actor,
                                    weight_decay=self.weight_decay)
        self.critic_opt = optim.Adam(self.critic.parameters(),
                                     lr=lrate_critic,
                                     weight_decay=self.weight_decay)

        # noise
        self.noise = OUNoise(action_size, exploration_mu, exploration_theta,
                             exploration_sigma)
        self.noise_scale = 1.0

        # reset agent for training
        self.reset_episode()
        self.it = 0

    def reset_episode(self):
        self.noise.reset()

    def act(self, state, learn=True):

        if isinstance(state, list):
            state = np.array(state)

        if not learn:
            self.actor.eval()

        with torch.no_grad():
            action = self.actor(self.tensor(state)).cpu().numpy()

        # Add noise when learning for exploration
        if learn:
            action += self.noise.sample() * self.noise_scale
            self.noise_scale = max(self.noise_scale * self.noise_decay, 0.01)

        self.actor.train()
        return np.clip(action, self.action_low, self.action_high)

    def save(self, path):
        dirn = os.path.dirname(path)
        if not os.path.exists(dirn):
            os.mkdir(dirn)
        params = {}
        params['actor'] = self.actor.state_dict()
        params['critic'] = self.critic.state_dict()
        torch.save(params, path)

    def restore(self, path, for_Training=False):

        # Restore only actor for performance
        checkpoint = torch.load(path, map_location=self.device)
        self.actor.load_state_dict(checkpoint['actor'])
        # Restore the remaining networks for further training
        if for_Training:
            self.actor_target.load_state_dict(checkpoint['actor'])
            self.critic.load_state_dict(checkpoint['critic'])
            self.critic_target.load_state_dict(checkpoint['critic'])

    def learn_step(self, replay_buffer):
        # learn from mini-batch of replay buffer
        state_b, action_b, reward_b, next_state_b, done_b = replay_buffer.sample(
        )

        # calculate td target
        with torch.no_grad():
            y_b = reward_b.unsqueeze(1) + self.gamma * \
             self.critic_target(next_state_b, self.actor_target(next_state_b)) * (1-done_b.unsqueeze(1))

        # update critic
        critic_loss = F.smooth_l1_loss(self.critic(state_b, action_b), y_b)
        self.critic.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # update actor
        action = self.actor(state_b)
        actor_loss = -self.critic(state_b, action).mean()
        self.actor.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # soft-update the target networks (actor and critic)
        self.soft_update()

    def soft_update(self):
        """Soft update of target network
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def tensor(self, x):
        return torch.from_numpy(x).float().to(torch.device(self.device))
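
Unlike most of the agents above, this one keeps the replay buffer outside the class: the caller stores transitions and invokes learn_step(replay_buffer) when a batch is available. A hedged training-loop sketch follows; the actor/critic factories, the buffer's add/sample interface, and the environment are all assumptions borrowed from the surrounding examples.

batch_size = 128
buffer = ReplayBuffer(action_size, int(1e6), batch_size, device)   # must sample torch tensors
agent = Agent(device, state_size, action_size, actor=ActorNet, critic=CriticNet)

for episode in range(300):
    state = env.reset()
    agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state, learn=True)
        next_state, reward, done, _ = env.step(action)
        buffer.add(state, action, reward, next_state, done)
        if len(buffer) > batch_size:
            agent.learn_step(buffer)
        state = next_state
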
Example #24
class Environment:
    """ Train & simulate wrapper for Atari-DQN
    Args:
        params: dictionary of parameters
            memory_size : size of replay memory. 100000 needs almost 25GB memory, recommend reduce it if you need
            exploration_step : pure exploration step
            gamma : discount rate
            tau: parameter for soft update
            lr_actor: learning rate for actor network
            lr_critic: learning rate for critic network
        device_name : name of device(normally cpu:0 or gpu:0)
    """
    def __init__(self, params, device_name):
        self.env = gym.make('Pendulum-v0')
        self.ddpg = DDPG(input_dim=self.env.observation_space.shape[0],
                         action_dim=self.env.action_space.shape[0],
                         action_scale=(self.env.action_space.low[0],
                                       self.env.action_space.high[0]),
                         memory_size=params["memory_size"],
                         gamma=params["gamma"],
                         tau=params["tau"],
                         learning_rate_actor=params["lr_actor"],
                         learning_rate_critic=params["lr_critic"],
                         device_name=device_name)
        self.ddpg.build()
        self.ddpg.summary()

        self.random_process = OUNoise(size=self.env.action_space.shape[0])

        # total step operated
        self.i_step = 0

    def load(self, global_step="latest"):
        """ Load saved weights for ddpg
        Args:
            global_step : load specific step, if "latest" load latest one
        """
        self.ddpg.load(global_step)

    def save(self):
        """ Save current weight of ddpg layers
        """
        self.ddpg.save()

    def train(self,
              episode,
              max_step,
              minibatch_size,
              render=False,
              verbose=1,
              val_epi=5,
              saving=False):
        """run the game with training network
        Args:
            episode : number of train episodes
            max_step : maximum step for each episode
            minibatch_size : minibatch size for replay memory training
            render : whether to show game simulating graphic
            verbose : for which step it will print the loss and accuracy (and saving)
            val_epi : number of episode for validation
            saving: whether to save checkpoint or not
        """
        losses = []
        episode_return = []
        verbose_return = []
        episode_return_val = []

        tr = trange(episode, desc="")
        for i_episode in tr:
            return_episode = 0
            observation = self.env.reset()
            self.random_process.reset()

            for t in range(max_step):
                self.i_step += 1
                if render:
                    self.env.render()

                X = observation.astype(np.float32)
                action_policy = self.ddpg.get_action(tf.convert_to_tensor(X))
                action_policy += self.random_process.sample()
                action_policy = np.clip(action_policy,
                                        self.env.action_space.low[0],
                                        self.env.action_space.high[0])
                observation, reward, done, info = self.env.step(action_policy)
                return_episode += reward

                X_next = observation.astype(np.float32)
                self.ddpg.replay_memory.append(
                    (X, action_policy, reward, X_next, done))
                # training step
                if len(self.ddpg.replay_memory) > minibatch_size:
                    X_batch, action_batch, reward_batch, X_next_batch, done_batch = self.ddpg.replay_memory.get_batch(
                        minibatch_size)
                    loss_critic, loss_actor = self.ddpg.train(
                        X_batch, action_batch, reward_batch, X_next_batch,
                        done_batch)
                    losses.append((loss_critic, loss_actor))

                if done:
                    break

            episode_return.append(return_episode)
            verbose_return.append(return_episode)
            tr.set_description("%.4f" %
                               (sum(episode_return) / len(episode_return)))

            if i_episode == 0 or ((i_episode + 1) % verbose == 0):
                if len(self.ddpg.replay_memory) <= minibatch_size:
                    stage_tooltip = "EXPLORATION"
                    print(Fore.RED + "[EPISODE %3d / STEP %5d] - %s" %
                          (i_episode + 1, self.i_step, stage_tooltip))
                    print(Fore.GREEN + "Learned Step : %4d" %
                          (self.ddpg.global_step))
                    print(Fore.BLUE + "AVG   Return         : %.4f" %
                          (sum(verbose_return) / len(verbose_return)))
                    print(Fore.BLUE + "MAX   Return         : %.4f" %
                          (max(verbose_return)))
                    continue
                else:
                    stage_tooltip = "TRAINING"
                losses_critic = [l[0] for l in losses]
                losses_actor = [l[1] for l in losses]

                # validation
                returns = []
                for epi_val in range(val_epi):
                    return_episode_val = 0
                    observation = self.env.reset()

                    for t in range(max_step):
                        if render:
                            self.env.render()

                        action_policy = self.ddpg.get_action(
                            tf.convert_to_tensor(observation.astype(
                                np.float32)))
                        observation, reward, done, info = self.env.step(
                            action_policy)
                        return_episode_val += reward

                        if done:
                            # print(Fore.GREEN + "EPISODE %3d: REWARD: %s" % (i_episode, return_episode))
                            returns.append(return_episode_val)
                            break

                print(Fore.RED + "[EPISODE %3d / STEP %5d] - %s" %
                      (i_episode + 1, self.i_step, stage_tooltip))
                print(Fore.GREEN + "Learned Step : %4d" %
                      (self.ddpg.global_step))
                print(Fore.BLUE + "AVG   Return         : %.4f" %
                      (sum(verbose_return) / len(verbose_return)))
                print(Fore.BLUE + "MAX   Return         : %.4f" %
                      (max(verbose_return)))
                print(Fore.LIGHTYELLOW_EX + "AVG   LOSS Actor     :  %.4f" %
                      (sum(losses_actor) / len(losses_actor)))
                print(Fore.LIGHTYELLOW_EX + "AVG   LOSS Critic    :  %.4f" %
                      (sum(losses_critic) / len(losses_critic)))
                print(Fore.LIGHTRED_EX + "AVG VAL[%2d]   Return : %.4f" %
                      (val_epi, sum(returns) / len(returns)))
                print(Fore.LIGHTRED_EX + "MAX VAL[%2d]   Return : %.4f" %
                      (val_epi, max(returns)))
                verbose_return = []
                losses = []
                episode_return_val.append(sum(returns) / len(returns))

                if saving:
                    self.save()

                time.sleep(1)

        return episode_return

    def simulate(self, episode, max_step=1000, render=False):
        """Run the game with existing dqn network
        Args:
            episode : number of train episodes
            max_step : maximum step for each episode
            render : whether to show game simulating graphic
        """
        returns = []
        for i_episode in range(episode):
            return_episode = 0
            observation = self.env.reset()

            for t in range(max_step):
                if render:
                    self.env.render()

                action_policy = self.ddpg.get_action(
                    tf.convert_to_tensor(observation.astype(np.float32)))
                observation, reward, done, info = self.env.step(action_policy)
                return_episode += reward

                if done:
                    print(Fore.GREEN + "EPISODE %3d: REWARD: %s" %
                          (i_episode, return_episode))
                    returns.append(return_episode)
                    break

        print(Fore.RED + "AVG REWARD : %s" % (sum(returns) / len(returns)))
        print(Fore.BLUE + "MAX REWARD : %s" % (max(returns)))
Example #25
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed, agent_id):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, 256, 256,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, 256, 256,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # critic input: concatenated observations and actions of both agents
        critic_state_size = (state_size + action_size) * 2

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(critic_state_size, 256, 256,
                                   random_seed).to(device)
        self.critic_target = Critic(critic_state_size, 256, 256,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # agent id
        self.id_agent = agent_id

        # set weights the same for both models
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

    def act(self, state, noise_counter, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            if noise_counter < NOISE_LEVEL:
                action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, all_actions, all_next_actions,
              agent_id):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
            all_actions (list): each agent's current actions
            all_next_actions (list): each agent's next actions
            agent_id (int): id of this agent, needed to distinguish between agents
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        agent_id = torch.tensor([agent_id]).to(device)
        all_next_actions = torch.cat(all_next_actions, dim=1).to(device)
        # Get predicted next-state actions and Q values from target models
        with torch.no_grad():
            Q_targets_next = self.critic_target(next_states, all_next_actions)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards.index_select(
            1, agent_id) + (gamma * Q_targets_next *
                            (1 - dones.index_select(1, agent_id)))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = [
            a if i == self.id_agent else a.detach()
            for i, a in enumerate(all_actions)
        ]
        actions_pred = torch.cat(actions_pred, dim=1).to(device)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        """Copy weights from source to target network,
        modified version of agent.soft_update()"""
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
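The OUNoise process used by these agents is referenced but never defined in the snippets; below is only a minimal Ornstein-Uhlenbeck sketch consistent with the constructor calls above (size, seed), where the mu/theta/sigma defaults are assumptions:

import copy
import random
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck noise process (sketch; parameter defaults are assumed)."""
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal noise state to the mean mu."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(x.shape)
        self.state = x + dx
        return self.state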
Example #26
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, agent_id, args):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = args['seed']
        self.device = args['device']
        self.args = args

        # Actor Network (w/ Target Network)
        self.actor_network = ActorNetwork(state_size, action_size,
                                          args).to(self.device)
        self.actor_target = ActorNetwork(state_size, action_size,
                                         args).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_network.parameters(),
                                          lr=args['LR_ACTOR'])

        # Training takes too long --> load model weights from a previous run (took > 24 hours on my machine)
        if not agent_id:
            self.actor_network.load_state_dict(torch.load(
                args['agent_p0_path']),
                                               strict=False)
            self.actor_target.load_state_dict(torch.load(
                args['agent_p0_path']),
                                              strict=False)
        else:
            self.actor_network.load_state_dict(torch.load(
                args['agent_p1_path']),
                                               strict=False)
            self.actor_target.load_state_dict(torch.load(
                args['agent_p1_path']),
                                              strict=False)

        # Replay memory
        self.memory = ReplayBuffer(action_size, args['BUFFER_SIZE'],
                                   args['BATCH_SIZE'], self.seed)

        # Noise process
        self.noise = OUNoise(action_size, self.seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory

        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.args['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.train(experiences)

    def act(self, current_state):

        self.actor_network.eval()

        input_state = torch.from_numpy(current_state).float().to(
            self.device)

        with torch.no_grad():
            action = self.actor_network(input_state).cpu().data.numpy()

        self.actor_network.train()

        action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def train(self, experiences):

        global states_
        global next_states_
        global actions_
        global max_min_actions_vector
        global max_min_states_vector

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #

        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)
            Q_targets_next = mCritic.target(next_states, actions_next)

            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = mCritic.network(states, actions)
        mCritic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        mCritic.optimizer.zero_grad()
        mCritic_loss.backward()
        mCritic.optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_network(states)
        actor_loss = -mCritic.network(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(mCritic.network, mCritic.target, TAU)
        self.soft_update(self.actor_network, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
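The Agent above trains against a shared, module-level critic (mCritic) that is not included in the snippet; the following is only a sketch of the structure its attribute accesses imply (mCritic.network, mCritic.target, mCritic.optimizer). The network architecture and the args['LR_CRITIC'] key are assumptions.

import torch
import torch.nn as nn
import torch.optim as optim
from types import SimpleNamespace

class SharedCriticNet(nn.Module):
    """Illustrative stand-in for the critic architecture (not shown in the snippet)."""
    def __init__(self, state_size, action_size, hidden=256):
        super().__init__()
        self.fc1 = nn.Linear(state_size + action_size, hidden)
        self.fc2 = nn.Linear(hidden, 1)

    def forward(self, states, actions):
        x = torch.relu(self.fc1(torch.cat((states, actions), dim=1)))
        return self.fc2(x)

def make_shared_critic(state_size, action_size, args):
    """Build the mCritic namespace (network / target / optimizer) the Agent expects."""
    network = SharedCriticNet(state_size, action_size).to(args['device'])
    target = SharedCriticNet(state_size, action_size).to(args['device'])
    target.load_state_dict(network.state_dict())  # hard copy at initialization
    optimizer = optim.Adam(network.parameters(), lr=args['LR_CRITIC'])
    return SimpleNamespace(network=network, target=target, optimizer=optimizer)

# mCritic = make_shared_critic(state_size, action_size, args)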
class DDPGAgent:
    '''Class representing the DDPG algorithm'''
    def __init__(self, seed, state_size, action_size, num_agents, device,
                 config):
        '''Class constructor and parameters initialization'''
        self.device = device

        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size

        self.gamma = config['gamma']

        # Actor: learns mu(s; theta_mu) ≈ argmax_a Q(s, a)
        self.learnt_actor = Actor(seed, state_size,
                                  action_size).to(self.device)  # learnt
        self.target_actor = Actor(seed, state_size, action_size).to(
            self.device)  # soft-update tracking
        self.actor_optim = optim.Adam(self.learnt_actor.parameters(),
                                      lr=config['actor_lr'])

        # Critic: learns to evaluate Q(s, mu(s); theta_q)
        self.learnt_critic = Critic(seed, state_size * num_agents,
                                    action_size * num_agents,
                                    num_agents).to(self.device)  # learnt
        self.target_critic = Critic(seed, state_size * num_agents,
                                    action_size * num_agents, num_agents).to(
                                        self.device)  # soft-update tracking
        self.critic_optim = optim.Adam(self.learnt_critic.parameters(),
                                       lr=config['critic_lr'])

        print(
            f'Summary:\nActor network:\n{self.learnt_actor}\nCritic network:\n{self.learnt_critic}'
        )

        # Soft-update
        self.tau = config['tau']

        # Noise
        self.noise = OUNoise(action_size, seed)
        self.noise_decay = config['noise_decay']

        self.hard_copy_weights(self.learnt_actor, self.target_actor)
        self.hard_copy_weights(self.learnt_critic, self.target_critic)

    def reset_noise(self):
        '''Reset the noise state'''
        self.noise.reset()

    # Note: Decentralized actors (execution)
    def act(self, state):
        '''Sample an action from the policy'''
        state = torch.tensor(state, dtype=torch.float32, device=self.device)

        self.learnt_actor.eval()
        with torch.no_grad():
            actions = self.learnt_actor(state).cpu().data.numpy()
        self.learnt_actor.train()

        actions += self.noise_decay * self.noise.sample()

        return np.clip(actions, -1, 1)

    # Note: Centralized critic (training)
    def step(self, best_current_actions, best_next_actions, states, actions,
             rewards, next_states, dones):
        '''Optimizes the function approximators and applies soft-updates'''

        self.__optimize_critic(best_next_actions, states, actions, rewards,
                               next_states, dones)

        self.__optimize_actor(best_current_actions, states)

        self.__soft_update(self.learnt_actor, self.target_actor, self.tau)
        self.__soft_update(self.learnt_critic, self.target_critic, self.tau)

        self.noise_decay *= 0.9999
        #self.reset_noise()

    def __optimize_critic(self, best_next_actions, states, actions, rewards,
                          next_states, dones):
        '''Optimizes the critic approximator'''
        with torch.no_grad():
            q_targets = self.target_critic(next_states, best_next_actions)
        q_targets = rewards + self.gamma * q_targets * (1 - dones)
        q_predictions = self.learnt_critic(states, actions)

        self.critic_optim.zero_grad()
        critic_loss = F.mse_loss(q_predictions, q_targets.detach())
        critic_loss.backward()
        # Note: Control the magnitude of the gradient
        torch.nn.utils.clip_grad_norm_(self.learnt_critic.parameters(), 0.5)
        self.critic_optim.step()

    def __optimize_actor(self, best_current_actions, states):
        '''Optimizes the actor approximator'''
        advantage = -self.learnt_critic(states, best_current_actions).mean()

        self.actor_optim.zero_grad()
        advantage.backward()
        self.actor_optim.step()

    def hard_copy_weights(self, learnt, target):
        """ Copy weights from source to target network (part of initialization)"""
        for learnt_param, target_param in zip(learnt.parameters(),
                                              target.parameters()):
            target_param.data.copy_(learnt_param.data)

    def __soft_update(self, learnt, target, tau):
        '''Soft-updates the target parameters'''
        for learnt_param, target_param in zip(learnt.parameters(),
                                              target.parameters()):
            target_param.data.copy_(tau * learnt_param.data +
                                    (1.0 - tau) * target_param.data)
Example #28
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_shape,
                 action_size,
                 num_agents,
                 buffer_size,
                 batch_size,
                 gamma,
                 tau,
                 learning_rate_actor,
                 learning_rate_critic,
                 device,
                 update_every=1,
                 random_seed=42):
        """Initialize an Agent object.

        Params
        ======
            state_shape: shape of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents acting in the environment
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): used for soft update of target parameters
            learning_rate_actor (float): learning rate for the actor
            learning_rate_critic (float): learning rate for the critic
            device (torch.Device): pytorch device
            update_every (int): how many time steps between network updates
            random_seed (int): random seed
        """
        self.state_shape = state_shape
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.device = device
        self.update_every = update_every
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(action_size, random_seed).to(device)
        self.actor_target = Actor(action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=learning_rate_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(action_size, random_seed).to(device)
        self.critic_target = Critic(action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=learning_rate_critic,
                                           weight_decay=0)

        # Noise process
        self.noise = OUNoise(size=action_size, seed=random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size,
                                   buffer_size,
                                   batch_size,
                                   device=device,
                                   seed=random_seed)

        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""

        next_state_torch = torch.from_numpy(next_state).float().to(self.device)
        reward_torch = torch.from_numpy(np.array(reward)).float().to(
            self.device)
        done_torch = torch.from_numpy(np.array(done).astype(
            np.uint8)).float().to(self.device)
        state_torch = torch.from_numpy(state).float().to(self.device)
        action_torch = torch.from_numpy(action).float().to(self.device)

        self.actor_target.eval()
        self.critic_target.eval()
        self.critic_local.eval()
        with torch.no_grad():
            action_next = self.actor_target(next_state_torch)
            Q_target_next = self.critic_target(next_state_torch, action_next)
            Q_target = reward_torch + (self.gamma * Q_target_next *
                                       (1 - done_torch))
            Q_expected = self.critic_local(state_torch, action_torch)
        self.actor_target.train()
        self.critic_target.train()
        self.critic_local.train()

        # TD error used as the priority in the prioritized replay buffer
        error = (Q_expected - Q_target).squeeze().cpu().data.numpy()

        # Add the experience, tagged with its priority error, to the prioritized replay buffer
        #for i in np.arange(len(reward)):
        self.memory.add(error, state, action, reward, next_state, done)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory."""
        # Save experience / reward
        self.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            if len(self.memory) > self.batch_size:
                experiences, idxs, is_weights = self.memory.sample()
                self.learn(experiences, idxs, is_weights)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, idxs, is_weights):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        # Importance-sampling weighted MSE: keep the per-sample losses
        # (reduction='none') so each weight scales its own sample
        is_weights = torch.from_numpy(is_weights).float().to(self.device).view(-1, 1)
        critic_loss = (is_weights *
                       F.mse_loss(Q_expected, Q_targets,
                                  reduction='none')).mean()

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()

        #gradient clipping
        #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)

        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        #.......................update priorities in prioritized replay buffer.......#
        #Calculate errors used in prioritized replay buffer
        errors = (Q_expected - Q_targets).squeeze().cpu().data.numpy()

        # update priority
        for i in range(self.batch_size):
            idx = idxs[i]
            self.memory.update(idx, errors[i])

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
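The prioritized replay buffer behind memory.add(error, ...), memory.sample() and memory.update(idx, error) is not shown here; a minimal sketch of the proportional prioritization it appears to implement, where the epsilon/alpha/beta constants are assumptions rather than values from the source:

import numpy as np

PER_EPS, PER_ALPHA, PER_BETA = 0.01, 0.6, 0.4  # assumed constants

def priority(td_error):
    """Priority of a transition from its TD error: p = (|delta| + eps)^alpha."""
    return (np.abs(td_error) + PER_EPS) ** PER_ALPHA

def importance_weights(sample_probs, buffer_len):
    """Importance-sampling weights w = (N * P(i))^(-beta), normalized by their maximum."""
    w = (buffer_len * sample_probs) ** (-PER_BETA)
    return w / w.max()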
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 random_seed,
                 device,
                 lr_actor,
                 lr_critic,
                 weight_decay_critic,
                 batch_size,
                 buffer_size,
                 gamma,
                 tau,
                 update_every,
                 n_updates,
                 eps_start,
                 eps_end,
                 eps_decay):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.t_step = 0
        self.device = device
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay_critic = weight_decay_critic
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every
        self.n_updates = n_updates
        self.eps = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(self.device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(self.device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, random_seed, self.device)

    def step(self, state, action, reward, next_state, done, agent_number):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.t_step += 1
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory and at interval settings
        if len(self.memory) > self.batch_size:
            if self.t_step % self.update_every == 0:
                for _ in range(self.n_updates):
                    experiences = self.memory.sample()
                    self.learn(experiences, self.gamma, agent_number)

    def act(self, states, add_noise):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(self.device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.eps * self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)

        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)

        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)

        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)

        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # Update epsilon noise value
        self.eps = max(self.eps_end, self.eps_decay*self.eps)
        # self.eps = self.eps - (1/self.eps_decay)
        # if self.eps < self.eps_end:
        #     self.eps = self.eps_end

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
Example #30
class DDPGAgent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 memory,
                 device='cpu',
                 params=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            memory (obj): Memory buffer to sample
            device (str): device string between cuda:0 and cpu
            params (dict): hyper-parameters
        """
        self.state_size = state_size
        self.action_size = action_size
        self.device = device
        self.step_t = 0
        self.update_every = params['update_every']

        # Set parameters
        self.gamma = params['gamma']
        self.tau = params['tau']
        self.seed = random.seed(params['seed'])

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, params['seed'],
                                 params['actor_units'][0],
                                 params['actor_units'][1]).to(device)
        self.actor_target = Actor(state_size, action_size, params['seed'],
                                  params['actor_units'][0],
                                  params['actor_units'][1]).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=params['lr_actor'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, params['seed'],
                                   params['critic_units'][0],
                                   params['critic_units'][1]).to(device)
        self.critic_target = Critic(state_size, action_size, params['seed'],
                                    params['critic_units'][0],
                                    params['critic_units'][1]).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=params['lr_critic'],
                                           weight_decay=params['weight_decay'])

        # Noise process
        self.noise = OUNoise(action_size,
                             params['seed'],
                             theta=params['noise_theta'],
                             sigma=params['noise_sigma'])

        # Replay memory
        self.memory = memory

    def store_weights(self, filenames):
        """Store weights of Actor/Critic

        Params
        ======
            filenames (list): string of filename to store weights of actor and critic
                              filenames[0] = actor weights
                              filenames[1] = critic weights
        """
        torch.save(self.actor_local.state_dict(), filenames[0])
        torch.save(self.critic_local.state_dict(), filenames[1])

    def load_weights(self, filenames):
        """Load weights of Actor/Critic

        Params
        ======
            filenames (list): string of filename to load weights of actor and critic
                              filenames[0] = actor weights
                              filenames[1] = critic weights
        """
        self.actor_local.load_state_dict(torch.load(filenames[0]))
        self.critic_local.load_state_dict(torch.load(filenames[1]))

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        self.step_t = (self.step_t + 1) % self.update_every

        # Learn, if enough samples are available in memory
        if self.step_t == 0 and len(
                self.memory) > self.memory.get_batch_size():
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
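An illustrative hyper-parameter dictionary for the DDPGAgent above; the keys follow the constructor's accesses, while the values are placeholders rather than settings from the source:

params = {
    'seed': 0,
    'gamma': 0.99,
    'tau': 1e-3,
    'update_every': 1,
    'actor_units': (256, 128),
    'critic_units': (256, 128),
    'lr_actor': 1e-4,
    'lr_critic': 1e-3,
    'weight_decay': 0.0,
    'noise_theta': 0.15,
    'noise_sigma': 0.2,
}
# agent = DDPGAgent(state_size, action_size, memory, device='cpu', params=params)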