Example #1
class ActorAgent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, num_agents, noise,
                 learning_rate, memory, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents in the environment
            noise: noise process shared by the agents
            learning_rate (float): learning rate for the actor optimizer
            memory: shared replay buffer
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.noise = noise
        self.learning_rate = learning_rate
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.learning_rate)

        # Noise process (note: this replaces the noise object passed in via the constructor)
        self.noise = OUNoise(action_size, seed=random_seed)

        # Replay memory
        #self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        self.memory = memory

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()
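These snippets are taken from larger projects and omit their module preamble. A minimal sketch of what they assume (standard imports plus a shared `device`; the torch-based examples also need their own `Actor`, `Critic`, `OUNoise`, and `ReplayBuffer` definitions, and the Keras-based Example #6 additionally needs `import csv` and its Keras model classes):

# Assumed preamble (not part of the original examples)
import copy
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")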
Example #2
class DDPG:
    def __init__(self,
                 in_actor,
                 out_actor,
                 in_critic,  # e.g. = n_agent * (state_size + action_size)
                 lr_actor=1e-4,
                 lr_critic=1e-3,  # the critic should learn faster than the actor, hence the larger lr
                 random_seed=2):
        self.state_size = in_actor
        self.action_size = out_actor
        self.seed = random.seed(random_seed)

        self.params = {"lr_actor": lr_actor,
                       "lr_critic": lr_critic,
                       "optimizer": "adam"}

        self.local_actor = Actor(in_shape=in_actor, out_shape=out_actor).to(device)
        self.target_actor = Actor(in_shape=in_actor, out_shape=out_actor).to(device)
        self.actor_optimizer = optim.Adam(self.local_actor.parameters(), lr=lr_actor)

        # For each agent, the critic takes the global observations and actions of all agents as input
        # and outputs the action-value Q, e.g. in_critic = n_agent * (state_size + action_size)
        self.local_critic = Critic(in_shape=in_critic).to(device)
        self.target_critic = Critic(in_shape=in_critic).to(device)
        self.critic_optimizer = optim.Adam(self.local_critic.parameters(), lr=lr_critic)

        # Initialize the target networks with the same weights as the local networks (hard copy)
        hard_update_A_from_B(self.target_actor, self.local_actor)
        hard_update_A_from_B(self.target_critic, self.local_critic)

        # Noise process
        self.noise = OUNoise(out_actor, scale=1.0)

    def act(self, obs, noise_scale=0.0):
        obs = obs.to(device)
        # debug noise
        # noise = torch.from_numpy(noise_scale*0.5*np.random.randn(1, self.action_size)).float().to(device)
        # action = self.local_actor(obs) + noise
        action = self.local_actor(obs) + noise_scale * self.noise.noise().to(device)
        return action

    def target_act(self, obs, noise_scale=0.0):
        obs = obs.to(device)
        # noise = torch.from_numpy(noise_scale*0.5 * np.random.randn(1, self.action_size)).float().to(device)
        # action = self.target_actor(obs) + noise_scale * noise
        action = self.target_actor(obs) + noise_scale * self.noise.noise().to(device)
        return action

    def reset(self):
        self.noise.reset()
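`hard_update_A_from_B` is called above but not shown. A minimal sketch consistent with its usage here (copy network B's weights into network A):

def hard_update_A_from_B(A, B):
    """Hard copy: overwrite A's parameters with B's (sketch, not the original helper)."""
    for a_param, b_param in zip(A.parameters(), B.parameters()):
        a_param.data.copy_(b_param.data)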
Example #3
class AgentCommon():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents in the environment
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        # Noise process
        #self.noise = OUNoise(action_size, random_seed)
        self.noise = OUNoise((self.num_agents, action_size), seed = random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        
        self.actorL = ActorAgent(state_size, action_size, num_agents, self.noise,
                                 LR_ACTOR, self.memory, random_seed)
        self.actorR = ActorAgent(state_size, action_size, num_agents, self.noise,
                                 LR_ACTOR, self.memory, random_seed)
        self.sharedcritic = CriticAgent(state_size, action_size, num_agents,
                                        LR_CRITIC, WEIGHT_DECAY, TAU, random_seed)
    
    def step(self, state, action, reward, next_state, done):
        self.actorL.step(state[0], action[0], reward[0], next_state[0], done[0])
        self.actorR.step(state[1], action[1], reward[1], next_state[1], done[1])
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences1 = self.memory.sample()
            experiences2 = self.memory.sample()
            self.sharedcritic.learn(self.actorL, experiences1, GAMMA)
            self.sharedcritic.learn(self.actorR, experiences2, GAMMA)

    def act(self, state, add_noise=True):
        actionL = self.actorL.act(state[0], add_noise=add_noise)
        actionR = self.actorR.act(state[1], add_noise=add_noise)
        return [actionL, actionR]
    
    def reset(self):
        self.noise.reset()
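The capitalized names used in this and later examples (`BUFFER_SIZE`, `BATCH_SIZE`, `GAMMA`, `TAU`, `LR_ACTOR`, `LR_CRITIC`, `WEIGHT_DECAY`, `UPDATE_EVERY`, `UPDATE_RATE`) are module-level hyperparameters defined elsewhere in each project. Typical DDPG values, shown only as a placeholder; the original projects may use different numbers:

# Hyperparameters assumed by the snippets (values are illustrative)
BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # L2 weight decay for the critic optimizer
UPDATE_EVERY = 1         # learn every N environment steps
UPDATE_RATE = 1          # gradient updates per learning step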
Example #4
class SharedCritic():
    def __init__(self, state_size, action_size, random_seed, num_agents):
        self.num_agents = num_agents
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
        self.noise = OUNoise(action_size, random_seed)
        self.actors = [
            ActorAgent(i, state_size, action_size, random_seed, LR_ACTOR,
                       self.noise, self.memory) for i in range(num_agents)
        ]
        self.critic = CriticAgent(state_size, action_size, random_seed,
                                  LR_CRITIC, WEIGHT_DECAY, TAU)
        self.count = 0

    def act(self, states, add_noise=True):
        actions = []
        for actor, state in zip(self.actors, states):
            action = actor.act(state, add_noise=add_noise)
            actions.append(action)
        #return np.array(actions).reshape(1, -1) # reshape 2x2 into 1x4 dim vector
        return actions

    def reset(self):
        self.noise.reset()

    def step(self, states, actions, rewards, next_states, dones):
        for actor, state, action, reward, next_state, done in zip(
                self.actors, states, actions, rewards, next_states, dones):
            actor.step(state, action, reward, next_state, done)

        self.count = (self.count + 1) % UPDATE_EVERY
        if len(self.memory) > BATCH_SIZE:
            if self.count == 0:
                for actor in self.actors:
                    experiences = self.memory.sample()
                    self.critic.learn(actor, experiences, GAMMA)
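`ReplayBuffer` is used throughout but never shown. A minimal sketch matching the `(action_size, buffer_size, batch_size, seed)` constructor and the `add`/`sample`/`__len__` calls above, reusing the assumed preamble; the field names are assumptions:

from collections import deque, namedtuple

class ReplayBuffer:
    """Fixed-size buffer of experience tuples (sketch, not the original class)."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", field_names=["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences and return them as torch tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)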
Example #5
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 num_agents,
                 state_size,
                 action_size,
                 gamma,
                 tau,
                 learning_rate_actor,
                 learning_rate_critic,
                 weight_decay,
                 device,
                 random_seed=42):
        """Initialize an Agent object (used my MultiAgent for MADDPG).

        Params
        ======
            num_agents (int): number of agents acting in the environment
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            gamma (float): discount factor
            tau (float): used for soft update of target parameters
            learning_rate_actor (float): learning rate for the actor
            learning_rate_critic (float): learning rate for the critic
            weight_decay (float): weight decay for the optimizers
            device (torch.device): PyTorch device
            random_seed (int): random seed
        """

        self.gamma = gamma
        self.tau = tau
        self.device = device
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=learning_rate_actor,
                                          weight_decay=weight_decay)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(num_agents, state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(num_agents, state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=learning_rate_critic,
                                           weight_decay=weight_decay)  # e.g. 0.0001

        # Noise process
        self.noise = OUNoise(size=action_size, seed=random_seed)

        self.timestep = 0

    def act(self, state, epsilon=1, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample() * epsilon
        return np.clip(action, -1, 1)

    def reset(self):
        """Resets the noise"""
        self.noise.reset()

    def learn(self, index, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            index (int): Index of the current agent
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        all_states = torch.cat(states, dim=1).to(self.device)
        all_next_states = torch.cat(next_states, dim=1).to(self.device)
        all_actions = torch.cat(actions, dim=1).to(self.device)

        actions_next = actions.copy()

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next[index] = self.actor_target(next_states[index])
        all_actions_next = torch.cat(actions_next, dim=1).to(self.device)
        Q_targets_next = self.critic_target(all_next_states, all_actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards[index] + (gamma * Q_targets_next *
                                      (1 - dones[index]))
        # Compute critic loss
        Q_expected = self.critic_local(all_states, all_actions)

        huber_loss = torch.nn.SmoothL1Loss()
        critic_loss = huber_loss(Q_expected, Q_targets.detach())
        #critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = actions.copy()
        actions_pred[index] = self.actor_local(states[index])
        all_actions_pred = torch.cat(actions_pred, dim=1).to(self.device)
        actor_loss = -self.critic_local(all_states, all_actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
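`OUNoise` appears with several different constructor signatures across these examples (`OUNoise(size, seed)`, `OUNoise((num_agents, action_size), seed)`, a Keras variant with explicit `mu/theta/sigma`, and one variant exposing `.noise()` instead of `.sample()`). A minimal Ornstein-Uhlenbeck sketch for the `(size, seed)` form; the parameter values are assumptions:

class OUNoise:
    """Ornstein-Uhlenbeck noise process (sketch, not the original class)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state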
Example #6
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, engine):
        self.task = engine
        self.width = engine.width
        self.height = engine.height
        self.state_size = engine.state_size
        self.action_size = engine.action_size
        self.action_low = engine.action_low
        self.action_high = engine.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high,self.width,self.height)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high,self.width,self.height)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,self.width,self.height)
        self.critic_target = Critic(self.state_size, self.action_size,self.width,self.height)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)
        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        action = self.actor_local.model.predict(state.reshape(1, self.state_size))[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def act1(self, state):
        """Returns the greedy (argmax) action for the given state, without exploration noise."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state.reshape(1, self.state_size))[0]
        return np.argmax(action)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None]).reshape(-1, self.state_size)
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None]).reshape(-1, self.state_size)

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Debug output: dump the target Q values to a file
        file_output = 'data1.txt'
        with open(file_output, 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(np.array(Q_targets_next))

        # Compute Q targets for current states and train the local critic
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def reset_episode(self):
        self.noise.reset()
        self.task.clear()
        self.last_state = self.task.board
        return self.task.board
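Example #7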
class TD3MultiAgent:
    def __init__(self):
        self.max_action = 1
        self.policy_freq = 2
        self.policy_freq_it = 0
        self.batch_size = 512
        self.discount = 0.99

        self.device = 'cuda'

        self.state_dim = 24
        self.action_dim = 2
        self.policy_noise = 0.1
        self.agents = 1

        # Number of transitions to collect before the learned policy is used for acting
        self.random_period = 1e4

        self.tau = 5e-3

        self.replay_buffer = ReplayBuffer(int(1e5))

        self.actor = Actor(self.state_dim, self.action_dim, self.max_action).to(self.device)
        self.actor_target = Actor(self.state_dim, self.action_dim, self.max_action).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-4)
#         self.actor.load_state_dict(torch.load('actor2.pth'))
#         self.actor_target.load_state_dict(torch.load('actor2.pth'))

        self.noise = OUNoise(2, 32)

        self.critic = Critic(48, self.action_dim).to(self.device)
        self.critic_target = Critic(48, self.action_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)

    
    def select_action_with_noise(self, state, i):
        # Act with the learned policy only after an initial period of random exploration
        if len(self.replay_buffer) > self.random_period:
            state = torch.FloatTensor(state[i, :]).to(self.device)
            action = self.actor(state).cpu().data.numpy()

            if self.policy_noise != 0:
                action = action + self.noise.sample()
            return action.clip(-self.max_action, self.max_action)
        else:
            return self.noise.sample()
   
    
    def step(self, i):
        if len(self.replay_buffer) > self.random_period / 2:
            # Sample a mini batch
            s, a, r, s_, d = self.replay_buffer.sample(self.batch_size)

            # Per-agent views of the batch
            state = torch.FloatTensor(s[:, i, :]).to(self.device)
            action = torch.FloatTensor(a[:, i, :]).to(self.device)
            next_state = torch.FloatTensor(s_[:, i, :]).to(self.device)

            # Joint (all-agent) views for the centralized critic
            a_state = torch.FloatTensor(s).to(self.device).reshape(-1, 48)
            a_action = torch.FloatTensor(a).to(self.device).reshape(-1, 4)
            a_next_state = torch.FloatTensor(s_).to(self.device).reshape(-1, 48)

            done = torch.FloatTensor(1 - d[:, i]).to(self.device)
            reward = torch.FloatTensor(r[:, i]).to(self.device)

            # Select action with the actor target and apply clipped noise (TD3 target policy smoothing)
            noise = torch.FloatTensor(a[:, i, :]).data.normal_(0, self.policy_noise).to(self.device)
            noise = noise.clamp(-0.1, 0.1)
            next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)
            # Compute the target Q value

            target_Q1, target_Q2 = self.critic_target(a_next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward.reshape(-1,1) + (done.reshape(-1,1) * self.discount * target_Q).detach()
            
            # Get current Q estimates
            current_Q1, current_Q2 = self.critic(a_state, action)

            # Compute critic loss
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Delayed policy updates
            if self.policy_freq_it % self.policy_freq == 0:
                # Compute actor loss
                actor_loss = -self.critic.Q1(a_state, self.actor(state)).mean()
                # Optimize the actor 
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Update the frozen target models
                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)


            self.policy_freq_it += 1
        
        return True
        
    
    def reset(self):
        self.policy_freq_it = 0
        self.noise.reset()
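The TD3 agent above expects a twin-headed critic that returns two Q estimates from `forward` and exposes a `Q1` helper for the actor update. A minimal sketch of such a critic; the hidden-layer sizes are assumptions:

import torch.nn as nn

class Critic(nn.Module):
    """Twin Q-network for TD3 (sketch, not the original class)."""

    def __init__(self, state_dim, action_dim, hidden=256):
        super().__init__()
        self.q1 = nn.Sequential(
            nn.Linear(state_dim + action_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1))
        self.q2 = nn.Sequential(
            nn.Linear(state_dim + action_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1))

    def forward(self, state, action):
        """Return both Q estimates for the clipped double-Q target."""
        sa = torch.cat([state, action], dim=1)
        return self.q1(sa), self.q2(sa)

    def Q1(self, state, action):
        """Return only the first Q estimate (used for the actor loss)."""
        sa = torch.cat([state, action], dim=1)
        return self.q1(sa)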
Example #8
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 agents=2,
                 every=4,
                 updates=4):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            agents (int): number of agents in the environment
            every (int): learn every `every` environment steps
            updates (int): number of gradient updates per learning step
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        np.random.seed(random_seed)
        self.agents = agents
        self.every = every
        self.updates = updates
        self.steps = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noises = OUNoise((agents, action_size))

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   device)

    def load_actor(self, model_file: str):
        self.actor_local.load_state_dict(
            torch.load(model_file, map_location=device))
        self.actor_local.to(device)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.steps += 1
        for i in range(self.agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and self.steps % self.every == 0:
            self.steps = 0
            for _ in range(self.updates):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noises.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noises.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)  # Gradient clipping
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
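`Actor(state_size, action_size, seed)` is the deterministic policy network shared by most of the torch examples. A minimal sketch with a tanh output in [-1, 1]; the layer sizes and seeding are assumptions:

import torch.nn as nn

class Actor(nn.Module):
    """Deterministic policy network (sketch, not the original class)."""

    def __init__(self, state_size, action_size, seed, fc1_units=256, fc2_units=128):
        super().__init__()
        torch.manual_seed(seed)
        self.net = nn.Sequential(
            nn.Linear(state_size, fc1_units), nn.ReLU(),
            nn.Linear(fc1_units, fc2_units), nn.ReLU(),
            nn.Linear(fc2_units, action_size), nn.Tanh())

    def forward(self, state):
        """Map a state (or batch of states) to an action in [-1, 1]."""
        return self.net(state)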
Example #9
class Agent():
    def __init__(self, state_size, action_size):

        # Constants
        self.buffer_size = int(1e6)
        self.batch_size = 128
        self.learning_rate = 1e-4
        self.learn_every = 2
        self.learning_rounds = 4

        self.gamma = 0.99
        self.tau = 1e-3

        self.t = 0
        self.state_size = state_size
        self.action_size = action_size
        self.eps = 5.0
        self.eps_decay = 1 / (300 * self.learning_rounds)

        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.learning_rate)

        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.learning_rate)

        self.noise = OUNoise((1, action_size))
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size)

    def step(self, state, action, reward, next_state, done, agent_number):
        self.t += 1

        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.batch_size and self.t % self.learn_every == 0:
            for _ in range(self.learning_rounds):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma, agent_number)

    def act(self, states, add_noise):
        states = torch.from_numpy(states).to(device).float()

        # Get the actions for this agent
        with torch.no_grad():
            actions = self.actor_local(
                states.squeeze()).unsqueeze(0).cpu().data.numpy()

        if add_noise:
            actions += self.eps * self.noise.sample()

        actions = np.clip(actions, -1, 1)
        return actions

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        states, actions, rewards, next_states, dones = experiences

        # Find the best action according to target network
        actions_next = self.actor_target(next_states)
        if agent_number == 0:
            # This agent owns the first two action columns; keep the other agent's actions on the right
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            # This agent owns the last two action columns; keep the other agent's actions on the left
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)

        # Compute Q targets for current states
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        # Compute loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip the gradients to avoid exploding gradients
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # Find the best action according to local network
        actions_pred = self.actor_local(states)
        if agent_number == 0:
            # Replace this agent's (first two) action columns with the locally predicted actions
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            # Replace this agent's (last two) action columns with the locally predicted actions
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)

        # Compute loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target network ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # Update noise param eps
        self.eps -= self.eps_decay
        self.eps = max(self.eps, 0)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
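The slicing in `learn` above assumes the replay buffer stores the joint action of both agents as one 4-dimensional vector (two action components per agent). A small worked check of that layout; the batch size and values are arbitrary:

# Joint action batch layout: [batch, 4] = [agent0_a0, agent0_a1, agent1_a0, agent1_a1]
batch_size = 3
actions = torch.zeros(batch_size, 4)      # joint actions as stored in the buffer
actions_next = torch.ones(batch_size, 2)  # this agent's re-computed actions

# agent_number == 0: keep the other agent's half on the right
joint_0 = torch.cat((actions_next, actions[:, 2:]), dim=1)
# agent_number == 1: keep the other agent's half on the left
joint_1 = torch.cat((actions[:, :2], actions_next), dim=1)
assert joint_0.shape == (batch_size, 4) and joint_1.shape == (batch_size, 4)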
Example #10
class Agent:
    """Initeracts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed, cfg):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """

        buffer_size = cfg["Agent"]["Buffer_size"]
        batch_size = cfg["Agent"]["Batch_size"]
        gamma = cfg["Agent"]["Gamma"]
        tau = cfg["Agent"]["Tau"]
        lr_actor = cfg["Agent"]["Lr_actor"]
        lr_critic = cfg["Agent"]["Lr_critic"]
        noise_decay = cfg["Agent"]["Noise_decay"]
        weight_decay = cfg["Agent"]["Weight_decay"]
        update_every = cfg["Agent"]["Update_every"]
        noise_min = cfg["Agent"]["Noise_min"]
        noise_initial = cfg["Agent"]["Noise_initial"]
        action_clip = cfg["Agent"]["Action_clip"]

        # Attach some configuration parameters
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every
        self.action_clip = action_clip

        # Actor Networks both Local and Target.
        self.actor_local = Actor(state_size, action_size, random_seed,
                                 cfg).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed,
                                  cfg).to(device)
        self.actor_noise = ActorNoise(state_size, action_size, random_seed,
                                      cfg).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Networks both Local and Target.
        self.critic_local = Critic(state_size, action_size, random_seed,
                                   cfg).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed,
                                    cfg).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, cfg)
        self.noise_modulation = noise_initial
        self.noise_decay = noise_decay
        self.noise_min = noise_min

        # Replay memory
        # self._memory = Memory(capacity=buffer_size, seed=random_seed)
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   random_seed)

        # Count number of steps
        self.n_steps = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer
        to learn."""
        self.memory.add(state, action, reward, next_state, done)

        # Learn if enough samples are available in memory
        if len(self.memory) > self.batch_size and self.n_steps % self.update_every == 0:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.noise_modulation *= self.noise_decay
        self.noise_modulation = max(self.noise_modulation, self.noise_min)
        self.n_steps += 1

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        action = self.actor_local(state).cpu().data.numpy()
        if add_noise:
            # action += self.noise_modulation * self.noise.sample()
            self.actor_noise.reset_parameters()
            self.actor_noise.eval()
            self.hard_update(self.actor_local, self.actor_noise,
                             self.noise_modulation)
            action = self.actor_noise(state).cpu().data.numpy()
            self.actor_noise.train()
        self.actor_local.train()
        return np.clip(action, -self.action_clip, self.action_clip)

    def reset(self):
        self.n_steps = 0
        self.noise.reset()

    def learn(self, experiences):
        """Update policy and value parameters given batch of experience tuples.
        Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Update critic
        # Get predicted next-state actions and Q-values from target models.
        self.actor_target.eval()
        self.critic_target.eval()

        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # We don't want actor_target or critic_target showing up in the graph.
        self.actor_target.train()
        self.critic_target.train()

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)

        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()  # Clear gradient
        critic_loss.backward()  # Backpropagation
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()  # Update parameters

        # Update actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()  # Clear gradient
        actor_loss.backward()  # Backpropagation
        # torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimizer.step()  # Update parameters

        # Now we update the target networks
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        theta_target = tau * theta_local + (1 - tau) * theta_target

        Params
        ======
            local_model: PyTorch model (weight source)
            target_model: PyTorch model (weight destination)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, local_model, noise_model, noise_modulation):
        """Hard update model parameters.
        theta_noise = theta_local + self.noise_modulation * theta_noise

        Params
        ======
            local_model: PyTorch model (weight source)
            noise_model: PyTorch model (weight destination)
        """
        for noise_param, local_param in zip(noise_model.parameters(),
                                            local_model.parameters()):
            noise_param.data.copy_(local_param.data +
                                   noise_modulation * noise_param.data)
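Example #11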
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 mnoise=True,
                 split_state=True):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.mnoise = mnoise
        self.split_state = split_state

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # initialize targets same as original networks
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

        # Noise process
        if self.mnoise:
            self.noise = OUNoise((2, action_size), random_seed)
        else:
            self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, states, actions, rewards, next_states, dones, step):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        if self.split_state:
            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                self.memory.add(state, action, reward, next_state, done)
        else:
            self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""

        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        """
        Copy network parameters from source to target
        Inputs:
            target (torch.nn.Module): Net to copy parameters to
            source (torch.nn.Module): Net whose parameters to copy
        """
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
Example #12
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, random_seed, add_noise=True, PER=False, PSN=True):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            add_noise (bool): whether to add exploration noise
            PER (bool): use prioritized experience replay
            PSN (bool): use parameter-space noise instead of action-space noise
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.add_noise = add_noise

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.PSN = PSN
        if self.add_noise:
          if self.PSN:
            self.noise = PSNoise(state_size, action_size, random_seed)
          else:
            self.noise = OUNoise(action_size, random_seed)
          
        # Replay memory
        self.PER = PER
        if self.PER:
          self.memory = ReplayBufferPE(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, alpha = ALPHA)
          self.beta = BETA_INITIAL
        else:
          self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        
        # Initialize learning steps 
        self.learn_step = 0  
    
    def reset(self):
        if self.add_noise:
          if self.PSN:
            self.noise.reset(self.actor_local)
          else:
            self.noise.reset()    
    
    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        
        if len(self.memory) > BATCH_SIZE:
            # Learn, if enough samples are available in memory for number of timesteps
            for _ in range(STEPS_UPDATE):
              experiences = self.memory.sample()
              self.learn(experiences, GAMMA)
        
        # LEARN_EVERY time steps.
        '''
        self.learn_step = (self.learn_step + 1) % LEARN_EVERY
        if self.learn_step == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                for _ in range(STEPS_UPDATE):
                  experiences = self.memory.sample()
                  self.learn(experiences, GAMMA)            
        '''
    def act(self, state, epsilon = 1, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        
        #If add_noise = True:
        if self.add_noise:
          #Add AS or PS noise:
          if self.PSN:
            # Parameter Space Noise
            if len(self.memory) > BATCH_SIZE:
              # PS noise needs to sample from memory to perturbate actor weights
              self.noise.update_noise(self.actor_local, states_batch = self.memory.sample()[0])
            with torch.no_grad():
              action = self.actor_local(state).cpu().data.numpy()
            self.actor_local.train()
          # Action Space Noise
          else:
            with torch.no_grad():
              action = self.actor_local(state).cpu().data.numpy()
            self.actor_local.train()
            action += self.noise.sample()
        #If add_noise = False, no noise is added
        else:
          with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()

        #For all cases, return clipped action      
        return np.clip(action, -1, 1)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples."""    
        
        states, actions, rewards, next_states, dones = experiences
        
        #Clip rewards
        #rewards_ = torch.clamp(rewards, min=-1., max=1.)
        
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute Q expected
        Q_expected = self.critic_local(states, actions)
        
        # Compute critic loss        
        if self.PER:
          # Update Beta
          self.beta += BETA_INCREMENT
          # Get RB weights
          weights = self.memory.get_weights(self.beta)
          # Clip abs(TD_errors)
          TD_errors = torch.clamp(torch.abs(Q_targets - Q_expected), min=0., max=1.)
          # Update replay buffer with proportional probs
          self.memory.update_priorities(TD_errors)
          #compute weighted mse loss     
          critic_loss = torch.mean(weights * (Q_expected - Q_targets) ** 2)
        else:
          #compute mse loss  
          critic_loss = F.mse_loss(Q_expected, Q_targets)
        
        # Minimize critic loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip critic gradient
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()  # negative sign: gradient ascent
        
        # Minimize actor loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft updates for target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
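`ReplayBufferPE.get_weights` and `update_priorities` are not shown. A minimal sketch of the usual prioritized-replay importance-sampling weights that `get_weights` would return for a sampled batch; the function name and arguments are assumptions:

def importance_sampling_weights(priorities, sampled_idx, beta, alpha=0.6):
    """Compute normalized IS weights w_i = (N * P(i)) ** (-beta) for a sampled minibatch.

    `priorities` are the stored |TD error|-based priorities and `sampled_idx`
    the indices drawn for the current batch (sketch, not the original code).
    """
    scaled = np.asarray(priorities, dtype=np.float64) ** alpha
    probs = scaled / scaled.sum()                    # P(i)
    n = len(scaled)
    weights = (n * probs[sampled_idx]) ** (-beta)    # (N * P(i)) ^ -beta
    weights /= weights.max()                         # normalize for stability
    return torch.from_numpy(weights).float().unsqueeze(1)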
Example #13
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, number_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            number_agents (int): number of agents
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.number_agents = number_agents

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise processes
        self.noise = OUNoise((number_agents, action_size), random_seed)
        #self.noise = GaussianNoise(size=[number_agents,action_size], seed = 0,sigma=2e-1)
        #self.noise = GeometricBrownianNoise(size=[number_agents,action_size], seed = 0,sigma=2e-1)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experiences in replay memory, and use random sample from buffer to learn."""

        # We save experience tuples in the memory for each agent.
        for i in range(self.number_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        # Learn, if enough samples are available in memory (threshold value: BATCH_SIZE) and at learning interval settings
        if len(self.memory) > BATCH_SIZE:
            for _ in range(UPDATE_RATE):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

#     def act(self, states, add_noise=True):
#         """Returns actions for given state as per current policy."""
#                                                                   # The code has been adapted to implement batch normalization.
#         actions = np.zeros((self.number_agents, self.action_size))
#         self.actor_local.eval()
#         with torch.no_grad():
#             for agent_number, state in enumerate(states):
#                 state = torch.from_numpy(state).float().unsqueeze(0).to(device)   # The code has been adapted to implement batch normalization.
#                 action = self.actor_local(state).cpu().data.numpy()
#                 actions[agent_number, :] = action
#         self.actor_local.train()
#         if add_noise:
#             actions += self.noise.sample()
#         return np.clip(actions, -1, 1)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.number_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_number, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_number, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
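
The Agent above references several module-level hyperparameters that are defined elsewhere in the project. A hedged set of typical values follows; the names match the references in the code, but the numbers are placeholders, not the original author's settings.

BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # L2 weight decay for the critic optimizer
UPDATE_RATE = 10         # gradient updates performed per environment step
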
Example #14
class DDPG:
    def __init__(self, config):
        self.config = config
        self.state_size = config.state_size
        self.action_size = config.action_size

        self.actor_local = Actor(self.state_size, self.action_size,
                                 2).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  2).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config.LR_ACTOR)

        self.critic_local = Critic(self.state_size, self.action_size,
                                   2).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    2).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=config.LR_CRITIC,
        )

        self.memory = ReplayBuffer(config.random_seed, config.BUFFER_SIZE)
        self.noise = OUNoise(self.action_size, config.random_seed)

        self.t_step = 0

        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)

    def step(self, states, actions, rewards, next_states, dones):

        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % self.config.UPDATE_EVERY

        if len(self.memory) > self.config.BATCH_SIZE and (self.t_step == 0):

            for i in range(self.config.EPOCH):
                experiences = self.memory.sample(self.config.BATCH_SIZE)
                self.learn(experiences)

    def reset(self):
        self.noise.reset()

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        Q_targets_next = self.critic_target(next_states,
                                            self.actor_target(next_states))
        Q_targets = rewards + (self.config.GAMMA * Q_targets_next *
                               (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        actor_loss = -self.critic_local(states,
                                        self.actor_local(states)).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target,
                         self.config.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.config.TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
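
DDPG above reads every hyperparameter from a single config object. Below is a minimal sketch of such a config as a dataclass; the attribute names follow the accesses in __init__, step and learn, while the values are placeholders.

from dataclasses import dataclass

@dataclass
class Config:
    state_size: int = 33
    action_size: int = 4
    random_seed: int = 2
    LR_ACTOR: float = 1e-4
    LR_CRITIC: float = 1e-3
    BUFFER_SIZE: int = int(1e6)
    BATCH_SIZE: int = 128
    UPDATE_EVERY: int = 20     # learn every UPDATE_EVERY calls to step()
    EPOCH: int = 10            # gradient updates per learning trigger
    GAMMA: float = 0.99
    TAU: float = 1e-3

# agent = DDPG(Config())
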
Example #15
class Agent(object):
    """DDPG Agent that interacts and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 device,
                 actor_args={},
                 critic_args={}):
        """Initializes the DQN agent.

        Args:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            device (torch.device): Device to use for calculations
            actor_args (dict): Arguments describing the actor network
            critic_args (dict): Arguments describing the critic network
        """
        self.state_size = state_size
        """Dimension of each state"""

        self.action_size = action_size
        """Dimension of each action"""

        self.device = device
        """Device to use for calculations"""

        self.t_step = 0
        """Timestep between training updates"""

        # Parameters

        # Actor network
        self.actor_local = Actor(state_size, action_size,
                                 **actor_args).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  **actor_args).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network

        self.critic_local = Critic(state_size, action_size,
                                   **critic_args).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    **critic_args).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process for exploration
        self.noise = OUNoise(action_size, sigma=NOISE_SD)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.device)

    def reset(self):
        """Reset state of agent."""
        self.noise.reset()

    def save_weights(self, path):
        """Save local network weights.

        Args:
            path (string): File to save to"""
        torch.save(
            {
                'actor_local': self.actor_local.state_dict(),
                'actor_target': self.actor_target.state_dict(),
                'critic_local': self.critic_local.state_dict(),
                'critic_target': self.critic_target.state_dict()
            }, path)

    def load_weights(self, path):
        """Load local network weights.

        Args:
            path (string): File to load weights from"""
        checkpoint = torch.load(path)
        self.actor_local.load_state_dict(checkpoint['actor_local'])
        self.actor_target.load_state_dict(checkpoint['actor_target'])
        self.critic_local.load_state_dict(checkpoint['critic_local'])
        self.critic_target.load_state_dict(checkpoint['critic_target'])

    def act(self, state, add_noise=True):
        """Returns action for given state according to the current policy
            
        Args:
            state (np.ndarray): Current state

        Returns:
            action (np.ndarray): Action tuple
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Temporarily set evaluation mode (no dropout &c) & turn off autograd
        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().detach().numpy()

        # Resume training mode
        self.actor_local.train()

        # Add noise if exploring
        if add_noise:
            action += self.noise.sample()
            # The noise might take us out of range
            action = np.clip(action, -1, 1)

        return action

    def step(self, state, action, reward, next_state, done):
        """Save experience and learn if due.
        Args:
            state (Tensor): Current state
            action (int): Chosen action
            reward (float): Resulting reward
            next_state (Tensor): State after action
            done (bool): True if terminal state
        """
        self.memory.add(state, action, reward, next_state, done)

        # Learn as soon as we have enough stored experiences
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            for _ in range(NUM_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        """Learn from batch of experiences."""
        states, actions, rewards, next_states, dones = experiences

        # region Update Critic
        actions_next = self.actor_target(next_states)
        q_targets_next = self.critic_target(next_states, actions_next)

        q_targets = rewards + (GAMMA * q_targets_next * (1 - dones))

        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)

        # Minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1.0)
        self.critic_optimizer.step()
        # endregion

        # region Update Actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # endregion

        # Update target networks
        soft_update(self.critic_local, self.critic_target, TAU)
        soft_update(self.actor_local, self.actor_target, TAU)
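
A short usage sketch for this agent, assuming its module constants (LR_ACTOR, BUFFER_SIZE, and so on) are defined and a classic OpenAI Gym environment with the 4-tuple step API is available; the environment name and episode count are illustrative.

import gym
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = gym.make("Pendulum-v0")
agent = Agent(env.observation_space.shape[0], env.action_space.shape[0], device)

for episode in range(200):
    state, done = env.reset(), False
    agent.reset()
    while not done:
        action = agent.act(state, add_noise=True).squeeze(0)   # act() returns shape (1, action_size)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state

agent.save_weights("checkpoint.pth")   # later: agent.load_weights("checkpoint.pth")
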
Example #16
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, agent_id, random_seed):
        """Initialize a ddpg_agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            agent_id (int): identifier for this agent
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.agent_id = agent_id

        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Make sure that the target-local model pairs are initialized to the
        # same weights
        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)

        self.noise = OUNoise(action_size, random_seed)

        self.noise_amplification = NOISE_AMPLIFICATION
        self.noise_amplification_decay = NOISE_AMPLIFICATION_DECAY

        ### self._print_network()

    def act(self, state, add_noise=False):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()
            self._decay_noise_amplification()

        return np.clip(action, -1, 1)

    def reset(self):
        """Resets the OU Noise for this agent."""
        self.noise.reset()

    def learn(self, experiences, next_actions, actions_pred):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(next_state) -> action
            critic_target(next_state, next_action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            next_actions (list): next actions computed from each agent
            actions_pred (list): prediction for actions for current states from each agent
        """
        states, actions, rewards, next_states, dones = experiences
        agent_id_tensor = torch.tensor([self.agent_id - 1]).to(device)

        ### Update critic
        self.critic_optimizer.zero_grad()
        Q_targets_next = self.critic_target(next_states, next_actions)
        Q_targets = rewards.index_select(1, agent_id_tensor) + \
               (GAMMA * Q_targets_next *  (1 - dones.index_select(1, agent_id_tensor)))
        Q_expected = self.critic_local(states, actions)
        # Minimize the loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        critic_loss.backward()
        self.critic_optimizer.step()

        ### Update actor
        self.actor_optimizer.zero_grad()
        # Minimize the loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        actor_loss.backward()
        self.actor_optimizer.step()

        ### Update target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def hard_update(self, local_model, target_model):
        """Hard update model parameters.
        θ_target = θ_local
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ * θ_local + (1 - τ) * θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def _decay_noise_amplification(self):
        """Helper for decaying exploration noise amplification."""
        self.noise_amplification *= self.noise_amplification_decay
Example #17
class Actor():
    def __init__(self, action_size, state_size, buffer_size, batch_size, actor_lr,
                 critic_lr, device, weight_decay, tau, shared_memory, noise,
                 share_memory_flag, seed=0):
        self.state_size  = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size  = batch_size
        self.actor_lr = actor_lr
        self.weight_decay = weight_decay
        self.device = device
        self.seed = seed
        self.actor_loss = []
        #self.critic_loss = []
        torch.manual_seed(seed)
        np.random.seed(seed)
        self.tau = tau
        self.noise = OUNoise(self.action_size, self.seed)
        #self.noise = noise
        self.share_memory_flag = share_memory_flag
        if self.share_memory_flag:
            self.memory = shared_memory
        else:
            self.memory = ReplayBuffer(action_size, buffer_size, batch_size, self.device)

        ## Actor
        self.actor_local = ActorNN(self.state_size,self.action_size).to(self.device)
        self.actor_target = ActorNN(self.state_size,self.action_size).to(self.device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr = self.actor_lr)
        ## Critic
        #self.critic_local = Critic(self.state_size,self.action_size).to(self.device)
        #self.critic_target = Critic(self.state_size,self.action_size).to(self.device)
        #self.critic_optimizer = Adam(self.critic_local.parameters(), lr = self.critic_lr,  weight_decay=self.weight_decay)
        # initialize targets same as original networks
        self.hard_update(self.actor_target, self.actor_local)
        #self.hard_update(self.critic_target, self.critic_local)

    def reset(self):
        self.noise.reset()

    def act(self, state, noise=True, sd=1e-4):
        state = torch.from_numpy(state).float().to(self.device)

        self.actor_local.eval()
        with torch.no_grad():
            #print(state.shape)
            action = self.actor_local(state).cpu().data.numpy()
            ##action.cpu().detach().numpy()
        self.actor_local.train()

        if noise:
            #print(type(action))
            #action += np.random.normal(loc=0.0, scale=sd, size=action.size)
            action += self.noise.sample()
        action = np.clip(action, -1, 1).reshape(1, -1)
        return action

    def hard_update(self, target, source):
        """
        Copy network parameters from source to target
        Inputs:
            target (torch.nn.Module): Net to copy parameters to
            source (torch.nn.Module): Net whose parameters to copy
        """
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

    def step(self, state, action, rewards, next_state, done, GAMMA=1.0):
        ## Per the task description, no discount factor is applied (GAMMA defaults to 1.0)
        self.memory.add(state, action, rewards, next_state, done)
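
This wrapper keeps only the actor side of DDPG and can share one replay buffer across several actors via share_memory_flag (the critic code is commented out). Below is a hedged sketch of wiring two such actors to a shared buffer; the ReplayBuffer signature and all numeric values are assumptions.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

shared_memory = ReplayBuffer(2, int(1e5), 128, device)   # one buffer used by every actor

actors = [
    Actor(action_size=2, state_size=24, buffer_size=int(1e5), batch_size=128,
          actor_lr=1e-4, critic_lr=1e-3, device=device, weight_decay=0.0,
          tau=1e-3, shared_memory=shared_memory, noise=None,
          share_memory_flag=True, seed=i)
    for i in range(2)
]
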
Example #18
class Agent():
    '''Interact with and learn from environment.'''
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0  # counter for activating learning every few steps
        self.running_c_loss = 0
        self.running_a_loss = 0
        self.training_cnt = 0

        # Actor network (w/ target network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network (w/ target network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Prioritized replay memory
        self.prioritized_memory = PrioritizedMemory(BATCH_SIZE, BUFFER_SIZE,
                                                    seed)

    def act(self, state, mode):
        '''Returns actions for given state as per current policy.

        Params
        ======
            state (array): current state
            mode (string): train or test
        '''
        state = torch.from_numpy(state).unsqueeze(0).float().to(
            device)  # shape of state (1, state_size)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if mode == 'test':
            return np.clip(action, -1, 1)

        elif mode == 'train':  # if train, then add OUNoise in action
            action += self.noise.sample()
            return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        # add new experience in memory
        self.prioritized_memory.add(state, action, reward, next_state, done)

        # activate learning every few steps
        self.t_step = self.t_step + 1
        if self.t_step % LEARN_EVERY_STEP == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.prioritized_memory) >= BUFFER_SIZE:
                for _ in range(10):  # update 10 times per learning
                    idxes, experiences, is_weights = self.prioritized_memory.sample(
                        device)
                    self.learn(experiences,
                               GAMMA,
                               is_weights=is_weights,
                               leaf_idxes=idxes)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, is_weights, leaf_idxes):
        """
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Policy loss = (1/n)*Q_local(s,a) -> for deterministic policy (no log prob)

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            is_weights (tensor array): importance-sampling weights for prioritized experience replay
            leaf_idxes (numpy array): indexes for update priorities in SumTree
        """

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # TODO: clip rewards to [-1, 1] (currently left unchanged)

        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss

        Q_expected = self.critic_local(states, actions)
        td_errors = (Q_targets - Q_expected).tanh()  # squash TD errors into (-1, 1)
        abs_errors = td_errors.abs().cpu().data.numpy()  # pull back to cpu
        self.prioritized_memory.batch_update(
            leaf_idxes, abs_errors)  # update priorities in SumTree

        c_loss = (is_weights * (td_errors**2)).mean(
        )  # adjust squared TD loss by Importance-Sampling Weights
        self.running_c_loss += float(c_loss.cpu().data.numpy())
        self.training_cnt += 1

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        c_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                       1)  # clip gradient to max 1
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        a_loss = self.critic_local(states, actions_pred)
        a_loss = -a_loss.mean()
        self.running_a_loss += float(a_loss.cpu().data.numpy())

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        a_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(),
                                       1)  # clip gradient to max 1
        self.actor_optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
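
Two details of this agent are easy to miss: learning only starts once the prioritized buffer holds BUFFER_SIZE transitions (see step()), and exploration noise is controlled by the mode string passed to act(). Below is an illustrative set of the module-level settings it references; the values are assumptions.

BUFFER_SIZE = int(1e5)     # learning begins only after the buffer is full (see step())
BATCH_SIZE = 128
GAMMA = 0.99
TAU = 1e-3
LR_ACTOR = 1e-4
LR_CRITIC = 1e-3
WEIGHT_DECAY = 0.0
LEARN_EVERY_STEP = 20      # trigger 10 gradient updates every 20 environment steps

# agent.act(state, mode='train')  -> adds OU noise for exploration
# agent.act(state, mode='test')   -> deterministic policy output
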
Example #19
def train(cfg):
    print('Starting training!\n')
    env = NormalizedActions(gym.make("Pendulum-v0"))

    # Add action-space noise
    ou_noise = OUNoise(env.action_space)

    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent = DDPG(n_states,
                 n_actions,
                 device="cpu",
                 critic_lr=1e-3,
                 actor_lr=1e-4,
                 gamma=0.99,
                 soft_tau=1e-2,
                 memory_capacity=100000,
                 batch_size=128)
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    log_dir = os.path.split(
        os.path.abspath(__file__))[0] + "/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.train_eps + 1):
        state = env.reset()
        ou_noise.reset()
        ep_reward = 0
        for i_step in range(1, cfg.train_steps + 1):
            action = agent.select_action(state)
            action = ou_noise.get_action(action,
                                         i_step)  # the random process from the DDPG paper
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)
            agent.update()
            state = next_state
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' % int(ep_reward),
              'n_steps:', i_step)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        if i_episode == 1:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(0.9 * moving_average_rewards[-1] +
                                          0.1 * ep_reward)
        writer.add_scalars('rewards', {
            'raw': rewards[-1],
            'moving_average': moving_average_rewards[-1]
        }, i_episode)
        writer.add_scalar('steps_of_each_episode', ep_steps[-1], i_episode)
    writer.close()
    print('Training complete!')
    ''' Save the model '''
    if not os.path.exists(SAVED_MODEL_PATH):  # check whether the folder exists
        os.mkdir(SAVED_MODEL_PATH)
    agent.save_model(SAVED_MODEL_PATH + 'checkpoint.pth')
    '''Save rewards and related results'''
    if not os.path.exists(RESULT_PATH):  # check whether the folder exists
        os.mkdir(RESULT_PATH)
    np.save(RESULT_PATH + 'rewards_train.npy', rewards)
    np.save(RESULT_PATH + 'moving_average_rewards_train.npy',
            moving_average_rewards)
    np.save(RESULT_PATH + 'steps_train.npy', ep_steps)
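
train() also depends on a run identifier and two output folders defined at module level, plus a cfg object carrying the episode and step budgets. Below is a hedged sketch of those globals; the exact values and formats are assumptions.

import os
from datetime import datetime
from types import SimpleNamespace

SEQUENCE = datetime.now().strftime("%Y%m%d-%H%M%S")            # run id used in the TensorBoard log path
SAVED_MODEL_PATH = os.path.join(os.getcwd(), "saved_model/")   # checkpoint output folder
RESULT_PATH = os.path.join(os.getcwd(), "results/")            # .npy output folder for rewards/steps

cfg = SimpleNamespace(train_eps=300, train_steps=200)
# train(cfg)
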
Example #20
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.best_score = -np.inf
        self.score = 0
        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.11  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state
        self.score += reward
        if done:
            if self.score > self.best_score:
                self.best_score = self.score

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
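
This Keras-based agent delegates the environment to a task object exposing state_size, action_size, action bounds, reset() and step(). Below is a hedged sketch of the episode loop such a task-based agent is usually driven by; the task API shown here is an assumption.

agent = DDPG(task)   # task is assumed to follow the interface read in __init__ above

for episode in range(500):
    state = agent.reset_episode()      # resets the noise process, the task, and the score
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)   # assumed 3-tuple task API
        agent.step(action, reward, next_state, done)
        state = next_state
    print("episode %4d  score %8.2f  best %8.2f" % (episode, agent.score, agent.best_score))
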
Example #21
class Agent:
    """Initeracts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size

        # Actor Networks both Local and Target.
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Networks both Local and Target.
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)
        self.noise_modulation = 1
        self.noise_decay = NOISE_DECAY

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

        # Count number of steps
        self.n_steps = 0
        self.update_every = UPDATE_EVERY

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer
        to learn."""
        self.memory.add(state, action, reward, next_state, done)

        # Learn if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and self.n_steps % self.update_every == 0:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

        self.noise_modulation *= self.noise_decay
        self.n_steps += 1

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise_modulation * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.n_steps = 0
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value paramters given batch of experience tuples.
        Q_targets = r + gamma * cirtic_target(next_state, actor_state(next)state)
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Update critic
        # Get predicted next-state actions and Q-values from target models.
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)

        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad() # Clear gradient
        critic_loss.backward()            # Backpropagation
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()      # Update parameters

        # Update actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad() # Clear gradient
        actor_loss.backward()            # Backpropagation
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimizer.step()      # Update parameters

        # Now we update the target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)



    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        theta_target = tau * theta_local + (1 - tau) * theta_target

        Params
        ======
            local_model: PyTorch model (weight source)
            target_model: PyTorch model (weight destination)
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
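
Exploration here is scaled by noise_modulation, which is multiplied by NOISE_DECAY once per environment step. A quick illustration of how fast an assumed decay rate shrinks the noise scale:

NOISE_DECAY = 0.999                       # assumed value
for n in (1_000, 5_000, 10_000):
    print(n, round(NOISE_DECAY ** n, 4))  # ~0.3677, ~0.0067, ~0.0 after n steps
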
Example #22
class DDPGAgent:
    '''Class representing the DDPG algorithm'''
    def __init__(self, state_size, action_size, config):
        '''Class constructor and parameters initialization'''
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        print(f'Using {self.device}')

        self.timestep = 0

        seed = config['seed']

        self.gamma = config['gamma']
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = config['number_agents']

        # Learns argmax_a[Q(s, a); theta_mu] = mu(s, a; theta_mu)
        self.learnt_actor = Actor(seed, state_size,
                                  action_size).to(self.device)  # learnt
        self.target_actor = Actor(seed, state_size, action_size).to(
            self.device)  # soft-update tracking
        self.actor_optim = optim.Adam(self.learnt_actor.parameters(),
                                      lr=config['actor_lr'])

        # Learns to evaluate Q(s, mu(s, a); theta_q)
        self.learnt_critic = Critic(seed, state_size, action_size,
                                    1).to(self.device)  # learnt
        self.target_critic = Critic(seed, state_size, action_size,
                                    1).to(self.device)  # soft-update tracking
        self.critic_optim = optim.Adam(self.learnt_critic.parameters(),
                                       lr=config['critic_lr'])

        print(
            f'Summary:\nActor network:\n{self.learnt_actor}\nCritic network:\n{self.learnt_critic}'
        )

        # Note: Could be replaced by parallel env batching
        self.batch_size = config['batch_size']
        self.memory = Memory(config['memory_size'], self.batch_size, seed)
        self.memory.to_device(self.device)

        # Soft-update
        self.tau = config['tau']

        # Noise
        self.noise = OUNoise(action_size, seed)
        self.noise_decay = config['noise_decay']

    def reset(self):
        '''Reset the noise state'''
        self.noise.reset()

    def act(self, states):
        '''Sample an action from the policy'''
        states = torch.tensor(states, dtype=torch.float32, device=self.device)

        self.learnt_actor.eval()
        with torch.no_grad():
            actions = self.learnt_actor(states).cpu().data.numpy()
        self.learnt_actor.train()

        actions += self.noise_decay * self.noise.sample()

        return np.clip(actions, -1, 1)

    def remember(self, states, actions, rewards, next_states, dones):
        '''Populates the replay memory with new batch of data'''
        n = len(states)

        assert (n == len(actions))
        assert (n == len(rewards))
        assert (n == len(next_states))
        assert (n == len(dones))

        for (state, action, reward, next_state,
             done) in zip(states, actions, rewards, next_states, dones):
            self.memory.add(Experience(state, action, reward, next_state,
                                       done))

    def step(self, timestep):
        '''Wraps and controls the training of the function approximators using soft-updating'''
        if len(self.memory
               ) > self.batch_size and self.timestep % LEARN_EVERY == 0:
            for _ in range(ITERS):
                states, actions, rewards, next_states, dones = self.memory.sample(
                )
                self.__learn(states, actions, rewards, next_states, dones)

    def __learn(self, states, actions, rewards, next_states, dones):
        '''Optimizes the function approximators and soft-updates'''

        self.__optimize_critic(states, actions, rewards, next_states, dones)

        self.__optimize_actor(states)

        self.__soft_update(self.learnt_actor, self.target_actor, self.tau)
        self.__soft_update(self.learnt_critic, self.target_critic, self.tau)

        self.noise_decay *= self.noise_decay
        self.reset()

    def __optimize_critic(self, states, actions, rewards, next_states, dones):
        '''Optimizes the critic approximator'''
        best_next_actions = self.target_actor(next_states)
        q_targets = rewards + self.gamma * self.target_critic(
            next_states, best_next_actions) * (1 - dones)

        q_predictions = self.learnt_critic(states, actions)

        self.critic_optim.zero_grad()
        critic_loss = F.mse_loss(q_predictions, q_targets)
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.learnt_critic.parameters(), 1)
        self.critic_optim.step()

    def __optimize_actor(self, states):
        '''Optimizes the actor approximator'''
        best_current_actions = self.learnt_actor(states)
        advantage = -self.learnt_critic(states, best_current_actions).mean()

        self.actor_optim.zero_grad()
        advantage.backward()
        self.actor_optim.step()

    def __soft_update(self, learnt, target, tau):
        '''Soft-updates the target parameters'''
        for learnt_param, target_param in zip(learnt.parameters(),
                                              target.parameters()):
            target_param.data.copy_(tau * learnt_param.data +
                                    (1.0 - tau) * target_param.data)
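
DDPGAgent pulls its hyperparameters from a plain config dict and two module constants, LEARN_EVERY and ITERS. A hedged example of that configuration follows; the keys mirror the lookups in __init__, and the values are placeholders.

LEARN_EVERY = 20   # assumed: learn every 20 calls to step()
ITERS = 10         # assumed: gradient updates per learning trigger

config = {
    'seed': 0,
    'gamma': 0.99,
    'number_agents': 20,
    'actor_lr': 1e-4,
    'critic_lr': 1e-3,
    'batch_size': 128,
    'memory_size': int(1e6),
    'tau': 1e-3,
    'noise_decay': 0.999,
}
# agent = DDPGAgent(state_size=33, action_size=4, config=config)
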
Example #23
class Agent():
    def __init__(self,
                 device,
                 state_size,
                 action_size,
                 actor,
                 critic,
                 action_low=-1.0,
                 action_high=1.0,
                 lrate_critic=10e-3,
                 lrate_actor=10e-4,
                 tau=0.001,
                 gamma=0.99,
                 exploration_mu=0.0,
                 exploration_theta=0.15,
                 noise_decay=1.,
                 exploration_sigma=0.20,
                 restore_path=None,
                 weight_decay=0.,
                 seed=None):

        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_high = action_high
        self.seed = seed if seed else np.random.randint(100)
        self.lrate_critic = lrate_critic
        self.lrate_actor = lrate_actor
        self.tau = tau
        self.gamma = gamma
        self.restore_path = restore_path
        self.device = device
        self.weight_decay = weight_decay
        self.noise_decay = noise_decay

        # actors networks
        self.actor = actor(device,
                           state_size,
                           action_size,
                           low=action_low,
                           high=action_high,
                           seed=self.seed)
        self.actor_target = actor(device,
                                  state_size,
                                  action_size,
                                  low=action_low,
                                  high=action_high,
                                  seed=self.seed)

        # critic networks
        self.critic = critic(device, state_size, action_size, seed=self.seed)
        self.critic_target = critic(device,
                                    state_size,
                                    action_size,
                                    seed=self.seed)

        # restore networks if needed
        if restore_path is not None:
            self.restore(restore_path, True)

        # optimizer
        self.actor_opt = optim.Adam(self.actor.parameters(),
                                    lr=lrate_actor,
                                    weight_decay=self.weight_decay)
        self.critic_opt = optim.Adam(self.critic.parameters(),
                                     lr=lrate_critic,
                                     weight_decay=self.weight_decay)

        # noise
        self.noise = OUNoise(action_size, exploration_mu, exploration_theta,
                             exploration_sigma)
        self.noise_scale = 1.0

        # reset agent for training
        self.reset_episode()
        self.it = 0

    def reset_episode(self):
        self.noise.reset()

    def act(self, state, learn=True):

        if isinstance(state, list):
            state = np.array(state)

        if not learn:
            self.actor.eval()

        with torch.no_grad():
            action = self.actor(self.tensor(state)).cpu().numpy()

        # Add noise when learning for exploration
        if learn:
            action += self.noise.sample() * self.noise_scale
            self.noise_scale = max(self.noise_scale * self.noise_decay, 0.01)

        self.actor.train()
        return np.clip(action, self.action_low, self.action_high)

    def save(self, path):
        dirn = os.path.dirname(path)
        if not os.path.exists(dirn):
            os.mkdir(dirn)
        params = {}
        params['actor'] = self.actor.state_dict()
        params['critic'] = self.critic.state_dict()
        torch.save(params, path)

    def restore(self, path, for_Training=False):

        # Restore only actor for performance
        checkpoint = torch.load(path, map_location=self.device)
        self.actor.load_state_dict(checkpoint['actor'])
        # Restore the remaining networks for further training
        if for_Training:
            self.actor_target.load_state_dict(checkpoint['actor'])
            self.critic.load_state_dict(checkpoint['critic'])
            self.critic_target.load_state_dict(checkpoint['critic'])

    def learn_step(self, replay_buffer):
        # learn from mini-batch of replay buffer
        state_b, action_b, reward_b, next_state_b, done_b = replay_buffer.sample(
        )

        # calculate td target
        with torch.no_grad():
            y_b = reward_b.unsqueeze(1) + self.gamma * \
             self.critic_target(next_state_b, self.actor_target(next_state_b)) * (1-done_b.unsqueeze(1))

        # update critic
        critic_loss = F.smooth_l1_loss(self.critic(state_b, action_b), y_b)
        self.critic.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # update actor
        action = self.actor(state_b)
        actor_loss = -self.critic(state_b, action).mean()
        self.actor.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # soft-update the target networks (actor and critic)
        self.soft_update()

    def soft_update(self):
        """Soft update of target network
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def tensor(self, x):
        return torch.from_numpy(x).float().to(torch.device(self.device))
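
Unlike most of the agents above, this one keeps the replay buffer outside the class: the caller stores transitions and invokes learn_step(replay_buffer) when a batch is available. A hedged training-loop sketch follows; the actor/critic factories, the buffer's add/sample interface, and the environment are all assumptions borrowed from the surrounding examples.

batch_size = 128
buffer = ReplayBuffer(action_size, int(1e6), batch_size, device)   # must sample torch tensors
agent = Agent(device, state_size, action_size, actor=ActorNet, critic=CriticNet)

for episode in range(300):
    state = env.reset()
    agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state, learn=True)
        next_state, reward, done, _ = env.step(action)
        buffer.add(state, action, reward, next_state, done)
        if len(buffer) > batch_size:
            agent.learn_step(buffer)
        state = next_state
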
Example #24
class Environment:
    """ Train & simulate wrapper for Atari-DQN
    Args:
        params: dictionary of parameters
            memory_size : size of replay memory. 100000 needs almost 25GB memory, recommend reduce it if you need
            exploration_step : pure exploration step
            gamma : discount rate
            tau: parameter for soft update
            lr_actor: learning rate for actor network
            lr_critic: learning rate for critic network
        device_name : name of device(normally cpu:0 or gpu:0)
    """
    def __init__(self, params, device_name):
        self.env = gym.make('Pendulum-v0')
        self.ddpg = DDPG(input_dim=self.env.observation_space.shape[0],
                         action_dim=self.env.action_space.shape[0],
                         action_scale=(self.env.action_space.low[0],
                                       self.env.action_space.high[0]),
                         memory_size=params["memory_size"],
                         gamma=params["gamma"],
                         tau=params["tau"],
                         learning_rate_actor=params["lr_actor"],
                         learning_rate_critic=params["lr_critic"],
                         device_name=device_name)
        self.ddpg.build()
        self.ddpg.summary()

        self.random_process = OUNoise(size=self.env.action_space.shape[0])

        # total step operated
        self.i_step = 0

    def load(self, global_step="latest"):
        """ Load saved weights for ddpg
        Args:
            global_step : load specific step, if "latest" load latest one
        """
        self.ddpg.load(global_step)

    def save(self):
        """ Save current weight of ddpg layers
        """
        self.ddpg.save()

    def train(self,
              episode,
              max_step,
              minibatch_size,
              render=False,
              verbose=1,
              val_epi=5,
              saving=False):
        """run the game with training network
        Args:
            episode : number of train episodes
            max_step : maximum step for each episode
            minibatch_size : minibatch size for replay memory training
            render : whether to show game simulating graphic
            verbose : for which step it will print the loss and accuracy (and saving)
            val_epi : number of episode for validation
            saving: whether to save checkpoint or not
        """
        losses = []
        episode_return = []
        verbose_return = []
        episode_return_val = []

        tr = trange(episode, desc="")
        for i_episode in tr:
            return_episode = 0
            observation = self.env.reset()
            self.random_process.reset()

            for t in range(max_step):
                self.i_step += 1
                if render:
                    self.env.render()

                X = observation.astype(np.float32)
                action_policy = self.ddpg.get_action(tf.convert_to_tensor(X))
                action_policy += self.random_process.sample()
                action_policy = np.clip(action_policy,
                                        self.env.action_space.low[0],
                                        self.env.action_space.high[0])
                observation, reward, done, info = self.env.step(action_policy)
                return_episode += reward

                X_next = observation.astype(np.float32)
                self.ddpg.replay_memory.append(
                    (X, action_policy, reward, X_next, done))
                # training step
                if len(self.ddpg.replay_memory) > minibatch_size:
                    X_batch, action_batch, reward_batch, X_next_batch, done_batch = self.ddpg.replay_memory.get_batch(
                        minibatch_size)
                    loss_critic, loss_actor = self.ddpg.train(
                        X_batch, action_batch, reward_batch, X_next_batch,
                        done_batch)
                    losses.append((loss_critic, loss_actor))

                if done:
                    break

            episode_return.append(return_episode)
            verbose_return.append(return_episode)
            tr.set_description("%.4f" %
                               (sum(episode_return) / len(episode_return)))

            if i_episode == 0 or ((i_episode + 1) % verbose == 0):
                if len(self.ddpg.replay_memory) <= minibatch_size:
                    stage_tooltip = "EXPLORATION"
                    print(Fore.RED + "[EPISODE %3d / STEP %5d] - %s" %
                          (i_episode + 1, self.i_step, stage_tooltip))
                    print(Fore.GREEN + "Learned Step : %4d" %
                          (self.ddpg.global_step))
                    print(Fore.BLUE + "AVG   Return         : %.4f" %
                          (sum(verbose_return) / len(verbose_return)))
                    print(Fore.BLUE + "MAX   Return         : %.4f" %
                          (max(verbose_return)))
                    continue
                else:
                    stage_tooltip = "TRAINING"
                losses_critic = [l[0] for l in losses]
                losses_actor = [l[1] for l in losses]

                # validation
                returns = []
                for epi_val in range(val_epi):
                    return_episode_val = 0
                    observation = self.env.reset()

                    for t in range(max_step):
                        if render:
                            self.env.render()

                        action_policy = self.ddpg.get_action(
                            tf.convert_to_tensor(observation.astype(
                                np.float32)))
                        observation, reward, done, info = self.env.step(
                            action_policy)
                        return_episode_val += reward

                        if done:
                            # print(Fore.GREEN + "EPISODE %3d: REWARD: %s" % (i_episode, return_episode))
                            returns.append(return_episode_val)
                            break

                print(Fore.RED + "[EPISODE %3d / STEP %5d] - %s" %
                      (i_episode + 1, self.i_step, stage_tooltip))
                print(Fore.GREEN + "Learned Step : %4d" %
                      (self.ddpg.global_step))
                print(Fore.BLUE + "AVG   Return         : %.4f" %
                      (sum(verbose_return) / len(verbose_return)))
                print(Fore.BLUE + "MAX   Return         : %.4f" %
                      (max(verbose_return)))
                print(Fore.LIGHTYELLOW_EX + "AVG   LOSS Actor     :  %.4f" %
                      (sum(losses_actor) / len(losses_actor)))
                print(Fore.LIGHTYELLOW_EX + "AVG   LOSS Critic    :  %.4f" %
                      (sum(losses_critic) / len(losses_critic)))
                print(Fore.LIGHTRED_EX + "AVG VAL[%2d]   Return : %.4f" %
                      (val_epi, sum(returns) / len(returns)))
                print(Fore.LIGHTRED_EX + "MAX VAL[%2d]   Return : %.4f" %
                      (val_epi, max(returns)))
                verbose_return = []
                losses = []
                episode_return_val.append(sum(returns) / len(returns))

                if saving:
                    self.save()

                time.sleep(1)

        return episode_return

    def simulate(self, episode, max_step=1000, render=False):
        """Run the game with existing dqn network
        Args:
            episode : number of train episodes
            max_step : maximum step for each episode
            render : whether to show game simulating graphic
        """
        returns = []
        for i_episode in range(episode):
            return_episode = 0
            observation = self.env.reset()

            for t in range(max_step):
                if render:
                    self.env.render()

                action_policy = self.ddpg.get_action(
                    tf.convert_to_tensor(observation.astype(np.float32)))
                observation, reward, done, info = self.env.step(action_policy)
                return_episode += reward

                if done:
                    print(Fore.GREEN + "EPISODE %3d: REWARD: %s" %
                          (i_episode, return_episode))
                    returns.append(return_episode)
                    break

        print(Fore.RED + "AVG REWARD : %s" % (sum(returns) / len(returns)))
        print(Fore.BLUE + "MAX REWARD : %s" % (max(returns)))
Example #25
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed, agent_id):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, 256, 256,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, 256, 256,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # critic input: concatenated observations and actions of both agents
        critic_state_size = (state_size + action_size) * 2

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(critic_state_size, 256, 256,
                                   random_seed).to(device)
        self.critic_target = Critic(critic_state_size, 256, 256,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # agent id
        self.id_agent = agent_id

        # set weights the same for both models
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

    def act(self, state, noise_counter, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            if noise_counter < NOISE_LEVEL:
                action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, all_actions, all_next_actions,
              agent_id):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
            all_actions (list): each agent's current actions
            all_next_actions (list): each agent's next actions
            agent_id (int): id of this agent, needed to distinguish between agents
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        agent_id = torch.tensor([agent_id]).to(device)
        all_next_actions = torch.cat(all_next_actions, dim=1).to(device)
        # Get predicted next-state actions and Q values from target models
        with torch.no_grad():
            Q_targets_next = self.critic_target(next_states, all_next_actions)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards.index_select(
            1, agent_id) + (gamma * Q_targets_next *
                            (1 - dones.index_select(1, agent_id)))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = [
            a if i == self.id_agent else a.detach()
            for i, a in enumerate(all_actions)
        ]
        actions_pred = torch.cat(actions_pred, dim=1).to(device)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        """Copy weights from source to target network,
        modified version of agent.soft_update()"""
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
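The OUNoise process used by these agents is referenced but never defined in the snippets; below is only a minimal Ornstein-Uhlenbeck sketch consistent with the constructor calls above (size, seed), where the mu/theta/sigma defaults are assumptions:

import copy
import random
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck noise process (sketch; parameter defaults are assumed)."""
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal noise state to the mean mu."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(x.shape)
        self.state = x + dx
        return self.state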
Example #26
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, agent_id, args):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = args['seed']
        self.device = args['device']
        self.args = args

        # Actor Network (w/ Target Network)
        self.actor_network = ActorNetwork(state_size, action_size,
                                          args).to(self.device)
        self.actor_target = ActorNetwork(state_size, action_size,
                                         args).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_network.parameters(),
                                          lr=args['LR_ACTOR'])

        # Training takes too long --> load model weights from a previous run (took > 24 hours on my machine)
        if not agent_id:
            self.actor_network.load_state_dict(torch.load(
                args['agent_p0_path']),
                                               strict=False)
            self.actor_target.load_state_dict(torch.load(
                args['agent_p0_path']),
                                              strict=False)
        else:
            self.actor_network.load_state_dict(torch.load(
                args['agent_p1_path']),
                                               strict=False)
            self.actor_target.load_state_dict(torch.load(
                args['agent_p1_path']),
                                              strict=False)

        # Replay memory
        self.memory = ReplayBuffer(action_size, args['BUFFER_SIZE'],
                                   args['BATCH_SIZE'], self.seed)

        # Noise process
        self.noise = OUNoise(action_size, self.seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory

        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.args['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.train(experiences)

    def act(self, current_state):

        self.actor_network.eval()

        input_state = torch.from_numpy(current_state).float().to(
            self.device)

        with torch.no_grad():
            action = self.actor_network(input_state).cpu().data.numpy()

        self.actor_network.train()

        action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def train(self, experiences):

        global states_
        global next_states_
        global actions_
        global max_min_actions_vector
        global max_min_states_vector

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #

        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)
            Q_targets_next = mCritic.target(next_states, actions_next)

            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = mCritic.network(states, actions)
        mCritic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        mCritic.optimizer.zero_grad()
        mCritic_loss.backward()
        mCritic.optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_network(states)
        actor_loss = -mCritic.network(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(mCritic.network, mCritic.target, TAU)
        self.soft_update(self.actor_network, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
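The Agent above trains against a shared, module-level critic (mCritic) that is not included in the snippet; the following is only a sketch of the structure its attribute accesses imply (mCritic.network, mCritic.target, mCritic.optimizer). The network architecture and the args['LR_CRITIC'] key are assumptions.

import torch
import torch.nn as nn
import torch.optim as optim
from types import SimpleNamespace

class SharedCriticNet(nn.Module):
    """Illustrative stand-in for the critic architecture (not shown in the snippet)."""
    def __init__(self, state_size, action_size, hidden=256):
        super().__init__()
        self.fc1 = nn.Linear(state_size + action_size, hidden)
        self.fc2 = nn.Linear(hidden, 1)

    def forward(self, states, actions):
        x = torch.relu(self.fc1(torch.cat((states, actions), dim=1)))
        return self.fc2(x)

def make_shared_critic(state_size, action_size, args):
    """Build the mCritic namespace (network / target / optimizer) the Agent expects."""
    network = SharedCriticNet(state_size, action_size).to(args['device'])
    target = SharedCriticNet(state_size, action_size).to(args['device'])
    target.load_state_dict(network.state_dict())  # hard copy at initialization
    optimizer = optim.Adam(network.parameters(), lr=args['LR_CRITIC'])
    return SimpleNamespace(network=network, target=target, optimizer=optimizer)

# mCritic = make_shared_critic(state_size, action_size, args)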
class DDPGAgent:
    '''Class representing the DDPG algorithm'''
    def __init__(self, seed, state_size, action_size, num_agents, device,
                 config):
        '''Class constructor and parameters initialization'''
        self.device = device

        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size

        self.gamma = config['gamma']

        # Actor: learns mu(s; theta_mu) ≈ argmax_a Q(s, a)
        self.learnt_actor = Actor(seed, state_size,
                                  action_size).to(self.device)  # learnt
        self.target_actor = Actor(seed, state_size, action_size).to(
            self.device)  # soft-update tracking
        self.actor_optim = optim.Adam(self.learnt_actor.parameters(),
                                      lr=config['actor_lr'])

        # Critic: learns to evaluate Q(s, mu(s); theta_q)
        self.learnt_critic = Critic(seed, state_size * num_agents,
                                    action_size * num_agents,
                                    num_agents).to(self.device)  # learnt
        self.target_critic = Critic(seed, state_size * num_agents,
                                    action_size * num_agents, num_agents).to(
                                        self.device)  # soft-update tracking
        self.critic_optim = optim.Adam(self.learnt_critic.parameters(),
                                       lr=config['critic_lr'])

        print(
            f'Summary:\nActor network:\n{self.learnt_actor}\nCritic network:\n{self.learnt_critic}'
        )

        # Soft-update
        self.tau = config['tau']

        # Noise
        self.noise = OUNoise(action_size, seed)
        self.noise_decay = config['noise_decay']

        self.hard_copy_weights(self.learnt_actor, self.target_actor)
        self.hard_copy_weights(self.learnt_critic, self.target_critic)

    def reset_noise(self):
        '''Reset the noise state'''
        self.noise.reset()

    # Note: Decentralized actors (execution)
    def act(self, state):
        '''Sample an action from the policy'''
        state = torch.tensor(state, dtype=torch.float32, device=self.device)

        self.learnt_actor.eval()
        with torch.no_grad():
            actions = self.learnt_actor(state).cpu().data.numpy()
        self.learnt_actor.train()

        actions += self.noise_decay * self.noise.sample()

        return np.clip(actions, -1, 1)

    # Note: Centralized critic (training)
    def step(self, best_current_actions, best_next_actions, states, actions,
             rewards, next_states, dones):
        '''Optimizes the function approximators and applies soft-updates'''

        self.__optimize_critic(best_next_actions, states, actions, rewards,
                               next_states, dones)

        self.__optimize_actor(best_current_actions, states)

        self.__soft_update(self.learnt_actor, self.target_actor, self.tau)
        self.__soft_update(self.learnt_critic, self.target_critic, self.tau)

        self.noise_decay *= 0.9999
        #self.reset_noise()

    def __optimize_critic(self, best_next_actions, states, actions, rewards,
                          next_states, dones):
        '''Optimizes the critic approximator'''
        with torch.no_grad():
            q_targets = self.target_critic(next_states, best_next_actions)
        q_targets = rewards + self.gamma * q_targets * (1 - dones)
        q_predictions = self.learnt_critic(states, actions)

        self.critic_optim.zero_grad()
        critic_loss = F.mse_loss(q_predictions, q_targets.detach())
        critic_loss.backward()
        # Note: Control the magnitude of the gradient
        torch.nn.utils.clip_grad_norm_(self.learnt_critic.parameters(), 0.5)
        self.critic_optim.step()

    def __optimize_actor(self, best_current_actions, states):
        '''Optimizes the actor approximator'''
        advantage = -self.learnt_critic(states, best_current_actions).mean()

        self.actor_optim.zero_grad()
        advantage.backward()
        self.actor_optim.step()

    def hard_copy_weights(self, learnt, target):
        """ Copy weights from source to target network (part of initialization)"""
        for learnt_param, target_param in zip(learnt.parameters(),
                                              target.parameters()):
            target_param.data.copy_(learnt_param.data)

    def __soft_update(self, learnt, target, tau):
        '''Soft-updates the target parameters'''
        for learnt_param, target_param in zip(learnt.parameters(),
                                              target.parameters()):
            target_param.data.copy_(tau * learnt_param.data +
                                    (1.0 - tau) * target_param.data)
Example #28
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_shape,
                 action_size,
                 num_agents,
                 buffer_size,
                 batch_size,
                 gamma,
                 tau,
                 learning_rate_actor,
                 learning_rate_critic,
                 device,
                 update_every=1,
                 random_seed=42):
        """Initialize an Agent object.

        Params
        ======
            state_shape: shape of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents acting in the environment
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): used for soft update of target parameters
            learning_rate_actor (float): learning rate for the actor
            learning_rate_critic (float): learning rate for the critic
            device (torch.Device): pytorch device
            update_every (int): how many time steps between network updates
            random_seed (int): random seed
        """
        self.state_shape = state_shape
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.device = device
        self.update_every = update_every
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(action_size, random_seed).to(device)
        self.actor_target = Actor(action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=learning_rate_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(action_size, random_seed).to(device)
        self.critic_target = Critic(action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=learning_rate_critic,
                                           weight_decay=0)

        # Noise process
        self.noise = OUNoise(size=action_size, seed=random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size,
                                   buffer_size,
                                   batch_size,
                                   device=device,
                                   seed=random_seed)

        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""

        next_state_torch = torch.from_numpy(next_state).float().to(self.device)
        reward_torch = torch.from_numpy(np.array(reward)).float().to(
            self.device)
        done_torch = torch.from_numpy(np.array(done).astype(
            np.uint8)).float().to(self.device)
        state_torch = torch.from_numpy(state).float().to(self.device)
        action_torch = torch.from_numpy(action).float().to(self.device)

        self.actor_target.eval()
        self.critic_target.eval()
        self.critic_local.eval()
        with torch.no_grad():
            action_next = self.actor_target(next_state_torch)
            Q_target_next = self.critic_target(next_state_torch, action_next)
            Q_target = reward_torch + (self.gamma * Q_target_next *
                                       (1 - done_torch))
            Q_expected = self.critic_local(state_torch, action_torch)
        self.actor_target.train()
        self.critic_target.train()
        self.critic_local.train()

        # TD error used as the priority in the prioritized replay buffer
        error = (Q_expected - Q_target).squeeze().cpu().data.numpy()

        # Add the experience, tagged with its priority error, to the prioritized replay buffer
        #for i in np.arange(len(reward)):
        self.memory.add(error, state, action, reward, next_state, done)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory."""
        # Save experience / reward
        self.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            if len(self.memory) > self.batch_size:
                experiences, idxs, is_weights = self.memory.sample()
                self.learn(experiences, idxs, is_weights)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, idxs, is_weights):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        # Importance-sampling weighted MSE: keep the per-sample losses
        # (reduction='none') so each weight scales its own sample
        is_weights = torch.from_numpy(is_weights).float().to(self.device).view(-1, 1)
        critic_loss = (is_weights *
                       F.mse_loss(Q_expected, Q_targets,
                                  reduction='none')).mean()

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()

        #gradient clipping
        #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)

        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        #.......................update priorities in prioritized replay buffer.......#
        #Calculate errors used in prioritized replay buffer
        errors = (Q_expected - Q_targets).squeeze().cpu().data.numpy()

        # update priority
        for i in range(self.batch_size):
            idx = idxs[i]
            self.memory.update(idx, errors[i])

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
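The prioritized replay buffer behind memory.add(error, ...), memory.sample() and memory.update(idx, error) is not shown here; a minimal sketch of the proportional prioritization it appears to implement, where the epsilon/alpha/beta constants are assumptions rather than values from the source:

import numpy as np

PER_EPS, PER_ALPHA, PER_BETA = 0.01, 0.6, 0.4  # assumed constants

def priority(td_error):
    """Priority of a transition from its TD error: p = (|delta| + eps)^alpha."""
    return (np.abs(td_error) + PER_EPS) ** PER_ALPHA

def importance_weights(sample_probs, buffer_len):
    """Importance-sampling weights w = (N * P(i))^(-beta), normalized by their maximum."""
    w = (buffer_len * sample_probs) ** (-PER_BETA)
    return w / w.max()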
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 random_seed,
                 device,
                 lr_actor,
                 lr_critic,
                 weight_decay_critic,
                 batch_size,
                 buffer_size,
                 gamma,
                 tau,
                 update_every,
                 n_updates,
                 eps_start,
                 eps_end,
                 eps_decay):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.t_step = 0
        self.device = device
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay_critic = weight_decay_critic
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every
        self.n_updates = n_updates
        self.eps = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(self.device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(self.device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, random_seed, self.device)

    def step(self, state, action, reward, next_state, done, agent_number):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.t_step += 1
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory and at interval settings
        if len(self.memory) > self.batch_size:
            if self.t_step % self.update_every == 0:
                for _ in range(self.n_updates):
                    experiences = self.memory.sample()
                    self.learn(experiences, self.gamma, agent_number)

    def act(self, states, add_noise):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(self.device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.eps * self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)

        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)

        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)

        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)

        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # Update epsilon noise value
        self.eps = max(self.eps_end, self.eps_decay*self.eps)
        # self.eps = self.eps - (1/self.eps_decay)
        # if self.eps < self.eps_end:
        #     self.eps = self.eps_end

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
Example #30
class DDPGAgent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 memory,
                 device='cpu',
                 params=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            memory (obj): Memory buffer to sample
            device (str): device string between cuda:0 and cpu
            params (dict): hyper-parameters
        """
        self.state_size = state_size
        self.action_size = action_size
        self.device = device
        self.step_t = 0
        self.update_every = params['update_every']

        # Set parameters
        self.gamma = params['gamma']
        self.tau = params['tau']
        self.seed = random.seed(params['seed'])

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, params['seed'],
                                 params['actor_units'][0],
                                 params['actor_units'][1]).to(device)
        self.actor_target = Actor(state_size, action_size, params['seed'],
                                  params['actor_units'][0],
                                  params['actor_units'][1]).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=params['lr_actor'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, params['seed'],
                                   params['critic_units'][0],
                                   params['critic_units'][1]).to(device)
        self.critic_target = Critic(state_size, action_size, params['seed'],
                                    params['critic_units'][0],
                                    params['critic_units'][1]).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=params['lr_critic'],
                                           weight_decay=params['weight_decay'])

        # Noise process
        self.noise = OUNoise(action_size,
                             params['seed'],
                             theta=params['noise_theta'],
                             sigma=params['noise_sigma'])

        # Replay memory
        self.memory = memory

    def store_weights(self, filenames):
        """Store weights of Actor/Critic

        Params
        ======
            filenames (list): string of filename to store weights of actor and critic
                              filenames[0] = actor weights
                              filenames[1] = critic weights
        """
        torch.save(self.actor_local.state_dict(), filenames[0])
        torch.save(self.critic_local.state_dict(), filenames[1])

    def load_weights(self, filenames):
        """Load weights of Actor/Critic

        Params
        ======
            filenames (list): string of filename to load weights of actor and critic
                              filenames[0] = actor weights
                              filenames[1] = critic weights
        """
        self.actor_local.load_state_dict(torch.load(filenames[0]))
        self.critic_local.load_state_dict(torch.load(filenames[1]))

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        self.step_t = (self.step_t + 1) % self.update_every

        # Learn, if enough samples are available in memory
        if self.step_t == 0 and len(
                self.memory) > self.memory.get_batch_size():
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
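An illustrative hyper-parameter dictionary for the DDPGAgent above; the keys follow the constructor's accesses, while the values are placeholders rather than settings from the source:

params = {
    'seed': 0,
    'gamma': 0.99,
    'tau': 1e-3,
    'update_every': 1,
    'actor_units': (256, 128),
    'critic_units': (256, 128),
    'lr_actor': 1e-4,
    'lr_critic': 1e-3,
    'weight_decay': 0.0,
    'noise_theta': 0.15,
    'noise_sigma': 0.2,
}
# agent = DDPGAgent(state_size, action_size, memory, device='cpu', params=params)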