def __init__(self, game, num_agents, state_size, action_size, name,
             random_seed=0, lr_critic=1e-3, lr_actor=1e-3,
             fc1_units=400, fc2_units=300,
             buffer_size=int(1e6), batch_size=128,
             gamma=0.99, tau=1e-3, max_norm=1.0,
             epsilon_start=1.0, epsilon_end=0.1, epsilon_decay=0.99,
             exploration_mu=0.0, exploration_theta=0.15, exploration_sigma=0.2):
    """Initialize an Agent object.

    Args:
        game (class Game): mediator in the chain-of-responsibility design pattern (broker chain).
        random_seed (int): random seed.
        max_norm (float): value of clip_grad_norm for the critic optimizer.
    """
    super().__init__()

    self.index_agent = None
    self.game = game
    self.num_agents = num_agents
    self.state_size = state_size
    self.action_size = action_size
    self.name = name
    self.seed = random.seed(random_seed)

    self.max_norm = max_norm

    self.epsilon = epsilon_start
    self.epsilon_end = epsilon_end
    self.epsilon_decay = epsilon_decay

    # Actor network (with target network)
    self.actor_local = MADDPGActorVersion3(state_size, action_size,
                                           fc1_units=fc1_units, fc2_units=fc2_units).to(device)
    self.actor_target = MADDPGActorVersion3(state_size, action_size,
                                            fc1_units=fc1_units, fc2_units=fc2_units).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

    # Critic network (with target network)
    self.critic_local = MADDPGCriticVersion4(num_agents, state_size, action_size,
                                             fcs1_units=fc1_units, fc2_units=fc2_units).to(device)
    self.critic_target = MADDPGCriticVersion4(num_agents, state_size, action_size,
                                              fcs1_units=fc1_units, fc2_units=fc2_units).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)

    # Noise process for exploration
    self.noise = OUNoise(self.action_size, exploration_mu, exploration_theta, exploration_sigma)

    # discount factor
    self.gamma = gamma

    # soft-update parameter
    self.tau = tau

    self.batch_size = batch_size
class MADDPGAgentVersion5(BaseAgent):

    def __init__(self, game, num_agents, state_size, action_size, name,
                 random_seed=0, lr_critic=1e-3, lr_actor=1e-3,
                 fc1_units=400, fc2_units=300,
                 buffer_size=int(1e6), batch_size=128,
                 gamma=0.99, tau=1e-3, max_norm=1.0,
                 epsilon_start=1.0, epsilon_end=0.1, epsilon_decay=0.99,
                 exploration_mu=0.0, exploration_theta=0.15, exploration_sigma=0.2):
        """Initialize an Agent object.

        Args:
            game (class Game): mediator in the chain-of-responsibility design pattern (broker chain).
            random_seed (int): random seed.
            max_norm (float): value of clip_grad_norm for the critic optimizer.
        """
        super().__init__()

        self.index_agent = None
        self.game = game
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.name = name
        self.seed = random.seed(random_seed)

        self.max_norm = max_norm

        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay

        # Actor network (with target network)
        self.actor_local = MADDPGActorVersion2(state_size, action_size, random_seed,
                                               fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.actor_target = MADDPGActorVersion2(state_size, action_size, random_seed,
                                                fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic network (with target network)
        self.critic_local = MADDPGCriticVersion3(num_agents, state_size, action_size,
                                                 fcs1_units=fc1_units, fc2_units=fc2_units,
                                                 seed=random_seed).to(device)
        self.critic_target = MADDPGCriticVersion3(num_agents, state_size, action_size,
                                                  fcs1_units=fc1_units, fc2_units=fc2_units,
                                                  seed=random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)

        # Noise process for exploration
        self.noise = OUNoise(self.action_size, exploration_mu, exploration_theta, exploration_sigma)

        # discount factor
        self.gamma = gamma

        # soft-update parameter
        self.tau = tau

        self.batch_size = batch_size

    def step(self, states, actions, rewards, next_states, dones):
        """
        Args:
            states (numpy.array): states.shape[1] = (state_size*num_agents)
            actions (numpy.array): actions.shape[1] = (action_size*num_agents)
            next_states (numpy.array): next_states.shape[1] = (state_size*num_agents)
        """
        self.learn(states, actions, rewards, next_states, dones)

    def act(self, state, add_noise=True):
        """Return actions for the given state. The actor network takes a single agent's state as input."""
        state = torch.from_numpy(state).float().to(device)

        with torch.no_grad():
            self.actor_local.eval()
            action = self.actor_local(state).cpu().data.numpy()
            self.actor_local.train()

        if add_noise:
            action += self.epsilon * self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def forward_all(self, next_states):
        """Get next_actions from all agents via the chain-of-responsibility (broker chain).

        Returns:
            Differentiable tensor of next_actions, concatenated along dim=1.
        """
        q = ActionQuery()

        for i, agent in enumerate(self.game):
            # get next_state_i of agent_i
            n_state = next_states[:, i * self.state_size:(i + 1) * self.state_size]

            # only this agent's actions keep their gradient; the other agents' actions are detached
            detach = agent != self

            # predict next_action and append it to the query
            agent.query(n_state, q, detach)

        return q.next_actions

    def query(self, next_state, q, detach):
        """
        Args:
            q (class ActionQuery): parcel that stores actions
        """
        next_action = self.actor_local(next_state)

        if detach:
            next_action = next_action.detach()

        if q.next_actions is None:
            q.next_actions = next_action
        else:
            q.next_actions = torch.cat((q.next_actions, next_action), dim=1)

    def learn(self, states, actions, rewards, next_states, dones):
        """Update policy and value parameters using the given batch of experience tuples.

        For agent i:
            Q_target_i = r_i + gamma * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> actions for all agents
            critic_target(state, action) -> Q-value

        Args:
            states, actions, rewards, next_states, dones: joint batch for all agents
        """
        # split the joint fields into per-agent slices
        experience_unpacks = ExperienceUnpack(states, actions, rewards, next_states, dones,
                                              self.state_size, self.action_size, self.num_agents)

        # unpack the fields belonging to agent_i
        if self.index_agent is None:
            self.index_agent = self.game.index_of_agent(self)

        states_i, actions_i, rewards_i, next_states_i, dones_i = experience_unpacks[self.index_agent]

        # assert (states_i.shape[1] == self.state_size), 'Wrong shape of states_i'
        # assert (actions_i.shape[1] == self.action_size), 'Wrong shape of actions_i'
        # assert (rewards_i.shape[1] == 1), 'Wrong shape of rewards_i'
        # assert (dones_i.shape[1] == 1), 'Wrong shape of dones_i'

        # train critic
        # loss function = Q_target (TD one-step bootstrapping) - Q_local (current)
        next_actions = self.forward_all(next_states)
        assert next_actions.shape[1] == (self.action_size * self.num_agents), 'Wrong shape of next_actions'

        Q_targets_next = self.critic_target(next_states, next_actions)
        Q_target_i = rewards_i + (self.gamma * Q_targets_next * (1 - dones_i))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_target_i)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), self.max_norm)
        self.critic_optimizer.step()

        # train actor: use the re-predicted joint actions so the gradient flows into this agent's actor
        actions_pred = self.forward_all(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # target networks are updated separately via update_targets()

        # ----- update noise -----
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_end)
        self.noise.reset()

    def update_targets(self):
        # update critic
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        # update actor
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def model_dicts(self):
        return {'critic_{}'.format(self.name): self.critic_target,
                'actor_{}'.format(self.name): self.actor_target}
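# The MADDPG agents above pass an `ActionQuery` object along the broker chain in
# forward_all()/query(), but the class is not defined in this snippet. The sketch
# below is an assumption consistent with that usage (an attribute `next_actions`
# that starts as None and accumulates each agent's action tensor); the definition
# in the original project may differ.
class ActionQuery:
    """Parcel object that accumulates the next-action tensors of all agents."""

    def __init__(self):
        # filled by each agent's query(); tensors are concatenated along dim=1
        self.next_actions = None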
class DDPGAgentVersion1(BaseAgent):

    def __init__(self, state_size, action_size, random_seed,
                 lr_actor=1e-2, lr_critic=1e-2,
                 fc1_units=128, fc2_units=128,
                 buffer_size=int(1e6), batch_size=50,
                 gamma=0.95, tau=1e-2, max_norm=1.0,
                 learn_period=100, learn_sampling_num=50,
                 adam_critic_weight_decay=0.0, name=None,
                 exploration_mu=0.0, exploration_sigma=0.2, exploration_theta=0.15):
        """Initialize an Agent object.

        Args:
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        super().__init__()

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        self.max_norm = max_norm

        self.learn_period = learn_period
        self.learn_sampling_num = learn_sampling_num

        # Actor network (with target network)
        self.actor_local = DDPGActorVersion1(state_size, action_size, random_seed,
                                             fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.actor_target = DDPGActorVersion1(state_size, action_size, random_seed,
                                              fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic network (with target network)
        self.critic_local = DDPGCriticVersion1(state_size, action_size, random_seed,
                                               fcs1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.critic_target = DDPGCriticVersion1(state_size, action_size, random_seed,
                                                fcs1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic,
                                           weight_decay=adam_critic_weight_decay)

        # Noise process for exploration
        self.exploration_mu = exploration_mu
        self.exploration_theta = exploration_theta  # (Timothy Lillicrap, 2016)
        self.exploration_sigma = exploration_sigma  # (Timothy Lillicrap, 2016)
        self.noise = OUNoise(action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed, device)

        # discount factor
        self.gamma = gamma

        # soft-update parameter
        self.tau = tau

        self.batch_size = batch_size
        self.name = name
        self.time_step = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use a random sample from the buffer to learn."""
        self.time_step += 1

        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if (len(self.memory) > self.batch_size) and (self.time_step % self.learn_period == 0):
            for _ in range(self.learn_sampling_num):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Return actions for the given state as per the current policy."""
        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using the given batch of experience tuples.

        Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Args:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tensors
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # train critic
        # loss function = Q_target (TD one-step bootstrapping) - Q_local (current)
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), self.max_norm)
        self.critic_optimizer.step()

        # train actor (policy gradient)
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # update critic_target
        self.soft_update(self.critic_local, self.critic_target, self.tau)

        # update actor_target
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # ----- update noise -----
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def model_dicts(self):
        return {'agent_{}_actor'.format(self.name): self.actor_target,
                'agent_{}_critic'.format(self.name): self.critic_target}
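# The DDPG and MADDPG agents in this file construct an `OUNoise` object that is not
# defined here. The sketch below is an assumption based on the common Ornstein-Uhlenbeck
# implementation used with DDPG (dx = theta * (mu - x) + sigma * N(0, 1)); its constructor
# matches the OUNoise(size, mu, theta, sigma) calls above, while the Brain class further
# below calls OUNoise(action_size, seed) and therefore assumes a different signature.
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state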
class DDPGAgentVersion5(BaseAgent):

    def __init__(self, state_size, action_size, random_seed,
                 lr_actor=1e-2, lr_critic=1e-2,
                 fc1_units=128, fc2_units=128,
                 buffer_size=int(1e6), batch_size=50,
                 gamma=0.95, tau=1e-2, max_norm=1.0,
                 learn_period=100, learn_sampling_num=50,
                 adam_critic_weight_decay=0.0, name=None,
                 exploration_mu=0.0, exploration_sigma=0.2, exploration_theta=0.15,
                 epsilon_start=1.0, epsilon_end=0.1, epsilon_decay=0.99):
        """Initialize an Agent object.

        Args:
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        super().__init__()

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        self.max_norm = max_norm

        self.learn_period = learn_period
        self.learn_sampling_num = learn_sampling_num

        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay

        # Actor network (with target network)
        self.actor_local = DDPGActorVersion1(state_size, action_size, random_seed,
                                             fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.actor_target = DDPGActorVersion1(state_size, action_size, random_seed,
                                              fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic network (with target network)
        self.critic_local = DDPGCriticVersion1(state_size, action_size, random_seed,
                                               fcs1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.critic_target = DDPGCriticVersion1(state_size, action_size, random_seed,
                                                fcs1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic,
                                           weight_decay=adam_critic_weight_decay)

        # Noise process for exploration
        self.exploration_mu = exploration_mu
        self.exploration_theta = exploration_theta  # (Timothy Lillicrap, 2016)
        self.exploration_sigma = exploration_sigma  # (Timothy Lillicrap, 2016)
        self.noise = OUNoise(action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        # self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed, device)
        self.memory = PrioritizedReplayBuffer(action_size, buffer_size, batch_size, random_seed, device)

        # Prioritized replay buffer parameters
        # self.a, self.b = 0.7, 0.5  # rank-based variant
        self.a, self.b = 0.6, 0.4    # proportional variant
        self.e = 1e-3                # 0.01 * (reward of each time step) = 0.01 * 0.1

        # discount factor
        self.gamma = gamma

        # soft-update parameter
        self.tau = tau

        self.batch_size = batch_size
        self.name = name
        self.time_step = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use a random sample from the buffer to learn."""
        self.time_step += 1

        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if (len(self.memory) > self.batch_size) and (self.time_step % self.learn_period == 0):
            for _ in range(self.learn_sampling_num):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Return actions for the given state as per the current policy."""
        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.epsilon * self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using the given batch of experience tuples.

        Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Args:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done, indices, probs)
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, indices, probs = experiences

        # train critic
        # loss function = Q_target (TD one-step bootstrapping) - Q_local (current)
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)

        # per-sample squared error, so the importance-sampling weights can be applied element-wise
        critic_loss = F.mse_loss(Q_expected, Q_targets, reduction='none')

        # compute TD error (delta) for updating the prioritized replay buffer
        abs_td_error = torch.abs(Q_targets - Q_expected)

        # Calculate importance-sampling weights: w_i = (N * P(i))^(-b), normalized by the maximum weight
        if probs:
            weights = (np.array(probs).reshape(-1, 1) * len(self.memory)) ** (-self.b)
            weights /= np.max(weights)
        else:
            weights = np.ones(critic_loss.shape, dtype=np.float64)

        # Calculate weighted loss
        weighted_critic_loss = torch.mean(torch.from_numpy(weights).float().to(device) * critic_loss)

        self.critic_optimizer.zero_grad()
        weighted_critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), self.max_norm)
        self.critic_optimizer.step()

        if indices:
            # convert TD errors to priorities and update them in the buffer
            self.memory.update(indices,
                               list(abs_td_error.detach().to('cpu').numpy().squeeze() ** self.a + self.e))

        # train actor (policy gradient)
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # update critic_target
        self.soft_update(self.critic_local, self.critic_target, self.tau)

        # update actor_target
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # ----- update noise -----
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_end)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def model_dicts(self):
        return {'agent_{}_actor'.format(self.name): self.actor_target,
                'agent_{}_critic'.format(self.name): self.critic_target}
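# The agents above construct `ReplayBuffer` / `PrioritizedReplayBuffer` objects that are
# not included in this snippet. The sketch below is an assumption: a minimal uniform buffer
# matching the ReplayBuffer(action_size, buffer_size, batch_size, seed, device) constructor
# and the add()/sample()/__len__() calls made by DDPGAgentVersion1. The prioritized variant
# used by DDPGAgentVersion5 additionally returns sampling indices and probabilities from
# sample() and exposes update(indices, priorities); it is not sketched here.
import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBuffer:
    """Fixed-size buffer of experience tuples with uniform sampling."""

    def __init__(self, action_size, buffer_size, batch_size, seed, device):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        self.experience = namedtuple('Experience',
                                     ['state', 'action', 'reward', 'next_state', 'done'])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Store a single transition."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw a uniform random batch and return it as float tensors on the target device."""
        experiences = random.sample(self.memory, k=self.batch_size)
        to_tensor = lambda rows: torch.from_numpy(np.vstack(rows)).float().to(self.device)

        states = to_tensor([e.state for e in experiences])
        actions = to_tensor([e.action for e in experiences])
        rewards = to_tensor([e.reward for e in experiences])
        next_states = to_tensor([e.next_state for e in experiences])
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(self.device)

        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)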
class Brain:

    def __init__(self, agent_count, observation_size, action_size,
                 actor_optim_params, critic_optim_params,
                 soft_update_tau, discount_gamma, use_batch_norm, seed,
                 actor_network_states, critic_network_states, device):
        self._soft_update_tau = soft_update_tau
        self._gamma = discount_gamma

        # actor networks
        self._actor_local = ActorNetwork(
            observation_size, action_size, use_batch_norm, seed
        ).to(device)
        self._actor_target = ActorNetwork(
            observation_size, action_size, use_batch_norm, seed
        ).to(device)

        # critic networks
        self._critic_local = CriticNetwork(
            observation_size * agent_count, action_size * agent_count, use_batch_norm, seed
        ).to(device)
        self._critic_target = CriticNetwork(
            observation_size * agent_count, action_size * agent_count, use_batch_norm, seed
        ).to(device)

        # optimizers
        self._actor_optimizer = optim.Adam(
            self._actor_local.parameters(), **actor_optim_params
        )
        self._critic_optimizer = optim.Adam(
            self._critic_local.parameters(), **critic_optim_params
        )

        if actor_network_states is not None:
            self._actor_local.load_state_dict(actor_network_states[0])
            self._actor_target.load_state_dict(actor_network_states[1])

        if critic_network_states is not None:
            self._critic_local.load_state_dict(critic_network_states[0])
            self._critic_target.load_state_dict(critic_network_states[1])

        self.noise = OUNoise(action_size, seed)

    def get_actor_model_states(self):
        return self._actor_local.state_dict(), self._actor_target.state_dict()

    def get_critic_model_states(self):
        return self._critic_local.state_dict(), self._critic_target.state_dict()

    def act(self, observation, target=False, noise=0.0, train=False):
        """
        :param observation: tensor of shape == (b, observation_size)
        :param target: True to evaluate with the target network
        :param noise: OU noise factor
        :param train: True for training mode, else eval mode
        :return: action: tensor of shape == (b, action_size)
        """
        actor = self._actor_target if target else self._actor_local

        if train:
            actor.train()
        else:
            actor.eval()

        action_values = actor(observation)

        if noise > 0:
            noise = torch.tensor(
                noise * self.noise.sample(),
                dtype=observation.dtype,
                device=observation.device
            )
        else:
            noise = 0

        return action_values + noise

    def update_actor(self, all_obs, all_pred_actions):
        """Update the actor.

        :param all_obs: array of shape == (b, observation_size * n_agents)
        :param all_pred_actions: array of shape == (b, action_size * n_agents)
        """
        actor_loss = -self._critic_local(all_obs, all_pred_actions).mean()

        self._actor_optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self._actor_optimizer.step()

    def update_critic(self, rewards, dones, all_obs, all_actions, all_next_obs, all_next_actions):
        """Update the critic. The critic receives the observations and actions of all agents as input.

        :param rewards: array of shape == (b, 1)
        :param dones: array of shape == (b, 1)
        :param all_obs: array of shape == (b, n_agents, observation_size)
        :param all_actions: array of shape == (b, n_agents, action_size)
        :param all_next_obs: array of shape == (b, n_agents, observation_size)
        :param all_next_actions: array of shape == (b, n_agents, action_size)
        """
        with torch.no_grad():
            q_target_next = self._critic_target(all_next_obs, all_next_actions)

        q_target = rewards + self._gamma * q_target_next * (1 - dones)
        q_expected = self._critic_local(all_obs, all_actions)

        # MSE loss, calculated manually due to an mse_loss bug as of PyTorch 0.4.1
        # https://github.com/pytorch/pytorch/issues/10148
        # critic_loss = F.mse_loss(q_expected, q_target.detach())
        critic_loss = ((q_expected - q_target.detach()) ** 2).mean()

        self._critic_optimizer.zero_grad()
        critic_loss.backward()
        self._critic_optimizer.step()

    def update_targets(self):
        self._soft_update(self._actor_local, self._actor_target, self._soft_update_tau)
        self._soft_update(self._critic_local, self._critic_target, self._soft_update_tau)

    def reset(self):
        self.noise.reset()

    @staticmethod
    def _soft_update(local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ * θ_local + (1 - τ) * θ_target

        :param local_model: model weights will be copied from
        :param target_model: model weights will be copied to
        :param tau: interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
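# The Brain class above only contains per-agent update logic; the MADDPG training loop that
# drives it is not part of this snippet. The sketch below is an assumption about how the
# pieces might be wired together for one learning step: the function name maddpg_learn_step,
# the `brains` list, and the batch shapes are illustrative, not taken from the original project.
import torch


def maddpg_learn_step(brains, obs, actions, rewards, next_obs, dones):
    """One hypothetical MADDPG update.

    obs/actions/next_obs: float tensors of shape (b, n_agents, size);
    rewards/dones: float tensors of shape (b, n_agents).
    """
    b = obs.shape[0]

    # joint (flattened) views for the centralized critics
    all_obs = obs.reshape(b, -1)
    all_actions = actions.reshape(b, -1)
    all_next_obs = next_obs.reshape(b, -1)

    # target actions of every agent, used to form the critic target
    all_next_actions = torch.cat(
        [brain.act(next_obs[:, i], target=True) for i, brain in enumerate(brains)], dim=1)

    for i, brain in enumerate(brains):
        # centralized critic update with this agent's reward/done signals
        brain.update_critic(rewards[:, i].unsqueeze(1), dones[:, i].unsqueeze(1),
                            all_obs, all_actions, all_next_obs, all_next_actions)

        # re-predict this agent's action with its local actor; keep the others as sampled
        pred_actions = torch.cat(
            [brain_j.act(obs[:, j]) if j == i else actions[:, j]
             for j, brain_j in enumerate(brains)], dim=1)
        brain.update_actor(all_obs, pred_actions)

        brain.update_targets()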
def main(gpu_num, exp_num, env=None):
    dir_name = 'Data/checkpoint/'
    if not os.path.isdir(dir_name):
        os.makedirs(dir_name)

    with open('log.txt', 'a') as text_file:
        text_file.write('gpu %i exp %i started.\n' % (gpu_num, exp_num))

    with tf.device('/gpu:%i' % gpu_num):
        pms = Paras_base().pms
        pms.save_model = True
        pms.save_dir = dir_name

        env = CartPoleEnv() if env is None else env
        action_size = env.action_space.shape[0]
        observation_size = env.observation_space.shape[0]
        max_action = env.action_space.high[0]

        pms.obs_shape = observation_size
        pms.max_iter = 1000000
        pms.action_shape = action_size
        pms.max_action = max_action
        pms.num_of_paths = 100
        pms.name_scope = 'ddpg'

        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.20
        sess = tf.Session(config=config)

        state_ph = tf.placeholder(tf.float32, [None, pms.obs_shape])
        action_ph = tf.placeholder(tf.float32, [None, pms.action_shape])
        critic_input_ph = tf.concat([state_ph, action_ph], axis=1)

        actor_net = Fcnn(sess, pms.obs_shape, pms.action_shape, [400, 300],
                         name=pms.name_scope + '_actor_r', if_bias=[False],
                         activation=['relu', 'relu', 'None'], input_tf=state_ph)
        actor_target_net = Fcnn(sess, pms.obs_shape, pms.action_shape, [400, 300],
                                name=pms.name_scope + '_actor_t', if_bias=[False],
                                activation=['relu', 'relu', 'None'], input_tf=state_ph)
        critic_net = Fcnn(sess, pms.obs_shape + pms.action_shape, 1, [400, 300],
                          name=pms.name_scope + '_critic_r', if_bias=[False],
                          activation=['relu', 'relu', 'None'], input_tf=critic_input_ph)
        critic_target_net = Fcnn(sess, pms.obs_shape + pms.action_shape, 1, [400, 300],
                                 name=pms.name_scope + '_critic_t', if_bias=[False],
                                 activation=['relu', 'relu', 'None'], input_tf=critic_input_ph)

        critic_net.state_ph = state_ph
        critic_net.action_ph = action_ph

        actor = DeterministicActor(actor_net, sess, pms)
        # wrap the target actor around the target network (not the online network)
        actor_target = DeterministicActor(actor_target_net, sess, pms)

        replay_buffer = ReplayBuffer(buffer_size=pms.buffer_size)
        ounoise = OUNoise(pms.action_shape)

        learn_agent = DDPGagent(env, actor, critic_net, actor_target, critic_target_net,
                                replay_buffer, ounoise, sess, pms, [None])

        saver = tf.train.Saver()
        learn_agent.saver = saver

        sess.run(tf.global_variables_initializer())
        saving_result = learn_agent.learn()

        sess.close()
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task, buffer_size, batch_size, gamma, tau,
                 actor_dropout, critic_dropout, exploration_theta, exploration_sigma,
                 actor_lr, critic_lr):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.actor_dropout = actor_dropout
        self.critic_dropout = critic_dropout
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low,
                                 self.action_high, self.actor_dropout, self.actor_lr)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low,
                                  self.action_high, self.actor_dropout, self.actor_lr)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, self.critic_dropout, self.critic_lr)
        self.critic_target = Critic(self.state_size, self.action_size, self.critic_dropout, self.critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 5
        self.exploration_theta = exploration_theta
        self.exploration_sigma = exploration_sigma
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.memory = PrioritizedReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau      # for soft update of target parameters

        self.best_score = -np.inf

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.total_reward = 0.0
        self.count = 0
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        # self.memory.add(self.last_state, action, reward, next_state, done)

        # Generate the parameters needed to calculate the TD error
        next_state_predict = np.reshape(next_state, [-1, self.state_size])
        last_state_predict = np.reshape(self.last_state, [-1, self.state_size])
        action_predict = np.reshape(action, [-1, self.action_size])
        # next_state_action = np.concatenate([next_state, action])

        Q_target_next = self.critic_target.model.predict([next_state_predict, action_predict])[0]
        Q_local = self.critic_local.model.predict([last_state_predict, action_predict])[0]

        # Calculate the TD error in order to generate the priority value of the experience
        td_error = reward + self.gamma * Q_target_next - Q_local

        # Normalize the TD error with tanh as advised by DeepMind's "Prioritized Experience Replay":
        # https://arxiv.org/pdf/1511.05952.pdf
        # td_error = math.tanh(td_error[0])

        self.memory.add(self.last_state, action, reward, next_state, done, abs(td_error[0]))

        self.total_reward += reward
        self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences, idx_sample, is_weights = self.memory.sample_priority()
            self.learn(experiences, idx_sample, is_weights)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, test=False):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]

        if not test:
            return list(action + self.noise.sample())  # add some noise for exploration
        else:
            return list(action)

    def learn(self, experiences, idx_sample, is_weights):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])
        is_weights = is_weights.reshape(-1, 1)

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train the critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) * is_weights
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train the actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        # Generate the new TD error values and update the priorities within the replay buffer
        td_error = rewards + self.gamma * Q_targets_next * (1 - dones) - Q_targets

        # Normalize the TD error with tanh as advised by DeepMind's "Prioritized Experience Replay":
        # https://arxiv.org/pdf/1511.05952.pdf
        # td_error = np.tanh(td_error)

        self.memory.update_priority(idx=idx_sample, error=td_error)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def test_control(self, file_output='data.txt'):
        state = self.reset_episode()
        done = False

        # Results with the conditions of the quadcopter
        labels = ['time', 'x', 'y', 'z', 'phi', 'theta', 'psi',
                  'x_velocity', 'y_velocity', 'z_velocity',
                  'phi_velocity', 'theta_velocity', 'psi_velocity',
                  'rotor_speed1', 'rotor_speed2', 'rotor_speed3', 'rotor_speed4']
        results = {x: [] for x in labels}

        # Run the simulation and save the results.
        with open(file_output, 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(labels)

            while True:
                action = self.act(state, test=True)
                # action = self.act(state, test=False)
                next_state, reward, done = self.task.step(action)
                state = next_state

                to_write = [self.task.sim.time] + list(self.task.sim.pose) + \
                           list(self.task.sim.v) + list(self.task.sim.angular_v) + list(action)
                for ii in range(len(labels)):
                    results[labels[ii]].append(to_write[ii])
                writer.writerow(to_write)

                if done:
                    break

        # Show the results of the control (useful for testing)
        control_results(results)

    def update_score(self):
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score