Example #1
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 action_sigma=0.1,
                 memory_size=1000000,
                 batch=128,
                 sigma=0.2,
                 noise_clip=0.5,
                 gamma=0.99,
                 update_frequency=2,
                 seed=0):
        '''
        TD3 Agent
        :param state_size: state dimension
        :param action_size: action dimension
        :param action_sigma: standard deviation of the exploration noise added to the action
        :param memory_size: capacity of the replay buffer
        :param batch: mini-batch size sampled from the replay buffer
        :param sigma: standard deviation of the noise added to the target action (Section 5.3 of the TD3 paper)
        :param noise_clip: clipping range for that target-policy noise
        :param gamma: discount factor
        :param update_frequency: number of critic updates between delayed actor/target updates
        :param seed: random seed
        '''

        self.state_size = state_size
        self.action_size = action_size

        self.action_sigma = action_sigma
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.update_frequency = update_frequency
        self.seed = seed

        self.actor = Actor(self.state_size, self.action_size).to(device)
        self.critic0 = Critic(self.state_size, self.action_size).to(device)
        # second critic, as described in the TD3 paper
        # https://arxiv.org/pdf/1802.09477.pdf
        self.critic1 = Critic(self.state_size, self.action_size).to(device)

        self.target_actor = Actor(self.state_size, self.action_size).to(device)
        self.target_critic0 = Critic(self.state_size,
                                     self.action_size).to(device)
        # second critic, as described in the TD3 paper
        # https://arxiv.org/pdf/1802.09477.pdf
        self.target_critic1 = Critic(self.state_size,
                                     self.action_size).to(device)

        self.memory = ReplayBuffer(memory_size, batch, seed=seed)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR)
        self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR)

        self.soft_update(self.actor, self.target_actor, 1)
        self.soft_update(self.critic0, self.target_critic0, 1)
        self.soft_update(self.critic1, self.target_critic1, 1)

    def act(self, state, epsilon=True):

        state = torch.from_numpy(np.asarray(state)).float().to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor.forward(state).cpu().data.numpy()
        self.actor.train()

        if epsilon:
            #if we want to inject some noise
            noise = np.random.normal(0, self.action_sigma, action.shape[0])
            action += noise

        return action

    def update(self, step):
        '''
        TD3 update (https://arxiv.org/pdf/1802.09477.pdf)

        The procedure is very similar to the standard DDPG update, except that:
        1) there are two critics to update,
        2) the target value uses the minimum of the two target critics' outputs,
        3) clipped noise is injected into the target action (Section 5.3 of the paper),
        4) the actor and target networks are only updated every few steps.

        :param step: current training step, used to decide when the delayed actor update happens
        :return:
        '''

        state, action, reward, next_state, done = self.memory.sample()

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models

        next_state_action = self.target_actor(next_state)

        # sample clipped Gaussian noise for target-policy smoothing
        noise = Normal(torch.zeros(self.action_size), self.sigma).sample()
        noise = torch.clamp(noise, -self.noise_clip,
                            self.noise_clip).to(device)

        next_state_action += noise

        target_Q0 = self.target_critic0(next_state, next_state_action)
        target_Q1 = self.target_critic1(next_state, next_state_action)
        target_Q = torch.min(target_Q0, target_Q1)

        target_value = reward + self.gamma * target_Q * (1.0 - done)

        expected_Q0 = self.critic0(state, action)
        expected_Q1 = self.critic1(state, action)

        critic_0_loss = F.mse_loss(expected_Q0, target_value.detach())
        critic_1_loss = F.mse_loss(expected_Q1, target_value.detach())

        self.critic0_optimizer.zero_grad()
        critic_0_loss.backward()
        self.critic0_optimizer.step()

        self.critic1_optimizer.zero_grad()
        critic_1_loss.backward()
        self.critic1_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss

        #as mentioned in the paper, we delay updating the actor network.

        if step % self.update_frequency == 0:

            actor_loss = self.critic0.forward(state, self.actor.forward(state))
            actor_loss = -actor_loss.mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
            # ----------------------- update target networks ------------------- #
            self.soft_update(self.critic0, self.target_critic0, TRANSFER_RATE)
            self.soft_update(self.critic1, self.target_critic1, TRANSFER_RATE)
            self.soft_update(self.actor, self.target_actor, TRANSFER_RATE)

    def soft_update(self, local_model, target_model, tau):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def add_to_memory(self, state, action, reward, next_state, done):

        self.memory.add(state, action, reward, next_state, done)
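The class above assumes that Actor, Critic, ReplayBuffer, device and the ACTOR_LR / VALUE0_LR / VALUE1_LR / TRANSFER_RATE constants are defined elsewhere. A minimal driving loop for it might look like the following; this is a sketch, not part of the original example, and it assumes a classic Gym-style env (4-tuple step) and a replay buffer that exposes __len__.

# Hypothetical training loop for the TD3 Agent above (a sketch, not the
# original author's code). Assumes a classic Gym API and a ReplayBuffer
# implementing __len__.
import numpy as np

agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0])

for episode in range(1000):
    state = env.reset()
    done = False
    step = 0
    while not done:
        action = agent.act(state, epsilon=True)          # add exploration noise
        next_state, reward, done, _ = env.step(action)
        agent.add_to_memory(state, action, reward, next_state, done)
        if len(agent.memory) >= 128:                      # wait for one full batch
            agent.update(step)                            # actor/targets refreshed every
        state = next_state                                # `update_frequency` calls
        step += 1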
Example #2
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env, time_steps, hidden_dim):
        self.name = 'DDPG'  # name for uploading results
        self.scale = env.asset
        self.unit = env.unit
        self.seed = env.rd_seed

        self.time_dim = time_steps
        self.state_dim = env.observation_space.shape[1]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = 64
        self.memory_size = self.time_dim + self.batch_size * 10
        self.start_size = self.time_dim + self.batch_size * 2

        # Initialise actor & critic networks
        self.actor_network = Actor(self.time_dim, self.state_dim,
                                   self.action_dim, hidden_dim)
        self.critic_network = Critic(self.time_dim, self.state_dim,
                                     self.action_dim, hidden_dim)

        # Initialize replay buffer
        self.replay_state = torch.zeros(
            (self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_next_state = torch.zeros(
            (self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_action = torch.zeros(
            (self.start_size - 1, 1, self.state_dim), device=cuda)
        self.replay_reward = torch.zeros((self.start_size - 1, ), device=cuda)

        # Initialize an Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim,
                                         sigma=0.01 / self.action_dim)
        self.initial()

    def initial(self):
        self.steps = 0
        self.action = torch.zeros(self.action_dim, device=cuda)
        self.replay_state = torch.zeros(
            (self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_next_state = torch.zeros(
            (self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_action = torch.zeros((self.start_size - 1, self.state_dim),
                                         device=cuda)
        self.replay_reward = torch.zeros((self.start_size - 1, ), device=cuda)

    def train_on_batch(self):
        # Sample a random minibatch of N transitions from replay buffer
        sample = torch.randint(self.time_dim,
                               self.replay_reward.shape[0], [self.batch_size],
                               device=cuda)
        index = torch.stack([sample - i for i in range(self.time_dim, 0, -1)
                             ]).t().reshape(-1)

        state_data = min_max_scale(self.replay_state[:, 0, :])
        amount_data = min_max_scale(self.replay_state[:, 2, :])
        next_state_data = min_max_scale(self.replay_next_state[:, 0, :])
        next_amount_data = min_max_scale(self.replay_next_state[:, 2, :])

        state_batch = torch.index_select(state_data, 0,
                                         index).view(self.batch_size, -1)
        amount_data = torch.index_select(amount_data, 0,
                                         sample).view(self.batch_size, -1)
        state_batch = torch.cat([state_batch, amount_data], dim=1)
        next_state_batch = torch.index_select(next_state_data, 0,
                                              index).view(self.batch_size, -1)
        next_amount_data = torch.index_select(next_amount_data, 0,
                                              sample).view(
                                                  self.batch_size, -1)
        next_state_batch = torch.cat([next_state_batch, next_amount_data],
                                     dim=1)
        action_batch = torch.index_select(self.replay_action / self.unit, 0,
                                          sample)
        reward_batch = torch.index_select(self.replay_reward, 0, sample)

        # Calculate y_batch
        next_action_batch = self.actor_network.target_action(next_state_batch)
        q_batch = self.critic_network.target_q(next_action_batch,
                                               next_state_batch)
        y_batch = torch.add(reward_batch, q_batch, alpha=GAMMA).view(-1, 1)

        # train actor-critic by target loss
        self.actor_network.train(
            self.critic_network.train(y_batch, action_batch, state_batch))

        # Update target networks by soft update
        self.actor_network.update_target()
        self.critic_network.update_target()

    def perceive(self, state, action, reward, next_state, done):
        if self.steps < self.start_size - 1:
            self.replay_state[self.steps] = state
            self.replay_next_state[self.steps] = next_state
            self.replay_action[self.steps] = action
            self.replay_reward[self.steps] = reward
        else:
            if self.steps >= self.memory_size:
                self.replay_state = self.replay_state[1:]
                self.replay_next_state = self.replay_next_state[1:]
                self.replay_action = self.replay_action[1:]
                self.replay_reward = self.replay_reward[1:]
            self.replay_state = torch.cat(
                (self.replay_state, state.unsqueeze(0)), dim=0)
            self.replay_next_state = torch.cat(
                (self.replay_next_state, next_state.unsqueeze(0)), dim=0)
            self.replay_action = torch.cat(
                (self.replay_action, action.unsqueeze(0)), dim=0)
            self.replay_reward = torch.cat(
                (self.replay_reward, reward.unsqueeze(0)), dim=0)
        self.steps += 1

    def act(self, next_state, portfolio):
        if self.steps > self.start_size:
            next_state_data = min_max_scale(
                self.replay_next_state[:, 0, :])[-self.time_dim:].view(1, -1)
            next_amount_data = min_max_scale(
                self.replay_next_state[:, 2, :])[-1].view(1, -1)
            next_state_data = torch.cat([next_state_data, next_amount_data],
                                        dim=1)
            self.train_on_batch()
            allocation = self.actor_network.target_action(
                next_state_data).data.view(-1)
            allocation += torch.tensor(self.exploration_noise.noise().tolist(),
                                       device=cuda)
            allocation[allocation < 0] = 0
            allocation /= sum(allocation)
            allocation = torch.floor(portfolio * allocation /
                                     next_state[1, :] / self.unit) * self.unit
            self.action = allocation
        return self.action.clone()
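This example calls a min_max_scale helper that is not shown. A column-wise scaler compatible with how it is used above might look like this (an assumption about the helper, not the original implementation):

# Hypothetical min_max_scale: rescales each column of a 2-D tensor to [0, 1].
# The real helper used by the example is not shown and may differ.
import torch

def min_max_scale(x, eps=1e-8):
    x_min = x.min(dim=0, keepdim=True).values
    x_max = x.max(dim=0, keepdim=True).values
    return (x - x_min) / (x_max - x_min + eps)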
Example #3
class DDPGAGENT:
    def __init__(self, state_size, action_size, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPS

        #--- actor -----#

        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=1e-3)

        #---- critic -----#

        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=1e-3,
                                           weight_decay=0)

        self.noise = OUNoise(action_size, random_seed)

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
        #self.timestep = 0

    def step(self, state, action, reward, next_state, done, timestep):
        self.memory.add_experience(state, action, reward, next_state, done)

        #self.timestep = (self.timestep + 1) % UPDATE_EVERY

        if len(self.memory) > BATCH_SIZE and timestep % UPDATE_EVERY == 0:
            for _ in range(LEARN_NUM):
                xp = self.memory.sample()
                self.learn(xp, GAMMA)  #GAMMA VALUE 0.99

    def act(self, state, noise_accumulate=True):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        # add epsilon-scaled exploration noise
        if noise_accumulate:
            action += self.epsilon * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset_internal_state()

    def learn(self, xp, gamma):
        states, actions, rewards, next_states, dones = xp

        # --- update critic: compute targets and MSE loss ---

        actions_nxt = self.actor_target(next_states)

        q_target_next = self.critic_target(next_states, actions_nxt)

        q_target = rewards + (gamma * q_target_next * (1 - dones))

        q_expected = self.critic_local(states, actions)

        #MSE LOSS
        critic_loss = F.mse_loss(q_expected, q_target)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # clip the critic's gradient norm to stabilize training
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # --- update actor: maximize expected Q via the negative mean critic value ---
        actor_predicted = self.actor_local(states)
        actor_loss = -self.critic_local(states, actor_predicted).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        self.epsilon -= 1e-6
        self.noise.reset_internal_state()

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
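This snippet (and most of the ones that follow) relies on an OUNoise class that is not shown, and the examples construct it with slightly different argument lists and method names (.sample() vs .noise(), .reset() vs .reset_internal_state()). A minimal Ornstein-Uhlenbeck process fitting the most common usage is sketched below; the real classes may differ.

# Minimal Ornstein-Uhlenbeck noise sketch (an assumption, not the original class).
import copy
import random
import numpy as np

class OUNoise:
    def __init__(self, size, seed=None, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        if seed is not None:
            random.seed(seed)
        self.reset_internal_state()

    def reset_internal_state(self):
        # Reset the internal state to the mean.
        self.state = copy.copy(self.mu)

    reset = reset_internal_state          # alias used by the later examples

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): drift toward the mean plus noise.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(x.shape)
        self.state = x + dx
        return self.state

    noise = sample                        # alias used by Example #2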
Example #4
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(
        self,
        state_size=24,
        action_size=2,
        BATCH_SIZE=128,
        BUFFER_SIZE=int(1e6),
        discount_factor=1,
        tau=1e-2,
        noise_coefficient_start=5,
        noise_coefficient_decay=0.99,
        LR_ACTOR=1e-3,
        LR_CRITIC=1e-3,
        WEIGHT_DECAY=1e-3,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")):
        """
			state_size (int): dimension of each state
			action_size (int): dimension of each action
			BATCH_SIZE (int): mini batch size
			BUFFER_SIZE (int): replay buffer capacity; keep it as large as memory allows
			discount_factor (float): discount factor for calculating Q_target
			tau (float): interpolation parameter for updating target network
			noise_coefficient_start (float): value to be multiplied to OUNoise sample
			noise_coefficient_decay (float): exponential decay factor for value to be multiplied to OUNoise sample
			LR_ACTOR (float): learning rate for actor network
			LR_CRITIC (float): learning rate for critic network
			WEIGHT_DECAY (float): Weight decay for critic network optimizer
			device : "cuda:0" if torch.cuda.is_available() else "cpu"
		"""

        self.state_size = state_size
        print(device)
        self.action_size = action_size
        self.BATCH_SIZE = BATCH_SIZE
        self.BUFFER_SIZE = BUFFER_SIZE
        self.discount_factor = discount_factor
        self.tau = tau
        self.noise_coefficient = noise_coefficient_start
        self.noise_coefficient_decay = noise_coefficient_decay
        self.steps_completed = 0
        self.device = device
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(self.device)
        self.actor_target = Actor(state_size, action_size).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size).to(self.device)
        self.critic_target = Critic(state_size, action_size).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((1, action_size))

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE,
                                   self.BATCH_SIZE)

    def step(self, state, action, reward, next_state, done, agent_number):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.memory.add(state, action, reward, next_state, done)
        self.steps_completed += 1
        # If number of memory data > Batch_Size then learn
        if len(self.memory) > self.BATCH_SIZE:
            experiences = self.memory.sample(self.device)
            self.learn(experiences, self.discount_factor, agent_number)

    def act(self, states, add_noise):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(self.device)
        actions = np.zeros((1, self.action_size))  # shape will be (1,2)
        self.actor_local.eval()
        with torch.no_grad():
            actions[0, :] = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise_coefficient * self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, discount_factor, agent_number):
        """Update policy and value parameters using given batch of experience tuples.
		Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
		where:
			actor_target(state) -> action
			critic_target(state, action) -> Q-value
		Params
		======
			experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
			discount_factor (float): discount factor
		"""
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)

        # The critic sees the joint action of both agents. For agent_number=0, concatenate this
        # agent's freshly predicted action (actions_next) with agent 1's stored action
        # (actions[:, 2:]); for agent_number=1, do the reverse.
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)

        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (discount_factor * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)

        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)

        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

        # Update noise_coefficient value
        # self.noise_coefficient = self.noise_coefficient*self.noise_coefficient_decay

        self.noise_coefficient = max(
            self.noise_coefficient - (1 / self.noise_coefficient_decay), 0)
        # print(self.steps_completed,': ',self.noise_coefficient)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
		θ_target = τ*θ_local + (1 - τ)*θ_target
			local_model: PyTorch model (weights will be copied from)
			target_model: PyTorch model (weights will be copied to)
			tau (float): interpolation parameter 
		"""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
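The agent_number branch in learn() builds the joint action that the critic sees: the updating agent's freshly predicted values replace its own slice, while the partner's stored values are kept. A tiny concrete illustration (the numbers are made up):

# Illustration of the joint-action concatenation used in learn() above.
import torch

actions = torch.tensor([[0.1, 0.2, 0.3, 0.4]])        # stored joint action: agent 0 + agent 1
actions_next = torch.tensor([[0.9, 0.8]])             # fresh prediction for one agent

# agent_number == 0: new action for agent 0, keep agent 1's stored action
joint_0 = torch.cat((actions_next, actions[:, 2:]), dim=1)   # [[0.9, 0.8, 0.3, 0.4]]

# agent_number == 1: keep agent 0's stored action, new action for agent 1
joint_1 = torch.cat((actions[:, :2], actions_next), dim=1)   # [[0.1, 0.2, 0.9, 0.8]]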
Example #5
class Agent():
    """ Interacts with and learns from the environment """
    def __init__(self, state_size, action_size, num_agents, seed):
        """
        Initialize an Agent object
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(seed)
        self.eps = eps_start
        self.t_step = 0

        # Actor Network (with Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (with Target Network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    def step(self, state, action, reward, next_state, done, agent_number):
        """ Save experience in replay memory, and use random sample from buffer to learn """
        self.t_step += 1
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory and at interval settings
        if len(self.memory) > BATCH_SIZE:
            if self.t_step % UPDATE_EVERY == 0:
                for _ in range(N_UPDATES):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA, agent_number)

    def act(self, states, add_noise):
        """ Returns actions for given state as per current policy """
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.eps * self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        """
        Update policy and value parameters using given batch of experience tuples

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)

        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)

        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)

        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)

        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # Update epsilon noise value
        self.eps = self.eps - (1 / eps_decay)
        if self.eps < eps_end:
            self.eps = eps_end

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
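None of the examples show the ReplayBuffer they sample from, and its constructor and method names vary slightly between snippets (add vs add_experience, with or without a device argument to sample()). A minimal buffer compatible with the form used here, ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed), is sketched below as an assumption:

# Hypothetical replay buffer matching the (states, actions, rewards,
# next_states, dones) unpacking in learn(). Assumes a global `device`.
import random
from collections import deque, namedtuple

import numpy as np
import torch

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, seed=0):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    add_experience = add        # alias used by Example #3

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)

        def stack(rows):
            return torch.from_numpy(np.vstack(rows)).float().to(device)

        states = stack([e.state for e in batch])
        actions = stack([e.action for e in batch])
        rewards = stack([e.reward for e in batch])
        next_states = stack([e.next_state for e in batch])
        dones = stack([float(e.done) for e in batch])
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)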
Example #6
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, n, state_size, action_size, random_seed, params):
        """Initialize an Agent object.
        
        Params
        ======
            n (int): number of agents in env
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            params (dict): dictionary with hyperparameters name-value pairs
        """
        self.n = n
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        self.BUFFER_SIZE = params["BUFFER_SIZE"]
        self.BATCH_SIZE = params["BATCH_SIZE"]
        self.GAMMA = params["GAMMA"]
        self.TAU = params["TAU"]
        self.LR_ACTOR = params["LR_ACTOR"]
        self.LR_CRITIC = params["LR_CRITIC"]
        self.WEIGHT_DECAY = params["WEIGHT_DECAY"]
        self.N_UPDATES = params["N_UPDATES"]
        self.UPDATE_STEP = params["UPDATE_STEP"]

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.LR_CRITIC,
                                           weight_decay=self.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(self.n, action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE,
                                   self.BATCH_SIZE, random_seed)

        #Count timesteps
        self.timestep = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for i in range(self.n):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        self.timestep += 1
        # Learn, if enough samples are available in memory
        if self.timestep % self.UPDATE_STEP == 0 and len(
                self.memory) > self.BATCH_SIZE:
            for _ in range(self.N_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences, self.GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
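The Actor and Critic network classes are assumed throughout but never shown. A minimal pair matching the Actor(state_size, action_size, seed) / Critic(state_size, action_size, seed) constructors and the actor(state) / critic(state, action) calls used above might look like this (layer sizes and initialization are guesses, and Example #2 uses a different, time-series constructor):

# Hypothetical Actor/Critic networks (a sketch of what the examples assume).
import torch
import torch.nn as nn
import torch.nn.functional as F

class Actor(nn.Module):
    def __init__(self, state_size, action_size, seed=0, hidden=256):
        super().__init__()
        torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))       # actions bounded to [-1, 1]

class Critic(nn.Module):
    def __init__(self, state_size, action_size, seed=0, hidden=256):
        super().__init__()
        torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size + action_size, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, 1)

    def forward(self, state, action):
        x = torch.cat((state, action), dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)                   # scalar Q-value per (state, action) pair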
Example #7
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed, num_agents):
        """Initialize an Agent object.
         """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, sigma=0.1)

        # Replay buffer
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
        self.num_agents = num_agents

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        #self.memory.add(state, action, reward, next_state, done)
        for i in range(self.num_agents):
            self.memory.add(state[i], action[i], reward[i], next_state[i],
                            done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # update critic
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # update actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        #update target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
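Every example ends with the same soft update, θ_target = τ*θ_local + (1 - τ)*θ_target. A tiny standalone illustration of what it does to a single weight (purely illustrative numbers, not part of any example):

# The target parameter drifts slowly toward the local one at rate tau.
import torch

tau = 0.01
theta_local = torch.tensor([1.0])
theta_target = torch.tensor([0.0])

for _ in range(3):
    theta_target = tau * theta_local + (1 - tau) * theta_target

print(theta_target)   # tensor([0.0297]) after three updates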