class Agent():

    def __init__(self, state_size, action_size, action_sigma=0.1, memory_size=1000000,
                 batch=128, sigma=0.2, noise_clip=0.5, gamma=0.99, update_frequency=2, seed=0):
        '''
        TD3 Agent
        :param state_size: state dimension
        :param action_size: action dimension
        :param action_sigma: standard deviation of the noise added to the action
        :param memory_size: replay buffer capacity
        :param batch: mini-batch size
        :param sigma: standard deviation of the noise added to the target action
                      (Section 5.3 of the TD3 paper)
        :param noise_clip: how much noise to allow
        :param gamma: discount factor
        :param update_frequency: how many steps to wait between actor updates
        :param seed: random seed
        '''
        self.state_size = state_size
        self.action_size = action_size
        self.action_sigma = action_sigma
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.update_frequency = update_frequency
        self.seed = seed

        self.actor = Actor(self.state_size, self.action_size).to(device)
        self.critic0 = Critic(self.state_size, self.action_size).to(device)
        # second critic as described in the paper https://arxiv.org/pdf/1802.09477.pdf
        self.critic1 = Critic(self.state_size, self.action_size).to(device)

        self.target_actor = Actor(self.state_size, self.action_size).to(device)
        self.target_critic0 = Critic(self.state_size, self.action_size).to(device)
        # second target critic as described in the paper https://arxiv.org/pdf/1802.09477.pdf
        self.target_critic1 = Critic(self.state_size, self.action_size).to(device)

        self.memory = ReplayBuffer(memory_size, batch, seed=seed)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR)
        self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR)

        # hard-copy the initial weights into the target networks (tau = 1)
        self.soft_update(self.actor, self.target_actor, 1)
        self.soft_update(self.critic0, self.target_critic0, 1)
        self.soft_update(self.critic1, self.target_critic1, 1)

    def act(self, state, epsilon=True):
        state = torch.from_numpy(np.asarray(state)).float().to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor.forward(state).cpu().data.numpy()
        self.actor.train()

        if epsilon:
            # inject exploration noise into the action
            noise = np.random.normal(0, self.action_sigma, action.shape[0])
            action += noise
        return action

    def update(self, step):
        '''
        https://arxiv.org/pdf/1802.09477.pdf
        The update is very similar to the standard DDPG algorithm, except that
        1) there are two critics to update,
        2) the target value uses the minimum of the two target critics' outputs,
        3) the target action has clipped noise injected into it (Section 5.3 of the paper),
        4) the actor update is delayed by a fixed number of steps.
        :param step: current training step, used to delay the actor update
        :return:
        '''
        state, action, reward, next_state, done = self.memory.sample()

        # ---------------------------- update critics ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        next_state_action = self.target_actor(next_state)

        # sample random noise, clip it, and add it to the target action
        noise = Normal(torch.zeros(self.action_size), self.sigma).sample()
        noise = torch.clamp(noise, -self.noise_clip, self.noise_clip).to(device)
        next_state_action += noise

        target_Q0 = self.target_critic0(next_state, next_state_action)
        target_Q1 = self.target_critic1(next_state, next_state_action)
        target_Q = torch.min(target_Q0, target_Q1)
        target_value = reward + self.gamma * target_Q * (1.0 - done)

        expected_Q0 = self.critic0(state, action)
        expected_Q1 = self.critic1(state, action)

        critic_0_loss = F.mse_loss(expected_Q0, target_value.detach())
        critic_1_loss = F.mse_loss(expected_Q1, target_value.detach())

        self.critic0_optimizer.zero_grad()
        critic_0_loss.backward()
        self.critic0_optimizer.step()

        self.critic1_optimizer.zero_grad()
        critic_1_loss.backward()
        self.critic1_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # as described in the paper, the actor (and target networks) are updated
        # only every `update_frequency` steps
        if step % self.update_frequency == 0:
            actor_loss = self.critic0.forward(state, self.actor.forward(state))
            actor_loss = -actor_loss.mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # ----------------------- update target networks ------------------- #
            self.soft_update(self.critic0, self.target_critic0, TRANSFER_RATE)
            self.soft_update(self.critic1, self.target_critic1, TRANSFER_RATE)
            self.soft_update(self.actor, self.target_actor, TRANSFER_RATE)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def add_to_memory(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
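# --------------------------------------------------------------------------- #
# Usage sketch (illustrative, not from the original source): one way the TD3
# Agent above could be driven in a training loop. The environment name
# "Pendulum-v1", the episode count, and the warm-up threshold are assumptions;
# Actor, Critic, ReplayBuffer and the ACTOR_LR / VALUE0_LR / VALUE1_LR /
# TRANSFER_RATE constants are expected to come from the rest of the repository.
# The classic Gym reset/step interface is assumed.
# --------------------------------------------------------------------------- #
def train_td3(episodes=200, warmup_steps=1000):
    import gym  # assumed dependency for this sketch

    env = gym.make("Pendulum-v1")  # hypothetical environment choice
    agent = Agent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.shape[0])
    total_steps = 0
    for episode in range(episodes):
        state = env.reset()
        episode_reward, done = 0.0, False
        while not done:
            action = agent.act(state)  # noisy action for exploration
            next_state, reward, done, _ = env.step(action)
            agent.add_to_memory(state, action, reward, next_state, done)
            total_steps += 1
            if total_steps > warmup_steps:  # assumed warm-up before learning starts
                agent.update(total_steps)   # actor update is delayed inside update()
            state = next_state
            episode_reward += reward
        print(episode, episode_reward)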
class DDPG:
    """docstring for DDPG"""

    def __init__(self, env, time_steps, hidden_dim):
        self.name = 'DDPG'  # name for uploading results
        self.scale = env.asset
        self.unit = env.unit
        self.seed = env.rd_seed

        self.time_dim = time_steps
        self.state_dim = env.observation_space.shape[1]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = 64
        self.memory_size = self.time_dim + self.batch_size * 10
        self.start_size = self.time_dim + self.batch_size * 2

        # Initialise actor & critic networks
        self.actor_network = Actor(self.time_dim, self.state_dim, self.action_dim, hidden_dim)
        self.critic_network = Critic(self.time_dim, self.state_dim, self.action_dim, hidden_dim)

        # Initialise replay buffer
        self.replay_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_next_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_action = torch.zeros((self.start_size - 1, 1, self.state_dim), device=cuda)
        self.replay_reward = torch.zeros((self.start_size - 1, ), device=cuda)

        # Initialise an Ornstein-Uhlenbeck random process for action exploration
        self.exploration_noise = OUNoise(self.action_dim, sigma=0.01 / self.action_dim)
        self.initial()

    def initial(self):
        self.steps = 0
        self.action = torch.zeros(self.action_dim, device=cuda)
        self.replay_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_next_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_action = torch.zeros((self.start_size - 1, self.state_dim), device=cuda)
        self.replay_reward = torch.zeros((self.start_size - 1, ), device=cuda)

    def train_on_batch(self):
        # Sample a random minibatch of N transitions from the replay buffer
        sample = torch.randint(self.time_dim, self.replay_reward.shape[0],
                               [self.batch_size], device=cuda)
        index = torch.stack([sample - i for i in range(self.time_dim, 0, -1)]).t().reshape(-1)

        state_data = min_max_scale(self.replay_state[:, 0, :])
        amount_data = min_max_scale(self.replay_state[:, 2, :])
        next_state_data = min_max_scale(self.replay_next_state[:, 0, :])
        next_amount_data = min_max_scale(self.replay_next_state[:, 2, :])

        state_batch = torch.index_select(state_data, 0, index).view(self.batch_size, -1)
        amount_data = torch.index_select(amount_data, 0, sample).view(self.batch_size, -1)
        state_batch = torch.cat([state_batch, amount_data], dim=1)

        next_state_batch = torch.index_select(next_state_data, 0, index).view(self.batch_size, -1)
        next_amount_data = torch.index_select(next_amount_data, 0, sample).view(self.batch_size, -1)
        next_state_batch = torch.cat([next_state_batch, next_amount_data], dim=1)

        action_batch = torch.index_select(self.replay_action / self.unit, 0, sample)
        reward_batch = torch.index_select(self.replay_reward, 0, sample)

        # Calculate y_batch
        next_action_batch = self.actor_network.target_action(next_state_batch)
        q_batch = self.critic_network.target_q(next_action_batch, next_state_batch)
        y_batch = torch.add(reward_batch, q_batch, alpha=GAMMA).view(-1, 1)

        # Train actor-critic on the target loss
        self.actor_network.train(self.critic_network.train(y_batch, action_batch, state_batch))

        # Update target networks by soft update
        self.actor_network.update_target()
        self.critic_network.update_target()

    def perceive(self, state, action, reward, next_state, done):
        if self.steps < self.start_size - 1:
            self.replay_state[self.steps] = state
            self.replay_next_state[self.steps] = next_state
            self.replay_action[self.steps] = action
            self.replay_reward[self.steps] = reward
        else:
            if self.steps >= self.memory_size:
                # drop the oldest transition when the buffer is full
                self.replay_state = self.replay_state[1:]
                self.replay_next_state = self.replay_next_state[1:]
                self.replay_action = self.replay_action[1:]
                self.replay_reward = self.replay_reward[1:]
            self.replay_state = torch.cat((self.replay_state, state.unsqueeze(0)), dim=0)
            self.replay_next_state = torch.cat((self.replay_next_state, next_state.unsqueeze(0)), dim=0)
            self.replay_action = torch.cat((self.replay_action, action.unsqueeze(0)), dim=0)
            self.replay_reward = torch.cat((self.replay_reward, reward.unsqueeze(0)), dim=0)
        self.steps += 1

    def act(self, next_state, portfolio):
        if self.steps > self.start_size:
            next_state_data = min_max_scale(self.replay_next_state[:, 0, :])[-self.time_dim:].view(1, -1)
            next_amount_data = min_max_scale(self.replay_next_state[:, 2, :])[-1].view(1, -1)
            next_state_data = torch.cat([next_state_data, next_amount_data], dim=1)
            self.train_on_batch()

            allocation = self.actor_network.target_action(next_state_data).data.view(-1)
            allocation += torch.tensor(self.exploration_noise.noise().tolist(), device=cuda)
            allocation[allocation < 0] = 0
            allocation /= sum(allocation)
            allocation = torch.floor(portfolio * allocation / next_state[1, :] / self.unit) * self.unit
            self.action = allocation
        return self.action.clone()
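# --------------------------------------------------------------------------- #
# Illustrative helper (assumption, not from the original source): the DDPG
# class above calls min_max_scale() on 2-D tensors of prices and holdings, but
# the helper itself is defined elsewhere in the repository. A minimal
# column-wise min-max normalisation consistent with that usage might look like
# this; whether the original scales per column or globally is not known.
# --------------------------------------------------------------------------- #
import torch


def min_max_scale(x, eps=1e-8):
    """Scale each column of a 2-D tensor into [0, 1]."""
    x_min = x.min(dim=0, keepdim=True).values
    x_max = x.max(dim=0, keepdim=True).values
    return (x - x_min) / (x_max - x_min + eps)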
class DDPGAGENT:

    def __init__(self, state_size, action_size, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPS

        # ----- actor ----- #
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=1e-3)

        # ----- critic ----- #
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=1e-3, weight_decay=0)

        self.noise = OUNoise(action_size, random_seed)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        # self.timestep = 0

    def step(self, state, action, reward, next_state, done, timestep):
        self.memory.add_experience(state, action, reward, next_state, done)
        # self.timestep = (self.timestep + 1) % UPDATE_EVERY
        if len(self.memory) > BATCH_SIZE and timestep % UPDATE_EVERY == 0:
            for _ in range(LEARN_NUM):
                xp = self.memory.sample()
                self.learn(xp, GAMMA)  # GAMMA value 0.99

    def act(self, state, noise_accumulate=True):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        # epsilon-scaled exploration noise
        if noise_accumulate:
            action += self.epsilon * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset_internal_state()

    def learn(self, xp, gamma):
        states, actions, rewards, next_states, dones = xp

        # ---- update critic: compute the TD target and minimise the MSE loss ---- #
        actions_nxt = self.actor_target(next_states)
        q_target_next = self.critic_target(next_states, actions_nxt)
        q_target = rewards + (gamma * q_target_next * (1 - dones))
        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_target)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # clip the gradient norm of the critic parameters
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---- update actor: maximise the critic's value of the predicted actions ---- #
        actor_predicted = self.actor_local(states)
        actor_loss = -self.critic_local(states, actor_predicted).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        self.epsilon -= 1e-6
        self.noise.reset_internal_state()

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
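# --------------------------------------------------------------------------- #
# Illustrative sketch (assumption, not from the original source): the agents in
# this file rely on an OUNoise class defined elsewhere, with slightly different
# constructor signatures per variant. The sketch below is a conventional
# Ornstein-Uhlenbeck process matching the DDPGAGENT usage above
# (OUNoise(size, seed), .sample(), .reset_internal_state()); the mu/theta/sigma
# defaults are common choices, not values taken from the source.
# --------------------------------------------------------------------------- #
import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed=0, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset_internal_state()

    def reset_internal_state(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state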
class Agent(): """Interacts with and learns from the environment.""" def __init__( self, state_size=24, action_size=2, BATCH_SIZE=128, BUFFER_SIZE=int(1e6), discount_factor=1, tau=1e-2, noise_coefficient_start=5, noise_coefficient_decay=0.99, LR_ACTOR=1e-3, LR_CRITIC=1e-3, WEIGHT_DECAY=1e-3, device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")): """ state_size (int): dimension of each state action_size (int): dimension of each action BATCH_SIZE (int): mini batch size BUFFER_SIZE (int): experience storing lenght, keep it as high as possible discount_factor (float): discount factor for calculating Q_target tau (float): interpolation parameter for updating target network noise_coefficient_start (float): value to be multiplied to OUNoise sample noise_coefficient_decay (float): exponential decay factor for value to be multiplied to OUNoise sample LR_ACTOR (float): learning rate for actor network LR_CRITIC (float): learning rate for critic network WEIGHT_DECAY (float): Weight decay for critic network optimizer device : "cuda:0" if torch.cuda.is_available() else "cpu" """ self.state_size = state_size print(device) self.action_size = action_size self.BATCH_SIZE = BATCH_SIZE self.BUFFER_SIZE = BUFFER_SIZE self.discount_factor = discount_factor self.tau = tau self.noise_coefficient = noise_coefficient_start self.noise_coefficient_decay = noise_coefficient_decay self.steps_completed = 0 self.device = device # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size).to(self.device) self.actor_target = Actor(state_size, action_size).to(self.device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size).to(self.device) self.critic_target = Critic(state_size, action_size).to(self.device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise((1, action_size)) # Replay memory self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE, self.BATCH_SIZE) def step(self, state, action, reward, next_state, done, agent_number): """Save experience in replay memory, and use random sample from buffer to learn.""" self.memory.add(state, action, reward, next_state, done) self.steps_completed += 1 # If number of memory data > Batch_Size then learn if len(self.memory) > self.BATCH_SIZE: experiences = self.memory.sample(self.device) self.learn(experiences, self.discount_factor, agent_number) def act(self, states, add_noise): """Returns actions for given state as per current policy.""" states = torch.from_numpy(states).float().to(self.device) actions = np.zeros((1, self.action_size)) # shape will be (1,2) self.actor_local.eval() with torch.no_grad(): actions[0, :] = self.actor_local(states).cpu().data.numpy() self.actor_local.train() if add_noise: actions += self.noise_coefficient * self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, discount_factor, agent_number): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples discount_factor (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) # It is basically taking action of both the agents, so if agent_number=0 then we will have to concatenate agent0 action(currently actions_next) and agent1 action(currently actions[:,2:]) if agent_number == 0: actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1) else: actions_next = torch.cat((actions[:, :2], actions_next), dim=1) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (discount_factor * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) if agent_number == 0: actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1) else: actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target) self.soft_update(self.actor_local, self.actor_target) # Update noise_coefficient value # self.noise_coefficient = self.noise_coefficient*self.noise_coefficient_decay self.noise_coefficient = max( self.noise_coefficient - (1 / self.noise_coefficient_decay), 0) # print(self.steps_completed,': ',self.noise_coefficient) def soft_update(self, local_model, target_model): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)
class Agent(): """ Interacts with and learns from the environment """ def __init__(self, state_size, action_size, num_agents, seed): """ Initialize an Agent object Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(seed) self.eps = eps_start self.t_step = 0 # Actor Network (with Target Network) self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (with Target Network) self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise((num_agents, action_size), seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) def step(self, state, action, reward, next_state, done, agent_number): """ Save experience in replay memory, and use random sample from buffer to learn """ self.t_step += 1 # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory and at interval settings if len(self.memory) > BATCH_SIZE: if self.t_step % UPDATE_EVERY == 0: for _ in range(N_UPDATES): experiences = self.memory.sample() self.learn(experiences, GAMMA, agent_number) def act(self, states, add_noise): """ Returns actions for given state as per current policy """ states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): for agent_num, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[agent_num, :] = action self.actor_local.train() if add_noise: actions += self.eps * self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma, agent_number): """ Update policy and value parameters using given batch of experience tuples Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) if agent_number == 0: actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1) else: actions_next = torch.cat((actions[:, :2], actions_next), dim=1) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) if 
agent_number == 0: actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1) else: actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) # Update epsilon noise value self.eps = self.eps - (1 / eps_decay) if self.eps < eps_end: self.eps = eps_end def soft_update(self, local_model, target_model, tau): """ Soft update model parameters θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
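# --------------------------------------------------------------------------- #
# Illustrative hyperparameter block (assumption, not from the original source):
# the agent above reads module-level constants that are defined elsewhere in
# the repository. The values below are typical choices for this kind of
# DDPG/MADDPG setup, shown only as an example; the author's actual values may
# differ.
# --------------------------------------------------------------------------- #
import torch

BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # mini-batch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-3          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # critic optimizer L2 weight decay
UPDATE_EVERY = 2         # learn every N environment steps
N_UPDATES = 4            # gradient updates per learning step
eps_start, eps_end, eps_decay = 1.0, 0.0, 300  # noise-scaling schedule
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")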
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, n, state_size, action_size, random_seed, params): """Initialize an Agent object. Params ====== n (int): number of agents in env state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed params (dict): dictionary with hyperparameters name-value pairs """ self.n = n self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.BUFFER_SIZE = params["BUFFER_SIZE"] self.BATCH_SIZE = params["BATCH_SIZE"] self.GAMMA = params["GAMMA"] self.TAU = params["TAU"] self.LR_ACTOR = params["LR_ACTOR"] self.LR_CRITIC = params["LR_CRITIC"] self.WEIGHT_DECAY = params["WEIGHT_DECAY"] self.N_UPDATES = params["N_UPDATES"] self.UPDATE_STEP = params["UPDATE_STEP"] # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.LR_CRITIC, weight_decay=self.WEIGHT_DECAY) # Noise process self.noise = OUNoise(self.n, action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE, self.BATCH_SIZE, random_seed) #Count timesteps self.timestep = 0 def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for i in range(self.n): self.memory.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i]) self.timestep += 1 # Learn, if enough samples are available in memory if self.timestep % self.UPDATE_STEP == 0 and len( self.memory) > self.BATCH_SIZE: for _ in range(self.N_UPDATES): experiences = self.memory.sample() self.learn(experiences, self.GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.TAU) self.soft_update(self.actor_local, self.actor_target, self.TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed, num_agents): """Initialize an Agent object. """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed, sigma=0.1) # Replay buffer self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) self.num_agents = num_agents def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward #self.memory.add(state, action, reward, next_state, done) for i in range(self.num_agents): self.memory.add(state[i], action[i], reward[i], next_state[i], done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): states, actions, rewards, next_states, dones = experiences # update critic actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # update actor actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() #update target networks self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)