class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, persistence_file, memory, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.persistence_file = persistence_file
        self.memory = memory
        self.state_size = state_size
        self.action_size = action_size
        self.seed_num = random_seed
        self.random_seed = random.seed(self.seed_num)
        self.epsilon = EPSILON

        self.initialize_models()

        self.noise = OUNoise(action_size, random_seed)

    @property
    def state(self):
        return {
            'actor_local': self.actor_local.state_dict(),
            'actor_target': self.actor_target.state_dict(),
            'actor_optimizer': self.actor_optimizer.state_dict(),
            'critic_local': self.critic_local.state_dict(),
            'critic_target': self.critic_target.state_dict(),
            'critic_optimizer': self.critic_optimizer.state_dict()
        }

    def save(self):
        torch.save(self.state, self.persistence_file)

    def initialize_models(self):
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(self.state_size, self.action_size, self.seed_num).to(DEVICE)
        self.actor_target = Actor(self.state_size, self.action_size, self.seed_num).to(DEVICE)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
        print('Actor Network')
        print(self.actor_local)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.state_size, self.action_size, self.seed_num).to(DEVICE)
        self.critic_target = Critic(self.state_size, self.action_size, self.seed_num).to(DEVICE)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
        print('Critic Network')
        print(self.critic_local)

        # Look to the persistence file for a saved state
        if path.exists(self.persistence_file):
            print('Loading persisted agents from {}'.format(self.persistence_file))
            state = torch.load(self.persistence_file)
            for k in ['actor_local', 'actor_target', 'critic_local', 'critic_target',
                      'actor_optimizer', 'critic_optimizer']:
                getattr(self, k).load_state_dict(state[k])
        else:
            print('Creating new agents, none found at {}'.format(self.persistence_file))

    def step(self, state, action, reward, next_state, done, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn at defined interval, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep % LEARN_EVERY == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(DEVICE)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.epsilon * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ---------------------------- update noise ---------------------------- #
        self.epsilon -= EPSILON_DECAY
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
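# --- Usage sketch for the persistence-enabled Agent above --------------------
# This is an orientation sketch only, assuming a Gym-style continuous-control
# environment and that ReplayBuffer, BUFFER_SIZE, BATCH_SIZE and the other
# hyperparameters are defined elsewhere in the project. The environment name,
# checkpoint path and episode/step counts are illustrative assumptions.

import gym

env = gym.make('Pendulum-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]

memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed=2)
agent = Agent('checkpoint.pth', memory, state_size, action_size, random_seed=2)

for episode in range(200):
    state = env.reset()
    agent.reset()
    score = 0.0
    for t in range(1000):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done, t)
        state = next_state
        score += reward
        if done:
            break
    agent.save()  # persist the networks after every episode
    print('Episode {}\tScore {:.2f}'.format(episode, score))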
class A2C:
    def __init__(self, conf, device):
        self.conf = conf
        self.state_dim = conf['state_dim']
        self.action_dim = conf['action_dim']
        self.device = device

        self.actor = Actor(self.state_dim, self.action_dim).to(self.device)
        self.critic = Critic(self.state_dim, self.action_dim).to(self.device)
        self.optimizerA = optim.Adam(self.actor.parameters())
        self.optimizerC = optim.Adam(self.critic.parameters())

    def optimization_model(self, next_state, rewards, log_probs, values, masks):
        '''
        next_state : the episode's last state, used to bootstrap G_t (the return)
        rewards    : a list of all rewards collected during the episode
        log_probs  : a list of all log pi(a_t|s_t) during the episode, from the actor network
        values     : a list of all V(s_t) during the episode, from the critic network
        '''
        next_state_ts = to_tensor(next_state).reshape(-1)            # [5*num_plant]
        next_value = self.critic(next_state_ts)                      # V(s_{t+1}) for the last state
        returns = self.compute_returns(next_value, rewards, masks)   # G_t = R + gamma * G_{t+1}

        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)

        advantage = returns - values                                 # A = G_t - V(s_t)

        actor_loss = -(log_probs * advantage.detach()).mean()        # mean of -(log_pi(a|s) * A)
        critic_loss = advantage.pow(2).mean()                        # mean of A^2

        self.optimizerA.zero_grad()
        self.optimizerC.zero_grad()
        actor_loss.backward()
        critic_loss.backward()
        self.optimizerA.step()
        self.optimizerC.step()

        return actor_loss, critic_loss

    def optimization_model_v2(self, dist, action, state_ts, next_state_ts, reward, done, gamma=0.99):
        advantage = reward + (1 - done) * gamma * self.critic(next_state_ts) - self.critic(state_ts)
        critic_loss = advantage.pow(2).mean()

        log_prob = dist.log_prob(action)                # scalar: log pi(a_t|s_t)
        actor_loss = -(log_prob * advantage.detach())   # [1]

        self.optimizerA.zero_grad()
        self.optimizerC.zero_grad()
        actor_loss.backward()
        critic_loss.backward()
        self.optimizerA.step()
        self.optimizerC.step()

        return actor_loss, critic_loss

    def compute_returns(self, next_value, rewards, masks, gamma=0.99):
        R = next_value
        returns = []
        for step in reversed(range(len(rewards))):
            R = rewards[step] + gamma * R * masks[step]
            returns.insert(0, R)
        return returns

    def actor_load_model(self, path):
        self.actor = Actor(self.state_dim, self.action_dim).to(self.device)
        self.actor.load_state_dict(torch.load(path))
        self.actor.eval()
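# --- Rollout sketch for the A2C class above (assumption) ---------------------
# optimization_model() consumes per-episode lists of log-probabilities, values,
# rewards and masks. Assuming `agent` is an A2C instance, the actor returns a
# torch.distributions object, and a to_tensor() helper and an environment `env`
# exist in the surrounding project, an episode could be collected like this:

log_probs, values, rewards, masks = [], [], [], []

state = env.reset()
done = False
while not done:
    state_ts = to_tensor(state).reshape(-1)
    dist = agent.actor(state_ts)      # policy distribution pi(.|s_t)
    value = agent.critic(state_ts)    # V(s_t)

    action = dist.sample()
    next_state, reward, done, _ = env.step(action.cpu().numpy())

    log_probs.append(dist.log_prob(action).reshape(1))
    values.append(value.reshape(1))
    rewards.append(to_tensor([float(reward)]))
    masks.append(to_tensor([1.0 - float(done)]))
    state = next_state

actor_loss, critic_loss = agent.optimization_model(next_state, rewards, log_probs, values, masks)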
class Agent():
    def __init__(self, state_size, action_size, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        self.noise = OUNoise(action_size, random_seed)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent():
    def __init__(self, state_size, action_size, n_agents=1, seed=0):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            n_agents: number of agents it will control in the environment
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = np.random.seed(seed)
        random.seed(seed)
        self.n_agents = n_agents

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed=seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed=seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, seed=seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed=seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC)
        # self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        self.timesteps = 0

    def step(self, states, actions, rewards, next_states, dones):
        """Given a batch of S,A,R,S' experiences, saves them into the experience buffer,
        and occasionally samples from the experience buffer to perform training steps."""
        self.timesteps += 1
        for i in range(self.n_agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i], dones[i])
        if (len(self.memory) > BATCH_SIZE) and (self.timesteps % 20 == 0):
            for _ in range(10):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        """Given the states of each agent, returns the actions to be taken by each agent
        based on the current policy.

        Returns a numpy array of shape [n_agents, n_actions].
        NOTE: clips actions to be between -1, 1

        Args:
            states: one row of state per agent, shape [n_agents, state_size]
            add_noise: (bool) add noise to the actions?
        """
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += [self.noise.sample() for _ in range(self.n_agents)]
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    @property
    def device(self):
        return device
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network), one per agent
        self.actor_local = [Actor(state_size, action_size, random_seed).to(device) for cnt in range(num_agents)]
        self.actor_target = [Actor(state_size, action_size, random_seed).to(device) for cnt in range(num_agents)]
        self.actor_optimizer = [optim.Adam(self.actor_local[cnt].parameters(), lr=LR_ACTOR) for cnt in range(num_agents)]

        # Critic Network (w/ Target Network), shared
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        #self.noise = OUNoise(action_size, random_seed)
        self.noise = OUNoise((2, ), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, states, actions, rewards, next_states, dones, step):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        #for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
        self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            if step % (BATCH_SIZE / 8):
                self.learn(experiences, GAMMA)

    def act(self, state, net_index, episode, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local[net_index].eval()
        with torch.no_grad():
            action = self.actor_local[net_index](state[net_index]).cpu().data.numpy()
        self.actor_local[net_index].train()
        #print(action.shape)
        if add_noise:
            action += np.exp(-episode / 2000) * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        for net_index in range(2):
            next_states_cloned = next_states.clone()
            rewards_cloned = rewards.clone()
            dones_cloned = dones.clone()
            actions_cloned = actions.clone()
            states_cloned = states.clone()

            actions_next = self.actor_target[net_index](next_states_cloned[:, net_index, :])
            actions_next_cloned = actions_next.clone()
            Q_targets_next = self.critic_target(next_states_cloned[:, net_index, :], actions_next_cloned)

            if net_index == 0:
                Q_targets_one = rewards_cloned[:, net_index].view(BATCH_SIZE, 1) + \
                    (gamma * Q_targets_next * (1 - dones_cloned[:, net_index, :]))
                # Compute critic loss
                Q_expected_one = self.critic_local(states_cloned[:, net_index, :], actions_cloned[:, net_index, :])
                actions_pred = self.actor_local[net_index](states_cloned[:, net_index, :])
                actor_loss_one = -self.critic_local(states_cloned[:, net_index, :], actions_pred)
            else:
                Q_targets_two = rewards_cloned[:, net_index].view(BATCH_SIZE, 1) + \
                    (gamma * Q_targets_next * (1 - dones_cloned[:, net_index, :]))
                # Compute critic loss
                Q_expected_two = self.critic_local(states_cloned[:, net_index, :], actions_cloned[:, net_index, :])
                actions_pred = self.actor_local[net_index](states_cloned[:, net_index, :])
                actor_loss_two = -self.critic_local(states_cloned[:, net_index, :], actions_pred)

        Q_expected = torch.cat([Q_expected_one.view(1, BATCH_SIZE, 1), Q_expected_two.view(1, BATCH_SIZE, 1)]).mean(0)
        Q_targets = torch.cat([Q_targets_one.view(1, BATCH_SIZE, 1), Q_targets_two.view(1, BATCH_SIZE, 1)]).mean(0)
        actor_loss = torch.cat([actor_loss_one.view(1, BATCH_SIZE, 1), actor_loss_two.view(1, BATCH_SIZE, 1)]).mean()
        critic_loss = F.mse_loss(Q_expected.clone(), Q_targets.clone())

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()
        self.soft_update(self.critic_local, self.critic_target, TAU)

        for net_index in range(2):
            self.actor_optimizer[net_index].zero_grad()
            actor_loss.backward(retain_graph=True)
            self.actor_optimizer[net_index].step()
            self.soft_update(self.actor_local[net_index], self.actor_target[net_index], TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(2 * state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(2 * state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(2 * state_size, 2 * action_size, random_seed).to(device)
        self.critic_target = Critic(2 * state_size, 2 * action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed, mu=0., theta=0.15, sigma=0.1)
        self.add_noise = True
        self.param_noise = ActorParamNoise(2 * state_size, action_size, random_seed).to(device)
        self.add_param_noise = True

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

        self.noise_coef = 6

        # update parameter or not
        self.update_param = True

    def step(self, state, action, reward, next_state, done, timestep, agent_id):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        timestep = timestep % TRAIN_EVERY

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep == 0 and self.update_param:
            for _ in range(N_LEARN_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, agent_id)

    def act(self, state):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        self.param_noise.eval()
        with torch.no_grad():
            self.param_noise.reset_noise_parameters()
            add_param_noise(self.param_noise, self.actor_local, 1)
            #action = self.actor_local(state).cpu().data.numpy()
            action = self.param_noise(state).cpu().data.numpy()
        self.param_noise.train()
        self.actor_local.train()
        if self.add_noise:
            action += self.noise.sample() * self.noise_coef
            self.noise_coef *= DECAY_NOISE
            if self.noise_coef < 0.01:
                self.noise_coef = 0.01
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_id):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        if agent_id == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # clip gradient
        #torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        if agent_id == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def copy_parameter(self, other_ddpg_agent):
        self.soft_update(other_ddpg_agent.actor_local, self.actor_local, 1)
        self.soft_update(other_ddpg_agent.critic_local, self.critic_local, 1)
        self.soft_update(other_ddpg_agent.actor_target, self.actor_target, 1)
        self.soft_update(other_ddpg_agent.critic_target, self.critic_target, 1)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process - adapt the noise for multiple agents
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory - no changes, as the agents share the memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def hard_copy_weights(self, target, source):
        """Copy weights from source to target network (part of initialization)."""
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward - adapted for multiple agents
        for i in range(self.num_agents):
            self.memory.add(state[i], action[i], reward[i], next_state[i], done[i])

    def to_learn(self, t):
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            #if int(t / LEARN_EVERY) % 2 == 1:
            self.learn(experiences, GAMMA, t)

    def act(self, states, add_noise=True):
        """Returns actions for given states as per current policy (adapted for multiple agents)."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_i, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_i] = action
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, t):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)  # use gradient clipping
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        #if t % UPDATE_EVERY == 0:
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class DDPG(object):
    def __init__(self, memory, nb_status, nb_actions, action_noise=None,
                 gamma=0.99, tau=0.001, normalize_observations=True,
                 batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.),
                 actor_lr=1e-4, critic_lr=1e-3):
        self.nb_status = nb_status
        self.nb_actions = nb_actions
        self.action_range = action_range
        self.observation_range = observation_range
        self.normalize_observations = normalize_observations

        self.actor = Actor(self.nb_status, self.nb_actions)
        self.actor_target = Actor(self.nb_status, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)

        self.critic = Critic(self.nb_status, self.nb_actions)
        self.critic_target = Critic(self.nb_status, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

        # Create replay buffer
        self.memory = memory  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.action_noise = action_noise

        # Hyper-parameters
        self.batch_size = batch_size
        self.tau = tau
        self.discount = gamma

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd()
        else:
            self.obs_rms = None

    def pi(self, obs, apply_noise=True, compute_Q=True):
        obs = np.array([obs])
        action = to_numpy(self.actor(to_tensor(obs))).squeeze(0)
        if compute_Q:
            q = self.critic([to_tensor(obs), to_tensor(action)]).cpu().data
        else:
            q = None
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q[0][0] if q is not None else None

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        # Compute the target Q values without tracking gradients
        # (replaces the deprecated `volatile=True` flag).
        with torch.no_grad():
            next_obs = to_tensor(batch['obs1'])
            next_q_values = self.critic_target([next_obs, self.actor_target(next_obs)])

        target_q_batch = to_tensor(batch['rewards']) + \
            self.discount * to_tensor(1 - batch['terminals1'].astype('float32')) * next_q_values

        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(batch['obs0']), to_tensor(batch['actions'])])
        # `criterion` is assumed to be defined elsewhere in the project (typically nn.MSELoss()).
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()
        policy_loss = -self.critic([to_tensor(batch['obs0']), self.actor(to_tensor(batch['obs0']))]).mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return value_loss.item(), policy_loss.item()

    def initialize(self):
        hard_update(self.actor_target, self.actor)  # Make sure the targets start with the same weights
        hard_update(self.critic_target, self.critic)

    def update_target_net(self):
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def reset(self):
        if self.action_noise is not None:
            self.action_noise.reset()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()
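# --- Sketch of the update helpers used by the DDPG class above ---------------
# soft_update / hard_update are referenced but not shown. Implementations
# consistent with the soft update used throughout this collection
# (theta_target = tau * theta_source + (1 - tau) * theta_target) would be:

def soft_update(target, source, tau):
    # Interpolate the target parameters towards the source network.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)


def hard_update(target, source):
    # Copy the source parameters into the target network verbatim.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)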
class DdpgAgent():
    def __init__(self, config, seed, device="cpu"):
        self.seed = seed

        # -- Set environment
        self.action_size = config["env"]["action_size"]
        self.env = config["env"]["simulator"]
        self.brain_name = config["env"]["brain_name"]
        self.num_agents = config["env"]["num_agents"]

        # -- Construct Actor/Critic models
        self.actor_local = Actor(config["env"]["state_size"], config["env"]["action_size"],
                                 seed, config["actor"]["hidden_layers"]).to(device)
        self.actor_target = Actor(config["env"]["state_size"], config["env"]["action_size"],
                                  seed, config["actor"]["hidden_layers"]).to(device)
        self.checkpoint = {"state_size": config["env"]["state_size"],
                           "action_size": config["env"]["action_size"],
                           "hidden_layers": config["actor"]["hidden_layers"],
                           "state_dict": self.actor_local.state_dict()}
        self.critic_local = Critic(config["env"]["state_size"], config["env"]["action_size"],
                                   seed, config["critic"]["hidden_layers"]).to(device)
        self.critic_target = Critic(config["env"]["state_size"], config["env"]["action_size"],
                                    seed, config["critic"]["hidden_layers"]).to(device)
        # self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=config["learning"]["lr_critic"], weight_decay=0.0001)

        # -- Configure optimizers
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config["learning"]["lr_actor"])
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=config["learning"]["lr_critic"])
        self.optimizer_lr_decay = config["learning"]["lr_decay"]["activate"]
        self.actor_optimizer_lr_scheduler = optim.lr_scheduler.StepLR(
            self.actor_optimizer,
            step_size=config["learning"]["lr_decay"]["actor_step"],
            gamma=config["learning"]["lr_decay"]["actor_gamma"])
        self.critic_optimizer_lr_scheduler = optim.lr_scheduler.StepLR(
            self.critic_optimizer,
            step_size=config["learning"]["lr_decay"]["critic_step"],
            gamma=config["learning"]["lr_decay"]["critic_gamma"])

        # -- Set learning parameters
        self.batch_size = config["learning"]["batch_size"]
        self.buffer_size = config["learning"]["buffer_size"]
        self.discount = config["learning"]["discount"]
        self.max_t = config["learning"]["max_t"]
        self.tau = config["learning"]["soft_update_tau"]
        self.learn_every_n_steps = config["learning"]["learn_every_n_steps"]
        self.num_learn_steps = config["learning"]["num_learn_steps"]
        self.checkpointfile = config["learning"]["checkpointfile"]
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, seed, device)
        self.device = device
        self.add_noise = True
        self.ou_noise = OUNoise(self.action_size, seed)

        self.hard_copy(self.actor_local, self.actor_target)
        self.hard_copy(self.critic_local, self.critic_target)

    def steps(self):
        if self.optimizer_lr_decay:
            self.actor_optimizer_lr_scheduler.step()
            self.critic_optimizer_lr_scheduler.step()

        env_info = self.env.reset(train_mode=True)[self.brain_name]
        self.ou_noise.reset()
        state = env_info.vector_observations
        score = np.zeros(self.num_agents)
        self.step_ctr = 0
        while True:
            action = self.act(state)
            env_info = self.env.step(action)[self.brain_name]
            next_state = env_info.vector_observations  # get next state (for each agent)
            reward = env_info.rewards                  # get reward (for each agent)
            done = env_info.local_done
            self.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if np.any(done):
                break
        return score, self.step_ctr

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        self.step_ctr += 1
        if len(self.memory) > self.batch_size and self.step_ctr % self.learn_every_n_steps == 0:
            for _ in range(self.num_learn_steps):
                self.learn()

    def act(self, state):
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()   # set train=False
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()  # set back train=True
        if self.add_noise:
            action += self.ou_noise.sample()
        return np.clip(action, -1, 1)

    def learn(self):
        states, actions, rewards, next_states, dones = self.memory.sample_random()

        # -------------------- Update Critic -----------------------------
        # Get predicted next-state actions and Q values from target model
        next_actions = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, next_actions).detach()
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.discount * Q_targets_next * (1 - dones))
        # Compute Critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # -------------------- Update Actor -----------------------------
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ------------------ Update Target Networks --------------------
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * local_param + (1.0 - self.tau) * target_param)

    def hard_copy(self, model_a, model_b):
        """Copy model_a to model_b."""
        for param_a, param_b in zip(model_a.parameters(), model_b.parameters()):
            param_b.data.copy_(param_a)

    def reset(self):
        self.actor_local.reset_parameters()
        self.actor_target.reset_parameters()
        self.critic_local.reset_parameters()
        self.critic_target.reset_parameters()
        # self.hard_copy(self.actor_local, self.actor_target)
        # self.hard_copy(self.critic_local, self.critic_target)

    def set_lr(self, actor_lr=None, critic_lr=None):
        # Update the optimizer learning rates in place.
        if actor_lr is not None:
            for param_group in self.actor_optimizer.param_groups:
                param_group['lr'] = actor_lr
        if critic_lr is not None:
            for param_group in self.critic_optimizer.param_groups:
                param_group['lr'] = critic_lr

    def save_model(self):
        torch.save(self.checkpoint, self.checkpointfile)

    def add_noise_on_act(self, noise_on_act):
        """When noise_on_act is True, OU noise is added in act()."""
        self.add_noise = noise_on_act
class MADDPGAgent(object):
    def __init__(self, state_size, action_size, seed):
        super(MADDPGAgent, self).__init__()

        self.state_size = state_size
        self.action_size = action_size
        self.seed = torch.manual_seed(seed)

        # initialise local and target Actor networks
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optim = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # initialise local and target Critic networks
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optim = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=weight_decay)

        # copying the network weights of the local model to the target model
        # self.hard_update(self.actor_local, self.actor_target)
        # self.hard_update(self.critic_local, self.critic_target)

        # initialise the Ornstein-Uhlenbeck noise process
        self.noise = OUNoise((n_agents, action_size), seed)

        # Shared Replay Buffer
        self.memory = ReplayBuffer(buffer_size, batch_size, seed)

        self.t_step = 0

    def hard_update(self, local_model, target_model):
        """Copy weights from source to target network (part of initialization)."""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def step(self, states, actions, rewards, next_states, dones):
        # each agent adds its experience tuple to the replay buffer
        for i in range(n_agents):
            self.memory.add(states[i, :], actions[i, :], rewards[i], next_states[i, :], dones[i])

        self.t_step = (self.t_step + 1) % update_every
        if self.t_step == 0:
            # if enough samples are there then learn
            if len(self.memory) > batch_size:
                for i in range(update_freq):
                    experiences = self.memory.sample()
                    self.learn(experiences, gamma)

    def act(self, states, add_noise=True):
        """Returns actions for given states as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((n_agents, self.action_size))
        self.actor_local.eval()
        # get the actions for each agent
        with torch.no_grad():
            for i in range(n_agents):
                action_i = self.actor_local(states[i]).cpu().data.numpy()
                actions[i, :] = action_i
        self.actor_local.train()
        # Ornstein-Uhlenbeck noise process
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # update critic
        # Get the actions corresponding to next states and then their Q-values
        # from the target critic network
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Now minimize this loss
        self.critic_optim.zero_grad()
        critic_loss.backward()
        # gradient clipping as suggested
        nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optim.step()

        # Update Actor
        # Compute Actor loss
        actions_pred = self.actor_local(states)
        # -ve sign because we want to maximise this value
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # minimizing the loss
        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()

        # update target networks
        self.soft_update(self.critic_local, self.critic_target, tau)
        self.soft_update(self.actor_local, self.actor_target, tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
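# --- OUNoise sketch (assumed dependency) --------------------------------------
# Every agent in this collection samples exploration noise from an OUNoise
# object that is not reproduced here. A common Ornstein-Uhlenbeck implementation
# matching the (size, seed, mu, theta, sigma) constructor used above is sketched
# below; the decaying variant used by one agent would additionally store
# theta_decay/sigma_decay and expose a decay() method.

import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        # Reset the internal state to the mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        # Mean-reverting step plus Gaussian perturbation; returns the new state.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state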
class CriticAgent():
    def __init__(self, state_size, action_size, random_seed, learning_rate, weight_decay, tau):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.count = 0
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.tau = tau

        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.learning_rate,
                                           weight_decay=self.weight_decay)

    def learn(self, actor, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = actor.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = actor.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        actor.actor_optimizer.zero_grad()
        actor_loss.backward()
        actor.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(actor.actor_local, actor.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DDPGAgent():
    """A class to create DDPG agents that interact with and learn from the environment."""

    def __init__(self, state_size, action_size, index, seed):
        """Initialize the Agent.

        Params:
            state_size: dimension of the state
            action_size: dimension of the action
            seed: random seed
        """
        self.config = Configuration()
        self.epsilon = self.config.epsilon
        self.index = index

        # Set up the Actor networks
        self.actor_local = Actor(state_size, action_size, seed,
                                 fc1_units=self.config.actor_fc1,
                                 fc2_units=self.config.actor_fc2).to(self.config.device)
        self.actor_target = Actor(state_size, action_size, seed,
                                  fc1_units=self.config.actor_fc1,
                                  fc2_units=self.config.actor_fc2).to(self.config.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.config.lr_actor)

        # Set up the Critic networks
        self.critic_local = Critic(state_size, action_size, seed,
                                   fc1_units=self.config.critic_fc1,
                                   fc2_units=self.config.critic_fc2).to(self.config.device)
        self.critic_target = Critic(state_size, action_size, seed,
                                    fc1_units=self.config.critic_fc1,
                                    fc2_units=self.config.critic_fc2).to(self.config.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.config.lr_critic,
                                           weight_decay=self.config.weight_decay)

        # Copy over the weights
        self.hard_copy(self.actor_local, self.actor_target)
        self.hard_copy(self.critic_local, self.critic_target)

    def act(self, state):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.config.device)
        # Put model in evaluation mode
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        # Put model back in training mode
        self.actor_local.train()
        return action

    def learn(self, index, experiences, gamma, all_next_actions, all_actions):
        """Update policy and value using the given batch of experiences.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))

        Params:
            experiences: tuple of (s, a, r, s', done) tuples
            gamma: discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Reset the gradients
        self.critic_optimizer.zero_grad()
        index = torch.tensor([index]).to(self.config.device)
        actions_next = torch.cat(all_next_actions, dim=1).to(self.config.device)
        with torch.no_grad():
            q_next = self.critic_target(torch.cat((next_states, actions_next), dim=1))
        q_expected = self.critic_local(torch.cat((states, actions), dim=1))
        q_t = rewards.index_select(1, index) + (gamma * q_next * (1 - dones.index_select(1, index)))
        F.mse_loss(q_expected, q_t.detach()).backward()
        self.critic_optimizer.step()

        self.actor_optimizer.zero_grad()
        actions_predicted = [actions if i == self.index else actions.detach()
                             for i, actions in enumerate(all_actions)]
        actions_predicted = torch.cat(actions_predicted, dim=1).to(self.config.device)
        actor_loss = -self.critic_local(torch.cat((states, actions_predicted), dim=1)).mean()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update(self.critic_local, self.critic_target, self.config.tau)
        self.soft_update(self.actor_local, self.actor_target, self.config.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params:
            local_model: model that weights will be copied from
            target_model: model that weights will be copied to
            tau: soft update parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def hard_copy(self, local_model, target_model):
        for target_param, param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(param.data)

    def save(self):
        torch.save(self.actor_local.state_dict(),
                   str(self.config.actor_fc1) + '_' + str(self.config.actor_fc2) + '_' + str(self.index) + '_actor.pth')
        torch.save(self.critic_local.state_dict(),
                   str(self.config.critic_fc1) + '_' + str(self.config.critic_fc2) + '_' + str(self.index) + '_critic.pth')

    def load(self, actor_file, critic_file):
        self.actor_local.load_state_dict(torch.load(actor_file))
        self.critic_local.load_state_dict(torch.load(critic_file))
        self.hard_copy(self.actor_local, self.actor_target)
        self.hard_copy(self.critic_local, self.critic_target)
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    print('state size:', state_size)
    print('action size:', action_size)

    actor = Actor(state_size, action_size, args)
    critic = Critic(state_size, action_size, args)
    target_critic = Critic(state_size, action_size, args)

    actor_optimizer = optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=args.critic_lr)

    hard_target_update(critic, target_critic)

    # initialize automatic entropy tuning
    target_entropy = -torch.prod(torch.Tensor(env.action_space.shape)).item()
    log_alpha = torch.zeros(1, requires_grad=True)
    alpha = torch.exp(log_alpha)
    alpha_optimizer = optim.Adam([log_alpha], lr=args.alpha_lr)

    writer = SummaryWriter(args.logdir)

    replay_buffer = deque(maxlen=100000)
    recent_rewards = deque(maxlen=100)
    steps = 0

    for episode in range(args.max_iter_num):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            if args.render:
                env.render()

            steps += 1
            mu, std = actor(torch.Tensor(state))
            action = get_action(mu, std)

            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            mask = 0 if done else 1

            replay_buffer.append((state, action, reward, next_state, mask))

            state = next_state
            score += reward

            if steps > args.batch_size:
                mini_batch = random.sample(replay_buffer, args.batch_size)

                actor.train()
                critic.train()
                target_critic.train()
                alpha = train_model(actor, critic, target_critic, mini_batch,
                                    actor_optimizer, critic_optimizer, alpha_optimizer,
                                    target_entropy, log_alpha, alpha)

                soft_target_update(critic, target_critic, args.tau)

        if done:
            recent_rewards.append(score)

        if episode % args.log_interval == 0:
            print('{} episode | score_avg: {:.2f}'.format(episode, np.mean(recent_rewards)))
            writer.add_scalar('log/score', float(score), episode)

        if np.mean(recent_rewards) > args.goal_score:
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)
            ckpt_path = args.save_path + 'model.pth.tar'
            torch.save(actor.state_dict(), ckpt_path)
            print('Average of recent rewards exceeds the goal score, so training stops')
            break
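# --- Sketch of the get_action helper used in the loop above (assumption) -----
# get_action() is defined elsewhere in that project. In this kind of soft
# actor-critic training script it typically just samples from the Gaussian
# policy output and converts the result to a NumPy action; whether a tanh
# squashing is applied depends on the actor definition, which is not shown.

import torch
from torch.distributions import Normal


def get_action(mu, std):
    # Sample an action from N(mu, std) and detach it from the computation graph.
    normal = Normal(mu, std)
    action = normal.sample()
    return action.detach().cpu().numpy()[0]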
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed, batch_size):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, sigma=SIGMA)
        self.random_seed = random_seed
        self.batch_size = batch_size

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, batch_size, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Use random sample from buffer to learn."""
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
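# --- ReplayBuffer sketch (assumed dependency) ---------------------------------
# The ReplayBuffer(action_size, buffer_size, batch_size, seed) used by these
# agents is external. A minimal version consistent with the
# (states, actions, rewards, next_states, dones) tensors unpacked in learn():

import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple('Experience',
                                     field_names=['state', 'action', 'reward', 'next_state', 'done'])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        # Append a new experience to memory.
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        # Draw a random batch and stack each field into a float tensor on `device`.
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        # Current number of stored experiences.
        return len(self.memory)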
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, actor_hidden_layers, critic_hidden_layers, ou_sigma, ou_theta, ou_sigma_decay, ou_theta_decay, energy_penalty, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action actor_hidden_layers (list[int]): dimension of each hidden layer size for the actor critic_hidden_layers (list[int]): dimension of each hidden layer size for the critic ou_sigma (float): sigma parameter for Ornstein-Uhlenbeck noise ou_theta (float): theta parameter for Ornstein-Uhlenbeck noise ou_sigma_decay (float): multiplicative decay for sigma parameter for Ornstein-Uhlenbeck noise. Applied after each episode ou_theta_decay (float): multiplicative decay for theta parameter for Ornstein-Uhlenbeck noise. Applied after each episode energy_penalty (float): weight for L1 loss on actions to reinforce taking smaller actions random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.energy_penalty = energy_penalty # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed, fc_units=actor_hidden_layers).to(device) self.actor_target = Actor(state_size, action_size, random_seed, fc_units=actor_hidden_layers).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed, fc_units=critic_hidden_layers).to(device) self.critic_target = Critic(state_size, action_size, random_seed, fc_units=critic_hidden_layers).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed, theta=ou_theta, sigma=ou_sigma, theta_decay=ou_theta_decay, sigma_decay=ou_sigma_decay) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: try: action += self.noise.sample() except: import pdb pdb.set_trace() return np.clip(action, -1, 1) def reset(self): self.noise.reset() self.noise.decay() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() #print("LOSS INFO: critic_loss={0:.2E}, actor_loss={1:.2E}".format(critic_loss, actor_loss)) # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
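# --- Note: the agent above builds its noise process with theta_decay / sigma_decay
# arguments and calls noise.decay() once per episode in reset(). That class is not
# part of this excerpt; the sketch below shows one plausible Ornstein-Uhlenbeck
# implementation with multiplicative per-episode decay of theta and sigma. The
# default values are illustrative assumptions.
import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process with per-episode parameter decay (sketch)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2,
                 theta_decay=1.0, sigma_decay=1.0):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.theta_decay = theta_decay
        self.sigma_decay = sigma_decay
        np.random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the long-running mean."""
        self.state = copy.copy(self.mu)

    def decay(self):
        """Shrink theta and sigma; called by Agent.reset() after each episode."""
        self.theta *= self.theta_decay
        self.sigma *= self.sigma_decay

    def sample(self):
        """Advance the process one step and return the new noise value."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state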
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
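# --- Note: every agent in this file samples minibatches from a ReplayBuffer with
# the constructor ReplayBuffer(action_size, buffer_size, batch_size, seed) and the
# methods add(), sample() and __len__(). The buffer itself is defined elsewhere;
# this is a minimal sketch of the assumed interface, returning tensors in the
# order that learn() unpacks them.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # reuse the module-level device if one is defined

class ReplayBuffer:
    """Fixed-size buffer of experience tuples (sketch of the assumed interface)."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Store a single transition."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw a random minibatch and convert it to torch tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)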
class Agent(): """Agent that plays and learn from experience. Hyper-paramters chosen from paper.""" def __init__(self, state_size, action_size, max_action, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2): """ Initializes the Agent. @Param: 1. state_size: env.observation_space.shape[0] 2. action_size: env.action_size.shape[0] 3. max_action: list of max values that the agent can take, i.e. abs(env.action_space.high) 4. discount: return rate 5. tau: soft target update 6. policy_noise: noise reset level, DDPG uses Ornstein-Uhlenbeck process 7. noise_clip: sets boundary for noise calculation to prevent from overestimation of Q-values 8. policy_freq: number of timesteps to update the policy (actor) after """ super(Agent, self).__init__() #Actor Network initialization self.actor = Actor(state_size, action_size, max_action).to(device) self.actor.apply(self.init_weights) self.actor_target = copy.deepcopy( self.actor) #loads main model into target model self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=0.001) #Critic Network initialization self.critic = Critic(state_size, action_size).to(device) self.critic.apply(self.init_weights) self.critic_target = copy.deepcopy( self.critic) #loads main model into target model self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=0.001) self.max_action = max_action self.discount = discount self.tau = tau self.policy_noise = policy_noise self.noise_clip = noise_clip self.policy_freq = policy_freq self.total_it = 0 def init_weights(self, layer): """Xaviar Initialization of weights""" if (type(layer) == nn.Linear): nn.init.xavier_normal_(layer.weight) layer.bias.data.fill_(0.01) def select_action(self, state): """Selects an automatic epsilon-greedy action based on the policy""" state = torch.FloatTensor(state.reshape(1, -1)).to(device) return self.actor(state).cpu().data.numpy().flatten() def train(self, replay_buffer: ReplayBuffer): """Train the Agent""" self.total_it += 1 # Sample replay buffer state, action, reward, next_state, done = replay_buffer.sample( ) #sample 256 experiences with torch.no_grad(): # Select action according to policy and add clipped noise noise = (torch.randn_like(action) * self.policy_noise).clamp( -self.noise_clip, self.noise_clip) next_action = ( self.actor_target(next_state) + noise #noise only set in training to prevent from overestimation ).clamp(-self.max_action, self.max_action) # Compute the target Q value target_Q1, target_Q2 = self.critic_target(next_state, next_action) #Q1, Q2 target_Q = torch.min(target_Q1, target_Q2) target_Q = reward + (1 - done) * self.discount * target_Q #TD-target # Get current Q estimates current_Q1, current_Q2 = self.critic(state, action) #Q1, Q2 # Compute critic loss using MSE critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss( current_Q2, target_Q) # Optimize the critic self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # Delayed policy updates (DDPG baseline = 1) if (self.total_it % self.policy_freq == 0): # Compute actor loss actor_loss = -self.critic(state, self.actor(state))[0].mean() # Optimize the actor self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # Soft update by updating the frozen target models for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) for param, target_param in zip(self.actor.parameters(), 
self.actor_target.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) def save(self, filename): """Saves the Actor Critic local and target models""" torch.save(self.critic.state_dict(), "models/checkpoint/" + filename + "_critic") torch.save(self.critic_optimizer.state_dict(), "models/checkpoint/" + filename + "_critic_optimizer") torch.save(self.actor.state_dict(), "models/checkpoint/" + filename + "_actor") torch.save(self.actor_optimizer.state_dict(), "models/checkpoint/" + filename + "_actor_optimizer") def load(self, filename): """Loads the Actor Critic local and target models""" self.critic.load_state_dict( torch.load("models/checkpoint/" + filename + "_critic", map_location='cpu')) self.critic_optimizer.load_state_dict( torch.load("models/checkpoint/" + filename + "_critic_optimizer", map_location='cpu')) #optional self.critic_target = copy.deepcopy(self.critic) self.actor.load_state_dict( torch.load("models/checkpoint/" + filename + "_actor", map_location='cpu')) self.actor_optimizer.load_state_dict( torch.load("models/checkpoint/" + filename + "_actor_optimizer", map_location='cpu')) #optional self.actor_target = copy.deepcopy(self.actor)
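# --- Note: the TD3-style Agent above unpacks two Q-values from its critic
# (current_Q1, current_Q2 = self.critic(state, action)), so the Critic used here is
# a twin-head network. Its definition is not included; the class below is a minimal
# sketch of such a twin critic with illustrative hidden sizes, not the exact
# architecture used above.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TwinCritic(nn.Module):
    """Two independent Q-networks evaluated on the same (state, action) pair."""

    def __init__(self, state_size, action_size, hidden=256):
        super().__init__()
        # Q1 head
        self.l1 = nn.Linear(state_size + action_size, hidden)
        self.l2 = nn.Linear(hidden, hidden)
        self.l3 = nn.Linear(hidden, 1)
        # Q2 head
        self.l4 = nn.Linear(state_size + action_size, hidden)
        self.l5 = nn.Linear(hidden, hidden)
        self.l6 = nn.Linear(hidden, 1)

    def forward(self, state, action):
        sa = torch.cat([state, action], dim=1)
        q1 = self.l3(F.relu(self.l2(F.relu(self.l1(sa)))))
        q2 = self.l6(F.relu(self.l5(F.relu(self.l4(sa)))))
        return q1, q2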
def main(): env = gym.make(args.env_name) env.seed(args.seed) torch.manual_seed(args.seed) num_inputs = env.observation_space.shape[0] num_actions = env.action_space.shape[0] running_state = ZFilter((num_inputs,), clip=5) print('state size:', num_inputs) print('action size:', num_actions) actor = Actor(num_inputs, num_actions, args) critic = Critic(num_inputs, args) discrim = Discriminator(num_inputs + num_actions, args) actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate) critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, weight_decay=args.l2_rate) discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate) # load demonstrations expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb")) demonstrations = np.array(expert_demo) print("demonstrations.shape", demonstrations.shape) writer = SummaryWriter(args.logdir) if args.load_model is not None: saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) ckpt = torch.load(saved_ckpt_path) actor.load_state_dict(ckpt['actor']) critic.load_state_dict(ckpt['critic']) discrim.load_state_dict(ckpt['discrim']) running_state.rs.n = ckpt['z_filter_n'] running_state.rs.mean = ckpt['z_filter_m'] running_state.rs.sum_square = ckpt['z_filter_s'] print("Loaded OK. ZFilter N {}".format(running_state.rs.n)) episodes = 0 train_discrim_flag = True for iter in range(args.max_iter_num): actor.eval(), critic.eval() memory = deque() steps = 0 scores = [] while steps < args.total_sample_size: state = env.reset() score = 0 state = running_state(state) for _ in range(10000): if args.render: env.render() steps += 1 mu, std = actor(torch.Tensor(state).unsqueeze(0)) action = get_action(mu, std)[0] next_state, reward, done, _ = env.step(action) irl_reward = get_reward(discrim, state, action) if done: mask = 0 else: mask = 1 memory.append([state, action, irl_reward, mask]) next_state = running_state(next_state) state = next_state score += reward if done: break episodes += 1 scores.append(score) score_avg = np.mean(scores) print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg)) writer.add_scalar('log/score', float(score_avg), iter) actor.train(), critic.train(), discrim.train() if train_discrim_flag: expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, demonstrations, args) print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100)) if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen: train_discrim_flag = False train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args) if iter % 100 == 0: score_avg = int(score_avg) model_path = os.path.join(os.getcwd(),'save_model') if not os.path.isdir(model_path): os.makedirs(model_path) ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar') save_checkpoint({ 'actor': actor.state_dict(), 'critic': critic.state_dict(), 'discrim': discrim.state_dict(), 'z_filter_n':running_state.rs.n, 'z_filter_m': running_state.rs.mean, 'z_filter_s': running_state.rs.sum_square, 'args': args, 'score': score_avg }, filename=ckpt_path)
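# --- Note: main() above calls get_action(mu, std) and get_reward(discrim, state,
# action), which are defined elsewhere. get_action samples a stochastic action from
# the actor's Gaussian policy; a sketch is given below. get_reward converts the
# discriminator output into an imitation reward (typically a -log term), but its
# exact sign convention depends on how train_discrim labels expert vs. learner
# data, so it is not reproduced here.
import torch

def get_action(mu, std):
    """Sample an action from N(mu, std) and return it as a NumPy array (sketch)."""
    action = torch.normal(mu, std)
    return action.data.numpy()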
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) # update counter self.update_counter = 0 def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for i in range(20): state1, action1, reward1, next_state1, done1 = state[i], action[ i], reward[i], next_state[i], done[i] #print('next state1', next_state[0:2]) #print('state1', state[0:2]) #print('reward1', reward) #print('action1', action[0:2]) #print('done1', done) self.memory.add(state1, action1, reward1, next_state1, done1) self.update_counter += 1 #print('adding to memory - counter', self.update_counter) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: if self.update_counter > (UPDATE_RATE - 1): self.update_counter = 0 for i in range(UPDATE_TIMES): #print('learning - counter',i) experiences = self.memory.sample() states, actions, rewards, next_states, dones = experiences #print('next states', np.shape(next_states)) #print('states', len(states)) #print('rewards', np.shape(rewards)) #print('actions', len(actions)) #print('dones', len(dones)) #print('experiences', np.shape(experiences)) self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() self.update_counter = 0 def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
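# --- Note: the step() method above indexes state[i] ... done[i] for i in range(20),
# i.e. it expects batched transitions from 20 parallel agents (a Reacher-style
# setup), and it reads further module-level constants UPDATE_RATE and UPDATE_TIMES
# that are assumed to exist. The snippet below is a hedged sketch of one interaction
# step using NumPy placeholders in place of the real environment, whose API is not
# shown in this file; the sizes are illustrative.
import numpy as np

num_agents, state_size, action_size = 20, 33, 4            # illustrative sizes
agent = Agent(state_size, action_size, random_seed=0)

states = np.zeros((num_agents, state_size), dtype=np.float32)       # from env.reset()
actions = agent.act(states)                                         # shape (20, action_size), clipped to [-1, 1]
next_states = np.zeros((num_agents, state_size), dtype=np.float32)  # from env.step(actions)
rewards = np.zeros(num_agents, dtype=np.float32)
dones = np.zeros(num_agents, dtype=bool)
agent.step(states, actions, rewards, next_states, dones)            # stores 20 transitions, learns at the UPDATE_RATE interval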
class DDPGAgent(): def __init__(self, state_size, action_size, num_agents): super().__init__() self.state_size = state_size self.action_size = action_size self.num_agents = num_agents # Construct Actor networks self.actor_local = Actor(state_size, action_size).to(device) self.actor_target = Actor(state_size, action_size).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Construct Critic networks self.critic_local = Critic(state_size, action_size).to(device) self.critic_target = Critic(state_size, action_size).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # noise processing self.noise = OUNoise(action_size) def step(self): if len(sharedBuffer) > BATCH_SIZE: experiences = sharedBuffer.sample(self.num_agents) self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states_list, actions_list, rewards, next_states_list, dones = experiences next_states_tensor = torch.cat(next_states_list, dim=1).to(device) states_tensor = torch.cat(states_list, dim=1).to(device) actions_tensor = torch.cat(actions_list, dim=1).to(device) # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models next_actions = [self.actor_target(states) for states in states_list] next_actions_tensor = torch.cat(next_actions, dim=1).to(device) Q_targets_next = self.critic_target(next_states_tensor, next_actions_tensor) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states_tensor, actions_tensor) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss # take the current states and predict actions actions_pred = [self.actor_local(states) for states in states_list] actions_pred_tensor = torch.cat(actions_pred, dim=1).to(device) # -1 * (maximize) Q value for the current prediction actor_loss = - \ self.critic_local(states_tensor, actions_pred_tensor).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() #torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1) self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, n_agents=1, random_seed=4): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.n_agents = n_agents # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed, fc1_units=fc1_size_actor, fc2_units=fc2_size_actor).to(device) self.actor_target = Actor(state_size, action_size, random_seed, fc1_units=fc1_size_actor, fc2_units=fc2_size_actor).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed, fcs1_units=fc1_size_critic, fc2_units=fc2_size_critic).to(device) self.critic_target = Critic(state_size, action_size, random_seed, fcs1_units=fc1_size_critic, fc2_units=fc2_size_critic).to(device) #self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC) # Noise process self.epsilon = epsilon self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) self.timesteps = 0 # Make sure target is with the same weight as the source self.hard_update(self.actor_target, self.actor_local) self.hard_update(self.critic_target, self.critic_local) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" self.timesteps += 1 for x in range(self.n_agents): self.memory.add(state[x], action[x], reward[x], next_state[x], done[x]) # Learn, if enough samples are available in memory if (len(self.memory) > BATCH_SIZE) and (self.timesteps % UPDATE_EVERY == 0): for _ in range(UPDATE_TIMES): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() self.epsilon -= epsilon_decay if add_noise: action += [ np.maximum(self.epsilon, 0.2) * self.noise.sample() for _ in range(self.n_agents) ] return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) self.reset() def hard_update(self, target, source): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
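# --- Note: the agent above also depends on module-level layer sizes and an epsilon
# schedule (fc1_size_actor, fc2_size_actor, fc1_size_critic, fc2_size_critic,
# epsilon, epsilon_decay, UPDATE_EVERY, UPDATE_TIMES) that are defined elsewhere.
# The assignments below are placeholders for illustration only.
fc1_size_actor, fc2_size_actor = 256, 128      # actor hidden-layer sizes (placeholder)
fc1_size_critic, fc2_size_critic = 256, 128    # critic hidden-layer sizes (placeholder)
epsilon, epsilon_decay = 1.0, 1e-6             # noise scale and its per-call decay (placeholder)
UPDATE_EVERY, UPDATE_TIMES = 20, 10            # learn every N steps, M gradient steps each time (placeholder)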
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.eps = 3.0 self.eps_decay = 0.9999 # Actor Network (w/ Target Network) self.actor_local = Actor(state_size * 2, action_size, random_seed).to(device) self.actor_target = Actor(state_size * 2, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size * 2, action_size * 2, random_seed).to(device) self.critic_target = Critic(state_size * 2, action_size * 2, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=0) # Noise process self.noise = OUNoise((1, action_size), random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done, agent_number, learn_iterations=5): """Save experience in replay memory, and use random sample from buffer to learn.""" #self.timestep += 1 # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory and at learning interval settings if len(self.memory ) > BATCH_SIZE: #and self.timestep % LEARN_EVERY == 0: for _ in range(learn_iterations): experiences = self.memory.sample() self.learn(experiences, GAMMA, agent_number) def act(self, states, add_noise): """Returns actions for both agents as per current policy, given their respective states.""" states = torch.from_numpy(states).float().to(device) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(states).cpu().data.numpy() self.actor_local.train() # add noise to actions if add_noise: actions += self.eps * self.noise.sample() actions = np.clip(actions, -1, 1) return actions def reset(self): self.noise.reset() def learn(self, experiences, gamma, agent_number): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) # Since the critic takes the actions of both agents we need to update only # one part of the given action if agent_number == 0: actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1) elif agent_number == 1: actions_next = torch.cat((actions[:, :2], actions_next), dim=1) # Compute Q targets for current states (y_i) Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) # Since the critic takes the actions of both agents we need to update only # one part of the given action if agent_number == 0: actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1) elif agent_number == 1: actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1) # Compute actor loss actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) # update epsilon self.eps *= self.eps_decay self.eps = max(self.eps, 1) self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
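# --- Note: the agent above is written for a two-player task (e.g. Tennis): the
# actor sees both players' observations concatenated (state_size * 2), the critic
# sees both actions (action_size * 2), and the hard-coded slices actions[:, :2] /
# actions[:, 2:] assume action_size == 2. Below is a hedged sketch of how two such
# agents might be queried for a joint action; the sizes are illustrative.
import numpy as np

state_size, action_size = 24, 2
agent_0 = Agent(state_size, action_size, random_seed=0)
agent_1 = Agent(state_size, action_size, random_seed=1)

both_states = np.zeros((1, state_size * 2), dtype=np.float32)   # both observations, flattened
action_0 = agent_0.act(both_states, add_noise=True)             # shape (1, 2)
action_1 = agent_1.act(both_states, add_noise=True)             # shape (1, 2)
joint_action = np.concatenate((action_0, action_1), axis=1)     # shape (1, 4), passed to the environment
# After an environment step, each agent calls step(..., agent_number=0 or 1) so that
# learn() can splice its own predicted action into the joint action for the critic.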
class DDPG: def __init__(self, **kwargs): if 'filename' in kwargs.keys(): data= torch.load(kwargs['filename']) self.config= data["config"] self.scores= data["scores"] elif 'config' in kwargs.keys(): self.config= kwargs['config'] data= {} self.scores= [] else: raise OSError('DDPG: no configuration parameter in class init') self.state_size = self.config["state_size"] self.action_size = self.config["action_size"] memory_size = self.config["memory_size"] actor_lr = self.config["actor_lr"] critic_lr = self.config["critic_lr"] self.batch_size = self.config["batch_size"] self.discount = self.config["discount"] sigma = self.config["sigma"] if self.config["sigma"] else 0.2 self.tau= self.config["tau"] self.seed = self.config["seed"] if self.config["seed"] else 0 self.action_noise= self.config["action_noise"] if self.config["action_noise"] else "No" self.critic_l2_reg= self.config["critic_l2_reg"] if self.config["critic_l2_reg"] else 0.0 random.seed(self.seed) torch.manual_seed(self.seed) self.actor = Actor(self.state_size, self.action_size, nodes= self.config["actor_nodes"], seed= self.seed).to(device) if 'actor' in data.keys(): self.actor.load_state_dict(data['actor']) self.critic = Critic(self.state_size, self.action_size, nodes= self.config["critic_nodes"], seed= self.seed).to(device) self.targetActor = Actor(self.state_size, self.action_size, nodes= self.config["actor_nodes"], seed= self.seed).to(device) self.targetCritic = Critic(self.state_size, self.action_size, nodes= self.config["critic_nodes"], seed= self.seed).to(device) # Initialize parameters self.hard_update(self.actor, self.targetActor) self.hard_update(self.critic, self.targetCritic) self.actor_optimizer = optim.Adam(self.actor.parameters(), lr= actor_lr) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr= critic_lr, weight_decay= self.critic_l2_reg) self.criticLoss = nn.MSELoss() self.noise= None if self.action_noise== "OU": self.noise = OUNoise(np.zeros(self.action_size), sigma= sigma) elif self.action_noise== "No": self.noise = NoNoise() elif self.action_noise== "Normal": self.noise = NormalActionNoise(np.zeros(self.action_size), sigma= sigma) self.memory = ReplayBuffer(self.action_size, memory_size, self.batch_size, self.seed) def hard_update(self, source, target): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) def act(self, state, add_noise= True): """Returns actions for given state as per current policy.""" #state = torch.from_numpy(state).float().to(device) #state= torch.FloatTensor(state).view(1, -1).to(device) #state= torch.FloatTensor(state).unsqueeze(0).to(device) state= torch.FloatTensor(state).to(device) if len(state.size())== 1: state= state.unsqueeze(0) self.actor.eval() with torch.no_grad(): action = self.actor(state).cpu().data.numpy() self.actor.train() if add_noise and self.noise: action += self.noise() return np.clip(action, -1, 1) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" self.memory.add(state, action, reward, next_state, done) if len(self.memory) >= self.batch_size: self.learn() def learn(self): states, actions, rewards, next_states, dones = self.memory.sample() # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.targetActor(next_states) Q_targets_next = self.targetCritic(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (self.discount * Q_targets_next * (1 - dones)) Q_targets = Variable(Q_targets.data, requires_grad=False) # Compute critic loss Q_expected = self.critic(states, actions) critic_loss = self.criticLoss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor(states) actor_loss = -self.critic(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic, self.targetCritic, self.tau) self.soft_update(self.actor, self.targetActor, self.tau) def reset(self): self.noise.reset() def update(self, score= None): if score: self.scores.append(score) def save(self, filename= None): data= {"config": self.config, "actor": self.actor.state_dict(), "scores": self.scores,} if not filename: filename= self.__class__.__name__+ '_'+ datetime.now().strftime("%Y-%m-%d_%H:%M:%S")+ '.data' torch.save(data, filename) torch.save(self.actor.state_dict(), "last_actor.pth")
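# --- Note: the DDPG class above is configured entirely through a dictionary (or a
# previously saved file that contains one). The dictionary below lists the keys the
# constructor reads; the values are illustrative placeholders rather than tuned
# settings.
config = {
    "state_size": 33,
    "action_size": 4,
    "memory_size": int(1e6),
    "actor_lr": 1e-4,
    "critic_lr": 1e-3,
    "batch_size": 128,
    "discount": 0.99,
    "sigma": 0.2,
    "tau": 1e-3,
    "seed": 0,
    "action_noise": "OU",         # one of "OU", "Normal", "No"
    "critic_l2_reg": 0.0,
    "actor_nodes": [256, 128],    # hidden sizes passed to Actor (assumed format)
    "critic_nodes": [256, 128],   # hidden sizes passed to Critic (assumed format)
}

agent = DDPG(config=config)       # or DDPG(filename="<saved .data file>") to restore a run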
def main(): env = gym.make(args.env_name) env.seed(500) torch.manual_seed(500) state_size = env.observation_space.shape[0] action_size = env.action_space.shape[0] print('state size:', state_size) print('action size:', action_size) actor = Actor(state_size, action_size, args) critic = Critic(state_size, args) critic_optimizer = optim.Adam(critic.parameters(), lr=args.critic_lr) writer = SummaryWriter(args.logdir) recent_rewards = deque(maxlen=100) episodes = 0 for iter in range(args.max_iter_num): trajectories = deque() steps = 0 while steps < args.total_sample_size: done = False score = 0 episodes += 1 state = env.reset() state = np.reshape(state, [1, state_size]) while not done: if args.render: env.render() steps += 1 mu, std = actor(torch.Tensor(state)) action = get_action(mu, std) next_state, reward, done, _ = env.step(action) mask = 0 if done else 1 trajectories.append((state, action, reward, mask)) next_state = np.reshape(next_state, [1, state_size]) state = next_state score += reward if done: recent_rewards.append(score) actor.train() train_model(actor, critic, critic_optimizer, trajectories, state_size, action_size) writer.add_scalar('log/score', float(score), episodes) if iter % args.log_interval == 0: print('{} iter | {} episode | score_avg: {:.2f}'.format( iter, episodes, np.mean(recent_rewards))) if np.mean(recent_rewards) > args.goal_score: if not os.path.isdir(args.save_path): os.makedirs(args.save_path) ckpt_path = args.save_path + 'model.pth.tar' torch.save(actor.state_dict(), ckpt_path) print('Recent rewards exceed -300. So end') break
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed, writer): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.writer = writer # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed, fc1_units=L1_SIZE, fc2_units=L2_SIZE).to(device) self.actor_target = Actor(state_size, action_size, random_seed, fc1_units=L1_SIZE, fc2_units=L2_SIZE).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) self.actor_running_loss = 0.0 self.steps = 0 self.epsilon = 1.0 # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed, fcs1_units=L1_SIZE, fc2_units=L2_SIZE).to(device) self.critic_target = Critic(state_size, action_size, random_seed, fcs1_units=L1_SIZE, fc2_units=L2_SIZE).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed, theta=NOISE_THETA, sigma=NOISE_SIGMA) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # self.memory.add(state, action, reward, next_state, done) self.steps += 1 self.writer.add_scalar('rewards', sum(rewards), self.steps) self.writer.add_scalar('action_1', actions[0][0], self.steps) self.writer.add_scalar('action_2', actions[0][1], self.steps) self.writer.add_scalar('action_3', actions[0][2], self.steps) self.writer.add_scalar('action_4', actions[0][3], self.steps) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: noise_add = self.noise.sample() self.writer.add_scalar('action_noise', noise_add.mean(), self.steps) self.writer.add_scalar('action_before_noise', action.mean(), self.steps) self.writer.add_scalar('epsilon', self.epsilon, self.steps) action += self.epsilon * noise_add # self.writer.add_scalar('action_preclip_1', action[0][0], self.steps) # self.writer.add_scalar('action_preclip_2', action[0][1], self.steps) # self.writer.add_scalar('action_preclip_3', action[0][2], self.steps) # self.writer.add_scalar('action_preclip_4', action[0][3], self.steps) return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) self.writer.add_scalar('critic_loss', critic_loss, self.steps) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() self.actor_running_loss += actor_loss.item() self.writer.add_scalar('actor_running_loss', self.actor_running_loss, self.steps) self.writer.add_scalar('actor_loss', actor_loss.item(), self.steps) # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) # ----------------------- update noise ----------------------- # self.epsilon -= EPSILON_DECAY self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
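# --- Note: the agent above expects a TensorBoard SummaryWriter to be injected and
# reads module-level constants (L1_SIZE, L2_SIZE, NOISE_THETA, NOISE_SIGMA,
# EPSILON_DECAY) that are not part of this excerpt. A minimal sketch of the assumed
# wiring, with placeholder values and an illustrative log directory:
from torch.utils.tensorboard import SummaryWriter

L1_SIZE, L2_SIZE = 256, 128            # hidden-layer sizes (placeholder)
NOISE_THETA, NOISE_SIGMA = 0.15, 0.2   # OU-noise parameters (placeholder)
EPSILON_DECAY = 1e-6                   # per-update decay of the noise scale (placeholder)

writer = SummaryWriter(log_dir="runs/ddpg_experiment")
agent = Agent(state_size=33, action_size=4, random_seed=0, writer=writer)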
def training(opt): # ~~~~~~~~~~~~~~~~~~~ hyper parameters ~~~~~~~~~~~~~~~~~~~ # EPOCHS = opt.epochs CHANNELS = 1 H, W = 64, 64 work_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') FEATURE_D = 128 Z_DIM = 100 BATCH_SIZE = opt.batch_size # ~~~~~~~~~~~~~~~~~~~ as per WGAN paper ~~~~~~~~~~~~~~~~~~~ # lr = opt.lr CRITIC_TRAIN_STEPS = 5 WEIGHT_CLIP = 0.01 print(f"Epochs: {EPOCHS}| lr: {lr}| batch size {BATCH_SIZE}|" + f" device: {work_device}") # ~~~~~~~~~~~ creating directories for weights ~~~~~~~~~~~ # if opt.logs: log_dir = Path(f'{opt.logs}').resolve() if log_dir.exists(): shutil.rmtree(str(log_dir)) if opt.weights: Weight_dir = Path(f'{opt.weights}').resolve() if not Weight_dir.exists(): Weight_dir.mkdir() # ~~~~~~~~~~~~~~~~~~~ loading the dataset ~~~~~~~~~~~~~~~~~~~ # trans = transforms.Compose([ transforms.Resize((H, W)), transforms.ToTensor(), transforms.Normalize((0.5, ), (0.5, )) ]) MNIST_data = MNIST(str(opt.data_dir), True, transform=trans, download=True) loader = DataLoader( MNIST_data, BATCH_SIZE, True, num_workers=2, pin_memory=True, ) # ~~~~~~~~~~~~~~~~~~~ creating tensorboard variables ~~~~~~~~~~~~~~~~~~~ # writer_fake = SummaryWriter(f"{str(log_dir)}/fake") writer_real = SummaryWriter(f"{str(log_dir)}/real") loss_writer = SummaryWriter(f"{str(log_dir)}/loss") # ~~~~~~~~~~~~~~~~~~~ loading the model ~~~~~~~~~~~~~~~~~~~ # critic = Critic(img_channels=CHANNELS, feature_d=FEATURE_D).to(work_device) gen = Faker(Z_DIM, CHANNELS, FEATURE_D).to(work_device) if opt.resume: if Path(Weight_dir / 'critic.pth').exists(): critic.load_state_dict( torch.load(str(Weight_dir / 'critic.pth'), map_location=work_device)) if Path(Weight_dir / 'generator.pth').exists(): gen.load_state_dict( torch.load(str(Weight_dir / 'generator.pth'), map_location=work_device)) # ~~~~~~~~~~~~~~~~~~~ create optimizers ~~~~~~~~~~~~~~~~~~~ # critic_optim = optim.RMSprop(critic.parameters(), lr) gen_optim = optim.RMSprop(gen.parameters(), lr) # ~~~~~~~~~~~~~~~~~~~ training loop ~~~~~~~~~~~~~~~~~~~ # # loss variables C_loss_prev = math.inf G_loss_prev = math.inf C_loss = 0 G_loss = 0 C_loss_avg = 0 G_loss_avg = 0 print_gpu_details() # setting the models to train mode critic.train() gen.train() for epoch in range(EPOCHS): # reset the average loss to zero C_loss_avg = 0 G_loss_avg = 0 print_memory_utilization() for batch_idx, (real, _) in enumerate(tqdm(loader)): real = real.to(work_device) fixed_noise = torch.rand(real.shape[0], Z_DIM, 1, 1).to(work_device) # ~~~~~~~~~~~~~~~~~~~ critic loop ~~~~~~~~~~~~~~~~~~~ # with torch.no_grad(): fake = gen(fixed_noise) # dim of (N,1,W,H) for _ in range(CRITIC_TRAIN_STEPS): critic.zero_grad() # ~~~~~~~~~~~ weight cliping as per WGAN paper ~~~~~~~~~~ # for p in critic.parameters(): p.data.clamp_(-WEIGHT_CLIP, WEIGHT_CLIP) # ~~~~~~~~~~~~~~~~~~~ forward ~~~~~~~~~~~~~~~~~~~ # # make it one dimensional array real_predict = critic(real).view(-1) # make it one dimensional array fake_predict = critic(fake.detach()).view(-1) # ~~~~~~~~~~~~~~~~~~~ loss ~~~~~~~~~~~~~~~~~~~ # C_loss = -(torch.mean(fake_predict) - torch.mean(real_predict)) C_loss_avg += C_loss # ~~~~~~~~~~~~~~~~~~~ backward ~~~~~~~~~~~~~~~~~~~ # C_loss.backward() critic_optim.step() # ~~~~~~~~~~~~~~~~~~~ generator loop ~~~~~~~~~~~~~~~~~~~ # gen.zero_grad() # ~~~~~~~~~~~~~~~~~~~ forward ~~~~~~~~~~~~~~~~~~~ # # make it one dimensional array fake_predict = critic(fake).view(-1) # ~~~~~~~~~~~~~~~~~~~ loss ~~~~~~~~~~~~~~~~~~~ # G_loss = -(torch.mean(fake_predict)) G_loss_avg += G_loss # ~~~~~~~~~~~~~~~~~~~ 
backward ~~~~~~~~~~~~~~~~~~~ # G_loss.backward() gen_optim.step() # ~~~~~~~~~~~~~~~~~~~ loading the tensorboard ~~~~~~~~~~~~~~~~~~~ # # will execute at every 50 steps if (batch_idx + 1) % 50 == 0: # ~~~~~~~~~~~~ calculate average loss ~~~~~~~~~~~~~ # C_loss_avg_ = C_loss_avg / (CRITIC_TRAIN_STEPS * batch_idx) G_loss_avg_ = G_loss_avg / (batch_idx) print(f"Epoch [{epoch}/{EPOCHS}] | batch size {batch_idx}" + f"Loss C: {C_loss_avg_:.4f}, loss G: {G_loss_avg_:.4f}") # ~~~~~~~~~~~~ send data to tensorboard ~~~~~~~~~~~~~ # with torch.no_grad(): critic.eval() gen.eval() if BATCH_SIZE > 32: fake = gen(fixed_noise[:32]).reshape( -1, CHANNELS, H, W) data = real[:32].reshape(-1, CHANNELS, H, W) else: fake = gen(fixed_noise).reshape(-1, CHANNELS, H, W) data = real.reshape(-1, CHANNELS, H, W) img_grid_fake = torchvision.utils.make_grid(fake, normalize=True) img_grid_real = torchvision.utils.make_grid(data, normalize=True) step = (epoch + 1) * (batch_idx + 1) writer_fake.add_image("Mnist Fake Images", img_grid_fake, global_step=step) writer_real.add_image("Mnist Real Images", img_grid_real, global_step=step) loss_writer.add_scalar('Critic', C_loss, global_step=step) loss_writer.add_scalar('generator', G_loss, global_step=step) # changing back the model to train mode critic.train() gen.train() # ~~~~~~~~~~~~~~~~~~~ saving the weights ~~~~~~~~~~~~~~~~~~~ # if opt.weights: if C_loss_prev > C_loss_avg: C_loss_prev = C_loss_avg weight_path = str(Weight_dir / 'critic.pth') torch.save(critic.state_dict(), weight_path) if G_loss_prev > G_loss_avg: G_loss_prev = G_loss_avg weight_path = str(Weight_dir / 'generator.pth') torch.save(gen.state_dict(), weight_path)
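# --- Note: training(opt) above reads a handful of command-line options (epochs,
# batch_size, lr, logs, weights, data_dir, resume). The parser is not included in
# this excerpt; the sketch below lists those fields with illustrative defaults (the
# WGAN paper uses RMSProp with lr = 5e-5, which is assumed here).
import argparse

def parse_opt():
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=20, help="number of training epochs")
    parser.add_argument("--batch-size", type=int, default=64, help="minibatch size")
    parser.add_argument("--lr", type=float, default=5e-5, help="RMSProp learning rate")
    parser.add_argument("--logs", type=str, default="logs", help="TensorBoard log directory")
    parser.add_argument("--weights", type=str, default="weights", help="directory for model checkpoints")
    parser.add_argument("--data-dir", type=str, default="data", help="where to download/read MNIST")
    parser.add_argument("--resume", action="store_true", help="load existing critic/generator weights")
    return parser.parse_args()

if __name__ == "__main__":
    training(parse_opt())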
class DDPG(object): def __init__(self, args, nb_states, nb_actions): USE_CUDA = torch.cuda.is_available() if args.seed > 0: self.seed(args.seed) self.nb_states = nb_states self.nb_actions = nb_actions self.gpu_ids = [i for i in range(args.gpu_nums) ] if USE_CUDA and args.gpu_nums > 0 else [-1] self.gpu_used = True if self.gpu_ids[0] >= 0 else False net_cfg = { 'hidden1': args.hidden1, 'hidden2': args.hidden2, 'init_w': args.init_w } self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg).double() self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg).double() self.actor_optim = Adam(self.actor.parameters(), lr=args.p_lr, weight_decay=args.weight_decay) self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg).double() self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg).double() self.critic_optim = Adam(self.critic.parameters(), lr=args.c_lr, weight_decay=args.weight_decay) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) #Create replay buffer self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.random_process = OrnsteinUhlenbeckProcess(size=self.nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma) # Hyper-parameters self.batch_size = args.bsize self.tau_update = args.tau_update self.gamma = args.gamma # Linear decay rate of exploration policy self.depsilon = 1.0 / args.epsilon # initial exploration rate self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.is_training = True self.continious_action_space = False def update_policy(self): pass def cuda_convert(self): if len(self.gpu_ids) == 1: if self.gpu_ids[0] >= 0: with torch.cuda.device(self.gpu_ids[0]): print('model cuda converted') self.cuda() if len(self.gpu_ids) > 1: self.data_parallel() self.cuda() self.to_device() print('model cuda converted and paralleled') def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def cuda(self): self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def data_parallel(self): self.actor = nn.DataParallel(self.actor, device_ids=self.gpu_ids) self.actor_target = nn.DataParallel(self.actor_target, device_ids=self.gpu_ids) self.critic = nn.DataParallel(self.critic, device_ids=self.gpu_ids) self.critic_target = nn.DataParallel(self.critic_target, device_ids=self.gpu_ids) def to_device(self): self.actor.to(torch.device('cuda:{}'.format(self.gpu_ids[0]))) self.actor_target.to(torch.device('cuda:{}'.format(self.gpu_ids[0]))) self.critic.to(torch.device('cuda:{}'.format(self.gpu_ids[0]))) self.critic_target.to(torch.device('cuda:{}'.format(self.gpu_ids[0]))) def observe(self, r_t, s_t1, done): if self.is_training: self.memory.append(self.s_t, self.a_t, r_t, done) self.s_t = s_t1 def random_action(self): action = np.random.uniform(-1., 1., self.nb_actions) # self.a_t = action return action def select_action(self, s_t, decay_epsilon=True): # proto action action = to_numpy(self.actor( to_tensor(np.array([s_t]), gpu_used=self.gpu_used, gpu_0=self.gpu_ids[0])), gpu_used=self.gpu_used).squeeze(0) action += self.is_training * max(self.epsilon, 0) * self.random_process.sample() action = np.clip(action, -1., 1.) 
if decay_epsilon: self.epsilon -= self.depsilon # self.a_t = action return action def reset(self, s_t): self.s_t = s_t self.random_process.reset_states() def load_weights(self, dir): if dir is None: return if self.gpu_used: # load all tensors to GPU (gpu_id) ml = lambda storage, loc: storage.cuda(self.gpu_ids[0]) else: # load all tensors to CPU ml = lambda storage, loc: storage self.actor.load_state_dict( torch.load('output/{}/actor.pkl'.format(dir), map_location=ml)) self.critic.load_state_dict( torch.load('output/{}/critic.pkl'.format(dir), map_location=ml)) print('model weights loaded') def save_model(self, output): if len(self.gpu_ids) == 1 and self.gpu_ids[0] > 0: with torch.cuda.device(self.gpu_ids[0]): torch.save(self.actor.state_dict(), '{}/actor.pt'.format(output)) torch.save(self.critic.state_dict(), '{}/critic.pt'.format(output)) elif len(self.gpu_ids) > 1: torch.save(self.actor.module.state_dict(), '{}/actor.pt'.format(output)) torch.save(self.critic.module.state_dict(), '{}/critic.pt'.format(output)) else: torch.save(self.actor.state_dict(), '{}/actor.pt'.format(output)) torch.save(self.critic.state_dict(), '{}/critic.pt'.format(output)) def seed(self, seed): torch.manual_seed(seed) if len(self.gpu_ids) > 0: torch.cuda.manual_seed_all(seed)
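# --- Note: the DDPG class above relies on helpers (hard_update, to_tensor, to_numpy)
# imported from a utility module that is not shown. The sketches below follow the
# usual definitions; the gpu_used / gpu_0 keyword names are taken from the call
# sites above, and the double dtype matches the .double() networks.
import torch

def hard_update(target, source):
    """Copy source parameters into target verbatim."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def to_tensor(ndarray, gpu_used=False, gpu_0=0, dtype=torch.double):
    """Convert a NumPy array to a tensor, optionally moving it to the given GPU."""
    tensor = torch.as_tensor(ndarray, dtype=dtype)
    return tensor.cuda(gpu_0) if gpu_used else tensor

def to_numpy(tensor, gpu_used=False):
    """Detach a tensor (pulling it off the GPU if needed) and return a NumPy array."""
    return tensor.cpu().detach().numpy() if gpu_used else tensor.detach().numpy()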
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed=0): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.epsilon = EPSILON # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done, timestep): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward # If updating in batches, then add the last memory of the agents (e.g. 20 agents) to a buffer # and if we've met batch size only push to learn in multiples of whatever LEARN_NUM specifies (e.g.10) self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE and timestep % LEARN_EVERY == 0: for _ in range(LEARN_NUM): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.epsilon * self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # gradient clipping for critic if GRAD_CLIPPING > 0: torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), GRAD_CLIPPING) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) # --------------------- and update epsilon decay ----------------------- # if EPSILON_DECAY > 0: self.epsilon -= EPSILON_DECAY self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
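# OUNoise is referenced by the agents in this file but not shown. A minimal sketch of a
# standard Ornstein-Uhlenbeck noise process with commonly used parameters (mu=0.0,
# theta=0.15, sigma=0.2); the class actually imported here may differ in detail.
import copy
import random
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Apply one mean-reverting step with Gaussian perturbation and return the state."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state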
class Agent: def __init__(self, state_size, action_size, seed): """Initializing the agent""" self.state_size = state_size self.action_size = action_size self.seed = seed random.seed(seed) self.actor_local = Actor(state_size, action_size).to(device) self.actor_target = Actor(state_size, action_size).to(device) self.actor_optim = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) self.critic_local = Critic(state_size, action_size).to(device) self.critic_target = Critic(state_size, action_size).to(device) self.critic_optim = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC) self.noise = OUNoise(action_size, seed=self.seed) self.buffer = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed=self.seed) def act(self, state, add_noise=True): """returning actions from the current policy""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: actions += self.noise.sample() return np.clip(actions, -1, +1) def reset(self): self.noise.reset() def save(self, state, action, reward, next_state, done): """saves experiences in buffer""" self.buffer.add(state, action, reward, next_state, done) def start_learn(self): """calls the learn method""" if len(self.buffer) > BATCH_SIZE: experiences = self.buffer.sample() self.learn(experiences, GAMMA) def learn(self, experiences, gamma): """updates policy and value networks given a batch of experiences""" states, actions, rewards, next_states, dones = experiences #updating Critic_local next_actions = self.actor_target(next_states) next_Q_target = self.critic_target(next_states, next_actions) Q_target = rewards + gamma * (1 - dones) * next_Q_target Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_target) self.critic_optim.zero_grad() critic_loss.backward() self.critic_optim.step() #updating Actor_local next_actions = self.actor_local(states) actor_loss = -self.critic_local(states, next_actions).mean() self.actor_optim.zero_grad() actor_loss.backward() self.actor_optim.step() self.soft_update(self.actor_local, self.actor_target, TAU) self.soft_update(self.critic_local, self.critic_target, TAU) def soft_update(self, local_network, target_network, tau): """updates target network using polyak averaging: θ_target = τ*θ_local + (1 - τ)*θ_target""" for local_parameter, target_parameter in zip( local_network.parameters(), target_network.parameters()): target_parameter.data.copy_(tau * local_parameter.data + (1.0 - tau) * target_parameter.data)
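# ReplayBuffer is referenced by these agents but not shown. A minimal sketch of a
# uniform experience-replay buffer with the interface the agents rely on (add, sample,
# __len__); the hyperparameter names mirror the module-level constants assumed above,
# and the default device is an assumption.
import random
from collections import deque, namedtuple
import numpy as np
import torch

class ReplayBuffer:
    """Fixed-size buffer storing experience tuples and sampling uniform minibatches."""

    def __init__(self, action_size, buffer_size, batch_size, seed, device="cpu"):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(self.device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)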
class Agent(): """DDPG Agent : Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed, num_agents=1): """Initialize a DDPG Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed num_agents (int) : Number of agents (1 for DDPG, 2+ for MADDPG -> Will affect the critic) """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.num_agents = num_agents # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Make sure the Actor Target Network has the same weight values as the Local Network for target, local in zip(self.actor_target.parameters(), self.actor_local.parameters()): target.data.copy_(local.data) # Critic Network (w/ Target Network) # Note : in MADDPG, critics have access to all agents obeservations and actions self.critic_local = Critic(state_size * num_agents, action_size * num_agents, random_seed).to(device) self.critic_target = Critic(state_size * num_agents, action_size * num_agents, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Make sure the Critic Target Network has the same weight values as the Local Network for target, local in zip(self.critic_target.parameters(), self.critic_local.parameters()): target.data.copy_(local.data) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory : in MADDPG, the ReplayBuffer is common to all agents #self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done): ##TODO : not used with MADDPG .. """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, noise=0.0): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if ADD_OU_NOISE: action += self.noise.sample() * noise return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): ### Used only for DDPG (use madddpg.maddpg_learn() for MADDPG) """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
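# A minimal sketch (an assumption, not from the original maddpg module) of the joint
# critic input implied by Critic(state_size * num_agents, action_size * num_agents, ...)
# above: per-agent observations and actions are concatenated along the feature
# dimension before being fed to critic_local / critic_target.
import torch

def joint_critic_input(states_per_agent, actions_per_agent):
    """states_per_agent / actions_per_agent: lists of tensors shaped (batch, state_size)
    and (batch, action_size); returns (batch, state_size*N) and (batch, action_size*N)."""
    full_states = torch.cat(states_per_agent, dim=1)
    full_actions = torch.cat(actions_per_agent, dim=1)
    return full_states, full_actions

# usage with two agents (illustrative shapes):
# s0, s1 = torch.randn(128, 24), torch.randn(128, 24)
# a0, a1 = torch.randn(128, 2), torch.randn(128, 2)
# full_states, full_actions = joint_critic_input([s0, s1], [a0, a1])
# q = agent.critic_local(full_states, full_actions)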
class Agent(): """Interacts with and learns from the environment.""" #actor_local = None #actor_target = None #actor_optimizer = None #critic_local = None #critic_target = None #critic_optimizer = None #use a shared memory prepare for 20 Agent scenario memory = None def __init__(self, state_size, action_size, random_seed, SharedReplayBuffer=None): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.epsilon = EPSILON # Actor Network (w/ Target Network) #if Agent.actor_local is None: self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) #if Agent.critic_local is None: # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) #self.actor_local = Agent.actor_local #self.actor_target = Agent.actor_target #self.actor_optimizer = Agent.actor_optimizer # Noise process self.noise = OUNoise(action_size, random_seed) #print(Agent.actor_local,BATCH_SIZE,EPSILON_DECAY) #print(Agent.actor_local) #print(Agent.critic_local) print(device) self.t_step = 0 # Replay memory if SharedReplayBuffer is None: Agent.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) if Agent.memory is None: Agent.memory = SharedReplayBuffer #ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) #print(device) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward #update 10 times for each 20 steps Agent.memory.add(state, action, reward, next_state, done) self.t_step += 1 if self.t_step % UPDATE_FREQ == 0: # Learn, if enough samples are available in memory if len(Agent.memory) > BATCH_SIZE: for i in range(10): experiences = Agent.memory.sample() self.learn(experiences, GAMMA) #for local_param in self.actor_local.parameters(): # print(local_param.data) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.epsilon * self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) # ---------------------------- update noise ---------------------------- # self.epsilon -= EPSILON_DECAY self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
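# A small driver sketch (an assumption, not part of the original training script) of how
# the class-level Agent.memory above is shared in the 20-agent scenario: the first agent
# creates the buffer, the rest receive it via SharedReplayBuffer, and every agent's
# transition lands in the same pool. The names make_agents and step_all are illustrative.
def make_agents(state_size, action_size, n_agents=20):
    """Create n agents that all share one replay buffer via the Agent.memory class attribute."""
    agents = [Agent(state_size, action_size, random_seed=0)]  # first agent creates Agent.memory
    for i in range(1, n_agents):
        agents.append(Agent(state_size, action_size, random_seed=i,
                            SharedReplayBuffer=Agent.memory))
    return agents

def step_all(agents, states, actions, rewards, next_states, dones):
    """Push every agent's transition into the shared buffer; each agent learns on its own schedule."""
    for agent, s, a, r, s2, d in zip(agents, states, actions, rewards, next_states, dones):
        agent.step(s, a, r, s2, d)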
class DDPG(object): def __init__(self, nb_status, nb_actions, args, writer): self.clip_actor_grad = args.clip_actor_grad self.nb_status = nb_status * args.window_length self.nb_actions = nb_actions self.writer = writer self.select_time = 0 # Create Actor and Critic Network net_cfg = { 'hidden1':args.hidden1, 'hidden2':args.hidden2, 'init_method':args.init_method } self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) #Create replay buffer self.memory = rpm(args.rmsize) self.random_process = Myrandom(size=nb_actions) # Hyper-parameters self.batch_size = args.batch_size self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.use_cuda = args.cuda # if self.use_cuda: self.cuda() def update_policy(self, train_actor = True): # Sample batch state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size) # Prepare for the target q batch next_q_values = self.critic_target([ to_tensor(next_state_batch, volatile=True), self.actor_target(to_tensor(next_state_batch, volatile=True)), ]) # print('batch of picture is ok') next_q_values.volatile = False target_q_batch = to_tensor(reward_batch) + \ self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values # Critic update self.critic.zero_grad() q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)]) # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float)) value_loss = nn.MSELoss()(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() self.actor.zero_grad() policy_loss = -self.critic([ to_tensor(state_batch), self.actor(to_tensor(state_batch)) ]) policy_loss = policy_loss.mean() policy_loss.backward() if self.clip_actor_grad is not None: torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad)) if self.writer != None: mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()])) #print(mean_policy_grad) self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time) if train_actor: self.actor_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) return -policy_loss, value_loss def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def train(self): self.actor.train() self.actor_target.train() self.critic.train() self.critic_target.train() def cuda(self): self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): self.memory.append([self.s_t, self.a_t, r_t, s_t1, done]) self.s_t = s_t1 def random_action(self): action = np.random.uniform(-1.,1.,self.nb_actions) self.a_t = action return action def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0): self.eval() # print(s_t.shape) action = 
to_numpy( self.actor(to_tensor(np.array([s_t]))) ).squeeze(0) self.train() noise_level = noise_level * max(self.epsilon, 0) action = action * (1 - noise_level) + (self.random_process.sample() * noise_level) action = np.clip(action, -1., 1.) if decay_epsilon: self.epsilon -= self.depsilon self.a_t = action return action def reset(self, obs): self.s_t = obs self.random_process.reset_status() def load_weights(self, output, num=1): if output is None: return self.actor.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num)) ) self.actor_target.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num)) ) self.critic.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) self.critic_target.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) def save_model(self, output, num): if self.use_cuda: self.actor.cpu() self.critic.cpu() torch.save( self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num) ) torch.save( self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num) ) if self.use_cuda: self.actor.cuda() self.critic.cuda()
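# hard_update and soft_update are imported helpers used by the DDPG classes above and
# below but are not shown. A minimal sketch of the usual definitions, matching the call
# convention soft_update(target, source, tau) and the θ_target = τ*θ_local + (1-τ)*θ_target
# rule used elsewhere in this document; the helpers actually imported may differ.
def hard_update(target, source):
    """Copy source parameters into target exactly (used once at initialization)."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update(target, source, tau):
    """Polyak-average source into target: theta_target <- tau*theta_source + (1-tau)*theta_target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)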
class Agent: def __init__(self, env_name, n_iter, n_states, action_bounds, n_actions, lr): self.env_name = env_name self.n_iter = n_iter self.action_bounds = action_bounds self.n_actions = n_actions self.n_states = n_states self.device = torch.device("cpu") self.lr = lr self.current_policy = Actor(n_states=self.n_states, n_actions=self.n_actions).to(self.device) self.critic = Critic(n_states=self.n_states).to(self.device) self.actor_optimizer = Adam(self.current_policy.parameters(), lr=self.lr, eps=1e-5) self.critic_optimizer = Adam(self.critic.parameters(), lr=self.lr, eps=1e-5) self.critic_loss = torch.nn.MSELoss() self.scheduler = lambda step: max(1.0 - float(step / self.n_iter), 0) self.actor_scheduler = LambdaLR(self.actor_optimizer, lr_lambda=self.scheduler) self.critic_scheduler = LambdaLR(self.critic_optimizer, lr_lambda=self.scheduler) def choose_dist(self, state): state = np.expand_dims(state, 0) state = from_numpy(state).float().to(self.device) with torch.no_grad(): dist = self.current_policy(state) # action *= self.action_bounds[1] # action = np.clip(action, self.action_bounds[0], self.action_bounds[1]) return dist def get_value(self, state): state = np.expand_dims(state, 0) state = from_numpy(state).float().to(self.device) with torch.no_grad(): value = self.critic(state) return value.detach().cpu().numpy() def optimize(self, actor_loss, critic_loss): self.actor_optimizer.zero_grad() actor_loss.backward() # torch.nn.utils.clip_grad_norm_(self.current_policy.parameters(), 0.5) # torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) self.actor_optimizer.step() self.critic_optimizer.zero_grad() critic_loss.backward() # torch.nn.utils.clip_grad_norm_(self.current_policy.parameters(), 0.5) # torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) self.critic_optimizer.step() def schedule_lr(self): # self.total_scheduler.step() self.actor_scheduler.step() self.critic_scheduler.step() def save_weights(self, iteration, state_rms): torch.save( { "current_policy_state_dict": self.current_policy.state_dict(), "critic_state_dict": self.critic.state_dict(), "actor_optimizer_state_dict": self.actor_optimizer.state_dict(), "critic_optimizer_state_dict": self.critic_optimizer.state_dict(), "actor_scheduler_state_dict": self.actor_scheduler.state_dict(), "critic_scheduler_state_dict": self.critic_scheduler.state_dict(), "iteration": iteration, "state_rms_mean": state_rms.mean, "state_rms_var": state_rms.var, "state_rms_count": state_rms.count }, self.env_name + "_weights.pth") def load_weights(self): checkpoint = torch.load(self.env_name + "_weights.pth") self.current_policy.load_state_dict( checkpoint["current_policy_state_dict"]) self.critic.load_state_dict(checkpoint["critic_state_dict"]) self.actor_optimizer.load_state_dict( checkpoint["actor_optimizer_state_dict"]) self.critic_optimizer.load_state_dict( checkpoint["critic_optimizer_state_dict"]) self.actor_scheduler.load_state_dict( checkpoint["actor_scheduler_state_dict"]) self.critic_scheduler.load_state_dict( checkpoint["critic_scheduler_state_dict"]) iteration = checkpoint["iteration"] state_rms_mean = checkpoint["state_rms_mean"] state_rms_var = checkpoint["state_rms_var"] return iteration, state_rms_mean, state_rms_var def set_to_eval_mode(self): self.current_policy.eval() self.critic.eval() def set_to_train_mode(self): self.current_policy.train() self.critic.train()
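# A minimal sketch (illustrative values) of what the LambdaLR schedule above does: the
# lambda returns a multiplier on the base learning rate that decays linearly from 1 to 0
# over n_iter calls to scheduler.step().
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR

n_iter = 100
params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = Adam(params, lr=3e-4)
scheduler = LambdaLR(optimizer, lr_lambda=lambda step: max(1.0 - step / n_iter, 0))

for step in range(3):
    optimizer.step()
    scheduler.step()
    # lr after steps 1..3: 2.97e-4, 2.94e-4, 2.91e-4 (approximately)
    print(optimizer.param_groups[0]["lr"])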
class DDPG(object): def __init__(self, nb_status, nb_actions, args): self.num_actor = 3 self.nb_status = nb_status * args.window_length self.nb_actions = nb_actions self.discrete = args.discrete # Create Actor and Critic Network net_cfg = { 'hidden1': args.hidden1, 'hidden2': args.hidden2, 'use_bn': args.bn } self.actors = [Actor(self.nb_status, self.nb_actions) for _ in range(self.num_actor)] self.actor_targets = [Actor(self.nb_status, self.nb_actions) for _ in range(self.num_actor)] self.actor_optims = [Adam(self.actors[i].parameters(), lr=args.prate) for i in range(self.num_actor)] self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) for i in range(self.num_actor): hard_update(self.actor_targets[i], self.actors[i]) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) # Create replay buffer self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.random_process = Myrandom(size=nb_actions) # Hyper-parameters self.batch_size = args.batch_size self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.use_cuda = args.cuda # if self.use_cuda: self.cuda() def update_policy(self, train_actor=True): # Sample batch state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size) # Prepare for the target q batch next_q_values = 0 for i in range(self.num_actor): next_q_values = next_q_values + self.critic_target([ to_tensor(next_state_batch, volatile=True), self.actor_targets[i](to_tensor(next_state_batch, volatile=True)), ]) # print('batch of picture is ok') next_q_values = next_q_values / self.num_actor next_q_values.volatile = False target_q_batch = to_tensor(reward_batch) + \ self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values # Critic update self.critic.zero_grad() q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)]) # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float)) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() sum_policy_loss = 0 for i in range(self.num_actor): self.actors[i].zero_grad() policy_loss = -self.critic([ to_tensor(state_batch), self.actors[i](to_tensor(state_batch)) ]) policy_loss = policy_loss.mean() policy_loss.backward() if train_actor: self.actor_optims[i].step() sum_policy_loss += policy_loss # Target update soft_update(self.actor_targets[i], self.actors[i], self.tau) soft_update(self.critic_target, self.critic, self.tau) return -sum_policy_loss / self.num_actor, value_loss def cuda(self): for i in range(self.num_actor): self.actors[i].cuda() self.actor_targets[i].cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): self.memory.append([self.s_t, self.a_t, r_t, s_t1, done]) self.s_t = s_t1 def random_action(self): action = np.random.uniform(-1., 1., self.nb_actions) self.a_t = action if self.discrete: return action.argmax() else: return action def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0): actions = [] status = [] tot_score = [] for i in range(self.num_actor): action = to_numpy(self.actors[i](to_tensor(np.array([s_t]), volatile=True))).squeeze(0) 
noise_level = noise_level * max(self.epsilon, 0) action = action + self.random_process.sample() * noise_level status.append(s_t) actions.append(action) tot_score.append(0.) scores = self.critic([to_tensor(np.array(status), volatile=True), to_tensor(np.array(actions), volatile=True)]) for j in range(self.num_actor): tot_score[j] += scores.data[j][0] best = np.array(tot_score).argmax() if decay_epsilon: self.epsilon -= self.depsilon self.a_t = actions[best] return actions[best] def reset(self, obs): self.s_t = obs self.random_process.reset_status() def load_weights(self, output, num=0): if output is None: return for i in range(self.num_actor): actor = self.actors[i] actor_target = self.actor_targets[i] actor.load_state_dict( torch.load('{}/actor{}_{}.pkl'.format(output, num, i)) ) actor_target.load_state_dict( torch.load('{}/actor{}_{}.pkl'.format(output, num, i)) ) self.critic.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) self.critic_target.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) def save_model(self, output, num): if self.use_cuda: for i in range(self.num_actor): self.actors[i].cpu() self.critic.cpu() for i in range(self.num_actor): torch.save( self.actors[i].state_dict(), '{}/actor{}_{}.pkl'.format(output, num, i) ) torch.save( self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num) ) if self.use_cuda: for i in range(self.num_actor): self.actors[i].cuda() self.critic.cuda()
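# A small sketch (illustrative, not from the original file) of the ensemble action
# selection performed in select_action above: each actor head proposes an action for the
# same state, the critic scores every (state, action) pair in one batch, and the
# highest-scoring proposal is executed.
import numpy as np
import torch

def select_best_action(critic, state, candidate_actions):
    """candidate_actions: list of 1-D numpy arrays, one per actor head."""
    with torch.no_grad():
        states = torch.as_tensor(np.stack([state] * len(candidate_actions)), dtype=torch.float32)
        actions = torch.as_tensor(np.stack(candidate_actions), dtype=torch.float32)
        scores = critic([states, actions]).squeeze(-1)  # critic takes [states, actions] in this file
    best = int(torch.argmax(scores).item())
    return candidate_actions[best]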
class Ppo: def __init__(self, N_S, N_A): self.actor_net = Actor(N_S, N_A) self.critic_net = Critic(N_S) self.actor_optim = optim.Adam(self.actor_net.parameters(), lr=lr_actor) self.critic_optim = optim.Adam(self.critic_net.parameters(), lr=lr_critic, weight_decay=l2_rate) self.critic_loss_func = torch.nn.MSELoss() def train(self, memory): memory = np.array(memory) states = torch.tensor(np.vstack(memory[:, 0]), dtype=torch.float32) actions = torch.tensor(list(memory[:, 1]), dtype=torch.float32) rewards = torch.tensor(list(memory[:, 2]), dtype=torch.float32) masks = torch.tensor(list(memory[:, 3]), dtype=torch.float32) values = self.critic_net(states) returns, advants = self.get_gae(rewards, masks, values) old_mu, old_std = self.actor_net(states) pi = self.actor_net.distribution(old_mu, old_std) old_log_prob = pi.log_prob(actions).sum(1, keepdim=True) n = len(states) arr = np.arange(n) for epoch in range(1): np.random.shuffle(arr) for i in range(n // batch_size): b_index = arr[batch_size * i:batch_size * (i + 1)] b_states = states[b_index] b_advants = advants[b_index].unsqueeze(1) b_actions = actions[b_index] b_returns = returns[b_index].unsqueeze(1) mu, std = self.actor_net(b_states) pi = self.actor_net.distribution(mu, std) new_prob = pi.log_prob(b_actions).sum(1, keepdim=True) old_prob = old_log_prob[b_index].detach() # KL-divergence penalty term # KL_penalty = self.kl_divergence(old_mu[b_index],old_std[b_index],mu,std) ratio = torch.exp(new_prob - old_prob) surrogate_loss = ratio * b_advants values = self.critic_net(b_states) critic_loss = self.critic_loss_func(values, b_returns) self.critic_optim.zero_grad() critic_loss.backward() self.critic_optim.step() ratio = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) clipped_loss = ratio * b_advants actor_loss = -torch.min(surrogate_loss, clipped_loss).mean() #actor_loss = -(surrogate_loss-beta*KL_penalty).mean() self.actor_optim.zero_grad() actor_loss.backward() self.actor_optim.step() # compute the KL divergence between old and new Gaussian policies def kl_divergence(self, old_mu, old_sigma, mu, sigma): old_mu = old_mu.detach() old_sigma = old_sigma.detach() kl = torch.log(old_sigma) - torch.log(sigma) + (old_sigma.pow(2) + (old_mu - mu).pow(2)) / \ (2.0 * sigma.pow(2)) - 0.5 return kl.sum(1, keepdim=True) # compute GAE (generalized advantage estimation) def get_gae(self, rewards, masks, values): rewards = torch.Tensor(rewards) masks = torch.Tensor(masks) returns = torch.zeros_like(rewards) advants = torch.zeros_like(rewards) running_returns = 0 previous_value = 0 running_advants = 0 for t in reversed(range(0, len(rewards))): # compute A_t as a discounted weighted sum of TD errors running_returns = rewards[t] + gamma * running_returns * masks[t] running_tderror = rewards[t] + gamma * previous_value * masks[t] - \ values.data[t] running_advants = running_tderror + gamma * lambd * \ running_advants * masks[t] returns[t] = running_returns previous_value = values.data[t] advants[t] = running_advants # normalize the advantages advants = (advants - advants.mean()) / advants.std() return returns, advants
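# A small worked sketch (illustrative tensors) of the clipped surrogate objective used in
# Ppo.train above: the probability ratio is clipped to [1-epsilon, 1+epsilon] and the
# per-sample minimum of the clipped and unclipped terms is maximized, so its negative is
# minimized. epsilon = 0.2 here is only an illustrative value for the module-level clip
# constant the class assumes.
import torch

epsilon = 0.2
ratio = torch.tensor([0.5, 1.0, 1.5])        # pi_new / pi_old for three samples
advantage = torch.tensor([1.0, -1.0, 2.0])
surrogate = ratio * advantage
clipped = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantage
actor_loss = -torch.min(surrogate, clipped).mean()
# surrogate = [0.5, -1.0, 3.0]; clipped = [0.8, -1.0, 2.4]; elementwise min = [0.5, -1.0, 2.4]
# actor_loss = -(0.5 - 1.0 + 2.4) / 3 ≈ -0.6333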
class DDPG(object): def __init__(self, nb_status, nb_actions, args, writer): self.clip_actor_grad = args.clip_actor_grad self.nb_status = nb_status * args.window_length self.nb_actions = nb_actions self.discrete = args.discrete self.pic = args.pic self.writer = writer self.select_time = 0 if self.pic: self.nb_status = args.pic_status # Create Actor and Critic Network net_cfg = { 'hidden1':args.hidden1, 'hidden2':args.hidden2, 'use_bn':args.bn, 'init_method':args.init_method } if args.pic: self.cnn = CNN(1, args.pic_status) self.cnn_target = CNN(1, args.pic_status) self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate) self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) if args.pic: hard_update(self.cnn_target, self.cnn) #Create replay buffer self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.random_process = Myrandom(size=nb_actions) # Hyper-parameters self.batch_size = args.batch_size self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.use_cuda = args.cuda # if self.use_cuda: self.cuda() def normalize(self, pic): pic = pic.swapaxes(0, 2).swapaxes(1, 2) return pic def update_policy(self): # Sample batch state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size) # Prepare for the target q batch if self.pic: state_batch = np.array([self.normalize(x) for x in state_batch]) state_batch = to_tensor(state_batch, volatile=True) state_batch = self.cnn(state_batch) next_state_batch = np.array([self.normalize(x) for x in next_state_batch]) next_state_batch = to_tensor(next_state_batch, volatile=True) next_state_batch = self.cnn_target(next_state_batch) next_q_values = self.critic_target([ next_state_batch, self.actor_target(next_state_batch) ]) else: next_q_values = self.critic_target([ to_tensor(next_state_batch, volatile=True), self.actor_target(to_tensor(next_state_batch, volatile=True)), ]) # print('batch of picture is ok') next_q_values.volatile = False target_q_batch = to_tensor(reward_batch) + \ self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values # Critic update self.critic.zero_grad() if self.pic: self.cnn.zero_grad() if self.pic: state_batch.volatile = False q_batch = self.critic([state_batch, to_tensor(action_batch)]) else: q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)]) # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float)) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() if self.pic: self.cnn_optim.step() self.actor.zero_grad() if self.pic: self.cnn.zero_grad() if self.pic: state_batch.volatile = False policy_loss = -self.critic([ state_batch, self.actor(state_batch) ]) else: policy_loss = -self.critic([ to_tensor(state_batch), self.actor(to_tensor(state_batch)) ]) policy_loss = policy_loss.mean() policy_loss.backward() if 
self.clip_actor_grad is not None: torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad)) if self.writer != None: mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()])) #print(mean_policy_grad) self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time) self.actor_optim.step() if self.pic: self.cnn_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) if self.pic: soft_update(self.cnn_target, self.cnn, self.tau) return -policy_loss, value_loss def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() if(self.pic): self.cnn.eval() self.cnn_target.eval() def train(self): self.actor.train() self.actor_target.train() self.critic.train() self.critic_target.train() if(self.pic): self.cnn.train() self.cnn_target.train() def cuda(self): self.cnn.cuda() self.cnn_target.cuda() self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): self.memory.append([self.s_t, self.a_t, r_t, s_t1, done]) self.s_t = s_t1 def random_action(self, fix=False): action = np.random.uniform(-1.,1.,self.nb_actions) self.a_t = action if self.discrete and fix == False: action = action.argmax() # if self.pic: # action = np.concatenate((softmax(action[:16]), softmax(action[16:]))) return action def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0): self.eval() if self.pic: s_t = self.normalize(s_t) s_t = self.cnn(to_tensor(np.array([s_t]))) if self.pic: action = to_numpy( self.actor_target(s_t) ).squeeze(0) else: action = to_numpy( self.actor(to_tensor(np.array([s_t]))) ).squeeze(0) self.train() noise_level = noise_level * max(self.epsilon, 0) if np.random.uniform(0, 1) < noise_level: action = self.random_action(fix=True) # episilon greedy if decay_epsilon: self.epsilon -= self.depsilon self.a_t = action if return_fix: return action if self.discrete: return action.argmax() else: return action def reset(self, obs): self.s_t = obs self.random_process.reset_status() def load_weights(self, output, num=1): if output is None: return self.actor.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num)) ) self.actor_target.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num)) ) self.critic.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) self.critic_target.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) def save_model(self, output, num): if self.use_cuda: self.cnn.cpu() self.actor.cpu() self.critic.cpu() torch.save( self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num) ) torch.save( self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num) ) if self.use_cuda: self.cnn.cuda() self.actor.cuda() self.critic.cuda()
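# The volatile=True / .volatile = False idiom used throughout update_policy above is
# pre-0.4 PyTorch. A sketch of the equivalent target-Q computation with torch.no_grad(),
# written as a free function and assuming the same member objects (critic_target,
# actor_target, discount) and numpy batches from the replay buffer; the rest of the
# update is unchanged.
import numpy as np
import torch

def compute_target_q(critic_target, actor_target, discount,
                     reward_batch, next_state_batch, terminal_batch):
    """y = r + gamma * (1 - done) * Q'(s', mu'(s')), with no gradient through the targets."""
    with torch.no_grad():
        next_states = torch.as_tensor(next_state_batch, dtype=torch.float32)
        next_q_values = critic_target([next_states, actor_target(next_states)])
        rewards = torch.as_tensor(reward_batch, dtype=torch.float32)
        not_done = torch.as_tensor(1.0 - terminal_batch.astype(np.float32))
        return rewards + discount * not_done * next_q_values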
def main(): env = gym.make(args.env_name) env.seed(args.seed) torch.manual_seed(args.seed) num_inputs = env.observation_space.shape[0] num_actions = env.action_space.shape[0] running_state = ZFilter((num_inputs,), clip=5) print('state size:', num_inputs) print('action size:', num_actions) actor = Actor(num_inputs, num_actions, args) critic = Critic(num_inputs, args) actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate) critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, weight_decay=args.l2_rate) writer = SummaryWriter(comment="-ppo_iter-" + str(args.max_iter_num)) if args.load_model is not None: saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) ckpt = torch.load(saved_ckpt_path) actor.load_state_dict(ckpt['actor']) critic.load_state_dict(ckpt['critic']) running_state.rs.n = ckpt['z_filter_n'] running_state.rs.mean = ckpt['z_filter_m'] running_state.rs.sum_square = ckpt['z_filter_s'] print("Loaded checkpoint; ZFilter N = {}".format(running_state.rs.n)) episodes = 0 for iter in range(args.max_iter_num): actor.eval(), critic.eval() memory = deque() steps = 0 scores = [] while steps < args.total_sample_size: state = env.reset() score = 0 state = running_state(state) for _ in range(10000): if args.render: env.render() steps += 1 mu, std = actor(torch.Tensor(state).unsqueeze(0)) action = get_action(mu, std)[0] next_state, reward, done, _ = env.step(action) if done: mask = 0 else: mask = 1 memory.append([state, action, reward, mask]) next_state = running_state(next_state) state = next_state score += reward if done: break episodes += 1 scores.append(score) score_avg = np.mean(scores) print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg)) writer.add_scalar('log/score', float(score_avg), iter) actor.train(), critic.train() train_model(actor, critic, memory, actor_optim, critic_optim, args) if iter % 100 == 0: # save a checkpoint every 100 iterations score_avg = int(score_avg) model_path = os.path.join(os.getcwd(),'save_model') if not os.path.isdir(model_path): os.makedirs(model_path) ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar') save_checkpoint({ 'actor': actor.state_dict(), 'critic': critic.state_dict(), 'z_filter_n':running_state.rs.n, 'z_filter_m': running_state.rs.mean, 'z_filter_s': running_state.rs.sum_square, 'args': args, 'score': score_avg }, filename=ckpt_path)
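# ZFilter is referenced in main() above (and its rs.n / rs.mean / rs.sum_square fields are
# checkpointed) but the class is not shown. A minimal sketch of the usual running
# mean/std observation filter it implements (Welford-style updates, clipping to
# [-clip, clip]); the utility actually imported by this script may differ in detail.
import numpy as np

class RunningStat:
    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape)
        self.sum_square = np.zeros(shape)  # running sum of squared deviations from the mean

    def push(self, x):
        x = np.asarray(x)
        self.n += 1
        if self.n == 1:
            self.mean = x.copy()
        else:
            old_mean = self.mean.copy()
            self.mean = old_mean + (x - old_mean) / self.n
            self.sum_square = self.sum_square + (x - old_mean) * (x - self.mean)

    @property
    def std(self):
        var = self.sum_square / (self.n - 1) if self.n > 1 else np.square(self.mean)
        return np.sqrt(var)

class ZFilter:
    """Normalize observations with running statistics, then clip to [-clip, clip]."""

    def __init__(self, shape, clip=5.0):
        self.rs = RunningStat(shape)
        self.clip = clip

    def __call__(self, x):
        self.rs.push(x)
        x = (x - self.rs.mean) / (self.rs.std + 1e-8)
        return np.clip(x, -self.clip, self.clip)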