class Agent():
    def __init__(self, state_size, action_size, n_agents, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents
        self.seed = random.seed(random_seed)

        # Actor Network
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((n_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done, timestep):
        # Save each agent's experience in memory
        for state, action, reward, next_state, done in zip(state, action, reward,
                                                           next_state, done):
            self.memory.add(state, action, reward, next_state, done)

        # Only learn every N_LEARN_TIMESTEPS steps
        if timestep % N_LEARN_TIMESTEPS != 0:
            return

        # If enough samples are in memory, run several updates
        if len(self.memory) > BATCH_SIZE:
            for i in range(N_LEARN_UPDATES):
                # Load a random sample of tuples from memory and learn from it
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        # Return the action, clipped to the valid range
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted actions and Q values from the target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Actor loss: negative sign for gradient ascent on Q
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        for local_param, target_param in zip(local_model.parameters(),
                                             target_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent():
    def __init__(self, state_size, action_size, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor w/ target
        self.actor_local = Actor(state_size, action_size, seed=random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed=random_seed).to(device)
        self.actor_opt = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic w/ target
        self.critic_local = Critic(state_size, action_size, seed=random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed=random_seed).to(device)
        self.critic_opt = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        # Misc
        self.noise = OUNoise(action_size, random_seed)
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, +1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # update critic
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + gamma * Q_targets_next * (1 - dones)
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # update actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # target network updates
        self.soft_update(self.actor_local, self.actor_target, TAU)
        self.soft_update(self.critic_local, self.critic_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            mixed_param = tau * local_param.data + (1 - tau) * target_param.data
            target_param.data.copy_(mixed_param)
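# The agents above reference hyperparameter constants (BUFFER_SIZE, BATCH_SIZE,
# GAMMA, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY, N_LEARN_TIMESTEPS,
# N_LEARN_UPDATES) and a `device` defined elsewhere in their modules. A minimal
# sketch of that header, assuming typical DDPG values; the exact numbers these
# agents were tuned with are not given in the source:
import random
from collections import deque, namedtuple

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

BUFFER_SIZE = int(1e6)    # replay buffer size
BATCH_SIZE = 128          # minibatch size
GAMMA = 0.99              # discount factor
TAU = 1e-3                # soft update interpolation factor
LR_ACTOR = 1e-4           # actor learning rate
LR_CRITIC = 1e-3          # critic learning rate
WEIGHT_DECAY = 0          # critic L2 weight decay
N_LEARN_TIMESTEPS = 20    # learn every N environment steps
N_LEARN_UPDATES = 10      # number of sampled updates per learning phase

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")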
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, params, device = DEVICE, critic_input_size = None): """Initialize an Agent object. """ self.params = params self.state_size = params.STATE_SIZE self.action_size = params.ACTION_SIZE self.seed = params.SEED self.tau = params.TAU self.device = device if critic_input_size is None: critic_input_size = 2 * (self.state_size + self.action_size) # Actor Network (w/ Target Network) self.actor_local = Actor(self.state_size, self.action_size, self.seed).to(device) self.actor_target = Actor(self.state_size, self.action_size, self.seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=params.LR_ACTOR, weight_decay=params.WEIGHT_DECAY_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(critic_input_size, self.seed).to(device) self.critic_target = Critic(critic_input_size, self.seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=params.LR_CRITIC, weight_decay=params.WEIGHT_DECAY_CRITIC) # Noise process self.noise = OUNoise(self.action_size, self.seed, mu=0., theta=params.NOISE_THETA, sigma=params.NOISE_SIGMA) # Parameters for learning self.gamma = params.GAMMA self.learning_step = 0 # Counter for learning steps def act(self, state, add_noise=False, sigma = 0.1): """ Returns actions for given state as per current policy. Arguments: state - input state add_noise - can be: False - No nose added (default) 'OU' - Ornstein-Uhlenbeck noise added 'rand' - uniformly random noise added 'sigma' - noise is scaled from -simga/2 to sigma/2. Works with 'rand' noise """ state = torch.from_numpy(state).float().to(self.device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: if add_noise == 'OU': action += self.noise.sample() else: action += sigma * np.random.rand(len(action)) - sigma / 2 return np.clip(action, -1, 1) # Clipping is necessary if we are adding noise else: return action def reset(self): self.noise.reset() def learn(self, states, actions, rewards, next_states, dones, next_actions, ag2_states, ag2_actions, ag2_next_states, ag2_next_actions): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== states, actions, rewards, next_states, dones - parameters for agent next_actions - actions produced by target network ag2_states, ag2_actions, ag2_next_states - parameters for the other agent ag2_next_actions - actions produced by target network of the other agent """ # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models with torch.no_grad(): Q_targets_next = self.critic_target(next_states, next_actions, ag2_next_states, ag2_next_actions) # Compute Q targets for current states Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions, ag2_states, ag2_actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss pred_actions = self.actor_local(states) actor_loss = -self.critic_local(states, pred_actions, ag2_states, ag2_next_actions).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, seed, fc1=400, fc2=300): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.num_agents = num_agents self.noise = [ OrnsteinUhlenbeckProcess(size=(action_size, ), std=0.2) for i in range(num_agents) ] # actor local and target network (Policy gradient) self.actor_local = Actor(state_size, action_size, fc1, fc2, seed).to(device) self.actor_target = Actor(state_size, action_size, fc1, fc2, seed).to(device) # critic local and target network (Q-Learning) self.critic_local = Critic(state_size, action_size, fc1, fc2, seed).to(device) self.critic_target = Critic(state_size, action_size, fc1, fc2, seed).to(device) # optimizer for critic and actor network self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=ACTOR_LR) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=CRITIC_LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory for i in range(self.num_agents): self.memory.add(state[i], action[i], reward[i], next_state[i], done[i]) self.t_step += 1 # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: if self.t_step % UPDATE_EVERY == 0: for i in range(UPDATE_TIMES): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, training=True): """Returns continous actions values for all action for given state as per current policy. Params ====== state (array_like): current state """ state = torch.from_numpy(state).float().detach().to(device) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(state).cpu().data.numpy() self.actor_local.train() noise = np.array( [self.noise[i].sample() for i in range(self.num_agents)]) return np.clip(actions + noise, -1, 1) def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1) actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def reset_random(self): for i in range(self.num_agents): self.noise[i].reset_states()
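# Every agent in this collection draws exploration noise from an
# Ornstein-Uhlenbeck process (`OUNoise` / `OrnsteinUhlenbeckProcess`) defined
# elsewhere. A minimal sketch of the common OUNoise; the theta/sigma defaults
# are typical values, not necessarily the ones these agents were trained with:
import copy
import random

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated, mean-reverting noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean mu."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process by dx = theta*(mu - x) + sigma*N(0, 1) and return the state."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state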
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)

    # writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK. ZFilter n = {}".format(running_state.rs.n))

    episodes = 0

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0
            state = running_state(state)

            for _ in range(10000):
                if args.render:
                    env.render()

                steps += 1
                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                mask = 0 if done else 1
                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state
                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{} episode score is {:.2f}'.format(episodes, score_avg))
        # writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()
        train_discrim(discrim, memory, discrim_optim, demonstrations, args)
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local1 = Actor(state_size, action_size, random_seed).to(device) self.actor_target1 = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer1 = optim.Adam(self.actor_local1.parameters(), lr=LR_ACTOR) self.actor_local2 = Actor(state_size, action_size, random_seed).to(device) self.actor_target2 = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer2 = optim.Adam(self.actor_local2.parameters(), lr=LR_ACTOR) #critic_state_size = np.reshape(state_size, 48) # Critic Network (w/ Target Network) self.critic_local1 = Critic(state_size*2, action_size, random_seed).to(device) self.critic_target1 = Critic(state_size*2,action_size, random_seed).to(device) self.critic_optimizer1 = optim.Adam(self.critic_local1.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) self.critic_local2 = Critic(state_size*2, action_size, random_seed).to(device) self.critic_target2 = Critic(state_size*2, action_size, random_seed).to(device) self.critic_optimizer2 = optim.Adam(self.critic_local2.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > 7000: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" #self.noise = OUNoise(self.action_size, random_seed,sigma=sigma) state = torch.from_numpy(state).float().to(device) self.actor_local1.eval() with torch.no_grad(): action1 = self.actor_local1(state[0]).cpu().data.numpy() self.actor_local1.train() if add_noise: action1 += self.noise.sample() self.actor_local2.eval() with torch.no_grad(): action2 = self.actor_local2(state[1]).cpu().data.numpy() self.actor_local2.train() if add_noise: action2 += self.noise.sample() return np.vstack((np.clip(action1, -1, 1), np.clip(action2, -1, 1))) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences statesforcritic = torch.reshape(states, (BATCH_SIZE, self.state_size*2)) nextstatesforcritic = torch.reshape(next_states, (BATCH_SIZE, self.state_size*2)) actionsforcritic = torch.reshape(actions, (BATCH_SIZE, self.action_size*2)) nextstatesforactor = torch.split(nextstatesforcritic,self.state_size,1) statesforactor = torch.split(statesforcritic,self.state_size,1) actionsforactor = torch.split(actionsforcritic,self.action_size,1) rewardsforactor = torch.split(rewards,1,1) donesforactor = torch.split(dones,1,1) # --------------------------- update critic 1---------------------------- # # Get predicted next-state actions and Q values from target models actions_next1 = self.actor_target1(nextstatesforactor[0]) actions_next2 = self.actor_target2(nextstatesforactor[1]) actions_next = torch.cat((actions_next1, actions_next2), 1) Q_targets_next_1 = self.critic_target1(nextstatesforcritic, actions_next1) # Compute Q targets for current states (y_i) Q_targets1 = rewardsforactor[0] + (gamma * Q_targets_next_1 * (1 - donesforactor[0])) # Compute critic loss Q_expected1 = self.critic_local1(statesforcritic, actionsforactor[0]) critic_loss1 = F.mse_loss(Q_expected1, Q_targets1) # Minimize the loss self.critic_optimizer1.zero_grad() critic_loss1.backward() self.critic_optimizer1.step() Q_targets_next_2 = self.critic_target2(nextstatesforcritic, actions_next2) # Compute Q targets for current states (y_i) Q_targets2 = rewardsforactor[1] + (gamma * Q_targets_next_2 * (1 - donesforactor[1])) # Compute critic loss Q_expected2 = self.critic_local2(statesforcritic, actionsforactor[1]) critic_loss2 = F.mse_loss(Q_expected2, Q_targets2) # Minimize the loss self.critic_optimizer2.zero_grad() critic_loss2.backward() self.critic_optimizer2.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred1 = self.actor_local1(statesforactor[0]) actor_loss1 = -self.critic_local1(statesforcritic, actions_pred1).mean() # Minimize the loss self.actor_optimizer1.zero_grad() actor_loss1.backward() self.actor_optimizer1.step() actions_pred2 = self.actor_local2(statesforactor[1]) actor_loss2 = -self.critic_local2(statesforcritic, actions_pred2).mean() # Minimize the loss self.actor_optimizer2.zero_grad() actor_loss2.backward() self.actor_optimizer2.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local1, self.critic_target1, TAU) self.soft_update(self.critic_local2, self.critic_target2, TAU) self.soft_update(self.actor_local1, self.actor_target1, TAU) self.soft_update(self.actor_local2, self.actor_target2, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, n_agents, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise((n_agents, action_size), random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) # Epsilon self.epsilon = EPSILON # # Make sure target is with the same weight as the source # self.hard_update(self.actor_target, self.actor_local) # self.hard_update(self.critic_target, self.critic_local) def step(self, states, actions, rewards, next_states, dones, timestep): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) #self.memory.add(states, actions, rewards, next_states, dones) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE and timestep % UPDATE_EVERY == 0: for _ in range(UPDATE_TIMES): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: # epsilon decay self.epsilon -= EPSILON_DECAY self.epsilon = np.maximum(self.epsilon, 0.001) action += self.epsilon * self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # perform gradient clipping #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def load_weights(self, cp_actor, cp_critic): self.critic_local.load_state_dict(torch.load(cp_critic)) self.actor_local.load_state_dict(torch.load(cp_actor)) def eval_act(self, state): self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() return action
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed,num_agents=1): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed num_agents (int) : number of agents in the environment """ """ Base Working for multiple agents ====== Many different agents will sample the environment at the same time to get different states, for which based on the current policy actions will be decided, rewards will be received along with the next states. All the agents update the same experience replay buffer and utilise the same neural net to decide on the optimal set of actions. This should theoretically increase training efficiency since so many different states are being experienced at the same time. """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise((num_agents,action_size),random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) #Initial target and local networks with same weights (Student Hub Discussion) self.hard_update(self.actor_local,self.actor_target) self.hard_update(self.critic_local,self.critic_target) def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory.""" # Save experience / reward for state, action, reward, next_state, done in zip(states, actions,rewards,next_states,dones): self.memory.add(state, action, reward, next_state, done) """To decouple learning from experience collection and use random sample from buffer to learn.""" def update(self): # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise and np.random.random() < eps: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(),1) # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) def hard_update(self,local_model,target_model): for target_param,local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(local_param.data)
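# The agents sample training batches from a `ReplayBuffer` defined elsewhere.
# A minimal sketch matching the (action_size, buffer_size, batch_size, seed)
# constructor used above; the storage details are an assumption:
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class ReplayBuffer:
    """Fixed-size buffer of experience tuples with uniform random sampling."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     field_names=["state", "action", "reward",
                                                  "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append a new experience tuple to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw a uniform random batch and stack it into float tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Current number of stored experiences."""
        return len(self.memory)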
class Agent(): """Implements a DDPG Agent Args: state_size (int): dimension of each state action_size (int): dimension of each action device (str, optional): device used for tensor operations buffer_size (int, optional): size of the experience replay buffer batch_size (int, optional): size of batch sampled for experience replay lr (float, optional): learning rate of both actor and critic models lr_steps (int, optional): number of steps between each scheduler step lr_gamma (float, optional): LR multiplier applied at each scheduler step gamma (float, optional): discount factor tay (float, optional): soft update rate noise_mean (float, optional): mean of Ornstein-Uhlenbeck process noise_theta (float, optional): theta parameter Ornstein-Uhlenbeck process noise_sigma (float, optional): sigma parameter of Ornstein-Uhlenbeck process grad_clip (float, optional): gradient clip """ def __init__(self, state_size, action_size, train=False, device=None, buffer_size=1e6, batch_size=128, lr=1e-3, gamma=0.99, tau=1e-3, update_freq=20, nb_updates=10, noise_mean=0, noise_theta=0.05, noise_sigma=0.15, eps=1.0, eps_decay=1e-6, grad_clip=1.0): self.state_size = state_size self.action_size = action_size self.train = train self.bs = batch_size self.gamma = gamma self.tau = tau self.grad_clip = grad_clip self.update_freq = update_freq self.nb_updates = nb_updates self.eps = eps self.eps_decay = eps_decay if device is None: if torch.cuda.is_available(): device = 'cuda:0' else: device = 'cpu' self.device = torch.device(device) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size).to(self.device) if self.train: self.actor_target = Actor(state_size, action_size).to(self.device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size).to(self.device) if self.train: self.critic_target = Critic(state_size, action_size).to(self.device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr, weight_decay=0.) # Noise process self.noise = OUNoise(action_size, noise_mean, noise_theta, noise_sigma) # Replay memory self.memory = ReplayBuffer(action_size, int(buffer_size), batch_size, self.device) def step(self, state, action, reward, next_state, done, timestep): """Save experience in replay memory, and use random sample from buffer to learn.""" if not self.train: raise ValueError('agent cannot be trained if constructor argument train=False') # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.bs and timestep % self.update_freq == 0: for _ in range(self.nb_updates): experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, add_noise=True): """Resolves action for given state as per current policy. Args: state (numpy.ndarray): current state representation add_noise (bool, optional): should noise be add to action value Returns: numpy.ndarray: clipped action value """ state = torch.from_numpy(state).float().to(self.device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() if self.train: self.actor_local.train() if add_noise: action += self.eps * self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Args: experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ if not self.train: raise ValueError('agent cannot be trained if constructor argument train=False') states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions.to(dtype=torch.float32)) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # gradient clipping for critic if self.grad_clip > 0: torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), self.grad_clip) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) # --------------------- and update epsilon decay ----------------------- # if self.eps_decay > 0: self.eps -= self.eps_decay self.noise.reset() @staticmethod def soft_update(local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Args: local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(random_seed) self.eps = EPS_START self.eps_decay = 1 / (EPS_EP_END * LEARN_NUM ) # set decay rate based on epsilon end target self.timestep = 0 # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise((num_agents, action_size), random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done, agent_number): """Save experience in replay memory, and use random sample from buffer to learn.""" self.timestep += 1 # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory and at learning interval settings if len(self.memory) > BATCH_SIZE and self.timestep % LEARN_EVERY == 0: for _ in range(LEARN_NUM): experiences = self.memory.sample() self.learn(experiences, GAMMA, agent_number) def act(self, states, add_noise): """Returns actions for both agents as per current policy, given their respective states.""" states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): # get action for each agent and concatenate them for agent_num, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[agent_num, :] = action self.actor_local.train() # add noise to actions if add_noise: actions += self.eps * self.noise.sample() actions = np.clip(actions, -1, 1) return actions def reset(self): self.noise.reset() def learn(self, experiences, gamma, agent_number): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) # Construct next actions vector relative to the agent if agent_number == 0: actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1) else: actions_next = torch.cat((actions[:, :2], actions_next), dim=1) # Compute Q targets for current states (y_i) Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) # Construct action prediction vector relative to each agent if agent_number == 0: actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1) else: actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1) # Compute actor loss actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) # update noise decay parameter self.eps -= self.eps_decay self.eps = max(self.eps, EPS_FINAL) self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed, hidden_sizes_actor=[64, 64], hidden_sizes_critic=[128, 64, 32]): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed hidden_sizes_actor (list): list of neurons in each layer of the actor network hidden_sizes_critic (list): list of neurons in each layer of the critic network """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.hidden_sizes_actor = hidden_sizes_actor self.hidden_sizes_critic = hidden_sizes_critic # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed, hidden_sizes_actor).to(device) self.actor_target = Actor(state_size, action_size, random_seed, hidden_sizes_actor).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR, weight_decay=WEIGHT_DECAY_AC) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed, hidden_sizes_critic).to(device) self.critic_target = Critic(state_size, action_size, random_seed, hidden_sizes_critic).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY_CR) # Initialize target networks with same weights: self.soft_update(self.critic_local, self.critic_target, 1) self.soft_update(self.actor_local, self.actor_target, 1) # Add Ornstein-Uhlenbeck noise self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def show_actor_local(self): network = self.actor_local x = Variable(torch.randn(1, self.state_size)) y = network(x) return make_dot(y, params=dict(list(network.named_parameters()))) def show_actor_target(self): network = self.actor_target x = Variable(torch.randn(1, self.state_size)) y = network(x) return make_dot(y, params=dict(list(network.named_parameters()))) def show_critic_local(self): network = self.critic_local x1 = Variable(torch.randn(1, self.state_size)) x2 = Variable(torch.randn(1, self.action_size)) y = network(x1, x2) return make_dot(y, params=dict(list(network.named_parameters()))) def show_critic_target(self): network = self.critic_target x1 = Variable(torch.randn(1, self.state_size)) x2 = Variable(torch.randn(1, self.action_size)) y = network(x1, x2) return make_dot(y, params=dict(list(network.named_parameters()))) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for s, a, r, n_s, d in zip(state, action, reward, next_state, done): self.memory.add(s, a, r, n_s, d) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
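# The `Actor` and `Critic` models these agents instantiate live in a separate
# model file. A minimal sketch of the usual DDPG pair: a tanh-bounded actor and
# a critic that concatenates the action into its hidden layer. The layer sizes
# and the (state_size, action_size, seed) signature are assumptions chosen to
# match the simpler agents above:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Actor(nn.Module):
    """Deterministic policy: maps states to actions in [-1, 1]."""

    def __init__(self, state_size, action_size, seed, fc1_units=400, fc2_units=300):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))

class Critic(nn.Module):
    """Q-function: maps (state, action) pairs to scalar Q-values."""

    def __init__(self, state_size, action_size, seed, fc1_units=400, fc2_units=300):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units + action_size, fc2_units)
        self.fc3 = nn.Linear(fc2_units, 1)

    def forward(self, state, action):
        xs = F.relu(self.fc1(state))
        x = torch.cat((xs, action), dim=1)  # inject the action after the first layer
        x = F.relu(self.fc2(x))
        return self.fc3(x)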
class DDPGAgent:
    def __init__(self, state_size, action_size, random_seed,
                 lr_actor=LR_ACTOR, lr_critic=LR_CRITIC, weight_decay=WEIGHT_DECAY):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each agent's observation
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network); sees both agents' states and actions
        self.critic_local = Critic(state_size * 2, action_size * 2, random_seed).to(device)
        self.critic_target = Critic(state_size * 2, action_size * 2, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic, weight_decay=weight_decay)

        # Add a noise process to the agent
        self.noise = OUNoise(action_size, random_seed**2)

    def act(self, obs, add_noise=True):
        """Returns actions for the given observation as per the current policy."""
        obs = torch.from_numpy(np.expand_dims(obs, 0)).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(obs).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.squeeze(np.clip(action, -1, 1), axis=0)

    def target_act(self, obs):
        """Get target-network actions from the agent, for use in the MADDPG object."""
        obs = torch.from_numpy(np.expand_dims(obs, 0)).float().to(device)
        action = self.actor_target(obs)
        return action

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
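# `target_act` above is consumed by an MADDPG coordinator that owns one
# DDPGAgent per player and builds the joint state/action vectors for the
# shared critics. A hedged sketch of how such a coordinator might gather
# actions; the class and method bodies below are illustrative, not the
# original code:
class MADDPG:
    def __init__(self, state_size, action_size, random_seed, num_agents=2):
        # one DDPGAgent per player, each with its own seed
        self.agents = [DDPGAgent(state_size, action_size, random_seed + i)
                       for i in range(num_agents)]

    def act(self, all_obs, add_noise=True):
        # one action per agent, each computed from that agent's own observation
        return [agent.act(obs, add_noise) for agent, obs in zip(self.agents, all_obs)]

    def target_act(self, all_obs):
        # target-policy actions, used when forming the critics' TD targets
        return [agent.target_act(obs) for agent, obs in zip(self.agents, all_obs)]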
class DDPG():
    '''Deep deterministic policy gradient agent.'''

    def __init__(self, state_size, action_size, random_seed, gamma, lr_actor,
                 lr_critic, weight_decay, tau, buffer_size, batch_size,
                 update_rate, updates_per_step):
        '''Initialize a DDPG Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        '''
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Hyperparameters
        self.gamma = gamma
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.tau = tau
        self.update_rate = update_rate
        self.updates_per_step = updates_per_step

        # Instantiate Actor Networks
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)

        # Instantiate Critic Networks
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)

        # Instantiate Optimizers
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic, weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed)
        self.update_counter = 0

    def step(self, state, action, reward, next_state, done):
        '''Save experience in replay memory, and use random sample from buffer to learn.'''
        # Store experiences
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.memory.batch_size:
            # Update counter
            self.update_counter += 1
            if self.update_counter >= self.update_rate:
                for _ in range(self.updates_per_step):
                    experiences = self.memory.sample()
                    self.learn(experiences, self.gamma)
                self.update_counter = 0

    def act(self, state, add_noise=True):
        '''Returns actions for given state as per current policy.'''
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        '''Reset the noise process.'''
        self.noise.reset()

    def learn(self, experiences, gamma):
        '''Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        '''
        states, actions, rewards, next_states, dones = experiences

        # update critic ##########################################################
        # compute predicted Q values
        next_actions = self.actor_target(next_states)
        next_Q_targets = self.critic_target(next_states, next_actions)
        # compute Q targets for current states
        Q_targets = rewards + (gamma * next_Q_targets * (1 - dones))
        # compute loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # update weights
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # update actor ###########################################################
        # compute loss
        pred_actions = self.actor_local(states)
        actor_loss = -self.critic_local(states, pred_actions).mean()
        # update weights
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # update target networks
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        '''Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        '''
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
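# A typical episode loop that drives one of these agents (here the DDPG class
# above). The environment interface is an assumption: a Gym-style env whose
# reset/step handle the action shapes the agent produces:
def train(env, agent, n_episodes=2000, max_t=1000):
    scores = []
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()                  # reset the OU noise each episode
        score = 0
        for t in range(max_t):
            action = agent.act(state)  # noisy action from the current policy
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
    return scores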
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK. ZFilter n = {}".format(running_state.rs.n))

    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0
            state = running_state(state)

            for _ in range(10000):
                if args.render:
                    env.render()

                steps += 1
                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                mask = 0 if done else 1
                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state
                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()

        # Train the discriminator until it separates expert from learner well
        # enough, then freeze it and keep training only the policy
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim,
                                                    demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        # Save a checkpoint every 100 iterations
        if iter % 100 == 0:
            score_avg = int(score_avg)
            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar')
            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'z_filter_n': running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
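# `ZFilter` in the two GAIL mains is a running state normalizer: it tracks the
# mean and variance of every observation seen so far and returns
# (x - mean) / std, clipped. A minimal sketch of that behavior; the original
# keeps its statistics in a separate RunningStat object (the `.rs` whose n,
# mean and sum_square the checkpoints above save), which this sketch folds
# into the filter itself:
import numpy as np

class ZFilter:
    """Normalize observations online: y = clip((x - mean) / std, -clip, clip)."""

    def __init__(self, shape, clip=5.0):
        self.clip = clip
        self.n = 0
        self.mean = np.zeros(shape)
        self.sum_square = np.zeros(shape)  # running sum of squared deviations

    def __call__(self, x):
        # Welford's online update of the running mean and variance
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.sum_square += delta * (x - self.mean)
        std = np.sqrt(self.sum_square / max(self.n - 1, 1))
        return np.clip((x - self.mean) / (std + 1e-8), -self.clip, self.clip)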
class DDPGAgent: def __init__(self, config): self.config = config self.seed = config.seed # Actor Network (w/ Target Network) self.actor_local = Actor(config.action_size, config.state_size, config.actor_hidden_units, config.seed).to(device) self.actor_target = Actor(config.action_size, config.state_size, config.actor_hidden_units, config.seed).to(device) self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(), lr=config.actor_learning_rate) # Critic Network (w/ Target Network) self.critic_local = Critic(config.action_size, config.state_size, config.critic_hidden_units, config.seed).to(device) self.critic_target = Critic(config.action_size, config.state_size, config.critic_hidden_units, config.seed).to(device) self.critic_optimizer = torch.optim.Adam( self.critic_local.parameters(), lr=config.critic_learning_rate) # ----------------------- initialize target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, 1) self.soft_update(self.actor_local, self.actor_target, 1) self.noise = OUNoise(config.action_size, config.seed) if config.shared_replay_buffer: self.memory = config.memory else: self.memory = ReplayBuffer(config.action_size, config.buffer_size, config.batch_size, config.seed) def reset(self): self.noise.reset() def act(self, states): """Returns actions for given state as per current policy.""" states = torch.from_numpy(states).float().to(device) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(states).cpu().data.numpy() self.actor_local.train() actions += self.noise.sample() return np.clip(actions, -1, 1) def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.config.tau) self.soft_update(self.actor_local, self.actor_target, self.config.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
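These agents all draw minibatches from a ReplayBuffer constructed as ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed), which is never shown. A minimal sketch consistent with that usage (the field names and the uniform-sampling strategy are assumptions):

import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class ReplayBuffer:
    """Fixed-size buffer storing experience tuples (sketch)."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)  # old experiences are evicted FIFO
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Uniformly sample a batch and stack each field into a device tensor."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)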
class Agent(): def __init__(self, state_size, action_size, config, n_agents=1, seed=0): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action n_agents: number of agents it will control in the environment seed (int): random seed """ self.config = config self.state_size = state_size self.action_size = action_size self.seed = np.random.seed(seed) random.seed(seed) self.n_agents = n_agents # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, leak=config['LEAKINESS'], seed=seed).to(device) self.actor_target = Actor(state_size, action_size, leak=config['LEAKINESS'], seed=seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config['LR_ACTOR']) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, leak=config['LEAKINESS'], seed=seed).to(device) self.critic_target = Critic(state_size, action_size, leak=config['LEAKINESS'], seed=seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=config['LR_CRITIC']) # Noise process self.noise = OUNoise(action_size, seed) # Replay memory self.memory = ReplayBuffer(action_size, config['BUFFER_SIZE'], config['BATCH_SIZE'], seed) self.timesteps = 0 self.config = config def step(self, states, actions, rewards, next_states, dones): """ Given a batch of S,A,R,S' experiences, it saves them into the experience buffer, and occasionally samples from the experience buffer to perform training steps. """ self.timesteps += 1 for i in range(self.n_agents): self.memory.add(states[i], actions[i], rewards[i], next_states[i], dones[i]) if (len(self.memory) > self.config['BATCH_SIZE']) and (self.timesteps % 20 == 0): for _ in range(10): experiences = self.memory.sample() self.learn(experiences, self.config['GAMMA']) def act(self, states, add_noise=True): """ Given a list of states for each agent it returns the actions to be taken by each agent based on the current policy. Returns a numpy array of shape [n_agents, n_actions] NOTE: clips actions to be between -1, 1 Args: states: () one row of state for each agent [n_agents, n_actions] add_noise: (bool) add noise to the actions? """ states = torch.from_numpy(states).float().to(device) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(states).cpu().data.numpy() self.actor_local.train() if add_noise: actions += [self.noise.sample() for _ in range(self.n_agents)] return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences self.update_critic(states, actions, rewards, gamma, next_states, dones) self.update_actor(states) self.update_target_networks() def update_critic(self, states, actions, rewards, gamma, next_states, dones): actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() def update_actor(self, states): actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() def update_target_networks(self): self.soft_update(self.critic_local, self.critic_target, self.config['TAU']) self.soft_update(self.actor_local, self.actor_target, self.config['TAU']) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) @property def device(self): return device
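The config-driven Agent above indexes its hyperparameters as a plain dict; with the self.config['TAU'] fix applied, a config of the following shape is what it expects (all values below are illustrative, not tuned):

config = {
    'LEAKINESS': 0.01,       # slope of the leaky ReLU in Actor/Critic
    'LR_ACTOR': 1e-4,        # actor learning rate
    'LR_CRITIC': 3e-4,       # critic learning rate
    'BUFFER_SIZE': int(1e6),
    'BATCH_SIZE': 128,
    'GAMMA': 0.99,           # discount factor
    'TAU': 1e-3,             # soft-update interpolation factor
}

agent = Agent(state_size=33, action_size=4, config=config, n_agents=20, seed=0)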
class Agent(): """This agent will interact and learn from the Unity ML-Agents Tennis environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize the Agent. Parameters: state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network and its target network self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network and its target network self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = [OUNoise(action_size, random_seed) for i in range(2)] # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done, timestep): """Save the experiences in replay buffer, reuse these samples when learning.""" self.memory.add(state, action, reward, next_state, done) # Start learning when enough samples are present in memory if timestep % 2 == 0 and len(self.memory) > BATCH_SIZE: # sample ten times from memory for _ in range(10): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Get actions for state following current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() # Add per-agent noise in place to explore off-policy (iterating over the row's scalars would not modify the array) if add_noise: for i in range(2): action[i] += self.noise[i].sample() #Clip for training stability return np.clip(action, -1, 1) def reset(self): """ Reset the noise process """ for i in range(2): self.noise[i].reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Parameters: experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ############# Update critic network ##################################### # Get next-state actions and Q values from target network actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() ################# update actor ############################################# # Compute actor's loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize actor's loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() ############### update targets ############################################## self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Parameters local_model: MLP from which weights will be copied target_model: MLP to which weights will be copied to tau: interpolation parameter when copying the weights """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
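For context, a generic episode loop that exercises the reset/act/step API these agents share (env is a hypothetical gym-style environment; all names here are illustrative only):

def train(agent, env, n_episodes=2000, max_t=1000):
    """Generic DDPG training loop sketch for the Agent API above."""
    scores = []
    for episode in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()                      # reset the OU noise process each episode
        score = 0.0
        for t in range(max_t):
            action = agent.act(state)     # noisy action from the local actor
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done, t)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
    return scores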
class Agent: def __init__(self, num_agents, state_size, action_size, random_seed, gamma=0.99, tau=1e-3, lr_actor=1e-4, lr_critic=3e-4, weight_decay=1e-4, fc1_a=32, fc2_a=32, fc1_c=32, fc2_c=32, buffer_size=int(1e5), batch_size=64, update_every=4, sigma=0.2): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action """ # random.seed(random_seed) self.num_agents = num_agents self.state_size = state_size self.action_size = action_size self.gamma = gamma self.tau = tau self.batch_size = batch_size self.update_every = update_every # Actor Network self.actor_local = Actor(state_size, action_size, random_seed, fc1_units=fc1_a, fc2_units=fc2_a).to(device) self.actor_target = Actor(state_size, action_size, random_seed, fc1_units=fc1_a, fc2_units=fc2_a).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # Critic Network self.critic_local = Critic(state_size, action_size, random_seed, fcs1_units=fc1_c, fc2_units=fc2_c).to(device) self.critic_target = Critic(state_size, action_size, random_seed, fcs1_units=fc1_c, fc2_units=fc2_c).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=weight_decay) # Replay memory self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed) # Noise process self.noise = OUNoise((num_agents, action_size), random_seed, sigma=sigma) # Initialize time step (for updating every update_every steps) self.t_step = 0 # print networks info print(self.actor_local) summary(self.actor_local, input_size=(state_size, )) print(self.critic_local) summary(self.critic_local, input_size=[(state_size, ), (action_size, )]) def reset(self): self.noise.reset() def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() # add noise to actions if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def step(self, state, action, reward, next_state, done): # Save experience / reward for i in range(self.num_agents): self.memory.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i]) self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) def learn(self, experiences): states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() #torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() 
actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(object): def __init__(self, nb_states, nb_actions, args): if args.seed > 0: self.seed(args.seed) self.nb_states = nb_states self.nb_actions = nb_actions # Create Actor and Critic Network self.actor = Actor(self.nb_states, self.nb_actions, args.init_w) self.actor_target = Actor(self.nb_states, self.nb_actions, args.init_w) self.critic = Critic(self.nb_states, self.nb_actions, args.init_w) self.critic_target = Critic(self.nb_states, self.nb_actions, args.init_w) self.reward_predictor = Critic(self.nb_states, self.nb_actions, args.init_w) hard_update(self.actor_target, self.actor) # make sure targets start with the same weights hard_update(self.critic_target, self.critic) # Create exploration noise process self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma) # Hyper-parameters self.batch_size = args.bsize self.trajectory_length = args.trajectory_length self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon self.epsilon = 1.0 self.is_training = True # if USE_CUDA: self.cuda() def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def random_action(self): action = np.random.uniform(-1., 1., self.nb_actions) return action def select_action(self, state, noise_enable=True, decay_epsilon=True): action, _ = self.actor(to_tensor(np.array([state]))) action = to_numpy(action).squeeze(0) if noise_enable: action += self.is_training * max(self.epsilon, 0) * self.random_process.sample() action = np.clip(action, -1., 1.) if decay_epsilon: self.epsilon -= self.depsilon return action def reset_lstm_hidden_state(self, done=True): self.actor.reset_lstm_hidden_state(done) def reset(self): self.random_process.reset_states() def cuda(self): self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() self.reward_predictor.cuda() def load_weights(self, output): if output is None: return False self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output))) self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output))) return True def save_model(self, output): torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output)) torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))
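The recurrent agent above relies on hard_update, to_tensor, and to_numpy helpers that are not defined in this file; plausible sketches consistent with how they are called (the requires_grad handling is an assumption):

import numpy as np
import torch

USE_CUDA = torch.cuda.is_available()


def hard_update(target, source):
    """Copy source parameters into target verbatim (a tau = 1 soft update)."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def to_tensor(ndarray, requires_grad=False):
    """Wrap a numpy array as a float tensor on the active device."""
    t = torch.from_numpy(ndarray).float()
    if USE_CUDA:
        t = t.cuda()
    return t.requires_grad_(requires_grad)


def to_numpy(tensor):
    """Detach a tensor and move it back to a numpy array on the CPU."""
    return tensor.detach().cpu().numpy()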
class DDPGAgent(): """Deep Deterministic Policy Gradient Agent""" def __init__(self, state_size, action_size, random_seed, buffer_size, batch_size, gamma, tau, lr_actor, lr_critic, weight_decay, update_every, update_times): self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.batch_size = batch_size self.gamma = gamma self.tau = tau self.update_every = update_every self.update_times = update_times # initialize Actor Network self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # initialize Critic Network self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=weight_decay) self.noise = OUNoise(action_size, random_seed) self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed, device) self.step_count = 0 def step(self, state, action, reward, next_state, done): self.memory.add(state, action, reward, next_state, done) self.step_count += 1 self.step_count %= self.update_every if len(self.memory) > self.batch_size and self.step_count == 0: for _ in range(self.update_times): experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, add_noise=True): state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): # --- update critic --- states, actions, rewards, next_states, dones = experiences actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) self.critic_optimizer.zero_grad() critic_loss.backward() nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # --- update actor --- actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # update target networks self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_model, target_model, tau): target_params = target_model.parameters() local_params = local_model.parameters() for target, local in zip(target_params, local_params): target.data.copy_(tau * local.data + (1.0 - tau) * target.data) def save(self, actor_local_path, actor_target_path, critic_local_path, critic_target_path): torch.save(self.actor_local.state_dict(), actor_local_path) torch.save(self.actor_target.state_dict(), actor_target_path) torch.save(self.critic_local.state_dict(), critic_local_path) torch.save(self.critic_target.state_dict(), critic_target_path) def load(self, actor_local_path, actor_target_path, critic_local_path, critic_target_path): self.actor_local.load_state_dict(torch.load(actor_local_path)) self.actor_target.load_state_dict(torch.load(actor_target_path)) 
self.critic_local.load_state_dict(torch.load(critic_local_path)) self.critic_target.load_state_dict(torch.load(critic_target_path))
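A short usage sketch of the save/load round trip defined above (file paths and hyperparameter values are illustrative):

agent = DDPGAgent(state_size=24, action_size=2, random_seed=0,
                  buffer_size=int(1e5), batch_size=128, gamma=0.99, tau=1e-3,
                  lr_actor=1e-4, lr_critic=1e-3, weight_decay=0.0,
                  update_every=20, update_times=10)

# ... train ...

agent.save('actor_local.pth', 'actor_target.pth',
           'critic_local.pth', 'critic_target.pth')

# Later, restore all four networks before resuming training or evaluating:
agent.load('actor_local.pth', 'actor_target.pth',
           'critic_local.pth', 'critic_target.pth')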
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed, num_agents): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed num_agents (int): number of agents """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.num_agents = num_agents # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size*self.num_agents, action_size*self.num_agents, random_seed).to(device) self.critic_target = Critic(state_size*self.num_agents, action_size*self.num_agents, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed, sigma=0.1) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, time, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: if time % UPDATE_EVERY_TIMESTAPES == 0: for i in range(UPDATE_TIMES): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences full_state = states.view(BATCH_SIZE, -1) next_full_state = next_states.view(BATCH_SIZE, -1) actions = actions.view(BATCH_SIZE, -1) with torch.no_grad(): actions_next = [self.actor_target(next_states[:, i, :]) for i in range(self.num_agents)] actions_next = torch.cat(actions_next, dim=-1) # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models #actions_next = self.actor_target(next_states) #Pi approximator Q_targets_next = self.critic_target(next_full_state.to(device), actions_next.to(device)) #Q-value approximator # Compute Q targets for current states (y_i) rewards = rewards.sum(dim=-1, keepdim=True) #merge reward for all agents workaround dones = dones.max(dim=-1, keepdim=True)[0] #merge dones for all agents workaround Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(full_state, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = [self.actor_local(states[:, i, :]) for i in range(self.num_agents)] actions_pred = torch.cat(actions_pred, dim=1) actor_loss = -self.critic_local(full_state, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class Agent(): def __init__(self, state_size, action_size, buffer_size, batch_size, num_agents, seed, gamma, tau, lr_actor, lr_critic, weight_decay, update_every, num_updates): ''' ---------------------------------- Parameters state_size: # of states action_size: # of actions buffer_size: size of the memory buffer batch_size: sample minibatch size num_agents: # of agents seed: seed for random gamma: discount rate for future rewards tau: interpolation factor for soft update of target network lr_actor: learning rate of Actor lr_critic: learning rate of Critic weight_decay: L2 weight decay update_every: number of time steps between learning updates num_updates: number of updates to the network per learning step ---------------------------------- ''' self.action_size = action_size self.state_size = state_size self.buffer_size = buffer_size self.batch_size = batch_size self.num_agents = num_agents self.gamma = gamma self.tau = tau self.lr_actor = lr_actor self.lr_critic = lr_critic self.weight_decay = weight_decay self.update_every = update_every self.num_updates = num_updates self.t_step = 0 self.seed = random.seed(seed) # Actor network agent self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # Critic network agent self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=weight_decay) # Noise parameter self.noise = OUNoise((num_agents, action_size), seed) # Experience replay memory self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed) def step(self, state, action, reward, next_state, done): ''' Agent takes the next step - save most recent environment event to ReplayBuffer for each agent - sample from memory and update the policy and value networks num_updates times every update_every time steps ''' for s, a, r, ns, d in zip(state, action, reward, next_state, done): self.memory.add(s, a, r, ns, d) self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: if len(self.memory) > self.batch_size: for _ in range(self.num_updates): experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, add_noise=True): ''' Agent selects action based on current state and selected policy ''' state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): ''' Agent updates policy and value parameters based on experiences (state, action, reward, next_state, done) Q_targets = r + gamma * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value ''' states, actions, rewards, next_states, dones = experiences #--------- update critic -----------------------# # get current Q Q_expected = self.critic_local(states, actions) # get next action next_actions = self.actor_target(next_states) # get Qsa_next Q_targets_next = self.critic_target(next_states, next_actions) # calculate target with reward and Qsa_next Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # calculate loss critic_loss = F.mse_loss(Q_expected, Q_targets) # minimize loss 
self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() #--------- update actor ------------------------# # compute actor loss pred_actions = self.actor_local(states) actor_loss = -self.critic_local(states, pred_actions).mean() # minimize loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() #---------- update target networks -------------# # update target network parameters self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_model, target_model, tau): ''' Update target network weights gradually with an interpolation rate of TAU ''' for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DDPG_Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, idx, random_seed=0): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.idx = idx # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) def act(self, state, add_noise=True, nu=1.0): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += nu * self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, actions_next, actions_pred, freq): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(next_state) -> action critic_target(next_state, next_action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples next_actions (list): next actions computed from each agent actions_pred (list): prediction for actions for current states from each agent """ states, actions, rewards, next_states, dones = experiences idxt = torch.tensor([self.idx - 1]).to(device) # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target model Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards.index_select( 1, idxt) + (GAMMA * Q_targets_next * (1 - dones.index_select(1, idxt))) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, **kwargs): """Initialize an Agent object. Params ====== """ self.agent_mh_size = kwargs['agent_mh_size'] self.agent_inventory_size = kwargs['agent_inventory_size'] self.world_state_size = kwargs['world_state_size'] self.action_size = kwargs['action_size'] self.seed = kwargs['random_seed'] self.iter = 0 self.noise_scale = 1.0 # Actor Network (w/ Target Network) self.actor_local = Actor(self.agent_mh_size, self.agent_inventory_size, self.world_state_size, self.action_size, self.seed).to(device) self.actor_target = Actor(self.agent_mh_size, self.agent_inventory_size, self.world_state_size, self.action_size, self.seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) #self.actor_optimizer = optim.Adam(self.actor_local.parameters()) self.actor_scheduler = optim.lr_scheduler.StepLR(self.actor_optimizer, step_size=200, gamma=0.99) # Critic Network (w/ Target Network) self.critic_local = Critic(self.agent_mh_size, self.agent_inventory_size, self.world_state_size, self.action_size, self.seed).to(device) self.critic_target = Critic(self.agent_mh_size, self.agent_inventory_size, self.world_state_size, self.action_size, self.seed).to(device) params = list(self.critic_local.parameters()) + list(self.actor_local.parameters()) #self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) self.critic_optimizer = optim.Adam(params, lr=LR_CRITIC) self.critic_scheduler = optim.lr_scheduler.StepLR(self.critic_optimizer, step_size=200, gamma=0.99) self.hard_copy_weights(self.actor_target, self.actor_local) self.hard_copy_weights(self.critic_target, self.critic_local) # Noise process self.noise = OUNoise(self.action_size, self.seed) # Replay memory #self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, BATCH_SIZE, self.seed) # Prioritized replay memory self.memory = NaivePrioritizedBuffer(BUFFER_SIZE, BATCH_SIZE, self.seed) if 'actor_chkpt_file' in kwargs and 'critic_chkpt_file' in kwargs: checkpoint_actor = torch.load(kwargs['actor_chkpt_file']) checkpoint_critic = torch.load(kwargs['critic_chkpt_file']) self.actor_local.load_state_dict(checkpoint_actor) self.critic_local.load_state_dict(checkpoint_critic) checkpoint_actor_t = torch.load(kwargs['actor_chkpt_file_t']) checkpoint_critic_t = torch.load(kwargs['critic_chkpt_file_t']) self.actor_target.load_state_dict(checkpoint_actor_t) self.critic_target.load_state_dict(checkpoint_critic_t) def flatten_action(self, action): action_flat = [] for x in action: if type(x) is list: for y in x: action_flat.append(y) else: action_flat.append(x) return action_flat def get_states(self, mainhand, inventory, pov): agent_state_mainhand = [] agent_state_mainhand.append(mainhand['damage']) agent_state_mainhand.append(mainhand['maxDamage']) agent_state_mainhand.append(equipments.get(mainhand['type'], -1)) agent_state_inventory = [] agent_state_inventory.append(inventory['coal']) agent_state_inventory.append(inventory['cobblestone']) agent_state_inventory.append(inventory['crafting_table']) agent_state_inventory.append(inventory['dirt']) agent_state_inventory.append(inventory['furnace']) agent_state_inventory.append(inventory['iron_axe']) agent_state_inventory.append(inventory['iron_ingot']) agent_state_inventory.append(inventory['iron_ore']) agent_state_inventory.append(inventory['iron_pickaxe']) agent_state_inventory.append(inventory['log']) agent_state_inventory.append(inventory['planks']) 
agent_state_inventory.append(inventory['stick']) agent_state_inventory.append(inventory['stone']) agent_state_inventory.append(inventory['stone_axe']) agent_state_inventory.append(inventory['stone_pickaxe']) agent_state_inventory.append(inventory['torch']) agent_state_inventory.append(inventory['wooden_axe']) agent_state_inventory.append(inventory['wooden_pickaxe']) agent_state_mainhand = np.array(agent_state_mainhand) agent_state_inventory = np.array(agent_state_inventory) world_state_a = np.array(pov) world_state_b = np.swapaxes(world_state_a,0,2) return agent_state_mainhand, agent_state_inventory, world_state_b def hard_copy_weights(self, target, source): """ copy weights from source to target network (part of initialization)""" for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data) def step(self, mainhand, inventory, pov, action, reward, mainhand_n, inventory_n, pov_n, done): """Save experience in replay memory, and use random sample from buffer to learn.""" agent_state_mainhand, agent_state_inventory, world_state = self.get_states(mainhand, inventory, pov) agent_state_mainhand_n, agent_state_inventory_n, world_state_n = self.get_states(mainhand_n, inventory_n, pov_n) self.memory.add(agent_state_mainhand, agent_state_inventory, world_state, action, reward, agent_state_mainhand_n, agent_state_inventory_n, world_state_n, done) # Learn, if enough samples are available in memory self.iter = self.iter+1 self.iter = self.iter%1 if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) experiences = self.memory.sample() self.learn(experiences, GAMMA) #self.actor_scheduler.step() #self.critic_scheduler.step() def learn_from_players(self, experiences, mh_ts, invent_ts, writer, loss_list): """Save experience in replay memory, and use random sample from buffer to learn.""" #print(experiences) e = experiences self.memory.add(e[0], e[1], e[2], e[3], e[4], e[5], e[6], e[7], e[8]) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() #(states, states_2, actions, rewards, next_states, next_states_2, dones) = experiences self.iter = self.iter+1 loss_1, loss_2 = self.learn_2(experiences, GAMMA, writer) loss_list.append((loss_1, loss_2)) self.iter = self.iter+1 experiences = self.memory.sample() loss_1, loss_2 = self.learn_2(experiences, GAMMA, writer) loss_list.append((loss_1, loss_2)) #self.actor_scheduler.step() #self.critic_scheduler.step() def act(self, mainhand, inventory, pov, add_noise=True, noise_scale=1.0): """Returns actions for given state as per current policy.""" agent_state_mainhand, agent_state_inventory, world_state = self.get_states(mainhand, inventory, pov) s1 = torch.from_numpy(agent_state_mainhand).float().unsqueeze(dim=0).to(device) s3 = torch.from_numpy(agent_state_inventory).float().unsqueeze(dim=0).to(device) s2 = torch.from_numpy(world_state).float().unsqueeze(dim=0).to(device) self.actor_local.eval() with torch.no_grad(): action, action_raw ,_, _ , _ , _ , _ , _, _, _= self.actor_local(s1,s2,s3) self.actor_local.train() return action, action_raw, agent_state_mainhand, agent_state_inventory def reset(self): self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) def get_action_loss(self, writer, gt, onehot_probs, mh_state_loss, inventory_state_loss, \ world_state_loss, q_diff_loss=None, q_value_loss=None): # guard against the default None before detaching; 14 is the scaling constant from the original code if q_value_loss is not None: q_value_loss = q_value_loss.detach()/14 attack_loss = F.binary_cross_entropy_with_logits(onehot_probs[:,0], gt[:,0]) back_loss = F.binary_cross_entropy_with_logits(onehot_probs[:,1], gt[:,1]) pitch_loss = F.mse_loss(onehot_probs[:,2], gt[:,2]) yaw_loss = F.mse_loss(onehot_probs[:,3], gt[:,3]) craft_loss = F.cross_entropy(onehot_probs[:,4:9], gt[:,4].long()) equip_loss = F.cross_entropy(onehot_probs[:,9:17], gt[:,5].long()) forward_loss = F.binary_cross_entropy_with_logits(onehot_probs[:,17], gt[:,6]) jump_loss = F.binary_cross_entropy_with_logits(onehot_probs[:,18], gt[:,7]) left_loss = F.binary_cross_entropy_with_logits(onehot_probs[:,19], gt[:,8]) nearby_craft_loss = F.cross_entropy(onehot_probs[:,20:28], gt[:,9].long()) nearby_smelt_loss = F.cross_entropy(onehot_probs[:,28:31], gt[:,10].long()) place_loss = F.cross_entropy(onehot_probs[:,31:38], gt[:,11].long()) right_loss = F.binary_cross_entropy_with_logits(onehot_probs[:,38], gt[:,12]) sneak_loss = F.binary_cross_entropy_with_logits(onehot_probs[:,39], gt[:,13]) sprint_loss = F.binary_cross_entropy_with_logits(onehot_probs[:,40], gt[:,14]) writer.add_scalars('Losses', {"attack":attack_loss, "back":back_loss, \ "craft":craft_loss, "equip":equip_loss, "forward":forward_loss, \ "jump":jump_loss, "left":left_loss, "nearbyCraft":nearby_craft_loss, \ "nearbySmelt":nearby_smelt_loss, "place":place_loss, "right":right_loss, \ "sneak":sneak_loss, "sprint":sprint_loss}, global_step=self.iter) writer.add_scalars('Camera Losses', {"pitch":pitch_loss, "yaw":yaw_loss}, global_step=self.iter) writer.add_scalars('State Prediction Losses', {"MainHand":mh_state_loss, "Inventory":inventory_state_loss, "World":world_state_loss}, global_step=self.iter) self.actor_optimizer.zero_grad() self.critic_optimizer.zero_grad() if q_value_loss is None and q_diff_loss is None: torch.autograd.backward([attack_loss,back_loss,pitch_loss,yaw_loss,craft_loss,equip_loss,\ forward_loss,jump_loss,left_loss,nearby_craft_loss,nearby_smelt_loss,place_loss, \ right_loss,sneak_loss,sprint_loss,mh_state_loss,inventory_state_loss, \ world_state_loss]) else: writer.add_scalars('Q Values', {"Q Value":q_value_loss, "Q Difference":q_diff_loss}, global_step=self.iter) torch.autograd.backward([attack_loss,back_loss,pitch_loss,yaw_loss,craft_loss,equip_loss,\ forward_loss,jump_loss,left_loss,nearby_craft_loss,nearby_smelt_loss,place_loss, \ right_loss,sneak_loss,sprint_loss, q_diff_loss]) # torch.autograd.backward([attack_loss,back_loss,pitch_loss,yaw_loss,craft_loss,equip_loss,\ # forward_loss,jump_loss,left_loss,nearby_craft_loss,nearby_smelt_loss,place_loss, \ # right_loss,sneak_loss,sprint_loss,mh_state_loss,inventory_state_loss, \ # world_state_loss, q_diff_loss]) torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1) torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.actor_optimizer.step() self.critic_optimizer.step() 
return pitch_loss, yaw_loss def learn_1(self, experiences, gamma): ( a_states_mh, a_states_invent, w_states, actions, rewards, a_next_states_mh, a_next_states_invent, w_next_states, dones ) = experiences a_states_mh = a_states_mh.to(device) a_states_invent = a_states_invent.to(device) w_states = w_states.to(device) a_next_states_mh = a_next_states_mh.to(device) a_next_states_invent = a_next_states_invent.to(device) w_next_states = w_next_states.to(device) # predict next actions and next state with actor with torch.no_grad(): _ , _ , _ , Q_next , _ , _ , _ = self.actor_local(a_next_states_mh, w_next_states, a_next_states_invent) Q_current_2 = rewards + (gamma * Q_next * (1 - dones)) #get next state (from experiences) descriptors with torch.no_grad(): n_wsd = self.actor_local.get_wsd(w_next_states) n_asmhd = self.actor_local.get_asmhd(a_next_states_mh) n_asinventd = self.actor_local.get_asinventoryd(a_next_states_invent) # predict actions and next state with actor actions_pred, actions_pred_raw, action_logits, Q_current, n_wsd_predict, n_asmhd_predict, n_asinventd_predict = \ self.actor_local(a_states_mh, w_states, a_states_invent) # calculate loss for actor loss_1, loss_2 = self.get_action_loss(actions, action_logits, \ F.mse_loss(n_asmhd, n_asmhd_predict), F.mse_loss(n_asinventd, n_asinventd_predict), \ F.mse_loss(n_wsd, n_wsd_predict), F.mse_loss(Q_current, Q_current_2.detach())) print("Actor Losses:{} {}".format(loss_1.item(), loss_2.item())) return loss_1, loss_2 def learn_2(self, experiences, gamma, writer): #states, actions, rewards, next_states, dones, indices, weights = experiences ( a_states_mh, a_states_invent, w_states, actions, rewards, a_next_states_mh, a_next_states_invent, w_next_states, dones ) = experiences a_states_mh = a_states_mh.to(device) a_states_invent = a_states_invent.to(device) w_states = w_states.to(device) a_next_states_mh = a_next_states_mh.to(device) a_next_states_invent = a_next_states_invent.to(device) w_next_states = w_next_states.to(device) #get next state (from experiences) descriptors and Q_next with torch.no_grad(): _, _, _, Q_next, _, _, _, wsd_next, mhd_next, inventd_next = \ self.actor_local(a_next_states_mh, w_next_states, a_next_states_invent) Q_next = Q_next.detach() Q_current_2 = rewards + (gamma * Q_next * (1 - dones)) wsd_next = wsd_next.detach() mhd_next = mhd_next.detach() inventd_next = inventd_next.detach() # predict actions and next state with the actor _, action_raw, action_logits, Q_current, n_wsd_predict, n_asmhd_predict, n_asinventd_predict, _, _, _ = \ self.actor_local(a_states_mh, w_states, a_states_invent) # calculate loss for actor loss_1, loss_2 = self.get_action_loss(writer, actions, action_logits, \ F.mse_loss(mhd_next, n_asmhd_predict), F.mse_loss(inventd_next, n_asinventd_predict), \ F.mse_loss(wsd_next, n_wsd_predict), F.mse_loss(Q_current, Q_current_2), -Q_current.mean()) print("Actor Losses:{} {}".format(loss_1.item(), loss_2.item())) return loss_1, loss_2 def learn_3(self, experiences, gamma): #states, actions, rewards, next_states, dones, indices, weights = experiences ( a_states_mh, a_states_invent, w_states, actions, rewards, a_next_states_mh, a_next_states_invent, w_next_states, dones ) = experiences a_states_mh = a_states_mh.to(device) a_states_invent = a_states_invent.to(device) w_states = w_states.to(device) a_next_states_mh = a_next_states_mh.to(device) 
a_next_states_invent = a_next_states_invent.to(device) w_next_states = w_next_states.to(device) # predict actions _ , actions_pred_raw, action_logits, _ , _ , _ , _ = \ self.actor_local(a_states_mh, w_states, a_states_invent) #get next state (from experiences) descriptors with torch.no_grad(): n_wsd = self.critic_local.get_wsd(w_next_states) n_asmhd = self.critic_local.get_asmhd(a_next_states_mh) n_asinventd = self.critic_local.get_asinventoryd(a_next_states_invent) # Compute Q value of current state (from experiences) Q_current, n_wsd_predict, n_asmhd_predict, n_asinventd_predict = self.critic_local(a_states_mh, a_states_invent, w_states, actions) # calculate loss for actor/critic loss_1, _ = self.get_action_loss(actions, action_logits, \ F.mse_loss(n_asmhd, n_asmhd_predict), F.mse_loss(n_asinventd, n_asinventd_predict), \ F.mse_loss(n_wsd, n_wsd_predict)) # Compute Q value of next state (next state from experiences and the rest is predicted with actor and critic) # predict action in the next state actions_next, actions_next_raw, action_logits, _ , _ , _ , _ = self.actor_local(a_next_states_mh, w_next_states, a_next_states_invent) # predict Q value in the next state Q_next, _ , _ , _ = self.critic_local(a_next_states_mh, a_next_states_invent, w_next_states, actions_next_raw) # Alternative Q value through Bellman equations Q_current_2 = rewards + (gamma * Q_next * (1 - dones)) # Compute critic loss (detach the Bellman target, not the prediction, so the gradient reaches the critic) critic_loss = F.mse_loss(Q_current, Q_current_2.detach()) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1) self.critic_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) print("Actor Losses:{} {}".format(loss_1.item(), critic_loss.item())) return loss_1, critic_loss
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, num_agents, random_seed).to(device) self.critic_target = Critic(state_size, action_size, num_agents, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) def step(self, memory): """Save experience in replay memory, and use random sample from buffer to learn.""" # Learn, if enough samples are available in memory if len(memory) > BATCH_SIZE: experiences = memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences t_next_states = torch.cat(next_states, dim=1).to(device) t_states = torch.cat(states, dim=1).to(device) t_actions = torch.cat(actions, dim=1).to(device) # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = [self.actor_target(next_state) for next_state in next_states] actions_next = torch.cat(actions_next, dim=1).to(device) Q_targets_next = self.critic_target(t_next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(t_states, t_actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = [self.actor_local(state) for state in states] actions_pred = torch.cat(actions_pred, dim=1).to(device) actor_loss = -self.critic_local(t_states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
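Every agent in this section draws exploration noise from an OUNoise process whose definition is not included here. A minimal sketch of a typical implementation, assuming the common mu=0.0, theta=0.15, sigma=0.2 defaults; the classes actually referenced above may differ in detail.

import copy
import random
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state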
class Agent(): """Agent which interacts and learns from environment""" def __init__(self, state_size, action_size, random_seed=0): """Initialize an Agent Params ======= state_size (int): dimensions of each state action_size (int): dimensions of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.epsilon = EPSILON # Actor Network self.actor_local = Actor(state_size, action_size, random_seed, leakiness=LEAK_FACTOR).to(device) self.actor_target = Actor(state_size, action_size, random_seed, leakiness=LEAK_FACTOR).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network self.critic_local = Critic(state_size, action_size, random_seed, leakiness=LEAK_FACTOR).to(device) self.critic_target = Critic(state_size, action_size, random_seed, leakiness=LEAK_FACTOR).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def add_to_memory(self, states, actions, rewards, next_states, dones): """Save experience in replay memory""" self.memory.add(states, actions, rewards, next_states, dones) def learn_from_memory(self, timestep): """Sample experience tuples from the replay memory every LEARN_EVERY timesteps""" if len(self.memory) > BATCH_SIZE and timestep % LEARN_EVERY == 0: for _ in range(LEARN_NUM): experiences = self.memory.sample() self.learn(experiences, GAMMA) def step(self, states, actions, rewards, next_states, dones, timestep): """Save experience in replay memory, and use random sample from buffer to learn.""" self.add_to_memory(states, actions, rewards, next_states, dones) self.learn_from_memory(timestep) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.epsilon * self.noise.sample() return np.clip(action, -1, 1) def reset(self): """resets the current noise value""" self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters given batch of experience tuples Q_targets = r + gamma * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state) -> Q_value Params ======= experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences #----------------update critic-------------------------------# # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q_targets for current state Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize loss self.critic_optimizer.zero_grad() critic_loss.backward() # Clipping gradients if GRAD_CLIPPING > 0: torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), GRAD_CLIPPING) self.critic_optimizer.step() #--------------update actor-------------------------------# # Compute actor loss actions_pred = self.actor_local(states) actor_loss = 
-self.critic_local(states, actions_pred).mean() # Minimize loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() #------------update target networks---------------------# self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) #-----------update epsilon decay------------------------# self.epsilon *= EPSILON_DECAY self.noise.reset() def soft_update(self, local_model, target_model, tau): """ Soft updating target model's parameters theta_target = tau*theta_local + (1-tau)*theta_target Params ======= local_model: Pytorch model target_model: Pytorch model tau: interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)
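The ReplayBuffer these agents sample from is likewise not defined in this section. A minimal deque-based sketch that matches the (state, action, reward, next_state, done) interface used above; the stacking and dtype handling are assumptions.

import random
from collections import deque, namedtuple
import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class ReplayBuffer:
    """Fixed-size buffer storing experience tuples for uniform sampling."""
    def __init__(self, buffer_size, batch_size, seed):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append one experience tuple to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Uniformly sample a batch and stack each field into a float tensor."""
        batch = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in batch])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)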
class DDPGAgent: def __init__(self, total_agents, state_size, action_size, seed): self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu' #self.device = 'cpu' self.total_agents = total_agents self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.actor_local = Actor(self.state_size, self.action_size, seed).to(self.device) self.actor_target = Actor(self.state_size, self.action_size, seed).to(self.device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) self.critic_local = Critic(self.state_size, self.action_size, seed).to(self.device) self.critic_target = Critic(self.state_size, self.action_size, seed).to(self.device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) #self.noise = OrnsteinUhlenbeckNoise(action_size, seed) self.noise = OrnsteinUhlenbeckProcess((self.total_agents, action_size), std=LinearSchedule(0.2)) self.replay_buffer = UniformReplayBuffer( BUFFER_SIZE, BATCH_SIZE * self.total_agents, seed, self.device) #self.replay_buffer = PrioritizedReplay(BUFFER_SIZE, self.device) print('Device used: {}'.format(self.device)) print('Actor Local DDPG ->', self.actor_local) print('Actor Target DDPG ->', self.actor_target) print('Critic Local DDPG ->', self.critic_local) print('Critic Target DDPG ->', self.critic_target) def reset(self): self.noise.reset() def act(self, states, add_noise=False): states = torch.from_numpy(states).float().to(self.device) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(states).cpu().data.numpy() self.actor_local.train() return np.clip(actions + self.noise.sample(), -1, 1) if add_noise else actions def step(self, states, actions, rewards, next_states, dones): for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.replay_buffer.add(state, action, reward, next_state, done) #for _ in range(self.total_agents): TOO SLOW # guard against sampling before the buffer holds one full batch if len(self.replay_buffer) > BATCH_SIZE * self.total_agents: return self._learn(self.replay_buffer.sample(), GAMMA) return (None, None) def _learn(self, experiences, gamma): states, actions, rewards, next_states, dones = experiences # ---------- CRITIC UPDATE -------------------- next_actions = self.actor_target(next_states) next_rewards = self.critic_target(next_states, next_actions) target_rewards = rewards + gamma * next_rewards * (1 - dones) predicted_rewards = self.critic_local(states, actions) critic_loss = F.mse_loss(predicted_rewards, target_rewards) self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------- ACTOR UPDATE -------------------- predicted_actions = self.actor_local(states) actor_loss = -self.critic_local(states, predicted_actions).mean() #print('\rActor Loss: {:.6f} - Critic Loss: {:.6f}'.format(actor_loss, critic_loss), end='') self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() self._soft_update(self.critic_local, self.critic_target, TAU) self._soft_update(self.actor_local, self.actor_target, TAU) return critic_loss.cpu().data.numpy(), actor_loss.cpu().data.numpy() def _soft_update(self, local_model, target_model, tau): for local_parameter, target_parameter in zip( local_model.parameters(), target_model.parameters()): target_parameter.data.copy_((1.0 - tau) * target_parameter.data + tau * local_parameter.data)
class Agent(object): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed, hyperparameters): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed hyperparameters (dict): dictionary with hyperparameters """ # initialize the random generator to ensure reproducibility random.seed(random_seed) # Read hyperparameters from Config dict self.hyperparameters = hyperparameters self.state_size = state_size self.action_size = action_size self.step_counter = 0 self.epsilon = float(self.hyperparameters['EPSILON_START']) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam( self.actor_local.parameters(), lr=float(self.hyperparameters['LR_ACTOR'])) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam( self.critic_local.parameters(), lr=float(self.hyperparameters['LR_CRITIC']), weight_decay=float(self.hyperparameters['WEIGHT_DECAY'])) # Noise process self.noise = OUNoise(action_size) # Replay memory self.memory = ReplayBuffer(action_size, int(self.hyperparameters['BUFFER_SIZE']), int(self.hyperparameters['BATCH_SIZE'])) # Hard update so that weights of local and target are identical self.hard_update(self.actor_target, self.actor_local) self.hard_update(self.critic_target, self.critic_local) def step_add_to_memory(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for i in range(len(states)): self.memory.add(states[i], actions[i], rewards[i], next_states[i], dones[i]) self.step_counter += 1 # Learn, if enough samples are available in memory self.step_learn() def step_learn(self): if self.step_counter % int(self.hyperparameters['LEARN_EVERY']) == 0: if len(self.memory) > int(self.hyperparameters['BATCH_SIZE']): for _ in range(int(self.hyperparameters['LEARN_TIMES'])): experiences = self.memory.sample() self.learn(experiences, float(self.hyperparameters['GAMMA'])) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() scalar = False with torch.no_grad(): if state.dim() == 1: state.unsqueeze_(0) scalar = True action = self.actor_local(state).cpu().data.numpy() if scalar: action = np.squeeze(action) self.actor_local.train() if add_noise: action += self.epsilon * self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def update_epsilon(self): self.epsilon = max( self.epsilon * float(self.hyperparameters['EPSILON_DECAY']), float(self.hyperparameters['EPSILON_END'])) def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ indexes, states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # -----------------------------update td_error for ranking ------------- # deltas = list( torch.abs(Q_expected - Q_targets).cpu().detach().numpy().flatten()) for index, delta in zip(indexes, deltas): self.memory.td_error_update(index, delta) # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, float(self.hyperparameters['TAU'])) self.soft_update(self.actor_local, self.actor_target, float(self.hyperparameters['TAU'])) # ------------------------ update epsilon and noise -------------------- # self.update_epsilon() self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def hard_update(self, target, source): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data)
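The td_error_update calls above imply a prioritized replay buffer keyed on |Q_expected - Q_targets|. The buffer itself is not shown; as a rough illustration, proportional prioritization would turn those TD errors into sampling probabilities like this (the actual buffer may be rank-based instead):

import numpy as np

def sampling_probabilities(td_errors, alpha=0.6, eps=1e-5):
    """P(i) proportional to (|delta_i| + eps)**alpha, normalized over the buffer."""
    priorities = (np.abs(td_errors) + eps) ** alpha
    return priorities / priorities.sum()

print(sampling_probabilities(np.array([0.5, 0.1, 2.0])))  # larger |delta| -> sampled more often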
def main(): env = gym.make(args.env_name) env.seed(args.seed) torch.manual_seed(args.seed) num_inputs = env.observation_space.shape[0] num_actions = env.action_space.shape[0] running_state = ZFilter((num_inputs,), clip=5) print('state size:', num_inputs) print('action size:', num_actions) actor = Actor(num_inputs, num_actions, args) critic = Critic(num_inputs, args) actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate) critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, weight_decay=args.l2_rate) writer = SummaryWriter(comment="-ppo_iter-" + str(args.max_iter_num)) if args.load_model is not None: saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) ckpt = torch.load(saved_ckpt_path) actor.load_state_dict(ckpt['actor']) critic.load_state_dict(ckpt['critic']) running_state.rs.n = ckpt['z_filter_n'] running_state.rs.mean = ckpt['z_filter_m'] running_state.rs.sum_square = ckpt['z_filter_s'] print("Checkpoint loaded; ZFilter N = {}".format(running_state.rs.n)) episodes = 0 for iter in range(args.max_iter_num): actor.eval(), critic.eval() memory = deque() steps = 0 scores = [] while steps < args.total_sample_size: state = env.reset() score = 0 state = running_state(state) for _ in range(10000): if args.render: env.render() steps += 1 mu, std = actor(torch.Tensor(state).unsqueeze(0)) action = get_action(mu, std)[0] next_state, reward, done, _ = env.step(action) if done: mask = 0 else: mask = 1 memory.append([state, action, reward, mask]) next_state = running_state(next_state) state = next_state score += reward if done: break episodes += 1 scores.append(score) score_avg = np.mean(scores) print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg)) writer.add_scalar('log/score', float(score_avg), iter) actor.train(), critic.train() train_model(actor, critic, memory, actor_optim, critic_optim, args) if iter % 100 == 0: score_avg = int(score_avg) model_path = os.path.join(os.getcwd(),'save_model') if not os.path.isdir(model_path): os.makedirs(model_path) ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar') save_checkpoint({ 'actor': actor.state_dict(), 'critic': critic.state_dict(), 'z_filter_n':running_state.rs.n, 'z_filter_m': running_state.rs.mean, 'z_filter_s': running_state.rs.sum_square, 'args': args, 'score': score_avg }, filename=ckpt_path)
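main() above calls get_action(mu, std), which is not defined in this section. A plausible minimal version for a Gaussian policy head; the real helper may differ:

import torch

def get_action(mu, std):
    """Sample a batch of actions from N(mu, std) and return it as a NumPy array."""
    action = torch.normal(mu, std)
    return action.data.numpy()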
class Agent: def __init__(self, state_size, action_size, args, device="cpu"): """ Initialize DDPG agent for each agent in environment :param state_size: State size of the environment :param action_size: Action size of the environment :param args: Hyper-Parameters for training process :param device: Device to utilize """ self.action_size = action_size self.state_size = state_size self.device = device self.discount_factor = args["discount_factor"] self.tau = args["tau"] self.actor_local = Actor(state_size, action_size).to(device) self.actor_target = Actor(state_size, action_size).to(device) self.actor_optimizer = Adam(self.actor_local.parameters(), lr=args["lr_actor"]) self.critic_local = Critic(state_size, action_size).to(device) self.critic_target = Critic(state_size, action_size).to(device) self.critic_optimizer = Adam(self.critic_local.parameters(), lr=args["lr_critic"], weight_decay=args["weight_decay"]) self.hard_update_actor(self.actor_local) self.hard_update_critic(self.critic_local) self.noise = OUNoise(action_size) def learn(self, batch): """ Learn from given batch :param batch: Sampled batch from experience replay buffer :return: (critic_loss, actor_loss) Thanks to the Udacity ddpg_bipedal implementation here. """ if batch is None: return None states, actions, rewards, next_states, dones = batch # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (self.discount_factor * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update_critic(self.critic_local) self.soft_update_actor(self.actor_local) return critic_loss.cpu().data.numpy(), actor_loss.cpu().data.numpy() def hard_update_actor(self, model): """ Hard update for the actor model :param model: Model to be used to update model """ for target_param, param in zip(self.actor_target.parameters(), model.parameters()): target_param.data.copy_(param.data) def hard_update_critic(self, model): """ Hard update for the critic model :param model: Model to be used to update model """ for target_param, param in zip(self.critic_target.parameters(), model.parameters()): target_param.data.copy_(param.data) def soft_update_actor(self, model): """ Soft update for the actor model :param model: Model to be used to update model """ for target_param, param in zip(self.actor_target.parameters(), model.parameters()): target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau) def soft_update_critic(self, model): """ Soft update for the critic model :param model: Model to be used to update model """ for target_param, param in zip(self.critic_target.parameters(), model.parameters()): target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau) def act(self, state, 
add_noise=True): """ Interact with the environment. Decide the actions with the given environment state and noise :param state: Current state of the environment :param add_noise: Whether if the noise will be added to the action :return: Decided actions """ state = torch.from_numpy(state).float().to(self.device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1)
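None of the Actor/Critic architectures referenced in this section are included. A minimal sketch in the common two-hidden-layer DDPG style; the 400/300 layer sizes and the point where the action joins the critic are assumptions, and the referenced classes may differ:

import torch
import torch.nn as nn
import torch.nn.functional as F

class Actor(nn.Module):
    """Maps states to tanh-bounded actions in [-1, 1]."""
    def __init__(self, state_size, action_size, seed=0, fc1=400, fc2=300):
        super().__init__()
        torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1)
        self.fc2 = nn.Linear(fc1, fc2)
        self.fc3 = nn.Linear(fc2, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))

class Critic(nn.Module):
    """Maps (state, action) pairs to scalar Q-values; the action joins at layer 2."""
    def __init__(self, state_size, action_size, seed=0, fc1=400, fc2=300):
        super().__init__()
        torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1)
        self.fc2 = nn.Linear(fc1 + action_size, fc2)
        self.fc3 = nn.Linear(fc2, 1)

    def forward(self, state, action):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(torch.cat([x, action], dim=1)))
        return self.fc3(x)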
class DDPG(object): def __init__(self, nb_status, nb_actions, args, writer): self.clip_actor_grad = args.clip_actor_grad self.nb_status = nb_status * args.window_length self.nb_actions = nb_actions self.writer = writer self.select_time = 0 # Create Actor and Critic Network net_cfg = { 'hidden1':args.hidden1, 'hidden2':args.hidden2, 'init_method':args.init_method } self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) #Create replay buffer self.memory = rpm(args.rmsize) self.random_process = Myrandom(size=nb_actions) # Hyper-parameters self.batch_size = args.batch_size self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.use_cuda = args.cuda # if self.use_cuda: self.cuda() def update_policy(self, train_actor = True): # Sample batch state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size) # Prepare for the target q batch next_q_values = self.critic_target([ to_tensor(next_state_batch, volatile=True), self.actor_target(to_tensor(next_state_batch, volatile=True)), ]) # print('batch of picture is ok') next_q_values.volatile = False target_q_batch = to_tensor(reward_batch) + \ self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values # Critic update self.critic.zero_grad() q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)]) # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float)) value_loss = nn.MSELoss()(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() self.actor.zero_grad() policy_loss = -self.critic([ to_tensor(state_batch), self.actor(to_tensor(state_batch)) ]) policy_loss = policy_loss.mean() policy_loss.backward() if self.clip_actor_grad is not None: torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad)) if self.writer != None: mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()])) #print(mean_policy_grad) self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time) if train_actor: self.actor_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) return -policy_loss, value_loss def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def train(self): self.actor.train() self.actor_target.train() self.critic.train() self.critic_target.train() def cuda(self): self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): self.memory.append([self.s_t, self.a_t, r_t, s_t1, done]) self.s_t = s_t1 def random_action(self): action = np.random.uniform(-1.,1.,self.nb_actions) self.a_t = action return action def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0): self.eval() # print(s_t.shape) action = 
to_numpy( self.actor(to_tensor(np.array([s_t]))) ).squeeze(0) self.train() noise_level = noise_level * max(self.epsilon, 0) action = action * (1 - noise_level) + (self.random_process.sample() * noise_level) action = np.clip(action, -1., 1.) if decay_epsilon: self.epsilon -= self.depsilon self.a_t = action return action def reset(self, obs): self.s_t = obs self.random_process.reset_status() def load_weights(self, output, num=1): if output is None: return self.actor.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num)) ) self.actor_target.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num)) ) self.critic.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) self.critic_target.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) def save_model(self, output, num): if self.use_cuda: self.actor.cpu() self.critic.cpu() torch.save( self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num) ) torch.save( self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num) ) if self.use_cuda: self.actor.cuda() self.critic.cuda()
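The DDPG class above relies on module-level hard_update and soft_update helpers that are not shown. A minimal sketch matching the (target, source) argument order used above:

def hard_update(target, source):
    """Copy source parameters into target verbatim."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update(target, source, tau):
    """Polyak-average source into target: theta_t <- tau*theta_s + (1-tau)*theta_t."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)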
class Agent(): def __init__(self, state_size, action_size, seed, hparams, identity): self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.hparams = hparams self.identity = identity self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.hparams["lr_actor"]) for target_param, source_param in zip(self.actor_target.parameters(), self.actor_local.parameters()): target_param.data.copy_(source_param.data) self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optimizer = optim.Adam( self.critic_local.parameters(), lr=self.hparams["lr_critic"], weight_decay=self.hparams["weight_decay"]) for target_param, source_param in zip(self.critic_target.parameters(), self.critic_local.parameters()): target_param.data.copy_(source_param.data) #Controller will handle shared memory self.memory = ReplayBuffer(action_size, self.hparams["buffer_size"], self.hparams["batch_size"], seed) self.noise = OUNoise(action_size, seed) def act(self, states, add_noise=True): """Returns actions for given state as per current policy.""" #Controller will handle concatenating the actions from each agent if not torch.is_tensor(states): states = torch.from_numpy(states).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(states).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() * self.hparams['epsilon'] return np.clip(action, -1, 1) # Handle step in controller def step(self, states, actions, rewards, next_states, dones, ep): self.memory.add(states, actions, rewards, next_states, dones) if len(self.memory ) > self.hparams["batch_size"] and ep % 5 == 0 and ep > 100: for _ in range(4): experiences = self.memory.sample() self.learn(experiences) def learn(self, experiences): states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (self.hparams["gamma"] * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target) self.soft_update(self.actor_local, self.actor_target) self.hparams['epsilon'] *= self.hparams['epsilon_decay'] def reset(self): self.noise.reset() def soft_update(self, local_model, target_model): for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(self.hparams["tau"] * local_param.data + (1.0 - self.hparams["tau"]) * target_param.data) def print_models(self): print("Agent ", 
str(self.identity), " ", self.actor_local) print("Agent ", str(self.identity), " ", self.critic_local) def save_models(self): torch.save(self.actor_local.state_dict(), str(self.identity) + "_actor_weights.pth") torch.save(self.critic_local.state_dict(), str(self.identity) + "_critic_weights.pth")
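The Agent above defers experience routing and action concatenation to an external controller. A hypothetical minimal controller loop for such agents; the env API (reset/step returning per-agent arrays) and all names here are assumptions, not part of the code above:

import numpy as np

def controller_episode(env, agents, ep):
    """Run one episode, routing each agent's slice of the joint observation."""
    states = env.reset()  # assumed shape: (n_agents, state_size)
    done = False
    while not done:
        # each agent acts on its own observation; stack into a joint action
        actions = np.vstack([agent.act(states[i]) for i, agent in enumerate(agents)])
        next_states, rewards, dones = env.step(actions)  # assumed env API
        for i, agent in enumerate(agents):
            agent.step(states[i], actions[i], rewards[i], next_states[i], dones[i], ep)
        states = next_states
        done = any(dones)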
class DDPG(object): def __init__(self, nb_status, nb_actions, args, writer): self.clip_actor_grad = args.clip_actor_grad self.nb_status = nb_status * args.window_length self.nb_actions = nb_actions self.discrete = args.discrete self.pic = args.pic self.writer = writer self.select_time = 0 if self.pic: self.nb_status = args.pic_status # Create Actor and Critic Network net_cfg = { 'hidden1':args.hidden1, 'hidden2':args.hidden2, 'use_bn':args.bn, 'init_method':args.init_method } if args.pic: self.cnn = CNN(1, args.pic_status) self.cnn_target = CNN(1, args.pic_status) self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate) self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) if args.pic: hard_update(self.cnn_target, self.cnn) #Create replay buffer self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.random_process = Myrandom(size=nb_actions) # Hyper-parameters self.batch_size = args.batch_size self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.use_cuda = args.cuda # if self.use_cuda: self.cuda() def normalize(self, pic): pic = pic.swapaxes(0, 2).swapaxes(1, 2) return pic def update_policy(self): # Sample batch state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size) # Prepare for the target q batch if self.pic: state_batch = np.array([self.normalize(x) for x in state_batch]) state_batch = to_tensor(state_batch, volatile=True) state_batch = self.cnn(state_batch) next_state_batch = np.array([self.normalize(x) for x in next_state_batch]) next_state_batch = to_tensor(next_state_batch, volatile=True) next_state_batch = self.cnn_target(next_state_batch) next_q_values = self.critic_target([ next_state_batch, self.actor_target(next_state_batch) ]) else: next_q_values = self.critic_target([ to_tensor(next_state_batch, volatile=True), self.actor_target(to_tensor(next_state_batch, volatile=True)), ]) # print('batch of picture is ok') next_q_values.volatile = False target_q_batch = to_tensor(reward_batch) + \ self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values # Critic update self.critic.zero_grad() if self.pic: self.cnn.zero_grad() if self.pic: state_batch.volatile = False q_batch = self.critic([state_batch, to_tensor(action_batch)]) else: q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)]) # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float)) value_loss = nn.MSELoss()(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() if self.pic: self.cnn_optim.step() self.actor.zero_grad() if self.pic: self.cnn.zero_grad() if self.pic: state_batch.volatile = False policy_loss = -self.critic([ state_batch, self.actor(state_batch) ]) else: policy_loss = -self.critic([ to_tensor(state_batch), self.actor(to_tensor(state_batch)) ]) policy_loss = policy_loss.mean() policy_loss.backward() if 
self.clip_actor_grad is not None: torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad)) if self.writer != None: mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()])) #print(mean_policy_grad) self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time) self.actor_optim.step() if self.pic: self.cnn_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) if self.pic: soft_update(self.cnn_target, self.cnn, self.tau) return -policy_loss, value_loss def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() if(self.pic): self.cnn.eval() self.cnn_target.eval() def train(self): self.actor.train() self.actor_target.train() self.critic.train() self.critic_target.train() if(self.pic): self.cnn.train() self.cnn_target.train() def cuda(self): self.cnn.cuda() self.cnn_target.cuda() self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): self.memory.append([self.s_t, self.a_t, r_t, s_t1, done]) self.s_t = s_t1 def random_action(self, fix=False): action = np.random.uniform(-1.,1.,self.nb_actions) self.a_t = action if self.discrete and fix == False: action = action.argmax() # if self.pic: # action = np.concatenate((softmax(action[:16]), softmax(action[16:]))) return action def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0): self.eval() if self.pic: s_t = self.normalize(s_t) s_t = self.cnn(to_tensor(np.array([s_t]))) if self.pic: action = to_numpy( self.actor_target(s_t) ).squeeze(0) else: action = to_numpy( self.actor(to_tensor(np.array([s_t]))) ).squeeze(0) self.train() noise_level = noise_level * max(self.epsilon, 0) if np.random.uniform(0, 1) < noise_level: action = self.random_action(fix=True) # epsilon greedy if decay_epsilon: self.epsilon -= self.depsilon self.a_t = action if return_fix: return action if self.discrete: return action.argmax() else: return action def reset(self, obs): self.s_t = obs self.random_process.reset_status() def load_weights(self, output, num=1): if output is None: return self.actor.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num)) ) self.actor_target.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num)) ) self.critic.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) self.critic_target.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) def save_model(self, output, num): if self.use_cuda: self.cnn.cpu() self.actor.cpu() self.critic.cpu() torch.save( self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num) ) torch.save( self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num) ) if self.use_cuda: self.cnn.cuda() self.actor.cuda() self.critic.cuda()
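Both DDPG variants above use to_tensor and to_numpy helpers from the old volatile-Variable era of PyTorch, not shown here. A rough modern-equivalent sketch; the originals wrapped arrays in Variable(..., volatile=True):

import numpy as np
import torch

def to_tensor(ndarray, volatile=False, dtype=torch.float32):
    """Wrap a NumPy array as a tensor; 'volatile' maps to detached, no-grad semantics."""
    t = torch.as_tensor(ndarray, dtype=dtype)
    return t.detach() if volatile else t

def to_numpy(tensor):
    """Detach a tensor from the graph and return it as a NumPy array."""
    return tensor.detach().cpu().numpy()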
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, num_agents, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.num_agents = num_agents self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Actor Network with target net self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network with target net self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Ornstein-Uhlenbeck noise self.noise = OU_Noise(action_size, seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def reset(self): self.noise.reset() self.t_step = 0 def step(self, states, actions, rewards, next_states, dones): # Save experience in replay memory for i in range(self.num_agents): self.memory.add(states[i], actions[i], rewards[i], next_states[i], dones[i]) # update the network UPDATE_TIMES times for every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: for i in range(UPDATE_TIMES): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, states, add_noise=True): """ Returns actions for given state as per current policy. :param state: current state :param add_noise: whether to add Ornstein-Uhlenbeck noise """ states = torch.from_numpy(states).float().to(device) self.actor_local.eval() with torch.no_grad(): action_values = self.actor_local(states).cpu().data.numpy() self.actor_local.train() # add OU_noise to action to explore if add_noise: action_values += self.noise.sample() return np.clip(action_values, -1, 1) def learn(self, experiences, gamma): """ Update policy and value parameters using given batch of experience tuples. 
:param experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples :param gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ------------------- update critic ------------------- # # get predicted next state, actions and Q values from target network actions_next = self.actor_target(next_states) Qtargets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states Qtargets = rewards + (gamma * Qtargets_next * (1 - dones)) # Get expected Q values from local model Qexpected = self.critic_local(states, actions) # calculate the batch loss critic_loss = F.mse_loss(Qexpected, Qtargets) # minimize critic loss self.critic_optimizer.zero_grad() critic_loss.backward() # backward pass torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) #gradient clipping self.critic_optimizer.step() # perform a single optimization step (parameter update) # ------------------- update actor ------------------- # # compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # minimize actor loss self.actor_optimizer.zero_grad() actor_loss.backward() # backward pass self.actor_optimizer.step() # perform a single optimization step (parameter update) # ------------------- update target network ------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """ Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target :param local_model (PyTorch model): weights will be copied from :param target_model (PyTorch model): weights will be copied to :param tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
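A quick numeric sanity check of the soft update used throughout this section: with tau = 0.01, a target weight of 0.0 and a local weight of 1.0 mix to 0.01, so the target network creeps toward the local network by a factor of tau per update.

import torch
import torch.nn as nn

local, target = nn.Linear(1, 1, bias=False), nn.Linear(1, 1, bias=False)
nn.init.constant_(local.weight, 1.0)
nn.init.constant_(target.weight, 0.0)
tau = 0.01
for t_p, l_p in zip(target.parameters(), local.parameters()):
    t_p.data.copy_(tau * l_p.data + (1.0 - tau) * t_p.data)
print(target.weight.item())  # 0.01, i.e. tau*1.0 + (1-tau)*0.0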