def train(self, transitions: int, sigma_max: float = 1., sigma_min: float = 0.,
          buffer_size: int = 10000, batch_size: int = 128,
          progress_upd_step: int = None, start_training: int = 1000,
          shaping_coef: float = 300.):
    """Run the actor-critic training loop with decaying Gaussian action noise.

    Args:
        transitions: Number of environment steps to perform.
        sigma_max: Noise standard deviation used at step 0.
        sigma_min: Value the standard deviation decays towards linearly.
        buffer_size: Size passed to ``ReplayBuffer``.
        batch_size: Number of transitions sampled for every update.
        progress_upd_step: Evaluate the policy every this many steps. A falsy
            value selects ``transitions // 100``, but never less than 1.
        start_training: Updates begin once the step index exceeds this value.
        shaping_coef: Weight of the shaping term added to the reward.

    Returns:
        dict holding the hyperparameters used plus the ``step``,
        ``reward_mean`` and ``reward_std`` series collected at every
        evaluation.
    """
    history = ReplayBuffer(buffer_size)
    if not progress_upd_step:
        # transitions // 100 is 0 for short runs, which would make the modulo
        # below raise ZeroDivisionError; evaluate at least every step instead.
        progress_upd_step = max(1, transitions // 100)

    log = {
        "alpha": self.alpha,
        "gamma": self.gamma,
        "sigma_max": sigma_max,
        "sigma_min": sigma_min,
        "buffer_size": buffer_size,
        "batch_size": batch_size,
        "tau": self.tau,
        "shaping_coef": shaping_coef,
        "step": [],
        "reward_mean": [],
        "reward_std": [],
    }

    state = self.reset()
    t = tqdm(range(transitions))
    for i in t:
        # Linear schedule from sigma_max down towards sigma_min.
        sigma = sigma_max - (sigma_max - sigma_min) * i / transitions
        action = self.act(state)
        noise = np.random.normal(scale=sigma, size=action.shape)
        action = np.clip(action + noise, -1, 1)

        next_state, reward, done, _ = self.env.step(action)
        # Shaping term built from the magnitude of state component 1
        # (presumably velocity in a MountainCar-style task -- confirm).
        reward += shaping_coef * (self.gamma * np.abs(next_state[1]) - np.abs(state[1]))
        # The stored terminal flag is derived from state component 0 rather
        # than from the environment's own `done`, which only drives resets.
        done_ = next_state[0] >= 0.5
        history.add((state, action, next_state, reward, done_))
        state = self.reset() if done else next_state

        if i > start_training:
            batch = history.sample(batch_size)
            self.update_critic(batch)
            self.update_actor(batch)

        if (i + 1) % progress_upd_step == 0:
            reward_mean, reward_std = self.evaluate_policy()
            log["step"].append(i)
            log["reward_mean"].append(reward_mean)
            log["reward_std"].append(reward_std)
            t.set_description(
                f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}"
            )
    return log
def train(self, transitions: int, eps_max: float = 0.5, eps_min: float = 0.,
          buffer_size: int = 10000, batch_size: int = 128,
          shaping_coef: float = 300., progress_upd_step: int = None,
          start_training: int = 10000):
    """Run the epsilon-greedy training loop with a linearly decaying epsilon.

    Args:
        transitions: Number of environment steps to perform.
        eps_max: Probability of a random action at step 0.
        eps_min: Value epsilon decays towards linearly.
        buffer_size: Size passed to ``ReplayBuffer``.
        batch_size: Number of transitions sampled for every update.
        shaping_coef: Weight of the shaping term added to the reward.
        progress_upd_step: Evaluate the policy every this many steps. A falsy
            value selects ``transitions // 100``, but never less than 1.
        start_training: Updates begin once the step index exceeds this value.

    Returns:
        dict holding the hyperparameters used plus the ``step``,
        ``reward_mean`` and ``reward_std`` series collected at every
        evaluation.
    """
    history = ReplayBuffer(size=buffer_size)
    if not progress_upd_step:
        # transitions // 100 is 0 for short runs, which would make the modulo
        # below raise ZeroDivisionError; evaluate at least every step instead.
        progress_upd_step = max(1, transitions // 100)

    log = {
        "alpha": self.alpha,
        "gamma": self.gamma,
        "buffer_size": buffer_size,
        "batch_size": batch_size,
        "tau": self.tau,
        "shaping_coef": shaping_coef,
        "eps_max": eps_max,
        "eps_min": eps_min,
        "step": [],
        "reward_mean": [],
        "reward_std": [],
    }

    state = self.reset()
    t = tqdm(range(transitions))
    for i in t:
        # Linear schedule from eps_max down towards eps_min.
        eps = eps_max - (eps_max - eps_min) * i / transitions
        if random() < eps:
            action = self.env.action_space.sample()
        else:
            action = self.act(state)

        next_state, reward, done, _ = self.env.step(action)
        # Shaping term built from the magnitude of state component 1
        # (presumably velocity in a MountainCar-style task -- confirm).
        reward += shaping_coef * (self.gamma * np.abs(next_state[1]) - np.abs(state[1]))
        # The stored terminal flag is derived from state component 0 rather
        # than from the environment's own `done`, which only drives resets.
        done_ = next_state[0] >= 0.5
        history.add((state, action, next_state, reward, done_))
        state = self.reset() if done else next_state

        if i > start_training:
            self.update(history.sample(batch_size))

        if (i + 1) % progress_upd_step == 0:
            reward_mean, reward_std = self.evaluate_policy()
            log["step"].append(i)
            log["reward_mean"].append(reward_mean)
            log["reward_std"].append(reward_std)
            t.set_description(
                f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}"
            )
    return log
class Agent:
    """Actor-critic agent holding main and target copies of both networks.

    Transitions are drawn from ``self.memory``; each call to :meth:`train`
    performs one critic update, one actor update and one soft update of
    both target networks.
    """

    def __init__(self, input_dim, output_dim, tau=0.001, gamma=0.99, train_batch_size=640):
        """Build the four networks, sync the targets and create the replay memory.

        Params
        ======
            input_dim: forwarded to ``Critic`` / ``Actor`` as first argument
            output_dim: forwarded to ``Critic`` / ``Actor``; also the action width
            tau: forwarded to the networks (soft-update coefficient)
            gamma: discount factor used when building the Q targets
            train_batch_size: ``batch_size`` handed to ``ReplayBuffer``
        """
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.tau = tau
        self.gamma = gamma
        self.train_batch_size = train_batch_size

        net_args = (input_dim, output_dim, tau, gamma)
        self.main_critic = Critic(*net_args)
        self.target_critic = Critic(*net_args)
        self.main_actor = Actor(*net_args)
        self.target_actor = Actor(*net_args)

        # Start every target network as an exact copy of its main network.
        critic_weights = self.main_critic.model.get_weights()
        self.target_critic.model.set_weights(critic_weights)
        actor_weights = self.main_actor.model.get_weights()
        self.target_actor.model.set_weights(actor_weights)

        self.memory = ReplayBuffer(batch_size=train_batch_size)

    def get_action(self, state):
        """Return the main actor's action for ``state``."""
        return self.main_actor.get_action(state)

    def train(self):
        """Sample the replay memory and run one actor-critic update."""
        sampled = self.memory.sample()

        def column(attr):
            # One field of every sampled experience, skipping empty slots.
            return [getattr(exp, attr) for exp in sampled if exp is not None]

        states = np.vstack(column("state"))
        actions = np.array(column("action")).astype(np.float32).reshape(-1, self.output_dim)
        rewards = np.array(column("reward")).astype(np.float32).reshape(-1, 1)
        dones = np.array(column("done")).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(column("next_state"))

        # Bootstrapped target: r + gamma * Q_target(s', pi_target(s')),
        # with the bootstrap term dropped for finished transitions.
        next_actions = self.target_actor.model.predict_on_batch(next_states)
        next_q = self.target_critic.model.predict_on_batch([next_states, next_actions])
        q_targets = rewards + self.gamma * next_q * (1 - dones)
        self.main_critic.train(states, actions, q_targets)

        # The actor is trained with the critic's gradient w.r.t. the actions.
        raw_gradients = self.main_critic.get_gradient(states, actions)
        action_gradients = np.reshape(raw_gradients, (-1, self.output_dim))
        self.main_actor.train(states, action_gradients)

        self.target_actor.model = self.main_actor.soft_update(self.target_actor.model)
        self.target_critic.model = self.main_critic.soft_update(self.target_critic.model)
class AgentCommon():
    """Interacts with and learns from the environment.

    Wraps two actors (``actorL`` and ``actorR``) that share one noise
    process, one replay memory and one critic.
    """

    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents; sets the first noise dimension
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        # random.seed() returns None, so this seeds the global RNG and
        # leaves self.seed set to None.
        self.seed = random.seed(random_seed)

        # Noise process shared by both actors
        self.noise = OUNoise((self.num_agents, action_size), seed=random_seed)

        # Replay memory shared by both actors
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

        self.actorL = ActorAgent(state_size, action_size, num_agents, self.noise,
                                 LR_ACTOR, self.memory, random_seed)
        self.actorR = ActorAgent(state_size, action_size, num_agents, self.noise,
                                 LR_ACTOR, self.memory, random_seed)
        self.sharedcritic = CriticAgent(state_size, action_size, num_agents,
                                        LR_CRITIC, WEIGHT_DECAY, TAU, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Forward one transition per actor, then train the shared critic.

        Index 0 of every argument goes to ``actorL`` and index 1 to
        ``actorR``.
        """
        self.actorL.step(state[0], action[0], reward[0], next_state[0], done[0])
        self.actorR.step(state[1], action[1], reward[1], next_state[1], done[1])

        # Learn, if enough samples are available in memory. Each actor is
        # trained on its own independently drawn batch.
        if len(self.memory) > BATCH_SIZE:
            experiences1 = self.memory.sample()
            experiences2 = self.memory.sample()
            self.sharedcritic.learn(self.actorL, experiences1, GAMMA)
            self.sharedcritic.learn(self.actorR, experiences2, GAMMA)

    def act(self, state, add_noise=True):
        """Return ``[actionL, actionR]`` for the two per-agent states.

        ``state[0]`` is evaluated by ``actorL`` and ``state[1]`` by
        ``actorR``.
        """
        actionL = self.actorL.act(state[0], add_noise=add_noise)
        # Previously this also called actorL, so actorR never chose an action.
        actionR = self.actorR.act(state[1], add_noise=add_noise)
        return [actionL, actionR]

    def reset(self):
        """Reset the shared noise process."""
        self.noise.reset()
class Agent():
    """DDPG agent: one actor/critic pair (plus targets) shared by all agents."""

    def __init__(self, device, state_size, n_agents, action_size, random_seed,
                 buffer_size, batch_size, gamma, TAU, lr_actor, lr_critic, weight_decay,
                 learn_interval, learn_num, ou_sigma, ou_theta, checkpoint_folder = './'):
        """Store the hyperparameters and build networks, noise and replay memory.

        ``checkpoint_folder`` is accepted but not used by this constructor.
        """
        # Computational device
        self.DEVICE = device

        # Problem dimensions
        self.state_size = state_size
        self.n_agents = n_agents
        self.action_size = action_size
        # random.seed() returns None, so self.seed ends up as None.
        self.seed = random.seed(random_seed)
        self.l_step = 0
        self.log_interval = 200

        # Hyperparameters
        self.BUFFER_SIZE = buffer_size
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.TAU = TAU
        self.LR_ACTOR = lr_actor
        self.LR_CRITIC = lr_critic
        self.WEIGHT_DECAY = weight_decay
        self.LEARN_INTERVAL = learn_interval
        self.LEARN_NUM = learn_num

        # Actor: local network, target network, optimizer on the local one
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic: local network, target network, optimizer on the local one
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic, weight_decay=weight_decay)

        # Exploration noise, one row per agent
        self.noise = OUNoise((n_agents, action_size), random_seed,
                             mu=0., theta=ou_theta, sigma=ou_sigma)

        # Replay memory
        self.memory = ReplayBuffer(device, action_size, buffer_size, batch_size, random_seed)

    def act(self, states, add_noise=True):
        """Return one clipped action row per agent for the given states."""
        state_batch = torch.from_numpy(states).float().to(self.DEVICE)
        actions = np.zeros((self.n_agents, self.action_size))

        # Inference only: evaluation mode, no gradient tracking.
        self.actor_local.eval()
        with torch.no_grad():
            for idx, agent_state in enumerate(state_batch):
                actions[idx, :] = self.actor_local(agent_state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def step(self, t, s, a, r, s_, done):
        """Store every agent's transition; learn on every LEARN_INTERVAL-th step."""
        for agent_idx in range(self.n_agents):
            self.memory.add(s[agent_idx], a[agent_idx], r[agent_idx],
                            s_[agent_idx], done[agent_idx])

        # Skip learning between intervals or while the buffer is too small.
        if t % self.LEARN_INTERVAL != 0 or len(self.memory) <= self.BATCH_SIZE:
            return

        for _ in range(self.LEARN_NUM):
            self.learn(self.memory.sample(), self.GAMMA)

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update critic, actor and both target networks from one batch.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))

        Params:
            experiences (Tuple[torch.Tensor]): (s, a, r, s', done) batch
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ------ Critic ------ #
        target_actions = self.actor_target(next_states)
        next_q = self.critic_target(next_states, target_actions)
        # No bootstrapping from transitions flagged as done.
        q_targets = rewards + (gamma * next_q * (1 - dones))
        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ------ Actor ------ #
        # Maximise the critic's value of the actor's own actions.
        predicted_actions = self.actor_local(states)
        actor_loss = -self.critic_local(states, predicted_actions).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ------ Target networks ------ #
        self.soft_update(self.critic_local, self.critic_target, self.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.TAU)

    def soft_update(self, local_model, target_model, tau):
        """Blend local weights into the target model.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        pairs = zip(target_model.parameters(), local_model.parameters())
        for target_param, local_param in pairs:
            blended = tau * local_param.data + (1.0 - tau) * target_param.data
            target_param.data.copy_(blended)
# Train the DQN from a replay buffer and plot the average loss per episode.
buffer = ReplayBuffer(BUFFER_SIZE)
losses = np.zeros(N_EPISODES)

for episode in range(N_EPISODES):
    # One loss slot per step; slots of steps without a training update stay
    # at 0 and therefore still count towards the episode average.
    episode_losses = np.zeros(EPISODE_LENGTH)
    # Put the agent back at the start of a fresh episode.
    agent.reset()

    for step_num in range(EPISODE_LENGTH):
        # Advance the agent by one step and remember the transition.
        transition = agent.step()
        buffer.append(transition)

        # Training only starts once a full batch can be sampled.
        if len(buffer) < BATCH_SIZE:
            continue
        loss = dqn.batch_train_q_network(buffer.sample(BATCH_SIZE))
        episode_losses[step_num] = loss

    losses[episode] = np.mean(episode_losses)
    print("Finished episode {}, average loss = {}".format(
        episode, losses[episode]))

# Loss curve on a logarithmic y-axis, written next to the script.
ax.plot(losses, color='blue')
plt.yscale('log')
fig.savefig("dqn_erb_loss_vs_episodes.png")
class TD3():
    """
    Twin Delayed Deep Deterministic Policy Gradient Model
    """

    def __init__(self, state_size, action_size, random_seed):
        """
        Initialize the model with arguments as follows:

        ARGUMENTS
        =========
            - state_size (int) = dimension of input space
            - action_size (int) = dimension of action space
            - random_seed (int) = random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so self.seed ends up as None.
        self.seed = random.seed(random_seed)

        # create noise
        self.noise = OUNoise(action_size, random_seed)
        # noise_decay is the multiplier currently applied to exploration
        # noise; it shrinks by the constant _noise_decay_rate per noisy action.
        self.noise_decay = NOISE_DECAY
        self._noise_decay_rate = NOISE_DECAY

        # create memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, device)

        # Actor Networks (local online net + target net)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Networks (local online net + target net)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # instantiate online and target networks with same weights
        # (tau=1 turns the soft update into a plain copy)
        self.soft_update(self.actor_local, self.actor_target, 1)
        self.soft_update(self.critic_local, self.critic_target, 1)

        self.learn_counter = 0

    def act(self, state, add_noise=True):
        """
        Choose an action while interacting and learning in the environment

        Returns the local actor's action for `state`, optionally perturbed
        by decaying exploration noise, clipped to [-1, 1].
        """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self._exploration_noise()
        return np.clip(action, -1, 1)

    def _exploration_noise(self):
        """Return one scaled noise sample and decay the scale geometrically."""
        scaled = self.noise.sample() * self.noise_decay
        # Multiply by the constant rate. The previous code multiplied the
        # scale by itself, squaring it each call and killing exploration
        # after a handful of steps.
        self.noise_decay *= self._noise_decay_rate
        return scaled

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma, noise_clip=0.5, policy_freq=2):
        """
        Sample from experiences and learn

        ARGUMENTS
        =========
            - experiences = tuple (states, actions, rewards, next_states, dones)
            - gamma (float) = discount factor
            - noise_clip (float) = bound applied to the target-policy noise
            - policy_freq (int) = actor and target networks are updated on
              every policy_freq-th call
        """
        # update the learn counter
        self.learn_counter += 1

        # get experience tuples
        states, actions, rewards, next_states, dones = experiences

        # Target-policy smoothing noise. It is drawn from the same OU process
        # used in act() (so learning advances that process too), one sample
        # per batch row. Stacking into one float32 array first avoids
        # building a tensor from a list of arrays.
        noise_rows = np.array([self.noise.sample() for _ in range(len(actions))],
                              dtype=np.float32)
        noise = torch.from_numpy(noise_rows).to(device)
        noise = noise.clamp(-noise_clip, noise_clip)

        # clip between -/+ max action dims because action+noise might run out of range
        next_action = (self.actor_target(next_states) + noise).clamp(-1, 1)

        # compute the target Q value from the smaller of the two target critics
        target_Q1, target_Q2 = self.critic_target(next_states, next_action)
        target_Q = torch.min(target_Q1, target_Q2)
        target_Q = rewards + (gamma * target_Q * (1 - dones)).detach()

        # get current Q estimates
        current_Q1, current_Q2 = self.critic_local(states, actions)

        # compute critic loss over both critic heads
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # update the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # delay the policy update
        if self.learn_counter % policy_freq == 0:
            # actor loss uses only the first critic head
            actions_pred = self.actor_local.forward(states)
            actor_loss = -self.critic_local.Q1(states, actions_pred).mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # delayed update of actor and critic target models
            self.soft_update(self.actor_local, self.actor_target, TAU)
            self.soft_update(self.critic_local, self.critic_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """
        Perform soft update of the target networks: keep (1 - tau) of the
        target network and add a fraction tau of the current online network,
        which prevents oscillation.
        """
        for local_param, target_param in zip(local_model.parameters(),
                                             target_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def step(self, state, action, reward, next_state, done):
        """
        Add the new transition to memory, then learn from one sampled batch
        as soon as the memory holds more than BATCH_SIZE entries.
        """
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)
def main():
    """Train MADDPG agents in the Unity Tennis environment.

    Collects episodes into a replay buffer, updates every agent after each
    episode, periodically logs progress, and writes checkpoints, the score
    history and score plots to ``model_dir``. Training stops early once the
    windowed average score exceeds ``tgt_score``; the model is saved before
    stopping. The environment and the logger are closed even when training
    fails.
    """
    ##########
    # CONFIG #
    ##########
    # Target Reward
    tgt_score = 0.5
    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Seed
    seed = 7
    seeding(seed)
    # Model Architecture
    # Actor
    hidden_in_actor = 256
    hidden_out_actor = 128
    lr_actor = 1e-4
    # Critic
    hidden_in_critic = 256
    hidden_out_critic = 128
    lr_critic = 3e-4
    weight_decay_critic = 0
    # Episodes
    number_of_episodes = 10000
    episode_length = 2000
    # Buffer
    buffer_size = int(1e6)
    batchsize = 512
    # Agent Update Frequency
    episode_per_update = 1
    # Rewards Discounts Factor
    discount_factor = 0.95
    # Soft Update Weight
    tau = 1e-2
    # Noise Process
    noise_factor = 2
    noise_reduction = 0.9999
    noise_floor = 0.0
    # Window
    win_len = 100
    # Save Frequency
    save_interval = 200
    # Logger
    log_path = os.getcwd() + "/log"
    logger = SummaryWriter(log_dir=log_path)
    # Model Directory
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)
    # Load Saved Model
    load_model = False

    ####################
    # Load Environment #
    ####################
    env = UnityEnvironment(file_name="./Tennis_Linux_NoVis/Tennis.x86_64")
    try:
        # Get brain
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        print('Brain Name:', brain_name)

        # Reset the environment
        env_info = env.reset(train_mode=True)[brain_name]

        # Number of Agents
        num_agents = len(env_info.agents)
        print('Number of agents:', num_agents)

        # size of each action
        action_size = brain.vector_action_space_size
        print('Size of each action:', action_size)

        # examine the state space
        states = env_info.vector_observations
        state_size = states.shape[1]
        print('There are {} agents. Each observes a state with length: {}'.format(
            states.shape[0], state_size))

        ####################
        # Show Progressbar #
        ####################
        widget = [
            'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
            pb.Percentage(), ' ', pb.ETA(), ' ',
            pb.Bar(marker=pb.RotatingMarker()), ' '
        ]
        timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()
        start = time.time()

        ###############
        # Multi Agent #
        ###############
        maddpg = MADDPG(state_size, action_size, num_agents, hidden_in_actor,
                        hidden_out_actor, lr_actor, hidden_in_critic,
                        hidden_out_critic, lr_critic, weight_decay_critic,
                        discount_factor, tau, seed, device)

        if load_model:
            # NOTE(review): checkpoints are written as 'episode-Latest.pt' but
            # read from 'episode-saved.pt' -- confirm the rename is intended.
            # map_location lets a checkpoint saved on GPU load on a CPU host.
            load_dict_list = torch.load(os.path.join(model_dir, 'episode-saved.pt'),
                                        map_location=device)
            for i in range(num_agents):
                agent = maddpg.maddpg_agent[i]
                agent.actor.load_state_dict(load_dict_list[i]['actor_params'])
                agent.actor_optimizer.load_state_dict(
                    load_dict_list[i]['actor_optim_params'])
                agent.critic.load_state_dict(load_dict_list[i]['critic_params'])
                agent.critic_optimizer.load_state_dict(
                    load_dict_list[i]['critic_optim_params'])

        #################
        # Replay Buffer #
        #################
        rebuffer = ReplayBuffer(buffer_size, seed, device)

        def report_progress(i_episode, episode_t, avg_rewards, noise_factor):
            # Send the running average and noise factor to the logger and stdout.
            logger.add_scalars('rewards', {
                'Avg Reward': avg_rewards,
                'Noise Factor': noise_factor
            }, i_episode)
            print(
                '\nElapsed time {:.1f} \t Update Count {} \t Last Episode t {}'
                .format((time.time() - start) / 60, maddpg.update_count, episode_t),
                '\nEpisode {} \tAverage Score: {:.2f} \tNoise Factor {:2f}'.
                format(i_episode, avg_rewards, noise_factor),
                end="\n")

        def save_checkpoint():
            # Persist network and optimizer state of every agent.
            save_dict_list = []
            for i in range(num_agents):
                agent = maddpg.maddpg_agent[i]
                save_dict_list.append({
                    'actor_params': agent.actor.state_dict(),
                    'actor_optim_params': agent.actor_optimizer.state_dict(),
                    'critic_params': agent.critic.state_dict(),
                    'critic_optim_params': agent.critic_optimizer.state_dict()
                })
            torch.save(save_dict_list, os.path.join(model_dir, 'episode-Latest.pt'))

        def save_scores(scores_history):
            # Write the raw score history and a plot with its rolling mean.
            pd.Series(scores_history).to_csv(os.path.join(model_dir, "scores.csv"))
            rolling_mean = pd.Series(scores_history).rolling(win_len).mean()
            fig = plt.figure()
            fig.add_subplot(111)
            plt.plot(np.arange(len(scores_history)), scores_history)
            plt.axhline(y=tgt_score, color='r', linestyle='dashed')
            plt.plot(rolling_mean, lw=3)
            plt.ylabel('Score')
            plt.xlabel('Episode #')
            fig.savefig(os.path.join(model_dir, 'Average_Score.pdf'))
            fig.savefig(os.path.join(model_dir, 'Average_Score.jpg'))
            plt.close()

        #################
        # TRAINING LOOP #
        #################
        # initialize scores
        scores_history = []
        # NOTE(review): the stopping average spans save_interval episodes
        # while the plotted rolling mean spans win_len -- confirm intended.
        scores_window = deque(maxlen=save_interval)

        for i_episode in range(number_of_episodes):
            timer.update(i_episode)

            # Reset Environment
            env_info = env.reset(train_mode=True)[brain_name]
            states = env_info.vector_observations
            scores = np.zeros(num_agents)

            # Reset Agent
            maddpg.reset()

            for episode_t in range(episode_length):
                # Explore with decaying noise factor
                actions = maddpg.act(states, noise_factor=noise_factor)
                env_info = env.step(actions)[brain_name]  # Environment reacts
                next_states = env_info.vector_observations  # get the next states
                rewards = env_info.rewards  # get the rewards
                dones = env_info.local_done  # see if episode has finished

                ###################
                # Save Experience #
                ###################
                rebuffer.add(states, actions, rewards, next_states, dones)

                scores += rewards
                states = next_states

                if any(dones):
                    break

            # The episode score is the better of the agents' scores.
            scores_history.append(np.max(scores))
            scores_window.append(np.max(scores))
            avg_rewards = np.mean(scores_window)
            # Reduce Noise Factor
            noise_factor = max(noise_floor, noise_factor * noise_reduction)

            #########
            # LEARN #
            #########
            if len(rebuffer) > batchsize and i_episode % episode_per_update == 0:
                for a_i in range(num_agents):
                    samples = rebuffer.sample(batchsize)
                    maddpg.update(samples, a_i, logger)
                # Soft Update
                maddpg.update_targets()

            is_last_episode = i_episode == number_of_episodes - 1
            solved = avg_rewards > tgt_score

            ##################
            # Track Progress #
            ##################
            if i_episode % save_interval == 0 or is_last_episode:
                report_progress(i_episode, episode_t, avg_rewards, noise_factor)

            ##############
            # Save Model #
            ##############
            # The last episode index is number_of_episodes - 1; comparing
            # against number_of_episodes never matched, so the final model was
            # not saved. A solved run is saved too, before leaving the loop.
            if i_episode % save_interval == 0 or is_last_episode or solved:
                save_checkpoint()
                save_scores(scores_history)

            if solved:
                report_progress(i_episode, episode_t, avg_rewards, noise_factor)
                break
    finally:
        # Release the Unity process and flush the logger even on failure.
        env.close()
        logger.close()
    timer.finish()
ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm)) print("s_t: ", s_t) print("s_t size: ", s_t.size) a = [[0, 1]] #t_start = timeit.default_timer() for i in range(max_step): ob, r_t, done, info = env.step(a[0]) if done: break s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm)) memory.put((s_t, a[0], r_t, s_t1, done)) s_t = s_t1 #t_end = timeit.default_timer() s_done = s_t print('done?: ', s_done) #print('{}steps, {} time spent'.format(i,t_end-t_start)) env.end() s, a, r, sp, d = memory.sample(3) print('s: ', s) print('a: ', a) print('r: ', r) print('sp: ', sp) print('d: ', d) # # --noise 테스트합니다.-- # noise = OrnsteinUhlenbeckNoise(mu = np.zeros(1),theta=0.1,dt=0.2,sigma = 0.1, x0 = np.array([0.5])) # for i in range(300): # noise() # print(noise)
class DDQNAgent():
    """Interacts with and learns from the environment."""

    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers=None,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 learning_rate=5e-4,
                 update_every=4,
                 head_name="DuelingDQN",
                 head_scale="max"):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (list of int ; optional): number of each layer nodes;
                None selects [64, 64]
            buffer_size (int ; optional): replay buffer size
            batch_size (int; optional): minibatch size
            gamma (float; optional): discount factor
            tau (float; optional): for soft update of target parameters
            learning_rate (float; optional): learning rate
            update_every (int; optional): how often to update the network
            head_name (str; optional): forwarded unchanged to QNetwork
            head_scale (str; optional): forwarded unchanged to QNetwork
        """
        # A fresh list per instance; a list literal as the default value
        # would be shared by every agent created without this argument.
        if hidden_layers is None:
            hidden_layers = [64, 64]

        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so self.seed ends up as None.
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = learning_rate
        self.update_every = update_every

        # detect GPU device
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Assign model parameters and assign device
        model_params = [
            state_size, action_size, seed, hidden_layers, head_name, head_scale
        ]
        self.qnetwork_local = QNetwork(*model_params).to(self.device)
        self.qnetwork_target = QNetwork(*model_params).to(self.device)

        # Set up optimizer
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)

        # Initialize Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed, self.device)

        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn every ``update_every`` steps."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Update time step
        self.t_step = self.t_step + 1

        # Learn every self.update_every time steps.
        if self.t_step % self.update_every == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Go to evaluation mode and get Q values for current state
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        # get back to train mode
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Run one Double-DQN update on a batch of experiences.

        The greedy next action is chosen by the local network and evaluated
        by the target network; the loss is the summed squared TD error.

        Params
        ======
            experiences (tuple): (states, actions, rewards, next_states, dones)
            gamma (float): discount factor
        """
        # From the experiences buffer, separate out S_t, A_t, R_t, S_t+1, done data
        states, actions, rewards, next_states, dones = experiences

        # Go to evaluation mode
        self.qnetwork_target.eval()
        with torch.no_grad():
            # get Q values for the next state
            Q_dash_local = self.qnetwork_local(next_states)
            Q_dash_target = self.qnetwork_target(next_states)
            # Find the predicted action based on the local Q_network
            argmax_action = torch.max(Q_dash_local, dim=1, keepdim=True)[1]
            # Get the Q-value from the target network
            Q_dash_max = Q_dash_target.gather(1, argmax_action)
            # Update the target value
            y = rewards + gamma * Q_dash_max * (1 - dones)
        # Go back to train mode
        self.qnetwork_target.train()

        # Predict Q-values based on the local network
        self.optimizer.zero_grad()
        Q = self.qnetwork_local(states)
        y_pred = Q.gather(1, actions)

        # TD-error/loss function
        loss = torch.sum((y - y_pred)**2)
        # Optimize the network
        loss.backward()
        self.optimizer.step()

        # Update the target network using the local and target networks
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class DQNAgent(Agent):
    """Epsilon-greedy Q-learning agent with a replay memory and a target network.

    The training network selects the best next action and the target network
    provides its value (see ``learn``). The target network is re-cloned from
    the training network every ``copy_each`` steps after the burn-in phase.
    """

    class SAVE:
        """Bit flags selecting what ``save`` writes; combine them with ``|``."""
        MEMORY = 1
        TARGETNETWORK = 2
        TRAINNETWORK = 4
        HYPERPARAM = 8
        ALL = 15

    def __init__(self,
                 DQNType,
                 input_shape,
                 replaybuffersize=100000,
                 input_preprocess=None):
        """Build the networks, the replay memory and the default hyperparameters.

        Args:
            DQNType: network factory called as ``DQNType(input_shape, n_actions)``.
            input_shape: shape handed to ``DQNType``.
            replaybuffersize: capacity passed to ``ReplayBuffer``.
            input_preprocess: optional sequence of callables applied in order
                to every observation; ``None`` means no preprocessing.
        """
        super().__init__(MOVEMENTS.COMPLEX)
        self.memory = ReplayBuffer(replaybuffersize)
        self.train_network = DQNType(input_shape, len(self.movements))
        self.target_network = self.train_network.clone_model()
        # A fresh list per instance: a shared ``[]`` default would be visible
        # to every agent created without an explicit pipeline.
        self.input_preprocess = (input_preprocess
                                 if input_preprocess is not None else [])

        ## Initialize
        self.counter = 0
        self.epsilon = 1

        ## hyperparameters
        self.hyperparams = {
            "burn_in": 10000,  # steps collected before any learning
            "copy_each": 5000,  # target-network refresh period
            "learn_each": 1,  # learning period
            "save_each": 5000,  # autosave period
            "final_epsilon": 0.1,  # lower bound for epsilon
            "epsilon_decay_rate": 0.99998,  # multiplicative decay per step
            "batch_size": 32,
            "gamma": 0.99
        }

    def setparam(self, **kwargs):
        """Override hyperparameters by keyword; returns ``self`` for chaining."""
        for key, val in kwargs.items():
            self.hyperparams[key] = val
        return self

    def getparams(self):
        """Return the live hyperparameter dictionary (not a copy)."""
        return self.hyperparams

    def preprocess(self, image):
        """Apply every callable in ``input_preprocess`` to ``image``, in order."""
        for pc in self.input_preprocess:
            image = pc(image)
        return image

    def reward(self, reward, info_old, info_new):
        """Add one hundredth of the ``"score"`` change to the raw reward."""
        return reward + (info_new["score"] - info_old["score"]) / 100

    def action(self, states):
        """Choose a movement epsilon-greedily and remember the choice.

        The preprocessed observation and the chosen action index are kept on
        the instance so that ``feedback`` can store the full transition.
        """
        self.action_states = self.preprocess(states)

        ## Random exploration
        if random.uniform(0, 1) < self.epsilon:
            self.action_num = random.choice(range(len(self.movements)))
        ## Make a decision based on the network
        else:
            normalized_states = figure.normalize()(
                self.action_states)  # convert to 0-1 scale
            output = self.train_network.predict(normalized_states[None, ...])
            self.action_num = np.argmax(output)
        return self.movements[self.action_num]

    def feedback(self, states, reward, info, done):
        """Store the transition that followed the last ``action`` call and
        run the periodic update logic."""
        self.counter += 1
        self.memory.append((
            self.action_states,  # already preprocessed
            self.action_num,
            reward,
            info,
            done,
            self.preprocess(states)))
        self.updateNetwork()

    def save(self, file_path, saveMethod=None):
        """Persist the parts selected by the ``SAVE`` bit mask.

        ``file_path`` is used as a plain string prefix for every file name.
        Hyperparameters are written as ``hyperparam.json`` so that ``load``
        finds the file it reads.
        """
        if saveMethod is None:
            saveMethod = self.SAVE.ALL
        if saveMethod & self.SAVE.MEMORY:
            self.memory.save(file_path + "memory")
        if saveMethod & self.SAVE.TARGETNETWORK:
            self.target_network.save_model(file_path + "target_net")
        if saveMethod & self.SAVE.TRAINNETWORK:
            self.train_network.save_model(file_path + "train_net")
        if saveMethod & self.SAVE.HYPERPARAM:
            with open(file_path + "hyperparam.json", "w") as f:
                json.dump(self.hyperparams, f, indent=2)

    def load(self, file_path):
        """Best-effort restore of both networks and the hyperparameters.

        Any failure is printed and otherwise ignored, so a partially missing
        checkpoint leaves the remaining state untouched.
        """
        try:
            self.target_network.load_model(file_path + "target_net")
            self.train_network.load_model(file_path + "train_net")
            with open(file_path + "hyperparam.json", "r") as f:
                self.hyperparams = json.load(f)
        except Exception as e:  # deliberate best-effort: report and carry on
            print(e)

    def updateNetwork(self):
        """Decay epsilon and trigger learning / target copy / autosave.

        Nothing happens before ``burn_in`` steps have been recorded; after
        that each action fires when the number of steps since burn-in is a
        multiple of its period.
        """
        params = self.hyperparams
        if self.counter < params["burn_in"]:
            return

        self.epsilon = max(self.epsilon * params["epsilon_decay_rate"],
                           params["final_epsilon"])

        steps = self.counter - params["burn_in"]
        if steps % params["learn_each"] == 0:
            self.learn()
        if steps % params["copy_each"] == 0:
            self.target_network = self.train_network.clone_model()
        if steps % params["save_each"] == 0:
            self.save("./autosave/step_" + str(self.counter))

    def learn(self):
        """Fit the training network on one sampled batch.

        The training network picks the best next action, the target network
        is queried for that action, and the resulting target is written only
        into the column of the action that was actually taken.
        """
        learn_sample = self.memory.sample(self.hyperparams["batch_size"])

        # Transitions are (state, action, reward, info, done, next_state).
        states, actions, rewards, _, dones, next_states = zip(*learn_sample)
        actions = list(actions)
        not_done = np.array([not done for done in dones])

        state = figure.normalize()(np.stack(states, axis=0))
        next_state = figure.normalize()(np.stack(next_states, axis=0))

        best_action_next = np.argmax(self.train_network.predict(next_state),
                                     axis=1)

        # Predicts the Q values calculated at the best_action_next.
        # Terminal states contribute the reward only (not_done is False).
        Q_value_next_target_mat = self.target_network.predict(
            next_state, actions=best_action_next)
        Q_value_next_target_vec = np.max(Q_value_next_target_mat, axis=1)
        Q_value_target_vec = (
            np.array(rewards) +
            self.hyperparams["gamma"] * not_done * Q_value_next_target_vec)

        # Only the entries of the actions really taken carry a target.
        Q_value_target_mat = np.zeros(Q_value_next_target_mat.shape)
        Q_value_target_mat[np.arange(len(actions)), actions] = Q_value_target_vec

        self.train_network.fit(state, actions, Q_value_target_mat, verbose=0)
# Train for N_EPISODES episodes of EPISODE_LENGTH steps each.
for episode in range(N_EPISODES):
    # Exploration rate stays at 1 for the first ten episodes, then decays as 10 / (episode + 1).
    epsilon = min(10 / (episode + 1), 1)
    episode_losses = np.zeros(EPISODE_LENGTH)
    # Start every episode from a freshly reset environment.
    agent.reset()
    for step_num in range(EPISODE_LENGTH):
        # Take one step and store the resulting transition.
        buffer.append(agent.step(dqn, epsilon))
        # Train as soon as a full batch can be drawn.
        if len(buffer) >= BATCH_SIZE:
            minibatch = buffer.sample(BATCH_SIZE)
            episode_losses[step_num] = dqn.batch_train_q_network(
                minibatch, target_network=target)
        # Refresh the target network every TARGET_SWAP environment steps.
        global_step = episode * EPISODE_LENGTH + step_num
        if global_step % TARGET_SWAP == 0:
            print("Swapped target network on step {}".format(global_step))
            target.q_network.load_state_dict(dqn.q_network.state_dict())
        # time.sleep(0.05)
    # Steps without a training update contribute a loss of zero to the mean.
    losses[episode] = episode_losses.mean()
    print("Finished episode {}, average loss = {}".format(
        episode, losses[episode]))

# evaluate Q-value
q_values = np.zeros((10, 10, 4))
for col in range(10):
    x = col / 10 + 0.05
class MADDPG:
    """Multi-agent DDPG: one actor per agent and a single shared critic."""

    def __init__(self,
                 num_agents,
                 local_obs_dim,
                 local_action_size,
                 global_obs_dim,
                 global_action_size,
                 discount_factor=0.95,
                 tau=0.02,
                 device=device,
                 random_seed=4,
                 lr_critic=1.0e-4,
                 weight_decay=0.0):
        """Create the actors, the critic pair, its optimizer and the replay memory.

        ``tau`` is the soft-update rate and ``discount_factor`` the gamma
        handed to ``learn``.
        """
        super().__init__()

        # Configuration.
        self.num_agents = num_agents
        self.device = device
        self.discount_factor = discount_factor
        self.tau = tau
        self.global_action_size = global_action_size
        self.global_obs_dim = global_obs_dim

        # Seed both RNGs before any network is created.
        torch.manual_seed(random_seed)
        random.seed(random_seed)
        self.random_seed = random_seed
        self.weight_decay = weight_decay

        # One actor per agent.
        actors = []
        for _ in range(num_agents):
            actors.append(
                DDPGActor(num_agents,
                          local_obs_dim,
                          local_action_size,
                          global_obs_dim,
                          global_action_size,
                          device=device))
        self.actors = actors

        # Critic over the global observation/action, plus its target copy.
        def make_critic():
            critic = Critic(global_obs_dim, global_action_size,
                            self.random_seed)
            return critic.to(self.device)

        self.critic = make_critic()
        self.target_critic = make_critic()
        hard_update(self.target_critic, self.critic)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=self.weight_decay)

        # Exploration-noise scale and its per-update decrement.
        self.noise_coef = 1.0
        self.noise_coef_decay = 1e-6

        # Replay memory.
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)

    def act(self, obs_all_agents):
        """Return one action per agent from the local actors, using the current noise scale."""
        actions = []
        for ddpg_actor, local_obs in zip(self.actors, obs_all_agents):
            actions.append(ddpg_actor.act(local_obs, self.noise_coef))
        return actions

    def target_act(self, obs_all_agents):
        """Return one noise-free action per agent from the target actors."""
        actions = []
        for ddpg_actor, local_obs in zip(self.actors, obs_all_agents):
            actions.append(
                ddpg_actor.target_act(local_obs,
                                      noise_coef=0,
                                      add_noise=False))
        return actions

    def step(self, obs, obs_full, actions, rewards, next_obs, next_obs_full,
             dones, timestep):
        """Store a transition and, every TRAIN_EVERY steps, run N_LEARN_UPDATES updates."""
        self.memory.add(obs, obs_full, actions, rewards, next_obs,
                        next_obs_full, dones)

        # Learn only on the scheduled steps and once more than a batch is stored.
        due = timestep % TRAIN_EVERY == 0
        if not (len(self.memory) > BATCH_SIZE and due):
            return
        for _ in range(N_LEARN_UPDATES):
            self.learn(self.memory.sample(), self.discount_factor)

    def learn(self, experiences, gamma):
        """Update the critic, then every actor, then all target networks."""
        (obs, obs_full, action, reward, next_obs, next_obs_full,
         done) = experiences

        # Put the agent axis first for the per-agent views; flatten the global ones.
        obs = obs.permute(1, 0, -1)  # agent_id * batch_size * state_size
        next_obs = next_obs.permute(1, 0, -1)
        obs_full = obs_full.view(-1, self.global_obs_dim)
        next_obs_full = next_obs_full.view(-1, self.global_obs_dim)
        action = action.reshape(-1, self.global_action_size)

        # ---------------- critic ----------------
        self.critic_optimizer.zero_grad()

        # Joint next action from the target actors.
        next_actions = torch.from_numpy(np.array(
            self.target_act(next_obs))).float()
        next_actions = next_actions.permute(1, 0, -1)
        next_actions = next_actions.reshape(-1, self.global_action_size)

        with torch.no_grad():
            q_next = self.target_critic.forward(next_obs_full,
                                                next_actions.to(self.device))
        y = reward + gamma * q_next * (1 - done)
        q = self.critic.forward(obs_full, action)

        # Average the MSE against each agent's target column.
        critic_loss = 0
        for agent_idx in range(self.num_agents):
            agent_target = y[:, agent_idx].detach().reshape(-1, 1)
            critic_loss += F.mse_loss(q, agent_target) / self.num_agents
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------- actors ----------------
        for learner_idx, learner in enumerate(self.actors):
            learner.actor_optimizer.zero_grad()

            # Only the learner's own action keeps its gradient.
            joint = []
            for other_idx, ob in enumerate(obs):
                proposed = self.actors[other_idx].actor_local(ob)
                if other_idx != learner_idx:
                    proposed = proposed.detach()
                joint.append(proposed)
            joint = torch.stack(joint).permute(1, 0, -1)
            joint = joint.reshape(-1,
                                  self.global_action_size).to(self.device)

            # Policy gradient: maximise the critic's value of the joint action.
            actor_loss = -self.critic.forward(obs_full, joint).mean()
            actor_loss.backward()
            learner.actor_optimizer.step()

        # ---------------- target networks ----------------
        soft_update(self.target_critic, self.critic, self.tau)
        for actor in self.actors:
            actor.update_target(self.tau)

        # ---------------- noise ----------------
        for actor in self.actors:
            actor.action_noise.reset()
        # Shrink the noise scale, but never below 0.01.
        self.noise_coef = max(self.noise_coef - self.noise_coef_decay, 0.01)
# Play one game with an epsilon-greedy policy, then (periodically) train on
# a replay sample and refresh the target network.
done = 0
total_reward = 0
step = agent.step_move()
# Epsilon falls linearly with the step counter, down to arg.epsilon_min.
epsilon = max(1 - step * arg.epsilon_decrease, arg.epsilon_min)
while not done:
    if np.random.uniform(0, 1) < epsilon:
        action = agent.random_action()
    else:
        action = agent.choose_action(obs)
    obs_, reward, done, _ = env.step(
        action + 1)  # shifted by one because there are only three actions
    replay_buffer.store_transition(obs, obs_, action, reward, done)
    total_reward += reward
    obs = obs_
print('in {}, {}th game: the reward {} '.format(
    arg.run_name, step, total_reward))
if step % train_period == 0:
    s1, s2, a, r, d = replay_buffer.sample(batch_size=train_batch)
    if step % record_period == 0:
        # Last argument True on recording steps; the reward is logged and the agent saved.
        loss = agent.train(s1, s2, a, r, d, True)
        agent.log_reward(total_reward)
        agent.save()
    else:
        loss = agent.train(s1, s2, a, r, d, False)
    print('{}th game: the training loss {}'.format(step, loss))
if step % arg.update_period == 0:
    agent.update_target_network()
class DDPG_Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, seed, device,
                 buffer_size=int(1e5), batch_size=128, num_batches=5,
                 update_every=10, gamma=0.99, tau=8e-3,
                 learning_rate_actor=1e-3, learning_rate_critic=1e-3,
                 weight_decay=0.0001, hidden_layers_actor=None,
                 hidden_layers_critic=None, add_noise=True, start_eps=5.0,
                 end_eps=0.0, end_eps_episode=500, agent_id=-1):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            seed (int): random seed
            device: torch device the networks and replay samples live on
            buffer_size (int; optional): replay buffer size
            batch_size (int; optional): minibatch size
            num_batches (int; optional): learning passes per update
            update_every (int; optional): environment steps between updates
            gamma (float; optional): discount factor
            tau (float; optional): for soft update of target parameters
            learning_rate_X (float; optional): learning rate for X=actor or critic
            weight_decay (float; optional): weight decay of the critic optimizer
            hidden_layers_actor (list of int; optional): actor layer sizes,
                defaults to [32, 32]
            hidden_layers_critic (list of int; optional): critic layer sizes,
                defaults to [32, 32, 32]
            add_noise (bool; optional): stored on the instance
            start_eps / end_eps (float; optional): initial and final noise scale
            end_eps_episode (int; optional): sets the per-update noise decay
            agent_id (int; optional): identifier stored on the instance
        """
        print('In DPPG_AGENT: seed = ', seed)
        # Fresh lists per instance instead of shared mutable defaults.
        if hidden_layers_actor is None:
            hidden_layers_actor = [32, 32]
        if hidden_layers_critic is None:
            hidden_layers_critic = [32, 32, 32]

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        # random.seed() returns None, so seed first and keep the real value;
        # the noise process below is constructed from self.seed.
        random.seed(seed)
        self.seed = seed
        self.device = device
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_every = update_every
        self.num_batches = num_batches
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = learning_rate_actor
        self.lr_critic = learning_rate_critic
        self.weight_decay_critic = weight_decay
        self.add_noise = add_noise
        self.start_eps = start_eps
        self.eps = start_eps
        self.end_eps = end_eps
        # set decay rate based on epsilon end target
        self.eps_decay = 1 / (end_eps_episode * num_batches)
        self.timestep = 0
        self.agent_id = agent_id

        ### SET UP THE ACTOR NETWORK ###
        model_params_actor = [state_size, action_size, seed, hidden_layers_actor]
        # Create the Actor Network (w/ Target Network)
        self.actor_local = Actor(*model_params_actor).to(self.device)
        self.actor_target = Actor(*model_params_actor).to(self.device)
        # Set up optimizer for the Actor network
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)

        ### SET UP THE CRITIC NETWORK ###
        model_params_critic = [state_size, action_size, seed, hidden_layers_critic]
        # Create the Critic Network (w/ Target Network)
        self.critic_local = Critic(*model_params_critic).to(self.device)
        self.critic_target = Critic(*model_params_critic).to(self.device)
        # Set up optimizer for the Critic Network
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay_critic)

        # Noise process
        self.noise = OUNoise(action_size, self.seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed,
                                   device)

    def step(self, states, actions, rewards, next_states, dones, agent_number):
        """Store a transition and, every ``update_every`` steps, learn from
        ``num_batches`` sampled batches."""
        # Increment timestep by 1
        self.timestep += 1
        # Save experience in replay memory
        self.memory.add(states, actions, rewards, next_states, dones)
        # If there are enough samples and a model update is to be made at this time step
        if (len(self.memory) > self.batch_size
                and self.timestep % self.update_every == 0):
            for _ in range(self.num_batches):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma, agent_number)

    def act(self, state, scale_noise=True):
        """Returns the clipped action for the given state as per current policy.

        Params
        ======
            state (array_like): observations; only ``state[0]`` is fed to the actor
            scale_noise (bool): when true, add ``eps``-scaled exploration
                noise; when false the noise sample is drawn but multiplied by 0

        Returns an array of shape (1, action_size) with values in [-1, 1].
        """
        state = torch.from_numpy(state).float().to(self.device)
        # Go to evaluation mode and get the action for the current state
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state[0]).cpu().data.numpy()
        # get back to train mode
        self.actor_local.train()
        # Keep the leading axis the caller receives. The sum must be an array
        # operation: ``+=`` on a Python list would append the noise values
        # instead of adding them element-wise.
        # The magnitude of the noise decreases as the agent keeps learning.
        action = action[np.newaxis, ...] + (
            int(scale_noise) * self.eps * self.noise.sample())
        return np.clip(action, -1.0, 1.0)

    def reset(self):
        """Reset the noise, all network parameters, both optimizers and the
        replay memory of this agent."""
        self.noise.reset()
        self.eps = self.start_eps
        self.timestep = 0
        self.critic_local.reset_parameters()
        self.actor_local.reset_parameters()
        self.critic_target.reset_parameters()
        self.actor_target.reset_parameters()
        # Re-create the optimizer for the Actor network
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)
        # Re-create the optimizer for the Critic Network
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay_critic)
        # Clear the experience buffer
        self.memory.clear_buffer()

    def reset_noise(self):
        """Reset the noise only."""
        self.noise.reset()

    def _joint_actions(self, own_actions, joint_actions, agent_number):
        """Replace this agent's slice of the two-agent joint action vector.

        Agent 0 owns the first ``action_size`` columns, any other agent the
        columns after them.
        """
        size = self.action_size
        if agent_number == 0:
            return torch.cat((own_actions, joint_actions[:, size:]), dim=1)
        return torch.cat((joint_actions[:, :size], own_actions), dim=1)

    def learn(self, experiences, gamma, agent_number):
        """Update critic and actor from one batch, then soft-update the targets.

        Params
        ======
            experiences: tuple (states, actions, rewards, next_states, dones);
                ``actions`` holds the concatenated actions of all agents
            gamma (float): discount factor
            agent_number (int): position of this agent in the joint action
        """
        states, actions, rewards, next_states, dones = experiences

        #### UPDATE CRITIC ####
        # The target is a constant for the critic update, so no graph is built.
        with torch.no_grad():
            actions_next = self._joint_actions(self.actor_target(next_states),
                                               actions, agent_number)
            Q_targets_next = self.critic_target(next_states, actions_next)
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip gradient @1
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        #### UPDATE ACTOR ####
        actions_pred = self._joint_actions(self.actor_local(states), actions,
                                           agent_number)
        # Negative sign: minimizing the loss is ascent on the critic's value.
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update the target networks using the local and target networks
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # update noise decay parameter
        self.eps = max(self.eps - self.eps_decay, self.end_eps)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        X_target = tau*X_local + (1 - tau)*X_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent(object):
    """
    The Agent interacts with and learns from the environment.

    One actor/critic pair (each with a target copy) and one replay memory
    serve all ``num_agents`` agents.
    """

    def __init__(self, state_size, action_size, num_agents, random_seed=0,
                 params=params):
        """ Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
            params (dict): settings; the keys read in this class are DEVICE,
                LR_ACTOR, LR_CRITIC, WEIGHT_DECAY, BUFFER_SIZE, BATCH_SIZE,
                GAMMA and TAU
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        # random.seed() returns None: seed the RNG, then keep the actual value.
        random.seed(random_seed)
        self.seed = random_seed
        self.params = params

        # Actor (Policy) Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(self.params['DEVICE'])
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(self.params['DEVICE'])
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.params['LR_ACTOR'])

        # Critic (Value) Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(self.params['DEVICE'])
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(self.params['DEVICE'])
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.params['LR_CRITIC'],
            weight_decay=self.params['WEIGHT_DECAY'])

        # Initialize target and local to same weights
        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.params['BUFFER_SIZE'],
                                   self.params['BATCH_SIZE'], random_seed)

    def hard_update(self, local_model, target_model):
        """ Hard update model parameters: copy every local weight into the target. """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def step(self, states, actions, rewards, next_states, dones):
        """ Save experiences in replay memory and use random sample from buffer to learn. """
        # One transition per agent goes into the shared memory.
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn if enough samples are available in memory
        if len(self.memory) > self.params['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.learn(experiences, self.params['GAMMA'])

    def act(self, states, add_noise=True):
        """ Returns actions for a given state as per current policy.

        Produces a (num_agents, action_size) array clipped to [-1, 1]; noise
        from the noise process is added when ``add_noise`` is true.
        """
        states = torch.from_numpy(states).float().to(self.params['DEVICE'])
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                actions[i, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        """ Reset the noise process. """
        self.noise.reset()

    def learn(self, experiences, gamma=params['GAMMA']):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---- Update Critic (Value) ----
        # Get predicted next-state actions and Q-values from the target networks
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimise the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip the gradient norm to 1 to stabilize learning
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---- Update Actor (Policy) ----
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ---- Update target networks ----
        self.soft_update(self.critic_local, self.critic_target,
                         tau=self.params['TAU'])
        self.soft_update(self.actor_local, self.actor_target,
                         tau=self.params['TAU'])

    def soft_update(self, local_model, target_model, tau=params['TAU']):
        """ Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
def train(graph_distr, epochs, batch_size, eps, n_step, discount, capacity,
          gcn_params, opt_params):
    '''
    Train a QNetwork by epsilon-greedy node selection on sampled graphs.

    graph_distr: object that wraps the graph generating distr
    epochs: int
    batch_size: int
    eps: float for exploration probability
    n_step: int, first step index at which transitions are stored
        (values below 1 are treated as 1, because step 0 has no predecessor)
    discount: float, how much to discount future state/action value
        (not referenced inside this function)
    capacity: int, number of episodes to keep in memory
    gcn_params: dictionary of graph conv net parameters
    opt_params: dictionary of params for optimizer (left unmodified)

    Returns the trained QNetwork.
    '''
    qnet = QNetwork(gcn_params)
    memory = ReplayBuffer(capacity)
    # Add the network parameters to a copy so the caller's dict is untouched.
    opt_params = dict(opt_params, params=qnet.parameters())
    optimizer = get_optimizer(opt_params)
    first_stored_step = max(n_step, 1)

    for e in range(epochs):
        node_labels, edge_weights, adj = graph_distr.next()
        embedding = qnet.embed_graph(node_labels, edge_weights, adj)

        state = []  # s_0: nodes visited so far, in order
        state_vec = Variable(torch.zeros((1, qnet.embed_dim)))
        state_vec_prev = None
        action_vec_prev = None
        rewards = []
        s_complement = set(range(len(adj)))  # nodes not yet visited
        losses = []

        for t in range(len(adj)):
            if t > 0:
                v_best_t = qnet.best_action(state, list(s_complement),
                                            embedding)
            # The very first node is always picked at random.
            if random.random() < eps or t == 0:
                v_t = random.choice(tuple(s_complement))
            else:
                v_t = v_best_t
            action_vec = embedding[v_t].unsqueeze(0)

            # Reward is the negated weight of the edge just travelled.
            vprev = None if t == 0 else state[-1]
            r_t = 0 if t == 0 else -edge_weights.data[vprev, v_t]
            s_complement.remove(v_t)

            # ideally store: s_0 , a_0, r_0, s_1, v_best_1
            # ideally store: s_1 , a_1, r_1, s_2, v_best_2
            if t >= first_stored_step:
                # v_best_t is the argmax action of the current state
                v_best_embedding = embedding[v_best_t].unsqueeze(0)
                episode = (state_vec_prev, action_vec_prev, rewards[-1],
                           state_vec, v_best_embedding)
                memory.push(*episode)

                if len(memory) > batch_size:
                    batch = memory.sample(batch_size)
                    batch_loss = qnet.backprop_batch(batch, optimizer)
                    losses.append(batch_loss)

            state_vec_prev = state_vec
            action_vec_prev = action_vec
            state.append(v_t)
            state_vec = state_vec + action_vec
            rewards.append(r_t)

        # No batch is trained until the memory holds more than batch_size
        # transitions; torch.cat rejects an empty list, so report NaN then.
        if losses:
            epoch_loss = float(torch.mean(torch.cat(losses)))
        else:
            epoch_loss = float('nan')
        print('Epoch {} | avg loss: {:.3f} | Exploration rate: {:.3f}'.format(
            e, epoch_loss, eps))
        eps = update_exploration(eps)

    return qnet
class Agent:
    """DQN agent with a soft-updated target network and a replay buffer."""

    def __init__(self,
                 state_size,
                 action_size,
                 device,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4):
        """Store the settings and build both Q-networks, the optimizer and the buffer."""
        # Problem dimensions and hardware.
        self.state_size = state_size
        self.action_size = action_size
        self.device = device
        # Hyperparameters.
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every

        # Online network, target network and the optimizer of the former.
        self.qnet_local = Model(state_size, action_size).to(self.device)
        self.qnet_target = Model(state_size, action_size).to(self.device)
        self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=self.lr)

        # Experience storage and the counter that paces learning.
        self.replay_buffer = ReplayBuffer(self.buffer_size, self.batch_size)
        self.update_step = 0

    def step(self, state, action, reward, next_state, done):
        """Record a transition; learn on every ``update_every``-th call once
        the buffer holds more than one batch."""
        self.replay_buffer.add(state, action, reward, next_state, done)
        self.update_step = (self.update_step + 1) % self.update_every
        if self.update_step != 0:
            return
        if len(self.replay_buffer) > self.batch_size:
            self.learn(self.replay_buffer.sample())

    def act(self, state, eps=0.0):
        """Return an epsilon-greedy action index for ``state``."""
        batch = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Query the network in eval mode, then restore train mode.
        self.qnet_local.eval()
        with torch.no_grad():
            action_values = self.qnet_local(batch)
        self.qnet_local.train()

        explore = random.random() <= eps
        if explore:
            return np.random.choice(self.action_size)
        return np.argmax(action_values.cpu().data.numpy())

    def learn(self, experiences):
        """Take one gradient step on the MSE TD error, then soft-update the target."""

        def to_device(array, integer=False):
            tensor = torch.from_numpy(array)
            tensor = tensor.long() if integer else tensor.float()
            return tensor.to(self.device)

        raw_states, raw_actions, raw_rewards, raw_next, raw_dones = experiences
        states = to_device(raw_states)
        actions = to_device(raw_actions, integer=True)
        rewards = to_device(raw_rewards)
        next_states = to_device(raw_next)
        dones = to_device(raw_dones)

        # Best next-state value per row from the target network, shaped B x 1.
        next_values = self.qnet_target(next_states).detach()
        best_next = next_values.max(1)[0]
        q_max = best_next.unsqueeze(1)
        y = rewards + self.gamma * q_max * (1 - dones)

        # Value the online network assigns to the action actually taken.
        q_expected = self.qnet_local(states).gather(1, actions)

        loss = F.mse_loss(q_expected, y)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update()

    def soft_update(self):
        """Blend the online weights into the target: tau*local + (1 - tau)*target."""
        pairs = zip(self.qnet_target.parameters(),
                    self.qnet_local.parameters())
        for target_param, local_param in pairs:
            blended = (self.tau * local_param.data +
                       (1 - self.tau) * target_param.data)
            target_param.data.copy_(blended)

    @staticmethod
    def _unpack(env_info):
        """Read (observation, reward, done) of the first agent from a brain info object."""
        observation = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        return observation, reward, done

    def train(self,
              env,
              n_episodes=2000,
              max_t=1000,
              eps_start=1.0,
              eps_end=0.01,
              eps_decay=0.995):
        """Train with decaying epsilon until the 100-episode mean reaches 13.0
        or ``n_episodes`` have run; returns the list of episode scores."""
        brain_name = env.brain_names[0]
        scores = []
        scores_window = deque(maxlen=100)
        eps = eps_start

        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations[0]
            score = 0
            for t in range(max_t):
                action = self.act(state, eps)
                env_info = env.step(action)[brain_name]
                next_state, reward, done = self._unpack(env_info)
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break

            scores_window.append(score)
            scores.append(score)
            avg_scores = np.mean(scores_window)
            eps = max(eps_end, eps_decay * eps)

            # Running status on one line; kept on its own line every 100 episodes.
            progress = f'\rEpisode {i_episode}\tAverage Score: {avg_scores:.2f}'
            print(progress, end='')
            if i_episode % 100 == 0:
                print(progress)
            if avg_scores >= 13.0:
                print(f'\nEnvironment solved in {i_episode - 100} episodes!'
                      f'\tAverage Score: {np.mean(scores_window):.2f}')
                torch.save(self.qnet_local.state_dict(), 'checkpoint.pth')
                break
        return scores

    def evaluate(self, env):
        """Run one greedy episode (at most 2000 steps) and print its total score."""
        brain_name = env.brain_names[0]
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        for _ in range(2000):
            env_info = env.step(self.act(state))[brain_name]
            state, reward, done = self._unpack(env_info)
            score += reward
            if done:
                break
        print(f'Total score: {score:.2f}')
class Agent():
    """Interacts with and learns from the environment"""

    def __init__(self,
                 state_size,
                 action_size,
                 fc1_units=256,
                 fc2_units=128,
                 device=torch.device('cpu')):
        """DQN agent

        Args:
            state_size (int): dimension of each state
            action_size (int): dimension of each action (or the number of action choices)
            fc1_units (int): passed to QNetwork as ``fc1_units``
            fc2_units (int): passed to QNetwork as ``fc2_units``
            device (torch.device): device holding the networks and replay samples
        """
        self.state_size = state_size
        self.action_size = action_size
        self.device = device

        # Online and target Q-networks share one architecture.
        layer_sizes = dict(fc1_units=fc1_units, fc2_units=fc2_units)
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       **layer_sizes).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        **layer_sizes).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # tau = 1 turns the soft update into a plain copy local -> target.
        self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)

        # Replay memory
        self.memory = ReplayBuffer(action_size,
                                   BUFFER_SIZE,
                                   BATCH_SIZE,
                                   device=self.device)

        # Step counter, wrapped every UPDATE_EVERY steps.
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store the transition and learn on every UPDATE_EVERY-th call."""
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step != 0:
            return
        # Learn only once more than one batch of samples is stored.
        if len(self.memory) > BATCH_SIZE:
            self.learn(self.memory.sample(), GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Args:
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        batch = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Evaluate without gradient tracking, then restore training mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(batch)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        greedy = random.random() > eps
        if not greedy:
            return random.choice(np.arange(self.action_size))
        return np.argmax(action_values.cpu().data.numpy())

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Args:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Largest target-network value of each next state, as a column.
        next_values = self.qnetwork_target(next_states).detach()
        Q_targets_next = next_values.max(1)[0].unsqueeze(1)
        # The bootstrap term is dropped when done == 1.
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Value the online network gives to the action actually taken.
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ----- Update the target network -----
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        theta_target = tau * theta_local + (1 - tau) * theta_target

        Args:
            local_model (torch.nn.Module): weights will be copied from
            target_model (torch.nn.Module): weights will be copied to
            tau (float): interpolation parameter
        """
        pairs = zip(target_model.parameters(), local_model.parameters())
        for target_param, local_param in pairs:
            mixed = tau * local_param.data + (1. - tau) * target_param.data
            target_param.data.copy_(mixed)
class DDPG(Model):
    """DDPG agent on a TF1 graph that explores by perturbing actor parameters.

    The class owns a replay buffer, a main and a target ``ActorCritic``, the
    two losses, one grouped optimisation op, target-network init/update ops
    and a pair of ops that add noise to / subtract noise from the actor's
    perturbable variables.
    """

    # ------------------------------ Interface ------------------------------
    def __init__(self, name, args, sess=None, reuse=False, log_tensorboard=True, save=True):
        """Read hyperparameters, create the replay buffer and build the graph.

        Args:
            name: key into ``args`` that holds this model's settings.
            args: nested configuration mapping.
            sess, reuse, log_tensorboard, save: forwarded to ``Model.__init__``.
        """
        self.learn_steps = 0

        # hyperparameters
        self.gamma = args[name]['gamma']
        self.tau = args[name]['tau']
        self.init_noise_sigma = args[name]['init_noise_sigma']
        self.noise_decay = args[name]['noise_decay']

        # replay buffer
        self.buffer = ReplayBuffer(sample_size=args['batch_size'], max_len=args[name]['buffer_size'])

        super(DDPG, self).__init__(name, args, sess=sess, reuse=reuse, build_graph=True,
                                   log_tensorboard=log_tensorboard, save=save)

        self._initialize_target_net()

    @property
    def main_variables(self):
        """Trainable variables of the main actor-critic."""
        return self.actor_critic.trainable_variables

    @property
    def _target_variables(self):
        """Trainable variables of the target actor-critic."""
        return self._target_actor_critic.trainable_variables

    def act(self, state):
        """Return the actor's action for ``state`` computed with perturbed parameters.

        Runs ``noise_op``, evaluates the actor, then runs ``denoise_op``.
        """
        self.sess.run(self.noise_op)
        state = state.reshape((-1, self.state_size))
        action = self.sess.run(self.actor_critic.actor_action,
                               feed_dict={self.actor_critic.state: state})
        self.sess.run(self.denoise_op)

        return np.squeeze(action)

    def step(self, state, action, reward, next_state, done):
        """Store a transition and learn once the buffer is large enough."""
        self.buffer.add(state, action, reward, next_state, done)

        # Wait for a margin of 100 transitions beyond one sample batch.
        if len(self.buffer) > self.buffer.sample_size + 100:
            self._learn()

    # ---------------------------- Implementation ----------------------------
    def _build_graph(self):
        """Assemble placeholders, networks, losses and all training ops."""
        # env info
        self._setup_env()

        # main actor-critic
        self.actor_critic = self._create_actor_critic()
        # target actor-critic
        self._target_actor_critic = self._create_actor_critic(is_target=True)

        # losses
        self.actor_loss, self.critic_loss = self._loss()

        # optimizing operation
        self.opt_op = self._optimize([self.actor_loss, self.critic_loss])

        # target net update operations
        self.init_target_op, self.update_target_op = self._targetnet_ops()

        # operations that add/remove noise from parameters
        self.noise_op, self.denoise_op = self._noise_params()

    def _setup_env(self):
        """Read state/action sizes from the config and create the input placeholders."""
        self.state_size = self._args[self.name]['state_size']
        self.action_size = self._args[self.name]['action_size']
        self.env_info = {}
        with tf.name_scope('placeholders'):
            self.env_info['state'] = tf.placeholder(
                tf.float32, shape=(None, self.state_size), name='state')
            self.env_info['action'] = tf.placeholder(
                tf.float32, shape=(None, self.action_size), name='action')
            self.env_info['next_state'] = tf.placeholder(
                tf.float32, shape=(None, self.state_size), name='next_state')
            self.env_info['reward'] = tf.placeholder(tf.float32, shape=(None, 1), name='reward')
            self.env_info['done'] = tf.placeholder(tf.uint8, shape=(None, 1), name='done')

    def _create_actor_critic(self, is_target=False):
        """Build the main or the target ``ActorCritic``; only the main one logs to TensorBoard."""
        name = 'target_actor_critic' if is_target else 'actor_critic'
        log_tensorboard = False if is_target else True
        actor_critic = ActorCritic(name, self._args, self.env_info, self.action_size,
                                   reuse=self.reuse, log_tensorboard=log_tensorboard,
                                   is_target=is_target)

        return actor_critic

    def _loss(self):
        """Build and return ``(actor_loss, critic_loss)`` including L2 regularisation terms."""
        with tf.name_scope('loss'):
            with tf.name_scope('l2_loss'):
                encoder_l2_loss = tf.losses.get_regularization_loss(
                    scope=self.actor_critic.variable_scope + '/state_encoder',
                    name='encoder_l2_loss')
                actor_l2_loss = tf.losses.get_regularization_loss(
                    scope=self.actor_critic.variable_scope + '/actor', name='actor_l2_loss')
                critic_l2_loss = tf.losses.get_regularization_loss(
                    scope=self.actor_critic.variable_scope + '/critic', name='critic_l2_loss')

            with tf.name_scope('actor_loss'):
                # The actor maximises Q(s, actor(s)), i.e. minimises its negative mean.
                actor_loss = tf.negative(
                    tf.reduce_mean(self.actor_critic.Q_with_actor),
                    name='actor_loss') + encoder_l2_loss + actor_l2_loss

            with tf.name_scope('critic_loss'):
                # Bootstrapped target; no gradient flows through it.
                target_Q = tf.stop_gradient(
                    self.env_info['reward']
                    + self.gamma * tf.cast(1 - self.env_info['done'], tf.float32)
                    * self._target_actor_critic.Q_with_actor,
                    name='target_Q')
                critic_loss = tf.losses.mean_squared_error(
                    target_Q, self.actor_critic.Q) + encoder_l2_loss + critic_l2_loss

            if self.log_tensorboard:
                tf.summary.scalar('actor_l2_loss_', actor_l2_loss)
                tf.summary.scalar('critic_l2_loss_', critic_l2_loss)
                tf.summary.scalar('encoder_l2_loss_', encoder_l2_loss)
                tf.summary.scalar('actor_loss_', actor_loss)
                tf.summary.scalar('critic_loss_', critic_loss)

        return actor_loss, critic_loss

    def _optimize(self, losses):
        """Return one op that applies both the actor and the critic update."""
        with tf.variable_scope('optimizer'):
            actor_loss, critic_loss = losses
            actor_opt_op = self._optimize_objective(actor_loss, 'actor')
            critic_opt_op = self._optimize_objective(critic_loss, 'critic')

            opt_op = tf.group(actor_opt_op, critic_opt_op)

        return opt_op

    def _optimize_objective(self, loss, name):
        """Build an Adam update with global-norm gradient clipping for one objective.

        Args:
            loss: scalar loss tensor to minimise.
            name: ``'actor'`` or ``'critic'``; selects the optimizer settings
                under ``args['actor_critic'][name]`` and the variable list.
        """
        # params for optimizer
        opt_args = self._args['actor_critic'][name]
        learning_rate = opt_args.get('learning_rate', 1e-3)
        beta1 = opt_args.get('beta1', .9)
        beta2 = opt_args.get('beta2', .999)
        # Read 'clip_norm' from the same mapping whose membership decides
        # whether the default applies (the old code tested one mapping and
        # indexed another).
        clip_norm = self._args['actor_critic'].get('clip_norm', 5.)

        with tf.variable_scope(name + '_opt', reuse=self.reuse):
            # setup optimizer
            self._optimizer = tf.train.AdamOptimizer(
                learning_rate=learning_rate, beta1=beta1, beta2=beta2)

            if name == 'actor':
                tvars = self.actor_critic.actor_trainable_variables
            else:
                tvars = self.actor_critic.critic_trainable_variables
            grads, tvars = list(zip(*self._optimizer.compute_gradients(loss, var_list=tvars)))
            grads, _ = tf.clip_by_global_norm(grads, clip_norm)
            opt_op = self._optimizer.apply_gradients(zip(grads, tvars))

        if self.log_tensorboard:
            with tf.name_scope(name):
                with tf.name_scope('gradients_'):
                    for grad, var in zip(grads, tvars):
                        if grad is not None:
                            tf.summary.histogram(var.name.replace(':0', ''), grad)
                with tf.name_scope('params_'):
                    for var in tvars:
                        tf.summary.histogram(var.name.replace(':0', ''), var)

        return opt_op

    def _targetnet_ops(self):
        """Return ``(init_target_op, update_target_op)`` lists of assign ops."""
        with tf.name_scope('target_net_op'):
            target_main_var_pairs = list(zip(self._target_variables, self.main_variables))
            # Hard copy main -> target.
            init_target_op = [
                tf.assign(target, main, name='init_target_op')
                for target, main in target_main_var_pairs
            ]
            # Polyak average: target <- tau * main + (1 - tau) * target.
            update_target_op = [
                tf.assign(target, self.tau * main + (1. - self.tau) * target,
                          name='update_target_op')
                for target, main in target_main_var_pairs
            ]

        return init_target_op, update_target_op

    def _learn(self):
        """Sample a batch, run one optimisation step and update the target networks."""
        states, actions, rewards, next_states, dones = self.buffer.sample()

        feed_dict = {
            self.env_info['state']: states,
            self.env_info['action']: actions,
            self.env_info['reward']: rewards,
            self.env_info['next_state']: next_states,
            self.env_info['done']: dones,
        }

        # update the main networks
        if self.log_tensorboard:
            _, summary = self.sess.run([self.opt_op, self.merged_op], feed_dict=feed_dict)
            self.writer.add_summary(summary, self.learn_steps)
        else:
            _ = self.sess.run(self.opt_op, feed_dict=feed_dict)

        # update the target networks
        self.sess.run(self.update_target_op)

        self.learn_steps += 1

    def _noise_params(self):
        """Return ``(noise_op, denoise_op)`` acting on the actor's perturbable variables.

        ``noise_op`` depends on an op that multiplies ``noise_sigma`` by
        ``noise_decay``, so the noise scale decays every time it is run.
        """
        with tf.variable_scope('noise'):
            noise_sigma = tf.get_variable('noise_sigma', initializer=self.init_noise_sigma,
                                          trainable=False)

            noise_decay_op = tf.assign(noise_sigma, self.noise_decay * noise_sigma,
                                       name='noise_decay_op')

            # NOTE(review): `noise` is a random op rather than a stored value.
            # `act` runs noise_op and denoise_op in separate sess.run calls, so
            # presumably each call draws a fresh sample and the subtraction does
            # not cancel the addition exactly - confirm and, if so, persist the
            # sample in a variable.
            param_noise_pairs = []
            for var in self.actor_critic.actor_perturbable_variables:
                noise = tf.truncated_normal(tf.shape(var), stddev=noise_sigma)
                param_noise_pairs.append((var, noise))

            with tf.control_dependencies([noise_decay_op]):
                noise_op = [
                    tf.assign(var, var + noise, name='noise_op')
                    for var, noise in param_noise_pairs
                ]
            denoise_op = [
                tf.assign(var, var - noise, name='denoise_op')
                for var, noise in param_noise_pairs
            ]

        return noise_op, denoise_op

    def _initialize_target_net(self):
        """Copy the main network's variables into the target network."""
        self.sess.run(self.init_target_op)
class Agent():
    """Interacts with and learns from the environment.

    Each agent owns an actor network and its target copy. The critic is the
    module-level ``mCritic`` object, which ``train`` reads and updates.
    """

    def __init__(self, state_size, action_size, agent_id, args):
        """Build the actor networks, load saved weights and create memory and noise.

        Args:
            state_size: dimension of each state.
            action_size: dimension of each action.
            agent_id: falsy selects ``args['agent_p0_path']``, truthy selects
                ``args['agent_p1_path']`` as the checkpoint to load.
            args: mapping with at least 'seed', 'device', 'LR_ACTOR',
                'BUFFER_SIZE', 'BATCH_SIZE' and the two checkpoint paths.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = args['seed']
        self.device = args['device']
        self.args = args

        # Actor network and its target copy
        self.actor_network = ActorNetwork(state_size, action_size, args).to(self.device)
        self.actor_target = ActorNetwork(state_size, action_size, args).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_network.parameters(), lr=args['LR_ACTOR'])

        # Model takes too long to run --> load model weights from previous run
        # (took > 24hours on my machine)
        checkpoint_path = args['agent_p1_path'] if agent_id else args['agent_p0_path']
        saved_weights = torch.load(checkpoint_path)
        self.actor_network.load_state_dict(saved_weights, strict=False)
        self.actor_target.load_state_dict(saved_weights, strict=False)

        # Replay memory
        self.memory = ReplayBuffer(action_size, args['BUFFER_SIZE'], args['BATCH_SIZE'], self.seed)

        # Noise process
        self.noise = OUNoise(action_size, self.seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and train once more than a batch is available."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.args['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.train(experiences)

    def act(self, current_state):
        """Return the actor's action for ``current_state`` plus noise, clipped to [-1, 1]."""
        input_state = torch.from_numpy(current_state).float().to(self.device)

        # Evaluate without tracking gradients, then restore training mode.
        self.actor_network.eval()
        with torch.no_grad():
            action = self.actor_network(input_state).cpu().data.numpy()
        self.actor_network.train()

        action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def train(self, experiences):
        """Update the shared critic, this agent's actor and both target networks.

        Args:
            experiences: tuple ``(states, actions, rewards, next_states, dones)``.
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)
            Q_targets_next = mCritic.target(next_states, actions_next)
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = mCritic.network(states, actions)
        mCritic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        mCritic.optimizer.zero_grad()
        mCritic_loss.backward()
        mCritic.optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # The actor is trained to maximise the critic's value of its own actions.
        actions_pred = self.actor_network(states)
        actor_loss = -mCritic.network(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(mCritic.network, mCritic.target, TAU)
        self.soft_update(self.actor_network, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent():
    """Acts in the environment and learns a Q-function from replayed experience."""

    def __init__(self, state_size, action_size, seed, model=QNetwork):
        """Set up networks, optimizer, replay memory and experiment tracking.

        Param
        =====
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            model (object): model class used for both the local and target network

        Return
        ======
            None
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed

        # Local (trained) and target (slow-moving) Q-networks
        self.qnetwork_local = model(state_size, action_size, seed).to(device)
        self.qnetwork_target = model(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=hyperparameters["lr"])

        # Experience replay storage
        self.memory = ReplayBuffer(action_size,
                                   hyperparameters["buffer_size"],
                                   hyperparameters["batch_size"],
                                   seed,
                                   device)

        # Step counter, taken modulo hyperparameters["update_every"]
        self.t_step = 0

        # Register the run and its hyperparameters with the trackers
        wandb.login()
        wandb.init(project=project_name, name=name, config=hyperparameters)
        jovian.log_hyperparams(hyperparameters)

    def step(self, state, action, reward, next_state, done):
        """Record a transition; learn on every ``update_every``-th call."""
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % hyperparameters["update_every"]
        if self.t_step != 0:
            return
        # Learn only once the memory holds more than one batch of samples.
        if len(self.memory) > hyperparameters["batch_size"]:
            self.learn(self.memory.sample(), hyperparameters["gamma"])

    def act(self, state, eps=0.):
        """Pick an action for ``state`` with an epsilon-greedy rule.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state_batch = torch.from_numpy(state).float().unsqueeze(0).to(device)

        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state_batch)
        self.qnetwork_local.train()

        greedy = random.random() > eps
        if not greedy:
            return random.choice(np.arange(self.action_size))
        return np.argmax(action_values.cpu().data.numpy())

    def learn(self, experiences, gamma):
        """Take one gradient step on a batch, then soft-update the target network.

        Params:
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', d) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Max over actions of the target network's Q(s', .)
        target_q_all = self.qnetwork_target(next_states).detach()
        best_next_q = target_q_all.max(1)[0].unsqueeze(1)
        td_targets = rewards + (gamma * best_next_q * (1 - dones))

        # Q-values of the actions that were actually taken
        q_taken = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(q_taken, td_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target,
                         hyperparameters["tau"])

    def soft_update(self, local_model, target_model, tau):
        """Blend local weights into the target model.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        param_pairs = zip(target_model.parameters(), local_model.parameters())
        for dst, src in param_pairs:
            blended = tau * src.data + (1.0 - tau) * dst.data
            dst.data.copy_(blended)

    def get_model_name(self):
        """Return the module-level run ``name``."""
        return name

    def get_project_name(self):
        """Return the module-level ``project_name``."""
        return project_name
class Agent():
    """Double-DQN agent (code adapted from the Udacity course)."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the generator and keep the
        # value separately instead of storing the call's result.
        random.seed(seed)
        self.seed = seed

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn on every UPDATE_EVERY-th call."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        The local network selects the next action and the target network
        evaluates it.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Index of the best next action according to the local network
        indexes_of_Q_local_for_next_states = self.qnetwork_local(
            next_states).detach().max(1)[1].unsqueeze(1)

        # Value of that action according to the target network
        Q_target_for_next_states = self.qnetwork_target(next_states).detach()
        Q_thetas = Q_target_for_next_states.gather(1, indexes_of_Q_local_for_next_states)

        Q_targets = rewards + (gamma * Q_thetas * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters. Polyak averaging

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DDPG:
    """DDPG learner that couples an ``Actor``, a ``Critic`` and a replay buffer."""

    def __init__(self, env, batch_size, mem_size, discount, actor_params, critic_params):
        """Create the TF session, read the env's space bounds and build the models.

        Args:
            env: environment exposing ``observation_space`` and ``action_space``
                with ``shape``, ``low`` and ``high``.
            batch_size: number of experiences sampled per training step.
            mem_size: capacity passed to ``ReplayBuffer``.
            discount: discount factor applied to the next-state Q estimate.
            actor_params: forwarded to ``Actor``.
            critic_params: forwarded to ``Critic``.
        """
        self._batch_size = batch_size
        self._mem_size = mem_size
        self._discount = discount
        self._sess = tensorflow.Session()
        k_backend.set_session(self._sess)
        self._env = env
        self._state_dim = env.observation_space.shape[0]
        self._action_dim = env.action_space.shape[0]
        self._action_min = env.action_space.low
        self._action_max = env.action_space.high
        self._state_min = env.observation_space.low
        self._state_max = env.observation_space.high
        self._actor = Actor(self._sess, self._state_dim, self._action_dim,
                            self._action_min, self._action_max, actor_params)
        self._critic = Critic(self._sess, 0.5, self._state_dim, self._action_dim,
                              critic_params)
        self._memory = ReplayBuffer(mem_size)

    def get_action(self, state):
        """Return the actor model's prediction for ``state``."""
        return self._actor._model.predict(state)

    def train(self):
        """Train once the replay buffer holds more than ``batch_size`` experiences."""
        if self._memory.size() > self._batch_size:
            self._train()

    def _train(self):
        """Sample a batch, fit the critic, then push the critic's action gradients to the actor."""
        states, actions, rewards, done, next_states = self._memory.sample(self._batch_size)
        self._train_critic(states, actions, rewards, done, next_states)
        action_gradients = self._critic.action_gradients(states, actions)
        self._actor.train(states, action_gradients)

    def q_estimate(self, state, action):
        """Return the critic model's prediction for ``(state, action)``."""
        # NOTE(review): `action` is passed as predict's second positional
        # argument. If `_model` is a plain Keras model that slot is
        # `batch_size`, and the call would need `predict([state, action])` -
        # confirm against the Critic implementation.
        return self._critic._model.predict(state, action)

    def _get_q_targets(self, next_states, done, rewards):
        """Return the Bellman targets for a batch as a list.

        q = r                  if the transition ended the episode
        q = r + gamma * qnext  otherwise

        ``qnext`` is the critic's estimate for the next state paired with the
        action the actor currently picks there.
        """
        actions = self.get_action(next_states)
        qnext = self.q_estimate(next_states, actions)
        # The bootstrap term is added to the reward; the previous code
        # multiplied the two, which zeroed the target whenever reward was 0.
        return [
            reward if end else reward + self._discount * next_q
            for reward, next_q, end in zip(rewards, qnext, done)
        ]

    def _train_critic(self, states, actions, rewards, done, next_states):
        """Fit the critic on the Bellman targets of the batch."""
        q_targets = self._get_q_targets(next_states, done, rewards)
        self._critic.train(states, actions, q_targets)

    def experience(self, state, action, reward, done, next_state):
        """Store one transition in the replay buffer and attempt a training step."""
        self._memory.add(state, action, reward, done, next_state)
        self.train()
class QMixTrainer:
    """Drives the collect / store / train / evaluate loop for ``MultiAgents``."""

    def __init__(self, env, args):
        """Create the agents, the episode collector and the replay buffer from ``args``."""
        self.env = env
        self.args = args
        self.agents = MultiAgents(args)
        self.train_datacollector = DataCollector(self.env, self.agents, args)
        self.replaybuffer = ReplayBuffer(args)

    def evaluate(self):
        """Return the mean episode reward over ``args.eval_episodes`` non-training episodes."""
        mean_episode_reward = 0
        for _ in range(self.args.eval_episodes):
            _, episode_reward = self.train_datacollector.collect_one_episode_data(
                if_train=False)
            mean_episode_reward += episode_reward
        return mean_episode_reward / self.args.eval_episodes

    def train(self, init_episodes=10000):
        """Fill the replay buffer, then alternate data collection and training.

        For each of ``args.n_epoch`` epochs the trainer collects
        ``args.n_episodes_per_epoch`` episodes, stores them, runs
        ``args.n_train_steps_per_epoch`` updates, and every
        ``args.evaluate_freq`` epochs evaluates and writes the reward plot and
        the history text files into ``args.resource_dir``.

        Args:
            init_episodes: number of episodes collected with
                ``if_init_buffer=True`` before training starts.
        """
        episode_rewards = []
        loss_history = []
        eval_episode_rewards = []
        train_steps = 0

        print("Initializing replay buffer...")
        episodes_data = []
        for epsd in range(init_episodes):
            episode_data, episode_reward = self.train_datacollector.collect_one_episode_data(
                epsd, if_train=False, if_init_buffer=True)
            if episode_reward == 1:
                print("goal !!!!!")
            episodes_data.append(episode_data)
        print("collected {} episodes".format(len(episodes_data)))
        if episodes_data:
            self.replaybuffer.store_episode(self._stack_episodes(episodes_data))

        print("Start to train")
        plt.figure()
        # Remembered so the final save does not rely on the loop variable,
        # which is undefined when n_epoch is 0.
        last_epoch = None
        for epoch in range(self.args.n_epoch):
            last_epoch = epoch
            print("Training Epoch {} epsilon: {}".format(
                epoch, self.train_datacollector.epsilon))

            episodes_data = []
            for epsd in range(self.args.n_episodes_per_epoch):
                episode_data, episode_reward = self.train_datacollector.collect_one_episode_data(
                    epsd, if_train=True)
                episodes_data.append(episode_data)
                episode_rewards.append(episode_reward)
            if episodes_data:
                self.replaybuffer.store_episode(self._stack_episodes(episodes_data))

            for _ in range(self.args.n_train_steps_per_epoch):
                # Never request more samples than the buffer currently holds.
                mini_batch = self.replaybuffer.sample(
                    min(self.replaybuffer.current_size, self.args.batch_size))
                loss = self.agents.train(mini_batch, train_steps)
                loss_history.append(loss)
                train_steps = train_steps + 1

            if epoch % self.args.evaluate_freq == 0:
                mean_episode_reward = self.evaluate()
                eval_episode_rewards.append(mean_episode_reward)
                print(
                    "Evaluation Result (Mean Episode Reward) of Epoch {} is : {}"
                    .format(epoch, mean_episode_reward))
                self._save_progress(epoch, episode_rewards, eval_episode_rewards,
                                    loss_history)

        if last_epoch is not None:
            self._save_progress(last_epoch, episode_rewards, eval_episode_rewards,
                                loss_history)

    @staticmethod
    def _stack_episodes(episodes_data):
        """Stack a non-empty list of per-episode array dicts into one dict of float arrays."""
        count = len(episodes_data)
        batch_data = {}
        for key, first in episodes_data[0].items():
            stacked = np.zeros((count, ) + first.shape)
            for idx, episode in enumerate(episodes_data):
                stacked[idx] = episode[key]
            batch_data[key] = stacked
        return batch_data

    def _save_progress(self, epoch, episode_rewards, eval_episode_rewards, loss_history):
        """Write the episode-reward plot and the three history text files to ``resource_dir``."""
        plt.cla()
        plt.plot(range(len(episode_rewards)), episode_rewards)
        plt.xlabel('episode')
        plt.ylabel('episode reward')
        plt.savefig(
            os.path.join(self.args.resource_dir,
                         "episode_reward_epoch_{}.png".format(epoch)))

        np.savetxt(os.path.join(self.args.resource_dir, "episode_rewards.txt"),
                   episode_rewards, fmt="%.4f")
        np.savetxt(os.path.join(self.args.resource_dir, "eval_episode_rewards.txt"),
                   eval_episode_rewards, fmt="%.4f")
        np.savetxt(os.path.join(self.args.resource_dir, "loss_history.txt"),
                   loss_history)
def train(self, transitions: int, eps_max: float = 0.5, eps_min: float = 0.,
          buffer_size: int = 10000, batch_size: int = 128, shaping_coef: float = 300.,
          progress_upd_step: int = 0, start_training: int = 10000, to_sink: bool = False):
    """Train with epsilon-greedy exploration, reward shaping and a soft target update.

    Args:
        transitions: number of environment steps to run.
        eps_max: exploration probability at step 0.
        eps_min: value the exploration probability decays towards linearly.
        buffer_size: capacity of the replay buffer.
        batch_size: number of transitions sampled per update.
        shaping_coef: weight of the shaping term added to the reward.
        progress_upd_step: evaluate every this many steps; a falsy value
            selects ``transitions // 100`` (at least 1).
        start_training: updates begin once the step index exceeds this value.
        to_sink: when true, ``self.sink`` is called once after evaluation
            reaches a mean reward of 90, and ``shaping_coef`` is then set to 1.

    Returns:
        dict with the run's hyperparameters and the "step", "reward_mean"
        and "reward_std" series recorded at each evaluation.
    """
    history = ReplayBuffer(size=buffer_size)
    # transitions // 100 is 0 for runs shorter than 100 steps, which would
    # make the modulo in the loop divide by zero.
    progress_upd_step = progress_upd_step if progress_upd_step else max(1, transitions // 100)

    log = {
        "alpha": self.alpha,
        "gamma": self.gamma,
        "buffer_size": buffer_size,
        "batch_size": batch_size,
        "tau": self.tau,
        "shaping_coef": shaping_coef,
        "eps_max": eps_max,
        "eps_min": eps_min,
        "bins": self.num_bins,
        "to_sink": to_sink,
        "step": [],
        "reward_mean": [],
        "reward_std": []
    }

    state = self.reset()

    t = tqdm(range(transitions))
    for i in t:
        # Linear decay from eps_max towards eps_min over the whole run.
        eps = eps_max - (eps_max - eps_min) * i / transitions
        if random() < eps:
            action = self.env.action_space.sample()
        else:
            action = self.act(state)

        next_state, reward, done, _ = self.env.step(action)
        # Shaping term built from |state[1]| before and after the step.
        reward += shaping_coef * (self.gamma * np.abs(next_state[1]) - np.abs(state[1]))
        # Terminal flag stored in the buffer is derived from state[0],
        # independently of the env's own `done`.
        done_ = next_state[0] > 0.5

        history.add((state, action, next_state, reward, done_))

        state = self.reset() if done else next_state

        if i > start_training:
            self.update(history.sample(batch_size))

            # soft update: target <- (1 - tau) * target + tau * online
            with torch.no_grad():
                for param, param_target in zip(self.dqn.parameters(),
                                               self.dqn_target.parameters()):
                    param_target.data.mul_(1 - self.tau)
                    param_target.data.add_(self.tau * param.data)

        if (i + 1) % progress_upd_step == 0:
            reward_mean, reward_std = self.evaluate_policy()

            log["step"].append(i)
            log["reward_mean"].append(reward_mean)
            log["reward_std"].append(reward_std)

            t.set_description(f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}")

            # Confirm the result on 100 episodes before sinking, then switch
            # the shaping weight to 1 and never sink again.
            if to_sink and reward_mean >= 90 and self.evaluate_policy(episodes=100)[0] >= 90:
                self.sink(history, start_training, eps, shaping_coef)
                shaping_coef = 1
                to_sink = False

    return log
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        """Build actor/critic models, the noise process and the replay memory for ``task``.

        Args:
            task: object exposing ``state_size``, ``action_size``,
                ``action_low``, ``action_high`` and ``reset()``.
        """
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.0  # 0.0
        self.exploration_theta = 0.1  # 0.15
        self.exploration_sigma = 0.1  # 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

    def reset_episode(self):
        """Reset the noise process and the task; return the initial state."""
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Store the transition from ``last_state`` and learn when enough samples exist."""
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Return the local actor's action for ``state`` with exploration noise added."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def act_no_noise(self, state):
        """Return the local actor's action for ``state`` without exploration noise."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None
                          ]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        Sets every target weight array to
        ``tau * local + (1 - tau) * target``.
        """
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        # Blend array by array: the per-layer weights differ in shape, so
        # packing the whole list into one ndarray fails on current NumPy.
        new_weights = [
            self.tau * local + (1 - self.tau) * target
            for local, target in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)
class Agent():
    """Interacts with and learns from the environment.

    DDPG learner for one of two agents. The actor consumes an input of width
    ``state_size * 2`` and the critic additionally consumes a joint action of
    width ``action_size * 2`` (both agents' actions side by side).
    """

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the RNG and keep the value itself.
        random.seed(random_seed)
        self.seed = random_seed
        # Multiplier applied to the exploration noise; decayed in learn().
        self.eps = 3.0
        self.eps_decay = 0.9999

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size * 2, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size * 2, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size * 2, action_size * 2, random_seed).to(device)
        self.critic_target = Critic(state_size * 2, action_size * 2, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=0)

        # Noise process
        self.noise = OUNoise((1, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done, agent_number, learn_iterations=5):
        """Save experience in replay memory, and use random sample from buffer to learn.

        Once the buffer holds more than ``BATCH_SIZE`` entries, ``learn`` is
        run ``learn_iterations`` times, each on a freshly sampled batch.
        """
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            for _ in range(learn_iterations):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, agent_number)

    def act(self, states, add_noise):
        """Return the local actor's actions for ``states``, clipped to [-1, 1].

        When ``add_noise`` is truthy, a noise sample scaled by ``self.eps`` is
        added before clipping.
        """
        states = torch.from_numpy(states).float().to(device)
        # Evaluate in eval mode without tracking gradients, then restore train mode.
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        # add noise to actions
        if add_noise:
            actions += self.eps * self.noise.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def _joint_actions(self, own_actions, actions, agent_number):
        """Return the joint action batch with this agent's slot set to ``own_actions``.

        The other agent's half is taken from ``actions``, the joint actions
        stored in the sampled batch.
        """
        if agent_number == 0:
            return torch.cat((own_actions, actions[:, self.action_size:]), dim=1)
        if agent_number == 1:
            return torch.cat((actions[:, :self.action_size], own_actions), dim=1)
        raise ValueError(f"agent_number must be 0 or 1, got {agent_number!r}")

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            agent_number (int): 0 or 1; selects which half of the joint action
                belongs to this agent

        Raises
        ======
            ValueError: if ``agent_number`` is neither 0 nor 1
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # The TD target is a constant for the critic loss, so build it without
        # tracking gradients through the target networks.
        with torch.no_grad():
            # Since the critic takes the actions of both agents we need to
            # update only this agent's part of the given action.
            actions_next = self._joint_actions(
                self.actor_target(next_states), actions, agent_number)
            # Compute Q targets for current states (y_i)
            Q_targets_next = self.critic_target(next_states, actions_next)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        actions_pred = self._joint_actions(
            self.actor_local(states), actions, agent_number)
        # Maximising the critic's value is minimising its negation.
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # update epsilon; the noise multiplier decays from its start value but
        # never drops below 1
        self.eps *= self.eps_decay
        self.eps = max(self.eps, 1)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
def train_v1(eps_start, eps_end, eps_decay, n_step, mem_capacity, num_episodes,
             embed_dim, iters):
    """Train a Struc2Vec embedding and a Q-network on generated graphs.

    Each episode draws a graph, embeds its vertices once, and builds a tour one
    vertex at a time with an epsilon-greedy policy. From step ``n_step``
    onwards an n-step transition is pushed to the replay memory and one
    optimisation step is taken on a sampled batch.

    Params
    ======
        eps_start, eps_end, eps_decay: forwarded to ``util.get_eps_threshold``
        n_step (int): look-back length used for the n-step transition
        mem_capacity (int): capacity passed to ``ReplayBuffer``
        num_episodes (int): number of graphs to train on
        embed_dim (int): embedding width for ``Struc2Vec`` / ``QNet``
        iters (int): second argument passed to ``Struc2Vec``

    NOTE(review): ``batch_size`` and ``gamma`` are not parameters; they are
    resolved as module-level names -- confirm they are defined in this file.
    """
    graph_generator = GraphGenerator(16, 16)
    memory = ReplayBuffer(mem_capacity)
    steps_done = 0
    gnn = Struc2Vec(embed_dim, iters)
    qnet = QNet(embed_dim)
    optimizer = optim.Adam(list(gnn.parameters()) + list(qnet.parameters()),
                           lr=0.0001,
                           weight_decay=1e-4)

    for _episode in range(num_episodes):
        node_labels, adj, edge_weights = graph_generator.next()
        vtx_feats = gnn(node_labels, adj, edge_weights)
        remaining_vertices = set(range(len(adj)))
        state = Variable(torch.zeros(embed_dim))
        curr_tour = []
        T = len(adj)
        rewards = []
        states = [state]

        for t in range(T):
            eps_threshold = util.get_eps_threshold(eps_start, eps_end,
                                                   eps_decay, steps_done)
            if random.random() > eps_threshold:
                # arg max action
                # NOTE(review): here arg_max_action is used as returning a
                # vertex id, while the call further down unpacks two values;
                # confirm its actual return shape.
                curr_vtx = arg_max_action(qnet, vtx_feats, remaining_vertices)
            else:
                # random action; random.sample() no longer accepts a set on
                # recent Python versions, so pick from a tuple snapshot.
                curr_vtx = random.choice(tuple(remaining_vertices))
            action = vtx_feats[curr_vtx]

            # reward maintenance
            reward = get_reward(curr_tour, curr_vtx, edge_weights)
            rewards.append(reward)

            # update states
            curr_tour.append(curr_vtx)
            remaining_vertices.remove(curr_vtx)
            # `state` itself is advanced only after the memory update below,
            # so the transition is recorded with the pre-step state.
            states.append(state + action)

            # we only do these updates after n steps
            if t >= n_step:
                # NOTE(review): the result is unused and remaining_vertices may
                # be empty on the last step; presumably the max action was
                # meant to be stored in memory -- confirm.
                _, next_reward = arg_max_action(qnet, vtx_feats,
                                                remaining_vertices)
                state_tminusn = states[-n_step]  # this is a torch tensor
                # curr_tour[-n_step] gives the vertex id
                action_tminusn = vtx_feats[curr_tour[-n_step]]
                reward_tminusn = sum(rewards[-n_step:])
                memory.push(state_tminusn, action_tminusn, reward_tminusn,
                            state, action)

                transitions = memory.sample(batch_size)
                # batch.state, batch.action, batch.reward, etc are now tuples
                batch = Transition(*zip(*transitions))
                state_batch = torch.cat(
                    [s.unsqueeze(0) for s in batch.state], dim=0)
                action_batch = torch.cat(
                    [a.unsqueeze(0) for a in batch.action], dim=0)
                # NOTE(review): torch.cat requires tensors; confirm that
                # get_reward returns tensors rather than Python numbers.
                reward_batch = torch.cat(batch.reward)
                newstate_batch = torch.cat(
                    [ns.unsqueeze(0) for ns in batch.new_state], dim=0)
                max_action_batch = torch.cat(
                    [ma.unsqueeze(0) for ma in batch.max_action], dim=0)

                # TODO: make qnet allow batch
                # does the experience replay memory contain state/action/reward/next_state
                # from only the current episode's graph? Or can any graph seen before be
                # in the memory?
                # The argmax action is the thing taken at time t-n_step right?
                oldstate_action_value = qnet(state_batch, action_batch)
                newstate_action_value = qnet(newstate_batch, max_action_batch)
                expected_sa_values = reward_batch + gamma * newstate_action_value
                loss = F.mse_loss(oldstate_action_value, expected_sa_values)

                optimizer.zero_grad()
                loss.backward()
                # clamp grads?
                optimizer.step()

            # Rebind instead of `state += action`: the in-place add would also
            # mutate tensors already stored in `states` and the replay memory.
            state = states[-1]
            steps_done += 1
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers=None,
                 drop_p=0.3,
                 with_dueling=False,
                 isDDQN=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (array): Hidden number of nodes in each layer;
                defaults to [64, 64] when omitted
            drop_p (float [0-1]) : Probability of dropping nodes (implementation of dropout)
            with_dueling (boolean) : If true, network is dueling network, otherwise false.
            isDDQN (boolean) : If true, double dqn is implemented, otherwise false.
        """
        # Build the default per instance rather than sharing one mutable list
        # across every Agent created without an explicit value.
        if hidden_layers is None:
            hidden_layers = [64, 64]

        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the RNG and keep the value itself.
        random.seed(seed)
        self.seed = seed

        # Q-Network
        self.qnetwork_local = QNetwork(state_size,
                                       action_size,
                                       seed,
                                       hidden_layers=hidden_layers,
                                       drop_p=drop_p,
                                       dueling=with_dueling).to(device)
        self.qnetwork_target = QNetwork(state_size,
                                        action_size,
                                        seed,
                                        hidden_layers=hidden_layers,
                                        drop_p=drop_p,
                                        dueling=with_dueling).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        # Whether learn() builds Double-DQN targets.
        self.isDDQN = isDDQN

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn from a sampled batch every UPDATE_EVERY calls."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Evaluate in eval mode without tracking gradients, then restore train mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        if self.isDDQN:
            # Double DQN: the local network picks the greedy action for each
            # NEXT state and the target network supplies that action's value.
            best_local_actions = self.qnetwork_local(next_states).detach().max(
                1)[1].unsqueeze(1)
            double_dqn_targets = self.qnetwork_target(next_states).detach()
            Q_targets_next = torch.gather(double_dqn_targets, 1,
                                          best_local_actions)
        else:
            # Get max predicted Q values (for next states) from target model (without ddqn)
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        # Compute Q targets for current states; (1 - dones) removes the
        # bootstrap term for terminal transitions.
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)