def do_q_learning(env, reward_function, train_episodes, figure=False):
    alpha = 0.01
    gamma = 0.9
    epsilon = 0.1
    # 4 actions output, up, right, down, left
    policy = DQNPolicy(env, lr=alpha, gamma=gamma, input=2, output=4)
    replay_buffer = ReplayBuffer()

    # Play with a random policy and see
    # run_current_policy(env.env, policy)

    agg_interval = 100
    avg_history = {'episodes': [], 'timesteps': [], 'reward': []}

    # Train the network to predict actions for each of the states
    for episode_i in range(train_episodes):
        episode_timestep = 0
        episode_reward = 0.0

        env.__init__()  # todo: the first current state should be 0
        cur_state = env.cur_state

        counter = 0
        done = False
        while not done:
            # Let each episode be of 30 steps
            counter += 1
            done = counter >= 30  # todo: check if this line is working

            action = policy.select_action(cur_state.reshape(1, -1), epsilon)

            # take action in the environment
            next_state = env.step(action)
            reward = reward_function(next_state)

            # add the transition to replay buffer
            replay_buffer.add(cur_state, action, next_state, reward, done)

            # sample minibatch of transitions from the replay buffer
            # the sampling is done every timestep and not every episode
            sample_transitions = replay_buffer.sample()

            # update the policy using the sampled transitions
            policy.update_policy(**sample_transitions)

            episode_reward += reward
            episode_timestep += 1
            cur_state = next_state

        avg_history['episodes'].append(episode_i + 1)
        avg_history['timesteps'].append(episode_timestep)
        avg_history['reward'].append(episode_reward)
        learning_policy_progress.update()

    if figure:
        plt.plot(avg_history['episodes'], avg_history['reward'])
        plt.title('Reward')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.show()

    return policy.q_model
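# Reference sketch: do_q_learning relies on DQNPolicy.select_action and
# ReplayBuffer, which are defined elsewhere in the project. Below is a
# minimal, self-contained sketch of the epsilon-greedy selection that
# select_action is assumed to perform; q_model is a hypothetical stand-in
# network (2 state dims -> 4 actions), not the project's actual model.
import numpy as np
import torch
import torch.nn as nn

q_model = nn.Sequential(nn.Linear(2, 32), nn.ReLU(), nn.Linear(32, 4))

def select_action(state, epsilon):
    """Epsilon-greedy action selection over the Q-network's outputs."""
    if np.random.rand() < epsilon:
        return np.random.randint(4)  # explore: uniform random action
    with torch.no_grad():
        q_values = q_model(torch.as_tensor(state, dtype=torch.float32))
    return int(q_values.argmax(dim=-1).item())  # exploit: greedy action

# example call with a dummy (1, 2) state
action = select_action(np.zeros((1, 2), dtype=np.float32), epsilon=0.1)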
def test_buffer_replace():
    shape = (2, 2)
    capacity = 2
    buffer = ReplayBuffer(capacity)
    for i in range(10):
        x = onp.ones(shape) * i
        a, r = i, i
        discount = 1.0
        timestep = dm_env.TimeStep(dm_env.StepType.FIRST, r, discount, x)
        buffer.add(timestep, a, timestep)
        logging.debug("i: {}, r: {}, len(buffer): {}".format(
            i, capacity, len(buffer)))

    # make sure the buffer recycles if adding more elements than its capacity
    assert len(buffer) == capacity

    # make sure the oldest elements are recycled
    assert onp.array_equal(
        onp.array([buffer[i].s for i in range(len(buffer))]),
        onp.array([[[8.0, 8.0], [8.0, 8.0]], [[9.0, 9.0], [9.0, 9.0]]],
                  dtype=onp.float32),
    )
    assert onp.array_equal(
        onp.array([buffer[i].r for i in range(len(buffer))]),
        onp.array([8.0, 9.0], dtype=onp.float32),
    )
    assert onp.array_equal(
        onp.array([buffer[i].a for i in range(len(buffer))]),
        onp.array([8.0, 9.0], dtype=onp.float32),
    )

    # try sampling with n < len(buffer)
    batch = buffer.sample(1)
    assert len(batch[0]) == 1
    logging.debug(batch)

    # try sampling with n == len(buffer)
    batch = buffer.sample(2)
    assert len(batch[0]) == len(buffer)
    logging.debug(batch)

    # try sampling with n > len(buffer)
    batch = buffer.sample(3)
    assert len(batch[0]) == len(buffer)
    logging.debug(batch)
    return
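# Reference sketch: a minimal circular replay buffer with the behaviour
# test_buffer_replace assumes, i.e. fixed capacity with oldest-first
# eviction, index access returning transitions with .s/.a/.r fields, and
# sample(n) clipped to the buffer size. This is a hypothetical illustration,
# not the project's ReplayBuffer.
import collections
import random

Transition = collections.namedtuple('Transition', ['s', 'a', 'r', 's_next'])

class SimpleReplayBuffer:
    """Fixed-capacity buffer that overwrites its oldest transitions."""

    def __init__(self, capacity):
        self._storage = collections.deque(maxlen=capacity)

    def add(self, timestep, action, next_timestep):
        # store observation/reward taken from dm_env-style timesteps
        self._storage.append(
            Transition(s=timestep.observation, a=action,
                       r=timestep.reward, s_next=next_timestep.observation))

    def __len__(self):
        return len(self._storage)

    def __getitem__(self, idx):
        return self._storage[idx]

    def sample(self, n):
        # sample without replacement, clipping n to the number of stored items
        batch = random.sample(list(self._storage), min(n, len(self._storage)))
        return tuple(zip(*batch))  # (states, actions, rewards, next_states)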
class DQNAgent():
    """Interacts with and learns from the environment."""

    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers=[64, 64],
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 learning_rate=5e-4,
                 update_every=4):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (list of int; optional): number of nodes in each layer
            buffer_size (int; optional): replay buffer size
            batch_size (int; optional): minibatch size
            gamma (float; optional): discount factor
            tau (float; optional): for soft update of target parameters
            learning_rate (float; optional): learning rate
            update_every (int; optional): how often to update the network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = learning_rate
        self.update_every = update_every

        # detect GPU device
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        model_params = [state_size, action_size, seed, hidden_layers]
        self.qnetwork_local = QNetwork(*model_params).to(self.device)
        self.qnetwork_target = QNetwork(*model_params).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed, self.device)

        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every self.update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for the given state as per the current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using the given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Calculate target value
        self.qnetwork_target.eval()
        with torch.no_grad():
            Q_dash = self.qnetwork_target(next_states)
            Q_dash_max = torch.max(Q_dash, dim=1, keepdim=True)[0]
            y = rewards + gamma * Q_dash_max * (1 - dones)
        self.qnetwork_target.train()

        # Predict Q-value
        self.optimizer.zero_grad()
        Q = self.qnetwork_local(states)
        y_pred = Q.gather(1, actions)

        # TD-error
        loss = torch.sum((y - y_pred)**2)

        # Optimize
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
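# Usage sketch: driving DQNAgent on a Gym-style environment with the old
# (obs, reward, done, info) step API used elsewhere in this file. The
# CartPole choice, episode budget, and epsilon schedule are illustrative
# assumptions; QNetwork and ReplayBuffer are assumed to be importable from
# the surrounding project.
import gym
import numpy as np

env = gym.make('CartPole-v1')
agent = DQNAgent(state_size=env.observation_space.shape[0],
                 action_size=env.action_space.n,
                 seed=0)

eps = 1.0
for episode in range(200):  # assumed episode budget
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, info = env.step(int(action))
        # stores the transition and learns every update_every steps
        agent.step(state, action, reward, next_state, done)
        state = next_state
    eps = max(0.01, 0.995 * eps)  # simple multiplicative epsilon decay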
episode_timestep = 0
episode_reward = 0.0

done = False
cur_state = cp_env.reset()

while not done:
    # select action
    action = cp_policy.select_action(cur_state.reshape(1, -1), cp_epsilon)

    # take action in the environment
    next_state, reward, done, info = cp_env.step(action)

    # add the transition to replay buffer
    replay_buffer.add(cur_state, action, next_state, reward, done)

    # sample minibatch of transitions from the replay buffer
    # the sampling is done every timestep and not every episode
    sample_transitions = replay_buffer.sample()

    # update the policy using the sampled transitions
    cp_policy.update_policy(**sample_transitions)

    episode_reward += reward
    episode_timestep += 1
    cur_state = next_state

avg_reward += episode_reward
avg_timestep += episode_timestep
# # cur_state = next_state

# Now play with weighted policy
done = False
cur_state = torch.Tensor(env.reset())

while not done:
    # select action
    action = policy_weighted.select_action(cur_state)

    # take action in the environment
    next_state, reward, done, info = env.step(action.item())
    next_state = torch.Tensor(next_state)

    # add the transition to replay buffer
    replay_buffer_weighted.add(cur_state, action, next_state, reward, done)

    # sample minibatch of transitions from the replay buffer
    # the sampling is done every timestep and not every episode
    # sample_transitions = replay_buffer_weighted.sample(100)

    # update the policy using the sampled transitions
    # loss2 = update_weighted_policy_stochastic(cur_state, next_state, reward, action, sample_transitions['next_states'], (episode_i//weight_decay)+1)

    episode_weighted_reward += reward
    episode_timestep_weighted += 1
    # loss2_cumulative += loss2

    cur_state = next_state

# update the policy every update_episode episodes
if (episode_i + 1) % update_episode == 0:
class Agent():
    """Basic experience replay agent."""

    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4,
                 checkpoint_file='checkpoint.pth'):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr (float): learning rate
            update_every (int): how often to update the network
            checkpoint_file (str): checkpoint file name
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.checkpoint_file = checkpoint_file

        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed,
                                   self.device)

        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every self.update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for the given state as per the current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using the given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from the target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from the local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def train(self,
              env,
              n_episodes=2000,
              max_t=1000,
              eps_start=1.0,
              eps_end=0.01,
              eps_decay=0.995):
        """Train the agent by playing the simulator.

        Params
        ======
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
            eps_start (float): starting value of epsilon, for epsilon-greedy action selection
            eps_end (float): minimum value of epsilon
            eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        """
        scores = []  # list containing scores from each episode
        moving_avgs = []  # list of moving averages
        scores_window = deque(maxlen=100)  # last 100 scores
        brain_name = env.brain_names[0]  # get the env's default brain name
        env_info = env.reset(
            train_mode=False)[brain_name]  # initialize the environment
        eps = eps_start  # initialize epsilon
        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations[0]  # get the initial state
            score = 0
            for t in range(max_t):
                action = self.act(state, eps).astype(int)
                env_info = env.step(action)[brain_name]
                next_state = env_info.vector_observations[
                    0]  # get the next state
                reward = env_info.rewards[0]  # get the reward
                done = env_info.local_done[0]  # see if episode has finished
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores_window.append(score)  # save most recent score
            scores.append(score)  # save most recent score
            moving_avg = np.mean(scores_window)  # calculate moving average
            moving_avgs.append(moving_avg)  # save most recent moving average
            eps = max(eps_end, eps_decay * eps)  # decrease epsilon
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)),
                  end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, moving_avg))
            if moving_avg >= 13.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - 100, moving_avg))
                self.save()
                break
        return scores, moving_avgs

    def test(self, env, num_episodes=10):
        brain_name = env.brain_names[0]
        scores = []  # list of scores
        avg_scores = []  # list of average scores
        for i_episode in range(1, num_episodes + 1):
            env_info = env.reset(
                train_mode=False)[brain_name]  # reset the environment
            state = env_info.vector_observations[0]  # get the current state
            score = 0  # initialize the score
            t = 1
            while True:
                action = self.act(state, eps=0)  # select an action
                env_info = env.step(action)[
                    brain_name]  # send the action to the environment
                next_state = env_info.vector_observations[
                    0]  # get the next state
                reward = env_info.rewards[0]  # get the reward
                done = env_info.local_done[0]  # see if episode has finished
                score += reward  # update the score
                state = next_state  # roll over the state to the next time step
                # print('episode: {}, step: {}, reward: {}, score: {}, scores: {}'.format(i_episode, t, reward, score, scores))
                t += 1
                if done:  # exit loop if episode finished
                    scores.append(score)
                    avg_scores.append(np.mean(scores))
                    print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                        i_episode, np.mean(scores)))
                    break
        return scores, avg_scores

    def save(self):
        """Save the model weights to self.checkpoint_file."""
        torch.save(self.qnetwork_local.state_dict(), self.checkpoint_file)

    def load(self):
        """Load the model weights from self.checkpoint_file."""
        self.qnetwork_local.load_state_dict(torch.load(self.checkpoint_file))
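# Usage sketch: Agent.train()/test() expect a Unity ML-Agents style
# environment exposing brain_names, vector_observations, and local_done.
# The UnityEnvironment construction and the 37/4 state/action sizes below
# are assumptions modeled on a typical Banana-collector setup (consistent
# with the 13.0 solve threshold in train()); adjust them to the actual build.
from unityagents import UnityEnvironment  # assumed dependency

env = UnityEnvironment(file_name='Banana.app')  # placeholder path
agent = Agent(state_size=37, action_size=4, seed=0,
              checkpoint_file='checkpoint.pth')

scores, moving_avgs = agent.train(env, n_episodes=2000)  # saves on solve
agent.load()  # reload the saved weights
test_scores, test_avgs = agent.test(env, num_episodes=10)
env.close()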
def train(env_id,
          lr=1e-4,
          gamma=0.99,
          memory_size=1000,
          batch_size=32,
          train_timesteps=10000,
          train_start_time=1000,
          target_update_frequency=1000,
          init_epsilon=1,
          final_epsilon=0.1,
          epsilon_decay=300,
          model_path=None):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    LOG_PATH = f'logs/dqn_log_{env_id}.txt'

    if get_env_type(env_id) == 'atari':
        env = make_atari(env_id)
        env = wrap_deepmind(env)
        env = wrap_pytorch(env)
        model_type = 'conv'
    else:
        env = gym.make(env_id)
        model_type = 'linear'

    obs_shape = env.observation_space.shape
    num_actions = env.action_space.n

    memory = ReplayBuffer(memory_size)
    agent = DQN(obs_shape, num_actions, lr, gamma, device, model_type)
    policy = EpsilonGreedy(agent, num_actions, init_epsilon, final_epsilon,
                           epsilon_decay)

    # populate replay memory
    obs = env.reset()
    for t in range(train_start_time):
        # uniform random policy
        action = random.randrange(num_actions)
        next_obs, reward, done, _ = env.step(action)
        memory.add(obs, action, reward, next_obs, done)
        obs = next_obs
        if done:
            # start a new episode
            obs = env.reset()

    # for monitoring
    ep_num = 1
    ep_start_time = 1
    episode_reward = 0
    reward_list = []

    # train start
    obs = env.reset()
    for t in tqdm.tqdm(range(1, train_timesteps + 1)):
        # choose action
        action = policy.act(obs, t)
        next_obs, reward, done, _ = env.step(action)
        memory.add(obs, action, reward, next_obs, done)
        obs = next_obs

        # sample batch transitions from memory
        transitions = memory.sample(batch_size)

        # train
        loss = agent.train(transitions)

        # record reward
        episode_reward += reward

        # update target network at every C timesteps
        if t % target_update_frequency == 0:
            agent.update_target()

        if done:
            # start a new episode
            obs = env.reset()

            # write log
            with open(LOG_PATH, 'a') as f:
                f.write(f'{ep_num}\t{episode_reward}\t{ep_start_time}\t{t}\n')

            if model_path is not None:
                # save model
                info = {
                    'epoch': ep_num,
                    'timesteps': t,
                }
                agent.save(model_path, info)

            ep_num += 1
            ep_start_time = t + 1
            reward_list.append(episode_reward)
            episode_reward = 0
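# Usage sketch: an illustrative invocation of train(). The environment id and
# hyperparameters are assumptions, not tuned values; get_env_type, make_atari,
# wrap_deepmind, wrap_pytorch, DQN, EpsilonGreedy, and ReplayBuffer are assumed
# to come from the surrounding project.
import os

# train() appends per-episode logs to logs/dqn_log_<env_id>.txt,
# so make sure the directory exists first.
os.makedirs('logs', exist_ok=True)

train('CartPole-v0',
      lr=1e-3,
      memory_size=10000,
      train_timesteps=50000,
      target_update_frequency=500)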