class Agent():
    def __init__(self, render=False, method='Duel'):
        # Create an instance of the network itself, as well as the memory.
        # Here is also a good place to set environment parameters,
        # as well as training parameters - number of episodes / iterations, etc.
        self.render = render
        if render:
            self.env = gym.make('NEL-render-v0')
        else:
            self.env = gym.make('NEL-v0')
        #self.test_env = gym.make('NEL-v0')
        self.an = self.env.action_space.n  # No. of actions in env
        self.epsilon = 0.5
        self.training_time = PARAM.TRAINING_TIME  # Training Time
        self.df = PARAM.DISCOUNT_FACTOR           # Discount Factor
        self.batch_size = PARAM.BATCH_SIZE
        self.method = method
        self.test_curr_state = None
        self.log_time = 100.0
        self.test_time = 1000.0
        self.prioritized_replay = PARAM.PRIORITIZED_REPLAY
        self.prioritized_replay_eps = 1e-6
        #self.prioritized_replay_alpha = 0.6
        self.prioritized_replay_alpha = 0.8
        self.prioritized_replay_beta0 = 0.4
        self.burn_in = PARAM.BURN_IN

        # Create Replay Memory and initialize with burn_in transitions
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                PARAM.REPLAY_MEMORY_SIZE, alpha=self.prioritized_replay_alpha)
            self.beta_schedule = LinearSchedule(
                float(self.training_time),
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(PARAM.REPLAY_MEMORY_SIZE)
            self.beta_schedule = None

        # Create QNetwork instance
        if self.method == 'Duel':
            print('Using Duel Network.')
            self.net = DuelQNetwork(self.an)
        elif self.method == 'DoubleQ':
            print('Using DoubleQ Network.')
            self.net = DoubleQNetwork(self.an)
        else:
            raise NotImplementedError

        cur_dir = os.getcwd()
        self.dump_dir = cur_dir + '/tmp_' + self.method + '_' + time.strftime(
            "%Y%m%d-%H%M%S") + '/'
        # Create output directory
        if not os.path.exists(self.dump_dir):
            os.makedirs(self.dump_dir)
        self.train_file = open(self.dump_dir + 'train_rewards.txt', 'w')
        self.test_file = open(self.dump_dir + 'test_rewards.txt', 'w')

    def update_epsilon(self):
        '''
        Linear epsilon decay from 0.5 down to a floor of 0.05
        (a decrement of (0.5 - 0.1) / 200000 per call).
        '''
        if self.epsilon <= 0.05:
            self.epsilon = 0.05
            return
        self.epsilon = self.epsilon - (0.5 - 0.1) / 200000.0

    def epsilon_greedy_policy(self, q_values, epsilon):
        # Epsilon-greedy action selection.
        val = np.random.rand(1)
        if val <= epsilon:
            return np.random.randint(q_values.shape[1])
        return np.argmax(q_values)

    def greedy_policy(self, q_values):
        # Greedy policy for test time.
        return np.argmax(q_values)

    def train(self):
        train_rewards = []
        test_rewards = []
        count = 0
        steps = 0
        test_steps = 0
        cum_reward = 0.0
        elapsed = 0.0
        curr_state = self.env.reset()
        curr_state = self.burn_in_memory(curr_state)
        prev_action = -1
        if self.render:
            self.env.render()

        for i in range(self.training_time):
            # Get q_values based on the current state
            Vt, St = self.get_input_tensor(curr_state)
            q_values = self.net.get_Q_output(Vt, St)

            # Select an action based on the policy
            action = self.epsilon_greedy_policy(q_values, self.epsilon)
            #if not curr_state['moved'] and action == prev_action and self.epsilon > 0.1:
            #    action = self.epsilon_greedy_policy(q_values, 0.5)

            # Execute action in simulator
            nextstate, reward, _, _ = self.env.step(action)
            steps = steps + 1
            test_steps = test_steps + 1
            if self.render:
                self.env.render()

            # Store transition (rewards are scaled down by 100)
            if nextstate['moved'] or prev_action != action:
                self.replay_buffer.add(curr_state, action, reward / 100.0, nextstate, 0)
            prev_action = action

            # Sample random minibatch from experience replay
            if self.prioritized_replay:
                batch, weights, batch_idxes = self.replay_buffer.sample(
                    self.batch_size, beta=self.beta_schedule.value(i))
            else:
                batch = self.replay_buffer.sample(self.batch_size)
                weights, batch_idxes = np.ones(self.batch_size), None

            # Train the network with minibatches
            xVT, xST = self.get_input_tensors(batch)
            yT = self.get_output_tensors(batch)
            # Mask to select the actions from the Q network output
            mT = torch.zeros(self.batch_size, self.an, dtype=torch.uint8)
            for k, tran in enumerate(batch):
                mT[k, tran[1]] = 1
            td_errors = self.net.train(xVT, xST, yT, mT, weights)

            if self.prioritized_replay:
                # Re-prioritize the sampled transitions by their absolute TD error.
                new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
                self.replay_buffer.update_priorities(batch_idxes, new_priorities)

            # Decay epsilon
            self.update_epsilon()

            cum_reward += reward
            curr_state = nextstate

            if steps == 100:
                cum_reward = cum_reward / float(self.log_time)
                train_rewards.append(cum_reward)
                self.train_file.write(str(cum_reward))
                self.train_file.write('\n')
                self.train_file.flush()
                cum_reward = 0.0
                print('Train Reward: %.4f' % (train_rewards[-1]))
                steps = 0

                x = list(range(len(train_rewards)))
                plt.plot(x, train_rewards, '-bo')
                plt.xlabel('Time')
                plt.ylabel('Average Reward')
                plt.title('Training Curve')
                plt.savefig(self.dump_dir + 'Training_Curve_' + self.method + '.png')
                plt.close()
                plot(self.dump_dir + self.method, train_rewards)

            # if test_steps == 500:
            #     self.net.set_eval()
            #     test_rewards.append(self.test())
            #     self.test_file.write(str(test_rewards[-1]))
            #     self.test_file.write('\n')
            #     self.test_file.flush()
            #     self.net.set_train()
            #     count = count + 1
            #     print('\nTest Reward: %.4f\n' % (test_rewards[-1]))
            #     test_steps = 0
            #
            #     x = list(range(len(test_rewards)))
            #     plt.plot(x, test_rewards, '-bo')
            #     plt.xlabel('Time')
            #     plt.ylabel('Average Reward')
            #     plt.title('Testing Curve')
            #     plt.savefig(self.dump_dir + 'Testing_Curve_' + self.method + '.png')
            #     plt.close()

            # `count` only advances inside the (commented) test block above.
            if count > 0 and count % 30 == 0:
                self.net.save_model_weights(count, self.dump_dir)

    def test(self, testing_steps=100, model_file=None, capture=False):
        # Note: requires self.test_env (currently commented out in __init__).
        if model_file is not None:
            self.net.load_model(model_file)
        if capture:
            self.test_env = gym.wrappers.Monitor(self.test_env, './')

        epsilon = 0.05
        rewards = []
        self.test_curr_state = self.test_env.reset()
        #if self.render:
        #    self.test_env.render()
        cum_reward = 0.0
        for i in range(testing_steps):
            # Get Q-values for the current observation
            Vt, St = self.get_input_tensor(self.test_curr_state)
            q_values = self.net.get_Q_output(Vt, St)
            action = self.epsilon_greedy_policy(q_values, epsilon)

            # Execute action in simulator
            nextstate, reward, _, _ = self.test_env.step(action)
            #if self.render:
            #    self.test_env.render()
            cum_reward += reward
            self.test_curr_state = nextstate

        avg_reward = cum_reward / float(testing_steps)
        rewards.append(avg_reward)
        return avg_reward

    def burn_in_memory(self, curr_state):
        # Initialize the replay memory with a burn_in number of transitions,
        # collected with uniformly random actions.
        cnt = 0
        while self.burn_in > cnt:
            action = self.env.action_space.sample()
            next_state, reward, _, _ = self.env.step(action)
            self.replay_buffer.add(curr_state, action, reward / 100.0, next_state, 0)
            curr_state = next_state
            cnt = cnt + 1
        return curr_state

    def get_input_tensor(self, obs):
        ''' Returns an input tensor from the observation. '''
        iV = np.zeros((1, 3, 11, 11))
        iS = np.zeros((1, 4))
        iV[0] = np.moveaxis(obs['vision'], -1, 0)
        iS[0] = np.concatenate((obs['scent'], np.array([int(obs['moved'])])), axis=0)
        iVt, iSt = torch.from_numpy(iV).float(), torch.from_numpy(iS).float()
        return iVt, iSt

    def get_input_tensors(self, batch, next_state=False):
        ''' Returns input tensors created from the sampled batch. '''
        V = np.zeros((self.batch_size, 3, 11, 11))
        S = np.zeros((self.batch_size, 4))
        for i, tran in enumerate(batch):
            if next_state:
                obs = tran[3]  # next state
            else:
                obs = tran[0]  # current state
            V[i] = np.moveaxis(obs['vision'], -1, 0)
            S[i] = np.concatenate(
                (obs['scent'], np.array([int(obs['moved'])])), axis=0)
        Vt, St = torch.from_numpy(V).float(), torch.from_numpy(S).float()
        return Vt, St

    def get_output_tensors(self, batch):
        ''' Returns the target tensor created from the sampled batch. '''
        Y = np.zeros(self.batch_size)
        Vt, St = self.get_input_tensors(batch, next_state=True)
        q_values_a = self.net.get_Q_output(Vt, St)
        q_values_e = self.net.get_target_output(Vt, St)
        for i, tran in enumerate(batch):
            # Double-DQN style target: the online network selects the action,
            # the target network evaluates it.
            action = self.greedy_policy(q_values_a[i])
            Y[i] = tran[2] + self.df * q_values_e[i][action]
        Yt = torch.from_numpy(Y).float()
        return Yt
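# Minimal usage sketch for Agent (illustrative only, not part of the original
# training script). It assumes the project-level modules referenced above
# (PARAM, ReplayBuffer, PrioritizedReplayBuffer, LinearSchedule, DuelQNetwork,
# DoubleQNetwork, plot) are importable, that gym, numpy, torch and matplotlib
# are imported at module level, and that the 'NEL-v0' / 'NEL-render-v0'
# environments are registered with gym.
if __name__ == '__main__':
    agent = Agent(render=False, method='Duel')  # or method='DoubleQ'
    agent.train()  # runs for PARAM.TRAINING_TIME environment steps
    # agent.test(testing_steps=100) additionally requires self.test_env
    # (currently commented out in __init__).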
class DQNAgent:
    def __init__(self, gamma, action_number, minibatch, episodes, begin_train,
                 train_step, begin_copy, copy_step, epsilon_delta,
                 epsilon_start, epsilon_end, load_model, path_to_load,
                 path_to_save, episode_steps, episode_to_save, max_buffer_len):
        # Epsilon
        self.epsilon_delta = epsilon_delta
        self.epsilon_end = epsilon_end
        self.epsilon_start = epsilon_start
        self.epsilon = epsilon_start
        # Main Params
        self.minibatch = minibatch
        self.action_number = action_number
        self.gamma = gamma
        # Episode Params
        self.begin_train = begin_train
        self.begin_copy = begin_copy
        self.copy_step = copy_step
        self.train_step = train_step
        self.episodes = episodes
        self.episode_steps = episode_steps
        self.episode_to_save = episode_to_save
        # I/O params
        self.path_to_load = path_to_load
        self.path_to_save = path_to_save
        self.load_model = load_model
        # Model Fields
        self.action = None
        self.state = None
        self.replay_buffer = ReplayBuffer(max_buffer_len)
        # Model
        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')
        # self.device = torch.device('cpu')
        self.model = BoxModel((150, 100, 1), action_number).to(self.device)
        if self.load_model:
            self.model.load_state_dict(torch.load(self.path_to_load))
        # Rewards
        self.rewards_white, self.rewards_black, self.rewards = [], [], []

    def reduce_epsilon(self, episode):
        # Exponential decay from epsilon_start towards epsilon_end.
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            np.exp(-1. * episode / self.epsilon_delta)

    def epsilon_greedy(self):
        if (1 - self.epsilon) <= np.random.random():
            self.action = np.random.randint(self.action_number)
        else:
            state = torch.autograd.Variable(
                torch.FloatTensor(self.state).to(self.device).unsqueeze(0))
            self.action = self.model(state).max(1)[1].item()
        return self.action

    @staticmethod
    def preprocess_observation(observation):
        # Crop the frame, rescale to [0, 1] and convert to grayscale.
        rgb = observation[30:180, 30:130] / 255
        r, g, b = rgb[:, :, 0], rgb[:, :, 1], rgb[:, :, 2]
        gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
        return gray.reshape(1, 150, 100)

    def transition_process(self, o_state, o_act, o_reward, o_next_state, o_done):
        return \
            torch.autograd.Variable(torch.FloatTensor(np.float32(o_state)).to(self.device)), \
            torch.autograd.Variable(torch.LongTensor(o_act).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(o_reward).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(np.float32(o_next_state)).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(o_done).to(self.device))

    def train_model(self):
        o_state, o_act, o_reward, o_next_state, o_done = \
            self.transition_process(*self.replay_buffer.sample(self.minibatch))
        q = self.model(o_state)
        q_next = self.model(o_next_state)
        y_hat = o_reward + self.gamma * q_next.max(1)[0] * (1 - o_done)
        loss = (q.gather(1, o_act.unsqueeze(1)).squeeze(1) -
                torch.autograd.Variable(y_hat.data)).pow(2).mean()
        self.model.optimizer.zero_grad()
        loss.backward()
        self.model.optimizer.step()

    def print(self, episode, reward_black, reward_white, epsilon):
        print(f"For episode {episode} reward white - "
              f"{reward_white} and black - {reward_black}, "
              f"epsilon - {epsilon}")

    def train(self, env: gym.wrappers.time_limit.TimeLimit):
        start = time()
        print("Begin to Train")
        for episode in range(self.episodes):
            observation = env.reset()
            self.state = self.preprocess_observation(observation)
            reward_black, reward_white, total_reward = 0, 0, 0
            for episode_steps in range(self.episode_steps):
                action = self.epsilon_greedy()
                next_observation, reward, done, _ = env.step(action)
                reward_black += (reward < 0) * abs(reward)
                reward_white += (reward > 0) * reward
                total_reward += reward
                next_state = self.preprocess_observation(next_observation)
                self.replay_buffer.push(self.state, action, reward, next_state, done)
                # Advance to the next state.
                self.state = next_state
                if len(self.replay_buffer) >= self.begin_train:
                    self.train_model()
                # if (episode_step >= self.begin_copy) and (episode_step % self.copy_step == 0):
                #     plt.plot(total_reward)
                #     plt.show()
                #     self.const_model = self.model.clone()
                if done:
                    break
            self.reduce_epsilon(episode)
            if episode != 0 and episode % self.episode_to_save == 0:
                torch.save(self.model.state_dict(), self.path_to_save)
                plt.plot(self.rewards)
                plt.show()
            self.rewards_black.append(reward_black)
            self.rewards_white.append(reward_white)
            self.rewards.append(total_reward)
            self.print(episode, reward_black=reward_black,
                       reward_white=reward_white, epsilon=self.epsilon)
        print(time() - start)

    def play(self, env: gym.wrappers.time_limit.TimeLimit):
        observation = env.reset()
        reward_black, reward_white, total_reward = 0, 0, 0
        for episode_steps in range(self.episode_steps):
            state = self.preprocess_observation(observation)
            state = torch.autograd.Variable(
                torch.FloatTensor(state).to(self.device).unsqueeze(0))
            print(self.model(state))
            action = self.model(state).max(1)[1].item()
            observation, reward, done, _ = env.step(action)
            reward_black += (reward < 0) * abs(reward)
            reward_white += (reward > 0) * reward
            total_reward += reward
            sleep(0.01)
            env.render()
            if done:
                break
        print(total_reward)
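# Minimal usage sketch for DQNAgent (illustrative only). It assumes BoxModel
# and ReplayBuffer are importable from the project, that gym, numpy, torch,
# matplotlib, time() and sleep() are imported at module level, and that the
# environment produces 210x160x3 Atari-style frames compatible with the crop
# in preprocess_observation (e.g. 'Boxing-v0', which also matches the
# white/black reward bookkeeping). All hyperparameters below are placeholder
# values, not the author's settings.
if __name__ == '__main__':
    env = gym.make('Boxing-v0')
    agent = DQNAgent(gamma=0.99, action_number=env.action_space.n,
                     minibatch=32, episodes=1000, begin_train=1000,
                     train_step=4, begin_copy=1000, copy_step=100,
                     epsilon_delta=200, epsilon_start=1.0, epsilon_end=0.05,
                     load_model=False, path_to_load='dqn.pt',
                     path_to_save='dqn.pt', episode_steps=10000,
                     episode_to_save=50, max_buffer_len=100000)
    agent.train(env)  # trains for `episodes` episodes
    agent.play(env)   # greedy rollout with rendering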
class DDQNAgentCnn(GeneralAgent):
    def __init__(self, gamma, action_number, minibatch, episodes, begin_train,
                 copy_step, epsilon_delta, epsilon_start, epsilon_end,
                 load_model, path_to_load, path_to_save, plots_to_save,
                 episode_steps, episode_to_save, max_buffer_len, model_type):
        super().__init__(gamma=gamma, action_number=action_number,
                         path_to_load=path_to_load, path_to_save=path_to_save,
                         plots_to_save=plots_to_save, load_model=load_model,
                         episode_to_save=episode_to_save, episodes=episodes,
                         model_type=model_type)
        # Epsilon
        self.epsilon_delta = epsilon_delta
        self.epsilon_end = epsilon_end
        self.epsilon_start = epsilon_start
        self.epsilon = epsilon_start
        # Main Params
        self.minibatch = minibatch
        # Episode Params
        self.begin_train = begin_train
        self.copy_step = copy_step
        self.episode_steps = episode_steps
        # Model Fields
        self.action = None
        self.state = None
        self.replay_buffer = ReplayBuffer(max_buffer_len)
        # Model
        self.target_model = model_type(action_number).to(self.device)
        self.update_target()
        # Rewards
        self.rewards_white, self.rewards_black, self.rewards = [], [], []
        self.losses = []
        self.periodic_reward = 0
        self.periodic_rewards = []

    def update_target(self):
        # Copy the online network weights into the target network.
        self.target_model.load_state_dict(self.model.state_dict())

    def reduce_epsilon(self, episode):
        # Exponential decay from epsilon_start towards epsilon_end.
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            np.exp(-1. * episode / self.epsilon_delta)

    def epsilon_greedy(self):
        if (1 - self.epsilon) <= np.random.random():
            self.action = np.random.randint(self.action_number)
        else:
            state = torch.autograd.Variable(
                torch.FloatTensor(self.state).to(self.device).unsqueeze(0))
            self.action = self.model(state).max(1)[1].item()
        return self.action

    @staticmethod
    def preprocess_observation(obs):
        # Crop, convert to grayscale and downsample to a 1x28x28 frame.
        img = resize(rgb2gray(obs[0:188, 23:136, :]), (28, 28), mode='constant')
        img = img.reshape(1, 28, 28)
        return img

    def transition_process(self, o_state, o_act, o_reward, o_next_state, o_done):
        return \
            torch.autograd.Variable(torch.FloatTensor(np.float32(o_state)).to(self.device)), \
            torch.autograd.Variable(torch.LongTensor(o_act).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(o_reward).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(np.float32(o_next_state)).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(o_done).to(self.device))

    def train_model(self):
        o_state, o_act, o_reward, o_next_state, o_done = \
            self.transition_process(*self.replay_buffer.sample(self.minibatch))
        # Online network evaluates the taken actions; target network provides
        # the bootstrap value for the next state.
        q = self.model(o_state).gather(1, o_act.unsqueeze(1)).squeeze(1)
        q_next = self.target_model(o_next_state)
        y_hat = o_reward + self.gamma * q_next.max(1)[0] * (1 - o_done)
        loss = (q - y_hat.detach()).pow(2).mean()
        self.model.optimizer.zero_grad()
        loss.backward()
        self.model.optimizer.step()
        return loss

    def init_new_episode(self, env):
        observation = env.reset()
        self.state = self.preprocess_observation(observation)

    def episode_check(self, episode, loss):
        if episode % self.copy_step == 0:
            self.losses.append(float(loss))
            self.update_target()
        if episode % self.episode_steps == 0:
            self.periodic_rewards.append(self.periodic_reward / self.episode_steps)
            self.periodic_reward = 0
        if episode % self.episode_to_save == 0:
            torch.save(self.model.state_dict(), self.path_to_save)
            fig = plt.figure()
            plt.plot(self.rewards)
            fig.savefig(self.plots_to_save + '_reward.png')
            plt.close(fig)
            fig = plt.figure()
            plt.plot(self.losses)
            fig.savefig(self.plots_to_save + '_loss.png')
            plt.close(fig)
            fig = plt.figure()
            plt.plot(self.periodic_rewards)
            fig.savefig(self.plots_to_save + '_periodic_reward.png')
            plt.close(fig)

    def train(self, env: gym.wrappers.time_limit.TimeLimit):
        self.init_new_episode(env)
        total_reward = 0
        episode_reward = 0
        loss = 0
        # self.trangle is expected to be a tqdm progress range over training
        # steps, provided by GeneralAgent.
        for episode in self.trangle:
            self.trangle.set_description(
                f"Episode: {episode} | Episode Reward {episode_reward} | Periodic reward "
                f"{self.periodic_reward / self.episode_steps} | Average Reward {total_reward / (episode + 1)}"
            )
            self.trangle.refresh()
            action = self.epsilon_greedy()
            next_observation, reward, done, _ = env.step(action)
            total_reward += reward
            episode_reward += reward
            self.periodic_reward += reward
            next_state = self.preprocess_observation(next_observation)
            self.replay_buffer.push(self.state, action, reward, next_state, done)
            self.state = next_state
            if len(self.replay_buffer) >= self.begin_train:
                loss = self.train_model()
            self.reduce_epsilon(episode)
            self.episode_check(episode, loss)
            if done:
                self.init_new_episode(env)
                self.rewards.append(episode_reward)
                episode_reward = 0
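# Minimal usage sketch for DDQNAgentCnn (illustrative only). It assumes
# GeneralAgent supplies self.model, self.device, self.trangle (a tqdm progress
# range over training steps) and the save paths, that ReplayBuffer, gym, torch,
# numpy, matplotlib and skimage's resize/rgb2gray are imported at module level,
# and that the environment produces Atari-style frames compatible with
# preprocess_observation (e.g. 'Breakout-v0'). `CnnModel` is a hypothetical
# stand-in for a CNN class taking action_number; every hyperparameter below is
# a placeholder, not the author's setting.
if __name__ == '__main__':
    env = gym.make('Breakout-v0')
    agent = DDQNAgentCnn(gamma=0.99, action_number=env.action_space.n,
                         minibatch=32, episodes=1_000_000, begin_train=10_000,
                         copy_step=1_000, epsilon_delta=30_000,
                         epsilon_start=1.0, epsilon_end=0.05,
                         load_model=False, path_to_load='ddqn.pt',
                         path_to_save='ddqn.pt', plots_to_save='ddqn',
                         episode_steps=10_000, episode_to_save=50_000,
                         max_buffer_len=100_000,
                         model_type=CnnModel)  # hypothetical CNN Q-network
    agent.train(env)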