def learn(self):
    # sample a minibatch of transitions from the replay memory
    batch_s, batch_s_, batch_r, batch_terminal, batch_a_indices_one_hot, batch_a_indices = \
        self.memory.sample_batch(self.memory_batch_size)

    # Q(s) comes from the policy network; Q(s') comes from the target network if one is used
    q_eval_s = self.policy_nn.forward(batch_s)
    q_eval_s_ = self.policy_nn.forward(batch_s_) if self.target_nn is None \
        else self.target_nn.forward(batch_s_)

    if self.lib_type == LIBRARY_TORCH:
        self.policy_nn.learn_batch(batch_a_indices, batch_r, batch_terminal, self.GAMMA,
                                   self.memory_batch_size, q_eval_s, q_eval_s_)
    else:
        # TD target: r + gamma * max_a' Q(s', a'),
        # where batch_terminal acts as a non-terminal mask (assumed 0 for terminal transitions)
        q_target_chosen_a = batch_r + self.GAMMA * np.max(q_eval_s_, axis=1) * batch_terminal
        self.policy_nn.learn_batch(batch_s, batch_a_indices_one_hot, q_target_chosen_a)

    self.learn_step_counter += 1

    # decay epsilon only once the pure exploration phase is over
    if self.learn_step_counter > self.pure_exploration_phase:
        self.EPS = decrement_eps(self.EPS, self.eps_min, self.eps_dec, self.eps_dec_type)
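
# A minimal sketch of what the decrement_eps helper used above (and in the tabular methods below)
# might look like. The decay-type names ('linear', 'exponential') and the exact behaviour are
# assumptions; the real helper is defined elsewhere in this repo and may differ.
def decrement_eps_sketch(eps, eps_min, eps_dec, eps_dec_type='linear'):
    # linear decay subtracts a fixed step; exponential decay multiplies by a factor < 1
    if eps_dec_type == 'exponential':
        eps *= eps_dec
    else:
        eps -= eps_dec
    return max(eps, eps_min)  # never decay below the floor value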
def perform_off_policy_mc_control(self, print_info=False, visualize=False, record=False):
    # off-policy methods are the alternative to non-exploring-starts
    if record:
        self.env = wrappers.Monitor(
            self.env, 'recordings/OP-MC/', force=True,
            video_callable=lambda episode_id: episode_id == 0 or episode_id == (self.episodes - 1)
        )

    target_policy, Q, C = {}, {}, {}
    for s in self.states:
        target_policy[s] = self.env.action_space.sample()
        for a in range(self.action_space_size):
            Q[s, a] = 0
            C[s, a] = 0

    accumulated_scores = 0

    print('\n', 'Game Started', '\n')

    for i in range(self.episodes):
        done = False
        ep_steps = 0
        ep_score = 0

        # build an epsilon-soft behavior policy for this episode
        behavior_policy = {}
        for s in self.states:
            rand = np.random.random()
            behavior_policy[s] = [target_policy[s]] \
                if rand > self.EPS \
                else [a for a in range(self.action_space_size)]

        memory = []

        observation = self.env.reset()
        s = self.custom_env.get_state(observation)

        if visualize and i == self.episodes - 1:
            self.env.render()

        while not done:
            a = np.random.choice(behavior_policy[s])
            observation_, reward, done, info = self.env.step(a)
            ep_steps += 1
            ep_score += reward
            accumulated_scores += reward

            s_ = self.custom_env.get_state(observation_)
            memory.append((s, a, reward))
            observation, s = observation_, s_

            if visualize and i == self.episodes - 1:
                self.env.render()

        if self.episodes < 10 or (i + 1) % (self.episodes // 10) == 0:
            print('episode %d - score: %d, steps: %d' % (i + 1, ep_score, ep_steps))

        self.totalSteps[i] = ep_steps
        self.totalScores[i] = ep_score
        self.totalAccumulatedScores[i] = accumulated_scores

        if visualize and i == self.episodes - 1:
            self.env.close()

        ####################

        G = 0
        W = 1
        for s, a, reward in reversed(memory):  # from end to start
            G = self.GAMMA * G + reward  # calculate the discounted return
            C[s, a] += W
            Q[s, a] += (W / C[s, a]) * (G - Q[s, a])
            target_policy[s] = max_action_q(Q, s, self.action_space_size)

            # taking a sub-optimal action breaks the learning loop:
            # it only learns from greedy actions - this is a shortcoming of this class of algorithms,
            # and it makes off-policy MC a sub-optimal strategy among MC methods
            if a != target_policy[s]:
                break

            if len(behavior_policy[s]) == 1:  # the agent took a greedy action
                prob = 1 - self.EPS  # probability of taking the greedy action
            else:  # the agent took a random action
                prob = self.EPS / len(behavior_policy[s])  # probability of taking a random action
            W *= 1 / prob  # update the importance-sampling weight

        self.EPS = decrement_eps(self.EPS, self.eps_min, self.eps_dec, self.eps_dec_type)

    if print_info:
        print_q(Q)
        print_policy(Q, target_policy)

    print('\n', 'Game Ended', '\n')

    return target_policy, self.totalScores, self.totalAccumulatedScores
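
# A minimal sketch of the max_action_q helper used above (and in perform_q_learning below),
# assuming Q is a dict keyed by (state, action) pairs as initialized in these methods;
# the actual implementation may differ.
def max_action_q_sketch(Q, s, action_space_size):
    values = np.array([Q[s, a] for a in range(action_space_size)])
    return int(np.argmax(values))  # greedy action w.r.t. the current Q estimates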
def perform_mc_non_exploring_starts_control(self, print_info=False, visualize=False, record=False):
    # Monte Carlo control without exploring starts -
    # we use epsilon-greedy action selection with a decaying epsilon
    if record:
        self.env = wrappers.Monitor(
            self.env, 'recordings/MC-NES/', force=True,
            video_callable=lambda episode_id: episode_id == 0 or episode_id == (self.episodes - 1)
        )

    Q, states_actions_visited_counter = init_q1_q2(self.states, self.action_space_size)

    accumulated_scores = 0

    print('\n', 'Game Started', '\n')

    for i in range(self.episodes):
        done = False
        ep_steps = 0
        ep_score = 0

        memory = []

        observation = self.env.reset()
        s = self.custom_env.get_state(observation)

        if visualize and i == self.episodes - 1:
            self.env.render()

        while not done:
            a = eps_greedy_q(Q, s, self.action_space_size, self.EPS, self.env)
            observation_, reward, done, info = self.env.step(a)
            ep_steps += 1
            ep_score += reward
            accumulated_scores += reward

            s_ = self.custom_env.get_state(observation_)
            memory.append((s, a, reward))
            observation, s = observation_, s_

            if visualize and i == self.episodes - 1:
                self.env.render()

        if self.episodes < 10 or (i + 1) % (self.episodes // 10) == 0:
            print('episode %d - score: %d, steps: %d' % (i + 1, ep_score, ep_steps))

        self.EPS = decrement_eps(self.EPS, self.eps_min, self.eps_dec, self.eps_dec_type)

        self.totalSteps[i] = ep_steps
        self.totalScores[i] = ep_score
        self.totalAccumulatedScores[i] = accumulated_scores

        if visualize and i == self.episodes - 1:
            self.env.close()

        ####################

        ep_states_actions_returns = calculate_episode_states_actions_returns(memory, self.GAMMA)
        ep_states_actions_visited = []
        for s, a, G in ep_states_actions_returns:
            if (s, a) not in ep_states_actions_visited:  # first visit
                ep_states_actions_visited.append((s, a))
                states_actions_visited_counter[s, a] += 1

                # Incremental Implementation
                # (of the update rule for the agent's estimate of the discounted future rewards).
                # This is a shortcut that saves you from recomputing the full average every single time
                # (computationally expensive, and it doesn't really gain you anything in terms of accuracy):
                #   new estimate = old estimate + [sample - old estimate] / N
                Q[s, a] += (G - Q[s, a]) / states_actions_visited_counter[s, a]

    policy = get_policy_table_from_q_table(self.states, Q, self.action_space_size)

    if print_info:
        print_q(Q)
        print_policy(Q, policy)

    print('\n', 'Game Ended', '\n')

    return policy, self.totalScores, self.totalAccumulatedScores
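
# A minimal sketch of calculate_episode_states_actions_returns, assuming memory is a list of
# (state, action, reward) tuples as built above. It pairs each (s, a) with the discounted return G
# that followed it, in chronological order so the first-visit check above works; the real helper
# lives elsewhere in the repo and may differ.
def calculate_episode_states_actions_returns_sketch(memory, gamma):
    states_actions_returns = []
    G = 0
    for s, a, reward in reversed(memory):  # walk the episode backwards
        G = reward + gamma * G             # discounted return that followed taking a in s
        states_actions_returns.append((s, a, G))
    states_actions_returns.reverse()       # restore chronological order
    return states_actions_returns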
def perform_double_q_learning(self, visualize=False, record=False):
    if record:
        self.env = wrappers.Monitor(
            self.env, 'recordings/D-Q-L/', force=True,
            video_callable=lambda episode_id: episode_id == 0 or episode_id == (self.episodes - 1)
        )

    Q1, Q2 = init_q1_q2(self.states, self.action_space_size)

    accumulated_scores = 0

    print('\n', 'Game Started', '\n')

    for i in range(self.episodes):
        done = False
        ep_steps = 0
        ep_score = 0

        observation = self.env.reset()
        s = self.custom_env.get_state(observation)

        if visualize and i == self.episodes - 1:
            self.env.render()

        while not done:
            a = eps_greedy_q1_q2(Q1, Q2, s, self.action_space_size, self.EPS, self.env)
            observation_, reward, done, info = self.env.step(a)
            ep_steps += 1
            ep_score += reward
            accumulated_scores += reward

            s_ = self.custom_env.get_state(observation_)

            # with probability 0.5 update Q1, otherwise update Q2:
            # the next action is selected with one table and evaluated with the other,
            # which is what reduces the maximization bias of plain Q-learning
            rand = np.random.random()
            if rand <= 0.5:
                a_ = max_action_q1_q2(Q1, Q1, s_, self.action_space_size)  # greedy w.r.t. Q1
                Q1[s, a] += self.ALPHA * (reward + self.GAMMA * Q2[s_, a_] - Q1[s, a])
            else:
                a_ = max_action_q1_q2(Q2, Q2, s_, self.action_space_size)  # greedy w.r.t. Q2
                Q2[s, a] += self.ALPHA * (reward + self.GAMMA * Q1[s_, a_] - Q2[s, a])

            observation, s = observation_, s_

            if visualize and i == self.episodes - 1:
                self.env.render()

        if self.episodes < 10 or (i + 1) % (self.episodes // 10) == 0:
            print('episode %d - eps: %.2f, score: %d, steps: %d' % (i + 1, self.EPS, ep_score, ep_steps))

        self.EPS = decrement_eps(self.EPS, self.eps_min, self.eps_dec, self.eps_dec_type)

        self.totalSteps[i] = ep_steps
        self.totalScores[i] = ep_score
        self.totalAccumulatedScores[i] = accumulated_scores

        if visualize and i == self.episodes - 1:
            self.env.close()

    print('\n', 'Game Ended', '\n')

    return Q1, Q2, self.totalScores, self.totalAccumulatedScores
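
# Minimal sketches of the double-Q helpers used above, assuming both operate on the element-wise
# sum Q1 + Q2 of the two tables (dicts keyed by (state, action)). Under that assumption, passing
# the same table twice (as done above) simply yields the greedy action for that single table.
# The real helpers live elsewhere in the repo and may differ.
def max_action_q1_q2_sketch(Q1, Q2, s, action_space_size):
    values = np.array([Q1[s, a] + Q2[s, a] for a in range(action_space_size)])
    return int(np.argmax(values))  # greedy action w.r.t. the combined estimates

def eps_greedy_q1_q2_sketch(Q1, Q2, s, action_space_size, eps, env):
    # with probability eps explore via the environment, otherwise act greedily on Q1 + Q2
    if np.random.random() < eps:
        return env.action_space.sample()
    return max_action_q1_q2_sketch(Q1, Q2, s, action_space_size)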
def perform_q_learning(self, visualize=False, record=False, pickle=False):
    if record:
        self.env = wrappers.Monitor(
            self.env, 'recordings/Q-L/', force=True,
            video_callable=lambda episode_id: episode_id == 0 or episode_id == (self.episodes - 1)
        )

    Q = init_q(self.states, self.action_space_size, self.custom_env.file_name, pickle)

    accumulated_scores = 0

    print('\n', 'Game Started', '\n')

    for i in range(self.episodes):
        done = False
        ep_steps = 0
        ep_score = 0

        observation = self.env.reset()
        s = self.custom_env.get_state(observation)

        if visualize and i == self.episodes - 1:
            self.env.render()

        while not done:
            a = eps_greedy_q(Q, s, self.action_space_size, self.EPS, self.env)
            observation_, reward, done, info = self.env.step(a)
            ep_steps += 1
            ep_score += reward
            accumulated_scores += reward

            s_ = self.custom_env.get_state(observation_)
            a_ = max_action_q(Q, s_, self.action_space_size)
            Q[s, a] += self.ALPHA * (reward + self.GAMMA * Q[s_, a_] - Q[s, a])
            # Q[s, a] += self.ALPHA * (reward + self.GAMMA * np.max(Q[s_, :]) - Q[s, a])  # if Q is a numpy.ndarray

            observation, s = observation_, s_

            if visualize and i == self.episodes - 1:
                self.env.render()

        if self.episodes < 10 or (i + 1) % (self.episodes // 10) == 0:
            print('episode %d - eps: %.2f, score: %d, steps: %d' % (i + 1, self.EPS, ep_score, ep_steps))

        self.EPS = decrement_eps(self.EPS, self.eps_min, self.eps_dec, self.eps_dec_type)

        self.totalSteps[i] = ep_steps
        self.totalScores[i] = ep_score
        self.totalAccumulatedScores[i] = accumulated_scores

        if visualize and i == self.episodes - 1:
            self.env.close()

    print('\n', 'Game Ended', '\n')

    if pickle:
        pickle_save(Q, self.custom_env.file_name + '-q-table')

    return Q, self.totalScores, self.totalAccumulatedScores
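
# A minimal sketch of the eps_greedy_q helper used above (and in the MC non-exploring-starts
# method), assuming Q is a dict keyed by (state, action): with probability eps it explores via
# the environment's action space, otherwise it acts greedily. The actual helper may differ.
def eps_greedy_q_sketch(Q, s, action_space_size, eps, env):
    if np.random.random() < eps:
        return env.action_space.sample()  # explore
    values = np.array([Q[s, a] for a in range(action_space_size)])
    return int(np.argmax(values))         # exploit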