def policy(self, A, s, Q, epsilon):  # overrides the base class's policy method
    '''
    Select an action with the epsilon-greedy strategy.
    return a: the action chosen by the policy
    '''
    return epsilon_greedy_policy(A, s, Q, epsilon)
def learn(self):
    env = self.env
    Q = self.Q
    N = np.zeros(STATE_SPACE_SHAPE)
    for episode in range(1, self.num_episodes + 1):
        env.reset()
        state1 = env.observe()
        E = np.zeros(STATE_SPACE_SHAPE)  # eligibility traces
        while state1 != TERMINAL_STATE:
            action1 = epsilon_greedy_policy(Q, N, state1)
            state2, reward = env.step(action1)
            dealer1, player1 = state1
            idx1 = (dealer1 - 1, player1 - 1, action1)
            Q1 = Q[idx1]
            if state2 == TERMINAL_STATE:
                Q2 = 0.0
            else:
                action2 = epsilon_greedy_policy(Q, N, state2)
                dealer2, player2 = state2
                idx2 = (dealer2 - 1, player2 - 1, action2)
                Q2 = Q[idx2]
            N[idx1] += 1
            E[idx1] += 1
            alpha = 1.0 / N[idx1]
            delta = reward + self.gamma * Q2 - Q1
            Q += alpha * delta * E
            E *= self.gamma * self.lmbd
            state1 = state2
        if self.save_error_history:
            self.error_history.append((episode, mse(self.Q, self.opt_Q)))
    return Q
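# The count-based epsilon_greedy_policy(Q, N, state) helper called in the
# SARSA(lambda) learn() above (and in the Monte Carlo learn() below) is not
# shown. A minimal sketch, assuming the common Easy21-style schedule
# epsilon = N0 / (N0 + N(s)) with an assumed constant N0; the indexing mirrors
# the (dealer - 1, player - 1, action) layout used above.
import numpy as np

N0 = 100  # assumed exploration constant, not shown in these snippets

def epsilon_greedy_policy(Q, N, state):
    """Count-based epsilon-greedy: epsilon decays as the state is revisited."""
    dealer, player = state
    state_idx = (dealer - 1, player - 1)
    n_visits = N[state_idx].sum()  # visits = sum over all actions tried here
    epsilon = N0 / (N0 + n_visits)
    if np.random.rand() < epsilon:
        return np.random.randint(Q.shape[-1])  # explore: uniform random action
    return int(np.argmax(Q[state_idx]))        # exploit: greedy action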
def select_action(self, state, i_episode):
    """
    Given the state, select an action.

    Params
    ======
    - state: the current state of the environment

    Returns
    =======
    - action: an integer, compatible with the task's action space
    """
    Qs = self.Q[state]
    policy = utils.epsilon_greedy_policy(Qs, self.nA, i_episode)
    return np.random.choice(self.nA, p=policy)
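# In select_action() above, utils.epsilon_greedy_policy returns action
# probabilities rather than an action, so the caller samples with
# np.random.choice. A minimal sketch, assuming a 1/i_episode decay schedule
# (the actual schedule inside utils is not shown):
import numpy as np

def epsilon_greedy_policy(Qs, nA, i_episode):
    """Return an epsilon-greedy probability vector over nA actions."""
    epsilon = 1.0 / i_episode                # assumed decay schedule
    policy = np.full(nA, epsilon / nA)       # spread exploration mass uniformly
    policy[np.argmax(Qs)] += 1.0 - epsilon   # put the rest on the greedy action
    return policy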
def epsilon_greedy_policy(self, dealer, epsilon=None):
    '''
    The greedy strategy here is parameterized by epsilon.
    '''
    player_points, _ = self.get_points()
    if player_points >= 21:
        return self.A[1]
    if player_points < 12:
        return self.A[0]
    else:
        A, Q = self.A, self.Q
        s = self.get_state_name(dealer)
        if epsilon is None:
            # epsilon decays as the agent accumulates learning episodes
            epsilon = 1.0 / (1 + 4 * math.log10(1 + self.total_learning_times))
        return epsilon_greedy_policy(A, s, Q, epsilon)
def learn(self):
    env = self.env
    Q = self.Q
    N = np.zeros(STATE_SPACE_SHAPE)
    for episode in range(1, self.num_episodes + 1):
        env.reset()
        state = env.observe()
        E = []  # experience from the episode
        while state != TERMINAL_STATE:
            action = epsilon_greedy_policy(Q, N, state)
            state_, reward = env.step(action)
            E.append([state, action, reward])
            state = state_
        for (dealer, player), action, reward in E:
            idx = dealer - 1, player - 1, action
            N[idx] += 1
            alpha = 1.0 / N[idx]
            Q[idx] += alpha * (reward - Q[idx])
    return Q
def learn(self):
    N0 = 100
    Ns = np.zeros(V_SHAPE)
    Nsa = np.zeros(Q_SHAPE)
    for i in range(self.num_episodes):
        print("Episode: " + str(i + 1))
        E = np.zeros(Q_SHAPE)
        self.env.reset()
        print("Your card: " + str(self.env.state.player_sum))
        print("Dealer's card: " + str(self.env.state.dealer_card))
        state1 = self.env.state
        epsilon = get_epsilon(N0, Ns, state1)
        action1 = epsilon_greedy_policy(epsilon, self.Q, state1)
        while state1 is not None:
            index1 = (state1.dealer_card - 1, state1.player_sum - 1, action1)
            Q1 = self.Q[index1]
            state2, reward = self.env.step(state1, action1)
            if state2 is not None:
                action2 = epsilon_greedy_policy(epsilon, self.Q, state2)
                index2 = (state2.dealer_card - 1, state2.player_sum - 1, action2)
                Q2 = self.Q[index2]
            else:
                Q2 = 0
            if reward == 1:
                self.wins += 1
            elif reward == -1:
                self.losses += 1
            else:
                self.draws += 1
            delta = reward + self.gamma * Q2 - Q1
            E[index1] += 1
            Ns[index1[0:2]] += 1
            Nsa[index1] += 1
            alpha = 1 / Nsa[index1]
            self.Q += alpha * delta * E
            E *= self.gamma * self.lmbd
            state1 = state2
            if state2 is not None:
                action1 = action2
                epsilon = get_epsilon(N0, Ns, state1)
        print("-------------------------------------------------------")
    info = "Wins: {}\nLosses: {}\nWin to Lose ratio: {}\nDraws: {}"
    info = info.format(self.wins, self.losses, self.wins / self.losses, self.draws)
    print(info)
    path = "results/sarsa_{}_{}".format(self.lmbd, self.num_episodes)
    with open(path + '_info.txt', 'a') as the_file:
        the_file.write(info)
    save_nd_arr(path + "_Q.txt", self.Q)
    plot_V(np.max(self.Q, axis=2), save=path + "_V_plot.png")
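# get_epsilon(N0, Ns, state) above is assumed to implement the standard
# Easy21 schedule epsilon = N0 / (N0 + N(s)), which matches the N0 = 100
# constant in learn(); a sketch consistent with the Ns indexing used there:
def get_epsilon(N0, Ns, state):
    """Epsilon decays with the number of visits to the current state."""
    n_visits = Ns[state.dealer_card - 1, state.player_sum - 1]
    return N0 / (N0 + n_visits)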
def policy(self, A, s, Q, epsilon):
    return epsilon_greedy_policy(A, s, Q, epsilon)
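# The (A, s, Q, epsilon) signature shared by the two policy() wrappers and the
# blackjack player's method above suggests a Q-table keyed by (state, action)
# pairs. A minimal sketch under that assumption -- the keying convention and
# the helper itself are not shown in these snippets:
import random

def epsilon_greedy_policy(A, s, Q, epsilon):
    """With probability epsilon act randomly, otherwise act greedily."""
    if random.random() < epsilon:
        return random.choice(A)  # explore
    # Exploit: the action with the highest Q-value in state s.
    return max(A, key=lambda a: Q.get((s, a), 0.0))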
for rotate_idx in range(len(grasp_predictions)):
    grasp_heatmap = utils.vis_affordance(grasp_predictions[rotate_idx])
    grasp_name = grasp_path[rotate_idx] + "grasp_{:06}.jpg".format(iteration)
    cv2.imwrite(grasp_name, grasp_heatmap)
    grasp_mixed_idx = cv2.addWeighted(color, 1.0, grasp_heatmap, 0.4, 0)
    grasp_mixed.append(grasp_mixed_idx)
    grasp_name = mixed_path + "grasp_{:06}_idx_{}.jpg".format(iteration, rotate_idx)
    cv2.imwrite(grasp_name, grasp_mixed_idx)
print "[{:.6f}]: suck max: \033[0;34m{}\033[0m| grasp max: \033[0;35m{}\033[0m".format(
    time.time(), np.max(suck_predictions), np.max(grasp_predictions))
explore = -1  # None
# Policy decider
if not testing:  # Train
    if not grasp_only:
        explore, action, action_str, pixel_index, angle = \
            utils.epsilon_greedy_policy(epsilon_, suck_predictions, grasp_predictions)
    else:
        explore, action, action_str, pixel_index, angle = \
            utils.grasp_epsilon_greedy_policy(epsilon_, grasp_predictions)
if testing:  # Test
    if not grasp_only:
        action, action_str, pixel_index, angle = utils.greedy_policy(suck_predictions, grasp_predictions)
    else:  # Grasp-only
        action = 0
        action_str = 'grasp'
        pixel_index, angle = utils.grasp_only_policy(grasp_predictions)
explore_list.append(explore)
if explore == 1:
    print "Use exploring..."
del suck_predictions, grasp_predictions, state_feat
print "[%f]: Take action: \033[0;31m %s\033[0m at \
\033[0;32m(%d, %d)\033[0m with theta \033[0;33m%f \033[0m" % (time.time(), action_str, pixel_index[1], \
color, depth, points = utils.get_heightmap(
    pc_response.pc, image_path, depth_path, iteration)
ts = time.time()
suck_1_prediction, suck_2_prediction, grasp_prediction = trainer.forward(
    color, depth, is_volatile=True)
print "Forward pass: {} seconds".format(time.time() - ts)
heatmaps, mixed_imgs = utils.save_heatmap_and_mixed(
    suck_1_prediction, suck_2_prediction, grasp_prediction,
    feat_paths, mixed_paths, color, iteration)
# Standardize predictions to avoid bias between them
#suck_1_prediction = utils.standarization(suck_1_prediction); suck_2_prediction = utils.standarization(suck_2_prediction)
#grasp_prediction = utils.standarization(grasp_prediction)
# SELECT ACTION
if not testing:  # Train
    explore, action, action_str, pixel_index, angle = utils.epsilon_greedy_policy(
        epsilon_, suck_1_prediction, suck_2_prediction, grasp_prediction,
        depth, diff_path, iteration, specific_tool)
else:  # Testing
    action, action_str, pixel_index, angle = utils.greedy_policy(
        suck_1_prediction, suck_2_prediction, grasp_prediction, specific_tool)
    explore = False
explore_list.append(explore)
target_list.append(pixel_index)
position_list.append(points[pixel_index[1], pixel_index[2]])
del suck_1_prediction, suck_2_prediction, grasp_prediction
utils.print_action(action_str, pixel_index, points[pixel_index[1], pixel_index[2]])
# Save (color heightmap + prediction heatmap + motion primitive and corresponding position), then show it
visual_img = utils.draw_image(
    mixed_imgs[pixel_index[0]], explore, pixel_index,
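# In the two robot-grasping excerpts above, utils.epsilon_greedy_policy acts
# over dense pixel-wise affordance maps rather than a tabular Q. A heavily
# simplified sketch of the idea for the two-map call in the first excerpt,
# assuming each prediction is a (num_rotations, H, W) array and ignoring the
# extra logging/bookkeeping arguments of the real helper:
import numpy as np

def epsilon_greedy_policy(epsilon, suck_predictions, grasp_predictions):
    """Pixel-wise epsilon-greedy over suction and grasp affordance maps."""
    explore = int(np.random.rand() < epsilon)
    if explore:
        # Explore: pick the primitive and the pixel uniformly at random.
        action = np.random.randint(2)  # 0 = suck, 1 = grasp
        predictions = suck_predictions if action == 0 else grasp_predictions
        pixel_index = tuple(np.random.randint(s) for s in predictions.shape)
    else:
        # Exploit: pick the primitive whose best pixel scores highest.
        action = int(np.max(grasp_predictions) > np.max(suck_predictions))
        predictions = suck_predictions if action == 0 else grasp_predictions
        pixel_index = np.unravel_index(np.argmax(predictions), predictions.shape)
    action_str = 'suck' if action == 0 else 'grasp'
    # The rotation index maps to a gripper angle (assumed evenly spaced bins).
    angle = pixel_index[0] * (360.0 / predictions.shape[0])
    return explore, action, action_str, pixel_index, angle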