def policy(self, A, s, Q, epsilon):  # overrides the base class's policy method
    '''
    Select an action with the epsilon-greedy strategy.
    return a: the action chosen by the policy
    '''
    return epsilon_greedy_policy(A, s, Q, epsilon)
def learn(self):
    env = self.env
    Q = self.Q
    N = np.zeros(STATE_SPACE_SHAPE)
    for episode in range(1, self.num_episodes + 1):
        env.reset()
        state1 = env.observe()
        E = np.zeros(STATE_SPACE_SHAPE)  # eligibility traces
        while state1 != TERMINAL_STATE:
            action1 = epsilon_greedy_policy(Q, N, state1)
            state2, reward = env.step(action1)
            dealer1, player1 = state1
            idx1 = (dealer1 - 1, player1 - 1, action1)
            Q1 = Q[idx1]
            if state2 == TERMINAL_STATE:
                Q2 = 0.0
            else:
                action2 = epsilon_greedy_policy(Q, N, state2)
                dealer2, player2 = state2
                idx2 = (dealer2 - 1, player2 - 1, action2)
                Q2 = Q[idx2]
            N[idx1] += 1
            E[idx1] += 1
            alpha = 1.0 / N[idx1]
            delta = reward + self.gamma * Q2 - Q1
            Q += alpha * delta * E
            E *= self.gamma * self.lmbd
            state1 = state2
        if self.save_error_history:
            self.error_history.append((episode, mse(self.Q, self.opt_Q)))
    return Q
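# The count-based epsilon_greedy_policy(Q, N, state) helper called in the
# SARSA(lambda) learn() above (and in the Monte Carlo learn() below) is not
# shown. A minimal sketch, assuming the common Easy21-style schedule
# epsilon = N0 / (N0 + N(s)) with an assumed constant N0; the indexing mirrors
# the (dealer - 1, player - 1, action) layout used above.
import numpy as np

N0 = 100  # assumed exploration constant, not shown in these snippets

def epsilon_greedy_policy(Q, N, state):
    """Count-based epsilon-greedy: epsilon decays as the state is revisited."""
    dealer, player = state
    state_idx = (dealer - 1, player - 1)
    n_visits = N[state_idx].sum()  # visits = sum over all actions tried here
    epsilon = N0 / (N0 + n_visits)
    if np.random.rand() < epsilon:
        return np.random.randint(Q.shape[-1])  # explore: uniform random action
    return int(np.argmax(Q[state_idx]))        # exploit: greedy action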
def select_action(self, state, i_episode):
    """
    Given the state, select an action.

    Params
    ======
    - state: the current state of the environment

    Returns
    =======
    - action: an integer, compatible with the task's action space
    """
    Qs = self.Q[state]
    policy = utils.epsilon_greedy_policy(Qs, self.nA, i_episode)
    return np.random.choice(self.nA, p=policy)
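# In select_action() above, utils.epsilon_greedy_policy returns action
# probabilities rather than an action, so the caller samples with
# np.random.choice. A minimal sketch, assuming a 1/i_episode decay schedule
# (the actual schedule inside utils is not shown):
import numpy as np

def epsilon_greedy_policy(Qs, nA, i_episode):
    """Return an epsilon-greedy probability vector over nA actions."""
    epsilon = 1.0 / i_episode                # assumed decay schedule
    policy = np.full(nA, epsilon / nA)       # spread exploration mass uniformly
    policy[np.argmax(Qs)] += 1.0 - epsilon   # put the rest on the greedy action
    return policy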
def epsilon_greedy_policy(self, dealer, epsilon=None):
    '''
    The greedy strategy here is parameterized by epsilon.
    '''
    player_points, _ = self.get_points()
    if player_points >= 21:
        return self.A[1]
    if player_points < 12:
        return self.A[0]
    else:
        A, Q = self.A, self.Q
        s = self.get_state_name(dealer)
        if epsilon is None:
            # epsilon decays as the agent accumulates learning episodes
            epsilon = 1.0 / (1 + 4 * math.log10(1 + self.total_learning_times))
        return epsilon_greedy_policy(A, s, Q, epsilon)
def learn(self):
    env = self.env
    Q = self.Q
    N = np.zeros(STATE_SPACE_SHAPE)
    for episode in range(1, self.num_episodes + 1):
        env.reset()
        state = env.observe()
        E = []  # experience from the episode
        while state != TERMINAL_STATE:
            action = epsilon_greedy_policy(Q, N, state)
            state_, reward = env.step(action)
            E.append([state, action, reward])
            state = state_
        for (dealer, player), action, reward in E:
            idx = dealer - 1, player - 1, action
            N[idx] += 1
            alpha = 1.0 / N[idx]
            Q[idx] += alpha * (reward - Q[idx])
    return Q
def learn(self):
    N0 = 100
    Ns = np.zeros(V_SHAPE)
    Nsa = np.zeros(Q_SHAPE)
    for i in range(self.num_episodes):
        print("Episode: " + str(i + 1))
        E = np.zeros(Q_SHAPE)
        self.env.reset()
        print("Your card: " + str(self.env.state.player_sum))
        print("Dealer's card: " + str(self.env.state.dealer_card))
        state1 = self.env.state
        epsilon = get_epsilon(N0, Ns, state1)
        action1 = epsilon_greedy_policy(epsilon, self.Q, state1)
        while state1 is not None:
            index1 = (state1.dealer_card - 1, state1.player_sum - 1, action1)
            Q1 = self.Q[index1]
            state2, reward = self.env.step(state1, action1)
            if state2 is not None:
                action2 = epsilon_greedy_policy(epsilon, self.Q, state2)
                index2 = (state2.dealer_card - 1, state2.player_sum - 1, action2)
                Q2 = self.Q[index2]
            else:
                Q2 = 0
            if reward == 1:
                self.wins += 1
            elif reward == -1:
                self.losses += 1
            else:
                self.draws += 1
            delta = reward + self.gamma * Q2 - Q1
            E[index1] += 1
            Ns[index1[0:2]] += 1
            Nsa[index1] += 1
            alpha = 1 / Nsa[index1]
            self.Q += alpha * delta * E
            E *= self.gamma * self.lmbd
            state1 = state2
            if state2 is not None:
                action1 = action2
                epsilon = get_epsilon(N0, Ns, state1)
        print("-------------------------------------------------------")
    info = "Wins: {}\nLosses: {}\nWin to Lose ratio: {}\nDraws: {}"
    info = info.format(self.wins, self.losses, self.wins / self.losses, self.draws)
    print(info)
    path = "results/sarsa_{}_{}".format(self.lmbd, self.num_episodes)
    with open(path + '_info.txt', 'a') as the_file:
        the_file.write(info)
    save_nd_arr(path + "_Q.txt", self.Q)
    plot_V(np.max(self.Q, axis=2), save=path + "_V_plot.png")
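# get_epsilon(N0, Ns, state) above is assumed to implement the standard
# Easy21 schedule epsilon = N0 / (N0 + N(s)), which matches the N0 = 100
# constant in learn(); a sketch consistent with the Ns indexing used there:
def get_epsilon(N0, Ns, state):
    """Epsilon decays with the number of visits to the current state."""
    n_visits = Ns[state.dealer_card - 1, state.player_sum - 1]
    return N0 / (N0 + n_visits)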
def policy(self, A, s, Q, epsilon):
    return epsilon_greedy_policy(A, s, Q, epsilon)
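# The (A, s, Q, epsilon) signature shared by the two policy() wrappers and the
# blackjack player's method above suggests a Q-table keyed by (state, action)
# pairs. A minimal sketch under that assumption -- the keying convention and
# the helper itself are not shown in these snippets:
import random

def epsilon_greedy_policy(A, s, Q, epsilon):
    """With probability epsilon act randomly, otherwise act greedily."""
    if random.random() < epsilon:
        return random.choice(A)  # explore
    # Exploit: the action with the highest Q-value in state s.
    return max(A, key=lambda a: Q.get((s, a), 0.0))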
for rotate_idx in range(len(grasp_predictions)):
    grasp_heatmap = utils.vis_affordance(grasp_predictions[rotate_idx])
    grasp_name = grasp_path[rotate_idx] + "grasp_{:06}.jpg".format(iteration)
    cv2.imwrite(grasp_name, grasp_heatmap)
    grasp_mixed_idx = cv2.addWeighted(color, 1.0, grasp_heatmap, 0.4, 0)
    grasp_mixed.append(grasp_mixed_idx)
    grasp_name = mixed_path + "grasp_{:06}_idx_{}.jpg".format(iteration, rotate_idx)
    cv2.imwrite(grasp_name, grasp_mixed_idx)
print "[{:.6f}]: suck max: \033[0;34m{}\033[0m| grasp max: \033[0;35m{}\033[0m".format(
    time.time(), np.max(suck_predictions), np.max(grasp_predictions))
explore = -1  # None
# Policy decider
if not testing:  # Train
    if not grasp_only:
        explore, action, action_str, pixel_index, angle = \
            utils.epsilon_greedy_policy(epsilon_, suck_predictions, grasp_predictions)
    else:
        explore, action, action_str, pixel_index, angle = \
            utils.grasp_epsilon_greedy_policy(epsilon_, grasp_predictions)
if testing:  # Test
    if not grasp_only:
        action, action_str, pixel_index, angle = utils.greedy_policy(suck_predictions, grasp_predictions)
    else:  # Grasp-only
        action = 0
        action_str = 'grasp'
        pixel_index, angle = utils.grasp_only_policy(grasp_predictions)
explore_list.append(explore)
if explore == 1:
    print "Use exploring..."
del suck_predictions, grasp_predictions, state_feat
print "[%f]: Take action: \033[0;31m %s\033[0m at \
\033[0;32m(%d, %d)\033[0m with theta \033[0;33m%f \033[0m" % (time.time(), action_str, pixel_index[1], \
color, depth, points = utils.get_heightmap(
    pc_response.pc, image_path, depth_path, iteration)
ts = time.time()
suck_1_prediction, suck_2_prediction, grasp_prediction = trainer.forward(
    color, depth, is_volatile=True)
print "Forward pass: {} seconds".format(time.time() - ts)
heatmaps, mixed_imgs = utils.save_heatmap_and_mixed(
    suck_1_prediction, suck_2_prediction, grasp_prediction,
    feat_paths, mixed_paths, color, iteration)
# Standardize predictions to avoid bias between them
#suck_1_prediction = utils.standarization(suck_1_prediction); suck_2_prediction = utils.standarization(suck_2_prediction)
#grasp_prediction = utils.standarization(grasp_prediction)
# SELECT ACTION
if not testing:  # Train
    explore, action, action_str, pixel_index, angle = utils.epsilon_greedy_policy(
        epsilon_, suck_1_prediction, suck_2_prediction, grasp_prediction,
        depth, diff_path, iteration, specific_tool)
else:  # Testing
    action, action_str, pixel_index, angle = utils.greedy_policy(
        suck_1_prediction, suck_2_prediction, grasp_prediction, specific_tool)
    explore = False
explore_list.append(explore)
target_list.append(pixel_index)
position_list.append(points[pixel_index[1], pixel_index[2]])
del suck_1_prediction, suck_2_prediction, grasp_prediction
utils.print_action(action_str, pixel_index, points[pixel_index[1], pixel_index[2]])
# Save (color heightmap + prediction heatmap + motion primitive and corresponding position), then show it
visual_img = utils.draw_image(
    mixed_imgs[pixel_index[0]], explore, pixel_index,
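# In the two robot-grasping excerpts above, utils.epsilon_greedy_policy acts
# over dense pixel-wise affordance maps rather than a tabular Q. A heavily
# simplified sketch of the idea for the two-map call in the first excerpt,
# assuming each prediction is a (num_rotations, H, W) array and ignoring the
# extra logging/bookkeeping arguments of the real helper:
import numpy as np

def epsilon_greedy_policy(epsilon, suck_predictions, grasp_predictions):
    """Pixel-wise epsilon-greedy over suction and grasp affordance maps."""
    explore = int(np.random.rand() < epsilon)
    if explore:
        # Explore: pick the primitive and the pixel uniformly at random.
        action = np.random.randint(2)  # 0 = suck, 1 = grasp
        predictions = suck_predictions if action == 0 else grasp_predictions
        pixel_index = tuple(np.random.randint(s) for s in predictions.shape)
    else:
        # Exploit: pick the primitive whose best pixel scores highest.
        action = int(np.max(grasp_predictions) > np.max(suck_predictions))
        predictions = suck_predictions if action == 0 else grasp_predictions
        pixel_index = np.unravel_index(np.argmax(predictions), predictions.shape)
    action_str = 'suck' if action == 0 else 'grasp'
    # The rotation index maps to a gripper angle (assumed evenly spaced bins).
    angle = pixel_index[0] * (360.0 / predictions.shape[0])
    return explore, action, action_str, pixel_index, angle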