Example #1
    def play_game(self):
        '''
        Simulate an actual game till the agent loses.
        '''
        # Play num_game games and record the number of paddle rebounds in each.
        num_game = 1000
        result = []

        for i in range(num_game):
            print("Round: " + str(i + 1))
            paddle_height = 0.2
            end_sign = False
            counter = 0

            game = MDP(0.5, 0.5, 0.03, 0.01, 0.5 - paddle_height / 2)
            game.discretize_state()
            while not end_sign:
                action = self.f_function(game)
                end_sign, reward = game.simulate_one_time_step(action)
                game.discretize_state()
                if reward == 1:
                    counter += 1

            print "Rebound:" + str(counter)
            result.append(counter)
        print "the mean is " + str(statistics.mean(result))
        print "the variance is " + str(statistics.stdev(result))

Example #2
    def train_agent(self):
        '''
        Train the agent over a certain number of games.
        '''
        # Run Q-learning over num_games self-played games.
        print("Training")
        for i in range(self.num_games):
            game = MDP(0.5, 0.5, 0.03, 0.01, 0.5 - 0.2 / 2)
            game.discretize_state()  # discretize the initial state
            end_sign = False

            while not end_sign:

                act = self.f_function(game)
                cur_idx = game.find_state()  # index of the current discretized state

                end_sign, reward = game.simulate_one_time_step(act)  # simulate one time step
                game.discretize_state()  # discretize the new state
                next_idx = game.find_state()  # index of the next discretized state

                utility_list = self.Q_table[next_idx]  # Q values (q1, q2, q3) of the next state
                max_q_next = max(utility_list)

                self.Q_table[cur_idx][act] = (
                    1 - self.alpha_value
                ) * self.Q_table[cur_idx][act] + self.alpha_value * (
                    self.gamma_val * max_q_next + reward)
                #game.print_state()
            #print "===end round==="

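For reference, the assignment inside the loop is the standard tabular Q-learning update, Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a')), with cur_idx playing the role of s, act of a, and next_idx of s'.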
Example #3
 def train_agent(self):
     '''
     Train the agent over a certain number of games.
     '''
     for i in range(0, self.num_games):
         self.mdp = MDP(0.5, 0.5, 0.03, 0.01, 0.4)
         self.play_game()
     
Example #4
    def train_agent(self, should_show_gui):
        '''
        Train the agent over a certain number of games.
        '''
        if should_show_gui:
            win = GraphWin('Pong game', 500, 500)
        ball_count = 0
        for i in range(self.num_games):
            if should_show_gui:
                mdpInstance = MDP(0.5, 0.5, 0.03, 0.01, 0.5 - .2 / 2, win)
            else:
                mdpInstance = MDP(0.5, 0.5, 0.03, 0.01, 0.5 - .2 / 2, None)
            self.play_game(mdpInstance)
            ball_count += mdpInstance.get_ball_count()
        if should_show_gui:
            win.close()

        print("average: ", float(ball_count) / float(self.num_games))
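GraphWin here presumably comes from John Zelle's graphics.py module (from graphics import GraphWin); the arguments 500, 500 are the window width and height in pixels.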
Example #5
    def play_game(self):
        '''
        Simulate an actual game till the agent loses.
        '''
        reward = 0

        self.curState = MDP()
        self.d_curState = self.curState.discretize_state()
        while self.d_curState != 10368:  # 10368 indexes the terminal (losing) state
            old_State = self.d_curState
            action_selected = self.f_function()
            reward = self.curState.simulate_one_time_step(action_selected)

            if (reward == 1):
                #print reward
                self.bounced = self.bounced + 1

            next_State = self.curState.discretize_state()
            self.updateQ(old_State, reward, action_selected, next_State)
            # advance to the new discretized state so the loop can terminate
            self.d_curState = next_State
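updateQ itself is not shown in this snippet; presumably it applies the standard tabular Q-learning rule, roughly along these lines (a sketch only: the names q_table, alpha and gamma are assumptions, not attributes confirmed above):

    # Minimal sketch of a tabular Q-learning update; q_table, alpha and gamma
    # are stand-ins for whatever attributes the full class actually uses.
    def update_q(q_table, old_state, action, reward, next_state, alpha, gamma):
        best_next = max(q_table[next_state])          # max_a' Q(s', a')
        q_table[old_state][action] = ((1 - alpha) * q_table[old_state][action]
                                      + alpha * (reward + gamma * best_next))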
Example #6
 def __init__(self,
              num_games=0,
              alpha_value=0,
              gamma_value=0,
              epsilon_value=0):
     '''
     Setup the Simulator with the provided values.
     :param num_games - number of games to be trained on.
     :param alpha_value - 1/alpha_value is the decay constant.
     :param gamma_value - Discount Factor.
     :param epsilon_value - Probability value for the epsilon-greedy approach.
     '''
     self.num_games = num_games
     self.epsilon_value = epsilon_value
     self.alpha_value = alpha_value
     self.gamma_val = gamma_value
     self.q_table = {}
     self.mdp = MDP()
     self.hits = 0
     self.train_agent()
Example #7
 def __init__(self, num_games=0, alpha_value=0, gamma_value=0, epsilon_value=0):
     '''
     Setup the Simulator with the provided values.
     :param num_games - number of games to be trained on.
     :param alpha_value - 1/alpha_value is the decay constant.
     :param gamma_value - Discount Factor.
     :param epsilon_value - Probability value for the epsilon-greedy approach.
     '''
     self.num_games = num_games       
     self.epsilon_value = epsilon_value       
     self.alpha_value = alpha_value       
     self.gamma_val = gamma_value
     
     self.total_rebounds = 0
     self.mdp = MDP(0.5, 0.5, 0.03, 0.01, 0.4)
     #self.Q = [[[[[[0] * 3] * 12] * 3] * 2] * 12] * 12
     self.Q = [[[[[[0 for i in range(3)] for j in range(12)]
                  for k in range(3)] for l in range(2)]
                for m in range(12)] for n in range(12)]
     #self.R = [[[[[0] * 12] * 3] * 2] * 12] * 12
     self.R = [[[[[0 for i in range(12)] for j in range(3)]
                 for k in range(2)] for l in range(12)]
               for m in range(12)]
     
     self.train_agent()
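The commented-out versions of Q and R above were presumably rejected because replicating a nested list with * copies references rather than independent rows, e.g.:

    rows = [[0] * 3] * 2          # two references to the same inner list
    rows[0][0] = 1
    print(rows)                   # [[1, 0, 0], [1, 0, 0]] -- both rows change
    safe = [[0 for _ in range(3)] for _ in range(2)]
    safe[0][0] = 1
    print(safe)                   # [[1, 0, 0], [0, 0, 0]]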
Example #8
    def f_function(self):
        '''
        Choose action based on an epsilon greedy approach
        :return action selected
        '''
        action_selected = None

        self.mdp.discretize_state()
        # Q values and reward entry for the current discretized state.
        q_state = self.Q[self.mdp.dis_ball_x][self.mdp.dis_ball_y][self.mdp.dis_velocity_x][self.mdp.dis_velocity_y][self.mdp.dis_paddle_y]
        r_state = self.R[self.mdp.dis_ball_x][self.mdp.dis_ball_y][self.mdp.dis_velocity_x][self.mdp.dis_velocity_y][self.mdp.dis_paddle_y]

        if random.random() > self.epsilon_value:
            # Exploit: pick the action with the largest Q value.
            curr_max = 0
            for a in range(0, 3):
                if q_state[a] >= curr_max:
                    curr_max = q_state[a]
                    action_selected = a
            # If they're all zeros, choose one at random
            if curr_max == 0:
                action_selected = floor(random.random() * 3)
        else:
            # Explore: choose one of the three actions at random.
            action_selected = floor(random.random() * 3)

        # Create a temporary MDP for help with the Q learning formula, in order to look forward into the future
        temp_mdp = MDP(self.mdp.ball_x, self.mdp.ball_y, self.mdp.velocity_x,
                       self.mdp.velocity_y, self.mdp.paddle_y)
        temp_mdp.simulate_one_time_step(action_selected)
        temp_mdp.discretize_state()
        max_a_prime = max(self.Q[temp_mdp.dis_ball_x][temp_mdp.dis_ball_y][temp_mdp.dis_velocity_x][temp_mdp.dis_velocity_y][temp_mdp.dis_paddle_y])

        # Update Q via the learning function
        q_state[action_selected] = (1 - self.alpha_value) * q_state[action_selected] \
            + self.alpha_value * (r_state + self.gamma_val * max_a_prime)

        return action_selected
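A design note on this example: unlike Examples #2 and #5, which update Q after observing the real transition, this f_function folds the update into action selection by copying the current state into temp_mdp and simulating one step ahead to obtain max_a' Q(s', a'); the reward term is read from the separate R table rather than returned by simulate_one_time_step.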
Example #9

    # Define a cube state space
    sC = Cube(intervals=np.asarray([[0, 10], [0, 10], [0, 10], [0, 10]]),
              isContinuous=False)
    # Define action space
    aC = Cube(intervals=np.asarray([[0, 5], [0, 5], [0, 5], [0, 5]]),
              isContinuous=False)

    # Define exogenous noise space
    nC = Cube(intervals=np.asarray([[0, 8], [0, 8], [0, 8], [0, 8]]),
              isContinuous=False)

    # As an illustration, we define the following MDP transition kernel.
    T = Transition(sC, aC)

    # MDP cost/reward function can be defined using the "Objective" class.
    O = Objective(sC, aC, False, False)

    # Construct a finite horizon MDP
    mdp = MDP(initState=np.array([5, 5, 5, 5]),
              sSpace=sC,
              aSpace=aC,
              nSpace=nC,
              transition=T,
              objective=O,
              isFiniteHorizon=10,
              isAveCost=False,
              terminalStates=None)