def play_game(self):
    '''
    Simulate an actual game till the agent loses.
    '''
    num_games = 1000
    results = []
    for i in range(num_games):
        print("Round: " + str(i + 1))
        paddle_height = 0.2
        end_sign = False
        counter = 0
        game = MDP(0.5, 0.5, 0.03, 0.01, 0.5 - paddle_height / 2)
        game.discretize_state()
        while not end_sign:
            action = self.f_function(game)
            end_sign, reward = game.simulate_one_time_step(action)
            game.discretize_state()
            if reward == 1:
                counter += 1
        print("Rebounds: " + str(counter))
        results.append(counter)
    # Requires `import statistics` at module level.
    print("the mean is " + str(statistics.mean(results)))
    print("the standard deviation is " + str(statistics.stdev(results)))
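# Side note (not from the original code): the statistics module distinguishes the
# sample variance from the standard deviation, so the printed label should match
# whichever quantity is actually computed. A minimal standalone check with
# made-up rebound counts:
import statistics

rebounds = [9, 12, 7, 14, 10]  # hypothetical per-game rebound counts
print("mean:", statistics.mean(rebounds))
print("variance:", statistics.variance(rebounds))  # sample variance
print("stdev:", statistics.stdev(rebounds))        # square root of the sample variance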
def train_agent(self):
    '''
    Train the agent over a certain number of games.
    '''
    print("Training")
    for i in range(self.num_games):
        game = MDP(0.5, 0.5, 0.03, 0.01, 0.5 - 0.2 / 2)
        game.discretize_state()  # discretize the first state
        end_sign = False
        while not end_sign:
            act = self.f_function(game)
            cur_idx = game.find_state()  # index of the current state
            end_sign, reward = game.simulate_one_time_step(act)  # simulate one time step
            game.discretize_state()  # discretize the successor state
            next_idx = game.find_state()  # index of the successor state
            utility_list = self.Q_table[next_idx]  # (q1, q2, q3) of the successor state
            max_q_next = max(utility_list)
            self.Q_table[cur_idx][act] = (1 - self.alpha_value) * self.Q_table[cur_idx][act] \
                + self.alpha_value * (self.gamma_val * max_q_next + reward)
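# The inner-loop assignment above is the standard tabular Q-learning update.
# A minimal standalone sketch of the same rule (q_table, alpha and gamma are
# illustrative names here, not the class attributes used above):
def q_learning_update(q_table, s, a, reward, s_next, alpha, gamma):
    """One tabular step: Q(s,a) <- (1 - alpha) * Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a'))."""
    best_next = max(q_table[s_next])
    q_table[s][a] = (1 - alpha) * q_table[s][a] + alpha * (reward + gamma * best_next)

# Example usage with a tiny two-state table:
#   q = {0: [0.0, 0.0, 0.0], 1: [0.0, 0.0, 0.0]}
#   q_learning_update(q, s=0, a=2, reward=1, s_next=1, alpha=0.5, gamma=0.9)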
def train_agent(self):
    '''
    Train the agent over a certain number of games.
    '''
    for i in range(self.num_games):
        self.mdp = MDP(0.5, 0.5, 0.03, 0.01, 0.4)
        self.play_game()
def train_agent(self, should_show_gui):
    '''
    Train the agent over a certain number of games.
    '''
    if should_show_gui:
        win = GraphWin('Pong game', 500, 500)
    ball_count = 0
    for i in range(self.num_games):
        if should_show_gui:
            mdpInstance = MDP(0.5, 0.5, 0.03, 0.01, 0.5 - 0.2 / 2, win)
        else:
            mdpInstance = MDP(0.5, 0.5, 0.03, 0.01, 0.5 - 0.2 / 2, None)
        self.play_game(mdpInstance)
        ball_count += MDP.get_ball_count(mdpInstance)
    if should_show_gui:
        win.close()
    print("average: ", float(ball_count) / float(self.num_games))
def play_game(self):
    '''
    Simulate an actual game till the agent loses.
    '''
    reward = 0
    self.curState = MDP()
    self.d_curState = self.curState.discretize_state()
    while self.d_curState != 10368:  # 10368 is the index of the terminal (losing) state
        old_State = self.d_curState
        action_selected = self.f_function()
        reward = self.curState.simulate_one_time_step(action_selected)
        if reward == 1:  # the paddle rebounded the ball
            self.bounced += 1
        next_State = self.curState.discretize_state()
        self.updateQ(old_State, reward, action_selected, next_State)
        self.d_curState = next_State  # advance to the discretized successor state
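# updateQ is referenced above but not shown in this excerpt. A minimal sketch of
# what such a helper typically does, assuming a q_table dict mapping a discretized
# state index to a list with one Q-value per paddle action (the attribute names
# are assumptions, not taken from the original class):
def updateQ(self, old_state, reward, action, next_state):
    row = self.q_table.setdefault(old_state, [0.0, 0.0, 0.0])
    # The terminal state may have no entry; fall back to zeros in that case.
    max_next = max(self.q_table.get(next_state, [0.0, 0.0, 0.0]))
    row[action] = (1 - self.alpha_value) * row[action] \
        + self.alpha_value * (reward + self.gamma_val * max_next)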
def __init__(self, num_games=0, alpha_value=0, gamma_value=0, epsilon_value=0):
    '''
    Setup the Simulator with the provided values.

    :param num_games - number of games to be trained on.
    :param alpha_value - 1/alpha_value is the decay constant.
    :param gamma_value - Discount Factor.
    :param epsilon_value - Probability value for the epsilon-greedy approach.
    '''
    self.num_games = num_games
    self.epsilon_value = epsilon_value
    self.alpha_value = alpha_value
    self.gamma_val = gamma_value
    self.q_table = {}
    self.mdp = MDP()
    self.hits = 0
    self.train_agent()
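# A possible way to instantiate the class above; the hyperparameter values are
# purely illustrative, and note that the constructor kicks off training immediately:
sim = Simulator(num_games=100000, alpha_value=0.5, gamma_value=0.99, epsilon_value=0.05)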
def __init__(self, num_games=0, alpha_value=0, gamma_value=0, epsilon_value=0):
    '''
    Setup the Simulator with the provided values.

    :param num_games - number of games to be trained on.
    :param alpha_value - 1/alpha_value is the decay constant.
    :param gamma_value - Discount Factor.
    :param epsilon_value - Probability value for the epsilon-greedy approach.
    '''
    self.num_games = num_games
    self.epsilon_value = epsilon_value
    self.alpha_value = alpha_value
    self.gamma_val = gamma_value
    self.total_rebounds = 0
    self.mdp = MDP(0.5, 0.5, 0.03, 0.01, 0.4)
    # Q[ball_x][ball_y][velocity_x][velocity_y][paddle_y][action]
    # self.Q = [[[[[[0] * 3] * 12] * 3] * 2] * 12] * 12
    self.Q = [[[[[[0 for i in range(3)] for j in range(12)] for k in range(3)]
                for l in range(2)] for m in range(12)] for n in range(12)]
    # R[ball_x][ball_y][velocity_x][velocity_y][paddle_y]
    # self.R = [[[[[0] * 12] * 3] * 2] * 12] * 12
    self.R = [[[[[0 for i in range(12)] for j in range(3)] for k in range(2)]
               for l in range(12)] for m in range(12)]
    self.train_agent()
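# The commented-out constructions with * were presumably dropped because list
# replication shares one inner list across all rows, while the comprehensions
# build independent rows. A quick standalone demonstration:
shared = [[0] * 3] * 4
shared[0][0] = 1
print(shared)       # [[1, 0, 0], [1, 0, 0], [1, 0, 0], [1, 0, 0]]

independent = [[0 for _ in range(3)] for _ in range(4)]
independent[0][0] = 1
print(independent)  # [[1, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]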
def f_function(self):
    '''
    Choose action based on an epsilon greedy approach
    :return action selected
    '''
    action_selected = None
    self.mdp.discretize_state()
    # Local aliases for the Q-value row and reward of the current discretized state.
    q_s = self.Q[self.mdp.dis_ball_x][self.mdp.dis_ball_y][self.mdp.dis_velocity_x][self.mdp.dis_velocity_y][self.mdp.dis_paddle_y]
    r_s = self.R[self.mdp.dis_ball_x][self.mdp.dis_ball_y][self.mdp.dis_velocity_x][self.mdp.dis_velocity_y][self.mdp.dis_paddle_y]
    if random.random() > self.epsilon_value:
        # Exploit: pick the action with the largest Q-value.
        curr_max = 0
        for a in range(3):
            if q_s[a] >= curr_max:
                curr_max = q_s[a]
                action_selected = a
        # If they're all zeros, choose one at random.
        if curr_max == 0:
            action_selected = random.randrange(3)
    else:
        # Explore: choose an action uniformly at random.
        action_selected = random.randrange(3)

    # Create a temporary MDP to look one step into the future for the Q-learning update.
    temp_mdp = MDP(self.mdp.ball_x, self.mdp.ball_y, self.mdp.velocity_x,
                   self.mdp.velocity_y, self.mdp.paddle_y)
    temp_mdp.simulate_one_time_step(action_selected)
    temp_mdp.discretize_state()
    q_next = self.Q[temp_mdp.dis_ball_x][temp_mdp.dis_ball_y][temp_mdp.dis_velocity_x][temp_mdp.dis_velocity_y][temp_mdp.dis_paddle_y]
    max_a_prime = max(q_next)

    # Update Q via the learning rule:
    # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (R(s) + gamma * max_a' Q(s', a'))
    q_s[action_selected] = (1 - self.alpha_value) * q_s[action_selected] \
        + self.alpha_value * (r_s + self.gamma_val * max_a_prime)
    return action_selected
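# Seen in isolation, the action-selection part of f_function above is plain
# epsilon-greedy. A minimal self-contained sketch over a flat list of Q-values
# (the function and argument names are illustrative):
import random

def epsilon_greedy(q_values, epsilon):
    """Return a random action index with probability epsilon, else a greedy one."""
    if random.random() < epsilon:
        return random.randrange(len(q_values))
    best = max(q_values)
    # Break ties (including the all-zero case) at random among the maximizers.
    return random.choice([a for a, q in enumerate(q_values) if q == best])

# Keeping the selection separate from the Q update (which f_function above also
# performs) tends to make both parts easier to test on their own.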
# cost.exceptObjective(1, 1)

# Define a cube state space.
sC = Cube(intervals=np.asarray([[0, 10], [0, 10], [0, 10], [0, 10]]), isContinuous=False)
# Define the action space.
aC = Cube(intervals=np.asarray([[0, 5], [0, 5], [0, 5], [0, 5]]), isContinuous=False)
# Define the exogenous noise space.
nC = Cube(intervals=np.asarray([[0, 8], [0, 8], [0, 8], [0, 8]]), isContinuous=False)

# As an illustration, we define the following MDP transition kernel.
T = Transition(sC, aC)

# The MDP cost/reward function can be defined using the "Objective" class.
O = Objective(sC, aC, False, False)

# Construct a finite-horizon MDP.
mdp = MDP(initState=np.array([5, 5, 5, 5]),
          sSpace=sC,
          aSpace=aC,
          nSpace=nC,
          transition=T,
          objective=O,
          isFiniteHorizon=10,
          isAveCost=False,
          terminalStates=None)