def Q_learning(gamma, numRounds, alpha):
    states = darts.get_states()
    actions = darts.get_actions()

    Q = {}
    for s in states:
        Q[s] = [0] * len(actions)

    totaliter = 0
    for i in range(numRounds):
        s = throw.START_SCORE
        numiterations = 0
        while s > 0:
            randAction = random.randint(0, len(actions) - 1)
            maxAction = Q[s].index(max(Q[s]))
            a = ex_strategy_one(numRounds, i, randAction, maxAction)
            #a = ex_strategy_two(numRounds, i, Q, len(actions), s)
            action = actions[a]

            s_prime = s - throw.location_to_score(action)
            if s_prime < 0:
                s_prime = s

            maxQ = 0.0
            for a_prime in range(len(actions)):
                if Q[s_prime][a_prime] > maxQ:
                    maxQ = Q[s_prime][a_prime]

            Q[s][a] = Q[s][a] + alpha * (darts.R(s, actions[a]) + (gamma * maxQ) - Q[s][a])
            s = s_prime
            numiterations += 1
        totaliter += numiterations

    print "Average number of throws: " + str(float(totaliter) / numRounds)
def Q_learning(gamma, numRounds, alpha):
    states = darts.get_states()
    actions = darts.get_actions()
    currentRound = 0

    Q = {}
    for s in states:
        Q[s] = [0] * len(actions)

    for i in range(numRounds):
        s = throw.START_SCORE
        numiterations = 0
        while s > 0:
            # randint is inclusive, so the upper bound is len(actions) - 1
            randAction = random.randint(0, len(actions) - 1)
            maxAction = Q[s].index(max(Q[s]))
            #a = ex_strategy_one(Q, randAction, maxAction)
            a = ex_strategy_two(numRounds, currentRound, Q, len(actions), s)
            action = actions[a]

            s_prime = s - throw.location_to_score(action)
            if s_prime < 0:
                s_prime = s

            maxQ = 0.0
            for a_prime in range(len(actions)):
                if Q[s_prime][a_prime] > maxQ:
                    maxQ = Q[s_prime][a_prime]

            Q[s][a] = Q[s][a] + alpha * (darts.R(s, actions[a]) + (gamma * maxQ) - Q[s][a])
            s = s_prime
        currentRound += 1
def Q_learning(gamma, numRounds, alpha):
    states = darts.get_states()
    actions = darts.get_actions()

    Q = {}
    for s in states:
        Q[s] = [0] * len(actions)

    for i in range(numRounds):
        s = throw.START_SCORE
        numiterations = 0
        while s > 0:
            # randint is inclusive, so the upper bound is len(actions) - 1
            randAction = random.randint(0, len(actions) - 1)
            maxAction = Q[s].index(max(Q[s]))
            #a = ex_strategy_one(Q, randAction, maxAction)
            a = ex_strategy_two(Q, randAction, maxAction)
            action = actions[a]

            s_prime = s - throw.location_to_score(action)
            if s_prime < 0:
                s_prime = s

            maxQ = 0.0
            for a_prime in range(len(actions)):
                if Q[s_prime][a_prime] > maxQ:
                    maxQ = Q[s_prime][a_prime]

            Q[s][a] = Q[s][a] + alpha * (darts.R(s, actions[a]) + (gamma * maxQ) - Q[s][a])
            s = s_prime
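# The Q-learning routines above call exploration strategies (ex_strategy_one,
# ex_strategy_two) that are defined elsewhere. A minimal epsilon-greedy sketch
# is given below, assuming the (numRounds, currentRound, randAction, maxAction)
# signature used in the first version above and the module-level random import
# shown in the skeleton later in this file; the linear decay schedule is an
# illustrative assumption, not the original implementation.
def ex_strategy_one(numRounds, currentRound, randAction, maxAction):
    # Explore with probability epsilon, which decays linearly over training.
    epsilon = 1.0 - float(currentRound) / numRounds
    if random.random() < epsilon:
        return randAction   # explore: random action index
    return maxAction        # exploit: greedy action index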
def infiniteValueIteration(gamma):
    # takes a discount factor gamma and convergence cutoff epsilon
    # returns
    V = {}
    Q = {}
    V_prime = {}

    states = darts.get_states()
    actions = darts.get_actions()

    notConverged = True

    # initialize value of each state to 0
    for s in states:
        V[s] = 0
        Q[s] = {}

    # until convergence is reached
    while notConverged:
        # store values from previous iteration
        for s in states:
            V_prime[s] = V[s]

        # update Q, pi, and V
        for s in states:
            for a in actions:
                # given current state and action, sum product of T and V over all states
                summand = 0
                for s_prime in states:
                    summand += T(a, s, s_prime) * V_prime[s_prime]

                # update Q
                Q[s][a] = darts.R(s, a) + gamma * summand

            # given current state, store the action that maximizes V in pi and the corresponding value in V
            PI[s] = actions[0]
            V[s] = Q[s][PI[s]]
            for a in actions:
                if V[s] <= Q[s][a]:
                    V[s] = Q[s][a]
                    PI[s] = a

        notConverged = False
        for s in states:
            if abs(V[s] - V_prime[s]) > EPSILON:
                notConverged = True

    # Print table of optimal policy, for writeup purposes
    def print_table():
        for s in [9, 8, 7, 6, 5, 4, 3, 2, 1]:
            EPs = map(lambda a: EPoints(a, s), actions)
            print " " + str(s) + " & (" + str(PI[s].wedge) + ", " + str(PI[s].ring) + ") & " + str(EPoints(PI[s], s)) + " & [" + str(min(EPs)) + ", " + str(max(EPs)) + "] & " + str(s - EPoints(PI[s], s)) + " \\\\"

    print V
    print_table()
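# print_table above uses an EPoints helper that is not shown. A plausible
# sketch, assuming EPoints(a, s) is the expected number of points scored when
# aiming at action a from score s, computed from the same transition model
# T(a, s, s_prime) used by the value iteration:
def EPoints(a, s):
    # Expected points = weighted drop in score over all possible next scores.
    expected = 0.0
    for s_prime in darts.get_states():
        expected += T(a, s, s_prime) * (s - s_prime)
    return expected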
def start_game(gamma, learning_rate, num_games):
    actions = darts.get_actions()
    states = darts.get_states()

    Q_learning(gamma, learning_rate, num_games)

    a = q_values[throw.START_SCORE].index(max(q_values[throw.START_SCORE]))
    return actions[a]
def start_game():
    global states, actions, Q

    states = darts.get_states()
    actions = darts.get_actions()

    for s in states:
        Q[s] = {}
        for a in range(len(actions)):
            Q[s][a] = 0

    return throw.location(throw.INNER_RING, throw.NUM_WEDGES)
def infiniteValueIteration(gamma):
    # takes a discount factor gamma and convergence cutoff epsilon
    # returns
    V = {}
    Q = {}
    V_prime = {}

    states = darts.get_states()
    actions = darts.get_actions()

    notConverged = True

    # initialize value of each state to 0
    for s in states:
        V[s] = 0
        Q[s] = {}

    # until convergence is reached
    while notConverged:
        # store values from previous iteration
        for s in states:
            V_prime[s] = V[s]

        # update Q, pi, and V
        for s in states:
            for a in actions:
                # given current state and action, sum product of T and V over all states
                summand = 0
                for s_prime in states:
                    summand += T(a, s, s_prime) * V_prime[s_prime]

                # update Q
                Q[s][a] = darts.R(s, a) + gamma * summand

            # given current state, store the action that maximizes V in pi and the corresponding value in V
            PI[s] = actions[0]
            V[s] = Q[s][PI[s]]
            for a in actions:
                if V[s] <= Q[s][a]:
                    V[s] = Q[s][a]
                    PI[s] = a

        notConverged = False
        for s in states:
            if abs(V[s] - V_prime[s]) > EPSILON:
                notConverged = True

    # test_score = 9
    # test_action = throw.location(throw.OUTER_RING,4)
    # print T(test_action,test_score,5)
def modelbased_value_iteration(gamma, T_matrix, pi_star, V_n={}):
    V = {}
    V[0] = {}
    V[1] = {}
    converging = 0
    num_iterations = 0
    Q = {}

    # Get all possible actions
    actions = darts.get_actions()
    states = darts.get_states()

    # initialize v
    if len(V_n) == 0:
        for s in states:
            V[0][s] = 0
            V[1][s] = 0
    else:
        for s in states:
            V[0][s] = V_n[s]
            V[1][s] = V_n[s]

    # iterate until all state values (v[s]) converge
    while not(converging):
        num_iterations += 1
        for s in states:
            for a in range(len(actions)):
                # find the value of each action, given state s
                Q[a] = darts.R(s, actions[a])
                for s_prime in states:
                    Q[a] += gamma * T_matrix[s][s_prime][a] * V[0][s_prime]

                # find the action that maximizes Q and the maximum value of Q
                if a == 0 or (Q[a] >= V[1][s]):
                    pi_star[s] = a
                    V[1][s] = Q[a]

        # values of v for iteration k become the values of v for iteration k-1
        converging = True
        for s in states:
            # check for one component that does not converge
            if EPSILON_VI < abs(V[0][s] - V[1][s]):
                converging = False
            V[0][s] = V[1][s]

    return T_matrix, pi_star, Q, V[1]
def infiniteValueIteration(gamma):
    # takes a discount factor gamma and convergence cutoff epsilon
    # returns
    V = {}
    Q = {}
    V_prime = {}

    states = darts.get_states()
    actions = darts.get_actions()

    notConverged = True

    # initialize value of each state to 0
    for s in states:
        V[s] = 0
        Q[s] = {}

    # until convergence is reached
    while notConverged:
        # store values from previous iteration
        for s in states:
            V_prime[s] = V[s]

        # update Q, pi, and V
        for s in states:
            for a in actions:
                # given current state and action, sum product of T and V over all states
                summand = 0
                for s_prime in states:
                    summand += T(a, s, s_prime) * V_prime[s_prime]

                # update Q
                Q[s][a] = darts.R(s, a) + gamma * summand

            # given current state, store the action that maximizes V in pi and the corresponding value in V
            PI[s] = actions[0]  # bug fix from piazza post 283
            V[s] = Q[s][PI[s]]
            for a in actions:
                if V[s] <= Q[s][a]:
                    V[s] = Q[s][a]
                    PI[s] = a

        notConverged = False
        for s in states:
            if abs(V[s] - V_prime[s]) > EPSILON:
                notConverged = True
def start_game(gamma):
    global GAMMA
    global Q
    global actions

    GAMMA = gamma
    states = darts.get_states()
    actions = darts.get_actions()

    for s in states:
        Q[s] = {}
        for a in actions:
            Q[s][a] = 100

    return choice(actions)  #(throw.location(throw.INNER_RING, throw.NUM_WEDGES))
def start_game():
    global actions, s_old, a_old

    if actions == None:
        print "GAMMA: ", darts.GAMMA
        print "LEARNING_RATE: ", LEARNING_RATE
        print "strategy: ", darts.strategy
        actions = darts.get_actions()
        for s in darts.get_states():
            Q[s] = {}
            for a in actions:
                Q[s][a] = 0.

    s_old = throw.START_SCORE
    a_old = actions[-15]
    return a_old
def modelfree(alpha, gamma, num_games):
    # store all actions (targets on dartboard) in actions array
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_iterations = 0
    Q = [[]] * len(states)

    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions) - 1)
        Q[s] = [0] * len(actions)

    # play num_games games
    for g in range(1, num_games + 1):
        #print str(g) + "/" + str(num_games)
        # run a single game
        s = throw.START_SCORE
        while s > 0:
            num_iterations += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.
            a = ex_strategy_one(num_iterations, actions, pi_star, s)
            #a = ex_strategy_two(num_iterations, Q, actions, s, pi_star)
            action = actions[a]

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            s_prime = int(s - throw.location_to_score(loc))
            if s_prime < 0:
                s_prime = s

            max_Q = max(Q[s_prime])
            Q[s][a] += alpha * (darts.R(s, actions[a]) + gamma * max_Q - Q[s][a])
            pi_star[s] = Q[s].index(max(Q[s]))

            # Next state becomes current state
            s = s_prime

    print "Average turns = ", float(num_iterations) / float(num_games)
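# The second, commented-out strategy above is also defined elsewhere. A minimal
# sketch matching the ex_strategy_two(num_iterations, Q, actions, s, pi_star)
# call is given below; the Boltzmann (softmax) form and the cooling schedule
# are illustrative assumptions, not the original implementation.
import math

def ex_strategy_two(num_iterations, Q, actions, s, pi_star):
    # Sample an action index with probability proportional to
    # exp(Q[s][a] / temperature), cooling the temperature as throws
    # accumulate so that play becomes greedier over time.
    temperature = max(0.1, 10.0 / num_iterations)
    max_q = max(Q[s])
    weights = [math.exp((q - max_q) / temperature) for q in Q[s]]
    r = random.random() * sum(weights)
    total = 0.0
    for i, w in enumerate(weights):
        total += w
        if r <= total:
            return i
    return len(weights) - 1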
def start_game():
    global last_score, last_action, actions

    if last_score is None:
        actions = darts.get_actions()
        for s in darts.get_states():
            Q[s] = {}
            for a in actions:
                Q[s][a] = 0.

    last_score = throw.START_SCORE
    print >> sys.stderr, 'start'
    last_action = throw.location(throw.INNER_RING, throw.NUM_WEDGES)
    print >> sys.stderr, last_action
    return last_action
def modelbased_value_iteration(gamma, T_matrix, pi_star):
    V = {}
    V[0] = {}
    V[1] = {}
    converging = 0
    num_iterations = 0
    Q = {}

    # Get all possible actions
    actions = darts.get_actions()
    states = darts.get_states()

    # initialize v
    for s in states:
        V[0][s] = 0
        V[1][s] = 0

    # iterate until all state values (v[s]) converge
    while not(converging):
        num_iterations += 1
        for s in states:
            for a in range(len(actions)):
                # find the value of each action, given state s
                Q[a] = darts.R(s, actions[a])
                for s_prime in states:
                    Q[a] += gamma * T_matrix[s][s_prime][a] * V[0][s_prime]

                # find the action that maximizes Q and the maximum value of Q
                if a == 0 or (Q[a] >= V[1][s]):
                    pi_star[s] = a
                    V[1][s] = Q[a]

        # values of v for iteration k become the values of v for iteration k-1
        converging = True
        for s in states:
            # check for one component that does not converge
            if EPSILON_VI < abs(V[0][s] - V[1][s]):
                converging = False
            V[0][s] = V[1][s]

    return T_matrix, pi_star
def Q_learning(gamma, learning_rate, num_games):
    g = 0
    num_iterations = 0

    # store all actions (targets on dartboard) in actions array
    # actions = darts.get_actions()
    # states = darts.get_states()
    actions = darts.get_actions()
    states = darts.get_states()

    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        list_a = []
        for a in range(len(actions)):
            list_a.append(0)
        q_values.append(list_a)

    for g in range(1, num_games + 1):
        # run a single game
        s = throw.START_SCORE
        while s > 0:
            num_iterations += 1

            # which strategy to use
            #to_explore = ex_strategy_one(num_iterations)
            to_explore = ex_strategy_two(num_iterations)

            if to_explore:
                # explore
                a = random.randint(0, len(actions) - 1)
                action = actions[a]
            else:
                # exploit
                a = q_values[s].index(max(q_values[s]))
                action = actions[a]

            # (The rest of the loop is reconstructed following the other
            # Q_learning versions above: throw the dart, observe the next
            # score, apply the standard Q-learning update, and advance.)
            loc = throw.throw(action)
            s_prime = s - throw.location_to_score(loc)
            if s_prime < 0:
                s_prime = s

            q_values[s][a] += learning_rate * (darts.R(s, action) + gamma * max(q_values[s_prime]) - q_values[s][a])
            s = s_prime
def start_game():
    global Q, states, actions, cur_s, last_a, throws

    cur_s = throw.START_SCORE
    throws = 1

    # only initialize states, actions, and Q once
    if states == None:
        states = darts.get_states()
    if actions == None:
        actions = darts.get_actions()
    if len(Q) == 0:
        for s in states:
            Q[s] = {}
            for a in actions:
                Q[s][a] = 0.0

    # start by returning uniform random action
    last_a = choice(actions)
    return last_a
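# The start_game variants above are one half of the online interface; the game
# engine also calls a get_target(score) hook after each throw (see the
# commented stub in the skeleton below). A minimal sketch matching the globals
# of the version directly above (Q, cur_s, last_a, throws, actions) is given
# here; GAMMA, LEARNING_RATE, the module-level random import, and the 0.1
# exploration rate are all assumptions for illustration.
def get_target(score):
    global cur_s, last_a, throws

    # Q-learning update for the transition we just observed.
    reward = darts.R(cur_s, last_a)
    max_q = max(Q[score].values())
    Q[cur_s][last_a] += LEARNING_RATE * (reward + GAMMA * max_q - Q[cur_s][last_a])

    # Choose the next target: mostly greedy on Q, occasionally random.
    if random.random() < 0.1:
        last_a = choice(actions)
    else:
        last_a = max(Q[score], key=Q[score].get)

    cur_s = score
    throws += 1
    return last_a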
def test_get_states(self):
    x = throw.START_SCORE
    self.assertTrue(x / 2 < len(darts.get_states()))
    self.assertTrue(0.75 * x < len(darts.get_states()))
def modelfree(gamma, learning_rate, num_games, strategy_idx):
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    Q = {}
    num_iterations = 0

    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions) - 1)
        num_actions[s] = {}
        Q[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}

        for a in range(len(actions)):
            Q[s][a] = 1.0
            num_actions[s][a] = 0

        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0

    # play num_games games, updating policy after every EPOCH_SIZE number of throws
    for g in range(1, num_games + 1):
        # run a single game
        s = throw.START_SCORE
        throws = 0
        explores = 0
        exploits = 0
        while s > 0:
            num_iterations += 1
            throws += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.
            if strategy_idx == 1:
                to_explore = ex_strategy_one(s, g)
            else:
                to_explore = ex_strategy_two(s, g)

            if to_explore:
                # explore
                a = random.randint(0, len(actions) - 1)
                action = actions[a]
                explores += 1
            else:
                # exploit
                a = bestAction(Q, s)
                action = actions[a]
                exploits += 1
            #print "a", a, "action", action

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            delta = throw.location_to_score(loc)
            s_prime = s - delta
            if s_prime < 0:
                s_prime = s

            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state on this action.
            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            # use a per-state-action learning rate of 1/N(s, a);
            # 1.0 avoids integer division truncating the rate to zero
            this_lr = 1.0 / num_actions[s][a]
            Q[s][a] = newQ(Q, s, a, s_prime, gamma, this_lr)

            # Next state becomes current state
            s = s_prime

        # Update our learned MDP and optimal policy after every EPOCH_SIZE throws,
        # using infinite-horizon value iteration.
        #print "Game", g, "took", throws, "throws (explore ratio %1.4f)" % (float(explores)/(explores+exploits))
        print g, throws, "%1.4f" % (float(explores) / (explores + exploits))

    avg = float(num_iterations) / float(num_games)
    return avg
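# The learner above delegates to bestAction and newQ, which are not shown.
# Minimal sketches consistent with how they are called are given below; the
# tie-breaking and update form are assumptions based on the standard
# one-step Q-learning rule.
def bestAction(Q, s):
    # Return the index of the highest-valued action in state s.
    best_a = 0
    for a in Q[s]:
        if Q[s][a] > Q[s][best_a]:
            best_a = a
    return best_a

def newQ(Q, s, a, s_prime, gamma, lr):
    # One-step update toward R(s, a) + gamma * max_a' Q(s', a').
    actions = darts.get_actions()
    max_next = max(Q[s_prime].values())
    return Q[s][a] + lr * (darts.R(s, actions[a]) + gamma * max_next - Q[s][a])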
import random
import throw
import darts

# The default player aims for the maximum score, unless the
# current score is less than the number of wedges, in which
# case it aims for the exact score it needs.
#
# You may use the following functions as a basis for
# implementing the Q learning algorithm or define your own
# functions.

ACTIVE_STRATEGY = 1

actions = darts.get_actions()
states = darts.get_states()

gamma = .5
learning_rate = .1
num_games = 10

#def start_game():
#    num_throws_this = 1
#    last_action = throw.location(throw.INNER_RING, throw.NUM_WEDGES)
#    return(last_action)

#def update_counts(score):
#    last_delta = score - last_state
#    update_T(last_state, last_action, last_delta)

#def get_target(score):
def modelbased(gamma, epoch_size, num_games):
    # store all actions (targets on dartboard) in actions array
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    num_iterations = 0

    # initialize v
    V = {}
    V[0] = {}
    V[1] = {}
    for s in states:
        V[0][s] = 0
        V[1][s] = 0

    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions) - 1)
        num_actions[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}

        for a in range(len(actions)):
            num_actions[s][a] = 0

        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0

    # play num_games games, updating policy after every EPOCH_SIZE number of throws
    for g in range(1, num_games + 1):
        iterations_this_game = 0
        Q = {}

        # run a single game
        s = throw.START_SCORE
        while s > 0:
            iterations_this_game += 1
            num_iterations += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.
            a = ex_strategy_one(actions, pi_star, s, iterations_this_game)
            # a = ex_strategy_two(actions, Q, s, iterations_this_game)
            action = actions[a]

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            s_prime = s - throw.location_to_score(loc)
            if s_prime < 0:
                s_prime = s

            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state on this action.
            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            # Next state becomes current state
            s = s_prime

            # Update our learned MDP and optimal policy after every EPOCH_SIZE throws,
            # using infinite-horizon value iteration.
            if num_iterations % epoch_size == 0:
                # Update transition probabilities
                for i in states:
                    for j in states:
                        for k in range(len(actions)):
                            if num_actions[i][k] != 0:
                                T_matrix[i][j][k] = float(num_transitions[i][j][k]) / float(num_actions[i][k])

                # Update strategy (stored in pi) based on newly updated reward function and transition
                # probabilities
                T_matrix, pi_star, Q, V = modelbased_value_iteration(gamma, T_matrix, pi_star, actions, states, V)

    avg_turns = float(num_iterations) / float(num_games)
    print "Average turns = ", avg_turns
    return avg_turns
def modelbased(gamma, epoch_size, num_games):
    # store all actions (targets on dartboard) in actions array
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    num_iterations = 0
    Q = {}

    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions) - 1)
        num_actions[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}
        Q[s] = {}

        for a in range(len(actions)):
            num_actions[s][a] = 0
            Q[s][a] = 0

        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0

    # play num_games games, updating policy after every EPOCH_SIZE number of throws
    for g in range(1, num_games + 1):
        # run a single game
        s = throw.START_SCORE
        while s > 0:
            num_iterations += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.
            #to_explore = ex_strategy_one(s, num_iterations)

            # Second strategy
            to_explore = 2
            newindex, newaction = ex_strategy_two(s, num_iterations, Q, actions)

            if to_explore == 2:
                a = newindex
                action = newaction
            elif to_explore:
                # explore
                a = random.randint(0, len(actions) - 1)
                action = actions[a]
            else:
                # exploit
                a = pi_star[s]
                action = actions[a]

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            s_prime = s - throw.location_to_score(loc)
            if s_prime < 0:
                s_prime = s

            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state on this action.
            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            # Next state becomes current state
            s = s_prime

            # Update our learned MDP and optimal policy after every EPOCH_SIZE throws,
            # using infinite-horizon value iteration.
            if num_iterations % epoch_size == 0:
                # Update transition probabilities
                for i in states:
                    for j in states:
                        for k in range(len(actions)):
                            if num_actions[i][k] != 0:
                                T_matrix[i][j][k] = float(num_transitions[i][j][k]) / float(num_actions[i][k])

                # Update strategy (stored in pi) based on newly updated reward function and transition
                # probabilities
                T_matrix, pi_star, Q = modelbased_value_iteration(gamma, T_matrix, pi_star)

    print "Average turns = ", float(num_iterations) / float(num_games)
def Q_learning(gamma, alpha, num_games):
    # set these to values that make sense!
    #alpha = .5
    #gamma = .3

    Q = {}
    states = darts.get_states()
    actions = darts.get_actions()
    num_iterations = 0
    num_total_iterations = 1

    # Initialize all the Q values to zero
    for s in states:
        Q[s] = {}
        for a in actions:
            Q[s][a] = 0

    for g in range(1, num_games + 1):
        #print "Average turns = ", float(num_iterations)/float(g)
        #print "GAME {}".format(g)

        # run a single game
        s = throw.START_SCORE
        gamethrows = 0
        while s > 0:
            num_total_iterations += 1
            gamethrows += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.
            #to_explore = ex_strategy_one(num_iterations)
            to_explore = ex_strategy_two(num_total_iterations)
            #to_explore = ex_strategy_three(g, num_games)

            action = 0
            if to_explore:
                # explore
                a = random.randint(0, len(actions) - 1)
                action = actions[a]
            else:
                # exploit
                num_iterations += 1
                action = lookup_max_a(Q, s, actions)
                #action = a  # actions[a]

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)

            # should reward be based on action or loc?
            reward = darts.R(s, action)

            s_prime = s - throw.location_to_score(loc)
            if s_prime < 0:
                s_prime = s

            # now we update the q score table
            #oldQ = copy.deepcopy(Q[s][a])
            oldQ = Q[s][action]
            nextQaction = lookup_max_a(Q, s_prime, actions)
            newQ = oldQ + alpha * (reward + gamma * Q[s_prime][nextQaction] - oldQ)
            Q[s][action] = newQ
            #print "in game {}, score {}, throw value {}, oldQ {}, newQ {}".format(g, s, throw.location_to_score(loc), oldQ, newQ)

            s = s_prime
        #print gamethrows

    print "Average turns = ", float(num_iterations) / float(num_games / 2)
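# The variant above relies on a lookup_max_a helper that is not shown. A
# minimal sketch, assuming it returns the action itself (a throw location,
# since Q is keyed by actions rather than indices here) with the highest
# Q-value in the given state:
def lookup_max_a(Q, s, actions):
    best_action = actions[0]
    for a in actions:
        if Q[s][a] > Q[s][best_action]:
            best_action = a
    return best_action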