def q_learn(board_prototype, nepisodes, alpha, gamma, epsilon):
    """Train a tabular action-value function with Q-learning.

    Uses an epsilon-greedy behavior policy over the greedy policy derived
    from the current Q table (off-policy TD control; Sutton & Barto,
    http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node65.html).

    Parameters
    ----------
    board_prototype : Board-like
        Template board; each episode starts from ``board_prototype.clone()``.
    nepisodes : int
        Number of self-play episodes to run.
    alpha : float
        Learning rate for the TD update.
    gamma : float
        Discount factor.
    epsilon : float
        Exploration probability for the epsilon-greedy policy.

    Returns
    -------
    dict
        The learned Q table, mapping state tuples to {action: value} dicts.
        Also bound to the module-level global ``Q`` (existing side effect,
        kept for backward compatibility).
    """
    global Q
    Q = {}
    for episode in range(nepisodes):  # range (not Py2-only xrange) works on Python 2 and 3
        # Fresh, empty board of the right dimensions for each episode.
        board = board_prototype.clone()
        # An episode lasts at most one move per cell.
        for i in range(board.ncols() * board.nrows()):
            q_greedy_policy = QGreedyPolicy(Q)
            eps_greedy_policy = EpsilonGreedyPolicy(q_greedy_policy, epsilon)
            # Players alternate: even plies RED, odd plies BLACK.
            color = Board.BLACK if i % 2 else Board.RED
            old_state = board.to_tuple()  # s
            # The policy always sees the board from one canonical side,
            # so flip it for RED before (and after) acting.
            if color == Board.RED:
                board.flip()
            action = eps_greedy_policy.take_action(board)  # a
            winner = board.play(color, action)
            # Reward is always evaluated from BLACK's point of view.
            reward = get_reward(board, we_are=Board.BLACK)  # r_t
            if color == Board.RED:
                board.flip()
            new_state = board.to_tuple()  # s'
            # Ensure table entries exist before reading them.
            Q.setdefault(old_state, {})
            Q[old_state].setdefault(action, 0.)
            current = Q[old_state][action]  # Q(s,a)
            Q.setdefault(new_state, {})
            best = max_action(Q[new_state], value_if_empty=0.)  # max_a Q(s',a)
            # Q(s,a) <- Q(s,a) + alpha * (r_t + gamma * max_a Q(s',a) - Q(s,a))
            Q[old_state][action] = current + alpha * (reward + gamma * best - current)
            # Terminal state reached: end the episode early.
            if winner != Board.EMPTY:
                break
    return Q