Example #1
def q_learn(board_prototype, nepisodes, alpha, gamma, epsilon):
    '''
    Tabular Q-learning with an epsilon-greedy behavior policy.
    http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node65.html
    '''
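    # Q maps a board state (as a tuple) to a dict {action: estimated value}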
    global Q
    Q = {}
    for episode in range(nepisodes):
        # Create empty board with right size
        board = board_prototype.clone()
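        # A game can last at most ncols*nrows plies, one per board cell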
        for i in range(board.ncols()*board.nrows()):
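            # Behavior policy: greedy w.r.t. the current Q, exploring with probability epsilon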
            q_greedy_policy = QGreedyPolicy(Q)
            eps_greedy_policy = EpsilonGreedyPolicy(q_greedy_policy, epsilon)
            
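            # RED moves on even plies (i = 0, 2, ...), BLACK on odd ones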
            color = Board.BLACK if i % 2 else Board.RED
            
            old_state = board.to_tuple()      # s
            
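            # Flip the board before RED's turn so the move is chosen from a canonical perspective; flipped back below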
            if color == Board.RED:
                board.flip()
            action = eps_greedy_policy.take_action(board) # a
            winner = board.play(color, action)
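            # Immediate reward, always measured from BLACK's point of view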
            reward = get_reward(board, we_are=Board.BLACK) # r_t
            if color == Board.RED:
                board.flip()            
            
            new_state = board.to_tuple()         # s'
            
            Q.setdefault(old_state, {})
            Q[old_state].setdefault(action, 0.)
            current = Q[old_state][action] # Q(s,a)

            Q.setdefault(new_state, {})
            best = max_action(Q[new_state], value_if_empty=0.) # max_a Q(s',a)
            
            # Q(s,a) <- Q(s,a) + alpha * (r_t + gamma * max_a Q(s',a) - Q(s,a))
            Q[old_state][action] = current + alpha * (reward + gamma * best - current)
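            # End the episode as soon as the game has been decided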
            if winner != Board.EMPTY:
                break
    return Q
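
A minimal usage sketch (not part of the original example): it assumes a hypothetical GameBoard class that implements the board interface q_learn relies on (clone, ncols, nrows, to_tuple, flip, play); the class name and the hyperparameter values below are illustrative only.

# Hypothetical usage; GameBoard is an assumed implementation of the board
# interface required by q_learn (clone, ncols, nrows, to_tuple, flip, play).
# Hyperparameter values are illustrative, not tuned.
prototype = GameBoard(nrows=6, ncols=7)
Q = q_learn(prototype,
            nepisodes=50000,  # self-play games to run
            alpha=0.1,        # learning rate
            gamma=0.9,        # discount factor
            epsilon=0.1)      # exploration probability

Because Q is keyed by full board tuples, the table grows with the number of distinct positions visited, so this tabular approach is only practical for games with small state spaces.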