Example #1
def action(board, dice, oplayer, i = 0):

    flippedplayer = -1
    if (flippedplayer == oplayer): # view it from player 1 perspective
        board = flipped_agent.flip_board(np.copy(board))
        player = -oplayer # player now the other player +1
    else:
        player = oplayer
    possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)
    na = len(possible_boards)
    if (na == 0):
        return []
    xa = np.zeros((na,nx+1))
    va = np.zeros((na))
    for j in range(0, na):
        xa[j,:] = one_hot_encoding(possible_boards[j],i)
    x = Variable(torch.tensor(xa.transpose(), dtype = torch.float, device = device))
    # now do a forward pass to evaluate each board's after-state value
    h = torch.mm(w1,x) + b1 # matrix-multiply x with input weight w1 and add bias
    h_sigmoid = h.sigmoid() # squash this with a sigmoid function
    y = torch.mm(w2,h_sigmoid) + b2 # multiply with the output weights w2 and add bias
    va = y.sigmoid().detach().cpu()
    action = possible_moves[np.argmax(va)]
    if (flippedplayer == oplayer): # map this move to right view
        action = flipped_agent.flip_move(action)
    return action
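
Example #1 defines only the policy; below is a minimal sketch of how it could be driven in self-play with the Backgammon module used throughout these listings. The loop mirrors the calls made in Example #7 and is a simplification, not part of the original.

import Backgammon  # the game engine module assumed by all of these examples

# hypothetical self-play loop using the action() policy above
board = Backgammon.init_board()              # starting position
player = 1
while Backgammon.game_over(board) != 1:      # same termination test as in Example #7
    dice = Backgammon.roll_dice()
    move = action(board, dice, player)       # returns [] when no legal move exists
    for m in move:                           # a move is a sequence of checker moves
        board = Backgammon.update_board(board, m, player)
    player = -player                         # swap players
print("winner:", -player)                    # the player who made the final move
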
Example #2
def action(board_copy, dice, player, i):

    if player == -1:
        # view the position from player 1's perspective
        board_copy = FA.flip_board(np.copy(board_copy))
    # check out the legal moves available for this throw
    possible_moves, possible_boards = B.legal_moves(board_copy, dice, 1)

    if len(possible_moves) == 0:
        return []

    # let the agent pick an index into the legal moves
    action = AgentJ.sample_action(np.vstack(possible_boards))
    move = possible_moves[action]
    if player == -1:
        # map the chosen move back to player -1's view
        move = FA.flip_move(move)
    return move
Example #3
def action(net, board_copy, dice, player, i):

    if player == -1:
        board_copy = flipped_agent.flip_board(board_copy)  # flip the board
    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player=1)

    if len(possible_moves) == 0:
        return []
    move = []

    # NOTE: the move-selection step (scoring possible_boards with `net` and
    # indexing into possible_moves) is missing from this listing, so `move`
    # is returned empty here; one possible completion is sketched below.

    if player == -1:
        move = flipped_agent.flip_move(move)  # flip the move back to player -1's view
    return move
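
The selection step that is missing above could, for example, score every after-state with net and take the best one. A hedged sketch, assuming net accepts a batch of encoded boards and returns one value per row (that interface is not shown anywhere in this listing):

import numpy as np
import torch

def greedy_move(net, possible_moves, possible_boards):
    # hypothetical helper: evaluate all after-states in one batch and pick the argmax
    boards = torch.tensor(np.vstack(possible_boards), dtype=torch.float)
    with torch.no_grad():
        values = net(boards).flatten()  # assumed: one scalar value per board row
    return possible_moves[int(torch.argmax(values))]

Inside the function above, move = greedy_move(net, possible_moves, possible_boards) would then replace the empty assignment.
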
Example #4
def action(board, dice, oplayer):
    flippedplayer = -1
    if (flippedplayer == oplayer):  # view it from player 1 perspective
        board = flipped_agent.flip_board(np.copy(board))
        player = -oplayer  # player now the other player +1
    else:
        player = oplayer
    possible_moves, possible_boards = e_legal_moves(board, dice, 1)
    if len(possible_moves) == 0:
        return []
    #index = get_action(actor, possible_boards)
    index = epsilon_greedy(critic, possible_boards)
    action = possible_moves[index]
    #print("ACTION")
    #print(action)
    if (flippedplayer == oplayer):  # map this move to right view
        action = flipped_agent.flip_move(action)
    return action
Example #5
def action(board, dice, oplayer, nRoll = 0):
    flipped_player = -1
    if (flipped_player == oplayer):
        board = flipped_agent.flip_board(np.copy(board))
        player = -flipped_player
    else:
        player = oplayer
    # check out the legal moves available for the throw
    race = c_int(israce(board))
    possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)
    na = len(possible_moves)
    va = np.zeros(na)
    if (na == 0):
        return []
    for i in range(0, na):
        pe_board = pubeval_flip(possible_boards[i])  # convert to the representation pubeval expects
        pe_board = pe_board.astype(dtype = ctypes.c_int)
        va[i] = lib.pubeval(race, pe_board.ctypes.data_as(intp))
    action = possible_moves[np.argmax(va)]
    if (flipped_player == oplayer): # map this move to right view
        action = flipped_agent.flip_move(action)
    return action
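
Example #5 scores after-states with Tesauro's pubeval benchmark through ctypes; the module-level names lib, intp, israce and pubeval_flip are not part of the listing. A minimal sketch of the ctypes plumbing that lib and intp imply, assuming a compiled pubeval shared library (the file name and the float return type are assumptions):

import ctypes
from ctypes import c_int

lib = ctypes.CDLL('./pubeval.so')     # hypothetical: pubeval C code compiled as a shared library
intp = ctypes.POINTER(c_int)          # pointer type handed to lib.pubeval above
lib.pubeval.argtypes = [c_int, intp]  # pubeval(race, board) as called in the loop
lib.pubeval.restype = ctypes.c_float  # assumed scalar evaluation score
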
Example #6
def action(board, dice, oplayer, i=0):

    flippedplayer = -1
    if (flippedplayer == oplayer):  # view it from player 1 perspective
        board = flipped_agent.flip_board(np.copy(board))
        player = -oplayer  # player now the other player +1
    else:
        player = oplayer

    possible_moves, possible_boards = Backgammon.legal_moves(
        board, dice, player)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    after_state, action = epsilon_nn_greedy(board, possible_moves,
                                            possible_boards, player)

    if (flippedplayer == oplayer):  # map this move to right view
        action = flipped_agent.flip_move(action)
    return action
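
epsilon_nn_greedy is not included in this listing; in Example #6 it is expected to return both the chosen after-state and the corresponding move. A hedged sketch of that contract, with the board-evaluation function left as a parameter because its exact form is not shown (the epsilon default and the value_of callable are assumptions):

import numpy as np

def epsilon_nn_greedy(board, possible_moves, possible_boards, player,
                      epsilon=0.1, value_of=None):
    # hypothetical: with probability epsilon pick a random legal move,
    # otherwise pick the after-state with the highest estimated value;
    # board and player are kept only to match the call in Example #6
    if np.random.rand() < epsilon:
        k = np.random.randint(len(possible_moves))
    else:
        k = int(np.argmax([value_of(b) for b in possible_boards]))
    return possible_boards[k], possible_moves[k]
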
Example #7
def learnit(numgames, epsilon, lam, alpha, alpha1, alpha2, w1, b1, w2, b2):
    gamma = 1 # for completeness
    # play numgames games for training
    for games in range(0, numgames):
        board = Backgammon.init_board()    # initialize the board to the starting position
        # now we initialize all the eligibility traces for the neural network
        Z_w1 = torch.zeros(w1.size(), device = device, dtype = torch.float)
        Z_b1 = torch.zeros(b1.size(), device = device, dtype = torch.float)
        Z_w2 = torch.zeros(w2.size(), device = device, dtype = torch.float)
        Z_b2 = torch.zeros(b2.size(), device = device, dtype = torch.float)
        # player to start is "1" the other player is "-1"
        player = 1
        otherplayer = -1
        winner = 0 # this implies a draw
        isGameOver = False
        moveNumber = 0
        while (isGameOver == False):
            dice = Backgammon.roll_dice()
            # use a policy to find action
            # both are using the neural-network to approximate the after-state value
            if (player == otherplayer): # this player flips the board to find an action
                possible_moves, possible_boards = Backgammon.legal_moves(flipped_agent.flip_board(np.copy(board)), dice, -player)
                action = epsilon_nn_greedy(flipped_agent.flip_board(np.copy(board)), dice, -player, epsilon, w1, b1, w2, b2,  possible_moves, possible_boards, False)
                action = flipped_agent.flip_move(action)
            else: # this one uses the original board.
                possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)
                action = epsilon_nn_greedy(np.copy(board), dice, player, epsilon, w1, b1, w2, b2, possible_moves, possible_boards, False)
            # perform move and update board
            for i in range(0,len(action)):
                board = Backgammon.update_board(board, action[i], player)
            if (1 == Backgammon.game_over(board)): # has this player won?
                winner = player
                isGameOver = True
                break # bail out of inner game loop
            # once both players have performed at least one move we can start doing updates
            if (1 < moveNumber):
                if otherplayer == player: # here we have player -1 updating the value network
                    x_flipped = Variable(torch.tensor(one_hot_encoding(flipped_agent.flip_board(board)), dtype = torch.float, device = device)).view(28*2*6,1)
                    h = torch.mm(w1,x_flipped) + b1 # matrix-multiply x with input weight w1 and add bias
                    h_sigmoid = h.sigmoid() # squash this with a sigmoid function
                    y = torch.mm(w2,h_sigmoid) + b2 # multiply with the output weights w2 and add bias
                    y_sigmoid = y.sigmoid() # squash this with a sigmoid function
                    target = y_sigmoid.detach().cpu().numpy()
                    # let's also do a forward pass for the old board, this is the state we will update
                    h = torch.mm(w1,xold_flipped) + b1 # matrix-multiply x with input weight w1 and add bias
                    h_sigmoid = h.sigmoid() # squash this with a sigmoid function
                    y = torch.mm(w2,h_sigmoid) + b2 # multiply with the output weights w2 and add bias
                    y_sigmoid = y.sigmoid() # squash the output
                    delta2 = 0 + gamma * target - y_sigmoid.detach().cpu().numpy() # this is the usual TD error
                else: # here we have player 1 updating the neural network (2-layer feed-forward with sigmoid units)
                    x = Variable(torch.tensor(one_hot_encoding(board), dtype = torch.float, device = device)).view(28*2*6,1)
                    # now do a forward pass to evaluate the new board's after-state value
                    h = torch.mm(w1,x) + b1 # matrix-multiply x with input weight w1 and add bias
                    h_sigmoid = h.sigmoid() # squash this with a sigmoid function
                    y = torch.mm(w2,h_sigmoid) + b2 # multiply with the output weights w2 and add bias
                    y_sigmoid = y.sigmoid() # squash this with a sigmoid function
                    target = y_sigmoid.detach().cpu().numpy()
                    # let's also do a forward pass for the old board, this is the state we will update
                    h = torch.mm(w1,xold) + b1 # matrix-multiply x with input weight w1 and add bias
                    h_sigmoid = h.sigmoid() # squash this with a sigmoid function
                    y = torch.mm(w2,h_sigmoid) + b2 # multiply with the output weights w2 and add bias
                    y_sigmoid = y.sigmoid() # squash the output
                    delta2 = 0 + gamma * target - y_sigmoid.detach().cpu().numpy() # this is the usual TD error
                # using autograd and the constructed computational graph in pytorch compute all gradients
                y_sigmoid.backward()
                # update the eligibility traces using the gradients
                Z_w1 = gamma * lam * Z_w1 + w1.grad.data
                Z_b1 = gamma * lam * Z_b1 + b1.grad.data
                Z_w2 = gamma * lam * Z_w2 + w2.grad.data
                Z_b2 = gamma * lam * Z_b2 + b2.grad.data
                # zero the gradients
                w1.grad.data.zero_()
                b1.grad.data.zero_()
                w2.grad.data.zero_()
                b2.grad.data.zero_()
                # now perform the weight updates
                delta2 =  torch.tensor(delta2, dtype = torch.float, device = device)
                w1.data = w1.data + alpha1 * delta2 * Z_w1
                b1.data = b1.data + alpha1 * delta2 * Z_b1
                w2.data = w2.data + alpha2 * delta2 * Z_w2
                b2.data = b2.data + alpha2 * delta2 * Z_b2

            # we need to keep track of the last board state visited by the players
            if otherplayer == player:
                xold_flipped = Variable(torch.tensor(one_hot_encoding(flipped_agent.flip_board(board)), dtype=torch.float, device = device)).view(28*2*6,1)
            else:
                xold = Variable(torch.tensor(one_hot_encoding(board), dtype=torch.float, device = device)).view(28*2*6,1)
            # swap players
            player = -player
            moveNumber = moveNumber + 1

        # The game episode has ended and we know the outcome of the game, so we can find the terminal rewards
        if winner == otherplayer:
            reward = 0
        elif winner == -otherplayer:
            reward = 1
        else:
            reward = 0.5
        # Now we perform the final update (terminal after-state value is zero)
        # these are basically the same updates as in the inner loop but for the final after-states (xold and xold_flipped)
        
        # First we update the values for player -1
        h = torch.mm(w1,xold_flipped) + b1 # matrix-multiply x with input weight w1 and add bias
        h_sigmoid = h.sigmoid() # squash this with a sigmoid function
        y = torch.mm(w2,h_sigmoid) + b2 # multiply with the output weights w2 and add bias
        y_sigmoid = y.sigmoid() # squash the output
        delta = (1.0 - reward) + gamma * 0 - y_sigmoid.detach().cpu().numpy()
        # using autograd and the constructed computational graph in pytorch compute all gradients
        y_sigmoid.backward()
        # update the eligibility traces
        Z_w1 = gamma * lam * Z_w1 + w1.grad.data
        Z_b1 = gamma * lam * Z_b1 + b1.grad.data
        Z_w2 = gamma * lam * Z_w2 + w2.grad.data
        Z_b2 = gamma * lam * Z_b2 + b2.grad.data
        # zero the gradients
        w1.grad.data.zero_()
        b1.grad.data.zero_()
        w2.grad.data.zero_()
        b2.grad.data.zero_()
        # now perform the weight updates
        delta =  torch.tensor(delta, dtype = torch.float, device = device)
        w1.data = w1.data + alpha1 * delta * Z_w1
        b1.data = b1.data + alpha1 * delta * Z_b1
        w2.data = w2.data + alpha2 * delta * Z_w2
        b2.data = b2.data + alpha2 * delta * Z_b2
        
        # Then we update the values for player 1
        h = torch.mm(w1,xold) + b1 # matrix-multiply x with input weight w1 and add bias
        h_sigmoid = h.sigmoid() # squash this with a sigmoid function
        y = torch.mm(w2,h_sigmoid) + b2 # multiply with the output weights w2 and add bias
        y_sigmoid = y.sigmoid() # squash the output
        delta2 = reward + gamma * 0 - y_sigmoid.detach().cpu().numpy()  # this is the usual TD error
        # using autograd and the constructed computational graph in pytorch compute all gradients
        y_sigmoid.backward()
        # update the eligibility traces
        Z_w1 = gamma * lam * Z_w1 + w1.grad.data
        Z_b1 = gamma * lam * Z_b1 + b1.grad.data
        Z_w2 = gamma * lam * Z_w2 + w2.grad.data
        Z_b2 = gamma * lam * Z_b2 + b2.grad.data
        # zero the gradients
        w1.grad.data.zero_()
        b1.grad.data.zero_()
        w2.grad.data.zero_()
        b2.grad.data.zero_()
        # now perform the weight updates
        delta2 =  torch.tensor(delta2, dtype = torch.float, device = device)
        w1.data = w1.data + alpha1 * delta2 * Z_w1
        b1.data = b1.data + alpha1 * delta2 * Z_b1
        w2.data = w2.data + alpha2 * delta2 * Z_w2
        b2.data = b2.data + alpha2 * delta2 * Z_b2
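
A usage sketch for learnit: the weight tensors have to be leaf tensors with requires_grad=True so that y_sigmoid.backward() fills their .grad fields, and their sizes follow the 28*2*6 encoding used inside learnit; the hidden width and the hyperparameter values are illustrative assumptions only.

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
nx = 28 * 2 * 6   # input size matching the .view(28*2*6, 1) calls above
nh = 40           # hypothetical hidden-layer width

w1 = (0.1 * torch.randn(nh, nx, device=device)).requires_grad_()
b1 = torch.zeros(nh, 1, device=device, requires_grad=True)
w2 = (0.1 * torch.randn(1, nh, device=device)).requires_grad_()
b2 = torch.zeros(1, 1, device=device, requires_grad=True)

# illustrative hyperparameters; the original training settings are not shown
learnit(numgames=1000, epsilon=0.05, lam=0.7, alpha=0.01,
        alpha1=0.001, alpha2=0.001, w1=w1, b1=b1, w2=w2, b2=b2)
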