Example #1
def action(board_copy, dice, player, i):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy
    
    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player)
    
    # if there are no moves available
    if len(possible_moves) == 0: 
        return [] 
    
    # make the best move according to the trained policy
    epsilon = 0.1
    w1 = torch.load('./w1_trained.pth', map_location=lambda storage, loc: storage)
    w2 = torch.load('./w2_trained.pth', map_location=lambda storage, loc: storage)
    b1 = torch.load('./b1_trained.pth', map_location=lambda storage, loc: storage)
    b2 = torch.load('./b2_trained.pth', map_location=lambda storage, loc: storage)
    
    #w1 = torch.load('./w1_trained_first_time_working.pth', map_location=lambda storage, loc: storage)
    #w2 = torch.load('./w2_trained_first_time_working.pth', map_location=lambda storage, loc: storage)
    #b1 = torch.load('./b1_trained_first_time_working.pth', map_location=lambda storage, loc: storage)
    #b2 = torch.load('./b2_trained_first_time_working.pth', map_location=lambda storage, loc: storage)
    move = neural_network_agent.epsilon_nn_greedy(board_copy, dice, player, epsilon, w1, b1, w2, b2, possible_moves, possible_boards, False)

    return move
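A note on the example above: the four weight tensors are re-loaded from disk on every call to action(). A minimal sketch of loading them once at module level instead; the file names are taken from the example, but this module-level layout is an assumption, not the author's code:

import torch

# hypothetical module-level setup: load the trained parameters once at import time
w1 = torch.load('./w1_trained.pth', map_location='cpu')
w2 = torch.load('./w2_trained.pth', map_location='cpu')
b1 = torch.load('./b1_trained.pth', map_location='cpu')
b2 = torch.load('./b2_trained.pth', map_location='cpu')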
Example #2
def action(board, dice, oplayer, i=0):

    flippedplayer = -1
    if (flippedplayer == oplayer): # view it from player 1 perspective
        board = flipped_agent.flip_board(np.copy(board))
        player = -oplayer  # player is now +1 (player 1's perspective)
    else:
        player = oplayer
    possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)
    na = len(possible_boards)
    if (na == 0):
        return []
    xa = np.zeros((na,nx+1))
    va = np.zeros((na))
    for j in range(0, na):
        xa[j,:] = one_hot_encoding(possible_boards[j],i)
    x = Variable(torch.tensor(xa.transpose(), dtype = torch.float, device = device))
    # now do a forward pass to evaluate the board's after-state value
    h = torch.mm(w1,x) + b1 # matrix-multiply x with input weight w1 and add bias
    h_sigmoid = h.sigmoid() # squash this with a sigmoid function
    y = torch.mm(w2,h_sigmoid) + b2 # multiply with the output weights w2 and add bias
    va = y.sigmoid().detach().cpu()
    action = possible_moves[np.argmax(va)]
    if (flippedplayer == oplayer): # map this move to right view
        action = flipped_agent.flip_move(action)
    return action
Example #3
def action(board_copy, dice, player, i):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(
        board_copy, dice, player)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    # make the best move according to the policy:
    # epsilon, w1, b1, w2, b2 and debug are assumed to be defined at module level
    move = epsilon_nn_greedy(board_copy, player, epsilon, w1, b1, w2, b2,
                             debug)

    return move
Example #4
def action(board_copy, dice, player, i, model):
    global actionCount

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(
        board_copy, dice, player)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    #Backgammon.pretty_print(board_copy)

    after_state, action = epsilon_nn_greedy(board_copy, possible_moves,
                                            possible_boards, player, model)
    #model.xtheta = xtheta_mean
    if (actionCount > 0):
        model.updateNeural(after_state)
    if (actionCount > 1):
        model.dynaUpdate()

    actionCount += 1

    model.xold = Variable(
        torch.tensor(one_hot_encoding(after_state),
                     dtype=torch.float,
                     device=model.device)).view((28 * 31, 1))

    return action
Example #5
def action(board_copy, dice, player, i, model):
    global actionCount
    # starts by flipping the board so that the player always sees himself as player 1
    if player == -1: board_copy = flip_board(board_copy)

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy,
                                                             dice,
                                                             player=1)

    # if there are no moves available, return an empty move
    if len(possible_moves) == 0:
        return []

    # Make the best move:
    after_state, action = epsilon_nn_greedy(board_copy, possible_moves,
                                            possible_boards, player, model)
    #model.xtheta = xtheta_mean
    if (actionCount > 0):
        model.updateNeural(after_state)
    if (actionCount > 1):
        model.dynaUpdate()

    actionCount += 1

    model.xold = Variable(
        torch.tensor(one_hot_encoding(after_state),
                     dtype=torch.float,
                     device=model.device)).view((28 * 31, 1))

    # if the table was flipped the move has to be flipped as well
    move = flip_move(action) if player == -1 else action

    return move
Example #6
def action(board_copy, epsilon, dice, player, i):
    if player == -1:
        board_copy = flip_board(board_copy)

    possible_moves, possible_boards = BG.legal_moves(board_copy,
                                                     dice,
                                                     player=1)
    na = len(possible_moves)
    va = np.zeros(na)
    j = 0

    # if there are no moves available
    if na == 0:
        return []
    if (np.random.uniform() < epsilon):
        move = possible_moves[randrange(na)]
        if player == -1:
            move = flip_move(move)
        return move

    for board in possible_boards:
        # encode the board to create the input
        x = Variable(
            torch.tensor(ice_hot_encoding(board),
                         dtype=torch.float,
                         device=device)).view(encSize, 1)
        # now do a forward pass to evaluate the board's after-state value
        va[j] = feed_forward_w(x)
        j += 1
    move = possible_moves[np.argmax(va)]
    if player == -1:
        move = flip_move(move)
    return move
Example #7
def action(net, board_copy, dice, player, i):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy

    if player == -1: board_copy = flip_board(board_copy)  ##Flip the board
    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy,
                                                             dice,
                                                             player=1)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    feature_boards = []
    ### Create new features using Tesauro's encoding
    for b in possible_boards:
        feature_boards.append(oneHot(b))

    ### Get probabilities of each action via the actor forward pass
    probs, log_probs = net.actor.forward(feature_boards)

    ### index array used to pick the action
    index = np.arange(0, len(possible_boards))
    ### detach to numpy so np.random.choice can use the probabilities
    probs = probs.detach().numpy()

    ### the index of the chosen action
    i = choice(index, p=probs)
    move = possible_moves[i]  ### pick the next move according to the selected index
    newBoard = possible_boards[i]  ### pick the next board according to the selected index
    newBoardFeatures = oneHot(newBoard)

    ### Critic feedforward
    target, oldtarget = net.critic.forward(newBoardFeatures, oneHot(
        board_copy))  #(newBoardFeatures,getFeatures(board_copy,player) )

    R = 0
    if Backgammon.game_over(newBoard):  ### Did I win? If so the reward shall be +1
        R = 1
        target = 0  ### Terminal state is 0

    ### Now we update the neural network


    delta = R + net.gamma * target - oldtarget

    ### Update the critic via backpropagation
    net.critic.backward(R, delta, net.gamma)
    ### Update the actor via backpropagation
    net.actor.backward(log_probs[i], delta, net.gamma)

    if player == -1: move = flip_move(move)  ###Flip the move

    return move
Example #8
    def greedy_action(self, board, dice, player, i):
        if player == -1: board = flip_board(board)

        # check out the legal moves available for the throw
        possible_moves, possible_boards = Backgammon.legal_moves(board,
                                                                 dice,
                                                                 player=1)

        # if there are no moves available, return an empty move
        if len(possible_moves) == 0:
            return []

        na = len(possible_boards)
        enc = np.zeros((na, 312))
        for i in range(0, na):
            enc[i, :] = oneHot(possible_boards[i])
        x = Variable(
            torch.tensor(enc.transpose(),
                         dtype=torch.double,
                         device=self.device))

        h = torch.mm(self.w1, x) + self.b1
        h_sigmoid = h.sigmoid()
        y = torch.mm(self.W, h_sigmoid) + self.B
        va = y.sigmoid().detach().cpu()
        action = possible_moves[np.argmax(va)]

        if player == -1: action = flip_move(action)

        return action
Example #9
def action(board_copy, dice, player, i):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy

    # starts by flipping the board so that the player always sees himself as player 1
    if player == -1: board_copy = flip_board(board_copy)

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy,
                                                             dice,
                                                             player=1)

    # if there are no moves available, return an empty move
    if len(possible_moves) == 0:
        return []

    # Make the best move:
    # policy missing, returns a random move for the time being
    move = possible_moves[np.random.randint(len(possible_moves))]

    # if the table was flipped the move has to be flipped as well
    if player == -1: move = flip_move(move)

    return move
Example #10
 def legal_moves(self, dice, player):
     moves, boards = B.legal_moves(board=self.board,
                                   dice=dice,
                                   player=player)
     if len(boards) == 0:
         return [], []
     boards = np.vstack(boards)
     return moves, boards
Example #11
 def legal_moves(self, board, dice, player):
     if player == -1:
         board = FA.flip_board(np.copy(board))
     moves, boards = B.legal_moves(board=board, dice=dice, player=1)
     if len(boards) == 0:
         return [], []
     boards = np.vstack(boards)
     return moves, boards
Example #12
def e_legal_moves(board, dice, player=1):
    moves, boards = B.legal_moves(board, dice=dice, player=player)
    if len(boards) == 0:
        return [], features(board, player)
    n_boards = np.shape(boards)[0]
    tesauro = np.zeros((n_boards, 198))
    for b in range(n_boards):
        tesauro[b, :] = features(boards[b], player)
    tesauro = np.array(tesauro)
    return moves, tesauro
Example #13
 def ExamplePolicy(self):
     _, st = B.legal_moves(B.init_board(), B.roll_dice(), 1)
     st = np.vstack(st)
     st = st[:, 1:]
     out = np.round(
         self._s.run(self._actor_policy, ({
             self._possible_states: st
         })) * 100) / 100
     out = out.flatten()
     out.sort()
     return out[::-1]
Example #14
def action(net, board_copy, dice, player, i, learn=True):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy

    if player == -1: board_copy = flip_board(board_copy)  ##Flip the board
    # check out the legal moves available for the throw
    if (player == 1):
        xold = net.xold
        net.xnew = board_copy
    else:  # player == -1
        xold = net.xFlipOld
        net.xFlipNew = board_copy

    possible_moves, possible_boards = Backgammon.legal_moves(board_copy,
                                                             dice,
                                                             player=1)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    one_hot = []
    for b in possible_boards:
        one_hot.append(oneHot(b))

    if learn:
        if not net.firstMove:
            net.update(player)

    m, xtheta = net.actor(one_hot, possible_moves)
    if player == 1:
        net.xtheta = xtheta
    else:
        net.flipxtheta = xtheta

    move = possible_moves[m]
    newBoard = possible_boards[m]


    if player == -1: move = flip_move(move)  ###Flip the move

    if player == 1:
        net.xold = board_copy
    else:
        net.xFlipOld = board_copy
        net.firstMove = False

    return move
Example #15
def action(board_copy, dice, player, i, learning=False):
    if player == -1:
        board_copy = flip_board(board_copy)

    # Get every possible move and board
    xtheta_mean = torch.zeros((len(theta), 1))
    possible_moves, possible_boards = BG.legal_moves(board_copy,
                                                     dice,
                                                     player=1)
    na = len(possible_moves)
    one_hot_boards = np.zeros((2 * (n - 1) * 7, na))
    j = 0
    # if there are no moves available
    if len(possible_moves) == 0:
        x = Variable(
            torch.tensor(ice_hot_encoding(board_copy),
                         dtype=torch.float,
                         device=device)).view(2 * (n - 1) * 7, 1)
        h_sigmoid = feed_forward_th(x)
        pi = torch.mm(theta, h_sigmoid).softmax(0)
        xtheta_mean = h_sigmoid * pi.item()
        if learning == True:
            return [], xtheta_mean
        else:
            return []

    for board in possible_boards:
        # encode the board to create the input for the NN
        x = Variable(
            torch.tensor(ice_hot_encoding(board),
                         dtype=torch.float,
                         device=device)).view(2 * (n - 1) * 7, 1)
        one_hot_boards[:, j] = x[:, 0]
        j += 1
    # select the move from a distribution
    X = Variable(torch.tensor(one_hot_boards, dtype=torch.float,
                              device=device))
    h = feed_forward_th(X)
    h_sigmoid = h.sigmoid()
    pi = torch.mm(theta, h_sigmoid).softmax(1)
    xtheta_mean = torch.sum(torch.mm(h_sigmoid, torch.diagflat(pi)), 1)
    xtheta_mean = torch.unsqueeze(xtheta_mean, 1)
    move_index = torch.multinomial(pi, num_samples=1)
    move = possible_moves[move_index]
    if player == -1:
        move = flip_move(move)

    if learning == True:
        return move, xtheta_mean

    return move
Example #16
def action(board_copy, dice, player, i):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy
    move = []

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player)

    # make the best move according to the policy
    if len(possible_moves) != 0:
        move = policy(possible_moves, possible_boards, dice, i)

    return move
Example #17
def action(board_copy, dice, player, i):
    
    if player == -1:
        board_copy = FA.flip_board(np.copy(board_copy))
    possible_moves, possible_boards = B.legal_moves(board_copy, dice, 1)
    
    if len(possible_moves) == 0:
        return []
    
    action = AgentJ.sample_action(np.vstack(possible_boards))
    move = possible_moves[action]
    if player == -1:
        move = FA.flip_move(move)
    return move
Example #18
    def nextMove(self, board, dice, player, actor_theta):
        possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)
        
        if(len(possible_moves) == 0):
            return [], []
        board_vals = np.zeros(len(possible_boards))
        for k in range(0, len(possible_boards)):
            board_vals[k] = self.getValue(possible_boards[k], actor_theta, player)

        pi_vals = softmax(board_vals)
        index = np.arange(0, len(possible_boards))
        i = choice(index, p=pi_vals)
        move = possible_moves[i]
        newBoard = possible_boards[i]

        return move, newBoard
Example #19
def epsilon_nn_greedy(board, player, epsilon, w1, b1, w2, b2, debug=False):
    moves = Backgammon.legal_moves(board)
    if np.random.uniform() < epsilon:
        if debug is True:
            print("explorative move")
        return np.random.choice(moves, 1)
    na = np.size(moves)
    va = np.zeros(na)
    for i in range(0, na):
        board[moves[i]] = player
        # encode the board to create the input

        # FEATURES are X

        # va[i] = y.sigmoid()  # value computation left unimplemented, so va stays all zeros
    return moves[np.argmax(va)]  # with va all zeros this always returns the first move
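In the stub above the evaluation is never filled in, so np.argmax(va) always picks the first move. A minimal sketch of what the completed epsilon-greedy evaluation could look like, following the forward pass used in Examples #2 and #3; Backgammon, one_hot_encoding, w1, b1, w2, b2 and device are assumed to be defined as in the surrounding examples, and dice is added as an extra argument (the stub cannot enumerate legal moves without it):

import numpy as np
import torch

def epsilon_nn_greedy_sketch(board, dice, player, epsilon, w1, b1, w2, b2, debug=False):
    # enumerate the legal moves and their after-state boards
    possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)
    if len(possible_moves) == 0:
        return []
    # with probability epsilon, explore with a random move
    if np.random.uniform() < epsilon:
        if debug:
            print("explorative move")
        return possible_moves[np.random.randint(len(possible_moves))]
    # otherwise evaluate every after-state with the value network and act greedily
    va = np.zeros(len(possible_boards))
    for k, b in enumerate(possible_boards):
        x = torch.tensor(one_hot_encoding(b), dtype=torch.float, device=device).view(-1, 1)
        h_sigmoid = (torch.mm(w1, x) + b1).sigmoid()   # hidden layer
        y = torch.mm(w2, h_sigmoid) + b2               # output layer
        va[k] = y.sigmoid()
    return possible_moves[np.argmax(va)]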
Example #20
def action(net, board_copy, dice, player, i):

    if player == -1:
        board_copy = flipped_agent.flip_board(board_copy)  # #Flip the board
    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player=1)

    if len(possible_moves) == 0:
        return []
    # move selection is not implemented in this example; an empty move is returned
    move = []

    if player == -1:
        move = flipped_agent.flip_move(move)  # ##Flip the move
    return move
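Since the example above never computes a real move, here is a minimal greedy sketch of the missing step, under the assumption that net is a callable value network mapping an encoded board to a scalar and that one_hot_encoding is the board encoder used in the other examples:

import numpy as np
import torch

def greedy_move_sketch(net, possible_moves, possible_boards):
    # score every after-state board with the (assumed) value network and pick the best one
    values = np.zeros(len(possible_boards))
    for k, b in enumerate(possible_boards):
        x = torch.tensor(one_hot_encoding(b), dtype=torch.float).view(1, -1)
        values[k] = net(x).item()  # assumption: net returns a single scalar value
    return possible_moves[int(np.argmax(values))]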
Example #21
def action(board_copy, dice, player, i):
    global count
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(
        board_copy, dice, player)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    # make the best move according to the policy

    na = len(possible_moves)
    va = np.zeros(na)
    for i in range(0, na):
        move = possible_moves[i]
        board = possible_boards[i]

        # encode the board to create the input
        x = Variable(
            torch.tensor(one_hot_encoding(board),
                         dtype=torch.float,
                         device=device)).view(29, 31)
        # now do a forward pass to evaluate the board's after-state value
        h = torch.mm(
            w1, x) + b1  # matrix-multiply x with input weight w1 and add bias
        h_sigmoid = h.sigmoid()  # squash this with a sigmoid function
        y = torch.mm(
            w2,
            h_sigmoid) + b2  # multiply with the output weights w2 and add bias
        y_sigmoid = y.sigmoid()
        z = torch.mm(y_sigmoid, w3) + b3
        va[i] = z.sigmoid()

    count += 1

    if not Backgammon.game_over(possible_boards[np.argmax(va)]):
        update(possible_boards[np.argmax(va)])
    else:
        reward = 1 if player == 1 else 0
        update(possible_boards[np.argmax(va)], reward)

    return possible_moves[np.argmax(va)]
Example #22
def action(board_copy, dice, player, i, y_old, model, firstMove, training):
    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player)
    # if there are no moves available
    if len(possible_moves) == 0: 
        return [], y_old

    boards = []
    for board in possible_boards:
        boards.append(getinputboard(board))
    
    if(not firstMove and training):
        # learn
        learn(y_old, model, boards, "")  
    # take the greedy action according to the policy
    action, y_new = greedy(boards, model)
    move = possible_moves[action]
    return move, y_new
Example #23
def action(board, dice, oplayer, nRoll = 0):
    flipped_player = -1
    if (flipped_player == oplayer):
        board = flipped_agent.flip_board(np.copy(board))
        player = -flipped_player
    else:
        player = oplayer
    # check out the legal moves available for the throw
    race = c_int(israce(board))
    possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)
    na = len(possible_moves)
    va = np.zeros(na)
    if (na == 0):
        return []
    for i in range(0, na):
        board = pubeval_flip(possible_boards[i])
        board = board.astype(dtype = ctypes.c_int)
        va[i] = lib.pubeval(race, board.ctypes.data_as(intp))
    action = possible_moves[np.argmax(va)]
    if (flipped_player == oplayer): # map this move to right view
        action = flipped_agent.flip_move(action)
    return action
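The example above calls Tesauro's compiled pubeval evaluator through ctypes; lib, intp and israce are not shown in the excerpt. A hypothetical sketch of the module-level setup it assumes, with the shared-library name and path as assumptions rather than the author's code:

import ctypes
from ctypes import c_int, POINTER

lib = ctypes.CDLL('./pubeval.so')      # compiled pubeval C library (assumed filename)
intp = POINTER(c_int)                  # int* type used to pass the board array
lib.pubeval.argtypes = [c_int, intp]   # pubeval(int race, int board[])
lib.pubeval.restype = ctypes.c_float   # returns the position evaluation as a float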
Example #24
def action(board, dice, oplayer, i=0):

    flippedplayer = -1
    if (flippedplayer == oplayer):  # view it from player 1 perspective
        board = flipped_agent.flip_board(np.copy(board))
        player = -oplayer  # player now the other player +1
    else:
        player = oplayer

    possible_moves, possible_boards = Backgammon.legal_moves(
        board, dice, player)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    after_state, action = epsilon_nn_greedy(board, possible_moves,
                                            possible_boards, player)

    if (flippedplayer == oplayer):  # map this move to right view
        action = flipped_agent.flip_move(action)
    return action
Example #25
    def softMax(self, board, dice, player, i):
        if player == -1: board = flip_board(board)

        # check out the legal moves available for the throw
        possible_moves, possible_boards = Backgammon.legal_moves(board,
                                                                 dice,
                                                                 player=1)

        # if there are no moves available, return an empty move
        if len(possible_moves) == 0:
            return []

        na = len(possible_boards)
        enc = np.zeros((na, 312))
        for i in range(0, na):
            enc[i, :] = oneHot(possible_boards[i])
        x = Variable(
            torch.tensor(enc.transpose(),
                         dtype=torch.double,
                         device=self.device))
        h = torch.mm(self.w1, x) + self.b1
        h_sigmoid = h.sigmoid()
        pi = (torch.mm(self.theta, h_sigmoid)).softmax(1)
        xtheta_mean = torch.sum(torch.mm(h_sigmoid, torch.diagflat(pi)), 1)
        xtheta_mean = torch.unsqueeze(xtheta_mean, 1)
        if player == 1:
            self.xtheta = xtheta_mean
        else:
            self.xthetaF = xtheta_mean

        m = torch.multinomial(pi, 1)

        action = possible_moves[m]

        if player == -1: action = flip_move(action)

        return action
Example #26
 def nextMove(self, board, dice, player, search_theta):
     if player == -1:
         board = flip_board(board)
     possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player=1)
     
     if(len(possible_moves) == 0):
         return [], []
     # feature_boards = []
     board_vals = np.zeros(len(possible_boards))
     for k in range(0, len(possible_boards)):
         # feature_boards.append()
         board_vals[k] = self.getValue(possible_boards[k], search_theta)
     
     i = np.where(board_vals == max(board_vals))
     if(len(i[0]) > 1):
         i = choice(i[0])
     else:
         i = i[0][0]
     move = possible_moves[i]  # pick the next move according to the selected index
     newBoard = possible_boards[i]  # pick the next board according to the selected index
     if player == -1:
         move = flip_move(move)
     return move, newBoard
Example #27
def action(board_copy, dice, player, i, net=None):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(
        board_copy, dice, player)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    ret_arr, softmax_deriv = softmax(possible_moves, possible_boards,
                                     board_copy, player, net)
    s_prime = ret_arr[0]

    # print(0 + gamma * net.torch_nn.forward(getFeatures(s_prime, player)) - net.torch_nn.forward(getFeatures(board_copy, player)))

    # delta = 0 + gamma * net.val_func_nn.forward(s_prime, player) - net.val_func_nn.forward(board_copy, player)
    delta = 0 + net.gamma * net.torch_nn.forward(getFeatures(
        s_prime, player)) - net.torch_nn.forward(
            getFeatures(board_copy, player))
    # print(delta)
    net.torch_nn.backward(net.gamma, delta)
    # print("delta is %i", delta)
    # net.val_func_nn.w = net.val_func_nn.w + (net.val_func_nn.alpha_w * delta * net.val_func_nn.backward(board_copy, player))
    # net.policy_nn.theta = np.append(np.ravel(nn.input_weights), nn.hidden_weights)
    # backprop
    # here the weights are updated separately

    net.torch_nn_policy.theta = net.torch_nn_policy.theta + net.torch_nn_policy.alpha_theta * net.i * delta * softmax_deriv
    net.i = net.gamma * net.i
    # if(i > 1)

    return ret_arr[1]
Example #28
def epsilon_nn_greedy(board,
                      dice,
                      player,
                      epsilon,
                      w1,
                      b1,
                      w2,
                      b2,
                      debug=False):
    possible_moves, possible_boards = Backgammon.legal_moves(
        board, dice, player)
    if (np.random.uniform() < epsilon):
        if debug == True:
            print("explorative move")
        return possible_moves[np.random.randint(len(possible_moves))]
    na = len(possible_boards)
    va = np.zeros(na)
    for i in range(0, na):
        # encode the after-state board to create the input
        x = Variable(
            torch.tensor(one_hot_encoding(possible_boards[i]),
                         dtype=torch.float,
                         device=device)).view(28 * 2 * 6, 1)
        p, h = actor_policy_forward(x, w1, b1, w2, b2)
        #need this information for the backpropagation for policy gradient
        xs.append(x)  #inputs
        hs.append(h)  #hidden states
        # now do a forward pass to evaluate the board's after-state value
        h = torch.mm(
            w1, x) + b1  # matrix-multiply x with input weight w1 and add bias
        h_sigmoid = h.sigmoid()  # squash this with a sigmoid function
        y = torch.mm(
            w2,
            h_sigmoid) + b2  # multiply with the output weights w2 and add bias
        logProp.append(y - p)
        va[i] = y.sigmoid()
    return possible_moves[np.argmax(va)]
Example #29
def action(board_copy, dice, player, i, train=False, train_config=None):
    """
    inputs are the board, the dice and which player is to move
    outputs the chosen move according to its policy
    """

    # global variables
    global counter
    global bearing_off_counter

    # starts by flipping the board so that the player always sees himself as player 1
    if player == -1: 
        board_copy = flip_board(board_copy)
        
    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player=1)
    
    # if there are no moves available, return an empty move
    if len(possible_moves) == 0: 
        return []

    if not bearing_off(board_copy):
        model = DQN
        buffer = D
    else:
        model = DQN_bearing_off
        buffer = D_bearing_off
        bearing_off_counter += 1
    
    # Current state and Q value, possible next states
    S = np.array([board_2_state(board_copy, i==2)])
    Q = model(S)
    first_of_2 = 1+(dice[0] == dice[1])-i
    S_primes = np.array([board_2_state(b, first_of_2) for b in possible_boards])

    # Find the best action and its Q-value with epsilon-greedy
    Q_primes = model(S_primes)  # TODO: only evaluate unique boards
    action = np.argmax(Q_primes)
    if train and np.random.rand() < config.eps: # epsilon-greedy when training
        action = np.random.randint(len(possible_moves))

    # TODO: Fix the 16-piece bug (1 hour)
    # print("action:", action)
    # print("board:")
    # Backgammon.pretty_print(board_copy)
    # print('"endgames":')
    # [Backgammon.pretty_print(b) for b in possible_boards]

    if train:
        # # number of games
        # g = train_config['g']

        # state
        S_prime = np.array([board_2_state(possible_boards[action], first_of_2)])
        
        # Target update
        if not bearing_off(possible_boards[action]):
            target_model = DQN_target
        else:
            target_model = DQN_bearing_off_target
        Q_max = target_model(S_prime)

        r = game_won(possible_boards[action])
        target = Q + config.lr*(r + config.gamma*Q_max - Q)
        buffer.push(S, None, r, S_prime, target, done=True)

        # update the target network every C steps
        if counter % config.C == 0:
            target_model.set_weights(model.get_weights()) 

        # train model from buffer
        if counter % config.batch_size == 0 and bearing_off_counter > config.batch_size:
            state_batch, action_batch, reward_batch, next_state_batch, target_batch, done_batch = D.sample(config.batch_size)
            DQN.train_on_batch(np.array(state_batch), np.array(target_batch))
            state_batch, action_batch, reward_batch, next_state_batch, target_batch, done_batch = D_bearing_off.sample(config.batch_size)
            DQN_bearing_off.train_on_batch(np.array(state_batch), np.array(target_batch))
        
        # save the models every 10,000,000 training moves
        if counter % 10_000_000 == 0 and counter not in saved_models and counter != 0:
            # save both networks
            filepath = "./kotra_weights/DQN_"+str(counter)
            print("saving weights in file:"+filepath)
            DQN.save(filepath, overwrite=True, include_optimizer=True)

            filepath += "bearing_off"
            print("saving bearing-off-weights in file:"+filepath)
            DQN_bearing_off.save(filepath, overwrite=True, include_optimizer=True)
            saved_models.append(counter)

        counter += 1

    # return the chosen move, flipped back if the board was flipped at the start
    move = possible_moves[action]
    if player == -1:
        move = flip_move(move)
    return move
Example #30
def action(board_copy, dice, player, i, learning=False):
    if player == -1:
        board_copy = flip_board(board_copy)

    # Get every possible move and board
    possible_moves, possible_boards = BG.legal_moves(board_copy,
                                                     dice,
                                                     player=1)
    na = len(possible_moves)
    # stores the encoded after-state boards (the input to the NN)
    if na == 0:
        values = Variable(torch.zeros((7 * (n - 1) * 2),
                                      device=device,
                                      dtype=torch.float),
                          requires_grad=False)
    else:
        values = Variable(torch.zeros((7 * (n - 1) * 2, na),
                                      device=device,
                                      dtype=torch.float),
                          requires_grad=False)
    j = 0
    # if there are no moves available
    if len(possible_moves) == 0:
        x = Variable(
            torch.tensor(ice_hot_encoding(board_copy),
                         dtype=torch.float,
                         device=device)).view(2 * (n - 1) * 7, 1)
        prob_temp = feed_forward_th(x)
        prob_temp = prob_temp.softmax(dim=0)
        prob_nomove = torch.tensor([prob_temp],
                                   dtype=torch.float,
                                   device=device,
                                   requires_grad=True)
        move_index = torch.tensor([0], device=device)
        if learning == True:
            return [], prob_nomove, move_index
        else:
            return []

    for board in possible_boards:
        # encode the board to create the input for the NN
        x = Variable(
            torch.tensor(ice_hot_encoding(board),
                         dtype=torch.float,
                         device=device)).view(2 * (n - 1) * 7, 1)
        values[:, j] = x[:, 0]
        j += 1
    # forward pass to evaluate all of the boards' after-state values using the NN
    prob = feed_forward_th(values)
    # squash the after state values with softmax
    prob = prob.softmax(dim=-1)
    prob_temp = torch.tensor(prob[0, :],
                             dtype=torch.float,
                             device=device,
                             requires_grad=True)
    # select the move from a distribution
    move_index = torch.multinomial(prob_temp, num_samples=1)
    move_index = Variable(move_index, requires_grad=False)
    move = possible_moves[move_index]
    if player == -1:
        move = flip_move(move)

    if learning == True:
        return move, prob_temp[move_index], move_index

    return move
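All of these action() functions share the interface driven by the course's Backgammon module: a board copy, the dice, the player to move, and the index i that distinguishes the two passes when doubles are rolled. A hedged sketch of how such an agent might be driven; init_board, roll_dice, game_over and legal_moves appear in the examples above, while update_board and the loop structure are assumptions:

import numpy as np
import Backgammon

def play_one_game(agent_action):
    board = Backgammon.init_board()
    player = 1
    while not Backgammon.game_over(board):
        dice = Backgammon.roll_dice()
        # doubles are played twice, which is what the parameter i distinguishes
        for i in range(1 + int(dice[0] == dice[1])):
            move = agent_action(np.copy(board), dice, player, i)
            for m in move:  # a chosen move is a sequence of individual checker moves
                board = Backgammon.update_board(board, m, player)  # assumed helper
        player = -player
    return board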