Example #1
def naive_game_step(board):
    valid_next_board = []

    # Try each of the four moves on a copy of the board and keep the copies
    # whose move actually changed something.
    for move in ('up', 'down', 'left', 'right'):
        candidate = Board2048(board)
        if getattr(candidate, move)():
            valid_next_board.append(candidate)

    if not valid_next_board:
        return False, None
    # Greedily pick the successor with the highest naive score.
    return True, max(valid_next_board, key=naive_score)
Example #2
def generate_replay_buffer_using_A_star(batch_size, maxlen):
    from collections import deque

    from tqdm import tqdm

    #-- Set a max replay buffer size --#
    replay_buffer = deque(maxlen=maxlen)

    #-- Mapping of action to int --#
    move = {'up': 0, 'down': 1, 'left': 2, 'right': 3}

    #-- Run a_star for each batch --#
    for _ in tqdm(range(batch_size)):
        b = Board2048()
        current_node = A_star(b)['current_node']
        print(current_node, current_node.parent)

        #-- Trace back to parent nodes and register all moves --#
        while current_node.parent is not None:
            current = current_node
            parent = current_node.parent
            done = int(current.is_root())
            action = move[current.move]
            # current.board is reached from parent.board by taking current.move
            reward = reward_func_merge_score(parent.board, current.board,
                                             action, done)

            #-- Append transitions as (state, action, reward, next_state, done) --#
            replay_buffer.append(
                (parent.board, action, reward, current.board, done))

            current_node = current_node.parent
    return replay_buffer
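Each call runs batch_size games through A_star and records their solution paths as (state, action, reward, next_state, done) transitions. A minimal usage sketch, with hypothetical sizes, might look like:

replay_buffer = generate_replay_buffer_using_A_star(batch_size=32, maxlen=50000)
print(len(replay_buffer))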
Example #3
    def basic_upleft_algorithm(self, k=4):
        board = Board2048(k=k)
        simple_score = board.simple_score()
        single_game_history = []
        while True:
            # Prefer "up" then "left" on every turn
            board = board.peek_action("up")
            single_game_history.append(
                (board.state, 'up', board.simple_score(), board.merge_score()))
            board = board.peek_action("left")
            single_game_history.append(
                (board.state, 'left', board.simple_score(),
                 board.merge_score()))
            if simple_score == board.simple_score():
                # Up/left changed nothing: fall back to "down" then "right"
                board = board.peek_action('down')
                single_game_history.append(
                    (board.state, 'down', board.simple_score(),
                     board.merge_score()))
                board = board.peek_action('right')
                single_game_history.append(
                    (board.state, 'right', board.simple_score(),
                     board.merge_score()))
                if simple_score == board.simple_score():
                    # No move changed the board: the game is over
                    break
            simple_score = board.simple_score()
        self.games_history.append(single_game_history)
        return board
Example #4
    def play_game(self, random_policy=False):
        board = Board2048()
        done = False
        single_game_history = []

        while not done:
            # Legal moves as a 0/1 vector; the game is over when none remain
            available_moves = board.available_moves_as_torch_unit_vector(
                device=self.device)
            done = torch.max(available_moves) == 0

            # Score moves with the model (or at random), then mask out illegal ones
            state = board.normalized().state_as_4d_tensor().to(self.device)
            if not random_policy:
                Q_values = self.model(state)
            else:
                Q_values = torch.rand((4, ), device=self.device)
            available_Q_values = available_moves * Q_values

            next_action = torch.argmax(available_Q_values)
            next_board = board.peek_action(next_action)
            reward = self.reward_func(board, next_board, next_action, done)
            merge_score = board.merge_score()
            single_game_history.append(
                (board.state, ['u', 'd', 'l',
                               'r'][int(next_action)], reward, merge_score))
            board = next_board

        self.games_history.append(single_game_history)
        return single_game_history
Example #5
def search_best_step(root_node, step=3):
    # Expand the game tree breadth-first down to `step` levels.
    stack = [root_node]
    for _ in range(step):
        new_stack = []
        for game_node in stack:
            # Try every move from this node and keep the ones that change the board.
            for move in ('up', 'down', 'left', 'right'):
                candidate = Board2048(game_node.board)
                if getattr(candidate, move)():
                    new_game_node = GameNode(candidate)
                    game_node.children.append(new_game_node)
                    new_stack.append(new_game_node)
        stack = new_stack

    # Return the immediate child of the root with the highest score.
    best_child, best_score = None, -1
    for child in root_node.children:
        score = child.score()
        if score > best_score:
            best_child = child
            best_score = score
    return best_child
Example #6
    def __init__(self,
                 initial_grid,
                 player_mode,
                 game_mode,
                 method_idx,
                 play_turn=True):

        self.board = Board2048(grid=initial_grid,
                               player_turn=play_turn,
                               score=0)
        self.player_mode = player_mode
        self.game_mode = game_mode
        self.method = METHODS[method_idx]
        print(self.board)
Example #7
def naive_game_play():
    # Init game play
    board = Board2048()
    board.add_num()
    board.add_num()
    board.pprint()

    while True:
        alive, board = naive_game_step(board)
        if not alive:
            break
        board.pprint()
        board.add_num()
        board.pprint()
        input()  # pause until the user presses Enter
Example #8
def search_game_play():
    # Init game play
    board = Board2048()
    board.add_num()
    board.add_num()
    board.pprint()

    root_node = GameNode(board)
    while True:
        # The chosen child becomes the new root for the next search
        root_node = search_best_step(root_node)
        if not root_node:
            break
        root_node.board.pprint()
        root_node.board.add_num()
        root_node.board.pprint()
        input()
Example #9
def check_valid_board(board, move):
    # Reduce every move to a left move: rotate the grid, slide left, rotate back.
    if move == "left":
        oldgrid = board.grid_
        newgrid, score_added = move_grid_helper(oldgrid)
    elif move == "up":
        oldgrid = np.rot90(board.grid_, 1)
        newgrid, score_added = move_grid_helper(oldgrid)
        newgrid = np.rot90(newgrid, -1)
    elif move == "right":
        oldgrid = np.rot90(board.grid_, 2)
        newgrid, score_added = move_grid_helper(oldgrid)
        newgrid = np.rot90(newgrid, 2)
    else:
        oldgrid = np.rot90(board.grid_, -1)
        newgrid, score_added = move_grid_helper(oldgrid)
        newgrid = np.rot90(newgrid, 1)

    return Board2048(newgrid, True, board.score + score_added)
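move_grid_helper is not shown on this page. Purely as an illustration of what such a helper could do, the sketch below slides and merges every row to the left, assuming empty cells are stored as 0 and the function returns the new grid plus the score gained; the repository's actual implementation may differ.

import numpy as np

def move_grid_helper_sketch(grid):
    # Hypothetical stand-in: slide each row to the left and merge equal
    # neighbours once, returning the new grid and the score gained.
    new_grid = np.zeros_like(grid)
    score_added = 0
    for r, row in enumerate(grid):
        tiles = [v for v in row if v != 0]  # drop empty cells
        merged, skip = [], False
        for i, v in enumerate(tiles):
            if skip:
                skip = False
                continue
            if i + 1 < len(tiles) and tiles[i + 1] == v:
                merged.append(2 * v)
                score_added += 2 * v
                skip = True
            else:
                merged.append(v)
        new_grid[r, :len(merged)] = merged
    return new_grid, score_added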
Example #10
def play_one_step(board: Board2048,
                  epsilon: float,
                  model: torch.nn.Sequential,
                  replay_buffer: deque,
                  device: str,
                  reward_function: Callable = reward_func_merge_score,
                  board_to_tensor_function: Callable = board_as_4d_tensor):

    action, done, max_q_value = epsilon_greedy_policy(
        board,
        epsilon=epsilon,
        model=model,
        device=device,
        board_to_tensor_function=board_to_tensor_function)

    next_board = board.peek_action(action)

    reward = reward_function(board, next_board, action, done)

    replay_buffer.append((board, action, reward, next_board, done))
    return next_board, action, reward, done, max_q_value
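epsilon_greedy_policy is referenced here but not shown. The following is only a sketch of what it might look like, reusing available_moves_as_torch_unit_vector from Example #4 and the board_to_tensor_function convention from Example #14, and returning the (action, done, max_q_value) triple expected above; the repository's own implementation may differ.

import random

import torch

def epsilon_greedy_policy_sketch(board, epsilon, model, device,
                                 board_to_tensor_function):
    # Hypothetical re-implementation for illustration only.
    available_moves = board.available_moves_as_torch_unit_vector(device=device)
    done = bool(torch.max(available_moves) == 0)

    state = board_to_tensor_function(board, device)
    with torch.no_grad():
        q_values = model(state).squeeze()

    if random.random() < epsilon:
        # Explore: random scores, restricted to legal moves
        scores = available_moves * torch.rand((4,), device=device)
    else:
        # Exploit: model scores, restricted to legal moves
        scores = available_moves * q_values

    action = int(torch.argmax(scores))
    return action, done, float(torch.max(q_values))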
Example #11
    else:
        initial_grid_list = InitialStates(shape).generate(1)
    csvfile = open("log_size{0}*{1}_24_static.csv".format(shape[0], shape[1]),
                   "w")
    writer = csv.writer(csvfile)
    writer.writerow([
        'initial state', 'end state', 'score', 'max value', 'time(s)',
        'number of nodes'
    ])
    i = 0
    for initial_grid in initial_grid_list:  #[58:59]: #58 True 10 false
        i += 1
        print(
            "############################################ New Game ##################################################",
            i)
        board = Board2048(grid=initial_grid, player_turn=True, score=0)
        print("initial board:", board)
        cnt_nodes = 0
        try:
            with open('{0}*{1}_data_.pkl'.format(shape[0], shape[1]),
                      'rb') as data_file:
                heuristic_table = pickle.load(data_file)
                print(len(heuristic_table))
        except Exception:  # fall back to an empty heuristic table
            heuristic_table = {}
        for depth_limit in range(15, 50, 5):

            cnt_nodes_it = 0
            is_complete = True
            trans_table = {}
Example #12
    def __init__(self, board):
        self.board = Board2048(board)
        self.children = []
Example #13
def reward_func_merge_score(board: Board2048, next_board: Board2048,
                            action: int, done: int) -> int:
    return next_board.merge_score() - board.merge_score()
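For example, if board.merge_score() is 24 and next_board.merge_score() is 36, the transition earns a reward of 36 - 24 = 12.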
Example #14
def board_as_4d_tensor(board: Board2048, device: str) -> torch.Tensor:
    return board.log_scale().state_as_4d_tensor().to(device)
Example #15
def training_loop(replay_buffer_length,
                  no_episodes,
                  no_episodes_to_reach_epsilon,
                  no_episodes_to_fill_up_existing_model_replay_buffer,
                  min_epsilon,
                  model,
                  reward_function,
                  board_to_tensor_function,
                  device,
                  experiment,
                  snapshot_game_every_n_episodes,
                  no_episodes_before_training,
                  batch_size,
                  discount_factor,
                  target_model,
                  loss_fn,
                  optimizer,
                  use_double_dqn,
                  no_episodes_before_updating_target,
                  extract_samples_function,
                  replay_buffer_override=None):
    try:
        if replay_buffer_override is not None:
            replay_buffer = replay_buffer_override
        else:
            replay_buffer = deque(maxlen=replay_buffer_length)

        for ep in range(no_episodes):
            print(ep)
            board = Board2048()
            done = False
            board_history = []
            rewards = []
            q_values = []
            epsilon = None
            while not done:
                # value to determine how greedy the policy should be for that step
                epsilon = max((no_episodes_to_reach_epsilon - ep) /
                              no_episodes_to_reach_epsilon, min_epsilon)

                if ep < no_episodes_to_fill_up_existing_model_replay_buffer:
                    epsilon = 0

                new_board, action, reward, done, max_q_value = play_one_step(
                    board,
                    epsilon,
                    model,
                    replay_buffer,
                    reward_function=reward_function,
                    board_to_tensor_function=board_to_tensor_function,
                    device=device)
                board_history.append((board.state, ['u', 'd', 'l',
                                                    'r'][int(action)], reward))
                rewards.append(reward)
                q_values.append(float(max_q_value))
                board = new_board
            mean_of_rewards = np.mean(np.array(rewards))
            mean_of_q_values = np.mean(np.array(q_values))
            experiment.add_episode(board, epsilon, ep, mean_of_rewards,
                                   mean_of_q_values)
            if ep % snapshot_game_every_n_episodes == 0:
                experiment.snapshot_game(board_history, ep)
            if ep % 10 == 0:
                print(
                    f"Episode: {ep}: {board.merge_score()}, {np.max(board.state.flatten())}, {len(board._action_history)}"
                )
            if ep > no_episodes_before_training:
                train_step(batch_size,
                           discount_factor,
                           model,
                           target_model,
                           replay_buffer,
                           loss_fn,
                           optimizer,
                           device=device,
                           use_double_dqn=use_double_dqn,
                           board_to_tensor_function=board_to_tensor_function,
                           extract_samples_function=extract_samples_function)
            if ep % no_episodes_before_updating_target == 0 and ep >= no_episodes_to_fill_up_existing_model_replay_buffer:
                target_model.load_state_dict(copy.deepcopy(model.state_dict()))
            if ep % 1000 == 0:
                experiment.save()
                print("Saved game")

        experiment.save()

    except KeyboardInterrupt as e:
        print(e)
        print(
            f'\nKeyboard interrupt caught. Saving current experiment in {experiment.folder}'
        )
        experiment.save()

    except Exception as e:
        experiment.save()
        print(f'\nSaving current experiment in {experiment.folder}\n')
        raise e
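train_step and extract_samples_function are not shown on this page. As a rough sketch only, a plain (non-double) DQN update over a batch sampled from the replay buffer could look like the function below; the name train_step_sketch and the direct use of random.sample are assumptions, and the repository's train_step may differ, in particular when use_double_dqn is set.

import random

import torch

def train_step_sketch(batch_size, discount_factor, model, target_model,
                      replay_buffer, loss_fn, optimizer, device,
                      board_to_tensor_function):
    # Illustration only: sample a minibatch of (board, action, reward,
    # next_board, done) tuples and take one gradient step.
    batch = random.sample(replay_buffer, batch_size)
    boards, actions, rewards, next_boards, dones = zip(*batch)

    states = torch.cat([board_to_tensor_function(b, device) for b in boards])
    next_states = torch.cat(
        [board_to_tensor_function(b, device) for b in next_boards])
    actions_t = torch.tensor([int(a) for a in actions], device=device)
    rewards_t = torch.tensor([float(r) for r in rewards], device=device)
    dones_t = torch.tensor([float(d) for d in dones], device=device)

    # Bootstrapped targets from the (frozen) target network
    with torch.no_grad():
        next_q = target_model(next_states).max(dim=1).values
    targets = rewards_t + (1.0 - dones_t) * discount_factor * next_q

    # Q-values of the actions that were actually taken
    predicted = model(states).gather(1, actions_t.unsqueeze(1)).squeeze(1)

    loss = loss_fn(predicted, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return float(loss)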