Example #1
    def test(self, episodes=100):
        player, opp = (TfAgent(self), RandomAgent())
        players = (player, opp)

        winners = {pl: 0 for pl in players}
        marses = {pl: 0 for pl in players}
        kokses = {pl: 0 for pl in players}

        for episode in tqdm(range(episodes)):
            game = bg.Game(players=players, show_logs=False)

            winner, status = game.play()

            winners[winner] += 1

            if status == 2:
                marses[winner] += 1
            elif status == 3:
                kokses[winner] += 1

            winners_total = sum(winners.values())
            tqdm.write(
                f"[Episode {episode}] {player} vs {opp} "
                f"{winners[player]}:{winners[opp]} of {winners_total} games "
                f"- ({(winners[player] / winners_total) * 100.0:.2f})% "
                f"| Mars-{marses[player]}/{marses[opp]}; Koks-{kokses[player]}/{kokses[opp]}"
            )
            players = tuple(reversed(players))  # swap the tuple so the agents alternate positions each game
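
The branches above (and the identical ones in the later examples) key off undocumented numeric status codes. Judging by the counters they feed, the mapping appears to be the following; treating 1 as a plain win is an inference, since only 2 and 3 are handled explicitly:

# Inferred from the counters above; only statuses 2 and 3 appear in the code.
STATUS_LABELS = {
    1: 'plain win',   # assumed: a finished game that is neither mars nor koks
    2: 'mars',        # counted in marses
    3: 'koks',        # counted in kokses
}
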
Example #2
    def train(
            self, episodes: int,
            validation: int = None,
            save_path: PathLike = None,
            gamma: float = 1.0, epsilon: float = 0.2,
    ) -> None:
        """

        :param episodes:
        :param validation:
        :param save_path: path to save models and logs.
        :param gamma: γ - discount factor. Is used to balance immediate and future reward.
        :param epsilon: ε - chance to get random move in ε-greedy policy
        """
        if save_path:
            save_path = pathlib.Path(save_path) / f'SIGMOID_24_negative_reward_gamma_{gamma}_epsilon_{epsilon}_q_learning'
            save_path.mkdir(exist_ok=False, parents=False)

        for episode in tqdm(range(episodes)):
            if validation is not None and not (episode + 1) % validation:
                self.validate(path=save_path)
                if save_path:
                    self.save(save_path, episode + 1)

            self.model.train()

            players = (self.agent_cls(self.model), self.agent_cls(self.model))
            game = bg.Game(players=players)

            with self.e_greedy_get_action(episode, epsilon=epsilon):
                for agent, new_board, prev_board, move, available_moves in game.play_step_by_step():
                    agent: agents.NNAgent
                    pred_q = agent.estimate(board=prev_board)
                    if new_board.status:
                        reward = new_board.status
                        self.update(pred_q, torch.Tensor([reward]))
                        with prev_board.reverse() as reversed_board:
                            self.update(agent.estimate(board=reversed_board), torch.Tensor([-reward]))
                        break
                    else:
                        estimated_moves = list(agent.estimate_moves(available_moves=available_moves, board=prev_board))
                        agent_checkers, opp_checkers = prev_board.to_schema()

                        if estimated_moves:
                            max_q = np.max(estimated_moves)
                            new_q = gamma * max_q
                        else:
                            # having no legal moves at all is treated as a strongly negative outcome.
                            new_q = torch.Tensor([-1])

                        self.update(pred_q, new_q)
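
For reference, the target value this loop trains towards can be written as a small standalone helper. This is a minimal sketch, assuming estimate_moves yields scalar Q-value estimates and that a finished board's status doubles as the reward, as in the loop above; the helper name q_learning_target is illustrative and not part of the project:

import torch

def q_learning_target(estimated_moves, gamma=1.0, reward=None):
    """Compute the value the network is updated towards (illustrative sketch).

    Terminal position: the target is the game outcome used as the reward.
    Non-terminal position: the target is gamma times the best Q-value
    estimated for the available moves; with no legal moves at all the
    position is scored as -1, mirroring the training loop above.
    """
    if reward is not None:                      # game finished
        return torch.Tensor([float(reward)])
    if estimated_moves:                         # greedy bootstrap over next moves
        return gamma * max(float(q) for q in estimated_moves)
    return torch.Tensor([-1.0])                 # no legal move available
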
Example #3
    def validate(self, episodes: int = 100, path: pathlib.Path = None) -> None:
        """Compare current model against random player, save it on disk.
        :param episodes: number of games to play against random player.
        :param path: if specified, log into it.
        """
        @contextmanager
        def validate_log_open(path: pathlib.Path = None):
            """Open validate file or write into stdout, if it is not specified."""
            if path:
                path = path / 'validate.log'
                path.touch(exist_ok=True)
                with open(path, 'a') as f:
                    yield f
            else:
                yield sys.stdout

        self.model.eval()
        player, opp = self.agent_cls(self.model), agents.RandomAgent()
        players = (player, opp)

        winners = {pl: 0 for pl in players}
        marses = {pl: 0 for pl in players}
        kokses = {pl: 0 for pl in players}

        with validate_log_open(path) as f:
            for episode in tqdm(range(episodes)):
                game = bg.Game(players=players, show_logs=False)

                winner, status = game.play()

                winners[winner] += 1

                if status == 2:
                    marses[winner] += 1
                elif status == 3:
                    kokses[winner] += 1

                winners_total = sum(winners.values())
                tqdm.write(
                    f"[Episode {episode}] {player} vs {opp} "
                    f"{winners[player]}:{winners[opp]} of {winners_total} games "
                    f"- ({(winners[player] / winners_total) * 100.0:.2f})% "
                    f"| Mars-{marses[player]}/{marses[opp]}; Koks-{kokses[player]}/{kokses[opp]}",
                    file=f
                )
                players = tuple(reversed(players))
            tqdm.write('____________________________________________________________', file=f)
Example #4
def check():
    model = torch.nn.Sequential(
        torch.nn.Linear(24, 40),
        # torch.nn.Linear(720, 100),
        torch.nn.ReLU(),
        torch.nn.Linear(40, 1)
    ).cuda()
    model.load_state_dict(
        torch.load(pathlib.Path('./models/24_negative_reward_gamma_0.9_epsilon_0.3_q_learning/1500.pth')))
    model.eval()
    player, opp = agents.NNAgent(model), agents.RandomAgent()
    players = (player, opp)
    game = bg.Game(players=players, show_logs=True)
    game.board = bg.Board.from_schema({22: 11}, {23: 5, 22: 4})

    available_moves = game.board.get_available_moves((1, 1))
    result = list(player.estimate_moves(available_moves, game.board))
    return result
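
check() above only collects the raw estimates for the available moves. Turning those estimates into an actual move choice is a simple argmax; a minimal sketch, assuming estimate_moves yields one scalar per move in the same order as available_moves (pick_greedy_move is a made-up name, not part of the project):

def pick_greedy_move(available_moves, estimates):
    """Pair each available move with its estimate and return the highest-scoring one."""
    scored = list(zip(available_moves, estimates))
    if not scored:
        return None                             # no legal move this turn
    best_move, _best_value = max(scored, key=lambda pair: float(pair[1]))
    return best_move
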
Example #5
def play(episodes=100):
    # model = torch.nn.Sequential(
    #     torch.nn.Linear(24, 100),
    #     # torch.nn.Linear(720, 100),
    #     torch.nn.ReLU(),
    #     torch.nn.Linear(100, 1)
    # ).cuda()
    # model.load_state_dict(torch.load(pathlib.Path('./models/720_gamma_0.95_epsilon_0.3_q_learning/1500.pth')))
    # model.eval()
    #
    # player, opp = agents.NNAgent(model), agents.RandomAgent()
    player, opp = agents.RandomAgent(), agents.RandomAgent()
    players = (player, opp)

    winners = {pl: 0 for pl in players}
    marses = {pl: 0 for pl in players}
    kokses = {pl: 0 for pl in players}

    for episode in tqdm(range(episodes)):
        game = bg.Game(players=players, show_logs=True)

        winner, status = game.play()

        winners[winner] += 1

        if status == 2:
            marses[winner] += 1
        elif status == 3:
            kokses[winner] += 1

        winners_total = sum(winners.values())
        tqdm.write(
            f"[Episode {episode}] {player} vs {opp} "
            f"{winners[player]}:{winners[opp]} of {winners_total} games "
            f"- ({(winners[player] / winners_total) * 100.0:.2f})% "
            f"| Mars-{marses[player]}/{marses[opp]}; Koks-{kokses[player]}/{kokses[opp]}",
            file=None
        )
Example #6
    player, opp = [
        agents.TCPAgent.with_server(initializer(path, model_cls), port=port)
        for port, path, model_cls in zip(ports, PATHS, (Model, OtherModel))
    ]

    players = (player, opp)

    time.sleep(1)  # give the TCP model servers a moment to start up

    winners = {pl: 0 for pl in players}
    marses = {pl: 0 for pl in players}
    kokses = {pl: 0 for pl in players}

    for episode in tqdm(range(100)):
        game = bg.Game(players=players, show_logs=False)

        winner, status = game.play()

        winners[winner] += 1

        if status == 2:
            marses[winner] += 1
        elif status == 3:
            kokses[winner] += 1

        winners_total = sum(winners.values())
        tqdm.write(
            f"[Episode {episode}] {player} vs {opp} "
            f"{winners[player]}:{winners[opp]} of {winners_total} games "
            f"- ({(winners[player] / winners_total) * 100.0:.2f})% "
Example #7
    def train(self):
        tf.train.write_graph(self.sess.graph,
                             self.path,
                             'model.pb',
                             as_text=False)
        summary_writer = tf.summary.FileWriter(
            os.path.join(self.summaries_path, str(int(time.time()))),
            self.sess.graph)

        # the agent plays against itself, making the best move for each player
        players = (player, opp) = (TfAgent(self), TfAgent(self))

        validation_interval = 500
        episodes = 3000

        for episode in tqdm(range(episodes)):
            game = bg.Game(players=players)

            game_step = 0

            x = self.extract_features(game.board)

            while game.board.status is None:
                current_player = next(game.players_steps)
                with game.board.reverse(
                        fake=(current_player == player)) as board:
                    game.make_step(player=current_player, board=board)

                x_next = self.extract_features(game.board)
                V_next = self.get_output(x_next)

                self.sess.run(self.train_op,
                              feed_dict={
                                  self.x: x,
                                  self.V_next: V_next
                              })
                x = x_next
                game_step += 1

            V_next = 0 if current_player != player else 1

            _, global_step, summaries, _ = self.sess.run(
                [self.train_op, self.global_step, self.summaries_op, self.reset_op],
                feed_dict={
                    self.x: x,
                    self.V_next: np.array([[V_next]], dtype='float'),
                })

            summary_writer.add_summary(summaries, global_step=global_step)
            winner = 'X' if current_player == players[0] else 'O'
            # print(f"Game {episode}/{episodes} (Winner: {winner}) in {game_step} turns")

            if (episode + 1) % validation_interval == 0:
                self.saver.save(self.sess,
                                os.path.join(self.checkpoint_path,
                                             'checkpoint'),
                                global_step=global_step)
                self.test(episodes=100)

        summary_writer.close()

        self.test(episodes=1000)
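
The loop above bootstraps each position's value from the network's estimate of the next position and, once the game ends, replaces that estimate with the actual outcome (1 if player made the winning move, 0 otherwise). A minimal sketch of that TD(0)-style target; td_target is an illustrative name only:

def td_target(v_next_estimate, game_over=False, player_won=False):
    """Return the value V(s_t) is trained towards in the loop above (sketch).

    During play the target is simply the network's estimate of the next
    position; at the end of the game it is replaced by the true outcome.
    """
    if game_over:
        return 1.0 if player_won else 0.0
    return v_next_estimate
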
Example #8
"""

all_dice = (dice for dice in store['dice'])
all_moves = (moves for moves in store['moves'])


class FakeRandomAgent(RandomAgent):
    def get_action(self, available_moves: Set[bg.Moves],
                   board: bg.Board) -> bg.Moves:
        return next(all_moves)


def roll_dice(*args, **kwargs):
    return next(all_dice)


if __name__ == '__main__':
    t1 = time.time()
    players = (FakeRandomAgent(), FakeRandomAgent())
    game = bg.Game(players=players,
                   show_logs=False,
                   who_start=store['who_start'])
    bg.roll_dice = roll_dice

    winner, status = game.play()

    print(game.board)
    print(winner, status)
    print(game._store)
    print(time.time() - t1)
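
The script replays a recorded game: store must supply the recorded dice rolls, the moves each FakeRandomAgent should reproduce, and which player started. Its exact structure is whatever the original game recorded (presumably via game._store); the shape below is purely illustrative, with placeholder values:

# Illustrative only: the keys match the lookups above; the concrete value
# types are whatever the recorded game produced.
store = {
    'who_start': 0,            # which player moved first in the recorded game
    'dice': [(3, 5), (6, 6)],  # one dice roll per turn, in replay order
    'moves': [],               # the moves each agent chose, in replay order
}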