Example No. 1
    def final_result(self, result: GameResult):
        """
        This method is called once the game is over. If `self.training` is True, we execute a training run for
        the Neural Network.
        :param result: The result of the game that just finished.
        """

        # Compute the final reward based on the game outcome
        if (result == GameResult.NAUGHT_WIN
                and self.side == NAUGHT) or (result == GameResult.CROSS_WIN
                                             and self.side == CROSS):
            reward = self.win_value  # type: float
        elif (result == GameResult.NAUGHT_WIN
              and self.side == CROSS) or (result == GameResult.CROSS_WIN
                                          and self.side == NAUGHT):
            reward = self.loss_value  # type: float
        elif result == GameResult.DRAW:
            reward = self.draw_value  # type: float
        else:
            raise ValueError("Unexpected game result {}".format(result))

        # The final reward is also the Q value we want to learn for the action that led to it.
        self.next_max_log.append(reward)

        # If we are in training mode we run the optimizer.
        if self.training:
            # We calculate our new estimate of what the true Q values are and feed that into the network as
            # the learning target.
            targets = self.calculate_targets()

            # We convert the input states we have recorded to feature vectors to feed into the training.
            nn_input = [
                self.board_state_to_nn_input(x)
                for x in self.board_position_log
            ]

            # We run the training step with the recorded inputs and new Q value targets.
            TFSN.get_session().run([self.nn.train_step],
                                   feed_dict={
                                       self.nn.input_positions: nn_input,
                                       self.nn.target_input: targets
                                   })

            self.random_move_prob *= self.random_move_decrease
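The training step above relies on `calculate_targets`, which is not shown in this example. Below is a minimal sketch of such a Q-target computation, assuming the agent also records an `action_log` of moves played, a `values_log` of the Q values the network produced for each position, and a `reward_discount` attribute (these names are assumptions; only `board_position_log` and `next_max_log` appear above):

    def calculate_targets(self) -> [np.ndarray]:
        """
        Build one target vector per recorded position: copy the Q values the network
        predicted and overwrite the entry of the action actually taken with the
        discounted maximum Q value observed after that move.
        """
        targets = []
        for i in range(len(self.action_log)):
            target = np.copy(self.values_log[i])
            # Q-learning target for the chosen action: discounted best future value.
            target[self.action_log[i]] = self.reward_discount * self.next_max_log[i]
            targets.append(target)
        return targets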
Example No. 2
    def get_probs(self, input_pos: [np.ndarray]) -> ([float], [float]):
        """
        Compute action probabilities through the Neural Network
        :param input_pos: List of input states for which to compute probabilities
        :return: Tuple of lists of action probabilities and raw logits for the given input states
        """
        probs, logits = TFSN.get_session().run(
            [self.nn.output, self.nn.logits],
            feed_dict={self.nn.state_in: input_pos})
        return probs, logits
Example No. 3
    def get_probs(self, input_pos: np.ndarray) -> ([float], [float]):
        """
        Feeds the feature vector `input_pos` which encodes a board state into the Neural Network and computes the
        Q values and corresponding probabilities for all moves (including illegal ones).
        :param input_pos: The feature vector to be fed into the Neural Network.
        :return: A tuple of probabilities and q values of all actions (including illegal ones).
        """
        probs, qvalues = TFSN.get_session().run(
            [self.nn.probabilities, self.nn.q_values],
            feed_dict={self.nn.input_positions: [input_pos]})
        return probs[0], qvalues[0]
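A caller would typically mask out occupied squares before picking a move from these probabilities. A small usage sketch, assuming `player` is an instance of this agent, `nn_input` was produced by `board_state_to_nn_input` as in Example No. 1, and `legal_mask` is a hypothetical boolean array marking the empty squares:

probs, q_values = player.get_probs(nn_input)
# Zero out the probabilities of occupied squares, then pick the best remaining move.
masked_probs = np.where(legal_mask, probs, 0.0)
move = int(np.argmax(masked_probs))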
Example No. 4
def evaluate_players(p1: Player,
                     p2: Player,
                     games_per_battle=100,
                     num_battles=100):
    board = Board()

    p1_wins = []
    p2_wins = []
    draws = []
    game_number = []
    game_counter = 0

    TFSessionManager.set_session(tf.Session())
    TFSessionManager.get_session().run(tf.global_variables_initializer())

    for i in range(num_battles):
        p1win, p2win, draw = battle(p1, p2, games_per_battle, False)
        p1_wins.append(p1win)
        p2_wins.append(p2win)
        draws.append(draw)
        game_counter = game_counter + 1
        game_number.append(game_counter)

    TFSessionManager.set_session(None)
    return game_number, p1_wins, p2_wins, draws
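A possible way to call `evaluate_players` and plot the series it returns, assuming matplotlib is available and using the `RandomPlayer` and `MinMaxAgent` classes that appear in the later examples:

import matplotlib.pyplot as plt

game_number, p1_wins, p2_wins, draws = evaluate_players(RandomPlayer(), MinMaxAgent())

# Red: draws, green: player 1 wins, blue: player 2 wins.
plt.plot(game_number, draws, 'r-',
         game_number, p1_wins, 'g-',
         game_number, p2_wins, 'b-')
plt.xlabel('Battle number')
plt.ylabel('Games won per battle')
plt.show()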
Example No. 5
    def get_probs(self, input_pos: [np.ndarray],
                  network: QNetwork) -> ([float], [float]):
        """
        Feeds the feature vectors `input_pos` (which encode board states) into the Neural Network and computes the
        Q values and corresponding probabilities for all moves (including illegal ones).
        :param network: The network to get probabilities from
        :param input_pos: A list of feature vectors to be fed into the Neural Network.
        :return: A list of tuples of probabilities and q values of all actions (including illegal ones).
        """
        probs, qvalues = TFSN.get_session().run(
            [network.probabilities, network.q_values],
            feed_dict={network.input_positions: input_pos})
        return probs, qvalues
Example No. 6
import numpy as np
import tensorflow as tf

from tic_tac_toe.RndMinMaxAgent import RndMinMaxAgent
from tic_tac_toe.DirectPolicyAgent import DirectPolicyAgent

min_reward = -3
max_reward = 3

num_reward_steps = 1 + max_reward - min_reward

rewards = np.zeros((num_reward_steps, num_reward_steps))

for loss_reward in range(min_reward, max_reward):
    for draw_reward in range(loss_reward + 1, max_reward + 1):

        tf.reset_default_graph()
        TFSessionManager.set_session(tf.Session())

        sess = TFSessionManager.get_session()

        nnplayer = DirectPolicyAgent("PolicyLearner1",
                                     loss_value=loss_reward,
                                     draw_value=draw_reward)
        rm_player = RndMinMaxAgent()

        sess.run(tf.global_variables_initializer())

        game_number, p1_wins, p2_wins, draws = evaluate_players(
            nnplayer, rm_player, num_battles=1000,
            silent=True)  # , num_battles = 20)

        print("With loss reward {} and draw reward {} we get draws: {}".format(
Example No. 7
    def final_result(self, result: GameResult):
        """
        This method is called once the game is over. If `self.training` is True, we execute a training run for
        the Neural Network.
        :param result: The result of the game that just finished.
        """

        self.game_counter += 1

        # Compute the final reward based on the game outcome
        if (result == GameResult.NAUGHT_WIN
                and self.side == NAUGHT) or (result == GameResult.CROSS_WIN
                                             and self.side == CROSS):
            reward = self.win_value  # type: float
        elif (result == GameResult.NAUGHT_WIN
              and self.side == CROSS) or (result == GameResult.CROSS_WIN
                                          and self.side == NAUGHT):
            reward = self.loss_value  # type: float
        elif result == GameResult.DRAW:
            reward = self.draw_value  # type: float
        else:
            raise ValueError("Unexpected game result {}".format(result))

        self.add_game_to_replay_buffer(reward)

        # If we are in training mode we run the optimizer.
        if self.training and (self.game_counter > self.pre_training_games):

            batch_third = self.batch_size // 3
            train_batch = self.replay_buffer_win.sample(batch_third)
            train_batch.extend(self.replay_buffer_loss.sample(batch_third))
            train_batch.extend(self.replay_buffer_draw.sample(batch_third))
            train_batch = np.array(train_batch)

            #
            # Compute the target Q values for all non-terminal moves:
            # we extract the resulting state, run it through the target network and
            # take the maximum Q value (over all valid moves).
            next_states = [s[2] for s in train_batch if s[2] is not None]
            target_qs = []

            if len(next_states) > 0:
                probs, qvals = self.get_valid_probs(
                    [self.board_state_to_nn_input(s) for s in next_states],
                    self.target_net, [Board(s) for s in next_states])

                i = 0
                for t in train_batch:
                    if t[2] is not None:
                        max_move = np.argmax(probs[i])
                        max_qval = qvals[i][max_move]
                        target_qs.append(max_qval * self.reward_discount)
                        i += 1
                    else:
                        target_qs.append(t[3])

                if i != len(next_states):
                    print("Something wrong here!!!")
            else:
                target_qs.extend(train_batch[:, 3])

            # We convert the input states we have recorded to feature vectors to feed into the training.
            nn_input = [
                self.board_state_to_nn_input(x[0]) for x in train_batch
            ]
            actions = train_batch[:, 1]

            # We run the training step with the recorded inputs and new Q value targets.
            summary, _ = TFSN.get_session().run(
                [self.q_net.merge, self.q_net.train_step],
                feed_dict={
                    self.q_net.input_positions: nn_input,
                    self.q_net.target_q: target_qs,
                    self.q_net.actions: actions
                })
            self.random_move_prob *= self.random_move_decrease

            if self.writer is not None:
                self.writer.add_summary(summary, self.game_counter)
                summary = tf.Summary(value=[
                    tf.Summary.Value(tag='Random_Move_Probability',
                                     simple_value=self.random_move_prob)
                ])
                self.writer.add_summary(summary, self.game_counter)

            TFSN.get_session().run(self.graph_copy_op)
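The `graph_copy_op` executed at the end, which syncs the target network with the online network, is not shown here. One common way to build such an op in TensorFlow 1.x is sketched below, assuming the two networks were created in the variable scopes `q_net` and `target_net` (the helper name and scope names are assumptions):

def create_graph_copy_op(source_scope: str, target_scope: str):
    # Pair up the trainable variables of both networks by name order and
    # group one assign op per pair into a single copy op.
    source_vars = sorted(tf.trainable_variables(source_scope), key=lambda v: v.name)
    target_vars = sorted(tf.trainable_variables(target_scope), key=lambda v: v.name)
    return tf.group(*[target.assign(source)
                      for source, target in zip(source_vars, target_vars)])

# graph_copy_op = create_graph_copy_op('q_net', 'target_net')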
Example No. 8
    def final_result(self, result: GameResult):
        """
        Called when the game has ended. Time to record results and train the network.
        :param result: The final result of the game
        """
        # Compute the final reward based on the game outcome
        if (result == GameResult.NAUGHT_WIN
                and self.side == NAUGHT) or (result == GameResult.CROSS_WIN
                                             and self.side == CROSS):
            final_reward = self.win_value  # type: float
        elif (result == GameResult.NAUGHT_WIN
              and self.side == CROSS) or (result == GameResult.CROSS_WIN
                                          and self.side == NAUGHT):
            final_reward = self.loss_value  # type: float
        elif result == GameResult.DRAW:
            final_reward = self.draw_value  # type: float
        else:
            raise ValueError("Unexpected game result {}".format(result))

        self.game_counter += 1

        rewards = self.calculate_rewards(final_reward, len(self.action_log))

        # noinspection PyTypeChecker
        self.add_game_to_replay_buffer(final_reward, rewards)

        # If we are in training mode we run the optimizer.
        if self.training and (self.game_counter > self.pre_training_games):

            batch_third = self.batch_size // 3
            train_batch = self.replay_buffer_win.sample(batch_third)
            train_batch.extend(self.replay_buffer_loss.sample(batch_third))
            train_batch.extend(self.replay_buffer_draw.sample(batch_third))
            train_batch = np.array(train_batch)

            # We convert the input states we have recorded to feature vectors to feed into the training.
            nn_input = np.array(
                [self.board_state_to_nn_input(x[0]) for x in train_batch])
            actions = np.array(train_batch[:, 1])
            rewards = np.array(train_batch[:, 2])
            feed_dict = {
                self.nn.reward_holder: rewards,
                self.nn.action_holder: actions,
                self.nn.state_in: nn_input
            }
            summary, _, inds, rps, loss = TFSN.get_session().run(
                [
                    self.nn.merge, self.nn.update_batch, self.nn.indexes,
                    self.nn.responsible_outputs, self.nn.loss
                ],
                feed_dict=feed_dict)

            self.random_move_probability *= self.random_move_decrease

            if self.writer is not None:
                self.writer.add_summary(summary, self.game_counter)
                summary = tf.Summary(value=[
                    tf.Summary.Value(tag='Random_Move_Probability',
                                     simple_value=self.random_move_probability)
                ])
                self.writer.add_summary(summary, self.game_counter)
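This agent spreads the final reward over the moves of the game via `calculate_rewards`, which is not shown. Below is a minimal sketch of a standard discounted-reward scheme, assuming a `reward_discount` attribute on the agent (the exact scheme the agent uses may differ):

    def calculate_rewards(self, final_reward: float, length: int) -> [float]:
        # Work backwards from the end of the game: the last move receives the full
        # reward, each earlier move receives it discounted one more time.
        rewards = np.zeros(length)
        running_reward = final_reward
        for i in reversed(range(length)):
            rewards[i] = running_reward
            running_reward *= self.reward_discount
        return rewards.tolist()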
Example No. 9
player1 = RandomPlayer()

player2 = RandomPlayer()

p1_wins = []
p1count = 0
p2_wins = []
p2count = 0
draws = []
drawcount = 0
count = []
num_battles = 100
games_per_battle = 10

TFSessionManager.set_session(tf.Session())
TFSessionManager.get_session().run(tf.global_variables_initializer())

for i in range(num_battles):
    p1win, p2win, draw = battle(player1, player2, games_per_battle, True)
    p1_wins.append(p1win)
    p1count += p1win
    p2_wins.append(p2win)
    p2count += p2win
    draws.append(draw)
    drawcount += draw
    count.append(i * games_per_battle)
#TFSessionManager.set_session(None)
p = plt.plot(count, draws, 'r-', count, p1_wins, 'g-', count, p2_wins, 'b-')
print("p1 wins = " + str(p1count / (num_battles * games_per_battle) * 100) +
      "% (green)")
Example No. 10
rndplayer = RandomPlayer()
mm_player = MinMaxAgent()
tq_player = TQPlayer()

p1_wins = []
p2_wins = []
draws = []
game_number = []
game_counter = 0

num_battles = 10
games_per_battle = 100
num_training_battles = 1000

TFSessionManager.set_session(tf.Session())

TFSessionManager.get_session().run(tf.global_variables_initializer())
writer = tf.summary.FileWriter('log', TFSessionManager.get_session().graph)

# nnplayer rndplayer mm_player
p1_t = deep_nnplayer
p2_t = mm_player

p1 = p1_t
p2 = p2_t

# nnplayer.training= False
# nnplayer2.training= False

for i in range(num_training_battles):