def evaluate_players(p1: Player, p2: Player, games_per_battle=100, num_battles=100, silent: bool = False):
    board = Board()
    p1_wins = []
    p2_wins = []
    draws = []
    game_number = []
    game_counter = 0

    TFSessionManager.set_session(tf.Session())
    TFSessionManager.get_session().run(tf.global_variables_initializer())

    for i in range(num_battles):
        p1win, p2win, draw = battle(p1, p2, games_per_battle, silent)
        p1_wins.append(p1win)
        p2_wins.append(p2win)
        draws.append(draw)
        game_counter = game_counter + 1
        game_number.append(game_counter)

    TFSessionManager.set_session(None)

    return game_number, p1_wins, p2_wins, draws
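# Example usage (not part of the original code): a minimal sketch of how evaluate_players
# can be driven and its results plotted, ASSUMING RandomPlayer and matplotlib.pyplot (plt)
# are available, as they are in the snippets further down.
p1 = RandomPlayer()
p2 = RandomPlayer()

game_number, p1_wins, p2_wins, draws = evaluate_players(p1, p2, games_per_battle=10, num_battles=20)

plt.plot(game_number, draws, 'r-', game_number, p1_wins, 'g-', game_number, p2_wins, 'b-')
plt.show()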
def final_result(self, result: GameResult):
    """
    This method is called once the game is over. If `self.training` is True, we execute a training run for
    the Neural Network.
    :param result: The result of the game that just finished.
    """
    # Compute the final reward based on the game outcome
    if (result == GameResult.NAUGHT_WIN and self.side == NAUGHT) or (
            result == GameResult.CROSS_WIN and self.side == CROSS):
        reward = self.win_value  # type: float
    elif (result == GameResult.NAUGHT_WIN and self.side == CROSS) or (
            result == GameResult.CROSS_WIN and self.side == NAUGHT):
        reward = self.loss_value  # type: float
    elif result == GameResult.DRAW:
        reward = self.draw_value  # type: float
    else:
        raise ValueError("Unexpected game result {}".format(result))

    # The final reward is also the Q value we want to learn for the action that led to it.
    self.next_max_log.append(reward)

    # If we are in training mode we run the optimizer.
    if self.training:
        # We calculate our new estimate of what the true Q values are and feed that into the network as
        # learning target.
        targets = self.calculate_targets()

        # We convert the input states we have recorded to feature vectors to feed into the training.
        nn_input = [self.board_state_to_nn_input(x) for x in self.board_position_log]

        # We run the training step with the recorded inputs and new Q value targets.
        TFSN.get_session().run([self.nn.train_step],
                               feed_dict={self.nn.input_positions: nn_input,
                                          self.nn.target_input: targets})

        self.random_move_prob *= self.random_move_decrease
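# `calculate_targets` is not shown in this snippet. Below is a minimal sketch of one plausible
# implementation, ASSUMING the player also records `self.action_log` (the moves it played) and
# `self.values_log` (the Q values the network predicted for each recorded position). Since the
# final reward was appended to `self.next_max_log` above, every position i has a "next max" value.
def calculate_targets(self) -> [np.ndarray]:
    game_length = len(self.action_log)
    targets = []

    for i in range(game_length):
        # Start from the network's own prediction and only change the entry of the move we played:
        # its new target is the discounted maximum Q value of the successor state.
        target = np.copy(self.values_log[i])
        target[self.action_log[i]] = self.reward_discount * self.next_max_log[i]
        targets.append(target)

    return targets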
def get_probs(self, input_pos: [np.ndarray]) -> ([float], [float]):
    """
    Compute action probabilities through the Neural Network.
    :param input_pos: List of input states for which to compute probabilities
    :return: Tuple of lists of action probabilities and raw logits for the given input states
    """
    probs, logits = TFSN.get_session().run([self.nn.output, self.nn.logits],
                                           feed_dict={self.nn.state_in: input_pos})
    return probs, logits
def get_probs(self, input_pos: np.ndarray) -> ([float], [float]):
    """
    Feeds the feature vector `input_pos`, which encodes a board state, into the Neural Network and computes
    the Q values and corresponding probabilities for all moves (including illegal ones).
    :param input_pos: The feature vector to be fed into the Neural Network.
    :return: A tuple of probabilities and q values of all actions (including illegal ones).
    """
    probs, qvalues = TFSN.get_session().run([self.nn.probabilities, self.nn.q_values],
                                            feed_dict={self.nn.input_positions: [input_pos]})
    return probs[0], qvalues[0]
def get_probs(self, input_pos: [np.ndarray], network: QNetwork) -> ([float], [float]):
    """
    Feeds the feature vectors `input_pos` (which encode board states) into the Neural Network and computes
    the Q values and corresponding probabilities for all moves (including illegal ones).
    :param network: The network to get probabilities from
    :param input_pos: A list of feature vectors to be fed into the Neural Network.
    :return: A tuple of two lists: the probabilities and the q values of all actions (including illegal
             ones) for each input state.
    """
    probs, qvalues = TFSN.get_session().run([network.probabilities, network.q_values],
                                            feed_dict={network.input_positions: input_pos})
    return probs, qvalues
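# `get_valid_probs`, which the training code further down calls, is not shown in this snippet.
# A minimal sketch of the idea, ASSUMING `Board` exposes an `is_legal(index)` method: run
# `get_probs` and then mask out illegal moves so that an argmax over the result never picks them.
def get_valid_probs(self, input_pos: [np.ndarray], network: QNetwork, boards: [Board]) -> ([float], [float]):
    probabilities, qvals = self.get_probs(input_pos, network)
    probabilities = np.copy(probabilities)
    qvals = np.copy(qvals)

    for prob, board in zip(probabilities, boards):
        for index in range(len(prob)):
            if not board.is_legal(index):
                # Push illegal moves below any legal move so they are never selected.
                prob[index] = -1

    return probabilities, qvals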
from tic_tac_toe.DirectPolicyAgent import DirectPolicyAgent

min_reward = -3
max_reward = 3
num_reward_steps = 1 + max_reward - min_reward

rewards = np.zeros((num_reward_steps, num_reward_steps))

for loss_reward in range(min_reward, max_reward):
    for draw_reward in range(loss_reward + 1, max_reward + 1):
        tf.reset_default_graph()
        TFSessionManager.set_session(tf.Session())
        sess = TFSessionManager.get_session()

        nnplayer = DirectPolicyAgent("PolicyLearner1", loss_value=loss_reward, draw_value=draw_reward)
        rm_player = RndMinMaxAgent()

        sess.run(tf.global_variables_initializer())

        game_number, p1_wins, p2_wins, draws = evaluate_players(
            nnplayer, rm_player, num_battles=1000, silent=True)  # , num_battles = 20)

        print("With loss reward {} and draw reward {} we get draws: {}".format(
            loss_reward, draw_reward, draws[-1]))
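        # The `rewards` matrix allocated above is never filled in this snippet. One plausible
        # (assumed, not original) way to record the sweep is to index by the offset from
        # `min_reward` and store the final draw count of each run:
        rewards[loss_reward - min_reward, draw_reward - min_reward] = draws[-1]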
def final_result(self, result: GameResult):
    """
    This method is called once the game is over. If `self.training` is True, we execute a training run for
    the Neural Network.
    :param result: The result of the game that just finished.
    """
    self.game_counter += 1

    # Compute the final reward based on the game outcome
    if (result == GameResult.NAUGHT_WIN and self.side == NAUGHT) or (
            result == GameResult.CROSS_WIN and self.side == CROSS):
        reward = self.win_value  # type: float
    elif (result == GameResult.NAUGHT_WIN and self.side == CROSS) or (
            result == GameResult.CROSS_WIN and self.side == NAUGHT):
        reward = self.loss_value  # type: float
    elif result == GameResult.DRAW:
        reward = self.draw_value  # type: float
    else:
        raise ValueError("Unexpected game result {}".format(result))

    self.add_game_to_replay_buffer(reward)

    # If we are in training mode we run the optimizer.
    if self.training and (self.game_counter > self.pre_training_games):

        batch_third = self.batch_size // 3
        train_batch = self.replay_buffer_win.sample(batch_third)
        train_batch.extend(self.replay_buffer_loss.sample(batch_third))
        train_batch.extend(self.replay_buffer_draw.sample(batch_third))
        train_batch = np.array(train_batch)

        # Let's compute the target q values for all non-terminal moves.
        # We extract the resulting state, run it through the target network and
        # get the maximum q value (of all valid moves).
        next_states = [s[2] for s in train_batch if s[2] is not None]

        target_qs = []

        if len(next_states) > 0:
            probs, qvals = self.get_valid_probs([self.board_state_to_nn_input(s) for s in next_states],
                                                self.target_net, [Board(s) for s in next_states])

            i = 0
            for t in train_batch:
                if t[2] is not None:
                    max_move = np.argmax(probs[i])
                    max_qval = qvals[i][max_move]
                    target_qs.append(max_qval * self.reward_discount)
                    i += 1
                else:
                    target_qs.append(t[3])

            if i != len(next_states):
                print("Something wrong here!!!")
        else:
            target_qs.extend(train_batch[:, 3])

        # We convert the input states we have recorded to feature vectors to feed into the training.
        nn_input = [self.board_state_to_nn_input(x[0]) for x in train_batch]
        actions = train_batch[:, 1]

        # We run the training step with the recorded inputs and new Q value targets.
        summary, _ = TFSN.get_session().run([self.q_net.merge, self.q_net.train_step],
                                            feed_dict={self.q_net.input_positions: nn_input,
                                                       self.q_net.target_q: target_qs,
                                                       self.q_net.actions: actions})

        self.random_move_prob *= self.random_move_decrease

        if self.writer is not None:
            self.writer.add_summary(summary, self.game_counter)
            summary = tf.Summary(value=[tf.Summary.Value(tag='Random_Move_Probability',
                                                         simple_value=self.random_move_prob)])
            self.writer.add_summary(summary, self.game_counter)

        TFSN.get_session().run(self.graph_copy_op)
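# `self.graph_copy_op`, run at the end of `final_result` above, is not defined in this snippet.
# A minimal sketch of how such an op is commonly built in TF1, ASSUMING the online and target
# networks live in separate variable scopes (here called `q_net_scope` and `target_net_scope`)
# and that `tau` is a small blend factor for a soft update of the target network:
def create_graph_copy_op(q_net_scope: str, target_net_scope: str, tau: float):
    src_vars = tf.trainable_variables(q_net_scope)
    target_vars = tf.trainable_variables(target_net_scope)

    ops = []
    for src, target in zip(src_vars, target_vars):
        # Move each target variable a fraction `tau` towards its online counterpart.
        ops.append(target.assign(tau * src + (1.0 - tau) * target))

    return tf.group(*ops)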
def final_result(self, result: GameResult):
    """
    Called when the game has ended. Time to record results and train the network.
    :param result: The final result of the game
    """
    # Compute the final reward based on the game outcome
    if (result == GameResult.NAUGHT_WIN and self.side == NAUGHT) or (
            result == GameResult.CROSS_WIN and self.side == CROSS):
        final_reward = self.win_value  # type: float
    elif (result == GameResult.NAUGHT_WIN and self.side == CROSS) or (
            result == GameResult.CROSS_WIN and self.side == NAUGHT):
        final_reward = self.loss_value  # type: float
    elif result == GameResult.DRAW:
        final_reward = self.draw_value  # type: float
    else:
        raise ValueError("Unexpected game result {}".format(result))

    self.game_counter += 1

    rewards = self.calculate_rewards(final_reward, len(self.action_log))

    # noinspection PyTypeChecker
    self.add_game_to_replay_buffer(final_reward, rewards)

    # If we are in training mode we run the optimizer.
    if self.training and (self.game_counter > self.pre_training_games):

        batch_third = self.batch_size // 3
        train_batch = self.replay_buffer_win.sample(batch_third)
        train_batch.extend(self.replay_buffer_loss.sample(batch_third))
        train_batch.extend(self.replay_buffer_draw.sample(batch_third))
        train_batch = np.array(train_batch)

        # We convert the input states we have recorded to feature vectors to feed into the training.
        nn_input = np.array([self.board_state_to_nn_input(x[0]) for x in train_batch])
        actions = np.array(train_batch[:, 1])
        rewards = np.array(train_batch[:, 2])

        feed_dict = {self.nn.reward_holder: rewards,
                     self.nn.action_holder: actions,
                     self.nn.state_in: nn_input}

        summary, _, inds, rps, loss = TFSN.get_session().run(
            [self.nn.merge, self.nn.update_batch, self.nn.indexes,
             self.nn.responsible_outputs, self.nn.loss],
            feed_dict=feed_dict)

        self.random_move_probability *= self.random_move_decrease

        if self.writer is not None:
            self.writer.add_summary(summary, self.game_counter)
            summary = tf.Summary(value=[tf.Summary.Value(tag='Random_Move_Probability',
                                                         simple_value=self.random_move_probability)])
            self.writer.add_summary(summary, self.game_counter)
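# The graph nodes fed above (`indexes`, `responsible_outputs`, `loss`, `update_batch`) are defined
# elsewhere. A minimal sketch of the standard REINFORCE-style policy-gradient loss they typically
# correspond to, ASSUMING `output` is the network's tensor of action probabilities; the function
# name and the learning rate are illustrative, not part of the original code.
def build_policy_gradient_loss(output: tf.Tensor, learning_rate: float = 0.001):
    reward_holder = tf.placeholder(shape=[None], dtype=tf.float32)
    action_holder = tf.placeholder(shape=[None], dtype=tf.int32)

    # Flatten the batch of probability vectors and pick out the probability of the
    # action that was actually played in each recorded position.
    indexes = tf.range(0, tf.shape(output)[0]) * tf.shape(output)[1] + action_holder
    responsible_outputs = tf.gather(tf.reshape(output, [-1]), indexes)

    # Policy gradient loss: raise the log-probability of actions in proportion to their reward.
    loss = -tf.reduce_mean(tf.log(responsible_outputs) * reward_holder)
    update_batch = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

    return reward_holder, action_holder, indexes, responsible_outputs, loss, update_batch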
player1 = RandomPlayer()
player2 = RandomPlayer()

p1_wins = []
p1count = 0
p2_wins = []
p2count = 0
draws = []
drawcount = 0
count = []

num_battles = 100
games_per_battle = 10

TFSessionManager.set_session(tf.Session())
TFSessionManager.get_session().run(tf.global_variables_initializer())

for i in range(num_battles):
    p1win, p2win, draw = battle(player1, player2, games_per_battle, True)
    p1_wins.append(p1win)
    p1count += p1win
    p2_wins.append(p2win)
    p2count += p2win
    draws.append(draw)
    drawcount += draw
    count.append(i * games_per_battle)

# TFSessionManager.set_session(None)

p = plt.plot(count, draws, 'r-', count, p1_wins, 'g-', count, p2_wins, 'b-')

print("p1 wins = " + str(p1count / (num_battles * games_per_battle) * 100) + "% (green)")
print("p2 wins = " + str(p2count / (num_battles * games_per_battle) * 100) + "% (blue)")
mm_player = MinMaxAgent()
tq_player = TQPlayer()

p1_wins = []
p2_wins = []
draws = []
game_number = []
game_counter = 0

num_battles = 10
games_per_battle = 100
num_training_battles = 1000

TFSessionManager.set_session(tf.Session())
TFSessionManager.get_session().run(tf.global_variables_initializer())
writer = tf.summary.FileWriter('log', TFSessionManager.get_session().graph)

# Candidate players: nnplayer, rndplayer, mm_player
p1_t = deep_nnplayer
p2_t = mm_player
p1 = p1_t
p2 = p2_t

# nnplayer.training = False
# nnplayer2.training = False

for i in range(num_training_battles):
    p1win, p2win, draw = battle(p1_t, p2_t, games_per_battle, False)
    p1_wins.append(p1win)
    p2_wins.append(p2win)
    draws.append(draw)
    game_counter += 1
    game_number.append(game_counter)