def test_tic_tac_toe(self):
    game_spec = TicTacToeGameSpec()
    create_model_func = functools.partial(create_network, game_spec.board_squares(), (100, 100, 100))
    variables, win_rate = train_policy_gradients(game_spec, create_model_func, None,
                                                 learn_rate=1e-4,
                                                 number_of_games=60000,
                                                 print_results_every=1000,
                                                 batch_size=100,
                                                 randomize_first_player=False)
    self.assertGreater(win_rate, 0.4)
class TestCreatePositionsSet(TestCase):
    def setUp(self):
        self._game_spec = TicTacToeGameSpec()

    def test_create_positions(self):
        number_of_positions = 100
        positions = create_positions_set(self._game_spec,
                                         number_of_positions,
                                         self._game_spec.get_random_player_func())
        self.assertGreater(len(positions), number_of_positions - 1)
""" import functools from common.network_helpers import create_network from games.tic_tac_toe import TicTacToeGameSpec from techniques.train_policy_gradient import train_policy_gradients BATCH_SIZE = 100 # every how many games to do a parameter update? LEARN_RATE = 1e-4 PRINT_RESULTS_EVERY_X = 100 # every how many games to print the results NETWORK_FILE_PATH = 'current_network.p' # path to save the network to NUMBER_OF_GAMES_TO_RUN = 1000 # to play a different game change this to another spec, e.g TicTacToeXGameSpec or ConnectXGameSpec, to get these to run # well may require tuning the hyper parameters a bit game_spec = TicTacToeGameSpec() create_network_func = functools.partial(create_network, game_spec.board_squares(), (100, 100, 100)) train_policy_gradients(game_spec, create_network_func, NETWORK_FILE_PATH, number_of_games=NUMBER_OF_GAMES_TO_RUN, batch_size=BATCH_SIZE, learn_rate=LEARN_RATE, print_results_every=PRINT_RESULTS_EVERY_X) def second_player_move(board_state, side):
                first_unvisited_node = False

            current_side = -current_side
            result = game_spec.has_winner(current_board_state)

        for path_board_state, path_side in rollout_path:
            state_samples[path_board_state] += 1.
            # flip the result to the perspective of the player who moved into this state,
            # then normalize it to be between 0 and 1; before this it is between -1 and 1
            side_result = result * path_side
            side_result = side_result / 2. + .5
            state_results[path_board_state] += side_result

    move_states = {move: game_spec.apply_move(board_state, move, side)
                   for move in game_spec.available_moves(board_state)}

    move = max(move_states, key=lambda x: state_results[move_states[x]] / state_samples[move_states[x]])

    return state_results[move_states[move]] / state_samples[move_states[move]], move


if __name__ == '__main__':
    from games.tic_tac_toe import TicTacToeGameSpec

    sample_board_state = ((1, 0, -1),
                          (1, 0, 0),
                          (0, -1, 0))

    print(monte_carlo_tree_search_uct(TicTacToeGameSpec(), sample_board_state, -1, 10000))
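# Worked example of the normalization in the backup loop above (explanatory note,
# not in the original source): has_winner returns -1, 0 or 1 from player one's
# perspective; multiplying by path_side flips it to the perspective of the player
# at that node, and result / 2. + .5 maps {-1, 0, 1} onto {0.0, 0.5, 1.0}.
# state_results / state_samples is therefore an average payoff between 0 (always
# lose) and 1 (always win), which is what the max() move selection compares.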
import random

from games.tic_tac_toe import TicTacToeGameSpec
from network_helpers import create_network, load_network, save_network, \
    get_deterministic_network_move

HIDDEN_NODES_VALUE = (120, 100, 80, 60, 40)
HIDDEN_NODES_REINFORCEMENT = (100, 80, 60, 40)
BATCH_SIZE = 100  # every how many games to do a parameter update?
LEARN_RATE = 1e-4
REINFORCEMENT_NETWORK_PATH = 'current_network.p'
VALUE_NETWORK_PATH = 'value_network.p'
TRAIN_SAMPLES = 10000
TEST_SAMPLES = 10000

# to play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec
game_spec = TicTacToeGameSpec()

# random.randint needs integer bounds, so truncate the upper end of the range
NUMBER_RANDOM_RANGE = (1, int(game_spec.board_squares() * 0.8))


# it would be good to have real board positions, but failing that just generate random ones
def generate_random_board_position():
    while True:
        board_state = game_spec.new_board()
        number_moves = random.randint(*NUMBER_RANDOM_RANGE)
        side = 1
        for _ in range(number_moves):
            board_state = game_spec.apply_move(board_state,
                                               random.choice(list(game_spec.available_moves(board_state))),
                                               side)
            if game_spec.has_winner(board_state) != 0:  # start again if we hit an already winning position
def load_games():
    """If we had a database of games this would load and return it...

    Returns:
    """
    raise Exception("If we had a database of tic-tac-toe games this would load them")


HIDDEN_NODES = (100, 80, 60, 40)  # number of hidden layer neurons
BATCH_SIZE = 100  # every how many games to do a parameter update?
LEARN_RATE = 1e-4
NETWORK_FILE_PATH = 'current_network.p'

game_spec = TicTacToeGameSpec()

input_layer, output_layer, variables = create_network(game_spec.board_squares(), HIDDEN_NODES,
                                                      output_nodes=game_spec.outputs())

actual_move_placeholder = tf.placeholder("float", (None, game_spec.outputs()))
error = tf.reduce_sum(tf.square(actual_move_placeholder - output_layer))
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(error)

with tf.Session() as session:
    session.run(tf.initialize_all_variables())

    if os.path.isfile(NETWORK_FILE_PATH):
        print("loading existing network")
        load_network(session, variables, NETWORK_FILE_PATH)
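    # Illustrative sketch only (not part of the original file): if load_games()
    # returned (board_state, move) training pairs, a single supervised update would
    # feed flattened board states and one-hot move vectors into the placeholders
    # defined above; `board_states` and `one_hot_moves` are hypothetical names:
    #
    #     session.run(train_step, feed_dict={input_layer: board_states,
    #                                        actual_move_placeholder: one_hot_moves})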
import random

from common.network_helpers import create_network, load_network, save_network, \
    get_deterministic_network_move
from games.tic_tac_toe import TicTacToeGameSpec

HIDDEN_NODES_VALUE = (100, 100, 100)
HIDDEN_NODES_REINFORCEMENT = (100, 100, 100)
BATCH_SIZE = 100  # every how many games to do a parameter update?
LEARN_RATE = 1e-4
REINFORCEMENT_NETWORK_PATH = 'current_network.p'
VALUE_NETWORK_PATH = 'value_network.p'
TRAIN_SAMPLES = 10000
TEST_SAMPLES = 10000

# to play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec
game_spec = TicTacToeGameSpec()

# random.randint needs integer bounds, so truncate the upper end of the range
NUMBER_RANDOM_RANGE = (1, int(game_spec.board_squares() * 0.8))


# it would be good to have real board positions, but failing that just generate random ones
def generate_random_board_position():
    while True:
        board_state = game_spec.new_board()
        number_moves = random.randint(*NUMBER_RANDOM_RANGE)
        side = 1
        for _ in range(number_moves):
            board_state = game_spec.apply_move(board_state,
                                               random.choice(list(game_spec.available_moves(board_state))),
                                               side)
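# Hypothetical usage sketch (not in the original file): once the generator above
# is complete, it would build the value-network data sets, e.g.:
#
#     train_positions = [generate_random_board_position() for _ in range(TRAIN_SAMPLES)]
#     test_positions = [generate_random_board_position() for _ in range(TEST_SAMPLES)]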
""" This is the same as the policy_gradient.py network except that instead of playing against a random opponent. It plays against previous versions of itself. It is first created with the weights from the "current_network.p" file, if no file is found there random weights are used. It then creates a series of copies of itself and plays against them. After "SAVE_HISTORICAL_NETWORK_EVERY" games, it saves it's current weights into the weights of one of the historical networks. Over time the main network and the historical networks should improve. """ import collections import functools import os import random import numpy as np import tensorflow as tf from common.network_helpers import create_network, load_network, get_stochastic_network_move, \ save_network from games.tic_tac_toe import TicTacToeGameSpec from techniques.train_policy_gradient_historic import train_policy_gradients_vs_historic HIDDEN_NODES = (100, 100, 100) SAVE_HISTORICAL_NETWORK_EVERY = 10000 game_spec = TicTacToeGameSpec() create_network_func = functools.partial(create_network, game_spec.board_squares(), HIDDEN_NODES) train_policy_gradients_vs_historic(game_spec, create_network_func, 'train_vs_historical.p', save_historic_every=SAVE_HISTORICAL_NETWORK_EVERY)
                    default=10000,
                    help="Every how many games to print results.")
parser.add_argument("--learning-rate", type=float, default=1e-4)
parser.add_argument("--batch-size", type=int, default=100,
                    help="Every how many games to update network weights")
parser.add_argument("hidden_layers", nargs="*", type=int,
                    help="List of hidden layer sizes")
args = parser.parse_args()

# to play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec;
# getting those to run well may require tuning the hyper-parameters a bit
game_spec = TicTacToeGameSpec()

if not args.hidden_layers:
    args.hidden_layers = (100, 100, 100)

create_network_func = functools.partial(create_network, game_spec.board_squares(), args.hidden_layers)

network_file_path = 'current_network'
for n in args.hidden_layers:
    network_file_path += "_%05d" % n
network_file_path += ".p"
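# e.g. running with hidden layers "100 100 100" yields the file name
# 'current_network_00100_00100_00100.p' (one zero-padded block per layer size)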
a valid move, so initially it must learn the rules of the game.

I have trained this version with success at 3x3 tic-tac-toe until it has a success rate in the region of 75%. That
may be as good as it can do: 3x3 tic-tac-toe is a theoretical draw, so the random opponent will often get lucky
and force a draw.
"""
import functools

from common.network_helpers import create_network
from games.tic_tac_toe import TicTacToeGameSpec
from techniques.train_policy_gradient import train_policy_gradients

BATCH_SIZE = 100  # every how many games to do a parameter update?
LEARN_RATE = 1e-4
PRINT_RESULTS_EVERY_X = 1000  # every how many games to print the results
NETWORK_FILE_PATH = None  # path to save the network to, e.g. 'current_network.p' (None means don't save)
NUMBER_OF_GAMES_TO_RUN = 1000000

# to play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec;
# getting those to run well may require tuning the hyper-parameters a bit
game_spec = TicTacToeGameSpec()

create_network_func = functools.partial(create_network, game_spec.board_squares(), (100, 100, 100))

train_policy_gradients(game_spec, create_network_func, NETWORK_FILE_PATH,
                       number_of_games=NUMBER_OF_GAMES_TO_RUN,
                       batch_size=BATCH_SIZE,
                       learn_rate=LEARN_RATE,
                       print_results_every=PRINT_RESULTS_EVERY_X)
import numpy as np
import tensorflow as tf

from games.tic_tac_toe import TicTacToeGameSpec
from network_helpers import create_network, load_network, get_stochastic_network_move, save_network

HIDDEN_NODES = (100, 80, 60, 40)
BATCH_SIZE = 100  # every how many games to do a parameter update?
LEARN_RATE = 1e-4
PRINT_RESULTS_EVERY_X = 1000  # every how many games to print the results
NETWORK_FILE_PATH = 'current_network.p'
NUMBER_OF_GAMES_TO_RUN = 100000

# to play a different game change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec
game_spec = TicTacToeGameSpec()

OUTPUT_NODES = game_spec.outputs()

reward_placeholder = tf.placeholder("float", shape=(None,))
actual_move_placeholder = tf.placeholder("float", shape=(None, OUTPUT_NODES))

input_layer, output_layer, variables = create_network(game_spec.board_squares(), HIDDEN_NODES,
                                                      output_nodes=OUTPUT_NODES)

policy_gradient = tf.reduce_sum(tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer)
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(-policy_gradient)

with tf.Session() as session:
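    # How the objective above works (explanatory note, not in the original
    # source): actual_move_placeholder holds a one-hot vector per position, so
    # the elementwise product with output_layer keeps only the network's output
    # for the move that was actually played; scaling by the broadcast reward and
    # minimizing the negative sum pushes up moves from won games and pushes down
    # moves from lost ones.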
import collections

import tensorflow as tf

from games.tic_tac_toe import TicTacToeGameSpec
from network_helpers import create_network, load_network, get_stochastic_network_move, \
    save_network

NUMBER_OF_HISTORICAL_COPIES_TO_KEEP = 8
NUMBER_OF_GAMES_TO_PLAY = 1000000
MINI_BATCH_SIZE = 100
SAVE_HISTORICAL_NETWORK_EVERY = 100000
STARTING_NETWORK_WEIGHTS = 'current_network.p'
BASE_HISTORICAL_NETWORK_PATH = 'historical_network_'
HIDDEN_NODES = (100, 80, 60, 40)
PRINT_RESULTS_EVERY_X = 500
LEARN_RATE = 1e-4

game_spec = TicTacToeGameSpec()

input_layer, output_layer, variables = create_network(game_spec.board_squares(), HIDDEN_NODES,
                                                      output_nodes=game_spec.outputs())

reward_placeholder = tf.placeholder("float", shape=(None,))
actual_move_placeholder = tf.placeholder("float", shape=(None, game_spec.board_squares()))

policy_gradient = tf.reduce_sum(tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer)
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(-policy_gradient)

current_historical_index = 0
historical_networks = []

mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
results = collections.deque(maxlen=PRINT_RESULTS_EVERY_X)
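# Illustrative sketch (hypothetical; the training loop itself is not shown here,
# and numpy is assumed to be imported as np): once MINI_BATCH_SIZE games have
# been played, the accumulated states, one-hot moves and rewards would be fed
# into the update defined above, e.g.:
#
#     session.run(train_step, feed_dict={
#         input_layer: np.array(mini_batch_board_states),
#         actual_move_placeholder: np.array(mini_batch_moves),
#         reward_placeholder: np.array(mini_batch_rewards)})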
def setUp(self):
    self._game_spec = TicTacToeGameSpec()