def test_tic_tac_toe(self):
    game_spec = TicTacToeGameSpec()

    create_model_func = functools.partial(create_network, game_spec.board_squares(), (100, 100, 100))

    variables, win_rate = train_policy_gradients(game_spec, create_model_func, None,
                                                 learn_rate=1e-4,
                                                 number_of_games=60000,
                                                 print_results_every=1000,
                                                 batch_size=100,
                                                 randomize_first_player=False)
    self.assertGreater(win_rate, 0.4)
import functools

from common.network_helpers import create_network
from games.tic_tac_toe import TicTacToeGameSpec
from techniques.train_policy_gradient import train_policy_gradients

BATCH_SIZE = 100  # every how many games to do a parameter update
LEARN_RATE = 1e-4
PRINT_RESULTS_EVERY_X = 100  # every how many games to print the results
NETWORK_FILE_PATH = 'current_network.p'  # path to save the network to
NUMBER_OF_GAMES_TO_RUN = 1000

# To play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec.
# Getting these to run well may require tuning the hyperparameters a bit.
game_spec = TicTacToeGameSpec()

create_network_func = functools.partial(create_network, game_spec.board_squares(), (100, 100, 100))

train_policy_gradients(game_spec, create_network_func, NETWORK_FILE_PATH,
                       number_of_games=NUMBER_OF_GAMES_TO_RUN,
                       batch_size=BATCH_SIZE,
                       learn_rate=LEARN_RATE,
                       print_results_every=PRINT_RESULTS_EVERY_X)


def second_player_move(board_state, side):
    """Read the second player's move from the console as a flat index and convert it to a board tuple."""
    return game_spec.flat_move_to_tuple(int(input("Next Move:")))
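# A minimal sketch of wiring second_player_move into an interactive game. Assumptions not shown in
# the excerpt above: the game spec exposes a play_game(plus_player_func, minus_player_func) method
# that alternates between the two move functions and returns +1, 0 or -1 for a first-player win,
# draw or loss. first_player_move below is a hypothetical stand-in for the trained network's move
# function; it just takes the first available move so the sketch stays self-contained.
def first_player_move(board_state, side):
    return next(iter(game_spec.available_moves(board_state)))

result = game_spec.play_game(first_player_move, second_player_move)
print("first player result:", result)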
import random

from games.tic_tac_toe import TicTacToeGameSpec
from network_helpers import create_network, load_network, save_network, \
    get_deterministic_network_move

HIDDEN_NODES_VALUE = (120, 100, 80, 60, 40)
HIDDEN_NODES_REINFORCEMENT = (100, 80, 60, 40)
BATCH_SIZE = 100  # every how many games to do a parameter update
LEARN_RATE = 1e-4
REINFORCEMENT_NETWORK_PATH = 'current_network.p'
VALUE_NETWORK_PATH = 'value_network.p'
TRAIN_SAMPLES = 10000
TEST_SAMPLES = 10000

# To play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec.
game_spec = TicTacToeGameSpec()

# random.randint needs integer bounds, so truncate the upper limit of the range
NUMBER_RANDOM_RANGE = (1, int(game_spec.board_squares() * 0.8))


# it would be good to have real board positions, but failing that just generate random ones
def generate_random_board_position():
    while True:
        board_state = game_spec.new_board()
        number_moves = random.randint(*NUMBER_RANDOM_RANGE)
        side = 1
        for _ in range(number_moves):
            board_state = game_spec.apply_move(board_state,
                                               random.choice(list(game_spec.available_moves(board_state))),
                                               side)
            if game_spec.has_winner(board_state) != 0:  # start again if we hit an already winning position
                continue
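# The excerpt above is cut off inside the move loop, and a bare `continue` would not actually
# restart the outer while-loop as the comment intends. A minimal sketch of how the function could
# finish, assuming positions that already have a winner are discarded and a non-terminal board is
# returned; the for/else structure and the side-flipping are assumptions, not the original code.
def generate_random_board_position_sketch():
    while True:
        board_state = game_spec.new_board()
        number_moves = random.randint(*NUMBER_RANDOM_RANGE)
        side = 1
        for _ in range(number_moves):
            board_state = game_spec.apply_move(board_state,
                                               random.choice(list(game_spec.available_moves(board_state))),
                                               side)
            if game_spec.has_winner(board_state) != 0:
                break  # hit an already-decided game, throw this position away
            side = -side
        else:
            return board_state  # applied every move without finding a winner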
import os

import tensorflow as tf

from common.network_helpers import create_network, load_network
from games.tic_tac_toe import TicTacToeGameSpec


def load_games():
    """Load a set of real tic-tac-toe games to train from.

    Returns:
    """
    raise Exception("If we had a database of tic-tac-toe games this would load them")


HIDDEN_NODES = (100, 80, 60, 40)  # number of hidden layer neurons
BATCH_SIZE = 100  # every how many games to do a parameter update
LEARN_RATE = 1e-4
NETWORK_FILE_PATH = 'current_network.p'

game_spec = TicTacToeGameSpec()

input_layer, output_layer, variables = create_network(game_spec.board_squares(), HIDDEN_NODES,
                                                      output_nodes=game_spec.outputs())

actual_move_placeholder = tf.placeholder("float", (None, game_spec.outputs()))
error = tf.reduce_sum(tf.square(actual_move_placeholder - output_layer))
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(error)

with tf.Session() as session:
    session.run(tf.initialize_all_variables())

    if os.path.isfile(NETWORK_FILE_PATH):
        print("loading existing network")
        load_network(session, variables, NETWORK_FILE_PATH)

    episode_number = 1
    positions_train, positions_test = load_games()
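# A minimal stand-in for the load_games() stub above, since no database of real games exists. It
# builds (flattened board, one-hot move) pairs from random play purely so the supervised training
# loop has data to run on; random moves are obviously not good supervision. Assumptions: moves are
# (row, column) tuples on a 3x3 board and the board state is a 3x3 nested structure that can be
# flattened row by row. The real script may represent positions differently.
import random

def load_games_from_random_play(number_of_games=1000, test_fraction=0.2):
    positions = []
    for _ in range(number_of_games):
        board_state = game_spec.new_board()
        side = 1
        while game_spec.has_winner(board_state) == 0 and list(game_spec.available_moves(board_state)):
            move = random.choice(list(game_spec.available_moves(board_state)))
            one_hot_move = [0.] * game_spec.outputs()
            one_hot_move[move[0] * 3 + move[1]] = 1.
            positions.append(([square for row in board_state for square in row], one_hot_move))
            board_state = game_spec.apply_move(board_state, move, side)
            side = -side
    split_at = int(len(positions) * (1 - test_fraction))
    return positions[:split_at], positions[split_at:]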
import random

from common.network_helpers import create_network, load_network, save_network, \
    get_deterministic_network_move
from games.tic_tac_toe import TicTacToeGameSpec

HIDDEN_NODES_VALUE = (100, 100, 100)
HIDDEN_NODES_REINFORCEMENT = (100, 100, 100)
BATCH_SIZE = 100  # every how many games to do a parameter update
LEARN_RATE = 1e-4
REINFORCEMENT_NETWORK_PATH = 'current_network.p'
VALUE_NETWORK_PATH = 'value_network.p'
TRAIN_SAMPLES = 10000
TEST_SAMPLES = 10000

# To play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec.
game_spec = TicTacToeGameSpec()

# random.randint needs integer bounds, so truncate the upper limit of the range
NUMBER_RANDOM_RANGE = (1, int(game_spec.board_squares() * 0.8))


# it would be good to have real board positions, but failing that just generate random ones
def generate_random_board_position():
    while True:
        board_state = game_spec.new_board()
        number_moves = random.randint(*NUMBER_RANDOM_RANGE)
        side = 1
        for _ in range(number_moves):
            board_state = game_spec.apply_move(board_state,
                                               random.choice(list(game_spec.available_moves(board_state))),
                                               side)
            if game_spec.has_winner(board_state) != 0:  # start again if we hit an already winning position
                continue
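# A rough sketch of how these pieces might fit together to train the value network: generate random
# positions and regress a network with HIDDEN_NODES_VALUE hidden layers onto target values for them.
# Assumptions: create_network can be asked for a single output node, generate_random_board_position
# returns a board once completed (see the sketch after the earlier excerpt), and the way target
# values are produced (e.g. by playing positions out or querying the reinforcement network) is not
# shown here. The real script may differ on all of these points.
import tensorflow as tf

value_input_layer, value_output_layer, value_variables = create_network(
    game_spec.board_squares(), HIDDEN_NODES_VALUE, output_nodes=1)

value_target_placeholder = tf.placeholder("float", (None, 1))
value_error = tf.reduce_sum(tf.square(value_target_placeholder - value_output_layer))
value_train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(value_error)

train_positions = [generate_random_board_position() for _ in range(TRAIN_SAMPLES)]
test_positions = [generate_random_board_position() for _ in range(TEST_SAMPLES)]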
""" This is the same as the policy_gradient.py network except that instead of playing against a random opponent. It plays against previous versions of itself. It is first created with the weights from the "current_network.p" file, if no file is found there random weights are used. It then creates a series of copies of itself and plays against them. After "SAVE_HISTORICAL_NETWORK_EVERY" games, it saves it's current weights into the weights of one of the historical networks. Over time the main network and the historical networks should improve. """ import collections import functools import os import random import numpy as np import tensorflow as tf from common.network_helpers import create_network, load_network, get_stochastic_network_move, \ save_network from games.tic_tac_toe import TicTacToeGameSpec from techniques.train_policy_gradient_historic import train_policy_gradients_vs_historic HIDDEN_NODES = (100, 100, 100) SAVE_HISTORICAL_NETWORK_EVERY = 10000 game_spec = TicTacToeGameSpec() create_network_func = functools.partial(create_network, game_spec.board_squares(), HIDDEN_NODES) train_policy_gradients_vs_historic(game_spec, create_network_func, 'train_vs_historical.p', save_historic_every=SAVE_HISTORICAL_NETWORK_EVERY)
a valid move, so initially it must learn the rules of the game.

I have trained this version with success on 3x3 tic-tac-toe until it reached a success rate in the
region of 75%. This may be as good as it can do, because 3x3 tic-tac-toe is a theoretical draw, so
the random opponent will often get lucky and force a draw.
"""
import functools

from common.network_helpers import create_network
from games.tic_tac_toe import TicTacToeGameSpec
from techniques.train_policy_gradient import train_policy_gradients

BATCH_SIZE = 100  # every how many games to do a parameter update
LEARN_RATE = 1e-4
PRINT_RESULTS_EVERY_X = 1000  # every how many games to print the results
NETWORK_FILE_PATH = None  # 'current_network.p' - path to save the network to
NUMBER_OF_GAMES_TO_RUN = 1000000

# To play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec.
# Getting these to run well may require tuning the hyperparameters a bit.
game_spec = TicTacToeGameSpec()

create_network_func = functools.partial(create_network, game_spec.board_squares(), (100, 100, 100))

train_policy_gradients(game_spec, create_network_func, NETWORK_FILE_PATH,
                       number_of_games=NUMBER_OF_GAMES_TO_RUN,
                       batch_size=BATCH_SIZE,
                       learn_rate=LEARN_RATE,
                       print_results_every=PRINT_RESULTS_EVERY_X)
import collections
import os

import tensorflow as tf

from common.network_helpers import create_network, load_network
from games.tic_tac_toe import TicTacToeGameSpec

HIDDEN_NODES = (100, 100, 100)  # hidden layer sizes (value taken from the sibling scripts above)
BATCH_SIZE = 100  # every how many games to do a parameter update
LEARN_RATE = 1e-4
PRINT_RESULTS_EVERY_X = 1000  # every how many games to print the results
NETWORK_FILE_PATH = 'current_network.p'
NUMBER_OF_GAMES_TO_RUN = 100000

# To play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec.
game_spec = TicTacToeGameSpec()

OUTPUT_NODES = game_spec.outputs()

reward_placeholder = tf.placeholder("float", shape=(None,))
actual_move_placeholder = tf.placeholder("float", shape=(None, OUTPUT_NODES))

input_layer, output_layer, variables = create_network(game_spec.board_squares(), HIDDEN_NODES,
                                                      output_nodes=OUTPUT_NODES)

# weight each chosen move by its reward and push the network towards moves with positive rewards
policy_gradient = tf.reduce_sum(tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer)
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(-policy_gradient)

with tf.Session() as session:
    session.run(tf.initialize_all_variables())

    if os.path.isfile(NETWORK_FILE_PATH):
        print("loading pre-existing network")
        load_network(session, variables, NETWORK_FILE_PATH)

    mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
    results = collections.deque(maxlen=PRINT_RESULTS_EVERY_X)
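    # --- hypothetical continuation of the loop above: a sketch, not the original code ---
    # Assumptions: moves are (row, column) tuples on a 3x3 board (flat index row * 3 + column), the
    # board state is a 3x3 nested structure that can be flattened row by row, and every move the
    # first player makes is credited with the final game result as its reward. The real script
    # samples moves from the network (e.g. via get_stochastic_network_move); random moves are used
    # here only to keep the sketch self-contained.
    import random

    for episode_number in range(1, NUMBER_OF_GAMES_TO_RUN + 1):
        board_state, side = game_spec.new_board(), 1
        episode_states, episode_moves = [], []

        # play one game, recording the first player's board states and one-hot moves
        while game_spec.has_winner(board_state) == 0 and list(game_spec.available_moves(board_state)):
            move = random.choice(list(game_spec.available_moves(board_state)))
            if side == 1:
                one_hot_move = [0.] * OUTPUT_NODES
                one_hot_move[move[0] * 3 + move[1]] = 1.
                episode_states.append([square for row in board_state for square in row])
                episode_moves.append(one_hot_move)
            board_state = game_spec.apply_move(board_state, move, side)
            side = -side

        result = game_spec.has_winner(board_state)
        results.append(result)
        mini_batch_board_states += episode_states
        mini_batch_moves += episode_moves
        mini_batch_rewards += [float(result)] * len(episode_moves)

        if episode_number % BATCH_SIZE == 0 and mini_batch_rewards:
            session.run(train_step, feed_dict={input_layer: mini_batch_board_states,
                                               reward_placeholder: mini_batch_rewards,
                                               actual_move_placeholder: mini_batch_moves})
            mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []

        if episode_number % PRINT_RESULTS_EVERY_X == 0:
            print("episode %s win rate %.2f" %
                  (episode_number, sum(1 for r in results if r > 0) / float(len(results))))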
import collections

import tensorflow as tf

from games.tic_tac_toe import TicTacToeGameSpec
from network_helpers import create_network, load_network, get_stochastic_network_move, \
    save_network

NUMBER_OF_HISTORICAL_COPIES_TO_KEEP = 8
NUMBER_OF_GAMES_TO_PLAY = 1000000
MINI_BATCH_SIZE = 100
SAVE_HISTORICAL_NETWORK_EVERY = 100000
STARTING_NETWORK_WEIGHTS = 'current_network.p'
BASE_HISTORICAL_NETWORK_PATH = 'historical_network_'
HIDDEN_NODES = (100, 80, 60, 40)
PRINT_RESULTS_EVERY_X = 500
LEARN_RATE = 1e-4

game_spec = TicTacToeGameSpec()

input_layer, output_layer, variables = create_network(game_spec.board_squares(), HIDDEN_NODES,
                                                      output_nodes=game_spec.outputs())

reward_placeholder = tf.placeholder("float", shape=(None,))
actual_move_placeholder = tf.placeholder("float", shape=(None, game_spec.board_squares()))

policy_gradient = tf.reduce_sum(tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer)
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(-policy_gradient)

current_historical_index = 0
historical_networks = []

mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
results = collections.deque(maxlen=PRINT_RESULTS_EVERY_X)

for _ in range(NUMBER_OF_HISTORICAL_COPIES_TO_KEEP):
    # mirror the shape of the main network for each historical opponent
    historical_input_layer, historical_output_layer, historical_variables = create_network(game_spec.board_squares(),
                                                                                            HIDDEN_NODES,
                                                                                            output_nodes=game_spec.outputs())
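    # --- hypothetical continuation (a sketch, not the original file) ---
    # Assumption: each freshly built copy is kept in historical_networks so the training loop can
    # later load snapshot weights into it and pick it as an opponent; how the snapshots are saved
    # and restored (e.g. via save_network/load_network and BASE_HISTORICAL_NETWORK_PATH) is not
    # shown in the excerpt above.
    historical_networks.append((historical_input_layer, historical_output_layer, historical_variables))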