def generate_random_position(number_moves):
    # Play number_moves random moves from an empty board. The for loop's
    # else clause only runs when no break occurred, i.e. when every move
    # was applied without anyone winning.
    while True:
        board_state = game_spec.new_board()
        side = 1
        for _ in range(number_moves):
            board_state = game_spec.apply_move(
                board_state,
                random.choice(list(game_spec.available_moves(board_state))),
                side)
            if game_spec.has_winner(board_state) != 0:
                # start again if we hit an already winning position
                break
            side = -side
        else:
            return board_state


reinforcement_input_layer, reinforcement_output_layer, reinforcement_variables = create_network(
    game_spec.board_squares(),
    HIDDEN_NODES_REINFORCEMENT,
    game_spec.outputs())

value_input_layer, value_output_layer, value_variables = create_network(
    game_spec.board_squares(),
    HIDDEN_NODES_VALUE,
    output_nodes=1,
    output_softmax=False)

target_placeholder = tf.placeholder("float", (None, 1))
error = tf.reduce_sum(tf.square(target_placeholder - value_output_layer))

train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(error)

with tf.Session() as session:
    session.run(tf.initialize_all_variables())

    load_network(session, reinforcement_variables, REINFORCEMENT_NETWORK_PATH)

    if os.path.isfile(VALUE_NETWORK_PATH):
        print("loading existing value network")
        load_network(session, value_variables, VALUE_NETWORK_PATH)
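# Not part of the listing above: a minimal sketch of how the value network
# could then be trained, shown at top level but belonging inside the
# `with tf.Session() as session:` block. rollout_value is a hypothetical
# stand-in that finishes games with random moves; the script proper would
# play positions out with the loaded reinforcement network instead.
import random

import numpy as np


def rollout_value(board_state, side=1):
    # finish the game with random moves and return the result for player
    # one (+1 win, -1 loss, 0 draw); starting from side 1 is a
    # simplification, as the real script would track whose turn it is
    while game_spec.has_winner(board_state) == 0:
        moves = list(game_spec.available_moves(board_state))
        if not moves:
            return 0  # board full with no winner: a draw
        board_state = game_spec.apply_move(board_state, random.choice(moves), side)
        side = -side
    return game_spec.has_winner(board_state)


for _ in range(1000):
    boards = [generate_random_position(random.randint(1, 6)) for _ in range(100)]
    targets = [[rollout_value(board)] for board in boards]
    # regress the value output towards the rolled-out game results
    session.run(train_step, feed_dict={
        value_input_layer: np.array(boards, dtype=np.float32).reshape(len(boards), -1),
        target_placeholder: np.array(targets, dtype=np.float32)})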
    Returns:
    """
    raise Exception("If we had a database of tic-tac-toe games this would load them")


HIDDEN_NODES = (100, 80, 60, 40)  # number of hidden layer neurons
BATCH_SIZE = 100  # every how many games to do a parameter update?
LEARN_RATE = 1e-4
NETWORK_FILE_PATH = 'current_network.p'

game_spec = TicTacToeGameSpec()

input_layer, output_layer, variables = create_network(
    game_spec.board_squares(), HIDDEN_NODES, output_nodes=game_spec.outputs())

actual_move_placeholder = tf.placeholder("float", (None, game_spec.outputs()))
error = tf.reduce_sum(tf.square(actual_move_placeholder - output_layer))

train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(error)

with tf.Session() as session:
    session.run(tf.initialize_all_variables())

    if os.path.isfile(NETWORK_FILE_PATH):
        print("loading existing network")
        load_network(session, variables, NETWORK_FILE_PATH)

    episode_number = 1

    positions_train, positions_test = load_games()
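# Not part of the listing: a sketch of the supervised loop that could follow,
# inside the `with tf.Session() as session:` block. Since load_games() is only
# a stub, the data shape is an assumption: each item is taken to be a
# (board_state, one_hot_move) pair.
import random

import numpy as np

while episode_number <= 10000:
    batch = random.sample(positions_train, BATCH_SIZE)
    boards = np.array([np.ravel(board) for board, move in batch], dtype=np.float32)
    moves = np.array([move for board, move in batch], dtype=np.float32)

    # squared-error regression of the network's move distribution onto the
    # recorded move, using the `error` op defined above
    session.run(train_step, feed_dict={input_layer: boards,
                                       actual_move_placeholder: moves})
    episode_number += 1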
import tensorflow as tf

from games.tic_tac_toe import TicTacToeGameSpec
from network_helpers import create_network, load_network, get_stochastic_network_move, save_network

HIDDEN_NODES = (100, 80, 60, 40)
BATCH_SIZE = 100  # every how many games to do a parameter update?
LEARN_RATE = 1e-4
PRINT_RESULTS_EVERY_X = 1000  # every how many games to print the results
NETWORK_FILE_PATH = 'current_network.p'
NUMBER_OF_GAMES_TO_RUN = 100000

# to play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec
game_spec = TicTacToeGameSpec()

OUTPUT_NODES = game_spec.outputs()

reward_placeholder = tf.placeholder("float", shape=(None,))
actual_move_placeholder = tf.placeholder("float", shape=(None, OUTPUT_NODES))

input_layer, output_layer, variables = create_network(
    game_spec.board_squares(), HIDDEN_NODES, output_nodes=OUTPUT_NODES)

policy_gradient = tf.reduce_sum(
    tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer)
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(-policy_gradient)

with tf.Session() as session:
    session.run(tf.initialize_all_variables())
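# Not part of the listing: a sketch of one self-play episode against a random
# opponent and the policy-gradient update that follows, belonging inside the
# `with tf.Session() as session:` block. It assumes get_stochastic_network_move
# returns a one-hot numpy array and that valid_only=True restricts sampling to
# legal moves.
import random

import numpy as np

board_state = game_spec.new_board()
side = 1
episode_boards, episode_moves = [], []

while True:
    if side == 1:
        # the network plays side 1; record the state and the sampled move
        move = get_stochastic_network_move(session, input_layer, output_layer,
                                           board_state, side,
                                           valid_only=True, game_spec=game_spec)
        episode_boards.append(np.ravel(board_state))
        episode_moves.append(move)
        board_state = game_spec.apply_move(
            board_state, game_spec.flat_move_to_tuple(int(np.argmax(move))), side)
    else:
        board_state = game_spec.apply_move(
            board_state,
            random.choice(list(game_spec.available_moves(board_state))),
            side)

    winner = game_spec.has_winner(board_state)
    if winner != 0 or not list(game_spec.available_moves(board_state)):
        break
    side = -side

# every move in the episode is credited with the final result, so the update
# raises the probability of moves from won games and lowers it for lost ones
session.run(train_step, feed_dict={
    input_layer: np.array(episode_boards, dtype=np.float32),
    reward_placeholder: np.full(len(episode_moves), winner, dtype=np.float32),
    actual_move_placeholder: np.array(episode_moves, dtype=np.float32)})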
import collections

import tensorflow as tf

from games.tic_tac_toe import TicTacToeGameSpec
from network_helpers import create_network, load_network, get_stochastic_network_move, \
    save_network

NUMBER_OF_HISTORICAL_COPIES_TO_KEEP = 8
NUMBER_OF_GAMES_TO_PLAY = 1000000
MINI_BATCH_SIZE = 100
SAVE_HISTORICAL_NETWORK_EVERY = 100000
STARTING_NETWORK_WEIGHTS = 'current_network.p'
BASE_HISTORICAL_NETWORK_PATH = 'historical_network_'
HIDDEN_NODES = (100, 80, 60, 40)
PRINT_RESULTS_EVERY_X = 500
LEARN_RATE = 1e-4

game_spec = TicTacToeGameSpec()

input_layer, output_layer, variables = create_network(
    game_spec.board_squares(), HIDDEN_NODES, output_nodes=game_spec.outputs())

reward_placeholder = tf.placeholder("float", shape=(None,))
actual_move_placeholder = tf.placeholder("float", shape=(None, game_spec.board_squares()))

policy_gradient = tf.reduce_sum(
    tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer)
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(-policy_gradient)

current_historical_index = 0
historical_networks = []

mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
results = collections.deque(maxlen=PRINT_RESULTS_EVERY_X)

for _ in range(NUMBER_OF_HISTORICAL_COPIES_TO_KEEP):
    historical_input_layer, historical_output_layer, historical_variables = create_network(
        game_spec.board_squares(), HIDDEN_NODES)
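# Not part of the listing: a sketch of taking a historical snapshot, assuming
# the loop above goes on to append each (input, output, variables) triple to
# historical_networks. save_historical_network is a hypothetical helper name;
# save_network and load_network are the helpers imported above.
def save_historical_network(session, index):
    # write the live network's weights to disk, then read them back into
    # the historical slot so future games can be played against this copy
    path = BASE_HISTORICAL_NETWORK_PATH + str(index) + '.p'
    save_network(session, variables, path)
    load_network(session, historical_networks[index][2], path)

# the training loop would call this every SAVE_HISTORICAL_NETWORK_EVERY games,
# advancing current_historical_index modulo NUMBER_OF_HISTORICAL_COPIES_TO_KEEP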