Example #1
        side = 1
        for _ in range(number_moves):
            board_state = game_spec.apply_move(board_state, random.choice(list(game_spec.available_moves(board_state))),
                                               side)
            if game_spec.has_winner(board_state) != 0:
                # start again if we hit an already winning position
                continue

            side = -side
        return board_state
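The fragment above begins mid-function. A plausible reconstruction of the surrounding helper, assuming the game spec also exposes a new_board() method and that an outer loop restarts whenever a random playout accidentally finishes the game (the function name and the restart loop are assumptions, not from the snippet):

import random

def random_board_position(game_spec, number_moves):
    # Play `number_moves` random moves from an empty board; if the playout
    # produces a finished game, throw it away and start over so the
    # returned position is always still in progress.
    while True:
        board_state = game_spec.new_board()
        side = 1
        finished_early = False
        for _ in range(number_moves):
            move = random.choice(list(game_spec.available_moves(board_state)))
            board_state = game_spec.apply_move(board_state, move, side)
            if game_spec.has_winner(board_state) != 0:
                finished_early = True
                break
            side = -side
        if not finished_early:
            return board_state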


reinforcement_input_layer, reinforcement_output_layer, reinforcement_variables = create_network(
    game_spec.board_squares(),
    HIDDEN_NODES_REINFORCEMENT,
    game_spec.outputs())

value_input_layer, value_output_layer, value_variables = create_network(game_spec.board_squares(), HIDDEN_NODES_VALUE,
                                                                        output_nodes=1, output_softmax=False)

target_placeholder = tf.placeholder("float", (None, 1))
error = tf.reduce_sum(tf.square(target_placeholder - value_output_layer))

train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(error)

with tf.Session() as session:
    session.run(tf.initialize_all_variables())

    load_network(session, reinforcement_variables, REINFORCEMENT_NETWORK_PATH)

    if os.path.isfile(VALUE_NETWORK_PATH):
        print("loading existing value network")
        load_network(session, value_variables, VALUE_NETWORK_PATH)
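The snippet cuts off after restoring the networks; a minimal sketch of how the value-network train_step defined above would then be driven (make_training_batch and the batch size of 100 are assumptions standing in for whatever produces flattened board states and target values in [-1, 1]; save_network comes from network_helpers):

    for _ in range(10000):
        # hypothetical helper: returns a list of flattened board states and a
        # matching list of single-element target values in [-1, 1]
        batch_states, batch_targets = make_training_batch(100)
        session.run(train_step,
                    feed_dict={value_input_layer: batch_states,
                               target_placeholder: batch_targets})

    save_network(session, value_variables, VALUE_NETWORK_PATH)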
Example #2

def load_games():
    """Load historical tic-tac-toe games for supervised training.

    Returns:

    """
    raise Exception(
        "If we had a database of tic-tac-toe games this would load them")


HIDDEN_NODES = (100, 80, 60, 40)  # number of neurons in each hidden layer
BATCH_SIZE = 100  # how many games between each parameter update
LEARN_RATE = 1e-4
NETWORK_FILE_PATH = 'current_network.p'
game_spec = TicTacToeGameSpec()

input_layer, output_layer, variables = create_network(
    game_spec.board_squares(), HIDDEN_NODES, output_nodes=game_spec.outputs())
actual_move_placeholder = tf.placeholder("float", (None, game_spec.outputs()))

error = tf.reduce_sum(tf.square(actual_move_placeholder - output_layer))
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(error)

with tf.Session() as session:
    session.run(tf.initialize_all_variables())

    if os.path.isfile(NETWORK_FILE_PATH):
        print("loading existing network")
        load_network(session, variables, NETWORK_FILE_PATH)

    episode_number = 1

    positions_train, positions_test = load_games()
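From here the script would fit the network to the loaded positions; a rough sketch of the training loop, assuming each item in positions_train is a (board_state, move) pair with the move already one-hot encoded over game_spec.outputs(), and that numpy and save_network are available (both assumptions, since the snippet's imports are not shown):

    import numpy as np

    for epoch in range(100):  # arbitrary number of passes over the data
        np.random.shuffle(positions_train)
        for start in range(0, len(positions_train), BATCH_SIZE):
            batch = positions_train[start:start + BATCH_SIZE]
            boards = [np.ravel(board) for board, _ in batch]
            moves = [move for _, move in batch]
            session.run(train_step,
                        feed_dict={input_layer: boards,
                                   actual_move_placeholder: moves})
        episode_number += 1
        save_network(session, variables, NETWORK_FILE_PATH)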
Example #3
import tensorflow as tf

from games.tic_tac_toe import TicTacToeGameSpec
from network_helpers import create_network, load_network, get_stochastic_network_move, save_network

HIDDEN_NODES = (100, 80, 60, 40)
BATCH_SIZE = 100  # how many games between each parameter update
LEARN_RATE = 1e-4
PRINT_RESULTS_EVERY_X = 1000  # how many games between printing results
NETWORK_FILE_PATH = 'current_network.p'
NUMBER_OF_GAMES_TO_RUN = 100000

# to play a different game change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec
game_spec = TicTacToeGameSpec()

OUTPUT_NODES = game_spec.outputs()

reward_placeholder = tf.placeholder("float", shape=(None, ))
actual_move_placeholder = tf.placeholder("float", shape=(None, OUTPUT_NODES))

input_layer, output_layer, variables = create_network(
    game_spec.board_squares(), HIDDEN_NODES, output_nodes=OUTPUT_NODES)

policy_gradient = tf.reduce_sum(
    tf.reshape(reward_placeholder,
               (-1, 1)) * actual_move_placeholder * output_layer)
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(-policy_gradient)

with tf.Session() as session:
    session.run(tf.initialize_all_variables())
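The rest of the full script is the self-play loop; in essence the update works as sketched below (play_one_game is a hypothetical helper, and sharing the final result of +1/0/-1 across every move of that game is an assumption about the surrounding code):

    mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []

    for episode in range(NUMBER_OF_GAMES_TO_RUN):
        # hypothetical helper: plays one full game, choosing our moves with
        # get_stochastic_network_move, and returns the flattened board states
        # seen, the one-hot moves chosen and the final result for our side
        board_states, moves, result = play_one_game(session, input_layer, output_layer)

        mini_batch_board_states += board_states
        mini_batch_moves += moves
        mini_batch_rewards += [result] * len(moves)

        if (episode + 1) % BATCH_SIZE == 0:
            session.run(train_step,
                        feed_dict={input_layer: mini_batch_board_states,
                                   reward_placeholder: mini_batch_rewards,
                                   actual_move_placeholder: mini_batch_moves})
            mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []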
Example #4
        side = 1
        for _ in range(number_moves):
            board_state = game_spec.apply_move(
                board_state,
                random.choice(list(game_spec.available_moves(board_state))),
                side)
            if game_spec.has_winner(board_state) != 0:
                # start again if we hit an already winning position
                continue

            side = -side
        return board_state


reinforcement_input_layer, reinforcement_output_layer, reinforcement_variables = create_network(
    game_spec.board_squares(), HIDDEN_NODES_REINFORCEMENT, game_spec.outputs())

value_input_layer, value_output_layer, value_variables = create_network(
    game_spec.board_squares(),
    HIDDEN_NODES_VALUE,
    output_nodes=1,
    output_softmax=False)

target_placeholder = tf.placeholder("float", (None, 1))
error = tf.reduce_sum(tf.square(target_placeholder - value_output_layer))

train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(error)

with tf.Session() as session:
    session.run(tf.initialize_all_variables())
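Once the value network has been trained, it can be queried directly; a small sketch of estimating the value of a single position, assuming the board is a nested tuple of rows as in the tic-tac-toe spec and that the spec exposes new_board():

    board_state = game_spec.new_board()
    flat_board = [square for row in board_state for square in row]
    # value_output_layer has shape (batch, 1), so feed a batch of one position
    estimated_value = session.run(value_output_layer,
                                  feed_dict={value_input_layer: [flat_board]})
    print("estimated value of the starting position:", estimated_value[0][0])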
Example #5
import collections

import tensorflow as tf

from games.tic_tac_toe import TicTacToeGameSpec
from network_helpers import create_network, load_network, get_stochastic_network_move, \
    save_network

NUMBER_OF_HISTORICAL_COPIES_TO_KEEP = 8
NUMBER_OF_GAMES_TO_PLAY = 1000000
MINI_BATCH_SIZE = 100
SAVE_HISTORICAL_NETWORK_EVERY = 100000
STARTING_NETWORK_WEIGHTS = 'current_network.p'
BASE_HISTORICAL_NETWORK_PATH = 'historical_network_'
HIDDEN_NODES = (100, 80, 60, 40)
PRINT_RESULTS_EVERY_X = 500
LEARN_RATE = 1e-4
game_spec = TicTacToeGameSpec()

input_layer, output_layer, variables = create_network(game_spec.board_squares(), HIDDEN_NODES,
                                                      output_nodes=game_spec.outputs())

reward_placeholder = tf.placeholder("float", shape=(None,))
actual_move_placeholder = tf.placeholder("float", shape=(None, game_spec.board_squares()))
policy_gradient = tf.reduce_sum(tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer)
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(-policy_gradient)

current_historical_index = 0
historical_networks = []

mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
results = collections.deque(maxlen=PRINT_RESULTS_EVERY_X)

for _ in range(NUMBER_OF_HISTORICAL_COPIES_TO_KEEP):
    historical_input_layer, historical_output_layer, historical_variables = create_network(game_spec.board_squares(),
                                                                                           HIDDEN_NODES)
    # keep a reference to each copy so its weights can be overwritten later
    historical_networks.append((historical_input_layer, historical_output_layer, historical_variables))
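Each historical copy is periodically refreshed from the live network; a minimal sketch of that copy step using tf.assign, to be run inside an active session (the function name is an assumption):

def save_historical_network(session, historical_index):
    # overwrite one historical copy with the live network's current weights
    _, _, historical_variables = historical_networks[historical_index]
    for live_var, hist_var in zip(variables, historical_variables):
        session.run(tf.assign(hist_var, live_var))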