import functools

from common.network_helpers import create_network
from games.tic_tac_toe import TicTacToeGameSpec
from techniques.train_policy_gradient import train_policy_gradients


# unittest.TestCase method: train against a random opponent, then check the win rate
def test_tic_tac_toe(self):
    game_spec = TicTacToeGameSpec()
    create_model_func = functools.partial(create_network, game_spec.board_squares(), (100, 100, 100,))
    variables, win_rate = train_policy_gradients(game_spec, create_model_func, None,
                                                 learn_rate=1e-4,
                                                 number_of_games=60000,
                                                 print_results_every=1000,
                                                 batch_size=100,
                                                 randomize_first_player=False)
    self.assertGreater(win_rate, 0.4)

Example #2
import functools

from common.network_helpers import create_network
from games.tic_tac_toe import TicTacToeGameSpec
from techniques.train_policy_gradient import train_policy_gradients

BATCH_SIZE = 100  # every how many games to do a parameter update?
LEARN_RATE = 1e-4
PRINT_RESULTS_EVERY_X = 100  # every how many games to print the results
NETWORK_FILE_PATH = 'current_network.p'  # path to save the network to
NUMBER_OF_GAMES_TO_RUN = 1000

# To play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec; getting these to
# run well may require tuning the hyperparameters a bit.
game_spec = TicTacToeGameSpec()

create_network_func = functools.partial(create_network,
                                        game_spec.board_squares(),
                                        (100, 100, 100))

train_policy_gradients(game_spec,
                       create_network_func,
                       NETWORK_FILE_PATH,
                       number_of_games=NUMBER_OF_GAMES_TO_RUN,
                       batch_size=BATCH_SIZE,
                       learn_rate=LEARN_RATE,
                       print_results_every=PRINT_RESULTS_EVERY_X)


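# asks the human opponent for their next move as a flat board index (0-8 on the 3x3 board) and converts it to the
# move tuple format used by game_spec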
def second_player_move(board_state, side):
    return game_spec.flat_move_to_tuple(int(input("Next Move:")))

Example #3
import random

from games.tic_tac_toe import TicTacToeGameSpec
from network_helpers import create_network, load_network, save_network, \
    get_deterministic_network_move

HIDDEN_NODES_VALUE = (120, 100, 80, 60, 40)
HIDDEN_NODES_REINFORCEMENT = (100, 80, 60, 40)
BATCH_SIZE = 100  # every how many games to do a parameter update?
LEARN_RATE = 1e-4
REINFORCEMENT_NETWORK_PATH = 'current_network.p'
VALUE_NETWORK_PATH = 'value_network.p'
TRAIN_SAMPLES = 10000
TEST_SAMPLES = 10000

# to play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec
game_spec = TicTacToeGameSpec()

NUMBER_RANDOM_RANGE = (1, int(game_spec.board_squares() * 0.8))  # random.randint needs integer bounds


# it would be good to have real board positions, but failing that just generate random ones
def generate_random_board_position():
    while True:
        board_state = game_spec.new_board()
        number_moves = random.randint(*NUMBER_RANDOM_RANGE)
        side = 1
        for _ in range(number_moves):
            board_state = game_spec.apply_move(board_state, random.choice(list(game_spec.available_moves(board_state))),
                                               side)
            if game_spec.has_winner(board_state) != 0:
                # start again if we hit an already winning position
                break
            side = -side
        else:
            return board_state

Example #4
def load_games():
    """Load a database of real tic-tac-toe games to train from.

    Returns:
        The training positions and the test positions.
    """
    raise Exception(
        "If we had a database of tic-tac-toe games this would load them")


HIDDEN_NODES = (100, 80, 60, 40)  # number of hidden layer neurons
BATCH_SIZE = 100  # every how many games to do a parameter update?
LEARN_RATE = 1e-4
NETWORK_FILE_PATH = 'current_network.p'
game_spec = TicTacToeGameSpec()

input_layer, output_layer, variables = create_network(
    game_spec.board_squares(), HIDDEN_NODES, output_nodes=game_spec.outputs())
actual_move_placeholder = tf.placeholder("float", (None, game_spec.outputs()))

error = tf.reduce_sum(tf.square(actual_move_placeholder - output_layer))
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(error)

with tf.Session() as session:
    session.run(tf.initialize_all_variables())

    if os.path.isfile(NETWORK_FILE_PATH):
        print("loading existing network")
        load_network(session, variables, NETWORK_FILE_PATH)

    episode_number = 1

    positions_train, positions_test = load_games()
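
# --- Illustration only (not part of the excerpt above): a minimal sketch of how a supervised pass over the loaded
# positions could feed the error/train_step ops defined earlier. It assumes each entry of positions_train is a
# (flat_board_vector, one_hot_move) pair, which is a guess about what load_games() would return; treat it as a shape
# reference for session.run, not as the repository's actual training loop.
def sketch_training_pass(session, positions, mini_batch_size=BATCH_SIZE):
    total_error = 0.0
    for start in range(0, len(positions) - mini_batch_size + 1, mini_batch_size):
        batch = positions[start:start + mini_batch_size]
        board_vectors = [board for board, move in batch]
        one_hot_moves = [move for board, move in batch]
        # one RMSProp step on the squared-error loss between predicted and actual moves
        batch_error, _ = session.run([error, train_step],
                                     feed_dict={input_layer: board_vectors,
                                                actual_move_placeholder: one_hot_moves})
        total_error += batch_error
    return total_error
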
Example #5
import random

from common.network_helpers import create_network, load_network, save_network, \
    get_deterministic_network_move
from games.tic_tac_toe import TicTacToeGameSpec

HIDDEN_NODES_VALUE = (100, 100, 100)
HIDDEN_NODES_REINFORCEMENT = (100, 100, 100)
BATCH_SIZE = 100  # every how many games to do a parameter update?
LEARN_RATE = 1e-4
REINFORCEMENT_NETWORK_PATH = 'current_network.p'
VALUE_NETWORK_PATH = 'value_network.p'
TRAIN_SAMPLES = 10000
TEST_SAMPLES = 10000

# to play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec
game_spec = TicTacToeGameSpec()

NUMBER_RANDOM_RANGE = (1, int(game_spec.board_squares() * 0.8))  # random.randint needs integer bounds


# it would be good to have real board positions, but failing that just generate random ones
def generate_random_board_position():
    while True:
        board_state = game_spec.new_board()
        number_moves = random.randint(*NUMBER_RANDOM_RANGE)
        side = 1
        for _ in range(number_moves):
            board_state = game_spec.apply_move(
                board_state,
                random.choice(list(game_spec.available_moves(board_state))),
                side)
            if game_spec.has_winner(board_state) != 0:
                # start again if we hit an already winning position
                break
            side = -side
        else:
            return board_state

Example #6
"""
This is the same as the policy_gradient.py network, except that instead of playing against a random opponent it plays
against previous versions of itself. It is first created with the weights from the "current_network.p" file; if no file
is found there, random weights are used. It then creates a series of copies of itself and plays against them.
After "SAVE_HISTORICAL_NETWORK_EVERY" games, it saves its current weights into the weights of one of the historical
networks. Over time the main network and the historical networks should improve.
"""
import collections
import functools
import os
import random

import numpy as np
import tensorflow as tf

from common.network_helpers import create_network, load_network, get_stochastic_network_move, \
    save_network
from games.tic_tac_toe import TicTacToeGameSpec
from techniques.train_policy_gradient_historic import train_policy_gradients_vs_historic

HIDDEN_NODES = (100, 100, 100)
SAVE_HISTORICAL_NETWORK_EVERY = 10000
game_spec = TicTacToeGameSpec()

create_network_func = functools.partial(create_network, game_spec.board_squares(), HIDDEN_NODES)

train_policy_gradients_vs_historic(game_spec, create_network_func,
                                   'train_vs_historical.p',
                                   save_historic_every=SAVE_HISTORICAL_NETWORK_EVERY)
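
# --- Illustration only: train_policy_gradients_vs_historic does the self-play loop for the script above, and its
# internals are not shown in this excerpt. The sketch below is a guess at the snapshot-rotation idea the docstring
# describes: keep a small pool of past weight snapshots, pick one as the opponent for each game, and overwrite one
# slot every save_every games. play_one_game and the slot count are assumptions (the 8 mirrors
# NUMBER_OF_HISTORICAL_COPIES_TO_KEEP in a later example in this list), not the library's API.
import copy
import random


def sketch_self_play_schedule(current_weights, play_one_game, number_of_games, save_every, historical_slots=8):
    historical_pool = [copy.deepcopy(current_weights)]  # start by playing against a copy of ourselves
    next_slot = 0
    for game_number in range(1, number_of_games + 1):
        opponent_weights = random.choice(historical_pool)
        # play_one_game is assumed to update current_weights in place via the policy gradient update
        play_one_game(current_weights, opponent_weights)
        if game_number % save_every == 0:
            snapshot = copy.deepcopy(current_weights)
            if len(historical_pool) < historical_slots:
                historical_pool.append(snapshot)
            else:
                historical_pool[next_slot] = snapshot  # recycle the slots in a fixed rotation
                next_slot = (next_slot + 1) % historical_slots
    return historical_pool
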
Example #7
a valid move, so initially it must learn the rules of the game.

I have trained this version with success at 3x3 tic-tac-toe until it has a success rate in the region of 75%. This may
be as good as it can do, because 3x3 tic-tac-toe is a theoretical draw, so the random opponent will often get lucky and
force a draw.
"""
import functools

from common.network_helpers import create_network
from games.tic_tac_toe import TicTacToeGameSpec
from techniques.train_policy_gradient import train_policy_gradients

BATCH_SIZE = 100  # every how many games to do a parameter update?
LEARN_RATE = 1e-4
PRINT_RESULTS_EVERY_X = 1000  # every how many games to print the results
NETWORK_FILE_PATH = None  # path to save the network to, e.g. 'current_network.p'
NUMBER_OF_GAMES_TO_RUN = 1000000


# To play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec; getting these to
# run well may require tuning the hyperparameters a bit.
game_spec = TicTacToeGameSpec()

create_network_func = functools.partial(create_network, game_spec.board_squares(), (100, 100, 100))

train_policy_gradients(game_spec, create_network_func, NETWORK_FILE_PATH,
                       number_of_games=NUMBER_OF_GAMES_TO_RUN,
                       batch_size=BATCH_SIZE,
                       learn_rate=LEARN_RATE,
                       print_results_every=PRINT_RESULTS_EVERY_X)
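
# --- Illustration only: the docstring above says the agent is not told which moves are valid and has to learn the
# rules. One plausible way to run a single move selection under that constraint is sketched below; it is an assumption
# about how train_policy_gradients might treat illegal choices (score them as an immediate loss), not code taken from
# the library. move_probabilities is assumed to be the network's softmax output over all board squares.
import numpy as np


def sketch_pick_move(board_state, move_probabilities):
    flat_move = int(np.random.choice(len(move_probabilities), p=move_probabilities))
    move = game_spec.flat_move_to_tuple(flat_move)
    if move not in list(game_spec.available_moves(board_state)):
        return None, -1.0  # illegal move: nothing is played and the learner receives a losing reward
    return move, 0.0  # legal move: play it, the real reward comes later from the game result
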
Example #8
BATCH_SIZE = 100  # every how many games to do a parameter update?
LEARN_RATE = 1e-4
PRINT_RESULTS_EVERY_X = 1000  # every how many games to print the results
NETWORK_FILE_PATH = 'current_network.p'
NUMBER_OF_GAMES_TO_RUN = 100000

# to play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec
game_spec = TicTacToeGameSpec()

OUTPUT_NODES = game_spec.outputs()

reward_placeholder = tf.placeholder("float", shape=(None, ))
actual_move_placeholder = tf.placeholder("float", shape=(None, OUTPUT_NODES))

input_layer, output_layer, variables = create_network(
    game_spec.board_squares(), HIDDEN_NODES, output_nodes=OUTPUT_NODES)

policy_gradient = tf.reduce_sum(
    tf.reshape(reward_placeholder,
               (-1, 1)) * actual_move_placeholder * output_layer)
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(-policy_gradient)

with tf.Session() as session:
    session.run(tf.initialize_all_variables())

    if os.path.isfile(NETWORK_FILE_PATH):
        print("loading pre-existing network")
        load_network(session, variables, NETWORK_FILE_PATH)

    mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
    results = collections.deque(maxlen=PRINT_RESULTS_EVERY_X)
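
# --- Illustration only: a NumPy re-statement of the policy_gradient objective built above, with made-up numbers. The
# one-hot actual_move_placeholder rows pick out the probability the network assigned to the move that was actually
# played, each move is weighted by the reward of the game it came from (+1 for a win, -1 for a loss is an assumption
# about how finished games are scored), and minimizing the negative of the sum pushes probability towards moves from
# won games.
import numpy as np

rewards = np.array([1.0, -1.0])                   # reward of the game each move came from
one_hot_moves = np.array([[0, 1, 0],              # which of 3 toy moves was actually played
                          [1, 0, 0]])
network_output = np.array([[0.2, 0.5, 0.3],       # probabilities the network assigned to each move
                           [0.6, 0.3, 0.1]])

surrogate = np.sum(rewards.reshape(-1, 1) * one_hot_moves * network_output)
print(surrogate)  # 1.0 * 0.5 + (-1.0) * 0.6 = -0.1; gradient ascent raises the 0.5 and lowers the 0.6
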
Example #9
import collections

import tensorflow as tf

from games.tic_tac_toe import TicTacToeGameSpec
from network_helpers import create_network, load_network, get_stochastic_network_move, \
    save_network

NUMBER_OF_HISTORICAL_COPIES_TO_KEEP = 8
NUMBER_OF_GAMES_TO_PLAY = 1000000
MINI_BATCH_SIZE = 100
SAVE_HISTORICAL_NETWORK_EVERY = 100000
STARTING_NETWORK_WEIGHTS = 'current_network.p'
BASE_HISTORICAL_NETWORK_PATH = 'historical_network_'
HIDDEN_NODES = (100, 80, 60, 40)
PRINT_RESULTS_EVERY_X = 500
LEARN_RATE = 1e-4
game_spec = TicTacToeGameSpec()

input_layer, output_layer, variables = create_network(game_spec.board_squares(), HIDDEN_NODES,
                                                      output_nodes=game_spec.outputs())

reward_placeholder = tf.placeholder("float", shape=(None,))
actual_move_placeholder = tf.placeholder("float", shape=(None, game_spec.board_squares()))
policy_gradient = tf.reduce_sum(tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer)
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(-policy_gradient)

current_historical_index = 0
historical_networks = []

mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
results = collections.deque(maxlen=PRINT_RESULTS_EVERY_X)

for _ in range(NUMBER_OF_HISTORICAL_COPIES_TO_KEEP):
    historical_input_layer, historical_output_layer, historical_variables = create_network(game_spec.board_squares(),