def test_tic_tac_toe(self):
    game_spec = TicTacToeGameSpec()
    create_model_func = functools.partial(create_network, game_spec.board_squares(), (100, 100, 100))
    variables, win_rate = train_policy_gradients(game_spec, create_model_func, None,
                                                 learn_rate=1e-4,
                                                 number_of_games=60000,
                                                 print_results_every=1000,
                                                 batch_size=100,
                                                 randomize_first_player=False)
    self.assertGreater(win_rate, 0.4)
class TestCreatePositionsSet(TestCase):
    def setUp(self):
        self._game_spec = TicTacToeGameSpec()

    def test_create_positions(self):
        number_of_positions = 100
        positions = create_positions_set(
            self._game_spec, number_of_positions,
            self._game_spec.get_random_player_func())

        self.assertGreater(len(positions), number_of_positions - 1)
"""
import functools

from common.network_helpers import create_network
from games.tic_tac_toe import TicTacToeGameSpec
from techniques.train_policy_gradient import train_policy_gradients

BATCH_SIZE = 100  # number of games between parameter updates
LEARN_RATE = 1e-4
PRINT_RESULTS_EVERY_X = 100  # number of games between printing results
NETWORK_FILE_PATH = 'current_network.p'  # path to save the network to
NUMBER_OF_GAMES_TO_RUN = 1000

# to play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec; getting these to
# run well may require tuning the hyperparameters a bit
game_spec = TicTacToeGameSpec()

create_network_func = functools.partial(create_network,
                                        game_spec.board_squares(),
                                        (100, 100, 100))

train_policy_gradients(game_spec,
                       create_network_func,
                       NETWORK_FILE_PATH,
                       number_of_games=NUMBER_OF_GAMES_TO_RUN,
                       batch_size=BATCH_SIZE,
                       learn_rate=LEARN_RATE,
                       print_results_every=PRINT_RESULTS_EVERY_X)
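# As the comment above suggests, switching games only means constructing a different spec.
# A minimal sketch (the TicTacToeXGameSpec import path and constructor arguments here are
# assumptions for illustration, not taken from this file):
#
# from games.tic_tac_toe_x import TicTacToeXGameSpec
# game_spec = TicTacToeXGameSpec(5, 4)  # e.g. a 5x5 board needing 4 in a row to win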


def second_player_move(board_state, side):
Example #4
                    first_unvisited_node = False

            current_side = -current_side

            result = game_spec.has_winner(current_board_state)

        for path_board_state, path_side in rollout_path:
            state_samples[path_board_state] += 1.
            result *= path_side
            # normalize the result to be between 0 and 1; before this it is between -1 and 1
            result /= 2.
            result += .5
            state_results[path_board_state] += result

    move_states = {move: game_spec.apply_move(board_state, move, side) for move in game_spec.available_moves(board_state)}

    move = max(move_states, key=lambda x: state_results[move_states[x]] / state_samples[move_states[x]])

    return state_results[move_states[move]] / state_samples[move_states[move]], move
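# Note (an assumption, since the start of this function is not shown above): state_results and
# state_samples are per-position running totals, typically created as
#
# state_results = collections.defaultdict(float)  # sum of rollout results per position, scaled to [0, 1]
# state_samples = collections.defaultdict(float)  # number of rollouts that visited each position
#
# so the max() above picks the move whose resulting position has the best average rollout result
# for the side to move.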


if __name__ == '__main__':
    from games.tic_tac_toe import TicTacToeGameSpec

    sample_board_state = ((1, 0, -1),
                          (1, 0, 0),
                          (0, -1, 0))

    print(monte_carlo_tree_search_uct(TicTacToeGameSpec(), sample_board_state, -1, 10000))
Example #5
from games.tic_tac_toe import TicTacToeGameSpec
from network_helpers import create_network, load_network, save_network, \
    get_deterministic_network_move

HIDDEN_NODES_VALUE = (120, 100, 80, 60, 40)
HIDDEN_NODES_REINFORCEMENT = (100, 80, 60, 40)
BATCH_SIZE = 100  # number of games between parameter updates
LEARN_RATE = 1e-4
REINFORCEMENT_NETWORK_PATH = 'current_network.p'
VALUE_NETWORK_PATH = 'value_netowrk.p'
TRAIN_SAMPLES = 10000
TEST_SAMPLES = 10000

# to play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec
game_spec = TicTacToeGameSpec()

NUMBER_RANDOM_RANGE = (1, int(game_spec.board_squares() * 0.8))  # random.randint needs integer bounds


# it would be good to have real board positions, but failing that just generate random ones
def generate_random_board_position():
    while True:
        board_state = game_spec.new_board()
        number_moves = random.randint(*NUMBER_RANDOM_RANGE)
        side = 1
        for _ in range(number_moves):
            board_state = game_spec.apply_move(board_state, random.choice(list(game_spec.available_moves(board_state))),
                                               side)
            if game_spec.has_winner(board_state) != 0:
                # start again if we hit an already winning position
def load_games():
    """If we had a database of games this would load and return it...

    Returns:

    """
    raise Exception(
        "If we had a database of tic-tac-toe games this would load them")


HIDDEN_NODES = (100, 80, 60, 40)  # number of hidden layer neurons
BATCH_SIZE = 100  # number of games between parameter updates
LEARN_RATE = 1e-4
NETWORK_FILE_PATH = 'current_network.p'
game_spec = TicTacToeGameSpec()

input_layer, output_layer, variables = create_network(
    game_spec.board_squares(), HIDDEN_NODES, output_nodes=game_spec.outputs())
actual_move_placeholder = tf.placeholder("float", (None, game_spec.outputs()))

error = tf.reduce_sum(tf.square(actual_move_placeholder - output_layer))
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(error)

with tf.Session() as session:
    session.run(tf.initialize_all_variables())

    if os.path.isfile(NETWORK_FILE_PATH):
        print("loading existing network")
        load_network(session, variables, NETWORK_FILE_PATH)
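
    # A hedged sketch of the supervised training step this snippet builds up to
    # (batch_board_states and batch_target_moves are illustrative names, not
    # variables defined in this file):
    #
    # batch_error, _ = session.run([error, train_step],
    #                              feed_dict={input_layer: batch_board_states,
    #                                         actual_move_placeholder: batch_target_moves})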
Example #7
from common.network_helpers import create_network, load_network, save_network, \
    get_deterministic_network_move
from games.tic_tac_toe import TicTacToeGameSpec

HIDDEN_NODES_VALUE = (100, 100, 100)
HIDDEN_NODES_REINFORCEMENT = (100, 100, 100)
BATCH_SIZE = 100  # number of games between parameter updates
LEARN_RATE = 1e-4
REINFORCEMENT_NETWORK_PATH = 'current_network.p'
VALUE_NETWORK_PATH = 'value_netowrk.p'
TRAIN_SAMPLES = 10000
TEST_SAMPLES = 10000

# to play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec
game_spec = TicTacToeGameSpec()

NUMBER_RANDOM_RANGE = (1, int(game_spec.board_squares() * 0.8))  # random.randint needs integer bounds


# it would be good to have real board positions, but failing that just generate random ones
def generate_random_board_position():
    while True:
        board_state = game_spec.new_board()
        number_moves = random.randint(*NUMBER_RANDOM_RANGE)
        side = 1
        for _ in range(number_moves):
            board_state = game_spec.apply_move(
                board_state,
                random.choice(list(game_spec.available_moves(board_state))),
                side)
"""
This is the same as the policy_gradient.py network, except that instead of playing against a random opponent it plays
against previous versions of itself. It is first created with the weights from the "current_network.p" file; if no file
is found there, random weights are used. It then creates a series of copies of itself and plays against them.
After "SAVE_HISTORICAL_NETWORK_EVERY" games, it saves its current weights into the weights of one of the historical
networks. Over time the main network and the historical networks should improve.
"""
import collections
import functools
import os
import random

import numpy as np
import tensorflow as tf

from common.network_helpers import create_network, load_network, get_stochastic_network_move, \
    save_network
from games.tic_tac_toe import TicTacToeGameSpec
from techniques.train_policy_gradient_historic import train_policy_gradients_vs_historic

HIDDEN_NODES = (100, 100, 100)
SAVE_HISTORICAL_NETWORK_EVERY = 10000
game_spec = TicTacToeGameSpec()

create_network_func = functools.partial(create_network, game_spec.board_squares(), HIDDEN_NODES)

train_policy_gradients_vs_historic(game_spec, create_network_func,
                                   'train_vs_historical.p',
                                   save_historic_every=SAVE_HISTORICAL_NETWORK_EVERY)
Example #9
                    default=10000,
                    help="Every how many games to print results.")
parser.add_argument("--learning-rate", type=float, default=1e-4)
parser.add_argument("--batch-size",
                    type=int,
                    default=100,
                    help="Every how many games to update network weights")
parser.add_argument("hidden_layers",
                    nargs="*",
                    type=int,
                    help="List of hidden layer sizes")
args = parser.parse_args()

# to play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec; getting these to
# run well may require tuning the hyperparameters a bit
game_spec = TicTacToeGameSpec()

if not args.hidden_layers:
    args.hidden_layers = (100, 100, 100)

# create_network_func = functools.partial(create_network, game_spec.board_squares(), (100, 100, 100))
create_network_func = functools.partial(create_network,
                                        game_spec.board_squares(),
                                        args.hidden_layers)

network_file_path = 'current_network'
for n in args.hidden_layers:
    network_file_path = network_file_path + ("_%05d" % n)

network_file_path = network_file_path + ".p"
Example #10
a valid move, so initially it must learn the rules of the game.

I have trained this version with success at 3x3 tic-tac-toe, until it has a success rate in the region of 75%. This may
be as good as it can do, because 3x3 tic-tac-toe is a theoretical draw, so the random opponent will often get lucky and
force a draw.
"""
import functools

from common.network_helpers import create_network
from games.tic_tac_toe import TicTacToeGameSpec
from techniques.train_policy_gradient import train_policy_gradients

BATCH_SIZE = 100  # number of games between parameter updates
LEARN_RATE = 1e-4
PRINT_RESULTS_EVERY_X = 1000  # number of games between printing results
NETWORK_FILE_PATH = None  # path to save the network to, e.g. 'current_network.p'
NUMBER_OF_GAMES_TO_RUN = 1000000


# to play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec; getting these to
# run well may require tuning the hyperparameters a bit
game_spec = TicTacToeGameSpec()

create_network_func = functools.partial(create_network, game_spec.board_squares(), (100, 100, 100))

train_policy_gradients(game_spec, create_network_func, NETWORK_FILE_PATH,
                       number_of_games=NUMBER_OF_GAMES_TO_RUN,
                       batch_size=BATCH_SIZE,
                       learn_rate=LEARN_RATE,
                       print_results_every=PRINT_RESULTS_EVERY_X)
Example #11
import numpy as np
import tensorflow as tf

from games.tic_tac_toe import TicTacToeGameSpec
from network_helpers import create_network, load_network, get_stochastic_network_move, save_network

HIDDEN_NODES = (100, 80, 60, 40)
BATCH_SIZE = 100  # number of games between parameter updates
LEARN_RATE = 1e-4
PRINT_RESULTS_EVERY_X = 1000  # number of games between printing results
NETWORK_FILE_PATH = 'current_network.p'
NUMBER_OF_GAMES_TO_RUN = 100000

# to play a different game, change this to another spec, e.g. TicTacToeXGameSpec or ConnectXGameSpec
game_spec = TicTacToeGameSpec()

OUTPUT_NODES = game_spec.outputs()

reward_placeholder = tf.placeholder("float", shape=(None, ))
actual_move_placeholder = tf.placeholder("float", shape=(None, OUTPUT_NODES))

input_layer, output_layer, variables = create_network(
    game_spec.board_squares(), HIDDEN_NODES, output_nodes=OUTPUT_NODES)

policy_gradient = tf.reduce_sum(
    tf.reshape(reward_placeholder,
               (-1, 1)) * actual_move_placeholder * output_layer)
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(-policy_gradient)

with tf.Session() as session:
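    # A hedged sketch of what the session body typically does here (the batch_* names are
    # illustrative, not variables from this file): initialise the variables, then repeatedly
    # feed mini-batches of board states, one-hot moves and per-game rewards into the
    # placeholders defined above, e.g.
    #
    # session.run(tf.initialize_all_variables())
    # session.run(train_step, feed_dict={input_layer: batch_board_states,
    #                                    actual_move_placeholder: batch_moves,
    #                                    reward_placeholder: batch_rewards})
    #
    # Minimising -policy_gradient raises the probability the network assigns to moves that
    # were followed by positive rewards and lowers it for moves followed by negative rewards.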
Example #12
import tensorflow as tf

from games.tic_tac_toe import TicTacToeGameSpec
from network_helpers import create_network, load_network, get_stochastic_network_move, \
    save_network

NUMBER_OF_HISTORICAL_COPIES_TO_KEEP = 8
NUMBER_OF_GAMES_TO_PLAY = 1000000
MINI_BATCH_SIZE = 100
SAVE_HISTORICAL_NETWORK_EVERY = 100000
STARTING_NETWORK_WEIGHTS = 'current_network.p'
BASE_HISTORICAL_NETWORK_PATH = 'historical_network_'
HIDDEN_NODES = (100, 80, 60, 40)
PRINT_RESULTS_EVERY_X = 500
LEARN_RATE = 1e-4
game_spec = TicTacToeGameSpec()

input_layer, output_layer, variables = create_network(game_spec.board_squares(), HIDDEN_NODES,
                                                      output_nodes=game_spec.outputs())

reward_placeholder = tf.placeholder("float", shape=(None,))
actual_move_placeholder = tf.placeholder("float", shape=(None, game_spec.board_squares()))
policy_gradient = tf.reduce_sum(tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer)
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(-policy_gradient)

current_historical_index = 0
historical_networks = []

mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
results = collections.deque(maxlen=PRINT_RESULTS_EVERY_X)
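
# The snippet ends before the historical opponents are built; a plausible continuation
# (a sketch, not necessarily this repo's exact code) creates one network per historical
# slot and stores its tensors for later use:
#
# for _ in range(NUMBER_OF_HISTORICAL_COPIES_TO_KEEP):
#     historical_input, historical_output, historical_variables = create_network(
#         game_spec.board_squares(), HIDDEN_NODES, output_nodes=game_spec.outputs())
#     historical_networks.append((historical_input, historical_output, historical_variables))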
Example #13
def setUp(self):
    self._game_spec = TicTacToeGameSpec()