def train(board_size, max_timesteps):
    """Train a Gomoku AI to play on a board of size board_size x board_size.

    Parameters
    ----------
    board_size: int
        Size of the board in one dimension,
        e.g. board_size = 9 --> the board has size 9x9.
    max_timesteps: int
        Number of training steps.

    Returns
    -------
    None
    """
    env = gym.make(
        'Gomoku{}x{}-arena-v0'.format(board_size, board_size))
    val_env = gym.make(
        'Gomoku{}x{}-arena-v0'.format(board_size, board_size),
        __val_opponent_policy)

    # Enabling layer_norm here is important for parameter space noise!
    capacity = 64
    num_conv_layers = 8
    conv_layers = [(capacity, 3, 1)] * num_conv_layers
    hidden_layers = [capacity]
    model = deepq.models.cnn_to_mlp(
        convs=conv_layers,
        hiddens=hidden_layers,
    )
    timesteps_to_explore = 800000
    act = deepq.learn(
        env=env,
        val_env=val_env,
        q_func=model,
        max_timesteps=max_timesteps,
        lr=1e-4,
        buffer_size=400000,
        batch_size=512,
        exploration_fraction=(timesteps_to_explore / max_timesteps),
        exploration_final_eps=0.35,
        train_freq=4,
        val_freq=1000,
        print_freq=100,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=False,
        deterministic_filter=True,
        random_filter=True,
        state_file='kaithy_cnn_to_mlp_{}_model.pkl'.format(board_size),
    )
    print('Saving model to kaithy_cnn_to_mlp_{}_model.pkl'.format(board_size))
    act.save('kaithy_cnn_to_mlp_{}_model.pkl'.format(board_size))
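# Both train() and enjoy() pass a `__val_opponent_policy` that this snippet
# never defines. A minimal sketch, assuming the validation opponent simply
# reuses gym_gomoku's beginner policy (hypothetical; the real definition
# may differ):
def __val_opponent_policy(curr_state, prev_state, prev_action):
    '''
    Fixed opponent used for validation games.
    '''
    return gym_gomoku.envs.util.make_beginner_policy(
        np.random)(curr_state, prev_state, prev_action)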
def enjoy(board_size):
    """Watch a trained Gomoku AI play on a board of size board_size x board_size.

    Parameters
    ----------
    board_size: int
        Size of the board in one dimension,
        e.g. board_size = 9 --> the board has size 9x9.

    Returns
    -------
    None
    """
    env = gym.make(
        'Gomoku{}x{}-arena-v0'.format(board_size, board_size),
        __val_opponent_policy)
    act = deepq.load("kaithy_cnn_to_mlp_{}_model.pkl".format(board_size))

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            obs, rew, done, _ = env.step(act(obs[None], stochastic=False)[0])
            episode_rew += rew
            env.render()
        print('Episode reward', episode_rew)
        input('Hit enter to play next match')
        print('Swap color')
        env.swap_role()
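# A minimal driver sketch: train a 9x9 agent, then watch it play. The
# timestep budget is illustrative only (train() assumes max_timesteps is at
# least the 800000 exploration steps it budgets internally).
if __name__ == '__main__':
    train(board_size=9, max_timesteps=2000000)
    enjoy(board_size=9)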
def main():
    '''
    AI self-training program
    '''
    deterministic_actions_filter = True

    env = gym.make('Gomoku5x5-training-camp-v0', opponent_policy)
    obs_ph = tf.placeholder(
        dtype=tf.float32, shape=[None] + list(env.observation_space.shape))
    if deterministic_actions_filter:
        # Summing over the channel axis marks every occupied cell,
        # i.e. every position where a move would be invalid.
        invalid_masks = tf.reduce_sum(obs_ph, axis=3)

    sess = tf.Session()

    observations = []
    for i in range(2):
        observation = env.reset()
        done = False
        while not done:
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            observations.append(observation)
            env.render()

    out = sess.run(invalid_masks, feed_dict={obs_ph: observations})
    print(out)
    print(out.shape)
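# What the mask probe above shows, in plain NumPy: each cell is one-hot
# across the stone planes, so a channel-axis sum is 1.0 exactly where a
# stone already sits. A tiny sketch (2x2 board with two stone planes;
# shapes are illustrative -- if the observation also carries an "empty"
# plane, as the obs_ph[:, :, :, 1:3] slicing in later snippets suggests,
# only the stone planes should be summed):
import numpy as np

obs = np.zeros((2, 2, 2), dtype=np.float32)
obs[0, 0, 0] = 1.0  # black stone at (0, 0)
obs[1, 1, 1] = 1.0  # white stone at (1, 1)
print(obs.sum(axis=2))  # [[1. 0.], [0. 1.]] -- 1.0 marks invalid moves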
def main():
    '''
    AI self-training program
    '''
    class Opponent(object):
        def __init__(self):
            self.__old_obs = None
            self.__old_action = None
            self.__obs = None

        def policy(self, curr_state, prev_state, prev_action):
            '''
            Define the opponent's policy here.
            '''
            return gym_gomoku.envs.util.make_beginner_policy(
                np.random)(curr_state, prev_state, prev_action)

    opponent = Opponent()

    env = gym.make('Gomoku5x5-training-camp-v0')
    env.opponent_policy = opponent.policy

    for i in range(2):
        observation = env.reset()
        done = False
        while not done:
            # Sample a random valid move (taken positions are removed
            # from the action space, so moves are never repeated).
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            env.render()
        env.swap_role()
        print("\n----SWAP----\n")
def main():
    '''
    AI self-training program
    '''
    env = gym.make('Gomoku9x9-training-camp-v0', opponent_policy)
    env.reset()

    action = env.action_space.sample()  # sample a random valid move
    observation, reward, done, info = env.step(action)
def main():
    env = gym.make('Gomoku9x9-training-camp-v0', opponent_policy)
    model = models.mlp([64])
    act = simple.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
        callback=callback)
    print("Saving model to gomoku_model.pkl")
    act.save("gomoku_model.pkl")
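# `callback` is not defined in this snippet. baselines-style callbacks take
# (lcl, _glb) -- the locals and globals of learn() -- and return True to
# stop training early. A sketch along those lines (the 0.9 mean-reward
# threshold is a placeholder, not a tuned value):
def callback(lcl, _glb):
    # Stop once the mean reward over the last 100 episodes is high enough.
    is_solved = lcl['t'] > 100 and \
        sum(lcl['episode_rewards'][-101:-1]) / 100 >= 0.9
    return is_solved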
def main():
    '''
    AI self-training program
    '''
    env = gym.make('Gomoku5x5-training-camp-v0', opponent_policy)

    for i in range(2):
        observation = env.reset()
        done = False
        while not done:
            action = env.action_space.sample()  # sample a random valid move
            observation, reward, done, info = env.step(action)
            env.render()
        env.swap_role()
        print("\n----SWAP----\n")
import matplotlib.pyplot as plt
import scipy.misc
import os
# %matplotlib inline


def opponent_policy(curr_state, prev_state, prev_action):
    '''
    Define the opponent's policy here.
    '''
    return gym_gomoku.envs.util.make_beginner_policy(
        np.random)(curr_state, prev_state, prev_action)


env = gym.make('Gomoku9x9-training-camp-v0', opponent_policy)
env.reset()


class Qnetwork():
    def __init__(self, h_size):
        # The network receives a frame from the game, flattened into an
        # array, then reshapes it and processes it through four
        # convolutional layers. A 9x9 board with 2 feature planes flattens
        # to 162 values, so the placeholder is sized to match the reshape.
        self.scalarInput = tf.placeholder(
            shape=[None, 9 * 9 * 2], dtype=tf.float32)
        self.imageIn = tf.reshape(self.scalarInput, shape=[-1, 9, 9, 2])
        self.conv1 = slim.conv2d(
            inputs=self.imageIn, num_outputs=3, kernel_size=[2, 2],
            stride=[1, 1], padding='VALID', biases_initializer=None)
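        # conv1 is where this snippet leaves off. A plausible completion of
        # the remaining layers, sketched under assumption (widths and kernel
        # sizes are illustrative, not the author's values): three more 2x2
        # convolutions, then a head mapping features to one Q-value per
        # board position.
        self.conv2 = slim.conv2d(
            inputs=self.conv1, num_outputs=h_size, kernel_size=[2, 2],
            stride=[1, 1], padding='VALID', biases_initializer=None)
        self.conv3 = slim.conv2d(
            inputs=self.conv2, num_outputs=h_size, kernel_size=[2, 2],
            stride=[1, 1], padding='VALID', biases_initializer=None)
        self.conv4 = slim.conv2d(
            inputs=self.conv3, num_outputs=h_size, kernel_size=[2, 2],
            stride=[1, 1], padding='VALID', biases_initializer=None)
        self.Qout = slim.fully_connected(
            slim.flatten(self.conv4), 81, activation_fn=None)
        self.predict = tf.argmax(self.Qout, axis=1)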
def main():
    '''
    AI self-training program
    '''
    deterministic_filter = True
    random_filter = True

    env = gym.make('Gomoku5x5-training-camp-v0', opponent_policy)
    num_actions = env.action_space.n

    obs_ph = tf.placeholder(
        dtype=tf.float32, shape=[None] + list(env.observation_space.shape))
    q_values = layers.fully_connected(layers.flatten(obs_ph), num_actions)

    if deterministic_filter or random_filter:
        # Channels 1 and 2 hold the two players' stones; an occupied cell
        # is an invalid move.
        invalid_masks = tf.contrib.layers.flatten(
            tf.reduce_sum(obs_ph[:, :, :, 1:3], axis=3))

    if deterministic_filter:
        # Push invalid actions below the worst Q-value so argmax skips them.
        q_values_worst = tf.reduce_min(q_values, axis=1, keep_dims=True)
        q_values = invalid_masks * (q_values_worst - 1.0) + \
            (1.0 - invalid_masks) * q_values

    deterministic_actions = tf.argmax(q_values, axis=1, output_type=tf.int32)

    batch_size = tf.shape(obs_ph)[0]
    stochastic_ph = tf.constant(True, dtype=tf.bool)
    random_actions = tf.random_uniform(
        tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int32)

    if random_filter:
        def get_elements(data, indices):
            # Row-wise gather: data[i, indices[i]] for every row i.
            flat_indices = tf.range(
                0, tf.shape(indices)[0]) * data.shape[1] + indices
            return tf.gather(tf.reshape(data, [-1]), flat_indices)

        # Replace invalid random actions with the deterministic ones.
        is_invalid_random_actions = get_elements(invalid_masks, random_actions)
        random_actions = tf.where(
            tf.equal(is_invalid_random_actions, 1.),
            deterministic_actions, random_actions)

    chose_random = tf.random_uniform(
        tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < 0.9
    stochastic_actions = tf.where(
        chose_random, random_actions, deterministic_actions)

    output_actions = tf.where(
        stochastic_ph, stochastic_actions, deterministic_actions)

    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    # Dummy training op (never run here); minimize wants a scalar loss.
    train_op = optimizer.minimize(tf.reduce_mean(q_values))

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    observations = []
    for i in range(2):
        observation = env.reset()
        done = False
        while not done:
            action = sess.run(output_actions, feed_dict={
                obs_ph: observation[None]})[0]
            observation, reward, done, info = env.step(action)
            env.render()
            observations.append(observation)
            print(reward)
        env.swap_role()
        print("\n----SWAP----\n")

    actions = sess.run(output_actions, feed_dict={obs_ph: observations})
    sess.run(q_values, feed_dict={obs_ph: observations})
    print(actions)
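# What get_elements computes, in NumPy terms: a row-wise gather, i.e.
# data[i, indices[i]] for every row i, done by flattening because TF1 has
# no direct batched indexing. A small sketch for illustration:
import numpy as np

data = np.array([[0., 1., 0.],
                 [1., 0., 0.]])
indices = np.array([1, 2])
flat = np.arange(len(indices)) * data.shape[1] + indices
print(data.reshape(-1)[flat])  # [1. 0.] == [data[0, 1], data[1, 2]]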
def main():
    '''
    AI self-training program
    '''
    deterministic_filter = True
    random_filter = True

    env = gym.make('Gomoku5x5-training-camp-v0', opponent_policy)
    num_actions = env.action_space.n

    # Unused helper retained from the baselines scaffolding; flatten_obs
    # and U (baselines.common.tf_util) are assumed defined elsewhere.
    def make_obs_ph(name):
        obs_shape = env.observation_space.shape
        if flatten_obs:
            flattened_env_shape = 1
            for dim_size in env.observation_space.shape:
                flattened_env_shape *= dim_size
            obs_shape = (flattened_env_shape, )
        return U.BatchInput(obs_shape, name=name)

    # Create batch augmentation for obs -------------------------------------
    # Stack the observation with its 3 rotations, its left-right flip, and
    # the flip's 3 rotations: the 8 symmetries of the board.
    obs_t_input = tf.placeholder(
        dtype=tf.float32, shape=list(env.observation_space.shape))
    list_obs = [obs_t_input]
    for i in range(1, 8):
        if i < 4:
            list_obs.append(tf.image.rot90(obs_t_input, k=i))
        elif i == 4:
            list_obs.append(tf.image.flip_left_right(obs_t_input))
        else:
            # Rotate the flipped board, to match flip_action below.
            list_obs.append(tf.image.rot90(
                tf.image.flip_left_right(obs_t_input), k=i - 4))
    obs_ph = tf.stack(list_obs)
    # End create augmentation ------------------------------------------------

    q_values = layers.fully_connected(layers.flatten(obs_ph), num_actions)

    if deterministic_filter or random_filter:
        invalid_masks = tf.contrib.layers.flatten(
            tf.reduce_sum(obs_ph[:, :, :, 1:3], axis=3))

    if deterministic_filter:
        q_values_worst = tf.reduce_min(q_values, axis=1, keep_dims=True)
        q_values = invalid_masks * (q_values_worst - 1.0) + \
            (1.0 - invalid_masks) * q_values

    deterministic_actions = tf.argmax(q_values, axis=1, output_type=tf.int32)

    batch_size = tf.shape(obs_ph)[0]
    stochastic_ph = tf.constant(True, dtype=tf.bool)
    random_actions = tf.random_uniform(
        tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int32)

    if random_filter:
        def get_elements(data, indices):
            # Row-wise gather: data[i, indices[i]] for every row i.
            flat_indices = tf.range(
                0, tf.shape(indices)[0]) * data.shape[1] + indices
            return tf.gather(tf.reshape(data, [-1]), flat_indices)

        is_invalid_random_actions = get_elements(invalid_masks, random_actions)
        random_actions = tf.where(
            tf.equal(is_invalid_random_actions, 1.),
            deterministic_actions, random_actions)

    chose_random = tf.random_uniform(
        tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < 0.9
    stochastic_actions = tf.where(
        chose_random, random_actions, deterministic_actions)

    output_actions = tf.where(
        stochastic_ph, stochastic_actions, deterministic_actions)

    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    # Dummy training op (never run here); minimize wants a scalar loss.
    train_op = optimizer.minimize(tf.reduce_mean(q_values))

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    observations = []
    for i in range(2):
        observation = env.reset()
        done = False
        while not done:
            # Create actions for every symmetry ------------------------------
            def rotate_action(board_size, pos_1D, k):
                """
                Rotate a board position counter-clockwise.

                :param board_size: size of the board
                :param pos_1D: position on the board, flattened to 1-D
                :param k: 1: rotate 90, 2: rotate 180, 3: rotate 270
                """
                pos_2D = (pos_1D // board_size, pos_1D % board_size)
                if k == 1:  # rot90
                    rot_pos = pos_2D[0] + \
                        (board_size - 1 - pos_2D[1]) * board_size
                if k == 2:  # rot180
                    rot_pos = (board_size - 1 - pos_2D[0]) * board_size + \
                        (board_size - 1 - pos_2D[1])
                if k == 3:  # rot270
                    rot_pos = (board_size - 1 - pos_2D[0]) + \
                        pos_2D[1] * board_size
                return rot_pos

            def flip_action(board_size, pos_1D, k):
                """
                Flip a board position left-right, then rotate.

                :param board_size: size of the board
                :param pos_1D: position on the board, flattened to 1-D
                :param k: 0: flip only, 1: flip and rotate 90,
                          2: flip and rotate 180, 3: flip and rotate 270
                """
                pos_2D = (pos_1D // board_size, pos_1D % board_size)
                if k == 0:  # flip only
                    flip_rot = pos_2D[0] * board_size + \
                        board_size - 1 - pos_2D[1]
                if k == 1:  # flip and rot90
                    flip_rot = pos_2D[1] * board_size + pos_2D[0]
                if k == 2:  # flip and rot180
                    flip_rot = (board_size - 1 - pos_2D[0]) * board_size + \
                        pos_2D[1]
                if k == 3:  # flip and rot270
                    flip_rot = (board_size - 1 - pos_2D[1]) * board_size + \
                        board_size - 1 - pos_2D[0]
                return flip_rot

            # Run the graph to get one action per symmetry
            actions = sess.run(output_actions,
                               feed_dict={obs_t_input: observation})
            # Play the action chosen for the original (unrotated) board
            action = actions[0]
            # Map that action into the other 7 symmetries
            for j in range(1, 8):
                if j < 4:
                    actions[j] = rotate_action(observation.shape[0], action, j)
                else:
                    actions[j] = flip_action(
                        observation.shape[0], action, j - 4)
            # End create actions ----------------------------------------------

            observation, reward, done, info = env.step(action)

            # Debug: rotate the observation and compare against the rotated
            # action index, then bail out.
            angle = 1
            print(action, rotate_action(observation.shape[0], action, angle))
            print(observation[:, :, 1], env.observation_space.shape[0:2])
            obs_temp_ph = tf.placeholder(
                dtype=tf.int32, shape=env.observation_space.shape)
            k = tf.placeholder(tf.int32)
            tf_img = tf.image.rot90(obs_temp_ph, k=k)
            rotated_img = sess.run(tf_img, feed_dict={
                obs_temp_ph: observation, k: angle})
            print(rotated_img[:, :, 1])
            exit(0)

            env.render()
            observations.append(observation)
            print(reward)
        env.swap_role()
        print("\n----SWAP----\n")
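# A quick check that the index arithmetic in rotate_action / flip_action
# matches NumPy's board transforms (np.rot90 rotates counter-clockwise,
# like tf.image.rot90). A sketch assuming the two helpers are lifted to
# module scope; board size 5 and position 7 are illustrative:
import numpy as np

n = 5
board = np.arange(n * n).reshape(n, n)
pos = 7  # row 1, column 2
for k in range(1, 4):
    r, c = np.argwhere(np.rot90(board, k=k) == pos)[0]
    assert rotate_action(n, pos, k) == r * n + c
for k in range(0, 4):
    r, c = np.argwhere(np.rot90(np.fliplr(board), k=k) == pos)[0]
    assert flip_action(n, pos, k) == r * n + c
print('rotate_action / flip_action agree with np.rot90 / np.fliplr')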