def test_create_network(self):
    """The factory must wire placeholders of the right shape and return one
    weight/bias pair per layer (all hidden layers plus the output layer)."""
    n_inputs = 20
    hidden_sizes = (50, 40, 30)
    in_layer, out_layer, params = create_network(n_inputs, hidden_sizes)

    # Both placeholders are (batch, n_inputs); the batch dimension is left
    # unconstrained (None). Output defaults to the input width here.
    self.assertSequenceEqual(in_layer.get_shape().as_list(), [None, n_inputs])
    self.assertSequenceEqual(out_layer.get_shape().as_list(), [None, n_inputs])

    # Every layer contributes exactly a weight matrix and a bias vector.
    expected_var_count = (len(hidden_sizes) + 1) * 2
    self.assertEqual(len(params), expected_var_count)
def test_save_and_load_network(self):
    """Variables saved from one network must round-trip, value-for-value,
    into a second identically-shaped network."""
    file_name = 'test.p'
    try:
        n_inputs = 20
        hidden_sizes = (50, 40, 30)
        _, _, source_vars = create_network(n_inputs, hidden_sizes)
        _, _, target_vars = create_network(n_inputs, hidden_sizes)

        with tf.Session() as session:
            session.run(tf.initialize_all_variables())
            save_network(session, source_vars, file_name)
            load_network(session, target_vars, file_name)

            # After loading, every target variable must mirror its source.
            for src, dst in zip(source_vars, target_vars):
                np.testing.assert_array_almost_equal(
                    session.run(src), session.run(dst))
    finally:
        try:
            os.remove(file_name)
        except OSError:
            # Nothing was written, so there is nothing to clean up.
            pass
# NOTE(review): extraction-mangled fragment -- an entire file chunk collapsed
# onto one line. It contains (a) the TAIL of a random-board-position generator
# whose `def` header lies outside this view (the fragment ends that function
# at `return board_state`), and (b) the START of a value-network training
# script: it builds a reinforcement policy network and a one-output value
# network (squared-error loss, RMSProp), then opens a tf.Session whose body
# is truncated mid-block here. Because both the leading `def` line and the
# remainder of the `with` block are missing, this cannot be reformatted or
# restructured safely -- left byte-identical pending recovery of the
# surrounding lines.
board_state = game_spec.new_board() number_moves = random.randint(*NUMBER_RANDOM_RANGE) side = 1 for _ in range(number_moves): board_state = game_spec.apply_move(board_state, random.choice(list(game_spec.available_moves(board_state))), side) if game_spec.has_winner(board_state) != 0: # start again if we hit an already winning position continue side = -side return board_state reinforcement_input_layer, reinforcement_output_layer, reinforcement_variables = create_network( game_spec.board_squares(), HIDDEN_NODES_REINFORCEMENT, game_spec.outputs()) value_input_layer, value_output_layer, value_variables = create_network(game_spec.board_squares(), HIDDEN_NODES_VALUE, output_nodes=1, output_softmax=False) target_placeholder = tf.placeholder("float", (None, 1)) error = tf.reduce_sum(tf.square(target_placeholder - value_output_layer)) train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(error) with tf.Session() as session: session.run(tf.initialize_all_variables()) load_network(session, reinforcement_variables, REINFORCEMENT_NETWORK_PATH)
"""If we had a database of games this would load and return it... Returns: """ raise Exception( "If we had a database of tic-tac-toe games this would load them") HIDDEN_NODES = (100, 80, 60, 40) # number of hidden layer neurons BATCH_SIZE = 100 # every how many games to do a parameter update? LEARN_RATE = 1e-4 NETWORK_FILE_PATH = 'current_network.p' game_spec = TicTacToeGameSpec() input_layer, output_layer, variables = create_network( game_spec.board_squares(), HIDDEN_NODES, output_nodes=game_spec.outputs()) actual_move_placeholder = tf.placeholder("float", (None, game_spec.outputs())) error = tf.reduce_sum(tf.square(actual_move_placeholder - output_layer)) train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(error) with tf.Session() as session: session.run(tf.initialize_all_variables()) if os.path.isfile(NETWORK_FILE_PATH): print("loading existing network") load_network(session, variables, NETWORK_FILE_PATH) episode_number = 1 positions_train, positions_test = load_games()
def train_policy_gradient(network_file_path,
                          save_network_file_path=None,
                          learn_rate=1e-3,
                          number_of_games=50000,
                          print_results_every=1000,
                          batch_size=100):
    """Train the policy network to imitate the random-greedy agent's moves.

    Self-play games are generated between the side-1 training agent
    (Agent.action) and the side -1 random-greedy agent; every board state the
    greedy agent sees and the one-hot move it picked are accumulated as
    supervised (state, move) pairs, and a softmax cross-entropy step is taken
    every `batch_size` games.

    Args:
        network_file_path: path of an existing network to resume from; also
            the default save path. May be falsy to disable persistence.
        save_network_file_path: optional separate save path; defaults to
            `network_file_path`.
        learn_rate: Adam learning rate.
        number_of_games: total number of self-play games to run.
        print_results_every: save the network every this many games.
        batch_size: number of games per parameter update.

    Returns:
        The list of network variables.
    """
    print('parameters => LR : %s  Batch Size : %s' % (learn_rate, batch_size))
    save_network_file_path = save_network_file_path or network_file_path

    # 10x10 board flattened to 100 inputs; one raw logit per square
    # (softmax is applied inside the cross-entropy loss, not the network).
    actual_move_placeholder = tf.placeholder("float", shape=(None, 100))
    input_layer, output_layer, variables = create_network(
        100, (100, 100, 100), output_softmax=False)
    error = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=output_layer, labels=actual_move_placeholder))
    train_step = tf.train.AdamOptimizer(learn_rate).minimize(error)

    with tf.Session() as session:
        session.run(tf.initialize_all_variables())

        if network_file_path and os.path.isfile(network_file_path):
            print("loading pre-existing network")
            load_network(session, variables, network_file_path)

        mini_batch_board_states, mini_batch_moves = [], []

        def my_player(board_state, side):
            """Greedy-agent move; records the (state, one-hot move) pair."""
            mini_batch_board_states.append(np.ravel(board_state) * side)
            agent = Agent(side, lossval=-1)
            move_tuple = agent.random_greedy(board_state)
            move = np.zeros(100)
            move[move_tuple[0] * 10 + move_tuple[1]] = 1.
            mini_batch_moves.append(move)
            return move_tuple

        def make_training_move(board_state, side):
            """Move for the side-1 player; NOT recorded as training data."""
            agent = Agent(side, lossval=-1)
            return agent.action(board_state)

        game_length = 0
        # range(1, n + 1): play exactly `number_of_games` games.  The
        # original `range(1, number_of_games)` dropped the last game.
        for episode_number in range(1, number_of_games + 1):
            board_state = emptyboard()
            # Randomly decide who opens.  The game loop below is identical
            # either way; this replaces two copy-pasted while-loops that
            # differed only in the initial player_turn.
            player_turn = 1 if random.getrandbits(1) else -1

            while True:
                _available_moves = list(available_moves(board_state))
                if not _available_moves:
                    break  # board full -- game over
                if player_turn > 0:
                    move = make_training_move(board_state, 1)
                else:
                    move = my_player(board_state, -1)
                if move not in _available_moves:
                    print('illegal move')
                    break
                board_state = apply_move(board_state, move, player_turn)
                winner = gameover(board_state)
                # 0 appears to mean "in progress" and 2 a non-win result --
                # TODO(review): confirm gameover()'s return codes.
                if winner != 0 and winner != 2:
                    break
                player_turn = -player_turn

            # All recorded pairs since the last update; the lists are
            # cleared together, so their length IS the running count.
            game_length = len(mini_batch_board_states)

            if episode_number % batch_size == 0:
                np_mini_batch_board_states = np.array(
                    mini_batch_board_states).reshape(
                        game_length, *input_layer.get_shape().as_list()[1:])
                ol, _ = session.run(
                    [output_layer, train_step],
                    feed_dict={
                        input_layer: np_mini_batch_board_states,
                        actual_move_placeholder: mini_batch_moves
                    })
                # Fraction of recorded moves whose argmax the network already
                # reproduces -- a cheap imitation-accuracy proxy.
                correct = np.sum(
                    np.argmax(ol, axis=1) == np.argmax(mini_batch_moves,
                                                       axis=1))
                del mini_batch_board_states[:]
                del mini_batch_moves[:]
                print('%s : accuracy %s' % (episode_number,
                                            correct / float(game_length)))
                game_length = 0

            if episode_number % print_results_every == 0:
                if network_file_path:
                    save_network(session, variables, save_network_file_path)

        # Save while the session is still open: calling save_network after
        # the `with` block would hand it a closed session.
        if network_file_path:
            print('saving final network')
            save_network(session, variables, save_network_file_path)
    return variables
def train_policy_gradient(network_file_path,
                          save_network_file_path=None,
                          learn_rate=1e-3,
                          number_of_games=50000,
                          print_results_every=1000,
                          batch_size=100):
    """Train a value network on the terminal outcomes of self-play games.

    NOTE(review): despite the name, this trains a *value* network (single
    scalar output, squared-error loss), not a policy gradient; the name is
    kept for interface compatibility with callers.

    Self-play games between two copies of the training agent are generated
    until TRAIN_SAMPLES + TEST_SAMPLES distinct positions are collected;
    each position is labelled with the final winner of its game.  Training
    then runs full epochs until the held-out error stops improving.

    Args:
        network_file_path: path of an existing network to resume from; also
            the default save path. May be falsy to disable persistence.
        save_network_file_path: optional separate save path; defaults to
            `network_file_path`.
        learn_rate: RMSProp learning rate.
        number_of_games: unused here -- kept for interface compatibility.
        print_results_every: unused here -- kept for interface compatibility.
        batch_size: mini-batch size for each gradient step.

    Returns:
        The list of network variables.
    """
    print('parameters => LR : %s  Batch Size : %s' % (learn_rate, batch_size))
    save_network_file_path = save_network_file_path or network_file_path

    # Single scalar value per position.  The 10-wide input presumably
    # matches what state_key() produces -- TODO(review): confirm.
    target_placeholder = tf.placeholder("float", shape=(None, 1))
    input_layer, output_layer, variables = create_network(
        10, (100, 100, 100, 100, 100), output_nodes=1, output_softmax=False)
    error = tf.reduce_sum(tf.square(target_placeholder - output_layer))
    train_step = tf.train.RMSPropOptimizer(learn_rate).minimize(error)

    with tf.Session() as session:
        session.run(tf.initialize_all_variables())

        if network_file_path and os.path.isfile(network_file_path):
            print("loading pre-existing network")
            load_network(session, variables, network_file_path)

        def make_training_move(board_state, side):
            """Best move according to the training agent for `side`."""
            return Agent(side, lossval=-1).action(board_state)

        board_states_training = {}  # state_key(position) -> final outcome
        board_states_test = []
        episode_number = 0
        seen_positions = set()  # flattened boards already collected

        while len(seen_positions) < TRAIN_SAMPLES + TEST_SAMPLES:
            print('total games %s' % len(seen_positions))
            board_state = emptyboard()
            game_positions = []
            side = 1 if random.getrandbits(1) else -1

            # Play one self-play game to completion, remembering every
            # intermediate position so it can be labelled afterwards.
            while True:
                board_state = apply_move(
                    board_state, make_training_move(board_state, side), side)
                game_positions.append(deepcopy(board_state))
                winner = gameover(board_state)
                if winner != 0:
                    if winner == 2:
                        # 2 appears to encode a draw; label those 0 --
                        # TODO(review): confirm gameover()'s return codes.
                        winner = 0
                    break
                side = -side

            # Label each previously-unseen position with the game outcome.
            for position in game_positions:
                flat = tuple(np.ravel(position))
                if flat not in seen_positions:
                    board_states_training[state_key(position)] = float(winner)
                    seen_positions.add(flat)

        # Move a random selection of the collected data into a test set.
        for _ in range(TEST_SAMPLES):
            sample = random.choice(list(board_states_training.keys()))
            board_states_test.append((sample, board_states_training[sample]))
            del board_states_training[sample]

        board_states_training = list(board_states_training.items())

        test_error = session.run(
            error,
            feed_dict={
                input_layer: [x[0] for x in board_states_test],
                target_placeholder: [[x[1]] for x in board_states_test]
            })

        # Train whole epochs until held-out error rises (early stopping).
        while True:
            np.random.shuffle(board_states_training)
            train_error = 0
            for start_index in range(
                    0, len(board_states_training) - batch_size + 1,
                    batch_size):
                mini_batch = board_states_training[
                    start_index:start_index + batch_size]
                batch_error, _ = session.run(
                    [error, train_step],
                    feed_dict={
                        input_layer: [x[0] for x in mini_batch],
                        target_placeholder: [[x[1]] for x in mini_batch]
                    })
                train_error += batch_error

            new_test_error = session.run(
                error,
                feed_dict={
                    input_layer: [x[0] for x in board_states_test],
                    target_placeholder: [[x[1]] for x in board_states_test]
                })
            print(
                "episode: %s train_error: %s new_test_error: %s test_error: %s"
                % (episode_number, train_error, new_test_error, test_error))
            if new_test_error > test_error:
                print("train error went up, stopping training")
                break
            test_error = new_test_error
            episode_number += 1

        if network_file_path:
            print('saving final network')
            save_network(session, variables, save_network_file_path)
    return variables
# NOTE(review): extraction-mangled fragment. It contains the NAMES symbol
# table, a complete `printboard` helper, and an `if __name__ == '__main__'`
# interactive-play driver that loads 'MoonGo_supervised_cross_prob.pickle'
# and is TRUNCATED mid-game-loop (the text ends right after the draw
# message, before any move handling).  Because the driver's remainder is
# missing, the whole chunk is left byte-identical; reformatting would
# require inventing the unseen tail.
NAMES = {0: '_', 1: 'X', -1: 'O'} def printboard(state): cells = [] print ' ', for i in range(BOARD_SIZE): print '{0}'.format(str(i).center(5)), print '\n' for i in range(BOARD_SIZE): print i, for j in range(BOARD_SIZE): print '{0}'.format(NAMES[state[i][j]].center(5)), print('\n') if __name__ == '__main__': input_layer, output_layer, variables = create_network(100,(100,100,100)) with tf.Session() as session: session.run(tf.initialize_all_variables()) # MoonGo_supervised_cross_prob MoonGo_reinforcement load_network(session, variables, 'MoonGo_supervised_cross_prob.pickle') while 1: board_state = emptyboard() player_turn = 1 while True: printboard(board_state) _available_moves = list(available_moves(board_state)) if len(_available_moves) == 0: print("no moves left, game ended a draw")
# NOTE(review): extraction-mangled fragment, a near-duplicate of the chunk
# above: the same `printboard` helper plus an `if __name__ == '__main__'`
# driver, but for the 1-output value network ('MoonGo_reinforcement.pickle',
# 10 inputs, no softmax).  The driver is TRUNCATED mid-statement (the text
# ends at `if len(_available_moves) == 0:` with no body), so the chunk is
# left byte-identical; any reformatting would require guessing the missing
# tail.
def printboard(state): cells = [] print ' ', for i in range(BOARD_SIZE): print '{0}'.format(str(i).center(5)), print '\n' for i in range(BOARD_SIZE): print i, for j in range(BOARD_SIZE): print '{0}'.format(NAMES[state[i][j]].center(5)), print('\n') if __name__ == '__main__': input_layer, output_layer, variables = create_network(10, (10, 10, 10, 10, 10), output_nodes=1, output_softmax=False) with tf.Session() as session: session.run(tf.initialize_all_variables()) # MoonGo_supervised_cross_prob MoonGo_reinforcement load_network(session, variables, 'MoonGo_reinforcement.pickle') while 1: board_state = emptyboard() player_turn = 1 while True: printboard(board_state) _available_moves = list(available_moves(board_state)) if len(_available_moves) == 0: