def run():
    board_size = 9
    encoder = zero.ZeroEncoder(board_size)

    board_input = Input(shape=encoder.shape(), name='board_input')
    pb = board_input
    # Shared convolutional tower.
    for i in range(16):
        pb = Conv2D(64, (3, 3),
                    padding='same',
                    data_format='channels_first',
                    activation='relu')(pb)

    # Policy head.
    policy_conv = Conv2D(2, (1, 1),
                         data_format='channels_first',
                         activation='relu')(pb)
    policy_flat = Flatten()(policy_conv)
    policy_output = Dense(encoder.num_moves(), activation='softmax')(policy_flat)

    # Value head.
    value_conv = Conv2D(1, (1, 1),
                        data_format='channels_first',
                        activation='relu')(pb)
    value_flat = Flatten()(value_conv)
    value_hidden = Dense(256, activation='relu')(value_flat)
    value_output = Dense(1, activation='tanh')(value_hidden)

    model = Model(inputs=[board_input], outputs=[policy_output, value_output])

    black_agent = zero.ZeroAgent(model, encoder, rounds_per_move=10, c=2.0)
    white_agent = zero.ZeroAgent(model, encoder, rounds_per_move=10, c=2.0)
    c1 = zero.ZeroExperienceCollector()
    c2 = zero.ZeroExperienceCollector()
    black_agent.set_collector(c1)
    white_agent.set_collector(c2)

    num_games = 10
    for i in range(num_games):
        print(f'Game {i + 1}/{num_games}')
        start_time = time.time()
        simulate_game(board_size, black_agent, c1, white_agent, c2)
        elapsed = time.time() - start_time
        print(f'elapsed: {elapsed} s')
        print(
            f'estimated time remaining this session: '
            f'{(num_games - (i + 1)) * elapsed} s'
        )

    exp = zero.combine_experience([c1, c2], board_size)
    black_agent.train(exp, 0.01, 1024)
    with h5py.File('agz_experience.h5', 'a') as expfile:
        exp.serialize(expfile)
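# `run()` above (and the `main()` functions below) call a `simulate_game()`
# helper that is not defined in this section. The sketch below is a plausible
# implementation, not the exact one from the accompanying code base: it mirrors
# the call sites `simulate_game(board_size, black_agent, c1, white_agent, c2)`
# and reuses the GameState/scoring/Player game loop shown in `generate_game()`
# at the end of this section. Note that `do_self_play()` further below uses a
# different `simulate_game` variant that takes the two agents plus the board
# size and returns the game result instead.
def simulate_game(board_size, black_agent, black_collector,
                  white_agent, white_collector):
    game = GameState.new_game(board_size)
    agents = {
        Player.black: black_agent,
        Player.white: white_agent,
    }
    black_collector.begin_episode()
    white_collector.begin_episode()
    while not game.is_over():
        next_move = agents[game.next_player].select_move(game)
        game = game.apply_move(next_move)

    # Reward +1 for the winning side's collector and -1 for the losing side's.
    game_result = scoring.compute_game_result(game)
    if game_result.winner == Player.black:
        black_collector.complete_episode(1)
        white_collector.complete_episode(-1)
    else:
        black_collector.complete_episode(-1)
        white_collector.complete_episode(1)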
def main():
    board_size = 9
    encoder = zero.ZeroEncoder(board_size)

    board_input = Input(shape=encoder.shape(), name='board_input')
    pb = board_input
    for i in range(4):
        pb = Conv2D(64, (3, 3),
                    padding='same',
                    data_format='channels_first')(pb)
        pb = BatchNormalization(axis=1)(pb)
        pb = Activation('relu')(pb)

    # Policy output
    policy_conv = Conv2D(2, (1, 1), data_format='channels_first')(pb)
    policy_batch = BatchNormalization(axis=1)(policy_conv)
    policy_relu = Activation('relu')(policy_batch)
    policy_flat = Flatten()(policy_relu)
    policy_output = Dense(encoder.num_moves(), activation='softmax')(policy_flat)

    # Value output
    value_conv = Conv2D(1, (1, 1), data_format='channels_first')(pb)
    value_batch = BatchNormalization(axis=1)(value_conv)
    value_relu = Activation('relu')(value_batch)
    value_flat = Flatten()(value_relu)
    value_hidden = Dense(256, activation='relu')(value_flat)
    value_output = Dense(1, activation='tanh')(value_hidden)

    model = Model(inputs=[board_input], outputs=[policy_output, value_output])

    c1 = zero.ZeroExperienceCollector()
    c2 = zero.ZeroExperienceCollector()
    black_agent = zero.ZeroAgent(model, encoder, rounds_per_move=10, c=2.0)
    white_agent = zero.ZeroAgent(model, encoder, rounds_per_move=10, c=2.0)
    black_agent.set_collector(c1)
    white_agent.set_collector(c2)

    print('Starting the game!')
    game = GameState.new_game(board_size)
    c1.begin_episode()
    c2.begin_episode()

    black_move = black_agent.select_move(game)
    print('B', black_move)
    game = game.apply_move(black_move)
    white_move = white_agent.select_move(game)
    print('W', white_move)
    game = game.apply_move(white_move)
    black_move = black_agent.select_move(game)
    print('B', black_move)

    c1.complete_episode(1)
    c2.complete_episode(-1)
    exp = zero.combine_experience([c1, c2])
    black_agent.train(exp, 0.01, 2048)
def generate_experience(learning_agent, reference_agent, exp_file,
                        num_games, board_size, num_workers):
    experience_files = []
    workers = []
    gpu_frac = 1 / float(num_workers)
    games_per_worker = num_games // num_workers
    for i in range(num_workers):
        filename = get_temp_file()
        experience_files.append(filename)
        worker = multiprocessing.Process(
            target=do_self_play,
            args=(
                board_size,
                learning_agent,
                reference_agent,
                games_per_worker,
                filename,
                gpu_frac,
            ))
        worker.start()
        workers.append(worker)

    # Wait for all workers to finish.
    print('Waiting for workers...')
    for worker in workers:
        worker.join()

    # Merge experience buffers.
    print('Merging experience buffers...')
    first_filename = experience_files[0]
    other_filenames = experience_files[1:]
    with h5py.File(first_filename, 'r') as expf:
        combined_buffer = zero.load_experience(expf)
    for filename in other_filenames:
        with h5py.File(filename, 'r') as expf:
            next_buffer = zero.load_experience(expf)
        combined_buffer = zero.combine_experience(
            [combined_buffer, next_buffer])
    print('Saving into %s...' % exp_file)
    with h5py.File(exp_file, 'w') as experience_outf:
        combined_buffer.serialize(experience_outf)

    # Clean up the temporary per-worker files.
    for fname in experience_files:
        os.unlink(fname)
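# `generate_experience()` writes each worker's games to a temporary file named
# by a `get_temp_file()` helper that is not shown in this section. A minimal
# sketch using only the standard library could look like the following; the
# helper in the accompanying code base may differ.
import tempfile

def get_temp_file():
    fd, fname = tempfile.mkstemp(prefix='dlgo-experience-')
    os.close(fd)
    return fname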
def main():
    board_size = 9
    encoder = zero.ZeroEncoder(board_size)

    board_input = Input(shape=encoder.shape(), name='board_input')
    pb = board_input
    for i in range(4):
        # No activation on the convolution itself: batch normalization runs
        # first, then the ReLU.
        pb = Conv2D(64, (3, 3),
                    padding='same',
                    data_format='channels_first')(pb)
        pb = BatchNormalization(axis=1)(pb)
        pb = Activation('relu')(pb)

    policy_conv = Conv2D(2, (1, 1), data_format='channels_first')(pb)
    policy_batch = BatchNormalization(axis=1)(policy_conv)
    policy_relu = Activation('relu')(policy_batch)
    policy_flat = Flatten()(policy_relu)
    policy_output = Dense(encoder.num_moves(), activation='softmax')(policy_flat)

    value_conv = Conv2D(1, (1, 1), data_format='channels_first')(pb)
    value_batch = BatchNormalization(axis=1)(value_conv)
    value_relu = Activation('relu')(value_batch)
    value_flat = Flatten()(value_relu)
    value_hidden = Dense(256, activation='relu')(value_flat)
    value_output = Dense(1, activation='tanh')(value_hidden)

    model = Model(inputs=[board_input], outputs=[policy_output, value_output])

    black_agent = zero.ZeroAgent(model, encoder, rounds_per_move=10, c=2.0)
    white_agent = zero.ZeroAgent(model, encoder, rounds_per_move=10, c=2.0)
    c1 = zero.ZeroExperienceCollector()
    c2 = zero.ZeroExperienceCollector()
    black_agent.set_collector(c1)
    white_agent.set_collector(c2)

    for i in range(5):
        simulate_game(board_size, black_agent, c1, white_agent, c2)

    exp = zero.combine_experience([c1, c2])
    black_agent.train(exp, 0.01, 2048)
def do_self_play(board_size, agent1_filename, agent2_filename,
                 num_games, experience_filename, gpu_frac):
    kerasutil.set_gpu_memory_target(gpu_frac)

    # Make sure each worker process gets a different random seed.
    random.seed(int(time.time()) + os.getpid())
    np.random.seed(int(time.time()) + os.getpid())

    agent1 = load_agent(agent1_filename)
    agent2 = load_agent(agent2_filename)

    collector1 = zero.ZeroExperienceCollector()
    collector2 = zero.ZeroExperienceCollector()

    color1 = Player.black
    for i in range(num_games):
        print('Simulating game %d/%d...' % (i + 1, num_games))
        collector1.begin_episode()
        collector2.begin_episode()
        agent1.set_collector(collector1)
        agent2.set_collector(collector2)

        if color1 == Player.black:
            black_player, white_player = agent1, agent2
        else:
            white_player, black_player = agent1, agent2
        game_record = simulate_game(black_player, white_player, board_size)
        if game_record.winner == color1:
            print('Agent 1 wins.')
            collector1.complete_episode(reward=1)
            collector2.complete_episode(reward=-1)
        else:
            print('Agent 2 wins.')
            collector1.complete_episode(reward=-1)
            collector2.complete_episode(reward=1)
        # Swap colors each game so both agents see both sides.
        color1 = color1.other

    experience = zero.combine_experience([collector1, collector2])
    print('Saving experience buffer to %s\n' % experience_filename)
    with h5py.File(experience_filename, 'w') as experience_outf:
        experience.serialize(experience_outf)
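# `do_self_play()` above receives file names for its two agents and loads them
# via a `load_agent()` helper that is not defined in this section. A minimal
# sketch, assuming the agents were serialized so that the
# `zero.load_zero_agent()` deserializer used in `generate_game()` below can
# read them, could be:
def load_agent(filename):
    with h5py.File(filename, 'r') as agent_file:
        return zero.load_zero_agent(agent_file)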
    policy_output = Dense(encoder.num_moves(),
                          activation='softmax')(policy_flat)

    value_conv = Conv2D(1, (1, 1),
                        data_format='channels_first',
                        activation='relu')(pb)  # <3>
    value_flat = Flatten()(value_conv)  # <3>
    value_hidden = Dense(256, activation='relu')(value_flat)  # <3>
    value_output = Dense(1, activation='tanh')(value_hidden)  # <3>

    model = Model(
        inputs=[board_input],
        outputs=[policy_output, value_output])
    # end::zero_model[]

    # tag::zero_train[]
    black_agent = zero.ZeroAgent(
        model, encoder, rounds_per_move=10, c=2.0)  # <4>
    white_agent = zero.ZeroAgent(
        model, encoder, rounds_per_move=10, c=2.0)

    c1 = zero.ZeroExperienceCollector()
    c2 = zero.ZeroExperienceCollector()
    black_agent.set_collector(c1)
    white_agent.set_collector(c2)

    for i in range(5):  # <5>
        simulate_game(board_size, black_agent, c1, white_agent, c2)

    exp = zero.combine_experience([c1, c2])
    black_agent.train(exp, 0.01, 2048)
    # end::zero_train[]
def generate_game(board_size, game_id_str, rounds_per_move=10, c=2.0):
    start = time.time()
    print(f'Generating {game_id_str}...')

    game = GameState.new_game(board_size)
    encoder = zero.ZeroEncoder(board_size)

    # Load the current best agent, if one exists. The agent is loaded from disk
    # in the worker instead of being passed in, because arguments to this
    # function would have to survive pickling.
    if os.path.exists('agz_bot.h5'):
        with h5py.File('agz_bot.h5', 'r') as bot_file:
            black_agent = zero.load_zero_agent(bot_file)
            white_agent = zero.load_zero_agent(bot_file)
    else:
        print(f'WARN: using default model to generate {game_id_str}')
        model = zero_model(board_size)
        black_agent = zero.ZeroAgent(
            model, encoder, rounds_per_move=rounds_per_move, c=c)
        white_agent = zero.ZeroAgent(
            model, encoder, rounds_per_move=rounds_per_move, c=c)

    agents = {
        Player.black: black_agent,
        Player.white: white_agent,
    }

    c1 = zero.ZeroExperienceCollector()
    c2 = zero.ZeroExperienceCollector()
    black_agent.set_collector(c1)
    white_agent.set_collector(c2)
    c1.begin_episode()
    c2.begin_episode()

    while not game.is_over():
        next_move = agents[game.next_player].select_move(game)
        game = game.apply_move(next_move)

    game_result = scoring.compute_game_result(game)
    if game_result.winner == Player.black:
        c1.complete_episode(1)
        c2.complete_episode(-1)
    else:
        c1.complete_episode(-1)
        c2.complete_episode(1)

    combined = zero.combine_experience([c1, c2], board_size)

    # Drop references to the model, agents, and game state, then clear the
    # Keras session so the worker releases its memory between games.
    c1 = c2 = game_result = None
    model = encoder = None
    game = None
    del black_agent.model
    del white_agent.model
    black_agent = white_agent = None

    import gc
    K.clear_session()
    gc.collect()

    return combined, game_id_str, time.time() - start