def _worker_run(games_idxs):
    # Relies on module-level state: asyncio, numpy as np, a configured `logger`, the
    # per-process `_env_` namespace (nnet, params, generations, compare_models,
    # hdf_lock, hdf_file_name, name) and `_player_change_callback`.
    global _env_
    import self_play
    from dots_boxes.dots_boxes_game import BoxesState
    from utils.utils import write_to_hdf
    import time

    loop = asyncio.get_event_loop()
    tick = time.time()
    try:
        # Play the requested batch of games with the current network
        _env_.sp = self_play.SelfPlay(_env_.nnet, _env_.params)
        _env_.sp.set_player_change_callback(_player_change_callback)
        loop.run_until_complete(
            _env_.sp.play_games(BoxesState(), games_idxs, show_progress=False))
    except Exception as e:
        print(e, flush=True)
        raise e
    tack = time.time()

    # Collect the generated samples and append them to the shared HDF store
    df = _env_.sp.get_datasets(_env_.generations, not _env_.compare_models)
    if not _env_.compare_models:
        df["training"] = np.zeros(len(df.index), dtype=np.int8)
    with _env_.hdf_lock:
        write_to_hdf(_env_.hdf_file_name, "fresh", df)
    tock = time.time()

    logger.warning("Worker %s played %d games (%d samples) in %.0fs (save=%.3fs)",
                   _env_.name, len(games_idxs), len(df.index), tock - tick, tock - tack)
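# A minimal driver sketch (an assumption, not part of the original module): split the
# game indices into per-worker chunks and dispatch each chunk to _worker_run with a
# process pool. The real project presumably prepares `_env_` (network, params, HDF
# lock, ...) in each worker process via an initializer, which is omitted here.
import multiprocessing as mp

def _run_all_games(n_games, n_workers, initializer=None, initargs=()):
    # Round-robin split so every worker gets roughly n_games / n_workers games
    chunks = [list(range(i, n_games, n_workers)) for i in range(n_workers)]
    with mp.Pool(n_workers, initializer=initializer, initargs=initargs) as pool:
        pool.map(_worker_run, chunks)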
def test(self, render, opponent, muzero_player):
    """
    Test the model in a dedicated thread.

    Args:
        render: Boolean, whether to render the environment.
        opponent: "self" for self-play, "human" for playing against MuZero and
            "random" for a random agent.
        muzero_player: Integer with the player number of MuZero in case of
            multiplayer games; None lets MuZero play every player turn by turn.
    """
    print("\nTesting...")
    # ray.init()
    # self_play_workers = self_play.SelfPlay.remote(
    self_play_workers = self_play.SelfPlay(
        copy.deepcopy(self.muzero_weights),
        self.Game(numpy.random.randint(1000)),
        self.config,
    )
    # history = ray.get(
    #     self_play_workers.play_game.remote(0, 0, render, opponent, muzero_player)
    # )
    history = self_play_workers.play_game(0, 0, render, opponent, muzero_player)
    # ray.shutdown()
    return sum(history.reward_history)
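# Illustrative call site (an assumption about the surrounding wrapper class, which is
# not shown above): construct the wrapper for a game and evaluate a single episode.
#
#     muzero = MuZero("cartpole")   # assumed wrapper holding muzero_weights, Game, config
#     total_reward = muzero.test(render=True, opponent="self", muzero_player=None)
#     print("Total reward over the test game:", total_reward)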
def selfplay_and_train(self):
    # Build the network, the DQN agent, the replay container and the self-play driver
    self.model = network.NNmodel(self.config)
    self.DQNagent = agent.DQNagent(self.config, self.model)
    self.data = replay_buffer.DataContainer(self.config)
    self.selfplay = self_play.SelfPlay(self.config)

    # Generate games, then fit the model on (flattened observation, one-hot action) -> value
    self.selfplay.play_games(self.DQNagent, self.data)
    observation, action, value = self.data.get_data()
    observation = tf.reshape(observation, [observation.shape[0], -1])
    action = tf.one_hot(action, 9)
    inputs = tf.concat([observation, action], axis=1)  # renamed from `input` to avoid shadowing the built-in
    self.model.fit(inputs, value, batch_size=1024, epochs=50)
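# Shape sketch for the input construction above (illustrative dummy data; the 3x3 board
# is an assumption suggested by the 9-way one-hot action encoding).
import tensorflow as tf

obs = tf.zeros([32, 3, 3])                               # batch of 32 dummy board states
act = tf.constant([i % 9 for i in range(32)])            # one action index per state
flat = tf.reshape(obs, [obs.shape[0], -1])               # -> (32, 9)
inputs = tf.concat([flat, tf.one_hot(act, 9)], axis=1)   # -> (32, 18)
assert inputs.shape == (32, 18)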
def train(self):
    os.makedirs(self.config.results_path, exist_ok=True)

    # Initialize workers
    training_worker = trainer.Trainer(copy.deepcopy(self.muzero_weights), self.config)
    shared_storage_worker = shared_storage.SharedStorage(
        copy.deepcopy(self.muzero_weights),
        self.game_name,
        self.config,
    )
    replay_buffer_worker = replay_buffer.ReplayBuffer(self.config)

    # Pre-load buffer if pulling from persistent storage
    if self.replay_buffer:
        for game_history_id in self.replay_buffer:
            replay_buffer_worker.save_game(self.replay_buffer[game_history_id])
        print("\nLoaded {} games from replay buffer.".format(len(self.replay_buffer)))

    self_play_workers = [
        self_play.SelfPlay(
            copy.deepcopy(self.muzero_weights),
            self.Game(self.config.seed + seed),
            self.config,
        )
        for seed in range(self.config.num_actors)
    ]

    # Launch workers (without Ray these calls run sequentially in this process)
    [
        self_play_worker.continuous_self_play(shared_storage_worker, replay_buffer_worker)
        for self_play_worker in self_play_workers
    ]
    training_worker.continuous_update_weights(replay_buffer_worker, shared_storage_worker)

    # Save performance in TensorBoard
    print("Printing Logging info")
    self._logging_loop(shared_storage_worker, replay_buffer_worker)

    self.muzero_weights = shared_storage_worker.get_weights()
    self.replay_buffer = replay_buffer_worker.get_buffer()

    # Persist replay buffer to disk
    print("\n\nPersisting replay buffer games to disk...")
    pickle.dump(
        self.replay_buffer,
        open(os.path.join(self.config.results_path, "replay_buffer.pkl"), "wb"),
    )
def train(self):
    # Manage GPUs
    '''
    if 0 < self.num_gpus:
        num_gpus_per_worker = self.num_gpus / (
            self.config.train_on_gpu
            + self.config.num_workers * self.config.selfplay_on_gpu
            + log_in_tensorboard * self.config.selfplay_on_gpu
            + self.config.use_last_model_value * self.config.reanalyse_on_gpu
        )
        if 1 < num_gpus_per_worker:
            num_gpus_per_worker = math.floor(num_gpus_per_worker)
    else:
        num_gpus_per_worker = 0
    '''

    # Initialize worker threads
    for SP_worker_index in range(self.config.num_workers):
        self.self_play_workers.append(
            self_play.SelfPlay(self.checkpoint, self.Game, self.config,
                               self.config.seed + SP_worker_index))
    self.training_worker = trainer.Trainer(self.checkpoint, self.config)
    self.replay_buffer_worker = replay_buffer.ReplayBuffer(
        self.checkpoint, self.replay_buffer, self.config)
    self.shared_storage_worker = shared_storage.SharedStorage(
        self.checkpoint, self.config)
    self.shared_storage_worker.set_info("terminate", False)

    # Launch workers (note: only self_play_workers[0] is started in a thread here)
    play_thread = threading.Thread(
        target=self.self_play_workers[0].continuous_self_play,
        args=(self.shared_storage_worker, self.replay_buffer_worker))
    train_thread = threading.Thread(
        target=self.training_worker.continuous_update_weights,
        args=(self.shared_storage_worker, self.replay_buffer_worker))
    play_thread.start()
    train_thread.start()
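# Hypothetical shutdown companion (not in the original snippet): assuming the worker
# loops poll the shared "terminate" flag that was initialised to False above, and that
# the thread handles were stored on self when they were started.
def terminate_workers(self):
    self.shared_storage_worker.set_info("terminate", True)
    for t in (self.play_thread, self.train_thread):  # assumed attributes
        t.join()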
def _logging_loop(self, shared_storage_worker, replay_buffer_worker):
    """
    Keep track of the training performance.
    """
    # Launch the test worker to get performance metrics
    test_worker = self_play.SelfPlay(
        copy.deepcopy(self.muzero_weights),
        self.Game(self.config.seed + self.config.num_actors),
        self.config,
    )
    test_worker.continuous_self_play(shared_storage_worker, None, True)

    # Write everything in TensorBoard
    writer = SummaryWriter(self.config.results_path)
    print(
        "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n"
    )

    # Save hyperparameters to TensorBoard
    hp_table = [
        "| {} | {} |".format(key, value)
        for key, value in self.config.__dict__.items()
    ]
    writer.add_text(
        "Hyperparameters",
        "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table),
    )

    # Save model representation
    writer.add_text(
        "Model summary",
        str(models.MuZeroNetwork(self.config)).replace("\n", " \n\n"),
    )

    # Loop for updating the training performance
    counter = 0
    info = shared_storage_worker.get_info()
    try:
        while info["training_step"] < self.config.training_steps:
            info = shared_storage_worker.get_info()
            writer.add_scalar("1.Total reward/1.Total reward", info["total_reward"], counter)
            writer.add_scalar("1.Total reward/2.Mean value", info["mean_value"], counter)
            writer.add_scalar("1.Total reward/3.Episode length", info["episode_length"], counter)
            writer.add_scalar("1.Total reward/4.MuZero reward", info["muzero_reward"], counter)
            writer.add_scalar("1.Total reward/5.Opponent reward", info["opponent_reward"], counter)
            writer.add_scalar("2.Workers/1.Self played games",
                              replay_buffer_worker.get_self_play_count(), counter)
            writer.add_scalar("2.Workers/2.Training steps", info["training_step"], counter)
            writer.add_scalar("2.Workers/3.Self played games per training step ratio",
                              replay_buffer_worker.get_self_play_count()
                              / max(1, info["training_step"]), counter)
            writer.add_scalar("2.Workers/4.Learning rate", info["lr"], counter)
            writer.add_scalar("3.Loss/1.Total weighted loss", info["total_loss"], counter)
            writer.add_scalar("3.Loss/Value loss", info["value_loss"], counter)
            writer.add_scalar("3.Loss/Reward loss", info["reward_loss"], counter)
            writer.add_scalar("3.Loss/Policy loss", info["policy_loss"], counter)
            print(
                "Last test reward: {:.2f}. Training step: {}/{}. Played games: {}. Loss: {:.2f}".format(
                    info["total_reward"],
                    info["training_step"],
                    self.config.training_steps,
                    replay_buffer_worker.get_self_play_count(),
                    info["total_loss"],
                ),
                end="\r",
            )
            counter += 1
            time.sleep(0.5)
    except KeyboardInterrupt as err:
        # Comment the line below to be able to stop the training but keep running
        # raise err
        pass
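# Standalone illustration of the hyperparameter table written to TensorBoard above:
# every config attribute becomes one "| key | value |" Markdown row (dummy config here).
class _DummyConfig:
    def __init__(self):
        self.training_steps = 1000
        self.lr_init = 0.02

_cfg = _DummyConfig()
hp_table = ["| {} | {} |".format(k, v) for k, v in _cfg.__dict__.items()]
print("| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table))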
def train(self):
    # ray.init()
    os.makedirs(self.config.results_path, exist_ok=True)

    # Initialize workers
    # training_worker = trainer.Trainer.options(
    #     num_gpus=1 if "cuda" in self.config.training_device else 0
    # ).remote(copy.deepcopy(self.muzero_weights), self.config)
    training_worker = trainer.Trainer(copy.deepcopy(self.muzero_weights), self.config)
    # shared_storage_worker = shared_storage.SharedStorage.remote(
    #     copy.deepcopy(self.muzero_weights), self.game_name, self.config,
    # )
    shared_storage_worker = shared_storage.SharedStorage(
        copy.deepcopy(self.muzero_weights),
        self.game_name,
        self.config,
    )
    # replay_buffer_worker = replay_buffer.ReplayBuffer.remote(self.config)
    replay_buffer_worker = replay_buffer.ReplayBuffer(self.config)

    # Pre-load buffer if pulling from persistent storage
    if self.replay_buffer:
        for game_history_id in self.replay_buffer:
            # replay_buffer_worker.save_game.remote(
            replay_buffer_worker.save_game(self.replay_buffer[game_history_id])
        print("\nLoaded {} games from replay buffer.".format(len(self.replay_buffer)))

    self_play_workers = [
        # self_play.SelfPlay.remote(
        self_play.SelfPlay(
            copy.deepcopy(self.muzero_weights),
            self.Game(self.config.seed + seed),
            self.config,
        )
        for seed in range(self.config.num_actors)
    ]

    # # Launch workers
    # [
    #     # self_play_worker.continuous_self_play.remote(
    #     self_play_worker.continuous_self_play(
    #         shared_storage_worker, replay_buffer_worker
    #     )
    #     for self_play_worker in self_play_workers
    # ]
    # # training_worker.continuous_update_weights.remote(
    # training_worker.continuous_update_weights(
    #     replay_buffer_worker, shared_storage_worker
    # )
    # # Save performance in TensorBoard
    # self._logging_loop(shared_storage_worker, replay_buffer_worker)

    while True:
        # play a game
        [
            self_play_worker.joe_self_play(shared_storage_worker, replay_buffer_worker)
            for self_play_worker in self_play_workers
        ]
        self._joe_logging(shared_storage_worker, replay_buffer_worker)
        training_worker.joe_update_weights(replay_buffer_worker, shared_storage_worker)
        info = shared_storage_worker.get_info()
        if info["training_step"] >= self.config.training_steps:
            break

    # self.muzero_weights = ray.get(shared_storage_worker.get_weights.remote())
    self.muzero_weights = shared_storage_worker.get_weights()
    # self.replay_buffer = ray.get(replay_buffer_worker.get_buffer.remote())
    self.replay_buffer = replay_buffer_worker.get_buffer()

    # Persist replay buffer to disk
    print("\n\nPersisting replay buffer games to disk...")
    pickle.dump(
        self.replay_buffer,
        open(os.path.join(self.config.results_path, "replay_buffer.pkl"), "wb"),
    )
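# Self-contained toy illustration of the sequential play -> log -> train cycle above
# (stub classes stand in for the real workers; the names here are invented for the sketch).
class _StubStorage:
    def __init__(self):
        self.step = 0
    def get_info(self):
        return {"training_step": self.step}

class _StubTrainer:
    def joe_update_weights(self, replay_buffer, storage):
        storage.step += 1  # one optimisation step per cycle

storage, stub_trainer = _StubStorage(), _StubTrainer()
while storage.get_info()["training_step"] < 5:
    # self-play and TensorBoard logging would run here in the real loop
    stub_trainer.joe_update_weights(None, storage)
print("done after", storage.get_info()["training_step"], "training steps")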
def _joe_logging(self, shared_storage_worker, replay_buffer_worker):
    """
    Keep track of the training performance.
    """
    if not hasattr(self, '_has_logged_one'):
        # Launch the test worker to get performance metrics
        self._test_worker = self_play.SelfPlay(
            copy.deepcopy(self.muzero_weights),
            self.Game(self.config.seed + self.config.num_actors),
            self.config,
        )

        # Write everything in TensorBoard
        writer = SummaryWriter(self.config.results_path)
        print(
            "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n"
        )

        # Save hyperparameters to TensorBoard
        hp_table = [
            "| {} | {} |".format(key, value)
            for key, value in self.config.__dict__.items()
        ]
        writer.add_text(
            "Hyperparameters",
            "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table),
        )

        # Save model representation
        writer.add_text(
            "Model summary",
            str(models.MuZeroNetwork(self.config)).replace("\n", " \n\n"),
        )

        self._has_logged_one = True
        self._writer = writer
        self._counter = 0
        self._last_game_played = 0
        return

    info = shared_storage_worker.get_info()
    writer = self._writer
    counter = info['training_step']

    if info['training_step'] % self.config.checkpoint_interval != 0:
        return

    games_played = replay_buffer_worker.get_self_play_count()
    if games_played % 3 == 0 and games_played != self._last_game_played:
        self._test_worker.joe_self_play(shared_storage_worker, None, True)
        # self._test_worker.joe_self_play(shared_storage_worker, replay_buffer_worker, True)
        self._last_game_played = games_played

    writer.add_scalar("1.Total reward/1.Total reward", info["total_reward"], counter)
    writer.add_scalar("1.Total reward/2.Mean value", info["mean_value"], counter)
    writer.add_scalar("1.Total reward/3.Episode length", info["episode_length"], counter)
    writer.add_scalar("1.Total reward/4.MuZero reward", info["muzero_reward"], counter)
    writer.add_scalar("1.Total reward/5.Opponent reward", info["opponent_reward"], counter)
    writer.add_scalar(
        "2.Workers/1.Self played games",
        # ray.get(replay_buffer_worker.get_self_play_count.remote()),
        replay_buffer_worker.get_self_play_count(),
        counter,
    )
    writer.add_scalar("2.Workers/2.Training steps", info["training_step"], counter)
    writer.add_scalar(
        "2.Workers/3.Self played games per training step ratio",
        # ray.get(replay_buffer_worker.get_self_play_count.remote())
        replay_buffer_worker.get_self_play_count() / max(1, info["training_step"]),
        counter,
    )
    writer.add_scalar("2.Workers/4.Learning rate", info["lr"], counter)
    writer.add_scalar("3.Loss/1.Total weighted loss", info["total_loss"], counter)
    writer.add_scalar("3.Loss/Value loss", info["value_loss"], counter)
    writer.add_scalar("3.Loss/Reward loss", info["reward_loss"], counter)
    writer.add_scalar("3.Loss/Policy loss", info["policy_loss"], counter)
    print(
        "Last test reward: {:.2f}. Training step: {}/{}. Played games: {}. Loss: {:.2f}".format(
            info["total_reward"],
            info["training_step"],
            self.config.training_steps,
            # ray.get(replay_buffer_worker.get_self_play_count.remote()),
            replay_buffer_worker.get_self_play_count(),
            info["total_loss"],
        ),
        end="\r",
    )