def __init__(self, initial_weights, shared_storage, replay_buffer, config):
    self.shared_storage = shared_storage
    self.replay_buffer = replay_buffer
    self.config = config

    # Initialize the network
    self.latest_network = models.MuZeroNetwork(self.config)
    self.latest_network.set_weights(initial_weights)
    self.latest_network.to(torch.device("cpu"))
    self.latest_network.eval()

    self.target_network = models.MuZeroNetwork(self.config)
    self.target_network.set_weights(initial_weights)
    self.target_network.to(torch.device("cpu"))
    self.target_network.eval()

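# Keeping separate latest/target copies above implies a periodic sync from the
# live network to the frozen target. A minimal method sketch of that refresh;
# update_target is an assumption, not part of the original class.
def update_target(self):
    self.target_network.set_weights(self.latest_network.get_weights())
    self.target_network.eval()
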
def __init__(self, checkpoint, config):
    self.config = config

    # Initialize the network
    self.model = models.MuZeroNetwork(self.config)
    self.model.set_weights(checkpoint["weights"])
    self.model.eval()

def __init__(self, initial_weights, config):
    self.config = config
    self.training_step = 0

    # Fix random generator seed
    numpy.random.seed(self.config.seed)
    torch.manual_seed(self.config.seed)

    # Initialize the network
    self.model = models.MuZeroNetwork(self.config)
    self.model.set_weights(initial_weights)
    self.model.to(torch.device(config.training_device))
    self.model.train()

    if self.config.optimizer == "SGD":
        self.optimizer = torch.optim.SGD(
            self.model.parameters(),
            lr=self.config.lr_init,
            momentum=self.config.momentum,
            weight_decay=self.config.weight_decay,
        )
    elif self.config.optimizer == "Adam":
        self.optimizer = torch.optim.Adam(
            self.model.parameters(),
            lr=self.config.lr_init,
            weight_decay=self.config.weight_decay,
        )
    else:
        raise ValueError(
            "{} is not implemented. You can change the optimizer manually in trainer.py.".format(
                self.config.optimizer
            )
        )

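# lr_init above is typically the starting point of an exponential decay applied
# at every training step. A minimal sketch of that schedule; lr_decay_rate and
# lr_decay_steps are assumed config fields (standard in muzero-general configs,
# but verify in yours).
def update_lr(optimizer, config, training_step):
    lr = config.lr_init * config.lr_decay_rate ** (training_step / config.lr_decay_steps)
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr
    return lr
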
def __init__(self, game_name):
    self.game_name = game_name

    # Load the game and the config from the module with the game name
    try:
        game_module = importlib.import_module("games." + self.game_name)
        self.config: MuZeroConfigBase = game_module.MuZeroConfig()
        self.Game = game_module.Game
    except Exception as err:
        print(
            '{} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.'.format(
                self.game_name
            )
        )
        raise err

    os.makedirs(os.path.join(self.config.results_path), exist_ok=True)

    # Fix random generator seed for reproducibility
    numpy.random.seed(self.config.seed)
    torch.manual_seed(self.config.seed)

    # Initial weights used to initialize components
    self.muzero_weights = models.MuZeroNetwork(
        self.config.observation_shape,
        len(self.config.action_space),
        self.config.encoding_size,
        self.config.hidden_size,
    ).get_weights()

    self.config.results_path = (
        Path(self.config.results_path)
        / (self.game_name + "_summary")
        / time_stamp_str()
    )

def __init__(self, weights, config):
    self.config = config

    # Initialize the network
    self.model = models.MuZeroNetwork(self.config)
    self.model.set_weights(weights)
    self.model.to(torch.device("cpu"))
    self.model.eval()

def __init__(self, initial_checkpoint, Game, config, seed, opponent_initial_checkpoint=None):
    self.config = config
    self.game = Game(seed)

    # Fix random generator seed
    numpy.random.seed(seed)
    torch.manual_seed(seed)

    # Initialize the network
    self.model = models.MuZeroNetwork(self.config)
    self.model.set_weights(initial_checkpoint["weights"])
    self.model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    self.model.eval()

    if opponent_initial_checkpoint is not None:
        # Initialize the opponent network
        self.opponent_model = models.MuZeroNetwork(self.config)
        self.opponent_model.set_weights(opponent_initial_checkpoint["weights"])
        self.opponent_model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
        self.opponent_model.eval()

def __init__(self, initial_checkpoint, Game, config, seed):
    self.config = config
    self.game = Game(seed)

    # Fix random generator seed
    numpy.random.seed(seed)
    torch.manual_seed(seed)

    # Initialize the network
    self.model = models.MuZeroNetwork(self.config)
    self.model.set_weights(initial_checkpoint["weights"])
    self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu"))
    self.model.eval()

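# Self-play workers like the one above usually re-sync with the trainer between
# games. A hypothetical sketch: it assumes shared_storage.get_info() returns a
# dict with a "weights" entry, matching the checkpoint layout used in this file.
def refresh_model(model, shared_storage):
    model.set_weights(shared_storage.get_info()["weights"])
    model.eval()
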
def __init__(self, initial_weights, config):
    self.config = config
    self.num_reanalysed_games = 0

    # Fix random generator seed
    numpy.random.seed(self.config.seed)
    torch.manual_seed(self.config.seed)

    # Initialize the network
    self.model = models.MuZeroNetwork(self.config)
    self.model.set_weights(initial_weights)
    self.model.to(torch.device(self.config.reanalyse_device))
    self.model.eval()

def __init__(self, initial_weights, game, config):
    self.config = config
    self.game = game

    # Fix random generator seed
    numpy.random.seed(self.config.seed)
    torch.manual_seed(self.config.seed)

    # Initialize the network
    self.model = models.MuZeroNetwork(self.config)
    self.model.set_weights(initial_weights)
    self.model.to(torch.device("cpu"))
    self.model.eval()

def __init__(self, initial_weights, game, config):
    self.config = config
    self.game = game

    # Initialize the network
    self.model = models.MuZeroNetwork(
        self.config.observation_shape,
        len(self.config.action_space),
        self.config.encoding_size,
        self.config.hidden_size,
    )
    self.model.set_weights(initial_weights)
    self.model.to(torch.device("cpu"))
    self.model.eval()

def __init__(self, initial_checkpoint, config):
    self.config = config

    # Fix random generator seed
    numpy.random.seed(self.config.seed)
    torch.manual_seed(self.config.seed)

    # Initialize the network
    self.model = models.MuZeroNetwork(self.config)
    self.model.set_weights(initial_checkpoint["weights"])
    self.model.to(torch.device("cuda" if self.config.reanalyse_on_gpu else "cpu"))
    self.model.eval()

    self.num_reanalysed_games = initial_checkpoint["num_reanalysed_games"]

def __init__(self, initial_checkpoint, Game, config, seed):
    self.config = config
    # self.game = traffic_environment.TrafficEnv()
    self.game = Game

    # Fix random generator seed
    numpy.random.seed(seed)
    torch.manual_seed(seed)

    # Initialize the network: load the most recent weights, move the model
    # to the self-play device, and switch to eval mode
    self.model = models.MuZeroNetwork(self.config)
    self.model.set_weights(initial_checkpoint["weights"])
    self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu"))
    self.model.eval()

def __init__(self, initial_checkpoint, Game, config, seed):
    self.config = config

    # A negative seed means "pick one at random"; resolve it before it is
    # used to seed the game and the random generators
    if seed < 0:
        seed = numpy.random.randint(10000)
    self.game = Game(seed)

    # Fix random generator seed
    numpy.random.seed(seed)
    torch.manual_seed(seed)

    # Initialize the network
    self.model = models.MuZeroNetwork(self.config)
    self.model.set_weights(initial_checkpoint["weights"])
    self.model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    self.model.eval()

def __init__(self, config):
    self.config = config
    self.buffer = {}
    self.game_priorities = collections.deque(maxlen=self.config.window_size)
    self.max_recorded_game_priority = 1.0
    self.self_play_count = 0
    self.total_samples = 0

    # Used only for the Reanalyze options
    self.model = (
        models.MuZeroNetwork(self.config)
        if self.config.use_last_model_value
        else None
    )

    # Fix random generator seed
    numpy.random.seed(self.config.seed)
    torch.manual_seed(self.config.seed)

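# game_priorities above is the usual input to prioritized replay sampling. A
# minimal sketch assuming priorities are positive floats; sample_game and its
# normalization are illustrative, not the repository's actual sampler.
import numpy

def sample_game(buffer, game_priorities):
    probs = numpy.array(game_priorities, dtype="float32")
    probs /= probs.sum()
    index = numpy.random.choice(len(probs), p=probs)
    return list(buffer.keys())[index], probs[index]
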
def __init__(self, initial_weights, config):
    self.config = config
    self.training_step = 0

    # Initialize the network
    self.model = models.MuZeroNetwork(self.config)
    self.model.set_weights(initial_weights)
    self.model.to(torch.device(config.training_device))
    self.model.train()

    self.optimizer = torch.optim.Adam(
        self.model.parameters(),
        lr=self.config.lr_init,
        # momentum=self.config.momentum,  # SGD leftover: Adam takes no momentum argument
        weight_decay=self.config.weight_decay,
    )

def __init__(self, initial_checkpoint, config):
    self.config = config
    self.has_LR_message_been_shown = False

    # Fix random generator seed
    numpy.random.seed(self.config.seed)
    torch.manual_seed(self.config.seed)

    # Initialize the network
    self.model = models.MuZeroNetwork(self.config)
    self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"]))
    self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu"))
    self.model.train()

    self.training_step = initial_checkpoint["training_step"]

    if "cuda" not in str(next(self.model.parameters()).device):
        print("You are not training on GPU.\n")

    # Initialize the optimizer
    if self.config.optimizer == "SGD":
        self.optimizer = torch.optim.SGD(
            self.model.parameters(),
            lr=self.config.lr_init,
            momentum=self.config.momentum,
            weight_decay=self.config.weight_decay,
        )
    elif self.config.optimizer == "Adam":
        self.optimizer = torch.optim.Adam(
            self.model.parameters(),
            lr=self.config.lr_init,
            weight_decay=self.config.weight_decay,
        )
    else:
        raise NotImplementedError(
            f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py."
        )

    if initial_checkpoint["optimizer_state"] is not None:
        print("Loading optimizer...\n")
        self.optimizer.load_state_dict(
            copy.deepcopy(initial_checkpoint["optimizer_state"])
        )

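# Mirror of the loading logic above: a hypothetical helper that writes the
# weights and optimizer state back into the same checkpoint keys the
# constructor reads ("weights", "optimizer_state", "training_step").
import copy

def export_checkpoint(trainer):
    return {
        "weights": copy.deepcopy(trainer.model.get_weights()),
        "optimizer_state": copy.deepcopy(trainer.optimizer.state_dict()),
        "training_step": trainer.training_step,
    }
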
def __init__(self, initial_weights, game, config, test=False, idx=-1, render=False):
    self.config: MuZeroConfigBase = config
    self.game = game
    self.idx = idx
    self.episode = 0
    self.render = render
    self.writer = SummaryWriter(self.config.results_path / f"self_play_{idx}")

    # Initialize the network
    self.model = models.MuZeroNetwork(
        self.config.observation_shape,
        len(self.config.action_space),
        self.config.encoding_size,
        self.config.hidden_size,
    )
    self.model.set_weights(initial_weights)
    self.model.to(torch.device("cpu"))
    self.model.eval()

    self.continuous_self_play(test)

def __init__(self, game_name):
    self.game_name = game_name

    # Load the game and the config from the module with the game name
    try:
        game_module = importlib.import_module("games." + self.game_name)
        self.config = game_module.MuZeroConfig()
        self.Game = game_module.Game
    except Exception as err:
        print(
            '{} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.'.format(
                self.game_name
            )
        )
        raise err

    # Fix random generator seed
    numpy.random.seed(self.config.seed)
    torch.manual_seed(self.config.seed)

    # Weights used to initialize components
    self.muzero_weights = models.MuZeroNetwork(self.config).get_weights()

def __init__(self, initial_weights, config):
    self.config = config
    self.training_step = 0

    # Initialize the network
    self.model = models.MuZeroNetwork(
        self.config.observation_shape,
        len(self.config.action_space),
        self.config.encoding_size,
        self.config.hidden_size,
    )
    self.model.set_weights(initial_weights)
    self.model.to(torch.device(config.training_device))
    self.model.train()

    self.optimizer = torch.optim.SGD(
        self.model.parameters(),
        lr=self.config.lr_init,
        momentum=self.config.momentum,
        weight_decay=self.config.weight_decay,
    )

def __init__(self, initial_checkpoint, config):
    self.config = config

    # Fix random generator seed
    numpy.random.seed(self.config.seed)
    torch.manual_seed(self.config.seed)

    # Import the game class to enable MCTS updates
    game_module = importlib.import_module("games." + self.config.game_filename)
    self.game = game_module.Game()

    # Initialize the network
    self.model = models.MuZeroNetwork(self.config)
    self.model.set_weights(initial_checkpoint["weights"])
    self.model.to(torch.device("cuda" if self.config.reanalyse_on_gpu else "cpu"))
    self.model.eval()

    # Create the target network (for stable bootstrapping)
    if self.config.use_last_model_value:
        self.target_model = copy.deepcopy(self.model)
        self.last_update_step = initial_checkpoint["training_step"]

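# last_update_step above implies periodically refreshing target_model from the
# live model. A sketch of that step; target_update_interval is an assumed
# config field, not one confirmed by this file.
def maybe_update_target(self, current_training_step):
    if current_training_step - self.last_update_step >= self.config.target_update_interval:
        self.target_model.set_weights(self.model.get_weights())
        self.last_update_step = current_training_step
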
def __init__(self, initial_weights, config):
    self.config: MuZeroConfigBase = config
    self.training_step = 0
    self.writer = SummaryWriter(self.config.results_path / "trainer")

    # Initialize the network
    self.model = models.MuZeroNetwork(
        self.config.observation_shape,
        len(self.config.action_space),
        self.config.encoding_size,
        self.config.hidden_size,
    )
    self.model.set_weights(initial_weights)
    self.model.to(torch.device(config.training_device))
    self.model.train()

    self.optimizer = torch.optim.SGD(
        self.model.parameters(),
        lr=self.config.lr_init,
        momentum=self.config.momentum,
        weight_decay=self.config.weight_decay,
    )

    def async_put_weights():
        # Publish fresh weights whenever the consumer has drained the queue
        # and a new training step has completed since the last push
        last_idx = None
        while True:
            if self.config.q_weights.empty():
                if self.training_step != last_idx:
                    weights = self.model.get_weights()
                    last_idx = self.training_step
                    self.config.q_weights.put(weights)
            else:
                time.sleep(0.1)

    # daemon=True so this background publisher does not keep the process alive
    Thread(target=async_put_weights, daemon=True).start()

    self.continuous_update_weights()

def get_initial_weights(config):
    model = models.MuZeroNetwork(config)
    weights = model.get_weights()
    summary = str(model).replace("\n", " \n\n")
    return weights, summary

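# Example use of get_initial_weights; "config" is assumed to be a game's
# MuZeroConfig instance (as loaded via importlib elsewhere in this file), and
# the two-key checkpoint is a trimmed illustration of the full layout.
import copy

weights, summary = get_initial_weights(config)
checkpoint = {"weights": copy.deepcopy(weights), "training_step": 0}
print(summary)
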
def _logging_loop(self, shared_storage_worker, replay_buffer_worker):
    """
    Keep track of the training performance
    """
    # Launch the test worker to get performance metrics
    test_worker = self_play.SelfPlay(
        copy.deepcopy(self.muzero_weights),
        self.Game(self.config.seed + self.config.num_actors),
        self.config,
    )
    test_worker.continuous_self_play(shared_storage_worker, None, True)

    # Write everything in TensorBoard
    writer = SummaryWriter(self.config.results_path)

    print(
        "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n"
    )

    # Save hyperparameters to TensorBoard
    hp_table = [
        "| {} | {} |".format(key, value)
        for key, value in self.config.__dict__.items()
    ]
    writer.add_text(
        "Hyperparameters",
        "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table),
    )

    # Save model representation
    writer.add_text(
        "Model summary",
        str(models.MuZeroNetwork(self.config)).replace("\n", " \n\n"),
    )

    # Loop for updating the training performance
    counter = 0
    info = shared_storage_worker.get_info()
    try:
        while info["training_step"] < self.config.training_steps:
            info = shared_storage_worker.get_info()
            writer.add_scalar(
                "1.Total reward/1.Total reward", info["total_reward"], counter
            )
            writer.add_scalar(
                "1.Total reward/2.Mean value", info["mean_value"], counter
            )
            writer.add_scalar(
                "1.Total reward/3.Episode length", info["episode_length"], counter
            )
            writer.add_scalar(
                "1.Total reward/4.MuZero reward", info["muzero_reward"], counter
            )
            writer.add_scalar(
                "1.Total reward/5.Opponent reward", info["opponent_reward"], counter
            )
            writer.add_scalar(
                "2.Workers/1.Self played games",
                replay_buffer_worker.get_self_play_count(),
                counter,
            )
            writer.add_scalar(
                "2.Workers/2.Training steps", info["training_step"], counter
            )
            writer.add_scalar(
                "2.Workers/3.Self played games per training step ratio",
                replay_buffer_worker.get_self_play_count()
                / max(1, info["training_step"]),
                counter,
            )
            writer.add_scalar("2.Workers/4.Learning rate", info["lr"], counter)
            writer.add_scalar(
                "3.Loss/1.Total weighted loss", info["total_loss"], counter
            )
            writer.add_scalar("3.Loss/Value loss", info["value_loss"], counter)
            writer.add_scalar("3.Loss/Reward loss", info["reward_loss"], counter)
            writer.add_scalar("3.Loss/Policy loss", info["policy_loss"], counter)
            print(
                "Last test reward: {:.2f}. Training step: {}/{}. Played games: {}. Loss: {:.2f}".format(
                    info["total_reward"],
                    info["training_step"],
                    self.config.training_steps,
                    replay_buffer_worker.get_self_play_count(),
                    info["total_loss"],
                ),
                end="\r",
            )
            counter += 1
            time.sleep(0.5)
    except KeyboardInterrupt as err:
        # The raise below stays commented so that Ctrl+C stops the training
        # loop but lets the rest of the program keep running
        # raise err
        pass

def play_against_algorithm(weight_file_path, config_name, seed, algo="expert", render=False):
    np.random.seed(seed)
    torch.manual_seed(seed)

    game_module = importlib.import_module("games." + config_name)
    config = game_module.MuZeroConfig()

    model = models.MuZeroNetwork(config)
    model.set_weights(torch.load(weight_file_path))
    model.eval()

    # Look up the opponent class (e.g. Expert) by name and instantiate it
    algo = globals()[algo.capitalize()](-1, 1)

    game = Game(seed)
    observation = game.reset()

    game_history = GameHistory()
    game_history.action_history.append(0)
    game_history.reward_history.append(0)
    game_history.to_play_history.append(game.to_play())
    game_history.legal_actions.append(game.legal_actions())
    game_history.observation_history.append(observation)

    done = False
    depth = 9
    reward = 0
    while not done:
        if game.to_play_real() == -1:
            action = algo(game.get_state(), depth, game.to_play_real())
        else:
            stacked_observations = game_history.get_stacked_observations(
                -1,
                config.stacked_observations,
            )
            root, priority, tree_depth = MCTS(config).run(
                model,
                stacked_observations,
                game.legal_actions(),
                game.to_play(),
                False,
            )
            action = SelfPlay.select_action(root, 0)
            game_history.store_search_statistics(root, config.action_space)
            game_history.priorities.append(priority)

        observation, reward, done = game.step(action)
        if render:
            game.render()
        depth -= 1

        game_history.action_history.append(action)
        game_history.observation_history.append(observation)
        game_history.reward_history.append(reward)
        game_history.to_play_history.append(game.to_play())
        game_history.legal_actions.append(game.legal_actions())

    return reward, TictactoeComp.wins(game.get_state(), 1)

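# Example invocation of play_against_algorithm; the weight path is a
# placeholder, and the second return value reports whether player 1 won.
reward, player1_won = play_against_algorithm(
    "results/tictactoe/model.weights", "tictactoe", seed=0, algo="expert", render=True
)
print("Final reward: {}, player 1 won: {}".format(reward, player1_won))
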
def __init__(self, game_name, config=None, split_resources_in=1):
    # Load the game and the config from the module with the game name
    '''
    try:
        game_module = importlib.import_module("games." + game_name)
        self.Game = game_module.Game
        self.config = game_module.MuZeroConfig()
    except ModuleNotFoundError as err:
        print(
            f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.'
        )
        raise err

    # Overwrite the config
    if config:
        if type(config) is dict:
            for param, value in config.items():
                setattr(self.config, param, value)
        else:
            self.config = config
    '''

    # Network 1
    speed = 2  # need to test speeds 0, 1, 2
    self.Game = traffic_environment.TrafficEnv(
        car_speed=speed,
        max_wait=1400,
        horiz_lanes=("e",),
        vert_lanes=("n",),
        horiz_sizes=(7, 7),
        vert_sizes=(3, 3),
    )
    self.Game.make_spawn_blocks(
        self.Game.start_indices, [0.9, 0.1]
    )  # alternative: [0.5 for _ in range(len(self.Game.start_indices))]
    # env_numLanes = len(self.Game.horiz_lanes) + len(self.Game.vert_lanes)

    # Network 2
    # self.Game = traffic_environment.TrafficEnv(car_speed=speed, max_wait=800, horiz_lanes=('e', 'w'), vert_lanes=('s', 'sn'), horiz_sizes=(3, 3, 2), vert_sizes=(3, 2, 2))
    # self.Game.make_spawn_blocks(self.Game.start_indices, [0.9, 0.1, 0.9, 0.9, 0.1])  # alternative: [0.5 for _ in range(len(self.Game.start_indices))]
    # env_numLanes = len(self.Game.horiz_lanes) + len(self.Game.vert_lanes)

    self.config = muzero_config.MuZeroConfig()
    # self.config.observation_shape = (1, 1, len(self.Game.observation()))
    self.config.observation_shape = (1, 1, len(self.Game.observation_space.sample()))
    self.config.action_space = list(range(0, 2 ** len(self.Game.action_space.sample())))
    # self.config.action_space = list(range(2 ** self.Game.action_space.shape[0]))

    # Fix random generator seed
    numpy.random.seed(self.config.seed)
    torch.manual_seed(self.config.seed)

    # Manage GPUs
    # TODO could trim this out
    total_gpus = (
        self.config.max_num_gpus
        if self.config.max_num_gpus is not None
        else torch.cuda.device_count()
    )
    self.num_gpus = total_gpus / split_resources_in
    if 1 < self.num_gpus:
        self.num_gpus = math.floor(self.num_gpus)

    # Checkpoint and replay buffer used to initialize workers
    self.checkpoint = {
        "weights": None,
        "optimizer_state": None,
        "total_reward": 0,
        "muzero_reward": 0,
        "opponent_reward": 0,
        "episode_length": 0,
        "mean_value": 0,
        "training_step": 0,
        "lr": 0,
        "total_loss": 0,
        "value_loss": 0,
        "reward_loss": 0,
        "policy_loss": 0,
        "num_played_games": 0,
        "num_played_steps": 0,
        "num_reanalysed_games": 0,
        "terminate": False,
    }
    self.replay_buffer = {}

    model = models.MuZeroNetwork(self.config)
    weights = model.get_weights()
    self.summary = str(model).replace("\n", " \n\n")
    self.checkpoint["weights"] = copy.deepcopy(weights)

    # Workers
    self.self_play_workers = []
    self.test_worker = None
    self.training_worker = None
    self.reanalyse_worker = None
    self.replay_buffer_worker = None
    self.shared_storage_worker = None

def play_against_other(weights1, config1, weights2, config2, seed, render=False):
    np.random.seed(seed)
    torch.manual_seed(seed)

    game_module = importlib.import_module("games." + config1)
    config1 = game_module.MuZeroConfig()
    model1 = models.MuZeroNetwork(config1)
    model1.set_weights(torch.load(weights1))
    model1.eval()

    game_module = importlib.import_module("games." + config2)
    config2 = game_module.MuZeroConfig()
    model2 = models.MuZeroNetwork(config2)
    model2.set_weights(torch.load(weights2))
    model2.eval()

    game = Game(seed)
    observation = game.reset()

    game_history1 = GameHistory()
    game_history1.action_history.append(0)
    game_history1.reward_history.append(0)
    game_history1.to_play_history.append(game.to_play())
    game_history1.legal_actions.append(game.legal_actions())
    observation1 = copy.deepcopy(observation)
    # observation1[0] = -observation1[1]
    # observation1[1] = -observation1[0]
    # observation1[2] = -observation1[2]
    game_history1.observation_history.append(observation1)

    game_history2 = GameHistory()
    game_history2.action_history.append(0)
    game_history2.reward_history.append(0)
    game_history2.to_play_history.append(not game.to_play())
    game_history2.legal_actions.append(game.legal_actions())
    observation2 = copy.deepcopy(observation)
    # Read from the unmodified observation so the perspective swap does not
    # reuse the channel that was just overwritten
    observation2[0] = -observation[1]
    observation2[1] = -observation[0]
    observation2[2] = -observation2[2]
    game_history2.observation_history.append(observation2)

    done = False
    reward = 0
    while not done:
        if game.to_play_real() == 1:
            config = config1
            model = model1
            game_history = game_history1
        else:
            config = config2
            model = model2
            game_history = game_history2

        stacked_observations = game_history.get_stacked_observations(
            -1,
            config.stacked_observations,
        )
        root, priority, tree_depth = MCTS(config).run(
            model,
            stacked_observations,
            game.legal_actions(),
            game.to_play(),
            False,
        )
        action = SelfPlay.select_action(root, 0)

        game_history1.store_search_statistics(root, config.action_space)
        game_history1.priorities.append(priority)
        game_history2.store_search_statistics(root, config.action_space)
        game_history2.priorities.append(priority)

        observation, reward, done = game.step(action)
        if render:
            game.render()

        game_history1.action_history.append(action)
        observation1 = copy.deepcopy(observation)
        # observation1[0] = -observation1[1]
        # observation1[1] = -observation1[0]
        # observation1[2] = -observation1[2]
        game_history1.observation_history.append(observation1)
        game_history1.reward_history.append(reward)
        game_history1.to_play_history.append(game.to_play())
        game_history1.legal_actions.append(game.legal_actions())

        game_history2.action_history.append(action)
        observation2 = copy.deepcopy(observation)
        # Same perspective swap as above, reading from the unmodified observation
        observation2[0] = -observation[1]
        observation2[1] = -observation[0]
        observation2[2] = -observation2[2]
        game_history2.observation_history.append(observation2)
        game_history2.reward_history.append(reward)
        game_history2.to_play_history.append(not game.to_play())
        game_history2.legal_actions.append(game.legal_actions())

    return reward, TictactoeComp.wins(game.get_state(), 1)

def _joe_logging(self, shared_storage_worker, replay_buffer_worker):
    """
    Keep track of the training performance
    """
    if not hasattr(self, '_has_logged_one'):
        # Launch the test worker to get performance metrics
        self._test_worker = self_play.SelfPlay(
            copy.deepcopy(self.muzero_weights),
            self.Game(self.config.seed + self.config.num_actors),
            self.config,
        )

        # Write everything in TensorBoard
        writer = SummaryWriter(self.config.results_path)

        print(
            "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n"
        )

        # Save hyperparameters to TensorBoard
        hp_table = [
            "| {} | {} |".format(key, value)
            for key, value in self.config.__dict__.items()
        ]
        writer.add_text(
            "Hyperparameters",
            "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table),
        )

        # Save model representation
        writer.add_text(
            "Model summary",
            str(models.MuZeroNetwork(self.config)).replace("\n", " \n\n"),
        )

        self._has_logged_one = True
        self._writer = writer
        self._counter = 0
        self._last_game_played = 0
        return

    info = shared_storage_worker.get_info()
    writer = self._writer
    counter = info['training_step']

    if info['training_step'] % self.config.checkpoint_interval != 0:
        return

    games_played = replay_buffer_worker.get_self_play_count()
    if games_played % 3 == 0 and games_played != self._last_game_played:
        self._test_worker.joe_self_play(shared_storage_worker, None, True)
        # self._test_worker.joe_self_play(shared_storage_worker, replay_buffer_worker, True)
        self._last_game_played = games_played

    writer.add_scalar("1.Total reward/1.Total reward", info["total_reward"], counter)
    writer.add_scalar("1.Total reward/2.Mean value", info["mean_value"], counter)
    writer.add_scalar("1.Total reward/3.Episode length", info["episode_length"], counter)
    writer.add_scalar("1.Total reward/4.MuZero reward", info["muzero_reward"], counter)
    writer.add_scalar("1.Total reward/5.Opponent reward", info["opponent_reward"], counter)
    writer.add_scalar(
        "2.Workers/1.Self played games",
        # ray.get(replay_buffer_worker.get_self_play_count.remote()),
        replay_buffer_worker.get_self_play_count(),
        counter,
    )
    writer.add_scalar("2.Workers/2.Training steps", info["training_step"], counter)
    writer.add_scalar(
        "2.Workers/3.Self played games per training step ratio",
        # ray.get(replay_buffer_worker.get_self_play_count.remote())
        replay_buffer_worker.get_self_play_count() / max(1, info["training_step"]),
        counter,
    )
    writer.add_scalar("2.Workers/4.Learning rate", info["lr"], counter)
    writer.add_scalar("3.Loss/1.Total weighted loss", info["total_loss"], counter)
    writer.add_scalar("3.Loss/Value loss", info["value_loss"], counter)
    writer.add_scalar("3.Loss/Reward loss", info["reward_loss"], counter)
    writer.add_scalar("3.Loss/Policy loss", info["policy_loss"], counter)

    print(
        "Last test reward: {:.2f}. Training step: {}/{}. Played games: {}. Loss: {:.2f}".format(
            info["total_reward"],
            info["training_step"],
            self.config.training_steps,
            # ray.get(replay_buffer_worker.get_self_play_count.remote()),
            replay_buffer_worker.get_self_play_count(),
            info["total_loss"],
        ),
        end="\r",
    )