Example #1
    def __init__(self, initial_weights, shared_storage, replay_buffer, config):
        self.shared_storage = shared_storage
        self.replay_buffer = replay_buffer
        self.config = config
        # Initialize the network
        self.latest_network = models.MuZeroNetwork(self.config)
        self.latest_network.set_weights(initial_weights)
        self.latest_network.to(torch.device("cpu"))
        self.latest_network.eval()

        self.target_network = models.MuZeroNetwork(self.config)
        self.target_network.set_weights(initial_weights)
        self.target_network.to(torch.device("cpu"))
        self.target_network.eval()
Example #2
    def __init__(self, checkpoint, config):
        self.config = config

        # Initialize the network
        self.model = models.MuZeroNetwork(self.config)
        self.model.set_weights(checkpoint["weights"])
        self.model.eval()
Example #3
    def __init__(self, initial_weights, config):
        self.config = config
        self.training_step = 0

        # Fix random generator seed
        numpy.random.seed(self.config.seed)
        torch.manual_seed(self.config.seed)

        # Initialize the network
        self.model = models.MuZeroNetwork(self.config)
        self.model.set_weights(initial_weights)
        self.model.to(torch.device(config.training_device))
        self.model.train()

        if self.config.optimizer == "SGD":
            self.optimizer = torch.optim.SGD(
                self.model.parameters(),
                lr=self.config.lr_init,
                momentum=self.config.momentum,
                weight_decay=self.config.weight_decay,
            )
        elif self.config.optimizer == "Adam":
            self.optimizer = torch.optim.Adam(
                self.model.parameters(),
                lr=self.config.lr_init,
                weight_decay=self.config.weight_decay,
            )
        else:
            raise ValueError(
                "{} is not implemented. You can change the optimizer manually in trainer.py.".format(
                    self.config.optimizer
                )
            )
Example #4
    def __init__(self, game_name):
        self.game_name = game_name

        # Load the game and the config from the module with the game name
        try:
            game_module = importlib.import_module("games." + self.game_name)
            self.config: MuZeroConfigBase = game_module.MuZeroConfig()
            self.Game = game_module.Game
        except Exception as err:
            print(
                '{} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.'
                .format(self.game_name))
            raise err

        os.makedirs(os.path.join(self.config.results_path), exist_ok=True)

        # Fix random generator seed for reproducibility
        numpy.random.seed(self.config.seed)
        torch.manual_seed(self.config.seed)

        # Initial weights used to initialize components
        self.muzero_weights = models.MuZeroNetwork(
            self.config.observation_shape,
            len(self.config.action_space),
            self.config.encoding_size,
            self.config.hidden_size,
        ).get_weights()
        self.config.results_path = (Path(self.config.results_path) /
                                    (self.game_name + "_summary") /
                                    time_stamp_str())
Example #5
    def __init__(self, weights, config):
        self.config = config

        # Initialize the network
        self.model = models.MuZeroNetwork(self.config)
        self.model.set_weights(weights)
        self.model.to(torch.device("cpu"))
        self.model.eval()
Example #6
    def __init__(self, initial_checkpoint, Game, config, seed, opponent_initial_checkpoint = None):
        self.config = config
        self.game = Game(seed)

        # Fix random generator seed
        numpy.random.seed(seed)
        torch.manual_seed(seed)

        # Initialize the network
        self.model = models.MuZeroNetwork(self.config)
        self.model.set_weights(initial_checkpoint["weights"])
        self.model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
        self.model.eval()

        if opponent_initial_checkpoint is not None:
            # Initialize the opponent network
            self.opponent_model = models.MuZeroNetwork(self.config)
            self.opponent_model.set_weights(opponent_initial_checkpoint["weights"])
            self.opponent_model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
            self.opponent_model.eval()
Example #7
    def __init__(self, initial_checkpoint, Game, config, seed):
        self.config = config
        self.game = Game(seed)

        # Fix random generator seed
        numpy.random.seed(seed)
        torch.manual_seed(seed)

        # Initialize the network
        self.model = models.MuZeroNetwork(self.config)
        self.model.set_weights(initial_checkpoint["weights"])
        self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu"))
        self.model.eval()
Example #8
    def __init__(self, initial_weights, config):
        self.config = config
        self.num_reanalysed_games = 0

        # Fix random generator seed
        numpy.random.seed(self.config.seed)
        torch.manual_seed(self.config.seed)

        # Initialize the network
        self.model = models.MuZeroNetwork(self.config)
        self.model.set_weights(initial_weights)
        self.model.to(torch.device(self.config.reanalyse_device))
        self.model.eval()
Example #9
    def __init__(self, initial_weights, game, config):
        self.config = config
        self.game = game

        # Fix random generator seed
        numpy.random.seed(self.config.seed)
        torch.manual_seed(self.config.seed)

        # Initialize the network
        self.model = models.MuZeroNetwork(self.config)
        self.model.set_weights(initial_weights)
        self.model.to(torch.device("cpu"))
        self.model.eval()
Example #10
    def __init__(self, initial_weights, game, config):
        self.config = config
        self.game = game

        # Initialize the network
        self.model = models.MuZeroNetwork(
            self.config.observation_shape,
            len(self.config.action_space),
            self.config.encoding_size,
            self.config.hidden_size,
        )
        self.model.set_weights(initial_weights)
        self.model.to(torch.device("cpu"))
        self.model.eval()
Example #11
    def __init__(self, initial_checkpoint, config):
        self.config = config

        # Fix random generator seed
        numpy.random.seed(self.config.seed)
        torch.manual_seed(self.config.seed)

        # Initialize the network
        self.model = models.MuZeroNetwork(self.config)
        self.model.set_weights(initial_checkpoint["weights"])
        self.model.to(torch.device("cuda" if self.config.reanalyse_on_gpu else "cpu"))
        self.model.eval()

        self.num_reanalysed_games = initial_checkpoint["num_reanalysed_games"]
Example #12
    def __init__(self, initial_checkpoint, Game, config, seed):
        self.config = config
        #self.game = traffic_environment.TrafficEnv()
        self.game = Game

        # Fix random generator seed
        numpy.random.seed(seed)
        torch.manual_seed(seed)

        # Initialize the network, load the checkpoint weights, move the model to the self-play device, and set it to eval mode
        self.model = models.MuZeroNetwork(self.config)
        self.model.set_weights(initial_checkpoint["weights"])
        self.model.to(
            torch.device("cuda" if self.config.selfplay_on_gpu else "cpu"))
        self.model.eval()
Example #13
    def __init__(self, initial_checkpoint, Game, config, seed):
        self.config = config
        self.game = Game(seed)
        if seed < 0:
            seed = numpy.random.randint(10000)
        # Fix random generator seed
        numpy.random.seed(seed)
        torch.manual_seed(seed)

        # Initialize the network
        self.model = models.MuZeroNetwork(self.config)
        self.model.set_weights(initial_checkpoint["weights"])
        self.model.to(
            torch.device("cuda" if torch.cuda.is_available() else "cpu"))
        self.model.eval()
Example #14
    def __init__(self, config):
        self.config = config
        self.buffer = {}
        self.game_priorities = collections.deque(
            maxlen=self.config.window_size)
        self.max_recorded_game_priority = 1.0
        self.self_play_count = 0
        self.total_samples = 0

        # Used only for the Reanalyze options
        self.model = (models.MuZeroNetwork(self.config)
                      if self.config.use_last_model_value else None)

        # Fix random generator seed
        numpy.random.seed(self.config.seed)
        torch.manual_seed(self.config.seed)
Example #15
    def __init__(self, initial_weights, config):
        self.config = config
        self.training_step = 0

        # Initialize the network
        self.model = models.MuZeroNetwork(self.config)
        self.model.set_weights(initial_weights)
        self.model.to(torch.device(config.training_device))
        self.model.train()

        self.optimizer = torch.optim.Adam(
            self.model.parameters(),
            lr=self.config.lr_init,
            # momentum=self.config.momentum,
            weight_decay=self.config.weight_decay,
        )
Example #16
    def __init__(self, initial_checkpoint, config):
        self.config = config
        self.has_LR_message_been_shown = False

        # Fix random generator seed
        numpy.random.seed(self.config.seed)
        torch.manual_seed(self.config.seed)

        # Initialize the network
        self.model = models.MuZeroNetwork(self.config)
        self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"]))
        self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu"))
        self.model.train()

        self.training_step = initial_checkpoint["training_step"]

        if "cuda" not in str(next(self.model.parameters()).device):
            print("You are not training on GPU.\n")

        # Initialize the optimizer
        if self.config.optimizer == "SGD":
            self.optimizer = torch.optim.SGD(
                self.model.parameters(),
                lr=self.config.lr_init,
                momentum=self.config.momentum,
                weight_decay=self.config.weight_decay,
            )
        elif self.config.optimizer == "Adam":
            self.optimizer = torch.optim.Adam(
                self.model.parameters(),
                lr=self.config.lr_init,
                weight_decay=self.config.weight_decay,
            )
        else:
            raise NotImplementedError(
                f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py."
            )
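        # Illustrative note, not from the original source: the error message above says
        # the optimizer can be changed manually in trainer.py. A sketch of one extra
        # branch that could be added before the final else (the "AdamW" config value and
        # the use of torch.optim.AdamW here are assumptions, not the project's code):
        #
        # elif self.config.optimizer == "AdamW":
        #     self.optimizer = torch.optim.AdamW(
        #         self.model.parameters(),
        #         lr=self.config.lr_init,
        #         weight_decay=self.config.weight_decay,
        #     )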

        if initial_checkpoint["optimizer_state"] is not None:
            print("Loading optimizer...\n")
            self.optimizer.load_state_dict(
                copy.deepcopy(initial_checkpoint["optimizer_state"])
            )
Example #17
    def __init__(self, initial_weights, game, config, test=False, idx=-1, render=False):
        self.config: MuZeroConfigBase = config
        self.game = game
        self.idx = idx
        self.episode = 0
        self.render = render
        self.writer = SummaryWriter(self.config.results_path / f"self_play_{idx}")

        # Initialize the network
        self.model = models.MuZeroNetwork(
            self.config.observation_shape,
            len(self.config.action_space),
            self.config.encoding_size,
            self.config.hidden_size,
        )
        self.model.set_weights(initial_weights)
        self.model.to(torch.device("cpu"))
        self.model.eval()

        self.continuous_self_play(test)
Example #18
    def __init__(self, game_name):
        self.game_name = game_name

        # Load the game and the config from the module with the game name
        try:
            game_module = importlib.import_module("games." + self.game_name)
            self.config = game_module.MuZeroConfig()
            self.Game = game_module.Game
        except Exception as err:
            print(
                '{} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.'
                .format(self.game_name))
            raise err

        # Fix random generator seed
        numpy.random.seed(self.config.seed)
        torch.manual_seed(self.config.seed)

        # Weights used to initialize components
        self.muzero_weights = models.MuZeroNetwork(self.config).get_weights()
Example #19
    def __init__(self, initial_weights, config):
        self.config = config
        self.training_step = 0

        # Initialize the network
        self.model = models.MuZeroNetwork(
            self.config.observation_shape,
            len(self.config.action_space),
            self.config.encoding_size,
            self.config.hidden_size,
        )
        self.model.set_weights(initial_weights)
        self.model.to(torch.device(config.training_device))
        self.model.train()

        self.optimizer = torch.optim.SGD(
            self.model.parameters(),
            lr=self.config.lr_init,
            momentum=self.config.momentum,
            weight_decay=self.config.weight_decay,
        )
Example #20
    def __init__(self, initial_checkpoint, config):
        self.config = config

        # Fix random generator seed
        numpy.random.seed(self.config.seed)
        torch.manual_seed(self.config.seed)

        # Import the game class to enable MCTS updates
        game_module = importlib.import_module("games." +
                                              self.config.game_filename)
        self.game = game_module.Game()

        # Initialize the network
        self.model = models.MuZeroNetwork(self.config)
        self.model.set_weights(initial_checkpoint["weights"])
        self.model.to(
            torch.device("cuda" if self.config.reanalyse_on_gpu else "cpu"))
        self.model.eval()

        # Create the target network (for stable bootstrapping)
        if self.config.use_last_model_value:
            self.target_model = copy.deepcopy(self.model)
            self.last_update_step = initial_checkpoint["training_step"]
Example #21
    def __init__(self, initial_weights, config):
        self.config: MuZeroConfigBase = config
        self.training_step = 0
        self.writer = SummaryWriter(self.config.results_path / "trainer")

        # Initialize the network
        self.model = models.MuZeroNetwork(
            self.config.observation_shape,
            len(self.config.action_space),
            self.config.encoding_size,
            self.config.hidden_size,
        )
        self.model.set_weights(initial_weights)
        self.model.to(torch.device(config.training_device))
        self.model.train()

        self.optimizer = torch.optim.SGD(
            self.model.parameters(),
            lr=self.config.lr_init,
            momentum=self.config.momentum,
            weight_decay=self.config.weight_decay,
        )

        def async_put_weights():
            last_idx = None
            while True:
                if self.config.q_weights.empty():
                    if self.training_step != last_idx:
                        weights = self.model.get_weights()
                        last_idx = self.training_step
                    self.config.q_weights.put(weights)
                else:
                    time.sleep(0.1)

        Thread(target=async_put_weights).start()
        self.continuous_update_weights()
Example #22
def get_initial_weights(config):
    model = models.MuZeroNetwork(config)
    weights = model.get_weights()
    summary = str(model).replace("\n", " \n\n")
    return weights, summary
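A minimal usage sketch, not taken from the original source: the pair returned by get_initial_weights can seed the workers shown in the other examples. The trainer.Trainer and self_play.SelfPlay names below are assumptions based on the constructors in Examples #3 and #9 and the self_play.SelfPlay call in Example #23.

# Hypothetical wiring; config and game are assumed to exist already
weights, summary = get_initial_weights(config)
training_worker = trainer.Trainer(weights, config)            # matches Example #3's __init__
self_play_worker = self_play.SelfPlay(weights, game, config)  # matches Example #9's __init__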
Example #23
    def _logging_loop(self, shared_storage_worker, replay_buffer_worker):
        """
        Keep track of the training performance
        """
        # Launch the test worker to get performance metrics
        test_worker = self_play.SelfPlay(
            copy.deepcopy(self.muzero_weights),
            self.Game(self.config.seed + self.config.num_actors),
            self.config,
        )
        test_worker.continuous_self_play(shared_storage_worker, None, True)

        # Write everything in TensorBoard
        writer = SummaryWriter(self.config.results_path)

        print(
            "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n"
        )

        # Save hyperparameters to TensorBoard
        hp_table = [
            "| {} | {} |".format(key, value)
            for key, value in self.config.__dict__.items()
        ]
        writer.add_text(
            "Hyperparameters",
            "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table),
        )
        # Save model representation
        writer.add_text(
            "Model summary",
            str(models.MuZeroNetwork(self.config)).replace("\n", " \n\n"),
        )
        # Loop for updating the training performance
        counter = 0
        info = shared_storage_worker.get_info()
        try:
            while info["training_step"] < self.config.training_steps:
                info = shared_storage_worker.get_info()
                writer.add_scalar(
                    "1.Total reward/1.Total reward",
                    info["total_reward"],
                    counter,
                )
                writer.add_scalar(
                    "1.Total reward/2.Mean value",
                    info["mean_value"],
                    counter,
                )
                writer.add_scalar(
                    "1.Total reward/3.Episode length",
                    info["episode_length"],
                    counter,
                )
                writer.add_scalar(
                    "1.Total reward/4.MuZero reward",
                    info["muzero_reward"],
                    counter,
                )
                writer.add_scalar(
                    "1.Total reward/5.Opponent reward",
                    info["opponent_reward"],
                    counter,
                )
                writer.add_scalar(
                    "2.Workers/1.Self played games",
                    replay_buffer_worker.get_self_play_count(),
                    counter,
                )
                writer.add_scalar("2.Workers/2.Training steps",
                                  info["training_step"], counter)
                writer.add_scalar(
                    "2.Workers/3.Self played games per training step ratio",
                    replay_buffer_worker.get_self_play_count() /
                    max(1, info["training_step"]),
                    counter,
                )
                writer.add_scalar("2.Workers/4.Learning rate", info["lr"],
                                  counter)
                writer.add_scalar("3.Loss/1.Total weighted loss",
                                  info["total_loss"], counter)
                writer.add_scalar("3.Loss/Value loss", info["value_loss"],
                                  counter)
                writer.add_scalar("3.Loss/Reward loss", info["reward_loss"],
                                  counter)
                writer.add_scalar("3.Loss/Policy loss", info["policy_loss"],
                                  counter)
                print(
                    "Last test reward: {:.2f}. Training step: {}/{}. Played games: {}. Loss: {:.2f}"
                    .format(
                        info["total_reward"],
                        info["training_step"],
                        self.config.training_steps,
                        replay_buffer_worker.get_self_play_count(),
                        info["total_loss"],
                    ),
                    end="\r",
                )
                counter += 1
                time.sleep(0.5)
        except KeyboardInterrupt as err:
            # Comment the line below to be able to stop the training but keep running
            # raise err
            pass
Example #24
def play_against_algorithm(weight_file_path,
                           config_name,
                           seed,
                           algo="expert",
                           render=False):
    np.random.seed(seed)
    torch.manual_seed(seed)

    game_module = importlib.import_module("games." + config_name)
    config = game_module.MuZeroConfig()
    model = models.MuZeroNetwork(config)
    model.set_weights(torch.load(weight_file_path))
    model.eval()

    algo = globals()[algo.capitalize()](-1, 1)

    game = Game(seed)
    observation = game.reset()

    game_history = GameHistory()
    game_history.action_history.append(0)
    game_history.reward_history.append(0)
    game_history.to_play_history.append(game.to_play())
    game_history.legal_actions.append(game.legal_actions())
    game_history.observation_history.append(observation)

    done = False
    depth = 9
    reward = 0

    while not done:
        if game.to_play_real() == -1:
            action = algo(game.get_state(), depth, game.to_play_real())
        else:
            stacked_observations = game_history.get_stacked_observations(
                -1,
                config.stacked_observations,
            )

            root, priority, tree_depth = MCTS(config).run(
                model,
                stacked_observations,
                game.legal_actions(),
                game.to_play(),
                False,
            )

            action = SelfPlay.select_action(
                root,
                0,
            )

            game_history.store_search_statistics(root, config.action_space)
            game_history.priorities.append(priority)
        observation, reward, done = game.step(action)
        if render:
            game.render()
        depth -= 1

        game_history.action_history.append(action)
        game_history.observation_history.append(observation)
        game_history.reward_history.append(reward)
        game_history.to_play_history.append(game.to_play())
        game_history.legal_actions.append(game.legal_actions())

    return reward, TictactoeComp.wins(game.get_state(), 1)
Example #25
    def __init__(self, game_name, config=None, split_resources_in=1):
        # Load the game and the config from the module with the game name
        '''
        try:
            game_module = importlib.import_module("games." + game_name)
            self.Game = game_module.Game
            self.config = game_module.MuZeroConfig()
        except ModuleNotFoundError as err:
            print(
                f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.'
            )
            raise err

        # Overwrite the config
        if config:
            if type(config) is dict:
                for param, value in config.items():
                    setattr(self.config, param, value)
            else:
                self.config = config
        '''
        # Network 1
        speed = 2  # need to test speeds 0 1 2
        self.Game = traffic_environment.TrafficEnv(car_speed=speed,
                                                   max_wait=1400,
                                                   horiz_lanes=('e', ),
                                                   vert_lanes=('n', ),
                                                   horiz_sizes=(7, 7),
                                                   vert_sizes=(3, 3))
        self.Game.make_spawn_blocks(
            self.Game.start_indices,
            [0.9, 0.1])  # [0.5 for _ in range(len(self.Game.start_indices))])
        #env_numLanes = (len(self.Game.horiz_lanes) + len(self.Game.vert_lanes))
        # Network 2
        # self.Game = traffic_environment.TrafficEnv(car_speed=speed, max_wait=800, horiz_lanes=('e','w'), vert_lanes=('s','sn'), horiz_sizes=(3,3,2), vert_sizes=(3,2,2))
        # self.Game.make_spawn_blocks(self.Game.start_indices, [0.9, 0.1, 0.9, 0.9, 0.1])#, [0.5 for _ in range(len(self.Game.start_indices))])
        # env_numLanes = (len(self.Game.horiz_lanes) + len(self.Game.vert_lanes))

        self.config = muzero_config.MuZeroConfig()
        #self.config.observation_shape = (1, 1, len(self.Game.observation()))
        self.config.observation_shape = (
            1, 1, len(self.Game.observation_space.sample()))
        self.config.action_space = list(
            range(0, 2**len(self.Game.action_space.sample())))
        #self.config.action_space = list(range(2**self.Game.action_space.shape[0]))

        # Fix random generator seed
        numpy.random.seed(self.config.seed)
        torch.manual_seed(self.config.seed)

        # Manage GPUs
        # TODO could trim this out
        total_gpus = (self.config.max_num_gpus if self.config.max_num_gpus
                      is not None else torch.cuda.device_count())
        self.num_gpus = total_gpus / split_resources_in
        if 1 < self.num_gpus:
            self.num_gpus = math.floor(self.num_gpus)

        # Checkpoint and replay buffer used to initialize workers
        self.checkpoint = {
            "weights": None,
            "optimizer_state": None,
            "total_reward": 0,
            "muzero_reward": 0,
            "opponent_reward": 0,
            "episode_length": 0,
            "mean_value": 0,
            "training_step": 0,
            "lr": 0,
            "total_loss": 0,
            "value_loss": 0,
            "reward_loss": 0,
            "policy_loss": 0,
            "num_played_games": 0,
            "num_played_steps": 0,
            "num_reanalysed_games": 0,
            "terminate": False
        }
        self.replay_buffer = {}

        model = models.MuZeroNetwork(self.config)
        weights = model.get_weights()
        self.summary = str(model).replace("\n", " \n\n")
        self.checkpoint["weights"] = copy.deepcopy(weights)

        # Workers
        self.self_play_workers = []
        self.test_worker = None
        self.training_worker = None
        self.reanalyse_worker = None
        self.replay_buffer_worker = None
        self.shared_storage_worker = None
Example #26
def play_against_other(weights1,
                       config1,
                       weights2,
                       config2,
                       seed,
                       render=False):
    np.random.seed(seed)
    torch.manual_seed(seed)
    game_module = importlib.import_module("games." + config1)
    config1 = game_module.MuZeroConfig()
    model1 = models.MuZeroNetwork(config1)
    model1.set_weights(torch.load(weights1))
    model1.eval()

    game_module = importlib.import_module("games." + config2)
    config2 = game_module.MuZeroConfig()
    model2 = models.MuZeroNetwork(config2)
    model2.set_weights(torch.load(weights2))
    model2.eval()

    game = Game(seed)
    observation = game.reset()

    game_history1 = GameHistory()
    game_history1.action_history.append(0)
    game_history1.reward_history.append(0)
    game_history1.to_play_history.append(game.to_play())
    game_history1.legal_actions.append(game.legal_actions())
    observation1 = copy.deepcopy(observation)
    # observation1[0] = -observation1[1]
    # observation1[1] = -observation1[0]
    # observation1[2] = -observation1[2]
    game_history1.observation_history.append(observation1)

    game_history2 = GameHistory()
    game_history2.action_history.append(0)
    game_history2.reward_history.append(0)
    game_history2.to_play_history.append(not game.to_play())
    game_history2.legal_actions.append(game.legal_actions())
    observation2 = copy.deepcopy(observation)
    observation2[0] = -observation2[1]
    observation2[1] = -observation2[0]
    observation2[2] = -observation2[2]
    game_history2.observation_history.append(observation2)

    done = False
    reward = 0

    while not done:
        if game.to_play_real() == 1:
            config = config1
            model = model1
            game_history = game_history1
        else:
            config = config2
            model = model2
            game_history = game_history2

        stacked_observations = game_history.get_stacked_observations(
            -1,
            config.stacked_observations,
        )

        root, priority, tree_depth = MCTS(config).run(
            model,
            stacked_observations,
            game.legal_actions(),
            game.to_play(),
            False,
        )

        action = SelfPlay.select_action(
            root,
            0,
        )

        game_history1.store_search_statistics(root, config.action_space)
        game_history1.priorities.append(priority)
        game_history2.store_search_statistics(root, config.action_space)
        game_history2.priorities.append(priority)
        observation, reward, done = game.step(action)
        if render:
            game.render()

        game_history1.action_history.append(action)
        observation1 = copy.deepcopy(observation)
        # observation1[0] = -observation1[1]
        # observation1[1] = -observation1[0]
        # observation1[2] = -observation1[2]
        game_history1.observation_history.append(observation1)
        game_history1.reward_history.append(reward)
        game_history1.to_play_history.append(game.to_play())
        game_history1.legal_actions.append(game.legal_actions())

        game_history2.action_history.append(action)
        observation2 = copy.deepcopy(observation)
        observation2[0] = -observation2[1]
        observation2[1] = -observation2[0]
        observation2[2] = -observation2[2]
        game_history2.observation_history.append(observation2)
        game_history2.reward_history.append(reward)
        game_history2.to_play_history.append(not game.to_play())
        game_history2.legal_actions.append(game.legal_actions())

    return reward, TictactoeComp.wins(game.get_state(), 1)
Example #27
    def _joe_logging(self, shared_storage_worker, replay_buffer_worker):
        """
        Keep track of the training performance
        """

        if not hasattr(self, '_has_logged_one'):
            # Launch the test worker to get performance metrics
            self._test_worker = self_play.SelfPlay(
                copy.deepcopy(self.muzero_weights),
                self.Game(self.config.seed + self.config.num_actors),
                self.config,
            )

            # Write everything in TensorBoard
            writer = SummaryWriter(self.config.results_path)

            print(
                "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n"
            )

            # Save hyperparameters to TensorBoard
            hp_table = [
                "| {} | {} |".format(key, value)
                for key, value in self.config.__dict__.items()
            ]
            writer.add_text(
                "Hyperparameters",
                "| Parameter | Value |\n|-------|-------|\n" +
                "\n".join(hp_table),
            )
            # Save model representation
            writer.add_text(
                "Model summary",
                str(models.MuZeroNetwork(self.config)).replace("\n", " \n\n"),
            )
            self._has_logged_one = True
            self._writer = writer
            self._counter = 0
            self._last_game_played = 0
            return

        info = shared_storage_worker.get_info()
        writer = self._writer
        counter = info['training_step']
        if info['training_step'] % self.config.checkpoint_interval != 0:
            return

        games_played = replay_buffer_worker.get_self_play_count()
        if games_played % 3 == 0 and games_played != self._last_game_played:
            self._test_worker.joe_self_play(shared_storage_worker, None, True)
            # self._test_worker.joe_self_play(shared_storage_worker, replay_buffer_worker, True)
            self._last_game_played = games_played

        writer.add_scalar(
            "1.Total reward/1.Total reward",
            info["total_reward"],
            counter,
        )
        writer.add_scalar(
            "1.Total reward/2.Mean value",
            info["mean_value"],
            counter,
        )
        writer.add_scalar(
            "1.Total reward/3.Episode length",
            info["episode_length"],
            counter,
        )
        writer.add_scalar(
            "1.Total reward/4.MuZero reward",
            info["muzero_reward"],
            counter,
        )
        writer.add_scalar(
            "1.Total reward/5.Opponent reward",
            info["opponent_reward"],
            counter,
        )
        writer.add_scalar(
            "2.Workers/1.Self played games",
            # ray.get(replay_buffer_worker.get_self_play_count.remote()),
            replay_buffer_worker.get_self_play_count(),
            counter,
        )
        writer.add_scalar("2.Workers/2.Training steps", info["training_step"],
                          counter)
        writer.add_scalar(
            "2.Workers/3.Self played games per training step ratio",
            # ray.get(replay_buffer_worker.get_self_play_count.remote())
            replay_buffer_worker.get_self_play_count() /
            max(1, info["training_step"]),
            counter,
        )
        writer.add_scalar("2.Workers/4.Learning rate", info["lr"], counter)
        writer.add_scalar("3.Loss/1.Total weighted loss", info["total_loss"],
                          counter)
        writer.add_scalar("3.Loss/Value loss", info["value_loss"], counter)
        writer.add_scalar("3.Loss/Reward loss", info["reward_loss"], counter)
        writer.add_scalar("3.Loss/Policy loss", info["policy_loss"], counter)
        print(
            "Last test reward: {:.2f}. Training step: {}/{}. Played games: {}. Loss: {:.2f}"
            .format(
                info["total_reward"],
                info["training_step"],
                self.config.training_steps,
                # ray.get(replay_buffer_worker.get_self_play_count.remote()),
                replay_buffer_worker.get_self_play_count(),
                info["total_loss"],
            ),
            end="\r",
        )