Example #1
def run(l_):
    # Load the current best network, run one self-play game in this process,
    # and append the generated training data to the shared list l_.
    best_net = Net(trained_model=best_net_filename)
    self_play_component = SelfPlay(best_net, num_games=1, num_simulations=num_simulations,
                                   num_exploration_steps=num_exploration_steps)
    _new_data = self_play_component.start()
    l_ += _new_data
    print('[Self Play] Collected data in a process, length:', len(_new_data))
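
The worker above mutates a shared list and reads module-level settings (best_net_filename, num_simulations, num_exploration_steps). Below is a minimal sketch of how such a worker could be fanned out over several processes, assuming those settings exist at module level and that SelfPlay.start() returns a picklable list of samples; the driver function and the parameter values are illustrative, not part of the original project.

# Hypothetical driver for the run() worker above; names and values are placeholders.
import multiprocessing as mp

best_net_filename = "best_net.pt"   # assumed module-level settings read by run()
num_simulations = 50
num_exploration_steps = 30

def collect_self_play_data(num_workers=4):
    with mp.Manager() as manager:
        shared_data = manager.list()  # proxy list; run() extends it via `l_ += _new_data`
        workers = [mp.Process(target=run, args=(shared_data,)) for _ in range(num_workers)]
        for w in workers:
            w.start()
        for w in workers:
            w.join()
        return list(shared_data)      # copy out before the manager shuts down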
Example #2
    def __init__(self, max_step):
        # initialize neural network weights
        self.model = NeuralNetwork('resnet')

        # number of training iterations
        self.iteration_num = 1000

        # initialize termination step T
        self.T = max_step
        self.enhance_op = SelfPlay(max_step=max_step)
    def compare_virtual_with_real_trajectories(self,
                                               first_obs,
                                               game,
                                               horizon,
                                               plot=True):
        """
        First, MuZero plays a game but uses its model instead of using the environment.
        Then, MuZero plays the optimal trajectory according precedent trajectory but performs it in the
        real environment until arriving at an action impossible in the real environment.
        It does an MCTS too, but doesn't take it into account.
        All information during the two trajectories are recorded and displayed.
        """
        virtual_trajectory_info = self.get_virtual_trajectory_from_obs(
            first_obs, horizon, False)
        real_trajectory_info = Trajectoryinfo("Real trajectory", self.config)
        trajectory_divergence_index = None
        real_trajectory_end_reason = "Reached horizon"

        # Illegal moves are masked at the root
        root, mcts_info = MCTS(self.config).run(
            self.model,
            first_obs,
            game.legal_actions(),
            game.to_play(),
            True,
        )
        self.plot_mcts(root, plot)
        real_trajectory_info.store_info(root, mcts_info, None, numpy.NaN)
        for i, action in enumerate(virtual_trajectory_info.action_history):
            # Follow virtual trajectory until it reaches an illegal move in the real env
            if action not in game.legal_actions():
                if trajectory_divergence_index is None:
                    trajectory_divergence_index = i
                    real_trajectory_end_reason = f"Virtual trajectory reached an illegal move at timestep {trajectory_divergence_index}."
                break  # Comment out this break to keep playing after the trajectory diverges
                # If continuing past the divergence, fall back to the action chosen by MCTS:
                # action = SelfPlay.select_action(root, 0)

            observation, reward, done = game.step(action)
            root, mcts_info = MCTS(self.config).run(
                self.model,
                observation,
                game.legal_actions(),
                game.to_play(),
                True,
            )
            real_trajectory_info.store_info(root, mcts_info, action, reward)
            if done:
                real_trajectory_end_reason = "Real trajectory reached Done"
                break

        if plot:
            virtual_trajectory_info.plot_trajectory()
            real_trajectory_info.plot_trajectory()
            print(real_trajectory_end_reason)

        return (
            virtual_trajectory_info,
            real_trajectory_info,
            trajectory_divergence_index,
        )
    def get_virtual_trajectory_from_obs(self,
                                        observation,
                                        horizon,
                                        plot=True,
                                        to_play=0):
        """
        MuZero plays a game but uses its model instead of using the environment.
        We still do an MCTS at each step.
        """
        trajectory_info = Trajectoryinfo("Virtual trajectory", self.config)
        root, mcts_info = MCTS(self.config).run(self.model, observation,
                                                self.config.action_space,
                                                to_play, True)
        trajectory_info.store_info(root, mcts_info, None, numpy.NaN)

        virtual_to_play = to_play
        for i in range(horizon):
            action = SelfPlay.select_action(root, 0)

            # Players play turn by turn
            if virtual_to_play + 1 < len(self.config.players):
                virtual_to_play = self.config.players[virtual_to_play + 1]
            else:
                virtual_to_play = self.config.players[0]

            # Generate new root
            # TODO: Test keeping the old root
            value, reward, policy_logits, hidden_state = self.model.recurrent_inference(
                root.hidden_state,
                torch.tensor([[action]]).to(root.hidden_state.device),
            )
            value = models.support_to_scalar(value,
                                             self.config.support_size).item()
            reward = models.support_to_scalar(reward,
                                              self.config.support_size).item()
            root = Node(0)
            root.expand(
                self.config.action_space,
                virtual_to_play,
                reward,
                policy_logits,
                hidden_state,
            )

            root, mcts_info = MCTS(self.config).run(self.model, None,
                                                    self.config.action_space,
                                                    virtual_to_play, True,
                                                    root)
            trajectory_info.store_info(root,
                                       mcts_info,
                                       action,
                                       reward,
                                       new_prior_root_value=value)

        if plot:
            self.plot_trajectory(trajectory_info)

        return trajectory_info
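
A hypothetical way the two diagnostic methods above could be driven; DiagnoseModel, Game, checkpoint and config are assumptions inferred from the method signatures, not names confirmed by the snippet.

# Hypothetical usage sketch; DiagnoseModel, Game, checkpoint and config are assumed.
game = Game(seed=0)
first_obs = game.reset()

diagnostics = DiagnoseModel(checkpoint, config)
virtual_info, real_info, divergence_index = diagnostics.compare_virtual_with_real_trajectories(
    first_obs, game, horizon=10, plot=True
)
if divergence_index is not None:
    print(f"Virtual and real trajectories diverged at step {divergence_index}")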
Example #5
class PokerZero(object):
    """
    Class implementing PokerZero, a self-play training agent for poker.
    """
    def __init__(self, max_step):
        # initialize neural network weights
        self.model = NeuralNetwork('resnet')

        # number of training iterations
        self.iteration_num = 1000

        # initialize termination step T
        self.T = max_step
        self.enhance_op = SelfPlay(max_step=max_step)

    def train(self):
        for i in range(self.iteration_num):
            # initialize state s0
            s0 = None  # TODO: RoomAI should generate the initial state
            # conduct self-play
            self.enhance_op.run_selfplay(cur_state=s0)
            # get the final game score r_T, label the data z_t = +-r_T,
            # and store the tuples (s_t, pi_t, z_t)
            samples = self.enhance_op.save_data()
            # sample from all time steps of the last self-play iteration
            gen_samples = self.enhance_op.gen_samples()
            # train the neural network on these samples
            self.model.build_model(gen_samples)
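
The comments in train() describe the standard self-play labelling step: after a game finishes with score r_T, every stored position is labelled z_t = +-r_T depending on whose turn it was. A minimal sketch of that step, assuming two players who strictly alternate and a final score given from the first player's perspective; the function name and the alternation assumption are illustrative, not part of the PokerZero snippet.

# Minimal labelling sketch under the assumptions stated above.
def label_self_play_game(states, policies, final_score):
    """Turn one finished self-play game into (s_t, pi_t, z_t) training tuples."""
    samples = []
    for t, (s_t, pi_t) in enumerate(zip(states, policies)):
        z_t = final_score if t % 2 == 0 else -final_score  # +r_T or -r_T per player to move
        samples.append((s_t, pi_t, z_t))
    return samples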
Example #6
def play_against_algorithm(weight_file_path,
                           config_name,
                           seed,
                           algo="expert",
                           render=False):
    np.random.seed(seed)
    torch.manual_seed(seed)

    game_module = importlib.import_module("games." + config_name)
    config = game_module.MuZeroConfig()
    model = models.MuZeroNetwork(config)
    model.set_weights(torch.load(weight_file_path))
    model.eval()

    # Look up the opponent class by name (e.g. "expert" -> Expert) and instantiate it
    algo = globals()[algo.capitalize()](-1, 1)

    game = Game(seed)
    observation = game.reset()

    game_history = GameHistory()
    game_history.action_history.append(0)
    game_history.reward_history.append(0)
    game_history.to_play_history.append(game.to_play())
    game_history.legal_actions.append(game.legal_actions())
    game_history.observation_history.append(observation)

    done = False
    depth = 9
    reward = 0

    while not done:
        if game.to_play_real() == -1:
            # the comparison algorithm picks its move
            action = algo(game.get_state(), depth, game.to_play_real())
        else:
            # MuZero's turn: run an MCTS from the stacked observations
            stacked_observations = game_history.get_stacked_observations(
                -1,
                config.stacked_observations,
            )

            root, priority, tree_depth = MCTS(config).run(
                model,
                stacked_observations,
                game.legal_actions(),
                game.to_play(),
                False,
            )

            action = SelfPlay.select_action(
                root,
                0,
            )

            game_history.store_search_statistics(root, config.action_space)
            game_history.priorities.append(priority)
        observation, reward, done = game.step(action)
        if render:
            game.render()
        depth -= 1

        game_history.action_history.append(action)
        game_history.observation_history.append(observation)
        game_history.reward_history.append(reward)
        game_history.to_play_history.append(game.to_play())
        game_history.legal_actions.append(game.legal_actions())

    return reward, TictactoeComp.wins(game.get_state(), 1)
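
A hypothetical way to run play_against_algorithm over several seeds to estimate performance; the checkpoint path and config name are placeholders, not files from the original project.

# Hypothetical evaluation loop; "model.checkpoint" and "tictactoe" are placeholders.
rewards = []
for seed in range(20):
    reward, player_one_won = play_against_algorithm(
        "model.checkpoint", "tictactoe", seed, algo="expert", render=False)
    rewards.append(reward)
print("mean reward vs expert:", sum(rewards) / len(rewards))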
Example #7
def play_against_other(weights1,
                       config1,
                       weights2,
                       config2,
                       seed,
                       render=False):
    np.random.seed(seed)
    torch.manual_seed(seed)
    game_module = importlib.import_module("games." + config1)
    config1 = game_module.MuZeroConfig()
    model1 = models.MuZeroNetwork(config1)
    model1.set_weights(torch.load(weights1))
    model1.eval()

    game_module = importlib.import_module("games." + config2)
    config2 = game_module.MuZeroConfig()
    model2 = models.MuZeroNetwork(config2)
    model2.set_weights(torch.load(weights2))
    model2.eval()

    game = Game(seed)
    observation = game.reset()

    game_history1 = GameHistory()
    game_history1.action_history.append(0)
    game_history1.reward_history.append(0)
    game_history1.to_play_history.append(game.to_play())
    game_history1.legal_actions.append(game.legal_actions())
    observation1 = copy.deepcopy(observation)
    # observation1[0] = -observation1[1]
    # observation1[1] = -observation1[0]
    # observation1[2] = -observation1[2]
    game_history1.observation_history.append(observation1)

    game_history2 = GameHistory()
    game_history2.action_history.append(0)
    game_history2.reward_history.append(0)
    game_history2.to_play_history.append(not game.to_play())
    game_history2.legal_actions.append(game.legal_actions())
    observation2 = copy.deepcopy(observation)
    # Mirror the observation for the second player's perspective: swap (and negate)
    # planes 0 and 1 simultaneously so plane 0 is not overwritten before it is read
    observation2[0], observation2[1] = -observation2[1], -observation2[0]
    observation2[2] = -observation2[2]
    game_history2.observation_history.append(observation2)

    done = False
    reward = 0

    while not done:
        # player +1 is controlled by model1, player -1 by model2
        if game.to_play_real() == 1:
            config = config1
            model = model1
            game_history = game_history1
        else:
            config = config2
            model = model2
            game_history = game_history2

        stacked_observations = game_history.get_stacked_observations(
            -1,
            config.stacked_observations,
        )

        root, priority, tree_depth = MCTS(config).run(
            model,
            stacked_observations,
            game.legal_actions(),
            game.to_play(),
            False,
        )

        action = SelfPlay.select_action(
            root,
            0,
        )

        game_history1.store_search_statistics(root, config.action_space)
        game_history1.priorities.append(priority)
        game_history2.store_search_statistics(root, config.action_space)
        game_history2.priorities.append(priority)
        observation, reward, done = game.step(action)
        if render:
            game.render()

        game_history1.action_history.append(action)
        observation1 = copy.deepcopy(observation)
        # observation1[0] = -observation1[1]
        # observation1[1] = -observation1[0]
        # observation1[2] = -observation1[2]
        game_history1.observation_history.append(observation1)
        game_history1.reward_history.append(reward)
        game_history1.to_play_history.append(game.to_play())
        game_history1.legal_actions.append(game.legal_actions())

        game_history2.action_history.append(action)
        observation2 = copy.deepcopy(observation)
        # Same perspective flip as above, using simultaneous assignment
        observation2[0], observation2[1] = -observation2[1], -observation2[0]
        observation2[2] = -observation2[2]
        game_history2.observation_history.append(observation2)
        game_history2.reward_history.append(reward)
        game_history2.to_play_history.append(not game.to_play())
        game_history2.legal_actions.append(game.legal_actions())

    return reward, TictactoeComp.wins(game.get_state(), 1)
Example #8
def train(train_specification, checkpoint=None):
    assert isinstance(train_specification, TrainingSpecification)
    logger.info(f"{train_specification}")

    previous_network = AlphaNetwork(train_specification.residual_depth,
                                    train_specification.single_channel_size,
                                    train_specification.num_input_channels,
                                    train_specification.total_possible_actions)
    previous_network = previous_network.double()

    all_examples = []
    if checkpoint:
        all_examples = previous_network.load_checkpoint(checkpoint)

    num_games_history = train_specification.num_games_per_episode * train_specification.num_history_episodes
    num_examples_history = num_games_history * train_specification.training_examples_per_game

    logger.info("Starting")
    for episode in range(train_specification.num_episodes):
        episode_start_time = time.time()

        logger.info(f"episode: {episode}/{train_specification.num_episodes}")

        current_network = copy.deepcopy(previous_network)

        self_play_start_time = time.time()
        for game in range(train_specification.num_games_per_episode):
            game_start_time = time.time()
            logger.debug(f"Episode {episode} - Self-Playing game number {game}/{train_specification.num_games_per_episode}")
            self_play_engine = SelfPlay(train_specification.prediction_network(current_network),
                                        train_specification.game_engine(),
                                        train_specification.num_simulations,
                                        train_specification.training_augmentor(),
                                        train_specification.temperature)

            game_score, training_examples = self_play_engine.play()
            logger.info(f"Self-play game took ({time.time() - game_start_time}) seconds")

            all_examples.extend(training_examples)
        logger.info(f"Self-play took ({time.time() - self_play_start_time}) seconds")

        # keep only the most recent examples (bounded replay window)
        all_examples = all_examples[-num_examples_history:]
        logger.info(f"current size of all_examples is {len(all_examples)}")
        if CUDA:
            logger.info("will use cuda during training")
            current_network = current_network.cuda()

        train_start_time = time.time()
        losses = current_network.train(all_examples, epochs=train_specification.num_epochs, batch_size=64)
        current_network = current_network.cpu()
        logger.info(f"Training took ({time.time() - train_start_time}) seconds")

        if evaluate_vs_previous(train_specification, previous_network, current_network):
            logger.info("Saving checkpoint")
            previous_network = current_network
            previous_network.save_checkpoint(train_specification.game_name, all_examples)
            evaluate_competitive(train_specification, previous_network, current_network)
            evaluate_random(train_specification, current_network)
        else:
            # retain examples from previous episode, but store checkpoint regardless
            current_network.save_checkpoint(train_specification.game_name, all_examples)

        logger.info(f"Episode took ({time.time() - episode_start_time}) seconds")
    def add_action(self,
                   opponent: str,
                   temperature: float = 0,
                   temperature_threshold: float = 0,
                   human_action: int = None) -> (int, str, list):
        with torch.no_grad():
            if self.done:
                raise ValueError(
                    "Status is already 'done' but there still another step.")
            if len(self.game_history.action_history) > self.config.max_moves:
                raise ValueError(
                    "Number of steps are already over the max moves.")

            stacked_observations = self.game_history.get_stacked_observations(
                -1,
                self.config.stacked_observations,
            )

            # Choose the action
            action = None
            if opponent == "self":
                root, mcts_info = MCTS(self.config).run(
                    self.model,
                    stacked_observations,
                    self.game.legal_actions(),
                    self.game.to_play(),
                    True,
                )
                action = SelfPlay.select_action(
                    root,
                    temperature if not temperature_threshold
                    or len(self.game_history.action_history) <
                    temperature_threshold else 0,
                )
            elif opponent == "random":
                action, root = numpy.random.choice(
                    self.game.legal_actions()), None
            elif opponent == "expert":
                action, root = self.game.expert_agent(), None
            elif opponent == "human":
                action, root = human_action, None
            else:
                raise ValueError(
                    'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"'
                )

            # Validate the action before casting it: int(None) would raise before the check
            if action is None or action not in self.game.legal_actions():
                if opponent == "human":
                    raise ValueError(
                        f"Requested action '{action}' is illegal in this game."
                    )
                else:
                    raise Exception(
                        f"Calculated action '{action}' by '{opponent}' is illegal in this game."
                    )

            # cast action variable (e.g. a numpy integer) to a plain int
            action = int(action)
            observation, reward, self.done = self.game.step(action)

            self.game_history.store_search_statistics(root,
                                                      self.config.action_space)

            # Next batch
            self.game_history.action_history.append(action)
            self.game_history.observation_history.append(observation)
            self.game_history.reward_history.append(reward)
            self.game_history.to_play_history.append(self.game.to_play())

            if isinstance(observation, numpy.ndarray):
                observation = observation.tolist()
            return action, self.game.action_to_string(action), observation
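
A hypothetical interaction loop built on top of add_action, assuming `session` is an instance of the class that defines it and that the model and a human alternate moves; the input handling is illustrative only.

# Hypothetical interaction loop; `session` and the prompt logic are assumptions.
while not session.done:
    action, description, observation = session.add_action(opponent="self")
    print("MuZero played:", description)
    if not session.done:
        human_move = int(input("Your move: "))
        session.add_action(opponent="human", human_action=human_move)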
Example #10
import config
import game
from model import vamperouge_net
from self_play import SelfPlay

if __name__ == "__main__":
    neural_net = vamperouge_net(config)

    if config.load_model:
        neural_net.load_checkpoint(config.load_folder_file[0],
                                   config.load_folder_file[1])

    self_play = SelfPlay(neural_net, config)
    if config.load_model:
        print("Load train samples from file")
        self_play.load_train_samples()
    self_play.learn()