Example #1
import time
import typing

# UCTNode and Environment are project classes defined elsewhere in this repository.


def mcts(simulation_time: float,
         env: Environment,
         root_node: typing.Optional[UCTNode] = None) -> UCTNode:
    start_time = time.time()
    # Reuse the supplied root node if one is given (e.g. to keep the search tree
    # between moves); otherwise build a fresh root from the current environment state.
    if root_node is None:
        root_node = UCTNode(
            state=env.get_state(),
            active_player=env.get_active_player(),
            action=None,
            parent=None,
            num_actions=env.get_num_actions(),
            valid_actions=env.get_valid_actions(),
        )
    # Run simulations until the time budget is exhausted.
    while time.time() - start_time < simulation_time:
        # Selection: descend the tree until a leaf or a terminal state is reached.
        leaf_node, winner = root_node.select(env)
        if winner is not None:
            # Terminal state: back up the known result directly.
            leaf_node.backup(winner)
        else:
            # Expansion, rollout, and backpropagation of the simulated result.
            leaf_node.expand()
            winner = leaf_node.simulate(env)
            leaf_node.backup(winner)
    return root_node
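
The loop above only drives the search; the UCTNode class it relies on (select, expand, simulate, backup) is defined elsewhere in that project and is not shown. As a rough illustration, a minimal node sketch assuming standard UCB1 selection could look like the following. The class name, exploration constant, and win-crediting convention are assumptions, not the original project's API.

import math


class UCTNodeSketch:
    """Hypothetical minimal node; the real UCTNode used above is not shown here."""

    def __init__(self, state, active_player, action=None, parent=None,
                 num_actions=0, valid_actions=None):
        self.state = state
        self.active_player = active_player
        self.action = action
        self.parent = parent
        self.num_actions = num_actions
        self.valid_actions = valid_actions or []
        self.children = []
        self.visits = 0
        self.wins = 0.0

    def ucb1_score(self, exploration=1.4):
        # UCB1: average result plus an exploration bonus that shrinks as the
        # node is visited more often relative to its parent.
        if self.visits == 0:
            return float("inf")
        return (self.wins / self.visits
                + exploration * math.sqrt(math.log(self.parent.visits) / self.visits))

    def best_child(self):
        # Selection step: pick the child with the highest UCB1 score.
        return max(self.children, key=lambda child: child.ucb1_score())

    def backup(self, winner):
        # Backpropagation: update visit/win counts from this node up to the root.
        node = self
        while node is not None:
            node.visits += 1
            if winner == node.active_player:
                node.wins += 1.0  # one possible crediting convention
            node = node.parent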
Example #2
def main():
    """
    Sets the parameters for the Environment, Critic, and Actor according to the imported config file.
    Creates an environment where a predefined number of episodes can be performed.
    Instantiates an actor to keep track of the policy and a critic to keep track of the value of each state.
    Runs a predefined number of episodes, creating a new simulation for each episode.
    For each episode, the actor and the critic are updated according to the Actor-Critic model.
    Finally, epsilon is set to zero and a final simulation is run with the updated policy.
    """

    env = Environment(env_cfg)
    granularity = env_cfg["granularity"]
    critic = Critic(critic_cfg, granularity)
    actor = Actor(actor_cfg)

    episodes = training_cfg["number_of_episodes"]
    visualize_episodes = training_cfg["visualize_episodes"]
    steps_per_episode = []

    for episode in tqdm(range(episodes),
                        desc=f"Playing {episodes} episodes",
                        colour='#39ff14'):
        env.new_simulation()
        path = []
        positions = []
        critic.reset_eli_dict()
        actor.reset_eli_dict()
        while not env.reached_top() and not env.reached_max_steps():
            env.update_steps()
            current_state = copy(env.get_state())
            legal_actions = env.get_actions()
            action = actor.get_action(state=current_state,
                                      legal_actions=legal_actions)
            path.append((str(current_state), str(action)))
            reward = env.perform_action(action=action)

            td_err = critic.compute_td_err(current_state=current_state,
                                           next_state=env.get_state(),
                                           reward=reward)

            # Previous states on the path are also updated during the call to train(), via eligibility traces
            critic.train(state=current_state, td_error=td_err)
            critic.update_eligs()

            # Update the actor's beliefs about state-action pairs (SAPs) for all pairs seen so far in the episode
            for i, sap in enumerate(reversed(path)):
                actor.update_eli_dict(state=str(sap[0]),
                                      action=str(sap[1]),
                                      i=i)
                actor.update_policy_dict(state=str(sap[0]),
                                         action=str(sap[1]),
                                         td_err=td_err)

            positions.append(env.get_position())

        print("steps used in this episode", env.steps)
        if episode in visualize_episodes:
            env.visualize_landscape(positions)
        steps_per_episode.append(env.steps)

    plot_learning(steps_per_episode)

    # Enable history tracking to visualize final simulation
    env.new_simulation()

    print(f"Actor final epsilon: {actor.epsilon}")
    actor.epsilon = 0  # Set exploration to 0
    print("Attempting final simulation to show you how smart I am now")
    while not env.reached_top() and not env.reached_max_steps():
        current_state = env.get_state()
        legal_actions = env.get_actions()
        action = actor.get_action(current_state, legal_actions)
        env.perform_action(action)
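
The Critic itself is not shown in this example; main() only assumes it exposes reset_eli_dict, compute_td_err, train, and update_eligs. A minimal table-based TD(lambda) sketch of what those calls could do is given below, using the usual TD error delta = reward + gamma * V(next_state) - V(current_state) and traces decayed by gamma * lambda each step. The class name, constructor, and hyperparameters are assumptions; the project's actual Critic is built from critic_cfg and a granularity parameter.

from collections import defaultdict


class TableCriticSketch:
    """Hypothetical table-based critic; the real Critic class is not shown here."""

    def __init__(self, learning_rate=0.1, discount=0.9, trace_decay=0.9):
        self.learning_rate = learning_rate
        self.discount = discount          # gamma
        self.trace_decay = trace_decay    # lambda
        self.values = defaultdict(float)  # V(s), keyed by str(state)
        self.eligs = defaultdict(float)   # eligibility traces e(s)

    def reset_eli_dict(self):
        # Traces are cleared at the start of every episode.
        self.eligs.clear()

    def compute_td_err(self, current_state, next_state, reward):
        # TD error: delta = r + gamma * V(s') - V(s)
        return (reward
                + self.discount * self.values[str(next_state)]
                - self.values[str(current_state)])

    def train(self, state, td_error):
        # Mark the current state as fully eligible (replacing traces), then nudge
        # every eligible state's value toward the TD target.
        self.eligs[str(state)] = 1.0
        for s, e in self.eligs.items():
            self.values[s] += self.learning_rate * td_error * e

    def update_eligs(self):
        # Decay all traces: e(s) <- gamma * lambda * e(s)
        for s in self.eligs:
            self.eligs[s] *= self.discount * self.trace_decay

The Actor side (update_eli_dict / update_policy_dict) would presumably follow the same pattern, keeping traces over state-action pairs and moving each pair's policy value by learning_rate * td_err * trace.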