Example #1
def train_agents(n_episodes=10000, t_max=1000):
    env = UnityEnvironment(file_name="envs/Tennis.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]

    seeding(seed=42)
    state_size = env_info.vector_observations.shape[1]
    action_size = brain.vector_action_space_size
    num_agents = env_info.vector_observations.shape[0]
    maddpg = MADDPG(state_size=state_size,
                    action_size=action_size,
                    num_agents=num_agents)

    scores_deque = deque(maxlen=100)
    scores_list = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        for _ in range(t_max):
            actions = maddpg.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            scores += rewards
            maddpg.step(states, actions, rewards, next_states, dones)
            states = next_states
            if np.any(dones):
                break

        scores_deque.append(np.max(scores))
        scores_list.append(np.max(scores))

        print(f'\rEpisode {i_episode}\tAverage Score: {np.mean(scores_deque):.3f}',
              end="")
        if i_episode % PRINT_EVERY == 0:
            print(
                f'\rEpisode {i_episode}\tAverage Score: {np.mean(scores_deque):.3f}'
            )

        if np.mean(scores_deque) >= 2.0 and len(scores_deque) >= 100:
            for i, agent in enumerate(maddpg.agents):
                torch.save(agent.actor_local.state_dict(),
                           f'models/checkpoint_actor_local_{i}.pth')
                torch.save(agent.critic_local.state_dict(),
                           f'models/checkpoint_critic_local_{i}.pth')
            print(
                f'\nSaved Model: Episode {i_episode}\tAverage Score: {np.mean(scores_deque):.3f}'
            )
            break

    return scores_list
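
A minimal usage sketch for train_agents, assuming matplotlib is installed and the module-level imports the snippet relies on (numpy as np, torch, collections.deque, unityagents, and the MADDPG class) are already in place:

import matplotlib.pyplot as plt

scores = train_agents(n_episodes=10000, t_max=1000)

# plot the per-episode max score to visualize learning progress
plt.plot(np.arange(1, len(scores) + 1), scores)
plt.xlabel('Episode')
plt.ylabel('Max score over both agents')
plt.show()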
Example #2
def trainFunction(state_size, action_size, n_episodes=4000, num_agents=2):
    magent = MADDPG(action_size=action_size,
                    noise_start=1.0,
                    seed=2,
                    gamma=0.99,
                    t_stop_noise=30000)
    scores = []
    scores_deque = deque(maxlen=100)
    scores_avg = []

    for i_episode in range(1, n_episodes + 1):
        rewards = []
        env_info = env.reset(
            train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        # pass update=True to act() only on the first step of every other episode;
        # it is reset to False immediately after the first call below
        if i_episode % 2:
            update = True
        # loop over steps
        while True:
            # select an action
            joint_actions = magent.act(states, update)
            update = False
            # take action in environment and set parameters to new values
            env_info = env.step(joint_actions)[brain_name]
            next_states = env_info.vector_observations
            rewards_v = env_info.rewards
            done_v = env_info.local_done
            # update and train agent with returned information
            magent.step(states, joint_actions, rewards_v, next_states, done_v)
            states = next_states
            rewards.append(rewards_v)
            if any(done_v):
                break

        # calculate episode reward as maximum of individually collected rewards of agents
        episode_reward = np.max(np.sum(np.array(rewards), axis=0))

        scores.append(
            episode_reward)  # save most recent score to overall score array
        scores_deque.append(
            episode_reward
        )  # save most recent score to running window of 100 last scores
        current_avg_score = np.mean(scores_deque)
        scores_avg.append(
            current_avg_score
        )  # save average of last 100 scores to average score array

        print('\rEpisode {}\tAverage Score: {:.3f}'.format(
            i_episode, current_avg_score),
              end="")

        # log average score every 200 episodes
        if i_episode % 200 == 0:
            print('\rEpisode {}\tAverage Score: {:.3f}'.format(
                i_episode, current_avg_score))

        # break and report success if environment is solved
        if np.mean(scores_deque) >= 0.5 and i_episode % 200 == 0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'
                .format(i_episode, np.mean(scores_deque)))
            magent.save()
            break

    return scores, scores_avg
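
The snippet above reads the module-level globals env and brain_name, which are not shown; a sketch of the setup it assumes (the environment path is an assumption) might look like this:

from collections import deque

import numpy as np
from unityagents import UnityEnvironment

env = UnityEnvironment(file_name="Tennis.app")  # path is an assumption
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]

state_size = env_info.vector_observations.shape[1]
action_size = brain.vector_action_space_size

scores, scores_avg = trainFunction(state_size, action_size, n_episodes=4000)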
Example #3
def train(env, num_episodes=5000, max_t=1000, warmup_episodes=0):
    """ Monitor agent's performance.
    
    Params
    ======
    - env: instance of the environment
    - num_episodes: maximum number of episodes of agent-environment interaction
    - max_t: maximum number of timesteps per episode
    - warmup_episodes: how many episodes to explore and collect samples before learning begins
    
    Returns
    =======
    - scores: list containing received rewards
    """

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 1.0
    noise_reduction = 0.9999

    # list containing max scores from each episode
    episode_scores = []
    # last 100 scores
    scores_window = deque(maxlen=100)
    mean_score = 0.0

    maddpg = MADDPG(state_size, action_size, num_agents * state_size,
                    num_agents * action_size)

    # for each episode
    for i_episode in range(1, num_episodes + 1):
        # reset the environment and begin the episode
        env_info = env.reset(train_mode=True)[brain_name]
        maddpg.reset()

        # get the current state (for each agent)
        states = env_info.vector_observations

        # initialize the score (for each agent)
        scores = np.zeros(num_agents)

        for t in range(max_t):
            # select an action (for each agent)
            if i_episode > warmup_episodes:
                actions = maddpg.act(states, noise)
                noise *= noise_reduction
            else:
                # Collect random samples to explore and fill the replay buffer
                actions = np.random.uniform(-1, 1, (num_agents, action_size))

            # send all actions to the environment
            env_info = env.step(actions)[brain_name]

            # get next state (for each agent)
            next_states = env_info.vector_observations

            # get reward (for each agent)
            rewards = env_info.rewards

            # see if episode finished
            dones = env_info.local_done

            # agents perform internal updates based on sampled experience
            maddpg.step(states, actions, rewards, next_states, dones)

            # roll over states to next time step
            states = next_states

            # learn when time is right
            if t % LEARN_EVERY == 0 and i_episode > warmup_episodes:
                for _ in range(LEARN_BATCH):
                    maddpg.learn()

            # update the score (for each agent)
            scores += rewards

            # exit loop if episode finished
            if np.any(dones):
                break

        episode_max_score = np.max(scores)
        episode_scores.append(episode_max_score)

        if i_episode > warmup_episodes:
            # save final score
            scores_window.append(episode_max_score)
            mean_score = np.mean(scores_window)
            # monitor progress
            if i_episode % 10 == 0:
                print("\rEpisode {:d}/{:d} || Average score {:.2f}".format(
                    i_episode, num_episodes, mean_score))
        else:
            print("\rWarmup episode {:d}/{:d}".format(i_episode,
                                                      warmup_episodes),
                  end="")

        if i_episode % SAVE_EVERY == 0 and i_episode > warmup_episodes:
            maddpg.save_weights(i_episode)

        # check if task is solved
        if i_episode >= 100 and mean_score >= 0.5:
            print(
                '\nEnvironment solved in {:d} episodes. Average score: {:.2f}'.
                format(i_episode, mean_score))
            maddpg.save_weights()
            break
    if i_episode == num_episodes:
        print("\nGame over. Too bad! Final score {:.2f}\n".format(mean_score))
    return episode_scores
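
Example #3 also references the module-level constants LEARN_EVERY, LEARN_BATCH, and SAVE_EVERY, which the snippet does not define; the values below are illustrative assumptions, not the originals:

LEARN_EVERY = 1    # environment steps between learning phases (assumed value)
LEARN_BATCH = 5    # gradient updates performed per learning phase (assumed value)
SAVE_EVERY = 500   # checkpoint the network weights every N episodes (assumed value)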
Example #4
def train_maddpg(env,
                 max_episode=1000,
                 max_t=1000,
                 print_every=5,
                 check_history=100,
                 sigma_start=0.2,
                 sigma_end=0.01,
                 sigma_decay=0.995):
    # reset
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]

    # action and state size
    action_size = brain.vector_action_space_size
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('State size:', state_size)
    print('Action size: ', action_size)

    # initialize agent
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    maddpg = MADDPG(state_size, action_size, random_seed=123)

    scores_deque = deque(maxlen=check_history)
    scores = []

    # learning multiple episodes
    sigma = sigma_start
    for episode in range(max_episode):
        # prepare for training in the current episode
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        score = 0
        maddpg.reset(sigma=sigma)

        # play and learn in current episode
        for t in range(max_t):
            actions = maddpg.act(states)

            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished

            maddpg.step(t, states, actions, rewards, next_states, dones)
            states = next_states

            # use the max of the two agents' rewards as the step reward
            reward = np.max(rewards)
            score += reward
            if np.any(dones):
                break

        # decay sigma to reduce exploration noise over time
        sigma = max(sigma_end, sigma * sigma_decay)

        # record score
        episode_score = score
        scores_deque.append(episode_score)
        scores.append(episode_score)

        if episode % print_every == 0:
            print('Episode {}\tscore: {:.4f}\tAverage Score: {:.4f}'.format(
                episode, episode_score, np.mean(scores_deque)))

        if np.mean(scores_deque) >= 0.5 and episode >= check_history:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.4f}'
                .format(episode - check_history, np.mean(scores_deque)))
            for agent in maddpg.ddpg_agents:
                torch.save(agent.actor_local.state_dict(),
                           'actor_agent_' + str(agent.id) + '.pth')
                torch.save(agent.critic_local.state_dict(),
                           'critic_agent_' + str(agent.id) + '.pth')
            break

    return scores
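
As with the other examples, train_maddpg expects an already-open UnityEnvironment; a minimal usage sketch (the environment path is an assumption) might be:

from unityagents import UnityEnvironment

env = UnityEnvironment(file_name="Tennis.app")  # path is an assumption
scores = train_maddpg(env, max_episode=1000, max_t=1000)
env.close()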