Example #1
def main(arglist):
    ACTORS = 1
    env = EnvWrapper(arglist.scenario, ACTORS, arglist.saved_episode)
    if arglist.eval:
        current_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
        writer = SummaryWriter(log_dir='./logs/' + current_time + '-' +
                               arglist.scenario)
    maddpg_wrapper = MADDPG(ACTORS)

    maddpg_wrapper.create_agents(env, arglist)

    j = 0
    for episode in range(arglist.max_episode):
        obs = env.reset()
        terminal = False
        maddpg_wrapper.reset()
        total_reward = [0 for i in maddpg_wrapper.workers]
        step = 0

        while not terminal and step < 25:
            if not arglist.eval:
                env.render(0)
                time.sleep(0.03)

            actions = maddpg_wrapper.take_actions(obs)
            obs2, reward, done = env.step(actions)

            for actor in range(ACTORS):
                for i, rew in enumerate(reward[actor]):
                    total_reward[i] += rew

            j += ACTORS
            #terminal = all(done)
            if arglist.eval:
                maddpg_wrapper.update(j, ACTORS, actions, reward, obs, obs2,
                                      done)

            obs = obs2
            step += 1

        if arglist.eval and episode % arglist.saved_episode == 0 and episode > 0:
            maddpg_wrapper.save(episode)

        if arglist.eval:
            for worker, ep_ave_max in zip(maddpg_wrapper.workers,
                                          maddpg_wrapper.ep_ave_max_q_value):
                print(worker.pos, ' => average_max_q: ',
                      ep_ave_max / float(step), ' Reward: ',
                      total_reward[worker.pos], ' Episode: ', episode)
                writer.add_scalar(
                    str(worker.pos) + '/Average_max_q',
                    ep_ave_max / float(step), episode)
                writer.add_scalar(
                    str(worker.pos) + '/Reward Agent',
                    total_reward[worker.pos], episode)

    env.close()
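The main() above reads arglist.scenario, arglist.max_episode, arglist.saved_episode, and arglist.eval, but the argument parser is not part of the snippet. Below is a minimal sketch of what it might look like using argparse; the flag names, defaults, and the scenario value are illustrative assumptions, not taken from the original project.

# Hypothetical argument parser for the main() snippet above; all flag names,
# defaults, and the "simple_spread" scenario are assumptions.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="MADDPG runner")
    parser.add_argument("--scenario", type=str, default="simple_spread",
                        help="multi-agent scenario passed to EnvWrapper")
    parser.add_argument("--max-episode", type=int, default=10000,
                        help="number of episodes to run")
    parser.add_argument("--saved-episode", type=int, default=100,
                        help="checkpoint interval in episodes")
    parser.add_argument("--eval", action="store_true",
                        help="gates the update, checkpoint and logging branches above")
    return parser.parse_args()

if __name__ == "__main__":
    main(parse_args())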
Example #2
def train(env, num_episodes=5000, max_t=1000, warmup_episodes=0):
    """ Monitor agent's performance.
    
    Params
    ======
    - env: instance of the environment
    - num_episodes: maximum number of episodes of agent-environment interaction
    - max_t: maximum number of timesteps per episode
    - warmup_episodes: how many episodes to explore and collect samples before learning begins
    
    Returns
    =======
    - episode_scores: list containing the max score across agents for each episode
    """

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 1.0
    noise_reduction = 0.9999

    # list containing max scores from each episode
    episode_scores = []
    # last 100 scores
    scores_window = deque(maxlen=100)
    mean_score = 0.0

    maddpg = MADDPG(state_size, action_size, num_agents * state_size,
                    num_agents * action_size)

    # for each episode
    for i_episode in range(1, num_episodes + 1):
        # reset the environment and begin the episode
        env_info = env.reset(train_mode=True)[brain_name]
        maddpg.reset()

        # get the current state (for each agent)
        states = env_info.vector_observations

        # initialize the score (for each agent)
        scores = np.zeros(num_agents)

        for t in range(max_t):
            # select an action (for each agent)
            if i_episode > warmup_episodes:
                actions = maddpg.act(states, noise)
                noise *= noise_reduction
            else:
                # Collect random samples to explore and fill the replay buffer
                actions = np.random.uniform(-1, 1, (num_agents, action_size))

            # send all actions to the environment
            env_info = env.step(actions)[brain_name]

            # get next state (for each agent)
            next_states = env_info.vector_observations

            # get reward (for each agent)
            rewards = env_info.rewards

            # see if episode finished
            dones = env_info.local_done

            # agents perform internal updates based on sampled experience
            maddpg.step(states, actions, rewards, next_states, dones)

            # roll over states to next time step
            states = next_states

            # learn when time is right
            if t % LEARN_EVERY == 0 and i_episode > warmup_episodes:
                for _ in range(LEARN_BATCH):
                    maddpg.learn()

            # update the score (for each agent)
            scores += rewards

            # exit loop if episode finished
            if np.any(dones):
                break

        episode_max_score = np.max(scores)
        episode_scores.append(episode_max_score)

        if i_episode > warmup_episodes:
            # save final score
            scores_window.append(episode_max_score)
            mean_score = np.mean(scores_window)
            # monitor progress
            if i_episode % 10 == 0:
                print("\rEpisode {:d}/{:d} || Average score {:.2f}".format(
                    i_episode, num_episodes, mean_score))
        else:
            print("\rWarmup episode {:d}/{:d}".format(i_episode,
                                                      warmup_episodes),
                  end="")

        if i_episode % SAVE_EVERY == 0 and i_episode > warmup_episodes:
            maddpg.save_weights(i_episode)

        # check if task is solved
        if i_episode >= 100 and mean_score >= 0.5:
            print(
                '\nEnvironment solved in {:d} episodes. Average score: {:.2f}'.
                format(i_episode, mean_score))
            maddpg.save_weights()
            break
    if i_episode == num_episodes:
        print("\nGame over. Too bad! Final score {:.2f}\n".format(mean_score))
    return episode_scores
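A hedged driver sketch for train() above. The Tennis build path mirrors the one used in the main() config further below; LEARN_EVERY, LEARN_BATCH, and SAVE_EVERY are module-level constants the loop expects, and every concrete value here is an assumption.

# Hypothetical module-level constants and driver for train(); all values below
# are assumptions. MADDPG, used inside train(), comes from the surrounding project.
import numpy as np
from collections import deque
from unityagents import UnityEnvironment

LEARN_EVERY = 1    # learn on every timestep (assumed)
LEARN_BATCH = 5    # learning passes per learning step (assumed)
SAVE_EVERY = 500   # checkpoint interval in episodes (assumed)

if __name__ == "__main__":
    env = UnityEnvironment(file_name="./Tennis_Linux_NoVis/Tennis.x86_64")
    scores = train(env, num_episodes=5000, max_t=1000, warmup_episodes=300)
    env.close()
    print("Best episode score: {:.2f}".format(max(scores)))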
Example #3
def test(env):
    # NOTE: the snippet's header was truncated at the source; the function name
    # and the setup lines below are reconstructed from the sibling examples.
    # get the default brain and reset the environment
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=False)[brain_name]

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
    print('The state for the first agent looks like:', states[0])

    agents = MADDPG(state_size, action_size)
    agents.load_from_file()
    
    for i in range(1, 20):
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        score = np.zeros((2,))
        agents.reset()

        for t in range(100):
            actions = agents.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            states = next_states
            score += np.array(rewards)
            if any(dones):
                break 

    env.close()
Example #4
def main():

    ##########
    # CONFIG #
    ##########
    # Target Reward
    tgt_score = 0.5
    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Seed
    seed = 7
    seeding(seed)
    # Model Architecture
    # Actor
    hidden_in_actor = 256
    hidden_out_actor = 128
    lr_actor = 1e-4
    # Critic
    hidden_in_critic = 256
    hidden_out_critic = 128
    lr_critic = 3e-4
    weight_decay_critic = 0
    # Episodes
    number_of_episodes = 10000
    episode_length = 2000
    # Buffer
    buffer_size = int(1e6)
    batchsize = 512
    # Agent Update Frequency
    episode_per_update = 1
    # Rewards Discounts Factor
    discount_factor = 0.95
    # Soft Update Weight
    tau = 1e-2
    # Noise Process
    noise_factor = 2
    noise_reduction = 0.9999
    noise_floor = 0.0
    # Window
    win_len = 100
    # Save Frequency
    save_interval = 200
    # Logger
    log_path = os.getcwd() + "/log"
    logger = SummaryWriter(log_dir=log_path)
    # Model Directory
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)
    # Load Saved Model
    load_model = False

    ####################
    # Load Environment #
    ####################
    env = UnityEnvironment(file_name="./Tennis_Linux_NoVis/Tennis.x86_64")
    # Get brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    print('Brain Name:', brain_name)
    # Reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # Number of Agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)
    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    ####################
    # Show Progressbar #
    ####################
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()
    start = time.time()

    ###############
    # Multi Agent #
    ###############
    maddpg = MADDPG(state_size, action_size, num_agents, hidden_in_actor,
                    hidden_out_actor, lr_actor, hidden_in_critic,
                    hidden_out_critic, lr_critic, weight_decay_critic,
                    discount_factor, tau, seed, device)

    if load_model:
        load_dict_list = torch.load(os.path.join(model_dir,
                                                 'episode-saved.pt'))
        for i in range(num_agents):
            maddpg.maddpg_agent[i].actor.load_state_dict(
                load_dict_list[i]['actor_params'])
            maddpg.maddpg_agent[i].actor_optimizer.load_state_dict(
                load_dict_list[i]['actor_optim_params'])
            maddpg.maddpg_agent[i].critic.load_state_dict(
                load_dict_list[i]['critic_params'])
            maddpg.maddpg_agent[i].critic_optimizer.load_state_dict(
                load_dict_list[i]['critic_optim_params'])

    #################
    # Replay Buffer #
    #################
    rebuffer = ReplayBuffer(buffer_size, seed, device)

    #################
    # TRAINING LOOP #
    #################
    # initialize scores
    scores_history = []
    scores_window = deque(maxlen=save_interval)

    # i_episode = 0
    for i_episode in range(number_of_episodes):
        timer.update(i_episode)

        # Reset Environment
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)

        # Reset Agent
        maddpg.reset()

        # episode_t = 0
        for episode_t in range(episode_length):

            # Explore with decaying noise factor
            actions = maddpg.act(states, noise_factor=noise_factor)
            env_info = env.step(actions)[brain_name]  # Environment reacts
            next_states = env_info.vector_observations  # get the next states
            rewards = env_info.rewards  # get the rewards
            dones = env_info.local_done  # see if episode has finished

            ###################
            # Save Experience #
            ###################
            rebuffer.add(states, actions, rewards, next_states, dones)

            scores += rewards
            states = next_states

            if any(dones):
                break

        scores_history.append(np.max(scores))  # save most recent score
        scores_window.append(np.max(scores))  # save most recent score
        avg_rewards = np.mean(scores_window)
        noise_factor = max(noise_floor, noise_factor *
                           noise_reduction)  # Reduce Noise Factor

        #########
        # LEARN #
        #########
        if len(rebuffer) > batchsize and i_episode % episode_per_update == 0:
            for a_i in range(num_agents):
                samples = rebuffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # Soft Update
            maddpg.update_targets()

        ##################
        # Track Progress #
        ##################
        if i_episode % save_interval == 0 or i_episode == number_of_episodes - 1:
            logger.add_scalars('rewards', {
                'Avg Reward': avg_rewards,
                'Noise Factor': noise_factor
            }, i_episode)
            print(
                '\nElapsed time {:.1f} \t Update Count {} \t Last Episode t {}'
                .format((time.time() - start) / 60, maddpg.update_count,
                        episode_t),
                '\nEpisode {} \tAverage Score: {:.2f} \tNoise Factor {:.2f}'.
                format(i_episode, avg_rewards, noise_factor),
                end="\n")

        ##############
        # Save Model #
        ##############
        save_info = (i_episode % save_interval == 0
                     or i_episode == number_of_episodes - 1)
        if save_info:
            save_dict_list = []
            for i in range(num_agents):
                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-Latest.pt'))

            pd.Series(scores_history).to_csv(
                os.path.join(model_dir, "scores.csv"))

            # plot the scores
            rolling_mean = pd.Series(scores_history).rolling(win_len).mean()
            fig = plt.figure()
            ax = fig.add_subplot(111)
            plt.plot(np.arange(len(scores_history)), scores_history)
            plt.axhline(y=tgt_score, color='r', linestyle='dashed')
            plt.plot(rolling_mean, lw=3)
            plt.ylabel('Score')
            plt.xlabel('Episode #')
            # plt.show()
            fig.savefig(os.path.join(model_dir, 'Average_Score.pdf'))
            fig.savefig(os.path.join(model_dir, 'Average_Score.jpg'))
            plt.close()

        if avg_rewards > tgt_score:
            logger.add_scalars('rewards', {
                'Avg Reward': avg_rewards,
                'Noise Factor': noise_factor
            }, i_episode)
            print(
                '\nElapsed time {:.1f} \t Update Count {} \t Last Episode t {}'
                .format((time.time() - start) / 60, maddpg.update_count,
                        episode_t),
                '\nEpisode {} \tAverage Score: {:.2f} \tNoise Factor {:.2f}'.
                format(i_episode, avg_rewards, noise_factor),
                end="\n")
            break

    env.close()
    logger.close()
    timer.finish()
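maddpg.update_targets() above applies the soft update configured with tau = 1e-2. Below is a minimal sketch of the standard DDPG-style soft update it is expected to perform for each target network; the helper name and signature are assumptions.

# Standard soft-update rule used by DDPG/MADDPG target networks:
#     theta_target <- tau * theta_local + (1 - tau) * theta_target
# Sketch only; the helper name and signature are assumptions.
import torch

def soft_update(target_net: torch.nn.Module, local_net: torch.nn.Module, tau: float) -> None:
    for target_param, local_param in zip(target_net.parameters(),
                                         local_net.parameters()):
        # blend local weights into the target weights in place
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)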
Example #5
def train_maddpg(env,
                 max_episode=1000,
                 max_t=1000,
                 print_every=5,
                 check_history=100,
                 sigma_start=0.2,
                 sigma_end=0.01,
                 sigma_decay=0.995):
    # reset
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]

    # action and state size
    action_size = brain.vector_action_space_size
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('State size:', state_size)
    print('Action size: ', action_size)

    # initialize agent
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    maddpg = MADDPG(state_size, action_size, random_seed=123)

    scores_deque = deque(maxlen=check_history)
    scores = []

    # learning multiple episodes
    sigma = sigma_start
    for episode in range(max_episode):
        # prepare for training in the current episode
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        score = 0
        maddpg.reset(sigma=sigma)

        # play and learn in current episode
        for t in range(max_t):
            actions = maddpg.act(states)

            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished

            maddpg.step(t, states, actions, rewards, next_states, dones)
            states = next_states

            # use the max of the two agents' rewards as the step reward
            reward = np.max(rewards)
            score += reward
            if np.any(dones):
                break

        # update sigma for exploration
        sigma = max(sigma_end, sigma * sigma_decay)

        # record score
        episode_score = score
        scores_deque.append(episode_score)
        scores.append(episode_score)

        if episode % print_every == 0:
            print('Episode {}\tscore: {:.4f}\tAverage Score: {:.4f}'.format(
                episode, episode_score, np.mean(scores_deque)))

        if np.mean(scores_deque) >= 0.5 and episode >= check_history:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.4f}'
                .format(episode - check_history, np.mean(scores_deque)))
            for agent in maddpg.ddpg_agents:
                torch.save(agent.actor_local.state_dict(),
                           'actor_agent_' + str(agent.id) + '.pth')
                torch.save(agent.critic_local.state_dict(),
                           'critic_agent_' + str(agent.id) + '.pth')
            break

    return scores
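maddpg.reset(sigma=sigma) above reconfigures the exploration noise with a decayed sigma each episode. Below is a minimal Ornstein-Uhlenbeck noise sketch of the kind such agents commonly use; the class name and the theta/mu defaults are assumptions.

# Minimal Ornstein-Uhlenbeck noise process of the kind maddpg.reset(sigma=...)
# presumably reconfigures each episode; class name and theta/mu defaults are assumptions.
import numpy as np

class OUNoise:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2, seed=123):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self, sigma=None):
        # optionally shrink sigma between episodes, mirroring the decay in train_maddpg()
        if sigma is not None:
            self.sigma = sigma
        self.state = self.mu.copy()

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * self.rng.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state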