Example No. 1
def config(self, config):
    print("Prepare for new configuration. Make new MADDPG agent.")
    self.n_episodes = config.get("n_episodes", N_EPISODES)
    self.solved_score = config.get("solved_score", SOLVED_SCORE)
    self.conseq_episodes = config.get("conseq_episodes", CONSEC_EPISODES)
    self.seed = config.get("seed", 1)
    self.MADDPG_obj = MADDPG(state_size=self.state_size,
                             action_size=self.action_size,
                             num_agents=self.num_agents,
                             config=config)
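
The `config.get` fall-backs (`N_EPISODES`, `SOLVED_SCORE`, `CONSEC_EPISODES`) are module-level constants defined elsewhere in the project. A minimal sketch of plausible defaults plus an overriding config dict; the values here are illustrative assumptions, not the project's actual settings:

# Hypothetical defaults; the real project defines its own values.
N_EPISODES = 3000       # upper bound on training episodes
SOLVED_SCORE = 0.5      # rolling-average score that counts as solved
CONSEC_EPISODES = 100   # window length for the rolling average

# Keys omitted from the dict fall back to the defaults above.
config = {"n_episodes": 5000, "solved_score": 0.6, "seed": 7}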
Example No. 2
from collections import deque

import numpy as np
import torch

# MADDPG and create_env are assumed to come from the project's own modules.


def training(max_episodes=3000, episode_length=1000, random_seed=4):

    env, brain_name, num_agents, action_size, state_size = create_env()
    maddpg = MADDPG(num_agents,
                    state_size,
                    action_size,
                    num_agents * state_size,
                    num_agents * action_size,
                    discount_factor=0.99,
                    tau=0.001,
                    random_seed=random_seed)
    agent_reward = [[] for _ in range(num_agents)]
    agent_reward_deque = [deque(maxlen=100) for _ in range(num_agents)]
    score_full = []
    score_deque = deque(maxlen=100)

    for episode in range(1, max_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        obs = env_info.vector_observations
        obs_full = obs  # all agents' observations double as the full state for the centralized critic

        episode_scores = np.zeros(num_agents)

        for episode_t in range(episode_length):
            actions = maddpg.act(obs)
            env_info = env.step(actions)[brain_name]

            next_obs = env_info.vector_observations
            next_obs_full = next_obs
            rewards = env_info.rewards
            dones = env_info.local_done
            episode_scores += rewards

            maddpg.step(obs, obs_full, actions, rewards, next_obs,
                        next_obs_full, dones, episode_t)

            obs = next_obs
            obs_full = next_obs_full

            if np.any(dones):
                break

        for i in range(num_agents):
            agent_reward[i].append(episode_scores[i])
            agent_reward_deque[i].append(episode_scores[i])

        score_full.append(max(episode_scores))  # episode score = best agent's score
        score_deque.append(max(episode_scores))

        if episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                episode, np.mean(score_deque)))

        if np.mean(score_deque) >= 0.5:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(episode, np.mean(score_deque)))
            #for i in range(num_agents):
            #    torch.save(agents[i].actor_local.state_dict(), 'checkpoint_actor'+str(i) +'.pth')
            #    torch.save(agents[i].critic_local.state_dict(), 'checkpoint_critic'+str(i)+'.pth')
            torch.save(maddpg.critic.state_dict(),
                       'checkpoint_centralized_critic.pth')
            for i in range(num_agents):
                torch.save(maddpg.actors[i].actor_local.state_dict(),
                           'checkpoint_actor' + str(i) + '.pth')
            break
    env.close()
    return maddpg, agent_reward, score_full, random_seed
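
A typical call site, assuming the function is run as a script inside the same project:

# Train and unpack the results; the names mirror the return statement above.
maddpg, agent_reward, score_full, seed = training(max_episodes=2000)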
Example No. 3
class MADDPG_Runner:
    def __init__(self, env, config):
        """Set up the Unity environment and build a MADDPG agent from the given config."""
        super(MADDPG_Runner, self).__init__()

        self.agents = []

        self.env = env

        # get the default brain
        self.brain_name = self.env.brain_names[0]
        self.brain = self.env.brains[self.brain_name]

        # reset the environment
        env_info = self.env.reset(train_mode=True)[self.brain_name]

        # number of agents
        self.num_agents = len(env_info.agents)
        print('Number of agents:', self.num_agents)

        # size of each action
        self.action_size = self.brain.vector_action_space_size
        print('Size of each action:', self.action_size)
        # examine the state space
        states = env_info.vector_observations
        self.state_size = states.shape[1]
        print('There are {} agents. Each observes a state with length: {}'.
              format(states.shape[0], self.state_size))
        print('The state for the first agent looks like: \n{}\n'.format(
            states[0]))

        self.config(config)

    def config(self, config):
        print("Prepare for new configuration. Make new MADDPG agent.")
        self.n_episodes = config.get("n_episodes", N_EPISODES)
        self.solved_score = config.get("solved_score", SOLVED_SCORE)
        self.conseq_episodes = config.get("conseq_episodes", CONSEC_EPISODES)
        self.seed = config.get("seed", 1)
        self.MADDPG_obj = MADDPG(state_size=self.state_size,
                                 action_size=self.action_size,
                                 num_agents=self.num_agents,
                                 config=config)

    def seeding(self):
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)

    def reset_agents(self):
        for agent in self.agents:
            agent.reset()

    def learning_step(self, states, actions, rewards, next_states, done):
        # verbose debug output; disable for long training runs
        print("learning step", states, next_states, rewards, done, actions)
        for i, agent in enumerate(self.agents):
            agent.step(states, actions, rewards, next_states, done, i)

    ## Training loop

    def training_loop(self, t_max=1000, stop_when_done=True):

        self.seeding()

        # initialize scoring
        scores_deque = deque(maxlen=100)
        scores_list = []
        scores_list_100_avg = []

        for i_episode in range(1, self.n_episodes + 1):
            env_info = self.env.reset(train_mode=True)[self.brain_name]  # reset the environment
            states = env_info.vector_observations  # get the current states (for all agents)
            scores = np.zeros(self.num_agents)  # initialize the score (for each agent)
            num_steps = 0
            actions = []
            for _ in range(t_max):
                actions = self.MADDPG_obj.act(states,
                                              i_episode,
                                              add_noise=ADD_NOISE)
                env_info = self.env.step(actions)[self.brain_name]  # send all actions to the environment
                next_states = env_info.vector_observations  # get next state (for each agent in MADDPG)
                rewards = env_info.rewards  # get rewards (for each agent in MADDPG)
                dones = env_info.local_done  # see if episode finished
                scores += rewards  # update the score (for each agent in MADDPG)
                self.MADDPG_obj.step(i_episode, states, actions, rewards,
                                     next_states, dones)  # train the MADDPG agent
                states = next_states  # roll over states to next time step
                num_steps += 1
                if np.any(dones):  # exit loop if episode finished
                    break
                # print('Total score (averaged over agents) this episode: {}'.format(np.mean(score)))

            scores_deque.append(np.max(scores))
            scores_list.append(np.max(scores))
            scores_list_100_avg.append(np.mean(scores_deque))

            # print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {}'.format(i_episode, np.mean(scores_deque), score), end="")
            if i_episode % PRINT_EVERY == 0:
                print('Episode {}\tAverage Score: {:.2f}\tCurrent Score: {}'.
                      format(i_episode, np.mean(scores_deque), np.max(scores)))
                print('Noise Scaling: {}, Memory size: {} and Num Steps: {}'.
                      format(self.MADDPG_obj.maddpg_agents[0].noise_scale,
                             len(self.MADDPG_obj.memory), num_steps))
                #print("last 10", scores_list[-10:])
                #print("last actions", actions)

            if i_episode % 500 == 0:
                self.MADDPG_obj.save_maddpg()
                print('Saved Model: Episode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_deque)))

            if np.mean(scores_deque) > self.solved_score and len(
                    scores_deque) >= 100:
                self.MADDPG_obj.save_maddpg()
                print(
                    'Goal reached. Saved Model: Episode {}\tAverage Score: {:.2f}'
                    .format(i_episode, np.mean(scores_deque)))
                if stop_when_done:
                    break

        return scores_list, scores_list_100_avg, i_episode
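
Wiring the runner up might look like the sketch below; the environment path and config values are placeholders rather than the project's actual settings:

from unityagents import UnityEnvironment

env = UnityEnvironment(file_name="Tennis.app")  # placeholder path to the Unity build
runner = MADDPG_Runner(env, config={"n_episodes": 2500, "seed": 1})
scores, scores_avg, last_episode = runner.training_loop(t_max=1000)
env.close()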
Example No. 4
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)
# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)
# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

# Train agents function ########################################################
from maddpg_agent import MADDPG
agents = MADDPG(num_agents=num_agents,
                state_size=state_size,
                action_size=action_size,
                random_seed=2)


def train(n_episodes=100, max_t=1000):
    """Multi-Agent Deep Deterministic Policy Gradiant.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
    """
    scores_window = deque(maxlen=100)  # last 100 scores
    scores_output = []

    for i_episode in range(1, n_episodes + 1):
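
The listing is truncated at the episode loop. Based on Examples 2 and 3, the body presumably follows the usual act/step/record pattern; the following is a hedged reconstruction, not the missing original code:

        # Hypothetical loop body, modeled on the other examples on this page.
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        for t in range(max_t):
            actions = agents.act(states)                 # assumed MADDPG API
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            agents.step(states, actions, rewards, next_states, dones)
            states = next_states
            scores += rewards
            if np.any(dones):
                break
        scores_window.append(np.max(scores))
        scores_output.append(np.max(scores))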
Example No. 5
    Weights might be saved as "weightname-1800.data"; delete just the "-1800" part of the
    filename, then set testing to True.
    """

    save_dir = "saves"
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    agent1_ddpg = MADDPG('agent1')
    agent1_ddpg_target = MADDPG('agent1_target')

    agent2_ddpg = MADDPG('agent2')
    agent2_ddpg_target = MADDPG('agent2_target')

    # A third agent could be created the same way:
    # agent3_ddpg = MADDPG('agent3')
    # agent3_ddpg_target = MADDPG('agent3_target')

    # saver = tf.train.Saver()

    agent1_actor_target_init, agent1_actor_target_update = create_init_update(
        'agent1_actor', 'agent1_target_actor')
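
`create_init_update` is not shown in this excerpt. In TensorFlow 1.x MADDPG code of this style it usually builds the hard-copy and Polyak soft-update ops between the online and target variable scopes; a sketch under that assumption (the `tau` default is illustrative):

import tensorflow as tf  # TensorFlow 1.x graph-mode API


def create_init_update(online_name, target_name, tau=0.99):
    # Collect each network's variables by scope-name prefix.
    online_vars = [v for v in tf.trainable_variables() if online_name in v.name]
    target_vars = [v for v in tf.trainable_variables() if target_name in v.name]
    # Hard copy for initialization, Polyak averaging for the soft update.
    target_init = [tf.assign(t, o) for o, t in zip(online_vars, target_vars)]
    target_update = [tf.assign(t, (1 - tau) * o + tau * t)
                     for o, t in zip(online_vars, target_vars)]
    return target_init, target_update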
Example No. 6
            for i, agent in enumerate(maddpg.agents):
                torch.save(
                    agent.actor_local.state_dict(),
                    'Z:/{:.2f}_actor_{}_checkpoint.pth'.format(mean_score, i))
            break  # stop once the target is reached; remove to keep training

    return scores


brain_name, env, env_info, state, state_size, action_size = new_unity_environment(
    train_mode=True)
print(brain_name)
print(env)
print(env_info)
print(state)
print(state_size)
print(action_size)

maddpg = MADDPG(state_size, action_size, 1337)

scores = maddpg_train(maddpg, env, brain_name, state_size, train_mode=True)

env.close()

# plot the scores after training to a 100 episode average score of 30
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()
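
Raw episode scores are noisy, so overlaying the 100-episode moving average that the solve criterion uses makes the plot easier to read. An illustrative addition, to be placed before `plt.show()`:

# 100-episode moving average of the scores via a simple convolution.
window = 100
if len(scores) >= window:
    moving_avg = np.convolve(scores, np.ones(window) / window, mode='valid')
    plt.plot(np.arange(window - 1, len(scores)), moving_avg, label='100-episode average')
    plt.legend()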
Example No. 7
import os
from collections import deque

import numpy as np
import torch

# MADDPG, PrioritizedReplayMemory and create_env are assumed to come from the
# project's own modules.


def train():

    # config parameters
    model_dir = '/home/shijiliu/self-learning/reinforcement-learning/deep-reinforcement-learning/p3_collab-compet/draft_2/'

    number_of_episodes = 10000
    episode_length = 80
    batchsize = 128
    
    t = 0
    
    action_noise_coef = 10.0
    param_noise_coef = 0.0
    action_noise_reduction = 0.9999
    param_noise_reduction = 0.9999
    
    episode_per_update = 2
    
    # create env, get essential env info
    env, brain_name, num_agents, action_size, state_size = create_env()
    
    buffer = PrioritizedReplayMemory(1000 * episode_length, alpha=0.5, beta_start=0.4)

    # initialize policy and critic
    maddpg = MADDPG(num_agents, state_size, action_size,
                    num_agents * state_size, num_agents * action_size,
                    discount_factor=0.99, tau=0.001)
    agent_reward = [[] for _ in range(num_agents)]
    score_full = []
    score_deque = deque(maxlen=100)
    
    # training loop
    for episode in range(number_of_episodes):
        env_info = env.reset(train_mode=True)[brain_name]
        obs = env_info.vector_observations
        obs_full = obs
        
        episode_scores = np.zeros(num_agents)
        
        for episode_t in range(episode_length):
            actions = maddpg.act(obs, action_noise_coef, param_noise_coef)
            action_noise_coef *= action_noise_reduction
            param_noise_coef *= param_noise_reduction
            
            # process the output action to interact with the environment
            action_np = [a.detach().cpu().numpy() for a in actions]
            
            # step the environment for 1 step
            env_info = env.step(action_np)[brain_name]
            
            next_obs = env_info.vector_observations
            next_obs_full = next_obs
            rewards = env_info.rewards
            dones = env_info.local_done
            episode_scores += rewards 
            
            # add data to buffer
            transition = (obs, obs_full, actions, rewards, next_obs, next_obs_full, dones)
            
            buffer.push(transition)
            
            obs = next_obs
            obs_full = next_obs_full
            
            if np.any(dones):
                break
        
        # update the networks once after every episode_per_update
        if buffer.storage_size() > batchsize and episode % episode_per_update == 0:
            for a_i in range(num_agents):
                samples, _, _ = buffer.sample(batchsize)
                #print(len(samples))
                ordered_samples = zip(*samples)
                maddpg.update(ordered_samples, a_i)
            maddpg.update_targets() #soft update the target network towards the actual networks
            
        for i in range(num_agents):
            agent_reward[i].append(episode_scores[i])
            
        score_full.append(max(episode_scores))
        score_deque.append(max(episode_scores))
        
        if episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, np.mean(score_deque)))
            
        if np.mean(score_deque) >= 0.5:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(episode, np.mean(score_deque)))
            # save models
            save_dict_list = []
            for i in range(num_agents):
                save_dict = {'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                             'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                             'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                             'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()}
                save_dict_list.append(save_dict)
            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))
            
            break
        
        
    env.close()
    return maddpg, agent_reward, score_full
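
`PrioritizedReplayMemory` is a project class; the loop above only relies on `push`, `sample` (returning samples, indices, and importance weights), and `storage_size`. A minimal uniform-sampling stand-in with that interface; real prioritized replay would sample by TD-error priority and compute importance weights from `alpha` and `beta`, which this sketch ignores:

import random
from collections import deque


class PrioritizedReplayMemory:
    """Interface-compatible stand-in that samples uniformly instead of by priority."""

    def __init__(self, capacity, alpha=0.5, beta_start=0.4):
        self.buffer = deque(maxlen=capacity)
        self.alpha, self.beta = alpha, beta_start  # kept only for API compatibility

    def push(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size):
        samples = random.sample(list(self.buffer), batch_size)
        indices = list(range(batch_size))   # placeholder indices
        weights = [1.0] * batch_size        # uniform importance weights
        return samples, indices, weights

    def storage_size(self):
        return len(self.buffer)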
Example No. 8
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# get the number of agents
num_agents = len(env_info.agents)

# Get action size
action_size = brain.vector_action_space_size

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]

agent = MADDPG(state_size=state_size,
               action_size=action_size,
               num_agents=num_agents,
               seed=42)
# agent.load_weights()

episodes = 10000  # Number of episodes
max_time = 1000  # Max number of time steps per episode
max_score = 0.6  # Average score to beat over length 100 window

# Score lists
scores_deque = deque(maxlen=100)
all_scores = []
all_scores_mean = []
all_scores_std = []

# Main training loop
for ep in range(0, episodes):
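
This listing also stops at the loop header. Given the bookkeeping lists above, each iteration presumably plays one episode, records the best agent's score, and tracks the rolling mean and standard deviation; a hedged reconstruction, not the missing original:

    # Hypothetical loop body, modeled on the other examples on this page.
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(num_agents)
    for t in range(max_time):
        actions = agent.act(states)                  # assumed MADDPG API
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        agent.step(states, actions, rewards, next_states, dones)
        states = next_states
        scores += rewards
        if np.any(dones):
            break
    scores_deque.append(np.max(scores))
    all_scores.append(np.max(scores))
    all_scores_mean.append(np.mean(scores_deque))
    all_scores_std.append(np.std(scores_deque))
    if np.mean(scores_deque) >= max_score:
        break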