def run_agents(n_episodes=5):
    env = UnityEnvironment(file_name="envs/Tennis.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    state_size = env_info.vector_observations.shape[1]
    action_size = brain.vector_action_space_size
    num_agents = env_info.vector_observations.shape[0]

    maddpg = MADDPG(state_size=state_size, action_size=action_size, num_agents=num_agents)
    # load the trained actor/critic weights for each agent
    for i, agent in enumerate(maddpg.agents):
        agent.actor_local.load_state_dict(torch.load(f'models/checkpoint_actor_local_{i}.pth'))
        agent.critic_local.load_state_dict(torch.load(f'models/checkpoint_critic_local_{i}.pth'))

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        while True:
            actions = maddpg.act(states, add_noise=True)
            env_info = env.step(actions)[brain_name]
            states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            scores += rewards
            if any(dones):
                break
        print(f"Episode {i_episode}. Rewards of two agents: {scores}")
def test(env, model_file='best.pt', num_ep=100):
    rewards_total = []
    dict_list = torch.load(model_file)
    maddpg = MADDPG()
    maddpg.maddpg_agent[0].actor.load_state_dict(torch.load('actor0.pt'))
    maddpg.maddpg_agent[1].actor.load_state_dict(torch.load('actor1.pt'))
    maddpg.maddpg_agent[0].critic.load_state_dict(torch.load('critic0.pt'))
    maddpg.maddpg_agent[1].critic.load_state_dict(torch.load('critic1.pt'))

    for i in range(1, num_ep + 1):  # play the game for num_ep episodes
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations              # get the current state (for each agent)
        scores = np.zeros(2)                               # initialize the score (for each agent)
        while True:
            actions = maddpg.act(states)                   # select actions
            env_info = env.step(actions)[brain_name]       # send all actions to the environment
            next_states = env_info.vector_observations     # get next state (for each agent)
            rewards = env_info.rewards                     # get reward (for each agent)
            dones = env_info.local_done                    # see if episode finished
            scores += rewards                              # update the score (for each agent)
            states = next_states                           # roll over states to next time step
            if np.any(dones):                              # exit loop if episode finished
                break
        rewards_total.append(np.max(scores))
        print('Scores from episode {}: {}'.format(i, scores))

    print('Average Score over {} episodes: {}'.format(num_ep, np.mean(rewards_total)))
def main(args):
    set_seed(args.seed)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # initialize environment
    n_players = 3
    env = football_env.create_environment(
        env_name="academy_3_vs_1_with_keeper",
        representation="simple115",
        number_of_left_players_agent_controls=n_players,
        stacked=False,
        logdir="/tmp/football",
        write_goal_dumps=False,
        write_full_episode_dumps=False,
        render=False)

    # state and action space
    state_space_size = env.observation_space.shape[1]  # we are using the simple115 representation
    action_space_size = env.action_space.nvec.tolist()[0]  # the three players share the same action space
    # state[98:100] represents the three controlled players

    # model
    print("loading models")
    actors = [
        Actor(state_space_size=state_space_size,
              action_space_size=action_space_size) for _ in range(n_players)
    ]
    critics = [
        Critic(state_space_size=state_space_size,
               action_space_size=action_space_size,
               n_players=n_players) for _ in range(n_players)
    ]
    old_actors = [
        Actor(state_space_size=state_space_size,
              action_space_size=action_space_size) for _ in range(n_players)
    ]
    old_critics = [
        Critic(state_space_size=state_space_size,
               action_space_size=action_space_size,
               n_players=n_players) for _ in range(n_players)
    ]
    for old_actor, actor in zip(old_actors, actors):
        old_actor.load_state_dict(actor.state_dict())
    for old_critic, critic in zip(old_critics, critics):
        old_critic.load_state_dict(critic.state_dict())

    # maddpg
    maddpg = MADDPG(env=env,
                    action_list=list(range(action_space_size)),
                    actors=actors,
                    critics=critics,
                    old_actors=old_actors,
                    old_critics=old_critics,
                    args=args,
                    device=device)
    print("learn")
    maddpg.learn()
def play():
    env = UnityEnvironment(file_name='./Tennis.app')

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=False)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    # create agent
    maddpg_agent = MADDPG(state_size=state_size, action_size=action_size, seed=0)

    # load weights
    for i, agent in enumerate(maddpg_agent.maddpg_agent):
        agent.policy_local.load_state_dict(
            torch.load('models/checkpoint_actor_{}.pth'.format(i)))
    # reverse weights so agent 1 is on the left instead
    # for i, agent in enumerate(reversed(maddpg_agent.maddpg_agent)):
    #     agent.policy_local.load_state_dict(torch.load('models/checkpoint_actor_{}.pth'.format(i)))

    env_info = env.reset(train_mode=False)[brain_name]       # reset the environment
    states = env_info.vector_observations                    # get the current state (for each agent)
    scores = np.zeros(num_agents)                            # initialize the score (for each agent)
    while True:
        actions = maddpg_agent.act(states, add_noise=False)  # select an action (for each agent)
        env_info = env.step(actions)[brain_name]             # send all actions to the environment
        next_states = env_info.vector_observations           # get next state (for each agent)
        rewards = env_info.rewards                           # get reward (for each agent)
        dones = env_info.local_done                          # see if episode finished
        scores += rewards                                    # update the score (for each agent)
        states = next_states                                 # roll over states to next time step
        if np.any(dones):                                    # exit loop if episode finished
            break

    print('Agent 0 score this episode: {}'.format(scores[0]))
    print('Agent 1 score this episode: {}'.format(scores[1]))
    env.close()
def main(arglist): ACTORS = 1 env = EnvWrapper(arglist.scenario, ACTORS, arglist.saved_episode) if arglist.eval: current_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime()) writer = SummaryWriter(log_dir='./logs/' + current_time + '-' + arglist.scenario) maddpg_wrapper = MADDPG(ACTORS) maddpg_wrapper.create_agents(env, arglist) j = 0 for episode in range(arglist.max_episode): obs = env.reset() terminal = False maddpg_wrapper.reset() total_reward = [0 for i in maddpg_wrapper.workers] step = 0 while not terminal and step < 25: if not arglist.eval: env.render(0) time.sleep(0.03) actions = maddpg_wrapper.take_actions(obs) obs2, reward, done = env.step(actions) for actor in range(ACTORS): for i, rew in enumerate(reward[actor]): total_reward[i] += rew j += ACTORS #terminal = all(done) if arglist.eval: maddpg_wrapper.update(j, ACTORS, actions, reward, obs, obs2, done) obs = obs2 step += 1 if arglist.eval and episode % arglist.saved_episode == 0 and episode > 0: maddpg_wrapper.save(episode) if arglist.eval: for worker, ep_ave_max in zip(maddpg_wrapper.workers, maddpg_wrapper.ep_ave_max_q_value): print(worker.pos, ' => average_max_q: ', ep_ave_max / float(step), ' Reward: ', total_reward[worker.pos], ' Episode: ', episode) writer.add_scalar( str(worker.pos) + '/Average_max_q', ep_ave_max / float(step), episode) writer.add_scalar( str(worker.pos) + '/Reward Agent', total_reward[worker.pos], episode) env.close()
def maddpg(n_episodes=5000):
    # PARAMETERS:
    noise = 2
    batch_size = 256
    update_every = 1

    agent = MADDPG(discount_factor=0.99, tau=0.02, batch_size=batch_size)
    buff = ReplayBuffer(10000)
    scores_window = deque(maxlen=100)
    scores = []

    for i_episode in range(n_episodes):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
        state = torch.from_numpy(np.array(state)).float().unsqueeze(0)
        score = np.zeros(num_agents)
        t = 0
        while True:
            actions = agent.act(state, noise)
            noise *= 0.9999
            actions_array = torch.stack(actions).detach().numpy()
            env_info = env.step(actions_array)[brain_name]
            next_state = env_info.vector_observations
            next_state = torch.from_numpy(np.array(next_state)).float().unsqueeze(0)
            reward = np.array(env_info.rewards).reshape(1, -1)
            dones = np.array(env_info.local_done).reshape(1, -1)
            actions_array = actions_array.reshape(1, -1)
            buff.push((state, actions_array, reward, next_state, dones))

            if len(buff) > batch_size and t % update_every == 0:
                for i in range(2):
                    samples = buff.sample(batch_size)
                    agent.update(samples, i, noise)
                agent.update_targets()

            t += 1
            score += reward[0]
            state = next_state
            if np.any(dones):
                break

        scores_window.append(np.max(score))
        scores.append(np.max(score))
        print('\rEpisode {}\tAverage Score: {:.3f}'.format(i_episode, np.mean(scores_window)), end="")

        if i_episode % 100 == 0:
            for i in range(2):
                torch.save(agent.maddpg_agent[i].actor.state_dict(), 'bin/checkpoint_actor{}.pth'.format(i))
                torch.save(agent.maddpg_agent[i].critic.state_dict(), 'bin/checkpoint_critic{}.pth'.format(i))

        if np.mean(scores_window) >= 0.5:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'.format(
                i_episode - 100, np.mean(scores_window)))
            for i in range(2):
                torch.save(agent.maddpg_agent[i].actor.state_dict(), 'bin/actor{}_finished.pth'.format(i))
                torch.save(agent.maddpg_agent[i].critic.state_dict(), 'bin/critic{}_finished.pth'.format(i))
            break
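# The snippet above decays a scalar `noise` amplitude that is passed into agent.act(); the
# noise process itself lives inside the agents and is not shown here. Below is a minimal
# Ornstein-Uhlenbeck sketch, a common choice for DDPG-style exploration (other snippets in
# this collection mention "amplitude of OU noise"). The OUNoise class and its parameters are
# illustrative assumptions, not this repository's actual implementation.
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # restart the process from its long-run mean at the beginning of an episode
        self.state = np.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): a mean-reverting random step
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state

# Usage sketch: scale the OU sample by the decaying amplitude before adding it to the action, e.g.
# action = np.clip(actor_output + noise * ou.sample(), -1, 1)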
def train_agents(n_episodes=10000, t_max=1000): env = UnityEnvironment(file_name="envs/Tennis.app") brain_name = env.brain_names[0] brain = env.brains[brain_name] env_info = env.reset(train_mode=True)[brain_name] seeding(seed=42) state_size = env_info.vector_observations.shape[1] action_size = brain.vector_action_space_size num_agents = env_info.vector_observations.shape[0] maddpg = MADDPG(state_size=state_size, action_size=action_size, num_agents=num_agents) scores_deque = deque(maxlen=100) scores_list = [] for i_episode in range(1, n_episodes + 1): env_info = env.reset(train_mode=True)[brain_name] states = env_info.vector_observations scores = np.zeros(num_agents) for _ in range(t_max): actions = maddpg.act(states) env_info = env.step(actions)[brain_name] next_states = env_info.vector_observations rewards = env_info.rewards dones = env_info.local_done scores += rewards maddpg.step(states, actions, rewards, next_states, dones) states = next_states if np.any(dones): break scores_deque.append(np.max(scores)) scores_list.append(np.max(scores)) print(f'\rEpisode {i_episode}\tAverage Score: {np.mean(scores_deque)}', end="") if i_episode % PRINT_EVERY == 0: print( f'\rEpisode {i_episode}\tAverage Score: {np.mean(scores_deque) : .3f}' ) if np.mean(scores_deque) >= 2.0 and len(scores_deque) >= 100: for i, agent in enumerate(maddpg.agents): torch.save(agent.actor_local.state_dict(), f'models/checkpoint_actor_local_{i}.pth') torch.save(agent.critic_local.state_dict(), f'models/checkpoint_critic_local_{i}.pth') print( f'\nSaved Model: Episode {i_episode}\tAverage Score: {np.mean(scores_deque) : .3f}' ) break return scores_list
def test_ddpg(env, episodes=10):
    # reset
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=False)[brain_name]

    # action and state size
    action_size = brain.vector_action_space_size
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('State size:', state_size)
    print('Action size: ', action_size)
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # load MADDPG agent
    maddpg = MADDPG(state_size, action_size, random_seed=0)
    for agent in maddpg.ddpg_agents:
        agent.actor_local.load_state_dict(
            torch.load('actor_agent_' + str(agent.id) + '.pth'))
        agent.critic_local.load_state_dict(
            torch.load('critic_agent_' + str(agent.id) + '.pth'))

    scores = []
    for n in range(episodes):
        # prepare for the current episode
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        score = 0
        dones = [False] * num_agents
        while not np.any(dones):
            actions = maddpg.act(states, add_noise=False)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards                  # get reward (for each agent)
            dones = env_info.local_done                 # see if episode finished
            states = next_states
            score += np.max(rewards)
        scores.append(score)

    print('Average score over {} episodes: {:.4f}'.format(episodes, np.mean(scores)))
    return scores
# size of each action action_size = brain.vector_action_space_size print('Size of each action:', action_size) # examine the state space states = env_info.vector_observations state_size = states.shape[1] print('There are {} agents. Each observes a state with length: {}'.format( states.shape[0], state_size)) print('The state for the first agent looks like:', states[0]) from maddpg import MADDPG from collections import deque import torch agent = MADDPG(24, 2, 0) env_info = env.reset(train_mode=True)[brain_name] env_info.vector_observations.shape def maddpg(max_episodes=2000, print_every=10): scores_deque = deque(maxlen=100) scores = [] for i_episode in range(1, max_episodes + 1): env_info = env.reset(train_mode=True)[brain_name] states = env_info.vector_observations agent.reset() score = 0 while True: actions = agent.act(states)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

full_action_size = num_agents * action_size
full_state_size = num_agents * state_size

maddpg = MADDPG(num_agents, state_size, action_size, buffer_size=0)
maddpg.load(agent_id=1)

for i_episode in range(10):
    env_info = env.reset(train_mode=False)[brain_name]
    i_step = 0
    scores = np.zeros(num_agents)                    # initialize the score (for each agent)
    while True:
        actions = maddpg.act(states)
        env_info = env.step(actions)[brain_name]     # send all actions to the environment
        rewards = env_info.rewards                   # get reward (for each agent)
        next_states = env_info.vector_observations   # get next state (for each agent)
        dones = env_info.local_done                  # see if episode finished
        scores += rewards                            # update the score (for each agent)
env_info = env.reset(train_mode=True)[brain_name]
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
states = env_info.vector_observations
state_size = states.shape[1]

import torch
import pickle
import logging
from maddpg import MADDPG
from collections import deque
import matplotlib.pyplot as plt
import time, os

maddpg = MADDPG(24, 2, 2, 1976)
scores_max_hist = []
scores_mean_hist = []

logger = logging.getLogger(__name__)
f_handle = logging.FileHandler("Log_File.txt")
f_format = logging.Formatter('%(levelname)s: %(asctime)s %(message)s')
f_handle.setFormatter(f_format)
f_handle.setLevel(logging.INFO)
logger.addHandler(f_handle)

def maddpg_train(n_episodes=2500):
def main(): seeding() # number of parallel agents parallel_envs = 4 # number of training episodes. # change this to higher number to experiment. say 30000. number_of_episodes = 1000 episode_length = 80 batchsize = 1000 # how many episodes to save policy and gif save_interval = 1000 t = 0 # amplitude of OU noise # this slowly decreases to 0 noise = 2 noise_reduction = 0.9999 # how many episodes before update episode_per_update = 2 * parallel_envs log_path = os.getcwd() + '/log' model_dir = os.getcwd() + '/model_dir' os.makedirs(model_dir, exist_ok=True) torch.set_num_threads(parallel_envs) env = envs.make_parallel_env(parallel_envs) # keep 5000 episodes worth of replay buffer = ReplayBuffer(int(5000 * episode_length)) # initialize policy and critic maddpg = MADDPG() logger = SummaryWriter(log_dir = log_path) agent0_reward = [] agent1_reward = [] agent2_reward = [] # training loop # show progressbar import progressbar as pb widget = ['episode: ', pb.Counter(), '/', str(number_of_episodes), ' ', pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' ' ] timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start() # use keep_awake to keep workspace from disconnecting # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)): for episode in range(0, number_of_episodes, parallel_envs): timer.update(episode) reward_this_episode = np.zeros((parallel_envs, 3)) all_obs = env.reset() obs, obs_full = transpose_list(all_obs) # for calculating rewards for this particular episode - addition of all time steps # save info or not save_info = ((episode) % save_interval < parallel_envs or episode==number_of_episodes-parallel_envs) frames = [] tmax = 0 if save_info: frames.append(env.render('rgb_array'))
from unityagents import UnityEnvironment from maddpg import MADDPG from ddpg import ReplayBuffer import numpy as np import torch import matplotlib.pyplot as plt from collections import deque env = UnityEnvironment(file_name="Tennis.app") brain_name = env.brain_names[0] brain = env.brains[brain_name] agent = MADDPG(discount_factor=0.99, tau=0.02, batch_size=256) agent.maddpg_agent[0].actor.load_state_dict( torch.load('bin/actor0_finished.pth', map_location=lambda storage, loc: storage)) agent.maddpg_agent[1].actor.load_state_dict( torch.load('bin/actor1_finished.pth', map_location=lambda storage, loc: storage)) env_info = env.reset(train_mode=False)[brain_name] state = env_info.vector_observations state = torch.from_numpy(np.array(state)).float().unsqueeze(0) score = np.zeros(2) while True: actions = agent.act(state, 0) actions_array = torch.stack(actions).detach().numpy() env_info = env.step(actions_array)[brain_name]
def main(): seeding() # number of parallel agents parallel_envs = 4 # number of training episodes. # change this to higher number to experiment. say 30000. number_of_episodes = 1000 episode_length = 80 batchsize = 1000 # how many episodes to save policy and gif save_interval = 1000 t = 0 # amplitude of OU noise # this slowly decreases to 0 noise = 2 noise_reduction = 0.9999 # how many episodes before update episode_per_update = 2 * parallel_envs log_path = os.getcwd() + "/log" model_dir = os.getcwd() + "/model_dir" os.makedirs(model_dir, exist_ok=True) torch.set_num_threads(parallel_envs) env = envs.make_parallel_env(parallel_envs) # keep 5000 episodes worth of replay buffer = ReplayBuffer(int(5000 * episode_length)) # initialize policy and critic maddpg = MADDPG() logger = SummaryWriter(log_dir=log_path) agent0_reward = [] agent1_reward = [] agent2_reward = [] # training loop # show progressbar widget = [ 'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ', pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' ' ] timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start() # use keep_awake to keep workspace from disconnecting for episode in range(0, number_of_episodes + parallel_envs, parallel_envs): timer.update(episode) reward_this_episode = np.zeros((parallel_envs, 3)) all_obs = env.reset() obs, obs_full = transpose_list(all_obs) # for calculating rewards for this particular episode - addition of all time steps # save info or not save_info = (episode % save_interval < parallel_envs) frames = [] tmax = 0 if save_info: frames.append(env.render('rgb_array')) for episode_t in range(episode_length): t += parallel_envs # explore = only explore for a certain number of episodes # action input needs to be transposed actions = maddpg.act(transpose_to_tensor(obs), noise=noise) noise *= noise_reduction actions_array = torch.stack(actions).detach().numpy() # transpose the list of list # flip the first two indices # input to step requires the first index to correspond to number of parallel agents actions_for_env = np.rollaxis(actions_array, 1) # step forward one frame next_obs, next_obs_full, rewards, dones, info = env.step( actions_for_env) # add data to buffer transition = (obs, obs_full, actions_for_env, rewards, next_obs, next_obs_full, dones) buffer.push(transition) reward_this_episode += rewards obs, obs_full = next_obs, next_obs_full # save gif frame if save_info: frames.append(env.render('rgb_array')) tmax += 1 # update once after every episode_per_update if len(buffer ) > batchsize and episode % episode_per_update < parallel_envs: for a_i in range(3): samples = buffer.sample(batchsize) maddpg.update(samples, a_i, logger) maddpg.update_targets( ) # soft update the target network towards the actual networks for i in range(parallel_envs): agent0_reward.append(reward_this_episode[i, 0]) agent1_reward.append(reward_this_episode[i, 1]) agent2_reward.append(reward_this_episode[i, 2]) if episode % 100 == 0 or episode == number_of_episodes - 1: avg_rewards = [ np.mean(agent0_reward), np.mean(agent1_reward), np.mean(agent2_reward) ] agent0_reward = [] agent1_reward = [] agent2_reward = [] for a_i, avg_rew in enumerate(avg_rewards): logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode) # saving model save_dict_list = [] if save_info: for i in range(3): save_dict = { 'actor_params': maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params': 
maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict() } save_dict_list.append(save_dict) torch.save( save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode))) # save gif files imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)), frames, duration=.04) env.close() logger.close() timer.finish()
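# The training loop above saves a list of per-agent dicts (actor/critic weights plus optimizer
# states). A minimal loading sketch for that format, assuming the same maddpg.maddpg_agent[i]
# attribute names used above; the helper name load_checkpoint and the example path are
# illustrative, not part of the original project.
import torch

def load_checkpoint(maddpg, path):
    save_dict_list = torch.load(path, map_location='cpu')
    for i, save_dict in enumerate(save_dict_list):
        maddpg.maddpg_agent[i].actor.load_state_dict(save_dict['actor_params'])
        maddpg.maddpg_agent[i].actor_optimizer.load_state_dict(save_dict['actor_optim_params'])
        maddpg.maddpg_agent[i].critic.load_state_dict(save_dict['critic_params'])
        maddpg.maddpg_agent[i].critic_optimizer.load_state_dict(save_dict['critic_optim_params'])

# e.g. load_checkpoint(maddpg, os.path.join(model_dir, 'episode-1000.pt'))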
def update(): if ALGORITHM == 'maddpg': ddpg = MADDPG(avs.n_actions, avs.n_features, 1, 'maddpg model', RETRAIN) elif ALGORITHM == 'ddpg': ddpg = DDPG(avs.n_actions, avs.n_features, 1, 'ddpg model', RETRAIN) else: ddpg = DDPG(avs.n_actions, avs.n_features, 1, 'ddpg model', RETRAIN) t1 = time.time() rewards1 = 0 rewards2 = 0 var = VAR collision = 0 avgreward1 = [] avgreward2 = [] collision_percentage = [] for i in range(MAX_EPISODES): s1, s2 = avs.reset() ep_reward1 = 0 ep_reward2 = 0 if i % 100000 == 0 and i > IMITATION_EPISODE: plot(avgreward1, avgreward2, collision_percentage, i) for j in range(MAX_EP_STEPS): if RENDER: avs.render() # Add exploration noise if i < IMITATION_EPISODE or i % 4 == 0: a1 = imitation(avs.agent1, avs.agent2, avs.target1) a2 = imitation(avs.agent2, avs.agent1, avs.target2) else: # add randomness to action selection for exploration a1 = ddpg.choose_action(s1) a1 = [ np.clip(np.random.normal(a1[0], var), -1, 1), np.clip(np.random.normal(a1[1], var), -1, 1) ] a2 = ddpg.choose_action(s2) a2 = [ np.clip(np.random.normal(a2[0], var), -1, 1), np.clip(np.random.normal(a2[1], var), -1, 1) ] # a2 = imitation(avs.agent2, avs.agent1, avs.target2) if DEBUG: time.sleep(0.1) s_1, r1, s_2, r2, done, info = avs.step(a1, a2) if ALGORITHM == 'ddpg': ddpg.store_transition(s1, a1, r1, s_1) ddpg.store_transition(s2, a2, r2, s_2) else: ddpg.store_transition(s1, s2, a1, a2, r1, s_1, s_2) ddpg.store_transition(s2, s1, a2, a1, r2, s_2, s_1) s1 = s_1 s2 = s_2 ep_reward1 += r1 ep_reward2 += r2 if j == MAX_EP_STEPS - 1 or done: print("pt:", ddpg.pointer) print('Episode:', i, 'Step:', j, ' Reward: %i' % int(ep_reward1), int(ep_reward2), 'Explore: %.2f' % var) if i >= IMITATION_EPISODE: rewards1 += ep_reward1 rewards2 += ep_reward2 if r1 < -100: collision += 1 if (i + 1) % 100 == 0: avgreward1.append(rewards1 / 100) avgreward2.append(rewards2 / 100) collision_percentage.append(collision) rewards1 = 0 rewards2 = 0 collision = 0 break if ddpg.pointer > MEMORY_CAPACITY: ddpg.learn() ddpg.learn() if var > MIN_VAR and i > IMITATION_EPISODE: var *= DECAY # decay the action randomness if i % 4 != 0 and ep_reward1 > 100 and ep_reward2 > 100 and i > IMITATION_EPISODE: ddpg.save(i) print('Running time: ', time.time() - t1)
print('Size of each action:', action_size) # examine the state space states = env_info.vector_observations state_size = states.shape[1] print('There are {} agents. Each observes a state with length: {}'.format( states.shape[0], state_size)) print('The state for the first agent looks like:', states[0]) full_action_size = num_agents * action_size full_state_size = num_agents * state_size writer = SummaryWriter(log_dir="logs/train", flush_secs=30) maddpg = MADDPG(num_agents, state_size, action_size, buffer_size=BUFFER_SIZE, writer=writer) scores_list, avg_scores_list = maddpg_training(maddpg, writer) fig = plt.figure() ax = fig.add_subplot(111) plt.plot(np.arange(1, len(scores_list) + 1), scores_list) plt.plot(np.arange(1, len(avg_scores_list) + 1), avg_scores_list) plt.ylabel('Score') plt.xlabel('Episode #') plt.show() plt.savefig("Scores.png") writer.close()
if __name__ == "__main__": # Configuration n_episodes = 1 checkpoint = "./checkpoints/checkpoint{}.pth" # Unitiy environment env = UnityEnvironment("./Tennis_Linux/Tennis.x86_64") # Agent agent = TennisMultiAgent(state_size=24, action_size=2, n_agents=2) agent.load(checkpoint) # DDPG maddpg = MADDPG(env=env, agent=agent) scores = maddpg.test(n_episodes=n_episodes) # Close the environment env.close() if n_episodes > 1: # Show results print(scores) print("Average score of {} episodes: {:.2f}".format( n_episodes, np.mean(scores))) # Plot scores fig, ax = plt.subplots(figsize=(10, 6)) ax.plot(np.linspace(1, n_episodes + 1, n_episodes), scores) ax.set_xlabel("Episodes")
print('The state for the first agent looks like:', states[0]) # config settings config = Config() config.update_every = 1 config.batch_size = 512 config.buffer_size = int(1e6) config.discount = 0.99 config.tau = 0.2 config.seed = 2 config.lr_actor = 1e-4 config.lr_critic = 1e-4 config.action_size = action_size config.state_size = state_size config.num_agents = num_agents ma = MADDPG(config) def train(n_episode=30000): """ Function to train the agent """ scores = [] scores_window = deque(maxlen=100) for i_episode in range(n_episode): env_info = env.reset(train_mode=True)[brain_name] states = env_info.vector_observations ma.reset() score = np.zeros(num_agents) while True: actions = ma.act(states)
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to higher number to experiment. say 30000.
    number_of_episodes = 10000
    episode_length = 100
    batchsize = 1000
    # how many episodes between saving the policy and gif
    save_interval = 5000
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    # this creates the list of parallel environments
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    # this creates a list of models, one element per agent in the simulation:
    # [agent_one_ddpg, agent_two_ddpg, ...]
    # agent_one_ddpg contains that agent's actor and critic models, e.g. agent_one_ddpg.actor, agent_one_ddpg.critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)

    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep workspace from disconnecting
    # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
    # notice we jump forward by the number of parallel environments
    for episode in range(0, number_of_episodes, parallel_envs):
        timer.update(episode)

        # one reward entry per agent for each parallel environment
        reward_this_episode = np.zeros((parallel_envs, 3))

        # obs is the observation space of all three agents in the 4 parallel envs;
        # for the Physical Deception environment with three agents it is of dimension 4x3x14.
        # obs_full is the world state irrespective of the agents and its dimension is 4x14.
        # all_obs = array(number of environments 4, 2 elements)
        # element 0: a list that contains 3 arrays, the state of each agent; each state is of size 14
        # element 1: the global state from the perspective of the target/green for its environment; contains 14 elements
        all_obs = env.reset()
        # obs: a list with 1 element per environment; each element contains a list of 3 arrays,
        # each array being the state of one agent in that environment.
        # obs_full: the god's-eye view of each environment.
        # So it is a list with 1 element per environment;
        # each element contains an array of 14 values which is the global state of that environment.
        obs, obs_full = transpose_list(all_obs)
        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            # we finish the episode before sampling the buffer for training
            # t jumps forward by a multiple of the environments
            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed:
            # transpose_to_tensor(obs) changes the data to each agent's point of view.
            # since we have 4 environments, there are 4 copies of agent 1, agent 2, and agent 3;
            # each agent has a state in each environment, so agent 1's states across the 4 environments form a 4x14 tensor.
            # transpose_to_tensor(obs) is a list of 3 elements, one per agent;
            # pick element 1: it is a 4x14 array of that agent's observations across the 4 environments.
            # maddpg.act loops over this list and passes each element to the corresponding agent's actor model
            # to generate an action from each agent actor.
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            # there are 4 actions per agent and 3 agents, 12 in total; each action has 2 elements (force in the x, y directions)
            # actions_array is a tensor of shape (3 agents, 4 envs, 2 actions)
            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of lists: flip the first two indices, since the input to step
            # requires the first index to correspond to the number of parallel environments;
            # the shape of actions_for_env is (4 envs, 3 agents, 2 actions)
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            # obs is the observation space of all three agents in the 4 parallel envs;
            # for the Physical Deception environment with three agents it is of dimension 4x3x14.
            # obs_full is the world state irrespective of the agents and its dimension is 4x14.
            # To gain more understanding, please see the code in the multiagent folder.
            next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards,
                          next_obs, next_obs_full, dones)
            buffer.push(transition)

            reward_this_episode += rewards
            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                # although samples are drawn randomly, each sample contains all 3 agents' data,
                # and we know which rewards and actions belong to which agent.
                # samples is a list of 7 elements: obs, obs_full, action, reward, next_obs, next_obs_full, done;
                # each element, say samples[0], is a list of 3 elements, one per agent, and each agent's
                # element holds the corresponding values (for obs, a vector with 14 values).
                # So asking for 2 samples, for example, returns 2 samples each containing all 3 agents' states and rewards.
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            maddpg.update_targets()  # soft update the target networks towards the actual networks

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
                            frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
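# The helpers transpose_list and transpose_to_tensor used above are not shown in this snippet.
# A minimal sketch consistent with the shape comments above (regrouping per-environment data
# into per-agent groups); the actual project utilities may differ in detail.
import torch

def transpose_list(mylist):
    # [[env0_item0, env0_item1], [env1_item0, env1_item1], ...] -> [[env0_item0, env1_item0, ...], [env0_item1, env1_item1, ...]]
    return list(map(list, zip(*mylist)))

def transpose_to_tensor(input_list):
    # group the per-environment observations by agent and convert each group into a float tensor,
    # so element i is a (n_envs, obs_dim) tensor for agent i
    make_tensor = lambda x: torch.tensor(x, dtype=torch.float)
    return list(map(make_tensor, zip(*input_list)))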
def main(): env_info = env.reset(train_mode=False)[brain_name] num_agents = len(env_info.agents) print('Number of agents:', num_agents) # size of each action action_size = brain.vector_action_space_size print('Size of each action:', action_size) # examine the state space states = env_info.vector_observations state_size = states.shape[1] seeding() # number of parallel agents #parallel_envs = num_agents # number of training episodes. # change this to higher number to experiment. say 30000. number_of_episodes = 10000 update_actor_after = 100 update_actor_every = 2 episode_length = 100 batchsize = 100 # how many episodes to save policy and gif save_interval = 1000 t = 0 LR_ACTOR = 1e-5 LR_CRITIC = 3e-3 # amplitude of OU noise # this slowly decreases to 0 noise = 1.0 noise_reduction = 0.999999 # how many episodes before update episode_per_update = 1 no_of_updates_perTime = 1 log_path = os.getcwd() + "/log" model_dir = os.getcwd() + "/model_dir" os.makedirs(model_dir, exist_ok=True) #torch.set_num_threads(parallel_envs) #env = envs.make_parallel_env(parallel_envs) # keep 5000 episodes worth of replay buffer = ReplayBuffer(int(10 * episode_length)) # initialize policy and critic maddpg = MADDPG(lr_actor=LR_ACTOR, lr_critic=LR_CRITIC) #logger = SummaryWriter(log_dir=log_path) agent0_reward = [] agent1_reward = [] #agent2_reward = [] # training loop # show progressbar import progressbar as pb widget = [ 'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ', pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' ' ] timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start() # use keep_awake to keep workspace from disconnecting for episode in range(0, number_of_episodes): timer.update(episode) env_info = env.reset( train_mode=False)[brain_name] # reset the environment states = env_info.vector_observations # get the current state (for each agent) scores = np.zeros(num_agents) # initialize the score (for each agent) reward_this_episode = np.zeros((1, num_agents)) #all_obs = env.reset() # obs = states obs_full = np.concatenate((states[0], states[1])) #for calculating rewards for this particular episode - addition of all time steps # save info or not save_info = ((episode) % save_interval < 1 or episode == number_of_episodes - 1) tmax = 0 #resetting noise for i in range(num_agents): maddpg.maddpg_agent[i].noise.reset() for episode_t in range(episode_length): t += 1 update_act = True if (episode > update_actor_after or episode % update_actor_every == 0) else False # explore = only explore for a certain number of episodes # action input needs to be transposed actions = maddpg.act(transpose_to_tensorAsitis(obs), noise=noise, batch=False) noise *= noise_reduction actions_array = torch.stack(actions).cpu().detach().numpy() # transpose the list of list # flip the first two indices # input to step requires the first index to correspond to number of parallel agents actions_for_env = np.rollaxis(actions_array, 1) # step forward one frame env_info = env.step(actions_for_env)[brain_name] next_states = env_info.vector_observations # get next state (for each agent) rewards = env_info.rewards # get reward (for each agent) dones = env_info.local_done # see if episode finished scores += env_info.rewards rewards_for_env = np.hstack(rewards) obs = states obs_full = np.concatenate((states[0], states[1])) next_obs = next_states next_obs_full = np.concatenate((next_states[0], next_states[1])) # add data to buffer transition = (np.array([obs]), np.array([obs_full]), 
np.array([actions_for_env]), np.array([rewards_for_env]), np.array([next_obs]), np.array([next_obs_full]), np.array([dones], dtype='float')) buffer.push(transition) reward_this_episode += rewards obs, obs_full = next_obs, next_obs_full # update once after every episode_per_update if len(buffer) > batchsize and episode % episode_per_update == 0: for _ in range(no_of_updates_perTime): for a_i in range(num_agents): samples = buffer.sample(batchsize) #updating the weights of the n/w maddpg.update(samples, a_i, update_actor=update_act) maddpg.update_targets( ) #soft update the target network towards the actual networks if np.any(dones): # if the episode is done the loop is break to the next episode break for i in range(num_agents): agent0_reward.append(reward_this_episode[0, 0]) agent1_reward.append(reward_this_episode[0, 1]) if episode % 100 == 0 or episode == number_of_episodes - 1: avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward)] agent0_reward = [] agent1_reward = [] for a_i, avg_rew in enumerate(avg_rewards): #logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode) print('agent%i/mean_episode_rewards' % a_i, avg_rew, episode) #saving model save_dict_list = [] if save_info: for i in range(num_agents): save_dict = { 'actor_params': maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params': maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict() } save_dict_list.append(save_dict) torch.save( save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode))) # save gif files #imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)), #frames, duration=.04) timer.finish()
def main():
    seeding()
    # number of parallel agents
    number_of_agents = 2
    # number of training episodes.
    # change this to higher number to experiment. say 30000.
    number_of_episodes = 3000
    batchsize = 128

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 1
    noise_reduction = 0.9999

    tau = 1e-3    # soft update factor
    gamma = 0.99  # reward discount factor
    print_every = 100

    # how many episodes before update
    episode_per_update = 2

    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)
    result_dir = os.getcwd() + "/result_dir"
    os.makedirs(result_dir, exist_ok=True)

    # do we need to set multi-thread for this env?
    torch.set_num_threads(number_of_agents * 2)
    env = TennisEnv()

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(1e5))

    num_agents, num_states, num_actions = env.get_shapes()
    # initialize policy and critic
    maddpg = MADDPG(num_agents, num_states, num_actions, discount_factor=gamma, tau=tau)

    # training loop
    scores_window = deque(maxlen=100)
    ep_scores = []
    agent0_reward = []
    agent1_reward = []

    for episode in range(0, number_of_episodes):
        reward_this_episode = np.zeros((1, number_of_agents))
        states, states_full, env_info = env.reset()

        for agent in maddpg.maddpg_agent:
            agent.noise.reset()

        while True:
            actions = maddpg.act(torch.tensor(states, dtype=torch.float), noise=noise)
            noise *= noise_reduction
            actions_for_env = torch.stack(actions).detach().numpy()

            # step forward one frame
            next_states, next_states_full, rewards, dones, info = env.step(actions_for_env)

            # add data to buffer
            buffer.push(states, states_full, actions_for_env, rewards,
                        next_states, next_states_full, dones)

            reward_this_episode += rewards
            states = np.copy(next_states)
            states_full = np.copy(next_states_full)

            # learn from a random batch once the buffer holds enough samples
            if len(buffer) > batchsize:
                for a_i in range(number_of_agents):
                    samples = buffer.sample(batchsize)
                    maddpg.update(samples, a_i)

            if np.any(dones):
                break

        agent0_reward.append(reward_this_episode[0, 0])
        agent1_reward.append(reward_this_episode[0, 1])

        avg_rewards = max(reward_this_episode[0, 0], reward_this_episode[0, 1])
        scores_window.append(avg_rewards)
        cur_score = np.mean(scores_window)
        ep_scores.append(cur_score)

        save_dict_list = []
        if episode % print_every == 0.0 or avg_rewards > 2.5:
            print('\rEpisode: {}, Average score: {:.5f}, noise: {:.5f}'.format(episode, cur_score, noise))
            if avg_rewards > 2.5:
                for i in range(number_of_agents):
                    save_dict = {'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                                 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                                 'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                                 'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()}
                    save_dict_list.append(save_dict)
                torch.save(save_dict_list,
                           os.path.join(model_dir, 'episode-{}-{}.pt'.format(episode, cur_score)))
                print('model saved')
                break

    env.close()

    # print('main-ep_scores: ', ep_scores)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(1, len(ep_scores) + 1), ep_scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    fig.savefig(result_dir + '/score_plot.png')
# ############################################################################## # ENVIRONMENT # ############################################################################## env = UnityEnvironment(file_name=ENV_FILE, seed=SEED) # get the default brain brain_name = env.brain_names[0] brain = env.brains[brain_name] # ############################################################################## # AGENT # ############################################################################## # INITIALIZE AGENT, AND LOAD WEIGHTS FROM BEST SNAPSHOT maddpg = MADDPG( actor_layer_sizes=ACTOR_LAYER_SIZES, critic_layer_sizes=CRITIC_LAYER_SIZES, clamp_actions=CLAMP_ACTIONS, logger=None, ) maddpg.load_model(os.path.join(snapshots_dir, "best_model.snapshot")) # ############################################################################## # INTERACT WITH ENVIRONMENT # ############################################################################## for episode_i in range(1, N_EPISODES + 1): print("{dec}\nEpisode {i}\n{dec}\n".format(dec="=" * 60, i=episode_i)) # INITIALIZE FOR NEW EPISODE rewards_this_episode = np.zeros((N_AGENTS, )) env_info = env.reset(train_mode=False)[brain_name] states = process_agent_states(env_info.vector_observations) global_state = process_gobal_state(env_info.vector_observations)
print( 'Episode {}\tAverage Score: {:.3f} MaxReward: {:.3f} Buffer : {}/{} Noise: {:.3f} Timestep: {}.' .format(episode, avg_score, max_reward, len(agent.memory), BUFFER_SIZE, agent.epsilon, agent.timestep_counter)) if avg_score >= GOAL: print( '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}' .format(episode, avg_score)) agent.checkpoint() break return global_scores, averaged_scores # Init the Tennis environment and get agents, state and action info env, brain_name, n_agents, state_size, action_size = init_environment( UNITY_EXE_PATH) agent = MADDPG(state_size=state_size, action_size=action_size, n_agents=n_agents, random_seed=89) # Train the agent and get the results scores, averages = train() # Plot Statistics (Global scores and averaged scores) plt.subplot(2, 1, 2) plt.plot(np.arange(1, len(scores) + 1), averages) plt.ylabel('Tennis Environment Average Score') plt.xlabel('Episode #') plt.show()
print('\nExample state for a single agent:\n', states[0]) agent_state_size = process_agent_states(states).shape[1] global_state_size = process_gobal_state(states).shape[0] # ############################################################################## # AGENT # ############################################################################## # Create Multi agent Actor-Critic Model maddpg = MADDPG( actor_layer_sizes=ACTOR_LAYER_SIZES, critic_layer_sizes=CRITIC_LAYER_SIZES, discount_factor=DISCOUNT_FACTOR, tau=TAU, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC, gradient_clipping=GRADIENT_CLIPPING, clamp_actions=CLAMP_ACTIONS, logger=logger, log_losses=True, log_layers=False, log_weights=False, ) # ############################################################################## # TRAIN # ############################################################################## buffer = ReplayBuffer(int(BUFFER_SIZE), seed=SEED) n_episodes = 10000 best_rolling_mean_score = -np.inf hard_noise_reigime = True
#scenario = 'simple' scenario = 'simple_adversary' env = make_env(scenario) n_agents = env.n actor_dims = [] for i in range(n_agents): actor_dims.append(env.observation_space[i].shape[0]) critic_dims = sum(actor_dims) # action space is a list of arrays, assume each agent has same action space n_actions = env.action_space[0].n maddpg_agents = MADDPG(actor_dims, critic_dims, n_agents, n_actions, fc1=64, fc2=64, alpha=0.01, beta=0.01, scenario=scenario, chkpt_dir='tmp/maddpg/') memory = MultiAgentReplayBuffer(1000000, critic_dims, actor_dims, n_actions, n_agents, batch_size=1024) PRINT_INTERVAL = 500 N_GAMES = 50000 MAX_STEPS = 25
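# In the setup above the critic input size is the sum of all agents' observation sizes
# (critic_dims = sum(actor_dims)): in MADDPG each critic is centralized and scores the joint
# state together with every agent's action. A minimal sketch of such a critic network;
# the class name, layer names, and sizes are illustrative, not the repository's actual CriticNetwork.
import torch
import torch.nn as nn

class CentralizedCritic(nn.Module):
    def __init__(self, critic_dims, n_agents, n_actions, fc1=64, fc2=64):
        super().__init__()
        # input: concatenation of all agents' observations and all agents' actions
        self.net = nn.Sequential(
            nn.Linear(critic_dims + n_agents * n_actions, fc1), nn.ReLU(),
            nn.Linear(fc1, fc2), nn.ReLU(),
            nn.Linear(fc2, 1))

    def forward(self, joint_obs, joint_actions):
        # joint_obs: (batch, critic_dims), joint_actions: (batch, n_agents * n_actions)
        return self.net(torch.cat([joint_obs, joint_actions], dim=1))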
num_agents = len(env_info.agents) print('Number of agents:', num_agents) # size of each action action_size = brain.vector_action_space_size print('Size of each action:', action_size) # examine the state space states = env_info.vector_observations state_size = states.shape[1] print('There are {} agents. Each observes a state with length: {}'.format( states.shape[0], state_size)) print('The state for the first agent looks like:', states[0]) agents = MADDPG(state_size=state_size, action_size=action_size, num_agents=num_agents, random_seed=2) agents.actor_local.load_state_dict(torch.load('checkpoint_actor.pth')) agents.critic_local.load_state_dict(torch.load('checkpoint_critic.pth')) def play(n_episodes=5): for i_episode in range(1, n_episodes + 1): env_info = env.reset(train_mode=False)[brain_name] states = env_info.vector_observations agents.reset() scores = np.zeros(num_agents) while True: actions = agents.act(states) env_info = env.step(actions)[brain_name] next_states = env_info.vector_observations
max_steps = 100
# before training, we first store the experience (state information) of all agents for the subsequent training process.
episode_before_train = 100

obs = env.reset()
n_states = len(obs[0])

initial_train = True
test_or_train = True

# vis = visdom.Visdom(port=8097)
win = None
param = None

np.random.seed(1234)
th.manual_seed(1234)

# initialization of the original MADDPG; it is the basic part of our architecture.
maddpg = MADDPG(n_agents, n_states, n_actions, batch_size, capacity,
                episode_before_train, initial_train, test_or_train)
FloatTensor = th.cuda.FloatTensor if maddpg.use_cuda else th.FloatTensor

for i in range(maddpg.n_agents):
    maddpg.critics[i] = th.load('new/model_initial/critic[' + str(i) + '].pkl_episode' + str(3000))
    maddpg.actors[i] = th.load('new/model_initial/actors[' + str(i) + '].pkl_episode' + str(3000))

for i_episode in range(n_episode):
    startTime = datetime.datetime.now()
    obs = env.reset()
    obs = np.stack(obs)
    if isinstance(obs, np.ndarray):
        obs = th.from_numpy(obs).float()
from multiagent.environment import MultiAgentEnv import multiagent.scenarios as scenarios import torch import numpy as np from agent import DDPGAgent from maddpg import MADDPG from utils import MultiAgentReplayBuffer def make_env(scenario_name, benchmark=False): # load scenario from script scenario = scenarios.load(scenario_name + ".py").Scenario() # create world world = scenario.make_world() # create multiagent environment if benchmark: env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data) else: env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation) return env env = make_env(scenario_name="simple_spread") ma_controller = MADDPG(env, 1000000) ma_controller.run(500, 300, 32)
'N_EPS_MIN': .01, # Normal noise min decay value 'OU_THETA': 1e-2, # OU noise theta parameter 'OU_SIGMA': 1e-2, # OU noise sigma parameters 'SEED': 42, # Random seed 'DEVICE': torch.device("cuda" if torch.cuda.is_available() else "cpu"), # Training Hyperparameters 'N_EPISODES': 2000, 'MAX_T': 2000, 'SUCCESS_SCORE': .5, 'PRINT_EVERY': 100, # Save on W&B 'WANDB': True, } if PARAMETERS['WANDB']: # Save on wandb wandb.init(project="maddpg", config=PARAMETERS) # Agent agent = MADDPG(PARAMETERS) # Training job scores = train_MADDPG(env, agent, n_episodes=PARAMETERS['N_EPISODES'], max_t=PARAMETERS['MAX_T'], success_score=PARAMETERS['SUCCESS_SCORE'], print_every=PARAMETERS['PRINT_EVERY'], brain_name=brain_name, use_wandb=PARAMETERS['WANDB'])
def main(): seeding() number_of_episodes = 20000 episode_length = 1000 batchsize = 256 save_interval = 1000 rewards_deque = deque(maxlen=100) rewards_all = [] noise = 1.0 noise_reduction = 1.0 log_path = os.getcwd() + "/log" model_dir = os.getcwd() + "/model_dir" os.makedirs(model_dir, exist_ok=True) """ Info about the UnityEnvironment brain_name: 'TennisBrain' brain: ['brain_name', 'camera_resolutions', 'num_stacked_vector_observations', 'number_visual_observations', 'vector_action_descriptions', 'vector_action_space_size', 'vector_action_space_type', 'vector_observation_space_size', 'vector_observation_space_type']] """ env = UnityEnvironment(file_name="Tennis.app") brain_name = env.brain_names[0] brain = env.brains[brain_name] buffer = ReplayBuffer(int(1e5)) # initialize policy and critic maddpg = MADDPG() logger = SummaryWriter(log_dir=log_path) # ------------------------------ training ------------------------------ # # show progressbar import progressbar as pb widget = [ 'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ', pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' ' ] timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start() for episode in range(1, number_of_episodes + 1): timer.update(episode) rewards_this_episode = np.zeros((2, )) """ Info about the UnityEnvironment env_info: ['agents', 'local_done', 'max_reached', 'memories', 'previous_text_actions', 'previous_vector_actions', 'rewards', 'text_observations', 'vector_observations', 'visual_observations'] actions: List(num_agents=2, action_size=2) states: List((24,), (24,)) rewards: List(2,) dones: List(2,) """ env_info = env.reset(train_mode=True)[brain_name] states = env_info.vector_observations for episode_t in range(episode_length): # reset the OUNoise for each agent. 
for i in range(2): maddpg.maddpg_agent[i].noise.reset() actions = maddpg.act(states, noise=noise) env_info = env.step(actions)[brain_name] noise *= noise_reduction next_states = env_info.vector_observations rewards = env_info.rewards dones = env_info.local_done # add data to buffer transition = (states, actions, rewards, next_states, dones) buffer.push(transition) rewards_this_episode += rewards states = next_states if any(dones): break # update the local and target network if len(buffer) > batchsize: # update the local network for _ in range(5): for a_i in range(2): samples = buffer.sample(batchsize) maddpg.update(samples, a_i, logger) # soft update the target network maddpg.update_targets() rewards_all.append(rewards_this_episode) rewards_deque.append(np.max(rewards_this_episode)) average_score = np.mean(rewards_deque) # --------------------- Logging for TensorBoard --------------------- # logger.add_scalars('rewards', { 'agent0': rewards_this_episode[0], 'agent1': rewards_this_episode[1] }, episode) logger.add_scalars('global', { 'score': np.max(rewards_this_episode), 'average_score': average_score }, episode) # -------------------------- Save the model -------------------------- # save_dict_list = [] if episode % save_interval == 0 or average_score >= 0.5: for i in range(2): save_dict = \ {'actor_params' : maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params' : maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params' : maddpg.maddpg_agent[i].critic_optimizer.state_dict()} save_dict_list.append(save_dict) torch.save( save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode))) if average_score >= 3.0: print('\nEnvironment solved in {} episodes!'.format(episode - 100)) print('\nAverage Score: {:.2f}'.format(average_score)) break env.close() logger.close() timer.finish()
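# maddpg.update_targets() above performs the soft update named in its comment: every target
# parameter is moved a small step tau toward the corresponding local (actual) network parameter.
# A minimal sketch of that rule; the function name and tau value are illustrative.
def soft_update(target_net, local_net, tau=1e-3):
    # theta_target <- tau * theta_local + (1 - tau) * theta_target
    for target_param, local_param in zip(target_net.parameters(), local_net.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)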