def init_from_config(env, config, logger):
    """
    Initializes the algorithm from a config dict. Handles DDPG-like and
    SAC-like models, but not a mixture of them (i.e. some agents DDPG
    and the others SAC).

    :param env: environment the agents act in
    :param config: experiment configuration
    :param logger: logger used during training
    :return: initialized algorithm
    """
    # Initialize agents only for supported algorithms
    if config.agent_alg in SUPPORTED_ALGOS:
        algorithm = MADDPG.init_from_env(
            env,
            agent_alg=config.agent_alg,
            adversary_alg=config.adversary_alg,
            tau=config.tau,
            gamma=config.gamma,
            lr=config.lr,
            lr_fe_coef=config.lr_fe_coef,
            lr_critic_coef=config.lr_critic_coef,
            grad_clip_value=config.grad_clip_value,
            hidden_dim=config.hidden_dim,
            weight_decay=config.weight_decay,
            discrete_exploration_scheme=config.discrete_exploration_scheme,
            boltzmann_temperature=config.boltzmann_temperature,
            feature_extractor=config.feature_extractor,
            logger=logger)
    else:
        raise ValueError('Algorithm %s is not supported' % config.agent_alg)
    return algorithm
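# A minimal usage sketch for init_from_config. Everything here is
# illustrative: the field values are placeholders, and `env` and `logger`
# (e.g. a tensorboard SummaryWriter) are assumed to be created elsewhere.
from argparse import Namespace

example_config = Namespace(
    agent_alg='MADDPG', adversary_alg='MADDPG',  # must appear in SUPPORTED_ALGOS
    tau=0.01, gamma=0.95, lr=0.01,
    lr_fe_coef=1.0, lr_critic_coef=1.0,
    grad_clip_value=0.5, hidden_dim=64, weight_decay=0.0,
    discrete_exploration_scheme='epsilon_greedy',
    boltzmann_temperature=1.0, feature_extractor=None)

# algorithm = init_from_config(env, example_config, logger)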
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)
    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim,
                                  noisy_sharing=True,
                                  noisy_SNR=config.noisy_SNR,
                                  game_id=config.env_id,
                                  est_ac=config.est_action)
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    print('#' * 73)
    print('Adversary using:', config.adversary_alg,
          'Good agent using:', config.agent_alg, '\n')
    print('Noisy SNR is:', config.noisy_SNR)
    print('#' * 73)
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs); nobs differs per agent, so not a tensor
        maddpg.prep_rollouts(device='cpu')
        if ep_i % 5000 == 0:
            maddpg.lr *= 0.5  # halve the learning rate every 5000 episodes
        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            print("Episodes %i-%i of %i, rewards are:\n" %
                  (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
            for a_i, a_ep_rew in enumerate(ep_rews):
                print('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

        # *** perform validation every validate_every_n_eps episodes,
        # i.e. run N episodes without exploration ***
        if ep_i % config.validate_every_n_eps == config.validate_every_n_eps - 1:
            # assumes only a single env is running
            episodes_stats = []
            info_for_one_env_among_timesteps = []
            print('*' * 10, 'Validation BEGINS', '*' * 10)
            for valid_et_i in range(config.run_n_eps_in_validation):
                obs = env.reset()
                maddpg.prep_rollouts(device='cpu')
                explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
                maddpg.scale_noise(config.final_noise_scale +
                                   (config.init_noise_scale - config.final_noise_scale) *
                                   explr_pct_remaining)
                maddpg.reset_noise()
                curr_episode_stats = []
                for et_i in range(config.episode_length):
                    # rearrange observations to be per agent, and convert to torch Variable
                    torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                          requires_grad=False)
                                 for i in range(maddpg.nagents)]
                    # act greedily (no exploration) during validation
                    torch_agent_actions = maddpg.step(torch_obs, explore=False)
                    agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
                    actions = [[ac[i] for ac in agent_actions]
                               for i in range(config.n_rollout_threads)]
                    next_obs, rewards, dones, infos = env.step(actions)
                    info_for_one_env_among_timesteps.append(infos[0]['n'])
                    curr_episode_stats.append(infos[0]['n'])
                    obs = next_obs
                episodes_stats.append(curr_episode_stats)
            print('Summary statistics:')
            if config.env_id == 'simple_tag':
                episodes_stats = np.array(episodes_stats)
                # validation logging
                with open(f'{config.model_name}.log', 'a') as valid_logfile:
                    valid_logwriter = csv.writer(valid_logfile, delimiter=' ')
                    valid_logwriter.writerow(
                        np.sum(episodes_stats, axis=(1, 2)).tolist())
                avg_collisions = np.sum(episodes_stats) / episodes_stats.shape[0]
                print(f'Avg of collisions: {avg_collisions}')
            elif config.env_id == 'simple_speaker_listener':
                for i, stat in enumerate(info_for_one_env_among_timesteps):
                    print(f'ep {i}: {stat}')
            else:
                raise NotImplementedError
            print('*' * 10, 'Validation ENDS', '*' * 10)
        # *** END of VALIDATION ***

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
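# The 'run1'/'run2'/... auto-numbering block above recurs in every script in
# this collection; the helper below is a self-contained sketch of the same
# logic (the name next_run_dir is ours, not the repo's).
from pathlib import Path

def next_run_dir(model_dir):
    """Return model_dir/'run<N+1>', where N is the highest existing run number."""
    if not model_dir.exists():
        return model_dir / 'run1'
    nums = [int(p.name.split('run')[1])
            for p in model_dir.iterdir() if p.name.startswith('run')]
    return model_dir / ('run%i' % (max(nums) + 1 if nums else 1))

# e.g. with run1 and run2 present: next_run_dir(Path('./models/simple_tag/m')) -> .../run3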
config = setup_experiment(args)
logger = ExperimentLogger(config.save_dir, log_std_out=True,
                          use_tensorboard=config.use_tensorboard)

# make sampling runner
if not config.cuda:
    torch.set_num_threads(config.n_training_threads)
env_func = ENV_MAP[config.env]
p_env_func = partial(env_func, config.scenario, benchmark=False,
                     show_visual_range=config.show_visual_range)
env = make_parallel_env(p_env_func, config.env_config,
                        config.n_rollout_threads, config.seed)
if not config.no_eval:
    eval_env = env_func(config.scenario, benchmark=False,
                        show_visual_range=config.show_visual_range,
                        **config.env_config)

# make learner agent
maddpg = MADDPG.init_from_env(env,
                              agent_alg=config.agent_alg,
                              adversary_alg=config.adversary_alg,
                              tau=config.tau,
                              lr=config.lr,
                              hidden_dim=config.hidden_dim)
replay_buffer = ReplayBuffer(config.max_buffer_size, maddpg.nagents,
                             [obsp.shape[0] for obsp in env.observation_space],
                             [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                              for acsp in env.action_space])

# train loop
t = 0
for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
    logger.info("Episodes (%i-%i)/%i" % (ep_i + 1,
                                         ep_i + 1 + config.n_rollout_threads,
                                         config.n_episodes))
def run(config):
    model_dir = Path('./models') / config.env_name / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    os.system("cp shape.txt {}".format(run_dir))
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    # number of CPU threads used for training
    if not config.use_cuda:
        torch.set_num_threads(config.n_training_threads)
    # processes used for parallel environment sampling
    env = make_parallel_env(config.num_agents, config.n_rollout_threads,
                            run_num, config.shape_file)
    maddpg = MADDPG.init_from_env(env=env,
                                  agent_alg=config.agent_alg,
                                  cripple_alg=config.cripple_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim,
                                  discrete_action=config.discrete_action)
    # maddpg = MADDPG.init_from_save(model_dir / 'run1' / 'model.pt')
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    a_loss = []
    c_loss = []
    rewss = []
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs); nobs differs per agent, so not a tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()

        # if config.display:
        #     for env_show in env.envs:
        #         env_show.render('human', close=False)

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            # rescale the second action dimension (an angle) to [-pi, pi]
            for i in actions:
                for j in i:
                    j[1] *= np.pi
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_cuda:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=config.use_cuda,
                                                      norm_rews=True)
                        maddpg.update(sample, a_i, logger=logger,
                                      actor_loss_list=a_loss,
                                      critic_loss_list=c_loss)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        rewss.append(ep_rews)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(str(run_dir / 'incremental'), exist_ok=True)
            maddpg.save(str(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))))
            maddpg.save(str(run_dir / 'model.pt'))

    maddpg.save(str(run_dir / 'model.pt'))
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
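# The schedule above anneals the exploration-noise scale linearly from
# init_noise_scale down to final_noise_scale over n_exploration_eps episodes
# and then holds it constant; a standalone sketch of that arithmetic:
def noise_scale(ep_i, n_exploration_eps, init_scale, final_scale):
    """Linearly decayed exploration noise, clipped at final_scale."""
    pct_remaining = max(0, n_exploration_eps - ep_i) / n_exploration_eps
    return final_scale + (init_scale - final_scale) * pct_remaining

# e.g. decaying from 0.3 to 0.0 over 25000 episodes:
# noise_scale(0, 25000, 0.3, 0.0)     -> 0.30
# noise_scale(12500, 25000, 0.3, 0.0) -> 0.15
# noise_scale(30000, 25000, 0.3, 0.0) -> 0.00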
    curr_run = 'run1'
else:
    curr_run = 'run%i' % (max(exst_run_nums) + 1)
run_dir = model_dir / curr_run
log_dir = run_dir / 'logs'
os.makedirs(log_dir)
logger = SummaryWriter(str(log_dir))

torch.manual_seed(1024)
np.random.seed(1024)
env = make_parallel_env(env_id, n_rollout_threads, 1024, True)
maddpg = MADDPG.init_from_env(env,
                              agent_alg='MADDPG',
                              adversary_alg='MADDPG',
                              tau=0.01,
                              lr=0.01,
                              hidden_dim=64,
                              est_ac=True,
                              game_id='simple_speaker_listener')
replay_buffer = ReplayBuffer(buffer_length, maddpg.nagents,
                             [obsp.shape[0] for obsp in env.observation_space],
                             [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                              for acsp in env.action_space])
t = 0
# for ep_i in range(0, n_episodes, n_rollout_threads):
for ep_i in range(0, 10, 1):
    # print("Episodes %i-%i of %i" % (ep_i + 1, ep_i + 1 + n_rollout_threads, n_episodes))
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)

    ##################### INITIALIZE FROM SAVED? ###########################
    # initialize from a saved model only when a path is provided; otherwise
    # fall back to a fresh initialization from the environment
    if init_from_saved and model_path is not None:
        maddpg = MADDPG.init_from_save(model_path)
        print("Initialized from saved model")
    # -------------------------------------------------------------------- #
    else:
        maddpg = MADDPG.init_from_env(env,
                                      agent_alg=config.agent_alg,
                                      adversary_alg=config.adversary_alg,
                                      tau=config.tau,
                                      lr=config.lr,
                                      hidden_dim=config.hidden_dim)

    # used for learning (updates)
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    # this buffer just stores the global rewards; it is not used for
    # updating the policies
    g_storage_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                    [obsp.shape[0] for obsp in env.observation_space],
                                    [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                     for acsp in env.action_space])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs); nobs differs per agent, so not a tensor
        maddpg.prep_rollouts(device='cpu')
        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions, maddpg)

            # Reward shaping using D++ / D: the rewards now contain global as
            # well as shaped rewards. Keep the global rewards for logging and
            # use the shaped (difference) rewards for updates.
            use_dpp = True  # choose which reward to use

            # DIFFERENCE REWARDS
            d_rewards = []
            for n in range(maddpg.nagents):
                d_rewards.append([rewards[0][n][1]])
            d_rewards = np.array([d_rewards])

            # GLOBAL REWARDS
            g_rewards = []
            for n in range(maddpg.nagents):
                g_rewards.append([rewards[0][n][0]])
            g_rewards = np.array([g_rewards])

            rewards = d_rewards if use_dpp else g_rewards
            # ----------------------------------------------------------- #

            # buffer used for updates
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            # push global rewards into g_storage_buffer for plotting
            g_storage_buffer.push(obs, agent_actions, g_rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        # take the global rewards out of g_storage_buffer for logging
        ep_rews = g_storage_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
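# The nested loops above unpack per-agent (global, difference) reward pairs
# for the single rollout thread. Assuming `rewards` arrives as an array of
# shape (n_threads, n_agents, 2), the same split is a one-line slice:
import numpy as np

rewards = np.array([[[-1.0, 0.20],     # agent 0: (global, difference)
                     [-1.0, -0.10],    # agent 1
                     [-1.0, 0.05]]])   # agent 2; shape (1, 3, 2)

g_rewards = rewards[..., 0:1]  # global component,     shape (1, 3, 1)
d_rewards = rewards[..., 1:2]  # difference component, shape (1, 3, 1)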
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    # logger = SummaryWriter(str(log_dir))
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)

    # NOTE: the block below is dead code from an earlier revision. It compared
    # the env *object* to the string 'simple_reference' (always False) and
    # referenced variables that are undefined in this scope;
    # MADDPG.init_from_env already builds the per-agent init parameters.
    # if config.env_id == 'simple_reference':
    #     for i in range(2):
    #         agent_init_params.append({'num_in_pol': num_in_pol,
    #                                   'num_out_pol': num_out_pol,
    #                                   'num_in_critic': num_in_critic})
    #     init_dict = {'gamma': gamma, 'tau': tau, 'lr': lr,
    #                  'hidden_dim': hidden_dim, 'alg_types': alg_types,
    #                  'agent_init_params': agent_init_params,
    #                  'discrete_action': discrete_action}

    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    episode_average_rewards = []
    hundred_episode_average_rewards = []
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        if ep_i % 100 == 0 and ep_i > 0:
            hundred_episode_average_rewards.append(np.mean(episode_average_rewards))
            print('Rewards till', ep_i, '=', hundred_episode_average_rewards[-1])
            print('Agent Actions=', torch_agent_actions)
            episode_average_rewards = []
        # print("Episodes %i-%i of %i" % (ep_i + 1,
        #                                 ep_i + 1 + config.n_rollout_threads,
        #                                 config.n_episodes))
        obs = env.reset()
        rewards_for_this_episode = []
        # obs.shape = (n_rollout_threads, nagent)(nobs); nobs differs per agent, so not a tensor
        maddpg.prep_rollouts(device='cpu')
        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            rewards_for_this_episode.append(np.mean(rewards))
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i)  # , logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
            # render late-training episodes to inspect the learned behavior
            if ep_i > 10000:
                print('Goal Color=', torch_obs[0])
                print('Communication=', agent_actions[0])
                env.render()
                time.sleep(0.01)
            if ep_i > 100000:
                import ipdb
                ipdb.set_trace()
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        episode_average_rewards.append(np.sum(rewards_for_this_episode))
        # for a_i, a_ep_rew in enumerate(ep_rews):
        #     logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    plt.plot(100 * np.array(range(1, config.n_episodes // 100)),
             hundred_episode_average_rewards)
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward for 100 episodes')
    plt.title('Speaker Discrete and Mover Continuous')
    plt.savefig('plot.png')  # plt.show() takes no filename; save the figure instead
    plt.show()
    maddpg.save(run_dir / 'model.pt')
    env.close()
def run(config):
    # model_dir = Path('./models') / config.env_id / config.model_name
    # (the run-numbering logic used by the other scripts is disabled here in
    # favor of a fixed run number and checkpoint directory)
    run_num = 10
    log_dir = 'checkpoints0605_4/'
    run_dir = log_dir + 'logs/'
    # os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    # changed env: StarCraft II micromanagement ('3m' map) through an adapter
    env = SCraftAdapter(map_name='3m', seed=123, step_mul=8, difficulty='7',
                        game_version='latest', replay_dir="replay/")
    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp for obsp in env.observation_space],
                                 [acsp for acsp in env.action_space])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        stop = False
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs); nobs differs per agent, so not a tensor
        maddpg.prep_rollouts(device='cpu')
        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()
        episode_length = 0
        while not stop:
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            actions = []
            agent_actions = np.zeros([len(env.action_space),
                                      env.action_space[0]])
            # availability mask for every agent (stored with the transition)
            avail_actions = np.array(env.get_avail_actions())
            for agent_i in range(len(torch_agent_actions)):
                # zero out policy outputs for currently unavailable actions
                agent_action = env.get_avail_agent_actions(agent_i)
                agent_action = [0 if agent_action[i] == 0
                                else torch_agent_actions[agent_i].data.numpy()[0][i]
                                for i in range(len(agent_action))]
                # greedy choice among the remaining actions
                actions.append(np.argmax(agent_action))
                # one-hot encode the chosen action for the replay buffer
                agent_actions[agent_i][actions[agent_i]] = 1
            next_obs, rewards, dones, infos = env.step(actions)
            stop = dones[0][0]
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones,
                               avail_actions)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
            episode_length += 1
        ep_rews = replay_buffer.get_average_rewards(
            episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        # if ep_i % config.save_interval < config.n_rollout_threads:
        #     os.makedirs(run_dir / 'incremental', exist_ok=True)
        #     maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
        #     maddpg.save(run_dir / 'model.pt')
        # maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(os.path.join(log_dir, 'summary.json'))
    logger.close()
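# The per-agent loop above keeps only the currently available actions before
# taking the argmax. A self-contained sketch of that masking (the helper name
# is ours); note that masking with -inf instead of 0 is safer when the policy
# outputs can be negative, since a zeroed unavailable action could otherwise
# win the argmax.
import numpy as np

def masked_greedy_action(policy_out, avail_mask):
    """policy_out: (n_actions,) raw outputs; avail_mask: 1 = available."""
    masked = np.where(np.asarray(avail_mask) == 1, policy_out, -np.inf)
    action = int(np.argmax(masked))
    one_hot = np.zeros(len(policy_out))
    one_hot[action] = 1.0
    return action, one_hot

# e.g. the policy prefers action 0, but only actions 1 and 3 are available:
# masked_greedy_action([0.9, 0.4, 0.3, 0.6], [0, 1, 0, 1]) -> (3, [0, 0, 0, 1])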
def run(config):
    device = torch.device('cuda' if USE_CUDA else 'cpu')
    print('Using device:', device)
    if device.type == 'cuda':
        print(torch.cuda.get_device_name(0))
        print('Memory Usage:')
        print('Allocated:', round(torch.cuda.memory_allocated(0) / 1024**3, 1), 'GB')
        print('Cached:   ', round(torch.cuda.memory_cached(0) / 1024**3, 1), 'GB')

    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    print(str(log_dir))
    logger = SummaryWriter(str(log_dir))

    # record the hyperparameters of this run
    with open(run_dir / "hyperparameters.txt", "w+") as f:
        f.write(str(config))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action,
                            config.benchmark)
    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim,
                                  stochastic=config.stochastic,
                                  commonCritic=config.commonCritic,
                                  gasil=config.gasil,
                                  dlr=config.dlr,
                                  lambda_disc=config.lambda_disc,
                                  batch_size_disc=config.batch_size_disc,
                                  dynamic=config.dynamic)
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    # priority buffer holding the best (expert) trajectories for GASIL
    expert_replay_buffer = PriorityReplayBuffer(
        config.expert_buffer_length, config.episode_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
         for acsp in env.action_space])
    t = 0
    agent_info = [[[] for i in range(config.n_rollout_threads)]]
    reward_info = []
    total_returns = []
    eval_trajectories = []
    expert_average_returns = []
    trajectories = []
    durations = []
    start_time = time.time()
    expert_trajectories = []
    evaluation_rewards = []

    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        if ep_i % 100 == 0:
            mins = (time.time() - start_time) / 60
            durations.append(mins)
            print(mins, "minutes")
            start_time = time.time()
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs); nobs differs per agent, so not a tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()

        current_episode = [[] for i in range(config.n_rollout_threads)]
        current_trajectory = [[] for i in range(config.n_rollout_threads)]
        current_entities = []
        total_dense = None
        if config.store_traj:
            cur_state_ent = env.getStateEntities()
            for i in range(config.n_rollout_threads):
                current_entities.append(cur_state_ent[i])
            cur_state = env.getState()
            for i in range(config.n_rollout_threads):
                current_trajectory[i].append(cur_state[i])

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variables
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            if config.store_traj:
                cur_state = env.getState()
                for i in range(config.n_rollout_threads):
                    current_trajectory[i].append(cur_state[i])
            for i in range(config.n_rollout_threads):
                current_episode[i].append([obs[i], actions[i]])
            if config.benchmark:  # FIXME: benchmarking info collection
                for i, info in enumerate(infos):
                    agent_info[-1][i].append(info['n'])
            # accumulate the dense return of this episode
            if et_i == 0:
                total_dense = rewards
            else:
                total_dense = total_dense + rewards
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            # the third clause gates updates on the expert buffer holding a
            # full discriminator batch whenever GASIL is enabled
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads and
                    ((expert_replay_buffer.num_traj * config.episode_length >=
                      config.batch_size_disc) == maddpg.gasil)):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                if maddpg.gasil:
                    # discriminator updates on policy vs. expert samples
                    for update_i in range(config.num_disc_updates):
                        sample_normal = replay_buffer.sample(
                            config.batch_size, to_gpu=USE_CUDA, norm_rews=False)
                        sample_expert = expert_replay_buffer.sample(
                            config.batch_size_disc, to_gpu=USE_CUDA)
                        maddpg.gasil_disc_update(
                            sample_normal, sample_expert, 0, logger=logger,
                            num_disc_permutations=config.num_disc_permutations)
                    # actor-critic updates with the discriminator-shaped reward
                    for update_i in range(config.num_AC_updates):
                        sample_normal = replay_buffer.sample(
                            config.batch_size, to_gpu=USE_CUDA, norm_rews=False)
                        maddpg.gasil_AC_update(
                            sample_normal, 0, episode_num=ep_i, logger=logger,
                            num_AC_permutations=config.num_AC_permutations)
                else:
                    for update_i in range(config.num_AC_updates):
                        sample_normal = replay_buffer.sample(
                            config.batch_size, to_gpu=USE_CUDA, norm_rews=False)
                        maddpg.update(sample_normal, 0, logger=logger,
                                      num_AC_permutations=config.num_AC_permutations)
                maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')

        total_returns.append(total_dense)
        if maddpg.gasil:
            expert_replay_buffer.push(current_episode, total_dense,
                                      config.n_rollout_threads,
                                      current_entities, current_trajectory,
                                      config.store_traj)
            expert_average_returns.append(expert_replay_buffer.get_average_return())
        if config.store_traj:
            for i in range(config.n_rollout_threads):
                trajectories.append([current_entities[i], current_trajectory[i]])

        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            # save mean episode rewards
            logger.add_scalars('agent%i/rew' % a_i,
                               {'mean_episode_rewards': a_ep_rew}, ep_i)
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        # save benchmarking data
        agent_info.append([[] for i in range(config.n_rollout_threads)])
        reward_info.append(ep_rews)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')
            # save the trajectories in the expert replay buffer
            trajec = expert_replay_buffer.get_trajectories()
            if config.store_traj:
                expert_trajectories.append(trajec)

        if ep_i % config.eval_interval < config.n_rollout_threads:
            # periodic evaluation without exploration noise
            current_eval = []
            current_trajectories = []
            for ep_i_eval in range(0, config.n_eval_episodes, config.n_rollout_threads):
                obs = env.reset()
                total_eval = None
                maddpg.prep_rollouts(device='cpu')
                if config.store_traj:
                    current_trajectory = [[] for i in range(config.n_rollout_threads)]
                    current_entities = []
                    cur_state_ent = env.getStateEntities()
                    for i in range(config.n_rollout_threads):
                        current_entities.append(cur_state_ent[i])
                    cur_state = env.getState()
                    for i in range(config.n_rollout_threads):
                        current_trajectory[i].append(cur_state[i])
                for et_i in range(config.episode_length):
                    torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                          requires_grad=False)
                                 for i in range(maddpg.nagents)]
                    torch_agent_actions = maddpg.step(torch_obs, explore=False)
                    agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
                    actions = [[ac[i] for ac in agent_actions]
                               for i in range(config.n_rollout_threads)]
                    next_obs, rewards, dones, infos = env.step(actions)
                    if config.store_traj:
                        cur_state = env.getState()
                        for i in range(config.n_rollout_threads):
                            current_trajectory[i].append(cur_state[i])
                    if et_i == 0:
                        total_eval = rewards
                    else:
                        total_eval = total_eval + rewards
                    obs = next_obs
                current_eval.append(total_eval)
                if config.store_traj:
                    for i in range(config.n_rollout_threads):
                        current_trajectories.append([current_entities[i],
                                                     current_trajectory[i]])
            if config.store_traj:
                eval_trajectories.append(current_trajectories)
            evaluation_rewards.append(current_eval)
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)
    if config.load_adv:
        # load pre-trained adversaries; the runner stays delay-unaware
        model_path = (Path('./models') / config.env_id / config.model_name /
                      ('run%i' % config.run_num) / 'model.pt')
        maddpg = MADDPG.init_from_env_with_runner_delay_unaware(
            env,
            agent_alg=config.agent_alg,
            adversary_alg=config.adversary_alg,
            tau=config.tau,
            lr=config.lr,
            hidden_dim=config.hidden_dim,
            file_name=model_path)
    else:
        maddpg = MADDPG.init_from_env(env,
                                      agent_alg=config.agent_alg,
                                      adversary_alg=config.adversary_alg,
                                      tau=config.tau,
                                      lr=config.lr,
                                      hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    delay_step = config.delay_step
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        maddpg.prep_rollouts(device='gpu')

        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()

        # per-environment zero (no-op) actions used to pre-fill the delay queue
        if config.env_id == 'simple_speaker_listener':
            zero_agent_actions = [np.array([[0, 0, 0]]),
                                  np.array([[0, 0, 0, 0, 0]])]
        elif config.env_id == 'simple_spread':
            zero_agent_actions = [np.array([[0.0, 0.0, 0.0, 0.0, 0.0]])
                                  for _ in range(maddpg.nagents)]
        elif config.env_id == 'simple_tag':
            zero_agent_actions = [np.array([0.0, 0.0])
                                  for _ in range(maddpg.nagents)]
        last_agent_actions = [zero_agent_actions for _ in range(delay_step)]

        for et_i in range(config.episode_length):
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            if config.load_adv:
                if delay_step == 0:
                    actions = [[ac[i] for ac in agent_actions]
                               for i in range(config.n_rollout_threads)]
                else:
                    # delay only the adversaries; the runner (last agent)
                    # executes its current action
                    agent_actions_tmp = [[ac[i] for ac in agent_actions]
                                         for i in range(config.n_rollout_threads)][0][:]
                    actions = last_agent_actions[0]
                    actions.append(agent_actions_tmp[-1])
                    last_agent_actions = last_agent_actions[1:]
                    last_agent_actions.append(agent_actions_tmp[:2])
                    actions = [actions]
                next_obs, rewards, dones, infos = env.step(copy.deepcopy(actions))
            else:
                if delay_step == 0:
                    actions = [[ac[i] for ac in agent_actions]
                               for i in range(config.n_rollout_threads)]
                else:
                    # every agent executes the action chosen delay_step steps ago
                    actions = [[ac[i] for ac in last_agent_actions[0]]
                               for i in range(config.n_rollout_threads)]
                    last_agent_actions.pop(0)
                    last_agent_actions.append(agent_actions)
                next_obs, rewards, dones, infos = env.step(copy.deepcopy(actions))
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    if config.load_adv:
                        for a_i in range(maddpg.nagents - 1):  # do not update the runner
                            sample = replay_buffer.sample(config.batch_size,
                                                          to_gpu=USE_CUDA)
                            maddpg.update(sample, a_i, logger=logger)
                        maddpg.update_adversaries()
                    else:
                        for a_i in range(maddpg.nagents):
                            sample = replay_buffer.sample(config.batch_size,
                                                          to_gpu=USE_CUDA)
                            maddpg.update(sample, a_i, logger=logger)
                        maddpg.update_all_targets()
                maddpg.prep_rollouts(device='gpu')
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalars('agent%i/mean_episode_rewards' % a_i,
                               {'reward': a_ep_rew}, ep_i)
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
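# The delayed-action bookkeeping above maintains a sliding list of the last
# delay_step joint actions and executes the oldest one each step. A minimal
# deque-based sketch of the same idea (the class name is ours):
from collections import deque

class ActionDelayQueue:
    """Returns the action chosen `delay` steps ago; pre-filled with a no-op."""
    def __init__(self, delay, zero_action):
        self.queue = deque([zero_action] * delay, maxlen=delay + 1)

    def step(self, new_action):
        self.queue.append(new_action)
        return self.queue.popleft()  # the action that is `delay` steps old

# with delay=2 and no-op action 0, pushing 1, 2, 3 executes 0, 0, 1:
q = ActionDelayQueue(2, 0)
assert [q.step(a) for a in (1, 2, 3)] == [0, 0, 1]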
def run(config):
    model_dir = Path('./models') / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    env = gym.make("intersection-multiagent-v0")
    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    delay_step = config.delay_step
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs); nobs differs per agent, so not a tensor
        maddpg.prep_rollouts(device='gpu')

        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()

        # each of the four agents observes all four cars, starting with itself
        agent_obs = []
        for i in range(4):
            agent_obs.append(np.array([obs[i % 4], obs[(i + 1) % 4],
                                       obs[(i + 2) % 4], obs[(i + 3) % 4]]).flatten())
        obs = np.array([agent_obs])

        zero_agent_actions = [1, 1, 1, 1]  # no-op actions to pre-fill the delay queue
        last_agent_actions = [zero_agent_actions for _ in range(delay_step)]

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch tensors
            torch_obs = [torch.FloatTensor(np.vstack(obs[:, i]))
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment, applying the action delay
            if delay_step == 0:
                actions = [np.argmax(agent_actions[i][0]) for i in range(4)]
            else:
                future_actions = [np.argmax(agent_actions[i][0])
                                  for i in range(4)]
                actions = last_agent_actions[0]
                last_agent_actions = last_agent_actions[1:]
                last_agent_actions.append(future_actions)
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            if dones[0][0]:
                break
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='gpu')
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalars('agent%i/mean_episode_rewards' % a_i,
                               {'reward': a_ep_rew}, ep_i)
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def run(config):
    # make directory to store the results
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)

    # initialize tensorboard summary writer
    logger = SummaryWriter(str(log_dir))

    # use the provided seed
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)

    # limit CPU threads when not training on the GPU (unclear how much this helps)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)

    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)
    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)

    if not rnn:
        # TODO: this branch might break; the code is not modular (yet) and
        # has only been verified with the RNN path.
        replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                     [obsp.shape[0] for obsp in env.observation_space],
                                     [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                      for acsp in env.action_space])
    else:
        # replay buffer obs-space size is increased to hold the history window
        rnn_replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                         [obsp.shape[0] * history_steps
                                          for obsp in env.observation_space],
                                         [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                          for acsp in env.action_space])
        # this buffer just stores the global rewards; it is not used for
        # updating the policies
        g_storage_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                        [obsp.shape[0] * history_steps
                                         for obsp in env.observation_space],
                                        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                         for acsp in env.action_space])
    t = 0

    #####################################################################
    #                          START EPISODES                           #
    #####################################################################
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        # list of observations for each of the agents;
        # e.g. for simple_spread the shape is (1, 3, 18)
        obs = env.reset()

        # history buffer for the RNN (admittedly not modular)
        obs_tminus_0 = copy(obs)
        obs_tminus_1 = copy(obs)
        obs_tminus_2 = copy(obs)
        obs_tminus_3 = copy(obs)
        obs_tminus_4 = copy(obs)
        obs_tminus_5 = copy(obs)
        # # for 3 time-steps (18*3 = 54):
        # obs_history = np.empty([1, 3, 54])
        # next_obs_history = np.empty([1, 3, 54])
        # for 6 time-steps (18*6 = 108)
        obs_history = np.empty([1, 3, 108])
        next_obs_history = np.empty([1, 3, 108])

        maddpg.prep_rollouts(device='cpu')

        # exploration percentage remaining (unclear whether this is the
        # standard schedule, but it decays linearly to zero)
        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()

        #################################################################
        #                        START TIME-STEPS                       #
        #################################################################
        for et_i in range(config.episode_length):
            # populate the current history: concatenate the last 6 timesteps
            # per agent
            for a in range(3):  # env.nagents
                obs_history[0][a][:] = np.concatenate(
                    (obs_tminus_0[0][a][:], obs_tminus_1[0][a][:],
                     obs_tminus_2[0][a][:], obs_tminus_3[0][a][:],
                     obs_tminus_4[0][a][:], obs_tminus_5[0][a][:]))

            if not rnn:
                # TODO: this branch might break; only the RNN path is tested.
                # rearrange observations to be per agent, and convert to torch Variable
                torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                      requires_grad=False)
                             for i in range(maddpg.nagents)]
                # get actions from the learning algorithm as torch Variables;
                # for simple_spread this is discrete[5]
                torch_agent_actions = maddpg.step(torch_obs, explore=True)
            else:
                # rearrange histories to be per agent, and convert to torch Variable
                rnn_torch_obs = [Variable(torch.Tensor(np.vstack(obs_history[:, i])),
                                          requires_grad=False)
                                 for i in range(maddpg.nagents)]
                # for the RNN, actions condition on the history window
                torch_agent_actions = maddpg.step(rnn_torch_obs, explore=True)

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment (for a single thread it
            # won't really matter)
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)

            # ----- WHICH REWARD TO USE -----
            # the rewards now contain global as well as difference rewards;
            # keep the global for logging and the difference for updates
            use_diff_reward = False  # the type of reward used for updates

            # DIFFERENCE REWARDS
            d_rewards = []
            for n in range(maddpg.nagents):
                d_rewards.append([rewards[0][n][1]])
            d_rewards = np.array([d_rewards])

            # GLOBAL REWARDS
            g_rewards = []
            for n in range(maddpg.nagents):
                g_rewards.append([rewards[0][n][0]])
            g_rewards = np.array([g_rewards])

            # replace `rewards` with the reward signal chosen for updates
            rewards = d_rewards if use_diff_reward else g_rewards

            # create history for the next state; history is [t, t-1, ..., t-5]
            # and history[0] indexes the single rollout thread
            for a in range(3):  # env.nagents
                next_obs_history[0][a][:] = np.concatenate(
                    (next_obs[0][a][:], obs_tminus_0[0][a][:],
                     obs_tminus_1[0][a][:], obs_tminus_2[0][a][:],
                     obs_tminus_3[0][a][:], obs_tminus_4[0][a][:]))

            # for the RNN, the replay buffer stores e.g. states = [obs_t-5 .. obs_t]
            if not rnn:
                replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
                obs = next_obs
            else:
                # buffer used for updates
                rnn_replay_buffer.push(obs_history, agent_actions, rewards,
                                       next_obs_history, dones)
                # push global rewards into g_storage_buffer
                g_storage_buffer.push(obs_history, agent_actions, g_rewards,
                                      next_obs_history, dones)
                # shift the history window by one step
                obs_tminus_5 = copy(obs_tminus_4)
                obs_tminus_4 = copy(obs_tminus_3)
                obs_tminus_3 = copy(obs_tminus_2)
                obs_tminus_2 = copy(obs_tminus_1)
                obs_tminus_1 = copy(obs_tminus_0)
                obs_tminus_0 = copy(next_obs)

            t += config.n_rollout_threads
            if (len(rnn_replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = rnn_replay_buffer.sample(config.batch_size,
                                                          to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')

        # for plotting, use the global reward achieved while training on
        # difference rewards
        ep_rews = g_storage_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
    print()
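# The six obs_tminus_* copies above implement a fixed six-step observation
# window by hand. A deque with maxlen does the same bookkeeping in a few
# lines; this is a sketch under the same assumptions as the code above
# (1 rollout thread, 3 agents, 18-dim observations):
from collections import deque
import numpy as np

HISTORY_STEPS = 6

def make_history(initial_obs):
    """History holds [t, t-1, ..., t-5], newest first, seeded with the reset obs."""
    return deque([initial_obs.copy() for _ in range(HISTORY_STEPS)],
                 maxlen=HISTORY_STEPS)

def stacked(history, n_agents=3):
    """Concatenate the window per agent -> shape (1, n_agents, 18 * HISTORY_STEPS)."""
    return np.stack([np.concatenate([h[0][a] for h in history])
                     for a in range(n_agents)])[None]

# per step: act on stacked(history), then history.appendleft(next_obs.copy());
# the deque's maxlen drops the oldest frame automatically.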
def run_main(run_num):
    config = Arglist()
    run_manager = running_env_manager(MODE)
    run_manager.prep_running_env(config, run_num)
    if not config.USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config)
    eval_env = make_parallel_env(config)
    maddpg = MADDPG.init_from_env(env, config)
    if config.use_IL:
        IL_controller = IL_Controller(config)  # imitation-learning controller
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp['comm'].n + acsp['act'].n
                                  if config.discrete_action
                                  else acsp['comm'].n + acsp['act'].shape[0]
                                  for acsp in env.action_space])
    t = 0
    # reset test-result arrays
    all_ep_rewards = []
    mean_ep_rewards = []
    start_time = time.time()
    step = 0
    win_counter = 0
    curr_ep = -1
    eval_win_rates = [0]
    # eps_without_IL = 0
    # eps_without_IL_hist = []
    print("\nPrey Max Speed: {}, useIL is {}\n".format(config.prey_max_speed,
                                                       config.use_IL))

    while step < config.n_time_steps:  # total steps to be performed during a single run
        # start an episode (after episode termination/done)
        curr_ep += 1
        ep_rewards = np.zeros((1, len(env.agent_types)))  # reward vector for a single episode

        # prepare episodic stuff
        obs = env.reset()
        maddpg.prep_rollouts(device=config.device)
        explr_pct_remaining = max(0, config.n_exploration_steps - step) / config.n_exploration_steps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()

        for ep_step in range(config.episode_length):  # one episode; ends on termination/done
            # env.env._render("human", False)
            # time.sleep(0.05)
            if step == config.n_time_steps:
                break
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, ind])),
                                  requires_grad=False)
                         for ind in range(maddpg.nagents)]
            # get actions as torch Variables
            with torch.no_grad():
                torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.cpu().data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[idx] for ac in agent_actions]
                       for idx in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)

            if (len(replay_buffer) >= config.batch_size and
                    (step % config.steps_per_eval) < config.n_rollout_threads):
                # perform evaluation
                eval_win_rates.append(eval_model(maddpg, eval_env,
                                                 config.episode_length,
                                                 config.num_steps_in_eval,
                                                 config.n_rollout_threads,
                                                 display=False))
            if (len(replay_buffer) >= config.batch_size and
                    (step % config.steps_per_update) < config.n_rollout_threads):
                # perform training
                train_model(maddpg, config, replay_buffer)
            step += config.n_rollout_threads  # advance the step counter
            if (len(replay_buffer) >= config.batch_size and config.use_IL and
                    (step % config.IL_inject_every) < config.n_rollout_threads):
                # perform IL injection
                step, eval_win_rates = IL_controller.IL_inject(
                    maddpg, replay_buffer, eval_env, step, config, eval_win_rates)
                IL_controller.decay()
            ep_rewards += rewards
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            if dones.any():  # terminate the episode if won!
                # win_counter += 1
                # eps_without_IL += 1
                break
            obs = next_obs

        # perform IL injection if the episode failed
        # if config.use_IL and ep_step == config.episode_length - 1 and not dones.any():
        #     step, eval_win_rates = \
        #         IL_controller.IL_inject(maddpg, replay_buffer, eval_env, step,
        #                                 config, eval_win_rates)
        #     eps_without_IL_hist.append(eps_without_IL)
        #     eps_without_IL = 0

        mean_ep_rewards.append(ep_rewards / config.episode_length)
        all_ep_rewards.append(ep_rewards)
        if step % 100 == 0 or step == config.n_time_steps:  # print progress
            run_manager.printProgressBar(
                step, start_time, config.n_time_steps,
                "run" + str(run_num) + ": Steps Done: ",
                " Last eval win rate: {0:.2%}".format(eval_win_rates[-1]),
                20, "%")

    # eps_without_IL_hist.append(eps_without_IL)
    if MODE == "RUN":
        run_dir = run_manager.run_dir
        np.save(run_dir / 'episodes_rewards',
                {"tot_ep_rewards": all_ep_rewards.copy(),
                 "mean_ep_rewards": mean_ep_rewards.copy()}, True)
        # np.save(run_dir / 'IL_hist', eps_without_IL_hist, True)
        np.save(run_dir / 'win_rates', eval_win_rates, True)
        maddpg.save(run_dir / 'model.pt')
    return run_num
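# eval_model is called above but not defined in this excerpt. The sketch
# below is an assumption about its behavior, reconstructed from how it is
# used (greedy rollouts on eval_env; dones.any() counts as a win, as in the
# training loop), not the repo's actual implementation:
import numpy as np
import torch

def eval_model(maddpg, eval_env, episode_length, num_steps, n_threads,
               display=False):
    """Roll out greedy episodes and return the fraction that terminate early."""
    wins, episodes, step = 0, 0, 0
    while step < num_steps:
        obs = eval_env.reset()
        episodes += 1
        for _ in range(episode_length):
            torch_obs = [torch.Tensor(np.vstack(obs[:, i]))
                         for i in range(maddpg.nagents)]
            with torch.no_grad():
                torch_actions = maddpg.step(torch_obs, explore=False)
            actions = [[ac.cpu().numpy()[idx] for ac in torch_actions]
                       for idx in range(n_threads)]
            obs, rewards, dones, infos = eval_env.step(actions)
            step += n_threads
            if dones.any():  # episode won (e.g. the prey was caught)
                wins += 1
                break
    return wins / episodes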
def run(config):
    scores_window = deque(maxlen=100)
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)

    # transport configuration
    name = 'Materials Transport'
    conf = {
        'n_player': 2,              # number of players
        'board_width': 11,          # board width
        'board_height': 11,         # board height
        'n_cell_type': 5,           # number of cell types
        'materials': 4,             # number of material depots
        'cars': 2,                  # number of cars
        'planes': 0,                # number of planes
        'barriers': 12,             # number of fixed barriers
        'max_step': 500,            # maximum number of steps
        'game_name': name,          # game name
        'K': 5,                     # depot material counts are refreshed every K games
        'map_path': 'env/map.txt',  # path to the initial map
        'cell_range': 6,            # value range per dimension within a cell (tuple; a single int is auto-converted)
        'ob_board_width': None,     # board width observed by each agent (tuple); None = same as the actual board
        'ob_board_height': None,    # board height observed by each agent (tuple); None = same as the actual board
        'ob_cell_range': None,      # per-cell value range observed by each agent (2-D tuple); None = same as the actual board
    }
    env = make_parallel_env_transport(config.env_id, conf,
                                      config.n_rollout_threads, config.seed,
                                      config.discrete_action)
    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        score = 0
        obs = env.reset()  # TODO: TO CHECK
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            # env.render()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(
                    torch.Tensor(np.vstack(obs[:, i])),  # stack the per-thread observations vertically
                    requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]

            # scripted one-hot actions for the first two players
            joint_action = []
            for i in range(2):
                player = []
                for j in range(1):
                    each = [0] * 11
                    # idx = np.random.randint(11)
                    each[3] = 1
                    player.append(each)
                joint_action.append(player)
            # learned actions for the remaining two players
            for m in range(2):
                joint_action.append([actions[0][m].astype(int).tolist()])

            next_obs, rewards, dones, infos = env.step(joint_action)
            agents_action = actions[0]
            replay_buffer.push(obs, agents_action, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
            score += rewards[0][0]

        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' /
                        ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

        scores_window.append(score)
        reward_epi = np.mean(scores_window)
        reward_epi_var = np.var(scores_window)
        # fixed: the original interpolated the value into the tag with '%',
        # which raises a TypeError; add_scalar expects (tag, value, step)
        logger.add_scalar('results/reward_window_mean', reward_epi, ep_i)
        logger.add_scalar('results/reward_window_var', reward_epi_var, ep_i)
        print('\rEpisode {}\tAverage Reward: {:.3f}\tVar Reward: {:.3f}'.format(
            ep_i, reward_epi, reward_epi_var))

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
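# Hedged sketch (an assumption, not from the original code): the hard-coded
# joint_action block above builds an 11-way one-hot vector per scripted player,
# always activating index 3. A small helper makes that intent explicit; the name
# one_hot_action is hypothetical.
def one_hot_action(idx, n_actions=11):
    """Return a one-hot action vector of length n_actions with idx set."""
    action = [0] * n_actions
    action[idx] = 1
    return action

# e.g. each scripted player above is equivalent to:
# joint_action.append([one_hot_action(3)])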
def run(args, **args_dict):
    reward_flag, pos_flag = None, None
    save_data = {'reward': -1000., 'pos': 0.}
    th.manual_seed(args.seed)
    np.random.seed(args.seed)
    if not args.use_cuda or not th.cuda.is_available():
        # th.set_num_threads(args.n_training_threads)
        FloatTensor = th.FloatTensor
    else:
        FloatTensor = th.cuda.FloatTensor
    env = make_parallel_env(**args_dict)
    maddpg = MADDPG.init_from_env(env, args)
    replay_buffer = ReplayBuffer(
        args.capacity, args.n_agents,
        [obsp.shape[0] for obsp in env.observation_space],
        [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    for ep_i in range(0, args.n_episodes, args.n_rollout_threads):
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        if args.use_cuda and th.cuda.is_available():
            maddpg.prep_rollouts(device='gpu')
        else:
            maddpg.prep_rollouts(device='cpu')
        explr_pct_remaining = max(
            0, args.n_exploration_eps - ep_i) / args.n_exploration_eps
        scale_noise_i = args.final_noise_scale + (
            args.init_noise_scale -
            args.final_noise_scale) * explr_pct_remaining
        maddpg.scale_noise(scale_noise_i)
        maddpg.reset_noise()
        print("Episodes %i-%i of %i, replay: %.2f, explore: %.2f" %
              (ep_i + 1, ep_i + 1 + args.n_rollout_threads, args.n_episodes,
               float(len(replay_buffer)) / replay_buffer.max_steps,
               scale_noise_i))
        for et_i in range(args.max_steps):
            # rearrange observations to be per agent, and convert to torch tensors
            torch_obs = [
                th.from_numpy(np.vstack(obs[:, i])).type(FloatTensor)
                for i in range(maddpg.nagents)
            ]
            # get actions as torch tensors
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [
                ac.detach().cpu().numpy() for ac in torch_agent_actions
            ]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(args.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += args.n_rollout_threads
            if (len(replay_buffer) >= args.batch_size and
                    (t % args.steps_per_update) < args.n_rollout_threads):
                if args.use_cuda and th.cuda.is_available():
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for a_i in range(maddpg.nagents):
                    sample = replay_buffer.sample(
                        args.batch_size,
                        to_gpu=args.use_cuda and th.cuda.is_available(),
                        norm_rews=args.norm_rews)
                    _, _, _ = maddpg.update(sample, a_i)
                maddpg.update_all_targets()
                if args.use_cuda and th.cuda.is_available():
                    maddpg.prep_rollouts(device='gpu')
                else:
                    maddpg.prep_rollouts(device='cpu')

        if ep_i % args.test_interval < args.n_rollout_threads:
            obs = env.reset()
            if args.use_cuda and th.cuda.is_available():
                maddpg.prep_rollouts(device='gpu')
            else:
                maddpg.prep_rollouts(device='cpu')
            with th.no_grad():
                pos_total = 0.
                finish_ep = np.zeros(args.n_rollout_threads)
                r_total = np.zeros((args.n_rollout_threads, args.n_agents))
                record_r = np.zeros(args.n_agents)
                for eval_i in range(args.max_steps):
                    torch_obs = [
                        FloatTensor(np.vstack(obs[:, i]))
                        for i in range(maddpg.nagents)
                    ]
                    torch_agent_actions = maddpg.step(torch_obs, explore=False)
                    agent_actions = [
                        ac.detach().cpu().numpy()
                        for ac in torch_agent_actions
                    ]
                    actions = [[ac[i] for ac in agent_actions]
                               for i in range(args.n_rollout_threads)]
                    next_obs, rewards, dones, infos = env.step(actions)
                    r_total += rewards
                    obs = next_obs
                    for d_i in range(dones.shape[0]):
                        if dones[d_i] or (eval_i == args.max_steps - 1 and
                                          finish_ep[d_i] == 0.):
                            pos_total += infos[d_i]['pos']
                            record_r += r_total[d_i]
                            r_total[d_i] = [0., 0.]  # assumes two agents
                            finish_ep[d_i] += 1
                record_r /= finish_ep.sum()
                pos_total /= finish_ep.sum()

            # fixed: new_path was referenced before assignment in the original
            # (the line below was commented out; model_path is assumed to be
            # defined elsewhere), and has_saved was never set after a save
            new_path = model_path + '/' + str(ep_i) + '.pt'
            has_saved = False
            if record_r.sum() > save_data['reward']:
                save_data['reward'] = record_r.sum()
                if save_data['reward'] > 0 and pos_total > 10.:
                    maddpg.save(new_path)
                    has_saved = True
            if pos_total > save_data['pos']:
                save_data['pos'] = pos_total
                if record_r.sum() > 0 and pos_total > 10. and not has_saved:
                    maddpg.save(new_path)
                    has_saved = True
            if pos_total > 17.0:
                maddpg.save(new_path)

            if reward_flag is None:
                reward_flag = vis.line(
                    X=np.arange(ep_i, ep_i + 1),
                    Y=np.array([np.append(record_r, record_r.sum())]),
                    opts=dict(ylabel='Test Reward',
                              xlabel='Episode',
                              title='Reward',
                              legend=['Agent-%d' % i
                                      for i in range(args.n_agents)] +
                                     ['Total']))
            else:
                vis.line(X=np.array(
                    [np.array(ep_i).repeat(args.n_agents + 1)]),
                         Y=np.array([np.append(record_r, record_r.sum())]),
                         win=reward_flag,
                         update='append')
            if pos_flag is None:
                pos_flag = vis.line(X=np.arange(ep_i, ep_i + 1),
                                    Y=np.array([pos_total]),
                                    opts=dict(ylabel='Length',
                                              xlabel='Episode',
                                              title='How far?',
                                              legend=['position']))
            else:
                vis.line(X=np.array([ep_i]),
                         Y=np.array([pos_total]),
                         win=pos_flag,
                         update='append')

    env.close()
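# Hedged note: the run() above uses a module-level visdom client named `vis`
# that is not created in this file. The wiring below is an assumption, not the
# authors' setup; it requires a visdom server already running on the default
# port (`python -m visdom.server`).
import visdom

vis = visdom.Visdom()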
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)
    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
         for acsp in env.action_space])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + config.n_rollout_threads,
               config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()
        for et_i in range(config.episode_length):
            env.render()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False)
                for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew, ep_i)
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' /
                        ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')
    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
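# Hedged refactor sketch: every run() variant in this file repeats the same
# 'run%i' directory-numbering logic. A hypothetical helper (not in the original
# repo) that reproduces it exactly:
def next_run_name(model_dir):
    """Return 'run1', or 'run<N+1>' where N is the highest existing run number."""
    if not model_dir.exists():
        return 'run1'
    nums = [int(str(f.name).split('run')[1])
            for f in model_dir.iterdir() if str(f.name).startswith('run')]
    return 'run%i' % (max(nums) + 1) if nums else 'run1'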
def run(config):
    # Make directory to store the results
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    # initialize tensorboard summary writer
    logger = SummaryWriter(str(log_dir))
    # use provided seed
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    # limit CPU threads when not training on a GPU
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)
    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    global_reward_list = []
    # START EPISODES
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + config.n_rollout_threads,
               config.n_episodes))
        # list of observations for each agent;
        # e.g. for simple_spread the shape is (1, 3, 18)
        obs = env.reset()
        maddpg.prep_rollouts(device='cpu')
        # fraction of the exploration schedule remaining; noise is annealed
        # linearly from init_noise_scale down to final_noise_scale
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()
        # START TIME-STEPS
        episode_reward = 0
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False)
                for i in range(maddpg.nagents)
            ]
            # get actions (from the learning algorithm) as torch Variables;
            # for simple_spread this is discrete[5]
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment; for a single rollout
            # thread this is a no-op
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            episode_reward += rewards[0][0]
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        global_reward_list.append(episode_reward / config.episode_length)
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' /
                        ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')
        # pickle the running list of per-episode mean global rewards
        with open("DIFF_rewards.txt", "wb") as fp_:
            pickle.dump(global_reward_list, fp_)
    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
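# Hedged sketch: the rewards pickled to DIFF_rewards.txt above can be read back
# like this (the loader name is an assumption).
import pickle

def load_diff_rewards(path="DIFF_rewards.txt"):
    """Unpickle the list of per-episode mean global rewards written by run()."""
    with open(path, "rb") as fp_:
        return pickle.load(fp_)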