def test_model(self):
    torch.autograd.set_detect_anomaly(True)
    self.algo = AttentionSAC([(5, 3), (5, 2)],
                             tau=0.01,
                             pi_lr=0.01,
                             q_lr=0.01,
                             gamma=0.95,
                             pol_hidden_dim=128,
                             critic_hidden_dim=128,
                             attend_heads=4,
                             reward_scale=10.)
    self.algo.prep_rollouts(device='cpu')
    sample: Dict[AgentKey, AgentObservation] = \
        {AgentKey(0, '0-1'): AgentObservation([1, 2, 3, 2, 3]),
         AgentKey(0, '0-2'): AgentObservation([2, 4, 3, 2, 4]),
         AgentKey(0, '0-3'): AgentObservation([2, 4, 3, 2, 4]),
         AgentKey(1, '0-1'): AgentObservation([1, 1, 3, 1, 4]),
         AgentKey(1, '0-2'): AgentObservation([1, 1, 3, 1, 4])}
    results = self.algo.step(sample, explore=True)
    self.assertEqual(len(results[AgentKey(0, '0-1')].action), 3)
    self.assertEqual(len(results[AgentKey(1, '0-1')].action), 2)
    for key in sample:
        self.assertTrue(key in results)
    for i in range(20):
        self.algo.step(sample)
    self.algo.prep_training(device='cpu')

    # Generate random training sample
    train_sample: List[Dict[AgentKey, AgentReplayFrame]] = \
        [{AgentKey(0, '0-1'): AgentReplayFrame([rval() for i in range(5)], [0, 1, 0], 5, False,
                                               [rval() for i in range(5)]),
          AgentKey(0, '0-2'): AgentReplayFrame([rval() for i in range(5)], [1, 0, 0], 5, False,
                                               [rval() for i in range(5)]),
          AgentKey(0, '0-3'): AgentReplayFrame([rval() for i in range(5)], [0, 1, 0], 5, False,
                                               [rval() for i in range(5)]),
          AgentKey(1, '0-1'): AgentReplayFrame([rval() for i in range(5)], [0, 1], 5, False,
                                               [rval() for i in range(5)]),
          AgentKey(1, '0-2'): AgentReplayFrame([rval() for i in range(5)], [0, 1], 5, False,
                                               [rval() for i in range(5)])}
         for _ in range(3)]
    train_sample: Dict[AgentKey, BatchedAgentReplayFrame] = preprocess_to_batch(train_sample)
    self.algo.update_critic(train_sample, logger=None)
    self.algo.update_policies(train_sample, logger=None)
    self.algo.update_all_targets()
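# Note: the test above relies on a small `rval()` helper that is not defined in this
# section. A minimal sketch (an assumption, not necessarily the project's actual
# helper) that would make the test self-contained:
import random


def rval() -> float:
    """Return a random observation feature in [0, 1) for building fake replay frames."""
    return random.random()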
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'
    if config.save_gifs:
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)
    model = AttentionSAC.init_from_save(model_path)
    env = make_env(config.env_id, discrete_action=True)
    model.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval
    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames = []
            frames.append(env.render('rgb_array')[0])
        env.render('human')
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_actions = model.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render('human')
        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames, duration=ifi)
    env.close()
def run(config):
    env = football_env.create_environment(
        env_name=config["academy_scenario"],
        rewards=config["scoring"],
        render=config["render_mode"],
        number_of_left_players_agent_controls=config["num_to_control"],
        representation='raw')
    model = AttentionSAC.init_from_save(
        "./models/football/MAAC3/run2/model.pt", True)

    # (** EDITED **) Set Replay Buffer
    # Configure the buffer by iterating over the shapes of env.action_space and
    # env.observation_space.

    for ep_i in range(0, config["n_episodes"], config["n_rollout_threads"]):
        obs = env.reset()
        obs = make_state(obs)
        model.prep_rollouts(device='cpu')
        for et_i in range(config["episode_length"]):
            print("episode : {} | step : {}".format(ep_i, et_i), end='\r')
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config["n_rollout_threads"])]

            # Reform the actions list to fit the Football env:
            # Google Football expects integer action ids, not one-hot encoded lists.
            actions_list = [[np.argmax(b) for b in a] for a in actions]

            # Step
            next_obs, rewards, dones, infos = env.step(actions_list)
            next_obs = make_state(next_obs)

            # Prevention of divergence: without this offset training diverges (NaN).
            rewards = rewards - 0.000001

            # Reform the done flag list to match the replay buffer layout.
            obs = next_obs
    env.close()
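# The "(** EDITED **) Set Replay Buffer" step above is left unimplemented in this
# run() variant. A minimal sketch, assuming the same ReplayBuffer/Box interfaces used
# by the other training scripts in this section, of how the buffer could be built by
# iterating over the env spaces (hypothetical helper, not the author's code):
def build_replay_buffer_sketch(env, model, buffer_length):
    from gym.spaces import Box  # assumed gym-style spaces, as elsewhere in this repo
    obs_dims = [obsp.shape[0] for obsp in env.observation_space]
    act_dims = [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                for acsp in env.action_space]
    return ReplayBuffer(buffer_length, model.nagents, obs_dims, act_dims)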
def run(model_name: str):
    model_path, run_num, run_dir, log_dir = run_setup(model_name,
                                                      get_latest_model=True)
    if model_path is None:
        print("Couldn't find model!")
        return
    model = AttentionSAC.init_from_save(model_path)
    model.prep_rollouts(device='cpu')
    run_env: HaliteRunHelper = HaliteRunHelper()
    run_env.simulate(lambda o: model.step(o, explore=True), agent_count=2)
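# Example entry point (a sketch, assuming this rollout runner is launched directly;
# only the "model_name" argument is taken from run() above, the argparse wiring is an
# assumption):
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('model_name', help='name of the saved MAAC model to roll out')
    args = parser.parse_args()
    run(args.model_name)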
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'
    maac = AttentionSAC.init_from_save(model_path)
    env = MultiAgentEnv(config.env_id, config.n_controlled_lagents,
                        config.n_controlled_ragents, config.reward_type,
                        config.render)
    maac.prep_rollouts(device='cpu')
    goal_diff = 0
    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        for t_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1),
                                  requires_grad=False)
                         for i in range(maac.nagents)]
            # get actions as torch Variables
            torch_actions = maac.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)
            if all(dones):
                goal_diff += np.sum(rewards) / (config.n_controlled_lagents +
                                                config.n_controlled_ragents)
                break
    goal_diff /= config.n_episodes
    print(goal_diff)
    env.close()
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    envActionSpace = env.action_space
    envObservationSpace = env.observation_space
    model = AttentionSAC.init_from_env(envActionSpace,
                                       envObservationSpace,
                                       tau=config.tau,
                                       pi_lr=config.pi_lr,
                                       q_lr=config.q_lr,
                                       gamma=config.gamma,
                                       pol_hidden_dim=config.pol_hidden_dim,  # 128
                                       critic_hidden_dim=config.critic_hidden_dim,  # 128
                                       attend_heads=config.attend_heads,  # 4
                                       reward_scale=config.reward_scale)
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):  # 12
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):  # 25
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                # 100 steps across rollouts -> 4 updates
                model.prep_training(device='cpu')
                for u_i in range(config.num_updates):  # 4
                    sample = replay_buffer.sample(config.batch_size)
                    model.update_critic(sample)
                    model.update_policies(sample)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    # model = AttentionSAC.init_from_env(env,
    #                                    tau=config.tau,
    #                                    pi_lr=config.pi_lr,
    #                                    q_lr=config.q_lr,
    #                                    gamma=config.gamma,
    #                                    pol_hidden_dim=config.pol_hidden_dim,
    #                                    critic_hidden_dim=config.critic_hidden_dim,
    #                                    attend_heads=config.attend_heads,
    #                                    reward_scale=config.reward_scale)

    # Model used to test with adversarial agent
    # model = AttentionSAC.init_from_save("C:\\Users\\HP\\Desktop\\NTU\\FYP\\FYP Code\\MAAC\\Output\\run140\\model.pt")
    # print("Model instantiated")

    # Model used to test without adversarial agent
    model = AttentionSAC.init_from_save("C:\\Users\\HP\\Desktop\\NTU\\FYP\\FYP Code\\MAAC\\Output\\run148\\model.pt")
    print("Model instantiated")

    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    row_list = []
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            # print(rewards)
            # print(dones[0])
            # env.render('human')
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    # print(sample)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
            if dones[0][0]:
                print("Breaking the episode at timestep", et_i)
                break
        et_i += 1
        row_list.append((ep_i + 1, et_i))
        ep_rews = replay_buffer.get_average_rewards(et_i * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew * et_i, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    with open('Timesteps_vs_Episodes.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Ep No", "Number of Timesteps"])
        for row in row_list:
            writer.writerow(row)

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    # if not model_dir.exists():
    #     run_num = 1
    # else:
    #     exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
    #                      model_dir.iterdir() if
    #                      str(folder.name).startswith('run')]
    #     if len(exst_run_nums) == 0:
    #         run_num = 1
    #     else:
    #         run_num = max(exst_run_nums) + 1
    run_num = 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir, exist_ok=True)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    model = AttentionSAC.init_from_env(env,
                                       tau=config.tau,
                                       pi_lr=config.pi_lr,
                                       q_lr=config.q_lr,
                                       gamma=config.gamma,
                                       pol_hidden_dim=config.pol_hidden_dim,
                                       critic_hidden_dim=config.critic_hidden_dim,
                                       attend_heads=config.attend_heads,
                                       reward_scale=config.reward_scale)
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew * config.episode_length, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def run(config):
    cover_ratio = []
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    # os.makedirs(log_dir)
    # logger = SummaryWriter(str(log_dir))
    # torch.manual_seed(run_num)
    # np.random.seed(run_num)
    # env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    env = make_env(config.env_id, benchmark=BENCHMARK, discrete_action=True,
                   use_handcraft_policy=config.use_handcraft_policy)
    model = AttentionSAC.init_from_env(env,
                                       tau=config.tau,
                                       pi_lr=config.pi_lr,
                                       q_lr=config.q_lr,
                                       gamma=config.gamma,
                                       pol_hidden_dim=config.pol_hidden_dim,
                                       critic_hidden_dim=config.critic_hidden_dim,
                                       attend_heads=config.attend_heads,
                                       reward_scale=config.reward_scale)
    model.init_from_save_self('./models/swift_scenario/model/run8/model.pt')
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    update_count = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy().squeeze()
                             for ac in torch_agent_actions]
            # rearrange actions to be per environment
            # actions = [[ac[i] for ac in agent_actions]
            #            for i in range(config.n_rollout_threads)]
            # agent_actions[0][5] = 1
            # agent_actions[1][5] = 1
            # agent_actions[2][5] = 1
            next_obs, rewards, dones, infos = env.step(
                agent_actions,
                use_handcraft_policy=config.use_handcraft_policy)
            env.render()
            time.sleep(0.1)

            # # get actions as torch Variables
            # torch_agent_actions = model.step(torch_obs, explore=True)
            # # convert actions to numpy arrays
            # agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # # rearrange actions to be per environment
            # actions = [[ac[i] for ac in agent_actions]
            #            for i in range(config.n_rollout_threads)]
            # next_obs, rewards, dones, infos = env.step(actions)
            # env.render()

            # if et_i == config.episode_length - 1:
            #     print(infos)
            #     print(type(infos['cover_ratio']))
            #     cover_ratio.append(float(infos[0]['n'][0]['cover_ratio']))
            #     print(infos)

            # replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            '''
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    update_count += 1
                    print("episode:", ep_i, ", total steps:", t,
                          " update_count:", update_count)
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')

        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')
            logger.export_scalars_to_json(str(log_dir / 'summary.json'))

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
    print(cover_ratio)
    '''
    env.close()
def run(config):
    USE_CUDA = False
    if config.gpu:
        if torch.cuda.is_available():
            USE_CUDA = True
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))
    # model_run = 'run%i' % max(exst_run_nums)
    # model_path = model_dir / model_run / 'model.pt'

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num,
                            config.n_controlled_lagents,
                            config.n_controlled_ragents, config.reward_type,
                            config.render)
    model = AttentionSAC.init_from_env(env,
                                       tau=config.tau,
                                       pi_lr=config.pi_lr,
                                       q_lr=config.q_lr,
                                       gamma=config.gamma,
                                       pol_hidden_dim=config.pol_hidden_dim,
                                       critic_hidden_dim=config.critic_hidden_dim,
                                       attend_heads=config.attend_heads,
                                       reward_scale=config.reward_scale)
    # model = AttentionSAC.init_from_save_(model_path, load_critic=False, gpu=USE_CUDA)
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    best_rewards = 0
    t = 0
    num_episodes = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        if ep_i % (config.epoch_size * config.n_rollout_threads) == 0:
            stat = dict()
            stat['epoch'] = int(ep_i / (config.epoch_size *
                                        config.n_rollout_threads) + 1)
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        s = dict()
        s['dones'] = [0 for i in range(config.n_rollout_threads)]
        s['num_episodes'] = [0 for i in range(config.n_rollout_threads)]
        s['reward'] = [0 for i in range(config.n_rollout_threads)]
        s['success'] = [0 for i in range(config.n_rollout_threads)]
        s['steps_taken'] = [0 for i in range(config.n_rollout_threads)]
        s['reward_buffer'] = [0 for i in range(config.n_rollout_threads)]
        s['steps_buffer'] = [0 for i in range(config.n_rollout_threads)]

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=USE_CUDA)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')

            for i in range(config.n_rollout_threads):
                s['reward'][i] += np.mean(rewards[i])
                s['steps_taken'][i] += 1
                if dones[i][0] == True:
                    s['dones'][i] += 1
                    s['num_episodes'][i] += 1
                    s['reward_buffer'][i] = s['reward'][i]
                    s['steps_buffer'][i] = s['steps_taken'][i]
                    if infos[i]['score_reward'] == 1:
                        s['success'][i] += 1
                if et_i == config.episode_length - 1:
                    if dones[i][0] == False:
                        if s['dones'][i] > 0:
                            s['reward'][i] = s['reward_buffer'][i]
                            s['steps_taken'][i] = s['steps_buffer'][i]
                        else:
                            s['num_episodes'][i] += 1

        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        global_ep_rews = 0
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalars('agent%i/rewards' % a_i,
                               {'mean_episode_rewards': a_ep_rew}, ep_i)
            global_ep_rews += a_ep_rew / (config.n_controlled_lagents +
                                          config.n_controlled_ragents)
        logger.add_scalars('global', {'global_rewards': global_ep_rews}, ep_i)

        if global_ep_rews > 0.007:
            model.save(run_dir / ('model_ep%i.pt' % ep_i))
            # print('model saved at ep%i' % ep_i)
            # print('saved model reward: ', global_ep_rews)
        if global_ep_rews > best_rewards:
            best_rewards = global_ep_rews
            if best_rewards > 0.005:
                model.save(run_dir / ('best_model_ep%i.pt' % ep_i))
                # print('best model saved at ep%i' % ep_i)
                # print('best global reward: ', best_rewards)
        # if ep_i % 500 == 0:
        #     print('episode: ', ep_i)
        #     print('global reward: ', global_ep_rews)
        #     print('best global reward: ', best_rewards)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

        # An exact episode means a real episode in the game, rather than an episode
        # in a training loop. Mean (exact) episode data are only generated from
        # complete exact episodes, and are calculated once per epoch;
        # (config.epoch_size * config.n_rollout_threads) is the number of training
        # episodes an epoch includes. The mean (exact) episode data are used for
        # visualization and comparison: Reward, Steps-Taken, Success.
        stat['num_episodes'] = stat.get('num_episodes', 0) + np.sum(s['num_episodes'])
        stat['reward'] = stat.get('reward', 0) + np.sum(s['reward'])
        stat['success'] = stat.get('success', 0) + np.sum(s['success'])
        stat['steps_taken'] = stat.get('steps_taken', 0) + np.sum(s['steps_taken'])

        if (ep_i + config.n_rollout_threads) % (config.epoch_size *
                                                config.n_rollout_threads) == 0:
            num_episodes += stat['num_episodes']
            print('Epoch {}'.format(stat['epoch']))
            print('Episode: {}'.format(num_episodes))
            print('Reward: {}'.format(stat['reward'] / stat['num_episodes']))
            print('Success: {:.2f}'.format(stat['success'] / stat['num_episodes']))
            print('Steps-Taken: {:.2f}'.format(stat['steps_taken'] / stat['num_episodes']))

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def run(config): model_dir = Path('./models') / config["env_id"] / config["model_name"] if not model_dir.exists(): run_num = 1 else: exst_run_nums = [ int(str(folder.name).split('run')[1]) for folder in model_dir.iterdir() if str(folder.name).startswith('run') ] if len(exst_run_nums) == 0: run_num = 1 else: run_num = max(exst_run_nums) + 1 curr_run = 'run%i' % run_num run_dir = model_dir / curr_run log_dir = run_dir / 'logs' os.makedirs(log_dir) logger = SummaryWriter(str(log_dir)) torch.manual_seed(run_num) np.random.seed(run_num) env = make_parallel_env(config["env_id"], config["n_rollout_threads"], run_num) model = AttentionSAC.init_from_env( env, tau=config["tau"], pi_lr=config["pi_lr"], q_lr=config["q_lr"], gamma=config["gamma"], pol_hidden_dim=config["pol_hidden_dim"], critic_hidden_dim=config["critic_hidden_dim"], attend_heads=config["attend_heads"], reward_scale=config["reward_scale"]) replay_buffer = ReplayBuffer(config["buffer_length"], model.nagents, [115 for _ in range(11)], [19 for _ in range(11)]) t = 0 for ep_i in range(0, config["n_episodes"], config["n_rollout_threads"]): print("Episodes %i-%i of %i" % (ep_i + 1, ep_i + 1 + config["n_rollout_threads"], config["n_episodes"])) obs = env.reset() model.prep_rollouts(device='cpu') done = [False] et_i = 0 while not any(done): et_i += 1 # rearrange observations to be per agent, and convert to torch Variable torch_obs = [ Variable(torch.Tensor(np.vstack(obs[:, i])), requires_grad=False) for i in range(model.nagents) ] # get actions as torch Variables torch_agent_actions = model.step(torch_obs, explore=True) # convert actions to numpy arrays agent_actions = [ac.data.numpy() for ac in torch_agent_actions] # rearrange actions to be per environment actions = [[ac[i] for ac in agent_actions] for i in range(config["n_rollout_threads"])] actions_list = [] for a in actions: temp = [] for b in a: temp.append(np.argmax(b)) actions_list.append(temp) next_obs, rewards, done, infos = env.step(actions_list) dones = [done for _ in range(11)] replay_buffer.push(obs, agent_actions, rewards, next_obs, dones) obs = next_obs t += config["n_rollout_threads"] if (len(replay_buffer) >= config["batch_size"] and (t % config["steps_per_update"]) < config["n_rollout_threads"]): if config["use_gpu"]: model.prep_training(device='gpu') else: model.prep_training(device='cpu') for u_i in range(config["num_updates"]): sample = replay_buffer.sample(config["batch_size"], to_gpu=config["use_gpu"]) model.update_critic(sample, logger=logger) model.update_policies(sample, logger=logger) model.update_all_targets() model.prep_rollouts(device='cpu') print("ep_i : {} | et_i : {}".format(ep_i, et_i), end='\r') ep_rews = replay_buffer.get_average_rewards( config["episode_length"] * config["n_rollout_threads"]) for a_i, a_ep_rew in enumerate(ep_rews): logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew * config["episode_length"], ep_i) if ep_i % config["save_interval"] < config["n_rollout_threads"]: model.prep_rollouts(device='cpu') os.makedirs(run_dir / 'incremental', exist_ok=True) model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))) model.save(run_dir / 'model.pt') model.save(run_dir / 'model.pt') env.close() logger.export_scalars_to_json(str(log_dir / 'summary.json')) logger.close()
def train(env):
    n_agents = env["n_agents"]
    x_dim = env["x_dim"]
    y_dim = env["y_dim"]
    n_cities = env["n_cities"]
    max_rails_between_cities = env["max_rails_between_cities"]
    max_rails_in_city = env["max_rails_in_city"]
    seed = 0
    use_fast_tree_obs = False

    # Observation parameters
    observation_tree_depth = 4
    observation_radius = 10
    observation_max_path_depth = 30

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = None

    if use_fast_tree_obs:
        tree_observation = FastTreeObs(max_depth=observation_tree_depth)
        print("Using FastTreeObs")
    else:
        tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                             predictor=predictor)
        print("Using StandardTreeObs")

    speed_profiles = {
        1.: 1.0,       # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0   # Slow freight train
    }

    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)

    rewards = []
    obs, info = env.reset()

    if use_fast_tree_obs:
        state_size = tree_observation.observation_dim
    else:
        # Calculate the state size given the depth of the tree observation and the
        # number of features
        n_features_per_node = env.obs_builder.observation_dim
        n_nodes = 0
        for i in range(observation_tree_depth + 1):
            n_nodes += np.power(4, i)
        state_size = n_features_per_node * n_nodes

    action_size = 5

    DEVICE = 'cpu'
    # if torch.cuda.is_available():
    #     DEVICE = 'gpu'

    buffer_length = 10000
    steps_to_save_model = 10
    step_size = 100
    num_steps = 100  # update every 100 steps
    avg_steps = 20  # num steps to average and plot rewards
    reward_q = []
    batch_size = 100

    agent_obs = np.array([None] * env.get_num_agents())

    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
    num_episodes = 100000

    agent_init_params = []
    sa_size = []
    for i in range(n_agents):
        agent_init_params.append({
            'num_in_pol': state_size,
            'num_out_pol': action_size,
            'init_weights': 'model.pt'
        })
        sa_size.append((state_size, action_size))

    hyperparams = {
        "tau": 0.01,
        "pi_lr": 0.00001,
        "q_lr": 0.00005,
        "pol_hidden_dim": 256,
        "critic_hidden_dim": 256,
        "attend_heads": 8
    }

    model = AttentionSAC(agent_init_params=agent_init_params,
                         sa_size=sa_size,
                         tau=hyperparams["tau"],
                         pi_lr=hyperparams["pi_lr"],
                         q_lr=hyperparams["q_lr"],
                         pol_hidden_dim=hyperparams["pol_hidden_dim"],
                         critic_hidden_dim=hyperparams["critic_hidden_dim"],
                         attend_heads=hyperparams["attend_heads"])
    model.init_dict = {}

    replay_buffer = ReplayBuffer(buffer_length, n_agents,
                                 [state_size for i in range(n_agents)],
                                 [action_size for i in range(n_agents)])

    print("MAX STEPS: " + str(max_steps))
    print("NUM EPISODES: ", num_episodes)
    print("HYPERPARAMS: ")
    print(hyperparams)

    start_time = time.time()

    for ep in range(num_episodes):
        print("Episode " + str(ep) + ":", flush=True)
        obs, info = env.reset(True, True)
        model.prep_rollouts(device=DEVICE)
        reward_sum_for_this_episode = 0

        for steps in range(max_steps):
            if steps % step_size == 0:
                print("=", end="", flush=True)

            for agent in env.get_agent_handles():
                if obs[agent] is not None:
                    if use_fast_tree_obs:
                        agent_obs[agent] = obs[agent]
                    else:
                        agent_obs[agent] = normalize_observation(
                            obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    agent_obs[agent] = np.array([0.] * state_size)

            action_dict = {}
            agent_actions = []

            torch_obs = [Variable(torch.Tensor([agent_obs[i]]),
                                  requires_grad=False)
                         for i in range(n_agents)]
            torch_agent_actions = model.step(torch_obs, explore=True)
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # pick the index of the sampled (one-hot) action for each agent
            for i in range(n_agents):
                dist = torch_agent_actions[i][0]
                idx = -1
                for j in range(action_size):
                    if dist[j] != 0:
                        idx = j
                        break
                action_dict[i] = idx

            next_obs, all_rewards, done, info = env.step(action_dict)

            rewards = []
            dones = []
            next_agent_obs = np.array([None] * env.get_num_agents())

            for agent in env.get_agent_handles():
                if next_obs[agent] is not None:
                    if use_fast_tree_obs:
                        next_agent_obs[agent] = next_obs[agent]
                    else:
                        next_agent_obs[agent] = normalize_observation(
                            next_obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    next_agent_obs[agent] = np.array([0.] * state_size)

            for i in range(n_agents):
                reward_sum_for_this_episode += all_rewards[i]
                rewards.append(all_rewards[i])
                all_rewards[i] += augment_reward(agent_obs[agent])
                dones.append(done[i])

            replay_buffer.push(np.array([agent_obs]), np.array(agent_actions),
                               np.array([rewards]), np.array([next_agent_obs]),
                               np.array([dones]))
            # advance observations for the next step
            obs = next_obs

            if steps % num_steps == 0:
                model.prep_training(device=DEVICE)
                sample = replay_buffer.sample(batch_size, norm_rews=False)
                # print(sample)
                model.update_critic(sample)
                model.update_policies(sample)
                model.update_all_targets()
                model.prep_rollouts(device=DEVICE)

        reward_sum_for_this_episode /= n_agents
        reward_q.append(reward_sum_for_this_episode)

        if len(reward_q) == avg_steps:
            wandb.log({'reward': np.mean(reward_q)})
            reward_q = []

        print()

        if ep % steps_to_save_model == 0:
            print("\nSaving model")
            model.save(os.getcwd() + "/model.pt")
            cur_time = time.time()
            time_elapsed = (cur_time - start_time) // 60
            print("Time Elapsed: " + str(time_elapsed) + "\n")
def run(halite_env: BaseEnv, load_latest: bool = False):
    config = halite_env.config
    model_path, run_num, run_dir, log_dir = run_setup(config.model_name,
                                                      get_latest_model=load_latest)
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)

    # Build MAAC model
    if model_path is None:
        model = AttentionSAC(halite_env.agent_type_topologies,
                             tau=config.tau,
                             pi_lr=config.pi_lr,
                             q_lr=config.q_lr,
                             gamma=config.gamma,
                             pol_hidden_dim=config.pol_hidden_dim,
                             critic_hidden_dim=config.critic_hidden_dim,
                             attend_heads=config.attend_heads,
                             reward_scale=config.reward_scale)
    else:
        model = AttentionSAC.init_from_save(model_path, load_critic=True)

    # Build replay buffer
    replay_buffer = ReplayBuffer(config.buffer_length)

    prev_time = time.perf_counter()
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        curr_time = time.perf_counter()
        print("Episodes %i-%i of %i (%is)" % (ep_i + 1,
                                              ep_i + 1 + config.n_rollout_threads,
                                              config.n_episodes,
                                              (curr_time - prev_time)))
        model.prep_rollouts(device='cpu')

        game_reward = halite_env.simulate(lambda o: model.step(o, explore=True),
                                          replay_buffer)

        t += config.n_rollout_threads
        if (replay_buffer.length() >= config.batch_size and
                (t % config.games_per_update) < config.n_rollout_threads):
            print("Training")
            if config.use_gpu:
                model.prep_training(device='gpu')
            else:
                model.prep_training(device='cpu')
            for u_i in range(config.num_updates):
                sample: List[Dict[AgentKey, AgentReplayFrame]] = \
                    replay_buffer.sample(config.batch_size)
                # print("Original sample size", len(sample))
                # print("Preprocessing to batch structure")
                sample: Dict[AgentKey, BatchedAgentReplayFrame] = \
                    preprocess_to_batch(sample, to_gpu=config.use_gpu)
                # print("Filtered sample size", len(sample))
                # if len(sample) < 5:
                #     print("Sample size keys:", sample.keys())
                # print("Updating model critic")
                model.update_critic(sample, logger=logger)
                # print("Updating model policies")
                model.update_policies(sample, logger=logger)
                model.update_all_targets()
            model.prep_rollouts(device='cpu')

        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for k, v in ep_rews.items():
            logger.add_scalar('agent%s/mean_episode_rewards' % str(k), v, ep_i)
        logger.add_scalar("global_env_rewards", game_reward, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            print("Saving")
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')
            print("run_dir", run_dir)
        prev_time = curr_time

    model.save(run_dir / 'model.pt')
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def run(config):
    numWolves = 4
    numSheep = 1
    numBlocks = 2
    numAgents = numWolves + numSheep
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheep + \
                       [blockSize] * numBlocks
    sheepMaxSpeed = 1.3
    wolfMaxSpeed = 1.0
    blockMaxSpeed = None
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [sheepMaxSpeed] * numSheep + \
                         [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    collisionReward = 10
    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList,
                              getPosFromAgentState, isCollision,
                              punishForOutOfBound,
                              collisionPunishment=collisionReward)

    individualRewardWolf = 0
    rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision,
                            collisionReward, individualRewardWolf)

    reshapeAction = ReshapeAction()
    costActionRatio = 0
    getActionCost = GetActionCost(costActionRatio, reshapeAction,
                                  individualCost=True)
    getWolvesAction = lambda action: [action[wolfID] for wolfID in wolvesID]
    rewardWolfWithActionCost = lambda state, action, nextState: \
        np.array(rewardWolf(state, action, nextState)) - \
        np.array(getActionCost(getWolvesAction(action)))

    rewardFunc = lambda state, action, nextState: \
        list(rewardWolfWithActionCost(state, action, nextState)) + \
        list(rewardSheep(state, action, nextState))

    reset = ResetMultiAgentChasing(numAgents, numBlocks)
    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID,
                                              blocksID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [observeOneAgent(agentID)(state)
                             for agentID in range(numAgents)]

    reshapeAction = ReshapeAction()
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID, entitiesMovableList)
    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList,
                                          entitiesSizeList, getCollisionForce,
                                          getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList, massList,
                                    entityMaxSpeedList, getVelFromAgentState,
                                    getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeAction,
                                       applyActionForce, applyEnvironForce,
                                       integrateState)
    isTerminal = lambda state: [False] * numAgents

    initObsForParams = observe(reset())
    envObservationSpace = [initObsForParams[obsID].shape
                           for obsID in range(len(initObsForParams))]
    worldDim = 2
    envActionSpace = [spaces.Discrete(worldDim * 2 + 1)
                      for agentID in range(numAgents)]

    model_dir = os.path.join(dirName, 'models', config.env_id, config.model_name)

    model = AttentionSAC.init_from_env(envActionSpace,
                                       envObservationSpace,
                                       tau=config.tau,
                                       pi_lr=config.pi_lr,
                                       q_lr=config.q_lr,
                                       gamma=config.gamma,
                                       pol_hidden_dim=config.pol_hidden_dim,  # 128
                                       critic_hidden_dim=config.critic_hidden_dim,  # 128
                                       attend_heads=config.attend_heads,  # 4
                                       reward_scale=config.reward_scale)
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp[0] if isinstance(obsp, tuple) else obsp.shape[0]
                                  for obsp in envObservationSpace],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in envActionSpace])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):  # 12
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        state = reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            obs = observe(state)
            obs = np.array([obs])
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            action = actions[0]
            nextState = transit(state, action)
            next_obs = np.array([observe(nextState)])
            rewards = np.array([rewardFunc(state, action, nextState)])
            dones = np.array([isTerminal(nextState)])

            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            state = nextState
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                # 100 steps across rollouts -> 4 updates
                model.prep_training(device='cpu')
                for u_i in range(config.num_updates):  # 4
                    sample = replay_buffer.sample(config.batch_size)
                    model.update_critic(sample)
                    model.update_policies(sample)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            pathIncremental = os.path.join(model_dir, 'incremental')
            if not os.path.exists(pathIncremental):
                os.makedirs(pathIncremental)
            model.save(os.path.join(pathIncremental, 'model_ep%i.pt' % (ep_i + 1)))

    model.save(os.path.join(model_dir, 'model.pt'))
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    # log_dir = run_dir / 'logs'
    os.makedirs(run_dir)
    # logger = SummaryWriter(str(log_dir))

    # Initialization of evaluation metrics
    collisions = [0]
    success_nums = [0]
    ccr_activates = [0]
    final_ep_rewards = []  # sum of rewards for training curve
    final_ep_collisions = []
    final_ep_activates = []
    final_ep_success_nums = []

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_env(config.env_id, discrete_action=True)
    num_agents = env.n
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    # if config.emergency:
    #     env.switch_emergency()
    model = AttentionSAC.init_from_env(env,
                                       tau=config.tau,
                                       pi_lr=config.pi_lr,
                                       q_lr=config.q_lr,
                                       gamma=config.gamma,
                                       pol_hidden_dim=config.pol_hidden_dim,
                                       critic_hidden_dim=config.critic_hidden_dim,
                                       attend_heads=config.attend_heads,
                                       reward_scale=config.reward_scale)
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    # All tensorboard logging has been removed and replaced with print and pickle.
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        # print("Episodes %i-%i of %i" % (ep_i + 1,
        #                                 ep_i + 1 + config.n_rollout_threads,
        #                                 config.n_episodes))
        if config.emergency:
            env.switch_emergency()
        obs = env.reset()
        model.prep_rollouts(device='cpu')
        t_start = time.time()

        prev_obs = None
        act_n_t_minus_1 = None
        for et_i in range(config.episode_length):
            if config.CCR:
                if act_n_t_minus_1:
                    target_obs_n, _, _, _ = env.oracle_step(act_n_t_minus_1)
                    diff_state = obs[:, :, :4] - target_obs_n[:, :, :4]  # 12x4x4
                    if config.env_id == 'wall' or config.env_id == 'strong_wind' \
                            or config.env_id == 'wall_expos':
                        diff_obs = obs[:, :, -(model.nagents + 8 + 1)]
                    elif config.env_id == 'turbulence':
                        diff_obs = obs[:, :, -(model.nagents + 2 + 1)]
                    else:
                        assert False
                    emerg_n = np.sum(diff_state**2, axis=-1) + diff_obs  # 12x4
                    env.oracle_update()
                    # obs: 12x4x20
                    # emerg_n: 12x4
                    for agent_i in range(model.nagents):
                        for agent_j in range(model.nagents):
                            obs[:, agent_i, -agent_j] = emerg_n[:, agent_j]

            # collect experience
            if prev_obs is not None:
                replay_buffer.push(prev_obs, agent_actions, rewards, obs, dones)

            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)

            if config.CCR:
                if act_n_t_minus_1:
                    for i in range(model.nagents):
                        for j in range(model.nagents):
                            # ccr_activates[-1] += 1
                            intrinsic_reward = np.linalg.norm(
                                next_obs[:, i, 2:4] - obs[:, j, 2:4], axis=-1) - \
                                np.linalg.norm(obs[:, i, 2:4] - obs[:, j, 2:4],
                                               axis=-1)
                            intrinsic_reward /= (1 + np.linalg.norm(
                                obs[:, i, 2:4] - obs[:, j, 2:4], axis=-1))
                            intrinsic_reward *= (emerg_n[:, j] - emerg_n[:, i])
                            rewards[:, i] += 10 * intrinsic_reward / np.sqrt(num_agents)
                            """
                            if (len(episode_rewards) == 2 or
                                    len(episode_rewards) == 2000 or
                                    len(episode_rewards) == 5000) and episode_step % 5 == 0:
                                Ls[i].append(' intrinsic reward = ' +
                                             str(intrinsic_reward) + '\n')
                            """
                            # if i == j: continue
                            # emerg_invalid = ~((emerg_n[:, j] > emerg_n[:, i]) &
                            #                   (emerg_n[:, j] > 0))
                            # ccr_activates[-1] += (~emerg_invalid).sum()
                            # intrinsic_reward = np.linalg.norm(next_obs[:, i, 2:4] - obs[:, j, 2:4], axis=-1) - \
                            #     np.linalg.norm(obs[:, i, 2:4] - obs[:, j, 2:4], axis=-1)
                            # intrinsic_reward[emerg_invalid] = 0
                            # rewards[:, i] += 10 * intrinsic_reward
                act_n_t_minus_1 = actions

            prev_obs = obs
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_critic(sample, logger=None)
                    model.update_policies(sample, logger=None)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')

        ls_num_collision = env.get_collision_and_zero_out()
        collisions.append(np.array(ls_num_collision).mean())  # might need to convert to np.int

        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        ep_rews = np.array(ep_rews).mean()

        # save model, display training output
        print("episodes: {}, mean episode reward: {}, "
              "mean number of collisions with wall: {}, ccr activates: {}, "
              "success numbers: {}, time: {}".format(
                  ep_i, ep_rews, np.mean(collisions[-config.save_rate:]),
                  np.mean(ccr_activates[-config.save_rate:]),
                  np.mean(success_nums[-config.save_rate:]),
                  round(time.time() - t_start, 3)))

        # Keep track of final episode reward
        final_ep_rewards.append(ep_rews)
        # final_ep_activates.append(np.mean(ccr_activates[-config.save_rate:]))
        final_ep_collisions.append(np.mean(collisions[-config.save_rate:]))
        final_ep_success_nums.append(np.mean(success_nums[-config.save_rate:]))

        if ep_i % config.save_rate == 0:
            x_axis = np.arange(0, ep_i + 1, step=12)

            # plot reward data
            rew_file_name = run_dir / 'rewards.png'
            plt.plot(x_axis, final_ep_rewards)
            plt.xlabel('training episode')
            plt.ylabel('reward')
            # plt.legend()
            plt.savefig(rew_file_name)
            plt.clf()

            collision_file_name = run_dir / 'collisions.png'
            plt.plot(x_axis, final_ep_collisions)
            plt.xlabel('training episode')
            plt.ylabel('number of collisions')
            # plt.legend()
            plt.savefig(collision_file_name)
            plt.clf()

            # activates_file_name = run_dir / 'activates.png'
            # plt.plot(x_axis, final_ep_activates)
            # plt.xlabel('training episode')
            # plt.ylabel('CCR activates')
            # plt.legend()
            # plt.savefig(activates_file_name)
            # plt.clf()

            success_file_name = run_dir / 'successes.png'
            plt.plot(x_axis, final_ep_success_nums)
            plt.xlabel('training episode')
            plt.ylabel('success numbers')
            # plt.legend()
            plt.savefig(success_file_name)
            plt.clf()

            rew_file_name = run_dir / 'rewards.pkl'
            collision_file_name = run_dir / 'collisions.pkl'
            success_nums_file_name = run_dir / 'success_nums.pkl'
            # activates_file_name = run_dir / 'activates.pkl'

            with open(rew_file_name, 'wb') as fp:
                pickle.dump(final_ep_rewards, fp)
            with open(collision_file_name, 'wb') as fp:
                pickle.dump(final_ep_collisions, fp)
            # with open(activates_file_name, 'wb') as fp:
            #     pickle.dump(final_ep_activates, fp)
            with open(success_nums_file_name, 'wb') as fp:
                pickle.dump(final_ep_success_nums, fp)
            plt.clf()

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
def test(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        # runs the newest
        run_num = max(exst_run_nums)
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run

    # Initialization of evaluation metrics
    collisions = [0]
    success_nums = [0]
    ccr_activates = [0]
    final_ep_rewards = []  # sum of rewards for training curve
    final_ep_collisions = []
    final_ep_activates = []
    final_ep_success_nums = []

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_env(config.env_id, discrete_action=True)
    env.seed(run_num)
    np.random.seed(run_num)
    model = AttentionSAC.init_from_save(run_dir / 'model.pt', True)
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    # All tensorboard logging has been removed and replaced with print and pickle.
    for ep_i in range(0, config.n_episodes):
        obs = np.expand_dims(np.array(env.reset()), 0)
        model.prep_rollouts(device='cpu')
        t_start = time.time()

        prev_obs = None
        act_n_t_minus_1 = None
        for et_i in range(config.episode_length):
            if config.CCR:
                if act_n_t_minus_1:
                    target_obs_n, _, _, _ = env.oracle_step(act_n_t_minus_1[0])
                    target_obs_n = np.expand_dims(np.array(target_obs_n), 0)
                    diff_state = obs[:, :, :4] - target_obs_n[:, :, :4]  # 1x4x4
                    if config.env_id == 'wall':
                        diff_obs = obs[:, :, -(model.nagents + 8 + 1)]
                    elif config.env_id == 'turbulence':
                        diff_obs = obs[:, :, -(model.nagents + 2 + 1)]
                    else:
                        assert False
                    emerg_n = np.sum(diff_state**2, axis=-1) + diff_obs  # 1x4
                    env.oracle_update()
                    # obs: 1x4x20
                    # emerg_n: 1x4
                    for agent_i in range(model.nagents):
                        for agent_j in range(model.nagents):
                            obs[:, agent_i, -agent_j] = emerg_n[:, agent_j]

            # collect experience
            if prev_obs is not None:
                replay_buffer.push(prev_obs, agent_actions, rewards, obs, dones)
            # print(obs)

            # convert observation to torch Variable
            torch_obs = []
            for i in range(model.nagents):
                torch_obs.append(Variable(torch.Tensor(obs[:, i]),
                                          requires_grad=False))
            # print(torch_obs)

            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[0] for ac in agent_actions]]
            # actions = [[ac[i] for ac in agent_actions]
            #            for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions[0])
            next_obs = np.expand_dims(np.array(next_obs), 0)
            rewards = np.expand_dims(np.array(rewards), 0)
            dones = np.expand_dims(np.array(dones), 0)
            infos = np.expand_dims(np.array(infos), 0)

            if config.CCR:
                act_n_t_minus_1 = actions

            prev_obs = obs
            obs = next_obs
            t += 1

            # for displaying learned policies
            if config.display:
                time.sleep(0.1)
                env.render()
                continue
    env.close()
def main():
    debug = 1
    if debug:
        numWolves = 3
        numSheep = 1
        numBlocks = 2
        sheepSpeedMultiplier = 1
        individualRewardWolf = 0
        costActionRatio = 0.0
    else:
        print(sys.argv)
        condition = json.loads(sys.argv[1])
        numWolves = int(condition['numWolves'])
        numSheep = int(condition['numSheeps'])
        numBlocks = int(condition['numBlocks'])
        sheepSpeedMultiplier = float(condition['sheepSpeedMultiplier'])
        individualRewardWolf = float(condition['individualRewardWolf'])
        costActionRatio = float(condition['costActionRatio'])

    modelName = "maac{}wolves{}sheep{}blocksSheepSpeed{}WolfActCost{}individ{}".format(
        numWolves, numSheep, numBlocks, sheepSpeedMultiplier, costActionRatio,
        individualRewardWolf)

    n_rollout_threads = 1
    buffer_length = int(1e6)
    n_episodes = 60000
    episode_length = 75
    steps_per_update = 100
    num_updates = 4
    batch_size = 1024
    save_interval = 1000
    pol_hidden_dim = 128
    critic_hidden_dim = 128
    attend_heads = 4
    pi_lr = 0.001
    q_lr = 0.001
    tau = 0.001
    gamma = 0.99
    reward_scale = 100.

    numAgents = numWolves + numSheep
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheep + \
        [blockSize] * numBlocks
    wolfMaxSpeed = 1.0
    blockMaxSpeed = None
    sheepMaxSpeedOriginal = 1.3
    sheepMaxSpeed = sheepMaxSpeedOriginal * sheepSpeedMultiplier
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + \
        [sheepMaxSpeed] * numSheep + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    collisionReward = 10
    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList,
                              getPosFromAgentState, isCollision,
                              punishForOutOfBound,
                              collisionPunishment=collisionReward)
    rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision,
                            collisionReward, individualRewardWolf)
    reshapeAction = ReshapeAction()
    getActionCost = GetActionCost(costActionRatio, reshapeAction,
                                  individualCost=True)
    getWolvesAction = lambda action: [action[wolfID] for wolfID in wolvesID]
    rewardWolfWithActionCost = lambda state, action, nextState: np.array(
        rewardWolf(state, action, nextState)) - np.array(
            getActionCost(getWolvesAction(action)))
    rewardFunc = lambda state, action, nextState: \
        list(rewardWolfWithActionCost(state, action, nextState)) + \
        list(rewardSheep(state, action, nextState))

    reset = ResetMultiAgentChasing(numAgents, numBlocks)
    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID,
                                              blocksID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [observeOneAgent(agentID)(state)
                             for agentID in range(numAgents)]

    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID, entitiesMovableList)
    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList,
                                          entitiesSizeList, getCollisionForce,
                                          getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList, massList,
                                    entityMaxSpeedList, getVelFromAgentState,
                                    getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeAction,
                                       applyActionForce, applyEnvironForce,
                                       integrateState)
    isTerminal = lambda state: [False] * numAgents

    initObsForParams = observe(reset())
    envObservationSpace = [
        initObsForParams[obsID].shape for obsID in range(len(initObsForParams))
    ]
    worldDim = 2
    envActionSpace = [
        spaces.Discrete(worldDim * 2 + 1) for agentID in range(numAgents)
    ]

    model_dir = os.path.join(dirName, 'models', 'chasing')
    model = AttentionSAC.init_from_env(envActionSpace,
                                       envObservationSpace,
                                       tau=tau,
                                       pi_lr=pi_lr,
                                       q_lr=q_lr,
                                       gamma=gamma,
                                       pol_hidden_dim=pol_hidden_dim,
                                       critic_hidden_dim=critic_hidden_dim,
                                       attend_heads=attend_heads,
                                       reward_scale=reward_scale)
    replay_buffer = ReplayBuffer(
        buffer_length, model.nagents,
        [obsp[0] if isinstance(obsp, tuple) else obsp.shape[0]
         for obsp in envObservationSpace],
        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
         for acsp in envActionSpace])

    t = 0
    for ep_i in range(0, n_episodes, n_rollout_threads):
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + n_rollout_threads, n_episodes))
        state = reset()
        model.prep_rollouts(device='cpu')
        for et_i in range(episode_length):
            obs = observe(state)
            obs = np.array([obs])
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(n_rollout_threads)]
            action = actions[0]
            nextState = transit(state, action)
            next_obs = np.array([observe(nextState)])
            rewards = np.array([rewardFunc(state, action, nextState)])
            dones = np.array([isTerminal(nextState)])
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            state = nextState
            t += n_rollout_threads
            if (len(replay_buffer) >= batch_size and
                    (t % steps_per_update) < n_rollout_threads):
                # 100 steps across rollouts -> 4 updates
                model.prep_training(device='cpu')
                for u_i in range(num_updates):
                    sample = replay_buffer.sample(batch_size)
                    model.update_critic(sample)
                    model.update_policies(sample)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
        if ep_i % save_interval < n_rollout_threads:
            model.prep_rollouts(device='cpu')
            model.save(os.path.join(model_dir, modelName + 'eps' + str(ep_i)))
    model.save(os.path.join(model_dir, modelName))
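The update schedule in the loop above is driven entirely by the counter t. As a quick standalone illustration (an assumption-free toy, not project code), with n_rollout_threads = 1 and steps_per_update = 100 the trigger condition fires exactly on every 100th environment step, and each firing performs num_updates = 4 rounds of critic/policy updates once the buffer holds at least batch_size transitions.

# Toy illustration of the update cadence used above (assumes the buffer is
# already filled past batch_size, so only the modulo condition matters).
n_rollout_threads, steps_per_update, num_updates = 1, 100, 4
update_steps = []
t = 0
for step in range(1, 501):
    t += n_rollout_threads
    if (t % steps_per_update) < n_rollout_threads:
        update_steps.append((t, num_updates))
print(update_steps)  # [(100, 4), (200, 4), (300, 4), (400, 4), (500, 4)]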
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(1804)
    np.random.seed(1804)
    # initialize E parallel environments with N agents
    env = make_parallel_env(config.env_id, config.n_rollout_threads, 1804)
    model = AttentionSAC.init_from_save('model.pt')
    # model = AttentionSAC.init_from_env(env,
    #                                    tau=config.tau,
    #                                    pi_lr=config.pi_lr,
    #                                    q_lr=config.q_lr,
    #                                    gamma=config.gamma,
    #                                    pol_hidden_dim=config.pol_hidden_dim,
    #                                    critic_hidden_dim=config.critic_hidden_dim,
    #                                    attend_heads=config.attend_heads,
    #                                    reward_scale=config.reward_scale)
    # initialize replay buffer D
    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
         for acsp in env.action_space])
    # T_update
    t = 0
    max_step = 0
    max_time = 0
    total_step = np.zeros(model.nagents)
    total_time = np.zeros(model.nagents)
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + config.n_rollout_threads,
               config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')
        success = np.zeros((config.n_rollout_threads, model.nagents),
                           dtype=bool)
        steps = np.zeros((config.n_rollout_threads, model.nagents))
        time_cost = np.zeros((config.n_rollout_threads, model.nagents))
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            # time the forward pass (time.clock() was removed in Python 3.8)
            start = time.perf_counter()
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=False)
            end = time.perf_counter()
            per_time_cost = end - start
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            # calculate steps
            success = np.logical_or(success, dones)
            # steps += dones
            steps += np.logical_not(dones)
            time_cost += np.logical_not(dones) * per_time_cost
            # store transitions for all env in replay buffer
            # replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            # T_update = T_update + E
            t += config.n_rollout_threads
            # (training updates are disabled in this evaluation script)
            # if (len(replay_buffer) >= max(config.pi_batch_size, config.q_batch_size) and
            #         (t % config.steps_per_update) < config.n_rollout_threads):
            #     if config.use_gpu:
            #         model.prep_training(device='gpu')
            #     else:
            #         model.prep_training(device='cpu')
            #     for u_i in range(config.num_critic_updates):
            #         sample = replay_buffer.sample(config.q_batch_size,
            #                                       to_gpu=config.use_gpu)
            #         model.update_critic(sample, logger=logger)
            #     for u_i in range(config.num_pol_updates):
            #         sample = replay_buffer.sample(config.pi_batch_size,
            #                                       to_gpu=config.use_gpu)
            #         model.update_policies(sample, logger=logger)
            #     model.update_all_targets()
            #     # for u_i in range(config.num_updates):
            #     #     sample = replay_buffer.sample(config.batch_size,
            #     #                                   to_gpu=config.use_gpu)
            #     #     model.update_critic(sample, logger=logger)
            #     #     model.update_policies(sample, logger=logger)
            #     #     model.update_all_targets()
            model.prep_rollouts(device='cpu')
        # ep_dones = np.mean(success, axis=0)
        # ep_steps = 1 - np.mean(steps / config.episode_length, axis=0)  # ep_mean_step
        # ep_rews = replay_buffer.get_average_rewards(
        #     config.episode_length * config.n_rollout_threads)
        # for a_i, a_ep_rew in enumerate(ep_rews):
        #     logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        # for a_i, a_ep_done in enumerate(ep_dones):
        #     logger.add_scalar('agent%i/mean_episode_dones' % a_i, a_ep_done, ep_i)
        # for a_i, a_ep_step in enumerate(ep_steps):
        #     logger.add_scalar('agent%i/mean_episode_steps' % a_i, a_ep_step, ep_i)
        total_step += np.mean(steps, axis=0)
        total_time += np.mean(time_cost, axis=0)
        max_step += np.max(steps)
        max_time += np.max(time_cost)
        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            # os.makedirs(run_dir / 'incremental', exist_ok=True)
            # model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            # model.save(run_dir / 'model.pt')

    # report per-agent means and running maxima, averaged over 100 episodes
    mean_step = total_step / (100 / config.n_rollout_threads)
    mean_time = total_time / (100 / config.n_rollout_threads)
    max_time /= 100 / config.n_rollout_threads
    max_step /= 100 / config.n_rollout_threads
    print('; '.join([
        f'{chr(65 + i)} Mean Step:{mean_step[i]}, Mean Time:{mean_time[i]}'
        for i in range(model.nagents)
    ]))
    print('Mean Max Step:{}, Mean Max Time Cost:{}'.format(max_step, max_time))
    # model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
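The per-agent bookkeeping in the loop above is easy to misread: success latches once an agent has finished, while steps and time_cost only keep accumulating for agents that are still active. A small numpy-only toy (not project code) with one rollout thread, two agents, and three timesteps makes the behaviour concrete.

# Toy illustration of the success/steps accumulation pattern used above.
import numpy as np

success = np.zeros((1, 2), dtype=bool)
steps = np.zeros((1, 2))
for dones in [np.array([[False, False]]),
              np.array([[True, False]]),
              np.array([[True, True]])]:
    success = np.logical_or(success, dones)  # latch "agent has finished"
    steps += np.logical_not(dones)           # count only while still active
print(success)  # [[ True  True]]
print(steps)    # [[1. 2.]]  -> agent 0 ran 1 step, agent 1 ran 2 steps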
def run(config):
    model_dir = Path('./models') / config["env_id"] / config["model_name"]
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config["n_rollout_threads"], run_num)
    model = AttentionSAC.init_from_env(
        env,
        tau=config["tau"],
        pi_lr=config["pi_lr"],
        q_lr=config["q_lr"],
        gamma=config["gamma"],
        pol_hidden_dim=config["pol_hidden_dim"],
        critic_hidden_dim=config["critic_hidden_dim"],
        attend_heads=config["attend_heads"],
        reward_scale=config["reward_scale"])

    # (** EDITED **) Set Replay Buffer
    # Set up the buffer from the per-agent observation/action sizes
    # (hard-coded here: 115-dim observations and 19 discrete actions per agent)
    replay_buffer = ReplayBuffer(config["buffer_length"], model.nagents,
                                 [115 for _ in range(model.nagents)],
                                 [19 for _ in range(model.nagents)])
    t = 0
    for ep_i in range(0, config["n_episodes"], config["n_rollout_threads"]):
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + config["n_rollout_threads"],
               config["n_episodes"]))
        obs = env.reset()
        model.prep_rollouts(device='cpu')
        for et_i in range(config["episode_length"]):
            print("episode : {} | step : {}".format(ep_i, et_i), end='\r')
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config["n_rollout_threads"])]
            # Reform the action list to fit the Football env:
            # Google Football expects integer action ids, not one-hot vectors
            actions_list = [[np.argmax(b) for b in a] for a in actions]
            # Step
            next_obs, rewards, dones, infos = env.step(actions_list)
            # Prevention of divergence: without this small offset the critic
            # loss blows up (NaN) and training fails
            rewards = rewards - 0.000001
            # Reform the done flags: copy the per-env flag to every agent so
            # the replay buffer sees an (n_threads, n_agents) array
            dones = (np.array([dones for _ in range(model.nagents)])).T
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config["n_rollout_threads"]
            if (len(replay_buffer) >= config["batch_size"] and
                    (t % config["steps_per_update"]) <
                    config["n_rollout_threads"]):
                if config["use_gpu"]:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config["num_updates"]):
                    sample = replay_buffer.sample(config["batch_size"],
                                                  to_gpu=config["use_gpu"])
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config["episode_length"] * config["n_rollout_threads"])
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew * config["episode_length"], ep_i)
        if ep_i % config["save_interval"] < config["n_rollout_threads"]:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')
    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
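The done-flag reshaping in the loop above is purely a layout fix: the football environment reports one flag per rollout thread, while the replay buffer expects one flag per agent per thread. A toy numpy example (2 threads, 3 agents) shows the broadcast and transpose.

# Toy illustration of the done-flag reshaping used above.
import numpy as np

nagents = 3
dones = np.array([False, True])                      # shape (n_threads,)
dones = np.array([dones for _ in range(nagents)]).T  # shape (n_threads, nagents)
print(dones.shape)  # (2, 3)
print(dones)        # [[False False False]
                    #  [ True  True  True]]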
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    run_num = 1

    numWolves = 3
    numSheep = 1
    numBlocks = 2
    numAgents = numWolves + numSheep
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheep + \
        [blockSize] * numBlocks
    sheepMaxSpeed = 1.3
    wolfMaxSpeed = 1.0
    blockMaxSpeed = None
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + \
        [sheepMaxSpeed] * numSheep + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    collisionReward = 10
    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList,
                              getPosFromAgentState, isCollision,
                              punishForOutOfBound,
                              collisionPunishment=collisionReward)
    individualRewardWolf = 0
    rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision,
                            collisionReward, individualRewardWolf)
    reshapeAction = ReshapeAction()
    costActionRatio = 0
    getActionCost = GetActionCost(costActionRatio, reshapeAction,
                                  individualCost=True)
    getWolvesAction = lambda action: [action[wolfID] for wolfID in wolvesID]
    rewardWolfWithActionCost = lambda state, action, nextState: np.array(
        rewardWolf(state, action, nextState)) - np.array(
            getActionCost(getWolvesAction(action)))
    rewardFunc = lambda state, action, nextState: \
        list(rewardWolfWithActionCost(state, action, nextState)) + \
        list(rewardSheep(state, action, nextState))

    reset = ResetMultiAgentChasing(numAgents, numBlocks)
    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID,
                                              blocksID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [observeOneAgent(agentID)(state)
                             for agentID in range(numAgents)]

    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID, entitiesMovableList)
    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList,
                                          entitiesSizeList, getCollisionForce,
                                          getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList, massList,
                                    entityMaxSpeedList, getVelFromAgentState,
                                    getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeAction,
                                       applyActionForce, applyEnvironForce,
                                       integrateState)
    isTerminal = lambda state: [False] * numAgents

    initObsForParams = observe(reset())
    obsShape = [
        initObsForParams[obsID].shape for obsID in range(len(initObsForParams))
    ]
    worldDim = 2
    actionSpace = [
        spaces.Discrete(worldDim * 2 + 1) for agentID in range(numAgents)
    ]

    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    torch.manual_seed(run_num)
    np.random.seed(run_num)
    model = AttentionSAC.init_from_save(filename=run_dir / 'model.pt')

    biteList = []
    trajListToRender = []
    for ep_i in range(0, config.n_episodes):
        state = reset()
        model.prep_rollouts(device='cpu')
        trajectory = []
        for et_i in range(config.episode_length):
            obs = observe(state)
            obs = np.array([obs])
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            torch_agent_actions = model.step(torch_obs, explore=False)
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            action = actions[0]
            nextState = transit(state, action)
            next_obs = observe(nextState)
            rewards = rewardFunc(state, action, nextState)
            done_n = isTerminal(nextState)
            done = all(done_n)
            trajectory.append((state, action, rewards, nextState))
            state = nextState

        biteNum = calcWolfTrajBiteAmount(trajectory, wolvesID, singleReward=10)
        biteList.append(biteNum)
        trajListToRender.append(list(trajectory))
        print(biteNum)

    meanTrajBite = np.mean(biteList)
    seTrajBite = np.std(biteList) / np.sqrt(len(biteList) - 1)
    print('meanTrajBite', meanTrajBite, 'seTrajBite ', seTrajBite)

    wolfColor = np.array([0.85, 0.35, 0.35])
    sheepColor = np.array([0.35, 0.85, 0.35])
    blockColor = np.array([0.25, 0.25, 0.25])
    entitiesColorList = [wolfColor] * numWolves + [sheepColor] * numSheep + \
        [blockColor] * numBlocks
    render = Render(entitiesSizeList, entitiesColorList, numAgents,
                    getPosFromAgentState)
    trajToRender = np.concatenate(trajListToRender)
    render(trajToRender)
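calcWolfTrajBiteAmount is defined elsewhere in the project and is not shown here. Purely as a hedged sketch of what such a counter could look like, the version below assumes each trajectory entry is (state, action, rewards, nextState) with a per-agent reward list, and that every wolf-sheep collision contributes exactly singleReward to a wolf's reward; the name and logic are assumptions, not the project's actual implementation.

# Hypothetical bite counter (assumption: wolf reward is singleReward per bite).
def calc_wolf_traj_bite_amount(trajectory, wolvesID, singleReward=10):
    totalWolfReward = sum(
        sum(timeStep[2][wolfID] for wolfID in wolvesID)
        for timeStep in trajectory)
    # number of collision events implied by the accumulated wolf reward
    return totalWolfReward / singleReward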
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    run_num = 1

    numWolves = 3
    numSheep = 1
    numBlocks = 2
    numAgents = numWolves + numSheep
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheep + \
        [blockSize] * numBlocks

    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    model = AttentionSAC.init_from_save(filename=run_dir / 'model.pt')

    biteList = []
    trajListToRender = []
    for ep_i in range(0, config.n_episodes):
        obs = env.reset()
        model.prep_rollouts(device='cpu')
        trajectory = []
        for et_i in range(config.episode_length):
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            torch_agent_actions = model.step(torch_obs, explore=False)
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            state = [
                np.append(agent.state.p_pos, agent.state.p_vel)
                for agent in env.agents
            ] + [
                np.append(landmark.state.p_pos, landmark.state.p_vel)
                for landmark in env.world.landmarks
            ]
            # note: the hand-built state above is immediately overwritten with
            # the raw observation of the first rollout thread
            state = obs[0]
            action = actions[0]
            reward = rewards[0]
            nextState = next_obs[0]
            trajectory.append((state, action, reward, nextState))
            obs = next_obs

        biteNum = calcWolfTrajBiteAmount(trajectory, wolvesID, singleReward=10)
        biteList.append(biteNum)
        trajListToRender = trajListToRender + trajectory
        print(biteNum)

    meanTrajBite = np.mean(biteList)
    seTrajBite = np.std(biteList) / np.sqrt(len(biteList) - 1)
    print('meanTrajBite', meanTrajBite, 'seTrajBite ', seTrajBite)

    wolfColor = np.array([0.85, 0.35, 0.35])
    sheepColor = np.array([0.35, 0.85, 0.35])
    blockColor = np.array([0.25, 0.25, 0.25])
    entitiesColorList = [wolfColor] * numWolves + [sheepColor] * numSheep + \
        [blockColor] * numBlocks
    render = Render(entitiesSizeList, entitiesColorList, numAgents,
                    getPosFromAgentState)
    trajToRender = np.concatenate(trajListToRender)
    render(trajToRender)
    env.close()
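The standard-error expression used in both evaluation scripts, np.std(biteList) / np.sqrt(len(biteList) - 1), looks unconventional but is algebraically identical to the usual sample standard error np.std(x, ddof=1) / np.sqrt(n). A quick self-contained check:

# Equivalence check for the standard-error formula used above.
import numpy as np

x = np.array([3.0, 5.0, 7.0, 9.0])
n = len(x)
se_a = np.std(x) / np.sqrt(n - 1)          # form used in the scripts (ddof=0)
se_b = np.std(x, ddof=1) / np.sqrt(n)      # textbook sample standard error
assert np.isclose(se_a, se_b)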
def run(config):
    device = torch.device(
        'cuda:' + str(config.gpu) if torch.cuda.is_available() else 'cpu')
    model_dir = Path('./runs') / config.store_result_dir
    train_loader, train_drugs, train_Y = preprocess(config.dataset, config)
    print("number of data")
    print(len(train_loader))

    for it, original_pair in enumerate(train_loader):
        if not model_dir.exists():
            run_num = 1
        else:
            exst_run_nums = [
                int(str(folder.name).split('run')[1])
                for folder in model_dir.iterdir()
                if str(folder.name).startswith('run')
            ]
            if len(exst_run_nums) == 0:
                run_num = 1
            else:
                run_num = max(exst_run_nums) + 1
        curr_run = 'run%i' % run_num
        run_dir = model_dir / curr_run
        log_dir = run_dir / 'logs'
        os.makedirs(log_dir)
        logger = SummaryWriter(str(log_dir))

        torch.manual_seed(run_num)
        np.random.seed(run_num)
        print('Run pair number ', str(it))

        Hyperparams = Args()
        BasePath = './runs/' + config.store_result_dir
        writer = SummaryWriter(BasePath + '/plots')

        original_drug_smile = train_drugs[it]
        original_target_aff = train_Y[it]
        original_drug = original_pair
        original_target = original_pair.target[0]
        print('Original target:')
        print(original_target)
        print('Original molecule:')
        print(original_drug_smile)

        model_to_explain = mol_utils.get_graphdta_dgn().to(device)
        pred_aff, drug_original_encoding, prot_original_encoding = model_to_explain(
            original_drug.to(device), seq_cat(original_target).to(device))
        atoms_ = np.unique([
            x.GetSymbol()
            for x in Chem.MolFromSmiles(original_drug_smile).GetAtoms()
        ])
        cof = [1.0, 0.05, 0.01, 0.05]
        env = make_parallel_env(original_drug_smile, original_target,
                                Hyperparams, atoms_, model_to_explain,
                                original_drug, original_target_aff, pred_aff,
                                device, cof)
        model = AttentionSAC.init_from_env(
            env,
            tau=config.tau,
            pi_lr=config.pi_lr,
            q_lr=config.q_lr,
            gamma=config.gamma,
            pol_hidden_dim=config.pol_hidden_dim,
            critic_hidden_dim=config.critic_hidden_dim,
            attend_heads=config.attend_heads,
            reward_scale=config.reward_scale)
        replay_buffer = ReplayBuffer(
            config.buffer_length, model.nagents,
            [obsp[0] for obsp in env.observation_space],
            [acsp for acsp in env.action_space])

        if not os.path.isdir(BasePath + "/counterfacts"):
            os.makedirs(BasePath + "/counterfacts")
        mol_utils.TopKCounterfactualsDTA.init(original_drug_smile, it,
                                              BasePath + "/counterfacts")

        t = 0
        episode_length = 1
        trg = trange(0, config.n_episodes, config.n_rollout_threads)
        for ep_i in trg:
            obs = env.reset()
            model.prep_rollouts(device='cpu')
            for et_i in range(episode_length):
                # rearrange observations to be per agent, and convert to torch Variable
                torch_obs = [
                    Variable(torch.Tensor(np.vstack(obs[:, i])),
                             requires_grad=False)
                    for i in range(model.nagents)
                ]
                # get actions as torch Variables
                torch_agent_actions = model.step(torch_obs, explore=True)
                # convert actions to numpy arrays
                agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
                # rearrange actions to be per environment
                actions = [[ac[i] for ac in agent_actions]
                           for i in range(config.n_rollout_threads)]
                next_obs, results, dones, action_drug, action_prot = env.step(
                    actions)
                drug_reward, loss_, gain, drug_sim, prot_sim, qed = results[0][0]
                prot_reward, loss_, gain, drug_sim, prot_sim, qed = results[0][1]

                writer.add_scalar('DTA/Reward', drug_reward, ep_i)
                writer.add_scalar('DTA/Distance', loss_, ep_i)
                writer.add_scalar('DTA/Drug Similarity', drug_sim, ep_i)
                writer.add_scalar('DTA/Drug QED', qed, ep_i)
                writer.add_scalar('DTA/Protein Similarity', prot_sim, ep_i)

                pair_reward = [drug_reward, prot_reward]
                rewards = np.array([pair_reward])
                replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
                obs = next_obs
                t += 1
                if (len(replay_buffer) >= config.batch_size and
                        (t % config.steps_per_update) < 1):
                    if config.use_gpu:
                        model.prep_training(device='gpu')
                    else:
                        model.prep_training(device='cpu')
                    for u_i in range(config.num_updates):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=config.use_gpu)
                        model.update_critic(sample, logger=logger)
                        model.update_policies(sample, logger=logger)
                        model.update_all_targets()
                    model.prep_rollouts(device='cpu')
                if np.all(dones):
                    mutate_position = [
                        i for i in range(len(original_target))
                        if original_target[i] != action_prot[i]
                    ]
                    trg.set_postfix(Reward=drug_reward,
                                    DrugSim=drug_sim,
                                    TargetSim=prot_sim,
                                    SMILES=action_drug,
                                    TargetMutatePosition=mutate_position,
                                    refresh=True)
                    mol_utils.TopKCounterfactualsDTA.insert({
                        'smiles': action_drug,
                        'protein': action_prot,
                        'drug_reward': drug_reward,
                        'protein_reward': prot_reward,
                        'loss': loss_,
                        'gain': gain,
                        'drug sim': drug_sim,
                        'drug qed': qed,
                        'prot sim': prot_sim,
                        'mutate position': mutate_position
                    })
            ep_rews = replay_buffer.get_average_rewards(episode_length * 1)
            for a_i, a_ep_rew in enumerate(ep_rews):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  a_ep_rew * episode_length, ep_i)
            if ep_i % config.save_interval < config.n_rollout_threads:
                model.prep_rollouts(device='cpu')
                os.makedirs(run_dir / 'incremental', exist_ok=True)
                model.save(run_dir / 'incremental' /
                           ('model_ep%i.pt' % (ep_i + 1)))
                model.save(run_dir / 'model.pt')
        model.save(run_dir / 'model.pt')
        env.close()
        logger.export_scalars_to_json(str(log_dir / 'summary.json'))
        logger.close()
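In the push above, the drug agent's and the protein agent's rewards are stacked into a single array so the buffer sees the usual (n_rollout_threads, n_agents) layout with one rollout thread and two agents. A tiny sanity check with illustrative values:

# Toy check of the reward layout pushed to the replay buffer above.
import numpy as np

drug_reward, prot_reward = 0.7, -0.2  # illustrative values, not from the source
rewards = np.array([[drug_reward, prot_reward]])
print(rewards.shape)  # (1, 2) -> one thread, two agents (drug, protein)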
    })
    sa_size.append((state_size, action_size))

hyperparams = {
    "tau": 0.01,  # ddpg soft update
    "pi_lr": 0.00001,
    "q_lr": 0.00005,
    "pol_hidden_dim": 256,
    "critic_hidden_dim": 256,
    "attend_heads": 8
}
model = AttentionSAC(agent_init_params=agent_init_params,
                     sa_size=sa_size,
                     tau=hyperparams["tau"],
                     pi_lr=hyperparams["pi_lr"],
                     q_lr=hyperparams["q_lr"],
                     pol_hidden_dim=hyperparams["pol_hidden_dim"],
                     critic_hidden_dim=hyperparams["critic_hidden_dim"],
                     attend_heads=hyperparams["attend_heads"])
model.init_dict = {}
replay_buffer = ReplayBuffer(buffer_length, n_agents,
                             [state_size for i in range(n_agents)],
                             [action_size for i in range(n_agents)])
print("MAX STEPS: " + str(max_steps))
print("NUM EPISODES: ", num_episodes)
print("HYPERPARAMS: ")
print(hyperparams)
start_time = time.time()