def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' % config.incremental)
    else:
        model_path = model_path / 'model.pt'

    if config.save_gifs:
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    model = AttentionSAC.init_from_save(model_path)
    env = make_env(config.env_id, discrete_action=True)
    model.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames = []
            frames.append(env.render('rgb_array')[0])
        env.render('human')
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_actions = model.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render('human')
        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames, duration=ifi)

    env.close()
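# The evaluation script above reads its settings from a `config` namespace. A typical
# command-line entry point might look like the sketch below. It is an assumption based
# only on the attributes the function accesses (env_id, model_name, run_num, incremental,
# save_gifs, fps, n_episodes, episode_length); the defaults are illustrative, not taken
# from the original source.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('env_id', help='name of the environment')
    parser.add_argument('model_name', help='model directory under ./models/<env_id>/')
    parser.add_argument('run_num', type=int, help='run number to load')
    parser.add_argument('--incremental', type=int, default=None,
                        help='load the incremental checkpoint saved at this episode')
    parser.add_argument('--save_gifs', action='store_true',
                        help='save one GIF per episode')
    parser.add_argument('--fps', type=int, default=30,
                        help='rendering frames per second')
    parser.add_argument('--n_episodes', type=int, default=10)
    parser.add_argument('--episode_length', type=int, default=25)
    config = parser.parse_args()

    run(config)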
def run(model_name: str):
    model_path, run_num, run_dir, log_dir = run_setup(model_name, get_latest_model=True)
    if model_path is None:
        print("Couldn't find model!")
        return

    model = AttentionSAC.init_from_save(model_path)
    model.prep_rollouts(device='cpu')

    run_env: HaliteRunHelper = HaliteRunHelper()
    run_env.simulate(lambda o: model.step(o, explore=True), agent_count=2)
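# `run_setup` is not defined anywhere in this collection. Judging from how the training
# scripts below number their runs (scanning ./models/<model_name>/run* for the latest or
# the next free index), a minimal sketch could look like the following. This is an
# assumption for illustration only; the real helper's behaviour may differ.
from pathlib import Path


def run_setup(model_name, get_latest_model=False):
    model_dir = Path('./models') / model_name
    existing = ([int(str(folder.name).split('run')[1])
                 for folder in model_dir.iterdir()
                 if str(folder.name).startswith('run')]
                if model_dir.exists() else [])
    if get_latest_model:
        run_num = max(existing) if existing else 1      # reuse the newest run
    else:
        run_num = max(existing) + 1 if existing else 1  # start a fresh run
    run_dir = model_dir / ('run%i' % run_num)
    log_dir = run_dir / 'logs'
    model_path = run_dir / 'model.pt'
    if not model_path.exists():
        model_path = None  # callers check for this and abort or build a new model
    return model_path, run_num, run_dir, log_dir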
def run(config):
    env = football_env.create_environment(
        env_name=config["academy_scenario"],
        rewards=config["scoring"],
        render=config["render_mode"],
        number_of_left_players_agent_controls=config["num_to_control"],
        representation='raw')

    model = AttentionSAC.init_from_save("./models/football/MAAC3/run2/model.pt", True)

    # (** EDITED **) Set up the replay buffer
    # (buffer shapes are obtained by iterating over env.action_space and env.observation_space)

    for ep_i in range(0, config["n_episodes"], config["n_rollout_threads"]):
        obs = env.reset()
        obs = make_state(obs)
        model.prep_rollouts(device='cpu')

        for et_i in range(config["episode_length"]):
            print("episode : {} | step : {}".format(ep_i, et_i), end='\r')
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config["n_rollout_threads"])]

            # Reformat the action list to fit the Football env:
            # Google Football expects integer actions, not one-hot encoded action lists
            actions_list = [[np.argmax(b) for b in a] for a in actions]

            # Step
            next_obs, rewards, dones, infos = env.step(actions_list)
            next_obs = make_state(next_obs)

            # Prevention of divergence:
            # without this small offset, training diverges (NaN)
            rewards = rewards - 0.000001

            # Reformat the done flag list to fit the replay buffer

            obs = next_obs

    env.close()
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' % config.incremental)
    else:
        model_path = model_path / 'model.pt'

    maac = AttentionSAC.init_from_save(model_path)
    env = MultiAgentEnv(config.env_id, config.n_controlled_lagents,
                        config.n_controlled_ragents, config.reward_type,
                        config.render)
    maac.prep_rollouts(device='cpu')

    goal_diff = 0
    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        for t_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1),
                                  requires_grad=False)
                         for i in range(maac.nagents)]
            # get actions as torch Variables
            torch_actions = maac.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)
            if all(dones):
                goal_diff += np.sum(rewards) / (config.n_controlled_lagents +
                                                config.n_controlled_ragents)
                break

    goal_diff /= config.n_episodes
    print(goal_diff)
    env.close()
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1])
                         for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(1804)
    np.random.seed(1804)

    # initialize E parallel environments with N agents
    env = make_parallel_env(config.env_id, config.n_rollout_threads, 1804)
    model = AttentionSAC.init_from_save('model.pt')
    # model = AttentionSAC.init_from_env(env,
    #                                    tau=config.tau,
    #                                    pi_lr=config.pi_lr,
    #                                    q_lr=config.q_lr,
    #                                    gamma=config.gamma,
    #                                    pol_hidden_dim=config.pol_hidden_dim,
    #                                    critic_hidden_dim=config.critic_hidden_dim,
    #                                    attend_heads=config.attend_heads,
    #                                    reward_scale=config.reward_scale)

    # initialize replay buffer D
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])

    # T_update
    t = 0
    max_step = 0
    max_time = 0
    total_step = np.zeros(model.nagents)
    total_time = np.zeros(model.nagents)
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')
        success = np.zeros((config.n_rollout_threads, model.nagents), dtype=bool)
        steps = np.zeros((config.n_rollout_threads, model.nagents))
        time_cost = np.zeros((config.n_rollout_threads, model.nagents))

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # time.clock() was removed in Python 3.8; perf_counter() measures the same thing here
            start = time.perf_counter()
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=False)
            end = time.perf_counter()
            per_time_cost = end - start
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)

            # calculate steps
            success = np.logical_or(success, dones)
            # steps += dones
            steps += np.logical_not(dones)
            time_cost += np.logical_not(dones) * per_time_cost

            # store transitions for all env in replay buffer
            # replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            # T_update = T_update + E
            t += config.n_rollout_threads
            # if (len(replay_buffer) >= max(config.pi_batch_size, config.q_batch_size) and
            #         (t % config.steps_per_update) < config.n_rollout_threads):
            #     if config.use_gpu:
            #         model.prep_training(device='gpu')
            #     else:
            #         model.prep_training(device='cpu')
            #     for u_i in range(config.num_critic_updates):
            #         sample = replay_buffer.sample(config.q_batch_size,
            #                                       to_gpu=config.use_gpu)
            #         model.update_critic(sample, logger=logger)
            #     for u_i in range(config.num_pol_updates):
            #         sample = replay_buffer.sample(config.pi_batch_size,
            #                                       to_gpu=config.use_gpu)
            #         model.update_policies(sample, logger=logger)
            #     model.update_all_targets()
            #     # for u_i in range(config.num_updates):
            #     #     sample = replay_buffer.sample(config.batch_size,
            #     #                                   to_gpu=config.use_gpu)
            #     #     model.update_critic(sample, logger=logger)
            #     #     model.update_policies(sample, logger=logger)
            #     # model.update_all_targets()
            model.prep_rollouts(device='cpu')

        # ep_dones = np.mean(success, axis=0)
        # ep_steps = 1 - np.mean(steps / config.episode_length, axis=0)  # ep_mean_step
        # ep_rews = replay_buffer.get_average_rewards(
        #     config.episode_length * config.n_rollout_threads)
        # for a_i, a_ep_rew in enumerate(ep_rews):
        #     logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        # for a_i, a_ep_done in enumerate(ep_dones):
        #     logger.add_scalar('agent%i/mean_episode_dones' % a_i, a_ep_done, ep_i)
        # for a_i, a_ep_step in enumerate(ep_steps):
        #     logger.add_scalar('agent%i/mean_episode_steps' % a_i, a_ep_step, ep_i)

        total_step += np.mean(steps, axis=0)
        total_time += np.mean(time_cost, axis=0)
        max_step += np.max(steps)
        max_time += np.max(time_cost)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            # os.makedirs(run_dir / 'incremental', exist_ok=True)
            # model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            # model.save(run_dir / 'model.pt')

    mean_step = total_step / (100 / config.n_rollout_threads)
    mean_time = total_time / (100 / config.n_rollout_threads)
    max_time /= 100 / config.n_rollout_threads
    max_step /= 100 / config.n_rollout_threads
    print('; '.join([
        f'{chr(65 + i)} Mean Step:{mean_step[i]}, Mean Time:{mean_time[i]}'
        for i in range(model.nagents)
    ]))
    print('Mean Max Step:{}, Mean Max Time Cost:{}'.format(max_step, max_time))
    # model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1])
                         for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    # model = AttentionSAC.init_from_env(env,
    #                                    tau=config.tau,
    #                                    pi_lr=config.pi_lr,
    #                                    q_lr=config.q_lr,
    #                                    gamma=config.gamma,
    #                                    pol_hidden_dim=config.pol_hidden_dim,
    #                                    critic_hidden_dim=config.critic_hidden_dim,
    #                                    attend_heads=config.attend_heads,
    #                                    reward_scale=config.reward_scale)

    # Model used to test with adversarial agent
    # model = AttentionSAC.init_from_save("C:\\Users\\HP\\Desktop\\NTU\\FYP\\FYP Code\\MAAC\\Output\\run140\\model.pt")
    # print("Model instantiated")

    # Model used to test without adversarial agent
    model = AttentionSAC.init_from_save("C:\\Users\\HP\\Desktop\\NTU\\FYP\\FYP Code\\MAAC\\Output\\run148\\model.pt")
    print("Model instantiated")

    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    row_list = []
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            # print(rewards)
            # print(dones[0])
            # env.render('human')
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    # print(sample)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
            if dones[0][0]:
                print("Breaking the episode at timestep", et_i)
                break
        et_i += 1
        row_list.append((ep_i + 1, et_i))

        ep_rews = replay_buffer.get_average_rewards(et_i * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew * et_i, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    with open('Timesteps_vs_Episodes.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Ep No", "Number of Timesteps"])
        for row in row_list:
            writer.writerow(row)

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    run_num = 1

    numWolves = 3
    numSheep = 1
    numBlocks = 2
    numAgents = numWolves + numSheep
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = ([wolfSize] * numWolves + [sheepSize] * numSheep +
                        [blockSize] * numBlocks)

    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    model = AttentionSAC.init_from_save(filename=run_dir / 'model.pt')

    biteList = []
    trajListToRender = []
    for ep_i in range(0, config.n_episodes):
        obs = env.reset()
        model.prep_rollouts(device='cpu')
        trajectory = []
        for et_i in range(config.episode_length):  # 25
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            torch_agent_actions = model.step(torch_obs, explore=False)
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)

            # note: this assignment is immediately overwritten by obs[0] below
            state = ([np.append(agent.state.p_pos, agent.state.p_vel)
                      for agent in env.agents] +
                     [np.append(landmark.state.p_pos, landmark.state.p_vel)
                      for landmark in env.world.landmarks])
            state = obs[0]
            action = actions[0]
            reward = rewards[0]
            nextState = next_obs[0]
            trajectory.append((state, action, reward, nextState))

            obs = next_obs

        biteNum = calcWolfTrajBiteAmount(trajectory, wolvesID, singleReward=10)
        biteList.append(biteNum)
        trajListToRender = trajListToRender + trajectory
        print(biteNum)

    meanTrajBite = np.mean(biteList)
    seTrajBite = np.std(biteList) / np.sqrt(len(biteList) - 1)
    print('meanTrajBite', meanTrajBite, 'seTrajBite ', seTrajBite)

    wolfColor = np.array([0.85, 0.35, 0.35])
    sheepColor = np.array([0.35, 0.85, 0.35])
    blockColor = np.array([0.25, 0.25, 0.25])
    entitiesColorList = ([wolfColor] * numWolves + [sheepColor] * numSheep +
                         [blockColor] * numBlocks)
    render = Render(entitiesSizeList, entitiesColorList, numAgents,
                    getPosFromAgentState)
    trajToRender = np.concatenate(trajListToRender)
    render(trajToRender)
    env.close()
def run(halite_env: BaseEnv, load_latest: bool = False):
    config = halite_env.config
    model_path, run_num, run_dir, log_dir = run_setup(config.model_name,
                                                      get_latest_model=load_latest)
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)

    # Build MAAC model
    if model_path is None:
        model = AttentionSAC(halite_env.agent_type_topologies,
                             tau=config.tau,
                             pi_lr=config.pi_lr,
                             q_lr=config.q_lr,
                             gamma=config.gamma,
                             pol_hidden_dim=config.pol_hidden_dim,
                             critic_hidden_dim=config.critic_hidden_dim,
                             attend_heads=config.attend_heads,
                             reward_scale=config.reward_scale)
    else:
        model = AttentionSAC.init_from_save(model_path, load_critic=True)

    # Build replay buffer
    replay_buffer = ReplayBuffer(config.buffer_length)

    prev_time = time.perf_counter()
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        curr_time = time.perf_counter()
        print("Episodes %i-%i of %i (%is)" % (ep_i + 1,
                                              ep_i + 1 + config.n_rollout_threads,
                                              config.n_episodes,
                                              (curr_time - prev_time)))
        model.prep_rollouts(device='cpu')

        game_reward = halite_env.simulate(lambda o: model.step(o, explore=True),
                                          replay_buffer)
        t += config.n_rollout_threads

        if (replay_buffer.length() >= config.batch_size and
                (t % config.games_per_update) < config.n_rollout_threads):
            print("Training")
            if config.use_gpu:
                model.prep_training(device='gpu')
            else:
                model.prep_training(device='cpu')
            for u_i in range(config.num_updates):
                sample: List[Dict[AgentKey, AgentReplayFrame]] = \
                    replay_buffer.sample(config.batch_size)
                # print("Original sample size", len(sample))
                # print("Preprocessing to batch structure")
                sample: Dict[AgentKey, BatchedAgentReplayFrame] = \
                    preprocess_to_batch(sample, to_gpu=config.use_gpu)
                # print("Filtered sample size", len(sample))
                # if len(sample) < 5:
                #     print("Sample size keys:", sample.keys())
                # print("Updating model critic")
                model.update_critic(sample, logger=logger)
                # print("Updating model policies")
                model.update_policies(sample, logger=logger)
                model.update_all_targets()
            model.prep_rollouts(device='cpu')

        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for k, v in ep_rews.items():
            logger.add_scalar('agent%s/mean_episode_rewards' % str(k), v, ep_i)
        logger.add_scalar("global_env_rewards", game_reward, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            print("Saving")
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')
            print("run_dir", run_dir)

        prev_time = curr_time

    model.save(run_dir / 'model.pt')
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def test(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1])
                         for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        # runs the newest
        run_num = max(exst_run_nums)
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run

    # Initialization of evaluation metrics
    collisions = [0]
    success_nums = [0]
    ccr_activates = [0]
    final_ep_rewards = []  # sum of rewards for training curve
    final_ep_collisions = []
    final_ep_activates = []
    final_ep_success_nums = []

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_env(config.env_id, discrete_action=True)
    env.seed(run_num)
    np.random.seed(run_num)
    model = AttentionSAC.init_from_save(run_dir / 'model.pt', True)
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0

    # remove all tensorboard methods, replace with print and pickle
    for ep_i in range(0, config.n_episodes):
        obs = np.expand_dims(np.array(env.reset()), 0)
        model.prep_rollouts(device='cpu')
        t_start = time.time()

        prev_obs = None
        act_n_t_minus_1 = None
        for et_i in range(config.episode_length):
            if config.CCR:
                if act_n_t_minus_1:
                    target_obs_n, _, _, _ = env.oracle_step(act_n_t_minus_1[0])
                    target_obs_n = np.expand_dims(np.array(target_obs_n), 0)
                    diff_state = obs[:, :, :4] - target_obs_n[:, :, :4]  # 1x4x4
                    if config.env_id == 'wall':
                        diff_obs = obs[:, :, -(model.nagents + 8 + 1)]
                    elif config.env_id == 'turbulence':
                        diff_obs = obs[:, :, -(model.nagents + 2 + 1)]
                    else:
                        assert False
                    emerg_n = np.sum(diff_state ** 2, axis=-1) + diff_obs  # 1x4
                    env.oracle_update()

                    # obs: 1x4x20
                    # emerg_n: 1x4
                    for agent_i in range(model.nagents):
                        for agent_j in range(model.nagents):
                            obs[:, agent_i, -agent_j] = emerg_n[:, agent_j]

            # collect experience
            if prev_obs is not None:
                replay_buffer.push(prev_obs, agent_actions, rewards, obs, dones)
            # print(obs)
            # convert observation to torch Variable
            torch_obs = []
            for i in range(model.nagents):
                torch_obs.append(Variable(torch.Tensor(obs[:, i]),
                                          requires_grad=False))
            # print(torch_obs)
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[0] for ac in agent_actions]]
            # actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions[0])
            next_obs = np.expand_dims(np.array(next_obs), 0)
            rewards = np.expand_dims(np.array(rewards), 0)
            dones = np.expand_dims(np.array(dones), 0)
            infos = np.expand_dims(np.array(infos), 0)

            if config.CCR:
                act_n_t_minus_1 = actions

            prev_obs = obs
            obs = next_obs
            t += 1

            # for displaying learned policies
            if config.display:
                time.sleep(0.1)
                env.render()
                continue

    env.close()
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    run_num = 1

    numWolves = 3
    numSheep = 1
    numBlocks = 2
    numAgents = numWolves + numSheep
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = ([wolfSize] * numWolves + [sheepSize] * numSheep +
                        [blockSize] * numBlocks)

    sheepMaxSpeed = 1.3
    wolfMaxSpeed = 1.0
    blockMaxSpeed = None
    entityMaxSpeedList = ([wolfMaxSpeed] * numWolves + [sheepMaxSpeed] * numSheep +
                          [blockMaxSpeed] * numBlocks)
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    collisionReward = 10
    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList,
                              getPosFromAgentState, isCollision,
                              punishForOutOfBound,
                              collisionPunishment=collisionReward)

    individualRewardWolf = 0
    rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision,
                            collisionReward, individualRewardWolf)

    reshapeAction = ReshapeAction()
    costActionRatio = 0
    getActionCost = GetActionCost(costActionRatio, reshapeAction,
                                  individualCost=True)
    getWolvesAction = lambda action: [action[wolfID] for wolfID in wolvesID]
    rewardWolfWithActionCost = lambda state, action, nextState: (
        np.array(rewardWolf(state, action, nextState)) -
        np.array(getActionCost(getWolvesAction(action))))

    rewardFunc = lambda state, action, nextState: (
        list(rewardWolfWithActionCost(state, action, nextState)) +
        list(rewardSheep(state, action, nextState)))

    reset = ResetMultiAgentChasing(numAgents, numBlocks)
    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID,
                                              blocksID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [observeOneAgent(agentID)(state)
                             for agentID in range(numAgents)]

    reshapeAction = ReshapeAction()
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID, entitiesMovableList)
    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList,
                                          entitiesSizeList, getCollisionForce,
                                          getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList, massList,
                                    entityMaxSpeedList, getVelFromAgentState,
                                    getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeAction,
                                       applyActionForce, applyEnvironForce,
                                       integrateState)
    isTerminal = lambda state: [False] * numAgents

    initObsForParams = observe(reset())
    obsShape = [initObsForParams[obsID].shape
                for obsID in range(len(initObsForParams))]
    worldDim = 2
    actionSpace = [spaces.Discrete(worldDim * 2 + 1)
                   for agentID in range(numAgents)]

    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    model = AttentionSAC.init_from_save(filename=run_dir / 'model.pt')

    biteList = []
    trajListToRender = []
    for ep_i in range(0, config.n_episodes):
        state = reset()
        model.prep_rollouts(device='cpu')
        trajectory = []
        for et_i in range(config.episode_length):
            obs = observe(state)
            obs = np.array([obs])
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            torch_agent_actions = model.step(torch_obs, explore=False)
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            action = actions[0]
            nextState = transit(state, action)
            next_obs = observe(nextState)
            rewards = rewardFunc(state, action, nextState)
            done_n = isTerminal(nextState)
            done = all(done_n)
            trajectory.append((state, action, rewards, nextState))
            state = nextState

        biteNum = calcWolfTrajBiteAmount(trajectory, wolvesID, singleReward=10)
        biteList.append(biteNum)
        trajListToRender.append(list(trajectory))
        print(biteNum)

    meanTrajBite = np.mean(biteList)
    seTrajBite = np.std(biteList) / np.sqrt(len(biteList) - 1)
    print('meanTrajBite', meanTrajBite, 'seTrajBite ', seTrajBite)

    wolfColor = np.array([0.85, 0.35, 0.35])
    sheepColor = np.array([0.35, 0.85, 0.35])
    blockColor = np.array([0.25, 0.25, 0.25])
    entitiesColorList = ([wolfColor] * numWolves + [sheepColor] * numSheep +
                         [blockColor] * numBlocks)
    render = Render(entitiesSizeList, entitiesColorList, numAgents,
                    getPosFromAgentState)
    trajToRender = np.concatenate(trajListToRender)
    render(trajToRender)