def create_env(self, env_kwargs):
    def thunk():
        # Imported for its side effects (e.g. environment registration) inside
        # each worker process.
        import experiments.test_lstm_a2c
        return RewardCollector(gym.make(**env_kwargs))

    # Parallel training envs in subprocesses; a single synchronous copy for validation.
    env = AsyncVectorEnv([thunk] * self.num_processes)
    self.validation_env = SyncVectorEnv([thunk])
    return env
def test_config(n_envs, base_env, use_torch, use_logger, return_info):
    # `self`, `envs`, `ch`, and NUM_STEPS are expected to come from the enclosing
    # scope (this helper is presumably nested inside a TestCase method).
    config = 'n_envs' + str(n_envs) + '-base_env' + str(base_env) \
             + '-torch' + str(use_torch) + '-logger' + str(use_logger) \
             + '-info' + str(return_info)
    if isinstance(base_env, str):
        env = vec_env = gym.vector.make(base_env, num_envs=n_envs)
    else:
        def make_env():
            env = base_env()
            return env

        env_fns = [make_env for _ in range(n_envs)]
        env = vec_env = AsyncVectorEnv(env_fns)
    if use_logger:
        env = envs.Logger(env, interval=5, logger=self.logger)
    if use_torch:
        env = envs.Torch(env)
        policy = lambda x: ch.totensor(vec_env.action_space.sample())
    else:
        policy = lambda x: vec_env.action_space.sample()
    if return_info:
        agent = lambda x: (policy(x), {'policy': policy(x)[0]})
    else:
        agent = policy

    # Gather experience
    env = envs.Runner(env)
    replay = env.run(agent, steps=NUM_STEPS)

    # Pre-compute some shapes
    shape = (NUM_STEPS, n_envs)
    state_shape = vec_env.observation_space.sample()[0]
    if isinstance(state_shape, (int, float)):
        state_shape = tuple()
    else:
        state_shape = state_shape.shape
    action_shape = vec_env.action_space.sample()[0]
    if isinstance(action_shape, (int, float)):
        action_shape = (1, )
    else:
        action_shape = action_shape.shape
    done_shape = tuple()

    # Check shapes
    states = replay.state()
    self.assertEqual(states.shape, shape + state_shape, config)
    actions = replay.action()
    self.assertEqual(actions.shape, shape + action_shape, config)
    dones = replay.done()
    self.assertEqual(dones.shape, shape + done_shape, config)
    if return_info:
        policies = replay.policy()
        self.assertEqual(policies.shape, (NUM_STEPS, ) + action_shape, config)
def create_unreal_env(num_processes, kwargs):
    def thunk(env_kwargs):
        # Build one environment and apply the preprocessing wrappers.
        env = gym.make(**env_kwargs)
        env = RewardCollector(env)
        env = TransposeImage(env)
        env = ScaledFloatFrame(env)
        env = UnrealEnvBaseWrapper(env)
        return env

    # Parallel training envs plus a single synchronous validation env.
    return (AsyncVectorEnv([lambda: thunk(kwargs) for _ in range(num_processes)]),
            SyncVectorEnv([lambda: thunk(kwargs)]))
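# Hedged usage sketch (not from the original source): the process count and
# gym.make kwargs below are assumptions chosen only to illustrate the call;
# create_unreal_env returns (parallel training envs, single validation env).
if __name__ == '__main__':
    train_envs, validation_env = create_unreal_env(
        num_processes=4,
        kwargs=dict(id='BreakoutNoFrameskip-v4'),  # hypothetical environment id
    )
    observations = train_envs.reset()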
def create_env(self, env):
    class W(gym.ObservationWrapper):
        # Cast observations to float32.
        def observation(self, o):
            return o.astype(np.float32)

    env_kwargs = env

    def _thunk():
        env = gym.make(**env_kwargs)
        env = RewardCollector(env)
        # Scale rewards down by a factor of 100.
        env = gym.wrappers.TransformReward(env, lambda r: 0.01 * r)
        env = W(env)
        return env

    self.validation_environment = SyncVectorEnv([_thunk])
    return AsyncVectorEnv([_thunk for _ in range(self.num_processes)])
    def _init():
        env = gym.make(
            env_id,
            seed=seed + rank,
            effective_max_num_players=effective_max_num_players,
            init_num_players=effective_max_num_players,
            with_shuffle=with_shuffle,
            gnn_input=gnn_input
        )
        return env
    return _init


env = AsyncVectorEnv(
    [make_env('Adhoc-Foraging-8x8-3f-v0', i, args['seed'],
              num_players_train, False, True)
     for i in range(args['num_envs'])]
)

# Save init agent model parameters.
save_dirs = os.path.join(directory, 'params_0')
agent.save_parameters(save_dirs)

# Evaluate initial model performance in training environment
avgs = []
num_dones, per_worker_rew = [0] * args['num_envs'], [0] * args['num_envs']
agent.reset()
env_eval = AsyncVectorEnv(
    [make_env('Adhoc-Foraging-8x8-3f-v0', i, args['eval_init_seed'],
              num_players_train, False, True)
     for i in range(args['num_envs'])]
)
             implicit_max_player_num=3, with_shuffling=False):
    def _init():
        env = gym.make(env_id,
                       seed=seed + rank,
                       num_players=num_players,
                       close_penalty=close_penalty,
                       implicit_max_player_num=implicit_max_player_num,
                       with_shuffling=with_shuffling)
        return env
    return _init


num_players = args['num_players']
env = AsyncVectorEnv([
    make_env('Adhoc-wolfpack-v5', i, num_players, args['seed'], args['close_penalty'])
    for i in range(args['num_envs'])
])

# Save initial model parameters.
save_dirs = os.path.join(directory, 'params_0')
agent.save_parameters(save_dirs)

# Evaluate initial model performance in training environment
avgs = []
for ep_val_num in range(args['eval_eps']):
    num_players = args['num_players']
    agent.reset()
    steps = 0
    avg_total_rewards = 0.0
    env_eval = AsyncVectorEnv([
        make_env('Adhoc-wolfpack-v5', i, num_players, 2000,
def main():
    # One worker environment per CPU core available to this process.
    n_envs = len(os.sched_getaffinity(0))
    factory = FallingEnvFactory()
    # factory = HalfCheetahEnvFactory()
    # factory = HumanoidFallingEnvFactory()
    env: Env = factory.make_env()
    envs: VectorEnv = AsyncVectorEnv([factory.make_env for _ in range(n_envs)])
    env_container = EnvContainer(env, envs)

    state_dim, = env.observation_space.shape
    action_dim, = env.action_space.shape

    relu = nn.ReLU()
    tanh = nn.Tanh()
    identity = nn.Identity()
    actor = ProbMLPConstantLogStd(state_dim, action_dim, [256, 256], relu, tanh, -1.0)
    critic = MultiLayerPerceptron(state_dim, 1, [256, 256], relu, identity)

    # Fit a state scaler on observations from a short random-action rollout.
    scaler_ = StandardScaler()
    print("Fit scaler")
    env.reset()
    state_seq = []
    for _ in tqdm(range(512)):
        action = env.action_space.sample()
        state, _, done, _ = env.step(action)
        state_seq.append(state)
        if done:
            env.reset()
    state_seq = np.stack(state_seq)
    scaler_.fit(state_seq)
    scaler = ScalerNet(scaler_)

    module_dict = ModuleDict()
    module_dict.set(ModuleKey.actor, actor)
    module_dict.set(ModuleKey.scaler, scaler)
    module_dict.set(ModuleKey.critic, critic)

    action_getter: ActionGetter = ActionGetterModule(actor, scaler)
    sample_collector: SampleCollector = SampleCollectorV0(env_container, action_getter, 2048, 1)

    # Critic pipeline: regress value predictions onto the observed cumulative rewards.
    mse_loss = nn.MSELoss()
    critic_tensor_inserter: TensorInserter = \
        TensorInserterTensorize(ArrayKey.states, TensorKey.states_tensor) + \
        TensorInserterTensorize(ArrayKey.log_probs, TensorKey.log_probs_tensor) + \
        TensorInserterTensorize(ArrayKey.cumulative_rewards, TensorKey.cumulative_rewards_tensor) + \
        TensorInserterForward(TensorKey.states_tensor, ModuleKey.scaler, TensorKey.states_tensor) + \
        TensorInserterForward(TensorKey.states_tensor, ModuleKey.critic,
                              TensorKey.cumulative_reward_predictions_tensor)
    critic_loss_calculator: LossCalculator = \
        LossCalculatorInputTarget(TensorKey.cumulative_reward_predictions_tensor,
                                  TensorKey.cumulative_rewards_tensor,
                                  mse_loss)

    # Actor pipeline: advantages = cumulative rewards - value predictions,
    # then the PPO surrogate from new vs. old log-probabilities.
    actor_tensor_inserter: TensorInserter = \
        TensorInserterTensorize(ArrayKey.states, TensorKey.states_tensor) + \
        TensorInserterTensorize(ArrayKey.actions, TensorKey.actions_tensor) + \
        TensorInserterTensorize(ArrayKey.log_probs, TensorKey.log_probs_tensor) + \
        TensorInserterTensorize(ArrayKey.cumulative_rewards, TensorKey.cumulative_rewards_tensor) + \
        TensorInserterForward(TensorKey.states_tensor, ModuleKey.scaler, TensorKey.states_tensor) + \
        TensorInserterForward(TensorKey.states_tensor, ModuleKey.critic,
                              TensorKey.cumulative_reward_predictions_tensor) + \
        TensorInserterLambda([TensorKey.cumulative_rewards_tensor,
                              TensorKey.cumulative_reward_predictions_tensor],
                             lambda x, y: x - y, TensorKey.advantages_tensor) + \
        TensorInserterModuleLambda(ModuleKey.actor,
                                   [TensorKey.states_tensor, TensorKey.actions_tensor],
                                   lambda actor, state, action: actor.get_log_prob(state, action),
                                   TensorKey.new_log_probs_tensor) + \
        TensorInserterLambda([TensorKey.new_log_probs_tensor, TensorKey.log_probs_tensor,
                              TensorKey.advantages_tensor],
                             get_ppo_surrogate_tensor, TensorKey.ppo_surrogates_tensor)
    actor_loss_calculator: LossCalculator = \
        LossCalculatorLambda([TensorKey.ppo_surrogates_tensor], lambda x: -torch.mean(x))

    actor_optimizer = RAdam(params=actor.parameters(), lr=3e-4)
    actor_updater: ModuleUpdater = ModuleUpdaterOptimizer(actor_optimizer)
    critic_optimizer = RAdam(params=critic.parameters(), lr=3e-4)
    critic_updater: ModuleUpdater = ModuleUpdaterOptimizer(critic_optimizer)

    actor_trainee = Trainee([actor], actor_updater, actor_tensor_inserter, actor_loss_calculator, 10)
    critic_trainee = Trainee([critic], critic_updater, critic_tensor_inserter,
                             critic_loss_calculator, 10)

    trainer = RLTrainer(sample_collector, [critic_trainee, actor_trainee], 100000, 128)
    trainer.train(module_dict)
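# Hedged usage note: as excerpted, the script above defines main() without an
# entry point; a conventional guard (an assumption, not shown in the original)
# would be:
if __name__ == '__main__':
    main()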
def make_env(args, rank, num_agents=5, active_agents=3, freeze_multiplier=80,
             team_mode="guard", reward_scheme="sparse", seed=100):
    def _init():
        return make_open_env(
            args, args['num_env_steps'], num_agents, active_agents, freeze_multiplier,
            team_mode=team_mode, reward_scheme=reward_scheme,
            seed=int(seed + 1000 * rank)
        )
    return _init


num_players_train = args['num_players_train']
num_players_test = args['num_players_test']

env = AsyncVectorEnv([
    make_env(args, i, active_agents=num_players_train, seed=args['seed'],
             reward_scheme=args["reward_type"])
    for i in range(args['num_envs'])
])
env_eval = AsyncVectorEnv([
    make_env(args, i, active_agents=num_players_train, seed=args['eval_init_seed'],
             reward_scheme=args["reward_type"])
    for i in range(args['num_envs'])
])
env_eval2 = AsyncVectorEnv([
    make_env(args, i, active_agents=num_players_test, seed=args['eval_init_seed'],
             reward_scheme=args["reward_type"])
    for i in range(args['num_envs'])
])

today = date.today()
d1 = today.strftime("%d_%m_%Y")
from gym.vector import AsyncVectorEnv

from agents.dqn.dqn import DQN  # alternative agent; unused in this run
from agents.ppo.ppo import PPO
from utils.runner import train, make_env

if __name__ == "__main__":
    env_fns = [make_env() for _ in range(8)]
    env = AsyncVectorEnv(env_fns)
    agent = PPO(env.single_observation_space, env.single_action_space)
    returns = train(agent, env, 3000000, 500)
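# Hedged sketch: make_env above is imported from utils.runner, whose body is not
# shown in this snippet. A minimal, hypothetical stand-in (the name, env id, and
# wrapper-free body are assumptions, not the project's actual helper) illustrates
# the usual pattern: AsyncVectorEnv takes a list of zero-argument callables, one
# per worker, each of which builds a fresh environment.
import gym

def _make_env_sketch(env_id='CartPole-v1'):
    def _thunk():
        return gym.make(env_id)
    return _thunk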
def test_config(n_envs, n_episodes, base_env, use_torch, use_logger, return_info, retry):
    config = 'n_envs' + str(n_envs) + '-n_eps' + str(n_episodes) \
             + '-base_env' + str(base_env) \
             + '-torch' + str(use_torch) + '-logger' + str(use_logger) \
             + '-info' + str(return_info)
    if isinstance(base_env, str):
        env = vec_env = gym.vector.make(base_env, num_envs=n_envs)
    else:
        def make_env():
            env = base_env()
            return env

        env_fns = [make_env for _ in range(n_envs)]
        env = vec_env = AsyncVectorEnv(env_fns)
    if use_logger:
        env = envs.Logger(env, interval=5, logger=self.logger)
    if use_torch:
        env = envs.Torch(env)
        policy = lambda x: ch.totensor(vec_env.action_space.sample())
    else:
        policy = lambda x: vec_env.action_space.sample()
    if return_info:
        agent = lambda x: (policy(x), {
            'policy': policy(x)[0],
            'act': policy(x)
        })
    else:
        agent = policy

    # Gather experience
    env = envs.Runner(env)
    replay = env.run(agent, episodes=n_episodes)
    if retry:
        replay = env.run(agent, episodes=n_episodes)

    # Pre-compute some shapes
    shape = (len(replay), )
    state_shape = vec_env.observation_space.sample().shape[1:]
    action_shape = np.array(vec_env.action_space.sample())[0].shape
    if len(action_shape) == 0:
        action_shape = (1, )
    done_shape = (1, )

    # Check shapes
    states = replay.state()
    self.assertEqual(states.shape, shape + state_shape, config)
    actions = replay.action()
    self.assertEqual(actions.shape, shape + action_shape, config)
    dones = replay.done()
    self.assertEqual(dones.shape, shape + done_shape, config)
    if return_info:
        policies = replay.policy()
        self.assertEqual(policies.shape, shape + action_shape, config)
        acts = replay.act()
        self.assertEqual(acts.shape, (len(replay), n_envs) + action_shape, config)
            num_agents, active_agents, freeze_multiplier,
            team_mode=team_mode, reward_scheme=reward_scheme,
            seed=int(seed + 1000 * rank))
    return _init


num_players_train = args['num_players_train']
num_players_test = args['num_players_test']

env = AsyncVectorEnv([
    make_env(args, i, active_agents=num_players_train, seed=args['seed'],
             reward_scheme="sparse")
    for i in range(8)
])

args["device"] = "cpu"
writer = None

for idx in range(101):
    agent = MRFAgent(args=args, writer=writer, added_u_dim=0)
    load_dir = args['loading_dir'] + str(idx)
    agent.load_parameters(load_dir)
    obs_list = []
    agent.reset()