def test_make_gym_env(self):
    """Sanity-check make_gym_env for one discrete and one continuous task."""
    # --- CartPole: discrete control ---
    cartpole = make_gym_env(env_id='CartPole-v1', seed=0, monitor=False)
    assert isinstance(cartpole, Env)
    # wrapped into lagom's own Env type, no longer a raw gym.Env
    assert not isinstance(cartpole, gym.Env)
    assert isinstance(cartpole, Wrapper)
    assert isinstance(cartpole.observation_space, Box)
    assert isinstance(cartpole.action_space, Discrete)

    spec = EnvSpec(cartpole)
    assert spec.control_type == 'Discrete'
    assert spec.T == 500
    assert spec.max_episode_reward == 475.0
    assert spec.reward_range == (-float('inf'), float('inf'))
    assert not spec.is_vec_env
    # num_env is only defined for vectorized environments
    with pytest.raises(TypeError):
        spec.num_env
    assert cartpole.reset().shape == (4,)
    assert len(cartpole.step(cartpole.action_space.sample())) == 4
    del cartpole
    del spec

    # --- Pendulum: continuous control (skip parts already covered above) ---
    pendulum = make_gym_env('Pendulum-v0', seed=0)
    assert isinstance(pendulum, Env)
    spec = EnvSpec(pendulum)
    assert isinstance(spec.action_space, Box)
    assert spec.T == 200
    assert spec.control_type == 'Continuous'
    assert pendulum.reset().shape == (3,)
    assert len(pendulum.step(pendulum.action_space.sample())) == 4
def test_episode_runner(vec_env, env_id):
    # Roll out 3 parallel environments with a constant-action (sticky) agent
    # and verify the collected batch D matches a manual re-simulation of each
    # sub-environment with the same seeds.
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)
    if env_id == 'CartPole-v1':
        sticky_action = 1
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]
    T = 30
    agent = StickyAgent(None, env_spec, sticky_action)
    runner = EpisodeRunner(None, agent, env)
    D = runner(T)
    assert D.N == 3
    assert D.maxT == max(D.Ts)
    # Re-create the three sub-environments with the exact seeds the vec env used
    seeder = Seeder(0)
    seed1, seed2, seed3 = seeder(3)
    env1 = make_gym_env(env_id, seed1)
    env2 = make_gym_env(env_id, seed2)
    env3 = make_gym_env(env_id, seed3)
    for n, ev in enumerate([env1, env2, env3]):
        obs = ev.reset()
        assert np.allclose(obs, D.observations[n][0])
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        for t in range(T):
            obs, reward, done, info = ev.step(sticky_action)
            assert np.allclose(reward, D.rewards[n][t])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert np.allclose(done, D.dones[n][t])
            assert done == D.numpy_dones[n, t]
            # mask is the complement of done: 1 while running, 0 at termination
            assert int(not done) == D.masks[n][t]
            assert int(not done) == D.numpy_masks[n, t]
            if done:
                # terminal observation is stashed in info; everything after the
                # episode end must be zero-padded (dones padded with True).
                # observations hold T+1 entries, hence the extra +1 offset.
                assert np.allclose(obs, D.infos[n][t]['terminal_observation'])
                assert D.completes[n]
                assert np.allclose(0.0, D.numpy_observations[n, t + 1 + 1:, ...])
                assert np.allclose(0.0, D.numpy_actions[n, t + 1:, ...])
                assert np.allclose(0.0, D.numpy_rewards[n, t + 1:])
                assert np.allclose(True, D.numpy_dones[n, t + 1:])
                assert np.allclose(0.0, D.numpy_masks[n, t + 1:])
                break
            else:
                assert np.allclose(obs, D.observations[n][t + 1])
def test_frame_stack():
    """FrameStack buffers the last `num_stack` observations column-wise."""
    stacked = FrameStack(make_gym_env(env_id='CartPole-v1', seed=1), num_stack=4)
    assert isinstance(stacked, FrameStack)
    assert isinstance(stacked, Env)
    assert stacked.num_stack == 4
    # CartPole observation is 4-dim, stacked along a trailing axis of size 4
    assert stacked.observation_space.shape == (4, 4)
    assert isinstance(stacked.stack_buffer, np.ndarray)
    assert stacked.stack_buffer.shape == (4, 4)
    assert np.all(stacked.stack_buffer == 0.0)
    assert stacked.stack_buffer.dtype == np.float32
    assert stacked.reset().shape == (4, 4)

    frame = stacked.step(0)[0]
    # after one step: two columns filled (reset obs + step obs), rest still zero
    assert frame[:, 0].sum() != 0.0
    assert frame[:, 1].sum() != 0.0
    assert np.all(frame[:, 2:] == 0.0)
    assert np.any(frame[:, 0] != frame[:, 1])

    frame = stacked.step(1)[0]
    frame = stacked.step(1)[0]
    # hard-coded values come from the fixed seed and action sequence
    assert np.allclose(frame[:, -1], [0.03073904, 0.00145001, -0.03088818, -0.03131252])
    assert np.allclose(frame[:, 2], [0.03076804, -0.19321568, -0.03151444, 0.25146705])
    # one more step rotates the stack buffer by a slot
    frame = stacked.step(1)[0]
    assert np.allclose(frame[:, -1], [0.03076804, -0.19321568, -0.03151444, 0.25146705])
def test_categorical_head():
    """CategoricalHead rejects continuous spaces and maps features to a Categorical."""
    # a continuous action space must be rejected
    with pytest.raises(AssertionError):
        pendulum_spec = EnvSpec(make_gym_env('Pendulum-v0', 0))
        CategoricalHead(None, None, 30, pendulum_spec)

    env_spec = EnvSpec(make_gym_env('CartPole-v1', 0))
    head = CategoricalHead(None, None, 30, env_spec)
    assert head.feature_dim == 30
    assert isinstance(head.action_head, nn.Linear)
    # 30 features in, one logit per discrete action out (CartPole has 2)
    assert head.action_head.in_features == 30 and head.action_head.out_features == 2

    dist = head(torch.randn(3, 30))
    assert isinstance(dist, Categorical)
    assert list(dist.batch_shape) == [3]
    assert list(dist.probs.shape) == [3, 2]
    assert dist.sample().shape == (3,)
def test_rolling_segment_runner(vec_env, env_id):
    # Collect a fixed-length (T-step) rolling segment with a constant-action
    # agent across 3 parallel environments, then re-simulate each
    # sub-environment with identical seeds and compare against D.
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)
    if env_id == 'CartPole-v1':
        sticky_action = 1
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]
    T = 30
    agent = StickyAgent(None, env_spec, sticky_action)
    runner = RollingSegmentRunner(None, agent, env)
    D = runner(T)
    assert D.N == 3
    assert D.T == T
    # Re-create sub-environments with the exact seeds the vec env used
    seeder = Seeder(0)
    seed1, seed2, seed3 = seeder(3)
    env1 = make_gym_env(env_id, seed1)
    env2 = make_gym_env(env_id, seed2)
    env3 = make_gym_env(env_id, seed3)
    for n, ev in enumerate([env1, env2, env3]):
        obs = ev.reset()
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        for t in range(T):
            obs, reward, done, info = ev.step(sticky_action)
            if done:
                # rolling segments auto-reset: the stored next observation is
                # the reset observation, and the true final one goes into info
                info['terminal_observation'] = obs
                obs = ev.reset()
            assert np.allclose(obs, D.numpy_observations[n, t + 1, ...])
            assert np.allclose(sticky_action, D.numpy_actions[n, t, ...])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert done == D.numpy_dones[n, t]
            assert int(not done) == D.numpy_masks[n, t]
            if done:
                assert np.allclose(info['terminal_observation'],
                                   D.infos[n][t]['terminal_observation'])
def test_diag_gaussian_head():
    """DiagGaussianHead rejects discrete spaces; check each std configuration."""
    # a discrete action space must be rejected
    with pytest.raises(AssertionError):
        cartpole_spec = EnvSpec(make_gym_env('CartPole-v1', 0))
        DiagGaussianHead(None, None, 30, cartpole_spec)

    env_spec = EnvSpec(make_gym_env('Pendulum-v0', 0))

    # default configuration
    head = DiagGaussianHead(None, None, 30, env_spec)
    assert head.feature_dim == 30
    assert isinstance(head.mean_head, nn.Linear)
    assert isinstance(head.logstd_head, nn.Parameter)
    # Pendulum has a 1-dim action
    assert head.mean_head.in_features == 30 and head.mean_head.out_features == 1
    assert list(head.logstd_head.shape) == [1]
    assert torch.eq(head.logstd_head, torch.tensor(-0.510825624))
    dist = head(torch.randn(3, 30))
    assert isinstance(dist, Independent) and isinstance(dist.base_dist, Normal)
    assert list(dist.batch_shape) == [3]
    sample = dist.sample()
    assert list(sample.shape) == [3, 1]

    # softplus-parameterized std
    head = DiagGaussianHead(None, None, 30, env_spec, std_style='softplus')
    dist = head(torch.randn(3, 30))
    sample = dist.sample()
    assert list(sample.shape) == [3, 1]
    assert torch.eq(head.logstd_head, torch.tensor(-0.19587036834631966))

    # sigmoidal-parameterized std
    head = DiagGaussianHead(None, None, 30, env_spec, std_style='sigmoidal')
    assert torch.eq(head.logstd_head, torch.tensor(-0.871222446472449))

    # state-dependent std
    head = DiagGaussianHead(None, None, 30, env_spec, std_state_dependent=True)
    dist = head(torch.randn(3, 30))
    sample = dist.sample()
    assert list(sample.shape) == [3, 1]

    # constant, non-trainable std
    head = DiagGaussianHead(None, None, 30, env_spec, constant_std=0.3)
    dist = head(torch.randn(3, 30))
    sample = dist.sample()
    assert list(sample.shape) == [3, 1]
    assert not head.logstd_head.requires_grad
    assert torch.eq(head.logstd_head, torch.tensor([-1.2039728]))
def test_constraint_action():
    """constraint_action clips actions to Pendulum's [-2, 2] bounds."""
    env_spec = EnvSpec(make_gym_env('Pendulum-v0', 0))
    cases = [
        ([1.5], [1.5]),     # within bounds: passed through unchanged
        ([3.0], [2.0]),     # above the upper bound: clipped to +2
        ([-10.0], [-2.0]),  # below the lower bound: clipped to -2
    ]
    for raw, expected in cases:
        clipped = constraint_action(env_spec, torch.tensor(raw))
        assert torch.eq(clipped, torch.tensor(expected))
def test_resize_observation(self, env_id):
    """ResizeObservation resizes H and W but keeps the channel dimension."""
    env = make_gym_env(env_id, 0)
    resized = ResizeObservation(env, 16)
    # channel count is preserved across the wrapper
    assert env.observation_space.shape[-1] == resized.observation_space.shape[-1]
    assert resized.observation_space.shape[:2] == (16, 16)
    # the wrapped env's own observations are untouched
    raw_obs = env.reset()
    assert raw_obs.shape == env.observation_space.shape
    small_obs = resized.reset()
    assert small_obs.shape[:2] == (16, 16)
    assert small_obs.shape == resized.observation_space.shape
def test_random_agent():
    """RandomAgent samples valid actions for single and vectorized envs.

    Fix: the vectorized branch previously fed the *single* env's observation
    (``env.reset()``) to the vec-env agent instead of resetting ``venv``.
    """
    # single continuous environment: one boxed action of shape (1,)
    env = make_gym_env('Pendulum-v0', 0)
    env_spec = EnvSpec(env)
    agent = RandomAgent(None, env_spec)
    out = agent.choose_action(env.reset())
    assert isinstance(out, dict)
    assert 'action' in out and out['action'].shape == (1,)
    # vectorized discrete environment: one integer action per sub-env
    venv = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 3, 0)
    env_spec = EnvSpec(venv)
    agent = RandomAgent(None, env_spec)
    out = agent.choose_action(venv.reset())  # BUGFIX: was env.reset()
    assert isinstance(out, dict)
    assert 'action' in out and len(out['action']) == 3 and isinstance(out['action'][0], int)
def test_gray_scale_observation(self, env_id):
    """GrayScaleObservation collapses RGB (3 channels) down to one channel."""
    env = make_gym_env(env_id, 0)
    gray = GrayScaleObservation(env, keep_dim=True)
    # spatial dimensions unchanged; channels go 3 -> 1 (kept as a dim)
    assert env.observation_space.shape[:2] == gray.observation_space.shape[:2]
    assert env.observation_space.shape[-1] == 3
    assert gray.observation_space.shape[-1] == 1
    color_obs = env.reset()
    assert color_obs.shape == env.observation_space.shape
    gray_obs = gray.reset()
    assert gray_obs.shape == gray.observation_space.shape
def test_random_policy():
    """RandomPolicy outputs valid actions for single and vectorized envs.

    Fix: the vectorized branch previously reset the *single* env
    (``env.reset()``) instead of the vec env it is being tested against.
    """
    # single continuous environment: one boxed action of shape (1,)
    env = make_gym_env('Pendulum-v0', 0)
    env_spec = EnvSpec(env)
    policy = RandomPolicy(None, env_spec)
    out = policy(env.reset())
    assert isinstance(out, dict)
    assert 'action' in out and out['action'].shape == (1, )
    # vectorized discrete environment: one integer action per sub-env
    venv = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v0', 3, 0, False)
    env_spec = EnvSpec(venv)
    policy = RandomPolicy(None, env_spec)
    out = policy(venv.reset())  # BUGFIX: was env.reset()
    assert isinstance(out, dict)
    assert 'action' in out and len(out['action']) == 3 and isinstance(
        out['action'][0], int)
def test_sticky_agent():
    """StickyAgent always returns the fixed action it was constructed with.

    Fix: the vectorized branch previously fed the *single* env's observation
    (``env.reset()``) to the vec-env agent instead of resetting ``venv``.
    """
    sticky_action = 0
    # single environment: a lone integer action equal to sticky_action
    env = make_gym_env('CartPole-v1', 0)
    env_spec = EnvSpec(env)
    agent = StickyAgent(None, env_spec, sticky_action)
    out = agent.choose_action(env.reset())
    assert isinstance(out, dict)
    assert 'action' in out and isinstance(out['action'], int)
    assert out['action'] == sticky_action
    # vectorized environment: the same action repeated per sub-env
    venv = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 3, 0)
    env_spec = EnvSpec(venv)
    agent = StickyAgent(None, env_spec, sticky_action)
    out = agent.choose_action(venv.reset())  # BUGFIX: was env.reset()
    assert isinstance(out, dict)
    assert 'action' in out and len(out['action']) == 3 and isinstance(out['action'][0], int)
    assert np.allclose(out['action'], [0, 0, 0])
def eval(self, n):
    """Evaluate the current policy for one logging iteration.

    Args:
        n (int): current iteration index; recorded in the output for logging.

    Returns:
        dict: {'D': trajectories collected by TrajectoryRunner, 'n': n}
    """
    # Set network as evaluation mode
    self.agent.policy.network.eval()
    # Create a new instance of the environment (fresh, unmonitored)
    env = make_gym_env(env_id=self.config['env:id'],
                       seed=self.config['seed'],
                       monitor=False,
                       monitor_dir=None)
    # Create a TrajectoryRunner
    runner = TrajectoryRunner(agent=self.agent,
                              env=env,
                              gamma=self.config['algo:gamma'])
    # Evaluate the agent for a set of trajectories
    D = runner(N=self.config['eval:N'], T=self.config['eval:T'])
    # Return evaluation output
    eval_output = {}
    eval_output['D'] = D
    eval_output['n'] = n
    return eval_output
def test_batch_episode(vec_env, env_id):
    """BatchEpisode should record and zero-pad a 3-env rollout correctly.

    Fix: the dones dtype check uses ``np.bool_`` instead of the alias
    ``np.bool``, which was deprecated in NumPy 1.20 and removed in 1.24.
    """
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)
    D = BatchEpisode(env_spec)
    if env_id == 'CartPole-v1':
        sticky_action = 1
        action_shape = ()
        action_dtype = np.int32
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]
        action_shape = env_spec.action_space.shape
        action_dtype = np.float32
    # Drive the vec env with a constant action, recording everything in D
    obs = env.reset()
    D.add_observation(obs)
    for t in range(30):
        action = [sticky_action] * env.num_env
        obs, reward, done, info = env.step(action)
        D.add_observation(obs)
        D.add_action(action)
        D.add_reward(reward)
        D.add_done(done)
        D.add_info(info)
        D.add_batch_info({'V': [0.1 * (t + 1), (t + 1), 10 * (t + 1)]})
        [D.set_completed(n) for n, d in enumerate(done) if d]
    assert D.N == 3
    assert len(D.Ts) == 3
    assert D.maxT == max(D.Ts)
    assert all([isinstance(x, np.ndarray)
                for x in [D.numpy_observations, D.numpy_actions,
                          D.numpy_rewards, D.numpy_dones, D.numpy_masks]])
    assert all([x.dtype == np.float32
                for x in [D.numpy_observations, D.numpy_rewards, D.numpy_masks]])
    assert all([x.shape == (3, D.maxT)
                for x in [D.numpy_rewards, D.numpy_dones, D.numpy_masks]])
    assert D.numpy_actions.dtype == action_dtype
    # BUGFIX: np.bool (deprecated NumPy 1.20, removed 1.24) -> np.bool_
    assert D.numpy_dones.dtype == np.bool_
    assert D.numpy_observations.shape == (3, D.maxT + 1) + env_spec.observation_space.shape
    assert D.numpy_actions.shape == (3, D.maxT) + action_shape
    assert isinstance(D.batch_infos, list) and len(D.batch_infos) == 30
    assert np.allclose([0.1 * (x + 1) for x in range(30)],
                       [info['V'][0] for info in D.batch_infos])
    assert np.allclose([1 * (x + 1) for x in range(30)],
                       [info['V'][1] for info in D.batch_infos])
    assert np.allclose([10 * (x + 1) for x in range(30)],
                       [info['V'][2] for info in D.batch_infos])
    # Re-simulate each sub-environment with the same seeds and compare with D
    seeder = Seeder(0)
    seed1, seed2, seed3 = seeder(3)
    env1 = make_gym_env(env_id, seed1)
    env2 = make_gym_env(env_id, seed2)
    env3 = make_gym_env(env_id, seed3)
    for n, ev in enumerate([env1, env2, env3]):
        obs = ev.reset()
        assert np.allclose(obs, D.observations[n][0])
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        for t in range(30):
            obs, reward, done, info = ev.step(sticky_action)
            assert np.allclose(reward, D.rewards[n][t])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert np.allclose(done, D.dones[n][t])
            assert done == D.numpy_dones[n, t]
            assert int(not done) == D.masks[n][t]
            assert int(not done) == D.numpy_masks[n, t]
            if done:
                # padding past episode end: zeros, with dones padded as True;
                # observations hold T+1 entries, hence the extra +1 offset
                assert np.allclose(obs, D.infos[n][t]['terminal_observation'])
                assert D.completes[n]
                assert np.allclose(0.0, D.numpy_observations[n, t + 1 + 1:, ...])
                assert np.allclose(0.0, D.numpy_actions[n, t + 1:, ...])
                assert np.allclose(0.0, D.numpy_rewards[n, t + 1:])
                assert np.allclose(True, D.numpy_dones[n, t + 1:])
                assert np.allclose(0.0, D.numpy_masks[n, t + 1:])
                break
            else:
                assert np.allclose(obs, D.observations[n][t + 1])
def test_batch_segment(vec_env, env_id):
    """BatchSegment should record a fixed-length (T-step) rollout correctly.

    Fix: the dones dtype check uses ``np.bool_`` instead of the alias
    ``np.bool``, which was deprecated in NumPy 1.20 and removed in 1.24.
    """
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)
    T = 30
    D = BatchSegment(env_spec, T)
    if env_id == 'CartPole-v1':
        sticky_action = 1
        action_shape = ()
        action_dtype = np.int32
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]
        action_shape = env_spec.action_space.shape
        action_dtype = np.float32
    # Drive the vec env with a constant action, recording at explicit indices
    obs = env.reset()
    D.add_observation(0, obs)
    for t in range(T):
        action = [sticky_action] * env.num_env
        obs, reward, done, info = env.step(action)
        D.add_observation(t + 1, obs)
        D.add_action(t, action)
        D.add_reward(t, reward)
        D.add_done(t, done)
        D.add_info(info)
        D.add_batch_info({'V': [0.1 * (t + 1), (t + 1), 10 * (t + 1)]})
    assert D.N == 3
    assert D.T == T
    assert all([isinstance(x, np.ndarray)
                for x in [D.numpy_observations, D.numpy_actions,
                          D.numpy_rewards, D.numpy_dones, D.numpy_masks]])
    assert all([x.dtype == np.float32
                for x in [D.numpy_observations, D.numpy_rewards, D.numpy_masks]])
    assert D.numpy_actions.dtype == action_dtype
    # BUGFIX: np.bool (deprecated NumPy 1.20, removed 1.24) -> np.bool_
    assert D.numpy_dones.dtype == np.bool_
    assert D.numpy_observations.shape[:2] == (3, T + 1)
    assert D.numpy_actions.shape == (3, T) + action_shape
    assert all([x.shape == (3, T)
                for x in [D.numpy_rewards, D.numpy_dones, D.numpy_masks]])
    assert isinstance(D.batch_infos, list) and len(D.batch_infos) == T
    assert np.allclose([0.1 * (x + 1) for x in range(T)],
                       [info['V'][0] for info in D.batch_infos])
    assert np.allclose([1 * (x + 1) for x in range(T)],
                       [info['V'][1] for info in D.batch_infos])
    assert np.allclose([10 * (x + 1) for x in range(T)],
                       [info['V'][2] for info in D.batch_infos])
    # Re-simulate each sub-environment with the same seeds and compare with D
    seeder = Seeder(0)
    seed1, seed2, seed3 = seeder(3)
    env1 = make_gym_env(env_id, seed1)
    env2 = make_gym_env(env_id, seed2)
    env3 = make_gym_env(env_id, seed3)
    for n, ev in enumerate([env1, env2, env3]):
        obs = ev.reset()
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        for t in range(T):
            obs, reward, done, info = ev.step(sticky_action)
            if done:
                # segments auto-reset: the stored next observation is the
                # reset observation; the true final one lives in info
                info['terminal_observation'] = obs
                obs = ev.reset()
            assert np.allclose(obs, D.numpy_observations[n, t + 1, ...])
            assert np.allclose(sticky_action, D.numpy_actions[n, t, ...])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert done == D.numpy_dones[n, t]
            assert int(not done) == D.numpy_masks[n, t]
            if done:
                assert np.allclose(info['terminal_observation'],
                                   D.infos[n][t]['terminal_observation'])
def __call__(self, config):
    """Run one full train/eval experiment described by `config`.

    Builds env, policy, optimizer (with optional linear LR decay), agent,
    runner and engine; trains for config['train:iter'] iterations, logging
    and evaluating every config['log:interval'] iterations; saves the
    collected train/eval logs to disk under config['log:dir']/<ID>/.

    Args:
        config (dict): experiment configuration keyed by 'section:name'.

    Returns:
        None
    """
    # Set random seeds: PyTorch, numpy.random, random
    set_global_seeds(seed=config['seed'])
    # Create an environment
    env = make_gym_env(env_id=config['env:id'],
                       seed=config['seed'],
                       monitor=False,
                       monitor_dir=None)
    # Create environment specification
    env_spec = EnvSpec(env)
    # Create device
    torch.cuda.set_device(config['cuda_id'])
    device = torch.device(
        f'cuda:{config["cuda_id"]}' if config['cuda'] else 'cpu')
    # Create policy
    network = MLP(config=config).to(device)
    policy = CategoricalPolicy(network=network, env_spec=env_spec)
    # Create optimizer
    optimizer = optim.Adam(policy.network.parameters(), lr=config['algo:lr'])
    # Create learning rate scheduler
    if config['algo:use_lr_scheduler']:
        max_epoch = config[
            'train:iter']  # Max number of lr decay, Note where lr_scheduler put
        lambda_f = lambda epoch: 1 - epoch / max_epoch  # decay learning rate for each training epoch
        lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                                   lr_lambda=lambda_f)
    # Create agent
    kwargs = {'device': device}
    if config['algo:use_lr_scheduler']:
        kwargs['lr_scheduler'] = lr_scheduler
    agent = ActorCriticAgent(policy=policy,
                             optimizer=optimizer,
                             config=config,
                             **kwargs)
    # Create runner
    runner = TrajectoryRunner(agent=agent, env=env, gamma=config['algo:gamma'])
    # Create engine
    engine = Engine(agent=agent, runner=runner, config=config, logger=None)
    # Training and evaluation
    train_logs = []
    eval_logs = []
    for i in range(config['train:iter']):
        train_output = engine.train(i)
        # Logging and evaluation: on the first iteration and every interval
        if i == 0 or (i + 1) % config['log:interval'] == 0:
            # Log training and record the loggings
            train_logger = engine.log_train(train_output)
            train_logs.append(train_logger.logs)
            # Log evaluation and record the loggings
            eval_output = engine.eval(i)
            eval_logger = engine.log_eval(eval_output)
            eval_logs.append(eval_logger.logs)
    # Save the loggings
    np.save(
        Path(config['log:dir']) / str(config['ID']) / 'train', train_logs)
    np.save(
        Path(config['log:dir']) / str(config['ID']) / 'eval', eval_logs)
    return None
def test_reward_scale():
    """RewardScale multiplies the raw reward (CartPole's 1.0) by `scale`."""
    scaled_env = RewardScale(make_gym_env(env_id='CartPole-v1', seed=0), scale=0.02)
    scaled_env.reset()
    _, reward, _, _ = scaled_env.step(scaled_env.action_space.sample())
    assert reward == 0.02
def test_make_gym_env(self):
    """A freshly built Pendulum environment must be a lagom Env instance."""
    pendulum = make_gym_env(env_id='Pendulum-v0', seed=1)
    assert isinstance(pendulum, Env)