def test_is_unwrappable_to():
    assert is_unwrappable_to(make_env('FrozenLake-v0'), TimeLimit)
    assert is_unwrappable_to(make_env('FrozenLake-v0'), DiscreteEnv)
    assert is_unwrappable_to(
        feature_wrapper.make('FrozenLake-v0'), FrozenLakeFeatureWrapper)
    assert is_unwrappable_to(
        feature_wrapper.make('FrozenLake8x8-v0'), FrozenLakeFeatureWrapper)
    assert is_unwrappable_to(
        feature_wrapper.make('FrozenLake-v0'), feature_wrapper.FeatureWrapper)
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, 'random')
    env = RewardWrapper(env, reward_function)
    assert is_unwrappable_to(env, RewardWrapper)
    assert is_unwrappable_to(env, feature_wrapper.FeatureWrapper)
    assert is_unwrappable_to(env, DiscreteEnv)
    assert is_unwrappable_to(env, gym.Env)
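# For reference, a minimal sketch of the behaviour this test pins down
# (an illustration under assumptions, not necessarily the library's actual
# implementation): `is_unwrappable_to` walks down the chain of gym wrappers
# via `.env` until it finds an instance of the target class or runs out of
# layers.
def _is_unwrappable_to_sketch(env, target_class) -> bool:
    while True:
        if isinstance(env, target_class):
            return True
        if not hasattr(env, 'env'):
            return False
        env = env.env  # peel off one wrapper layer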
def test_feature_count():
    env = feature_wrapper.make('FrozenLake-v0')
    # create dummy data:
    path = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]
    features = [to_one_hot(i, 16) for i in path]
    trajs = [{'features': features}]
    result = feature_count(env, trajs, gamma=1.0)
    desired = np.array(
        [0., 1., 2., 3., 4., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
    assert isinstance(result, np.ndarray)
    assert np.allclose(result, desired)

    # two times the same traj should get the same feature count:
    trajs = [{'features': features}, {'features': features}]
    result = feature_count(env, trajs, gamma=1.0)
    desired = np.array(
        [0., 1., 2., 3., 4., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
    assert isinstance(result, np.ndarray)
    assert np.allclose(result, desired)

    # repeating a traj twice should double feature count (with gamma 1):
    trajs = [{'features': features + features}]
    result = feature_count(env, trajs, gamma=1.0)
    desired = np.array(
        [0., 2., 4., 6., 8., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
    assert isinstance(result, np.ndarray)
    assert np.allclose(result, desired)

    # test gamma 0.9:
    trajs = [{'features': features}]
    result = feature_count(env, trajs, gamma=.9)
    x = .9**6 + .9**7 + .9**8 + .9**9
    desired = np.array([
        0., 1., .9 + .81, .729 + .6561 + .59049, x, 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.
    ])
    assert isinstance(result, np.ndarray)
    assert np.allclose(result, desired)

    # test gamma 0:
    result = feature_count(env, trajs, gamma=0)
    desired = np.array(
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
    assert isinstance(result, np.ndarray)
    assert np.allclose(result, desired)
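# The assertions above pin down the semantics of `feature_count`: the
# gamma-discounted sum of feature vectors within each trajectory, averaged
# over trajectories (which is why two identical trajectories yield the same
# count, while concatenating a trajectory with itself doubles it). A minimal
# sketch under that assumption, not necessarily the library's actual code:
def _feature_count_sketch(trajs, gamma):
    per_traj = [
        sum(gamma**t * f for t, f in enumerate(traj['features']))
        for traj in trajs
    ]
    return sum(per_traj) / len(per_traj)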
def make_wrapped_env(env_id: str,
                     with_feature_wrapper: bool = False,
                     reward_function_factory: Callable = None,
                     with_model_wrapper: bool = False):
    """Make an environment, potentially wrapped in FeatureWrapper,
    RewardWrapper, and BaseWorldModelWrapper.

    Parameters
    ----------
    env_id: str
        The environment's id, e.g. 'FrozenLake-v0'.
    with_feature_wrapper: bool
        Whether to use a feature wrapper.
    reward_function_factory: Callable
        A function which returns a new reward function when called. If this
        is provided, the environment will be wrapped in a RewardWrapper
        using the returned reward function.
    with_model_wrapper: bool
        Whether to use a BaseWorldModelWrapper.

    Returns
    -------
    gym.Env
        A gym environment, potentially wrapped.
    """
    assert env_id in ENV_IDS
    if with_feature_wrapper:
        assert env_id in feature_wrapper.feature_wrappable_envs()
        env = feature_wrapper.make(env_id)
    else:
        env = make_env(env_id)

    if reward_function_factory is not None:
        reward_function = reward_function_factory(env)
        assert isinstance(reward_function, BaseRewardFunction)
        env = RewardWrapper(env, reward_function)

    if with_model_wrapper:
        if utils.wrapper.is_unwrappable_to(env, DiscreteEnv):
            env = DiscreteEnvModelWrapper(env)
        elif utils.wrapper.is_unwrappable_to(env, MazeWorld):
            env = MazeModelWrapper(env)
        else:
            raise NotImplementedError()
    return env
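# Example usage (the lambda factory is illustrative; any callable that
# returns a BaseRewardFunction for the given environment works):
# env = make_wrapped_env(
#     'FrozenLake-v0',
#     with_feature_wrapper=True,
#     reward_function_factory=lambda env: FeatureBasedRewardFunction(
#         env, 'random'),
#     with_model_wrapper=True)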
store_to = 'data/frozen/expert/'
no_episodes = 1000
max_steps_per_episode = 100


def rl_alg_factory(env):
    '''Return an RL algorithm which is used both for the expert
    and in the IRL loop.'''
    return TabularQ(env)


# Apprenticeship IRL assumes that rewards are linear in features.
# However, FrozenLake doesn't provide features. It is sufficiently small
# to work with tabular methods. Therefore, we just use a wrapper that uses
# a one-hot encoding of the state space as features.
env = feature_wrapper.make('FrozenLake-v0')

# Generate expert trajectories.
expert_agent = rl_alg_factory(env)
print('Training expert agent...')
expert_agent.train(15)
print('Done training expert')
expert_trajs = collect_trajs(env, expert_agent, no_episodes,
                             max_steps_per_episode, store_to)

# You can comment out the previous block if expert data has already been
# generated, and load the trajectories from file by uncommenting the next
# two lines:
# with open(store_to + 'trajs.pkl', 'rb') as f:
#     expert_trajs = pickle.load(f)
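# Each element of expert_trajs is a per-episode dictionary; judging from
# the trajectory fixtures used in the tests, it has the form
# {'states': [...], 'actions': [...], 'rewards': [...],
#  'true_rewards': [...], 'features': [...]}.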
def quick_run_alg(alg_class, config={}):
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, 'random')
    env = RewardWrapper(env, reward_function)

    def rl_alg_factory(env):
        return ValueIteration(env, {})

    # Two hard-coded expert trajectories. Their 'features' entries are the
    # one-hot encodings of the successor states (states[1:]), so they are
    # derived via the to_one_hot helper instead of spelling out the
    # 16-dimensional arrays.
    states_0 = [
        0, 0, 4, 0, 4, 8, 4, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 4, 4, 8, 9, 8,
        8, 9, 10, 14, 15
    ]
    states_1 = [0, 4, 8, 8, 9, 10, 6, 2, 6, 10, 14, 15]
    expert_trajs = [{
        'states': states_0,
        'actions': [
            0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1,
            3, 3, 1, 0, 1
        ],
        'rewards': [0.0] * 26 + [1.0],
        'true_rewards': [],
        'features': [to_one_hot(s, 16) for s in states_0[1:]]
    }, {
        'states': states_1,
        'actions': [0, 0, 3, 3, 1, 0, 2, 0, 2, 0, 1],
        'rewards': [0.0] * 10 + [1.0],
        'true_rewards': [],
        'features': [to_one_hot(s, 16) for s in states_1[1:]]
    }]
    metrics = []
    alg = alg_class(env, expert_trajs, rl_alg_factory, metrics, config)
    alg.train(2, 2, 2)
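# `quick_run_alg` is a smoke-test helper: it constructs a given IRL
# algorithm class and trains it for a couple of tiny iterations on
# FrozenLake. Usage (the class name is a placeholder for whichever IRL
# algorithm is under test):
# quick_run_alg(SomeIRLAlgorithm, config={})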
def test_value_iteration():
    # gamma = 1.0
    env = gym.make('FrozenLake-v0')
    agent = ValueIteration(env, {'gamma': 1.0})
    agent.train(10)
    state_values = agent.state_values
    assert isinstance(state_values, np.ndarray)
    assert state_values.shape == (17, )
    # argmax should be the state just before the frisbee
    # (15 is the final state, 16 is the absorbing state)
    assert np.argmax(state_values) == 14
    assert state_values[14] > 0.93 and state_values[14] < 0.95
    assert state_values[15] == 0

    # gamma = 0.9
    env = gym.make('FrozenLake-v0')
    agent = ValueIteration(env, {'gamma': 0.9})
    agent.train(10)
    state_values = agent.state_values
    assert isinstance(state_values, np.ndarray)
    assert state_values.shape == (17, )
    # argmax should be the state just before the frisbee
    # (15 is the final state, 16 is the absorbing state)
    assert np.argmax(state_values) == 14
    assert state_values[14] > 0.63 and state_values[14] < 0.65
    # holes and frisbee should have zero value:
    for i in [5, 7, 11, 12, 15]:
        assert state_values[i] == 0

    # check some q values:
    # go right in the second to last state
    assert np.argmax(agent.q_values[14, :]) == 1
    assert np.min(agent.q_values) == 0
    assert np.max(agent.q_values) <= 1

    # check policy:
    for i in range(16):
        assert np.isclose(np.sum(agent.policy(i)), 1.)
        assert np.min(agent.policy(i)) >= 0.
        assert np.argmax(agent.q_values[i, :]) == np.argmax(agent.policy(i))

    # check softmax policy
    old_state_values = agent.state_values
    old_q_values = agent.q_values
    agent = ValueIteration(env, {'gamma': 0.9, 'temperature': 0.1})
    agent.train(10)
    assert np.all(agent.state_values <= old_state_values)
    # at least the initial state should now have a lower value:
    assert agent.state_values[0] < old_state_values[0]
    assert np.all(agent.q_values <= old_q_values)

    # check policy:
    for i in range(16):
        assert np.isclose(np.sum(agent.policy(i)), 1.)
        assert np.min(agent.policy(i)) >= 0.
        assert np.argmax(agent.q_values[i, :]) == np.argmax(agent.policy(i))
        # ordering of probabilities should stay the same with softmax
        assert np.all(
            np.argsort(old_q_values[i, :]) == np.argsort(agent.policy(i)))

    # test policy array:
    policy_array = agent.policy_array()
    assert policy_array.shape == (17, 4)
    for i in range(16):
        assert np.all(agent.policy(i) == policy_array[i, :])

    # check that the true reward isn't leaked: with an all-zero reward
    # function, every state value must be zero. (np.all rather than np.sum,
    # since np.sum would be truthy as soon as any single value is zero.)
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, np.zeros(16))
    env = RewardWrapper(env, reward_function)
    agent = ValueIteration(env, {})
    agent.train(10)
    assert np.all(agent.state_values == 0)
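# A minimal sketch of the tabular value-iteration update the test above
# exercises, assuming the gym DiscreteEnv convention that P[s][a] is a list
# of (prob, next_state, reward, done) tuples (illustrative, not the
# library's actual code):
def _value_iteration_sketch(P, n_states, n_actions, gamma, n_iterations):
    V = np.zeros(n_states)
    Q = np.zeros((n_states, n_actions))
    for _ in range(n_iterations):
        for s in range(n_states):
            for a in range(n_actions):
                # expected one-step return, bootstrapping from V
                Q[s, a] = sum(p * (r + gamma * V[s2] * (not done))
                              for p, s2, r, done in P[s][a])
        V = Q.max(axis=1)  # greedy backup
    return V, Q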
# Define important script constants here:
store_to = 'data/pendulum/expert/'
no_episodes = 1000
max_steps_per_episode = 200


def rl_alg_factory(env):
    '''Return an RL algorithm which is used both for the expert
    and in the IRL loop.'''
    return PPO(env)


# Pendulum has features that can easily be extracted from the previous
# state, see PendulumFeatureWrapper.
env = feature_wrapper.make('Pendulum-v0')

# Generate expert trajectories:
# expert_agent = rl_alg_factory(env)
# print('Training expert agent...')
# expert_agent.train(15)
# print('Done training expert')
# expert_trajs = collect_trajs(env, expert_agent, no_episodes,
#                              max_steps_per_episode, store_to)

# The block above is commented out because expert data has already been
# generated; the trajectories are loaded from file instead:
with open(store_to + 'trajs.pkl', 'rb') as f:
    expert_trajs = pickle.load(f)
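# For context: Pendulum-v0's true reward is already linear in three simple
# features of the pre-transition state and the action,
#     reward = -(theta**2 + 0.1 * theta_dot**2 + 0.001 * action**2),
# which is presumably what PendulumFeatureWrapper exploits. A sketch of such
# a feature map (an assumption for illustration, not necessarily the
# wrapper's exact code):
def pendulum_features_sketch(theta, theta_dot, action):
    '''Features in which the true Pendulum reward is linear.

    theta is the pole angle normalized to [-pi, pi].
    '''
    return np.array([theta**2, theta_dot**2, action**2])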