def test_is_unwrappable_to():
    assert is_unwrappable_to(make_env('FrozenLake-v0'), TimeLimit)
    assert is_unwrappable_to(make_env('FrozenLake-v0'), DiscreteEnv)
    assert is_unwrappable_to(
        feature_wrapper.make('FrozenLake-v0'), FrozenLakeFeatureWrapper)
    assert is_unwrappable_to(
        feature_wrapper.make('FrozenLake8x8-v0'), FrozenLakeFeatureWrapper)
    assert is_unwrappable_to(
        feature_wrapper.make('FrozenLake-v0'), feature_wrapper.FeatureWrapper)
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, 'random')
    env = RewardWrapper(env, reward_function)
    assert is_unwrappable_to(env, RewardWrapper)
    assert is_unwrappable_to(env, feature_wrapper.FeatureWrapper)
    assert is_unwrappable_to(env, DiscreteEnv)
    assert is_unwrappable_to(env, gym.Env)
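
# For reference, a minimal sketch of what `is_unwrappable_to` is expected to
# do (the real implementation lives in irl_benchmark.utils; this version is
# an illustrative assumption, not the library's code): walk the chain of
# `.env` attributes until the requested wrapper/env class is found.


def _is_unwrappable_to_sketch(env, target_class) -> bool:
    """Return True if `env` is `target_class` or wraps it at some depth."""
    while True:
        if isinstance(env, target_class):
            return True
        if not hasattr(env, 'env'):
            return False
        env = env.env  # peel off one wrapper layer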
def make_wrapped_env(env_id: str,
                     with_feature_wrapper: bool = False,
                     reward_function_factory: Callable = None,
                     with_model_wrapper: bool = False):
    """Make an environment, potentially wrapped in FeatureWrapper,
    RewardWrapper, and BaseWorldModelWrapper.

    Parameters
    ----------
    env_id: str
        The environment's id, e.g. 'FrozenLake-v0'.
    with_feature_wrapper: bool
        Whether to use a feature wrapper.
    reward_function_factory: Callable
        A function which returns a new reward function when called. If this
        is provided, the environment will be wrapped in a RewardWrapper
        using the returned reward function.
    with_model_wrapper: bool
        Whether to use a BaseWorldModelWrapper.

    Returns
    -------
    gym.Env
        A gym environment, potentially wrapped.
    """
    assert env_id in ENV_IDS
    if with_feature_wrapper:
        assert env_id in feature_wrapper.feature_wrappable_envs()
        env = feature_wrapper.make(env_id)
    else:
        env = make_env(env_id)

    if reward_function_factory is not None:
        reward_function = reward_function_factory(env)
        assert isinstance(reward_function, BaseRewardFunction)
        env = RewardWrapper(env, reward_function)

    if with_model_wrapper:
        if utils.wrapper.is_unwrappable_to(env, DiscreteEnv):
            env = DiscreteEnvModelWrapper(env)
        elif utils.wrapper.is_unwrappable_to(env, MazeWorld):
            env = MazeModelWrapper(env)
        else:
            raise NotImplementedError()
    return env
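
# A minimal usage sketch for `make_wrapped_env`. The factory below is an
# illustrative assumption; it mirrors how reward functions are constructed
# elsewhere in this code ('random' draws random parameters):


def _random_reward_factory(env):
    return FeatureBasedRewardFunction(env, 'random')


# wrapped_env = make_wrapped_env(
#     'FrozenLake-v0',
#     with_feature_wrapper=True,
#     reward_function_factory=_random_reward_factory,
#     with_model_wrapper=True)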
# a one-hot encoding of the state space as features.
env = feature_wrapper.make('FrozenLake-v0')

# Generate expert trajectories.
expert_agent = rl_alg_factory(env)
print('Training expert agent...')
expert_agent.train(15)
print('Done training expert')
expert_trajs = collect_trajs(env, expert_agent, no_episodes,
                             max_steps_per_episode, store_to)

# You can comment out the previous block if expert data has already
# been generated, and load the trajectories from file by uncommenting
# the next two lines:
# with open(store_to + 'trajs.pkl', 'rb') as f:
#     expert_trajs = pickle.load(f)

# Provide a random reward function as the initial reward estimate.
# This probably isn't strictly required.
reward_function = FeatureBasedRewardFunction(env, np.random.normal(size=16))
env = RewardWrapper(env, reward_function)

# Run the projection algorithm for up to 10 minutes.
appr_irl = ApprIRL(env, expert_trajs, rl_alg_factory, proj=True)
appr_irl.train(
    time_limit=600,
    rl_time_per_iteration=45,
    eps=0,
    no_trajs=100,
    max_steps_per_episode=100,
    verbose=True)
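
# For context, `proj=True` selects the projection variant of apprenticeship
# learning (Abbeel & Ng, 2004). A minimal sketch of that update, assuming
# `mu_expert`, `mu` and `mu_bar` are feature expectation vectors as
# np.ndarrays (names here are illustrative, not the library's):


def _projection_step(mu_expert, mu, mu_bar):
    """Project mu_bar towards mu and return the updated mu_bar.

    The new reward parameters are w = mu_expert - mu_bar; the algorithm
    terminates once ||w|| falls below the threshold eps.
    """
    direction = mu - mu_bar
    step = np.dot(direction, mu_expert - mu_bar) / np.dot(direction, direction)
    return mu_bar + step * direction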
def quick_run_alg(alg_class, config={}):
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, 'random')
    env = RewardWrapper(env, reward_function)

    def rl_alg_factory(env):
        return ValueIteration(env, {})

    # Two hard-coded FrozenLake expert trajectories. The feature vectors
    # are one-hot encodings (np.eye(16)[s]) of each successor state,
    # i.e. of states[1:].
    states_1 = [
        0, 0, 4, 0, 4, 8, 4, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 4, 4, 8, 9,
        8, 8, 9, 10, 14, 15
    ]
    states_2 = [0, 4, 8, 8, 9, 10, 6, 2, 6, 10, 14, 15]
    expert_trajs = [{
        'states': states_1,
        'actions': [
            0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,
            1, 3, 3, 1, 0, 1
        ],
        'rewards': [0.0] * 26 + [1.0],
        'true_rewards': [],
        'features': [np.eye(16)[s] for s in states_1[1:]]
    }, {
        'states': states_2,
        'actions': [0, 0, 3, 3, 1, 0, 2, 0, 2, 0, 1],
        'rewards': [0.0] * 10 + [1.0],
        'true_rewards': [],
        'features': [np.eye(16)[s] for s in states_2[1:]]
    }]
    metrics = []
    alg = alg_class(env, expert_trajs, rl_alg_factory, metrics, config)
    alg.train(2, 2, 2)
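
# `quick_run_alg` is a smoke-test helper: it runs an IRL algorithm class
# for two short iterations against the hard-coded trajectories above.
# A usage sketch (the choice of ApprIRL as algorithm class is illustrative):
#
# quick_run_alg(ApprIRL)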
def __init__(self, env_id: str, expert_trajs_path: str,
             irl_alg_factory: Callable[[gym.Env, List[Dict[str, list]]],
                                       BaseIRLAlgorithm],
             metrics: List[BaseMetric], rl_config: dict, irl_config: dict,
             run_config: dict):
    """
    Parameters
    ----------
    env_id: str
        The environment id of a gym environment. This is the id passed
        to gym.make().
    expert_trajs_path: str
        A path to the folder where expert trajectories are stored. The file
        with expert trajectories must be expert_trajs_path/trajs.data.
    irl_alg_factory: Callable[[gym.Env, List[Dict[str, list]]], BaseIRLAlgorithm]
        A factory function which takes a gym environment and expert
        trajectories and returns a subclass of BaseIRLAlgorithm.
    metrics: List[BaseMetric]
        The metrics to be evaluated after running the IRL algorithm.
    rl_config: dict
        Configuration for the RL algorithm used within the IRL algorithm.
    irl_config: dict
        Configuration for the IRL algorithm.
    run_config: dict
        A dictionary containing the configuration of the run.
        Required fields are:
        'reward_function': subclass of BaseRewardFunction,
            e.g. FeatureBasedRewardFunction.
        'no_expert_trajs': int, number of expert trajectories to be used.
        'no_irl_iterations': int, number of iterations the IRL algorithm
            is run for.
        'no_rl_episodes_per_irl_iteration': int, how many episodes the RL
            agent is allowed to run each iteration.
        'no_irl_episodes_per_irl_iteration': int, how many episodes can be
            sampled for the IRL algorithm each iteration.
    """
    # Create and wrap the environment according to the specified
    # reward function:
    if run_config['reward_function'] is FeatureBasedRewardFunction:
        # Feature-based reward functions need a FeatureWrapper:
        self.env = feature_make(env_id)
    elif run_config['reward_function'] is TabularRewardFunction:
        self.env = gym.make(env_id)
    else:
        raise NotImplementedError()
    # Wrap the environment in a RewardWrapper to prevent leaking of the
    # true reward:
    reward_function = run_config['reward_function'](
        self.env, parameters='random')
    self.env = RewardWrapper(self.env, reward_function)

    # Load expert trajectories:
    self.expert_trajs = load_stored_trajs(expert_trajs_path)
    # Use only the specified number of expert trajectories:
    assert len(self.expert_trajs) >= run_config['no_expert_trajs']
    self.expert_trajs = self.expert_trajs[:run_config['no_expert_trajs']]

    self.irl_alg_factory = irl_alg_factory

    # Metrics are only passed as classes and need to be instantiated.
    instantiated_metrics = []
    # Collect all information relevant for certain metric __init__s:
    metric_input = {
        'env': self.env,
        'expert_trajs': self.expert_trajs,
        'true_reward': truth.make(env_id),
    }
    # Instantiate metrics:
    for metric in metrics:
        instantiated_metrics.append(metric(metric_input))
    self.metrics = instantiated_metrics

    self.rl_config = rl_config
    self.irl_config = irl_config
    self.run_config = run_config
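
# A construction sketch for this __init__. The enclosing class name is not
# shown above, so `Run` is a hypothetical stand-in, as is the ApprIRL-based
# factory:
#
# def irl_alg_factory(env, expert_trajs):
#     return ApprIRL(env, expert_trajs, rl_alg_factory, proj=True)
#
# run = Run('FrozenLake-v0', 'data/frozen/expert/', irl_alg_factory,
#           metrics=[], rl_config={}, irl_config={},
#           run_config={
#               'reward_function': FeatureBasedRewardFunction,
#               'no_expert_trajs': 100,
#               'no_irl_iterations': 2,
#               'no_rl_episodes_per_irl_iteration': 100,
#               'no_irl_episodes_per_irl_iteration': 100,
#           })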
def test_value_iteration():
    # gamma = 1.0
    env = gym.make('FrozenLake-v0')
    agent = ValueIteration(env, {'gamma': 1.0})
    agent.train(10)
    state_values = agent.state_values
    assert isinstance(state_values, np.ndarray)
    assert state_values.shape == (17, )
    # Argmax should be the state just before the frisbee
    # (15 is the final state, 16 is the absorbing state).
    assert np.argmax(state_values) == 14
    assert state_values[14] > 0.93 and state_values[14] < 0.95
    assert state_values[15] == 0

    # gamma = 0.9
    env = gym.make('FrozenLake-v0')
    agent = ValueIteration(env, {'gamma': 0.9})
    agent.train(10)
    state_values = agent.state_values
    assert isinstance(state_values, np.ndarray)
    assert state_values.shape == (17, )
    # Argmax should be the state just before the frisbee
    # (15 is the final state, 16 is the absorbing state).
    assert np.argmax(state_values) == 14
    assert state_values[14] > 0.63 and state_values[14] < 0.65
    # Holes and the frisbee should have zero value:
    for i in [5, 7, 11, 12, 15]:
        assert state_values[i] == 0

    # Check some q-values:
    # go right in the second to last state
    assert np.argmax(agent.q_values[14, :]) == 1
    assert np.min(agent.q_values) == 0
    assert np.max(agent.q_values) <= 1

    # Check policy:
    for i in range(16):
        assert np.isclose(np.sum(agent.policy(i)), 1.)
        assert np.min(agent.policy(i)) >= 0.
        assert np.argmax(agent.q_values[i, :]) == np.argmax(agent.policy(i))

    # Check softmax policy:
    old_state_values = agent.state_values
    old_q_values = agent.q_values
    agent = ValueIteration(env, {'gamma': 0.9, 'temperature': 0.1})
    agent.train(10)
    assert np.all(agent.state_values <= old_state_values)
    # At least the initial state should now have a lower value:
    assert agent.state_values[0] < old_state_values[0]
    assert np.all(agent.q_values <= old_q_values)
    # Check policy:
    for i in range(16):
        assert np.isclose(np.sum(agent.policy(i)), 1.)
        assert np.min(agent.policy(i)) >= 0.
        assert np.argmax(agent.q_values[i, :]) == np.argmax(agent.policy(i))
        # The ordering of probabilities should stay the same with softmax:
        assert np.all(
            np.argsort(old_q_values[i, :]) == np.argsort(agent.policy(i)))

    # Test policy array:
    policy_array = agent.policy_array()
    assert policy_array.shape == (17, 4)
    for i in range(16):
        assert np.all(agent.policy(i) == policy_array[i, :])

    # Check that the true reward isn't leaked: with an all-zero reward
    # function, all state values must be zero.
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, np.zeros(16))
    env = RewardWrapper(env, reward_function)
    agent = ValueIteration(env, {})
    agent.train(10)
    assert np.all(agent.state_values == 0)
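
# For reference, a minimal sketch of the Bellman backup a tabular
# ValueIteration agent performs (illustrative, assuming a transition tensor
# P[s, a, s'] and a reward vector R[s'] as inputs; not the library's code):


def _value_iteration_sketch(P, R, gamma=0.9, n_iterations=10):
    """Return state values V and q-values Q for a tabular MDP."""
    n_states, n_actions, _ = P.shape
    V = np.zeros(n_states)
    for _ in range(n_iterations):
        # Q[s, a] = sum_{s'} P[s, a, s'] * (R[s'] + gamma * V[s'])
        Q = np.einsum('ijk,k->ij', P, R + gamma * V)
        V = Q.max(axis=1)
    return V, Q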
import gym
import numpy as np

from irl_benchmark.irl.collect import collect_trajs
from irl_benchmark.irl.feature.feature_wrapper import FrozenLakeFeatureWrapper
from irl_benchmark.irl.reward.reward_function import FeatureBasedRewardFunction
from irl_benchmark.irl.reward.reward_wrapper import RewardWrapper
from irl_benchmark.rl.algorithms.value_iteration import ValueIteration
from irl_benchmark.utils.utils import get_transition_matrix

store_to = 'data/frozen/expert/'
no_episodes = 1000
max_steps_per_episode = 1000

env = gym.make('FrozenLake8x8-v0')
env = FrozenLakeFeatureWrapper(env)
initial_reward_function_estimate = FeatureBasedRewardFunction(
    env=env, parameters=np.zeros(64))
env = RewardWrapper(env=env, reward_function=initial_reward_function_estimate)

# Generate expert trajectories.
expert_agent = ValueIteration(env)
print('Training expert agent...')
expert_agent.train(30)
expert_trajs = collect_trajs(env, expert_agent, no_episodes,
                             max_steps_per_episode, store_to)

feat_map = np.eye(64)
transition_dynamics = get_transition_matrix(env)


def rl_alg_factory(env):
    return ValueIteration(env)
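
# `feat_map` and `transition_dynamics` are the typical inputs to a maximum
# entropy IRL step. As a hedged sketch (illustrative, not the library's
# implementation), the gradient of the MaxEnt objective compares expert
# feature expectations with those induced by the current policy:


def _maxent_gradient_sketch(feat_map, expert_svf, learner_svf):
    """Gradient of the MaxEnt IRL objective w.r.t. reward parameters.

    expert_svf and learner_svf are state visitation frequency vectors
    (illustrative names); feat_map maps states to feature vectors.
    """
    return feat_map.T.dot(expert_svf) - feat_map.T.dot(learner_svf)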