def test_is_unwrappable_to():
    assert is_unwrappable_to(make_env('FrozenLake-v0'), TimeLimit)
    assert is_unwrappable_to(make_env('FrozenLake-v0'), DiscreteEnv)
    assert is_unwrappable_to(
        feature_wrapper.make('FrozenLake-v0'), FrozenLakeFeatureWrapper)
    assert is_unwrappable_to(
        feature_wrapper.make('FrozenLake8x8-v0'), FrozenLakeFeatureWrapper)
    assert is_unwrappable_to(
        feature_wrapper.make('FrozenLake-v0'), feature_wrapper.FeatureWrapper)
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, 'random')
    env = RewardWrapper(env, reward_function)
    assert is_unwrappable_to(env, RewardWrapper)
    assert is_unwrappable_to(env, feature_wrapper.FeatureWrapper)
    assert is_unwrappable_to(env, DiscreteEnv)
    assert is_unwrappable_to(env, gym.Env)
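
# For reference, a minimal sketch of what `is_unwrappable_to` is expected to
# do (the real implementation lives in irl_benchmark.utils; this version is
# an illustrative assumption, not the library's code): walk the chain of
# `.env` attributes until the requested wrapper/env class is found.


def _is_unwrappable_to_sketch(env, target_class) -> bool:
    """Return True if `env` is `target_class` or wraps it at some depth."""
    while True:
        if isinstance(env, target_class):
            return True
        if not hasattr(env, 'env'):
            return False
        env = env.env  # peel off one wrapper layer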
def make_wrapped_env(env_id: str,
                     with_feature_wrapper: bool = False,
                     reward_function_factory: Callable = None,
                     with_model_wrapper: bool = False):
    """Make an environment, potentially wrapped in FeatureWrapper,
    RewardWrapper, and BaseWorldModelWrapper.

    Parameters
    ----------
    env_id: str
        The environment's id, e.g. 'FrozenLake-v0'.
    with_feature_wrapper: bool
        Whether to use a feature wrapper.
    reward_function_factory: Callable
        A function which returns a new reward function when called. If this
        is provided, the environment will be wrapped in a RewardWrapper
        using the returned reward function.
    with_model_wrapper: bool
        Whether to use a BaseWorldModelWrapper.

    Returns
    -------
    gym.Env
        A gym environment, potentially wrapped.
    """
    assert env_id in ENV_IDS
    if with_feature_wrapper:
        assert env_id in feature_wrapper.feature_wrappable_envs()
        env = feature_wrapper.make(env_id)
    else:
        env = make_env(env_id)

    if reward_function_factory is not None:
        reward_function = reward_function_factory(env)
        assert isinstance(reward_function, BaseRewardFunction)
        env = RewardWrapper(env, reward_function)

    if with_model_wrapper:
        if utils.wrapper.is_unwrappable_to(env, DiscreteEnv):
            env = DiscreteEnvModelWrapper(env)
        elif utils.wrapper.is_unwrappable_to(env, MazeWorld):
            env = MazeModelWrapper(env)
        else:
            raise NotImplementedError()
    return env
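
# A minimal usage sketch for `make_wrapped_env`. The factory below is an
# illustrative assumption; it mirrors how reward functions are constructed
# elsewhere in this code ('random' draws random parameters):


def _random_reward_factory(env):
    return FeatureBasedRewardFunction(env, 'random')


# wrapped_env = make_wrapped_env(
#     'FrozenLake-v0',
#     with_feature_wrapper=True,
#     reward_function_factory=_random_reward_factory,
#     with_model_wrapper=True)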
# a one-hot encoding of the state space as features.
env = feature_wrapper.make('FrozenLake-v0')

# Generate expert trajectories.
expert_agent = rl_alg_factory(env)
print('Training expert agent...')
expert_agent.train(15)
print('Done training expert')
expert_trajs = collect_trajs(env, expert_agent, no_episodes,
                             max_steps_per_episode, store_to)

# You can comment out the previous block if expert data has already
# been generated, and load the trajectories from file by uncommenting
# the next two lines:
# with open(store_to + 'trajs.pkl', 'rb') as f:
#     expert_trajs = pickle.load(f)

# Provide a random reward function as the initial reward estimate.
# This probably isn't strictly required.
reward_function = FeatureBasedRewardFunction(env, np.random.normal(size=16))
env = RewardWrapper(env, reward_function)

# Run the projection algorithm for up to 10 minutes.
appr_irl = ApprIRL(env, expert_trajs, rl_alg_factory, proj=True)
appr_irl.train(
    time_limit=600,
    rl_time_per_iteration=45,
    eps=0,
    no_trajs=100,
    max_steps_per_episode=100,
    verbose=True)
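
# For context, `proj=True` selects the projection variant of apprenticeship
# learning (Abbeel & Ng, 2004). A minimal sketch of that update, assuming
# `mu_expert`, `mu` and `mu_bar` are feature expectation vectors as
# np.ndarrays (names here are illustrative, not the library's):


def _projection_step(mu_expert, mu, mu_bar):
    """Project mu_bar towards mu and return the updated mu_bar.

    The new reward parameters are w = mu_expert - mu_bar; the algorithm
    terminates once ||w|| falls below the threshold eps.
    """
    direction = mu - mu_bar
    step = np.dot(direction, mu_expert - mu_bar) / np.dot(direction, direction)
    return mu_bar + step * direction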
def quick_run_alg(alg_class, config={}):
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, 'random')
    env = RewardWrapper(env, reward_function)

    def rl_alg_factory(env):
        return ValueIteration(env, {})

    # Two hard-coded FrozenLake expert trajectories. The feature vectors
    # are one-hot encodings (np.eye(16)[s]) of each successor state,
    # i.e. of states[1:].
    states_1 = [
        0, 0, 4, 0, 4, 8, 4, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 4, 4, 8, 9,
        8, 8, 9, 10, 14, 15
    ]
    states_2 = [0, 4, 8, 8, 9, 10, 6, 2, 6, 10, 14, 15]
    expert_trajs = [{
        'states': states_1,
        'actions': [
            0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,
            1, 3, 3, 1, 0, 1
        ],
        'rewards': [0.0] * 26 + [1.0],
        'true_rewards': [],
        'features': [np.eye(16)[s] for s in states_1[1:]]
    }, {
        'states': states_2,
        'actions': [0, 0, 3, 3, 1, 0, 2, 0, 2, 0, 1],
        'rewards': [0.0] * 10 + [1.0],
        'true_rewards': [],
        'features': [np.eye(16)[s] for s in states_2[1:]]
    }]
    metrics = []
    alg = alg_class(env, expert_trajs, rl_alg_factory, metrics, config)
    alg.train(2, 2, 2)
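
# `quick_run_alg` is a smoke-test helper: it runs an IRL algorithm class
# for two short iterations against the hard-coded trajectories above.
# A usage sketch (the choice of ApprIRL as algorithm class is illustrative):
#
# quick_run_alg(ApprIRL)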
def __init__(self, env_id: str, expert_trajs_path: str,
             irl_alg_factory: Callable[[gym.Env, List[Dict[str, list]]],
                                       BaseIRLAlgorithm],
             metrics: List[BaseMetric], rl_config: dict, irl_config: dict,
             run_config: dict):
    """
    Parameters
    ----------
    env_id: str
        The environment id of a gym environment. This is the id passed
        to gym.make().
    expert_trajs_path: str
        A path to the folder where expert trajectories are stored. The file
        with expert trajectories must be expert_trajs_path/trajs.data.
    irl_alg_factory: Callable[[gym.Env, List[Dict[str, list]]], BaseIRLAlgorithm]
        A factory function which takes a gym environment and expert
        trajectories and returns a subclass of BaseIRLAlgorithm.
    metrics: List[BaseMetric]
        The metrics to be evaluated after running the IRL algorithm.
    rl_config: dict
        Configuration for the RL algorithm used within the IRL algorithm.
    irl_config: dict
        Configuration for the IRL algorithm.
    run_config: dict
        A dictionary containing the configuration of the run.
        Required fields are:
        'reward_function': subclass of BaseRewardFunction,
            e.g. FeatureBasedRewardFunction.
        'no_expert_trajs': int, number of expert trajectories to be used.
        'no_irl_iterations': int, number of iterations the IRL algorithm
            is run for.
        'no_rl_episodes_per_irl_iteration': int, how many episodes the RL
            agent is allowed to run each iteration.
        'no_irl_episodes_per_irl_iteration': int, how many episodes can be
            sampled for the IRL algorithm each iteration.
    """
    # Create and wrap the environment according to the specified
    # reward function:
    if run_config['reward_function'] is FeatureBasedRewardFunction:
        # Feature-based reward functions need a FeatureWrapper:
        self.env = feature_make(env_id)
    elif run_config['reward_function'] is TabularRewardFunction:
        self.env = gym.make(env_id)
    else:
        raise NotImplementedError()
    # Wrap the environment in a RewardWrapper to prevent leaking of the
    # true reward:
    reward_function = run_config['reward_function'](
        self.env, parameters='random')
    self.env = RewardWrapper(self.env, reward_function)

    # Load expert trajectories:
    self.expert_trajs = load_stored_trajs(expert_trajs_path)
    # Use only the specified number of expert trajectories:
    assert len(self.expert_trajs) >= run_config['no_expert_trajs']
    self.expert_trajs = self.expert_trajs[:run_config['no_expert_trajs']]

    self.irl_alg_factory = irl_alg_factory

    # Metrics are only passed as classes and need to be instantiated.
    instantiated_metrics = []
    # Collect all information relevant for certain metric __init__s:
    metric_input = {
        'env': self.env,
        'expert_trajs': self.expert_trajs,
        'true_reward': truth.make(env_id),
    }
    # Instantiate metrics:
    for metric in metrics:
        instantiated_metrics.append(metric(metric_input))
    self.metrics = instantiated_metrics

    self.rl_config = rl_config
    self.irl_config = irl_config
    self.run_config = run_config
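
# A construction sketch for this __init__. The enclosing class name is not
# shown above, so `Run` is a hypothetical stand-in, as is the ApprIRL-based
# factory:
#
# def irl_alg_factory(env, expert_trajs):
#     return ApprIRL(env, expert_trajs, rl_alg_factory, proj=True)
#
# run = Run('FrozenLake-v0', 'data/frozen/expert/', irl_alg_factory,
#           metrics=[], rl_config={}, irl_config={},
#           run_config={
#               'reward_function': FeatureBasedRewardFunction,
#               'no_expert_trajs': 100,
#               'no_irl_iterations': 2,
#               'no_rl_episodes_per_irl_iteration': 100,
#               'no_irl_episodes_per_irl_iteration': 100,
#           })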
def test_value_iteration():
    # gamma = 1.0
    env = gym.make('FrozenLake-v0')
    agent = ValueIteration(env, {'gamma': 1.0})
    agent.train(10)
    state_values = agent.state_values
    assert isinstance(state_values, np.ndarray)
    assert state_values.shape == (17, )
    # Argmax should be the state just before the frisbee
    # (15 is the final state, 16 is the absorbing state).
    assert np.argmax(state_values) == 14
    assert state_values[14] > 0.93 and state_values[14] < 0.95
    assert state_values[15] == 0

    # gamma = 0.9
    env = gym.make('FrozenLake-v0')
    agent = ValueIteration(env, {'gamma': 0.9})
    agent.train(10)
    state_values = agent.state_values
    assert isinstance(state_values, np.ndarray)
    assert state_values.shape == (17, )
    # Argmax should be the state just before the frisbee
    # (15 is the final state, 16 is the absorbing state).
    assert np.argmax(state_values) == 14
    assert state_values[14] > 0.63 and state_values[14] < 0.65
    # Holes and the frisbee should have zero value:
    for i in [5, 7, 11, 12, 15]:
        assert state_values[i] == 0

    # Check some q-values:
    # go right in the second to last state
    assert np.argmax(agent.q_values[14, :]) == 1
    assert np.min(agent.q_values) == 0
    assert np.max(agent.q_values) <= 1

    # Check policy:
    for i in range(16):
        assert np.isclose(np.sum(agent.policy(i)), 1.)
        assert np.min(agent.policy(i)) >= 0.
        assert np.argmax(agent.q_values[i, :]) == np.argmax(agent.policy(i))

    # Check softmax policy:
    old_state_values = agent.state_values
    old_q_values = agent.q_values
    agent = ValueIteration(env, {'gamma': 0.9, 'temperature': 0.1})
    agent.train(10)
    assert np.all(agent.state_values <= old_state_values)
    # At least the initial state should now have a lower value:
    assert agent.state_values[0] < old_state_values[0]
    assert np.all(agent.q_values <= old_q_values)
    # Check policy:
    for i in range(16):
        assert np.isclose(np.sum(agent.policy(i)), 1.)
        assert np.min(agent.policy(i)) >= 0.
        assert np.argmax(agent.q_values[i, :]) == np.argmax(agent.policy(i))
        # The ordering of probabilities should stay the same with softmax:
        assert np.all(
            np.argsort(old_q_values[i, :]) == np.argsort(agent.policy(i)))

    # Test policy array:
    policy_array = agent.policy_array()
    assert policy_array.shape == (17, 4)
    for i in range(16):
        assert np.all(agent.policy(i) == policy_array[i, :])

    # Check that the true reward isn't leaked: with an all-zero reward
    # function, all state values must be zero.
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, np.zeros(16))
    env = RewardWrapper(env, reward_function)
    agent = ValueIteration(env, {})
    agent.train(10)
    assert np.all(agent.state_values == 0)
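
# For reference, a minimal sketch of the Bellman backup a tabular
# ValueIteration agent performs (illustrative, assuming a transition tensor
# P[s, a, s'] and a reward vector R[s'] as inputs; not the library's code):


def _value_iteration_sketch(P, R, gamma=0.9, n_iterations=10):
    """Return state values V and q-values Q for a tabular MDP."""
    n_states, n_actions, _ = P.shape
    V = np.zeros(n_states)
    for _ in range(n_iterations):
        # Q[s, a] = sum_{s'} P[s, a, s'] * (R[s'] + gamma * V[s'])
        Q = np.einsum('ijk,k->ij', P, R + gamma * V)
        V = Q.max(axis=1)
    return V, Q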
import gym
import numpy as np

from irl_benchmark.irl.collect import collect_trajs
from irl_benchmark.irl.feature.feature_wrapper import FrozenLakeFeatureWrapper
from irl_benchmark.irl.reward.reward_function import FeatureBasedRewardFunction
from irl_benchmark.irl.reward.reward_wrapper import RewardWrapper
from irl_benchmark.rl.algorithms.value_iteration import ValueIteration
from irl_benchmark.utils.utils import get_transition_matrix

store_to = 'data/frozen/expert/'
no_episodes = 1000
max_steps_per_episode = 1000

env = gym.make('FrozenLake8x8-v0')
env = FrozenLakeFeatureWrapper(env)
initial_reward_function_estimate = FeatureBasedRewardFunction(
    env=env, parameters=np.zeros(64))
env = RewardWrapper(env=env, reward_function=initial_reward_function_estimate)

# Generate expert trajectories.
expert_agent = ValueIteration(env)
print('Training expert agent...')
expert_agent.train(30)
expert_trajs = collect_trajs(env, expert_agent, no_episodes,
                             max_steps_per_episode, store_to)

feat_map = np.eye(64)
transition_dynamics = get_transition_matrix(env)


def rl_alg_factory(env):
    return ValueIteration(env)
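
# `feat_map` and `transition_dynamics` are the typical inputs to a maximum
# entropy IRL step. As a hedged sketch (illustrative, not the library's
# implementation), the gradient of the MaxEnt objective compares expert
# feature expectations with those induced by the current policy:


def _maxent_gradient_sketch(feat_map, expert_svf, learner_svf):
    """Gradient of the MaxEnt IRL objective w.r.t. reward parameters.

    expert_svf and learner_svf are state visitation frequency vectors
    (illustrative names); feat_map maps states to feature vectors.
    """
    return feat_map.T.dot(expert_svf) - feat_map.T.dot(learner_svf)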