def __init__(self, env: gym.Env, config: Union[None, dict] = None):
        """

        Parameters
        ----------
        env: gym.Env
            A DiscreteEnv environment
        config: dict
            Configuration of hyperparameters.
        """
        assert is_unwrappable_to(env, BaseWorldModelWrapper)
        super(ValueIteration, self).__init__(env, config)

        self.model_wrapper = unwrap_env(env, BaseWorldModelWrapper)

        # +1 for absorbing state
        self.no_states = self.model_wrapper.n_states() + 1
        self.no_actions = env.action_space.n
        self.transitions = self.model_wrapper.get_transition_array()
        # will be filled in beginning of training:
        self.rewards = None
        # will be filled during training:
        self.state_values = None
        self.q_values = None
        # whenever self._policy is None, it will be re-calculated
        # based on current self.q_values when calling policy().
        self._policy = None
def test_make_maze1():
    env = make_env('MazeWorld1-v0')
    assert is_unwrappable_to(env, MazeWorld)
    walls, rews = get_maps(MAP1)
    maze_env = unwrap_env(env, MazeWorld)
    assert np.all(maze_env.map_walls == walls)
    assert np.all(maze_env.map_rewards == rews)
Example #3
    def __init__(self,
                 env: gym.Env,
                 expert_trajs: List[Dict[str, list]],
                 rl_alg_factory: Callable[[gym.Env], BaseRLAlgorithm],
                 metrics: List[BaseMetric] = [],
                 config: Union[dict, None] = None):
        """

        Parameters
        ----------
        env: gym.Env
            The gym environment to be trained on.
            Needs to be wrapped in a RewardWrapper to not leak the true reward function.
        expert_trajs: List[dict]
            A list of trajectories.
            Each trajectory is a dictionary with keys
            ['states', 'actions', 'rewards', 'true_rewards', 'features'].
            The values of each dictionary are lists.
            See :func:`irl_benchmark.irl.collect.collect_trajs`.
        rl_alg_factory: Callable[[gym.Env], BaseRLAlgorithm]
            A function which returns a new RL algorithm when called.
        metrics: List[BaseMetric]
            A list of metrics to be evaluated during training. May be empty.
        config: dict
            A dictionary containing algorithm-specific parameters.
        """

        assert is_unwrappable_to(env, RewardWrapper)
        self.env = env
        self.expert_trajs = expert_trajs
        self.rl_alg_factory = rl_alg_factory
        self.metrics = metrics
        # one result list per metric (a comprehension avoids aliasing one shared list):
        self.metric_results = [[] for _ in metrics]
        self.config = preprocess_config(self, IRL_CONFIG_DOMAINS, config)
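A quick standalone illustration of why the per-metric result lists above are built with a comprehension rather than [[]] * len(metrics) (plain Python, no irl_benchmark imports needed):

# [[]] * 3 repeats three references to one and the same list object:
shared = [[]] * 3
shared[0].append('result')
assert shared == [['result'], ['result'], ['result']]  # all entries alias one list

# a comprehension builds an independent list per metric:
independent = [[] for _ in range(3)]
independent[0].append('result')
assert independent == [['result'], [], []]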
Example #4
    def __init__(self, env: gym.Env, expert_trajs: List[Dict[str, list]],
                 rl_alg_factory: Callable[[gym.Env], BaseRLAlgorithm],
                 metrics: List[BaseMetric], config: dict):
        """See :class:`irl_benchmark.irl.algorithms.base_algorithm.BaseIRLAlgorithm`."""

        assert is_unwrappable_to(env, DiscreteEnv)
        assert is_unwrappable_to(env, FeatureWrapper)

        super(MaxEntIRL, self).__init__(env, expert_trajs, rl_alg_factory,
                                        metrics, config)
        # get transition matrix (with absorbing state)
        self.transition_matrix = get_transition_matrix(self.env)
        self.n_states, self.n_actions, _ = self.transition_matrix.shape

        # get map of features for all states:
        feature_wrapper = unwrap_env(env, FeatureWrapper)
        self.feat_map = feature_wrapper.feature_array()
    def train(self, no_episodes: int):
        """ Train the agent

        Parameters
        ----------
        no_episodes: int
            Not used by this algorithm, since it assumes known transition dynamics.
        """
        assert (is_unwrappable_to(self.env, gym.envs.toy_text.discrete.DiscreteEnv)
                or is_unwrappable_to(self.env, MazeWorld))
        # extract reward function from env (using wrapped reward function if available):
        self.rewards = self.model_wrapper.get_reward_array()

        # initialize state values:
        state_values = np.zeros([self.no_states])

        while True:  # stops when state values converge
            # remember old values for error computation
            old_state_values = state_values.copy()
            # calculate Q-values:
            q_values = self.rewards + \
                       self.config['gamma'] * self.transitions.dot(state_values)
            # calculate state values either with maximum or mellow maximum:
            if self.config['temperature'] is None:
                # using default maximum operator:
                state_values = self._argmax_state_values(q_values)
            else:
                # using softmax:
                state_values = self._softmax_state_values(q_values)

            # stopping condition:
            # check if state values converged (almost no change since last iteration):
            if np.allclose(state_values,
                           old_state_values,
                           atol=self.config['epsilon']):
                break

        # persist learned state values and Q-values:
        self.state_values = state_values
        self.q_values = q_values
        # flag to tell other methods that policy needs to be updated based on new values:
        self._policy = None
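The loop above is plain value iteration: Q = R + gamma * T.dot(V), then V is the (soft)max of Q over actions, repeated until the values stop changing. A minimal numpy sketch of the same hard-max update on a toy random MDP (toy arrays only; the config handling, wrappers and softmax branch of irl_benchmark are left out):

import numpy as np

n_states, n_actions = 5, 2
gamma, epsilon = 0.9, 1e-8

# toy model: transitions[s, a, s'] is a probability distribution over next states
rng = np.random.default_rng(0)
transitions = rng.random((n_states, n_actions, n_states))
transitions /= transitions.sum(axis=2, keepdims=True)
rewards = rng.random((n_states, n_actions))  # reward per (state, action) pair

state_values = np.zeros(n_states)
while True:
    old_state_values = state_values.copy()
    # Bellman backup: Q(s, a) = R(s, a) + gamma * sum_s' T(s, a, s') * V(s')
    q_values = rewards + gamma * transitions.dot(state_values)
    # hard maximum over actions (the temperature=None branch above)
    state_values = q_values.max(axis=1)
    if np.allclose(state_values, old_state_values, atol=epsilon):
        break

greedy_policy = q_values.argmax(axis=1)  # one greedy action index per state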
def test_is_unwrappable_to():
    assert is_unwrappable_to(make_env('FrozenLake-v0'), TimeLimit)
    assert is_unwrappable_to(make_env('FrozenLake-v0'), DiscreteEnv)
    assert is_unwrappable_to(feature_wrapper.make('FrozenLake-v0'),
                             FrozenLakeFeatureWrapper)
    assert is_unwrappable_to(feature_wrapper.make('FrozenLake8x8-v0'),
                             FrozenLakeFeatureWrapper)
    assert is_unwrappable_to(feature_wrapper.make('FrozenLake-v0'),
                             feature_wrapper.FeatureWrapper)
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, 'random')
    env = RewardWrapper(env, reward_function)
    assert is_unwrappable_to(env, RewardWrapper)
    assert is_unwrappable_to(env, feature_wrapper.FeatureWrapper)
    assert is_unwrappable_to(env, DiscreteEnv)
    assert is_unwrappable_to(env, gym.Env)
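All of these assertions rely on the standard gym convention that a wrapper stores the wrapped environment in its .env attribute. A minimal sketch of how helpers with this behaviour can be written on top of that convention; it is an illustration under that assumption, not necessarily how irl_benchmark implements is_unwrappable_to and unwrap_env:

import gym

def is_unwrappable_to(env: gym.Env, target_type: type) -> bool:
    # walk down the wrapper chain and report whether target_type occurs in it
    while True:
        if isinstance(env, target_type):
            return True
        if isinstance(env, gym.Wrapper):
            env = env.env  # gym wrappers keep the wrapped env in .env
        else:
            return False

def unwrap_env(env: gym.Env, target_type: type) -> gym.Env:
    # return the outermost layer of the requested type, walking from the outside in
    while not isinstance(env, target_type):
        if not isinstance(env, gym.Wrapper):
            raise ValueError('env is not wrapped in ' + target_type.__name__)
        env = env.env
    return env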
Example #7
def feature_count(env, trajs: List[Dict[str, list]],
                  gamma: float) -> np.ndarray:
    """Return empirical discounted feature counts of input trajectories.

    Parameters
    ----------
    env: gym.Env
        A gym environment, wrapped in a feature wrapper
    trajs: List[Dict[str, list]]
        A list of trajectories.
        Each trajectory is a dictionary with keys
        ['states', 'actions', 'rewards', 'true_rewards', 'features'].
        The values of each dictionary are lists.
        See :func:`irl_benchmark.irl.collect.collect_trajs`.
    gamma: float
        The discount factor. Must be in range [0., 1.].

    Returns
    -------
    np.ndarray
        A numpy array containing discounted feature counts. The shape
        is the same as the trajectories' feature shapes. One scalar
        feature count per feature.
    """
    assert is_unwrappable_to(env, FeatureWrapper)

    # Initialize feature count sum to zeros of correct shape:
    feature_dim = unwrap_env(env, FeatureWrapper).feature_dimensionality()
    # feature_dim is a 1-tuple,
    # extract the feature dimensionality as integer:
    assert len(feature_dim) == 1
    feature_dim = feature_dim[0]
    feature_count_sum = np.zeros(feature_dim)

    for traj in trajs:
        assert traj['features']  # empty lists are False in python

        # gammas is a vector containing [gamma^0, gamma^1, ..., gamma^(l-1)],
        # where l is the length of the trajectory:
        gammas = gamma**np.arange(len(traj['features']))
        traj_feature_count = np.sum(gammas.reshape(-1, 1) *
                                    np.array(traj['features']).reshape(
                                        (-1, feature_dim)),
                                    axis=0)
        # add trajectory's feature count:
        feature_count_sum += traj_feature_count
    # divide feature_count_sum by number of trajectories to normalize:
    result = feature_count_sum / len(trajs)
    return result
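The computation above is the empirical discounted feature expectation, (1/N) * sum over trajectories of sum_t gamma^t * phi(s_t). A numpy-only sketch with hand-made feature lists (no environment or wrapper required), mirroring the loop in feature_count:

import numpy as np

gamma = 0.9
# two toy trajectories with 3-dimensional feature vectors, one per time step
trajs = [
    {'features': [[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]},
    {'features': [[0., 1., 0.], [0., 1., 0.]]},
]

feature_dim = 3
feature_count_sum = np.zeros(feature_dim)
for traj in trajs:
    feats = np.asarray(traj['features'])      # shape (l, feature_dim)
    gammas = gamma ** np.arange(len(feats))   # [1, gamma, gamma^2, ...]
    feature_count_sum += (gammas[:, None] * feats).sum(axis=0)

mean_feature_count = feature_count_sum / len(trajs)  # averaged over trajectories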
Example #8
    def get_reward_array(self):
        env = unwrap_env(self.env, DiscreteEnv)

        # add 1 to account for the absorbing state
        # (reached whenever the episode has ended)
        n_states = env.observation_space.n + 1
        n_actions = env.action_space.n

        if is_unwrappable_to(self.env, RewardWrapper):
            # get the reward function:
            reward_wrapper = unwrap_env(self.env, RewardWrapper)
            reward_function = reward_wrapper.reward_function
        else:
            reward_function = None

        rewards = np.zeros([n_states, n_actions])

        # iterate over all "from" states:
        for state, transitions_given_state in env.P.items():
            # iterate over all actions:
            for action, outcomes in transitions_given_state.items():
                # iterate over all possible outcomes:
                for probability, next_state, reward, done in outcomes:
                    if reward_function is not None:
                        if done and state == next_state:
                            # no reward for staying in a terminal state
                            # (the episode is already over).
                            reward = 0
                        else:
                            rew_input = reward_wrapper.get_reward_input_for(
                                state, action, next_state)
                            reward = reward_function.reward(rew_input)
                    rewards[state, action] += reward * probability

        # reward of absorbing state is zero:
        rewards[-1, :] = 0.0

        return rewards
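The nested loop above marginalizes over stochastic outcomes: rewards[s, a] is the expectation sum_i p_i * r_i over the entries of env.P[s][a]. A tiny standalone sketch with a hand-written DiscreteEnv-style transition dict (toy numbers, no reward wrapper):

import numpy as np

# P[state][action] = list of (probability, next_state, reward, done),
# the same layout gym's DiscreteEnv uses
P = {
    0: {0: [(0.8, 1, 1.0, False), (0.2, 0, 0.0, False)],
        1: [(1.0, 2, 5.0, True)]},
    1: {0: [(1.0, 2, 5.0, True)],
        1: [(1.0, 0, 0.0, False)]},
    2: {0: [(1.0, 2, 0.0, True)],
        1: [(1.0, 2, 0.0, True)]},
}

n_states, n_actions = 3 + 1, 2  # +1 for the absorbing state, as above
rewards = np.zeros((n_states, n_actions))
for state, transitions_given_state in P.items():
    for action, outcomes in transitions_given_state.items():
        for probability, next_state, reward, done in outcomes:
            rewards[state, action] += probability * reward  # expected reward

rewards[-1, :] = 0.0  # the absorbing state yields no reward
assert rewards[0, 0] == 0.8 * 1.0 + 0.2 * 0.0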
Example #9
    def __init__(self, env: gym.Env, config: dict):
        """

        Parameters
        ----------
        env: gym.Env
            A DiscreteEnv environment
        config: dict
            Configuration of hyperparameters.
        """
        assert is_unwrappable_to(env, gym.envs.toy_text.discrete.DiscreteEnv)
        super(ValueIteration, self).__init__(env, config)
        self.no_states = env.observation_space.n + 1  # + 1 for absorbing state
        self.no_actions = env.action_space.n
        self.transitions = get_transition_matrix(env)
        # will be filled in beginning of training:
        self.rewards = None
        # will be filled during training:
        self.state_values = None
        self.q_values = None
        # whenever self._policy is None, it will be re-calculated
        # based on current self.q_values when calling policy().
        self._policy = None
Example #10
def case_make_wrapped(env_id):
    env = make_wrapped_env(env_id)
    assert not is_unwrappable_to(env, FeatureWrapper)
    assert not is_unwrappable_to(env, RewardWrapper)
    assert not is_unwrappable_to(env, BaseWorldModelWrapper)

    env = make_wrapped_env(env_id, with_feature_wrapper=True)
    assert is_unwrappable_to(env, FeatureWrapper)
    assert not is_unwrappable_to(env, RewardWrapper)
    assert not is_unwrappable_to(env, BaseWorldModelWrapper)

    env = make_wrapped_env(env_id,
                           with_feature_wrapper=True,
                           with_model_wrapper=True)
    assert is_unwrappable_to(env, FeatureWrapper)
    assert not is_unwrappable_to(env, RewardWrapper)
    assert is_unwrappable_to(env, BaseWorldModelWrapper)

    def rew_fun_fact(env):
        return FeatureBasedRewardFunction(env, 'random')

    env = make_wrapped_env(env_id,
                           with_feature_wrapper=True,
                           reward_function_factory=rew_fun_fact,
                           with_model_wrapper=False)
    assert is_unwrappable_to(env, FeatureWrapper)
    assert is_unwrappable_to(env, RewardWrapper)
    assert not is_unwrappable_to(env, BaseWorldModelWrapper)

    env = make_wrapped_env(env_id,
                           with_feature_wrapper=True,
                           reward_function_factory=rew_fun_fact,
                           with_model_wrapper=True)
    assert is_unwrappable_to(env, FeatureWrapper)
    assert is_unwrappable_to(env, RewardWrapper)
    assert is_unwrappable_to(env, BaseWorldModelWrapper)
Example #11
def test_make_frozen8():
    env = make_env('FrozenLake8x8-v0')
    assert is_unwrappable_to(env, FrozenLakeEnv)
Example #12
    def __init__(self, env):
        assert is_unwrappable_to(env, DiscreteEnv)
        super(DiscreteEnvModelWrapper, self).__init__(env)
Example #13
    def _get_model_arrays(self, return_transitions=True, return_rewards=True):

        if return_rewards:
            if is_unwrappable_to(self.env, RewardWrapper):
                reward_wrapper = unwrap_env(self.env, RewardWrapper)
            else:
                reward_wrapper = None

        assert return_transitions or return_rewards

        # +1 for absorbing state:
        n_states = self.n_states() + 1
        absorbing_s = n_states - 1
        num_rewards = n_actions = self.maze_env.action_space.n
        paths = self.maze_env.paths

        if return_transitions:
            coords_trans_state = []
            coords_trans_action = []
            coords_trans_next_state = []
            trans_data = []

            def add_transition(s, a, sn, p):
                coords_trans_state.append(s)
                coords_trans_action.append(a)
                coords_trans_next_state.append(sn)
                trans_data.append(p)

        if return_rewards:
            rewards = np.zeros((n_states, n_actions))

        for s in tqdm(range(n_states - 1)):
            for a in range(n_actions):
                state = self.index_to_state(s)

                if return_rewards and reward_wrapper is not None:
                    rew_input = reward_wrapper.get_reward_input_for(
                        state, a, None)
                    wrapped_reward = reward_wrapper.reward_function.reward(
                        rew_input).item()

                if np.sum(state[num_rewards:]) == 0:
                    if return_transitions:
                        add_transition(s, a, absorbing_s, 1.)
                    if return_rewards:
                        if reward_wrapper is None:
                            rewards[s, a] = 0
                        else:
                            rewards[s, a] = wrapped_reward
                    continue

                pos_index = int(np.where(state[:num_rewards] > 0)[0][0])
                path = paths[pos_index][a]

                if len(path) == 1 or pos_index == a:
                    assert pos_index == a
                    if return_transitions:
                        add_transition(s, a, s, 1. - RANDOM_QUIT_CHANCE)
                        add_transition(s, a, absorbing_s, RANDOM_QUIT_CHANCE)
                    if return_rewards:
                        if reward_wrapper is None:
                            rewards[s, a] = REWARD_MOVE
                            if state[num_rewards + a] != 0:
                                rews_where = self.maze_env.rews_where
                                rewards[s, a] += float(
                                    self.maze_env.map_rewards[rews_where[0][a], \
                                    rews_where[1][a]]) * (1 - RANDOM_QUIT_CHANCE)
                        else:
                            rewards[s, a] = wrapped_reward
                    continue

                success_prob = (1 - RANDOM_QUIT_CHANCE)**(len(path) - 1)
                if return_transitions:
                    new_state = get_next_state(state, a, num_rewards)
                    new_s = self.state_to_index(new_state)
                    add_transition(s, a, new_s, success_prob)
                    add_transition(s, a, absorbing_s, 1. - success_prob)

                if return_rewards:
                    if reward_wrapper is None:
                        if state[num_rewards + a] == 0:
                            # the reward at this field has already been collected:
                            rew_value = 0
                        else:
                            rews_where = self.maze_env.rews_where
                            rew_value = float(
                                self.maze_env.map_rewards[rews_where[0][a],
                                                          rews_where[1][a]])

                        possible_distances = np.arange(1, len(path))
                        prob_getting_to_distance = (
                            1 - RANDOM_QUIT_CHANCE)**possible_distances
                        prob_stopping_at_distance = np.ones_like(
                            possible_distances, dtype=np.float32)
                        prob_stopping_at_distance[:-1] = RANDOM_QUIT_CHANCE
                        expected_walking_distance = np.sum(
                            possible_distances * prob_getting_to_distance *
                            prob_stopping_at_distance)
                        weighted_reward = expected_walking_distance * REWARD_MOVE + success_prob * rew_value

                        rewards[s, a] = weighted_reward
                    else:
                        rewards[s, a] = wrapped_reward

        for a in range(n_actions):
            if return_transitions:
                add_transition(absorbing_s, a, absorbing_s, 1.)
            if return_rewards:
                rewards[absorbing_s, a] = 0

        if return_transitions:
            coords = np.array([
                coords_trans_state, coords_trans_action,
                coords_trans_next_state
            ])
            transitions = sparse.COO(coords, trans_data)

        if return_transitions:
            if return_rewards:
                return transitions, rewards
            return transitions
        return rewards
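The reward branch above weights the per-move reward REWARD_MOVE by the expected number of steps actually walked before a random quit, and the reward at the target by the probability of completing the whole path. A small self-contained numpy check of that expectation (RANDOM_QUIT_CHANCE and the path length below are stand-in toy values, not the package constants):

import numpy as np

RANDOM_QUIT_CHANCE = 0.1   # stand-in value
path_length = 4            # completing the path takes path_length - 1 moves

# expected walking distance, computed as in _get_model_arrays above:
possible_distances = np.arange(1, path_length)
prob_getting_to_distance = (1 - RANDOM_QUIT_CHANCE) ** possible_distances
prob_stopping_at_distance = np.ones_like(possible_distances, dtype=np.float32)
prob_stopping_at_distance[:-1] = RANDOM_QUIT_CHANCE
expected_walking_distance = np.sum(
    possible_distances * prob_getting_to_distance * prob_stopping_at_distance)

# Monte Carlo check: before every move the walk quits with prob. RANDOM_QUIT_CHANCE
rng = np.random.default_rng(0)
quit_draws = rng.random((200_000, path_length - 1)) < RANDOM_QUIT_CHANCE
steps = np.where(quit_draws.any(axis=1),     # steps walked = index of the first quit,
                 quit_draws.argmax(axis=1),  # or the full path if no quit happened
                 path_length - 1)
assert abs(steps.mean() - expected_walking_distance) < 0.02

success_prob = (1 - RANDOM_QUIT_CHANCE) ** (path_length - 1)  # full path completed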
Example #14
    def __init__(self, env):
        assert is_unwrappable_to(env, MazeWorld)
        super(MazeModelWrapper, self).__init__(env)
        self.maze_env = unwrap_env(self.env, MazeWorld)