Example #1
def test_maze1_features():
    env = make_wrapped_env('MazeWorld1-v0', with_feature_wrapper=True)
    maze_env = unwrap_env(env, MazeWorld)
    feature_wrapper = unwrap_env(env, FeatureWrapper)
    d = feature_wrapper.feature_dimensionality()
    ranges = feature_wrapper.feature_range()
    print(ranges)
    for i in range(0, 10240, 13):  # check a subsample of the 10 * 2**10 = 10240 states
        for a in range(10):
            feature = feature_wrapper.features(maze_env.index_to_state(i), a,
                                               None)
            assert feature.shape == d
            assert np.all(feature >= ranges[0])
            assert np.all(feature <= ranges[1])
Example #2
    def __init__(self, env: gym.Env, expert_trajs: List[Dict[str, list]],
                 rl_alg_factory: Callable[[gym.Env], BaseRLAlgorithm],
                 metrics: List[BaseMetric], config: dict):
        """See :class:`irl_benchmark.irl.algorithms.base_algorithm.BaseIRLAlgorithm`."""

        super(MaxEntIRL, self).__init__(env, expert_trajs, rl_alg_factory,
                                        metrics, config)
        # get transition matrix (with absorbing state)
        self.transition_matrix = unwrap_env(
            env, BaseWorldModelWrapper).get_transition_array()
        self.n_states, self.n_actions, _ = self.transition_matrix.shape

        # get map of features for all states:
        feature_wrapper = unwrap_env(env, FeatureWrapper)
        self.feat_map = feature_wrapper.feature_array()
Example #3
    def get_transition_array(self):
        env = unwrap_env(self.env, DiscreteEnv)

        # adding +1 to account for absorbing state
        # (reached whenever game ended)
        n_states = env.observation_space.n + 1
        n_actions = env.action_space.n

        transitions = np.zeros([n_states, n_actions, n_states])

        # iterate over all "from" states:
        for state, transitions_given_state in env.P.items():
            # iterate over all actions:
            for action, outcomes in transitions_given_state.items():
                # iterate over all possible outcomes:
                for probability, next_state, _, done in outcomes:
                    # add transition probability T(s, a, s')
                    transitions[state, action, next_state] += probability
                    if done:
                        # outcome was marked as ending the game.
                        # if game is done and state == next_state, map to absorbing state instead
                        if state == next_state:
                            transitions[state, action, next_state] = 0
                        # map next state to absorbing state
                        # make sure that next state wasn't mapped to any other state yet
                        assert np.sum(transitions[next_state, :, :-1]) == 0
                        transitions[next_state, :, -1] = 1.0

        # specify transition probabilities for absorbing state:
        # returning to itself for all actions.
        transitions[-1, :, -1] = 1.0

        return transitions
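The array returned above is row-stochastic: for every (state, action) pair the probabilities over next states sum to one, with the extra last index playing the role of the absorbing state. A minimal standalone check of that property (plain numpy on a hypothetical toy array, not part of the wrapper) could look like this:

import numpy as np

# Toy transition array: 2 real states + 1 absorbing state, 1 action.
# State 0 moves to state 1; state 1 ends the game and is mapped to the
# absorbing state; the absorbing state loops back onto itself.
transitions = np.zeros([3, 1, 3])
transitions[0, 0, 1] = 1.0
transitions[1, 0, -1] = 1.0
transitions[-1, 0, -1] = 1.0

# Every (state, action) slice must be a probability distribution:
assert np.allclose(transitions.sum(axis=2), 1.0)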
Example #4
def test_make_maze1():
    env = make_env('MazeWorld1-v0')
    assert is_unwrappable_to(env, MazeWorld)
    walls, rews = get_maps(MAP1)
    maze_env = unwrap_env(env, MazeWorld)
    assert np.all(maze_env.map_walls == walls)
    assert np.all(maze_env.map_rewards == rews)
Example #5
    def __init__(self, env: gym.Env, config: Union[None, dict] = None):
        """

        Parameters
        ----------
        env: gym.Env
            A DiscreteEnv environment
        config: dict
            Configuration of hyperparameters.
        """
        assert is_unwrappable_to(env, BaseWorldModelWrapper)
        super(ValueIteration, self).__init__(env, config)

        self.model_wrapper = unwrap_env(env, BaseWorldModelWrapper)

        # +1 for absorbing state
        self.no_states = self.model_wrapper.n_states() + 1
        self.no_actions = env.action_space.n
        self.transitions = self.model_wrapper.get_transition_array()
        # will be filled in beginning of training:
        self.rewards = None
        # will be filled during training:
        self.state_values = None
        self.q_values = None
        # whenever self._policy is None, it will be re-calculated
        # based on current self.q_values when calling policy().
        self._policy = None
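For context, the arrays prepared here have the shapes needed for plain tabular value iteration. The following standalone sketch (with assumed toy arrays and an assumed discount factor, not the class's actual training loop) shows how a single synchronous backup would use them:

import numpy as np

def value_iteration_sweep(transitions, rewards, state_values, gamma=0.9):
    """One synchronous backup: Q(s, a) = R(s, a) + gamma * sum_s' T(s, a, s') V(s')."""
    q_values = rewards + gamma * transitions.dot(state_values)
    return q_values.max(axis=1), q_values

# Toy problem: 2 states + absorbing state, 2 actions; everything falls
# into the absorbing state, which yields no reward.
transitions = np.zeros((3, 2, 3))
transitions[:, :, -1] = 1.0
rewards = np.array([[1.0, 0.5], [0.0, 2.0], [0.0, 0.0]])
state_values = np.zeros(3)

state_values, q_values = value_iteration_sweep(transitions, rewards, state_values)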
Example #6
    def features(self, current_state: np.ndarray, action: int,
                 next_state: None) -> np.ndarray:
        """Return features to be saved in step method's info dictionary.

        There are four feature variables: expected walking distance,
        probability of reaching a small reward field, probability of reaching
        a medium reward field, probability of reaching a large reward field.
        Only one of the last three values will be non-zero."""

        maze_env = unwrap_env(self.env, MazeWorld)

        # can only calculate features for a single state-action pair.
        assert len(current_state.shape) == 1

        # special case: not at any position:
        if np.sum(current_state[:maze_env.num_rewards]) == 0:
            return np.array([1, 0, 0, 0])
        path_len = maze_env.get_path_len(current_state, action)

        # special case: all rewards collected:
        if np.sum(current_state[maze_env.num_rewards:]) == 0:
            return np.zeros(4)

        assert path_len > 0
        # special case: walking to current position
        if path_len == 1:
            # assert that agent is walking to its current position:
            assert current_state[action] == 1.0
            expected_walking_distance = 1.0
        else:
            # calculate expected walking distance feature:
            possible_distances = np.arange(1, path_len)
            prob_getting_to_distance = (1 -
                                        RANDOM_QUIT_CHANCE)**possible_distances
            prob_stopping_at_distance = np.ones_like(possible_distances,
                                                     dtype=np.float32)
            prob_stopping_at_distance[:-1] = RANDOM_QUIT_CHANCE
            expected_walking_distance = np.sum(possible_distances *
                                               prob_getting_to_distance *
                                               prob_stopping_at_distance)

        # coin collection probabilities:
        ccps = np.zeros(3)
        rew_value = maze_env.get_rew_value(current_state, action)
        if rew_value != 0.:
            assert rew_value in [REWARD_SMALL, REWARD_MEDIUM, REWARD_LARGE]
            rew_value_index = [REWARD_SMALL, REWARD_MEDIUM,
                               REWARD_LARGE].index(rew_value)
            if path_len == 1:
                ccps[rew_value_index] = (1 - RANDOM_QUIT_CHANCE)
            else:
                ccps[rew_value_index] = (1 - RANDOM_QUIT_CHANCE)**(path_len -
                                                                   1)

        return np.concatenate((np.array([expected_walking_distance]), ccps))
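The expected walking distance computed above is the mean of a geometric-style stopping process: each of the path_len - 1 moves succeeds with probability 1 - RANDOM_QUIT_CHANCE, and walking stops at the first failure or at the end of the path. A standalone sanity check (assumed values for RANDOM_QUIT_CHANCE and path_len, not part of the wrapper) compares the closed form against a Monte Carlo simulation:

import numpy as np

RANDOM_QUIT_CHANCE = 0.2  # assumed value, for illustration only
path_len = 5
rng = np.random.default_rng(0)

# Closed form as used in features():
possible_distances = np.arange(1, path_len)
prob_getting_to_distance = (1 - RANDOM_QUIT_CHANCE)**possible_distances
prob_stopping_at_distance = np.ones_like(possible_distances, dtype=np.float64)
prob_stopping_at_distance[:-1] = RANDOM_QUIT_CHANCE
expected = np.sum(possible_distances * prob_getting_to_distance *
                  prob_stopping_at_distance)

# Monte Carlo: walk up to path_len - 1 steps, quitting each step with
# probability RANDOM_QUIT_CHANCE.
samples = []
for _ in range(100000):
    distance = 0
    for _ in range(path_len - 1):
        if rng.random() < RANDOM_QUIT_CHANCE:
            break
        distance += 1
    samples.append(distance)

print(expected, np.mean(samples))  # the two values should roughly agree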
Example #7
def test_frozen_features():
    env = make_wrapped_env('FrozenLake-v0', with_feature_wrapper=True)
    feature_wrapper = unwrap_env(env, FeatureWrapper)
    d = feature_wrapper.feature_dimensionality()
    ranges = feature_wrapper.feature_range()
    print(ranges)
    for i in range(16):
        feature = feature_wrapper.features(None, None, i)
        assert feature.shape == d
        assert np.all(feature >= ranges[0])
        assert np.all(feature <= ranges[1])
Example #8
    def train(self, no_irl_iterations: int,
              no_rl_episodes_per_irl_iteration: int,
              no_irl_episodes_per_irl_iteration: int):
        """

        """

        sa_visit_count, P0 = self.sa_visitations()

        # calculate feature expectations
        expert_feature_count = self.feature_count(self.expert_trajs, gamma=1.0)

        # initialize the parameters
        reward_function = FeatureBasedRewardFunction(self.env, 'random')
        theta = reward_function.parameters

        agent = self.rl_alg_factory(self.env)

        irl_iteration_counter = 0

        while irl_iteration_counter < no_irl_iterations:
            irl_iteration_counter += 1

            if self.config['verbose']:
                print('IRL ITERATION ' + str(irl_iteration_counter))

            reward_wrapper = unwrap_env(self.env, RewardWrapper)
            reward_wrapper.update_reward_parameters(theta)

            # compute policy
            agent.train(no_rl_episodes_per_irl_iteration)

            policy = agent.policy_array()
            state_values = agent.state_values
            q_values = agent.q_values

            # occupancy measure
            d = self.occupancy_measure(policy=policy,
                                       initial_state_dist=P0)[:-1]

            # log-likelihood gradient
            grad = -(expert_feature_count - np.dot(self.feat_map.T, d))

            # gradient descent step
            theta -= self.config['lr'] * grad

            evaluation_input = {
                'irl_agent': agent,
                'irl_reward': reward_function
            }
            self.evaluate_metrics(evaluation_input)

        return theta
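The update above is plain gradient descent on the negative log-likelihood of the expert trajectories: the gradient is the difference between the expert's feature expectations and the feature expectations induced by the current policy's occupancy measure. With hypothetical toy arrays the step reduces to the following sketch:

import numpy as np

feat_map = np.random.rand(4, 3)          # one feature vector per state (hypothetical)
expert_feature_count = np.random.rand(3)
d = np.random.rand(4)                    # occupancy measure of the current policy
theta = np.zeros(3)
lr = 0.1

grad = -(expert_feature_count - np.dot(feat_map.T, d))
theta -= lr * grad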
Example #9
    def get_reward_array(self):
        env = unwrap_env(self.env, DiscreteEnv)

        # adding +1 to account for absorbing state
        # (reached whenever game ended)
        n_states = env.observation_space.n + 1
        n_actions = env.action_space.n

        if is_unwrappable_to(self.env, RewardWrapper):
            # get the reward function:
            reward_wrapper = unwrap_env(self.env, RewardWrapper)
            reward_function = reward_wrapper.reward_function
        else:
            reward_function = None

        rewards = np.zeros([n_states, n_actions])

        # iterate over all "from" states:
        for state, transitions_given_state in env.P.items():
            # iterate over all actions:
            for action, outcomes in transitions_given_state.items():
                # iterate over all possible outcomes:
                for probability, next_state, reward, done in outcomes:
                    if reward_function is not None:
                        if done and state == next_state:
                            # don't output reward for reaching state if game is over
                            # and already in that state.
                            reward = 0
                        else:
                            rew_input = reward_wrapper.get_reward_input_for(
                                state, action, next_state)
                            reward = reward_function.reward(rew_input)
                    rewards[state, action] += reward * probability

        # reward of absorbing state is zero:
        rewards[-1, :] = 0.0

        return rewards
Example #10
def feature_count(env, trajs: List[Dict[str, list]],
                  gamma: float) -> np.ndarray:
    """Return empirical discounted feature counts of input trajectories.

    Parameters
    ----------
    env: gym.Env
        A gym environment, wrapped in a feature wrapper
    trajs: List[Dict[str, list]]
        A list of trajectories.
        Each trajectory is a dictionary with keys
        ['states', 'actions', 'rewards', 'true_rewards', 'features'].
        The values of each dictionary are lists.
        See :func:`irl_benchmark.irl.collect.collect_trajs`.
    gamma: float
        The discount factor. Must be in range [0., 1.].

    Returns
    -------
    np.ndarray
        A numpy array containing discounted feature counts. The shape
        is the same as the trajectories' feature shapes. One scalar
        feature count per feature.
    """
    assert is_unwrappable_to(env, FeatureWrapper)

    # Initialize feature count sum to zeros of correct shape:
    feature_dim = unwrap_env(env, FeatureWrapper).feature_dimensionality()
    # feature_dim is a 1-tuple,
    # extract the feature dimensionality as integer:
    assert len(feature_dim) == 1
    feature_dim = feature_dim[0]
    feature_count_sum = np.zeros(feature_dim)

    for traj in trajs:
        assert traj['features']  # empty lists are False in python

        # gammas is a vector containing [gamma^0, gamma^1, gamma^2, ... gamma^l]
        # where l is length of the trajectory:
        gammas = gamma**np.arange(len(traj['features']))
        traj_feature_count = np.sum(gammas.reshape(-1, 1) *
                                    np.array(traj['features']).reshape(
                                        (-1, feature_dim)),
                                    axis=0)
        # add trajectory's feature count:
        feature_count_sum += traj_feature_count
    # divide feature_count_sum by number of trajectories to normalize:
    result = feature_count_sum / len(trajs)
    return result
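As a standalone illustration of the arithmetic (hypothetical two-dimensional feature vectors, no environment or wrapper needed), the discounted count of a single trajectory is the gamma-weighted sum of its per-step features:

import numpy as np

gamma = 0.9
features = np.array([[1.0, 0.0],   # step 0
                     [0.0, 1.0],   # step 1
                     [1.0, 1.0]])  # step 2
gammas = gamma**np.arange(len(features))              # [1.0, 0.9, 0.81]
traj_feature_count = np.sum(gammas.reshape(-1, 1) * features, axis=0)
print(traj_feature_count)                             # [1.81, 1.71]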
Example #11
    def __init__(self, env: gym.Env, expert_trajs: List[Dict[str, list]],
                 rl_alg_factory: Callable[[gym.Env], BaseRLAlgorithm],
                 metrics: List[BaseMetric], config: dict):
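        """See :class:`irl_benchmark.irl.algorithms.base_algorithm.BaseIRLAlgorithm`."""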

        super(MaxCausalEntIRL, self).__init__(env, expert_trajs,
                                              rl_alg_factory, metrics, config)

        assert is_unwrappable_to(env, DiscreteEnv)
        assert is_unwrappable_to(env, FeatureWrapper)

        # get transition matrix (with absorbing state)
        self.transition_matrix = get_transition_matrix(self.env)
        self.n_states, self.n_actions, _ = self.transition_matrix.shape

        # get map of features for all states:
        feature_wrapper = unwrap_env(env, FeatureWrapper)
        self.feat_map = feature_wrapper.feature_array()
Example #12
    def train(self, no_irl_iterations: int,
              no_rl_episodes_per_irl_iteration: int,
              no_irl_episodes_per_irl_iteration: int):
        """Train algorithm. See abstract base class for parameter types."""

        # calculate feature expectations
        expert_feature_count = self.feature_count(self.expert_trajs, gamma=1.0)

        # start with an agent
        agent = self.rl_alg_factory(self.env)

        reward_wrapper = unwrap_env(self.env, RewardWrapper)
        theta = reward_wrapper.reward_function.parameters

        irl_iteration_counter = 0
        while irl_iteration_counter < no_irl_iterations:
            irl_iteration_counter += 1

            if self.config['verbose']:
                print('IRL ITERATION ' + str(irl_iteration_counter))
            # compute policy
            agent.train(no_rl_episodes_per_irl_iteration)

            policy = agent.policy_array()

            # compute state visitation frequencies, discard absorbing state
            svf = self.expected_svf(policy)[:-1]

            # compute gradients
            grad = (expert_feature_count - self.feat_map.T.dot(svf))

            # update params
            theta += self.config['lr'] * grad

            reward_wrapper.update_reward_parameters(theta)

            evaluation_input = {
                'irl_agent': agent,
                'irl_reward': reward_wrapper.reward_function
            }
            self.evaluate_metrics(evaluation_input)

        return theta
Example #13
def test_update_parameters_frozen_feature():
    def rew_fun_factory(env):
        return FeatureBasedRewardFunction(env, 'random')

    env = make_wrapped_env(
        'FrozenLake-v0',
        with_feature_wrapper=True,
        reward_function_factory=rew_fun_factory)

    reward_wrapper = unwrap_env(env, RewardWrapper)
    params = np.copy(reward_wrapper.reward_function.parameters)
    domain = reward_wrapper.reward_function.domain()
    rews = reward_wrapper.reward_function.reward(domain)
    reward_wrapper.update_reward_parameters(2 * params)
    rews2 = reward_wrapper.reward_function.reward(domain)
    assert np.all(np.isclose(2 * rews, rews2))
    reward_wrapper.update_reward_parameters(np.zeros_like(params))
    rews3 = reward_wrapper.reward_function.reward(domain)
    assert np.all(np.isclose(rews3, np.zeros_like(rews3)))
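The assertions hold because a feature-based reward function is linear in its parameters, so scaling the parameters scales every reward by the same factor. A standalone sketch of that property (hypothetical feature matrix, independent of the wrapper):

import numpy as np

features = np.random.rand(16, 16)   # one hypothetical feature vector per state
params = np.random.rand(16)

rews = features.dot(params)
assert np.allclose(2 * rews, features.dot(2 * params))
assert np.allclose(features.dot(np.zeros_like(params)), 0.0)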
Example #14
    def feature_array(self) -> np.ndarray:
        """ Get features for the entire domain as an array.
        Has to be overwritten in each feature wrapper.
        Wrappers for large environments will not implement this method.

        Returns
        -------
        np.ndarray
            The features for the entire domain as an array,
            one feature vector per state-action pair.
            Shape: (n_states, n_actions, feature_dim).
        """
        maze_world = unwrap_env(self.env, MazeWorld)
        num_rewards = maze_world.num_rewards
        n_states = num_rewards * 2**num_rewards
        feature_array = np.zeros((n_states, num_rewards, 4))
        for s in range(n_states):
            state = maze_world.index_to_state(s)
            for a in range(num_rewards):
                feature = self.features(state, a, None)
                feature_array[s, a, :] = feature
        return feature_array
Example #15
    def train(self, no_irl_iterations: int,
              no_rl_episodes_per_irl_iteration: int,
              no_irl_episodes_per_irl_iteration: int
              ) -> Tuple[BaseRewardFunction, BaseRLAlgorithm]:
        """Train the apprenticeship learning IRL algorithm.

        Parameters
        ----------
        no_irl_iterations: int
            The number of iterations the algorithm should be run for.
        no_rl_episodes_per_irl_iteration: int
            The number of episodes the RL algorithm is allowed to run in
            each iteration of the IRL algorithm.
        no_irl_episodes_per_irl_iteration: int
            The number of episodes permitted to be run in each iteration
            to update the current reward estimate (e.g. to estimate state frequencies
            of the currently optimal policy).

        Returns
        -------
        Tuple[BaseRewardFunction, BaseRLAlgorithm]
            The estimated reward function and an RL agent trained for this estimate.
        """

        # Initialize training with a random agent.
        agent = RandomAgent(self.env)

        irl_iteration_counter = 0
        while irl_iteration_counter < no_irl_iterations:
            irl_iteration_counter += 1

            if self.config['verbose']:
                print('IRL ITERATION ' + str(irl_iteration_counter))

            # Estimate feature count of current agent.
            trajs = collect_trajs(
                self.env,
                agent,
                no_trajectories=no_irl_episodes_per_irl_iteration)
            current_feature_count = self.feature_count(
                trajs, gamma=self.config['gamma'])

            print('CURRENT FEATURE COUNT:')
            print(current_feature_count)

            # add new feature count to list of feature counts
            self.feature_counts.append(current_feature_count)
            # for SVM mode:
            self.labels.append(-1.)

            # convert to numpy array:
            feature_counts = np.array(self.feature_counts)
            labels = np.array(self.labels)

            # update reward coefficients based on mode specified in config:
            if self.config['mode'] == 'projection':
                # projection mode:
                if irl_iteration_counter == 1:
                    # initialize feature_count_bar in first iteration
                    # set to first non-expert feature count:
                    feature_count_bar = feature_counts[1]
                else:
                    # not first iteration.
                    # calculate line through last feature_count_bar and
                    # last non-expert feature count:
                    line = feature_counts[-1] - feature_count_bar
                    # new feature_count_bar is orthogonal projection of
                    # expert's feature count onto the line:
                    feature_count_bar += np.dot(
                        line, feature_counts[0] - feature_count_bar) / np.dot(
                            line, line) * line
                reward_coefficients = feature_counts[0] - feature_count_bar
                # compute distance as L2 norm of reward coefficients (t^(i) in paper):
                distance = np.linalg.norm(reward_coefficients, ord=2)

            elif self.config['mode'] == 'svm':
                # svm mode:
                # create quadratic programming problem definition:
                weights = cvx.Variable(feature_counts.shape[1])
                bias = cvx.Variable()
                objective = cvx.Minimize(cvx.norm(weights, 2))
                constraints = [
                    cvx.multiply(labels,
                                 (feature_counts * weights + bias)) >= 1
                ]
                problem = cvx.Problem(objective, constraints)
                # solve quadratic program:
                problem.solve()

                if weights.value is None:
                    # TODO: we need to handle empty solution better.
                    raise RuntimeError(
                        'Empty solution set for linearly separable SVM.')

                if self.config['verbose']:
                    # print support vectors
                    # (which earlier iterations were relevant for the current result?)
                    svm_classifications = feature_counts.dot(
                        weights.value) + bias.value
                    support_vectors = np.where(
                        np.isclose(np.abs(svm_classifications), 1))[0]
                    print('The support vectors are from iterations number ' +
                          str(support_vectors))

                reward_coefficients = weights.value
                distance = 2 / problem.value

            else:
                raise NotImplementedError()

            if self.config['verbose']:
                print('Distance: ' + str(distance))

            self.distances.append(distance)

            print(reward_coefficients)
            # update reward function
            reward_wrapper = unwrap_env(self.env, RewardWrapper)
            reward_wrapper.update_reward_parameters(reward_coefficients)

            # check stopping criterion:
            if distance <= self.config['epsilon']:
                if self.config['verbose']:
                    print("Feature counts matched within " +
                          str(self.config['epsilon']) + ".")
                break

            # create new RL-agent
            agent = self.rl_alg_factory(self.env)
            # train agent (with new reward function)
            agent.train(no_rl_episodes_per_irl_iteration)

            evaluation_input = {
                'irl_agent': agent,
                'irl_reward': reward_wrapper.reward_function
            }
            self.evaluate_metrics(evaluation_input)

        return reward_wrapper.reward_function, agent
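The projection-mode update in the loop above is the orthogonal projection of the expert's feature count onto the line through the previous feature_count_bar and the latest learner feature count. A small standalone check with hypothetical two-dimensional feature counts makes the geometry concrete:

import numpy as np

expert = np.array([1.0, 1.0])              # feature_counts[0] above
feature_count_bar = np.array([0.0, 0.0])   # previous iteration's bar
latest = np.array([2.0, 0.0])              # feature_counts[-1] above

line = latest - feature_count_bar
feature_count_bar = feature_count_bar + np.dot(
    line, expert - feature_count_bar) / np.dot(line, line) * line

reward_coefficients = expert - feature_count_bar
distance = np.linalg.norm(reward_coefficients, ord=2)
print(feature_count_bar, distance)         # [1. 0.] 1.0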
Example #16
def test_unwrap():
    env = make_env('FrozenLake-v0')
    assert env.env is unwrap_env(env, DiscreteEnv)

    # No unwrapping needed:
    assert env is unwrap_env(env, gym.Env)

    # Unwrap all the way:
    assert env.env is unwrap_env(env)

    env = FrozenLakeFeatureWrapper(env)
    assert env.env.env is unwrap_env(env, DiscreteEnv)

    # No unwrapping needed:
    assert env is unwrap_env(env, FrozenLakeFeatureWrapper)

    # Unwrap all the way:
    assert env.env.env is unwrap_env(env)

    # check types:
    assert isinstance(unwrap_env(env, DiscreteEnv), DiscreteEnv)
    assert isinstance(unwrap_env(env, feature_wrapper.FeatureWrapper),
                      feature_wrapper.FeatureWrapper)
    assert isinstance(unwrap_env(env, FrozenLakeFeatureWrapper),
                      FrozenLakeFeatureWrapper)
    assert isinstance(unwrap_env(env, FrozenLakeFeatureWrapper),
                      feature_wrapper.FeatureWrapper)
    assert isinstance(unwrap_env(env), gym.Env)
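The behaviour these assertions rely on can be sketched as a small helper that repeatedly follows the .env attribute until an instance of the requested wrapper class is found (a simplified sketch, not the library's actual implementation):

import gym

def unwrap_env_sketch(env: gym.Env, until_class=None) -> gym.Env:
    """Unwrap env until an instance of until_class is found.

    With until_class=None, unwrap all the way to the innermost environment."""
    if until_class is None:
        while hasattr(env, 'env'):
            env = env.env
        return env
    while not isinstance(env, until_class) and hasattr(env, 'env'):
        env = env.env
    if not isinstance(env, until_class):
        raise ValueError('Could not unwrap env to ' + str(until_class))
    return env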
Example #17
    def __init__(self, env):
        assert is_unwrappable_to(env, MazeWorld)
        super(MazeModelWrapper, self).__init__(env)
        self.maze_env = unwrap_env(self.env, MazeWorld)
Example #18
    def _get_model_arrays(self, return_transitions=True, return_rewards=True):
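        """Build the maze's model arrays: a sparse transition array and/or a
        dense reward array, both including an extra absorbing state."""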

        if return_rewards:
            if is_unwrappable_to(self.env, RewardWrapper):
                reward_wrapper = unwrap_env(self.env, RewardWrapper)
            else:
                reward_wrapper = None

        assert return_transitions or return_rewards

        # +1 for absorbing state:
        n_states = self.n_states() + 1
        absorbing_s = n_states - 1
        num_rewards = n_actions = self.maze_env.action_space.n
        paths = self.maze_env.paths

        if return_transitions:
            coords_trans_state = []
            coords_trans_action = []
            coords_trans_next_state = []
            trans_data = []

            def add_transition(s, a, sn, p):
                coords_trans_state.append(s)
                coords_trans_action.append(a)
                coords_trans_next_state.append(sn)
                trans_data.append(p)

        if return_rewards:
            rewards = np.zeros((n_states, n_actions))

        for s in tqdm(range(n_states - 1)):
            for a in range(n_actions):
                state = self.index_to_state(s)

                if return_rewards and reward_wrapper is not None:
                    rew_input = reward_wrapper.get_reward_input_for(
                        state, a, None)
                    wrapped_reward = reward_wrapper.reward_function.reward(
                        rew_input).item()

                if np.sum(state[num_rewards:]) == 0:
                    if return_transitions:
                        add_transition(s, a, absorbing_s, 1.)
                    if return_rewards:
                        if reward_wrapper is None:
                            rewards[s, a] = 0
                        else:
                            rewards[s, a] = wrapped_reward
                    continue

                pos_index = int(np.where(state[:num_rewards] > 0)[0][0])
                path = paths[pos_index][a]

                if len(path) == 1 or pos_index == a:
                    assert pos_index == a
                    if return_transitions:
                        add_transition(s, a, s, 1. - RANDOM_QUIT_CHANCE)
                        add_transition(s, a, absorbing_s, RANDOM_QUIT_CHANCE)
                    if return_rewards:
                        if reward_wrapper is None:
                            rewards[s, a] = REWARD_MOVE
                            if state[num_rewards + a] != 0:
                                rews_where = self.maze_env.rews_where
                                rewards[s, a] += float(
                                    self.maze_env.map_rewards[rews_where[0][a], \
                                    rews_where[1][a]]) * (1 - RANDOM_QUIT_CHANCE)
                        else:
                            rewards[s, a] = wrapped_reward
                    continue

                success_prob = (1 - RANDOM_QUIT_CHANCE)**(len(path) - 1)
                if return_transitions:
                    new_state = get_next_state(state, a, num_rewards)
                    new_s = self.state_to_index(new_state)
                    add_transition(s, a, new_s, success_prob)
                    add_transition(s, a, absorbing_s, 1. - success_prob)

                if return_rewards:
                    if reward_wrapper is None:
                        if state[num_rewards + a] == 0:
                            # if reward is already collected at this field:
                            rew_value = 0
                        else:
                            rews_where = self.maze_env.rews_where
                            rew_value = float(
                                self.maze_env.map_rewards[rews_where[0][a],
                                                          rews_where[1][a]])

                        possible_distances = np.arange(1, len(path))
                        prob_getting_to_distance = (
                            1 - RANDOM_QUIT_CHANCE)**possible_distances
                        prob_stopping_at_distance = np.ones_like(
                            possible_distances, dtype=np.float32)
                        prob_stopping_at_distance[:-1] = RANDOM_QUIT_CHANCE
                        expected_walking_distance = np.sum(
                            possible_distances * prob_getting_to_distance *
                            prob_stopping_at_distance)
                        weighted_reward = expected_walking_distance * REWARD_MOVE + success_prob * rew_value

                        rewards[s, a] = weighted_reward
                    else:
                        rewards[s, a] = wrapped_reward

        for a in range(n_actions):
            if return_transitions:
                add_transition(absorbing_s, a, absorbing_s, 1.)
            if return_rewards:
                rewards[absorbing_s, a] = 0

        if return_transitions:
            coords = np.array([
                coords_trans_state, coords_trans_action,
                coords_trans_next_state
            ])
            transitions = sparse.COO(coords, trans_data)

        if return_transitions:
            if return_rewards:
                return transitions, rewards
            return transitions
        return rewards