Example #1
    def advance(self, state: MdpState, t: int, a: Action,
                agent: Agent) -> Tuple[MdpState, Reward]:
        """
        Advance from the current state given an action, based on the current state's model probability distribution.

        :param state: State to advance.
        :param t: Current time step.
        :param a: Action.
        :param agent: Agent.
        :return: 2-tuple of next state and next reward.
        """

        # get next-state / reward tuples
        s_prime_rewards = [
            (s_prime, reward)
            for s_prime in self.p_S_prime_R_given_S_A[state][a]
            for reward in self.p_S_prime_R_given_S_A[state][a][s_prime]
            if self.p_S_prime_R_given_S_A[state][a][s_prime][reward] > 0.0
        ]

        # get probability of each tuple
        probs = np.array([
            self.p_S_prime_R_given_S_A[state][a][s_prime][reward]
            for s_prime, reward in s_prime_rewards
        ])

        # sample next state and reward
        self.state, next_reward = sample_list_item(
            x=s_prime_rewards, probs=probs, random_state=self.random_state)

        return self.state, next_reward
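
The dictionary read by `advance` is nested four levels deep: state, action, next state, and then reward, with the innermost value being the transition probability. The sketch below is a hypothetical illustration (plain strings and floats stand in for the library's `MdpState` and `Reward` objects) of how that structure flattens into (next state, reward) pairs and a matching probability array before a single pair is sampled.

import numpy as np
from numpy.random import RandomState

# hypothetical transition distribution:  P(s', r | s, a)
p_S_prime_R_given_S_A = {
    's0': {
        'a0': {
            's1': {1.0: 0.7},   # P(s'=s1, r=1.0 | s=s0, a=a0) = 0.7
            's2': {0.0: 0.3},   # P(s'=s2, r=0.0 | s=s0, a=a0) = 0.3
        }
    }
}

state, a = 's0', 'a0'

# flatten the nested distribution into (next state, reward) pairs with nonzero mass
s_prime_rewards = [
    (s_prime, reward)
    for s_prime in p_S_prime_R_given_S_A[state][a]
    for reward in p_S_prime_R_given_S_A[state][a][s_prime]
    if p_S_prime_R_given_S_A[state][a][s_prime][reward] > 0.0
]
probs = np.array([
    p_S_prime_R_given_S_A[state][a][s_prime][reward]
    for s_prime, reward in s_prime_rewards
])

# sample one (next state, reward) pair in proportion to its probability
rng = RandomState(12345)
next_state, next_reward = s_prime_rewards[rng.choice(len(s_prime_rewards), p=probs)]
print(next_state, next_reward)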
Example #2
    def sample_next_state_and_reward(
            self, state: MdpState, action: Action,
            random_state: RandomState) -> Tuple[MdpState, float]:
        """
        Sample the environment model.

        :param state: State.
        :param action: Action.
        :param random_state: Random state.
        :return: 2-tuple of next state and reward.
        """

        # sample next state
        next_state_count = self.state_action_next_state_count[state][action]
        next_states = list(next_state_count.keys())
        total_count = sum(next_state_count.values())

        probs = np.array([
            next_state_count[next_state] / total_count
            for next_state in next_states
        ])

        next_state = sample_list_item(next_states, probs, random_state)

        # get average reward in next state
        reward = self.state_reward_averager[next_state].get_value()

        return next_state, reward
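
The model sampled above is built from visit counts: each next state's probability is its count divided by the total count for the (state, action) pair, and the reward is the running average observed in that next state. A minimal numeric sketch of the count-to-probability step, using hypothetical counts in a plain dictionary rather than the library's model object:

import numpy as np
from numpy.random import RandomState

# hypothetical visit counts for a single (state, action) pair
next_state_count = {'s1': 7, 's2': 3}

next_states = list(next_state_count.keys())
total_count = sum(next_state_count.values())

# maximum-likelihood probabilities:  count / total
probs = np.array([next_state_count[s] / total_count for s in next_states])

# sample a next state in proportion to how often it was observed
rng = RandomState(12345)
next_state = next_states[rng.choice(len(next_states), p=probs)]
print(next_state, probs)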
Example #3
    def __act__(self, t: int) -> Action:
        """
        Sample a random action based on current preferences.

        :param t: Time step.
        :return: Action.
        """

        return sample_list_item(self.most_recent_state.AA, self.Pr_A,
                                self.random_state)
Example #4
def test_sample_list_item():

    x = [1, 2, 3]
    p = np.array([0.1, 0.3, 0.6])

    rng = RandomState(12345)
    x_samples = [sample_list_item(x, p, rng) for _ in range(10000)]

    xs, cnts = np.unique(x_samples, return_counts=True)

    x_cnt = {x_i: cnt for x_i, cnt in zip(xs, cnts)}

    total = sum(x_cnt.values())
    x_p = [x_cnt[x_i] / total for x_i in x]

    assert_allclose(p, x_p, atol=0.01)

    with pytest.raises(ValueError,
                       match='Expected cumulative probabilities to sum to 1'):
        sample_list_item([1, 2, 3], np.array([0.2, 0.3, 0.4]), rng)
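
The test above pins down the contract of `sample_list_item`: items are drawn in proportion to `probs`, a `None` value for `probs` means uniform sampling (as the other examples show), and probabilities that do not sum to 1 raise a `ValueError`. The following is a minimal sketch of a function with that behavior, using the cumulative-probability technique; it is an illustration, not the library's actual implementation.

import numpy as np
from numpy.random import RandomState

def sample_list_item_sketch(x, probs, random_state: RandomState):
    # sample uniformly when no probabilities are given
    if probs is None:
        probs = np.repeat(1.0 / len(x), len(x))

    # the cumulative probabilities must reach 1 for the draw below to be valid
    cumulative = np.cumsum(probs)
    if not np.isclose(cumulative[-1], 1.0):
        raise ValueError('Expected cumulative probabilities to sum to 1.')

    # draw a uniform value and return the first item whose cumulative
    # probability exceeds it
    u = random_state.random_sample()
    for item, cum_p in zip(x, cumulative):
        if u < cum_p:
            return item

    # floating-point edge case:  u landed at or beyond the final cumulative value
    return x[-1]

rng = RandomState(12345)
print(sample_list_item_sketch([1, 2, 3], np.array([0.1, 0.3, 0.6]), rng))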
Example #5
    def sample_state(self, random_state: RandomState) -> MdpState:
        """
        Sample a previously encountered state uniformly.

        :param random_state: Random state.
        :return: State.
        """

        return sample_list_item(
            list(self.state_action_next_state_count.keys()), None,
            random_state)
Example #6
    def sample_action(self, state: MdpState,
                      random_state: RandomState) -> Action:
        """
        Sample a previously encountered action in a given state uniformly.

        :param state: State.
        :param random_state: Random state.
        :return: Action.
        """

        return sample_list_item(
            list(self.state_action_next_state_count[state].keys()), None,
            random_state)
Example #7
def test_stochastic_environment_model():

    random_state = RandomState(12345)

    model = StochasticEnvironmentModel()

    actions = [
        Action(i)
        for i in range(5)
    ]

    states = [
        State(i, actions)
        for i in range(5)
    ]

    for t in range(1000):
        state = sample_list_item(states, None, random_state)
        action = sample_list_item(state.AA, None, random_state)
        next_state = sample_list_item(states, None, random_state)
        reward = Reward(None, random_state.randint(10))
        model.update(state, action, next_state, reward)

    environment_sequence = []
    for i in range(1000):
        state = model.sample_state(random_state)
        action = model.sample_action(state, random_state)
        next_state, reward = model.sample_next_state_and_reward(state, action, random_state)
        environment_sequence.append((next_state, reward))

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_stochastic_environment_model.pickle', 'wb') as file:
    #     pickle.dump(environment_sequence, file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_stochastic_environment_model.pickle', 'rb') as file:
        environment_sequence_fixture = pickle.load(file)

    assert environment_sequence == environment_sequence_fixture
Example #8
    def __act__(self, t: int) -> Action:
        """
        Act stochastically according to the policy.

        :param t: Time tick.
        :return: Action.
        """

        self.most_recent_state: MdpState

        # sample action according to policy for most recent state
        action_prob = self.pi[self.most_recent_state]
        actions = list(action_prob.keys())
        probs = np.array(list(action_prob.values()))

        return sample_list_item(x=actions,
                                probs=probs,
                                random_state=self.random_state)
Example #9
def evaluate_q_pi(
        agent: ActionValueMdpAgent,
        environment: MdpEnvironment,
        num_episodes: int,
        exploring_starts: bool,
        update_upon_every_visit: bool,
        off_policy_agent: ActionValueMdpAgent = None,
) -> Tuple[Set[MdpState], float]:
    """
    Perform Monte Carlo evaluation of an agent's policy within an environment, returning state-action values. This
    evaluation function operates over rewards obtained at the end of episodes, so it is only appropriate for episodic
    tasks.

    :param agent: Agent containing target policy to be optimized.
    :param environment: Environment.
    :param num_episodes: Number of episodes to execute.
    :param exploring_starts: Whether or not to use exploring starts, forcing a random action in the first time step.
    This maintains exploration in the first state; however, unless each state has some nonzero probability of being
    selected as the first state, there is no assurance that all state-action pairs will be sampled. If the initial state
    is deterministic, consider passing False here and shifting the burden of exploration to the improvement step with
    a nonzero epsilon (see `rlai.gpi.improvement.improve_policy_with_q_pi`).
    :param update_upon_every_visit: True to update each state-action pair upon each visit within an episode, or False to
    update each state-action pair upon the first visit within an episode.
    :param off_policy_agent: Agent containing behavioral policy used to generate learning episodes. To ensure that the
    state-action value estimates converge to those of the target policy, the policy of the `off_policy_agent` must be
    soft (i.e., have positive probability for all state-action pairs that have positive probabilities in the agent's
    target policy).
    :return: 2-tuple of (1) set of only those states that were evaluated, and (2) the average reward obtained per
    episode.
    """

    logging.info(f'Running Monte Carlo evaluation of q_pi for {num_episodes} episode(s).')

    evaluated_states = set()

    episode_generation_agent = agent if off_policy_agent is None else off_policy_agent
    episode_reward_averager = IncrementalSampleAverager()
    episodes_per_print = max(1, int(num_episodes * 0.05))
    for episode_i in range(num_episodes):

        # reset the environment for the new run (always use the agent we're learning about, as state identifiers come
        # from it), and reset the episode-generation agent accordingly.
        state = environment.reset_for_new_run(agent)
        episode_generation_agent.reset_for_new_run(state)

        # simulate until episode termination, keeping a trace of state-action pairs and their immediate rewards, as well
        # as the times of their first visits (only if we're doing first-visit evaluation).
        t = 0
        state_action_first_t = None if update_upon_every_visit else {}
        t_state_action_reward = []
        total_reward = 0.0
        while not state.terminal and (environment.T is None or t < environment.T):

            evaluated_states.add(state)

            if exploring_starts and t == 0:
                a = sample_list_item(state.AA, None, environment.random_state)
            else:
                a = episode_generation_agent.act(t)

            state_a = (state, a)

            # mark time step of first visit, if we're doing first-visit evaluation.
            if state_action_first_t is not None and state_a not in state_action_first_t:
                state_action_first_t[state_a] = t

            next_state, next_reward = environment.advance(state, t, a, agent)
            t_state_action_reward.append((t, state_a, next_reward))
            total_reward += next_reward.r
            state = next_state
            t += 1

            episode_generation_agent.sense(state, t)

        # work backwards through the trace to calculate discounted returns. need to work backward in order for the value
        # of g at each time step t to be properly discounted. here, w is the importance-sampling weight of the agent's
        # (target) policy compared to the episode generation policy (behavior).
        g = 0.0
        w = 1.0
        for t, state_a, reward in reversed(t_state_action_reward):

            g = agent.gamma * g + reward.r

            # if we're doing every-visit, or if the current time step was the first visit to the state-action, then g
            # is the discounted sample value. add it to our average.
            if state_action_first_t is None or state_action_first_t[state_a] == t:

                state, a = state_a

                agent.q_S_A.initialize(state=state, a=a, alpha=None, weighted=True)

                # the following two lines work correctly for on- and off-policy learning. in the former case, the agent
                # and episode policies are the same, which makes w always equal to 1 (i.e., q_S_A is unweighted...the
                # on-policy case). in off-policy learning, w will be the importance-sampling weight.
                agent.q_S_A[state][a].update(value=g, weight=w)
                w *= agent.pi[state][a] / episode_generation_agent.pi[state][a]

                # if the importance sampling weight becomes zero (allowing floating-point tolerance), then we're done,
                # as all subsequent weighted updates (at earlier time steps) will be zero. this is the sense in which
                # off-policy learning only learns from the "tails" of episodes in which all state-action pairs of the
                # episode are also greedy with respect to the agent's policy.
                if w < 0.00000001:
                    break

        episode_reward_averager.update(total_reward)

        episodes_finished = episode_i + 1
        if episodes_finished % episodes_per_print == 0:
            logging.info(f'Finished {episodes_finished} of {num_episodes} episode(s).')

    logging.info(f'Completed evaluation. Average reward per episode:  {episode_reward_averager.get_value()}')

    return evaluated_states, episode_reward_averager.get_value()
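
The importance-sampling bookkeeping in the backward loop above can be illustrated on its own. The sketch below uses hypothetical per-step probabilities and rewards (plain floats in place of the library's policy and estimator objects): walking backwards, the return g is discounted and the weight w accumulates the ratio of target-policy to behavior-policy probabilities, so at each step the pair (g, w) is what would be passed to the weighted averager; once w reaches zero, earlier steps would contribute nothing.

# hypothetical three-step episode, earliest step first:
# (target-policy probability pi(a|s), behavior-policy probability b(a|s), reward)
episode = [
    (1.0, 0.5, 0.0),   # t = 0
    (1.0, 0.5, 0.0),   # t = 1
    (1.0, 0.5, 1.0),   # t = 2
]
gamma = 0.9

g = 0.0
w = 1.0
for t, (pi_a, b_a, r) in reversed(list(enumerate(episode))):
    g = gamma * g + r                       # discounted return from time t onward
    print(f't={t}  g={g:.3f}  w={w:.3f}')   # g would be added with weight w
    w *= pi_a / b_a                         # re-weight for the next-earlier step
    if w < 0.00000001:                      # earlier steps would contribute nothing
        break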
Example #10
def evaluate_v_pi(
        agent: ActionValueMdpAgent,
        environment: MdpEnvironment,
        num_episodes: int
) -> Dict[MdpState, float]:
    """
    Perform Monte Carlo evaluation of an agent's policy within an environment, returning state values. Uses a random
    action on the first time step to maintain exploration (exploring starts). This evaluation approach is only
    marginally useful in practice, as the state-value estimates require a model of the environmental dynamics (i.e.,
    the transition-reward probability distribution) in order to be applied. See `evaluate_q_pi` in this module for a
    more feature-rich and useful evaluation approach (i.e., state-action value estimation). This evaluation function
    operates over rewards obtained at the end of episodes, so it is only appropriate for episodic tasks.

    :param agent: Agent.
    :param environment: Environment.
    :param num_episodes: Number of episodes to execute.
    :return: Dictionary of MDP states and their estimated values under the agent's policy.
    """

    logging.info(f'Running Monte Carlo evaluation of v_pi for {num_episodes} episode(s).')

    v_pi: Dict[MdpState, IncrementalSampleAverager] = {
        terminal_state: IncrementalSampleAverager()
        for terminal_state in environment.terminal_states
    }

    episodes_per_print = max(1, int(num_episodes * 0.05))
    for episode_i in range(num_episodes):

        # start the environment in a random state
        state = environment.reset_for_new_run(agent)
        agent.reset_for_new_run(state)

        # simulate until episode termination, keeping a trace of states and their immediate rewards, as well as the
        # times of their first visits.
        t = 0
        state_first_t = {}
        t_state_reward = []
        while not state.terminal and (environment.T is None or t < environment.T):

            if state not in state_first_t:
                state_first_t[state] = t

            if t == 0:
                a = sample_list_item(state.AA, None, environment.random_state)
            else:
                a = agent.act(t)

            next_state, reward = environment.advance(state, t, a, agent)
            t_state_reward.append((t, state, reward))
            state = next_state
            t += 1

            agent.sense(state, t)

        # work backwards through the trace to calculate discounted returns. need to work backward in order for the value
        # of g at each time step t to be properly discounted.
        g = 0
        for t, state, reward in reversed(t_state_reward):

            g = agent.gamma * g + reward.r

            # if the current time step was the first visit to the state, then g is the discounted sample value. add it
            # to our average.
            if state_first_t[state] == t:

                if state not in v_pi:
                    v_pi[state] = IncrementalSampleAverager()

                v_pi[state].update(g)

        episodes_finished = episode_i + 1
        if episodes_finished % episodes_per_print == 0:
            logging.info(f'Finished {episodes_finished} of {num_episodes} episode(s).')

    return {
        s: v_pi[s].get_value()
        for s in v_pi
    }
Example #11
def get_bootstrapped_state_action_value(
        state: MdpState, t: int, mode: Mode, agent: MdpAgent,
        q_S_A: StateActionValueEstimator,
        environment: MdpEnvironment) -> Tuple[float, Action]:
    """
    Get the bootstrapped state-action value for a state, also returning the next action.

    :param state: State.
    :param t: Time step.
    :param mode: Bootstrap mode.
    :param agent: Agent.
    :param q_S_A: Current state-action value estimates.
    :param environment: Environment.
    :return: 2-tuple of the state's bootstrapped state-action value and the next action.
    """

    next_a = None

    # if the state is terminal, then all q-values are zero.
    if state.terminal:
        bootstrapped_s_a_value = 0.0
    else:

        # EXPECTED_SARSA:  get expected q-value based on current policy and q-value estimates
        if mode == Mode.EXPECTED_SARSA:
            bootstrapped_s_a_value = sum(
                (agent.pi[state][a] if state in agent.pi else 1 / len(state.AA)) *
                (q_S_A[state][a].get_value() if state in q_S_A and a in q_S_A[state] else 0.0)
                for a in state.AA
            )
        else:

            # SARSA:  agent determines the t-d target action as well as the episode's next action, which are the same
            # (we're on-policy).
            if mode == Mode.SARSA:
                td_target_a = next_a = agent.act(t)

            # Q-LEARNING:  select the action with max q-value from the state. if no q-values are estimated, then select
            # the action uniformly randomly.
            elif mode == Mode.Q_LEARNING:
                if state in q_S_A and len(q_S_A[state]) > 0:
                    td_target_a = max(
                        q_S_A[state],
                        key=lambda action: q_S_A[state][action].get_value())
                else:
                    td_target_a = sample_list_item(
                        state.AA,
                        probs=None,
                        random_state=environment.random_state)
            else:  # pragma no cover
                raise ValueError(f'Unknown TD mode:  {mode}')

            # get the state-action value if we have an estimate for it; otherwise, it's zero.
            if state in q_S_A and td_target_a in q_S_A[state]:
                bootstrapped_s_a_value = q_S_A[state][td_target_a].get_value()
            else:
                bootstrapped_s_a_value = 0.0

        # if we're off-policy, then we won't yet have a next action. ask the agent for an action now.
        if next_a is None:
            next_a = agent.act(t)

    return bootstrapped_s_a_value, next_a
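
The three bootstrap modes differ only in how the target value is formed from the current estimates: expected SARSA takes an expectation of the q-values under the current policy, SARSA uses the q-value of the action the agent actually selects, and Q-learning takes the maximum q-value. A small numeric sketch of the expected-SARSA and Q-learning targets, with hypothetical values in plain dictionaries standing in for the library's policy and StateActionValueEstimator objects:

# hypothetical policy and q-value estimates for a single state
pi_state = {'left': 0.25, 'right': 0.75}
q_state = {'left': 1.0, 'right': 3.0}

# EXPECTED_SARSA target:  expectation of q under the current policy
expected_sarsa_target = sum(pi_state[a] * q_state.get(a, 0.0) for a in pi_state)
print(expected_sarsa_target)   # 0.25 * 1.0 + 0.75 * 3.0 = 2.5

# Q_LEARNING target:  maximum q-value among actions with estimates
q_learning_target = max(q_state.values())
print(q_learning_target)       # 3.0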
Example #12
    def mock_input(prompt: str) -> str:
        # simulate console input from a human player:  pick a random action from
        # the human agent's most recent state and return the action's name.
        s = human.most_recent_state
        selected_a = sample_list_item(s.AA, probs=None, random_state=random_state)
        return selected_a.name