def advance(self, state: MdpState, t: int, a: Action, agent: Agent) -> Tuple[MdpState, Reward]:
    """
    Advance from the current state given an action, based on the current state's model probability distribution.

    :param state: State to advance.
    :param t: Current time step.
    :param a: Action.
    :param agent: Agent.
    :return: 2-tuple of next state and next reward.
    """

    # get next-state / reward tuples
    s_prime_rewards = [
        (s_prime, reward)
        for s_prime in self.p_S_prime_R_given_S_A[state][a]
        for reward in self.p_S_prime_R_given_S_A[state][a][s_prime]
        if self.p_S_prime_R_given_S_A[state][a][s_prime][reward] > 0.0
    ]

    # get probability of each tuple
    probs = np.array([
        self.p_S_prime_R_given_S_A[state][a][s_prime][reward]
        for s_prime, reward in s_prime_rewards
    ])

    # sample next state and reward
    self.state, next_reward = sample_list_item(
        x=s_prime_rewards,
        probs=probs,
        random_state=self.random_state
    )

    return self.state, next_reward
def sample_next_state_and_reward(
        self,
        state: MdpState,
        action: Action,
        random_state: RandomState
) -> Tuple[MdpState, float]:
    """
    Sample the environment model.

    :param state: State.
    :param action: Action.
    :param random_state: Random state.
    :return: 2-tuple of next state and reward.
    """

    # sample next state in proportion to how often it has been observed to follow the state-action pair
    next_state_count = self.state_action_next_state_count[state][action]
    next_states = list(next_state_count.keys())
    total_count = sum(next_state_count.values())
    probs = np.array([
        next_state_count[next_state] / total_count
        for next_state in next_states
    ])
    next_state = sample_list_item(next_states, probs, random_state)

    # get average reward in next state
    reward = self.state_reward_averager[next_state].get_value()

    return next_state, reward
def __act__(self, t: int) -> Action:
    """
    Sample a random action based on current preferences.

    :param t: Time step.
    :return: Action.
    """

    return sample_list_item(self.most_recent_state.AA, self.Pr_A, self.random_state)
def test_sample_list_item():

    x = [1, 2, 3]
    p = np.array([0.1, 0.3, 0.6])
    rng = RandomState(12345)

    x_samples = [sample_list_item(x, p, rng) for _ in range(10000)]

    # the empirical sampling frequencies should approximate the given probabilities
    xs, cnts = np.unique(x_samples, return_counts=True)
    x_cnt = {x_val: cnt for x_val, cnt in zip(xs, cnts)}
    total = sum(x_cnt.values())
    x_p = [x_cnt[x_val] / total for x_val in x]
    assert_allclose(p, x_p, atol=0.01)

    # probabilities that do not sum to 1 should be rejected
    with pytest.raises(ValueError, match='Expected cumulative probabilities to sum to 1'):
        sample_list_item([1, 2, 3], np.array([0.2, 0.3, 0.4]), rng)
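# For reference, a minimal sketch of what sample_list_item might look like, inferred from the call sites and the test
# above (uniform sampling when probs is None, and a ValueError when the probabilities do not sum to 1). The function
# name and details below are assumptions; the actual implementation in the codebase may differ.
from typing import Any, List, Optional

import numpy as np
from numpy.random import RandomState


def sample_list_item_sketch(
        x: List[Any],
        probs: Optional[np.ndarray],
        random_state: RandomState
) -> Any:

    # default to a uniform distribution over the list items
    if probs is None:
        probs = np.repeat(1.0 / len(x), len(x))

    # cumulative probabilities must reach 1 (within floating-point tolerance)
    cum_probs = probs.cumsum()
    if not np.isclose(cum_probs[-1], 1.0):
        raise ValueError('Expected cumulative probabilities to sum to 1.')

    # draw a uniform value and return the first item whose cumulative probability covers it
    value = random_state.random_sample()
    for item, cum_prob in zip(x, cum_probs):
        if value < cum_prob:
            return item

    # guard against floating-point edge cases at the upper boundary
    return x[-1]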
def sample_state(self, random_state: RandomState) -> MdpState:
    """
    Sample a previously encountered state uniformly.

    :param random_state: Random state.
    :return: State.
    """

    return sample_list_item(
        list(self.state_action_next_state_count.keys()),
        None,
        random_state
    )
def sample_action(self, state: MdpState, random_state: RandomState) -> Action:
    """
    Sample a previously encountered action in a given state uniformly.

    :param state: State.
    :param random_state: Random state.
    :return: Action.
    """

    return sample_list_item(
        list(self.state_action_next_state_count[state].keys()),
        None,
        random_state
    )
def test_stochastic_environment_model():

    random_state = RandomState(12345)

    model = StochasticEnvironmentModel()

    actions = [
        Action(i)
        for i in range(5)
    ]

    states = [
        State(i, actions)
        for i in range(5)
    ]

    # update the model with randomly generated transitions
    for t in range(1000):
        state = sample_list_item(states, None, random_state)
        action = sample_list_item(state.AA, None, random_state)
        next_state = sample_list_item(states, None, random_state)
        reward = Reward(None, random_state.randint(10))
        model.update(state, action, next_state, reward)

    # sample a sequence of transitions from the model
    environment_sequence = []
    for i in range(1000):
        state = model.sample_state(random_state)
        action = model.sample_action(state, random_state)
        next_state, reward = model.sample_next_state_and_reward(state, action, random_state)
        environment_sequence.append((next_state, reward))

    # uncomment the following lines and run the test to update the fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_stochastic_environment_model.pickle', 'wb') as file:
    #     pickle.dump(environment_sequence, file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_stochastic_environment_model.pickle', 'rb') as file:
        environment_sequence_fixture = pickle.load(file)

    assert environment_sequence == environment_sequence_fixture
def __act__(self, t: int) -> Action:
    """
    Act stochastically according to the policy.

    :param t: Time step.
    :return: Action.
    """

    self.most_recent_state: MdpState

    # sample an action according to the policy for the most recent state
    action_prob = self.pi[self.most_recent_state]
    actions = list(action_prob.keys())
    probs = np.array(list(action_prob.values()))

    return sample_list_item(
        x=actions,
        probs=probs,
        random_state=self.random_state
    )
def evaluate_q_pi(
        agent: ActionValueMdpAgent,
        environment: MdpEnvironment,
        num_episodes: int,
        exploring_starts: bool,
        update_upon_every_visit: bool,
        off_policy_agent: ActionValueMdpAgent = None
) -> Tuple[Set[MdpState], float]:
    """
    Perform Monte Carlo evaluation of an agent's policy within an environment, returning state-action values. This
    evaluation function operates over rewards obtained at the end of episodes, so it is only appropriate for episodic
    tasks.

    :param agent: Agent containing the target policy to be optimized.
    :param environment: Environment.
    :param num_episodes: Number of episodes to execute.
    :param exploring_starts: Whether to use exploring starts, forcing a random action in the first time step. This
    maintains exploration in the first state; however, unless each state has some nonzero probability of being
    selected as the first state, there is no assurance that all state-action pairs will be sampled. If the initial
    state is deterministic, consider passing False here and shifting the burden of exploration to the improvement step
    with a nonzero epsilon (see `rlai.gpi.improvement.improve_policy_with_q_pi`).
    :param update_upon_every_visit: True to update each state-action pair upon each visit within an episode, or False
    to update each state-action pair upon the first visit within an episode.
    :param off_policy_agent: Agent containing the behavioral policy used to generate learning episodes. To ensure that
    the state-action value estimates converge to those of the target policy, the policy of the `off_policy_agent` must
    be soft (i.e., have positive probability for all state-action pairs that have positive probabilities in the
    agent's target policy).
    :return: 2-tuple of (1) the set of only those states that were evaluated, and (2) the average reward obtained per
    episode.
    """

    logging.info(f'Running Monte Carlo evaluation of q_pi for {num_episodes} episode(s).')

    evaluated_states = set()

    episode_generation_agent = agent if off_policy_agent is None else off_policy_agent
    episode_reward_averager = IncrementalSampleAverager()
    episodes_per_print = max(1, int(num_episodes * 0.05))
    for episode_i in range(num_episodes):

        # reset the environment for the new run (always use the agent we're learning about, as state identifiers come
        # from it), and reset the episode generation agent accordingly.
        state = environment.reset_for_new_run(agent)
        episode_generation_agent.reset_for_new_run(state)

        # simulate until episode termination, keeping a trace of state-action pairs and their immediate rewards, as
        # well as the times of their first visits (only if we're doing first-visit evaluation).
        t = 0
        state_action_first_t = None if update_upon_every_visit else {}
        t_state_action_reward = []
        total_reward = 0.0
        while not state.terminal and (environment.T is None or t < environment.T):

            evaluated_states.add(state)

            if exploring_starts and t == 0:
                a = sample_list_item(state.AA, None, environment.random_state)
            else:
                a = episode_generation_agent.act(t)

            state_a = (state, a)

            # mark the time step of the first visit, if we're doing first-visit evaluation.
            if state_action_first_t is not None and state_a not in state_action_first_t:
                state_action_first_t[state_a] = t

            next_state, next_reward = environment.advance(state, t, a, agent)
            t_state_action_reward.append((t, state_a, next_reward))
            total_reward += next_reward.r
            state = next_state
            t += 1

            episode_generation_agent.sense(state, t)

        # work backwards through the trace to calculate discounted returns. need to work backward in order for the
        # value of g at each time step t to be properly discounted. here, w is the importance-sampling weight of the
        # agent's (target) policy compared to the episode generation policy (behavior).
        g = 0.0
        w = 1.0
        for t, state_a, reward in reversed(t_state_action_reward):

            g = agent.gamma * g + reward.r

            # if we're doing every-visit evaluation, or if the current time step was the first visit to the
            # state-action pair, then g is the discounted sample value. add it to our average.
            if state_action_first_t is None or state_action_first_t[state_a] == t:

                state, a = state_a

                agent.q_S_A.initialize(state=state, a=a, alpha=None, weighted=True)

                # the following two lines work correctly for on- and off-policy learning. in the former case, the
                # agent and episode policies are the same, which makes w always equal to 1 (i.e., q_S_A is
                # unweighted...the on-policy case). in off-policy learning, w will be the importance-sampling weight.
                agent.q_S_A[state][a].update(value=g, weight=w)
                w *= agent.pi[state][a] / episode_generation_agent.pi[state][a]

                # if the importance-sampling weight becomes zero (allowing floating-point tolerance), then we're done,
                # as all subsequent weighted updates (at earlier time steps) will be zero. this is the sense in which
                # off-policy learning only learns from the "tails" of episodes in which all state-action pairs of the
                # episode are also greedy with respect to the agent's policy.
                if w < 0.00000001:
                    break

        episode_reward_averager.update(total_reward)

        episodes_finished = episode_i + 1
        if episodes_finished % episodes_per_print == 0:
            logging.info(f'Finished {episodes_finished} of {num_episodes} episode(s).')

    logging.info(f'Completed evaluation. Average reward per episode: {episode_reward_averager.get_value()}')

    return evaluated_states, episode_reward_averager.get_value()
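# The update above passes (value=g, weight=w) to an averager initialized with weighted=True. A minimal sketch of what
# such a weighted incremental averager might do is shown below, following the standard weighted importance-sampling
# update Q <- Q + (W / C) * (G - Q), where C accumulates the weights. The class name and details are assumptions; the
# actual IncrementalSampleAverager in the codebase may differ.
class WeightedIncrementalAveragerSketch:

    def __init__(self):
        self.average = 0.0
        self.cumulative_weight = 0.0

    def update(self, value: float, weight: float):
        # accumulate the weight and move the average toward the value in proportion to the weight's share
        self.cumulative_weight += weight
        if self.cumulative_weight > 0.0:
            self.average += (weight / self.cumulative_weight) * (value - self.average)

    def get_value(self) -> float:
        return self.average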
def evaluate_v_pi(
        agent: ActionValueMdpAgent,
        environment: MdpEnvironment,
        num_episodes: int
) -> Dict[MdpState, float]:
    """
    Perform Monte Carlo evaluation of an agent's policy within an environment, returning state values. Uses a random
    action on the first time step to maintain exploration (exploring starts). This evaluation approach is only
    marginally useful in practice, as the state-value estimates require a model of the environmental dynamics (i.e.,
    the transition-reward probability distribution) in order to be applied. See `evaluate_q_pi` in this module for a
    more feature-rich and useful evaluation approach (i.e., state-action value estimation). This evaluation function
    operates over rewards obtained at the end of episodes, so it is only appropriate for episodic tasks.

    :param agent: Agent.
    :param environment: Environment.
    :param num_episodes: Number of episodes to execute.
    :return: Dictionary of MDP states and their estimated values under the agent's policy.
    """

    logging.info(f'Running Monte Carlo evaluation of v_pi for {num_episodes} episode(s).')

    v_pi: Dict[MdpState, IncrementalSampleAverager] = {
        terminal_state: IncrementalSampleAverager()
        for terminal_state in environment.terminal_states
    }

    episodes_per_print = max(1, int(num_episodes * 0.05))
    for episode_i in range(num_episodes):

        # start the environment in a random state
        state = environment.reset_for_new_run(agent)
        agent.reset_for_new_run(state)

        # simulate until episode termination, keeping a trace of states and their immediate rewards, as well as the
        # times of their first visits.
        t = 0
        state_first_t = {}
        t_state_reward = []
        while not state.terminal and (environment.T is None or t < environment.T):

            if state not in state_first_t:
                state_first_t[state] = t

            if t == 0:
                a = sample_list_item(state.AA, None, environment.random_state)
            else:
                a = agent.act(t)

            next_state, reward = environment.advance(state, t, a, agent)
            t_state_reward.append((t, state, reward))
            state = next_state
            t += 1

            agent.sense(state, t)

        # work backwards through the trace to calculate discounted returns. need to work backward in order for the
        # value of g at each time step t to be properly discounted.
        g = 0.0
        for t, state, reward in reversed(t_state_reward):

            g = agent.gamma * g + reward.r

            # if the current time step was the first visit to the state, then g is the discounted sample value. add
            # it to our average.
            if state_first_t[state] == t:

                if state not in v_pi:
                    v_pi[state] = IncrementalSampleAverager()

                v_pi[state].update(g)

        episodes_finished = episode_i + 1
        if episodes_finished % episodes_per_print == 0:
            logging.info(f'Finished {episodes_finished} of {num_episodes} episode(s).')

    return {
        s: v_pi[s].get_value()
        for s in v_pi
    }
def get_bootstrapped_state_action_value(
        state: MdpState,
        t: int,
        mode: Mode,
        agent: MdpAgent,
        q_S_A: StateActionValueEstimator,
        environment: MdpEnvironment
) -> Tuple[float, Action]:
    """
    Get the bootstrapped state-action value for a state, also returning the next action.

    :param state: State.
    :param t: Time step.
    :param mode: Bootstrap mode.
    :param agent: Agent.
    :param q_S_A: Current state-action value estimates.
    :param environment: Environment.
    :return: 2-tuple of the state's bootstrapped state-action value and the next action.
    """

    next_a = None

    # if the state is terminal, then all q-values are zero.
    if state.terminal:
        bootstrapped_s_a_value = 0.0
    else:

        # EXPECTED_SARSA:  get the expected q-value based on the current policy and q-value estimates.
        if mode == Mode.EXPECTED_SARSA:
            bootstrapped_s_a_value = sum(
                (agent.pi[state][a] if state in agent.pi else 1 / len(state.AA)) *
                (q_S_A[state][a].get_value() if state in q_S_A and a in q_S_A[state] else 0.0)
                for a in state.AA
            )
        else:

            # SARSA:  the agent determines the TD-target action as well as the episode's next action, which are the
            # same (we're on-policy).
            if mode == Mode.SARSA:
                td_target_a = next_a = agent.act(t)

            # Q-LEARNING:  select the action with the maximum q-value from the state. if no q-values are estimated,
            # then select an action uniformly at random.
            elif mode == Mode.Q_LEARNING:
                if state in q_S_A and len(q_S_A[state]) > 0:
                    td_target_a = max(
                        q_S_A[state],
                        key=lambda action: q_S_A[state][action].get_value()
                    )
                else:
                    td_target_a = sample_list_item(
                        state.AA,
                        probs=None,
                        random_state=environment.random_state
                    )
            else:  # pragma no cover
                raise ValueError(f'Unknown TD mode: {mode}')

            # get the state-action value if we have an estimate for it; otherwise, it's zero.
            if state in q_S_A and td_target_a in q_S_A[state]:
                bootstrapped_s_a_value = q_S_A[state][td_target_a].get_value()
            else:
                bootstrapped_s_a_value = 0.0

    # if we're off-policy, then we won't yet have a next action. ask the agent for an action now.
    if next_a is None:
        next_a = agent.act(t)

    return bootstrapped_s_a_value, next_a
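# The value returned above would typically feed a temporal-difference update of the form
#     q(s, a) <- q(s, a) + alpha * (r + gamma * bootstrapped_s_a_value - q(s, a))
# The following is a hypothetical, self-contained illustration with plain floats; the names alpha, gamma, and
# td_update are assumptions and not part of the function above.
def td_update(
        q_s_a: float,
        reward: float,
        bootstrapped_s_a_value: float,
        alpha: float,
        gamma: float
) -> float:

    # move the current estimate toward the TD target by step size alpha
    td_target = reward + gamma * bootstrapped_s_a_value
    return q_s_a + alpha * (td_target - q_s_a)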
def mock_input(prompt: str) -> str:
    """
    Mock the built-in input function by returning the name of a randomly sampled action from the human agent's most
    recent state.
    """
    s = human.most_recent_state
    selected_a = sample_list_item(s.AA, probs=None, random_state=random_state)
    return selected_a.name