import numpy as np


def reward_tabular_normalized(policy, task, tol=1e-4):
    '''Compute the policy's expected reward normalized by the reward obtained by value iteration, per state.'''
    gtV = compute_tabular_value(task, tol)  # ground-truth values computed by value iteration.
    V = reward_tabular(policy, task, tol)
    return V / gtV  # element-wise ratio of policy value to ground-truth value.
def expected_reward_tabular_normalized(policy, task, tol=1e-4):
    '''Compute the expected reward normalized by the reward obtained by value iteration, averaged over all valid states.'''
    gtV = compute_tabular_value(task, tol)  # ground-truth values computed by value iteration.
    V = reward_tabular(policy, task, tol)
    rewards = [V[state] / gtV[state] for state in task.get_valid_states()]
    return np.mean(rewards)
def reward_tabular_normalized_fix_start(policy, task, tol=1e-4):
    '''Compute the expected reward normalized by the reward obtained by value iteration, evaluated at the start state only.'''
    states = [task.start_state]
    gtV = compute_tabular_value(task, tol)  # ground-truth values computed by value iteration.
    V = reward_tabular(policy, task, tol)
    rewards = {state: V[state] / gtV[state] for state in task.get_valid_states()}
    return np.mean([rewards[state] for state in states])  # mean over the fixed start state(s) only.
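

# A minimal sketch of how the ground-truth values referenced above could be obtained by
# value iteration. The real compute_tabular_value is defined elsewhere in this codebase;
# the task interface assumed here (get_valid_states, get_allowed_actions,
# next_state_distribution, get_reward, gamma) is hypothetical and for illustration only.
def _value_iteration_sketch(task, tol=1e-4):
    V = {state: 0.0 for state in task.get_valid_states()}  # start with zero value estimates.
    while True:
        delta = 0.0  # largest value change seen in this sweep.
        for state in task.get_valid_states():
            # Bellman optimality backup: value of the best action under current estimates.
            best = max(
                sum(prob * (task.get_reward(state, action, next_state)
                            + task.gamma * V.get(next_state, 0.0))
                    for next_state, prob in task.next_state_distribution(state, action).items())
                for action in task.get_allowed_actions(state)
            )
            delta = max(delta, abs(best - V[state]))
            V[state] = best
        if delta < tol:  # converged: no state value moved more than the tolerance.
            return V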