Exemplo n.º 1
0
    def compute_single_policy_backup(self, policy: Policy, gamma: float) -> Tuple[ValueFunction, float]:
        """
        Perform one policy backup of the current value function under ``policy``.

        The current value function is not modified: the backed-up values are
        written into a freshly created ``ValueFunction`` built from the same
        domain, which is returned together with the backup error.

        Args:
            policy: Object indexed as ``policy[state]`` to obtain the action to
                evaluate in each non-terminal state.
            gamma: Passed unchanged to ``self.q_value`` (presumably the
                discount factor).

        Returns:
            A pair ``(new_value_function, error)`` where ``error`` is the
            largest absolute difference, over the non-terminal states, between
            the backed-up value and the current value (``0.0`` if there is no
            non-terminal state).
        """
        new_value_function = ValueFunction(self._domain)
        error = 0.0
        for state in self._domain.get_observation_space().get_elements():
            if self._domain.is_terminal(state):
                # Terminal states are pinned to 0 and are deliberately left
                # out of the error computation.
                new_value_function._values[state] = 0
                continue

            # Evaluate the Q-value once and reuse it for both the new value
            # and the error, instead of recomputing it for every use.
            backed_up_value = self.q_value(state, policy[state], gamma)
            new_value_function._values[state] = backed_up_value
            error = max(error, abs(backed_up_value - self[state]))

        return new_value_function, error