Example #1
    def agent_init(self, taskspec):
        """ This function is called once at the begining of an episode.
        Performs sanity checks with the environment.

        :param taskspec: The task specifications
        :type taskspec: str

        """
        spec = TaskSpecVRLGLUE3.TaskSpecParser(taskspec)
        if len(spec.getIntActions()) != 1:
            raise Exception("Expecting 1-dimensional discrete actions")
        if len(spec.getDoubleActions()) != 0:
            raise Exception("Expecting no continuous actions")
        if spec.isSpecial(spec.getIntActions()[0][0]):
            raise Exception(
                "Expecting min action to be a number, not a special value")
        if spec.isSpecial(spec.getIntActions()[0][1]):
            raise Exception(
                "Expecting max action to be a number, not a special value")

        observation_ranges = spec.getDoubleObservations()
        self.basis = FourierBasis(len(observation_ranges), self.fa_order,
                                  observation_ranges)
        self.weights = np.zeros((self.basis.numTerms, len(self.options)))

        self.last_action = 0
        self.last_features = []
        self.last_observation = []
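
The snippet assumes a FourierBasis helper that exposes numTerms and computeFeatures(observation). A minimal sketch of such a class is shown below; it follows the usual Fourier basis construction (features of the form cos(pi * c . s) with the observation scaled into [0, 1] using the ranges passed in), but the attribute names, the full coefficient set, and the scaling are assumptions rather than the actual implementation used here.

import itertools

import numpy as np


class FourierBasis:
    """Sketch of a full Fourier basis for linear function approximation."""

    def __init__(self, nvars, order, ranges):
        # ranges is a sequence of (min, max) pairs, one per observation variable.
        self.ranges = np.asarray(ranges, dtype=float)
        # Every coefficient vector c with entries in {0, ..., order}.
        self.coefficients = np.array(
            list(itertools.product(range(order + 1), repeat=nvars)))
        self.numTerms = len(self.coefficients)

    def computeFeatures(self, observation):
        # Scale each observation variable into [0, 1] ...
        low, high = self.ranges[:, 0], self.ranges[:, 1]
        scaled = (np.asarray(observation, dtype=float) - low) / (high - low)
        # ... and evaluate cos(pi * c . s) for every coefficient vector c.
        return np.cos(np.pi * self.coefficients.dot(scaled))

With a full basis of this kind, numTerms grows as (order + 1)**nvars, which is why low orders are typically used when the observation has several dimensions.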
Example #2
class IntraOptionLearning(Agent):
    """ This class implements Intra-Option learning with
    linear function approximation.

    R. S. Sutton, D. Precup, and S. Singh, "Intra-option learning about
    temporally abstract actions," In Proceedings of the Fifteenth
    International Conference on Machine Learning (ICML 1998), 1998, pp. 556-564.

    """

    def __init__(self, options, alpha, gamma, epsilon, fa_order):
        """
        :param options: A set of options with learnt policies
        :type options: list of Option

        """
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.fa_order = fa_order
        self.options = options
        self.current_option = None
        self.finished_learning = False

    def intraoption_update(self, reward, features, observation):
        """ Perform a step of intra-option learning

        :param reward: The reward just obtained
        :param features: The feature representation of the current state
        :param observation: The raw observation of the current state

        """

        for i in self.consistent_options(self.last_observation, self.last_action):
            if self.options[i].terminate(observation):
                # Option i terminates here: bootstrap from the best option
                # that can be initiated in the current state.
                initializable_options = self.initializable_options(observation)
                current_value = np.dot(self.weights[:,initializable_options].T, features).max()
            else:
                # Option i continues: bootstrap from its own value.
                current_value = np.dot(self.weights[:,i].T, features)

            # One-step TD error for option i.
            delta = reward + self.gamma*current_value - np.dot(self.weights[:,i].T, self.last_features)

            # Semi-gradient update for linear function approximation: the TD
            # error is scaled by the previous state's feature vector.
            self.weights[:,i] += self.alpha*delta*self.last_features

    def consistent_options(self, observation, action):
        """ Find the options whose policy agrees with the action just taken

        :returns: the indices of the options for which pi_o(s) = a
        :rtype: list of int

        """
        return [idx for idx in range(len(self.options))
                if self.options[idx].pi(observation) == action]


    def initializable_options(self, observation):
        """ Find the options available in the current state

        :returns: The indices of the options that can be initiated in the current state
        :rtype: list of int

        """
        return [idx for idx in range(len(self.options))
                if self.options[idx].initiate(observation)]

    def egreedy(self, observation, features):
        """ Use epsilon-greedy exploration for the behavior policy

        :param observation: The raw observations
        :param features: The features representation of the observation
        :returns: The index of a random initializable option with probability
        epsilon, or of the initializable option with the highest value with
        probability 1 - epsilon.
        :rtype: int

        """
        initializable_options = self.initializable_options(observation)

        if not self.finished_learning and (random.random() < self.epsilon):
            return random.choice(initializable_options)

        return initializable_options[np.dot(self.weights[:,initializable_options].T, features).argmax()]

    def mu(self, observation, features=None):
        """ The semi-markov deterministic policy that follows
        an option to completion before starting another one.

        :param observation: The raw observations
        :param features: The features representation of the observation
        :returns: the best option according to the current policy
        :rtype: Option

        """
        if self.current_option is None or self.current_option.terminate(observation):
            self.current_option = self.options[self.egreedy(observation, features)]

        return self.current_option

    def agent_init(self, taskspec):
        """ This function is called once at the begining of an episode.
        Performs sanity checks with the environment.

        :param taskspec: The task specifications
        :type taskspec: str

        """
        spec = TaskSpecVRLGLUE3.TaskSpecParser(taskspec)
        if len(spec.getIntActions()) != 1:
            raise Exception("Expecting 1-dimensional discrete actions")
        if len(spec.getDoubleActions()) != 0:
            raise Exception("Expecting no continuous actions")
        if spec.isSpecial(spec.getIntActions()[0][0]):
            raise Exception("Expecting min action to be a number, not a special value")
        if spec.isSpecial(spec.getIntActions()[0][1]):
            raise Exception("Expecting max action to be a number, not a special value")

        observation_ranges = spec.getDoubleObservations()
        self.basis = FourierBasis(len(observation_ranges), self.fa_order, observation_ranges)
        self.weights = np.zeros((self.basis.numTerms, len(self.options)))

        self.last_action = 0
        self.last_features = []
        self.last_observation = []

    def agent_start(self, obs):
        """ This function is called by the environment in the initial state.

        :param obs: An observation from the environment
        :type obs: :class:`rlglue.types.Observation`
        :returns: The primitive action to execute in the environment according to the
        behavior policy.
        :rtype: a primitive action in the form of a :class:`rlglue.types.Action`

        """
        observation = np.array(obs.doubleArray)
        current_features = self.basis.computeFeatures(observation)

        self.last_observation = observation
        self.last_features = current_features
        self.last_action = self.mu(observation, current_features).pi(observation)

        action = Action()
        action.intArray = [self.last_action]
        return action

    def agent_step(self, reward, obs):
        """ This function is called by the environment while the episode lasts.

        If learning is not frozen, the option-value function Q(s, o) is updated
        using intra-option learning.

        :param reward: The reward obtained as a result of the last transition.
        :param obs: An observation from the environment
        :type obs: :class:`rlglue.types.Observation`
        :returns: The primitive action to execute in the environment according to the
        behavior policy.
        :rtype: a primitive action in the form of a :class:`rlglue.types.Action`

        """
        observation = np.array(obs.doubleArray)
        current_features = self.basis.computeFeatures(observation)

        if not self.finished_learning:
            self.intraoption_update(reward, current_features, observation)

        self.last_observation = observation
        self.last_features = current_features
        self.last_action = self.mu(observation, current_features).pi(observation)

        action = Action()
        action.intArray = [self.last_action]
        return action

    def agent_end(self, reward):
        """ This function is called by the environment when the episode finishes.

        If learning is not frozen, the option-value function Q(s, o) is updated
        using intra-option learning.

        :param reward: The reward obtained as a result of the last transition.

        """
        if not self.finished_learning:
            for i in self.consistent_options(self.last_observation, self.last_action):
                # Terminal update: the target is the final reward alone.
                delta = reward - np.dot(self.weights[:,i].T, self.last_features)
                # Semi-gradient update, scaled by the previous feature vector.
                self.weights[:,i] += self.alpha*delta*self.last_features

    def agent_cleanup(self):
        pass

    def agent_message(self, msg):
        return "Intra-Option Learning does not understand your message."