class TDAgent(Agent):
    """
    A generic temporal-difference (TD) agent with discrete actions.
    To create a new TD agent, subclass this class and implement the methods
    .Q(sensation,action=None) and .update_Q(sensation,action,delta,on_policy=True).
    (A minimal tabular sketch of such a subclass appears after this class
    definition.)

    Parameters:

    alpha  -- The learning rate, default = 0.1
    gamma  -- The discount factor, default = 1.0
    lambda_ -- The eligibility discount factor, default = 0.0.

    step_method -- The method for doing TD updates: 'sarsa' or 'q_learning'.
                     default = 'sarsa'

    action_selection -- The action selection method, default 'epsilon_greedy'.
                        To change action selection, set this to the name of the new method,
                        e.g. 'softmax'.

    initial_epsilon -- The starting epsilon for epsilon_greedy selection. (default=0.1)
    min_epsilon     -- The minimum (final) epsilon. (default = 0.0)
    epsilon_half_life -- The half-life for epsilon annealing. (default = 1)
    
    initial_temperature -- The starting temperature for softmax (Boltzmann distribution)
                           selection. (default = 1.0)
    min_temperature     -- The min (final) temperature for softmax selection.
                           (default = 0.01)
    temperature_half_life -- The temperature half-life for softmax selection
                           (default = 1)

    actions -- The list of available actions.  Each action can be any Python
               object that is understood as an action by the environment.
    """


    alpha =       Magnitude(default=0.1)
    gamma =       Magnitude(default=1.0)
    lambda_ =     Magnitude(default=0.0)

    step_method = Parameter(default="sarsa")

    action_selection = Parameter(default="epsilon_greedy")

    # epsilon-greedy selection parameters
    initial_epsilon =   Magnitude(default=0.1)
    min_epsilon =       Magnitude(default=0.0)
    epsilon_half_life = Number(default=1, bounds=(0,None))

    # softmax selection parameters
    initial_temperature =   Number(default=1.0, bounds=(0,None))
    min_temperature =       Number(default=0.01, bounds=(0,None))
    temperature_half_life = Number(default=1, bounds=(0,None))

    actions = Parameter(default=[])

    prune_eligibility = Magnitude(default=0.001)
    replacing_traces = Parameter(default=True)

    history_log = Parameter(default=None)
    allow_learning = Parameter(default=True)

    def __init__(self,**args):
        from plastk.utils import LogFile
        
        super(TDAgent,self).__init__(**args)
        self.nopickle.append('policy_fn')
        self.policy_fn = getattr(self,self.action_selection)
        
        self.total_steps = 0

        if isinstance(self.history_log,str):
            self._history_file = LogFile(self.history_log)
        elif isinstance(self.history_log,file) or isinstance(self.history_log,LogFile):
            self._history_file = self.history_log

    def unpickle(self):
        """
        Called automatically when the agent is unpickled.  Sets
        the action-selection function to its appropriate value.
        """
        super(TDAgent,self).unpickle()
        self.policy_fn = getattr(self,self.action_selection)


    def __call__(self,sensation,reward=None):
        """
        Do a step.  Calls the function selected in self.step_method
        and returns the action.
        """
        step_fn = getattr(self,self.step_method+'_step')

        action_index = step_fn(sensation,reward)
        if self.history_log:
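            # History format: a 'start' marker at the beginning of each
            # episode, then one line each for the sensation, the reward and,
            # for non-terminal steps, the chosen action index.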
            if reward is None:
                self._history_file.write('start\n')
            self._history_file.write(repr(sensation)+'\n')
            self._history_file.write(repr(reward)+'\n')
            if not is_terminal(sensation):
                self._history_file.write(repr(action_index)+'\n')
        return self.actions[action_index]

    def Q(self,sensation,action=None):
        """
        Return Q(s,a).  If action is None, return an array
        of Q-values for each action in self.actions
        with the given sensation.

        You must override this method to implement a TDAgent subclass.
        """
        raise NYI

    def update_Q(self,sensation,action,delta,on_policy=True):
        """
        Update Q(sensation,action) by delta.  on_policy indicates
        whether the step that produced the update was on- or
        off-policy.  Any eligibility trace updates should be done from
        within this method.

        You must override this method to implement a TDAgent subclass.
        """
        raise NYI

    def sarsa_step(self,sensation,reward=None):
        """
        Do a step using the SARSA update method.  Selects an action,
        computes the TD update and calls self.update_Q.  Returns the
        agent's next action.
        """
        if reward is None:
            return self._start_episode(sensation)

        rho = self.rho(reward)
        next_action = self.policy(sensation)

        if is_terminal(sensation):
            value = 0
        else:
            value = self.Q(sensation,next_action)

        last_value = self.Q(self.last_sensation,self.last_action)
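        # SARSA TD error: delta = r + gamma * Q(s', a') - Q(s, a), where a' is
        # the action the policy actually selected for s'.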
        delta = rho + (self.gamma * value - last_value)

        self.verbose("controller step = %d, rho = %.2f"
                      % (self.total_steps,rho))
        self.verbose(("Q(t-1) = %.5f, Q(t) = %.5f, diff = %.5f, "+
                       "delta = %.5f, terminal? = %d")
                      % (last_value,value,value-last_value,
                         delta,is_terminal(sensation)))        

        if self.allow_learning:
            self.update_Q(self.last_sensation,self.last_action,delta)

        self.last_sensation = sensation
        self.last_action = next_action
        if isinstance(reward,list):
            self.total_steps += len(reward)
        else:
            self.total_steps += 1

        return next_action


    def q_learning_step(self,sensation,reward=None):
        """
        Do a step using Watkins' Q(lambda) update method.  Selects an
        action, computes the TD update and calls
        self._q_learning_training.  Returns the agent's next action.
        """
        if reward is None:
            return self._start_episode(sensation)

        if self.allow_learning:
            self._q_learning_training(self.last_sensation,self.last_action,reward,sensation)
        
        self.last_sensation = sensation
        self.last_action = self.policy(sensation)
        if isinstance(reward,list):
            self.total_steps += len(reward)
        else:
            self.total_steps += 1
        return self.last_action

    def _q_learning_training(self,sensation,action,reward,next_sensation):
        """
        Do a single Q-lambda training step given (s,a,r,s').  Can be
        called from outside the q_learning_step method for off-policy
        training, experience replay, etc.
        """
        rho = self.rho(reward)

        last_Q = self.Q(sensation)
        last_value = last_Q[action]
        
        if is_terminal(next_sensation):
            value = 0
        else:
            value = max(self.Q(next_sensation))

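        # Watkins' Q-learning target uses the greedy (max-Q) action in s',
        # independent of the action the policy will take next:
        # delta = r + gamma * max_a Q(s', a) - Q(s, a).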
        delta = rho + (self.gamma * value - last_value)
        
        self.verbose("r = %.5f, Q(t-1) = %.5f, Q(t) = %.5f, diff = %.5f, delta = %.5f, terminal? = %d"
                      % (rho,last_value,value,value-last_value,delta,is_terminal(next_sensation)))

        self.update_Q(sensation,action,delta,on_policy = (last_Q[action] == max(last_Q)))

        if delta:
            assert (self.Q(sensation,action) - last_value)/delta < 1.0
    
    def _start_episode(self,sensation):
        """
        Start a new episode.  Called from self.__call__ when the reward is None.
        """
        self.last_sensation = sensation
        self.last_action = self.policy(sensation)
        return self.last_action


    def policy(self,sensation):
        """
        Given a sensation, return an action.  Uses
        self.action_selection to get a distribution over the agent's
        actions.  Uses self.applicable_actions to prevent selecting
        inapplicable actions.

        Returns 0 if is_terminal(sensation).
        """
        if not is_terminal(sensation):
            actions = self.applicable_actions(sensation)
            return actions[weighted_sample(self.policy_fn(sensation,actions))]
        else:
            # In the terminal state, the action is irrelevant
            return 0
        
    def epsilon_greedy(self,sensation,applicable_actions):
        """
        Given self.epsilon() and self.Q(), return a distribution over
        applicable_actions as an array where each element contains the
        probability mass for the corresponding action: each action with the
        highest Q value gets weight 1 - self.epsilon()/num_maxes, and the
        remaining actions split self.epsilon() uniformly.  If every action
        ties for the maximum, the all-ones mask is returned (i.e. a uniform
        choice).
        """
        Q = array([self.Q(sensation,action) for action in applicable_actions])

        # simple epsilon-greedy policy
        # get a vector with a 1 where each max element is, zero elsewhere
        mask = (Q == mmax(Q))

        num_maxes = len(nonzero(mask))
        num_others = len(mask) - num_maxes

        if num_others == 0: return mask
        
        e0 = self.epsilon()/num_maxes
        e1 = self.epsilon()/num_others

        result = zeros(len(mask))+0.0
        putmask(result,mask,1-e0)
        putmask(result,mask==0,e1)
        return result

    def softmax(self,sensation,applicable_actions):
        """
        Given self.temperature() and self.Q(), return a Boltzmann
        distribution over applicable_actions as an array where each
        element contains the probability mass for the corresponding
        action.
        """
        temp = self.temperature()
        self.verbose("softmax, temperature = %.3f" % temp)
        Q = array([self.Q(sensation,action) for action in applicable_actions])
        return softmax(Q,temp)

    def normalized_softmax(self,sensation,applicable_actions):
        """
        Like softmax, except that the Q values are first scaled into the
        range [0,1], which may make choosing an initial temperature easier
        than with plain softmax.
        """
        temp = self.temperature()
        self.verbose("softmax, temperature = %.3f" % temp)
        Q = array([self.Q(sensation,action) for action in applicable_actions])
        return softmax(normalize_minmax(Q),temp)

    def temperature(self):
        """
        Using initial_temperature, min_temperature, and temperature_half_life,
        compute the temperature after self.total_steps steps.
        """
        Ti = self.initial_temperature
        Tm = self.min_temperature
        decay = log(2)/self.temperature_half_life
        return Tm + (Ti - Tm) * exp( -decay * self.total_steps )

    def epsilon(self):
        """
        Using initial_epsilon, min_epsilon, and epsilon_half_life,
        compute epsilon after self.total_steps steps.
        """
        Ei = self.initial_epsilon
        Em = self.min_epsilon
        decay = log(2)/self.epsilon_half_life
        return Em + (Ei - Em) * exp( -decay * self.total_steps )
    
    def rho(self,reward):
        """
        Compute the reward since the last step.
        
        If the reward is a scalar, it is returned unchanged.

        If reward is a list, it is assumed to be a list of rewards
        accrued at a constant time step, and the discounted sum is
        returned.
        """
        if isinstance(reward,list):
            result = 0
            for r in reward:
                result = self.gamma*result + r
        else:
            result = reward
        return result

    def applicable(self,action,sensation):
        """
        If the given action has a method called 'applicable' return
        the value of action.applicable(sensation), otherwise return True.
        """
        if hasattr(action,'applicable'):
            return action.applicable(sensation)
        else:
            return True

    def applicable_actions(self,sensation):
        """
        Return a list of the actions that are applicable to the given
        sensation.
        """
        return [a for a in range(len(self.actions))
                if self.applicable(self.actions[a],sensation)]
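

# ---------------------------------------------------------------------------
# Usage sketch (not part of plastk): a minimal tabular TDAgent subclass.
# The subclass name, the dict-based Q table, and the driver loop below are
# illustrative assumptions; only the TDAgent interface used here (Q, update_Q,
# __call__, actions, alpha) comes from the class above.  Sensations are
# assumed to be hashable.
# ---------------------------------------------------------------------------

class TabularTDAgent(TDAgent):
    """A toy TDAgent whose Q-function is a dict keyed by (sensation,action)."""

    def __init__(self,**args):
        super(TabularTDAgent,self).__init__(**args)
        self._q_table = {}

    def Q(self,sensation,action=None):
        # Return Q(s,a), or the vector of Q-values over all actions when
        # action is None.  A plain list suffices here because the base class
        # only indexes it and takes its max; a real subclass would return an
        # array.
        if action is None:
            return [self._q_table.get((sensation,a),0.0)
                    for a in range(len(self.actions))]
        return self._q_table.get((sensation,action),0.0)

    def update_Q(self,sensation,action,delta,on_policy=True):
        # Plain one-step update, interpreting delta as the TD error and
        # applying the alpha learning rate.  A full implementation would also
        # maintain eligibility traces here (lambda_, replacing_traces, etc.).
        key = (sensation,action)
        self._q_table[key] = self._q_table.get(key,0.0) + self.alpha*delta

# Hypothetical driver loop (env is an assumed environment object):
#
#     agent = TabularTDAgent(actions=['left','right'], step_method='sarsa')
#     sensation = env.start()
#     action = agent(sensation)              # reward=None starts the episode
#     while not is_terminal(sensation):
#         sensation, reward = env.step(action)
#         action = agent(sensation, reward)  # a terminal sensation triggers
#                                            # the final update inside the agent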