class TDAgent(Agent): """ A generic temporal-difference (TD) agent with discrete actions. To create a new TD agent, override this class and implement the methods .Q(sensation,action=None) and .update_Q(sensation,action,delda,on_policy=True). Parameters: alpha -- The learning rate, default = 0.1 gamma -- The discount factor, default = 1.0 lambda_ -- The eligibility discount factor, default = 0.0. step_method -- The method for doing TD updates: 'sarsa' or 'q_learning'. default = 'sarsa' action_selection -- The action selection method, default 'epsilon_greedy'. To change action selection, set this to the name of the new method, e.g. 'softmax'. initial_epsilon -- The starting epsilon for epsilon_greedy selection. (default=0.1) min_epsilon -- The minimum (final) epsilon. (default = 0.0) epsilon_half_life -- The half-life for epsilon annealing. (default = 1) initial_temperature -- The starting temperature for softmax (Boltzman distribution) selection. (default = 1.0) min_temperature -- The min (final) temperature for softmax selection. (default = 0.01) temperature_half_life -- The temperature half-life for softmax selection (default = 1) actions -- The list of available actions - can be any Python object that is understood as an action by the environment """ alpha = Magnitude(default=0.1) gamma = Magnitude(default=1.0) lambda_ = Magnitude(default=0.0) step_method = Parameter(default="sarsa") action_selection = Parameter(default="epsilon_greedy") # epsilon-greedy selection parameters initial_epsilon = Magnitude(default=0.1) min_epsilon = Magnitude(default=0.0) epsilon_half_life = Number(default=1, bounds=(0,None)) # softmax selection parameters initial_temperature = Number(default=1.0, bounds=(0,None)) min_temperature = Number(default=0.01, bounds=(0,None)) temperature_half_life = Number(default=1, bounds=(0,None)) actions = Parameter(default=[]) prune_eligibility = Magnitude(default=0.001) replacing_traces = Parameter(default=True) history_log = Parameter(default=None) allow_learning = Parameter(default=True) def __init__(self,**args): from plastk.utils import LogFile super(TDAgent,self).__init__(**args) self.nopickle.append('policy_fn') self.policy_fn = getattr(self,self.action_selection) self.total_steps = 0 if isinstance(self.history_log,str): self._history_file = LogFile(self.history_log) elif isinstance(self.history_log,file) or isinstance(self.history_log,LogFile): self._history_file = self.history_log def unpickle(self): """ Called automatically when the agent is unpickled. Sets the action-selection function to its appropriate value. """ super(TDAgent,self).unpickle() self.policy_fn = getattr(self,self.action_selection) def __call__(self,sensation,reward=None): """ Do a step. Calls the function selected in self.step_method and returns the action. """ step_fn = getattr(self,self.step_method+'_step') action_index = step_fn(sensation,reward) if self.history_log: if reward is None: self._history_file.write('start\n') self._history_file.write(`sensation`+'\n') self._history_file.write(`reward`+'\n') if not is_terminal(sensation): self._history_file.write(`action_index`+'\n') return self.actions[action_index] def Q(self,sensation,action=None): """ Return Q(s,a). If action is None, return an array of Q-values for each action in self.actions with the given sensation. You must override this method to implement a TDAgent subclass. """ raise NYI def update_Q(self,sensation,action,delta,on_policy=True): """ Update Q(sensation,action) by delta. 
        on_policy indicates whether the step that produced the update
        was on- or off-policy.  Any eligibility trace updates should
        be done from within this method.

        You must override this method to implement a TDAgent
        subclass.
        """
        raise NYI

    def sarsa_step(self, sensation, reward=None):
        """
        Do a step using the SARSA update method.  Selects an action,
        computes the TD update, and calls self.update_Q.  Returns the
        agent's next action.
        """
        if reward is None:
            return self._start_episode(sensation)

        rho = self.rho(reward)
        next_action = self.policy(sensation)

        if is_terminal(sensation):
            value = 0
        else:
            value = self.Q(sensation, next_action)

        last_value = self.Q(self.last_sensation, self.last_action)
        delta = rho + (self.gamma * value - last_value)

        self.verbose("controller step = %d, rho = %.2f" % (self.total_steps, rho))
        self.verbose(("Q(t-1) = %.5f, Q(t) = %.5f, diff = %.5f, " +
                      "delta = %.5f, terminal? = %d")
                     % (last_value, value, value - last_value,
                        delta, is_terminal(sensation)))

        if self.allow_learning:
            self.update_Q(self.last_sensation, self.last_action, delta)

        self.last_sensation = sensation
        self.last_action = next_action

        if isinstance(reward, list):
            self.total_steps += len(reward)
        else:
            self.total_steps += 1

        return next_action

    def q_learning_step(self, sensation, reward=None):
        """
        Do a step using Watkins' Q(lambda) update method.  Selects an
        action, computes the TD update, and calls
        self._q_learning_training.  Returns the agent's next action.
        """
        if reward is None:
            return self._start_episode(sensation)

        if self.allow_learning:
            self._q_learning_training(self.last_sensation, self.last_action,
                                      reward, sensation)

        self.last_sensation = sensation
        self.last_action = self.policy(sensation)

        if isinstance(reward, list):
            self.total_steps += len(reward)
        else:
            self.total_steps += 1

        return self.last_action

    def _q_learning_training(self, sensation, action, reward, next_sensation):
        """
        Do a single Q(lambda) training step given (s,a,r,s').  Can be
        called from outside the q_learning_step method for off-policy
        training, experience replay, etc.
        """
        rho = self.rho(reward)
        last_Q = self.Q(sensation)
        last_value = last_Q[action]

        if is_terminal(next_sensation):
            value = 0
        else:
            value = max(self.Q(next_sensation))

        delta = rho + (self.gamma * value - last_value)

        self.verbose("r = %.5f, Q(t-1) = %.5f, Q(t) = %.5f, diff = %.5f, delta = %.5f, terminal? = %d"
                     % (rho, last_value, value, value - last_value,
                        delta, is_terminal(next_sensation)))

        self.update_Q(sensation, action, delta,
                      on_policy=(last_Q[action] == max(last_Q)))
        if delta:
            assert (self.Q(sensation, action) - last_value) / delta < 1.0

    def _start_episode(self, sensation):
        """
        Start a new episode.  Called from self.__call__ when the
        reward is None.
        """
        self.last_sensation = sensation
        self.last_action = self.policy(sensation)
        return self.last_action

    def policy(self, sensation):
        """
        Given a sensation, return an action.  Uses
        self.action_selection to get a distribution over the agent's
        actions, and self.applicable_actions to prevent selecting
        inapplicable actions.  Returns 0 if is_terminal(sensation).
        """
        if not is_terminal(sensation):
            actions = self.applicable_actions(sensation)
            return actions[weighted_sample(self.policy_fn(sensation, actions))]
        else:
            # In the terminal state, the action is irrelevant.
            return 0
    def epsilon_greedy(self, sensation, applicable_actions):
        """
        Given self.epsilon() and self.Q(), return a distribution over
        applicable_actions as an array where each element contains
        the probability mass for the corresponding action.  I.e. the
        action with the highest Q gets p = 1 - self.epsilon(), and
        the remaining epsilon mass is divided uniformly among the
        other actions.
        """
        Q = array([self.Q(sensation, action) for action in applicable_actions])

        # Simple epsilon-greedy policy: get a vector with a 1 where
        # each max element is, zero elsewhere.
        mask = (Q == mmax(Q))
        num_maxes = len(nonzero(mask))
        num_others = len(mask) - num_maxes

        if num_others == 0:
            return mask

        e0 = self.epsilon() / num_maxes
        e1 = self.epsilon() / num_others
        result = zeros(len(mask)) + 0.0
        putmask(result, mask, 1 - e0)
        putmask(result, mask == 0, e1)
        return result

    def softmax(self, sensation, applicable_actions):
        """
        Given self.temperature() and self.Q(), return a Boltzmann
        distribution over applicable_actions as an array where each
        element contains the probability mass for the corresponding
        action.
        """
        temp = self.temperature()
        self.verbose("softmax, temperature = %.3f" % temp)
        Q = array([self.Q(sensation, action) for action in applicable_actions])
        return softmax(Q, temp)

    def normalized_softmax(self, sensation, applicable_actions):
        """
        Like softmax, except that the Q values are scaled into the
        range [0,1].  May make setting the initial temperature easier
        than with softmax.
        """
        temp = self.temperature()
        self.verbose("softmax, temperature = %.3f" % temp)
        Q = array([self.Q(sensation, action) for action in applicable_actions])
        return softmax(normalize_minmax(Q), temp)

    def temperature(self):
        """
        Using initial_temperature, min_temperature, and
        temperature_half_life, compute the temperature after
        self.total_steps steps.
        """
        Ti = self.initial_temperature
        Tm = self.min_temperature
        decay = log(2) / self.temperature_half_life
        return Tm + (Ti - Tm) * exp(-decay * self.total_steps)

    def epsilon(self):
        """
        Using initial_epsilon, min_epsilon, and epsilon_half_life,
        compute epsilon after self.total_steps steps.
        """
        Ei = self.initial_epsilon
        Em = self.min_epsilon
        decay = log(2) / self.epsilon_half_life
        return Em + (Ei - Em) * exp(-decay * self.total_steps)

    def rho(self, reward):
        """
        Compute the reward since the last step.  If the reward is a
        scalar, it is returned unchanged.  If it is a list, it is
        assumed to be a list of rewards accrued at a constant time
        step, and the discounted sum is returned.
        """
        if isinstance(reward, list):
            result = 0
            for r in reward:
                result = self.gamma * result + r
        else:
            result = reward
        return result

    def applicable(self, action, sensation):
        """
        If the given action has a method called 'applicable', return
        the value of action.applicable(sensation); otherwise return
        True.
        """
        if 'applicable' in dir(action):
            return action.applicable(sensation)
        else:
            return True

    def applicable_actions(self, sensation):
        """
        Return a list of the indices of the actions that are
        applicable to the given sensation.
        """
        return [a for a in range(len(self.actions))
                if self.applicable(self.actions[a], sensation)]
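
# --------------------------------------------------------------------
# Illustrative sketch only (not part of the original toolkit): a
# minimal tabular subclass showing the two methods a TDAgent subclass
# must provide, Q() and update_Q().  The class name TabularTDAgent and
# the attribute _q_table are hypothetical; sensations are assumed to
# be hashable, eligibility traces are ignored (as if lambda_ == 0),
# and array() is assumed to be the numeric array constructor already
# imported at the top of this module.
# --------------------------------------------------------------------
class TabularTDAgent(TDAgent):

    def __init__(self, **args):
        super(TabularTDAgent, self).__init__(**args)
        # Q-values keyed by (sensation, action-index), default 0.0.
        self._q_table = {}

    def Q(self, sensation, action=None):
        if action is None:
            # Return an array of Q-values, one per action in self.actions.
            return array([self._q_table.get((sensation, a), 0.0)
                          for a in range(len(self.actions))])
        return self._q_table.get((sensation, action), 0.0)

    def update_Q(self, sensation, action, delta, on_policy=True):
        # The base class passes the raw TD error
        # delta = rho + gamma*Q(s',a') - Q(s,a); the subclass applies
        # the learning rate.
        key = (sensation, action)
        self._q_table[key] = self._q_table.get(key, 0.0) + self.alpha * delta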
class TDAgent(Agent): """ A generic temporal-difference (TD) agent with discrete actions. To create a new TD agent, override this class and implement the methods .Q(sensation,action=None) and .update_Q(sensation,action,delda,on_policy=True). Parameters: alpha -- The learning rate, default = 0.1 gamma -- The discount factor, default = 1.0 lambda_ -- The eligibility discount factor, default = 0.0. step_method -- The method for doing TD updates: 'sarsa' or 'q_learning'. default = 'sarsa' action_selection -- The action selection method, default 'epsilon_greedy'. To change action selection, set this to the name of the new method, e.g. 'softmax'. initial_epsilon -- The starting epsilon for epsilon_greedy selection. (default=0.1) min_epsilon -- The minimum (final) epsilon. (default = 0.0) epsilon_half_life -- The half-life for epsilon annealing. (default = 1) initial_temperature -- The starting temperature for softmax (Boltzman distribution) selection. (default = 1.0) min_temperature -- The min (final) temperature for softmax selection. (default = 0.01) temperature_half_life -- The temperature half-life for softmax selection (default = 1) actions -- The list of available actions - can be any Python object that is understood as an action by the environment """ alpha = Magnitude(default=0.1) gamma = Magnitude(default=1.0) lambda_ = Magnitude(default=0.0) step_method = Parameter(default="sarsa") action_selection = Parameter(default="epsilon_greedy") # epsilon-greedy selection parameters initial_epsilon = Magnitude(default=0.1) min_epsilon = Magnitude(default=0.0) epsilon_half_life = Number(default=1, bounds=(0, None)) # softmax selection parameters initial_temperature = Number(default=1.0, bounds=(0, None)) min_temperature = Number(default=0.01, bounds=(0, None)) temperature_half_life = Number(default=1, bounds=(0, None)) actions = Parameter(default=[]) prune_eligibility = Magnitude(default=0.001) replacing_traces = Parameter(default=True) history_log = Parameter(default=None) allow_learning = Parameter(default=True) def __init__(self, **args): from plastk.utils import LogFile super(TDAgent, self).__init__(**args) self.nopickle.append('policy_fn') self.policy_fn = getattr(self, self.action_selection) self.total_steps = 0 if isinstance(self.history_log, str): self._history_file = LogFile(self.history_log) elif isinstance(self.history_log, file) or isinstance( self.history_log, LogFile): self._history_file = self.history_log def unpickle(self): """ Called automatically when the agent is unpickled. Sets the action-selection function to its appropriate value. """ super(TDAgent, self).unpickle() self.policy_fn = getattr(self, self.action_selection) def __call__(self, sensation, reward=None): """ Do a step. Calls the function selected in self.step_method and returns the action. """ step_fn = getattr(self, self.step_method + '_step') action_index = step_fn(sensation, reward) if self.history_log: if reward is None: self._history_file.write('start\n') self._history_file.write( ` sensation ` + '\n') self._history_file.write( ` reward ` + '\n') if not is_terminal(sensation): self._history_file.write( ` action_index ` + '\n') return self.actions[action_index] def Q(self, sensation, action=None): """ Return Q(s,a). If action is None, return an array of Q-values for each action in self.actions with the given sensation. You must override this method to implement a TDAgent subclass. """ raise NYI def update_Q(self, sensation, action, delta, on_policy=True): """ Update Q(sensation,action) by delta. 
on_policy indicates whether the step that produced the update was on- or off-policy. Any eligibility trace updates should be done from within this method. You must override this method to implement a TDAgent subclass. """ raise NYI def sarsa_step(self, sensation, reward=None): """ Do a step using the SARSA update method. Selects an action, computes the TD update and calls self.update_Q. Returns the agent's next action. """ if reward == None: return self._start_episode(sensation) rho = self.rho(reward) next_action = self.policy(sensation) if is_terminal(sensation): value = 0 else: value = self.Q(sensation, next_action) last_value = self.Q(self.last_sensation, self.last_action) delta = rho + (self.gamma * value - last_value) self.verbose("controller step = %d, rho = %.2f" % (self.total_steps, rho)) self.verbose(("Q(t-1) = %.5f, Q(t) = %.5f, diff = %.5f," + "delta = %.5f, terminal? = %d") % (last_value, value, value - last_value, delta, is_terminal(sensation))) if self.allow_learning: self.update_Q(self.last_sensation, self.last_action, delta) self.last_sensation = sensation self.last_action = next_action if isinstance(reward, list): self.total_steps += len(reward) else: self.total_steps += 1 return next_action def q_learning_step(self, sensation, reward=None): """ Do a step using Watkins' Q(\lambda) update method. Selects an action, computes the TD update and calls self._q_learning_training. Returns the agent's next action. """ if reward == None: return self._start_episode(sensation) if self.allow_learning: self._q_learning_training(self.last_sensation, self.last_action, reward, sensation) self.last_sensation = sensation self.last_action = self.policy(sensation) if isinstance(reward, list): self.total_steps += len(reward) else: self.total_steps += 1 return self.last_action def _q_learning_training(self, sensation, action, reward, next_sensation): """ Do a single Q-lambda training step given (s,a,r,s'). Can be called from outside the q_learning_step method for off-policy training, experience replay, etc. """ rho = self.rho(reward) last_Q = self.Q(sensation) last_value = last_Q[action] if is_terminal(next_sensation): value = 0 else: value = max(self.Q(next_sensation)) delta = rho + (self.gamma * value - last_value) self.verbose( "r = %.5f, Q(t-1) = %.5f, Q(t) = %.5f, diff = %.5f, delta = %.5f, terminal? = %d" % (rho, last_value, value, value - last_value, delta, is_terminal(next_sensation))) self.update_Q(sensation, action, delta, on_policy=(last_Q[action] == max(last_Q))) if delta: assert (self.Q(sensation, action) - last_value) / delta < 1.0 def _start_episode(self, sensation): """ Start a new episode. Called from self.__call__ when the reward is None. """ self.last_sensation = sensation self.last_action = self.policy(sensation) return self.last_action def policy(self, sensation): """ Given a sensation, return an action. Uses self.action_selection to get a distribution over the agent's actions. Uses self.applicable_actions to prevent selecting inapplicable actions. Returns 0 if is_terminal(sensation). """ if not is_terminal(sensation): actions = self.applicable_actions(sensation) return actions[weighted_sample(self.policy_fn(sensation, actions))] else: # In the terminal state, the action is irrelevant return 0 def epsilon_greedy(self, sensation, applicable_actions): """ Given self.epsilon() and self.Q(), return a distribution over applicable_actions as an array where each element contains the a probability mass for the corresponding action. I.e. 
The action with the highest Q gets p = self.epsilon() and the others get the remainder of the mass, uniformly distributed. """ Q = array([self.Q(sensation, action) for action in applicable_actions]) # simple epsilon-greedy policy # get a vector with a 1 where each max element is, zero elsewhere mask = (Q == mmax(Q)) num_maxes = len(nonzero(mask)) num_others = len(mask) - num_maxes if num_others == 0: return mask e0 = self.epsilon() / num_maxes e1 = self.epsilon() / num_others result = zeros(len(mask)) + 0.0 putmask(result, mask, 1 - e0) putmask(result, mask == 0, e1) return result def softmax(self, sensation, applicable_actions): """ Given self.temperature() and self.Q(), return a Bolzman distribution over applicable_actions as an array where each element contains the a probability mass for the corresponding action. """ temp = self.temperature() self.verbose("softmax, temperature = %.3f" % temp) Q = array([self.Q(sensation, action) for action in applicable_actions]) return softmax(Q, temp) def normalized_softmax(self, sensation, applicable_actions): """ Like softmax, except that the Q values are scaled into the range [0,1]. May make setting the initial temperature easier than with softmax. """ temp = self.temperature() self.verbose("softmax, temperature = %.3f" % temp) Q = array([self.Q(sensation, action) for action in applicable_actions]) return softmax(normalize_minmax(Q), temp) def temperature(self): """ Using initial_temperature, min_temperature, and temperature_half_life, compute the temperature after self.total_steps, steps. """ Ti = self.initial_temperature Tm = self.min_temperature decay = log(2) / self.temperature_half_life return Tm + (Ti - Tm) * exp(-decay * self.total_steps) def epsilon(self): """ Using initial_epsilon, min_epsilon, and epsilon_half_life, compute epsilon after self.total_steps, steps. """ Ei = self.initial_epsilon Em = self.min_epsilon decay = log(2) / self.epsilon_half_life return Em + (Ei - Em) * exp(-decay * self.total_steps) def rho(self, reward): """ Compute the reward since the last step. IF the reward is a scalar, it is returned unchanged. If reward is a list, it is assumed to be a list of rewards accrued at a constant time step, and the discounted sum is returned. """ if isinstance(reward, list): result = 0 for r in reward: result = self.gamma * result + r else: result = reward return result def applicable(self, action, sensation): """ If the given action has a method called 'applicable' return the value of action.applicable(sensation), otherwise return True. """ if 'applicable' in dir(action): return action.applicable(sensation) else: return True def applicable_actions(self, sensation): """ Return a list of the actions that are applicable to the given sensation. """ return [ a for a in range(len(self.actions)) if self.applicable(self.actions[a], sensation) ]
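
# --------------------------------------------------------------------
# Illustrative sketch only: driving an agent through one episode.
# The environment protocol assumed here (env.start() returning the
# initial sensation, env.step(action) returning (sensation, reward))
# is hypothetical and not part of plastk; is_terminal() is the
# function already used above.  The agent is called with reward=None
# to start an episode, then with each (sensation, reward) pair,
# including the terminal sensation and final reward.
# --------------------------------------------------------------------
def run_episode(agent, env):
    sensation = env.start()
    action = agent(sensation)              # reward=None starts the episode
    while not is_terminal(sensation):
        sensation, reward = env.step(action)
        action = agent(sensation, reward)  # learn from the reward, pick next action
    return agent.total_steps

# Example construction with the hypothetical subclass above:
#
#   agent = TabularTDAgent(actions=range(4), alpha=0.1, gamma=0.9,
#                          action_selection='epsilon_greedy',
#                          initial_epsilon=0.5, min_epsilon=0.01,
#                          epsilon_half_life=1000)
#
# With these settings agent.epsilon() anneals as
#   min_epsilon + (initial_epsilon - min_epsilon) * exp(-ln(2)/half_life * total_steps),
# i.e. roughly 0.255 after 1000 steps and 0.13 after 2000 steps.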