def epsilon_greedy(self, sensation, applicable_actions):
    """
    Return an epsilon-greedy probability distribution over
    applicable_actions as an array, where element i holds the
    probability mass for applicable_actions[i].

    The greedy (max-Q) actions share probability 1 - self.epsilon();
    all other actions share the self.epsilon() exploration mass
    uniformly.  If every action is tied for the maximum, the
    distribution is uniform.
    """
    Q = array([self.Q(sensation, a) for a in applicable_actions])
    # Boolean mask marking every action tied for the maximum Q value.
    mask = (Q == Q.max())
    num_actions = len(mask)
    # mask.sum() counts the maxima under both Numeric and numpy;
    # len(nonzero(mask)) only worked under old Numeric (numpy's
    # nonzero returns a tuple, so its len is the rank, always 1 here).
    num_maxes = int(mask.sum())
    num_others = num_actions - num_maxes
    result = zeros(num_actions) + 0.0
    if num_others == 0:
        # All actions tied: greed and exploration coincide -> uniform.
        # (Previously the raw mask was returned, which summed to
        # num_maxes rather than 1 when more than one action was tied.)
        result += 1.0 / num_maxes
        return result
    eps = self.epsilon()
    # Greedy actions split the 1 - eps mass; the rest split eps.
    # (Previously each max got 1 - eps/num_maxes, which only sums
    # to 1 when there is exactly one maximum.)
    result[mask] = (1.0 - eps) / num_maxes
    result[mask == 0] = eps / num_others
    return result
def epsilon_greedy(self, sensation, applicable_actions):
    """
    Return an epsilon-greedy probability distribution over
    applicable_actions as an array, where element i holds the
    probability mass for applicable_actions[i].

    The greedy (max-Q) actions share probability 1 - self.epsilon();
    all other actions share the self.epsilon() exploration mass
    uniformly.  If every action is tied for the maximum, the
    distribution is uniform.
    """
    Q = array([self.Q(sensation, a) for a in applicable_actions])
    # Boolean mask marking every action tied for the maximum Q value.
    mask = (Q == Q.max())
    num_actions = len(mask)
    # mask.sum() counts the maxima under both Numeric and numpy;
    # len(nonzero(mask)) only worked under old Numeric (numpy's
    # nonzero returns a tuple, so its len is the rank, always 1 here).
    num_maxes = int(mask.sum())
    num_others = num_actions - num_maxes
    result = zeros(num_actions) + 0.0
    if num_others == 0:
        # All actions tied: greed and exploration coincide -> uniform.
        # (Previously the raw mask was returned, which summed to
        # num_maxes rather than 1 when more than one action was tied.)
        result += 1.0 / num_maxes
        return result
    eps = self.epsilon()
    # Greedy actions split the 1 - eps mass; the rest split eps.
    # (Previously each max got 1 - eps/num_maxes, which only sums
    # to 1 when there is exactly one maximum.)
    result[mask] = (1.0 - eps) / num_maxes
    result[mask == 0] = eps / num_others
    return result
def update_Q(self, sensation, action, delta, on_policy=True):
    """
    Apply a linear TD weight update using eligibility traces.

    The trace matrix self.e is decayed by lambda (or cleared entirely
    for off-policy or lambda == 0 updates), the current sensation is
    accumulated into the trace row for `action`, and the weights
    self.w move along the traces scaled by the learning rate,
    normalized feature activation, and the TD error `delta`.
    """
    if not (self.lambda_ and on_policy):
        # Off-policy step or lambda == 0: traces carry no history.
        self.e *= 0.0
    else:
        # Decay the existing traces toward zero.
        self.e *= self.lambda_
        if self.prune_eligibility > 0.0:
            # Zero out traces that have decayed below the threshold.
            self.e *= (self.e > self.prune_eligibility)
    # Fold the current features into the chosen action's trace row.
    self.e[action] += sensation
    if self.replacing_traces:
        # Replacing traces: each element is capped at 1.
        putmask(self.e, self.e > 1, 1)
    # Step size is the learning rate over total feature activation.
    step = self.alpha / (sum(sensation))
    self.w += self.e * step * delta
def update_Q(self, sensation, action, delta, on_policy=True):
    """
    Apply a linear TD weight update using eligibility traces.

    The trace matrix self.e is decayed by lambda (or cleared entirely
    for off-policy or lambda == 0 updates), the current sensation is
    accumulated into the trace row for `action`, and the weights
    self.w move along the traces scaled by the learning rate,
    normalized feature activation, and the TD error `delta`.
    """
    if not (self.lambda_ and on_policy):
        # Off-policy step or lambda == 0: traces carry no history.
        self.e *= 0.0
    else:
        # Decay the existing traces toward zero.
        self.e *= self.lambda_
        if self.prune_eligibility > 0.0:
            # Zero out traces that have decayed below the threshold.
            self.e *= (self.e > self.prune_eligibility)
    # Fold the current features into the chosen action's trace row.
    self.e[action] += sensation
    if self.replacing_traces:
        # Replacing traces: each element is capped at 1.
        putmask(self.e, self.e > 1, 1)
    # Step size is the learning rate over total feature activation.
    step = self.alpha / (sum(sensation))
    self.w += self.e * step * delta