class UniformTiledAgent(LinearListAgent):
    """
    A LinearTDAgent subclass for continuous state spaces that
    automatically tiles the input space.  For high-dimensional inputs,
    the input can be separated into several uniformly distributed
    'receptive fields' (rfs) that may overlap, and each rf is tiled
    separately.

    Parameters:

    num_rfs      -- The number of receptive fields to use (default=1)
    rf_width     -- The width of the receptive fields
                    (default = D/num_rfs, where D = input dimensionality)
    num_tilings  -- The number of tilings to use for each rf.
    tile_width   -- The width of each tile.
    num_features -- The total combined memory size for all rfs.

    Each separate rf is assumed to use the same tiling parameters.

    Examples:

    D = 9, num_rfs = 3, rf_width = <default> will give the following

              |-rf0-|     |-rf2-|
    Features: [ 0 1 2 3 4 5 6 7 8 ]
                    |-rf1-|

    D = 10, num_rfs = 3, rf_width = 4 will give the following

              |--rf0--|   |--rf2--|
    Features: [ 0 1 2 3 4 5 6 7 8 9 ]
                  |--rf1--|

    RF placements are determined with the function place_rfs.
    """
    num_rfs = Integer(default=1, bounds=(1, None))
    rf_width = Parameter(None)
    num_tilings = Integer(default=1, bounds=(1, None))
    tile_width = Number(default=1)

    def __init__(self, **args):
        super(UniformTiledAgent, self).__init__(**args)
        if not self.rf_width:
            self.rf_width = self.num_features / self.num_rfs

    def __call__(self, sensation, reward=None):
        if not is_terminal(sensation):
            sensation = tile_uniform_rfs(array(sensation) / self.tile_width,
                                         self.num_rfs,
                                         self.rf_width,
                                         self.num_tilings,
                                         self.num_features / self.num_rfs)
        return super(UniformTiledAgent, self).__call__(sensation, reward)
class NoveltySOM(SOM):
    """
    A SOM that scales its learning rate and neighborhood radius by a
    saturating function of the relative quantization error of each
    input: familiar (well-quantized) inputs cause little adaptation,
    while novel inputs receive close to the full SOM rate.
    """
    alpha_gain = Number(default=2.0, bounds=(0.0, None))
    radius_gain = Number(default=2.0, bounds=(0.0, None))

    def __init__(self, **params):
        SOM.__init__(self, **params)
        self.error_ratio = 1.0

    def present_input(self, X):
        SOM.present_input(self, X)
        dist = norm(self.get_model_vector(self.winner()) - X)
        self.error_ratio = dist / norm(X)

    def alpha(self):
        return SOM.alpha(self) * tanh(self.error_ratio * self.alpha_gain)

    def radius(self):
        return SOM.radius(self) * tanh(self.error_ratio * self.radius_gain)
class EquilibriumGNG(GNG):
    """
    A GNG that adds new units only when the average accumulated unit
    error (scaled by beta) exceeds error_threshold, in addition to the
    parent class's usual growth schedule.
    """
    error_threshold = Number(default=1.0, bounds=(0, None))

    def time_to_grow(self):
        from numpy import average
        e = average(self.error * self.beta)
        result = (e > self.error_threshold
                  and super(EquilibriumGNG, self).time_to_grow())
        if result:
            self.verbose("average error = %.4e" % e, " -- Time to grow.")
        else:
            self.debug("average error = %.4e" % e, " -- Not growing.")
        return result
class TabularTDAgent(TDAgent):
    """
    A TDAgent for environments with discrete states and actions.
    Sensations/states can be any hashable Python object, and the
    universe of sensations need not be specified in advance.  The agent
    stores and updates a separate Q estimate for every (s,a) pair.

    Parameters:

    initial_q -- The initial Q estimate for each (s,a) pair.
                 (default = 0.0)
    """
    initial_q = Number(default=0.0)

    def __init__(self, **params):
        super(TabularTDAgent, self).__init__(**params)
        self.reset_q()
        self.reset_e()

    def _start_episode(self, sensation):
        self.reset_e()
        return super(TabularTDAgent, self)._start_episode(sensation)

    def reset_q(self):
        self.q_table = {}

    def reset_e(self):
        self.e = {}

    def Q(self, s, a=None):
        if a is None:
            result = [self.Q(s, a) for a in range(len(self.actions))]
        else:
            result = self.q_table.get((s, a), self.initial_q)
        self.debug('Q(', s, ',', a, ') = ', result)
        return result

    def update_Q(self, s, a, delta, on_policy=True):
        if not on_policy:
            self.reset_e()

        if (s, a) not in self.q_table:
            self.q_table[(s, a)] = self.initial_q

        if self.lambda_:
            # decay the traces and prune any that become negligible
            to_be_deleted = []
            for x in self.e:
                self.e[x] *= self.lambda_
                if self.e[x] < self.prune_eligibility:
                    to_be_deleted.append(x)
            for x in to_be_deleted:
                del self.e[x]

        if self.replacing_traces:
            self.e[(s, a)] = 1
        else:
            # accumulating traces: start from 0 if this pair has no trace yet
            self.e[(s, a)] = self.e.get((s, a), 0) + 1

        for x, e in self.e.iteritems():
            self.q_table[x] += self.alpha * e * delta
class LinearTDAgent(TDAgent):
    """
    A TD agent that takes a sensation as a 1D numpy vector of features
    and computes Q as a linear function of that sensation, using simple
    gradient descent.  The function is stored in the weight matrix
    self.w, such that Q(s) can be computed as w*s.

    Assumes a discrete set of actions.  Uses replacing eligibility
    traces.

    Parameters:

    num_features -- The number of input features (default = 1)
    initial_w    -- A scalar value with which to initialize the weight
                    matrix.
    """
    num_features = Integer(default=1, bounds=(1, None))
    initial_w = Number(default=0.0)

    def __init__(self, **params):
        super(LinearTDAgent, self).__init__(**params)
        self.reset_w()
        self.reset_e()

    def _start_episode(self, sensation):
        self.reset_e()
        return super(LinearTDAgent, self)._start_episode(sensation)

    def reset_w(self):
        """
        Reset the weight matrix to self.initial_w.
        """
        self.w = zeros((len(self.actions), self.num_features), 'f') + self.initial_w

    def reset_e(self):
        """
        Reset the eligibility traces for self.w to all zeros.
        """
        self.e = zeros((len(self.actions), self.num_features), 'f') + 0.0

    def Q(self, state, action=None):
        """
        Compute Q(s,a) from w*s.
        """
        if action is None:
            return dot(self.w, state)
        else:
            return dot(self.w[action], state)

    def update_Q(self, sensation, action, delta, on_policy=True):
        """
        Do a linear gradient-descent update of the weights.
        """
        if self.lambda_ and on_policy:
            self.e *= self.lambda_
            if self.prune_eligibility > 0.0:
                self.e *= (self.e > self.prune_eligibility)
        else:
            self.e *= 0.0

        self.e[action] += sensation
        if self.replacing_traces:
            putmask(self.e, self.e > 1, 1)

        # The step size is normalized by the total feature activation,
        # so that (e.g.) tile-coded inputs get a per-active-feature rate.
        self.w += self.e * (self.alpha / sum(sensation)) * delta
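# A minimal usage sketch (assumed, not part of the PLASTK sources): a
# LinearTDAgent fed one-hot feature vectors acts like a small tabular
# learner.  The one_hot() helper and the action names are purely
# illustrative.
#
#     from numpy import zeros
#
#     def one_hot(i, n=4):
#         x = zeros(n, 'f')
#         x[i] = 1.0
#         return x
#
#     agent = LinearTDAgent(num_features=4, actions=['left', 'right'])
#     a = agent(one_hot(0))               # start of episode: no reward argument
#     a = agent(one_hot(1), reward=-1.0)  # later steps pass the last reward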
class TDAgent(Agent):
    """
    A generic temporal-difference (TD) agent with discrete actions.

    To create a new TD agent, override this class and implement the
    methods .Q(sensation,action=None) and
    .update_Q(sensation,action,delta,on_policy=True).

    Parameters:

    alpha   -- The learning rate, default = 0.1
    gamma   -- The discount factor, default = 1.0
    lambda_ -- The eligibility discount factor, default = 0.0.

    step_method -- The method for doing TD updates: 'sarsa' or
                   'q_learning'.  default = 'sarsa'

    action_selection -- The action selection method, default
                        'epsilon_greedy'.  To change action selection,
                        set this to the name of the new method,
                        e.g. 'softmax'.

    initial_epsilon   -- The starting epsilon for epsilon_greedy
                         selection.  (default = 0.1)
    min_epsilon       -- The minimum (final) epsilon.  (default = 0.0)
    epsilon_half_life -- The half-life for epsilon annealing.
                         (default = 1)

    initial_temperature   -- The starting temperature for softmax
                             (Boltzmann distribution) selection.
                             (default = 1.0)
    min_temperature       -- The minimum (final) temperature for softmax
                             selection.  (default = 0.01)
    temperature_half_life -- The temperature half-life for softmax
                             selection.  (default = 1)

    actions -- The list of available actions.  Each action can be any
               Python object that is understood as an action by the
               environment.
    """
    alpha = Magnitude(default=0.1)
    gamma = Magnitude(default=1.0)
    lambda_ = Magnitude(default=0.0)

    step_method = Parameter(default="sarsa")
    action_selection = Parameter(default="epsilon_greedy")

    # epsilon-greedy selection parameters
    initial_epsilon = Magnitude(default=0.1)
    min_epsilon = Magnitude(default=0.0)
    epsilon_half_life = Number(default=1, bounds=(0, None))

    # softmax selection parameters
    initial_temperature = Number(default=1.0, bounds=(0, None))
    min_temperature = Number(default=0.01, bounds=(0, None))
    temperature_half_life = Number(default=1, bounds=(0, None))

    actions = Parameter(default=[])

    prune_eligibility = Magnitude(default=0.001)
    replacing_traces = Parameter(default=True)

    history_log = Parameter(default=None)
    allow_learning = Parameter(default=True)

    def __init__(self, **args):
        from plastk.misc.utils import LogFile
        super(TDAgent, self).__init__(**args)
        self.nopickle.append('policy_fn')
        self.policy_fn = getattr(self, self.action_selection)
        self.total_steps = 0
        if isinstance(self.history_log, str):
            self._history_file = LogFile(self.history_log)
        elif isinstance(self.history_log, file) or isinstance(self.history_log, LogFile):
            self._history_file = self.history_log

    def unpickle(self):
        """
        Called automatically when the agent is unpickled.  Sets the
        action-selection function to its appropriate value.
        """
        super(TDAgent, self).unpickle()
        self.policy_fn = getattr(self, self.action_selection)

    def __call__(self, sensation, reward=None):
        """
        Do a step.  Calls the function selected in self.step_method and
        returns the action.
        """
        training_fn = getattr(self, '_' + self.step_method + '_training')
        action_index = self.learning_step(training_fn, sensation, reward)

        if self.history_log:
            if reward is None:
                self._history_file.write('start\n')
            self._history_file.write(`sensation` + '\n')
            self._history_file.write(`reward` + '\n')
            if not is_terminal(sensation):
                self._history_file.write(`action_index` + '\n')

        return self.actions[action_index]

    def Q(self, sensation, action=None):
        """
        Return Q(s,a).  If action is None, return an array of Q-values
        for each action in self.actions with the given sensation.

        You must override this method to implement a TDAgent subclass.
        """
        raise NYI

    def update_Q(self, sensation, action, delta, on_policy=True):
        """
        Update Q(sensation,action) by delta.  on_policy indicates
        whether the step that produced the update was on- or
        off-policy.  Any eligibility trace updates should be done from
        within this method.

        You must override this method to implement a TDAgent subclass.
        """
        raise NYI

    def learning_step(self, training_fn, sensation, reward=None):
        """
        Do a step using the specified learning algorithm.  Selects an
        action, computes the update, and calls the appropriate training
        routine.  Returns the index of the agent's next action.
        """
        if reward is None:
            return self._start_episode(sensation)

        next_action = self.policy(sensation)

        if self.allow_learning:
            training_fn(self.last_sensation, self.last_action,
                        reward, sensation, next_action)

        self.last_sensation = sensation
        self.last_action = next_action

        if isinstance(reward, list):
            self.total_steps += len(reward)
        else:
            self.total_steps += 1

        return self.last_action

    def _sarsa_training(self, sensation, action, reward, next_sensation, next_action):
        """
        Perform a single SARSA training step given (s,a,r,s',a').
        """
        rho = self.rho(reward)

        if is_terminal(next_sensation):
            value = 0
        else:
            value = self.Q(next_sensation, next_action)

        last_value = self.Q(sensation, action)
        delta = rho + (self.gamma * value - last_value)

        self.verbose("controller step = %d, rho = %.2f" % (self.total_steps, rho))
        self.verbose(("Q(t-1) = %.5f, Q(t) = %.5f, diff = %.5f," +
                      " delta = %.5f, terminal? = %d")
                     % (last_value, value, value - last_value, delta,
                        is_terminal(next_sensation)))

        self.update_Q(sensation, action, delta)

    def _q_learning_training(self, sensation, action, reward, next_sensation, next_action=None):
        """
        Do a single Q-lambda training step given (s,a,r,s').  Can be
        called from outside the q_learning_step method for off-policy
        training, experience replay, etc.
        """
        rho = self.rho(reward)
        last_Q = self.Q(sensation)
        last_value = last_Q[action]

        if is_terminal(next_sensation):
            value = 0
        else:
            value = max(self.Q(next_sensation))

        delta = rho + (self.gamma * value - last_value)

        self.verbose("r = %.5f, Q(t-1) = %.5f, Q(t) = %.5f, diff = %.5f, delta = %.5f, terminal? = %d"
                     % (rho, last_value, value, value - last_value, delta,
                        is_terminal(next_sensation)))

        self.update_Q(sensation, action, delta,
                      on_policy=(last_Q[action] == max(last_Q)))

        if delta:
            assert (self.Q(sensation, action) - last_value) / delta < 1.0

    def _start_episode(self, sensation):
        """
        Start a new episode.  Called from self.__call__ when the reward
        is None.
        """
        self.last_sensation = sensation
        self.last_action = self.policy(sensation)
        return self.last_action

    def policy(self, sensation):
        """
        Given a sensation, return the index of an action.  Uses
        self.action_selection to get a distribution over the agent's
        actions, and self.applicable_actions to prevent selecting
        inapplicable actions.  Returns 0 if is_terminal(sensation).
        """
        if not is_terminal(sensation):
            actions = self.applicable_actions(sensation)
            return actions[weighted_sample(self.policy_fn(sensation, actions))]
        else:
            # In the terminal state, the action is irrelevant.
            return 0

    def epsilon_greedy(self, sensation, applicable_actions):
        """
        Given self.epsilon() and self.Q(), return a distribution over
        applicable_actions as an array where each element contains the
        probability mass for the corresponding action: the action with
        the highest Q gets probability 1 - epsilon, and the remaining
        epsilon is distributed uniformly among the other actions.
        """
        Q = array([self.Q(sensation, action) for action in applicable_actions])

        # simple epsilon-greedy policy:
        # get a vector with a 1 where each max element is, zero elsewhere
        mask = (Q == mmax(Q))

        num_maxes = len(nonzero(mask))
        num_others = len(mask) - num_maxes

        if num_others == 0:
            return mask

        e0 = self.epsilon() / num_maxes
        e1 = self.epsilon() / num_others

        result = zeros(len(mask)) + 0.0
        putmask(result, mask, 1 - e0)
        putmask(result, mask == 0, e1)
        return result

    def softmax(self, sensation, applicable_actions):
        """
        Given self.temperature() and self.Q(), return a Boltzmann
        distribution over applicable_actions as an array where each
        element contains the probability mass for the corresponding
        action.
        """
        temp = self.temperature()
        self.verbose("softmax, temperature = %.3f" % temp)
        Q = array([self.Q(sensation, action) for action in applicable_actions])
        return softmax(Q, temp)

    def normalized_softmax(self, sensation, applicable_actions):
        """
        Like softmax, except that the Q values are scaled into the
        range [0,1].  May make setting the initial temperature easier
        than with softmax.
        """
        temp = self.temperature()
        self.verbose("softmax, temperature = %.3f" % temp)
        Q = array([self.Q(sensation, action) for action in applicable_actions])
        return softmax(normalize_minmax(Q), temp)

    def temperature(self):
        """
        Using initial_temperature, min_temperature, and
        temperature_half_life, compute the temperature after
        self.total_steps steps.
        """
        Ti = self.initial_temperature
        Tm = self.min_temperature
        decay = log(2) / self.temperature_half_life
        return Tm + (Ti - Tm) * exp(-decay * self.total_steps)

    def epsilon(self):
        """
        Using initial_epsilon, min_epsilon, and epsilon_half_life,
        compute epsilon after self.total_steps steps.
        """
        Ei = self.initial_epsilon
        Em = self.min_epsilon
        decay = log(2) / self.epsilon_half_life
        return Em + (Ei - Em) * exp(-decay * self.total_steps)

    def rho(self, reward):
        """
        Compute the reward since the last step.  If the reward is a
        scalar, it is returned unchanged.  If reward is a list, it is
        assumed to be a list of rewards accrued at a constant time
        step, and the discounted sum is returned.
        """
        if isinstance(reward, list):
            result = 0
            for r in reward:
                result = self.gamma * result + r
        else:
            result = reward
        return result

    def applicable(self, action, sensation):
        """
        If the given action has a method called 'applicable', return
        the value of action.applicable(sensation); otherwise return
        True.
        """
        if 'applicable' in dir(action):
            return action.applicable(sensation)
        else:
            return True

    def applicable_actions(self, sensation):
        """
        Return the indices of the actions that are applicable to the
        given sensation.
        """
        return [a for a in range(len(self.actions))
                if self.applicable(self.actions[a], sensation)]
class SOM(VQ):
    """
    A self-organizing map (SOM) with a rectangular lattice of model
    vectors.
    """
    dim = Integer(default=2, bounds=(0, None))
    xdim = Integer(default=1, bounds=(0, None))
    ydim = Integer(default=1, bounds=(0, None))

    rmin = Number(default=0.0)
    rmax = Number(default=1.0)

    alpha_0 = Magnitude(default=0.5)
    radius_0 = Number(default=1.0, bounds=(0.0, None))

    response_exponent = Number(default=2)

    def __init__(self, **params):
        super(SOM, self).__init__(**params)
        self.weights = rand.uniform(self.rmin, self.rmax,
                                    (self.ydim, self.xdim, self.dim))
        self.activation = numpy.zeros((self.ydim, self.xdim), 'f')
        self.count = 0

    ###########################################
    def init_training(self, alpha_0=None, radius_0=None, training_length=None):
        self.count = 0
        if alpha_0:
            self.alpha_0 = alpha_0
        if radius_0:
            self.radius_0 = radius_0
        self.half_life = training_length / 8

    def alpha(self):
        return self.alpha_0 * decay(self.count, self.half_life)

    def radius(self):
        return self.radius_0 * decay(self.count, self.half_life)

    ##########################################
    def present_input(self, X):
        for y in range(self.ydim):
            for x in range(self.xdim):
                self.activation[y, x] = norm(X - self.weights[y, x]) ** self.response_exponent

        self.activation = 1 / self.activation

        if inf in self.activation:
            # An exact match (zero distance) gives infinite activation:
            # give the winner all the activation and the rest none, to
            # avoid inf/inf when normalizing.
            win = self.winner()
            self.activation.flat[win] = 0
            self.activation -= self.activation
            self.activation.flat[win] = 1.0
        else:
            self.activation /= sum(self.activation.flat)

    def train(self, X):
        self.present_input(X)
        wy, wx = self.winner_coords()
        self.debug("Winner coords = " + `(wy, wx)`)

        int_radius = int(numpy.floor(self.radius()))
        self.debug("Training radius = %.2f" % self.radius())

        ymin = max(0, wy - int_radius)
        ymax = min(wy + int_radius, self.ydim - 1)
        xmin = max(0, wx - int_radius)
        xmax = min(wx + int_radius, self.xdim - 1)

        self.debug('y range = ' + `(ymin, ymax)`)
        self.debug('x range = ' + `(xmin, xmax)`)

        for y in range(ymin, ymax + 1):
            for x in range(xmin, xmax + 1):
                lattice_dist = sqrt((wx - x) ** 2 + (wy - y) ** 2)
                self.debug("Trying cell %d,%d" % (x, y))
                if lattice_dist <= self.radius():
                    self.debug("Training cell %d,%d" % (x, y))
                    rate = self.alpha() * gaussian(lattice_dist, self.radius())
                    self.weights[y, x] += rate * (X - self.weights[y, x])
        self.count += 1

    def train_batch(self, data, epochs):
        self.init_training(training_length=len(data) * epochs)
        for i in xrange(epochs):
            self.message("Starting epoch", i)
            for x in rand.shuffle(data):
                self.train(x)

    def winner(self):
        return numpy.argmax(self.activation.flat)

    def winner_coords(self):
        pos = numpy.argmax(self.activation.flat)
        # activation is row-major with shape (ydim, xdim)
        return (pos / self.xdim, pos % self.xdim)

    def get_model_vector(self, index):
        if type(index) == int:
            y = index / self.xdim
            x = index % self.xdim
        else:
            # assume it's a tuple
            x, y = index
        return self.weights[y, x]

    def num_model_vectors(self):
        return len(self.activation.flat)

    def get_activation(self):
        return self.activation.flat
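# A minimal usage sketch (assumed, not part of the PLASTK sources): train a
# 10x10 SOM on random 3-D inputs and look up the winning model vector for a
# probe input.  The training schedule below (one pass over 2000 samples) is
# purely illustrative.
#
#     import numpy
#
#     som = SOM(xdim=10, ydim=10, dim=3, radius_0=5.0)
#     data = [numpy.random.uniform(0, 1, 3) for i in range(2000)]
#
#     som.init_training(training_length=len(data))
#     for x in data:
#         som.train(x)
#
#     som.present_input(numpy.array([0.5, 0.5, 0.5]))
#     wy, wx = som.winner_coords()
#     print (wy, wx), som.weights[wy, wx]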
class GNG(VQ):
    """
    A Growing Neural Gas vector quantizer, after Fritzke (1996).
    """
    dim = Integer(default=2, bounds=(1, None))
    rmin = Number(default=0.0)
    rmax = Number(default=1.0)

    e_b = Magnitude(default=0.05)
    e_n = Magnitude(default=0.0006)
    lambda_ = Integer(default=200, bounds=(1, None))
    beta = Magnitude(default=0.0005)
    alpha = Magnitude(default=0.5)
    max_age = Integer(default=100, bounds=(1, None))

    response_exponent = Number(default=2)
    activation_function = Parameter(default='reciprocal')

    grow_callback = Parameter(default=None)
    shrink_callback = Parameter(default=None)

    initial_num_units = Integer(default=2, bounds=(1, None))
    initial_connections_per_unit = Integer(default=0, bounds=(0, None))

    normalize_error = Parameter(default=True)

    def __init__(self, **params):
        from plastk.base.rand import uniform
        from numpy import zeros

        super(GNG, self).__init__(**params)

        N = self.initial_num_units
        self.weights = uniform(self.rmin, self.rmax, (N, self.dim))
        self.dists = zeros((N, 1)) * 0.0
        self.error = zeros((N, 1)) * 0.0
        self.connections = [{} for i in range(N)]
        self.last_input = zeros(self.dim)

        self.count = 0

        if self.initial_connections_per_unit > 0:
            for w in self.weights:
                self.present_input(w)
                ww = self.winners(self.initial_connections_per_unit + 1)
                i = ww[0]
                for j in ww[1:]:
                    self.add_connection(i, j)

        self.nopickle += ['_activation_fn']
        self.unpickle()

    def unpickle(self):
        self._activation_fn = getattr(self, self.activation_function + '_activation')
        if hasattr(self, 'units'):
            # if the gng has a units attrib, it's the old version,
            # so convert it to the new version.
            self.weights = array([u.weights for u in self.units])
            self.error = array([u.error for u in self.units])
            self.dists = array([u.distance for u in self.units])
            self.connections = []
            for u in self.units:
                conn_dict = {}
                for k, v in u.connections.iteritems():
                    conn_dict[self.units.index(k)] = v
                self.connections.append(conn_dict)
            del self.units

    def get_model_vector(self, i):
        return self.weights[i]

    def num_model_vectors(self):
        return len(self.weights)

    def add_connection(self, a, b):
        if b not in self.connections[a]:
            self.verbose("Adding connection between", a, "and", b)
        # in any case, (re)set the age of the connection to zero
        self.connections[b][a] = 0
        self.connections[a][b] = 0

    def del_connection(self, a, b):
        self.verbose("Deleting connection between", a, "and", b)
        del self.connections[b][a]
        del self.connections[a][b]

    def del_unit(self, x):
        self.verbose("Deleting unit", x)
        if self.shrink_callback:
            self.shrink_callback(x)

        # remove the connections for unit x
        del self.connections[x]

        # iterate through the connection dictionaries, decrementing
        # all the connection indices greater than x
        for i, conn_dict in enumerate(self.connections):
            new_dict = {}
            for k, v in conn_dict.items():
                assert x != k
                if k > x:
                    new_dict[k - 1] = v
                else:
                    new_dict[k] = v
            self.verbose("old connections for unit", i, "=", conn_dict)
            self.verbose("new connections for unit", i, "=", new_dict)
            self.connections[i] = new_dict

        # set up slices for the items before and after item x
        before = slice(0, x)
        after = slice(x + 1, len(self.weights))

        # remove the weights for unit x
        self.weights = join((self.weights[before], self.weights[after]))
        # remove the error accumulator for unit x
        self.error = join((self.error[before], self.error[after]))
        # remove the distance value for unit x
        self.dists = join((self.dists[before], self.dists[after]))

    def present_input(self, X):
        self.dists = matrixnorm(self.weights - X)
        self.last_input = X
        self.new_input = True

    def get_activation(self):
        if self.new_input:
            self._activation_fn()
            self.new_input = False
        return self.__activation

    def reciprocal_activation(self):
        self.__activation = 1 / self.dists

        if inf in self.__activation:
            # an exact match gives infinite activation: give the winner
            # all the activation and the rest none
            win = self.winner()
            self.__activation.flat[win] = 0
            self.__activation -= self.__activation
            self.__activation.flat[win] = 1.0
        else:
            self.__activation /= sum(self.__activation.flat)

        return self.__activation

    def gaussian_activation(self):
        x = self.dists
        radii = zeros(self.dists.shape) * 0.0
        for u, conn_dict in enumerate(self.connections):
            neighbors = take(self.weights, conn_dict.keys())
            radii[u] = average(matrixnorm(neighbors - self.weights[u]))
        self.__activation = gaussian(x, radii / 2)

    def uniform_gaussian_activation(self):
        x = self.dists
        total = 0.0
        count = 0
        for u, conn_dict in enumerate(self.connections):
            neighbors = take(self.weights, conn_dict.keys())
            total += sum(matrixnorm(neighbors - self.weights[u]))
            count += len(conn_dict)
        self.__activation = gaussian(x, (total / count) / 2)

    def exp_abs_activation(self):
        x = self.dists
        total = 0.0
        count = 0
        for u, conn_dict in enumerate(self.connections):
            neighbors = take(self.weights, conn_dict.keys())
            total += sum(matrixnorm(neighbors - self.weights[u]))
            count += len(conn_dict)
        stddev = total / count
        self.__activation = exp(-abs(x / stddev))

    def winner_take_all_activation(self):
        self.__activation = zeros(len(self.dists))
        self.__activation[argmin(self.dists)] = 1.0

    def dot_activation(self):
        self.__activation = dot(self.weights, self.last_input)

    def train(self, X, error=None):
        self.debug("Training on input:", X)
        self.present_input(X)
        self.count += 1

        # (roman-numeral comments from Fritzke's algorithm in
        # B. Fritzke, Unsupervised ontogenetic networks, in Handbook
        # of Neural Computation, IOP Publishing and Oxford University
        # Press, 1996)  [replacing \zeta with X]

        # (iii) Determine units s_1 and s_2 (s_1,s_2 \in A) such that
        #       |w_{s_1} - X| <= |w_c - X| (\forall c \in A)
        #       and
        #       |w_{s_2} - X| <= |w_c - X| (\forall c \in A\\s_1)
        s_1, s_2 = self.winners(2)

        # (iv) If it does not already exist, insert a connection between
        #      s_1 and s_2.  In any case, set the age of the connection
        #      to zero.
        self.add_connection(s_1, s_2)

        # (v) Add the squared distance between the input signal and the
        #     nearest unit in input space to a local error variable
        if error is None:
            error = self.dists[s_1] ** 2
        if self.normalize_error:
            error = sqrt(error) / norm(X)
        self.error[s_1] += error

        # (vi) Move s_1 and its direct topological neighbors towards X
        #      by fractions e_b and e_n, respectively, of the total
        #      distance.
        self.weights[s_1] += self.e_b * (X - self.weights[s_1])
        for n in self.connections[s_1]:
            self.weights[n] += self.e_n * (X - self.weights[n])

        # (vii) Increment the age of all edges emanating from s_1
        for n in self.connections[s_1]:
            self.connections[n][s_1] += 1
            self.connections[s_1][n] += 1

        # (viii) Remove edges with an age larger than max_age....
        for a, connection_dict in enumerate(self.connections):
            for b, age in connection_dict.items():
                if age > self.max_age:
                    self.del_connection(a, b)

        # (viii) ... If this results in units having no emanating
        #        edges, remove them as well.
        to_be_deleted = [a for a, d in enumerate(self.connections) if not d]
        # sort the list in descending order, so deleting lower-numbered
        # units doesn't screw up the connections
        to_be_deleted.sort(reverse=True)
        if to_be_deleted:
            self.verbose("Deleting units", to_be_deleted)
        for a in to_be_deleted:
            self.del_unit(a)

        # (ix) If the number of input signals so far is an integer
        #      multiple of a parameter \lambda, insert a new unit as
        #      follows.
        if self.time_to_grow():

            # o Determine the unit q with the maximum accumulated error.
            # o Interpolate a new unit r from q and its neighbor f with
            #   the largest error variable.
            q, f = self.growth_pair()
            r = len(self.weights)

            new_weights = 0.5 * (self.weights[q] + self.weights[f])
            new_weights.shape = (1, self.dim)
            self.weights = join((self.weights, new_weights), axis=0)

            new_distance = norm(X - new_weights)
            self.dists = join((self.dists, new_distance), axis=0)

            self.connections.append({})

            # o Insert edges connecting the new unit r with units q and
            #   f, and remove the original edge between q and f.
            self.verbose("Adding unit", r, "between", q, "and", f,
                         "--- count =", self.count)
            self.add_connection(q, r)
            self.add_connection(r, f)
            self.del_connection(q, f)

            # o Decrease the error variables of q and f
            self.error[q] += -self.alpha * self.error[q]
            self.error[f] += -self.alpha * self.error[f]

            # o Interpolate the error variable of r from q and f
            new_error = array(0.5 * (self.error[q] + self.error[f]))
            new_error.shape = (1, 1)
            self.error = join((self.error, new_error))

            if self.grow_callback:
                self.grow_callback(q, f)

        # (x) Decrease the error variables of all units
        self.error += -self.beta * self.error

        return

    def winners(self, N=1):
        N = min(N, len(self.dists))
        indices = argsort(self.dists)
        return tuple(indices[:N])

    def winner(self):
        return argmin(self.dists)

    def time_to_grow(self):
        return (self.count % self.lambda_) == 0

    def growth_pair(self):
        def max_error(a, b):
            if self.error[a] > self.error[b]:
                return a
            else:
                return b
        q = reduce(max_error, range(len(self.error)))
        f = reduce(max_error, self.connections[q])
        return q, f

    def neighbors(self, i):
        return self.connections[i].keys()
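# A minimal usage sketch (assumed, not part of the PLASTK sources): grow a
# GNG on 2-D points drawn from the unit square, then check how many model
# vectors were allocated and who the current winner's neighbors are.  The
# parameter values are illustrative only.
#
#     import numpy
#
#     gng = GNG(dim=2, lambda_=100, max_age=50)
#     for i in range(5000):
#         gng.train(numpy.random.uniform(0, 1, 2))
#
#     print gng.num_model_vectors(), "model vectors"
#     print "winner", gng.winner(), "neighbors", gng.neighbors(gng.winner())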
class GridWorld(rl.Environment):
    """
    A simple discrete grid-world environment.  '#' cells are walls,
    'S' is the start position, and 'G' is the goal.
    """
    grid = Parameter(default=["############",
                              "#G.........#",
                              "#..........#",
                              "#..........#",
                              "#..........#",
                              "#..........#",
                              "#..........#",
                              "#..........#",
                              "#..........#",
                              "#..........#",
                              "#.........S#",
                              "############"])

    random_start_pos = Parameter(default=False)
    timeout = Integer(default=0, bounds=(0, None))

    actions = Parameter(default=['N', 'S', 'E', 'W'])
    action_map = Parameter(default={'N': (-1, 0),
                                    'S': (1, 0),
                                    'E': (0, 1),
                                    'W': (0, -1)})
    correct_action_probability = Magnitude(default=1.0)

    step_reward = Number(default=-1)
    goal_reward = Number(default=0)

    start_pos = Parameter(default=None)
    goal_pos = Parameter(default=None)

    crumbs = Parameter(default=False)
    clear_crumbs_on_pose_set = Parameter(default=True)
    recolor_crumbs_on_pose_set = Parameter(default=False)

    count_wall_states = Boolean(default=False)

    def __init__(self, **args):
        super(GridWorld, self).__init__(**args)

        if self.crumbs:
            self.clear_crumbs = False
            self.recolor_crumbs = False
            self.connect_crumbs = True

        for r, row in enumerate(self.grid):
            if len(row) != len(self.grid[0]):
                raise "GridWorld error: grid rows must all be the same length."
            for c, cell in enumerate(row):
                if cell == START:
                    if self.start_pos:
                        raise "GridWorld error: grid has more than one start position."
                    self.start_pos = (r, c)
                elif cell == GOAL:
                    if self.goal_pos:
                        raise "GridWorld error: grid has more than one goal position."
                    self.goal_pos = (r, c)

        self.start_episode()

        if self.count_wall_states:
            self.num_states = sum([len(row) for row in self.grid])
        else:
            self.num_states = sum([row.count(FREE) + row.count(START) + row.count(GOAL)
                                   for row in self.grid])

    def __call__(self, action=None):
        if action is None:
            self.curr_pos = self.start_pos
            self.episode_steps = 0
            self.start_episode()
            return self.state()
        else:
            self.episode_steps += 1
            assert action in self.actions
            r, c = self.curr_pos

            # with probability correct_action_probability take the
            # requested action, otherwise take one of the others
            p = self.correct_action_probability
            N = len(self.actions)
            distr = array([(1 - p) / (N - 1)] * N)
            distr[self.actions.index(action)] = p
            a = utils.weighted_sample(distr)

            dr, dc = self.action_map[self.actions[a]]
            if self.move_okay(r + dr, c + dc):
                r, c = self.curr_pos = (r + dr, c + dc)

            if (r, c) == self.goal_pos:
                self.verbose("!!! GOAL !!!")
                return rl.TERMINAL_STATE, self.goal_reward
            elif self.timeout and self.episode_steps > self.timeout:
                return rl.TERMINAL_STATE, self.step_reward
            else:
                return self.state(), self.step_reward

    def reset_crumbs(self):
        if not self.crumbs:
            return
        if self.clear_crumbs_on_pose_set:
            self.clear_crumbs = True
        if self.recolor_crumbs_on_pose_set:
            self.recolor_crumbs = True
        self.connect_crumbs = False

    def start_episode(self):
        if self.random_start_pos:
            while True:
                r = rand.randint(len(self.grid))
                c = rand.randint(len(self.grid[0]))
                g = self.grid[r][c]
                if g != WALL and g != GOAL:
                    self.curr_pos = self.start_pos = r, c
                    break
        else:
            self.curr_pos = self.start_pos
        self.episode_steps = 0
        self.reset_crumbs()

    def set_route(self, start_pos=None, goal_pos=None):
        if start_pos:
            self.start_pos = start_pos
        if goal_pos:
            self.goal_pos = goal_pos
        self.start_episode()

    def move_okay(self, r, c):
        rbound = len(self.grid)
        cbound = len(self.grid[0])
        return (0 <= r < rbound and
                0 <= c < cbound and
                self.grid[r][c] != WALL)

    def state(self):
        r, c = self.curr_pos
        return r * len(self.grid[0]) + c
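# A minimal usage sketch (assumed, not part of the PLASTK sources): run one
# episode of a TabularTDAgent in the default GridWorld, using the calling
# conventions shown above (env() starts an episode and returns the start
# state; env(action) returns (state, reward); agent(state) starts an episode;
# agent(state, reward) learns and returns the next action).  The step cap is
# illustrative.
#
#     env = GridWorld()
#     agent = TabularTDAgent(actions=env.actions)
#
#     s = env()                     # reset the environment, get the start state
#     a = agent(s)                  # first action; no reward yet
#     for step in range(10000):     # cap, in case the early policy wanders
#         s, r = env(a)
#         a = agent(s, r)           # learn from (s, r) and choose the next action
#         if s is rl.TERMINAL_STATE:
#             break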