def gen_encoders(self, N, contextD, context_scale):
    """Generate encoders for state population of learning agent.

    :param N: number of neurons in state population
    :param contextD: dimension of context vector representation
    :param context_scale: weight on context representation relative to
        state (1.0 = equal weighting)
    """

    if contextD > 0:
        contexts = MU.I(contextD)
    else:
        contexts = [[]]

    # neurons each sensitive to different combinations of stimuli
    encs = (list(MU.I(self.stateD)) +
            [o + s + c
             for o in MU.I(self.num_orientations)
             for s in MU.I(self.num_shapes)
             for c in MU.I(self.num_colours)])

    return [HRLutils.normalize(
            HRLutils.normalize(random.choice(encs)) +
            [x * context_scale for x in random.choice(contexts)])
            for _ in range(N)]
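# A minimal standalone sketch of the encoder scheme above, with numpy
# standing in for the Nengo MU/HRLutils helpers (MU.I builds an identity
# matrix; HRLutils.normalize rescales to unit length). The function name
# and all dimension values here are illustrative assumptions, not the
# values used in the model.
import random

import numpy as np


def _normalize(v):
    # rescale to unit length (zero vectors pass through unchanged)
    v = np.asarray(v, dtype=float)
    n = np.linalg.norm(v)
    return v / n if n > 0 else v


def gen_encoders_sketch(N, num_orientations=3, num_shapes=3, num_colours=3,
                        contextD=2, context_scale=1.0):
    # the state space is the concatenation of the three feature blocks
    stateD = num_orientations + num_shapes + num_colours

    # one-hot context vectors (rows of an identity matrix), or a single
    # empty context when no context representation is used
    contexts = list(np.eye(contextD)) if contextD > 0 else [np.zeros(0)]

    # candidate encoders: one-hot vectors in the full state space, plus
    # vectors with one active unit per feature block (one per
    # orientation/shape/colour combination)
    encs = list(np.eye(stateD))
    for o in np.eye(num_orientations):
        for s in np.eye(num_shapes):
            for c in np.eye(num_colours):
                encs.append(np.concatenate([o, s, c]))

    # each neuron: a random stimulus encoder with a randomly chosen,
    # scaled context vector appended, renormalized to unit length
    return [_normalize(np.concatenate(
                [_normalize(random.choice(encs)),
                 context_scale * random.choice(contexts)]))
            for _ in range(N)]


# e.g. five encoders in a (9 state + 2 context)-dimensional space
print(np.array(gen_encoders_sketch(5)).shape)  # (5, 11)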
def tick(self):
    cond_active = False

    for c in self.conds:
        if isinstance(c, Timer):
            # if it is a timer entry, just update the timer and check
            # if it has expired
            c.tick()
            if c.ring():
                self.reward = self.rewardval
                self.activate()
                c.reset()
                cond_active = True
        elif (self.env.is_in(self.env.state, c) and
              (self.conds[c] is None or
               HRLutils.similarity(HRLutils.normalize(self.context),
                                   self.conds[c]) > 0.3)):
            # if it is a state entry, check if the agent is in the
            # region associated with that state, and check if that
            # region is the one corresponding to the currently selected
            # context
            self.reward = self.rewardval

            self.rewardamount += 1
            if self.rewardamount > self.rewardresetamount:
                self.activate()
                self.rewardamount = 0

            cond_active = True

    # if no termination conditions met, just give default reward
    if not cond_active:
        self.reward = self.defaultreward

    # reset rewardamount when the reset signal is sent (so that there
    # won't be any leftover rewardamount from the agent's previous
    # decision)
    if self.resettime[0] < self.t < self.resettime[1]:
        self.rewardamount = 0

    # add a penalty if the state hasn't changed (to help prevent agent
    # from getting stuck)
    if sum(self.prev_state) != 0 and \
            HRLutils.similarity(HRLutils.normalize(self.env.state),
                                HRLutils.normalize(self.prev_state)) < 1.0:
        # the state has changed, so clear any accumulated penalty
        self.state_penalty = 0.0
    else:
        # state unchanged (or not yet initialized), so ramp the penalty
        self.state_penalty += 0.0001
    self.prev_state = copy.deepcopy(self.env.state)

    self.reward = self.reward - self.state_penalty
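# A standalone sketch of the stuck-state penalty bookkeeping in tick()
# above: the penalty grows by a small increment on every tick where the
# state is unchanged and is cleared as soon as the state moves. The
# similarity() helper here is a plain cosine similarity standing in for
# HRLutils.similarity, and the two-dimensional states are illustrative.
import numpy as np


def similarity(a, b):
    # cosine similarity of two nonzero vectors
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


penalty = 0.0
prev_state = np.zeros(2)
for state in [np.array([1.0, 0.0])] * 3 + [np.array([0.0, 1.0])]:
    if np.sum(prev_state) != 0 and similarity(state, prev_state) < 1.0:
        # the state changed, so clear any accumulated penalty
        penalty = 0.0
    else:
        # state unchanged (or first tick), so ramp the penalty up
        penalty += 0.0001
    prev_state = state.copy()
    print(penalty)  # 0.0001, 0.0002, 0.0003, then 0.0 after the move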
def termination_context(self, c, pstc=0.01):
    # decode the context signal by picking the stored context whose
    # vector is most similar (highest dot product) to the input
    self.context = max(self.contexts,
                       key=lambda x: MU.prod(HRLutils.normalize(c),
                                             HRLutils.normalize(x[1])))
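# A minimal sketch of the context-selection rule above: the noisy decoded
# context signal c is matched against the stored (name, vector) contexts,
# and the one with the highest normalized dot product wins. The context
# names and vectors here are illustrative assumptions.
import numpy as np


def _normalize(v):
    n = np.linalg.norm(v)
    return v / n if n > 0 else v


contexts = [("orientation", np.array([1.0, 0.0])),
            ("colour", np.array([0.0, 1.0]))]

c = np.array([0.9, 0.2])  # noisy context vector decoded from the network
best = max(contexts,
           key=lambda x: np.dot(_normalize(c), _normalize(x[1])))
print(best[0])  # "orientation"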
def termination_action(self, a, pstc=0.01, dimensions=3):
    # store the currently selected action vector (normalized)
    self.action = HRLutils.normalize(a)