#Blob is the project-local log container assumed to be in scope.
class Feather:
    '''Internal logger used by Rollout. Due for a rewrite.'''
    def __init__(self):
        self.expMap = set()
        self.blob = Blob()

    def scrawl(self, iden):
        '''Write logs from one time step

        Args:
            iden: The unique ID used in serialization
        '''
        world, annID, entID, _ = iden
        self.blob.entID = entID
        self.blob.annID = annID
        self.blob.world = world
        #tile = self.tile(stim)
        #self.move(tile, ent.pos)
        #self.action(arguments, atnArgs)

    def tile(self, stim):
        #Extract the tile at the centre of the agent's stimulus crop
        R, C = stim.shape
        rCent, cCent = R // 2, C // 2
        tile = stim[rCent, cCent]
        return tile

    def action(self, arguments, atnArgs):
        #Unpack sampled move/attack logits and indices; nothing is stored yet
        move, attk = arguments
        moveArgs, attkArgs, _ = atnArgs
        moveLogits, moveIdx = moveArgs
        attkLogits, attkIdx = attkArgs

    def move(self, tile, pos):
        #Track visited positions and per-tile-type visit counts
        tile = type(tile.state)
        if pos not in self.expMap:
            self.expMap.add(pos)
            if tile in self.blob.unique:
                self.blob.unique[tile] += 1
        if tile in self.blob.counts:
            self.blob.counts[tile] += 1

    def reward(self, reward):
        self.blob.reward.append(reward)

    def value(self, value):
        self.blob.value.append(float(value))

    def finish(self):
        self.blob.finish()
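#A minimal, self-contained sketch of the centre-tile lookup performed by
#Feather.tile above: for a square (R, C) stimulus crop centred on the agent,
#the agent's own tile sits at (R // 2, C // 2). The array below is
#illustrative only.
import numpy as np

stim = np.arange(49).reshape(7, 7)           #7x7 crop around the agent
R, C = stim.shape
center_tile = stim[R // 2, C // 2]           #== stim[3, 3] == 24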
class Feather:
    def __init__(self):
        self.blob = Blob()

    def scrawl(self, apple, ent, val, reward, lmPunishment):
        self.blob.annID = ent.annID
        self.stats(val, reward, lmPunishment, apple)

    def stats(self, value, reward, lmPunishment, apple):
        self.blob.reward.append(reward)
        self.blob.apples.append(apple)
        self.blob.value.append(float(value))
        self.blob.lmPunishment.append(float(lmPunishment))

    def finish(self):
        self.blob.finish()
class Feather:
    def __init__(self, config):
        #self.expMap = set()
        self.blob = Blob(config)

    def scrawl(self, annID, val, reward, apples, lmPunishment):
        self.blob.annID = annID
        self.stats(val, reward, apples, lmPunishment)

    def stats(self, value, reward, apples, lmPunishment):
        self.blob.reward.append(reward)
        self.blob.apples.append(apples)
        self.blob.value.append(float(value))
        self.blob.lmPunishment.append(float(lmPunishment))

    def finish(self):
        self.blob.finish()
class Feather:
    def __init__(self, config):
        self.blob = Blob(config)

    def scrawl(self, ent, val, reward, lmPunishment, attack, contact):
        self.blob.annID = ent.annID
        self.stats(val, reward, lmPunishment, attack, contact)

    def stats(self, value, reward, lmPunishment, attack, contact):
        self.blob.reward.append(reward)
        self.blob.value.append(float(value))
        self.blob.lmPunishment.append(float(lmPunishment))
        self.blob.contact.append(float(contact))
        if attack is not None:
            self.blob.attack.append(float(attack))

    def finish(self):
        self.blob.finish()
class Feather:
    def __init__(self):
        self.expMap = set()
        self.blob = Blob()

    def scrawl(self, stim, ent, val, reward):
        self.blob.annID = ent.annID
        tile = self.tile(stim)
        self.move(tile, ent.pos)
        # self.action(arguments, atnArgs)
        self.stats(val, reward)

    def tile(self, stim):
        #Extract the tile at the centre of the agent's stimulus crop
        R, C = stim.shape
        rCent, cCent = R // 2, C // 2
        tile = stim[rCent, cCent]
        return tile

    def action(self, arguments, atnArgs):
        #Unpack sampled move/attack logits and indices; nothing is stored yet
        move, attk = arguments
        moveArgs, attkArgs, _ = atnArgs
        moveLogits, moveIdx = moveArgs
        attkLogits, attkIdx = attkArgs

    def move(self, tile, pos):
        #Record the visit on the exploration map; count unique tiles once per
        #position and total tile visits every step
        tile = type(tile.state)
        r, c = pos
        self.blob.expMap[r][c] += 1
        if pos not in self.expMap:
            self.expMap.add(pos)
            self.blob.unique[tile] += 1
        self.blob.counts[tile] += 1

    def stats(self, value, reward):
        self.blob.reward.append(reward)
        self.blob.value.append(float(value))

    def finish(self):
        self.blob.finish()
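#A standalone sketch of the exploration bookkeeping done by Feather.move above,
#assuming only built-in types: a set of visited positions plus per-tile-type
#counters for unique visits and total visits. Tile names here are illustrative.
def track_exploration(visited, unique, counts, tile_type, pos):
    counts[tile_type] = counts.get(tile_type, 0) + 1        #every step
    if pos not in visited:                                   #first visit only
        visited.add(pos)
        unique[tile_type] = unique.get(tile_type, 0) + 1

visited, unique, counts = set(), {}, {}
for tile_type, pos in [('grass', (0, 0)), ('forest', (0, 1)), ('grass', (0, 0))]:
    track_exploration(visited, unique, counts, tile_type, pos)
#unique == {'grass': 1, 'forest': 1}; counts == {'grass': 2, 'forest': 1}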
from collections import defaultdict

import numpy as np

#Blob (per-agent log container) and Output (per-action record) are
#project-local helpers assumed to be importable in this module.
class Rollout:
    def __init__(self, config):
        '''Rollout object used internally by RolloutManager

        Args:
            config: A configuration object
        '''
        self.actions = defaultdict(list)
        self.values = []
        self.rewards = []
        self.done = False
        self.time = -1

        #Logger
        self.config = config
        self.blob = None

    def __len__(self):
        '''Length of a rollout

        Returns:
            lifetime: Number of timesteps the agent has survived
        '''
        return self.blob.lifetime

    def inputs(self, reward, key):
        '''Collects input data to internal buffers

        Args:
            reward : The reward received by the agent for its last action
            key    : The ID associated with the agent
        '''
        #Also check that blob is not None. This prevents
        #recording the first reward of a partial trajectory
        if reward is not None and self.blob is not None:
            self.rewards.append(reward)

        if self.blob is None:
            annID, entID = key
            self.blob = Blob(entID, annID)

        self.time += 1
        self.blob.inputs(reward)

    def outputs(self, atnArgKey, atnLogits, atnIdx, value):
        '''Collects output data to internal buffers

        Args:
            atnArgKey : Action-Argument formatted string
            atnLogits : Action logits
            atnIdx    : Argument indices sampled from logits
            value     : Value function prediction
        '''
        #Record the value estimate once per timestep
        if len(self.actions[self.time]) == 0:
            self.blob.outputs(float(value))
            self.values.append(value)

        output = Output(atnArgKey, atnLogits, atnIdx, value)
        self.actions[self.time].append(output)

    def finish(self):
        '''Called internally once the full rollout has been collected'''
        self.rewards.append(-1)
        self.blob.inputs(-1)

        #self.returns = self.gae(self.config.GAMMA, self.config.LAMBDA, self.config.HORIZON)
        self.returns = self.discount(self.config.GAMMA)
        self.lifespan = len(self.rewards)
        self.blob.finish()

    def gae(self, gamma, lamb, H):
        '''Applies generalized advantage estimation to the given trajectory

        Args:
            gamma: Reward discount factor
            lamb : GAE discount factor
            H    : Advantage estimation horizon

        Returns:
            returns: Per-timestep advantage estimates
        '''
        r = self.rewards
        V = self.values
        L = len(r)

        returns = []
        for t in range(L):
            At, T = 0, min(L - t - 1, H)
            for i in range(T):
                tt = t + i
                deltaT = r[tt] + gamma * V[tt + 1] - V[tt]
                At += deltaT * (gamma * lamb)**i
            for out in self.actions[t]:
                out.returns = At
            returns.append(At)
        return returns

    def discount(self, gamma):
        '''Applies standard gamma discounting to the given trajectory

        Args:
            gamma: Reward discount factor

        Returns:
            rets: Discounted list of returns
        '''
        rets, N = [], len(self.rewards)
        discounts = np.array([gamma**i for i in range(N)])
        rewards = np.array(self.rewards)
        for idx in range(N):
            R_i = sum(rewards[idx:] * discounts[:N - idx])
            for out in self.actions[idx]:
                out.returns = R_i
            rets.append(R_i)
        return rets
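#A self-contained sketch of the return computations used by Rollout above,
#written as pure functions over reward/value lists so they can be checked in
#isolation: discount_returns computes the same discounted sums as
#Rollout.discount, and gae_advantages the same truncated GAE terms as
#Rollout.gae. The sample trajectory is illustrative only.
import numpy as np

def discount_returns(rewards, gamma):
    '''R_t = sum_k gamma^k * r_{t+k}, accumulated right to left.'''
    rets, running = [], 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        rets.append(running)
    return list(reversed(rets))

def gae_advantages(rewards, values, gamma, lamb, horizon):
    '''A_t = sum_i (gamma*lamb)^i * (r_{t+i} + gamma*V_{t+i+1} - V_{t+i}).'''
    L, advs = len(rewards), []
    for t in range(L):
        At, T = 0.0, min(L - t - 1, horizon)
        for i in range(T):
            tt = t + i
            At += (rewards[tt] + gamma * values[tt + 1] - values[tt]) * (gamma * lamb)**i
        advs.append(At)
    return advs

rewards = [0.0, 1.0, 0.0, -1.0]
values  = [0.5, 0.6, 0.4, 0.0]
print(discount_returns(rewards, gamma=0.99))
print(gae_advantages(rewards, values, gamma=0.99, lamb=0.95, horizon=8))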