def _stats(self, moves):
    # Nothing to tally if there are no moves.
    if not moves.nelement():
        return
    # Count how often each (i, j) outcome pairing appears across the two move channels,
    # and log its frequency relative to the number of move pairs.
    for i in range(2):
        for j in range(2):
            count = ((moves[..., 0] == i) & (moves[..., 1] == j)).sum()
            stats.mean(f'outcomes/{i}-{j}', count, moves.nelement() / 2)
def chunk_stats(chunk, n_new):
    with stats.defer():
        # Throughput stats over the `n_new` most recent steps of the chunk.
        tail = chunk[-n_new:]
        d, t = tail.decisions, tail.transitions
        n_trajs = t.terminal.sum()
        n_inputs = t.terminal.size(0)
        n_samples = t.terminal.nelement()
        n_sims = d.n_sims.int().sum()
        stats.rate('sample-rate.actor', n_samples)
        stats.mean('traj-length', n_samples, n_trajs)
        stats.cumsum('count.traj', n_trajs)
        stats.cumsum('count.inputs', n_inputs)
        stats.cumsum('count.chunks', 1)
        stats.cumsum('count.samples', n_samples)
        stats.cumsum('count.sims', n_sims)
        stats.rate('step-rate.chunks', 1)
        stats.rate('step-rate.inputs', n_inputs)
        stats.rate('sim-rate', n_sims)
        stats.mean('mcts-n-leaves', d.n_leaves.float().mean())

        # Per-seat win rates over the new trajectories.
        wins = (t.rewards == 1).sum(0).sum(0)
        for i, w in enumerate(wins):
            stats.mean(f'wins.seat-{i}', w, n_trajs)

        # Correlation between the value predictions and the realised rewards,
        # measured over the whole chunk at the terminal and penultimate steps.
        d, t = chunk.decisions, chunk.transitions
        v = d.v[t.terminal]
        w = t.rewards[t.terminal]
        stats.mean('corr.terminal', ((v - v.mean())*(w - w.mean())).mean()/(v.var()*w.var())**.5)

        v = d.v[:-1][t.terminal[1:]]
        w = t.rewards[1:][t.terminal[1:]]
        stats.mean('corr.penultimate', ((v - v.mean())*(w - w.mean())).mean()/(v.var()*w.var())**.5)
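# Illustrative only: the two 'corr.*' stats above are plain Pearson correlations between
# predicted values and realised rewards. A minimal standalone sketch of the same expression;
# the helper name `_pearson` is hypothetical and not used elsewhere in this module.
def _pearson(v, w):
    # corr = E[(v - E[v]) * (w - E[w])] / (std(v) * std(w))
    return ((v - v.mean())*(w - w.mean())).mean()/(v.var()*w.var())**.5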
def step(self, chunk):
    # Only re-estimate the gradient noise stats once per buffer's worth of steps.
    if self._count % self._buffer_len == 0:
        gs = gradients(self._agent.network, chunk)
        results = pd.DataFrame([noise_scale_components(chunk, gs[k], k) for k in gs])
        # Log every raw component silently, then the derived noise-scale ratio per kind.
        for k, v in results.set_index('kind').unstack().iteritems():
            stats.silent('noise.' + '.'.join(k), v)
        for _, row in results.iterrows():
            stats.mean(f'noise.{row.kind}', row.batch_size*row.variance/row.mean_sq)
    self._count += 1
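# Illustrative only: the ratio logged above has the shape of the 'simple' gradient noise scale,
# roughly B * Var[g] / |E[g]|^2 for per-sample gradients g. One common way to estimate those
# components is from gradient norms measured at two batch sizes, as in McCandlish et al.,
# "An Empirical Model of Large-Batch Training". The sketch below is a hypothetical helper,
# not the `noise_scale_components` used in this module.
def _noise_scale_sketch(g_small_sq, g_big_sq, b_small, b_big):
    # Unbiased estimates of the squared true gradient and the gradient variance (trace of the
    # per-sample covariance), from squared gradient norms at two batch sizes.
    mean_sq = (b_big*g_big_sq - b_small*g_small_sq) / (b_big - b_small)
    variance = (g_small_sq - g_big_sq) / (1/b_small - 1/b_big)
    # Noise scale: the batch size at which gradient noise and signal are roughly comparable.
    return variance / mean_sq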
def update_field(self, splitter, stable):
    # Figure out who's been playing too long. Stagger the thresholds a bit so they don't all change at once.
    threshold = np.linspace(1.5, 2.5, self.n_fielded)
    for i, (n, s) in enumerate(zip(splitter.names, splitter.slices)):
        # Swap out any opponent that's over its limit.
        replace = self.games[i] >= threshold[i]*(s.stop - s.start)
        # Don't bother if there actually aren't any envs in this slice.
        if replace and (s.stop > s.start):
            name, sd = stable.draw()
            splitter.field[i].load_state_dict(sd)
            splitter.names[i] = name

            stats.mean('league-field.latest', stable.step - max(splitter.names))
            stats.mean('league-field.oldest', stable.step - min(splitter.names))
            if self.verbose:
                log.info(f'New opponent is #{name}')

            self.games[i] = 0
def optimize(network, scaler, opt, batch):
    # Forward pass and loss under mixed precision.
    with torch.cuda.amp.autocast():
        d0 = batch.decisions
        d = network(batch.worlds)

        # Replace -inf logits (illegal actions) with zeros before combining targets and outputs.
        zeros = torch.zeros_like(d.logits)
        l = d.logits.where(d.logits > -np.inf, zeros)
        l0 = d0.logits.float().where(d0.logits > -np.inf, zeros)

        # Cross-entropy between the search policy targets and the network's policy.
        policy_loss = -(l0.exp()*l).sum(axis=-1).mean()

        # Squared error against the reward-to-go value targets.
        target_value = batch.reward_to_go
        value_loss = (target_value - d.v).square().mean()

        loss = policy_loss + value_loss

    old = torch.cat([p.flatten() for p in network.parameters()])

    opt.zero_grad()
    scaler.scale(loss).backward()
    scaler.step(opt)
    scaler.update()

    new = torch.cat([p.flatten() for p in network.parameters()])

    with stats.defer():
        #TODO: Contract these all based on late-ness
        stats.mean('loss.value', value_loss)
        stats.mean('loss.policy', policy_loss)
        stats.mean('corr.resid-var', (target_value - d.v).pow(2).mean(), target_value.pow(2).mean())

        p0 = d0.prior.float().where(d0.prior > -np.inf, zeros)
        stats.mean('kl-div.behaviour', (p0 - l0).mul(p0.exp()).sum(-1).mean())
        stats.mean('kl-div.prior', (p0 - l).mul(p0.exp()).sum(-1).mean())

        stats.mean('rel-entropy.policy', *learning.rel_entropy(d.logits))
        stats.mean('rel-entropy.targets', *learning.rel_entropy(d0.logits))

        stats.mean('v.target.mean', target_value.mean())
        stats.mean('v.target.std', target_value.std())
        stats.mean('v.target.max', target_value.abs().max())
        stats.mean('v.outputs.mean', d.v.mean())
        stats.mean('v.outputs.std', d.v.std())
        stats.mean('v.outputs.max', d.v.abs().max())

        stats.mean('p.target.mean', l0.mean())
        stats.mean('p.target.std', l0.std())
        stats.mean('p.target.max', l0.abs().max())
        stats.mean('p.outputs.mean', l.mean())
        stats.mean('p.outputs.std', l.std())
        stats.mean('p.outputs.max', l.abs().max())

        stats.mean('policy-conc', l0.exp().max(-1).values.mean())

        stats.rate('sample-rate.learner', batch.transitions.terminal.nelement())
        stats.rate('step-rate.learner', 1)
        stats.cumsum('count.learner-steps', 1)
        # stats.rel_gradient_norm('rel-norm-grad', agent)

        # Parameter-step and gradient magnitudes for this update.
        stats.mean('step.std', (new - old).pow(2).mean().pow(.5))
        stats.max('step.max', (new - old).abs().max())

        grad = torch.cat([p.grad.flatten() for p in network.parameters() if p.grad is not None])
        stats.max('grad.max', grad.abs().max())
        stats.max('grad.std', grad.pow(2).mean().pow(.5))
        stats.max('grad.norm', grad.pow(2).sum().pow(.5))

        B = batch.transitions.terminal.nelement()
        stats.mean('noise-scale', learning.noise_scale(B, opt))
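# Illustrative only: a minimal sketch of how `optimize` might be driven from a training loop.
# The batch source (`sample_batch`) and the optimizer settings below are assumptions for the
# sake of the example, not this module's actual setup.
def _training_loop_sketch(network, sample_batch, n_steps=1000):
    scaler = torch.cuda.amp.GradScaler()
    opt = torch.optim.Adam(network.parameters(), lr=1e-3)
    for _ in range(n_steps):
        # Each batch needs .decisions, .worlds, .reward_to_go and .transitions fields.
        batch = sample_batch()
        optimize(network, scaler, opt, batch)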