Example #1
    def _stats(self, moves):
        # Nothing to report until some moves have been recorded
        if not moves.nelement():
            return
        # Report the fraction of records falling into each cell of a 2x2 grid:
        # first component equals i, second equals j, out of
        # moves.nelement() / 2 two-component records.
        for i in range(2):
            for j in range(2):
                count = ((moves[..., 0] == i) & (moves[..., 1] == j)).sum()
                stats.mean(f'outcomes/{i}-{j}', count, moves.nelement() / 2)
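Assuming `moves` is an N×2 integer tensor whose columns take values in {0, 1}, the loop above reports what fraction of rows lands in each cell of a 2×2 outcome grid. A minimal standalone sketch of the same counting, with made-up data and plain prints in place of the project's `stats` module:

import torch

moves = torch.tensor([[0, 0], [0, 1], [1, 1], [1, 1]])  # hypothetical data
n_pairs = moves.nelement() / 2                           # number of rows
for i in range(2):
    for j in range(2):
        count = ((moves[..., 0] == i) & (moves[..., 1] == j)).sum()
        print(f'outcomes/{i}-{j}: {count.item() / n_pairs:.2f}')  # e.g. outcomes/1-1: 0.50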
Example #2
def chunk_stats(chunk, n_new):
    with stats.defer():
        # Only the trailing n_new slots of the chunk are freshly collected
        tail = chunk[-n_new:]
        d, t = tail.decisions, tail.transitions
        # Terminal flags mark trajectory ends, so summing them counts finished trajectories
        n_trajs = t.terminal.sum()
        n_inputs = t.terminal.size(0)
        n_samples = t.terminal.nelement()
        n_sims = d.n_sims.int().sum()
        stats.rate('sample-rate.actor', n_samples)
        stats.mean('traj-length', n_samples, n_trajs)
        stats.cumsum('count.traj', n_trajs)
        stats.cumsum('count.inputs', n_inputs)
        stats.cumsum('count.chunks', 1)
        stats.cumsum('count.samples', n_samples)
        stats.cumsum('count.sims', n_sims)
        stats.rate('step-rate.chunks', 1)
        stats.rate('step-rate.inputs', n_inputs)
        stats.rate('sim-rate', n_sims)
        stats.mean('mcts-n-leaves', d.n_leaves.float().mean())

        # Per-seat win counts over the newly finished trajectories
        wins = (t.rewards == 1).sum(0).sum(0)
        for i, w in enumerate(wins):
            stats.mean(f'wins.seat-{i}', w, n_trajs)

        # Correlation between the predicted value and the realised reward,
        # first at the terminal step itself...
        d, t = chunk.decisions, chunk.transitions
        v = d.v[t.terminal]
        w = t.rewards[t.terminal]
        stats.mean('corr.terminal', ((v - v.mean()) * (w - w.mean())).mean() /
                   (v.var() * w.var())**.5)

        # ...and then one step before the end, against the final reward
        v = d.v[:-1][t.terminal[1:]]
        w = t.rewards[1:][t.terminal[1:]]
        stats.mean('corr.penultimate', ((v - v.mean()) * (w - w.mean())).mean() /
                   (v.var() * w.var())**.5)
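The two `corr.*` statistics are a Pearson-style correlation between value predictions and realised rewards, written inline above. The same expression as a small helper, with made-up tensors (note it mixes a biased mean of products with PyTorch's unbiased `var()`, exactly as in the original):

import torch

def value_reward_corr(v, w):
    # Same expression as the corr.terminal / corr.penultimate stats above
    return ((v - v.mean()) * (w - w.mean())).mean() / (v.var() * w.var())**.5

v = torch.tensor([0.8, -0.3, 0.5, -0.9])  # hypothetical value predictions at terminal steps
w = torch.tensor([1.0, -1.0, 1.0, -1.0])  # hypothetical terminal rewards
print(value_reward_corr(v, w))            # close to 1 when predictions track rewards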
Example #3
    def step(self, chunk):
        # Only re-estimate the gradient noise every self._buffer_len steps
        if self._count % self._buffer_len == 0:
            gs = gradients(self._agent.network, chunk)
            results = pd.DataFrame([noise_scale_components(chunk, gs[k], k) for k in gs])

            # Record the raw components under noise.<column>.<kind>
            for k, v in results.set_index('kind').unstack().items():
                stats.silent('noise.' + '.'.join(k), v)

            # Noise-scale estimate per kind: batch_size * variance / mean_sq
            for _, row in results.iterrows():
                stats.mean(f'noise.{row.kind}', row.batch_size * row.variance / row.mean_sq)

        self._count += 1
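`gradients` and `noise_scale_components` are project helpers, so their output is assumed here; the column names `kind`, `batch_size`, `variance` and `mean_sq` come from the code above, while the values below are invented. A small sketch of how the logged keys fall out of the unstack/join and of the ratio reported per kind:

import pandas as pd

# Hypothetical stand-in for what noise_scale_components might return per gradient kind
results = pd.DataFrame([
    {'kind': 'policy', 'batch_size': 1024, 'variance': 3.0e-4, 'mean_sq': 2.0e-5},
    {'kind': 'value',  'batch_size': 1024, 'variance': 1.5e-4, 'mean_sq': 4.0e-5},
])

for k, v in results.set_index('kind').unstack().items():
    print('noise.' + '.'.join(k), v)   # keys like noise.batch_size.policy, noise.variance.value, ...

for _, row in results.iterrows():
    print(f'noise.{row.kind}', row.batch_size * row.variance / row.mean_sq)  # e.g. noise.policy 15360.0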
Example #4
    def update_field(self, splitter, stable):
        # Figure out which opponents have been fielded too long. Stagger the thresholds so they don't all get swapped at once.
        threshold = np.linspace(1.5, 2.5, self.n_fielded)
        for i, (n, s) in enumerate(zip(splitter.names, splitter.slices)):
            replace = self.games[i] >= threshold[i]*(s.stop - s.start)
            # Swap out any opponent over the limit, but only if its slice actually has envs
            if replace and (s.stop > s.start):
                name, sd = stable.draw()
                splitter.field[i].load_state_dict(sd)
                splitter.names[i] = name

                # How far behind the stable's current step the newest and oldest fielded opponents are
                stats.mean('league-field.latest', stable.step - max(splitter.names))
                stats.mean('league-field.oldest', stable.step - min(splitter.names))

                if self.verbose:
                    log.info(f'New opponent is #{name}')

                self.games[i] = 0
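The staggering comes from giving each fielded slot a slightly different games-per-env threshold via `np.linspace`. A toy illustration with a made-up field size and env count:

import numpy as np

n_fielded, n_envs = 4, 32                     # hypothetical sizes
threshold = np.linspace(1.5, 2.5, n_fielded)
print(threshold)                              # [1.5  1.833...  2.166...  2.5]
print(threshold * n_envs)                     # games before each slot is swapped: 48, ~58.7, ~69.3, 80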
Example #5
def optimize(network, scaler, opt, batch):

    with torch.cuda.amp.autocast():
        d0 = batch.decisions
        d = network(batch.worlds)

        # -inf logits (typically masked actions) are replaced with zeros so the
        # products and sums below stay finite
        zeros = torch.zeros_like(d.logits)
        l = d.logits.where(d.logits > -np.inf, zeros)
        l0 = d0.logits.float().where(d0.logits > -np.inf, zeros)

        # Cross-entropy of the network's log-policy against the stored target policy's probabilities
        policy_loss = -(l0.exp() * l).sum(axis=-1).mean()

        # Mean squared error of the value head against the reward-to-go
        target_value = batch.reward_to_go
        value_loss = (target_value - d.v).square().mean()

        loss = policy_loss + value_loss

    # Snapshot the parameters so the size of the optimizer step can be logged below
    old = torch.cat([p.flatten() for p in network.parameters()])

    opt.zero_grad()
    scaler.scale(loss).backward()
    scaler.step(opt)
    scaler.update()

    new = torch.cat([p.flatten() for p in network.parameters()])

    with stats.defer():
        #TODO: Contract these all based on late-ness
        stats.mean('loss.value', value_loss)
        stats.mean('loss.policy', policy_loss)
        stats.mean('corr.resid-var', (target_value - d.v).pow(2).mean(),
                   target_value.pow(2).mean())

        p0 = d0.prior.float().where(d0.prior > -np.inf, zeros)
        stats.mean('kl-div.behaviour', (p0 - l0).mul(p0.exp()).sum(-1).mean())
        stats.mean('kl-div.prior', (p0 - l).mul(p0.exp()).sum(-1).mean())

        stats.mean('rel-entropy.policy', *learning.rel_entropy(d.logits))
        stats.mean('rel-entropy.targets', *learning.rel_entropy(d0.logits))

        stats.mean('v.target.mean', target_value.mean())
        stats.mean('v.target.std', target_value.std())
        stats.mean('v.target.max', target_value.abs().max())
        stats.mean('v.outputs.mean', d.v.mean())
        stats.mean('v.outputs.std', d.v.std())
        stats.mean('v.outputs.max', d.v.abs().max())

        stats.mean('p.target.mean', l0.mean())
        stats.mean('p.target.std', l0.std())
        stats.mean('p.target.max', l0.abs().max())
        stats.mean('p.outputs.mean', l.mean())
        stats.mean('p.outputs.std', l.std())
        stats.mean('p.outputs.max', l.abs().max())

        stats.mean('policy-conc', l0.exp().max(-1).values.mean())

        stats.rate('sample-rate.learner',
                   batch.transitions.terminal.nelement())
        stats.rate('step-rate.learner', 1)
        stats.cumsum('count.learner-steps', 1)
        # stats.rel_gradient_norm('rel-norm-grad', agent)

        stats.mean('step.std', (new - old).pow(2).mean().pow(.5))
        stats.max('step.max', (new - old).abs().max())

        grad = torch.cat([
            p.grad.flatten() for p in network.parameters()
            if p.grad is not None
        ])
        stats.max('grad.max', grad.abs().max())
        stats.max('grad.std', grad.pow(2).mean().pow(.5))
        stats.max('grad.norm', grad.pow(2).sum().pow(.5))

        B = batch.transitions.terminal.nelement()
        stats.mean('noise-scale', learning.noise_scale(B, opt))
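The policy loss and the two `kl-div.*` statistics all operate on log-probabilities with -inf entries zeroed out. A tiny standalone illustration with made-up logits for a 3-action policy whose last action is unavailable:

import numpy as np
import torch

# Made-up 3-action example; the -inf logit marks an unavailable action
target = torch.log_softmax(torch.tensor([[1.0, 0.0, -np.inf]]), -1)   # behaviour/target log-policy
current = torch.log_softmax(torch.tensor([[0.5, 0.5, -np.inf]]), -1)  # network's log-policy

zeros = torch.zeros_like(current)
l = current.where(current > -np.inf, zeros)   # masked entries become 0 ...
l0 = target.where(target > -np.inf, zeros)    # ... and so contribute nothing to the sums below

policy_loss = -(l0.exp() * l).sum(-1).mean()  # cross-entropy of current against target probabilities
kl = (l0 - l).mul(l0.exp()).sum(-1).mean()    # same shape of expression as the kl-div.* stats above
print(policy_loss.item(), kl.item())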