def update_k(self, x, x_l):
    """Exponential-moving-average update of the codebook (VQ-VAE style).

    Args:
        x: flattened input vectors, shape (N * L, emb_width) — assumed; verify against caller.
        x_l: integer code assignment for each row of x, shape (N * L,).

    Returns:
        dict with scalar diagnostics:
            entropy   — entropy of this batch's code-usage distribution (diversity),
            used_curr — number of codes used at least `threshold` times this batch,
            usage     — number of codes whose EMA count is above `threshold`,
            dk        — RMS change of the codebook this step.
    """
    mu, emb_width, k_bins = self.mu, self.emb_width, self.k_bins
    with t.no_grad():
        # Calculate new centres: one-hot matrix assigning each vector to its code.
        x_l_onehot = t.zeros(k_bins, x.shape[0], device=x.device)  # k_bins, N * L
        x_l_onehot.scatter_(0, x_l.view(1, x.shape[0]), 1)

        _k_sum = t.matmul(x_l_onehot, x)  # k_bins, w — per-code sum of assigned vectors
        _k_elem = x_l_onehot.sum(dim=-1)  # k_bins — per-code assignment count
        # Random candidate vectors used to re-seed dead codes.
        y = self._tile(x)
        _k_rand = y[t.randperm(y.shape[0])][:k_bins]

        # Synchronize across workers; no-op when torch.distributed is not set up,
        # so the method also works in single-process runs (original crashed there).
        if dist.is_available() and dist.is_initialized():
            dist.broadcast(_k_rand, 0)  # all ranks must agree on the re-seed vectors
            dist.all_reduce(_k_sum)
            dist.all_reduce(_k_elem)

        # Update centres via EMA of sums and counts.
        old_k = self.k
        self.k_sum = mu * self.k_sum + (1. - mu) * _k_sum    # k_bins, w
        self.k_elem = mu * self.k_elem + (1. - mu) * _k_elem # k_bins
        usage = (self.k_elem.view(k_bins, 1) >= self.threshold).float()
        # Live codes become the EMA mean; dead codes are replaced by random data points.
        self.k = usage * (self.k_sum.view(k_bins, emb_width) / self.k_elem.view(k_bins, 1)) \
                 + (1 - usage) * _k_rand

        # Diagnostics.
        _k_prob = _k_elem / t.sum(_k_elem)  # prob of each bin this batch
        entropy = -t.sum(_k_prob * t.log(_k_prob + 1e-8))  # how diverse the code usage is
        used_curr = (_k_elem >= self.threshold).sum()
        usage = t.sum(usage)
        dk = t.norm(self.k - old_k) / np.sqrt(np.prod(old_k.shape))
    return dict(entropy=entropy,
                used_curr=used_curr,
                usage=usage,
                dk=dk)
def update(self, tag, val, batch):
    """Accumulate a batch-averaged metric and return its distributed average.

    Args:
        tag: metric name (key into self.sum / self.n).
        val: average value of the metric over this batch.
        batch: batch size the average was taken over.

    Returns:
        The average of `val` over all workers for this call
        (sum of val*batch across workers divided by total batch).

    Side effects: adds this call's totals into self.sum[tag] and self.n[tag].
    """
    # Work with totals (val * batch) so averages combine correctly across workers.
    total = t.tensor(val * batch).float()  # renamed from `sum` to avoid shadowing the builtin
    n = t.tensor(batch).float()
    # Guards keep behavior identical on CUDA/distributed setups while making
    # CPU-only and single-process runs work instead of crashing.
    if t.cuda.is_available():
        total = total.cuda()
        n = n.cuda()
    if dist.is_available() and dist.is_initialized():
        dist.all_reduce(total)
        dist.all_reduce(n)
    total = total.item()
    n = n.item()
    self.sum[tag] = self.sum.get(tag, 0.0) + total
    self.n[tag] = self.n.get(tag, 0.0) + n
    return total / n
def allreduce(x, op=dist.ReduceOp.SUM):
    """All-reduce a scalar across workers and return the result as a Python float.

    Args:
        x: scalar (or anything torch.tensor accepts) to reduce.
        op: reduction operation (default: sum across workers).

    Returns:
        The reduced value as a float; with a single process (or no initialized
        process group) this is just float(x).
    """
    x = torch.tensor(x).float()
    # Guards keep behavior identical on CUDA/distributed setups while making
    # CPU-only and single-process runs work instead of crashing.
    if torch.cuda.is_available():
        x = x.cuda()
    if dist.is_available() and dist.is_initialized():
        dist.all_reduce(x, op=op)
    return x.item()