def corrupt(batch, n):
    """
    Corrupt a batch of triples in place.

    For every triple, either the first or the last element (chosen by a fair
    coin flip) is overwritten with a random index drawn uniformly from
    [0, n). The middle element is never touched.

    :param batch: integer tensor of size (bs, ns, 3); modified in place
    :param n: exclusive upper bound of the sampled indices (nr of nodes in the graph)
    :return: None
    """
    num_batches, num_triples, _ = batch.size()
    device = d(batch)

    # one replacement index per triple
    replacements = torch.randint(low=0, high=n, size=(num_batches * num_triples, ),
                                 dtype=torch.long, device=device)

    # fair coin per triple: True -> overwrite position 0, False -> overwrite position 2
    coin = torch.empty(size=(num_batches, num_triples, 1), dtype=torch.float, device=device).fill_(0.5)
    pick_first = torch.bernoulli(coin).to(torch.bool)

    # the middle position is never selected
    never = torch.zeros(size=(num_batches, num_triples, 1), dtype=torch.bool, device=device)

    # exactly one True per triple, so the mask selects bs * ns entries
    selection = torch.cat([pick_first, never, ~pick_first], dim=2)
    batch[selection] = replacements
def run(self):
    """
    Receive datagrams for as long as both run flags are set.

    Every datagram that is received is unpacked and handed to the
    catalogue; a failing receive is reported through ``util.d`` and the
    loop simply tries again.
    """
    while self.flag and self._flag:
        try:
            payload, _ = self.__socket.recvfrom(65565)
        except Exception as err:
            # report the failure and keep listening
            util.d(err)
            continue
        self.__catalogue.touch(util.unpack(payload))
def sum(indices, values, size, row=True):
    """
    Sum the rows or columns of a sparse matrix, and redistribute the results
    back to the non-sparse row/column entries.

    Arguments are interpreted as defining a sparse matrix. Any extra leading
    dimensions are treated as batch dimensions.

    :param indices: integer tensor of size (..., k, r) holding the index
        tuples of the k non-zero entries (the code indexes columns 0 and 1,
        so r is expected to be 2)
    :param values: tensor of size (..., k) with the values of those entries
    :param size: (rows, columns) of the matrix
    :param row: if True, sum over rows; otherwise sum over columns
    :return: tensor of size (..., k): for every non-zero entry, the sum of
        the row (or column) that entry belongs to
    """
    assert len(indices.size()) == len(values.size()) + 1

    if len(indices.size()) == 2:
        # add batch dim
        indices = indices[None, :, :]
        values = values[None, :]
        bdims = None
    else:
        # fold up batch dim
        bdims = indices.size()[:-2]
        k, r = indices.size()[-2:]
        assert bdims == values.size()[:-1]
        assert values.size()[-1] == k

        indices = indices.view(-1, k, r)
        values = values.view(-1, k)

    b, k, r = indices.size()

    if row:
        ones = torch.ones((size[1], 1), device=d(indices))
    else:
        ones = torch.ones((size[0], 1), device=d(indices))
        # Transpose the matrix by swapping the two index columns. The swapped
        # columns must be joined along the last dimension so the result keeps
        # the (b, k, 2) layout; joining along dim=1 (as before) produced a
        # (b, 2k, 1) tensor and broke everything downstream.
        indices = torch.cat([indices[:, :, 1:2], indices[:, :, 0:1]], dim=2)
        # NOTE(review): `size` is passed on untransposed below; for non-square
        # matrices batchmm may need (size[1], size[0]) here -- confirm.

    s, _ = ones.size()
    ones = ones[None, :, :].expand(b, s, 1).contiguous()

    sums = batchmm(indices, values, size, ones)  # row/column sums

    # look up, for every non-zero entry, the sum belonging to its row/column
    bindex = torch.arange(b, device=d(indices))[:, None].expand(b, indices.size(1))
    sums = sums[bindex, indices[:, :, 0], 0]

    if bdims is None:
        return sums.view(k)

    return sums.view(*bdims + (k, ))
def genDefaultConfig(params):
    """
    Build a mapping from parameter id to its default value.

    :param params: iterable of parameter descriptions; each one is wrapped
        with ``d`` and must expose ``id`` and ``default``
    :return: plain dict ``{id: default}``
    """
    wrapped = (d(p) for p in params)
    return {p.id: p.default for p in wrapped}
def getKeys(self):
    """
    Poll the state of every mapped key through the Win32 API.

    Values in ``self._map`` that are strings are converted with ``ord``
    before being passed to ``GetAsyncKeyState``.

    :return: ``d``-wrapped dict mapping each name in ``self._map`` to a bool
    """
    def _is_down(code):
        if isinstance(code, str):
            code = ord(code)
        return abs(win32api.GetAsyncKeyState(code)) > 1

    return d({name: _is_down(self._map[name]) for name in self._map})
def getKeys(self):
    """
    Ask the serial game pad for its button states.

    A GET_BTNS request is written to the port, a one-byte status is read,
    then two bytes that are combined (low byte first) into a bit field.
    An IOError is logged and leaves all buttons reported as not pressed.

    :return: ``d``-wrapped dict mapping each name in ``self._map`` to a
        bool; the i-th name corresponds to bit i of the bit field
    """
    bits = 0
    try:
        self._com.write(SerialGamePad._generateHeader(CMDTYPE.GET_BTNS, 0))

        status = self._com.read(1)
        if len(status) == 0:
            SerialGamePad._comError()
        else:
            code = ord(status)
            if code != RETURN_CODES.SUCCESS:
                SerialGamePad._printError(code)

        payload = self._com.read(2)
        if len(payload) != 2:
            SerialGamePad._comError()
        low, high = ord(payload[0]), ord(payload[1])
        bits = low + (high << 8)
    except IOError:
        log.logger.error("IO Error Communicatng With Game Pad!")

    return d({name: (bits & (1 << pos) > 0)
              for pos, name in enumerate(self._map)})
def getKeys(self):
    """
    Read the button bit field from the serial game pad.

    Sends a GET_BTNS header, checks the one-byte status reply and then
    reads a two-byte little-endian bit field. If an IOError occurs it is
    logged and the bit field stays 0.

    :return: ``d``-wrapped dict with one bool per entry of ``self._map``,
        taken from consecutive bits of the bit field
    """
    bits = 0
    try:
        header = SerialGamePad._generateHeader(CMDTYPE.GET_BTNS, 0)
        self._com.write(header)

        reply = self._com.read(1)
        if len(reply) == 0:
            SerialGamePad._comError()
        elif ord(reply) != RETURN_CODES.SUCCESS:
            SerialGamePad._printError(ord(reply))

        reply = self._com.read(2)
        if len(reply) != 2:
            SerialGamePad._comError()
        bits = ord(reply[0]) + (ord(reply[1]) << 8)
    except IOError:
        log.logger.error("IO Error Communicatng With Game Pad!")

    states = {}
    for position, button in enumerate(self._map):
        states[button] = (bits & (1 << position) > 0)
    return d(states)
def readServerConfig():
    """
    Load the stored server configuration.

    An empty stored configuration is replaced by the defaults derived from
    ``BASE_SERVER_CONFIG``; a configuration with a different number of keys
    than the defaults is upgraded with them.

    :return: the configuration wrapped with ``d``
    """
    data = readConfig("config", path=__home)
    base = paramsToDict(BASE_SERVER_CONFIG.params)

    stored_keys = len(data.keys())
    if stored_keys == 0:
        # nothing stored yet: start from a fresh set of defaults
        data = paramsToDict(BASE_SERVER_CONFIG.params)
    elif stored_keys != len(base.keys()):
        data.upgrade(base)

    return d(data)
def cluster_adjacency_matrix(self, C, dmax=1.e10):
    """
    Build a dense symmetric matrix of pairwise distances between the
    elements of ``C``.

    Only pairs whose distance (``util.d``) is below ``dmax`` are stored;
    all other entries, including the diagonal, stay zero.

    :param C: sequence of items accepted by ``util.d``
    :param dmax: exclusive distance cut-off
    :return: dense (len(C), len(C)) matrix
    """
    size = len(C)
    adjacency = scipy.sparse.lil_matrix((size, size))

    for row in range(size):
        for col in range(row + 1, size):
            dist = util.d(C[row], C[col])
            if not (dist < dmax):
                continue
            adjacency[col, row] = adjacency[row, col] = dist

    return adjacency.tocsr().todense()
def plot(self, inputs, numpixels=5, ims=None):
    """
    Plot, for every instance in the batch, the index tuples (means and
    sigmas produced by ``self.hyper``) of a few randomly chosen pixels on
    top of an image.

    :param inputs: 4-d tensor, unpacked as (b, c, h, w)
    :param numpixels: number of random pixels whose index tuples are drawn
    :param ims: optional 4-d tensor of images to draw on; defaults to
        ``inputs``
    """
    ims = inputs if ims is None else ims

    b, c, h, w = inputs.size()
    b, cims, hims, wims = ims.size()
    k = self.k

    # choose `numpixels` random pixels, for which we'll plot the input pixels.
    choices = torch.randint(low=0, high=h * w, size=(numpixels, ))

    # fixed grid: 5 subplots per row, as many rows as the batch needs
    perrow = 5
    rows = int(math.ceil(b / perrow))

    means, sigmas, _ = self.hyper(inputs)

    inputs = inputs.data

    plt.figure(figsize=(perrow * 3, rows * 3))

    # scale up to image coordinates
    scale = torch.tensor((hims / h, wims / w), device=d(inputs))
    means = means * scale + (scale / 2)

    for current in range(b):

        # select subset of means, sigmas
        smeans = means[current, :, :, :, :].view(h * w, k, 2)
        ssigmas = sigmas[current, :, :, :].view(h * w, k, 2)

        # one colour value in [-1, 1) per chosen pixel, repeated for its k tuples
        color = (torch.arange(numpixels, dtype=torch.float)[:, None].expand(numpixels, k) / numpixels) * 2.0 - 1.0

        smeans = smeans[choices, :, :]
        ssigmas = ssigmas[choices, :]

        ax = plt.subplot(rows, perrow, current + 1)

        # channels-last layout for imshow; squeeze drops a singleton channel
        im = np.transpose(ims[current, :, :, :].cpu().numpy(), (1, 2, 0))
        im = np.squeeze(im)

        ax.imshow(im, interpolation='nearest', extent=(-0.5, wims - 0.5, -0.5, hims - 0.5), cmap='gray_r')

        util.plot(smeans.reshape(1, -1, 2), ssigmas.reshape(1, -1, 2), color.reshape(1, -1), axes=ax, flip_y=hims, tanh=False)

    # NOTE(review): the figure returned here is discarded -- confirm whether
    # it was meant to be returned to the caller.
    plt.gcf()
def from_points(self, c1, c2, dmax=1.e10):
    """
    (Re)build the edge list from two point sets.

    After ``self.init()``, an edge ``[[i, j], dist]`` is appended to
    ``self.edges`` for every pair (c1[i], c2[j]) whose distance
    ``util.d(c1[i], c2[j])`` is below ``dmax``.

    :param c1: first sequence of points
    :param c2: second sequence of points; assumed to have the same length
        as ``c1``
    :param dmax: exclusive distance cut-off
    """
    self.init()
    # NOTE: assumes len(c1) == len(c2)
    self.nnodes = n = len(c1)
    for i in range(n):
        x = c1[i]
        for j in range(n):
            # The partner point follows the inner index. It used to be read
            # as c2[i], which gave every edge (i, j) the distance of the
            # pair (i, i).
            y = c2[j]
            d = util.d(x, y)
            if d < dmax:
                self.edges.append([[i, j], d])
def addKeyFunc(self, key, func, speed=1, hold=True):
    """
    Register ``func`` as the handler for one key or a list of keys.

    :param key: a single key, or a list of keys that share the handler
    :param func: handler stored under "func"
    :param speed: stored under "speed"
    :param hold: stored under "hold"
    """
    keys = key if isinstance(key, list) else [key]
    for name in keys:
        # every key gets its own record; "last" and "inter" start out False
        self._keyfuncs[name] = d(dict(func=func, speed=speed, hold=hold,
                                      last=False, inter=False))
def addKeyFunc(self, key, func, speed=1, hold=True):
    """
    Attach a handler to one or more keys.

    :param key: a key, or a list of keys, to bind
    :param func: handler stored under "func"
    :param speed: stored under "speed"
    :param hold: stored under "hold"
    """
    def _record():
        # fresh record per key; "last" and "inter" are initialised to False
        return d({
            "func": func,
            "speed": speed,
            "hold": hold,
            "last": False,
            "inter": False,
        })

    targets = [key] if not isinstance(key, list) else key
    for target in targets:
        self._keyfuncs[target] = _record()
def reparameterize(self, mean_logvar):
    """
    Reparametrization trick.

    Splits the last dimension of ``mean_logvar`` at ``self.z_dim`` into a
    mean and a log-variance (both are also stored on ``self``) and returns
    ``eps * exp(logvar / 2) + mean``. ``eps`` is standard-normal noise when
    ``self.var`` is truthy and the constant 1 otherwise.

    :param mean_logvar: 3-d tensor with means and log-variances
        concatenated along the last dimension
    :return: tensor with the shape of the mean slice
    """
    split = self.z_dim
    mean = mean_logvar[:, :, :split]
    logvar = mean_logvar[:, :, split:]
    self.mean, self.logvar = mean, logvar

    if self.var:
        noise = torch.normal(torch.zeros_like(mean), std=1.).to(d())
    else:
        noise = 1.

    std = torch.exp(logvar * .5)
    return noise * std + mean
def from_points(self, c1, c2, dmax=1.e10):
    """
    (Re)build the edge list from two point sets.

    After ``self.init()``, an edge ``[[i, j], dist]`` is appended to
    ``self.edges`` for every pair (c1[i], c2[j]) whose distance
    ``util.d(c1[i], c2[j])`` is below ``dmax``.

    :param c1: first sequence of points
    :param c2: second sequence of points; assumed to have the same length
        as ``c1``
    :param dmax: exclusive distance cut-off
    """
    self.init()
    # NOTE: assumes len(c1) == len(c2)
    self.nnodes = n = len(c1)
    for i in range(n):
        x = c1[i]
        for j in range(n):
            # The partner point follows the inner index. It used to be read
            # as c2[i], which gave every edge (i, j) the distance of the
            # pair (i, i).
            y = c2[j]
            d = util.d(x, y)
            if d < dmax:
                self.edges.append([[i, j], d])
def get_LSC_random(self, cluster_number, r):
    """
    Compute a landmark-based representation of the dataset samples using
    randomly chosen landmarks.

    :param cluster_number: number of landmarks to draw; also the size of
        the identity matrix used to turn the inverted singular values into
        a diagonal matrix
    :param r: passed through to ``get_sparse_affinity_matrix``
    :return: the transposed result matrix. Previously this value was
        computed and then dropped, so the method always returned None.
    """
    data_points = self.dataset.samples
    landmarks = self.get_landmarks_random(cluster_number)
    sparse_matrix = self.get_sparse_affinity_matrix(data_points, landmarks, r)
    row_sum = self.get_row_sum_vector(sparse_matrix)

    # Zn = D^(-1/2)Z
    # This is Zn, it should be samples x landmark size
    final_z = row_sum * np.transpose(sparse_matrix)

    # Singular value decomposition of final_z; only the singular values and
    # the last factor are used below.
    _, E, U = np.linalg.svd(final_z, False)

    # diagonal matrix holding the inverted singular values
    E_minus1 = np.power(E, -1) * np.eye(cluster_number)

    final_a = np.dot(E_minus1, np.dot(U, np.transpose(final_z)))
    result = np.transpose(final_a)

    d("Size " + str(len(final_a)) + " - " + str(len(final_a[0])))
    return result
def add_inverse_and_self(triples, n, r):
    """
    Extend a tensor of triples with inverse relations and self loops.

    For every triple (s, p, o) an inverse (o, p + r, s) is added, and for
    every node i in [0, n) a self loop (i, 2 * r, i).

    :param triples: integer tensor of size (b, 3)
    :param n: number of nodes
    :param r: number of relations
    :return: tensor of size (2 * b + n, 3), ordered as: original triples,
        self loops, inverses
    """
    num_triples, _ = triples.size()
    device = d(triples)

    subjects = triples[:, 0, None]
    relations = triples[:, 1, None]
    objects = triples[:, 2, None]

    # inverse edges: swap subject and object, shift the relation index by r
    inverses = torch.cat([objects, relations + r, subjects], dim=1)
    assert inverses.size() == (num_triples, 3)

    # self loops all share the extra relation index 2 * r
    node_ids = torch.arange(n, device=device)[:, None]
    loop_relation = torch.empty(size=(n, 1), device=device, dtype=torch.long).fill_(2 * r)
    self_loops = torch.cat([node_ids, loop_relation, node_ids], dim=1)
    assert self_loops.size() == (n, 3)

    return torch.cat([triples, self_loops, inverses], dim=0)
def hyper(self, x):
    """
    Compute the parameters (means, sigmas and values) of the k index tuples
    of every output pixel.

    :param x: input tensor, unpacked as (b, c, h, w); its non-batch
        dimensions must equal ``self.in_size``
    :return: tuple (means, sigmas, values) -- the means and sigmas after
        ``sparse.transform_means`` / ``sparse.transform_sigmas``, and
        ``self.mvalues`` expanded to (b, h, w, k)
    :raises Exception: if ``self.admode`` is not one of 'none', 'full',
        'coords' or 'inputs'
    """
    assert x.size()[1:] == self.in_size

    b, c, h, w = x.size()
    k = self.k

    # the coordinates of the current pixels in parameters space
    # - the index tuples are described relative to these
    hw = torch.tensor((h, w), device=d(x), dtype=torch.float)
    mids = self.coords[None, :, :, :].expand(b, 2, h, w) * (hw - 1)[None, :, None, None]
    mids = mids.permute(0, 2, 3, 1)  # move the coordinate dimension last: (b, h, w, 2)

    if not self.modulo:
        mids = util.inv(mids, mx=hw[None, None, None, :])

    # one copy of the pixel coordinates for each of the k tuples
    mids = mids[:, :, :, None, :].expand(b, h, w, k, 2)

    # add coords to channels
    if self.admode == 'none':
        # non-adaptive: the same learned parameters for every instance and pixel
        params = self.params[None, None, None, :].expand(b, h, w, k * 3)
    else:
        if self.admode == 'full':
            # input channels plus coordinates
            coords = self.coords[None, :, :, :].expand(b, 2, h, w)
            x = torch.cat([x, coords], dim=1)
        elif self.admode == 'coords':
            # coordinates only
            x = self.coords[None, :, :, :].expand(b, 2, h, w)
        elif self.admode == 'inputs':
            # input channels only
            pass
        else:
            raise Exception(f'adaptivity mode {self.admode} not recognized')

        x = x.permute(0, 2, 3, 1)  # channels last before applying toparams
        params = self.toparams(x)

    assert params.size() == (b, h, w, k * 3)  # k index tuples per output pixel

    # first 2k entries are the means (2 per tuple), the remaining k the sigmas
    means = params[:, :, :, :k * 2].view(b, h, w, k, 2)
    sigmas = params[:, :, :, k * 2:].view(b, h, w, k)
    values = self.mvalues[None, None, None, :].expand(b, h, w, k)

    # means are offsets relative to the pixel's own coordinates
    means = mids + self.mmult * means

    s = (h, w)
    means = sparse.transform_means(means, s, method='modulo' if self.modulo else 'sigmoid')
    sigmas = sparse.transform_sigmas(sigmas, s, min_sigma=self.min_sigma) * self.sigma_scale

    return means, sigmas, values
def calculate_bpb(arg, model, data_sub):
    """
    Compute the bits per byte the model spends on ``data_sub``.

    Every position is predicted from the ``arg.context`` tokens before it
    (left-padded with zeros at the start of the data). Instances are
    collected in a buffer and run through the model in batches of
    ``arg.test_batchsize``.

    :param arg: options; reads ``context`` and ``test_batchsize``
    :param model: maps a (b, context) batch of token indices to natural-log
        probabilities, indexed as [instance, position, token]
    :param data_sub: 1-d tensor of tokens
    :return: total number of bits divided by the number of tokens
    """
    width = arg.context + 1
    total = data_sub.size(0)

    with torch.no_grad():
        bits = 0.0
        buffer = []  # filled up, then run through the model in one go

        for position in tqdm.trange(total):
            start = max(0, position - arg.context)
            window = data_sub[start:position + 1].to(torch.long)

            missing = width - window.size(0)
            if missing > 0:
                # the first tokens lack a full context: pad on the left
                padding = torch.zeros(size=(missing, ), dtype=torch.long)
                window = torch.cat([padding, window], dim=0)

            assert window.size(0) == width

            if torch.cuda.is_available():
                window = window.cuda()

            buffer.append(window[None, :])

            if len(buffer) != arg.test_batchsize and position != total - 1:
                continue

            # buffer is full (or the data is exhausted): run it through the model
            stacked = torch.cat(buffer, dim=0)
            source, target = stacked[:, :-1], stacked[:, -1]

            output = model(source)
            rows = torch.arange(len(buffer), device=d())
            lnprobs = output[rows, -1, target]

            bits += -(lnprobs * LOG2E).sum()  # nats -> bits
            buffer = []

        return bits / total
def corrupt_one(batch, candidates, target):
    """
    Corrupt one position of every triple in a batch (in place).

    The element at index ``target`` of each triple is overwritten with a
    value drawn (with replacement) from ``candidates``.

    :param batch: integer tensor of size (bs, ns, 3); modified in place
    :param candidates: sequence of values to sample replacements from
    :param target: position to overwrite: 0 for head, 1 for predicate,
        2 for tail
    :return: None
    """
    num_batches, num_triples, _ = batch.size()

    sampled = random.choices(candidates, k=num_batches * num_triples)
    replacements = torch.tensor(sampled, dtype=torch.long, device=d(batch))

    batch[:, :, target] = replacements.view(num_batches, num_triples)
# Description of the server settings: general metadata plus one entry in
# "params" for every configurable option (id, label, type, default, help text).
BASE_SERVER_CONFIG = d(dict(
    id="server_config",
    display="server_config",
    preconfig=False,
    presets=[],
    params=[
        dict(
            id="external_access",
            label="Allow External Access",
            type="bool",
            default=True,
            help="On: Other computers on your network can access PixelWeb. Off: LocalHost access only.",
        ),
        dict(
            id="port",
            label="Server Port",
            type="int",
            default=8080,
            help="Port to listen on.",
        ),
        dict(
            id="load_defaults",
            label="Load Last Config on Start",
            type="bool",
            default=False,
            help="Load last driver/controller configuration on application start.",
        ),
        dict(
            id="show_debug",
            label="Show Debug in Console",
            type="bool",
            default=False,
            help="Show BiblioPixel debug in server console (not in main UI).",
        ),
        dict(
            id="mod_dirs",
            label="Module Directories",
            type="str_multi",
            default=[],
            help="Directories from which to load modules (animations, drivers, controllers, pre-configs).",
            replace={"\\": "/"},
        ),
        dict(
            id="off_anim_time",
            label="All Off Timeout",
            type="int",
            default=10,
            min=0,
            max=3600,
            help="Keep display off when not running an animation by actively turning all pixels off every X seconds. Set to 0 to disable.",
        ),
    ],
))
def forward(self, batch):
    """
    Score a batch of triples.

    The graph the node representations are computed from is either the
    full set of triples, or (when ``self.prune`` is set and
    ``self.depth > 0``) only the triples within ``self.depth`` hops of the
    nodes in the batch. During training, edges may additionally be dropped.

    :param batch: integer tensor whose last dimension has size 3; all
        leading dimensions are treated as batch dimensions
    :return: tensor of scores with the leading dimensions of ``batch``
    """
    assert batch.size(-1) == 3

    n, r = self.n, self.r

    dims = batch.size()[:-1]
    batch = batch.reshape(-1, 3)
    batchl = batch.tolist()

    with torch.no_grad():

        if self.prune and self.depth > 0:
            # gather all triples that are relevant to the current batch
            triples = {tuple(t) for t in batchl}
            nds = set()
            for s, _, o in batchl:
                nds.add(s)
                nds.add(o)

            for _ in range(self.depth):
                # -- gather all triples that are close enough to the batch triples to be relevant
                inc_triples = set()
                # The loop variable must not be called `n`: that overwrote
                # the node count, so a node id was passed to
                # add_inverse_and_self and used in the dropout below.
                for node in nds:
                    inc_triples.update(self.lookup[node])

                triples.update(inc_triples)
                nds.update([s for (s, _, _) in inc_triples])
                nds.update([o for (_, _, o) in inc_triples])

            triples = torch.tensor(list(triples), device=d(self.all_triples), dtype=torch.long)
            triples = add_inverse_and_self(triples, n, r)
        else:
            triples = self.all_triples_plus  # just use all triples

        if self.dropout is not None and self.training:
            # We drop out edges by actually removing the triples, to save on memory
            assert len(self.dropout) == 2

            keep, keepid = 1.0 - self.dropout[0], 1.0 - self.dropout[1]

            # NOTE(review): rows [nt, nt + n) are sampled with their own rate
            # (presumably the self loops) -- confirm this matches the row
            # order of the triple tensor.
            nt = triples.size(0) - n

            keep_ind = random.sample(range(nt), k=int(floor(keep * nt)))
            keepid_ind = random.sample(range(nt, nt + n), k=int(floor(keepid * n)))
            ind = keep_ind + keepid_ind

            triples = triples[ind, :]

    nodes = self.embeddings if self.layer0 is None else self.layer0(triples=triples)

    if self.layer1 is not None:
        nodes = self.layer1(triples=triples, nodes=nodes)

    if self.do is not None:
        nodes = self.do(nodes)
        relations = self.do(self.relations)
    else:
        relations = self.relations

    if self.biases:
        biases = (self.gbias, self.sbias, self.pbias, self.obias)
    else:
        biases = None

    scores = self.decoder(batch, nodes, relations, biases=biases)

    assert scores.size() == (util.prod(dims), )

    return scores.view(*dims)
def forward(self, triples, nodes=None):
    """
    Apply one round of relational message passing over the graph defined
    by ``triples``.

    :param triples: tensor of triples, passed to ``util.adj_triples_tensor``
        to build the adjacency indices
    :param nodes: input node features; must be None exactly when
        ``self.insize`` is None (the input is then treated as the identity
        matrix)
    :return: tensor of size (n, self.outsize)
    """
    n, r = self.n, self.r
    rn = r * n

    ## Construct the graph

    # horizontally and vertically stacked versions of the adjacency graph
    # (the vertical is always necessary to normalize the adjacencies)
    if self.hor:
        hor_ind, hor_size = util.adj_triples_tensor(triples, n, r, vertical=False)

    ver_ind, ver_size = util.adj_triples_tensor(triples, n, r, vertical=True)
    rn, _ = ver_size

    # compute values of row-normalized adjacency matrices (same for hor and ver)
    vals = torch.ones(ver_ind.size(0), dtype=torch.float, device=d(triples))
    vals = vals / util.sum_sparse(ver_ind, vals, ver_size)

    # the adjacency matrix is stored on the module
    if self.hor:
        self.adj = torch.sparse.FloatTensor(indices=hor_ind.t(), values=vals, size=hor_size)
    else:
        self.adj = torch.sparse.FloatTensor(indices=ver_ind.t(), values=vals, size=ver_size)

    if triples.is_cuda:
        self.adj = self.adj.to('cuda')

    ## Perform message passing
    assert (nodes is None) == (self.insize is None)

    h0 = n if self.insize is None else self.insize
    h1 = self.outsize

    # one (h0, h1) weight matrix per relation, optionally decomposed
    if self.decomp is None:
        weights = self.weights
    elif self.decomp == 'basis':
        weights = torch.einsum('rb, bij -> rij', self.comps, self.bases)
    elif self.decomp == 'block':
        weights = util.block_diag(self.blocks)
        # TODO: multiply in block form (more efficient, but implementation differs per layer type)

    assert weights.size() == (r, h0, h1)

    if self.insize is None:
        # -- input is the identity matrix, just multiply the weights by the adjacencies
        out = torch.mm(self.adj, weights.view(r * h0, h1))
    elif self.hor:
        # -- input is high-dim and output is low dim, multiply h0 x weights first
        nodes = nodes[None, :, :].expand(r, n, h0)
        nw = torch.einsum('rni, rio -> rno', nodes, weights).contiguous()
        out = torch.mm(self.adj, nw.view(r * n, h1))
    else:
        # -- adj x h0 first, then weights
        out = torch.mm(self.adj, nodes)  # sparse mm
        out = out.view(r, n, h0)  # new dim for the relations
        out = torch.einsum('rio, rni -> no', weights, out)

    assert out.size() == (n, h1)

    return out + self.bias
def compute_compression(model, data, context, batch_size):
    """
    Compute the _compression_ of a dataset under a model: the number of bits
    needed to represent the dataset when the model's probability
    distribution is turned into a code for the outcomes. See
    [this video](https://youtu.be/mSneVjDvzNQ) for an explanation.

    For speed the data is processed in batches. Every token is predicted
    from the ``context`` tokens before it, so each instance is a window of
    ``context + 1`` tokens, shifted one token ahead of the previous one.
    Only the probability predicted for the last token of every window is
    used.

    :param model: A sequence-to-sequence model that takes as input a (sub)
        sequence of integers and produces a probability distribution on the
        output (as natural logarithms).
    :param data: A single 1-d tensor of integers representing the data
    :param context: number of preceding tokens each prediction is based on
    :param batch_size: number of instances run through the model at once
    :return: The result of the computation in "bits per byte". That is, how
        many bits the compressed representation spends on each byte
        (=ASCII character) of the raw data.
    """
    window = context + 1          # all instances are `context` + 1 long
    num_tokens = data.size(0)

    total_bits = 0.0
    pending = []                  # buffer; run through the model when full

    for position in range(num_tokens):
        start = max(0, position - context)
        instance = data[start:position + 1].to(torch.long)

        shortfall = window - instance.size(0)
        if shortfall > 0:
            # -- the first tokens don't have enough tokens preceding them,
            #    so we pad them on the left to the right size.
            padding = torch.zeros(size=(shortfall, ), dtype=torch.long)
            instance = torch.cat([padding, instance], dim=0)

        assert instance.size(0) == window

        if torch.cuda.is_available():
            instance = instance.cuda()

        # -- singleton dimension to concatenate along later
        pending.append(instance[None, :])

        buffer_full = len(pending) == batch_size
        at_end = position == num_tokens - 1
        if not (buffer_full or at_end):
            continue

        # batch is full or we are at the last instance: run it through the model
        stacked = torch.cat(pending, dim=0)
        inputs, target = stacked[:, :-1], stacked[:, -1]

        output = model(inputs)
        rows = torch.arange(len(pending), device=d())
        lnprobs = output[rows, -1, target]

        # -- natural-log probabilities converted to base 2 give bits; the
        #    negative log_2 probabilities are added to the running total.
        total_bits -= (lnprobs * LOG2E).sum()

        pending = []              # clear the buffer

    return total_bits / num_tokens  # bits-per-byte
def go(arg):
    """
    Train and evaluate a link predictor, ``arg.repeats`` times.

    Loads the dataset ``arg.name``, trains an ``embed.LinkPredictor`` with
    negative sampling for ``arg.epochs`` epochs per repeat, evaluates the
    MRR and hits@k every ``arg.eval_int`` epochs (and after the last
    epoch), logs to Tensorboard and finally prints the mean and standard
    deviation of the final MRR over all repeats.

    :param arg: parsed command-line options
    :raises Exception: if ``arg.opt`` names an unknown optimizer
    """
    global repeats
    repeats = arg.repeats

    tbdir = arg.tb_dir if arg.tb_dir is not None else os.path.join('./runs', get_slug(arg))[:250]
    tbw = SummaryWriter(log_dir=tbdir)

    test_mrrs = []

    train, val, test, (n2i, i2n), (r2i, i2r) = \
        embed.load(arg.name)

    # set of all triples (for filtering)
    alltriples = set()
    for s, p, o in torch.cat([train, val, test], dim=0):
        s, p, o = s.item(), p.item(), o.item()
        alltriples.add((s, p, o))

    truedicts = util.truedicts(alltriples)

    # final run: train on train+val and evaluate on test; otherwise evaluate on val
    if arg.final:
        train, test = torch.cat([train, val], dim=0), test
    else:
        train, test = train, val

    # Candidate pools for negative sampling when arg.limit_negatives is set.
    # NOTE(review): iterating a tensor yields 0-d tensors, which hash by
    # identity, so these sets probably do not remove duplicates -- confirm
    # whether frequency-weighted candidates are intended.
    subjects = torch.tensor(list({s for s, _, _ in train}), dtype=torch.long, device=d())
    predicates = torch.tensor(list({p for _, p, _ in train}), dtype=torch.long, device=d())
    objects = torch.tensor(list({o for _, _, o in train}), dtype=torch.long, device=d())
    ccandidates = (subjects, predicates, objects)

    print(len(i2n), 'nodes')
    print(len(i2r), 'relations')
    print(train.size(0), 'training triples')
    print(test.size(0), 'test triples')
    print(train.size(0) + test.size(0), 'total triples')

    for r in tqdm.trange(repeats) if repeats > 1 else range(repeats):

        # Define model
        model = embed.LinkPredictor(
            triples=train, n=len(i2n), r=len(i2r), embedding=arg.emb, biases=arg.biases,
            edropout=arg.edo, rdropout=arg.rdo, decoder=arg.decoder, reciprocal=arg.reciprocal,
            init_method=arg.init_method, init_parms=arg.init_parms)

        if torch.cuda.is_available():
            prt('Using CUDA.')
            model.cuda()

        if arg.opt == 'adam':
            opt = torch.optim.Adam(model.parameters(), lr=arg.lr)
        elif arg.opt == 'adamw':
            opt = torch.optim.AdamW(model.parameters(), lr=arg.lr)
        elif arg.opt == 'adagrad':
            opt = torch.optim.Adagrad(model.parameters(), lr=arg.lr)
        elif arg.opt == 'sgd':
            opt = torch.optim.SGD(model.parameters(), lr=arg.lr, nesterov=True, momentum=arg.momentum)
        else:
            raise Exception(f'Optimizer {arg.opt} not recognized.')

        # -- defaults taken from libkge
        sched = torch.optim.lr_scheduler.ReduceLROnPlateau(
            patience=arg.patience, optimizer=opt, mode='max', factor=0.95, threshold=0.0001) \
            if arg.sched else None

        # optional rescaling weight passed to the BCE loss
        weight = torch.tensor([arg.nweight, 1.0], device=d()) if arg.nweight else None

        seen = 0
        for e in range(arg.epochs):

            seeni, sumloss = 0, 0.0
            # timers, accumulated over the epoch
            tforward = tbackward = 0
            rforward = rbackward = 0
            tprep = tloss = 0
            tic()

            for fr in trange(0, train.size(0), arg.batch):
                to = min(train.size(0), fr + arg.batch)

                model.train(True)
                opt.zero_grad()

                positives = train[fr:to].to(d())

                for ctarget in [0, 1, 2]:  # which part of the triple to corrupt
                    ng = arg.negative_rate[ctarget]  # nr of negatives sampled

                    if ng > 0:

                        with torch.no_grad():
                            bs, _ = positives.size()

                            tic()
                            if arg.limit_negatives:
                                # sample only from values seen at this position in the training data
                                cand = ccandidates[ctarget]
                                mx = cand.size(0)
                                idx = torch.empty(bs, ng, dtype=torch.long, device=d()).random_(0, mx)
                                corruptions = cand[idx]
                            else:
                                mx = len(i2r) if ctarget == 1 else len(i2n)
                                corruptions = torch.empty(bs, ng, dtype=torch.long, device=d()).random_(0, mx)
                            tprep += toc()

                            s, p, o = positives[:, 0:1], positives[:, 1:2], positives[:, 2:3]
                            if ctarget == 0:
                                s = torch.cat([s, corruptions], dim=1)
                            if ctarget == 1:
                                p = torch.cat([p, corruptions], dim=1)
                            if ctarget == 2:
                                o = torch.cat([o, corruptions], dim=1)

                            # -- NB: two of the index vectors s, p o are now size (bs, 1) and the other is (bs, ng+1)
                            #    We will let the model broadcast these to give us a score tensor of (bs, ng+1)
                            #    In most cases we can optimize the decoder to broadcast late for better speed.

                            if arg.loss == 'bce':
                                labels = torch.cat([torch.ones(bs, 1, device=d()), torch.zeros(bs, ng, device=d())], dim=1)
                            elif arg.loss == 'ce':
                                labels = torch.zeros(bs, dtype=torch.long, device=d())
                                # -- CE loss treats the problem as a multiclass classification problem: for a positive triple,
                                #    together with its k corruptions, identify which is the true triple. This is always triple 0.
                                #    (It may seem like the model could easily cheat by always choosing triple 0, but the score
                                #    function is order equivariant, so it can't choose by ordering.)

                        recip = None if not arg.reciprocal else ('head' if ctarget == 0 else 'tail')
                        # -- We use the tail relations if the target is the relation (usually p-corruption is not used)

                        tic()
                        out = model(s, p, o, recip=recip)
                        tforward += toc()

                        assert out.size() == (bs, ng + 1), f'{out.size()=} {(bs, ng + 1)=}'

                        tic()
                        if arg.loss == 'bce':
                            loss = F.binary_cross_entropy_with_logits(out, labels, weight=weight, reduction=arg.lred)
                        elif arg.loss == 'ce':
                            loss = F.cross_entropy(out, labels, reduction=arg.lred)

                        assert not torch.isnan(loss), 'Loss has become NaN'

                        sumloss += float(loss.item())
                        seen += bs
                        seeni += bs
                        tloss += toc()

                        tic()
                        loss.backward()
                        tbackward += toc()
                        # No step yet, we accumulate the gradients over all corruptions.
                        # -- this causes problems with modules like batchnorm, so be careful when porting.

                tic()
                regloss = None
                if arg.reg_eweight is not None:
                    regloss = model.penalty(which='entities', p=arg.reg_exp, rweight=arg.reg_eweight)

                if arg.reg_rweight is not None:
                    # Add to the entity penalty instead of replacing it;
                    # previously the entity penalty was silently dropped
                    # whenever both weights were set.
                    rpenalty = model.penalty(which='relations', p=arg.reg_exp, rweight=arg.reg_rweight)
                    regloss = rpenalty if regloss is None else regloss + rpenalty
                rforward += toc()

                tic()
                if regloss is not None:
                    sumloss += float(regloss.item())
                    regloss.backward()
                rbackward += toc()

                opt.step()

                # NOTE(review): `loss` is only defined if at least one
                # negative rate is > 0.
                tbw.add_scalar('biases/train_loss', float(loss.item()), seen)

            if e == 0:
                print(f'\n pred: forward {tforward:.4}, backward {tbackward:.4}')
                print(f' reg: forward {rforward:.4}, backward {rbackward:.4}')
                print(f' prep {tprep:.4}, loss {tloss:.4}')
                print(f' total: {toc():.4}')
                # -- NB: these numbers will not be accurate for GPU runs unless CUDA_LAUNCH_BLOCKING is set to 1

            # Evaluate
            if ((e + 1) % arg.eval_int == 0) or e == arg.epochs - 1:

                with torch.no_grad():

                    model.train(False)

                    if arg.eval_size is None:
                        testsub = test
                    else:
                        testsub = test[random.sample(range(test.size(0)), k=arg.eval_size)]

                    mrr, hits, ranks = util.eval(
                        model=model, valset=testsub, truedicts=truedicts, n=len(i2n),
                        batch_size=arg.test_batch, verbose=True)

                    if arg.check_simple:  # double-check using a separate, slower implementation
                        mrrs, hitss, rankss = util.eval_simple(
                            model=model, valset=testsub, alltriples=alltriples, n=len(i2n), verbose=True)

                        assert ranks == rankss
                        assert mrr == mrrs

                    print(f'epoch {e}: MRR {mrr:.4}\t hits@1 {hits[0]:.4}\t hits@3 {hits[1]:.4}\t hits@10 {hits[2]:.4}')

                    tbw.add_scalar('biases/mrr', mrr, e)
                    tbw.add_scalar('biases/h@1', hits[0], e)
                    tbw.add_scalar('biases/h@3', hits[1], e)
                    tbw.add_scalar('biases/h@10', hits[2], e)

                    if sched is not None:
                        sched.step(mrr)  # reduce lr if mrr stalls

        # the last epoch always evaluates, so `mrr` is the final score of this repeat
        test_mrrs.append(mrr)

    print('training finished.')

    temrrs = torch.tensor(test_mrrs)
    print(f'mean test MRR {temrrs.mean():.3} ({temrrs.std():.3}) \t{test_mrrs}')
# Description of the server settings: general metadata plus one entry in
# "params" for every configurable option (id, label, type, default, help text).
BASE_SERVER_CONFIG = d(dict(
    id="server_config",
    display="server_config",
    preconfig=False,
    presets=[],
    params=[
        dict(
            id="external_access",
            label="Allow External Access",
            type="bool",
            default=True,
            help="On: Other computers on your network can access PixelWeb. Off: LocalHost access only.",
        ),
        dict(
            id="port",
            label="Server Port",
            type="int",
            default=8080,
            help="Port to listen on.",
        ),
        dict(
            id="load_defaults",
            label="Load Last Config on Start",
            type="bool",
            default=False,
            help="Load last driver/controller configuration on application start.",
        ),
        dict(
            id="show_debug",
            label="Show Debug in Console",
            type="bool",
            default=False,
            help="Show BiblioPixel debug in server console (not in main UI).",
        ),
        dict(
            id="mod_dirs",
            label="Module Directories",
            type="str_multi",
            default=[],
            help="Directories from which to load modules (animations, drivers, controllers, pre-configs).",
            replace={"\\": "/"},
        ),
        dict(
            id="off_anim_time",
            label="All Off Timeout",
            type="int",
            default=10,
            min=0,
            max=3600,
            help="Keep display off when not running an animation by actively turning all pixels off every X seconds. Set to 0 to disable.",
        ),
    ],
))
#!/usr/bin/env python # -*- coding: utf-8 -*- """ Project Euler 23: Find the sum of all the positive integers which cannot be written as the sum of two abundant numbers. A perfect number is a number for which the sum of its proper divisors is exactly equal to the number. For example, the sum of the proper divisors of 28 would be 1 + 2 + 4 + 7 + 14 = 28, which means that 28 is a perfect number. A number whose proper divisors are less than the number is called deficient and a number whose proper divisors exceed the number is called abundant. As 12 is the smallest abundant number, 1 + 2 + 3 + 4 + 6 = 16, the smallest number that can be written as the sum of two abundant numbers is 24. By mathematical analysis, it can be shown that all integers greater than 28123 can be written as the sum of two abundant numbers. However, this upper limit cannot be reduced any further by analysis even though it is known that the greatest number that cannot be expressed as the sum of two abundant numbers is less than this limit. Find the sum of all the positive integers which cannot be written as the sum of two abundant numbers. """ import util limit = 20162 sum = 0 # it's a set, after all. sets are faster than lists for our needs. abn = set() for n in range(1, limit): if util.d(n) > n: abn.add(n) # if the difference of the number we're examining and every number in the set # is in the set, then the number is the sum of two abundant numbers. # otherwise, we must add it to our sum in question. if not any( (n-a in abn) for a in abn ): sum += n
def go(arg):
    """
    Train a generative transformer on enwik8 and periodically report its
    compression (bits per byte) together with a sample of generated text.

    :param arg: parsed command-line options
    """
    if arg.seed < 0:
        seed = random.randint(0, 1000000)
        print('random seed: ', seed)
        # Actually apply the seed that is reported; previously it was
        # printed but never used, so the run could not be reproduced.
        torch.manual_seed(seed)
    else:
        torch.manual_seed(arg.seed)

    tbw = SummaryWriter(log_dir=arg.tb_dir)  # Tensorboard logging

    # load the data (validation unless arg.final is true, then test)
    arg.data = here('data/enwik8.gz') if arg.data is None else arg.data

    data_train, data_val, data_test = enwik8(arg.data)
    data_train, data_test = (torch.cat([data_train, data_val], dim=0), data_test) \
        if arg.final else (data_train, data_val)

    # create the model
    model = GTransformer(emb=arg.embedding_size, heads=arg.num_heads, depth=arg.depth,
                         seq_length=arg.context, num_tokens=NUM_TOKENS)
    if torch.cuda.is_available():
        model.cuda()

    opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())

    # training loop
    # - note: we don't loop over the data, instead we sample a batch of random subsequences each time.
    for i in tqdm.trange(arg.num_batches):

        # learning rate warmup
        # - we linearly increase the learning rate from 1e-10 to arg.lr over
        #   the first arg.lr_warmup batches
        # - the rate has to be written into the optimizer's param_groups;
        #   setting `opt.lr` only creates an attribute the optimizer never
        #   reads. The bound is inclusive so that the final value is arg.lr.
        if arg.lr_warmup > 0 and i <= arg.lr_warmup:
            lr = max(arg.lr * min(i / arg.lr_warmup, 1.0), 1e-10)
            for group in opt.param_groups:
                group['lr'] = lr

        opt.zero_grad()

        # sample a batch of random subsequences
        starts = torch.randint(size=(arg.batch_size, ), low=0, high=data_train.size(0) - arg.context - 1)
        seqs_source = [data_train[start:start + arg.context] for start in starts]
        seqs_target = [data_train[start + 1:start + arg.context + 1] for start in starts]
        source = torch.cat([s[None, :] for s in seqs_source], dim=0).to(torch.long)
        target = torch.cat([s[None, :] for s in seqs_target], dim=0).to(torch.long)
        # - target is the same sequence as source, except one character ahead

        if torch.cuda.is_available():
            source, target = source.cuda(), target.cuda()
        source, target = Variable(source), Variable(target)

        output = model(source)

        loss = F.nll_loss(output.transpose(2, 1), target, reduction='mean')
        tbw.add_scalar('transformer/train-loss', float(loss.item()) * LOG2E, i * arg.batch_size)

        loss.backward()

        # clip gradients
        # - If the total gradient vector has a length > arg.gradient_clipping,
        #   we clip it back down to that length.
        if arg.gradient_clipping > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

        opt.step()

        # - validate every {arg.test_every} steps. First we compute the
        #   compression on the validation (or a subset)
        #   then we generate some random text to monitor progress
        if i != 0 and (i % arg.test_every == 0 or i == arg.num_batches - 1):

            # the full evaluation set is only used after the last batch
            upto = data_test.size(0) if i == arg.num_batches - 1 else arg.test_subset
            data_sub = data_test[:upto]

            with torch.no_grad():
                bits = 0.0
                batch = []  # buffer, every time it fills up, we run it through the model

                for current in range(data_sub.size(0)):

                    fr = max(0, current - arg.context)
                    to = current + 1

                    context = data_sub[fr:to].to(torch.long)
                    if context.size(0) < arg.context + 1:
                        # left-pad the first instances up to the full width
                        pad = torch.zeros(size=(arg.context + 1 - context.size(0), ), dtype=torch.long)
                        context = torch.cat([pad, context], dim=0)

                        assert context.size(0) == arg.context + 1

                    if torch.cuda.is_available():
                        context = context.cuda()

                    batch.append(context[None, :])

                    if len(batch) == arg.test_batchsize or current == data_sub.size(0) - 1:

                        # batch is full, run it through the model
                        b = len(batch)

                        all_instances = torch.cat(batch, dim=0)
                        source = all_instances[:, :-1]  # input
                        target = all_instances[:, -1]   # target values

                        output = model(source)

                        lnprobs = output[torch.arange(b, device=d()), -1, target]
                        log2probs = lnprobs * LOG2E  # convert from nats to bits

                        bits += -log2probs.sum()
                        batch = []  # empty buffer

                bits_per_byte = bits / data_sub.size(0)

                # print validation performance. 1 bit per byte is (currently) state of the art.
                print(f'epoch{i}: {bits_per_byte:.4} bits per byte')
                tbw.add_scalar('transformer/eval-loss', bits_per_byte, i * arg.batch_size)

                # generate some random text
                GENSIZE = 600
                TEMP = 0.5

                seedfr = random.randint(0, data_test.size(0) - arg.context)
                seq = data_test[seedfr:seedfr + arg.context].to(torch.long)

                if torch.cuda.is_available():
                    seq = seq.cuda()

                seq = Variable(seq)

                # print the seed text between brackets, then the continuation
                print('[', end='', flush=True)
                for c in seq:
                    print(str(chr(c)), end='', flush=True)
                print(']', end='', flush=True)

                for _ in range(GENSIZE):
                    output = model(seq[None, :])
                    c = sample(output[0, -1, :], TEMP)
                    # characters below 32 are printed as a space
                    print(str(chr(max(32, c))), end='', flush=True)

                    # slide the window: drop the oldest token, append the new one
                    seq = torch.cat([seq[1:], c[None]], dim=0)

                print()
def go(arg):
    """
    Measure the bias of several sampled gradient estimators on a discrete-latent autoencoder.

    If './bias/cached.npz' does not exist yet, an encoder/decoder pair is trained for
    `arg.epochs` epochs on the exact expected loss (every one of the `arg.latent_size`
    one-hot codes is decoded and the losses are weighted by the softmax of the encoder
    output). On one further batch the exact encoder gradient is computed, followed by
    `arg.samples` estimates from each of three estimators:

     - uste: k latent indices drawn uniformly without replacement per instance
     - iste: as uste, but the argmax index is always part of the sample
     - gste: straight-through estimator with gumbel noise, averaged over k draws

    Only `arg.num_params` randomly chosen gradient entries are kept. The results are stored
    in './bias/cached.npz'; when that file already exists it is loaded and nothing is
    trained. Finally histograms are written to './bias/'.

    :param arg: Namespace of options. Reads: bins, seed, tb_dir, task, data, final, batch,
        batch_size, latent_size, depth, cuda, lr, num_params, k, epochs, samples, gumbel,
        skip, range.
    :return: None
    """

    # -- bins may be an integer or a string understood by plt.hist
    try:
        arg.bins = int(arg.bins)
    except ValueError:
        pass

    util.makedirs('./bias/')

    if not os.path.exists('./bias/cached.npz'):

        if arg.seed < 0:
            seed = random.randint(0, 1000000)
            print('random seed: ', seed)
            # -- actually apply the seed that was just reported, so the run can be repeated
            torch.manual_seed(seed)
        else:
            torch.manual_seed(arg.seed)
        # NOTE(review): Python's `random` module (used for the index samples below) is not
        #   seeded in either branch.

        tbw = SummaryWriter(log_dir=arg.tb_dir)  # not written to below; kept for its side effects
        tfms = transforms.Compose([transforms.ToTensor()])

        # -- device selected by arg.cuda, used for every tensor allocated below
        dev = d(arg.cuda)

        if (arg.task == 'mnist'):

            shape = (1, 28, 28)
            data = arg.data + os.sep + arg.task

            if arg.final:
                # NOTE(review): this branch reads arg.batch_size, all other branches read
                #   arg.batch -- confirm which attribute the argument parser defines.
                train = torchvision.datasets.MNIST(root=data, train=True, download=True, transform=tfms)
                trainloader = torch.utils.data.DataLoader(train, batch_size=arg.batch_size, shuffle=True, num_workers=0)

                test = torchvision.datasets.MNIST(root=data, train=False, download=True, transform=ToTensor())
                testloader = torch.utils.data.DataLoader(test, batch_size=arg.batch_size, shuffle=False, num_workers=0)

            else:
                NUM_TRAIN = 45000
                NUM_VAL = 5000
                total = NUM_TRAIN + NUM_VAL

                train = torchvision.datasets.MNIST(root=data, train=True, download=True, transform=tfms)

                trainloader = DataLoader(train, batch_size=arg.batch, sampler=util.ChunkSampler(0, NUM_TRAIN, total))
                testloader = DataLoader(train, batch_size=arg.batch, sampler=util.ChunkSampler(NUM_TRAIN, NUM_VAL, total))

        elif (arg.task == 'cifar10'):

            shape = (3, 32, 32)
            data = arg.data + os.sep + arg.task

            if arg.final:
                train = torchvision.datasets.CIFAR10(root=data, train=True, download=True, transform=tfms)
                trainloader = torch.utils.data.DataLoader(train, batch_size=arg.batch, shuffle=True, num_workers=2)

                test = torchvision.datasets.CIFAR10(root=data, train=False, download=True, transform=ToTensor())
                testloader = torch.utils.data.DataLoader(test, batch_size=arg.batch, shuffle=False, num_workers=2)

            else:
                NUM_TRAIN = 45000
                NUM_VAL = 5000
                total = NUM_TRAIN + NUM_VAL

                train = torchvision.datasets.CIFAR10(root=data, train=True, download=True, transform=tfms)

                trainloader = DataLoader(train, batch_size=arg.batch, sampler=util.ChunkSampler(0, NUM_TRAIN, total))
                testloader = DataLoader(train, batch_size=arg.batch, sampler=util.ChunkSampler(NUM_TRAIN, NUM_VAL, total))

        elif arg.task == 'ffhq':

            transform = ToTensor()
            shape = (3, 128, 128)

            trainset = torchvision.datasets.ImageFolder(root=arg.data+os.sep+'train', transform=transform)
            trainloader = torch.utils.data.DataLoader(trainset, batch_size=arg.batch, shuffle=True, num_workers=2)

            testset = torchvision.datasets.ImageFolder(root=arg.data+os.sep+'valid', transform=transform)
            testloader = torch.utils.data.DataLoader(testset, batch_size=arg.batch, shuffle=False, num_workers=2)

        else:
            raise Exception('Task {} not recognized'.format(arg.task))

        encoder = Encoder(shape, latent_size=arg.latent_size, depth=arg.depth)
        decoder = Decoder(shape, latent_size=arg.latent_size, depth=arg.depth)

        if arg.cuda:
            encoder.cuda()
            decoder.cuda()

        opt = torch.optim.Adam(params=list(encoder.parameters()) + list(decoder.parameters()), lr=arg.lr)

        nparms = num_params([encoder])
        print(f'{nparms} parameters in encoder.')

        l = arg.latent_size
        ti = random.sample(range(nparms), arg.num_params)  # random indices of parameters for which to test the gradient
        k = arg.k

        # Train for a fixed nr of epochs (with the true gradient)
        for e in range(arg.epochs):
            print('epoch', e)

            for i, (inputs, _) in enumerate(trainloader):

                b, c, h, w = inputs.size()

                if arg.cuda:
                    inputs = inputs.cuda()

                opt.zero_grad()

                latent = encoder(inputs)
                latent = F.softmax(latent, dim=1)

                # -- decode every one-hot code once per instance
                dinp = torch.eye(l, device=dev)[None, :, :].expand(b, l, l).reshape(b*l, l)
                dout = decoder(dinp)

                assert dout.size() == (b*l, c, h, w)

                target = inputs.detach()[:, None, :, :, :].expand(b, l, c, h, w).reshape(b*l, c, h, w)

                loss = F.binary_cross_entropy(dout, target, reduction='none')
                loss = loss.sum(dim=1).sum(dim=1).sum(dim=1).view(b, l)

                # -- expectation of the loss under the softmax distribution
                loss = (loss * latent).sum(dim=1).mean()

                loss.backward()
                # -- the gradient is not extracted here: only the gradient on the evaluation
                #    batch below is ever used.

                opt.step()

        # - Evaluation batch, shared by the true gradient and all estimators
        inputs, _ = next(iter(trainloader))
        if arg.cuda:
            inputs = inputs.cuda()

        b, c, h, w = inputs.size()

        # - compute true gradient
        opt.zero_grad()

        latent = encoder(inputs)
        latent = F.softmax(latent, dim=1)

        dinp = torch.eye(l, device=dev)[None, :, :].expand(b, l, l).reshape(b*l, l)
        dout = decoder(dinp)

        assert dout.size() == (b*l, c, h, w)

        target = inputs.detach()[:, None, :, :, :].expand(b, l, c, h, w).reshape(b*l, c, h, w)

        loss = F.binary_cross_entropy(dout, target, reduction='none')
        loss = loss.sum(dim=1).sum(dim=1).sum(dim=1).view(b, l)
        loss = (loss * latent).sum(dim=1).mean()

        loss.backward()

        true_gradient = gradient([encoder])
        true_gradient = true_gradient[ti]

        # - Estimate the bias for the uninformed sampler
        uste = torch.zeros((arg.samples, len(ti),), device=dev)

        # Unbiased, uninformed STE
        for s in trange(arg.samples):
            opt.zero_grad()

            # -- k distinct latent indices per instance
            ks = [random.sample(range(arg.latent_size), k) for _ in range(b)]
            ks = torch.tensor(ks, device=dev, dtype=torch.long)

            latent = encoder(inputs)
            latent = torch.gather(latent, dim=1, index=ks); assert latent.size() == (b, k)
            latent = F.softmax(latent, dim=1)  # softmax over the sampled entries only

            dinp = torch.zeros(size=(b*k, l), device=dev)
            dinp.scatter_(dim=1, index=ks.view(b*k, 1), value=1)
            dout = decoder(dinp)

            assert dout.size() == (b * k, c, h, w)

            target = inputs.detach()[:, None, :, :, :].expand(b, k, c, h, w).reshape(b * k, c, h, w)

            loss = F.binary_cross_entropy(dout, target, reduction='none')
            loss = loss.sum(dim=1).sum(dim=1).sum(dim=1).view(b, k)
            loss = (loss * latent).sum(dim=1).mean()

            loss.backward()

            samp_gradient = gradient([encoder])
            uste[s, :] = samp_gradient[ti]

            del loss

        iste = torch.zeros((arg.samples, len(ti),), device=dev)

        # Unbiased, informed STE
        # This behaves like the USTE, but ensures that the argmax is always included in the sample
        for s in trange(arg.samples):
            opt.zero_grad()

            latent = encoder(inputs)

            # -- draw k-1 indices from the l-1 slots that remain once the argmax is removed
            #    (explicit dtype keeps the tensor integral when k == 1 and the rows are empty)
            ks = [random.sample(range(arg.latent_size-1), k-1) for _ in range(b)]
            ks = torch.tensor(ks, device=dev, dtype=torch.long)

            am = latent.argmax(dim=1, keepdim=True)
            # -- map [0, l-1) onto [0, l) minus {am}: every index at or above the argmax
            #    shifts up by one. (A strict '>' would leave a sample equal to am in place,
            #    duplicating the argmax, and index am+1 could never be chosen.)
            ks[ks >= am] += 1

            ks = torch.cat([am, ks], dim=1)

            latent = torch.gather(latent, dim=1, index=ks); assert latent.size() == (b, k)
            latent = F.softmax(latent, dim=1)

            # -- same device as the other tensors (was the argument-less d())
            dinp = torch.zeros(size=(b * k, l), device=dev)
            dinp.scatter_(dim=1, index=ks.view(b * k, 1), value=1)
            dout = decoder(dinp)

            assert dout.size() == (b * k, c, h, w)

            target = inputs.detach()[:, None, :, :, :].expand(b, k, c, h, w).reshape(b * k, c, h, w)

            loss = F.binary_cross_entropy(dout, target, reduction='none')
            loss = loss.sum(dim=1).sum(dim=1).sum(dim=1).view(b, k)
            loss = (loss * latent).sum(dim=1).mean()

            loss.backward()

            samp_gradient = gradient([encoder])
            iste[s, :] = samp_gradient[ti]

            del loss

        # Biased (?) gumbel STE
        # STE with gumbel noise
        gste = torch.zeros((arg.samples, len(ti),), device=dev)

        for s in trange(arg.samples):
            # -- average k single-sample estimates
            for _ in range(k):
                opt.zero_grad()

                latent = encoder(inputs)

                # NOTE(review): the return value is ignored, so this presumably perturbs
                #   `latent` in place -- confirm against gumbelize().
                gumbelize(latent, temperature=arg.gumbel)
                latent = F.softmax(latent, dim=1)

                ks = latent.argmax(dim=1, keepdim=True)

                # -- same device as the other tensors (was the argument-less d())
                dinp = torch.zeros(size=(b, l), device=dev)
                dinp.scatter_(dim=1, index=ks, value=1)

                dinp = (dinp - latent).detach() + latent  # straight-through trick
                dout = decoder(dinp)

                assert dout.size() == (b, c, h, w)

                target = inputs.detach()

                loss = F.binary_cross_entropy(dout, target, reduction='none')
                loss = loss.sum(dim=1).sum(dim=1).sum(dim=1).view(b)
                loss = loss.mean()

                loss.backward()

                samp_gradient = gradient([encoder])
                gste[s, :] += samp_gradient[ti]

                del loss

            gste[s, :] /= k

        # -- A "classical STE" variant (index sampled from Categorical(logits=latent) and
        #    passed through the straight-through trick) used to be sketched here in
        #    commented-out form; it is not part of the experiment.

        uste = uste.cpu().numpy()
        iste = iste.cpu().numpy()
        gste = gste.cpu().numpy()
        tgrd = true_gradient.cpu().numpy()

        np.savez_compressed('./bias/cached.npz', uste=uste, iste=iste, gste=gste, tgrd=tgrd)

    else:
        res = np.load('./bias/cached.npz')
        uste, iste, gste, tgrd = res['uste'], res['iste'], res['gste'], res['tgrd']

    # -- only parameters whose true gradient is nonzero are plotted
    ind = tgrd != 0.0

    print(tgrd.shape, ind)
    print(f'{ind.sum()} derivatives out of {ind.shape} not equal to zero.')

    if not arg.skip:
        # - one histogram for each of the first five parameters with nonzero gradient
        for nth, i in enumerate(np.arange(ind.shape[0])[ind][:5]):

            plt.gcf().clear()

            unump = uste[:, i]
            inump = iste[:, i]
            gnump = gste[:, i]

            ulab = f'uninformed, var={unump.var():.4}'
            ilab = f'informed, var={inump.var():.4}'
            glab = f'Gumbel STE (t={arg.gumbel}) var={gnump.var():.4}'

            plt.hist([unump, inump, gnump], color=['r', 'g', 'b'], label=[ulab, ilab, glab], bins=arg.bins)

            plt.axvline(x=tgrd[i], color='k', label='true gradient')
            plt.axvline(x=unump.mean(), color='r', ls='--')
            plt.axvline(x=inump.mean(), color='g', ls='-.')
            plt.axvline(x=gnump.mean(), color='b', ls=':')

            plt.title(f'estimates for parameter ... ({uste.shape[0]} samples)')
            plt.legend()
            util.basic()

            plt.savefig(f'./bias/histogram.{nth}.pdf')

    # - histogram of the absolute error of the mean estimate, over all nonzero-gradient parameters
    plt.gcf().clear()

    unump = uste[:, ind].mean(axis=0)
    inump = iste[:, ind].mean(axis=0)
    gnump = gste[:, ind].mean(axis=0)
    tnump = tgrd[ind]

    unump = np.abs(unump - tnump)
    inump = np.abs(inump - tnump)
    gnump = np.abs(gnump - tnump)

    ulab = f'uninformed, var={unump.var():.4}'
    ilab = f'informed, var={inump.var():.4}'
    glab = f'gumbel STE (t={arg.gumbel}) var={gnump.var():.4}'

    plt.hist([unump, inump, gnump], color=['r', 'g', 'b'], label=[ulab, ilab, glab], bins=arg.bins)

    plt.axvline(x=unump.mean(), color='r', ls='--')
    plt.axvline(x=inump.mean(), color='g', ls='-.')
    plt.axvline(x=gnump.mean(), color='b', ls=':')

    plt.title(f'Absolute error between true gradient and estimate \n over {ind.sum()} parameters with nonzero gradient.')

    plt.legend()
    util.basic()

    if arg.range is not None:
        plt.xlim(*arg.range)

    plt.savefig('./bias/histogram.all.pdf')
def go(arg):
    """
    Train a GPT2Wrapper model on text, periodically printing a generated sample and
    evaluating on held-out data.

    Training does not iterate over the data in order; every step draws `arg.batch_size`
    random subsequences of length `model.ctx` from the tokenized training text. Training and
    evaluation losses (converted from nats to bits) are logged to tensorboard.

    :param arg: Namespace of options. Reads: seed, tb_dir, data, final, iblocks, lr,
        num_batches, batch_size, gradient_clipping, print_every, print_seed_size, print_size,
        sampling_temp, test_every, test_subset, test_batchsize.
    :return: None
    """

    if arg.seed < 0:
        seed = random.randint(0, 1000000)
        print('random seed: ', seed)
        # -- actually apply the seed that was just reported, so the run can be repeated
        torch.manual_seed(seed)
    else:
        torch.manual_seed(arg.seed)

    tbw = SummaryWriter(log_dir=arg.tb_dir)  # Tensorboard logging

    arg.data = here('data/enwik8.gz') if arg.data is None else arg.data

    str_train, str_val, str_test = load_text(arg.data)
    # -- in a final run train on train+val and evaluate on test, otherwise evaluate on val
    str_train, str_test = (str_train + str_val, str_test) \
        if arg.final else (str_train, str_val)

    # create the model
    model = GPT2Wrapper(iblocks=arg.iblocks)

    if torch.cuda.is_available():
        model.to('cuda')
        model.model.mod[0].to('cuda')

    # tokenize the data
    # -- the validation text is not tokenized separately: it is already contained in either
    #    str_train or str_test, and the separate tensor was never read.
    data_train = torch.tensor(model.tokenizer.encode(str_train))
    data_test = torch.tensor(model.tokenizer.encode(str_test))

    opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())
    # sch = torch.optim.lr_scheduler.LambdaLR(opt, lambda i: min(i / (arg.lr_warmup / arg.batch_size), 1.0))
    # -- linear learning rate warmup

    last = arg.num_batches - 1  # index of the final training step

    # training loop
    # -- note: we don't loop over the data, instead we sample a batch of random subsequences each time.
    for i in tqdm.trange(arg.num_batches):

        opt.zero_grad()

        # sample a batch of random subsequences
        starts = torch.randint(size=(arg.batch_size, ), low=0, high=data_train.size(0) - model.ctx - 1)
        seqs_source = [data_train[start:start + model.ctx] for start in starts]
        seqs_target = [data_train[start + 1:start + model.ctx + 1] for start in starts]
        source = torch.stack(seqs_source, dim=0).to(torch.long)
        target = torch.stack(seqs_target, dim=0).to(torch.long)
        # -- target is the same sequence as source, except one token ahead

        if torch.cuda.is_available():
            source, target = source.to('cuda'), target.to('cuda')

        output = model(source)

        # -- cross_entropy expects the class dimension second, hence the transpose
        loss = F.cross_entropy(output.transpose(2, 1), target, reduction='mean')
        tbw.add_scalar('podcasts/train-loss', float(loss.item()) * LOG2E, i * arg.batch_size)

        loss.backward()

        # clip gradients
        # - If the total gradient vector has a length > arg.gradient_clipping, it is scaled back down.
        if arg.gradient_clipping > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

        opt.step()
        # sch.step()

        model.clear()

        # - every {arg.print_every} steps (and at the last step), generate some random text
        #   to monitor progress
        if i != 0 and (i % arg.print_every == 0 or i == last):
            with torch.no_grad():

                # pick a random seed sequence from the evaluation data
                seedfr = random.randint(0, data_test.size(0) - arg.print_seed_size)
                window = data_test[seedfr:seedfr + arg.print_seed_size].to(torch.long)

                if torch.cuda.is_available():
                    window = window.cuda()

                # print the seed
                strinput = model.tokenizer.decode(window)
                print(f'[{strinput}]', end='')

                outseq = []
                for _ in range(arg.print_size):
                    output = model(window[None, :])
                    c = sample(output[0, -1, :], arg.sampling_temp)
                    outseq.append(c[None])

                    # -- slide the window: drop the oldest token, append the sampled one
                    window = torch.cat([window[1:], c[None]], dim=0)

                outseq = torch.cat(outseq, dim=0)
                outseq = model.tokenizer.decode(outseq)

                print(outseq)

        # - validate every {arg.test_every} steps (and at the last step): compute the
        #   compression on the evaluation data (or a subset)
        if i != 0 and (i % arg.test_every == 0 or i == last):
            with torch.no_grad():

                # -- the full evaluation set is only used at the very last step
                upto = data_test.size(0) if i == last else arg.test_subset
                data_sub = data_test[:upto]

                bits = 0.0
                batch = []  # buffer, every time it fills up, we run it through the model

                for current in range(data_sub.size(0)):

                    fr = max(0, current - model.ctx)
                    to = current + 1

                    context = data_sub[fr:to].to(torch.long)
                    if context.size(0) < model.ctx + 1:
                        # -- left-pad with zeros so that every context has the same length
                        pad = torch.zeros(size=(model.ctx + 1 - context.size(0), ), dtype=torch.long)
                        context = torch.cat([pad, context], dim=0)

                        assert context.size(0) == model.ctx + 1

                    if torch.cuda.is_available():
                        context = context.cuda()

                    batch.append(context[None, :])

                    if len(batch) == arg.test_batchsize or current == data_sub.size(0) - 1:

                        # batch is full, run it through the model
                        b = len(batch)

                        stacked = torch.cat(batch, dim=0)
                        source = stacked[:, :-1]  # input
                        target = stacked[:, -1]   # target values

                        output = model(source)

                        # -- score of the true next token at the last position
                        #    NOTE(review): assumes the model outputs log-probabilities -- confirm.
                        lnprobs = output[torch.arange(b, device=d()), -1, target]
                        log2probs = lnprobs * LOG2E  # convert from nats to bits

                        bits += -log2probs.sum()

                        batch = []  # empty buffer

                # NOTE(review): data_sub holds tokenizer output, so this is bits per token
                #   rather than per byte -- confirm before comparing to byte-level results.
                bits_per_byte = bits / data_sub.size(0)

                # print validation performance. 0.92 bit per byte is (currently) state of the art.
                print(f'epoch{i}: {bits_per_byte:.4} bits per byte')
                tbw.add_scalar('podcasts/eval-loss', bits_per_byte, i * arg.batch_size)
def forward(self, x):
    """
    For every output pixel, select k input pixels at positions produced by `self.hyper`
    and pass the gathered features through `self.unify`.

    With `self.smp` set, integer index tuples are sampled around the predicted means and
    the gathered pixels are combined by a weighted sum; otherwise the floored means are
    used directly as indices.

    :param x: Input batch; unpacked as (b, c, h, w) and required to match `self.in_size`
        in all but the batch dimension.
    :return: `self.unify(features)` with the last dimension moved to position 1.
    """
    assert x.size()[1:] == self.in_size

    b, c, h, w = x.size()
    k = self.k
    s = (h, w)

    # NOTE(review): the indexing below implies means/sigmas have five dimensions and mvalues
    #   four -- presumably (b, h, w, k, 2) and (b, h, w, k); confirm against self.hyper.
    means, sigmas, mvalues = self.hyper(x)

    # This is a bit confusing, but k is the chunk dimension here. This is because the sparse operation
    # only selects in the k separate input pixels, it does not sum/merge them.
    # In other words, we add a separate tuple dimension.
    means = means[:, :, :, :, None, :]
    sigmas = sigmas[:, :, :, :, None, :]
    mvalues = mvalues[:, :, :, :, None]

    if self.smp:
        # sample integer indices and values
        indices = sparse.ngenerate(means, self.gadditional, self.radditional, rng=s, relative_range=self.region, cuda=x.is_cuda)

        # -- number of index tuples per mean
        vs = (4 + self.radditional + self.gadditional)

        assert indices.size() == (b, h, w, k, vs, 2), f'{indices.size()}, {(b, h, w, k, vs, 2)}'
        indices = indices.view(b, h, w, k, vs, 2)
        indfl = indices.float()

        # Mask for duplicate indices
        dups = util.nduplicates(indices).to(torch.bool)

        # compute (unnormalized) densities under the given MVNs (proportions)
        props = sparse.densities(indfl, means, sigmas).clone()  # (b, h, w, k, vs, 1)
        assert props.size() == (b, h, w, k, vs, 1)

        # -- entries flagged by the duplicate mask get zero weight
        props[dups, :] = 0
        props = props / props.sum(dim=4, keepdim=True)  # normalize over all points of a given index tuple

        # weight the values by the proportions
        weights = mvalues[:, :, :, :, None, :].expand_as(props)
        # - add a dim for the MVNs

        weights = props * weights
        weights = weights.sum(dim=5)
        # - sum out the MVNs

        assert indices.size() == (b, h, w, k, vs, 2)
        assert weights.size() == (b, h, w, k, vs)

    else:
        # -- no sampling: a single index tuple per mean, obtained by flooring
        vs = 1
        indices = means.floor().to(torch.long).detach()

    # -- number of index tuples per instance
    l = h * w * k * vs
    indices = indices.view(b * l, 2)

    # -- batch index for every index tuple: each instance index repeated l times
    br = torch.arange(b, device=d(x), dtype=torch.long)[:, None].expand(b, l).contiguous().view(-1)

    # -- gather all channels of x at each (instance, index[0], index[1]) position
    features = x[br, :, indices[:, 0], indices[:, 1]]
    assert features.size() == (b * l, c)

    if self.smp:
        # -- weighted sum over the vs sampled points of each index tuple
        features = features.view(b, h, w, k, vs, c)
        features = features * weights[:, :, :, :, :, None]
        features = features.sum(dim=4)
    else:
        features = features.view(b, h, w, k, c)

    # features now contains the selected input pixels (or weighted sum thereover): k inputs per output pixel
    assert features.size() == (b, h, w, k, c), f'Was {features.size()}, expected {(b, h, w, k, c)}.'

    # -- merge the k selected pixels and their channels into one feature vector
    features = features.view(b, h, w, k * c)

    return self.unify(features).permute(0, 3, 1, 2)  # (b, c_out, h, w)
from util import d

# Sum every x in [1, 10000) that forms a pair with a different number under d, i.e.
# d(d(x)) == x while d(x) != x.
# NOTE(review): presumably d(n) is the sum of the proper divisors of n (which would make
#   these the amicable numbers) -- confirm in util.
ans = 0
for x in range(1, 10000):
    dx = d(x)  # evaluated once per x instead of up to three times
    if dx != x and d(dx) == x:
        ans += x

# -- the parenthesised single-argument form prints identically on Python 2 and 3
print(ans)
def forward(self, triples, depth=2):
    """
    Score a batch of triples after message passing over a sampled subgraph.

    :param triples: Integer tensor whose last dimension has size 3; any leading
        dimensions are treated as batch dimensions.
    :param depth: Number of sample/message-passing rounds applied (0, 1, or 2 and up).
    :return: Tensor of scores with the leading dimensions of `triples`.
    """
    assert triples.size(-1) == 3

    num_rel = self.r

    # -- remember the leading dimensions, then flatten to a plain list of triples
    lead_dims = triples.size()[:-1]
    triples = triples.reshape(-1, 3)

    batch = Batch(triples=triples, graph=self.graph, inv_graph=self.inv_graph)

    # Sample: one sampler per level of depth
    if depth > 0:
        batch = self.sample0(batch)
    if depth > 1:
        batch = self.sample1(batch)

    # Look up the embeddings of all nodes in the batch
    node_index = batch.indices()
    nodes = self.embeddings[flatten(node_index), :]

    if self.dropout is not None:
        nodes = self.dropout(nodes)

    # Message passing
    if depth > 0:
        edge_triples = torch.tensor(list(batch.edges()), device=d(), dtype=torch.long)
        local_triples = torch.tensor(batch.batch_triples(), device=d(), dtype=torch.long)

        # Indices of the adjacency matrix
        # -- the per-relation adjacencies are stacked vertically, so the row index is
        #    offset by (relation * number of batch nodes)
        num_nodes = batch.num_nodes()
        rows = local_triples[:, 0] + num_nodes * local_triples[:, 1]
        cols = local_triples[:, 2]
        indices = torch.stack([rows, cols], dim=1)

        subj_emb = self.embeddings[edge_triples[:, 0], :]
        pred_emb = self.relations[edge_triples[:, 1], :]
        obj_emb = self.embeddings[edge_triples[:, 2], :]

        # Bilinear dot-product score per edge
        # -- computed, but not currently fed into the edge values below
        subj_emb = self.tokeys(subj_emb)
        obj_emb = self.toqueries(obj_emb)
        dots = (subj_emb * pred_emb * obj_emb).sum(dim=1)

        # Uniform edge values, normalized through util.sum_sparse
        values = torch.ones((indices.size(0), ), device=d(), dtype=torch.float)
        # values = (dots).abs()
        values = values / util.sum_sparse(indices, values, (num_rel * num_nodes, num_nodes))
        # values *= ACTIVATION(dots) # F.softplus(dots)

        # -- residual updates
        nodes = nodes + self.rgcn0(nodes, indices, values)

        if depth > 1:
            nodes = nodes + self.rgcn1(nodes, indices, values)

    # -- positions of the target nodes in the list `node_index`
    _, target_pairs = batch.target_indices(node_index)
    subjects = [pair[0] for pair in target_pairs]
    objects = [pair[1] for pair in target_pairs]

    assert len(subjects) == len(objects) == triples.size(0)

    # Gather the embeddings of the target nodes and relations
    try:
        s = nodes[subjects, :]
        o = nodes[objects, :]
        p = self.relations[triples[:, 1], :]
    except Exception as e:
        # -- dump some diagnostics before passing the error on
        print(triples.size())
        print(batch.size())
        print(nodes.size())
        print(len(batch.indices()))
        print(batch.entities)

        raise e

    scores = self.decoder(s, p, o)

    assert scores.size() == (util.prod(lead_dims), )

    return scores.view(*lead_dims)
def forward(self, batch: Batch):
    """
    Extend each instance in the batch with a weighted random sample of its incident edges.

    For every instance the candidate edges are collected (optionally pre-sampled down to
    `self.csample` candidates using `self.globals` as weights), scored with a bilinear dot
    product of the subject, relation and object embeddings, and at most `self.ksample` of
    them are added to the batch.

    :param batch: The batch to extend. It is modified in place.
    :return: The same batch object.
    :raises NotImplementedError: if `self.multi` is set (the multi-process path is disabled).
    """

    # select some candidates.
    if self.multi:
        # -- NotImplementedError is an Exception subclass, so existing handlers still match
        raise NotImplementedError('Multi-process candidate sampling is not implemented.')
        # with Pool(self.cpus_available) as pool:
        #    cflats = pool.starmap(self.inner, [(i, batch) for i in range(batch.size())])

    cflats = []
    for bi in range(batch.size()):
        if self.csample is not None:
            # Sample a list of candidates using the pre-computed scores
            cflat = wrs_gen(batch.gen_inc_edges(bi), weight_function=lambda edge: self.globals[edge], k=self.csample)
        else:
            cflat = list(batch.gen_inc_edges(bi))

        cflats.append(cflat)

    if not cflats:
        # -- empty batch: nothing to sample (max() below would fail on an empty list)
        return batch

    # pad the candidates with zero triples
    lens = [len(x) for x in cflats]
    mx = max(lens)
    cflats = [x + [(0, 0, 0)] * (mx - ln) for x, ln in zip(cflats, lens)]

    with torch.no_grad():

        # - compute the attention weights
        cands = torch.tensor(cflats, device=d(), dtype=torch.long)
        assert cands.size() == (batch.size(), mx, 3)

        # -- index with the tensor itself, exactly like self.nodes. Wrapping it in a list
        #    relied on torch treating a list as a tuple of indices, which is deprecated;
        #    read as a tensor index it would add a leading dimension.
        semb = self.nodes[cands[:, :, 0]]
        pemb = self.relations[cands[:, :, 1]]
        oemb = self.nodes[cands[:, :, 2]]

        # compute the score (bilinear dot product)
        semb = self.tokeys(semb)
        oemb = self.toqueries(oemb)

        dots = (semb * pemb * oemb).sum(dim=2)  # + sb + pb + ob + gb
        # dots = ACTIVATION(dots)

        # -- random keys log(u) / weight; sorting them in descending order and keeping the
        #    first few gives the sample
        u = torch.rand(*dots.size(), device=d(dots))
        weights = u.log() / dots

        weights, indices = torch.sort(weights, dim=1, descending=True)
        indices = indices.tolist()

    # rm any indices that are too high (these point at padding)
    indices = [[i for i in ind if i < ln] for ind, ln in zip(indices, lens)]
    # pick the first k
    indices = [ind[:self.ksample] for ind in indices]

    sampled = [[cflats[i][j] for j in ind] for i, ind in enumerate(indices)]

    for bi, samp in enumerate(sampled):
        batch.add_edges(samp, bi)

    return batch
def forward(self, nodes=None):
    """
    Apply one relational graph-convolution layer over the stored adjacency matrix.

    :param nodes: Input node features of size (n, self.insize). Must be None exactly when
        `self.insize` is None; the input is then taken to be the identity matrix.
    :return: Tensor of size (n, self.outsize): the aggregated messages plus `self.bias`.
    :raises ValueError: if `self.decomp` is not None, 'basis' or 'block'.
    """

    n, r = self.n, self.r

    ## Perform message passing
    assert (nodes is None) == (self.insize is None)

    h0 = n if self.insize is None else self.insize
    h1 = self.outsize

    # - Build the (r, h0, h1) weight tensor from the chosen decomposition
    if self.decomp is None:
        weights = self.weights
    elif self.decomp == 'basis':
        weights = torch.einsum('rb, bij -> rij', self.comps, self.bases)
    elif self.decomp == 'block':
        weights = util.block_diag(self.blocks)
        # TODO: multiply in block form (more efficient, but implementation differs per layer type)
    else:
        # -- previously this fell through to a NameError on `weights`
        raise ValueError(f'Decomposition {self.decomp!r} not recognized.')

    assert weights.size() == (r, h0, h1)

    if self.edo is not None and self.training:
        # apply edge dropout
        # -- the last n entries of self.indices get their own keep probability
        #    NOTE(review): presumably these are the self-loops -- confirm where indices are built.
        p, pid = self.edo

        nt = self.indices.size(0) - n

        mask = torch.bernoulli(torch.empty(size=(nt,), dtype=torch.float, device=d(self.bias)).fill_(1.0 - p))
        maskid = torch.bernoulli(torch.empty(size=(n,), dtype=torch.float, device=d(self.bias)).fill_(1.0 - pid))

        vals = torch.cat([mask, maskid], dim=0)
    else:
        vals = torch.ones(self.indices.size(0), dtype=torch.float, device=d(self.bias))

    # Row- or column normalize the values of the adjacency matrix
    # NOTE(review): with edge dropout a row/column may consist of zeros only, which could
    #   make this a 0/0 division -- confirm how util.sum_sparse behaves in that case.
    vals = vals / util.sum_sparse(self.indices, vals, self.adjsize, row=not self.hor)

    # -- torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor constructor
    adj = torch.sparse_coo_tensor(self.indices.t(), vals, size=self.adjsize)
    if self.bias.is_cuda:
        adj = adj.to('cuda')

    if self.insize is None:
        # -- input is the identity matrix, just multiply the weights by the adjacencies
        out = torch.mm(adj, weights.view(r * h0, h1))

    elif self.hor:
        # -- input is high-dim and output is low dim, multiply h0 x weights first
        nodes = nodes[None, :, :].expand(r, n, h0)
        nw = torch.einsum('rni, rio -> rno', nodes, weights).contiguous()

        out = torch.mm(adj, nw.view(r * n, h1))

    else:
        # -- adj x h0 first, then weights
        out = torch.mm(adj, nodes)  # sparse mm
        out = out.view(r, n, h0)  # new dim for the relations

        out = torch.einsum('rio, rni -> no', weights, out)

    assert out.size() == (n, h1)

    return out + self.bias
def sample_gumbel(shape, eps=1e-20, cuda=False):
    """
    Draw gumbel noise: -log(-log(U + eps) + eps), with U drawn by torch.rand.

    :param shape: Shape of the noise tensor.
    :param eps: Small constant added inside both logarithms.
    :param cuda: Passed to d() to choose the device of the noise tensor.
    :return: The noise, with the given shape.
    """
    uniform = torch.rand(shape, device=d(cuda))

    # -- the double logarithm, one step at a time
    neg_log = -torch.log(uniform + eps)
    log_neg_log = torch.log(neg_log + eps)

    return -Variable(log_neg_log)