def rsample(self, sample_shape=torch.Size()):
    """Draw samples with parallel, batched accept-reject sampling.

    A full batch is proposed with ``self.propose`` and every element is
    accepted with probability ``exp(self.log_prob_accept(x))`` (clamped to
    ``[0, 1]``). Elements that were rejected are re-proposed until every
    element of the batch has been accepted once.

    :param sample_shape: forwarded to ``self.propose``; when it is empty
        (falsy) ``self.propose()`` is called without arguments.
    :return: the tensor returned by the first proposal, with rejected
        entries overwritten in place by later accepted proposals.
    """
    x = self.propose(sample_shape) if sample_shape else self.propose()
    log_prob_accept = self.log_prob_accept(x)
    probs = torch.exp(log_prob_accept).clamp_(0.0, 1.0)
    # Boolean masks: uint8 masks are deprecated for indexing, and `~` on
    # uint8 is a bitwise (not logical) negation.
    done = torch.bernoulli(probs).bool()
    while not done.all():
        proposed_x = self.propose(sample_shape) if sample_shape else self.propose()
        log_prob_accept = self.log_prob_accept(proposed_x)
        prob_accept = torch.exp(log_prob_accept).clamp_(0.0, 1.0)
        # Only elements that have not been accepted yet may be replaced.
        accept = torch.bernoulli(prob_accept).bool() & ~done
        if accept.any():
            x[accept] = proposed_x[accept]
            done |= accept
    return x
def pixelcnn_generate(self, z1, z2):
    """Sample an image pixel by pixel from the autoregressive decoder ``self.p_x``.

    For every spatial position ``(i, j)`` the decoder is re-evaluated on the
    partially generated image and position ``(i, j)`` is filled with a sample:
    a Bernoulli draw for ``'binary'`` inputs, or a logistic draw discretised
    to 1/256 bins for ``'gray'`` / ``'continuous'`` inputs.

    :param z1: latent code passed to ``self.p_x``; ``z1.size(0)`` is the batch size.
    :param z2: second latent code passed to ``self.p_x``.
    :return: the decoder means from the last evaluation, viewed as
        ``(batch, input_size[0], input_size[1], input_size[2])``. Note that this
        is the mean tensor, not the sampled image itself.

    Other values of ``self.args.input_type`` are not handled and leave the
    result undefined (``UnboundLocalError``), as in the original code.
    """
    channels = self.args.input_size[0]
    height = self.args.input_size[1]
    width = self.args.input_size[2]

    # Sampling from PixelCNN
    x_zeros = torch.zeros((z1.size(0), channels, height, width))
    if self.args.cuda:
        x_zeros = x_zeros.cuda()

    # `Variable(..., volatile=True)` no longer disables autograd; no_grad does.
    with torch.no_grad():
        for i in range(height):
            for j in range(width):
                samples_mean, samples_logvar = self.p_x(x_zeros, z1, z2)
                samples_mean = samples_mean.view(samples_mean.size(0), channels, height, width)

                if self.args.input_type == 'binary':
                    probs = samples_mean[:, :, i, j].data
                    x_zeros[:, :, i, j] = torch.bernoulli(probs).float()
                    samples_gen = samples_mean

                elif self.args.input_type == 'gray' or self.args.input_type == 'continuous':
                    binsize = 1. / 256.
                    samples_logvar = samples_logvar.view(samples_mean.size(0), channels, height, width)
                    means = samples_mean[:, :, i, j].data
                    logvar = samples_logvar[:, :, i, j].data
                    # sample from logistic distribution; the noise is created on
                    # the same device as the means so CPU-only runs work too
                    u = torch.rand_like(means)
                    y = torch.log(u) - torch.log(1. - u)
                    sample = means + torch.exp(logvar) * y
                    x_zeros[:, :, i, j] = torch.floor(sample / binsize) * binsize
                    samples_gen = samples_mean

    return samples_gen
def corrupt(self, src, rel, dst):
    """Build one corrupted triple per input triple by replacing either side.

    For each triple a Bernoulli draw with probability ``self.bern_prob[rel]``
    decides which side is replaced: a draw of 1 replaces ``src`` with a random
    entity id and keeps ``dst``; a draw of 0 keeps ``src`` and replaces ``dst``.

    :param src: tensor of source entity ids (converted with ``.numpy()``).
    :param rel: tensor of relation ids used to index ``self.bern_prob``.
    :param dst: tensor of destination entity ids.
    :return: ``(src_out, dst_out)`` tensors created with ``torch.from_numpy``.
    """
    replace_src = torch.bernoulli(self.bern_prob[rel]).numpy().astype('int64')
    keep_src = 1 - replace_src
    # `choice` is resolved from the enclosing module (presumably
    # numpy.random.choice -- confirm against the file's imports).
    random_entities = choice(self.n_ent, len(src))
    src_out = keep_src * src.numpy() + replace_src * random_entities
    dst_out = replace_src * dst.numpy() + keep_src * random_entities
    return torch.from_numpy(src_out), torch.from_numpy(dst_out)
def forward(self, x):
    """Apply a linear map whose weights are scaled by Bernoulli noise.

    In training mode the weights ``self.W`` are multiplied element-wise by
    ``bernoulli(self.probs) - 0.5``; outside training they are multiplied by
    ``0.0``. ``self.bias`` is added when it is not ``None``.

    :param x: input passed to ``F.linear``.
    :return: the (optionally biased) linear output.
    """
    eps = Variable(torch.bernoulli(self.probs) - 0.5) if self.training else 0.0
    projected = F.linear(x, self.W * eps)
    if self.bias is None:
        return projected
    return projected + self.bias
def sample(self, sample_shape=torch.Size()):
    """Sample counts by summing Bernoulli trials.

    ``max(total_count)`` Bernoulli trials (at least one) are drawn for every
    element; trials whose index is at or beyond that element's
    ``total_count`` are zeroed before the trials are summed along the last
    dimension.

    :param sample_shape: forwarded to ``self._extended_shape``.
    :return: the per-element sum of the retained trials (no gradient tracked).
    """
    with torch.no_grad():
        n_trials = max(int(self.total_count.max()), 1)
        trial_shape = self._extended_shape(sample_shape) + (n_trials,)
        trial_probs = self.probs.unsqueeze(-1).expand(trial_shape)
        trials = torch.bernoulli(trial_probs)
        if self.total_count.min() != n_trials:
            # Counts are not all equal to the maximum: mask the surplus trials.
            trial_index = torch.arange(n_trials, out=self.total_count.new_empty(n_trials))
            surplus = trial_index >= self.total_count.unsqueeze(-1)
            trials.masked_fill_(surplus, 0.)
        return trials.sum(dim=-1)
def decode(self, x):
    """Run ``x`` through the second-half layers with multiplicative Bernoulli noise.

    Every layer except the last produces a pre-activation that is multiplied
    by a fresh ``bernoulli(0.5)`` mask of shape ``(1, self.arch_2[i][1])``
    before ``self.act_func`` is applied. The last layer is applied without
    noise or activation.

    :param x: input to the first of ``self.second_half_weights``.
    :return: output of the last layer.
    """
    n_hidden = len(self.second_half_weights) - 1
    for i, layer in zip(range(n_hidden), self.second_half_weights):
        pre_act = layer(x)  # [B,D]
        mask_probs = torch.ones(1, self.arch_2[i][1]) * .5
        mask = Variable(torch.bernoulli(mask_probs).type(self.dtype))
        x = self.act_func(mask * pre_act)
    return self.second_half_weights[-1](x)
def decode(self, src_encodings, src_sents_len, dec_init_vec, tgt_sents_var):
    """
    Run the attentional decoder over the target prefix and stack the per-step scores.

    One step is taken for every target position except the last one, so the
    result has one entry fewer along the first dimension than ``tgt_sents_var``.

    :param src_encodings: Variable(src_sent_len, batch_size, hidden_size * 2)
    :param src_sents_len: list[int]
    :param dec_init_vec: tuple((batch_size, hidden_size))
    :param tgt_sents_var: Variable(tgt_sent_len, batch_size)
    :return: scores: the outputs ``score_t`` of ``self.step`` stacked along a
        new first dimension (the original notes describe this as
        (tgt_sent_len, batch_size, tgt_vocab_size) -- confirm against ``self.step``)
    """
    new_tensor = src_encodings.data.new
    batch_size = src_encodings.size(1)

    # (batch_size, query_len, hidden_size * 2)
    src_encodings = src_encodings.permute(1, 0, 2)
    # (batch_size, query_len, hidden_size)
    src_encodings_att_linear = self.att_src_linear(src_encodings)

    # the attentional vector starts out as zeros
    att_tm1 = Variable(new_tensor(batch_size, self.hidden_size).zero_(), requires_grad=False)

    # (batch_size, src_sent_len)
    src_sent_masks = nn_utils.length_array_to_mask_tensor(src_sents_len, cuda=self.cuda)

    # (tgt_sent_len, batch_size, embed_size)
    tgt_token_embed = self.tgt_embed(tgt_sents_var)

    # inputs run from `<s>` up to y_{T-1}: the final target position is never fed in
    decoder_inputs = tgt_token_embed.split(split_size=1)[:-1]

    state_tm1 = dec_init_vec
    step_scores = []
    for t, y_tm1_embed in enumerate(decoder_inputs):
        # split() keeps the first dim, drop it again
        y_tm1_embed = y_tm1_embed.squeeze(0)

        if t > 0 and self.decoder_word_dropout:
            # word dropout: zero the whole previous-word embedding per batch element
            keep_probs = new_tensor(batch_size).fill_(1 - self.decoder_word_dropout)
            # (batch_size)
            y_tm1_mask = Variable(torch.bernoulli(keep_probs))
            y_tm1_embed = y_tm1_embed * y_tm1_mask.unsqueeze(1)

        # input feeding: concatenate y_tm1 and the previous attentional vector
        x = torch.cat([y_tm1_embed, att_tm1], 1)

        (h_t, cell_t), att_t, score_t = self.step(x, state_tm1,
                                                  src_encodings, src_encodings_att_linear,
                                                  src_sent_masks=src_sent_masks)
        step_scores.append(score_t)

        att_tm1 = att_t
        state_tm1 = (h_t, cell_t)

    return torch.stack(step_scores)
def corrupt(self, src, rel, dst, keep_truth=True):
    """Build ``self.n_sample`` candidate triples per input triple.

    Each input triple is repeated ``self.n_sample`` times. A Bernoulli draw
    with probability ``self.bern_prob[rel]`` picks, per triple, which side is
    overwritten with random entity ids: ``src`` when the draw is 1, ``dst``
    otherwise.

    :param src: tensor of source entity ids.
    :param rel: tensor of relation ids, also used to index ``self.bern_prob``.
    :param dst: tensor of destination entity ids.
    :param keep_truth: when true, column 0 keeps the original triple and only
        columns ``1:`` are corrupted; otherwise every column is corrupted.
    :return: ``(src_out, rel_out, dst_out)``, each with ``n_sample`` columns.
    """
    n = len(src)
    corrupt_src = torch.bernoulli(self.bern_prob[rel]).numpy().astype('bool')
    corrupt_dst = ~corrupt_src

    src_out = np.tile(src.numpy(), (self.n_sample, 1)).transpose()
    dst_out = np.tile(dst.numpy(), (self.n_sample, 1)).transpose()
    rel_out = rel.unsqueeze(1).expand(n, self.n_sample)

    # first column that may be overwritten (column 0 is the true triple when kept)
    first = 1 if keep_truth else 0
    ent_random = choice(self.n_ent, (n, self.n_sample - first))
    src_out[corrupt_src, first:] = ent_random[corrupt_src]
    dst_out[corrupt_dst, first:] = ent_random[corrupt_dst]

    return torch.from_numpy(src_out), rel_out, torch.from_numpy(dst_out)
def draw(self, N):
    '''
    Draw N samples from multinomial

    A table slot ``kk`` is drawn uniformly from ``[0, K)`` for every sample.
    With probability ``self.prob[kk]`` the slot index itself is returned,
    otherwise the alias stored in ``self.alias[kk]`` is returned.

    :param N: number of samples to draw.
    :return: LongTensor of ``N`` sampled indices, on the device of ``self.prob``.
    '''
    K = self.alias.size(0)

    # Draw the slots on the same device as the tables instead of hard-coding
    # CUDA, so the sampler also works on CPU-only machines.
    kk = torch.randint(0, K, (N,), dtype=torch.long, device=self.prob.device)
    prob = self.prob.index_select(0, kk)
    alias = self.alias.index_select(0, kk)
    # b is 1 where the slot itself is kept and 0 where its alias is used
    b = torch.bernoulli(prob)
    oq = kk.mul(b.long())
    oj = alias.mul((1 - b).long())

    return oq + oj
def forward(self, x0, x1, x2, x3):
    """Combine neighbouring inputs through eight blocks with stochastic path dropping.

    While training with ``self.p > 0`` each of the eight block outputs is
    multiplied by its own ``bernoulli(1 - p)`` coefficient. Otherwise every
    sum of block outputs is scaled by ``1 - p``. When ``self.integrate`` is
    set, ``x1``, ``x2`` and ``x3`` are added to the matching output before
    the ReLU.

    :return: ``(x0, relu(out1), relu(out2), relu(out3))``.
    """
    drop_paths = self.p > 0 and self.training
    if drop_paths:
        keep = torch.bernoulli((1.0 - self.p) * torch.ones(8))
        out1 = keep[0] * self.block01(x0) + keep[1] * self.block11(x1) + keep[2] * self.block21(x2)
        out2 = keep[3] * self.block12(x1) + keep[4] * self.block22(x2) + keep[5] * self.block32(x3)
        out3 = keep[6] * self.block23(x2) + keep[7] * self.block33(x3)
    else:
        # deterministic path: scale by the keep probability
        scale = 1 - self.p
        out1 = scale * (self.block01(x0) + self.block11(x1) + self.block21(x2))
        out2 = scale * (self.block12(x1) + self.block22(x2) + self.block32(x3))
        out3 = scale * (self.block23(x2) + self.block33(x3))

    if self.integrate:
        # residual connection to the input at the same position
        out1 += x1
        out2 += x2
        out3 += x3

    activated = (self.relu(out1), self.relu(out2), self.relu(out3))
    return (x0,) + activated
def train_vae(epoch, args, train_loader, model, optimizer):
    """Train ``model`` for one epoch and return the averaged losses.

    :param epoch: current epoch index, used for the KL warm-up coefficient.
    :param args: options object; reads ``warmup``, ``cuda`` and
        ``dynamic_binarization``.
    :param train_loader: iterable of ``(data, target)`` batches with a length.
    :param model: module exposing ``train()`` and
        ``calculate_loss(x, beta, average=True)`` returning ``(loss, RE, KL)``.
    :param optimizer: optimizer stepping the model parameters.
    :return: ``(model, train_loss, train_re, train_kl)`` where the three
        numbers are Python floats averaged over the number of batches and
        ``train_re`` accumulates ``-RE``.
    """
    # set loss to 0
    train_loss = 0
    train_re = 0
    train_kl = 0
    # set model in training mode
    model.train()

    # warm-up: beta grows linearly with the epoch and is capped at 1
    if args.warmup == 0:
        beta = 1.
    else:
        beta = min(1. * epoch / args.warmup, 1.)
    print('beta: {}'.format(beta))

    # the targets are not used by the loss, so they are neither moved nor wrapped
    for data, _ in train_loader:
        if args.cuda:
            data = data.cuda()

        # dynamic binarization
        x = torch.bernoulli(data) if args.dynamic_binarization else data

        # reset gradients
        optimizer.zero_grad()
        # loss evaluation (forward pass)
        loss, RE, KL = model.calculate_loss(x, beta, average=True)
        # backward pass
        loss.backward()
        # optimization
        optimizer.step()

        # `.item()` replaces `.data[0]`, which fails on 0-dim tensors
        train_loss += loss.item()
        train_re += -RE.item()
        train_kl += KL.item()

    # calculate final loss
    n_batches = len(train_loader)
    train_loss /= n_batches  # loss function already averages over batch size
    train_re /= n_batches  # re already averages over batch size
    train_kl /= n_batches  # kl already averages over batch size

    return model, train_loss, train_re, train_kl
def net(self, x_input):
    """Two-layer forward pass with a random binary mask on the hidden activations.

    The hidden activations ``a1(l1(x_input))`` are multiplied by a single
    ``bernoulli(0.5)`` mask of shape ``(1, 50)`` before ``l2`` is applied.
    The mask is created with ``output.data.new`` and has one row, so it is
    broadcast against the activations.

    :param x_input: input to ``self.l1``.
    :return: output of ``self.l2``.
    """
    hidden = self.a1(self.l1(x_input))
    mask_probs = hidden.data.new(1, 50).fill_(0.5)
    mask = Variable(torch.bernoulli(mask_probs))
    return self.l2(hidden * mask)
def sample(self):
    """Return one Bernoulli draw per entry of ``self.probs``."""
    success_probs = self.probs
    return torch.bernoulli(success_probs)
def forward_inner(self, input, means, sigmas, values, bias, train=True):
    """Multiply ``input`` by a sparse weight tensor described by index means/sigmas and values.

    The continuous index parameters are turned into integer indices (by
    ``self.discretize``, by sampling from a Normal in the reinforce case, or
    by rounding when not training), flattened to matrix indices, and applied
    to the flattened input through ``util.sparsemult``.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source; confirm it against the original file.

    :param input: batch of inputs; dimension 0 is the batch dimension.
    :param means: real-valued index means; unpacked as ``(b, k, r)`` in the
        subsampling branch.
    :param sigmas: index standard deviations, same leading layout as ``means``.
    :param values: values attached to the index tuples.
    :param bias: bias added to the output when ``self.bias_type == Bias.DENSE``.
    :param train: selects the training-time discretisation; when False the
        means are simply rounded.
    :return: ``y`` reshaped to ``[batchsize] + self.out_size``; in the
        reinforce training case the tuple ``(y, dists, samples)``.
    """
    # timer, not read again in this function
    t0total = time.time()

    # per-dimension size of the weight tensor: output dims followed by input dims
    rng = tuple(self.out_size) + tuple(input.size()[1:])

    batchsize = input.size()[0]

    # NB: due to batching, real_indices has shape batchsize x K x rank(W)
    #     real_values has shape batchsize x K

    # print('--------------------------------')
    # for i in range(util.prod(sigmas.size())):
    #     print(sigmas.view(-1)[i].data[0])

    # turn the real values into integers in a differentiable way
    t0 = time.time()

    if train:
        if not self.reinforce:
            if self.subsample is None:
                indices, props, values = self.discretize(
                    means, sigmas, values, rng=rng, additional=self.additional,
                    use_cuda=self.use_cuda, relative_range=self.relative_range)

                values = values * props
            else:
                # select a small proportion of the indices to learn over
                b, k, r = means.size()

                prop = torch.cuda.FloatTensor([self.subsample]) if self.use_cuda else torch.FloatTensor([self.subsample])

                # redraw until at least one of the k index tuples is selected
                selection = None
                while (selection is None) or (float(selection.sum()) < 1):
                    selection = torch.bernoulli(prop.expand(k)).byte()

                # broadcast the per-tuple selection to the shapes of means/sigmas/values
                mselection = selection.unsqueeze(0).unsqueeze(2).expand_as(means)
                sselection = selection.unsqueeze(0).unsqueeze(2).expand_as(sigmas)
                vselection = selection.unsqueeze(0).expand_as(values)

                means_in, means_out = means[mselection].view(b, -1, r), means[~mselection].view(b, -1, r)
                sigmas_in, sigmas_out = sigmas[sselection].view(b, -1, r), sigmas[~sselection].view(b, -1, r)
                values_in, values_out = values[vselection].view(b, -1), values[~vselection].view(b, -1)

                # the unselected tuples do not receive gradients
                means_out = means_out.detach()
                values_out = values_out.detach()

                indices_in, props, values_in = self.discretize(
                    means_in, sigmas_in, values_in, rng=rng, additional=self.additional,
                    use_cuda=self.use_cuda)
                values_in = values_in * props

                # unselected tuples just use their rounded means
                indices_out = means_out.data.round().long()

                indices = torch.cat([indices_in, indices_out], dim=1)
                values = torch.cat([values_in, values_out], dim=1)
        else:
            # reinforce approach
            dists = torch.distributions.Normal(means, sigmas)
            samples = dists.sample()

            indices = samples.data.round().long()

            # if the sampling puts the indices out of bounds, we just clip to the min and max values
            indices[indices < 0] = 0

            rngt = torch.tensor(data=rng, device='cuda' if self.use_cuda else 'cpu')

            maxes = rngt.unsqueeze(0).unsqueeze(0).expand_as(means) - 1
            indices[indices > maxes] = maxes[indices > maxes]
    else:
        # not train, just use the nearest indices
        indices = means.round().long()

    if self.use_cuda:
        indices = indices.cuda()

    # # Create bias for permutation matrices
    # TAU = 1
    # if SINKHORN_ITS is not None:
    #     values = values / TAU
    #     for _ in range(SINKHORN_ITS):
    #         values = util.normalize(indices, values, rng, row=True)
    #         values = util.normalize(indices, values, rng, row=False)

    # translate tensor indices to matrix indices
    t0 = time.time()

    # mindices, flat_size = flatten_indices(indices, input.size()[1:], self.out_shape, self.use_cuda)
    mindices, flat_size = flatten_indices_mat(indices, input.size()[1:], self.out_size)

    # NB: mindices is not an autograd Variable. The error-signal for the indices passes to the hypernetwork
    #     through 'values', which are a function of both the real_indices and the real_values.

    ### Create the sparse weight tensor

    # -- Turns out we don't have autograd over sparse tensors yet (let alone over the constructor arguments). For
    #    now, we'll do a slow, naive multiplication.

    x_flat = input.view(batchsize, -1)
    ly = prod(self.out_size)

    # NOTE(review): this buffer is overwritten by the sparsemult result below
    y_flat = torch.cuda.FloatTensor(batchsize, ly) if self.use_cuda else FloatTensor(batchsize, ly)
    y_flat.fill_(0.0)

    sparsemult = util.sparsemult(self.use_cuda)

    t0 = time.time()

    # Prevent segfault
    assert not util.contains_nan(values.data)

    # offsets added to the matrix indices so the whole batch is handled in a
    # single multiplication (see self.bmult)
    bm = self.bmult(flat_size[1], flat_size[0], mindices.size()[1], batchsize, self.use_cuda)
    bfsize = Variable(flat_size * batchsize)

    bfindices = mindices + bm
    bfindices = bfindices.view(1, -1, 2).squeeze(0)
    vindices = Variable(bfindices.t())

    bfvalues = values.view(1, -1).squeeze(0)
    bfx = x_flat.view(1, -1).squeeze(0)

    # print(vindices.size(), bfvalues.size(), bfsize, bfx.size())
    bfy = sparsemult(vindices, bfvalues, bfsize, bfx)

    y_flat = bfy.unsqueeze(0).view(batchsize, -1)

    y_shape = [batchsize]
    y_shape.extend(self.out_size)

    y = y_flat.view(y_shape)  # reshape y into a tensor

    ### Handle the bias
    if self.bias_type == Bias.DENSE:
        y = y + bias
    if self.bias_type == Bias.SPARSE:  # untested!
        pass

    if self.reinforce and train:
        return y, dists, samples
    else:
        return y
def sample_v(self, y):
    """Compute activation probabilities from ``y`` and draw a binary sample.

    :param y: 2-D input multiplied with ``self.W`` via ``torch.mm``.
    :return: ``(p_v_given_h, sample)`` where ``p_v_given_h`` is
        ``sigmoid(y @ W + b)`` and ``sample`` is a Bernoulli draw from it.
    """
    projected = torch.mm(y, self.W)
    bias = self.b.expand_as(projected)
    p_v_given_h = torch.sigmoid(projected + bias)
    visible_sample = torch.bernoulli(p_v_given_h)
    return p_v_given_h, visible_sample
def forward(self, x):
    """Run the stacked (optionally bidirectional) vanilla RNN over ``x``.

    ``x`` is indexed as ``(time, batch, features)``: dimension 0 is iterated
    step by step and dimension 1 sizes the hidden state. For bidirectional
    layers the time-flipped sequence is appended along the batch dimension
    and the two halves are concatenated along the feature dimension again
    after the recurrence.

    :param x: input sequence tensor.
    :return: hidden states of the last layer, stacked over time.
    """
    # Optional layer / batch normalisation of the input
    if bool(self.rnn_use_laynorm_inp):
        x = self.ln0((x))

    if bool(self.rnn_use_batchnorm_inp):
        flat_bn = self.bn0(x.view(x.shape[0] * x.shape[1], x.shape[2]))
        x = flat_bn.view(x.shape[0], x.shape[1], x.shape[2])

    for lay in range(self.N_rnn_lay):
        hidden_dim = self.rnn_lay[lay]

        # Initial state; bidirectional layers process both directions as one batch
        if self.bidir:
            h_init = torch.zeros(2 * x.shape[1], hidden_dim)
            x = torch.cat([x, flip(x, 0)], 1)
        else:
            h_init = torch.zeros(x.shape[1], hidden_dim)

        # Dropout mask, shared by all time steps; at test time the keep
        # probability is used as a scale factor instead
        keep_prob = 1 - self.rnn_drop[lay]
        if self.test_flag == False:
            drop_mask = torch.bernoulli(torch.Tensor(h_init.shape[0], h_init.shape[1]).fill_(keep_prob))
        else:
            drop_mask = torch.FloatTensor([keep_prob])

        if self.use_cuda:
            h_init = h_init.cuda()
            drop_mask = drop_mask.cuda()

        # Feed-forward affine transformation for all time steps at once
        wh_out = self.wh[lay](x)

        # Optional batch norm, also applied to all steps in parallel
        if self.rnn_use_batchnorm[lay]:
            flat_wh = wh_out.view(wh_out.shape[0] * wh_out.shape[1], wh_out.shape[2])
            wh_out = self.bn_wh[lay](flat_wh).view(wh_out.shape[0], wh_out.shape[1], wh_out.shape[2])

        # Recurrence over time
        states = []
        ht = h_init
        for step in range(x.shape[0]):
            pre_act = wh_out[step] + self.uh[lay](ht)
            ht = self.act[lay](pre_act) * drop_mask
            if self.rnn_use_laynorm[lay]:
                ht = self.ln[lay](ht)
            states.append(ht)

        h = torch.stack(states)

        # Split the doubled batch back into forward / backward directions
        if self.bidir:
            half = int(x.shape[1] / 2)
            h_f = h[:, 0:half]
            h_b = flip(h[:, half:x.shape[1]].contiguous(), 0)
            h = torch.cat([h_f, h_b], 2)

        # Output of this layer feeds the next one
        x = h

    return x
def update_noise(self, x):
    """Resample the Bernoulli noise mask to match the shape of ``x``.

    Stores the keep probability ``1 - self.rate`` in ``self.p`` and replaces
    ``self.noise.data`` with a Bernoulli draw of shape ``x.shape``.
    ``self.rate`` must support ``expand`` after the subtraction (a tensor).

    :param x: tensor whose shape the new noise mask takes.
    """
    self.p = 1 - self.rate
    mask_probs = self.p.expand(x.shape)
    self.noise.data = torch.bernoulli(mask_probs)
def train_both_networks(num_epochs, dataloader, netD, netG, d_labelSmooth, outputDir,
                        model_option=1, binary=False, epoch_interval=1):
    """Alternately train the discriminator ``netD`` and the generator ``netG``.

    Besides its parameters the function relies on names defined at module
    level: ``input``, ``label``, ``noise``, ``fixed_noise``, ``bernoulli_prob``,
    ``nz``, ``real_label``, ``fake_label``, ``criterion``, ``criterion_MSE``,
    ``optimizerD``, ``optimizerG`` and ``vutils``.

    NOTE(review): loop nesting was reconstructed from a whitespace-collapsed
    source; checkpointing is assumed to happen once per epoch -- confirm.

    :param num_epochs: number of passes over ``dataloader``.
    :param dataloader: iterable of ``(images, _)`` batches.
    :param netD: discriminator network.
    :param netG: generator; returns ``(fake_images, z_prediction)``.
    :param d_labelSmooth: amount subtracted from the real label for D.
    :param outputDir: directory for sample images and checkpoints.
    :param model_option: ``2`` additionally trains the z predictor with an MSE loss.
    :param binary: draw the noise from {-1, 1} instead of a standard normal.
    :param epoch_interval: save checkpoints every this many epochs.
    """
    for epoch in range(num_epochs):
        for i, data in enumerate(dataloader, 0):
            start_iter = time.time()
            ############################
            # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
            # 1A - Train the detective network in the Real Dataset
            ###########################
            # train with real
            netD.zero_grad()
            real_cpu, _ = data
            batch_size = real_cpu.size(0)
            input.data.resize_(real_cpu.size()).copy_(real_cpu)
            label.data.resize_(batch_size).fill_(real_label - d_labelSmooth)  # use smooth label for discriminator

            output = netD(input)
            errD_real = criterion(output, label)
            errD_real.backward()

            #######################################################
            # 1B - Train the detective network in the False Dataset
            #######################################################
            D_x = output.data.mean()

            # train with fake
            noise.data.resize_(batch_size, nz, 1, 1)
            if binary:
                bernoulli_prob.resize_(noise.data.size())
                noise.data.copy_(2 * (torch.bernoulli(bernoulli_prob) - 0.5))
            else:
                noise.data.normal_(0, 1)
            fake, z_prediction = netG(noise)
            label.data.fill_(fake_label)
            output = netD(fake.detach())  # add ".detach()" to avoid backprop through G
            errD_fake = criterion(output, label)
            errD_fake.backward()  # gradients for fake/real will be accumulated
            D_G_z1 = output.data.mean()
            errD = errD_real + errD_fake
            optimizerD.step()  # .step() can be called once the gradients are computed

            #######################################################
            # (2) Update G network: maximize log(D(G(z)))
            # Train the faker with the output from the Detective (but don't train the Detective)
            #######################################################
            netG.zero_grad()
            label.data.fill_(real_label)  # fake labels are real for generator cost
            output = netD(fake)
            errG = criterion(output, label)
            # The graph is kept because the z-predictor loss below may
            # backpropagate through it a second time. `retain_graph` is the
            # current name of the removed `retain_variables` keyword.
            errG.backward(retain_graph=True)
            if model_option == 2:  # with z predictor
                errG_z = criterion_MSE(z_prediction, noise)
                errG_z.backward()
            D_G_z2 = output.data.mean()
            optimizerG.step()

            end_iter = time.time()

            # Print the info (`.item()` replaces `.data[0]`, which fails on 0-dim tensors)
            print('[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f Elapsed %.2f s'
                  % (epoch, num_epochs, i, len(dataloader),
                     errD.item(), errG.item(), D_x, D_G_z1, D_G_z2, end_iter - start_iter))

            # Save a grid with the pictures from the dataset, up until 64
            if i % 100 == 0:
                # the first 64 samples from the mini-batch are saved.
                vutils.save_image(real_cpu[0:64, :, :, :],
                                  '%s/real_samples.png' % outputDir, nrow=8)
                fake, _ = netG(fixed_noise)
                vutils.save_image(fake.data[0:64, :, :, :],
                                  '%s/fake_samples_epoch_%03d.png' % (outputDir, epoch), nrow=8)

        if epoch % epoch_interval == 0:
            # do checkpointing
            torch.save(netG.state_dict(), '%s/netG_epoch_%d.pth' % (outputDir, epoch))
            torch.save(netD.state_dict(), '%s/netD_epoch_%d.pth' % (outputDir, epoch))
def train_both_networks(num_epochs, dataloader, netD, netG, d_labelSmooth, outputDir,
                        model_option=1, binary=False, epoch_interval=1):
    """Alternately train the Detective (discriminator) and the Faker (generator).

    Besides its parameters the function relies on names defined at module
    level: ``input``, ``label``, ``noise``, ``fixed_noise``, ``bernoulli_prob``,
    ``nz``, ``real_label``, ``fake_label``, ``criterion``, ``criterion_MSE``,
    ``optimizerD``, ``optimizerG`` and ``vutils``.

    NOTE(review): loop nesting was reconstructed from a whitespace-collapsed
    source; checkpointing is assumed to happen once per epoch -- confirm.

    :param num_epochs: number of passes over ``dataloader``.
    :param dataloader: iterable of ``(images, _)`` batches.
    :param netD: discriminator ("Detective") network.
    :param netG: generator ("Faker"); returns ``(fake_images, z_prediction)``.
    :param d_labelSmooth: amount subtracted from the real label for D.
    :param outputDir: directory for sample images and checkpoints.
    :param model_option: ``2`` additionally trains the z predictor with an MSE loss.
    :param binary: draw the noise from {-1, 1} instead of a standard normal.
    :param epoch_interval: save checkpoints every this many epochs.
    """
    for epoch in range(num_epochs):
        for i, data in enumerate(dataloader, 0):
            start_iter = time.time()
            ############################
            # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
            # 1A - Train the detective network in the Real Dataset
            ###########################
            # train with real
            netD.zero_grad()  # zero the gradients
            real_cpu, _ = data  # get the batch of images
            batch_size = real_cpu.size(0)  # batch size is read from the data on the fly
            # copy the image batch into the preallocated `input` tensor
            input.data.resize_(real_cpu.size()).copy_(real_cpu)
            # resize the label tensor to the batch size and fill it with
            # real_label - d_labelSmooth (smooth label for the discriminator)
            label.data.resize_(batch_size).fill_(real_label - d_labelSmooth)

            output = netD(input)  # forward pass of the Detective on real images
            errD_real = criterion(output, label)  # Detective loss on real images
            errD_real.backward()  # backpropagate the Detective loss on real images

            #######################################################
            # 1B - Train the detective network in the False Dataset
            #######################################################
            # mean Detective output on the real batch (reported as D(x))
            D_x = output.data.mean()

            # train with fake
            # noise of shape (batch_size, nz, 1, 1); the trailing singleton
            # dimensions keep the tensor shapes compatible with the generator
            noise.data.resize_(batch_size, nz, 1, 1)
            if binary:
                # binary noise in {-1, 1}
                bernoulli_prob.resize_(noise.data.size())
                noise.data.copy_(2 * (torch.bernoulli(bernoulli_prob) - 0.5))
            else:
                noise.data.normal_(0, 1)
            fake, z_prediction = netG(noise)  # generate fake images
            label.data.fill_(fake_label)  # fake label for the Detective
            output = netD(fake.detach())  # ".detach()" avoids backprop through G
            errD_fake = criterion(output, label)  # Detective loss on fake images
            errD_fake.backward()  # gradients for fake/real will be accumulated
            # mean Detective output on fakes before the Faker update (D(G(z)), first value)
            D_G_z1 = output.data.mean()
            errD = errD_real + errD_fake  # total Detective loss
            optimizerD.step()  # .step() can be called once the gradients are computed

            #######################################################
            # (2) Update G network: maximize log(D(G(z)))
            # Train the faker with the output from the Detective (but don't train the Detective)
            #######################################################
            netG.zero_grad()  # zero the gradients of the Faker
            # fake labels are real for the generator cost: the Faker wants its
            # images to be classified as real
            label.data.fill_(real_label)
            output = netD(fake)  # Detective evaluates the fake data
            errG = criterion(output, label)  # Faker loss against the real label
            # The graph is kept because the z-predictor loss below may
            # backpropagate through it a second time.
            errG.backward(retain_graph=True)
            # optionally also backpropagate the error of the noise predictor
            if model_option == 2:  # with z predictor
                errG_z = criterion_MSE(z_prediction, noise)
                errG_z.backward()
            # mean Detective output on fakes, evaluated for the Faker update
            # (D(G(z)), second value)
            D_G_z2 = output.data.mean()
            optimizerG.step()  # apply the Faker update

            end_iter = time.time()

            # Print the info (`.item()` replaces `.data[0]`, which fails on 0-dim tensors)
            print('[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f Elapsed %.2f s'
                  % (epoch, num_epochs, i, len(dataloader),
                     errD.item(), errG.item(), D_x, D_G_z1, D_G_z2, end_iter - start_iter))

            # Save a grid with the pictures from the dataset, up until 64
            if i % 100 == 0:
                # the first 64 samples from the mini-batch are saved.
                vutils.save_image(real_cpu[0:64, :, :, :],
                                  '%s/real_samples.png' % outputDir, nrow=8)
                fake, _ = netG(fixed_noise)
                vutils.save_image(fake.data[0:64, :, :, :],
                                  '%s/fake_samples_epoch_%03d.png' % (outputDir, epoch), nrow=8)

        if epoch % epoch_interval == 0:
            # do checkpointing
            torch.save(netG.state_dict(), '%s/netG_epoch_%d.pth' % (outputDir, epoch))
            torch.save(netD.state_dict(), '%s/netD_epoch_%d.pth' % (outputDir, epoch))
def train(epoch, train_loader, model, opt, args, logger, nfef_meter=None, nfeb_meter=None):
    """Train ``model`` for one epoch and log progress.

    :param epoch: current epoch, used for the warm-up coefficient and passed to the model.
    :param train_loader: iterable of ``(data, _)`` batches with a length and a ``sampler``.
    :param model: called as ``model(data, is_eval=False, epoch=epoch)``.
    :param opt: optimizer.
    :param args: options; reads ``warmup``, ``max_beta``, ``cuda``,
        ``dynamic_binarization``, ``input_size``, ``flow``, ``log_interval``
        and ``input_type``.
    :param logger: logger receiving the progress messages.
    :param nfef_meter: meter updated with forward function evaluations (cnf flows only).
    :param nfeb_meter: meter updated with backward function evaluations (cnf flows only).
    :return: the per-batch loss array; for cnf flows the tuple
        ``(train_loss, nfef_meter, nfeb_meter)``.
    """
    model.train()

    n_batches = len(train_loader)
    train_loss = np.zeros(n_batches)
    train_bpd = np.zeros(n_batches)
    num_data = 0

    uses_cnf = 'cnf' in args.flow
    binary_input = args.input_type == 'binary'

    # set warmup coefficient
    beta = min((epoch * 1.) / max(args.warmup, 1.), args.max_beta)
    logger.info('beta = {:5.4f}'.format(beta))

    end = time.time()
    for batch_idx, (data, _) in enumerate(train_loader):
        if args.cuda:
            data = data.cuda()
        if args.dynamic_binarization:
            data = torch.bernoulli(data)
        data = data.view(-1, *args.input_size)

        opt.zero_grad()
        x_mean, z_mu, z_var, ldj, z0, zk = model(data, is_eval=False, epoch=epoch)
        if uses_cnf:
            # evaluations spent in the forward pass
            f_nfe = count_nfe(model)

        loss, rec, kl, bpd = calculate_loss(x_mean, data, z_mu, z_var, z0, zk, ldj, args, beta=beta)
        loss.backward()

        if uses_cnf:
            # evaluations spent in the backward pass = total - forward
            t_nfe = count_nfe(model)
            b_nfe = t_nfe - f_nfe
            nfef_meter.update(f_nfe)
            nfeb_meter.update(b_nfe)

        train_loss[batch_idx] = loss.item()
        train_bpd[batch_idx] = bpd

        opt.step()

        rec = rec.item()
        kl = kl.item()
        num_data += len(data)

        batch_time = time.time() - end
        end = time.time()

        if batch_idx % args.log_interval != 0:
            continue

        perc = 100. * batch_idx / n_batches
        if binary_input:
            log_msg = (
                'Epoch {:3d} [{:5d}/{:5d} ({:2.0f}%)] | Time {:.3f} | Loss {:11.6f} | '
                'Rec {:11.6f} | KL {:11.6f}'.format(
                    epoch, num_data, len(train_loader.sampler), perc, batch_time, loss.item(), rec, kl))
        else:
            head = 'Epoch {:3d} [{:5d}/{:5d} ({:2.0f}%)] | Time {:.3f} | Loss {:11.6f} | Bits/dim {:8.6f}'.format(
                epoch, num_data, len(train_loader.sampler), perc, batch_time, loss.item(), bpd)
            tail = '\trec: {:11.3f}\tkl: {:11.6f}'.format(rec, kl)
            log_msg = head + tail
        if uses_cnf:
            log_msg += ' | NFE Forward {:.0f}({:.1f}) | NFE Backward {:.0f}({:.1f})'.format(
                f_nfe, nfef_meter.avg, b_nfe, nfeb_meter.avg)
        logger.info(log_msg)

    mean_loss = train_loss.sum() / n_batches
    if binary_input:
        logger.info('====> Epoch: {:3d} Average train loss: {:.4f}'.format(epoch, mean_loss))
    else:
        logger.info('====> Epoch: {:3d} Average train loss: {:.4f}, average bpd: {:.4f}'.format(
            epoch, mean_loss, train_bpd.sum() / n_batches))

    if uses_cnf:
        return train_loss, nfef_meter, nfeb_meter
    return train_loss
def test_basic_block():
    """Check layer composition and output sizes of ``BasicBlock`` variants.

    Each variant is built with and without upsampling, run on a random
    ``(1, 3, 64, 64)`` input (plus a random binary mask for the
    partial-convolution variants) and checked for the layers it
    contains and the size of its outputs.

    The size checks on the output and on the returned mask are separate
    statements: chaining them with ``and`` would skip the mask check
    whenever the first helper call returns a falsy value such as ``None``.
    """
    block = BasicBlock
    x = Variable(torch.randn(1, 3, 64, 64))
    mask = Variable(torch.bernoulli(torch.rand(1, 1, 64, 64)))
    # mask at the upsampled resolution, used by the upsampling variants
    mask2 = Variable(torch.bernoulli(torch.rand(1, 1, 128, 128)))

    # without upsample
    cfg = {
        'in_channels': 3,
        'out_channels': 4,
        'kernel_size': 3,
        'stride': 2,
        'padding': 1
    }

    # conv-bn-relu
    print('conv-bn-relu')
    b1 = block(False, False, False, **cfg)
    out1 = b1(x)
    assert_block_contains(b1, ['Conv2d', 'BatchNorm2d', 'ReLU'])
    assert_block_not_contains(b1, ['Upsample'])
    assert_size_match(out1.size(), [1, 4, 32, 32])

    # conv-relu
    print('conv-relu')
    b2 = block(False, False, True, **cfg)
    out2 = b2(x)
    assert_block_contains(b2, ['Conv2d', 'ReLU'])
    assert_block_not_contains(b2, ['BatchNorm2d', 'Upsample'])
    assert_size_match(out2.size(), [1, 4, 32, 32])

    # pconv-in-lrelu
    print('pconv-in-lrelu')
    b3 = block(False, True, False, **cfg,
               norm=nn.InstanceNorm2d, activation=nn.LeakyReLU(0.2))
    out3, _mask_ = b3(x, mask)
    assert_block_contains(b3, ['PartialConv2d', 'InstanceNorm2d', 'LeakyReLU'])
    assert_block_not_contains(b3, ['BatchNorm2d', 'ReLU', 'Upsample'])
    assert_size_match(out3.size(), [1, 4, 32, 32])
    assert_size_match(_mask_.size(), [1, 1, 32, 32])

    # pconv-sigmoid
    print('pconv-sigmoid')
    b4 = block(False, True, True, **cfg, activation=nn.Sigmoid())
    out4, _mask_ = b4(x, mask)
    assert_block_contains(b4, ['PartialConv2d', 'Sigmoid'])
    assert_block_not_contains(b4, ['BatchNorm2d', 'ReLU', 'Upsample'])
    assert_size_match(out4.size(), [1, 4, 32, 32])
    assert_size_match(_mask_.size(), [1, 1, 32, 32])

    # with upsample
    cfg = {
        'in_channels': 3,
        'out_channels': 4,
        'kernel_size': 3,
        'stride': 1,
        'padding': 1
    }

    # conv-bn-relu
    print('upsample + conv-bn-relu')
    b1 = block(True, False, False, **cfg)
    out1 = b1(x)
    assert_block_contains(b1, ['Conv2d', 'BatchNorm2d', 'ReLU', 'Upsample'])
    assert_size_match(out1.size(), [1, 4, 128, 128])

    # conv-relu
    print('upsample + conv-relu')
    b2 = block(True, False, True, **cfg)
    out2 = b2(x)
    assert_block_contains(b2, ['Conv2d', 'ReLU', 'Upsample'])
    assert_block_not_contains(b2, ['BatchNorm2d'])
    assert_size_match(out2.size(), [1, 4, 128, 128])

    # pconv-in-lrelu
    print('upsample + pconv-in-lrelu')
    b3 = block(True, True, False, **cfg,
               norm=nn.InstanceNorm2d, activation=nn.LeakyReLU(0.2))
    out3, _mask_ = b3(x, mask2)
    assert_block_contains(
        b3, ['PartialConv2d', 'InstanceNorm2d', 'LeakyReLU', 'Upsample'])
    assert_block_not_contains(b3, ['BatchNorm2d', 'ReLU'])
    assert_size_match(out3.size(), [1, 4, 128, 128])
    assert_size_match(_mask_.size(), [1, 1, 128, 128])

    # pconv-sigmoid
    print('upsample + pconv-sigmoid')
    b4 = block(True, True, True, **cfg, activation=nn.Sigmoid())
    out4, _mask_ = b4(x, mask2)
    assert_block_contains(b4, ['PartialConv2d', 'Sigmoid', 'Upsample'])
    assert_block_not_contains(b4, ['BatchNorm2d', 'ReLU'])
    assert_size_match(out4.size(), [1, 4, 128, 128])
    assert_size_match(_mask_.size(), [1, 1, 128, 128])
def main(args):
    """Train, evaluate or sample from a VAE on dynamically binarised image data.

    Depending on ``args`` the function either (a) loads a checkpoint and
    saves sample images (``args.sample_from``), (b) loads a checkpoint and
    evaluates it on the test set (``args.eval``), or (c) trains the model,
    optionally with extra encoder-only updates per batch while
    ``aggressive_flag`` is set, and finally evaluates the best checkpoint.

    Relies on module-level names not defined here, among them ``clip_grad``,
    ``decay_epoch``, ``lr_decay``, ``max_decay``, ``make_savepath``, ``seed``,
    ``test``, ``calc_au``, ``calc_mi`` and ``calc_iwnll``.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source; confirm it against the original file
    (in particular whether ``seed(args)`` is conditional).

    :param args: options namespace; ``args.device`` is set as a side effect.
    """
    if args.save_path == '':
        make_savepath(args)
    seed(args)

    if args.cuda:
        print('using cuda')
    print(args)

    device = torch.device("cuda" if args.cuda else "cpu")
    args.device = device

    # learning-rate decay bookkeeping
    opt_dict = {"not_improved": 0, "lr": 1., "best_loss": 1e4}

    all_data = torch.load(args.data_file)
    x_train, x_val, x_test = all_data

    x_train = x_train.to(device)
    x_val = x_val.to(device)
    x_test = x_test.to(device)

    # all-zero placeholder labels so the splits can be wrapped in TensorDataset
    y_size = 1
    y_train = x_train.new_zeros(x_train.size(0), y_size)
    y_val = x_train.new_zeros(x_val.size(0), y_size)
    y_test = x_train.new_zeros(x_test.size(0), y_size)
    print(torch.__version__)

    train_data = torch.utils.data.TensorDataset(x_train, y_train)
    val_data = torch.utils.data.TensorDataset(x_val, y_val)
    test_data = torch.utils.data.TensorDataset(x_test, y_test)

    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_data, batch_size=args.batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=args.batch_size, shuffle=True)
    print('Train data: %d batches' % len(train_loader))
    print('Val data: %d batches' % len(val_loader))
    print('Test data: %d batches' % len(test_loader))
    sys.stdout.flush()

    # log roughly five times per epoch
    log_niter = len(train_loader) // 5

    encoder = ResNetEncoderV2(args)
    decoder = PixelCNNDecoderV2(args)
    vae = VAE(encoder, decoder, args).to(device)

    # --- sampling-only mode -------------------------------------------------
    if args.sample_from != '':
        save_dir = "samples/%s" % args.dataset
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        vae.load_state_dict(torch.load(args.sample_from))
        vae.eval()
        with torch.no_grad():
            sample_z = vae.sample_from_prior(400).to(device)
            sample_x, sample_probs = vae.decoder.decode(sample_z, False)
        # file names are derived from the checkpoint name minus its last three characters
        image_file = 'sample_binary_from_%s.png' % (
            args.sample_from.split('/')[-1][:-3])
        save_image(sample_x.data.cpu(), os.path.join(save_dir, image_file), nrow=20)
        image_file = 'sample_cont_from_%s.png' % (
            args.sample_from.split('/')[-1][:-3])
        save_image(sample_probs.data.cpu(), os.path.join(save_dir, image_file), nrow=20)
        return

    # --- evaluation-only mode -----------------------------------------------
    if args.eval:
        print('begin evaluation')
        test_loader = torch.utils.data.DataLoader(test_data, batch_size=50, shuffle=True)
        vae.load_state_dict(torch.load(args.load_path))
        vae.eval()
        with torch.no_grad():
            test(vae, test_loader, "TEST", args)
            au, au_var = calc_au(vae, test_loader)
            print("%d active units" % au)
            # print(au_var)
            calc_iwnll(vae, test_loader, args)
        return

    # --- training -----------------------------------------------------------
    # separate optimizers so the encoder can be updated on its own
    enc_optimizer = optim.Adam(vae.encoder.parameters(), lr=0.001)
    dec_optimizer = optim.Adam(vae.decoder.parameters(), lr=0.001)
    opt_dict['lr'] = 0.001

    iter_ = 0
    best_loss = 1e4
    best_kl = best_nll = best_ppl = 0
    decay_cnt = pre_mi = best_mi = mi_not_improved = 0
    aggressive_flag = True if args.aggressive else False
    vae.train()
    start = time.time()

    # KL weight is annealed linearly from kl_start to 1.0 over warm_up epochs
    kl_weight = args.kl_start
    anneal_rate = (1.0 - args.kl_start) / (args.warm_up * len(train_loader))

    for epoch in range(args.epochs):
        report_kl_loss = report_rec_loss = 0
        report_num_examples = 0
        for datum in train_loader:
            batch_data, _ = datum
            # dynamic binarisation of the batch
            batch_data = torch.bernoulli(batch_data)
            batch_size = batch_data.size(0)

            report_num_examples += batch_size

            # kl_weight = 1.0
            kl_weight = min(1.0, kl_weight + anneal_rate)

            sub_iter = 1
            batch_data_enc = batch_data
            burn_num_examples = 0
            burn_pre_loss = 1e4
            burn_cur_loss = 0
            # encoder-only updates on freshly sampled batches; every 10
            # sub-iterations the averaged loss is compared with the previous
            # window and the loop stops once it no longer decreases
            while aggressive_flag and sub_iter < 100:
                enc_optimizer.zero_grad()
                dec_optimizer.zero_grad()

                burn_num_examples += batch_data_enc.size(0)
                loss, loss_rc, loss_kl = vae.loss(batch_data_enc, kl_weight, nsamples=args.nsamples)

                burn_cur_loss += loss.sum().item()
                loss = loss.mean(dim=-1)

                loss.backward()
                torch.nn.utils.clip_grad_norm_(vae.parameters(), clip_grad)

                enc_optimizer.step()

                id_ = np.random.choice(x_train.size(0), args.batch_size, replace=False)
                batch_data_enc = torch.bernoulli(x_train[id_])

                if sub_iter % 10 == 0:
                    burn_cur_loss = burn_cur_loss / burn_num_examples
                    if burn_pre_loss - burn_cur_loss < 0:
                        break
                    burn_pre_loss = burn_cur_loss
                    burn_cur_loss = burn_num_examples = 0

                sub_iter += 1

            # print(sub_iter)

            # regular update on the current batch
            enc_optimizer.zero_grad()
            dec_optimizer.zero_grad()

            loss, loss_rc, loss_kl = vae.loss(batch_data, kl_weight, nsamples=args.nsamples)
            loss = loss.mean(dim=-1)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(vae.parameters(), clip_grad)

            loss_rc = loss_rc.sum()
            loss_kl = loss_kl.sum()

            # while aggressive, the encoder was already updated in the inner loop
            if not aggressive_flag:
                enc_optimizer.step()

            dec_optimizer.step()

            report_rec_loss += loss_rc.item()
            report_kl_loss += loss_kl.item()

            if iter_ % log_niter == 0:
                train_loss = (report_rec_loss + report_kl_loss) / report_num_examples
                if aggressive_flag or epoch == 0:
                    vae.eval()
                    with torch.no_grad():
                        mi = calc_mi(vae, val_loader)
                        au, _ = calc_au(vae, val_loader)
                    vae.train()

                    print('epoch: %d, iter: %d, avg_loss: %.4f, kl: %.4f, mi: %.4f, recon: %.4f,' \
                          'au %d, time elapsed %.2fs' %
                          (epoch, iter_, train_loss, report_kl_loss / report_num_examples, mi,
                           report_rec_loss / report_num_examples, au, time.time() - start))
                else:
                    print('epoch: %d, iter: %d, avg_loss: %.4f, kl: %.4f, recon: %.4f,' \
                          'time elapsed %.2fs' %
                          (epoch, iter_, train_loss, report_kl_loss / report_num_examples,
                           report_rec_loss / report_num_examples, time.time() - start))
                sys.stdout.flush()

                report_rec_loss = report_kl_loss = 0
                report_num_examples = 0

            iter_ += 1

            # once per len(train_loader) iterations: stop the aggressive phase
            # after the fifth check on which MI falls below the best value so far
            if aggressive_flag and (iter_ % len(train_loader)) == 0:
                vae.eval()
                cur_mi = calc_mi(vae, val_loader)
                vae.train()
                if cur_mi - best_mi < 0:
                    mi_not_improved += 1
                    if mi_not_improved == 5:
                        aggressive_flag = False
                        print("STOP BURNING")
                else:
                    best_mi = cur_mi

                pre_mi = cur_mi

        print('kl weight %.4f' % kl_weight)
        print('epoch: %d, VAL' % epoch)

        vae.eval()
        with torch.no_grad():
            loss, nll, kl = test(vae, val_loader, "VAL", args)
            au, au_var = calc_au(vae, val_loader)
            print("%d active units" % au)
            # print(au_var)

        # keep the checkpoint with the best validation loss
        if loss < best_loss:
            print('update best loss')
            best_loss = loss
            best_nll = nll
            best_kl = kl
            torch.save(vae.state_dict(), args.save_path)

        # decay the learning rate after `decay_epoch` epochs without
        # improvement, restarting from the best checkpoint with fresh optimizers
        if loss > best_loss:
            opt_dict["not_improved"] += 1
            if opt_dict["not_improved"] >= decay_epoch:
                opt_dict["best_loss"] = loss
                opt_dict["not_improved"] = 0
                opt_dict["lr"] = opt_dict["lr"] * lr_decay
                vae.load_state_dict(torch.load(args.save_path))
                decay_cnt += 1
                print('new lr: %f' % opt_dict["lr"])
                enc_optimizer = optim.Adam(vae.encoder.parameters(), lr=opt_dict["lr"])
                dec_optimizer = optim.Adam(vae.decoder.parameters(), lr=opt_dict["lr"])
        else:
            opt_dict["not_improved"] = 0
            opt_dict["best_loss"] = loss

        if decay_cnt == max_decay:
            break

        if epoch % args.test_nepoch == 0:
            with torch.no_grad():
                loss, nll, kl = test(vae, test_loader, "TEST", args)

        vae.train()

    # compute importance weighted estimate of log p(x)
    vae.load_state_dict(torch.load(args.save_path))
    vae.eval()
    with torch.no_grad():
        loss, nll, kl = test(vae, test_loader, "TEST", args)
        au, au_var = calc_au(vae, test_loader)
        print("%d active units" % au)
        # print(au_var)

    test_loader = torch.utils.data.DataLoader(test_data, batch_size=50, shuffle=True)
    with torch.no_grad():
        calc_iwnll(vae, test_loader, args)
# NOTE(review): this excerpt starts mid-script; any condition guarding the
# generator checkpoint load lies outside the visible source -- confirm.
netG.load_state_dict(torch.load(opt.netG))
print(netG)

# initialise the discriminator, optionally restoring a checkpoint
netD.apply(weights_init)
if opt.netD != '':
    netD.load_state_dict(torch.load(opt.netD))
print(netD)

criterion = nn.BCELoss()
criterion_MSE = nn.MSELoss()

# preallocated buffers for one batch of images and of generator noise
input = torch.FloatTensor(opt.batchSize, 3, opt.imageSize, opt.imageSize)
noise = torch.FloatTensor(opt.batchSize, nz, 1, 1)
if opt.binary:
    bernoulli_prob = torch.FloatTensor(opt.batchSize, nz, 1, 1).fill_(0.5)
    # NOTE(review): this fixed noise takes values in {0, 1}, whereas the
    # binary noise drawn in train_both_networks is 2 * (bernoulli - 0.5),
    # i.e. in {-1, 1} -- confirm the mismatch is intended.
    fixed_noise = torch.bernoulli(bernoulli_prob)
else:
    fixed_noise = torch.FloatTensor(opt.batchSize, nz, 1, 1).normal_(0, 1)
label = torch.FloatTensor(opt.batchSize)
real_label = 1
fake_label = 0

if opt.cuda:
    netD.cuda()
    netG.cuda()
    criterion.cuda()
    criterion_MSE.cuda()
    input, label = input.cuda(), label.cuda()
    noise, fixed_noise = noise.cuda(), fixed_noise.cuda()

input = Variable(input)
def forward(numeric, train=True, printHere=False):
    """Run one forward pass of the stochastic memory filter plus language model.

    The first element of ``numeric`` is replicated across
    ``args.NUMBER_OF_REPLICATES`` columns and prefixed with ``beginning``.
    Every position gets a retention probability; positions are kept or zeroed
    by sampling from it, and the language model predicts the final token from
    the filtered input. The returned loss is a score-function surrogate with a
    learned baseline, plus baseline and L2 terms.

    :param numeric: pair ``(numeric, numeric_chars)``; only the first element
        is used here.
    :param train: unused in this function.
    :param printHere: when true, print per-position diagnostics and the
        running averages.
    :return: ``(loss, number_of_target_entries)``.

    Side effects: updates the module-level ``hidden``, ``beginning`` and the
    four running-average globals.
    """
    global hidden
    global beginning
    global beginning_chars
    global runningAverageBaselineDeviation
    global runningAveragePredictionLoss
    global runningAverageReward
    global expectedRetentionRate

    if hidden is None:
        beginning = zeroBeginning
    else:
        carried_h = Variable(hidden[0]).detach()
        carried_c = Variable(hidden[1]).detach()
        # Randomly restart some columns: their state and their "beginning"
        # token are replaced by the zero versions.
        forRestart = bernoulli.sample()
        restart_state = forRestart.unsqueeze(0).unsqueeze(2) == 1
        carried_h = torch.where(restart_state, zeroHidden, carried_h)
        carried_c = torch.where(restart_state, zeroHidden, carried_c)
        hidden = (carried_h, carried_c)
        beginning = torch.where(
            forRestart.unsqueeze(0) == 1, zeroBeginning, beginning)

    numeric, _numeric_chars = numeric
    numeric = numeric.expand(-1, args.NUMBER_OF_REPLICATES)
    numeric = torch.cat([beginning, numeric], dim=0)
    embedded_everything = word_embeddings(numeric)

    # Positional contribution to the retention logit.
    numeric_positions = torch.LongTensor(
        range(args.sequence_length + 1)).cuda().unsqueeze(1)
    embedded_positions = positional_embeddings(numeric_positions)
    numeric_embedded = memory_word_pos_inter(embedded_positions)

    # Per-word contribution (word embeddings are detached here).
    memory_byword_inner = memory_mlp_inner(embedded_everything.detach())
    memory_hidden_logit_per_wordtype = memory_mlp_outer(
        relu(memory_byword_inner))

    # Position/word interaction contribution.
    word_side = relu(
        memory_mlp_inner_bilinear(embedded_everything.detach()))
    attention_bilinear_term = torch.bmm(
        memory_bilinear(embedded_positions),
        word_side.transpose(1, 2)).transpose(1, 2)

    memory_hidden_logit = (numeric_embedded
                           + memory_hidden_logit_per_wordtype
                           + attention_bilinear_term)
    memory_hidden = sigmoid(memory_hidden_logit)

    # Learned baseline for the prediction loss, bounded to (0, 10).
    baselineValues = 10 * sigmoid(
        perword_baseline_outer(
            relu(perword_baseline_inner(
                embedded_everything[-1].detach())))).squeeze(1)
    assert tuple(baselineValues.size()) == (args.NUMBER_OF_REPLICATES, )

    # Sample keep/drop decisions and their log-probabilities.
    memory_filter = torch.bernoulli(input=memory_hidden)
    bernoulli_logprob = torch.where(
        memory_filter == 1,
        torch.log(memory_hidden + 1e-10),
        torch.log(1 - memory_hidden + 1e-10))
    bernoulli_logprob_perBatch = bernoulli_logprob.mean(dim=0)

    if args.entropy_weight > 0:
        entropy = -(
            memory_hidden * torch.log(memory_hidden + 1e-10)
            + (1 - memory_hidden) * torch.log(1 - memory_hidden + 1e-10)
        ).mean()
    else:
        entropy = -1.0

    memory_filter = memory_filter.squeeze(2)
    # Dropped positions are replaced by index 0.
    numeric_noised = torch.where(memory_filter == 1, numeric, 0 * numeric)

    # Language-model input and target.
    input_tensor = Variable(numeric_noised[:-1], requires_grad=False)
    target_tensor = Variable(numeric[1:], requires_grad=False)

    embedded = word_embeddings(input_tensor)
    if TRAIN_LM:
        embedded = char_dropout(embedded)
        mask = bernoulli_input.sample()
        mask = mask.view(1, args.batchSize, 2 * args.word_embedding_size)
        embedded = embedded * mask

    out, hidden = rnn_drop(embedded, hidden)
    # Only the last position is predicted.
    out = out[-1:]
    if TRAIN_LM:
        mask = bernoulli_output.sample()
        mask = mask.view(1, args.batchSize, args.hidden_dim)
        out = out * mask

    logits = output(out)
    log_probs = logsoftmax(logits)

    # Prediction loss per replicate.
    lossTensor = print_loss(
        log_probs.view(-1, len(itos) + 3),
        target_tensor[-1].view(-1)).view(-1, args.NUMBER_OF_REPLICATES)

    # Negative reward = prediction loss + weighted retention rate.
    negativeRewardsTerm1 = lossTensor.mean(dim=0)
    negativeRewardsTerm2 = memory_filter.mean(dim=0)
    negativeRewardsTerm = (negativeRewardsTerm1
                           + args.RATE_WEIGHT * negativeRewardsTerm2)

    # Everything except baselineValues is detached so that only the baseline
    # receives gradient through this difference.
    rewardMinusBaseline = (
        negativeRewardsTerm.detach() - baselineValues
        - args.RATE_WEIGHT
        * memory_hidden.mean(dim=0).squeeze(dim=1).detach())

    # Policy term: the advantage is detached from the baseline.
    loss = (rewardMinusBaseline.detach()
            * bernoulli_logprob_perBatch.squeeze(1)).mean()
    if args.entropy_weight > 0:
        loss -= args.entropy_weight * entropy

    # Baseline regression term.
    loss += args.reward_multiplier_baseline * rewardMinusBaseline.pow(
        2).mean()

    # L2 penalty on the three logit components.
    loss += args.bilinear_l2 * (
        numeric_embedded.pow(2).mean()
        + memory_hidden_logit_per_wordtype.pow(2).mean()
        + attention_bilinear_term.pow(2).mean())

    # Exponential running averages.
    factor = 0.9996**args.batchSize
    expectedRetentionRate = factor * expectedRetentionRate + (
        1 - factor) * float(memory_hidden.mean())
    runningAverageBaselineDeviation = (
        factor * runningAverageBaselineDeviation
        + (1 - factor) * float((rewardMinusBaseline).abs().mean()))
    runningAveragePredictionLoss = factor * runningAveragePredictionLoss + (
        1 - factor) * round(float(negativeRewardsTerm1.mean()), 3)
    runningAverageReward = factor * runningAverageReward + (
        1 - factor) * float(negativeRewardsTerm.mean())

    if printHere:
        losses = lossTensor.data.cpu().numpy()
        numericCPU = numeric.cpu().data.numpy()
        numeric_noisedCPU = numeric_noised.cpu().data.numpy()
        memory_hidden_CPU = memory_hidden[:, 0, 0].cpu().data.numpy()
        per_wordtype_cpu = memory_hidden_logit_per_wordtype.cpu().data
        bilinear_cpu = attention_bilinear_term.cpu().data
        numeric_embedded_cpu = numeric_embedded.cpu().data

        print(("NONE", itos_total[numericCPU[0][0]]))
        last = args.sequence_length - 1
        for i in range((args.sequence_length)):
            print((
                losses[0][0] if i == last else None,
                itos_total[numericCPU[i + 1][0]],
                itos_total[numeric_noisedCPU[i + 1][0]],
                memory_hidden_CPU[i + 1],
                float(baselineValues[0]) if i == last else "",
                float(numeric_embedded_cpu[i + 1, 0, 0]),
                float(per_wordtype_cpu[i + 1, 0, 0]),
                float(bilinear_cpu[i + 1, 0, 0])))

        print(lossTensor.view(-1))
        print(baselineValues.view(-1))
        print("EMPIRICAL DEVIATION FROM BASELINE",
              (lossTensor - baselineValues).abs().mean())

        summary = ("PREDICTION_LOSS", runningAveragePredictionLoss,
                   "\tTERM2", round(float(negativeRewardsTerm2.mean()), 3),
                   "\tAVERAGE_RETENTION", expectedRetentionRate,
                   "\tDEVIATION FROM BASELINE",
                   runningAverageBaselineDeviation,
                   "\tREWARD", runningAverageReward,
                   "\tENTROPY", float(entropy))
        print(*summary)
        if updatesCount % 5000 == 0:
            print("\t".join([str(x) for x in summary]), file=sys.stderr)

    return loss, target_tensor.view(-1).size()[0]
def main(seed=0, p_destroy=0):
    """Evaluate a saved network on MNIST after zeroing a fraction of synapses.

    Loads a stored network, disables learning on the ``('X', 'Y')``
    connection, zeroes each of its weights independently with probability
    ``p_destroy``, runs the test images through the network and appends the
    mean and max of each accuracy curve to ``synapse_robust.csv`` under
    ``results_path``.

    :param seed: seed for NumPy and PyTorch (CPU and CUDA) generators; also
        written to the results file.
    :param p_destroy: probability with which each synapse weight is set to 0.
    """
    model = '0_16_2_250_4_0.01_0.99_60000_250.0_250_1.0_0.05_1e-07_0.5_0.2_10_250.pt'

    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
    torch.cuda.manual_seed_all(seed)

    crop = 4
    time = 250
    n_filters = 250
    intensity = 0.5
    n_examples = 10000
    n_classes = 10

    # Load network.
    network = load_network(
        os.path.join(
            ROOT_DIR, 'params', 'mnist', 'crop_locally_connected', model
        ),
        learning=False
    )

    # Freeze the network: no weight updates, no threshold adaptation, no
    # weight normalisation.
    network.connections['X', 'Y'].update_rule = NoOp(
        connection=network.connections['X', 'Y'],
        nu=network.connections['X', 'Y'].nu
    )
    network.layers['Y'].theta_decay = 0
    network.layers['Y'].theta_plus = 0
    network.connections['X', 'Y'].norm = None

    for l in network.layers:
        network.layers[l].dt = network.dt

    for c in network.connections:
        network.connections[c].dt = network.dt

    network.layers['Y'].lbound = None
    network.layers['Y'].one_spike = True

    # Destroy `p_destroy` percentage of synapses (set to 0).
    mask = torch.bernoulli(
        p_destroy * torch.ones(network.connections['X', 'Y'].w.size())
    ).byte()
    network.connections['X', 'Y'].w[mask] = 0

    conv_size = network.connections['X', 'Y'].conv_size
    conv_prod = int(np.prod(conv_size))
    n_neurons = n_filters * conv_prod

    # Voltage recording for excitatory and inhibitory layers.
    voltage_monitor = Monitor(network.layers['Y'], ['v'], time=time)
    network.add_monitor(voltage_monitor, name='output_voltage')

    # Load MNIST data.
    dataset = MNIST(path=data_path, download=True, shuffle=True)

    images, labels = dataset.get_test()
    images *= intensity
    images = images[:, crop:-crop, crop:-crop]

    update_interval = 250

    # Record spikes during the simulation.
    spike_record = torch.zeros(update_interval, time, n_neurons)

    # Neuron assignments and spike proportions.
    path = os.path.join(
        ROOT_DIR, 'params', 'mnist', 'crop_locally_connected',
        f'auxiliary_{model}'
    )
    # NOTE(review): torch.load unpickles the file; only load trusted files.
    # The context manager closes the handle, which the original leaked.
    with open(path, 'rb') as aux_file:
        assignments, proportions, rates, ngram_scores = torch.load(aux_file)

    # Sequence of accuracy estimates.
    curves = {'all': [], 'proportion': [], 'ngram': []}
    predictions = {
        scheme: torch.Tensor().long() for scheme in curves.keys()
    }

    spikes = {}
    for layer in set(network.layers):
        spikes[layer] = Monitor(
            network.layers[layer], state_vars=['s'], time=time
        )
        network.add_monitor(spikes[layer], name=f'{layer}_spikes')

    start = t()
    for i in range(n_examples):
        if i % 10 == 0:
            print(f'Progress: {i} / {n_examples} ({t() - start:.4f} seconds)')
            start = t()

        if i % update_interval == 0 and i > 0:
            if i % len(labels) == 0:
                current_labels = labels[-update_interval:]
            else:
                current_labels = labels[i % len(images) - update_interval:i % len(images)]

            # Update and print accuracy evaluations.
            curves, preds = update_curves(
                curves, current_labels, n_classes, spike_record=spike_record,
                assignments=assignments, proportions=proportions,
                ngram_scores=ngram_scores, n=2
            )
            print_results(curves)

            for scheme in preds:
                predictions[scheme] = torch.cat(
                    [predictions[scheme], preds[scheme]], -1
                )

        # Get next input sample.
        image = images[i % len(images)].contiguous().view(-1)
        sample = poisson(datum=image, time=time, dt=1)
        inpts = {'X': sample}

        # Run the network on the input.
        network.run(inpts=inpts, time=time)

        # If the output layer barely spiked, double the input intensity and
        # retry, at most three times.
        retries = 0
        while spikes['Y'].get('s').sum() < 5 and retries < 3:
            retries += 1
            image *= 2
            sample = poisson(datum=image, time=time, dt=1)
            inpts = {'X': sample}
            network.run(inpts=inpts, time=time)

        # Add to spikes recording.
        spike_record[i % update_interval] = spikes['Y'].get('s').t()

        network.reset_()  # Reset state variables.

    print(f'Progress: {n_examples} / {n_examples} ({t() - start:.4f} seconds)')

    # Score the final, not yet evaluated window of examples.
    i += 1

    if i % len(labels) == 0:
        current_labels = labels[-update_interval:]
    else:
        current_labels = labels[i % len(images) - update_interval:i % len(images)]

    # Update and print accuracy evaluations.
    curves, preds = update_curves(
        curves, current_labels, n_classes, spike_record=spike_record,
        assignments=assignments, proportions=proportions,
        ngram_scores=ngram_scores, n=2
    )
    print_results(curves)

    for scheme in preds:
        predictions[scheme] = torch.cat(
            [predictions[scheme], preds[scheme]], -1
        )

    print('Average accuracies:\n')
    for scheme in curves.keys():
        print('\t%s: %.2f' % (scheme, float(np.mean(curves[scheme]))))

    # Save results to disk.
    results = [
        np.mean(curves['all']), np.mean(curves['proportion']),
        np.mean(curves['ngram']), np.max(curves['all']),
        np.max(curves['proportion']), np.max(curves['ngram'])
    ]

    to_write = [str(x) for x in [seed, p_destroy] + results]
    name = 'synapse_robust.csv'
    results_file = os.path.join(results_path, name)

    if not os.path.isfile(results_file):
        with open(results_file, 'w') as f:
            # The header names all eight columns written per row; the
            # original header listed only the first two.
            f.write(
                'random_seed,p_destroy,mean_all,mean_proportion,mean_ngram,'
                'max_all,max_proportion,max_ngram\n'
            )

    with open(results_file, 'a') as f:
        f.write(','.join(to_write) + '\n')
def training_step(self, batch, batch_idx):
    """Run one policy-gradient step for the data value estimator.

    Scores the batch, samples a 0/1 selection vector from those scores, fits
    the prediction model on the selection, evaluates it on the validation
    loader and returns a score-function loss weighted by the gain in
    validation accuracy over ``self.validation_performance``.

    :param batch: ``(x, y)`` or ``(x, y, is_corrupted)``.
    :param batch_idx: unused.
    :return: dict with keys ``'loss'`` and ``'val_accuracy'``.
    """
    if len(batch) == 2:
        x, y = batch
        is_corrupted = None
    else:
        x, y, is_corrupted = batch

    estimated_dv = torch.sigmoid(self(x, y)).squeeze()
    selection_vector = torch.bernoulli(estimated_dv).detach()

    if selection_vector.sum() == 0:
        # Nothing was selected: resample with probability 0.5 everywhere.
        fallback_dv = 0.5 * torch.ones_like(estimated_dv)
        selection_vector = torch.bernoulli(fallback_dv).detach()

    # The selection is detached so the prediction model's ops are not
    # tracked with respect to the estimator.
    training_accuracy = self.prediction_model.dvrl_fit(
        x, y, selection_vector)

    eps = self.hparams.epsilon
    selected_term = selection_vector * torch.log(estimated_dv + eps)
    rejected_term = (1.0 - selection_vector) * torch.log(
        1.0 - estimated_dv + eps)
    log_prob = torch.sum(selected_term + rejected_term)

    # Penalise a mean score above the threshold or below one minus it.
    zero = torch.tensor(0.0, device=estimated_dv.device)
    mean_dv = torch.mean(estimated_dv.squeeze())
    exploration_bonus = torch.max(
        mean_dv - self.exploration_threshold, zero) + torch.max(
            (1.0 - self.exploration_threshold) - mean_dv, zero)

    cross_entropy_loss_sum = 0.0
    accuracy_tracker = pl.metrics.Accuracy(compute_on_step=False)

    if is_corrupted is not None:
        with torch.no_grad():
            self.dve.eval()
            corrupted_indices = torch.where(is_corrupted)[0]
            clean_indices = torch.where(~is_corrupted)[0]
            self.log('mean_corrupted_dve',
                     torch.sigmoid(
                         self(x[corrupted_indices],
                              y[corrupted_indices])).mean(),
                     prog_bar=True)
            self.log('mean_clean_dve',
                     torch.sigmoid(
                         self(x[clean_indices], y[clean_indices])).mean(),
                     prog_bar=True)
            self.dve.train()

    for val_batch in self.validation_dataloader:
        if len(val_batch) == 2:
            x_val, y_val = val_batch
        else:
            x_val, y_val, val_corrupted = val_batch
        with torch.no_grad():
            self.prediction_model.eval()
            logits = self.prediction_model(x_val.cuda()).cpu()
            accuracy_tracker(logits.detach().cpu(), y_val.detach().cpu())
            cross_entropy_loss_sum += F.cross_entropy(
                logits, y_val, reduction='sum')

    mean_cross_entropy_loss = cross_entropy_loss_sum / self.val_split
    val_accuracy = accuracy_tracker.compute()

    reward = val_accuracy - self.validation_performance
    dve_loss = -reward * log_prob + 1.e3 * exploration_bonus

    # Moving average over a window of hparams.T steps.
    window = self.hparams.T
    self.baseline_delta = (window - 1) * self.baseline_delta / window + \
        mean_cross_entropy_loss / window

    step_metrics = (
        ('val_accuracy', val_accuracy),
        ('training_accuracy', training_accuracy),
        ('estimated_dv_sum', estimated_dv.sum()),
        ('estimated_dv_mean', estimated_dv.mean()),
        ('estimated_dv_std', estimated_dv.std()),
        ('exploration_bonus', exploration_bonus),
    )
    for metric_name, metric_value in step_metrics:
        self.log(metric_name, metric_value, prog_bar=True, on_step=True)

    return {'loss': dve_loss, 'val_accuracy': val_accuracy}
def bern_eq(*shape):
    """Sample a tensor of fair coin flips (each entry 0 or 1 with p = 0.5).

    The result is passed through the file's ``cuda`` helper before being
    returned.
    """
    half_probs = torch.ones(*shape).fill_(0.5)
    flips = torch.bernoulli(half_probs)
    return cuda(flips)
def sample_v(self, y):
    """Return ``(p, sample)`` for the visible units given hidden values ``y``.

    ``p`` is ``sigmoid(y @ W + b)`` and ``sample`` is a Bernoulli draw from
    ``p``.
    """
    # Weighted input from the hidden units.
    weighted = torch.mm(y, self.W)
    # Broadcast the bias to the shape of the weighted input.
    bias = self.b.expand_as(weighted)
    p_v_given_h = torch.sigmoid(weighted + bias)
    visible = torch.bernoulli(p_v_given_h)
    return p_v_given_h, visible
def sample_h(self, x):
    """Return ``(p, sample)`` for the hidden units given visible values ``x``.

    ``p`` is ``sigmoid(x @ W.T + a)`` and ``sample`` is a Bernoulli draw from
    ``p``.
    """
    weighted = torch.mm(x, self.W.t())
    bias = self.a.expand_as(weighted)
    p_h_given_v = torch.sigmoid(weighted + bias)
    hidden = torch.bernoulli(p_h_given_v)
    return p_h_given_v, hidden
def random_mask(x, p, training):
    """Return a random keep-mask shaped like ``x``.

    :param x: object whose ``shape`` gives the shape of the mask.
    :param p: probability that an entry of the mask is 0.
    :param training: when false, no mask is sampled.
    :return: during training, a 0/1 tensor in which each entry is 1 with
        probability ``1 - p``; otherwise the float ``1.``.

    When ``x`` is a tensor the mask is created on ``x``'s device. The
    original always moved the mask to CUDA, which fails on CPU-only machines
    and can place the mask on a different device from ``x``. Inputs that are
    not tensors keep the original CUDA placement.
    """
    if not training:
        return 1.
    device = x.device if torch.is_tensor(x) else torch.device('cuda')
    keep_prob = (1. - p) * torch.ones(x.shape, device=device)
    return torch.bernoulli(keep_prob)
def preprocess(data):
    """Binarize ``data`` by Bernoulli sampling when dynamic binarization is on.

    With ``args.dynamic_binarization`` unset, ``data`` is returned unchanged.
    """
    if not args.dynamic_binarization:
        return data
    return torch.bernoulli(data)
def __call__(self, img):
    """Return a copy of ``img`` with randomly chosen positions set to 255.

    Each position of the first two axes is chosen independently with
    probability ``self.p``. ``img`` itself is not modified.
    """
    rows, cols = img.shape[0], img.shape[1]
    probs = torch.Tensor(rows, cols).fill_(self.p)
    chosen = torch.bernoulli(probs).numpy() == 1
    noisy = img.copy()
    noisy[chosen] = 255
    return noisy
def sample(theta):
    """Draw a Bernoulli sample with per-element success probability ``theta``."""
    # The straight-through variant stays disabled, as in the original:
    # x = theta - (theta - x).detach().clone()
    return torch.bernoulli(theta)
def __call__(self, img):
    """Apply ``self.transform`` to ``img`` with probability ``self.prob``.

    When the coin flip fails, ``img`` is returned unchanged.
    """
    coin = torch.bernoulli(torch.Tensor([self.prob]))[0]
    if coin != 1:
        return img
    return self.transform(img)
if args.model in ('VAE', 'ConditionalVAE', 'VIS'):
    # Load MNIST and binarize it once, up front, by Bernoulli sampling each
    # pixel intensity.
    train_dataset = datasets.MNIST(root='./data/', train=True,
                                   transform=transforms.ToTensor(),
                                   download=True)
    test_dataset = datasets.MNIST(root='./data/', train=False,
                                  transform=transforms.ToTensor())
    print(len(train_dataset))
    print(len(test_dataset))

    # Seed before sampling so the binarization is reproducible.
    torch.manual_seed(args.seed)
    train_img = torch.stack([torch.bernoulli(d[0]) for d in train_dataset])
    train_label = torch.LongTensor([d[1] for d in train_dataset])
    test_img = torch.stack([torch.bernoulli(d[0]) for d in test_dataset])
    test_label = torch.LongTensor([d[1] for d in test_dataset])
    print(train_img.size(), train_label.size(), test_img.size(),
          test_label.size())

    # The last 10000 training points form the validation set and all the
    # remaining points form the training set. The original sliced
    # `[:10000]`, which kept only the first 10000 of the remaining points.
    val_img = train_img[-10000:].clone()
    val_label = train_label[-10000:].clone()
    train_img = train_img[:-10000]
    train_label = train_label[:-10000]

    train = torch.utils.data.TensorDataset(train_img, train_label)
    val = torch.utils.data.TensorDataset(val_img, val_label)
    test = torch.utils.data.TensorDataset(test_img, test_label)
def sample_v(self, y):
    """Return ``(p, sample)`` for the visible units given hidden values ``y``.

    ``p`` is ``sigmoid(y @ W + bias_visible)`` and ``sample`` is a Bernoulli
    draw from ``p``. ``W`` is used without a transpose here.
    """
    weighted = torch.mm(y, self.W)
    bias = self.bias_visible.expand_as(weighted)
    prob_v_given_h = torch.sigmoid(weighted + bias)
    visible = torch.bernoulli(prob_v_given_h)
    return prob_v_given_h, visible
def decode(self, mol_vec, prob_decode):
    """Decode a tree of vocabulary nodes from the latent vector ``mol_vec``.

    Starting from a predicted root, the loop works on the node on top of a
    stack. At each step it either expands that node with a new child or
    backtracks to its parent, for at most ``MAX_DECODE_LEN`` steps.

    :param mol_vec: latent vector; concatenated with ``(1, hidden_size)``
        tensors along dim 1, so it is presumably of shape ``(1, latent)``.
    :param prob_decode: if true, the stop decision and the child label are
        sampled; otherwise both are chosen greedily.
    :return: ``(root, all_nodes)``: the root node and every created node in
        creation order.
    """
    stack = []
    init_hidden = create_var(torch.zeros(1, self.hidden_size))
    zero_pad = create_var(torch.zeros(1, 1, self.hidden_size))

    # Root prediction
    root_hidden = torch.cat([init_hidden, mol_vec], dim=1)
    root_hidden = nn.ReLU()(self.W(root_hidden))
    root_score = self.W_o(root_hidden)
    _, root_wid = torch.max(root_score, dim=1)
    # .item() yields a plain Python int, matching the .item() calls used
    # elsewhere in this method.
    root_wid = root_wid.item()

    root = MolTreeNode(self.vocab.get_smiles(root_wid))
    root.wid = root_wid
    root.idx = 0
    stack.append((root, self.vocab.get_slots(root.wid)))

    all_nodes = [root]
    h = {}  # messages keyed by (from_idx, to_idx)
    for step in range(MAX_DECODE_LEN):
        node_x, fa_slot = stack[-1]
        cur_h_nei = [h[(node_y.idx, node_x.idx)]
                     for node_y in node_x.neighbors]
        if len(cur_h_nei) > 0:
            cur_h_nei = torch.stack(cur_h_nei, dim=0).view(
                1, -1, self.hidden_size)
        else:
            cur_h_nei = zero_pad

        cur_x = create_var(torch.LongTensor([node_x.wid]))
        cur_x = self.embedding(cur_x)

        # Predict stop
        cur_h = cur_h_nei.sum(dim=1)
        stop_hidden = torch.cat([cur_x, cur_h, mol_vec], dim=1)
        stop_hidden = nn.ReLU()(self.U(stop_hidden))
        stop_score = nn.Sigmoid()(self.U_s(stop_hidden) * 20).squeeze()

        if prob_decode:
            backtrack = (torch.bernoulli(1.0 - stop_score.data).item() == 1)
        else:
            # stop_score has been squeezed; indexing it with [0] fails on a
            # 0-dim tensor, so read the scalar with .item() instead.
            backtrack = (stop_score.item() < 0.5)

        if not backtrack:  # Forward: predict next clique
            new_h = GRU(cur_x, cur_h_nei, self.W_z, self.W_r, self.U_r,
                        self.W_h)
            pred_hidden = torch.cat([new_h, mol_vec], dim=1)
            pred_hidden = nn.ReLU()(self.W(pred_hidden))
            pred_score = nn.Softmax(dim=1)(self.W_o(pred_hidden) * 20)
            if prob_decode:
                if pred_score.data.squeeze().sum().item() > 1:
                    print(pred_score.data.squeeze().sum().item())
                sort_wid = torch.multinomial(pred_score.data.squeeze(), 5)
            else:
                _, sort_wid = torch.sort(pred_score, dim=1, descending=True)
                sort_wid = sort_wid.data.squeeze()
            sort_wid = sort_wid.cpu().numpy()

            # Take the first of the top five candidates that passes both
            # the slot check and the assembly check.
            next_wid = None
            for wid in sort_wid[:5]:
                slots = self.vocab.get_slots(wid)
                node_y = MolTreeNode(self.vocab.get_smiles(wid))
                if have_slots(fa_slot, slots) and can_assemble(node_x,
                                                               node_y):
                    next_wid = wid
                    next_slots = slots
                    break

            if next_wid is None:
                backtrack = True  # No more children can be added
            else:
                node_y = MolTreeNode(self.vocab.get_smiles(next_wid))
                node_y.wid = next_wid
                node_y.idx = step + 1
                node_y.neighbors.append(node_x)
                h[(node_x.idx, node_y.idx)] = new_h[0]
                stack.append((node_y, next_slots))
                all_nodes.append(node_y)

        # Deliberately `if`, not `else`: a failed expansion above also
        # backtracks.
        if backtrack:
            if len(stack) == 1:
                break  # At root, terminate

            node_fa, _ = stack[-2]
            cur_h_nei = [h[(node_y.idx, node_x.idx)]
                         for node_y in node_x.neighbors
                         if node_y.idx != node_fa.idx]
            if len(cur_h_nei) > 0:
                cur_h_nei = torch.stack(cur_h_nei, dim=0).view(
                    1, -1, self.hidden_size)
            else:
                cur_h_nei = zero_pad

            new_h = GRU(cur_x, cur_h_nei, self.W_z, self.W_r, self.U_r,
                        self.W_h)
            h[(node_x.idx, node_fa.idx)] = new_h[0]
            node_fa.neighbors.append(node_x)
            stack.pop()

    return root, all_nodes
def sample(self):
    """
    Draw a Bernoulli sample with success probabilities ``self.ps``.

    Ref: :py:meth:`pyro.distributions.distribution.Distribution.sample`.
    """
    draws = torch.bernoulli(self.ps.data)
    return Variable(draws)
def sample(self, sample_shape=torch.Size()):
    """Draw Bernoulli samples of shape ``self._extended_shape(sample_shape)``.

    ``self.probs`` is expanded to that shape and used as the per-element
    success probability.
    """
    full_shape = self._extended_shape(sample_shape)
    expanded_probs = self.probs.expand(full_shape)
    return torch.bernoulli(expanded_probs)
def sample_n(self, n):
    """Draw ``n`` Bernoulli samples, stacked along a new leading dimension.

    The result has shape ``(n, *self.probs.size())``.
    """
    batched_probs = self.probs.expand(n, *self.probs.size())
    draws = torch.bernoulli(batched_probs)
    return draws
def forward(self, x, lab, test_flag):
    """Run the stacked LSTM over ``x`` and compute the loss against ``lab``.

    :param x: input sequence; iterated over dim 0, and dim 1 sizes the
        initial state.
    :param lab: targets passed to ``self.criterion``.
    :param test_flag: ``0`` samples a dropout mask; any other value scales
        by ``1 - drop_rate`` instead.
    :return: ``[loss, err, pout]``.
    """

    def _batchnorm(bn_layer, pre_activation):
        # Flatten the first two dims for batch norm, then restore the shape.
        d0, d1, d2 = (pre_activation.shape[0], pre_activation.shape[1],
                      pre_activation.shape[2])
        return bn_layer(pre_activation.view(d0 * d1, d2)).view(d0, d1, d2)

    doubled = self.bidir or self.twin_reg

    # Initial state (twice as many rows when both directions are stacked).
    state_rows = 2 * x.shape[1] if doubled else x.shape[1]
    h_init = Variable(torch.zeros(state_rows, self.hidden_dim))

    # Dropout mask: sampled once for training, a constant scale otherwise.
    keep = 1 - self.drop_rate
    if test_flag == 0:
        drop_mask = Variable(torch.bernoulli(
            torch.Tensor(h_init.shape[0], h_init.shape[1]).fill_(keep)))
    else:
        drop_mask = Variable(torch.FloatTensor([keep]))

    if self.use_cuda:
        x = x.cuda()
        lab = lab.cuda()
        h_init = h_init.cuda()
        drop_mask = drop_mask.cuda()

    if self.twin_reg:
        reg = 0

    if self.cnn_pre:
        x = self.cnn(x)

    for i in range(self.N_hid):
        # Append the time-reversed sequence along dim 1.
        if doubled:
            x = torch.cat([x, flip(x, 0)], 1)

        # Input projections for all time steps at once.
        wfx_out = self.wfx[i](x)
        wix_out = self.wix[i](x)
        wox_out = self.wox[i](x)
        wcx_out = self.wcx[i](x)

        if self.use_batchnorm:
            wfx_out = _batchnorm(self.bn_wfx[i], wfx_out)
            wix_out = _batchnorm(self.bn_wix[i], wix_out)
            wox_out = _batchnorm(self.bn_wox[i], wox_out)
            wcx_out = _batchnorm(self.bn_wcx[i], wcx_out)

        # With skip connections, the previous layer's pre-activations are
        # subtracted below; for the first layer they are zeros.
        if i == 0 and self.skip_conn:
            prev_pre_act = Variable(torch.zeros(
                wfx_out.shape[0], wfx_out.shape[1], wfx_out.shape[2]))
            if self.use_cuda:
                prev_pre_act = prev_pre_act.cuda()

        if i > 0 and self.skip_conn:
            prev_pre_act = pre_act

        # Recurrence over time steps.
        hiddens = []
        pre_act = []
        c = h_init
        h = h_init

        for k in range(x.shape[0]):
            ft = self.act_gate(wfx_out[k] + self.ufh[i](h))
            it = self.act_gate(wix_out[k] + self.uih[i](h))
            ot = self.act_gate(wox_out[k] + self.uoh[i](h))

            at = wcx_out[k] + self.uch[i](h)

            if self.skip_conn:
                pre_act.append(at)
                at = at - prev_pre_act[k]

            if self.use_laynorm:
                at = self.ln[i](at)

            c = it * self.act(at) * drop_mask + ft * c
            h = ot * self.act(c)

            hiddens.append(h)

        h = torch.stack(hiddens)
        if self.skip_conn:
            pre_act = torch.stack(pre_act)

        half = int(x.shape[1] / 2)

        # Re-join forward and (re-flipped) backward halves on dim 2.
        if self.bidir:
            h_f = h[:, 0:half]
            h_b = flip(h[:, half:x.shape[1]].contiguous(), 0)
            h = torch.cat([h_f, h_b], 2)

        if self.twin_reg:
            if not (self.bidir):
                h_f = h[:, 0:half]
                h_b = flip(h[:, half:x.shape[1]].contiguous(), 0)
                h = h_f
            reg = reg + torch.mean((h_f - h_b)**2)

        # Input of the next layer.
        x = h

    out = self.fco(h)

    if self.cost == "nll":
        pout = F.log_softmax(out, dim=2)
        pred = torch.max(pout, dim=2)[1]
        loss = self.criterion(pout.view(h.shape[0] * h.shape[1], -1),
                              lab.view(-1))
        err = torch.sum((pred != lab).float()) / (h.shape[0] * h.shape[1])

    if self.cost == "mse":
        loss = self.criterion(out, lab)
        pout = out
        err = Variable(torch.FloatTensor([0]))

    if self.twin_reg:
        loss = loss + self.twin_w * reg

    return [loss, err, pout]
def setup_reparam_mask(self, N):
    """Sample a random 0/1 mask of length ``N`` by rejection.

    Each entry is 1 with probability 0.30. A draw is accepted only if the
    number of ones is greater than 0.5 (at least one) and less than
    ``0.40 * N``.

    :param N: length of the mask; must be at least 3.
    :return: float tensor of shape ``(N,)`` with entries in {0, 1}.
    :raises ValueError: if ``N < 3``. No whole number of ones can then
        satisfy both bounds, and the original looped forever.
    """
    if N < 3:
        raise ValueError(
            "N must be at least 3 for a valid mask to exist, got %r" % (N,))
    while True:
        mask = torch.bernoulli(0.30 * torch.ones(N))
        n_ones = torch.sum(mask)
        if 0.5 < n_ones < 0.40 * N:
            return mask
def evaluate_vae(args, model, train_loader, data_loader, epoch, dir, mode):
    """Evaluate ``model`` on ``data_loader`` and plot diagnostics.

    :param args: run configuration; ``cuda``, ``dynamic_binarization``,
        ``prior``, ``MB`` and ``S`` are read here.
    :param model: model exposing ``calculate_loss``, ``reconstruct_x`` and,
        in test mode, the generation / bound / likelihood methods.
    :param train_loader: loader whose dataset supplies the training data
        for the train lower bound (test mode only).
    :param data_loader: loader to evaluate on.
    :param epoch: current epoch; used to name the reconstruction plots.
    :param dir: output directory prefix, used by string concatenation.
    :param mode: ``'validation'`` plots reconstructions; ``'test'`` also
        computes lower bounds and the test log-likelihood.
    :return: ``(loss, re, kl)`` averaged over batches, followed in test
        mode by ``log_likelihood_test, log_likelihood_train, elbo_test,
        elbo_train``.
    """
    evaluate_loss = 0
    evaluate_re = 0
    evaluate_kl = 0

    model.eval()

    recon_dir = dir + 'reconstruction/'
    for batch_idx, (data, target) in enumerate(data_loader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data, volatile=True), Variable(target)

        x = data

        loss, RE, KL = model.calculate_loss(x, average=True)

        evaluate_loss += loss.data[0]
        evaluate_re += -RE.data[0]
        evaluate_kl += KL.data[0]

        # Plot a 3x3 grid of reconstructions from the second batch; the
        # real images are plotted once, in the first epoch.
        if batch_idx == 1 and mode == 'validation':
            if epoch == 1:
                if not os.path.exists(recon_dir):
                    os.makedirs(recon_dir)
                plot_images(args, data.data.cpu().numpy()[0:9], recon_dir,
                            'real', size_x=3, size_y=3)
            x_mean = model.reconstruct_x(x)
            plot_images(args, x_mean.data.cpu().numpy()[0:9], recon_dir,
                        str(epoch), size_x=3, size_y=3)

    # The per-batch values are already batch averages, so divide by the
    # number of batches.
    n_batches = len(data_loader)
    evaluate_loss /= n_batches
    evaluate_re /= n_batches
    evaluate_kl /= n_batches

    if mode != 'test':
        return evaluate_loss, evaluate_re, evaluate_kl

    # Load all data.
    test_data = Variable(data_loader.dataset.data_tensor)
    test_target = Variable(data_loader.dataset.target_tensor)
    full_data = Variable(train_loader.dataset.data_tensor)

    if args.cuda:
        test_data, test_target, full_data = (
            test_data.cuda(), test_target.cuda(), full_data.cuda())

    if args.dynamic_binarization:
        full_data = torch.bernoulli(full_data)

    # Real images.
    plot_images(args, test_data.data.cpu().numpy()[0:25], dir, 'real',
                size_x=5, size_y=5)

    # Reconstructions.
    samples = model.reconstruct_x(test_data[0:25])
    plot_images(args, samples.data.cpu().numpy(), dir, 'reconstructions',
                size_x=5, size_y=5)

    # Generations.
    samples_rand = model.generate_x(25)
    plot_images(args, samples_rand.data.cpu().numpy(), dir, 'generations',
                size_x=5, size_y=5)

    if args.prior == 'vampprior':
        # Pseudo-inputs.
        pseudoinputs = model.means(model.idle_input).cpu().data.numpy()
        plot_images(args, pseudoinputs[0:25], dir, 'pseudoinputs',
                    size_x=5, size_y=5)

    # Lower bound on the test data.
    t_ll_s = time.time()
    elbo_test = model.calculate_lower_bound(test_data, MB=args.MB)
    t_ll_e = time.time()
    print('Test lower-bound value {:.2f} in time: {:.2f}s'.format(elbo_test, t_ll_e - t_ll_s))

    # Lower bound on the training data.
    t_ll_s = time.time()
    elbo_train = model.calculate_lower_bound(full_data, MB=args.MB)
    t_ll_e = time.time()
    print('Train lower-bound value {:.2f} in time: {:.2f}s'.format(elbo_train, t_ll_e - t_ll_s))

    # Log-likelihood on the test data.
    t_ll_s = time.time()
    log_likelihood_test = model.calculate_likelihood(
        test_data, dir, mode='test', S=args.S, MB=args.MB)
    t_ll_e = time.time()
    print('Test log_likelihood value {:.2f} in time: {:.2f}s'.format(log_likelihood_test, t_ll_e - t_ll_s))

    # The training log-likelihood is left disabled because it takes too
    # long; 0. is reported instead.
    t_ll_s = time.time()
    log_likelihood_train = 0.
    t_ll_e = time.time()
    print('Train log_likelihood value {:.2f} in time: {:.2f}s'.format(log_likelihood_train, t_ll_e - t_ll_s))

    return (evaluate_loss, evaluate_re, evaluate_kl, log_likelihood_test,
            log_likelihood_train, elbo_test, elbo_train)
def sample(*shape):
    """Sample fair coin flips of the given shape, wrapped in a ``Variable``.

    The tensor is passed through the file's ``cuda`` helper before wrapping.
    """
    half_probs = torch.ones(*shape).fill_(0.5)
    flips = cuda(torch.bernoulli(half_probs))
    return Variable(flips)
alpha = args.alpha

# Load MNIST as tensors.
train_dataset = datasets.MNIST(
    root='./data/',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)
test_dataset = datasets.MNIST(
    root='./data/',
    train=False,
    transform=transforms.ToTensor(),
)
print(len(train_dataset))
print(len(test_dataset))

# Binarize every image once by Bernoulli sampling its pixel intensities;
# the fixed seed makes the binarization reproducible.
torch.manual_seed(3435)
train_img = torch.stack([torch.bernoulli(image) for image, _ in train_dataset])
train_label = torch.LongTensor([digit for _, digit in train_dataset])
test_img = torch.stack([torch.bernoulli(image) for image, _ in test_dataset])
test_label = torch.LongTensor([digit for _, digit in test_dataset])
print(train_img.size(), train_label.size(), test_img.size(),
      test_label.size())

# MNIST has no official validation set, so the last 10000 training points
# are held out for validation and the rest are kept for training.
n_val = 10000
val_img = train_img[-n_val:].clone()
val_label = train_label[-n_val:].clone()
train_img = train_img[:-n_val]
train_label = train_label[:-n_val]

train = torch.utils.data.TensorDataset(train_img, train_label)
val = torch.utils.data.TensorDataset(val_img, val_label)
test = torch.utils.data.TensorDataset(test_img, test_label)
# Pre-allocated buffers for image batches and latent noise.
_noise_shape = (batch_size, nz, 1, 1)
input = torch.FloatTensor(batch_size, 3, imageSize, imageSize)
print(input.size())
noise = torch.FloatTensor(*_noise_shape)
print(noise.size())


# In[22]:

# Mirrors the command-line option:
# parser.add_argument('--binary', action='store_true', help='z from bernoulli distribution, with prob=0.5')
binary = False

# Choose whether the fixed latent vector z is drawn from a Bernoulli(0.5)
# distribution or from a standard normal.
if binary:
    bernoulli_prob = torch.FloatTensor(*_noise_shape).fill_(0.5)
    fixed_noise = torch.bernoulli(bernoulli_prob)
else:
    fixed_noise = torch.FloatTensor(*_noise_shape).normal_(0, 1)


# In[23]:

label = torch.FloatTensor(batch_size)
real_label, fake_label = 1, 0


# ### Move to CUDA, if desired

# In[24]:
def __init__(
    self,
    source: Nodes,
    target: Nodes,
    nu: Optional[Union[float, Sequence[float]]] = None,
    reduction: Optional[callable] = None,
    weight_decay: float = None,
    **kwargs
) -> None:
    # language=rst
    """
    Instantiates a :code:`Connection` object with sparse weights.

    :param source: A layer of nodes from which the connection originates.
    :param target: A layer of nodes to which the connection connects.
    :param nu: Learning rate for both pre- and post-synaptic events.
    :param reduction: Method for reducing parameter updates along the minibatch
        dimension.
    :param weight_decay: Constant multiple to decay weights by on each iteration.

    Keyword arguments:

    :param torch.Tensor w: Strengths of synapses. Must be a sparse tensor.
    :param float sparsity: Fraction of sparse connections to use. Each
        synapse is kept independently with probability ``1 - sparsity``.
    :param LearningRule update_rule: Modifies connection parameters according to
        some rule.
    :param float wmin: Minimum allowed value on the connection weights.
    :param float wmax: Maximum allowed value on the connection weights.
    :param float norm: Total weight per target neuron normalization constant.

    Exactly one of ``w`` and ``sparsity`` must be given.
    """
    super().__init__(source, target, nu, reduction, weight_decay, **kwargs)

    w = kwargs.get("w", None)
    self.sparsity = kwargs.get("sparsity", None)

    assert (
        w is not None
        and self.sparsity is None
        or w is None
        and self.sparsity is not None
    ), 'Only one of "weights" or "sparsity" must be specified'

    if w is None and self.sparsity is not None:
        # Sample which synapses exist.
        i = torch.bernoulli(
            1 - self.sparsity * torch.ones(*source.shape, *target.shape)
        )
        draws = torch.rand(*source.shape, *target.shape)[i.bool()]
        if self.wmin == -np.inf or self.wmax == np.inf:
            # With an unbounded side, keep the U(0, 1) draws and clamp them.
            v = torch.clamp(draws, self.wmin, self.wmax)
        else:
            # With both bounds finite, rescale the draws to [wmin, wmax).
            v = self.wmin + draws * (self.wmax - self.wmin)
        # Pass the size explicitly. Without it the shape is inferred from
        # the largest sampled index and can come out smaller than
        # (*source.shape, *target.shape) when trailing rows or columns
        # happen to be empty.
        w = torch.sparse.FloatTensor(i.nonzero().t(), v, i.shape)
    elif w is not None and self.sparsity is None:
        assert w.is_sparse, "Weight matrix is not sparse (see torch.sparse module)"
        if self.wmin != -np.inf or self.wmax != np.inf:
            w = torch.clamp(w, self.wmin, self.wmax)

    self.w = Parameter(w, requires_grad=False)
def train_our(num_epochs, dataloader, netD, netG, d_labelSmooth, outputDir, model_option =1,binary = False, epoch_interval = 1, D_steps = 1, G_steps = 1):
    """Alternately train a GAN discriminator and generator for ``num_epochs`` epochs.

    Each epoch runs ``D_steps`` full passes over ``dataloader`` updating the
    discriminator, then ``G_steps`` full passes updating the generator, prints a
    summary line, saves sample images, and checkpoints both networks every
    ``epoch_interval`` epochs.

    The function relies on names that are not defined in this block and must
    exist at module level: ``input``, ``label``, ``noise``, ``nz``,
    ``real_label``, ``fake_label``, ``bernoulli_prob``, ``fixed_noise``,
    ``criterion``, ``criterion_MSE``, ``optimizerD``, ``optimizerG``,
    ``save_images``, ``save_models``, ``vutils``, ``tc``, ``torch`` and ``time``.

    NOTE(review): the block nesting below was reconstructed from statement
    order; confirm it against the original layout.

    Args:
        num_epochs: Number of epochs to run.
        dataloader: Iterable yielding ``(batch, _)`` pairs; the second element
            is ignored.
        netD: Discriminator; called on the real batch and on generated samples.
        netG: Generator; called on ``noise`` and expected to return a
            ``(fake, z_prediction)`` pair.
        d_labelSmooth: Amount subtracted from ``real_label`` to form the
            discriminator's target on real data.
        outputDir: Directory for ``real_samples.png``; also forwarded to
            ``save_images`` and ``save_models``.
        model_option: When equal to 2, an extra ``criterion_MSE(z_prediction,
            noise)`` loss is back-propagated in the generator step.
        binary: If true, the noise is filled with values ``2 * (b - 0.5)`` where
            ``b`` is drawn by ``torch.bernoulli(bernoulli_prob)``; otherwise it
            is filled with standard-normal samples.
        epoch_interval: Checkpoint when ``epoch % epoch_interval == 0``.
        D_steps: Discriminator passes over the data per epoch.
        G_steps: Generator passes over the data per epoch.

    Raises:
        ValueError: When a discriminator pass index exceeds 3, or when a
            generator pass index exceeds 3 (the latter is checked per batch, so
            only if the dataloader yields at least one batch).
    """
    # NOTE(review): assigned but never read inside this function.
    use_gpu = tc.cuda.is_available()
    for epoch in range(num_epochs):
        start_iter = time.time()
        # Per-epoch accumulators. D_x / D_G_z1 / D_G_z2 are sums of per-batch
        # means and are printed as-is (not divided by the number of batches).
        D_x = 0
        D_G_z1 = 0
        D_G_z2 = 0
        errD_acum = 0
        errG_acum = 0
        for z in range(D_steps):
            # Guard against a large number of discriminator passes.
            if z > 3:
                raise ValueError('KEEP IT LOW!')
            print('z', z)
            for j, data in enumerate(dataloader, 0):
                ############################
                # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
                # 1A - Train the discriminator on the real dataset
                ###########################
                # train with real
                netD.zero_grad()
                real_cpu, _ = data
                # Save a grid of real samples once, on the very first pass.
                if (epoch == 0 and z == 0 ):
                    vutils.save_image(real_cpu[0:64,:,:,:], '%s/real_samples.png' % outputDir, nrow=8)
                batch_size = real_cpu.size(0)
                # Copy the batch into the shared, pre-allocated input buffer.
                input.data.resize_(real_cpu.size()).copy_(real_cpu)
                label.data.resize_(batch_size).fill_(real_label - d_labelSmooth)  # use smooth label for discriminator
                output = netD(input)
                errD_real = criterion(output, label)
                errD_real.backward()
                #######################################################
                # 1B - Train the discriminator on generated (fake) data
                #######################################################
                D_x += output.data.mean()
                print()
                # train with fake
                noise.data.resize_(batch_size, nz, 1, 1)
                if binary:
                    # Map Bernoulli draws {0, 1} to {-1, +1}.
                    bernoulli_prob.resize_(noise.data.size())
                    noise.data.copy_(2*(torch.bernoulli(bernoulli_prob)-0.5))
                else:
                    noise.data.normal_(0, 1)
                fake,z_prediction = netG(noise)
                label.data.fill_(fake_label)
                output = netD(fake.detach())  # add ".detach()" to avoid backprop through G
                errD_fake = criterion(output, label)
                errD_fake.backward()  # gradients for fake/real will be accumulated
                # Accumulate the mean discriminator output on fake data.
                D_G_z1 += output.data.mean()
                # NOTE(review): `.data[0]` on a loss is the legacy (pre-0.4)
                # PyTorch idiom; newer releases use `.item()` - confirm the
                # targeted PyTorch version.
                errD_acum += errD_real.data[0] + errD_fake.data[0]
                optimizerD.step()  # .step() can be called once the gradients are computed
        #######################################################
        # Pause point (originally used to inspect what is happening).
        for a in range(G_steps):
            print('interacao = ',a, 'de ',G_steps )
            # NOTE(review): `data` is not used in this loop; every iteration
            # reuses `fake`, `z_prediction` and `noise` left over from the last
            # discriminator batch - confirm this is intended.
            for i, data in enumerate(dataloader, 0):
                # G_steps > D_steps (G_steps >= D_steps)
                if a > 3:
                    raise ValueError('KEEP IT LOW!')
                #######################################################
                # (2) Update G network: maximize log(D(G(z)))
                # Train the generator with the discriminator's output
                # (without training the discriminator).
                #######################################################
                netG.zero_grad()
                label.data.fill_(real_label)  # fake labels are real for generator cost
                output = netD(fake)
                errG = criterion(output, label)
                # Keep the graph so it can be back-propagated through again.
                # NOTE(review): `retain_variables` is the legacy keyword; newer
                # PyTorch releases call it `retain_graph`.
                errG.backward(retain_variables=True)
                if model_option == 2:  # with z predictor
                    errG_z = criterion_MSE(z_prediction, noise)
                    errG_z.backward()
                D_G_z2 += output.data.mean()
                errG_acum += errG.data[0]
                optimizerG.step()
        print('epoch = ',epoch)
        end_iter = time.time()
        # Print the info
        print('[%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f Elapsed %.2f s' % (epoch, num_epochs, errD_acum/D_steps, errG_acum/G_steps, D_x, D_G_z1, D_G_z2, end_iter-start_iter))
        print('chegou no print')
        # Save a grid with generated pictures for the fixed noise.
        save_images(netG = netG, noise = fixed_noise, outputDir = outputDir, epoch = epoch)
        if epoch % epoch_interval == 0:
            # do checkpointing
            save_models(netG = netG, netD = netD, outputDir = outputDir, epoch = epoch)
def forward(self, x):
    """Run the stacked (optionally bidirectional) LSTM over a whole sequence.

    ``x`` is indexed per time step along dimension 0 and per sequence along
    dimension 1. For every layer the input projections of all four gates are
    computed for all time steps at once; only the recurrent part is unrolled
    step by step. One dropout mask per layer is shared by all time steps. When
    ``test_flag`` is set, the mask is the scalar ``1 - drop`` instead of a
    sample.

    Returns the stacked hidden states of the last layer. In bidirectional mode
    the forward and time-reversed backward halves are concatenated along
    dimension 2.
    """

    def _norm_over_time(norm, seq):
        # Merge the first two axes for the norm layer, then restore them.
        steps, batch, feats = seq.shape[0], seq.shape[1], seq.shape[2]
        return norm(seq.view(steps * batch, feats)).view(steps, batch, feats)

    # Optional normalisation of the raw input.
    if bool(self.lstm_use_laynorm_inp):
        x = self.ln0(x)
    if bool(self.lstm_use_batchnorm_inp):
        x = _norm_over_time(self.bn0, x)

    for layer in range(self.N_lstm_lay):
        hidden_size = self.lstm_lay[layer]

        # Initial state; in bidirectional mode the time-reversed copy of the
        # input is appended along the batch axis, doubling it.
        if self.bidir:
            h_init = torch.zeros(2 * x.shape[1], hidden_size)
            x = torch.cat([x, flip(x, 0)], 1)
        else:
            h_init = torch.zeros(x.shape[1], hidden_size)

        # Dropout mask, identical for every time step of this layer.
        keep_prob = 1 - self.lstm_drop[layer]
        if self.test_flag == False:
            drop_mask = torch.bernoulli(
                torch.Tensor(h_init.shape[0], h_init.shape[1]).fill_(keep_prob)
            )
        else:
            drop_mask = torch.FloatTensor([keep_prob])

        h_init = h_init.to(x.device)
        drop_mask = drop_mask.to(x.device)

        # Input-to-gate projections for all time steps in parallel
        # (forget, input, output, candidate - in that order).
        projections = (self.wfx, self.wix, self.wox, self.wcx)
        gate_inputs = [proj[layer](x) for proj in projections]

        # Optional batch norm on each projection.
        if self.lstm_use_batchnorm[layer]:
            norms = (self.bn_wfx, self.bn_wix, self.bn_wox, self.bn_wcx)
            gate_inputs = [
                _norm_over_time(norm[layer], pre)
                for norm, pre in zip(norms, gate_inputs)
            ]

        forget_in, input_in, output_in, cand_in = gate_inputs

        # Recurrent part, one time step at a time.
        cell = h_init
        hidden = h_init
        states = []
        for k in range(x.shape[0]):
            forget_gate = torch.sigmoid(forget_in[k] + self.ufh[layer](hidden))
            input_gate = torch.sigmoid(input_in[k] + self.uih[layer](hidden))
            output_gate = torch.sigmoid(output_in[k] + self.uoh[layer](hidden))
            candidate = self.act[layer](cand_in[k] + self.uch[layer](hidden))

            cell = input_gate * candidate * drop_mask + forget_gate * cell
            hidden = output_gate * self.act[layer](cell)

            if self.lstm_use_laynorm[layer]:
                hidden = self.ln[layer](hidden)

            states.append(hidden)

        h = torch.stack(states)

        # Split the doubled batch back into forward/backward halves and join
        # them along the feature axis.
        if self.bidir:
            half = int(x.shape[1] / 2)
            h_f = h[:, 0:half]
            h_b = flip(h[:, half:x.shape[1]].contiguous(), 0)
            h = torch.cat([h_f, h_b], 2)

        # Output of this layer feeds the next one.
        x = h

    return x
# Create input and output groups of neurons. input_group = nodes.Input(n=n_input) # 100 input nodes. output_group = nodes.LIFNodes(n=n_output) # 500 output nodes. network.add_layer(input_group, name='input') network.add_layer(output_group, name='output') # Input -> output connection. # Unit Gaussian feed-forward weights. w = torch.randn(n_input, n_output) forward_conn = topology.Connection(input_group, output_group, w=w) # Output -> output connection. # Random, inhibitory recurrent weights. w = torch.bernoulli(torch.rand(n_output, n_output)) - torch.diag(torch.ones(n_output)) recurrent_conn = topology.Connection(output_group, output_group, w=w) network.add_connection(forward_conn, source='input', target='output') network.add_connection(recurrent_conn, source='output', target='output') # Monitor input and output spikes during the simulation. for l in network.layers: monitor = monitors.Monitor(network.layers[l], state_vars=['s'], time=time) network.add_monitor(monitor, name=l) # Create input ~ Bernoulli(0.1) for 1,000 timesteps. inpts = {'input': torch.bernoulli(0.05 * torch.rand(time, n_input))} # Run network simulation for 1,000 timesteps and retrieve spikes. network.run(inpts=inpts, time=time)
def sample(self, probas):
    """Draw Bernoulli samples from ``probas`` and return them detached.

    Each element of the result is 1 with the probability given by the matching
    element of ``probas`` and 0 otherwise. The result is cut off from the
    autograd graph, so no gradient flows back into ``probas`` through it.
    """
    draws = torch.bernoulli(probas)
    return draws.detach()
def forward(ctx, input):
    """Return a Bernoulli sample drawn element-wise from ``input``.

    ``input`` holds the per-element probabilities of drawing 1. ``ctx`` is
    accepted but not used here.
    """
    sampled = torch.bernoulli(input)
    return sampled
def sample_h(self, x):
    """Compute hidden-unit probabilities for ``x`` and sample from them.

    The pre-activation is ``x @ W + b`` and the probabilities are its sigmoid.

    Returns:
        A pair ``(probabilities, samples)``; ``samples`` is a Bernoulli draw
        from ``probabilities`` and has the same shape.
    """
    pre_activation = x @ self.W + self.b
    hidden_probs = torch.sigmoid(pre_activation)
    hidden_states = torch.bernoulli(hidden_probs)
    return hidden_probs, hidden_states
def sample(self, sample_shape=torch.Size()):
    """Sample counts by summing ``total_count`` independent Bernoulli draws.

    ``self.probs`` is broadcast along a new trailing axis of length
    ``total_count``, one Bernoulli draw is made per entry, and the draws are
    summed over that axis. Sampling runs with gradient tracking disabled.

    :param sample_shape: Extra leading shape, forwarded to
        ``self._extended_shape``.
    :return: Tensor of success counts with the shape returned by
        ``self._extended_shape(sample_shape)``.
    """
    trials_shape = self._extended_shape(sample_shape) + (self.total_count,)
    with torch.no_grad():
        per_trial_probs = self.probs.unsqueeze(-1).expand(trials_shape)
        outcomes = torch.bernoulli(per_trial_probs)
        return outcomes.sum(dim=-1)
def sample_v(self, y):
    """Compute visible-unit probabilities for ``y`` and sample from them.

    The pre-activation is ``y @ W.t() + a`` and the probabilities are its
    sigmoid.

    Returns:
        A pair ``(probabilities, samples)``; ``samples`` is a Bernoulli draw
        from ``probabilities`` and has the same shape.
    """
    pre_activation = y @ self.W.t() + self.a
    visible_probs = torch.sigmoid(pre_activation)
    visible_states = torch.bernoulli(visible_probs)
    return visible_probs, visible_states
def tmp_lambda(x):
    """Return an element-wise Bernoulli sample using ``x`` as probabilities."""
    sampled = torch.bernoulli(x)
    return sampled