def get_positive_expectation(p_samples, measure, average=True):
    log_2 = math.log(2.)

    if measure == 'GAN':
        Ep = -F.softplus(-p_samples)
    elif measure == 'JSD':
        Ep = log_2 - F.softplus(-p_samples)
    elif measure == 'X2':
        Ep = p_samples ** 2
    elif measure == 'KL':
        Ep = p_samples + 1.
    elif measure == 'RKL':
        Ep = -torch.exp(-p_samples)
    elif measure == 'DV':
        Ep = p_samples
    elif measure == 'H2':
        Ep = 1. - torch.exp(-p_samples)
    elif measure == 'W1':
        Ep = p_samples
    else:
        raise_measure_error(measure)

    if average:
        return Ep.mean()
    else:
        return Ep
def get_negative_expectation(q_samples, measure, average=True):
    log_2 = math.log(2.)

    if measure == 'GAN':
        Eq = F.softplus(-q_samples) + q_samples
    elif measure == 'JSD':
        Eq = F.softplus(-q_samples) + q_samples - log_2
    elif measure == 'X2':
        Eq = -0.5 * ((torch.sqrt(q_samples ** 2) + 1.) ** 2)
    elif measure == 'KL':
        Eq = torch.exp(q_samples)
    elif measure == 'RKL':
        Eq = q_samples - 1.
    elif measure == 'DV':
        Eq = log_sum_exp(q_samples, 0) - math.log(q_samples.size(0))
    elif measure == 'H2':
        Eq = torch.exp(q_samples) - 1.
    elif measure == 'W1':
        Eq = q_samples
    else:
        raise_measure_error(measure)

    if average:
        return Eq.mean()
    else:
        return Eq
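# A minimal usage sketch (my own, not from the source): a JSD-based lower-bound
# estimate from discriminator scores on paired (positive) and unpaired
# (negative) samples, using the two expectation helpers above. The score
# tensors here are hypothetical placeholders.
import math
import torch
import torch.nn.functional as F

pos_scores = torch.randn(128)  # scores for paired samples (placeholder)
neg_scores = torch.randn(128)  # scores for unpaired samples (placeholder)
mi_lower_bound = (get_positive_expectation(pos_scores, 'JSD')
                  - get_negative_expectation(neg_scores, 'JSD'))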
def discretized_mix_logistic_loss_1d(x, l):
    """ log-likelihood for mixture of discretized logistics, assumes the data
    has been rescaled to [-1, 1] interval """
    # PyTorch ordering
    x = x.permute(0, 2, 3, 1)
    l = l.permute(0, 2, 3, 1)
    xs = [int(y) for y in x.size()]
    ls = [int(y) for y in l.size()]

    # here and below: unpacking the params of the mixture of logistics
    nr_mix = int(ls[-1] / 3)
    logit_probs = l[:, :, :, :nr_mix]
    l = l[:, :, :, nr_mix:].contiguous().view(xs + [nr_mix * 2])  # 2 for mean, scale
    means = l[:, :, :, :, :nr_mix]
    log_scales = torch.clamp(l[:, :, :, :, nr_mix:2 * nr_mix], min=-7.)

    # here and below: getting the means and adjusting them based on preceding
    # sub-pixels
    x = x.contiguous()
    x = x.unsqueeze(-1) + Variable(torch.zeros(xs + [nr_mix]).cuda(),
                                   requires_grad=False)

    centered_x = x - means
    inv_stdv = torch.exp(-log_scales)
    plus_in = inv_stdv * (centered_x + 1. / 255.)
    cdf_plus = torch.sigmoid(plus_in)
    min_in = inv_stdv * (centered_x - 1. / 255.)
    cdf_min = torch.sigmoid(min_in)
    # log probability for edge case of 0 (before scaling)
    log_cdf_plus = plus_in - F.softplus(plus_in)
    # log probability for edge case of 255 (before scaling)
    log_one_minus_cdf_min = -F.softplus(min_in)
    # probability for all other cases
    cdf_delta = cdf_plus - cdf_min
    mid_in = inv_stdv * centered_x
    # log probability in the center of the bin, to be used in extreme cases
    # (not actually used in our code)
    log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in)

    inner_inner_cond = (cdf_delta > 1e-5).float()
    inner_inner_out = inner_inner_cond * torch.log(torch.clamp(cdf_delta, min=1e-12)) + \
        (1. - inner_inner_cond) * (log_pdf_mid - np.log(127.5))
    inner_cond = (x > 0.999).float()
    inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out
    cond = (x < -0.999).float()
    log_probs = cond * log_cdf_plus + (1. - cond) * inner_out
    log_probs = torch.sum(log_probs, dim=3) + log_prob_from_logits(logit_probs)

    # don't sum over batch dimension
    lse = log_sum_exp(log_probs)
    return -torch.sum(lse.view(lse.size(0), -1), dim=1)
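# Sanity sketch (my own check, not part of the source): the softplus identities
# used above for the 0/255 edge cases match the naive log-CDF expressions,
# since log sigmoid(z) = z - softplus(z) and log(1 - sigmoid(z)) = -softplus(z).
import torch
import torch.nn.functional as F

z = torch.linspace(-10., 10., 101)
assert torch.allclose(z - F.softplus(z), torch.log(torch.sigmoid(z)), atol=1e-4)
assert torch.allclose(-F.softplus(z), torch.log(1. - torch.sigmoid(z)), atol=1e-4)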
def forward(self, x):
    return x * (F.softplus(self.alpha.exp() * x)).tanh()
def rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    left=0.0,
    right=1.0,
    bottom=0.0,
    top=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
    full_jacobian=False,
):
    assert not full_jacobian

    try:
        if torch.min(inputs) < left or torch.max(inputs) > right:
            raise transforms.InputOutsideDomain()
    except RuntimeError:
        logger.error("Error in rational_quadratic_spline!")
        logger.error("  Left: %s", left)
        logger.error("  Right: %s", right)
        logger.error("  Input shape: %s", inputs.size())
        logger.error("  Input: %s", inputs)
        raise

    num_bins = unnormalized_widths.shape[-1]

    if min_bin_width * num_bins > 1.0:
        raise ValueError("Minimal bin width too large for the number of bins")
    if min_bin_height * num_bins > 1.0:
        raise ValueError("Minimal bin height too large for the number of bins")

    widths = F.softmax(unnormalized_widths, dim=-1)
    widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
    cumwidths = torch.cumsum(widths, dim=-1)
    cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
    cumwidths = (right - left) * cumwidths + left
    cumwidths[..., 0] = left
    cumwidths[..., -1] = right
    widths = cumwidths[..., 1:] - cumwidths[..., :-1]

    derivatives = min_derivative + F.softplus(unnormalized_derivatives)

    heights = F.softmax(unnormalized_heights, dim=-1)
    heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
    cumheights = torch.cumsum(heights, dim=-1)
    cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
    cumheights = (top - bottom) * cumheights + bottom
    cumheights[..., 0] = bottom
    cumheights[..., -1] = top
    heights = cumheights[..., 1:] - cumheights[..., :-1]

    if inverse:
        bin_idx = various.searchsorted(cumheights, inputs)[..., None]
    else:
        bin_idx = various.searchsorted(cumwidths, inputs)[..., None]

    input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
    input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
    input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]

    delta = heights / widths
    input_delta = delta.gather(-1, bin_idx)[..., 0]
    input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
    input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
    input_heights = heights.gather(-1, bin_idx)[..., 0]

    if inverse:
        a = (inputs - input_cumheights) * (
            input_derivatives + input_derivatives_plus_one - 2 * input_delta
        ) + input_heights * (input_delta - input_derivatives)
        b = input_heights * input_derivatives - (inputs - input_cumheights) * (
            input_derivatives + input_derivatives_plus_one - 2 * input_delta
        )
        c = -input_delta * (inputs - input_cumheights)

        discriminant = b.pow(2) - 4 * a * c
        # assert (discriminant >= 0).all()
        discriminant = torch.clamp(discriminant, min=0.0)

        root = (2 * c) / (-b - torch.sqrt(discriminant))
        outputs = root * input_bin_widths + input_cumwidths

        theta_one_minus_theta = root * (1 - root)
        denominator = input_delta + (
            (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
            * theta_one_minus_theta
        )
        derivative_numerator = input_delta.pow(2) * (
            input_derivatives_plus_one * root.pow(2)
            + 2 * input_delta * theta_one_minus_theta
            + input_derivatives * (1 - root).pow(2)
        )
        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

        return outputs, -logabsdet
    else:
        theta = (inputs - input_cumwidths) / input_bin_widths
        theta_one_minus_theta = theta * (1 - theta)

        numerator = input_heights * (
            input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
        )
        denominator = input_delta + (
            (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
            * theta_one_minus_theta
        )
        outputs = input_cumheights + numerator / denominator

        derivative_numerator = input_delta.pow(2) * (
            input_derivatives_plus_one * theta.pow(2)
            + 2 * input_delta * theta_one_minus_theta
            + input_derivatives * (1 - theta).pow(2)
        )
        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

        return outputs, logabsdet
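# Standalone sketch (shapes are my own assumptions, not from the source) of the
# bin-edge construction used above: softmax the unnormalized widths, enforce a
# minimum width, then cumsum and pad to get monotonically increasing knot
# positions spanning [left, right].
import torch
import torch.nn.functional as F

unnormalized = torch.randn(4, 8)  # batch of 4, 8 bins (placeholder)
num_bins, min_w, left, right = 8, 1e-3, 0.0, 1.0
widths = F.softmax(unnormalized, dim=-1)
widths = min_w + (1 - min_w * num_bins) * widths
edges = F.pad(torch.cumsum(widths, dim=-1), pad=(1, 0), value=0.0)
edges = (right - left) * edges + left  # shape (4, 9); edges[..., 0] == left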
def _activate(self, x, predict):
    return F.softplus(x)
def forward(self, input):
    # log RBF kernel, up to constants: c - t**2 / (2 * l)
    out = self.constant - input[:, 0] * input[:, 1] * (
        input[:, 0] - input[:, 1]).pow(2).squeeze(-1) / (
        2 * (softplus(self.lengthscale.view(-1)) + 1e-7))
    return out
def forward(self, x):
    return x * torch.tanh(F.softplus(x))
def forward(self, x):
    h = F.relu(self.fc1(x))
    h = F.relu(self.fc2(h))
    return {"loc": self.fc31(h), "scale": F.softplus(self.fc32(h))}
def g_nonsaturating_loss(self, fake_pred):
    loss = F.softplus(-fake_pred).mean()
    return loss
def forward(self, x):
    # out-of-place mul: the original in-place x.mul_(...) breaks autograd,
    # because softplus saves the unmodified input for its backward pass
    return x.mul(F.softplus(x).tanh())
def log_abs_det_jacobian(self, x, y):
    return 2. * (np.log(2) - x - F.softplus(-2. * x))
def forward(self, feat):
    feat = feat * torch.tanh(F.softplus(feat))
    return feat
def discriminator_loss(real_pred, fake_pred, loss_dict):
    real_loss = F.softplus(-real_pred).mean()
    fake_loss = F.softplus(fake_pred).mean()
    loss_dict['d_real_loss'] = float(real_loss)
    loss_dict['d_fake_loss'] = float(fake_loss)
    return real_loss + fake_loss
def train(num_block, generator, discriminator, batch_size, epochs, path_image):
    d_losses = []
    g_losses = []

    # Create the optimizers once, outside the loop (the original re-created
    # them every iteration, which resets the Adam statistics)
    d_optim = torch.optim.Adam(discriminator.parameters(), lr=0.001)
    g_optim = torch.optim.Adam(generator.parameters(), lr=0.001)

    # Run progressive training, starting from 8x8
    for step in range(2, num_block + 1):
        # Loop over epochs
        for epoch in range(1, epochs[step] + 1):
            # Create the data loader (at most 1000 samples per epoch)
            loader = data_loader(step, batch_size, path=path_image, num_workers=1)
            print(f'step = {step}, epoch = {epoch}')

            # Iterate over the loader
            for real_image in loader:
                # Sample z from a Gaussian (torch.randn; the original used
                # torch.rand, which is uniform, despite its comment)
                z = [torch.randn(100), torch.randn(100)]
                if torch.cuda.is_available():
                    real_image = real_image.cuda()
                    z[0] = z[0].cuda()
                    z[1] = z[1].cuda()

                # Train the discriminator
                discriminator.zero_grad()
                set_requires_grad(generator, False)
                set_requires_grad(discriminator, True)

                real_image.requires_grad = True
                real_scores = discriminator(real_image, step)
                real_predict = F.softplus(-real_scores).mean()
                real_predict.backward(retain_graph=True)

                # R1 penalty on the raw scores (the original differentiated
                # the softplus-ed loss instead)
                grad_real = torch.autograd.grad(outputs=real_scores.sum(),
                                                inputs=real_image,
                                                create_graph=True)[0]
                grad_penalty = (grad_real.view(grad_real.size(0), -1)
                                .norm(2, dim=1) ** 2).mean()
                grad_penalty = 10 / 2 * grad_penalty
                grad_penalty.backward()

                # Fake loss
                fake_image = generator(z[0], step)
                fake_predict = discriminator(fake_image, step)
                fake_predict = F.softplus(fake_predict).mean()
                fake_predict.backward()
                d_losses.append((real_predict + fake_predict).item())

                # Update discriminator weights
                d_optim.step()

                # Free memory
                del fake_image, real_image, grad_penalty, grad_real

                # Train the generator
                generator.zero_grad()
                set_requires_grad(discriminator, False)
                set_requires_grad(generator, True)
                fake_image = generator(z[0], step)
                fake_predict = discriminator(fake_image, step)
                fake_predict = F.softplus(-fake_predict).mean()
                fake_predict.backward()

                # Update generator weights
                g_optim.step()
                g_losses.append(fake_predict.item())

    return d_losses, g_losses
def train(args, dataset, generator, discriminator):
    step = int(math.log2(args.init_size)) - 2
    resolution = 4 * 2 ** step
    loader = sample_data(dataset, args.batch.get(resolution, args.batch_default),
                         resolution)
    data_loader = iter(loader)

    adjust_lr(g_optimizer, args.lr.get(resolution, 0.001))
    adjust_lr(d_optimizer, args.lr.get(resolution, 0.001))

    pbar = tqdm(range(3_000_000))

    requires_grad(generator, False)
    requires_grad(discriminator, True)

    disc_loss_val = 0
    gen_loss_val = 0
    grad_loss_val = 0

    alpha = 0
    used_sample = 0

    max_step = int(math.log2(args.max_size)) - 2
    final_progress = False

    for i in pbar:
        discriminator.zero_grad()

        alpha = min(1, 1 / args.phase * (used_sample + 1))

        if (resolution == args.init_size and args.ckpt is None) or final_progress:
            alpha = 1

        if used_sample > args.phase * 2:
            used_sample = 0
            step += 1

            if step > max_step:
                step = max_step
                final_progress = True
                ckpt_step = step + 1
            else:
                alpha = 0
                ckpt_step = step

            resolution = 4 * 2 ** step

            loader = sample_data(dataset,
                                 args.batch.get(resolution, args.batch_default),
                                 resolution)
            data_loader = iter(loader)

            torch.save(
                {
                    'generator': generator.module.state_dict(),
                    'discriminator': discriminator.module.state_dict(),
                    'g_optimizer': g_optimizer.state_dict(),
                    'd_optimizer': d_optimizer.state_dict(),
                    'g_running': g_running.state_dict(),
                },
                f'checkpoint/train_step-{ckpt_step}.model',
            )

            adjust_lr(g_optimizer, args.lr.get(resolution, 0.001))
            adjust_lr(d_optimizer, args.lr.get(resolution, 0.001))

        try:
            real_image = next(data_loader)
        except (OSError, StopIteration):
            data_loader = iter(loader)
            real_image = next(data_loader)

        used_sample += real_image.shape[0]

        b_size = real_image.size(0)
        real_image = real_image.cuda()

        if args.loss == 'wgan-gp':
            real_predict = discriminator(real_image, step=step, alpha=alpha)
            real_predict = real_predict.mean() - 0.001 * (real_predict ** 2).mean()
            (-real_predict).backward()

        elif args.loss == 'r1':
            real_image.requires_grad = True
            real_scores = discriminator(real_image, step=step, alpha=alpha)
            real_predict = F.softplus(-real_scores).mean()
            real_predict.backward(retain_graph=True)

            grad_real = grad(outputs=real_scores.sum(), inputs=real_image,
                             create_graph=True)[0]
            grad_penalty = (grad_real.view(grad_real.size(0), -1)
                            .norm(2, dim=1) ** 2).mean()
            grad_penalty = 10 / 2 * grad_penalty
            grad_penalty.backward()

            if i % 10 == 0:
                grad_loss_val = grad_penalty.item()

        if args.mixing and random.random() < 0.9:
            gen_in11, gen_in12, gen_in21, gen_in22 = torch.randn(
                4, b_size, code_size, device='cuda').chunk(4, 0)
            gen_in1 = [gen_in11.squeeze(0), gen_in12.squeeze(0)]
            gen_in2 = [gen_in21.squeeze(0), gen_in22.squeeze(0)]
        else:
            gen_in1, gen_in2 = torch.randn(2, b_size, code_size,
                                           device='cuda').chunk(2, 0)
            gen_in1 = gen_in1.squeeze(0)
            gen_in2 = gen_in2.squeeze(0)

        fake_image = generator(gen_in1, step=step, alpha=alpha)
        fake_predict = discriminator(fake_image, step=step, alpha=alpha)

        if args.loss == 'wgan-gp':
            fake_predict = fake_predict.mean()
            fake_predict.backward()

            eps = torch.rand(b_size, 1, 1, 1).cuda()
            x_hat = eps * real_image.data + (1 - eps) * fake_image.data
            x_hat.requires_grad = True
            hat_predict = discriminator(x_hat, step=step, alpha=alpha)
            grad_x_hat = grad(outputs=hat_predict.sum(), inputs=x_hat,
                              create_graph=True)[0]
            grad_penalty = ((grad_x_hat.view(grad_x_hat.size(0), -1)
                             .norm(2, dim=1) - 1) ** 2).mean()
            grad_penalty = 10 * grad_penalty
            grad_penalty.backward()

            if i % 10 == 0:
                grad_loss_val = grad_penalty.item()
                disc_loss_val = (-real_predict + fake_predict).item()

        elif args.loss == 'r1':
            fake_predict = F.softplus(fake_predict).mean()
            fake_predict.backward()

            if i % 10 == 0:
                disc_loss_val = (real_predict + fake_predict).item()

        d_optimizer.step()

        if (i + 1) % n_critic == 0:
            generator.zero_grad()

            requires_grad(generator, True)
            requires_grad(discriminator, False)

            fake_image = generator(gen_in2, step=step, alpha=alpha)
            predict = discriminator(fake_image, step=step, alpha=alpha)

            if args.loss == 'wgan-gp':
                loss = -predict.mean()
            elif args.loss == 'r1':
                loss = F.softplus(-predict).mean()

            if i % 10 == 0:
                gen_loss_val = loss.item()

            loss.backward()
            g_optimizer.step()
            accumulate(g_running, generator.module)

            requires_grad(generator, False)
            requires_grad(discriminator, True)

        if (i + 1) % 100 == 0:
            images = []
            gen_i, gen_j = args.gen_sample.get(resolution, (10, 5))

            with torch.no_grad():
                for _ in range(gen_i):
                    images.append(
                        g_running(torch.randn(gen_j, code_size).cuda(),
                                  step=step, alpha=alpha).data.cpu())

            utils.save_image(
                torch.cat(images, 0),
                f'sample/{str(i + 1).zfill(6)}.png',
                nrow=gen_i,
                normalize=True,
                range=(-1, 1),
            )

        if (i + 1) % 10000 == 0:
            torch.save(
                {
                    'generator': generator.module.state_dict(),
                    'discriminator': discriminator.module.state_dict(),
                    'g_optimizer': g_optimizer.state_dict(),
                    'd_optimizer': d_optimizer.state_dict(),
                    'g_running': g_running.state_dict(),
                },
                f'checkpoint/{str(i + 1).zfill(6)}.model',
            )

        state_msg = (
            f'Size: {4 * 2 ** step}; G: {gen_loss_val:.3f}; D: {disc_loss_val:.3f};'
            f' Grad: {grad_loss_val:.3f}; Alpha: {alpha:.5f}')

        pbar.set_description(state_msg)
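# A minimal standalone sketch (my own, names and shapes assumed) of the R1
# penalty pattern used above: penalize the squared gradient norm of the real
# scores with respect to the real images.
import torch

def r1_penalty(real_scores, real_images, gamma=10.0):
    # real_images must have requires_grad=True when scored by the discriminator
    grad_real, = torch.autograd.grad(outputs=real_scores.sum(),
                                     inputs=real_images, create_graph=True)
    penalty = grad_real.view(grad_real.size(0), -1).norm(2, dim=1).pow(2).mean()
    return gamma / 2 * penalty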
def to_sigma(x):
    return F.softplus(x + 0.5) + 1e-8
def loss_dcgan_dis(dis_fake, dis_real):
    L1 = torch.mean(F.softplus(-dis_real))
    L2 = torch.mean(F.softplus(dis_fake))
    return L1, L2
def forward(self,
            prev_state: torch.Tensor,
            actions: torch.Tensor,
            prev_belief: torch.Tensor,
            observations: Optional[torch.Tensor] = None,
            nonterminals: Optional[torch.Tensor] = None) -> List[torch.Tensor]:
    '''
    Input:  init_belief, init_state: torch.Size([50, 200]) torch.Size([50, 30])
    Output: beliefs, prior_states, prior_means, prior_std_devs,
            posterior_states, posterior_means, posterior_std_devs
            torch.Size([49, 50, 200]) and six tensors of torch.Size([49, 50, 30])
    '''
    if args.MultiGPU and torch.cuda.device_count() > 1:
        actions = torch.transpose(actions, 0, 1)
        observations = None if observations is None else torch.transpose(observations, 0, 1)
        nonterminals = None if nonterminals is None else torch.transpose(nonterminals, 0, 1)

    # Create lists for hidden states (cannot use a single tensor as buffer
    # because autograd won't work with inplace writes)
    T = actions.size(0) + 1
    beliefs = [torch.empty(0)] * T
    prior_states = [torch.empty(0)] * T
    prior_means = [torch.empty(0)] * T
    prior_std_devs = [torch.empty(0)] * T
    posterior_states = [torch.empty(0)] * T
    posterior_means = [torch.empty(0)] * T
    posterior_std_devs = [torch.empty(0)] * T
    beliefs[0], prior_states[0], posterior_states[0] = prev_belief, prev_state, prev_state

    # Loop over time sequence
    for t in range(T - 1):
        # Select appropriate previous state
        _state = prior_states[t] if observations is None else posterior_states[t]
        # Mask if previous transition was terminal
        _state = _state if nonterminals is None else _state * nonterminals[t]

        # Compute belief (deterministic hidden state)
        hidden = self.act_fn(self.fc_embed_state_action(
            torch.cat([_state, actions[t]], dim=1)))
        beliefs[t + 1] = self.rnn(hidden, beliefs[t])

        # Compute state prior by applying transition dynamics
        hidden = self.act_fn(self.fc_embed_belief_prior(beliefs[t + 1]))
        prior_means[t + 1], _prior_std_dev = torch.chunk(
            self.fc_state_prior(hidden), 2, dim=1)
        prior_std_devs[t + 1] = F.softplus(_prior_std_dev) + self.min_std_dev
        prior_states[t + 1] = prior_means[t + 1] + \
            prior_std_devs[t + 1] * torch.randn_like(prior_means[t + 1])

        if observations is not None:
            # Compute state posterior by applying transition dynamics and
            # using the current observation
            t_ = t - 1  # Use t_ to deal with different time indexing for observations
            hidden = self.act_fn(self.fc_embed_belief_posterior(
                torch.cat([beliefs[t + 1], observations[t_ + 1]], dim=1)))
            posterior_means[t + 1], _posterior_std_dev = torch.chunk(
                self.fc_state_posterior(hidden), 2, dim=1)
            posterior_std_devs[t + 1] = F.softplus(_posterior_std_dev) + self.min_std_dev
            posterior_states[t + 1] = posterior_means[t + 1] + \
                posterior_std_devs[t + 1] * torch.randn_like(posterior_means[t + 1])

    # Return new hidden states
    hidden = [torch.stack(beliefs[1:], dim=0),
              torch.stack(prior_states[1:], dim=0),
              torch.stack(prior_means[1:], dim=0),
              torch.stack(prior_std_devs[1:], dim=0)]
    if observations is not None:
        hidden += [torch.stack(posterior_states[1:], dim=0),
                   torch.stack(posterior_means[1:], dim=0),
                   torch.stack(posterior_std_devs[1:], dim=0)]
    return hidden
y_fill = Variable(y_fill.cuda())
y_fill_list.append(y_fill)

y_onehot_v_concat = y_onehot_v_list[0]
if opt.label_mode == 2:
    y_onehot_v_concat = torch.cat([y_onehot_v_list[0], y_onehot_v_list[1]], 1)
y_fill_concat = y_fill_list[0]
if opt.label_mode == 2:
    y_fill_concat = torch.cat([y_fill_list[0], y_fill_list[1]], 1)

# train with real
input.resize_(real_cpu.size()).copy_(real_cpu)
inputv = Variable(input)
output = SND(inputv, y_fill_concat)
errD_real = torch.mean(F.softplus(-output).mean())
D_x = output.data.mean()

# train with fake
noise.resize_(batch_size, opt.nz, 1, 1).normal_(0, 1)
noisev = Variable(noise)
fake = G(noisev, y_onehot_v_concat)
output = SND(fake.detach(), y_fill_concat)
errD_fake = torch.mean(F.softplus(output))
D_G_z1 = output.data.mean()
def forward(self, x):
    d = x.shape[1] // 3
    num_off_diagonals = d * (d - 1) // 2
    n = x.shape[0]
    q, q_dot, q_ddot = torch.split(x, [d, d, d], dim=1)
    # q.requires_grad = True

    h1 = self.act_fn(self.fc1(q))
    h2 = self.act_fn(self.fc1a(h1))

    # Gravity torque
    g = self.fc2(h2)

    # ld is the vector of diagonal L terms, lo the vector of off-diagonal L terms
    h3 = self.fc3(h2)
    ld = F.softplus(h3)
    lo = self.fc4(h2)

    dRelu_fc1 = torch.where(h1 > 0,
                            torch.ones(h1.shape, device=self.device),
                            self.neg_slope * torch.ones(h1.shape, device=self.device))
    dh1_dq = torch.diag_embed(dRelu_fc1) @ self.fc1.weight
    dRelu_fc1a = torch.where(h2 > 0,
                             torch.ones(h2.shape, device=self.device),
                             self.neg_slope * torch.ones(h2.shape, device=self.device))
    dh2_dh1 = torch.diag_embed(dRelu_fc1a) @ self.fc1a.weight
    # the derivative of softplus is the sigmoid
    dRelu_fc3 = torch.sigmoid(h3)
    dld_dh2 = torch.diag_embed(dRelu_fc3) @ self.fc3.weight
    dlo_dh2 = self.fc4.weight

    dld_dq = dld_dh2 @ dh2_dh1 @ dh1_dq
    dlo_dq = dlo_dh2 @ dh2_dh1 @ dh1_dq
    dld_dqi = dld_dq.permute(0, 2, 1).view(n, d, d, 1)
    dlo_dqi = dlo_dq.permute(0, 2, 1).view(n, d, -1, 1)
    dld_dt = dld_dq @ q_dot.view(n, d, 1)
    dlo_dt = dlo_dq @ q_dot.view(n, d, 1)

    # Build L, dL matrices without inplace operations
    L = []
    dL_dt = []
    dL_dqi = []
    zeros = torch.zeros_like(ld)
    zeros_2 = torch.zeros_like(dld_dqi)
    lo_start = 0
    lo_end = d - 1
    for i in range(d):
        l = torch.cat((zeros[:, :i].view(n, -1),
                       ld[:, i].view(-1, 1),
                       lo[:, lo_start:lo_end]), dim=1)
        dl_dt = torch.cat((zeros[:, :i].view(n, -1),
                           dld_dt[:, i].view(-1, 1),
                           dlo_dt[:, lo_start:lo_end].view(n, -1)), dim=1)
        dl_dqi = torch.cat((zeros_2[:, :, :i].view(n, d, -1),
                            dld_dqi[:, :, i].view(n, -1, 1),
                            dlo_dqi[:, :, lo_start:lo_end].view(n, d, -1)), dim=2)
        lo_start = lo_start + lo_end
        lo_end = lo_end + d - 2 - i
        L.append(l)
        dL_dt.append(dl_dt)
        dL_dqi.append(dl_dqi)

    L = torch.stack(L, dim=2)
    dL_dt = torch.stack(dL_dt, dim=2)
    # dL_dqi is n x d x d x d -- the last dim indexes qi
    dL_dqi = torch.stack(dL_dqi, dim=3).permute(0, 2, 3, 1)

    epsilon = 1e-9  # small number to ensure positive definiteness of H
    H = L @ L.transpose(1, 2) + epsilon * torch.eye(d, device=self.device)

    # Time derivative of the mass matrix
    dH_dt = L @ dL_dt.permute(0, 2, 1) + dL_dt @ L.permute(0, 2, 1)

    quadratic_term = []
    for i in range(d):
        qterm = q_dot.view(n, 1, d) @ (
            dL_dqi[:, :, :, i] @ L.transpose(1, 2)
            + L @ dL_dqi[:, :, :, i].transpose(1, 2)) @ q_dot.view(n, d, 1)
        quadratic_term.append(qterm)
    quadratic_term = torch.stack(quadratic_term, dim=1)

    c = dH_dt @ q_dot.view(n, d, 1) - 0.5 * quadratic_term.view(n, d, 1)
    tau = H @ q_ddot.view(n, d, 1) + c + g.view(n, d, 1)

    # Set the uncontrolled torque to zero
    tau = torch.diag_embed(torch.cat((torch.ones((n, 1), device=self.device),
                                      torch.zeros((n, 1), device=self.device)),
                                     dim=1)) @ tau

    # The loss layer is applied outside the Network class
    return (tau.squeeze(), (H @ q_ddot.view(n, d, 1)).squeeze(),
            c.squeeze(), g.squeeze())
def d_logistic_loss(self, real_pred, fake_pred):
    real_loss = F.softplus(-real_pred)
    fake_loss = F.softplus(fake_pred)
    return real_loss.mean() + fake_loss.mean()
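# Equivalence note (my own check, not from the source): the softplus form of
# the logistic GAN loss above equals binary cross-entropy with logits, since
# BCEWithLogits(x, 1) = softplus(-x) and BCEWithLogits(x, 0) = softplus(x).
import torch
import torch.nn.functional as F

logits = torch.randn(16)
bce_real = F.binary_cross_entropy_with_logits(logits, torch.ones_like(logits))
assert torch.allclose(F.softplus(-logits).mean(), bce_real, atol=1e-6)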
def mish(x):
    """Mish: A Self Regularized Non-Monotonic Neural Activation Function
    (https://arxiv.org/abs/1908.08681)"""
    return x * torch.tanh(F.softplus(x))
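# Quick hedged check (my own): on PyTorch versions that ship a built-in mish
# (F.mish, available since 1.9), the hand-rolled version above should match it.
import torch
import torch.nn.functional as F

x = torch.randn(1000)
if hasattr(F, 'mish'):
    assert torch.allclose(mish(x), F.mish(x), atol=1e-6)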
def log_abs_det_jacobian(self, x, y):
    # We use a formula that is more numerically stable; see details in
    # https://github.com/tensorflow/probability/commit/ef6bb176e0ebd1cf6e25c6b5cecdd2428c22963f#diff-e120f70e92e6741bca649f04fcd907b7
    return 2. * (np.log(2.) - x - F.softplus(-2. * x))
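# Numerical sketch (my own, not from the source) verifying the identity above:
# log|d tanh(x)/dx| = log(1 - tanh(x)^2) = 2 * (log 2 - x - softplus(-2x)).
import numpy as np
import torch
import torch.nn.functional as F

x = torch.linspace(-5., 5., 101)
stable = 2. * (np.log(2.) - x - F.softplus(-2. * x))
naive = torch.log(1. - torch.tanh(x) ** 2)
assert torch.allclose(stable, naive, atol=1e-4)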
def forward(ctx, x):
    ctx.save_for_backward(x)
    return x.mul(torch.tanh(F.softplus(x)))  # x * tanh(ln(1 + exp(x)))
def forward(self, x):
    x = x * (torch.tanh(F.softplus(x)))
    return x
def backward(ctx, grad_output):
    x = ctx.saved_tensors[0]
    sx = torch.sigmoid(x)
    fx = F.softplus(x).tanh()
    return grad_output * (fx + x * sx * (1 - fx * fx))
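# A hedged sketch (names are my own) combining forward/backward passes like the
# two above into a complete torch.autograd.Function, checked against autograd's
# numerical gradient. The derivative used is d/dx [x tanh(softplus(x))] =
# tanh(sp) + x * sigmoid(x) * (1 - tanh(sp)^2).
import torch
import torch.nn.functional as F

class MishFn(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x.mul(torch.tanh(F.softplus(x)))

    @staticmethod
    def backward(ctx, grad_output):
        x = ctx.saved_tensors[0]
        sx = torch.sigmoid(x)
        fx = F.softplus(x).tanh()
        return grad_output * (fx + x * sx * (1 - fx * fx))

x = torch.randn(32, dtype=torch.double, requires_grad=True)
assert torch.autograd.gradcheck(MishFn.apply, (x,))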
def forward(self, x):
    return x * F.softplus(x).tanh()
def forward(self, x):
    a = self.mlp(x)
    return a[:, 0:self.z_size], softplus(a[:, self.z_size:])
""" import torch import torch.nn.functional as F from torch.autograd import Variable import matplotlib.pyplot as plt # fake data x = torch.linspace(-5, 5, 200) # x data (tensor), shape=(100, 1) x = Variable(x) x_np = x.data.numpy() # numpy array for plotting # following are popular activation functions y_relu = torch.relu(x).data.numpy() y_sigmoid = torch.sigmoid(x).data.numpy() y_tanh = torch.tanh(x).data.numpy() y_softplus = F.softplus(x).data.numpy() # there's no softplus in torch # y_softmax = torch.softmax(x, dim=0).data.numpy() softmax is a special kind of activation function, it is about probability # plt to visualize these activation function plt.figure(1, figsize=(8, 6)) plt.subplot(221) plt.plot(x_np, y_relu, c='red', label='relu') plt.ylim((-1, 5)) plt.legend(loc='best') plt.subplot(222) plt.plot(x_np, y_sigmoid, c='red', label='sigmoid') plt.ylim((-0.2, 1.2)) plt.legend(loc='best') plt.subplot(223)
def forward(self, z):
    # softplus is a smooth ("soft") ReLU, better suited for count data
    out = self.linear2(F.softplus(self.linear1(z)))
    return out
def discrete_gan(nets, inputs, measure=None, penalty=None, n_samples=10,
                 reinforce=False, gamma=0.95, penalty_type='gradient_norm',
                 use_beta=False, test_mode=False, use_sm=False):
    global log_Z

    log_M = math.log(n_samples)
    discriminator = nets['discriminator']
    generator = nets['generator']

    M = n_samples
    X = (inputs['images'] >= 0).float()
    Z = inputs['z']
    R = inputs['r']
    U = inputs['u']
    B = inputs['z'].size()[0]
    log_B = math.log(B)

    if R.size()[1] != DIM_C * n_samples * DIM_X * DIM_Y:
        R = inputs['r_t']
    assert R.size() == (B, DIM_C * n_samples * DIM_X * DIM_Y), \
        (R.size(), (B, DIM_C * n_samples * DIM_X * DIM_Y))
    R = R.view(M, -1, DIM_C * DIM_X * DIM_Y)
    U.requires_grad = False

    logit = generator(Z)
    assert logit.size()[1:] == X.size()[1:], (logit.size(), X.size())
    g_output = torch.sigmoid(logit)
    g_output_ = g_output.view(-1, DIM_C * DIM_X * DIM_Y)

    S = (R <= g_output_).float()
    S = S.view(M, -1, DIM_C, DIM_X, DIM_Y)
    S_ = Variable(S.data.cuda(), volatile=True)
    S = Variable(S.data.cuda(), requires_grad=False)
    gen_out = (U <= g_output_).float()
    gen_out = gen_out.view(-1, DIM_C, DIM_X, DIM_Y)

    real_out = discriminator(X)
    fake_out = discriminator(S.view(-1, DIM_C, DIM_X, DIM_Y))
    fake_out_ = discriminator(S_.view(-1, DIM_C, DIM_X, DIM_Y))
    log_g = -((1. - S) * logit + F.softplus(-logit)).sum(2).sum(2).sum(2)

    if (measure == 'w' and not test_mode) or use_sm:
        fake_out_sm = discriminator(g_output)
        d_loss, g_loss, r, f, w, b = f_divergence(measure, real_out, fake_out_sm)
    else:
        d_loss, g_loss, r, f, w, b = f_divergence(measure, real_out,
                                                  fake_out.view(M, B, -1))

    if measure in ('gan', 'jsd', 'rkl', 'kl', 'sh', 'proxy_gan', 'dv') and not use_sm:
        log_w = Variable(fake_out_.data.cuda(), requires_grad=False).view(M, B)
        log_beta = log_sum_exp(log_w.view(M * B, -1) - log_M - log_B, axis=0)
        log_alpha = log_sum_exp(log_w - log_M, axis=0)

        if use_beta:
            log_Z_est = log_beta
            log_w_tilde = log_w - log_Z_est - log_M - log_B
        else:
            log_Z_est = log_alpha
            log_w_tilde = log_w - log_Z_est - log_M

        w_tilde = torch.exp(log_w_tilde)
        alpha = torch.exp(log_alpha)
        beta = torch.exp(log_beta)

    elif measure == 'xs':
        w = (fake_out / 2. + 1.).view(M, B)
        w_tilde = w / w.sum(0)
        log_Z_est = torch.log(torch.mean(w))

    elif measure == 'w' or use_sm:
        log_w = Variable(torch.Tensor([0.]).float()).cuda()
        log_Z_est = Variable(torch.Tensor([0.]).float()).cuda()
        w_tilde = Variable(torch.Tensor([0.]).float()).cuda()

    else:
        raise NotImplementedError(measure)

    if measure != 'w' and not use_sm:
        if reinforce:
            r = (log_w - log_Z)
            assert not r.requires_grad
            g_loss = -(r * log_g).sum(0).mean()
        else:
            w_tilde = Variable(w_tilde.data.cuda(), requires_grad=False)
            assert not w_tilde.requires_grad
            if use_beta:
                g_loss = -((w_tilde * log_g).view(M * B)).sum(0).mean()
            else:
                g_loss = -(w_tilde * log_g).sum(0).mean()

    results = dict(g_loss=g_loss.data[0],
                   distance=-d_loss.data[0],
                   boundary=torch.mean(b).data[0],
                   real=torch.mean(r).data[0],
                   fake=torch.mean(f).data[0],
                   gen_out=g_output.mean().data[0],
                   w_tilde=w_tilde.mean().data[0],
                   real_out=real_out.mean().data[0],
                   fake_out=fake_out.mean().data[0])

    if measure != 'w' and not use_sm:
        results.update(alpha=alpha.mean().data[0],
                       log_alpha=log_alpha.mean().data[0],
                       beta=beta.mean().data[0],
                       log_beta=log_beta.mean().data[0])
        results.update(ess=(1. / (w_tilde ** 2).sum(0)).mean().data[0])

    if test_mode or measure == 'w' or use_sm:
        fake_out_sm = discriminator(Variable(g_output.data.cuda(), volatile=True))
        S_th = Variable((g_output >= 0.5).float().data.cuda(), volatile=True)
        fake_out_sam = Variable(fake_out.data.cuda(), volatile=True)
        fake_out_th = discriminator(S_th)
        dist_th = -f_divergence(measure, real_out, fake_out_th)[0]
        dist_sam = -f_divergence(measure, real_out, fake_out_sam)[0]
        dist_sm = -f_divergence(measure, real_out, fake_out_sm)[0]
        results.update(distance_th=dist_th.data[0],
                       distance_sam=dist_sam.data[0],
                       distance_sm=dist_sm.data[0])

    samples = dict(images=dict(generated=gen_out.data,
                               prob=g_output.data,
                               real=X.data))

    if penalty:
        p_term = apply_penalty(inputs, discriminator, X, g_output, measure,
                               penalty_type=penalty_type)
        d_loss += penalty * p_term
        results['gradient penalty'] = p_term.data[0]

    log_Z *= gamma
    log_Z += (1. - gamma) * log_Z_est.mean()
    results.update(log_Z=log_Z.data[0],
                   log_Z_est=log_Z_est.mean().data[0],
                   log_w=log_w.mean().data[0],
                   log_g=log_g.mean().data[0])

    return dict(generator=g_loss, discriminator=d_loss), results, samples, 'boundary'
def create_gaussian_conditional(l):
    n_channels = int(l.size(1) / 2)
    mu = l[:, :n_channels]
    sigma = F.softplus(l[:, n_channels:])
    dist = torch.distributions.normal.Normal(mu, sigma)
    return dist
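# Hedged usage sketch (shapes are my own assumptions): splitting a network head
# of 2*C channels into a C-channel Normal via the helper above, with softplus
# guaranteeing a positive scale.
import torch
import torch.nn.functional as F

l = torch.randn(4, 6, 16, 16)          # batch of 4, 2*3 channels (placeholder)
dist = create_gaussian_conditional(l)  # Normal with 3-channel loc and scale
sample = dist.rsample()                # differentiable sample, shape (4, 3, 16, 16)
log_p = dist.log_prob(sample)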
import pytest
import torch
import torch.nn.functional as F
from torch.testing import assert_allclose

mish_forward_pt = lambda x: x.mul(torch.tanh(F.softplus(x)))


class Mish(torch.nn.Module):
    def forward(self, x):
        return mish_forward_pt(x)


def get_input_params():
    devs = ['cpu']
    if torch.cuda.is_available() and torch.cuda.device_count() > 0:
        devs += ['cuda:0']  # TODO: Allow other devices
    dev_types = [
        (dtype, device)
        for dtype in [torch.float16, torch.float32, torch.float64]
        for device in devs
        # Basic ops not supported on CPU/Half, could test by converting but skip for now
        if not (dtype == torch.float16 and torch.device(device).type == 'cpu')
    ]
    inputs = [(ndim, dtype, device)
              for (dtype, device) in dev_types
              for ndim in [1, 2, 3, 4, 8]]
    return inputs


@pytest.fixture(params=get_input_params())
def test_input(request):
def loss_dcgan_gen(dis_fake):
    loss = torch.mean(F.softplus(-dis_fake))
    return loss
def forward(self, h):
    out = self.mlp(h)
    z_pres_p = sigmoid(out[:, 0:self.z_pres_size])
    z_where_mu = out[:, self.z_pres_size:self.z_pres_size + self.z_where_size]
    z_where_sigma = softplus(out[:, (self.z_pres_size + self.z_where_size):])
    return z_pres_p, z_where_mu, z_where_sigma
def pz_params(self):
    return self._pz_params[0], F.softplus(self._pz_params[1])
def discretized_mix_logistic_loss(x, l):
    """ log-likelihood for mixture of discretized logistics, assumes the data
    has been rescaled to [-1, 1] interval """
    # PyTorch ordering
    x = x.permute(0, 2, 3, 1)
    l = l.permute(0, 2, 3, 1)
    xs = [int(y) for y in x.size()]
    ls = [int(y) for y in l.size()]

    # here and below: unpacking the params of the mixture of logistics
    nr_mix = int(ls[-1] / 10)
    logit_probs = l[:, :, :, :nr_mix]
    l = l[:, :, :, nr_mix:].contiguous().view(xs + [nr_mix * 3])  # 3 for mean, scale, coef
    means = l[:, :, :, :, :nr_mix]
    log_scales = torch.clamp(l[:, :, :, :, nr_mix:2 * nr_mix], min=-7.)
    coeffs = torch.tanh(l[:, :, :, :, 2 * nr_mix:3 * nr_mix])

    # here and below: getting the means and adjusting them based on preceding
    # sub-pixels
    x = x.contiguous()
    x = x.unsqueeze(-1) + Variable(torch.zeros(xs + [nr_mix]).cuda(),
                                   requires_grad=False)
    m2 = (means[:, :, :, 1, :] + coeffs[:, :, :, 0, :] *
          x[:, :, :, 0, :]).view(xs[0], xs[1], xs[2], 1, nr_mix)
    m3 = (means[:, :, :, 2, :] + coeffs[:, :, :, 1, :] * x[:, :, :, 0, :] +
          coeffs[:, :, :, 2, :] * x[:, :, :, 1, :]).view(xs[0], xs[1], xs[2], 1, nr_mix)
    means = torch.cat((means[:, :, :, 0, :].unsqueeze(3), m2, m3), dim=3)

    centered_x = x - means
    inv_stdv = torch.exp(-log_scales)
    plus_in = inv_stdv * (centered_x + 1. / 255.)
    cdf_plus = torch.sigmoid(plus_in)
    min_in = inv_stdv * (centered_x - 1. / 255.)
    cdf_min = torch.sigmoid(min_in)
    # log probability for edge case of 0 (before scaling)
    log_cdf_plus = plus_in - F.softplus(plus_in)
    # log probability for edge case of 255 (before scaling)
    log_one_minus_cdf_min = -F.softplus(min_in)
    # probability for all other cases
    cdf_delta = cdf_plus - cdf_min
    mid_in = inv_stdv * centered_x
    # log probability in the center of the bin, to be used in extreme cases
    # (not actually used in our code)
    log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in)

    # now select the right output: left edge case, right edge case, normal
    # case, extremely low prob case (doesn't actually happen for us)

    # this is what we are really doing, but we use the robust version below
    # for extreme cases in other applications and to avoid NaN issues with tf.select():
    # log_probs = tf.select(x < -0.999, log_cdf_plus,
    #     tf.select(x > 0.999, log_one_minus_cdf_min, tf.log(cdf_delta)))

    # robust version, that still works if probabilities are below 1e-5 (which
    # never happens in our code). TensorFlow backpropagates through tf.select()
    # by multiplying with zero instead of selecting, which requires us to use
    # some ugly tricks to avoid potential NaNs; the 1e-12 in
    # tf.maximum(cdf_delta, 1e-12) is never actually used as output, it is
    # purely there to get around the tf.select() gradient issue. If the
    # probability on a sub-pixel is below 1e-5, we use an approximation based
    # on the assumption that the log-density is constant in the bin of the
    # observed sub-pixel value.
    inner_inner_cond = (cdf_delta > 1e-5).float()
    inner_inner_out = inner_inner_cond * torch.log(torch.clamp(cdf_delta, min=1e-12)) + \
        (1. - inner_inner_cond) * (log_pdf_mid - np.log(127.5))
    inner_cond = (x > 0.999).float()
    inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out
    cond = (x < -0.999).float()
    log_probs = cond * log_cdf_plus + (1. - cond) * inner_out
    log_probs = torch.sum(log_probs, dim=3) + log_prob_from_logits(logit_probs)

    # don't sum over batch dimension
    lse = log_sum_exp(log_probs)
    return -torch.sum(lse.view(lse.size(0), -1), dim=1)
def forward(self, x):
    e = self.enc(self.embedding(x.long()).unsqueeze(1))
    mu, logvar = self.c1(e).squeeze(), self.c2(e).squeeze()
    return mu, F.softplus(logvar) + Constants.eta
def prune_model_keep_size2(model, prune_idx, CBL_idx, CBLidx2mask):
    pruned_model = deepcopy(model)
    activations = []
    for i, model_def in enumerate(model.module_defs):

        if model_def['type'] == 'convolutional':
            activation = torch.zeros(int(model_def['filters'])).cuda()
            if i in prune_idx:
                mask = torch.from_numpy(CBLidx2mask[i]).cuda()
                bn_module = pruned_model.module_list[i][1]
                bn_module.weight.data.mul_(mask)
                if model_def['activation'] == 'leaky':
                    activation = F.leaky_relu((1 - mask) * bn_module.bias.data, 0.1)
                elif model_def['activation'] == 'mish':
                    activation = (1 - mask) * bn_module.bias.data.mul(
                        F.softplus(bn_module.bias.data).tanh())
                elif model_def['activation'] == 'SiLU':  # yolov5-v4
                    activation = (1 - mask) * bn_module.bias.data * torch.sigmoid(
                        bn_module.bias.data)
                elif model_def['activation'] == 'Hardswish':
                    activation = (1 - mask) * bn_module.bias.data * F.hardtanh(
                        bn_module.bias.data + 3, 0., 6.) / 6.
                update_activation(i, pruned_model, activation, CBL_idx)
                bn_module.bias.data.mul_(mask)
            activations.append(activation)

        elif model_def['type'] == 'shortcut':
            actv1 = activations[i - 1]
            from_layer = int(model_def['from'])
            actv2 = activations[i + from_layer]
            activation = actv1 + actv2
            update_activation(i, pruned_model, activation, CBL_idx)
            activations.append(activation)

        elif model_def['type'] == 'route':
            # SPP is not pruned, so its route needs no update; this is a placeholder
            from_layers = [int(s) for s in model_def['layers'].split(',')]
            activation = None
            if len(from_layers) == 1:
                activation = activations[i + from_layers[0]
                                         if from_layers[0] < 0 else from_layers[0]]
                if 'groups' in model_def:
                    activation = activation[(activation.shape[0] // 2):]
                update_activation(i, pruned_model, activation, CBL_idx)
            elif len(from_layers) == 2:
                actv1 = activations[i + from_layers[0]]
                actv2 = activations[i + from_layers[1]
                                    if from_layers[1] < 0 else from_layers[1]]
                activation = torch.cat((actv1, actv2))

                next_idx = i + 1
                if pruned_model.module_defs[next_idx]['type'] == 'convolutional_noconv':
                    next_conv1 = pruned_model.module_list[i + from_layers[0]][0]
                    next_conv2 = pruned_model.module_list[
                        i + from_layers[1] if from_layers[1] < 0 else from_layers[1]][0]
                    conv_sum1 = next_conv1.weight.data.sum(dim=(2, 3))
                    conv_sum2 = next_conv2.weight.data.sum(dim=(2, 3))
                    offset1 = conv_sum1.matmul(actv1.reshape(-1, 1)).reshape(-1)
                    offset2 = conv_sum2.matmul(actv2.reshape(-1, 1)).reshape(-1)
                    offset = torch.cat((offset1, offset2))
                    if next_idx in CBL_idx:
                        next_bn = pruned_model.module_list[next_idx][0]
                        next_bn.running_mean.data.sub_(offset)
                else:
                    update_activation(i, pruned_model, activation, CBL_idx)
            activations.append(activation)

        elif model_def['type'] == 'upsample':
            activations.append(activations[i - 1])

        elif model_def['type'] == 'yolo':
            activations.append(None)

        elif model_def['type'] == 'focus':
            activations.append(None)

        elif model_def['type'] == 'convolutional_nobias':
            activations.append(activations[i - 1])

        elif model_def['type'] == 'convolutional_noconv':
            activation = torch.zeros(int(model_def['filters'])).cuda()
            if i in prune_idx:
                mask = torch.from_numpy(CBLidx2mask[i]).cuda()
                bn_module = pruned_model.module_list[i][0]
                bn_module.weight.data.mul_(mask)
                # only 'leaky' occurs here; a mish branch would mirror the
                # 'convolutional' case above
                activation = F.leaky_relu((1 - mask) * bn_module.bias.data, 0.1)
                update_activation(i, pruned_model, activation, CBL_idx)
                bn_module.bias.data.mul_(mask)
            activations.append(activation)

        elif model_def['type'] == 'maxpool':
            # distinguish SPP from tiny
            if model.module_defs[i + 1]['type'] == 'route':
                activations.append(None)
            else:
                activation = activations[i - 1]
                update_activation(i, pruned_model, activation, CBL_idx)
                activations.append(activation)

    return pruned_model
def scale(self):
    return softplus(self._scale)
""" import torch import torch.nn.functional as F from torch.autograd import Variable import matplotlib.pyplot as plt # fake data x = torch.linspace(-5, 5, 200) # x data (tensor), shape=(100, 1) x = Variable(x) x_np = x.data.numpy() # numpy array for plotting # following are popular activation functions y_relu = F.relu(x).data.numpy() y_sigmoid = F.sigmoid(x).data.numpy() y_tanh = F.tanh(x).data.numpy() y_softplus = F.softplus(x).data.numpy() # y_softmax = F.softmax(x) softmax is a special kind of activation function, it is about probability # plt to visualize these activation function plt.figure(1, figsize=(8, 6)) plt.subplot(221) plt.plot(x_np, y_relu, c='red', label='relu') plt.ylim((-1, 5)) plt.legend(loc='best') plt.subplot(222) plt.plot(x_np, y_sigmoid, c='red', label='sigmoid') plt.ylim((-0.2, 1.2)) plt.legend(loc='best')
def log_prob(self, x):
    return -(F.softplus(x) + F.softplus(-x))
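# Check (my own, not from the source): -(softplus(x) + softplus(-x)) is the
# log-density of the standard logistic distribution, i.e.
# log[sigmoid(x) * sigmoid(-x)], written in a numerically stable form.
import torch
import torch.nn.functional as F

x = torch.linspace(-8., 8., 101)
stable = -(F.softplus(x) + F.softplus(-x))
naive = torch.log(torch.sigmoid(x) * torch.sigmoid(-x))
assert torch.allclose(stable, naive, atol=1e-4)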