def test_non_mean_field_bern_normal_elbo_gradient(enumerate1, pi1, pi2, pi3, include_z=True):
    """Check TraceEnum_ELBO gradient estimates for a non-mean-field
    Bernoulli("y") -> Normal("z") model/guide pair against gradients of the
    analytically assembled ELBO (KL at "y" plus a q1-weighted mixture of
    Normal KLs at "z").

    enumerate1 selects the enumeration strategy for guide site "y"
    (None means plain sampling); pi1/pi2/pi3 initialize q1/q2/q3;
    include_z toggles the dependent Normal site.
    """
    pyro.clear_param_store()
    num_particles = 10000

    def model():
        with pyro.iarange("particles", num_particles):
            # q3 is registered inside the model so its gradient is checked too.
            q3 = pyro.param("q3", torch.tensor(pi3, requires_grad=True))
            y = pyro.sample("y", dist.Bernoulli(q3).expand_by([num_particles]))
            if include_z:
                pyro.sample("z", dist.Normal(0.55 * y + q3, 1.0))

    def guide():
        q1 = pyro.param("q1", torch.tensor(pi1, requires_grad=True))
        q2 = pyro.param("q2", torch.tensor(pi2, requires_grad=True))
        with pyro.iarange("particles", num_particles):
            y = pyro.sample("y", dist.Bernoulli(q1).expand_by([num_particles]),
                            infer={"enumerate": enumerate1})
            if include_z:
                # z's location depends on the sampled/enumerated y, which is
                # what makes this pair non-mean-field.
                pyro.sample("z", dist.Normal(q2 * y + 0.10, 1.0))

    logger.info("Computing gradients using surrogate loss")
    elbo = TraceEnum_ELBO(max_iarange_nesting=1,
                          strict_enumeration_warning=any([enumerate1]))
    elbo.loss_and_grads(model, guide)
    # Gradients accumulate over the particle batch; rescale to per-sample.
    actual_grad_q1 = pyro.param('q1').grad / num_particles
    if include_z:
        actual_grad_q2 = pyro.param('q2').grad / num_particles
    actual_grad_q3 = pyro.param('q3').grad / num_particles

    logger.info("Computing analytic gradients")
    q1 = torch.tensor(pi1, requires_grad=True)
    q2 = torch.tensor(pi2, requires_grad=True)
    q3 = torch.tensor(pi3, requires_grad=True)
    # Marginalizing y gives KL(q(y)||p(y)) plus the y-weighted Normal KLs.
    elbo = kl_divergence(dist.Bernoulli(q1), dist.Bernoulli(q3))
    if include_z:
        elbo = elbo + q1 * kl_divergence(dist.Normal(q2 + 0.10, 1.0), dist.Normal(q3 + 0.55, 1.0))
        elbo = elbo + (1.0 - q1) * kl_divergence(dist.Normal(0.10, 1.0), dist.Normal(q3, 1.0))
        expected_grad_q1, expected_grad_q2, expected_grad_q3 = grad(elbo, [q1, q2, q3])
    else:
        expected_grad_q1, expected_grad_q3 = grad(elbo, [q1, q3])

    # Looser tolerance when "y" is sampled rather than enumerated.
    prec = 0.04 if enumerate1 is None else 0.02
    assert_equal(actual_grad_q1, expected_grad_q1, prec=prec, msg="".join([
        "\nq1 expected = {}".format(expected_grad_q1.data.cpu().numpy()),
        "\nq1 actual = {}".format(actual_grad_q1.data.cpu().numpy()),
    ]))
    if include_z:
        assert_equal(actual_grad_q2, expected_grad_q2, prec=prec, msg="".join([
            "\nq2 expected = {}".format(expected_grad_q2.data.cpu().numpy()),
            "\nq2 actual = {}".format(actual_grad_q2.data.cpu().numpy()),
        ]))
    assert_equal(actual_grad_q3, expected_grad_q3, prec=prec, msg="".join([
        "\nq3 expected = {}".format(expected_grad_q3.data.cpu().numpy()),
        "\nq3 actual = {}".format(actual_grad_q3.data.cpu().numpy()),
    ]))
def guide():
    """Guide that finds the mode of a Normal model with a single Newton
    update, then samples "loc" around it and "b" from a learned Bernoulli.

    Reads the module-level ``data`` tensor for the likelihood term.
    """
    bern_p = pyro.param("p", torch.tensor(0.5), constraint=constraints.unit_interval)
    obs_scale = pyro.param("scale", torch.tensor(1.0), constraint=constraints.positive)
    post_var = pyro.param("var", torch.tensor(1.0), constraint=constraints.positive)

    # Differentiable starting point for the Newton step.
    x = torch.tensor(0., requires_grad=True)
    log_joint = dist.Normal(0., 10.).log_prob(x) + dist.Normal(x, obs_scale).log_prob(data).sum()
    loss = -log_joint
    first_deriv = grad(loss, [x], create_graph=True)[0]
    second_deriv = grad(first_deriv, [x], create_graph=True)[0]
    # One Newton-Raphson step toward the posterior mode.
    newton_loc = x.detach() - first_deriv / second_deriv
    pyro.sample("loc", dist.Normal(newton_loc, post_var))
    pyro.sample("b", dist.Bernoulli(bern_p))
def test_elbo_bern(quantity, enumerate1):
    """Compare TraceEnum_ELBO's loss or gradient for a single Bernoulli site
    against the exact KL(Bernoulli(q) || Bernoulli(0.25)).

    quantity is "loss" or anything else for the gradient check; enumerate1
    picks the guide-side enumeration strategy (falsy means plain sampling,
    which uses many particles and a loose tolerance).
    """
    pyro.clear_param_store()
    num_particles = 1 if enumerate1 else 10000
    prec = 0.001 if enumerate1 else 0.1
    q = pyro.param("q", torch.tensor(0.5, requires_grad=True))
    kl = kl_divergence(dist.Bernoulli(q), dist.Bernoulli(0.25))

    def model():
        with pyro.iarange("particles", num_particles):
            pyro.sample("z", dist.Bernoulli(0.25).expand_by([num_particles]))

    @config_enumerate(default=enumerate1)
    def guide():
        q = pyro.param("q")
        with pyro.iarange("particles", num_particles):
            pyro.sample("z", dist.Bernoulli(q).expand_by([num_particles]))

    elbo = TraceEnum_ELBO(max_iarange_nesting=1,
                          strict_enumeration_warning=any([enumerate1]))
    if quantity == "loss":
        # Loss is summed over particles; normalize for comparison to the KL.
        actual = elbo.loss(model, guide) / num_particles
        expected = kl.item()
        assert_equal(actual, expected, prec=prec, msg="".join([
            "\nexpected = {}".format(expected),
            "\n actual = {}".format(actual),
        ]))
    else:
        elbo.loss_and_grads(model, guide)
        actual = q.grad / num_particles
        expected = grad(kl, [q])[0]
        assert_equal(actual, expected, prec=prec, msg="".join([
            "\nexpected = {}".format(expected.detach().cpu().numpy()),
            "\n actual = {}".format(actual.detach().cpu().numpy()),
        ]))
def forward(self, real_samples, fake_samples, **critic_kwargs):
    """WGAN-GP gradient-penalty term: penalize deviation of the critic's
    gradient norm from 1 along random real/fake interpolations, scaled by
    self.weight. Behaviorally identical to the original implementation."""
    from torch.autograd import grad

    real_samples = real_samples.view(fake_samples.shape)
    batch = real_samples.shape[0]
    real_samples = real_samples[:batch]
    fake_samples = fake_samples[:batch]

    # One mixing coefficient per sample, broadcast over trailing dims.
    mix = torch.rand(batch)
    if self.use_cuda:
        mix = mix.cuda()
    mix = mix.view((-1,) + (1,) * (real_samples.dim() - 1))

    mixed = mix * real_samples + (1 - mix) * fake_samples
    if self.use_cuda:
        mixed = mixed.cuda()
    mixed = Variable(mixed, requires_grad=True)

    critic_score = self.critic(mixed, **critic_kwargs)
    cotangent = torch.ones(critic_score.size())
    if self.use_cuda:
        cotangent = cotangent.cuda()

    grads = grad(outputs=critic_score, inputs=mixed, grad_outputs=cotangent,
                 create_graph=True, retain_graph=True, only_inputs=True)[0]
    return ((grads.norm(2, dim=1) - 1) ** 2).mean() * self.weight
def compute_elbo_grad(model, guide, variables):
    """Return d(surrogate ELBO)/d(variables) for one reparameterized sample
    from ``guide``, using the guide's score parts (create_graph=True so the
    result itself is differentiable)."""
    sample = guide.rsample()
    model_lp = model.log_prob(sample)
    guide_lp, score_fn, entropy = guide.score_parts(sample)
    # The density ratio is detached so it only weights the score-function
    # (REINFORCE) path, not the pathwise one.
    weight = (model_lp - guide_lp).detach()
    surrogate = model_lp + weight * score_fn - entropy
    return grad(surrogate.sum(), variables, create_graph=True)
def _grad(potential_fn, z): z_keys, z_nodes = zip(*z.items()) for node in z_nodes: node.requires_grad = True potential_energy = potential_fn(z) grads = grad(potential_energy, z_nodes) for node in z_nodes: node.requires_grad = False return dict(zip(z_keys, grads)), potential_energy
def penalty(self, dis, real_data, fake_data):
    """Gradient penalty on sigmoid(critic logits) at a probe point between
    the (detached) real and fake batches.

    Returns (self.weight * mean squared deviation of the per-sample gradient
    norm from self.target, mean per-sample gradient norm).
    """
    # The probe is built from detached inputs, so it is a fresh leaf we can
    # mark differentiable and take gradients w.r.t.
    probe = self.get_probe(real_data.detach(), fake_data.detach())
    probe.requires_grad = True
    probe_logit, _ = dis(probe)
    # FIX: F.sigmoid is deprecated in PyTorch; torch.sigmoid is the
    # supported, numerically identical replacement.
    gradients = autograd.grad(outputs=torch.sigmoid(probe_logit),
                              inputs=probe,
                              grad_outputs=torch.ones_like(probe_logit))[0]
    grad_norm = gradients.view(gradients.shape[0], -1).norm(2, dim=1)
    penalty = ((grad_norm - self.target) ** 2).mean()
    return self.weight * penalty, grad_norm.mean()
def __disc_train_func__(self, target, source, disc_optimizer, running_loss, epoch, batch_num): for params in self.disc_model.parameters(): params.requires_grad = True disc_optimizer.zero_grad() if isinstance(target, list) or isinstance(target, tuple): x = target[0] else: x = target batch_size = x.size(0) if self.cuda: x = x.cuda() source = source.cuda() x = Variable(x) source = Variable(source) real_loss = -torch.mean(self.disc_model(x)) real_loss.backward() generated = self.gen_model(source).detach() gen_loss = torch.mean(self.disc_model(generated)) gen_loss.backward() eps = torch.randn(x.size()).uniform_(0,1) if self.cuda: eps = eps.cuda() x__ = Variable(eps * x.data + (1.0 - eps) * generated.data,requires_grad=True) pred__ = self.disc_model(x__) grad_outputs = torch.ones(pred__.size()) if self.cuda: grad_outputs = grad_outputs.cuda() gradients = grad(outputs=pred__,inputs=x__,grad_outputs=grad_outputs,create_graph=True,retain_graph=True,only_inputs=True)[0] gradient_penalty = self.lambda_ * ((gradients.view(gradients.size(0),-1).norm(2,1) - 1) ** 2).mean() gradient_penalty.backward() loss = real_loss + gen_loss + gradient_penalty disc_optimizer.step() running_loss.add_(loss.cpu() * batch_size)
def grad_norm(self, d_out, x):
    """Mean squared L2 norm of d(d_out)/dx, taken per sample (rows of x)."""
    cotangent = torch.ones(d_out.size())
    if self.use_cuda:
        cotangent = cotangent.cuda()
    dx = grad(outputs=d_out, inputs=x, grad_outputs=cotangent,
              create_graph=True, retain_graph=True, only_inputs=True)[0]
    # Flatten each sample, take its L2 norm, square, then average.
    per_sample = dx.view(dx.size()[0], -1).norm(2, 1)
    return (per_sample ** 2).mean()
def fgsm(classifier, x, loss_func, attack_params):
    """Fast Gradient Sign Method: perturb x by eps * sign(dLoss/dx).

    attack_params must contain 'eps'. Relies on the module-level to_var
    helper to produce a differentiable copy of the input.
    """
    eps = attack_params['eps']
    adv = to_var(x.data)
    loss = loss_func(classifier(adv))
    step = torch.sign(grad(loss, adv, retain_graph=False)[0])
    perturbed = adv + eps * step
    # Re-wrap so the returned adversarial example is detached from the graph.
    return to_var(perturbed.data)
def test_elbo_rsvi(enumerate1):
    """Check TraceEnum_ELBO gradients for a guide mixing an enumerable
    Bernoulli site with a shape-augmented (RSVI) Gamma site.

    Expected gradients come from the exact KL divergences of each site.
    """
    pyro.clear_param_store()
    num_particles = 40000
    prec = 0.01 if enumerate1 else 0.02
    q = pyro.param("q", torch.tensor(0.5, requires_grad=True))
    a = pyro.param("a", torch.tensor(1.5, requires_grad=True))
    kl1 = kl_divergence(dist.Bernoulli(q), dist.Bernoulli(0.25))
    kl2 = kl_divergence(dist.Gamma(a, 1.0), dist.Gamma(0.5, 1.0))

    def model():
        with pyro.iarange("particles", num_particles):
            pyro.sample("z", dist.Bernoulli(0.25).expand_by([num_particles]))
            pyro.sample("y", dist.Gamma(0.50, 1.0).expand_by([num_particles]))

    @config_enumerate(default=enumerate1)
    def guide():
        q = pyro.param("q")
        a = pyro.param("a")
        with pyro.iarange("particles", num_particles):
            pyro.sample("z", dist.Bernoulli(q).expand_by([num_particles]))
            # ShapeAugmentedGamma provides reparameterized (RSVI) samples.
            pyro.sample("y", ShapeAugmentedGamma(a, torch.tensor(1.0)).expand_by([num_particles]))

    elbo = TraceEnum_ELBO(max_iarange_nesting=1,
                          strict_enumeration_warning=any([enumerate1]))
    elbo.loss_and_grads(model, guide)

    # Gradients accumulate over the particle batch; rescale to per-sample.
    actual_q = q.grad / num_particles
    expected_q = grad(kl1, [q])[0]
    assert_equal(actual_q, expected_q, prec=prec, msg="".join([
        "\nexpected q.grad = {}".format(expected_q.detach().cpu().numpy()),
        "\n actual q.grad = {}".format(actual_q.detach().cpu().numpy()),
    ]))

    actual_a = a.grad / num_particles
    expected_a = grad(kl2, [a])[0]
    assert_equal(actual_a, expected_a, prec=prec, msg="".join([
        "\nexpected a.grad= {}".format(expected_a.detach().cpu().numpy()),
        "\n actual a.grad = {}".format(actual_a.detach().cpu().numpy()),
    ]))
def test_elbo_iarange_iarange(outer_dim, inner_dim, enumerate1, enumerate2, enumerate3, enumerate4):
    """Check TraceEnum_ELBO loss/gradients with two nested iaranges against
    the closed-form KL, which counts 1 + outer_dim + inner_dim +
    outer_dim * inner_dim identical Bernoulli sites."""
    pyro.clear_param_store()
    num_particles = 1 if all([enumerate1, enumerate2, enumerate3, enumerate4]) else 100000
    q = pyro.param("q", torch.tensor(0.75, requires_grad=True))
    p = 0.2693204236205713  # for which kl(Bernoulli(q), Bernoulli(p)) = 0.5

    def model():
        d = dist.Bernoulli(p)
        with pyro.iarange("particles", num_particles):
            context1 = pyro.iarange("outer", outer_dim, dim=-2)
            context2 = pyro.iarange("inner", inner_dim, dim=-3)
            pyro.sample("w", d.expand_by([num_particles]))
            with context1:
                pyro.sample("x", d.expand_by([outer_dim, num_particles]))
            with context2:
                pyro.sample("y", d.expand_by([inner_dim, 1, num_particles]))
            with context1, context2:
                pyro.sample("z", d.expand_by([inner_dim, outer_dim, num_particles]))

    def guide():
        d = dist.Bernoulli(pyro.param("q"))
        with pyro.iarange("particles", num_particles):
            context1 = pyro.iarange("outer", outer_dim, dim=-2)
            context2 = pyro.iarange("inner", inner_dim, dim=-3)
            pyro.sample("w", d.expand_by([num_particles]),
                        infer={"enumerate": enumerate1})
            with context1:
                pyro.sample("x", d.expand_by([outer_dim, num_particles]),
                            infer={"enumerate": enumerate2})
            with context2:
                pyro.sample("y", d.expand_by([inner_dim, 1, num_particles]),
                            infer={"enumerate": enumerate3})
            with context1, context2:
                pyro.sample("z", d.expand_by([inner_dim, outer_dim, num_particles]),
                            infer={"enumerate": enumerate4})

    kl_node = kl_divergence(dist.Bernoulli(q), dist.Bernoulli(p))
    # Sites: w (1) + x (outer_dim) + y (inner_dim) + z (outer_dim*inner_dim).
    kl = (1 + outer_dim + inner_dim + outer_dim * inner_dim) * kl_node
    expected_loss = kl.item()
    expected_grad = grad(kl, [q])[0]

    # FIX: enumerate4 was omitted from strict_enumeration_warning, so a run
    # enumerating only site "z" silently skipped the strictness check.
    elbo = TraceEnum_ELBO(max_iarange_nesting=3,
                          strict_enumeration_warning=any([enumerate1, enumerate2, enumerate3, enumerate4]))
    actual_loss = elbo.loss_and_grads(model, guide) / num_particles
    actual_grad = pyro.param('q').grad / num_particles

    assert_equal(actual_loss, expected_loss, prec=0.1, msg="".join([
        "\nexpected loss = {}".format(expected_loss),
        "\n actual loss = {}".format(actual_loss),
    ]))
    assert_equal(actual_grad, expected_grad, prec=0.1, msg="".join([
        "\nexpected grad = {}".format(expected_grad.detach().cpu().numpy()),
        "\n actual grad = {}".format(actual_grad.detach().cpu().numpy()),
    ]))
def compute_gradient_penalty(D, real_samples, fake_samples):
    """Calculates the gradient penalty loss for WGAN GP."""
    # Random weight term for interpolation between real and fake samples.
    alpha = Tensor(np.random.random((real_samples.size(0), 1, 1, 1)))
    # Get random interpolation between real and fake samples.
    interpolates = (alpha * real_samples + ((1 - alpha) * fake_samples)).requires_grad_(True)
    d_interpolates = D(interpolates)
    fake = Variable(Tensor(real_samples.shape[0], 1).fill_(1.0), requires_grad=False)
    # Get gradient w.r.t. interpolates.
    gradients = autograd.grad(outputs=d_interpolates, inputs=interpolates,
                              grad_outputs=fake, create_graph=True,
                              retain_graph=True, only_inputs=True)[0]
    # FIX: flatten to (batch, -1) before taking the norm. On 4-D image
    # batches, norm(2, dim=1) reduced only the channel axis, so the penalty
    # was not the per-sample gradient norm WGAN-GP prescribes.
    gradients = gradients.view(gradients.size(0), -1)
    gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean()
    return gradient_penalty
def test_rsample(dist):
    """Smoke-test that a distribution's rsample agrees across CPU and CUDA:
    same sample shape and same gradient shapes w.r.t. its float parameters.

    Returns early for distributions without rsample; xfails when the CPU
    path itself raises.
    """
    if not dist.pyro_dist.has_rsample:
        return
    for idx in range(len(dist.dist_params)):

        # Compute CPU value.
        with tensors_default_to("cpu"):
            params = dist.get_dist_params(idx)
            # Only floating-point tensor parameters take part in the
            # gradient check.
            grad_params = [key for key, val in params.items()
                           if torch.is_tensor(val) and val.dtype in (torch.float32, torch.float64)]
            for key in grad_params:
                # Clone so requires_grad can be set on a fresh leaf.
                val = params[key].clone()
                val.requires_grad = True
                params[key] = val
            try:
                with xfail_if_not_implemented():
                    cpu_value = dist.pyro_dist(**params).rsample()
                    cpu_grads = grad(cpu_value.sum(), [params[key] for key in grad_params])
            except ValueError as e:
                pytest.xfail('CPU version fails: {}'.format(e))
        assert not cpu_value.is_cuda

        # Compute GPU value.
        with tensors_default_to("cuda"):
            params = dist.get_dist_params(idx)
            for key in grad_params:
                val = params[key].clone()
                val.requires_grad = True
                params[key] = val
            cuda_value = dist.pyro_dist(**params).rsample()
        assert cuda_value.is_cuda
        assert_equal(cpu_value.size(), cuda_value.size())
        cuda_grads = grad(cuda_value.sum(), [params[key] for key in grad_params])
        # Only shapes are compared; rsample values are random draws.
        for cpu_grad, cuda_grad in zip(cpu_grads, cuda_grads):
            assert_equal(cpu_grad.size(), cuda_grad.size())
def calc_gradient_penalty(self, real_data, fake_data, original_size):
    """WGAN-GP penalty: squared deviation of the critic's per-sample
    gradient norm from 1, scaled by self.LAMBDA.

    The interpolated batch is reshaped to ``original_size`` before being fed
    to self.discriminator.
    """
    n = real_data.size(0)
    device = real_data.device
    # Per-sample mixing weights, broadcast over the feature dimension.
    mix = torch.rand(n, 1).expand(real_data.size()).to(device)
    blended = mix * real_data + (1 - mix) * fake_data
    blended = autograd.Variable(blended, requires_grad=True).to(device)
    scores = self.discriminator(blended.view(original_size)).view(-1)
    grads = autograd.grad(outputs=scores, inputs=blended,
                          grad_outputs=torch.ones(n).to(device),
                          create_graph=True, retain_graph=True,
                          only_inputs=True)[0]
    return ((grads.norm(p=2, dim=1) - 1) ** 2).mean() * self.LAMBDA
def test_elbo_irange_irange(outer_dim, inner_dim, enumerate1, enumerate2, enumerate3):
    """Check TraceEnum_ELBO loss/gradients with nested irange loops against
    the closed-form KL, which counts 1 + outer_dim * (1 + inner_dim)
    identical Bernoulli sites."""
    pyro.clear_param_store()
    num_particles = 1 if all([enumerate1, enumerate2, enumerate3]) else 50000
    q = pyro.param("q", torch.tensor(0.75, requires_grad=True))
    p = 0.2693204236205713  # for which kl(Bernoulli(q), Bernoulli(p)) = 0.5

    def model():
        with pyro.iarange("particles", num_particles):
            pyro.sample("x", dist.Bernoulli(p).expand_by([num_particles]))
            # FIX: the model previously built irange("inner", outer_dim) and
            # looped over irange("outer", inner_dim) -- the sizes were
            # swapped relative to the guide, so model and guide traces
            # disagreed whenever outer_dim != inner_dim. Sizes now mirror
            # the guide and the closed-form KL count below.
            inner_irange = pyro.irange("inner", inner_dim)
            for i in pyro.irange("outer", outer_dim):
                pyro.sample("y_{}".format(i), dist.Bernoulli(p).expand_by([num_particles]))
                for j in inner_irange:
                    pyro.sample("z_{}_{}".format(i, j), dist.Bernoulli(p).expand_by([num_particles]))

    def guide():
        q = pyro.param("q")
        with pyro.iarange("particles", num_particles):
            pyro.sample("x", dist.Bernoulli(q).expand_by([num_particles]),
                        infer={"enumerate": enumerate1})
            inner_irange = pyro.irange("inner", inner_dim)
            for i in pyro.irange("outer", outer_dim):
                pyro.sample("y_{}".format(i), dist.Bernoulli(q).expand_by([num_particles]),
                            infer={"enumerate": enumerate2})
                for j in inner_irange:
                    pyro.sample("z_{}_{}".format(i, j), dist.Bernoulli(q).expand_by([num_particles]),
                                infer={"enumerate": enumerate3})

    # One "x" site plus outer_dim y's, each with inner_dim z's.
    kl = (1 + outer_dim * (1 + inner_dim)) * kl_divergence(dist.Bernoulli(q), dist.Bernoulli(p))
    expected_loss = kl.item()
    expected_grad = grad(kl, [q])[0]

    elbo = TraceEnum_ELBO(max_iarange_nesting=1,
                          strict_enumeration_warning=any([enumerate1, enumerate2, enumerate3]))
    actual_loss = elbo.loss_and_grads(model, guide) / num_particles
    actual_grad = pyro.param('q').grad / num_particles

    assert_equal(actual_loss, expected_loss, prec=0.1, msg="".join([
        "\nexpected loss = {}".format(expected_loss),
        "\n actual loss = {}".format(actual_loss),
    ]))
    assert_equal(actual_grad, expected_grad, prec=0.1, msg="".join([
        "\nexpected grad = {}".format(expected_grad.detach().cpu().numpy()),
        "\n actual grad = {}".format(actual_grad.detach().cpu().numpy()),
    ]))
def calc_gradient_penalty(self, netD, real_data, fake_data):
    """WGAN-GP penalty with a single mixing coefficient shared by the whole
    batch, scaled by self.LAMBDA. CUDA-only: all tensors are moved to the
    default GPU."""
    mix = torch.rand(1, 1).expand(real_data.size()).cuda()
    blended = (mix * real_data + (1 - mix) * fake_data).cuda()
    blended = Variable(blended, requires_grad=True)
    critic_out = netD.forward(blended)
    grads = autograd.grad(outputs=critic_out, inputs=blended,
                          grad_outputs=torch.ones(critic_out.size()).cuda(),
                          create_graph=True, retain_graph=True,
                          only_inputs=True)[0]
    return ((grads.norm(2, dim=1) - 1) ** 2).mean() * self.LAMBDA
def compute_gradient_penalty(D, X):
    """Calculates the gradient penalty loss for DRAGAN."""
    # Random weight term for interpolation between X and a noisy copy of X.
    alpha = Tensor(np.random.random(size=X.shape))
    interpolates = alpha * X + ((1 - alpha) * (X + 0.5 * X.std() * torch.rand(X.size())))
    interpolates = Variable(interpolates, requires_grad=True)
    d_interpolates = D(interpolates)
    fake = Variable(Tensor(X.shape[0], 1).fill_(1.0), requires_grad=False)
    # Get gradient w.r.t. interpolates.
    gradients = autograd.grad(outputs=d_interpolates, inputs=interpolates,
                              grad_outputs=fake, create_graph=True,
                              retain_graph=True, only_inputs=True)[0]
    # FIX: flatten each sample before the norm. norm(2, dim=1) alone reduces
    # only one axis of multi-dimensional samples, which is not the
    # per-sample gradient norm the DRAGAN penalty prescribes.
    gradients = gradients.view(gradients.size(0), -1)
    gradient_penalty = lambda_gp * ((gradients.norm(2, dim=1) - 1) ** 2).mean()
    return gradient_penalty
def test_non_mean_field_bern_bern_elbo_gradient(enumerate1, pi1, pi2):
    """Check TraceEnum_ELBO gradients for a non-mean-field Bernoulli pair
    (guide site "z" depends on the sampled "y") against the analytic ELBO
    built from KL terms weighted by q1."""
    pyro.clear_param_store()
    num_particles = 1 if enumerate1 else 20000

    def model():
        with pyro.iarange("particles", num_particles):
            y = pyro.sample("y", dist.Bernoulli(0.33).expand_by([num_particles]))
            pyro.sample("z", dist.Bernoulli(0.55 * y + 0.10))

    def guide():
        q1 = pyro.param("q1", torch.tensor(pi1, requires_grad=True))
        q2 = pyro.param("q2", torch.tensor(pi2, requires_grad=True))
        with pyro.iarange("particles", num_particles):
            y = pyro.sample("y", dist.Bernoulli(q1).expand_by([num_particles]))
            # z's probability depends on y: non-mean-field coupling.
            pyro.sample("z", dist.Bernoulli(q2 * y + 0.10))

    logger.info("Computing gradients using surrogate loss")
    elbo = TraceEnum_ELBO(max_iarange_nesting=1,
                          strict_enumeration_warning=any([enumerate1]))
    elbo.loss_and_grads(model, config_enumerate(guide, default=enumerate1))
    actual_grad_q1 = pyro.param('q1').grad / num_particles
    actual_grad_q2 = pyro.param('q2').grad / num_particles

    logger.info("Computing analytic gradients")
    q1 = torch.tensor(pi1, requires_grad=True)
    q2 = torch.tensor(pi2, requires_grad=True)
    # Marginalizing y: KL at "y" plus the q1-weighted mixture of KLs at "z".
    elbo = kl_divergence(dist.Bernoulli(q1), dist.Bernoulli(0.33))
    elbo = elbo + q1 * kl_divergence(dist.Bernoulli(q2 + 0.10), dist.Bernoulli(0.65))
    elbo = elbo + (1.0 - q1) * kl_divergence(dist.Bernoulli(0.10), dist.Bernoulli(0.10))
    expected_grad_q1, expected_grad_q2 = grad(elbo, [q1, q2])

    # Tight tolerance when "y" is enumerated; loose when sampled.
    prec = 0.03 if enumerate1 is None else 0.001
    assert_equal(actual_grad_q1, expected_grad_q1, prec=prec, msg="".join([
        "\nq1 expected = {}".format(expected_grad_q1.data.cpu().numpy()),
        "\nq1 actual = {}".format(actual_grad_q1.data.cpu().numpy()),
    ]))
    assert_equal(actual_grad_q2, expected_grad_q2, prec=prec, msg="".join([
        "\nq2 expected = {}".format(expected_grad_q2.data.cpu().numpy()),
        "\nq2 actual = {}".format(actual_grad_q2.data.cpu().numpy()),
    ]))
def calc_gradient_penalty(netD, real_data, fake_data):
    """WGAN-GP gradient penalty.

    Module-level globals BATCH_SIZE, use_cuda, gpu and LAMBDA configure the
    batch shape, device placement and penalty weight.
    """
    alpha = torch.rand(BATCH_SIZE, 1, 1)
    alpha = alpha.expand(real_data.size())
    alpha = alpha.cuda(gpu) if use_cuda else alpha
    interpolates = alpha * real_data + ((1 - alpha) * fake_data)
    if use_cuda:
        interpolates = interpolates.cuda(gpu)
    interpolates = autograd.Variable(interpolates, requires_grad=True)
    disc_interpolates = netD(interpolates)
    # TODO: Make ConvBackward diffentiable
    gradients = autograd.grad(outputs=disc_interpolates, inputs=interpolates,
                              grad_outputs=torch.ones(disc_interpolates.size()).cuda(gpu)
                              if use_cuda else torch.ones(disc_interpolates.size()),
                              create_graph=True, retain_graph=True, only_inputs=True)[0]
    # FIX: flatten each sample before the norm. On 3-D inputs,
    # norm(2, dim=1) reduced only one axis, so the penalty was not the
    # per-sample gradient norm WGAN-GP prescribes.
    gradients = gradients.view(gradients.size(0), -1)
    gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean() * LAMBDA
    return gradient_penalty
def calc_gradient_penalty(D, real_data, fake_data, iwass_lambda, iwass_target):
    # Improved-Wasserstein (WGAN-GP style) penalty with a configurable target
    # gradient norm (iwass_target) and weight (iwass_lambda).
    # NOTE(review): returns the per-sample penalty tensor, not its mean --
    # presumably the caller reduces it; confirm at the call site.
    global mixing_factors, grad_outputs
    # Reuse cached CUDA buffers across calls; reallocate only when the batch
    # size changes. CUDA-only (torch.cuda.FloatTensor).
    if mixing_factors is None or real_data.size(0) != mixing_factors.size(0):
        mixing_factors = torch.cuda.FloatTensor(real_data.size(0), 1)
    # Fresh Uniform(0, 1) mixing weights.
    # NOTE(review): reconstructed from collapsed source -- confirm whether
    # uniform_() is meant to run on every call or only on (re)allocation.
    mixing_factors.uniform_()
    mixed_data = Variable(mul_rowwise(real_data, 1 - mixing_factors) +
                          mul_rowwise(fake_data, mixing_factors),
                          requires_grad=True)
    mixed_scores = D(mixed_data)
    # Cotangent buffer of ones, also cached by batch size.
    if grad_outputs is None or mixed_scores.size(0) != grad_outputs.size(0):
        grad_outputs = torch.cuda.FloatTensor(mixed_scores.size())
        grad_outputs.fill_(1.)
    gradients = grad(outputs=mixed_scores, inputs=mixed_data,
                     grad_outputs=grad_outputs, create_graph=True,
                     retain_graph=True, only_inputs=True)[0]
    # Per-sample gradient norm over the flattened sample.
    gradients = gradients.view(gradients.size(0), -1)
    gradient_penalty = ((gradients.norm(2, dim=1) - iwass_target) ** 2) * iwass_lambda / (iwass_target ** 2)
    return gradient_penalty
def _gradient_penalty(self, real_samples, fake_samples, kwargs): """ Compute the norm of the gradients for each sample in a batch, and penalize anything on either side of unit norm """ import torch from torch.autograd import Variable, grad real_samples = real_samples.view(fake_samples.shape) subset_size = real_samples.shape[0] real_samples = real_samples[:subset_size] fake_samples = fake_samples[:subset_size] alpha = torch.rand(subset_size) if self.use_cuda: alpha = alpha.cuda() alpha = alpha.view((-1,) + ((1,) * (real_samples.dim() - 1))) interpolates = alpha * real_samples + ((1 - alpha) * fake_samples) interpolates = Variable(interpolates, requires_grad=True) if self.use_cuda: interpolates = interpolates.cuda() d_output = self.critic(interpolates, **kwargs) grad_ouputs = torch.ones(d_output.size()) if self.use_cuda: grad_ouputs = grad_ouputs.cuda() gradients = grad( outputs=d_output, inputs=interpolates, grad_outputs=grad_ouputs, create_graph=True, retain_graph=True, only_inputs=True)[0] return ((gradients.norm(2, dim=1) - 1) ** 2).mean() * 10
def test_elbo_categoricals(enumerate1, enumerate2, enumerate3, max_iarange_nesting):
    """Check TraceEnum_ELBO loss and gradients for three independent
    Categorical sites against the sum of their exact KL divergences."""
    pyro.clear_param_store()
    p1 = torch.tensor([0.6, 0.4])
    p2 = torch.tensor([0.3, 0.3, 0.4])
    p3 = torch.tensor([0.1, 0.2, 0.3, 0.4])
    q1 = pyro.param("q1", torch.tensor([0.4, 0.6], requires_grad=True))
    q2 = pyro.param("q2", torch.tensor([0.4, 0.3, 0.3], requires_grad=True))
    q3 = pyro.param("q3", torch.tensor([0.4, 0.3, 0.2, 0.1], requires_grad=True))

    def model():
        pyro.sample("x1", dist.Categorical(p1))
        pyro.sample("x2", dist.Categorical(p2))
        pyro.sample("x3", dist.Categorical(p3))

    def guide():
        pyro.sample("x1", dist.Categorical(pyro.param("q1")), infer={"enumerate": enumerate1})
        pyro.sample("x2", dist.Categorical(pyro.param("q2")), infer={"enumerate": enumerate2})
        pyro.sample("x3", dist.Categorical(pyro.param("q3")), infer={"enumerate": enumerate3})

    kl = (kl_divergence(dist.Categorical(q1), dist.Categorical(p1)) +
          kl_divergence(dist.Categorical(q2), dist.Categorical(p2)) +
          kl_divergence(dist.Categorical(q3), dist.Categorical(p3)))
    expected_loss = kl.item()
    expected_grads = grad(kl, [q1, q2, q3])

    elbo = TraceEnum_ELBO(max_iarange_nesting=max_iarange_nesting,
                          strict_enumeration_warning=any([enumerate1, enumerate2, enumerate3]))
    actual_loss = elbo.loss_and_grads(model, guide)
    actual_grads = [q1.grad, q2.grad, q3.grad]

    # Tight tolerance: no sampling noise when sites are enumerated.
    assert_equal(actual_loss, expected_loss, prec=0.001, msg="".join([
        "\nexpected loss = {}".format(expected_loss),
        "\n actual loss = {}".format(actual_loss),
    ]))
    for actual_grad, expected_grad in zip(actual_grads, expected_grads):
        assert_equal(actual_grad, expected_grad, prec=0.001, msg="".join([
            "\nexpected grad = {}".format(expected_grad.detach().cpu().numpy()),
            "\n actual grad = {}".format(actual_grad.detach().cpu().numpy()),
        ]))
def test_svi_enum(Elbo, irange_dim, enumerate1, enumerate2):
    """Check that multi-particle TraceEnum_ELBO (inner particles), averaged
    over repeated loss_and_grads calls (outer particles), matches the exact
    KL loss and its gradient w.r.t. the unconstrained parameter.

    NOTE(review): the Elbo argument is never read here -- TraceEnum_ELBO is
    constructed directly; confirm against the test parametrization.
    """
    pyro.clear_param_store()
    num_particles = 10
    q = pyro.param("q", torch.tensor(0.75), constraint=constraints.unit_interval)
    p = 0.2693204236205713  # for which kl(Bernoulli(q), Bernoulli(p)) = 0.5

    def model():
        pyro.sample("x", dist.Bernoulli(p))
        for i in pyro.irange("irange", irange_dim):
            pyro.sample("y_{}".format(i), dist.Bernoulli(p))

    def guide():
        q = pyro.param("q")
        pyro.sample("x", dist.Bernoulli(q), infer={"enumerate": enumerate1})
        for i in pyro.irange("irange", irange_dim):
            pyro.sample("y_{}".format(i), dist.Bernoulli(q), infer={"enumerate": enumerate2})

    # One "x" site plus irange_dim "y" sites, all with the same KL.
    kl = (1 + irange_dim) * kl_divergence(dist.Bernoulli(q), dist.Bernoulli(p))
    expected_loss = kl.item()
    # Gradients live on the unconstrained (pre-transform) parameter.
    expected_grad = grad(kl, [q.unconstrained()])[0]

    inner_particles = 2
    outer_particles = num_particles // inner_particles
    elbo = TraceEnum_ELBO(max_iarange_nesting=0,
                          strict_enumeration_warning=any([enumerate1, enumerate2]),
                          num_particles=inner_particles)
    actual_loss = sum(elbo.loss_and_grads(model, guide)
                      for i in range(outer_particles)) / outer_particles
    actual_grad = q.unconstrained().grad / outer_particles

    # Loose tolerances: few particles, high sampling noise.
    assert_equal(actual_loss, expected_loss, prec=0.3, msg="".join([
        "\nexpected loss = {}".format(expected_loss),
        "\n actual loss = {}".format(actual_loss),
    ]))
    assert_equal(actual_grad, expected_grad, prec=0.5, msg="".join([
        "\nexpected grad = {}".format(expected_grad.detach().cpu().numpy()),
        "\n actual grad = {}".format(actual_grad.detach().cpu().numpy()),
    ]))
def test_elbo_berns(enumerate1, enumerate2, enumerate3):
    """Check TraceEnum_ELBO loss/gradients for three independent Bernoulli
    sites sharing one guide parameter, against the summed exact KLs."""
    pyro.clear_param_store()
    # Exact (1 particle, tight prec) when all sites are enumerated;
    # otherwise Monte Carlo with many particles and loose prec.
    num_particles = 1 if all([enumerate1, enumerate2, enumerate3]) else 10000
    prec = 0.001 if all([enumerate1, enumerate2, enumerate3]) else 0.1
    q = pyro.param("q", torch.tensor(0.75, requires_grad=True))

    def model():
        with pyro.iarange("particles", num_particles):
            pyro.sample("x1", dist.Bernoulli(0.1).expand_by([num_particles]))
            pyro.sample("x2", dist.Bernoulli(0.2).expand_by([num_particles]))
            pyro.sample("x3", dist.Bernoulli(0.3).expand_by([num_particles]))

    def guide():
        q = pyro.param("q")
        with pyro.iarange("particles", num_particles):
            pyro.sample("x1", dist.Bernoulli(q).expand_by([num_particles]),
                        infer={"enumerate": enumerate1})
            pyro.sample("x2", dist.Bernoulli(q).expand_by([num_particles]),
                        infer={"enumerate": enumerate2})
            pyro.sample("x3", dist.Bernoulli(q).expand_by([num_particles]),
                        infer={"enumerate": enumerate3})

    kl = sum(kl_divergence(dist.Bernoulli(q), dist.Bernoulli(p)) for p in [0.1, 0.2, 0.3])
    expected_loss = kl.item()
    expected_grad = grad(kl, [q])[0]

    elbo = TraceEnum_ELBO(max_iarange_nesting=1,
                          strict_enumeration_warning=any([enumerate1, enumerate2, enumerate3]))
    actual_loss = elbo.loss_and_grads(model, guide) / num_particles
    actual_grad = q.grad / num_particles

    assert_equal(actual_loss, expected_loss, prec=prec, msg="".join([
        "\nexpected loss = {}".format(expected_loss),
        "\n actual loss = {}".format(actual_loss),
    ]))
    assert_equal(actual_grad, expected_grad, prec=prec, msg="".join([
        "\nexpected grads = {}".format(expected_grad.detach().cpu().numpy()),
        "\n actual grads = {}".format(actual_grad.detach().cpu().numpy()),
    ]))
def test_model(cuda, data_loader, mymodel, mymodel_clone, val_iter, task_lr, meta_optimizer, zero_shot=False):
    """Validation loop for a MAML-style meta-learner: adapt on each task's
    support set (10 inner steps -- the first on ``mymodel``, the remaining 9
    on ``mymodel_clone``), then score on the query set.

    Returns (mean query accuracy, mean query loss) over val_iter tasks.
    NOTE(review): the ``task_lr`` and ``zero_shot`` parameters are never
    read -- the inner learning rate comes from the global ``args.task_lr``;
    confirm against the callers.
    """
    meta_loss_final = 0.0
    accs = 0.0
    mymodel.eval()
    for it in range(val_iter):
        meta_loss = 0.0
        # mymodel.eval()
        class_name, support, support_label, query, query_label = next(data_loader)
        if cuda:
            support_label, query_label = support_label.cuda(), query_label.cuda()
        '''First Step'''
        loss_s, right_s, query1, class_name1 = train_one_batch(
            args, class_name, support, support_label, query, query_label, mymodel, args.task_lr, it)
        zero_grad(mymodel.parameters())
        # Separate grad calls for the two heads; retain_graph so the second
        # call can reuse the same graph.
        grads_fc = autograd.grad(loss_s, mymodel.fc.parameters(), retain_graph=True)
        grads_mlp = autograd.grad(loss_s, mymodel.mlp.parameters())
        fast_weights_fc, orderd_params = mymodel.cloned_fc_dict(), OrderedDict()
        fast_weights_mlp = mymodel.cloned_mlp_dict()
        # NOTE(review): the loop variable `grad` shadows any module-level
        # `grad` for the rest of this function body.
        for (key, val), grad in zip(mymodel.fc.named_parameters(), grads_fc):
            fast_weights_fc[key] = orderd_params['fc.' + key] = val - args.task_lr * grad
        for (key, val), grad in zip(mymodel.mlp.named_parameters(), grads_mlp):
            fast_weights_mlp[key] = orderd_params['mlp.' + key] = val - args.task_lr * grad
        name_list = []
        for name in mymodel_clone.state_dict():
            name_list.append(name)
        # Copy the adapted weights into the clone in place via its state_dict.
        for name in orderd_params:
            if name in name_list:
                mymodel_clone.state_dict()[name].copy_(orderd_params[name])
        '''second-10th step'''
        for _ in range(10 - 1):
            loss_s, right_s, query1, class_name1 = train_one_batch(
                args, class_name, support, support_label, query, query_label, mymodel_clone, args.task_lr, it)
            zero_grad(mymodel_clone.parameters())
            grads_fc = autograd.grad(loss_s, mymodel_clone.fc.parameters(), retain_graph=True)
            grads_mlp = autograd.grad(loss_s, mymodel_clone.mlp.parameters())
            fast_weights_fc, orderd_params = mymodel_clone.cloned_fc_dict(), OrderedDict()
            fast_weights_mlp = mymodel_clone.cloned_mlp_dict()
            for (key, val), grad in zip(mymodel_clone.fc.named_parameters(), grads_fc):
                fast_weights_fc[key] = orderd_params['fc.' + key] = val - args.task_lr * grad
            for (key, val), grad in zip(mymodel_clone.mlp.named_parameters(), grads_mlp):
                fast_weights_mlp[key] = orderd_params['mlp.' + key] = val - args.task_lr * grad
            name_list = []
            for name in mymodel_clone.state_dict():
                name_list.append(name)
            for name in orderd_params:
                if name in name_list:
                    mymodel_clone.state_dict()[name].copy_(orderd_params[name])
        # ----- compute loss and accuracy on the query set -----
        loss_q, right_q = train_q(args, class_name, query, query_label, mymodel_clone)
        meta_loss = meta_loss + loss_q
        meta_loss_final += loss_q
        accs += right_q
        # NOTE(review): gradients are computed but meta_optimizer.step() is
        # never called -- presumably intentional during validation; confirm.
        meta_optimizer.zero_grad()
        meta_loss.backward()
        if (it + 1) % 200 == 0:
            print('step: {0:4} | val_loss:{1:3.6f}, val_accuracy: {2:3.2f}%'.
                  format(it + 1, meta_loss_final / (it + 1), 100 * accs / (it + 1)))
        # torch.cuda.empty_cache()
    return accs / val_iter, meta_loss_final / val_iter
# NOTE(review): fragment -- the opening of this statement (the list `Ws` and
# the head of the `Qs` initializer) lies before this chunk, and the inner
# training loop continues past its end; this excerpt does not parse alone.
# Qs appears to hold one pair of Kronecker preconditioner factors per weight.
    torch.eye(W.shape[0], device=device),
    torch.eye(W.shape[1], device=device)
] for W in Ws]
step_size = 0.1
grad_norm_clip_thr = 1e8
TrainLoss, TestLoss = [], []
for epoch in range(10):
    t0 = time.time()
    for batch_idx, (data, target) in enumerate(train_loader):
        loss = train_loss(data.to(device), target.to(device))
        TrainLoss.append(loss.item())
        if batch_idx % 100 == 0:
            print('Epoch: {}; batch: {}; train loss: {}'.format(
                epoch, batch_idx, TrainLoss[-1]))
        # create_graph=True keeps the graph so Hessian-vector products can
        # be taken from `grads` below.
        grads = grad(loss, Ws, create_graph=True)
        if batch_idx % update_preconditioner_every == 0:
            for num_Qs_update in range(update_preconditioner_times):
                # Hessian-vector product against a random Gaussian probe v.
                v = [torch.randn(W.shape, device=device) for W in Ws]
                Hv = grad(grads, Ws, grad_outputs=v, retain_graph=True)
                with torch.no_grad():
                    Qs = [
                        psgd.update_precond_kron(q[0], q[1], dw, dg)
                        for (q, dw, dg) in zip(Qs, v, Hv)
                    ]
        with torch.no_grad():
            pre_grads = [
                psgd.precond_grad_kron(q[0], q[1], g)
                for (q, g) in zip(Qs, grads)
            ]
def backward(self):
    # Vector-Jacobian product: differentiates self.Y w.r.t. self.X and
    # self.alpha using self.Y itself as the cotangent (grad_outputs).
    # NOTE(review): the returned gradient tuple is discarded, and
    # autograd.grad does not populate .grad on its inputs -- confirm this
    # call is intended purely for side effects (e.g. registered hooks).
    # NOTE(review): grad_outputs=(self.Y) is not a tuple (no trailing
    # comma); autograd accepts a bare tensor here, so behavior is unchanged.
    grad(outputs=(self.Y,), inputs=(self.X, self.alpha), grad_outputs=(self.Y))
def step(self, ob_tot, lp1, lp2): grad_x = autograd.grad(lp1, self.max_params, create_graph=True, retain_graph=True) # can remove create graph grad_x_vec = torch.cat([g.contiguous().view(-1, 1) for g in grad_x]) grad_y = autograd.grad(lp2, self.min_params, create_graph=True, retain_graph=True) grad_y_vec = torch.cat([g.contiguous().view(-1, 1) for g in grad_y]) tot_grad_y = autograd.grad(ob_tot.mean(), self.min_params, create_graph=True, retain_graph=True) tot_grad_y = torch.cat( [g.contiguous().view(-1, 1) for g in tot_grad_y]) tot_grad_xy = autograd.grad(tot_grad_y, self.max_params, grad_outputs=grad_y_vec, retain_graph=True) hvp_x_vec = torch.cat( [g.contiguous().view(-1, 1) for g in tot_grad_xy]) #tot_xy tot_grad_x = autograd.grad(ob_tot.mean(), self.max_params, create_graph=True, retain_graph=True) tot_grad_x = torch.cat( [g.contiguous().view(-1, 1) for g in tot_grad_x]) tot_grad_yx = autograd.grad(tot_grad_x, self.min_params, grad_outputs=grad_x_vec, retain_graph=True) hvp_y_vec = torch.cat( [g.contiguous().view(-1, 1) for g in tot_grad_yx]) p_x = torch.add(grad_x_vec, -self.lr * hvp_x_vec) p_y = torch.add(grad_y_vec, self.lr * hvp_y_vec) if self.collect_info: self.norm_px = torch.norm(p_x, p=2) self.norm_py = torch.norm(p_y, p=2) self.timer = time.time() if self.solve_x: cg_y, self.iter_num = conjugate_gradient( grad_x=grad_y_vec, grad_y=grad_x_vec, tot_grad_x=tot_grad_y, tot_grad_y=tot_grad_x, x_params=self.min_params, y_params=self.max_params, b=p_y, x=self.old_y, nsteps=p_y.shape[0], # // 10000, lr=self.lr, device=self.device) hcg = autograd.grad(tot_grad_y, self.max_params, grad_outputs=cg_y, retain_graph=False) # yx hcg = torch.cat([g.contiguous().view(-1, 1) for g in hcg]) cg_x = torch.add(grad_x_vec, -self.lr * hcg) self.old_x = cg_x else: cg_x, self.iter_num = conjugate_gradient( grad_x=grad_x_vec, grad_y=grad_y_vec, tot_grad_x=tot_grad_x, tot_grad_y=tot_grad_y, x_params=self.max_params, y_params=self.min_params, b=p_x, x=self.old_x, 
nsteps=p_x.shape[0], # // 10000, lr=self.lr, device=self.device) hcg = autograd.grad(tot_grad_x, self.min_params, grad_outputs=cg_x, retain_graph=False) # yx hcg = torch.cat([g.contiguous().view(-1, 1) for g in hcg]) cg_y = torch.add(grad_y_vec, self.lr * hcg) self.old_y = cg_y if self.collect_info: self.timer = time.time() - self.timer index = 0 for p in self.max_params: if self.weight_decay != 0: p.data.add_(-self.weight_decay * p) p.data.add_(self.lr * cg_x[index:index + p.numel()].reshape(p.shape)) index += p.numel() if index != cg_x.numel(): raise ValueError('CG size mismatch') index = 0 for p in self.min_params: if self.weight_decay != 0: p.data.add_(-self.weight_decay * p) p.data.add_(-self.lr * cg_y[index:index + p.numel()].reshape(p.shape)) index += p.numel() if index != cg_y.numel(): raise ValueError('CG size mismatch') if self.collect_info: self.norm_gx = torch.norm(grad_x_vec, p=2) self.norm_gy = torch.norm(grad_y_vec, p=2) self.norm_cgx = torch.norm(cg_x, p=2) self.norm_cgy = torch.norm(cg_y, p=2) self.solve_x = False if self.solve_x else True
def D_logistic_r1(real_image, Discriminator, gamma=10.0): reals = Variable(real_image, requires_grad=True).to(real_image.device) real_logit = Discriminator(reals) real_grads = grad(torch.sum(real_logit), reals)[0] gradient_pen = torch.sum(torch.mul(real_grads, real_grads), dim=[1, 2, 3]) return gradient_pen * (gamma * 0.5)
def train(self):
    """Main GAN training loop.

    Alternates a discriminator update (BCE against a mixing-coefficient
    target on interpolated samples) and a generator update, while logging
    the gradient-norm statistic of D at real samples to ``results.txt``.
    Saves checkpoints, an animation, and loss plots at the end.
    """
    self.train_hist = {}
    self.train_hist['D_loss'] = []
    self.train_hist['G_loss'] = []
    self.train_hist['per_epoch_time'] = []
    self.train_hist['total_time'] = []
    self.train_hist['D_norm'] = []
    f = open("%s/results.txt" % self.log_dir, "w")
    f.write("d_loss,g_loss,d_norm\n")
    if self.gpu_mode:
        self.y_real_, self.y_fake_ = Variable(torch.ones(self.batch_size, 1).cuda()), Variable(torch.zeros(self.batch_size, 1).cuda())
    else:
        self.y_real_, self.y_fake_ = Variable(torch.ones(self.batch_size, 1)), Variable(torch.zeros(self.batch_size, 1))
    #for iter, ((x1_,_), (x2_,_)) in enumerate(zip(self.data_loader, self.data_loader)):
    # import pdb
    # pdb.set_trace()
    self.D.train()
    print('training start!!')
    start_time = time.time()
    for epoch in range(self.epoch):
        self.G.train()
        epoch_start_time = time.time()
        for iter, (x_, _) in enumerate(self.data_loader):
            # Drop the final, possibly-incomplete batch.
            if iter == self.data_loader.dataset.__len__() // self.batch_size:
                break
            z_ = torch.rand((self.batch_size, self.z_dim))
            if self.gpu_mode:
                x_, z_ = Variable(x_.cuda(), requires_grad=True), Variable(z_.cuda())
            else:
                x_, z_ = Variable(x_, requires_grad=True), Variable(z_)
            # update D network
            D_real = self.D(x_)
            # compute gradient penalty statistic ||∇_x D(x)|| at real samples
            # NOTE(review): grad_outputs is moved to .cuda() unconditionally —
            # this will fail when self.gpu_mode is False on a CPU-only machine.
            grad_wrt_x = grad(outputs=D_real, inputs=x_, grad_outputs=torch.ones(D_real.size()).cuda(), create_graph=True, retain_graph=True, only_inputs=True)[0]
            g_norm = ((grad_wrt_x.view(grad_wrt_x.size()[0], -1).norm(2, 1) - 1) ** 2).mean()
            self.train_hist['D_norm'].append(g_norm.data.item())
            self.D_optimizer.zero_grad()
            G_ = self.G(z_).detach()
            # Mix real and fake data with a random coefficient; the BCE target
            # is scaled by the same alpha.
            alpha = float(np.random.random())
            Xz = Variable(alpha*x_.data + (1.-alpha)*G_.data)
            D_Xz = self.D(Xz)
            D_loss = self.BCE_loss(D_Xz, alpha*self.y_real_)
            self.train_hist['D_loss'].append(D_loss.data.item())
            D_loss.backward()
            self.D_optimizer.step()
            # update G network
            self.G_optimizer.zero_grad()
            G_ = self.G(z_)
            D_fake = self.D(G_)
            G_loss = self.BCE_loss(D_fake, self.y_real_)
            self.train_hist['G_loss'].append(G_loss.data.item())
            G_loss.backward()
            self.G_optimizer.step()
            if ((iter + 1) % 100) == 0:
                print("Epoch: [%2d] [%4d/%4d] D_loss: %.8f, G_loss: %.8f, D_norm: %.8f" % ((epoch + 1), (iter + 1), self.data_loader.dataset.__len__() // self.batch_size, D_loss.data.item(), G_loss.data.item(), g_norm.data.item()))
                f.write("%.8f,%.8f,%.8f\n" % (D_loss.data.item(), G_loss.data.item(), g_norm.data.item()))
                f.flush()
        self.train_hist['per_epoch_time'].append(time.time() - epoch_start_time)
        self.visualize_results((epoch+1))
    self.train_hist['total_time'].append(time.time() - start_time)
    print("Avg one epoch time: %.2f, total %d epochs time: %.2f" % (np.mean(self.train_hist['per_epoch_time']), self.epoch, self.train_hist['total_time'][0]))
    print("Training finish!... save training results")
    f.close()
    self.save()
    utils.generate_animation(self.result_dir + '/' + self.dataset + '/' + self.model_name + '/' + self.model_name, self.epoch)
    utils.loss_plot(self.train_hist, os.path.join(self.save_dir, self.dataset, self.model_name), self.model_name)
def _get_gradient(inp, output): gradient = autograd.grad(outputs=output, inputs=inp, grad_outputs=torch.ones_like(output), create_graph=True, retain_graph=True, only_inputs=True, allow_unused=True)[0] return gradient
Tensor(np.random.normal(0, 1, (imgs.shape[0], opt.latent_dim)))) # Generate a batch of images fake_imgs = generator(z) # Real images real_validity = discriminator(real_imgs) # Fake images fake_validity = discriminator(fake_imgs) # Compute W-div gradient penalty real_grad_out = Variable(Tensor(real_imgs.size(0), 1).fill_(1.0), requires_grad=False) real_grad = autograd.grad(real_validity, real_imgs, real_grad_out, create_graph=True, retain_graph=True, only_inputs=True)[0] real_grad_norm = real_grad.view(real_grad.size(0), -1).pow(2).sum(1)**(p / 2) fake_grad_out = Variable(Tensor(fake_imgs.size(0), 1).fill_(1.0), requires_grad=False) fake_grad = autograd.grad(fake_validity, fake_imgs, fake_grad_out, create_graph=True, retain_graph=True, only_inputs=True)[0] fake_grad_norm = fake_grad.view(fake_grad.size(0), -1).pow(2).sum(1)**(p / 2)
def HamiltonianSys(p, q, K): H = Hamiltonian(p, q, K) Gp, Gq = grad(H, (p, q), create_graph=True) return -Gq, Gp
from likelihood import likelihood from hmc import hmc_sampler import sys import numpy import torch from torch.autograd import Variable,grad dim = 3 q = Variable(torch.rand(dim),requires_grad=True) SigInv = Variable(torch.eye(dim),requires_grad=False) potentialE = q.dot(SigInv.mv(q*q)) #print(q.data) g = grad(potentialE,q,create_graph=True)[0] #print(g) gsplit = torch.split(g,1,dim=0) #print(gsplit) H = Variable(torch.rand(dim,dim)) for i in range(dim): H[i,:] = grad(gsplit[i],q,create_graph=True)[0] #print(H) dH = Variable(torch.rand(dim,dim,dim)) print(H) #exit() #x = Variable(torch.rand(1),requires_grad=True) #y = 0.5 *x #o = grad(y,x,create_graph=True) #print(o) #oo = grad(Variable(torch.rand(1),requires_grad=True),x) #o = grad(H[0,0],q) #print(o) for i in range(dim): for j in range(dim):
# .. math:: # # \langle \text{d} c . \delta y , e \rangle = \langle g , \delta y \rangle = \langle \delta y , \partial c . e \rangle # # Backpropagation is all about computing the tensor :math:`g=\partial c . e` efficiently, for arbitrary values of :math:`e`: # Declare a new tensor of shape (M,3) used as the input of the gradient operator. # It can be understood as a "gradient with respect to the output c" # and is thus called "grad_output" in the documentation of PyTorch. e = torch.rand_like(c) # Call the gradient op: start = time.time() # PyTorch remark : grad(c, y, e) alone outputs a length 1 tuple, hence the need for [0]. g = grad(c, y, e)[0] # g = [∂_y c].e print('Time to compute gradient of convolution operation with KeOps: ', round(time.time() - start, 5), 's') #################################################################### # The equivalent code with a "vanilla" pytorch implementation g_torch = ((p - a.transpose(0, 1))[:, None] **2 * torch.exp(x.transpose(0, 1)[:, :, None] \ + y.transpose(0, 1)[:, None, :]) * e.transpose(0, 1)[:, :, None] ).sum(dim=1).transpose(0, 1) # Plot the results next to each other: for i in range(3): plt.subplot(3, 1, i + 1) plt.plot(g.detach().cpu().numpy()[:40, i], '-', label='KeOps') plt.plot(g_torch.detach().cpu().numpy()[:40, i], '--', label='PyTorch')
# encoding=GBK """ 利用torch.autograd.grad求二阶导 """ import torch from torch import autograd ########################################### # 自动求导 # a,b,c的值分别是1,2,3 # x 的值是 1 ########################################### x = torch.tensor(1) a = torch.tensor(1.0, requires_grad=True) b = torch.tensor(2.0, requires_grad=True) c = torch.tensor(3.0, requires_grad=True) y = a**2 * x + b * x + c print('before: ', a.grad, b.grad, c.grad) grades = autograd.grad(y, [a, b, c]) print('after: ', grades[0], grades[1], grades[2])
def train_model(mymodel, mymodel_clone, args, sample_class_weights, val_step=200):
    """Meta-train ``mymodel`` with a MAML-style inner/outer loop.

    Each outer iteration draws ``args.B`` episodes; for each episode the
    fc/mlp heads are adapted with 5 manual inner SGD steps (first on
    ``mymodel``, then on ``mymodel_clone``), the query loss is accumulated,
    and a single outer AdamW step updates ``mymodel``. Periodically
    validates, checkpoints the best model, and occasionally tests.

    :param mymodel: model being meta-trained (coder + fc + mlp submodules).
    :param mymodel_clone: scratch model receiving the adapted fast weights.
    :param args: experiment configuration namespace.
    :param sample_class_weights: class-sampling weights for the data loaders.
    :param val_step: validate/log every this many outer iterations.
    """
    n_way_k_shot = str(args.N) + '-way-' + str(args.K) + '-shot'
    print('Start training ' + n_way_k_shot)
    cuda = torch.cuda.is_available()
    if cuda:
        mymodel = mymodel.cuda()
        mymodel_clone = mymodel_clone.cuda()
    data_loader = {}
    data_loader['train'] = get_dataloader(args, args.train, args.class_name_file, args.N, args.K, args.L, args.noise_rate, sample_class_weights=sample_class_weights)
    data_loader['val'] = get_dataloader(args, args.val, args.class_name_file, args.N, args.K, args.L, args.noise_rate, sample_class_weights=sample_class_weights, train=False)
    data_loader['test'] = get_dataloader(args, args.test, args.class_name_file, args.N, args.K, args.L, args.noise_rate, sample_class_weights=sample_class_weights, train=False)
    # Per-submodule learning rates; outer lr=1 so the group lrs are used as-is.
    optim_params = [{'params': mymodel.coder.parameters(), 'lr': 5e-5}]
    optim_params.append({'params': mymodel.fc.parameters(), 'lr': args.meta_lr})
    optim_params.append({'params': mymodel.mlp.parameters(), 'lr': args.meta_lr})
    meta_optimizer = AdamW(optim_params, lr=1)
    # mymodel1_meta_opt = AdamW(mymodel.parameters(), lr=args.meta_lr)
    # mymodel2_task_opt = AdamW(mymodel.parameters(), lr=args.task_lr)
    best_acc, best_step, best_test_acc, best_test_step, best_val_loss, best_changed = 0.0, 0, 0.0, 0, 100.0, False
    iter_loss, iter_right, iter_sample = 0.0, 0.0, 0.0
    count = 0       # val checks since the last improvement (early-stop counter)
    count_test = 0  # number of "best" checkpoints so far
    for it in range(args.Train_iter):
        mymodel.train()
        meta_loss, meta_right = 0.0, 0.0
        # meta_loss = []
        # meta_right = 0.0
        # torch.save(mymodel2.state_dict(), 'model_checkpoint/checkpoint.{}th.tar'.format(it))
        for batch in range(args.B):
            class_name, support, support_label, query, query_label = next(data_loader['train'])
            # [N, length], tokens:{[N*K,length]}, [1,N*K], tokens:{[N*L,length]}, [1,N*L]
            if cuda:
                support_label, query_label = support_label.cuda(), query_label.cuda()
            '''First Step'''
            # Inner step 1: adapt from the ORIGINAL model's weights.
            loss_s, right_s, query1, class_name1 = train_one_batch(args, class_name, support, support_label, query, query_label, mymodel, args.task_lr, it)
            zero_grad(mymodel.parameters())
            grads_fc = autograd.grad(loss_s, mymodel.fc.parameters(), retain_graph=True)
            grads_mlp = autograd.grad(loss_s, mymodel.mlp.parameters())
            fast_weights_fc, orderd_params = mymodel.cloned_fc_dict(), OrderedDict()
            fast_weights_mlp = mymodel.cloned_mlp_dict()
            # Manual SGD on the heads: fast_weight = weight - task_lr * grad.
            for (key, val), grad in zip(mymodel.fc.named_parameters(), grads_fc):
                fast_weights_fc[key] = orderd_params['fc.' + key] = val - args.task_lr * grad
            for (key, val), grad in zip(mymodel.mlp.named_parameters(), grads_mlp):
                fast_weights_mlp[key] = orderd_params['mlp.' + key] = val - args.task_lr * grad
            # Copy fast weights into the clone's state dict in place.
            name_list = []
            for name in mymodel_clone.state_dict():
                name_list.append(name)
            for name in orderd_params:
                if name in name_list:
                    mymodel_clone.state_dict()[name].copy_(orderd_params[name])
            for _ in range(5 - 1):
                '''2-5th Step'''
                # Inner steps 2..5: repeat on the CLONE's weights.
                loss_s, right_s, query1, class_name1 = train_one_batch(args, class_name, support, support_label, query, query_label, mymodel_clone, args.task_lr, it)
                zero_grad(mymodel_clone.parameters())
                grads_fc = autograd.grad(loss_s, mymodel_clone.fc.parameters(), retain_graph=True)
                grads_mlp = autograd.grad(loss_s, mymodel_clone.mlp.parameters())
                fast_weights_fc, orderd_params = mymodel_clone.cloned_fc_dict(), OrderedDict()
                fast_weights_mlp = mymodel_clone.cloned_mlp_dict()
                for (key, val), grad in zip(mymodel_clone.fc.named_parameters(), grads_fc):
                    fast_weights_fc[key] = orderd_params['fc.' + key] = val - args.task_lr * grad
                for (key, val), grad in zip(mymodel_clone.mlp.named_parameters(), grads_mlp):
                    fast_weights_mlp[key] = orderd_params['mlp.' + key] = val - args.task_lr * grad
                name_list = []
                for name in mymodel_clone.state_dict():
                    name_list.append(name)
                for name in orderd_params:
                    if name in name_list:
                        mymodel_clone.state_dict()[name].copy_(orderd_params[name])
            # ----- compute loss and acc on the query set -----
            loss_q, right_q = train_q(args, class_name, query, query_label, mymodel_clone)
            meta_loss = meta_loss + loss_q
            meta_right = meta_right + right_q
        meta_loss_avg = meta_loss / args.B
        meta_right_avg = meta_right / args.B
        # mymodel2.load_state_dict(torch.load('model_checkpoint/checkpoint.{}th.tar'.format(it)))
        # deep_copy(mymodel1, mymodel2)
        # Outer (meta) update on the original model.
        meta_optimizer.zero_grad()
        meta_loss_avg.backward()
        meta_optimizer.step()
        iter_loss += meta_loss_avg
        iter_right += meta_right_avg
        if (it + 1) % val_step == 0:
            print('step: {0:4} | loss: {1:2.6f}, accuracy: {2:3.2f}%'.format(it + 1, iter_loss / val_step, 100 * iter_right / val_step))
            iter_loss, iter_right, iter_sample = 0.0, 0.0, 0.0
            count += 1
            val_acc, val_loss = test_model(cuda, data_loader['val'], mymodel, mymodel_clone, args.Val_iter, args.task_lr, meta_optimizer)
            # print('[EVAL] | loss: {0:2.6f}, accuracy: {1:2.2f}%'.format(val_loss, val_acc * 100))
            print('[EVAL] | accuracy: {0:2.2f}%'.format(val_acc * 100))
            if val_acc >= best_acc:
                print('Best checkpoint!')
                count = 0
                count_test += 1
                torch.save(mymodel.state_dict(), 'model_checkpoint/checkpoint.{0}th_best_model{1}_way_{2}_shot_Lis25_isNPM_isSW.tar'.format(it + 1, args.N, args.K))
                best_acc, best_step, best_val_loss, best_changed = val_acc, (it + 1), val_loss, True
                # NOTE(review): chunk boundary makes the indentation of this
                # test block ambiguous; placed inside the best-checkpoint branch
                # since count_test is only incremented here — confirm.
                if count_test % 5 == 0:
                    test_acc, test_loss = test_model(cuda, data_loader['test'], mymodel, mymodel_clone, args.Val_iter, args.task_lr, meta_optimizer)
                    print('[TEST] | loss: {0:2.6f}, accuracy: {1:2.2f}%'.format(test_loss, test_acc * 100))
            # torch.cuda.empty_cache()
            # Early stop after 20 validations without improvement.
            if count > 20:
                break
    print("\n####################\n")
    print('Finish training model! Best val acc: ' + str(best_acc) + ' at step ' + str(best_step))
def newton_step_2d(loss, x, trust_radius=None):
    """
    Performs a Newton update step to minimize loss on a batch of 2-dimensional
    variables, optionally regularizing to constrain to a trust region.

    ``loss`` must be twice-differentiable as a function of ``x``. If ``loss``
    is ``2+d``-times differentiable, then the return value of this function is
    ``d``-times differentiable.

    When ``loss`` is interpreted as a negative log probability density, then
    the return value of this function can be used to construct a Laplace
    approximation ``MultivariateNormal(mode,cov)``.

    .. warning:: Take care to detach the result of this function when used in
        an optimization loop. If you forget to detach the result of this
        function during optimization, then backprop will propagate through
        the entire iteration process, and worse will compute two extra
        derivatives for each step.

    Example use inside a loop::

        x = torch.zeros(1000, 2)  # arbitrary initial value
        for step in range(100):
            x = x.detach()           # block gradients through previous steps
            x.requires_grad = True   # ensure loss is differentiable wrt x
            loss = my_loss_function(x)
            x = newton_step_2d(loss, x, trust_radius=1.0)
        # the final x is still differentiable

    :param torch.Tensor loss: A scalar function of ``x`` to be minimized.
    :param torch.Tensor x: A dependent variable with rightmost size of 2.
    :param float trust_radius: An optional trust region trust_radius. The
        updated value ``mode`` of this function will be within
        ``trust_radius`` of the input ``x``.
    :return: A pair ``(mode, cov)`` where ``mode`` is an updated tensor
        of the same shape as the original value ``x``, and ``cov`` is an
        estimate of the covariance 2x2 matrix with
        ``cov.shape == x.shape[:-1] + (2,2)``.
    :rtype: tuple
    """
    if loss.shape != ():
        raise ValueError('Expected loss to be a scalar, actual shape {}'.format(loss.shape))
    if x.dim() < 1 or x.shape[-1] != 2:
        raise ValueError('Expected x to have rightmost size 2, actual shape {}'.format(x.shape))

    # compute derivatives: gradient g and (via one more grad per component)
    # the batched 2x2 Hessian H.
    g = grad(loss, [x], create_graph=True)[0]
    H = torch.stack([grad(g[..., 0].sum(), [x], create_graph=True)[0],
                     grad(g[..., 1].sum(), [x], create_graph=True)[0]], -1)
    assert g.shape[-1:] == (2,)
    assert H.shape[-2:] == (2, 2)
    _warn_if_nan(g, 'g')
    _warn_if_nan(H, 'H')

    if trust_radius is not None:
        # regularize to keep update within ball of given trust_radius:
        # shift H by enough of the identity that the Newton step length
        # |H^-1 g| cannot exceed trust_radius (uses the closed-form 2x2
        # eigenvalues via trace/determinant).
        detH = H[..., 0, 0] * H[..., 1, 1] - H[..., 0, 1] * H[..., 1, 0]
        mean_eig = (H[..., 0, 0] + H[..., 1, 1]) / 2
        min_eig = mean_eig - (mean_eig ** 2 - detH).sqrt()
        regularizer = (g.pow(2).sum(-1).sqrt() / trust_radius - min_eig).clamp_(min=1e-8)
        _warn_if_nan(regularizer, 'regularizer')
        H = H + regularizer.unsqueeze(-1).unsqueeze(-1) * H.new([[1.0, 0.0], [0.0, 1.0]])

    # compute newton update using the closed-form 2x2 inverse (adjugate / det).
    detH = H[..., 0, 0] * H[..., 1, 1] - H[..., 0, 1] * H[..., 1, 0]
    Hinv = H.new(H.shape)
    Hinv[..., 0, 0] = H[..., 1, 1]
    Hinv[..., 0, 1] = -H[..., 0, 1]
    Hinv[..., 1, 0] = -H[..., 1, 0]
    Hinv[..., 1, 1] = H[..., 0, 0]
    Hinv = Hinv / detH.unsqueeze(-1).unsqueeze(-1)
    _warn_if_nan(Hinv, 'Hinv')

    # apply update: x_new = x - H^-1 g (x detached so only this step's
    # derivatives flow through the result).
    x_new = x.detach() - (Hinv * g.unsqueeze(-2)).sum(-1)
    assert x_new.shape == x.shape
    return x_new, Hinv
def train(self):
    """WGAN-GP training loop.

    Each iteration: one discriminator (critic) update with the standard
    Wasserstein losses plus a gradient penalty on random real/fake
    interpolates, and one generator update every ``self.n_critic`` critic
    steps. Saves checkpoints, an animation, and loss plots at the end.
    """
    self.train_hist = {}
    self.train_hist['D_loss'] = []
    self.train_hist['G_loss'] = []
    self.train_hist['per_epoch_time'] = []
    self.train_hist['total_time'] = []
    if self.gpu_mode:
        self.y_real_, self.y_fake_ = Variable(torch.ones(self.batch_size, 1).cuda()), Variable(torch.zeros(self.batch_size, 1).cuda())
    else:
        self.y_real_, self.y_fake_ = Variable(torch.ones(self.batch_size, 1)), Variable(torch.zeros(self.batch_size, 1))
    self.D.train()
    print('training start!!')
    start_time = time.time()
    for epoch in range(self.epoch):
        self.G.train()
        epoch_start_time = time.time()
        for iter, (x_, _) in enumerate(self.data_loader):
            # Drop the final, possibly-incomplete batch.
            if iter == self.data_loader.dataset.__len__() // self.batch_size:
                break
            z_ = torch.rand((self.batch_size, self.z_dim))
            if self.gpu_mode:
                x_, z_ = Variable(x_.cuda()), Variable(z_.cuda())
            else:
                x_, z_ = Variable(x_), Variable(z_)
            # update D network
            self.D_optimizer.zero_grad()
            D_real = self.D(x_)
            D_real_loss = -torch.mean(D_real)
            G_ = self.G(z_)
            D_fake = self.D(G_)
            D_fake_loss = torch.mean(D_fake)
            # gradient penalty on a random interpolate between real and fake
            # NOTE(review): alpha here is sampled per-ELEMENT (x_.size()),
            # not per-sample as in the WGAN-GP paper — confirm intent.
            if self.gpu_mode:
                alpha = torch.rand(x_.size()).cuda()
            else:
                alpha = torch.rand(x_.size())
            x_hat = Variable(alpha * x_.data + (1 - alpha) * G_.data, requires_grad=True)
            pred_hat = self.D(x_hat)
            # create_graph=True so the penalty is differentiable w.r.t. D's params.
            if self.gpu_mode:
                gradients = grad(outputs=pred_hat, inputs=x_hat, grad_outputs=torch.ones(pred_hat.size()).cuda(), create_graph=True, retain_graph=True, only_inputs=True)[0]
            else:
                gradients = grad(outputs=pred_hat, inputs=x_hat, grad_outputs=torch.ones(pred_hat.size()), create_graph=True, retain_graph=True, only_inputs=True)[0]
            gradient_penalty = self.lambda_ * ((gradients.view(gradients.size()[0], -1).norm(2, 1) - 1) ** 2).mean()
            D_loss = D_real_loss + D_fake_loss + gradient_penalty
            D_loss.backward()
            self.D_optimizer.step()
            if ((iter+1) % self.n_critic) == 0:
                # update G network (once every n_critic critic updates)
                self.G_optimizer.zero_grad()
                G_ = self.G(z_)
                D_fake = self.D(G_)
                G_loss = -torch.mean(D_fake)
                self.train_hist['G_loss'].append(G_loss.data[0])
                G_loss.backward()
                self.G_optimizer.step()
                # NOTE(review): chunk boundary makes this append's indentation
                # ambiguous; placed inside the n_critic branch — confirm.
                self.train_hist['D_loss'].append(D_loss.data[0])
            if ((iter + 1) % 100) == 0:
                print("Epoch: [%2d] [%4d/%4d] D_loss: %.8f, G_loss: %.8f" % ((epoch + 1), (iter + 1), self.data_loader.dataset.__len__() // self.batch_size, D_loss.data[0], G_loss.data[0]))
        self.train_hist['per_epoch_time'].append(time.time() - epoch_start_time)
        self.visualize_results((epoch+1))
    self.train_hist['total_time'].append(time.time() - start_time)
    print("Avg one epoch time: %.2f, total %d epochs time: %.2f" % (np.mean(self.train_hist['per_epoch_time']), self.epoch, self.train_hist['total_time'][0]))
    print("Training finish!... save training results")
    self.save()
    utils.generate_animation(self.result_dir + '/' + self.dataset + '/' + self.model_name + '/' + self.model_name, self.epoch)
    utils.loss_plot(self.train_hist, os.path.join(self.save_dir, self.dataset, self.model_name), self.model_name)
def backwardPass(self, func, create_graph): g = autograd.grad(func, self.parameters(), create_graph=create_graph) return g
def getdV(q, V): potentialE = V(q) g = grad(potentialE, q, create_graph=True)[0] return (g)
def step(self, ob, ob_tot, lp1, lp2):
    """One adaptive competitive-gradient (ACGD-style) step.

    Like the plain CGD step but with Adam-style per-coordinate adaptive
    learning rates built from exponential moving averages of squared
    gradients (beta2, bias-corrected). Updates ``self.max_params`` (ascent)
    and ``self.min_params`` (descent) in place.

    :param ob: unused in this body — TODO confirm it is dead.
    :param ob_tot: joint objective; its mean supplies the mixed
        Hessian-vector products.
    :param lp1: max-player loss. :param lp2: min-player loss.
    """
    self.count += 1  # step counter for bias correction
    grad_x = autograd.grad(lp1, self.max_params, retain_graph=True)
    grad_x_vec = torch.cat([g.contiguous().view(-1, 1) for g in grad_x])
    grad_y = autograd.grad(lp2, self.min_params, retain_graph=True)
    grad_y_vec = torch.cat([g.contiguous().view(-1, 1) for g in grad_y])
    # Lazily initialize the squared-gradient EMAs on the first call.
    if self.square_avgx is None and self.square_avgy is None:
        self.square_avgx = torch.zeros(grad_x_vec.size(), requires_grad=False, device=self.device)
        self.square_avgy = torch.zeros(grad_y_vec.size(), requires_grad=False, device=self.device)
    self.square_avgx.mul_(self.beta2).addcmul_(1 - self.beta2, grad_x_vec.data, grad_x_vec.data)
    self.square_avgy.mul_(self.beta2).addcmul_(1 - self.beta2, grad_y_vec.data, grad_y_vec.data)
    # Initialization bias correction
    bias_correction2 = 1 - self.beta2**self.count
    self.v_x = self.square_avgx / bias_correction2
    self.v_y = self.square_avgy / bias_correction2
    # Per-coordinate adaptive learning rates (RMSProp/Adam denominator).
    lr_x = math.sqrt(bias_correction2) * self.lr / self.square_avgx.sqrt().add(self.eps)
    lr_y = math.sqrt(bias_correction2) * self.lr / self.square_avgy.sqrt().add(self.eps)
    scaled_grad_x = torch.mul(lr_x, grad_x_vec).detach()  # lr_x * grad_x
    scaled_grad_y = torch.mul(lr_y, grad_y_vec).detach()  # lr_y * grad_y
    tot_grad_y = autograd.grad(ob_tot.mean(), self.min_params, create_graph=True, retain_graph=True)
    tot_grad_y = torch.cat([g.contiguous().view(-1, 1) for g in tot_grad_y])
    tot_grad_xy = autograd.grad(tot_grad_y, self.max_params, grad_outputs=scaled_grad_y, retain_graph=True)
    hvp_x_vec = torch.cat([g.contiguous().view(-1, 1) for g in tot_grad_xy])  # D_xy * lr_y * grad_y
    tot_grad_x = autograd.grad(ob_tot.mean(), self.max_params, create_graph=True, retain_graph=True)
    tot_grad_x = torch.cat([g.contiguous().view(-1, 1) for g in tot_grad_x])
    tot_grad_yx = autograd.grad(tot_grad_x, self.min_params, grad_outputs=scaled_grad_x, retain_graph=True)
    hvp_y_vec = torch.cat([g.contiguous().view(-1, 1) for g in tot_grad_yx])  # D_yx * lr_x * grad_x)
    # Right-hand sides of the preconditioned CGD system.
    p_x = torch.add(grad_x_vec, -hvp_x_vec).detach_()  # grad_x - D_xy * lr_y * grad_y
    p_y = torch.add(grad_y_vec, hvp_y_vec).detach_()  # grad_y + D_yx * lr_x * grad_x
    if self.collect_info:
        self.norm_px = torch.norm(p_x, p=2)
        self.norm_py = torch.norm(p_y, p=2)
        self.timer = time.time()
    if self.solve_x:
        # Solve for the min player's delta first; scale by sqrt(lr) to keep
        # the CG system symmetric under the adaptive preconditioner.
        p_y.mul_(lr_y.sqrt())
        cg_y, self.iter_num = general_conjugate_gradient(grad_x=grad_y_vec, grad_y=grad_x_vec, tot_grad_x=tot_grad_y, tot_grad_y=tot_grad_x, x_params=self.min_params, y_params=self.max_params, b=p_y, x=self.old_y,
                                                         nsteps=p_y.shape[0],  # // 10000,
                                                         lr_x=lr_y, lr_y=lr_x, device=self.device)
        #hcg = Hvp_vec(grad_y_vec, self.max_params, cg_y)
        cg_y.detach_().mul_(-lr_y.sqrt())
        hcg = autograd.grad(tot_grad_y, self.max_params, grad_outputs=cg_y, retain_graph=False)  # yx
        hcg = torch.cat([g.contiguous().view(-1, 1) for g in hcg]).add_(grad_x_vec).detach_()  # grad_x + D_xy * delta y
        cg_x = hcg.mul(lr_x)  # this is basically deltax
        # torch.add(grad_x_vec, - self.lr * hcg)
        self.old_x = hcg.mul(lr_x.sqrt())  # warm start for the next call
    else:
        p_x.mul_(lr_x.sqrt())
        cg_x, self.iter_num = general_conjugate_gradient(grad_x=grad_x_vec, grad_y=grad_y_vec, tot_grad_x=tot_grad_x, tot_grad_y=tot_grad_y, x_params=self.max_params, y_params=self.min_params, b=p_x, x=self.old_x,
                                                         nsteps=p_x.shape[0],  # // 10000,
                                                         lr_x=lr_x, lr_y=lr_y, device=self.device)
        # cg_x.detach_().mul_(p_x_norm)
        cg_x.detach_().mul_(lr_x.sqrt())  # delta x = lr_x.sqrt() * cg_x
        hcg = autograd.grad(tot_grad_x, self.min_params, grad_outputs=cg_x, retain_graph=False)  # yx
        hcg = torch.cat([g.contiguous().view(-1, 1) for g in hcg]).add_(grad_y_vec).detach_()  # grad_y + D_yx * delta x
        cg_y = hcg.mul(-lr_y)
        # cg_y = torch.add(grad_y_vec, self.lr * hcg)
        self.old_y = hcg.mul(lr_y.sqrt())
    if self.collect_info:
        self.timer = time.time() - self.timer
    # Scatter the flat deltas back into the parameters (lr already folded in).
    index = 0
    for p in self.max_params:
        if self.weight_decay != 0:
            p.data.add_(-self.weight_decay * p)
        p.data.add_(cg_x[index:index + p.numel()].reshape(p.shape))
        index += p.numel()
    if index != cg_x.numel():
        raise ValueError('CG size mismatch')
    index = 0
    for p in self.min_params:
        if self.weight_decay != 0:
            p.data.add_(-self.weight_decay * p)
        p.data.add_(cg_y[index:index + p.numel()].reshape(p.shape))
        index += p.numel()
    if index != cg_y.numel():
        raise ValueError('CG size mismatch')
    if self.collect_info:
        self.norm_gx = torch.norm(grad_x_vec, p=2)
        self.norm_gy = torch.norm(grad_y_vec, p=2)
        self.norm_cgx = torch.norm(cg_x, p=2)
        self.norm_cgy = torch.norm(cg_y, p=2)
        self.norm_cgx_cal = torch.norm(self.square_avgx, p=2)
        self.norm_cgy_cal = torch.norm(self.square_avgy, p=2)
        self.norm_vx = torch.norm(self.v_x, p=2)
        self.norm_vy = torch.norm(self.v_y, p=2)
        self.norm_mx = lr_x.max()
        self.norm_my = lr_y.max()
    # Alternate which side is solved on the next call.
    self.solve_x = False if self.solve_x else True
def lamfun(qdot_): KE = self.kinetic_energy(q, qdot_) # (*, 1) JKEq = grad(KE.sum(), [qdot_], create_graph=True)[0] # (*, qdim) return JKEq.sum(0)
for index, batch in enumerate(dataloader): total_step += 1 batch['img'] = batch['img'].to(device) batch['t'] = batch['t'].to(device) batch['v_0'] = batch['v_0'].to(device) batch['xy'] = batch['xy'].to(device) batch['vxy'] = batch['vxy'].to(device) batch['axy'] = batch['axy'].to(device) batch['img'].requires_grad = True batch['t'].requires_grad = True output = model(batch['img'], batch['t'], batch['v_0']) vx = grad(output[:, 0].sum(), batch['t'], create_graph=True)[0] * (opt.max_dist / opt.max_t) vy = grad(output[:, 1].sum(), batch['t'], create_graph=True)[0] * (opt.max_dist / opt.max_t) output_vxy = torch.cat([vx, vy], dim=1) real_v = torch.norm(batch['vxy'], dim=1) ax = grad(vx.sum(), batch['t'], create_graph=True)[0] ay = grad(vy.sum(), batch['t'], create_graph=True)[0] output_axy = (1. / opt.max_t) * torch.cat([ax, ay], dim=1) optimizer.zero_grad() loss_xy = criterion(output, batch['xy']) loss_vxy = criterion(output_vxy, batch['vxy']) loss_axy = criterion(output_axy, batch['axy']) loss = loss_xy + opt.gamma * loss_vxy + opt.gamma2 * loss_axy
torch.eye(W.shape[1], device=device) ] for W in Ws] step_size = 0.01 num_epochs = 64 grad_norm_clip_thr = 0.1 * sum(W.shape[0] * W.shape[1] for W in Ws)**0.5 TrainLoss, TestLossApprox, TestLossExact = [], [], [] for epoch in range(num_epochs): for batch_idx, (data, target) in enumerate(train_loader): #new_size = random.randint(28, 36)#random height rescaling #data = data[:,:,(torch.arange(new_size)*(32-1)/(new_size-1)).long()] #new_size = random.randint(28, 36)#random width rescaling #data = data[:,:,:,(torch.arange(new_size)*(32-1)/(new_size-1)).long()] loss = train_loss(data.to(device), target.to(device)) grads = grad(loss, Ws, create_graph=True) TrainLoss.append(loss.item()) if batch_idx % 100 == 0: print('Epoch: {}; batch: {}; train loss: {}'.format( epoch, batch_idx, TrainLoss[-1])) v = [torch.randn(W.shape, device=device) for W in Ws] Hv = grad(grads, Ws, v) #just let Hv=grads if using whitened gradients with torch.no_grad(): Qs = [ psgd.update_precond_kron(q[0], q[1], dw, dg) for (q, dw, dg) in zip(Qs, v, Hv) ] pre_grads = [ psgd.precond_grad_kron(q[0], q[1], g) for (q, g) in zip(Qs, grads)
def _matvec_grad(self, img, vec): w = torch.zeros(self.hidden_dim, requires_grad=True).to(self.device) matvec_transposed = self._matvec_T_grad(img, w) dotproduct = torch.matmul(matvec_transposed.flatten(), vec.flatten()) return autograd.grad(dotproduct, w)[0]
# # such that for all variation :math:`\delta y` of :math:`y` we have: # # .. math:: # # \langle \text{d} c . \delta y , e \rangle = \langle g , \delta y \rangle = \langle \delta y , \partial c . e \rangle # # Backpropagation is all about computing the tensor :math:`g=\partial c . e` efficiently, for arbitrary values of :math:`e`: # Declare a new tensor of shape (M,1) used as the input of the gradient operator. # It can be understood as a "gradient with respect to the output c" # and is thus called "grad_output" in the documentation of PyTorch. e = torch.rand_like(c) # Call the gradient op: start = time.time() g = grad(c, y, e)[0] # PyTorch remark : grad(c, y, e) alone outputs a length 1 tuple, hence the need for [0] at the end. print('Time to compute gradient of convolution operation on the cpu: ', round(time.time() - start, 5), 's', end=' ') #################################################################### # We compare with gradient of Log of Sum of Exp: g2 = grad(c2, y, e)[0] print('(relative error: ', ((g2 - g).norm() / g.norm()).item(), ')') # Plot the results next to each other: plt.plot(g.detach().cpu().numpy()[:40], '-', label='KeOps - Stable')
def refine_mesh(self, mesh, occ_hat, z, c=None, world_mat=None, camera_mat=None): ''' Refines the predicted mesh. Args: mesh (trimesh object): predicted mesh occ_hat (tensor): predicted occupancy grid z (tensor): latent code z c (tensor): latent conditioned code c ''' self.model.eval() # Some shorthands n_x, n_y, n_z = occ_hat.shape assert (n_x == n_y == n_z) # threshold = np.log(self.threshold) - np.log(1. - self.threshold) threshold = self.threshold # Vertex parameter v0 = torch.FloatTensor(mesh.vertices).to(self.device) v = torch.nn.Parameter(v0.clone()) # Faces of mesh faces = torch.LongTensor(mesh.faces).to(self.device) # Start optimization optimizer = optim.RMSprop([v], lr=1e-4) for it_r in trange(self.refinement_step): optimizer.zero_grad() # Loss face_vertex = v[faces] eps = np.random.dirichlet((0.5, 0.5, 0.5), size=faces.shape[0]) eps = torch.FloatTensor(eps).to(self.device) face_point = (face_vertex * eps[:, :, None]).sum(dim=1) face_v1 = face_vertex[:, 1, :] - face_vertex[:, 0, :] face_v2 = face_vertex[:, 2, :] - face_vertex[:, 1, :] face_normal = torch.cross(face_v1, face_v2) face_normal = face_normal / \ (face_normal.norm(dim=1, keepdim=True) + 1e-10) vc = self.model.gproj(face_point.unsqueeze(0), c, world_mat, camera_mat) face_value = torch.sigmoid( self.model.decode(face_point.unsqueeze(0), z, vc).logits) normal_target = -autograd.grad([face_value.sum()], [face_point], create_graph=True)[0] normal_target = \ normal_target / \ (normal_target.norm(dim=1, keepdim=True) + 1e-10) loss_target = (face_value - threshold).pow(2).mean() loss_normal = \ (face_normal - normal_target).pow(2).sum(dim=1).mean() loss = loss_target + 0.01 * loss_normal # Update loss.backward() optimizer.step() mesh.vertices = v.data.cpu().numpy() return mesh
def eval_metric(step):
    """Evaluate trajectory-prediction metrics on one eval batch and log them.

    Differentiates the model's normalized (x, y) outputs w.r.t. time `t`
    (via autograd) to obtain velocity, acceleration, and jerk, then logs
    position/velocity errors (ex/ey/evx/evy), FDE, ADE, mean velocity error,
    and a jerk-based smoothness score to the global `logger` at `step`.
    Uses module globals: model, eval_samples, device, opt, logger.
    """
    model.eval()
    batch = next(eval_samples)
    # Subsample ten fixed timestamps from the ground-truth sequence.
    mask = [2, 5, 8, 10, 13, 17, 20, 23, 26, 29]
    batch['ts_list'] = batch['ts_list'][:, mask]
    batch['x_list'] = batch['x_list'][:, mask]
    batch['y_list'] = batch['y_list'][:, mask]
    batch['vx_list'] = batch['vx_list'][:, mask]
    batch['vy_list'] = batch['vy_list'][:, mask]
    # Flatten timestamps to a (T, 1) column and track gradients so we can
    # differentiate positions w.r.t. time below.
    t = batch['ts_list'].flatten().unsqueeze(1).to(device)
    t.requires_grad = True
    # Broadcast the single image/initial-velocity to every query time.
    batch['img'] = batch['img'].expand(len(t), 10, 1, opt.height, opt.width)
    batch['img'] = batch['img'].to(device)
    batch['v_0'] = batch['v_0'].expand(len(t), 1)
    batch['v_0'] = batch['v_0'].to(device)
    batch['xy'] = batch['xy'].to(device)
    batch['vxy'] = batch['vxy'].to(device)
    batch['img'].requires_grad = True

    # Model outputs normalized (x, y); columns 0 and 1 respectively.
    output = model(batch['img'], t, batch['v_0'])
    # Velocity = d(position)/dt, rescaled from normalized units
    # (max_dist / max_t converts normalized slope to physical velocity).
    vx = grad(output[:, 0].sum(), t, create_graph=True)[0][:, 0] * (opt.max_dist / opt.max_t)
    vy = grad(output[:, 1].sum(), t, create_graph=True)[0][:, 0] * (opt.max_dist / opt.max_t)
    x = output[:, 0] * opt.max_dist
    y = output[:, 1] * opt.max_dist
    # Acceleration and jerk via repeated differentiation w.r.t. t.
    ax = grad(vx.sum(), t, create_graph=True)[0] * (1. / opt.max_t)
    ay = grad(vy.sum(), t, create_graph=True)[0] * (1. / opt.max_t)
    jx = grad(ax.sum(), t, create_graph=True)[0] * (1. / opt.max_t)
    jy = grad(ay.sum(), t, create_graph=True)[0] * (1. / opt.max_t)

    vx = vx.data.cpu().numpy()
    vy = vy.data.cpu().numpy()
    x = x.data.cpu().numpy()
    y = y.data.cpu().numpy()
    # Ground truth for the (single) batch element.
    real_x = batch['x_list'].data.cpu().numpy()[0]
    real_y = batch['y_list'].data.cpu().numpy()[0]
    real_vx = batch['vx_list'].data.cpu().numpy()[0]
    real_vy = batch['vy_list'].data.cpu().numpy()[0]
    ts_list = batch['ts_list'].data.cpu().numpy()[0]

    # Mean absolute per-axis errors.
    ex = np.mean(np.abs(x - real_x))
    ey = np.mean(np.abs(y - real_y))
    evx = np.mean(np.abs(vx - real_vx))
    evy = np.mean(np.abs(vy - real_vy))
    # Final / average displacement error, mean velocity error.
    fde = np.hypot(x - real_x, y - real_y)[-1]
    ade = np.mean(np.hypot(x - real_x, y - real_y))
    ev = np.mean(np.hypot(vx - real_vx, vy - real_vy))
    jx = jx.data.cpu().numpy()
    jy = jy.data.cpu().numpy()
    # Smoothness = mean jerk magnitude (lower is smoother).
    smoothness = np.mean(np.hypot(jx, jy))

    logger.add_scalar('metric/ex', ex, step)
    logger.add_scalar('metric/ey', ey, step)
    logger.add_scalar('metric/evx', evx, step)
    logger.add_scalar('metric/evy', evy, step)
    logger.add_scalar('metric/fde', fde, step)
    logger.add_scalar('metric/ade', ade, step)
    logger.add_scalar('metric/ev', ev, step)
    logger.add_scalar('metric/smoothness', smoothness, step)
    model.train()
def train(epoch, learning_rate, result_path):
    """Train the video-SCI reconstruction nets adversarially for one epoch.

    Generator side: first_frame_net -> rnn1 -> rnn2 produce the reconstruction
    from the compressed measurement; discriminator D is trained with a real
    loss plus an R1-style gradient penalty on real inputs (coefficient 10).
    Uses many module globals (first_frame_net, rnn1, rnn2, D, mask, mask_s,
    block_size, compress_rate, train_data_loader, compute_loss, loss,
    toggle_grad, test, test_path1).

    Args:
        epoch: current epoch index (for logging / test()).
        learning_rate: Adam learning rate for both optimizers.
        result_path: output path forwarded to test().
    """
    epoch_loss = 0
    epoch_loss1 = 0
    epoch_loss2 = 0
    Dloss = 0
    regloss = 0
    begin = time.time()
    # Fresh optimizers every epoch — both G (three sub-nets) and D.
    optimizer_g = optim.Adam([{
        'params': first_frame_net.parameters()
    }, {
        'params': rnn1.parameters()
    }, {
        'params': rnn2.parameters()
    }], lr=learning_rate)
    optimizer_d = optim.Adam(D.parameters(), lr=learning_rate)
    # NOTE(review): an `if __name__ == '__main__':` guard *inside* a function
    # is unusual; it makes train() a no-op when the module is imported.
    # Preserved as in the original — confirm it is intentional.
    if __name__ == '__main__':
        for iteration, batch in enumerate(train_data_loader):
            gt, meas = Variable(batch[0]), Variable(batch[1])
            gt = gt.cuda()  # [batch,8,256,256]
            gt = gt.float()
            meas = meas.cuda()  # [batch,256 256]
            meas = meas.float()
            mini_batch = gt.size()[0]
            y_real_ = torch.ones(mini_batch).cuda()
            y_fake_ = torch.zeros(mini_batch).cuda()
            # Normalize the measurement by the summed mask before feeding
            # it to the nets as an extra channel.
            meas_re = torch.div(meas, mask_s)
            meas_re = torch.unsqueeze(meas_re, 1)

            optimizer_d.zero_grad()
            batch_size1 = gt.shape[0]
            # Initial recurrent hidden state.
            h0 = torch.zeros(batch_size1, 20, 256, 256).cuda()
            # Two-stage reconstruction: coarse (model_out1) then refined.
            xt1 = first_frame_net(mask, meas_re, block_size, compress_rate)
            model_out1, h1 = rnn1(xt1, meas, mask, h0, meas_re, block_size, compress_rate)
            model_out = rnn2(model_out1, meas, mask, h1, meas_re, block_size, compress_rate)

            # discriminator training: freeze G, unfreeze D.
            toggle_grad(first_frame_net, False)
            toggle_grad(rnn1, False)
            toggle_grad(rnn2, False)
            toggle_grad(D, True)
            # requires_grad on gt is needed for the gradient penalty below.
            gt.requires_grad_()
            D_result = D(gt, y_real_)
            # assert (D_result > 0.0 & D_result < 1.0).all()
            D_real_loss = compute_loss(D_result, 1)
            Dloss += D_result.data.mean()
            D_real_loss.backward(retain_graph=True)
            # model_out.requires_grad_()
            # d_fake = D(model_out, y_real_)
            # dloss_fake = compute_loss(d_fake, 0)
            batch_size = gt.size(0)
            # R1 regularizer: squared norm of d D(real)/d real, weight 10.
            grad_dout = autograd.grad(outputs=D_result.sum(), inputs=gt,
                                      create_graph=True, retain_graph=True,
                                      only_inputs=True)[0]
            grad_dout2 = grad_dout.pow(2)
            assert (grad_dout2.size() == gt.size())
            reg1 = grad_dout2.view(batch_size, -1).sum(1)
            reg = 10 * reg1.mean()
            regloss += reg.data.mean()
            reg.backward(retain_graph=True)
            optimizer_d.step()

            # generator training: unfreeze G, freeze D.
            toggle_grad(first_frame_net, True)
            toggle_grad(rnn1, True)
            toggle_grad(rnn2, True)
            toggle_grad(D, False)
            optimizer_g.zero_grad()
            D_result = D(model_out, y_real_)
            G_train_loss = compute_loss(D_result, 1)
            # Reconstruction losses on both stages + small adversarial term.
            Loss1 = loss(model_out1, gt)
            Loss2 = loss(model_out, gt)
            Loss = 0.5 * Loss1 + 0.5 * Loss2 + 0.001 * G_train_loss
            epoch_loss += Loss.data
            epoch_loss1 += Loss1.data
            epoch_loss2 += Loss2.data
            Loss.backward()
            optimizer_g.step()
        # NOTE(review): nesting reconstructed from a collapsed source line —
        # test/report presumably run once per epoch, after the batch loop.
        test(test_path1, epoch, result_path)
        end = time.time()
        print(
            "===> Epoch {} Complete: Avg. Loss: {:.7f}".format(
                epoch, epoch_loss / len(train_data_loader)),
            "loss1 {:.7f} loss2: {:.7f}".format(
                epoch_loss1 / len(train_data_loader),
                epoch_loss2 / len(train_data_loader)),
            "d loss: {:.7f},reg loss: {:.7f}".format(
                Dloss / len(train_data_loader),
                regloss / len(train_data_loader)),
            " time: {:.2f}".format(end - begin))
def _matvec_T_grad(self, img, vec):
    """Return the transposed-Jacobian/vector product J^T vec w.r.t. *img*.

    Treats the flattened output of the feature extractor as the linear
    functional <features, vec> and differentiates that scalar with respect
    to the input image. `create_graph=True` keeps the backward graph so the
    result itself remains differentiable (higher-order products).
    """
    # Make sure autograd records operations involving the input image.
    img.requires_grad = True
    features = self.mfe.extract_layer_output(img)
    # Scalar inner product between flattened features and the probe vector.
    functional = torch.matmul(features.flatten(), vec.flatten())
    jac_t_vec, = autograd.grad(functional, img, create_graph=True)
    return jac_t_vec
# NOTE(review): this chunk begins mid-statement — "opt.batch_size" below is
# the tail of an expression (presumably an entropy normalization) whose head
# lies above this view, and the dangling `else:` belongs to that cut `if`.
# Tokens are preserved byte-for-byte; only comments were added.
opt.batch_size
else:
    entropy = 0.0
# Pick the cost assigned to each real token and average over the batch.
costs = costs.gather(2, Variable(real.unsqueeze(2))).squeeze(2)
E_real = costs.sum() / opt.batch_size
if train_disc:
    # Discriminator objective: push down energy on real data, minus an
    # entropy regularizer.
    loss = (opt.real_multiplier * E_real) - (opt.disc_entropy_reg * entropy)
    loss.backward()
if train_disc and opt.gradient_penalty > 0:
    # WGAN-GP-style penalty: drive the input-gradient norm toward 1.
    disc.gradient_penalize = True
    costs, inputs = disc((real, generated))
    costs = costs * inputs[:, 1:]
    loss = ((opt.real_multiplier + 1) / 2) * costs.sum()
    inputs_grad, = autograd.grad([loss], [inputs], create_graph=True)
    inputs_grad = inputs_grad.view(opt.batch_size, -1)
    norm_sq = (inputs_grad**2).sum(1)
    # (||g|| - 1)^2 expanded: ||g||^2 - 2||g|| + 1.
    norm_errors = norm_sq - 2 * torch.sqrt(norm_sq) + 1
    loss = opt.gradient_penalty * norm_errors.sum() / opt.batch_size
    loss.backward()
    disc.gradient_penalize = False
# Track discriminator gradient norms for logging.
disc_gnorms.append(util.gradient_norm(disc.parameters()))
if train_disc:
    if opt.max_grad_norm > 0:
        # NOTE(review): clip_grad_norm is the deprecated (pre-1.0) spelling
        # of clip_grad_norm_ — kept as-is for this legacy snippet.
        nn.utils.clip_grad_norm(disc.parameters(), opt.max_grad_norm)
    disc_optimizer.step()
# Wasserstein distance estimate (legacy .data[0] indexing, pre-0.4 torch).
Wdist = (E_generated - E_real).data[0]
def compute_dual_gap(objective, player):
    """Evaluate the objective at the player's strategy and the dual gap.

    The gap is <g, x> - min_i g_i, where g is the gradient of the objective
    at the strategy x — the standard Frank-Wolfe duality-gap estimate over
    the simplex.

    Args:
        objective: callable mapping a strategy tensor to a scalar loss.
        player: zero-argument callable producing the current strategy tensor.

    Returns:
        (float, float): objective value and duality-gap estimate.
    """
    point = Parameter(player())
    value = objective(point)
    gradient, = autograd.grad(value, (point,))
    gap = torch.sum(gradient * point) - torch.min(gradient)
    return value.item(), gap.item()
def d_r1_loss(real_pred, real_img):
    """R1 regularizer for the discriminator (StyleGAN2-style).

    Computes the gradient of the summed discriminator scores with respect to
    the real images and returns the batch mean of the per-sample squared
    gradient norms. `create_graph=True` keeps the graph so the penalty can
    itself be backpropagated.
    """
    (gradients,) = autograd.grad(
        outputs=real_pred.sum(),
        inputs=real_img,
        create_graph=True,
    )
    # Per-sample squared L2 norm, then mean over the batch.
    per_sample = gradients.pow(2).reshape(gradients.shape[0], -1).sum(dim=1)
    return per_sample.mean()
def train(self):
    """Run the DRAGAN-style adversarial training loop.

    Per iteration: trains the discriminator D on real images (adversarial +
    tag classification losses), on generated fakes, and on a DRAGAN gradient
    penalty around perturbed real samples; then trains the generator G once.
    Periodically logs losses, saves sample images, and checkpoints both nets
    once per epoch. Relies on module globals: batch_size, device, max_epoch,
    verbose, verbose_T, noise_size, lambda_adv, lambda_gp, tmp_path,
    model_dump_path, utils, vutils, logger, adjust_learning_rate.
    """
    iteration = -1
    # Reusable real/fake label buffer for the BCE criteria.
    # BUGFIX: tensor sizes must be ints — the original passed 1.0 (a float)
    # as the second dimension, which torch.FloatTensor rejects.
    label = Variable(torch.FloatTensor(batch_size, 1)).to(device)
    logging.info('Current epoch: {}. Max epoch: {}.'.format(self.epoch, max_epoch))
    while self.epoch <= max_epoch:
        msg = {}
        adjust_learning_rate(self.optimizer_G, iteration)
        adjust_learning_rate(self.optimizer_D, iteration)
        for i, (avatar_tag, avatar_img) in enumerate(self.data_loader):
            iteration += 1
            if avatar_img.shape[0] != batch_size:
                # Skip ragged final batches; label buffer is fixed-size.
                # (logging.warn is deprecated — use logging.warning.)
                logging.warning('Batch size not satisfied. Ignoring.')
                continue
            if verbose:
                if iteration % verbose_T == 0:
                    msg['epoch'] = int(self.epoch)
                    msg['step'] = int(i)
                    msg['iteration'] = iteration
            avatar_img = Variable(avatar_img).to(device)
            avatar_tag = Variable(torch.FloatTensor(avatar_tag)).to(device)
            # D : G = 2 : 1
            # 1. Training D
            # 1.1. use real image for discriminating
            self.D.zero_grad()
            label_p, tag_p = self.D(avatar_img)
            label.data.fill_(1.0)
            # 1.2. real image's loss (adversarial + tag classification)
            real_label_loss = self.label_criterion(label_p, label)
            real_tag_loss = self.tag_criterion(tag_p, avatar_tag)
            real_loss_sum = real_label_loss * lambda_adv / 2.0 + real_tag_loss * lambda_adv / 2.0
            real_loss_sum.backward()
            if verbose:
                if iteration % verbose_T == 0:
                    msg['discriminator real loss'] = float(real_loss_sum)
            # 1.3. use fake image for discriminating (detach: no G gradients here)
            g_noise, fake_tag = utils.fake_generator(batch_size, noise_size, device)
            fake_feat = torch.cat([g_noise, fake_tag], dim=1)
            fake_img = self.G(fake_feat).detach()
            fake_label_p, fake_tag_p = self.D(fake_img)
            label.data.fill_(.0)
            # 1.4. fake image's loss
            fake_label_loss = self.label_criterion(fake_label_p, label)
            fake_tag_loss = self.tag_criterion(fake_tag_p, fake_tag)
            fake_loss_sum = fake_label_loss * lambda_adv / 2.0 + fake_tag_loss * lambda_adv / 2.0
            fake_loss_sum.backward()
            if verbose:
                if iteration % verbose_T == 0:
                    msg['discriminator fake loss'] = float(fake_loss_sum)
            # 1.5. DRAGAN gradient penalty: interpolate between the real batch
            # and a noise-perturbed copy, then drive ||dD/dx_hat|| toward 1.
            # https://github.com/jfsantos/dragan-pytorch/blob/master/dragan.py
            alpha_size = [1] * avatar_img.dim()
            alpha_size[0] = avatar_img.size(0)
            alpha = torch.rand(alpha_size).to(device)
            x_hat = Variable(alpha * avatar_img.data + (1 - alpha) * \
                             (avatar_img.data + 0.5 * avatar_img.data.std() *
                              Variable(torch.rand(avatar_img.size())).to(device)),
                             requires_grad=True).to(device)
            pred_hat, pred_tag = self.D(x_hat)
            gradients = grad(outputs=pred_hat, inputs=x_hat,
                             grad_outputs=torch.ones(pred_hat.size()).to(device),
                             create_graph=True, retain_graph=True,
                             only_inputs=True)[0].view(x_hat.size(0), -1)
            gradient_penalty = lambda_gp * ((gradients.norm(2, dim=1) - 1) ** 2).mean()
            gradient_penalty.backward()
            if verbose:
                if iteration % verbose_T == 0:
                    msg['discriminator gradient penalty'] = float(gradient_penalty)
            # 1.6. update optimizer (applies all three accumulated D losses)
            self.optimizer_D.step()
            # 2. Training G
            # 2.1. generate fake image
            self.G.zero_grad()
            g_noise, fake_tag = utils.fake_generator(batch_size, noise_size, device)
            fake_feat = torch.cat([g_noise, fake_tag], dim=1)
            fake_img = self.G(fake_feat)
            fake_label_p, fake_tag_p = self.D(fake_img)
            label.data.fill_(1.0)
            # 2.2. calc loss (G wants D to predict "real")
            label_loss_g = self.label_criterion(fake_label_p, label)
            tag_loss_g = self.tag_criterion(fake_tag_p, fake_tag)
            loss_g = label_loss_g * lambda_adv / 2.0 + tag_loss_g * lambda_adv / 2.0
            loss_g.backward()
            if verbose:
                if iteration % verbose_T == 0:
                    msg['generator loss'] = float(loss_g)
            # 2.3. update optimizer
            self.optimizer_G.step()
            if verbose:
                if iteration % verbose_T == 0:
                    logger.info('------------------------------------------')
                    for key in msg.keys():
                        logger.info('{} : {}'.format(key, msg[key]))
            # save intermediate file (real + fresh fake sample grids)
            if iteration % verbose_T == 0:
                vutils.save_image(avatar_img.data.view(batch_size, 3, avatar_img.size(2), avatar_img.size(3)),
                                  os.path.join(tmp_path, 'real_image_{}.png'.format(str(iteration).zfill(8))))
                g_noise, fake_tag = utils.fake_generator(batch_size, noise_size, device)
                fake_feat = torch.cat([g_noise, fake_tag], dim=1)
                fake_img = self.G(fake_feat)
                vutils.save_image(fake_img.data.view(batch_size, 3, avatar_img.size(2), avatar_img.size(3)),
                                  os.path.join(tmp_path, 'fake_image_{}.png'.format(str(iteration).zfill(8))))
                logger.info('Saved intermediate file in {}'.format(
                    os.path.join(tmp_path, 'fake_image_{}.png'.format(str(iteration).zfill(8)))))
        # dump checkpoint once per epoch
        torch.save({
            'epoch': self.epoch,
            'D': self.D.state_dict(),
            'G': self.G.state_dict(),
            'optimizer_D': self.optimizer_D.state_dict(),
            'optimizer_G': self.optimizer_G.state_dict(),
        }, '{}/checkpoint_{}.tar'.format(model_dump_path, str(self.epoch).zfill(4)))
        logger.info('Checkpoint saved in: {}'.format(
            '{}/checkpoint_{}.tar'.format(model_dump_path, str(self.epoch).zfill(4))))
        self.epoch += 1
# Latent-space interpolation experiment (legacy Python 2 script):
# compare a pixel-space average of two batches against an image optimized so
# that its hidden representation (C.compute_h2) matches the average of the
# two batches' hidden representations.
interp = (batches[0] + batches[1])/2.0
h1 = C.compute_h2(batches[0])
h2 = C.compute_h2(batches[1])
print h1.size()
# Target hidden code: midpoint of the two batches' codes.
h1_interp = 0.5*h1 + 0.5*h2
starting_x = Variable(batches[0].data, requires_grad=True)
# Normalized-gradient descent on the input image toward the target code.
# NOTE(review): loop-body nesting reconstructed from a collapsed source line;
# the per-step loss print presumably belongs inside the loop — confirm.
for iteration in range(0, 200):
    curr_h = C.compute_h2(starting_x)
    loss = torch.sum((h1_interp - curr_h)**2) / h1_interp.size(0)
    g = grad(loss, starting_x)[0]
    # Unit-norm gradient step, then re-wrap as a fresh leaf Variable.
    new_x = starting_x - 1.0 * g / g.norm(2)
    starting_x = Variable(new_x.data, requires_grad=True)
    print "loss", loss
print C(starting_x)[0:10]
save_image(denorm(batches[0].data), 'interpolation_images/batch1.png')
save_image(denorm(batches[1].data), 'interpolation_images/batch2.png')
save_image(denorm(interp.data), 'interpolation_images/visible_interp.png')
save_image(denorm(starting_x.data), 'interpolation_images/hidden_interp.png')
#print C(train).max(1)
#print target
# NOTE(review): fragment — the first statements (through `return`) are the
# tail of an evaluation function whose `def` line lies above this chunk, so
# they are shown here without their wrapper. Tokens preserved byte-for-byte.
y = LeNet5(data)
_, pred = torch.max(y, dim=1)
num_errs += torch.sum(pred != target)
return num_errs.item() / len(test_loader.dataset)

# PSGD (preconditioned SGD) setup: one Kronecker-factored preconditioner pair
# (left, right identity) per weight matrix in Ws.
Qs = [[torch.eye(W.shape[0]), torch.eye(W.shape[1])] for W in Ws]
step_size = 0.1
# Clip threshold scaled to the total parameter count.
grad_norm_clip_thr = 0.1 * sum(W.shape[0] * W.shape[1] for W in Ws)**0.5
TrainLoss, TestLoss = [], []
for epoch in range(10):
    trainloss = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        loss = train_loss(data, target)
        # create_graph=True so the gradient itself is differentiable for the
        # Hessian-vector product below.
        grads = grad(loss, Ws, create_graph=True)
        trainloss += loss.item()
        # Random probe vectors for the Hessian-vector product Hv.
        v = [torch.randn(W.shape) for W in Ws]
        Hv = grad(
            grads, Ws, v
        )  # error? check torch bug: https://github.com/pytorch/pytorch/issues/15532
        with torch.no_grad():
            # Update each Kronecker preconditioner from (v, Hv), then
            # precondition the gradients.
            Qs = [
                psgd.update_precond_kron(q[0], q[1], dw, dg)
                for (q, dw, dg) in zip(Qs, v, Hv)
            ]
            pre_grads = [
                psgd.precond_grad_kron(q[0], q[1], g)
                for (q, g) in zip(Qs, grads)
            ]
def mutate_sm(mutation, params, model=None, env=None, verbose=False, states=None, mag=0.1, **kwargs):
    """Safe-mutation operator for neuroevolution (legacy Python 2 code).

    Perturbs the flat parameter vector `params` with Gaussian noise of
    magnitude `mag`, rescaled per-parameter by an output-sensitivity estimate
    selected via the `mutation` string: "SM-R" (line search on policy
    divergence), "SO" (second-order directional sensitivity), "ABS"
    (per-sample absolute Jacobian), or default SM-G-SUM (summed squared
    Jacobian). Returns (new_params_copy, final_delta).
    Side effect: leaves `model` loaded with the new parameters.
    """
    model.inject_parameters(params.copy())
    # TODO: why?
    _states = np.concatenate((states, states, states, states))

    # grab old policy
    sz = min(100, len(_states))
    # experience in this domain = the classification *input* patterns
    experience_states = _states
    experience_states = Variable(torch.from_numpy(experience_states), requires_grad=False)

    # old_policy in this domain = the outputs this model generated before perturbation
    old_policy = model(experience_states)
    num_classes = old_policy.size()[1]

    # Mode flags parsed from the mutation string.
    # SM-ABS
    abs_gradient = False
    # SM-SO
    second_order = False
    # SM-R
    sm_r = False
    # SM-R uses a line search
    linesearch = False
    if mutation.count("SM-R") > 0:
        sm_r = True
    elif mutation.count("SO") > 0:
        second_order = True
    elif mutation.count("ABS") > 0:
        abs_gradient = True

    # initial perturbation
    delta = np.random.randn(*params.shape).astype(np.float32) * mag

    if sm_r:
        # print "SM-R"
        # No gradient scaling; the magnitude is tuned by line search below.
        scaling = torch.ones(params.shape)
        linesearch = True
    elif second_order:
        # print "SM-G-SO"
        np_copy = np.array(old_policy.data.numpy(), dtype=np.float32)
        _old_policy_cached = Variable(torch.from_numpy(np_copy), requires_grad=False)
        # loss = a measure of squared divergence from the old policy
        loss = ((old_policy - _old_policy_cached)**2).sum(1).mean(0)
        # take a first derivative
        loss_gradient = grad(loss, model.parameters(), create_graph=True)
        flat_gradient = torch.cat([grads.view(-1) for grads in loss_gradient])  # .sum()
        # choose a perturbation direction (unit vector along delta)
        direction = (delta / np.sqrt((delta**2).sum()))
        direction_t = Variable(torch.from_numpy(direction), requires_grad=False)
        # calculate second derivative along perturbation direction
        grad_v_prod = (flat_gradient * direction_t).sum()
        second_deriv = torch.autograd.grad(grad_v_prod, model.parameters())
        # extract a contiguous version of the second derivative
        sensitivity = torch.cat([g.contiguous().view(-1) for g in second_deriv])
        # return our re-scaling based on second order sensitivity
        scaling = torch.sqrt(torch.abs(sensitivity).data)
    elif not abs_gradient:
        # print "SM-G-SUM"
        tot_size = model.count_parameters()
        # we want to calculate a jacobian of derivatives of each output's sensitivity to each parameter
        jacobian = torch.zeros(num_classes, tot_size)
        grad_output = torch.zeros(*old_policy.size())
        # do a backward pass for each output
        # NOTE(review): retain_variables is the pre-0.3 spelling of
        # retain_graph — this code targets a legacy torch version.
        for i in range(num_classes):
            model.zero_grad()
            grad_output.zero_()
            grad_output[:, i] = 1.0
            old_policy.backward(grad_output, retain_variables=True)
            jacobian[i] = torch.from_numpy(model.extract_grad())
        # summed gradients sensitivity
        scaling = torch.sqrt((jacobian**2).sum(0))
    else:
        # print "SM-G-ABS"
        tot_size = model.count_parameters()
        # Per-sample Jacobian so absolute values can be averaged over samples.
        jacobian = torch.zeros(num_classes, tot_size, sz)
        grad_output = torch.zeros(*old_policy.size())
        for i in range(num_classes):
            for j in range(sz):
                old_policy_new = model(experience_states[j:j + 1])
                model.zero_grad()
                grad_output.zero_()
                grad_output[:, i] = 1.0 / sz
                old_policy_new.backward(grad_output, retain_variables=True)
                jacobian[i, :, j] = torch.from_numpy(model.extract_grad())
        mean_abs_jacobian = torch.abs(jacobian).mean(2)
        scaling = torch.sqrt((mean_abs_jacobian**2).sum(0))

    scaling = scaling.numpy()
    if verbose:
        print 'scaling sum', scaling.sum()
    # Guard against division by (near-)zero sensitivities.
    scaling[scaling == 0] = 1.0
    scaling[scaling < 0.01] = 0.01
    old_delta = delta.copy()
    # Sensitivity-scaled perturbation.
    delta /= scaling
    new_params = params + delta
    model.inject_parameters(new_params)

    # Line-search configuration (SM-R only).
    threshold = mag
    weight_clip = 10.0
    # note generally probably should be smaller
    search_rounds = 15
    old_policy = old_policy.data.numpy()

    # error function for SM-R to line search over
    # requires one forward pass for each iteration of line search
    def search_error(x, raw=False):
        final_delta = delta * x
        final_delta = np.clip(final_delta, -weight_clip, weight_clip)
        new_params = params + final_delta
        model.inject_parameters(new_params)
        output = model(experience_states).data.numpy()
        # Mean per-sample output divergence from the pre-mutation policy.
        change = np.sqrt(((output - old_policy)**2).sum(1)).mean()
        if raw:
            return change
        return np.sqrt(change - threshold)**2

    # do line search for SM-R to tune mutation
    if linesearch:
        mult = minimize_scalar(search_error, bounds=(0, 0.1, 3), tol=(threshold / 4),
                               options={'maxiter': search_rounds, 'disp': True})
        new_params = params + delta * mult.x
        chg_amt = mult.x
    else:
        chg_amt = 1.0

    final_delta = delta * chg_amt
    # limit extreme weight changes for stability
    final_delta = np.clip(final_delta, -weight_clip, weight_clip)
    # as 1.0
    # generate new parameter vector
    new_params = params + final_delta
    # NOTE(review): nesting of the two diagnostic print() calls reconstructed
    # from a collapsed source line — presumably verbose-only; confirm.
    if verbose:
        print 'delta max:', final_delta.max()
        print("divergence:", check_policy_change(params, new_params, model, states))
        print(new_params.shape, params.shape)
    diff = np.sqrt(((new_params - params)**2).sum())
    if verbose:
        print("diff: ", diff)
    return new_params.copy(), final_delta