def sample_relax(logits, surrogate):
    """RELAX estimator helper: sample a hard category b and evaluate the
    surrogate control variate at z and at z_tilde (z conditioned on b).

    NOTE(review): relies on module-level globals `B` (batch), `C` (classes)
    and `x` (data batch) — `x` is NOT a parameter here, unlike the sibling
    `sample_relax(x, logits, surrogate)` variant; confirm a global `x` is in
    scope at the call site.

    Returns (b, logprob, cz, cz_tilde).
    """
    cat = Categorical(logits=logits)
    # Clamp uniforms away from {0,1} so the double log is finite.
    u = torch.rand(B,C).clamp(1e-10, 1.-1e-10).cuda()
    gumbels = -torch.log(-torch.log(u))
    z = logits + gumbels  # relaxed sample (Gumbel-max perturbation)
    b = torch.argmax(z, dim=1) #.view(B,1)
    logprob = cat.log_prob(b).view(B,1)
    # czs = []
    # for j in range(1):
    #     z = sample_relax_z(logits)
    #     surr_input = torch.cat([z, x, logits.detach()], dim=1)
    #     cz = surrogate.net(surr_input)
    #     czs.append(cz)
    # czs = torch.stack(czs)
    # cz = torch.mean(czs, dim=0)#.view(1,1)
    # Control variate at the unconditional relaxed sample z.
    surr_input = torch.cat([z, x, logits.detach()], dim=1)
    cz = surrogate.net(surr_input)
    # Control variate at z_tilde ~ p(z | b); single-sample Monte Carlo
    # (range(1) — the loop structure is kept for easy multi-sample averaging).
    cz_tildes = []
    for j in range(1):
        z_tilde = sample_relax_given_b(logits, b)
        surr_input = torch.cat([z_tilde, x, logits.detach()], dim=1)
        cz_tilde = surrogate.net(surr_input)
        cz_tildes.append(cz_tilde)
    cz_tildes = torch.stack(cz_tildes)
    cz_tilde = torch.mean(cz_tildes, dim=0) #.view(B,1)
    return b, logprob, cz, cz_tilde
def sample_relax_given_class(logits, samp):
    """Sample the REBAR/RELAX pair (z, z_tilde) conditioned on a given class.

    `samp` plays the role of the discrete sample b (instead of argmax).
    Globals `B` and `C` give batch size and number of classes.

    Returns (z, z_tilde, logprob) where z and z_tilde are both conditional
    Gumbel samples consistent with b; logprob is log q(b) as a [B,1] tensor.
    """
    cat = Categorical(logits=logits)
    u = torch.rand(B,C).clamp(1e-8, 1.-1e-8)
    gumbels = -torch.log(-torch.log(u))
    z = logits + gumbels
    b = samp #torch.argmax(z, dim=1)
    logprob = cat.log_prob(b).view(B,1)
    # First conditional sample: re-route the uniforms so argmax(z_tilde) == b.
    u_b = torch.gather(input=u, dim=1, index=b.view(B,1))
    z_tilde_b = -torch.log(-torch.log(u_b))
    z_tilde = -torch.log((- torch.log(u) / torch.softmax(logits, dim=1)) - torch.log(u_b))
    z_tilde.scatter_(dim=1, index=b.view(B,1), src=z_tilde_b)
    # The unconditional z above is discarded; z is the first conditional draw.
    z = z_tilde
    # Second conditional draw with fresh uniforms (but u_b gathered from the
    # first u — NOTE(review): presumably intended to correlate the two draws;
    # confirm against the REBAR sampling scheme).
    u_b = torch.gather(input=u, dim=1, index=b.view(B,1))
    z_tilde_b = -torch.log(-torch.log(u_b))
    u = torch.rand(B,C).clamp(1e-8, 1.-1e-8)
    z_tilde = -torch.log((- torch.log(u) / torch.softmax(logits, dim=1)) - torch.log(u_b))
    z_tilde.scatter_(dim=1, index=b.view(B,1), src=z_tilde_b)
    return z, z_tilde, logprob
def relax_grad2(x, logits, b, surrogate, mixtureweights):
    """Compute the RELAX gradient of the ELBO-style objective w.r.t. logits
    for a given discrete sample b.

    Returns (grad, pb): the per-logit gradient [B,C] and q(b) = exp(log q(b)).
    Depends on module-level helpers `myclamp`, `sample_relax_given_b` and
    `logprob_undercomponent`.
    """
    B = logits.shape[0]
    C = logits.shape[1]
    cat = Categorical(logits=logits)
    # u = torch.rand(B,C).clamp(1e-10, 1.-1e-10).cuda()
    u = myclamp(torch.rand(B,C).cuda())
    gumbels = -torch.log(-torch.log(u))
    z = logits + gumbels  # unconditional relaxed sample
    # b = torch.argmax(z, dim=1) #.view(B,1)
    logq = cat.log_prob(b).view(B,1)
    # Control variate at z and at the conditional sample z_tilde ~ p(z|b).
    surr_input = torch.cat([z, x, logits.detach()], dim=1)
    cz = surrogate.net(surr_input)
    z_tilde = sample_relax_given_b(logits, b)
    surr_input = torch.cat([z_tilde, x, logits.detach()], dim=1)
    cz_tilde = surrogate.net(surr_input)
    logpx_given_z = logprob_undercomponent(x, component=b)
    logpz = torch.log(mixtureweights[b]).view(B,1)
    logpxz = logpx_given_z + logpz #[B,1]
    f = logpxz - logq
    # RELAX objective: score-function term with control variate, plus the
    # pathwise correction (cz - cz_tilde); the extra "- logq" is the entropy
    # term of the objective being differentiated.
    net_loss = - torch.mean( (f.detach() - cz_tilde.detach()) * logq - logq + cz - cz_tilde )
    grad = torch.autograd.grad([net_loss], [logits], create_graph=True, retain_graph=True)[0] #[B,C]
    pb = torch.exp(logq)
    return grad, pb
def sample_relax(logits): #, k=1):
    """Sample (z, b, logprob, z_tilde) for the RELAX estimator.

    Globals `B` and `C` give batch size and number of classes.
    NOTE(review): `probs = softmax(logits).repeat(B,1)` only yields a [B,C]
    tensor if `logits` is a single row [1,C]; for batched logits this would
    produce [B*B, C] — confirm the expected logits shape at call sites.
    """
    # u = torch.rand(B,C).clamp(1e-8, 1.-1e-8) #.cuda()
    u = torch.rand(B,C).clamp(1e-12, 1.-1e-12) #.cuda()
    gumbels = -torch.log(-torch.log(u))
    z = logits + gumbels
    b = torch.argmax(z, dim=1)
    cat = Categorical(logits=logits)
    logprob = cat.log_prob(b).view(B,1)
    # Conditional sample z_tilde given b, using fresh uniforms.
    v_k = torch.rand(B,1).clamp(1e-12, 1.-1e-12)
    z_tilde_b = -torch.log(-torch.log(v_k))
    #this way seems biased even tho it shoudlnt be
    # v_k = torch.gather(input=u, dim=1, index=b.view(B,1))
    # z_tilde_b = torch.gather(input=z, dim=1, index=b.view(B,1))
    v = torch.rand(B,C).clamp(1e-12, 1.-1e-12) #.cuda()
    probs = torch.softmax(logits,dim=1).repeat(B,1)
    # print (probs.shape, torch.log(v_k).shape, torch.log(v).shape)
    z_tilde = -torch.log((- torch.log(v) / probs) - torch.log(v_k))
    # Overwrite column b so that argmax(z_tilde) == b.
    z_tilde.scatter_(dim=1, index=b.view(B,1), src=z_tilde_b)
    return z, b, logprob, z_tilde
class OneHotCategorical(Distribution):
    r"""
    Creates a one-hot categorical distribution parameterized by `probs`.

    Samples are one-hot coded vectors of size probs.size(-1).

    See also: :func:`torch.distributions.Categorical`

    Example::

        >>> m = OneHotCategorical(torch.Tensor([ 0.25, 0.25, 0.25, 0.25 ]))
        >>> m.sample()  # equal probability of 0, 1, 2, 3
         0
         0
         1
         0
        [torch.FloatTensor of size 4]

    Args:
        probs (Tensor or Variable): event probabilities
    """
    # Legacy (pre-0.4) torch.distributions API: `params` declares constraint
    # metadata; `Variable` support below dates from the same era.
    params = {'probs': constraints.simplex}
    support = constraints.simplex
    has_enumerate_support = True

    def __init__(self, probs=None, logits=None):
        # Delegate all bookkeeping to an underlying Categorical.
        self._categorical = Categorical(probs, logits)
        batch_shape = self._categorical.probs.size()[:-1]
        event_shape = self._categorical.probs.size()[-1:]
        super(OneHotCategorical, self).__init__(batch_shape, event_shape)

    def sample(self, sample_shape=torch.Size()):
        # Draw integer indices, then scatter 1s into a zero tensor to one-hot
        # encode them along the last (event) dimension.
        sample_shape = torch.Size(sample_shape)
        probs = self._categorical.probs
        one_hot = probs.new(self._extended_shape(sample_shape)).zero_()
        indices = self._categorical.sample(sample_shape)
        if indices.dim() < one_hot.dim():
            indices = indices.unsqueeze(-1)
        return one_hot.scatter_(-1, indices, 1)

    def log_prob(self, value):
        # Recover the index from the one-hot vector via argmax.
        indices = value.max(-1)[1]
        return self._categorical.log_prob(indices)

    def entropy(self):
        return self._categorical.entropy()

    def enumerate_support(self):
        # Identity matrix rows are exactly the n one-hot outcomes; broadcast
        # them across the batch shape.
        probs = self._categorical.probs
        n = self.event_shape[0]
        if isinstance(probs, Variable):
            values = Variable(torch.eye(n, out=probs.data.new(n, n)))
        else:
            values = torch.eye(n, out=probs.new(n, n))
        values = values.view((n,) + (1,) * len(self.batch_shape) + (n,))
        return values.expand((n,) + self.batch_shape + (n,))
def reinforce_baseline(surrogate, x, logits, mixtureweights, k=1, get_grad=False):
    """REINFORCE with a learned, input-dependent baseline surrogate(x).

    Runs k score-function samples; optionally also returns per-sample
    gradients of the estimator w.r.t. logits (get_grad=True).
    Depends on module-level `logprob_undercomponent`.

    Returns a dict with the last iteration's logq / logpx_given_z / logpz / f
    / net_loss, the surrogate regression loss, and (optionally) grad stats.
    """
    B = logits.shape[0]
    probs = torch.softmax(logits, dim=1)
    outputs = {}
    cat = Categorical(probs=probs)
    grads =[]
    # net_loss = 0
    for jj in range(k):
        cluster_H = cat.sample()
        outputs['logq'] = logq = cat.log_prob(cluster_H).view(B,1)
        outputs['logpx_given_z'] = logpx_given_z = logprob_undercomponent(x, component=cluster_H)
        outputs['logpz'] = logpz = torch.log(mixtureweights[cluster_H]).view(B,1)
        logpxz = logpx_given_z + logpz #[B,1]
        # Baseline is a function of x only, so it leaves the estimator unbiased.
        surr_pred = surrogate.net(x)
        outputs['f'] = f = logpxz - logq - 1.
        # outputs['net_loss'] = net_loss = net_loss - torch.mean((f.detach() ) * logq)
        outputs['net_loss'] = net_loss = - torch.mean((f.detach() - surr_pred.detach()) * logq)
        # net_loss += - torch.mean( -logq.detach()*logq)
        # surr_loss = torch.mean(torch.abs(f.detach() - surr_pred))
        # Train the baseline to minimize the variance proxy
        # ((f - c) * d logq / d logits)^2 rather than plain regression to f.
        grad_logq = torch.autograd.grad([torch.mean(logq)], [logits], create_graph=True, retain_graph=True)[0]
        surr_loss = torch.mean(((f.detach() - surr_pred) * grad_logq )**2)
        if get_grad:
            grad = torch.autograd.grad([net_loss], [logits], create_graph=True, retain_graph=True)[0]
            grads.append(grad)
    # net_loss = net_loss/ k
    if get_grad:
        grads = torch.stack(grads)
        # print (grads.shape)
        outputs['grad_avg'] = torch.mean(torch.mean(grads, dim=0),dim=0)
        outputs['grad_std'] = torch.std(grads, dim=0)[0]
    outputs['surr_loss'] = surr_loss
    # return net_loss, f, logpx_given_z, logpz, logq
    return outputs
def get_action(self, x, action=None, prev_pol=None, prev_n=None):
    """Run the policy heads on `x` and return the sampled (or given) action.

    When both `prev_pol` and `prev_n` are supplied, the repeat-policy merges
    the previous decision into the current logits / repeat count.

    Returns (action, log_prob, entropy, n, logits, mu, scale, z).
    """
    logits = self.get_logits(x)
    mu = self.get_mu(x)
    scale = self.get_scale(x)
    z = self.forward(x)
    n = self.policy_repeat_sampler(mu, scale)
    repeating = prev_pol is not None and prev_n is not None
    if repeating:
        logits, n = self.repeat_policy(prev_pol, prev_n, logits, n)
    dist = Categorical(logits=logits)
    chosen = dist.sample() if action is None else action
    return (chosen, dist.log_prob(chosen), dist.entropy(),
            n, logits, mu, scale, z)
def sample_relax(probs):
    """Sample the RELAX triple for a categorical given probabilities.

    Globals `B` (batch) and `C` (classes) size the uniform draws.

    Returns (z, b, logprob, z_tilde, gumbels): the unconditional relaxed
    sample z, the hard sample b = argmax(z), log q(b), the conditional
    relaxed sample z_tilde (argmax(z_tilde) == b per row), and the raw
    Gumbel noise.
    """
    cat = Categorical(probs=probs)

    # Sample z = log(probs) + Gumbel noise; clamp keeps -log(-log(u)) finite.
    u = torch.rand(B, C).cuda()
    u = u.clamp(1e-8, 1. - 1e-8)
    gumbels = -torch.log(-torch.log(u))
    z = torch.log(probs) + gumbels
    b = torch.argmax(z, dim=1)
    logprob = cat.log_prob(b).view(B, 1)

    # Sample z_tilde ~ p(z | argmax(z) = b).
    u_b = torch.rand(B, 1).cuda()
    u_b = u_b.clamp(1e-8, 1. - 1e-8)
    z_tilde_b = -torch.log(-torch.log(u_b))
    u = torch.rand(B, C).cuda()
    u = u.clamp(1e-8, 1. - 1e-8)
    z_tilde = -torch.log((-torch.log(u) / probs) - torch.log(u_b))
    # BUG FIX: the original `z_tilde[:, b] = z_tilde_b` assigns columns b for
    # EVERY row (only correct when B == 1). scatter_ writes each row's own
    # b-th entry, matching the sibling sample_relax_given_class* functions.
    z_tilde.scatter_(dim=1, index=b.view(B, 1), src=z_tilde_b)

    return z, b, logprob, z_tilde, gumbels
def sample_relax(probs):
    """Sample the RELAX triple (z, b, z_tilde) for a categorical.

    Globals `B` (batch) and `C` (classes) size the uniform draws.

    Returns (z, b, logprob, z_tilde, gumbels).
    """
    cat = Categorical(probs=probs)

    # Sample z: Gumbel-max over log-probabilities.
    u = torch.rand(B,C).cuda()
    u = u.clamp(1e-8, 1.-1e-8)
    gumbels = -torch.log(-torch.log(u))
    z = torch.log(probs) + gumbels
    b = torch.argmax(z, dim=1)
    logprob = cat.log_prob(b).view(B,1)

    # Sample z_tilde conditioned on b being the argmax.
    u_b = torch.rand(B,1).cuda()
    u_b = u_b.clamp(1e-8, 1.-1e-8)
    z_tilde_b = -torch.log(-torch.log(u_b))
    u = torch.rand(B,C).cuda()
    u = u.clamp(1e-8, 1.-1e-8)
    z_tilde = -torch.log((- torch.log(u) / probs) - torch.log(u_b))
    # BUG FIX: `z_tilde[:,b] = z_tilde_b` set columns b across ALL rows
    # (only valid for B == 1); scatter_ pins each row's own b-th coordinate,
    # consistent with the other conditional samplers in this file.
    z_tilde.scatter_(dim=1, index=b.view(B,1), src=z_tilde_b)

    return z, b, logprob, z_tilde, gumbels
def forward(self, encoder_inputs, hx, n_steps, greedy=False):
    """Autoregressively decode `n_steps` pointers over `encoder_inputs`.

    At each step the attention module produces a distribution over encoder
    positions; chosen positions are masked out so each index is picked at
    most once, and the chosen encoder vector becomes the next RNN input.

    Returns (actions [B, n_steps], summed log_prob [B], mean entropy [B]).
    """
    # Start token: zero vector; mask: nothing selected yet.
    _input = encoder_inputs.new_zeros(
        (encoder_inputs.size(0), encoder_inputs.size(2)))
    mask = encoder_inputs.new_zeros(
        (encoder_inputs.size(0), encoder_inputs.size(1)))
    log_ps = []
    actions = []
    entropys = []
    for i in range(n_steps):
        hx = self.cell(_input, hx)
        # print (hx.size(),encoder_inputs.size(),mask.size())
        p = self.attn(hx, encoder_inputs, mask)
        dist = Categorical(p)
        entropy = dist.entropy()
        if greedy:
            # Deterministic decoding: pick the most likely position.
            _, index = p.max(dim=-1)
        else:
            index = dist.sample()
        actions.append(index)
        log_p = dist.log_prob(index)
        log_ps.append(log_p)
        entropys.append(entropy)
        # Mark the chosen position so it cannot be selected again.
        mask = mask.scatter(1, index.unsqueeze(-1).expand(mask.size(0), -1), 1)
        # Feed the selected encoder vector as the next step's input.
        _input = torch.gather(
            encoder_inputs, 1,
            index.unsqueeze(-1).unsqueeze(-1).expand(
                encoder_inputs.size(0), -1, encoder_inputs.size(2))).squeeze(1)
    log_ps = torch.stack(log_ps, 1)
    actions = torch.stack(actions, 1)
    entropys = torch.stack(entropys, 1)
    log_p = log_ps.sum(dim=1)
    entropy = entropys.mean(dim=1)
    return actions, log_p, entropy
def sample_relax_given_class_k(logits, samp, k):
    """k-sample version of sample_relax_given_class: average k conditional
    (z, z_tilde) pairs for the fixed class `samp`.

    Globals `B` and `C` give batch size and number of classes.
    Returns (mean z, mean z_tilde, logprob of b under q).
    """
    cat = Categorical(logits=logits)
    b = samp #torch.argmax(z, dim=1)
    logprob = cat.log_prob(b).view(B, 1)
    zs = []
    z_tildes = []
    for i in range(k):
        u = torch.rand(B, C).clamp(1e-8, 1. - 1e-8)
        gumbels = -torch.log(-torch.log(u))
        z = logits + gumbels
        # First conditional draw (becomes z).
        u_b = torch.gather(input=u, dim=1, index=b.view(B, 1))
        z_tilde_b = -torch.log(-torch.log(u_b))
        z_tilde = -torch.log((-torch.log(u) / torch.softmax(logits, dim=1)) - torch.log(u_b))
        z_tilde.scatter_(dim=1, index=b.view(B, 1), src=z_tilde_b)
        z = z_tilde
        # Second conditional draw (becomes z_tilde), re-using u_b from the
        # first uniforms but fresh u elsewhere.
        u_b = torch.gather(input=u, dim=1, index=b.view(B, 1))
        z_tilde_b = -torch.log(-torch.log(u_b))
        u = torch.rand(B, C).clamp(1e-8, 1. - 1e-8)
        z_tilde = -torch.log((-torch.log(u) / torch.softmax(logits, dim=1)) - torch.log(u_b))
        z_tilde.scatter_(dim=1, index=b.view(B, 1), src=z_tilde_b)
        zs.append(z)
        z_tildes.append(z_tilde)
    zs = torch.stack(zs)
    z_tildes = torch.stack(z_tildes)
    # Monte Carlo average over the k draws.
    z = torch.mean(zs, dim=0)
    z_tilde = torch.mean(z_tildes, dim=0)
    return z, z_tilde, logprob
def get_action(self, x): logits = pg.forward(x) # ALGO LOGIC: `env.action_space` specific logic if isinstance(env.action_space, Discrete): probs = Categorical(logits=logits) action = probs.sample() return action, -probs.log_prob(action), probs.entropy() elif isinstance(env.action_space, MultiDiscrete): logits_categories = torch.split(logits, env.action_space.nvec.tolist(), dim=1) action = [] probs_categories = [] entropy = torch.zeros((logits.shape[0])) neglogprob = torch.zeros((logits.shape[0])) for i in range(len(logits_categories)): probs_categories.append(Categorical(logits=logits_categories[i])) if len(action) != env.action_space.shape: action.append(probs_categories[i].sample()) neglogprob -= probs_categories[i].log_prob(action[i]) entropy += probs_categories[i].entropy() action = torch.stack(action).transpose(0, 1) return action, neglogprob, entropy
def forward(self, masked, lengths, unmasked, mask):
    """Sample token sequences from the model's per-position distributions.

    Positions where `mask` is set are replaced by samples from the model;
    unmasked positions keep the ground-truth token from `unmasked`.

    Returns (samples [B,T], log_probs [B,T], attns).
    """
    self.encoder.lstm.flatten_parameters()
    logits, attns = super().forward(masked, lengths, unmasked)
    bsz, seqlen, vocab_size = logits.size()
    # Sample from x converting it to probabilities
    samples = []
    log_probs = []
    for t in range(seqlen):
        logit = logits[:, t, :]
        distribution = Categorical(logits=logit)
        sampled = distribution.sample()
        # Only masked positions take the sampled token; others pass through.
        fsampled = torch.where(mask[:, t].byte(), sampled, unmasked[:, t])
        # Log-prob is evaluated at the final (possibly passed-through) token.
        log_prob = distribution.log_prob(fsampled)
        # flog_prob = torch.where(mask[:, t].byte(), log_prob, torch.zeros_like(log_prob))
        log_probs.append(log_prob)
        samples.append(fsampled)
    samples = torch.stack(samples, dim=1)
    log_probs = torch.stack(log_probs, dim=1)
    return (samples, log_probs, attns)
def act(self, obs: np.ndarray, explore: bool):
    """Returns an action (should be called at every timestep)

    **YOU MUST IMPLEMENT THIS FUNCTION FOR Q3**

    Select an action from the model's stochastic policy by sampling a
    discrete action from the distribution specified by the model output.
    The sampled action's log-prob is stored in self.save_policy_probs for
    the end-of-episode update. (The policy is always stochastic here, so
    `explore` is not consulted.)

    :param obs (np.ndarray): observation vector from the environment
    :param explore (bool): flag indicating whether we should explore
    :return (sample from self.action_space): action the agent should perform
    """
    state = torch.from_numpy(obs).float().unsqueeze(0)
    probs = self.policy.forward(state)
    # FIX: pass dim explicitly — softmax's implicit-dim selection is
    # deprecated and can silently normalize the wrong axis. Logits are
    # (1, n_actions), so normalize over the last dimension.
    probs = torch.nn.functional.softmax(probs, dim=-1)
    m = Categorical(probs)
    action = m.sample()
    self.save_policy_probs.append(m.log_prob(action))
    return action.item()
def forward(self, large_maps, small_maps, rgb_ims=None, hidden_state=None,
            action=None, deterministic=False):
    """Actor-critic forward pass over sequences of map (and optional RGB)
    observations.

    Each input is (seq_len, batch, C, H, W); per-frame CNN features are
    concatenated, merged, and fed through the RNN before the actor/critic
    heads.

    Returns (action, log_prob, entropy, value, detached hidden_state, pi).
    """
    seq_len, batch_size, C, H, W = large_maps.size()
    # Fold the time dimension into the batch for the CNN, then unfold.
    large_maps = large_maps.view(batch_size * seq_len, C, H, W)
    l_cnn_out = self.large_map_resnet_model(large_maps)
    l_cnn_out = l_cnn_out.view(seq_len, batch_size, -1)
    seq_len, batch_size, C, H, W = small_maps.size()
    small_maps = small_maps.view(batch_size * seq_len, C, H, W)
    s_cnn_out = self.small_map_resnet_model(small_maps)
    s_cnn_out = s_cnn_out.view(seq_len, batch_size, -1)
    if self.use_rgb:
        seq_len, batch_size, C, H, W = rgb_ims.size()
        rgb_ims = rgb_ims.view(batch_size * seq_len, C, H, W)
        rgb_cnn_out = self.rgb_resnet_model(rgb_ims)
        rgb_cnn_out = rgb_cnn_out.view(seq_len, batch_size, -1)
        cnn_out = torch.cat((rgb_cnn_out, l_cnn_out, s_cnn_out), dim=-1)
    else:
        cnn_out = torch.cat((l_cnn_out, s_cnn_out), dim=-1)
    rnn_in = F.elu(self.merge_fc(cnn_out))
    rnn_out, hidden_state = self.rnn(rnn_in, hidden_state)
    pi = self.actor_head(self.actor_fc(rnn_out))
    val = self.critic_head(self.critic_fc(rnn_out))
    cat_dist = Categorical(logits=pi)
    if action is None:
        if not deterministic:
            action = cat_dist.sample()
        else:
            # Greedy action straight from the logits (dim 2 = action dim).
            action = torch.max(pi, dim=2)[1]
    log_prob = cat_dist.log_prob(action)
    # Hidden state is detached so BPTT does not cross rollout boundaries.
    return action, log_prob, cat_dist.entropy(), val, hidden_state.detach(), pi
def update_model(model, gamma, optim, rollouts, device, iteration, writer):
    """One actor-critic update per transition in `rollouts`.

    Each rollout entry is (state, action, reward, next_state). The critic
    is trained toward the one-step TD target; the actor is trained with the
    TD error as advantage.

    :param model: network whose forward(s) returns (actor_logits, value)
    :param gamma: discount factor
    :param optim: optimizer over model parameters
    :param rollouts: list of (s, a, r, ns) tuples
    :param device, iteration, writer: kept for interface compatibility (unused)
    :return: (accumulated actor loss, accumulated critic loss), both scalars
    """
    actor_loss, critic_loss = 0., 0.
    for s, a, r, ns in rollouts:
        actor, critic = model.forward(s)
        _, n_critic = model.forward(ns)
        # FIX: semi-gradient TD — the bootstrap target must not carry
        # gradients, otherwise the critic chases a moving target through
        # its own parameters.
        target = (r + gamma * n_critic).detach()
        loss_c = F.mse_loss(critic, target)
        # FIX: the TD error acts as a constant advantage for the actor;
        # detaching it stops the actor loss from back-propagating into the
        # critic head.
        err = (r + gamma * n_critic - critic).detach()
        actor_dist = Categorical(logits=actor)
        loss_a = -actor_dist.log_prob(a) * err
        loss = loss_c + loss_a
        optim.zero_grad()
        loss.backward()
        optim.step()
        actor_loss += loss_a.view([])
        critic_loss += loss_c.view([])
    return actor_loss, critic_loss
def reinforce(x, logits, mixtureweights, k=1):
    """Plain REINFORCE estimator for the mixture model, averaged over k
    samples, with a constant baseline of 1.

    Returns (net_loss, f, logpx_given_z, logpz, logq), where the last four
    come from the final Monte Carlo sample.
    """
    batch = logits.shape[0]
    cat = Categorical(probs=torch.softmax(logits, dim=1))
    total = 0
    for _ in range(k):
        cluster_H = cat.sample()
        logq = cat.log_prob(cluster_H).view(batch, 1)
        logpx_given_z = logprob_undercomponent(x, component=cluster_H)
        logpz = torch.log(mixtureweights[cluster_H]).view(batch, 1)
        f = (logpx_given_z + logpz) - logq  # [B,1]
        # Score-function term; f is detached so gradients flow only via logq.
        total = total - torch.mean((f.detach() - 1.) * logq)
    net_loss = total / k
    return net_loss, f, logpx_given_z, logpz, logq
def sample_relax_given_class_k(logits, samp, k):
    """Average k conditional (z, z_tilde) Gumbel samples for a fixed class.

    Duplicate of the other sample_relax_given_class_k in this file; uses
    globals `B` and `C` for batch size / number of classes.
    Returns (mean z, mean z_tilde, logprob).
    """
    cat = Categorical(logits=logits)
    b = samp #torch.argmax(z, dim=1)
    logprob = cat.log_prob(b).view(B,1)
    zs = []
    z_tildes = []
    for i in range(k):
        u = torch.rand(B,C).clamp(1e-8, 1.-1e-8)
        gumbels = -torch.log(-torch.log(u))
        z = logits + gumbels
        # First conditional draw (stored as z).
        u_b = torch.gather(input=u, dim=1, index=b.view(B,1))
        z_tilde_b = -torch.log(-torch.log(u_b))
        z_tilde = -torch.log((- torch.log(u) / torch.softmax(logits, dim=1)) - torch.log(u_b))
        z_tilde.scatter_(dim=1, index=b.view(B,1), src=z_tilde_b)
        z = z_tilde
        # Second conditional draw (stored as z_tilde) with fresh uniforms.
        u_b = torch.gather(input=u, dim=1, index=b.view(B,1))
        z_tilde_b = -torch.log(-torch.log(u_b))
        u = torch.rand(B,C).clamp(1e-8, 1.-1e-8)
        z_tilde = -torch.log((- torch.log(u) / torch.softmax(logits, dim=1)) - torch.log(u_b))
        z_tilde.scatter_(dim=1, index=b.view(B,1), src=z_tilde_b)
        zs.append(z)
        z_tildes.append(z_tilde)
    zs= torch.stack(zs)
    z_tildes= torch.stack(z_tildes)
    # Monte Carlo average over the k draws.
    z = torch.mean(zs, dim=0)
    z_tilde = torch.mean(z_tildes, dim=0)
    return z, z_tilde, logprob
def get_action(state, policy_model, value_model, device):
    """Sample an action and estimate the state value for `state`.

    :param state: observation, either an np.ndarray or a torch.Tensor
    :param policy_model: network mapping states to action logits
    :param value_model: network mapping states to a scalar value
    :param device: device to move converted numpy inputs onto
    :return: (action int, log-probability float, value float)
    """
    policy_model.eval()
    value_model.eval()
    # FIX: the original test was `if not state is torch.Tensor`, an identity
    # comparison of the object against the class itself — it is always True,
    # so a tensor input crashed in torch.from_numpy. isinstance is the
    # correct type check.
    if not isinstance(state, torch.Tensor):
        state = torch.from_numpy(state).float().to(device)
    if state.shape[0] != 1:
        state = state.unsqueeze(0)  # Create batch dimension
    logits = policy_model(state)
    m = Categorical(logits=logits)
    action = m.sample()
    log_probability = m.log_prob(action)
    value = value_model(state)
    return action.item(), log_probability.item(), value.item()
def reinforce(x, logits, mixtureweights, k=1):
    """REINFORCE gradient estimator (constant baseline of 1), k-sample
    Monte Carlo average.

    Returns (net_loss, f, logpx_given_z, logpz, logq); the last four are
    from the final sample drawn.
    """
    B = logits.shape[0]
    cat = Categorical(probs=torch.softmax(logits, dim=1))
    per_sample_losses = []
    for _ in range(k):
        cluster_H = cat.sample()
        logq = cat.log_prob(cluster_H).view(B, 1)
        logpx_given_z = logprob_undercomponent(x, component=cluster_H)
        logpz = torch.log(mixtureweights[cluster_H]).view(B, 1)
        f = logpx_given_z + logpz - logq  # [B,1]
        # Gradients flow through logq only (f is detached).
        per_sample_losses.append(-torch.mean((f.detach() - 1.) * logq))
    net_loss = sum(per_sample_losses) / k
    return net_loss, f, logpx_given_z, logpz, logq
def forward(self, iteration):
    '''Sample an architecture, layer by layer, for staged NAS training.

    Layers before the current stage use the argmax of their (frozen)
    alpha logits; layers in the current stage are sampled and contribute
    log-probs / entropies; layers after the stage are sampled without
    contributing to the learning signal.

    Returns the sampled architecture matrix (also stored on self).
    '''
    entropys = []
    log_probs = []
    sampled_arcs = []
    start_idx, end_idx = self._get_stage_index(iteration)
    cur_layer_idx = list(range(start_idx, end_idx))
    self.op_dist = []
    for layer_id in range(self.num_layers):
        logit = self.alpha[layer_id]
        # if self.temperature > 0:
        #     logit /= self.temperature
        # if self.tanh_constant is not None:
        #     logit = self.tanh_constant * torch.tanh(logit)
        op_dist = Categorical(logits=logit)
        self.op_dist.append(op_dist)
        if layer_id in cur_layer_idx:
            # Currently-trained layers: sample and track the REINFORCE terms.
            sampled_op = op_dist.sample()
            log_prob = op_dist.log_prob(sampled_op)
            log_probs.append(log_prob.view(-1, 1))
            entropy = op_dist.entropy()
            entropys.append(entropy.view(-1, 1))
        elif layer_id < start_idx:
            # Earlier (already-decided) layers: greedy choice.
            sampled_op = logit.argmax(-1)
        elif layer_id >= end_idx:
            # Future layers: sample but do not accumulate learning signal.
            sampled_op = op_dist.sample()
        sampled_arcs.append(sampled_op.view(-1, 1))
    self.sampled_arcs = torch.cat(sampled_arcs, dim=1)
    self.sample_entropy = torch.cat(entropys, dim=1)
    self.sample_log_prob = torch.cat(log_probs, dim=1)
    return self.sampled_arcs
def get_samples_and_logp(self, x, n_samples, return_inermediate=False):
    """Draw n_samples from each categorical head of the network.

    (Parameter name `return_inermediate` keeps the original — typo'd —
    public spelling for caller compatibility.)

    Returns (distribs, samples, logps) and additionally the shared trunk
    output when return_inermediate is True. samples/logps are transposed
    so rows index samples and columns index heads.
    """
    if not return_inermediate:
        logits = self.forward(x, False)
    else:
        logits, shared_out = self.forward(x, True)
    distribs = []
    samples = []
    logps = []
    # One Categorical per output head.
    for l in logits:
        d = Categorical(logits=l)
        sample = d.sample((n_samples, ))
        samples.append(sample)
        distribs.append(d)
        logps.append(d.log_prob(sample))
    samples = torch.stack(samples, dim=0).T
    logps = torch.stack(logps, dim=0).T
    if not return_inermediate:
        return distribs, samples, logps
    else:
        return distribs, samples, logps, shared_out
def act(self, obs: np.ndarray, explore: bool):
    """Returns an action (should be called at every timestep)

    **YOU MUST IMPLEMENT THIS FUNCTION FOR Q3**

    Sample a discrete action from the actor's softmax policy and evaluate
    the critic at the same state.

    :param obs (np.ndarray): observation vector from the environment
    :param explore (bool): flag indicating whether we should explore
    :return: (action index, critic output for the state, log-prob of action)
    """
    state = torch.from_numpy(obs).type(torch.FloatTensor)
    policy_dist = Categorical(self.soft_max(self.actor(state)))
    chosen = policy_dist.sample()
    chosen_log_prob = policy_dist.log_prob(chosen)
    value_estimate = self.critic(state)
    return chosen.item(), value_estimate, chosen_log_prob
def monte_carlo_sampling(self, h_j, dec_state, enc_idx=None):
    """Roll out the decoder by sampling (rather than argmax) at each step.

    Used for REINFORCE-style training: returns the per-step negative
    log-probs of the sampled tokens alongside the samples and attention
    weights. When a windower is configured, the encoder states visible to
    each step are slid via EncoderSlider.

    Returns (neg_log_probs, ys, weights), each transposed to
    (batch_size, max_dec_len, ...); weights is (batch, dec_len, enc_len).
    """
    current_h_j = h_j
    current_enc_idx = enc_idx
    s_t = dec_state  # entire final encoding state - hidden (plus cell) for all layers from num_layers
    # Start every sequence with the decoder start token embedding.
    y_prev = np.ones((h_j.size(0),), dtype=int) * self.vocab[START_DEC]
    y_prev = torch.from_numpy(y_prev).to(self.device)
    y_prev = self.embedding(y_prev)
    ys, neg_log_probs, weights = [], [], []
    if self.windower:
        # Restrict attention to the initial window of encoder states.
        current_h_j = h_j[:, 0:self.windower.ws, :]
        current_enc_idx = enc_idx[:, 0:self.windower.ws]
        enc_slider = EncoderSlider(h_j, enc_idx, self.windower)
    for t in range(self.dec_max_len):
        if self.windower:
            current_h_j, current_enc_idx = enc_slider.slide(current_h_j, current_enc_idx, t)
        dec_outputs, s_t, a_ij, _ = self.one_step_decode(current_h_j, s_t, y_prev, enc_idx, current_enc_idx)
        # dec_outputs are log-probs; exponentiate for Categorical.
        sample_probs = torch.exp(dec_outputs)
        cat_dist = Categorical(sample_probs)
        sample = cat_dist.sample()
        ys.append(sample)
        neg_log_prob = -cat_dist.log_prob(sample)
        # OOV tokens are masked before embedding lookup for the next step.
        sample_masked = mask_oov(sample, self.vocab)
        y_prev = self.embedding(sample_masked)
        weights.append(a_ij)
        neg_log_probs.append(neg_log_prob)
    return torch.stack(neg_log_probs).transpose(0, 1), \
        torch.stack(ys).transpose(0, 1), \
        torch.stack(weights).transpose(0, 1)  # (batch_size, max_dec_len, max_enc_len)
def diagonal_FIM(agent, env, episode_len, model_name):
    """Estimate a diagonal Fisher information matrix for a policy agent and
    pickle it to data-<model_name>/FIM.dat.

    The FIM diagonal is approximated by the squared gradients of the mean
    (reward-weighted) log-likelihood over 1000 episodes. Dict keys are
    parameter names with '.' replaced by '__'.
    """
    print('Estimating diagonal FIM...')
    episodes = 1000
    log_probs = []
    avg_reward = 0.0
    for step in range(episodes):
        # Run an episode.
        (states, actions, discounted_rewards) = network.run_episode(env, agent, episode_len)
        avg_reward += np.mean(discounted_rewards)
        if step % 100 == 0:
            print('Average reward @ episode {}: {}'.format(
                step, avg_reward / 100))
            avg_reward = 0.0
        # Repeat each action, and backpropagate discounted
        # rewards. This can probably be batched for efficiency with a
        # memoryless agent...
        # NOTE(review): the loop variable `step` here shadows the outer
        # episode counter — harmless because the outer value is only used
        # above, but worth renaming.
        for (step, a) in enumerate(actions):
            logits = agent(states[step])
            dist = Categorical(logits=logits)
            log_probs.append(-dist.log_prob(actions[step]) * discounted_rewards[step])
    loglikelihoods = torch.cat(log_probs).mean(0)
    loglikelihood_grads = autograd.grad(loglikelihoods, agent.parameters())
    # torch.dot(loglikelihood_grads * loglikelihood_grads.T)
    # Squared per-parameter gradients approximate the FIM diagonal.
    FIM = {
        n: g**2
        for n, g in zip([n for (n, _) in agent.named_parameters()], loglikelihood_grads)
    }
    # Rewrite keys so they are safe to use as identifiers/filenames.
    for (n, _) in agent.named_parameters():
        FIM[n.replace(".", "__")] = FIM.pop(n)
    with open("data-{model}/FIM.dat".format(model=model_name), 'wb+') as f:
        pickle.dump(FIM, f)
    print("File dumped correctly.")
def relax_grad(x, logits, b, surrogate, mixtureweights):
    """Compute the RELAX surrogate-training loss for a given sample b.

    Unlike relax_grad2 (which returns the estimator's gradient), this
    returns the squared per-logit gradient of the RELAX estimator — the
    variance proxy minimized when training the surrogate — together with
    q(b). Depends on `myclamp`, `sample_relax_given_b`,
    `logprob_undercomponent`.
    """
    B = logits.shape[0]
    C = logits.shape[1]
    cat = Categorical(logits=logits)
    # u = torch.rand(B,C).clamp(1e-10, 1.-1e-10).cuda()
    u = myclamp(torch.rand(B,C).cuda())
    gumbels = -torch.log(-torch.log(u))
    z = logits + gumbels  # unconditional relaxed sample
    # b = torch.argmax(z, dim=1) #.view(B,1)
    logq = cat.log_prob(b).view(B,1)
    # Control variate at z and at z_tilde ~ p(z|b).
    surr_input = torch.cat([z, x, logits.detach()], dim=1)
    cz = surrogate.net(surr_input)
    z_tilde = sample_relax_given_b(logits, b)
    surr_input = torch.cat([z_tilde, x, logits.detach()], dim=1)
    cz_tilde = surrogate.net(surr_input)
    logpx_given_z = logprob_undercomponent(x, component=b)
    logpz = torch.log(mixtureweights[b]).view(B,1)
    logpxz = logpx_given_z + logpz #[B,1]
    f = logpxz - logq
    # Per-logit gradients of each estimator component.
    grad_logq = torch.autograd.grad([torch.mean(logq)], [logits], create_graph=True, retain_graph=True)[0]
    grad_surr_z = torch.autograd.grad([torch.mean(cz)], [logits], create_graph=True, retain_graph=True)[0]
    grad_surr_z_tilde = torch.autograd.grad([torch.mean(cz_tilde)], [logits], create_graph=True, retain_graph=True)[0]
    # surr_loss = torch.mean(((f.detach() - cz_tilde) * grad_logq - grad_logq + grad_surr_z - grad_surr_z_tilde)**2, dim=1, keepdim=True)
    # Squared RELAX gradient — minimizing its mean trains the surrogate to
    # reduce estimator variance.
    surr_loss = ((f.detach() - cz_tilde) * grad_logq - grad_logq + grad_surr_z - grad_surr_z_tilde)**2
    # print (surr_loss.shape)
    # print (logq.shape)
    # print (surr_loss, torch.exp(logq))
    return surr_loss, torch.exp(logq)
def update(self, rewards: List[float], observations: List[np.ndarray],
           actions: List[int]) -> Dict[str, float]:
    """Update function for REINFORCE

    **YOU MUST IMPLEMENT THIS FUNCTION FOR Q3**

    Accumulate -log pi(a_t|s_t) * G_t over the episode and take one
    optimizer step on the policy.

    :param rewards (List[float]): rewards of episode (from first to last)
    :param observations (List[np.ndarray]): observations of episode
        (from first to last)
    :param actions (List[int]): applied actions of episode (from first
        to last)
    :return (Dict[str, float]): dictionary mapping from loss names to
        loss values
    """
    returns = self.compute_gt(rewards)
    p_loss = 0.0
    for t in range(len(rewards)):
        logits = self.policy.forward(Tensor(observations[t]))
        action_dist = Categorical(torch.nn.functional.softmax(logits, dim=-1))
        p_loss = p_loss - action_dist.log_prob(torch.FloatTensor([actions[t]])) * returns[t]
    self.policy_optim.zero_grad()
    p_loss.backward()
    self.policy_optim.step()
    return {"p_loss": p_loss}
def forward(self, img1, img2, mex):
    """Score the two candidate images against the message and pick one.

    The message is one-hot encoded, embedded, and dot-producted (via bmm)
    with each image embedding to get one scalar per image; the pair of
    scalars is softmaxed into a choice distribution.

    :return: (probs, actions, logprobs, entropy)
    """
    mex = torch.nn.functional.one_hot(mex, num_classes=self.vocab_len)
    img1 = img1.view(img1.size(0), -1)
    img2 = img2.view(img2.size(0), -1)
    out1 = self.policy_single_img(img1)
    out2 = self.policy_single_img(img2)
    symbol = mex.view(mex.size(0), -1).float()
    symbol = self.policy_single_mex(symbol)
    # Batched dot product: one scalar per image.
    out1 = torch.bmm(symbol.view(symbol.size(0), 1, symbol.size(1)),
                     out1.view(out1.size(0), symbol.size(1), 1))
    out2 = torch.bmm(symbol.view(symbol.size(0), 1, symbol.size(1)),
                     out2.view(out2.size(0), symbol.size(1), 1))
    out1 = out1.view(out1.size(0), -1)
    out2 = out2.view(out2.size(0), -1)
    combined = torch.cat((out1, out2), dim=1)
    probs = self.softmax(combined)
    dist = Categorical(probs=probs)
    if self.training:
        actions = dist.sample()
    else:
        # BUG FIX: the original called dist.argmax(dim=1), but
        # torch.distributions.Categorical has no argmax method — eval mode
        # raised AttributeError. Take the greedy action from probs instead.
        actions = torch.argmax(probs, dim=1)
    logprobs = dist.log_prob(actions)
    entropy = dist.entropy()
    return probs, actions, logprobs, entropy
def sample_relax(logits): #, k=1):
    """Sample (z, b, logprob, z_tilde) for RELAX (duplicate variant).

    Globals `B` and `C` give batch size and number of classes.
    NOTE(review): `.repeat(B,1)` on the softmax only yields [B,C] when
    logits is a single row [1,C]; confirm expected logits shape.
    """
    # u = torch.rand(B,C).clamp(1e-8, 1.-1e-8) #.cuda()
    u = torch.rand(B, C).clamp(1e-12, 1. - 1e-12) #.cuda()
    gumbels = -torch.log(-torch.log(u))
    z = logits + gumbels
    b = torch.argmax(z, dim=1)
    cat = Categorical(logits=logits)
    logprob = cat.log_prob(b).view(B, 1)
    # Conditional draw for the chosen coordinate.
    v_k = torch.rand(B, 1).clamp(1e-12, 1. - 1e-12)
    z_tilde_b = -torch.log(-torch.log(v_k))
    # # v = torch.rand(B,C) #.clamp(1e-12, 1.-1e-12) #.cuda()
    # v_k = torch.gather(input=u, dim=1, index=b.view(B,1))
    # # z_tilde_b = -torch.log(-torch.log(v_k))
    # z_tilde_b = torch.gather(input=z, dim=1, index=b.view(B,1))
    # # print (z_tilde_b)
    v = torch.rand(B, C).clamp(1e-12, 1. - 1e-12) #.cuda()
    probs = torch.softmax(logits, dim=1).repeat(B, 1)
    # print (probs.shape, torch.log(v_k).shape, torch.log(v).shape)
    z_tilde = -torch.log((-torch.log(v) / probs) - torch.log(v_k))
    # Pin each row's b-th coordinate so argmax(z_tilde) == b.
    z_tilde.scatter_(dim=1, index=b.view(B, 1), src=z_tilde_b)
    return z, b, logprob, z_tilde
def get_action(state):
    """Sample a legal action for `state` using action masking.

    Uses module-level globals: `actor`, `critic` (networks), `env`
    (provides legal_actions()) and `buffer` (records the mask).
    Illegal actions get -inf logits so they have zero probability.

    Returns (action int, critic value, log-prob of the action).
    """
    state = torch.from_numpy(state).float().cuda()
    logits = actor(state.unsqueeze(dim=0))
    l = list(range(buffer.action_space))
    legal_actions = env.legal_actions()
    # Mask = complement of the legal action set.
    mask = [ele for ele in l if ele not in legal_actions]
    buffer.masks.append(mask)
    logits[0][mask] = -float("Inf")
    m = Categorical(logits=logits)
    action = m.sample()
    #print(action.item() in legal_actions)
    log_probs = m.log_prob(action)
    value = critic(state.unsqueeze(dim=0))
    return action.item(), value, log_probs
def sample_relax(x, logits, surrogate):
    """Full RELAX sampling pass: hard sample b plus the control variate
    evaluated at z and at z_tilde ~ p(z | b).

    Unlike the two-argument variant, `x` is passed in explicitly.
    Depends on module-level `myclamp` and `sample_relax_given_b`.

    Returns (b, logprob, cz, cz_tilde, z, z_tilde, gumbels, u).
    """
    B = logits.shape[0]
    C = logits.shape[1]
    cat = Categorical(logits=logits)
    # u = torch.rand(B,C).clamp(1e-10, 1.-1e-10).cuda()
    u = myclamp(torch.rand(B, C).cuda())
    gumbels = -torch.log(-torch.log(u))
    z = logits + gumbels  # unconditional relaxed sample
    b = torch.argmax(z, dim=1) #.view(B,1)
    logprob = cat.log_prob(b).view(B, 1)
    # czs = []
    # for j in range(1):
    #     z = sample_relax_z(logits)
    #     surr_input = torch.cat([z, x, logits.detach()], dim=1)
    #     cz = surrogate.net(surr_input)
    #     czs.append(cz)
    # czs = torch.stack(czs)
    # cz = torch.mean(czs, dim=0)#.view(1,1)
    # Control variate at z.
    surr_input = torch.cat([z, x, logits.detach()], dim=1)
    cz = surrogate.net(surr_input)
    # cz_tildes = []
    # for j in range(1):
    #     z_tilde = sample_relax_given_b(logits, b)
    #     surr_input = torch.cat([z_tilde, x, logits.detach()], dim=1)
    #     cz_tilde = surrogate.net(surr_input)
    #     cz_tildes.append(cz_tilde)
    # cz_tildes = torch.stack(cz_tildes)
    # cz_tilde = torch.mean(cz_tildes, dim=0) #.view(B,1)
    # Control variate at the conditional sample z_tilde.
    z_tilde = sample_relax_given_b(logits, b)
    surr_input = torch.cat([z_tilde, x, logits.detach()], dim=1)
    cz_tilde = surrogate.net(surr_input)
    return b, logprob, cz, cz_tilde, z, z_tilde, gumbels, u
def forward(self):
    """Autoregressively sample an architecture with the LSTM controller.

    For each edge, the LSTM emits logits (temperature-scaled and tanh-
    bounded), an operation is sampled, and its embedding feeds the next
    step.

    Returns (sum of log-probs, sum of entropies, decoded architecture).
    """
    inputs, h0 = self.input_vars, None
    log_probs, entropys, sampled_arch = [], [], []
    for iedge in range(self.num_edge):
        outputs, h0 = self.w_lstm(inputs, h0)
        logits = self.w_pred(outputs)
        # Temperature scaling then tanh bounding keeps the logits in a
        # controlled range (standard ENAS-style controller tricks).
        logits = logits / self.temperature
        logits = self.tanh_constant * torch.tanh(logits)
        # distribution
        op_distribution = Categorical(logits=logits)
        op_index = op_distribution.sample()
        sampled_arch.append(op_index.item())
        op_log_prob = op_distribution.log_prob(op_index)
        log_probs.append(op_log_prob.view(-1))
        op_entropy = op_distribution.entropy()
        entropys.append(op_entropy.view(-1))
        # obtain the input embedding for the next step
        inputs = self.w_embd(op_index)
    return torch.sum(torch.cat(log_probs)), torch.sum(
        torch.cat(entropys)), self.convert_structure(sampled_arch)
# dist = LogitRelaxedBernoulli(torch.Tensor([1.]), bern_param) # dist_bernoulli = Bernoulli(bern_param) C= 2 n_components = C B=1 probs = torch.ones(B,C) bern_param = bern_param.view(B,1) aa = 1 - bern_param probs = torch.cat([aa, bern_param], dim=1) cat = Categorical(probs= probs) grads = [] for i in range(n): b = cat.sample() logprob = cat.log_prob(b.detach()) # b_ = torch.argmax(z, dim=1) logprobgrad = torch.autograd.grad(outputs=logprob, inputs=(bern_param), retain_graph=True)[0] grad = f(b) * logprobgrad grads.append(grad[0][0].data.numpy()) print ('Grad Estimator: Reinfoce categorical') print ('Grad mean', np.mean(grads)) print ('Grad std', np.std(grads)) print () reinforce_cat_grad_means.append(np.mean(grads)) reinforce_cat_grad_stds.append(np.std(grads))
def HLAX(surrogate, surrogate2, x, logits, mixtureweights, k=1):
    """Hybrid LAX estimator: interpolate (via learned alpha in [0,1])
    between a relaxed (LAX-style, pathwise + control variate) estimator and
    a plain REINFORCE estimator on the hard sample.

    surrogate predicts the control variate c(z); surrogate2 predicts the
    mixing weight alpha = sigmoid(net(x)). Depends on module-level `H`
    (hard threshold of a relaxed sample) and `logprob_undercomponent`.

    Returns (net_loss, f_b, logpx_given_z, logpz, logq_b, surr_loss,
    surr_dif, grad_path, grad_score, mean alpha).
    """
    B = logits.shape[0]
    probs = torch.softmax(logits, dim=1)
    cat = RelaxedOneHotCategorical(probs=probs, temperature=torch.tensor([1.]).cuda())
    cat_bernoulli = Categorical(probs=probs)
    net_loss = 0
    surr_loss = 0
    for jj in range(k):
        # Relaxed sample and its hard projection.
        cluster_S = cat.rsample()
        cluster_H = H(cluster_S)
        logq_z = cat.log_prob(cluster_S.detach()).view(B,1)
        logq_b = cat_bernoulli.log_prob(cluster_H.detach()).view(B,1)
        logpx_given_z = logprob_undercomponent(x, component=cluster_H)
        logpz = torch.log(mixtureweights[cluster_H]).view(B,1)
        logpxz = logpx_given_z + logpz #[B,1]
        f_z = logpxz - logq_z - 1.
        f_b = logpxz - logq_b - 1.
        surr_input = torch.cat([cluster_S, x], dim=1) #[B,21]
        # surr_pred, alpha = surrogate.net(surr_input)
        surr_pred = surrogate.net(surr_input)
        alpha = torch.sigmoid(surrogate2.net(x))
        # alpha-weighted mix: relaxed score term + pathwise surrogate term
        # vs. plain REINFORCE on the hard sample.
        net_loss += - torch.mean( alpha.detach()*(f_z.detach() - surr_pred.detach()) * logq_z + alpha.detach()*surr_pred + (1-alpha.detach())*(f_b.detach() ) * logq_b)
        # surr_loss += torch.mean(torch.abs(f_z.detach() - surr_pred))
        # Per-logit gradient magnitudes used in the variance-style surrogate
        # objective below.
        grad_logq_z = torch.mean( torch.autograd.grad([torch.mean(logq_z)], [logits], create_graph=True, retain_graph=True)[0], dim=1, keepdim=True)
        grad_logq_b = torch.mean( torch.autograd.grad([torch.mean(logq_b)], [logits], create_graph=True, retain_graph=True)[0], dim=1, keepdim=True)
        grad_surr = torch.mean( torch.autograd.grad([torch.mean(surr_pred)], [logits], create_graph=True, retain_graph=True)[0], dim=1, keepdim=True)
        # print (alpha.shape, f_z.shape, surr_pred.shape, grad_logq_z.shape, grad_surr.shape)
        # grad_surr = torch.autograd.grad([surr_pred[0]], [logits], create_graph=True, retain_graph=True)[0]
        # print (grad_surr)
        surr_loss += torch.mean( (alpha*(f_z.detach() - surr_pred) * grad_logq_z + alpha*grad_surr + (1-alpha)*(f_b.detach()) * grad_logq_b )**2 )
        surr_dif = torch.mean(torch.abs(f_z.detach() - surr_pred))
        # gradd = torch.autograd.grad([surr_loss], [alpha], create_graph=True, retain_graph=True)[0]
        # print (gradd)
    # Diagnostics: magnitudes of the pathwise vs. score-function gradient
    # components (computed from the last Monte Carlo sample).
    grad_path = torch.autograd.grad([torch.mean(surr_pred)], [logits], create_graph=True, retain_graph=True)[0]
    grad_score = torch.autograd.grad([torch.mean((f_z.detach() - surr_pred.detach()) * logq_z)], [logits], create_graph=True, retain_graph=True)[0]
    grad_path = torch.mean(torch.abs(grad_path))
    grad_score = torch.mean(torch.abs(grad_score))
    net_loss = net_loss/ k
    surr_loss = surr_loss/ k
    return net_loss, f_b, logpx_given_z, logpz, logq_b, surr_loss, surr_dif, grad_path, grad_score, torch.mean(alpha)
def sample_reinforce_given_class(logits, samp):
    """Return the categorical log-probability of ``samp`` under ``logits``."""
    return Categorical(logits=logits).log_prob(samp)
def get_action(self, x, action=None):
    """Run the actor head and return (action, log_prob, entropy).

    If ``action`` is None a fresh action is sampled from the categorical
    policy; otherwise the provided action is scored under it.
    """
    dist = Categorical(logits=self.actor(self.forward(x)))
    chosen = dist.sample() if action is None else action
    return chosen, dist.log_prob(chosen), dist.entropy()
if __name__ == "__main__": for episode in range(NUM_EPISODES): s, done = env.reset(), False states, rewards, log_probs = [], [], [] while not done: s = torch.from_numpy(s).float() p = Categorical(actor(s)) a = p.sample() with torch.no_grad(): succ, r, done, _ = env.step(a.numpy()) states.append(s) rewards.append(r) log_probs.append(p.log_prob(a)) s = succ discounted_rewards = [DISCOUNT**t * r for t, r in enumerate(rewards)] cumulative_returns = [ G(discounted_rewards, t) for t, _ in enumerate(discounted_rewards) ] states = torch.stack(states) state_values = critic(states).reshape(-1) cumulative_returns = tensor(cumulative_returns) Adv = cumulative_returns - state_values log_probs = torch.stack(log_probs).reshape(-1)
def run(self, episodes, steps, train=False, render_once=1e10, saveonce=10):
    """Run the policy in the environment for ``episodes`` episodes of
    ``steps`` steps each; optionally train and periodically save/plot.

    Args:
        episodes: number of episodes to run.
        steps: steps per episode.
        train: if True, store rewards/log-probs and update via self.trainer.
        render_once: render every ``render_once``-th episode when training.
        saveonce: save/plot recorder state every ``saveonce``-th episode.
    """
    if train:
        assert self.recorder.log_message is not None, "log_message is necessary during training, Instantiate Runner with log message"
    reset_model = False
    # Recurrent ("mem") models need their hidden state reset per episode.
    if hasattr(self.model, "type") and self.model.type == "mem":
        print("Recurrent Model")
        reset_model = True
    self.env.display_neural_image = self.visual_activations
    for _ in range(episodes):
        self.env.reset()
        # Draw every episode during evaluation, only periodically in training.
        self.env.enable_draw = True if not train or _ % render_once == render_once - 1 else False
        if reset_model:
            self.model.reset()
        state = self.env.get_state().reshape(-1)
        bar = tqdm(range(steps), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}')
        trewards = 0
        for step in bar:
            state = T.from_numpy(state).float()
            actions = self.model(state)          # presumably action probabilities — confirm
            c = Categorical(actions)
            action = c.sample()
            log_prob = c.log_prob(action)
            # One-hot action vector handed to the environment.
            u = np.zeros(self.nactions)
            u[action] = 1.0
            newstate, reward = self.env.act(u)
            state = newstate.reshape(-1)
            trewards += reward
            if train:
                self.trainer.store_records(reward, log_prob)
            if self.visual_activations:
                # Flush collected layer activations into the env visualizer.
                u = T.cat(self.activations, dim=0).reshape(-1)
                self.env.neural_image_values = u.detach().numpy()
                self.activations = []
                # NOTE(review): `step / steps == 0` is only true at step 0 —
                # probably `step % steps == 0` (same effect here) or a typo; confirm.
                if _ % 10 == 0 and step / steps == 0:
                    self.update_weights()
                    self.env.neural_weights = self.weights
                    self.env.weight_change = True
                if type(self.model.hidden_vectors) != type(None):
                    self.env.hidden_state = self.model.hidden_vectors
            bar.set_description(f"Episode: {_:4} Rewards : {trewards}")
            if train:
                self.env.step()
            else:
                self.env.step(speed=0)
        if train:
            # End-of-episode policy update and bookkeeping.
            self.trainer.update()
            self.trainer.clear_memory()
            self.recorder.newdata(trewards)
            if _ % saveonce == saveonce - 1:
                self.recorder.save()
                self.recorder.plot()
            # Checkpoint the model whenever the periodic save coincides with
            # a new best final reward.
            if _ % saveonce == saveonce - 1 and self.recorder.final_reward >= self.current_max_reward:
                self.recorder.save_model(self.model)
                self.current_max_reward = self.recorder.final_reward
    print("******* Run Complete *******")
# Score-function (REINFORCE) training loop over a per-sample batch.
# Relies on module-level n_steps, batch_size, optim, encoder, sample_true,
# logprob_undercomponent, needsoftmax_mixtureweight — not visible here.
steps_list = []
for step in range(n_steps):
    optim.zero_grad()
    loss = 0
    net_loss = 0
    for i in range(batch_size):
        x = sample_true()
        logits = encoder.net(x)
        # dim=0: logits are presumably a 1-D vector per sample — confirm.
        cat = Categorical(probs= torch.softmax(logits, dim=0))
        cluster = cat.sample()
        logprob_cluster = cat.log_prob(cluster.detach())
        pxz = logprob_undercomponent(x, component=cluster, needsoftmax_mixtureweight=needsoftmax_mixtureweight, cuda=False)
        f = pxz - logprob_cluster   # per-sample ELBO term
        # REINFORCE objective for the encoder (f detached, score function kept).
        net_loss += -f.detach() * logprob_cluster
        loss += -f
    loss = loss / batch_size
    net_loss = net_loss / batch_size
    # NOTE(review): only `loss` is backpropagated; `net_loss` is computed but
    # never used for an update in this chunk — confirm whether intentional.
    loss.backward(retain_graph=True)
    optim.step()
def simplax():
    """Train a GMM encoder with a SIMPLAX-style estimator, with periodic
    plotting/diagnostics.

    Relies on many module-level names not visible in this chunk:
    sample_true, encoder-class NN3, H, check_nan, logprob_undercomponent,
    true_posterior, L2_mixtureweights, KL_mixutreweights, compute_kl_batch,
    copmute_logpx, plot_curve, exp_dir, n_components, true_mixture_weights.
    Requires CUDA.

    NOTE(review): source formatting was mangled; the nesting of the logging/
    plotting sections below is reconstructed and should be confirmed against
    the original.
    """

    def show_surr_preds():
        # Plot surrogate predictions c(z, x) against the true f over a grid
        # of x values, for a few sampled (z, component) pairs.
        batch_size = 1
        rows = 3
        cols = 1
        fig = plt.figure(figsize=(10+cols,4+rows), facecolor='white')
        for i in range(rows):
            x = sample_true(1).cuda()
            logits = encoder.net(x)
            probs = torch.softmax(logits, dim=1)
            cat = RelaxedOneHotCategorical(probs=probs, temperature=torch.tensor([1.]).cuda())
            cluster_S = cat.rsample()
            cluster_H = H(cluster_S)
            logprob_cluster = cat.log_prob(cluster_S.detach()).view(batch_size,1)
            check_nan(logprob_cluster)
            z = cluster_S
            n_evals = 40
            x1 = np.linspace(-9,205, n_evals)
            x = torch.from_numpy(x1).view(n_evals,1).float().cuda()
            # Broadcast the single sampled z / component across the x grid.
            z = z.repeat(n_evals,1)
            cluster_H = cluster_H.repeat(n_evals,1)
            xz = torch.cat([z,x], dim=1)
            logpxz = logprob_undercomponent(x, component=cluster_H, needsoftmax_mixtureweight=needsoftmax_mixtureweight, cuda=True)
            f = logpxz
            surr_pred = surrogate.net(xz)
            surr_pred = surr_pred.data.cpu().numpy()
            f = f.data.cpu().numpy()
            col =0
            row = i
            ax = plt.subplot2grid((rows,cols), (row,col), frameon=False, colspan=1, rowspan=1)
            ax.plot(x1,surr_pred, label='Surr')
            ax.plot(x1,f, label='f')
            ax.set_title(str(cluster_H[0]))
            ax.legend()
        plt_path = exp_dir+'gmm_surr.png'
        plt.savefig(plt_path)
        print ('saved training plot', plt_path)
        plt.close()

    def plot_dist():
        # Plot each mixture component's density and their sum over a grid.
        mixture_weights = torch.softmax(needsoftmax_mixtureweight, dim=0)
        rows = 1
        cols = 1
        fig = plt.figure(figsize=(10+cols,4+rows), facecolor='white')
        col =0
        row = 0
        ax = plt.subplot2grid((rows,cols), (row,col), frameon=False, colspan=1, rowspan=1)
        xs = np.linspace(-9,205, 300)
        sum_ = np.zeros(len(xs))
        for c in range(n_components):
            # Component c is Normal(mean=10c, std=5).
            m = Normal(torch.tensor([c*10.]).float(), torch.tensor([5.0]).float())
            ys = []
            for x in xs:
                component_i = (torch.exp(m.log_prob(x) )* mixture_weights[c]).detach().cpu().numpy()
                ys.append(component_i)
            ys = np.reshape(np.array(ys), [-1])
            sum_ += ys
            ax.plot(xs, ys, label='')
        ax.plot(xs, sum_, label='')
        plt_path = exp_dir+'gmm_plot_dist.png'
        plt.savefig(plt_path)
        print ('saved training plot', plt_path)
        plt.close()

    def get_loss():
        # One surrogate-training step's loss: |logp(x,z) - c(z,x)| on a
        # fresh relaxed sample.
        x = sample_true(batch_size).cuda()
        logits = encoder.net(x)
        probs = torch.softmax(logits, dim=1)
        cat = RelaxedOneHotCategorical(probs=probs, temperature=torch.tensor([temp]).cuda())
        cluster_S = cat.rsample()
        cluster_H = H(cluster_S)
        logprob_cluster = cat.log_prob(cluster_S.detach()).view(batch_size,1)
        check_nan(logprob_cluster)
        logpxz = logprob_undercomponent(x, component=cluster_H, needsoftmax_mixtureweight=needsoftmax_mixtureweight, cuda=True)
        f = logpxz - logprob_cluster
        surr_input = torch.cat([cluster_S, x], dim=1) #[B,21]
        surr_pred = surrogate.net(surr_input)
        surr_loss = torch.mean(torch.abs(logpxz.detach()-surr_pred))
        return surr_loss

    def plot_posteriors(needsoftmax_mixtureweight, name=''):
        # Bar plot comparing the true posterior over components with the
        # encoder's q(z|x) for a single sampled x.
        x = sample_true(1).cuda()
        trueposterior = true_posterior(x, needsoftmax_mixtureweight).view(n_components)
        logits = encoder.net(x)
        probs = torch.softmax(logits, dim=1).view(n_components)
        trueposterior = trueposterior.data.cpu().numpy()
        qz = probs.data.cpu().numpy()
        error = L2_mixtureweights(trueposterior,qz)
        kl = KL_mixutreweights(p=trueposterior, q=qz)
        rows = 1
        cols = 1
        fig = plt.figure(figsize=(8+cols,8+rows), facecolor='white')
        col =0
        row = 0
        ax = plt.subplot2grid((rows,cols), (row,col), frameon=False, colspan=1, rowspan=1)
        width = .3
        ax.bar(range(len(qz)), trueposterior, width=width, label='True')
        ax.bar(np.array(range(len(qz)))+width, qz, width=width, label='q')
        ax.legend()
        ax.grid(True, alpha=.3)
        ax.set_title(str(error) + ' kl:' + str(kl))
        ax.set_ylim(0.,1.)
        plt_path = exp_dir+'posteriors'+name+'.png'
        plt.savefig(plt_path)
        print ('saved training plot', plt_path)
        plt.close()

    def inference_error(needsoftmax_mixtureweight):
        # Average L2 and KL between true posterior and q(z|x) over n draws.
        error_sum = 0
        kl_sum = 0
        n=10
        for i in range(n):
            x = sample_true(1).cuda()
            trueposterior = true_posterior(x, needsoftmax_mixtureweight).view(n_components)
            logits = encoder.net(x)
            probs = torch.softmax(logits, dim=1).view(n_components)
            error = L2_mixtureweights(trueposterior.data.cpu().numpy(),probs.data.cpu().numpy())
            kl = KL_mixutreweights(trueposterior.data.cpu().numpy(), probs.data.cpu().numpy())
            error_sum+=error
            kl_sum += kl
        return error_sum/n, kl_sum/n

    #SIMPLAX
    needsoftmax_mixtureweight = torch.randn(n_components, requires_grad=True, device="cuda")
    print ('current mixuture weights')
    print (torch.softmax(needsoftmax_mixtureweight, dim=0))
    print()
    encoder = NN3(input_size=1, output_size=n_components).cuda()
    surrogate = NN3(input_size=1+n_components, output_size=1).cuda()
    # Only the encoder is optimized in this configuration; the mixture-weight
    # and surrogate optimizers are disabled.
    optim_net = torch.optim.SGD(encoder.parameters(), lr=.0001)
    temp = 1.
    batch_size = 100
    n_steps = 300000
    surrugate_steps = 0   # 0 => the surrogate pre-training loop below is skipped
    k = 1
    # Diagnostic histories collected during training.
    L2_losses = []
    inf_losses = []
    inf_losses_kl = []
    kl_losses_2 = []
    surr_losses = []
    steps_list =[]
    grad_reparam_list =[]
    grad_reinforce_list =[]
    f_list = []
    logpxz_list = []
    logprob_cluster_list = []
    logpx_list = []
    for step in range(n_steps):
        # Optional surrogate warm-up steps (disabled: surrugate_steps == 0).
        # NOTE(review): optim_surr is not defined in this configuration — this
        # inner loop would raise NameError if surrugate_steps > 0.
        for ii in range(surrugate_steps):
            surr_loss = get_loss()
            optim_surr.zero_grad()
            surr_loss.backward()
            optim_surr.step()
        x = sample_true(batch_size).cuda()
        logits = encoder.net(x)
        probs = torch.softmax(logits, dim=1)
        cat = Categorical(probs=probs)
        net_loss = 0
        loss = 0
        surr_loss = 0
        for jj in range(k):
            cluster_H = cat.sample()
            logprob_cluster = cat.log_prob(cluster_H.detach()).view(batch_size,1)
            check_nan(logprob_cluster)
            logpxz = logprob_undercomponent(x, component=cluster_H, needsoftmax_mixtureweight=needsoftmax_mixtureweight, cuda=True)
            f = logpxz - logprob_cluster
            surr_input = torch.cat([probs, x], dim=1) #[B,21]
            surr_pred = surrogate.net(surr_input)
            # REINFORCE with a constant baseline of 1 (surrogate variants
            # were tried and disabled).
            net_loss += - torch.mean((logpxz.detach() - 1.) * logprob_cluster)
            loss += - torch.mean(logpxz)
            surr_loss += torch.mean(torch.abs(logpxz.detach()-surr_pred))
        net_loss = net_loss/ k
        loss = loss / k
        surr_loss = surr_loss/ k
        # Only the encoder is updated; `loss` and `surr_loss` are diagnostics.
        optim_net.zero_grad()
        net_loss.backward(retain_graph=True)
        optim_net.step()
        if step%500==0:
            print (step, 'f:', torch.mean(f).cpu().data.numpy(), 'surr_loss:', surr_loss.cpu().data.detach().numpy(), 'theta dif:', L2_mixtureweights(true_mixture_weights,torch.softmax( needsoftmax_mixtureweight, dim=0).cpu().data.detach().numpy()))
            if step > 0:
                # Record diagnostics for plotting.
                L2_losses.append(L2_mixtureweights(true_mixture_weights,torch.softmax( needsoftmax_mixtureweight, dim=0).cpu().data.detach().numpy()))
                steps_list.append(step)
                surr_losses.append(surr_loss.cpu().data.detach().numpy())
                inf_error, kl_error = inference_error(needsoftmax_mixtureweight)
                inf_losses.append(inf_error)
                inf_losses_kl.append(kl_error)
                kl_batch = compute_kl_batch(x,probs,needsoftmax_mixtureweight)
                kl_losses_2.append(kl_batch)
                logpx = copmute_logpx(x, needsoftmax_mixtureweight)
                logpx_list.append(logpx)
                f_list.append(torch.mean(f).cpu().data.detach().numpy())
                logpxz_list.append(torch.mean(logpxz).cpu().data.detach().numpy())
                logprob_cluster_list.append(torch.mean(logprob_cluster).cpu().data.detach().numpy())
                # NOTE(review): inf_losses was just appended to, so this
                # condition is always true and the else branch is dead.
                if len(inf_losses) > 0:
                    print ('probs', probs[0])
                    print('logpxz', logpxz[0])
                    print('pred', surr_pred[0])
                    print ('dif', logpxz.detach()[0]-surr_pred.detach()[0])
                    print ('logq', logprob_cluster[0])
                    print ('dif*logq', (logpxz.detach()[0]-surr_pred.detach()[0])*logprob_cluster[0])
                    # Compare magnitudes of the REINFORCE-style and
                    # reparameterization-style gradients w.r.t. probs.
                    output= torch.mean((logpxz.detach()-surr_pred.detach()) * logprob_cluster, dim=0)[0]
                    output2 = torch.mean(surr_pred, dim=0)[0]
                    output3 = torch.mean(logprob_cluster, dim=0)[0]
                    grad_reinforce = torch.autograd.grad(outputs=output, inputs=(probs), retain_graph=True)[0]
                    grad_reparam = torch.autograd.grad(outputs=output2, inputs=(probs), retain_graph=True)[0]
                    grad3 = torch.autograd.grad(outputs=output3, inputs=(probs), retain_graph=True)[0]
                    grad_reinforce = torch.mean(torch.abs(grad_reinforce))
                    grad_reparam = torch.mean(torch.abs(grad_reparam))
                    grad3 = torch.mean(torch.abs(grad3))
                    grad_reparam_list.append(grad_reparam.cpu().data.detach().numpy())
                    grad_reinforce_list.append(grad_reinforce.cpu().data.detach().numpy())
                    print ('reparam:', grad_reparam.cpu().data.detach().numpy())
                    print ('reinforce:', grad_reinforce.cpu().data.detach().numpy())
                    print ('logqz grad:', grad3.cpu().data.detach().numpy())
                    print ('current mixuture weights')
                    print (torch.softmax(needsoftmax_mixtureweight, dim=0))
                    print()
                else:
                    grad_reparam_list.append(0.)
                    grad_reinforce_list.append(0.)
            # Periodic plotting once enough history has accumulated.
            if len(surr_losses) > 3 and step %1000==0:
                plot_curve(steps=steps_list, thetaloss=L2_losses, infloss=inf_losses, surrloss=surr_losses, grad_reinforce_list=grad_reinforce_list, grad_reparam_list=grad_reparam_list, f_list=f_list, logpxz_list=logpxz_list, logprob_cluster_list=logprob_cluster_list, inf_losses_kl=inf_losses_kl, kl_losses_2=kl_losses_2, logpx_list=logpx_list)
                plot_posteriors(needsoftmax_mixtureweight)
                plot_dist()
                show_surr_preds()
            # Persist the loss curves.
            # NOTE(review): `as f` shadows the ELBO tensor `f` used above.
            data_dict = {}
            data_dict['steps'] = steps_list
            data_dict['losses'] = L2_losses
            with open( exp_dir+"data_simplax.p", "wb" ) as f:
                pickle.dump(data_dict, f)
            print ('saved data')
def forward(self,
            prev_tokens: Dict[str, torch.LongTensor],
            prev_tags: Dict[str, torch.LongTensor],
            fol_tokens: Dict[str, torch.LongTensor],
            fol_tags: Dict[str, torch.LongTensor],
            prev_labels: torch.Tensor = None,
            fol_labels: torch.Tensor = None,
            conflicts: List[Any] = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    """Forward pass of a follow-up-utterance restatement model.

    Encodes the previous ("prev") and follow-up ("fol") utterances with a
    shared BiLSTM-style encoder, cross-attends them, predicts per-token
    span tags, assembles a restated utterance from the predicted spans,
    and — during training — combines a supervised tagging loss (or a
    REINFORCE sampling loss) with a margin-based conflict loss.

    Returns a dict with probs/labels/restate/max_bleu and, when training,
    "loss".
    """
    prev_mask = get_text_field_mask(prev_tokens)
    # embedding sequence
    prev_embedding_seq = self.token_field_embedding(prev_tokens)
    # embedding tag
    prev_tag_embedding = self.char_field_embedding(prev_tags)
    fol_mask = get_text_field_mask(fol_tokens)
    # embedding sequence
    fol_embedding_seq = self.token_field_embedding(fol_tokens)
    # embedding tag
    fol_tag_embedding = self.char_field_embedding(fol_tags)
    batch_size, _ = prev_mask.size()
    # initialization in specific gpu devices
    gpu_device = prev_embedding_seq.device
    # Scalar phrase-id feature appended to every token: 0 = prev, 1 = follow-up.
    prev_phrase_tensor = torch.tensor([0.0], device=gpu_device)
    fol_phrase_tensor = torch.tensor([1.0], device=gpu_device)
    prev_phrase_embedding_seq = prev_phrase_tensor.repeat(
        prev_embedding_seq.size(0), prev_embedding_seq.size(1), 1
    )
    fol_phrase_embedding_seq = fol_phrase_tensor.repeat(
        fol_embedding_seq.size(0), fol_embedding_seq.size(1), 1
    )
    # concat embedding and phrase
    prev_embedding_seq = torch.cat([prev_embedding_seq, prev_phrase_embedding_seq, prev_tag_embedding], dim=2)
    fol_embedding_seq = torch.cat([fol_embedding_seq, fol_phrase_embedding_seq, fol_tag_embedding], dim=2)
    prev_embedding_seq = self.projection_layer(prev_embedding_seq)
    fol_embedding_seq = self.projection_layer(fol_embedding_seq)
    # embedding phrase label 0 means prev, 1 means follow-up
    if self.training:
        # Apply variational dropout jointly over the concatenated sequences
        # so both halves share the same dropout mask behavior, then split back.
        embedding = torch.cat([prev_embedding_seq, fol_embedding_seq], dim=1)
        embedding_var = self._variational_dropout(embedding)
        prev_mask_len = prev_mask.size(1)
        prev_embedding_seq_var = embedding_var[:, :prev_mask_len]
        fol_embedding_seq_var = embedding_var[:, prev_mask_len:]
    else:
        prev_embedding_seq_var = prev_embedding_seq
        fol_embedding_seq_var = fol_embedding_seq
    # encode sequence
    prev_encoder_out = self.tokens_encoder(prev_embedding_seq_var, prev_mask)
    fol_encoder_out = self.tokens_encoder(fol_embedding_seq_var, fol_mask)
    # Split the bidirectional encoder output into forward/backward halves.
    prev_forward_output = prev_encoder_out[:, :, :self.hidden_size]
    prev_backward_output = prev_encoder_out[:, :, self.hidden_size:]
    fol_forward_output = fol_encoder_out[:, :, :self.hidden_size]
    fol_backward_output = fol_encoder_out[:, :, self.hidden_size:]
    # Scaled cross-attention prev -> fol, masked by valid-token outer product.
    prev_attn_mask = prev_mask.view(batch_size, -1, 1) * fol_mask.view(batch_size, 1, -1)
    prev_forward_attn_matrix = self._self_attention(prev_forward_output, fol_forward_output) / self._scaled_value
    prev_backward_attn_matrix = self._self_attention(prev_backward_output, fol_backward_output) / self._scaled_value
    prev_mean_pooling_attn = util.masked_softmax(prev_forward_attn_matrix + prev_backward_attn_matrix, prev_attn_mask)
    prev_attn_vec = torch.matmul(prev_mean_pooling_attn, fol_encoder_out)
    # Symmetric cross-attention fol -> prev.
    fol_attn_mask = fol_mask.view(batch_size, -1, 1) * prev_mask.view(batch_size, 1, -1)
    fol_forward_attn_matrix = self._self_attention(fol_forward_output, prev_forward_output) / self._scaled_value
    fol_backward_attn_matrix = self._self_attention(fol_backward_output, prev_backward_output) / self._scaled_value
    fol_mean_pooling_attn = util.masked_softmax(fol_forward_attn_matrix + fol_backward_attn_matrix, fol_attn_mask)
    fol_attn_vec = torch.matmul(fol_mean_pooling_attn, prev_encoder_out)
    # Elementwise fuse encoder output with its attention vector, plus a
    # right-shifted copy so each position also sees its left neighbor's fusion.
    prev_attn_multiply = prev_encoder_out * prev_attn_vec
    zero_tensor = torch.zeros((batch_size, 1, prev_attn_multiply.size(2)), device=gpu_device, dtype=torch.float)
    prev_attn_shift = torch.cat((zero_tensor, prev_attn_multiply[:, :-1, :]), dim=1)
    # shift attn vector to right, and then subtract them
    prev_linear = torch.cat([prev_encoder_out, prev_attn_multiply, prev_attn_shift], dim=2)
    fol_attn_multiply = fol_encoder_out * fol_attn_vec
    fol_attn_shift = torch.cat((zero_tensor, fol_attn_multiply[:, :-1, :]), dim=1)
    # shift attn vector to right, and then subtract them
    fol_linear = torch.cat([fol_encoder_out, fol_attn_multiply, fol_attn_shift], dim=2)
    prev_tag_logistics = self.policy_net(prev_linear)
    fol_tag_logistics = self.policy_net(fol_linear)
    # project to space
    prev_tag_prob = F.softmax(prev_tag_logistics, dim=2)
    prev_predict_labels = torch.argmax(prev_tag_prob, dim=2)
    fol_tag_prob = F.softmax(fol_tag_logistics, dim=2)
    fol_predict_labels = torch.argmax(fol_tag_prob, dim=2)
    predict_restate_str_list = []
    predict_restate_tag_list = []
    max_bleu_list = []
    # debug information
    _debug_batch_conflict_map = {}
    # using predict labels to cut utterance into span and fetch representations of span
    for batch_ind in range(batch_size):
        _debug_batch_conflict_map[batch_ind] = []
        # batch reference object
        batch_origin_obj = metadata[batch_ind]["origin_obj"]
        prev_start_end, fol_start_end = predict_span_start_end(
            prev_predict_labels[batch_ind, :sum(prev_mask[batch_ind])],
            fol_predict_labels[batch_ind, :sum(fol_mask[batch_ind])])
        # Phase 2: Predict actual fusion str via span start/end and similar gate
        predict_restate_str, predict_restate_tag \
            = self.predict_restate(batch_origin_obj, fol_start_end, prev_start_end,
                                   prev_forward_output, prev_backward_output,
                                   fol_forward_output, fol_backward_output,
                                   batch_ind, gpu_device, _debug_batch_conflict_map)
        # add it to batch
        predict_restate_str_list.append(predict_restate_str)
        predict_restate_tag_list.append(predict_restate_tag)
    batch_golden_restate_str = [" ".join(single_metadata["origin_obj"]["restate"].utterance) for single_metadata in metadata]
    batch_golden_restate_tag = [single_metadata["origin_obj"]["restate"].tags for single_metadata in metadata]
    output = {
        "probs": prev_tag_prob,
        "prev_labels": prev_predict_labels,
        "fol_labels": fol_predict_labels,
        "restate": predict_restate_str_list,
        "max_bleu": max_bleu_list
    }
    avg_bleu = self.metrics["bleu"](predict_restate_str_list, batch_golden_restate_str)
    avg_symbol = self.metrics["symbol"](predict_restate_tag_list, batch_golden_restate_tag)
    # overall measure
    self.metrics["overall"]([0.4 * avg_bleu + 0.6 * avg_symbol] * batch_size)
    conflict_confidences = []
    # condition on training to
    if self.training:
        if prev_labels is not None:
            # Supervised path: cross-entropy against provided tags.
            labels = torch.cat([prev_labels, fol_labels], dim=1)
            # Initialization pre-training with longest common string
            logistics = torch.cat([prev_tag_logistics, fol_tag_logistics], dim=1)
            mask = torch.cat([prev_mask, fol_mask], dim=1)
            loss_snippet = sequence_cross_entropy_with_logits(logistics, labels, mask, label_smoothing=0.2)
            # for pre-training, we regard them as optimal ground truth
            conflict_confidences = [1.0] * batch_size
        else:
            # REINFORCE path: sample tag sequences and reward them.
            if DEBUG:
                rl_sample_count = 1
            else:
                rl_sample_count = 20
            batch_loss_snippet = []
            batch_sample_conflicts = []
            # Training Phase 2: train conflict model via margin loss
            for batch_ind in range(batch_size):
                dynamic_conflicts = []
                dynamic_confidence = []
                # batch reference object
                batch_origin_obj = metadata[batch_ind]["origin_obj"]
                prev_mask_len = prev_mask[batch_ind].sum().view(1).data.cpu().numpy()[0]
                fol_mask_len = fol_mask[batch_ind].sum().view(1).data.cpu().numpy()[0]
                sample_data = []
                for _ in range(rl_sample_count):
                    prev_multi = Categorical(logits=prev_tag_logistics[batch_ind])
                    fol_multi = Categorical(logits=fol_tag_logistics[batch_ind])
                    prev_label_tensor = prev_multi.sample()
                    # Force the first token's tag to 1 (presumably a span-start
                    # convention — confirm).
                    prev_label_tensor.data[0].fill_(1)
                    prev_sample_label = prev_label_tensor.data.cpu().numpy().astype(int)[:prev_mask_len]
                    fol_label_tensor = fol_multi.sample()
                    fol_label_tensor.data[0].fill_(1)
                    fol_sample_label = fol_label_tensor.data.cpu().numpy().astype(int)[:fol_mask_len]
                    log_prob = torch.cat(
                        [prev_multi.log_prob(prev_label_tensor), fol_multi.log_prob(fol_label_tensor)], dim=-1)
                    conflict_prob_mat = self.calculate_conflict_prob_matrix(prev_sample_label, fol_sample_label,
                                                                            batch_ind,
                                                                            prev_forward_output, prev_backward_output,
                                                                            fol_forward_output, fol_backward_output,
                                                                            gpu_device)
                    self.policy_net.saved_log_probs.append(log_prob)
                    sample_data.append((prev_sample_label, fol_sample_label, batch_origin_obj, conflict_prob_mat))
                if DEBUG:
                    ret_data = [sample_action(row) for row in sample_data]
                else:
                    # Parallel to speed up the sampling process
                    with ThreadPool(4) as p:
                        chunk_size = rl_sample_count // 4
                        ret_data = p.map(sample_action, sample_data, chunksize=chunk_size)
                for conflict_confidence, reinforce_reward, conflict_pair in ret_data:
                    self.policy_net.rewards.append(reinforce_reward)
                    dynamic_conflicts.append(conflict_pair)
                    dynamic_confidence.append(conflict_confidence)
                rewards = torch.tensor(self.policy_net.rewards, device=gpu_device).float()
                self.metrics["reward"](self.policy_net.rewards)
                # Mean baseline subtraction for variance reduction.
                rewards -= rewards.mean().detach()
                self.metrics["reward_var"]([rewards.std().data.cpu().numpy()])
                loss_snippet = []
                # reward high, optimize it; reward low, reversal optimization
                for log_prob, reward in zip(self.policy_net.saved_log_probs, rewards):
                    loss_snippet.append((- log_prob * reward).unsqueeze(0))
                loss_snippet = torch.cat(loss_snippet).mean(dim=1).sum().view(1)
                batch_loss_snippet.append(loss_snippet)
                # random select one
                best_conflict_id = choice(range(rl_sample_count))
                batch_sample_conflicts.append(dynamic_conflicts[best_conflict_id])
                conflict_confidences.append(dynamic_confidence[best_conflict_id])
                self.policy_net.reset()
            loss_snippet = torch.cat(batch_loss_snippet).mean()
            # according to confidence; NOTE(review): this rebinds and overrides
            # the `conflicts` argument.
            conflicts = []
            for conflict_batch_id in range(batch_size):
                conflicts.append(batch_sample_conflicts[conflict_batch_id])
        # Training Phase 1: train snippet model
        total_loss = loss_snippet
        border = torch.tensor([0.0], device=gpu_device)
        pos_target = torch.tensor([1.0], device=gpu_device)
        neg_target = torch.tensor([-1.0], device=gpu_device)
        # Training Phase 2: train conflict model via margin loss
        loss_conflict = torch.tensor([0.0], device=gpu_device)[0]
        # random decision on which to use
        for batch_ind in range(0, batch_size):
            batch_conflict_list = conflicts[batch_ind]
            # use prediction results to conflict
            temp_loss_conflict = torch.tensor([0.0], device=gpu_device)[0]
            if batch_conflict_list and len(batch_conflict_list) > 0:
                for conflict in batch_conflict_list:
                    (prev_start, prev_end), (fol_start, fol_end), conflict_mode = conflict
                    fol_span_repr = get_span_repr(fol_forward_output[batch_ind], fol_backward_output[batch_ind],
                                                  fol_start, fol_end)
                    prev_span_repr = get_span_repr(prev_forward_output[batch_ind], prev_backward_output[batch_ind],
                                                   prev_start, prev_end)
                    inter_prob = self.cosine_similar(fol_span_repr, prev_span_repr).view(1)
                    # actual conflict
                    if conflict_mode == 1:
                        temp_loss_conflict += self.margin_loss(inter_prob, border, pos_target)
                    else:
                        temp_loss_conflict += self.margin_loss(inter_prob, border, neg_target)
                temp_confidence = conflict_confidences[batch_ind]
                loss_conflict += temp_confidence * temp_loss_conflict / len(batch_conflict_list)
        loss_conflict = loss_conflict / batch_size
        # for larger margin
        total_loss += loss_conflict
        output["loss"] = total_loss
    return output
def compute_loss_pi(states, actions, advs):
    """Policy-gradient loss: negative advantage-weighted log-likelihood
    of `actions` under the module-level policy `pi(states)`.
    """
    # `pi` returns action probabilities (equivalent to the old multinomial).
    dist = Categorical(pi(states))
    weighted_logp = dist.log_prob(actions) * advs
    return -torch.sum(weighted_logp)
def evaluate_actions(pi, actions):
    """Score `actions` under categorical probabilities `pi`.

    Returns a tuple of (log-probs with a trailing singleton dim restored,
    mean entropy of the distribution).
    """
    dist = Categorical(pi)
    log_probs = dist.log_prob(actions.squeeze(-1))
    return log_probs.unsqueeze(-1), dist.entropy().mean()
#REINFORCE print ('REINFORCE') # def sample_reinforce_given_class(logits, samp): # return logprob grads = [] for i in range (N): dist = Categorical(logits=logits) samp = dist.sample() logprob = dist.log_prob(samp) reward = f(samp) gradlogprob = torch.autograd.grad(outputs=logprob, inputs=(logits), retain_graph=True)[0] grads.append(reward*gradlogprob) print () grads = torch.stack(grads).view(N,C) # print (grads.shape) grad_mean_reinforce = torch.mean(grads,dim=0) grad_std_reinforce = torch.std(grads,dim=0) print ('REINFORCE') print ('mean:', grad_mean_reinforce) print ('std:', grad_std_reinforce) print () # print ('True')
class OneHotCategorical(Distribution):
    r"""
    Creates a one-hot categorical distribution parameterized by :attr:`probs` or
    :attr:`logits`.

    Samples are one-hot coded vectors of size ``probs.size(-1)``.

    .. note:: :attr:`probs` will be normalized to be summing to 1.

    See also: :func:`torch.distributions.Categorical` for specifications of
    :attr:`probs` and :attr:`logits`.

    Example::

        >>> m = OneHotCategorical(torch.tensor([ 0.25, 0.25, 0.25, 0.25 ]))
        >>> m.sample()  # equal probability of 0, 1, 2, 3
        tensor([ 0.,  0.,  0.,  1.])

    Args:
        probs (Tensor): event probabilities
        logits (Tensor): event log probabilities
    """
    arg_constraints = {'probs': constraints.simplex}
    support = constraints.simplex
    has_enumerate_support = True

    def __init__(self, probs=None, logits=None, validate_args=None):
        # All behavior delegates to an underlying Categorical over indices;
        # the event shape is the number of categories (last dim of probs).
        self._categorical = Categorical(probs, logits)
        batch_shape = self._categorical.batch_shape
        event_shape = self._categorical.param_shape[-1:]
        super(OneHotCategorical, self).__init__(batch_shape, event_shape, validate_args=validate_args)

    def _new(self, *args, **kwargs):
        # Delegate tensor construction to the wrapped Categorical.
        return self._categorical._new(*args, **kwargs)

    @property
    def probs(self):
        return self._categorical.probs

    @property
    def logits(self):
        return self._categorical.logits

    @property
    def mean(self):
        # Mean of a one-hot vector is the category probabilities themselves.
        return self._categorical.probs

    @property
    def variance(self):
        # Each coordinate is Bernoulli(p_i), hence variance p_i * (1 - p_i).
        return self._categorical.probs * (1 - self._categorical.probs)

    @property
    def param_shape(self):
        return self._categorical.param_shape

    def sample(self, sample_shape=torch.Size()):
        # Sample category indices, then scatter 1s into a zero tensor to
        # produce one-hot vectors of the extended shape.
        sample_shape = torch.Size(sample_shape)
        probs = self._categorical.probs
        one_hot = probs.new(self._extended_shape(sample_shape)).zero_()
        indices = self._categorical.sample(sample_shape)
        if indices.dim() < one_hot.dim():
            indices = indices.unsqueeze(-1)
        return one_hot.scatter_(-1, indices, 1)

    def log_prob(self, value):
        if self._validate_args:
            self._validate_sample(value)
        # Recover the category index from the one-hot value (argmax of last dim).
        indices = value.max(-1)[1]
        return self._categorical.log_prob(indices)

    def entropy(self):
        return self._categorical.entropy()

    def enumerate_support(self):
        # Identity matrix rows are all possible one-hot outcomes, expanded
        # across the batch shape.
        n = self.event_shape[0]
        values = self._new((n, n))
        torch.eye(n, out=values)
        values = values.view((n,) + (1,) * len(self.batch_shape) + (n,))
        return values.expand((n,) + self.batch_shape + (n,))