def evaluate(self, state, epsilon=1e-6):
    ''' generate sampled action with state as input wrt the policy network '''
    mean, log_std = self.forward(state)
    std = log_std.exp()  # no clip in evaluation; clipping affects gradient flow
    normal = Normal(0, 1)
    z = normal.sample(mean.shape).to(device)  # per-element noise; a scalar sample() would reuse one z across the batch and action dims
    action_0 = torch.tanh(mean + std * z)  # TanhNormal distribution as actions; reparameterization trick
    action = self.action_range * action_0
    # The log-likelihood here is for the TanhNormal distribution instead of only the Gaussian distribution.
    # The TanhNormal squashes the Gaussian's infinite action range into a finite one.
    # The three terms in this log-likelihood estimate are:
    # (1) the log probability of the action under the usual stochastic Gaussian policy (without tanh);
    # (2) the correction caused by tanh(), as shown in Appendix C "Enforcing Action Bounds"
    #     of https://arxiv.org/pdf/1801.01290.pdf; epsilon prevents taking the log of a non-positive value;
    # (3) the correction caused by scaling to an arbitrary action range instead of (-1, 1),
    #     which differs slightly from the original paper.
    log_prob = Normal(mean, std).log_prob(mean + std * z) \
        - torch.log(1. - action_0.pow(2) + epsilon) - np.log(self.action_range)
    # both Normal.log_prob and -log(1 - a**2) have shape (N, dim_of_action);
    # Normal.log_prob outputs one value per input feature instead of a single
    # probability, so sum across the feature dim to get one log-prob per sample,
    # or else use MultivariateNormal.
    log_prob = log_prob.sum(dim=1, keepdim=True)
    return action, log_prob, z, mean, log_std
def evaluate(self, state, epsilon=1e-6):
    '''
    generate sampled action with state as input wrt the policy network;
    deterministic evaluation provides better performance according to the original paper
    '''
    mean, log_std = self.forward(state)
    std = log_std.exp()  # no clip in evaluation; clipping affects gradient flow
    normal = Normal(0, 1)
    z = normal.sample(mean.shape).to(device)  # per-element noise
    action_0 = torch.tanh(mean + std * z)  # TanhNormal distribution as actions; reparameterization trick
    action = self.action_range * action_0
    ''' stochastic evaluation '''
    log_prob = Normal(mean, std).log_prob(mean + std * z) \
        - torch.log(1. - action_0.pow(2) + epsilon) - np.log(self.action_range)
    ''' deterministic evaluation '''
    # log_prob = Normal(mean, std).log_prob(mean) - torch.log(1. - torch.tanh(mean).pow(2) + epsilon) - np.log(self.action_range)
    '''
    both Normal.log_prob and -log(1 - a**2) have shape (N, dim_of_action);
    Normal.log_prob outputs one value per input feature instead of a single
    probability, so sum across the feature dim to get one log-prob per sample,
    or else use MultivariateNormal.
    '''
    log_prob = log_prob.sum(dim=-1, keepdim=True)
    return action, log_prob, z, mean, log_std
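# As a standalone check of the tanh change-of-variables correction used in the
# evaluate() methods above: the same squashed log-density can be computed with
# torch.distributions.TransformedDistribution and TanhTransform. A minimal
# sketch (shapes, the 1e-6 guard, and the tolerance are illustrative, not taken
# from any of the snippets):
import torch
from torch.distributions import Normal, TransformedDistribution
from torch.distributions.transforms import TanhTransform

mean, std = torch.zeros(4, 2), torch.ones(4, 2)
z = Normal(mean, std).sample().clamp(-3, 3)  # pre-squash sample, kept away from tanh saturation
a = torch.tanh(z)                            # squashed action in (-1, 1)

# manual correction, as in the snippets above
manual = Normal(mean, std).log_prob(z) - torch.log(1. - a.pow(2) + 1e-6)

# the same density via a tanh-transformed Normal
tanh_normal = TransformedDistribution(Normal(mean, std), [TanhTransform()])
reference = tanh_normal.log_prob(a)

print(torch.allclose(manual, reference, atol=1e-3))  # True, up to the 1e-6 guard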
def evaluate_action(self, state):
    ''' evaluate action within GPU graph, for gradients flowing through it '''
    state = torch.FloatTensor(state).unsqueeze(0).to(device)  # state dim: (N, dim of state)
    if DETERMINISTIC:
        action = self.forward(state)
        return action.detach().cpu().numpy()
    elif DISCRETE and not DETERMINISTIC:  # actor-critic (discrete)
        probs = self.forward(state)
        m = Categorical(probs)
        action = m.sample().to(device)
        log_prob = m.log_prob(action)
        return action.detach().cpu().numpy(), log_prob.squeeze(0), m.entropy().mean()
    elif not DISCRETE and not DETERMINISTIC:  # soft actor-critic (continuous)
        self.action_range = 30.
        self.epsilon = 1e-6
        mean, log_std = self.forward(state)
        std = log_std.exp()
        normal = Normal(0, 1)
        z = normal.sample(mean.shape).to(device)  # per-element noise
        action0 = torch.tanh(mean + std * z)  # TanhNormal distribution as actions; reparameterization trick
        action = self.action_range * action0
        log_prob = Normal(mean, std).log_prob(mean + std * z) \
            - torch.log(1. - action0.pow(2) + self.epsilon) - np.log(self.action_range)
        log_prob = log_prob.sum(dim=1, keepdim=True)
        print('mean: ', mean, 'log_std: ', log_std)
        # return action.item(), log_prob, z, mean, log_std
        return action.detach().cpu().numpy().squeeze(0), log_prob.squeeze(0), Normal(mean, std).entropy().mean()
def sample_actions_and_llhoods_for_all_skills(self, s, explore=True):
    x = s.clone().view(s.size(0), 1, s.size(1)).repeat(1, self.n_m_actions, 1)
    m, log_stdev = self(x)
    stdev = log_stdev.exp()
    if explore:
        u = m + stdev * torch.randn_like(m)
    else:
        u = m
    a = torch.tanh(u)
    if self.log_func == 'self':
        llhoods = gaussian_likelihood(u.unsqueeze(1), m.unsqueeze(2),
                                      log_stdev.unsqueeze(2), self.EPS_sigma)
    elif self.log_func == 'torch':
        llhoods = Normal(m.unsqueeze(2), stdev.unsqueeze(2)).log_prob(u.unsqueeze(1))
    if self.log_lim_method == 'clamp':
        llhoods -= torch.log(torch.clamp(1 - a.unsqueeze(1).pow(2), self.EPS_log_1_min_a2, 1.0))
    elif self.log_lim_method == 'sum':
        llhoods -= torch.log(1 - a.unsqueeze(1).pow(2) + self.EPS_log_1_min_a2)
    llhoods = llhoods.sum(3)  # .clamp(self.min_log_stdev, self.max_log_stdev)
    return a, llhoods
def step(self, image, location, recurrent_hidden):
    # image = [batch size, n channels, height, width]
    # location = [batch size, 2]
    # recurrent_hidden = [batch size, recurrent hid dim]
    glimpse_hidden = self.glimpse_network(image, location)
    # glimpse_hidden = [batch size, glimpse hid dim + location hid dim]
    recurrent_hidden = self.core_network(glimpse_hidden, recurrent_hidden)
    # recurrent_hidden = [batch size, recurrent hid dim]
    location, location_mu = self.location_network(recurrent_hidden)
    # location = [batch size, 2]
    # location_mu = [batch size, 2]
    log_location_action = Normal(location_mu, self.std).log_prob(location)
    log_location_action = log_location_action.sum(dim=1)
    # log_location_action = [batch size]
    baseline = self.baseline_network(recurrent_hidden)
    return recurrent_hidden, log_location_action, baseline, location, location_mu
def evaluate(self, state, smooth_policy, device=torch.device("cpu"), epsilon=1e-6):
    mean, log_std = self.forward(state)
    normal = Normal(torch.zeros(mean.shape), torch.ones(log_std.shape))
    z = normal.sample().to(device)
    std = log_std.exp()
    if self.args.stochastic_actor:
        z = torch.clamp(z, -3, 3)
        action_0 = mean + torch.mul(z, std)
        action_1 = torch.tanh(action_0)
        action = torch.mul(self.action_range.to(device), action_1) + self.action_bias.to(device)
        log_prob = Normal(mean, std).log_prob(action_0) \
            - torch.log(1. - action_1.pow(2) + epsilon) \
            - torch.log(self.action_range.to(device))
        log_prob = log_prob.sum(dim=-1, keepdim=True)
        return action, log_prob, std.detach()
    else:
        action_mean = torch.mul(self.action_range.to(device), torch.tanh(mean)) + self.action_bias.to(device)
        smooth_random = torch.clamp(0.2 * z, -0.5, 0.5)
        action_random = action_mean + smooth_random
        action_random = torch.min(action_random, self.action_high.to(device))
        action_random = torch.max(action_random, self.action_low.to(device))
        action = action_random if smooth_policy else action_mean
        return action, 0 * log_std.sum(dim=-1, keepdim=True), std.detach()
def get_action(self, state, deterministic, epsilon=1e-6):
    mean, log_std = self.forward(state)
    normal = Normal(torch.zeros(mean.shape), torch.ones(log_std.shape))
    z = normal.sample()
    if self.args.stochastic_actor:
        std = log_std.exp()
        action_0 = mean + torch.mul(z, std)
        action_1 = torch.tanh(action_0)
        action = torch.mul(self.action_range, action_1) + self.action_bias
        log_prob = Normal(mean, std).log_prob(action_0) \
            - torch.log(1. - action_1.pow(2) + epsilon) \
            - torch.log(self.action_range)
        log_prob = log_prob.sum(dim=-1, keepdim=True)
        action_mean = torch.mul(self.action_range, torch.tanh(mean)) + self.action_bias
        action = action_mean.detach().cpu().numpy() if deterministic else action.detach().cpu().numpy()
        return action, log_prob.detach().item()
    else:
        action_mean = torch.mul(self.action_range, torch.tanh(mean)) + self.action_bias
        action = action_mean + 0.1 * torch.mul(self.action_range, z)
        action = torch.min(action, self.action_high)
        action = torch.max(action, self.action_low)
        action = action_mean.detach().cpu().numpy() if deterministic else action.detach().cpu().numpy()
        return action, 0
def evaluate(self, state, deterministic, eval_noise_scale, epsilon=1e-6):
    ''' generate action with state as input wrt the policy network, for calculating gradients '''
    mean, log_std = self.forward(state)
    std = log_std.exp()  # no clip in evaluation; clipping affects gradient flow
    normal = Normal(0, 1)
    z = normal.sample(mean.shape).to(device)  # per-element noise
    action_0 = torch.tanh(mean + std * z)  # TanhNormal distribution as actions; reparameterization trick
    action = self.action_range * mean if deterministic else self.action_range * action_0
    log_prob = Normal(mean, std).log_prob(mean + std * z) \
        - torch.log(1. - action_0.pow(2) + epsilon) - np.log(self.action_range)
    # both Normal.log_prob and -log(1 - a**2) have shape (N, dim_of_action);
    # Normal.log_prob outputs one value per input feature instead of a single
    # probability, so sum across the feature dim to get one log-prob per sample,
    # or else use MultivariateNormal.
    log_prob = log_prob.sum(dim=1, keepdim=True)
    ''' add noise '''
    eval_noise_clip = 2 * eval_noise_scale
    noise = normal.sample(action.shape) * eval_noise_scale
    noise = torch.clamp(noise, -eval_noise_clip, eval_noise_clip)
    action = action + noise.to(device)
    return action, log_prob, z, mean, log_std
def get_KL(self, params, old_log_prob, state, old_action_raw, old_action):
    torch.nn.utils.vector_to_parameters(params, self.actor.evaluate_net.parameters())
    mean, log_std = self.actor.evaluate_net.forward(state)
    std = log_std.exp()
    new_log_prob = Normal(mean, std).log_prob(old_action_raw) \
        - torch.log(1 - old_action.pow(2) + 1e-6)
    new_log_prob = new_log_prob.sum(-1, keepdim=True)
    KL = old_log_prob - new_log_prob
    KL = KL.mean()
    return KL.item()
def evaluate(self, state, epsilon=1e-6):
    mean, log_std = self.forward(state)
    std = log_std.exp()
    normal = Normal(0, 1)
    z = normal.sample(mean.shape).to(device)  # per-element noise
    action = torch.tanh(mean + std * z)
    log_prob = Normal(mean, std).log_prob(mean + std * z) \
        - torch.log(1 - action.pow(2) + epsilon)
    log_prob = log_prob.sum(dim=-1, keepdim=True)
    return action, log_prob, z, mean, log_std
def get_log_probs(self, obs_rest, epsilon=1e-6):
    mean, log_std = self.forward(obs_rest)
    std = log_std.exp()  # no clip in evaluation; clipping affects gradient flow
    action_logit = Normal(mean, std).sample()
    action = torch.tanh(action_logit)
    log_prob = Normal(mean, std).log_prob(action_logit) \
        - torch.log(1. - action.pow(2) + epsilon)
    # assert float(log_prob.mean()) == float(log_prob.mean()), "log_prob is nan"
    return log_prob.sum(dim=1, keepdim=True), action
def sample_actions_and_llhoods(self, s, explore=True):
    m, std = self(s)
    if explore:
        u = m + std * torch.randn_like(m)
    else:
        u = m
    a = torch.tanh(u)
    llhoods = Normal(m, std.abs()).log_prob(u)
    llhoods -= torch.log(1 - a.pow(2) + 1e-6)
    llhoods = llhoods.sum(1, keepdim=True)
    return a, llhoods
def get_reward(self, state):
    same_z = Normal(0, 1).sample()  # one scalar noise draw, shared across the whole batch
    mean, log_std = self.actor.policy_net.forward(state)
    std = log_std.exp()
    action_raw = mean + std * same_z
    action = torch.tanh(action_raw)
    log_prob = Normal(mean, std).log_prob(action_raw) \
        - torch.log(1 - action.pow(2) + 1e-6)
    log_prob = log_prob.sum(-1, keepdim=True)
    predicted_new_q_value_1, predicted_new_q_value_2 = self.critic.predict_q(state, action)
    predicted_new_q_value = torch.min(predicted_new_q_value_1, predicted_new_q_value_2)
    loss = (predicted_new_q_value - self.alpha * log_prob).mean()
    return loss.item()
def get_action(self, x):
    mean, log_std = self.pi(x)
    std = log_std.exp()
    normal = Normal(0, 1)
    z = normal.sample()
    action = mean + std * z
    log_prob = Normal(mean, std).log_prob(action)
    log_prob = log_prob.sum(dim=-1, keepdim=True)  # reduce feature dim
    prob = log_prob.exp()
    action = self.action_range * action  # scale the action
    return action.detach().numpy(), prob
def evaluate(self, state, epsilon=1e-6):
    mean, log_std = self.forward(state)
    std = log_std.exp()
    normal = Normal(torch.zeros(mean.shape), torch.ones(std.shape))
    z = normal.sample().to(device)
    action_0 = mean + torch.mul(z, std)
    action_1 = torch.tanh(action_0)
    action = torch.mul(self.action_range.to(device), action_1) + self.action_bias.to(device)
    log_prob = Normal(mean, std).log_prob(action_0) \
        - torch.log(1. - action_1.pow(2) + epsilon) \
        - torch.log(self.action_range.to(device))
    log_prob = log_prob.sum(dim=-1, keepdim=True)
    entropy = Normal(mean, std).entropy()
    return action, log_prob, entropy, mean.detach(), std.detach()
def evaluate(self, data, numerical_state, epsilon=1e-6):
    # must check how to fine-tune it.
    mean, log_std = self.forward(data, numerical_state)
    std = torch.exp(log_std)
    policy = mean + std * Normal(torch.zeros(4), torch.ones(4)).sample().to(self.device)
    policy.requires_grad_()
    action = torch.tanh(policy)
    # the log-prob is taken under Normal(mean, std), the policy's own density;
    # evaluating it under the standard normal used for the noise would be wrong
    log_prob = Normal(mean, std).log_prob(policy) \
        - torch.log(1 - action.pow(2) + epsilon)
    log_prob = log_prob.sum(dim=1, keepdim=True)
    return action, log_prob, policy, mean, log_std
def learn(self):
    state, action, reward, next_state, end = self.memory.sample(self.batch_size)
    state = torch.FloatTensor(state).to(device)
    action = torch.FloatTensor(action).to(device)
    reward = torch.FloatTensor(reward).unsqueeze(1).to(device)
    next_state = torch.FloatTensor(next_state).to(device)
    end = torch.FloatTensor(np.float32(end)).unsqueeze(1).to(device)

    # Training Q Networks
    predicted_q_value_1, predicted_q_value_2 = self.critic.predict_q(state, action)
    predicted_v_target = self.critic.predict_v_target(next_state)
    target_q_value = reward + (1 - end) * self.discount * predicted_v_target
    q_loss_1 = nn.MSELoss()(predicted_q_value_1, target_q_value.detach())
    q_loss_2 = nn.MSELoss()(predicted_q_value_2, target_q_value.detach())
    self.critic.learn_q(q_loss_1, q_loss_2)

    # Training V Network
    new_action, log_prob = self.actor.predict(state)
    predicted_new_q_value_1, predicted_new_q_value_2 = self.critic.predict_q(state, new_action)
    predicted_new_q_value = torch.min(predicted_new_q_value_1, predicted_new_q_value_2)
    target_v_value = predicted_new_q_value - self.alpha * log_prob
    predicted_v_value = self.critic.predict_v(state)
    v_loss = nn.MSELoss()(predicted_v_value, target_v_value.detach())
    self.critic.learn_v(v_loss)

    if self.debug_file is not None:
        z = Normal(0, 1).sample().to(device)
        mean, log_std = self.actor.policy_net.forward(state)
        std = log_std.exp()
        old_action_raw = mean + std * z
        old_action = torch.tanh(old_action_raw)
        old_log_prob = Normal(mean, std).log_prob(old_action_raw) \
            - torch.log(1 - old_action.pow(2) + 1e-6)
        old_log_prob = old_log_prob.sum(-1, keepdim=True)
        old_reward = self.get_reward(state)

    # Training Policy Network
    policy_loss = (self.alpha * log_prob - predicted_new_q_value).mean()
    self.actor.learn(policy_loss)

    if self.debug_file is not None:
        KL = self.get_KL(torch.nn.utils.parameters_to_vector(self.actor.policy_net.parameters()),
                         old_log_prob, state, old_action_raw, old_action)
        new_reward = self.get_reward(state)
        self.debug_file.write("{},{}\n".format(abs(KL), new_reward - old_reward))

    # Updating Target-V Network
    self.critic.update_target_v()
def evaluate(self, state, epsilon=1e-6):
    ''' generate sampled action with state as input wrt the policy network '''
    mean, log_std, d_action_prob = self.forward(state)
    std = log_std.exp()
    normal = Normal(0, 1)
    z = normal.sample(mean.shape).to(d)  # per-element noise
    c_action_0 = torch.tanh(mean + std * z)
    c_action = self.action_range * c_action_0
    log_prob = Normal(mean, std).log_prob(mean + std * z) \
        - torch.log(1. - c_action_0.pow(2) + epsilon) - np.log(self.action_range)
    log_prob = log_prob.sum(dim=-1, keepdim=True)
    return d_action_prob, c_action, log_prob, z, mean, log_std
def forward(self, y, X):
    # Sample parameters
    b = self.b.rsample()
    sig = self.sig.rsample()
    # Compute log-likelihood
    ll = Normal(X.matmul(b), sig).log_prob(y)
    # Compute KL(q || p)
    kl_qp = kld(self.b.dist(), Normal(0, 1)) + kld(self.sig.dist(), Gamma(1, 1))
    # Compute ELBO
    elbo = ll.sum() - kl_qp.sum()
    return elbo
def elbo(self, qz_m, qz_logv, zode_L, logpL, X, XrecL, L,
         qz_enc_m=None, qz_enc_logv=None):
    '''
    Input:
        qz_m        - latent means [N,2q]
        qz_logv     - latent logvars [N,2q]
        zode_L      - latent trajectory samples [L,N,T,2q]
        logpL       - densities of latent trajectory samples [L,N,T]
        X           - input images [N,T,nc,d,d]
        XrecL       - reconstructions [L,N,T,nc,d,d]
        qz_enc_m    - encoder density means [N*T,2*q]
        qz_enc_logv - encoder density variances [N*T,2*q]
    '''
    [N, T, nc, d, d] = X.shape
    q = qz_m.shape[1] // 2
    # prior
    log_pzt = self.mvn.log_prob(zode_L.contiguous().view([L * N * T, 2 * q]))  # L*N*T
    log_pzt = log_pzt.view([L, N, T])  # L,N,T
    kl_zt = logpL - log_pzt  # L,N,T
    kl_z = kl_zt.sum(2).mean(0)  # N
    # likelihood
    XL = X.repeat([L, 1, 1, 1, 1, 1])  # L,N,T,nc,d,d
    lhood_L = torch.log(XrecL) * XL + torch.log(1 - XrecL) * (1 - XL)  # L,N,T,nc,d,d
    lhood = lhood_L.sum([2, 3, 4, 5]).mean(0)  # N
    if qz_enc_m is not None:  # instant encoding
        qz_enc_mL = qz_enc_m.repeat([L, 1])  # L*N*T,2*q
        qz_enc_logvL = qz_enc_logv.repeat([L, 1])  # L*N*T,2*q
        mean_ = qz_enc_mL.contiguous().view(-1)  # L*N*T*2*q
        std_ = qz_enc_logvL.exp().contiguous().view(-1)  # L*N*T*2*q
        qenc_zt_ode = Normal(mean_, std_).log_prob(zode_L.contiguous().view(-1)).view([L, N, T, 2 * q])
        qenc_zt_ode = qenc_zt_ode.sum([3])  # L,N,T
        inst_enc_KL = logpL - qenc_zt_ode
        inst_enc_KL = inst_enc_KL.sum(2).mean(0)  # N
        return lhood.mean(), kl_z.mean(), inst_enc_KL.mean()
    else:
        return lhood.mean(), kl_z.mean()  # mean over training samples
def evaluate(self, state, last_action, hidden_in, epsilon=1e-6):
    ''' generate sampled action with state as input wrt the policy network '''
    mean, log_std, hidden_out = self.forward(state, last_action, hidden_in)
    std = log_std.exp()  # no clip in evaluation; clipping affects gradient flow
    normal = Normal(0, 1)
    z = normal.sample(mean.shape).cuda()  # per-element noise
    action_0 = torch.tanh(mean + std * z)  # TanhNormal distribution as actions; reparameterization trick
    action = self.action_range * action_0
    log_prob = Normal(mean, std).log_prob(mean + std * z) \
        - torch.log(1. - action_0.pow(2) + epsilon) - np.log(self.action_range)
    # both Normal.log_prob and -log(1 - a**2) have shape (N, dim_of_action);
    # Normal.log_prob outputs one value per input feature instead of a single
    # probability, so sum across the feature dim to get one log-prob per sample,
    # or else use MultivariateNormal.
    log_prob = log_prob.sum(dim=-1, keepdim=True)
    return action, log_prob, z, mean, log_std, hidden_out
def sample_action(self, s):
    mean, log_std = self.forward(s)
    std = log_std.exp()
    # calculate action using the reparameterization trick and action scaling
    normal = Normal(0, 1)
    xi = normal.sample(mean.shape)
    u = mean + std * xi.to(hyp.device)
    y = torch.tanh(u)
    a = y * self.action_scale + self.action_bias
    # enforcing action bounds (appendix of the SAC paper); folding the scale into
    # the log is equivalent, up to the epsilon guard, to subtracting log(action_scale) per dim
    log_pi = Normal(mean, std).log_prob(u) \
        - torch.log(self.action_scale * (1 - y.pow(2)) + hyp.EPSILON)
    log_pi = log_pi.sum(1, keepdim=True)
    mean = torch.tanh(mean) * self.action_scale + self.action_bias
    return a, log_pi, mean
def elbo(self, qz_m, qz_logv, zode_L, logpL, X, XrecL, Ndata,
         qz_enc_m=None, qz_enc_logv=None):
    '''
    Input:
        qz_m        - latent means [N,2q]
        qz_logv     - latent logvars [N,2q]
        zode_L      - latent trajectory samples [L,N,T,2q]
        logpL       - densities of latent trajectory samples [L,N,T]
        X           - input images [N,T,nc,d,d]
        XrecL       - reconstructions [L,N,T,nc,d,d]
        Ndata       - number of sequences in the dataset (required for ELBO scaling)
        qz_enc_m    - encoder density means [N*T,2*q]
        qz_enc_logv - encoder density variances [N*T,2*q]
    Returns:
        likelihood
        prior on ODE trajectories KL[q_ode(z_{0:T}) || N(0,I)]
        prior on BNN weights
        instant encoding term KL[q_ode(z_{0:T}) || q_enc(z_{0:T}|X_{0:T})] (if required)
    '''
    [N, T, nc, d, d] = X.shape
    L = zode_L.shape[0]
    q = qz_m.shape[1] // 2
    # prior
    log_pzt = self.mvn.log_prob(zode_L.contiguous().view([L * N * T, 2 * q]))  # L*N*T
    log_pzt = log_pzt.view([L, N, T])  # L,N,T
    kl_zt = logpL - log_pzt  # L,N,T
    kl_z = kl_zt.sum(2).mean(0)  # N
    kl_w = self.bnn.kl().sum()
    # likelihood
    XL = X.repeat([L, 1, 1, 1, 1, 1])  # L,N,T,nc,d,d
    lhood_L = torch.log(1e-3 + XrecL) * XL + torch.log(1e-3 + 1 - XrecL) * (1 - XL)  # L,N,T,nc,d,d
    lhood = lhood_L.sum([2, 3, 4, 5]).mean(0)  # N
    if qz_enc_m is not None:  # instant encoding
        qz_enc_mL = qz_enc_m.repeat([L, 1])  # L*N*T,2*q
        qz_enc_logvL = qz_enc_logv.repeat([L, 1])  # L*N*T,2*q
        mean_ = qz_enc_mL.contiguous().view(-1)  # L*N*T*2*q
        std_ = 1e-3 + qz_enc_logvL.exp().contiguous().view(-1)  # L*N*T*2*q
        qenc_zt_ode = Normal(mean_, std_).log_prob(zode_L.contiguous().view(-1)).view([L, N, T, 2 * q])
        qenc_zt_ode = qenc_zt_ode.sum([3])  # L,N,T
        inst_enc_KL = logpL - qenc_zt_ode
        inst_enc_KL = inst_enc_KL.sum(2).mean(0)  # N
        return Ndata * lhood.mean(), Ndata * kl_z.mean(), kl_w, Ndata * inst_enc_KL.mean()
    else:
        return Ndata * lhood.mean(), Ndata * kl_z.mean(), kl_w
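# The likelihood term in the two elbo() methods above is a (numerically guarded)
# Bernoulli log-likelihood written out by hand. A minimal sketch showing it
# matches torch.distributions.Bernoulli when the guard is dropped (shapes are
# illustrative):
import torch
from torch.distributions import Bernoulli

Xrec = torch.rand(2, 3).clamp(0.01, 0.99)     # reconstruction probabilities
X = torch.bernoulli(torch.full((2, 3), 0.5))  # binary targets

manual = torch.log(Xrec) * X + torch.log(1 - Xrec) * (1 - X)
reference = Bernoulli(probs=Xrec).log_prob(X)
print(torch.allclose(manual, reference, atol=1e-5))  # True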
def step(self, x, l_t, h_t):
    """
    @param x: image. (batch, channel, height, width)
    @param l_t: location trial. (batch, 2)
    @param h_t: last hidden state. (batch, rnn_hidden)
    @return h_t: next hidden state. (batch, rnn_hidden)
    @return l_t: next location trial. (batch, 2)
    @return b_t: baseline for step t. (batch)
    @return log_pi: log-probability of the next location trial. (batch)
    """
    glimpse = self.glimpse_net(x, l_t)
    h_t = self.rnn(glimpse, h_t)
    mu, l_t = self.location_net(h_t)
    b_t = self.baseline_net(h_t).squeeze()
    log_pi = Normal(mu, self.std).log_prob(l_t)
    # Note: log(p_y * p_x) = log(p_y) + log(p_x), so summing the per-coordinate
    # log-probs gives the joint log-prob of the 2-D location
    log_pi = log_pi.sum(dim=1)
    return h_t, l_t, b_t, log_pi
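# Several snippets above note that the per-feature output of Normal.log_prob must
# be summed across the action/feature dim, "or else use MultivariateNormal". A
# minimal sketch of that equivalence for a diagonal Gaussian (shapes are
# illustrative):
import torch
from torch.distributions import Normal, MultivariateNormal

mu = torch.randn(5, 2)
std = torch.rand(5, 2) + 0.1
x = torch.randn(5, 2)

summed = Normal(mu, std).log_prob(x).sum(dim=1)  # sum of per-dim log-probs
mvn = MultivariateNormal(mu, covariance_matrix=torch.diag_embed(std ** 2))
print(torch.allclose(summed, mvn.log_prob(x), atol=1e-5))  # True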
def so3_entropy_old(w_eps, std, k=10):
    '''
    w_eps(Tensor of dim 3): sample from so3
    std(Tensor of dim 3): std of distribution on so3
    k: use 2k+1 samples for truncated summation
    '''
    # entropy of Gaussian distribution on so3
    # see appendix C of https://arxiv.org/pdf/1807.04689.pdf
    theta = w_eps.norm(p=2)
    u = w_eps / theta  # 3
    angles = 2 * np.pi * torch.arange(-k, k + 1, dtype=w_eps.dtype, device=w_eps.device)  # 2k+1
    theta_hat = theta + angles  # 2k+1
    x = u[None, :] * theta_hat[:, None]  # 2k+1, 3
    log_p = Normal(torch.zeros(3, device=w_eps.device), std).log_prob(x)  # 2k+1, 3
    clamp = 1e-3
    log_vol = torch.log((theta_hat ** 2).clamp(min=clamp) /
                        (2 - 2 * torch.cos(theta)).clamp(min=clamp))  # 2k+1
    log_p = log_p.sum(-1) + log_vol
    entropy = -logsumexp(log_p)
    return entropy
def sample(mod, lam_draw, y_grid=None):
    if y_grid is None:
        upper = 6
        lower = -6
        grid_size = 100
        step = (upper - lower) / grid_size
        y_grid = torch.arange(start=lower, end=upper, step=step)
    # TODO: TEST
    gam, mu, sig = gam_post.sample(mod, lam_draw)
    dden = []
    for i in range(mod.I):
        gami_onehot = util.get_one_hot(gam[i], sum(mod.L))
        obs_i = 1 - mod.m[i]
        mu_i = (gami_onehot * mu[None, None, :]).sum(-1)
        dden_i = Normal(mu_i[:, :, None], sig[i]).log_prob(y_grid[None, None, :]).exp()
        dden_i = dden_i * obs_i[:, :, None].double()
        dden_i = dden_i.sum(0) / obs_i.sum(0, keepdim=True).double().transpose(0, 1)
        dden.append(dden_i)
    return (y_grid, dden)
def so3_entropy(w_eps, std, k=10):
    '''
    w_eps(Tensor of dim Bx3): sample from so3
    std(Tensor of dim Bx3): std of distribution on so3
    k: use 2k+1 samples for truncated summation
    '''
    # entropy of Gaussian distribution on so3
    # see appendix C of https://arxiv.org/pdf/1807.04689.pdf
    theta = w_eps.norm(p=2, dim=-1, keepdim=True)  # [B, 1]
    u = w_eps / theta  # [B, 3]
    angles = 2 * np.pi * torch.arange(-k, k + 1, dtype=w_eps.dtype, device=w_eps.device)  # 2k+1
    theta_hat = theta[:, None, :] + angles[:, None]  # [B, 2k+1, 1]
    x = u[:, None, :] * theta_hat  # [B, 2k+1, 3]
    log_p = Normal(torch.zeros(3, device=w_eps.device), std).log_prob(x.permute([1, 0, 2]))  # [2k+1, B, 3]
    log_p = log_p.permute([1, 0, 2])  # [B, 2k+1, 3]
    clamp = 1e-3
    log_vol = torch.log((theta_hat ** 2).clamp(min=clamp) /
                        (2 - 2 * torch.cos(theta_hat)).clamp(min=clamp))  # [B, 2k+1, 1]
    log_p = log_p.sum(-1) + log_vol.sum(-1)  # [B, 2k+1]
    entropy = -logsumexp(log_p, -1)
    return entropy
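# A hypothetical usage sketch for so3_entropy() above, assuming logsumexp refers
# to torch.logsumexp (the snippet does not show where it comes from); the batch
# size and std values are illustrative:
import torch

logsumexp = torch.logsumexp  # assumption: the helper the snippet calls

B = 4
std = torch.full((B, 3), 0.1)    # per-axis std of the Gaussian on so(3)
w_eps = std * torch.randn(B, 3)  # reparameterized so(3) sample
H = so3_entropy(w_eps, std, k=10)
print(H.shape)  # torch.Size([4]) -- one entropy estimate per batch element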
def forward(self, state, deterministic=False):
    x = F.relu(self.linear1(state))
    x = F.relu(self.linear2(x))
    mean = self.mean_linear(x)
    log_std = self.log_std_linear(x)
    log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max)
    std = torch.exp(log_std)
    log_prob = None
    if deterministic:
        action = torch.tanh(mean)
    else:
        normal = Normal(0, 1)
        z = mean + std * normal.sample(mean.shape).to(  # per-element noise
            torch.device("cuda" if torch.cuda.is_available() else "cpu"))
        action = torch.tanh(z)
        log_prob = Normal(mean, std).log_prob(z) \
            - torch.log(1 - action.pow(2) + self.epsilon)
        log_prob = log_prob.sum(dim=1, keepdim=True)
    return action, mean, log_std, log_prob, std
def learn(self):
    state, action, reward, next_state, end = self.memory.sample(self.batch_size)
    state = torch.FloatTensor(state).to(device)
    action = torch.FloatTensor(action).to(device)
    reward = torch.FloatTensor(reward).unsqueeze(1).to(device)
    next_state = torch.FloatTensor(next_state).to(device)
    end = torch.FloatTensor(np.float32(end)).unsqueeze(1).to(device)

    # Training Q Networks
    predicted_q_value_1, predicted_q_value_2 = self.critic.predict_q(state, action)
    predicted_v_target = self.critic.predict_v_target(next_state)
    target_q_value = reward + (1 - end) * self.discount * predicted_v_target
    q_loss_1 = nn.MSELoss()(predicted_q_value_1, target_q_value.detach())
    q_loss_2 = nn.MSELoss()(predicted_q_value_2, target_q_value.detach())
    self.critic.learn_q(q_loss_1, q_loss_2)

    # Training V Network
    new_action, log_prob = self.actor.predict(state)
    predicted_new_q_value_1, predicted_new_q_value_2 = self.critic.predict_q(state, new_action)
    predicted_new_q_value = torch.min(predicted_new_q_value_1, predicted_new_q_value_2)
    target_v_value = predicted_new_q_value - self.alpha * log_prob
    predicted_v_value = self.critic.predict_v(state)
    v_loss = nn.MSELoss()(predicted_v_value, target_v_value.detach())
    self.critic.learn_v(v_loss)

    # Training Policy Network
    policy_loss = (self.alpha * log_prob - predicted_new_q_value).mean()
    normal = Normal(0, 1)
    z = normal.sample().to(device)
    mean, log_std = self.actor.policy_net.forward(state)
    std = log_std.exp()
    old_action_raw = mean + std * z
    old_action = torch.tanh(old_action_raw)
    old_log_prob = Normal(mean, std).log_prob(old_action_raw) \
        - torch.log(1 - old_action.pow(2) + 1e-6)
    old_log_prob = old_log_prob.sum(-1, keepdim=True)
    if self.debug_file is not None:
        old_reward = self.get_reward(state)
    params = torch.nn.utils.parameters_to_vector(self.actor.policy_net.parameters())
    search_direction = torch.nn.utils.parameters_to_vector(
        torch.autograd.grad(policy_loss, self.actor.policy_net.parameters(), retain_graph=True))
    unit_size = torch.FloatTensor([1e-4]).to(device)
    max_iteration = 5
    # line search: take fixed-size steps along the gradient while the KL to the
    # old policy stays within the trust region self.tr
    for i in range(max_iteration):
        test_params = params - search_direction * unit_size
        KL = self.get_KL(test_params, old_log_prob, state, old_action_raw, old_action)
        if abs(KL) <= self.tr:
            params = test_params
            torch.nn.utils.vector_to_parameters(params, self.actor.policy_net.parameters())
            # Compute new direction
            new_action, log_prob = self.actor.predict(state)
            predicted_new_q_value_1, predicted_new_q_value_2 = self.critic.predict_q(state, new_action)
            predicted_new_q_value = torch.min(predicted_new_q_value_1, predicted_new_q_value_2)
            policy_loss = (self.alpha * log_prob - predicted_new_q_value).mean()
            search_direction = torch.nn.utils.parameters_to_vector(
                torch.autograd.grad(policy_loss, self.actor.policy_net.parameters(), retain_graph=True))
        else:
            break
    if self.debug_file is not None:
        KL = self.get_KL(
            torch.nn.utils.parameters_to_vector(self.actor.policy_net.parameters()),
            old_log_prob, state, old_action_raw, old_action)
        new_reward = self.get_reward(state)
        self.debug_file.write("{},{}\n".format(abs(KL), new_reward - old_reward))

    # Updating Target-V Network
    self.critic.update_target_v()
total_ll = 0.0
aleatoric = 0.0
epistemic = 0.0
squared_err = 0.0
# total_d_ll = 0.0
for i, (x, y) in enumerate(test):
    x, y = x.to(device), y.to(device)
    mus = torch.zeros(args.samples, x.size(0), device=device)
    logvars = torch.zeros(args.samples, x.size(0), device=device)
    for j in range(args.samples):
        mus[j], logvars[j], _ = model(x)
    ll = Normal(mus, torch.exp(logvars / 2)).log_prob(y)
    # log sum_s p(y | theta_s); a Monte-Carlo average would subtract log(args.samples)
    total_ll += torch.logsumexp(ll.sum(dim=1), dim=0).item()
    # mean = mus.mean(dim=0)
    # std = (
    #     mus.var(dim=0) + torch.exp(logvars / 2).mean(dim=0) ** 2
    # ) ** 0.5
    # total_d_ll += Normal(mean, std).log_prob(y).sum()
    epistemic += mus.var(dim=0).mean().item()
    aleatoric += torch.exp(logvars).mean(dim=0).sum().item()
    real_y = y * test.dataset.y_sigma + test.dataset.y_mu  # type: ignore
    real_mu = mus.mean(dim=0) * test.dataset.y_sigma + test.dataset.y_mu  # type: ignore
    squared_err += ((real_y - real_mu) ** 2).sum().item()
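# The logsumexp accumulation above computes log sum_s p(y | theta_s) per batch.
# For reference, a per-point Monte-Carlo predictive log-likelihood additionally
# subtracts log S; a minimal sketch (S, N, and the parameter tensors are
# illustrative, not taken from the loop above):
import math
import torch
from torch.distributions import Normal

S, N = 8, 16  # posterior samples, data points
mus = torch.randn(S, N)
sigmas = torch.rand(S, N) + 0.5
y = torch.randn(N)

ll = Normal(mus, sigmas).log_prob(y)                # [S, N]
pred_ll = torch.logsumexp(ll, dim=0) - math.log(S)  # log (1/S) sum_s p(y_n | theta_s)
print(pred_ll.shape)  # torch.Size([16])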