def test_beta_shape_scalar_params(self):
    dist = Beta(0.1, 0.1)
    self.assertEqual(dist._batch_shape, torch.Size())
    self.assertEqual(dist._event_shape, torch.Size())
    self.assertEqual(dist.sample().size(), torch.Size((1,)))
    self.assertEqual(dist.sample((3, 2)).size(), torch.Size((3, 2)))
    self.assertRaises(ValueError, dist.log_prob, self.scalar_sample)
    self.assertEqual(dist.log_prob(self.tensor_sample_1).size(), torch.Size((3, 2)))
    self.assertEqual(dist.log_prob(self.tensor_sample_2).size(), torch.Size((3, 2, 3)))
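# A minimal standalone shape check, assuming a recent PyTorch release in which
# scalar-parameter distributions yield 0-dim samples (the test above appears to
# target an older API where the scalar-parameter sample shape was (1,)).
import torch
from torch.distributions import Beta

d = Beta(0.1, 0.1)
print(d.batch_shape, d.event_shape)  # torch.Size([]) torch.Size([])
print(d.sample().shape)              # torch.Size([]) on recent versions
print(d.sample((3, 2)).shape)        # torch.Size([3, 2])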
def update(self, epochs, steps, total_obs, total_actions, advantage, real_values):
    total_obs_ = torch.from_numpy(total_obs).type(torch.FloatTensor)
    advantage_ = torch.from_numpy(advantage).type(torch.FloatTensor)
    real_values_ = torch.from_numpy(real_values).type(torch.FloatTensor)
    total_actions = torch.from_numpy(total_actions).type(torch.FloatTensor)

    for _ in range(epochs):
        inds = np.arange(steps)
        np.random.shuffle(inds)
        for t in range(steps):
            index = inds[t]
            alpha, beta, values_to_backprop = self.network(total_obs_[index].unsqueeze(0))
            m = Beta(alpha, beta)
            action_taken_prob = m.log_prob(total_actions[index]).sum(dim=1, keepdim=True)
            entropy = m.entropy().sum(dim=1)

            alpha, beta, _ = self.old_network(total_obs_[index].unsqueeze(0))
            m_old = Beta(alpha, beta)
            old_action_taken_probs = m_old.log_prob(total_actions[index]).sum(dim=1, keepdim=True)

            # Both terms are log-probabilities, so the PPO importance ratio is
            # exp(log pi_new - log pi_old); the original divided the log values.
            ratios = torch.exp(action_taken_prob - old_action_taken_probs)

            surr1 = ratios * advantage_[index]
            surr2 = torch.clamp(ratios, min=1. - .1, max=1. + .1) * advantage_[index]
            policy_loss = -torch.min(surr1, surr2)
            value_loss = (values_to_backprop - real_values_[index]) ** 2
            # value_loss = F.smooth_l1_loss(values_to_backprop, real_values_[index])
            total_loss = policy_loss + value_loss - 0.01 * entropy

            self.optimizer.zero_grad()
            total_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.network.parameters(), 0.5)
            self.optimizer.step()

    self.old_network.load_state_dict(self.dic_placeholder)
    self.dic_placeholder = self.network.state_dict()
    return value_loss
def forward(self, x=None, warmup=1., inf_net=None):  # , k=1, marginf_type=0
    # x: [B,3,112,112]
    # q: [B,L]
    # inf type: 0 is both, 1 is only x, 2 is only y
    # dec type: 0 is both, 1 is only x, 2 is only y
    outputs = {}

    if inf_net is None:
        mu, logvar = self.inference_net(x)
    else:
        mu, logvar = inf_net.inference_net(x)

    z, logpz, logqz = self.sample(mu, logvar)
    z_dec = self.z_to_dec(z)
    B = z_dec.shape[0]

    # Decode Image
    x_hat = self.image_decoder(z_dec)
    alpha = torch.sigmoid(x_hat)
    beta = Beta(alpha * self.beta_scale, (1. - alpha) * self.beta_scale)
    # Add uniform dequantization noise and keep x strictly inside (0, 1),
    # where the Beta log-density is finite.
    x_noise = torch.clamp(
        x + torch.FloatTensor(x.shape).uniform_(0., 1. / 256.).cuda(),
        min=1e-5, max=1 - 1e-5)
    logpx = beta.log_prob(x_noise)  # [B,3,112,112]
    logpx = torch.sum(logpx.view(B, -1), 1)  # [B]

    log_ws = logpx + logpz - logqz

    outputs['logpx'] = torch.mean(logpx)
    outputs['x_recon'] = alpha
    outputs['welbo'] = torch.mean(logpx + warmup * (logpz - logqz))
    outputs['elbo'] = torch.mean(log_ws)
    outputs['logws'] = log_ws
    outputs['z'] = z
    outputs['logpz'] = torch.mean(logpz)
    outputs['logqz'] = torch.mean(logqz)
    outputs['logvar'] = logvar

    return outputs
def chooseActionTrain(self, state):
    """
    Choose an action during training mode.

    Parameters
    ----------
    state : np.ndarray
        The current state of the car.

    Returns
    -------
    action : np.ndarray
        The actions to run on the track.
    coefficient : float
        The log probability of the sampled action.

    Notes
    -----
    This function is only called when the --train flag IS provided.
    """
    state = torch.from_numpy(state).double().to(self.hardwareDevice).unsqueeze(0)
    with torch.no_grad():
        alpha, beta = self.nn(state)[0]
    dist = Beta(alpha, beta)
    action = dist.sample()
    coefficient = dist.log_prob(action).sum(dim=1)
    action = action.squeeze().cpu().numpy()
    coefficient = coefficient.item()
    return action, coefficient
def update(self):
    self.training_step += 1

    s = torch.tensor(self.buffer['s'], dtype=torch.double).to(device)
    a = torch.tensor(self.buffer['a'], dtype=torch.double).to(device)
    r = torch.tensor(self.buffer['r'], dtype=torch.double).to(device).view(-1, 1)
    s_ = torch.tensor(self.buffer['s_'], dtype=torch.double).to(device)
    old_a_logp = torch.tensor(self.buffer['a_logp'], dtype=torch.double).to(device).view(-1, 1)

    with torch.no_grad():
        target_v = r + args.gamma * self.net(s_, actual_obs=False)[1]
        adv = target_v - self.net(s, actual_obs=False)[1]
        # adv = (adv - adv.mean()) / (adv.std() + 1e-8)

    for _ in range(self.ppo_epoch):
        for index in BatchSampler(SubsetRandomSampler(range(self.buffer_capacity)),
                                  self.batch_size, False):
            alpha, beta = self.net(s[index], actual_obs=False)[0]
            dist = Beta(alpha, beta)
            a_logp = dist.log_prob(a[index]).sum(dim=1, keepdim=True)
            ratio = torch.exp(a_logp - old_a_logp[index])

            surr1 = ratio * adv[index]
            surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * adv[index]
            action_loss = -torch.min(surr1, surr2).mean()
            value_loss = F.smooth_l1_loss(self.net(s[index], actual_obs=False)[1], target_v[index])
            loss = action_loss + 2. * value_loss

            self.optimizer.zero_grad()
            loss.backward()
            # nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm)
            self.optimizer.step()
def forward(self, observation, reparameterize=True, deterministic=False, return_log_prob=False):
    """
    Forward pass. Assumes input is a torch tensor.

    :type observation: torch.Tensor
    """
    layer_input = observation
    for fc in self.fcs:
        layer_input = self.hidden_activation(fc(layer_input))
    network_output = self.output_activation(self.last_fc(layer_input))
    alpha = network_output[:, 0].unsqueeze(1) + EPSILON
    beta = network_output[:, 1].unsqueeze(1) + EPSILON
    distribution = Beta(alpha, beta)
    distribution_mean = distribution.mean
    if deterministic:
        # A deterministic policy returns the distribution mean; only the
        # stochastic branch draws a reparameterized sample. The original
        # code had these two branches swapped.
        sample = distribution_mean
    else:
        sample = distribution.rsample()
    # transform to range (min, max)
    action = self.min + self.max_min_difference * sample
    mean = self.min + self.max_min_difference * distribution_mean
    variance = self.max_min_difference_squared * distribution.variance
    std = torch.sqrt(variance)
    log_std = torch.log(std)
    log_prob = distribution.log_prob(sample)
    entropy = distribution.entropy()
    mean_action_log_prob = None
    pre_tanh_value = None
    return action, mean, log_std, log_prob, entropy, std, mean_action_log_prob, pre_tanh_value
def update(self):
    self.training_step += 1

    s = torch.tensor(self.buffer['s'], dtype=torch.double)
    a = torch.tensor(self.buffer['a'], dtype=torch.double)
    r = torch.tensor(self.buffer['r'], dtype=torch.double).view(-1, 1)
    s_ = torch.tensor(self.buffer['s_'], dtype=torch.double)
    old_a_logp = torch.tensor(self.buffer['a_logp'], dtype=torch.double).view(-1, 1)

    with torch.no_grad():
        target_v = r + self.gamma * self.net(s_)[1]
        adv = target_v - self.net(s)[1]

    for _ in range(self.ppo_epoch):
        for index in BatchSampler(SubsetRandomSampler(range(self.buffer_capacity)),
                                  self.batch_size, False):
            alpha, beta = self.net(s[index])[0]
            dist = Beta(alpha, beta)
            a_logp = dist.log_prob(a[index]).sum(dim=1, keepdim=True)
            ratio = torch.exp(a_logp - old_a_logp[index])

            surr1 = ratio * adv[index]
            surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * adv[index]
            action_loss = -torch.min(surr1, surr2).mean()
            value_loss = F.smooth_l1_loss(self.net(s[index])[1], target_v[index])
            loss = action_loss + 2. * value_loss

            self.optimizer.zero_grad()
            loss.backward()
            # Intuition says to do this step differently, i.e. compute the loss
            # on minibatches and take multiple SGD steps. New insight: the shape
            # of the clipped objective is what keeps theta from moving into the
            # region where the ratio exceeds 1 + epsilon; the gradient norm near
            # that 'ceiling' approaches 0, so we never move far into that
            # territory. This clearly works with multiple SGD steps, though it
            # is less obvious how a single grad * lr step behaves. Within one
            # update theta_k is constant, so we are always moving in the same
            # space. What if a step is too big? Then the gradient becomes 0 and
            # that sample has simply finished early. Epsilon applies to each
            # individual action, so every action not yet at the ceiling takes a
            # gradient step toward it; PPO just limits the adjustment for each
            # action under the policy (given the state). The objective must be
            # maximized per action, and adjusting theta for another transition
            # can push a different ratio above epsilon. That is fine, as long as
            # the optimizer does not act greedily with respect to it.
            self.optimizer.step()
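# A self-contained sketch of the property discussed in the comments above: once
# the ratio leaves the clip interval and the clipped branch is the minimum, the
# gradient vanishes. All values here are made up for illustration.
import torch

log_ratio = torch.tensor([0.3], requires_grad=True)
ratio = torch.exp(log_ratio)   # ~1.35, outside the [0.9, 1.1] clip interval
adv = torch.tensor([1.0])
surr = torch.min(ratio * adv, torch.clamp(ratio, 0.9, 1.1) * adv)
surr.sum().backward()
print(log_ratio.grad)          # tensor([0.]) -- no further push upward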
def test_beta_log_prob(self):
    for _ in range(100):
        alpha = np.exp(np.random.normal())
        beta = np.exp(np.random.normal())
        dist = Beta(alpha, beta)
        x = dist.sample()
        actual_log_prob = dist.log_prob(x).sum()
        expected_log_prob = scipy.stats.beta.logpdf(x, alpha, beta)[0]
        self.assertAlmostEqual(actual_log_prob, expected_log_prob, places=3, allow_inf=True)
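# A standalone version of the same cross-check, assuming torch, numpy, and
# scipy are available: torch.distributions.Beta.log_prob should agree with
# scipy.stats.beta.logpdf up to numerical precision.
import numpy as np
import scipy.stats
import torch
from torch.distributions import Beta

alpha = float(np.exp(np.random.normal()))
beta = float(np.exp(np.random.normal()))
x = Beta(alpha, beta).sample()
print(Beta(alpha, beta).log_prob(x).item())
print(scipy.stats.beta.logpdf(x.item(), alpha, beta))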
def test_beta_shape_tensor_params(self):
    dist = Beta(torch.Tensor([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]),
                torch.Tensor([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]))
    self.assertEqual(dist._batch_shape, torch.Size((3, 2)))
    self.assertEqual(dist._event_shape, torch.Size(()))
    self.assertEqual(dist.sample().size(), torch.Size((3, 2)))
    self.assertEqual(dist.sample((3, 2)).size(), torch.Size((3, 2, 3, 2)))
    self.assertEqual(dist.log_prob(self.tensor_sample_1).size(), torch.Size((3, 2)))
    self.assertRaises(ValueError, dist.log_prob, self.tensor_sample_2)
def forward(self, x=None, warmup=1., inf_net=None):
    outputs = {}
    B = x.shape[0]

    if inf_net is None:
        z, logits = self.q.sample(x)
    else:
        z, logqz = inf_net.sample(x)

    # KL between the Bernoulli posterior q(b|x) and a uniform Bernoulli prior (p = .5)
    probs_q = torch.sigmoid(logits)
    probs_q = torch.clamp(probs_q, min=.00000001, max=.9999999)
    probs_p = torch.ones(B, self.z_size).cuda() * .5
    KL = probs_q * torch.log(probs_q / probs_p) \
        + (1 - probs_q) * torch.log((1 - probs_q) / (1 - probs_p))
    KL = torch.sum(KL, dim=1)

    # Decode Image
    x_hat = self.generator.forward(z)
    alpha = torch.sigmoid(x_hat)
    beta = Beta(alpha * self.beta_scale, (1. - alpha) * self.beta_scale)
    x_noise = torch.clamp(
        x + torch.FloatTensor(x.shape).uniform_(0., 1. / 256.).cuda(),
        min=1e-5, max=1 - 1e-5)
    logpx = beta.log_prob(x_noise)  # [B,3,112,112], with uniform dequantization noise
    logpx = torch.sum(logpx.view(B, -1), 1)  # [B]

    log_ws = logpx - KL  # i.e. logpx + logpz - logqz

    outputs['logpx'] = torch.mean(logpx)
    outputs['x_recon'] = alpha
    # The KL must be subtracted here; the original added it (warmup scales the
    # (logpz - logqz) = -KL term, matching the elbo line above).
    outputs['welbo'] = torch.mean(logpx - warmup * KL)
    outputs['elbo'] = torch.mean(log_ws)
    outputs['logws'] = log_ws
    outputs['z'] = z
    outputs['logpz'] = torch.zeros(1)
    outputs['logqz'] = torch.mean(KL)
    return outputs
def trainmodel(self):
    s = torch.tensor(self.memory.buffer['s'], dtype=torch.double).to(device)
    a = torch.tensor(self.memory.buffer['a'], dtype=torch.double).to(device)
    # r = torch.tensor(self.memory.buffer['r'], dtype=torch.double).to(device).view(-1, 1)
    s_ = torch.tensor(self.memory.buffer['s_'], dtype=torch.double).to(device)
    # v = torch.tensor(self.memory.buffer['v'], dtype=torch.double).to(device).view(-1, 1)

    input = s_[-1].view(1, 4, 28, 28)
    future_value = self.net(input)[1].item()
    adv, target_v = self.getgae(future_value)
    adv = torch.tensor(np.array(adv), dtype=torch.double).to(device).view(-1, 1)
    target_v = torch.tensor(target_v, dtype=torch.double).to(device).view(-1, 1)
    adv = (adv - adv.mean()) / (adv.std() + 1e-5)
    old_a_logp = torch.tensor(self.memory.buffer['a_logp'],
                              dtype=torch.double).to(device).view(-1, 1)

    for _ in range(self.PPOepoch):
        for index in BatchSampler(
                SubsetRandomSampler(range(self.memory.buffer_capacity)),
                self.memory.batch_size, False):
            alpha, beta = self.net(s[index])[0]
            dist = Beta(alpha, beta)
            a_logp = dist.log_prob(a[index]).sum(dim=1).reshape(-1, 1)
            ratio = torch.exp(a_logp - old_a_logp[index])
            # The entropy bonus must keep its gradient to have any effect; the
            # original computed it under torch.no_grad(), which turns the bonus
            # into a constant.
            entrop = dist.entropy()

            surr1 = ratio * adv[index]
            surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * adv[index]
            action_loss = -torch.min(surr1, surr2).mean()
            value_loss = F.smooth_l1_loss(self.net(s[index])[1], target_v[index])
            self.storeloss(action_loss, value_loss)
            action_loss = torch.clamp(action_loss, 0, 10)
            value_loss = torch.clamp(value_loss, 0, 10)
            loss = action_loss + 2. * value_loss - args.bound * entrop.mean()

            self.optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm)
            self.optimizer.step()
    torch.save(self.net.state_dict(), self.path_t7)
def select_action(self, state):
    state = torch.from_numpy(state).double().to(device).unsqueeze(0)
    with torch.no_grad():
        alpha, beta = self.net(state)[0]
    dist = Beta(alpha, beta)
    action = dist.sample()  # 3 values in [0, 1]
    a_logp = dist.log_prob(action).sum(dim=1)  # for PPO

    action = action.squeeze().cpu().numpy()
    a_logp = a_logp.item()
    return action, a_logp
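# Beta samples live in (0, 1), so they are typically rescaled per dimension
# before being passed to env.step. The exact mapping below (steer to [-1, 1],
# gas and brake to [0, 1], as in CarRacing-style setups) is an assumption, not
# taken from this file.
import numpy as np

action = np.array([0.5, 0.8, 0.1])  # hypothetical Beta sample
env_action = action * np.array([2., 1., 1.]) + np.array([-1., 0., 0.])
print(env_action)                   # [0.  0.8 0.1]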
def select_action(self, state):
    state = torch.from_numpy(state).double().to(device).unsqueeze(0)
    with torch.no_grad():
        (alpha, beta), _, rcrc_s = self.net(state)
    dist = Beta(alpha, beta)
    action = dist.sample()
    a_logp = dist.log_prob(action).sum(dim=1)

    action = action.squeeze().cpu().numpy()
    a_logp = a_logp.item()
    return action, a_logp, rcrc_s
def select_action(self, state):
    # deal with the datatype of state and transform it
    state = torch.from_numpy(state).double().unsqueeze(0)
    with torch.no_grad():
        alpha, beta = self.net(state)[0]
    dist = Beta(alpha, beta)
    action = dist.sample()  # sampled action in the interval (0, 1)
    a_logp = dist.log_prob(action).sum(dim=1)  # sum the log densities over the 3-stack

    action = action.squeeze().numpy()
    a_logp = a_logp.item()
    return action, a_logp
def gnll_loss_beta(y, param_1, param_2):
    batch_size = y.shape[0]
    loss = 0
    for i in range(batch_size):
        beta = Beta(param_1[i], param_2[i])
        sample = y[i].reshape(-1, 1)
        # log_prob is -inf for a score of exactly 1.0 or 0.0, which makes the
        # loss nan, so nudge the targets strictly inside (0, 1). The original
        # looped over the rows and bumped values equal to 0 or 1 by 1e-3.
        sample = torch.clamp(sample, 1.0e-3, 1.0 - 1.0e-3)
        log_likelihood = beta.log_prob(sample)
        loss -= torch.mean(log_likelihood)
    return loss + 200
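# A hedged usage sketch for gnll_loss_beta as defined above; all shapes and
# values are made up. The "+ 200" offset is kept from the original and appears
# to be an arbitrary constant shift of the negative log-likelihood.
import torch
from torch.distributions import Beta

y = torch.rand(4, 9)                  # hypothetical batch of scores in [0, 1]
param_1 = torch.rand(4, 9) * 5 + 0.1  # concentration1 > 0
param_2 = torch.rand(4, 9) * 5 + 0.1  # concentration0 > 0
print(gnll_loss_beta(y, param_1, param_2))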
def select_action(self, state, hidden):
    with torch.no_grad():
        _, latent_mu, _ = self.vae(state)
        alpha, beta = self.net(latent_mu, hidden[0])[0]
        dist = Beta(alpha, beta)
        action = dist.sample()
        a_logp = dist.log_prob(action).sum(dim=1)
        a_logp = a_logp.item()
        _, _, _, _, _, next_hidden = self.mdrnn(action, latent_mu, hidden)
    return action.squeeze().cpu().numpy(), a_logp, latent_mu, next_hidden
def update(self):
    """Run an update on the network."""
    self.trainingStep += 1

    sliceToConcat = torch.tensor(self.buffer['slice_to_concat'],
                                 dtype=torch.double).to(self.hardwareDevice).view(-1, 1)
    matrixA = torch.tensor(self.buffer['matrix_a'],
                           dtype=torch.double).to(self.hardwareDevice)
    indexExp = torch.tensor(self.buffer['index_exp'],
                            dtype=torch.double).to(self.hardwareDevice)
    s = torch.tensor(self.buffer['s'], dtype=torch.double).to(self.hardwareDevice)
    old_coefficient = torch.tensor(self.buffer['coefficient'],
                                   dtype=torch.double).to(self.hardwareDevice).view(-1, 1)

    with torch.no_grad():
        target = sliceToConcat + self.discount * self.nn(indexExp)[1]
        advantage = target - self.nn(s)[1]

    for _ in range(self.epoch):
        for index in BatchSampler(SubsetRandomSampler(range(self.bufferSize)),
                                  self.batchSize, False):
            alpha, beta = self.nn(s[index])[0]
            distance = Beta(alpha, beta)
            coefficient = distance.log_prob(matrixA[index]).sum(dim=1, keepdim=True)
            relativeAdvantage = torch.exp(coefficient - old_coefficient[index])

            s1 = relativeAdvantage * advantage[index]
            # The original referenced an undefined name `ratio` here; the
            # importance ratio is `relativeAdvantage`.
            s2 = torch.clamp(relativeAdvantage, 1.0 - self.clipParameter,
                             1.0 + self.clipParameter) * advantage[index]

            # Loss on an action
            aLoss = -torch.min(s1, s2).mean()
            # Loss on the value
            vLoss = F.smooth_l1_loss(self.nn(s[index])[1], target[index])
            # Total loss calculation
            loss = aLoss + (vLoss * 2.0)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
class CarlaImgPolicy(nn.Module):
    def __init__(self, input_dim, action_dim, hidden_layer=[400, 300]):
        super(CarlaImgPolicy, self).__init__()
        self.main_actor = CarlaSimpleEncoder(latent_size=input_dim - 1)
        self.main_critic = CarlaSimpleEncoder(latent_size=input_dim - 1)

        actor_layer_size = [input_dim] + hidden_layer
        actor_feature_layers = nn.ModuleList([])
        for i in range(len(actor_layer_size) - 1):
            actor_feature_layers.append(nn.Linear(actor_layer_size[i], actor_layer_size[i + 1]))
            actor_feature_layers.append(nn.ReLU())
        self.actor = nn.Sequential(*actor_feature_layers)
        self.alpha_head = nn.Sequential(nn.Linear(hidden_layer[-1], action_dim), nn.Softplus())
        self.beta_head = nn.Sequential(nn.Linear(hidden_layer[-1], action_dim), nn.Softplus())

        critic_layer_size = [input_dim] + hidden_layer
        critic_layers = nn.ModuleList([])
        for i in range(len(critic_layer_size) - 1):
            critic_layers.append(nn.Linear(critic_layer_size[i], critic_layer_size[i + 1]))
            critic_layers.append(nn.ReLU())
        critic_layers.append(layer_init(nn.Linear(hidden_layer[-1], 1), gain=1))
        self.critic = nn.Sequential(*critic_layers)

    def forward(self, x, action=None):
        speed = x[:, -1:]
        x = x[:, :-1].view(-1, 3, 128, 128)  # image size in the Carla driving task is 128x128
        x1 = self.main_actor(x)
        x1 = torch.cat([x1, speed], dim=1)
        x2 = self.main_critic(x)
        x2 = torch.cat([x2, speed], dim=1)

        actor_features = self.actor(x1)
        alpha = self.alpha_head(actor_features) + 1
        beta = self.beta_head(actor_features) + 1
        self.dist = Beta(alpha, beta)
        if action is None:
            action = self.dist.sample()
        else:
            action = (action + 1) / 2  # map the env action from [-1, 1] back to (0, 1)
        action_log_prob = self.dist.log_prob(action).sum(-1)
        entropy = self.dist.entropy().sum(-1)
        value = self.critic(x2)
        return action * 2 - 1, action_log_prob, value.squeeze(-1), entropy
def update(self):
    self.training_step += 1

    s = torch.tensor(self.buffer['s'], dtype=torch.double).to(device)
    a = torch.tensor(self.buffer['a'], dtype=torch.double).to(device)
    r = torch.tensor(self.buffer['r'], dtype=torch.double).to(device).view(-1, 1)
    s_ = torch.tensor(self.buffer['s_'], dtype=torch.double).to(device)
    old_a_logp = torch.tensor(self.buffer['a_logp'], dtype=torch.double).to(device).view(-1, 1)

    with torch.no_grad():
        # Using TD(1): target value = reward + discounted value of the next state
        target_v = r + gamma * self.net(s_)[1]
        adv = target_v - self.net(s)[1]  # advantage estimator
        # adv = (adv - adv.mean()) / (adv.std() + 1e-8)

    for _ in range(self.ppo_epoch):
        # index is range(buffer_capacity), randomly shuffled and grouped into
        # chunks of batch_size
        for index in BatchSampler(SubsetRandomSampler(range(self.buffer_capacity)),
                                  self.batch_size, False):
            (alpha, beta), value = self.net(s[index])
            dist = Beta(alpha, beta)
            # probability of the stored action under the current policy
            a_logp = dist.log_prob(a[index]).sum(dim=1, keepdim=True)
            # surrogate ratio pi(a_t|s_t) / pi_old(a_t|s_t)
            ratio = torch.exp(a_logp - old_a_logp[index])

            surr1 = ratio * adv[index]
            # clipping: only update the policy for ratios within [0.9, 1.1]
            surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * adv[index]
            # take the smaller of the clipped and unclipped terms, so no single
            # update moves the policy too far
            action_loss = -torch.min(surr1, surr2).mean()
            # smooth l1 loss: behaves like l2 within [-1, 1]
            value_loss = F.smooth_l1_loss(value, target_v[index])
            # arguably the value loss should be squared here, but an l1-style
            # loss seems to be used instead
            loss = action_loss + 2. * value_loss

            self.optimizer.zero_grad()
            loss.backward()
            # nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm)
            # gradient clipping, common for RNN-like models, keeps updates bounded
            self.optimizer.step()
def forward(self, s, g, greedy=False, action_logit=None):
    """Produce an action."""
    c0, c1 = self.action_stats(s, g)
    # Mode of Beta(c0, c1); well-defined for c0, c1 > 1.
    action_mode = (c0 - 1) / (c0 + c1 - 2)
    m = Beta(c0, c1)

    # Sample.
    if action_logit is None:
        if greedy:
            action_logit = action_mode
        else:
            action_logit = m.sample()
        n_ent = -m.entropy().mean()
        lprobs = m.log_prob(action_logit)
        action = self.scale_action(action_logit)
        return action, action_logit, lprobs, n_ent
    # Evaluate the action previously taken.
    else:
        n_ent = -m.entropy().mean(dim=1)
        lprobs = m.log_prob(action_logit)
        action = self.scale_action(action_logit)
        return lprobs, n_ent, action
def select_action(self, state):
    if args.action_vec > 0:
        state = (torch.from_numpy(state[0]).float().to(device).unsqueeze(0),
                 torch.from_numpy(state[1]).float().to(device).unsqueeze(0))
    else:
        state = torch.from_numpy(state).float().to(device).unsqueeze(0)
    # TODO: change for vector actions
    with torch.no_grad():
        alpha, beta = self.net(state)[0]
    dist = Beta(alpha, beta)
    action = dist.sample()
    a_logp = dist.log_prob(action).sum(dim=1)

    action = action.squeeze().cpu().numpy()
    a_logp = a_logp.item()
    return action, a_logp
def forward(self, x=None, warmup=1., inf_net=None):
    outputs = {}

    if inf_net is None:
        z, logqz = self.q.sample(x)
    else:
        z, logqz = inf_net.sample(x)

    logpz = self.prior.logprob(z)

    # Decode Image
    x_hat = self.image_decoder(z)
    alpha = torch.sigmoid(x_hat)
    beta = Beta(alpha * self.beta_scale, (1. - alpha) * self.beta_scale)
    # Add uniform dequantization noise and keep x strictly inside (0, 1),
    # where the Beta log-density is finite.
    x_noise = torch.clamp(
        x + torch.FloatTensor(x.shape).uniform_(0., 1. / 256.).cuda(),
        min=1e-5, max=1 - 1e-5)
    logpx = beta.log_prob(x_noise)  # [B,3,112,112]
    B = z.shape[0]
    logpx = torch.sum(logpx.view(B, -1), 1)  # [B]

    log_ws = logpx + logpz - logqz

    outputs['logpx'] = torch.mean(logpx)
    outputs['x_recon'] = alpha
    outputs['welbo'] = torch.mean(logpx + warmup * (logpz - logqz))
    outputs['elbo'] = torch.mean(log_ws)
    outputs['logws'] = log_ws
    outputs['z'] = z
    outputs['logpz'] = torch.mean(logpz)
    outputs['logqz'] = torch.mean(logqz)
    return outputs
class BetaSeparatedPolicy(nn.Module):
    def __init__(self, input_dim, action_dim, hidden_layer=[64, 64]):
        super(BetaSeparatedPolicy, self).__init__()
        actor_layer_size = [input_dim] + hidden_layer
        alpha_feature_layers = nn.ModuleList([])
        beta_feature_layers = nn.ModuleList([])
        for i in range(len(actor_layer_size) - 1):
            alpha_feature_layers.append(nn.Linear(actor_layer_size[i], actor_layer_size[i + 1]))
            alpha_feature_layers.append(nn.ReLU())
            beta_feature_layers.append(nn.Linear(actor_layer_size[i], actor_layer_size[i + 1]))
            beta_feature_layers.append(nn.ReLU())
        self.alpha_body = nn.Sequential(*alpha_feature_layers)
        self.beta_body = nn.Sequential(*beta_feature_layers)
        self.alpha_head = nn.Sequential(nn.Linear(hidden_layer[-1], action_dim), nn.Softplus())
        self.beta_head = nn.Sequential(nn.Linear(hidden_layer[-1], action_dim), nn.Softplus())

        critic_layer_size = [input_dim] + hidden_layer
        critic_layers = nn.ModuleList([])
        for i in range(len(critic_layer_size) - 1):
            critic_layers.append(nn.Linear(critic_layer_size[i], critic_layer_size[i + 1]))
            critic_layers.append(nn.ReLU())
        critic_layers.append(nn.Linear(hidden_layer[-1], 1))
        self.critic = nn.Sequential(*critic_layers)

    def forward(self, x, action=None):
        alpha = self.alpha_head(self.alpha_body(x)) + 1
        beta = self.beta_head(self.beta_body(x)) + 1
        self.dist = Beta(alpha, beta)
        if action is None:
            action = self.dist.sample()
        else:
            action = (action + 1) / 2
        action_log_prob = self.dist.log_prob(action).sum(-1)
        entropy = self.dist.entropy().sum(-1)
        value = self.critic(x)
        return action * 2 - 1, action_log_prob, value.squeeze(-1), entropy
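# A hedged smoke test for BetaSeparatedPolicy as defined above; the dimensions
# are made up and torch/nn are assumed to be imported as in the class.
import torch

policy = BetaSeparatedPolicy(input_dim=8, action_dim=2)
obs = torch.randn(5, 8)
action, logp, value, entropy = policy(obs)
print(action.shape, logp.shape, value.shape, entropy.shape)
# actions land in (-1, 1) because the (0, 1) Beta sample is mapped via a * 2 - 1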
def beta_mle_loss(y_hat, y, reduce=True):
    """y_hat (batch_size x seq_len x 2)
    y (batch_size x seq_len x 1)
    """
    # take the exponential to ensure the parameters are positive
    loc_y = y_hat.exp()
    alpha = loc_y[:, :, 0].unsqueeze(-1)
    beta = loc_y[:, :, 1].unsqueeze(-1)
    dist = Beta(alpha, beta)
    # rescale y to be between 0 and 1
    y = (y + 1.0) / 2.0
    # note that we will get inf loss if y == 0 or 1.0 exactly, so clip it slightly just in case
    y = torch.clamp(y, 1e-5, 0.99999)
    # compute the log prob
    loss = -dist.log_prob(y).squeeze(-1)
    if reduce:
        return loss.mean()
    else:
        return loss
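# A hedged usage sketch for beta_mle_loss as defined above, with dummy shapes:
# the network outputs two unconstrained channels that become (alpha, beta)
# after exp, and the targets live in [-1, 1].
import torch

y_hat = torch.randn(4, 16, 2)     # (batch, seq_len, 2)
y = torch.rand(4, 16, 1) * 2 - 1  # (batch, seq_len, 1), in [-1, 1]
print(beta_mle_loss(y_hat, y))    # scalar negative log-likelihood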
def update(self):
    self.training_step += 1

    s = torch.tensor(self.buffer['s'], dtype=torch.double).to(self.device)
    a = torch.tensor(self.buffer['a'], dtype=torch.double).to(self.device)
    r = torch.tensor(self.buffer['r'], dtype=torch.double).to(self.device).view(-1, 1)
    next_s = torch.tensor(self.buffer['s_'], dtype=torch.double).to(self.device)
    old_a_logp = torch.tensor(self.buffer['a_logp'],
                              dtype=torch.double).to(self.device).view(-1, 1)

    with torch.no_grad():
        target_v = r + GAMMA * self.net(next_s)[1]
        adv = target_v - self.net(s)[1]
        # adv = (adv - adv.mean()) / (adv.std() + 1e-8)

    for _ in range(EPOCH):
        for index in BatchSampler(SubsetRandomSampler(range(MAX_SIZE)), BATCH, False):
            alpha, beta = self.net(s[index])[0]
            dist = Beta(alpha, beta)
            a_logp = dist.log_prob(a[index]).sum(dim=1, keepdim=True)
            ratio = torch.exp(a_logp - old_a_logp[index])

            surr1 = ratio * adv[index]
            # clipped surrogate
            surr2 = torch.clamp(ratio, 1.0 - EPS, 1.0 + EPS) * adv[index]
            action_loss = -torch.min(surr1, surr2).mean()
            value_loss = F.smooth_l1_loss(self.net(s[index])[1], target_v[index])
            loss = action_loss + 2. * value_loss

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
def forward(self, x, warmup=1.):
    B = x.shape[0]
    outputs = {}

    z, logqz = self.encoder.sample(x)
    logpz = self.prior.logprob(z)

    x_hat = self.generator.decode(x, z)
    alpha = torch.sigmoid(x_hat)
    beta = Beta(alpha * self.beta_scale, (1. - alpha) * self.beta_scale)
    logpxz = beta.log_prob(x)  # [B,3,112,112]
    logpxz = torch.sum(logpxz.view(B, -1), 1)  # [B]

    elbo = logpxz + logpz - logqz
    welbo = logpxz + warmup * (logpz - logqz)

    outputs['logpxz'] = torch.mean(logpxz)
    outputs['logqz'] = torch.mean(logqz)
    outputs['logpz'] = torch.mean(logpz)
    outputs['elbo'] = torch.mean(elbo)
    outputs['welbo'] = torch.mean(welbo)
    outputs['x_hat'] = alpha
    return outputs
class MyDist(ActionDistribution):
    @staticmethod
    def required_model_output_shape(action_space, model_config):
        return 6

    def __init__(self, inputs, model):
        super(MyDist, self).__init__(inputs, model)
        self.dist = Beta(inputs[:, :3], inputs[:, 3:])

    def sample(self):
        self.sampled_action = self.dist.sample()
        return self.sampled_action

    def deterministic_sample(self):
        return self.dist.mean

    def sampled_action_logp(self):
        return self.logp(self.sampled_action)

    def logp(self, actions):
        return self.dist.log_prob(actions).sum(-1)

    # referenced from https://github.com/pytorch/pytorch/blob/master/torch/distributions/kl.py
    def kl(self, other):
        p, q = self.dist, other.dist
        sum_params_p = p.concentration1 + p.concentration0
        sum_params_q = q.concentration1 + q.concentration0
        t1 = q.concentration1.lgamma() + q.concentration0.lgamma() + sum_params_p.lgamma()
        t2 = p.concentration1.lgamma() + p.concentration0.lgamma() + sum_params_q.lgamma()
        t3 = (p.concentration1 - q.concentration1) * torch.digamma(p.concentration1)
        t4 = (p.concentration0 - q.concentration0) * torch.digamma(p.concentration0)
        t5 = (sum_params_q - sum_params_p) * torch.digamma(sum_params_p)
        return (t1 - t2 + t3 + t4 + t5).sum(-1)

    def entropy(self):
        return self.dist.entropy().sum(-1)
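# A standalone check of the closed-form Beta KL used above against PyTorch's
# own kl_divergence registration; the parameters are arbitrary.
import torch
from torch.distributions import Beta, kl_divergence

p = Beta(torch.tensor([2.0, 3.0]), torch.tensor([4.0, 1.5]))
q = Beta(torch.tensor([1.0, 2.0]), torch.tensor([2.0, 2.0]))
sum_p = p.concentration1 + p.concentration0
sum_q = q.concentration1 + q.concentration0
t1 = q.concentration1.lgamma() + q.concentration0.lgamma() + sum_p.lgamma()
t2 = p.concentration1.lgamma() + p.concentration0.lgamma() + sum_q.lgamma()
t3 = (p.concentration1 - q.concentration1) * torch.digamma(p.concentration1)
t4 = (p.concentration0 - q.concentration0) * torch.digamma(p.concentration0)
t5 = (sum_q - sum_p) * torch.digamma(sum_p)
print(t1 - t2 + t3 + t4 + t5)
print(kl_divergence(p, q))  # should match elementwise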
def f(self, x, z, logits, hard=False):
    B = x.shape[0]

    # image likelihood given b
    x_hat = self.generator.forward(z)
    alpha = torch.sigmoid(x_hat)
    beta = Beta(alpha * self.beta_scale, (1. - alpha) * self.beta_scale)
    x_noise = torch.clamp(
        x + torch.FloatTensor(x.shape).uniform_(0., 1. / 256.).cuda(),
        min=1e-5, max=1 - 1e-5)
    logpx = beta.log_prob(x_noise)  # [B,3,112,112], with uniform dequantization noise
    logpx = torch.sum(logpx.view(B, -1), 1)  # [B]

    # the prior is constant, I think; for q(b|x), we just want to increase its entropy
    if hard:
        dist = Bernoulli(logits=logits)
    else:
        dist = RelaxedBernoulli(torch.Tensor([1.]).cuda(), logits=logits)
    logqb = dist.log_prob(z.detach())
    logqb = torch.sum(logqb, 1)

    return logpx, logqb, alpha
def forward(self, x=None, q=None, warmup=1., generate=False, inf_type=1, dec_type=0):
    # x: [B,3,112,112]
    # q: [B,L]
    # inf type: 0 is both, 1 is only x, 2 is only y
    # dec type: 0 is both, 1 is only x, 2 is only y
    outputs = {}

    if inf_type in [0, 2] or dec_type in [0, 2]:
        embed = self.encoder_embed(q)

    if inf_type == 0:
        x_enc = self.image_encoder(x)
        y_enc = self.encode_attributes(embed)
        mu, logvar = self.inference_net(x_enc, y_enc)
        z, logpz, logqz = self.sample(mu, logvar)
    elif inf_type == 1:
        x_enc = self.image_encoder2(x)
        mu, logvar = self.inference_net_x(x_enc)
        z, logpz, logqz = self.sample(mu, logvar)
    elif inf_type == 2:
        y_enc = self.encode_attributes2(embed)
        mu, logvar = self.inference_net_y(y_enc)
        if self.flow_int:
            z, logpz, logqz = self.flow.sample(mu, logvar)
        else:
            z, logpz, logqz = self.sample(mu, logvar)

    # want to minimize this, since the prior prediction is 0
    pred = self.discrim.predict(z).mean()

    z_dec = self.z_to_enc(z)
    B = z_dec.shape[0]

    if dec_type == 0:
        # Decode Image
        x_hat = self.image_decoder(z_dec)
        alpha = torch.sigmoid(x_hat)
        beta = Beta(alpha * self.beta_scale, (1. - alpha) * self.beta_scale)
        logpx = beta.log_prob(x)  # [B,3,112,112]
        logpx = torch.sum(logpx.view(B, -1), 1)  # [B]

        word_preds, logpy = self.text_generator.teacher_force(z_dec, embed, q)

        logpx = logpx * self.w_logpx
        logpy = logpy * self.w_logpy

        # CE of q(z|y)
        if inf_type == 1:
            embed = self.encoder_embed(q)
            y_enc = self.encode_attributes2(embed)
            mu_y, logvar_y = self.inference_net_y(y_enc)
            if self.flow_int:
                logqzy = self.flow.logprob(z.detach(), mu_y, logvar_y)
            else:
                logqzy = lognormal(z, mu_y, logvar_y)
            logqzy = logqzy * self.w_logqy

        log_ws = logpx + logpy + logpz - logqz  # + logqzy
        elbo = torch.mean(log_ws)
        warmed_elbo = torch.mean(logpx + logpy + logqzy + warmup * pred)

        outputs['logpx'] = torch.mean(logpx)
        outputs['x_recon'] = alpha
        outputs['logpy'] = torch.mean(logpy)
        outputs['logqzy'] = torch.mean(logqzy)

    elif dec_type == 1:
        # Decode Image
        x_hat = self.image_decoder(z_dec)
        alpha = torch.sigmoid(x_hat)
        beta = Beta(alpha * self.beta_scale, (1. - alpha) * self.beta_scale)
        logpx = beta.log_prob(x)  # [B,3,112,112]
        logpx = torch.sum(logpx.view(B, -1), 1)  # [B]
        logpx = logpx * self.w_logpx

        log_ws = logpx + logpz - logqz
        elbo = torch.mean(log_ws)
        warmed_elbo = torch.mean(logpx + warmup * (logpz - logqz))

        outputs['logpx'] = torch.mean(logpx)
        outputs['x_recon'] = alpha

    elif dec_type == 2:
        # Decode Text
        word_preds, logpy = self.text_generator.teacher_force(z_dec, embed, q)
        logpy = logpy * self.w_logpy

        log_ws = logpy + logpz - logqz
        elbo = torch.mean(log_ws)
        warmed_elbo = torch.mean(logpy + warmup * (logpz - logqz))

        outputs['logpy'] = torch.mean(logpy)

    outputs['welbo'] = warmed_elbo
    outputs['elbo'] = elbo
    outputs['logws'] = log_ws
    outputs['z'] = z
    outputs['logpz'] = torch.mean(logpz)
    outputs['logqz'] = torch.mean(logqz)
    outputs['logvar'] = logvar

    if generate:
        word_preds, sampled_words = self.text_generator.teacher_force(
            z_dec, generate=generate, embeder=self.encoder_embed)
        if dec_type == 2:
            alpha = torch.sigmoid(self.image_decoder(z_dec))
        return outputs, alpha, word_preds, sampled_words

    return outputs
def run(self):
    updatestep = 0
    update = 0
    i_episode = 0
    while update < 100000:
        self.lr = args.lr - (args.lr * (i_episode / float(10000)))
        i_episode = i_episode + 1
        observation = self.env.reset()
        step = 0
        observes_list, rewards, actions, values, old_log = [], [], [], [], []

        if updatestep > 2048:
            update = update + 1
            updatestep = 0
            if args.usegae:
                self.add_gae(self.trajectories, self.gamma, self.lam)
            else:
                self.add_no_gae(self.trajectories, self.gamma)
            s, a, adv, old_a_logp, target_v, totalsize = self.gettraindata()
            minibatch = max(totalsize // args.numminibatch, 1)

            for _ in range(self.PPOepoch):
                for index in BatchSampler(SubsetRandomSampler(range(totalsize)),
                                          minibatch, False):
                    alpha, beta = self.net(s[index])[0]
                    dist = Beta(alpha, beta)
                    a_logp = dist.log_prob(a[index]).sum(dim=1)
                    ratio = torch.exp(a_logp - old_a_logp[index])
                    # The entropy bonus must keep its gradient to have any
                    # effect; the original computed it under torch.no_grad(),
                    # which turns the bonus into a constant.
                    entrop = dist.entropy()

                    surr1 = ratio * adv[index]
                    surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                        1.0 + self.clip_param) * adv[index]
                    action_loss = -torch.min(surr1, surr2).mean()
                    value_loss = F.mse_loss(self.net(s[index])[1], target_v[index])
                    self.storeloss(action_loss, value_loss)
                    loss = action_loss + 0.5 * value_loss - 0.01 * entrop.mean()

                    self.optimizer.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(self.net.parameters(), args.maxgradnorm)
                    self.optimizer.step()
            self.trajectories = []

        while 1:
            step = step + 1
            updatestep = updatestep + 1
            # self.env.render()
            observes = observation.astype(np.float32).reshape((1, -1))
            input = torch.tensor(observes,
                                 dtype=torch.double).to(device).reshape(-1, self.inputsize)
            (alpha, beta), v = self.net(input)
            dist = Beta(alpha, beta)
            action = dist.sample()
            a_logp = dist.log_prob(action.view(-1, 6)).sum(dim=1)
            a_logp = a_logp.item()

            old_log.append(a_logp)
            values.append(v.item())
            observes_list.append(observes)
            actions.append(action)

            action = action.squeeze().cpu().numpy()
            observation, reward, done, info = self.env.step(action * 2 - 1)
            rewards.append(reward)
            if done:
                print("Episode finished after {} timesteps, rewards is {}".format(
                    step, sum(rewards)))
                self.storereward(format(step))
                trajectory = {
                    'observes': np.concatenate([t for t in observes_list]),
                    'actions': np.concatenate([t.to('cpu') for t in actions]),
                    'rewards': np.array(rewards),
                    'values': np.array(values),
                    'old_log': np.array(old_log)
                }
                self.trajectories.append(trajectory)
                break
def run(self):
    sumoBinary = checkBinary('sumo')
    traci.start([sumoBinary, "-c", "roadfile/cross.sumocfg"])
    listpic = []
    self.loop = 0
    self.dict1 = {}
    self.periodtime = []
    self.currentstate = 0
    self.time_click = 0
    self.out_record = None
    self.max_grad_norm = 0.5
    self.wtime = []
    self.tflightime = np.array([30, 30, 30, 30])

    while traci.simulation.getMinExpectedNumber() > 0:
        traci.simulationStep()
        # compare phases by value; the original used `is not`, which tests
        # object identity
        if self.currentstate != traci.trafficlight.getPhase("0"):
            self.time_click = 0
        self.time_click = self.time_click + 1
        phase_index = int(traci.trafficlight.getPhase("0") / 2)

        list_car = traci.vehicle.getIDList()
        for k in list_car:
            traci.vehicle.setLaneChangeMode(k, 0b001000000000)

        vehiclein_l = traci.simulation.getDepartedIDList()
        if vehiclein_l:
            for i in vehiclein_l:
                self.dict1[i] = self.step
        vehicleout_l = traci.simulation.getArrivedIDList()
        if vehicleout_l:
            for i in vehicleout_l:
                self.periodtime.append(self.step - self.dict1[i])
                self.wtime.append(self.step - self.dict1[i])
                self.dict1.pop(i)

        if ((self.step - int(self.step / 2000) * 2000) % 5 == 0
                and int((self.step - int(self.step / 2000) * 2000) / 5) < 4):
            listpic.append(self.getstate(list_car))

        if self.step % 1000 == 999:
            if int(self.step / 1000) % 2 == 0:
                if self.wtime:
                    self.writetime(np.array(self.wtime).mean())
                self.wtime = []

        if self.step % 2000 == 15:
            print(self.loop)
            self.loop = self.loop + 1
            if len(listpic) != 4:
                break
            pict = np.array(listpic)
            input_d = torch.tensor(pict, dtype=torch.double).to(device)
            input_d = input_d.reshape(1, 4, 28, 28)
            listpic = []
            with torch.no_grad():
                alpha, beta = self.net(input_d)[0]
                dist = Beta(alpha, beta)
                action = dist.sample()
                a_logp = dist.log_prob(action.view(-1, 4)).sum(dim=1)
            action = action.squeeze().cpu().numpy()
            a_logp = a_logp.item()
            self.tflightime = np.array(action * 60)
            self.writeac(self.tflightime.tolist())

            reward = 0
            if self.periodtime:
                reward = (-0.9 * np.array(self.periodtime).mean()
                          - 0.1 * np.array(self.periodtime).max())
                reward = (reward + 150) / 50
                self.periodtime = []

            ifupdata = None
            if self.out_record is not None:
                ifupdata = self.memory.store(
                    (self.out_record[0], self.out_record[1], reward, pict,
                     self.out_record[2], self.out_record[3]))
            self.out_record = [pict, action, a_logp, self.net(input_d)[1].item()]
            if ifupdata is True:
                print('train')
                self.trainmodel()

        self.currentstate = traci.trafficlight.getPhase("0")
        if self.time_click >= self.tflightime[phase_index]:
            traci.trafficlight.setPhase("0", (self.currentstate + 1) % 8)
        self.step += 1

    traci.close()
    sys.stdout.flush()