Example #1
 def test_beta_shape_scalar_params(self):
     dist = Beta(0.1, 0.1)
     self.assertEqual(dist._batch_shape, torch.Size())
     self.assertEqual(dist._event_shape, torch.Size())
     self.assertEqual(dist.sample().size(), torch.Size((1, )))
     self.assertEqual(dist.sample((3, 2)).size(), torch.Size((3, 2)))
     self.assertRaises(ValueError, dist.log_prob, self.scalar_sample)
     self.assertEqual(
         dist.log_prob(self.tensor_sample_1).size(), torch.Size((3, 2)))
     self.assertEqual(
         dist.log_prob(self.tensor_sample_2).size(), torch.Size((3, 2, 3)))
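For reference, the shape conventions asserted above follow torch.distributions: batch_shape comes from broadcasting the concentration parameters, event_shape is empty for the univariate Beta, and sample(shape) prepends shape. A minimal standalone sketch with tensor parameters:

import torch
from torch.distributions import Beta

# Minimal sketch of the shape rules the test above relies on.
d = Beta(torch.tensor([0.5, 2.0]), torch.tensor([0.5, 2.0]))
print(d.batch_shape, d.event_shape)        # torch.Size([2]) torch.Size([])
print(d.sample().shape)                    # torch.Size([2])
print(d.sample((3,)).shape)                # torch.Size([3, 2])
print(d.log_prob(torch.rand(4, 2)).shape)  # torch.Size([4, 2])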
Example #2
    def update(self, epochs, steps, total_obs, total_actions, advantage,
               real_values):

        total_obs_ = torch.from_numpy(total_obs).type(torch.FloatTensor)
        advantage_ = torch.from_numpy(advantage).type(torch.FloatTensor)
        real_values_ = torch.from_numpy(real_values).type(torch.FloatTensor)
        total_actions = torch.from_numpy(total_actions).type(torch.FloatTensor)

        for _ in range(epochs):
            inds = np.arange(steps)
            np.random.shuffle(inds)

            for t in range(steps):
                index = inds[t]

                alpha, beta, values_to_backprop = self.network(
                    total_obs_[index].unsqueeze(0))

                m = Beta(alpha, beta)
                action_taken_prob = m.log_prob(total_actions[index]).sum(
                    dim=1, keepdim=True)

                entropy = m.entropy()
                entropy = entropy.sum(dim=1)
                print(entropy)

                alpha, beta, _ = self.old_network(
                    total_obs_[index].unsqueeze(0))
                m_old = Beta(alpha, beta)
                old_action_taken_probs = m_old.log_prob(
                    total_actions[index]).sum(dim=1, keepdim=True)

                # both terms are log-probabilities, so the PPO ratio is exp(new - old)
                ratios = torch.exp(action_taken_prob - old_action_taken_probs)

                surr1 = ratios * advantage_[index]
                surr2 = torch.clamp(ratios, min=(1. - .1),
                                    max=(1. + .1)) * advantage_[index]
                policy_loss = -torch.min(surr1, surr2)
                value_loss = ((values_to_backprop - real_values_[index])**2)
                #value_loss = F.smooth_l1_loss(values_to_backprop, real_values_[index])
                total_loss = policy_loss + value_loss - 0.01 * entropy
                print(total_loss)
                self.optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.network.parameters(), 0.5)
                self.optimizer.step()

        self.old_network.load_state_dict(self.dic_placeholder)
        self.dic_placeholder = self.network.state_dict()
        return (value_loss)
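Since these PPO examples work with log-probabilities, the probability ratio pi_new / pi_old is obtained as an exponentiated difference of log-probs. A small standalone check of that identity:

import torch
from torch.distributions import Beta

# exp(log p_new - log p_old) equals the ratio p_new / p_old, which is why the
# examples compute the PPO ratio from summed log-probabilities.
a = torch.tensor(0.3)
new, old = Beta(2.0, 3.0), Beta(1.5, 3.5)
lp_new, lp_old = new.log_prob(a), old.log_prob(a)
print(torch.allclose(torch.exp(lp_new - lp_old),
                     torch.exp(lp_new) / torch.exp(lp_old)))  # True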
Example #3
    def forward(self,
                x=None,
                warmup=1.,
                inf_net=None):  #, k=1): #, marginf_type=0):
        # x: [B,3,112,112]
        # q: [B,L]
        # inf type: 0 is both, 1 is only x, 2 is only y
        # dec type: 0 is both, 1 is only x, 2 is only y

        outputs = {}

        if inf_net is None:
            mu, logvar = self.inference_net(x)
        else:
            mu, logvar = inf_net.inference_net(x)

        z, logpz, logqz = self.sample(mu, logvar)

        z_dec = self.z_to_dec(z)

        B = z_dec.shape[0]

        # Decode Image
        x_hat = self.image_decoder(z_dec)
        alpha = torch.sigmoid(x_hat)

        beta = Beta(alpha * self.beta_scale, (1. - alpha) * self.beta_scale)
        x_noise = torch.clamp(
            x + torch.FloatTensor(x.shape).uniform_(0., 1. / 256.).cuda(),
            min=1e-5,
            max=1 - 1e-5)
        # logpx = beta.log_prob(x + torch.FloatTensor(x.shape).uniform_(0., 1./256.).cuda()) #[120,3,112,112]  # add uniform noise here
        logpx = beta.log_prob(
            x_noise)  #[120,3,112,112]  # add uniform noise here

        logpx = torch.sum(logpx.view(B, -1), 1)  # [PB]  * self.w_logpx
        # logpx = logpx * self.w_logpx

        log_ws = logpx + logpz - logqz

        outputs['logpx'] = torch.mean(logpx)
        outputs['x_recon'] = alpha
        outputs['welbo'] = torch.mean(logpx + warmup * (logpz - logqz))
        outputs['elbo'] = torch.mean(log_ws)
        outputs['logws'] = log_ws
        outputs['z'] = z
        outputs['logpz'] = torch.mean(logpz)
        outputs['logqz'] = torch.mean(logqz)
        outputs['logvar'] = logvar

        # print (outputs['elbo'], outputs['welbo'], outputs['logpz'], outputs['logqz'])
        # fafs

        # if generate:
        #     # word_preds, sampled_words = self.text_generator.teacher_force(z_dec, generate=generate, embeder=self.encoder_embed)
        #     # if dec_type == 2:
        #     alpha = torch.sigmoid(self.image_decoder(z_dec))
        #     return outputs, alpha #, word_preds, sampled_words

        return outputs
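A note on the likelihood parameterization used in this forward pass: with concentrations alpha * self.beta_scale and (1 - alpha) * self.beta_scale, the mean of the Beta is exactly alpha, which is why alpha also serves as the reconstruction in outputs['x_recon']. A small hedged check, where s stands in for self.beta_scale and is assumed positive:

import torch
from torch.distributions import Beta

# mean of Beta(a, b) is a / (a + b); with a = alpha * s and b = (1 - alpha) * s
# this simplifies to alpha, independent of the scale s.
alpha = torch.rand(5).clamp(1e-3, 1 - 1e-3)
s = 100.0  # stand-in for self.beta_scale
d = Beta(alpha * s, (1.0 - alpha) * s)
print(torch.allclose(d.mean, alpha))  # True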
Example #4
    def chooseActionTrain(self, state):
        """ Choose an action during training mode
        
            Parameters
            -------
            state:
                The current state of the car.

            Returns
            -------
            action : np.ndarray
                The actions to run on the track
            coefficient : float
                The logarithmic probability for an action

            Notes
            -------
                This function is only called when the --train flag IS provided.
        """
        state = torch.from_numpy(state).double().to(
            self.hardwareDevice).unsqueeze(0)
        with torch.no_grad():
            alpha, beta = self.nn(state)[0]
        dist = Beta(alpha, beta)
        action = dist.sample()
        coefficient = dist.log_prob(action).sum(dim=1)

        action = action.squeeze().cpu().numpy()
        coefficient = coefficient.item()

        return action, coefficient
    def update(self):
        self.training_step += 1

        s = torch.tensor(self.buffer['s'], dtype=torch.double).to(device)
        a = torch.tensor(self.buffer['a'], dtype=torch.double).to(device)
        r = torch.tensor(self.buffer['r'], dtype=torch.double).to(device).view(-1, 1)
        s_ = torch.tensor(self.buffer['s_'], dtype=torch.double).to(device)
        old_a_logp = torch.tensor(self.buffer['a_logp'], dtype=torch.double).to(device).view(-1, 1)

        with torch.no_grad():
            target_v = r + args.gamma * self.net(s_, actual_obs=False)[1]
            adv = target_v - self.net(s, actual_obs=False)[1]
            # adv = (adv - adv.mean()) / (adv.std() + 1e-8)

        for _ in range(self.ppo_epoch):
            for index in BatchSampler(SubsetRandomSampler(range(self.buffer_capacity)), self.batch_size, False):

                alpha, beta = self.net(s[index], actual_obs=False)[0]
                dist = Beta(alpha, beta)
                a_logp = dist.log_prob(a[index]).sum(dim=1, keepdim=True)
                ratio = torch.exp(a_logp - old_a_logp[index])

                surr1 = ratio * adv[index]
                surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * adv[index]
                action_loss = -torch.min(surr1, surr2).mean()
                value_loss = F.smooth_l1_loss(self.net(s[index], actual_obs=False)[1], target_v[index])
                loss = action_loss + 2. * value_loss

                self.optimizer.zero_grad()
                loss.backward()
                # nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm)
                self.optimizer.step()
Example #6
    def forward(self,
                observation,
                reparameterize=True,
                deterministic=False,
                return_log_prob=False):
        """
        Forward pass.
        Assumes input is a torch tensor.

        :type observation: torch.Tensor
        """
        layer_input = observation
        for fc in self.fcs:
            layer_input = self.hidden_activation(fc(layer_input))
        network_output = self.output_activation(self.last_fc(layer_input))

        alpha = network_output[:, 0].unsqueeze(1) + EPSILON
        beta = network_output[:, 1].unsqueeze(1) + EPSILON
        distribution = Beta(alpha, beta)
        distribution_mean = distribution.mean
        # deterministic -> return the mean; otherwise draw a reparameterized sample
        if deterministic:
            sample = distribution_mean
        else:
            sample = distribution.rsample()
        # transform to range (min, max)
        action = self.min + self.max_min_difference * sample
        mean = self.min + self.max_min_difference * distribution_mean
        variance = self.max_min_difference_squared * distribution.variance
        std = torch.sqrt(variance)
        log_std = torch.log(std)
        log_prob = distribution.log_prob(sample)
        entropy = distribution.entropy()
        mean_action_log_prob = None
        pre_tanh_value = None
        return action, mean, log_std, log_prob, entropy, std, mean_action_log_prob, pre_tanh_value
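A hedged aside on the affine rescaling in this policy: the returned log_prob is the density of the raw Beta sample in (0, 1), not of the rescaled action. If a density on the action scale were wanted, torch.distributions can attach the affine Jacobian automatically; the sketch below is illustrative only and not part of the class:

import torch
from torch.distributions import Beta, TransformedDistribution
from torch.distributions.transforms import AffineTransform

# Illustrative only: an affine transform of a Beta whose log_prob includes
# the log |d action / d sample| correction.
base = Beta(torch.tensor([2.0]), torch.tensor([3.0]))
low, high = -1.0, 1.0
action_dist = TransformedDistribution(
    base, [AffineTransform(loc=low, scale=high - low)])
a = action_dist.sample()
print(a, action_dist.log_prob(a))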
Example #7
    def update(self):
        self.training_step += 1

        s = torch.tensor(self.buffer['s'], dtype=torch.double)
        a = torch.tensor(self.buffer['a'], dtype=torch.double)
        r = torch.tensor(self.buffer['r'], dtype=torch.double).view(-1, 1)
        s_ = torch.tensor(self.buffer['s_'], dtype=torch.double)

        old_a_logp = torch.tensor(self.buffer['a_logp'],
                                  dtype=torch.double).view(-1, 1)

        with torch.no_grad():
            target_v = r + self.gamma * self.net(s_)[1]
            adv = target_v - self.net(s)[1]

        for _ in range(self.ppo_epoch):
            for index in BatchSampler(
                    SubsetRandomSampler(range(self.buffer_capacity)),
                    self.batch_size, False):

                alpha, beta = self.net(s[index])[0]
                dist = Beta(alpha, beta)
                a_logp = dist.log_prob(a[index]).sum(dim=1, keepdim=True)
                ratio = torch.exp(a_logp - old_a_logp[index])

                surr1 = ratio * adv[index]
                surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                    1.0 + self.clip_param) * adv[index]
                action_loss = -torch.min(surr1, surr2).mean()
                value_loss = F.smooth_l1_loss(
                    self.net(s[index])[1], target_v[index])
                loss = action_loss + 2. * value_loss

                self.optimizer.zero_grad()
                loss.backward()

                # intuition says to do this step differently
                # i.e. compute loss using minibatches and take multiple SGD steps

                # new insight: the shape of the objective function is what keeps
                # the parameters theta from moving into a region where the ratio > 1 + epsilon:
                # because the norm of the gradient near the 'ceiling' approaches 0, we don't move far into that territory
                # this works with multiple SGD steps, but it is unclear how a single step of grad * lr behaves

                # in an update, theta_k is constant so we are always moving in the same space
                # what happens if we move with too big of a gradient?
                # then the grad = 0, and we have finished early

                # epsilon is relevant for each individual action, so if it's not yet there,
                # each action takes a gradient step closer to the ceiling

                # ppo just limits the adjustments of each action under the policy (given state)
                # objective must be maxed for each action

                # when adjusting theta for another transition, a different ratio can be > epsilon
                # this is fine, as long as the optimizer does not act greedily w.r.t this

                self.optimizer.step()
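The comment block above reasons about why clipping keeps the ratio near 1: once the ratio crosses the clip boundary in the direction that would improve the surrogate, the clipped term is the one selected by torch.min and it contributes zero gradient to the ratio. A small illustration, assuming a positive advantage:

import torch

# With a positive advantage and a ratio already above 1 + eps, the clipped
# surrogate wins the min and contributes zero gradient to the ratio.
eps, adv = 0.1, torch.tensor(1.0)
ratio = torch.tensor(1.25, requires_grad=True)
surr1 = ratio * adv
surr2 = torch.clamp(ratio, 1.0 - eps, 1.0 + eps) * adv
(-torch.min(surr1, surr2)).backward()
print(ratio.grad)  # tensor(0.) -- no incentive to push the ratio further out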
Example #8
 def test_beta_log_prob(self):
     for _ in range(100):
         alpha = np.exp(np.random.normal())
         beta = np.exp(np.random.normal())
         dist = Beta(alpha, beta)
         x = dist.sample()
         actual_log_prob = dist.log_prob(x).sum()
         expected_log_prob = scipy.stats.beta.logpdf(x, alpha, beta)[0]
         self.assertAlmostEqual(actual_log_prob, expected_log_prob, places=3, allow_inf=True)
Example #9
 def test_beta_shape_tensor_params(self):
     dist = Beta(torch.Tensor([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]),
                 torch.Tensor([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]))
     self.assertEqual(dist._batch_shape, torch.Size((3, 2)))
     self.assertEqual(dist._event_shape, torch.Size(()))
     self.assertEqual(dist.sample().size(), torch.Size((3, 2)))
     self.assertEqual(dist.sample((3, 2)).size(), torch.Size((3, 2, 3, 2)))
     self.assertEqual(dist.log_prob(self.tensor_sample_1).size(), torch.Size((3, 2)))
     self.assertRaises(ValueError, dist.log_prob, self.tensor_sample_2)
Example #10
    def forward(self, x=None, warmup=1., inf_net=None): #, k=1): #, marginf_type=0):
        # x: [B,3,112,112]
        # q: [B,L] 
        # inf type: 0 is both, 1 is only x, 2 is only y
        # dec type: 0 is both, 1 is only x, 2 is only y

        outputs = {}

        if inf_net is None:
            mu, logvar = self.inference_net(x)
        else:
            mu, logvar = inf_net.inference_net(x)



        z, logpz, logqz = self.sample(mu, logvar) 

        z_dec = self.z_to_dec(z)

        B = z_dec.shape[0]

        # Decode Image
        x_hat = self.image_decoder(z_dec)
        alpha = torch.sigmoid(x_hat)

        beta = Beta(alpha*self.beta_scale, (1.-alpha)*self.beta_scale)
        x_noise = torch.clamp(x + torch.FloatTensor(x.shape).uniform_(0., 1./256.).cuda(), min=1e-5, max=1-1e-5)
        # logpx = beta.log_prob(x + torch.FloatTensor(x.shape).uniform_(0., 1./256.).cuda()) #[120,3,112,112]  # add uniform noise here
        logpx = beta.log_prob(x_noise) #[120,3,112,112]  # add uniform noise here

        logpx = torch.sum(logpx.view(B, -1),1) # [PB]  * self.w_logpx
        # logpx = logpx * self.w_logpx

        log_ws = logpx + logpz - logqz

        outputs['logpx'] = torch.mean(logpx)
        outputs['x_recon'] = alpha
        outputs['welbo'] = torch.mean(logpx + warmup*( logpz - logqz))
        outputs['elbo'] = torch.mean(log_ws)
        outputs['logws'] = log_ws
        outputs['z'] = z
        outputs['logpz'] = torch.mean(logpz)
        outputs['logqz'] = torch.mean(logqz)
        outputs['logvar'] = logvar

        # print (outputs['elbo'], outputs['welbo'], outputs['logpz'], outputs['logqz'])
        # fafs


        # if generate:
        #     # word_preds, sampled_words = self.text_generator.teacher_force(z_dec, generate=generate, embeder=self.encoder_embed)
        #     # if dec_type == 2:
        #     alpha = torch.sigmoid(self.image_decoder(z_dec))
        #     return outputs, alpha #, word_preds, sampled_words

        return outputs
Example #11
    def forward(self, x=None, warmup=1., inf_net=None): #, k=1): #, marginf_type=0):

        outputs = {}
        B = x.shape[0]

        if inf_net is None:
            # mu, logvar = self.inference_net(x)
            z, logits = self.q.sample(x) 
        else:
            # mu, logvar = inf_net.inference_net(x)   
            z, logqz = inf_net.sample(x) 

        # print (z[0])
        # b = harden(z)
        # print (b[0])
        
        # logpz = torch.sum( self.prior.log_prob(b), dim=1)

        # print (logpz[0])
        # print (logpz.shape)
        # fdasf

        probs_q = torch.sigmoid(logits)
        probs_q = torch.clamp(probs_q, min=.00000001, max=.9999999)
        probs_p = torch.ones(B, self.z_size).cuda() *.5
        KL = probs_q*torch.log(probs_q/probs_p) + (1-probs_q)*torch.log((1-probs_q)/(1-probs_p))
        KL = torch.sum(KL, dim=1)

        # print (z.shape)
        # Decode Image
        x_hat = self.generator.forward(z)
        alpha = torch.sigmoid(x_hat)
        beta = Beta(alpha*self.beta_scale, (1.-alpha)*self.beta_scale)
        x_noise = torch.clamp(x + torch.FloatTensor(x.shape).uniform_(0., 1./256.).cuda(), min=1e-5, max=1-1e-5)
        logpx = beta.log_prob(x_noise) #[120,3,112,112]  # add uniform noise here

        logpx = torch.sum(logpx.view(B, -1),1) # [PB]  * self.w_logpx

        # print (logpx.shape,logpz.shape,logqz.shape)
        # fsdfda

        log_ws = logpx - KL #+ logpz - logqz

        outputs['logpx'] = torch.mean(logpx)
        outputs['x_recon'] = alpha
        # outputs['welbo'] = torch.mean(logpx + warmup*( logpz - logqz))
        outputs['welbo'] = torch.mean(logpx + warmup*(KL))
        outputs['elbo'] = torch.mean(log_ws)
        outputs['logws'] = log_ws
        outputs['z'] = z
        outputs['logpz'] = torch.zeros(1) #torch.mean(logpz)
        outputs['logqz'] = torch.mean(KL)
        # outputs['logvar'] = logvar

        return outputs
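The analytic KL term in this forward pass, probs_q * log(probs_q / probs_p) + (1 - probs_q) * log((1 - probs_q) / (1 - probs_p)) with probs_p = 0.5, is the per-dimension KL between the Bernoulli posterior and a uniform Bernoulli prior. A small check against PyTorch's registered KL:

import torch
from torch.distributions import Bernoulli
from torch.distributions.kl import kl_divergence

# Per-dimension Bernoulli KL written out by hand vs. torch's kl_divergence.
q = torch.rand(6).clamp(1e-6, 1 - 1e-6)
p = torch.full_like(q, 0.5)
manual = q * torch.log(q / p) + (1 - q) * torch.log((1 - q) / (1 - p))
print(torch.allclose(manual, kl_divergence(Bernoulli(q), Bernoulli(p))))  # True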
Example #12
    def trainmodel(self):

        s = torch.tensor(self.memory.buffer['s'],
                         dtype=torch.double).to(device)
        a = torch.tensor(self.memory.buffer['a'],
                         dtype=torch.double).to(device)
        #r = torch.tensor(self.memory.buffer['r'], dtype=torch.double).to(device).view(-1, 1)
        s_ = torch.tensor(self.memory.buffer['s_'],
                          dtype=torch.double).to(device)
        #v = torch.tensor(self.memory.buffer['v'], dtype=torch.double).to(device).view(-1, 1)
        input = s_[-1].view(1, 4, 28, 28)
        future_value = self.net(input)[1].item()
        adv, target_v = self.getgae(future_value)

        adv = torch.tensor(np.array(adv),
                           dtype=torch.double).to(device).view(-1, 1)
        target_v = torch.tensor(target_v,
                                dtype=torch.double).to(device).view(-1, 1)
        adv = (adv - adv.mean()) / (adv.std() + 1e-5)
        old_a_logp = torch.tensor(self.memory.buffer['a_logp'],
                                  dtype=torch.double).to(device).view(-1, 1)

        for _ in range(self.PPOepoch):
            for index in BatchSampler(
                    SubsetRandomSampler(range(self.memory.buffer_capacity)),
                    self.memory.batch_size, False):

                alpha, beta = self.net(s[index])[0]
                dist = Beta(alpha, beta)
                a_logp = dist.log_prob(a[index]).sum(dim=1)
                a_logp = a_logp.reshape(-1, 1)
                ratio = torch.exp(a_logp - old_a_logp[index])
                with torch.no_grad():
                    entrop = dist.entropy()

                surr1 = ratio * adv[index]
                surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                    1.0 + self.clip_param) * adv[index]
                action_loss = -torch.min(surr1, surr2).mean()
                value_loss = F.smooth_l1_loss(
                    self.net(s[index])[1], target_v[index])
                self.storeloss(action_loss, value_loss)
                action_loss = torch.clamp(action_loss, 0, 10)
                value_loss = torch.clamp(value_loss, 0, 10)
                loss = action_loss + 2. * value_loss - args.bound * entrop.mean(
                )

                self.optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(self.net.parameters(),
                                         self.max_grad_norm)
                self.optimizer.step()

        torch.save(self.net.state_dict(), self.path_t7)
Example #13
    def select_action(self, state):
        state = torch.from_numpy(state).double().to(device).unsqueeze(0)
        with torch.no_grad():
            alpha, beta = self.net(state)[0]
        dist = Beta(alpha, beta)
        action = dist.sample()  # 3 values in [0,1]
        a_logp = dist.log_prob(action).sum(dim=1)  # For PPO
        action = action.squeeze().cpu().numpy()
        a_logp = a_logp.item()

        return action, a_logp
    def select_action(self, state):
        state = torch.from_numpy(state).double().to(device).unsqueeze(0)
        with torch.no_grad():
            (alpha, beta), _, rcrc_s = self.net(state)
        dist = Beta(alpha, beta)
        action = dist.sample()
        a_logp = dist.log_prob(action).sum(dim=1)

        action = action.squeeze().cpu().numpy()
        a_logp = a_logp.item()
        return action, a_logp, rcrc_s
Example #15
 def select_action(self, state):
     # deal with datatype of state and transform it
     state = torch.from_numpy(state).double().unsqueeze(0)
     with torch.no_grad():
         alpha, beta = self.net(state)[0]
     dist = Beta(alpha, beta)
     action = dist.sample()  # sampled action in interval (0, 1)
     a_logp = dist.log_prob(action).sum(
         dim=1)  # add the log probability densities of the 3-stack
     action = action.squeeze().numpy()
     a_logp = a_logp.item()
     return action, a_logp
Example #16
 def gnll_loss_beta(y, param_1, param_2):
     batch_size = y.shape[0]
     loss = 0
     for i in range(batch_size):
         beta = Beta(param_1[i], param_2[i])
         # log_prob is inf for a score of exactly 1.0 or 0.0, which would make the loss nan,
         # so nudge the targets slightly into the open interval (0, 1)
         sample = torch.clamp(y[i].reshape(-1, 1), 1.0e-3, 1.0 - 1.0e-3)
         log_likelihood = beta.log_prob(sample)    # (9,32)
         loss -= torch.mean(log_likelihood)
     return loss + 200
    def select_action(self, state, hidden):
        
        with torch.no_grad():
            _, latent_mu, _ = self.vae(state)
            alpha, beta = self.net(latent_mu, hidden[0])[0]
        
        dist = Beta(alpha, beta)
        action = dist.sample()
        a_logp = dist.log_prob(action).sum(dim=1)

        a_logp = a_logp.item()
        _, _, _, _, _, next_hidden = self.mdrnn(action, latent_mu, hidden)
        
        return action.squeeze().cpu().numpy(), a_logp, latent_mu, next_hidden
Example #18
    def update(self):
        """ Run an update on the network """
        self.trainingStep += 1

        sliceToConcat = torch.tensor(self.buffer['slice_to_concat'],
                                     dtype=torch.double).to(
                                         self.hardwareDevice).view(-1, 1)
        matrixA = torch.tensor(self.buffer['matrix_a'],
                               dtype=torch.double).to(self.hardwareDevice)
        indexExp = torch.tensor(self.buffer['index_exp'],
                                dtype=torch.double).to(self.hardwareDevice)
        s = torch.tensor(self.buffer['s'],
                         dtype=torch.double).to(self.hardwareDevice)

        old_coefficient = torch.tensor(self.buffer['coefficient'],
                                       dtype=torch.double).to(
                                           self.hardwareDevice).view(-1, 1)

        with torch.no_grad():
            target = sliceToConcat + self.discount * self.nn(indexExp)[1]
            advantage = target - self.nn(s)[1]

        for _ in range(self.epoch):
            for index in BatchSampler(
                    SubsetRandomSampler(range(self.bufferSize)),
                    self.batchSize, False):
                alpha, beta = self.nn(s[index])[0]
                distance = Beta(alpha, beta)
                coefficient = distance.log_prob(matrixA[index]).sum(
                    dim=1, keepdim=True)
                relativeAdvantage = torch.exp(coefficient -
                                              old_coefficient[index])

                s1 = relativeAdvantage * advantage[index]
                s2 = torch.clamp(relativeAdvantage, 1.0 - self.clipParameter,
                                 1.0 + self.clipParameter) * advantage[index]

                # Loss on an action
                aLoss = -torch.min(s1, s2).mean()

                # Loss on the value
                vLoss = F.smooth_l1_loss(self.nn(s[index])[1], target[index])

                # Total loss calculation
                loss = aLoss + (vLoss * 2.0)

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
Example #19
class CarlaImgPolicy(nn.Module):
    def __init__(self, input_dim, action_dim, hidden_layer=[400, 300]):
        super(CarlaImgPolicy, self).__init__()
        self.main_actor = CarlaSimpleEncoder(latent_size=input_dim - 1)
        self.main_critic = CarlaSimpleEncoder(latent_size=input_dim - 1)
        actor_layer_size = [input_dim] + hidden_layer
        actor_feature_layers = nn.ModuleList([])
        for i in range(len(actor_layer_size) - 1):
            actor_feature_layers.append(
                nn.Linear(actor_layer_size[i], actor_layer_size[i + 1]))
            actor_feature_layers.append(nn.ReLU())
        self.actor = nn.Sequential(*actor_feature_layers)
        self.alpha_head = nn.Sequential(
            nn.Linear(hidden_layer[-1], action_dim), nn.Softplus())
        self.beta_head = nn.Sequential(nn.Linear(hidden_layer[-1], action_dim),
                                       nn.Softplus())

        critic_layer_size = [input_dim] + hidden_layer
        critic_layers = nn.ModuleList([])
        for i in range(len(critic_layer_size) - 1):
            critic_layers.append(
                nn.Linear(critic_layer_size[i], critic_layer_size[i + 1]))
            critic_layers.append(nn.ReLU())
        critic_layers.append(layer_init(nn.Linear(hidden_layer[-1], 1),
                                        gain=1))
        self.critic = nn.Sequential(*critic_layers)

    def forward(self, x, action=None):
        speed = x[:, -1:]
        x = x[:, :-1].view(-1, 3, 128,
                           128)  # image size in carla driving task is 128x128
        x1 = self.main_actor(x)
        x1 = torch.cat([x1, speed], dim=1)

        x2 = self.main_critic(x)
        x2 = torch.cat([x2, speed], dim=1)

        actor_features = self.actor(x1)
        alpha = self.alpha_head(actor_features) + 1
        beta = self.beta_head(actor_features) + 1
        self.dist = Beta(alpha, beta)
        if action is None:
            action = self.dist.sample()
        else:
            action = (action + 1) / 2
        action_log_prob = self.dist.log_prob(action).sum(-1)
        entropy = self.dist.entropy().sum(-1)
        value = self.critic(x2)
        return action * 2 - 1, action_log_prob, value.squeeze(-1), entropy
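A brief note on the action convention in CarlaImgPolicy.forward: the Beta sample lives in (0, 1) while the environment expects actions in (-1, 1), so sampled actions are returned as action * 2 - 1 and externally supplied actions are mapped back with (action + 1) / 2 before log_prob is evaluated. The two maps are exact inverses:

import torch

# Round-trip check of the (0, 1) <-> (-1, 1) action mapping.
x = torch.rand(4, 2)                    # stand-in for a Beta sample in (0, 1)
a = x * 2 - 1                           # action passed to the environment
print(torch.allclose((a + 1) / 2, x))   # True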
Example #20
    def update(self):
        self.training_step += 1

        s = torch.tensor(self.buffer['s'], dtype=torch.double).to(device)
        a = torch.tensor(self.buffer['a'], dtype=torch.double).to(device)
        r = torch.tensor(self.buffer['r'],
                         dtype=torch.double).to(device).view(-1, 1)
        s_ = torch.tensor(self.buffer['s_'], dtype=torch.double).to(device)

        old_a_logp = torch.tensor(self.buffer['a_logp'],
                                  dtype=torch.double).to(device).view(-1, 1)

        with torch.no_grad():
            # Using TD(1)
            target_v = r + gamma * self.net(s_)[
                1]  # target value function = reward + value of previous state
            adv = target_v - self.net(s)[1]  # Advantage estimator
            # adv = (adv - adv.mean()) / (adv.std() + 1e-8)

        for _ in range(self.ppo_epoch):  # Iterate
            for index in BatchSampler(
                    SubsetRandomSampler(range(self.buffer_capacity)),
                    self.batch_size, False):
                # index: 0..buffer_capacity-1 shuffled at random, then split into chunks of batch_size

                (alpha, beta), value = self.net(s[index])
                dist = Beta(alpha, beta)
                a_logp = dist.log_prob(a[index]).sum(
                    dim=1, keepdim=True)  # probability of the action under the current state
                ratio = torch.exp(
                    a_logp - old_a_logp[index]
                )  # surrogate ratio pi(a_t|s_t)/pi_old(a_t|s_t)

                surr1 = ratio * adv[index]
                surr2 = torch.clamp(
                    ratio, 1.0 - self.clip_param, 1.0 + self.clip_param
                ) * adv[
                    index]  # clipping (only accept policy updates with a ratio between 0.9 and 1.1)
                action_loss = -torch.min(surr1, surr2).mean(
                )  # take the smaller of the clipped and unclipped terms so no single update changes the policy too much
                value_loss = F.smooth_l1_loss(
                    value,
                    target_v[index])  # smooth L1 loss: behaves like an L2 loss on [-1, 1]
                loss = action_loss + 2. * value_loss  # the value error should arguably be squared, but the smooth L1 form is used instead

                self.optimizer.zero_grad()
                loss.backward()
                # nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm)  # commonly used for RNN-style networks (keeps the gradient from moving too far)
                self.optimizer.step()
Example #21
    def forward(self, s, g, greedy=False, action_logit=None):
        """Produce an action"""
        c0, c1 = self.action_stats(s, g)
        action_mode = (c0 - 1) / (c0 + c1 - 2)
        m = Beta(c0, c1)

        # Sample.
        if action_logit is None:
            if greedy:
                action_logit = action_mode
            else:
                action_logit = m.sample()

            n_ent = -m.entropy().mean()
            lprobs = m.log_prob(action_logit)
            action = self.scale_action(action_logit)
            return action, action_logit, lprobs, n_ent

        # Evaluate the action previously taken
        else:
            n_ent = -m.entropy().mean(dim=1)
            lprobs = m.log_prob(action_logit)
            action = self.scale_action(action_logit)
            return lprobs, n_ent, action
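The greedy branch computes (c0 - 1) / (c0 + c1 - 2), which is the mode of Beta(c0, c1) when both concentrations exceed 1. A short brute-force check:

import torch
from torch.distributions import Beta

# For c0, c1 > 1 the Beta density peaks at (c0 - 1) / (c0 + c1 - 2).
c0, c1 = torch.tensor(4.0), torch.tensor(2.0)
mode = (c0 - 1) / (c0 + c1 - 2)
xs = torch.linspace(0.001, 0.999, 9999)
print(mode.item(), xs[Beta(c0, c1).log_prob(xs).argmax()].item())  # both ~= 0.75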
Example #22
    def select_action(self, state):
        if args.action_vec > 0:
            state = (torch.from_numpy(
                state[0]).float().to(device).unsqueeze(0),
                     torch.from_numpy(
                         state[1]).float().to(device).unsqueeze(0))
        else:
            state = torch.from_numpy(state).float().to(device).unsqueeze(0)
        #TODO CHANGE FOR VECTOR ACTIONS
        with torch.no_grad():
            alpha, beta = self.net(state)[0]
        dist = Beta(alpha, beta)
        action = dist.sample()
        a_logp = dist.log_prob(action).sum(dim=1)

        action = action.squeeze().cpu().numpy()
        a_logp = a_logp.item()
        return action, a_logp
Example #23
    def forward(self,
                x=None,
                warmup=1.,
                inf_net=None):  #, k=1): #, marginf_type=0):

        outputs = {}

        if inf_net is None:
            # mu, logvar = self.inference_net(x)
            z, logqz = self.q.sample(x)
        else:
            # mu, logvar = inf_net.inference_net(x)
            z, logqz = inf_net.sample(x)

        logpz = self.prior.logprob(z)

        # Decode Image
        x_hat = self.image_decoder(z)
        alpha = torch.sigmoid(x_hat)
        beta = Beta(alpha * self.beta_scale, (1. - alpha) * self.beta_scale)
        x_noise = torch.clamp(
            x + torch.FloatTensor(x.shape).uniform_(0., 1. / 256.).cuda(),
            min=1e-5,
            max=1 - 1e-5)
        # logpx = beta.log_prob(x + torch.FloatTensor(x.shape).uniform_(0., 1./256.).cuda()) #[120,3,112,112]  # add uniform noise here
        logpx = beta.log_prob(
            x_noise)  #[120,3,112,112]  # add uniform noise here
        B = z.shape[0]
        logpx = torch.sum(logpx.view(B, -1), 1)  # [PB]  * self.w_logpx

        log_ws = logpx + logpz - logqz

        outputs['logpx'] = torch.mean(logpx)
        outputs['x_recon'] = alpha
        outputs['welbo'] = torch.mean(logpx + warmup * (logpz - logqz))
        outputs['elbo'] = torch.mean(log_ws)
        outputs['logws'] = log_ws
        outputs['z'] = z
        outputs['logpz'] = torch.mean(logpz)
        outputs['logqz'] = torch.mean(logqz)
        # outputs['logvar'] = logvar

        return outputs
Example #24
class BetaSeparatedPolicy(nn.Module):
    def __init__(self, input_dim, action_dim, hidden_layer=[64, 64]):
        super(BetaSeparatedPolicy, self).__init__()
        actor_layer_size = [input_dim] + hidden_layer
        alpha_feature_layers = nn.ModuleList([])
        beta_feature_layers = nn.ModuleList([])
        for i in range(len(actor_layer_size) - 1):
            alpha_feature_layers.append(
                nn.Linear(actor_layer_size[i], actor_layer_size[i + 1]))
            alpha_feature_layers.append(nn.ReLU())
            beta_feature_layers.append(
                nn.Linear(actor_layer_size[i], actor_layer_size[i + 1]))
            beta_feature_layers.append(nn.ReLU())
        self.alpha_body = nn.Sequential(*alpha_feature_layers)
        self.beta_body = nn.Sequential(*beta_feature_layers)
        self.alpha_head = nn.Sequential(
            nn.Linear(hidden_layer[-1], action_dim), nn.Softplus())
        self.beta_head = nn.Sequential(nn.Linear(hidden_layer[-1], action_dim),
                                       nn.Softplus())

        critic_layer_size = [input_dim] + hidden_layer
        critic_layers = nn.ModuleList([])
        for i in range(len(critic_layer_size) - 1):
            critic_layers.append(
                nn.Linear(critic_layer_size[i], critic_layer_size[i + 1]))
            critic_layers.append(nn.ReLU())
        critic_layers.append(nn.Linear(hidden_layer[-1], 1))
        self.critic = nn.Sequential(*critic_layers)

    def forward(self, x, action=None):
        alpha = self.alpha_head(self.alpha_body(x)) + 1
        beta = self.beta_head(self.beta_body(x)) + 1
        self.dist = Beta(alpha, beta)
        if action is None:
            action = self.dist.sample()
        else:
            action = (action + 1) / 2
        action_log_prob = self.dist.log_prob(action).sum(-1)
        entropy = self.dist.entropy().sum(-1)
        value = self.critic(x)

        return action * 2 - 1, action_log_prob, value.squeeze(-1), entropy
Example #25
def beta_mle_loss(y_hat, y, reduce=True):
    """y_hat (batch_size x seq_len x 2)
        y (batch_size x seq_len x 1)
        
    """
    # take exponentional to ensure positive
    loc_y = y_hat.exp()
    alpha = loc_y[:, :, 0].unsqueeze(-1)
    beta = loc_y[:, :, 1].unsqueeze(-1)
    dist = Beta(alpha, beta)
    # rescale y to be between
    y = (y + 1.0) / 2.0
    # note that we will get inf loss if y == 0 or 1.0 exactly, so we will clip it slightly just in case
    y = torch.clamp(y, 1e-5, 0.99999)
    # compute logprob
    loss = -dist.log_prob(y).squeeze(-1)
    if reduce:
        return loss.mean()
    else:
        return loss
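A hypothetical usage sketch for beta_mle_loss (shapes chosen for illustration, matching the docstring): raw outputs of shape (batch_size, seq_len, 2) and targets in [-1, 1]:

import torch

# Hypothetical call; y_hat is exponentiated inside the loss, so raw (possibly
# negative) network outputs are fine, and y is expected in [-1, 1].
y_hat = torch.randn(2, 5, 2)
y = torch.rand(2, 5, 1) * 2 - 1
print(beta_mle_loss(y_hat, y))                       # scalar mean loss
print(beta_mle_loss(y_hat, y, reduce=False).shape)   # torch.Size([2, 5])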
Example #26
    def update(self):
        self.training_step += 1

        s = torch.tensor(self.buffer['s'], dtype=torch.double).to(self.device)
        a = torch.tensor(self.buffer['a'], dtype=torch.double).to(self.device)
        r = torch.tensor(self.buffer['r'],
                         dtype=torch.double).to(self.device).view(-1, 1)
        next_s = torch.tensor(self.buffer['s_'],
                              dtype=torch.double).to(self.device)

        old_a_logp = torch.tensor(self.buffer['a_logp'],
                                  dtype=torch.double).to(self.device).view(
                                      -1, 1)

        with torch.no_grad():
            target_v = r + GAMMA * self.net(next_s)[1]
            adv = target_v - self.net(s)[1]
            # adv = (adv - adv.mean()) / (adv.std() + 1e-8)

        for _ in range(EPOCH):
            for index in BatchSampler(SubsetRandomSampler(range(MAX_SIZE)),
                                      BATCH, False):

                alpha, beta = self.net(s[index])[0]
                dist = Beta(alpha, beta)
                a_logp = dist.log_prob(a[index]).sum(dim=1, keepdim=True)
                ratio = torch.exp(a_logp - old_a_logp[index])

                surr1 = ratio * adv[index]

                # clipped function
                surr2 = torch.clamp(ratio, 1.0 - EPS, 1.0 + EPS) * adv[index]
                action_loss = -torch.min(surr1, surr2).mean()
                value_loss = F.smooth_l1_loss(
                    self.net(s[index])[1], target_v[index])
                loss = action_loss + 2. * value_loss

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
Example #27
    def forward(self, x, warmup=1.):

        B = x.shape[0]
        outputs = {}

        # if inf_net is None:
        z, logqz = self.encoder.sample(x)
        # else:
        #     z, logqz = inf_net.sample(x)
        # print (logqz.shape)

        logpz = self.prior.logprob(z)
        # print (logpz.shape)

        x_hat = self.generator.decode(x, z)
        alpha = torch.sigmoid(x_hat)
        beta = Beta(alpha * self.beta_scale, (1. - alpha) * self.beta_scale)
        logpxz = beta.log_prob(x)  #[120,3,112,112]
        logpxz = torch.sum(logpxz.view(B, -1), 1)  # [B]  * self.w_logpx
        # print (logpxz.shape)
        # logpxz = logpxz * .02 #self.w_logpx

        # #Compute elbo
        # elbo = logpxz - (warmup*logqz) #[P,B]
        # if k>1:
        #     max_ = torch.max(elbo, 0)[0] #[B]
        #     elbo = torch.log(torch.mean(torch.exp(elbo - max_), 0)) + max_ #[B]
        elbo = logpxz + logpz - logqz
        welbo = logpxz + warmup * (logpz - logqz)

        # elbo = torch.mean(elbo) #[1]
        outputs['logpxz'] = torch.mean(logpxz)  #[1]
        outputs['logqz'] = torch.mean(logqz)
        outputs['logpz'] = torch.mean(logpz)
        outputs['elbo'] = torch.mean(elbo)
        outputs['welbo'] = torch.mean(welbo)
        outputs['x_hat'] = alpha
        # outputs['elbo_B'] = elbo

        return outputs
Example #28
class MyDist(ActionDistribution):
    @staticmethod
    def required_model_output_shape(action_space, model_config):
        return 6

    def __init__(self, inputs, model):
        super(MyDist, self).__init__(inputs, model)
        self.dist = Beta(inputs[:, :3], inputs[:, 3:])

    def sample(self):
        self.sampled_action = self.dist.sample()
        return self.sampled_action

    def deterministic_sample(self):
        return self.dist.mean

    def sampled_action_logp(self):
        return self.logp(self.sampled_action)

    def logp(self, actions):
        return self.dist.log_prob(actions).sum(-1)

    # refered from https://github.com/pytorch/pytorch/blob/master/torch/distributions/kl.py
    def kl(self, other):
        p, q = self.dist, other.dist
        sum_params_p = p.concentration1 + p.concentration0
        sum_params_q = q.concentration1 + q.concentration0
        t1 = q.concentration1.lgamma() + q.concentration0.lgamma() + (
            sum_params_p).lgamma()
        t2 = p.concentration1.lgamma() + p.concentration0.lgamma() + (
            sum_params_q).lgamma()
        t3 = (p.concentration1 - q.concentration1) * torch.digamma(
            p.concentration1)
        t4 = (p.concentration0 - q.concentration0) * torch.digamma(
            p.concentration0)
        t5 = (sum_params_q - sum_params_p) * torch.digamma(sum_params_p)
        return (t1 - t2 + t3 + t4 + t5).sum(-1)

    def entropy(self):
        return self.dist.entropy().sum(-1)
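The comment in MyDist.kl points at PyTorch's own Beta/Beta KL; the closed form written out there matches torch.distributions.kl.kl_divergence, so it can be sanity-checked standalone, without the RLlib ActionDistribution machinery:

import torch
from torch.distributions import Beta
from torch.distributions.kl import kl_divergence

# The same closed form as MyDist.kl, checked against torch's registered KL.
p = Beta(torch.tensor([2.0, 0.5]), torch.tensor([3.0, 1.5]))
q = Beta(torch.tensor([1.0, 2.0]), torch.tensor([1.0, 1.0]))

sum_p = p.concentration1 + p.concentration0
sum_q = q.concentration1 + q.concentration0
t1 = q.concentration1.lgamma() + q.concentration0.lgamma() + sum_p.lgamma()
t2 = p.concentration1.lgamma() + p.concentration0.lgamma() + sum_q.lgamma()
t3 = (p.concentration1 - q.concentration1) * torch.digamma(p.concentration1)
t4 = (p.concentration0 - q.concentration0) * torch.digamma(p.concentration0)
t5 = (sum_q - sum_p) * torch.digamma(sum_p)

print(torch.allclose(t1 - t2 + t3 + t4 + t5, kl_divergence(p, q)))  # True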
Example #29
    def forward(self, x=None, warmup=1., inf_net=None): #, k=1): #, marginf_type=0):

        outputs = {}

        if inf_net is None:
            # mu, logvar = self.inference_net(x)
            z, logqz = self.q.sample(x) 
        else:
            # mu, logvar = inf_net.inference_net(x)   
            z, logqz = inf_net.sample(x) 


        logpz = self.prior.logprob(z)

        # Decode Image
        x_hat = self.image_decoder(z)
        alpha = torch.sigmoid(x_hat)
        beta = Beta(alpha*self.beta_scale, (1.-alpha)*self.beta_scale)
        x_noise = torch.clamp(x + torch.FloatTensor(x.shape).uniform_(0., 1./256.).cuda(), min=1e-5, max=1-1e-5)
        # logpx = beta.log_prob(x + torch.FloatTensor(x.shape).uniform_(0., 1./256.).cuda()) #[120,3,112,112]  # add uniform noise here
        logpx = beta.log_prob(x_noise) #[120,3,112,112]  # add uniform noise here
        B = z.shape[0]
        logpx = torch.sum(logpx.view(B, -1),1) # [PB]  * self.w_logpx

        log_ws = logpx + logpz - logqz

        outputs['logpx'] = torch.mean(logpx)
        outputs['x_recon'] = alpha
        outputs['welbo'] = torch.mean(logpx + warmup*( logpz - logqz))
        outputs['elbo'] = torch.mean(log_ws)
        outputs['logws'] = log_ws
        outputs['z'] = z
        outputs['logpz'] = torch.mean(logpz)
        outputs['logqz'] = torch.mean(logqz)
        # outputs['logvar'] = logvar

        return outputs
Example #30
    def f(self, x, z, logits, hard=False):

        B = x.shape[0]

        # image likelihood given b
        # b = harden(z).detach()
        x_hat = self.generator.forward(z)
        alpha = torch.sigmoid(x_hat)
        beta = Beta(alpha*self.beta_scale, (1.-alpha)*self.beta_scale)
        x_noise = torch.clamp(x + torch.FloatTensor(x.shape).uniform_(0., 1./256.).cuda(), min=1e-5, max=1-1e-5)
        logpx = beta.log_prob(x_noise) #[120,3,112,112]  # add uniform noise here
        logpx = torch.sum(logpx.view(B, -1),1) # [PB]  * self.w_logpx

        # prior is constant I think 
        # for q(b|x), we just want to increase its entropy 
        if hard:
            dist = Bernoulli(logits=logits)
        else:
            dist = RelaxedBernoulli(torch.Tensor([1.]).cuda(), logits=logits)
            
        logqb = dist.log_prob(z.detach())
        logqb = torch.sum(logqb,1)

        return logpx, logqb, alpha
Example #31
    def forward(self,
                x=None,
                q=None,
                warmup=1.,
                generate=False,
                inf_type=1,
                dec_type=0):  #, k=1): #, marginf_type=0):
        # x: [B,3,112,112]
        # q: [B,L]
        # inf type: 0 is both, 1 is only x, 2 is only y
        # dec type: 0 is both, 1 is only x, 2 is only y

        outputs = {}

        if inf_type in [0, 2] or dec_type in [0, 2]:
            embed = self.encoder_embed(q)

        if inf_type == 0:
            x_enc = self.image_encoder(x)
            y_enc = self.encode_attributes(embed)
            mu, logvar = self.inference_net(x_enc, y_enc)
            z, logpz, logqz = self.sample(mu, logvar)

        elif inf_type == 1:
            # if self.joint_inf:
            x_enc = self.image_encoder2(x)
            mu, logvar = self.inference_net_x(x_enc)
            # else:
            #     if dec_type ==0:
            #         x_enc = self.image_encoder(x)
            #         mu, logvar = self.inference_net(x_enc)
            #     else:
            #         x_enc = self.image_encoder2(x)
            #         mu, logvar = self.inference_net_x(x_enc)
            z, logpz, logqz = self.sample(mu, logvar)

        elif inf_type == 2:
            y_enc = self.encode_attributes2(embed)
            mu, logvar = self.inference_net_y(y_enc)
            if self.flow_int:
                z, logpz, logqz = self.flow.sample(mu, logvar)
            else:
                z, logpz, logqz = self.sample(mu, logvar)

        # z_prior = torch.FloatTensor(self.B, self.z_size).normal_().cuda()
        # loss, acc = self.discrim.discrim_loss(z, z_prior)
        pred = self.discrim.predict(
            z).mean()  #want to minimize this, since prior prediction = 0

        z_dec = self.z_to_enc(z)

        B = z_dec.shape[0]

        if dec_type == 0:
            # Decode Image
            x_hat = self.image_decoder(z_dec)
            alpha = torch.sigmoid(x_hat)

            beta = Beta(alpha * self.beta_scale,
                        (1. - alpha) * self.beta_scale)
            logpx = beta.log_prob(x)  #[120,3,112,112]
            logpx = torch.sum(logpx.view(B, -1), 1)  # [B]

            word_preds, logpy = self.text_generator.teacher_force(
                z_dec, embed, q)

            logpx = logpx * self.w_logpx
            logpy = logpy * self.w_logpy

            #CE of q(z|y)
            if inf_type == 1:
                embed = self.encoder_embed(q)
            y_enc = self.encode_attributes2(embed)
            mu_y, logvar_y = self.inference_net_y(y_enc)
            if self.flow_int:
                logqzy = self.flow.logprob(z.detach(), mu_y, logvar_y)
                # logqzy = self.flow.logprob(z, mu_y, logvar_y)
            else:
                logqzy = lognormal(z, mu_y, logvar_y)
            logqzy = logqzy * self.w_logqy

            log_ws = logpx + logpy + logpz - logqz  #+ logqzy
            elbo = torch.mean(log_ws)
            # warmed_elbo = torch.mean(logpx + logpy + logqzy - logqz + warmup*( logpz - logqz))
            # warmed_elbo = torch.mean(logpx + logpy + logqzy + warmup*( logpz - logqz))
            warmed_elbo = torch.mean(logpx + logpy + logqzy + warmup * (pred))
            # warmed_elbo = torch.mean(-torch.log(pred))
            # warmed_elbo = pred
            # warmed_elbo = torch.mean(logpz - logqz)

            outputs['logpx'] = torch.mean(logpx)
            outputs['x_recon'] = alpha
            outputs['logpy'] = torch.mean(logpy)
            outputs['logqzy'] = torch.mean(logqzy)

        elif dec_type == 1:
            # Decode Image
            x_hat = self.image_decoder(z_dec)
            alpha = torch.sigmoid(x_hat)

            beta = Beta(alpha * self.beta_scale,
                        (1. - alpha) * self.beta_scale)
            logpx = beta.log_prob(x)  #[120,3,112,112]

            logpx = torch.sum(logpx.view(B, -1), 1)  # [PB]  * self.w_logpx
            logpx = logpx * self.w_logpx

            log_ws = logpx + logpz - logqz

            elbo = torch.mean(log_ws)
            warmed_elbo = torch.mean(logpx + warmup * (logpz - logqz))

            outputs['logpx'] = torch.mean(logpx)
            outputs['x_recon'] = alpha

        elif dec_type == 2:
            #Decode Text
            word_preds, logpy = self.text_generator.teacher_force(
                z_dec, embed, q)
            logpy = logpy * self.w_logpy

            log_ws = logpy + logpz - logqz
            elbo = torch.mean(log_ws)
            warmed_elbo = torch.mean(logpy + warmup * (logpz - logqz))

            outputs['logpy'] = torch.mean(logpy)

        outputs['welbo'] = warmed_elbo
        outputs['elbo'] = elbo
        outputs['logws'] = log_ws
        outputs['z'] = z
        outputs['logpz'] = torch.mean(logpz)
        outputs['logqz'] = torch.mean(logqz)
        outputs['logvar'] = logvar

        if generate:

            word_preds, sampled_words = self.text_generator.teacher_force(
                z_dec, generate=generate, embeder=self.encoder_embed)
            if dec_type == 2:
                alpha = torch.sigmoid(self.image_decoder(z_dec))

            return outputs, alpha, word_preds, sampled_words

        return outputs
Example #32
    def run(self):
        updatestep = 0
        update = 0
        i_episode = 0

        while (update < 100000):
            self.lr = args.lr - (args.lr * (i_episode / float(10000)))
            i_episode = i_episode + 1
            observation = self.env.reset()
            step = 0
            observes_list, rewards, actions, values, old_log = [], [], [], [], []

            if updatestep > 2048:
                update = update + 1
                updatestep = 0
                if (args.usegae):
                    self.add_gae(self.trajectories, self.gamma, self.lam)
                else:
                    self.add_no_gae(self.trajectories, self.gamma)
                s, a, adv, old_a_logp, target_v, totalsize = self.gettraindata(
                )
                minibatch = max(totalsize // args.numminibatch, 1)

                for _ in range(self.PPOepoch):
                    for index in BatchSampler(
                            SubsetRandomSampler(range(totalsize)), minibatch,
                            False):

                        alpha, beta = self.net(s[index])[0]
                        dist = Beta(alpha, beta)
                        a_logp = dist.log_prob(a[index]).sum(dim=1)
                        ratio = torch.exp(a_logp - old_a_logp[index])
                        with torch.no_grad():
                            entrop = dist.entropy()

                        surr1 = ratio * adv[index]
                        surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                            1.0 + self.clip_param) * adv[index]
                        action_loss = -torch.min(surr1, surr2).mean()
                        value_loss = F.mse_loss(
                            self.net(s[index])[1], target_v[index])
                        self.storeloss(action_loss, value_loss)
                        loss = action_loss + 0.5 * value_loss - 0.01 * entrop.mean(
                        )

                        self.optimizer.zero_grad()
                        loss.backward()
                        nn.utils.clip_grad_norm_(self.net.parameters(),
                                                 args.maxgradnorm)
                        self.optimizer.step()

                self.trajectories = []

            while (1):
                step = step + 1
                updatestep = updatestep + 1
                #self.env.render()

                observes = observation.astype(np.float32).reshape((1, -1))
                input = torch.tensor(observes,
                                     dtype=torch.double).to(device).reshape(
                                         -1, self.inputsize)
                (alpha, beta), v = self.net(input)
                dist = Beta(alpha, beta)
                action = dist.sample()
                a_logp = dist.log_prob(action.view(-1, 6)).sum(dim=1)
                a_logp = a_logp.item()

                old_log.append(a_logp)
                values.append(v.item())
                observes_list.append(observes)
                actions.append(action)

                action = action.squeeze().cpu().numpy()
                observation, reward, done, info = self.env.step(action * 2 - 1)
                rewards.append(reward)

                if done:
                    print("Episode finished after {} timesteps, rewards is {}".
                          format(step, sum(rewards)))
                    self.storereward(format(step))

                    trajectory = {
                        'observes': np.concatenate([t for t in observes_list]),
                        'actions':
                        np.concatenate([t.to('cpu') for t in actions]),
                        'rewards': np.array(rewards),
                        'values': np.array(values),
                        'old_log': np.array(old_log)
                    }

                    self.trajectories.append(trajectory)
                    break
Example #33
    def run(self):
        sumoBinary = checkBinary('sumo')
        traci.start([sumoBinary, "-c", "roadfile/cross.sumocfg"])
        listpic = []
        self.loop = 0
        self.dict1 = {}
        self.periodtime = []
        self.currentstate = 0
        self.time_click = 0
        self.out_record = None
        self.max_grad_norm = 0.5
        self.wtime = []
        self.tflightime = np.array([30, 30, 30, 30])

        while traci.simulation.getMinExpectedNumber() > 0:
            traci.simulationStep()
            if self.currentstate != traci.trafficlight.getPhase("0"):
                self.time_click = 0
            self.time_click = self.time_click + 1

            phase_index = int(traci.trafficlight.getPhase("0") / 2)
            list_car = traci.vehicle.getIDList()
            for k in list_car:
                traci.vehicle.setLaneChangeMode(k, 0b001000000000)

            vehiclein_l = traci.simulation.getDepartedIDList()
            if (vehiclein_l):
                for i in vehiclein_l:
                    self.dict1[i] = self.step

            vehicleout_l = traci.simulation.getArrivedIDList()
            if (vehicleout_l):
                for i in vehicleout_l:
                    self.periodtime.append(self.step - self.dict1[i])
                    self.wtime.append(self.step - self.dict1[i])
                    self.dict1.pop(i)

            if ((self.step - int(self.step / 2000) * 2000) % 5 == 0 and int(
                (self.step - int(self.step / 2000) * 2000) / 5) < 4):
                listpic.append(self.getstate(list_car))

            if self.step % 1000 == 999:
                if int(self.step / 1000) % 2 == 0:
                    if (self.wtime):
                        self.writetime(np.array(self.wtime).mean())

                self.wtime = []

            if (self.step % 2000 == 15):
                print(self.loop)
                self.loop = self.loop + 1
                if (len(listpic) != 4):
                    break
                pict = np.array(listpic)
                input_d = torch.tensor(pict, dtype=torch.double).to(device)
                input_d = input_d.reshape(1, 4, 28, 28)
                listpic = []

                with torch.no_grad():
                    alpha, beta = self.net(input_d)[0]

                dist = Beta(alpha, beta)
                action = dist.sample()
                a_logp = dist.log_prob(action.view(-1, 4)).sum(dim=1)

                action = action.squeeze().cpu().numpy()
                a_logp = a_logp.item()
                self.tflightime = np.array(action * 60)
                self.writeac(self.tflightime.tolist())

                reward = 0
                if (self.periodtime):
                    reward = -0.9 * np.array(self.periodtime).mean(
                    ) - 0.1 * np.array(self.periodtime).max()
                    reward = (reward + 150) / 50

                self.periodtime = []

                ifupdata = None
                if self.out_record is not None:
                    ifupdata = self.memory.store(
                        (self.out_record[0], self.out_record[1], reward, pict,
                         self.out_record[2], self.out_record[3]))

                self.out_record = [
                    pict, action, a_logp,
                    self.net(input_d)[1].item()
                ]

                if ifupdata is True:
                    print('train')
                    self.trainmodel()

            self.currentstate = traci.trafficlight.getPhase("0")
            if (self.time_click >= self.tflightime[phase_index]):
                traci.trafficlight.setPhase("0", (self.currentstate + 1) % 8)

            self.step += 1

        traci.close()
        sys.stdout.flush()