def evaluate(self, transition):

        # Re-score a stored transition under the current controller weights and return the
        # log-probabilities and entropies of the stored branch and skip-connection choices.
        output, hn = self.w_lstm(transition["inputs"], transition["h0"])
        output = output.squeeze(0)
        h0 = hn

        logits = self.w_soft(output)
        if self.temperature is not None:
            logits /= self.temperature

        out_dist = Categorical(logits=logits)

        branch_logprob = out_dist.log_prob(transition["branch"])

        branch_entropy = out_dist.entropy()

        inputs = self.w_emb(transition["branch"])
        inputs = inputs.unsqueeze(0)

        output, hn = self.w_lstm(inputs, h0)
        output = output.squeeze(0)

        if transition["layer_id"] > 0:

            query = torch.cat(transition["anchors_w1"], dim=0)

            query = torch.tanh(query + self.w_attn_2(output))
            query = self.v_attn(query)
            logits = torch.cat([query, -query], dim=1)

            if self.temperature is not None:
                logits /= self.temperature
            if self.tanh_constant is not None:
                logits = self.tanh_constant * torch.tanh(logits)

            #print("logits_eval", logits)
            skip_distribution = Categorical(logits=logits)

            skip_logprob = skip_distribution.log_prob(
                transition["skip_connections"])
            skip_logprob = torch.sum(skip_logprob)  # aggregate log-probs over all skip decisions for this layer

            skip_entropy = skip_distribution.entropy()
            skip_entropy = torch.sum(skip_entropy)

        else:
            skip_logprob = 0
            skip_entropy = 0

        return branch_logprob[0], skip_logprob, branch_entropy[0], skip_entropy


#
# DONE PPO
# -update weights
#   -old policy
# -memory
# -eval action in controller
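The notes above list the remaining PPO pieces (old policy, memory, evaluating stored actions). As a minimal sketch, and purely as an assumption about how the log-probabilities returned by evaluate() would be consumed, a clipped PPO surrogate could look like this:

import torch

def ppo_clipped_surrogate(new_logprob, old_logprob, advantage, clip_eps=0.2):
    # probability ratio pi_new(a|s) / pi_old(a|s), computed in log space for stability
    ratio = torch.exp(new_logprob - old_logprob)
    unclipped = ratio * advantage
    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantage
    # pessimistic (clipped) objective, negated so an optimizer can minimize it
    return -torch.min(unclipped, clipped).mean()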
Example #2
def forward_entropy(model, loader, device, max_item_id=0):
    # Score every sample's prediction entropy, then convert entropy ranks into sampling weights.
    for i, batch in enumerate(loader):
        scores = softmax(model(batch.to(device), max_item_id), dim=1)
        dis_score = Categorical(scores)
        if i == 0:
            entropy = dis_score.entropy()
        else:
            entropy = torch.cat((entropy, dis_score.entropy()))

    # pro = softmax(entropy).cpu().detach().numpy()
    pro = entropy.cpu().detach().numpy()
    weights = np.exp((pd.Series(pro).rank() / len(pro)).values)
    return weights / np.sum(weights)
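A hedged usage sketch: the normalized weights returned above could drive entropy-weighted resampling of the training set via torch.utils.data.WeightedRandomSampler (model, eval_loader, train_dataset and device are placeholder names, not from the original):

from torch.utils.data import DataLoader, WeightedRandomSampler

weights = forward_entropy(model, eval_loader, device)  # assumed model/loader/device objects
sampler = WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)
weighted_loader = DataLoader(train_dataset, batch_size=64, sampler=sampler)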
Example #3
 def get_action(self, x, action=None):
     logits = self.actor(x)
     probs = Categorical(logits=logits)
     # if action is not specified, we select it stochastically
     if action is None:
         action = probs.sample()
     return action, probs.log_prob(action), probs.entropy()
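A minimal usage sketch of this pattern (agent, obs and stored_actions are assumed names): sample during the rollout, then pass the stored actions back in during the update so their log-probabilities are re-evaluated under the current policy.

# hypothetical usage; agent, obs and stored_actions are placeholders
action, logprob, entropy = agent.get_action(obs)                             # rollout: sample
_, new_logprob, new_entropy = agent.get_action(obs, action=stored_actions)   # update: re-evaluate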
Example #4
    def PPO_update(self):
        self.optimizer.zero_grad()
        # Compute discounted rewards and standardize them (epsilon guards against division by zero)
        rewards = np.asarray(self.discount_rewards())
        rewards = (rewards - np.mean(rewards)) / (np.std(rewards) + 1e-5)

        for _ in range(self.K_epochs):
            # sample a random 70% of the stored transitions in every epoch
            len_history = len(self.actions)
            n_batch = round(len_history * 0.7)
            idxs = random.sample(range(len_history), n_batch)

            old_rewards = torch.tensor([rewards[idx]
                                        for idx in idxs]).to(self.train_device)
            old_states = [self.states[idx] for idx in idxs]
            old_action_probs = [self.action_probs[idx] for idx in idxs]
            old_actions = [self.actions[idx] for idx in idxs]

            # Convert list to tensor
            old_states = torch.stack(old_states,
                                     dim=0).to(self.train_device).detach()
            old_action_probs = torch.stack(old_action_probs, dim=0).to(
                self.train_device).detach()
            old_actions = torch.stack(old_actions,
                                      dim=0).to(self.train_device).detach()

            # Evaluate batch actions and values:
            # Pass batch states to actor layers
            action_logits, values = self.policy.forward(old_states)
            action_distribution = Categorical(logits=action_logits)
            # Calculate action log probability and entropy given batch actions
            dist_entropy = action_distribution.entropy()

            # Calculate the loss:
            # Finding the ratio (pi_theta / pi_theta_old)
            vs = np.array([[1., 0.], [0., 1.]])  # one-hot rows; assumes a 2-action space
            ts = torch.FloatTensor(vs[old_actions.cpu().numpy()])
            ratios = torch.sum(F.softmax(action_logits, dim=1) * ts,
                               dim=1) / old_action_probs

            # Finding Surrogate Loss:
            advantages = old_rewards - values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip,
                                1 + self.eps_clip) * advantages

            loss = (-torch.min(surr1, surr2).mean() +
                    0.5 * self.MseLoss(values.squeeze(1), old_rewards) -
                    0.01 * dist_entropy.mean())

            # Take a gradient step to update the network parameters
            self.optimizer.zero_grad()
            loss.backward()
            print('Loss:', loss.item())
            self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())

        # Clear memory
        self.states, self.action_probs, self.actions, self.rewards, self.dones = [], [], [], [], []
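The ratio above is built by multiplying the softmax output with a hard-coded one-hot matrix and dividing by the stored probabilities. An equivalent, more common formulation works directly in log space; the sketch below assumes old log-probabilities (old_log_probs) were stored during the rollout instead of old_action_probs:

# sketch: the same importance ratio via Categorical.log_prob (old_log_probs is assumed)
dist = Categorical(logits=action_logits)
new_log_probs = dist.log_prob(old_actions)
ratios = torch.exp(new_log_probs - old_log_probs)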
Example #5
    def act(self, state, action=None, calc_ent=False):
        """Returns dict of trajectory info.
        Shape
        ======
            state (uint8) : (batch_size, framestack=4, 84, 84)
        
        Returns example
            {'a': tensor([10,  5,  1]),
             'ent': None,
             'log_pi_a': tensor([-2.8904, -2.8904, -2.8904], grad_fn=<SqueezeBackward1>),
             'v_ext': tensor([0.0012, 0.0012, 0.0012], grad_fn=<SqueezeBackward0>),
             'v_int': tensor([-0.0013, -0.0013, -0.0013], grad_fn=<SqueezeBackward0>)}
        """
        #state = torch.FloatTensor(state / 255).to(self.device)
        assert state.dtype == 'uint8'
        state = torch.tensor(state / 255.,
                             dtype=torch.float,
                             device=self.device)
        #state = torch.from_numpy(state /255).float().to(self.device)

        action_probs, value_ext, value_int = self.model(state)
        dist = Categorical(action_probs)
        if action is None:
            action = dist.sample()
        log_prob = dist.log_prob(action)
        entropy = dist.entropy() if calc_ent else None

        return {
            'a': action,
            'log_pi_a': log_prob,
            'ent': entropy,
            'v_ext': value_ext.squeeze(),
            'v_int': value_int.squeeze()
        }
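A hypothetical call matching the documented uint8 input shape (random data; agent is an assumed instance of this class):

import numpy as np

state = np.random.randint(0, 256, size=(3, 4, 84, 84), dtype=np.uint8)  # fake framestack batch
out = agent.act(state, calc_ent=True)
print(out['a'], out['log_pi_a'].shape, out['ent'].shape)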
Example #6
 def evaluate(self, state, action):
     pred = self.policy(state)
     value = self.vf(state).squeeze()
     dist = Categorical(pred)
     log_prob = dist.log_prob(action).squeeze()
     entropy = dist.entropy().squeeze()
     return value, log_prob, entropy
Example #7
    def forward(self,encoder_inputs,hx,n_steps,greedy=False):
        _input = encoder_inputs.new_zeros((encoder_inputs.size(0),encoder_inputs.size(2)))
        mask = encoder_inputs.new_zeros((encoder_inputs.size(0),encoder_inputs.size(1)))
        log_ps = []
        actions = []
        entropys = []

        for i in range(n_steps):
            hx = self.cell(_input, hx)
#                 print (hx.size(),encoder_inputs.size(),mask.size())
            p = self.attn(hx,encoder_inputs,mask)
            dist = Categorical(p)
            entropy = dist.entropy()

            if greedy:
                _,index = p.max(dim=-1)
            else:
                index = dist.sample()

            actions.append(index)
            log_p = dist.log_prob(index)
            log_ps.append(log_p)
            entropys.append(entropy)

            mask = mask.scatter(1,index.unsqueeze(-1).expand(mask.size(0),-1),1)
            _input = torch.gather(encoder_inputs,1,index.unsqueeze(-1).unsqueeze(-1).expand(encoder_inputs.size(0),-1,encoder_inputs.size(2))).squeeze(1)

        log_ps = torch.stack(log_ps,1)
        actions = torch.stack(actions,1)
        entropys = torch.stack(entropys,1)
        log_p = log_ps.sum(dim=1)
        entropy = entropys.mean(dim=1)
        return actions,log_p,entropy
Example #8
    def forward(self):

        inputs, h0 = self.input_vars, None

        log_probs, entropys, sampled_arch = [], [], []
        for iedge in range(self.num_edge):
            outputs, h0 = self.w_lstm(inputs, h0)

            logits = self.w_pred(outputs)
            logits = logits / self.temperature
            logits = self.tanh_constant * torch.tanh(logits)

            # distribution
            op_distribution = Categorical(logits=logits)
            op_index = op_distribution.sample()
            sampled_arch.append(op_index.item())

            op_log_prob = op_distribution.log_prob(op_index)
            log_probs.append(op_log_prob.view(-1))
            op_entropy = op_distribution.entropy()
            entropys.append(op_entropy.view(-1))

            # obtain the input embedding for the next step
            inputs = self.w_embd(op_index)
            # print(op_index,inputs)

        return torch.sum(torch.cat(log_probs)), torch.sum(
            torch.cat(entropys)), sampled_arch
Example #9
 def get_action(self, x, action=None):
     #x = x.permute(0, 3, 1, 2).contiguous()
     logits = self.actor(self.forward(x))
     probs = Categorical(logits=logits)
     if action is None:
         action = probs.sample()
     return action, probs.log_prob(action), probs.entropy()
Example #10
    def forward(self):

        entropys = []
        log_probs = []
        sampled_arcs = []

        self.op_dist = []
        for layer_id in range(self.num_layers):
            logit = self.alpha[layer_id]
            # if self.temperature > 0:
            #   logit /= self.temperature
            # if self.tanh_constant is not None:
            #   logit = self.tanh_constant * torch.tanh(logit)

            op_dist = Categorical(logits=logit)
            self.op_dist.append(op_dist)

            sampled_op = op_dist.sample()
            sampled_arcs.append(sampled_op.view(-1, 1))

            log_prob = op_dist.log_prob(sampled_op)
            log_probs.append(log_prob.view(-1, 1))
            entropy = op_dist.entropy()
            entropys.append(entropy.view(-1, 1))

            # inputs = self.w_emb(branch_id)

        self.sampled_arcs = torch.cat(sampled_arcs, dim=1)
        self.sample_entropy = torch.cat(entropys, dim=1)
        self.sample_log_prob = torch.cat(log_probs, dim=1)

        return self.sampled_arcs
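Because this forward() stores sample_log_prob and sample_entropy on the module, a REINFORCE-style update can be driven from the outside. A hedged sketch, where controller, reward and entropy_weight are assumptions:

arcs = controller()  # sample an architecture; fills controller.sample_log_prob / sample_entropy
loss = -(reward * controller.sample_log_prob).mean() \
       - entropy_weight * controller.sample_entropy.mean()
loss.backward()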
Example #11
    def evaluate(self, encoder_inputs, hx, actions):
        _input = encoder_inputs.new_zeros(
            (encoder_inputs.size(0), encoder_inputs.size(2)))
        mask = encoder_inputs.new_zeros(
            (encoder_inputs.size(0), encoder_inputs.size(1)))
        log_ps = []
        entropys = []

        actions = actions.transpose(0, 1)
        for act in actions:
            hx = self.cell(_input, hx)
            p = self.attn(hx, encoder_inputs, mask)
            dist = Categorical(p)
            entropy = dist.entropy()

            log_p = dist.log_prob(act)
            log_ps.append(log_p)
            mask = mask.scatter(1,
                                act.unsqueeze(-1).expand(mask.size(0), -1), 1)
            _input = torch.gather(
                encoder_inputs, 1,
                act.unsqueeze(-1).unsqueeze(-1).expand(
                    encoder_inputs.size(0), -1,
                    encoder_inputs.size(2))).squeeze(1)
            entropys.append(entropy)

        log_ps = torch.stack(log_ps, 1)
        entropys = torch.stack(entropys, 1)
        log_p = log_ps.sum(dim=1)
        entropy = entropys.mean(dim=1)

        return log_p, entropy
Example #12
    def forward(self, img1, img2):

        #print("img1.shape", img1.shape)

        img1 = img1.view(img1.size(0), -1)
        #print("img1.shape", img1.shape)
        img2 = img2.view(img2.size(0), -1)

        out1 = self.policy_single(img1)
        #print("out1.shape", out1.shape)

        out2 = self.policy_single(img2)

        combined = torch.cat(
            (out1, out2), dim=1)  # concatenated along the feature dimension
        probs = self.policy_combined(combined)

        # sampling
        dist = Categorical(probs=probs)

        if self.training:
            actions = dist.sample()
        else:
            actions = probs.argmax(dim=1)  # greedy action at evaluation time

        logprobs = dist.log_prob(actions)

        entropy = dist.entropy()

        return probs, actions, logprobs, entropy
Example #13
 def get_action_and_value(self, x, action=None):
     hidden = self.network(x.permute((0, 3, 1, 2)) / 255.0)  # "bhwc" -> "bchw"
     logits = self.actor(hidden)
     probs = Categorical(logits=logits)
     if action is None:
         action = probs.sample()
     return action, probs.log_prob(action), probs.entropy(), self.critic(hidden)
Example #14
def train_model(args, device, output_size, model, rnd, optimizer, s_batch, target_ext_batch, target_int_batch, y_batch, adv_batch, next_obs_batch, old_action_probs):
    epoch = 3
    update_proportion = 0.25
    s_batch = torch.FloatTensor(s_batch).to(device)
    target_ext_batch = torch.FloatTensor(target_ext_batch).to(device)
    target_int_batch = torch.FloatTensor(target_int_batch).to(device)
    y_batch = torch.LongTensor(y_batch).to(device)
    adv_batch = torch.FloatTensor(adv_batch).to(device)
    next_obs_batch = torch.FloatTensor(next_obs_batch).to(device)

    sample_range = np.arange(len(s_batch))
    forward_mse = nn.MSELoss(reduction='none')

    with torch.no_grad():
        action_probs_old_list = torch.stack(old_action_probs).permute(1, 0, 2).contiguous().view(-1, output_size).to(device)

        m_old = Categorical(action_probs_old_list)
        log_prob_old = m_old.log_prob(y_batch)
        # ------------------------------------------------------------

    for i in range(epoch):
        np.random.shuffle(sample_range)
        for j in range(int(len(s_batch) / args.batch_size)):
            sample_idx = sample_range[args.batch_size * j:args.batch_size * (j + 1)]

            # --------------------------------------------------------------------------------
            # for Curiosity-driven(Random Network Distillation)
            predict_next_state_feature, target_next_state_feature = rnd(next_obs_batch[sample_idx])

            forward_loss = forward_mse(predict_next_state_feature, target_next_state_feature.detach()).mean(-1)
            # Proportion of exp used for predictor update
            mask = torch.rand(len(forward_loss)).to(device)
            mask = (mask < update_proportion).type(torch.FloatTensor).to(device)
            forward_loss = (forward_loss * mask).sum() / torch.max(mask.sum(), torch.Tensor([1]).to(device))
            # ---------------------------------------------------------------------------------

            action_probs, value_ext, value_int = model(s_batch[sample_idx])
            m = Categorical(action_probs)
            log_prob = m.log_prob(y_batch[sample_idx])

            ratio = torch.exp(log_prob - log_prob_old[sample_idx])

            surr1 = ratio * adv_batch[sample_idx]
            surr2 = torch.clamp(
                ratio,
                1.0 - args.eps,
                1.0 + args.eps) * adv_batch[sample_idx]

            actor_loss = -torch.min(surr1, surr2).mean()
            critic_ext_loss = F.mse_loss(value_ext.sum(1), target_ext_batch[sample_idx])
            critic_int_loss = F.mse_loss(value_int.sum(1), target_int_batch[sample_idx])

            critic_loss = critic_ext_loss + critic_int_loss

            entropy = m.entropy().mean()

            optimizer.zero_grad()
            loss = actor_loss + 0.5 * critic_loss - args.entropy_coef * entropy + forward_loss
            loss.backward()
            optimizer.step()
Example #15
File: core.py  Project: etendue/cups-rl
 def forward(self, x, a=None):
     policy = Categorical(logits=self.logits(x))
     if a is None:
         a = policy.sample().squeeze()
     logp_a = policy.log_prob(a).squeeze()
     ent = policy.entropy().squeeze()
     return a, logp_a, ent
Example #16
    def train_model(self, s_batch, target_batch, y_batch, adv_batch,
                    actor_agent):
        s_batch = torch.FloatTensor(s_batch)
        target_batch = torch.FloatTensor(target_batch)
        adv_batch = torch.FloatTensor(adv_batch)
        with torch.no_grad():
            policy_old, value_old = actor_agent.model_old(s_batch)
            m_old = Categorical(policy_old)
            y_batch_old = torch.LongTensor(y_batch)
            log_prob_old = m_old.log_prob(y_batch_old)

        # score the stored actions under the current policy (needed for the importance ratio)
        policy, value = self.model(s_batch)
        m = Categorical(policy)
        log_prob = m.log_prob(y_batch_old)
        entropy = m.entropy().mean()

        for i in range(EPOCH):
            minibatch = random.sample(range(len(s_batch)), BATCH_SIZE)
            ratio = torch.exp(log_prob[minibatch] - log_prob_old[minibatch])

            surr1 = ratio * adv_batch[minibatch].sum(1)
            surr2 = torch.clamp(ratio, 1.0 - EPSILON,
                                1.0 + EPSILON) * adv_batch[minibatch].sum(1)

            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = F.mse_loss(value[minibatch],
                                     target_batch[minibatch])

            self.optimizer.zero_grad()
            loss = actor_loss + V_COEF * critic_loss - 0.01 * entropy
            loss.backward(retain_graph=True)
            self.optimizer.step()
Example #17
File: ppo_atari.py  Project: mcx/cleanrl
 def get_action_and_value(self, x, action=None):
     hidden = self.network(x / 255.0)
     logits = self.actor(hidden)
     probs = Categorical(logits=logits)
     if action is None:
         action = probs.sample()
     return action, probs.log_prob(action), probs.entropy(), self.critic(hidden)
Example #18
    def rl_get_action(self, state, compass):
        """Select an action by running a tile-input through the neural network.

        :param state: tile-grid; numpy tensor
        :return: int of selected action
        """

        logits = self.nn(state, compass)

        probs = Categorical(logits=logits)
        action = probs.sample()

        if state.shape[0] == 1:
            a2 = action.item()

            # update orientation
            if not hasattr(self, '_env') and a2 != self.prev_move:
                if a2 in self.rotating_actions:
                    self.prev_move = a2  # store the plain int, not the sampled tensor

                self.compass_info = self.orientation[self.prev_move - 1]

            elif hasattr(self, '_env'):
                self.compass_info = self.env.orientation[self.env.prev_move -
                                                         1]

        else:
            actions = action.data.numpy()
            for act in actions:
                pass

        return action, -probs.log_prob(action), probs.entropy()
Example #19
 def get_action(self, x, action=None):
     logits = self.actor(self.network(x.permute(
         (0, 3, 1, 2))))  # "bhwc" -> "bchw"
     probs = Categorical(logits=logits)
     if action is None:
         action = probs.sample()
     return action, probs.log_prob(action), probs.entropy()
Example #20
    def _evaluate_actions(self, env_states, rec_hs, rec_cs, actions):
        """
        See how likely these actions (using the current model) in the given env_states
        Called when updating, on batches of transitions

        Args:
            env_states: float tensor of shape [batch_size, *env_state_shape]
            rec_hs:     float tensor of shape [num_recurrent_layers, batch_size, recurrent_layer_size]
            rec_cs:     float tensor of shape [num_recurrent_layers, batch_size, recurrent_layer_size]
            actions:    int tensor of shape  [batch_size,]

        Returns:
            encoder_out: float tensor of shape [batch_size, m]  -- so it's not recomputed again for values
            action_log_probs: float tensor of shape [batch_size,]
            entropy:          float tensor of shape [batch_size,]
        """
        latent_means, latent_log_vars, encoder_out, _, _ = self.controller.encode(
            env_states, rec_hs, rec_cs)
        actor_logits = self.controller.actor(encoder_out)
        action_distributions = Categorical(
            logits=actor_logits
        )  # float tensor of shape [batch_size, num_actions]

        action_log_probs = action_distributions.log_prob(
            actions)  # float tensor of shape [batch_size,]
        entropy = action_distributions.entropy(
        )  # float tensor of shape [batch_size,]

        return latent_means, latent_log_vars, encoder_out, action_log_probs, entropy
Example #21
File: model.py  Project: rnv93/irlc-vqa
    def sample_action(self, probs, already_selected=None, greedy=False):
        # probs = (B, k+1)
        # already_selected = (num_timesteps, B)

        if already_selected is None:
            mask = 1
        else:
            mask = Variable(torch.ones(probs.size()))
            if USE_CUDA:
                mask = mask.cuda()
            mask = mask.scatter_(1, already_selected.t(), 0)  # (B, k+1)

        masked_probs = mask * (
            probs + 1e-20
        )  # (B, k+1), add epsilon to make sure no non-masked value is zero.
        dist = Categorical(probs=masked_probs)

        if greedy:
            _, a = masked_probs.max(dim=1)  # (B)
        else:
            a = dist.sample()  # (B)

        log_prob = dist.log_prob(a)  # (B)
        entropy = dist.entropy()  # (B)
        return a, log_prob, entropy
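The method above masks in probability space (multiply by 0/1 and add a small epsilon). An alternative sketch, not part of the original, masks in log space so excluded entries get exactly zero probability without any renormalization tricks:

import torch
from torch.distributions import Categorical

def masked_categorical(probs, mask):
    # mask: 1 = selectable, 0 = already selected / excluded
    logits = torch.log(probs + 1e-20)
    logits = logits.masked_fill(mask == 0, float('-inf'))
    return Categorical(logits=logits)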
Example #22
    def train_model(self, s_batch, target_batch, y_batch, adv_batch):
        s_batch = torch.FloatTensor(s_batch)
        target_batch = torch.FloatTensor(target_batch)
        y_batch = torch.LongTensor(y_batch)
        adv_batch = torch.FloatTensor(adv_batch)

        # for multiply advantage
        policy, value = self.model(s_batch)
        m = Categorical(policy)

        # mse = nn.SmoothL1Loss()
        mse = nn.MSELoss()

        # Actor loss
        actor_loss = -m.log_prob(y_batch) * adv_batch.sum(1)

        # Entropy(for more exploration)
        entropy = m.entropy()
        # Critic loss
        critic_loss = mse(value, target_batch)

        # Total loss
        loss = actor_loss.mean() + 0.5 * critic_loss - 0.01 * entropy.mean()
        self.optimizer.zero_grad()
        loss.backward()

        self.optimizer.step()
Example #23
 def get_action_and_value(self, x, lstm_state, done, action=None):
     hidden, lstm_state = self.get_states(x, lstm_state, done)
     logits = self.actor(hidden)
     probs = Categorical(logits=logits)
     if action is None:
         action = probs.sample()
     return action, probs.log_prob(action), probs.entropy(), self.critic(
         hidden), lstm_state
Example #24
    def evaluate(self, state, action):
        action_probs = self.action_layer(state)
        dist = Categorical(action_probs)
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_value = self.value_layer(state)

        return action_logprobs, state_value, dist_entropy
Example #25
 def act(self, s, action=None):
     prob, v = self.forward(s)
     dist = Categorical(prob)
     if action is None:
         action = dist.sample()
     log_prob = dist.log_prob(action)
     entropy = dist.entropy()
     return action, log_prob, entropy, v.squeeze()
Example #26
    def train_model(self, observations_tensor, ext_returns_tensor,
                    int_returns_tensor, actions_tensor, advantages_tensor,
                    one_channel_observations_tensor, old_log_prob):

        if flag.DEBUG:
            print("input observations shape", observations_tensor.shape)
            print("ext returns shape", ext_returns_tensor.shape)
            print("int returns shape", int_returns_tensor.shape)
            print("input actions shape", actions_tensor.shape)
            print("input advantages shape", advantages_tensor.shape)
            print("one channel observations",
                  one_channel_observations_tensor.shape)

        self.new_model.train()
        self.predictor_model.train()
        target_value = self.target_model(one_channel_observations_tensor)
        predictor_value = self.predictor_model(one_channel_observations_tensor)
        predictor_loss = self.predictor_mse_loss(predictor_value,
                                                 target_value).mean(-1)

        mask = torch.rand(len(predictor_loss)).to(self.device)
        mask = (mask < self.predictor_update_proportion).type(
            torch.FloatTensor).to(self.device)
        predictor_loss = (predictor_loss * mask).sum() / torch.max(
            mask.sum(),
            torch.Tensor([1]).to(self.device))
        new_policy, ext_new_values, int_new_values = self.new_model(
            observations_tensor)
        ext_value_loss = self.mse_loss(ext_new_values, ext_returns_tensor)
        int_value_loss = self.mse_loss(int_new_values, int_returns_tensor)
        value_loss = ext_value_loss + int_value_loss
        softmax_policy = F.softmax(new_policy, dim=1)
        new_dist = Categorical(softmax_policy)
        new_log_prob = new_dist.log_prob(actions_tensor)

        ratio = torch.exp(new_log_prob - old_log_prob)

        clipped_policy_loss = torch.clamp(ratio, 1.0 - self.clip_range,
                                          1 + self.clip_range) \
                                          * advantages_tensor
        policy_loss = ratio * advantages_tensor

        selected_policy_loss = -torch.min(clipped_policy_loss,
                                          policy_loss).mean()
        entropy = new_dist.entropy().mean()
        self.optimizer.zero_grad()

        loss = selected_policy_loss + (self.value_coef * value_loss) \
            - (self.entropy_coef * entropy) + predictor_loss
        loss.backward()

        global_grad_norm_(
            list(self.new_model.parameters()) +
            list(self.predictor_model.parameters()))

        self.optimizer.step()
        return loss, selected_policy_loss, value_loss, predictor_loss, entropy
Example #27
    def evaluate(self, state, action):
        action_probs = self.action_layer(state)
        dist = Categorical(action_probs)
        actInx = torch.argmax(action, dim=1)
        action_logprobs = dist.log_prob(actInx)
        dist_entropy = dist.entropy()
        state_value = self.value_layer(state)

        return action_logprobs, torch.squeeze(state_value), dist_entropy
Example #28
    def forward(self, class_ids, determine_sample=False):
        '''
    https://github.com/melodyguan/enas/blob/master/src/cifar10/general_controller.py#L126
    '''
        h0 = None  # setting h0 to None will initialize LSTM state with 0s
        arc_seq = []
        entropys = []
        log_probs = []
        if isinstance(class_ids, int):
            class_ids = [class_ids]
        if isinstance(class_ids, list):
            class_ids = torch.tensor(class_ids, dtype=torch.int64)
        class_ids = class_ids.type(torch.int64)
        inputs = self.g_emb.weight[class_ids]

        for layer_id in range(self.num_layers):
            if self.search_whole_channels:
                inputs = inputs.unsqueeze(dim=0)
                output, hn = self.w_lstm(inputs, h0)
                output = output.squeeze(dim=0)
                h0 = hn

                logit = self.w_soft(output)
                if self.temperature > 0:
                    logit /= self.temperature
                if self.tanh_constant is not None:
                    logit = self.tanh_constant * torch.tanh(logit)

                branch_id_dist = Categorical(logits=logit)
                if determine_sample:
                    branch_id = logit.argmax(dim=1)
                else:
                    branch_id = branch_id_dist.sample()

                arc_seq.append(branch_id)

                log_prob = branch_id_dist.log_prob(branch_id)
                log_probs.append(log_prob.view(-1))
                entropy = branch_id_dist.entropy()
                entropys.append(entropy.view(-1))

            else:
                # https://github.com/melodyguan/enas/blob/master/src/cifar10/general_controller.py#L171
                assert False, "Not implemented error: search_whole_channels = False"

            # Calculate average of class and branch embedding
            # and use it as input for next step
            inputs = self.w_emb(branch_id) + self.g_emb.weight[class_ids]
            inputs /= 2

        self.sample_arc = torch.stack(arc_seq, dim=1)

        self.sample_entropy = torch.stack(entropys, dim=1)

        self.sample_log_prob = torch.stack(log_probs, dim=1)
        self.sample_prob = self.sample_log_prob.exp()
Example #29
class OneHotCategorical(Distribution):
    r"""
    Creates a one-hot categorical distribution parameterized by `probs`.

    Samples are one-hot coded vectors of size probs.size(-1).

    See also: :func:`torch.distributions.Categorical`

    Example::

        >>> m = OneHotCategorical(torch.Tensor([ 0.25, 0.25, 0.25, 0.25 ]))
        >>> m.sample()  # equal probability of 0, 1, 2, 3
         0
         0
         1
         0
        [torch.FloatTensor of size 4]

    Args:
        probs (Tensor or Variable): event probabilities
    """
    params = {'probs': constraints.simplex}
    support = constraints.simplex
    has_enumerate_support = True

    def __init__(self, probs=None, logits=None):
        self._categorical = Categorical(probs, logits)
        batch_shape = self._categorical.probs.size()[:-1]
        event_shape = self._categorical.probs.size()[-1:]
        super(OneHotCategorical, self).__init__(batch_shape, event_shape)

    def sample(self, sample_shape=torch.Size()):
        sample_shape = torch.Size(sample_shape)
        probs = self._categorical.probs
        one_hot = probs.new(self._extended_shape(sample_shape)).zero_()
        indices = self._categorical.sample(sample_shape)
        if indices.dim() < one_hot.dim():
            indices = indices.unsqueeze(-1)
        return one_hot.scatter_(-1, indices, 1)

    def log_prob(self, value):
        indices = value.max(-1)[1]
        return self._categorical.log_prob(indices)

    def entropy(self):
        return self._categorical.entropy()

    def enumerate_support(self):
        probs = self._categorical.probs
        n = self.event_shape[0]
        if isinstance(probs, Variable):
            values = Variable(torch.eye(n, out=probs.data.new(n, n)))
        else:
            values = torch.eye(n, out=probs.new(n, n))
        values = values.view((n,) + (1,) * len(self.batch_shape) + (n,))
        return values.expand((n,) + self.batch_shape + (n,))
Example #30
class OneHotCategorical(Distribution):
    r"""
    Creates a one-hot categorical distribution parameterized by `probs`.

    Samples are one-hot coded vectors of size probs.size(-1).

    See also: :func:`torch.distributions.Categorical`

    Example::

        >>> m = OneHotCategorical(torch.Tensor([ 0.25, 0.25, 0.25, 0.25 ]))
        >>> m.sample()  # equal probability of 0, 1, 2, 3
         0
         0
         1
         0
        [torch.FloatTensor of size 4]

    Args:
        probs (Tensor or Variable): event probabilities
    """
    params = {'probs': constraints.simplex}
    support = constraints.simplex
    has_enumerate_support = True

    def __init__(self, probs):
        self._categorical = Categorical(probs)
        batch_shape = probs.size()[:-1]
        event_shape = probs.size()[-1:]
        super(OneHotCategorical, self).__init__(batch_shape, event_shape)

    def sample(self, sample_shape=torch.Size()):
        sample_shape = torch.Size(sample_shape)
        probs = self._categorical.probs
        one_hot = probs.new(self._extended_shape(sample_shape)).zero_()
        indices = self._categorical.sample(sample_shape)
        if indices.dim() < one_hot.dim():
            indices = indices.unsqueeze(-1)
        return one_hot.scatter_(-1, indices, 1)

    def log_prob(self, value):
        indices = value.max(-1)[1]
        return self._categorical.log_prob(indices)

    def entropy(self):
        return self._categorical.entropy()

    def enumerate_support(self):
        probs = self._categorical.probs
        n = self.event_shape[0]
        if isinstance(probs, Variable):
            values = Variable(torch.eye(n, out=probs.data.new(n, n)))
        else:
            values = torch.eye(n, out=probs.new(n, n))
        values = values.view((n,) + (1,) * len(self.batch_shape) + (n,))
        return values.expand((n,) + self.batch_shape + (n,))
Example #31
def __preprocess_ac_space_discrete(logits: torch.Tensor, ac_space: Space, stochastic=True, action=[]):
    probs = Categorical(logits=logits)
    if len(action) == 0:
        if stochastic:
            action = probs.sample()
        else:
            action = torch.argmax(probs.probs, dim=1)
    else:
        action = torch.LongTensor(action.astype(np.int64))  # np.int is removed in recent NumPy
    return probs, action.tolist(), -probs.log_prob(action), probs.entropy()
Example #32
class OneHotCategorical(Distribution):
    r"""
    Creates a one-hot categorical distribution parameterized by :attr:`probs` or
    :attr:`logits`.

    Samples are one-hot coded vectors of size ``probs.size(-1)``.

    .. note:: :attr:`probs` will be normalized to be summing to 1.

    See also: :func:`torch.distributions.Categorical` for specifications of
    :attr:`probs` and :attr:`logits`.

    Example::

        >>> m = OneHotCategorical(torch.tensor([ 0.25, 0.25, 0.25, 0.25 ]))
        >>> m.sample()  # equal probability of 0, 1, 2, 3
        tensor([ 0.,  0.,  0.,  1.])

    Args:
        probs (Tensor): event probabilities
        logits (Tensor): event log probabilities
    """
    arg_constraints = {'probs': constraints.simplex}
    support = constraints.simplex
    has_enumerate_support = True

    def __init__(self, probs=None, logits=None, validate_args=None):
        self._categorical = Categorical(probs, logits)
        batch_shape = self._categorical.batch_shape
        event_shape = self._categorical.param_shape[-1:]
        super(OneHotCategorical, self).__init__(batch_shape, event_shape, validate_args=validate_args)

    def _new(self, *args, **kwargs):
        return self._categorical._new(*args, **kwargs)

    @property
    def probs(self):
        return self._categorical.probs

    @property
    def logits(self):
        return self._categorical.logits

    @property
    def mean(self):
        return self._categorical.probs

    @property
    def variance(self):
        return self._categorical.probs * (1 - self._categorical.probs)

    @property
    def param_shape(self):
        return self._categorical.param_shape

    def sample(self, sample_shape=torch.Size()):
        sample_shape = torch.Size(sample_shape)
        probs = self._categorical.probs
        one_hot = probs.new(self._extended_shape(sample_shape)).zero_()
        indices = self._categorical.sample(sample_shape)
        if indices.dim() < one_hot.dim():
            indices = indices.unsqueeze(-1)
        return one_hot.scatter_(-1, indices, 1)

    def log_prob(self, value):
        if self._validate_args:
            self._validate_sample(value)
        indices = value.max(-1)[1]
        return self._categorical.log_prob(indices)

    def entropy(self):
        return self._categorical.entropy()

    def enumerate_support(self):
        n = self.event_shape[0]
        values = self._new((n, n))
        torch.eye(n, out=values)
        values = values.view((n,) + (1,) * len(self.batch_shape) + (n,))
        return values.expand((n,) + self.batch_shape + (n,))
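A short usage sketch (illustrative values only) showing how the one-hot distribution above relates to the plain Categorical from torch.distributions:

import torch
from torch.distributions import Categorical, OneHotCategorical

probs = torch.tensor([0.1, 0.2, 0.3, 0.4])
m = OneHotCategorical(probs)
x = m.sample()                       # one-hot vector, e.g. tensor([0., 0., 1., 0.])
print(m.log_prob(x))                 # equals Categorical(probs).log_prob(x.argmax(-1))
print(m.entropy())                   # same entropy as the underlying Categorical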