Example #1
def eval_action_cert_rate(curr_model, env, args, epsilon=1e-4):
    episode_reward = 0
    state = env.reset()
    total = 0
    certified = 0
    with torch.no_grad():
        while True:
            input_x = torch.FloatTensor(state).unsqueeze(0)
            if args.gpu_id >= 0:
                with torch.cuda.device(args.gpu_id):
                    input_x = input_x.cuda()
            output = curr_model.forward(input_x)
            action = torch.argmax(output, dim=1)

            upper, lower = network_bounds(curr_model.model,
                                          input_x,
                                          epsilon=epsilon)
            # exclude the selected action, then compare its lower bound
            # against the largest upper bound among all other actions
            upper[:, action] = -1e10

            max_other = torch.max(upper, dim=1)[0]
            # the action is certified if no perturbation within epsilon
            # can make another action's Q-value exceed it
            if lower[:, action] > max_other:
                certified += 1
            total += 1

            next_state, reward, done, info = env.step(action)
            episode_reward += reward
            state = next_state
            # keep playing until info signals the end of the full episode
            if done and not info:
                state = env.reset()
            elif info:
                state = env.reset()
                print('Reward: {}, action certification rate {:.4f}'.format(
                    episode_reward, certified / total))
                return certified / total
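
All of the examples on this page rely on a network_bounds(model, x, epsilon) helper that is not shown here; from the call sites it returns element-wise (upper, lower) bounds on the model's outputs over an L-infinity ball of radius epsilon around the input. The following is only a minimal sketch of such a helper using interval bound propagation over an nn.Sequential of Linear and ReLU layers; the actual implementation used by these examples may differ (for instance, it would also need to handle convolutional and flatten layers).

import torch
import torch.nn as nn

def network_bounds(model, x, epsilon):
    """Propagate the interval [x - epsilon, x + epsilon] through the network
    and return element-wise (upper, lower) bounds on its outputs."""
    upper = x + epsilon
    lower = x - epsilon
    for layer in model:
        if isinstance(layer, nn.Linear):
            # split the weight into positive and negative parts so each
            # interval endpoint is multiplied by the matching bound
            w_pos = layer.weight.clamp(min=0)
            w_neg = layer.weight.clamp(max=0)
            new_upper = upper @ w_pos.t() + lower @ w_neg.t() + layer.bias
            new_lower = lower @ w_pos.t() + upper @ w_neg.t() + layer.bias
            upper, lower = new_upper, new_lower
        elif isinstance(layer, nn.ReLU):
            # ReLU is monotone, so it can be applied to both endpoints
            upper, lower = upper.clamp(min=0), lower.clamp(min=0)
        else:
            raise NotImplementedError(type(layer))
    return upper, lower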
Example #2
def eval_greedy_wc(curr_model, env, args, epsilon=1e-4):
    episode_reward = 0
    state = env.reset()

    with torch.no_grad():
        while True:
            input_x = torch.FloatTensor(state).unsqueeze(0)
            if args.gpu_id >= 0:
                with torch.cuda.device(args.gpu_id):
                    input_x = input_x.cuda()
            output = curr_model.forward(input_x)

            upper, lower = network_bounds(curr_model.model,
                                          input_x,
                                          epsilon=epsilon)
            # an action is impossible if its Q-value upper bound is below
            # the best lower bound over all actions
            impossible = upper < torch.max(lower, dim=1)[0]
            # add a large penalty to mask impossible actions, then pick the
            # possible action with the smallest Q-value
            worst_case_action = torch.argmin(output + 1e6 * impossible, dim=1)
            next_state, reward, done, info = env.step(worst_case_action[0])
            episode_reward += reward
            state = next_state
            if done and not info:
                state = env.reset()
            elif info:
                state = env.reset()
                print('Worst case reward {}'.format(episode_reward))
                return episode_reward
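
A toy illustration, with made-up numbers, of the masking logic above: an action is ruled out when its Q-value upper bound falls below the best lower bound, and the adversary then picks the remaining action with the smallest nominal Q-value.

import torch

output = torch.tensor([[1.5, 0.5, 0.3]])   # nominal Q-values
lower = torch.tensor([[1.0, 0.2, -0.5]])   # certified lower bounds
upper = torch.tensor([[2.0, 0.8, 1.5]])    # certified upper bounds

# action 1 is impossible: its upper bound 0.8 is below the best lower bound 1.0
impossible = upper < torch.max(lower, dim=1)[0]
worst_case_action = torch.argmin(output + 1e6 * impossible, dim=1)
print(worst_case_action)  # tensor([2]): the lowest Q-value among possible actions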
Example #3
    def action_test_losses(self, bound_epsilon=None):
        with torch.no_grad():
            value, logit = self.model(Variable(self.state.unsqueeze(0)))
            prob = torch.clamp(F.softmax(logit, dim=1), 1e-6, 1)
            log_prob = torch.clamp(F.log_softmax(logit, dim=1), -30, -1e-6)
            entropy = -(log_prob * prob).sum(1)
            self.entropies.append(entropy)

            action = prob.argmax(1, keepdim=True).data

            if bound_epsilon:
                upper, lower = network_bounds(self.model.model,
                                              Variable(
                                                  self.state.unsqueeze(0)),
                                              epsilon=bound_epsilon)
                # drop the first output column, keeping bounds on the action logits
                upper, lower = upper[:, 1:], lower[:, 1:]
                if self.gpu_id >= 0:
                    with torch.cuda.device(self.gpu_id):
                        onehot_action = torch.zeros(upper.shape).cuda()
                else:
                    onehot_action = torch.zeros(upper.shape)
                onehot_action[range(upper.shape[0]), action] = 1
                min_prob = torch.clamp(
                    F.log_softmax(onehot_action * lower +
                                  (1 - onehot_action) * upper,
                                  dim=1), -30, -1e-6)
                max_prob = torch.clamp(
                    F.log_softmax(
                        (1 - onehot_action) * lower + onehot_action * upper,
                        dim=1), -30, -1e-6)

                self.max_log_probs.append(max_prob.gather(1, Variable(action)))
                self.min_log_probs.append(min_prob.gather(1, Variable(action)))

            log_prob = log_prob.gather(1, Variable(action))
            state, self.noclip_reward, self.done, self.info = self.env.step(
                action.cpu().numpy())
            self.reward = max(min(self.noclip_reward, 1), -1)
            self.state = torch.from_numpy(state).float()
            if self.gpu_id >= 0:
                with torch.cuda.device(self.gpu_id):
                    self.state = self.state.cuda()

            self.values.append(value)
            self.log_probs.append(log_prob)
            self.rewards.append(self.reward)
            self.eps_len += 1
        return self
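
A small worked example, with hypothetical numbers, of how the min/max log-probabilities above are formed: the minimum probability of the chosen action combines its own lower bound with the other actions' upper bounds, and the maximum probability does the opposite.

import torch
import torch.nn.functional as F

lower = torch.tensor([[1.0, -0.5, 0.0]])    # bounds on the action logits
upper = torch.tensor([[2.0,  0.5, 1.0]])
onehot = torch.tensor([[1.0, 0.0, 0.0]])     # chosen action = 0

min_logits = onehot * lower + (1 - onehot) * upper   # [[1.0, 0.5, 1.0]]
max_logits = (1 - onehot) * lower + onehot * upper   # [[2.0, -0.5, 0.0]]
min_prob = F.softmax(min_logits, dim=1)[0, 0]        # ~0.38
max_prob = F.softmax(max_logits, dim=1)[0, 0]        # ~0.82
print(min_prob.item() <= max_prob.item())            # True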
Example #4
def _compute_robust_loss(curr_model, target_model, data, epsilon, kappa, gamma,
                         device, args):
    state, action, reward, next_state, done = data

    q_values = curr_model(state)
    next_q_values = curr_model(next_state)
    next_q_state_values = target_model(next_state)

    q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
    next_q_value = next_q_state_values.gather(
        1, torch.argmax(next_q_values, 1, keepdim=True)).squeeze(1)
    expected_q_value = reward + gamma * next_q_value * (1 - done)

    # Huber-style TD loss: squared error for small differences,
    # absolute error for large ones
    standard_loss = torch.min((q_value - expected_q_value.detach()).pow(2),
                              torch.abs(q_value - expected_q_value.detach()))

    upper, lower = network_bounds(curr_model.model, state, epsilon)
    onehot_labels = torch.zeros(upper.shape).to(device)
    onehot_labels[range(state.shape[0]), action] = 1

    if args.worse_bound:
        # for each output, take whichever of the two bounds deviates
        # more from the regression target
        upper_diff = upper - q_values * (
            1 - onehot_labels
        ) - expected_q_value.detach().unsqueeze(1) * onehot_labels
        lower_diff = lower - q_values * (
            1 - onehot_labels
        ) - expected_q_value.detach().unsqueeze(1) * onehot_labels
        wc_diff = torch.max(torch.abs(upper_diff), torch.abs(lower_diff))
    else:
        # worst-case output: lower bound for the taken action,
        # upper bound for every other action
        worst_case = onehot_labels * lower + (1 - onehot_labels) * upper
        wc_diff = torch.abs(worst_case - q_values * (1 - onehot_labels) -
                            expected_q_value.detach().unsqueeze(1) *
                            onehot_labels)

    # sum over the output dimension, mean only over the batch dimension
    worst_case_loss = torch.sum(torch.min(wc_diff.pow(2), wc_diff),
                                dim=1).mean()

    standard_loss = standard_loss.mean()

    loss = (kappa * (standard_loss) + (1 - kappa) * (worst_case_loss))

    return loss, standard_loss, worst_case_loss
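
A hypothetical driver showing how _compute_robust_loss might be wired into a DQN-style update step. The replay buffer (and its sample method), optimizer and models below are assumptions used only for this sketch; kappa interpolates between the standard TD loss and the worst-case loss.

def robust_update(curr_model, target_model, optimizer, replay_buffer,
                  epsilon, kappa, gamma, device, args, batch_size=32):
    # data = (state, action, reward, next_state, done), each a batched tensor
    data = replay_buffer.sample(batch_size)                 # hypothetical API
    data = tuple(t.to(device) for t in data)
    loss, standard_loss, worst_case_loss = _compute_robust_loss(
        curr_model, target_model, data, epsilon, kappa, gamma, device, args)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return standard_loss.item(), worst_case_loss.item()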