Example #1
    def policy_improvement(self):
        new_policy = np.empty([GRID_HEIGHT, GRID_WIDTH, self.env.NUM_ACTIONS])

        is_policy_stable = True

        # Build the action-value function
        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                if (i, j) in TERMINAL_STATES:
                    for action in self.env.ACTIONS:
                        new_policy[i][j][action] = 0.00
                else:
                    q_func = []
                    for action in self.env.ACTIONS:
                        (next_i, next_j), reward, prob = self.env.get_state_action_probability(
                            state=(i, j), action=action)
                        q_func.append(
                            prob * (reward + DISCOUNT_RATE * self.state_values[next_i, next_j]))

                    new_policy[i, j, :] = softmax(q_func)

        error = np.sum(np.absolute(self.policy - new_policy))

        if error > THETA_2:
            is_policy_stable = False

        self.policy = new_policy

        return is_policy_stable, error
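
Note on the update above: Example #1 (and similarly Examples #2, #7 and #9 below) stores a stochastic softmax (Boltzmann) policy over the action values rather than a hard greedy argmax. In the notation of the code, the per-state policy written into new_policy is

    \pi(a \mid s) = \frac{\exp(q(s,a))}{\sum_{b}\exp(q(s,b))}, \qquad q(s,a) = \text{prob}\cdot\big(\text{reward} + \text{DISCOUNT\_RATE}\cdot V(\text{next\_state})\big),

and the improvement step is considered stable once the summed absolute change between successive policies does not exceed THETA_2.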
Example #2
def generate_greedy_policy(env, state_action_values, policy):
    new_policy = dict()

    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            actions = []
            action_probs = []
            if (i, j) in TERMINAL_STATES:
                for action in env.ACTIONS:
                    actions.append(action)
                    action_probs.append(0.25)
                new_policy[(i, j)] = (actions, action_probs)
            else:
                for action in env.ACTIONS:
                    actions.append(action)
                    action_probs.append(state_action_values[i, j, action])
                new_policy[(i, j)] = (actions, softmax(action_probs))

    error = 0.0
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            error += np.sum(
                np.absolute(
                    np.array(policy[(i, j)][1]) - np.array(new_policy[(i, j)][1])
                )
            )

    return new_policy, error
Example #3
 def act(self):
     input = [np.expand_dims(self.exp['s0'], axis=0)]
     qvals = self.critic.qvals(input)[0].squeeze()
     # for i, f in enumerate(self.env.feat):
     #     self.stats['qval'+str(f)] += np.mean(np.squeeze(qvals[i]))
     action = np.random.choice(range(self.env.action_dim),
                               p=softmax(qvals[self.env.idx], theta=1))
     action = np.expand_dims(action, axis=1)
     self.exp['a'] = action
     return action
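
The softmax(x, theta=..., axis=...) helper that these snippets call is not included in the excerpts, and it is not scipy.special.softmax (which accepts axis but no temperature argument). Below is a minimal sketch of a compatible helper, assuming theta acts as an inverse-temperature factor applied before normalization; the name, signature, and defaults mirror the call sites above but are assumptions, not the original implementation.

import numpy as np

def softmax(x, theta=1.0, axis=None):
    # Numerically stable softmax with inverse-temperature scaling factor theta (assumed behavior).
    x = np.asarray(x, dtype=np.float64)
    if axis is None:
        axis = x.ndim - 1                            # default to the last axis
    z = x * theta                                    # theta > 1 sharpens, theta < 1 flattens
    z = z - np.max(z, axis=axis, keepdims=True)      # subtract the max for numerical stability
    e = np.exp(z)
    return e / np.sum(e, axis=axis, keepdims=True)

With this sketch, softmax(q_func) in Example #1 normalizes a 1-D list of action values, while softmax(qvals, theta=1, axis=1) in Examples #6 and #8 normalizes each row of a batch of Q-values along the action dimension.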
Example #4
    def generate_greedy_policy(self, state):
        actions = []
        q_values = []
        for action in range(NUM_ACTIONS):
            actions.append(action)
            q_values.append(self.state_action_values[(state, action)])

        assert not np.any(np.isnan(q_values)), q_values

        self.target_policy[state] = (actions, softmax(q_values))
Example #5
 def act(self):
     v = self.env.vs[self.env.idx]
     input = [np.expand_dims(i, axis=0) for i in [self.exp['s0'], v, v]]
     qvals = self.critic.qvals(input)[0].squeeze()
     self.stats['qval' + str(self.env.feat[self.env.idx])] += np.mean(
         np.squeeze(qvals))
     action = np.random.choice(range(self.env.action_dim),
                               p=softmax(qvals, theta=1))
     action = np.expand_dims(action, axis=1)
     self.exp['a'] = action
     return action
Example #6
 def get_targets_dqn(self, s, g, v, r=None):
     qvals = self.qvals([s, g, v])[0]
     probs = softmax(qvals, theta=1, axis=1)
     actions = [np.random.choice(range(self.env.action_dim), p=prob) for prob in probs]
     a1 = np.expand_dims(np.array(actions), axis=1)
     q = self.Tqval([s, g, v, a1])[0]
     if r is None:
         r = self.env.get_r(s, g, v)
     t = (r == self.env.R)
     targets = r + (1 - t) * self.gamma * q
     targets = np.clip(targets, 0, self.env.R)
     return targets
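
Reading get_targets_dqn above: the batch s passed in is presumably the next-state batch, the next action a1 is sampled from a softmax over the online Q-values, and the bootstrap value comes from the target network Tqval. With the terminal indicator t = 1 whenever r equals the terminal reward self.env.R, the computed target is

    y = \mathrm{clip}\big(r + (1 - \mathbf{1}[r = R])\,\gamma\,Q_{\mathrm{target}}(s, a_1),\; 0,\; R\big), \qquad a_1 \sim \mathrm{softmax}(Q(s, \cdot)),

so terminal transitions contribute only their reward, and all targets are clipped into the valid return range [0, R]. Example #8 below does the same, with a per-sample task index selecting which head of the Q-network to use.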
Example #7
    def policy_setup(self):
        # Build the action-value function
        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                if (i, j) in TERMINAL_STATES:
                    for action in self.env.action_space.ACTIONS:
                        self.policy[i][j][action] = 0.0
                else:
                    q_func = []
                    for action in self.env.action_space.ACTIONS:
                        (next_i, next_j), reward, prob = self.env.get_state_action_probability(
                            state=(i, j), action=action)
                        q_func.append(
                            prob * (reward + DISCOUNT_RATE * self.state_values[next_i, next_j])
                        )

                    self.policy[i][j] = softmax(q_func)
Example #8
 def get_targets_dqn(self, s, task, r):
     qvals = self.qvals([s])[0]
     batchsize, _, numactions = qvals.shape
     qvals_for_task = qvals[np.arange(batchsize)[:, np.newaxis], task,
                            np.arange(numactions)]
     probs = softmax(qvals_for_task, theta=1, axis=1)
     actions = [
         np.random.choice(range(self.env.action_dim), p=prob)
         for prob in probs
     ]
     a1 = np.expand_dims(np.array(actions), axis=1)
     q = self.targetqval([s, task, a1])[0]
     t = (r == self.env.R)
     targets = r + (1 - t) * self.gamma * q.squeeze()
     targets = np.clip(targets, 0, self.env.R)
     return np.expand_dims(targets, axis=1)
Example #9
    def policy_improvement(self):
        new_policy = dict()

        is_policy_stable = True

        # Build the action-value function
        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                if (i, j) in TERMINAL_STATES:
                    actions = []
                    action_probs = []
                    for action in range(self.env.action_space.num_actions):
                        actions.append(action)
                        action_probs.append(0.25)
                    new_policy[(i, j)] = (actions, action_probs)
                else:
                    actions = []
                    q_func = []
                    for action in self.env.action_space.ACTIONS:
                        actions.append(action)
                        (next_i, next_j), reward, prob = self.env.get_state_action_probability(
                            state=(i, j), action=action)
                        q_func.append(
                            prob * (reward + DISCOUNT_RATE * self.state_values[next_i, next_j]))

                    new_policy[(i, j)] = (actions, softmax(q_func))

        error = 0.0
        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                error += np.sum(
                    np.absolute(
                        np.array(self.policy[(i, j)][1]) -
                        np.array(new_policy[(i, j)][1])))

        if error > THETA_2:
            is_policy_stable = False

        self.policy = new_policy

        return is_policy_stable, error
    def generate_greedy_policy(self):
        new_policy = dict()

        is_policy_stable = True

        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                if (i, j) in TERMINAL_STATES:
                    actions = []
                    action_probs = []
                    for action in range(self.env.action_space.num_actions):
                        actions.append(action)
                        action_probs.append(0.25)
                    new_policy[(i, j)] = (actions, action_probs)
                else:
                    actions = []
                    q_values = []
                    for action in self.env.action_space.ACTIONS:
                        actions.append(action)
                        q_values.append(self.state_action_values[((i, j),
                                                                  action)])

                    new_policy[(i, j)] = (actions, softmax(q_values))

        error = 0.0
        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                error += np.sum(
                    np.absolute(
                        np.array(self.policy[(i, j)][1]) -
                        np.array(new_policy[(i, j)][1])))

        if error > THETA_2:
            is_policy_stable = False

        self.policy = new_policy

        return is_policy_stable, error
    def _run_per_eps(self,
                     stepsize,
                     n_it,
                     eps_max,
                     abort_early,
                     random_start=True,
                     random_start_perlin=True,
                     noise_on_it_scale=12,
                     n_grad_samples=8,
                     momentum=0.0,
                     print_details=False):
        """
        FGM inner loop.
        :param stepsize: How far along the gradient we should move at each step.
        :param n_it: Number of steps.
        :param eps_max: If the norm of the cumulative perturbation is larger than this, clip it to this value.
        :param print_details: If True, print per-iteration prediction and distance diagnostics.
        :return: UNROUNDED img and dist.
        """

        cum_gradient = np.zeros(self.original_image.shape, dtype=np.float32)

        if random_start:
            if random_start_perlin:
                noise_eps = np.random.uniform(0.01, 3 * stepsize)
                x = self.original_image + np.float32(
                    noise_eps * self.sample_gen.get_perlin())
            else:
                x = self.original_image + np.float32(
                    self.sample_gen.get_normal() * stepsize)
        else:
            x = np.copy(self.original_image)

        x_best = None  # WARN: this is not rounded! In the future, we might do some sidestepping.
        dist_best = 9999.

        for it in range(n_it):

            x_prev = np.copy(x)

            # Take multiple samples of the gradient and average them.
            if noise_on_it_scale > 0:

                samples = np.empty(
                    (n_grad_samples, ) + self.original_image.shape,
                    dtype=np.float32)

                for i in range(n_grad_samples):
                    # Add noise to image. TODO: Change this to EOT - countering filters and transforms
                    if random_start_perlin:
                        noise_eps = np.float32(
                            np.random.uniform(-noise_on_it_scale,
                                              noise_on_it_scale))
                        samples[i] = x + noise_eps * self.sample_gen.get_perlin()
                    else:
                        samples[i] = x + np.float32(
                            self.sample_gen.get_normal()) * noise_on_it_scale

                # Get gradients in a batch, if possible. This is really slow otherwise.
                if self.batch_sub_model is not None:
                    gradient_samples = self.batch_sub_model.gradient(
                        samples, [self.label] * n_grad_samples)
                else:
                    gradient_samples = np.zeros(
                        (n_grad_samples, ) + self.original_image.shape,
                        dtype=np.float32)
                    for i in range(n_grad_samples):
                        # Get misclassification gradient.
                        gradient_samples[i] = self.model.gradient(
                            samples[i], self.label)

                gradient = np.mean(gradient_samples, axis=0)
            else:
                if self.batch_sub_model is not None:
                    gradient = self.batch_sub_model.gradient(
                        x[np.newaxis, :], [self.label])[0, ...]
                else:
                    gradient = self.model.gradient(x, self.label)

            if self.is_targeted:
                gradient = -gradient

            # Norm gradient to L2 distance.
            # g_norm = np.mean(np.abs(gradient))                                 # Gradient is "old school" L1 normed
            # g_norm = np.sqrt(np.vdot(gradient, gradient) / gradient.size)      # Gradient is "old school" L2 normed
            g_norm = np.linalg.norm(gradient / 255.)  # The evaluation L2 norm (seems to work best)
            # print("DEBUG: gradient norm = {}".format(g_norm))
            gradient /= g_norm

            # Add previous gradients (momentum)
            cum_gradient = momentum * cum_gradient + gradient
            norm_cum_gradient = cum_gradient / np.linalg.norm(
                cum_gradient / 255.)

            # Add perturbation to image.
            x = x + stepsize * norm_cum_gradient

            # Normalize the (cumulative) perturbation to be of size eps. Will only scale downward, never upward.
            perturb_total = x - self.original_image
            pert_norm = _l2_norm(perturb_total)
            if pert_norm > eps_max:
                perturb_total = (perturb_total / pert_norm) * eps_max
            x = self.original_image + perturb_total

            # Round and clip the image to valid uint8 values, since the rounded image is what gets evaluated.
            x_rounded = np.clip(np.round(x), 0, 255)
            if np.sum(np.abs(x - x_prev)) < 1e-3:
                print(
                    "WARN: Rounded/clipped img is identical to previous one!")

            # Test if adversarial.
            dist = _l2_dist(x_rounded, self.original_image)
            msg = "Trying at L2={:.3f}.".format(dist)
            pred = self.model.predictions(x_rounded)
            pred_clsid = np.argmax(pred)
            if (pred_clsid == self.label) == self.is_targeted:
                msg += " Success!"
                if dist < dist_best:
                    dist_best = dist
                    x_best = np.copy(x)
            if print_details:
                print(msg)
                pred_self = self.batch_sub_model.batch_predictions(
                    x_rounded[np.newaxis, :])[0]
                pred_self_softmax = softmax(pred_self)
                labels = np.argsort(pred_self)[::-1]
                label_other = labels[0]
                if label_other == self.label:
                    label_other = labels[1]
                pred_self_highest = pred_self[label_other]
                pred_self_highest_softmax = pred_self_softmax[label_other]

                print(
                    "Own model reports target probability of {:.6f} (logit: {:.6f}), other is {:.6f} (logit: {:.6f})"
                    .format(pred_self_softmax[self.label],
                            pred_self[self.label], pred_self_highest_softmax,
                            pred_self_highest))

            if abort_early and dist_best < 9999:
                break

        return x_best, dist_best
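
For reference, the loop in _run_per_eps implements a momentum-style iterative FGM under an L2 budget: the (optionally noise-averaged) misclassification gradient (negated for targeted attacks) is normalized, accumulated into cum_gradient with weight momentum, a step of size stepsize is taken along the normalized accumulator, and the total perturbation is projected back onto the L2 ball of radius eps_max around the original image,

    g_t = \mu\, g_{t-1} + \frac{\nabla_x L(x_t)}{\lVert \nabla_x L(x_t) \rVert}, \qquad
    x_{t+1} = \Pi_{\lVert x - x_0 \rVert_2 \le \epsilon_{\max}}\big(x_t + \alpha\, g_t / \lVert g_t \rVert\big),

where the norms in the code are taken of the gradient divided by 255, which only rescales the effective step size. Each candidate is rounded to uint8 before being tested against the model, and the best (smallest-distance) unrounded adversarial iterate is returned.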