def policy_improvement(self):
    new_policy = np.empty([GRID_HEIGHT, GRID_WIDTH, self.env.NUM_ACTIONS])
    is_policy_stable = True

    # Build the action-value function for every state.
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            if (i, j) in TERMINAL_STATES:
                for action in self.env.ACTIONS:
                    new_policy[i][j][action] = 0.00
            else:
                q_func = []
                for action in self.env.ACTIONS:
                    (next_i, next_j), reward, prob = self.env.get_state_action_probability(
                        state=(i, j), action=action)
                    q_func.append(prob * (reward + DISCOUNT_RATE * self.state_values[next_i, next_j]))
                new_policy[i, j, :] = softmax(q_func)

    error = np.sum(np.absolute(self.policy - new_policy))

    if error > THETA_2:
        is_policy_stable = False

    self.policy = new_policy

    return is_policy_stable, error
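# Note: every snippet in this section relies on a `softmax` helper that is not
# shown here. Below is a minimal sketch of such a helper, assuming the
# signature softmax(x, theta=1.0, axis=None) implied by the calls above
# (inverse-temperature factor `theta`, optional `axis`); the original projects
# may implement it differently.
import numpy as np

def softmax(x, theta=1.0, axis=None):
    """Numerically stable softmax with an inverse-temperature factor."""
    x = np.asarray(x, dtype=np.float64) * theta
    x = x - np.max(x, axis=axis, keepdims=True)   # shift for numerical stability
    e = np.exp(x)
    return e / np.sum(e, axis=axis, keepdims=True)

# Example usage: turn Q-values into action probabilities and sample an action.
# q = np.array([1.0, 2.0, 0.5])
# probs = softmax(q, theta=1)
# action = np.random.choice(len(q), p=probs)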
def generate_greedy_policy(env, state_action_values, policy):
    new_policy = dict()
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            actions = []
            action_probs = []
            if (i, j) in TERMINAL_STATES:
                for action in env.ACTIONS:
                    actions.append(action)
                    action_probs.append(0.25)
                new_policy[(i, j)] = (actions, action_probs)
            else:
                for action in env.ACTIONS:
                    actions.append(action)
                    action_probs.append(state_action_values[i, j, action])
                new_policy[(i, j)] = (actions, softmax(action_probs))

    error = 0.0
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            error += np.sum(
                np.absolute(
                    np.array(policy[(i, j)][1]) - np.array(new_policy[(i, j)][1])
                )
            )

    return new_policy, error
def act(self):
    input = [np.expand_dims(self.exp['s0'], axis=0)]
    qvals = self.critic.qvals(input)[0].squeeze()
    # for i, f in enumerate(self.env.feat):
    #     self.stats['qval'+str(f)] += np.mean(np.squeeze(qvals[i]))
    action = np.random.choice(range(self.env.action_dim),
                              p=softmax(qvals[self.env.idx], theta=1))
    action = np.expand_dims(action, axis=1)
    self.exp['a'] = action
    return action
def generate_greedy_policy(self, state):
    actions = []
    q_values = []
    for action in range(NUM_ACTIONS):
        actions.append(action)
        q_values.append(self.state_action_values[(state, action)])

    # Guard against NaN Q-values before normalizing them with softmax.
    assert not np.isnan(np.array(q_values)).any(), q_values

    self.target_policy[state] = (actions, softmax(q_values))
def act(self):
    v = self.env.vs[self.env.idx]
    input = [np.expand_dims(i, axis=0) for i in [self.exp['s0'], v, v]]
    qvals = self.critic.qvals(input)[0].squeeze()
    self.stats['qval' + str(self.env.feat[self.env.idx])] += np.mean(np.squeeze(qvals))
    action = np.random.choice(range(self.env.action_dim),
                              p=softmax(qvals, theta=1))
    action = np.expand_dims(action, axis=1)
    self.exp['a'] = action
    return action
def get_targets_dqn(self, s, g, v, r=None):
    qvals = self.qvals([s, g, v])[0]
    probs = softmax(qvals, theta=1, axis=1)
    # Sample the next action from the softmax distribution instead of taking the argmax.
    actions = [np.random.choice(range(self.env.action_dim), p=prob) for prob in probs]
    a1 = np.expand_dims(np.array(actions), axis=1)
    q = self.Tqval([s, g, v, a1])[0]
    if r is None:
        r = self.env.get_r(s, g, v)
    # Terminal mask: no bootstrapping once the maximum reward R has been reached.
    t = (r == self.env.R)
    targets = r + (1 - t) * self.gamma * q
    targets = np.clip(targets, 0, self.env.R)
    return targets
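# Illustration only (not from the original source): the target rule above is the
# usual one-step bootstrap y = r + gamma * Q'(s', a'), with bootstrapping switched
# off on transitions whose reward already equals the maximum reward R, and the
# result clipped into [0, R]. With assumed values R = 1 and gamma = 0.99:
import numpy as np

R, gamma = 1.0, 0.99                   # assumed maximum reward and discount
r = np.array([0.0, 1.0, 0.0])          # rewards for three sampled transitions
q = np.array([0.8, 0.9, 1.5])          # target-network Q-values for the sampled next actions
t = (r == R)                           # terminal mask: reward already at R
targets = np.clip(r + (1 - t) * gamma * q, 0, R)
# -> [0.792, 1.0, 1.0]: the middle entry ignores q, the last is clipped to R.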
def policy_setup(self):
    # Build the action-value function for every state.
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            if (i, j) in TERMINAL_STATES:
                for action in self.env.action_space.ACTIONS:
                    self.policy[i][j][action] = 0.0
            else:
                q_func = []
                for action in self.env.action_space.ACTIONS:
                    (next_i, next_j), reward, prob = self.env.get_state_action_probability(
                        state=(i, j), action=action)
                    q_func.append(prob * (reward + DISCOUNT_RATE * self.state_values[next_i, next_j]))
                self.policy[i][j] = softmax(q_func)
def get_targets_dqn(self, s, task, r):
    qvals = self.qvals([s])[0]
    batchsize, _, numactions = qvals.shape
    # Select, for each batch item, the row of Q-values belonging to its own task.
    qvals_for_task = qvals[np.arange(batchsize)[:, np.newaxis], task, np.arange(numactions)]
    probs = softmax(qvals_for_task, theta=1, axis=1)
    actions = [np.random.choice(range(self.env.action_dim), p=prob) for prob in probs]
    a1 = np.expand_dims(np.array(actions), axis=1)
    q = self.targetqval([s, task, a1])[0]
    # Terminal mask: no bootstrapping once the maximum reward R has been reached.
    t = (r == self.env.R)
    targets = r + (1 - t) * self.gamma * q.squeeze()
    targets = np.clip(targets, 0, self.env.R)
    return np.expand_dims(targets, axis=1)
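# Toy illustration (not from the original source) of the fancy indexing used above
# to pick each batch item's task-specific row of Q-values. Assumed shapes: qvals is
# (batch, n_tasks, n_actions) and task is (batch, 1).
import numpy as np

qvals = np.arange(2 * 3 * 4).reshape(2, 3, 4)   # batch=2, n_tasks=3, n_actions=4
task = np.array([[2], [0]])                     # task index for each batch item
rows = qvals[np.arange(2)[:, np.newaxis], task, np.arange(4)]
# rows == [[ 8  9 10 11]     (qvals[0, 2, :])
#          [12 13 14 15]]    (qvals[1, 0, :])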
def policy_improvement(self):
    new_policy = dict()
    is_policy_stable = True

    # Build the action-value function for every state.
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            if (i, j) in TERMINAL_STATES:
                actions = []
                action_probs = []
                for action in range(self.env.action_space.num_actions):
                    actions.append(action)
                    action_probs.append(0.25)
                new_policy[(i, j)] = (actions, action_probs)
            else:
                actions = []
                q_func = []
                for action in self.env.action_space.ACTIONS:
                    actions.append(action)
                    (next_i, next_j), reward, prob = self.env.get_state_action_probability(
                        state=(i, j), action=action)
                    q_func.append(prob * (reward + DISCOUNT_RATE * self.state_values[next_i, next_j]))
                new_policy[(i, j)] = (actions, softmax(q_func))

    error = 0.0
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            error += np.sum(
                np.absolute(
                    np.array(self.policy[(i, j)][1]) - np.array(new_policy[(i, j)][1])))

    if error > THETA_2:
        is_policy_stable = False

    self.policy = new_policy

    return is_policy_stable, error
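# Sketch only: how the is_policy_stable flag returned above would typically be
# consumed in an outer policy-iteration loop. The `agent` object and its
# policy_evaluation() method are assumptions (not shown in this section);
# policy_evaluation() would update agent.state_values before each improvement step.
def run_policy_iteration(agent, max_iterations=100):
    for iteration in range(1, max_iterations + 1):
        agent.policy_evaluation()                          # assumed companion step
        is_policy_stable, error = agent.policy_improvement()
        print("iteration {}: policy change = {:.6f}".format(iteration, error))
        if is_policy_stable:
            break
    return agent.policy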
def generate_greedy_policy(self):
    new_policy = dict()
    is_policy_stable = True

    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            if (i, j) in TERMINAL_STATES:
                actions = []
                action_probs = []
                for action in range(self.env.action_space.num_actions):
                    actions.append(action)
                    action_probs.append(0.25)
                new_policy[(i, j)] = (actions, action_probs)
            else:
                actions = []
                q_values = []
                for action in self.env.action_space.ACTIONS:
                    actions.append(action)
                    q_values.append(self.state_action_values[((i, j), action)])
                new_policy[(i, j)] = (actions, softmax(q_values))

    error = 0.0
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            error += np.sum(
                np.absolute(
                    np.array(self.policy[(i, j)][1]) - np.array(new_policy[(i, j)][1])))

    if error > THETA_2:
        is_policy_stable = False

    self.policy = new_policy

    return is_policy_stable, error
def _run_per_eps(self, stepsize, n_it, eps_max, abort_early, random_start=True,
                 random_start_perlin=True, noise_on_it_scale=12, n_grad_samples=8,
                 momentum=0.0):
    """
    FGM inner loop.
    :param stepsize: How far along the gradient we should move at each step.
    :param n_it: Number of steps.
    :param eps_max: If the norm of the cumulative perturbation is larger than this, clip it to this value.
    :return: UNROUNDED img and dist.
    """
    cum_gradient = np.zeros(self.original_image.shape, dtype=np.float32)

    if random_start:
        if random_start_perlin:
            noise_eps = np.random.uniform(0.01, 3 * stepsize)
            x = self.original_image + np.float32(noise_eps * self.sample_gen.get_perlin())
        else:
            x = self.original_image + np.float32(self.sample_gen.get_normal() * stepsize)
    else:
        x = np.copy(self.original_image)

    # WARN: this is not rounded! In the future, we might do some sidestepping.
    x_best = None
    dist_best = 9999.

    for it in range(n_it):
        x_prev = np.copy(x)

        # Take multiple samples of the gradient and average them.
        if noise_on_it_scale > 0:
            samples = np.empty((n_grad_samples,) + self.original_image.shape, dtype=np.float32)
            for i in range(n_grad_samples):
                # Add noise to image. TODO: Change this to EOT - countering filters and transforms
                if random_start_perlin:
                    noise_eps = np.float32(np.random.uniform(-noise_on_it_scale, noise_on_it_scale))
                    samples[i] = x + noise_eps * self.sample_gen.get_perlin()
                else:
                    samples[i] = x + np.float32(self.sample_gen.get_normal()) * noise_on_it_scale

            # Get gradients in a batch, if possible. This is really slow otherwise.
            if self.batch_sub_model is not None:
                gradient_samples = self.batch_sub_model.gradient(samples, [self.label] * n_grad_samples)
            else:
                gradient_samples = np.zeros((n_grad_samples,) + self.original_image.shape, dtype=np.float32)
                for i in range(n_grad_samples):
                    # Get misclassification gradient.
                    gradient_samples[i] = self.model.gradient(samples[i], self.label)
            gradient = np.mean(gradient_samples, axis=0)
        else:
            if self.batch_sub_model is not None:
                gradient = self.batch_sub_model.gradient(x[np.newaxis, :], [self.label])[0, ...]
            else:
                gradient = self.model.gradient(x, self.label)

        if self.is_targeted:
            gradient = -gradient

        # Norm gradient to L2 distance.
        # g_norm = np.mean(np.abs(gradient))                              # Gradient is "old school" L1 normed
        # g_norm = np.sqrt(np.vdot(gradient, gradient) / gradient.size)   # Gradient is "old school" L2 normed
        g_norm = np.linalg.norm(gradient / 255.)  # It's the evaluation L2 norm (seems to work best)
        # print("DEBUG: gradient norm = {}".format(g_norm))
        gradient /= g_norm

        # Add previous gradients (momentum)
        cum_gradient = momentum * cum_gradient + gradient
        norm_cum_gradient = cum_gradient / np.linalg.norm(cum_gradient / 255.)

        # Add perturbation to image.
        x = x + stepsize * norm_cum_gradient

        # Normalize the (cumulative) perturbation to be of size eps. Will only scale downward, never upward.
        perturb_total = x - self.original_image
        pert_norm = _l2_norm(perturb_total)
        if pert_norm > eps_max:
            perturb_total = (perturb_total / pert_norm) * eps_max
            x = self.original_image + perturb_total

        # Round the image to uint8, making sure we remember it exactly as
        x_rounded = np.clip(np.round(x), 0, 255)
        if np.sum(np.abs(x - x_prev)) < 1e-3:
            print("WARN: Rounded/clipped img is identical to previous one!")

        # Test if adversarial.
        dist = _l2_dist(x_rounded, self.original_image)
        msg = "Trying at L2={:.3f}.".format(dist)
        pred = self.model.predictions(x_rounded)
        pred_clsid = np.argmax(pred)
        if (pred_clsid == self.label) == self.is_targeted:
            msg += " Success!"
            if dist < dist_best:
                dist_best = dist
                x_best = np.copy(x)

        if print_details:
            print(msg)
            pred_self = self.batch_sub_model.batch_predictions(x_rounded[np.newaxis, :])[0]
            pred_self_softmax = softmax(pred_self)
            labels = np.argsort(pred_self)[::-1]
            label_other = labels[0]
            if label_other == self.label:
                label_other = labels[1]
            pred_self_highest = pred_self[label_other]
            pred_self_highest_softmax = pred_self_softmax[label_other]
            print("Own model reports target probability of {:.6f} (logit: {:.6f}), other is {:.6f} (logit: {:.6f})"
                  .format(pred_self_softmax[self.label], pred_self[self.label],
                          pred_self_highest_softmax, pred_self_highest))

        if abort_early and dist_best < 9999:
            break

    return x_best, dist_best
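# The loop above calls _l2_norm and _l2_dist helpers that are not shown in this
# section. A minimal sketch, assuming they mirror the "evaluation L2 norm" already
# used inline for the gradient (pixel values scaled from [0, 255] to [0, 1] before
# taking the norm); the original project may define them differently.
import numpy as np

def _l2_norm(x):
    """L2 norm of an image-sized array, with pixels rescaled to [0, 1]."""
    return np.linalg.norm(np.float32(x) / 255.)

def _l2_dist(x, y):
    """L2 distance between two images under the same rescaling."""
    return _l2_norm(np.float32(x) - np.float32(y))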