def benchmark_adversarial_policy(args=get_args()):
    env = make_atari_env_watch(args)
    if args.save_video:
        log_path = os.path.join(
            args.logdir, args.task, args.policy,
            "critical_point_attack_eps-" + str(args.eps) +
            "_n-" + str(args.n) + "_m-" + str(args.m) + "_" + args.target_policy)
        env = gym.wrappers.Monitor(env, log_path, force=True)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # should be N_FRAMES x H x W
    print("Observations shape: ", args.state_shape)
    print("Actions shape: ", args.action_shape)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # make policy
    policy = make_policy(args, args.policy, args.resume_path)
    # make target policy
    if args.target_policy is not None:
        victim_policy = make_policy(args, args.target_policy,
                                    args.target_policy_path)
        adv_net = make_victim_network(args, victim_policy)
    else:
        adv_net = make_victim_network(args, policy)
    # define adversarial attack on observations
    obs_adv_atk, atk_type = make_img_adv_attack(args, adv_net, targeted=True)
    print("Attack type:", atk_type)
    # define adversarial collector
    acts_mask = None
    dam = None
    delta = 100  # shared threshold; also keeps delta defined for other tasks
    if "Pong" in args.task:
        acts_mask = [3, 4]
        dam = dam_pong
    if "Breakout" in args.task:
        acts_mask = [1, 2, 3]
        dam = dam_breakout
    collector = critical_point_attack_collector(
        policy, env, obs_adv_atk,
        perfect_attack=args.perfect_attack,
        acts_mask=acts_mask,
        device=args.device,
        full_search=args.full_search,
        repeat_adv_act=args.repeat_act,
        dam=dam,
        delta=delta)
    collector.n = int(args.n * args.repeat_act)
    collector.m = int(args.m * args.repeat_act)
    start_time = time.time()
    test_adversarial_policy = collector.collect(n_episode=args.test_num)
    print("Attack finished in %s seconds" % (time.time() - start_time))
    atk_freq_ = test_adversarial_policy['atk_rate(%)']
    reward = test_adversarial_policy['rew']
    n_attacks = test_adversarial_policy['n_atks']
    print("attack frequency =", atk_freq_,
          "| n_attacks =", n_attacks,
          "| n_succ_atks (%)", test_adversarial_policy['succ_atks(%)'],
          "| reward: ", reward)
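
# --- Hedged illustration (not from this repository) --------------------------
# The critical-point collector above takes a domain-specific Danger Awareness
# Metric (`dam`) and a threshold `delta`; the actual `dam_pong`/`dam_breakout`
# are defined elsewhere in the repo and may differ from this sketch. The toy
# function below only illustrates the expected shape of such a metric: it maps
# the current stacked-frame observation to a scalar "danger" score that the
# collector can compare against `delta`. The pixel region used here is a
# hypothetical assumption.
def dam_sketch_pong(obs):
    """Toy danger score for Pong: brightness mass near the agent's paddle column.

    Assumes `obs` is a stacked-frame array of shape (N_FRAMES, 84, 84).
    """
    last_frame = np.asarray(obs)[-1]   # most recent frame
    near_paddle = last_frame[:, 70:]   # hypothetical strip in front of the paddle
    return float(near_paddle.sum())    # higher = ball likely close to the paddle
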
def benchmark_adversarial_policy(args=get_args()):
    env = make_atari_env_watch(args)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # should be N_FRAMES x H x W
    print("Observations shape: ", args.state_shape)
    print("Actions shape: ", args.action_shape)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # make policy
    policy = make_policy(args, args.policy, args.resume_path)
    # make target policy
    transferability_type = ""
    # THIS PART MAY BE REMOVED
    if "def" in args.logdir and args.target_policy is None:
        warnings.warn(
            "You are generating adversarial observations on the defended model; "
            "you may want to craft them on the undefended version instead")
    if args.target_policy is not None:
        victim_policy = make_policy(args, args.target_policy,
                                    args.target_policy_path)
        transferability_type = "_transf_" + str(args.target_policy)
        adv_net = make_victim_network(args, victim_policy)
    else:
        adv_net = make_victim_network(args, policy)
    # define adversarial attack on observations
    obs_adv_atk, atk_type = make_img_adv_attack(args, adv_net, targeted=False)
    print("Attack type:", atk_type)
    # define adversarial collector
    collector = uniform_attack_collector(policy, env, obs_adv_atk,
                                         perfect_attack=args.perfect_attack,
                                         device=args.device)
    atk_freq = np.linspace(args.min, args.max, args.steps, endpoint=True)
    n_attacks = []
    rewards = []
    for f in atk_freq:
        collector.atk_frequency = f
        test_adversarial_policy = collector.collect(n_episode=args.test_num)
        atk_freq_ = test_adversarial_policy['atk_rate(%)']
        rewards.append(test_adversarial_policy['rew'])
        n_attacks.append(test_adversarial_policy['n_atks'])
        print("attack frequency =", atk_freq_,
              "| n_attacks =", n_attacks[-1],
              "| n_succ_atks (%)", test_adversarial_policy['succ_atks(%)'],
              "| reward: ", rewards[-1])
        # pprint.pprint(test_adversarial_policy)
    log_path = os.path.join(
        args.logdir, args.task, args.policy,
        "uniform_attack_" + atk_type + transferability_type + ".npy")
    # save results
    with open(log_path, 'wb') as f:
        np.save(f, atk_freq)
        np.save(f, n_attacks)
        np.save(f, rewards)
    print("Results saved to", log_path)
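
# --- Usage sketch (hypothetical helper, not part of the repo) -----------------
# The benchmark above writes three arrays sequentially into a single .npy file,
# so they have to be read back in the same order from one open handle. The path
# is whatever the benchmark printed in "Results saved to ...".
def load_attack_results(log_path):
    """Return (atk_freq, n_attacks, rewards) saved by the sweep above."""
    with open(log_path, 'rb') as f:
        atk_freq = np.load(f)
        n_attacks = np.load(f)
        rewards = np.load(f)
    return atk_freq, n_attacks, rewards
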
def benchmark_adversarial_policy(args=get_args()):
    env = make_atari_env_watch(args)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # should be N_FRAMES x H x W
    print("Observations shape: ", args.state_shape)
    print("Actions shape: ", args.action_shape)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # make policy
    policy = make_policy(args, args.policy, args.resume_path)
    # make target policy
    transferability_type = ""
    if args.target_policy is not None:
        victim_policy = make_policy(args, args.target_policy,
                                    args.target_policy_path)
        transferability_type = "_transf_" + str(args.target_policy)
        adv_net = make_victim_network(args, victim_policy)
    else:
        adv_net = make_victim_network(args, policy)
    # define adversarial attack on observations
    obs_adv_atk, atk_type = make_img_adv_attack(args, adv_net, targeted=True)
    print("Attack type:", atk_type)
    # define adversarial collector
    collector = strategically_timed_attack_collector(
        policy, env, obs_adv_atk,
        perfect_attack=args.perfect_attack,
        softmax=not args.no_softmax,
        device=args.device)
    beta = np.linspace(args.min, args.max, args.steps, endpoint=True)
    atk_freq = []
    n_attacks = []
    rewards = []
    for b in beta:
        collector.beta = b
        test_adversarial_policy = collector.collect(n_episode=args.test_num)
        rewards.append(test_adversarial_policy['rew'])
        atk_freq.append(test_adversarial_policy['atk_rate(%)'])
        n_attacks.append(test_adversarial_policy['n_atks'])
        print("attack frequency =", atk_freq[-1],
              "| n_attacks =", n_attacks[-1],
              "| n_succ_atks (%)", test_adversarial_policy['succ_atks(%)'],
              "| reward: ", rewards[-1])
        # pprint.pprint(test_adversarial_policy)
    log_path = os.path.join(
        args.logdir, args.task, args.policy,
        "strategically_timed_attack_" + atk_type + transferability_type + ".npy")
    with open(log_path, 'wb') as f:
        np.save(f, atk_freq)
        np.save(f, n_attacks)
        np.save(f, rewards)
    print("Results saved to", log_path)
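
# --- Hedged sketch (assumption about the collector's timing rule) -------------
# In a strategically timed attack (Lin et al., 2017) the observation is usually
# perturbed only when the policy strongly prefers one action, e.g. when
# c(s) = max_a pi(a|s) - min_a pi(a|s) exceeds the threshold beta swept above.
# The collector in this repository may implement the rule differently; this is
# only an illustration of the criterion.
def should_attack(action_logits, beta):
    """Return True if the preference gap of the policy exceeds beta."""
    probs = torch.softmax(
        torch.as_tensor(action_logits, dtype=torch.float32), dim=-1)
    c = probs.max().item() - probs.min().item()
    return c > beta
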
def benchmark_adversarial_policy(args=get_args()):
    env = make_atari_env_watch(args)
    if args.save_video:
        log_path = os.path.join(
            args.logdir, args.task, args.policy,
            "adversarial_policy_attack_eps-" + str(args.eps) +
            "_beta-" + str(args.beta) + "_" + args.target_policy)
        env = gym.wrappers.Monitor(env, log_path, force=True)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # should be N_FRAMES x H x W
    print("Observations shape: ", args.state_shape)
    print("Actions shape: ", args.action_shape)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # make policy
    policy = make_policy(args, args.policy, args.resume_path)
    # make target policy
    if args.target_policy is not None:
        victim_policy = make_policy(args, args.target_policy,
                                    args.target_policy_path)
        adv_net = make_victim_network(args, victim_policy)
    else:
        adv_net = make_victim_network(args, policy)
    # define adversarial attack on observations
    obs_adv_atk, atk_type = make_img_adv_attack(args, adv_net, targeted=True)
    print("Attack type:", atk_type)
    # define adversarial policy
    adv_policy = None
    if args.adv_policy is not None:
        adv_policy = make_policy(args, args.adv_policy, args.adv_policy_path)
    # define adversarial collector
    collector = adversarial_policy_attack_collector(
        policy, env, obs_adv_atk,
        perfect_attack=args.perfect_attack,
        softmax=not args.no_softmax,
        device=args.device,
        adv_policy=adv_policy)
    collector.beta = args.beta
    start_time = time.time()
    test_adversarial_policy = collector.collect(n_episode=args.test_num)
    print("Attack finished in %s seconds" % (time.time() - start_time))
    atk_freq_ = test_adversarial_policy['atk_rate(%)']
    reward = test_adversarial_policy['rew']
    n_attacks = test_adversarial_policy['n_atks']
    print("attack frequency =", atk_freq_,
          "| n_attacks =", n_attacks,
          "| n_succ_atks (%)", test_adversarial_policy['succ_atks(%)'],
          "| reward: ", reward)
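
# --- Hedged sketch (not the repo's make_img_adv_attack) -----------------------
# The targeted attack returned by make_img_adv_attack is constructed elsewhere
# (typically on top of an attack library). The minimal FGSM-style sketch below
# only illustrates the idea behind a targeted perturbation: nudge the victim
# network's output towards the action chosen by the adversarial policy by
# descending the loss on that target action. `net`, `obs` and `target_act` are
# placeholders; obs is assumed to be a (1, C, H, W) float tensor in [0, 1].
def targeted_fgsm(net, obs, target_act, eps):
    obs = obs.clone().detach().requires_grad_(True)
    logits = net(obs)
    loss = torch.nn.functional.cross_entropy(
        logits, torch.as_tensor([target_act], device=obs.device))
    loss.backward()
    # step against the gradient so the loss on the target action decreases
    adv_obs = obs - eps * obs.grad.sign()
    return adv_obs.clamp(0.0, 1.0).detach()
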
def benchmark_adversarial_policy(args=get_args()):
    env = make_atari_env_watch(args)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # should be N_FRAMES x H x W
    print("Observations shape: ", args.state_shape)
    print("Actions shape: ", args.action_shape)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # make policy
    policy = make_policy(args, args.policy, args.resume_path)
    # make target policy
    transferability_type = ""
    if args.target_policy is not None:
        victim_policy = make_policy(args, args.target_policy,
                                    args.target_policy_path)
        transferability_type = "_transf_" + str(args.target_policy)
        adv_net = make_victim_network(args, victim_policy)
    else:
        adv_net = make_victim_network(args, policy)
    # define adversarial attack on observations
    obs_adv_atk, atk_type = make_img_adv_attack(args, adv_net, targeted=True)
    print("Attack type:", atk_type)
    # define adversarial collector
    acts_mask = None
    delta = 0  # shared by both supported games; also keeps delta defined for other tasks
    if "Pong" in args.task:
        acts_mask = [3, 4]
    if "Breakout" in args.task:
        acts_mask = [1, 2, 3]
    collector = critical_strategy_attack_collector(
        policy, env, obs_adv_atk,
        perfect_attack=args.perfect_attack,
        acts_mask=acts_mask,
        device=args.device,
        full_search=args.full_search,
        repeat_adv_act=args.repeat_act,
        delta=delta)
    n_range = list(np.arange(args.min, args.max)) + [args.max]
    m_range = [0., 0.25, 0.5, 0.75, 1.]
    atk_freq = []
    n_attacks = []
    rewards = []
    for n in n_range:
        for m in m_range:
            collector.n = int(n * args.repeat_act)
            collector.m = int(n * args.repeat_act + n * args.repeat_act * m)
            test_adversarial_policy = collector.collect(
                n_episode=args.test_num)
            rewards.append(test_adversarial_policy['rew'])
            atk_freq.append(test_adversarial_policy['atk_rate(%)'])
            n_attacks.append(test_adversarial_policy['n_atks'])
            print("n =", str(int(n * args.repeat_act)),
                  "m =", str(int(n * args.repeat_act + n * args.repeat_act * m)),
                  "| attack frequency =", atk_freq[-1],
                  "| n_attacks =", n_attacks[-1],
                  "| n_succ_atks (%)", test_adversarial_policy['succ_atks(%)'],
                  "| reward: ", rewards[-1])
            # pprint.pprint(test_adversarial_policy)
    log_path = os.path.join(
        args.logdir, args.task, args.policy,
        "critical_strategy_attack_" + atk_type + transferability_type + ".npy")
    with open(log_path, 'wb') as f:
        np.save(f, atk_freq)
        np.save(f, n_attacks)
        np.save(f, rewards)
    print("Results saved to", log_path)
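
# --- Usage sketch (hypothetical helper, not part of the repo) -----------------
# Any of the sweep benchmarks above produces parallel arrays of attack
# frequencies and rewards (e.g. via the loader sketched earlier); this helper
# plots the trade-off curve. Requires matplotlib in addition to numpy.
def plot_attack_sweep(atk_freq, rewards, label="attack sweep"):
    import matplotlib.pyplot as plt
    plt.plot(atk_freq, rewards, marker='o', label=label)
    plt.xlabel("attack frequency")
    plt.ylabel("mean episode reward")
    plt.legend()
    plt.show()
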
os.makedirs(save_path)
file_name = "perturbation_benchmark_result.txt"
if len(rl_defenses) == 1:
    file_name = "perturbation_benchmark_" + str(rl_defenses[0]) + ".txt"
f_rew = open(os.path.join(save_path, file_name), "w+")
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.env.action_space.shape or env.env.action_space.n
# should be N_FRAMES x H x W
print("Observations shape: ", args.state_shape)
print("Actions shape: ", args.action_shape)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
# make policy
policy = make_policy(args, args.policy, args.resume_path)
adv_net = make_victim_network(args, policy)
# make defended policies
for defense in rl_defenses:
    if defense == "No Defense":
        def_policy = policy
    elif "AdversarialTraining" in defense:
        def_policy = make_policy(
            args, args.policy,
            os.path.join("log_def", args.task, args.policy, defense + ".pth"))
    elif defense == "JPEGFilter":
        def_policy = JPEGFilterDefense(policy, quality=20)
    elif defense == "BitSqueezing":
        def_policy = BitSqueezingDefense(policy, bit_depth=5)
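
# --- Hedged sketch (not the repo's JPEGFilterDefense) -------------------------
# JPEGFilterDefense and BitSqueezingDefense above wrap a policy so that every
# observation is "squeezed" before the policy sees it. The toy transform below
# shows the JPEG idea on a single grayscale frame; it assumes Pillow is
# available and that frames are uint8 arrays, which may differ from the repo's
# actual preprocessing.
def jpeg_squeeze_frame(frame, quality=20):
    import io
    from PIL import Image
    buf = io.BytesIO()
    Image.fromarray(np.asarray(frame, dtype=np.uint8)).save(
        buf, format="JPEG", quality=quality)
    buf.seek(0)
    return np.array(Image.open(buf))
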
def main():
    args = get_args()
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    torch.set_num_threads(1)
    device = args.device
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, None, device, False)
    if args.resume_path is None:
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              device=args.device,
                              base_kwargs={'recurrent': args.recurrent_policy})
        actor_critic.to(device)
        actor_critic.init(device)
    else:
        actor_critic = make_policy(args, args.algo, args.resume_path)

    # watch agent's performance
    def watch():
        print("Testing agent ...")
        actor_critic.eval()
        args.task, args.frames_stack = args.env_name, 4
        env = make_atari_env_watch(args)
        obs = env.reset()
        n_ep, tot_rew = 0, 0
        while True:
            inputs = Batch(obs=np.expand_dims(obs, axis=0))
            with torch.no_grad():
                result = actor_critic(inputs)
            action = result.act
            # observe reward and next obs
            obs, reward, done, _ = env.step(action)
            tot_rew += reward
            if done:
                n_ep += 1
                obs = env.reset()
                if n_ep == args.test_num:
                    break
        print("Evaluation using {} episodes: mean reward {:.5f}\n".format(
            n_ep, tot_rew / n_ep))

    if args.watch:
        watch()
        exit(0)

    if args.resume_path is not None:
        args.rms_eps = 0.1

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.rms_eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.rms_eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape,
                              envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    acc_rewards = np.zeros(args.num_processes)
    best_reward = -np.inf
    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print("start training")
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)
        for step in range(args.num_steps):
            # sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = \
                    actor_critic.act(rollouts.obs[step],
                                     rollouts.recurrent_hidden_states[step],
                                     rollouts.masks[step])
            # observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            for i, d in enumerate(done):
                acc_rewards[i] += reward[i].detach().cpu()[0]
                if d:
                    episode_rewards.append(acc_rewards[i])
                    acc_rewards[i] = 0
            # if done then clean the history of observations
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        # save the model whenever the running average reward improves
        if len(episode_rewards) > 0 and np.mean(
                episode_rewards) >= best_reward and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            best_reward = np.mean(episode_rewards)
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, "policy.pth"))

        if j % args.log_interval == 0 and len(episode_rewards) > 0:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n"
                "Last {} training episodes: mean/median reward {:.1f}/{:.1f}, "
                "min/max reward {:.1f}/{:.1f} (best avg reward {:.1f})\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), best_reward))
            print("model saved to " +
                  str(os.path.join(args.save_dir, args.algo, "policy.pth")))
    watch()
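
# --- Usage sketch (assumption, not part of the repo) --------------------------
# The training loop above checkpoints the model as a two-element list
# [actor_critic, ob_rms] via torch.save; a later script can restore it as shown
# below. The path is whatever save_dir/algo/policy.pth resolved to during
# training.
def load_checkpoint(checkpoint_path, device="cpu"):
    actor_critic, ob_rms = torch.load(checkpoint_path, map_location=device)
    actor_critic.eval()
    return actor_critic, ob_rms
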