def benchmark_adversarial_policy(args=get_args()):
    env = make_atari_env_watch(args)
    if args.save_video:
        log_path = os.path.join(
            args.logdir, args.task, args.policy,
            "critical_point_attack_eps-" + str(args.eps) +
            "_n-" + str(args.n) + "_m-" + str(args.m) + "_" + args.target_policy)
        env = gym.wrappers.Monitor(env, log_path, force=True)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # should be N_FRAMES x H x W
    print("Observations shape: ", args.state_shape)
    print("Actions shape: ", args.action_shape)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # make policy
    policy = make_policy(args, args.policy, args.resume_path)
    # make target policy
    if args.target_policy is not None:
        victim_policy = make_policy(args, args.target_policy,
                                    args.target_policy_path)
        adv_net = make_victim_network(args, victim_policy)
    else:
        adv_net = make_victim_network(args, policy)
    # define adversarial attack on observations
    obs_adv_atk, atk_type = make_img_adv_attack(args, adv_net, targeted=True)
    print("Attack type:", atk_type)
    # define adversarial collector
    acts_mask = None
    dam = None
    delta = 100  # shared threshold; also keeps delta defined for other tasks
    if "Pong" in args.task:
        acts_mask = [3, 4]
        dam = dam_pong
    if "Breakout" in args.task:
        acts_mask = [1, 2, 3]
        dam = dam_breakout
    collector = critical_point_attack_collector(
        policy, env, obs_adv_atk,
        perfect_attack=args.perfect_attack,
        acts_mask=acts_mask,
        device=args.device,
        full_search=args.full_search,
        repeat_adv_act=args.repeat_act,
        dam=dam,
        delta=delta)
    collector.n = int(args.n * args.repeat_act)
    collector.m = int(args.m * args.repeat_act)
    start_time = time.time()
    test_adversarial_policy = collector.collect(n_episode=args.test_num)
    print("Attack finished in %s seconds" % (time.time() - start_time))
    atk_freq_ = test_adversarial_policy['atk_rate(%)']
    reward = test_adversarial_policy['rew']
    n_attacks = test_adversarial_policy['n_atks']
    print("attack frequency =", atk_freq_,
          "| n_attacks =", n_attacks,
          "| n_succ_atks (%)", test_adversarial_policy['succ_atks(%)'],
          "| reward: ", reward)
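
# --- Hedged illustration (not from this repository) --------------------------
# The critical-point collector above takes a domain-specific Danger Awareness
# Metric (`dam`) and a threshold `delta`; the actual `dam_pong`/`dam_breakout`
# are defined elsewhere in the repo and may differ from this sketch. The toy
# function below only illustrates the expected shape of such a metric: it maps
# the current stacked-frame observation to a scalar "danger" score that the
# collector can compare against `delta`. The pixel region used here is a
# hypothetical assumption.
def dam_sketch_pong(obs):
    """Toy danger score for Pong: brightness mass near the agent's paddle column.

    Assumes `obs` is a stacked-frame array of shape (N_FRAMES, 84, 84).
    """
    last_frame = np.asarray(obs)[-1]   # most recent frame
    near_paddle = last_frame[:, 70:]   # hypothetical strip in front of the paddle
    return float(near_paddle.sum())    # higher = ball likely close to the paddle
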
def benchmark_adversarial_policy(args=get_args()):
    env = make_atari_env_watch(args)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # should be N_FRAMES x H x W
    print("Observations shape: ", args.state_shape)
    print("Actions shape: ", args.action_shape)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # make policy
    policy = make_policy(args, args.policy, args.resume_path)
    # make target policy
    transferability_type = ""
    # THIS PART MAY BE REMOVED
    if "def" in args.logdir and args.target_policy is None:
        warnings.warn(
            "You are generating adversarial observations on the defended model; "
            "you may want to craft them on the undefended version instead")
    if args.target_policy is not None:
        victim_policy = make_policy(args, args.target_policy,
                                    args.target_policy_path)
        transferability_type = "_transf_" + str(args.target_policy)
        adv_net = make_victim_network(args, victim_policy)
    else:
        adv_net = make_victim_network(args, policy)
    # define adversarial attack on observations
    obs_adv_atk, atk_type = make_img_adv_attack(args, adv_net, targeted=False)
    print("Attack type:", atk_type)
    # define adversarial collector
    collector = uniform_attack_collector(policy, env, obs_adv_atk,
                                         perfect_attack=args.perfect_attack,
                                         device=args.device)
    atk_freq = np.linspace(args.min, args.max, args.steps, endpoint=True)
    n_attacks = []
    rewards = []
    for f in atk_freq:
        collector.atk_frequency = f
        test_adversarial_policy = collector.collect(n_episode=args.test_num)
        atk_freq_ = test_adversarial_policy['atk_rate(%)']
        rewards.append(test_adversarial_policy['rew'])
        n_attacks.append(test_adversarial_policy['n_atks'])
        print("attack frequency =", atk_freq_,
              "| n_attacks =", n_attacks[-1],
              "| n_succ_atks (%)", test_adversarial_policy['succ_atks(%)'],
              "| reward: ", rewards[-1])
        # pprint.pprint(test_adversarial_policy)
    log_path = os.path.join(
        args.logdir, args.task, args.policy,
        "uniform_attack_" + atk_type + transferability_type + ".npy")
    # save results
    with open(log_path, 'wb') as f:
        np.save(f, atk_freq)
        np.save(f, n_attacks)
        np.save(f, rewards)
    print("Results saved to", log_path)
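
# --- Usage sketch (hypothetical helper, not part of the repo) -----------------
# The benchmark above writes three arrays sequentially into a single .npy file,
# so they have to be read back in the same order from one open handle. The path
# is whatever the benchmark printed in "Results saved to ...".
def load_attack_results(log_path):
    """Return (atk_freq, n_attacks, rewards) saved by the sweep above."""
    with open(log_path, 'rb') as f:
        atk_freq = np.load(f)
        n_attacks = np.load(f)
        rewards = np.load(f)
    return atk_freq, n_attacks, rewards
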
def benchmark_adversarial_policy(args=get_args()):
    env = make_atari_env_watch(args)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # should be N_FRAMES x H x W
    print("Observations shape: ", args.state_shape)
    print("Actions shape: ", args.action_shape)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # make policy
    policy = make_policy(args, args.policy, args.resume_path)
    # make target policy
    transferability_type = ""
    if args.target_policy is not None:
        victim_policy = make_policy(args, args.target_policy,
                                    args.target_policy_path)
        transferability_type = "_transf_" + str(args.target_policy)
        adv_net = make_victim_network(args, victim_policy)
    else:
        adv_net = make_victim_network(args, policy)
    # define adversarial attack on observations
    obs_adv_atk, atk_type = make_img_adv_attack(args, adv_net, targeted=True)
    print("Attack type:", atk_type)
    # define adversarial collector
    collector = strategically_timed_attack_collector(
        policy, env, obs_adv_atk,
        perfect_attack=args.perfect_attack,
        softmax=not args.no_softmax,
        device=args.device)
    beta = np.linspace(args.min, args.max, args.steps, endpoint=True)
    atk_freq = []
    n_attacks = []
    rewards = []
    for b in beta:
        collector.beta = b
        test_adversarial_policy = collector.collect(n_episode=args.test_num)
        rewards.append(test_adversarial_policy['rew'])
        atk_freq.append(test_adversarial_policy['atk_rate(%)'])
        n_attacks.append(test_adversarial_policy['n_atks'])
        print("attack frequency =", atk_freq[-1],
              "| n_attacks =", n_attacks[-1],
              "| n_succ_atks (%)", test_adversarial_policy['succ_atks(%)'],
              "| reward: ", rewards[-1])
        # pprint.pprint(test_adversarial_policy)
    log_path = os.path.join(
        args.logdir, args.task, args.policy,
        "strategically_timed_attack_" + atk_type + transferability_type + ".npy")
    with open(log_path, 'wb') as f:
        np.save(f, atk_freq)
        np.save(f, n_attacks)
        np.save(f, rewards)
    print("Results saved to", log_path)
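
# --- Hedged sketch (assumption about the collector's timing rule) -------------
# In a strategically timed attack (Lin et al., 2017) the observation is usually
# perturbed only when the policy strongly prefers one action, e.g. when
# c(s) = max_a pi(a|s) - min_a pi(a|s) exceeds the threshold beta swept above.
# The collector in this repository may implement the rule differently; this is
# only an illustration of the criterion.
def should_attack(action_logits, beta):
    """Return True if the preference gap of the policy exceeds beta."""
    probs = torch.softmax(
        torch.as_tensor(action_logits, dtype=torch.float32), dim=-1)
    c = probs.max().item() - probs.min().item()
    return c > beta
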
def benchmark_adversarial_policy(args=get_args()):
    env = make_atari_env_watch(args)
    if args.save_video:
        log_path = os.path.join(
            args.logdir, args.task, args.policy,
            "adversarial_policy_attack_eps-" + str(args.eps) +
            "_beta-" + str(args.beta) + "_" + args.target_policy)
        env = gym.wrappers.Monitor(env, log_path, force=True)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # should be N_FRAMES x H x W
    print("Observations shape: ", args.state_shape)
    print("Actions shape: ", args.action_shape)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # make policy
    policy = make_policy(args, args.policy, args.resume_path)
    # make target policy
    if args.target_policy is not None:
        victim_policy = make_policy(args, args.target_policy,
                                    args.target_policy_path)
        adv_net = make_victim_network(args, victim_policy)
    else:
        adv_net = make_victim_network(args, policy)
    # define adversarial attack on observations
    obs_adv_atk, atk_type = make_img_adv_attack(args, adv_net, targeted=True)
    print("Attack type:", atk_type)
    # define adversarial policy
    adv_policy = None
    if args.adv_policy is not None:
        adv_policy = make_policy(args, args.adv_policy, args.adv_policy_path)
    # define adversarial collector
    collector = adversarial_policy_attack_collector(
        policy, env, obs_adv_atk,
        perfect_attack=args.perfect_attack,
        softmax=not args.no_softmax,
        device=args.device,
        adv_policy=adv_policy)
    collector.beta = args.beta
    start_time = time.time()
    test_adversarial_policy = collector.collect(n_episode=args.test_num)
    print("Attack finished in %s seconds" % (time.time() - start_time))
    atk_freq_ = test_adversarial_policy['atk_rate(%)']
    reward = test_adversarial_policy['rew']
    n_attacks = test_adversarial_policy['n_atks']
    print("attack frequency =", atk_freq_,
          "| n_attacks =", n_attacks,
          "| n_succ_atks (%)", test_adversarial_policy['succ_atks(%)'],
          "| reward: ", reward)
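
# --- Hedged sketch (not the repo's make_img_adv_attack) -----------------------
# The targeted attack returned by make_img_adv_attack is constructed elsewhere
# (typically on top of an attack library). The minimal FGSM-style sketch below
# only illustrates the idea behind a targeted perturbation: nudge the victim
# network's output towards the action chosen by the adversarial policy by
# descending the loss on that target action. `net`, `obs` and `target_act` are
# placeholders; obs is assumed to be a (1, C, H, W) float tensor in [0, 1].
def targeted_fgsm(net, obs, target_act, eps):
    obs = obs.clone().detach().requires_grad_(True)
    logits = net(obs)
    loss = torch.nn.functional.cross_entropy(
        logits, torch.as_tensor([target_act], device=obs.device))
    loss.backward()
    # step against the gradient so the loss on the target action decreases
    adv_obs = obs - eps * obs.grad.sign()
    return adv_obs.clamp(0.0, 1.0).detach()
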
def benchmark_adversarial_policy(args=get_args()):
    env = make_atari_env_watch(args)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # should be N_FRAMES x H x W
    print("Observations shape: ", args.state_shape)
    print("Actions shape: ", args.action_shape)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # make policy
    policy = make_policy(args, args.policy, args.resume_path)
    # make target policy
    transferability_type = ""
    if args.target_policy is not None:
        victim_policy = make_policy(args, args.target_policy,
                                    args.target_policy_path)
        transferability_type = "_transf_" + str(args.target_policy)
        adv_net = make_victim_network(args, victim_policy)
    else:
        adv_net = make_victim_network(args, policy)
    # define adversarial attack on observations
    obs_adv_atk, atk_type = make_img_adv_attack(args, adv_net, targeted=True)
    print("Attack type:", atk_type)
    # define adversarial collector
    acts_mask = None
    delta = 0  # shared by both supported games; also keeps delta defined for other tasks
    if "Pong" in args.task:
        acts_mask = [3, 4]
    if "Breakout" in args.task:
        acts_mask = [1, 2, 3]
    collector = critical_strategy_attack_collector(
        policy, env, obs_adv_atk,
        perfect_attack=args.perfect_attack,
        acts_mask=acts_mask,
        device=args.device,
        full_search=args.full_search,
        repeat_adv_act=args.repeat_act,
        delta=delta)
    n_range = list(np.arange(args.min, args.max)) + [args.max]
    m_range = [0., 0.25, 0.5, 0.75, 1.]
    atk_freq = []
    n_attacks = []
    rewards = []
    for n in n_range:
        for m in m_range:
            collector.n = int(n * args.repeat_act)
            collector.m = int(n * args.repeat_act + n * args.repeat_act * m)
            test_adversarial_policy = collector.collect(
                n_episode=args.test_num)
            rewards.append(test_adversarial_policy['rew'])
            atk_freq.append(test_adversarial_policy['atk_rate(%)'])
            n_attacks.append(test_adversarial_policy['n_atks'])
            print("n =", str(int(n * args.repeat_act)),
                  "m =", str(int(n * args.repeat_act + n * args.repeat_act * m)),
                  "| attack frequency =", atk_freq[-1],
                  "| n_attacks =", n_attacks[-1],
                  "| n_succ_atks (%)", test_adversarial_policy['succ_atks(%)'],
                  "| reward: ", rewards[-1])
            # pprint.pprint(test_adversarial_policy)
    log_path = os.path.join(
        args.logdir, args.task, args.policy,
        "critical_strategy_attack_" + atk_type + transferability_type + ".npy")
    with open(log_path, 'wb') as f:
        np.save(f, atk_freq)
        np.save(f, n_attacks)
        np.save(f, rewards)
    print("Results saved to", log_path)
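
# --- Usage sketch (hypothetical helper, not part of the repo) -----------------
# Any of the sweep benchmarks above produces parallel arrays of attack
# frequencies and rewards (e.g. via the loader sketched earlier); this helper
# plots the trade-off curve. Requires matplotlib in addition to numpy.
def plot_attack_sweep(atk_freq, rewards, label="attack sweep"):
    import matplotlib.pyplot as plt
    plt.plot(atk_freq, rewards, marker='o', label=label)
    plt.xlabel("attack frequency")
    plt.ylabel("mean episode reward")
    plt.legend()
    plt.show()
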
os.makedirs(save_path)
file_name = "perturbation_benchmark_result.txt"
if len(rl_defenses) == 1:
    file_name = "perturbation_benchmark_" + str(rl_defenses[0]) + ".txt"
f_rew = open(os.path.join(save_path, file_name), "w+")
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.env.action_space.shape or env.env.action_space.n
# should be N_FRAMES x H x W
print("Observations shape: ", args.state_shape)
print("Actions shape: ", args.action_shape)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
# make policy
policy = make_policy(args, args.policy, args.resume_path)
adv_net = make_victim_network(args, policy)
# make defended policies
for defense in rl_defenses:
    if defense == "No Defense":
        def_policy = policy
    elif "AdversarialTraining" in defense:
        def_policy = make_policy(
            args, args.policy,
            os.path.join("log_def", args.task, args.policy, defense + ".pth"))
    elif defense == "JPEGFilter":
        def_policy = JPEGFilterDefense(policy, quality=20)
    elif defense == "BitSqueezing":
        def_policy = BitSqueezingDefense(policy, bit_depth=5)
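
# --- Hedged sketch (not the repo's JPEGFilterDefense) -------------------------
# JPEGFilterDefense and BitSqueezingDefense above wrap a policy so that every
# observation is "squeezed" before the policy sees it. The toy transform below
# shows the JPEG idea on a single grayscale frame; it assumes Pillow is
# available and that frames are uint8 arrays, which may differ from the repo's
# actual preprocessing.
def jpeg_squeeze_frame(frame, quality=20):
    import io
    from PIL import Image
    buf = io.BytesIO()
    Image.fromarray(np.asarray(frame, dtype=np.uint8)).save(
        buf, format="JPEG", quality=quality)
    buf.seek(0)
    return np.array(Image.open(buf))
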
def main():
    args = get_args()
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    torch.set_num_threads(1)
    device = args.device
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, None, device, False)
    if args.resume_path is None:
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              device=args.device,
                              base_kwargs={'recurrent': args.recurrent_policy})
        actor_critic.to(device)
        actor_critic.init(device)
    else:
        actor_critic = make_policy(args, args.algo, args.resume_path)

    # watch agent's performance
    def watch():
        print("Testing agent ...")
        actor_critic.eval()
        args.task, args.frames_stack = args.env_name, 4
        env = make_atari_env_watch(args)
        obs = env.reset()
        n_ep, tot_rew = 0, 0
        while True:
            inputs = Batch(obs=np.expand_dims(obs, axis=0))
            with torch.no_grad():
                result = actor_critic(inputs)
            action = result.act
            # observe reward and next obs
            obs, reward, done, _ = env.step(action)
            tot_rew += reward
            if done:
                n_ep += 1
                obs = env.reset()
                if n_ep == args.test_num:
                    break
        print("Evaluation using {} episodes: mean reward {:.5f}\n".format(
            n_ep, tot_rew / n_ep))

    if args.watch:
        watch()
        exit(0)

    if args.resume_path is not None:
        args.rms_eps = 0.1

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.rms_eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.rms_eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape,
                              envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    acc_rewards = np.zeros(args.num_processes)
    best_reward = -np.inf
    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print("start training")
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)
        for step in range(args.num_steps):
            # sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = \
                    actor_critic.act(rollouts.obs[step],
                                     rollouts.recurrent_hidden_states[step],
                                     rollouts.masks[step])
            # observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            for i, d in enumerate(done):
                acc_rewards[i] += reward[i].detach().cpu()[0]
                if d:
                    episode_rewards.append(acc_rewards[i])
                    acc_rewards[i] = 0
            # if done then clean the history of observations
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        # save the model whenever the running average reward improves
        if len(episode_rewards) > 0 and np.mean(
                episode_rewards) >= best_reward and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            best_reward = np.mean(episode_rewards)
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, "policy.pth"))

        if j % args.log_interval == 0 and len(episode_rewards) > 0:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n"
                "Last {} training episodes: mean/median reward {:.1f}/{:.1f}, "
                "min/max reward {:.1f}/{:.1f} (best avg reward {:.1f})\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), best_reward))
            print("model saved to " +
                  str(os.path.join(args.save_dir, args.algo, "policy.pth")))
    watch()
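
# --- Usage sketch (assumption, not part of the repo) --------------------------
# The training loop above checkpoints the model as a two-element list
# [actor_critic, ob_rms] via torch.save; a later script can restore it as shown
# below. The path is whatever save_dir/algo/policy.pth resolved to during
# training.
def load_checkpoint(checkpoint_path, device="cpu"):
    actor_critic, ob_rms = torch.load(checkpoint_path, map_location=device)
    actor_critic.eval()
    return actor_critic, ob_rms
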