예제 #1
0
def parse_arch(arch, n_actions):
    if arch == "nature":
        return nn.Sequential(
            pnn.LargeAtariCNN(),
            init_chainer_default(nn.Linear(512, n_actions)),
            DiscreteActionValueHead(),
        )
    elif arch == "doubledqn":
        # raise NotImplementedError("Single shared bias not implemented yet")
        return nn.Sequential(
            pnn.LargeAtariCNN(),
            init_chainer_default(nn.Linear(512, n_actions, bias=False)),
            SingleSharedBias(),
            DiscreteActionValueHead(),
        )
    elif arch == "nips":
        return nn.Sequential(
            pnn.SmallAtariCNN(),
            init_chainer_default(nn.Linear(256, n_actions)),
            DiscreteActionValueHead(),
        )
    elif arch == "dueling":
        return DuelingDQN(n_actions)
    else:
        raise RuntimeError("Not supported architecture: {}".format(arch))
예제 #2
0
 def __init__(self):
     super().__init__()
     self.l1 = nn.Conv2d(1, 32, 8, stride=4)
     self.l2 = nn.ReLU()
     self.l3 = nn.Conv2d(32, 64, 4, stride=2)
     self.l4 = nn.ReLU()
     self.l5 = nn.Conv2d(64, 64, 3, stride=1)
     self.l6 = nn.Flatten()
     self.l7 = nn.ReLU()
     self.r8 = nn.LSTM(input_size=3136, hidden_size=512)
     self.l9 = init_chainer_default(torch.nn.Linear(512, n_actions))
     self.l10 = DiscreteActionValueHead()
예제 #3
0
    def _test_load_dqn(self, gpu):
        from pfrl.q_functions import DiscreteActionValueHead

        n_actions = 4
        q_func = nn.Sequential(
            pnn.LargeAtariCNN(),
            init_chainer_default(nn.Linear(512, n_actions)),
            DiscreteActionValueHead(),
        )

        # Use the same hyperparameters as the Nature paper

        opt = pfrl.optimizers.RMSpropEpsInsideSqrt(
            q_func.parameters(),
            lr=2.5e-4,
            alpha=0.95,
            momentum=0.0,
            eps=1e-2,
            centered=True,
        )

        rbuf = replay_buffers.ReplayBuffer(100)

        explorer = explorers.LinearDecayEpsilonGreedy(
            start_epsilon=1.0,
            end_epsilon=0.1,
            decay_steps=10**6,
            random_action_func=lambda: np.random.randint(4),
        )

        agent = agents.DQN(
            q_func,
            opt,
            rbuf,
            gpu=gpu,
            gamma=0.99,
            explorer=explorer,
            replay_start_size=50,
            target_update_interval=10**4,
            clip_delta=True,
            update_interval=4,
            batch_accumulator="sum",
            phi=lambda x: x,
        )

        downloaded_model, exists = download_model(
            "DQN", "BreakoutNoFrameskip-v4", model_type=self.pretrained_type)
        agent.load(downloaded_model)
        if os.environ.get("PFRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED"):
            assert exists
예제 #4
0
 def __init__(self, n_actions, max_episode_steps):
     super().__init__()
     self.embed = nn.Embedding(max_episode_steps + 1, 3136)
     self.image2hidden = nn.Sequential(
         nn.Conv2d(3, 32, 8, stride=4),
         nn.ReLU(),
         nn.Conv2d(32, 64, 4, stride=2),
         nn.ReLU(),
         nn.Conv2d(64, 64, 3, stride=1),
         nn.Flatten(),
     )
     self.hidden2out = nn.Sequential(
         nn.Linear(3136, 512),
         nn.ReLU(),
         nn.Linear(512, n_actions),
         DiscreteActionValueHead(),
     )
예제 #5
0
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument("processes", type=int)
    parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4")
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 31)")
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=("Directory path to save output files."
              " If it does not exist, it will be created."),
    )
    parser.add_argument("--t-max", type=int, default=5)
    parser.add_argument("--replay-start-size", type=int, default=10000)
    parser.add_argument("--n-times-replay", type=int, default=4)
    parser.add_argument("--beta", type=float, default=1e-2)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--steps", type=int, default=10**7)
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument("--lr", type=float, default=7e-4)
    parser.add_argument("--eval-interval", type=int, default=10**5)
    parser.add_argument("--eval-n-runs", type=int, default=10)
    parser.add_argument("--use-lstm", action="store_true")
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default="")
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=
        ("Monitor env. Videos and additional information are saved as output files."
         ),
    )
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    # If you use more than one processes, the results will be no longer
    # deterministic even with the same random seed.
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    n_actions = gym.make(args.env).action_space.n

    input_to_hidden = nn.Sequential(
        nn.Conv2d(4, 16, 8, stride=4),
        nn.ReLU(),
        nn.Conv2d(16, 32, 4, stride=2),
        nn.ReLU(),
        nn.Flatten(),
        nn.Linear(2592, 256),
        nn.ReLU(),
    )

    head = acer.ACERDiscreteActionHead(
        pi=nn.Sequential(
            nn.Linear(256, n_actions),
            SoftmaxCategoricalHead(),
        ),
        q=nn.Sequential(
            nn.Linear(256, n_actions),
            DiscreteActionValueHead(),
        ),
    )

    if args.use_lstm:
        model = pfrl.nn.RecurrentSequential(
            input_to_hidden,
            nn.LSTM(num_layers=1, input_size=256, hidden_size=256),
            head,
        )
    else:
        model = nn.Sequential(input_to_hidden, head)

    model.apply(pfrl.initializers.init_chainer_default)

    opt = pfrl.optimizers.SharedRMSpropEpsInsideSqrt(model.parameters(),
                                                     lr=args.lr,
                                                     eps=4e-3,
                                                     alpha=0.99)

    replay_buffer = EpisodicReplayBuffer(10**6 // args.processes)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = acer.ACER(
        model,
        opt,
        t_max=args.t_max,
        gamma=0.99,
        replay_buffer=replay_buffer,
        n_times_replay=args.n_times_replay,
        replay_start_size=args.replay_start_size,
        beta=args.beta,
        phi=phi,
        max_grad_norm=40,
        recurrent=args.use_lstm,
    )

    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
        )
        env.seed(int(env_seed))
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training")
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_steps=None,
                                                  n_episodes=args.eval_n_runs)
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            for pg in agent.optimizer.param_groups:
                assert "lr" in pg
                pg["lr"] = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
예제 #6
0
파일: test_acer.py 프로젝트: Wataru-Y/pfrl
    def _test_abc(
        self,
        t_max,
        use_lstm,
        discrete=True,
        episodic=True,
        steps=100000,
        require_success=True,
    ):

        nproc = 8

        def make_env(process_idx, test):
            size = 2
            return ABC(
                size=size,
                discrete=discrete,
                episodic=episodic or test,
                partially_observable=self.use_lstm,
                deterministic=test,
            )

        sample_env = make_env(0, False)
        action_space = sample_env.action_space
        obs_space = sample_env.observation_space

        replay_buffer = EpisodicReplayBuffer(10**4)
        obs_size = obs_space.low.size
        hidden_size = 20
        if discrete:
            n_actions = action_space.n
            head = acer.ACERDiscreteActionHead(
                pi=nn.Sequential(
                    nn.Linear(hidden_size, n_actions),
                    SoftmaxCategoricalHead(),
                ),
                q=nn.Sequential(
                    nn.Linear(hidden_size, n_actions),
                    DiscreteActionValueHead(),
                ),
            )
        else:
            action_size = action_space.low.size
            head = acer.ACERContinuousActionHead(
                pi=nn.Sequential(
                    nn.Linear(hidden_size, action_size * 2),
                    GaussianHeadWithDiagonalCovariance(),
                ),
                v=nn.Sequential(nn.Linear(hidden_size, 1), ),
                adv=nn.Sequential(
                    ConcatObsAndAction(),
                    nn.Linear(hidden_size + action_size, 1),
                ),
            )
        if use_lstm:
            model = pfrl.nn.RecurrentSequential(
                nn.Linear(obs_size, hidden_size),
                nn.LeakyReLU(),
                nn.LSTM(num_layers=1,
                        input_size=hidden_size,
                        hidden_size=hidden_size),
                head,
            )
        else:
            model = nn.Sequential(
                nn.Linear(obs_size, hidden_size),
                nn.LeakyReLU(),
                head,
            )
        eps = 1e-8
        opt = pfrl.optimizers.SharedRMSpropEpsInsideSqrt(model.parameters(),
                                                         lr=1e-3,
                                                         eps=eps,
                                                         alpha=0.99)
        gamma = 0.5
        beta = 1e-5
        if self.n_times_replay == 0 and self.disable_online_update:
            # At least one of them must be enabled
            pytest.skip()
        agent = acer.ACER(
            model,
            opt,
            replay_buffer=replay_buffer,
            t_max=t_max,
            gamma=gamma,
            beta=beta,
            n_times_replay=self.n_times_replay,
            act_deterministically=True,
            disable_online_update=self.disable_online_update,
            replay_start_size=100,
            use_trust_region=self.use_trust_region,
            recurrent=use_lstm,
        )

        max_episode_len = None if episodic else 2

        with warnings.catch_warnings(record=True) as warns:
            train_agent_async(
                outdir=self.outdir,
                processes=nproc,
                make_env=make_env,
                agent=agent,
                steps=steps,
                max_episode_len=max_episode_len,
                eval_interval=500,
                eval_n_steps=None,
                eval_n_episodes=5,
                successful_score=1,
            )
            assert len(warns) == 0, warns[0]

        # The agent returned by train_agent_async is not guaranteed to be
        # successful because parameters could be modified by other processes
        # after success. Thus here the successful model is loaded explicitly.
        if require_success:
            agent.load(os.path.join(self.outdir, "successful"))

        # Test
        env = make_env(0, True)
        n_test_runs = 5
        eval_returns = run_evaluation_episodes(
            env,
            agent,
            n_steps=None,
            n_episodes=n_test_runs,
            max_episode_len=max_episode_len,
        )
        successful_return = 1
        if require_success:
            n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
            assert n_succeeded == n_test_runs
예제 #7
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--env",
        type=str,
        default="BreakoutNoFrameskip-v4",
        help="OpenAI Atari domain to perform algorithm on.",
    )
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=("Directory path to save output files."
              " If it does not exist, it will be created."),
    )
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 31)")
    parser.add_argument("--gpu",
                        type=int,
                        default=0,
                        help="GPU to use, set to -1 if no GPU.")
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load-pretrained",
                        action="store_true",
                        default=False)
    parser.add_argument("--pretrained-type",
                        type=str,
                        default="best",
                        choices=["best", "final"])
    parser.add_argument("--load", type=str, default=None)
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=
        ("Monitor env. Videos and additional information are saved as output files."
         ),
    )
    parser.add_argument(
        "--steps",
        type=int,
        default=5 * 10**7,
        help="Total number of timesteps to train the agent.",
    )
    parser.add_argument(
        "--replay-start-size",
        type=int,
        default=5 * 10**4,
        help="Minimum replay buffer size before " +
        "performing gradient updates.",
    )
    parser.add_argument("--eval-n-steps", type=int, default=125000)
    parser.add_argument("--eval-interval", type=int, default=250000)
    parser.add_argument("--n-best-episodes", type=int, default=30)
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=None),
            episode_life=not test,
            clip_rewards=not test,
        )
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = pfrl.wrappers.RandomizeAction(env, 0.05)
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training")
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n
    q_func = nn.Sequential(
        pnn.LargeAtariCNN(),
        init_chainer_default(nn.Linear(512, n_actions)),
        DiscreteActionValueHead(),
    )

    # Use the same hyperparameters as the Nature paper

    opt = pfrl.optimizers.RMSpropEpsInsideSqrt(
        q_func.parameters(),
        lr=2.5e-4,
        alpha=0.95,
        momentum=0.0,
        eps=1e-2,
        centered=True,
    )

    rbuf = replay_buffers.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=0.1,
        decay_steps=10**6,
        random_action_func=lambda: np.random.randint(n_actions),
    )

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.DQN
    agent = Agent(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=10**4,
        clip_delta=True,
        update_interval=4,
        batch_accumulator="sum",
        phi=phi,
    )

    if args.load or args.load_pretrained:
        # either load or load_pretrained must be false
        assert not args.load or not args.load_pretrained
        if args.load:
            agent.load(args.load)
        else:
            agent.load(
                utils.download_model("DQN",
                                     args.env,
                                     model_type=args.pretrained_type)[0])

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_steps=args.eval_n_steps,
                                                  n_episodes=None)
        print("n_episodes: {} mean: {} median: {} stdev {}".format(
            eval_stats["episodes"],
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            eval_env=eval_env,
        )

        dir_of_best_network = os.path.join(args.outdir, "best")
        agent.load(dir_of_best_network)

        # run 30 evaluation episodes, each capped at 5 mins of play
        stats = experiments.evaluator.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.n_best_episodes,
            max_episode_len=4500,
            logger=None,
        )
        with open(os.path.join(args.outdir, "bestscores.json"), "w") as f:
            json.dump(stats, f)
        print("The results of the best scoring network:")
        for stat in stats:
            print(str(stat) + ":" + str(stats[stat]))
예제 #8
0
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument(
        "--env",
        type=str,
        default="'DClawTurnFixed-v0'",
        help="OpenAI Gym MuJoCo env to perform algorithm on.",
    )
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument(
        "--gpu", type=int, default=-1, help="GPU to use, set to -1 if no GPU."
    )
    parser.add_argument(
        "--load", type=str, default="", help="Directory to load agent from."
    )
    parser.add_argument(
        "--max-steps",
        type=int,
        default=10 ** 6,
        help="Total number of timesteps to train the agent.",
    )
    parser.add_argument(
        "--eval-n-runs",
        type=int,
        default=10,
        help="Number of episodes run for each evaluation.",
    )
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=5000,
        help="Interval in timesteps between evaluations.",
    )
    parser.add_argument(
        "--replay-start-size",
        type=int,
        default=10000,
        help="Minimum replay buffer size before " + "performing gradient updates.",
    )
    parser.add_argument("--batch-size", type=int, default=64, help="Minibatch size")
    parser.add_argument(
        "--render", action="store_true", help="Render env states in a GUI window."
    )
    parser.add_argument(
        "--demo", action="store_true", help="Just run evaluation, not training."
    )
    parser.add_argument("--load-pretrained", action="store_true", default=False)
    parser.add_argument(
        "--pretrained-type", type=str, default="best", choices=["best", "final"]
    )
    parser.add_argument(
        "--monitor", action="store_true", help="Wrap env with gym.wrappers.Monitor."
    )
    parser.add_argument(
        "--log-level", type=int, default=logging.INFO, help="Level of the root logger."
    )
    parser.add_argument("--gamma", type=float, default=0.9)
    parser.add_argument("--ddpg-training-steps", type=int, default=int(1e3))
    parser.add_argument("--adversary-training-steps", type=int,default=int(1e3))
    args = parser.parse_args()

    logging.basicConfig(level=args.log_level)

    args.outdir = './results'
    print("Output files are saved in {}".format(args.outdir))

    # Set a random seed used in PFRL
    utils.set_random_seed(args.seed)

    def make_env(test):
        env = gym.make('DClawTurnFixed-v0')
        # Unwrap TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = pfrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = pfrl.wrappers.Monitor(env, args.outdir)
        if args.render and not test:
            env = pfrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.max_episode_steps
    obs_space = env.observation_space
    action_space = env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    obs_size = obs_space.low.size
    action_size = action_space.low.size

    q_func = nn.Sequential(
        ConcatObsAndAction(),
        nn.Linear(obs_size + action_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256,256),
        nn.ReLU(),
        nn.Linear(256, 1),
    )
    policy = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256,256),
        nn.ReLU(),
        nn.Linear(256, action_size),
        BoundByTanh(low=action_space.low, high=action_space.high),
        DeterministicHead(),
    )

    ddpg_opt_a = torch.optim.Adam(policy.parameters())
    ddpg_opt_c = torch.optim.Adam(q_func.parameters())

    ddpg_rbuf = replay_buffers.ReplayBuffer(10 ** 6)

    ddpg_explorer = explorers.AdditiveGaussian(
        scale=0.1, low=action_space.low, high=action_space.high
    )

    def ddpg_burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low, action_space.high).astype(np.float32)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    ddpg_agent = DDPG(
        policy,
        q_func,
        ddpg_opt_a,
        ddpg_opt_c,
        ddpg_rbuf,
        gamma=args.gamma,
        explorer=ddpg_explorer,
        replay_start_size=args.replay_start_size,
        target_update_method="soft",
        target_update_interval=1,
        update_interval=1,
        soft_update_tau=5e-3,
        n_times_update=1,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=ddpg_burnin_action_func,
    )
    def adversary_random_func():
        return np.random.randint(0,9)
    # adversary_q = Critic(obs_size, 1, hidden_size=adversary_hidden_size)
    # adversary_action_space = gym.spaces.discrete.Discrete(9)
    # adversary_q = q_functions.FCQuadraticStateQFunction(
    #     obs_size, 1, n_hidden_channels = 256, n_hidden_layers = 2,action_space = adversary_action_space
    # )
    adversary_q = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.Linear(256,256),
        nn.Linear(256,256),
        nn.Linear(256,1),
        DiscreteActionValueHead(),
    )
    adversary_optimizer = torch.optim.Adam(adversary_q.parameters(), lr=1e-3)
    adversary_rbuf_capacity = int(1e6)
    adversary_rbuf = replay_buffers.ReplayBuffer(adversary_rbuf_capacity)
    adversary_explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, 0.1, 10**4, adversary_random_func
    )

    adversary_agent = DQN(
        adversary_q,
        adversary_optimizer,
        adversary_rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=adversary_explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=1,
        minibatch_size=args.batch_size,
        target_update_method='soft',
        soft_update_tau=5e-3

    )
    logger = logging.getLogger(__name__)
    eval_env = make_env(test=True)
    evaluator = Evaluator(
        agent=ddpg_agent,
        n_steps=None,
        n_episodes=args.eval_n_runs,
        eval_interval=args.eval_interval,
        outdir=args.outdir,
        max_episode_len=timestep_limit,
        env=eval_env,
        step_offset=0,
        save_best_so_far_agent=True,
        use_tensorboard=True,
        logger=logger,
    )

    episode_reward = 0
    ddpg_episode_idx = 0
    adversary_episode_idx = 0

    # o_0, r_0
    current_state = env.reset()

    t = 0 
    ddpg_t = 0
    adversary_t = 0
    episode_len = 0
    try:
        while t < args.max_steps:
            for i in range(args.ddpg_training_steps):
                t += 1
                ddpg_t += 1
                ddpg_action = ddpg_agent.act(current_state)
                adversary_action = adversary_agent.act(current_state)
                ddpg_action[adversary_action] = 0
                next_state, reward, done, info = env.step(ddpg_action)
                episode_reward += reward
                episode_len += 1
                reset = episode_len == timestep_limit or info.get("needs_reset", False)
                ddpg_agent.observe(next_state, reward, done, reset)
                current_state = next_state
                if done or reset or t == args.max_steps:
                    logger.info(
                        "ddpg phase: outdir:%s step:%s episode:%s R:%s",
                        args.outdir,
                        ddpg_t,
                        ddpg_episode_idx,
                        episode_reward,
                    )
                    logger.info("statistics:%s", ddpg_agent.get_statistics())
                    if evaluator is not None:
                        evaluator.evaluate_if_necessary(t=t, episodes=ddpg_episode_idx + 1)
                    if t == args.max_steps:
                        break
                    episode_reward = 0
                    ddpg_episode_idx += 1
                    episode_len = 0
                    current_state = env.reset()
            episode_reward = 0
            episode_len = 0
            current_state = env.reset()
            print("start adversary training ")
            for i in range(args.adversary_training_steps):
                t += 1
                adversary_t += 1
                ddpg_action = ddpg_agent.act(current_state)
                adversary_action = adversary_agent.act(current_state)
                ddpg_action[adversary_action] = 0
                next_state, reward, done, info = env.step(ddpg_action)
                reward = -reward
                episode_len += 1
                reset = episode_len == timestep_limit or info.get("needs_reset", False)
                adversary_agent.observe(next_state, reward, done, reset)
                current_state = next_state

                if done or reset or t == args.max_steps:
                    if t == args.max_steps:
                        break
                    episode_reward = 0
                    adversary_episode_idx += 1
                    episode_len = 0
                    current_state = env.reset()
            

    except (Exception, KeyboardInterrupt):
        # Save the current model before being killed
        save_agent(ddpg_agent, t, args.outdir, logger, suffix="_ddpg_except")
        save_agent(adversary_agent, t, args.outdir, logger, suffix="_adversary_except" )
        raise

    # Save the final model
    save_agent(ddpg_agent, t, args.outdir, logger, suffix="_ddpg_finish")
    save_agent(adversary_agent, t, args.outdir, logger, suffix="_adversary_finish" )
    # if args.demo:
    #     eval_env.render()
    #     eval_stats = experiments.eval_performance(
    #         env=eval_env,
    #         agent=ddpg_agent,
    #         n_steps=None,
    #         n_episodes=args.eval_n_runbase_envs,
    #         max_episode_len=timestep_limit,
    #     )
    #     print(
    #         "n_runs: {} mean: {} median: {} stdev {}".format(
    #             args.eval_n_runs,
    #             eval_stats["mean"],
    #             eval_stats["median"],
    #             eval_stats["stdev"],
    #         )
    #     )
    # else:
    #     experiments.train_agent_with_evaluation(
    #         agent=ddpg_agent,
    #         env=env,
    #         steps=args.steps,
    #         eval_env=eval_env,
    #         eval_n_steps=None,
    #         eval_n_episodes=args.eval_n_runs,
    #         eval_interval=args.eval_interval,
    #         outdir=args.outdir,
    #         train_max_episode_len=timestep_limit,
    #     )
    print("finish")
예제 #9
0
            self.bandit2 = 0.8
        else:
            self.bandit2 = 0.2
            self.bandit1 = 0.8


if ns.two:
    env = TwoArmedBanditEnv()
else:
    env = MultiBanditEnv()

n_actions = env.action_space.n
q_func = pfrl.nn.RecurrentSequential(
    nn.LSTM(input_size=1, hidden_size=512),
    nn.Linear(512, n_actions),
    DiscreteActionValueHead(),
)

replay_buffer = pfrl.replay_buffers.EpisodicReplayBuffer(10**6)

explorer = explorers.LinearDecayEpsilonGreedy(
    1.0,
    .01,
    1000000,
    lambda: numpy.random.randint(n_actions),
)


def phi(x):
    return numpy.asarray(x, dtype=numpy.float32) / 1.0
예제 #10
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--env",
        type=str,
        default="BreakoutNoFrameskip-v4",
        help="OpenAI Atari domain to perform algorithm on.",
    )
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=("Directory path to save output files."
              " If it does not exist, it will be created."),
    )
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 31)")
    parser.add_argument("--gpu",
                        type=int,
                        default=0,
                        help="GPU to use, set to -1 if no GPU.")
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default=None)
    parser.add_argument(
        "--final-exploration-frames",
        type=int,
        default=10**6,
        help="Timesteps after which we stop " + "annealing exploration rate",
    )
    parser.add_argument(
        "--final-epsilon",
        type=float,
        default=0.01,
        help="Final value of epsilon during training.",
    )
    parser.add_argument(
        "--eval-epsilon",
        type=float,
        default=0.001,
        help="Exploration epsilon used during eval episodes.",
    )
    parser.add_argument(
        "--steps",
        type=int,
        default=5 * 10**7,
        help="Total number of timesteps to train the agent.",
    )
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument(
        "--replay-start-size",
        type=int,
        default=5 * 10**4,
        help="Minimum replay buffer size before " +
        "performing gradient updates.",
    )
    parser.add_argument(
        "--target-update-interval",
        type=int,
        default=3 * 10**4,
        help="Frequency (in timesteps) at which " +
        "the target network is updated.",
    )
    parser.add_argument("--demo-n-episodes", type=int, default=30)
    parser.add_argument("--eval-n-steps", type=int, default=125000)
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=250000,
        help="Frequency (in timesteps) of evaluation phase.",
    )
    parser.add_argument(
        "--update-interval",
        type=int,
        default=4,
        help="Frequency (in timesteps) of network updates.",
    )
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=
        ("Monitor env. Videos and additional information are saved as output files."
         ),
    )
    parser.add_argument("--lr",
                        type=float,
                        default=2.5e-4,
                        help="Learning rate.")
    parser.add_argument(
        "--recurrent",
        action="store_true",
        default=False,
        help="Use a recurrent model. See the code for the model definition.",
    )
    parser.add_argument(
        "--flicker",
        action="store_true",
        default=False,
        help=("Use so-called flickering Atari, where each"
              " screen is blacked out with probability 0.5."),
    )
    parser.add_argument(
        "--no-frame-stack",
        action="store_true",
        default=False,
        help=
        ("Disable frame stacking so that the agent can only see the current screen."
         ),
    )
    parser.add_argument(
        "--episodic-update-len",
        type=int,
        default=10,
        help="Maximum length of sequences for updating recurrent models",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=32,
        help=("Number of transitions (in a non-recurrent case)"
              " or sequences (in a recurrent case) used for an"
              " update."),
    )
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
            flicker=args.flicker,
            frame_stack=not args.no_frame_stack,
        )
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = pfrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training")
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)
    print("Observation space", env.observation_space)
    print("Action space", env.action_space)

    n_frames = env.observation_space.shape[0]
    n_actions = env.action_space.n
    if args.recurrent:
        # Q-network with LSTM
        q_func = pfrl.nn.RecurrentSequential(
            nn.Conv2d(n_frames, 32, 8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, 4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, stride=1),
            nn.Flatten(),
            nn.ReLU(),
            nn.LSTM(input_size=3136, hidden_size=512),
            nn.Linear(512, n_actions),
            DiscreteActionValueHead(),
        )
        # Replay buffer that stores whole episodes
        rbuf = replay_buffers.EpisodicReplayBuffer(10**6)
    else:
        # Q-network without LSTM
        q_func = nn.Sequential(
            nn.Conv2d(n_frames, 32, 8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, 4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, stride=1),
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
            DiscreteActionValueHead(),
        )
        # Replay buffer that stores transitions separately
        rbuf = replay_buffers.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0,
        args.final_epsilon,
        args.final_exploration_frames,
        lambda: np.random.randint(n_actions),
    )

    opt = torch.optim.Adam(q_func.parameters(), lr=1e-4, eps=1e-4)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = pfrl.agents.DoubleDQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator="mean",
        phi=phi,
        minibatch_size=args.batch_size,
        episodic_update_len=args.episodic_update_len,
        recurrent=args.recurrent,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.demo_n_episodes,
        )
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.demo_n_episodes,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
        )