Example #1
    def _test_load_td3(self, gpu):
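        # Rebuild the exact networks used for the pretrained Hopper-v2 TD3
        # model (11-dim observations, 3-dim actions) so that agent.load()
        # can restore the downloaded weights.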

        obs_size = 11
        action_size = 3

        def make_q_func_with_optimizer():
            q_func = nn.Sequential(
                pnn.ConcatObsAndAction(),
                nn.Linear(obs_size + action_size, 400),
                nn.ReLU(),
                nn.Linear(400, 300),
                nn.ReLU(),
                nn.Linear(300, 1),
            )
            q_func_optimizer = torch.optim.Adam(q_func.parameters())
            return q_func, q_func_optimizer

        q_func1, q_func1_optimizer = make_q_func_with_optimizer()
        q_func2, q_func2_optimizer = make_q_func_with_optimizer()

        policy = nn.Sequential(
            nn.Linear(obs_size, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, action_size),
            nn.Tanh(),
            pfrl.policies.DeterministicHead(),
        )
        policy_optimizer = torch.optim.Adam(policy.parameters())

        rbuf = replay_buffers.ReplayBuffer(100)
        explorer = explorers.AdditiveGaussian(scale=0.1,
                                              low=[-1.0, -1.0, -1.0],
                                              high=[1.0, 1.0, 1.0])

        agent = agents.TD3(
            policy,
            q_func1,
            q_func2,
            policy_optimizer,
            q_func1_optimizer,
            q_func2_optimizer,
            rbuf,
            gamma=0.99,
            soft_update_tau=5e-3,
            explorer=explorer,
            replay_start_size=1000,
            gpu=gpu,
            minibatch_size=100,
            burnin_action_func=None,
        )

        downloaded_model, exists = download_model(
            "TD3", "Hopper-v2", model_type=self.pretrained_type)
        agent.load(downloaded_model)
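        # When PFRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED is set, require that the
        # model was already cached locally (reported by download_model's
        # second return value).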
        if os.environ.get("PFRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED"):
            assert exists
Example #2
    def _test_load_ddpg(self, gpu):
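        # Same idea as the TD3 test above, but for a pretrained DDPG agent:
        # a single critic, a tanh-bounded deterministic actor, and
        # agent.load() on the downloaded Hopper-v2 model.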

        obs_size = 11
        action_size = 3
        from pfrl.nn import ConcatObsAndAction

        q_func = nn.Sequential(
            ConcatObsAndAction(),
            nn.Linear(obs_size + action_size, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, 1),
        )
        from pfrl.nn import BoundByTanh
        from pfrl.policies import DeterministicHead

        policy = nn.Sequential(
            nn.Linear(obs_size, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, action_size),
            BoundByTanh(low=[-1.0, -1.0, -1.0], high=[1.0, 1.0, 1.0]),
            DeterministicHead(),
        )

        opt_a = torch.optim.Adam(policy.parameters())
        opt_c = torch.optim.Adam(q_func.parameters())

        explorer = explorers.AdditiveGaussian(scale=0.1,
                                              low=[-1.0, -1.0, -1.0],
                                              high=[1.0, 1.0, 1.0])

        agent = agents.DDPG(
            policy,
            q_func,
            opt_a,
            opt_c,
            replay_buffers.ReplayBuffer(100),
            gamma=0.99,
            explorer=explorer,
            replay_start_size=1000,
            target_update_method="soft",
            target_update_interval=1,
            update_interval=1,
            soft_update_tau=5e-3,
            n_times_update=1,
            gpu=gpu,
            minibatch_size=100,
            burnin_action_func=None,
        )

        downloaded_model, exists = download_model(
            "DDPG", "Hopper-v2", model_type=self.pretrained_type)
        agent.load(downloaded_model)
        if os.environ.get("PFRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED"):
            assert exists
Example #3
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=("Directory path to save output files."
              " If it does not exist, it will be created."),
    )
    parser.add_argument(
        "--env",
        type=str,
        default="Hopper-v2",
        help="OpenAI Gym MuJoCo env to perform algorithm on.",
    )
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 32)")
    parser.add_argument("--gpu",
                        type=int,
                        default=0,
                        help="GPU to use, set to -1 if no GPU.")
    parser.add_argument("--load",
                        type=str,
                        default="",
                        help="Directory to load agent from.")
    parser.add_argument(
        "--steps",
        type=int,
        default=10**6,
        help="Total number of timesteps to train the agent.",
    )
    parser.add_argument(
        "--eval-n-runs",
        type=int,
        default=10,
        help="Number of episodes run for each evaluation.",
    )
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=5000,
        help="Interval in timesteps between evaluations.",
    )
    parser.add_argument(
        "--replay-start-size",
        type=int,
        default=10000,
        help="Minimum replay buffer size before " +
        "performing gradient updates.",
    )
    parser.add_argument("--batch-size",
                        type=int,
                        default=100,
                        help="Minibatch size")
    parser.add_argument("--render",
                        action="store_true",
                        help="Render env states in a GUI window.")
    parser.add_argument("--demo",
                        action="store_true",
                        help="Just run evaluation, not training.")
    parser.add_argument("--load-pretrained",
                        action="store_true",
                        default=False)
    parser.add_argument("--pretrained-type",
                        type=str,
                        default="best",
                        choices=["best", "final"])
    parser.add_argument("--monitor",
                        action="store_true",
                        help="Wrap env with gym.wrappers.Monitor.")
    parser.add_argument("--log-level",
                        type=int,
                        default=logging.INFO,
                        help="Level of the root logger.")
    args = parser.parse_args()

    logging.basicConfig(level=args.log_level)

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print("Output files are saved in {}".format(args.outdir))

    # Set a random seed used in PFRL
    utils.set_random_seed(args.seed)

    def make_env(test):
        env = gym.make(args.env)
        # Unwrap TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = pfrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = pfrl.wrappers.Monitor(env, args.outdir)
        if args.render and not test:
            env = pfrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.max_episode_steps
    obs_space = env.observation_space
    action_space = env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    obs_size = obs_space.low.size
    action_size = action_space.low.size

    policy = nn.Sequential(
        nn.Linear(obs_size, 400),
        nn.ReLU(),
        nn.Linear(400, 300),
        nn.ReLU(),
        nn.Linear(300, action_size),
        nn.Tanh(),
        pfrl.policies.DeterministicHead(),
    )
    policy_optimizer = torch.optim.Adam(policy.parameters())
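    # TD3 trains two independent critics and uses the smaller of their target
    # values (clipped double Q-learning), so two Q-functions and optimizers
    # are created below.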

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(obs_size + action_size, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, 1),
        )
        q_func_optimizer = torch.optim.Adam(q_func.parameters())
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = replay_buffers.ReplayBuffer(10**6)

    explorer = explorers.AdditiveGaussian(scale=0.1,
                                          low=action_space.low,
                                          high=action_space.high)

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low,
                                 action_space.high).astype(np.float32)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = pfrl.agents.TD3(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=0.99,
        soft_update_tau=5e-3,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=burnin_action_func,
    )

    if len(args.load) > 0 or args.load_pretrained:
        # --load and --load-pretrained are mutually exclusive
        assert not len(args.load) > 0 or not args.load_pretrained
        if len(args.load) > 0:
            agent.load(args.load)
        else:
            agent.load(
                utils.download_model("TD3",
                                     args.env,
                                     model_type=args.pretrained_type)[0])

    eval_env = make_env(test=True)
    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit,
        )
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
        import json
        import os

        with open(os.path.join(args.outdir, "demo_scores.json"), "w") as f:
            json.dump(eval_stats, f)
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_env=eval_env,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            train_max_episode_len=timestep_limit,
        )
Example #4
    def __init__(self,
                 state_dim,
                 goal_dim,
                 action_dim,
                 scale,
                 replay_buffer,
                 actor_lr,
                 critic_lr,
                 expl_noise,
                 policy_noise,
                 noise_clip,
                 gamma,
                 policy_freq,
                 tau,
                 is_low_level,
                 buffer_freq,
                 minibatch_size,
                 gpu,
                 add_entropy,
                 burnin_action_func=None,
                 replay_start_size=2500):
        self.scale = scale
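        # `scale` is the per-dimension action bound: the exploration noise and
        # the target policy smoothing below both clamp actions to
        # [-scale, scale].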
        # parameters
        self.expl_noise = expl_noise
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.policy_freq = policy_freq
        self.tau = tau
        self.is_low_level = is_low_level
        self.minibatch_size = minibatch_size
        self.add_entropy = add_entropy
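        # Two policy variants: a diagonal-Gaussian head when entropy is added,
        # otherwise a deterministic tanh policy rescaled by `scale`.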
        # create td3 agent
        self.device = (torch.device(f'cuda:{gpu}')
                       if gpu is not None and gpu >= 0 else torch.device('cpu'))
        if self.add_entropy:

            def squashed_diagonal_gaussian_head(x):
                mean, log_scale = torch.chunk(x, 2, dim=-1)
                log_scale = torch.clamp(log_scale, -20.0, 2.0)
                var = torch.exp(log_scale * 2)
                base_distribution = distributions.Independent(
                    distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
                return base_distribution

            policy = nn.Sequential(
                nn.Linear(state_dim + goal_dim, 300),
                nn.ReLU(),
                nn.Linear(300, 300),
                nn.ReLU(),
                nn.Linear(300, action_dim * 2),
                nn.Tanh(),
                ConstantsMult(
                    torch.cat(
                        (torch.tensor(self.scale), torch.ones(
                            self.scale.size))).float().to(self.device)),
                # pfrl.policies.DeterministicHead(),
                Lambda(squashed_diagonal_gaussian_head),
            )
        else:
            policy = nn.Sequential(
                nn.Linear(state_dim + goal_dim, 300),
                nn.ReLU(),
                nn.Linear(300, 300),
                nn.ReLU(),
                nn.Linear(300, action_dim),
                nn.Tanh(),
                ConstantsMult(
                    torch.tensor(self.scale).float().to(self.device)),
                pfrl.policies.DeterministicHead(),
            )

        policy_optimizer = torch.optim.Adam(policy.parameters(), lr=actor_lr)

        def make_q_func_with_optimizer():
            q_func = nn.Sequential(
                pfrl.nn.ConcatObsAndAction(),
                nn.Linear(state_dim + goal_dim + action_dim, 300),
                nn.ReLU(),
                nn.Linear(300, 300),
                nn.ReLU(),
                nn.Linear(300, 1),
            )
            q_func_optimizer = torch.optim.Adam(q_func.parameters(),
                                                lr=critic_lr)
            return q_func, q_func_optimizer

        q_func1, q_func1_optimizer = make_q_func_with_optimizer()
        q_func2, q_func2_optimizer = make_q_func_with_optimizer()

        # TODO - have proper low and high values from action space.
        # from the hiro paper, the scale is 1.0
        explorer = explorers.AdditiveGaussian(scale=self.expl_noise * 1.0,
                                              low=-self.scale,
                                              high=self.scale)

        def default_target_policy_smoothing_func(batch_action):
            """Add noises to actions for target policy smoothing."""
            noise = torch.clamp(
                self.policy_noise * torch.randn_like(batch_action),
                -self.noise_clip, self.noise_clip)
            smoothed_action = batch_action + noise
            smoothed_action = torch.min(
                smoothed_action,
                torch.tensor(self.scale).to(self.device).float())
            smoothed_action = torch.max(
                smoothed_action,
                torch.tensor(-self.scale).to(self.device).float())
            return smoothed_action
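        # The networks and optimizers above drive either the low-level
        # goal-conditioned agent or the HIRO high-level agent below; the
        # high-level branch scales replay_start_size down by buffer_freq.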

        if self.is_low_level:
            # standard goal conditioned td3
            self.agent = GoalConditionedTD3(
                policy,
                q_func1,
                q_func2,
                policy_optimizer,
                q_func1_optimizer,
                q_func2_optimizer,
                replay_buffer,
                gamma=gamma,
                soft_update_tau=tau,
                explorer=explorer,
                update_interval=1,
                policy_update_delay=policy_freq,
                replay_start_size=replay_start_size,
                buffer_freq=buffer_freq,
                minibatch_size=minibatch_size,
                gpu=gpu,
                add_entropy=self.add_entropy,
                burnin_action_func=burnin_action_func,
                target_policy_smoothing_func=
                default_target_policy_smoothing_func)
        else:
            self.agent = HIROHighLevelGoalConditionedTD3(
                policy,
                q_func1,
                q_func2,
                policy_optimizer,
                q_func1_optimizer,
                q_func2_optimizer,
                replay_buffer,
                gamma=gamma,
                soft_update_tau=tau,
                explorer=explorer,
                update_interval=1,
                policy_update_delay=policy_freq,
                replay_start_size=replay_start_size / buffer_freq,
                buffer_freq=buffer_freq,
                minibatch_size=minibatch_size,
                gpu=gpu,
                add_entropy=self.add_entropy,
                burnin_action_func=burnin_action_func,
                target_policy_smoothing_func=
                default_target_policy_smoothing_func)

        self.device = self.agent.device
Example #5
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument(
        "--env",
        type=str,
        default="'DClawTurnFixed-v0'",
        help="OpenAI Gym MuJoCo env to perform algorithm on.",
    )
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument(
        "--gpu", type=int, default=-1, help="GPU to use, set to -1 if no GPU."
    )
    parser.add_argument(
        "--load", type=str, default="", help="Directory to load agent from."
    )
    parser.add_argument(
        "--max-steps",
        type=int,
        default=10 ** 6,
        help="Total number of timesteps to train the agent.",
    )
    parser.add_argument(
        "--eval-n-runs",
        type=int,
        default=10,
        help="Number of episodes run for each evaluation.",
    )
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=5000,
        help="Interval in timesteps between evaluations.",
    )
    parser.add_argument(
        "--replay-start-size",
        type=int,
        default=10000,
        help="Minimum replay buffer size before " + "performing gradient updates.",
    )
    parser.add_argument("--batch-size", type=int, default=64, help="Minibatch size")
    parser.add_argument(
        "--render", action="store_true", help="Render env states in a GUI window."
    )
    parser.add_argument(
        "--demo", action="store_true", help="Just run evaluation, not training."
    )
    parser.add_argument("--load-pretrained", action="store_true", default=False)
    parser.add_argument(
        "--pretrained-type", type=str, default="best", choices=["best", "final"]
    )
    parser.add_argument(
        "--monitor", action="store_true", help="Wrap env with gym.wrappers.Monitor."
    )
    parser.add_argument(
        "--log-level", type=int, default=logging.INFO, help="Level of the root logger."
    )
    parser.add_argument("--gamma", type=float, default=0.9)
    parser.add_argument("--ddpg-training-steps", type=int, default=int(1e3))
    parser.add_argument("--adversary-training-steps", type=int,default=int(1e3))
    args = parser.parse_args()

    logging.basicConfig(level=args.log_level)

    args.outdir = './results'  # NOTE: overrides the --outdir argument parsed above
    print("Output files are saved in {}".format(args.outdir))

    # Set a random seed used in PFRL
    utils.set_random_seed(args.seed)

    def make_env(test):
        env = gym.make('DClawTurnFixed-v0')
        # Unwrap TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = pfrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = pfrl.wrappers.Monitor(env, args.outdir)
        if args.render and not test:
            env = pfrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.max_episode_steps
    obs_space = env.observation_space
    action_space = env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    obs_size = obs_space.low.size
    action_size = action_space.low.size
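    # obs_size/action_size define the DDPG networks below: a Q-critic over
    # concatenated (observation, action) and a tanh-bounded deterministic actor.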

    q_func = nn.Sequential(
        ConcatObsAndAction(),
        nn.Linear(obs_size + action_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, 1),
    )
    policy = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, action_size),
        BoundByTanh(low=action_space.low, high=action_space.high),
        DeterministicHead(),
    )

    ddpg_opt_a = torch.optim.Adam(policy.parameters())
    ddpg_opt_c = torch.optim.Adam(q_func.parameters())

    ddpg_rbuf = replay_buffers.ReplayBuffer(10 ** 6)

    ddpg_explorer = explorers.AdditiveGaussian(
        scale=0.1, low=action_space.low, high=action_space.high
    )

    def ddpg_burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low, action_space.high).astype(np.float32)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    ddpg_agent = DDPG(
        policy,
        q_func,
        ddpg_opt_a,
        ddpg_opt_c,
        ddpg_rbuf,
        gamma=args.gamma,
        explorer=ddpg_explorer,
        replay_start_size=args.replay_start_size,
        target_update_method="soft",
        target_update_interval=1,
        update_interval=1,
        soft_update_tau=5e-3,
        n_times_update=1,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=ddpg_burnin_action_func,
    )
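
    # Adversary: a DQN agent whose discrete action picks which dimension of
    # the DDPG action to zero out before env.step(); its random exploration
    # draws an index in [0, 9).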
    def adversary_random_func():
        return np.random.randint(0, 9)
    # adversary_q = Critic(obs_size, 1, hidden_size=adversary_hidden_size)
    # adversary_action_space = gym.spaces.discrete.Discrete(9)
    # adversary_q = q_functions.FCQuadraticStateQFunction(
    #     obs_size, 1, n_hidden_channels = 256, n_hidden_layers = 2,action_space = adversary_action_space
    # )
    adversary_q = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.Linear(256, 256),
        nn.Linear(256, 256),
        nn.Linear(256, 1),
        DiscreteActionValueHead(),
    )
    adversary_optimizer = torch.optim.Adam(adversary_q.parameters(), lr=1e-3)
    adversary_rbuf_capacity = int(1e6)
    adversary_rbuf = replay_buffers.ReplayBuffer(adversary_rbuf_capacity)
    adversary_explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, 0.1, 10**4, adversary_random_func
    )

    adversary_agent = DQN(
        adversary_q,
        adversary_optimizer,
        adversary_rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=adversary_explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=1,
        minibatch_size=args.batch_size,
        target_update_method='soft',
        soft_update_tau=5e-3,
    )
    logger = logging.getLogger(__name__)
    eval_env = make_env(test=True)
    evaluator = Evaluator(
        agent=ddpg_agent,
        n_steps=None,
        n_episodes=args.eval_n_runs,
        eval_interval=args.eval_interval,
        outdir=args.outdir,
        max_episode_len=timestep_limit,
        env=eval_env,
        step_offset=0,
        save_best_so_far_agent=True,
        use_tensorboard=True,
        logger=logger,
    )

    episode_reward = 0
    ddpg_episode_idx = 0
    adversary_episode_idx = 0

    # o_0, r_0
    current_state = env.reset()

    t = 0 
    ddpg_t = 0
    adversary_t = 0
    episode_len = 0
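    # Alternate training: run the DDPG agent for ddpg_training_steps (with the
    # adversary acting but not learning), then train the adversary on the
    # negated task reward for adversary_training_steps.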
    try:
        while t < args.max_steps:
            for i in range(args.ddpg_training_steps):
                t += 1
                ddpg_t += 1
                ddpg_action = ddpg_agent.act(current_state)
                adversary_action = adversary_agent.act(current_state)
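                # Zero out the action dimension chosen by the adversary.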
                ddpg_action[adversary_action] = 0
                next_state, reward, done, info = env.step(ddpg_action)
                episode_reward += reward
                episode_len += 1
                reset = episode_len == timestep_limit or info.get("needs_reset", False)
                ddpg_agent.observe(next_state, reward, done, reset)
                current_state = next_state
                if done or reset or t == args.max_steps:
                    logger.info(
                        "ddpg phase: outdir:%s step:%s episode:%s R:%s",
                        args.outdir,
                        ddpg_t,
                        ddpg_episode_idx,
                        episode_reward,
                    )
                    logger.info("statistics:%s", ddpg_agent.get_statistics())
                    if evaluator is not None:
                        evaluator.evaluate_if_necessary(t=t, episodes=ddpg_episode_idx + 1)
                    if t == args.max_steps:
                        break
                    episode_reward = 0
                    ddpg_episode_idx += 1
                    episode_len = 0
                    current_state = env.reset()
            episode_reward = 0
            episode_len = 0
            current_state = env.reset()
            print("start adversary training ")
            for i in range(args.adversary_training_steps):
                t += 1
                adversary_t += 1
                ddpg_action = ddpg_agent.act(current_state)
                adversary_action = adversary_agent.act(current_state)
                ddpg_action[adversary_action] = 0
                next_state, reward, done, info = env.step(ddpg_action)
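                # The adversary is rewarded for reducing the task reward.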
                reward = -reward
                episode_len += 1
                reset = episode_len == timestep_limit or info.get("needs_reset", False)
                adversary_agent.observe(next_state, reward, done, reset)
                current_state = next_state

                if done or reset or t == args.max_steps:
                    if t == args.max_steps:
                        break
                    episode_reward = 0
                    adversary_episode_idx += 1
                    episode_len = 0
                    current_state = env.reset()
            

    except (Exception, KeyboardInterrupt):
        # Save the current model before being killed
        save_agent(ddpg_agent, t, args.outdir, logger, suffix="_ddpg_except")
        save_agent(adversary_agent, t, args.outdir, logger, suffix="_adversary_except")
        raise

    # Save the final model
    save_agent(ddpg_agent, t, args.outdir, logger, suffix="_ddpg_finish")
    save_agent(adversary_agent, t, args.outdir, logger, suffix="_adversary_finish")
    # if args.demo:
    #     eval_env.render()
    #     eval_stats = experiments.eval_performance(
    #         env=eval_env,
    #         agent=ddpg_agent,
    #         n_steps=None,
    #         n_episodes=args.eval_n_runbase_envs,
    #         max_episode_len=timestep_limit,
    #     )
    #     print(
    #         "n_runs: {} mean: {} median: {} stdev {}".format(
    #             args.eval_n_runs,
    #             eval_stats["mean"],
    #             eval_stats["median"],
    #             eval_stats["stdev"],
    #         )
    #     )
    # else:
    #     experiments.train_agent_with_evaluation(
    #         agent=ddpg_agent,
    #         env=env,
    #         steps=args.steps,
    #         eval_env=eval_env,
    #         eval_n_steps=None,
    #         eval_n_episodes=args.eval_n_runs,
    #         eval_interval=args.eval_interval,
    #         outdir=args.outdir,
    #         train_max_episode_len=timestep_limit,
    #     )
    print("finish")
Example #6
    def __init__(
            self,
            state_dim,
            goal_dim,
            action_dim,
            scale,
            replay_buffer,
            actor_lr,
            critic_lr,
            expl_noise,
            policy_noise,
            noise_clip,
            gamma,
            policy_freq,
            tau,
            is_low_level,
            buffer_freq,
            minibatch_size,
            gpu,
            add_entropy,
            burnin_action_func=None,
            replay_start_size=2500,
            temperature=1.0,
            optimize_temp=False):
        self.scale = scale

        if gpu is not None and gpu >= 0:
            assert torch.cuda.is_available()
            self.device = torch.device("cuda:{}".format(gpu))
        else:
            self.device = torch.device("cpu")

        self.scale_tensor = torch.tensor(self.scale).float().to(self.device)
        # parameters
        self.expl_noise = expl_noise
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.policy_freq = policy_freq
        self.tau = tau
        self.is_low_level = is_low_level
        self.minibatch_size = minibatch_size
        self.add_entropy = add_entropy

        # create agent
        if self.add_entropy:
            def squashed_diagonal_gaussian_head(x):
                """
                taken from the SAC code.
                """
                assert x.shape[-1] == action_dim * 2

                mean, log_scale = torch.chunk(x, 2, dim=-1)
                log_scale = torch.clamp(log_scale, -20.0, 2.0)
                var = torch.exp(log_scale * 2)
                base_distribution = distributions.Independent(
                    distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1
                )
                # cache_size=1 is required for numerical stability
                return distributions.transformed_distribution.TransformedDistribution(
                    base_distribution, [distributions.transforms.TanhTransform(cache_size=1)]
                )

            # SAC policy definition:
            policy = nn.Sequential(
                nn.Linear(state_dim + goal_dim, 256),
                nn.ReLU(),
                nn.Linear(256, 256),
                nn.ReLU(),
                nn.Linear(256, action_dim * 2),
                Lambda(squashed_diagonal_gaussian_head),
                )

            torch.nn.init.xavier_uniform_(policy[0].weight)
            torch.nn.init.xavier_uniform_(policy[2].weight)
            torch.nn.init.xavier_uniform_(policy[4].weight)
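            # With the stochastic policy, no extra exploration noise is
            # injected (scale=0.0); exploration comes from sampling the
            # Gaussian policy head itself.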
            explorer = explorers.AdditiveGaussian(
                scale=0.0,
            )

        else:
            policy = nn.Sequential(
                nn.Linear(state_dim + goal_dim, 300),
                nn.ReLU(),
                nn.Linear(300, 300),
                nn.ReLU(),
                nn.Linear(300, action_dim),
                nn.Tanh(),
                pfrl.policies.DeterministicHead(),
                )
            # TODO - have proper low and high values from action space.
            # from the hiro paper, the scale is 1.0
            explorer = explorers.AdditiveGaussian(
                scale=self.expl_noise,
                low=-self.scale,
                high=self.scale
            )


        policy_optimizer = torch.optim.Adam(policy.parameters(), lr=actor_lr)

        def make_q_func_with_optimizer():
            q_func = nn.Sequential(
                pfrl.nn.ConcatObsAndAction(),
                nn.Linear(state_dim + goal_dim + action_dim, 300),
                nn.ReLU(),
                nn.Linear(300, 300),
                nn.ReLU(),
                nn.Linear(300, 1),
            )
            q_func_optimizer = torch.optim.Adam(q_func.parameters(), lr=critic_lr)
            return q_func, q_func_optimizer

        q_func1, q_func1_optimizer = make_q_func_with_optimizer()
        q_func2, q_func2_optimizer = make_q_func_with_optimizer()


        def default_target_policy_smoothing_func(batch_action):
            """Add noises to actions for target policy smoothing."""
            noise = torch.clamp(self.policy_noise * torch.randn_like(batch_action), -self.noise_clip, self.noise_clip)
            smoothed_action = batch_action + noise
            smoothed_action = torch.min(smoothed_action, torch.tensor(self.scale).to(self.device).float())
            smoothed_action = torch.max(smoothed_action, torch.tensor(-self.scale).to(self.device).float())
            return smoothed_action

        input_scale = self.scale_tensor

        if self.is_low_level:
            # standard goal conditioned td3
            self.agent = GoalConditionedTD3(
                policy,
                q_func1,
                q_func2,
                policy_optimizer,
                q_func1_optimizer,
                q_func2_optimizer,
                replay_buffer,
                gamma=gamma,
                soft_update_tau=tau,
                explorer=explorer,
                update_interval=1,
                policy_update_delay=policy_freq,
                replay_start_size=replay_start_size,
                buffer_freq=buffer_freq,
                minibatch_size=minibatch_size,
                gpu=gpu,
                add_entropy=self.add_entropy,
                scale=input_scale,
                burnin_action_func=burnin_action_func,
                target_policy_smoothing_func=default_target_policy_smoothing_func,
                entropy_temperature=temperature,
                optimize_temp=optimize_temp
                )
        else:
            self.agent = HIROHighLevelGoalConditionedTD3(
                policy,
                q_func1,
                q_func2,
                policy_optimizer,
                q_func1_optimizer,
                q_func2_optimizer,
                replay_buffer,
                gamma=gamma,
                soft_update_tau=tau,
                explorer=explorer,
                update_interval=1,
                policy_update_delay=policy_freq,
                replay_start_size=replay_start_size/buffer_freq - 5,
                buffer_freq=buffer_freq,
                minibatch_size=minibatch_size,
                gpu=gpu,
                add_entropy=self.add_entropy,
                scale=input_scale,
                burnin_action_func=burnin_action_func,
                target_policy_smoothing_func=default_target_policy_smoothing_func,
                entropy_temperature=temperature,
                optimize_temp=optimize_temp
                )

        self.device = self.agent.device