Example #1
def hierarchy_sac(
    variant,
    env_class,
    env_kwargs=None,
    observation_key="observation",
):
    # initialize tensorflow and the multiprocessing interface
    maybe_initialize_process()

    # default to an empty dict of environment kwargs
    if env_kwargs is None:
        env_kwargs = {}

    # build the normalized environment and read the action and observation sizes
    env = NormalizedEnv(env_class, **env_kwargs)
    action_dim = env.action_space.low.size
    observation_dim = env.observation_space.spaces[observation_key].low.size

    # create a replay buffer to store data
    replay_buffer = StepReplayBuffer(max_num_steps=variant["max_num_steps"])

    # create a logging instance
    logger = TensorboardLogger(replay_buffer, variant["logging_dir"])

    # a dict to store models for saving to the disk
    models_dict = {}

    # build a hierarchy of agents, each level trained with SAC
    levels = []
    for level in range(variant["num_hierarchy_levels"]):

        # create policies for each level in the hierarchy
        policy = TanhGaussian(
            dense(observation_dim + (0 if level == 0 else observation_dim),
                  2 * (observation_dim if level == 0 else action_dim),
                  hidden_size=variant["hidden_size"],
                  num_hidden_layers=variant["num_hidden_layers"]),
            optimizer_kwargs=dict(
                learning_rate=variant["policy_learning_rate"]),
            tau=variant["tau"],
            std=None)

        # create critics for each level in the hierarchy
        qf1 = Gaussian(
            dense(observation_dim + (0 if level == 0 else observation_dim) +
                  (observation_dim if level == 0 else action_dim),
                  1,
                  hidden_size=variant["hidden_size"],
                  num_hidden_layers=variant["num_hidden_layers"]),
            optimizer_kwargs=dict(learning_rate=variant["qf_learning_rate"]),
            tau=variant["tau"],
            std=1.0)
        target_qf1 = qf1.clone()

        # create a second critic for this level (twin Q-functions reduce overestimation)
        qf2 = Gaussian(
            dense(observation_dim + (0 if level == 0 else observation_dim) +
                  (observation_dim if level == 0 else action_dim),
                  1,
                  hidden_size=variant["hidden_size"],
                  num_hidden_layers=variant["num_hidden_layers"]),
            optimizer_kwargs=dict(learning_rate=variant["qf_learning_rate"]),
            tau=variant["tau"],
            std=1.0)
        target_qf2 = qf2.clone()

        # relabel the rewards of the lower level policies
        relabeled_buffer = (GoalConditioned(
            replay_buffer, reward_scale=0.0, goal_conditioned_scale=1.0)
                            if level > 0 else replay_buffer)

        # train the agent using soft actor critic
        algorithm = SAC(policy,
                        qf1,
                        qf2,
                        target_qf1,
                        target_qf2,
                        relabeled_buffer,
                        reward_scale=variant["reward_scale"],
                        discount=variant["discount"],
                        initial_alpha=variant["initial_alpha"],
                        alpha_optimizer_kwargs=dict(
                            learning_rate=variant["policy_learning_rate"]),
                        target_entropy=(-action_dim),
                        observation_key=observation_key,
                        batch_size=variant["batch_size"],
                        logger=logger,
                        logging_prefix="sac_level{}/".format(level))

        # wrap the policy in an agent for this level of the hierarchy
        levels.append(
            PolicyAgent(
                policy,
                time_skip=variant["time_skip"]**(
                    variant["num_hierarchy_levels"] - 1 - level),
                goal_skip=(variant["time_skip"]**(
                    variant["num_hierarchy_levels"] -
                    level) if level > 0 else variant["max_path_length"]),
                algorithm=algorithm,
                observation_key=observation_key))

        models_dict["policy_level{}".format(level)] = policy
        models_dict["qf1_level{}".format(level)] = qf1
        models_dict["qf2_level{}".format(level)] = qf2
        models_dict["target_qf1_level{}".format(level)] = target_qf1
        models_dict["target_qf2_level{}".format(level)] = target_qf2

    # create a hierarchy agent using the list of agents
    agent = HierarchyAgent(levels)

    # create a saver to record training progress to the disk
    saver = LocalSaver(replay_buffer, variant["logging_dir"], **models_dict)

    # make a parallel sampler to collect data from the environment
    sampler = ParallelSampler(env,
                              agent,
                              max_path_length=variant["max_path_length"],
                              num_workers=variant["num_workers"])

    # collect warm-up samples before training begins
    sampler.set_weights(agent.get_weights())
    paths, returns, num_steps = sampler.collect(
        variant["num_warm_up_steps"],
        deterministic=False,
        keep_data=True,
        workers_to_use=variant["num_workers"])

    # insert the samples into the replay buffer
    for o, a, r in paths:
        replay_buffer.insert_path(o, a, r)

    # train for the specified number of epochs
    for iteration in range(variant["num_epochs"]):

        if iteration % variant["num_epochs_per_eval"] == 0:

            # evaluate the policy at this step
            sampler.set_weights(agent.get_weights())
            paths, eval_returns, num_steps = sampler.collect(
                variant["num_steps_per_eval"],
                deterministic=True,
                keep_data=False,
                workers_to_use=variant["num_workers"])
            logger.record("eval_mean_return", np.mean(eval_returns))

            # save the replay buffer and the policies
            saver.save()

        # collect more training samples
        sampler.set_weights(agent.get_weights())
        paths, train_returns, num_steps = sampler.collect(
            variant["num_steps_per_epoch"],
            deterministic=False,
            keep_data=True,
            workers_to_use=1)
        logger.record("train_mean_return", np.mean(train_returns))

        # insert the samples into the replay buffer
        for o, a, r in paths:
            replay_buffer.insert_path(o, a, r)

        # take one training step per environment step collected
        for i in range(num_steps):
            agent.train()
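
For reference, below is a minimal sketch of how this entry point might be called. The variant keys are exactly the ones read inside hierarchy_sac; the numeric values and the MyGoalEnv class are illustrative placeholders rather than values from the original source, and the environment class is assumed to expose a Dict observation space containing the "observation" key.

# hypothetical usage sketch: the values and MyGoalEnv are placeholders,
# not defaults from the original source
variant = dict(
    max_num_steps=1000000,
    logging_dir="./hierarchy_sac",
    num_hierarchy_levels=2,
    hidden_size=256,
    num_hidden_layers=2,
    policy_learning_rate=3e-4,
    qf_learning_rate=3e-4,
    tau=5e-3,
    reward_scale=1.0,
    discount=0.99,
    initial_alpha=0.1,
    batch_size=256,
    time_skip=10,
    max_path_length=1000,
    num_workers=2,
    num_warm_up_steps=10000,
    num_epochs=1000,
    num_epochs_per_eval=10,
    num_steps_per_eval=10000,
    num_steps_per_epoch=1000)

hierarchy_sac(variant, MyGoalEnv, observation_key="observation")

With num_hierarchy_levels=2 and time_skip=10 as above, the exponent computed inside the loop makes the level-0 (top) agent act every 10 environment steps while the level-1 agent acts at every step.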
Example #2
def ddpg(
    variant,
    env_class,
    env_kwargs=None,
    observation_key="observation",
):
    # initialize tensorflow and the multiprocessing interface
    maybe_initialize_process()

    # default to an empty dict of environment kwargs
    if env_kwargs is None:
        env_kwargs = {}

    # build the normalized environment and read the action and observation sizes
    env = NormalizedEnv(env_class, **env_kwargs)
    action_dim = env.action_space.low.size
    observation_dim = env.observation_space.spaces[observation_key].low.size

    # create a replay buffer to store data
    replay_buffer = StepReplayBuffer(max_num_steps=variant["max_num_steps"])

    # create a logging instance
    logger = TensorboardLogger(replay_buffer, variant["logging_dir"])

    # create the policy and its target network
    policy = Gaussian(
        dense(observation_dim,
              action_dim,
              hidden_size=variant["hidden_size"],
              num_hidden_layers=variant["num_hidden_layers"],
              output_activation="tanh"),
        optimizer_kwargs=dict(learning_rate=variant["policy_learning_rate"]),
        tau=variant["tau"],
        std=variant["exploration_noise_std"])
    target_policy = policy.clone()

    # create the critic and its target network
    qf = Gaussian(
        dense(observation_dim + action_dim,
              1,
              hidden_size=variant["hidden_size"],
              num_hidden_layers=variant["num_hidden_layers"]),
        optimizer_kwargs=dict(learning_rate=variant["qf_learning_rate"]),
        tau=variant["tau"],
        std=1.0)
    target_qf = qf.clone()

    # train the agent using deep deterministic policy gradient (DDPG)
    algorithm = DDPG(policy,
                     target_policy,
                     qf,
                     target_qf,
                     replay_buffer,
                     reward_scale=variant["reward_scale"],
                     discount=variant["discount"],
                     observation_key=observation_key,
                     batch_size=variant["batch_size"],
                     logger=logger,
                     logging_prefix="ddpg/")

    # wrap the policy in a single agent
    agent = PolicyAgent(policy,
                        algorithm=algorithm,
                        observation_key=observation_key)

    # create a saver to record training progress to the disk
    saver = LocalSaver(replay_buffer,
                       variant["logging_dir"],
                       policy=policy,
                       target_policy=target_policy,
                       qf=qf,
                       target_qf=target_qf)

    # load the networks if already trained
    saver.load()

    # make a parallel sampler to collect data from the environment
    sampler = ParallelSampler(env,
                              agent,
                              max_path_length=variant["max_path_length"],
                              num_workers=variant["num_workers"])

    # collect warm-up samples before training begins
    sampler.set_weights(agent.get_weights())
    paths, returns, num_steps = sampler.collect(
        variant["num_warm_up_steps"],
        deterministic=False,
        keep_data=True,
        workers_to_use=variant["num_workers"])

    # insert the samples into the replay buffer
    for o, a, r in paths:
        replay_buffer.insert_path(o, a, r)

    # train for the specified number of epochs
    for iteration in range(variant["num_epochs"]):

        if iteration % variant["num_epochs_per_eval"] == 0:

            # evaluate the policy at this step
            sampler.set_weights(agent.get_weights())
            paths, eval_returns, num_steps = sampler.collect(
                variant["num_steps_per_eval"],
                deterministic=True,
                keep_data=False,
                workers_to_use=variant["num_workers"])
            logger.record("eval_mean_return", np.mean(eval_returns))

            # save the replay buffer and the policies
            saver.save()

        # collect more training samples
        sampler.set_weights(agent.get_weights())
        paths, train_returns, num_steps = sampler.collect(
            variant["num_steps_per_epoch"],
            deterministic=False,
            keep_data=True,
            workers_to_use=1)
        logger.record("train_mean_return", np.mean(train_returns))

        # insert the samples into the replay buffer
        for o, a, r in paths:
            replay_buffer.insert_path(o, a, r)

        # take one training step per environment step collected
        for i in range(num_steps):
            agent.train()
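
As with the SAC example, a hypothetical call might look like the following; the keys listed are the ones ddpg reads from variant and the values (and MyEnv) are placeholders. Compared to hierarchy_sac, this variant drops the hierarchy-specific keys (num_hierarchy_levels, time_skip, initial_alpha) and adds exploration_noise_std, which is passed as the standard deviation of the Gaussian policy used for exploration.

# hypothetical usage sketch: the values and MyEnv are placeholders
variant = dict(
    max_num_steps=1000000,
    logging_dir="./ddpg",
    hidden_size=256,
    num_hidden_layers=2,
    policy_learning_rate=3e-4,
    qf_learning_rate=3e-4,
    tau=5e-3,
    exploration_noise_std=0.1,
    reward_scale=1.0,
    discount=0.99,
    batch_size=256,
    max_path_length=1000,
    num_workers=2,
    num_warm_up_steps=10000,
    num_epochs=1000,
    num_epochs_per_eval=10,
    num_steps_per_eval=10000,
    num_steps_per_epoch=1000)

ddpg(variant, MyEnv, observation_key="observation")

Because this entry point calls saver.load() before sampling, re-running with the same logging_dir resumes from previously saved networks if a checkpoint exists.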
Example #3
def ppo(
    variant,
    env_class,
    env_kwargs=None,
    observation_key="observation",
):
    # initialize tensorflow and the multiprocessing interface
    maybe_initialize_process()

    # default to an empty dict of environment kwargs
    if env_kwargs is None:
        env_kwargs = {}

    # build the normalized environment and read the action and observation sizes
    env = NormalizedEnv(env_class, **env_kwargs)
    action_dim = env.action_space.low.size
    observation_dim = env.observation_space.spaces[observation_key].low.size

    # create a replay buffer to store data
    replay_buffer = PathReplayBuffer(
        max_path_length=variant["max_path_length"],
        max_num_paths=variant["max_num_paths"])

    # create a logging instance
    logger = TensorboardLogger(replay_buffer, variant["logging_dir"])

    # create the policy and a copy of it to serve as the old policy in the PPO objective
    policy = Gaussian(
        dense(observation_dim,
              action_dim * 2,
              hidden_size=variant["hidden_size"],
              num_hidden_layers=variant["num_hidden_layers"]),
        optimizer_kwargs=dict(learning_rate=variant["policy_learning_rate"]),
        std=None)
    old_policy = policy.clone()

    # create a value function baseline
    vf = Gaussian(
        dense(observation_dim,
              1,
              hidden_size=variant["hidden_size"],
              num_hidden_layers=variant["num_hidden_layers"]),
        optimizer_kwargs=dict(learning_rate=variant["vf_learning_rate"]),
        std=1.0)

    # train the agent using proximal policy optimization (PPO)
    algorithm = PPO(
        policy,
        old_policy,
        vf,
        replay_buffer,
        reward_scale=variant["reward_scale"],
        discount=variant["discount"],
        epsilon=variant["epsilon"],
        lamb=variant["lamb"],
        off_policy_updates=variant["off_policy_updates"],
        critic_updates=variant["critic_updates"],
        observation_key=observation_key,
        batch_size=-1,  # sample everything in the buffer
        logger=logger,
        logging_prefix="ppo/")

    # wrap the policy in a single agent
    agent = PolicyAgent(policy,
                        algorithm=algorithm,
                        observation_key=observation_key)

    # create a saver to record training progress to the disk
    saver = LocalSaver(replay_buffer,
                       variant["logging_dir"],
                       policy=policy,
                       old_policy=old_policy,
                       vf=vf)

    # load the networks if already trained
    saver.load()

    # make a parallel sampler to collect on-policy rollouts
    sampler = ParallelSampler(env,
                              agent,
                              max_path_length=variant["max_path_length"],
                              num_workers=variant["num_workers"])

    # train for the specified number of epochs
    for iteration in range(variant["num_epochs"]):

        # discard samples from earlier epochs, since PPO learns on-policy
        replay_buffer.empty()

        if iteration % variant["num_epochs_per_eval"] == 0:

            # evaluate the policy at this step
            sampler.set_weights(agent.get_weights())
            paths, eval_returns, num_steps = sampler.collect(
                variant["num_steps_per_eval"],
                deterministic=True,
                keep_data=False,
                workers_to_use=variant["num_workers"])
            logger.record("eval_mean_return", np.mean(eval_returns))

            # save the replay buffer and the policies
            saver.save()

        # collect more training samples
        sampler.set_weights(agent.get_weights())
        paths, train_returns, num_steps = sampler.collect(
            variant["num_steps_per_epoch"],
            deterministic=False,
            keep_data=True,
            workers_to_use=variant["num_workers"])
        logger.record("train_mean_return", np.mean(train_returns))

        # insert the samples into the replay buffer
        for o, a, r in paths:
            replay_buffer.insert_path(o, a, r)

        # train once on the freshly collected on-policy data
        agent.train()
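
A hypothetical call to the PPO entry point follows the same pattern. The keys below are the ones read inside ppo, with placeholder values (MyEnv is likewise a placeholder); note that this function uses a PathReplayBuffer sized by max_num_paths rather than a step buffer, and it never collects warm-up samples.

# hypothetical usage sketch: the values and MyEnv are placeholders
variant = dict(
    logging_dir="./ppo",
    max_path_length=1000,
    max_num_paths=32,
    hidden_size=256,
    num_hidden_layers=2,
    policy_learning_rate=3e-4,
    vf_learning_rate=3e-4,
    reward_scale=1.0,
    discount=0.99,
    epsilon=0.2,
    lamb=0.95,
    off_policy_updates=10,
    critic_updates=10,
    num_workers=2,
    num_epochs=1000,
    num_epochs_per_eval=10,
    num_steps_per_eval=10000,
    num_steps_per_epoch=10000)

ppo(variant, MyEnv, observation_key="observation")

Because batch_size is hard-coded to -1 and the buffer is emptied at the start of every epoch, each PPO update consumes exactly the paths gathered during that epoch.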