Example #1
def pg(variant,
       env_class,
       observation_key="proprio_observation",
       **env_kwargs):

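    # Let TensorFlow allocate GPU memory on demand instead of reserving it all up front.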
    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

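    # Logging monitor plus an environment wrapper that rescales rewards.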
    monitor = LocalMonitor(variant["logging_dir"])
    env = NormalizedEnv(env_class,
                        reward_scale=variant["reward_scale"],
                        **env_kwargs)
    action_dim = np.prod(env.action_space.shape)

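    # Policy MLP with a tanh-squashed Gaussian head; the 2 * action_dim outputs parameterize the action distribution.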
    policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * action_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

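    # Policy-gradient learner that updates the policy from sampled paths.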
    actor = PolicyGradient(policy,
                           gamma=variant["gamma"],
                           batch_size=variant["batch_size"],
                           monitor=monitor)

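    # Path buffer that stores whole trajectories, keyed on the chosen observation entry.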
    buffer = PathBuffer(max_size=variant["max_size"],
                        max_path_length=variant["max_path_length"],
                        selector=(lambda x: x[observation_key]),
                        monitor=monitor)

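    # Multi-threaded sampler that collects warm-up, exploration, and evaluation paths.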
    sampler = ParallelSampler(
        env,
        policy,
        buffer,
        max_path_length=variant["max_path_length"],
        num_warm_up_paths=variant["num_warm_up_paths"],
        num_exploration_paths=variant["num_exploration_paths"],
        num_evaluation_paths=variant["num_evaluation_paths"],
        num_threads=variant["num_threads"],
        selector=(lambda i, x: x[observation_key]),
        monitor=monitor)

    saver = LocalSaver(variant["logging_dir"], policy=policy)

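    # Alternate between collecting paths and running training updates, checkpointing through the saver.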
    trainer = LocalTrainer(sampler, [buffer], [actor],
                           num_steps=variant["num_steps"],
                           num_trains_per_step=variant["num_trains_per_step"],
                           saver=saver,
                           monitor=monitor)

    trainer.train()
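
For reference, here is a minimal sketch of how an entry point like pg() might be launched. The variant keys mirror the ones read above, but every value, the environment class, and the launcher itself are illustrative assumptions rather than settings taken from the source.

# Hypothetical launcher; all hyperparameter values are placeholders.
from gym.envs.mujoco import HalfCheetahEnv  # assumed Gym-style environment class

variant = dict(
    logging_dir="./pg_half_cheetah",
    reward_scale=1.0,
    hidden_size=256,
    tau=0.005,
    learning_rate=3e-4,
    gamma=0.99,
    batch_size=256,
    max_size=1000000,
    max_path_length=1000,
    num_warm_up_paths=10,
    num_exploration_paths=1,
    num_evaluation_paths=10,
    num_threads=4,
    num_steps=10000,
    num_trains_per_step=1)

pg(variant, HalfCheetahEnv)
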
Example #2
                   optimizer_class=tf.keras.optimizers.Adam,
                   optimizer_kwargs=dict(lr=0.0001),
                   distribution_class=TanhGaussian,
                   distribution_kwargs=dict(std=None))

    buffer = PathBuffer(max_size=variant["max_size"],
                        max_path_length=variant["max_path_length"],
                        selector=(lambda x: x["proprio_observation"]),
                        monitor=monitor)

    sampler = ParallelSampler(
        env,
        policy,
        buffer,
        max_path_length=variant["max_path_length"],
        num_warm_up_paths=variant["num_warm_up_paths"],
        num_exploration_paths=variant["num_exploration_paths"],
        num_evaluation_paths=variant["num_evaluation_paths"],
        num_threads=variant["num_threads"],
        selector=(lambda i, x: x["proprio_observation"]),
        monitor=monitor)

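    # Run the full sampling lifecycle once: warm-up, exploration, then evaluation.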
    sampler.warm_up()
    sampler.explore()
    sampler.evaluate()
    print("DONE: {}".format(sampler.num_steps_collected))

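    # Draw a small batch from the buffer to inspect its contents.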
    batch = buffer.sample(32)
    print(batch)

    import ipdb
Example #3
def sac(variant,
        env_class,
        observation_key="proprio_observation",
        **env_kwargs):

    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    monitor = LocalMonitor(variant["logging_dir"])
    env = NormalizedEnv(env_class,
                        reward_scale=variant["reward_scale"],
                        **env_kwargs)
    action_dim = np.prod(env.action_space.shape)

    policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * action_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

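    # Q-function MLP; a second Q-network and two target networks are cloned from it below.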
    qf1 = Dense([variant["hidden_size"], variant["hidden_size"], 1],
                tau=variant["tau"],
                optimizer_class=tf.keras.optimizers.Adam,
                optimizer_kwargs=dict(lr=variant["learning_rate"]))

    qf2 = qf1.clone()
    target_qf1 = qf1.clone()
    target_qf2 = qf1.clone()

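    # Automatically tune the entropy coefficient toward a target entropy of -action_dim.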
    tuner = EntropyTuner(policy,
                         optimizer_class=tf.keras.optimizers.Adam,
                         optimizer_kwargs=dict(lr=variant["learning_rate"]),
                         target=(-action_dim),
                         batch_size=variant["batch_size"],
                         monitor=monitor)

    critic1 = SoftQNetwork(policy,
                           qf1,
                           target_qf1,
                           gamma=variant["gamma"],
                           log_alpha=tuner.get_tuning_variable(),
                           bellman_weight=variant["bellman_weight"],
                           discount_weight=variant["discount_weight"],
                           batch_size=variant["batch_size"],
                           monitor=monitor)

    critic2 = SoftQNetwork(policy,
                           qf2,
                           target_qf2,
                           gamma=variant["gamma"],
                           log_alpha=tuner.get_tuning_variable(),
                           bellman_weight=variant["bellman_weight"],
                           discount_weight=variant["discount_weight"],
                           batch_size=variant["batch_size"],
                           monitor=monitor)

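    # Combine the two soft Q-networks into a single twin critic.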
    critic = TwinCritic(critic1, critic2)

    actor = SoftActorCritic(policy,
                            critic,
                            log_alpha=tuner.get_tuning_variable(),
                            batch_size=variant["batch_size"],
                            monitor=monitor)

    buffer = PathBuffer(max_size=variant["max_size"],
                        max_path_length=variant["max_path_length"],
                        selector=(lambda x: x[observation_key]),
                        monitor=monitor)

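    # Off-policy (per-step) view of the path buffer used by the SAC updates.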
    step_buffer = OffPolicyBuffer(buffer)

    sampler = ParallelSampler(
        env,
        policy,
        buffer,
        max_path_length=variant["max_path_length"],
        num_warm_up_paths=variant["num_warm_up_paths"],
        num_exploration_paths=variant["num_exploration_paths"],
        num_evaluation_paths=variant["num_evaluation_paths"],
        num_threads=variant["num_threads"],
        selector=(lambda i, x: x[observation_key]),
        monitor=monitor)

    saver = LocalSaver(variant["logging_dir"],
                       policy=policy,
                       qf1=qf1,
                       target_qf1=target_qf1,
                       qf2=qf2,
                       target_qf2=target_qf2)

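    # Train the actor, the twin critic, and the entropy tuner, each sampling from the same off-policy buffer.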
    trainer = LocalTrainer(sampler, [step_buffer, step_buffer, step_buffer],
                           [actor, critic, tuner],
                           num_steps=variant["num_steps"],
                           num_trains_per_step=variant["num_trains_per_step"],
                           saver=saver,
                           monitor=monitor)

    trainer.train()
Example #4
def hac(
    variant,
    env_class,
    observation_key="proprio_observation",
    goal_key="goal",
    **env_kwargs
):

    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

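    # Selectors for each level of the hierarchy: the level at index 1 sees only the observation,
    # the other level sees the observation concatenated with its goal.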
    observation_selector = (
        lambda x: x[observation_key])

    goal_selector = (
        lambda x: x[goal_key])

    both_selector = (
        lambda x: np.concatenate([observation_selector(x), goal_selector(x)], -1))

    hierarchy_selector = (
        lambda i, x: observation_selector(x) if i == 1 else both_selector(x))

    def relabel_goal(goal, observation):
        observation[goal_key] = goal
        return observation

    monitor = LocalMonitor(variant["logging_dir"])
    env = NormalizedEnv(env_class, reward_scale=variant["reward_scale"], **env_kwargs)
    action_dim = np.prod(env.action_space.shape)
    goal_dim = np.prod(env.observation_space[observation_key].shape)

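    # Lower level of the hierarchy: a DDPG agent acting in the environment's action space.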
    lower_policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * action_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    lower_qf = Dense(
        [variant["hidden_size"], variant["hidden_size"], 1],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]))

    lower_target_qf = lower_qf.clone()

    lower_critic = QNetwork(
        lower_policy,
        lower_qf,
        lower_target_qf,
        gamma=variant["gamma"],
        bellman_weight=variant["bellman_weight"],
        discount_weight=variant["discount_weight"],
        batch_size=variant["batch_size"],
        monitor=monitor,
        logging_prefix="lower_")

    lower_actor = DDPG(
        lower_policy,
        lower_critic,
        batch_size=variant["batch_size"],
        update_every=variant["num_trains_per_step"],
        monitor=monitor,
        logging_prefix="lower_")

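    # Lower-level buffer: hindsight goal relabeling, then goal-conditioned concatenation of observation and goal.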
    lower_buffer = GoalConditionedRelabeler(
        HindsightRelabeler(
            PathBuffer(
                max_size=variant["max_size"],
                max_path_length=variant["max_path_length"],
                monitor=monitor),
            time_skip=variant["time_skip"],
            observation_selector=observation_selector,
            goal_selector=goal_selector,
            goal_assigner=relabel_goal,
            relabel_probability=variant["relabel_probability"]),
        observation_selector=observation_selector,
        goal_selector=goal_selector)

    lower_buffer = OffPolicyBuffer(lower_buffer)

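    # Upper level: a DDPG agent whose 2 * goal_dim outputs parameterize subgoals for the lower level.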
    upper_policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * goal_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    upper_qf = Dense(
        [variant["hidden_size"], variant["hidden_size"], 1],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]))

    upper_target_qf = upper_qf.clone()

    upper_critic = QNetwork(
        upper_policy,
        upper_qf,
        upper_target_qf,
        gamma=variant["gamma"],
        bellman_weight=variant["bellman_weight"],
        discount_weight=variant["discount_weight"],
        batch_size=variant["batch_size"],
        monitor=monitor,
        logging_prefix="upper_")

    upper_actor = DDPG(
        upper_policy,
        upper_critic,
        batch_size=variant["batch_size"],
        update_every=variant["num_trains_per_step"],
        monitor=monitor,
        logging_prefix="upper_")

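    # Upper-level buffer: HAC relabeling plus subgoal-testing penalties.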
    upper_buffer = SubgoalTestingRelabeler(
        HACRelabeler(
            PathBuffer(
                max_size=variant["max_size"],
                max_path_length=variant["max_path_length"],
                monitor=monitor),
            observation_selector=observation_selector,
            relabel_probability=variant["relabel_probability"]),
        observation_selector=observation_selector,
        threshold=variant["threshold"],
        penalty=variant["penalty"],
        relabel_probability=variant["relabel_probability"])

    upper_buffer = OffPolicyBuffer(upper_buffer)

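    # Two-level sampler: the lower policy acts every step, the upper policy every time_skip steps.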
    sampler = ParallelSampler(
        env,
        [lower_policy, upper_policy],
        [lower_buffer, upper_buffer],
        time_skips=(1, variant["time_skip"]),
        max_path_length=variant["max_path_length"],
        num_warm_up_paths=variant["num_warm_up_paths"],
        num_exploration_paths=variant["num_exploration_paths"],
        num_evaluation_paths=variant["num_evaluation_paths"],
        num_threads=variant["num_threads"],
        selector=hierarchy_selector,
        monitor=monitor)

    saver = LocalSaver(
        variant["logging_dir"],
        lower_policy=lower_policy,
        lower_qf=lower_qf,
        lower_target_qf=lower_target_qf,
        upper_policy=upper_policy,
        upper_qf=upper_qf,
        upper_target_qf=upper_target_qf)

    trainer = LocalTrainer(
        sampler,
        [lower_buffer, lower_buffer, lower_buffer, upper_buffer, upper_buffer, upper_buffer],
        [upper_actor, upper_critic, lower_actor, lower_critic],
        num_steps=variant["num_steps"],
        num_trains_per_step=variant["num_trains_per_step"],
        saver=saver,
        monitor=monitor)

    trainer.train()
Example #5
def trpo(variant,
         env_class,
         observation_key="proprio_observation",
         **env_kwargs):

    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    monitor = LocalMonitor(variant["logging_dir"])
    env = NormalizedEnv(env_class,
                        reward_scale=variant["reward_scale"],
                        **env_kwargs)
    action_dim = np.prod(env.action_space.shape)

    policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * action_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    vf = Dense([variant["hidden_size"], variant["hidden_size"], 1],
               tau=variant["tau"],
               optimizer_class=tf.keras.optimizers.Adam,
               optimizer_kwargs=dict(lr=variant["learning_rate"]))

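    # Keep a frozen copy of the policy for the trust-region constraint and a target value network.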
    old_policy = policy.clone()
    target_vf = vf.clone()

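    # TRPO-style update: natural-gradient direction, line search, and a KL trust region of size delta against the old policy.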
    policy = KLConstraint(LineSearch(NaturalGradient(policy, return_sAs=True),
                                     use_sAs=True),
                          old_policy,
                          delta=variant["delta"])

    tuner = EntropyTuner(policy,
                         optimizer_class=tf.keras.optimizers.Adam,
                         optimizer_kwargs=dict(lr=variant["learning_rate"]),
                         target=(-action_dim),
                         batch_size=variant["batch_size"],
                         monitor=monitor)

    critic = SoftValueNetwork(policy,
                              vf,
                              target_vf,
                              gamma=variant["gamma"],
                              log_alpha=tuner.get_tuning_variable(),
                              bellman_weight=variant["bellman_weight"],
                              discount_weight=variant["discount_weight"],
                              batch_size=variant["batch_size"],
                              monitor=monitor)

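    # Generalized advantage estimation (GAE) on top of the value critic.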
    critic = GAE(critic, gamma=variant["gamma"], lamb=variant["lamb"])

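    # Importance-sampled surrogate objective computed against the old policy.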
    actor = ImportanceSampling(policy,
                               old_policy,
                               critic,
                               gamma=variant["gamma"],
                               old_update_every=variant["num_trains_per_step"],
                               batch_size=variant["batch_size"],
                               monitor=monitor)

    buffer = PathBuffer(max_size=variant["max_size"],
                        max_path_length=variant["max_path_length"],
                        selector=(lambda x: x[observation_key]),
                        monitor=monitor)

    sampler = ParallelSampler(
        env,
        policy,
        buffer,
        max_path_length=variant["max_path_length"],
        num_warm_up_paths=variant["num_warm_up_paths"],
        num_exploration_paths=variant["num_exploration_paths"],
        num_evaluation_paths=variant["num_evaluation_paths"],
        num_threads=variant["num_threads"],
        selector=(lambda i, x: x[observation_key]),
        monitor=monitor)

    saver = LocalSaver(variant["logging_dir"],
                       policy=policy,
                       old_policy=old_policy,
                       vf=vf,
                       target_vf=target_vf)

    trainer = LocalTrainer(sampler, [buffer, buffer, buffer],
                           [actor, critic, tuner],
                           num_steps=variant["num_steps"],
                           num_trains_per_step=variant["num_trains_per_step"],
                           saver=saver,
                           monitor=monitor)

    trainer.train()
Example #6
                     tau=1e-1,
                     optimizer_class=tf.keras.optimizers.Adam,
                     optimizer_kwargs=dict(lr=0.0001),
                     distribution_class=TanhGaussian,
                     distribution_kwargs=dict(std=None))

    policy = make_policy()

    buffer = PathBuffer(max_size=max_size,
                        max_path_length=max_path_length,
                        selector=(lambda x: x["proprio_observation"]),
                        monitor=monitor)

    sampler = ParallelSampler(make_policy,
                              make_env,
                              policy,
                              buffer,
                              num_threads=16,
                              time_skips=(1, ),
                              max_path_length=max_path_length,
                              num_warm_up_paths=num_warm_up_paths,
                              num_exploration_paths=num_exploration_paths,
                              num_evaluation_paths=num_evaluation_paths,
                              selector=(lambda i, x: x["proprio_observation"]),
                              monitor=monitor)

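    # Exercise the sampler end to end and report how many environment steps were collected.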
    sampler.warm_up()
    sampler.explore()
    sampler.evaluate()
    print("DONE: {}".format(sampler.num_steps_collected))