def pg(variant,
       env_class,
       observation_key="proprio_observation",
       **env_kwargs):
    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    monitor = LocalMonitor(variant["logging_dir"])

    env = NormalizedEnv(
        env_class,
        reward_scale=variant["reward_scale"],
        **env_kwargs)
    action_dim = np.prod(env.action_space.shape)

    policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * action_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    actor = PolicyGradient(
        policy,
        gamma=variant["gamma"],
        batch_size=variant["batch_size"],
        monitor=monitor)

    buffer = PathBuffer(
        max_size=variant["max_size"],
        max_path_length=variant["max_path_length"],
        selector=(lambda x: x[observation_key]),
        monitor=monitor)

    sampler = ParallelSampler(
        env,
        policy,
        buffer,
        max_path_length=variant["max_path_length"],
        num_warm_up_paths=variant["num_warm_up_paths"],
        num_exploration_paths=variant["num_exploration_paths"],
        num_evaluation_paths=variant["num_evaluation_paths"],
        num_threads=variant["num_threads"],
        selector=(lambda i, x: x[observation_key]),
        monitor=monitor)

    saver = LocalSaver(variant["logging_dir"], policy=policy)

    trainer = LocalTrainer(
        sampler,
        [buffer],
        [actor],
        num_steps=variant["num_steps"],
        num_trains_per_step=variant["num_trains_per_step"],
        saver=saver,
        monitor=monitor)

    trainer.train()
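# A minimal sketch of how pg() might be invoked, assuming PointmassEnv (used
# elsewhere in this repository, with size and ord kwargs) is importable here.
# The keys below are the ones pg() reads from variant; the numeric values are
# illustrative guesses, not tuned settings from this repository.
pg(
    dict(
        logging_dir="./pointmass/pg/0",
        reward_scale=1.0,
        hidden_size=256,
        tau=1e-1,
        learning_rate=0.0001,
        gamma=0.99,
        batch_size=256,
        max_size=1000000,
        max_path_length=100,
        num_warm_up_paths=10,
        num_exploration_paths=1,
        num_evaluation_paths=10,
        num_threads=4,
        num_steps=10000,
        num_trains_per_step=100),
    PointmassEnv,
    observation_key="proprio_observation",
    size=2,
    ord=2)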
max_path_length = variant["max_path_length"]
max_size = variant["max_size"]
num_warm_up_paths = variant["num_warm_up_paths"]
num_exploration_paths = variant["num_exploration_paths"]
num_evaluation_paths = variant["num_evaluation_paths"]
num_trains_per_step = variant["num_trains_per_step"]
update_tuner_every = variant["update_tuner_every"]
update_actor_every = variant["update_actor_every"]
batch_size = variant["batch_size"]
num_steps = variant["num_steps"]

monitor = LocalMonitor(logging_dir)

env = NormalizedEnv(PointmassEnv, size=2, ord=2)

policy = Dense(
    [256, 256, 4],
    tau=1e-1,
    optimizer_class=tf.keras.optimizers.Adam,
    optimizer_kwargs=dict(lr=0.0001),
    distribution_class=TanhGaussian,
    distribution_kwargs=dict(std=None))

buffer = PathBuffer(
    max_size=variant["max_size"],
    max_path_length=variant["max_path_length"],
    selector=(lambda x: x["proprio_observation"]),
    monitor=monitor)
def run_experiment(variant):

    #########
    # SETUP #
    #########

    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    experiment_id = variant["experiment_id"]
    logging_dir = "./ant_maze/hiro/sac/{}".format(experiment_id)

    max_path_length = variant["max_path_length"]
    max_size = variant["max_size"]
    num_warm_up_paths = variant["num_warm_up_paths"]
    num_exploration_paths = variant["num_exploration_paths"]
    num_evaluation_paths = variant["num_evaluation_paths"]
    num_trains_per_step = variant["num_trains_per_step"]
    update_tuner_every = variant["update_tuner_every"]
    update_actor_every = variant["update_actor_every"]
    batch_size = variant["batch_size"]
    num_steps = variant["num_steps"]

    monitor = LocalMonitor(logging_dir)

    env = NormalizedEnv(
        AntMazeEnv(**variant["env_kwargs"]),
        reward_scale=(1 / max_path_length))

    ##################
    # LOWER POLICIES #
    ##################

    lower_policy = Dense(
        [256, 256, 4],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=0.0001),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    lower_target_policy = Dense(
        [256, 256, 4],
        tau=1e-1,
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=0.0001),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    #########################
    # LOWER VALUE FUNCTIONS #
    #########################

    lower_qf = Dense(
        [256, 256, 1],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs={"lr": 0.0001})

    lower_target_qf = Dense(
        [256, 256, 1],
        tau=1e-1,
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs={"lr": 0.0001})

    ##################
    # UPPER POLICIES #
    ##################

    upper_policy = Dense(
        [256, 256, 4],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=0.0001),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    upper_target_policy = Dense(
        [256, 256, 4],
        tau=1e-1,
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=0.0001),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    #########################
    # UPPER VALUE FUNCTIONS #
    #########################

    upper_qf = Dense(
        [256, 256, 1],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs={"lr": 0.0001})

    upper_target_qf = Dense(
        [256, 256, 1],
        tau=1e-1,
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs={"lr": 0.0001})

    ####################################
    # OBSERVATION DICTIONARY SELECTORS #
    ####################################

    observation_selector = (
        lambda x: x["proprio_observation"])

    goal_selector = (
        lambda x: x["goal"])

    both_selector = (
        lambda x: np.concatenate(
            [observation_selector(x), goal_selector(x)], -1))

    hierarchy_selector = (
        lambda i, x: observation_selector(x) if i == 1 else both_selector(x))

    ##################
    # REPLAY BUFFERS #
    ##################

    lower_buffer = GoalConditionedRelabeler(
        PathBuffer(
            max_size=max_size,
            max_path_length=max_path_length,
            monitor=monitor),
        observation_selector=observation_selector,
        goal_selector=goal_selector)

    upper_buffer = HIRORelabeler(
        lower_policy,
        PathBuffer(
            max_size=max_size,
            max_path_length=max_path_length,
            monitor=monitor),
        observation_selector=observation_selector,
        num_samples=8)

    ############
    # SAMPLERS #
    ############

    sampler = PathSampler(
        env,
        lower_policy,
        lower_buffer,
        upper_policy,
        upper_buffer,
        time_skips=(1, 5),
        max_path_length=max_path_length,
        num_warm_up_paths=num_warm_up_paths,
        num_exploration_paths=num_exploration_paths,
        num_evaluation_paths=num_evaluation_paths,
        selector=hierarchy_selector,
        monitor=monitor)
    #############################
    # LOWER TRAINING ALGORITHMS #
    #############################

    lower_tuner = EntropyTuner(
        lower_policy,
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=0.0001),
        target=(-2.0),
        update_every=update_tuner_every,
        batch_size=batch_size,
        selector=both_selector,
        monitor=monitor,
        logging_prefix="lower_")

    lower_critic = SoftQNetwork(
        lower_target_policy,
        lower_qf,
        lower_target_qf,
        gamma=0.99,
        clip_radius=0.2,
        std=0.1,
        log_alpha=lower_tuner.get_tuning_variable(),
        batch_size=batch_size,
        selector=both_selector,
        monitor=monitor,
        logging_prefix="lower_")

    lower_actor = SoftActorCritic(
        lower_policy,
        lower_target_policy,
        lower_critic,
        log_alpha=lower_tuner.get_tuning_variable(),
        update_every=update_actor_every,
        batch_size=batch_size,
        selector=both_selector,
        monitor=monitor,
        logging_prefix="lower_")

    lower_algorithm = MultiAlgorithm(lower_actor, lower_critic, lower_tuner)

    #############################
    # UPPER TRAINING ALGORITHMS #
    #############################

    upper_tuner = EntropyTuner(
        upper_policy,
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=0.0001),
        target=(-2.0),
        update_every=update_tuner_every,
        batch_size=batch_size,
        selector=observation_selector,
        monitor=monitor,
        logging_prefix="upper_")

    upper_critic = SoftQNetwork(
        upper_target_policy,
        upper_qf,
        upper_target_qf,
        gamma=0.99,
        clip_radius=0.2,
        std=0.1,
        log_alpha=upper_tuner.get_tuning_variable(),
        batch_size=batch_size,
        selector=observation_selector,
        monitor=monitor,
        logging_prefix="upper_")

    upper_actor = SoftActorCritic(
        upper_policy,
        upper_target_policy,
        upper_critic,
        log_alpha=upper_tuner.get_tuning_variable(),
        update_every=update_actor_every,
        batch_size=batch_size,
        selector=observation_selector,
        monitor=monitor,
        logging_prefix="upper_")

    upper_algorithm = MultiAlgorithm(upper_actor, upper_critic, upper_tuner)

    ##################
    # START TRAINING #
    ##################

    saver = Saver(
        logging_dir,
        lower_policy=lower_policy,
        lower_target_policy=lower_target_policy,
        lower_qf=lower_qf,
        lower_target_qf=lower_target_qf,
        upper_policy=upper_policy,
        upper_target_policy=upper_target_policy,
        upper_qf=upper_qf,
        upper_target_qf=upper_target_qf)

    trainer = LocalTrainer(
        sampler,
        lower_buffer,
        lower_algorithm,
        upper_buffer,
        upper_algorithm,
        num_steps=num_steps,
        num_trains_per_step=num_trains_per_step,
        save_function=saver,
        monitor=monitor)

    trainer.train()
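# A minimal sketch of how run_experiment() might be launched. Every key below
# is read by the function above; the numeric values, the integer experiment_id,
# and the empty env_kwargs are illustrative placeholders, not tuned settings
# from this repository (AntMazeEnv's constructor arguments are not shown here).
run_experiment(
    dict(
        experiment_id=0,
        env_kwargs=dict(),
        max_path_length=1000,
        max_size=1000000,
        num_warm_up_paths=10,
        num_exploration_paths=1,
        num_evaluation_paths=10,
        num_trains_per_step=100,
        update_tuner_every=1,
        update_actor_every=1,
        batch_size=256,
        num_steps=10000))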
def sac(variant,
        env_class,
        observation_key="proprio_observation",
        **env_kwargs):
    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    monitor = LocalMonitor(variant["logging_dir"])

    env = NormalizedEnv(
        env_class,
        reward_scale=variant["reward_scale"],
        **env_kwargs)
    action_dim = np.prod(env.action_space.shape)

    policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * action_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    qf1 = Dense(
        [variant["hidden_size"], variant["hidden_size"], 1],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]))
    qf2 = qf1.clone()
    target_qf1 = qf1.clone()
    target_qf2 = qf1.clone()

    tuner = EntropyTuner(
        policy,
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        target=(-action_dim),
        batch_size=variant["batch_size"],
        monitor=monitor)

    critic1 = SoftQNetwork(
        policy,
        qf1,
        target_qf1,
        gamma=variant["gamma"],
        log_alpha=tuner.get_tuning_variable(),
        bellman_weight=variant["bellman_weight"],
        discount_weight=variant["discount_weight"],
        batch_size=variant["batch_size"],
        monitor=monitor)

    critic2 = SoftQNetwork(
        policy,
        qf2,
        target_qf2,
        gamma=variant["gamma"],
        log_alpha=tuner.get_tuning_variable(),
        bellman_weight=variant["bellman_weight"],
        discount_weight=variant["discount_weight"],
        batch_size=variant["batch_size"],
        monitor=monitor)

    critic = TwinCritic(critic1, critic2)

    actor = SoftActorCritic(
        policy,
        critic,
        log_alpha=tuner.get_tuning_variable(),
        batch_size=variant["batch_size"],
        monitor=monitor)

    buffer = PathBuffer(
        max_size=variant["max_size"],
        max_path_length=variant["max_path_length"],
        selector=(lambda x: x[observation_key]),
        monitor=monitor)
    step_buffer = OffPolicyBuffer(buffer)

    sampler = ParallelSampler(
        env,
        policy,
        buffer,
        max_path_length=variant["max_path_length"],
        num_warm_up_paths=variant["num_warm_up_paths"],
        num_exploration_paths=variant["num_exploration_paths"],
        num_evaluation_paths=variant["num_evaluation_paths"],
        num_threads=variant["num_threads"],
        selector=(lambda i, x: x[observation_key]),
        monitor=monitor)

    saver = LocalSaver(
        variant["logging_dir"],
        policy=policy,
        qf1=qf1,
        target_qf1=target_qf1,
        qf2=qf2,
        target_qf2=target_qf2)

    trainer = LocalTrainer(
        sampler,
        [step_buffer, step_buffer, step_buffer],
        [actor, critic, tuner],
        num_steps=variant["num_steps"],
        num_trains_per_step=variant["num_trains_per_step"],
        saver=saver,
        monitor=monitor)

    trainer.train()
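# A minimal sketch of how sac() might be invoked, again assuming PointmassEnv
# is importable in this scope. The keys below are the ones sac() reads from
# variant (it adds bellman_weight and discount_weight on top of the pg() keys);
# the numeric values are illustrative guesses, not tuned settings.
sac(
    dict(
        logging_dir="./pointmass/sac/0",
        reward_scale=1.0,
        hidden_size=256,
        tau=1e-1,
        learning_rate=0.0001,
        gamma=0.99,
        bellman_weight=1.0,
        discount_weight=1.0,
        batch_size=256,
        max_size=1000000,
        max_path_length=100,
        num_warm_up_paths=10,
        num_exploration_paths=1,
        num_evaluation_paths=10,
        num_threads=4,
        num_steps=10000,
        num_trains_per_step=100),
    PointmassEnv,
    observation_key="proprio_observation",
    size=2,
    ord=2)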
def hac(variant,
        env_class,
        observation_key="proprio_observation",
        goal_key="goal",
        **env_kwargs):
    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    observation_selector = (
        lambda x: x[observation_key])

    goal_selector = (
        lambda x: x[goal_key])

    both_selector = (
        lambda x: np.concatenate(
            [observation_selector(x), goal_selector(x)], -1))

    hierarchy_selector = (
        lambda i, x: observation_selector(x) if i == 1 else both_selector(x))

    def relabel_goal(goal, observation):
        observation[goal_key] = goal
        return observation

    monitor = LocalMonitor(variant["logging_dir"])

    env = NormalizedEnv(
        env_class,
        reward_scale=variant["reward_scale"],
        **env_kwargs)
    action_dim = np.prod(env.action_space.shape)
    goal_dim = np.prod(env.observation_space[observation_key].shape)

    lower_policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * action_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    lower_qf = Dense(
        [variant["hidden_size"], variant["hidden_size"], 1],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]))
    lower_target_qf = lower_qf.clone()

    lower_critic = QNetwork(
        lower_policy,
        lower_qf,
        lower_target_qf,
        gamma=variant["gamma"],
        bellman_weight=variant["bellman_weight"],
        discount_weight=variant["discount_weight"],
        batch_size=variant["batch_size"],
        monitor=monitor,
        logging_prefix="lower_")

    lower_actor = DDPG(
        lower_policy,
        lower_critic,
        batch_size=variant["batch_size"],
        update_every=variant["num_trains_per_step"],
        monitor=monitor,
        logging_prefix="lower_")

    lower_buffer = GoalConditionedRelabeler(
        HindsightRelabeler(
            PathBuffer(
                max_size=variant["max_size"],
                max_path_length=variant["max_path_length"],
                monitor=monitor),
            time_skip=variant["time_skip"],
            observation_selector=observation_selector,
            goal_selector=goal_selector,
            goal_assigner=relabel_goal,
            relabel_probability=variant["relabel_probability"]),
        observation_selector=observation_selector,
        goal_selector=goal_selector)
    lower_buffer = OffPolicyBuffer(lower_buffer)

    upper_policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * goal_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    upper_qf = Dense(
        [variant["hidden_size"], variant["hidden_size"], 1],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]))
    upper_target_qf = upper_qf.clone()

    upper_critic = QNetwork(
        upper_policy,
        upper_qf,
        upper_target_qf,
        gamma=variant["gamma"],
        bellman_weight=variant["bellman_weight"],
        discount_weight=variant["discount_weight"],
        batch_size=variant["batch_size"],
        monitor=monitor,
        logging_prefix="upper_")

    upper_actor = DDPG(
        upper_policy,
        upper_critic,
        batch_size=variant["batch_size"],
        update_every=variant["num_trains_per_step"],
        monitor=monitor,
        logging_prefix="upper_")

    upper_buffer = SubgoalTestingRelabeler(
        HACRelabeler(
            PathBuffer(
                max_size=variant["max_size"],
                max_path_length=variant["max_path_length"],
                monitor=monitor),
            observation_selector=observation_selector,
            relabel_probability=variant["relabel_probability"]),
        observation_selector=observation_selector,
        threshold=variant["threshold"],
        penalty=variant["penalty"],
        relabel_probability=variant["relabel_probability"])
    upper_buffer = OffPolicyBuffer(upper_buffer)

    sampler = ParallelSampler(
        env,
        [lower_policy, upper_policy],
        [lower_buffer, upper_buffer],
        time_skips=(1, variant["time_skip"]),
        max_path_length=variant["max_path_length"],
        num_warm_up_paths=variant["num_warm_up_paths"],
        num_exploration_paths=variant["num_exploration_paths"],
        num_evaluation_paths=variant["num_evaluation_paths"],
        num_threads=variant["num_threads"],
        selector=hierarchy_selector,
        monitor=monitor)

    saver = LocalSaver(
        variant["logging_dir"],
        lower_policy=lower_policy,
        lower_qf=lower_qf,
        lower_target_qf=lower_target_qf,
        upper_policy=upper_policy,
        upper_qf=upper_qf,
        upper_target_qf=upper_target_qf)

    trainer = LocalTrainer(
        sampler,
        [lower_buffer, lower_buffer, lower_buffer,
         upper_buffer, upper_buffer, upper_buffer],
        [upper_actor, upper_critic, lower_actor, lower_critic],
        num_steps=variant["num_steps"],
        num_trains_per_step=variant["num_trains_per_step"],
        saver=saver,
        monitor=monitor)

    trainer.train()
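# A minimal sketch of how hac() might be invoked, assuming the goal-conditioned
# AntMazeEnv used elsewhere in this repository exposes a "proprio_observation"
# and a "goal" key. The keys below are the ones hac() reads from variant; the
# numeric values (threshold, penalty, relabel_probability, etc.) are
# illustrative guesses, not tuned settings.
hac(
    dict(
        logging_dir="./ant_maze/hac/0",
        reward_scale=1.0,
        hidden_size=256,
        tau=1e-1,
        learning_rate=0.0001,
        gamma=0.99,
        bellman_weight=1.0,
        discount_weight=1.0,
        batch_size=256,
        max_size=1000000,
        max_path_length=1000,
        time_skip=5,
        relabel_probability=0.8,
        threshold=0.1,
        penalty=(-1.0),
        num_warm_up_paths=10,
        num_exploration_paths=1,
        num_evaluation_paths=10,
        num_threads=4,
        num_steps=10000,
        num_trains_per_step=100),
    AntMazeEnv,
    observation_key="proprio_observation",
    goal_key="goal")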
def trpo(variant,
         env_class,
         observation_key="proprio_observation",
         **env_kwargs):
    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    monitor = LocalMonitor(variant["logging_dir"])

    env = NormalizedEnv(
        env_class,
        reward_scale=variant["reward_scale"],
        **env_kwargs)
    action_dim = np.prod(env.action_space.shape)

    policy = Dense(
        [variant["hidden_size"], variant["hidden_size"], 2 * action_dim],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        distribution_class=TanhGaussian,
        distribution_kwargs=dict(std=None))

    vf = Dense(
        [variant["hidden_size"], variant["hidden_size"], 1],
        tau=variant["tau"],
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]))

    old_policy = policy.clone()
    target_vf = vf.clone()

    policy = KLConstraint(
        LineSearch(NaturalGradient(policy, return_sAs=True), use_sAs=True),
        old_policy,
        delta=variant["delta"])

    tuner = EntropyTuner(
        policy,
        optimizer_class=tf.keras.optimizers.Adam,
        optimizer_kwargs=dict(lr=variant["learning_rate"]),
        target=(-action_dim),
        batch_size=variant["batch_size"],
        monitor=monitor)

    critic = SoftValueNetwork(
        policy,
        vf,
        target_vf,
        gamma=variant["gamma"],
        log_alpha=tuner.get_tuning_variable(),
        bellman_weight=variant["bellman_weight"],
        discount_weight=variant["discount_weight"],
        batch_size=variant["batch_size"],
        monitor=monitor)
    critic = GAE(critic, gamma=variant["gamma"], lamb=variant["lamb"])

    actor = ImportanceSampling(
        policy,
        old_policy,
        critic,
        gamma=variant["gamma"],
        old_update_every=variant["num_trains_per_step"],
        batch_size=variant["batch_size"],
        monitor=monitor)

    buffer = PathBuffer(
        max_size=variant["max_size"],
        max_path_length=variant["max_path_length"],
        selector=(lambda x: x[observation_key]),
        monitor=monitor)

    sampler = ParallelSampler(
        env,
        policy,
        buffer,
        max_path_length=variant["max_path_length"],
        num_warm_up_paths=variant["num_warm_up_paths"],
        num_exploration_paths=variant["num_exploration_paths"],
        num_evaluation_paths=variant["num_evaluation_paths"],
        num_threads=variant["num_threads"],
        selector=(lambda i, x: x[observation_key]),
        monitor=monitor)

    saver = LocalSaver(
        variant["logging_dir"],
        policy=policy,
        old_policy=old_policy,
        vf=vf,
        target_vf=target_vf)

    trainer = LocalTrainer(
        sampler,
        [buffer, buffer, buffer],
        [actor, critic, tuner],
        num_steps=variant["num_steps"],
        num_trains_per_step=variant["num_trains_per_step"],
        saver=saver,
        monitor=monitor)

    trainer.train()
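# A minimal sketch of how trpo() might be invoked, again assuming PointmassEnv.
# The keys below are the ones trpo() reads from variant; delta and lamb use the
# common TRPO/GAE defaults (0.01 and 0.95), and the remaining values are
# illustrative guesses, not tuned settings.
trpo(
    dict(
        logging_dir="./pointmass/trpo/0",
        reward_scale=1.0,
        hidden_size=256,
        tau=1e-1,
        learning_rate=0.0001,
        delta=0.01,
        gamma=0.99,
        bellman_weight=1.0,
        discount_weight=1.0,
        lamb=0.95,
        batch_size=256,
        max_size=1000000,
        max_path_length=100,
        num_warm_up_paths=10,
        num_exploration_paths=1,
        num_evaluation_paths=10,
        num_threads=4,
        num_steps=10000,
        num_trains_per_step=100),
    PointmassEnv,
    observation_key="proprio_observation",
    size=2,
    ord=2)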