def grill_her_td3_full_experiment(variant):
    """Run the full GRILL pipeline: preprocess the variant, train the VAE
    (unless this is a state-based experiment), then launch HER+TD3."""
    full_experiment_variant_preprocess(variant)
    if not variant['grill_variant'].get('do_state_exp', False):
        train_vae_and_update_variant(variant)
    grill_her_td3_experiment(variant['grill_variant'])

def train_vae_demos(variant):
    """Preprocess the variant and train only the VAE; no RL is run."""
    full_experiment_variant_preprocess(variant)
    train_vae_and_update_variant(variant)

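# The entry points above expect a two-level variant: an outer dict consumed by
# full_experiment_variant_preprocess / train_vae_and_update_variant, plus a
# nested 'grill_variant' holding the RL settings. A skeletal sketch of that
# layout follows; apart from 'grill_variant' and 'do_state_exp' (which the
# code above reads directly), the key names are assumptions, and the values
# are placeholders. See the __main__ sketch at the bottom of this file.
#
#     variant = dict(
#         train_vae_variant=dict(...),   # assumed key for the VAE stage
#         grill_variant=dict(
#             do_state_exp=False,        # False -> train the VAE first
#             ...,
#         ),
#     )
#     grill_her_td3_full_experiment(variant)
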
def experiment(variant):
    import railrl.samplers.rollout_functions as rf
    import railrl.torch.pytorch_util as ptu
    from railrl.data_management.obs_dict_replay_buffer import \
        ObsDictRelabelingBuffer
    from railrl.exploration_strategies.base import (
        PolicyWrappedWithExplorationStrategy
    )
    from railrl.torch.her.her import HERTrainer
    from railrl.torch.td3.td3 import TD3 as TD3Trainer
    from railrl.torch.networks import FlattenMlp, TanhMlpPolicy
    from railrl.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    from railrl.samplers.data_collector import GoalConditionedPathCollector
    from railrl.torch.grill.launcher import (
        grill_preprocess_variant,
        get_envs,
        get_exploration_strategy,
        full_experiment_variant_preprocess,
        train_vae_and_update_variant,
        get_video_save_func,
    )

    # Preprocess the full variant and train the VAE, unless this is a
    # state-based experiment.
    full_experiment_variant_preprocess(variant)
    if not variant['grill_variant'].get('do_state_exp', False):
        train_vae_and_update_variant(variant)
    variant = variant['grill_variant']
    grill_preprocess_variant(variant)

    # Separate evaluation and exploration environments.
    eval_env = get_envs(variant)
    expl_env = get_envs(variant)
    es = get_exploration_strategy(variant, expl_env)

    # Keys into the goal-conditioned observation dict; by default the agent
    # operates in the VAE latent space.
    observation_key = variant.get('observation_key', 'latent_observation')
    desired_goal_key = variant.get('desired_goal_key', 'latent_desired_goal')
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")

    # Networks take the observation concatenated with the desired goal.
    obs_dim = (
        expl_env.observation_space.spaces[observation_key].low.size
        + expl_env.observation_space.spaces[desired_goal_key].low.size
    )
    action_dim = expl_env.action_space.low.size

    # Twin Q-networks plus target copies, as required by TD3.
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )

    # Wrap the deterministic policy with the exploration strategy for
    # data collection.
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    # HER-style replay buffer that relabels goals when sampling batches.
    replay_buffer = ObsDictRelabelingBuffer(
        env=eval_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs']
    )

    trainer = TD3Trainer(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **variant['td3_kwargs']
    )
    trainer = HERTrainer(trainer)

    # Evaluation uses the raw policy; exploration uses the wrapped one.
    eval_path_collector = GoalConditionedPathCollector(
        eval_env,
        policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    expl_path_collector = GoalConditionedPathCollector(
        expl_env,
        exploration_policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )

    if variant.get("save_video", True):  # Does not work
        rollout_function = rf.create_rollout_function(
            rf.multitask_rollout,
            max_path_length=algorithm.max_path_length,
            observation_key=observation_key,
            desired_goal_key=desired_goal_key,
        )
        video_func = get_video_save_func(
            rollout_function,
            eval_env,
            policy,
            variant,
        )
        algorithm.post_epoch_funcs.append(video_func)

    algorithm.to(ptu.device)
    # For image-based (latent) experiments, move the environments' VAEs to
    # the same device as the networks.
    if not variant.get("do_state_exp", False):
        eval_env.vae.to(ptu.device)
        expl_env.vae.to(ptu.device)
    algorithm.train()
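

# A minimal sketch of launching experiment() directly, assuming an
# rlkit-style railrl version. Every hyperparameter value below is an
# illustrative placeholder, and the kwarg names inside td3_kwargs,
# replay_buffer_kwargs, and algo_kwargs are assumptions about the local
# TD3Trainer, ObsDictRelabelingBuffer, and TorchBatchRLAlgorithm signatures;
# check them against the installed railrl before running.
if __name__ == "__main__":
    variant = dict(
        # Assumed key: full_experiment_variant_preprocess is expected to read
        # the VAE-stage settings from here; it must hold real values when
        # do_state_exp is False.
        train_vae_variant=dict(),
        grill_variant=dict(
            do_state_exp=False,
            save_video=False,  # the save-video path is flagged "Does not work" above
            qf_kwargs=dict(hidden_sizes=[400, 300]),
            policy_kwargs=dict(hidden_sizes=[400, 300]),
            td3_kwargs=dict(discount=0.99),
            replay_buffer_kwargs=dict(
                max_size=int(1e6),
                fraction_goals_rollout_goals=0.2,
                fraction_goals_env_goals=0.5,
            ),
            algo_kwargs=dict(
                batch_size=128,
                num_epochs=100,
                max_path_length=100,
                num_eval_steps_per_epoch=500,
                num_expl_steps_per_train_loop=500,
                num_trains_per_train_loop=500,
                min_num_steps_before_training=1000,
            ),
        ),
    )
    experiment(variant)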