def __init__(self, config, env, replay, networks):
    """ Basically a wrapper class for SAC from rlkit.

    Args:
        config: Configuration dictionary
        env: Environment
        replay: Replay buffer
        networks: dict containing two sub-dicts, 'individual' and 'population',
            which contain the networks.
    """
    super().__init__(config, env, replay, networks)

    self._variant_pop = config['rl_algorithm_config']['algo_params_pop']
    self._variant_spec = config['rl_algorithm_config']['algo_params']

    self._ind_qf1 = networks['individual']['qf1']
    self._ind_qf2 = networks['individual']['qf2']
    self._ind_qf1_target = networks['individual']['qf1_target']
    self._ind_qf2_target = networks['individual']['qf2_target']
    self._ind_policy = networks['individual']['policy']

    self._pop_qf1 = networks['population']['qf1']
    self._pop_qf2 = networks['population']['qf2']
    self._pop_qf1_target = networks['population']['qf1_target']
    self._pop_qf2_target = networks['population']['qf2_target']
    self._pop_policy = networks['population']['policy']

    self._batch_size = config['rl_algorithm_config']['batch_size']
    self._nmbr_indiv_updates = config['rl_algorithm_config']['indiv_updates']
    self._nmbr_pop_updates = config['rl_algorithm_config']['pop_updates']

    self._algorithm_ind = SoftActorCritic_rlkit(
        env=self._env,
        policy=self._ind_policy,
        qf1=self._ind_qf1,
        qf2=self._ind_qf2,
        target_qf1=self._ind_qf1_target,
        target_qf2=self._ind_qf2_target,
        use_automatic_entropy_tuning=False,
        **self._variant_spec
    )

    self._algorithm_pop = SoftActorCritic_rlkit(
        env=self._env,
        policy=self._pop_policy,
        qf1=self._pop_qf1,
        qf2=self._pop_qf2,
        target_qf1=self._pop_qf1_target,
        target_qf2=self._pop_qf2_target,
        use_automatic_entropy_tuning=False,
        **self._variant_pop
    )
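# Illustrative only: a minimal sketch of the `config` and `networks` structure the
# constructor above expects, inferred from the keys it reads. The placeholder
# network factories and hyperparameter values are assumptions, not values taken
# from the original repository.
def _make_example_sac_inputs(make_qf, make_policy):
    networks = {
        'individual': dict(qf1=make_qf(), qf2=make_qf(),
                           qf1_target=make_qf(), qf2_target=make_qf(),
                           policy=make_policy()),
        'population': dict(qf1=make_qf(), qf2=make_qf(),
                           qf1_target=make_qf(), qf2_target=make_qf(),
                           policy=make_policy()),
    }
    config = {
        'rl_algorithm_config': dict(
            algo_params=dict(),        # kwargs forwarded to the individual SAC instance
            algo_params_pop=dict(),    # kwargs forwarded to the population SAC instance
            batch_size=256,            # assumed value
            indiv_updates=1,           # assumed value
            pop_updates=1,             # assumed value
        ),
    }
    return config, networks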
def get_sac(evaluation_environment, parameters):
    """
    :param evaluation_environment: environment to get observation and action shapes from
    :param parameters: dict with keys hidden_sizes_qf, hidden_sizes_policy, trainer_params
    :return: sac_policy, eval_policy, trainer
    """
    obs_dim = evaluation_environment.observation_space.low.size
    action_dim = evaluation_environment.action_space.low.size
    hidden_sizes_qf = parameters['hidden_sizes_qf']
    hidden_sizes_policy = parameters['hidden_sizes_policy']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    sac_policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=hidden_sizes_policy,
    )
    eval_policy = MakeDeterministic(sac_policy)
    trainer = SACTrainer(env=evaluation_environment,
                         policy=sac_policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **parameters['trainer_params'])
    return sac_policy, eval_policy, trainer
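# Hypothetical usage of get_sac above. The SACTrainer keyword arguments inside
# `trainer_params` mirror the ones used elsewhere in this file (e.g. in
# get_sac_trainer); the specific values are illustrative assumptions.
def _example_get_sac_usage(evaluation_environment):
    parameters = dict(
        hidden_sizes_qf=[256, 256],
        hidden_sizes_policy=[256, 256],
        trainer_params=dict(
            discount=0.99,
            soft_target_tau=5e-3,
            target_update_period=1,
            policy_lr=3e-4,
            qf_lr=3e-4,
            reward_scale=1,
            use_automatic_entropy_tuning=True,
        ),
    )
    sac_policy, eval_policy, trainer = get_sac(evaluation_environment, parameters)
    return sac_policy, eval_policy, trainer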
def __init__(self, env, eval_env, mem, nets, train_step_params):
    super().__init__(env, eval_env, mem, nets, train_step_params)
    self._mem = mem
    self._env = env
    self._eval_env = eval_env
    self._policy_net = nets['policy_net']
    self._q1_net = nets['q1_net']
    self._q2_net = nets['q2_net']
    self._target_q1_net = nets['target_q1_net']
    self._target_q2_net = nets['target_q2_net']
    self._train_step_params = train_step_params
    self._alg = SACTrainer(env=self._env,
                           policy=self._policy_net,
                           qf1=self._q1_net,
                           qf2=self._q2_net,
                           target_qf1=self._target_q1_net,
                           target_qf2=self._target_q2_net,
                           **train_step_params)
def run_sac(base_expl_env, base_eval_env, variant): expl_env = FlatGoalEnv(base_expl_env, append_goal_to_obs=True) eval_env = FlatGoalEnv(base_eval_env, append_goal_to_obs=True) obs_dim = expl_env.observation_space.low.size action_dim = eval_env.action_space.low.size M = variant["layer_size"] num_hidden = variant["num_hidden_layers"] qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M] * num_hidden) qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M] * num_hidden) target_qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M] * num_hidden) target_qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M] * num_hidden) policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=[M] * num_hidden) eval_policy = MakeDeterministic(policy) eval_path_collector = MdpPathCollector( eval_env, eval_policy, ) expl_path_collector = MdpPathCollector( expl_env, policy, ) replay_buffer = EnvReplayBuffer( variant["replay_buffer_size"], expl_env, ) trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant["trainer_kwargs"]) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant["algorithm_kwargs"]) algorithm.train()
def sac(variant): expl_env = gym.make(variant["env_name"]) eval_env = gym.make(variant["env_name"]) expl_env.seed(variant["seed"]) eval_env.set_eval() mode = variant["mode"] archi = variant["archi"] if mode == "her": variant["her"] = dict( observation_key="observation", desired_goal_key="desired_goal", achieved_goal_key="achieved_goal", representation_goal_key="representation_goal", ) replay_buffer = get_replay_buffer(variant, expl_env) qf1, qf2, target_qf1, target_qf2, policy, shared_base = get_networks( variant, expl_env) expl_policy = policy eval_policy = MakeDeterministic(policy) expl_path_collector, eval_path_collector = get_path_collector( variant, expl_env, eval_env, expl_policy, eval_policy) mode = variant["mode"] trainer = SACTrainer( env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant["trainer_kwargs"], ) if mode == "her": trainer = HERTrainer(trainer) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant["algorithm_kwargs"], ) algorithm.to(ptu.device) algorithm.train()
def experiment(variant):
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    M = variant["layer_size"]
    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     hidden_sizes=[M, M])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     hidden_sizes=[M, M])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            hidden_sizes=[M, M])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            hidden_sizes=[M, M])
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                hidden_sizes=[M, M])
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, policy)
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant["trainer_kwargs"])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
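# A sketch of a `variant` dict that would satisfy experiment() above. The
# trainer_kwargs/algorithm_kwargs keys match the SACTrainer and
# TorchBatchRLAlgorithm arguments used in the other snippets in this file;
# the numeric values are illustrative assumptions, not tuned settings.
example_halfcheetah_variant = dict(
    layer_size=256,
    replay_buffer_size=int(1e6),
    algorithm_kwargs=dict(
        num_epochs=1000,
        num_eval_steps_per_epoch=5000,
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        min_num_steps_before_training=1000,
        max_path_length=1000,
        batch_size=256,
    ),
    trainer_kwargs=dict(
        discount=0.99,
        soft_target_tau=5e-3,
        target_update_period=1,
        policy_lr=3e-4,
        qf_lr=3e-4,
        reward_scale=1,
        use_automatic_entropy_tuning=True,
    ),
)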
def episode_init(self):
    """ Initializations to be done before the first episode.

    In this case this basically creates a fresh instance of SAC for the
    individual networks and, if configured, copies the population network
    values into the individual networks.
    """
    self._algorithm_ind = SoftActorCritic_rlkit(
        env=self._env,
        policy=self._ind_policy,
        qf1=self._ind_qf1,
        qf2=self._ind_qf2,
        target_qf1=self._ind_qf1_target,
        target_qf2=self._ind_qf2_target,
        use_automatic_entropy_tuning=False,
        # alt_alpha=self._alt_alpha,
        **self._variant_spec
    )
    if self._config['rl_algorithm_config']['copy_from_gobal']:
        utils.copy_pop_to_ind(networks_pop=self._networks['population'],
                              networks_ind=self._networks['individual'])
def get_sac_trainer(env, hidden_sizes=[256, 256], reward_scale=1):
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes,
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes,
    )
    target_qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes,
    )
    target_qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes,
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=hidden_sizes,
    )
    trainer = SACTrainer(env=env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         discount=0.99,
                         soft_target_tau=5e-3,
                         target_update_period=1,
                         policy_lr=3E-4,
                         qf_lr=3E-4,
                         reward_scale=reward_scale,
                         use_automatic_entropy_tuning=True)
    return trainer
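# Hypothetical usage of get_sac_trainer above: take gradient steps on batches
# sampled from a replay buffer. This assumes an rlkit-style EnvReplayBuffer whose
# random_batch() returns a dict of numpy arrays that the trainer's train()
# method accepts; `env` and `replay_buffer` are placeholders.
def _example_offline_updates(env, replay_buffer, num_updates=1000, batch_size=256):
    trainer = get_sac_trainer(env)
    for _ in range(num_updates):
        batch = replay_buffer.random_batch(batch_size)
        trainer.train(batch)
    return trainer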
def experiment(variant): # unwrap the TimeLimitEnv wrapper since we manually terminate after 50 steps eval_env = gym.make('FetchReach-v1').env expl_env = gym.make('FetchReach-v1').env observation_key = 'observation' desired_goal_key = 'desired_goal' achieved_goal_key = desired_goal_key.replace("desired", "achieved") replay_buffer = ObsDictRelabelingBuffer( env=eval_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs']) obs_dim = eval_env.observation_space.spaces['observation'].low.size action_dim = eval_env.action_space.low.size goal_dim = eval_env.observation_space.spaces['desired_goal'].low.size qf1 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) qf2 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) target_qf1 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) target_qf2 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim, action_dim=action_dim, **variant['policy_kwargs']) eval_policy = MakeDeterministic(policy) trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['sac_trainer_kwargs']) trainer = HERTrainer(trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, eval_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = GoalConditionedPathCollector( expl_env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs']) algorithm.to(ptu.device) algorithm.train()
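# Sketch of the HER-specific parts of `variant` for the FetchReach experiment
# above. The fraction_* keys control how rlkit's ObsDictRelabelingBuffer
# relabels goals; all values shown are illustrative assumptions.
example_her_variant_fragment = dict(
    replay_buffer_kwargs=dict(
        max_size=int(1e6),
        fraction_goals_rollout_goals=0.2,  # assumed relabeling fraction
        fraction_goals_env_goals=0.0,
    ),
    qf_kwargs=dict(hidden_sizes=[400, 300]),
    policy_kwargs=dict(hidden_sizes=[400, 300]),
)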
def experiment(variant): checkpoint_filepath = os.path.join(variant['checkpoint_dir'], 'itr_{}.pkl'.format( variant['checkpoint_epoch'])) checkpoint = torch.load(checkpoint_filepath) eval_env = roboverse.make(variant['env'], transpose_image=True) expl_env = eval_env action_dim = eval_env.action_space.low.size cnn_params = variant['cnn_params'] cnn_params.update( input_width=48, input_height=48, input_channels=3, output_size=1, added_fc_input_size=action_dim, ) qf1 = ConcatCNN(**cnn_params) qf2 = ConcatCNN(**cnn_params) target_qf1 = ConcatCNN(**cnn_params) target_qf2 = ConcatCNN(**cnn_params) policy = checkpoint['evaluation/policy'] eval_policy = MakeDeterministic(policy) eval_path_collector = MdpPathCollector( eval_env, eval_policy, ) expl_path_collector = MdpPathCollector( expl_env, policy, ) observation_key = 'image' online_buffer_size = 500 * 10 * variant['algorithm_kwargs'][ 'max_path_length'] if variant['online_data_only']: replay_buffer = ObsDictReplayBuffer(online_buffer_size, expl_env, observation_key=observation_key) else: replay_buffer = load_data_from_npy_chaining( variant, expl_env, observation_key, extra_buffer_size=online_buffer_size) trainer_kwargs = variant['trainer_kwargs'] trainer = SACTrainer( env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **trainer_kwargs ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, eval_both=False, batch_rl=False, **variant['algorithm_kwargs'] ) video_func = VideoSaveFunction(variant) algorithm.post_epoch_funcs.append(video_func) algorithm.to(ptu.device) algorithm.train()
def run_experiment_func(variant): env_params = ENV_PARAMS[variant['env']] variant.update(env_params) expl_env = NormalizedBoxEnv(variant['env_class']()) eval_env = NormalizedBoxEnv(variant['env_class']()) obs_dim = expl_env.observation_space.low.size action_dim = eval_env.action_space.low.size M = variant['layer_size'] qf1 = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) qf2 = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) target_qf1 = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) target_qf2 = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) vf = ConcatMlp( input_size=obs_dim, output_size=1, hidden_sizes=[M, M], ) policy = TanhGaussianPolicy( obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=[M, M], ) eval_policy = MakeDeterministic(policy) eval_path_collector = MdpPathCollector( eval_env, eval_policy, ) expl_path_collector = MdpPathCollector( expl_env, policy, ) replay_buffer = EnvReplayBuffer( variant['replay_buffer_size'], expl_env, ) trainer = SACTrainer( env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, discount=variant['discount'], soft_target_tau=variant['soft_target_tau'], target_update_period=variant['target_update_period'], policy_lr=variant['policy_lr'], qf_lr=variant['qf_lr'], reward_scale=1, use_automatic_entropy_tuning=True, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, max_path_length=variant['max_path_length'], batch_size=variant['batch_size'], num_epochs=variant['num_epochs'], num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'], num_expl_steps_per_train_loop=variant['num_expl_steps_per_train_loop'], num_trains_per_train_loop=variant['num_trains_per_train_loop'], min_num_steps_before_training=variant['min_num_steps_before_training'], ) return algorithm
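# Unlike most experiment() functions in this file, run_experiment_func above
# returns the algorithm instead of training it. A caller would typically finish
# with the same two calls used elsewhere in these snippets (device placement is
# assumed to go through rlkit's ptu, as in the other functions):
def _example_run(variant):
    algorithm = run_experiment_func(variant)
    algorithm.to(ptu.device)
    algorithm.train()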
def experiment(variant): from multiworld.envs.mujoco import register_mujoco_envs register_mujoco_envs() env_id = variant['env_id'] eval_env = gym.make(env_id) expl_env = gym.make(env_id) observation_key = 'state_observation' desired_goal_key = 'state_desired_goal' eval_env.reward_type = variant['reward_type'] expl_env.reward_type = variant['reward_type'] achieved_goal_key = desired_goal_key.replace("desired", "achieved") replay_buffer = ObsDictRelabelingBuffer( env=eval_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs']) obs_dim = eval_env.observation_space.spaces['observation'].low.size action_dim = eval_env.action_space.low.size goal_dim = eval_env.observation_space.spaces['desired_goal'].low.size qf1 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) qf2 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) target_qf1 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) target_qf2 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim, action_dim=action_dim, **variant['policy_kwargs']) eval_policy = MakeDeterministic(policy) trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['sac_trainer_kwargs']) trainer = HERTrainer(trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, eval_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = GoalConditionedPathCollector( expl_env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): env_params = ENV_PARAMS[variant['env']] env_mod_params = variant['env_mod'] variant.update(env_params) expl_env = NormalizedBoxEnv(variant['env_class'](env_mod_params)) eval_env = NormalizedBoxEnv(variant['env_class']({})) obs_dim = expl_env.observation_space.low.size action_dim = eval_env.action_space.low.size M = variant['layer_size'] qf1 = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) qf2 = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) target_qf1 = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) target_qf2 = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) policy = TanhGaussianPolicy( obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=[M, M], ) eval_policy = MakeDeterministic(policy) eval_path_collector = MdpPathCollector( eval_env, eval_policy, ) replay_buffer = EnvReplayBuffer( variant['replay_buffer_size'], expl_env, ) trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs']) if variant['collection_mode'] == 'online': expl_path_collector = MdpStepCollector( expl_env, policy, ) algorithm = TorchOnlineRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, max_path_length=variant['max_path_length'], batch_size=variant['batch_size'], num_epochs=variant['num_epochs'], num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'], num_expl_steps_per_train_loop=variant[ 'num_expl_steps_per_train_loop'], num_trains_per_train_loop=variant['num_trains_per_train_loop'], min_num_steps_before_training=variant[ 'min_num_steps_before_training'], ) else: expl_path_collector = MdpPathCollector( expl_env, policy, ) algorithm = TorchBatchRLAlgorithmModEnv( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, max_path_length=variant['max_path_length'], batch_size=variant['batch_size'], num_epochs=variant['num_epochs'], num_eval_steps_per_epoch=variant['num_eval_steps_per_epoch'], num_expl_steps_per_train_loop=variant[ 'num_expl_steps_per_train_loop'], num_trains_per_train_loop=variant['num_trains_per_train_loop'], min_num_steps_before_training=variant[ 'min_num_steps_before_training'], mod_env_epoch_schedule=variant['mod_env_epoch_schedule'], env_mod_dist=variant['mod_env_dist'], env_class=variant['env_class'], env_mod_params=variant['env_mod']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): expl_env = gym.make(variant["env_name"]) eval_env = expl_env obs_dim = expl_env.observation_space.low.size action_dim = eval_env.action_space.low.size M = variant["layer_size"] qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) target_qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[ M, M, ], ) target_qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[ M, M, ], ) policy = TanhGaussianPolicy( obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=[ M, M, ], # Making it easier to visualize ) # behavior_policy = TanhGaussianPolicy( # obs_dim=obs_dim, # action_dim=action_dim, # hidden_sizes=[M, M], # ) eval_policy = MakeDeterministic(policy) eval_path_collector = MdpPathCollector( eval_env, eval_policy, sparse_reward=False, target_goal=eval_env.unwrapped.wrapped_env.target_goal, ) expl_path_collector = MdpPathCollector( expl_env, policy, sparse_reward=False, target_goal=eval_env.unwrapped.wrapped_env.target_goal, ) replay_buffer = EnvReplayBuffer( variant["replay_buffer_size"], expl_env, with_per=False, ) if variant["load_buffer"]: load_hdf5(eval_env.unwrapped.get_dataset(), replay_buffer) trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, behavior_policy=None, **variant["trainer_kwargs"]) print(variant["algorithm_kwargs"]) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, batch_rl=variant["load_buffer"], **variant["algorithm_kwargs"]) algorithm.to(ptu.device) print("training!") algorithm.train()
def _e2e_disentangled_experiment(max_path_length, encoder_kwargs, disentangled_qf_kwargs, qf_kwargs, twin_sac_trainer_kwargs, replay_buffer_kwargs, policy_kwargs, vae_evaluation_goal_sampling_mode, vae_exploration_goal_sampling_mode, base_env_evaluation_goal_sampling_mode, base_env_exploration_goal_sampling_mode, algo_kwargs, env_id=None, env_class=None, env_kwargs=None, observation_key='state_observation', desired_goal_key='state_desired_goal', achieved_goal_key='state_achieved_goal', latent_dim=2, vae_wrapped_env_kwargs=None, vae_path=None, vae_n_vae_training_kwargs=None, vectorized=False, save_video=True, save_video_kwargs=None, have_no_disentangled_encoder=False, **kwargs): if env_kwargs is None: env_kwargs = {} assert env_id or env_class if env_id: import gym import multiworld multiworld.register_all_envs() train_env = gym.make(env_id) eval_env = gym.make(env_id) else: eval_env = env_class(**env_kwargs) train_env = env_class(**env_kwargs) train_env.goal_sampling_mode = base_env_exploration_goal_sampling_mode eval_env.goal_sampling_mode = base_env_evaluation_goal_sampling_mode if vae_path: vae = load_local_or_remote_file(vae_path) else: vae = get_n_train_vae(latent_dim=latent_dim, env=eval_env, **vae_n_vae_training_kwargs) train_env = VAEWrappedEnv(train_env, vae, imsize=train_env.imsize, **vae_wrapped_env_kwargs) eval_env = VAEWrappedEnv(eval_env, vae, imsize=train_env.imsize, **vae_wrapped_env_kwargs) obs_dim = train_env.observation_space.spaces[observation_key].low.size goal_dim = train_env.observation_space.spaces[desired_goal_key].low.size action_dim = train_env.action_space.low.size encoder = ConcatMlp(input_size=obs_dim, output_size=latent_dim, **encoder_kwargs) def make_qf(): if have_no_disentangled_encoder: return ConcatMlp( input_size=obs_dim + goal_dim + action_dim, output_size=1, **qf_kwargs, ) else: return DisentangledMlpQf(encoder=encoder, preprocess_obs_dim=obs_dim, action_dim=action_dim, qf_kwargs=qf_kwargs, vectorized=vectorized, **disentangled_qf_kwargs) qf1 = make_qf() qf2 = make_qf() target_qf1 = make_qf() target_qf2 = make_qf() policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim, action_dim=action_dim, **policy_kwargs) replay_buffer = ObsDictRelabelingBuffer( env=train_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, vectorized=vectorized, **replay_buffer_kwargs) sac_trainer = SACTrainer(env=train_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **twin_sac_trainer_kwargs) trainer = HERTrainer(sac_trainer) eval_path_collector = VAEWrappedEnvPathCollector( eval_env, MakeDeterministic(policy), max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, goal_sampling_mode=vae_evaluation_goal_sampling_mode, ) expl_path_collector = VAEWrappedEnvPathCollector( train_env, policy, max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, goal_sampling_mode=vae_exploration_goal_sampling_mode, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=train_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, max_path_length=max_path_length, **algo_kwargs, ) algorithm.to(ptu.device) if save_video: save_vf_heatmap = save_video_kwargs.get('save_vf_heatmap', True) if have_no_disentangled_encoder: def v_function(obs): action = policy.get_actions(obs) obs, action = ptu.from_numpy(obs), ptu.from_numpy(action) return 
qf1(obs, action) add_heatmap = partial(add_heatmap_img_to_o_dict, v_function=v_function) else: def v_function(obs): action = policy.get_actions(obs) obs, action = ptu.from_numpy(obs), ptu.from_numpy(action) return qf1(obs, action, return_individual_q_vals=True) add_heatmap = partial( add_heatmap_imgs_to_o_dict, v_function=v_function, vectorized=vectorized, ) rollout_function = rf.create_rollout_function( rf.multitask_rollout, max_path_length=max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, full_o_postprocess_func=add_heatmap if save_vf_heatmap else None, ) img_keys = ['v_vals'] + [ 'v_vals_dim_{}'.format(dim) for dim in range(latent_dim) ] eval_video_func = get_save_video_function(rollout_function, eval_env, MakeDeterministic(policy), get_extra_imgs=partial( get_extra_imgs, img_keys=img_keys), tag="eval", **save_video_kwargs) train_video_func = get_save_video_function(rollout_function, train_env, policy, get_extra_imgs=partial( get_extra_imgs, img_keys=img_keys), tag="train", **save_video_kwargs) algorithm.post_train_funcs.append(eval_video_func) algorithm.post_train_funcs.append(train_video_func) algorithm.train()
def sac_on_gym_goal_env_experiment( max_path_length, qf_kwargs, sac_trainer_kwargs, replay_buffer_kwargs, policy_kwargs, algo_kwargs, env_id=None, env_class=None, env_kwargs=None, observation_key='observation', desired_goal_key='desired_goal', achieved_goal_key='achieved_goal', exploration_policy_kwargs=None, evaluation_goal_sampling_mode=None, exploration_goal_sampling_mode=None, # Video parameters save_video=True, save_video_kwargs=None, renderer_kwargs=None, ): if exploration_policy_kwargs is None: exploration_policy_kwargs = {} if not save_video_kwargs: save_video_kwargs = {} if not renderer_kwargs: renderer_kwargs = {} context_key = desired_goal_key sample_context_from_obs_dict_fn = RemapKeyFn( {context_key: achieved_goal_key}) def contextual_env_distrib_and_reward(env_id, env_class, env_kwargs, goal_sampling_mode): env = get_gym_env( env_id, env_class=env_class, env_kwargs=env_kwargs, unwrap_timed_envs=True, ) env.goal_sampling_mode = goal_sampling_mode goal_distribution = GoalDictDistributionFromGymGoalEnv( env, desired_goal_key=desired_goal_key, ) distance_fn = L2Distance( achieved_goal_from_observation=IndexIntoAchievedGoal( achieved_goal_key, ), desired_goal_key=desired_goal_key, ) if (isinstance(env, robotics.FetchReachEnv) or isinstance(env, robotics.FetchPushEnv) or isinstance(env, robotics.FetchPickAndPlaceEnv) or isinstance(env, robotics.FetchSlideEnv)): success_threshold = 0.05 else: raise TypeError("I don't know the success threshold of env ", env) reward_fn = ThresholdDistanceReward(distance_fn, success_threshold) diag_fn = GenericGoalConditionedContextualDiagnostics( desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, success_threshold=success_threshold, ) env = ContextualEnv( env, context_distribution=goal_distribution, reward_fn=reward_fn, observation_key=observation_key, contextual_diagnostics_fns=[diag_fn], update_env_info_fn=delete_info, ) return env, goal_distribution, reward_fn expl_env, expl_context_distrib, expl_reward = contextual_env_distrib_and_reward( env_id, env_class, env_kwargs, exploration_goal_sampling_mode) eval_env, eval_context_distrib, eval_reward = contextual_env_distrib_and_reward( env_id, env_class, env_kwargs, evaluation_goal_sampling_mode) obs_dim = (expl_env.observation_space.spaces[observation_key].low.size + expl_env.observation_space.spaces[context_key].low.size) action_dim = expl_env.action_space.low.size def create_qf(): return ConcatMlp(input_size=obs_dim + action_dim, output_size=1, **qf_kwargs) qf1 = create_qf() qf2 = create_qf() target_qf1 = create_qf() target_qf2 = create_qf() policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim, **policy_kwargs) def concat_context_to_obs(batch, *args, **kwargs): obs = batch['observations'] next_obs = batch['next_observations'] context = batch[context_key] batch['observations'] = np.concatenate([obs, context], axis=1) batch['next_observations'] = np.concatenate([next_obs, context], axis=1) return batch replay_buffer = ContextualRelabelingReplayBuffer( env=eval_env, context_keys=[context_key], observation_keys_to_save=[observation_key, achieved_goal_key], context_distribution=eval_context_distrib, sample_context_from_obs_dict_fn=sample_context_from_obs_dict_fn, reward_fn=eval_reward, post_process_batch_fn=concat_context_to_obs, **replay_buffer_kwargs) trainer = SACTrainer(env=expl_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **sac_trainer_kwargs) eval_path_collector = ContextualPathCollector( eval_env, MakeDeterministic(policy), 
observation_key=observation_key, context_keys_for_policy=[context_key], ) exploration_policy = create_exploration_policy(policy=policy, env=expl_env, **exploration_policy_kwargs) expl_path_collector = ContextualPathCollector( expl_env, exploration_policy, observation_key=observation_key, context_keys_for_policy=[context_key], ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, max_path_length=max_path_length, **algo_kwargs) algorithm.to(ptu.device) if save_video: # Setting the goal like this is discourage, but the Fetch environment # are designed to visualize the goals by setting their goal parameter. def set_goal_for_visualization(env, policy, o): goal = o[desired_goal_key] print(goal) env.unwrapped.goal = goal rollout_function = partial( rf.contextual_rollout, max_path_length=max_path_length, observation_key=observation_key, context_keys_for_policy=[context_key], reset_callback=set_goal_for_visualization, ) renderer = GymEnvRenderer(**renderer_kwargs) def add_images(env, context_distribution): state_env = env.env img_env = InsertImageEnv( state_env, renderer=renderer, image_key='image_observation', ) return ContextualEnv( img_env, context_distribution=context_distribution, reward_fn=eval_reward, observation_key=observation_key, update_env_info_fn=delete_info, ) img_eval_env = add_images(eval_env, eval_context_distrib) img_expl_env = add_images(expl_env, expl_context_distrib) eval_video_func = get_save_video_function( rollout_function, img_eval_env, MakeDeterministic(policy), tag="eval", imsize=renderer.image_chw[1], image_format=renderer.output_image_format, keys_to_show=['image_observation'], **save_video_kwargs) expl_video_func = get_save_video_function( rollout_function, img_expl_env, exploration_policy, tag="train", imsize=renderer.image_chw[1], image_format=renderer.output_image_format, keys_to_show=['image_observation'], **save_video_kwargs) algorithm.post_train_funcs.append(eval_video_func) algorithm.post_train_funcs.append(expl_video_func) algorithm.train()
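# Toy, self-contained illustration of what the concat_context_to_obs
# post-processing in the function above does to a relabeled batch: the sampled
# goal/context is appended to both observations and next observations so the
# goal-conditioned networks see a single flat input. Shapes and key name are
# made up for illustration.
import numpy as np

def _example_concat_context(context_key='desired_goal'):
    batch = {
        'observations': np.zeros((4, 10)),
        'next_observations': np.ones((4, 10)),
        context_key: np.full((4, 3), 0.5),
    }
    batch['observations'] = np.concatenate(
        [batch['observations'], batch[context_key]], axis=1)
    batch['next_observations'] = np.concatenate(
        [batch['next_observations'], batch[context_key]], axis=1)
    assert batch['observations'].shape == (4, 13)
    return batch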
def experiment(variant): eval_env = gym.make( variant['env_name'], **{ "headless": variant["headless"], "verbose": variant["verbose"] }) eval_env.seed(variant['seed']) expl_env = eval_env obs_dim = expl_env.observation_space.low.size action_dim = eval_env.action_space.low.size M = variant['layer_size'] qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M, M], ) qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M, M], ) target_qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M, M], ) target_qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M, M], ) policy = TanhGaussianPolicy( obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=[M, M, M], ) eval_policy = MakeDeterministic(policy) eval_path_collector = MdpPathCollector( eval_env, eval_policy, ) expl_path_collector = MdpPathCollector( expl_env, policy, ) replay_buffer = EnvReplayBuffer( variant['replay_buffer_size'], expl_env, ) dataset = get_dataset(variant["h5path"], eval_env) load_hdf5(d4rl.qlearning_dataset(eval_env, dataset), replay_buffer) trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs']) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, eval_both=True, **variant['algorithm_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): import multiworld.envs.pygame env = gym.make('Point2DEnv-Image-v0') input_width, input_height = env.image_shape action_dim = int(np.prod(env.action_space.shape)) cnn_params = variant['cnn_params'] cnn_params.update( input_width=input_width, input_height=input_height, input_channels=3, output_conv_channels=True, output_size=None, ) if variant['shared_qf_conv']: qf_cnn = CNN(**cnn_params) qf1 = MlpQfWithObsProcessor( obs_processor=qf_cnn, output_size=1, input_size=action_dim+qf_cnn.conv_output_flat_size, **variant['qf_kwargs'] ) qf2 = MlpQfWithObsProcessor( obs_processor=qf_cnn, output_size=1, input_size=action_dim+qf_cnn.conv_output_flat_size, **variant['qf_kwargs'] ) target_qf_cnn = CNN(**cnn_params) target_qf1 = MlpQfWithObsProcessor( obs_processor=target_qf_cnn, output_size=1, input_size=action_dim+qf_cnn.conv_output_flat_size, **variant['qf_kwargs'] ) target_qf2 = MlpQfWithObsProcessor( obs_processor=target_qf_cnn, output_size=1, input_size=action_dim+qf_cnn.conv_output_flat_size, **variant['qf_kwargs'] ) else: qf1_cnn = CNN(**cnn_params) cnn_output_dim = qf1_cnn.conv_output_flat_size qf1 = MlpQfWithObsProcessor( obs_processor=qf1_cnn, output_size=1, input_size=action_dim+cnn_output_dim, **variant['qf_kwargs'] ) qf2 = MlpQfWithObsProcessor( obs_processor=CNN(**cnn_params), output_size=1, input_size=action_dim+cnn_output_dim, **variant['qf_kwargs'] ) target_qf1 = MlpQfWithObsProcessor( obs_processor=CNN(**cnn_params), output_size=1, input_size=action_dim+cnn_output_dim, **variant['qf_kwargs'] ) target_qf2 = MlpQfWithObsProcessor( obs_processor=CNN(**cnn_params), output_size=1, input_size=action_dim+cnn_output_dim, **variant['qf_kwargs'] ) action_dim = int(np.prod(env.action_space.shape)) policy_cnn = CNN(**cnn_params) policy = TanhGaussianPolicyAdapter( policy_cnn, policy_cnn.conv_output_flat_size, action_dim, ) eval_env = expl_env = env eval_policy = MakeDeterministic(policy) eval_path_collector = MdpPathCollector( eval_env, eval_policy, **variant['eval_path_collector_kwargs'] ) replay_buffer = EnvReplayBuffer( variant['replay_buffer_size'], expl_env, ) trainer = SACTrainer( env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs'] ) if variant['collection_mode'] == 'batch': expl_path_collector = MdpPathCollector( expl_env, policy, **variant['expl_path_collector_kwargs'] ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs'] ) elif variant['collection_mode'] == 'online': expl_path_collector = MdpStepCollector( expl_env, policy, **variant['expl_path_collector_kwargs'] ) algorithm = TorchOnlineRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs'] ) algorithm.to(ptu.device) algorithm.train()
def rig_experiment( max_path_length, qf_kwargs, sac_trainer_kwargs, replay_buffer_kwargs, policy_kwargs, algo_kwargs, train_vae_kwargs, env_id=None, env_class=None, env_kwargs=None, observation_key='latent_observation', desired_goal_key='latent_desired_goal', state_goal_key='state_desired_goal', state_observation_key='state_observation', image_goal_key='image_desired_goal', exploration_policy_kwargs=None, evaluation_goal_sampling_mode=None, exploration_goal_sampling_mode=None, # Video parameters save_video=True, save_video_kwargs=None, renderer_kwargs=None, imsize=48, pretrained_vae_path="", init_camera=None, ): if exploration_policy_kwargs is None: exploration_policy_kwargs = {} if not save_video_kwargs: save_video_kwargs = {} if not renderer_kwargs: renderer_kwargs = {} renderer = EnvRenderer(init_camera=init_camera, **renderer_kwargs) def contextual_env_distrib_and_reward(env_id, env_class, env_kwargs, goal_sampling_mode): state_env = get_gym_env(env_id, env_class=env_class, env_kwargs=env_kwargs) renderer = EnvRenderer(init_camera=init_camera, **renderer_kwargs) img_env = InsertImageEnv(state_env, renderer=renderer) encoded_env = EncoderWrappedEnv( img_env, model, dict(image_observation="latent_observation", ), ) if goal_sampling_mode == "vae_prior": latent_goal_distribution = PriorDistribution( model.representation_size, desired_goal_key, ) diagnostics = StateImageGoalDiagnosticsFn({}, ) elif goal_sampling_mode == "reset_of_env": state_goal_env = get_gym_env(env_id, env_class=env_class, env_kwargs=env_kwargs) state_goal_distribution = GoalDictDistributionFromMultitaskEnv( state_goal_env, desired_goal_keys=[state_goal_key], ) image_goal_distribution = AddImageDistribution( env=state_env, base_distribution=state_goal_distribution, image_goal_key=image_goal_key, renderer=renderer, ) latent_goal_distribution = AddLatentDistribution( image_goal_distribution, image_goal_key, desired_goal_key, model, ) if hasattr(state_goal_env, 'goal_conditioned_diagnostics'): diagnostics = GoalConditionedDiagnosticsToContextualDiagnostics( state_goal_env.goal_conditioned_diagnostics, desired_goal_key=state_goal_key, observation_key=state_observation_key, ) else: state_goal_env.get_contextual_diagnostics diagnostics = state_goal_env.get_contextual_diagnostics else: raise NotImplementedError('unknown goal sampling method: %s' % goal_sampling_mode) reward_fn = DistanceRewardFn( observation_key=observation_key, desired_goal_key=desired_goal_key, ) env = ContextualEnv( encoded_env, context_distribution=latent_goal_distribution, reward_fn=reward_fn, observation_key=observation_key, contextual_diagnostics_fns=[diagnostics], ) return env, latent_goal_distribution, reward_fn if pretrained_vae_path: model = load_local_or_remote_file(pretrained_vae_path) else: model = train_vae(train_vae_kwargs, env_kwargs, env_id, env_class, imsize, init_camera) expl_env, expl_context_distrib, expl_reward = contextual_env_distrib_and_reward( env_id, env_class, env_kwargs, exploration_goal_sampling_mode) eval_env, eval_context_distrib, eval_reward = contextual_env_distrib_and_reward( env_id, env_class, env_kwargs, evaluation_goal_sampling_mode) context_key = desired_goal_key obs_dim = (expl_env.observation_space.spaces[observation_key].low.size + expl_env.observation_space.spaces[context_key].low.size) action_dim = expl_env.action_space.low.size def create_qf(): return ConcatMlp(input_size=obs_dim + action_dim, output_size=1, **qf_kwargs) qf1 = create_qf() qf2 = create_qf() target_qf1 = create_qf() target_qf2 = create_qf() policy = 
TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim, **policy_kwargs) def concat_context_to_obs(batch, *args, **kwargs): obs = batch['observations'] next_obs = batch['next_observations'] context = batch[context_key] batch['observations'] = np.concatenate([obs, context], axis=1) batch['next_observations'] = np.concatenate([next_obs, context], axis=1) return batch replay_buffer = ContextualRelabelingReplayBuffer( env=eval_env, context_keys=[context_key], observation_keys=[observation_key], observation_key=observation_key, context_distribution=expl_context_distrib, sample_context_from_obs_dict_fn=RemapKeyFn( {context_key: observation_key}), reward_fn=eval_reward, post_process_batch_fn=concat_context_to_obs, **replay_buffer_kwargs) trainer = SACTrainer(env=expl_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **sac_trainer_kwargs) eval_path_collector = ContextualPathCollector( eval_env, MakeDeterministic(policy), observation_key=observation_key, context_keys_for_policy=[ context_key, ], ) exploration_policy = create_exploration_policy(expl_env, policy, **exploration_policy_kwargs) expl_path_collector = ContextualPathCollector( expl_env, exploration_policy, observation_key=observation_key, context_keys_for_policy=[ context_key, ], ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, max_path_length=max_path_length, **algo_kwargs) algorithm.to(ptu.device) if save_video: expl_video_func = RIGVideoSaveFunction( model, expl_path_collector, "train", decode_goal_image_key="image_decoded_goal", reconstruction_key="image_reconstruction", rows=2, columns=5, unnormalize=True, # max_path_length=200, imsize=48, image_format=renderer.output_image_format, **save_video_kwargs) algorithm.post_train_funcs.append(expl_video_func) eval_video_func = RIGVideoSaveFunction( model, eval_path_collector, "eval", goal_image_key=image_goal_key, decode_goal_image_key="image_decoded_goal", reconstruction_key="image_reconstruction", num_imgs=4, rows=2, columns=5, unnormalize=True, # max_path_length=200, imsize=48, image_format=renderer.output_image_format, **save_video_kwargs) algorithm.post_train_funcs.append(eval_video_func) algorithm.train()
def experiment(variant): # from softlearning.environments.gym import register_image_reach # register_image_reach() # env = gym.envs.make( # 'Pusher2d-ImageReach-v0', # ) from softlearning.environments.gym.mujoco.image_pusher_2d import ( ImageForkReacher2dEnv) env_kwargs = { 'image_shape': (32, 32, 3), 'arm_goal_distance_cost_coeff': 0.0, 'arm_object_distance_cost_coeff': 1.0, 'goal': (0, -1), } eval_env = ImageForkReacher2dEnv(**env_kwargs) expl_env = ImageForkReacher2dEnv(**env_kwargs) input_width, input_height, input_channels = eval_env.image_shape image_dim = input_width * input_height * input_channels action_dim = int(np.prod(eval_env.action_space.shape)) cnn_params = variant['cnn_params'] cnn_params.update( input_width=input_width, input_height=input_height, input_channels=input_channels, added_fc_input_size=4, output_conv_channels=True, output_size=None, ) non_image_dim = int(np.prod(eval_env.observation_space.shape)) - image_dim if variant['shared_qf_conv']: qf_cnn = CNN(**cnn_params) qf_obs_processor = nn.Sequential( Split(qf_cnn, identity, image_dim), FlattenEach(), ConcatTuple(), ) qf_kwargs = copy.deepcopy(variant['qf_kwargs']) qf_kwargs['obs_processor'] = qf_obs_processor qf_kwargs['output_size'] = 1 qf_kwargs['input_size'] = (action_dim + qf_cnn.conv_output_flat_size + non_image_dim) qf1 = MlpQfWithObsProcessor(**qf_kwargs) qf2 = MlpQfWithObsProcessor(**qf_kwargs) target_qf_cnn = CNN(**cnn_params) target_qf_obs_processor = nn.Sequential( Split(target_qf_cnn, identity, image_dim), FlattenEach(), ConcatTuple(), ) target_qf_kwargs = copy.deepcopy(variant['qf_kwargs']) target_qf_kwargs['obs_processor'] = target_qf_obs_processor target_qf_kwargs['output_size'] = 1 target_qf_kwargs['input_size'] = (action_dim + target_qf_cnn.conv_output_flat_size + non_image_dim) target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs) target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs) else: qf1_cnn = CNN(**cnn_params) cnn_output_dim = qf1_cnn.conv_output_flat_size qf1 = MlpQfWithObsProcessor(obs_processor=qf1_cnn, output_size=1, input_size=action_dim + cnn_output_dim, **variant['qf_kwargs']) qf2 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params), output_size=1, input_size=action_dim + cnn_output_dim, **variant['qf_kwargs']) target_qf1 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params), output_size=1, input_size=action_dim + cnn_output_dim, **variant['qf_kwargs']) target_qf2 = MlpQfWithObsProcessor(obs_processor=CNN(**cnn_params), output_size=1, input_size=action_dim + cnn_output_dim, **variant['qf_kwargs']) action_dim = int(np.prod(eval_env.action_space.shape)) policy_cnn = CNN(**cnn_params) policy_obs_processor = nn.Sequential( Split(policy_cnn, identity, image_dim), FlattenEach(), ConcatTuple(), ) policy = TanhGaussianPolicyAdapter( policy_obs_processor, policy_cnn.conv_output_flat_size + non_image_dim, action_dim, **variant['policy_kwargs']) eval_policy = MakeDeterministic(policy) eval_path_collector = MdpPathCollector( eval_env, eval_policy, **variant['eval_path_collector_kwargs']) replay_buffer = EnvReplayBuffer( variant['replay_buffer_size'], expl_env, ) trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs']) if variant['collection_mode'] == 'batch': expl_path_collector = MdpPathCollector( expl_env, policy, **variant['expl_path_collector_kwargs']) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, 
evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs']) elif variant['collection_mode'] == 'online': expl_path_collector = MdpStepCollector( expl_env, policy, **variant['expl_path_collector_kwargs']) algorithm = TorchOnlineRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs']) algorithm.to(ptu.device) algorithm.train()
def image_based_goal_conditioned_sac_experiment( max_path_length, qf_kwargs, sac_trainer_kwargs, replay_buffer_kwargs, policy_kwargs, algo_kwargs, cnn_kwargs, policy_type='tanh-normal', env_id=None, env_class=None, env_kwargs=None, exploration_policy_kwargs=None, evaluation_goal_sampling_mode=None, exploration_goal_sampling_mode=None, reward_type='state_distance', env_renderer_kwargs=None, # Data augmentations apply_random_crops=False, random_crop_pixel_shift=4, # Video parameters save_video=True, save_video_kwargs=None, video_renderer_kwargs=None, ): if exploration_policy_kwargs is None: exploration_policy_kwargs = {} if not save_video_kwargs: save_video_kwargs = {} if not env_renderer_kwargs: env_renderer_kwargs = {} if not video_renderer_kwargs: video_renderer_kwargs = {} img_observation_key = 'image_observation' img_desired_goal_key = 'image_desired_goal' state_observation_key = 'state_observation' state_desired_goal_key = 'state_desired_goal' state_achieved_goal_key = 'state_achieved_goal' sample_context_from_obs_dict_fn = RemapKeyFn({ 'image_desired_goal': 'image_observation', 'state_desired_goal': 'state_observation', }) def setup_contextual_env(env_id, env_class, env_kwargs, goal_sampling_mode, renderer): state_env = get_gym_env(env_id, env_class=env_class, env_kwargs=env_kwargs) state_env.goal_sampling_mode = goal_sampling_mode state_goal_distribution = GoalDictDistributionFromMultitaskEnv( state_env, desired_goal_keys=[state_desired_goal_key], ) state_diag_fn = GoalConditionedDiagnosticsToContextualDiagnostics( state_env.goal_conditioned_diagnostics, desired_goal_key=state_desired_goal_key, observation_key=state_observation_key, ) image_goal_distribution = AddImageDistribution( env=state_env, base_distribution=state_goal_distribution, image_goal_key=img_desired_goal_key, renderer=renderer, ) goal_distribution = PresampledDistribution(image_goal_distribution, 5000) img_env = InsertImageEnv(state_env, renderer=renderer) if reward_type == 'state_distance': reward_fn = ContextualRewardFnFromMultitaskEnv( env=state_env, achieved_goal_from_observation=IndexIntoAchievedGoal( 'state_observation'), desired_goal_key=state_desired_goal_key, achieved_goal_key=state_achieved_goal_key, ) elif reward_type == 'pixel_distance': reward_fn = NegativeL2Distance( achieved_goal_from_observation=IndexIntoAchievedGoal( img_observation_key), desired_goal_key=img_desired_goal_key, ) else: raise ValueError(reward_type) env = ContextualEnv( img_env, context_distribution=goal_distribution, reward_fn=reward_fn, observation_key=img_observation_key, contextual_diagnostics_fns=[state_diag_fn], update_env_info_fn=delete_info, ) return env, goal_distribution, reward_fn env_renderer = EnvRenderer(**env_renderer_kwargs) expl_env, expl_context_distrib, expl_reward = setup_contextual_env( env_id, env_class, env_kwargs, exploration_goal_sampling_mode, env_renderer) eval_env, eval_context_distrib, eval_reward = setup_contextual_env( env_id, env_class, env_kwargs, evaluation_goal_sampling_mode, env_renderer) action_dim = expl_env.action_space.low.size if env_renderer.output_image_format == 'WHC': img_width, img_height, img_num_channels = ( expl_env.observation_space[img_observation_key].shape) elif env_renderer.output_image_format == 'CHW': img_num_channels, img_height, img_width = ( expl_env.observation_space[img_observation_key].shape) else: raise ValueError(env_renderer.output_image_format) def create_qf(): cnn = BasicCNN(input_width=img_width, input_height=img_height, input_channels=img_num_channels, **cnn_kwargs) 
joint_cnn = ApplyConvToStateAndGoalImage(cnn) return basic.MultiInputSequential( ApplyToObs(joint_cnn), basic.FlattenEachParallel(), ConcatMlp(input_size=joint_cnn.output_size + action_dim, output_size=1, **qf_kwargs)) qf1 = create_qf() qf2 = create_qf() target_qf1 = create_qf() target_qf2 = create_qf() cnn = BasicCNN(input_width=img_width, input_height=img_height, input_channels=img_num_channels, **cnn_kwargs) joint_cnn = ApplyConvToStateAndGoalImage(cnn) policy_obs_dim = joint_cnn.output_size if policy_type == 'normal': obs_processor = nn.Sequential( joint_cnn, basic.Flatten(), MultiHeadedMlp(input_size=policy_obs_dim, output_sizes=[action_dim, action_dim], **policy_kwargs)) policy = PolicyFromDistributionGenerator(Gaussian(obs_processor)) elif policy_type == 'tanh-normal': obs_processor = nn.Sequential( joint_cnn, basic.Flatten(), MultiHeadedMlp(input_size=policy_obs_dim, output_sizes=[action_dim, action_dim], **policy_kwargs)) policy = PolicyFromDistributionGenerator(TanhGaussian(obs_processor)) elif policy_type == 'normal-tanh-mean': obs_processor = nn.Sequential( joint_cnn, basic.Flatten(), MultiHeadedMlp(input_size=policy_obs_dim, output_sizes=[action_dim, action_dim], output_activations=['tanh', 'identity'], **policy_kwargs)) policy = PolicyFromDistributionGenerator(Gaussian(obs_processor)) else: raise ValueError("Unknown policy type: {}".format(policy_type)) if apply_random_crops: pad = BatchPad( env_renderer.output_image_format, random_crop_pixel_shift, random_crop_pixel_shift, ) crop = JointRandomCrop( env_renderer.output_image_format, env_renderer.image_shape, ) def concat_context_to_obs(batch, *args, **kwargs): obs = batch['observations'] next_obs = batch['next_observations'] context = batch[img_desired_goal_key] obs_padded = pad(obs) next_obs_padded = pad(next_obs) context_padded = pad(context) obs_aug, context_aug = crop(obs_padded, context_padded) next_obs_aug, next_context_aug = crop(next_obs_padded, context_padded) batch['observations'] = np.concatenate([obs_aug, context_aug], axis=1) batch['next_observations'] = np.concatenate( [next_obs_aug, next_context_aug], axis=1) return batch else: def concat_context_to_obs(batch, *args, **kwargs): obs = batch['observations'] next_obs = batch['next_observations'] context = batch[img_desired_goal_key] batch['observations'] = np.concatenate([obs, context], axis=1) batch['next_observations'] = np.concatenate([next_obs, context], axis=1) return batch replay_buffer = ContextualRelabelingReplayBuffer( env=eval_env, context_keys=[img_desired_goal_key, state_desired_goal_key], observation_key=img_observation_key, observation_keys=[img_observation_key, state_observation_key], context_distribution=eval_context_distrib, sample_context_from_obs_dict_fn=sample_context_from_obs_dict_fn, reward_fn=eval_reward, post_process_batch_fn=concat_context_to_obs, **replay_buffer_kwargs) trainer = SACTrainer(env=expl_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **sac_trainer_kwargs) eval_path_collector = ContextualPathCollector( eval_env, MakeDeterministic(policy), observation_key=img_observation_key, context_keys_for_policy=[img_desired_goal_key], ) exploration_policy = create_exploration_policy(expl_env, policy, **exploration_policy_kwargs) expl_path_collector = ContextualPathCollector( expl_env, exploration_policy, observation_key=img_observation_key, context_keys_for_policy=[img_desired_goal_key], ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, 
exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, max_path_length=max_path_length, **algo_kwargs) algorithm.to(ptu.device) if save_video: rollout_function = partial( rf.contextual_rollout, max_path_length=max_path_length, observation_key=img_observation_key, context_keys_for_policy=[img_desired_goal_key], ) video_renderer = EnvRenderer(**video_renderer_kwargs) video_eval_env = InsertImageEnv(eval_env, renderer=video_renderer, image_key='video_observation') video_expl_env = InsertImageEnv(expl_env, renderer=video_renderer, image_key='video_observation') video_eval_env = ContextualEnv( video_eval_env, context_distribution=eval_env.context_distribution, reward_fn=lambda *_: np.array([0]), observation_key=img_observation_key, ) video_expl_env = ContextualEnv( video_expl_env, context_distribution=expl_env.context_distribution, reward_fn=lambda *_: np.array([0]), observation_key=img_observation_key, ) eval_video_func = get_save_video_function( rollout_function, video_eval_env, MakeDeterministic(policy), tag="eval", imsize=video_renderer.image_shape[1], image_formats=[ env_renderer.output_image_format, env_renderer.output_image_format, video_renderer.output_image_format, ], keys_to_show=[ 'image_desired_goal', 'image_observation', 'video_observation' ], **save_video_kwargs) expl_video_func = get_save_video_function( rollout_function, video_expl_env, exploration_policy, tag="xplor", imsize=video_renderer.image_shape[1], image_formats=[ env_renderer.output_image_format, env_renderer.output_image_format, video_renderer.output_image_format, ], keys_to_show=[ 'image_desired_goal', 'image_observation', 'video_observation' ], **save_video_kwargs) algorithm.post_train_funcs.append(eval_video_func) algorithm.post_train_funcs.append(expl_video_func) algorithm.train()
def active_representation_learning_experiment(variant):
    import rlkit.torch.pytorch_util as ptu
    from rlkit.data_management.obs_dict_replay_buffer import ObsDictReplayBuffer
    from rlkit.torch.networks import ConcatMlp
    from rlkit.torch.sac.policies import TanhGaussianPolicy
    from rlkit.torch.arl.active_representation_learning_algorithm import \
        ActiveRepresentationLearningAlgorithm
    from rlkit.torch.arl.representation_wrappers import RepresentationWrappedEnv
    from multiworld.core.image_env import ImageEnv
    from rlkit.samplers.data_collector import MdpPathCollector

    preprocess_rl_variant(variant)

    model_class = variant.get('model_class')
    model_kwargs = variant.get('model_kwargs')
    model = model_class(**model_kwargs)
    model.representation_size = 4
    model.imsize = 48
    variant["vae_path"] = model

    reward_params = variant.get("reward_params", dict())
    init_camera = variant.get("init_camera", None)
    env = variant["env_class"](**variant['env_kwargs'])
    image_env = ImageEnv(
        env,
        variant.get('imsize'),
        init_camera=init_camera,
        transpose=True,
        normalize=True,
    )
    env = RepresentationWrappedEnv(image_env, model)

    uniform_dataset_fn = variant.get('generate_uniform_dataset_fn', None)
    if uniform_dataset_fn:
        uniform_dataset = uniform_dataset_fn(
            **variant['generate_uniform_dataset_kwargs'])
    else:
        uniform_dataset = None

    observation_key = variant.get('observation_key', 'latent_observation')
    desired_goal_key = variant.get('desired_goal_key', 'latent_desired_goal')
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    obs_dim = env.observation_space.spaces[observation_key].low.size
    action_dim = env.action_space.low.size
    hidden_sizes = variant.get('hidden_sizes', [400, 300])

    qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                    hidden_sizes=hidden_sizes)
    qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                    hidden_sizes=hidden_sizes)
    target_qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                           hidden_sizes=hidden_sizes)
    target_qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                           hidden_sizes=hidden_sizes)
    policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim,
                                hidden_sizes=hidden_sizes)

    vae = env.vae

    replay_buffer = ObsDictReplayBuffer(env=env,
                                        **variant['replay_buffer_kwargs'])

    model_trainer_class = variant.get('model_trainer_class')
    model_trainer_kwargs = variant.get('model_trainer_kwargs')
    model_trainer = model_trainer_class(model, **model_trainer_kwargs)
    # vae_trainer = ConvVAETrainer(
    #     env.vae,
    #     **variant['online_vae_trainer_kwargs']
    # )
    assert 'vae_training_schedule' not in variant, "Just put it in algo_kwargs"
    max_path_length = variant['max_path_length']

    trainer = SACTrainer(env=env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['twin_sac_trainer_kwargs'])
    # trainer = HERTrainer(trainer)
    eval_path_collector = MdpPathCollector(
        env,
        MakeDeterministic(policy),
        # max_path_length,
        # observation_key=observation_key,
        # desired_goal_key=desired_goal_key,
    )
    expl_path_collector = MdpPathCollector(
        env,
        policy,
        # max_path_length,
        # observation_key=observation_key,
        # desired_goal_key=desired_goal_key,
    )

    algorithm = ActiveRepresentationLearningAlgorithm(
        trainer=trainer,
        exploration_env=env,
        evaluation_env=env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        model=model,
        model_trainer=model_trainer,
        uniform_dataset=uniform_dataset,
        max_path_length=max_path_length,
        **variant['algo_kwargs'])

    algorithm.to(ptu.device)
    vae.to(ptu.device)
    algorithm.train()
def _use_disentangled_encoder_distance(
        max_path_length,
        encoder_kwargs,
        disentangled_qf_kwargs,
        qf_kwargs,
        sac_trainer_kwargs,
        replay_buffer_kwargs,
        policy_kwargs,
        evaluation_goal_sampling_mode,
        exploration_goal_sampling_mode,
        algo_kwargs,
        env_id=None,
        env_class=None,
        env_kwargs=None,
        encoder_key_prefix='encoder',
        encoder_input_prefix='state',
        latent_dim=2,
        reward_mode=EncoderWrappedEnv.ENCODER_DISTANCE_REWARD,
        # Video parameters
        save_video=True,
        save_video_kwargs=None,
        save_vf_heatmap=True,
        **kwargs):
    if save_video_kwargs is None:
        save_video_kwargs = {}
    if env_kwargs is None:
        env_kwargs = {}
    assert env_id or env_class

    vectorized = (
        reward_mode == EncoderWrappedEnv.VECTORIZED_ENCODER_DISTANCE_REWARD)
    if env_id:
        import gym
        import multiworld
        multiworld.register_all_envs()
        raw_train_env = gym.make(env_id)
        raw_eval_env = gym.make(env_id)
    else:
        raw_eval_env = env_class(**env_kwargs)
        raw_train_env = env_class(**env_kwargs)
    raw_train_env.goal_sampling_mode = exploration_goal_sampling_mode
    raw_eval_env.goal_sampling_mode = evaluation_goal_sampling_mode

    raw_obs_dim = (
        raw_train_env.observation_space.spaces['state_observation'].low.size)
    action_dim = raw_train_env.action_space.low.size

    # NOTE: the MLP encoder is immediately replaced by an identity mapping,
    # so the "encoded" observations are the raw state observations.
    encoder = ConcatMlp(input_size=raw_obs_dim, output_size=latent_dim,
                        **encoder_kwargs)
    encoder = Identity()
    encoder.input_size = raw_obs_dim
    encoder.output_size = raw_obs_dim

    np_encoder = EncoderFromNetwork(encoder)
    train_env = EncoderWrappedEnv(
        raw_train_env,
        np_encoder,
        encoder_input_prefix,
        key_prefix=encoder_key_prefix,
        reward_mode=reward_mode,
    )
    eval_env = EncoderWrappedEnv(
        raw_eval_env,
        np_encoder,
        encoder_input_prefix,
        key_prefix=encoder_key_prefix,
        reward_mode=reward_mode,
    )
    observation_key = '{}_observation'.format(encoder_key_prefix)
    desired_goal_key = '{}_desired_goal'.format(encoder_key_prefix)
    achieved_goal_key = '{}_achieved_goal'.format(encoder_key_prefix)
    obs_dim = train_env.observation_space.spaces[observation_key].low.size
    goal_dim = train_env.observation_space.spaces[desired_goal_key].low.size

    def make_qf():
        return DisentangledMlpQf(encoder=encoder,
                                 preprocess_obs_dim=obs_dim,
                                 action_dim=action_dim,
                                 qf_kwargs=qf_kwargs,
                                 vectorized=vectorized,
                                 **disentangled_qf_kwargs)

    qf1 = make_qf()
    qf2 = make_qf()
    target_qf1 = make_qf()
    target_qf2 = make_qf()
    policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim,
                                action_dim=action_dim,
                                **policy_kwargs)
    replay_buffer = ObsDictRelabelingBuffer(
        env=train_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        vectorized=vectorized,
        **replay_buffer_kwargs)
    sac_trainer = SACTrainer(env=train_env,
                             policy=policy,
                             qf1=qf1,
                             qf2=qf2,
                             target_qf1=target_qf1,
                             target_qf2=target_qf2,
                             **sac_trainer_kwargs)
    trainer = HERTrainer(sac_trainer)

    eval_path_collector = GoalConditionedPathCollector(
        eval_env,
        MakeDeterministic(policy),
        max_path_length,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        goal_sampling_mode='env',
    )
    expl_path_collector = GoalConditionedPathCollector(
        train_env,
        policy,
        max_path_length,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        goal_sampling_mode='env',
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=train_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=max_path_length,
        **algo_kwargs)
    algorithm.to(ptu.device)

    if save_video:

        def v_function(obs):
            action = policy.get_actions(obs)
            obs, action = ptu.from_numpy(obs), ptu.from_numpy(action)
            return qf1(obs, action, return_individual_q_vals=True)

        add_heatmap = partial(
            add_heatmap_imgs_to_o_dict,
            v_function=v_function,
            vectorized=vectorized,
        )
        rollout_function = rf.create_rollout_function(
            rf.multitask_rollout,
            max_path_length=max_path_length,
            observation_key=observation_key,
            desired_goal_key=desired_goal_key,
            full_o_postprocess_func=add_heatmap if save_vf_heatmap else None,
        )
        img_keys = ['v_vals'] + [
            'v_vals_dim_{}'.format(dim) for dim in range(latent_dim)
        ]
        eval_video_func = get_save_video_function(
            rollout_function,
            eval_env,
            MakeDeterministic(policy),
            get_extra_imgs=partial(get_extra_imgs, img_keys=img_keys),
            tag="eval",
            **save_video_kwargs)
        train_video_func = get_save_video_function(
            rollout_function,
            train_env,
            policy,
            get_extra_imgs=partial(get_extra_imgs, img_keys=img_keys),
            tag="train",
            **save_video_kwargs)
        algorithm.post_train_funcs.append(eval_video_func)
        algorithm.post_train_funcs.append(train_video_func)

    algorithm.train()
def experiment(variant):
    expl_env = make_env()
    eval_env = make_env()
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                    hidden_sizes=[M, M])
    qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                    hidden_sizes=[M, M])
    target_qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                           hidden_sizes=[M, M])
    target_qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                           hidden_sizes=[M, M])
    policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim,
                                hidden_sizes=[M, M])
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
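# Usage sketch (illustrative, not from the original source): a minimal
# 'variant' for the experiment() above. The nested kwargs assume the
# hyperparameter names used in rlkit's SACTrainer and TorchBatchRLAlgorithm;
# the values are placeholders for a short smoke-test run, not tuned settings.
example_variant = dict(
    layer_size=256,
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(
        discount=0.99,
        soft_target_tau=5e-3,
        target_update_period=1,
        policy_lr=3e-4,
        qf_lr=3e-4,
        reward_scale=1,
        use_automatic_entropy_tuning=True,
    ),
    algorithm_kwargs=dict(
        batch_size=256,
        max_path_length=200,
        num_epochs=5,
        num_eval_steps_per_epoch=1000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
)
# experiment(example_variant)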
def twin_sac_experiment_online_vae(variant):
    import rlkit.torch.pytorch_util as ptu
    from rlkit.data_management.online_vae_replay_buffer import \
        OnlineVaeRelabelingBuffer
    from rlkit.torch.networks import ConcatMlp
    from rlkit.torch.sac.policies import TanhGaussianPolicy
    from rlkit.torch.vae.vae_trainer import ConvVAETrainer

    preprocess_rl_variant(variant)
    env = get_envs(variant)

    uniform_dataset_fn = variant.get('generate_uniform_dataset_fn', None)
    if uniform_dataset_fn:
        uniform_dataset = uniform_dataset_fn(
            **variant['generate_uniform_dataset_kwargs'])
    else:
        uniform_dataset = None

    observation_key = variant.get('observation_key', 'latent_observation')
    desired_goal_key = variant.get('desired_goal_key', 'latent_desired_goal')
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    obs_dim = (env.observation_space.spaces[observation_key].low.size +
               env.observation_space.spaces[desired_goal_key].low.size)
    action_dim = env.action_space.low.size
    hidden_sizes = variant.get('hidden_sizes', [400, 300])

    qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                    hidden_sizes=hidden_sizes)
    qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                    hidden_sizes=hidden_sizes)
    target_qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                           hidden_sizes=hidden_sizes)
    target_qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                           hidden_sizes=hidden_sizes)
    policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim,
                                hidden_sizes=hidden_sizes)

    vae = env.vae
    replay_buffer_class = variant.get("replay_buffer_class",
                                      OnlineVaeRelabelingBuffer)
    replay_buffer = replay_buffer_class(vae=env.vae,
                                        env=env,
                                        observation_key=observation_key,
                                        desired_goal_key=desired_goal_key,
                                        achieved_goal_key=achieved_goal_key,
                                        **variant['replay_buffer_kwargs'])
    vae_trainer_class = variant.get("vae_trainer_class", ConvVAETrainer)
    vae_trainer = vae_trainer_class(env.vae,
                                    **variant['online_vae_trainer_kwargs'])

    assert 'vae_training_schedule' not in variant, "Just put it in algo_kwargs"
    max_path_length = variant['max_path_length']

    trainer = SACTrainer(env=env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['twin_sac_trainer_kwargs'])
    trainer = HERTrainer(trainer)
    eval_path_collector = VAEWrappedEnvPathCollector(
        variant['evaluation_goal_sampling_mode'],
        env,
        MakeDeterministic(policy),
        max_path_length,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    expl_path_collector = VAEWrappedEnvPathCollector(
        variant['exploration_goal_sampling_mode'],
        env,
        policy,
        max_path_length,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )

    algorithm = OnlineVaeAlgorithm(
        trainer=trainer,
        exploration_env=env,
        evaluation_env=env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        vae=vae,
        vae_trainer=vae_trainer,
        uniform_dataset=uniform_dataset,
        max_path_length=max_path_length,
        **variant['algo_kwargs'])

    if variant.get("save_video", True):
        video_func = VideoSaveFunction(env, variant)
        algorithm.post_train_funcs.append(video_func)
    if variant['custom_goal_sampler'] == 'replay_buffer':
        env.custom_goal_sampler = replay_buffer.sample_buffer_goals

    algorithm.to(ptu.device)
    vae.to(ptu.device)
    algorithm.train()
def goal_conditioned_sac_experiment(
        max_path_length,
        qf_kwargs,
        sac_trainer_kwargs,
        replay_buffer_kwargs,
        policy_kwargs,
        algo_kwargs,
        env_id=None,
        env_class=None,
        env_kwargs=None,
        observation_key='state_observation',
        desired_goal_key='state_desired_goal',
        achieved_goal_key='state_achieved_goal',
        exploration_policy_kwargs=None,
        evaluation_goal_sampling_mode=None,
        exploration_goal_sampling_mode=None,
        # Video parameters
        save_video=True,
        save_video_kwargs=None,
        renderer_kwargs=None,
):
    if exploration_policy_kwargs is None:
        exploration_policy_kwargs = {}
    if not save_video_kwargs:
        save_video_kwargs = {}
    if not renderer_kwargs:
        renderer_kwargs = {}

    context_key = desired_goal_key
    sample_context_from_obs_dict_fn = RemapKeyFn(
        {context_key: observation_key})

    def contextual_env_distrib_and_reward(
            env_id, env_class, env_kwargs, goal_sampling_mode
    ):
        env = get_gym_env(env_id, env_class=env_class, env_kwargs=env_kwargs)
        env.goal_sampling_mode = goal_sampling_mode
        goal_distribution = GoalDictDistributionFromMultitaskEnv(
            env,
            desired_goal_keys=[desired_goal_key],
        )
        reward_fn = ContextualRewardFnFromMultitaskEnv(
            env=env,
            achieved_goal_from_observation=IndexIntoAchievedGoal(
                observation_key),
            desired_goal_key=desired_goal_key,
            achieved_goal_key=achieved_goal_key,
        )
        diag_fn = GoalConditionedDiagnosticsToContextualDiagnostics(
            env.goal_conditioned_diagnostics,
            desired_goal_key=desired_goal_key,
            observation_key=observation_key,
        )
        env = ContextualEnv(
            env,
            context_distribution=goal_distribution,
            reward_fn=reward_fn,
            observation_key=observation_key,
            contextual_diagnostics_fns=[diag_fn],
            update_env_info_fn=delete_info,
        )
        return env, goal_distribution, reward_fn

    expl_env, expl_context_distrib, expl_reward = contextual_env_distrib_and_reward(
        env_id, env_class, env_kwargs, exploration_goal_sampling_mode
    )
    eval_env, eval_context_distrib, eval_reward = contextual_env_distrib_and_reward(
        env_id, env_class, env_kwargs, evaluation_goal_sampling_mode
    )

    obs_dim = (
        expl_env.observation_space.spaces[observation_key].low.size
        + expl_env.observation_space.spaces[context_key].low.size
    )
    action_dim = expl_env.action_space.low.size

    def create_qf():
        return ConcatMlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            **qf_kwargs
        )

    qf1 = create_qf()
    qf2 = create_qf()
    target_qf1 = create_qf()
    target_qf2 = create_qf()

    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **policy_kwargs
    )

    def concat_context_to_obs(batch, *args, **kwargs):
        obs = batch['observations']
        next_obs = batch['next_observations']
        context = batch[context_key]
        batch['observations'] = np.concatenate([obs, context], axis=1)
        batch['next_observations'] = np.concatenate([next_obs, context],
                                                    axis=1)
        return batch

    replay_buffer = ContextualRelabelingReplayBuffer(
        env=eval_env,
        context_keys=[context_key],
        observation_keys=[observation_key],
        context_distribution=eval_context_distrib,
        sample_context_from_obs_dict_fn=sample_context_from_obs_dict_fn,
        reward_fn=eval_reward,
        post_process_batch_fn=concat_context_to_obs,
        **replay_buffer_kwargs
    )
    trainer = SACTrainer(
        env=expl_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **sac_trainer_kwargs
    )

    eval_path_collector = ContextualPathCollector(
        eval_env,
        MakeDeterministic(policy),
        observation_key=observation_key,
        context_keys_for_policy=[context_key],
    )
    exploration_policy = create_exploration_policy(
        policy=policy, env=expl_env, **exploration_policy_kwargs)
    expl_path_collector = ContextualPathCollector(
        expl_env,
        exploration_policy,
        observation_key=observation_key,
        context_keys_for_policy=[context_key],
    )

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=max_path_length,
        **algo_kwargs
    )
    algorithm.to(ptu.device)

    if save_video:
        rollout_function = partial(
            rf.contextual_rollout,
            max_path_length=max_path_length,
            observation_key=observation_key,
            context_keys_for_policy=[context_key],
        )
        renderer = EnvRenderer(**renderer_kwargs)

        def add_images(env, state_distribution):
            state_env = env.env
            image_goal_distribution = AddImageDistribution(
                env=state_env,
                base_distribution=state_distribution,
                image_goal_key='image_desired_goal',
                renderer=renderer,
            )
            img_env = InsertImageEnv(state_env, renderer=renderer)
            return ContextualEnv(
                img_env,
                context_distribution=image_goal_distribution,
                reward_fn=eval_reward,
                observation_key=observation_key,
                update_env_info_fn=delete_info,
            )

        img_eval_env = add_images(eval_env, eval_context_distrib)
        img_expl_env = add_images(expl_env, expl_context_distrib)
        eval_video_func = get_save_video_function(
            rollout_function,
            img_eval_env,
            MakeDeterministic(policy),
            tag="eval",
            imsize=renderer.width,
            image_format=renderer.output_image_format,
            **save_video_kwargs
        )
        expl_video_func = get_save_video_function(
            rollout_function,
            img_expl_env,
            exploration_policy,
            tag="train",
            imsize=renderer.width,
            image_format=renderer.output_image_format,
            **save_video_kwargs
        )

        algorithm.post_train_funcs.append(eval_video_func)
        algorithm.post_train_funcs.append(expl_video_func)

    algorithm.train()
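# Illustrative call sketch for goal_conditioned_sac_experiment() above. The
# env id, goal-sampling modes, and every hyperparameter value here are
# assumptions for a quick run; only the keyword structure is taken from the
# function signature, and the replay-buffer keys follow rlkit's contextual
# buffer conventions.
# goal_conditioned_sac_experiment(
#     max_path_length=50,
#     qf_kwargs=dict(hidden_sizes=[400, 300]),
#     sac_trainer_kwargs=dict(discount=0.99, reward_scale=1,
#                             use_automatic_entropy_tuning=True),
#     replay_buffer_kwargs=dict(max_size=int(1e6),
#                               fraction_future_context=0.4,
#                               fraction_distribution_context=0.4),
#     policy_kwargs=dict(hidden_sizes=[400, 300]),
#     algo_kwargs=dict(batch_size=256,
#                      num_epochs=100,
#                      num_eval_steps_per_epoch=500,
#                      num_expl_steps_per_train_loop=500,
#                      num_trains_per_train_loop=500,
#                      min_num_steps_before_training=500),
#     env_id='SawyerPushNIPSEasy-v0',  # placeholder multiworld env id
#     evaluation_goal_sampling_mode='reset_of_env',
#     exploration_goal_sampling_mode='reset_of_env',
#     save_video=False,
# )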
def experiment(variant):
    # Start the ROS services on a throwaway env instance before creating the
    # evaluation and exploration envs (the returned value is not used).
    env = gym.make('RLkitGoalUR-v0')._start_ros_services()
    eval_env = gym.make('RLkitGoalUR-v0')
    expl_env = gym.make('RLkitGoalUR-v0')

    observation_key = 'observation'
    desired_goal_key = 'desired_goal'
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    replay_buffer = ObsDictRelabelingBuffer(
        env=eval_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs'])

    obs_dim = eval_env.observation_space.spaces['observation'].low.size
    action_dim = eval_env.action_space.low.size
    goal_dim = eval_env.observation_space.spaces['desired_goal'].low.size
    qf1 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim,
                            output_size=1,
                            **variant['qf_kwargs'])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim,
                            output_size=1,
                            **variant['qf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim,
                                action_dim=action_dim,
                                **variant['policy_kwargs'])
    eval_policy = MakeDeterministic(policy)

    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['sac_trainer_kwargs'])
    trainer = HERTrainer(trainer)
    eval_path_collector = GoalConditionedPathCollector(
        eval_env,
        eval_policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    expl_path_collector = GoalConditionedPathCollector(
        expl_env,
        policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
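# Illustrative only: a minimal 'variant' for the UR-robot experiment above.
# The replay-buffer fractions follow the HER-style keys used elsewhere in
# this file; all values are placeholder assumptions, not tuned settings.
# ur_variant = dict(
#     replay_buffer_kwargs=dict(max_size=int(1e6),
#                               fraction_goals_rollout_goals=0.2,
#                               fraction_goals_env_goals=0.5),
#     qf_kwargs=dict(hidden_sizes=[400, 300]),
#     policy_kwargs=dict(hidden_sizes=[400, 300]),
#     sac_trainer_kwargs=dict(discount=0.99,
#                             use_automatic_entropy_tuning=True),
#     algo_kwargs=dict(batch_size=256,
#                      max_path_length=100,
#                      num_epochs=100,
#                      num_eval_steps_per_epoch=1000,
#                      num_expl_steps_per_train_loop=1000,
#                      num_trains_per_train_loop=1000,
#                      min_num_steps_before_training=1000),
# )
# experiment(ur_variant)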
def skewfit_experiment(variant):
    import rlkit.torch.pytorch_util as ptu
    from rlkit.data_management.online_vae_replay_buffer \
        import OnlineVaeRelabelingBuffer
    from rlkit.torch.networks import FlattenMlp
    from rlkit.torch.sac.policies import TanhGaussianPolicy
    from rlkit.torch.vae.vae_trainer import ConvVAETrainer
    import rlkit.torch.vae.vae_schedules as vae_schedules

    #### Parameters for training the VAE and RIG (with defaults)
    env = get_envs(variant)
    observation_key = variant.get('observation_key', 'latent_observation')
    desired_goal_key = variant.get('desired_goal_key', 'latent_desired_goal')
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    obs_dim = (env.observation_space.spaces[observation_key].low.size +
               env.observation_space.spaces[desired_goal_key].low.size)
    action_dim = env.action_space.low.size
    hidden_sizes = variant.get('hidden_sizes', [400, 300])
    replay_buffer_kwargs = variant.get(
        'replay_buffer_kwargs',
        dict(
            start_skew_epoch=10,
            max_size=int(100000),
            fraction_goals_rollout_goals=0.2,
            fraction_goals_env_goals=0.5,
            exploration_rewards_type='None',
            vae_priority_type='vae_prob',
            priority_function_kwargs=dict(
                sampling_method='importance_sampling',
                decoder_distribution='gaussian_identity_variance',
                num_latents_to_sample=10,
            ),
            power=0,
            relabeling_goal_sampling_mode='vae_prior',
        ))
    online_vae_trainer_kwargs = variant.get('online_vae_trainer_kwargs',
                                            dict(beta=20, lr=1e-3))
    max_path_length = variant.get('max_path_length', 50)
    algo_kwargs = variant.get(
        'algo_kwargs',
        dict(
            batch_size=1024,
            num_epochs=1000,
            num_eval_steps_per_epoch=500,
            num_expl_steps_per_train_loop=500,
            num_trains_per_train_loop=1000,
            min_num_steps_before_training=10000,
            vae_training_schedule=vae_schedules.custom_schedule_2,
            oracle_data=False,
            vae_save_period=50,
            parallel_vae_train=False,
        ))
    twin_sac_trainer_kwargs = variant.get(
        'twin_sac_trainer_kwargs',
        dict(
            discount=0.99,
            reward_scale=1,
            soft_target_tau=1e-3,
            target_update_period=1,
            use_automatic_entropy_tuning=True,
        ))
    ############################################################################

    qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                     hidden_sizes=hidden_sizes)
    qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                     hidden_sizes=hidden_sizes)
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                            hidden_sizes=hidden_sizes)
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                            hidden_sizes=hidden_sizes)
    policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim,
                                hidden_sizes=hidden_sizes)

    vae = variant['vae_model']

    # Create a replay buffer for training the VAE online.
    replay_buffer = OnlineVaeRelabelingBuffer(
        vae=vae,
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **replay_buffer_kwargs)

    # Create an online vae_trainer to train the VAE on the fly.
    vae_trainer = ConvVAETrainer(variant['vae_train_data'],
                                 variant['vae_test_data'], vae,
                                 **online_vae_trainer_kwargs)

    # Create a SACTrainer to learn the soft Q-functions and the policy.
    trainer = SACTrainer(env=env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **twin_sac_trainer_kwargs)
    trainer = HERTrainer(trainer)

    eval_path_collector = VAEWrappedEnvPathCollector(
        variant.get('evaluation_goal_sampling_mode', 'reset_of_env'),
        env,
        MakeDeterministic(policy),
        max_path_length,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    expl_path_collector = VAEWrappedEnvPathCollector(
        variant.get('exploration_goal_sampling_mode', 'vae_prior'),
        env,
        policy,
        max_path_length,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )

    algorithm = OnlineVaeAlgorithm(
        trainer=trainer,
        exploration_env=env,
        evaluation_env=env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        vae=vae,
        vae_trainer=vae_trainer,
        max_path_length=max_path_length,
        **algo_kwargs)

    if variant['custom_goal_sampler'] == 'replay_buffer':
        env.custom_goal_sampler = replay_buffer.sample_buffer_goals

    algorithm.to(ptu.device)
    vae.to(ptu.device)
    algorithm.train()
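# Illustrative only: skewfit_experiment() falls back to the defaults defined
# above for most keys, so the caller mainly has to supply the VAE model, its
# train/test data, and the goal-sampling choice. The names below are
# hypothetical placeholders, not values from the original source.
# skewfit_variant = dict(
#     vae_model=pretrained_conv_vae,     # a ConvVAE instance
#     vae_train_data=train_images,       # image dataset for the VAE trainer
#     vae_test_data=test_images,
#     custom_goal_sampler='replay_buffer',
#     # plus whatever env configuration get_envs(variant) expects
# )
# skewfit_experiment(skewfit_variant)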
def experiment(variant):
    dummy_env = make_env(variant['env'])
    obs_dim = dummy_env.observation_space.low.size
    action_dim = dummy_env.action_space.low.size
    expl_env = VectorEnv([
        lambda: make_env(variant['env'])
        for _ in range(variant['expl_env_num'])
    ])
    expl_env.seed(variant["seed"])
    expl_env.action_space.seed(variant["seed"])
    eval_env = SubprocVectorEnv([
        lambda: make_env(variant['env'])
        for _ in range(variant['eval_env_num'])
    ])
    eval_env.seed(variant["seed"])

    M = variant['layer_size']
    qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                     hidden_sizes=[M, M])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                     hidden_sizes=[M, M])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                            hidden_sizes=[M, M])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1,
                            hidden_sizes=[M, M])
    policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim,
                                hidden_sizes=[M, M])
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = VecMdpPathCollector(eval_env, eval_policy)
    expl_path_collector = VecMdpStepCollector(expl_env, policy)
    replay_buffer = TorchReplayBuffer(variant['replay_buffer_size'], dummy_env)
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchVecOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
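# Illustrative only: a minimal 'variant' for the vectorised experiment()
# above. The gym id, seed, and env counts are placeholder assumptions, and
# 'trainer_kwargs' / 'algorithm_kwargs' are assumed to take the same keys as
# in the non-vectorised SAC example earlier in this file.
# vec_variant = dict(
#     env='HalfCheetah-v2',          # placeholder gym id handled by make_env
#     seed=0,
#     expl_env_num=4,
#     eval_env_num=2,
#     layer_size=256,
#     replay_buffer_size=int(1e6),
#     trainer_kwargs=dict(discount=0.99, soft_target_tau=5e-3,
#                         policy_lr=3e-4, qf_lr=3e-4, reward_scale=1,
#                         use_automatic_entropy_tuning=True),
#     algorithm_kwargs=dict(batch_size=256, max_path_length=1000,
#                           num_epochs=5, num_eval_steps_per_epoch=1000,
#                           num_expl_steps_per_train_loop=1000,
#                           num_trains_per_train_loop=1000,
#                           min_num_steps_before_training=1000),
# )
# experiment(vec_variant)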
def grill_her_sac_experiment(variant):
    import rlkit.samplers.rollout_functions as rf
    import rlkit.torch.pytorch_util as ptu
    from rlkit.data_management.obs_dict_replay_buffer import \
        ObsDictRelabelingBuffer
    from rlkit.torch.networks import ConcatMlp
    from rlkit.torch.sac.policies import TanhGaussianPolicy, MakeDeterministic
    from rlkit.torch.sac.sac import SACTrainer
    from rlkit.torch.her.her import HERTrainer
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    from rlkit.exploration_strategies.base import (
        PolicyWrappedWithExplorationStrategy)
    from rlkit.samplers.data_collector import GoalConditionedPathCollector
    from rlkit.torch.grill.launcher import (grill_preprocess_variant, get_envs,
                                            get_exploration_strategy)

    full_experiment_variant_preprocess(variant)
    variant = variant['grill_variant']
    grill_preprocess_variant(variant)
    env = get_envs(variant)
    es = get_exploration_strategy(variant, env)

    observation_key = variant.get('observation_key', 'latent_observation')
    desired_goal_key = variant.get('desired_goal_key', 'latent_desired_goal')
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    obs_dim = (env.observation_space.spaces[observation_key].low.size +
               env.observation_space.spaces[desired_goal_key].low.size)
    action_dim = env.action_space.low.size

    qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                    **variant['qf_kwargs'])
    qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                    **variant['qf_kwargs'])
    target_qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                           **variant['qf_kwargs'])
    target_qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                           **variant['qf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim,
                                **variant['policy_kwargs'])

    eval_policy = MakeDeterministic(policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    eval_path_collector = GoalConditionedPathCollector(
        env,
        eval_policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    expl_path_collector = GoalConditionedPathCollector(
        env,
        exploration_policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs'])
    trainer = SACTrainer(env=env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['sac_trainer_kwargs'])
    trainer = HERTrainer(trainer)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=env,
        evaluation_env=env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'])
    return algorithm
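# Note (not in the original source): unlike the other launchers in this file,
# grill_her_sac_experiment() returns the constructed algorithm instead of
# training it, so a caller is expected to finish with the same two calls the
# other experiments use:
#
#     algorithm = grill_her_sac_experiment(variant)
#     algorithm.to(ptu.device)
#     algorithm.train()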