def experiment(variant): expl_env = gym.make('GoalGridworld-v0') eval_env = gym.make('GoalGridworld-v0') obs_dim = expl_env.observation_space.spaces['observation'].low.size goal_dim = expl_env.observation_space.spaces['desired_goal'].low.size action_dim = expl_env.action_space.n qf = FlattenMlp( input_size=obs_dim + goal_dim, output_size=action_dim, hidden_sizes=[400, 300], ) target_qf = FlattenMlp( input_size=obs_dim + goal_dim, output_size=action_dim, hidden_sizes=[400, 300], ) eval_policy = ArgmaxDiscretePolicy(qf) exploration_strategy = EpsilonGreedy(action_space=expl_env.action_space, ) expl_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=exploration_strategy, policy=eval_policy, ) replay_buffer = ObsDictRelabelingBuffer(env=eval_env, **variant['replay_buffer_kwargs']) observation_key = 'observation' desired_goal_key = 'desired_goal' eval_path_collector = GoalConditionedPathCollector( eval_env, eval_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = GoalConditionedPathCollector( expl_env, expl_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) trainer = DQNTrainer(qf=qf, target_qf=target_qf, **variant['trainer_kwargs']) trainer = HERTrainer(trainer) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs']) algorithm.to(ptu.device) algorithm.train()
def get_path_collector(variant, expl_env, eval_env, policy, eval_policy): """ Define path collector """ mode = variant["mode"] if mode == "vanilla": expl_path_collector = MdpPathCollector(expl_env, policy) eval_path_collector = MdpPathCollector(eval_env, eval_policy) elif mode == "her": expl_path_collector = GoalConditionedPathCollector( expl_env, policy, observation_key=variant["her"]["observation_key"], desired_goal_key=variant["her"]["desired_goal_key"], representation_goal_key=variant["her"]["representation_goal_key"], ) eval_path_collector = GoalConditionedPathCollector( eval_env, eval_policy, observation_key=variant["her"]["observation_key"], desired_goal_key=variant["her"]["desired_goal_key"], representation_goal_key=variant["her"]["representation_goal_key"], ) return expl_path_collector, eval_path_collector
def _use_disentangled_encoder_distance( max_path_length, encoder_kwargs, disentangled_qf_kwargs, qf_kwargs, sac_trainer_kwargs, replay_buffer_kwargs, policy_kwargs, evaluation_goal_sampling_mode, exploration_goal_sampling_mode, algo_kwargs, env_id=None, env_class=None, env_kwargs=None, encoder_key_prefix='encoder', encoder_input_prefix='state', latent_dim=2, reward_mode=EncoderWrappedEnv.ENCODER_DISTANCE_REWARD, # Video parameters save_video=True, save_video_kwargs=None, save_vf_heatmap=True, **kwargs): if save_video_kwargs is None: save_video_kwargs = {} if env_kwargs is None: env_kwargs = {} assert env_id or env_class vectorized = ( reward_mode == EncoderWrappedEnv.VECTORIZED_ENCODER_DISTANCE_REWARD) if env_id: import gym import multiworld multiworld.register_all_envs() raw_train_env = gym.make(env_id) raw_eval_env = gym.make(env_id) else: raw_eval_env = env_class(**env_kwargs) raw_train_env = env_class(**env_kwargs) raw_train_env.goal_sampling_mode = exploration_goal_sampling_mode raw_eval_env.goal_sampling_mode = evaluation_goal_sampling_mode raw_obs_dim = ( raw_train_env.observation_space.spaces['state_observation'].low.size) action_dim = raw_train_env.action_space.low.size encoder = ConcatMlp(input_size=raw_obs_dim, output_size=latent_dim, **encoder_kwargs) encoder = Identity() encoder.input_size = raw_obs_dim encoder.output_size = raw_obs_dim np_encoder = EncoderFromNetwork(encoder) train_env = EncoderWrappedEnv( raw_train_env, np_encoder, encoder_input_prefix, key_prefix=encoder_key_prefix, reward_mode=reward_mode, ) eval_env = EncoderWrappedEnv( raw_eval_env, np_encoder, encoder_input_prefix, key_prefix=encoder_key_prefix, reward_mode=reward_mode, ) observation_key = '{}_observation'.format(encoder_key_prefix) desired_goal_key = '{}_desired_goal'.format(encoder_key_prefix) achieved_goal_key = '{}_achieved_goal'.format(encoder_key_prefix) obs_dim = train_env.observation_space.spaces[observation_key].low.size goal_dim = train_env.observation_space.spaces[desired_goal_key].low.size def make_qf(): return DisentangledMlpQf(encoder=encoder, preprocess_obs_dim=obs_dim, action_dim=action_dim, qf_kwargs=qf_kwargs, vectorized=vectorized, **disentangled_qf_kwargs) qf1 = make_qf() qf2 = make_qf() target_qf1 = make_qf() target_qf2 = make_qf() policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim, action_dim=action_dim, **policy_kwargs) replay_buffer = ObsDictRelabelingBuffer( env=train_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, vectorized=vectorized, **replay_buffer_kwargs) sac_trainer = SACTrainer(env=train_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **sac_trainer_kwargs) trainer = HERTrainer(sac_trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, MakeDeterministic(policy), max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, goal_sampling_mode='env', ) expl_path_collector = GoalConditionedPathCollector( train_env, policy, max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, goal_sampling_mode='env', ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=train_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, max_path_length=max_path_length, **algo_kwargs) algorithm.to(ptu.device) if save_video: def v_function(obs): action = policy.get_actions(obs) obs, action = ptu.from_numpy(obs), ptu.from_numpy(action) return qf1(obs, action, return_individual_q_vals=True) add_heatmap = partial( add_heatmap_imgs_to_o_dict, v_function=v_function, vectorized=vectorized, ) rollout_function = rf.create_rollout_function( rf.multitask_rollout, max_path_length=max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, full_o_postprocess_func=add_heatmap if save_vf_heatmap else None, ) img_keys = ['v_vals'] + [ 'v_vals_dim_{}'.format(dim) for dim in range(latent_dim) ] eval_video_func = get_save_video_function(rollout_function, eval_env, MakeDeterministic(policy), get_extra_imgs=partial( get_extra_imgs, img_keys=img_keys), tag="eval", **save_video_kwargs) train_video_func = get_save_video_function(rollout_function, train_env, policy, get_extra_imgs=partial( get_extra_imgs, img_keys=img_keys), tag="train", **save_video_kwargs) algorithm.post_train_funcs.append(eval_video_func) algorithm.post_train_funcs.append(train_video_func) algorithm.train()
def experiment(variant): from multiworld.envs.mujoco import register_mujoco_envs register_mujoco_envs() env_id = variant['env_id'] eval_env = gym.make(env_id) expl_env = gym.make(env_id) observation_key = 'state_observation' desired_goal_key = 'state_desired_goal' eval_env.reward_type = variant['reward_type'] expl_env.reward_type = variant['reward_type'] achieved_goal_key = desired_goal_key.replace("desired", "achieved") replay_buffer = ObsDictRelabelingBuffer( env=eval_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs']) obs_dim = eval_env.observation_space.spaces['observation'].low.size action_dim = eval_env.action_space.low.size goal_dim = eval_env.observation_space.spaces['desired_goal'].low.size qf1 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) qf2 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) target_qf1 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) target_qf2 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim, action_dim=action_dim, **variant['policy_kwargs']) eval_policy = MakeDeterministic(policy) trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['sac_trainer_kwargs']) trainer = HERTrainer(trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, eval_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = GoalConditionedPathCollector( expl_env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): expl_env = gym.make('pick_and_lift-state-v0',sparse=True, not_special_p=0.5, ground_p = 0, special_is_grip=True, img_size=256, force_randomly_place=False, force_change_position=False) observation_key = 'observation' desired_goal_key = 'desired_goal' achieved_goal_key = "achieved_goal" replay_buffer = ObsDictRelabelingBuffer( env=expl_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs'] ) obs_dim = expl_env.observation_space.spaces['observation'].low.size action_dim = expl_env.action_space.low.size goal_dim = expl_env.observation_space.spaces['desired_goal'].low.size qf1 = FlattenMlp( input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs'] ) qf2 = FlattenMlp( input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs'] ) target_qf1 = FlattenMlp( input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs'] ) target_qf2 = FlattenMlp( input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs'] ) policy = TanhMlpPolicy( input_size=obs_dim + goal_dim, output_size=action_dim, **variant['policy_kwargs'] ) target_policy = TanhMlpPolicy( input_size=obs_dim + goal_dim, output_size=action_dim, **variant['policy_kwargs'] ) es = GaussianStrategy( action_space=expl_env.action_space, max_sigma=0.3, min_sigma=0.1, decay_period=1000000 # Constant sigma ) exploration_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) trainer = TD3Trainer( # env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, target_policy=target_policy, **variant['td3_trainer_kwargs'] ) trainer = HERTrainer(trainer) expl_path_collector = GoalConditionedPathCollector( expl_env, exploration_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=None, exploration_data_collector=expl_path_collector, evaluation_data_collector=None, replay_buffer=replay_buffer, **variant['algo_kwargs'] ) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): # unwrap the TimeLimitEnv wrapper since we manually terminate after 50 steps eval_env = gym.make('FetchReach-v1').env expl_env = gym.make('FetchReach-v1').env observation_key = 'observation' desired_goal_key = 'desired_goal' achieved_goal_key = desired_goal_key.replace("desired", "achieved") replay_buffer = ObsDictRelabelingBuffer( env=eval_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs']) obs_dim = eval_env.observation_space.spaces['observation'].low.size action_dim = eval_env.action_space.low.size goal_dim = eval_env.observation_space.spaces['desired_goal'].low.size qf1 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) qf2 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) target_qf1 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) target_qf2 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim, action_dim=action_dim, **variant['policy_kwargs']) eval_policy = MakeDeterministic(policy) trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['sac_trainer_kwargs']) trainer = HERTrainer(trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, eval_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = GoalConditionedPathCollector( expl_env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs']) algorithm.to(ptu.device) algorithm.train()
def grill_her_sac_experiment(variant): import rlkit.samplers.rollout_functions as rf import rlkit.torch.pytorch_util as ptu from rlkit.data_management.obs_dict_replay_buffer import \ ObsDictRelabelingBuffer from rlkit.torch.networks import ConcatMlp from rlkit.torch.sac.policies import TanhGaussianPolicy, MakeDeterministic from rlkit.torch.sac.sac import SACTrainer from rlkit.torch.her.her import HERTrainer from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm from rlkit.exploration_strategies.base import ( PolicyWrappedWithExplorationStrategy) from rlkit.samplers.data_collector import GoalConditionedPathCollector from rlkit.torch.grill.launcher import (grill_preprocess_variant, get_envs, get_exploration_strategy) full_experiment_variant_preprocess(variant) variant = variant['grill_variant'] grill_preprocess_variant(variant) env = get_envs(variant) es = get_exploration_strategy(variant, env) observation_key = variant.get('observation_key', 'latent_observation') desired_goal_key = variant.get('desired_goal_key', 'latent_desired_goal') achieved_goal_key = desired_goal_key.replace("desired", "achieved") obs_dim = (env.observation_space.spaces[observation_key].low.size + env.observation_space.spaces[desired_goal_key].low.size) action_dim = env.action_space.low.size qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) target_qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) target_qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim, **variant['policy_kwargs']) eval_policy = MakeDeterministic(policy) exploration_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) eval_path_collector = GoalConditionedPathCollector( env, eval_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = GoalConditionedPathCollector( env, exploration_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) replay_buffer = ObsDictRelabelingBuffer( env=env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs']) trainer = SACTrainer(env=env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['sac_trainer_kwargs']) trainer = HERTrainer(trainer) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=env, evaluation_env=env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs']) return algorithm
def experiment(variant): expl_env = envs[variant['env']](variant['dr']) expl_env = TransformObservationWrapper(expl_env, variant['main_t_fn']) observation_key = 'observation' desired_goal_key = 'desired_goal' achieved_goal_key = "achieved_goal" replay_buffer = imgObsDictRelabelingBuffer( env=expl_env, rerendering_env=rerendering_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, t_fn=variant['t_fn'], **variant['replay_buffer_kwargs']) # obs_dim = expl_env.observation_space.low.size action_dim = expl_env.action_space.low.size if variant['mlf']: if variant['alt'] == 'both': conv_args = { "input_width": 16, "input_height": 16, "input_channels": 16, "kernel_sizes": [4], "n_channels": [32], "strides": [4], "paddings": [0], "hidden_sizes": [1024, 512], # "added_fc_input_size": action_dim, "batch_norm_conv": False, "batch_norm_fc": False, 'init_w': 1e-4, "hidden_init": nn.init.orthogonal_, "hidden_activation": nn.ReLU(), } else: conv_args = { "input_width": 16, "input_height": 16, "input_channels": 8, "kernel_sizes": [4], "n_channels": [32], "strides": [4], "paddings": [0], "hidden_sizes": [1024, 512], "batch_norm_conv": False, "batch_norm_fc": False, 'init_w': 1e-4, "hidden_init": nn.init.orthogonal_, "hidden_activation": nn.ReLU(), } else: if variant['alt'] == 'both': conv_args = { "input_width": 64, "input_height": 64, "input_channels": 6, "kernel_sizes": [4, 4, 3], "n_channels": [32, 64, 64], "strides": [2, 1, 1], "paddings": [0, 0, 0], "hidden_sizes": [1024, 512], "batch_norm_conv": False, "batch_norm_fc": False, 'init_w': 1e-4, "hidden_init": nn.init.orthogonal_, "hidden_activation": nn.ReLU(), } else: conv_args = { "input_width": 64, "input_height": 64, "input_channels": 3, "kernel_sizes": [4, 4, 3], "n_channels": [32, 64, 64], "strides": [2, 1, 1], "paddings": [0, 0, 0], "hidden_sizes": [1024, 512], "batch_norm_conv": False, "batch_norm_fc": False, 'init_w': 1e-4, "hidden_init": nn.init.orthogonal_, "hidden_activation": nn.ReLU(), } qf1 = FlattenCNN(output_size=1, added_fc_input_size=action_dim, **variant['qf_kwargs'], **conv_args) qf2 = FlattenCNN(output_size=1, added_fc_input_size=action_dim, **variant['qf_kwargs'], **conv_args) target_qf1 = FlattenCNN(output_size=1, added_fc_input_size=action_dim, **variant['qf_kwargs'], **conv_args) target_qf2 = FlattenCNN(output_size=1, added_fc_input_size=action_dim, **variant['qf_kwargs'], **conv_args) policy = TanhCNNPolicy(output_size=action_dim, **variant['policy_kwargs'], **conv_args) target_policy = TanhCNNPolicy(output_size=action_dim, **variant['policy_kwargs'], **conv_args) if variant['noise'] == "eps": es = GaussianAndEpislonStrategy( action_space=expl_env.action_space, epsilon=0.3, max_sigma=0.0, min_sigma=0.0, #constant sigma 0 decay_period=1000000) elif variant['noise'] == "gaussian": es = GaussianStrategy(action_space=expl_env.action_space, max_sigma=0.3, min_sigma=0.1, decay_period=1000000) else: print("unsupported param for --noise") assert False exploration_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) # eval_path_collector = MdpPathCollector( # eval_env, # policy, # ) expl_path_collector = GoalConditionedPathCollector( expl_env, exploration_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) eval_path_collector = GoalConditionedPathCollector( expl_env, exploration_policy.policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) trainer = TD3Trainer(policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, target_policy=target_policy, **variant['trainer_kwargs']) # trainer = HERTrainer(trainer) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=None, exploration_data_collector=expl_path_collector, evaluation_data_collector=None, replay_buffer=replay_buffer, **variant['algorithm_kwargs']) algorithm.to(ptu.device) algorithm.train()
def her_sac_experiment( max_path_length, qf_kwargs, twin_sac_trainer_kwargs, replay_buffer_kwargs, policy_kwargs, evaluation_goal_sampling_mode, exploration_goal_sampling_mode, algo_kwargs, save_video=True, env_id=None, env_class=None, env_kwargs=None, observation_key='state_observation', desired_goal_key='state_desired_goal', achieved_goal_key='state_achieved_goal', # Video parameters save_video_kwargs=None, exploration_policy_kwargs=None, **kwargs ): if exploration_policy_kwargs is None: exploration_policy_kwargs = {} import rlkit.samplers.rollout_functions as rf import rlkit.torch.pytorch_util as ptu from rlkit.data_management.obs_dict_replay_buffer import \ ObsDictRelabelingBuffer from rlkit.torch.networks import ConcatMlp from rlkit.torch.sac.policies import TanhGaussianPolicy from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm if not save_video_kwargs: save_video_kwargs = {} if env_kwargs is None: env_kwargs = {} assert env_id or env_class if env_id: import gym import multiworld multiworld.register_all_envs() train_env = gym.make(env_id) eval_env = gym.make(env_id) else: eval_env = env_class(**env_kwargs) train_env = env_class(**env_kwargs) obs_dim = ( train_env.observation_space.spaces[observation_key].low.size + train_env.observation_space.spaces[desired_goal_key].low.size ) action_dim = train_env.action_space.low.size qf1 = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, **qf_kwargs ) qf2 = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, **qf_kwargs ) target_qf1 = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, **qf_kwargs ) target_qf2 = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, **qf_kwargs ) policy = TanhGaussianPolicy( obs_dim=obs_dim, action_dim=action_dim, **policy_kwargs ) replay_buffer = ObsDictRelabelingBuffer( env=train_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **replay_buffer_kwargs ) trainer = SACTrainer( env=train_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **twin_sac_trainer_kwargs ) trainer = HERTrainer(trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, MakeDeterministic(policy), max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, goal_sampling_mode=evaluation_goal_sampling_mode, ) exploration_policy = create_exploration_policy( train_env, policy, **exploration_policy_kwargs) expl_path_collector = GoalConditionedPathCollector( train_env, exploration_policy, max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, goal_sampling_mode=exploration_goal_sampling_mode, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=train_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, max_path_length=max_path_length, **algo_kwargs ) algorithm.to(ptu.device) if save_video: rollout_function = rf.create_rollout_function( rf.multitask_rollout, max_path_length=max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, return_dict_obs=True, ) eval_video_func = get_save_video_function( rollout_function, eval_env, MakeDeterministic(policy), tag="eval", **save_video_kwargs ) train_video_func = get_save_video_function( rollout_function, train_env, exploration_policy, tag="expl", **save_video_kwargs ) # algorithm.post_train_funcs.append(plot_buffer_function( # save_video_period, 'state_achieved_goal')) # algorithm.post_train_funcs.append(plot_buffer_function( # save_video_period, 'state_desired_goal')) algorithm.post_train_funcs.append(eval_video_func) algorithm.post_train_funcs.append(train_video_func) algorithm.train()
def _disentangled_her_twin_sac_experiment_v2( max_path_length, encoder_kwargs, disentangled_qf_kwargs, qf_kwargs, twin_sac_trainer_kwargs, replay_buffer_kwargs, policy_kwargs, evaluation_goal_sampling_mode, exploration_goal_sampling_mode, algo_kwargs, save_video=True, env_id=None, env_class=None, env_kwargs=None, observation_key='state_observation', desired_goal_key='state_desired_goal', achieved_goal_key='state_achieved_goal', # Video parameters latent_dim=2, save_video_kwargs=None, **kwargs ): import rlkit.samplers.rollout_functions as rf import rlkit.torch.pytorch_util as ptu from rlkit.data_management.obs_dict_replay_buffer import \ ObsDictRelabelingBuffer from rlkit.torch.networks import ConcatMlp from rlkit.torch.sac.policies import TanhGaussianPolicy from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm if save_video_kwargs is None: save_video_kwargs = {} if env_kwargs is None: env_kwargs = {} assert env_id or env_class if env_id: import gym import multiworld multiworld.register_all_envs() train_env = gym.make(env_id) eval_env = gym.make(env_id) else: eval_env = env_class(**env_kwargs) train_env = env_class(**env_kwargs) obs_dim = train_env.observation_space.spaces[observation_key].low.size goal_dim = train_env.observation_space.spaces[desired_goal_key].low.size action_dim = train_env.action_space.low.size encoder = ConcatMlp( input_size=goal_dim, output_size=latent_dim, **encoder_kwargs ) qf1 = DisentangledMlpQf( encoder=encoder, preprocess_obs_dim=obs_dim, action_dim=action_dim, qf_kwargs=qf_kwargs, **disentangled_qf_kwargs ) qf2 = DisentangledMlpQf( encoder=encoder, preprocess_obs_dim=obs_dim, action_dim=action_dim, qf_kwargs=qf_kwargs, **disentangled_qf_kwargs ) target_qf1 = DisentangledMlpQf( encoder=Detach(encoder), preprocess_obs_dim=obs_dim, action_dim=action_dim, qf_kwargs=qf_kwargs, **disentangled_qf_kwargs ) target_qf2 = DisentangledMlpQf( encoder=Detach(encoder), preprocess_obs_dim=obs_dim, action_dim=action_dim, qf_kwargs=qf_kwargs, **disentangled_qf_kwargs ) policy = TanhGaussianPolicy( obs_dim=obs_dim + goal_dim, action_dim=action_dim, **policy_kwargs ) replay_buffer = ObsDictRelabelingBuffer( env=train_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **replay_buffer_kwargs ) sac_trainer = SACTrainer( env=train_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **twin_sac_trainer_kwargs ) trainer = HERTrainer(sac_trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, MakeDeterministic(policy), max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, goal_sampling_mode=evaluation_goal_sampling_mode, ) expl_path_collector = GoalConditionedPathCollector( train_env, policy, max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, goal_sampling_mode=exploration_goal_sampling_mode, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=train_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, max_path_length=max_path_length, **algo_kwargs, ) algorithm.to(ptu.device) if save_video: save_vf_heatmap = save_video_kwargs.get('save_vf_heatmap', True) def v_function(obs): action = policy.get_actions(obs) obs, action = ptu.from_numpy(obs), ptu.from_numpy(action) return qf1(obs, action, return_individual_q_vals=True) add_heatmap = partial(add_heatmap_imgs_to_o_dict, v_function=v_function) rollout_function = rf.create_rollout_function( rf.multitask_rollout, max_path_length=max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, full_o_postprocess_func=add_heatmap if save_vf_heatmap else None, ) img_keys = ['v_vals'] + [ 'v_vals_dim_{}'.format(dim) for dim in range(latent_dim) ] eval_video_func = get_save_video_function( rollout_function, eval_env, MakeDeterministic(policy), tag="eval", get_extra_imgs=partial(get_extra_imgs, img_keys=img_keys), **save_video_kwargs ) train_video_func = get_save_video_function( rollout_function, train_env, policy, tag="train", get_extra_imgs=partial(get_extra_imgs, img_keys=img_keys), **save_video_kwargs ) decoder = ConcatMlp( input_size=obs_dim, output_size=obs_dim, hidden_sizes=[128, 128], ) decoder.to(ptu.device) # algorithm.post_train_funcs.append(train_decoder(variant, encoder, decoder)) # algorithm.post_train_funcs.append(plot_encoder_function(variant, encoder)) # algorithm.post_train_funcs.append(plot_buffer_function( # save_video_period, 'state_achieved_goal')) # algorithm.post_train_funcs.append(plot_buffer_function( # save_video_period, 'state_desired_goal')) algorithm.post_train_funcs.append(eval_video_func) algorithm.post_train_funcs.append(train_video_func) algorithm.train()
def encoder_wrapped_td3bc_experiment(variant): representation_size = 128 output_classes = 20 model_class = variant.get('model_class', TimestepPredictionModel) model = model_class( representation_size, # decoder_output_activation=decoder_activation, output_classes=output_classes, **variant['model_kwargs'], ) # model = torch.nn.DataParallel(model) model_path = variant.get("model_path") # model = load_local_or_remote_file(model_path) state_dict = torch.load(model_path) model.load_state_dict(state_dict) model.to(ptu.device) model.eval() traj = np.load(variant.get("desired_trajectory"), allow_pickle=True)[0] goal_image = traj["observations"][-1]["image_observation"] goal_image = goal_image.reshape(1, 3, 500, 300).transpose([0, 1, 3, 2]) / 255.0 # goal_image = goal_image.reshape(1, 300, 500, 3).transpose([0, 3, 1, 2]) / 255.0 # BECAUSE RLBENCH DEMOS ARENT IMAGE_ENV WRAPPED # goal_image = goal_image[:, :, :240, 60:500] goal_image = goal_image[:, :, 60:, 60:500] goal_image_pt = ptu.from_numpy(goal_image) save_image(goal_image_pt.data.cpu(), 'demos/goal.png', nrow=1) goal_latent = model.encode(goal_image_pt).detach().cpu().numpy().flatten() initial_image = traj["observations"][0]["image_observation"] initial_image = initial_image.reshape(1, 3, 500, 300).transpose([0, 1, 3, 2]) / 255.0 # initial_image = initial_image.reshape(1, 300, 500, 3).transpose([0, 3, 1, 2]) / 255.0 # initial_image = initial_image[:, :, :240, 60:500] initial_image = initial_image[:, :, 60:, 60:500] initial_image_pt = ptu.from_numpy(initial_image) save_image(initial_image_pt.data.cpu(), 'demos/initial.png', nrow=1) initial_latent = model.encode(initial_image_pt).detach().cpu().numpy().flatten() # Move these to td3_bc and bc_v3 (or at least type for reward_params) reward_params = dict( goal_latent=goal_latent, initial_latent=initial_latent, type=variant["reward_params_type"], ) config_params = variant.get("config_params") env = variant['env_class'](**variant['env_kwargs']) env = ImageEnv(env, recompute_reward=False, transpose=True, image_length=450000, reward_type="image_distance", # init_camera=sawyer_pusher_camera_upright_v2, ) env = EncoderWrappedEnv( env, model, reward_params, config_params, **variant.get("encoder_wrapped_env_kwargs", dict()) ) expl_env = env # variant['env_class'](**variant['env_kwargs']) eval_env = env # variant['env_class'](**variant['env_kwargs']) observation_key = variant.get("observation_key", 'state_observation') # one of 'state_observation', 'latent_observation', 'concat_observation' desired_goal_key = 'latent_desired_goal' achieved_goal_key = desired_goal_key.replace("desired", "achieved") es = GaussianAndEpislonStrategy( action_space=expl_env.action_space, **variant["exploration_kwargs"], ) obs_dim = expl_env.observation_space.spaces[observation_key].low.size goal_dim = expl_env.observation_space.spaces[desired_goal_key].low.size action_dim = expl_env.action_space.low.size qf1 = ConcatMlp( input_size=obs_dim + goal_dim + action_dim, output_size=1, # output_activation=TorchMaxClamp(0.0), **variant['qf_kwargs'] ) qf2 = ConcatMlp( input_size=obs_dim + goal_dim + action_dim, output_size=1, # output_activation=TorchMaxClamp(0.0), **variant['qf_kwargs'] ) target_qf1 = ConcatMlp( input_size=obs_dim + goal_dim + action_dim, output_size=1, # output_activation=TorchMaxClamp(0.0), **variant['qf_kwargs'] ) target_qf2 = ConcatMlp( input_size=obs_dim + goal_dim + action_dim, output_size=1, # output_activation=TorchMaxClamp(0.0), **variant['qf_kwargs'] ) # Support for CNNPolicy based policy/target policy # Defaults to TanhMlpPolicy unless cnn_params is supplied in variant if 'cnn_params' in variant.keys(): imsize = 48 policy = CNNPolicy(input_width=imsize, input_height=imsize, output_size=action_dim, input_channels=3, **variant['cnn_params'], output_activation=torch.tanh, ) target_policy = CNNPolicy(input_width=imsize, input_height=imsize, output_size=action_dim, input_channels=3, **variant['cnn_params'], output_activation=torch.tanh, ) else: policy = TanhMlpPolicy( input_size=obs_dim + goal_dim, output_size=action_dim, **variant['policy_kwargs'] ) target_policy = TanhMlpPolicy( input_size=obs_dim + goal_dim, output_size=action_dim, **variant['policy_kwargs'] ) expl_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) replay_buffer = ObsDictRelabelingBuffer( env=eval_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs'] ) demo_train_buffer = ObsDictRelabelingBuffer( env=env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs'] ) demo_test_buffer = ObsDictRelabelingBuffer( env=env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs'] ) td3bc_trainer = TD3BCTrainer( env=env, policy=policy, qf1=qf1, qf2=qf2, replay_buffer=replay_buffer, demo_train_buffer=demo_train_buffer, demo_test_buffer=demo_test_buffer, target_qf1=target_qf1, target_qf2=target_qf2, target_policy=target_policy, **variant['trainer_kwargs'] ) trainer = HERTrainer(td3bc_trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = GoalConditionedPathCollector( expl_env, expl_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs'] ) if variant.get("save_video", True): video_func = VideoSaveFunction( env, variant, ) algorithm.post_train_funcs.append(video_func) algorithm.to(ptu.device) td3bc_trainer.load_demos() td3bc_trainer.pretrain_policy_with_bc() td3bc_trainer.pretrain_q_with_bc_data() algorithm.train()
def her_td3_experiment(variant): import gym import rlkit.torch.pytorch_util as ptu from rlkit.data_management.obs_dict_replay_buffer import ObsDictRelabelingBuffer from rlkit.exploration_strategies.base import \ PolicyWrappedWithExplorationStrategy from rlkit.exploration_strategies.gaussian_and_epislon import \ GaussianAndEpislonStrategy from rlkit.launchers.launcher_util import setup_logger from rlkit.samplers.data_collector import GoalConditionedPathCollector from rlkit.torch.her.her import HERTrainer from rlkit.torch.networks import ConcatMlp, TanhMlpPolicy from rlkit.torch.td3.td3 import TD3 from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm import rlkit.samplers.rollout_functions as rf from rlkit.torch.grill.launcher import get_state_experiment_video_save_function if 'env_id' in variant: eval_env = gym.make(variant['env_id']) expl_env = gym.make(variant['env_id']) else: eval_env_kwargs = variant.get('eval_env_kwargs', variant['env_kwargs']) eval_env = variant['env_class'](**eval_env_kwargs) expl_env = variant['env_class'](**variant['env_kwargs']) observation_key = 'state_observation' desired_goal_key = 'state_desired_goal' achieved_goal_key = desired_goal_key.replace("desired", "achieved") es = GaussianAndEpislonStrategy( action_space=expl_env.action_space, max_sigma=.2, min_sigma=.2, # constant sigma epsilon=.3, ) obs_dim = expl_env.observation_space.spaces['observation'].low.size goal_dim = expl_env.observation_space.spaces['desired_goal'].low.size action_dim = expl_env.action_space.low.size qf1 = ConcatMlp(input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs']) qf2 = ConcatMlp(input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs']) target_qf1 = ConcatMlp(input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs']) target_qf2 = ConcatMlp(input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs']) policy = TanhMlpPolicy(input_size=obs_dim + goal_dim, output_size=action_dim, **variant['policy_kwargs']) target_policy = TanhMlpPolicy(input_size=obs_dim + goal_dim, output_size=action_dim, **variant['policy_kwargs']) expl_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) replay_buffer = ObsDictRelabelingBuffer( env=eval_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs']) trainer = TD3(policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, target_policy=target_policy, **variant['trainer_kwargs']) trainer = HERTrainer(trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = GoalConditionedPathCollector( expl_env, expl_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs']) if variant.get("save_video", False): rollout_function = rf.create_rollout_function( rf.multitask_rollout, max_path_length=algorithm.max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, ) video_func = get_state_experiment_video_save_function( rollout_function, eval_env, policy, variant, ) algorithm.post_epoch_funcs.append(video_func) algorithm.to(ptu.device) algorithm.train()
def experiment(args, variant): #eval_env = gym.make('FetchReach-v1') #expl_env = gym.make('FetchReach-v1') core_env = sim.DeepBuilderEnv(args.session_name, args.act_dim, args.box_dim, args.max_num_boxes, args.height_field_dim) eval_env = stuff.NormalizedActions(core_env) expl_env = stuff.NormalizedActions(core_env) observation_key = 'observation' desired_goal_key = 'desired_goal' achieved_goal_key = desired_goal_key.replace("desired", "achieved") replay_buffer = ObsDictRelabelingBuffer( env=eval_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs']) obs_dim = eval_env.observation_space.spaces['observation'].low.size action_dim = eval_env.action_space.low.size goal_dim = eval_env.observation_space.spaces['desired_goal'].low.size qf1 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) qf2 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) target_qf1 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) target_qf2 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim, action_dim=action_dim, **variant['policy_kwargs']) eval_policy = MakeDeterministic(policy) trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['sac_trainer_kwargs']) trainer = HERTrainer(trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, eval_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = GoalConditionedPathCollector( expl_env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): eval_env = Reacher( path_length=variant['algo_kwargs']['max_path_length'], reward_type='threshold') expl_env = Reacher( path_length=variant['algo_kwargs']['max_path_length'], reward_type='threshold') observation_key = 'observation' # ground truth goals desired_goal_key = 'desired_goal' achieved_goal_key = 'achieved_goal' replay_buffer = ObsDictRelabelingBuffer( env=eval_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs'] ) # eval_env.observation_space.spaces[observation_key].low.size obs_dim = 128 action_dim = eval_env.action_space.low.size # eval_env.observation_space.spaces[desired_goal_key].low.size goal_dim = 128 qf1 = FlattenMlp( input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs'] ) qf2 = FlattenMlp( input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs'] ) target_qf1 = FlattenMlp( input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs'] ) target_qf2 = FlattenMlp( input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs'] ) policy = TanhGaussianPolicy( obs_dim=obs_dim + goal_dim, action_dim=action_dim, **variant['policy_kwargs'] ) """ qf1 = FlattenMlp( input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs'] ) qf2 = FlattenMlp( input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs'] ) target_qf1 = FlattenMlp( input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs'] ) target_qf2 = FlattenMlp( input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs'] ) policy = TanhGaussianPolicy( obs_dim=obs_dim + goal_dim, action_dim=action_dim, **variant['policy_kwargs'] ) """ eval_policy = MakeDeterministic(policy) trainer = SACTrainer( env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['sac_trainer_kwargs'] ) trainer = HERTrainer(trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, eval_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = GoalConditionedPathCollector( expl_env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs'] ) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): import gym from multiworld.envs.mujoco import register_custom_envs register_custom_envs() observation_key = 'state_observation' desired_goal_key = 'xy_desired_goal' expl_env = gym.make(variant['env_id']) eval_env = gym.make(variant['env_id']) achieved_goal_key = desired_goal_key.replace("desired", "achieved") replay_buffer = ObsDictRelabelingBuffer( env=eval_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs'] ) obs_dim = eval_env.observation_space.spaces['observation'].low.size action_dim = eval_env.action_space.low.size goal_dim = eval_env.observation_space.spaces['desired_goal'].low.size M = variant['layer_size'] qf1 = ConcatMlp( input_size=obs_dim + action_dim + goal_dim, output_size=1, hidden_sizes=[M, M], ) qf2 = ConcatMlp( input_size=obs_dim + action_dim + goal_dim, output_size=1, hidden_sizes=[M, M], ) target_qf1 = ConcatMlp( input_size=obs_dim + action_dim + goal_dim, output_size=1, hidden_sizes=[M, M], ) target_qf2 = ConcatMlp( input_size=obs_dim + action_dim + goal_dim, output_size=1, hidden_sizes=[M, M], ) policy = TanhGaussianPolicy( obs_dim=obs_dim + goal_dim, action_dim=action_dim, hidden_sizes=[M, M], ) eval_policy = MakeDeterministic(policy) eval_path_collector = GoalConditionedPathCollector( eval_env, eval_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) trainer = SACTrainer( env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs'] ) trainer = HERTrainer(trainer) if variant['collection_mode'] == 'online': expl_step_collector = GoalConditionedStepCollector( expl_env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchOnlineRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_step_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs'] ) else: expl_path_collector = GoalConditionedPathCollector( expl_env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs'] ) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): import multiworld multiworld.register_all_envs() eval_env = gym.make('SawyerPushXYZEnv-v0') expl_env = gym.make('SawyerPushXYZEnv-v0') observation_key = 'state_observation' desired_goal_key = 'state_desired_goal' achieved_goal_key = desired_goal_key.replace("desired", "achieved") es = GaussianAndEpislonStrategy( action_space=expl_env.action_space, max_sigma=.2, min_sigma=.2, # constant sigma epsilon=.3, ) obs_dim = expl_env.observation_space.spaces['observation'].low.size goal_dim = expl_env.observation_space.spaces['desired_goal'].low.size action_dim = expl_env.action_space.low.size qf1 = FlattenMlp( input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs'] ) qf2 = FlattenMlp( input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs'] ) target_qf1 = FlattenMlp( input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs'] ) target_qf2 = FlattenMlp( input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs'] ) policy = TanhMlpPolicy( input_size=obs_dim + goal_dim, output_size=action_dim, **variant['policy_kwargs'] ) target_policy = TanhMlpPolicy( input_size=obs_dim + goal_dim, output_size=action_dim, **variant['policy_kwargs'] ) expl_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) replay_buffer = ObsDictRelabelingBuffer( env=eval_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs'] ) trainer = TD3Trainer( policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, target_policy=target_policy, **variant['trainer_kwargs'] ) trainer = HERTrainer(trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = GoalConditionedPathCollector( expl_env, expl_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs'] ) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): env = gym.make('RLkitGoalUR-v0')._start_ros_services() eval_env = gym.make('RLkitGoalUR-v0') expl_env = gym.make('RLkitGoalUR-v0') observation_key = 'observation' desired_goal_key = 'desired_goal' achieved_goal_key = desired_goal_key.replace("desired", "achieved") replay_buffer = ObsDictRelabelingBuffer( env=eval_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs']) obs_dim = eval_env.observation_space.spaces['observation'].low.size action_dim = eval_env.action_space.low.size goal_dim = eval_env.observation_space.spaces['desired_goal'].low.size qf1 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) qf2 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) target_qf1 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) target_qf2 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim, output_size=1, **variant['qf_kwargs']) policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim, action_dim=action_dim, **variant['policy_kwargs']) eval_policy = MakeDeterministic(policy) trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['sac_trainer_kwargs']) trainer = HERTrainer(trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, eval_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = GoalConditionedPathCollector( expl_env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs']) algorithm.to(ptu.device) algorithm.train()
def state_td3bc_experiment(variant): if variant.get('env_id', None): import gym import multiworld multiworld.register_all_envs() eval_env = gym.make(variant['env_id']) expl_env = gym.make(variant['env_id']) else: eval_env_kwargs = variant.get('eval_env_kwargs', variant['env_kwargs']) eval_env = variant['env_class'](**eval_env_kwargs) expl_env = variant['env_class'](**variant['env_kwargs']) observation_key = 'state_observation' desired_goal_key = 'state_desired_goal' achieved_goal_key = desired_goal_key.replace("desired", "achieved") es_strat = variant.get('es', 'ou') if es_strat == 'ou': es = OUStrategy( action_space=expl_env.action_space, max_sigma=variant['exploration_noise'], min_sigma=variant['exploration_noise'], ) elif es_strat == 'gauss_eps': es = GaussianAndEpislonStrategy( action_space=expl_env.action_space, max_sigma=.2, min_sigma=.2, # constant sigma epsilon=.3, ) else: raise ValueError("invalid exploration strategy provided") obs_dim = expl_env.observation_space.spaces['observation'].low.size goal_dim = expl_env.observation_space.spaces['desired_goal'].low.size action_dim = expl_env.action_space.low.size qf1 = ConcatMlp(input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs']) qf2 = ConcatMlp(input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs']) target_qf1 = ConcatMlp(input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs']) target_qf2 = ConcatMlp(input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs']) policy = TanhMlpPolicy(input_size=obs_dim + goal_dim, output_size=action_dim, **variant['policy_kwargs']) target_policy = TanhMlpPolicy(input_size=obs_dim + goal_dim, output_size=action_dim, **variant['policy_kwargs']) expl_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) replay_buffer = ObsDictRelabelingBuffer( env=eval_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs']) demo_train_buffer = ObsDictRelabelingBuffer( env=eval_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, max_size=variant['replay_buffer_kwargs']['max_size']) demo_test_buffer = ObsDictRelabelingBuffer( env=eval_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, max_size=variant['replay_buffer_kwargs']['max_size'], ) if variant.get('td3_bc', True): td3_trainer = TD3BCTrainer(env=expl_env, policy=policy, qf1=qf1, qf2=qf2, replay_buffer=replay_buffer, demo_train_buffer=demo_train_buffer, demo_test_buffer=demo_test_buffer, target_qf1=target_qf1, target_qf2=target_qf2, target_policy=target_policy, **variant['td3_bc_trainer_kwargs']) else: td3_trainer = TD3(policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, target_policy=target_policy, **variant['td3_trainer_kwargs']) trainer = HERTrainer(td3_trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = GoalConditionedPathCollector( expl_env, expl_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs']) if variant.get("save_video", True): if variant.get("presampled_goals", None): variant['image_env_kwargs'][ 'presampled_goals'] = load_local_or_remote_file( variant['presampled_goals']).item() image_eval_env = ImageEnv(eval_env, **variant["image_env_kwargs"]) image_eval_path_collector = GoalConditionedPathCollector( image_eval_env, policy, observation_key='state_observation', desired_goal_key='state_desired_goal', ) image_expl_env = ImageEnv(expl_env, **variant["image_env_kwargs"]) image_expl_path_collector = GoalConditionedPathCollector( image_expl_env, expl_policy, observation_key='state_observation', desired_goal_key='state_desired_goal', ) video_func = VideoSaveFunction( image_eval_env, variant, image_expl_path_collector, image_eval_path_collector, ) algorithm.post_train_funcs.append(video_func) algorithm.to(ptu.device) if variant.get('load_demos', False): td3_trainer.load_demos() if variant.get('pretrain_policy', False): td3_trainer.pretrain_policy_with_bc() if variant.get('pretrain_rl', False): td3_trainer.pretrain_q_with_bc_data() algorithm.train()