def get_exploration_strategy(variant, env):
    from railrl.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from railrl.exploration_strategies.gaussian_strategy import GaussianStrategy
    from railrl.exploration_strategies.ou_strategy import OUStrategy

    exploration_type = variant.get('exploration_type', 'epsilon')
    exploration_noise = variant.get('exploration_noise', 0.1)
    if exploration_type == 'ou':
        es = OUStrategy(
            action_space=env.action_space,
            max_sigma=exploration_noise,
            min_sigma=exploration_noise,  # Constant sigma
        )
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=exploration_noise,
            min_sigma=exploration_noise,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=exploration_noise,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    return es
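# A minimal usage sketch for get_exploration_strategy above, assuming a
# standard continuous-control gym environment. The env id and noise value
# are illustrative placeholders, not taken from any launcher in this file.
import gym

env = gym.make('Pendulum-v0')
variant = dict(
    exploration_type='gaussian',  # one of 'ou', 'gaussian', 'epsilon'
    exploration_noise=0.1,        # used as a constant sigma / epsilon
)
es = get_exploration_strategy(variant, env)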
def experiment(variant):
    env = gym.make(variant['env_id'])
    env = NormalizedBoxEnv(env)
    es = GaussianStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[128, 128],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[128, 128],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    if variant['multitask']:
        env = MultitaskFullVAEPoint2DEnv(
            **variant['env_kwargs'])  # used point2d-conv-sweep/run1/id4
        env = MultitaskToFlatEnv(env)
    # else:
    #     env = Pusher2DEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if variant["use_gpu"]:
        gpu_id = variant["gpu_id"]
        ptu.set_gpu_mode(True)
        ptu.set_device(gpu_id)
        algorithm.to(ptu.device)
        env._wrapped_env.vae.to(ptu.device)
    algorithm.train()
def experiment(variant):
    # if variant['multitask']:
    #     env = MultitaskPoint2DEnv(**variant['env_kwargs'])
    #     env = MultitaskToFlatEnv(env)
    # else:
    #     env = Pusher2DEnv(**variant['env_kwargs'])
    env_name = variant["env_name"]
    env = gym.make(env_name)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()
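# A hedged example variant for the gym-based TD3 experiment above. The env
# id and hyperparameter values are illustrative placeholders; only the key
# names come from the function itself.
example_variant = dict(
    env_name='Pendulum-v0',
    normalize=True,
    exploration_type='ou',
    algo_kwargs=dict(
        num_epochs=100,       # assumed TD3 kwargs; see the algo_kwargs
        max_path_length=200,  # built explicitly in a later launcher
        batch_size=128,
    ),
)
# experiment(example_variant)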
def experiment(variant):
    imsize = variant['imsize']
    history = variant['history']

    env = gym.make(variant['env_id'])
    env = NormalizedBoxEnv(
        ImageEnv(env,
                 imsize=imsize,
                 keep_prev=history - 1,
                 init_viewer=variant['init_viewer']))
    es = GaussianStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = MergedCNN(
        input_width=imsize,
        input_height=imsize,
        output_size=1,
        input_channels=history,
        added_fc_input_size=action_dim,
        **variant['cnn_params']
    )
    qf2 = MergedCNN(
        input_width=imsize,
        input_height=imsize,
        output_size=1,
        input_channels=history,
        added_fc_input_size=action_dim,
        **variant['cnn_params']
    )
    policy = CNNPolicy(
        input_width=imsize,
        input_height=imsize,
        output_size=action_dim,
        input_channels=history,
        **variant['cnn_params'],
        output_activation=torch.tanh,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        policy_and_target_update_period=15,
        policy_learning_rate=1e-5,
        **variant['algo_kwargs']
    )
    # algorithm = DDPG(
    #     env,
    #     qf=qf1,
    #     policy=policy,
    #     # qf_weight_decay=.01,
    #     exploration_policy=exploration_policy,
    #     **variant['algo_kwargs']
    # )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_dim
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = variant['replay_buffer_class'](
        env=env,
        **variant['replay_buffer_kwargs']
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def her_td3_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if 'history_len' in variant:
        history_len = variant['history_len']
        env = MultiTaskHistoryEnv(env, history_len=history_len)
    if variant.get('make_silent_env', True):
        env = MultitaskEnvToSilentMultitaskEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = variant['replay_buffer_class'](
        env=env,
        **variant['replay_buffer_kwargs']
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def tdm_td3_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    tdm_normalizer = None
    qf1 = TdmQf(
        env=env,
        vectorized=True,
        tdm_normalizer=tdm_normalizer,
        **variant['qf_kwargs']
    )
    qf2 = TdmQf(
        env=env,
        vectorized=True,
        tdm_normalizer=tdm_normalizer,
        **variant['qf_kwargs']
    )
    policy = TdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        **variant['policy_kwargs']
    )
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = variant['replay_buffer_class'](
        env=env,
        **variant['replay_buffer_kwargs']
    )
    qf_criterion = variant['qf_criterion_class']()
    algo_kwargs = variant['algo_kwargs']
    algo_kwargs['td3_kwargs']['qf_criterion'] = qf_criterion
    algo_kwargs['tdm_kwargs']['tdm_normalizer'] = tdm_normalizer
    algorithm = TdmTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **algo_kwargs
    )
    algorithm.to(ptu.device)
    algorithm.train()
def td3_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    env = MultitaskToFlatEnv(env)
    if variant.get('make_silent_env', True):
        env = MultitaskEnvToSilentMultitaskEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    # max_speed no longer has any effect here; the speed limit is now
    # handled inside request_angle_action in sawyer_env_base.
    env = SawyerHumanControlEnv(
        action_mode='joint_space_impd',
        position_action_scale=1,
        max_speed=0.015,
    )
    training_env = env
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    es = GaussianStrategy(
        action_space=env.action_space,
        **variant['es_kwargs'],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3BC(
        env=env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def get_exploration_strategy(variant, env):
    from railrl.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from railrl.exploration_strategies.gaussian_strategy import GaussianStrategy
    from railrl.exploration_strategies.gaussian_and_epislon import \
        GaussianAndEpislonStrategy
    from railrl.exploration_strategies.ou_strategy import OUStrategy
    from railrl.exploration_strategies.noop import NoopStrategy

    exploration_type = variant['exploration_type']
    # exploration_noise = variant.get('exploration_noise', 0.1)
    es_kwargs = variant.get('es_kwargs', {})
    if exploration_type == 'ou':
        es = OUStrategy(
            action_space=env.action_space,
            # max_sigma=exploration_noise,
            # min_sigma=exploration_noise,  # Constant sigma
            **es_kwargs
        )
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            # max_sigma=exploration_noise,
            # min_sigma=exploration_noise,  # Constant sigma
            **es_kwargs
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            # prob_random_action=exploration_noise,
            **es_kwargs
        )
    elif exploration_type == 'gaussian_and_epsilon':
        es = GaussianAndEpislonStrategy(
            action_space=env.action_space,
            # max_sigma=exploration_noise,
            # min_sigma=exploration_noise,  # Constant sigma
            # epsilon=exploration_noise,
            **es_kwargs
        )
    elif exploration_type == 'noop':
        es = NoopStrategy(action_space=env.action_space)
    else:
        raise Exception("Invalid type: " + exploration_type)
    return es
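# A usage sketch for the extended get_exploration_strategy above, again with
# an illustrative gym environment. The es_kwargs names mirror the commented
# arguments in the function (max_sigma, min_sigma, epsilon) and are assumed
# to match GaussianAndEpislonStrategy's signature.
import gym

env = gym.make('Pendulum-v0')
variant = dict(
    exploration_type='gaussian_and_epsilon',
    es_kwargs=dict(max_sigma=0.1, min_sigma=0.1, epsilon=0.1),
)
es = get_exploration_strategy(variant, env)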
def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())
    es = GaussianStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
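# A hedged sketch of a variant for the TD3 experiment above. Every value is
# an illustrative placeholder; only the key names come from the function. A
# zero-argument callable stands in for 'env_class' so the sketch runs without
# any project-specific environment; the algo_kwargs names are assumed from
# the algo_kwargs dict built explicitly in a later launcher in this file.
import gym

example_td3_variant = dict(
    env_class=lambda: gym.make('Pendulum-v0'),  # placeholder env factory
    es_kwargs=dict(max_sigma=0.1, min_sigma=0.1),
    qf_kwargs=dict(hidden_sizes=[400, 300]),
    policy_kwargs=dict(hidden_sizes=[400, 300]),
    algo_kwargs=dict(num_epochs=100, max_path_length=200, batch_size=128),
)
# experiment(example_td3_variant)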
def her_td3_experiment(variant):
    import gym

    import multiworld.envs.mujoco
    import multiworld.envs.pygame
    import railrl.samplers.rollout_functions as rf
    import railrl.torch.pytorch_util as ptu
    from railrl.exploration_strategies.base import (
        PolicyWrappedWithExplorationStrategy,
    )
    from railrl.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from railrl.exploration_strategies.gaussian_strategy import GaussianStrategy
    from railrl.exploration_strategies.ou_strategy import OUStrategy
    from railrl.torch.grill.launcher import get_video_save_func
    from railrl.demos.her_td3bc import HerTD3BC
    from railrl.torch.networks import FlattenMlp, TanhMlpPolicy
    from railrl.data_management.obs_dict_replay_buffer import (
        ObsDictRelabelingBuffer,
    )

    if 'env_id' in variant:
        env = gym.make(variant['env_id'])
    else:
        env = variant['env_class'](**variant['env_kwargs'])

    observation_key = variant['observation_key']
    desired_goal_key = variant['desired_goal_key']
    variant['algo_kwargs']['her_kwargs']['observation_key'] = observation_key
    variant['algo_kwargs']['her_kwargs']['desired_goal_key'] = desired_goal_key
    if variant.get('normalize', False):
        raise NotImplementedError()

    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs']
    )
    demo_train_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs']
    )
    demo_test_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs']
    )

    obs_dim = env.observation_space.spaces['observation'].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = HerTD3BC(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        demo_train_buffer=demo_train_buffer,
        demo_test_buffer=demo_test_buffer,
        replay_buffer=replay_buffer,
        demo_path=variant["demo_path"],
        **variant['algo_kwargs']
    )
    if variant.get("save_video", False):
        rollout_function = rf.create_rollout_function(
            rf.multitask_rollout,
            max_path_length=algorithm.max_path_length,
            observation_key=algorithm.observation_key,
            desired_goal_key=algorithm.desired_goal_key,
        )
        video_func = get_video_save_func(
            rollout_function,
            env,
            policy,
            variant,
        )
        algorithm.post_epoch_funcs.append(video_func)
    algorithm.to(ptu.device)
    algorithm.train()
def her_td3_experiment(variant):
    import gym  # needed for gym.make below

    import multiworld.envs.mujoco
    import multiworld.envs.pygame
    import railrl.samplers.rollout_functions as rf
    import railrl.torch.pytorch_util as ptu
    from railrl.exploration_strategies.base import (
        PolicyWrappedWithExplorationStrategy,
    )
    from railrl.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from railrl.exploration_strategies.gaussian_strategy import GaussianStrategy
    from railrl.exploration_strategies.ou_strategy import OUStrategy
    from railrl.torch.grill.launcher import get_video_save_func
    from railrl.torch.her.her_td3 import HerTd3
    from railrl.data_management.obs_dict_replay_buffer import (
        ObsDictRelabelingBuffer,
    )

    if 'env_id' in variant:
        env = gym.make(variant['env_id'])
    else:
        env = variant['env_class'](**variant['env_kwargs'])

    imsize = 84
    env = MujocoGymToMultiEnv(env.env)  # unwrap TimeLimit
    env = ImageEnv(
        env,
        non_presampled_goal_img_is_garbage=True,
        recompute_reward=False,
    )

    observation_key = variant['observation_key']
    desired_goal_key = variant['desired_goal_key']
    variant['algo_kwargs']['her_kwargs']['observation_key'] = observation_key
    variant['algo_kwargs']['her_kwargs']['desired_goal_key'] = desired_goal_key
    if variant.get('normalize', False):
        raise NotImplementedError()

    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs']
    )
    obs_dim = env.observation_space.spaces[observation_key].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces[desired_goal_key].low.size
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    else:
        raise Exception("Invalid type: " + exploration_type)

    use_images_for_q = variant["use_images_for_q"]
    use_images_for_pi = variant["use_images_for_pi"]
    qs = []
    for i in range(2):
        if use_images_for_q:
            image_q = MergedCNN(
                input_width=imsize,
                input_height=imsize,
                output_size=1,
                input_channels=3,
                added_fc_input_size=action_dim,
                **variant['cnn_params']
            )
            q = ImageStateQ(image_q, None)
        else:
            state_q = FlattenMlp(
                input_size=action_dim + goal_dim,
                output_size=1,
                **variant['qf_kwargs']
            )
            q = ImageStateQ(None, state_q)
        qs.append(q)
    qf1, qf2 = qs
    if use_images_for_pi:
        image_policy = CNNPolicy(
            input_width=imsize,
            input_height=imsize,
            output_size=action_dim,
            input_channels=3,
            **variant['cnn_params'],
            output_activation=torch.tanh,
        )
        policy = ImageStatePolicy(image_policy, None)
    else:
        state_policy = TanhMlpPolicy(
            input_size=goal_dim,
            output_size=action_dim,
            **variant['policy_kwargs']
        )
        policy = ImageStatePolicy(None, state_policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    if variant.get("save_video", False):
        rollout_function = rf.create_rollout_function(
            rf.multitask_rollout,
            max_path_length=algorithm.max_path_length,
            observation_key=algorithm.observation_key,
            desired_goal_key=algorithm.desired_goal_key,
        )
        video_func = get_video_save_func(
            rollout_function,
            env,
            policy,
            variant,
        )
        algorithm.post_epoch_funcs.append(video_func)
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    variant['algo_kwargs'] = dict(
        num_epochs=variant['num_epochs'],
        num_steps_per_epoch=variant['num_steps_per_epoch'],
        num_steps_per_eval=variant['num_steps_per_eval'],
        max_path_length=variant['max_path_length'],
        min_num_steps_before_training=variant['min_num_steps_before_training'],
        batch_size=variant['batch_size'],
        discount=variant['discount'],
        replay_buffer_size=variant['replay_buffer_size'],
        reward_scale=variant['reward_scale'],
    )

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
        # **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
        # **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[M, M],
        # **variant['policy_kwargs']
    )
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        qf1.to(ptu.device)
        qf2.to(ptu.device)
        policy.to(ptu.device)
        algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    vectorized = variant['sac_tdm_kwargs']['tdm_kwargs']['vectorized']
    env = NormalizedBoxEnv(variant['env_class'](**variant['env_kwargs']))
    max_tau = variant['sac_tdm_kwargs']['tdm_kwargs']['max_tau']
    qf = TdmQf(env, vectorized=vectorized, **variant['qf_kwargs'])
    tdm_normalizer = TdmNormalizer(
        env,
        vectorized,
        max_tau=max_tau,
        **variant['tdm_normalizer_kwargs']
    )
    implicit_model = TdmToImplicitModel(
        env,
        qf,
        tau=0,
    )
    vf = TdmVf(
        env=env,
        vectorized=vectorized,
        tdm_normalizer=tdm_normalizer,
        **variant['vf_kwargs']
    )
    policy = StochasticTdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        **variant['policy_kwargs']
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_kwargs']
    )
    goal_slice = env.ob_to_goal_slice
    lbfgs_mpc_controller = TdmLBfgsBCMC(
        implicit_model,
        env,
        goal_slice=goal_slice,
        multitask_goal_slice=goal_slice,
        **variant['mpc_controller_kwargs']
    )
    state_only_mpc_controller = TdmLBfgsBStateOnlyCMC(
        vf,
        policy,
        env,
        goal_slice=goal_slice,
        multitask_goal_slice=goal_slice,
        **variant['state_only_mpc_controller_kwargs']
    )
    es = GaussianStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    if variant['explore_with'] == 'TdmLBfgsBCMC':
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=lbfgs_mpc_controller,
        )
        variant['sac_tdm_kwargs']['base_kwargs']['exploration_policy'] = (
            exploration_policy
        )
    elif variant['explore_with'] == 'TdmLBfgsBStateOnlyCMC':
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=state_only_mpc_controller,
        )
        variant['sac_tdm_kwargs']['base_kwargs']['exploration_policy'] = (
            exploration_policy
        )
    if variant['eval_with'] == 'TdmLBfgsBCMC':
        variant['sac_tdm_kwargs']['base_kwargs']['eval_policy'] = (
            lbfgs_mpc_controller
        )
    elif variant['eval_with'] == 'TdmLBfgsBStateOnlyCMC':
        variant['sac_tdm_kwargs']['base_kwargs']['eval_policy'] = (
            state_only_mpc_controller
        )
    algorithm = TdmSac(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        replay_buffer=replay_buffer,
        **variant['sac_tdm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant): rdim = variant["rdim"] use_env_goals = variant["use_env_goals"] vae_path = variant["vae_paths"][str(rdim)] render = variant["render"] wrap_mujoco_env = variant.get("wrap_mujoco_env", False) # vae = torch.load(vae_path) # print("loaded", vae_path) from railrl.envs.wrappers import ImageMujocoEnv, NormalizedBoxEnv from railrl.images.camera import sawyer_init_camera env = variant["env"](**variant['env_kwargs']) env = NormalizedBoxEnv(ImageMujocoEnv( env, imsize=84, keep_prev=0, init_camera=sawyer_init_camera, )) if wrap_mujoco_env: env = ImageMujocoEnv(env, 84, camera_name="topview", transpose=True, normalize=True) if use_env_goals: track_qpos_goal = variant.get("track_qpos_goal", 0) env = VAEWrappedImageGoalEnv(env, vae_path, use_vae_obs=True, use_vae_reward=True, use_vae_goals=True, render_goals=render, render_rollouts=render, track_qpos_goal=track_qpos_goal) else: env = VAEWrappedEnv(env, vae_path, use_vae_obs=True, use_vae_reward=True, use_vae_goals=True, render_goals=render, render_rollouts=render) env = MultitaskToFlatEnv(env) if variant['normalize']: env = NormalizedBoxEnv(env) exploration_type = variant['exploration_type'] if exploration_type == 'ou': es = OUStrategy(action_space=env.action_space) elif exploration_type == 'gaussian': es = GaussianStrategy( action_space=env.action_space, max_sigma=0.1, min_sigma=0.1, # Constant sigma ) elif exploration_type == 'epsilon': es = EpsilonGreedy( action_space=env.action_space, prob_random_action=0.1, ) else: raise Exception("Invalid type: " + exploration_type) obs_dim = env.observation_space.low.size action_dim = env.action_space.low.size qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[400, 300], ) qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[400, 300], ) policy = TanhMlpPolicy( input_size=obs_dim, output_size=action_dim, hidden_sizes=[400, 300], ) exploration_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) algorithm = TD3( env, training_env=env, qf1=qf1, qf2=qf2, policy=policy, exploration_policy=exploration_policy, **variant['algo_kwargs'] ) algorithm.to(ptu.device) env._wrapped_env.vae.to(ptu.device)
def experiment(variant):
    feat_points = 16
    history = 1
    latent_obs_dim = feat_points * 2 * history
    imsize = 64
    downsampled_size = 32

    env = SawyerXYZEnv()
    extra_fc_size = env.obs_dim
    env = ImageMujocoWithObsEnv(
        env,
        imsize=imsize,
        normalize=True,
        grayscale=True,
        keep_prev=history - 1,
        init_camera=camera.sawyer_init_camera,
    )
    # env = ImageMujocoEnv(env,
    #                      imsize=imsize,
    #                      keep_prev=history - 1,
    #                      init_camera=camera.sawyer_init_camera)
    es = GaussianStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    ae = FeatPointMlp(
        input_size=imsize,
        downsample_size=downsampled_size,
        input_channels=1,
        num_feat_points=feat_points,
    )
    replay_buffer = AEEnvReplayBuffer(
        int(1e4),
        env,
        imsize=imsize,
        history_length=history,
        downsampled_size=downsampled_size,
    )
    qf = FlattenMlp(
        input_size=latent_obs_dim + extra_fc_size + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = AETanhPolicy(
        input_size=latent_obs_dim + extra_fc_size,
        ae=ae,
        env=env,
        history_length=history,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = FeatPointDDPG(
        ae,
        history,
        env=env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        extra_fc_size=extra_fc_size,
        imsize=imsize,
        downsampled_size=downsampled_size,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def td3_experiment(variant):
    import gym

    import multiworld.envs.mujoco
    import multiworld.envs.pygame
    import railrl.samplers.rollout_functions as rf
    import railrl.torch.pytorch_util as ptu
    from railrl.exploration_strategies.base import (
        PolicyWrappedWithExplorationStrategy,
    )
    from railrl.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from railrl.exploration_strategies.gaussian_strategy import GaussianStrategy
    from railrl.exploration_strategies.ou_strategy import OUStrategy
    from railrl.torch.grill.launcher import \
        get_state_experiment_video_save_function
    from railrl.torch.her.her_td3 import HerTd3
    from railrl.torch.td3.td3 import TD3
    from railrl.torch.networks import FlattenMlp, TanhMlpPolicy
    from railrl.data_management.obs_dict_replay_buffer import (
        ObsDictReplayBuffer,
    )
    from railrl.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    from railrl.samplers.data_collector.path_collector import (
        ObsDictPathCollector,
    )

    if 'env_id' in variant:
        eval_env = gym.make(variant['env_id'])
        expl_env = gym.make(variant['env_id'])
    else:
        eval_env_kwargs = variant.get('eval_env_kwargs', variant['env_kwargs'])
        eval_env = variant['env_class'](**eval_env_kwargs)
        expl_env = variant['env_class'](**variant['env_kwargs'])

    observation_key = variant['observation_key']
    # desired_goal_key = variant['desired_goal_key']
    # variant['algo_kwargs']['her_kwargs']['observation_key'] = observation_key
    # variant['algo_kwargs']['her_kwargs']['desired_goal_key'] = desired_goal_key
    if variant.get('normalize', False):
        raise NotImplementedError()

    # achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    replay_buffer = ObsDictReplayBuffer(
        env=eval_env,
        observation_key=observation_key,
        # desired_goal_key=desired_goal_key,
        # achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs']
    )
    obs_dim = eval_env.observation_space.spaces['observation'].low.size
    action_dim = eval_env.action_space.low.size
    goal_dim = eval_env.observation_space.spaces['desired_goal'].low.size
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(
            action_space=eval_env.action_space,
            **variant['es_kwargs']
        )
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=eval_env.action_space,
            **variant['es_kwargs'],
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=eval_env.action_space,
            **variant['es_kwargs'],
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    expl_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    trainer = TD3(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    observation_key = 'observation'
    desired_goal_key = 'desired_goal'
    eval_path_collector = ObsDictPathCollector(
        eval_env,
        policy,
        observation_key=observation_key,
        # render=True,
        # desired_goal_key=desired_goal_key,
    )
    expl_path_collector = ObsDictPathCollector(
        expl_env,
        expl_policy,
        observation_key=observation_key,
        # render=True,
        # desired_goal_key=desired_goal_key,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    # if variant.get("save_video", False):
    #     rollout_function = rf.create_rollout_function(
    #         rf.multitask_rollout,
    #         max_path_length=algorithm.max_path_length,
    #         observation_key=observation_key,
    #         desired_goal_key=algorithm.desired_goal_key,
    #     )
    #     video_func = get_state_experiment_video_save_function(
    #         rollout_function,
    #         env,
    #         policy,
    #         variant,
    #     )
    #     algorithm.post_epoch_funcs.append(video_func)
    algorithm.to(ptu.device)
    algorithm.train()
def her_td3_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    observation_key = variant.get('observation_key', 'observation')
    desired_goal_key = variant.get('desired_goal_key', 'desired_goal')
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['replay_buffer_kwargs']
    )
    obs_dim = env.observation_space.spaces['observation'].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            **variant['es_kwargs']
        )
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
            **variant['es_kwargs'],
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
            **variant['es_kwargs'],
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        qf1.to(ptu.device)
        qf2.to(ptu.device)
        policy.to(ptu.device)
        algorithm.to(ptu.device)
    algorithm.train()
def her_td3_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])

    her_kwargs = variant['algo_kwargs']['her_kwargs']
    observation_key = her_kwargs['observation_key']
    desired_goal_key = her_kwargs['desired_goal_key']
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs']
    )
    obs_dim = env.observation_space.spaces['observation'].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(
            action_space=env.action_space,
            **variant['es_kwargs']
        )
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    num_ensemble_qs = variant.get("num_ensemble_qs", 0)
    ensemble_qs = [
        FlattenMlp(
            input_size=obs_dim + action_dim + goal_dim,
            output_size=1,
            **variant['qf_kwargs']
        )
        for _ in range(num_ensemble_qs)
    ]
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    render = variant.get("render", False)
    algorithm = HerExplorationTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        render=render,
        render_during_eval=render,
        ensemble_qs=ensemble_qs,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def her_td3_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])

    observation_key = variant['observation_key']
    desired_goal_key = variant['desired_goal_key']
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    variant['algo_kwargs']['her_kwargs']['observation_key'] = observation_key
    variant['algo_kwargs']['her_kwargs']['desired_goal_key'] = desired_goal_key
    replay_buffer = variant['replay_buffer_class'](
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs']
    )
    variant['count_based_sampler_kwargs']['replay_buffer'] = replay_buffer
    env = CountBasedGoalSamplingEnv(
        wrapped_env=env,
        **variant['count_based_sampler_kwargs']
    )

    obs_dim = env.observation_space.spaces['observation'].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(
            action_space=env.action_space,
            **variant['es_kwargs']
        )
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        training_env=env,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    if variant.get("save_video", False):
        rollout_function = rf.create_rollout_function(
            rf.multitask_rollout,
            max_path_length=algorithm.max_path_length,
            observation_key=algorithm.observation_key,
            desired_goal_key=algorithm.desired_goal_key,
        )
        video_func = get_video_save_func(
            rollout_function,
            env,
            policy,
            variant,
        )
        algorithm.post_epoch_funcs.append(video_func)
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    rdim = variant["rdim"]
    vae_paths = {
        2: "/home/ashvin/data/s3doodad/ashvin/vae/new-pusher2d/run2/id0/params.pkl",
        4: "/home/ashvin/data/s3doodad/ashvin/vae/new-pusher2d/run2/id1/params.pkl",
        8: "/home/ashvin/data/s3doodad/ashvin/vae/new-pusher2d/run2/id2/params.pkl",
        16: "/home/ashvin/data/s3doodad/ashvin/vae/new-pusher2d/run2/id3/params.pkl",
    }
    vae_path = vae_paths[rdim]
    vae = torch.load(vae_path)
    print("loaded", vae_path)

    if variant['multitask']:
        env = FullPusher2DEnv(**variant["env_kwargs"])
        env = ImageMujocoEnv(
            env,
            84,
            camera_name="topview",
            transpose=True,
            normalize=True,
        )
        env = VAEWrappedImageGoalEnv(
            env,
            vae,
            use_vae_obs=True,
            use_vae_reward=True,
            use_vae_goals=True,
            render_goals=True,
            render_rollouts=True,
            track_qpos_goal=5,
        )
        env = MultitaskToFlatEnv(env)
    # else:
    #     env = Pusher2DEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        training_env=env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    print("use_gpu", variant["use_gpu"], bool(variant["use_gpu"]))
    if variant["use_gpu"]:
        gpu_id = variant["gpu_id"]
        ptu.set_gpu_mode(True)
        ptu.set_device(gpu_id)
        algorithm.to(ptu.device)
        env._wrapped_env.vae.to(ptu.device)
    algorithm.train()
def grill_her_td3_experiment(variant):
    env = variant["env_class"](**variant['env_kwargs'])

    render = variant["render"]

    rdim = variant["rdim"]
    vae_path = variant["vae_paths"][str(rdim)]
    reward_params = variant.get("reward_params", dict())

    init_camera = variant.get("init_camera", None)
    if init_camera is None:
        camera_name = "topview"
    else:
        camera_name = None
    env = ImageEnv(
        env,
        84,
        init_camera=init_camera,
        camera_name=camera_name,
        transpose=True,
        normalize=True,
    )
    env = VAEWrappedEnv(
        env,
        vae_path,
        decode_goals=render,
        render_goals=render,
        render_rollouts=render,
        reward_params=reward_params,
        **variant.get('vae_wrapped_env_kwargs', {})
    )
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    exploration_noise = variant.get('exploration_noise', 0.1)
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=exploration_noise,
            min_sigma=exploration_noise,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=exploration_noise,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    observation_key = variant.get('observation_key', 'latent_observation')
    desired_goal_key = variant.get('desired_goal_key', 'latent_desired_goal')
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    obs_dim = (
        env.observation_space.spaces[observation_key].low.size
        + env.observation_space.spaces[desired_goal_key].low.size
    )
    action_dim = env.action_space.low.size
    hidden_sizes = variant.get('hidden_sizes', [400, 300])
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes,
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes,
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=hidden_sizes,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    training_mode = variant.get("training_mode", "train")
    testing_mode = variant.get("testing_mode", "test")

    testing_env = pickle.loads(pickle.dumps(env))
    testing_env.mode(testing_mode)

    training_env = pickle.loads(pickle.dumps(env))
    training_env.mode(training_mode)

    relabeling_env = pickle.loads(pickle.dumps(env))
    relabeling_env.mode(training_mode)
    relabeling_env.disable_render()

    video_vae_env = pickle.loads(pickle.dumps(env))
    video_vae_env.mode("video_vae")
    video_goal_env = pickle.loads(pickle.dumps(env))
    video_goal_env.mode("video_env")

    replay_buffer = ObsDictRelabelingBuffer(
        env=relabeling_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_kwargs']
    )
    variant["algo_kwargs"]["replay_buffer"] = replay_buffer
    algorithm = HerTd3(
        testing_env,
        training_env=training_env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        render=render,
        render_during_eval=render,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['algo_kwargs']
    )

    if ptu.gpu_enabled():
        print("using GPU")
        algorithm.to(ptu.device)
        for e in [testing_env, training_env, video_vae_env, video_goal_env]:
            e.vae.to(ptu.device)

    algorithm.train()

    if variant.get("save_video", True):
        logdir = logger.get_snapshot_dir()
        policy.train(False)
        filename = osp.join(logdir, 'video_final_env.mp4')
        rollout_function = rf.create_rollout_function(
            rf.multitask_rollout,
            max_path_length=algorithm.max_path_length,
            observation_key=algorithm.observation_key,
            desired_goal_key=algorithm.desired_goal_key,
        )
        dump_video(video_goal_env, policy, filename, rollout_function)
        filename = osp.join(logdir, 'video_final_vae.mp4')
        dump_video(video_vae_env, policy, filename, rollout_function)
def experiment(variant):
    rdim = variant["rdim"]
    vae_paths = {
        2: "/home/ashvin/data/s3doodad/ashvin/vae/point2d-conv-sweep2/run0/id1/params.pkl",
        4: "/home/ashvin/data/s3doodad/ashvin/vae/point2d-conv-sweep2/run0/id4/params.pkl",
    }
    vae_path = vae_paths[rdim]
    vae = joblib.load(vae_path)
    print("loaded", vae_path)

    if variant['multitask']:
        env = MultitaskImagePoint2DEnv(**variant['env_kwargs'])
        env = VAEWrappedEnv(
            env,
            vae,
            use_vae_obs=True,
            use_vae_reward=False,
            use_vae_goals=False,
        )
        env = MultitaskToFlatEnv(env)
    # else:
    #     env = Pusher2DEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        training_env=env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    print("use_gpu", variant["use_gpu"], bool(variant["use_gpu"]))
    if variant["use_gpu"]:
        gpu_id = variant["gpu_id"]
        ptu.set_gpu_mode(True)
        ptu.set_device(gpu_id)
        algorithm.to(ptu.device)
        env._wrapped_env.vae.to(ptu.device)
    algorithm.train()
def tdm_td3_experiment(variant):
    variant['env_kwargs'].update(variant['reward_params'])
    env = variant['env_class'](**variant['env_kwargs'])
    multiworld_env = variant.get('multiworld_env', True)
    if multiworld_env is not True:
        env = MultitaskEnvToSilentMultitaskEnv(env)
        if variant["render"]:
            env.pause_on_goal = True
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            **variant['es_kwargs']
        )
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
            **variant['es_kwargs'],
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
            **variant['es_kwargs'],
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    if multiworld_env is True:
        obs_dim = env.observation_space.spaces['observation'].low.size
        action_dim = env.action_space.low.size
        goal_dim = env.observation_space.spaces['desired_goal'].low.size
    else:
        obs_dim = action_dim = goal_dim = None
    vectorized = 'vectorized' in env.reward_type
    variant['algo_kwargs']['tdm_kwargs']['vectorized'] = vectorized
    norm_order = env.norm_order
    variant['algo_kwargs']['tdm_kwargs']['norm_order'] = norm_order
    qf1 = TdmQf(
        env=env,
        observation_dim=obs_dim,
        action_dim=action_dim,
        goal_dim=goal_dim,
        vectorized=vectorized,
        norm_order=norm_order,
        **variant['qf_kwargs']
    )
    qf2 = TdmQf(
        env=env,
        observation_dim=obs_dim,
        action_dim=action_dim,
        goal_dim=goal_dim,
        vectorized=vectorized,
        norm_order=norm_order,
        **variant['qf_kwargs']
    )
    policy = TdmPolicy(
        env=env,
        observation_dim=obs_dim,
        action_dim=action_dim,
        goal_dim=goal_dim,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    relabeling_env = pickle.loads(pickle.dumps(env))

    algo_kwargs = variant['algo_kwargs']
    if multiworld_env is True:
        observation_key = variant.get('observation_key', 'state_observation')
        desired_goal_key = variant.get('desired_goal_key', 'state_desired_goal')
        achieved_goal_key = variant.get('achieved_goal_key',
                                        'state_achieved_goal')
        replay_buffer = ObsDictRelabelingBuffer(
            env=relabeling_env,
            observation_key=observation_key,
            desired_goal_key=desired_goal_key,
            achieved_goal_key=achieved_goal_key,
            vectorized=vectorized,
            **variant['replay_buffer_kwargs']
        )
        algo_kwargs['tdm_kwargs']['observation_key'] = observation_key
        algo_kwargs['tdm_kwargs']['desired_goal_key'] = desired_goal_key
    else:
        replay_buffer = RelabelingReplayBuffer(
            env=relabeling_env,
            **variant['replay_buffer_kwargs']
        )
    # qf_criterion = variant['qf_criterion_class']()
    # algo_kwargs['td3_kwargs']['qf_criterion'] = qf_criterion
    algo_kwargs['td3_kwargs']['training_env'] = env
    if 'tau_schedule_kwargs' in variant:
        tau_schedule = IntPiecewiseLinearSchedule(
            **variant['tau_schedule_kwargs'])
    else:
        tau_schedule = None
    algo_kwargs['tdm_kwargs']['epoch_max_tau_schedule'] = tau_schedule
    algorithm = TdmTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        qf1.to(ptu.device)
        qf2.to(ptu.device)
        policy.to(ptu.device)
        algorithm.to(ptu.device)
    algorithm.train()