def experiment(variant):
    """Train a TDM-SAC agent with HER relabeling on the configured env class."""
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    vectorized = variant['sac_tdm_kwargs']['tdm_kwargs']['vectorized']
    # TDM networks condition on (obs, goal, tau), hence the `+ env.goal_dim + 1`
    # input dims; a vectorized TDM predicts one value per goal dimension.
    out_size = env.goal_dim if vectorized else 1
    qf = FlattenMlp(
        input_size=obs_dim + action_dim + env.goal_dim + 1,
        output_size=out_size,
        **variant['qf_params']
    )
    vf = FlattenMlp(
        input_size=obs_dim + env.goal_dim + 1,
        output_size=out_size,
        **variant['vf_params']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim + env.goal_dim + 1,
        action_dim=action_dim,
        **variant['policy_params']
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_params']
    )
    algorithm = TdmSac(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        replay_buffer=replay_buffer,
        **variant['sac_tdm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train SAC on a Sawyer XYZ reaching task.

    :param variant: dict with 'env_params', 'net_size', and 'algo_params'.
    """
    env_params = variant['env_params']
    env = SawyerXYZReachingEnv(**env_params)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    # Consistent with the rest of the file: `.to(ptu.device)` is a no-op on
    # CPU, so the old `if ptu.gpu_enabled(): algorithm.cuda()` guard is
    # unnecessary.
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train TD3 with OU exploration on a Sawyer XYZ reaching task.

    :param variant: dict with 'env_params', 'hidden_size', 'es_kwargs',
        and 'algo_kwargs'.
    """
    env_params = variant['env_params']
    env = SawyerXYZReachingEnv(**env_params)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    hidden_size = variant['hidden_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[hidden_size, hidden_size],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[hidden_size, hidden_size],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[hidden_size, hidden_size],
    )
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    # Consistent with the rest of the file: `.to(ptu.device)` works for both
    # CPU and GPU, so the `gpu_enabled()`/`cuda()` pattern is not needed.
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Run SAC on `variant['env_class']`, optionally flattening a multitask env."""
    env = NormalizedBoxEnv(variant['env_class'](**variant['env_kwargs']))
    if variant['multitask']:
        env = MultitaskToFlatEnv(env)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    width = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[width, width],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[width, width],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[width, width],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train SAC on HalfCheetah-v2.

    :param variant: dict with 'net_size' and 'algo_params'.
    """
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v2'))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    # Consistent with the rest of the file: `.to(ptu.device)` replaces the
    # `gpu_enabled()`/`cuda()` pattern and is a no-op on CPU.
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train Twin SAC on the configured environment class."""
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    def _make_qf():
        # Both twin critics share the same architecture.
        return FlattenMlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            **variant['qf_kwargs']
        )

    qf1 = _make_qf()
    qf2 = _make_qf()
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    algorithm = TwinSAC(
        env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def example(variant):
    """Train N3DPG with OU exploration on the configured environment."""
    env = variant['env_class']()
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    # NOTE(review): the Q-function is built from `vf_params`; confirm whether
    # a separate `qf_params` entry was intended in the variant.
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['vf_params']
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        **variant['vf_params']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_params']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = N3DPG(
        env,
        qf=qf,
        vf=vf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train HER-SAC on the cylinder XY pusher task."""
    env = CylinderXYPusher2DEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_dim
    # Goal-conditioned networks take the goal concatenated onto the obs.
    qf = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    vf = FlattenMlp(
        input_size=obs_dim + goal_dim,
        output_size=1,
        **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim + goal_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    replay_buffer = SimpleHerReplayBuffer(
        env=env,
        **variant['replay_buffer_kwargs']
    )
    algorithm = HerSac(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train SAC on MultiGoalEnv and dump a Chrome trace of the run."""
    env = NormalizedBoxEnv(MultiGoalEnv(
        actuation_cost_coeff=10,
        distance_cost_coeff=1,
        goal_reward=10,
    ))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    hidden_sizes = [100, 100]
    qf = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    # Profile the whole training run and export a trace viewable in
    # chrome://tracing.
    with torch.autograd.profiler.profile() as prof:
        algorithm.train()
    prof.export_chrome_trace("tmp-torch-chrome-trace.prof")
def experiment(variant):
    """Train SAC on HalfCheetah-v1."""
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    width = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[width, width],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[width, width],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[width, width],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train TD3 on the VAE-based multitask point env.

    NOTE(review): `env` is only bound when `variant['multitask']` is true
    (the non-multitask branch is commented out upstream) — confirm callers
    always set it.
    """
    if variant['multitask']:
        env = MultitaskFullVAEPoint2DEnv(
            **variant['env_kwargs'])  # used point2d-conv-sweep/run1/id4
        env = MultitaskToFlatEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if variant["use_gpu"]:
        ptu.set_gpu_mode(True)
        ptu.set_device(variant["gpu_id"])
        algorithm.to(ptu.device)
        # The env's internal VAE also runs forward passes, so it must live
        # on the same device as the algorithm.
        env._wrapped_env.vae.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train TD3 on a gym env named by `variant['env_name']`.

    :param variant: dict with 'env_name', 'normalize', 'exploration_type',
        and 'algo_kwargs'.
    """
    env = gym.make(variant["env_name"])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    # `.to(ptu.device)` already resolves to CPU when no GPU is enabled, so
    # guarding it with `ptu.gpu_enabled()` was unnecessary.
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train SAC on the configured env, assembling algo kwargs from a flat variant.

    :param variant: flat dict of hyperparameters; 'algo_kwargs' is rebuilt
        here from the individual entries.
    """
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    # Collect the flat hyperparameters into the kwargs dict the algorithm
    # constructor expects.
    variant['algo_kwargs'] = dict(
        num_epochs=variant['num_epochs'],
        num_steps_per_epoch=variant['num_steps_per_epoch'],
        num_steps_per_eval=variant['num_steps_per_eval'],
        max_path_length=variant['max_path_length'],
        min_num_steps_before_training=variant['min_num_steps_before_training'],
        batch_size=variant['batch_size'],
        discount=variant['discount'],
        replay_buffer_size=variant['replay_buffer_size'],
        soft_target_tau=variant['soft_target_tau'],
        target_update_period=variant['target_update_period'],
        train_policy_with_reparameterization=variant[
            'train_policy_with_reparameterization'],
        policy_lr=variant['policy_lr'],
        qf_lr=variant['qf_lr'],
        vf_lr=variant['vf_lr'],
        reward_scale=variant['reward_scale'],
        use_automatic_entropy_tuning=variant.get(
            'use_automatic_entropy_tuning', False),
    )
    M = variant['layer_size']
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    algorithm = SoftActorCritic(
        env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs']
    )
    # One `.to(ptu.device)` on the algorithm moves all of its networks;
    # the previous per-network `.cuda()` calls were redundant, and this
    # matches the device-placement style used elsewhere in the file.
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train Twin SAC (trainer/collector API) on a flat-goal Point2D env."""
    env = NormalizedBoxEnv(FlatGoalEnv(Point2DEnv(**variant['env_kwargs'])))
    action_dim = int(np.prod(env.action_space.shape))
    obs_dim = int(np.prod(env.observation_space.shape))

    def _make_qf():
        # Twin critics and their targets all share one architecture.
        return FlattenMlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            **variant['qf_kwargs']
        )

    qf1 = _make_qf()
    qf2 = _make_qf()
    target_qf1 = _make_qf()
    target_qf2 = _make_qf()
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    eval_env = expl_env = env
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = TwinSACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        data_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def her_twin_sac_experiment(variant):
    """Train HER + Twin SAC on a dict-observation goal env.

    :param variant: dict with env/replay/network/algo kwargs plus optional
        'observation_key' and 'desired_goal_key'.
    """
    env = variant['env_class'](**variant['env_kwargs'])
    observation_key = variant.get('observation_key', 'observation')
    desired_goal_key = variant.get('desired_goal_key', 'desired_goal')
    # NOTE(review): the replay buffer captures the *unwrapped* env, and the
    # normalization wrapper below is applied afterwards — confirm this
    # ordering is intentional.
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['replay_buffer_kwargs']
    )
    obs_dim = env.observation_space.spaces['observation'].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    vf = FlattenMlp(
        input_size=obs_dim + goal_dim,
        output_size=1,
        **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim + goal_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    algorithm = HerTwinSac(
        env,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        policy=policy,
        replay_buffer=replay_buffer,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['algo_kwargs']
    )
    # `algorithm.to(ptu.device)` moves every network the algorithm owns, so
    # the previous per-network `.to` calls under a `gpu_enabled()` guard
    # were redundant; this also matches the rest of the file.
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train HER-TD3 on a goal-conditioned env with a chosen exploration strategy."""
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_dim
    hidden = [400, 300]
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=hidden,
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=hidden,
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=hidden,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = variant['replay_buffer_class'](
        env=env,
        **variant['replay_buffer_kwargs']
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train Expected SAC on HalfCheetah."""
    env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    width = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[width, width],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[width, width],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[width, width],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    # TODO(vitchyr): just creating the QFPolicyPlotter crashes EC2, so no
    # plotter is wired up here.
    algorithm = ExpectedSAC(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def her_td3_experiment(variant):
    """Train HER-TD3 on a multitask env, with optional history stacking."""
    env = variant['env_class'](**variant['env_kwargs'])
    if 'history_len' in variant:
        # Stack the last `history_len` observations into each observation.
        env = MultiTaskHistoryEnv(env, history_len=variant['history_len'])
    if variant.get('make_silent_env', True):
        env = MultitaskEnvToSilentMultitaskEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = variant['replay_buffer_class'](
        env=env,
        **variant['replay_buffer_kwargs']
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train goal-conditioned-model DDPG (GcmDdpg) with HER relabeling."""
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    # The GCM predicts a goal-sized output from (goal, obs, action, tau).
    gcm = FlattenMlp(
        input_size=env.goal_dim + obs_dim + action_dim + 1,
        output_size=env.goal_dim,
        **variant['gcm_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + env.goal_dim + 1,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    es = OUStrategy(
        action_space=env.action_space,
        theta=0.1,
        max_sigma=0.1,
        min_sigma=0.1,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_kwargs']
    )
    gcm_criterion = variant['gcm_criterion_class'](
        **variant['gcm_criterion_kwargs'])
    algo_kwargs = variant['algo_kwargs']
    # The replay buffer is injected into the base kwargs rather than passed
    # directly to the constructor.
    algo_kwargs['base_kwargs']['replay_buffer'] = replay_buffer
    algorithm = GcmDdpg(
        env,
        gcm=gcm,
        policy=policy,
        exploration_policy=exploration_policy,
        gcm_criterion=gcm_criterion,
        **algo_kwargs
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train DDPG with OU exploration on a Sawyer XYZ reaching task.

    :param variant: dict with 'env_params' and 'algo_params'.
    """
    env_params = variant['env_params']
    env = SawyerXYZReachingEnv(**env_params)
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    # Consistent with the rest of the file: `.to(ptu.device)` replaces the
    # `gpu_enabled()`/`cuda()` pattern and is a no-op on CPU.
    algorithm.to(ptu.device)
    algorithm.train()
def __init__(
        self,
        goal_processor,
        qf_kwargs,
        preprocess_obs_dim,
        action_dim,
        encode_state=False,
        vectorized=False,
        give_each_qf_single_goal_dim=False,
):
    """Build one scalar Q-head per post-processed goal dimension.

    :param goal_processor: module whose `input_size`/`output_size` define the
        raw and processed goal dimensions.
    :param qf_kwargs: kwargs forwarded to each per-dimension FlattenMlp.
    :param preprocess_obs_dim: raw observation dimension.
    :param action_dim: action dimension.
    :param encode_state: if True, the state fed to each head is assumed to
        already be encoded to the processed-goal dimension.
    :param vectorized: stored flag; not used in construction here.
    :param give_each_qf_single_goal_dim: if True, each head sees only its own
        single goal coordinate instead of the full processed goal.
    """
    super().__init__()
    self.goal_processor = goal_processor
    self.preprocess_obs_dim = preprocess_obs_dim
    self.preprocess_goal_dim = goal_processor.input_size
    self.postprocess_goal_dim = goal_processor.output_size
    self.encode_state = encode_state
    self.vectorized = vectorized
    self._give_each_qf_single_goal_dim = give_each_qf_single_goal_dim
    if give_each_qf_single_goal_dim:
        goal_input_size = 1
    else:
        goal_input_size = self.postprocess_goal_dim
    if self.encode_state:
        state_input_size = self.postprocess_goal_dim
    else:
        state_input_size = preprocess_obs_dim
    head_input_size = state_input_size + action_dim + goal_input_size
    # One scalar-output Q-function per processed goal dimension.
    self.feature_qfs = nn.ModuleList(
        FlattenMlp(
            input_size=head_input_size,
            output_size=1,
            **qf_kwargs
        )
        for _ in range(self.postprocess_goal_dim)
    )
def experiment(variant):
    """Train DDPG with Gaussian exploration on a gym env named by id."""
    env = NormalizedBoxEnv(gym.make(variant['env_id']))
    es = GaussianStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[128, 128],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[128, 128],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train TD3 with OU exploration on a Sawyer XY reaching task.

    :param variant: dict with 'env_params', 'es_kwargs', and 'algo_kwargs'.
    """
    env_params = variant['env_params']
    env = SawyerXYReachingEnv(**env_params)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[100, 100],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[100, 100],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[100, 100],
    )
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    # Consistent with the rest of the file: `.to(ptu.device)` replaces the
    # `gpu_enabled()`/`cuda()` pattern and is a no-op on CPU.
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train TDM-N3DPG with a one-hot-tau Q-function and a 1D TDM plotter."""
    env = variant['env_class']()
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    vectorized = variant['algo_params']['tdm_kwargs']['vectorized']
    # Tau is fed to the Q-function as a one-hot vector.
    qf = OneHotTauQF(
        observation_dim=obs_dim,
        action_dim=action_dim,
        goal_dim=env.goal_dim,
        output_size=env.goal_dim if vectorized else 1,
        **variant['qf_params']
    )
    vf = FlattenMlp(
        input_size=obs_dim + env.goal_dim + 1,
        output_size=env.goal_dim if vectorized else 1,
        **variant['vf_params']
    )
    policy = MlpPolicy(
        input_size=obs_dim + env.goal_dim + 1,
        output_size=action_dim,
        **variant['policy_params']
    )
    es = OUStrategy(
        action_space=env.action_space,
        theta=0.1,
        max_sigma=0.1,
        min_sigma=0.1,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_params']
    )
    qf_criterion = variant['qf_criterion_class'](
        **variant['qf_criterion_params'])
    algo_params = variant['algo_params']
    algo_params['n3dpg_kwargs']['qf_criterion'] = qf_criterion
    plotter = Simple1DTdmPlotter(
        tdm=qf,
        location_lst=np.array([-5, 0, 5]),
        goal_lst=np.array([-5, 0, 5]),
        max_tau=algo_params['tdm_kwargs']['max_tau'],
        grid_size=10,
    )
    algo_params['n3dpg_kwargs']['plotter'] = plotter
    algorithm = TdmN3dpg(
        env,
        qf=qf,
        vf=vf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **algo_params
    )
    algorithm.to(ptu.device)
    algorithm.train()
def td3_experiment(variant):
    """Train flat-observation TD3 on a flattened multitask env."""
    env = MultitaskToFlatEnv(variant['env_class'](**variant['env_kwargs']))
    if variant.get('make_silent_env', True):
        env = MultitaskEnvToSilentMultitaskEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train TDM-SAC on the 7-DoF reacher, using an MPC controller for both
    evaluation and exploration."""
    env = NormalizedBoxEnv(Reacher7DofFullGoal())
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    vectorized = variant['sac_tdm_kwargs']['tdm_kwargs']['vectorized']
    out_size = env.goal_dim if vectorized else 1
    qf = FlattenMlp(
        input_size=obs_dim + action_dim + env.goal_dim + 1,
        output_size=out_size,
        **variant['qf_params']
    )
    vf = FlattenMlp(
        input_size=obs_dim + env.goal_dim + 1,
        output_size=out_size,
        **variant['vf_params']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim + env.goal_dim + 1,
        action_dim=action_dim,
        **variant['policy_params']
    )
    mpc_controller = CollocationMpcController(env, qf, policy)
    # The MPC controller replaces the default eval and exploration policies.
    base_kwargs = variant['sac_tdm_kwargs']['base_kwargs']
    base_kwargs['eval_policy'] = mpc_controller
    base_kwargs['exploration_policy'] = mpc_controller
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_params']
    )
    algorithm = TdmSac(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        replay_buffer=replay_buffer,
        **variant['sac_tdm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train SAC on MultiGoalEnv; a Q/policy plotter is built but not wired in."""
    env = NormalizedBoxEnv(MultiGoalEnv(
        actuation_cost_coeff=10,
        distance_cost_coeff=1,
        goal_reward=10,
    ))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    qf = FlattenMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[100, 100],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    # NOTE(review): this plotter is constructed but never passed to the
    # algorithm — confirm whether it should be wired in or removed.
    plotter = QFPolicyPlotter(
        qf=qf,
        policy=policy,
        obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]),
        default_action=[np.nan, np.nan],
        n_samples=100,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train HER-TD3 with epsilon-greedy exploration on the cylinder pusher."""
    env = CylinderXYPusher2DEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    es = EpsilonGreedy(
        action_space=env.action_space,
        prob_random_action=0.1,
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_dim
    hidden = [400, 300]
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=hidden,
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=hidden,
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=hidden,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = SimpleHerReplayBuffer(
        env=env,
        **variant['replay_buffer_kwargs']
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train SAC on a flattened Sawyer XYZ multitask env."""
    env = MultitaskToFlatEnv(SawyerXYZEnv(**variant['env_kwargs']))
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train SAC on a normalized instance of the configured env class."""
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()