def her_td3_experiment(variant):
    """Build a HER + TD3 setup from ``variant`` and run training.

    Keys read from ``variant``:
        env_class, env_kwargs: the environment is built as
            ``env_class(**env_kwargs)``.
        observation_key, desired_goal_key: optional; default to
            ``'observation'`` and ``'desired_goal'``. They are passed to the
            replay buffer and to ``HerTd3``, and they select the
            observation-space entries used to size the networks.
        replay_buffer_kwargs: extra keyword arguments for
            ``ObsDictRelabelingBuffer``.
        normalize: if truthy, the environment is wrapped in
            ``NormalizedBoxEnv`` (after the replay buffer has been built
            and the dimensions have been read from the unwrapped env).
        exploration_type: one of ``'ou'``, ``'gaussian'``, ``'epsilon'``.
        es_kwargs: keyword arguments for the exploration strategy; entries
            here override the built-in defaults of this function.
        qf_kwargs, policy_kwargs, algo_kwargs: extra keyword arguments for
            the two Q-functions, the policy and ``HerTd3`` respectively.

    Raises:
        Exception: if ``exploration_type`` is not a supported value.
    """
    env = variant['env_class'](**variant['env_kwargs'])
    observation_key = variant.get('observation_key', 'observation')
    desired_goal_key = variant.get('desired_goal_key', 'desired_goal')
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['replay_buffer_kwargs']
    )

    # Size the networks from the same keys handed to the replay buffer and
    # the algorithm; the literal 'observation' / 'desired_goal' names would
    # give wrong input sizes whenever the keys are overridden.
    obs_dim = env.observation_space.spaces[observation_key].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces[desired_goal_key].low.size

    if variant['normalize']:
        env = NormalizedBoxEnv(env)

    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es_class = OUStrategy
        es_defaults = dict(max_sigma=0.1)
    elif exploration_type == 'gaussian':
        es_class = GaussianStrategy
        es_defaults = dict(
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es_class = EpsilonGreedy
        es_defaults = dict(prob_random_action=0.1)
    else:
        # format() keeps the message identical for strings while avoiding a
        # TypeError from str concatenation when the value is not a str.
        raise Exception("Invalid type: {}".format(exploration_type))
    # User-supplied es_kwargs take precedence over the defaults above, so
    # passing e.g. max_sigma no longer fails with a duplicate-keyword error.
    es_kwargs = dict(es_defaults, **variant['es_kwargs'])
    es = es_class(action_space=env.action_space, **es_kwargs)

    # Both critics take the concatenation of observation, action and goal.
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        qf1.to(ptu.device)
        qf2.to(ptu.device)
        policy.to(ptu.device)
        algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Assemble a TD3 run from ``variant`` and start training.

    The environment is ``NormalizedBoxEnv(variant['env_class']())``. The two
    Q-functions and the policy each use two hidden layers of width
    ``variant['layer_size']``. ``variant['exploration_type']`` selects the
    exploration strategy (``'ou'``, ``'gaussian'`` or ``'epsilon'``).

    Side effect: ``variant['algo_kwargs']`` is (re)written in place from the
    top-level hyperparameter entries of ``variant`` and then forwarded to
    ``TD3``.

    Raises:
        Exception: if ``exploration_type`` is not a supported value.
    """
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    # Collect the flat hyperparameters into the kwargs dict handed to TD3.
    algo_kwarg_names = (
        'num_epochs',
        'num_steps_per_epoch',
        'num_steps_per_eval',
        'max_path_length',
        'min_num_steps_before_training',
        'batch_size',
        'discount',
        'replay_buffer_size',
        'reward_scale',
    )
    variant['algo_kwargs'] = {
        name: variant[name] for name in algo_kwarg_names
    }

    # Network sizes come from 'layer_size'; no per-network kwargs are read.
    width = variant['layer_size']
    critic_input_size = obs_dim + action_dim
    qf1 = FlattenMlp(
        input_size=critic_input_size,
        output_size=1,
        hidden_sizes=[width, width],
    )
    qf2 = FlattenMlp(
        input_size=critic_input_size,
        output_size=1,
        hidden_sizes=[width, width],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[width, width],
    )

    # Lazily-built strategies keyed by name; only the selected one is created.
    strategy_factories = {
        'ou': lambda: OUStrategy(action_space=env.action_space),
        'gaussian': lambda: GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        ),
        'epsilon': lambda: EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        ),
    }
    exploration_type = variant['exploration_type']
    if exploration_type not in strategy_factories:
        raise Exception("Invalid type: " + exploration_type)
    es = strategy_factories[exploration_type]()

    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )

    if ptu.gpu_enabled():
        # Same order as before: critics, policy, then the algorithm itself.
        for component in (qf1, qf2, policy, algorithm):
            component.to(ptu.device)
    algorithm.train()