# Assumed imports for this SAC launcher (rlkit conventions; exact paths may
# differ across rlkit versions):
#   import numpy as np
#   import rlkit.torch.pytorch_util as ptu
#   from gym.envs.mujoco import ReacherEnv
#   from rlkit.envs.wrappers import NormalizedBoxEnv
#   from rlkit.torch.networks import FlattenMlp
#   from rlkit.torch.sac.policies import TanhGaussianPolicy
#   from rlkit.torch.sac.sac import SoftActorCritic

def experiment(variant):
    # Alternative environments, kept for reference:
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # env = NormalizedBoxEnv(InvertedPendulumEnv())
    # env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))
    # training_env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))
    env = ReacherEnv()
    training_env = ReacherEnv()
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    # Total dimensionality of the task (meta) variables concatenated onto
    # the observation.
    total_meta_variable_dim = 0
    for dims in variant['true_meta_variable_dims']:
        total_meta_variable_dim += sum(dims)

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim + total_meta_variable_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + total_meta_variable_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim + total_meta_variable_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=training_env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
    return 1
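# A minimal, hypothetical invocation of the SAC launcher above. The keys
# under `algo_params` follow common rlkit SAC settings; treat the values
# (and `true_meta_variable_dims`) as illustrative assumptions, not the
# repository's actual configuration.
if __name__ == "__main__":
    variant = dict(
        net_size=300,
        true_meta_variable_dims=[[2]],  # hypothetical: one 2-D task variable
        algo_params=dict(
            num_epochs=100,
            num_steps_per_epoch=1000,
            num_steps_per_eval=1000,
            max_path_length=50,  # Reacher episodes are short
            batch_size=128,
            discount=0.99,
        ),
    )
    experiment(variant)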
# Assumed imports for this DDPG launcher (rlkit conventions):
#   import copy
#   import rlkit.torch.pytorch_util as ptu
#   from gym.envs.mujoco import (AntEnv, HalfCheetahEnv, HopperEnv,
#       InvertedPendulumEnv, ReacherEnv, SwimmerEnv, Walker2dEnv)
#   from rlkit.envs.wrappers import NormalizedBoxEnv
#   from rlkit.torch.networks import FlattenMlp, TanhMlpPolicy
#   from rlkit.samplers.data_collector import MdpPathCollector
#   from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
#   from rlkit.exploration_strategies.ou_strategy import OUStrategy
#   from rlkit.data_management.env_replay_buffer import EnvReplayBuffer
#   from rlkit.torch.ddpg.ddpg import DDPGTrainer
#   from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm

def experiment(variant, args):
    # Or for a specific version (Daniel: doesn't work):
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    if 'Ant' in args.env:
        expl_env = NormalizedBoxEnv(AntEnv())
        eval_env = NormalizedBoxEnv(AntEnv())
    elif 'InvertedPendulum' in args.env:
        expl_env = NormalizedBoxEnv(InvertedPendulumEnv())
        eval_env = NormalizedBoxEnv(InvertedPendulumEnv())
    elif 'HalfCheetah' in args.env:
        expl_env = NormalizedBoxEnv(HalfCheetahEnv())
        eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    elif 'Hopper' in args.env:
        expl_env = NormalizedBoxEnv(HopperEnv())
        eval_env = NormalizedBoxEnv(HopperEnv())
    elif 'Reacher' in args.env:
        expl_env = NormalizedBoxEnv(ReacherEnv())
        eval_env = NormalizedBoxEnv(ReacherEnv())
    elif 'Swimmer' in args.env:
        expl_env = NormalizedBoxEnv(SwimmerEnv())
        eval_env = NormalizedBoxEnv(SwimmerEnv())
    elif 'Walker2d' in args.env:
        expl_env = NormalizedBoxEnv(Walker2dEnv())
        eval_env = NormalizedBoxEnv(Walker2dEnv())
    else:
        raise ValueError(args.env)

    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
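# Hedged driver sketch: one way the DDPG launcher above might be invoked.
# The `--env` flag matches the `args.env` usage above; every hyperparameter
# value is an illustrative assumption, not the repository's actual default.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='HalfCheetah')
    args = parser.parse_args()
    variant = dict(
        qf_kwargs=dict(hidden_sizes=[400, 300]),
        policy_kwargs=dict(hidden_sizes=[400, 300]),
        replay_buffer_size=int(1e6),
        trainer_kwargs=dict(discount=0.99),
        algorithm_kwargs=dict(
            num_epochs=300,
            num_eval_steps_per_epoch=5000,
            num_trains_per_train_loop=1000,
            num_expl_steps_per_train_loop=1000,
            min_num_steps_before_training=1000,
            max_path_length=1000,
            batch_size=256,
        ),
    )
    experiment(variant, args)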
# Reacher variant that keeps a counter of "hit" timesteps. Only the
# constructor appears in the original snippet; the class name is
# hypothetical, and the code that updates `_hit_timesteps` is not shown.
class HitTimeReacherEnv(ReacherEnv):
    def __init__(self):
        ReacherEnv.__init__(self)
        self._hit_timesteps = 0
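# The next Reacher variant mixes in a `NoisyEnv` class whose definition does
# not appear in this snippet. Below is a minimal sketch, assuming the mixin
# corrupts observations with zero-mean Gaussian noise; the repository's real
# class may well differ.
import numpy as np

class NoisyEnv:
    """Hypothetical mixin: adds Gaussian noise to observations."""

    def __init__(self, noise_std=0.1):
        self.noise_std = noise_std

    def step(self, action):
        # Defer to the wrapped MuJoCo env, then perturb the observation.
        obs, reward, done, info = super().step(action)
        noise = np.random.normal(scale=self.noise_std, size=obs.shape)
        return obs + noise, reward, done, info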
# Reacher with noisy observations via the `NoisyEnv` mixin (see the sketch
# above). The class name is hypothetical; the mixin is initialized before
# the base environment so the noise parameters exist when stepping begins.
class NoisyReacherEnv(NoisyEnv, ReacherEnv):
    def __init__(self, noise_std=0.1):
        NoisyEnv.__init__(self, noise_std=noise_std)
        ReacherEnv.__init__(self)
# Reacher variant parameterized by a scale and an offset (e.g., for a reward
# or noise transform); how they are used is not shown in the original
# snippet. The class name is hypothetical.
class ScaledReacherEnv(ReacherEnv):
    def __init__(self, multiplier=0.01, offset=0.1):
        self.multiplier = multiplier
        self.offset = offset
        ReacherEnv.__init__(self)
# Reacher variant with a single scalar coefficient (e.g., for weighting a
# reward term); its usage is not shown in the original snippet. The class
# name is hypothetical.
class CoefficientReacherEnv(ReacherEnv):
    def __init__(self, coefficient=1.0):
        self.coefficient = coefficient
        ReacherEnv.__init__(self)
# Sparse-reward Reacher: success is defined by a fingertip-to-target
# distance threshold. The class name is hypothetical.
class SparseReacherEnv(ReacherEnv):
    def __init__(self, distance_threshold=0.05):
        self.distance_threshold = distance_threshold
        ReacherEnv.__init__(self)
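    # Hedged sketch of the reward rule this threshold suggests; the actual
    # override is not part of the original snippet. Assumes `import numpy
    # as np` and gym's MuJoCo ReacherEnv body names.
    def step(self, action):
        obs, _, done, info = super().step(action)
        dist = np.linalg.norm(
            self.get_body_com("fingertip") - self.get_body_com("target"))
        # Binary reward: 0 within the threshold, -1 otherwise.
        reward = 0.0 if dist < self.distance_threshold else -1.0
        return obs, reward, done, info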
# Assumed imports: same as the DDPG launcher above, plus
#   from rlkit.exploration_strategies.gaussian_strategy import GaussianStrategy
#   from rlkit.torch.td3.td3 import TD3Trainer

def experiment(variant, args):
    # Doesn't work :(
    # import gym
    # expl_env = NormalizedBoxEnv(gym.make(args.env))
    # eval_env = NormalizedBoxEnv(gym.make(args.env))
    if 'Ant' in args.env:
        expl_env = NormalizedBoxEnv(AntEnv())
        eval_env = NormalizedBoxEnv(AntEnv())
    elif 'InvertedPendulum' in args.env:
        expl_env = NormalizedBoxEnv(InvertedPendulumEnv())
        eval_env = NormalizedBoxEnv(InvertedPendulumEnv())
    elif 'HalfCheetah' in args.env:
        expl_env = NormalizedBoxEnv(HalfCheetahEnv())
        eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    elif 'Hopper' in args.env:
        expl_env = NormalizedBoxEnv(HopperEnv())
        eval_env = NormalizedBoxEnv(HopperEnv())
    elif 'Reacher' in args.env:
        expl_env = NormalizedBoxEnv(ReacherEnv())
        eval_env = NormalizedBoxEnv(ReacherEnv())
    elif 'Swimmer' in args.env:
        expl_env = NormalizedBoxEnv(SwimmerEnv())
        eval_env = NormalizedBoxEnv(SwimmerEnv())
    elif 'Walker2d' in args.env:
        expl_env = NormalizedBoxEnv(Walker2dEnv())
        eval_env = NormalizedBoxEnv(Walker2dEnv())
    else:
        raise ValueError(args.env)

    # Back to normal.
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    es = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    eval_path_collector = MdpPathCollector(eval_env, policy)
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = TD3Trainer(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
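# The TD3-specific pieces above are the twin Q-networks and the separate
# target policy. A hedged sketch of `trainer_kwargs` that exercise TD3's
# tricks (names follow rlkit's TD3Trainer conventions; the values are
# illustrative assumptions, not the repository's settings):
td3_trainer_kwargs = dict(
    discount=0.99,
    tau=5e-3,                           # Polyak averaging rate for targets
    target_policy_noise=0.2,            # smoothing noise on target actions
    target_policy_noise_clip=0.5,       # clip range for that noise
    policy_and_target_update_period=2,  # delayed policy updates
)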