def get_environment_from_params_custom(environment_params):
    universe = environment_params['universe']
    task = environment_params['task']
    domain = environment_params['domain']

    # Drop kwargs that the raw gym constructor does not understand.
    environment_kwargs_gym = environment_params.get('kwargs', {}).copy()
    if "map3D" in environment_kwargs_gym:
        environment_kwargs_gym.pop("map3D")
    if "observation_keys" in environment_kwargs_gym:
        environment_kwargs_gym.pop("observation_keys")

    env = gym.make(f"{domain}-{task}", **environment_kwargs_gym)

    camera_space = {
        'dist_low': 0.7, 'dist_high': 1.5,
        'angle_low': 0, 'angle_high': 180,
        'elev_low': -180, 'elev_high': -90,
    }
    env_n = ImageEnv(
        wrapped_env=env,
        imsize=64,
        normalize=True,
        camera_space=camera_space,
        init_camera=(lambda x: init_multiple_cameras(x, camera_space)),
        num_cameras=4,  # 4 cameras used for training
        depth=True,
        cam_info=True,
        reward_type='wrapped_env',
        flatten=False)

    # The full kwargs (including the keys stripped above) are forwarded to
    # the outer get_environment call, along with the wrapped image env.
    environment_kwargs = environment_params.get('kwargs', {}).copy()
    environment_kwargs["env"] = env_n

    return get_environment(universe, domain, task, environment_kwargs)
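# For reference, the function above only reads the keys shown below from
# `environment_params`. This is an illustrative sketch: the concrete
# universe/domain/task values and the 'map3D' flag are assumptions about how
# callers populate this dict, not values taken from the source.
environment_params = {
    'universe': 'gym',
    'domain': 'SawyerPush',  # hypothetical domain name
    'task': 'v0',
    'kwargs': {
        'map3D': True,                    # stripped before gym.make
        'observation_keys': ('image',),   # stripped before gym.make
    },
}
env = get_environment_from_params_custom(environment_params)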
def setUp(self):
    self.env = get_environment('gym', 'Swimmer', 'v3', {})
    self.policy = get_policy_from_params(
        {'type': 'UniformPolicy'}, env=self.env)
    self.pool = SimpleReplayPool(max_size=100, environment=self.env)
    self.remote_sampler = RemoteSampler(
        max_path_length=10, min_pool_size=10, batch_size=10)
def run_experiment(variant, reporter):
    env = get_environment('gym', 'MultiGoal', 'Default', {
        'actuation_cost_coeff': 1,
        'distance_cost_coeff': 0.1,
        'goal_reward': 1,
        'init_sigma': 0.1,
    })
    pool = SimpleReplayPool(
        observation_space=env.observation_space,
        action_space=env.action_space,
        max_size=1e6)
    sampler = SimpleSampler(
        max_path_length=30, min_pool_size=100, batch_size=64)

    Qs = get_Q_function_from_variant(variant, env)
    policy = get_policy_from_variant(variant, env, Qs)
    plotter = QFPolicyPlotter(
        Q=Qs[0],
        policy=policy,
        obs_lst=np.array(((-2.5, 0.0),
                          (0.0, 0.0),
                          (2.5, 2.5),
                          (-2.5, -2.5))),
        default_action=(np.nan, np.nan),
        n_samples=100)

    algorithm = SAC(
        sampler=sampler,
        reparameterize=True,
        epoch_length=100,
        n_epochs=1000,
        n_train_repeat=1,
        eval_render_mode=None,
        eval_n_episodes=10,
        eval_deterministic=False,
        env=env,
        policy=policy,
        initial_exploration_policy=None,
        pool=pool,
        Qs=Qs,
        plotter=plotter,
        lr=3e-4,
        target_entropy=-2.0,
        discount=0.99,
        tau=1e-4,
        save_full_state=True,
    )

    initialize_tf_variables(algorithm._session, only_uninitialized=True)

    for train_result in algorithm.train():
        reporter(**train_result)
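# `run_experiment` only assumes that `reporter` accepts the keyword arguments
# yielded by `algorithm.train()`. A minimal stand-in for local runs could look
# like this; the name `print_reporter` is hypothetical:
def print_reporter(**train_result):
    print(train_result)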
def setUp(self):
    self.env = get_environment('gym', 'Swimmer', 'v3', {})
    self.hidden_layer_sizes = (128, 128)
    self.policy = FeedforwardGaussianPolicy(
        input_shapes=self.env.observation_shape,
        output_shape=self.env.action_space.shape,
        hidden_layer_sizes=self.hidden_layer_sizes,
        observation_keys=self.env.observation_keys)
def setUp(self):
    self.env = get_environment('gym', 'Swimmer', 'v3', {})
    self.policy = ContinuousUniformPolicy(
        action_range=(
            self.env.action_space.low,
            self.env.action_space.high,
        ),
        input_shapes=self.env.observation_shape,
        output_shape=self.env.action_shape,
        observation_keys=self.env.observation_keys)
def run_experiment(variant, reporter):
    training_environment = get_environment(
        'gym', 'MultiGoal', 'Default-v0', {
            'actuation_cost_coeff': 30,
            'distance_cost_coeff': 1,
            'goal_reward': 10,
            'init_sigma': 0.1,
        })
    evaluation_environment = training_environment.copy()

    pool = SimpleReplayPool(
        environment=training_environment, max_size=1e6)
    sampler = SimpleSampler(max_path_length=30)

    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape,
        )
    })
    Qs = value_functions.get(variant['Q_params'])

    variant['policy_params']['config'].update({
        'action_range': (
            training_environment.action_space.low,
            training_environment.action_space.high),
        'input_shapes': training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = policies.get(variant['policy_params'])

    plotter = QFPolicyPlotter(
        Q=Qs[0],
        policy=policy,
        obs_lst=np.array(((-2.5, 0.0),
                          (0.0, 0.0),
                          (2.5, 2.5),
                          (-2.5, -2.5))),
        default_action=(np.nan, np.nan),
        n_samples=100)

    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': pool,
        'sampler': sampler,
        'min_pool_size': 100,
        'batch_size': 64,
        'plotter': plotter,
    })
    algorithm = algorithms.get(variant['algorithm_params'])

    for train_result in algorithm.train():
        reporter(**train_result)
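# Minimal `variant` skeleton matching the accesses above: three entries, each
# carrying a 'config' dict that the runner fills in before handing it to the
# respective `.get(...)` factory. The 'type' key and its placeholder values
# are assumptions about the factories' expected schema, not taken from the
# source.
variant = {
    'Q_params': {'type': '<Q_FUNCTION_TYPE>', 'config': {}},
    'policy_params': {'type': '<POLICY_TYPE>', 'config': {}},
    'algorithm_params': {'type': '<ALGORITHM_TYPE>', 'config': {}},
}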
def setUp(self):
    self.env = get_environment('gym', 'Swimmer', 'v3', {})
    self.policy = policies.ContinuousUniformPolicy(
        action_range=(
            self.env.action_space.low,
            self.env.action_space.high,
        ),
        input_shapes=self.env.observation_shape,
        output_shape=self.env.action_shape,
        observation_keys=self.env.observation_keys)
    self.pool = SimpleReplayPool(max_size=100, environment=self.env)
    self.remote_sampler = RemoteSampler(max_path_length=10)
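# Hedged sketch of how a test body might wire these pieces together. The
# `initialize(env, policy, pool)` hook and the `sample()` step are assumptions
# about the sampler's interface; neither call appears in the setUp above.
def test_sampling(self):
    self.remote_sampler.initialize(self.env, self.policy, self.pool)
    for _ in range(10):
        self.remote_sampler.sample()  # assumed to push transitions into the pool
    self.assertGreater(self.pool.size, 0)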
def setUp(self):
    self.env = get_environment('gym', 'Swimmer', 'v3', {})
    self.hidden_layer_sizes = (8, 8)

    observation_shapes = OrderedDict(
        (key, value) for key, value in self.env.observation_shape.items())
    action_shape = self.env.action_shape
    input_shapes = (observation_shapes, action_shape)
    self.value_function = feedforward_Q_function(
        input_shapes=input_shapes,
        hidden_layer_sizes=self.hidden_layer_sizes,
    )
def setUp(self):
    self.env = get_environment('gym', 'Swimmer', 'v3', {})
    self.hidden_layer_sizes = (16, 16)
    self.num_coupling_layers = 2

    self.policy = RealNVPPolicy(
        input_shapes=self.env.observation_shape,
        output_shape=self.env.action_shape,
        action_range=(
            self.env.action_space.low,
            self.env.action_space.high,
        ),
        hidden_layer_sizes=self.hidden_layer_sizes,
        num_coupling_layers=self.num_coupling_layers,
        observation_keys=self.env.observation_keys,
    )
def run_experiment(variant, reporter):
    training_environment = get_environment(
        'gym', 'MultiGoal', 'Default-v0', {
            'actuation_cost_coeff': 30,
            'distance_cost_coeff': 1,
            'goal_reward': 10,
            'init_sigma': 0.1,
        })
    evaluation_environment = training_environment.copy()

    pool = SimpleReplayPool(
        environment=training_environment, max_size=1e6)
    sampler = SimpleSampler(max_path_length=30)

    Qs = get_Q_function_from_variant(variant, training_environment)
    policy = get_policy_from_variant(variant, training_environment)
    plotter = QFPolicyPlotter(
        Q=Qs[0],
        policy=policy,
        obs_lst=np.array(((-2.5, 0.0),
                          (0.0, 0.0),
                          (2.5, 2.5),
                          (-2.5, -2.5))),
        default_action=(np.nan, np.nan),
        n_samples=100)

    algorithm = get_algorithm_from_variant(
        variant=variant,
        training_environment=training_environment,
        evaluation_environment=evaluation_environment,
        policy=policy,
        Qs=Qs,
        pool=pool,
        sampler=sampler,
        min_pool_size=100,
        batch_size=46,
        plotter=plotter,
    )
    initialize_tf_variables(algorithm._session, only_uninitialized=True)

    for train_result in algorithm.train():
        reporter(**train_result)
def test_add_samples_dict_observation(self):
    env = get_environment('gym', 'Swimmer', 'v3', {})
    pool = create_pool(env=env, max_size=100)
    env.reset()

    num_samples = pool._max_size // 2
    samples = {
        'observations': {
            name: np.empty((num_samples, *space.shape), dtype=space.dtype)
            for name, space in env.observation_space.spaces.items()
        },
        'next_observations': {
            name: np.empty((num_samples, *space.shape), dtype=space.dtype)
            for name, space in env.observation_space.spaces.items()
        },
        'actions': np.empty((num_samples, *env.action_space.shape)),
        'rewards': np.empty((num_samples, 1), dtype=np.float32),
        'terminals': np.empty((num_samples, 1), dtype=bool),
    }

    for i in range(num_samples):
        action = env.action_space.sample()
        observation, reward, terminal, info = env.step(action)
        for name, value in observation.items():
            # The same value is stored as both observation and next
            # observation; the round-trip check below only needs the stored
            # and retrieved arrays to agree.
            samples['observations'][name][i, :] = value
            samples['next_observations'][name][i, :] = value
        samples['actions'][i] = action
        samples['rewards'][i] = reward
        samples['terminals'][i] = terminal

    pool.add_path(samples)
    last_n_batch = pool.last_n_batch(num_samples)

    np.testing.assert_equal(
        {
            key: value
            for key, value in last_n_batch.items()
            if key not in (
                'episode_index_backwards', 'episode_index_forwards')
        },
        samples)
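# Besides `last_n_batch`, the filled pool also supports random sampling, as
# the HER resampling test below relies on. A short sketch, assuming
# `random_batch` returns the same field layout that `last_n_batch` does:
batch = pool.random_batch(10)
assert set(batch['observations']) == set(env.observation_space.spaces)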
def test_create_pool(self):
    ENVIRONMENTS = (
        get_environment('gym', 'Swimmer', 'v3', {}),
        gym.make('Swimmer-v3'),
        gym.make('HandManipulateBlock-v0'),
    )

    for environment in ENVIRONMENTS:
        pool = create_pool(env=environment, max_size=100)

        def verify_field(field,
                         expected_name,
                         expected_dtype,
                         expected_shape):
            self.assertIsInstance(field, Field)
            self.assertEqual(field.name, expected_name)
            self.assertEqual(field.dtype, expected_dtype)
            self.assertEqual(field.shape, expected_shape)
            self.assertEqual(field.initializer, np.zeros)
            self.assertEqual(field.default_value, 0.0)

        if isinstance(environment.observation_space, gym.spaces.Dict):
            self.assertIsInstance(pool.fields['observations'], dict)
            for name, space in environment.observation_space.spaces.items():
                self.assertIn(name, pool.fields['observations'])
                field = pool.fields['observations'][name]
                verify_field(field, name, space.dtype, space.shape)
        elif isinstance(environment.observation_space, gym.spaces.Box):
            self.assertIsInstance(pool.fields['observations'], Field)
            verify_field(pool.fields['observations'],
                         'observations',
                         environment.observation_space.dtype,
                         environment.observation_space.shape)
        else:
            raise ValueError(environment.observation_space)

        verify_field(pool.fields['actions'],
                     'actions',
                     environment.action_space.dtype,
                     environment.action_space.shape)
        verify_field(pool.fields['rewards'], 'rewards', 'float32', (1, ))
        verify_field(pool.fields['terminals'], 'terminals', 'bool', (1, ))
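# The assertions above pin down the attributes a `Field` carries. Its actual
# definition is not shown here; the dataclass below is a hypothetical
# reconstruction inferred only from those assertions, and the real library
# may declare it differently (e.g. as a namedtuple).
from dataclasses import dataclass
from typing import Any, Callable, Tuple

import numpy as np


@dataclass
class Field:
    name: str
    dtype: Any
    shape: Tuple[int, ...]
    initializer: Callable = np.zeros
    default_value: float = 0.0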
def test_create_pool(self):
    env = get_environment('gym', 'Swimmer', 'v3', {})
    self.assertIsInstance(env.observation_space, gym.spaces.Dict)
    pool = create_pool(env=env, max_size=100)

    def verify_field(field, expected_name, expected_dtype, expected_shape):
        self.assertIsInstance(field, Field)
        self.assertEqual(field.name, expected_name)
        self.assertEqual(field.dtype, expected_dtype)
        self.assertEqual(field.shape, expected_shape)
        self.assertEqual(field.initializer, np.zeros)
        self.assertEqual(field.default_value, 0.0)

    self.assertIsInstance(pool.fields['observations'], dict)
    for name, space in env.observation_space.spaces.items():
        self.assertIn(name, pool.fields['observations'])
        field = pool.fields['observations'][name]
        verify_field(field, name, space.dtype, space.shape)

    verify_field(pool.fields['actions'],
                 'actions',
                 env.action_space.dtype,
                 env.action_space.shape)
    verify_field(pool.fields['rewards'], 'rewards', 'float32', (1, ))
    verify_field(pool.fields['terminals'], 'terminals', 'bool', (1, ))
def test_resampling(self, strategy_type, resampling_probability):
    env = get_environment('gym', 'HandReach', 'v0', {
        'observation_keys': ('observation', ),
        'goal_keys': ('desired_goal', ),
    })
    assert isinstance(env.observation_space, gym.spaces.Dict)

    max_size = 1000
    her_strategy = {
        'type': strategy_type,
        'resampling_probability': resampling_probability,
    }
    pool = create_pool(
        env=env,
        max_size=max_size,
        her_strategy=her_strategy,
    )
    strategy_validator = {
        'random': RandomStrategyValidator,
        'final': FinalStrategyValidator,
        'episode': EpisodeStrategyValidator,
        'future': FutureStrategyValidator,
    }[strategy_type](her_strategy=her_strategy)

    episode_lengths = []
    while pool.size < pool._max_size:
        # Episodes get random lengths so that episode boundaries land at
        # varying offsets in the pool.
        episode_length = np.random.randint(5, 50)
        episode_lengths.append(episode_length)
        samples = {
            'observations': {
                name: np.empty(
                    (episode_length, *space.shape), dtype=space.dtype)
                for name, space in env.observation_space.spaces.items()
            },
            'next_observations': {
                name: np.empty(
                    (episode_length, *space.shape), dtype=space.dtype)
                for name, space in env.observation_space.spaces.items()
            },
            'actions': np.empty(
                (episode_length, *env.action_space.shape)),
            'rewards': np.empty((episode_length, 1), dtype=np.float32),
            'terminals': np.empty((episode_length, 1), dtype=bool),
        }

        observation = env.reset()
        for i in range(episode_length):
            action = env.action_space.sample()
            next_observation, reward, terminal, info = env.step(action)
            for name, value in observation.items():
                samples['observations'][name][i, :] = value
                samples['next_observations'][name][i, :] = (
                    next_observation[name])
            samples['actions'][i] = action
            samples['rewards'][i] = reward
            samples['terminals'][i] = terminal
            observation = next_observation

        pool.add_path(samples)

    for i in range(100):
        random_batch = pool.random_batch(256)
        strategy_validator.verify_batch(random_batch)

    assert strategy_validator.statistics_match()
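# The four validators only need to satisfy the small interface this test
# exercises: construction with `her_strategy`, a per-batch `verify_batch`,
# and a final `statistics_match`. The skeleton below is a hedged sketch of
# that interface for the 'random' case; the `batch['resampled']` mask is a
# purely hypothetical field, and the real validators are not shown here.
import numpy as np


class RandomStrategyValidator:
    def __init__(self, her_strategy):
        self._expected_rate = her_strategy['resampling_probability']
        self._resampled = 0
        self._total = 0

    def verify_batch(self, batch):
        resampled = batch['resampled']  # hypothetical boolean mask
        self._resampled += int(np.sum(resampled))
        self._total += resampled.shape[0]

    def statistics_match(self, atol=0.05):
        # The observed resampling rate should match the configured
        # probability within sampling tolerance.
        observed_rate = self._resampled / max(self._total, 1)
        return abs(observed_rate - self._expected_rate) < atol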
def setUp(self):
    self.env = get_environment('gym', 'Swimmer', 'v3', {})
    self.policy = UniformPolicy(
        input_shapes=self.env.observation_shape,
        output_shape=self.env.action_space.shape,
        observation_keys=self.env.observation_keys)