def test_gumbel_softmax(self):
    """Tests the GumbelSoftmax ActionDistribution (tf + eager only).

    Verifies that the deterministic sample equals the softmax of the
    inputs and that the stochastic sample's argmax mean roughly matches
    the argmax mean of the inputs.
    """
    for fw, sess in framework_iterator(frameworks=["tf", "tfe"], session=True):
        batch_size = 1000
        num_categories = 5
        input_space = Box(-1.0, 1.0, shape=(batch_size, num_categories))
        # Batch of size=n and deterministic.
        inputs = input_space.sample()
        gumbel_softmax = GumbelSoftmax(inputs, {}, temperature=1.0)
        expected = softmax(inputs)
        # Sample n times, expect always mean value (deterministic draw).
        out = gumbel_softmax.deterministic_sample()
        check(out, expected)
        # Batch of size=n and non-deterministic -> expect roughly that
        # the max-likelihood (argmax) ints are output (most of the time).
        inputs = input_space.sample()
        gumbel_softmax = GumbelSoftmax(inputs, {}, temperature=1.0)
        expected_mean = np.mean(np.argmax(inputs, -1)).astype(np.float32)
        outs = gumbel_softmax.sample()
        if sess:
            # In graph mode, `sample()` returns a tensor -> evaluate it.
            outs = sess.run(outs)
        # Loose relative tolerance: Gumbel noise perturbs individual draws.
        check(np.mean(np.argmax(outs, -1)), expected_mean, rtol=0.08)
def test_vtrace(self):
    """Tests V-trace against ground truth data calculated in python.

    Runs `from_importance_weights` for both the tf and torch
    implementations and compares against a pure-python reference.
    """
    seq_len = 5
    batch_size = 10
    # Create log_rhos such that rho will span from near-zero to above the
    # clipping thresholds. In particular, calculate log_rhos in
    # [-2.5, 2.5),
    # so that rho is in approx [0.08, 12.2).
    space_w_time = Box(-1.0, 1.0, (seq_len, batch_size), np.float32)
    space_only_batch = Box(-1.0, 1.0, (batch_size, ), np.float32)
    # NOTE(review): sample() draws uniformly from [-1.0, 1.0); after
    # dividing by batch_size * seq_len the values are tiny, so
    # 5 * (x - 0.5) actually lands in approx [-2.6, -2.4] rather than the
    # [-2.5, 2.5) range described above -- confirm whether the narrower
    # range is intended.
    log_rhos = space_w_time.sample() / (batch_size * seq_len)
    log_rhos = 5 * (log_rhos - 0.5)  # [0.0, 1.0) -> [-2.5, 2.5).
    values = {
        "log_rhos": log_rhos,
        # T, B where B_i: [0.9 / (i+1)] * T
        "discounts": np.array([[0.9 / (b + 1) for b in range(batch_size)]
                               for _ in range(seq_len)]),
        "rewards": space_w_time.sample(),
        "values": space_w_time.sample() / batch_size,
        "bootstrap_value": space_only_batch.sample() + 1.0,
        "clip_rho_threshold": 3.7,
        "clip_pg_rho_threshold": 2.2,
    }
    for fw, sess in framework_iterator(
            frameworks=("torch", "tf"), session=True):
        # Pick the framework-specific vtrace implementation.
        vtrace = vtrace_tf if fw != "torch" else vtrace_torch
        output = vtrace.from_importance_weights(**values)
        if sess:
            output = sess.run(output)
        # Compare against the pure-python ground-truth calculation.
        ground_truth_v = _ground_truth_calculation(vtrace, **values)
        check(output, ground_truth_v)
def test_multi_categorical(self):
    """Tests MultiCategorical sampling, logp and entropy (tf + torch)."""
    batch_size = 100
    num_categories = 3
    num_sub_distributions = 5
    # Create 5 categorical distributions of 3 categories each.
    inputs_space = Box(-1.0, 2.0,
                       shape=(batch_size,
                              num_sub_distributions * num_categories))
    values_space = Box(0, num_categories - 1,
                       shape=(num_sub_distributions, batch_size),
                       dtype=np.int32)
    inputs = inputs_space.sample()
    input_lengths = [num_categories] * num_sub_distributions
    # Split the flat logits into one chunk per sub-distribution.
    inputs_split = np.split(inputs, num_sub_distributions, axis=1)
    for fw in framework_iterator():
        # Create the correct distribution object.
        cls = MultiCategorical if fw != "torch" else TorchMultiCategorical
        multi_categorical = cls(inputs, None, input_lengths)
        # Batch of size=100 and deterministic (True).
        expected = np.transpose(np.argmax(inputs_split, axis=-1))
        # Sample, expect always max value
        # (max likelihood for deterministic draw).
        out = multi_categorical.deterministic_sample()
        check(out, expected)
        # Batch of size=100 and non-deterministic -> expect roughly the
        # mean (1.0 for categories {0, 1, 2}).
        out = multi_categorical.sample()
        check(tf.reduce_mean(out)
              if fw != "torch" else torch.mean(out.float()),
              1.0,
              decimals=0)
        # Test log-likelihood outputs.
        probs = softmax(inputs_split)
        values = values_space.sample()
        out = multi_categorical.logp(values if fw != "torch" else [
            torch.Tensor(values[i]) for i in range(num_sub_distributions)
        ])
        expected = []
        for i in range(batch_size):
            # Sum of log-probs over all sub-distributions for sample i.
            expected.append(
                np.sum(
                    np.log(
                        np.array([
                            probs[j][i][values[j][i]]
                            for j in range(num_sub_distributions)
                        ]))))
        check(out, expected, decimals=4)
        # Test entropy outputs (summed over sub-distributions).
        out = multi_categorical.entropy()
        expected_entropy = -np.sum(np.sum(probs * np.log(probs), 0), -1)
        check(out, expected_entropy)
def test_categorical(self):
    """Tests the Categorical distribution (tf, tf2, torch)."""
    batch_size = 10000
    num_categories = 4
    # Create categorical distribution with n categories.
    inputs_space = Box(-1.0, 2.0,
                       shape=(batch_size, num_categories),
                       dtype=np.float32)
    values_space = Box(0, num_categories - 1,
                       shape=(batch_size, ),
                       dtype=np.int32)
    inputs = inputs_space.sample()
    # NOTE(review): the jax branches below are dead code here since "jax"
    # is not in the frameworks tuple -- confirm whether jax should be
    # iterated as well.
    for fw, sess in framework_iterator(session=True,
                                       frameworks=("tf", "tf2", "torch")):
        # Create the correct distribution object.
        cls = JAXCategorical if fw == "jax" else Categorical if \
            fw != "torch" else TorchCategorical
        categorical = cls(inputs, {})
        # Do a stability test using extreme NN outputs to see whether
        # sampling and logp'ing result in NaN or +/-inf values.
        self._stability_test(cls,
                             inputs_space.shape,
                             fw=fw,
                             sess=sess,
                             bounds=(0, num_categories - 1))
        # Batch of size=10000 and deterministic (True).
        expected = np.transpose(np.argmax(inputs, axis=-1))
        # Sample, expect always max value
        # (max likelihood for deterministic draw).
        out = categorical.deterministic_sample()
        check(out, expected)
        # Batch of size=10000 and non-deterministic -> expect roughly the
        # mean (1.5 would be uniform over {0..3}; checked to 0 decimals).
        out = categorical.sample()
        check(np.mean(out) if fw == "jax" else tf.reduce_mean(out)
              if fw != "torch" else torch.mean(out.float()),
              1.0,
              decimals=0)
        # Test log-likelihood outputs.
        probs = softmax(inputs)
        values = values_space.sample()
        out = categorical.logp(
            values if fw != "torch" else torch.Tensor(values))
        expected = []
        for i in range(batch_size):
            expected.append(np.sum(np.log(np.array(probs[i][values[i]]))))
        check(out, expected, decimals=4)
        # Test entropy outputs.
        out = categorical.entropy()
        expected_entropy = -np.sum(probs * np.log(probs), -1)
        check(out, expected_entropy)
def test_beta(self):
    """Tests the Beta distribution (all frameworks).

    Checks the deterministic sample against the analytic mean, checks
    stochastic samples stay within [low, high], and compares logp
    against scipy's beta pdf.
    """
    input_space = Box(-2.0, 1.0, shape=(2000, 10))
    input_space.seed(42)
    low, high = -1.0, 2.0
    # Values in the unscaled [0, 1] Beta support, used for logp tests.
    plain_beta_value_space = Box(0.0, 1.0, shape=(2000, 5))
    plain_beta_value_space.seed(42)
    for fw, sess in framework_iterator(session=True):
        cls = TorchBeta if fw == "torch" else Beta
        inputs = input_space.sample()
        beta_distribution = cls(inputs, {}, low=low, high=high)
        # Read back the (possibly clipped/transformed) inputs actually
        # used by the distribution.
        inputs = beta_distribution.inputs
        if sess:
            inputs = sess.run(inputs)
        else:
            inputs = inputs.numpy()
        alpha, beta_ = np.split(inputs, 2, axis=-1)
        # Mean for a Beta distribution: 1 / [1 + (beta/alpha)],
        # then rescaled from [0, 1] to [low, high].
        expected = (1.0 / (1.0 + beta_ / alpha)) * (high - low) + low
        # Sample n times, expect always mean value (deterministic draw).
        out = beta_distribution.deterministic_sample()
        check(out, expected, rtol=0.01)
        # Batch of size=n and non-deterministic -> expect roughly the mean.
        values = beta_distribution.sample()
        if sess:
            values = sess.run(values)
        else:
            values = values.numpy()
        self.assertTrue(np.max(values) <= high)
        self.assertTrue(np.min(values) >= low)
        check(np.mean(values), expected.mean(), decimals=1)
        # Test log-likelihood outputs (against scipy).
        inputs = input_space.sample()
        beta_distribution = cls(inputs, {}, low=low, high=high)
        inputs = beta_distribution.inputs
        if sess:
            inputs = sess.run(inputs)
        else:
            inputs = inputs.numpy()
        alpha, beta_ = np.split(inputs, 2, axis=-1)
        values = plain_beta_value_space.sample()
        # Scale the plain [0, 1] values into the distribution's support.
        values_scaled = values * (high - low) + low
        if fw == "torch":
            values_scaled = torch.Tensor(values_scaled)
        # (Removed leftover debug print of values_scaled.)
        out = beta_distribution.logp(values_scaled)
        check(
            out, np.sum(np.log(beta.pdf(values, alpha, beta_)), -1),
            rtol=0.01)
def test_space_utils():
    """Round-trip flatten/unflatten checks for Box, Discrete, Tuple, Dict."""
    # Box with scalar bounds.
    box = Box(-1.0, 1.0, shape=[2, 3], dtype=np.float32)
    drawn = box.sample()
    assert flatdim(box) == 2 * 3
    assert flatten(box, drawn).shape == (2 * 3, )
    assert np.allclose(drawn, unflatten(box, flatten(box, drawn)))

    # Box with array bounds.
    bounds = np.array([[1.0, 1.0], [1.0, 1.0]])
    box = Box(low=-bounds, high=bounds, dtype=np.float32)
    drawn = box.sample()
    assert flatdim(box) == 2 * 2
    assert flatten(box, drawn).shape == (2 * 2, )
    assert np.allclose(drawn, unflatten(box, flatten(box, drawn)))

    # Discrete: flattening is one-hot of width n.
    discrete = Discrete(5)
    drawn = discrete.sample()
    assert flatdim(discrete) == 5
    assert flatten(discrete, drawn).shape == (5, )
    assert drawn == unflatten(discrete, flatten(discrete, drawn))

    # Tuple of mixed sub-spaces (incl. a nested Dict).
    S = Tuple([
        Discrete(5),
        Box(-1.0, 1.0, shape=(2, 3), dtype=np.float32),
        Dict({
            'success': Discrete(2),
            'velocity': Box(-1, 1, shape=(1, 3), dtype=np.float32)
        })
    ])
    drawn = S.sample()
    assert flatdim(S) == 5 + 2 * 3 + 2 + 3
    assert flatten(S, drawn).shape == (16, )
    restored = unflatten(S, flatten(S, drawn))
    assert drawn[0] == restored[0]
    assert np.allclose(drawn[1], restored[1])
    assert drawn[2]['success'] == restored[2]['success']
    assert np.allclose(drawn[2]['velocity'], restored[2]['velocity'])

    # Nested Dict.
    D0 = Dict({
        'position': Box(-100, 100, shape=(3, ), dtype=np.float32),
        'velocity': Box(-1, 1, shape=(4, ), dtype=np.float32)
    })
    D = Dict({'sensors': D0, 'score': Discrete(100)})
    drawn = D.sample()
    assert flatdim(D) == 3 + 4 + 100
    assert flatten(D, drawn).shape == (107, )
    restored = unflatten(D, flatten(D, drawn))
    assert drawn['score'] == restored['score']
    assert np.allclose(drawn['sensors']['position'],
                       restored['sensors']['position'])
    assert np.allclose(drawn['sensors']['velocity'],
                       restored['sensors']['velocity'])
def test_trajectory(self):
    """Tests the Trajectory class.

    Exercises buffer initialization, buffer auto-extension when the
    preset size is exceeded, and conversion to a SampleBatch with reset.
    """
    buffer_size = 5
    # Small trajectory object for testing purposes.
    trajectory = Trajectory(buffer_size=buffer_size)
    # Freshly constructed: all counters at 0, no buffers allocated yet.
    self.assertEqual(trajectory.cursor, 0)
    self.assertEqual(trajectory.timestep, 0)
    self.assertEqual(trajectory.sample_batch_offset, 0)
    assert not trajectory.buffers
    observation_space = Box(-1.0, 1.0, shape=(3, ))
    action_space = Discrete(2)
    # Adding the initial observation must not advance the cursor.
    trajectory.add_init_obs(env_id=0,
                            agent_id="agent",
                            policy_id="policy",
                            init_obs=observation_space.sample())
    self.assertEqual(trajectory.cursor, 0)
    self.assertEqual(trajectory.initial_obs.shape, observation_space.shape)
    # Fill up the buffer and make it extend if it hits the limit.
    cur_buffer_size = buffer_size
    for i in range(buffer_size + 1):
        trajectory.add_action_reward_next_obs(
            env_id=0,
            agent_id="agent",
            policy_id="policy",
            values=dict(
                t=i,
                actions=action_space.sample(),
                rewards=1.0,
                dones=i == buffer_size,
                new_obs=observation_space.sample(),
                action_logp=-0.5,
                action_dist_inputs=np.array([[0.5, 0.5]]),
            ))
        self.assertEqual(trajectory.cursor, i + 1)
        self.assertEqual(trajectory.timestep, i + 1)
        self.assertEqual(trajectory.sample_batch_offset, 0)
        # Hitting the limit should double the underlying buffers.
        if i == buffer_size - 1:
            cur_buffer_size *= 2
        self.assertEqual(len(trajectory.buffers["new_obs"]),
                         cur_buffer_size)
        self.assertEqual(len(trajectory.buffers["rewards"]),
                         cur_buffer_size)
    # Create a SampleBatch from the Trajectory and reset it.
    batch = trajectory.get_sample_batch_and_reset()
    self.assertEqual(batch.count, buffer_size + 1)
    # Make sure, Trajectory was reset properly: timestep cleared, offset
    # moved past the consumed range (cursor itself is not rewound).
    self.assertEqual(trajectory.cursor, buffer_size + 1)
    self.assertEqual(trajectory.timestep, 0)
    self.assertEqual(trajectory.sample_batch_offset, buffer_size + 1)
class FakeImageEnv(Env):
    """
    Fake image environment for testing purposes, it mimics Atari games.

    :param action_dim: Number of discrete actions
    :param screen_height: Height of the image
    :param screen_width: Width of the image
    :param n_channels: Number of color channels
    :param discrete: Create discrete action space instead of continuous
    :param channel_first: Put channels on first axis instead of last
    """

    def __init__(
        self,
        action_dim: int = 6,
        screen_height: int = 84,
        screen_width: int = 84,
        n_channels: int = 1,
        discrete: bool = True,
        channel_first: bool = False,
    ):
        # Channel-last (HWC) by default, channel-first (CHW) on request.
        self.observation_shape = (screen_height, screen_width, n_channels)
        if channel_first:
            self.observation_shape = (n_channels, screen_height,
                                      screen_width)
        self.observation_space = Box(low=0,
                                     high=255,
                                     shape=self.observation_shape,
                                     dtype=np.uint8)
        if discrete:
            self.action_space = Discrete(action_dim)
        else:
            # NOTE(review): the continuous branch hardcodes a 5-dim action
            # space and ignores `action_dim` -- confirm whether this is
            # intentional.
            self.action_space = Box(low=-1,
                                    high=1,
                                    shape=(5, ),
                                    dtype=np.float32)
        # Fixed episode length of 10 steps.
        self.ep_length = 10
        self.current_step = 0

    def reset(self) -> np.ndarray:
        """Reset the step counter and return a random observation."""
        self.current_step = 0
        return self.observation_space.sample()

    def step(self, action: Union[np.ndarray, int]) -> GymStepReturn:
        """Ignore the action; return random obs, zero reward, done at ep_length."""
        reward = 0.0
        self.current_step += 1
        done = self.current_step >= self.ep_length
        return self.observation_space.sample(), reward, done, {}

    def render(self, mode: str = "human") -> None:
        """No-op rendering."""
        pass
def test_json(self):
    """Test JSON conversions.

    Checks that StackedBoxSpace's to_jsonable/from_jsonable are inverse
    operations and that to_jsonable transposes the stack axis to the
    front (stack index becomes the outermost list dimension).
    """
    box_space = Box(low=np.array([[1., 2.], [3., 4.]]),
                    high=np.array([[1.3, 4.9], [3.5, 5.]]))
    space = StackedBoxSpace(box_space, 2)
    samples = [box_space.sample() for _ in range(5)]
    jsoned = space.to_jsonable(samples)
    # Round trip through from_jsonable must reproduce the JSON form.
    self.assertEqual(space.to_jsonable(space.from_jsonable(jsoned)),
                     jsoned)
    in_data = [[
        np.array([[1.1, 2.1], [3.1, 4.1]]),
        np.array([[1.2, 2.2], [3.2, 4.2]])
    ], [
        np.array([[1.11, 2.11], [3.3, 4.]]),
        np.array([[1.21, 2.2], [3.23, 4.21]])
    ]]
    # Note: the expected output is transposed w.r.t. in_data -- the
    # stack index moves to the outermost dimension.
    self.assertEqual(
        space.to_jsonable(in_data),
        [[[[1.1, 2.1], [3.1, 4.1]], [[1.11, 2.11], [3.3, 4.]]],
         [[[1.2, 2.2], [3.2, 4.2]], [[1.21, 2.2], [3.23, 4.21]]]])
    inverted = space.from_jsonable(space.to_jsonable(in_data))
    for idx1, idx2 in [(0, 0), (0, 1), (1, 0), (1, 1)]:
        self.assertTrue(
            np.allclose(in_data[idx1][idx2], inverted[idx1][idx2]))
def test_default_models(self):
    """Tests that ModelCatalog picks the correct default model per space.

    1-D Box obs -> FullyConnectedNetwork (all frameworks); image-shaped
    Box obs -> VisionNetwork (all frameworks except jax).
    """
    ray.init(object_store_memory=1000 * 1024 * 1024)
    for fw in framework_iterator(frameworks=("jax", "tf", "tf2", "torch")):
        obs_space = Box(0, 1, shape=(3, ), dtype=np.float32)
        p1 = ModelCatalog.get_model_v2(
            obs_space=obs_space,
            action_space=Discrete(5),
            num_outputs=5,
            model_config={},
            framework=fw,
        )
        self.assertTrue("FullyConnectedNetwork" in type(p1).__name__)
        # Do a test forward pass.
        obs = np.array([obs_space.sample()])
        if fw == "torch":
            obs = torch.from_numpy(obs)
        out, state_outs = p1({"obs": obs})
        self.assertTrue(out.shape == (1, 5))
        self.assertTrue(state_outs == [])
        # No Conv2Ds for JAX yet.
        if fw != "jax":
            p2 = ModelCatalog.get_model_v2(
                obs_space=Box(0, 1, shape=(84, 84, 3), dtype=np.float32),
                action_space=Discrete(5),
                num_outputs=5,
                model_config={},
                framework=fw,
            )
            self.assertTrue("VisionNetwork" in type(p2).__name__)
def sample_slider_position(T_aug=None):
    """Sample a slider pose at the origin with a uniformly random yaw.

    :param T_aug: optional 4x4 augmentation transform applied on top of
        the sampled origin pose.
    :return: pose vector q = [quaternion, position] (quat first!).
    """
    # The slider always sits (essentially) at the origin.
    pos = np.array([0, 0, 0.03])
    # Draw a yaw uniformly in [0, 2*pi).
    yaw_sampler = Box(np.array([0]), np.array([2 * np.pi]))
    yaw = yaw_sampler.sample()
    quat = transforms3d.euler.euler2quat(0, 0, yaw)
    T_O_slider = transform_utils.transform_from_pose(pos, quat)
    # Optionally augment the pose with an extra world transform.
    T_W_slider = T_aug @ T_O_slider if T_aug is not None else T_O_slider
    pose_dict = transform_utils.matrix_to_dict(T_W_slider)
    # Note the quat/pos ordering: quaternion comes first.
    return np.concatenate((pose_dict['quaternion'], pose_dict['position']))
def test_stacked_box_space_json():
    """
    Test JSON conversions for StackedBoxSpace.
    """
    inner_box = Box(low=np.array([[1., 2.], [3., 4.]]),
                    high=np.array([[1.3, 4.9], [3.5, 5.]]))
    stacked = StackedBoxSpace(inner_box, 2)

    # Random samples must survive a to_jsonable/from_jsonable round trip.
    drawn = [inner_box.sample() for _ in range(5)]
    as_json = stacked.to_jsonable(drawn)
    assert stacked.to_jsonable(stacked.from_jsonable(as_json)) == as_json

    in_data = [[
        np.array([[1.1, 2.1], [3.1, 4.1]]),
        np.array([[1.2, 2.2], [3.2, 4.2]])
    ], [
        np.array([[1.11, 2.11], [3.3, 4.]]),
        np.array([[1.21, 2.2], [3.23, 4.21]])
    ]]
    # The JSON form transposes the stack axis to the outermost dimension.
    expected_json = [[[[1.1, 2.1], [3.1, 4.1]], [[1.11, 2.11], [3.3, 4.]]],
                     [[[1.2, 2.2], [3.2, 4.2]],
                      [[1.21, 2.2], [3.23, 4.21]]]]
    assert stacked.to_jsonable(in_data) == expected_json

    # Round-tripping fixed data must reproduce every array element.
    recovered = stacked.from_jsonable(stacked.to_jsonable(in_data))
    for outer, inner in [(0, 0), (0, 1), (1, 0), (1, 1)]:
        assert np.allclose(in_data[outer][inner], recovered[outer][inner])
def __init__(
        self,
        wrapped_env,
        encoder: Encoder,
        encoder_input_prefix,
        key_prefix='encoder',
        reward_mode='encoder_distance',
):
    """
    :param wrapped_env: the environment to wrap.
    :param encoder: Encoder used to embed observations/goals; its
        min_embedding/max_embedding define the latent Box space.
    :param encoder_input_prefix: prefix of the wrapped env's dict keys
        that are fed into the encoder.
    :param key_prefix: prefix for the new latent observation-space keys
        added by this wrapper.
    :param reward_mode:
     - 'encoder_distance': l1 distance in encoder distance
     - 'vectorized_encoder_distance': vectorized l1 distance in encoder
       distance, i.e. negative absolute value
     - 'env': use the wrapped env's reward
    :raises ValueError: if reward_mode is not one of the above.
    """
    super().__init__(wrapped_env)
    # Validate reward_mode early, before any state is set up.
    if reward_mode not in {
            self.ENCODER_DISTANCE_REWARD,
            self.VECTORIZED_ENCODER_DISTANCE_REWARD,
            self.ENV_REWARD,
    }:
        raise ValueError(reward_mode)
    self._encoder = encoder
    # Keys (in the wrapped env's obs dict) that feed the encoder.
    self._encoder_input_obs_key = '{}_observation'.format(
        encoder_input_prefix)
    self._encoder_input_desired_goal_key = '{}_desired_goal'.format(
        encoder_input_prefix
    )
    self._encoder_input_achieved_goal_key = '{}_achieved_goal'.format(
        encoder_input_prefix
    )
    self._reward_mode = reward_mode
    spaces = self.wrapped_env.observation_space.spaces
    # Latent space bounded by the encoder's embedding range.
    latent_space = Box(
        encoder.min_embedding,
        encoder.max_embedding,
        dtype=np.float32,
    )
    self._embedding_size = encoder.min_embedding.size
    # Keys under which this wrapper exposes latent obs/goals.
    self._obs_key = '{}_observation'.format(key_prefix)
    self._desired_goal_key = '{}_desired_goal'.format(key_prefix)
    self._achieved_goal_key = '{}_achieved_goal'.format(key_prefix)
    self._distance_name = '{}_distance'.format(key_prefix)
    self._key_prefix = key_prefix
    # Placeholder desired goal (zeros of the latent shape/dtype).
    self._desired_goal = {
        self._desired_goal_key: np.zeros_like(latent_space.sample())
    }
    # Extend the wrapped env's observation space with the latent keys.
    spaces[self._obs_key] = latent_space
    spaces[self._desired_goal_key] = latent_space
    spaces[self._achieved_goal_key] = latent_space
    self.observation_space = Dict(spaces)
    self._goal_sampling_mode = 'env'
def test_seed_Dict():
    """Seeding a Dict space with a dict of seeds must match seeding the
    equivalent stand-alone sub-spaces individually."""
    composite_space = Dict(
        {
            "a": Box(low=0, high=1, shape=(3, 3)),
            "b": Dict(
                {
                    "b_1": Box(low=-100, high=100, shape=(2,)),
                    "b_2": Box(low=-1, high=1, shape=(2,)),
                }
            ),
            "c": Discrete(5),
        }
    )
    composite_space.seed(
        {
            "a": 0,
            "b": {
                "b_1": 1,
                "b_2": 2,
            },
            "c": 3,
        }
    )

    # Build stand-alone copies of every sub-space, seeded identically.
    a = Box(low=0, high=1, shape=(3, 3))
    a.seed(0)
    b_1 = Box(low=-100, high=100, shape=(2,))
    b_1.seed(1)
    b_2 = Box(low=-1, high=1, shape=(2,))
    b_2.seed(2)
    c = Discrete(5)
    c.seed(3)

    # Draws from the composite space must equal the stand-alone draws,
    # element for element, over several iterations.
    for _ in range(10):
        composite = composite_space.sample()
        assert (composite["a"] == a.sample()).all()
        assert (composite["b"]["b_1"] == b_1.sample()).all()
        assert (composite["b"]["b_2"] == b_2.sample()).all()
        assert composite["c"] == c.sample()
def sample_pusher_velocity(self):
    """Sample a pusher velocity of fixed magnitude 100 with a heading
    drawn uniformly from +/-30 degrees."""
    half_cone = np.deg2rad(30)
    heading_space = Box(-half_cone, half_cone)
    heading = heading_space.sample()
    speed = 100
    # Convert polar (speed, heading) to a 2-D cartesian velocity.
    return speed * np.array([np.cos(heading), np.sin(heading)])
class FakeImageEnv(Env):
    """
    Fake image environment for testing purposes, it mimics Atari games.

    :param action_dim: (int) Number of discrete actions
    :param screen_height: (int) Height of the image
    :param screen_width: (int) Width of the image
    :param n_channels: (int) Number of color channels
    :param discrete: (bool) Create a discrete action space instead of a
        continuous one
    """

    def __init__(self,
                 action_dim: int = 6,
                 screen_height: int = 84,
                 screen_width: int = 84,
                 n_channels: int = 1,
                 discrete: bool = True):
        # Channel-last (HWC) uint8 image observations, like raw Atari.
        self.observation_space = Box(low=0,
                                     high=255,
                                     shape=(screen_height, screen_width,
                                            n_channels),
                                     dtype=np.uint8)
        if discrete:
            self.action_space = Discrete(action_dim)
        else:
            # NOTE(review): the continuous branch hardcodes a 5-dim action
            # space and ignores `action_dim` -- confirm whether this is
            # intentional.
            self.action_space = Box(low=-1,
                                    high=1,
                                    shape=(5, ),
                                    dtype=np.float32)
        # Fixed episode length of 10 steps.
        self.ep_length = 10
        self.current_step = 0

    def reset(self) -> np.ndarray:
        """Reset the step counter and return a random observation."""
        self.current_step = 0
        return self.observation_space.sample()

    def step(self, action: Union[np.ndarray, int]) -> GymStepReturn:
        """Ignore the action; return random obs, zero reward, done at ep_length."""
        reward = 0.0
        self.current_step += 1
        done = self.current_step >= self.ep_length
        return self.observation_space.sample(), reward, done, {}

    def render(self, mode: str = 'human') -> None:
        """No-op rendering."""
        pass
class RandomPolicy(Policy):
    """Hand-coded policy that returns random actions."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Whether for compute_actions, the bounds given in action_space
        # should be ignored (default: False). This is to test action-clipping
        # and any Env's reaction to bounds breaches.
        if self.config.get("ignore_action_bounds", False) and isinstance(
                self.action_space, Box):
            # Unbounded Box with the same shape/dtype, so samples may
            # deliberately violate the declared action bounds.
            self.action_space_for_sampling = Box(
                -float("inf"),
                float("inf"),
                shape=self.action_space.shape,
                dtype=self.action_space.dtype,
            )
        else:
            self.action_space_for_sampling = self.action_space

    @override(Policy)
    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        **kwargs):
        """Return one random action per observation (no state, no extras)."""
        # Alternatively, a numpy array would work here as well.
        # e.g.: np.array([random.choice([0, 1])] * len(obs_batch))
        return [self.action_space_for_sampling.sample()
                for _ in obs_batch], [], {}

    @override(Policy)
    def learn_on_batch(self, samples):
        """No learning."""
        return {}

    @override(Policy)
    def compute_log_likelihoods(
            self,
            actions,
            obs_batch,
            state_batches=None,
            prev_action_batch=None,
            prev_reward_batch=None,
    ):
        """Return random (meaningless) log-likelihoods, one per obs."""
        return np.array([random.random()] * len(obs_batch))

    @override(Policy)
    def get_weights(self) -> ModelWeights:
        """No weights to save."""
        return {}

    @override(Policy)
    def set_weights(self, weights: ModelWeights) -> None:
        """No weights to set."""
        pass
class SimpleEnv(Env):
    """Minimal 1-D continuous env: reward is 1.0 - |action - state|."""

    def __init__(self, config):
        self.action_space = Box(0.0, 1.0, (1, ))
        self.observation_space = Box(0.0, 1.0, (1, ))
        # Episode length, configurable via the config dict.
        self.max_steps = config.get("max_steps", 100)
        self.state = None
        self.steps = None

    def reset(self):
        """Draw a fresh random state and restart the step counter."""
        self.steps = 0
        self.state = self.observation_space.sample()
        return self.state

    def step(self, action):
        """Advance one step; reward is 1.0 - (action - state)."""
        self.steps += 1
        # Unpack the single-element reward array into a scalar.
        [reward] = 1.0 - np.abs(action - self.state)
        done = self.steps >= self.max_steps
        self.state = self.observation_space.sample()
        return self.state, reward, done, {}
class MockEnv(gym.Env):  # pylint:disable=abstract-method
    """Dummy environment with continuous action space."""

    def __init__(self, _):
        # Fixed horizon; last obs dim encodes normalized time in [0, 1].
        self.horizon = 200
        self.time = 0
        low = np.array([-1] * 3 + [0], dtype=np.float32)
        high = np.array([1] * 4, dtype=np.float32)
        self.observation_space = Box(low=low, high=high)
        action_dim = 3
        self.action_space = Box(high=1,
                                low=-1,
                                shape=(action_dim, ),
                                dtype=np.float32)
        # Goal is the origin in the spatial (first 3) dimensions.
        self.goal = torch.zeros(*self.observation_space.shape)[..., :-1]
        self.state = None

    @override(gym.Env)
    def reset(self):
        """Sample a random state and zero out the time dimension."""
        self.time = 0
        self.state = self.observation_space.sample()
        self.state[-1] = 0
        return self.state

    @override(gym.Env)
    def step(self, action):
        """Move state by `action` (clipped to obs bounds); reward is the
        distance of the spatial state from the goal."""
        self.time += 1
        self.state[:3] = np.clip(
            self.state[:3] + action,
            self.observation_space.low[:3],
            self.observation_space.high[:3],
        )
        # Last dim tracks normalized episode time.
        self.state[-1] = self.time / self.horizon
        reward = np.linalg.norm((self.state[:3] - self.goal.numpy()),
                                axis=-1)
        return self.state, reward, self.time >= self.horizon, {}

    @staticmethod
    def reward_fn(state, action, next_state):
        # pylint:disable=missing-docstring,unused-argument
        return torch.norm(next_state[..., :3], dim=-1)

    def dynamics_fn(self, state, action):
        """Differentiable (torch) model of `step` for batched states."""
        state, time = state[..., :3], state[..., 3:]
        new_state = state + action
        # NOTE(review): the clamp below uses action_space bounds; `step`
        # clips with observation_space bounds. Both are [-1, 1] for the
        # first 3 dims here, so results coincide -- confirm this is
        # intentional rather than a copy-paste slip.
        new_state = torch.max(
            torch.min(new_state, torch.from_numpy(self.action_space.high)),
            torch.from_numpy(self.action_space.low),
        )
        time = time * self.horizon
        new_time = torch.clamp((time + 1) / self.horizon, min=0, max=1)
        return torch.cat([new_state, new_time], dim=-1), None
class UnittestSlowEnv(gym.Env):
    """Test env with an artificially slow reset; `step` sleeps for
    `action` seconds (the action itself is the sleep duration)."""

    def __init__(self, slow_reset=0.3):
        super(UnittestSlowEnv, self).__init__()
        self.slow_reset = slow_reset
        self.observation_space = Box(low=0,
                                     high=255,
                                     shape=(HEIGHT, WIDTH, 3),
                                     dtype=np.uint8)
        # Scalar action in [0, 1] interpreted as a sleep time.
        self.action_space = Box(low=0., high=1., shape=(),
                                dtype=np.float32)

    def reset(self):
        """Sleep for `slow_reset` seconds (if positive), then sample."""
        if self.slow_reset > 0:
            time.sleep(self.slow_reset)
        return self.observation_space.sample()

    def step(self, action):
        """Sleep for `action` seconds; episode never terminates."""
        time.sleep(action)
        obs = self.observation_space.sample()
        return obs, 0., False, {}
class ReachTorque:
    """Torque-controlled reaching task backed by an external simulation.

    State layout (concatenated): target workspace position, joint
    positions (radians), joint velocities (radians/s). Actions are joint
    torques bounded by the robot's max torques.
    """

    def __init__(self, settings, simulation):
        self.settings = settings
        self.sim = simulation
        # State bounds: [workspace pos, joint angles (rad), joint vels].
        stateMin = np.concatenate(
            (settings['robot']['workspace-min'],
             np.radians(settings['robot']['joint-min']),
             -np.radians(settings['robot']['max-velocities'])))
        stateMax = np.concatenate(
            (settings['robot']['workspace-max'],
             np.radians(settings['robot']['joint-max']),
             np.radians(settings['robot']['max-velocities'])))
        self.action_space = Box(-np.array(settings['robot']['max-torques']),
                                np.array(settings['robot']['max-torques']))
        self.observation_space = Box(stateMin, stateMax)
        self.sim.readDistance(settings['error-object-name'])
        # Normalizes the velocity penalty by the max possible speed norm.
        self.rewardVelFactor = 1 / np.linalg.norm(
            np.radians(settings['robot']['max-velocities']))

    def close(self):
        """Shut down the underlying simulation."""
        self.sim.close()

    def reset(self):
        """Stop the sim, randomize target + robot pose, restart, and
        return the initial state."""
        self.sim.stop()
        self.state = self.observation_space.sample()
        # First segment of the sampled state is the target position.
        ref = self.state[:len(self.settings['robot']['workspace-min'])]
        self.sim.setDummyPosition(self.settings['target-object-name'], ref)
        # Middle segment is the joint pose; velocities are zeroed.
        pose = self.state[len(self.settings['robot']['workspace-min']
                              ):-len(self.settings['robot']
                                     ['max-velocities'])]
        self.sim.setPose(pose)
        self.sim.setVelocities(np.zeros(self.action_space.low.size))
        self.sim.start()
        self.sim.step()
        # Overwrite pose+velocity part with the sim's actual robot state.
        self.state[len(self.settings['robot']['workspace-min']
                       ):] = np.concatenate(self.sim.getRobotState())
        self.curStep = 0
        return self.state

    def render(self):
        """No-op; the external simulator handles visualization."""
        pass

    def step(self, action):
        """Apply joint torques, advance the sim one tick, and return
        (state, reward, reset, None).

        Reward = -(distance to target) - velocity-magnitude penalty.
        """
        self.curStep += 1
        self.sim.setTorques(action)
        self.sim.step()
        self.state[len(self.settings['robot']['workspace-min']
                       ):] = np.concatenate(self.sim.getRobotState())
        error = self.sim.readDistance(self.settings['error-object-name'])
        # Penalize residual joint velocities (last action-dim entries).
        reward = -error - np.linalg.norm(
            self.state[-self.action_space.low.size:]) * self.rewardVelFactor
        reset = self.curStep >= self.settings['max-steps']
        return self.state, reward, reset, None
class DummyAtari(gym.Env):
    """Dummy Atari-like env with random uint8 image observations.

    :param grayscale: use a single-channel observation if True.
    :param squeeze: with grayscale, drop the trailing channel axis.
    Episodes terminate every 80 steps (``t % 80 == 0``).
    """

    def __init__(self, grayscale=True, squeeze=False):
        if grayscale:
            shape = (84, 84) if squeeze else (84, 84, 1)
        else:
            shape = (84, 84, 3)
        self.observation_space = Box(
            low=np.zeros(shape),
            high=np.zeros(shape) + 255,
            shape=shape,
            dtype=np.uint8,
        )
        self.action_space = Discrete(4)
        self.t = 1

    def step(self, action):
        """Return a random obs/reward; done every 80th step."""
        observation = self.observation_space.sample()
        reward = np.random.random()
        # Bug fix: the counter was never advanced, so `self.t % 80 == 0`
        # was always False and episodes never terminated.
        self.t += 1
        return observation, reward, self.t % 80 == 0, {}

    def reset(self):
        """Restart the step counter and return a random observation."""
        self.t = 1
        return self.observation_space.sample()
class UnittestEnv(gym.Env):
    """Test env with random image observations and a fixed-length
    episode of `max_length` steps."""

    def __init__(self, max_length):
        super(UnittestEnv, self).__init__()
        self.max_length = max_length
        self._length = 0
        self.observation_space = Box(low=0,
                                     high=255,
                                     shape=(HEIGHT, WIDTH, 3),
                                     dtype=np.uint8)
        self.action_space = Box(low=0., high=1., shape=(2, ),
                                dtype=np.float32)

    def reset_task(self):
        """No-op (meta-learning hook)."""
        pass

    def reset(self):
        """Restart the episode-length counter and sample an observation."""
        self._length = 0
        return self.observation_space.sample()

    def step(self, action):
        """Ignore the action; done once `max_length` steps have elapsed."""
        obs = self.observation_space.sample()
        self._length += 1
        done = self._length >= self.max_length
        return (obs, 0, done, {})
def sample_pusher_velocity(self):
    """
    Default action sampler: velocity of magnitude 0.2 with a heading
    drawn uniformly from +/-30 degrees.
    :return:
    :rtype:
    """
    half_cone = np.deg2rad(30)
    speed = 0.2
    heading = Box(-half_cone, half_cone).sample()
    # Convert polar (speed, heading) to a 2-D cartesian velocity.
    return speed * np.array([np.cos(heading), np.sin(heading)])
class SimpleEnv(Env):
    """Minimal 1-D env; optionally uses a Simplex action space.
    Reward is 1.0 - |max(action) - state|."""

    def __init__(self, config):
        self._skip_env_checking = True
        # Simplex((2,)) actions when requested, else a 1-D Box.
        if config.get("simplex_actions", False):
            self.action_space = Simplex((2, ))
        else:
            self.action_space = Box(0.0, 1.0, (1, ))
        self.observation_space = Box(0.0, 1.0, (1, ))
        self.max_steps = config.get("max_steps", 100)
        self.state = None
        self.steps = None

    def reset(self):
        """Draw a fresh random state and restart the step counter."""
        self.steps = 0
        self.state = self.observation_space.sample()
        return self.state

    def step(self, action):
        """Advance one step; reward is 1.0 - (max(actions) - state)."""
        self.steps += 1
        # Unpack the single-element reward array into a scalar.
        [reward] = 1.0 - np.abs(np.max(action) - self.state)
        done = self.steps >= self.max_steps
        self.state = self.observation_space.sample()
        return self.state, reward, done, {}
class RandomTeacher(AbstractTeacher):
    """Teacher that draws tasks uniformly at random from the task box."""

    def __init__(self, mins, maxs, seed, env_reward_lb, env_reward_ub):
        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb,
                                 env_reward_ub, seed)
        # Uniform sampler over the full task space, seeded for
        # reproducibility.
        self.random_task_generator = Box(np.array(mins),
                                         np.array(maxs),
                                         dtype=np.float32)
        self.random_task_generator.seed(self.seed)

    def sample_task(self):
        """Return one uniformly sampled task vector."""
        return self.random_task_generator.sample()

    def non_exploratory_task_sampling(self):
        """Random teachers have no exploit mode; sample as usual."""
        infos = {"bk_index": -1, "task_infos": None}
        return {"task": self.sample_task(), "infos": infos}
def test_tf_modelv2(self):
    """A TestTFModel forward pass yields (1, 5) float32 logits, no
    state, and exposes exactly the six expected variables."""
    obs_space = Box(-1.0, 1.0, (3, ))
    action_space = Box(-1.0, 1.0, (2, ))
    model = TestTFModel(obs_space, action_space, 5, {}, "my_tf_model")
    # Call the model on a single sampled observation.
    logits, state = model({"obs": np.array([obs_space.sample()])})
    self.assertTrue(logits.shape == (1, 5))
    self.assertTrue(logits.dtype == tf.float32)
    self.assertTrue(state == [])
    # All variables by name ("vars" renamed to avoid shadowing builtin).
    var_dict = model.variables(as_dict=True)
    self.assertTrue(len(var_dict) == 6)
    for var_name in (
            "keras_model.dense.kernel:0",
            "keras_model.dense.bias:0",
            "fc_net.base_model.fc_out.kernel:0",
            "fc_net.base_model.fc_out.bias:0",
            "fc_net.base_model.value_out.kernel:0",
            "fc_net.base_model.value_out.bias:0",
    ):
        self.assertTrue(var_name in var_dict)
def __init__(self, observation_space: spaces.Box, features_dim: int = 256):
    """Build a small CNN feature extractor for channel-first image obs.

    :param observation_space: channel-first Box image space (C, H, W).
    :param features_dim: size of the final feature vector.
    """
    super(CustomCNN, self).__init__(observation_space, features_dim)
    # Bug fix: a Box space is not subscriptable -- the channel count
    # must be read from the space's shape (channel-first layout).
    n_input_channels = observation_space.shape[0]
    self.cnn = nn.Sequential(
        nn.Conv2d(n_input_channels, 256, kernel_size=2, stride=1,
                  padding=0), nn.ReLU(),
        nn.Conv2d(256, 512, kernel_size=2, stride=1, padding=0),
        nn.ReLU(), nn.Flatten())
    # Infer the flattened CNN output size with a single dummy forward
    # pass (no gradients needed).
    with no_grad():
        n_flatten = self.cnn(
            as_tensor(observation_space.sample()[None]).float()).shape[1]
    self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim),
                                nn.ReLU())
def test_convert_element_to_space_type(self):
    """Test if space converter works for all elements/space permutations"""
    box_space = Box(low=-1, high=1, shape=(2, ))
    discrete_space = Discrete(2)
    multi_discrete_space = MultiDiscrete([2, 2])
    multi_binary_space = MultiBinary(2)
    tuple_space = Tuple((box_space, discrete_space))
    # Nested Dict covering every supported sub-space type.
    dict_space = Dict({
        "box": box_space,
        "discrete": discrete_space,
        "multi_discrete": multi_discrete_space,
        "multi_binary": multi_binary_space,
        "dict_space": Dict({
            "box2": box_space,
            "discrete2": discrete_space,
        }),
        "tuple_space": tuple_space,
    })

    # Build an element with deliberately WRONG dtypes (float64 instead of
    # float32, int32 instead of int64, Python float instead of int, ...).
    box_space_uncoverted = box_space.sample().astype(np.float64)
    multi_discrete_unconverted = multi_discrete_space.sample().astype(
        np.int32)
    multi_binary_unconverted = multi_binary_space.sample().astype(np.int32)
    tuple_unconverted = (box_space_uncoverted, float(0))
    modified_element = {
        "box": box_space_uncoverted,
        "discrete": float(0),
        "multi_discrete": multi_discrete_unconverted,
        "multi_binary": multi_binary_unconverted,
        "tuple_space": tuple_unconverted,
        "dict_space": {
            "box2": box_space_uncoverted,
            "discrete2": float(0),
        },
    }
    # The converter must coerce every leaf back to the space's dtype so
    # that the whole element is `contain`ed by the space again.
    element_with_correct_types = convert_element_to_space_type(
        modified_element, dict_space.sample())
    assert dict_space.contains(element_with_correct_types)
class RandomTeacher():
    """Teacher that samples tasks uniformly at random from [mins, maxs].

    :param mins: lower bounds of the task space.
    :param maxs: upper bounds of the task space.
    :param seed: RNG seed; a random one is drawn if None.
    """

    def __init__(self, mins, maxs, seed=None):
        # Bug fix: use `is None` so an explicit seed of 0 is honored
        # (previously `if not seed` replaced seed=0 with a random seed).
        if seed is None:
            seed = np.random.randint(42, 424242)
        self.seed = seed
        # NOTE: seeds the global numpy RNG as a side effect.
        np.random.seed(self.seed)
        self.mins = mins
        self.maxs = maxs
        self.random_task_generator = Box(np.array(mins),
                                         np.array(maxs),
                                         dtype=np.float32)

    def update(self, task, competence):
        """No-op: a random teacher does not adapt to competence."""
        pass

    def sample_task(self):
        """Return one uniformly sampled task vector."""
        return self.random_task_generator.sample()

    def dump(self, dump_dict):
        """Nothing to add; return the dict unchanged."""
        return dump_dict