Example #1
    def test_gumbel_softmax(self):
        """Tests the GumbelSoftmax ActionDistribution (tf + eager only)."""
        for fw, sess in framework_iterator(frameworks=["tf", "tfe"],
                                           session=True):
            batch_size = 1000
            num_categories = 5
            input_space = Box(-1.0, 1.0, shape=(batch_size, num_categories))

            # Batch of size=n and deterministic.
            inputs = input_space.sample()
            gumbel_softmax = GumbelSoftmax(inputs, {}, temperature=1.0)

            expected = softmax(inputs)
            # Sample n times, expect always mean value (deterministic draw).
            out = gumbel_softmax.deterministic_sample()
            check(out, expected)

            # Batch of size=n and non-deterministic -> expect roughly that
            # the max-likelihood (argmax) ints are output (most of the time).
            inputs = input_space.sample()
            gumbel_softmax = GumbelSoftmax(inputs, {}, temperature=1.0)
            expected_mean = np.mean(np.argmax(inputs, -1)).astype(np.float32)
            outs = gumbel_softmax.sample()
            if sess:
                outs = sess.run(outs)
            check(np.mean(np.argmax(outs, -1)), expected_mean, rtol=0.08)
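The test above checks that non-deterministic samples are distributed like softmax(inputs). For reference, a minimal NumPy sketch of the Gumbel-softmax reparameterization being exercised (illustrative only; not RLlib's GumbelSoftmax implementation):

import numpy as np

def gumbel_softmax_sample(logits, temperature=1.0, rng=np.random):
    # Gumbel(0, 1) noise: -log(-log(U)), with U ~ Uniform(0, 1).
    u = rng.uniform(low=1e-10, high=1.0, size=logits.shape)
    z = (logits - np.log(-np.log(u))) / temperature
    z -= z.max(axis=-1, keepdims=True)  # for numerical stability
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

As the temperature approaches zero, samples approach one-hot vectors whose argmax follows the categorical distribution softmax(logits) (the Gumbel-max trick); that is why the test above only compares the argmax means with a loose rtol.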
Example #2
    def test_vtrace(self):
        """Tests V-trace against ground truth data calculated in python."""
        seq_len = 5
        batch_size = 10

        # Create log_rhos such that rho will span from near-zero to above the
        # clipping thresholds. In particular, calculate log_rhos in
        # [-2.5, 2.5),
        # so that rho is in approx [0.08, 12.2).
        space_w_time = Box(-1.0, 1.0, (seq_len, batch_size), np.float32)
        space_only_batch = Box(-1.0, 1.0, (batch_size, ), np.float32)
        # Box samples are uniform in [-1.0, 1.0); scale to [-2.5, 2.5).
        log_rhos = space_w_time.sample() * 2.5
        values = {
            "log_rhos": log_rhos,
            # T, B where B_i: [0.9 / (i+1)] * T
            "discounts": np.array([[0.9 / (b + 1) for b in range(batch_size)]
                                   for _ in range(seq_len)]),
            "rewards": space_w_time.sample(),
            "values": space_w_time.sample() / batch_size,
            "bootstrap_value": space_only_batch.sample() + 1.0,
            "clip_rho_threshold": 3.7,
            "clip_pg_rho_threshold": 2.2,
        }

        for fw, sess in framework_iterator(
                frameworks=("torch", "tf"), session=True):
            vtrace = vtrace_tf if fw != "torch" else vtrace_torch
            output = vtrace.from_importance_weights(**values)
            if sess:
                output = sess.run(output)

            ground_truth_v = _ground_truth_calculation(vtrace, **values)
            check(output, ground_truth_v)
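`_ground_truth_calculation` is expected to compute the V-trace targets from the IMPALA paper: vs_t = V(x_t) + delta_t + gamma_t * c_t * (vs_{t+1} - V(x_{t+1})), with clipped importance weights. A minimal NumPy sketch of that backward recursion, assuming time-major [T, B] arrays (a hypothetical reference, not the helper's actual code):

import numpy as np

def vtrace_reference(log_rhos, discounts, rewards, values, bootstrap_value,
                     clip_rho_threshold=1.0, clip_pg_rho_threshold=1.0):
    # Clipped importance weights: rho_t = min(rho_bar, pi/mu), c_t = min(1, pi/mu).
    rhos = np.exp(log_rhos)
    clipped_rhos = np.minimum(clip_rho_threshold, rhos)
    cs = np.minimum(1.0, rhos)
    # V(x_{t+1}) for every t, bootstrapping at the end of the sequence.
    values_t_plus_1 = np.concatenate([values[1:], bootstrap_value[None]], axis=0)
    deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)
    # Backward recursion: vs_t - V(x_t) = delta_t + gamma_t * c_t * (vs_{t+1} - V(x_{t+1})).
    vs_minus_v = np.zeros_like(values)
    acc = np.zeros_like(bootstrap_value)
    for t in reversed(range(len(discounts))):
        acc = deltas[t] + discounts[t] * cs[t] * acc
        vs_minus_v[t] = acc
    vs = vs_minus_v + values
    # Policy-gradient advantages use the separately clipped rhos.
    vs_t_plus_1 = np.concatenate([vs[1:], bootstrap_value[None]], axis=0)
    clipped_pg_rhos = np.minimum(clip_pg_rho_threshold, rhos)
    pg_advantages = clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values)
    return vs, pg_advantages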
Example #3
    def test_multi_categorical(self):
        batch_size = 100
        num_categories = 3
        num_sub_distributions = 5
        # Create 5 categorical distributions of 3 categories each.
        inputs_space = Box(-1.0,
                           2.0,
                           shape=(batch_size,
                                  num_sub_distributions * num_categories))
        values_space = Box(0,
                           num_categories - 1,
                           shape=(num_sub_distributions, batch_size),
                           dtype=np.int32)

        inputs = inputs_space.sample()
        input_lengths = [num_categories] * num_sub_distributions
        inputs_split = np.split(inputs, num_sub_distributions, axis=1)

        for fw in framework_iterator():
            # Create the correct distribution object.
            cls = MultiCategorical if fw != "torch" else TorchMultiCategorical
            multi_categorical = cls(inputs, None, input_lengths)

            # Batch of size=100 and deterministic (True).
            expected = np.transpose(np.argmax(inputs_split, axis=-1))
            # Sample, expect always max value
            # (max likelihood for deterministic draw).
            out = multi_categorical.deterministic_sample()
            check(out, expected)

            # Batch of size=100 and non-deterministic -> expect roughly the mean.
            out = multi_categorical.sample()
            check(tf.reduce_mean(out)
                  if fw != "torch" else torch.mean(out.float()),
                  1.0,
                  decimals=0)

            # Test log-likelihood outputs.
            probs = softmax(inputs_split)
            values = values_space.sample()

            out = multi_categorical.logp(values if fw != "torch" else [
                torch.Tensor(values[i]) for i in range(num_sub_distributions)
            ])  # v in np.stack(values, 1)])
            expected = []
            for i in range(batch_size):
                expected.append(
                    np.sum(
                        np.log(
                            np.array([
                                probs[j][i][values[j][i]]
                                for j in range(num_sub_distributions)
                            ]))))
            check(out, expected, decimals=4)

            # Test entropy outputs.
            out = multi_categorical.entropy()
            expected_entropy = -np.sum(np.sum(probs * np.log(probs), 0), -1)
            check(out, expected_entropy)
Example #4
    def test_categorical(self):
        batch_size = 10000
        num_categories = 4
        # Create categorical distribution with n categories.
        inputs_space = Box(-1.0,
                           2.0,
                           shape=(batch_size, num_categories),
                           dtype=np.float32)
        values_space = Box(0,
                           num_categories - 1,
                           shape=(batch_size, ),
                           dtype=np.int32)

        inputs = inputs_space.sample()

        for fw, sess in framework_iterator(session=True,
                                           frameworks=("tf", "tf2", "torch")):
            # Create the correct distribution object.
            cls = JAXCategorical if fw == "jax" else Categorical if \
                fw != "torch" else TorchCategorical
            categorical = cls(inputs, {})

            # Do a stability test using extreme NN outputs to see whether
            # sampling and logp'ing result in NaN or +/-inf values.
            self._stability_test(cls,
                                 inputs_space.shape,
                                 fw=fw,
                                 sess=sess,
                                 bounds=(0, num_categories - 1))

            # Batch of size=10000 and deterministic (True).
            expected = np.transpose(np.argmax(inputs, axis=-1))
            # Sample, expect always max value
            # (max likelihood for deterministic draw).
            out = categorical.deterministic_sample()
            check(out, expected)

            # Batch of size=10000 and non-deterministic -> expect roughly the mean.
            out = categorical.sample()
            check(np.mean(out) if fw == "jax" else tf.reduce_mean(out)
                  if fw != "torch" else torch.mean(out.float()),
                  1.0,
                  decimals=0)

            # Test log-likelihood outputs.
            probs = softmax(inputs)
            values = values_space.sample()

            out = categorical.logp(
                values if fw != "torch" else torch.Tensor(values))
            expected = []
            for i in range(batch_size):
                expected.append(np.sum(np.log(np.array(probs[i][values[i]]))))
            check(out, expected, decimals=4)

            # Test entropy outputs.
            out = categorical.entropy()
            expected_entropy = -np.sum(probs * np.log(probs), -1)
            check(out, expected_entropy)
Example #5
    def test_beta(self):
        input_space = Box(-2.0, 1.0, shape=(2000, 10))
        input_space.seed(42)
        low, high = -1.0, 2.0
        plain_beta_value_space = Box(0.0, 1.0, shape=(2000, 5))
        plain_beta_value_space.seed(42)

        for fw, sess in framework_iterator(session=True):
            cls = TorchBeta if fw == "torch" else Beta
            inputs = input_space.sample()
            beta_distribution = cls(inputs, {}, low=low, high=high)

            inputs = beta_distribution.inputs
            if sess:
                inputs = sess.run(inputs)
            else:
                inputs = inputs.numpy()
            alpha, beta_ = np.split(inputs, 2, axis=-1)

            # Mean for a Beta distribution: 1 / [1 + (beta/alpha)]
            expected = (1.0 / (1.0 + beta_ / alpha)) * (high - low) + low
            # Sample n times, expect always mean value (deterministic draw).
            out = beta_distribution.deterministic_sample()
            check(out, expected, rtol=0.01)

            # Batch of size=n and non-deterministic -> expect roughly the mean.
            values = beta_distribution.sample()
            if sess:
                values = sess.run(values)
            else:
                values = values.numpy()
            self.assertTrue(np.max(values) <= high)
            self.assertTrue(np.min(values) >= low)

            check(np.mean(values), expected.mean(), decimals=1)

            # Test log-likelihood outputs (against scipy).
            inputs = input_space.sample()
            beta_distribution = cls(inputs, {}, low=low, high=high)
            inputs = beta_distribution.inputs
            if sess:
                inputs = sess.run(inputs)
            else:
                inputs = inputs.numpy()
            alpha, beta_ = np.split(inputs, 2, axis=-1)

            values = plain_beta_value_space.sample()
            values_scaled = values * (high - low) + low
            if fw == "torch":
                values_scaled = torch.Tensor(values_scaled)
            out = beta_distribution.logp(values_scaled)
            check(
                out,
                np.sum(np.log(beta.pdf(values, alpha, beta_)), -1),
                rtol=0.01)
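The deterministic-sample expectation above rewrites the Beta mean alpha / (alpha + beta) as 1 / (1 + beta/alpha) and then affinely rescales it from [0, 1] to [low, high]. A quick scipy sanity check of that identity (the parameter values here are arbitrary):

import numpy as np
from scipy.stats import beta as beta_dist

alpha, beta_ = 2.0, 5.0
low, high = -1.0, 2.0
mean_01 = 1.0 / (1.0 + beta_ / alpha)  # == alpha / (alpha + beta)
assert np.isclose(mean_01, beta_dist.mean(alpha, beta_))
rescaled_mean = mean_01 * (high - low) + low  # maps [0, 1] onto [low, high]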
Example #6
def test_space_utils():
    # Box
    box = Box(-1.0, 1.0, shape=[2, 3], dtype=np.float32)
    sample = box.sample()
    assert flatdim(box) == 2 * 3
    assert flatten(box, sample).shape == (2 * 3, )
    assert np.allclose(sample, unflatten(box, flatten(box, sample)))

    x = np.array([[1.0, 1.0], [1.0, 1.0]])
    box = Box(low=-x, high=x, dtype=np.float32)
    sample = box.sample()
    assert flatdim(box) == 2 * 2
    assert flatten(box, sample).shape == (2 * 2, )
    assert np.allclose(sample, unflatten(box, flatten(box, sample)))

    # Discrete
    discrete = Discrete(5)
    sample = discrete.sample()
    assert flatdim(discrete) == 5
    assert flatten(discrete, sample).shape == (5, )
    assert sample == unflatten(discrete, flatten(discrete, sample))

    # Tuple
    S = Tuple([
        Discrete(5),
        Box(-1.0, 1.0, shape=(2, 3), dtype=np.float32),
        Dict({
            'success': Discrete(2),
            'velocity': Box(-1, 1, shape=(1, 3), dtype=np.float32)
        })
    ])
    sample = S.sample()
    assert flatdim(S) == 5 + 2 * 3 + 2 + 3
    assert flatten(S, sample).shape == (16, )
    _sample = unflatten(S, flatten(S, sample))
    assert sample[0] == _sample[0]
    assert np.allclose(sample[1], _sample[1])
    assert sample[2]['success'] == _sample[2]['success']
    assert np.allclose(sample[2]['velocity'], _sample[2]['velocity'])

    # Dict
    D0 = Dict({
        'position': Box(-100, 100, shape=(3, ), dtype=np.float32),
        'velocity': Box(-1, 1, shape=(4, ), dtype=np.float32)
    })
    D = Dict({'sensors': D0, 'score': Discrete(100)})
    sample = D.sample()
    assert flatdim(D) == 3 + 4 + 100
    assert flatten(D, sample).shape == (107, )
    _sample = unflatten(D, flatten(D, sample))
    assert sample['score'] == _sample['score']
    assert np.allclose(sample['sensors']['position'],
                       _sample['sensors']['position'])
    assert np.allclose(sample['sensors']['velocity'],
                       _sample['sensors']['velocity'])
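The dimensions above add up because flatten one-hot encodes Discrete samples, which is why flatdim(Discrete(5)) == 5 and flatdim(S) == 5 + 2 * 3 + 2 + 3 == 16. A small check (the dtype may vary across gym versions, but the encoding is one-hot):

one_hot = flatten(Discrete(5), 3)
assert one_hot.shape == (5, )
assert one_hot[3] == 1 and one_hot.sum() == 1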
Example #7
    def test_trajectory(self):
        """Tests the Trajectory class."""

        buffer_size = 5

        # Small trajectory object for testing purposes.
        trajectory = Trajectory(buffer_size=buffer_size)
        self.assertEqual(trajectory.cursor, 0)
        self.assertEqual(trajectory.timestep, 0)
        self.assertEqual(trajectory.sample_batch_offset, 0)
        assert not trajectory.buffers
        observation_space = Box(-1.0, 1.0, shape=(3, ))
        action_space = Discrete(2)
        trajectory.add_init_obs(env_id=0,
                                agent_id="agent",
                                policy_id="policy",
                                init_obs=observation_space.sample())
        self.assertEqual(trajectory.cursor, 0)
        self.assertEqual(trajectory.initial_obs.shape, observation_space.shape)

        # Fill up the buffer and make it extend if it hits the limit.
        cur_buffer_size = buffer_size
        for i in range(buffer_size + 1):
            trajectory.add_action_reward_next_obs(
                env_id=0,
                agent_id="agent",
                policy_id="policy",
                values=dict(
                    t=i,
                    actions=action_space.sample(),
                    rewards=1.0,
                    dones=i == buffer_size,
                    new_obs=observation_space.sample(),
                    action_logp=-0.5,
                    action_dist_inputs=np.array([[0.5, 0.5]]),
                ))
            self.assertEqual(trajectory.cursor, i + 1)
            self.assertEqual(trajectory.timestep, i + 1)
            self.assertEqual(trajectory.sample_batch_offset, 0)
            if i == buffer_size - 1:
                cur_buffer_size *= 2
            self.assertEqual(len(trajectory.buffers["new_obs"]),
                             cur_buffer_size)
            self.assertEqual(len(trajectory.buffers["rewards"]),
                             cur_buffer_size)

        # Create a SampleBatch from the Trajectory and reset it.
        batch = trajectory.get_sample_batch_and_reset()
        self.assertEqual(batch.count, buffer_size + 1)
        # Make sure, Trajectory was reset properly.
        self.assertEqual(trajectory.cursor, buffer_size + 1)
        self.assertEqual(trajectory.timestep, 0)
        self.assertEqual(trajectory.sample_batch_offset, buffer_size + 1)
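The `cur_buffer_size *= 2` bookkeeping above assumes the Trajectory doubles its buffers once the write cursor reaches the current capacity. A minimal sketch of that amortized-doubling pattern (illustrative; not RLlib's actual buffer code):

def ensure_capacity(buffer, cursor):
    # Double the buffer whenever the write cursor reaches its end.
    if cursor >= len(buffer):
        buffer = np.concatenate([buffer, np.zeros_like(buffer)])
    return buffer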
Example #8
class FakeImageEnv(Env):
    """
    Fake image environment for testing purposes; it mimics Atari games.

    :param action_dim: Number of discrete actions
    :param screen_height: Height of the image
    :param screen_width: Width of the image
    :param n_channels: Number of color channels
    :param discrete: Create discrete action space instead of continuous
    :param channel_first: Put channels on first axis instead of last
    """
    def __init__(
        self,
        action_dim: int = 6,
        screen_height: int = 84,
        screen_width: int = 84,
        n_channels: int = 1,
        discrete: bool = True,
        channel_first: bool = False,
    ):
        self.observation_shape = (screen_height, screen_width, n_channels)
        if channel_first:
            self.observation_shape = (n_channels, screen_height, screen_width)
        self.observation_space = Box(low=0,
                                     high=255,
                                     shape=self.observation_shape,
                                     dtype=np.uint8)
        if discrete:
            self.action_space = Discrete(action_dim)
        else:
            self.action_space = Box(low=-1,
                                    high=1,
                                    shape=(5, ),
                                    dtype=np.float32)
        self.ep_length = 10
        self.current_step = 0

    def reset(self) -> np.ndarray:
        self.current_step = 0
        return self.observation_space.sample()

    def step(self, action: Union[np.ndarray, int]) -> GymStepReturn:
        reward = 0.0
        self.current_step += 1
        done = self.current_step >= self.ep_length
        return self.observation_space.sample(), reward, done, {}

    def render(self, mode: str = "human") -> None:
        pass
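Since the env only samples from its own spaces, it is handy for smoke tests. One way to exercise it is Stable-Baselines3's env checker (a sketch, assuming stable_baselines3 is installed):

from stable_baselines3.common.env_checker import check_env

env = FakeImageEnv(discrete=True, channel_first=True)
check_env(env, warn=True)  # validates spaces and the reset()/step() API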
Example #9
 def test_json(self):
     """Test JSON conversions."""
     box_space = Box(low=np.array([[1., 2.], [3., 4.]]),
                     high=np.array([[1.3, 4.9], [3.5, 5.]]))
     space = StackedBoxSpace(box_space, 2)
     samples = [box_space.sample() for _ in range(5)]
     jsoned = space.to_jsonable(samples)
     self.assertEqual(space.to_jsonable(space.from_jsonable(jsoned)),
                      jsoned)
     in_data = [[
         np.array([[1.1, 2.1], [3.1, 4.1]]),
         np.array([[1.2, 2.2], [3.2, 4.2]])
     ],
                [
                    np.array([[1.11, 2.11], [3.3, 4.]]),
                    np.array([[1.21, 2.2], [3.23, 4.21]])
                ]]
     self.assertEqual(
         space.to_jsonable(in_data),
         [[[[1.1, 2.1], [3.1, 4.1]], [[1.11, 2.11], [3.3, 4.]]],
          [[[1.2, 2.2], [3.2, 4.2]], [[1.21, 2.2], [3.23, 4.21]]]])
     inverted = space.from_jsonable(space.to_jsonable(in_data))
     for idx1, idx2 in [(0, 0), (0, 1), (1, 0), (1, 1)]:
         self.assertTrue(
             np.allclose(in_data[idx1][idx2], inverted[idx1][idx2]))
Example #10
    def test_default_models(self):
        ray.init(object_store_memory=1000 * 1024 * 1024)

        for fw in framework_iterator(frameworks=("jax", "tf", "tf2", "torch")):
            obs_space = Box(0, 1, shape=(3, ), dtype=np.float32)
            p1 = ModelCatalog.get_model_v2(
                obs_space=obs_space,
                action_space=Discrete(5),
                num_outputs=5,
                model_config={},
                framework=fw,
            )
            self.assertTrue("FullyConnectedNetwork" in type(p1).__name__)
            # Do a test forward pass.
            obs = np.array([obs_space.sample()])
            if fw == "torch":
                obs = torch.from_numpy(obs)
            out, state_outs = p1({"obs": obs})
            self.assertTrue(out.shape == (1, 5))
            self.assertTrue(state_outs == [])

            # No Conv2Ds for JAX yet.
            if fw != "jax":
                p2 = ModelCatalog.get_model_v2(
                    obs_space=Box(0, 1, shape=(84, 84, 3), dtype=np.float32),
                    action_space=Discrete(5),
                    num_outputs=5,
                    model_config={},
                    framework=fw,
                )
                self.assertTrue("VisionNetwork" in type(p2).__name__)
Example #11
def sample_slider_position(T_aug=None):
    # Position is essentially fixed at the origin (small z-offset).
    pos = np.array([0, 0, 0.03])

    yaw_min = np.array([0])
    yaw_max = np.array([2 * np.pi])
    yaw_sampler = Box(yaw_min, yaw_max)
    yaw = yaw_sampler.sample()

    quat = transforms3d.euler.euler2quat(0, 0, yaw)

    T_O_slider = transform_utils.transform_from_pose(pos, quat)

    T_W_slider = None
    if T_aug is not None:
        T_W_slider = T_aug @ T_O_slider
    else:
        T_W_slider = T_O_slider

    pose_dict = transform_utils.matrix_to_dict(T_W_slider)

    # note the quat/pos ordering
    q = np.concatenate((pose_dict['quaternion'], pose_dict['position']))

    return q
Example #12
def test_stacked_box_space_json():
    """
    Test JSON conversions for StackedBoxSpace.
    """
    box_space = Box(low=np.array([[1., 2.], [3., 4.]]),
                    high=np.array([[1.3, 4.9], [3.5, 5.]]))
    space = StackedBoxSpace(box_space, 2)
    samples = [box_space.sample() for _ in range(5)]
    jsoned = space.to_jsonable(samples)
    assert space.to_jsonable(space.from_jsonable(jsoned)) == jsoned
    in_data = [[
        np.array([[1.1, 2.1], [3.1, 4.1]]),
        np.array([[1.2, 2.2], [3.2, 4.2]])
    ],
               [
                   np.array([[1.11, 2.11], [3.3, 4.]]),
                   np.array([[1.21, 2.2], [3.23, 4.21]])
               ]]
    assert (space.to_jsonable(in_data) == [[[[1.1, 2.1], [3.1, 4.1]],
                                            [[1.11, 2.11], [3.3, 4.]]],
                                           [[[1.2, 2.2], [3.2, 4.2]],
                                            [[1.21, 2.2], [3.23, 4.21]]]])
    inverted = space.from_jsonable(space.to_jsonable(in_data))
    for idx1, idx2 in [(0, 0), (0, 1), (1, 0), (1, 1)]:
        assert np.allclose(in_data[idx1][idx2], inverted[idx1][idx2])
Example #13
    def __init__(
            self,
            wrapped_env,
            encoder: Encoder,
            encoder_input_prefix,
            key_prefix='encoder',
            reward_mode='encoder_distance',
    ):
        """

        :param wrapped_env:
        :param encoder:
        :param encoder_input_prefix:
        :param key_prefix:
        :param reward_mode:
         - 'encoder_distance': l1 distance in encoder space
         - 'vectorized_encoder_distance': vectorized l1 distance in encoder
             space, i.e. the negative absolute difference per latent dimension
         - 'env': use the wrapped env's reward
        """
        super().__init__(wrapped_env)
        if reward_mode not in {
            self.ENCODER_DISTANCE_REWARD,
            self.VECTORIZED_ENCODER_DISTANCE_REWARD,
            self.ENV_REWARD,
        }:
            raise ValueError(reward_mode)
        self._encoder = encoder
        self._encoder_input_obs_key = '{}_observation'.format(
            encoder_input_prefix)
        self._encoder_input_desired_goal_key = '{}_desired_goal'.format(
            encoder_input_prefix
        )
        self._encoder_input_achieved_goal_key = '{}_achieved_goal'.format(
            encoder_input_prefix
        )
        self._reward_mode = reward_mode
        spaces = self.wrapped_env.observation_space.spaces
        latent_space = Box(
            encoder.min_embedding,
            encoder.max_embedding,
            dtype=np.float32,
        )
        self._embedding_size = encoder.min_embedding.size
        self._obs_key = '{}_observation'.format(key_prefix)
        self._desired_goal_key = '{}_desired_goal'.format(key_prefix)
        self._achieved_goal_key = '{}_achieved_goal'.format(key_prefix)
        self._distance_name = '{}_distance'.format(key_prefix)

        self._key_prefix = key_prefix
        self._desired_goal = {
            self._desired_goal_key: np.zeros_like(latent_space.sample())
        }
        spaces[self._obs_key] = latent_space
        spaces[self._desired_goal_key] = latent_space
        spaces[self._achieved_goal_key] = latent_space
        self.observation_space = Dict(spaces)
        self._goal_sampling_mode = 'env'
Example #14
def test_seed_Dict():
    test_space = Dict(
        {
            "a": Box(low=0, high=1, shape=(3, 3)),
            "b": Dict(
                {
                    "b_1": Box(low=-100, high=100, shape=(2,)),
                    "b_2": Box(low=-1, high=1, shape=(2,)),
                }
            ),
            "c": Discrete(5),
        }
    )

    seed_dict = {
        "a": 0,
        "b": {
            "b_1": 1,
            "b_2": 2,
        },
        "c": 3,
    }

    test_space.seed(seed_dict)

    # "Unpack" the dict sub-spaces into individual spaces
    a = Box(low=0, high=1, shape=(3, 3))
    a.seed(0)
    b_1 = Box(low=-100, high=100, shape=(2,))
    b_1.seed(1)
    b_2 = Box(low=-1, high=1, shape=(2,))
    b_2.seed(2)
    c = Discrete(5)
    c.seed(3)

    for i in range(10):
        test_s = test_space.sample()
        a_s = a.sample()
        assert (test_s["a"] == a_s).all()
        b_1_s = b_1.sample()
        assert (test_s["b"]["b_1"] == b_1_s).all()
        b_2_s = b_2.sample()
        assert (test_s["b"]["b_2"] == b_2_s).all()
        c_s = c.sample()
        assert test_s["c"] == c_s
Example #15
    def sample_pusher_velocity(self):
        angle_threshold = np.deg2rad(30)
        angle_space = Box(-angle_threshold, angle_threshold)
        angle = angle_space.sample()

        magnitude = 100
        pusher_velocity = magnitude * np.array([np.cos(angle), np.sin(angle)])

        return pusher_velocity
Example #16
class FakeImageEnv(Env):
    """
    Fake image environment for testing purposes; it mimics Atari games.

    :param action_dim: (int) Number of discrete actions
    :param screen_height: (int) Height of the image
    :param screen_width: (int) Width of the image
    :param n_channels: (int) Number of color channels
    :param discrete: (bool) Create discrete action space instead of continuous
    """
    def __init__(self,
                 action_dim: int = 6,
                 screen_height: int = 84,
                 screen_width: int = 84,
                 n_channels: int = 1,
                 discrete: bool = True):

        self.observation_space = Box(low=0,
                                     high=255,
                                     shape=(screen_height, screen_width,
                                            n_channels),
                                     dtype=np.uint8)
        if discrete:
            self.action_space = Discrete(action_dim)
        else:
            self.action_space = Box(low=-1,
                                    high=1,
                                    shape=(5, ),
                                    dtype=np.float32)
        self.ep_length = 10
        self.current_step = 0

    def reset(self) -> np.ndarray:
        self.current_step = 0
        return self.observation_space.sample()

    def step(self, action: Union[np.ndarray, int]) -> GymStepReturn:
        reward = 0.0
        self.current_step += 1
        done = self.current_step >= self.ep_length
        return self.observation_space.sample(), reward, done, {}

    def render(self, mode: str = 'human') -> None:
        pass
Example #17
class RandomPolicy(Policy):
    """Hand-coded policy that returns random actions."""
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Whether for compute_actions, the bounds given in action_space
        # should be ignored (default: False). This is to test action-clipping
        # and any Env's reaction to bounds breaches.
        if self.config.get("ignore_action_bounds", False) and isinstance(
                self.action_space, Box):
            self.action_space_for_sampling = Box(
                -float("inf"),
                float("inf"),
                shape=self.action_space.shape,
                dtype=self.action_space.dtype,
            )
        else:
            self.action_space_for_sampling = self.action_space

    @override(Policy)
    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        **kwargs):
        # Alternatively, a numpy array would work here as well.
        # e.g.: np.array([random.choice([0, 1])] * len(obs_batch))
        return [self.action_space_for_sampling.sample()
                for _ in obs_batch], [], {}

    @override(Policy)
    def learn_on_batch(self, samples):
        """No learning."""
        return {}

    @override(Policy)
    def compute_log_likelihoods(
        self,
        actions,
        obs_batch,
        state_batches=None,
        prev_action_batch=None,
        prev_reward_batch=None,
    ):
        return np.array([random.random()] * len(obs_batch))

    @override(Policy)
    def get_weights(self) -> ModelWeights:
        """No weights to save."""
        return {}

    @override(Policy)
    def set_weights(self, weights: ModelWeights) -> None:
        """No weights to set."""
        pass
Example #18
File: test_sac.py, Project: yongjun823/ray
class SimpleEnv(Env):
    def __init__(self, config):
        self.action_space = Box(0.0, 1.0, (1, ))
        self.observation_space = Box(0.0, 1.0, (1, ))
        self.max_steps = config.get("max_steps", 100)
        self.state = None
        self.steps = None

    def reset(self):
        self.state = self.observation_space.sample()
        self.steps = 0
        return self.state

    def step(self, action):
        self.steps += 1
        # Reward is 1.0 - (action - state).
        [r] = 1.0 - np.abs(action - self.state)
        d = self.steps >= self.max_steps
        self.state = self.observation_space.sample()
        return self.state, r, d, {}
Example #19
class MockEnv(gym.Env):  # pylint:disable=abstract-method
    """Dummy environment with continuous action space."""
    def __init__(self, _):
        self.horizon = 200
        self.time = 0

        low = np.array([-1] * 3 + [0], dtype=np.float32)
        high = np.array([1] * 4, dtype=np.float32)
        self.observation_space = Box(low=low, high=high)

        action_dim = 3
        self.action_space = Box(high=1,
                                low=-1,
                                shape=(action_dim, ),
                                dtype=np.float32)

        self.goal = torch.zeros(*self.observation_space.shape)[..., :-1]
        self.state = None

    @override(gym.Env)
    def reset(self):
        self.time = 0
        self.state = self.observation_space.sample()
        self.state[-1] = 0
        return self.state

    @override(gym.Env)
    def step(self, action):
        self.time += 1
        self.state[:3] = np.clip(
            self.state[:3] + action,
            self.observation_space.low[:3],
            self.observation_space.high[:3],
        )
        self.state[-1] = self.time / self.horizon
        reward = np.linalg.norm((self.state[:3] - self.goal.numpy()), axis=-1)
        return self.state, reward, self.time >= self.horizon, {}

    @staticmethod
    def reward_fn(state, action, next_state):
        # pylint:disable=missing-docstring,unused-argument
        return torch.norm(next_state[..., :3], dim=-1)

    def dynamics_fn(self, state, action):
        state, time = state[..., :3], state[..., 3:]
        new_state = state + action
        new_state = torch.max(
            torch.min(new_state, torch.from_numpy(self.action_space.high)),
            torch.from_numpy(self.action_space.low),
        )

        time = time * self.horizon
        new_time = torch.clamp((time + 1) / self.horizon, min=0, max=1)
        return torch.cat([new_state, new_time], dim=-1), None
Example #20
class UnittestSlowEnv(gym.Env):
    def __init__(self, slow_reset=0.3):
        super(UnittestSlowEnv, self).__init__()
        self.slow_reset = slow_reset
        self.observation_space = Box(low=0,
                                     high=255,
                                     shape=(HEIGHT, WIDTH, 3),
                                     dtype=np.uint8)
        self.action_space = Box(low=0., high=1., shape=(), dtype=np.float32)

    def reset(self):
        if self.slow_reset > 0:
            time.sleep(self.slow_reset)
        return self.observation_space.sample()

    def step(self, action):
        time.sleep(action)
        observation = self.observation_space.sample()
        reward, done = 0., False
        return observation, reward, done, {}
Example #21
class ReachTorque:
    def __init__(self, settings, simulation):
        self.settings = settings
        self.sim = simulation
        stateMin = np.concatenate(
            (settings['robot']['workspace-min'],
             np.radians(settings['robot']['joint-min']),
             -np.radians(settings['robot']['max-velocities'])))
        stateMax = np.concatenate(
            (settings['robot']['workspace-max'],
             np.radians(settings['robot']['joint-max']),
             np.radians(settings['robot']['max-velocities'])))
        self.action_space = Box(-np.array(settings['robot']['max-torques']),
                                np.array(settings['robot']['max-torques']))
        self.observation_space = Box(stateMin, stateMax)
        self.sim.readDistance(settings['error-object-name'])
        self.rewardVelFactor = 1 / np.linalg.norm(
            np.radians(settings['robot']['max-velocities']))

    def close(self):
        self.sim.close()

    def reset(self):
        self.sim.stop()
        self.state = self.observation_space.sample()
        ref = self.state[:len(self.settings['robot']['workspace-min'])]
        self.sim.setDummyPosition(self.settings['target-object-name'], ref)
        pose = self.state[len(self.settings['robot']['workspace-min']
                              ):-len(self.settings['robot']['max-velocities'])]
        self.sim.setPose(pose)
        self.sim.setVelocities(np.zeros(self.action_space.low.size))
        self.sim.start()
        self.sim.step()
        self.state[len(self.settings['robot']['workspace-min']
                       ):] = np.concatenate(self.sim.getRobotState())
        self.curStep = 0
        return self.state

    def render(self):
        pass

    def step(self, action):
        self.curStep += 1
        self.sim.setTorques(action)
        self.sim.step()
        self.state[len(self.settings['robot']['workspace-min']
                       ):] = np.concatenate(self.sim.getRobotState())
        error = self.sim.readDistance(self.settings['error-object-name'])
        reward = -error - np.linalg.norm(
            self.state[-self.action_space.low.size:]) * self.rewardVelFactor
        reset = self.curStep >= self.settings['max-steps']
        return self.state, reward, reset, {}  # gym expects an info dict
Example #22
class DummyAtari(gym.Env):
    def __init__(self, grayscale=True, squeeze=False):
        if grayscale:
            shape = (84, 84) if squeeze else (84, 84, 1)
        else:
            shape = (84, 84, 3)
        self.observation_space = Box(
            low=np.zeros(shape),
            high=np.zeros(shape) + 255,
            shape=shape,
            dtype=np.uint8,
        )
        self.action_space = Discrete(4)
        self.t = 1

    def step(self, action):
        observation = self.observation_space.sample()
        reward = np.random.random()
        self.t += 1  # advance the step counter so episodes can terminate
        return observation, reward, self.t % 80 == 0, {}

    def reset(self):
        self.t = 1
        return self.observation_space.sample()
Example #23
class UnittestEnv(gym.Env):
    def __init__(self, max_length):
        super(UnittestEnv, self).__init__()
        self.max_length = max_length
        self._length = 0

        self.observation_space = Box(low=0,
                                     high=255,
                                     shape=(HEIGHT, WIDTH, 3),
                                     dtype=np.uint8)
        self.action_space = Box(low=0., high=1., shape=(2, ), dtype=np.float32)

    def reset_task(self):
        pass

    def reset(self):
        self._length = 0
        return self.observation_space.sample()

    def step(self, action):
        observation = self.observation_space.sample()
        self._length += 1
        reward, done = 0, (self._length >= self.max_length)
        return (observation, reward, done, {})
Example #24
    def sample_pusher_velocity(self):
        """
        Default action sampler
        :return:
        :rtype:
        """
        angle_threshold = np.deg2rad(30)
        magnitude = 0.2

        angle_space = Box(-angle_threshold, angle_threshold)
        angle = angle_space.sample()

        pusher_velocity = magnitude * np.array([np.cos(angle), np.sin(angle)])

        return pusher_velocity
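Constructing a single-element Box just to draw one angle is equivalent to a plain uniform draw; a one-line alternative (illustrative):

angle = np.random.uniform(-angle_threshold, angle_threshold)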
Example #25
File: test_sac.py, Project: krfricke/ray
class SimpleEnv(Env):
    def __init__(self, config):
        self._skip_env_checking = True
        if config.get("simplex_actions", False):
            self.action_space = Simplex((2, ))
        else:
            self.action_space = Box(0.0, 1.0, (1, ))
        self.observation_space = Box(0.0, 1.0, (1, ))
        self.max_steps = config.get("max_steps", 100)
        self.state = None
        self.steps = None

    def reset(self):
        self.state = self.observation_space.sample()
        self.steps = 0
        return self.state

    def step(self, action):
        self.steps += 1
        # Reward is 1.0 - (max(actions) - state).
        [r] = 1.0 - np.abs(np.max(action) - self.state)
        d = self.steps >= self.max_steps
        self.state = self.observation_space.sample()
        return self.state, r, d, {}
Example #26
class RandomTeacher(AbstractTeacher):
    def __init__(self, mins, maxs, seed, env_reward_lb, env_reward_ub):
        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb, env_reward_ub, seed)

        self.random_task_generator = Box(np.array(mins), np.array(maxs), dtype=np.float32)
        self.random_task_generator.seed(self.seed)

    def sample_task(self):
        return self.random_task_generator.sample()

    def non_exploratory_task_sampling(self):
        return {"task": self.sample_task(),
                "infos": {
                    "bk_index": -1,
                    "task_infos": None}
                }
Example #27
 def test_tf_modelv2(self):
     obs_space = Box(-1.0, 1.0, (3, ))
     action_space = Box(-1.0, 1.0, (2, ))
     my_tf_model = TestTFModel(obs_space, action_space, 5, {},
                               "my_tf_model")
     # Call the model.
     out, states = my_tf_model({"obs": np.array([obs_space.sample()])})
     self.assertTrue(out.shape == (1, 5))
     self.assertTrue(out.dtype == tf.float32)
     self.assertTrue(states == [])
     vars = my_tf_model.variables(as_dict=True)
     self.assertTrue(len(vars) == 6)
     self.assertTrue("keras_model.dense.kernel:0" in vars)
     self.assertTrue("keras_model.dense.bias:0" in vars)
     self.assertTrue("fc_net.base_model.fc_out.kernel:0" in vars)
     self.assertTrue("fc_net.base_model.fc_out.bias:0" in vars)
     self.assertTrue("fc_net.base_model.value_out.kernel:0" in vars)
     self.assertTrue("fc_net.base_model.value_out.bias:0" in vars)
Example #28
    def __init__(self, observation_space: spaces.Box, features_dim: int = 256):
        super(CustomCNN, self).__init__(observation_space, features_dim)
        n_input_channels = observation_space.shape[0]  # channel-first images
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels,
                      256,
                      kernel_size=2,
                      stride=1,
                      padding=0), nn.ReLU(),
            nn.Conv2d(256, 512, kernel_size=2, stride=1, padding=0), nn.ReLU(),
            nn.Flatten())

        with no_grad():
            n_flatten = self.cnn(
                as_tensor(observation_space.sample()[None]).float()).shape[1]

        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim),
                                    nn.ReLU())
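To be usable, a BaseFeaturesExtractor subclass like this also needs a forward method, and it is plugged into a Stable-Baselines3 policy via policy_kwargs. A sketch of those missing pieces (the PPO/env choice is illustrative):

    def forward(self, observations):
        # Chain the conv stack and the linear head built in __init__.
        return self.linear(self.cnn(observations))

# Wiring the extractor into an SB3 policy:
from stable_baselines3 import PPO

policy_kwargs = dict(
    features_extractor_class=CustomCNN,
    features_extractor_kwargs=dict(features_dim=256),
)
model = PPO("CnnPolicy", env, policy_kwargs=policy_kwargs)  # env: any image-obs env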
Example #29
    def test_convert_element_to_space_type(self):
        """Test if space converter works for all elements/space permutations"""
        box_space = Box(low=-1, high=1, shape=(2, ))
        discrete_space = Discrete(2)
        multi_discrete_space = MultiDiscrete([2, 2])
        multi_binary_space = MultiBinary(2)
        tuple_space = Tuple((box_space, discrete_space))
        dict_space = Dict({
            "box": box_space,
            "discrete": discrete_space,
            "multi_discrete": multi_discrete_space,
            "multi_binary": multi_binary_space,
            "dict_space": Dict({
                "box2": box_space,
                "discrete2": discrete_space,
            }),
            "tuple_space": tuple_space,
        })

        box_space_unconverted = box_space.sample().astype(np.float64)
        multi_discrete_unconverted = multi_discrete_space.sample().astype(
            np.int32)
        multi_binary_unconverted = multi_binary_space.sample().astype(np.int32)
        tuple_unconverted = (box_space_unconverted, float(0))
        modified_element = {
            "box": box_space_uncoverted,
            "discrete": float(0),
            "multi_discrete": multi_discrete_unconverted,
            "multi_binary": multi_binary_unconverted,
            "tuple_space": tuple_unconverted,
            "dict_space": {
                "box2": box_space_uncoverted,
                "discrete2": float(0),
            },
        }
        element_with_correct_types = convert_element_to_space_type(
            modified_element, dict_space.sample())
        assert dict_space.contains(element_with_correct_types)
Example #30
class RandomTeacher:
    def __init__(self, mins, maxs, seed=None):
        self.seed = seed
        if seed is None:  # treat only a missing seed as unset (0 is valid)
            self.seed = np.random.randint(42, 424242)
        np.random.seed(self.seed)

        self.mins = mins
        self.maxs = maxs

        self.random_task_generator = Box(np.array(mins), np.array(maxs), dtype=np.float32)

    def update(self, task, competence):
        pass

    def sample_task(self):
        return self.random_task_generator.sample()

    def dump(self, dump_dict):
        return dump_dict