Example #1
 def action(self, *args, **kwargs):
     """Compute an action for a single input, (e.g. observation)."""
     args_, kwargs_ = tree.map_structure(lambda x: x[None, ...],
                                         (args, kwargs))
     actions = self.actions(*args_, **kwargs_)
     action = tree.map_structure(lambda x: x[0], actions)
     return action
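
Example #1 above wraps a single input into a batch of one, runs the batched `actions` method, and then strips the batch axis from the result. Below is a minimal self-contained sketch of that round trip using only NumPy and dm-tree; the `observation` structure is illustrative:

import numpy as np
import tree  # dm-tree

observation = {'pixels': np.zeros((32, 32, 3)), 'state': np.zeros(7)}

# Add a leading batch axis of size one to every leaf...
batched = tree.map_structure(lambda x: x[None, ...], observation)
assert batched['pixels'].shape == (1, 32, 32, 3)

# ...and strip it again after the batched computation.
unbatched = tree.map_structure(lambda x: x[0], batched)
assert unbatched['state'].shape == (7,)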
Example #2
    def test_values(self):
        _ = self.env.reset()
        action1_np = self.env.action_space.sample()
        observation1_np = self.env.step(action1_np)[0]
        action2_np = self.env.action_space.sample()
        observation2_np = self.env.step(action2_np)[0]

        observations_np = type(observation1_np)((
            (key, np.stack((
                observation1_np[key], observation2_np[key]
            ), axis=0).astype(np.float32))
            for key in observation1_np.keys()
        ))

        actions_np = np.stack((
            action1_np, action2_np
        ), axis=0).astype(np.float32)

        observations_tf = tree.map_structure(
            lambda x: tf.constant(x, dtype=x.dtype), observations_np)
        actions_tf = tree.map_structure(
            lambda x: tf.constant(x, dtype=x.dtype), actions_np)

        for observations, actions in (
                (observations_np, actions_np),
                (observations_tf, actions_tf)):
            values = self.value_function.values(observations, actions)

            tf.debugging.assert_shapes(((values, (2, 1)),))
Example #3
 def prob(self, *args, **kwargs):
     """Compute the probability for a single action."""
     args_, kwargs_ = tree.map_structure(lambda x: x[None, ...],
                                         (args, kwargs))
     probs = self.probs(*args_, **kwargs_)
     prob = tree.map_structure(lambda x: x[0], probs)
     return prob
Example #4
 def value(self, *args, **kwargs):
     """Compute a value for a single input, (e.g. observation)."""
     args_, kwargs_ = tree.map_structure(
         lambda x: x[None, ...], (args, kwargs))
     values = self.values(*args_, **kwargs_)
     value = tree.map_structure(lambda x: x[0], values)
     return value
Example #5
    def test_field_initialization(self):
        def verify_field(field_attrs, field_values):
            self.assertEqual(field_values.shape,
                             (self.pool._max_size, *field_attrs.shape))
            self.assertEqual(field_values.dtype.name, field_attrs.dtype)

            np.testing.assert_array_equal(field_values, 0.0)

        tree.map_structure(verify_field, self.pool.fields, self.pool.data)
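
`tree.map_structure` also accepts several structures with the same layout and zips their leaves, which is how `verify_field` receives a field spec together with its data array. A hedged sketch with a stand-in `Field` class (dm-tree treats unregistered classes as leaves):

import numpy as np
import tree

class Field:
    def __init__(self, shape, dtype):
        self.shape, self.dtype = shape, dtype

fields = {'observations': Field((3,), 'float32'),
          'rewards': Field((1,), 'float32')}
data = {'observations': np.zeros((10, 3), np.float32),
        'rewards': np.zeros((10, 1), np.float32)}

def verify_field(field_attrs, field_values):
    # Leaves are paired positionally: one spec with one array.
    assert field_values.shape == (10, *field_attrs.shape)
    assert field_values.dtype.name == field_attrs.dtype

tree.map_structure(verify_field, fields, data)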
Example #6
    def load_experience(self, experience_path):
        with gzip.open(experience_path, 'rb') as f:
            latest_samples = pickle.load(f)

        num_samples = tree.flatten(latest_samples)[0].shape[0]

        def assert_shape(data):
            assert data.shape[0] == num_samples, data.shape

        tree.map_structure(assert_shape, latest_samples)

        self.add_samples(latest_samples)
        self._samples_since_save = 0
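
The `tree.flatten(latest_samples)[0]` trick works because dm-tree flattens leaves in a deterministic order (dict keys sorted), so the first leaf's leading dimension can stand in for the sample count. A small illustration with made-up fields:

import numpy as np
import tree

samples = {'actions': np.zeros((5, 2)), 'observations': np.zeros((5, 3))}
num_samples = tree.flatten(samples)[0].shape[0]
assert num_samples == 5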
Example #7
    def test_actions_and_log_probs(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]

        observations_np = type(observation1_np)(
            ((key,
              np.stack((observation1_np[key], observation2_np[key]),
                       axis=0).astype(np.float32))
             for key in observation1_np.keys()))

        observations_tf = tree.map_structure(
            lambda x: tf.constant(x, dtype=x.dtype), observations_np)

        for observations in (observations_np, observations_tf):
            actions = self.policy.actions(observations)
            log_pis = self.policy.log_probs(observations, actions)

            self.assertAllEqual(
                log_pis,
                tfp.distributions.Independent(
                    tfp.distributions.Uniform(
                        low=self.env.action_space.low,
                        high=self.env.action_space.high,
                    ),
                    reinterpreted_batch_ndims=1,
                ).log_prob(actions)[..., None])

            self.assertEqual(actions.shape, (2, *self.env.action_shape))
Example #8
File: vanilla.py Project: Alonso94/RAIL
def feedforward_Q_function(input_shapes,
                           *args,
                           preprocessors=None,
                           observation_keys=None,
                           name='feedforward_Q',
                           **kwargs):
    inputs = create_inputs(input_shapes)

    if preprocessors is None:
        preprocessors = tree.map_structure(lambda _: None, inputs)

    preprocessors = tree.map_structure_up_to(inputs,
                                             preprocessors_lib.deserialize,
                                             preprocessors)

    preprocessed_inputs = apply_preprocessors(preprocessors, inputs)

    # NOTE(hartikainen): `feedforward_model` would do the `cast_and_concat`
    # step for us, but tf2.2 broke sequential multi-input handling. See:
    # https://github.com/tensorflow/tensorflow/issues/37061.
    out = tf.keras.layers.Lambda(cast_and_concat)(preprocessed_inputs)
    Q_model_body = feedforward_model(*args,
                                     output_shape=[1],
                                     name=name,
                                     **kwargs)

    Q_model = tf.keras.Model(inputs, Q_model_body(out), name=name)

    Q_function = StateActionValueFunction(model=Q_model,
                                          observation_keys=observation_keys,
                                          name=name)

    return Q_function
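
`tree.map_structure_up_to` traverses only as deep as its first (shallow) argument, so each preprocessor spec above is handed to `preprocessors_lib.deserialize` whole rather than leaf by leaf. A hedged sketch of the mechanics, with string leaves standing in for Keras inputs and a toy deserializer:

import tree

inputs = {'pixels': 'input_pixels', 'state': 'input_state'}
preprocessor_specs = {
    'pixels': {'class_name': 'conv', 'config': {}},  # passed through intact
    'state': None,
}

deserialized = tree.map_structure_up_to(
    inputs,
    lambda spec: spec['class_name'] if spec is not None else None,
    preprocessor_specs)
assert deserialized == {'pixels': 'conv', 'state': None}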
Example #9
    def test_sequence_batch_by_indices(self):
        sequence_length = 2

        with self.assertRaises(ValueError):
            self.pool.sequence_batch_by_indices(
                np.array([-1, 2, 4]), sequence_length=sequence_length)

        path_lengths = [10, 4, 50, 36]
        assert sum(path_lengths) == self.pool._max_size, path_lengths

        samples = {
            'field1': np.arange(self.pool._max_size)[:, None],
            'field2': -np.arange(self.pool._max_size)[:, None] * 2,
        }
        for path_end, path_length in zip(np.cumsum(path_lengths),
                                         path_lengths):
            self.pool.add_path(
                tree.map_structure(
                    lambda s: s[path_end - path_length:path_end], samples))
        batch_indices = np.flip(np.arange(self.pool._max_size))
        full_pool_batch = self.pool.sequence_batch_by_indices(
            batch_indices, sequence_length=sequence_length)

        for key, values in full_pool_batch.items():
            if key == 'mask':
                self.assertEqual(values.shape,
                                 (self.pool._max_size, sequence_length))
            else:
                self.assertEqual(values.shape,
                                 (self.pool._max_size, sequence_length, 1))

        self.assertIn('mask', full_pool_batch)
        self.assertTrue(
            all(index_field in full_pool_batch.keys()
                for index_field in INDEX_FIELDS))

        episode_start_indices = np.flatnonzero(
            self.pool.data['episode_index_forwards'] == 0)
        episode_start_indices_batch = np.flatnonzero(
            np.isin(batch_indices, episode_start_indices))

        mask = full_pool_batch['mask']

        for key, values in full_pool_batch.items():
            if key in ('mask', *INDEX_FIELDS): continue
            expected = np.stack((
                np.roll(np.flip(samples[key]), -1),
                np.flip(samples[key]),
            ), axis=1)
            expected[episode_start_indices_batch, 0, :] = 0
            np.testing.assert_array_equal(expected, ~mask[..., None] * values)
            self.assertEqual(values.shape,
                             (self.pool._max_size, sequence_length, 1))

            # Make sure that the values at the start of the episode are zero.
            np.testing.assert_equal(
                ~mask[episode_start_indices_batch, :-1, None] *
                values[episode_start_indices_batch][:, :-1, :], 0)
Example #10
 def preprocess(x):
     """Cast to float, normalize, and concatenate images along last axis."""
     x = tree.map_structure(
         lambda image: tf.image.convert_image_dtype(image, tf.float32), x)
     x = tree.flatten(x)
     x = tf.concat(x, axis=-1)
     x = (x - 0.5) * 2.0  # leaves are already float32 in [0, 1]; rescale to [-1, 1]
     return x
Example #11
    def observation_shape(self):
        if not isinstance(self.observation_space, spaces.Dict):
            raise NotImplementedError(type(self.observation_space))

        observation_shape = tree.map_structure(
            lambda space: tf.TensorShape(space.shape),
            self.observation_space.spaces)

        return observation_shape
Example #12
def apply_preprocessors(preprocessors, inputs):
    tree.assert_same_structure(inputs, preprocessors)
    preprocessed_inputs = tree.map_structure(
        lambda preprocessor, input_: (preprocessor(input_)
                                      if preprocessor is not None else input_),
        preprocessors,
        inputs,
    )

    return preprocessed_inputs
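
A hypothetical call to `apply_preprocessors` as defined above: the two structures must match exactly, with `None` marking inputs that pass through unchanged (the lambda preprocessor is purely illustrative):

import numpy as np

inputs = {'pixels': np.full((1, 4), 255.0), 'state': np.ones((1, 2))}
preprocessors = {'pixels': lambda x: x / 255.0, 'state': None}

outputs = apply_preprocessors(preprocessors, inputs)
assert np.allclose(outputs['pixels'], 1.0)
assert outputs['state'] is inputs['state']  # untouched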
Example #13
    def __init__(self, max_size, fields):
        super(FlexibleReplayPool, self).__init__()

        max_size = int(max_size)
        self._max_size = max_size

        self.fields = {**fields, **INDEX_FIELDS}
        self.data = tree.map_structure(self._initialize_field, self.fields)

        self._pointer = 0
        self._size = 0
        self._samples_since_save = 0
Example #14
    def _preprocess_inputs(self, inputs):
        if self.preprocessors is None:
            preprocessors = tree.map_structure(lambda x: None, inputs)
        else:
            preprocessors = self.preprocessors

        preprocessed_inputs = apply_preprocessors(preprocessors, inputs)

        preprocessed_inputs = tf.keras.layers.Lambda(cast_and_concat)(
            preprocessed_inputs)

        return preprocessed_inputs
Example #15
    def test_save_latest_experience_with_overflown_pool(self):
        self.assertEqual(self.pool._samples_since_save, 0)

        num_samples = self.pool._max_size + 10
        samples = {
            'field1': np.arange(num_samples)[:, None],
            'field2': -np.arange(num_samples)[:, None] * 2,
        }
        self.pool.add_samples(samples)

        self.assertEqual(self.pool.size, self.pool._max_size)
        self.assertEqual(self.pool._samples_since_save, num_samples)
        self.pool.save_latest_experience('./tmp/pool_1.pkl')
        pool = create_pool(self.pool._max_size)
        self.assertEqual(pool.size, 0)

        import gzip
        with gzip.open('./tmp/pool_1.pkl', 'rb') as f:
            latest_samples = pickle.load(f)

            def assert_same_shape(field, data):
                expected_shape = (self.pool._max_size, *field.shape)
                self.assertEqual(data.shape, expected_shape)

            tree.map_structure(assert_same_shape, self.pool.fields,
                               latest_samples)

        pool.load_experience('./tmp/pool_1.pkl')
        self.assertEqual(pool.size, self.pool._max_size)

        assert all(index_field in pool.fields.keys()
                   for index_field in INDEX_FIELDS)

        def assert_field_data_shape(field_data, field_samples):
            np.testing.assert_array_equal(field_data,
                                          field_samples[-self.pool._max_size:])

        tree.map_structure(assert_field_data_shape, pool.data, samples)
Example #16
def create_sequence_inputs(shapes, dtypes=None):
    """Creates `tf.keras.layers.Input`s usable for sequential models like RNN.

    Args:
        See `create_inputs`.

    Returns:
        inputs: nested structure, matching `shapes`, containing
        `tf.keras.layers.Input`s, each with shape (None, ...).
    """
    shapes = tree.map_structure(lambda x: tf.TensorShape([None]) + x, shapes)
    sequence_inputs = create_inputs(shapes, dtypes)

    return sequence_inputs
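
The `tf.TensorShape([None]) + x` step prepends an unknown sequence dimension to every leaf shape; dm-tree treats `tf.TensorShape` as a leaf, which Example #11 relies on as well. A brief hedged illustration:

import tensorflow as tf
import tree

shapes = {'pixels': tf.TensorShape((32, 32, 3)),
          'state': tf.TensorShape((7,))}
sequence_shapes = tree.map_structure(
    lambda x: tf.TensorShape([None]) + x, shapes)
assert sequence_shapes['pixels'].as_list() == [None, 32, 32, 3]
assert sequence_shapes['state'].as_list() == [None, 7]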
Example #17
    def sample(self):
        if self._is_first_step:
            self.reset()

        action = self.policy.action(self._policy_input).numpy()

        next_observation, reward, terminal, info = self.environment.step(
            action)
        self._path_length += 1
        self._path_return += reward
        self._total_samples += 1

        processed_sample = self._process_sample(
            observation=self._current_observation,
            action=action,
            reward=reward,
            terminal=terminal,
            next_observation=next_observation,
            info=info,
        )

        self._current_path.append(processed_sample)

        if terminal or self._path_length >= self._max_path_length:
            last_path = tree.map_structure(
                lambda *x: np.stack(x, axis=0), *self._current_path)

            self.pool.add_path({
                key: value
                for key, value in last_path.items()
                if key != 'infos'
            })

            self._last_n_paths.appendleft(last_path)

            self._max_path_return = max(self._max_path_return,
                                        self._path_return)
            self._last_path_return = self._path_return
            self._n_episodes += 1

            self.pool.terminate_episode()

            self._is_first_step = True
            # The reset happens at the beginning of the next episode; see above.

        else:
            self._current_observation = next_observation
            self._is_first_step = False

        return next_observation, reward, terminal, info
Example #18
    def batch_by_indices(self,
                         indices,
                         field_name_filter=None,
                         validate_index=True):
        if validate_index and np.any(self.size <= indices % self._max_size):
            raise ValueError(
                "Tried to retrieve batch with indices greater than current"
                " size")

        if field_name_filter is not None:
            raise NotImplementedError("TODO(hartikainen)")

        batch = tree.map_structure(
            lambda field: field[indices % self._max_size], self.data)
        return batch
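
The `indices % self._max_size` above implements ring-buffer wraparound: negative or overflowing indices address the pool modulo its capacity. A minimal sketch with a one-field pool stand-in:

import numpy as np
import tree

max_size = 10
data = {'field1': np.arange(max_size)[:, None]}

indices = np.array([-1, 3, 12])
batch = tree.map_structure(lambda field: field[indices % max_size], data)
assert batch['field1'].ravel().tolist() == [9, 3, 2]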
Example #19
    def test_serialize_deserialize_empty(self):
        self.assertEqual(self.pool._size, 0)
        tree.map_structure(
            lambda field_data: np.testing.assert_array_equal(field_data, 0.0),
            self.pool.data)

        serialized = pickle.dumps(self.pool)
        deserialized = pickle.loads(serialized)
        for key in deserialized.__dict__:
            if key == 'data':
                for field_name in self.pool.__dict__[key]:
                    np.testing.assert_array_equal(
                        self.pool.__dict__[key][field_name],
                        deserialized.__dict__[key][field_name])
            else:
                np.testing.assert_array_equal(self.pool.__dict__[key],
                                              deserialized.__dict__[key])

        self.assertNotEqual(id(self.pool), id(deserialized))

        self.assertEqual(deserialized._size, 0)
        for field_name, field_attrs in self.pool.fields.items():
            np.testing.assert_array_equal(self.pool.fields[field_name],
                                          deserialized.fields[field_name])
Example #20
def REPLACE_FULL_OBSERVATION(original_batch, resampled_batch, where_resampled,
                             environment):
    def replace_original_with_resampled(original_goals, resampled_goals):
        np.testing.assert_equal(original_goals[where_resampled].shape,
                                resampled_goals.shape)
        new_goals = original_goals.copy()
        new_goals[where_resampled] = resampled_goals.copy()
        return new_goals

    new_batch = original_batch.copy()
    new_goals = tree.map_structure(replace_original_with_resampled,
                                   original_batch['goals'],
                                   resampled_batch['goals'])
    new_batch['goals'] = new_goals

    return new_batch
Example #21
def create_inputs(shapes, dtypes=None):
    """Creates `tf.keras.layers.Input`s based on input shapes.

    Args:
        shapes: (possibly nested) list/array/dict structure of input
            shapes.

    Returns:
        inputs: nested structure, matching `shapes`, containing
        `tf.keras.layers.Input`s.

    TODO(hartikainen): Need to figure out a better way for handling the dtypes.
    """
    if dtypes is None:
        dtypes = tree.map_structure(lambda _: None, shapes)
    inputs = tree.map_structure_with_path(create_input, shapes, dtypes)

    return inputs
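
`tree.map_structure_with_path` passes each leaf's path (a tuple of keys/indices) as the first argument, which presumably lets `create_input` (not shown here) derive unique layer names. A small sketch of the path mechanics, with string leaves standing in for shapes:

import tree

shapes = {'observations': {'pixels': 'shape_a', 'state': 'shape_b'}}

names = tree.map_structure_with_path(
    lambda path, shape: '/'.join(path), shapes)
assert names == {'observations': {'pixels': 'observations/pixels',
                                  'state': 'observations/state'}}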
Example #22
    def test_actions_and_log_probs(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]

        observations_np = type(observation1_np)(
            ((key,
              np.stack((observation1_np[key], observation2_np[key]),
                       axis=0).astype(np.float32))
             for key in observation1_np.keys()))

        observations_tf = tree.map_structure(
            lambda x: tf.constant(x, dtype=x.dtype), observations_np)

        for observations in (observations_np, observations_tf):
            actions = self.policy.actions(observations)
            log_pis = self.policy.log_probs(observations, actions)

            self.assertEqual(actions.shape, (2, *self.env.action_space.shape))
            self.assertEqual(log_pis.shape, (2, 1))
Example #23
    def _do_training_repeats(self, timestep):
        """Repeat training _n_train_repeat times every _train_every_n_steps"""
        if timestep % self._train_every_n_steps > 0: return
        trained_enough = (
            self._train_steps_this_epoch
            > self._max_train_repeat_per_timestep * self._timestep)
        if trained_enough: return

        diagnostics = [
            self._do_training(iteration=timestep, batch=self._training_batch())
            for i in range(self._n_train_repeat)
        ]

        diagnostics = tree.map_structure(
            lambda *d: tf.reduce_mean(d).numpy(), *diagnostics)

        self._num_train_steps += self._n_train_repeat
        self._train_steps_this_epoch += self._n_train_repeat

        return diagnostics
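
Unpacking a list of identically structured diagnostics dicts into `tree.map_structure` reduces them leafwise, which is how the per-repeat diagnostics above are averaged. A NumPy-only sketch of the same variadic pattern (the keys are made up):

import numpy as np
import tree

diagnostics = [{'Q_loss': 1.0, 'alpha': 0.5},
               {'Q_loss': 3.0, 'alpha': 1.5}]
mean_diagnostics = tree.map_structure(lambda *d: np.mean(d), *diagnostics)
assert mean_diagnostics == {'Q_loss': 2.0, 'alpha': 1.0}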
Example #24
    def __init__(self,
                 input_shapes,
                 output_shape,
                 observation_keys=None,
                 preprocessors=None,
                 name='policy'):
        self._input_shapes = input_shapes
        self._output_shape = output_shape
        self._observation_keys = observation_keys
        self._inputs = create_inputs(input_shapes)

        if preprocessors is None:
            preprocessors = tree.map_structure(lambda x: None, input_shapes)

        preprocessors = tree.map_structure_up_to(input_shapes,
                                                 preprocessors_lib.deserialize,
                                                 preprocessors)

        self._preprocessors = preprocessors

        self._name = name
Example #25
 def get_diagnostics_np(self, *args, **kwargs):
     diagnostics = self.get_diagnostics(*args, **kwargs)
     diagnostics_np = tree.map_structure(lambda x: x.numpy(), diagnostics)
     return diagnostics_np
Example #26
    def test_sequence_overlaps_two_episodes(self):
        sequence_length = self.pool._max_size
        path_lengths = [
            self.pool._max_size // 2 - 5,
            (self.pool._max_size // 2) - 3,
            8,
        ]

        samples = {
            'field1': np.arange(self.pool._max_size)[:, None],
            'field2': -np.arange(self.pool._max_size)[::-1, None] * 2,
        }
        for path_end, path_length in zip(np.cumsum(path_lengths),
                                         path_lengths):
            self.pool.add_path(
                tree.map_structure(
                    lambda s: s[path_end - path_length:path_end], samples))

        batch_indices = np.array([0, *np.cumsum(path_lengths), -2, -1])
        batch = self.pool.sequence_batch_by_indices(
            batch_indices, sequence_length=sequence_length)

        self.assertIn('mask', batch)
        self.assertTrue(
            all(index_field in batch.keys() for index_field in INDEX_FIELDS))

        for key, values in batch.items():
            if key == 'mask':
                self.assertEqual(values.shape,
                                 (batch_indices.size, sequence_length))
            else:
                self.assertEqual(values.shape,
                                 (batch_indices.size, sequence_length, 1))

        for i, (episode_start_index, episode_length) in enumerate(
                zip((0, *np.cumsum(path_lengths)[:-1]), path_lengths)):
            np.testing.assert_equal(
                ~batch['mask'][i][:-1] * batch['field1'][i][:-1], 0)
            np.testing.assert_equal(
                ~batch['mask'][i][-1] * batch['field1'][i][-1],
                samples['field1'][episode_start_index])
            np.testing.assert_equal(
                ~batch['mask'][i][:-1] * batch['field2'][i][:-1], 0)
            np.testing.assert_equal(
                ~batch['mask'][i][-1] * batch['field2'][i][-1],
                samples['field2'][episode_start_index])

            np.testing.assert_equal(
                ~batch['mask'][i, :, None] *
                batch['episode_index_forwards'][i], 0)
            np.testing.assert_equal(
                ~batch['mask'][i, :, None][:-1] *
                batch['episode_index_backwards'][i][:-1], 0)
            np.testing.assert_equal(
                ~batch['mask'][i, :, None][-1] *
                batch['episode_index_backwards'][i][-1], episode_length - 1)

        np.testing.assert_equal(
            ~batch['mask'][-2, :-path_lengths[-1] + 1, None] *
            batch['field1'][-2][:-path_lengths[-1] + 1], 0)
        np.testing.assert_equal(
            ~batch['mask'][-2, -path_lengths[-1] + 1:, None] *
            batch['field1'][-2][-path_lengths[-1] + 1:],
            samples['field1'][-path_lengths[-1]:-1])

        np.testing.assert_equal(
            ~batch['mask'][-2, :-path_lengths[-1] + 1, None] *
            batch['field2'][-2][:-path_lengths[-1] + 1], 0)
        np.testing.assert_equal(
            ~batch['mask'][-2, -path_lengths[-1] + 1:, None] *
            batch['field2'][-2][-path_lengths[-1] + 1:],
            samples['field2'][-path_lengths[-1]:-1])

        np.testing.assert_equal(
            ~batch['mask'][-1, :-path_lengths[-1], None] *
            batch['field1'][-1][:-path_lengths[-1]], 0)
        np.testing.assert_equal(
            ~batch['mask'][-1, -path_lengths[-1]:, None] *
            batch['field1'][-1][-path_lengths[-1]:],
            samples['field1'][-path_lengths[-1]:])

        np.testing.assert_equal(
            ~batch['mask'][-1, :-path_lengths[-1], None] *
            batch['field2'][-1][:-path_lengths[-1]], 0)
        np.testing.assert_equal(
            ~batch['mask'][-1, -path_lengths[-1]:, None] *
            batch['field2'][-1][-path_lengths[-1]:],
            samples['field2'][-path_lengths[-1]:])
Example #27
 def add_sample(self, sample):
     # Give the single sample a leading batch axis before adding it;
     # `add_samples` expects the first dimension to index samples.
     samples = tree.map_structure(lambda x: x[np.newaxis, ...], sample)
     self.add_samples(samples)
Example #28
def cast_and_concat(x):
    x = tree.map_structure(lambda element: tf.cast(element, tf.float32), x)
    x = tree.flatten(x)
    x = tf.concat(x, axis=-1)
    return x
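
A short usage sketch for `cast_and_concat` as defined above: every leaf is cast to `float32`, the structure is flattened (dict keys in sorted order), and the leaves are concatenated along the last axis. The shapes here are illustrative:

import tensorflow as tf

x = {'goal': tf.zeros((2, 2), tf.float64), 'state': tf.zeros((2, 3))}
out = cast_and_concat(x)
assert out.shape == (2, 5)
assert out.dtype == tf.float32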
Example #29
    def _train(self):
        """Return a generator that performs RL training.

        Args:
            training_environment (`SoftlearningEnv`): Environment used for
                training.
            policy (`Policy`): Policy used for training.
            pool (`PoolBase`): Sample pool to add samples to.
        """
        training_environment = self._training_environment
        evaluation_environment = self._evaluation_environment
        policy = self._policy

        gt.reset_root()
        gt.rename_root('RLAlgorithm')
        gt.set_def_unique(False)

        self._training_before_hook()

        for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):
            self._epoch_before_hook()
            gt.stamp('epoch_before_hook')

            update_diagnostics = []

            start_samples = self.sampler._total_samples
            for i in count():
                samples_now = self.sampler._total_samples
                self._timestep = samples_now - start_samples

                if (samples_now >= start_samples + self._epoch_length
                    and self.ready_to_train):
                    break

                self._timestep_before_hook()
                gt.stamp('timestep_before_hook')

                self._do_sampling(timestep=self._total_timestep)
                gt.stamp('sample')

                if self.ready_to_train:
                    update_diagnostics.append(self._do_training_repeats(
                        timestep=self._total_timestep))

                gt.stamp('train')

                self._timestep_after_hook()
                gt.stamp('timestep_after_hook')

            update_diagnostics = tree.map_structure(
                lambda *d: np.mean(d), *update_diagnostics)

            training_paths = self.sampler.get_last_n_paths(
                math.ceil(self._epoch_length / self.sampler._max_path_length))
            gt.stamp('training_paths')
            evaluation_paths = self._evaluation_paths(
                policy, evaluation_environment)
            gt.stamp('evaluation_paths')

            training_metrics = self._evaluate_rollouts(
                training_paths,
                training_environment,
                self._total_timestep,
                evaluation_type='train')
            gt.stamp('training_metrics')
            if evaluation_paths:
                evaluation_metrics = self._evaluate_rollouts(
                    evaluation_paths,
                    evaluation_environment,
                    self._total_timestep,
                    evaluation_type='evaluation')
                gt.stamp('evaluation_metrics')
            else:
                evaluation_metrics = {}

            self._epoch_after_hook(training_paths)
            gt.stamp('epoch_after_hook')

            sampler_diagnostics = self.sampler.get_diagnostics()

            diagnostics = self.get_diagnostics(
                iteration=self._total_timestep,
                batch=self._evaluation_batch(),
                training_paths=training_paths,
                evaluation_paths=evaluation_paths)

            time_diagnostics = {
                key: times[-1]
                for key, times in gt.get_times().stamps.itrs.items()
            }

            # TODO(hartikainen/tf2): Fix the naming of training/update
            # diagnostics/metric
            diagnostics.update((
                ('evaluation', evaluation_metrics),
                ('training', training_metrics),
                ('update', update_diagnostics),
                ('times', time_diagnostics),
                ('sampler', sampler_diagnostics),
                ('epoch', self._epoch),
                ('timestep', self._timestep),
                ('total_timestep', self._total_timestep),
                ('num_train_steps', self._num_train_steps),
            ))

            if self._eval_render_kwargs and hasattr(
                    evaluation_environment, 'render_rollouts'):
                # TODO(hartikainen): Make this consistent such that there's no
                # need for the hasattr check.
                training_environment.render_rollouts(evaluation_paths)

            yield diagnostics

        self.sampler.terminate()

        self._training_after_hook()

        yield {'done': True, **diagnostics}