Example #1
    def _get_Q_target(self):
        policy_inputs = flatten_input_structure({
            name: self._placeholders['next_observations'][name]
            for name in self._policy.observation_keys
        })
        next_actions = self._policy.actions(policy_inputs)
        next_log_pis = self._policy.log_pis(policy_inputs, next_actions)

        next_Q_observations = {
            name: self._placeholders['next_observations'][name]
            for name in self._Qs[0].observation_keys
        }
        next_Q_inputs = flatten_input_structure(
            {**next_Q_observations, 'actions': next_actions})
        next_Qs_values = tuple(Q(next_Q_inputs) for Q in self._Q_targets)

        min_next_Q = tf.reduce_min(next_Qs_values, axis=0)
        next_values = min_next_Q - self._alpha * next_log_pis

        terminals = tf.cast(self._placeholders['terminals'], next_values.dtype)

        Q_target = td_target(
            reward=self._reward_scale * self._placeholders['rewards'],
            discount=self._discount,
            next_value=(1 - terminals) * next_values)

        return tf.stop_gradient(Q_target)
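The `td_target` helper used above is not shown on this page; below is a minimal sketch of the conventional one-step TD backup it is assumed to compute (terminals are already folded into `next_value` by the caller):

def td_target(reward, discount, next_value):
    # Standard one-step temporal-difference target: r + gamma * V(s').
    return reward + discount * next_value

# Tiny numeric check with made-up values.
print(td_target(reward=1.0, discount=0.99, next_value=0.5))  # 1.495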
Example #2
    def _init_classifier_update(self):
        classifier_inputs = flatten_input_structure({
            name: self._placeholders['observations'][name]
            for name in self._classifier.observation_keys
        })
        log_p = self._classifier(classifier_inputs)
        policy_inputs = flatten_input_structure({
            name: self._placeholders['observations'][name]
            for name in self._policy.observation_keys
        })
        sampled_actions = self._policy.actions(policy_inputs)
        log_pi = self._policy.log_pis(policy_inputs, sampled_actions)
        # pi / (pi + f), f / (f + pi)
        log_pi_log_p_concat = tf.concat([log_pi, log_p], axis=1)

        # self._classifier.summary()
        # gradient = tf.norm(tf.gradients(
        #     log_p, self._classifier.layers[1].outputs), ord=2, axis=-1)
        # gradient_penalty = self._gradient_penalty_weight * gradient

        self._classifier_loss_t = tf.reduce_mean(
            tf.compat.v1.losses.softmax_cross_entropy(
                self._placeholders['labels'],
                log_pi_log_p_concat,
            )
            # + gradient_penalty
        )
        self._classifier_training_op = self._get_classifier_training_op()
Example #3
    def _get_Q_target(self):
        policy_inputs = flatten_input_structure({
            name: self._placeholders['next_observations'][name]
            for name in self._policy.observation_keys
        })
        next_actions = self._policy.actions(policy_inputs)
        next_log_pis = self._policy.log_pis(policy_inputs, next_actions)

        next_Q_observations = {
            name: self._placeholders['next_observations'][name]
            for name in self._Qs[0].observation_keys
        }
        next_Q_inputs = flatten_input_structure(
            {**next_Q_observations, 'actions': next_actions})
        next_Qs_values = tuple(Q(next_Q_inputs) for Q in self._Q_targets)

        min_next_Q = tf.reduce_min(next_Qs_values, axis=0)
        next_values = min_next_Q - self._alpha * next_log_pis

        # TODO: pass through both, and filter by goal_index
        classifier_0_inputs = flatten_input_structure({
            name: self._placeholders['observations'][name]
            for name in self._classifier_0.observation_keys
        })
        classifier_1_inputs = flatten_input_structure({
            name: self._placeholders['observations'][name]
            for name in self._classifier_1.observation_keys
        })

        observation_logits_0 = self._classifier_0(classifier_0_inputs)
        observation_logits_1 = self._classifier_1(classifier_1_inputs)

        # TODO: Merge the two outputs, based on the info/obs/current_goal
        goal_index_mask = self._placeholders['observations']['goal_index']
        # Use above to merge the two.

        # Use observation_logits_1 where goal is 1, observation_logits_0 where goal is 0
        observation_logits = tf.where(
            tf.cast(goal_index_mask, dtype=tf.bool),
            x=observation_logits_1,
            y=observation_logits_0)

        self._reward_t = observation_logits

        terminals = tf.cast(self._placeholders['terminals'], next_values.dtype)

        Q_target = td_target(
            reward=self._reward_scale * self._reward_t,
            discount=self._discount,
            next_value=(1 - terminals) * next_values)

        return Q_target
Example #4
    def _policy_input(self):
        observation = super(GoalSampler, self)._policy_input
        goal = flatten_input_structure({
            key: self._current_observation[key][None, ...]
            for key in self.policy.goal_keys
        })
        return observation + goal
Example #5
    def test_env_step_with_actions(self):
        observation_np = self.env.reset()
        observations_np = flatten_input_structure(
            {key: value[None, :]
             for key, value in observation_np.items()})
        action = self.policy.actions_np(observations_np)[0, ...]
        self.env.step(action)
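In all of these examples, `flatten_input_structure` turns a (possibly nested) dict of inputs into the flat sequence that the Keras-style models consume positionally. A minimal stand-in with the same observable behaviour, assuming it flattens like `tf.nest` (dict entries in sorted-key order):

import tensorflow as tf

def flatten_input_structure_sketch(inputs):
    # tf.nest.flatten walks dicts in sorted-key order and returns a flat list.
    return tf.nest.flatten(inputs)

example = {'observations': {'pixels': 1, 'proprio': 2}, 'actions': 3}
print(flatten_input_structure_sketch(example))  # [3, 1, 2]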
Example #6
    def test_serialize_deserialize(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]

        observations_np = {}
        for key in observation1_np.keys():
            observations_np[key] = np.stack(
                (observation1_np[key],
                 observation2_np[key])).astype(np.float32)
        observations_np = flatten_input_structure(observations_np)

        weights = self.policy.get_weights()
        actions_np = self.policy.actions_np(observations_np)
        log_pis_np = self.policy.log_pis_np(observations_np, actions_np)

        serialized = pickle.dumps(self.policy)
        deserialized = pickle.loads(serialized)

        weights_2 = deserialized.get_weights()
        log_pis_np_2 = deserialized.log_pis_np(observations_np, actions_np)

        for weight, weight_2 in zip(weights, weights_2):
            np.testing.assert_array_equal(weight, weight_2)

        np.testing.assert_array_equal(log_pis_np, log_pis_np_2)
        np.testing.assert_equal(actions_np.shape,
                                deserialized.actions_np(observations_np).shape)
Example #7
    def _init_external_reward(self):
        classifier_inputs = flatten_input_structure({
            name: self._placeholders['observations'][name]
            for name in self._classifiers[0].observation_keys
        })

        observation_logits_per_classifier = [
            classifier(classifier_inputs) for classifier in self._classifiers
        ]

        # DEBUG
        # self._observation_logits_per_classifier = observation_logits_per_classifier
        goal_indices = self._placeholders['observations']['goal_index']
        goal_index_masks = [
            tf.equal(goal_indices, goal) for goal in range(self._num_goals)
        ]

        # DEBUG
        # self._goal_index_masks = goal_index_masks

        # Select each sample's logits from the classifier matching its goal index
        observation_logits = observation_logits_per_classifier[0]
        for goal in range(1, self._num_goals):
            observation_logits = tf.where(
                goal_index_masks[goal],
                x=observation_logits_per_classifier[goal],
                y=observation_logits)

        self._ext_reward = self._reward_t = observation_logits
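The goal-indexed selection above can be reproduced with plain NumPy; a small sketch (batch size, goal count, and values are made up) of how the chained `tf.where` picks each row's logit from the classifier matching its goal index:

import numpy as np

num_goals = 2
goal_index = np.array([[0], [1], [1], [0]])      # hypothetical batch of 4
logits_per_classifier = [
    np.array([[0.1], [0.2], [0.3], [0.4]]),      # classifier for goal 0
    np.array([[1.1], [1.2], [1.3], [1.4]]),      # classifier for goal 1
]

# Start from classifier 0 and overwrite the rows whose goal index matches,
# mirroring the tf.where chain in `_init_external_reward`.
logits = logits_per_classifier[0]
for goal in range(1, num_goals):
    logits = np.where(goal_index == goal, logits_per_classifier[goal], logits)

print(logits)  # [[0.1] [1.2] [1.3] [0.4]]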
Example #8
    def _init_external_rewards(self):
        classifier_inputs = flatten_input_structure({
            name: self._placeholders['observations'][name]
            for name in self._classifiers[0].observation_keys
        })

        observation_logits_per_classifier = [
            classifier(classifier_inputs) for classifier in self._classifiers
        ]

        # # DEBUG
        # # self._observation_logits_per_classifier = observation_logits_per_classifier
        # goal_indices = self._placeholders['observations']['goal_index']
        # goal_index_masks = [
        #     tf.equal(goal_indices, goal)
        #     for goal in range(self._num_goals)
        # ]

        # DEBUG
        # self._goal_index_masks = goal_index_masks

        # Replace the correct classification logits for the respective goals
        # observation_logits = observation_logits_per_classifier[0]
        # for goal in range(1, self._num_goals):
        #     observation_logits = tf.where(
        #        goal_index_masks[goal],
        #        x=observation_logits_per_classifier[goal],
        #        y=observation_logits
        #     )
        self._unscaled_ext_rewards = observation_logits_per_classifier
Example #9
    def test_get_diagnostics(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]

        observations_np = {}
        for key in observation1_np.keys():
            observations_np[key] = np.stack((
                observation1_np[key], observation2_np[key]
            )).astype(np.float32)
        observations_np = flatten_input_structure(observations_np)

        diagnostics = self.policy.get_diagnostics(observations_np)

        self.assertTrue(isinstance(diagnostics, OrderedDict))
        self.assertEqual(
            tuple(diagnostics.keys()),
            ('shifts-mean',
             'shifts-std',
             'log_scale_diags-mean',
             'log_scale_diags-std',
             '-log-pis-mean',
             '-log-pis-std',
             'raw-actions-mean',
             'raw-actions-std',
             'actions-mean',
             'actions-std',
             'actions-min',
             'actions-max'))

        for value in diagnostics.values():
            self.assertTrue(np.isscalar(value))
Example #10
    def test_latent_smoothing(self):
        observation_np = self.env.reset()
        smoothed_policy = FeedforwardGaussianPolicy(
            input_shapes=self.env.observation_shape,
            output_shape=self.env.action_space.shape,
            hidden_layer_sizes=self.hidden_layer_sizes,
            smoothing_coefficient=0.5,
            observation_keys=self.env.observation_keys)

        np.testing.assert_equal(smoothed_policy._smoothing_x, 0.0)
        self.assertEqual(smoothed_policy._smoothing_alpha, 0.5)
        self.assertEqual(
            smoothed_policy._smoothing_beta,
            np.sqrt((1.0 - np.power(smoothed_policy._smoothing_alpha, 2.0)))
            / (1.0 - smoothed_policy._smoothing_alpha))

        smoothing_x_previous = smoothed_policy._smoothing_x
        for i in range(5):
            observations_np = flatten_input_structure({
                key: value[None, :] for key, value in observation_np.items()
            })
            action_np = smoothed_policy.actions_np(observations_np)[0]
            observation_np = self.env.step(action_np)[0]

            self.assertFalse(np.all(np.equal(
                smoothing_x_previous,
                smoothed_policy._smoothing_x)))
            smoothing_x_previous = smoothed_policy._smoothing_x

        smoothed_policy.reset()

        np.testing.assert_equal(smoothed_policy._smoothing_x, 0.0)
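The `_smoothing_beta` assertion above follows directly from the formula in the test, beta = sqrt(1 - alpha^2) / (1 - alpha); a quick numeric check for alpha = 0.5:

import numpy as np

alpha = 0.5
beta = np.sqrt(1.0 - alpha ** 2) / (1.0 - alpha)
print(beta)  # 1.7320508... (i.e. sqrt(3))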
Example #11
    def _policy_input(self):
        observation = flatten_input_structure({
            key: self._current_observation[key][None, ...]
            for key in self.policy.observation_keys
        })

        return observation
Example #12
def create_feedforward_Q_function(input_shapes,
                                  *args,
                                  preprocessors=None,
                                  observation_keys=None,
                                  goal_keys=None,
                                  name='feedforward_Q',
                                  **kwargs):
    inputs_flat = create_inputs(input_shapes)
    preprocessors_flat = (flatten_input_structure(preprocessors)
                          if preprocessors is not None else tuple(
                              None for _ in inputs_flat))

    assert len(inputs_flat) == len(preprocessors_flat), (inputs_flat,
                                                         preprocessors_flat)

    preprocessed_inputs = [
        preprocessor(input_) if preprocessor is not None else input_
        for preprocessor, input_ in zip(preprocessors_flat, inputs_flat)
    ]

    Q_function = feedforward_model(*args, output_size=1, name=name, **kwargs)

    Q_function = PicklableModel(inputs_flat, Q_function(preprocessed_inputs))
    preprocessed_inputs_fn = PicklableModel(inputs_flat, preprocessed_inputs)

    Q_function.observation_keys = observation_keys or ()
    Q_function.goal_keys = goal_keys or ()
    Q_function.all_keys = Q_function.observation_keys + Q_function.goal_keys

    Q_function.actions_preprocessors = (
        preprocessors['actions'] if preprocessors is not None else None)
    Q_function.observations_preprocessors = (
        preprocessors['observations'] if preprocessors is not None else None)

    Q_function.preprocessed_inputs_fn = preprocessed_inputs_fn
    return Q_function
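The preprocessor handling in this factory (pair every flat input with its preprocessor, or fall back to the identity) recurs in several examples below; a standalone sketch of just that pattern:

def apply_preprocessors(inputs_flat, preprocessors_flat=None):
    # A missing preprocessor list, or a None entry, means "pass through unchanged".
    if preprocessors_flat is None:
        preprocessors_flat = tuple(None for _ in inputs_flat)
    assert len(inputs_flat) == len(preprocessors_flat)
    return [
        preprocessor(x) if preprocessor is not None else x
        for preprocessor, x in zip(preprocessors_flat, inputs_flat)
    ]

print(apply_preprocessors([1.0, 2.0], (None, lambda x: x * 10)))  # [1.0, 20.0]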
Example #13
    def _init_rnd_updates(self):
        (self._rnd_errors,
         self._rnd_losses,
         self._rnd_error_stds,
         self._rnd_optimizers) = [], [], [], []
        for i in range(self._num_goals):
            self._placeholders['reward'].update({
                f'running_int_rew_std_{i}': tf.compat.v1.placeholder(
                    tf.float32, shape=(), name=f'running_int_rew_std_{i}')
            })
            policy_inputs = flatten_input_structure({
                name: self._placeholders['observations'][name]
                for name in self._policies[i].observation_keys
            })

            targets = tf.stop_gradient(self._rnd_targets[i](policy_inputs))
            predictions = self._rnd_predictors[i](policy_inputs)

            self._rnd_errors.append(tf.expand_dims(tf.reduce_mean(
                tf.math.squared_difference(targets, predictions), axis=-1), 1))
            self._rnd_losses.append(tf.reduce_mean(self._rnd_errors[i]))
            self._rnd_error_stds.append(tf.math.reduce_std(self._rnd_errors[i]))
            self._rnd_optimizers.append(tf.compat.v1.train.AdamOptimizer(
                learning_rate=self._rnd_lr,
                name=f"rnd_optimizer_{i}"))
            rnd_train_op = self._rnd_optimizers[i].minimize(
                loss=self._rnd_losses[i])
            self._training_ops_per_policy[i].update(
                {f'rnd_train_op_{i}': rnd_train_op}
            )
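The per-sample RND error above is just the mean squared difference between a frozen random target network and a trained predictor; a NumPy sketch of the same reduction (shapes are illustrative only):

import numpy as np

batch, feature_dim = 4, 8
rng = np.random.default_rng(0)
targets = rng.normal(size=(batch, feature_dim))       # frozen random-target features
predictions = rng.normal(size=(batch, feature_dim))   # predictor output

# Matches tf.expand_dims(tf.reduce_mean(squared_difference, axis=-1), 1).
rnd_errors = np.mean((targets - predictions) ** 2, axis=-1, keepdims=True)
rnd_loss = rnd_errors.mean()
print(rnd_errors.shape, rnd_loss)  # (4, 1) ...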
Example #14
def create_feedforward_Q_function(input_shapes,
                                  *args,
                                  preprocessors=None,
                                  observation_keys=None,
                                  name='feedforward_Q',
                                  **kwargs):
    print(input_shapes)
    inputs_flat = create_inputs(input_shapes)
    preprocessors_flat = (flatten_input_structure(preprocessors)
                          if preprocessors is not None else tuple(
                              None for _ in inputs_flat))

    assert len(inputs_flat) == len(preprocessors_flat), (inputs_flat,
                                                         preprocessors_flat)

    preprocessed_inputs = [
        tf.cast(preprocessor(input_), dtype=tf.float32)
        if preprocessor is not None else tf.cast(input_, dtype=tf.float32)
        for preprocessor, input_ in zip(preprocessors_flat, inputs_flat)
    ]

    Q_function = feedforward_model(*args, output_size=1, name=name, **kwargs)

    Q_function = PicklableModel(inputs_flat, Q_function(preprocessed_inputs))
    Q_function.observation_keys = observation_keys

    return Q_function
Example #15
def create_embedding_fn(input_shapes,
                        embedding_dim,
                        *args,
                        preprocessors=None,
                        observation_keys=None,
                        goal_keys=None,
                        name='embedding_fn',
                        **kwargs):
    inputs_flat = create_inputs(input_shapes)
    preprocessors_flat = (flatten_input_structure(preprocessors)
                          if preprocessors is not None else tuple(
                              None for _ in inputs_flat))

    assert len(inputs_flat) == len(preprocessors_flat), (inputs_flat,
                                                         preprocessors_flat)

    preprocessed_inputs = [
        preprocessor(input_) if preprocessor is not None else input_
        for preprocessor, input_ in zip(preprocessors_flat, inputs_flat)
    ]

    embedding_fn = feedforward_model(*args,
                                     output_size=embedding_dim,
                                     name=f'feedforward_{name}',
                                     **kwargs)

    embedding_fn = PicklableModel(inputs_flat,
                                  embedding_fn(preprocessed_inputs),
                                  name=name)

    embedding_fn.observation_keys = observation_keys or tuple()
    embedding_fn.goal_keys = goal_keys or tuple()
    embedding_fn.all_keys = embedding_fn.observation_keys + embedding_fn.goal_keys

    return embedding_fn
Example #16
    def test_actions_and_log_pis_symbolic(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]

        observations_np = {}
        for key in observation1_np.keys():
            observations_np[key] = np.stack(
                (observation1_np[key],
                 observation2_np[key])).astype(np.float32)

        observations_np = flatten_input_structure(observations_np)
        observations_tf = [
            tf.constant(x, dtype=tf.float32) for x in observations_np
        ]

        actions = self.policy.actions(observations_tf)
        log_pis = self.policy.log_pis(observations_tf, actions)

        self.assertEqual(actions.shape, (2, *self.env.action_space.shape))
        self.assertEqual(log_pis.shape, (2, 1))

        self.evaluate(tf.compat.v1.global_variables_initializer())

        actions_np = self.evaluate(actions)
        log_pis_np = self.evaluate(log_pis)

        self.assertEqual(actions_np.shape, (2, *self.env.action_space.shape))
        self.assertEqual(log_pis_np.shape, (2, 1))
Example #17
    def get_diagnostics(self,
                        iteration,
                        batch,
                        evaluation_paths,
                        training_paths):
        """Return diagnostic information as ordered dictionary.

        Also calls the `draw` method of the plotter, if plotter defined.
        """

        feed_dict = self._get_feed_dict(iteration, batch)
        # TODO: We need to unwrap self._diagnostics_ops from its
        # tensorflow `_DictWrapper`.
        diagnostics = self._session.run({**self._diagnostics_ops}, feed_dict)

        diagnostics.update(OrderedDict([
            (f'policy/{key}', value)
            for key, value in
            self._policy.get_diagnostics(flatten_input_structure({
                name: batch['observations'][name]
                for name in self._policy.observation_keys
            })).items()
        ]))

        if self._plotter:
            self._plotter.draw()

        return diagnostics
Example #18
    def _Q_inputs(self, observations, actions):
        Q_observations = {
            name: observations[name]
            for name in self._Qs[0].observation_keys
        }
        Q_inputs = flatten_input_structure(
            {**Q_observations, 'actions': actions})
        return Q_inputs
Example #19
    def sample(self):
        if self._current_observation is None:
            self._current_observation = self.env.reset()

        policy_input = flatten_input_structure({
            key: self._current_observation[key][None, ...]
            for key in self.policy.observation_keys
        })
        action = self.policy.actions_np(policy_input)[0]

        next_observation, reward, terminal, info = self.env.step(action)
        self._path_length += 1
        self._path_return += reward
        self._total_samples += 1

        processed_sample = self._process_sample(
            observation=self._current_observation,
            action=action,
            reward=reward,
            terminal=terminal,
            next_observation=next_observation,
            info=info,
        )

        for key, value in flatten(processed_sample).items():
            self._current_path[key].append(value)

        if terminal or self._path_length >= self._max_path_length:
            last_path = unflatten({
                field_name: np.array(values)
                for field_name, values in self._current_path.items()
            })

            self.pool.add_path({
                key: value
                for key, value in last_path.items()
                if key != 'infos'
            })

            self._last_n_paths.appendleft(last_path)

            self._max_path_return = max(self._max_path_return,
                                        self._path_return)
            self._last_path_return = self._path_return

            self.policy.reset()
            self.pool.terminate_episode()
            self._current_observation = None
            self._path_length = 0
            self._path_return = 0
            self._current_path = defaultdict(list)

            self._n_episodes += 1
        else:
            self._current_observation = next_observation

        return next_observation, reward, terminal, info
Example #20
    def _init_classifier_update(self):
        classifier_inputs = flatten_input_structure({
            name: self._placeholders['observations'][name]
            for name in self._classifier.observation_keys
        })
        logits = self._classifier(classifier_inputs)
        self._classifier_loss_t = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=logits, labels=self._placeholders['labels']))
        self._classifier_training_op = self._get_classifier_training_op()
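For reference, `sigmoid_cross_entropy_with_logits` computes the element-wise binary cross-entropy in the numerically stable form max(x, 0) - x*z + log(1 + exp(-|x|)); a NumPy sketch with made-up logits and labels:

import numpy as np

logits = np.array([[2.0], [-1.0]])
labels = np.array([[1.0], [0.0]])

# Stable binary cross-entropy from logits, as documented for TensorFlow.
loss = np.maximum(logits, 0) - logits * labels + np.log1p(np.exp(-np.abs(logits)))
print(loss.mean())  # mean loss over the batch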
Example #21
    def _policy_input(self):
        try:
            observation = flatten_input_structure({
                key: self._current_observation[key][None, ...]
                for key in self.policy.observation_keys
            })
        except Exception:
            from pprint import pprint; import ipdb; ipdb.set_trace(context=30)

        return observation
Example #22
    def _dynamics_model_inputs(self, observations, actions):
        dynamics_model_observations = {
            name: observations[name]
            for name in self._dynamics_model.observation_keys
        }
        dynamics_model_inputs = flatten_input_structure(
            {**dynamics_model_observations, 'actions': actions})
        return dynamics_model_inputs
Example #23
    def _get_Q_target(self):
        policy_inputs = flatten_input_structure({
            name: self._placeholders['next_observations'][name]
            for name in self._policy.all_keys
        })
        next_actions = self._policy.actions(policy_inputs)
        next_log_pis = self._policy.log_pis(policy_inputs, next_actions)

        next_Q_observations = {
            name: self._placeholders['next_observations'][name]
            for name in self._Qs[0].all_keys
        }
        next_Q_inputs = flatten_input_structure(
            {**next_Q_observations, 'actions': next_actions})
        next_Qs_values = tuple(Q(next_Q_inputs) for Q in self._Q_targets)

        min_next_Q = tf.reduce_min(next_Qs_values, axis=0)
        next_values = min_next_Q - self._alpha * next_log_pis

        terminals = tf.cast(self._placeholders['terminals'], next_values.dtype)

        # if self._rnd_int_rew_coeff:
        #     self._unscaled_int_reward = tf.clip_by_value(
        #         self._rnd_errors / self._placeholders['reward']['running_int_rew_std'],
        #         0, 1000
        #     )
        #     self._int_reward = self._rnd_int_rew_coeff * self._unscaled_int_reward
        # else:
        #     self._int_reward = 0

        self._normalized_ext_reward = (
            self._unscaled_ext_reward /
            self._placeholders['reward']['running_ext_rew_std'])

        self._ext_reward = self._normalized_ext_reward * self._ext_reward_coeff
        self._total_reward = self._ext_reward + self._int_reward

        Q_target = td_target(reward=self._reward_scale * self._total_reward,
                             discount=self._discount,
                             next_value=(1 - terminals) * next_values)
        return tf.stop_gradient(Q_target)
Example #24
    def _get_Q_target(self):
        next_Q_observations = {
            name: tf.reshape(
                tf.tile(
                    self._placeholders['next_observations'][name]
                    [:, tf.newaxis, :], (1, self._value_n_particles, 1)),
                (-1, *self._placeholders['next_observations'][name].shape[1:]))
            for name in self._Qs[0].observation_keys
        }

        action_shape = self._placeholders['actions'].shape[1:].as_list()
        target_actions = tf.random.uniform(
            (1, self._value_n_particles, *action_shape), -1, 1)
        target_actions = tf.tile(
            target_actions, (tf.shape(self._placeholders['actions'])[0], 1, 1))
        target_actions = tf.reshape(target_actions, (-1, *action_shape))

        next_Q_inputs = flatten_input_structure(
            {**next_Q_observations, 'actions': target_actions})
        next_Qs_values = tuple(Q(next_Q_inputs) for Q in self._Q_targets)

        min_next_Q = tf.reduce_min(next_Qs_values, axis=0)

        assert_shape(min_next_Q, (None, 1))

        min_Q_next_target = tf.reshape(min_next_Q,
                                       (-1, self._value_n_particles))

        assert_shape(min_Q_next_target, (None, self._value_n_particles))

        # Equation 10 in [1]:
        next_values = tf.reduce_logsumexp(min_Q_next_target,
                                          keepdims=True,
                                          axis=1)

        assert_shape(next_values, [None, 1])

        # Importance weights add just a constant to the value.
        next_values -= tf.math.log(tf.cast(self._value_n_particles,
                                           tf.float32))
        next_values += np.prod(action_shape) * np.log(2)

        assert_shape(next_values, [None, 1])

        terminals = tf.cast(self._placeholders['terminals'], next_values.dtype)

        # \hat Q in Equation 11 in [1]:
        Q_target = td_target(
            reward=self._reward_scale * self._placeholders['rewards'],
            discount=self._discount,
            next_value=(1 - terminals) * next_values)

        return tf.stop_gradient(Q_target)
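The `reduce_logsumexp` block above implements the importance-sampled soft value estimate V(s') ~= log((1/K) * sum_k exp Q(s', a_k)) with the a_k drawn uniformly from [-1, 1]^|A| (hence the |A|*log 2 volume correction); a NumPy sketch of the same arithmetic with made-up Q-values:

import numpy as np

value_n_particles = 16
action_dim = 2
rng = np.random.default_rng(0)
# Hypothetical min-over-Qs values for the sampled actions, shape (batch, K).
min_Q_next_target = rng.normal(size=(3, value_n_particles))

next_values = np.log(np.sum(np.exp(min_Q_next_target), axis=1, keepdims=True))
next_values -= np.log(value_n_particles)   # importance weight of the uniform proposal
next_values += action_dim * np.log(2)      # volume of the [-1, 1]^|A| action box
print(next_values.shape)  # (3, 1)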
Example #25
    def _init_critic_updates(self):
        """Create minimization operation for critics' Q-functions.

        Creates a `tf.optimizer.minimize` operation for updating
        critic Q-function with gradient descent, and appends it to
        `self._training_ops` attribute.

        See Equations (5, 6) in [1] for further information on the
        Q-function update rule.
        """
        Q_targets = self._get_Q_targets()
        assert len(Q_targets) == len(self._policies)
        for Q_target in Q_targets:
            assert Q_target.shape.as_list() == [None, 1]

        self._Q_optimizers_per_policy = []
        self._Q_values_per_policy = []
        self._Q_losses_per_policy = []

        for i, Qs in enumerate(self._Qs_per_policy):
            Q_observations = {
                name: self._placeholders['observations'][name]
                for name in Qs[0].observation_keys
            }
            Q_inputs = flatten_input_structure({
                **Q_observations, 'actions': self._placeholders['actions']})

            Q_values = tuple(Q(Q_inputs) for Q in Qs)
            self._Q_values_per_policy.append(Q_values)

            Q_losses = tuple(
                tf.compat.v1.losses.mean_squared_error(
                    labels=Q_targets[i], predictions=Q_value, weights=0.5)
                for Q_value in Q_values)
            self._Q_losses_per_policy.append(Q_losses)

            # self._bellman_errors.append(tf.reduce_min(tuple(
            #     tf.math.squared_difference(Q_target, Q_value)
            #     for Q_value in Q_values), axis=0))

            Q_optimizers = tuple(
                tf.compat.v1.train.AdamOptimizer(
                    learning_rate=self._Q_lr,
                    name='{}_{}_optimizer_{}'.format(i, Q._name, j)
                ) for j, Q in enumerate(Qs))
            self._Q_optimizers_per_policy.append(Q_optimizers)

            Q_training_ops = tuple(
                Q_optimizer.minimize(loss=Q_loss, var_list=Q.trainable_variables)
                for Q, Q_loss, Q_optimizer
                in zip(Qs, Q_losses, Q_optimizers))

            self._training_ops_per_policy[i].update(
                {f'Q_{i}': tf.group(Q_training_ops)})
Example #26
def create_dynamics_model(input_shapes,
                          dynamics_latent_dim,
                          *args,
                          preprocessors=None,
                          observation_keys=None,
                          goal_keys=None,
                          name='dynamics_model',
                          encoder_kwargs=None,
                          decoder_kwargs=None,
                          **kwargs):
    # Tolerate omitted kwargs rather than unpacking None below.
    encoder_kwargs = encoder_kwargs or {}
    decoder_kwargs = decoder_kwargs or {}

    inputs_flat = create_inputs(input_shapes)
    preprocessors_flat = (
        flatten_input_structure(preprocessors)
        if preprocessors is not None
        else tuple(None for _ in inputs_flat))

    assert len(inputs_flat) == len(preprocessors_flat), (
        inputs_flat, preprocessors_flat)

    preprocessed_inputs = [
        preprocessor(input_) if preprocessor is not None else input_
        for preprocessor, input_
        in zip(preprocessors_flat, inputs_flat)
    ]
    encoder = feedforward_model(
        *args,
        output_size=dynamics_latent_dim,
        name=f'{name}_encoder',
        **encoder_kwargs)

    output_size = sum([
        shape.as_list()[0]
        for shape in input_shapes['observations'].values()
    ])
    decoder = feedforward_model(
        *args,
        output_size=output_size,
        name=f'{name}_decoder',
        **decoder_kwargs)

    latent = encoder(preprocessed_inputs)
    dynamics_pred = decoder(latent)

    dynamics_model = PicklableModel(inputs_flat, dynamics_pred, name=name)

    dynamics_model.observation_keys = observation_keys or tuple()
    dynamics_model.goal_keys = goal_keys or tuple()
    dynamics_model.all_keys = dynamics_model.observation_keys + dynamics_model.goal_keys

    dynamics_model.encoder = PicklableModel(inputs_flat, latent, name=f'{name}_encoder_model')

    return dynamics_model
Example #27
    def test_get_diagnostics(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        observations_np = {}
        for key in observation1_np.keys():
            observations_np[key] = np.stack(
                (observation1_np[key],
                 observation2_np[key])).astype(np.float32)
        observations_np = flatten_input_structure(observations_np)

        diagnostics = self.policy.get_diagnostics(observations_np)
        self.assertTrue(isinstance(diagnostics, OrderedDict))
        self.assertFalse(diagnostics)
Example #28
    def _classifier_inputs(self, observations, actions):
        classifier_observations = {
            name: observations[name]
            for name in self._policy.observation_keys
        }
        dynamics_model_inputs = self._dynamics_model_inputs(
            observations, actions)
        dynamics_features = self._dynamics_model.encoder(dynamics_model_inputs)

        classifier_inputs = flatten_input_structure({
            **classifier_observations,
            'dynamics_features': dynamics_features,
            # 'actions': actions
        })
        return classifier_inputs
Example #29
    def test_actions_and_log_pis_numeric(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]

        observations_np = {}
        for key in observation1_np.keys():
            observations_np[key] = np.stack(
                (observation1_np[key],
                 observation2_np[key])).astype(np.float32)
        observations_np = flatten_input_structure(observations_np)

        actions_np = self.policy.actions_np(observations_np)
        log_pis_np = self.policy.log_pis_np(observations_np, actions_np)

        self.assertEqual(actions_np.shape, (2, *self.env.action_space.shape))
        self.assertEqual(log_pis_np.shape, (2, 1))
Example #30
    def _init_extrinsic_reward(self):
        classifier_inputs = flatten_input_structure({
            name: self._placeholders['observations'][name]
            for name in self._classifier.observation_keys
        })
        observation_logits = self._classifier(classifier_inputs)

        if self._reward_type == 'logits':
            self._reward_t = observation_logits
        elif self._reward_type == 'probabilities':
            self._reward_t = tf.nn.sigmoid(observation_logits)
        else:
            raise NotImplementedError(
                f"Unknown reward type: {self._reward_type}")

        self._unscaled_ext_reward = self._reward_t