def _get_Q_target(self):
    policy_inputs = flatten_input_structure({
        name: self._placeholders['next_observations'][name]
        for name in self._policy.observation_keys
    })
    next_actions = self._policy.actions(policy_inputs)
    next_log_pis = self._policy.log_pis(policy_inputs, next_actions)

    next_Q_observations = {
        name: self._placeholders['next_observations'][name]
        for name in self._Qs[0].observation_keys
    }
    next_Q_inputs = flatten_input_structure(
        {**next_Q_observations, 'actions': next_actions})
    next_Qs_values = tuple(Q(next_Q_inputs) for Q in self._Q_targets)

    min_next_Q = tf.reduce_min(next_Qs_values, axis=0)
    next_values = min_next_Q - self._alpha * next_log_pis

    terminals = tf.cast(self._placeholders['terminals'], next_values.dtype)

    Q_target = td_target(
        reward=self._reward_scale * self._placeholders['rewards'],
        discount=self._discount,
        next_value=(1 - terminals) * next_values)

    return tf.stop_gradient(Q_target)
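# `td_target` is imported rather than defined in this section. A minimal
# sketch of the one-step temporal-difference backup it is assumed to
# compute, consistent with how it is called throughout this section
# (note that `next_value` is already masked by (1 - terminals) at the
# call sites):
def td_target(reward, discount, next_value):
    # Standard one-step TD backup: r + gamma * V(s').
    return reward + discount * next_value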
def _init_classifier_update(self):
    classifier_inputs = flatten_input_structure({
        name: self._placeholders['observations'][name]
        for name in self._classifier.observation_keys
    })
    log_p = self._classifier(classifier_inputs)

    policy_inputs = flatten_input_structure({
        name: self._placeholders['observations'][name]
        for name in self._policy.observation_keys
    })
    sampled_actions = self._policy.actions(policy_inputs)
    log_pi = self._policy.log_pis(policy_inputs, sampled_actions)

    # pi / (pi + f), f / (f + pi)
    log_pi_log_p_concat = tf.concat([log_pi, log_p], axis=1)

    # self._classifier.summary()
    # gradient = tf.norm(
    #     tf.gradients(log_p, self._classifier.layers[1].outputs),
    #     ord=2, axis=-1)
    # gradient_penalty = self._gradient_penalty_weight * gradient

    self._classifier_loss_t = tf.reduce_mean(
        tf.compat.v1.losses.softmax_cross_entropy(
            self._placeholders['labels'],
            log_pi_log_p_concat,
        )
        # + gradient_penalty
    )
    self._classifier_training_op = self._get_classifier_training_op()
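# The `pi / (pi + f)` comment above relies on the identity
# softmax([log a, log b]) = [a / (a + b), b / (a + b)], so concatenating
# log_pi and log_p and applying softmax cross-entropy recovers exactly
# those discriminator probabilities. A minimal numpy check:
import numpy as np

log_pi, log_p = np.log(0.2), np.log(0.3)
logits = np.array([log_pi, log_p])
softmax = np.exp(logits) / np.exp(logits).sum()
# softmax == [0.2 / (0.2 + 0.3), 0.3 / (0.2 + 0.3)] == [0.4, 0.6]
assert np.allclose(softmax, [0.4, 0.6])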
def _get_Q_target(self):
    policy_inputs = flatten_input_structure({
        name: self._placeholders['next_observations'][name]
        for name in self._policy.observation_keys
    })
    next_actions = self._policy.actions(policy_inputs)
    next_log_pis = self._policy.log_pis(policy_inputs, next_actions)

    next_Q_observations = {
        name: self._placeholders['next_observations'][name]
        for name in self._Qs[0].observation_keys
    }
    next_Q_inputs = flatten_input_structure(
        {**next_Q_observations, 'actions': next_actions})
    next_Qs_values = tuple(Q(next_Q_inputs) for Q in self._Q_targets)

    min_next_Q = tf.reduce_min(next_Qs_values, axis=0)
    next_values = min_next_Q - self._alpha * next_log_pis

    # TODO: pass observations through both classifiers, then filter
    # by goal_index.
    classifier_0_inputs = flatten_input_structure({
        name: self._placeholders['observations'][name]
        for name in self._classifier_0.observation_keys
    })
    classifier_1_inputs = flatten_input_structure({
        name: self._placeholders['observations'][name]
        for name in self._classifier_1.observation_keys
    })
    observation_logits_0 = self._classifier_0(classifier_0_inputs)
    observation_logits_1 = self._classifier_1(classifier_1_inputs)

    # Merge the two classifier outputs based on the current goal:
    # use observation_logits_1 where goal_index is 1 and
    # observation_logits_0 where goal_index is 0.
    goal_index_mask = self._placeholders['observations']['goal_index']
    observation_logits = tf.where(
        tf.cast(goal_index_mask, dtype=tf.bool),
        x=observation_logits_1,
        y=observation_logits_0)

    self._reward_t = observation_logits

    terminals = tf.cast(self._placeholders['terminals'], next_values.dtype)

    Q_target = td_target(
        reward=self._reward_scale * self._reward_t,
        discount=self._discount,
        next_value=(1 - terminals) * next_values)

    return tf.stop_gradient(Q_target)
def _policy_input(self):
    observation = super(GoalSampler, self)._policy_input()
    goal = flatten_input_structure({
        key: self._current_observation[key][None, ...]
        for key in self.policy.goal_keys
    })
    return observation + goal
def test_env_step_with_actions(self):
    observation_np = self.env.reset()
    observations_np = flatten_input_structure(
        {key: value[None, :] for key, value in observation_np.items()})
    action = self.policy.actions_np(observations_np)[0, ...]
    self.env.step(action)
def test_serialize_deserialize(self):
    observation1_np = self.env.reset()
    observation2_np = self.env.step(self.env.action_space.sample())[0]

    observations_np = {}
    for key in observation1_np.keys():
        observations_np[key] = np.stack(
            (observation1_np[key], observation2_np[key])
        ).astype(np.float32)
    observations_np = flatten_input_structure(observations_np)

    weights = self.policy.get_weights()
    actions_np = self.policy.actions_np(observations_np)
    log_pis_np = self.policy.log_pis_np(observations_np, actions_np)

    serialized = pickle.dumps(self.policy)
    deserialized = pickle.loads(serialized)

    weights_2 = deserialized.get_weights()
    log_pis_np_2 = deserialized.log_pis_np(observations_np, actions_np)

    for weight, weight_2 in zip(weights, weights_2):
        np.testing.assert_array_equal(weight, weight_2)

    np.testing.assert_array_equal(log_pis_np, log_pis_np_2)
    np.testing.assert_equal(
        actions_np.shape,
        deserialized.actions_np(observations_np).shape)
def _init_external_reward(self):
    classifier_inputs = flatten_input_structure({
        name: self._placeholders['observations'][name]
        for name in self._classifiers[0].observation_keys
    })
    observation_logits_per_classifier = [
        classifier(classifier_inputs)
        for classifier in self._classifiers
    ]
    # DEBUG
    # self._observation_logits_per_classifier = (
    #     observation_logits_per_classifier)

    goal_indices = self._placeholders['observations']['goal_index']
    goal_index_masks = [
        tf.equal(goal_indices, goal)
        for goal in range(self._num_goals)
    ]
    # DEBUG
    # self._goal_index_masks = goal_index_masks

    # Select, per transition, the classification logits of the
    # respective goal's classifier.
    observation_logits = observation_logits_per_classifier[0]
    for goal in range(1, self._num_goals):
        observation_logits = tf.where(
            goal_index_masks[goal],
            x=observation_logits_per_classifier[goal],
            y=observation_logits)

    self._ext_reward = self._reward_t = observation_logits
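# A minimal numpy sketch of the goal-indexed merge above (hypothetical
# shapes; two goals, batch of four): each row keeps the logit produced by
# the classifier that matches its goal index.
import numpy as np

logits_per_classifier = [
    np.array([[0.1], [0.2], [0.3], [0.4]]),  # classifier for goal 0
    np.array([[1.1], [1.2], [1.3], [1.4]]),  # classifier for goal 1
]
goal_indices = np.array([[0], [1], [1], [0]])

merged = logits_per_classifier[0]
for goal in range(1, len(logits_per_classifier)):
    merged = np.where(goal_indices == goal,
                      logits_per_classifier[goal],
                      merged)
# merged == [[0.1], [1.2], [1.3], [0.4]]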
def _init_external_rewards(self):
    classifier_inputs = flatten_input_structure({
        name: self._placeholders['observations'][name]
        for name in self._classifiers[0].observation_keys
    })
    observation_logits_per_classifier = [
        classifier(classifier_inputs)
        for classifier in self._classifiers
    ]
    # DEBUG
    # self._observation_logits_per_classifier = (
    #     observation_logits_per_classifier)

    # The goal-index merge (cf. `_init_external_reward`) is left
    # commented out here; this variant keeps the per-classifier
    # rewards separate.
    # goal_indices = self._placeholders['observations']['goal_index']
    # goal_index_masks = [
    #     tf.equal(goal_indices, goal)
    #     for goal in range(self._num_goals)
    # ]
    # DEBUG
    # self._goal_index_masks = goal_index_masks
    # Select the classification logits of the respective goal's classifier.
    # observation_logits = observation_logits_per_classifier[0]
    # for goal in range(1, self._num_goals):
    #     observation_logits = tf.where(
    #         goal_index_masks[goal],
    #         x=observation_logits_per_classifier[goal],
    #         y=observation_logits)

    self._unscaled_ext_rewards = observation_logits_per_classifier
def test_get_diagnostics(self):
    observation1_np = self.env.reset()
    observation2_np = self.env.step(self.env.action_space.sample())[0]

    observations_np = {}
    for key in observation1_np.keys():
        observations_np[key] = np.stack(
            (observation1_np[key], observation2_np[key])
        ).astype(np.float32)
    observations_np = flatten_input_structure(observations_np)

    diagnostics = self.policy.get_diagnostics(observations_np)
    self.assertTrue(isinstance(diagnostics, OrderedDict))
    self.assertEqual(
        tuple(diagnostics.keys()),
        ('shifts-mean',
         'shifts-std',
         'log_scale_diags-mean',
         'log_scale_diags-std',
         '-log-pis-mean',
         '-log-pis-std',
         'raw-actions-mean',
         'raw-actions-std',
         'actions-mean',
         'actions-std',
         'actions-min',
         'actions-max'))

    for value in diagnostics.values():
        self.assertTrue(np.isscalar(value))
def test_latent_smoothing(self):
    observation_np = self.env.reset()
    smoothed_policy = FeedforwardGaussianPolicy(
        input_shapes=self.env.observation_shape,
        output_shape=self.env.action_space.shape,
        hidden_layer_sizes=self.hidden_layer_sizes,
        smoothing_coefficient=0.5,
        observation_keys=self.env.observation_keys)

    np.testing.assert_equal(smoothed_policy._smoothing_x, 0.0)
    self.assertEqual(smoothed_policy._smoothing_alpha, 0.5)
    self.assertEqual(
        smoothed_policy._smoothing_beta,
        np.sqrt(1.0 - np.power(smoothed_policy._smoothing_alpha, 2.0))
        / (1.0 - smoothed_policy._smoothing_alpha))

    smoothing_x_previous = smoothed_policy._smoothing_x
    for i in range(5):
        observations_np = flatten_input_structure({
            key: value[None, :]
            for key, value in observation_np.items()
        })
        action_np = smoothed_policy.actions_np(observations_np)[0]
        observation_np = self.env.step(action_np)[0]

        self.assertFalse(np.all(np.equal(
            smoothing_x_previous, smoothed_policy._smoothing_x)))
        smoothing_x_previous = smoothed_policy._smoothing_x

    smoothed_policy.reset()
    np.testing.assert_equal(smoothed_policy._smoothing_x, 0.0)
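# A minimal numpy sketch of the latent smoothing the test above exercises.
# The policy internals are an assumption here, but one consistent with the
# beta the test checks: raw action samples pass through an exponential
# moving average, and beta = sqrt(1 - alpha^2) / (1 - alpha) rescales the
# smoothed signal so its stationary variance matches that of the raw
# samples.
import numpy as np

alpha = 0.5
beta = np.sqrt(1.0 - alpha ** 2) / (1.0 - alpha)

smoothing_x = np.zeros(2)  # reset() zeroes this state
for raw_action in np.random.uniform(-1, 1, size=(5, 2)):
    smoothing_x = alpha * smoothing_x + (1.0 - alpha) * raw_action
    smoothed_action = beta * smoothing_x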
def _policy_input(self):
    observation = flatten_input_structure({
        key: self._current_observation[key][None, ...]
        for key in self.policy.observation_keys
    })
    return observation
def create_feedforward_Q_function(input_shapes,
                                  *args,
                                  preprocessors=None,
                                  observation_keys=None,
                                  goal_keys=None,
                                  name='feedforward_Q',
                                  **kwargs):
    inputs_flat = create_inputs(input_shapes)
    preprocessors_flat = (
        flatten_input_structure(preprocessors)
        if preprocessors is not None
        else tuple(None for _ in inputs_flat))

    assert len(inputs_flat) == len(preprocessors_flat), (
        inputs_flat, preprocessors_flat)

    preprocessed_inputs = [
        preprocessor(input_) if preprocessor is not None else input_
        for preprocessor, input_ in zip(preprocessors_flat, inputs_flat)
    ]

    Q_function = feedforward_model(*args, output_size=1, name=name, **kwargs)
    Q_function = PicklableModel(inputs_flat, Q_function(preprocessed_inputs))
    preprocessed_inputs_fn = PicklableModel(inputs_flat, preprocessed_inputs)

    Q_function.observation_keys = observation_keys or ()
    Q_function.goal_keys = goal_keys or ()
    Q_function.all_keys = Q_function.observation_keys + Q_function.goal_keys
    Q_function.actions_preprocessors = (
        preprocessors['actions'] if preprocessors is not None else None)
    Q_function.observations_preprocessors = (
        preprocessors['observations'] if preprocessors is not None else None)
    Q_function.preprocessed_inputs_fn = preprocessed_inputs_fn

    return Q_function
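# `create_inputs` and `flatten_input_structure` are imported helpers, not
# defined in this section. A minimal sketch of the flattening behavior the
# factories above rely on (an assumption, consistent with how the flattened
# preprocessors are zipped against `inputs_flat`): a nested dict is
# flattened depth-first into a tuple in a deterministic, sorted-key order.
def flatten_input_structure(inputs):
    # Mirrors tf.nest.flatten's dict handling: sorted keys, depth first.
    if isinstance(inputs, dict):
        return tuple(
            flat
            for key in sorted(inputs)
            for flat in flatten_input_structure(inputs[key]))
    return (inputs,)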
def _init_rnd_updates(self):
    (self._rnd_errors,
     self._rnd_losses,
     self._rnd_error_stds,
     self._rnd_optimizers) = [], [], [], []

    for i in range(self._num_goals):
        self._placeholders['reward'].update({
            f'running_int_rew_std_{i}': tf.compat.v1.placeholder(
                tf.float32, shape=(), name=f'running_int_rew_std_{i}')
        })

        policy_inputs = flatten_input_structure({
            name: self._placeholders['observations'][name]
            for name in self._policies[i].observation_keys
        })

        targets = tf.stop_gradient(self._rnd_targets[i](policy_inputs))
        predictions = self._rnd_predictors[i](policy_inputs)

        self._rnd_errors.append(tf.expand_dims(tf.reduce_mean(
            tf.math.squared_difference(targets, predictions), axis=-1), 1))
        self._rnd_losses.append(tf.reduce_mean(self._rnd_errors[i]))
        self._rnd_error_stds.append(
            tf.math.reduce_std(self._rnd_errors[i]))

        self._rnd_optimizers.append(tf.compat.v1.train.AdamOptimizer(
            learning_rate=self._rnd_lr, name=f'rnd_optimizer_{i}'))
        rnd_train_op = self._rnd_optimizers[i].minimize(
            loss=self._rnd_losses[i])

        self._training_ops_per_policy[i].update(
            {f'rnd_train_op_{i}': rnd_train_op})
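# For context on the update above: random network distillation (RND) trains
# a predictor network to match a fixed, randomly initialized target network,
# and uses the prediction error as an intrinsic (novelty) reward. A minimal
# numpy sketch with hypothetical fixed linear networks:
import numpy as np

rng = np.random.default_rng(0)
W_target = rng.normal(size=(4, 8))     # frozen random target network
W_predictor = rng.normal(size=(4, 8))  # trained to imitate the target

observation = rng.normal(size=(1, 4))
target = observation @ W_target
prediction = observation @ W_predictor
# Per-sample error, shaped (batch, 1) as in `self._rnd_errors` above.
rnd_error = np.mean((target - prediction) ** 2, axis=-1, keepdims=True)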
def create_feedforward_Q_function(input_shapes,
                                  *args,
                                  preprocessors=None,
                                  observation_keys=None,
                                  name='feedforward_Q',
                                  **kwargs):
    inputs_flat = create_inputs(input_shapes)
    preprocessors_flat = (
        flatten_input_structure(preprocessors)
        if preprocessors is not None
        else tuple(None for _ in inputs_flat))

    assert len(inputs_flat) == len(preprocessors_flat), (
        inputs_flat, preprocessors_flat)

    preprocessed_inputs = [
        tf.cast(preprocessor(input_), dtype=tf.float32)
        if preprocessor is not None
        else tf.cast(input_, dtype=tf.float32)
        for preprocessor, input_ in zip(preprocessors_flat, inputs_flat)
    ]

    Q_function = feedforward_model(*args, output_size=1, name=name, **kwargs)
    Q_function = PicklableModel(inputs_flat, Q_function(preprocessed_inputs))
    Q_function.observation_keys = observation_keys

    return Q_function
def create_embedding_fn(input_shapes,
                        embedding_dim,
                        *args,
                        preprocessors=None,
                        observation_keys=None,
                        goal_keys=None,
                        name='embedding_fn',
                        **kwargs):
    inputs_flat = create_inputs(input_shapes)
    preprocessors_flat = (
        flatten_input_structure(preprocessors)
        if preprocessors is not None
        else tuple(None for _ in inputs_flat))

    assert len(inputs_flat) == len(preprocessors_flat), (
        inputs_flat, preprocessors_flat)

    preprocessed_inputs = [
        preprocessor(input_) if preprocessor is not None else input_
        for preprocessor, input_ in zip(preprocessors_flat, inputs_flat)
    ]

    embedding_fn = feedforward_model(
        *args,
        output_size=embedding_dim,
        name=f'feedforward_{name}',
        **kwargs)
    embedding_fn = PicklableModel(
        inputs_flat, embedding_fn(preprocessed_inputs), name=name)

    embedding_fn.observation_keys = observation_keys or tuple()
    embedding_fn.goal_keys = goal_keys or tuple()
    embedding_fn.all_keys = (
        embedding_fn.observation_keys + embedding_fn.goal_keys)

    return embedding_fn
def test_actions_and_log_pis_symbolic(self):
    observation1_np = self.env.reset()
    observation2_np = self.env.step(self.env.action_space.sample())[0]

    observations_np = {}
    for key in observation1_np.keys():
        observations_np[key] = np.stack(
            (observation1_np[key], observation2_np[key])
        ).astype(np.float32)
    observations_np = flatten_input_structure(observations_np)

    observations_tf = [
        tf.constant(x, dtype=tf.float32) for x in observations_np
    ]

    actions = self.policy.actions(observations_tf)
    log_pis = self.policy.log_pis(observations_tf, actions)

    self.assertEqual(actions.shape, (2, *self.env.action_space.shape))
    self.assertEqual(log_pis.shape, (2, 1))

    self.evaluate(tf.compat.v1.global_variables_initializer())

    actions_np = self.evaluate(actions)
    log_pis_np = self.evaluate(log_pis)

    self.assertEqual(actions_np.shape, (2, *self.env.action_space.shape))
    self.assertEqual(log_pis_np.shape, (2, 1))
def get_diagnostics(self, iteration, batch, evaluation_paths, training_paths):
    """Return diagnostic information as an ordered dictionary.

    Also calls the `draw` method of the plotter, if a plotter is defined.
    """
    feed_dict = self._get_feed_dict(iteration, batch)
    # TODO: We need to unwrap self._diagnostics_ops from its
    # tensorflow `_DictWrapper`.
    diagnostics = self._session.run({**self._diagnostics_ops}, feed_dict)

    diagnostics.update(OrderedDict([
        (f'policy/{key}', value)
        for key, value in self._policy.get_diagnostics(
            flatten_input_structure({
                name: batch['observations'][name]
                for name in self._policy.observation_keys
            })).items()
    ]))

    if self._plotter:
        self._plotter.draw()

    return diagnostics
def _Q_inputs(self, observations, actions):
    Q_observations = {
        name: observations[name]
        for name in self._Qs[0].observation_keys
    }
    Q_inputs = flatten_input_structure(
        {**Q_observations, 'actions': actions})
    return Q_inputs
def sample(self):
    if self._current_observation is None:
        self._current_observation = self.env.reset()

    policy_input = flatten_input_structure({
        key: self._current_observation[key][None, ...]
        for key in self.policy.observation_keys
    })
    action = self.policy.actions_np(policy_input)[0]

    next_observation, reward, terminal, info = self.env.step(action)
    self._path_length += 1
    self._path_return += reward
    self._total_samples += 1

    processed_sample = self._process_sample(
        observation=self._current_observation,
        action=action,
        reward=reward,
        terminal=terminal,
        next_observation=next_observation,
        info=info,
    )

    for key, value in flatten(processed_sample).items():
        self._current_path[key].append(value)

    if terminal or self._path_length >= self._max_path_length:
        last_path = unflatten({
            field_name: np.array(values)
            for field_name, values in self._current_path.items()
        })

        self.pool.add_path({
            key: value
            for key, value in last_path.items()
            if key != 'infos'
        })

        self._last_n_paths.appendleft(last_path)

        self._max_path_return = max(self._max_path_return,
                                    self._path_return)
        self._last_path_return = self._path_return

        self.policy.reset()
        self.pool.terminate_episode()
        self._current_observation = None
        self._path_length = 0
        self._path_return = 0
        self._current_path = defaultdict(list)
        self._n_episodes += 1
    else:
        self._current_observation = next_observation

    return next_observation, reward, terminal, info
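# `flatten` and `unflatten` are imported helpers. A minimal sketch of the
# behavior `sample` relies on (an assumption about their structure): nested
# sample dicts are flattened to tuple-keyed dicts so per-step values can be
# appended to flat lists, then unflattened back into a nested path dict.
def flatten(structure, parent_key=()):
    flat = {}
    for key, value in structure.items():
        full_key = parent_key + (key,)
        if isinstance(value, dict):
            flat.update(flatten(value, full_key))
        else:
            flat[full_key] = value
    return flat

def unflatten(flat):
    structure = {}
    for keys, value in flat.items():
        node = structure
        for key in keys[:-1]:
            node = node.setdefault(key, {})
        node[keys[-1]] = value
    return structure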
def _init_classifier_update(self):
    classifier_inputs = flatten_input_structure({
        name: self._placeholders['observations'][name]
        for name in self._classifier.observation_keys
    })
    logits = self._classifier(classifier_inputs)

    self._classifier_loss_t = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits, labels=self._placeholders['labels']))
    self._classifier_training_op = self._get_classifier_training_op()
def _policy_input(self):
    observation = flatten_input_structure({
        key: self._current_observation[key][None, ...]
        for key in self.policy.observation_keys
    })
    return observation
def _dynamics_model_inputs(self, observations, actions):
    dynamics_model_observations = {
        name: observations[name]
        for name in self._dynamics_model.observation_keys
    }
    dynamics_model_inputs = flatten_input_structure(
        {**dynamics_model_observations, 'actions': actions})
    return dynamics_model_inputs
def _get_Q_target(self):
    policy_inputs = flatten_input_structure({
        name: self._placeholders['next_observations'][name]
        for name in self._policy.all_keys
    })
    next_actions = self._policy.actions(policy_inputs)
    next_log_pis = self._policy.log_pis(policy_inputs, next_actions)

    next_Q_observations = {
        name: self._placeholders['next_observations'][name]
        for name in self._Qs[0].all_keys
    }
    next_Q_inputs = flatten_input_structure(
        {**next_Q_observations, 'actions': next_actions})
    next_Qs_values = tuple(Q(next_Q_inputs) for Q in self._Q_targets)

    min_next_Q = tf.reduce_min(next_Qs_values, axis=0)
    next_values = min_next_Q - self._alpha * next_log_pis

    terminals = tf.cast(self._placeholders['terminals'], next_values.dtype)

    if self._rnd_int_rew_coeff:
        self._unscaled_int_reward = tf.clip_by_value(
            self._rnd_errors
            / self._placeholders['reward']['running_int_rew_std'],
            0, 1000)
        self._int_reward = (
            self._rnd_int_rew_coeff * self._unscaled_int_reward)
    else:
        self._int_reward = 0

    self._normalized_ext_reward = (
        self._unscaled_ext_reward
        / self._placeholders['reward']['running_ext_rew_std'])
    self._ext_reward = self._normalized_ext_reward * self._ext_reward_coeff
    self._total_reward = self._ext_reward + self._int_reward

    Q_target = td_target(
        reward=self._reward_scale * self._total_reward,
        discount=self._discount,
        next_value=(1 - terminals) * next_values)

    return tf.stop_gradient(Q_target)
def _get_Q_target(self):
    next_Q_observations = {
        name: tf.reshape(
            tf.tile(
                self._placeholders['next_observations'][name]
                [:, tf.newaxis, :],
                (1, self._value_n_particles, 1)),
            (-1, *self._placeholders['next_observations'][name].shape[1:]))
        for name in self._Qs[0].observation_keys
    }

    action_shape = self._placeholders['actions'].shape[1:].as_list()
    target_actions = tf.random.uniform(
        (1, self._value_n_particles, *action_shape), -1, 1)
    target_actions = tf.tile(
        target_actions, (tf.shape(self._placeholders['actions'])[0], 1, 1))
    target_actions = tf.reshape(target_actions, (-1, *action_shape))

    next_Q_inputs = flatten_input_structure(
        {**next_Q_observations, 'actions': target_actions})
    next_Qs_values = tuple(Q(next_Q_inputs) for Q in self._Q_targets)

    min_next_Q = tf.reduce_min(next_Qs_values, axis=0)
    assert_shape(min_next_Q, (None, 1))

    min_Q_next_target = tf.reshape(min_next_Q,
                                   (-1, self._value_n_particles))
    assert_shape(min_Q_next_target, (None, self._value_n_particles))

    # Equation 10 in [1]:
    next_values = tf.reduce_logsumexp(min_Q_next_target,
                                      keepdims=True,
                                      axis=1)
    assert_shape(next_values, [None, 1])

    # Importance weights add just a constant to the value.
    next_values -= tf.math.log(
        tf.cast(self._value_n_particles, tf.float32))
    next_values += np.prod(action_shape) * np.log(2)
    assert_shape(next_values, [None, 1])

    terminals = tf.cast(self._placeholders['terminals'], next_values.dtype)

    # \hat Q in Equation 11 in [1]:
    Q_target = td_target(
        reward=self._reward_scale * self._placeholders['rewards'],
        discount=self._discount,
        next_value=(1 - terminals) * next_values)

    return tf.stop_gradient(Q_target)
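# For context on the constants above: the soft value
# V(s) = log \int exp(Q(s, a)) da is estimated with uniform samples
# a_i ~ U([-1, 1]^d), giving logsumexp(Q_i) - log(N) + d * log(2), where
# d * log(2) is the log-volume of the action box. A minimal numpy sketch
# checking the estimator on a constant Q, where the integral is exact:
import numpy as np
from scipy.special import logsumexp

d, n_particles = 2, 100
Q = np.zeros(n_particles)  # constant Q(s, a) = 0 over [-1, 1]^d
estimate = logsumexp(Q) - np.log(n_particles) + d * np.log(2)
exact = np.log(2 ** d)  # log \int exp(0) da = log(volume)
assert np.isclose(estimate, exact)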
def _init_critic_updates(self):
    """Create minimization operations for the critics' Q-functions.

    Creates a `tf.optimizer.minimize` operation for updating each critic
    Q-function with gradient descent, and appends it to the
    `self._training_ops_per_policy` attribute.

    See Equations (5, 6) in [1] for more information on the Q-function
    update rule.
    """
    Q_targets = self._get_Q_targets()
    assert len(Q_targets) == len(self._policies)
    for Q_target in Q_targets:
        assert Q_target.shape.as_list() == [None, 1]

    self._Q_optimizers_per_policy = []
    self._Q_values_per_policy = []
    self._Q_losses_per_policy = []

    for i, Qs in enumerate(self._Qs_per_policy):
        Q_observations = {
            name: self._placeholders['observations'][name]
            for name in Qs[0].observation_keys
        }
        Q_inputs = flatten_input_structure({
            **Q_observations, 'actions': self._placeholders['actions']})
        Q_values = tuple(Q(Q_inputs) for Q in Qs)
        self._Q_values_per_policy.append(Q_values)

        Q_losses = tuple(
            tf.compat.v1.losses.mean_squared_error(
                labels=Q_targets[i], predictions=Q_value, weights=0.5)
            for Q_value in Q_values)
        self._Q_losses_per_policy.append(Q_losses)

        # self._bellman_errors.append(tf.reduce_min(tuple(
        #     tf.math.squared_difference(Q_target, Q_value)
        #     for Q_value in Q_values), axis=0))

        Q_optimizers = tuple(
            tf.compat.v1.train.AdamOptimizer(
                learning_rate=self._Q_lr,
                name='{}_{}_optimizer_{}'.format(i, Q._name, j)
            ) for j, Q in enumerate(Qs))
        self._Q_optimizers_per_policy.append(Q_optimizers)

        Q_training_ops = tuple(
            Q_optimizer.minimize(loss=Q_loss, var_list=Q.trainable_variables)
            for Q, Q_loss, Q_optimizer in zip(Qs, Q_losses, Q_optimizers))

        self._training_ops_per_policy[i].update(
            {f'Q_{i}': tf.group(Q_training_ops)})
def create_dynamics_model(input_shapes,
                          dynamics_latent_dim,
                          *args,
                          preprocessors=None,
                          observation_keys=None,
                          goal_keys=None,
                          name='dynamics_model',
                          encoder_kwargs=None,
                          decoder_kwargs=None,
                          **kwargs):
    inputs_flat = create_inputs(input_shapes)
    preprocessors_flat = (
        flatten_input_structure(preprocessors)
        if preprocessors is not None
        else tuple(None for _ in inputs_flat))

    assert len(inputs_flat) == len(preprocessors_flat), (
        inputs_flat, preprocessors_flat)

    preprocessed_inputs = [
        preprocessor(input_) if preprocessor is not None else input_
        for preprocessor, input_ in zip(preprocessors_flat, inputs_flat)
    ]

    encoder = feedforward_model(
        *args,
        output_size=dynamics_latent_dim,
        name=f'{name}_encoder',
        **encoder_kwargs)

    output_size = sum(
        shape.as_list()[0]
        for shape in input_shapes['observations'].values())
    decoder = feedforward_model(
        *args,
        output_size=output_size,
        name=f'{name}_decoder',
        **decoder_kwargs)

    latent = encoder(preprocessed_inputs)
    dynamics_pred = decoder(latent)

    dynamics_model = PicklableModel(inputs_flat, dynamics_pred, name=name)
    dynamics_model.observation_keys = observation_keys or tuple()
    dynamics_model.goal_keys = goal_keys or tuple()
    dynamics_model.all_keys = (
        dynamics_model.observation_keys + dynamics_model.goal_keys)
    dynamics_model.encoder = PicklableModel(
        inputs_flat, latent, name=f'{name}_encoder_model')

    return dynamics_model
def test_get_diagnostics(self):
    observation1_np = self.env.reset()
    observation2_np = self.env.step(self.env.action_space.sample())[0]

    observations_np = {}
    for key in observation1_np.keys():
        observations_np[key] = np.stack(
            (observation1_np[key], observation2_np[key])
        ).astype(np.float32)
    observations_np = flatten_input_structure(observations_np)

    diagnostics = self.policy.get_diagnostics(observations_np)
    self.assertTrue(isinstance(diagnostics, OrderedDict))
    self.assertFalse(diagnostics)
def _classifier_inputs(self, observations, actions):
    classifier_observations = {
        name: observations[name]
        for name in self._policy.observation_keys
    }
    dynamics_model_inputs = self._dynamics_model_inputs(
        observations, actions)
    dynamics_features = self._dynamics_model.encoder(dynamics_model_inputs)
    classifier_inputs = flatten_input_structure({
        **classifier_observations,
        'dynamics_features': dynamics_features,
        # 'actions': actions,
    })
    return classifier_inputs
def test_actions_and_log_pis_numeric(self):
    observation1_np = self.env.reset()
    observation2_np = self.env.step(self.env.action_space.sample())[0]

    observations_np = {}
    for key in observation1_np.keys():
        observations_np[key] = np.stack(
            (observation1_np[key], observation2_np[key])
        ).astype(np.float32)
    observations_np = flatten_input_structure(observations_np)

    actions_np = self.policy.actions_np(observations_np)
    log_pis_np = self.policy.log_pis_np(observations_np, actions_np)

    self.assertEqual(actions_np.shape, (2, *self.env.action_space.shape))
    self.assertEqual(log_pis_np.shape, (2, 1))
def _init_extrinsic_reward(self):
    classifier_inputs = flatten_input_structure({
        name: self._placeholders['observations'][name]
        for name in self._classifier.observation_keys
    })
    observation_logits = self._classifier(classifier_inputs)

    if self._reward_type == 'logits':
        self._reward_t = observation_logits
    elif self._reward_type == 'probabilities':
        self._reward_t = tf.nn.sigmoid(observation_logits)
    else:
        raise NotImplementedError(
            f"Unknown reward type: {self._reward_type}")

    self._unscaled_ext_reward = self._reward_t