def wrap(self, input_action):
    """Build the complete action dictionary for the current step.

    For every key of ``self.to_learn`` the component is either taken from
    the script (when the key is not learned, or when masking is enabled and
    the mask is zero at this step) or computed from the network output
    multiplied by ``self.scale``. When ``self.learn_residuals`` is set, the
    scripted value is added on top of the rescaled network output.

    Args:
        input_action (dict): nested tensor action produced by the neural
            net. Dictionary keys are those marked True in 'to_learn'.

    Returns:
        actions (dict): nested tensor action which includes all action
            components expected by the GKP class.
    """
    # Position inside the script, which repeats with periodicity 'period'.
    step = self._env._elapsed_steps % self.period
    outer_shape = nest_utils.get_outer_shape(input_action, self._action_spec)

    wrapped = {}
    for name in self.to_learn.keys():
        masked_out = self.use_mask and self.mask[name][step] == 0
        not_learned = not self.to_learn[name]
        if masked_out or not_learned:
            # Not learning: replicate the scripted value to the outer shape.
            wrapped[name] = common.replicate(
                self.script[name][step], outer_shape)
            continue

        # Learning: rescale the tensor coming from the network.
        component = input_action[name] * self.scale[name]
        if self.learn_residuals:
            # The network output is a correction to the scripted value.
            component += common.replicate(
                self.script[name][step], outer_shape)
        wrapped[name] = component
    return wrapped
def test_trajectory_optimiser_pathological_trajectories(
        action_space, horizon, batch_size):
    """
    Check that the initial observation survives in the replay buffer.

    The replay buffer is a FIFO buffer of fixed capacity. This test covers
    the pathological case in which every trajectory has length 2, and
    verifies that the capacity is still large enough for the initial
    observation to be present in the buffer.
    """
    # Build the environment model. The transition model is handed an
    # iterator over FIRST/LAST step-type tensors, repeated `horizon` times.
    first_and_last = [
        replicate(tf.constant(StepType.FIRST), [batch_size]),
        replicate(tf.constant(StepType.LAST), [batch_size]),
    ]
    observations = first_and_last * horizon

    transition_model = TrajectoryOptimiserTransitionModel(
        action_space, iter(observations))
    reward_model = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = TrajectoryOptimiserTerminationModel(
        OBSERVATION_SPACE_SPEC)
    initial_state_model = DeterministicInitialStateModel(StepType.FIRST)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward_model,
        termination_model=termination_model,
        initial_state_distribution_model=initial_state_model,
        batch_size=batch_size,
    )

    # A random policy plus a stub state updater that records what it sees.
    policy = RandomTFPolicy(
        time_step_spec(OBSERVATION_SPACE_SPEC), action_space)
    state_updater = StubPolicyStateUpdater()
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon,
        population_size=batch_size,
        max_iterations=1,
        policy_state_updater=state_updater,
    )

    initial_step_type = tf.expand_dims(tf.constant(StepType.FIRST), axis=0)
    time_step = restart(initial_step_type, batch_size=1)
    trajectory_optimiser.optimise(time_step, environment_model)

    # The first recorded step type must still be the initial one.
    first_recorded = state_updater.step_types[0][0][0]
    assert first_recorded == StepType.FIRST
def _distribution(self, time_step, policy_state):
    """Implementation of `distribution`. Returns a `Categorical` distribution.

    The logits of the returned distribution are
    `inverse_temperature * weights`, i.e. its (unnormalized) probabilities
    are `exp(inverse_temperature * weights)`. The weights are replicated to
    the outer shape of `time_step`.

    Args:
      time_step: A `TimeStep` tuple corresponding to `time_step_spec()`.
      policy_state: Unused in `CategoricalPolicy`. It is simply passed
        through.

    Returns:
      A `PolicyStep` named tuple containing:
        `action`: An (optionally nested) `tfp.distributions.Distribution`
          capturing the distribution of next actions.
        `state`: The `policy_state` argument, unchanged.
      The `info` field is not set here.
    """
    outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)
    tiled_weights = common.replicate(self._weights, outer_dims)
    logits = self._inverse_temperature * tiled_weights

    # Sample with the dtype of the first flattened action spec.
    action_dtype = tf.nest.flatten(self.action_spec)[0].dtype
    categorical = tfd.Categorical(logits=logits, dtype=action_dtype)
    return policy_step.PolicyStep(tfd.Independent(categorical), policy_state)
def _action(self, time_step, policy_state, seed):
    """Return the stored action value replicated to the outer shape.

    Args:
      time_step: Time step used only to determine the outer shape (via
        `self._time_step_spec`).
      policy_state: Passed through unchanged.
      seed: Not used.

    Returns:
      A `PolicyStep` holding the replicated (possibly nested) action,
      `policy_state`, and `self._policy_info`.
    """
    del seed  # Not used by this policy.
    outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)

    def _tile(value):
        # Replicate one leaf of the action structure.
        return common.replicate(value, outer_dims)

    tiled_action = tf.nest.map_structure(_tile, self._action_value)
    return policy_step.PolicyStep(
        tiled_action, policy_state, self._policy_info)
def test_generate_virtual_rollouts(observation_space, action_space,
                                   batch_size, horizon):
    """Check the step types of virtual rollouts gathered from the buffer.

    An environment model with a constant reward and a never-terminating
    termination model is rolled out with a random policy. The gathered
    trajectories must have step types `0`, then `horizon - 1` ones, then
    `2`, replicated `batch_size` times.
    """
    initial_observation = create_uniform_distribution_from_spec(
        observation_space).sample()
    transition_network = DummyEnsembleTransitionNetwork(observation_space)
    transition_model = KerasTransitionModel(
        [transition_network], observation_space, action_space)
    env_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=ConstantReward(observation_space, action_space, -1.0),
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(
            initial_observation),
        batch_size=batch_size,
    )
    random_policy = RandomTFPolicy(
        time_step_spec(observation_space), action_space)

    replay_buffer, driver, wrapped_env_model = (
        virtual_rollouts_buffer_and_driver(env_model, random_policy, horizon))
    driver.run(wrapped_env_model.reset())
    trajectory = replay_buffer.gather_all()

    # NOTE(review): 0 / 1 / 2 are presumably the FIRST / MID / LAST step
    # type codes; confirm against StepType.
    step_type_pattern = [0] + [1] * (horizon - 1) + [2]
    expected_step_types = replicate(
        tf.constant(step_type_pattern), (batch_size, ))
    np.testing.assert_array_equal(expected_step_types, trajectory.step_type)
def testReplicateScalarTensor(self):
    """Replicating a scalar tensor to outer shape [2, 1] gives a 2x1 array."""
    scalar = 1
    outer_shape = [2, 1]

    tf_scalar = tf.constant(scalar, shape=())
    actual = self.evaluate(common.replicate(tf_scalar, outer_shape))

    # Every entry of the result equals the original scalar.
    expected = np.full((2, 1), scalar)
    self.assertAllEqual(expected, actual)
def _distribution(self, time_step, policy_state):
    """Return `Deterministic` distributions located at the stored action.

    The stored action value is replicated to the outer shape of
    `time_step`, and each element of the resulting structure is wrapped in
    a `tfp.distributions.Deterministic` located at that value.

    Args:
      time_step: Time step used only to determine the outer shape.
      policy_state: Passed through unchanged.

    Returns:
      A `PolicyStep` holding the structure of distributions and
      `policy_state`.
    """
    outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)
    tiled_action = common.replicate(self._action_value, outer_dims)

    def _point_mass(value):
        """Return a `Deterministic` distribution with `loc=value`."""
        return tfp.distributions.Deterministic(loc=value)

    action_distribution = nest.map_structure(_point_mass, tiled_action)
    return policy_step.PolicyStep(action_distribution, policy_state)
def _get_policy_info_and_action(self, time_step):
    """Build the policy info and the replicated action for `time_step`.

    The log probability is a structure of float32 zeros of the outer shape,
    one per entry of `self._action_spec`; it is stored into
    `self._policy_info` through `policy_step.set_log_probability`. The
    action is `self._action_value` replicated to the outer shape.

    Args:
      time_step: Time step used only to determine the outer shape.

    Returns:
      A `(policy_info, action)` tuple.
    """
    outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)

    def _zero_log_prob(unused_spec):
        # One zero per outer element, regardless of the spec itself.
        return tf.zeros(outer_dims, tf.float32)

    def _tile(value):
        return common.replicate(value, outer_dims)

    zero_log_probability = tf.nest.map_structure(
        _zero_log_prob, self._action_spec)
    policy_info = policy_step.set_log_probability(
        self._policy_info, log_probability=zero_log_probability)
    action = tf.nest.map_structure(_tile, self._action_value)
    return policy_info, action
def _action(self, time_step, policy_state, seed):
    """Return the pending action and advance to the next one in the list.

    The value currently held in `self._next_action` is replicated to the
    outer shape of `time_step` and returned. The action index is then
    incremented, wrapping around at `self._actions.shape[0]`, and
    `self._next_action` is assigned the action at the new index.

    Args:
      time_step: Time step used only to determine the outer shape.
      policy_state: Passed through unchanged.
      seed: Not used.

    Returns:
      A `PolicyStep` with the replicated action, `policy_state` and an
      empty `info`.
    """
    outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)
    # Capture the pending action before the cursor moves on.
    current_action = common.replicate(self._next_action, outer_dims)

    # Step the cursor forward cyclically over the stored actions.
    num_actions = self._actions.shape[0]
    self._action_index = (self._action_index + 1) % num_actions
    self._next_action.assign(self._actions[self._action_index])

    return policy_step.PolicyStep(current_action, policy_state, info=())
def _action(self, time_step, policy_state, seed):
    """Produce the scripted action for this step, with feedback on 'alpha'.

    Each entry of `self.script` is read at the current position within the
    period and replicated to the outer shape of `time_step`. The 'alpha'
    component is additionally multiplied by the last entry along axis 1 of
    the 'msmt' observation (Markovian feedback), and is zeroed when
    `policy_state[0] == 0`.

    Args:
        time_step: Time step whose observation holds the 'msmt' entry.
        policy_state: Step counter; `policy_state[0]` selects the script
            position.
        seed: Not used.

    Returns:
        A `PolicyStep` with the action dictionary, `policy_state + 1` and
        `self._policy_info`.
    """
    step = policy_state[0] % self.period  # position within the policy period
    outer_shape = nest_utils.get_outer_shape(time_step, self._time_step_spec)

    scripted = {}
    for name in self.script:
        value = common.replicate(self.script[name][step], outer_shape)
        if name == 'alpha':
            # Markovian feedback: scale by the most recent measurement.
            last_msmt = time_step.observation['msmt'][:, -1, None]
            value *= last_msmt
            if policy_state[0] == 0:
                # No feedback is applied on the very first step.
                value *= 0
        scripted[name] = value

    return policy_step.PolicyStep(
        scripted, policy_state + 1, self._policy_info)
def testReplicateTensor(self, outer_shape_type):
    """Replicating a 2x3 tensor to outer shape [2, 1] gives shape 2x1x2x3.

    Args:
      outer_shape_type: 'tf_constant' passes the outer shape as a
        `tf.constant`; any other value passes it as a Python list.
    """
    value = np.array([[1., 2., 3.], [4., 5., 6.]])
    outer_shape = (
        tf.constant([2, 1]) if outer_shape_type == 'tf_constant' else [2, 1])

    # Two copies of `value`, with a singleton axis inserted after the first.
    expected = np.stack([value, value])[:, np.newaxis]
    actual = self.evaluate(common.replicate(tf.constant(value), outer_shape))
    self.assertAllEqual(expected, actual)

    # NOTE(review): `outer_shape` is either a tf.constant or a list above,
    # never an np.ndarray, so this check is currently never executed.
    if isinstance(outer_shape, np.ndarray):
        # The shape should be fully defined in this case.
        expected_shape = tf.TensorShape(outer_shape + list(value.shape))
        self.assertEqual(expected_shape, actual.shape)
def _action(self, time_step, policy_state, seed):
    """Produce the scripted action, with Markovian or Bayesian feedback.

    Each entry of `self.script` is read at the current position within the
    period and replicated to the outer shape of `time_step`. For the
    'alpha' component:

    * at positions `2*K - 1` and `2*K` the value is replaced by
      `self.Bayesian_feedback(i, m)`, with `m` the 'msmt' observation;
    * at every other position the scripted value is multiplied by the last
      entry along axis 1 of `m`, and zeroed when `policy_state[0] == 0`.

    Args:
        time_step: Time step whose observation holds the 'msmt' entry.
        policy_state: Step counter; `policy_state[0]` selects the script
            position.
        seed: Not used.

    Returns:
        A `PolicyStep` with the action dictionary, `policy_state + 1` and
        `self._policy_info`.
    """
    step = policy_state[0] % self.period  # position within the policy period
    outer_shape = nest_utils.get_outer_shape(time_step, self._time_step_spec)

    scripted = {}
    for name in self.script.keys():
        value = common.replicate(self.script[name][step], outer_shape)
        if name == 'alpha':
            msmt = time_step.observation['msmt']
            if step in [2*self.K-1, 2*self.K]:
                # After K sharpening rounds do the Bayesian feedback.
                value = self.Bayesian_feedback(step, msmt)
            else:
                # Feedback after trimming rounds is Markovian, and after
                # intermediate sharpening rounds is simply zero.
                value *= msmt[:, -1, :]
                if policy_state[0] == 0:
                    value *= 0
        scripted[name] = value

    return policy_step.PolicyStep(
        scripted, policy_state + 1, self._policy_info)
def testReplicateTensor(self, outer_shape_type):
    """Replicating a 2x3 tensor to outer shape [2, 1] gives shape 2x1x2x3.

    Args:
      outer_shape_type: 'placeholder' feeds the outer shape through an
        int32 placeholder, 'tf_constant' passes a `tf.constant`, and any
        other value passes a Python list.
    """
    value = np.array([[1., 2., 3.], [4., 5., 6.]])
    uses_placeholder = outer_shape_type == 'placeholder'
    if uses_placeholder:
        outer_shape = tf.placeholder(tf.int32, shape=[2])
    elif outer_shape_type == 'tf_constant':
        outer_shape = tf.constant([2, 1])
    else:
        outer_shape = [2, 1]

    # Two copies of `value`, with a singleton axis inserted after the first.
    expected = np.stack([value, value])[:, np.newaxis]
    tf_replicated_value = common.replicate(tf.constant(value), outer_shape)

    # NOTE(review): `outer_shape` is a placeholder, a tf.constant or a list
    # above, never an np.ndarray, so this check is currently never executed.
    if isinstance(outer_shape, np.ndarray):
        # The shape should be fully defined in this case.
        expected_shape = tf.TensorShape(outer_shape + list(value.shape))
        self.assertEqual(expected_shape, tf_replicated_value.shape)

    with self.test_session() as sess:
        # The placeholder variant receives its shape at run time.
        feed_dict = (
            {outer_shape: np.array([2, 1])} if uses_placeholder else {})
        replicated_value = sess.run(tf_replicated_value, feed_dict)
        self.assertAllEqual(expected, replicated_value)
def _action(self, time_step, policy_state, seed):
    """Return the stored action value replicated to the outer shape.

    Args:
      time_step: Time step used only to determine the outer shape (via
        `self._time_step_spec`).
      policy_state: Passed through unchanged.
      seed: Not used.

    Returns:
      A `PolicyStep` holding the replicated action and `policy_state`.
    """
    del seed  # Not used by this policy.
    outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)
    tiled_action = common.replicate(self._action_value, outer_dims)
    return policy_step.PolicyStep(tiled_action, policy_state)
def test_trajectory_optimiser_each_iteration_starts_with_the_initial_observation(
        action_space, horizon, batch_size, max_iterations):
    """Check the policy always sees the environment model's current observation.

    The optimiser is run with a policy wrapper that, on every `_action`
    call, asserts that the observation it receives equals the observation
    of the environment model's current time step. The optimiser is run for
    `max_iterations` iterations of `horizon` steps each.
    """

    class WrappedRandomTFPolicy(TFPolicy):
        """Random policy that checks its input against the environment model."""

        def __init__(
                self,
                ts_spec: ts.TimeStep,
                action_spec: types.NestedTensorSpec,
                env_model: EnvironmentModel,
        ):
            super().__init__(ts_spec, action_spec)
            # Build the inner policy from the spec given to this constructor
            # rather than from the enclosing test's `action_space` argument,
            # so both policies are guaranteed to share one action spec.
            self._internal_policy = RandomTFPolicy(ts_spec, action_spec)
            self._environment_model = env_model

        def _action(
                self,
                time_step: ts.TimeStep,
                policy_state: types.NestedTensor,
                seed: Optional[types.Seed],
        ) -> policy_step.PolicyStep:
            # The observation passed in must match the model's current one.
            np.testing.assert_array_equal(
                time_step.observation,
                self._environment_model.current_time_step().observation)
            return self._internal_policy._action(time_step, policy_state, seed)

        def _distribution(
                self, time_step: ts.TimeStep, policy_state: types.NestedTensorSpec
        ) -> policy_step.PolicyStep:
            raise NotImplementedError()

    # The transition model is handed an iterator over MID step-type tensors,
    # one per step of every iteration (horizon + 1 per iteration).
    observations = list(
        repeat(replicate(tf.constant(StepType.MID), [batch_size]),
               max_iterations * (horizon + 1)))

    transition_model = TrajectoryOptimiserTransitionModel(
        action_space, iter(observations))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = TrajectoryOptimiserTerminationModel(
        OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(
            StepType.FIRST),
        batch_size=batch_size,
    )

    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)
    policy = WrappedRandomTFPolicy(
        time_step_space, action_space, environment_model)
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon=horizon,
        population_size=batch_size,
        max_iterations=max_iterations,
    )

    initial_time_step = restart(
        tf.expand_dims(tf.constant(StepType.FIRST), axis=0), batch_size=1)
    trajectory_optimiser.optimise(initial_time_step, environment_model)