Example #1
 def __init__(self, name, coord, env, tf_queues, unroll_length, id,
              actor_id):
     """
     Args:
         name (str): name of the thread
         coord (tf.train.Coordinator): coordinates among the threads
         env (TFEnvironment): a TFEnvironment
         tf_queues (TFQueues): an object for storing all the tf.FIFOQueues
             for communicating between threads
         unroll_length (int): each env unrolls this many steps before sending
             the steps to the learning queue. If the env is batched, then
             the total number of steps would be `unroll_length` * `batch_size`.
         id (int): an integer identifying the env thread
         actor_id (int): indicates which actor thread the env thread should
             send time steps to.
     """
     super().__init__(name=name,
                      target=self._run,
                      args=(coord, unroll_length))
     self._env = env
     self._tfq = tf_queues
     self._id = id
     self._actor_q = self._tfq.actor_queues[actor_id]
     self._action_return_q = self._tfq.action_return_queues[id]
     self._unroll_queue = self._tfq.env_unroll_queues[id]
     self._initial_policy_state = common.get_initial_policy_state(
         self._env.batch_size,
         tf.nest.map_structure(
             lambda t: tf.TensorSpec(t.shape[1:], t.dtype),
             self._tfq._policy_step_spec.state))
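The examples on this page all build a zero-valued initial state with `common.get_initial_policy_state(batch_size, state_spec)`. The library's actual implementation is not shown here; the following is only a minimal sketch, assuming the helper maps every `tf.TensorSpec` in a (possibly nested) state spec to a zero tensor with a leading batch dimension:

    import tensorflow as tf

    def get_initial_policy_state(batch_size, state_spec):
        """Sketch: zero-valued initial policy state for `batch_size` envs.

        Assumes `state_spec` is a (possibly nested) structure of
        tf.TensorSpec whose shapes do not include the batch dimension.
        """
        return tf.nest.map_structure(
            lambda spec: tf.zeros([batch_size] + spec.shape.as_list(),
                                  spec.dtype),
            state_spec)

Under this assumption, the call in Example #1 would produce, for each state spec of shape `[d]`, a zero tensor of shape `[env.batch_size, d]`.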
Example #2
 def __init__(self,
              name,
              coord,
              env,
              tf_queues,
              unroll_length,
              id,
              actor_id,
              first_env_id=None):
     """
     Args:
         name (str): name of the thread
         coord (tf.train.Coordinator): coordinates among the threads
         env (TFEnvironment): a TFEnvironment
         tf_queues (TFQueues): an object for storing all the tf.FIFOQueues
             for communicating between threads
         unroll_length (int): each env unrolls this many steps before sending
             the steps to the learning queue. If the env is batched, then
             the total number of steps would be `unroll_length` * `batch_size`.
         id (int): an integer identifying the env thread
         first_env_id (int): the id of the first environment of the
             batched environment `env`. If there are multiple `EnvThread`s,
             `first_env_id` should be set in such a way that the IDs of the
             individual environments are all different. If None, it is assumed
             that all the `env`s have the same batch_size and `first_env_id`
             will be set to `id * env.batch_size`.
         actor_id (int): indicates which actor thread the env thread should
             send time steps to.
     """
     super().__init__(name=name,
                      target=self._run,
                      args=(coord, unroll_length))
     self._env = env
     self._tfq = tf_queues
     self._id = id
     if first_env_id is None:
         first_env_id = self._id * env.batch_size
     self._first_env_id = first_env_id
     self._actor_q = self._tfq.actor_queues[actor_id]
     self._action_return_q = self._tfq.action_return_queues[id]
     self._unroll_queue = self._tfq.env_unroll_queues[id]
     self._initial_policy_state = common.get_initial_policy_state(
         self._env.batch_size,
         tf.nest.map_structure(
             lambda t: tf.TensorSpec(t.shape[1:], t.dtype),
             self._tfq._batch_state_spec))
Example #3
    def _update(self, experience, weight):
        batch_size = tf.shape(experience.step_type)[1]
        counter = tf.zeros((), tf.int32)
        initial_train_state = common.get_initial_policy_state(
            batch_size, self.train_state_spec)
        if self._use_rollout_state:
            first_train_state = tf.nest.map_structure(
                lambda state: state[0, ...], experience.state)
        else:
            first_train_state = initial_train_state
        num_steps = tf.shape(experience.step_type)[0]

        def create_ta(s):
            # TensorArray cannot use Tensor (batch_size) as element_shape
            ta_batch_size = experience.step_type.shape[1]
            return tf.TensorArray(dtype=s.dtype,
                                  size=num_steps,
                                  element_shape=tf.TensorShape(
                                      [ta_batch_size]).concatenate(s.shape))

        experience_ta = tf.nest.map_structure(
            create_ta,
            nest_utils.to_distribution_param_spec(
                self.processed_experience_spec))
        experience_ta = tf.nest.map_structure(
            lambda elem, ta: ta.unstack(elem), experience, experience_ta)
        info_ta = tf.nest.map_structure(
            create_ta,
            nest_utils.to_distribution_param_spec(self.train_step_info_spec))

        scope = get_current_scope()

        def _train_loop_body(counter, policy_state, info_ta):
            exp = tf.nest.map_structure(lambda ta: ta.read(counter),
                                        experience_ta)
            exp = nest_utils.params_to_distributions(
                exp, self.processed_experience_spec)
            policy_state = common.reset_state_if_necessary(
                policy_state, initial_train_state,
                tf.equal(exp.step_type, StepType.FIRST))

            with tf.name_scope(scope):
                policy_step = self.train_step(exp, policy_state)

            info_ta = tf.nest.map_structure(
                lambda ta, x: ta.write(counter, x), info_ta,
                nest_utils.distributions_to_params(policy_step.info))

            counter += 1

            return [counter, policy_step.state, info_ta]

        with tf.GradientTape(persistent=True,
                             watch_accessed_variables=False) as tape:
            tape.watch(self.trainable_variables)
            [_, _, info_ta] = tf.while_loop(
                cond=lambda counter, *_: tf.less(counter, num_steps),
                body=_train_loop_body,
                loop_vars=[counter, first_train_state, info_ta],
                back_prop=True,
                name="train_loop")
            info = tf.nest.map_structure(lambda ta: ta.stack(), info_ta)
            info = nest_utils.params_to_distributions(
                info, self.train_step_info_spec)
            experience = nest_utils.params_to_distributions(
                experience, self.processed_experience_spec)
            training_info = TrainingInfo(action=experience.action,
                                         reward=experience.reward,
                                         discount=experience.discount,
                                         step_type=experience.step_type,
                                         rollout_info=experience.rollout_info,
                                         info=info,
                                         env_id=experience.env_id)

        loss_info, grads_and_vars = self.train_complete(
            tape=tape, training_info=training_info, weight=weight)

        del tape

        return training_info, loss_info, grads_and_vars
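Inside `_train_loop_body` above, `common.reset_state_if_necessary` re-initializes the recurrent state for those environments whose current `step_type` is `StepType.FIRST`. Its real implementation is not part of this example; a plausible sketch, assuming the reset condition is a boolean vector of shape `[batch_size]` and every state tensor has the batch dimension first, is:

    import tensorflow as tf

    def reset_state_if_necessary(policy_state, initial_state, reset_mask):
        """Sketch: replace the state of environments that just restarted.

        `reset_mask` is assumed to be a bool tensor of shape [batch_size];
        each state tensor is assumed to have shape [batch_size, ...].
        """

        def _reset(state, init):
            # Reshape the mask so it broadcasts over the trailing state dims.
            mask = tf.reshape(reset_mask,
                              [-1] + [1] * (state.shape.rank - 1))
            return tf.where(mask, init, state)

        return tf.nest.map_structure(_reset, policy_state, initial_state)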
Example #4
    def _update(self, experience, weight):
        batch_size = experience.step_type.shape[1]
        counter = tf.zeros((), tf.int32)
        initial_train_state = common.get_initial_policy_state(
            batch_size, self._algorithm.train_state_spec)
        num_steps = experience.step_type.shape[0]

        def create_ta(s):
            return tf.TensorArray(dtype=s.dtype,
                                  size=num_steps,
                                  element_shape=tf.TensorShape(
                                      [batch_size]).concatenate(s.shape))

        experience_ta = tf.nest.map_structure(create_ta,
                                              self._processed_experience_spec)
        experience_ta = tf.nest.map_structure(
            lambda elem, ta: ta.unstack(elem), experience, experience_ta)
        training_info_ta = tf.nest.map_structure(create_ta,
                                                 self._training_info_spec)

        def _train_loop_body(counter, policy_state, training_info_ta):
            exp = tf.nest.map_structure(lambda ta: ta.read(counter),
                                        experience_ta)
            collect_action_distribution_param = exp.action_distribution
            collect_action_distribution = nested_distributions_from_specs(
                self._action_distribution_spec,
                collect_action_distribution_param)
            exp = exp._replace(action_distribution=collect_action_distribution)

            policy_state = common.reset_state_if_necessary(
                policy_state, initial_train_state,
                tf.equal(exp.step_type, StepType.FIRST))

            policy_step = common.algorithm_step(self._algorithm,
                                                self._observation_transformer,
                                                exp,
                                                policy_state,
                                                training=True)

            action_dist_param = common.get_distribution_params(
                policy_step.action)

            training_info = make_training_info(
                action=exp.action,
                action_distribution=action_dist_param,
                reward=exp.reward,
                discount=exp.discount,
                step_type=exp.step_type,
                info=policy_step.info,
                collect_info=exp.info,
                collect_action_distribution=collect_action_distribution_param)

            training_info_ta = tf.nest.map_structure(
                lambda ta, x: ta.write(counter, x), training_info_ta,
                training_info)

            counter += 1

            return [counter, policy_step.state, training_info_ta]

        with tf.GradientTape(persistent=True,
                             watch_accessed_variables=False) as tape:
            tape.watch(self._trainable_variables)
            [_, _, training_info_ta] = tf.while_loop(
                cond=lambda counter, *_: tf.less(counter, num_steps),
                body=_train_loop_body,
                loop_vars=[counter, initial_train_state, training_info_ta],
                back_prop=True,
                name="train_loop")
            training_info = tf.nest.map_structure(lambda ta: ta.stack(),
                                                  training_info_ta)
            action_distribution = nested_distributions_from_specs(
                self._action_distribution_spec,
                training_info.action_distribution)
            collect_action_distribution = nested_distributions_from_specs(
                self._action_distribution_spec,
                training_info.collect_action_distribution)
            training_info = training_info._replace(
                action_distribution=action_distribution,
                collect_action_distribution=collect_action_distribution)

        loss_info, grads_and_vars = self._algorithm.train_complete(
            tape=tape, training_info=training_info, weight=weight)

        del tape

        return training_info, loss_info, grads_and_vars
Example #5
    def _prepare_specs(self, algorithm):
        """Prepare various tensor specs."""
        def extract_spec(nest):
            return tf.nest.map_structure(
                lambda t: tf.TensorSpec(t.shape[1:], t.dtype), nest)

        time_step = self.get_initial_time_step()
        self._time_step_spec = extract_spec(time_step)
        self._action_spec = self._env.action_spec()

        policy_step = algorithm.predict(time_step, self._initial_state)
        info_spec = extract_spec(policy_step.info)
        self._pred_policy_step_spec = PolicyStep(
            action=self._action_spec,
            state=algorithm.predict_state_spec,
            info=info_spec)

        def _to_distribution_spec(spec):
            if isinstance(spec, tf.TensorSpec):
                return DistributionSpec(tfp.distributions.Deterministic,
                                        input_params_spec={"loc": spec},
                                        sample_spec=spec)
            return spec

        self._action_distribution_spec = tf.nest.map_structure(
            _to_distribution_spec, algorithm.action_distribution_spec)
        self._action_dist_param_spec = tf.nest.map_structure(
            lambda spec: spec.input_params_spec,
            self._action_distribution_spec)

        self._experience_spec = Experience(
            step_type=self._time_step_spec.step_type,
            reward=self._time_step_spec.reward,
            discount=self._time_step_spec.discount,
            observation=self._time_step_spec.observation,
            prev_action=self._action_spec,
            action=self._action_spec,
            info=info_spec,
            action_distribution=self._action_dist_param_spec)

        action_dist_params = common.zero_tensor_from_nested_spec(
            self._experience_spec.action_distribution, self._env.batch_size)
        action_dist = nested_distributions_from_specs(
            self._action_distribution_spec, action_dist_params)
        exp = Experience(step_type=time_step.step_type,
                         reward=time_step.reward,
                         discount=time_step.discount,
                         observation=time_step.observation,
                         prev_action=time_step.prev_action,
                         action=time_step.prev_action,
                         info=policy_step.info,
                         action_distribution=action_dist)

        processed_exp = algorithm.preprocess_experience(exp)
        self._processed_experience_spec = self._experience_spec._replace(
            info=extract_spec(processed_exp.info))

        policy_step = common.algorithm_step(
            algorithm,
            ob_transformer=self._observation_transformer,
            time_step=exp,
            state=common.get_initial_policy_state(self._env.batch_size,
                                                  algorithm.train_state_spec),
            training=True)
        info_spec = extract_spec(policy_step.info)
        self._training_info_spec = make_training_info(
            action=self._action_spec,
            action_distribution=self._action_dist_param_spec,
            step_type=self._time_step_spec.step_type,
            reward=self._time_step_spec.reward,
            discount=self._time_step_spec.discount,
            info=info_spec,
            collect_info=self._processed_experience_spec.info,
            collect_action_distribution=self._action_dist_param_spec)
Example #6
 def get_initial_policy_state(self):
     """
     Return can be the train or prediction state spec, depending on self._training
     """
     return common.get_initial_policy_state(self._env.batch_size,
                                            self._policy_state_spec)
Example #7
    def prepare_off_policy_specs(self, time_step: ActionTimeStep):
        """Prepare various tensor specs for off_policy training.

        prepare_off_policy_specs is called by OffPolicyDriver._prepare_spec().

        """

        self._env_batch_size = time_step.step_type.shape[0]
        self._time_step_spec = common.extract_spec(time_step)
        initial_state = common.get_initial_policy_state(
            self._env_batch_size, self.train_state_spec)
        transformed_timestep = self.transform_timestep(time_step)
        policy_step = self.rollout(transformed_timestep, initial_state)
        info_spec = common.extract_spec(policy_step.info)

        self._action_distribution_spec = tf.nest.map_structure(
            common.to_distribution_spec, self.action_distribution_spec)
        self._action_dist_param_spec = tf.nest.map_structure(
            lambda spec: spec.input_params_spec,
            self._action_distribution_spec)

        self._experience_spec = Experience(
            step_type=self._time_step_spec.step_type,
            reward=self._time_step_spec.reward,
            discount=self._time_step_spec.discount,
            observation=self._time_step_spec.observation,
            prev_action=self._action_spec,
            action=self._action_spec,
            info=info_spec,
            action_distribution=self._action_dist_param_spec,
            state=self.train_state_spec if self._use_rollout_state else ())

        action_dist_params = common.zero_tensor_from_nested_spec(
            self._experience_spec.action_distribution, self._env_batch_size)
        action_dist = nested_distributions_from_specs(
            self._action_distribution_spec, action_dist_params)

        exp = Experience(step_type=time_step.step_type,
                         reward=time_step.reward,
                         discount=time_step.discount,
                         observation=time_step.observation,
                         prev_action=time_step.prev_action,
                         action=time_step.prev_action,
                         info=policy_step.info,
                         action_distribution=action_dist,
                         state=initial_state if self._use_rollout_state else
                         ())

        transformed_exp = self.transform_timestep(exp)
        processed_exp = self.preprocess_experience(transformed_exp)
        self._processed_experience_spec = self._experience_spec._replace(
            observation=common.extract_spec(processed_exp.observation),
            info=common.extract_spec(processed_exp.info))

        policy_step = common.algorithm_step(
            algorithm_step_func=self.train_step,
            time_step=processed_exp,
            state=initial_state)
        info_spec = common.extract_spec(policy_step.info)
        self._training_info_spec = TrainingInfo(
            action_distribution=self._action_dist_param_spec, info=info_spec)
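Example #5 defines a local `extract_spec` helper that drops the leading (batch) dimension of every tensor in a nest and returns the corresponding `tf.TensorSpec` structure; `common.extract_spec` used in Example #7 presumably does the same, although that implementation is not shown here. A self-contained sketch with a hypothetical usage:

    import tensorflow as tf

    def extract_spec(nest):
        """Sketch: per-step specs from batched tensors (batch dim removed)."""
        return tf.nest.map_structure(
            lambda t: tf.TensorSpec(t.shape[1:], t.dtype), nest)

    # Hypothetical usage: a batched time step with batch_size 4 yields
    # per-environment specs without the batch dimension.
    specs = extract_spec({
        "reward": tf.zeros([4], tf.float32),
        "observation": tf.zeros([4, 84, 84, 3], tf.uint8),
    })
    # specs["observation"] is tf.TensorSpec([84, 84, 3], tf.uint8)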