Example #1
    def __init__(self,
                 env: TFEnvironment,
                 algorithm: OffPolicyAlgorithm,
                 unroll_length=8,
                 exp_replayer="uniform",
                 observers=[],
                 metrics=[]):
        """Create an OffPolicyDriver.

        Args:
            env (TFEnvironment): A TFEnvironment
            algorithm (OffPolicyAlgorithm): The algorithm for training
            unroll_length (int): number of time steps each environment proceeds
                before sending the steps to the learner queue
            exp_replayer (str): a string that indicates which ExperienceReplayer
                to use.
            observers (list[Callable]): An optional list of observers that are
                updated after every step in the environment. Each observer is a
                callable(time_step.Trajectory).
            metrics (list[TFStepMetric]): An optional list of metrics.
        """
        # training=False because training info is always obtained from
        # replayed exps instead of current time_step prediction. So _step() in
        # policy_driver.py has nothing to do with training for off-policy
        # algorithms
        super(SyncOffPolicyDriver, self).__init__(env=env,
                                                  algorithm=algorithm,
                                                  exp_replayer=exp_replayer,
                                                  observers=observers,
                                                  metrics=metrics,
                                                  unroll_length=unroll_length,
                                                  learn_queue_cap=1)
        algorithm.set_metrics(self.get_metrics())
        self._prepare_specs(algorithm)
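A minimal usage sketch for the constructor above; `env` and `algorithm` are placeholder names for a TFEnvironment and an OffPolicyAlgorithm assumed to be built elsewhere, so this is an illustration rather than a verified snippet from the repository.

# Hedged usage sketch: `env` (TFEnvironment) and `algorithm`
# (OffPolicyAlgorithm) are assumed to already exist.
driver = SyncOffPolicyDriver(
    env=env,
    algorithm=algorithm,
    unroll_length=8,           # steps each environment runs before learning
    exp_replayer="uniform")    # which ExperienceReplayer to use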
Example #2
    def __init__(self,
                 env: TFEnvironment,
                 algorithm: OffPolicyAlgorithm,
                 exp_replayer="uniform",
                 observers=[],
                 use_rollout_state=False,
                 metrics=[]):
        """Create an OffPolicyDriver.

        Args:
            env (TFEnvironment): A TFEnvironment
            algorithm (OffPolicyAlgorithm): The algorithm for training
            exp_replayer (str): a string that indicates which ExperienceReplayer
                to use.
            observers (list[Callable]): An optional list of observers that are
                updated after every step in the environment. Each observer is a
                callable(time_step.Trajectory).
            use_rollout_state (bool): Include the RNN state for the experiences
                used for off-policy training
            metrics (list[TFStepMetric]): An optional list of metrics.
        """
        # training=False because training info is always obtained from
        # replayed exps instead of current time_step prediction. So _step() in
        # policy_driver.py has nothing to do with training for off-policy
        # algorithms
        super(SyncOffPolicyDriver,
              self).__init__(env=env,
                             algorithm=algorithm,
                             exp_replayer=exp_replayer,
                             observers=observers,
                             use_rollout_state=use_rollout_state,
                             metrics=metrics)
        algorithm.set_metrics(self.get_metrics())
Example #3
    def __init__(self,
                 env: TFEnvironment,
                 algorithm: OffPolicyAlgorithm,
                 exp_replayer: str,
                 observers=[],
                 use_rollout_state=False,
                 metrics=[]):
        """Create an OffPolicyDriver.

        Args:
            env (TFEnvironment): A TFEnvironment
            algorithm (OffPolicyAlgorithm): The algorithm for training
            exp_replayer (str): a string that indicates which ExperienceReplayer
                to use. Either "one_time" or "uniform".
            observers (list[Callable]): An optional list of observers that are
                updated after every step in the environment. Each observer is a
                callable(time_step.Trajectory).
            use_rollout_state (bool): Include the RNN state for the experiences
                used for off-policy training
            metrics (list[TFStepMetric]): An optional list of metrics.
        """
        super(OffPolicyDriver, self).__init__(
            env=env,
            algorithm=algorithm,
            observers=observers,
            use_rollout_state=use_rollout_state,
            metrics=metrics,
            training=True,
            greedy_predict=False)  # always use OnPolicyDriver for play/eval!

        self._prepare_specs(algorithm)
        algorithm.set_exp_replayer(exp_replayer)
Example #4
    def __init__(self,
                 env: TFEnvironment,
                 algorithm: OffPolicyAlgorithm,
                 exp_replayer: str,
                 num_envs=1,
                 observers=[],
                 metrics=[]):
        """Create an OffPolicyDriver.

        Args:
            env (TFEnvironment): A TFEnvironment
            algorithm (OffPolicyAlgorithm): The algorithm for training
            exp_replayer (str): a string that indicates which ExperienceReplayer
                to use. Either "one_time" or "uniform".
            num_envs (int): the number of batched environments. The total number
                of single environments is `num_envs * env.batch_size`
            observers (list[Callable]): An optional list of observers that are
                updated after every step in the environment. Each observer is a
                callable(time_step.Trajectory).
            metrics (list[TFStepMetric]): An optional list of metrics.
        """
        super(OffPolicyDriver, self).__init__(
            env=env,
            algorithm=algorithm,
            observers=observers,
            metrics=metrics,
            mode=self.OFF_POLICY_TRAINING)

        algorithm.set_exp_replayer(exp_replayer, num_envs * env.batch_size)
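A hedged sketch of the replayer sizing arithmetic described in the docstring above; the concrete numbers below are assumptions chosen only for illustration.

# Hypothetical sizing example for algorithm.set_exp_replayer(...):
num_envs = 4           # number of batched environments
env_batch_size = 32    # env.batch_size
total_single_envs = num_envs * env_batch_size   # 4 * 32 = 128 single environments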
Example #5
    def __init__(self,
                 env: TFEnvironment,
                 algorithm: OffPolicyAlgorithm,
                 exp_replayer: str,
                 num_envs=1,
                 observers=[],
                 metrics=[],
                 unroll_length=8,
                 learn_queue_cap=1):
        """Create an OffPolicyDriver.

        Args:
            env (TFEnvironment): A TFEnvironment
            algorithm (OffPolicyAlgorithm): The algorithm for training
            exp_replayer (str): a string that indicates which ExperienceReplayer
                to use. One of "one_time", "uniform" or "cycle_one_time".
            num_envs (int): the number of batched environments. The total number
                of single environments is `num_envs * env.batch_size`
            observers (list[Callable]): An optional list of observers that are
                updated after every step in the environment. Each observer is a
                callable(time_step.Trajectory).
            metrics (list[TFStepMetric]): An optional list of metrics.
            unroll_length (int): the cycle_one_time replayer's max_length is
                unroll_length + 1, so that all time steps are used in training.
            learn_queue_cap (int): the number of actors contributing to one
                mini-batch of training; this is passed along to the experience
                replayer.
        """
        super(OffPolicyDriver, self).__init__(
            env=env,
            algorithm=algorithm,
            observers=observers,
            metrics=metrics,
            mode=self.OFF_POLICY_TRAINING)

        algorithm.set_exp_replayer(exp_replayer, num_envs * env.batch_size,
                                   num_envs, unroll_length, learn_queue_cap)
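A hedged illustration of the cycle_one_time length arithmetic noted in the docstring above; the unroll_length value is an assumption.

# The cycle_one_time replayer's max_length is unroll_length + 1,
# so every time step in an unroll can be used for training.
unroll_length = 8
replayer_max_length = unroll_length + 1   # 9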
Example #6
    def __init__(self,
                 envs,
                 algorithm: OffPolicyAlgorithm,
                 num_actor_queues=1,
                 unroll_length=8,
                 learn_queue_cap=1,
                 actor_queue_cap=1,
                 observers=[],
                 metrics=[],
                 exp_replayer="one_time"):
        """
        Args:
            envs (list[TFEnvironment]): A list of TFEnvironments
            algorithm (OffPolicyAlgorithm): The algorithm for training
            num_actor_queues (int): number of actor queues. Each queue is
                exclusively owned by just one actor thread.
            unroll_length (int): number of time steps each environment proceeds
                before sending the steps to the learner queue
            learn_queue_cap (int): the learner queue capacity determines how many
                environments contribute to the training data for each training
                iteration
            actor_queue_cap (int): the actor queue capacity determines how many
                environments contribute to the data for each prediction forward
                in an `ActorThread`. To prevent deadlock, it's required that
                `actor_queue_cap` * `num_actor_queues` <= `num_envs`.
            observers (list[Callable]): An optional list of observers that are
                updated after every step in the environment. Each observer is a
                callable(time_step.Trajectory).
            metrics (list[TFStepMetric]): An optional list of metrics.
            exp_replayer (str): a string that indicates which ExperienceReplayer
                to use.
        """
        super(AsyncOffPolicyDriver, self).__init__(
            env=envs[0],
            num_envs=len(envs),
            algorithm=algorithm,
            exp_replayer=exp_replayer,
            observers=observers,
            metrics=metrics)

        # create threads
        self._coord = tf.train.Coordinator()
        num_envs = len(envs)
        policy_step_spec = PolicyStep(
            action=algorithm.action_spec,
            state=algorithm.train_state_spec,
            info=algorithm.rollout_info_spec)
        self._tfq = TFQueues(
            num_envs,
            self._env.batch_size,
            learn_queue_cap,
            actor_queue_cap,
            time_step_spec=algorithm.time_step_spec,
            policy_step_spec=policy_step_spec,
            unroll_length=unroll_length,
            num_actor_queues=num_actor_queues)
        actor_threads = [
            ActorThread(
                name="actor{}".format(i),
                coord=self._coord,
                algorithm=self._algorithm,
                tf_queues=self._tfq,
                id=i) for i in range(num_actor_queues)
        ]
        env_threads = [
            EnvThread(
                name="env{}".format(i),
                coord=self._coord,
                env=envs[i],
                tf_queues=self._tfq,
                unroll_length=unroll_length,
                id=i,
                actor_id=i % num_actor_queues) for i in range(num_envs)
        ]
        self._log_thread = LogThread(
            name="logging",
            num_envs=num_envs,
            env_batch_size=self._env.batch_size,
            observers=observers,
            metrics=metrics,
            coord=self._coord,
            queue=self._tfq.log_queue)
        self._threads = actor_threads + env_threads + [self._log_thread]
        algorithm.set_metrics(self.get_metrics())
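A hedged configuration sketch for the deadlock constraint stated in the docstring above; all numbers are assumptions chosen only to satisfy the inequality.

# Hypothetical queue sizing check for AsyncOffPolicyDriver:
num_envs = 8            # len(envs)
num_actor_queues = 2
actor_queue_cap = 3
learn_queue_cap = 4
assert actor_queue_cap * num_actor_queues <= num_envs, \
    "violating this constraint can deadlock the actor/env threads"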
Example #7
    def greedy_predict(self, time_step: ActionTimeStep, state=None):
        """Delegate greedy prediction to OffPolicyAlgorithm.greedy_predict()."""
        return OffPolicyAlgorithm.greedy_predict(self, time_step, state)