Example #1
File: appo.py  Project: krfricke/ray
    def __init__(self, workers, config):
        self.workers = workers
        self.config = config
        self.update_kl = UpdateKL(workers)
        self.target_update_freq = (
            config["num_sgd_iter"] * config["minibatch_buffer_size"]
        )
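
This `__init__` is the constructor of APPO's post-training update callable: `UpdateKL(workers)` wraps the per-policy KL-coefficient update, and `target_update_freq` (SGD iterations times the minibatch buffer size) controls how often the target network is synced. Below is a minimal sketch of a `__call__` that could pair with this constructor; the local step counter and the `use_kl_loss` check are assumptions for illustration, not the actual appo.py body, and the worker method is named `foreach_trainable_policy` in older Ray versions (later renamed `foreach_policy_to_train`).

    def __call__(self, fetches):
        # Assumption: track calls locally instead of RLlib's shared metrics.
        self._num_calls = getattr(self, "_num_calls", 0) + 1
        if self._num_calls % self.target_update_freq == 0:
            # Sync the target network on every trainable policy.
            self.workers.local_worker().foreach_trainable_policy(
                lambda p, _: p.update_target())
        # Feed the per-policy learner stats into the KL-coefficient update.
        if self.config.get("use_kl_loss", False):
            self.update_kl(fetches)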
Example #2
    def __init__(self, config, *args, **kwargs):
        super().__init__(config, *args, **kwargs)

        self.update_kl = UpdateKL(self.workers)

        # After init: Initialize target net.
        self.workers.local_worker().foreach_policy_to_train(
            lambda p, _: p.update_target())
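
Here the same wiring happens inside a Trainer subclass: the constructor builds the `UpdateKL` callable and performs a one-time target-network initialization on all trainable policies via `foreach_policy_to_train`. A hypothetical sketch of how `self.update_kl` would then be used after each training iteration follows; the `training_step` override and the shape of `train_results` are assumptions, since the original snippet only shows the constructor.

    def training_step(self):
        # Hypothetical: run the regular training step, then feed the
        # per-policy learner stats into the KL updater.
        train_results = super().training_step()
        self.update_kl(train_results)
        return train_results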
Example #3
def mu_zero_execution_plan(workers: WorkerSet,
                           config: TrainerConfigDict) -> LocalIterator[dict]:
    """Execution plan of the PPO algorithm. Defines the distributed dataflow.

    Args:
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: The Policy class to use with PPOTrainer.
            If None, use `default_policy` provided in build_trainer().
    """
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Collect batches for the trainable policies.
    rollouts = rollouts.for_each(
        SelectExperiences(workers.trainable_policies()))
    # Concatenate the SampleBatches into one.
    rollouts = rollouts.combine(
        ConcatBatches(min_batch_size=config["train_batch_size"]))
    # Standardize advantages.
    rollouts = rollouts.for_each(StandardizeFields(["advantages"]))
    # Standardize value targets.
    rollouts = rollouts.for_each(StandardizeFields(["value_targets"]))

    # Perform one training step on the combined + standardized batch.
    if config["simple_optimizer"]:
        train_op = rollouts.for_each(
            TrainOneStep(workers,
                         num_sgd_iter=config["num_sgd_iter"],
                         sgd_minibatch_size=config["sgd_minibatch_size"]))
    else:
        train_op = rollouts.for_each(
            TrainTFMultiGPU(
                workers,
                sgd_minibatch_size=config["sgd_minibatch_size"],
                num_sgd_iter=config["num_sgd_iter"],
                num_gpus=config["num_gpus"],
                rollout_fragment_length=config["rollout_fragment_length"],
                num_envs_per_worker=config["num_envs_per_worker"],
                train_batch_size=config["train_batch_size"],
                shuffle_sequences=config["shuffle_sequences"],
                _fake_gpus=config["_fake_gpus"],
                framework=config.get("framework")))

    # Update KL after each round of training.
    train_op = train_op.for_each(lambda t: t[1]).for_each(UpdateKL(workers))

    # Warn about bad reward scales and return training metrics.
    return StandardMetricsReporting(train_op, workers, config) \
        .for_each(lambda result: warn_about_bad_reward_scales(config, result))
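
The execution plan above is written against the Ray 1.x execution-operator API. A hedged sketch of the imports it assumes and of how such a plan is typically registered with a trainer follows; the module paths match Ray 1.x and may differ in other versions, and the `MuZeroTrainer` registration reuses PPO's policy and config purely as stand-ins, not as the project's actual setup.

# Imports assumed by the execution plan above (Ray 1.x-era module paths).
from ray.rllib.agents.ppo.ppo import (
    DEFAULT_CONFIG, UpdateKL, warn_about_bad_reward_scales)
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.evaluation.worker_set import WorkerSet
from ray.rllib.execution.metric_ops import StandardMetricsReporting
from ray.rllib.execution.rollout_ops import (
    ConcatBatches, ParallelRollouts, SelectExperiences, StandardizeFields)
from ray.rllib.execution.train_ops import TrainOneStep, TrainTFMultiGPU
from ray.rllib.utils.typing import TrainerConfigDict
from ray.util.iter import LocalIterator

# Hypothetical registration; PPOTFPolicy and DEFAULT_CONFIG are placeholders
# standing in for a real MuZero policy class and config.
MuZeroTrainer = build_trainer(
    name="MuZero",
    default_config=DEFAULT_CONFIG,
    default_policy=PPOTFPolicy,
    execution_plan=mu_zero_execution_plan)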