Example #1
    def execution_plan(
        workers: WorkerSet, config: TrainerConfigDict, **kwargs
    ) -> LocalIterator[dict]:
        """Execution plan of the DD-PPO algorithm. Defines the distributed dataflow.

        Args:
            workers (WorkerSet): The WorkerSet used for training the
                Trainer's policy/policies.
            config (TrainerConfigDict): The trainer's configuration dict.

        Returns:
            LocalIterator[dict]: A local iterator over training metrics.
        """
        assert (
            len(kwargs) == 0
        ), "DDPPO execution_plan does NOT take any additional parameters"

        rollouts = ParallelRollouts(workers, mode="raw")

        # Setup the distributed processes.
        ip = ray.get(workers.remote_workers()[0].get_node_ip.remote())
        port = ray.get(workers.remote_workers()[0].find_free_port.remote())
        address = "tcp://{ip}:{port}".format(ip=ip, port=port)
        logger.info("Creating torch process group with leader {}".format(address))

        # Get setup tasks in order to throw errors on failure.
        ray.get(
            [
                worker.setup_torch_data_parallel.remote(
                    url=address,
                    world_rank=i,
                    world_size=len(workers.remote_workers()),
                    backend=config["torch_distributed_backend"],
                )
                for i, worker in enumerate(workers.remote_workers())
            ]
        )
        logger.info("Torch process group init completed")

        # This function is applied remotely on each rollout worker.
        def train_torch_distributed_allreduce(batch):
            expected_batch_size = (
                config["rollout_fragment_length"] * config["num_envs_per_worker"]
            )
            this_worker = get_global_worker()
            assert batch.count == expected_batch_size, (
                "Batch size possibly out of sync between workers, expected:",
                expected_batch_size,
                "got:",
                batch.count,
            )
            logger.info(
                "Executing distributed minibatch SGD "
                "with epoch size {}, minibatch size {}".format(
                    batch.count, config["sgd_minibatch_size"]
                )
            )
            info = do_minibatch_sgd(
                batch,
                this_worker.policy_map,
                this_worker,
                config["num_sgd_iter"],
                config["sgd_minibatch_size"],
                ["advantages"],
            )
            return info, batch.count

        # Broadcast the local set of global vars.
        def update_worker_global_vars(item):
            global_vars = _get_global_vars()
            for w in workers.remote_workers():
                w.set_global_vars.remote(global_vars)
            return item

        # Have to manually record stats since we are using "raw" rollouts mode.
        class RecordStats:
            def _on_fetch_start(self):
                self.fetch_start_time = time.perf_counter()

            def __call__(self, items):
                assert len(items) == config["num_workers"]
                for item in items:
                    info, count = item
                    metrics = _get_shared_metrics()
                    metrics.counters[STEPS_TRAINED_THIS_ITER_COUNTER] = count
                    metrics.counters[STEPS_SAMPLED_COUNTER] += count
                    metrics.counters[STEPS_TRAINED_COUNTER] += count
                    metrics.info[LEARNER_INFO] = info
                # Since SGD happens remotely, the time delay between fetch and
                # completion is approximately the SGD step time.
                metrics.timers[LEARN_ON_BATCH_TIMER].push(
                    time.perf_counter() - self.fetch_start_time
                )

        train_op = (
            rollouts.for_each(train_torch_distributed_allreduce)  # allreduce
            .batch_across_shards()  # List[(grad_info, count)]
            .for_each(RecordStats())
        )

        train_op = train_op.for_each(update_worker_global_vars)

        # Sync down the weights. As with the sync up, this is not really
        # needed unless the user is reading the local weights.
        if config["keep_local_weights_in_sync"]:

            def download_weights(item):
                workers.local_worker().set_weights(
                    ray.get(workers.remote_workers()[0].get_weights.remote())
                )
                return item

            train_op = train_op.for_each(download_weights)

        # In debug mode, check the allreduce successfully synced the weights.
        if logger.isEnabledFor(logging.DEBUG):

            def check_sync(item):
                weights = ray.get(
                    [w.get_weights.remote() for w in workers.remote_workers()]
                )
                sums = []
                for w in weights:
                    acc = 0
                    for p in w.values():
                        for k, v in p.items():
                            acc += v.sum()
                    sums.append(float(acc))
                logger.debug("The worker weight sums are {}".format(sums))
                assert len(set(sums)) == 1, sums

            train_op = train_op.for_each(check_sync)

        return StandardMetricsReporting(train_op, workers, config)
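
The chained for_each calls above are the heart of the execution-plan dataflow: each stage lazily consumes the items the previous stage produced. A minimal, self-contained sketch of that chaining pattern, using a hypothetical ToyIterator rather than RLlib's LocalIterator and only plain Python:

class ToyIterator:
    """Toy stand-in for a lazily chained dataflow iterator (not RLlib's)."""

    def __init__(self, gen):
        self._gen = gen

    def for_each(self, fn):
        # Apply `fn` to every item flowing through the pipeline, lazily.
        return ToyIterator(fn(item) for item in self._gen)

    def __iter__(self):
        return iter(self._gen)


# Pretend these are (grad_info, count) tuples coming back from the workers.
rollouts = ToyIterator(({"loss": 0.1 * i}, 32) for i in range(3))
train_op = (
    rollouts
    .for_each(lambda item: {**item[0], "steps_trained": item[1]})   # record stats
    .for_each(lambda info: dict(info, global_vars_broadcast=True))  # broadcast vars
)
for result in train_op:
    print(result)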
Example #2
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Execution plan of the DQN algorithm. Defines the distributed dataflow.

    Args:
        workers (WorkerSet): The WorkerSet used for training the Trainer's
            policy/policies.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    if config.get("prioritized_replay"):
        prio_args = {
            "prioritized_replay_alpha": config["prioritized_replay_alpha"],
            "prioritized_replay_beta": config["prioritized_replay_beta"],
            "prioritized_replay_eps": config["prioritized_replay_eps"],
        }
    else:
        prio_args = {}

    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config.get("replay_sequence_length", 1),
        replay_burn_in=config.get("burn_in", 0),
        replay_zero_init_states=config.get("zero_init_states", True),
        **prio_args)

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # We execute the following steps concurrently:
    # (1) Generate rollouts and store them in our local replay buffer. Calling
    # next() on store_op drives this.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    def update_prio(item):
        samples, info_dict = item
        if config.get("prioritized_replay"):
            prio_dict = {}
            for policy_id, info in info_dict.items():
                # TODO(sven): This is currently structured differently for
                #  torch/tf. Clean up these results/info dicts across
                #  policies (note: fixing this in torch_policy.py will
                #  break e.g. DDPPO!).
                td_error = info.get("td_error",
                                    info[LEARNER_STATS_KEY].get("td_error"))
                prio_dict[policy_id] = (samples.policy_batches[policy_id].data.
                                        get("batch_indexes"), td_error)
            local_replay_buffer.update_priorities(prio_dict)
        return info_dict

    # (2) Read and train on experiences from the replay buffer. Every batch
    # returned from the LocalReplay() iterator is passed to TrainOneStep to
    # take a SGD step, and then we decide whether to update the target network.
    post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)

    if config["simple_optimizer"]:
        train_step_op = TrainOneStep(workers)
    else:
        train_step_op = TrainTFMultiGPU(
            workers=workers,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))

    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(lambda x: post_fn(x, workers, config)) \
        .for_each(train_step_op) \
        .for_each(update_prio) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2). Only return the output
    # of (2) since training metrics are not available until (2) runs.
    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1],
                            round_robin_weights=calculate_rr_weights(config))

    return StandardMetricsReporting(train_op, workers, config)
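
Concurrently(..., mode="round_robin", output_indexes=[1]) above alternates deterministically between the sampling op (1) and the replay/training op (2) while emitting only the latter's results. A rough, Ray-free stand-in for that behaviour with a hypothetical round_robin helper (RLlib's operator also supports "*" weights and other modes not modelled here):

import itertools

def round_robin(ops, weights, output_indexes):
    # Pull `weights[i]` items from each op in turn; yield only selected outputs.
    iters = [iter(op) for op in ops]
    while True:
        for idx, (it, weight) in enumerate(zip(iters, weights)):
            for _ in range(weight):
                try:
                    item = next(it)
                except StopIteration:
                    return
                if idx in output_indexes:
                    yield item

store_op = iter(["stored-batch"] * 6)   # (1) sampling -> replay buffer
replay_op = iter([{"loss": 0.5}] * 6)   # (2) replay -> SGD metrics
for metrics in itertools.islice(
        round_robin([store_op, replay_op], weights=[1, 1], output_indexes=[1]), 4):
    print(metrics)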
Example #3
    def execution_plan(workers: WorkerSet, config: dict,
                       **kwargs) -> LocalIterator[dict]:
        assert (
            len(kwargs) == 0
        ), "Apex execution_plan does NOT take any additional parameters"

        # Create a number of replay buffer actors.
        num_replay_buffer_shards = config["optimizer"][
            "num_replay_buffer_shards"]

        replay_actor_args = [
            num_replay_buffer_shards,
            config["learning_starts"],
            config["buffer_size"],
            config["train_batch_size"],
            config["prioritized_replay_alpha"],
            config["prioritized_replay_beta"],
            config["prioritized_replay_eps"],
            config["multiagent"]["replay_mode"],
            config.get("replay_sequence_length", 1),
        ]
        # Place all replay buffer shards on the same node as the learner
        # (driver process that runs this execution plan).
        if config["replay_buffer_shards_colocated_with_driver"]:
            replay_actors = create_colocated_actors(
                actor_specs=[
                    # (class, args, kwargs={}, count)
                    (ReplayActor, replay_actor_args, {},
                     num_replay_buffer_shards)
                ],
                node=platform.node(),  # localhost
            )[0]  # [0]=only one item in `actor_specs`.
        # Place replay buffer shards on any node(s).
        else:
            replay_actors = [
                ReplayActor(*replay_actor_args)
                for _ in range(num_replay_buffer_shards)
            ]

        # Start the learner thread.
        learner_thread = LearnerThread(workers.local_worker())
        learner_thread.start()

        # Update experience priorities post learning.
        def update_prio_and_stats(item: Tuple[ActorHandle, dict, int]) -> None:
            actor, prio_dict, count = item
            if config.get("prioritized_replay"):
                actor.update_priorities.remote(prio_dict)
            metrics = _get_shared_metrics()
            # Manually update the steps trained counter since the learner
            # thread is executing outside the pipeline.
            metrics.counters[STEPS_TRAINED_THIS_ITER_COUNTER] = count
            metrics.counters[STEPS_TRAINED_COUNTER] += count
            metrics.timers["learner_dequeue"] = learner_thread.queue_timer
            metrics.timers["learner_grad"] = learner_thread.grad_timer
            metrics.timers["learner_overall"] = learner_thread.overall_timer

        # We execute the following steps concurrently:
        # (1) Generate rollouts and store them in one of our replay buffer
        # actors. Update the weights of the worker that generated the batch.
        rollouts = ParallelRollouts(workers, mode="async", num_async=2)
        store_op = rollouts.for_each(StoreToReplayBuffer(actors=replay_actors))
        # Only need to update workers if there are remote workers.
        if workers.remote_workers():
            store_op = store_op.zip_with_source_actor().for_each(
                UpdateWorkerWeights(
                    learner_thread,
                    workers,
                    max_weight_sync_delay=(
                        config["optimizer"]["max_weight_sync_delay"]),
                ))

        # (2) Read experiences from one of the replay buffer actors and send
        # to the learner thread via its in-queue.
        post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)
        replay_op = (Replay(
            actors=replay_actors, num_async=4).for_each(lambda x: post_fn(
                x, workers, config)).zip_with_source_actor().for_each(
                    Enqueue(learner_thread.inqueue)))

        # (3) Get priorities back from learner thread and apply them to the
        # replay buffer actors.
        update_op = (Dequeue(learner_thread.outqueue,
                             check=learner_thread.is_alive).for_each(
                                 update_prio_and_stats).for_each(
                                     UpdateTargetNetwork(
                                         workers,
                                         config["target_network_update_freq"],
                                         by_steps_trained=True)))

        if config["training_intensity"]:
            # Execute (1), (2) with a fixed intensity ratio.
            rr_weights = calculate_rr_weights(config) + ["*"]
            merged_op = Concurrently(
                [store_op, replay_op, update_op],
                mode="round_robin",
                output_indexes=[2],
                round_robin_weights=rr_weights,
            )
        else:
            # Execute (1), (2), (3) asynchronously as fast as possible. Only
            # output items from (3) since metrics aren't available before
            # then.
            merged_op = Concurrently([store_op, replay_op, update_op],
                                     mode="async",
                                     output_indexes=[2])

        # Add in extra replay and learner metrics to the training result.
        def add_apex_metrics(result: dict) -> dict:
            replay_stats = ray.get(replay_actors[0].stats.remote(
                config["optimizer"].get("debug")))
            exploration_infos = workers.foreach_policy_to_train(
                lambda p, _: p.get_exploration_state())
            result["info"].update({
                "exploration_infos":
                exploration_infos,
                "learner_queue":
                learner_thread.learner_queue_size.stats(),
                LEARNER_INFO:
                copy.deepcopy(learner_thread.learner_info),
                "replay_shard_0":
                replay_stats,
            })
            return result

        # Only report metrics from the workers with the lowest 1/3 of
        # epsilons.
        selected_workers = workers.remote_workers(
        )[-len(workers.remote_workers()) // 3:]

        return StandardMetricsReporting(
            merged_op, workers, config,
            selected_workers=selected_workers).for_each(add_apex_metrics)
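
The Ape-X plan above decouples SGD from the sampling dataflow by pushing batches into the LearnerThread's in-queue and dequeuing (priorities, count) results on the other side. A stripped-down sketch of that producer/consumer pattern in plain Python (ToyLearnerThread is hypothetical, not RLlib's LearnerThread):

import queue
import threading

class ToyLearnerThread(threading.Thread):
    def __init__(self):
        super().__init__(daemon=True)
        self.inqueue = queue.Queue(maxsize=4)
        self.outqueue = queue.Queue()

    def run(self):
        while True:
            batch = self.inqueue.get()
            if batch is None:  # shutdown sentinel
                break
            # Pretend to run an SGD step and compute fresh per-sample priorities.
            prio_dict = {"default_policy": [abs(x) for x in batch]}
            self.outqueue.put((prio_dict, len(batch)))

learner = ToyLearnerThread()
learner.start()
learner.inqueue.put([0.3, -1.2, 0.7])      # replay_op side: Enqueue
prio_dict, count = learner.outqueue.get()  # update_op side: Dequeue
print(prio_dict, count)
learner.inqueue.put(None)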
Example #4
def execution_plan(workers, config):
    if config.get("prioritized_replay"):
        prio_args = {
            "prioritized_replay_alpha": config["prioritized_replay_alpha"],
            "prioritized_replay_beta": config["prioritized_replay_beta"],
            "prioritized_replay_eps": config["prioritized_replay_eps"],
        }
    else:
        prio_args = {}

    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config.get("replay_sequence_length", 1),
        replay_burn_in=config.get("burn_in", 0),
        replay_zero_init_states=config.get("zero_init_states", True),
        **prio_args)

    global replay_buffer
    replay_buffer = local_replay_buffer

    def update_prio(item):
        samples, info_dict = item
        if config.get("prioritized_replay"):
            prio_dict = {}
            for policy_id, info in info_dict.items():
                # TODO(sven): This is currently structured differently for
                #  torch/tf. Clean up these results/info dicts across
                #  policies (note: fixing this in torch_policy.py will
                #  break e.g. DDPPO!).
                td_error = info.get("td_error",
                                    info[LEARNER_STATS_KEY].get("td_error"))
                samples.policy_batches[policy_id].set_get_interceptor(None)
                prio_dict[policy_id] = (samples.policy_batches[policy_id]
                                        .get("batch_indexes"), td_error)
            local_replay_buffer.update_priorities(prio_dict)
        return info_dict

    # (2) Read and train on experiences from the replay buffer. Every batch
    # returned from the LocalReplay() iterator is passed to TrainOneStep to
    # take a SGD step, and then we decide whether to update the target network.
    post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)

    if config["simple_optimizer"]:
        train_step_op = TrainOneStep(workers)
    else:
        train_step_op = MultiGPUTrainOneStep(
            workers=workers,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))

    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(lambda x: post_fn(x, workers, config)) \
        .for_each(train_step_op) \
        .for_each(update_prio) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    return StandardMetricsReporting(
        replay_op, workers, config, by_steps_trained=True)
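
update_prio above maps each policy's TD errors back to the batch_indexes the buffer handed out at sampling time, so the buffer can re-prioritize exactly those samples. A toy illustration of that bookkeeping with made-up values, using the usual proportional form p_i = |td_error_i| + eps:

batch_indexes = [4, 9, 17]      # returned by the buffer at sample time
td_error = [0.50, -0.10, 1.30]  # produced by the learner for those samples
prioritized_replay_eps = 1e-6

new_priorities = {
    idx: abs(err) + prioritized_replay_eps  # proportional prioritization
    for idx, err in zip(batch_indexes, td_error)
}
print(new_priorities)  # roughly {4: 0.5, 9: 0.1, 17: 1.3}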
Example #5
def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    """Execution plan of the DQN algorithm. Defines the distributed dataflow.

    Args:
        workers (WorkerSet): The WorkerSet used for training the Trainer's
            policy/policies.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    assert "local_replay_buffer" in kwargs, (
        "GenericOffPolicyTrainer execution plan requires a "
        "local replay buffer.")

    # Assign to Trainer, so we can store the MultiAgentReplayBuffer's
    # data when we save checkpoints.
    local_replay_buffer = kwargs["local_replay_buffer"]

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # We execute the following steps concurrently:
    # (1) Generate rollouts and store them in our local replay buffer. Calling
    # next() on store_op drives this.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    def update_prio(item):
        samples, info_dict = item
        if config.get("prioritized_replay"):
            prio_dict = {}
            for policy_id, info in info_dict.items():
                # TODO(sven): This is currently structured differently for
                #  torch/tf. Clean up these results/info dicts across
                #  policies (note: fixing this in torch_policy.py will
                #  break e.g. DDPPO!).
                td_error = info.get("td_error",
                                    info[LEARNER_STATS_KEY].get("td_error"))
                samples.policy_batches[policy_id].set_get_interceptor(None)
                batch_indices = samples.policy_batches[policy_id].get(
                    "batch_indexes")
                # In case the buffer stores sequences, TD-error could already
                # be calculated per sequence chunk.
                if len(batch_indices) != len(td_error):
                    T = local_replay_buffer.replay_sequence_length
                    assert len(batch_indices) > len(
                        td_error) and len(batch_indices) % T == 0
                    batch_indices = batch_indices.reshape([-1, T])[:, 0]
                    assert len(batch_indices) == len(td_error)
                prio_dict[policy_id] = (batch_indices, td_error)
            local_replay_buffer.update_priorities(prio_dict)
        return info_dict

    # (2) Read and train on experiences from the replay buffer. Every batch
    # returned from the LocalReplay() iterator is passed to TrainOneStep to
    # take a SGD step, and then we decide whether to update the target network.
    post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)

    if config["simple_optimizer"]:
        train_step_op = TrainOneStep(workers)
    else:
        train_step_op = MultiGPUTrainOneStep(
            workers=workers,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))

    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(lambda x: post_fn(x, workers, config)) \
        .for_each(train_step_op) \
        .for_each(update_prio) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2). Only return the output
    # of (2) since training metrics are not available until (2) runs.
    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1],
                            round_robin_weights=calculate_rr_weights(config))

    return StandardMetricsReporting(train_op, workers, config)
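
The reshape in the sequence-chunk branch of update_prio above keeps only the first batch index of each stored sequence, because the buffer reports one TD error per chunk of length T while batch_indexes still lists every timestep. A small worked sketch of that step, assuming numpy is available:

import numpy as np

T = 4                                  # replay_sequence_length
batch_indices = np.arange(12)          # 12 timesteps = 3 chunks of length 4
td_error = np.array([0.2, 0.7, 0.1])   # one TD error per chunk

assert len(batch_indices) % T == 0 and len(batch_indices) // T == len(td_error)
chunk_start_indices = batch_indices.reshape([-1, T])[:, 0]
print(chunk_start_indices)  # [0 4 8] -> paired with td_error for the priority update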
Example #6
def execution_plan(workers, config):
    # Create a number of replay buffer actors.
    # TODO(ekl) support batch replay options
    num_replay_buffer_shards = config["optimizer"]["num_replay_buffer_shards"]
    replay_actors = create_colocated(ReplayActor, [
        num_replay_buffer_shards,
        config["learning_starts"],
        config["buffer_size"],
        config["train_batch_size"],
        config["prioritized_replay_alpha"],
        config["prioritized_replay_beta"],
        config["prioritized_replay_eps"],
    ], num_replay_buffer_shards)

    # Update experience priorities post learning.
    def update_prio_and_stats(item):
        actor, prio_dict, count = item
        actor.update_priorities.remote(prio_dict)
        metrics = LocalIterator.get_metrics()
        # Manually update the steps trained counter since the learner thread
        # is executing outside the pipeline.
        metrics.counters[STEPS_TRAINED_COUNTER] += count
        metrics.timers["learner_dequeue"] = learner_thread.queue_timer
        metrics.timers["learner_grad"] = learner_thread.grad_timer
        metrics.timers["learner_overall"] = learner_thread.overall_timer

    # Update worker weights as they finish generating experiences.
    class UpdateWorkerWeights:
        def __init__(self, learner_thread, workers, max_weight_sync_delay):
            self.learner_thread = learner_thread
            self.workers = workers
            self.steps_since_update = collections.defaultdict(int)
            self.max_weight_sync_delay = max_weight_sync_delay
            self.weights = None

        def __call__(self, item):
            actor, batch = item
            self.steps_since_update[actor] += batch.count
            if self.steps_since_update[actor] >= self.max_weight_sync_delay:
                # Note that it's important to pull new weights once
                # updated to avoid excessive correlation between actors.
                if self.weights is None or self.learner_thread.weights_updated:
                    self.learner_thread.weights_updated = False
                    self.weights = ray.put(
                        self.workers.local_worker().get_weights())
                actor.set_weights.remote(self.weights)
                self.steps_since_update[actor] = 0
                # Update metrics.
                metrics = LocalIterator.get_metrics()
                metrics.counters["num_weight_syncs"] += 1

    # Start the learner thread.
    learner_thread = LearnerThread(workers.local_worker())
    learner_thread.start()

    # We execute the following steps concurrently:
    # (1) Generate rollouts and store them in our replay buffer actors. Update
    # the weights of the worker that generated the batch.
    rollouts = ParallelRollouts(workers, mode="async", async_queue_depth=2)
    store_op = rollouts \
        .for_each(StoreToReplayActors(replay_actors)) \
        .zip_with_source_actor() \
        .for_each(UpdateWorkerWeights(
            learner_thread, workers,
            max_weight_sync_delay=config["optimizer"]["max_weight_sync_delay"])
        )

    # (2) Read experiences from the replay buffer actors and send to the
    # learner thread via its in-queue.
    replay_op = ParallelReplay(replay_actors, async_queue_depth=4) \
        .zip_with_source_actor() \
        .for_each(Enqueue(learner_thread.inqueue))

    # (3) Get priorities back from learner thread and apply them to the
    # replay buffer actors.
    update_op = Dequeue(
            learner_thread.outqueue, check=learner_thread.is_alive) \
        .for_each(update_prio_and_stats) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"],
            by_steps_trained=True))

    # Execute (1), (2), (3) asynchronously as fast as possible.
    merged_op = Concurrently([store_op, replay_op, update_op], mode="async")

    return StandardMetricsReporting(merged_op, workers, config)
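
UpdateWorkerWeights above throttles weight syncs: a worker only pulls fresh weights after it has contributed at least max_weight_sync_delay sampled steps since its last sync, and only once the learner has actually produced new weights. A minimal, Ray-free sketch of that counter logic (ToyWeightSync is hypothetical):

import collections

class ToyWeightSync:
    def __init__(self, max_weight_sync_delay):
        self.max_weight_sync_delay = max_weight_sync_delay
        self.steps_since_update = collections.defaultdict(int)
        self.num_weight_syncs = 0

    def __call__(self, worker_id, batch_count):
        self.steps_since_update[worker_id] += batch_count
        if self.steps_since_update[worker_id] >= self.max_weight_sync_delay:
            # Here the real code would push the learner's latest weights to the worker.
            self.num_weight_syncs += 1
            self.steps_since_update[worker_id] = 0

sync = ToyWeightSync(max_weight_sync_delay=400)
for _ in range(10):
    sync("worker_1", batch_count=100)
print(sync.num_weight_syncs)  # 2 syncs after 1000 sampled steps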