def AsyncGradients( workers: WorkerSet) -> LocalIterator[Tuple[ModelGradients, int]]: """Operator to compute gradients in parallel from rollout workers. Args: workers (WorkerSet): set of rollout workers to use. Returns: A local iterator over policy gradients computed on rollout workers. Examples: >>> from ray.rllib.execution.rollout_ops import AsyncGradients >>> workers = ... # doctest: +SKIP >>> grads_op = AsyncGradients(workers) # doctest: +SKIP >>> print(next(grads_op)) # doctest: +SKIP {"var_0": ..., ...}, 50 # grads, batch count Updates the STEPS_SAMPLED_COUNTER counter and LEARNER_INFO field in the local iterator context. """ # Ensure workers are initially in sync. workers.sync_weights() # This function will be applied remotely on the workers. def samples_to_grads(samples): return get_global_worker().compute_gradients(samples), samples.count # Record learner metrics and pass through (grads, count). class record_metrics: def _on_fetch_start(self): self.fetch_start_time = time.perf_counter() def __call__(self, item): (grads, info), count = item metrics = _get_shared_metrics() metrics.counters[STEPS_SAMPLED_COUNTER] += count metrics.info[LEARNER_INFO] = ({ DEFAULT_POLICY_ID: info } if LEARNER_STATS_KEY in info else info) metrics.timers[GRAD_WAIT_TIMER].push(time.perf_counter() - self.fetch_start_time) return grads, count rollouts = from_actors(workers.remote_workers()) grads = rollouts.for_each(samples_to_grads) return grads.gather_async().for_each(record_metrics())
def execution_plan(workers: WorkerSet, config: TrainerConfigDict, **kwargs) -> LocalIterator[dict]: assert len(kwargs) == 0, ( "MAML execution_plan does NOT take any additional parameters") # Sync workers with meta policy workers.sync_weights() # Samples and sets worker tasks use_meta_env = config["use_meta_env"] set_worker_tasks(workers, use_meta_env) # Metric Collector metric_collect = CollectMetrics( workers, min_history=config["metrics_num_episodes_for_smoothing"], timeout_seconds=config["metrics_episode_collection_timeout_s"]) # Iterator for Inner Adaptation Data gathering (from pre->post # adaptation) inner_steps = config["inner_adaptation_steps"] def inner_adaptation_steps(itr): buf = [] split = [] metrics = {} for samples in itr: # Processing Samples (Standardize Advantages) split_lst = [] for sample in samples: sample["advantages"] = standardized(sample["advantages"]) split_lst.append(sample.count) buf.extend(samples) split.append(split_lst) adapt_iter = len(split) - 1 metrics = post_process_metrics(adapt_iter, workers, metrics) if len(split) > inner_steps: out = SampleBatch.concat_samples(buf) out["split"] = np.array(split) buf = [] split = [] # Reporting Adaptation Rew Diff ep_rew_pre = metrics["episode_reward_mean"] ep_rew_post = metrics["episode_reward_mean_adapt_" + str(inner_steps)] metrics["adaptation_delta"] = ep_rew_post - ep_rew_pre yield out, metrics metrics = {} else: inner_adaptation(workers, samples) rollouts = from_actors(workers.remote_workers()) rollouts = rollouts.batch_across_shards() rollouts = rollouts.transform(inner_adaptation_steps) # Metaupdate Step train_op = rollouts.for_each( MetaUpdate(workers, config["maml_optimizer_steps"], metric_collect, use_meta_env)) return train_op
def ParallelRollouts(workers: WorkerSet, *, mode="bulk_sync", num_async=1) -> LocalIterator[SampleBatch]: """Operator to collect experiences in parallel from rollout workers. If there are no remote workers, experiences will be collected serially from the local worker instance instead. Arguments: workers (WorkerSet): set of rollout workers to use. mode (str): One of {'async', 'bulk_sync', 'raw'}. - In 'async' mode, batches are returned as soon as they are computed by rollout workers with no order guarantees. - In 'bulk_sync' mode, we collect one batch from each worker and concatenate them together into a large batch to return. - In 'raw' mode, the ParallelIterator object is returned directly and the caller is responsible for implementing gather and updating the timesteps counter. num_async (int): In async mode, the max number of async requests in flight per actor. Returns: A local iterator over experiences collected in parallel. Examples: >>> rollouts = ParallelRollouts(workers, mode="async") >>> batch = next(rollouts) >>> print(batch.count) 50 # config.rollout_fragment_length >>> rollouts = ParallelRollouts(workers, mode="bulk_sync") >>> batch = next(rollouts) >>> print(batch.count) 200 # config.rollout_fragment_length * config.num_workers Updates the STEPS_SAMPLED_COUNTER counter in the local iterator context. """ # Ensure workers are initially in sync. workers.sync_weights() def report_timesteps(batch): metrics = _get_shared_metrics() metrics.counters[STEPS_SAMPLED_COUNTER] += batch.count return batch if not workers.remote_workers(): # Handle the serial sampling case. def sampler(_): while True: yield workers.local_worker().sample() return (LocalIterator(sampler, SharedMetrics()).for_each(report_timesteps)) # Create a parallel iterator over generated experiences. rollouts = from_actors(workers.remote_workers()) if mode == "bulk_sync": return rollouts \ .batch_across_shards() \ .for_each(lambda batches: SampleBatch.concat_samples(batches)) \ .for_each(report_timesteps) elif mode == "async": return rollouts.gather_async( num_async=num_async).for_each(report_timesteps) elif mode == "raw": return rollouts else: raise ValueError("mode must be one of 'bulk_sync', 'async', 'raw', " "got '{}'".format(mode))
def execution_plan(workers: WorkerSet, config: AlgorithmConfigDict, **kwargs) -> LocalIterator[dict]: assert ( len(kwargs) == 0 ), "MBMPO execution_plan does NOT take any additional parameters" # Train TD Models on the driver. workers.local_worker().foreach_policy(fit_dynamics) # Sync driver's policy with workers. workers.sync_weights() # Sync TD Models and normalization stats with workers sync_ensemble(workers) sync_stats(workers) # Dropping metrics from the first iteration _, _ = collect_episodes(workers.local_worker(), workers.remote_workers(), [], timeout_seconds=9999) # Metrics Collector. metric_collect = CollectMetrics( workers, min_history=0, timeout_seconds=config["metrics_episode_collection_timeout_s"], ) num_inner_steps = config["inner_adaptation_steps"] def inner_adaptation_steps(itr): buf = [] split = [] metrics = {} for samples in itr: print("Collecting Samples, Inner Adaptation {}".format( len(split))) # Processing Samples (Standardize Advantages) samples, split_lst = post_process_samples(samples, config) buf.extend(samples) split.append(split_lst) adapt_iter = len(split) - 1 prefix = "DynaTrajInner_" + str(adapt_iter) metrics = post_process_metrics(prefix, workers, metrics) if len(split) > num_inner_steps: out = SampleBatch.concat_samples(buf) out["split"] = np.array(split) buf = [] split = [] yield out, metrics metrics = {} else: inner_adaptation(workers, samples) # Iterator for Inner Adaptation Data gathering (from pre->post # adaptation). rollouts = from_actors(workers.remote_workers()) rollouts = rollouts.batch_across_shards() rollouts = rollouts.transform(inner_adaptation_steps) # Meta update step with outer combine loop for multiple MAML # iterations. train_op = rollouts.combine( MetaUpdate( workers, config["num_maml_steps"], config["maml_optimizer_steps"], metric_collect, )) return train_op
def execution_plan(workers: WorkerSet, config: TrainerConfigDict) -> LocalIterator[dict]: """Execution plan of the PPO algorithm. Defines the distributed dataflow. Args: workers (WorkerSet): The WorkerSet for training the Polic(y/ies) of the Trainer. config (TrainerConfigDict): The trainer's configuration dict. Returns: LocalIterator[dict]: The Policy class to use with PPOTrainer. If None, use `default_policy` provided in build_trainer(). """ # Train TD Models on the driver. workers.local_worker().foreach_policy(fit_dynamics) # Sync driver's policy with workers. workers.sync_weights() # Sync TD Models and normalization stats with workers sync_ensemble(workers) sync_stats(workers) # Dropping metrics from the first iteration _, _ = collect_episodes(workers.local_worker(), workers.remote_workers(), [], timeout_seconds=9999) # Metrics Collector. metric_collect = CollectMetrics( workers, min_history=0, timeout_seconds=config["collect_metrics_timeout"]) num_inner_steps = config["inner_adaptation_steps"] def inner_adaptation_steps(itr): buf = [] split = [] metrics = {} for samples in itr: print("Collecting Samples, Inner Adaptation {}".format(len(split))) # Processing Samples (Standardize Advantages) samples, split_lst = post_process_samples(samples, config) buf.extend(samples) split.append(split_lst) adapt_iter = len(split) - 1 prefix = "DynaTrajInner_" + str(adapt_iter) metrics = post_process_metrics(prefix, workers, metrics) if len(split) > num_inner_steps: out = SampleBatch.concat_samples(buf) out["split"] = np.array(split) buf = [] split = [] yield out, metrics metrics = {} else: inner_adaptation(workers, samples) # Iterator for Inner Adaptation Data gathering (from pre->post adaptation). rollouts = from_actors(workers.remote_workers()) rollouts = rollouts.batch_across_shards() rollouts = rollouts.transform(inner_adaptation_steps) # Meta update step with outer combine loop for multiple MAML iterations. train_op = rollouts.combine( MetaUpdate(workers, config["num_maml_steps"], config["maml_optimizer_steps"], metric_collect)) return train_op
def ParallelRollouts(workers: WorkerSet, *, mode="bulk_sync", num_async=1) -> LocalIterator[SampleBatch]: """Operator to collect experiences in parallel from rollout workers. If there are no remote workers, experiences will be collected serially from the local worker instance instead. Args: workers (WorkerSet): set of rollout workers to use. mode (str): One of 'async', 'bulk_sync', 'raw'. In 'async' mode, batches are returned as soon as they are computed by rollout workers with no order guarantees. In 'bulk_sync' mode, we collect one batch from each worker and concatenate them together into a large batch to return. In 'raw' mode, the ParallelIterator object is returned directly and the caller is responsible for implementing gather and updating the timesteps counter. num_async (int): In async mode, the max number of async requests in flight per actor. Returns: A local iterator over experiences collected in parallel. Examples: >>> from ray.rllib.execution import ParallelRollouts >>> workers = ... # doctest: +SKIP >>> rollouts = ParallelRollouts(workers, mode="async") # doctest: +SKIP >>> batch = next(rollouts) # doctest: +SKIP >>> print(batch.count) # doctest: +SKIP 50 # config.rollout_fragment_length >>> rollouts = ParallelRollouts(workers, mode="bulk_sync") # doctest: +SKIP >>> batch = next(rollouts) # doctest: +SKIP >>> print(batch.count) # doctest: +SKIP 200 # config.rollout_fragment_length * config.num_workers Updates the STEPS_SAMPLED_COUNTER counter in the local iterator context. """ # Ensure workers are initially in sync. workers.sync_weights() def report_timesteps(batch): metrics = _get_shared_metrics() metrics.counters[STEPS_SAMPLED_COUNTER] += batch.count if isinstance(batch, MultiAgentBatch): metrics.counters[AGENT_STEPS_SAMPLED_COUNTER] += batch.agent_steps( ) else: metrics.counters[AGENT_STEPS_SAMPLED_COUNTER] += batch.count return batch if not workers.remote_workers(): # Handle the `num_workers=0` case, in which the local worker # has to do sampling as well. return LocalIterator( lambda timeout: workers.local_worker().item_generator, SharedMetrics()).for_each(report_timesteps) # Create a parallel iterator over generated experiences. rollouts = from_actors(workers.remote_workers()) if mode == "bulk_sync": return (rollouts.batch_across_shards().for_each( lambda batches: SampleBatch.concat_samples(batches)).for_each( report_timesteps)) elif mode == "async": return rollouts.gather_async( num_async=num_async).for_each(report_timesteps) elif mode == "raw": return rollouts else: raise ValueError( "mode must be one of 'bulk_sync', 'async', 'raw', got '{}'".format( mode))