def default_execution_plan(workers: WorkerSet, config: TrainerConfigDict):
    # Collects experiences in parallel from multiple RolloutWorker actors.
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Combine experience batches until we hit `train_batch_size` in size.
    # Then, train the policy on those experiences and update the workers.
    train_op = rollouts.combine(
        ConcatBatches(
            min_batch_size=config["train_batch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"],
        ))

    if config.get("simple_optimizer") is True:
        train_op = train_op.for_each(TrainOneStep(workers))
    else:
        train_op = train_op.for_each(
            MultiGPUTrainOneStep(
                workers=workers,
                sgd_minibatch_size=config.get("sgd_minibatch_size",
                                              config["train_batch_size"]),
                num_sgd_iter=config.get("num_sgd_iter", 1),
                num_gpus=config["num_gpus"],
                shuffle_sequences=config.get("shuffle_sequences", False),
                _fake_gpus=config["_fake_gpus"],
                framework=config["framework"]))

    # Add on the standard episode reward, etc. metrics reporting. This
    # returns a LocalIterator[metrics_dict] representing metrics for each
    # train step.
    return StandardMetricsReporting(train_op, workers, config)
def validate_config(config: TrainerConfigDict) -> None:
    """Checks and updates the config based on settings."""
    if config["model"]["custom_model"]:
        logger.warning(
            "Setting use_state_preprocessor=True since a custom model "
            "was specified.")
        config["use_state_preprocessor"] = True

    if config["grad_clip"] is not None and config["grad_clip"] <= 0.0:
        raise ValueError("`grad_clip` value must be > 0.0!")

    if config["exploration_config"]["type"] == "ParameterNoise":
        if config["batch_mode"] != "complete_episodes":
            logger.warning(
                "ParameterNoise Exploration requires `batch_mode` to be "
                "'complete_episodes'. Setting batch_mode=complete_episodes.")
            config["batch_mode"] = "complete_episodes"

    if config.get("prioritized_replay"):
        if config["multiagent"]["replay_mode"] == "lockstep":
            raise ValueError("Prioritized replay is not supported when "
                             "replay_mode=lockstep.")
    else:
        if config.get("worker_side_prioritization"):
            raise ValueError(
                "Worker side prioritization is not supported when "
                "prioritized_replay=False.")
def validate_config(self, config: TrainerConfigDict) -> None:
    # Call super's validation method.
    super().validate_config(config)

    if config["use_state_preprocessor"] != DEPRECATED_VALUE:
        deprecation_warning(old="config['use_state_preprocessor']",
                            error=False)
        config["use_state_preprocessor"] = DEPRECATED_VALUE

    if config.get("policy_model", DEPRECATED_VALUE) != DEPRECATED_VALUE:
        deprecation_warning(
            old="config['policy_model']",
            new="config['policy_model_config']",
            error=False,
        )
        config["policy_model_config"] = config["policy_model"]

    if config.get("Q_model", DEPRECATED_VALUE) != DEPRECATED_VALUE:
        deprecation_warning(
            old="config['Q_model']",
            new="config['q_model_config']",
            error=False,
        )
        config["q_model_config"] = config["Q_model"]

    if config["grad_clip"] is not None and config["grad_clip"] <= 0.0:
        raise ValueError("`grad_clip` value must be > 0.0!")

    if config["framework"] in ["tf", "tf2", "tfe"] and tfp is None:
        logger.warning(
            "You need `tensorflow_probability` in order to run SAC! "
            "Install it via `pip install tensorflow_probability`. Your "
            f"tf.__version__={tf.__version__ if tf else None}. "
            "Trying to import tfp results in the following error:")
        try_import_tfp(error=True)
def setup_config(
    policy: Policy,
    obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: TrainerConfigDict,
) -> None:
    """Executed before Policy is "initialized" (at beginning of constructor).

    Args:
        policy (Policy): The Policy object.
        obs_space (gym.spaces.Space): The Policy's observation space.
        action_space (gym.spaces.Space): The Policy's action space.
        config (TrainerConfigDict): The Policy's config.
    """
    # Setting `vf_share_layers` in the top-level config is deprecated.
    # It's confusing as some users might (correctly!) set it in their
    # model config and then won't notice that it's silently overwritten
    # here.
    if config.get("vf_share_layers", DEPRECATED_VALUE) != DEPRECATED_VALUE:
        deprecation_warning(
            old="config[vf_share_layers]",
            new="config[model][vf_share_layers]",
            error=True,
        )
        config["model"]["vf_share_layers"] = config["vf_share_layers"]

    # If vf_share_layers is True, inform about the need to tune
    # vf_loss_coeff.
    if config.get("model", {}).get("vf_share_layers") is True:
        logger.info(
            "`vf_share_layers=True` in your model. "
            "Therefore, remember to tune the value of `vf_loss_coeff`!")
def validate_config(self, config: TrainerConfigDict) -> None:
    # Call super's validation method.
    super().validate_config(config)

    if config["model"]["custom_model"]:
        logger.warning(
            "Setting use_state_preprocessor=True since a custom model "
            "was specified.")
        config["use_state_preprocessor"] = True

    if config["grad_clip"] is not None and config["grad_clip"] <= 0.0:
        raise ValueError("`grad_clip` value must be > 0.0!")

    if config["exploration_config"]["type"] == "ParameterNoise":
        if config["batch_mode"] != "complete_episodes":
            logger.warning(
                "ParameterNoise Exploration requires `batch_mode` to be "
                "'complete_episodes'. Setting "
                "batch_mode=complete_episodes.")
            config["batch_mode"] = "complete_episodes"

    if config.get("prioritized_replay"):
        if config["multiagent"]["replay_mode"] == "lockstep":
            raise ValueError("Prioritized replay is not supported when "
                             "replay_mode=lockstep.")
    else:
        if config.get("worker_side_prioritization"):
            raise ValueError(
                "Worker side prioritization is not supported when "
                "prioritized_replay=False.")
def validate_config(config: TrainerConfigDict) -> None:
    """Checks and updates the config based on settings.

    Rewrites rollout_fragment_length to take into account n_step truncation.
    """
    if config["exploration_config"]["type"] == "ParameterNoise":
        if config["batch_mode"] != "complete_episodes":
            logger.warning(
                "ParameterNoise Exploration requires `batch_mode` to be "
                "'complete_episodes'. Setting batch_mode=complete_episodes.")
            config["batch_mode"] = "complete_episodes"
        if config.get("noisy", False):
            raise ValueError(
                "ParameterNoise Exploration and `noisy` network cannot be "
                "used at the same time!")

    # Update effective batch size to include n-step.
    adjusted_batch_size = max(config["rollout_fragment_length"],
                              config.get("n_step", 1))
    config["rollout_fragment_length"] = adjusted_batch_size

    if config.get("prioritized_replay"):
        if config["multiagent"]["replay_mode"] == "lockstep":
            raise ValueError("Prioritized replay is not supported when "
                             "replay_mode=lockstep.")
        elif config["replay_sequence_length"] > 1:
            raise ValueError("Prioritized replay is not supported when "
                             "replay_sequence_length > 1.")

    # Multi-agent mode and multi-GPU optimizer.
    if config["multiagent"]["policies"] and not config["simple_optimizer"]:
        logger.info(
            "In multi-agent mode, policies will be optimized sequentially "
            "by the multi-GPU optimizer. Consider setting "
            "simple_optimizer=True if this doesn't work for you.")
def _get_resource_bundles(config: TrainerConfigDict):
    input_config = config.get("input_config", {})
    parallelism = input_config.get("parallelism",
                                   config.get("num_workers", 1))
    cpus_per_task = input_config.get(
        "num_cpus_per_read_task", DEFAULT_NUM_CPUS_PER_TASK
    )
    return [{"CPU": math.ceil(parallelism * cpus_per_task)}]
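# A minimal sketch of the bundle arithmetic above, using hypothetical numbers
# that are NOT taken from the source (4 parallel read tasks, 0.5 CPUs per
# task): the trainer would request a single bundle of ceil(4 * 0.5) = 2 CPUs.
import math

parallelism = 4
cpus_per_task = 0.5
bundle = {"CPU": math.ceil(parallelism * cpus_per_task)}
assert bundle == {"CPU": 2}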
def validate_config(self, config: TrainerConfigDict) -> None:
    """Checks and updates the config based on settings."""
    # Call super's validation method.
    super().validate_config(config)

    if config["exploration_config"]["type"] == "ParameterNoise":
        if config["batch_mode"] != "complete_episodes":
            logger.warning(
                "ParameterNoise Exploration requires `batch_mode` to be "
                "'complete_episodes'. Setting batch_mode="
                "complete_episodes.")
            config["batch_mode"] = "complete_episodes"
        if config.get("noisy", False):
            raise ValueError(
                "ParameterNoise Exploration and `noisy` network cannot be"
                " used at the same time!")

    validate_buffer_config(config)

    # Multi-agent mode and multi-GPU optimizer.
    if config["multiagent"]["policies"] and not config["simple_optimizer"]:
        logger.info(
            "In multi-agent mode, policies will be optimized sequentially"
            " by the multi-GPU optimizer. Consider setting "
            "`simple_optimizer=True` if this doesn't work for you.")
def validate_config(self, config: TrainerConfigDict) -> None:
    # First check whether the old `timesteps_per_iteration` is used. If so,
    # convert right away, as for CQL we must measure in training timesteps,
    # never sampling timesteps (CQL does not sample).
    if config.get("timesteps_per_iteration",
                  DEPRECATED_VALUE) != DEPRECATED_VALUE:
        deprecation_warning(
            old="timesteps_per_iteration",
            new="min_train_timesteps_per_reporting",
            error=False,
        )
        config["min_train_timesteps_per_reporting"] = config[
            "timesteps_per_iteration"]
        config["timesteps_per_iteration"] = DEPRECATED_VALUE

    # Call super's validation method.
    super().validate_config(config)

    if config["num_gpus"] > 1:
        raise ValueError("`num_gpus` > 1 not yet supported for CQL!")

    # CQL-torch performs the optimizer steps inside the loss function.
    # Using the multi-GPU optimizer will therefore not work (see multi-GPU
    # check above) and we must use the simple optimizer for now.
    if config["simple_optimizer"] is not True and config[
            "framework"] == "torch":
        config["simple_optimizer"] = True

    if config["framework"] in ["tf", "tf2", "tfe"] and tfp is None:
        logger.warning(
            "You need `tensorflow_probability` in order to run CQL! "
            "Install it via `pip install tensorflow_probability`. Your "
            f"tf.__version__={tf.__version__ if tf else None}. "
            "Trying to import tfp results in the following error:")
        try_import_tfp(error=True)
def get_action_connectors_from_trainer_config(
    ctx: ConnectorContext,
    config: TrainerConfigDict,
) -> ActionConnectorPipeline:
    """Default list of action connectors to use for a new policy.

    Args:
        ctx: Context used to create connectors.
        config: Trainer config.
    """
    connectors = [ConvertToNumpyConnector(ctx)]
    if config.get("normalize_actions", False):
        connectors.append(NormalizeActionsConnector(ctx))
    if config.get("clip_actions", False):
        connectors.append(ClipActionsConnector(ctx))
    connectors.append(ImmutableActionsConnector(ctx))
    return ActionConnectorPipeline(ctx, connectors)
def validate_config(self, config: TrainerConfigDict) -> None:
    """Checks and updates the config based on settings."""
    # Call super's validation method.
    super().validate_config(config)

    if config["exploration_config"]["type"] == "ParameterNoise":
        if config["batch_mode"] != "complete_episodes":
            logger.warning(
                "ParameterNoise Exploration requires `batch_mode` to be "
                "'complete_episodes'. Setting batch_mode="
                "complete_episodes."
            )
            config["batch_mode"] = "complete_episodes"
        if config.get("noisy", False):
            raise ValueError(
                "ParameterNoise Exploration and `noisy` network cannot be"
                " used at the same time!"
            )

    if config.get("prioritized_replay") or config.get(
        "replay_buffer_config", {}
    ).get("prioritized_replay"):
        if config["multiagent"]["replay_mode"] == "lockstep":
            raise ValueError(
                "Prioritized replay is not supported when "
                "replay_mode=lockstep."
            )
        elif config.get("replay_sequence_length", 0) > 1:
            raise ValueError(
                "Prioritized replay is not supported when "
                "replay_sequence_length > 1."
            )
    else:
        if config.get("worker_side_prioritization"):
            raise ValueError(
                "Worker side prioritization is not supported when "
                "prioritized_replay=False."
            )

    # Multi-agent mode and multi-GPU optimizer.
    if config["multiagent"]["policies"] and not config["simple_optimizer"]:
        logger.info(
            "In multi-agent mode, policies will be optimized sequentially"
            " by the multi-GPU optimizer. Consider setting "
            "`simple_optimizer=True` if this doesn't work for you."
        )
def validate_config(config: TrainerConfigDict) -> None:
    """Executed before Policy is "initialized" (at beginning of constructor).

    Args:
        config (TrainerConfigDict): The Policy's config.
    """
    # If vf_share_layers is True, inform about the need to tune
    # vf_loss_coeff.
    if config.get("model", {}).get("vf_share_layers") is True:
        logger.info(
            "`vf_share_layers=True` in your model. "
            "Therefore, remember to tune the value of `vf_loss_coeff`!")
def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    """Execution plan of the A2C algorithm. Defines the distributed dataflow.

    Args:
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    assert len(kwargs) == 0, (
        "A2C execution_plan does NOT take any additional parameters")

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    if config["microbatch_size"]:
        num_microbatches = math.ceil(config["train_batch_size"] /
                                     config["microbatch_size"])
        # In microbatch mode, we want to compute gradients on experience
        # microbatches, average a number of these microbatches, and then
        # apply the averaged gradient in one SGD step. This conserves GPU
        # memory, allowing for extremely large experience batches to be
        # used.
        train_op = (
            rollouts.combine(
                ConcatBatches(
                    min_batch_size=config["microbatch_size"],
                    count_steps_by=config["multiagent"]["count_steps_by"]))
            .for_each(ComputeGradients(workers))  # (grads, info)
            .batch(num_microbatches)  # List[(grads, info)]
            .for_each(AverageGradients())  # (avg_grads, info)
            .for_each(ApplyGradients(workers)))
    else:
        # In normal mode, we execute one SGD step per each train batch.
        if config["simple_optimizer"]:
            train_step_op = TrainOneStep(workers)
        else:
            train_step_op = MultiGPUTrainOneStep(
                workers=workers,
                sgd_minibatch_size=config["train_batch_size"],
                num_sgd_iter=1,
                num_gpus=config["num_gpus"],
                shuffle_sequences=True,
                _fake_gpus=config["_fake_gpus"],
                framework=config.get("framework"))

        train_op = rollouts.combine(
            ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"]["count_steps_by"])) \
            .for_each(train_step_op)

    return StandardMetricsReporting(train_op, workers, config)
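# A small illustration of the microbatch math above, with hypothetical
# values that are NOT taken from the source (train_batch_size=4000,
# microbatch_size=512): gradients would be computed on ceil(4000 / 512) = 8
# microbatches and averaged before a single apply step.
import math

train_batch_size = 4000
microbatch_size = 512
num_microbatches = math.ceil(train_batch_size / microbatch_size)
assert num_microbatches == 8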
def execution_plan(trainer: Trainer, workers: WorkerSet,
                   config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    """Execution plan of the Simple Q algorithm. Defines the distributed
    dataflow.

    Args:
        trainer (Trainer): The Trainer object creating the execution plan.
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config["replay_sequence_length"])
    # Assign to Trainer, so we can store the LocalReplayBuffer's
    # data when we save checkpoints.
    trainer.local_replay_buffer = local_replay_buffer

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # (1) Generate rollouts and store them in our local replay buffer.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    if config["simple_optimizer"]:
        train_step_op = TrainOneStep(workers)
    else:
        train_step_op = MultiGPUTrainOneStep(
            workers=workers,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))

    # (2) Read and train on experiences from the replay buffer.
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(train_step_op) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2).
    train_op = Concurrently(
        [store_op, replay_op], mode="round_robin", output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Execution plan of the PPO algorithm. Defines the distributed dataflow.

    Args:
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Collect batches for the trainable policies.
    rollouts = rollouts.for_each(
        SelectExperiences(workers.trainable_policies()))
    # Concatenate the SampleBatches into one.
    rollouts = rollouts.combine(
        ConcatBatches(
            min_batch_size=config["train_batch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"],
        ))
    # Standardize advantages.
    rollouts = rollouts.for_each(StandardizeFields(["advantages"]))

    # Perform one training step on the combined + standardized batch.
    if config["simple_optimizer"]:
        train_op = rollouts.for_each(
            TrainOneStep(
                workers,
                num_sgd_iter=config["num_sgd_iter"],
                sgd_minibatch_size=config["sgd_minibatch_size"]))
    else:
        train_op = rollouts.for_each(
            TrainTFMultiGPU(
                workers,
                sgd_minibatch_size=config["sgd_minibatch_size"],
                num_sgd_iter=config["num_sgd_iter"],
                num_gpus=config["num_gpus"],
                rollout_fragment_length=config["rollout_fragment_length"],
                num_envs_per_worker=config["num_envs_per_worker"],
                train_batch_size=config["train_batch_size"],
                shuffle_sequences=config["shuffle_sequences"],
                _fake_gpus=config["_fake_gpus"],
                framework=config.get("framework")))

    # Update KL after each round of training.
    train_op = train_op.for_each(lambda t: t[1]).for_each(UpdateKL(workers))

    # Warn about bad reward scales and return training metrics.
    return StandardMetricsReporting(train_op, workers, config) \
        .for_each(lambda result: warn_about_bad_reward_scales(config, result))
def __init__(self, observation_space: gym.spaces.Space,
             action_space: gym.spaces.Space, config: TrainerConfigDict):
    print("__init__ FrozenPolicyFromTuneTrainer")

    self.tune_config = config["tune_config"]
    TuneTrainerClass = self.tune_config["TuneTrainerClass"]
    self.tune_trainer = TuneTrainerClass(config=self.tune_config)

    self.load_checkpoint(config.pop(LOAD_FROM_CONFIG_KEY, (None, None)))
    super().__init__(observation_space, action_space, config)
def make_model_and_action_dist(
        policy: Policy, observation_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        config: TrainerConfigDict) -> Tuple[ModelV2, TorchDistributionWrapper]:
    model_config = copy.deepcopy(config["model"])
    model_config.update(model_config.pop("custom_model_config"))
    model = COMATorchModel(
        observation_space,
        action_space,
        num_outputs=sum(action_space.nvec),
        model_config=model_config,
        communication=config.get("communication", True))
    return model, COMATorchDist
def validate_config(config: TrainerConfigDict) -> None:
    """Checks and updates the config based on settings.

    Rewrites replay_sequence_length to be model max_seq_len + burn_in.
    """
    if config["replay_sequence_length"] != -1:
        raise ValueError(
            "`replay_sequence_length` is calculated automatically to be "
            "model->max_seq_len + burn_in!")
    # Set the replay sequence length to the model's max_seq_len plus
    # the burn-in.
    config["replay_sequence_length"] = \
        config["burn_in"] + config["model"]["max_seq_len"]

    if config.get("prioritized_replay"):
        raise ValueError("Prioritized replay is not supported for R2D2 yet!")

    if config.get("batch_mode") != "complete_episodes":
        raise ValueError("`batch_mode` must be 'complete_episodes'!")

    if config["n_step"] > 1:
        raise ValueError("`n_step` > 1 not yet supported by R2D2!")
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Execution plan of the DQN algorithm. Defines the distributed dataflow.

    Args:
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    replay_buffer_actor = ReservoirReplayActor.remote(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config["replay_sequence_length"],
    )

    # Store a handle for the replay buffer actor in the local worker.
    workers.local_worker().replay_buffer_actor = replay_buffer_actor

    # Read and train on experiences from the replay buffer. Every batch
    # returned from the Replay iterator is passed to TrainOneStep to
    # take a SGD step.
    post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)

    print("running replay op..")

    def gen_replay(_):
        while True:
            item = ray.get(replay_buffer_actor.replay.remote())
            if item is None:
                yield _NextValueNotReady()
            else:
                yield item

    replay_op = LocalIterator(gen_replay, SharedMetrics()) \
        .for_each(lambda x: post_fn(x, workers, config)) \
        .for_each(TrainOneStep(workers))

    replay_op = StandardMetricsReporting(replay_op, workers, config)
    replay_op = map(
        lambda x: x if not isinstance(x, _NextValueNotReady) else {},
        replay_op)

    return replay_op
def validate_config(config: TrainerConfigDict) -> None:
    """Checks and updates the config based on settings.

    Rewrites replay_sequence_length to take into account burn-in and
    max_seq_len truncation.
    """
    if config["replay_sequence_length"] != -1:
        raise ValueError(
            "`replay_sequence_length` is calculated automatically to be "
            "model->max_seq_len + burn_in!")
    # Set the replay sequence length to the model's max_seq_len plus
    # the burn-in.
    config["replay_sequence_length"] = \
        config["burn_in"] + config["model"]["max_seq_len"]

    if config.get("batch_mode") != "complete_episodes":
        raise ValueError("`batch_mode` must be 'complete_episodes'!")
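# A quick illustration of the derived value above, with hypothetical settings
# that are NOT taken from the source (burn_in=20, model max_seq_len=40): the
# buffer would replay sequences of 20 + 40 = 60 timesteps, where the first 20
# only serve to warm up the recurrent state.
example_config = {"burn_in": 20, "model": {"max_seq_len": 40}}
replay_sequence_length = (
    example_config["burn_in"] + example_config["model"]["max_seq_len"])
assert replay_sequence_length == 60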
def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    assert len(kwargs) == 0, (
        "PPO execution_plan does NOT take any additional parameters")

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Collect batches for the trainable policies.
    rollouts = rollouts.for_each(
        SelectExperiences(workers.trainable_policies()))
    # Concatenate the SampleBatches into one.
    rollouts = rollouts.combine(
        ConcatBatches(
            min_batch_size=config["train_batch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"],
        ))
    # Standardize advantages.
    rollouts = rollouts.for_each(StandardizeFields(["advantages"]))

    # Perform one training step on the combined + standardized batch.
    if config["simple_optimizer"]:
        train_op = rollouts.for_each(
            TrainOneStep(
                workers,
                num_sgd_iter=config["num_sgd_iter"],
                sgd_minibatch_size=config["sgd_minibatch_size"]))
    else:
        train_op = rollouts.for_each(
            MultiGPUTrainOneStep(
                workers=workers,
                sgd_minibatch_size=config["sgd_minibatch_size"],
                num_sgd_iter=config["num_sgd_iter"],
                num_gpus=config["num_gpus"],
                shuffle_sequences=config["shuffle_sequences"],
                _fake_gpus=config["_fake_gpus"],
                framework=config.get("framework")))

    # Update KL after each round of training.
    train_op = train_op.for_each(lambda t: t[1]).for_each(
        UpdateKL(workers))

    # Warn about bad reward scales and return training metrics.
    return StandardMetricsReporting(train_op, workers, config) \
        .for_each(lambda result: warn_about_bad_reward_scales(
            config, result))
def _build_policy_map(
        self, policy_dict: MultiAgentPolicyConfigDict,
        policy_config: TrainerConfigDict
) -> Tuple[Dict[PolicyID, Policy], Dict[PolicyID, Preprocessor]]:
    policy_map = {}
    preprocessors = {}
    for name, (cls, obs_space, act_space,
               conf) in sorted(policy_dict.items()):
        logger.debug("Creating policy for {}".format(name))
        merged_conf = merge_dicts(policy_config, conf)
        merged_conf["num_workers"] = self.num_workers
        merged_conf["worker_index"] = self.worker_index
        if self.preprocessing_enabled:
            preprocessor = ModelCatalog.get_preprocessor_for_space(
                obs_space, merged_conf.get("model"))
            preprocessors[name] = preprocessor
            obs_space = preprocessor.observation_space
        else:
            preprocessors[name] = NoPreprocessor(obs_space)
        if isinstance(obs_space, gym.spaces.Dict) or \
                isinstance(obs_space, gym.spaces.Tuple):
            raise ValueError(
                "Found raw Tuple|Dict space as input to policy. "
                "Please preprocess these observations with a "
                "Tuple|DictFlatteningPreprocessor.")
        if tf1 and tf1.executing_eagerly():
            if hasattr(cls, "as_eager"):
                cls = cls.as_eager()
                if policy_config.get("eager_tracing"):
                    cls = cls.with_tracing()
            elif not issubclass(cls, TFPolicy):
                pass  # could be some other type of policy
            else:
                raise ValueError("This policy does not support eager "
                                 "execution: {}".format(cls))
        if tf1:
            with tf1.variable_scope(name):
                policy_map[name] = cls(obs_space, act_space, merged_conf)
        else:
            policy_map[name] = cls(obs_space, act_space, merged_conf)
    if self.worker_index == 0:
        logger.info("Built policy map: {}".format(policy_map))
        logger.info("Built preprocessor map: {}".format(preprocessors))
    return policy_map, preprocessors
def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    assert "local_replay_buffer" in kwargs, (
        "GenericOffPolicy execution plan requires a local replay buffer.")
    local_replay_buffer = kwargs["local_replay_buffer"]

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # (1) Generate rollouts and store them in our local replay buffer.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    if config["simple_optimizer"]:
        train_step_op = TrainOneStep(workers)
    else:
        train_step_op = MultiGPUTrainOneStep(
            workers=workers,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))

    # (2) Read and train on experiences from the replay buffer.
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(train_step_op) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2).
    train_op = Concurrently(
        [store_op, replay_op], mode="round_robin", output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers: WorkerSet, config: TrainerConfigDict, **kwargs) -> LocalIterator[dict]: assert ( "local_replay_buffer" in kwargs), "DQN's execution plan requires a local replay buffer." # Assign to Trainer, so we can store the MultiAgentReplayBuffer's # data when we save checkpoints. local_replay_buffer = kwargs["local_replay_buffer"] rollouts = ParallelRollouts(workers, mode="bulk_sync") # We execute the following steps concurrently: # (1) Generate rollouts and store them in our local replay buffer. # Calling next() on store_op drives this. store_op = rollouts.for_each( StoreToReplayBuffer(local_buffer=local_replay_buffer)) def update_prio(item): samples, info_dict = item if config.get("prioritized_replay"): prio_dict = {} for policy_id, info in info_dict.items(): # TODO(sven): This is currently structured differently for # torch/tf. Clean up these results/info dicts across # policies (note: fixing this in torch_policy.py will # break e.g. DDPPO!). td_error = info.get( "td_error", info[LEARNER_STATS_KEY].get("td_error")) samples.policy_batches[policy_id].set_get_interceptor(None) batch_indices = samples.policy_batches[policy_id].get( "batch_indexes") # In case the buffer stores sequences, TD-error could # already be calculated per sequence chunk. if len(batch_indices) != len(td_error): T = local_replay_buffer.replay_sequence_length assert (len(batch_indices) > len(td_error) and len(batch_indices) % T == 0) batch_indices = batch_indices.reshape([-1, T])[:, 0] assert len(batch_indices) == len(td_error) prio_dict[policy_id] = (batch_indices, td_error) local_replay_buffer.update_priorities(prio_dict) return info_dict # (2) Read and train on experiences from the replay buffer. Every batch # returned from the LocalReplay() iterator is passed to TrainOneStep to # take a SGD step, and then we decide whether to update the target # network. post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b) if config["simple_optimizer"]: train_step_op = TrainOneStep(workers) else: train_step_op = MultiGPUTrainOneStep( workers=workers, sgd_minibatch_size=config["train_batch_size"], num_sgd_iter=1, num_gpus=config["num_gpus"], _fake_gpus=config["_fake_gpus"], ) replay_op = (Replay(local_buffer=local_replay_buffer).for_each( lambda x: post_fn(x, workers, config)).for_each( train_step_op).for_each(update_prio).for_each( UpdateTargetNetwork(workers, config["target_network_update_freq"]))) # Alternate deterministically between (1) and (2). # Only return the output of (2) since training metrics are not # available until (2) runs. train_op = Concurrently( [store_op, replay_op], mode="round_robin", output_indexes=[1], round_robin_weights=calculate_rr_weights(config), ) return StandardMetricsReporting(train_op, workers, config)
def create_policy(self, policy_id: PolicyID, policy_cls: Type["Policy"], observation_space: gym.Space, action_space: gym.Space, config_override: PartialTrainerConfigDict, merged_config: TrainerConfigDict) -> None: """Creates a new policy and stores it to the cache. Args: policy_id (PolicyID): The policy ID. This is the key under which the created policy will be stored in this map. policy_cls (Type[Policy]): The (original) policy class to use. This may still be altered in case tf-eager (and tracing) is used. observation_space (gym.Space): The observation space of the policy. action_space (gym.Space): The action space of the policy. config_override (PartialTrainerConfigDict): The config override dict for this policy. This is the partial dict provided by the user. merged_config (TrainerConfigDict): The entire config (merged default config + `config_override`). """ framework = merged_config.get("framework", "tf") class_ = get_tf_eager_cls_if_necessary(policy_cls, merged_config) # Tf. if framework in ["tf2", "tf", "tfe"]: var_scope = policy_id + ( ("_wk" + str(self.worker_index)) if self.worker_index else "") # For tf static graph, build every policy in its own graph # and create a new session for it. if framework == "tf": with tf1.Graph().as_default(): if self.session_creator: sess = self.session_creator() else: sess = tf1.Session(config=tf1.ConfigProto( gpu_options=tf1.GPUOptions(allow_growth=True))) with sess.as_default(): # Set graph-level seed. if self.seed is not None: tf1.set_random_seed(self.seed) with tf1.variable_scope(var_scope): self[policy_id] = class_(observation_space, action_space, merged_config) # For tf-eager: no graph, no session. else: with tf1.variable_scope(var_scope): self[policy_id] = \ class_(observation_space, action_space, merged_config) # Non-tf: No graph, no session. else: class_ = policy_cls self[policy_id] = class_(observation_space, action_space, merged_config) # Store spec (class, obs-space, act-space, and config overrides) such # that the map will be able to reproduce on-the-fly added policies # from disk. self.policy_specs[policy_id] = PolicySpec( policy_class=policy_cls, observation_space=observation_space, action_space=action_space, config=config_override)
def _make_worker( self, *, cls: Callable, env_creator: Callable[[EnvContext], EnvType], validate_env: Optional[Callable[[EnvType], None]], policy_cls: Type[Policy], worker_index: int, num_workers: int, config: TrainerConfigDict, spaces: Optional[Dict[PolicyID, Tuple[gym.spaces.Space, gym.spaces.Space]]] = None, ) -> Union[RolloutWorker, "ActorHandle"]: def session_creator(): logger.debug("Creating TF session {}".format( config["tf_session_args"])) return tf1.Session(config=tf1.ConfigProto( **config["tf_session_args"])) if isinstance(config["input"], FunctionType): input_creator = config["input"] elif config["input"] == "sampler": input_creator = (lambda ioctx: ioctx.default_sampler_input()) elif isinstance(config["input"], dict): input_creator = ( lambda ioctx: ShuffledInput(MixedInput(config["input"], ioctx), config["shuffle_buffer_size"])) else: input_creator = ( lambda ioctx: ShuffledInput(JsonReader(config["input"], ioctx), config["shuffle_buffer_size"])) if isinstance(config["output"], FunctionType): output_creator = config["output"] elif config["output"] is None: output_creator = (lambda ioctx: NoopOutput()) elif config["output"] == "logdir": output_creator = (lambda ioctx: JsonWriter( ioctx.log_dir, ioctx, max_file_size=config["output_max_file_size"], compress_columns=config["output_compress_columns"])) else: output_creator = (lambda ioctx: JsonWriter( config["output"], ioctx, max_file_size=config["output_max_file_size"], compress_columns=config["output_compress_columns"])) if config["input"] == "sampler": input_evaluation = [] else: input_evaluation = config["input_evaluation"] # Fill in the default policy_cls if 'None' is specified in multiagent. if config["multiagent"]["policies"]: tmp = config["multiagent"]["policies"] _validate_multiagent_config(tmp, allow_none_graph=True) # TODO: (sven) Allow for setting observation and action spaces to # None as well, in which case, spaces are taken from env. # It's tedious to have to provide these in a multi-agent config. for k, v in tmp.items(): if v[0] is None: tmp[k] = (policy_cls, v[1], v[2], v[3]) policy_spec = tmp # Otherwise, policy spec is simply the policy class itself. 
else: policy_spec = policy_cls if worker_index == 0: extra_python_environs = config.get( "extra_python_environs_for_driver", None) else: extra_python_environs = config.get( "extra_python_environs_for_worker", None) worker = cls( env_creator=env_creator, validate_env=validate_env, policy_spec=policy_spec, policy_mapping_fn=config["multiagent"]["policy_mapping_fn"], policies_to_train=config["multiagent"]["policies_to_train"], tf_session_creator=(session_creator if config["tf_session_args"] else None), rollout_fragment_length=config["rollout_fragment_length"], batch_mode=config["batch_mode"], episode_horizon=config["horizon"], preprocessor_pref=config["preprocessor_pref"], sample_async=config["sample_async"], compress_observations=config["compress_observations"], num_envs=config["num_envs_per_worker"], observation_fn=config["multiagent"]["observation_fn"], observation_filter=config["observation_filter"], clip_rewards=config["clip_rewards"], clip_actions=config["clip_actions"], env_config=config["env_config"], model_config=config["model"], policy_config=config, worker_index=worker_index, num_workers=num_workers, monitor_path=self._logdir if config["monitor"] else None, log_dir=self._logdir, log_level=config["log_level"], callbacks=config["callbacks"], input_creator=input_creator, input_evaluation=input_evaluation, output_creator=output_creator, remote_worker_envs=config["remote_worker_envs"], remote_env_batch_wait_ms=config["remote_env_batch_wait_ms"], soft_horizon=config["soft_horizon"], no_done_at_end=config["no_done_at_end"], seed=(config["seed"] + worker_index) if config["seed"] is not None else None, fake_sampler=config["fake_sampler"], extra_python_environs=extra_python_environs, spaces=spaces, ) return worker
def _make_worker( self, cls: Callable, env_creator: Callable[[EnvContext], EnvType], policy: Type[Policy], worker_index: int, config: TrainerConfigDict) -> Union[RolloutWorker, "ActorHandle"]: def session_creator(): logger.debug("Creating TF session {}".format( config["tf_session_args"])) return tf1.Session(config=tf1.ConfigProto( **config["tf_session_args"])) if isinstance(config["input"], FunctionType): input_creator = config["input"] elif config["input"] == "sampler": input_creator = (lambda ioctx: ioctx.default_sampler_input()) elif isinstance(config["input"], dict): input_creator = ( lambda ioctx: ShuffledInput(MixedInput(config["input"], ioctx), config["shuffle_buffer_size"])) else: input_creator = ( lambda ioctx: ShuffledInput(JsonReader(config["input"], ioctx), config["shuffle_buffer_size"])) if isinstance(config["output"], FunctionType): output_creator = config["output"] elif config["output"] is None: output_creator = (lambda ioctx: NoopOutput()) elif config["output"] == "logdir": output_creator = (lambda ioctx: JsonWriter( ioctx.log_dir, ioctx, max_file_size=config["output_max_file_size"], compress_columns=config["output_compress_columns"])) else: output_creator = (lambda ioctx: JsonWriter( config["output"], ioctx, max_file_size=config["output_max_file_size"], compress_columns=config["output_compress_columns"])) if config["input"] == "sampler": input_evaluation = [] else: input_evaluation = config["input_evaluation"] # Fill in the default policy if 'None' is specified in multiagent. if config["multiagent"]["policies"]: tmp = config["multiagent"]["policies"] _validate_multiagent_config(tmp, allow_none_graph=True) for k, v in tmp.items(): if v[0] is None: tmp[k] = (policy, v[1], v[2], v[3]) policy = tmp if worker_index == 0: extra_python_environs = config.get( "extra_python_environs_for_driver", None) else: extra_python_environs = config.get( "extra_python_environs_for_worker", None) worker = cls( env_creator, policy, policy_mapping_fn=config["multiagent"]["policy_mapping_fn"], policies_to_train=config["multiagent"]["policies_to_train"], tf_session_creator=(session_creator if config["tf_session_args"] else None), rollout_fragment_length=config["rollout_fragment_length"], batch_mode=config["batch_mode"], episode_horizon=config["horizon"], preprocessor_pref=config["preprocessor_pref"], sample_async=config["sample_async"], compress_observations=config["compress_observations"], num_envs=config["num_envs_per_worker"], observation_fn=config["multiagent"]["observation_fn"], observation_filter=config["observation_filter"], clip_rewards=config["clip_rewards"], clip_actions=config["clip_actions"], env_config=config["env_config"], model_config=config["model"], policy_config=config, worker_index=worker_index, num_workers=config["num_workers"], monitor_path=self._logdir if config["monitor"] else None, log_dir=self._logdir, log_level=config["log_level"], callbacks=config["callbacks"], input_creator=input_creator, input_evaluation=input_evaluation, output_creator=output_creator, remote_worker_envs=config["remote_worker_envs"], remote_env_batch_wait_ms=config["remote_env_batch_wait_ms"], soft_horizon=config["soft_horizon"], no_done_at_end=config["no_done_at_end"], seed=(config["seed"] + worker_index) if config["seed"] is not None else None, fake_sampler=config["fake_sampler"], extra_python_environs=extra_python_environs) return worker
def __init__( self, *, env_creator: Callable[[EnvContext], EnvType], validate_env: Optional[Callable[[EnvType, EnvContext], None]] = None, policy_spec: Union[type, Dict[ str, Tuple[Optional[type], gym.Space, gym.Space, PartialTrainerConfigDict]]] = None, policy_mapping_fn: Optional[Callable[[AgentID], PolicyID]] = None, policies_to_train: Optional[List[PolicyID]] = None, tf_session_creator: Optional[Callable[[], "tf1.Session"]] = None, rollout_fragment_length: int = 100, batch_mode: str = "truncate_episodes", episode_horizon: int = None, preprocessor_pref: str = "deepmind", sample_async: bool = False, compress_observations: bool = False, num_envs: int = 1, observation_fn: "ObservationFunction" = None, observation_filter: str = "NoFilter", clip_rewards: bool = None, clip_actions: bool = True, env_config: EnvConfigDict = None, model_config: ModelConfigDict = None, policy_config: TrainerConfigDict = None, worker_index: int = 0, num_workers: int = 0, monitor_path: str = None, log_dir: str = None, log_level: str = None, callbacks: Type["DefaultCallbacks"] = None, input_creator: Callable[[ IOContext ], InputReader] = lambda ioctx: ioctx.default_sampler_input(), input_evaluation: List[str] = frozenset([]), output_creator: Callable[ [IOContext], OutputWriter] = lambda ioctx: NoopOutput(), remote_worker_envs: bool = False, remote_env_batch_wait_ms: int = 0, soft_horizon: bool = False, no_done_at_end: bool = False, seed: int = None, extra_python_environs: dict = None, fake_sampler: bool = False, spaces: Optional[Dict[PolicyID, Tuple[gym.spaces.Space, gym.spaces.Space]]] = None, policy: Union[type, Dict[ str, Tuple[Optional[type], gym.Space, gym.Space, PartialTrainerConfigDict]]] = None, ): """Initialize a rollout worker. Args: env_creator (Callable[[EnvContext], EnvType]): Function that returns a gym.Env given an EnvContext wrapped configuration. validate_env (Optional[Callable[[EnvType, EnvContext], None]]): Optional callable to validate the generated environment (only on worker=0). policy_spec (Union[type, Dict[str, Tuple[Type[Policy], gym.Space, gym.Space, PartialTrainerConfigDict]]]): Either a Policy class or a dict of policy id strings to (Policy class, obs_space, action_space, config)-tuples. If a dict is specified, then we are in multi-agent mode and a policy_mapping_fn can also be set (if not, will map all agents to DEFAULT_POLICY_ID). policy_mapping_fn (Optional[Callable[[AgentID], PolicyID]]): A callable that maps agent ids to policy ids in multi-agent mode. This function will be called each time a new agent appears in an episode, to bind that agent to a policy for the duration of the episode. If not provided, will map all agents to DEFAULT_POLICY_ID. policies_to_train (Optional[List[PolicyID]]): Optional list of policies to train, or None for all policies. tf_session_creator (Optional[Callable[[], tf1.Session]]): A function that returns a TF session. This is optional and only useful with TFPolicy. rollout_fragment_length (int): The target number of env transitions to include in each sample batch returned from this worker. batch_mode (str): One of the following batch modes: "truncate_episodes": Each call to sample() will return a batch of at most `rollout_fragment_length * num_envs` in size. The batch will be exactly `rollout_fragment_length * num_envs` in size if postprocessing does not change batch sizes. Episodes may be truncated in order to meet this size requirement. 
"complete_episodes": Each call to sample() will return a batch of at least `rollout_fragment_length * num_envs` in size. Episodes will not be truncated, but multiple episodes may be packed within one batch to meet the batch size. Note that when `num_envs > 1`, episode steps will be buffered until the episode completes, and hence batches may contain significant amounts of off-policy data. episode_horizon (int): Whether to stop episodes at this horizon. preprocessor_pref (str): Whether to prefer RLlib preprocessors ("rllib") or deepmind ("deepmind") when applicable. sample_async (bool): Whether to compute samples asynchronously in the background, which improves throughput but can cause samples to be slightly off-policy. compress_observations (bool): If true, compress the observations. They can be decompressed with rllib/utils/compression. num_envs (int): If more than one, will create multiple envs and vectorize the computation of actions. This has no effect if if the env already implements VectorEnv. observation_fn (ObservationFunction): Optional multi-agent observation function. observation_filter (str): Name of observation filter to use. clip_rewards (bool): Whether to clip rewards to [-1, 1] prior to experience postprocessing. Setting to None means clip for Atari only. clip_actions (bool): Whether to clip action values to the range specified by the policy action space. env_config (EnvConfigDict): Config to pass to the env creator. model_config (ModelConfigDict): Config to use when creating the policy model. policy_config (TrainerConfigDict): Config to pass to the policy. In the multi-agent case, this config will be merged with the per-policy configs specified by `policy_spec`. worker_index (int): For remote workers, this should be set to a non-zero and unique value. This index is passed to created envs through EnvContext so that envs can be configured per worker. num_workers (int): For remote workers, how many workers altogether have been created? monitor_path (str): Write out episode stats and videos to this directory if specified. log_dir (str): Directory where logs can be placed. log_level (str): Set the root log level on creation. callbacks (DefaultCallbacks): Custom training callbacks. input_creator (Callable[[IOContext], InputReader]): Function that returns an InputReader object for loading previous generated experiences. input_evaluation (List[str]): How to evaluate the policy performance. This only makes sense to set when the input is reading offline data. The possible values include: - "is": the step-wise importance sampling estimator. - "wis": the weighted step-wise is estimator. - "simulation": run the environment in the background, but use this data for evaluation only and never for learning. output_creator (Callable[[IOContext], OutputWriter]): Function that returns an OutputWriter object for saving generated experiences. remote_worker_envs (bool): If using num_envs > 1, whether to create those new envs in remote processes instead of in the current process. This adds overheads, but can make sense if your envs remote_env_batch_wait_ms (float): Timeout that remote workers are waiting when polling environments. 0 (continue when at least one env is ready) is a reasonable default, but optimal value could be obtained by measuring your environment step / reset and model inference perf. soft_horizon (bool): Calculate rewards but don't reset the environment when the horizon is hit. no_done_at_end (bool): Ignore the done=True at the end of the episode and instead record done=False. 
seed (int): Set the seed of both np and tf to this value to to ensure each remote worker has unique exploration behavior. extra_python_environs (dict): Extra python environments need to be set. fake_sampler (bool): Use a fake (inf speed) sampler for testing. spaces (Optional[Dict[PolicyID, Tuple[gym.spaces.Space, gym.spaces.Space]]]): An optional space dict mapping policy IDs to (obs_space, action_space)-tuples. This is used in case no Env is created on this RolloutWorker. policy: Obsoleted arg. Use `policy_spec` instead. """ # Deprecated arg. if policy is not None: deprecation_warning("policy", "policy_spec", error=False) policy_spec = policy assert policy_spec is not None, "Must provide `policy_spec` when " \ "creating RolloutWorker!" self._original_kwargs: dict = locals().copy() del self._original_kwargs["self"] global _global_worker _global_worker = self # set extra environs first if extra_python_environs: for key, value in extra_python_environs.items(): os.environ[key] = str(value) def gen_rollouts(): while True: yield self.sample() ParallelIteratorWorker.__init__(self, gen_rollouts, False) policy_config: TrainerConfigDict = policy_config or {} if (tf1 and policy_config.get("framework") in ["tf2", "tfe"] # This eager check is necessary for certain all-framework tests # that use tf's eager_mode() context generator. and not tf1.executing_eagerly()): tf1.enable_eager_execution() if log_level: logging.getLogger("ray.rllib").setLevel(log_level) if worker_index > 1: disable_log_once_globally() # only need 1 worker to log elif log_level == "DEBUG": enable_periodic_logging() env_context = EnvContext(env_config or {}, worker_index) self.env_context = env_context self.policy_config: TrainerConfigDict = policy_config if callbacks: self.callbacks: "DefaultCallbacks" = callbacks() else: from ray.rllib.agents.callbacks import DefaultCallbacks self.callbacks: "DefaultCallbacks" = DefaultCallbacks() self.worker_index: int = worker_index self.num_workers: int = num_workers model_config: ModelConfigDict = model_config or {} policy_mapping_fn = (policy_mapping_fn or (lambda agent_id: DEFAULT_POLICY_ID)) if not callable(policy_mapping_fn): raise ValueError("Policy mapping function not callable?") self.env_creator: Callable[[EnvContext], EnvType] = env_creator self.rollout_fragment_length: int = rollout_fragment_length * num_envs self.batch_mode: str = batch_mode self.compress_observations: bool = compress_observations self.preprocessing_enabled: bool = True self.last_batch: SampleBatchType = None self.global_vars: dict = None self.fake_sampler: bool = fake_sampler # No Env will be used in this particular worker (not needed). if worker_index == 0 and num_workers > 0 and \ policy_config["create_env_on_driver"] is False: self.env = None # Create an env for this worker. else: self.env = _validate_env(env_creator(env_context)) if validate_env is not None: validate_env(self.env, self.env_context) if isinstance(self.env, (BaseEnv, MultiAgentEnv)): def wrap(env): return env # we can't auto-wrap these env types elif is_atari(self.env) and \ not model_config.get("custom_preprocessor") and \ preprocessor_pref == "deepmind": # Deepmind wrappers already handle all preprocessing. self.preprocessing_enabled = False # If clip_rewards not explicitly set to False, switch it # on here (clip between -1.0 and 1.0). 
if clip_rewards is None: clip_rewards = True def wrap(env): env = wrap_deepmind( env, dim=model_config.get("dim"), framestack=model_config.get("framestack")) if monitor_path: from gym import wrappers env = wrappers.Monitor(env, monitor_path, resume=True) return env else: def wrap(env): if monitor_path: from gym import wrappers env = wrappers.Monitor(env, monitor_path, resume=True) return env self.env: EnvType = wrap(self.env) def make_env(vector_index): return wrap( env_creator( env_context.copy_with_overrides( worker_index=worker_index, vector_index=vector_index, remote=remote_worker_envs))) self.make_env_fn = make_env self.tf_sess = None policy_dict = _validate_and_canonicalize( policy_spec, self.env, spaces=spaces) self.policies_to_train: List[PolicyID] = policies_to_train or list( policy_dict.keys()) self.policy_map: Dict[PolicyID, Policy] = None self.preprocessors: Dict[PolicyID, Preprocessor] = None # set numpy and python seed if seed is not None: np.random.seed(seed) random.seed(seed) if not hasattr(self.env, "seed"): logger.info("Env doesn't support env.seed(): {}".format( self.env)) else: self.env.seed(seed) try: assert torch is not None torch.manual_seed(seed) except AssertionError: logger.info("Could not seed torch") if _has_tensorflow_graph(policy_dict) and not ( tf1 and tf1.executing_eagerly()): if not tf1: raise ImportError("Could not import tensorflow") with tf1.Graph().as_default(): if tf_session_creator: self.tf_sess = tf_session_creator() else: self.tf_sess = tf1.Session( config=tf1.ConfigProto( gpu_options=tf1.GPUOptions(allow_growth=True))) with self.tf_sess.as_default(): # set graph-level seed if seed is not None: tf1.set_random_seed(seed) self.policy_map, self.preprocessors = \ self._build_policy_map(policy_dict, policy_config) else: self.policy_map, self.preprocessors = self._build_policy_map( policy_dict, policy_config) if (ray.is_initialized() and ray.worker._mode() != ray.worker.LOCAL_MODE): # Check available number of GPUs if not ray.get_gpu_ids(): logger.debug("Creating policy evaluation worker {}".format( worker_index) + " on CPU (please ignore any CUDA init errors)") elif (policy_config["framework"] in ["tf2", "tf", "tfe"] and not tf.config.experimental.list_physical_devices("GPU")) or \ (policy_config["framework"] == "torch" and not torch.cuda.is_available()): raise RuntimeError( "GPUs were assigned to this worker by Ray, but " "your DL framework ({}) reports GPU acceleration is " "disabled. 
This could be due to a bad CUDA- or {} " "installation.".format(policy_config["framework"], policy_config["framework"])) self.multiagent: bool = set( self.policy_map.keys()) != {DEFAULT_POLICY_ID} if self.multiagent and self.env is not None: if not ((isinstance(self.env, MultiAgentEnv) or isinstance(self.env, ExternalMultiAgentEnv)) or isinstance(self.env, BaseEnv)): raise ValueError( "Have multiple policies {}, but the env ".format( self.policy_map) + "{} is not a subclass of BaseEnv, MultiAgentEnv or " "ExternalMultiAgentEnv?".format(self.env)) self.filters: Dict[PolicyID, Filter] = { policy_id: get_filter(observation_filter, policy.observation_space.shape) for (policy_id, policy) in self.policy_map.items() } if self.worker_index == 0: logger.info("Built filter map: {}".format(self.filters)) self.num_envs: int = num_envs if self.env is None: self.async_env = None elif "custom_vector_env" in policy_config: custom_vec_wrapper = policy_config["custom_vector_env"] self.async_env = custom_vec_wrapper(self.env) else: # Always use vector env for consistency even if num_envs = 1. self.async_env: BaseEnv = BaseEnv.to_base_env( self.env, make_env=make_env, num_envs=num_envs, remote_envs=remote_worker_envs, remote_env_batch_wait_ms=remote_env_batch_wait_ms) # `truncate_episodes`: Allow a batch to contain more than one episode # (fragments) and always make the batch `rollout_fragment_length` # long. if self.batch_mode == "truncate_episodes": pack = True # `complete_episodes`: Never cut episodes and sampler will return # exactly one (complete) episode per poll. elif self.batch_mode == "complete_episodes": rollout_fragment_length = float("inf") pack = False else: raise ValueError("Unsupported batch mode: {}".format( self.batch_mode)) self.io_context: IOContext = IOContext(log_dir, policy_config, worker_index, self) self.reward_estimators: List[OffPolicyEstimator] = [] for method in input_evaluation: if method == "simulation": logger.warning( "Requested 'simulation' input evaluation method: " "will discard all sampler outputs and keep only metrics.") sample_async = True elif method == "is": ise = ImportanceSamplingEstimator.create(self.io_context) self.reward_estimators.append(ise) elif method == "wis": wise = WeightedImportanceSamplingEstimator.create( self.io_context) self.reward_estimators.append(wise) else: raise ValueError( "Unknown evaluation method: {}".format(method)) if self.env is None: self.sampler = None elif sample_async: self.sampler = AsyncSampler( worker=self, env=self.async_env, policies=self.policy_map, policy_mapping_fn=policy_mapping_fn, preprocessors=self.preprocessors, obs_filters=self.filters, clip_rewards=clip_rewards, rollout_fragment_length=rollout_fragment_length, callbacks=self.callbacks, horizon=episode_horizon, multiple_episodes_in_batch=pack, tf_sess=self.tf_sess, clip_actions=clip_actions, blackhole_outputs="simulation" in input_evaluation, soft_horizon=soft_horizon, no_done_at_end=no_done_at_end, observation_fn=observation_fn, _use_trajectory_view_api=policy_config.get( "_use_trajectory_view_api", False)) # Start the Sampler thread. 
self.sampler.start() else: self.sampler = SyncSampler( worker=self, env=self.async_env, policies=self.policy_map, policy_mapping_fn=policy_mapping_fn, preprocessors=self.preprocessors, obs_filters=self.filters, clip_rewards=clip_rewards, rollout_fragment_length=rollout_fragment_length, callbacks=self.callbacks, horizon=episode_horizon, multiple_episodes_in_batch=pack, tf_sess=self.tf_sess, clip_actions=clip_actions, soft_horizon=soft_horizon, no_done_at_end=no_done_at_end, observation_fn=observation_fn, _use_trajectory_view_api=policy_config.get( "_use_trajectory_view_api", False)) self.input_reader: InputReader = input_creator(self.io_context) self.output_writer: OutputWriter = output_creator(self.io_context) logger.debug( "Created rollout worker with env {} ({}), policies {}".format( self.async_env, self.env, self.policy_map))
def __init__(
    self,
    observation_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: TrainerConfigDict,
    **kwargs,
):
    self.framework = config.get("framework", "tf2")

    # Log device.
    logger.info("Creating TF-eager policy running on {}.".format(
        "GPU" if get_gpu_devices() else "CPU"))

    Policy.__init__(self, observation_space, action_space, config)

    config = dict(self.get_default_config(), **config)
    self.config = config

    self._is_training = False

    # Global timestep should be a tensor.
    self.global_timestep = tf.Variable(0, trainable=False, dtype=tf.int64)
    self.explore = tf.Variable(
        self.config["explore"], trainable=False, dtype=tf.bool)

    self._loss_initialized = False
    # Backward compatibility workaround so Policy will call self.loss()
    # directly.
    # TODO(jungong): clean up after all policies are migrated to new
    # sub-class implementation.
    self._loss = None

    self.batch_divisibility_req = self.get_batch_divisibility_req()
    self._max_seq_len = config["model"]["max_seq_len"]

    self.validate_spaces(observation_space, action_space, config)

    # If using default make_model(), dist_class will get updated when
    # the model is created next.
    self.dist_class = self._init_dist_class()
    self.model = self.make_model()

    self._init_view_requirements()

    self.exploration = self._create_exploration()
    self._state_inputs = self.model.get_initial_state()
    self._is_recurrent = len(self._state_inputs) > 0

    # Got to reset global_timestep again after fake run-throughs.
    self.global_timestep.assign(0)

    # Lock used for locking some methods on the object-level.
    # This prevents possible race conditions when calling the model
    # first, then its value function (e.g. in a loss function), in
    # between of which another model call is made (e.g. to compute an
    # action).
    self._lock = threading.RLock()

    # Only for `config.eager_tracing=True`: A counter to keep track of
    # how many times an eager-traced method (e.g.
    # `self._compute_actions_helper`) has been re-traced by tensorflow.
    # We will raise an error if more than n re-tracings have been
    # detected, since this would considerably slow down execution.
    # The variable below should only get incremented during the
    # tf.function trace operations, never when calling the already
    # traced function after that.
    self._re_trace_counter = 0
def execution_plan(workers: WorkerSet, config: TrainerConfigDict) -> LocalIterator[dict]: """Execution plan of the DQN algorithm. Defines the distributed dataflow. Args: workers (WorkerSet): The WorkerSet for training the Polic(y/ies) of the Trainer. config (TrainerConfigDict): The trainer's configuration dict. Returns: LocalIterator[dict]: A local iterator over training metrics. """ if config.get("prioritized_replay"): prio_args = { "prioritized_replay_alpha": config["prioritized_replay_alpha"], "prioritized_replay_beta": config["prioritized_replay_beta"], "prioritized_replay_eps": config["prioritized_replay_eps"], } else: prio_args = {} local_replay_buffer = LocalReplayBuffer( num_shards=1, learning_starts=config["learning_starts"], buffer_size=config["buffer_size"], replay_batch_size=config["train_batch_size"], replay_mode=config["multiagent"]["replay_mode"], replay_sequence_length=config.get("replay_sequence_length", 1), replay_burn_in=config.get("burn_in", 0), replay_zero_init_states=config.get("zero_init_states", True), **prio_args) rollouts = ParallelRollouts(workers, mode="bulk_sync") # We execute the following steps concurrently: # (1) Generate rollouts and store them in our local replay buffer. Calling # next() on store_op drives this. store_op = rollouts.for_each( StoreToReplayBuffer(local_buffer=local_replay_buffer)) def update_prio(item): samples, info_dict = item if config.get("prioritized_replay"): prio_dict = {} for policy_id, info in info_dict.items(): # TODO(sven): This is currently structured differently for # torch/tf. Clean up these results/info dicts across # policies (note: fixing this in torch_policy.py will # break e.g. DDPPO!). td_error = info.get("td_error", info[LEARNER_STATS_KEY].get("td_error")) samples.policy_batches[policy_id].set_get_interceptor(None) prio_dict[policy_id] = ( samples.policy_batches[policy_id].get("batch_indexes"), td_error) local_replay_buffer.update_priorities(prio_dict) return info_dict # (2) Read and train on experiences from the replay buffer. Every batch # returned from the LocalReplay() iterator is passed to TrainOneStep to # take a SGD step, and then we decide whether to update the target network. post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b) if config["simple_optimizer"]: train_step_op = TrainOneStep(workers) else: train_step_op = TrainTFMultiGPU( workers=workers, sgd_minibatch_size=config["train_batch_size"], num_sgd_iter=1, num_gpus=config["num_gpus"], shuffle_sequences=True, _fake_gpus=config["_fake_gpus"], framework=config.get("framework")) replay_op = Replay(local_buffer=local_replay_buffer) \ .for_each(lambda x: post_fn(x, workers, config)) \ .for_each(train_step_op) \ .for_each(update_prio) \ .for_each(UpdateTargetNetwork( workers, config["target_network_update_freq"])) # Alternate deterministically between (1) and (2). Only return the output # of (2) since training metrics are not available until (2) runs. train_op = Concurrently([store_op, replay_op], mode="round_robin", output_indexes=[1], round_robin_weights=calculate_rr_weights(config)) return StandardMetricsReporting(train_op, workers, config)