def _training_pipeline_info(cls, **kwargs) -> Dict[str, Any]:
    """Define how the model trains."""

    training_steps = cls.TRAINING_STEPS
    return dict(
        named_losses=dict(
            ppo_loss=PPO(clip_decay=LinearDecay(training_steps), **PPOConfig)
        ),
        pipeline_stages=[
            PipelineStage(loss_names=["ppo_loss"], max_stage_steps=training_steps)
        ],
        num_steps=64,
        num_mini_batch=1,
        update_repeats=3,
        use_lr_decay=True,
        lr=3e-4,
    )
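# ---------------------------------------------------------------------------
# The snippets in this section assume the usual AllenAct / PyTorch imports
# shown below. The exact module paths may differ between allenact versions,
# so treat this block as an assumption rather than part of the original
# experiment configs.
# ---------------------------------------------------------------------------
import math
from typing import Any, Dict, cast

import torch
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR

from allenact.algorithms.onpolicy_sync.losses import PPO
from allenact.algorithms.onpolicy_sync.losses.ppo import PPOConfig
from allenact.utils.experiment_utils import (
    Builder,
    LinearDecay,
    PipelineStage,
    TrainingPipeline,
)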
def training_pipeline(self, **kwargs):
    # PPO
    ppo_steps = int(75000000)
    lr = 3e-4
    num_mini_batch = 1
    update_repeats = 4
    num_steps = 128
    save_interval = 5000000
    log_interval = 10000 if torch.cuda.is_available() else 1
    gamma = 0.99
    use_gae = True
    gae_lambda = 0.95
    max_grad_norm = 0.5
    PPOConfig["normalize_advantage"] = self.NORMALIZE_ADVANTAGE

    named_losses = {"ppo_loss": (PPO(**PPOConfig), 1.0)}
    named_losses = self._update_with_auxiliary_losses(named_losses)

    return TrainingPipeline(
        save_interval=save_interval,
        metric_accumulate_interval=log_interval,
        optimizer_builder=Builder(optim.Adam, dict(lr=lr)),
        num_mini_batch=num_mini_batch,
        update_repeats=update_repeats,
        max_grad_norm=max_grad_norm,
        num_steps=num_steps,
        named_losses={key: val[0] for key, val in named_losses.items()},
        gamma=gamma,
        use_gae=use_gae,
        gae_lambda=gae_lambda,
        advance_scene_rollout_period=self.ADVANCE_SCENE_ROLLOUT_PERIOD,
        pipeline_stages=[
            PipelineStage(
                loss_names=list(named_losses.keys()),
                max_stage_steps=ppo_steps,
                loss_weights=[val[1] for val in named_losses.values()],
            )
        ],
        lr_scheduler_builder=Builder(
            LambdaLR, {"lr_lambda": LinearDecay(steps=ppo_steps)}
        ),
    )
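# A minimal sketch of what the LR schedule above is assumed to do:
# LinearDecay(steps=ppo_steps) maps the current step to a multiplier falling
# linearly from 1.0 to 0.0, and LambdaLR applies that multiplier on top of
# the base learning rate (3e-4 here). The same LinearDecay pattern drives
# clip_decay in the other snippets.
decay = LinearDecay(steps=75000000)
for step in (0, 37500000, 75000000):
    print(step, 3e-4 * decay(step))  # ~3.0e-4, ~1.5e-4, ~0.0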
def training_pipeline(cls, **kwargs):
    ppo_steps = int(1e6)
    lr = 2.5e-4
    num_mini_batch = 2 if not torch.cuda.is_available() else 6
    update_repeats = 4
    num_steps = 128
    metric_accumulate_interval = cls.MAX_STEPS * 10  # Log every 10 max length tasks
    save_interval = 10000
    gamma = 0.99
    use_gae = True
    gae_lambda = 1.0
    max_grad_norm = 0.5

    return TrainingPipeline(
        save_interval=save_interval,
        metric_accumulate_interval=metric_accumulate_interval,
        optimizer_builder=Builder(optim.Adam, dict(lr=lr)),
        num_mini_batch=num_mini_batch,
        update_repeats=update_repeats,
        max_grad_norm=max_grad_norm,
        num_steps=num_steps,
        named_losses={
            "ppo_loss": PPO(clip_decay=LinearDecay(ppo_steps), **PPOConfig),
        },
        gamma=gamma,
        use_gae=use_gae,
        gae_lambda=gae_lambda,
        advance_scene_rollout_period=cls.ADVANCE_SCENE_ROLLOUT_PERIOD,
        pipeline_stages=[
            PipelineStage(loss_names=["ppo_loss"], max_stage_steps=ppo_steps),
        ],
        lr_scheduler_builder=Builder(
            LambdaLR, {"lr_lambda": LinearDecay(steps=ppo_steps)}
        ),
    )
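# Hedged arithmetic for the GPU settings above (the sampler count is a
# hypothetical example, not part of the original config): with 6 samplers,
# num_steps=128, num_mini_batch=6 and update_repeats=4, each rollout collects
# 6 * 128 = 768 transitions, which are split into 6 mini-batches and reused
# for 4 passes, i.e. 24 gradient updates per collected rollout.
num_samplers = 6
transitions_per_rollout = num_samplers * 128  # 768
updates_per_rollout = 6 * 4                   # num_mini_batch * update_repeats = 24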
class MaskedPPO(AbstractActorCriticLoss):
    """Compute the PPO loss where specified by a mask.

    # Attributes

    mask_uuid : A string specifying the sensor UUID to use for masking. The
        PPO loss will only be computed for those steps where this mask
        equals 1.
    """

    def __init__(
        self, mask_uuid: str, ppo_params: Dict[str, Any],
    ):
        """Initializer.

        # Parameters

        mask_uuid : A string specifying the sensor UUID to use for masking.
            The PPO loss will only be computed for those steps where this
            mask equals 1.
        ppo_params : A dictionary containing keyword arguments for the PPO
            loss. See the `PPO` class for what arguments are available.
        """
        super().__init__()
        self.mask_uuid = mask_uuid
        self._ppo_loss = PPO(**ppo_params)

    def loss(  # type: ignore
        self,
        step_count: int,
        batch: ObservationType,
        actor_critic_output: ActorCriticOutput[CategoricalDistr],
        *args,
        **kwargs,
    ):
        # Binary mask (1 where the PPO loss applies) and the number of masked steps
        mask = batch["observations"][self.mask_uuid].float()
        denominator = mask.sum().item()

        losses_per_step, _ = self._ppo_loss.loss_per_step(
            step_count=step_count,
            batch=batch,
            actor_critic_output=actor_critic_output,
        )
        # Add a trailing dimension to the entropy term so it broadcasts
        # against the mask like the other per-step losses
        losses_per_step["entropy"] = (
            losses_per_step["entropy"][0].unsqueeze(-1),
            losses_per_step["entropy"][1],
        )
        # Average each per-step loss over the masked steps only
        losses = {
            key: ((loss * mask).sum() / max(denominator, 1), weight)
            for (key, (loss, weight)) in losses_per_step.items()
        }

        total_loss = sum(
            loss * weight if weight is not None else loss
            for loss, weight in losses.values()
        )

        # If no steps are masked in, the loss is zero and nothing is recorded
        if denominator == 0:
            losses_to_record = {}
        else:
            losses_to_record = {
                "ppo_total": cast(torch.Tensor, total_loss).item(),
                **{key: loss.item() for key, (loss, _) in losses.items()},
            }

        return (
            total_loss,
            losses_to_record,
        )
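# A usage sketch (not from the original source): MaskedPPO can be dropped
# into a training pipeline's named_losses like any other loss. The sensor
# UUID "expert_action_mask" is a hypothetical example; it must match a sensor
# that produces a 0/1 mask in the observations.
masked_ppo_loss = MaskedPPO(
    mask_uuid="expert_action_mask",
    ppo_params=dict(**PPOConfig),
)
# e.g. named_losses={"masked_ppo_loss": masked_ppo_loss} together with a
# PipelineStage(loss_names=["masked_ppo_loss"], ...) in pipeline_stages.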
def training_pipeline(self, **kwargs):
    # These params are identical to the baseline configuration for 60 samplers (1 machine)
    ppo_steps = int(300e6)
    lr = 3e-4
    num_mini_batch = 1
    update_repeats = 4
    num_steps = 128
    save_interval = 5000000
    log_interval = 10000 if torch.cuda.is_available() else 1
    gamma = 0.99
    use_gae = True
    gae_lambda = 0.95
    max_grad_norm = 0.5

    # We add 30 million steps for small-batch learning
    small_batch_steps = int(30e6)
    # And a short transition phase towards the large learning rate
    # (see the comment in the `lr_scheduler` helper method)
    transition_steps = int(2 / 3 * self.distributed_nodes * 1e6)

    # Find the exact number of samplers per GPU
    assert (
        self.num_train_processes % len(self.train_gpu_ids) == 0
    ), "Expected uniform number of samplers per GPU"
    samplers_per_gpu = self.num_train_processes // len(self.train_gpu_ids)

    # Multiply num_mini_batch by the largest divisor of
    # samplers_per_gpu to keep all batches of the same size:
    num_mini_batch_multiplier = [
        i
        for i in reversed(
            range(1, min(samplers_per_gpu // 2, self.distributed_nodes) + 1)
        )
        if samplers_per_gpu % i == 0
    ][0]

    # Multiply update_repeats so that the product of this factor and
    # num_mini_batch_multiplier is >= self.distributed_nodes:
    update_repeats_multiplier = int(
        math.ceil(self.distributed_nodes / num_mini_batch_multiplier)
    )

    return TrainingPipeline(
        save_interval=save_interval,
        metric_accumulate_interval=log_interval,
        optimizer_builder=Builder(optim.Adam, dict(lr=lr)),
        num_mini_batch=num_mini_batch,
        update_repeats=update_repeats,
        max_grad_norm=max_grad_norm,
        num_steps=num_steps,
        named_losses={"ppo_loss": PPO(**PPOConfig, show_ratios=False)},
        gamma=gamma,
        use_gae=use_gae,
        gae_lambda=gae_lambda,
        advance_scene_rollout_period=self.ADVANCE_SCENE_ROLLOUT_PERIOD,
        pipeline_stages=[
            # We increase the number of batches for the first stage to reach an
            # equivalent number of updates per collected rollout as in the
            # 1 node / 60 samplers setting
            PipelineStage(
                loss_names=["ppo_loss"],
                max_stage_steps=small_batch_steps,
                num_mini_batch=num_mini_batch * num_mini_batch_multiplier,
                update_repeats=update_repeats * update_repeats_multiplier,
            ),
            # Then we proceed with the base configuration (leading to larger
            # batches due to the increased number of samplers)
            PipelineStage(
                loss_names=["ppo_loss"],
                max_stage_steps=ppo_steps - small_batch_steps,
            ),
        ],
        # We use the MultiLinearDecay curve defined by the helper function,
        # setting the learning-rate scaling to the square root of the number
        # of nodes. Linear scaling might also work, but we leave that check
        # to the reader.
        lr_scheduler_builder=Builder(
            LambdaLR,
            {
                "lr_lambda": self.lr_scheduler(
                    small_batch_steps=small_batch_steps,
                    transition_steps=transition_steps,
                    ppo_steps=ppo_steps,
                    lr_scaling=math.sqrt(self.distributed_nodes),
                )
            },
        ),
    )
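# A sketch of the `lr_scheduler` helper referenced above, assuming allenact's
# MultiLinearDecay (which chains LinearDecay segments). The breakpoints below
# are illustrative, not necessarily those of the original helper: decay during
# the small-batch stage, a short ramp up to the scaled rate, then decay back
# to zero over the remaining large-batch steps.
from allenact.utils.experiment_utils import MultiLinearDecay  # assumed path

def lr_scheduler(small_batch_steps, transition_steps, ppo_steps, lr_scaling):
    # Multiplier reached at the end of the small-batch stage (on the linear path to 0)
    break1 = 1.0 - small_batch_steps / ppo_steps
    # Scaled multiplier at which the large-batch stage starts
    break2 = lr_scaling * (1.0 - (small_batch_steps + transition_steps) / ppo_steps)
    return MultiLinearDecay(
        [
            LinearDecay(steps=small_batch_steps, startp=1.0, endp=break1),
            LinearDecay(steps=transition_steps, startp=break1, endp=break2),
            LinearDecay(
                steps=ppo_steps - small_batch_steps - transition_steps,
                startp=break2,
                endp=0.0,
            ),
        ]
    )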