# Three-phase imitation learning: pure teacher forcing, a linear anneal of
# teacher forcing to zero, then imitation without teacher forcing.
def training_pipeline(self, **kwargs):
    training_steps = int(300000000)
    tf_steps = int(5e6)
    anneal_steps = int(5e6)
    il_no_tf_steps = training_steps - tf_steps - anneal_steps
    assert il_no_tf_steps > 0

    lr = 3e-4
    num_mini_batch = 2 if torch.cuda.is_available() else 1
    update_repeats = 4
    num_steps = 30
    save_interval = 5000000
    log_interval = 10000
    gamma = 0.99
    use_gae = True
    gae_lambda = 0.95
    max_grad_norm = 0.5

    return TrainingPipeline(
        save_interval=save_interval,
        metric_accumulate_interval=log_interval,
        optimizer_builder=Builder(optim.Adam, dict(lr=lr)),
        num_mini_batch=num_mini_batch,
        update_repeats=update_repeats,
        max_grad_norm=max_grad_norm,
        num_steps=num_steps,
        named_losses={"imitation_loss": Imitation()},
        gamma=gamma,
        use_gae=use_gae,
        gae_lambda=gae_lambda,
        advance_scene_rollout_period=self.ADVANCE_SCENE_ROLLOUT_PERIOD,
        pipeline_stages=[
            # Stage 1: always follow the expert (teacher forcing fixed at 1.0).
            PipelineStage(
                loss_names=["imitation_loss"],
                max_stage_steps=tf_steps,
                teacher_forcing=LinearDecay(startp=1.0, endp=1.0, steps=tf_steps),
            ),
            # Stage 2: anneal teacher forcing to 0, then keep training with the
            # imitation loss alone for the remaining steps.
            PipelineStage(
                loss_names=["imitation_loss"],
                max_stage_steps=anneal_steps + il_no_tf_steps,
                teacher_forcing=LinearDecay(startp=1.0, endp=0.0, steps=anneal_steps),
            ),
        ],
        lr_scheduler_builder=Builder(
            LambdaLR, {"lr_lambda": LinearDecay(steps=training_steps)}
        ),
    )
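
# NOTE: the snippets in this section are excerpted from separate AllenAct
# experiment configs, so each defines its own training_pipeline. They assume
# imports along the following lines; the module paths below follow AllenAct's
# layout but should be treated as assumptions, not verbatim sources:
import os

import torch
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR

from allenact.algorithms.onpolicy_sync.losses import PPO
from allenact.algorithms.onpolicy_sync.losses.ppo import PPOConfig
from allenact.algorithms.onpolicy_sync.losses.imitation import Imitation
from allenact.algorithms.onpolicy_sync.losses.grouped_action_imitation import (
    GroupedActionImitation,
)
from allenact.utils.experiment_utils import (
    Builder,
    LinearDecay,
    OffPolicyPipelineComponent,
    PipelineStage,
    TrainingPipeline,
)
from allenact_plugins.babyai_plugin.babyai_constants import (
    BABYAI_EXPERT_TRAJECTORIES_DIR,
)
from allenact_plugins.minigrid_plugin.minigrid_offpolicy import (
    MiniGridOffPolicyExpertCELoss,
    create_minigrid_offpolicy_data_iterator,
)
from allenact_plugins.robothor_plugin import robothor_constants
from allenact_plugins.robothor_plugin.robothor_tasks import ObjectNavTask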

# DAgger-style warm start (imitation with teacher forcing annealed from 1 to 0
# over 1e4 steps) followed by 1e6 steps of PPO.
def training_pipeline(cls, **kwargs):
    dagger_steps = int(1e4)
    ppo_steps = int(1e6)
    lr = 2.5e-4
    num_mini_batch = 6 if torch.cuda.is_available() else 2
    update_repeats = 4
    num_steps = 128
    metric_accumulate_interval = cls.MAX_STEPS * 10  # Log every 10 max-length tasks.
    save_interval = 10000
    gamma = 0.99
    use_gae = True
    gae_lambda = 1.0
    max_grad_norm = 0.5

    return TrainingPipeline(
        save_interval=save_interval,
        metric_accumulate_interval=metric_accumulate_interval,
        optimizer_builder=Builder(optim.Adam, dict(lr=lr)),
        num_mini_batch=num_mini_batch,
        update_repeats=update_repeats,
        max_grad_norm=max_grad_norm,
        num_steps=num_steps,
        named_losses={
            "ppo_loss": PPO(clip_decay=LinearDecay(ppo_steps), **PPOConfig),
            "imitation_loss": Imitation(),  # We add an imitation loss.
        },
        gamma=gamma,
        use_gae=use_gae,
        gae_lambda=gae_lambda,
        advance_scene_rollout_period=cls.ADVANCE_SCENE_ROLLOUT_PERIOD,
        pipeline_stages=[
            # Stage 1: imitation with teacher forcing annealed from 1 to 0.
            PipelineStage(
                loss_names=["imitation_loss"],
                teacher_forcing=LinearDecay(startp=1.0, endp=0.0, steps=dagger_steps),
                max_stage_steps=dagger_steps,
            ),
            # Stage 2: on-policy PPO.
            PipelineStage(loss_names=["ppo_loss"], max_stage_steps=ppo_steps),
        ],
        lr_scheduler_builder=Builder(
            LambdaLR, {"lr_lambda": LinearDecay(steps=ppo_steps)}
        ),
    )
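
# LinearDecay appears above in three roles: as a teacher-forcing schedule, as a
# PPO clip decay, and as the lr_lambda handed to LambdaLR. That works because
# (on my reading of AllenAct) it is just a callable mapping a step count to a
# linearly interpolated, clamped value. A standalone sketch of the assumed
# behavior, not AllenAct's actual implementation:
def linear_decay(step: int, steps: int, startp: float = 1.0, endp: float = 0.0) -> float:
    """Interpolate linearly from startp to endp over `steps`, clamping outside."""
    step = max(0, min(step, steps))
    return startp + (endp - startp) * (step / float(steps))

assert linear_decay(0, steps=10000) == 1.0
assert linear_decay(5000, steps=10000) == 0.5
assert linear_decay(20000, steps=10000) == 0.0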

# Pure PPO for 2.5e8 steps.
def training_pipeline(cls, **kwargs):
    ppo_steps = int(250000000)
    lr = 3e-4
    num_mini_batch = 1
    update_repeats = 3
    num_steps = 30
    save_interval = 5000000
    log_interval = 1000
    gamma = 0.99
    use_gae = True
    gae_lambda = 0.95
    max_grad_norm = 0.5

    return TrainingPipeline(
        save_interval=save_interval,
        metric_accumulate_interval=log_interval,
        optimizer_builder=Builder(optim.Adam, dict(lr=lr)),
        num_mini_batch=num_mini_batch,
        update_repeats=update_repeats,
        max_grad_norm=max_grad_norm,
        num_steps=num_steps,
        named_losses={"ppo_loss": PPO(**PPOConfig)},
        gamma=gamma,
        use_gae=use_gae,
        gae_lambda=gae_lambda,
        advance_scene_rollout_period=cls.ADVANCE_SCENE_ROLLOUT_PERIOD,
        pipeline_stages=[
            PipelineStage(loss_names=["ppo_loss"], max_stage_steps=ppo_steps)
        ],
        lr_scheduler_builder=Builder(
            LambdaLR, {"lr_lambda": LinearDecay(steps=ppo_steps)}
        ),
    )

# Pure imitation with teacher forcing fixed at 1.0 for all of training; the
# mini-batch and update-repeat counts take the minimum over the PPO and
# imitation defaults.
def training_pipeline(cls, **kwargs):
    total_train_steps = cls.TOTAL_IL_TRAIN_STEPS
    ppo_info = cls.rl_loss_default("ppo", steps=-1)
    imitation_info = cls.rl_loss_default("imitation")

    return cls._training_pipeline(
        named_losses={"imitation_loss": imitation_info["loss"]},
        pipeline_stages=[
            PipelineStage(
                loss_names=["imitation_loss"],
                teacher_forcing=LinearDecay(
                    startp=1.0, endp=1.0, steps=total_train_steps
                ),
                max_stage_steps=total_train_steps,
            ),
        ],
        num_mini_batch=min(
            info["num_mini_batch"] for info in [ppo_info, imitation_info]
        ),
        update_repeats=min(
            info["update_repeats"] for info in [ppo_info, imitation_info]
        ),
        total_train_steps=total_train_steps,
    )

# Minimal PPO pipeline; the PPO loss is supplied via a Builder rather than as
# an instance (see the sketch after this function).
def training_pipeline(cls, **kwargs):
    ppo_steps = int(1e6)
    return TrainingPipeline(
        save_interval=200000,
        metric_accumulate_interval=1,
        optimizer_builder=Builder(optim.Adam, dict(lr=3e-4)),
        num_mini_batch=2,
        update_repeats=3,
        max_grad_norm=0.5,
        num_steps=30,
        named_losses={"ppo_loss": Builder(PPO, kwargs={}, default=PPOConfig)},
        gamma=0.99,
        use_gae=True,
        gae_lambda=0.95,
        advance_scene_rollout_period=cls.ADVANCE_SCENE_ROLLOUT_PERIOD,
        pipeline_stages=[
            PipelineStage(loss_names=["ppo_loss"], max_stage_steps=ppo_steps)
        ],
        lr_scheduler_builder=Builder(
            LambdaLR, {"lr_lambda": LinearDecay(steps=ppo_steps)}
        ),
    )
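
# Builder defers construction: it stores a class plus kwargs and instantiates
# on call, merging in defaults. A rough sketch of the assumed semantics (the
# real allenact.utils.experiment_utils.Builder is more featureful;
# BuilderSketch is a hypothetical stand-in, not AllenAct code):
from typing import Any, Callable, Dict, Optional

class BuilderSketch:
    def __init__(
        self,
        class_type: Callable[..., Any],
        kwargs: Optional[Dict[str, Any]] = None,
        default: Optional[Dict[str, Any]] = None,
    ):
        self.class_type = class_type
        self.kwargs = kwargs or {}
        self.default = default or {}

    def __call__(self, **overrides: Any) -> Any:
        # Later-specified kwargs win: defaults < stored kwargs < call-site overrides.
        return self.class_type(**{**self.default, **self.kwargs, **overrides})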

# PPO plus a grouped-action imitation loss: the expert supervises only the
# binary choice between END and all other actions.
def training_pipeline(self, **kwargs):
    ppo_steps = int(300000000)
    lr = 3e-4
    num_mini_batch = 1
    update_repeats = 4
    num_steps = 128
    save_interval = 5000000
    log_interval = 10000
    gamma = 0.99
    use_gae = True
    gae_lambda = 0.95
    max_grad_norm = 0.5

    # Split the action space into {END} and everything else.
    action_strs = ObjectNavTask.class_action_names()
    non_end_action_inds_set = {
        i for i, a in enumerate(action_strs) if a != robothor_constants.END
    }
    end_action_ind_set = {action_strs.index(robothor_constants.END)}

    return TrainingPipeline(
        save_interval=save_interval,
        metric_accumulate_interval=log_interval,
        optimizer_builder=Builder(optim.Adam, dict(lr=lr)),
        num_mini_batch=num_mini_batch,
        update_repeats=update_repeats,
        max_grad_norm=max_grad_norm,
        num_steps=num_steps,
        named_losses={
            "ppo_loss": PPO(**PPOConfig),
            "grouped_action_imitation": GroupedActionImitation(
                nactions=len(action_strs),
                action_groups=[non_end_action_inds_set, end_action_ind_set],
            ),
        },
        gamma=gamma,
        use_gae=use_gae,
        gae_lambda=gae_lambda,
        advance_scene_rollout_period=self.ADVANCE_SCENE_ROLLOUT_PERIOD,
        pipeline_stages=[
            PipelineStage(
                loss_names=["ppo_loss", "grouped_action_imitation"],
                max_stage_steps=ppo_steps,
            )
        ],
        lr_scheduler_builder=Builder(
            LambdaLR, {"lr_lambda": LinearDecay(steps=ppo_steps)}
        ),
    )

# Pure off-policy imitation: a single stage with no on-policy losses, training
# a cross-entropy loss against stored BabyAI expert trajectories.
def training_pipeline(cls, **kwargs):
    total_train_steps = cls.TOTAL_IL_TRAIN_STEPS
    ppo_info = cls.rl_loss_default("ppo", steps=-1)

    num_mini_batch = ppo_info["num_mini_batch"]
    update_repeats = ppo_info["update_repeats"]

    return cls._training_pipeline(
        named_losses={
            "offpolicy_expert_ce_loss": MiniGridOffPolicyExpertCELoss(
                total_episodes_in_epoch=int(1e6)
            ),
        },
        pipeline_stages=[
            # Single stage, only with off-policy training.
            PipelineStage(
                loss_names=[],  # no on-policy losses
                max_stage_steps=total_train_steps,  # keep sampling episodes in the stage
                # Enable off-policy training:
                offpolicy_component=OffPolicyPipelineComponent(
                    # Pass a method to instantiate data iterators.
                    data_iterator_builder=lambda **extra_kwargs: create_minigrid_offpolicy_data_iterator(
                        path=os.path.join(
                            BABYAI_EXPERT_TRAJECTORIES_DIR,
                            "BabyAI-GoToLocal-v0{}.pkl".format(
                                "" if torch.cuda.is_available() else "-small"
                            ),
                        ),
                        nrollouts=cls.NUM_TRAIN_SAMPLERS // num_mini_batch,  # per-trainer batch size
                        rollout_len=cls.ROLLOUT_STEPS,
                        instr_len=cls.INSTR_LEN,
                        **extra_kwargs,
                    ),
                    loss_names=["offpolicy_expert_ce_loss"],  # off-policy losses
                    updates=num_mini_batch * update_repeats,  # number of batches per rollout
                ),
            ),
        ],
        # As we don't have any on-policy losses, we set the next two values to
        # zero to ensure we don't attempt to compute gradients for on-policy
        # rollouts:
        num_mini_batch=0,
        update_repeats=0,
        total_train_steps=total_train_steps,
    )

# Pure on-policy PPO built from the shared defaults.
def training_pipeline(cls, **kwargs):
    total_train_steps = cls.TOTAL_RL_TRAIN_STEPS
    ppo_info = cls.rl_loss_default("ppo", steps=total_train_steps)

    return cls._training_pipeline(
        named_losses={"ppo_loss": ppo_info["loss"]},
        pipeline_stages=[
            PipelineStage(
                loss_names=["ppo_loss"],
                max_stage_steps=total_train_steps,
            ),
        ],
        num_mini_batch=ppo_info["num_mini_batch"],
        update_repeats=ppo_info["update_repeats"],
        total_train_steps=total_train_steps,
    )

# As the off-policy expert cross-entropy pipeline above, but the per-epoch
# episode budget is split across training GPUs and per-worker iterator kwargs
# are threaded through data_iterator_kwargs_generator.
def training_pipeline(cls, **kwargs):
    total_train_steps = cls.TOTAL_IL_TRAIN_STEPS
    ppo_info = cls.rl_loss_default("ppo", steps=-1)

    num_mini_batch = ppo_info["num_mini_batch"]
    update_repeats = ppo_info["update_repeats"]

    return cls._training_pipeline(
        named_losses={
            "offpolicy_expert_ce_loss": MiniGridOffPolicyExpertCELoss(
                total_episodes_in_epoch=int(1e6)
                // len(cls.machine_params("train")["gpu_ids"])
            ),
        },
        pipeline_stages=[
            PipelineStage(
                loss_names=[],
                max_stage_steps=total_train_steps,
                offpolicy_component=OffPolicyPipelineComponent(
                    data_iterator_builder=lambda **kwargs: create_minigrid_offpolicy_data_iterator(
                        path=os.path.join(
                            BABYAI_EXPERT_TRAJECTORIES_DIR,
                            "BabyAI-GoToLocal-v0{}.pkl".format(
                                "" if torch.cuda.is_available() else "-small"
                            ),
                        ),
                        nrollouts=cls.NUM_TRAIN_SAMPLERS // num_mini_batch,
                        rollout_len=cls.ROLLOUT_STEPS,
                        instr_len=cls.INSTR_LEN,
                        **kwargs,
                    ),
                    data_iterator_kwargs_generator=cls.expert_ce_loss_kwargs_generator,
                    loss_names=["offpolicy_expert_ce_loss"],
                    updates=num_mini_batch * update_repeats,
                ),
            ),
        ],
        num_mini_batch=0,
        update_repeats=0,
        total_train_steps=total_train_steps,
    )

# Imitation learning with teacher forcing annealed from 1 to 0 over the first
# half of training, then plain imitation for the second half.
def training_pipeline(cls, **kwargs):
    total_train_steps = cls.TOTAL_IL_TRAIN_STEPS
    loss_info = cls.rl_loss_default("imitation")

    return cls._training_pipeline(
        named_losses={"imitation_loss": loss_info["loss"]},
        pipeline_stages=[
            PipelineStage(
                loss_names=["imitation_loss"],
                teacher_forcing=LinearDecay(
                    startp=1.0, endp=0.0, steps=total_train_steps // 2
                ),
                max_stage_steps=total_train_steps,
            )
        ],
        num_mini_batch=loss_info["num_mini_batch"],
        update_repeats=loss_info["update_repeats"],
        total_train_steps=total_train_steps,
    )

# A small PPO pipeline (1.5e5 steps).
def training_pipeline(cls, **kwargs) -> TrainingPipeline:
    ppo_steps = int(150000)
    return TrainingPipeline(
        named_losses=dict(ppo_loss=PPO(**PPOConfig)),  # type:ignore
        pipeline_stages=[
            PipelineStage(loss_names=["ppo_loss"], max_stage_steps=ppo_steps)
        ],
        optimizer_builder=Builder(optim.Adam, dict(lr=1e-4)),
        num_mini_batch=4,
        update_repeats=3,
        max_grad_norm=0.5,
        num_steps=16,
        gamma=0.99,
        use_gae=True,
        gae_lambda=0.95,
        advance_scene_rollout_period=None,
        save_interval=10000,
        metric_accumulate_interval=1,
        lr_scheduler_builder=Builder(
            LambdaLR, {"lr_lambda": LinearDecay(steps=ppo_steps)}  # type:ignore
        ),
    )
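
# For context: each training_pipeline above lives on an AllenAct
# ExperimentConfig subclass, and the trainer calls it to drive optimization.
# A minimal sketch, assuming AllenAct's ExperimentConfig interface;
# MyPPOExperiment is a hypothetical name, and the remaining required methods
# (create_model, make_sampler_fn, task-sampler args, machine_params, etc.)
# are omitted here but mandatory in a real config:
from allenact.base_abstractions.experiment_config import ExperimentConfig

class MyPPOExperiment(ExperimentConfig):
    ADVANCE_SCENE_ROLLOUT_PERIOD = None  # referenced by several pipelines above

    @classmethod
    def tag(cls) -> str:
        return "MyPPOExperiment"

    @classmethod
    def training_pipeline(cls, **kwargs) -> TrainingPipeline:
        # Any of the pipelines defined above could be returned here.
        ...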