    @classmethod
    def training_pipeline(cls, **kwargs):
        total_train_steps = cls.TOTAL_IL_TRAIN_STEPS
        ppo_info = cls.rl_loss_default("ppo", steps=-1)

        num_mini_batch = ppo_info["num_mini_batch"]
        update_repeats = ppo_info["update_repeats"]

        # fmt: off
        return cls._training_pipeline(
            named_losses={
                "offpolicy_expert_ce_loss": MiniGridOffPolicyExpertCELoss(
                    total_episodes_in_epoch=int(1e6)
                ),
            },
            pipeline_stages=[
                # Single stage, only with off-policy training
                PipelineStage(
                    loss_names=[],  # no on-policy losses
                    max_stage_steps=total_train_steps,  # keep sampling episodes in the stage
                    # Enable off-policy training:
                    offpolicy_component=OffPolicyPipelineComponent(
                        # Pass a method to instantiate data iterators
                        data_iterator_builder=lambda **extra_kwargs: create_minigrid_offpolicy_data_iterator(
                            path=os.path.join(
                                BABYAI_EXPERT_TRAJECTORIES_DIR,
                                "BabyAI-GoToLocal-v0{}.pkl".format(
                                    "" if torch.cuda.is_available() else "-small"
                                ),
                            ),
                            nrollouts=cls.NUM_TRAIN_SAMPLERS // num_mini_batch,  # per trainer batch size
                            rollout_len=cls.ROLLOUT_STEPS,
                            instr_len=cls.INSTR_LEN,
                            **extra_kwargs,
                        ),
                        loss_names=["offpolicy_expert_ce_loss"],  # off-policy losses
                        updates=num_mini_batch * update_repeats,  # number of batches per rollout
                    ),
                ),
            ],
            # As we don't have any on-policy losses, we set the next
            # two values to zero to ensure we don't attempt to
            # compute gradients for on-policy rollouts:
            num_mini_batch=0,
            update_repeats=0,
            total_train_steps=total_train_steps,
        )
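
The "per trainer batch size" and "number of batches per rollout" values above are derived from the PPO defaults returned by rl_loss_default. A minimal sketch of that arithmetic, using hypothetical values for NUM_TRAIN_SAMPLERS, num_mini_batch, and update_repeats (in the snippet the real values come from cls and ppo_info):

# Hypothetical values for illustration only; the real ones come from
# cls.NUM_TRAIN_SAMPLERS and cls.rl_loss_default("ppo", steps=-1).
NUM_TRAIN_SAMPLERS = 128
num_mini_batch = 4
update_repeats = 4

# Expert rollouts per off-policy batch handed to each trainer:
nrollouts = NUM_TRAIN_SAMPLERS // num_mini_batch  # 32
# Off-policy gradient updates performed per on-policy rollout:
updates = num_mini_batch * update_repeats  # 16

print(nrollouts, updates)  # 32 16
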
Example #2
    @classmethod
    def training_pipeline(cls, **kwargs):
        total_train_steps = cls.TOTAL_IL_TRAIN_STEPS
        ppo_info = cls.rl_loss_default("ppo", steps=-1)

        num_mini_batch = ppo_info["num_mini_batch"]
        update_repeats = ppo_info["update_repeats"]

        return cls._training_pipeline(
            named_losses={
                "offpolicy_expert_ce_loss": MiniGridOffPolicyExpertCELoss(
                    # Split the epoch's episodes across the training GPUs
                    total_episodes_in_epoch=int(1e6)
                    // len(cls.machine_params("train")["gpu_ids"])
                ),
            },
            pipeline_stages=[
                PipelineStage(
                    loss_names=[],  # no on-policy losses
                    max_stage_steps=total_train_steps,
                    offpolicy_component=OffPolicyPipelineComponent(
                        data_iterator_builder=lambda **extra_kwargs: create_minigrid_offpolicy_data_iterator(
                            path=os.path.join(
                                BABYAI_EXPERT_TRAJECTORIES_DIR,
                                "BabyAI-GoToLocal-v0{}.pkl".format(
                                    "" if torch.cuda.is_available() else "-small"
                                ),
                            ),
                            nrollouts=cls.NUM_TRAIN_SAMPLERS // num_mini_batch,
                            rollout_len=cls.ROLLOUT_STEPS,
                            instr_len=cls.INSTR_LEN,
                            **extra_kwargs,
                        ),
                        # Produces extra kwargs for the data iterator builder
                        # (defined elsewhere on the experiment config class)
                        data_iterator_kwargs_generator=cls.expert_ce_loss_kwargs_generator,
                        loss_names=["offpolicy_expert_ce_loss"],
                        updates=num_mini_batch * update_repeats,
                    ),
                ),
            ],
            num_mini_batch=0,
            update_repeats=0,
            total_train_steps=total_train_steps,
        )
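
The main differences from the first example are the per-GPU split of total_episodes_in_epoch and the data_iterator_kwargs_generator hook. A minimal sketch of the split, with a hypothetical gpu_ids list (in the snippet the real list comes from cls.machine_params("train")):

# Hypothetical machine parameters for illustration; the real ones come from
# cls.machine_params("train") in the experiment config.
machine_params = {"gpu_ids": [0, 1, 2, 3]}

# Each trainer is assigned an equal slice of the one-million-episode epoch,
# so across all GPUs a full epoch still covers ~1e6 episodes.
total_episodes_in_epoch = int(1e6) // len(machine_params["gpu_ids"])
print(total_episodes_in_epoch)  # 250000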