Example #1
def test_reset_seed_no_op():
    """Test that the reset_seed function is a no-op when seed_everything() was not used."""
    assert "PL_GLOBAL_SEED" not in os.environ
    seed_before = torch.initial_seed()
    seed_utils.reset_seed()
    assert torch.initial_seed() == seed_before
    assert "PL_GLOBAL_SEED" not in os.environ
Example #2
    def setup_distributed(self):
        reset_seed()

        # determine which process we are and world size
        self.set_world_ranks()

        # set warning rank
        rank_zero_only.rank = self.global_rank

        # set up the server using proc 0's IP address
        # try to init at most 20 times in case ports are taken
        # where to store the ip_table
        self.init_ddp_connection()

        # on global rank 0, let everyone know training is starting
        if self.is_global_zero and not torch.distributed.is_initialized():
            log.info("-" * 100)
            log.info(f"distributed_backend={self.distributed_backend}")
            log.info(
                f"All DDP processes registered. Starting ddp with {self.world_size} processes"
            )
            log.info("-" * 100)

        # set the ranks and devices
        self.dist.rank = self.global_rank
        self.dist.device = self.root_device
Example #3
    def setup_distributed(self) -> None:
        reset_seed()

        # determine which process we are and world size
        self.set_world_ranks()

        self._init_bagua_distributed()
Example #4
    def _worker_setup(self, process_idx: int):
        reset_seed()
        self.set_world_ranks(process_idx)
        rank_zero_only.rank = self.global_rank
        init_dist_connection(self.cluster_environment,
                             self.torch_distributed_backend, self.global_rank,
                             self.world_size)
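Example #4 is the per-process half of a spawn-style launcher: each worker first calls reset_seed() so that every rank re-applies the seed recorded by seed_everything() in the parent process (PL_GLOBAL_SEED is inherited through the environment), and only then establishes its ranks and joins the process group. A hypothetical driver around such a hook might look like the following; everything except reset_seed()/seed_everything() is illustrative:

import os

import torch.multiprocessing as mp
from pytorch_lightning.utilities import seed as seed_utils


def _illustrative_worker(process_idx: int, world_size: int) -> None:
    # each spawned process re-applies the seed recorded in the parent
    seed_utils.reset_seed()
    print(f"worker {process_idx}/{world_size} uses seed", os.environ.get("PL_GLOBAL_SEED"))


if __name__ == "__main__":
    seed_utils.seed_everything(7)
    # torch.multiprocessing.spawn passes the process index as the first argument,
    # mirroring the process_idx parameter of _worker_setup above
    mp.spawn(_illustrative_worker, args=(2,), nprocs=2)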
Example #5
    def new_process(self, process_idx: int, trainer, mp_queue) -> None:
        self.mp_queue = mp_queue

        reset_seed()

        self.tpu_local_core_rank = xm.get_local_ordinal()
        self.tpu_global_core_rank = xm.get_ordinal()

        # set warning rank
        rank_zero_only.rank = self.global_rank

        if self.tpu_global_core_rank != 0 and trainer.progress_bar_callback is not None:
            trainer.progress_bar_callback.disable()

        self.model_to_device()
        trainer.accelerator.setup_optimizers(trainer)
        trainer.precision_plugin.connect(self._model, None, None)

        self.barrier("pre-run-stage")

        results = trainer.run_stage()

        self.transfer_distrib_spawn_state_on_fit_end(results)

        # https://github.com/pytorch/xla/issues/1801#issuecomment-602799542
        self.barrier("end-process")

        # https://github.com/pytorch/xla/issues/2190#issuecomment-641665358
        if self.local_rank == 0:
            time.sleep(2)
Example #6
    def _worker_setup(self, process_idx: int):
        reset_seed()
        self.set_world_ranks(process_idx)
        rank_zero_only.rank = self.global_rank
        self._process_group_backend = self._get_process_group_backend()
        init_dist_connection(self.cluster_environment,
                             self._process_group_backend, self.global_rank,
                             self.world_size)
Example #7
def test_reset_seed_everything():
    """ Test that we can reset the seed to the initial value set by seed_everything() """
    assert "PL_GLOBAL_SEED" not in os.environ
    seed_utils.seed_everything(123)
    assert os.environ["PL_GLOBAL_SEED"] == "123"
    before = torch.rand(1)
    seed_utils.reset_seed()
    after = torch.rand(1)
    assert torch.allclose(before, after)
Example #8
    def setup_environment(self) -> None:
        reset_seed()
        # set warning rank
        rank_zero_only.rank = self.global_rank
        self._process_group_backend = self._get_process_group_backend()
        assert self.cluster_environment is not None
        init_dist_connection(self.cluster_environment,
                             self._process_group_backend)
        super().setup_environment()
Example #9
    def setup_distributed(self):
        reset_seed()

        # determine which process we are and world size
        self.set_world_ranks()

        self._init_deepspeed_distributed()

        if not self._config_initialized:
            self._format_config()
            self._config_initialized = True
Example #10
    def setup_distributed(self):
        log.detail(f"{self.__class__.__name__}: setting up distributed...")
        reset_seed()

        # determine which process we are and world size
        self.set_world_ranks()

        # set warning rank
        rank_zero_only.rank = self.global_rank

        self._process_group_backend = self._get_process_group_backend()
        init_dist_connection(self.cluster_environment,
                             self._process_group_backend)
Example #11
    def setup_distributed(self):
        reset_seed()

        # determine which process we are and world size
        self.set_world_ranks()

        # set warning rank
        rank_zero_only.rank = self.global_rank

        # set up the server using proc 0's IP address
        # try to init at most 20 times in case ports are taken
        # where to store the ip_table
        init_dist_connection(self.cluster_environment,
                             self.torch_distributed_backend)
Example #12
    def new_process(self, process_idx, trainer, mp_queue):
        self.mp_queue = mp_queue

        reset_seed()

        self.set_world_ranks(process_idx)

        # set warning rank
        rank_zero_only.rank = self.global_rank  # type: ignore

        # set up the server using proc 0's IP address
        # try to init at most 20 times in case ports are taken
        # where to store the ip_table
        self.init_ddp_connection(self.global_rank, self.world_size)

        # TODO: we moved it to the trainer.fit after calling pre_dispatch
        #   ... need to double check that it is the correct place
        # self.trainer.call_setup_hook(self.model)

        # on global rank 0, let everyone know training is starting
        if self.is_global_zero and not torch.distributed.is_initialized():
            log.info("-" * 100)
            log.info(f"distributed_backend={self.distributed_backend}")
            log.info(f"All DDP processes registered. Starting ddp with"
                     "{self.world_size} processes")
            log.info("-" * 100)

        # set the ranks and devices
        self.dist.rank = self.global_rank
        self.dist.device = self.root_device

        if self.sync_batchnorm:
            self.model = self.configure_sync_batchnorm(self.model)

        self.configure_ddp()

        # This call is placed here so that we can temporarily stay on CPU while configuring DDP
        # and switch to ipex.DEVICE afterwards.
        # Move the model to the correct device.
        #
        # The reason for this placement is that tensors on the IPEX device cannot be stored directly;
        # an alternative would be to replace torch.save, as the IPEX accelerator does.
        self.model_to_device()

        self.barrier()
        results = trainer.run_stage()

        # persist info in ddp_spawn
        self.transfer_distrib_spawn_state_on_fit_end(results)
Example #13
def test_reset_seed_everything(workers):
    """Test that we can reset the seed to the initial value set by seed_everything()"""
    assert "PL_GLOBAL_SEED" not in os.environ
    assert "PL_SEED_WORKERS" not in os.environ

    seed_utils.seed_everything(123, workers)
    before = torch.rand(1)
    assert os.environ["PL_GLOBAL_SEED"] == "123"
    assert os.environ["PL_SEED_WORKERS"] == str(int(workers))

    seed_utils.reset_seed()
    after = torch.rand(1)
    assert os.environ["PL_GLOBAL_SEED"] == "123"
    assert os.environ["PL_SEED_WORKERS"] == str(int(workers))
    assert torch.allclose(before, after)
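Example #13 extends the contract from Example #7 to the workers flag: seed_everything() records it in PL_SEED_WORKERS so that reset_seed() can restore both the seed and the worker-seeding behavior. A short usage sketch with illustrative values:

import torch

from pytorch_lightning.utilities import seed as seed_utils

seed_utils.seed_everything(123, workers=True)  # records PL_GLOBAL_SEED=123, PL_SEED_WORKERS=1
first = torch.rand(1)

_ = torch.rand(100)  # some intermediate work consumes random numbers

seed_utils.reset_seed()  # re-applies seed 123 together with the workers flag
assert torch.allclose(torch.rand(1), first)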
Example #14
    def setup_distributed(self):
        reset_seed()

        # determine which process we are and world size
        self.set_world_ranks()

        # set warning rank
        rank_zero_only.rank = self.global_rank

        # set up the server using proc 0's IP address
        # try to init at most 20 times in case ports are taken
        # where to store the ip_table
        self.init_ddp_connection()

        # set the ranks and devices
        self.dist.rank = self.global_rank
        self.dist.device = self.root_device
Example #15
    def new_process(self, process_idx: int, trainer: "pl.Trainer",
                    mp_queue: SimpleQueue) -> None:
        self.mp_queue = mp_queue

        reset_seed()

        self.set_world_ranks(process_idx)

        # set warning rank
        rank_zero_only.rank = self.global_rank

        # set up the server using proc 0's IP address
        # try to init at most 20 times in case ports are taken
        # where to store the ip_table
        init_ddp_connection(self.cluster_environment,
                            self.torch_distributed_backend, self.global_rank,
                            self.world_size)

        # TODO: we moved it to the trainer.fit after calling pre_dispatch
        #   ... need to double check that it is the correct place
        # self.trainer.call_setup_hook(self.model)

        # set the ranks and devices
        self.dist.rank = self.global_rank
        self.dist.device = self.root_device

        # move the model to the correct device
        self.model_to_device()

        if self.sync_batchnorm:
            self.model = self.configure_sync_batchnorm(self.model)

        # skip wrapping the model if we are not fitting as no gradients need to be exchanged
        trainer_fn = self.lightning_module.trainer.state.fn
        if trainer_fn == TrainerFn.FITTING:
            self.configure_ddp()

        self.barrier()

        results = trainer.run_stage()

        # persist info in ddp_spawn
        self.__transfer_distrib_spawn_state_on_fit_end(trainer, results)

        # ensure that spawned processes go through teardown before joining
        trainer._call_teardown_hook()
Example #16
    def new_process(self, process_idx, trainer, mp_queue):
        self.mp_queue = mp_queue

        reset_seed()

        self.set_world_ranks(process_idx)

        # set warning rank
        rank_zero_only.rank = self.global_rank

        # set up the server using proc 0's IP address
        # try to init at most 20 times in case ports are taken
        # where to store the ip_table
        self.init_ddp_connection(self.global_rank, self.world_size)

        # TODO: we moved it to the trainer.fit after calling pre_dispatch
        #   ... need to double check that it is the correct place
        # self.trainer.call_setup_hook(self.model)

        # on global rank 0, let everyone know training is starting
        if self.is_global_zero and not torch.distributed.is_initialized():
            log.info("-" * 100)
            log.info(f"distributed_backend={self.distributed_backend}")
            log.info(
                f"All DDP processes registered. Starting ddp with {self.world_size} processes"
            )
            log.info("-" * 100)

        # set the ranks and devices
        self.dist.rank = self.global_rank
        self.dist.device = self.root_device

        # move the model to the correct device
        self.model_to_device()

        if self.sync_batchnorm:
            self.model = self.configure_sync_batchnorm(self.model)

        self.configure_ddp()

        self.barrier()

        results = trainer.run_stage()

        # persist info in ddp_spawn
        self.transfer_distrib_spawn_state_on_fit_end(results)
Example #17
    def new_process(self, process_idx, trainer, mp_queue):
        self.mp_queue = mp_queue

        reset_seed()

        self.set_world_ranks(process_idx)

        # set warning rank
        rank_zero_only.rank = self.global_rank

        # set up the server using proc 0's IP address
        # try to init at most 20 times in case ports are taken
        # where to store the ip_table
        self.init_ddp_connection(self.global_rank, self.world_size)

        # TODO: we moved it to the trainer.fit after calling pre_dispatch
        #   ... need to double check that it is the correct place
        # self.trainer.call_setup_hook(self.model)

        # set the ranks and devices
        self.dist.rank = self.global_rank
        self.dist.device = self.root_device

        # move the model to the correct device
        self.model_to_device()

        if self.sync_batchnorm:
            self.model = self.configure_sync_batchnorm(self.model)

        self.configure_ddp()

        self.barrier()

        results = trainer.run_stage()

        # persist info in ddp_spawn
        self.transfer_distrib_spawn_state_on_fit_end(results)

        # ensure that spawned processes go through teardown before joining
        trainer._call_teardown_hook()
Example #18
    def run_sanity_check(self, ref_model):
        using_val_step = ref_model.val_dataloader is not None and is_overridden(
            'validation_step', ref_model)
        should_sanity_check = using_val_step and self.num_sanity_val_steps > 0 and self.limit_val_batches > 0

        # run tiny validation (if validation defined)
        # to make sure program won't crash during val
        if should_sanity_check:
            stage = self._running_stage
            self.sanity_checking = True

            # hook and callback
            self.on_sanity_check_start()

            # run eval step
            self.run_evaluation()

            self.on_sanity_check_end()

            self._running_stage = stage

            # reset the seed to what it was before the sanity check;
            # this prevents the sanity check from affecting random sampling in training
            reset_seed()
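Example #18 shows why the Trainer calls reset_seed() after the sanity check: the small validation pass consumes random numbers, and re-seeding afterwards keeps the training RNG stream identical to a run without the sanity check. The same pattern, reproduced outside the Trainer as an illustrative sketch:

import torch

from pytorch_lightning.utilities import seed as seed_utils

seed_utils.seed_everything(42)

_ = torch.rand(10)  # stand-in for the sanity-check pass drawing random numbers

# restore the RNG state to what it was right after seed_everything(42),
# so the subsequent "real" run is unaffected by the pass above
seed_utils.reset_seed()
first_training_draw = torch.rand(10)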
Example #19
    def _worker_setup(self, process_idx: int):
        reset_seed()
        self.tpu_local_core_rank = xm.get_local_ordinal()
        self.tpu_global_core_rank = xm.get_ordinal()
        rank_zero_only.rank = self.global_rank
Example #20
    def _worker_setup(self, process_idx: int):
        reset_seed()
        self.set_world_ranks(process_idx)
        rank_zero_only.rank = self.global_rank