def test_detect():
    """Test the detection of a SLURM environment configuration."""
    with mock.patch.dict(os.environ, {}, clear=True):
        assert not SLURMEnvironment.detect()

    with mock.patch.dict(os.environ, {"SLURM_NTASKS": "2"}):
        assert SLURMEnvironment.detect()
Example #2
    def _is_slurm_managing_tasks(self) -> bool:
        """used by choosing cluster enviroment."""
        if not SLURMEnvironment.detect() or SLURMEnvironment.job_name() == "bash":
            return False

        total_requested_devices = len(self._parallel_devices) * self._num_nodes_flag
        num_slurm_tasks = int(os.environ["SLURM_NTASKS"], 0)
        return num_slurm_tasks == total_requested_devices
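A minimal sketch of how the task-count check above could be exercised in a test; the `_make_connector` helper and its arguments are hypothetical stand-ins for however the accelerator connector is actually constructed:

import os
from unittest import mock


def test_slurm_managing_tasks_requires_matching_task_count():
    # Hypothetical helper: builds a connector asking for 2 devices on 2 nodes.
    connector = _make_connector(parallel_devices=2, num_nodes=2)
    # SLURM detected, non-interactive job name, and 4 tasks == 2 devices * 2 nodes.
    with mock.patch.dict(os.environ, {"SLURM_NTASKS": "4", "SLURM_JOB_NAME": "SOME_NAME"}):
        assert connector._is_slurm_managing_tasks()
    # Task count does not match the requested devices, so SLURM does not manage the tasks.
    with mock.patch.dict(os.environ, {"SLURM_NTASKS": "3", "SLURM_JOB_NAME": "SOME_NAME"}):
        assert not connector._is_slurm_managing_tasks()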
Example #3
def test_default_attributes():
    """ Test the default attributes when no environment variables are set. """
    env = SLURMEnvironment()
    assert env.creates_children()
    assert env.master_address() == "127.0.0.1"
    assert env.master_port() == 12910
    assert env.world_size() is None
    with pytest.raises(KeyError):
        # local rank is required to be passed as env variable
        env.local_rank()
    with pytest.raises(KeyError):
        # node_rank is required to be passed as env variable
        env.node_rank()
Example #4
def environment_combinations():
    expected = dict(global_rank=3, local_rank=1, node_rank=1, world_size=4)
    # Lightning
    variables = {
        "CUDA_VISIBLE_DEVICES": "0,1,2,4",
        "LOCAL_RANK": "1",
        "NODE_RANK": "1",
        "WORLD_SIZE": "8"
    }
    environment = LightningEnvironment()
    yield environment, variables, expected
    # SLURM
    variables = {
        "CUDA_VISIBLE_DEVICES": "0,1,2,4",
        "SLURM_JOB_NAME": "SOME_NAME",
        "SLURM_LOCALID": "1",
        "SLURM_NODEID": "1",
        "SLURM_PROCID": "3",
        "SLURM_NTASKS": "4",
    }
    environment = SLURMEnvironment()
    yield environment, variables, expected
    # TorchElastic
    variables = {
        "CUDA_VISIBLE_DEVICES": "0,1,2,4",
        "LOCAL_RANK": "1",
        "GROUP_RANK": "1",
        "RANK": "3",
        "WORLD_SIZE": "4",
        "LOCAL_WORLD_SIZE": "2",
    }
    environment = TorchElasticEnvironment()
    yield environment, variables, expected
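A hedged sketch of one way the generator above could be consumed; only the values the environments read directly from the patched variables are asserted here, since the global rank and world size of a `LightningEnvironment` are presumably attributed later by the strategy:

import os
from unittest import mock


def test_environment_combinations_local_and_node_rank():
    # Sketch only: patch the variables each cluster environment expects and
    # check the ranks it reads back at call time.
    for environment, variables, expected in environment_combinations():
        with mock.patch.dict(os.environ, variables):
            assert environment.local_rank() == expected["local_rank"]
            assert environment.node_rank() == expected["node_rank"]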
def test_default_attributes():
    """Test the default attributes when no environment variables are set."""
    env = SLURMEnvironment()
    assert env.creates_processes_externally
    assert env.main_address == "127.0.0.1"
    assert env.main_port == 12910
    assert env.job_name() is None
    assert env.job_id() is None

    with pytest.raises(KeyError):
        # world size is required to be passed as env variable
        env.world_size()
    with pytest.raises(KeyError):
        # local rank is required to be passed as env variable
        env.local_rank()
    with pytest.raises(KeyError):
        # node_rank is required to be passed as env variable
        env.node_rank()
Example #6
def test_attributes_from_environment_variables():
    """ Test that the SLURM cluster environment takes the attributes from the environment variables. """
    env = SLURMEnvironment()
    assert env.master_address() == "1.1.1.1"
    assert env.master_port() == 15000 + 1234
    assert env.world_size() is None
    assert env.local_rank() == 2
    assert env.node_rank() == 3
Example #7
    def _choose_and_init_cluster_environment(self) -> ClusterEnvironment:
        if isinstance(self._cluster_environment_flag, ClusterEnvironment):
            return self._cluster_environment_flag
        if self._is_slurm_managing_tasks():
            rank_zero_info("Multiprocessing is handled by SLURM.")
            return SLURMEnvironment()
        for env_type in (BaguaEnvironment, TorchElasticEnvironment, KubeflowEnvironment, LSFEnvironment):
            if env_type.detect():
                return env_type()
        return LightningEnvironment()
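Because of the `isinstance` check at the top, an explicitly passed cluster environment always wins over auto-detection; a minimal sketch of that, using the `Trainer(plugins=...)` entry point seen in the tests below:

# Sketch only: a user-supplied ClusterEnvironment is returned as-is by the
# connector, even if SLURM or TorchElastic variables happen to be set.
explicit_env = SLURMEnvironment(auto_requeue=False)
trainer = Trainer(plugins=[explicit_env])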
    def select_cluster_environment(self) -> ClusterEnvironment:
        if self._cluster_environment is not None:
            return self._cluster_environment
        if self.is_slurm_managing_tasks:
            env = SLURMEnvironment()
        elif TorchElasticEnvironment.is_using_torchelastic():
            env = TorchElasticEnvironment()
        else:
            env = LightningEnvironment()
        return env
    def _is_slurm_managing_tasks(self) -> bool:
        """Returns whether we let SLURM manage the processes or not.

        Returns ``True`` if and only if these conditions match:

            - A SLURM cluster is detected
            - A distributed plugin is being used
            - The process is not launching in interactive mode
            - The number of tasks in SLURM matches the requested number of devices and nodes in the Trainer
        """
        if (
            (not self.use_ddp and not self.use_ddp2)
            or not SLURMEnvironment.detect()
            or SLURMEnvironment.job_name() == "bash"  # in interactive mode we don't manage tasks
        ):
            return False

        total_requested_devices = (self.num_gpus or self.num_processes) * self.num_nodes
        num_slurm_tasks = int(os.environ["SLURM_NTASKS"], 0)
        return num_slurm_tasks == total_requested_devices
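The interactive-mode clause in the docstring can be seen directly: with `SLURM_JOB_NAME=bash` the check bails out even when the task count would otherwise match. A sketch, again with a hypothetical `_make_connector` helper:

import os
from unittest import mock


def test_slurm_not_managing_tasks_in_interactive_mode():
    # Hypothetical helper: a connector requesting 2 devices on a single node.
    connector = _make_connector(parallel_devices=2, num_nodes=1)
    # Interactive sessions (e.g. `srun --pty bash`) report the job name "bash",
    # so SLURM should not be treated as the process launcher even though
    # SLURM_NTASKS matches the requested devices.
    with mock.patch.dict(os.environ, {"SLURM_NTASKS": "2", "SLURM_JOB_NAME": "bash"}):
        assert not connector._is_slurm_managing_tasks()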
def test_signal_handlers_restored_in_teardown():
    """Test that the SignalConnector restores the previously configured handler on teardown."""
    assert signal.getsignal(signal.SIGTERM) is signal.SIG_DFL

    trainer = Trainer(plugins=SLURMEnvironment())
    connector = SignalConnector(trainer)
    connector.register_signal_handlers()

    assert signal.getsignal(signal.SIGTERM) is not signal.SIG_DFL
    connector.teardown()
    assert signal.getsignal(signal.SIGTERM) is signal.SIG_DFL
    def _choose_and_init_cluster_environment(self) -> ClusterEnvironment:
        if isinstance(self._cluster_environment_flag, ClusterEnvironment):
            return self._cluster_environment_flag
        if self._is_slurm_managing_tasks():
            rank_zero_info("Multiprocessing is handled by SLURM.")
            return SLURMEnvironment()
        for env_type in (BaguaEnvironment, TorchElasticEnvironment, KubeflowEnvironment, LSFEnvironment):
            if env_type.detect():
                # Ignore type error because it is a false positive: https://github.com/python/mypy/issues/13044
                return env_type()  # type: ignore[abstract]
        return LightningEnvironment()
    def select_cluster_environment(self) -> ClusterEnvironment:
        if self._cluster_environment is not None:
            return self._cluster_environment
        if self._is_slurm_managing_tasks():
            rank_zero_info("Multiprocessing is handled by SLURM.")
            return SLURMEnvironment()

        for env_type in (TorchElasticEnvironment, KubeflowEnvironment, LSFEnvironment):
            if env_type.detect():
                return env_type()

        return LightningEnvironment()
Example #13
    def select_cluster_environment(self):
        if self.cluster_environment is not None:
            return self.cluster_environment
        if self.is_slurm_managing_tasks:
            env = SLURMEnvironment()
        elif self.is_using_torchelastic:
            env = TorchElasticEnvironment()
            # TODO: decouple DDP from TE
            #   maybe introduce a DefaultEnvironment?
            os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
        else:
            # TODO: maybe introduce a DefaultEnvironment?
            env = TorchElasticEnvironment()
        return env
Example #14
    def select_cluster_environment(self) -> ClusterEnvironment:
        if self._cluster_environment is not None:
            return self._cluster_environment
        if self._is_slurm_managing_tasks():
            env = SLURMEnvironment()
            rank_zero_info("Multiprocessing is handled by SLURM.")
        elif TorchElasticEnvironment.is_using_torchelastic():
            env = TorchElasticEnvironment()
        elif KubeflowEnvironment.is_using_kubeflow():
            env = KubeflowEnvironment()
        elif LSFEnvironment.is_using_lsf():
            env = LSFEnvironment()
        else:
            env = LightningEnvironment()
        return env

    def select_cluster_environment(self) -> ClusterEnvironment:
        if self._cluster_environment is not None:
            return self._cluster_environment
        if self.is_slurm_managing_tasks:
            env = SLURMEnvironment()
            # TODO: decouple DDP from SLURM
            #   refactor and let generic cluster env hold the information about who spawns the processes
            os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
        elif self.is_using_torchelastic:
            env = TorchElasticEnvironment()
            # TODO: decouple DDP from TE
            #   refactor and let generic cluster env hold the information about who spawns the processes
            os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
        else:
            # TODO: maybe introduce a DefaultEnvironment?
            env = TorchElasticEnvironment()
        return env
def test_auto_requeue_flag(auto_requeue):
    trainer = Trainer(plugins=[SLURMEnvironment(auto_requeue=auto_requeue)])
    connector = SignalConnector(trainer)
    connector.register_signal_handlers()

    if auto_requeue:
        sigterm_handlers = signal.getsignal(signal.SIGTERM).signal_handlers
        assert len(sigterm_handlers) == 1
        assert sigterm_handlers[0].__qualname__ == "SignalConnector.sigterm_handler_fn"

        sigusr1_handlers = signal.getsignal(signal.SIGUSR1).signal_handlers
        assert len(sigusr1_handlers) == 1
        assert sigusr1_handlers[0].__qualname__ == "SignalConnector.slurm_sigusr1_handler_fn"
    else:
        assert signal.getsignal(signal.SIGTERM) is signal.SIG_DFL
        assert signal.getsignal(signal.SIGUSR1) is signal.SIG_DFL

    connector.teardown()
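The test above only makes sense if it runs once per value of `auto_requeue`; a sketch of the parametrization it presumably relies on:

import pytest


# Assumed decorator: exercise both the requeue-enabled and requeue-disabled branches.
@pytest.mark.parametrize("auto_requeue", (True, False))
def test_auto_requeue_flag(auto_requeue):
    ...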
Example #17
def process_args(args, comm=None):
    """
    Process arguments for running inference
    """
    conf_args = process_config(args.config)
    for k, v in vars(conf_args).items():
        if not hasattr(args, k):
            setattr(args, k, v)

    logger = args.logger
    # set up logger
    if logger is None:
        logger = logging.getLogger()
        logger.setLevel(logging.INFO)
    if args.debug:
        logger.setLevel(logging.DEBUG)
    args.logger = logger

    rank = 0
    size = 1
    local_rank = 0
    env = None

    if args.slurm:
        env = SLURMEnvironment()
    elif args.lsf:
        env = LSFEnvironment()

    if env is not None:
        local_rank = env.local_rank()
        rank = env.global_rank()
        size = env.world_size()

    # Figure out the checkpoint file to read from
    # and where to save outputs to
    if args.output is None:
        if os.path.isdir(args.checkpoint):
            ckpt = list(glob.glob(f"{args.checkpoint}/*.ckpt"))
            if len(ckpt) == 0:
                print(f'No checkpoint file found in {args.checkpoint}',
                      file=sys.stderr)
                sys.exit(1)
            elif len(ckpt) > 1:
                print(
                    f'More than one checkpoint file found in {args.checkpoint}. '
                    'Please specify checkpoint with -c',
                    file=sys.stderr)
                sys.exit(1)
            args.checkpoint = ckpt[0]
        outdir = args.checkpoint
        if outdir.endswith('.ckpt'):
            outdir = outdir[:-5]
        if not os.path.isdir(outdir):
            os.mkdir(outdir)
        args.output = os.path.join(outdir, 'outputs.h5')

    # Set classify so that we get labels when we load data. We do this here
    # because we assume the network is going to output features, and we want
    # to use the labels for downstream analysis.
    args.classify = True

    # load the model and override batch size
    model = process_model(args, inference=True)
    model.set_inference(True)
    if args.batch_size is not None:
        model.hparams.batch_size = args.batch_size

    args.n_outputs = model.hparams.n_outputs
    args.save_seq_ids = model.hparams.window is not None

    # remove ResNet features
    if args.resnet_features:
        if 'ResNet' not in model.__class__.__name__:
            raise ValueError("Cannot use -f without ResNet model - got %s" %
                             model.__class__.__name__)
        from .models.resnet import ResNetFeatures
        args.n_outputs = model.fc.in_features if isinstance(
            model.fc, nn.Linear) else model.fc[0].in_features
        model = ResNetFeatures(model)
        args.features = True

    if size > 1:
        dataset = LazySeqDataset(path=args.input,
                                 hparams=argparse.Namespace(**model.hparams),
                                 keep_open=True,
                                 comm=comm,
                                 size=size,
                                 rank=rank)
    else:
        dataset = LazySeqDataset(path=args.input,
                                 hparams=argparse.Namespace(**model.hparams),
                                 keep_open=True)

    tot_bases = dataset.orig_difile.get_seq_lengths().sum()
    args.logger.info(
        f'rank {rank} - processing {tot_bases} bases across {len(dataset)} samples'
    )

    tmp_dset = dataset

    kwargs = dict(batch_size=args.batch_size, shuffle=False)
    if args.num_workers > 0:
        kwargs['num_workers'] = args.num_workers
        kwargs['multiprocessing_context'] = 'spawn'
        kwargs['worker_init_fn'] = dataset.worker_init
        kwargs['persistent_workers'] = True
    loader = get_loader(tmp_dset, inference=True, **kwargs)

    args.difile = dataset.difile

    # Return the model, any arguments, and Lightning Trainer args just in case
    # we want to use them down the line when we figure out how to use Lightning
    # for inference.

    if not args.features:
        model = nn.Sequential(model, nn.Softmax(dim=1))

    model.eval()

    ret = [model, dataset, loader, args, env]

    if size > 1:
        args.device = torch.device('cuda:%d' % local_rank)
    else:
        args.device = torch.device('cuda')

    return tuple(ret)
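A hedged sketch of how the tuple returned by `process_args` might be consumed for inference; the `parse_args` entry point and the batch unpacking are assumptions, not part of the snippet above:

import torch


def run_inference(argv=None):
    # Hypothetical driver: parse_args is assumed to exist alongside process_args.
    model, dataset, loader, args, env = process_args(parse_args(argv))
    model.to(args.device)
    with torch.no_grad():
        for batch in loader:
            inputs = batch[0].to(args.device)  # assumed batch layout
            outputs = model(inputs)            # softmax outputs unless args.features is set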
Example #18
        for key in sd.keys():
            if not (key in this_sd):
                missing_params.append(key)
        excessive_params = []
        for key in this_sd.keys():
            if not (key in sd):
                excessive_params.append(key)
        print('Loaded chkpt key not in this model:', missing_params[:10])
        print('This model key not in chkpt:', excessive_params[:10])
        model.af2.load_state_dict(sd)

    if args.precision == 'bf16':
        model = model.to(dtype=torch.bfloat16)

    if "SLURM_JOB_ID" in os.environ:
        cluster_environment = SLURMEnvironment()
    else:
        cluster_environment = None

    trainer = pl.Trainer(
        accelerator="gpu",
        gpus=args.num_gpus,
        logger=logger,
        max_steps=args.max_iter,
        num_nodes=args.num_nodes,
        # strategy=CustomDDPPlugin(find_unused_parameters=False),
        strategy=DeepSpeedPlugin(config=args.deepspeed_config_path,
                                 load_full_weights=True),
        accumulate_grad_batches=args.num_accum,
        gradient_clip_val=0.1,
        gradient_clip_algorithm='norm',
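The `pl.Trainer` call above is truncated; presumably the `cluster_environment` selected from `SLURM_JOB_ID` is handed to the Trainer as a plugin, roughly along these lines:

# Sketch only: pass the detected environment (if any) through `plugins`.
plugins = [cluster_environment] if cluster_environment is not None else []
trainer = pl.Trainer(
    accelerator="gpu",
    gpus=args.num_gpus,
    num_nodes=args.num_nodes,
    plugins=plugins,
    # remaining arguments as in the truncated call above
)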
def test_main_address_from_slurm_node_list(slurm_node_list, expected):
    """Test extracting the main node from different formats for the SLURM_NODELIST."""
    with mock.patch.dict(os.environ, {"SLURM_NODELIST": slurm_node_list}):
        env = SLURMEnvironment()
        assert env.main_address == expected
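The parametrized node-list formats are not shown here; a hedged example of the simplest case, where the main address is presumably the first entry of a comma-separated `SLURM_NODELIST` (bracketed range syntax such as `node[1-5]` is also valid SLURM, but its expansion is not asserted here):

import os
from unittest import mock

with mock.patch.dict(os.environ, {"SLURM_NODELIST": "1.1.1.1,1.1.1.2"}):
    assert SLURMEnvironment().main_address == "1.1.1.1"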
def test_attributes_from_environment_variables(caplog):
    """Test that the SLURM cluster environment takes the attributes from the environment variables."""
    env = SLURMEnvironment()
    assert env.auto_requeue is True
    assert env.main_address == "1.1.1.1"
    assert env.main_port == 15000 + 1234
    assert env.job_id() == int("0001234")
    assert env.world_size() == 20
    assert env.global_rank() == 1
    assert env.local_rank() == 2
    assert env.node_rank() == 3
    assert env.job_name() == "JOB"
    # setter should be no-op
    with caplog.at_level(logging.DEBUG,
                         logger="pytorch_lightning.plugins.environments"):
        env.set_global_rank(100)
    assert env.global_rank() == 1
    assert "setting global rank is not allowed" in caplog.text

    caplog.clear()

    with caplog.at_level(logging.DEBUG,
                         logger="pytorch_lightning.plugins.environments"):
        env.set_world_size(100)
    assert env.world_size() == 20
    assert "setting world size is not allowed" in caplog.text
@RunIf(skip_windows=True)
def test_sync_batchnorm_set_in_custom_strategy(tmpdir):
    """Tests if layer_sync is automatically set for custom strategy."""

    class CustomParallelStrategy(DDPStrategy):
        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            # Set to None so it will be overwritten by the accelerator connector.
            self._layer_sync = None

    strategy = CustomParallelStrategy()
    assert strategy._layer_sync is None
    Trainer(strategy=strategy, sync_batchnorm=True)
    assert isinstance(strategy._layer_sync, NativeSyncBatchNorm)


@pytest.mark.parametrize(
    ["plugins", "expected"],
    [
        ([LightningEnvironment(), SLURMEnvironment()], "ClusterEnvironment"),
        ([TorchCheckpointIO(), TorchCheckpointIO()], "CheckpointIO"),
        (
            [PrecisionPlugin(), DoublePrecisionPlugin(), LightningEnvironment(), SLURMEnvironment()],
            "PrecisionPlugin, ClusterEnvironment",
        ),
    ],
)
def test_plugin_only_one_instance_for_one_type(plugins, expected):
    with pytest.raises(MisconfigurationException, match=f"Received multiple values for {expected}"):
        Trainer(plugins=plugins)
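For contrast, a sketch of a combination the check accepts: at most one plugin per type, so mixing one `ClusterEnvironment` with one `CheckpointIO` is fine (constructed outside any real cluster, as in the tests above):

# Sketch only: one instance per plugin type does not raise.
trainer = Trainer(plugins=[SLURMEnvironment(), TorchCheckpointIO()])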
Example #22
def process_args(args=None, return_io=False):
    """
    Process arguments for running training
    """
    if not isinstance(args, argparse.Namespace):
        args = parse_args(args)

    args.loader_kwargs = dict()

    targs = dict(max_epochs=args.epochs, )

    targs['accumulate_grad_batches'] = args.accumulate

    env = None

    if args.ipu:
        targs['accelerator'] = 'ipu'
        targs['devices'] = process_gpus(args.gpus)
    else:
        targs['gpus'] = process_gpus(args.gpus)
        targs['num_nodes'] = args.num_nodes
        if args.lsf:
            ##########################################################################################
            # Currently coding against pytorch-lightning 1.4.3
            ##########################################################################################
            if args.num_workers > 4:
                print0(
                    "num_workers (-k) > 4 can lead to hanging on Summit -- setting to 4",
                    file=sys.stderr)
                args.num_workers = 4
            args.loader_kwargs['num_workers'] = 1  # Set as a default. This will get overridden elsewhere.
            args.loader_kwargs['multiprocessing_context'] = 'spawn'
            env = LSFEnvironment()
        elif args.slurm:
            env = SLURMEnvironment()

        if env is not None:
            global RANK
            global SIZE
            try:
                RANK = env.global_rank()
                SIZE = env.world_size()
            except:
                print(
                    ">>> Could not get global rank -- setting RANK to 0 and SIZE to 1",
                    file=sys.stderr)
                RANK = 0
                SIZE = 1

        if targs['gpus'] is not None:
            targs['accelerator'] = 'gpu'
            if targs['gpus'] == 1:
                targs['devices'] = 1
            else:
                if env is None:
                    raise ValueError(
                        'Please specify environment (--lsf or --slurm) if using more than one GPU'
                    )
                # parallel_devices = [torch.device(i) for i in range(torch.cuda.device_count()) if i < targs['gpus']]
                # precision_plugin = NativeMixedPrecisionPlugin(16, 'cuda')
                torch.cuda.set_device(env.local_rank())
                targs['devices'] = targs['gpus']
                targs['strategy'] = DDPStrategy(
                    find_unused_parameters=False,
                    cluster_environment=env,
                    #accelerator=GPUAccelerator(),
                    #parallel_devices=parallel_devices,
                    #precision_plugin=precision_plugin,
                )

                print(
                    "---- Rank %s  -  Using GPUAccelerator with DDPStrategy" %
                    env.global_rank(),
                    file=sys.stderr)
        else:
            targs['accelerator'] = 'cpu'

    del args.gpus

    if args.sanity:
        if isinstance(args.sanity, str):
            args.sanity = int(args.sanity)
        else:
            args.sanity = 4000
        targs['limit_train_batches'] = args.sanity
        targs['limit_val_batches'] = args.sanity // 4

    if args.lr_find:
        targs['auto_lr_find'] = True
    del args.lr_find

    if args.checkpoint is not None:
        if os.path.exists(args.checkpoint):
            targs['resume_from_checkpoint'] = args.checkpoint
        else:
            warnings.warn(
                f"Ignoring -c/--checkpoint argument because {args.checkpoint} does not exist."
            )
            args.checkpoint = None

    if args.cuda_profile:
        targs['profiler'] = PyTorchProfiler(
            filename=f'pytorch_prof.{RANK:0{len(str(SIZE))}}', emit_nvtx=True)

    targs['replace_sampler_ddp'] = False

    args.loader_kwargs = dict()

    # make sure we are classifying if we are using adding classifier layers
    # to a resnet features model
    if args.features_checkpoint is not None:
        if args.manifold:
            raise ValueError(
                'Cannot use manifold loss (i.e. -M) if adding classifier (i.e. -F)'
            )
        args.classify = True

    data_mod = DeepIndexDataModule(args,
                                   keep_open=True,
                                   seed=args.seed + RANK,
                                   rank=RANK,
                                   size=SIZE)

    # if classification problem, use the number of taxa as the number of outputs
    if args.classify:
        args.n_outputs = data_mod.dataset.n_outputs

    args.input_nc = 136 if args.tnf else len(data_mod.dataset.vocab)

    model = process_model(args, taxa_table=data_mod.dataset.difile.taxa_table)

    if args.num_workers > 0:
        data_mod.dataset.close()

    ret = [model, args, targs]
    if return_io:
        ret.append(io)

    ret.append(data_mod)

    return tuple(ret)
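A hedged sketch of how the pieces returned by this training-side `process_args` might be wired together; `pl` (pytorch_lightning) and the `fit` call are assumptions beyond what the function itself returns:

import pytorch_lightning as pl


def run_training(argv=None):
    # `targs` already carries the Trainer keyword arguments assembled above
    # (accumulate_grad_batches, gpus/devices, strategy, ...).
    model, args, targs, data_mod = process_args(argv)
    trainer = pl.Trainer(**targs)
    trainer.fit(model, datamodule=data_mod)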