def test_mpi_multi_node_job_cluster(self):
    """Multi-node MPI job: 130 cores across nodes with 64 tasks per node."""
    # Start with a straightforward multi-node MPI job.
    cluster = get_cluster(queue_type="knl", mpi_mode=True, cores=130, **self.kwargs)
    for header_line in ("#SBATCH --cpus-per-task=1", "#SBATCH --ntasks-per-node=64"):
        self.assertIn(header_line, cluster.job_header)
    self.assertIn("#SBATCH -n 130", cluster.job_script())
    # Each MPI worker is a single-core, single-process, single-thread entity.
    for attribute in ("worker_cores", "worker_processes", "worker_process_threads"):
        self.assertEqual(getattr(cluster._dummy_job, attribute), 1)
    # Incomplete or inconsistent resource requests must be rejected.
    bad_requests = (
        {},
        {"cpus_per_task": 37, "cores": 2},
        {"cores": 1, "ntasks_per_node": 13},
    )
    for extra in bad_requests:
        with self.assertRaises(ValueError):
            get_cluster(queue_type="knl", mpi_mode=True, **extra, **self.kwargs)
    controller.delete_cluster(cluster.name)
def test_gpu_cluster(self):
    """GPU queue cluster: name, partition/gres headers and a live client."""
    gpu_cluster = get_cluster(queue_type="gpus", **self.kwargs)
    self.assertEqual(gpu_cluster.name, "dask-worker-gpus")
    for header_line in ("#SBATCH -p gpus", "#SBATCH --gres=gpu:4"):
        self.assertIn(header_line, gpu_cluster.job_header)
    self.assertIsInstance(gpu_cluster.client, Client)
    controller.delete_cluster(gpu_cluster.name)
def test_knl_cluster(self):
    """KNL queue cluster: name, booster partition header and a live client."""
    knl_cluster = get_cluster(queue_type="knl", **self.kwargs)
    self.assertEqual(knl_cluster.name, "dask-worker-knl")
    for header_line in ("#SBATCH -p booster", "#SBATCH --cpus-per-task=64"):
        self.assertIn(header_line, knl_cluster.job_header)
    self.assertIsInstance(knl_cluster.client, Client)
    controller.delete_cluster(knl_cluster.name)
def test_custom_cluster_minimal(self):
    """Basic checks for a cluster built from the SLURM scheduler directly.

    NOTE(review): this method was previously named ``test_custom_cluster``,
    the same name as a later definition in this class, so Python silently
    replaced it and the test never ran. Renamed so the runner discovers it
    again.
    """
    cluster = get_cluster(scheduler=SLURM, **self.kwargs)
    self.assertEqual(cluster.name, self.cluster_name)
    self.assertIsInstance(cluster.client, Client)
    # This argument combination is expected to be rejected with ValueError.
    with self.assertRaises(ValueError):
        get_cluster(SLURM, cores=128, **self.kwargs)
    controller.delete_cluster(cluster.name)
def test_custom_cluster(self):
    """Custom SLURM scheduler cluster: name, batch job name and live client."""
    custom_cluster = get_cluster(scheduler=SLURM, **self.kwargs)
    self.assertEqual(custom_cluster.name, self.cluster_name)
    header = custom_cluster.job_header
    self.assertIn("#SBATCH -J dask-worker-batch", header)
    self.assertIsInstance(custom_cluster.client, Client)
    # This argument combination is expected to be rejected with ValueError.
    with self.assertRaises(ValueError):
        get_cluster(SLURM, cores=128, **self.kwargs)
    controller.delete_cluster(custom_cluster.name)
def test_fork_mpi_job_cluster(self):
    """With fork_mpi=True the dask MPI wrapper must not wrap the worker command."""
    # Copy the shared kwargs: the previous code did `kwargs = self.kwargs`
    # and then mutated it in place, leaking fork_mpi=True into every test
    # that used self.kwargs afterwards.
    kwargs = dict(self.kwargs)
    kwargs["fork_mpi"] = True
    cluster = get_cluster(queue_type="knl", mpi_mode=True, cores=64, **kwargs)
    self.assertNotIn(MPI_DASK_WRAPPER_MODULE, cluster._command_template)
    controller.delete_cluster(cluster.name)
def test_task_by_cluster(self):
    """Binding @task directly to a cluster instance registers and runs it."""
    slurm_cluster = CustomSLURMCluster()

    @on_cluster(cluster=slurm_cluster)
    @task(cluster=slurm_cluster)
    def f():
        pass

    self._basic_tests(cluster_type=CustomSLURMCluster, id_=slurm_cluster.name)
    controller.delete_cluster(slurm_cluster.name)
def test_task_by_id_string(self):
    """A task can reference its cluster purely by an id string."""
    cluster_id = "test1"
    named_cluster = CustomSLURMCluster(name=cluster_id)

    @on_cluster(cluster=named_cluster)
    @task(cluster_id=cluster_id)
    def f():
        pass

    self._basic_tests(cluster_type=CustomSLURMCluster, id_=cluster_id)
    controller.delete_cluster(cluster_id)
def test_task_by_cluster_and_cluster_id_local(self):
    """A LocalCluster task given both the cluster and a matching id registers correctly."""
    _id = "test1"
    cluster_type = LocalCluster
    original_cluster = cluster_type()

    # Bug fix: the @task decorator previously received the literal string
    # "_id" instead of the _id variable ("test1"), so the task was tied to
    # a different id than the one @on_cluster registered (compare the
    # non-quoted usage in test_task_by_cluster_local).
    @on_cluster(cluster=original_cluster, cluster_id=_id)
    @task(cluster=original_cluster, cluster_id=_id)
    def f():
        pass

    self._basic_tests(cluster_type=cluster_type, id_=_id)
    controller.delete_cluster(id_=_id)
def test_init_only_id_local(self):
    """A LocalCluster registered by id is retrievable through the controller."""
    cluster_id = "test1"
    local_cluster = LocalCluster()
    controller.add_cluster(id_=cluster_id, cluster=local_cluster)

    @on_cluster(cluster_id=cluster_id)
    def f():
        pass

    cluster, client = controller.get_cluster(id_=cluster_id)
    self.assertEqual(local_cluster, cluster)
    self.assertIsInstance(client, Client)
    controller.delete_cluster(cluster_id)
def test_cluster_pure_functions_mpi(self):
    """In MPI mode the cluster defaults to pure=False but honours an override."""
    # Check the pure attribute is set to False in MPI mode.
    mpi_cluster = get_cluster(queue_type="knl", mpi_mode=True, cores=64, **self.kwargs)
    self.assertEqual(mpi_cluster.pure, False)
    controller.delete_cluster(mpi_cluster.name)
    # An explicit pure=True kwarg overrides that default.
    overridden_cluster = get_cluster(
        queue_type="knl", mpi_mode=True, cores=64, pure=True, **self.kwargs
    )
    self.assertEqual(overridden_cluster.pure, True)
    controller.delete_cluster(overridden_cluster.name)
def test_init_only_id(self):
    """A named SLURM cluster self-registers; re-adding the same id fails."""
    cluster_id = "test1"
    named_cluster = CustomSLURMCluster(name=cluster_id)
    # The id is already taken by the cluster itself, so adding it again
    # must raise.
    with self.assertRaises(ClusterException) as ctx:
        controller.add_cluster(cluster=named_cluster)
    self.assertEqual('Cluster "{}" already exists!'.format(cluster_id), str(ctx.exception))

    @on_cluster(cluster_id=cluster_id)
    def f():
        pass

    cluster, client = controller.get_cluster(id_=cluster_id)
    self.assertEqual(named_cluster, cluster)
    self.assertEqual(cluster.name, cluster_id)
    self.assertIsInstance(client, Client)
    controller.delete_cluster(cluster_id)
def test_mpi_complex_job_cluster(self):
    """Two-node MPI job on the GPU queue with explicit tasks-per-node."""
    # Now a few more variables.
    cluster = get_cluster(
        queue_type="gpus", mpi_mode=True, nodes=2, ntasks_per_node=4, **self.kwargs
    )
    for header_line in (
        "#SBATCH --cpus-per-task=6",
        "#SBATCH --ntasks-per-node=4",
        "#SBATCH --nodes=2",
        "#SBATCH --gres=gpu:4",
    ):
        self.assertIn(header_line, cluster.job_header)
    # Dask itself still sees single-core, single-process, single-thread workers.
    for attribute in ("worker_cores", "worker_processes", "worker_process_threads"):
        self.assertEqual(getattr(cluster._dummy_job, attribute), 1)
    for script_line in (
        "#SBATCH -n 8",
        "export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}",
        "export OMP_PROC_BIND=spread",
        "export OMP_PLACES=threads",
    ):
        self.assertIn(script_line, cluster.job_script())
    controller.delete_cluster(cluster.name)
def test_init_only_cluster_local(self):
    """A LocalCluster cannot be registered without an explicit cluster_id."""
    local_cluster = LocalCluster()
    # Without a cluster_id there is no usable registry key, so this raises.
    with self.assertRaises(ClusterException) as ctx:

        @on_cluster(cluster=local_cluster)
        def f():
            pass

    self.assertEqual('LocalCluster requires "cluster_id" argument.', str(ctx.exception))
    cluster_id = "test1"

    @on_cluster(cluster=local_cluster, cluster_id=cluster_id)
    def f():
        pass

    cluster, client = controller.get_cluster(cluster_id)
    self.assertEqual(local_cluster, cluster)
    self.assertIsInstance(client, Client)
    controller.delete_cluster(cluster_id)
def test_init_both_cluster_and_id(self):
    """Passing both cluster and cluster_id only works when they agree."""

    def register_and_check(cluster_obj, cluster_id):
        # Decorate with a matching cluster/id pair and verify the registry
        # hands back the same cluster under that id, then clean up.
        @on_cluster(cluster_id=cluster_id, cluster=cluster_obj)
        def f():
            pass

        registered, client = controller.get_cluster(id_=cluster_id)
        self.assertEqual(cluster_obj, registered)
        self.assertEqual(registered.name, cluster_id)
        self.assertIsInstance(client, Client)
        controller.delete_cluster(cluster_id)

    _id = "test1"
    mismatched_cluster = CustomSLURMCluster()
    # A cluster_id that differs from the cluster's own name is rejected.
    with self.assertRaises(AssertionError):

        @on_cluster(cluster_id=_id, cluster=mismatched_cluster)
        def f():
            pass

    controller.delete_cluster(mismatched_cluster.name)
    # Matching pair where the id is taken from the cluster's generated name.
    auto_named = CustomSLURMCluster()
    register_and_check(auto_named, auto_named.name)
    # Matching pair where the cluster was explicitly named with the id.
    register_and_check(CustomSLURMCluster(name=_id), _id)
def test_mpi_job_cluster(self):
    """Simple MPI job on the knl queue: header, script, wrapper and worker shape.

    NOTE(review): this method is shadowed by a later definition of the same
    name in this class, so it never runs. The later copy reads the worker_*
    attributes and the command template via ``cluster._dummy_job`` rather
    than directly on the cluster, which suggests this version predates an
    API change — confirm before reactivating, or delete it as superseded.
    """
    # First do a simple mpi job
    cluster = get_cluster(queue_type="knl", mpi_mode=True, cores=64, **self.kwargs)
    self.assertIn("#SBATCH --cpus-per-task=1", cluster.job_header)
    self.assertIn("#SBATCH --ntasks-per-node=64", cluster.job_header)
    self.assertIn("#SBATCH -n 64", cluster.job_script())
    # In MPI mode the worker command is wrapped by the MPI wrapper module.
    self.assertIn(MPI_DASK_WRAPPER_MODULE, cluster._command_template)
    self.assertEqual(cluster.worker_cores, 1)
    self.assertEqual(cluster.worker_processes, 1)
    self.assertEqual(cluster.worker_process_threads, 1)
    self.assertIsInstance(cluster.client, Client)
    controller.delete_cluster(cluster.name)
    # Asking for both nodes and cores together is rejected.
    with self.assertRaises(ValueError):
        cluster = get_cluster(queue_type="knl", mpi_mode=True, nodes=64, cores=64, **self.kwargs)
        controller.delete_cluster(cluster.name)
def test_task_by_cluster_and_cluster_id(self):
    """@task given both cluster and cluster_id requires them to agree."""
    cluster_id = "test1"
    named_cluster = CustomSLURMCluster(name=cluster_id)
    # A cluster_id that contradicts the cluster's own name is rejected.
    with self.assertRaises(ClusterException) as ctx:

        @on_cluster(cluster=named_cluster)
        @task(cluster=named_cluster, cluster_id="bad name")
        def f():
            pass

    self.assertEqual("Cluster 'name' and cluster_id are different.", str(ctx.exception))

    @on_cluster(cluster=named_cluster)
    @task(cluster=named_cluster, cluster_id=cluster_id)
    def f():
        pass

    self._basic_tests(cluster_type=CustomSLURMCluster, id_=named_cluster.name)
    controller.delete_cluster(named_cluster.name)
def test_task_by_cluster_local(self):
    """@task on a LocalCluster requires an explicit cluster_id."""
    cluster_id = "test1"
    local_cluster = LocalCluster()
    # A bare LocalCluster gives @task no id to register under, so it raises.
    with self.assertRaises(ClusterException) as ctx:

        @on_cluster(cluster=local_cluster, cluster_id=cluster_id)
        @task(cluster=local_cluster)
        def f():
            pass

    self.assertEqual("'cluster_id' argument is required for LocalCluster.", str(ctx.exception))

    @on_cluster(cluster=local_cluster, cluster_id=cluster_id)
    @task(cluster=local_cluster, cluster_id=cluster_id)
    def f():
        pass

    self._basic_tests(cluster_type=LocalCluster, id_=cluster_id)
    controller.delete_cluster(cluster_id)
def test_mpi_complex_job_cluster_fail(self):
    """Derived cpus_per_task must divide the core count unless given explicitly."""
    # When we provide ntasks_per_node, cpus_per_tasks is derived (in this case
    # 24/2 = 12). For an MPI job we expect the core count (which is the total
    # number of cores to be used) to be divisible by cpus_per_tasks but that is
    # not true in this case resulting in a ValueError
    with self.assertRaises(ValueError):
        get_cluster(queue_type="gpus", mpi_mode=True, cores=2, ntasks_per_node=2, **self.kwargs)
    # If you really want this you ask for it explicitly
    cluster = get_cluster(
        queue_type="gpus", mpi_mode=True, cores=2, ntasks_per_node=2, cpus_per_task=1, **self.kwargs
    )
    for expected in (
        "#SBATCH --cpus-per-task=1",
        "#SBATCH --ntasks-per-node=2",
        "#SBATCH -n 2",
        "#SBATCH --gres=gpu:4",
    ):
        self.assertIn(expected, cluster.job_header)
    self.assertNotIn("#SBATCH --nodes", cluster.job_header)
    controller.delete_cluster(cluster.name)
    # For memory pinning stuff that may be done by the scheduler, it is probably
    # better to ask for it like this (even if you don't intend to use OpenMP)
    cluster = get_cluster(
        queue_type="gpus", mpi_mode=True, nodes=1, ntasks_per_node=2, **self.kwargs
    )
    for expected in (
        "#SBATCH --cpus-per-task=12",
        "#SBATCH --ntasks-per-node=2",
        "#SBATCH -n 2",
        "#SBATCH --nodes=1",
        "#SBATCH --gres=gpu:4",
    ):
        self.assertIn(expected, cluster.job_header)
    controller.delete_cluster(cluster.name)
def test_init_only_cluster(self):
    """A SLURM cluster passed alone to @on_cluster registers under its own name."""
    # Case 1: cluster with an auto-generated name.
    auto_cluster = CustomSLURMCluster()

    @on_cluster(cluster=auto_cluster)
    def f():
        pass

    cluster, client = controller.get_cluster(auto_cluster.name)
    self.assertEqual(auto_cluster, cluster)
    self.assertIsInstance(client, Client)
    controller.delete_cluster(cluster.name)
    # Case 2: cluster with an explicit name.
    explicit_name = "test1"
    named_cluster = CustomSLURMCluster(name=explicit_name)

    @on_cluster(cluster=named_cluster)
    def f():
        pass

    cluster, client = controller.get_cluster(explicit_name)
    self.assertEqual(named_cluster, cluster)
    self.assertIsInstance(client, Client)
    controller.delete_cluster(cluster.name)
def test_mpi_job_cluster(self):
    """Single-node MPI job: script contents, worker layout and launcher wiring."""
    # A plain 64-core MPI job on the knl queue.
    cluster = get_cluster(queue_type="knl", mpi_mode=True, cores=64, **self.kwargs)
    for header_line in ("#SBATCH --cpus-per-task=1", "#SBATCH --ntasks-per-node=64"):
        self.assertIn(header_line, cluster.job_header)
    self.assertIn("#SBATCH -n 64", cluster.job_script())
    # The worker command must be wrapped by the dask MPI wrapper module.
    self.assertIn(MPI_DASK_WRAPPER_MODULE, cluster._dummy_job._command_template)
    for attribute in ("worker_cores", "worker_processes", "worker_process_threads"):
        self.assertEqual(getattr(cluster._dummy_job, attribute), 1)
    self.assertIsInstance(cluster.client, Client)
    controller.delete_cluster(cluster.name)
    # Now check our command template when we use different MPI runtimes.
    other_launchers = SUPPORTED_MPI_LAUNCHERS.copy()
    # Don't recheck SRUN since we did it above (and it takes no args)
    other_launchers.remove(SRUN)
    launcher_args = ["-n 64", "-np 64 --map-by ppr:64:node", "-n 64 -perhost 64"]
    for launcher, args in zip(other_launchers, launcher_args):
        temp_kwargs = dict(self.kwargs, mpi_launcher=launcher)
        cluster = get_cluster(queue_type="knl", mpi_mode=True, cores=64, **temp_kwargs)
        expected_invocation = " ".join([launcher["launcher"], args, sys.executable])
        self.assertIn(expected_invocation, cluster._dummy_job._command_template)
        controller.delete_cluster(cluster.name)
    # Finally check that we catch the assumption that cores means total CPU
    # elements for the job (in MPI mode)
    with self.assertRaises(ValueError):
        cluster = get_cluster(queue_type="knl", mpi_mode=True, nodes=64, cores=64, **self.kwargs)
        controller.delete_cluster(cluster.name)
def test_cluster_pure_functions(self):
    """A non-MPI cluster should not expose a 'pure' attribute at all."""
    plain_cluster = get_cluster(queue_type="knl", **self.kwargs)
    # The attribute is only ever set in MPI mode, so it must be absent here
    # (not merely True/False).
    has_pure = hasattr(plain_cluster, "pure")
    self.assertEqual(has_pure, False)
    controller.delete_cluster(plain_cluster.name)