def test_header():
    with OARCluster(
        walltime="00:02:00", processes=4, cores=8, memory="28GB"
    ) as cluster:
        assert "#OAR -n dask-worker" in cluster.job_header
        assert "#OAR -l /nodes=1/core=8,walltime=00:02:00" in cluster.job_header
        assert "#OAR --project" not in cluster.job_header
        assert "#OAR -q" not in cluster.job_header

    with OARCluster(
        queue="regular",
        project="DaskOnOar",
        processes=4,
        cores=8,
        memory="28GB",
        job_cpu=16,
        job_mem="100G",
        job_extra=["-t besteffort"],
    ) as cluster:
        assert "walltime=" in cluster.job_header
        assert "#OAR --project DaskOnOar" in cluster.job_header
        assert "#OAR -q regular" in cluster.job_header
        assert "#OAR -t besteffort" in cluster.job_header

    with OARCluster(cores=4, memory="8GB") as cluster:
        assert "#OAR -n dask-worker" in cluster.job_header
        assert "walltime=" in cluster.job_header
        assert "#OAR --project" not in cluster.job_header
        assert "#OAR -q" not in cluster.job_header
def test_header():
    with OARCluster(walltime='00:02:00', processes=4, cores=8, memory='28GB') as cluster:
        assert '#OAR -n dask-worker' in cluster.job_header
        assert '#OAR -l /nodes=1/core=8,walltime=00:02:00' in cluster.job_header
        assert '#OAR --project' not in cluster.job_header
        assert '#OAR -q' not in cluster.job_header

    with OARCluster(queue='regular',
                    project='DaskOnOar',
                    processes=4,
                    cores=8,
                    memory='28GB',
                    job_cpu=16,
                    job_mem='100G',
                    job_extra=['-t besteffort']) as cluster:
        assert 'walltime=' in cluster.job_header
        assert '#OAR --project DaskOnOar' in cluster.job_header
        assert '#OAR -q regular' in cluster.job_header
        assert '#OAR -t besteffort' in cluster.job_header

    with OARCluster(cores=4, memory='8GB') as cluster:
        assert '#OAR -n dask-worker' in cluster.job_header
        assert 'walltime=' in cluster.job_header
        assert '#OAR --project' not in cluster.job_header
        assert '#OAR -q' not in cluster.job_header
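Both test variants above only inspect the generated job header. A minimal usage sketch follows, assuming dask_jobqueue is installed and an OAR scheduler is reachable; the resource values are illustrative, not recommendations.

from dask.distributed import Client
from dask_jobqueue import OARCluster

# Illustrative resource values only; a real deployment should match its OAR site's limits.
cluster = OARCluster(
    cores=8,             # cores requested per OAR job
    processes=4,         # dask worker processes per job
    memory="28GB",       # memory per job, shared among the worker processes
    walltime="00:02:00",
)
cluster.scale(2)         # ask for enough OAR jobs to run 2 workers

with Client(cluster) as client:
    # Any dask computation could be submitted here; a trivial one for illustration.
    total = client.submit(sum, range(100)).result()
    print(total)

cluster.close()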
def __init__(
        self, cores, name, processes=1, mem_req=4000, walltime='72:00:00',
        venv=None, to_source='~/.bashrc', log_dir='/home/apashevi/Logs/dask/',
        spill_dir='/home/apashevi/Logs/dask/', env_extra=None, besteffort=False,
        job_extra=None, interface_node=None, extra='', **kwargs):
    if name == 'dask-cpu':
        resource_spec = 'nodes=1/core={}'.format(cores)
    elif name == 'dask-gpu':
        resource_spec = None
    else:
        raise NotImplementedError
    name += '_' + datetime.datetime.now().strftime('%Y%m%dT%H%M%S')
    os.path.join(log_dir, 'logs')
    # job_extra may be None; normalize it before appending directives below
    # (the original only did this inside the besteffort branch, which breaks otherwise).
    if job_extra is None:
        job_extra = []
    if besteffort:
        job_extra += [' -t besteffort -t idempotent']
    job_extra += [
        '--stdout={}'.format(os.path.join(log_dir, '%jobid%_stdout.txt'))
    ]
    job_extra += [
        '--stderr={}'.format(os.path.join(log_dir, '%jobid%_stderr.txt'))
    ]
    OARCluster.__init__(
        self,
        resource_spec=resource_spec,
        walltime=walltime,
        name=name,
        cores=cores,
        processes=processes,
        memory='{}m'.format(mem_req),
        local_directory=spill_dir,
        extra=extra,
        env_extra=env_extra,
        job_extra=job_extra,
        interface_node=interface_node,
        **kwargs)
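For reference, the wrapper's effect for name='dask-cpu' with besteffort=True can be approximated by calling OARCluster directly with the arguments it forwards. This is a hedged sketch: the timestamp suffix and interface_node (which looks specific to a patched dask_jobqueue) are omitted, and the paths and sizes simply mirror the wrapper's defaults.

from dask_jobqueue import OARCluster

# Hypothetical direct equivalent of the wrapper above for a 'dask-cpu' best-effort job.
cluster = OARCluster(
    cores=8,
    processes=1,
    memory='4000m',                      # the wrapper formats mem_req (MB) this way
    walltime='72:00:00',
    resource_spec='nodes=1/core=8',
    local_directory='/home/apashevi/Logs/dask/',
    job_extra=[
        ' -t besteffort -t idempotent',
        '--stdout=/home/apashevi/Logs/dask/%jobid%_stdout.txt',
        '--stderr=/home/apashevi/Logs/dask/%jobid%_stderr.txt',
    ],
)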
def test_job_script():
    with OARCluster(
        walltime="00:02:00", processes=4, cores=8, memory="28GB"
    ) as cluster:
        job_script = cluster.job_script()
        assert "#OAR" in job_script
        assert "#OAR -n dask-worker" in job_script
        assert "--memory-limit 7.00GB " in job_script
        assert "#OAR -l /nodes=1/core=8,walltime=00:02:00" in job_script
        assert "#OAR --project" not in job_script
        assert "#OAR -q" not in job_script

        assert "export " not in job_script

        assert (
            "{} -m distributed.cli.dask_worker tcp://".format(sys.executable)
            in job_script
        )
        assert "--nthreads 2 --nprocs 4 --memory-limit 7.00GB" in job_script

    with OARCluster(
        walltime="00:02:00",
        processes=4,
        cores=8,
        memory="28GB",
        env_extra=[
            'export LANG="en_US.utf8"',
            'export LANGUAGE="en_US.utf8"',
            'export LC_ALL="en_US.utf8"',
        ],
    ) as cluster:
        job_script = cluster.job_script()
        assert "#OAR" in job_script
        assert "#OAR -n dask-worker" in job_script
        assert "--memory-limit 7.00GB " in job_script
        assert "#OAR -l /nodes=1/core=8,walltime=00:02:00" in job_script
        assert "#OAR --project" not in job_script
        assert "#OAR -q" not in job_script

        assert 'export LANG="en_US.utf8"' in job_script
        assert 'export LANGUAGE="en_US.utf8"' in job_script
        assert 'export LC_ALL="en_US.utf8"' in job_script

        assert (
            "{} -m distributed.cli.dask_worker tcp://".format(sys.executable)
            in job_script
        )
        assert "--nthreads 2 --nprocs 4 --memory-limit 7.00GB" in job_script
def test_config_name_oar_takes_custom_config(): conf = { "queue": "myqueue", "project": "myproject", "ncpus": 1, "cores": 1, "memory": "2 GB", "walltime": "00:02", "job-extra": [], "name": "myname", "processes": 1, "interface": None, "death-timeout": None, "local-directory": "/foo", "extra": [], "env-extra": [], "log-directory": None, "shebang": "#!/usr/bin/env bash", "job-cpu": None, "job-mem": None, "resource-spec": None, } with dask.config.set({"jobqueue.oar-config-name": conf}): with OARCluster(config_name="oar-config-name") as cluster: assert cluster.name == "myname"
def test_config_name_oar_takes_custom_config():
    conf = {
        'queue': 'myqueue',
        'project': 'myproject',
        'ncpus': 1,
        'cores': 1,
        'memory': '2 GB',
        'walltime': '00:02',
        'job-extra': [],
        'name': 'myname',
        'processes': 1,
        'interface': None,
        'death-timeout': None,
        'local-directory': '/foo',
        'extra': [],
        'env-extra': [],
        'log-directory': None,
        'shebang': '#!/usr/bin/env bash',
        'job-cpu': None,
        'job-mem': None,
        'resource-spec': None
    }

    with dask.config.set({'jobqueue.oar-config-name': conf}):
        with OARCluster(config_name='oar-config-name') as cluster:
            assert cluster.name == 'myname'
def test_job_script():
    with OARCluster(walltime='00:02:00', processes=4, cores=8, memory='28GB') as cluster:
        job_script = cluster.job_script()
        assert '#OAR' in job_script
        assert '#OAR -n dask-worker' in job_script
        assert '--memory-limit 7.00GB ' in job_script
        assert '#OAR -l /nodes=1/core=8,walltime=00:02:00' in job_script
        assert '#OAR --project' not in job_script
        assert '#OAR -q' not in job_script

        assert 'export ' not in job_script

        assert '{} -m distributed.cli.dask_worker tcp://'.format(
            sys.executable) in job_script
        assert '--nthreads 2 --nprocs 4 --memory-limit 7.00GB' in job_script

    with OARCluster(walltime='00:02:00',
                    processes=4,
                    cores=8,
                    memory='28GB',
                    env_extra=[
                        'export LANG="en_US.utf8"',
                        'export LANGUAGE="en_US.utf8"',
                        'export LC_ALL="en_US.utf8"'
                    ]) as cluster:
        job_script = cluster.job_script()
        assert '#OAR' in job_script
        assert '#OAR -n dask-worker' in job_script
        assert '--memory-limit 7.00GB ' in job_script
        assert '#OAR -l /nodes=1/core=8,walltime=00:02:00' in job_script
        assert '#OAR --project' not in job_script
        assert '#OAR -q' not in job_script

        assert 'export LANG="en_US.utf8"' in job_script
        assert 'export LANGUAGE="en_US.utf8"' in job_script
        assert 'export LC_ALL="en_US.utf8"' in job_script

        assert '{} -m distributed.cli.dask_worker tcp://'.format(
            sys.executable) in job_script
        assert '--nthreads 2 --nprocs 4 --memory-limit 7.00GB' in job_script
def execute(self, pipeline_context, execution_plan): check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext) check.inst_param(execution_plan, "execution_plan", ExecutionPlan) check.param_invariant( isinstance(pipeline_context.executor, DaskExecutor), "pipeline_context", "Expected executor to be DaskExecutor got {}".format( pipeline_context.executor), ) check.invariant( pipeline_context.instance.is_persistent, "Dask execution requires a persistent DagsterInstance", ) step_levels = execution_plan.execution_step_levels() pipeline_name = pipeline_context.pipeline_def.name instance = pipeline_context.instance cluster_type = self.cluster_type if cluster_type == "local": from dask.distributed import LocalCluster cluster = LocalCluster(**self.build_dict(pipeline_name)) elif cluster_type == "yarn": from dask_yarn import YarnCluster cluster = YarnCluster(**self.build_dict(pipeline_name)) elif cluster_type == "ssh": from dask.distributed import SSHCluster cluster = SSHCluster(**self.build_dict(pipeline_name)) elif cluster_type == "pbs": from dask_jobqueue import PBSCluster cluster = PBSCluster(**self.build_dict(pipeline_name)) elif cluster_type == "moab": from dask_jobqueue import MoabCluster cluster = MoabCluster(**self.build_dict(pipeline_name)) elif cluster_type == "sge": from dask_jobqueue import SGECluster cluster = SGECluster(**self.build_dict(pipeline_name)) elif cluster_type == "lsf": from dask_jobqueue import LSFCluster cluster = LSFCluster(**self.build_dict(pipeline_name)) elif cluster_type == "slurm": from dask_jobqueue import SLURMCluster cluster = SLURMCluster(**self.build_dict(pipeline_name)) elif cluster_type == "oar": from dask_jobqueue import OARCluster cluster = OARCluster(**self.build_dict(pipeline_name)) elif cluster_type == "kube": from dask_kubernetes import KubeCluster cluster = KubeCluster(**self.build_dict(pipeline_name)) else: raise ValueError( f"Must be providing one of the following ('local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}" ) with dask.distributed.Client(cluster) as client: execution_futures = [] execution_futures_dict = {} for step_level in step_levels: for step in step_level: # We ensure correctness in sequencing by letting Dask schedule futures and # awaiting dependencies within each step. dependencies = [] for step_input in step.step_inputs: for key in step_input.dependency_keys: dependencies.append(execution_futures_dict[key]) run_config = dict(pipeline_context.run_config, execution={"in_process": {}}) recon_repo = pipeline_context.pipeline.get_reconstructable_repository( ) dask_task_name = "%s.%s" % (pipeline_name, step.key) recon_pipeline = recon_repo.get_reconstructable_pipeline( pipeline_name) future = client.submit( query_on_dask_worker, dependencies, recon_pipeline, pipeline_context.pipeline_run, run_config, [step.key], pipeline_context.mode_def.name, instance.get_ref(), key=dask_task_name, resources=get_dask_resource_requirements(step.tags), ) execution_futures.append(future) execution_futures_dict[step.key] = future # This tells Dask to awaits the step executions and retrieve their results to the # master futures = dask.distributed.as_completed(execution_futures, with_results=True) # Allow interrupts while waiting for the results from Dask for future, result in iterate_with_context( raise_interrupts_immediately, futures): for step_event in result: check.inst(step_event, DagsterEvent) yield step_event
def execute(self, pipeline_context, execution_plan):
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.param_invariant(
        isinstance(pipeline_context.executor, DaskExecutor),
        'pipeline_context',
        'Expected executor to be DaskExecutor got {}'.format(pipeline_context.executor),
    )

    # Checks to ensure storage is compatible with Dask configuration
    storage = pipeline_context.run_config.get('storage')
    check.invariant(storage.keys(), 'Must specify storage to use Dask execution')

    check.invariant(
        pipeline_context.instance.is_persistent,
        'Dask execution requires a persistent DagsterInstance',
    )

    # https://github.com/dagster-io/dagster/issues/2440
    check.invariant(
        pipeline_context.system_storage_def.is_persistent,
        'Cannot use in-memory storage with Dask, use filesystem, S3, or GCS',
    )

    step_levels = execution_plan.execution_step_levels()

    pipeline_name = pipeline_context.pipeline_def.name

    instance = pipeline_context.instance

    cluster_type = self.cluster_type
    if cluster_type == 'local':
        from dask.distributed import LocalCluster

        cluster = LocalCluster(**self.build_dict(pipeline_name))
    elif cluster_type == 'yarn':
        from dask_yarn import YarnCluster

        cluster = YarnCluster(**self.build_dict(pipeline_name))
    elif cluster_type == 'ssh':
        from dask.distributed import SSHCluster

        cluster = SSHCluster(**self.build_dict(pipeline_name))
    elif cluster_type == 'pbs':
        from dask_jobqueue import PBSCluster

        cluster = PBSCluster(**self.build_dict(pipeline_name))
    elif cluster_type == 'moab':
        from dask_jobqueue import MoabCluster

        cluster = MoabCluster(**self.build_dict(pipeline_name))
    elif cluster_type == 'sge':
        from dask_jobqueue import SGECluster

        cluster = SGECluster(**self.build_dict(pipeline_name))
    elif cluster_type == 'lsf':
        from dask_jobqueue import LSFCluster

        cluster = LSFCluster(**self.build_dict(pipeline_name))
    elif cluster_type == 'slurm':
        from dask_jobqueue import SLURMCluster

        cluster = SLURMCluster(**self.build_dict(pipeline_name))
    elif cluster_type == 'oar':
        from dask_jobqueue import OARCluster

        cluster = OARCluster(**self.build_dict(pipeline_name))
    elif cluster_type == 'kube':
        from dask_kubernetes import KubeCluster

        cluster = KubeCluster(**self.build_dict(pipeline_name))
    else:
        raise ValueError(
            f"Must be providing one of the following ('local', 'yarn', 'ssh', 'pbs', 'moab', "
            f"'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"
        )

    with dask.distributed.Client(cluster) as client:
        execution_futures = []
        execution_futures_dict = {}

        for step_level in step_levels:
            for step in step_level:
                # We ensure correctness in sequencing by letting Dask schedule futures and
                # awaiting dependencies within each step.
                dependencies = []
                for step_input in step.step_inputs:
                    for key in step_input.dependency_keys:
                        dependencies.append(execution_futures_dict[key])

                run_config = dict(pipeline_context.run_config, execution={'in_process': {}})
                recon_repo = pipeline_context.pipeline.get_reconstructable_repository()
                variables = {
                    'executionParams': {
                        'selector': {
                            'pipelineName': pipeline_name,
                            'repositoryName': recon_repo.get_definition().name,
                            'repositoryLocationName': '<<in_process>>',
                        },
                        'runConfigData': run_config,
                        'mode': pipeline_context.mode_def.name,
                        'executionMetadata': {'runId': pipeline_context.pipeline_run.run_id},
                        'stepKeys': [step.key],
                    }
                }

                dask_task_name = '%s.%s' % (pipeline_name, step.key)

                workspace = create_in_process_ephemeral_workspace(
                    pointer=pipeline_context.pipeline.get_reconstructable_repository().pointer
                )

                future = client.submit(
                    query_on_dask_worker,
                    workspace,
                    variables,
                    dependencies,
                    instance.get_ref(),
                    key=dask_task_name,
                    resources=get_dask_resource_requirements(step.tags),
                )

                execution_futures.append(future)
                execution_futures_dict[step.key] = future

        # This tells Dask to await the step executions and retrieve their results to the
        # master
        for future in dask.distributed.as_completed(execution_futures):
            for step_event in future.result():
                check.inst(step_event, DagsterEvent)
                yield step_event
def run_dask(options: dict,
             docker_username: str = None,
             docker_password: str = None,
             docker: bool = False,
             slurm_job_array: bool = False):
    try:
        if 'jobqueue' not in options:
            cluster = LocalCluster()
        else:
            jobqueue = options['jobqueue']
            gpus = options['gpus'] if 'gpus' in options else 0
            if 'slurm' in jobqueue:
                print("Requesting SLURM cluster:")
                pprint(jobqueue['slurm'])
                cluster = SLURMCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['slurm']) if gpus \
                    else SLURMCluster(**jobqueue['slurm'])
            elif 'pbs' in jobqueue:
                print("Requesting PBS cluster:")
                pprint(jobqueue['pbs'])
                cluster = PBSCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['pbs']) if gpus \
                    else PBSCluster(**jobqueue['pbs'])
            elif 'moab' in jobqueue:
                print("Requesting MOAB cluster:")
                pprint(jobqueue['moab'])
                cluster = MoabCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['moab']) if gpus \
                    else MoabCluster(**jobqueue['moab'])
            elif 'sge' in jobqueue:
                print("Requesting SGE cluster:")
                pprint(jobqueue['sge'])
                cluster = SGECluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['sge']) if gpus \
                    else SGECluster(**jobqueue['sge'])
            elif 'lsf' in jobqueue:
                print("Requesting LSF cluster:")
                pprint(jobqueue['lsf'])
                cluster = LSFCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['lsf']) if gpus \
                    else LSFCluster(**jobqueue['lsf'])
            elif 'oar' in jobqueue:
                print("Requesting OAR cluster:")
                pprint(jobqueue['oar'])
                cluster = OARCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['oar']) if gpus \
                    else OARCluster(**jobqueue['oar'])
            else:
                raise ValueError(f"Unsupported jobqueue configuration: {jobqueue}")

            print(f"Cluster job script: {cluster.job_script()}")

        if 'output' in options and 'from' in options['output']:
            output_path = options['output']['from']
        else:
            output_path = '.'

        if 'input' not in options:
            env = options['env'] if 'env' in options else []
            params = options['parameters'] if 'parameters' in options else []
            bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
            no_cache = options['no_cache'] if 'no_cache' in options else False
            gpus = options['gpus'] if 'gpus' in options else 0

            if 'jobqueue' in options:
                cluster.scale(1)
            with Client(cluster) as client:
                command = prep_command(
                    work_dir=options['workdir'],
                    image=options['image'],
                    command=options['command'],
                    env=env + [{'key': 'INDEX', 'value': 1}],
                    parameters=params + [{'key': 'OUTPUT', 'value': output_path}],
                    bind_mounts=bind_mounts,
                    no_cache=no_cache,
                    gpus=gpus,
                    docker_username=docker_username,
                    docker_password=docker_password,
                    docker=docker)

                logger.info(f"Submitting container")
                future = submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3)
                future.result()
                if future.status != 'finished':
                    logger.error(f"Container failed: {future.exception}")
                else:
                    logger.info(f"Container completed")
        elif options['input']['kind'] == InputKind.DIRECTORY:
            input_path = options['input']['path']
            env = options['env'] if 'env' in options else []
            params = options['parameters'] if 'parameters' in options else []
            bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
            no_cache = options['no_cache'] if 'no_cache' in options else False
            gpus = options['gpus'] if 'gpus' in options else 0

            if 'jobqueue' in options:
                cluster.scale(1)
            with Client(cluster) as client:
                command = prep_command(
                    work_dir=options['workdir'],
                    image=options['image'],
                    command=options['command'],
                    env=env + [{'key': 'INDEX', 'value': 1}],
                    parameters=params + [{'key': 'INPUT', 'value': input_path},
                                         {'key': 'OUTPUT', 'value': output_path}],
                    bind_mounts=bind_mounts,
                    no_cache=no_cache,
                    gpus=gpus,
                    docker_username=docker_username,
                    docker_password=docker_password,
                    docker=docker)

                logger.info(f"Submitting container for directory '{input_path}'")
                future = submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3)
                future.result()
                if future.status != 'finished':
                    logger.error(f"Container failed for directory '{input_path}': {future.exception}")
                else:
                    logger.info(f"Container completed for directory '{input_path}'")
        elif options['input']['kind'] == InputKind.FILES:
            input_path = options['input']['path']
            if slurm_job_array:
                files = os.listdir(input_path)
                file_id = int(os.environ.get('SLURM_ARRAY_TASK_ID'))
                current_file = files[file_id]
                env = options['env'] if 'env' in options else []
                params = options['parameters'] if 'parameters' in options else []
                patterns = options['input']['patterns'] if 'patterns' in options['input'] else []
                bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
                no_cache = options['no_cache'] if 'no_cache' in options else False
                gpus = options['gpus'] if 'gpus' in options else 0

                if 'jobqueue' in options:
                    cluster.scale(1)
                with Client(cluster) as client:
                    command = prep_command(
                        work_dir=options['workdir'],
                        image=options['image'],
                        command=options['command'],
                        env=env + [{'key': 'INDEX', 'value': file_id}] +
                            [{'key': 'PATTERNS', 'value': ','.join(patterns)}],
                        parameters=params + [{'key': 'INPUT', 'value': join(input_path, current_file)},
                                             {'key': 'OUTPUT', 'value': output_path}],
                        bind_mounts=bind_mounts,
                        no_cache=no_cache,
                        gpus=gpus,
                        docker_username=docker_username,
                        docker_password=docker_password,
                        docker=docker)

                    logger.info(f"Submitting container for file '{input_path}'")
                    future = submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3)
                    future.result()
                    if future.status != 'finished':
                        logger.error(f"Container failed for file '{input_path}': {future.exception}")
                    else:
                        logger.info(f"Container completed for file '{input_path}'")

                logger.info(f"Run succeeded")
            else:
                files = os.listdir(input_path)
                count = len(files)
                futures = []

                if 'jobqueue' not in options:
                    logger.info(f"Processing {count} files in '{input_path}'")
                else:
                    logger.info(f"Requesting {count} nodes to process {count} files in '{input_path}' "
                                f"with job script:\n{cluster.job_script()}")
                    cluster.scale(count)

                env = options['env'] if 'env' in options else []
                params = deepcopy(options['parameters']) if 'parameters' in options else []
                patterns = options['input']['patterns'] if 'patterns' in options['input'] else []
                bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
                no_cache = options['no_cache'] if 'no_cache' in options else False
                gpus = options['gpus'] if 'gpus' in options else 0

                with Client(cluster) as client:
                    num_files = len(files)
                    for i, current_file in tqdm.tqdm(enumerate(files), total=num_files):
                        command = prep_command(
                            work_dir=options['workdir'],
                            image=options['image'],
                            command=options['command'],
                            env=env + [{'key': 'INDEX', 'value': i}] +
                                [{'key': 'PATTERNS', 'value': ','.join(patterns)}],
                            parameters=params + [{'key': 'INPUT', 'value': join(input_path, current_file)},
                                                 {'key': 'OUTPUT', 'value': output_path}],
                            bind_mounts=bind_mounts,
                            no_cache=no_cache,
                            gpus=gpus,
                            docker_username=docker_username,
                            docker_password=docker_password,
                            docker=docker)

                        logger.info(f"Submitting container for file {i}")
                        futures.append(submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3))

                    finished = 0
                    for future in tqdm.tqdm(as_completed(futures), total=num_files):
                        finished += 1
                        if future.status != 'finished':
                            logger.error(f"Container failed for file {finished}: {future.exception}")
                        else:
                            logger.info(f"Container completed for file {finished}")
        elif options['input']['kind'] == InputKind.FILE:
            input_path = options['input']['path']
            env = options['env'] if 'env' in options else []
            params = options['parameters'] if 'parameters' in options else []
            patterns = options['input']['patterns'] if 'patterns' in options['input'] else []
            bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
            no_cache = options['no_cache'] if 'no_cache' in options else False
            gpus = options['gpus'] if 'gpus' in options else 0

            if 'jobqueue' in options:
                cluster.scale(1)
            with Client(cluster) as client:
                command = prep_command(
                    work_dir=options['workdir'],
                    image=options['image'],
                    command=options['command'],
                    env=env + [{'key': 'INDEX', 'value': 1}] +
                        [{'key': 'PATTERNS', 'value': ','.join(patterns)}],
                    parameters=params + [{'key': 'INPUT', 'value': input_path},
                                         {'key': 'OUTPUT', 'value': output_path}],
                    bind_mounts=bind_mounts,
                    no_cache=no_cache,
                    gpus=gpus,
                    docker_username=docker_username,
                    docker_password=docker_password,
                    docker=docker)

                logger.info(f"Submitting container for file 1")
                future = submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3)
                future.result()
                if future.status != 'finished':
                    logger.error(f"Container failed for file 1")
                    logger.error(future.exception)
                else:
                    logger.info(f"Container completed for file 1")

            logger.info(f"Run succeeded")
    except:
        logger.error(f"Run failed: {traceback.format_exc()}")
        raise