Example #1
def construct_worker_launch_command(batch: Optional[Dict], btype: str,
                                    nodes: int) -> str:
    """
    If no 'worker_launch' is found in the batch yaml, this method constructs the needed launch command.

    :param batch: (Optional[Dict]) An optional batch override from the worker config
    :param btype: (str) The type of batch (e.g. slurm, flux, local, lsf)
    :param nodes: (int) The number of nodes to use in the batch launch
    """
    launch_command: str = ""
    workload_manager: str = get_batch_type()
    bank: str = get_yaml_var(batch, "bank", "")
    queue: str = get_yaml_var(batch, "queue", "")
    walltime: str = get_yaml_var(batch, "walltime", "")
    if btype == "slurm" or workload_manager == "slurm":
        launch_command = f"srun -N {nodes} -n {nodes}"
        if bank:
            launch_command += f" -A {bank}"
        if queue:
            launch_command += f" -p {queue}"
        if walltime:
            launch_command += f" -t {walltime}"
    if workload_manager == "lsf":
        # The jsrun utility does not have a time argument
        launch_command = f"jsrun -a 1 -c ALL_CPUS -g ALL_GPUS --bind=none -n {nodes}"

    return launch_command
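
A minimal usage sketch for the builder above, assuming the merlin helpers (get_yaml_var, get_batch_type) are importable and get_yaml_var reads keys from a plain dict; all batch values are hypothetical.

# Hypothetical override mirroring the "batch" section of a merlin spec.
batch = {"bank": "myBank", "queue": "pbatch", "walltime": "01:00:00"}

cmd = construct_worker_launch_command(batch, btype="slurm", nodes=2)
# Provided get_batch_type() does not report "lsf", this yields:
#   srun -N 2 -n 2 -A myBank -p pbatch -t 01:00:00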
Example #2
def examine_and_log_machines(worker_val, yenv) -> bool:
    """
    Examines whether a worker should be skipped in a step of start_celery_workers()
    and logs an error if the worker's output path is not accessible from this host.
    """
    worker_machines = get_yaml_var(worker_val, "machines", None)
    if worker_machines:
        LOG.debug("check machines = ", check_machines(worker_machines))
        if not check_machines(worker_machines):
            return True

        if yenv:
            output_path = get_yaml_var(yenv, "OUTPUT_PATH", None)
            if output_path and not os.path.exists(output_path):
                hostname = socket.gethostname()
                LOG.error(
                    f"The output path, {output_path}, is not accessible on this host, {hostname}"
                )
        else:
            LOG.warning(
                "The env:variables section does not have an OUTPUT_PATH specified, multi-machine checks cannot be performed."
            )
        return False

    return False
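
A short usage sketch, assuming check_machines() returns True only when the current host appears in the given machine list (that semantic is an assumption); the host names and path below are hypothetical.

# True means "skip this worker", i.e. this host does not serve it.
skip = examine_and_log_machines(
    worker_val={"machines": ["hostA", "hostB"]},        # hypothetical hosts
    yenv={"OUTPUT_PATH": "/path/to/workflow/output"},   # hypothetical path
)
if skip:
    pass  # the caller (start_celery_workers) would `continue` to the next worker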
Example #3
def batch_check_parallel(spec):
    """
    Check for a parallel batch section in the yaml file.
    """
    parallel = False

    try:
        batch = spec.batch
    except AttributeError:
        LOG.error("The batch section is required in the specification file.")
        raise

    btype = get_yaml_var(batch, "type", "local")
    if btype != "local":
        parallel = True

    return parallel
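
A minimal behavioral sketch, assuming get_yaml_var reads keys from a plain dict; SimpleNamespace stands in for a real spec object here.

from types import SimpleNamespace

# Any batch type other than "local" is treated as parallel.
assert batch_check_parallel(SimpleNamespace(batch={"type": "slurm"})) is True
assert batch_check_parallel(SimpleNamespace(batch={"type": "local"})) is False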
Example #4
def batch_worker_launch(spec, com, nodes=None, batch=None):
    """
    The configuration in the batch section of the merlin spec
    is used to create the worker launch line, which may be
    different from a simulation launch.

    spec : The workflow specification
    com (str): The command to launch with batch configuration
    nodes (int): The number of nodes to use in the batch launch
    batch (dict): An optional batch override from the worker config
    """
    if batch is None:
        try:
            batch = spec.batch
        except AttributeError:
            LOG.error(
                "The batch section is required in the specification file.")
            raise

    btype = get_yaml_var(batch, "type", "local")

    # A jsrun submission cannot be run under a parent jsrun so
    # all non flux lsf submissions need to be local.
    if btype == "local" or "lsf" in btype:
        return com

    if nodes is None:
        # Use the value in the batch section
        nodes = get_yaml_var(batch, "nodes", None)

    # Get the number of nodes from the environment if unset
    if nodes is None or nodes == "all":
        nodes = get_node_count(default=1)

    bank = get_yaml_var(batch, "bank", "")
    queue = get_yaml_var(batch, "queue", "")
    shell = get_yaml_var(batch, "shell", "bash")
    walltime = get_yaml_var(batch, "walltime", "")

    launch_pre = get_yaml_var(batch, "launch_pre", "")
    launch_args = get_yaml_var(batch, "launch_args", "")
    worker_launch = get_yaml_var(batch, "worker_launch", "")

    if btype == "flux":
        launcher = get_batch_type()
    else:
        launcher = get_batch_type()

    launchs = worker_launch
    if not launchs:
        if btype == "slurm" or launcher == "slurm":
            launchs = f"srun --mpi=none -N {nodes} -n {nodes}"
            if bank:
                launchs += f" -A {bank}"
            if queue:
                launchs += f" -p {queue}"
            if walltime:
                launchs += f" -t {walltime}"
        if launcher == "lsf":
            # The jsrun utility does not have a time argument
            launchs = f"jsrun -a 1 -c ALL_CPUS -g ALL_GPUS --bind=none -n {nodes}"

    launchs += f" {launch_args}"

    # Allow for any pre launch manipulation, e.g. module load
    # hwloc/1.11.10-cuda
    if launch_pre:
        launchs = f"{launch_pre} {launchs}"

    worker_cmd = f"{launchs} {com}"

    if btype == "flux":
        flux_path = get_yaml_var(batch, "flux_path", "")
        flux_opts = get_yaml_var(batch, "flux_start_opts", "")
        flux_exec_workers = get_yaml_var(batch, "flux_exec_workers", True)

        flux_exec = ""
        if flux_exec_workers:
            flux_exec = "flux exec"

        if "/" in flux_path:
            flux_path += "/"

        flux_exe = os.path.join(flux_path, "flux")

        launch = (
            f"{launchs} {flux_exe} start {flux_opts} {flux_exec} `which {shell}` -c"
        )
        worker_cmd = f'{launch} "{com}"'

    return worker_cmd
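
A minimal call sketch for the slurm path, assuming get_yaml_var reads keys from the override dict and get_batch_type() does not report "lsf"; the command and batch values are hypothetical.

worker_cmd = batch_worker_launch(
    None,                                  # spec is only consulted when batch is None
    "celery -A merlin worker -l info",     # hypothetical celery command to wrap
    nodes=2,
    batch={"type": "slurm", "bank": "myBank", "queue": "pbatch"},
)
# -> roughly: srun --mpi=none -N 2 -n 2 -A myBank -p pbatch celery -A merlin worker -l info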
Example #5
def start_celery_workers(spec, steps, celery_args, just_return_command):
    """ Start the celery workers on the allocation

    spec        The workflow specification (a MerlinSpec object)
    ...

    example config:

    merlin:
      resources:
        task_server: celery
        overlap: False
        workers:
            simworkers:
                args: -O fair --prefetch-multiplier 1 -E -l info --concurrency 4
                steps: [run, data]
                nodes: 1
                machine: [hostA, hostB]
    """
    if not just_return_command:
        LOG.info("Starting workers")

    overlap = spec.merlin["resources"]["overlap"]
    workers = spec.merlin["resources"]["workers"]

    senv = spec.environment
    spenv = os.environ.copy()
    yenv = None
    if senv:
        yenv = get_yaml_var(senv, "variables", {})
        for k, v in yenv.items():
            spenv[str(k)] = str(v)
            # For expandvars
            os.environ[str(k)] = str(v)

    worker_list = []
    local_queues = []

    for worker_name, worker_val in workers.items():
        worker_machines = get_yaml_var(worker_val, "machines", None)
        if worker_machines:
            LOG.debug("check machines = ", check_machines(worker_machines))
            if not check_machines(worker_machines):
                continue

            if yenv:
                output_path = get_yaml_var(yenv, "OUTPUT_PATH", None)
                if output_path and not os.path.exists(output_path):
                    hostname = socket.gethostname()
                    LOG.error(
                        f"The output path, {output_path}, is not accessible on this host, {hostname}"
                    )
            else:
                LOG.warning(
                    "The env:variables section does not have an OUTPUT_PATH"
                    "specified, multi-machine checks cannot be performed."
                )

        worker_args = get_yaml_var(worker_val, "args", celery_args)
        with suppress(KeyError):
            if worker_val["args"] is None:
                worker_args = ""

        worker_nodes = get_yaml_var(worker_val, "nodes", None)

        worker_batch = get_yaml_var(worker_val, "batch", None)

        wsteps = get_yaml_var(worker_val, "steps", steps)
        queues = spec.make_queue_string(wsteps).split(",")

        # Check for missing arguments
        parallel = batch_check_parallel(spec)
        if parallel:
            if "--concurrency" not in worker_args:
                LOG.warning(
                    "The worker arg --concurrency [1-4] is recommended "
                    "when running parallel tasks"
                )
            if "--prefetch-multiplier" not in worker_args:
                LOG.warning(
                    "The worker arg --prefetch-multiplier 1 is "
                    "recommended when running parallel tasks"
                )
            if "fair" not in worker_args:
                LOG.warning(
                    "The worker arg -O fair is recommended when running "
                    "parallel tasks"
                )

        if "-n" not in worker_args:
            nhash = ""
            if overlap:
                nhash = time.strftime("%Y%m%d-%H%M%S")
            # TODO: Once flux fixes their bug, change this back to %h
            worker_args += f" -n {worker_name}{nhash}.%%h"

        if "-l" not in worker_args:
            worker_args += f" -l {logging.getLevelName(LOG.getEffectiveLevel())}"

        # Add a per worker log file (debug)
        if LOG.isEnabledFor(logging.DEBUG):
            LOG.debug("Redirecting worker output to individual log files")
            worker_args += f" --logfile %p.%i"

        # Get the celery command
        celery_com = launch_celery_workers(
            spec, steps=wsteps, worker_args=worker_args, just_return_command=True
        )

        celery_cmd = os.path.expandvars(celery_com)

        worker_cmd = batch_worker_launch(
            spec, celery_cmd, nodes=worker_nodes, batch=worker_batch
        )

        worker_cmd = os.path.expandvars(worker_cmd)

        try:
            kwargs = {"env": spenv, "shell": True, "universal_newlines": True}
            # These cannot be used with a detached process
            # "stdout":               subprocess.PIPE,
            # "stderr":               subprocess.PIPE,

            LOG.debug(f"worker cmd={worker_cmd}")
            LOG.debug(f"env={spenv}")

            found = []
            running_queues = []

            if not just_return_command and not overlap:
                running_queues.extend(get_running_queues())
            running_queues.extend(local_queues)

            for q in queues:
                if q in running_queues:
                    found.append(q)

            if found:
                LOG.warning(
                    f"A celery worker named '{worker_name}' is already configured/running for queue(s) = {' '.join(found)}"
                )
                continue

            # Cache the queues from this worker to use to test
            # for existing queues in any subsequent workers.
            # If overlap is True, then do not check the local queues.
            # This will allow multiple workers to pull from the same
            # queue.
            if not overlap:
                local_queues.extend(queues)

            if just_return_command:
                worker_list = ""
                print(worker_cmd)
                continue

            _ = subprocess.Popen(worker_cmd, **kwargs)

            worker_list.append(worker_cmd)

        except Exception as e:
            LOG.error(f"Cannot start celery workers, {e}")
            raise

    # Return a string with the worker commands for logging
    return str(worker_list)
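
The per-worker queue bookkeeping above (running_queues / local_queues / found) is the subtle part; here is a tiny self-contained sketch of that de-duplication check, with illustrative queue names.

def already_served(worker_queues, running_queues):
    """Return the queues this worker would serve that another worker already covers."""
    return [q for q in worker_queues if q in running_queues]

# With overlap disabled, a second worker on an already-served queue gets skipped:
#   already_served(["merlin_run", "merlin_data"], ["merlin_run"]) -> ["merlin_run"]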
Example #6
def batch_worker_launch(
    spec: Dict,
    com: str,
    nodes: Optional[Union[str, int]] = None,
    batch: Optional[Dict] = None,
) -> str:
    """
    The configuration in the batch section of the merlin spec
    is used to create the worker launch line, which may be
    different from a simulation launch.

    :param spec: (Dict) The workflow specification
    :param com: (str) The command to launch with batch configuration
    :param nodes: (Optional[Union[str, int]]) The number of nodes to use in the batch launch
    :param batch: (Optional[Dict]) An optional batch override from the worker config
    """
    if batch is None:
        try:
            batch = spec.batch
        except AttributeError:
            LOG.error(
                "The batch section is required in the specification file.")
            raise

    btype: str = get_yaml_var(batch, "type", "local")

    # A jsrun submission cannot be run under a parent jsrun so
    # all non flux lsf submissions need to be local.
    if btype == "local" or "lsf" in btype:
        return com

    if nodes is None:
        # Use the value in the batch section
        nodes = get_yaml_var(batch, "nodes", None)

    # Get the number of nodes from the environment if unset
    if nodes is None or nodes == "all":
        nodes = get_node_count(default=1)
    elif not isinstance(nodes, int):
        raise TypeError(
            "Nodes was passed into batch_worker_launch with an invalid type (likely a string other than 'all')."
        )

    shell: str = get_yaml_var(batch, "shell", "bash")

    launch_pre: str = get_yaml_var(batch, "launch_pre", "")
    launch_args: str = get_yaml_var(batch, "launch_args", "")
    launch_command: str = get_yaml_var(batch, "worker_launch", "")

    if not launch_command:
        launch_command = construct_worker_launch_command(batch, btype, nodes)

    launch_command += f" {launch_args}"

    # Allow for any pre launch manipulation, e.g. module load
    # hwloc/1.11.10-cuda
    if launch_pre:
        launch_command = f"{launch_pre} {launch_command}"

    worker_cmd: str = ""
    if btype == "flux":
        flux_path: str = get_yaml_var(batch, "flux_path", "")
        flux_opts: Union[str, Dict] = get_yaml_var(batch, "flux_start_opts", "")
        flux_exec_workers: Union[str, Dict, bool] = get_yaml_var(batch, "flux_exec_workers", True)

        flux_exec: str = ""
        if flux_exec_workers:
            flux_exec = "flux exec"

        if "/" in flux_path:
            flux_path += "/"

        flux_exe: str = os.path.join(flux_path, "flux")

        launch: str = f"{launch_command} {flux_exe} start {flux_opts} {flux_exec} `which {shell}` -c"
        worker_cmd = f'{launch} "{com}"'
    else:
        worker_cmd = f"{launch_command} {com}"

    return worker_cmd
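
A minimal sketch of the flux branch, assuming get_yaml_var reads keys from the override dict, get_batch_type() reports "slurm" for the underlying allocation, and construct_worker_launch_command behaves as in Example #1; all values are hypothetical.

worker_cmd = batch_worker_launch(
    None,                                  # spec is only consulted when batch is None
    "celery -A merlin worker -l info",     # hypothetical celery command to wrap
    nodes=2,
    batch={"type": "flux", "flux_path": "/usr/bin", "flux_start_opts": "-o,-S,log-filename=flux.out"},
)
# -> roughly:
#   srun -N 2 -n 2 /usr/bin/flux start -o,-S,log-filename=flux.out flux exec `which bash` -c "celery -A merlin worker -l info"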
Example #7
def start_celery_workers(spec, steps, celery_args, just_return_command):
    """Start the celery workers on the allocation

    spec        The workflow specification (a MerlinSpec object)
    ...

    example config:

    merlin:
      resources:
        task_server: celery
        overlap: False
        workers:
            simworkers:
                args: -O fair --prefetch-multiplier 1 -E -l info --concurrency 4
                steps: [run, data]
                nodes: 1
                machine: [hostA, hostB]
    """
    if not just_return_command:
        LOG.info("Starting workers")

    overlap = spec.merlin["resources"]["overlap"]
    workers = spec.merlin["resources"]["workers"]

    senv = spec.environment
    spenv = os.environ.copy()
    yenv = None
    if senv:
        yenv = get_yaml_var(senv, "variables", {})
        for k, v in yenv.items():
            spenv[str(k)] = str(v)
            # For expandvars
            os.environ[str(k)] = str(v)

    worker_list = []
    local_queues = []

    for worker_name, worker_val in workers.items():
        skip_loop_step: bool = examine_and_log_machines(worker_val, yenv)
        if skip_loop_step:
            continue

        worker_args = get_yaml_var(worker_val, "args", celery_args)
        with suppress(KeyError):
            if worker_val["args"] is None:
                worker_args = ""

        worker_nodes = get_yaml_var(worker_val, "nodes", None)

        worker_batch = get_yaml_var(worker_val, "batch", None)

        wsteps = get_yaml_var(worker_val, "steps", steps)
        queues = spec.make_queue_string(wsteps).split(",")

        # Check for missing arguments
        verify_args(spec, worker_args, worker_name, overlap)

        # Add a per worker log file (debug)
        if LOG.isEnabledFor(logging.DEBUG):
            LOG.debug("Redirecting worker output to individual log files")
            worker_args += " --logfile %p.%i"

        # Get the celery command
        celery_com = launch_celery_workers(spec,
                                           steps=wsteps,
                                           worker_args=worker_args,
                                           just_return_command=True)

        celery_cmd = os.path.expandvars(celery_com)

        worker_cmd = batch_worker_launch(spec,
                                         celery_cmd,
                                         nodes=worker_nodes,
                                         batch=worker_batch)

        worker_cmd = os.path.expandvars(worker_cmd)

        try:
            kwargs = {"env": spenv, "shell": True, "universal_newlines": True}
            # These cannot be used with a detached process
            # "stdout":               subprocess.PIPE,
            # "stderr":               subprocess.PIPE,

            LOG.debug(f"worker cmd={worker_cmd}")
            LOG.debug(f"env={spenv}")

            if just_return_command:
                worker_list = ""
                print(worker_cmd)
                continue

            found = []
            running_queues = []

            running_queues.extend(local_queues)
            if not overlap:
                running_queues.extend(get_running_queues())
                # Cache the queues from this worker to use to test
                # for existing queues in any subsequent workers.
                # If overlap is True, then do not check the local queues.
                # This will allow multiple workers to pull from the same
                # queue.
                local_queues.extend(queues)

            for q in queues:
                if q in running_queues:
                    found.append(q)

            if found:
                LOG.warning(
                    f"A celery worker named '{worker_name}' is already configured/running for queue(s) = {' '.join(found)}"
                )
                continue

            _ = subprocess.Popen(worker_cmd, **kwargs)

            worker_list.append(worker_cmd)

        except Exception as e:
            LOG.error(f"Cannot start celery workers, {e}")
            raise

    # Return a string with the worker commands for logging
    return str(worker_list)
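
verify_args() is called above in place of the inline warning checks from Example #5; below is a minimal sketch of such a helper, reconstructed from those inline checks. The name and signature match the call site, but the body is an assumption, not the actual merlin implementation, which may also apply the -n/-l defaults shown in Example #5 (hence the unused worker_name and overlap parameters here).

def verify_args(spec, worker_args, worker_name, overlap):
    """Warn about missing celery worker args (sketch based on Example #5's inline checks)."""
    if batch_check_parallel(spec):
        if "--concurrency" not in worker_args:
            LOG.warning("The worker arg --concurrency [1-4] is recommended when running parallel tasks")
        if "--prefetch-multiplier" not in worker_args:
            LOG.warning("The worker arg --prefetch-multiplier 1 is recommended when running parallel tasks")
        if "fair" not in worker_args:
            LOG.warning("The worker arg -O fair is recommended when running parallel tasks")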