Example #1
File: pipeline.py Project: jgu2/jade
def create(
    auto_config_cmds,
    per_node_batch_size,
    config_file,
    hpc_config,
    local,
    max_nodes,
    poll_interval,
    num_processes,
    reports,
    enable_singularity,
    container,
    verbose,
):
    """Create a pipeline with multiple Jade configurations."""
    if local:
        hpc_config = HpcConfig(hpc_type=HpcType.LOCAL, hpc=LocalHpcConfig())
    else:
        if not os.path.exists(hpc_config):
            print(
                f"{hpc_config} does not exist. Generate it with 'jade config hpc' "
                "or run in local mode with '-l'",
                file=sys.stderr,
            )
            sys.exit(1)
        hpc_config = HpcConfig(**load_data(hpc_config))

    if enable_singularity:
        singularity_params = SingularityParams(enabled=True,
                                               container=container)
    else:
        singularity_params = None
    submit_params = SubmitterParams(
        generate_reports=reports,
        hpc_config=hpc_config,
        max_nodes=max_nodes,
        num_processes=num_processes,
        per_node_batch_size=per_node_batch_size,
        poll_interval=poll_interval,
        singularity_params=singularity_params,
        verbose=verbose,
    )
    PipelineManager.create_config(auto_config_cmds, config_file, submit_params)
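A minimal sketch of invoking this function directly, assuming local mode so that no HPC config file is required; every argument value below is an illustrative placeholder, not a value from the jade project:

create(
    auto_config_cmds=["generic_command commands.txt"],  # placeholder command
    per_node_batch_size=500,
    config_file="pipeline.json",
    hpc_config=None,   # ignored when local=True, which replaces it with LocalHpcConfig
    local=True,        # skips the hpc_config existence check
    max_nodes=1,
    poll_interval=10,
    num_processes=None,
    reports=True,
    enable_singularity=False,
    container=None,
    verbose=False,
)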
Example #2
File: test_cluster.py Project: jgu2/jade
@pytest.fixture
def cluster():
    os.makedirs(OUTPUT, exist_ok=True)
    commands = ["echo 'hello'"] * 2
    cmd_file = os.path.join(OUTPUT, "commands.txt")
    with open(cmd_file, "w") as f_out:
        for cmd in commands:
            f_out.write(cmd + "\n")

    jade_config = GenericCommandConfiguration.auto_config(cmd_file)
    config_file = os.path.join(OUTPUT, CONFIG_FILE)
    jade_config.dump(config_file)
    hpc_config = HpcConfig(hpc_type="slurm", hpc=SlurmConfig(account="abc"))
    cluster = Cluster.create(OUTPUT, jade_config)

    yield cluster

    if os.path.exists(OUTPUT):
        shutil.rmtree(OUTPUT)
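Because the fixture yields the Cluster instance and then cleans up, pytest injects it into any test that names it as a parameter. A minimal sketch; the assertion is illustrative, since the Cluster API is not shown in this snippet:

def test_cluster_fixture(cluster):
    # cluster is the object yielded above; pytest runs the cleanup
    # (shutil.rmtree) after the test finishes.
    assert cluster is not None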
Example #3
File: config.py Project: jgu2/jade
def hpc(account, config_file, mem, partition, qos, hpc_type, tmp, walltime):
    """Create an HPC config file."""
    if hpc_type == "slurm":
        hpc = SlurmConfig(
            account=account,
            mem=mem,
            partition=partition,
            qos=qos,
            tmp=tmp,
            walltime=walltime,
        )
    elif hpc_type == "fake":
        hpc = FakeHpcConfig(walltime=walltime)
    else:
        assert hpc_type == "local"
        hpc = LocalHpcConfig()

    # This converts enums to values.
    data = json.loads(HpcConfig(hpc_type=hpc_type, hpc=hpc).json())
    dump_data(data, config_file)
    print(f"Created HPC config file {config_file}")
Example #4
def make_submitter_params(
    per_node_batch_size=None,
    dry_run=None,
    hpc_config=None,
    local=None,
    max_nodes=None,
    poll_interval=None,
    resource_monitor_interval=None,
    resource_monitor_type=None,
    num_processes=None,
    verbose=None,
    reports=None,
    enable_singularity=None,
    container=None,
    try_add_blocked_jobs=None,
    time_based_batching=None,
    node_setup_script=None,
    node_shutdown_script=None,
    no_distributed_submitter=None,
):
    """Returns an instance of SubmitterParams for use in a job submission."""
    if node_setup_script is not None or node_shutdown_script is not None:
        print(
            "Warning: node_setup_script and node_shutdown_script are deprecated and will "
            "be removed in release v0.9.0.")
    if local:
        hpc_config = HpcConfig(hpc_type="local", hpc=LocalHpcConfig())
    else:
        # TODO: If the config file contains submission groups then this should not be required.
        if not os.path.exists(hpc_config):
            print(
                f"{hpc_config} does not exist. Generate it with 'jade config hpc' "
                "or run in local mode with '-l'",
                file=sys.stderr,
            )
            sys.exit(1)
        hpc_config = HpcConfig(**load_data(hpc_config))

    if local and dry_run:
        print("Dry run is not allowed in local mode.", file=sys.stderr)
        sys.exit(1)

    if (time_based_batching and per_node_batch_size !=
            SUBMITTER_PARAMS_DEFAULTS["per_node_batch_size"]):
        # This doesn't catch the case where the user passes --per-node-batch-size=default, but
        # I don't see that click provides a way to detect that condition.
        print(
            "Error: --per-node-batch-size and --time-based-batching are mutually exclusive",
            file=sys.stderr,
        )
        sys.exit(1)

    if time_based_batching and num_processes is None:
        print("Error: num_processes must be set with time-based batching",
              file=sys.stderr)
        sys.exit(1)

    # We added resource_monitor_type after resource_monitor_interval. The following logic
    # maintains backwards compatibility with user settings.
    default_monitor_interval = SUBMITTER_PARAMS_DEFAULTS[
        "resource_monitor_interval"]
    if resource_monitor_interval is not None and resource_monitor_type is not None:
        pass
    elif resource_monitor_interval is None and resource_monitor_type is None:
        resource_monitor_type = ResourceMonitorType.AGGREGATION
        resource_monitor_interval = default_monitor_interval
    elif resource_monitor_interval is not None and resource_monitor_type is None:
        resource_monitor_type = ResourceMonitorType.PERIODIC
    elif resource_monitor_interval is None and resource_monitor_type is not None:
        resource_monitor_interval = default_monitor_interval
    else:
        assert False, f"interval={resource_monitor_interval} type={resource_monitor_type}"
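
    # Resolution summary of the four branches above:
    #   interval set, type set -> honor both user settings
    #   neither set            -> AGGREGATION at the default interval
    #   interval only          -> PERIODIC (matches pre-resource_monitor_type behavior)
    #   type only              -> requested type at the default interval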

    if enable_singularity:
        singularity_params = SingularityParams(enabled=True,
                                               container=container)
    else:
        singularity_params = None
    return SubmitterParams(
        generate_reports=reports,
        hpc_config=hpc_config,
        max_nodes=max_nodes,
        num_processes=num_processes,
        per_node_batch_size=per_node_batch_size,
        distributed_submitter=not no_distributed_submitter,
        dry_run=dry_run,
        node_setup_script=node_setup_script,
        node_shutdown_script=node_shutdown_script,
        poll_interval=poll_interval,
        resource_monitor_interval=resource_monitor_interval,
        resource_monitor_type=resource_monitor_type,
        singularity_params=singularity_params,
        time_based_batching=time_based_batching,
        try_add_blocked_jobs=try_add_blocked_jobs,
        verbose=verbose,
    )
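A minimal sketch of calling this helper in local mode with only the monitor interval set, which exercises the backwards-compatibility branch above; the values are placeholders:

params = make_submitter_params(
    local=True,                   # avoids the hpc_config file check
    max_nodes=2,
    poll_interval=60,
    resource_monitor_interval=5,  # type is None, so PERIODIC is selected
    reports=True,
)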
Example #5
def config(
    collect_worker_logs,
    container_path,
    dynamic_allocation,
    gpu,
    hpc_config,
    master_node_memory_overhead_gb,
    node_memory_overhead_gb,
    run_user_script_inside_container,
    spark_dir,
    shuffle_partition_multiplier,
    update_config_file,
    use_tmpfs_for_scratch,
    alt_scratch,
    verbose,
    worker_memory_gb,
    force,
):
    """Create a Spark configuration to use for running a job on a Spark cluster."""
    level = logging.DEBUG if verbose else logging.WARNING
    setup_logging("config_spark", None, console_level=level)
    spark_dir = Path(spark_dir)
    if spark_dir.exists():
        if force:
            shutil.rmtree(spark_dir)
        else:
            print(
                f"The directory '{spark_dir}' already exists. Use a different name or pass --force to overwrite.",
                file=sys.stderr,
            )
            sys.exit(1)
    spark_dir.mkdir(parents=True)

    if use_tmpfs_for_scratch and alt_scratch is not None:
        print("use_tmpfs_for_scratch and alt_scratch cannot both be set",
              file=sys.stderr)
        sys.exit(1)

    hpc_config_data = HpcConfig.load(hpc_config)
    nodes = getattr(hpc_config_data.hpc, "nodes", None)
    if nodes is None:
        print(
            f"hpc_type={hpc_config_data.hpc_type} doesn't have a nodes field",
            file=sys.stderr)
        sys.exit(1)
    mem = getattr(hpc_config_data.hpc, "mem", None)
    if mem is None:
        executor_mem_gb = 11
        print(f"Use default per-executor memory of {executor_mem_gb}G")
    else:
        num_executors = 7
        if not mem.endswith("G"):
            raise Exception(
                "This feature only supports HPC memory requirements ending with 'G'"
            )
        per_node_mem_gb = int(mem[:-1])
        if use_tmpfs_for_scratch:
            per_node_mem_gb //= 2
        overhead = master_node_memory_overhead_gb - node_memory_overhead_gb
        executor_mem_gb = (per_node_mem_gb - overhead) // num_executors
        print(
            f"Use custom per-executor memory of {executor_mem_gb}G based on per-node {mem}"
        )

    for dirname in ("bin", "conf"):
        src_path = Path(os.path.dirname(__file__)).parent / "spark" / dirname
        dst_path = spark_dir / dirname
        if not dst_path.exists():
            dst_path.mkdir()
        for filename in src_path.iterdir():
            shutil.copyfile(filename, dst_path / filename.name)

    use_gpus = _should_use_gpus(hpc_config_data, gpu)

    with open(spark_dir / "conf" / "spark-defaults.conf", "a") as f_out:
        f_out.write("\n")
        f_out.write(f"spark.executor.memory {executor_mem_gb}G\n")
        # Online documentation says this value should correlate with the number of cores in the
        # cluster. Some sources say 1 per core, others say 2 or 4 per core. Depends on use case.
        # This should be a reasonable default for users, who can customize dynamically.
        params = ["spark.sql.shuffle.partitions"]
        # Some sources say that we should set spark.default.parallelism to the same value,
        # others say it doesn't work. Experiments showed harmful effects if dynamic allocation
        # was enabled with a custom value.
        for param in params:
            f_out.write(param)
            f_out.write(" ")
            f_out.write(str(nodes * 35 * shuffle_partition_multiplier))
            f_out.write("\n")

        if dynamic_allocation:
            f_out.write("\n")
            f_out.write(DYNAMIC_ALLOCATION_SETTINGS)
            f_out.write("\n")

        if use_gpus:
            src_path = (Path(os.path.dirname(__file__)).parent / "spark" /
                        "conf" / "resourcesFile.json")
            resources_file = spark_dir / "conf" / "resourcesFile.json"
            shutil.copyfile(src_path, resources_file)
            f_out.write(
                "spark.worker.resource.gpu.discoveryScript /opt/sparkRapidsPlugin/getGpusResources.sh\n"
            )
            f_out.write(f"spark.worker.resourcesFile {resources_file}\n")

    if use_gpus:
        filename = spark_dir / "conf" / "spark-env.sh"
        with open(filename, "a") as f_out:
            num_gpus = hpc_config_data.get_num_gpus() or 2
            f_out.write(
                f'SPARK_WORKER_OPTS="-Dspark.worker.resource.gpu.amount={num_gpus} '
                f'-Dspark.worker.resource.gpu.discoveryScript={GPU_DISCOVERY_SCRIPT}"\n'
            )

    replacement_values = [
        ("SPARK_DIR", str(spark_dir)),
        ("CONTAINER_PATH", container_path),
    ]
    for name in ("run_spark_script_wrapper.sh", "run_user_script_wrapper.sh"):
        filename = spark_dir / "bin" / name
        _replace_tag(replacement_values, filename)
        st = os.stat(filename)
        os.chmod(filename, st.st_mode | stat.S_IEXEC)
        print(f"Assigned paths in {filename}")

    scripts = [spark_dir / "conf" / "spark-env.sh"] + list(
        (spark_dir / "bin").glob("*.sh"))
    for script in scripts:
        st = os.stat(script)
        os.chmod(script, st.st_mode | stat.S_IEXEC)

    print(
        f"Created Spark configuration in {spark_dir.absolute()} for a {nodes}-node cluster. "
        f"GPUs={use_gpus}")

    spark_config = SparkConfigModel(
        collect_worker_logs=collect_worker_logs,
        conf_dir=str(spark_dir),
        container=SparkContainerModel(path=container_path),
        enabled=True,
        master_node_memory_overhead_gb=master_node_memory_overhead_gb,
        node_memory_overhead_gb=node_memory_overhead_gb,
        run_user_script_inside_container=run_user_script_inside_container,
        use_tmpfs_for_scratch=use_tmpfs_for_scratch,
        alt_scratch=alt_scratch,
        worker_memory_gb=worker_memory_gb,
    )

    if update_config_file is not None:
        if not Path(update_config_file).exists():
            print(f"'update_config_file={update_config_file} does not exist",
                  file=sys.stderr)
            sys.exit(1)
        config = load_data(update_config_file)
        for job in config["jobs"]:
            job["spark_config"] = spark_config.dict()
        dump_data(config, update_config_file, indent=2)
        print(
            f"Updated jobs in {update_config_file} with this Spark configuration."
        )
    else:
        print(
            "\nAdd and customize this JSON object to the 'spark_config' field for each Spark "
            "job in your config.json file:\n")
        print(spark_config.json(indent=2))
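Worked arithmetic for the custom per-executor memory branch above, using an illustrative HPC mem value of "180G" and made-up overhead settings:

per_node_mem_gb = int("180G"[:-1])   # 180
overhead = 10 - 5                    # master minus worker overhead (illustrative)
executor_mem_gb = (180 - 5) // 7     # 7 executors -> 25, printed as "25G"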