Example #1
def _run_manager(job_name, output_dir, verbose, manager_script_and_args):
    filename = os.path.join(output_dir,
                            f"run_multi_node_job_manager__{job_name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="w")
    logger.info("Run manager on %s: %s", socket.gethostname(),
                get_cli_string())

    # Note that the manager receives its own hostname.
    output = {}
    check_run_command(f"jade cluster hostnames {output_dir}", output)
    hostnames = [x for x in output["stdout"].split() if x != ""]
    logger.info("Manager found %s hostnames: %s", len(hostnames), hostnames)
    cmd = " ".join(manager_script_and_args)
    logger.info("Run manager script [%s]", cmd)

    os.environ["JADE_OUTPUT_DIR"] = output_dir
    os.environ["JADE_COMPUTE_NODE_NAMES"] = " ".join(hostnames)
    start = time.time()
    ret = run_command(cmd)
    logger.info("Finished job. duration = %s seconds", time.time() - start)
    return ret
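
The two environment variables exported above (JADE_OUTPUT_DIR and JADE_COMPUTE_NODE_NAMES) are the interface between this manager and the user's script. A minimal sketch of how a manager script might read them back, assuming only the variable names shown in the example (the helper name is hypothetical):

# Illustrative only: consumes the environment variables exported by _run_manager above.
import os

def read_jade_environment():
    # Output directory created by the JADE submitter for this run.
    output_dir = os.environ["JADE_OUTPUT_DIR"]
    # Space-separated compute node hostnames, as written by the manager.
    hostnames = os.environ["JADE_COMPUTE_NODE_NAMES"].split()
    return output_dir, hostnames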
Example #2
File: pipeline.py, Project: jgu2/jade
def submit(config_file, output, force, verbose=False):
    """Submit the pipeline for execution."""
    if os.path.exists(output):
        if force:
            shutil.rmtree(output)
        else:
            print(
                f"{output} already exists. Delete it or use '--force' to overwrite.",
                file=sys.stderr,
            )
            sys.exit(1)
    os.makedirs(output, exist_ok=True)

    filename = os.path.join(output, "pipeline_submit.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level)
    logger.info(get_cli_string())

    mgr = PipelineManager.create(config_file, output)
    try:
        mgr.submit_next_stage(1)
    except Exception:
        logger.exception("Pipeline execution failed")
        raise

    logging.shutdown()
    sys.exit(0)
Example #3
def cancel_jobs(output, verbose):
    """Cancels jobs."""
    filename = os.path.join(output, "cancel_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="a")
    logger.info(get_cli_string())

    for _ in range(60):
        cluster, promoted = Cluster.deserialize(
            output,
            try_promote_to_submitter=True,
            deserialize_jobs=True,
        )
        if not promoted:
            logger.info("Did not get promoted. Sleep.")
            time.sleep(1)
            continue
        if cluster.is_complete():
            cluster.demote_from_submitter()
            logger.info("All jobs are already finished.")
            sys.exit(0)
        submitter = JobSubmitter.load(output)
        submitter.cancel_jobs(cluster)
        sys.exit(0)

    logger.error("Failed to get promoted to submitter.")
    sys.exit(1)
Example #4
def submit_jobs(config_file, per_node_batch_size, hpc_config, local, max_nodes,
                output, poll_interval, num_processes, rotate_logs, verbose,
                restart_failed, restart_missing, reports,
                try_add_blocked_jobs):
    """Submits jobs for execution, locally or on HPC."""
    os.makedirs(output, exist_ok=True)

    previous_results = []

    if restart_failed:
        failed_job_config = create_config_from_previous_run(
            config_file, output, result_type='failed')
        previous_results = ResultsSummary(output).get_successful_results()
        config_file = "failed_job_inputs.json"
        failed_job_config.dump(config_file)

    if restart_missing:
        missing_job_config = create_config_from_previous_run(
            config_file, output, result_type='missing')
        config_file = "missing_job_inputs.json"
        missing_job_config.dump(config_file)
        previous_results = ResultsSummary(output).list_results()

    if rotate_logs:
        rotate_filenames(output, ".log")

    filename = os.path.join(output, "submit_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level)
    logger.info(get_cli_string())

    event_file = os.path.join(output, "submit_jobs_events.log")
    # This effectively means no console logging.
    setup_logging("event",
                  event_file,
                  console_level=logging.ERROR,
                  file_level=logging.INFO)

    mgr = JobSubmitter(config_file, hpc_config=hpc_config, output=output)
    ret = mgr.submit_jobs(
        per_node_batch_size=per_node_batch_size,
        max_nodes=max_nodes,
        force_local=local,
        verbose=verbose,
        num_processes=num_processes,
        poll_interval=poll_interval,
        previous_results=previous_results,
        reports=reports,
        try_add_blocked_jobs=try_add_blocked_jobs,
    )

    sys.exit(ret.value)
Example #5
def run_jobs(config_file, distributed_submitter, output, num_processes,
             verbose):
    """Starts jobs on HPC."""
    match = re.search(r"batch_(\d+)\.json", config_file)
    assert match
    batch_id = match.group(1)
    os.makedirs(output, exist_ok=True)

    mgr = JobRunner(config_file, output=output, batch_id=batch_id)

    # Logging has to be enabled after the JobRunner is created because the node ID
    # is what makes the log filename unique.
    filename = os.path.join(output,
                            f"run_jobs_batch_{batch_id}_{mgr.node_id}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_event_logging(mgr.event_filename)
    logger = setup_logging(__name__,
                           filename,
                           file_level=level,
                           console_level=level,
                           mode="w")
    logger.info(get_cli_string())

    group = mgr.config.get_default_submission_group()
    if group.submitter_params.node_setup_script:
        cmd = f"{group.submitter_params.node_setup_script} {config_file} {output}"
        ret = run_command(cmd)
        if ret != 0:
            logger.error("Failed to run node setup script %s: %s", cmd, ret)
            sys.exit(ret)

    status = mgr.run_jobs(distributed_submitter=distributed_submitter,
                          verbose=verbose,
                          num_processes=num_processes)
    ret = status.value

    if group.submitter_params.node_shutdown_script:
        cmd = f"{group.submitter_params.node_shutdown_script} {config_file} {output}"
        ret2 = run_command(cmd)
        if ret2 != 0:
            logger.error("Failed to run node shutdown script %s: %s", cmd,
                         ret2)

    if status == Status.GOOD and distributed_submitter:
        start = time.time()
        _try_submit_jobs(output, verbose=verbose)
        logger.info("try-submit-jobs took %s seconds", time.time() - start)

    sys.exit(ret)
Example #6
def run_worker(job_name, output_dir, verbose, poll_interval=60):
    """Run a worker instance."""
    hostname = socket.gethostname()
    filename = os.path.join(output_dir, f"run_multi_node_job_worker__{job_name}__{hostname}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="w")
    logger.info("Run worker: %s", get_cli_string())

    shutdown_file = _get_shutdown_file(job_name, output_dir)
    while not shutdown_file.exists():
        logger.debug("sleep for %s seconds", poll_interval)
        time.sleep(poll_interval)

    logger.info("Detected shutdown.")
    return 0
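
The worker exits its polling loop when a shutdown file appears in the output directory. The project's _get_shutdown_file helper is not shown in this example; a minimal sketch of such a helper, with an assumed per-job file-name convention, is:

from pathlib import Path

def _get_shutdown_file(job_name, output_dir):
    # Assumed convention: a per-job marker file inside the output directory.
    # The real helper may use a different file name or location.
    return Path(output_dir) / f"shutdown__{job_name}"

The manager would create this file once the job finishes, and each worker's while-loop above would then fall through and return.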
Example #7
def run_worker(job, manager_node, output_dir, verbose, poll_interval=60):
    """Run a worker instance."""
    logger.error("in worker manager_node=%s job=%s", manager_node, job.name)
    hostname = socket.gethostname()
    filename = os.path.join(
        output_dir, f"run_spark_job_worker__{hostname}__{job.name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="w")
    logger.info("Run worker: %s", get_cli_string())

    # Ignore errors. Spark may not be running.
    run_command(job.model.spark_config.get_stop_worker())

    # Give the master a head start.
    time.sleep(10)
    job_output = Path(output_dir) / JOBS_OUTPUT_DIR / job.name
    logs_dir = job_output / "spark" / "logs"
    job_conf_dir = job_output / "spark" / "conf"
    workers_dir = job_output / "spark" / "workers"
    _set_env_variables(job, job_conf_dir, logs_dir)
    worker_memory = _get_worker_memory_str(job, is_master=False)
    cmd = _get_worker_command(job, manager_node, worker_memory)
    ret = 1
    output = {}
    for _ in range(5):
        output.clear()
        logger.info("Run spark worker: [%s]", cmd)
        ret = run_command(cmd, output=output)
        if ret == 0:
            break
    if ret != 0:
        logger.error("Failed to start spark worker: %s: %s", ret, output)

    shutdown_file = _get_shutdown_file(job.name, output_dir)
    while not shutdown_file.exists():
        logger.debug("sleep for %s seconds", poll_interval)
        time.sleep(poll_interval)

    logger.info("Detected shutdown.")
    check_run_command(job.model.spark_config.get_stop_worker())
    if job.model.spark_config.collect_worker_logs:
        shutil.copytree(Path(os.environ["SPARK_WORKER_DIR"]),
                        workers_dir / hostname)
    return 0
Example #8
def run_jobs(config_file, output, num_processes, verbose):
    """Starts jobs on HPC."""
    match = re.search(r"batch_(\d+)\.json", config_file)
    assert match
    batch_id = match.group(1)
    os.makedirs(output, exist_ok=True)
    filename = os.path.join(output, f"run_jobs_batch_{batch_id}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=logging.ERROR)
    logger.info(get_cli_string())

    mgr = JobRunner(config_file, output=output, batch_id=batch_id)
    ret = mgr.run_jobs(verbose=verbose, num_processes=num_processes)
    sys.exit(ret.value)
Example #9
def submit(config_file, output, verbose=False):
    """Submit the pipeline for execution."""
    global logger
    os.makedirs(output, exist_ok=True)
    filename = os.path.join(output, "pipeline_submit.log")
    level = logging.DEBUG if verbose else logging.INFO
    logger = setup_logging(__name__, filename, file_level=level,
                           console_level=level)

    logger.info(get_cli_string())

    mgr = PipelineManager(config_file, output)
    try:
        mgr.submit(verbose=verbose)
    except Exception:
        logger.exception("Pipeline execution failed")
        raise

    sys.exit(0)
Example #10
File: pipeline.py, Project: jgu2/jade
def submit_next_stage(output, stage_num, return_code, verbose=False):
    """Internal command to submit the next stage of the pipeline for execution."""
    filename = os.path.join(output, "pipeline_submit.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="a")
    logger.info(get_cli_string())

    mgr = PipelineManager.load(output)
    try:
        mgr.submit_next_stage(stage_num, return_code=return_code)
    except Exception:
        logger.exception("Pipeline execution failed")
        raise

    logging.shutdown()
    sys.exit(0)
Example #11
def _run_cluster_master(job, manager_node, output_dir, verbose,
                        manager_script_and_args):
    filename = os.path.join(output_dir, f"run_spark_cluster__{job.name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="w")
    logger.info("Run cluster master on %s job=%s: %s", socket.gethostname(),
                job.name, get_cli_string())

    job_output = Path(output_dir) / JOBS_OUTPUT_DIR / job.name
    if job_output.exists():
        shutil.rmtree(job_output)
    job_output.mkdir(parents=True)
    events_dir = job_output / "spark" / "events"
    events_dir.mkdir(parents=True)
    logs_dir = job_output / "spark" / "logs"
    logs_dir.mkdir()
    workers_dir = job_output / "spark" / "workers"
    workers_dir.mkdir()

    # Make a job-specific conf directory because the log and event files need to be per-job.
    job_conf_dir = job_output / "spark" / "conf"
    shutil.copytree(
        Path(job.model.spark_config.conf_dir) / "conf", job_conf_dir)
    _fix_spark_conf_file(job_conf_dir, events_dir)
    _set_env_variables(job, job_conf_dir, logs_dir)

    # Ignore errors. Spark may not be running.
    run_command(job.model.spark_config.get_stop_worker())
    run_command(job.model.spark_config.get_stop_history_server())
    run_command(job.model.spark_config.get_stop_master())

    # It would be better to start all workers from the master. Doing so would require that
    # Spark processes on the master node be able to ssh into the worker nodes.
    # I haven't spent the time to figure out how to do that inside Singularity containers.
    master_cmd = job.model.spark_config.get_start_master()
    logger.info("Run spark master: [%s]", master_cmd)
    check_run_command(master_cmd)
    history_cmd = job.model.spark_config.get_start_history_server()
    logger.info("Run spark history server: [%s]", history_cmd)
    check_run_command(history_cmd)
    worker_memory = _get_worker_memory_str(job, is_master=True)

    worker_cmd = _get_worker_command(job, manager_node, memory=worker_memory)
    logger.info("Run spark worker: [%s]", worker_cmd)
    check_run_command(worker_cmd)

    # Wait for workers.
    # TODO: find a way to check readiness programmatically with the REST API or by parsing the logs.
    time.sleep(15)
    args = list(manager_script_and_args) + [
        _get_cluster(manager_node),
        str(job_output)
    ]
    if job.model.spark_config.run_user_script_inside_container:
        user_cmd = str(job.model.spark_config.get_run_user_script()
                       ) + " " + " ".join(args)
    else:
        user_cmd = " ".join(args)
    logger.info("Run user script [%s]", user_cmd)

    start = time.time()
    ret = run_command(user_cmd)
    logger.info("Finished job. duration = %s seconds", time.time() - start)

    # Delay to ensure the history is saved.
    time.sleep(10)
    metrics = SparkMetrics("localhost", history=True)
    try:
        metrics.generate_metrics(job_output / "spark_metrics")
    except Exception:
        logger.exception("Failed to generate metrics")

    check_run_command(job.model.spark_config.get_stop_worker())
    check_run_command(job.model.spark_config.get_stop_history_server())
    check_run_command(job.model.spark_config.get_stop_master())
    if job.model.spark_config.collect_worker_logs:
        shutil.copytree(Path(os.environ["SPARK_WORKER_DIR"]),
                        workers_dir / socket.gethostname())
    return ret
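
_get_cluster is not shown in this example. Since its return value is passed to the user script alongside the job output directory, it presumably builds the Spark master URL from the manager's hostname. A hypothetical sketch, assuming Spark's default standalone master port:

def _get_cluster(manager_node):
    # 7077 is Spark's default standalone master port; the real helper may
    # read the port from the job's spark_config instead.
    return f"spark://{manager_node}:7077"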
Example #12
def submit_jobs(
    config_file=None,
    per_node_batch_size=None,
    dry_run=None,
    force=None,
    hpc_config=None,
    local=None,
    max_nodes=None,
    output=None,
    poll_interval=None,
    resource_monitor_interval=None,
    resource_monitor_type=None,
    num_processes=None,
    verbose=None,
    reports=None,
    enable_singularity=None,
    container=None,
    try_add_blocked_jobs=None,
    time_based_batching=None,
    node_setup_script=None,
    node_shutdown_script=None,
    submitter_params=None,
    no_distributed_submitter=None,
):
    """Submits jobs for execution, locally or on HPC."""
    if os.path.exists(output):
        if force:
            shutil.rmtree(output)
        else:
            print(
                f"{output} already exists. Delete it or use '--force' to overwrite.",
                file=sys.stderr,
            )
            sys.exit(1)

    if submitter_params is not None:
        params = SubmitterParams(**load_data(submitter_params))
    else:
        params = make_submitter_params(
            per_node_batch_size=per_node_batch_size,
            dry_run=dry_run,
            hpc_config=hpc_config,
            local=local,
            max_nodes=max_nodes,
            poll_interval=poll_interval,
            resource_monitor_interval=resource_monitor_interval,
            resource_monitor_type=resource_monitor_type,
            num_processes=num_processes,
            verbose=verbose,
            reports=reports,
            enable_singularity=enable_singularity,
            container=container,
            try_add_blocked_jobs=try_add_blocked_jobs,
            time_based_batching=time_based_batching,
            node_setup_script=node_setup_script,
            node_shutdown_script=node_shutdown_script,
            no_distributed_submitter=no_distributed_submitter,
        )

    if params.time_based_batching and params.num_processes is None:
        print("Error: num_processes must be set with time-based batching",
              file=sys.stderr)
        sys.exit(1)

    os.makedirs(output)
    filename = os.path.join(output, "submit_jobs.log")
    event_filename = os.path.join(output, "submit_jobs_events.log")
    level = logging.DEBUG if verbose else logging.INFO
    # For some reason event logging must be set up before general logging.
    # Otherwise, the first event doesn't show up in the log.
    setup_event_logging(event_filename)
    logger = setup_logging(__name__,
                           filename,
                           file_level=level,
                           console_level=level,
                           mode="w")
    logger.info(get_cli_string())

    try:
        ret = JobSubmitter.run_submit_jobs(config_file, output, params)
        sys.exit(ret)
    except Exception:
        logger.exception("Failed to run submit_jobs")
        raise
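
When submitter_params is supplied, the file is deserialized with load_data and passed directly to SubmitterParams. A hypothetical parameters file, using field names taken from the keyword arguments of make_submitter_params above (the exact SubmitterParams schema is an assumption):

import json

# Hypothetical parameter values; the field names mirror the make_submitter_params
# keyword arguments in the example, not a verified SubmitterParams schema.
params = {
    "per_node_batch_size": 100,
    "max_nodes": 4,
    "poll_interval": 60,
    "try_add_blocked_jobs": True,
}
with open("submitter_params.json", "w") as f:
    json.dump(params, f, indent=2)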
Example #13
def run(extension, **kwargs):
    """Runs individual job."""
    registry = Registry()
    if not registry.is_registered(extension):
        raise InvalidExtension(f"Extension '{extension}' is not registered.")

    # Parse arguments
    config_file = kwargs["config_file"]
    name = kwargs["name"]
    output = kwargs["output"]
    output_format = kwargs["output_format"]
    verbose = kwargs["verbose"]
    level = logging.DEBUG if verbose else logging.INFO

    # Create directory for current job
    job_dir = os.path.join(output, name)
    os.makedirs(job_dir, exist_ok=True)
    # Structured event logging setup
    event_file = os.path.join(job_dir, "events.log")
    setup_event_logging(event_file)

    # General logging setup
    log_file = os.path.join(job_dir, "run.log")
    general_logger = setup_logging(
        extension,
        log_file,
        console_level=logging.ERROR,
        file_level=level,
    )
    general_logger.info(get_cli_string())

    # Run the job through the extension's CLI class
    try:
        cli = registry.get_extension_class(extension, ExtensionClassType.CLI)
        ret = cli.run(config_file, name, output, output_format, verbose)
    except Exception as err:
        msg = f"unexpected exception in run '{extension}' job={name} - {err}"
        general_logger.exception(msg)
        event = StructuredErrorLogEvent(
            source=name,
            category=EVENT_CATEGORY_ERROR,
            name=EVENT_NAME_UNHANDLED_ERROR,
            message=msg,
        )
        log_event(event)
        ret = 1

    if ret == 0:
        try:
            config = load_data(config_file)
            if "job_post_process_config" in config.keys():
                post_process = JobPostProcess(
                    module_name=config["job_post_process_config"]["module"],
                    class_name=config["job_post_process_config"]["class"],
                    data=config["job_post_process_config"]["data"],
                    job_name=name,
                    output=output,
                )
                post_process.run(config_file=config_file, output=output)
        except Exception as err:
            msg = f"unexpected exception in post-process '{extension}' job={name} - {err}"
            general_logger.exception(msg)
            event = StructuredErrorLogEvent(
                source=name,
                category=EVENT_CATEGORY_ERROR,
                name=EVENT_NAME_UNHANDLED_ERROR,
                message=msg,
            )
            log_event(event)
            ret = 1

    sys.exit(ret)
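
The post-processing step is driven by an optional job_post_process_config section in the job configuration. Based on the keys read above (module, class, data), a configuration fragment might look like the following; the concrete module, class, and data values are illustrative placeholders, not taken from the project:

# Illustrative fragment only; module/class/data values are placeholders.
config = {
    "job_post_process_config": {
        "module": "my_package.post_process",
        "class": "MyPostProcess",
        "data": {"threshold": 0.9},
    },
}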