Example #1
def collect(duration, force, interval, output):
    """Collect resource utilization stats."""
    if os.path.exists(output):
        if force:
            shutil.rmtree(output)
        else:
            print(
                f"The directory {output} already exists. Delete it or run with --force",
                file=sys.stderr,
            )
            sys.exit(1)

    os.makedirs(output)
    event_file = os.path.join(output, "stats_events.log")
    setup_event_logging(event_file)
    monitor = ResourceMonitorLogger("ResourceMonitor")
    start_time = time.time()

    show_cmd = f"jade stats show -o {output} [STATS]"
    print(f"Collecting stats. When complete run '{show_cmd}' to view stats.")
    try:
        while True:
            monitor.log_resource_stats()
            time.sleep(interval)
            if duration is not None and time.time() - start_time > duration:
                print(f"Exceeded {duration} seconds. Exiting.")
                EventsSummary(output)
                break
    except KeyboardInterrupt:
        # TODO: This doesn't actually work. click catches KeyboardInterrupt.
        # Need to prevent it from doing that.
        # Then always call EventsSummary(output) at the end.
        pass
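
# A minimal sketch of one way to address the TODO above (hedged: the "cli"
# group below is hypothetical, not this project's real entry point). In
# standalone mode, click converts KeyboardInterrupt into click.Abort and
# exits before the except block above can run. Invoking main() with
# standalone_mode=False lets the Abort propagate to the caller, which can
# then always generate the summary.
import click


@click.group()
def cli():
    """Hypothetical top-level command group, for illustration only."""


def main():
    try:
        cli.main(standalone_mode=False)
    except (KeyboardInterrupt, click.exceptions.Abort):
        # Ctrl-C reached us instead of being swallowed by click; a real
        # entry point could call EventsSummary(output) here before exiting.
        pass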
Example #2
def run_jobs(config_file, distributed_submitter, output, num_processes,
             verbose):
    """Starts jobs on HPC."""
    match = re.search(r"batch_(\d+)\.json", config_file)
    assert match
    batch_id = match.group(1)
    os.makedirs(output, exist_ok=True)

    mgr = JobRunner(config_file, output=output, batch_id=batch_id)

    # Logging has to be enabled after the JobRunner is created because the
    # node ID is what makes the log filename unique.
    filename = os.path.join(output,
                            f"run_jobs_batch_{batch_id}_{mgr.node_id}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_event_logging(mgr.event_filename)
    logger = setup_logging(__name__,
                           filename,
                           file_level=level,
                           console_level=level,
                           mode="w")
    logger.info(get_cli_string())

    group = mgr.config.get_default_submission_group()
    if group.submitter_params.node_setup_script:
        cmd = f"{group.submitter_params.node_setup_script} {config_file} {output}"
        ret = run_command(cmd)
        if ret != 0:
            logger.error("Failed to run node setup script %s: %s", cmd, ret)
            sys.exit(ret)

    status = mgr.run_jobs(distributed_submitter=distributed_submitter,
                          verbose=verbose,
                          num_processes=num_processes)
    ret = status.value

    if group.submitter_params.node_shutdown_script:
        cmd = f"{group.submitter_params.node_shutdown_script} {config_file} {output}"
        ret2 = run_command(cmd)
        if ret2 != 0:
            logger.error("Failed to run node shutdown script %s: %s", cmd,
                         ret2)

    if status == Status.GOOD and distributed_submitter:
        start = time.time()
        _try_submit_jobs(output, verbose=verbose)
        logger.info("try-submit-jobs took %s seconds", time.time() - start)

    sys.exit(ret)
Example #3
def resubmit_jobs(output, failed, missing, verbose):
    """Resubmit failed and missing jobs."""
    event_file = os.path.join(output, "submit_jobs_events.log")
    setup_event_logging(event_file, mode="a")
    filename = os.path.join(output, "submit_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="a")

    cluster, promoted = Cluster.deserialize(
        output,
        try_promote_to_submitter=True,
        deserialize_jobs=True,
    )
    if not cluster.is_complete():
        cluster.demote_from_submitter()
        print(
            "resubmit-jobs requires that the existing submission be complete",
            file=sys.stderr)
        sys.exit(1)
    assert promoted

    jobs_to_resubmit = _get_jobs_to_resubmit(cluster, output, failed, missing)
    updated_blocking_jobs_by_name = _update_with_blocking_jobs(
        jobs_to_resubmit, output)
    _reset_results(output, jobs_to_resubmit)
    cluster.prepare_for_resubmission(jobs_to_resubmit,
                                     updated_blocking_jobs_by_name)

    ret = 1
    try:
        mgr = JobSubmitter.load(output)
        status = mgr.submit_jobs(cluster)
        if status == Status.IN_PROGRESS:
            print(f"Resubmitted {len(jobs_to_resubmit)} jobs in {output}")
            ret = 0
        else:
            ret = status.value
    except Exception:
        logger.exception("Failed to resubmit jobs")
        raise
    finally:
        cluster.demote_from_submitter()

    sys.exit(ret)
Example #4
def test_resource_stats():
    with tempfile.TemporaryDirectory() as tmpdir:
        event_file = os.path.join(tmpdir, "events.log")
        setup_event_logging(event_file)

        resource_monitor = ResourceMonitorLogger("test")
        count = 2
        for _ in range(count):
            resource_monitor.log_resource_stats()

        summary = EventsSummary(tmpdir)
        assert len(summary.list_events(EVENT_NAME_CPU_STATS)) == count
        assert len(summary.list_events(EVENT_NAME_DISK_STATS)) == count
        assert len(summary.list_events(EVENT_NAME_MEMORY_STATS)) == count
        assert len(summary.list_events(EVENT_NAME_NETWORK_STATS)) == count

        viewers = [
            CpuStatsViewer(summary),
            DiskStatsViewer(summary),
            MemoryStatsViewer(summary),
            NetworkStatsViewer(summary),
        ]
        for viewer in viewers:
            df = viewer.get_dataframe("test")
            assert len(df) == count
            if isinstance(viewer, MemoryStatsViewer):
                averages = viewer._calc_batch_averages("test")
                for field, val in averages.items():
                    assert val == df[field].mean()

        output = {}
        cmd = f"jade stats show -o {tmpdir} cpu disk mem net"
        ret = run_command(cmd, output=output)
        assert ret == 0
        for term in ("IOPS", "read_bytes", "bytes_recv", "idle"):
            assert term in output["stdout"]
Example #5
def resubmit_jobs(output, failed, missing, successful, submission_groups_file, verbose):
    """Resubmit jobs."""
    event_file = os.path.join(output, "submit_jobs_events.log")
    setup_event_logging(event_file, mode="a")
    filename = os.path.join(output, "submit_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="a")

    cluster, promoted = Cluster.deserialize(
        output,
        try_promote_to_submitter=True,
        deserialize_jobs=True,
    )
    if not cluster.is_complete():
        cluster.demote_from_submitter()
        print("resubmit-jobs requires that the existing submission be complete", file=sys.stderr)
        sys.exit(1)
    assert promoted

    if submission_groups_file is not None:
        groups = load_data(submission_groups_file)
        cur = len(groups)
        orig = len(cluster.config.submission_groups)
        if cur != orig:
            print(
                f"Length of submission_groups ({cur}) must be identical to the original ({orig})",
                file=sys.stderr,
            )
            cluster.demote_from_submitter()
            sys.exit(1)

        for _group in groups:
            group = SubmissionGroup(**_group)
            found = False
            for i, orig_group in enumerate(cluster.config.submission_groups):
                if group.name == orig_group.name:
                    cluster.config.submission_groups[i] = group
                    found = True
                    break
            if not found:
                print(
                    f"submission group {group.name} does not exist in the original",
                    file=sys.stderr,
                )
                cluster.demote_from_submitter()
                sys.exit(1)
        logger.info("Updated submitter parameters from %s", submission_groups_file)

    jobs_to_resubmit = _get_jobs_to_resubmit(cluster, output, failed, missing, successful)
    updated_blocking_jobs_by_name = _update_with_blocking_jobs(jobs_to_resubmit, output)
    _reset_results(output, jobs_to_resubmit)
    cluster.prepare_for_resubmission(jobs_to_resubmit, updated_blocking_jobs_by_name)

    ret = 1
    try:
        mgr = JobSubmitter.load(output)
        status = mgr.submit_jobs(cluster)
        if status == Status.IN_PROGRESS:
            print(f"Resubmitted {len(jobs_to_resubmit)} jobs in {output}")
            ret = 0
        else:
            ret = status.value
    except Exception:
        logger.exception("Failed to resubmit jobs")
        raise
    finally:
        cluster.demote_from_submitter()

    sys.exit(ret)
Example #6
def submit_jobs(
    config_file=None,
    per_node_batch_size=None,
    dry_run=None,
    force=None,
    hpc_config=None,
    local=None,
    max_nodes=None,
    output=None,
    poll_interval=None,
    resource_monitor_interval=None,
    resource_monitor_type=None,
    num_processes=None,
    verbose=None,
    reports=None,
    enable_singularity=None,
    container=None,
    try_add_blocked_jobs=None,
    time_based_batching=None,
    node_setup_script=None,
    node_shutdown_script=None,
    submitter_params=None,
    no_distributed_submitter=None,
):
    """Submits jobs for execution, locally or on HPC."""
    if os.path.exists(output):
        if force:
            shutil.rmtree(output)
        else:
            print(
                f"{output} already exists. Delete it or use '--force' to overwrite.",
                file=sys.stderr,
            )
            sys.exit(1)

    if submitter_params is not None:
        params = SubmitterParams(**load_data(submitter_params))
    else:
        params = make_submitter_params(
            per_node_batch_size=per_node_batch_size,
            dry_run=dry_run,
            hpc_config=hpc_config,
            local=local,
            max_nodes=max_nodes,
            poll_interval=poll_interval,
            resource_monitor_interval=resource_monitor_interval,
            resource_monitor_type=resource_monitor_type,
            num_processes=num_processes,
            verbose=verbose,
            reports=reports,
            enable_singularity=enable_singularity,
            container=container,
            try_add_blocked_jobs=try_add_blocked_jobs,
            time_based_batching=time_based_batching,
            node_setup_script=node_setup_script,
            node_shutdown_script=node_shutdown_script,
            no_distributed_submitter=no_distributed_submitter,
        )

    if params.time_based_batching and params.num_processes is None:
        print("Error: num_processes must be set with time-based batching",
              file=sys.stderr)
        sys.exit(1)

    os.makedirs(output)
    filename = os.path.join(output, "submit_jobs.log")
    event_filename = os.path.join(output, "submit_jobs_events.log")
    level = logging.DEBUG if verbose else logging.INFO
    # For some reason event logging must be set up before general logging.
    # Otherwise, the first event doesn't show up in the log.
    setup_event_logging(event_filename)
    logger = setup_logging(__name__,
                           filename,
                           file_level=level,
                           console_level=level,
                           mode="w")
    logger.info(get_cli_string())

    try:
        ret = JobSubmitter.run_submit_jobs(config_file, output, params)
        sys.exit(ret)
    except Exception:
        logger.exception("Failed to run submit_jobs")
        raise
Example #7
def run(extension, **kwargs):
    """Runs individual job."""
    registry = Registry()
    if not registry.is_registered(extension):
        raise InvalidExtension(f"Extension '{extension}' is not registered.")

    # Parse arguments
    config_file = kwargs["config_file"]
    name = kwargs["name"]
    output = kwargs["output"]
    output_format = kwargs["output_format"]
    verbose = kwargs["verbose"]
    level = logging.DEBUG if verbose else logging.INFO

    # Create directory for current job
    job_dir = os.path.join(output, name)
    os.makedirs(job_dir, exist_ok=True)
    # Structured logging setup
    event_file = os.path.join(job_dir, "events.log")
    setup_event_logging(event_file)

    # General logging setup
    log_file = os.path.join(job_dir, "run.log")
    general_logger = setup_logging(
        extension,
        log_file,
        console_level=logging.ERROR,
        file_level=level,
    )
    general_logger.info(get_cli_string())

    # Run the job through the extension's CLI
    try:
        cli = registry.get_extension_class(extension, ExtensionClassType.CLI)
        ret = cli.run(config_file, name, output, output_format, verbose)
    except Exception as err:
        msg = f"unexpected exception in run '{extension}' job={name} - {err}"
        general_logger.exception(msg)
        event = StructuredErrorLogEvent(
            source=name,
            category=EVENT_CATEGORY_ERROR,
            name=EVENT_NAME_UNHANDLED_ERROR,
            message=msg,
        )
        log_event(event)
        ret = 1

    if ret == 0:
        try:
            config = load_data(config_file)
            if "job_post_process_config" in config.keys():
                post_process = JobPostProcess(
                    module_name=config["job_post_process_config"]["module"],
                    class_name=config["job_post_process_config"]["class"],
                    data=config["job_post_process_config"]["data"],
                    job_name=name,
                    output=output,
                )
                post_process.run(config_file=config_file, output=output)
        except Exception as err:
            msg = f"unexpected exception in post-process '{extension}' job={name} - {err}"
            general_logger.exception(msg)
            event = StructuredErrorLogEvent(
                source=name,
                category=EVENT_CATEGORY_ERROR,
                name=EVENT_NAME_UNHANDLED_ERROR,
                message=msg,
            )
            log_event(event)
            ret = 1

    sys.exit(ret)