Example #1
0
    def test_job_submitter(self):
        """Round-trip a config through a file and verify JobSubmitter loads it."""
        config = get_config()
        config.dump(CONFIG_FILE)

        try:
            submitter = JobSubmitter(CONFIG_FILE)
            # A freshly loaded submitter must not report any completed results.
            assert not submitter.get_completed_results()
            # The job count must survive the dump/load round trip.
            assert submitter.get_num_jobs() == config.get_num_jobs()
        finally:
            # Always remove the temporary config file, even on failure.
            os.remove(CONFIG_FILE)
Example #2
0
def submit_jobs(config_file, per_node_batch_size, hpc_config, local, max_nodes,
                output, poll_interval, num_processes, rotate_logs, verbose,
                restart_failed, restart_missing, reports,
                try_add_blocked_jobs):
    """Submits jobs for execution, locally or on HPC."""
    os.makedirs(output, exist_ok=True)

    previous_results = []

    if restart_failed:
        # Rebuild the config from only the failed jobs; the successful jobs
        # from the previous run are carried forward as prior results.
        retry_config = create_config_from_previous_run(
            config_file, output, result_type='failed')
        previous_results = ResultsSummary(output).get_successful_results()
        config_file = "failed_job_inputs.json"
        retry_config.dump(config_file)

    if restart_missing:
        # Rebuild the config from jobs that never produced a result; all
        # existing results are carried forward.
        retry_config = create_config_from_previous_run(
            config_file, output, result_type='missing')
        config_file = "missing_job_inputs.json"
        retry_config.dump(config_file)
        previous_results = ResultsSummary(output).list_results()

    if rotate_logs:
        rotate_filenames(output, ".log")

    log_level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  os.path.join(output, "submit_jobs.log"),
                  file_level=log_level,
                  console_level=log_level)
    logger.info(get_cli_string())

    # Events get their own log file; the ERROR console level effectively
    # disables console output for events.
    setup_logging("event",
                  os.path.join(output, "submit_jobs_events.log"),
                  console_level=logging.ERROR,
                  file_level=logging.INFO)

    submitter = JobSubmitter(config_file, hpc_config=hpc_config, output=output)
    result = submitter.submit_jobs(
        per_node_batch_size=per_node_batch_size,
        max_nodes=max_nodes,
        force_local=local,
        verbose=verbose,
        num_processes=num_processes,
        poll_interval=poll_interval,
        previous_results=previous_results,
        reports=reports,
        try_add_blocked_jobs=try_add_blocked_jobs,
    )

    sys.exit(result.value)
Example #3
0
    def _submit_next_stage(self, stage_num, return_code=None):
        """Record the previous stage's result and submit the next pipeline stage.

        Parameters
        ----------
        stage_num : int
            1-based number of the stage to start. On the initial call
            (``return_code is None``) it must be 1; otherwise it must be
            exactly one past the currently recorded stage.
        return_code : int, optional
            Return code of the stage that just finished; ``None`` only on
            the initial submission.

        Raises
        ------
        InvalidParameter
            If ``stage_num`` is not the expected next stage.
        ExecutionError
            If submitting the stage's jobs returns nonzero.
        """
        if return_code is None:
            # Initial submission: only stage 1 is valid here.
            assert stage_num == 1, str(stage_num)
        else:
            if stage_num != self.stage_num + 1:
                raise InvalidParameter(
                    f"expected stage_num {self.stage_num + 1}, received {stage_num}"
                )

            # Stages are 1-based, so the stage that just finished lives at
            # index stage_num - 2.
            self._config.stages[stage_num - 2].return_code = return_code
            self._config.stage_num += 1

        # One past len(stages) means every stage has completed.
        if self._config.stage_num == len(self._config.stages) + 1:
            logger.info("Pipeline is complete")
            self._config.is_complete = True
            self._serialize()
            return

        logger.info("Start execution pipeline stage %s/%s", stage_num,
                    len(self._config.stages))

        # Persist the updated pipeline state before launching the stage.
        self._serialize()
        stage = self._config.stages[self.stage_num - 1]
        # NOTE(review): presumably self.stage_num mirrors
        # self._config.stage_num — confirm in the class definition.
        os.environ["JADE_PIPELINE_STAGE_ID"] = str(self.stage_num)
        self._run_auto_config(stage)
        output = self.get_stage_output_path(self.path, self.stage_num)
        ret = JobSubmitter.run_submit_jobs(
            stage.config_file,
            output,
            stage.submitter_params,
            pipeline_stage_num=self.stage_num,
        )
        if ret != 0:
            raise ExecutionError(f"stage {self.stage_num} failed")
Example #4
0
def cancel_jobs(output, verbose):
    """Cancels jobs."""
    log_level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  os.path.join(output, "cancel_jobs.log"),
                  file_level=log_level,
                  console_level=log_level,
                  mode="a")
    logger.info(get_cli_string())

    # Retry for up to ~60 seconds to become the submitter for this cluster.
    for _attempt in range(60):
        cluster, promoted = Cluster.deserialize(
            output,
            try_promote_to_submitter=True,
            deserialize_jobs=True,
        )
        if not promoted:
            logger.info("Did not get promoted. Sleep.")
            time.sleep(1)
            continue
        if cluster.is_complete():
            # Nothing left to cancel; release the submitter role.
            cluster.demote_from_submitter()
            logger.info("All jobs are already finished.")
            sys.exit(0)
        JobSubmitter.load(output).cancel_jobs(cluster)
        sys.exit(0)

    logger.error("Failed to get promoted to submitter.")
    sys.exit(1)
Example #5
0
def test_jobs_submitter__find_error_log_messages(example_output):
    """Verify the expected error events are extracted from the example logs."""
    events = list(JobSubmitter.find_error_log_messages(example_output))
    assert len(events) == 4

    # (error, line_number) for each event, in order.
    expected = (
        ("Traceback", 2),
        ("DUE TO TIME LIMIT", 43),
        ("slurmstepd", 44),
        ("srun", 45),
    )
    for event, (error, line_number) in zip(events, expected):
        assert event.data["error"] == error
        assert event.data["line_number"] == line_number

    # Only the first event's file/text details are pinned.
    assert events[0].data["filename"] == f"{example_output}/job_output_2741821.e"
    assert events[0].data["text"] == "Traceback (most recent call last):"
Example #6
0
File: resubmit_jobs.py  Project: jgu2/jade
def resubmit_jobs(output, failed, missing, verbose):
    """Resubmit failed and missing jobs."""
    setup_event_logging(os.path.join(output, "submit_jobs_events.log"), mode="a")
    log_level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  os.path.join(output, "submit_jobs.log"),
                  file_level=log_level,
                  console_level=log_level,
                  mode="a")

    cluster, promoted = Cluster.deserialize(
        output,
        try_promote_to_submitter=True,
        deserialize_jobs=True,
    )
    if not cluster.is_complete():
        # Resubmission is only valid once the prior run has fully finished.
        cluster.demote_from_submitter()
        print(
            "resubmit-jobs requires that the existing submission be complete",
            file=sys.stderr)
        sys.exit(1)
    assert promoted

    jobs_to_resubmit = _get_jobs_to_resubmit(cluster, output, failed, missing)
    blocking_jobs_by_name = _update_with_blocking_jobs(
        jobs_to_resubmit, output)
    _reset_results(output, jobs_to_resubmit)
    cluster.prepare_for_resubmission(jobs_to_resubmit,
                                     blocking_jobs_by_name)

    exit_code = 1
    try:
        submitter = JobSubmitter.load(output)
        status = submitter.submit_jobs(cluster)
        if status == Status.IN_PROGRESS:
            print(f"Resubmitted {len(jobs_to_resubmit)} jobs in {output}")
            exit_code = 0
        else:
            exit_code = status.value
    except Exception:
        logger.exception("Failed to resubmit jobs")
        raise
    finally:
        # Always release the submitter role, even on failure.
        cluster.demote_from_submitter()

    sys.exit(exit_code)
Example #7
0
File: resubmit_jobs.py  Project: NREL/jade
def resubmit_jobs(output, failed, missing, successful, submission_groups_file, verbose):
    """Resubmit jobs.

    Parameters
    ----------
    output : str
        Output directory of the previous submission.
    failed : bool
        Resubmit failed jobs.
    missing : bool
        Resubmit jobs that never produced a result.
    successful : bool
        Resubmit successful jobs.
    submission_groups_file : str or None
        Optional file overriding the original submission groups; must
        contain the same number of groups with matching names.
    verbose : bool
        Enable DEBUG-level logging.

    Exits the process: 0 on successful resubmission, nonzero otherwise.
    """
    # Event logging is configured before general logging (same ordering used
    # elsewhere in this project's CLI commands).
    event_file = os.path.join(output, "submit_jobs_events.log")
    setup_event_logging(event_file, mode="a")
    filename = os.path.join(output, "submit_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="a")

    cluster, promoted = Cluster.deserialize(
        output,
        try_promote_to_submitter=True,
        deserialize_jobs=True,
    )
    if not cluster.is_complete():
        # Resubmission is only valid once the prior run has fully finished.
        cluster.demote_from_submitter()
        print("resubmit-jobs requires that the existing submission be complete", file=sys.stderr)
        sys.exit(1)
    # NOTE(review): promoted is asserted only after the completeness check;
    # if promotion failed on a complete cluster this raises AssertionError —
    # confirm that is the intended failure mode.
    assert promoted

    if submission_groups_file is not None:
        # Replace submitter parameters group-by-group; the override file must
        # match the original groups exactly in count and names.
        groups = load_data(submission_groups_file)
        cur = len(groups)
        orig = len(cluster.config.submission_groups)
        if cur != orig:
            print(
                f"Length of submission_groups ({cur}) must be identical to the original ({orig})",
                file=sys.stderr,
            )
            cluster.demote_from_submitter()
            sys.exit(1)

        for _group in groups:
            group = SubmissionGroup(**_group)
            found = False
            for i, orig_group in enumerate(cluster.config.submission_groups):
                if group.name == orig_group.name:
                    cluster.config.submission_groups[i] = group
                    found = True
                    break
            if not found:
                print(
                    f"submission group {group.name} does not exist in the original",
                    file=sys.stderr,
                )
                cluster.demote_from_submitter()
                sys.exit(1)
        logger.info("Updated submitter parameters from %s", submission_groups_file)

    jobs_to_resubmit = _get_jobs_to_resubmit(cluster, output, failed, missing, successful)
    updated_blocking_jobs_by_name = _update_with_blocking_jobs(jobs_to_resubmit, output)
    # Clear prior results for the selected jobs before resubmitting.
    _reset_results(output, jobs_to_resubmit)
    cluster.prepare_for_resubmission(jobs_to_resubmit, updated_blocking_jobs_by_name)

    ret = 1
    try:
        mgr = JobSubmitter.load(output)
        status = mgr.submit_jobs(cluster)
        if status == Status.IN_PROGRESS:
            print(f"Resubmitted {len(jobs_to_resubmit)} jobs in {output}")
            ret = 0
        else:
            ret = status.value
    except Exception:
        logger.exception("Failed to resubmit jobs")
        raise
    finally:
        # Always release the submitter role, even on failure.
        cluster.demote_from_submitter()

    sys.exit(ret)
Example #8
0
def submit_jobs(
    config_file=None,
    per_node_batch_size=None,
    dry_run=None,
    force=None,
    hpc_config=None,
    local=None,
    max_nodes=None,
    output=None,
    poll_interval=None,
    resource_monitor_interval=None,
    resource_monitor_type=None,
    num_processes=None,
    verbose=None,
    reports=None,
    enable_singularity=None,
    container=None,
    try_add_blocked_jobs=None,
    time_based_batching=None,
    node_setup_script=None,
    node_shutdown_script=None,
    submitter_params=None,
    no_distributed_submitter=None,
):
    """Submits jobs for execution, locally or on HPC.

    If ``submitter_params`` is given, parameters are loaded from that file
    and the individual tuning arguments are ignored (``verbose`` is still
    used for the log level); otherwise the individual arguments are
    forwarded to ``make_submitter_params``. If ``output`` exists it is
    deleted when ``force`` is set, else the command aborts. Exits the
    process with the submitter's return code.
    """
    if os.path.exists(output):
        if force:
            shutil.rmtree(output)
        else:
            print(
                f"{output} already exists. Delete it or use '--force' to overwrite.",
                file=sys.stderr,
            )
            sys.exit(1)

    if submitter_params is not None:
        # A params file overrides all individually-passed tuning arguments.
        params = SubmitterParams(**load_data(submitter_params))
    else:
        params = make_submitter_params(
            per_node_batch_size=per_node_batch_size,
            dry_run=dry_run,
            hpc_config=hpc_config,
            local=local,
            max_nodes=max_nodes,
            poll_interval=poll_interval,
            resource_monitor_interval=resource_monitor_interval,
            resource_monitor_type=resource_monitor_type,
            num_processes=num_processes,
            verbose=verbose,
            reports=reports,
            enable_singularity=enable_singularity,
            container=container,
            try_add_blocked_jobs=try_add_blocked_jobs,
            time_based_batching=time_based_batching,
            node_setup_script=node_setup_script,
            node_shutdown_script=node_shutdown_script,
            no_distributed_submitter=no_distributed_submitter,
        )

    # Time-based batching needs an explicit process count to size batches.
    if params.time_based_batching and params.num_processes is None:
        print("Error: num_processes must be set with time-based batching",
              file=sys.stderr)
        sys.exit(1)

    os.makedirs(output)
    filename = os.path.join(output, "submit_jobs.log")
    event_filename = os.path.join(output, "submit_jobs_events.log")
    level = logging.DEBUG if verbose else logging.INFO
    # For some reason event logging must be setup before general logging.
    # Otherwise, the first event doesn't show up in the log.
    setup_event_logging(event_filename)
    # NOTE(review): this local ``logger`` shadows any module-level logger for
    # the rest of this function — confirm that is intentional.
    logger = setup_logging(__name__,
                           filename,
                           file_level=level,
                           console_level=level,
                           mode="w")
    logger.info(get_cli_string())

    try:
        ret = JobSubmitter.run_submit_jobs(config_file, output, params)
        sys.exit(ret)
    except Exception:
        # Log the full traceback before propagating to the CLI runner.
        logger.exception("Failed to run submit_jobs")
        raise
Example #9
0
def test_jobs_submitter__generate_reports(example_output, cleanup):
    """Generating reports succeeds and produces the three expected files."""
    assert JobSubmitter.generate_reports(example_output, True) == 0
    expected_files = ("errors.txt", "results.txt", "stats.txt")
    for name in expected_files:
        assert os.path.exists(os.path.join(example_output, name))