Example #1
0
def test_resubmit_failed(cleanup):
    """Submit jobs, forge one failed result, then verify resubmission recovers it."""
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    aggregator = ResultsAggregator.load(OUTPUT)
    results = aggregator.get_results_unsafe()
    assert results
    assert all(item.return_code == 0 for item in results)
    first = results[0]
    # Rewrite the first result with a non-zero return code to simulate a failure.
    results[0] = Result(first.name, 1, first.status, first.exec_time_s, first.completion_time)
    aggregator._write_results(results)

    # Patch the serialized results file so the summary agrees with the forged failure.
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    data = load_data(results_filename)
    data["results"][0]["return_code"] = 1
    data["results_summary"]["num_failed"] = 1
    data["results_summary"]["num_successful"] -= 1
    dump_data(data, results_filename)

    assert ResultsSummary(OUTPUT).get_failed_results()[0].name == "1"

    assert run_command(f"{RESUBMIT_JOBS} {OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    # After resubmission, every job should have succeeded.
    assert len(ResultsSummary(OUTPUT).get_successful_results()) == NUM_COMMANDS
Example #2
0
    def _complete(self):
        """Finalize the finished subprocess: log output size and record the result.

        Reads the subprocess return code, emits a resource-utilization event
        with the size of the job's output directory, and appends a Result to
        the aggregator file.
        """
        ret = self._pipe.returncode
        exec_time_s = time.time() - self._start_time

        # NOTE(review): removed an unused local (`job_filename`, a copy of the
        # job name with "/", "\\", ":" replaced by "-") — it was computed but
        # never read anywhere in this method.
        status = "finished"
        output_dir = os.path.join(self._output, JOBS_OUTPUT_DIR,
                                  self._job.name)
        bytes_consumed = get_directory_size_bytes(output_dir)
        event = StructuredLogEvent(
            source=self._job.name,
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_BYTES_CONSUMED,
            message="job output directory size",
            bytes_consumed=bytes_consumed,
        )
        log_event(event)
        result = Result(self._job.name, ret, status, exec_time_s)
        ResultsAggregator.append(self._results_filename, result)

        logger.info("Job %s completed return_code=%s exec_time_s=%s",
                    self._job.name, ret, exec_time_s)
Example #3
0
 def cancel(self):
     """Mark this job complete with a failure code and record a CANCELED result."""
     self._is_complete = True
     self._return_code = 1
     canceled = Result(self._job.name, self._return_code,
                       JobCompletionStatus.CANCELED, 0.0)
     ResultsAggregator.append(self._output, canceled, batch_id=self._batch_id)
     logger.info("Canceled job %s", self._job.name)
Example #4
0
    def _submit(self, verbose):
        """Run each pipeline stage in order, persisting status and stopping on failure.

        Raises ExecutionError if any stage's submit command returns non-zero.
        """
        for stage in self._stages:
            # Expose the current stage id to child processes.
            os.environ["JADE_PIPELINE_STAGE_ID"] = str(self._cur_stage_id)
            self._status_info["stages"].append({
                "stage_id": self._cur_stage_id,
                "output_directory": self.get_stage_output_path(
                    self._output, self._cur_stage_id),
            })
            self._dump_status()
            self._run_auto_config(stage)
            command = self._make_submit_cmd(stage, verbose)
            t_start = time.time()
            ret = run_command(command)
            t_end = time.time()
            stage_result = Result(str(self._cur_stage_id), ret, "finished",
                                  t_end - t_start, t_end)
            # Attach the result to the stage entry appended above.
            self._status_info["stages"][-1]["result"] = serialize_result(stage_result)
            self._dump_status()
            if ret != 0:
                raise ExecutionError(f"stage {self._cur_stage_id} failed")
            self._cur_stage_id += 1

        logger.info("Finished execution pipeline")
Example #5
0
File: hpc_submitter.py Project: jgu2/jade
 def _cancel_job(self, job, aggregator):
     """Cancel *job* (a blocking job of its failed) and return the recorded result."""
     job.state = JobState.DONE
     job.blocked_by.clear()
     canceled = Result(job.name, 1, JobCompletionStatus.CANCELED, 0)
     aggregator.append_result(canceled)
     logger.info("Canceled job %s because one of its blocking jobs failed.",
                 job.name)
     return canceled
Example #6
0
 def cancel(self):
     """Mark the job canceled; only the manager node records the CANCELED result."""
     self._return_code = 1
     self._is_complete = True
     if not self._is_manager_node:
         # Non-manager nodes of a multi-node job only log the cancellation.
         logger.info("Canceled job %s on non-manager node", self._job.name)
         return
     canceled = Result(
         self._job.name,
         self._return_code,
         JobCompletionStatus.CANCELED,
         0.0,
         hpc_job_id=self._hpc_job_id,
     )
     ResultsAggregator.append(self._output, canceled, batch_id=self._batch_id)
     logger.info("Canceled job %s", self._job.name)
Example #7
0
    def _complete(self):
        """Finalize the finished subprocess and append its Result.

        On non-manager nodes of a multi-node job only a log line is emitted;
        the manager node alone records the completion so it is not recorded
        more than once.
        """
        self._return_code = self._pipe.returncode
        exec_time_s = time.time() - self._start_time

        if not self._is_manager_node:
            logger.info(
                "Job %s completed on non-manager node return_code=%s exec_time_s=%s",
                self._job.name,
                self._return_code,
                exec_time_s,
            )
            return

        status = JobCompletionStatus.FINISHED
        output_dir = self._output / JOBS_OUTPUT_DIR / self._job.name
        # Report how much disk the job's output consumed.
        size_event = StructuredLogEvent(
            source=self._job.name,
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_BYTES_CONSUMED,
            message="job output directory size",
            bytes_consumed=get_directory_size_bytes(output_dir),
        )
        log_event(size_event)
        result = Result(
            self._job.name, self._return_code, status, exec_time_s, hpc_job_id=self._hpc_job_id
        )
        ResultsAggregator.append(self._output, result, batch_id=self._batch_id)

        logger.info(
            "Job %s completed return_code=%s exec_time_s=%s hpc_job_id=%s",
            self._job.name,
            self._return_code,
            exec_time_s,
            self._hpc_job_id,
        )
Example #8
0
def create_result(index):
    """Creates a result with unique fields based on an index."""
    name = str(index)
    exec_time_s = float(index) + 1.0
    return Result(name, index, "finished", exec_time_s, hpc_job_id=None)