예제 #1
0
파일: Task.py 프로젝트: LPM-HMS/COSMOS2
def task_status_changed(task):
    """Update bookkeeping and emit log messages after ``task.status`` changes.

    Depending on the new status this stamps ``started_on`` / ``submitted_on`` /
    ``finished_on`` on the task, updates ``task.stage.status``, and, for a
    failed task that must succeed, either resets it to
    ``TaskStatus.no_attempt`` for another attempt or marks the stage as
    ``StageStatus.running_but_failed``.

    :param task: the task whose status changed; it is mutated in place.
    """
    if task.status in completed_task_statuses:
        # Ask the DRM that handles this task to populate its logs.
        task.workflow.jobmanager.get_drm(task.drm).populate_logs(task)

    if task.status == TaskStatus.waiting:
        task.started_on = datetime.datetime.now()

    elif task.status == TaskStatus.submitted:
        task.stage.status = StageStatus.running
        if not task.NOOP:
            task.log.info(
                '%s %s. drm=%s; drm_jobid=%s; job_class=%s; queue=%s' %
                (task, task.status, repr(task.drm), repr(task.drm_jobID),
                 repr(task.job_class), repr(task.queue)))
        task.submitted_on = datetime.datetime.now()

    elif task.status == TaskStatus.failed:
        # NOTE: Logger.warn is a deprecated alias; Logger.warning is used instead.
        if not task.must_succeed:
            # Failure is tolerated for this task: report it and finish.
            task.log.warning('%s failed, but must_succeed is False' % task)
            task.log.warning(task_printout.format(task))
            task.finished_on = datetime.datetime.now()
        else:
            #
            # By default /usr/bin/timeout returns 124 when it kills a job.
            # DRM_Local jobs that time out will usually have this error code.
            # Other DRM's may well have different error codes. Currently, this
            # check is purely cosmetic, but if we do more here, then
            # FIXME we should have a DRM-agnostic way of determining timed-out tasks.
            #
            exit_reason = 'timed out' if task.exit_status == 124 else 'failed'

            task.log.warning('%s attempt #%s %s (max_attempts=%s)' %
                             (task, task.attempt, exit_reason, task.max_attempts))

            if task.attempt < task.max_attempts:
                # Attempts remain: bump the counter and reset the status so
                # the task can be attempted again.
                task.log.warning(task_printout.format(task))
                task.attempt += 1
                task.status = TaskStatus.no_attempt
            else:
                # Wait (without raising, error=False) for the stderr file
                # before printing the task details.
                wait_for_file(task.workflow, task.output_stderr_path, 30, error=False)

                task.log.warning(task_printout.format(task))
                task.log.error('%s has failed too many times' % task)
                task.finished_on = datetime.datetime.now()
                task.stage.status = StageStatus.running_but_failed

    elif task.status == TaskStatus.successful:
        task.successful = True
        if not task.NOOP:
            wall_time = datetime.timedelta(seconds=task.wall_time)
            n_finished = sum(1 for t in task.workflow.tasks if t.finished)
            n_total = len(task.workflow.tasks)
            task.log.info('{} {}, wall_time: {}.  {}/{} Tasks finished.'.format(
                task, task.status, wall_time, n_finished, n_total))
        task.finished_on = datetime.datetime.now()
        # The stage succeeds once every task either succeeded or was allowed to fail.
        if all(t.successful or not t.must_succeed for t in task.stage.tasks):
            task.stage.status = StageStatus.successful
예제 #2
0
파일: Task.py 프로젝트: p7k/COSMOS2
def task_status_changed(task):
    """Handle a change of ``task.status``: timestamps, stage status and logging.

    * ``waiting``: records ``started_on``.
    * ``submitted``: marks the stage as running, logs the submission and
      records ``submitted_on``.
    * ``failed``: if the task need not succeed it is simply finished;
      otherwise it is reset to ``TaskStatus.no_attempt`` while
      ``task.attempt < task.max_attempts``, and after that the stage is marked
      ``StageStatus.running_but_failed``.
    * ``successful``: records ``finished_on`` and marks the stage successful
      when every task in it succeeded or was allowed to fail.

    :param task: the task whose status changed; it is mutated in place.
    """
    if task.status in completed_task_statuses:
        # Ask the DRM that handles this task to populate its logs.
        task.workflow.jobmanager.get_drm(task.drm).populate_logs(task)

    if task.status == TaskStatus.waiting:
        task.started_on = datetime.datetime.now()

    elif task.status == TaskStatus.submitted:
        task.stage.status = StageStatus.running
        if not task.NOOP:
            task.log.info(
                '%s %s. drm=%s; drm_jobid=%s; job_class=%s; queue=%s' %
                (task, task.status, repr(task.drm), repr(task.drm_jobID),
                 repr(task.job_class), repr(task.queue)))
        task.submitted_on = datetime.datetime.now()

    elif task.status == TaskStatus.failed:
        # NOTE: Logger.warn is a deprecated alias; Logger.warning is used instead.
        if not task.must_succeed:
            task.log.warning('%s failed, but must_succeed is False' % task)
            task.log.warning(task_printout.format(task))
            task.finished_on = datetime.datetime.now()
        else:
            #
            # By default /usr/bin/timeout returns 124 when it kills a job.
            # DRM_Local jobs that time out will usually have this error code.
            # Other DRM's may well have different error codes. Currently, this
            # check is purely cosmetic, but if we do more here, then
            # FIXME we should have a DRM-agnostic way of determining timed-out tasks.
            #
            if task.exit_status == 124:
                exit_reason = 'timed out'
            else:
                exit_reason = 'failed'

            task.log.warning('%s attempt #%s %s (max_attempts=%s)' %
                             (task, task.attempt, exit_reason, task.max_attempts))

            if task.attempt < task.max_attempts:
                # Attempts remain: reset the task so it can be tried again.
                task.log.warning(task_printout.format(task))
                task.attempt += 1
                task.status = TaskStatus.no_attempt
            else:
                # error=False: do not raise if the stderr file never shows up.
                wait_for_file(task.workflow, task.output_stderr_path, 30, error=False)

                task.log.warning(task_printout.format(task))
                task.log.error('%s has failed too many times' % task)
                task.finished_on = datetime.datetime.now()
                task.stage.status = StageStatus.running_but_failed

    elif task.status == TaskStatus.successful:
        task.successful = True
        if not task.NOOP:
            task.log.info(
                '{} {}, wall_time: {}.  {}/{} Tasks finished.'.format(
                    task,
                    task.status,
                    datetime.timedelta(seconds=task.wall_time),
                    sum(1 for t in task.workflow.tasks if t.finished),
                    len(task.workflow.tasks),
                ))
        task.finished_on = datetime.datetime.now()
        if all(t.successful or not t.must_succeed for t in task.stage.tasks):
            task.stage.status = StageStatus.successful
예제 #3
0
파일: Task.py 프로젝트: indraniel/COSMOS2
def task_status_changed(task):
    """Update bookkeeping and emit log messages after ``task.status`` changes.

    Depending on the new status this stamps ``started_on`` / ``submitted_on`` /
    ``finished_on`` on the task and updates ``task.stage.status``.

    A failed task that must succeed is reset to ``TaskStatus.no_attempt`` when
    attempts remain and, if the ``retry_only_if_status_reason_matches`` DRM
    option is set, ``task.status_reason`` matches that regex. Otherwise the
    task is finished and its stage is marked
    ``StageStatus.running_but_failed``.

    :param task: the task whose status changed; it is mutated in place.
    """
    if task.status in completed_task_statuses:
        # Ask the DRM that handles this task to populate its logs.
        task.workflow.jobmanager.get_drm(task.drm).populate_logs(task)

    if task.status == TaskStatus.waiting:
        task.started_on = datetime.datetime.now()

    elif task.status == TaskStatus.submitted:
        task.stage.status = StageStatus.running
        if not task.NOOP:
            task.log.info(f"{task} {task.status}. "
                          f"drm={repr(task.drm)}; "
                          f"drm_jobid={repr(task.drm_jobID)}; "
                          f"job_class={repr(task.job_class)}; "
                          f"queue={repr(task.queue)}; "
                          f"core_req={task.core_req}; "
                          f"gpu_req={task.gpu_req}; "
                          f"mem_req={task.mem_req}")
        task.submitted_on = datetime.datetime.now()

    elif task.status == TaskStatus.failed:
        # NOTE: Logger.warn is a deprecated alias; Logger.warning is used instead.
        if not task.must_succeed:
            task.log.warning("%s failed, but must_succeed is False" % task)
            task.log.warning(task_printout_long.format(task))
            task.finished_on = datetime.datetime.now()
        else:
            #
            # By default /usr/bin/timeout returns 124 when it kills a job.
            # DRM_Local jobs that time out will usually have this error code.
            # Other DRM's may well have different error codes. Currently, this
            # check is purely cosmetic, but if we do more here, then
            # FIXME we should have a DRM-agnostic way of determining timed-out tasks.
            #
            if task.exit_status == 124:
                exit_reason = "timed out"
            else:
                exit_reason = "failed"

            task.log.warning(
                f"{task} attempt #{task.attempt} {exit_reason} with status_reason: '{task.status_reason}'"
            )
            regex = task.drm_options.get("retry_only_if_status_reason_matches")
            # if task.status_reason matches our regex, then we want to retry
            # ex: regex = "Host .+ Terminated", task.status_reason = "Host Terminated" would indicate we want
            # to retry because a spot instance died
            if regex is not None:
                status_reason_is_valid_for_retry = re.search(
                    regex, task.status_reason or "")
            else:
                # No restriction configured: any failure may be retried.
                status_reason_is_valid_for_retry = True

            if status_reason_is_valid_for_retry and task.attempt < task.max_attempts:
                task.attempt += 1
                task.log.info(
                    f"Reattempting {task}, this will be attempt #{task.attempt}, max_attempts={task.max_attempts}"
                )
                task.status = TaskStatus.no_attempt
            else:
                # error=False: do not raise if the stderr file never shows up.
                wait_for_file(task.workflow,
                              task.output_stderr_path,
                              30,
                              error=False)

                task.log.warning(task_printout_long.format(task))
                if task.attempt < task.max_attempts:
                    # Attempts remain, so the only reason we got here is that
                    # status_reason did not match the retry regex; say so
                    # instead of claiming the attempts were exhausted.
                    task.log.error(
                        f"{task} failed and will not be retried: status_reason "
                        f"{task.status_reason!r} does not match "
                        f"retry_only_if_status_reason_matches={regex!r}")
                else:
                    task.log.error("%s has failed too many times" % task)
                task.finished_on = datetime.datetime.now()
                task.stage.status = StageStatus.running_but_failed

    elif task.status == TaskStatus.successful:
        task.successful = True
        if not task.NOOP:
            task.log.info(
                "{} {}, drm_jobid={}, wall_time: {}.  {}/{} Tasks finished.".
                format(
                    task,
                    task.status,
                    task.drm_jobID,
                    datetime.timedelta(seconds=task.wall_time),
                    sum(1 for t in task.workflow.tasks if t.finished),
                    len(task.workflow.tasks),
                ))
        task.finished_on = datetime.datetime.now()
        # The stage succeeds once every task either succeeded or was allowed to fail.
        if all(t.successful or not t.must_succeed for t in task.stage.tasks):
            task.stage.status = StageStatus.successful