# Standard-library imports used below; TaskStatus, StageStatus, completed_task_statuses,
# task_printout, task_printout_long, and wait_for_file are assumed to be provided by the
# surrounding module.
import datetime
import re


def task_status_changed(task):
    # Pull the DRM's log output for the task once it has reached a terminal status.
    if task.status in completed_task_statuses:
        task.workflow.jobmanager.get_drm(task.drm).populate_logs(task)

    if task.status == TaskStatus.waiting:
        task.started_on = datetime.datetime.now()

    elif task.status == TaskStatus.submitted:
        task.stage.status = StageStatus.running
        if not task.NOOP:
            task.log.info(
                '%s %s. drm=%s; drm_jobid=%s; job_class=%s; queue=%s' %
                (task, task.status, repr(task.drm), repr(task.drm_jobID),
                 repr(task.job_class), repr(task.queue)))
        task.submitted_on = datetime.datetime.now()

    elif task.status == TaskStatus.failed:
        if not task.must_succeed:
            task.log.warn('%s failed, but must_succeed is False' % task)
            task.log.warn(task_printout.format(task))
            task.finished_on = datetime.datetime.now()
        else:
            # By default /usr/bin/timeout returns 124 when it kills a job.
            # DRM_Local jobs that time out will usually have this error code.
            # Other DRMs may well have different error codes. Currently, this
            # check is purely cosmetic, but if we do more here, then
            # FIXME we should have a DRM-agnostic way of determining timed-out tasks.
            if task.exit_status == 124:
                exit_reason = 'timed out'
            else:
                exit_reason = 'failed'

            task.log.warn('%s attempt #%s %s (max_attempts=%s)' %
                          (task, task.attempt, exit_reason, task.max_attempts))

            if task.attempt < task.max_attempts:
                # Retries remain: reset the task so it will be rescheduled.
                task.log.warn(task_printout.format(task))
                task.attempt += 1
                task.status = TaskStatus.no_attempt
            else:
                # Out of retries: give stderr up to 30 seconds to appear on disk,
                # then mark the task and its stage as failed.
                wait_for_file(task.workflow, task.output_stderr_path, 30, error=False)
                task.log.warn(task_printout.format(task))
                task.log.error('%s has failed too many times' % task)
                task.finished_on = datetime.datetime.now()
                task.stage.status = StageStatus.running_but_failed

    elif task.status == TaskStatus.successful:
        task.successful = True
        if not task.NOOP:
            task.log.info('{} {}, wall_time: {}. {}/{} Tasks finished.'.format(
                task, task.status,
                datetime.timedelta(seconds=task.wall_time),
                sum(1 for t in task.workflow.tasks if t.finished),
                len(task.workflow.tasks)))
        task.finished_on = datetime.datetime.now()
        # A stage is successful once every task that must succeed has done so.
        if all(t.successful or not t.must_succeed for t in task.stage.tasks):
            task.stage.status = StageStatus.successful
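# Illustrative sketch (not part of the handler above): GNU coreutils' /usr/bin/timeout
# exits with status 124 when it kills a command that overruns its limit, which is why an
# exit_status of 124 is reported as 'timed out' for local-DRM jobs. Assumes a POSIX
# system with the `timeout` and `sleep` binaries on PATH.
def _demo_timeout_exit_status():
    import subprocess

    # `timeout 1 sleep 5` is killed after ~1 second; its return code is 124.
    result = subprocess.run(['timeout', '1', 'sleep', '5'])
    assert result.returncode == 124
    return result.returncode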
def task_status_changed(task):
    if task.status in completed_task_statuses:
        task.workflow.jobmanager.get_drm(task.drm).populate_logs(task)

    if task.status == TaskStatus.waiting:
        task.started_on = datetime.datetime.now()

    elif task.status == TaskStatus.submitted:
        task.stage.status = StageStatus.running
        if not task.NOOP:
            task.log.info(f"{task} {task.status}. "
                          f"drm={repr(task.drm)}; "
                          f"drm_jobid={repr(task.drm_jobID)}; "
                          f"job_class={repr(task.job_class)}; "
                          f"queue={repr(task.queue)}; "
                          f"core_req={task.core_req}; "
                          f"gpu_req={task.gpu_req}; "
                          f"mem_req={task.mem_req}")
        task.submitted_on = datetime.datetime.now()

    elif task.status == TaskStatus.failed:
        if not task.must_succeed:
            task.log.warn("%s failed, but must_succeed is False" % task)
            task.log.warn(task_printout_long.format(task))
            task.finished_on = datetime.datetime.now()
        else:
            # By default /usr/bin/timeout returns 124 when it kills a job.
            # DRM_Local jobs that time out will usually have this error code.
            # Other DRMs may well have different error codes. Currently, this
            # check is purely cosmetic, but if we do more here, then
            # FIXME we should have a DRM-agnostic way of determining timed-out tasks.
            if task.exit_status == 124:
                exit_reason = "timed out"
            else:
                exit_reason = "failed"

            task.log.warn(
                f"{task} attempt #{task.attempt} {exit_reason} with status_reason: '{task.status_reason}'"
            )

            regex = task.drm_options.get("retry_only_if_status_reason_matches")
            # If task.status_reason matches the regex, then we want to retry.
            # Ex: regex = "Host .+ Terminated" with task.status_reason = "Host Terminated"
            # would indicate we want to retry because a spot instance died.
            if regex is not None:
                status_reason_is_valid_for_retry = re.search(regex, task.status_reason or "")
            else:
                status_reason_is_valid_for_retry = True

            if status_reason_is_valid_for_retry and task.attempt < task.max_attempts:
                task.attempt += 1
                task.log.info(
                    f"Reattempting {task}, this will be attempt #{task.attempt}, max_attempts={task.max_attempts}"
                )
                task.status = TaskStatus.no_attempt
            else:
                wait_for_file(task.workflow, task.output_stderr_path, 30, error=False)
                task.log.warn(task_printout_long.format(task))
                task.log.error("%s has failed too many times" % task)
                task.finished_on = datetime.datetime.now()
                task.stage.status = StageStatus.running_but_failed

    elif task.status == TaskStatus.successful:
        task.successful = True
        if not task.NOOP:
            task.log.info(
                "{} {}, drm_jobid={}, wall_time: {}. {}/{} Tasks finished.".format(
                    task,
                    task.status,
                    task.drm_jobID,
                    datetime.timedelta(seconds=task.wall_time),
                    sum(1 for t in task.workflow.tasks if t.finished),
                    len(task.workflow.tasks),
                ))
        task.finished_on = datetime.datetime.now()
        if all(t.successful or not t.must_succeed for t in task.stage.tasks):
            task.stage.status = StageStatus.successful
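# Minimal sketch of the retry gate above, shown in isolation. The drm_options key
# "retry_only_if_status_reason_matches" holds an optional regex; a failed task is only
# eligible for a retry when re.search() finds that pattern in its status_reason, and a
# missing regex makes every failure eligible. The sample status reasons below are hypothetical.
def _should_retry(drm_options, status_reason):
    import re

    regex = drm_options.get("retry_only_if_status_reason_matches")
    if regex is None:
        return True
    return re.search(regex, status_reason or "") is not None


assert _should_retry({}, "NonZeroExitCode") is True
assert _should_retry({"retry_only_if_status_reason_matches": "Host .+ was terminated"},
                     "Host i-0abc123 was terminated") is True
assert _should_retry({"retry_only_if_status_reason_matches": "Host .+ was terminated"},
                     "OutOfMemoryError") is False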