def task_upload_job(process, transport_queue, cancellable):
    """Transport task that will attempt to upload the files of a job calculation to the remote.

    The task will first request a transport from the queue. Once the transport is yielded, the relevant execmanager
    function is called, wrapped in the exponential_backoff_retry coroutine, which, in case of a caught exception, will
    retry after an interval that increases exponentially with the number of retries, for a maximum number of retries.
    If all retries fail, the task will raise a TransportTaskException

    :param process: the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return if the task was successfully completed
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    node = process.node

    if node.get_state() == CalcJobState.SUBMITTING:
        logger.warning('CalcJob<{}> already marked as SUBMITTING, skipping task_upload_job'.format(node.pk))
        raise Return

    initial_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    max_attempts = TRANSPORT_TASK_MAXIMUM_ATTEMTPS

    authinfo = node.computer.get_authinfo(node.user)

    @coroutine
    def do_upload():
        with transport_queue.request_transport(authinfo) as request:
            transport = yield cancellable.with_interrupt(request)

            with SandboxFolder() as folder:
                # Any exception thrown in `presubmit` call is not transient so we circumvent the exponential backoff
                try:
                    calc_info = process.presubmit(folder)
                except Exception as exception:  # pylint: disable=broad-except
                    raise PreSubmitException('exception occurred in presubmit call') from exception
                else:
                    execmanager.upload_calculation(node, transport, calc_info, folder)

            raise Return

    try:
        logger.info('scheduled request to upload CalcJob<{}>'.format(node.pk))
        ignore_exceptions = (plumpy.CancelledError, PreSubmitException)
        result = yield exponential_backoff_retry(
            do_upload, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions
        )
    except PreSubmitException:
        raise
    except plumpy.CancelledError:
        pass
    except Exception:
        logger.warning('uploading CalcJob<{}> failed'.format(node.pk))
        raise TransportTaskException('upload_calculation failed {} times consecutively'.format(max_attempts))
    else:
        logger.info('uploading CalcJob<{}> successful'.format(node.pk))
        node.set_state(CalcJobState.SUBMITTING)
        raise Return(result)
def task_update_job(node, job_manager, cancellable):
    """Transport task that will attempt to update the scheduler status of the job calculation.

    The task will first request a transport from the queue. Once the transport is yielded, the relevant execmanager
    function is called, wrapped in the exponential_backoff_retry coroutine, which, in case of a caught exception, will
    retry after an interval that increases exponentially with the number of retries, for a maximum number of retries.
    If all retries fail, the task will raise a TransportTaskException

    :param node: the node that represents the job calculation
    :type node: :class:`aiida.orm.nodes.process.calculation.calcjob.CalcJobNode`
    :param job_manager: The job manager
    :type job_manager: :class:`aiida.engine.processes.calcjobs.manager.JobManager`
    :param cancellable: A cancel flag
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return containing True if the task was successfully completed, False otherwise
    """
    if node.get_state() == CalcJobState.RETRIEVING:
        logger.warning('CalcJob<{}> already marked as RETRIEVING, skipping task_update_job'.format(node.pk))
        raise Return(True)

    initial_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    max_attempts = TRANSPORT_TASK_MAXIMUM_ATTEMTPS

    authinfo = node.computer.get_authinfo(node.user)
    job_id = node.get_job_id()

    @coroutine
    def do_update():
        # Get the update request
        with job_manager.request_job_info_update(authinfo, job_id) as update_request:
            job_info = yield cancellable.with_interrupt(update_request)

        if job_info is None:
            # If the job is computed or not found assume it's done
            node.set_scheduler_state(JobState.DONE)
            job_done = True
        else:
            node.set_last_job_info(job_info)
            node.set_scheduler_state(job_info.job_state)
            job_done = job_info.job_state == JobState.DONE

        raise Return(job_done)

    try:
        logger.info('scheduled request to update CalcJob<{}>'.format(node.pk))
        job_done = yield exponential_backoff_retry(
            do_update, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=plumpy.Interruption
        )
    except plumpy.Interruption:
        raise
    except Exception:
        logger.warning('updating CalcJob<{}> failed'.format(node.pk))
        raise TransportTaskException('update_calculation failed {} times consecutively'.format(max_attempts))
    else:
        logger.info('updating CalcJob<{}> successful'.format(node.pk))

        if job_done:
            node.set_state(CalcJobState.RETRIEVING)

        raise Return(job_done)
def task_retrieve_job(node, transport_queue, retrieved_temporary_folder, cancellable):
    """Transport task that will attempt to retrieve all files of a completed job calculation.

    The task will first request a transport from the queue. Once the transport is yielded, the relevant execmanager
    function is called, wrapped in the exponential_backoff_retry coroutine, which, in case of a caught exception, will
    retry after an interval that increases exponentially with the number of retries, for a maximum number of retries.
    If all retries fail, the task will raise a TransportTaskException

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param retrieved_temporary_folder: the absolute path to a directory to store files
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return if the task was successfully completed
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    if node.get_state() == CalcJobState.PARSING:
        logger.warning('CalcJob<{}> already marked as PARSING, skipping task_retrieve_job'.format(node.pk))
        raise Return

    initial_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    max_attempts = TRANSPORT_TASK_MAXIMUM_ATTEMTPS

    authinfo = node.computer.get_authinfo(node.user)

    @coroutine
    def do_retrieve():
        with transport_queue.request_transport(authinfo) as request:
            transport = yield cancellable.with_interrupt(request)

            # Perform the job accounting and set it on the node if successful. If the scheduler does not implement this
            # still set the attribute but set it to `None`. This way we can distinguish calculation jobs for which the
            # accounting was called but could not be set.
            scheduler = node.computer.get_scheduler()
            scheduler.set_transport(transport)

            try:
                detailed_job_info = scheduler.get_detailed_job_info(node.get_job_id())
            except FeatureNotAvailable:
                logger.info('detailed job info not available for scheduler of CalcJob<{}>'.format(node.pk))
                node.set_detailed_job_info(None)
            else:
                node.set_detailed_job_info(detailed_job_info)

            raise Return(execmanager.retrieve_calculation(node, transport, retrieved_temporary_folder))

    try:
        logger.info('scheduled request to retrieve CalcJob<{}>'.format(node.pk))
        yield exponential_backoff_retry(
            do_retrieve, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=plumpy.Interruption
        )
    except plumpy.Interruption:
        raise
    except Exception:
        logger.warning('retrieving CalcJob<{}> failed'.format(node.pk))
        raise TransportTaskException('retrieve_calculation failed {} times consecutively'.format(max_attempts))
    else:
        node.set_state(CalcJobState.PARSING)
        logger.info('retrieving CalcJob<{}> successful'.format(node.pk))
        raise Return
def task_submit_job(node, transport_queue, calc_info, script_filename, cancellable):
    """Transport task that will attempt to submit a job calculation.

    The task will first request a transport from the queue. Once the transport is yielded, the relevant execmanager
    function is called, wrapped in the exponential_backoff_retry coroutine, which, in case of a caught exception, will
    retry after an interval that increases exponentially with the number of retries, for a maximum number of retries.
    If all retries fail, the task will raise a TransportTaskException

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param calc_info: the calculation info datastructure returned by `CalcJobNode._presubmit`
    :param script_filename: the job launch script returned by `CalcJobNode._presubmit`
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return if the task was successfully completed
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    if node.get_state() == CalcJobState.WITHSCHEDULER:
        assert node.get_job_id() is not None, 'job is WITHSCHEDULER, however, it does not have a job id'
        logger.warning('CalcJob<{}> already marked as WITHSCHEDULER, skipping task_submit_job'.format(node.pk))
        raise Return(node.get_job_id())

    initial_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    max_attempts = TRANSPORT_TASK_MAXIMUM_ATTEMTPS

    authinfo = node.computer.get_authinfo(node.user)

    @coroutine
    def do_submit():
        with transport_queue.request_transport(authinfo) as request:
            transport = yield cancellable.with_interrupt(request)
            raise Return(execmanager.submit_calculation(node, transport, calc_info, script_filename))

    try:
        logger.info('scheduled request to submit CalcJob<{}>'.format(node.pk))
        result = yield exponential_backoff_retry(
            do_submit, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=plumpy.Interruption
        )
    except plumpy.Interruption:
        pass
    except Exception:
        logger.warning('submitting CalcJob<{}> failed'.format(node.pk))
        raise TransportTaskException('submit_calculation failed {} times consecutively'.format(max_attempts))
    else:
        logger.info('submitting CalcJob<{}> successful'.format(node.pk))
        node.set_state(CalcJobState.WITHSCHEDULER)
        raise Return(result)
async def task_submit_job(node: CalcJobNode, transport_queue: TransportQueue, cancellable: InterruptableFuture):
    """Transport task that will attempt to submit a job calculation.

    The task will first request a transport from the queue. Once the transport is yielded, the relevant execmanager
    function is called, wrapped in the exponential_backoff_retry coroutine, which, in case of a caught exception, will
    retry after an interval that increases exponentially with the number of retries, for a maximum number of retries.
    If all retries fail, the task will raise a TransportTaskException

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    if node.get_state() == CalcJobState.WITHSCHEDULER:
        assert node.get_job_id() is not None, 'job is WITHSCHEDULER, however, it does not have a job id'
        logger.warning(f'CalcJob<{node.pk}> already marked as WITHSCHEDULER, skipping task_submit_job')
        return node.get_job_id()

    initial_interval = get_config_option(RETRY_INTERVAL_OPTION)
    max_attempts = get_config_option(MAX_ATTEMPTS_OPTION)

    authinfo = node.get_authinfo()

    async def do_submit():
        with transport_queue.request_transport(authinfo) as request:
            transport = await cancellable.with_interrupt(request)
            return execmanager.submit_calculation(node, transport)

    try:
        logger.info(f'scheduled request to submit CalcJob<{node.pk}>')
        ignore_exceptions = (plumpy.futures.CancelledError, plumpy.process_states.Interruption)
        result = await exponential_backoff_retry(
            do_submit, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions
        )
    except (plumpy.futures.CancelledError, plumpy.process_states.Interruption):  # pylint: disable=try-except-raise
        raise
    except Exception as exception:
        logger.warning(f'submitting CalcJob<{node.pk}> failed')
        raise TransportTaskException(f'submit_calculation failed {max_attempts} times consecutively') from exception
    else:
        logger.info(f'submitting CalcJob<{node.pk}> successful')
        node.set_state(CalcJobState.WITHSCHEDULER)
        return result
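# --- Illustrative sketch, not part of the engine ---------------------------------------------------------------------
# The docstrings above all describe the behaviour of `exponential_backoff_retry` from `aiida.engine.utils`. The helper
# below is a hypothetical, self-contained approximation added only to make those retry semantics concrete: the awaited
# coroutine factory is retried up to `max_attempts` times, with a wait that doubles after every failed attempt, while
# exceptions listed in `ignore_exceptions` (cancellations and other non-transient errors) are re-raised immediately.
# The real utility's signature and internals may differ.
import asyncio


async def _example_backoff_retry(coro_factory, initial_interval=10.0, max_attempts=5, ignore_exceptions=()):
    """Await ``coro_factory()`` until it succeeds, doubling the wait between failed attempts."""
    interval = initial_interval

    for attempt in range(max_attempts):
        try:
            return await coro_factory()
        except ignore_exceptions:
            # Exceptions that should not be retried, e.g. interruptions, are propagated right away.
            raise
        except Exception:  # pylint: disable=broad-except
            if attempt == max_attempts - 1:
                # The last attempt also failed: let the caller turn this into a `TransportTaskException`.
                raise
            await asyncio.sleep(interval)
            interval *= 2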
async def task_stash_job(node: CalcJobNode, transport_queue: TransportQueue, cancellable: InterruptableFuture):
    """Transport task that will optionally stash files of a completed job calculation on the remote.

    The task will first request a transport from the queue. Once the transport is yielded, the relevant execmanager
    function is called, wrapped in the exponential_backoff_retry coroutine, which, in case of a caught exception, will
    retry after an interval that increases exponentially with the number of retries, for a maximum number of retries.
    If all retries fail, the task will raise a TransportTaskException

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    if node.get_state() == CalcJobState.RETRIEVING:
        logger.warning(f'calculation<{node.pk}> already marked as RETRIEVING, skipping task_stash_job')
        return

    initial_interval = get_config_option(RETRY_INTERVAL_OPTION)
    max_attempts = get_config_option(MAX_ATTEMPTS_OPTION)

    authinfo = node.get_authinfo()

    async def do_stash():
        with transport_queue.request_transport(authinfo) as request:
            transport = await cancellable.with_interrupt(request)

            logger.info(f'stashing calculation<{node.pk}>')
            return execmanager.stash_calculation(node, transport)

    try:
        await exponential_backoff_retry(
            do_stash, initial_interval, max_attempts, logger=node.logger,
            ignore_exceptions=plumpy.process_states.Interruption
        )
    except plumpy.process_states.Interruption:
        raise
    except Exception as exception:
        logger.warning(f'stashing calculation<{node.pk}> failed')
        raise TransportTaskException(f'stash_calculation failed {max_attempts} times consecutively') from exception
    else:
        node.set_state(CalcJobState.RETRIEVING)
        logger.info(f'stashing calculation<{node.pk}> successful')
        return
def task_kill_job(node, transport_queue, cancel_flag):
    """Transport task that will attempt to kill a job calculation.

    The task will first request a transport from the queue. Once the transport is yielded, the relevant execmanager
    function is called, wrapped in the exponential_backoff_retry coroutine, which, in case of a caught exception, will
    retry after an interval that increases exponentially with the number of retries, for a maximum number of retries.
    If all retries fail, the task will raise a TransportTaskException

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param cancel_flag: the cancelled flag that will be queried to determine whether the task was cancelled
    :raises: Return if the task was successfully completed
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    initial_interval = 1
    max_attempts = 5

    if node.get_state() in [calc_states.NEW, calc_states.TOSUBMIT]:
        logger.warning('calculation<{}> killed, it was in the {} state'.format(node.pk, node.get_state()))
        node._set_state(calc_states.FAILED)
        raise Return(True)

    authinfo = node.get_computer().get_authinfo(node.get_user())

    @coroutine
    def do_kill():
        with transport_queue.request_transport(authinfo) as request:
            transport = yield request

            # It may have taken time to get the transport, check if we've been cancelled
            if cancel_flag.is_cancelled:
                raise plumpy.CancelledError('task_kill_job for calculation<{}> cancelled'.format(node.pk))

            logger.info('killing calculation<{}>'.format(node.pk))

            raise Return(execmanager.kill_calculation(node, transport))

    try:
        result = yield exponential_backoff_retry(do_kill, initial_interval, max_attempts, logger=node.logger)
    except plumpy.CancelledError:
        pass
    except Exception:
        logger.warning('killing calculation<{}> failed:\n{}'.format(node.pk, traceback.format_exc()))
        node._set_state(calc_states.FAILED)
        raise TransportTaskException('kill_calculation failed {} times consecutively'.format(max_attempts))
    else:
        logger.info('killing calculation<{}> successful'.format(node.pk))
        raise Return(result)
def task_kill_job(node, transport_queue, cancellable):
    """Transport task that will attempt to kill a job calculation.

    The task will first request a transport from the queue. Once the transport is yielded, the relevant execmanager
    function is called, wrapped in the exponential_backoff_retry coroutine, which, in case of a caught exception, will
    retry after an interval that increases exponentially with the number of retries, for a maximum number of retries.
    If all retries fail, the task will raise a TransportTaskException

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return if the task was successfully completed
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    initial_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    max_attempts = TRANSPORT_TASK_MAXIMUM_ATTEMTPS

    if node.get_state() in [CalcJobState.UPLOADING, CalcJobState.SUBMITTING]:
        logger.warning('CalcJob<{}> killed, it was in the {} state'.format(node.pk, node.get_state()))
        raise Return(True)

    authinfo = node.computer.get_authinfo(node.user)

    @coroutine
    def do_kill():
        with transport_queue.request_transport(authinfo) as request:
            transport = yield cancellable.with_interrupt(request)
            raise Return(execmanager.kill_calculation(node, transport))

    try:
        logger.info('scheduled request to kill CalcJob<{}>'.format(node.pk))
        result = yield exponential_backoff_retry(do_kill, initial_interval, max_attempts, logger=node.logger)
    except plumpy.Interruption:
        raise
    except Exception:
        logger.warning('killing CalcJob<{}> failed'.format(node.pk))
        raise TransportTaskException('kill_calculation failed {} times consecutively'.format(max_attempts))
    else:
        logger.info('killing CalcJob<{}> successful'.format(node.pk))
        node.set_scheduler_state(JobState.DONE)
        raise Return(result)
async def task_kill_job(node: CalcJobNode, transport_queue: TransportQueue, cancellable: InterruptableFuture):
    """Transport task that will attempt to kill a job calculation.

    The task will first request a transport from the queue. Once the transport is yielded, the relevant execmanager
    function is called, wrapped in the exponential_backoff_retry coroutine, which, in case of a caught exception, will
    retry after an interval that increases exponentially with the number of retries, for a maximum number of retries.
    If all retries fail, the task will raise a TransportTaskException

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    initial_interval = get_config_option(RETRY_INTERVAL_OPTION)
    max_attempts = get_config_option(MAX_ATTEMPTS_OPTION)

    if node.get_state() in [CalcJobState.UPLOADING, CalcJobState.SUBMITTING]:
        logger.warning(f'CalcJob<{node.pk}> killed, it was in the {node.get_state()} state')
        return True

    authinfo = node.get_authinfo()

    async def do_kill():
        with transport_queue.request_transport(authinfo) as request:
            transport = await cancellable.with_interrupt(request)
            return execmanager.kill_calculation(node, transport)

    try:
        logger.info(f'scheduled request to kill CalcJob<{node.pk}>')
        result = await exponential_backoff_retry(do_kill, initial_interval, max_attempts, logger=node.logger)
    except plumpy.process_states.Interruption:
        raise
    except Exception as exception:
        logger.warning(f'killing CalcJob<{node.pk}> failed')
        raise TransportTaskException(f'kill_calculation failed {max_attempts} times consecutively') from exception
    else:
        logger.info(f'killing CalcJob<{node.pk}> successful')
        node.set_scheduler_state(JobState.DONE)
        return result
async def task_upload_job(process: 'CalcJob', transport_queue: TransportQueue, cancellable: InterruptableFuture):
    """Transport task that will attempt to upload the files of a job calculation to the remote.

    The task will first request a transport from the queue. Once the transport is yielded, the relevant execmanager
    function is called, wrapped in the exponential_backoff_retry coroutine, which, in case of a caught exception, will
    retry after an interval that increases exponentially with the number of retries, for a maximum number of retries.
    If all retries fail, the task will raise a TransportTaskException

    :param process: the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    node = process.node

    if node.get_state() == CalcJobState.SUBMITTING:
        logger.warning(f'CalcJob<{node.pk}> already marked as SUBMITTING, skipping task_upload_job')
        return

    initial_interval = get_config_option(RETRY_INTERVAL_OPTION)
    max_attempts = get_config_option(MAX_ATTEMPTS_OPTION)

    authinfo = node.get_authinfo()

    async def do_upload():
        with transport_queue.request_transport(authinfo) as request:
            transport = await cancellable.with_interrupt(request)

            with SandboxFolder() as folder:
                # Any exception thrown in `presubmit` call is not transient so we circumvent the exponential backoff
                try:
                    calc_info = process.presubmit(folder)
                except Exception as exception:  # pylint: disable=broad-except
                    raise PreSubmitException('exception occurred in presubmit call') from exception
                else:
                    execmanager.upload_calculation(node, transport, calc_info, folder)
                    skip_submit = calc_info.skip_submit or False

            return skip_submit

    try:
        logger.info(f'scheduled request to upload CalcJob<{node.pk}>')
        ignore_exceptions = (plumpy.futures.CancelledError, PreSubmitException, plumpy.process_states.Interruption)
        skip_submit = await exponential_backoff_retry(
            do_upload, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions
        )
    except PreSubmitException:
        raise
    except (plumpy.futures.CancelledError, plumpy.process_states.Interruption):
        raise
    except Exception as exception:
        logger.warning(f'uploading CalcJob<{node.pk}> failed')
        raise TransportTaskException(f'upload_calculation failed {max_attempts} times consecutively') from exception
    else:
        logger.info(f'uploading CalcJob<{node.pk}> successful')
        node.set_state(CalcJobState.SUBMITTING)
        return skip_submit
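# --- Illustrative usage, not part of the engine -----------------------------------------------------------------------
# `task_upload_job` treats `PreSubmitException` as non-transient: by listing it in `ignore_exceptions`, the backoff
# wrapper re-raises it on the first failure instead of retrying. With the hypothetical helper sketched further above,
# the call would read roughly as:
#
#     skip_submit = await _example_backoff_retry(do_upload, ignore_exceptions=(PreSubmitException,))
#
# so only genuinely transient errors (e.g. a transport hiccup) consume retry attempts.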
async def task_retrieve_job(
    node: CalcJobNode, transport_queue: TransportQueue, retrieved_temporary_folder: str,
    cancellable: InterruptableFuture
):
    """Transport task that will attempt to retrieve all files of a completed job calculation.

    The task will first request a transport from the queue. Once the transport is yielded, the relevant execmanager
    function is called, wrapped in the exponential_backoff_retry coroutine, which, in case of a caught exception, will
    retry after an interval that increases exponentially with the number of retries, for a maximum number of retries.
    If all retries fail, the task will raise a TransportTaskException

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param retrieved_temporary_folder: the absolute path to a directory to store files
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    if node.get_state() == CalcJobState.PARSING:
        logger.warning(f'CalcJob<{node.pk}> already marked as PARSING, skipping task_retrieve_job')
        return

    initial_interval = get_config_option(RETRY_INTERVAL_OPTION)
    max_attempts = get_config_option(MAX_ATTEMPTS_OPTION)

    authinfo = node.get_authinfo()

    async def do_retrieve():
        with transport_queue.request_transport(authinfo) as request:
            transport = await cancellable.with_interrupt(request)

            # Perform the job accounting and set it on the node if successful. If the scheduler does not implement this
            # still set the attribute but set it to `None`. This way we can distinguish calculation jobs for which the
            # accounting was called but could not be set.
            scheduler = node.computer.get_scheduler()  # type: ignore[union-attr]
            scheduler.set_transport(transport)

            try:
                detailed_job_info = scheduler.get_detailed_job_info(node.get_job_id())
            except FeatureNotAvailable:
                logger.info(f'detailed job info not available for scheduler of CalcJob<{node.pk}>')
                node.set_detailed_job_info(None)
            else:
                node.set_detailed_job_info(detailed_job_info)

            return execmanager.retrieve_calculation(node, transport, retrieved_temporary_folder)

    try:
        logger.info(f'scheduled request to retrieve CalcJob<{node.pk}>')
        ignore_exceptions = (plumpy.futures.CancelledError, plumpy.process_states.Interruption)
        result = await exponential_backoff_retry(
            do_retrieve, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions
        )
    except (plumpy.futures.CancelledError, plumpy.process_states.Interruption):  # pylint: disable=try-except-raise
        raise
    except Exception as exception:
        logger.warning(f'retrieving CalcJob<{node.pk}> failed')
        raise TransportTaskException(f'retrieve_calculation failed {max_attempts} times consecutively') from exception
    else:
        node.set_state(CalcJobState.PARSING)
        logger.info(f'retrieving CalcJob<{node.pk}> successful')
        return result
async def task_update_job(node: CalcJobNode, job_manager, cancellable: InterruptableFuture):
    """Transport task that will attempt to update the scheduler status of the job calculation.

    The task will first request a transport from the queue. Once the transport is yielded, the relevant execmanager
    function is called, wrapped in the exponential_backoff_retry coroutine, which, in case of a caught exception, will
    retry after an interval that increases exponentially with the number of retries, for a maximum number of retries.
    If all retries fail, the task will raise a TransportTaskException

    :param node: the node that represents the job calculation
    :type node: :class:`aiida.orm.nodes.process.calculation.calcjob.CalcJobNode`
    :param job_manager: The job manager
    :type job_manager: :class:`aiida.engine.processes.calcjobs.manager.JobManager`
    :param cancellable: A cancel flag
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :return: True if the task was successfully completed, False otherwise
    """
    state = node.get_state()

    if state in [CalcJobState.RETRIEVING, CalcJobState.STASHING]:
        logger.warning(f'CalcJob<{node.pk}> already marked as `{state}`, skipping task_update_job')
        return True

    initial_interval = get_config_option(RETRY_INTERVAL_OPTION)
    max_attempts = get_config_option(MAX_ATTEMPTS_OPTION)

    authinfo = node.get_authinfo()
    job_id = node.get_job_id()

    async def do_update():
        # Get the update request
        with job_manager.request_job_info_update(authinfo, job_id) as update_request:
            job_info = await cancellable.with_interrupt(update_request)

        if job_info is None:
            # If the job is computed or not found assume it's done
            node.set_scheduler_state(JobState.DONE)
            job_done = True
        else:
            node.set_last_job_info(job_info)
            node.set_scheduler_state(job_info.job_state)
            job_done = job_info.job_state == JobState.DONE

        return job_done

    try:
        logger.info(f'scheduled request to update CalcJob<{node.pk}>')
        ignore_exceptions = (plumpy.futures.CancelledError, plumpy.process_states.Interruption)
        job_done = await exponential_backoff_retry(
            do_update, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions
        )
    except (plumpy.futures.CancelledError, plumpy.process_states.Interruption):  # pylint: disable=try-except-raise
        raise
    except Exception as exception:
        logger.warning(f'updating CalcJob<{node.pk}> failed')
        raise TransportTaskException(f'update_calculation failed {max_attempts} times consecutively') from exception
    else:
        logger.info(f'updating CalcJob<{node.pk}> successful')

        if job_done:
            node.set_state(CalcJobState.STASHING)

        return job_done
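# --- Illustrative sketch, not part of the engine ----------------------------------------------------------------------
# Every task above awaits `cancellable.with_interrupt(request)` so that a potentially long wait for a transport or a
# scheduler update can still be aborted by an external interrupt (e.g. a kill request). A minimal asyncio
# approximation of that pattern, with hypothetical names, races the pending work against an interrupt future; the real
# `InterruptableFuture.with_interrupt` in `aiida.engine.utils` raises the specific interruption exception rather than
# the generic error used here.
async def _example_with_interrupt(work: 'asyncio.Future', interrupt: 'asyncio.Future'):
    """Wait for ``work`` unless ``interrupt`` completes first, in which case the wait is aborted."""
    done, _ = await asyncio.wait({work, interrupt}, return_when=asyncio.FIRST_COMPLETED)

    if interrupt in done and work not in done:
        # The interrupt won the race: cancel the pending work and abort, so the caller can react immediately.
        work.cancel()
        raise RuntimeError('waiting for the transport task was interrupted')

    return work.result()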