def task_retrieve_job(node, transport_queue, retrieved_temporary_folder, cancellable):
    """Transport task that attempts to retrieve all files of a completed job calculation.

    A transport is requested from the queue and, once it is yielded, the detailed job info is stored on the node
    and `execmanager.retrieve_calculation` is called. The whole operation is wrapped in `exponential_backoff_retry`
    so that an excepting attempt is retried up to a maximum number of attempts. If the node is already in the
    `PARSING` state the task is skipped.

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param retrieved_temporary_folder: passed through unchanged to `execmanager.retrieve_calculation`
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return if the task was successfully completed or skipped
    :raises: plumpy.Interruption if the task was interrupted while waiting
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    if node.get_state() == CalcJobState.PARSING:
        skip_message = 'CalcJob<{}> already marked as PARSING, skipping task_retrieve_job'.format(node.pk)
        logger.warning(skip_message)
        raise Return

    first_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    attempt_limit = TRANSPORT_TASK_MAXIMUM_ATTEMTPS
    auth = node.computer.get_authinfo(node.user)

    @coroutine
    def do_retrieve():
        with transport_queue.request_transport(auth) as request:
            transport = yield cancellable.with_interrupt(request)

            # Store the scheduler's job accounting on the node. When the scheduler does not implement the feature
            # the attribute is still set, but to `None`, which marks that the accounting was requested and was
            # simply not available.
            scheduler = node.computer.get_scheduler()
            scheduler.set_transport(transport)

            try:
                accounting = scheduler.get_detailed_job_info(node.get_job_id())
            except FeatureNotAvailable:
                logger.info('detailed job info not available for scheduler of CalcJob<{}>'.format(node.pk))
                node.set_detailed_job_info(None)
            else:
                node.set_detailed_job_info(accounting)

            retrieved = execmanager.retrieve_calculation(node, transport, retrieved_temporary_folder)
            raise Return(retrieved)

    try:
        logger.info('scheduled request to retrieve CalcJob<{}>'.format(node.pk))
        yield exponential_backoff_retry(
            do_retrieve,
            initial_interval=first_interval,
            max_attempts=attempt_limit,
            logger=node.logger,
            ignore_exceptions=plumpy.Interruption,
        )
    except plumpy.Interruption:
        # Interruptions are not failures of the transport task: hand them to the caller untouched.
        raise
    except Exception:
        logger.warning('retrieving CalcJob<{}> failed'.format(node.pk))
        failure = 'retrieve_calculation failed {} times consecutively'.format(attempt_limit)
        raise TransportTaskException(failure)
    else:
        node.set_state(CalcJobState.PARSING)
        logger.info('retrieving CalcJob<{}> successful'.format(node.pk))
        raise Return
def task_submit_job(node, transport_queue, calc_info, script_filename, cancellable):
    """Transport task that will attempt to submit a job calculation.

    The task will first request a transport from the queue. Once the transport is yielded, the relevant execmanager
    function is called, wrapped in the exponential_backoff_retry coroutine, which, in case of a caught exception, will
    retry after an interval that increases exponentially with the number of retries, for a maximum number of retries.
    If all retries fail, the task will raise a TransportTaskException

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param calc_info: the calculation info datastructure returned by `CalcJobNode._presubmit`
    :param script_filename: the job launch script returned by `CalcJobNode._presubmit`
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return with the result of `execmanager.submit_calculation` if the task was successfully completed, or
        with the stored job id if the node was already marked as WITHSCHEDULER
    :raises: plumpy.Interruption if the task was interrupted while waiting
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    if node.get_state() == CalcJobState.WITHSCHEDULER:
        assert node.get_job_id() is not None, 'job is WITHSCHEDULER, however, it does not have a job id'
        logger.warning('CalcJob<{}> already marked as WITHSCHEDULER, skipping task_submit_job'.format(node.pk))
        raise Return(node.get_job_id())

    initial_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    max_attempts = TRANSPORT_TASK_MAXIMUM_ATTEMTPS

    authinfo = node.computer.get_authinfo(node.user)

    @coroutine
    def do_submit():
        with transport_queue.request_transport(authinfo) as request:
            transport = yield cancellable.with_interrupt(request)
            raise Return(execmanager.submit_calculation(node, transport, calc_info, script_filename))

    try:
        logger.info('scheduled request to submit CalcJob<{}>'.format(node.pk))
        result = yield exponential_backoff_retry(
            do_submit, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=plumpy.Interruption)
    except plumpy.Interruption:
        # Propagate the interruption, as the other transport tasks do. Swallowing it here made the task finish
        # silently with `None`, hiding from the caller that the submission never completed.
        raise
    except Exception as exception:
        logger.warning('submitting CalcJob<{}> failed'.format(node.pk))
        # Chain the last underlying error so that the root cause stays visible in the traceback.
        raise TransportTaskException(
            'submit_calculation failed {} times consecutively'.format(max_attempts)) from exception
    else:
        logger.info('submitting CalcJob<{}> successful'.format(node.pk))
        node.set_state(CalcJobState.WITHSCHEDULER)
        raise Return(result)
def task_update_job(node, job_manager, cancellable):
    """Transport task that attempts to update the scheduler status of the job calculation.

    A job info update is requested from the job manager and, once it is yielded, the scheduler state (and the last
    job info, when available) is stored on the node. The operation is wrapped in `exponential_backoff_retry` so that
    an excepting attempt is retried up to a maximum number of attempts. If the node is already in the `RETRIEVING`
    state the task is skipped.

    :param node: the node that represents the job calculation
    :type node: :class:`aiida.orm.nodes.process.calculation.calcjob.CalcJobNode`
    :param job_manager: The job manager
    :type job_manager: :class:`aiida.engine.processes.calcjobs.manager.JobManager`
    :param cancellable: A cancel flag
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return containing True if the job is done (or the task was skipped), False otherwise
    :raises: plumpy.Interruption if the task was interrupted while waiting
    :raises: TransportTaskException if after the maximum number of retries the update still excepted
    """
    if node.get_state() == CalcJobState.RETRIEVING:
        skip_message = 'CalcJob<{}> already marked as RETRIEVING, skipping task_update_job'.format(node.pk)
        logger.warning(skip_message)
        raise Return(True)

    first_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    attempt_limit = TRANSPORT_TASK_MAXIMUM_ATTEMTPS
    auth = node.computer.get_authinfo(node.user)
    scheduler_job_id = node.get_job_id()

    @coroutine
    def do_update():
        # Wait for the job manager to deliver the job info for this job
        with job_manager.request_job_info_update(auth, scheduler_job_id) as update_request:
            info = yield cancellable.with_interrupt(update_request)

        if info is None:
            # No job info was returned: the job is assumed to be done
            node.set_scheduler_state(JobState.DONE)
            raise Return(True)

        node.set_last_job_info(info)
        node.set_scheduler_state(info.job_state)
        raise Return(info.job_state == JobState.DONE)

    try:
        logger.info('scheduled request to update CalcJob<{}>'.format(node.pk))
        finished = yield exponential_backoff_retry(
            do_update,
            initial_interval=first_interval,
            max_attempts=attempt_limit,
            logger=node.logger,
            ignore_exceptions=plumpy.Interruption,
        )
    except plumpy.Interruption:
        # Interruptions are not failures of the transport task: hand them to the caller untouched.
        raise
    except Exception:
        logger.warning('updating CalcJob<{}> failed'.format(node.pk))
        failure = 'update_calculation failed {} times consecutively'.format(attempt_limit)
        raise TransportTaskException(failure)
    else:
        logger.info('updating CalcJob<{}> successful'.format(node.pk))
        if finished:
            node.set_state(CalcJobState.RETRIEVING)
        raise Return(finished)
def task_kill_job(node, transport_queue, cancellable):
    """Transport task that attempts to kill a job calculation.

    If the node is still in the `UPLOADING` or `SUBMITTING` state the task returns immediately. Otherwise a transport
    is requested from the queue and, once it is yielded, `execmanager.kill_calculation` is called. The operation is
    wrapped in `exponential_backoff_retry` so that an excepting attempt is retried up to a maximum number of attempts.

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return with True if the job was in the UPLOADING or SUBMITTING state, otherwise with the result of
        `execmanager.kill_calculation`
    :raises: plumpy.Interruption if the task was interrupted
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    first_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    attempt_limit = TRANSPORT_TASK_MAXIMUM_ATTEMTPS

    if node.get_state() in (CalcJobState.UPLOADING, CalcJobState.SUBMITTING):
        early_message = 'CalcJob<{}> killed, it was in the {} state'.format(node.pk, node.get_state())
        logger.warning(early_message)
        raise Return(True)

    auth = node.computer.get_authinfo(node.user)

    @coroutine
    def do_kill():
        with transport_queue.request_transport(auth) as request:
            transport = yield cancellable.with_interrupt(request)
            killed = execmanager.kill_calculation(node, transport)
            raise Return(killed)

    try:
        logger.info('scheduled request to kill CalcJob<{}>'.format(node.pk))
        # NOTE(review): unlike the other transport tasks in this file, no `ignore_exceptions` is passed here, so an
        # interruption goes through the retry loop first - confirm this is intended.
        outcome = yield exponential_backoff_retry(
            do_kill,
            initial_interval=first_interval,
            max_attempts=attempt_limit,
            logger=node.logger,
        )
    except plumpy.Interruption:
        raise
    except Exception:
        logger.warning('killing CalcJob<{}> failed'.format(node.pk))
        failure = 'kill_calculation failed {} times consecutively'.format(attempt_limit)
        raise TransportTaskException(failure)
    else:
        logger.info('killing CalcJob<{}> successful'.format(node.pk))
        node.set_scheduler_state(JobState.DONE)
        raise Return(outcome)
def test_exp_backoff_success():
    """Check that exponential backoff absorbs exceptions while `max_attempts` is not exceeded."""
    global ITERATION
    ITERATION = 0
    event_loop = asyncio.get_event_loop()

    async def coro():
        """Raise RuntimeError for as long as ITERATION stays below MAX_ITERATIONS."""
        global ITERATION
        ITERATION += 1
        if ITERATION < MAX_ITERATIONS:
            raise RuntimeError

    # One attempt more than strictly needed, so that the final call is guaranteed to succeed.
    allowed_attempts = MAX_ITERATIONS + 1
    event_loop.run_until_complete(
        exponential_backoff_retry(coro, initial_interval=0.1, max_attempts=allowed_attempts)
    )
def test_exp_backoff_max_attempts_exceeded(self):
    """Check that exponential backoff finally raises once `max_attempts` is exceeded."""
    global ITERATION
    ITERATION = 0
    event_loop = asyncio.get_event_loop()

    def coro():
        """Raise RuntimeError for as long as ITERATION stays below MAX_ITERATIONS."""
        global ITERATION
        ITERATION += 1
        if ITERATION < MAX_ITERATIONS:
            raise RuntimeError

    # One attempt too few: every allowed call still raises, so the error has to surface.
    allowed_attempts = MAX_ITERATIONS - 1
    with self.assertRaises(RuntimeError):
        event_loop.run_until_complete(
            exponential_backoff_retry(coro, initial_interval=0.1, max_attempts=allowed_attempts)
        )
def test_exponential_backoff_success(self):
    """Test that exponential backoff will successfully catch exceptions as long as max_attempts is not exceeded."""
    # The counter lives at module level and is incremented by `coro` through its own `global` declaration. Without
    # declaring it global here too, the reset below would only bind a local and leave the shared counter at
    # whatever value an earlier test left behind.
    global ITERATION
    ITERATION = 0
    loop = IOLoop()

    @coroutine
    def coro():
        """A function that will raise RuntimeError as long as ITERATION is smaller than MAX_ITERATIONS."""
        global ITERATION
        ITERATION += 1
        if ITERATION < MAX_ITERATIONS:
            raise RuntimeError

    max_attempts = MAX_ITERATIONS + 1
    loop.run_sync(lambda: exponential_backoff_retry(coro, initial_interval=0.1, max_attempts=max_attempts))
def task_upload_job(process, transport_queue, cancellable):
    """Transport task that will attempt to upload the files of a job calculation to the remote.

    The task will first request a transport from the queue. Once the transport is yielded, the relevant execmanager
    function is called, wrapped in the exponential_backoff_retry coroutine, which, in case of a caught exception, will
    retry after an interval that increases exponentially with the number of retries, for a maximum number of retries.
    If all retries fail, the task will raise a TransportTaskException

    :param process: the job calculation process; its `node` attribute is the node that represents the job calculation
        and its `presubmit` method is called with a sandbox folder to prepare the upload
    :param transport_queue: the TransportQueue from which to request a Transport
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return with the tuple `(calc_info, script_filename)` if the task was successfully completed, or with
        True if the node was already marked as SUBMITTING
    :raises: PreSubmitException if the `presubmit` call excepted
    :raises: plumpy.CancelledError or plumpy.Interruption if the task was cancelled or interrupted while waiting
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    node = process.node

    if node.get_state() == CalcJobState.SUBMITTING:
        logger.warning('CalcJob<{}> already marked as SUBMITTING, skipping task_upload_job'.format(node.pk))
        raise Return(True)

    initial_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    max_attempts = TRANSPORT_TASK_MAXIMUM_ATTEMTPS

    authinfo = node.computer.get_authinfo(node.user)

    @coroutine
    def do_upload():
        with transport_queue.request_transport(authinfo) as request:
            transport = yield cancellable.with_interrupt(request)

            with SandboxFolder() as folder:
                # Any exception thrown in `presubmit` call is not transient so we circumvent the exponential backoff
                try:
                    calc_info, script_filename = process.presubmit(folder)
                except Exception as exception:  # pylint: disable=broad-except
                    raise PreSubmitException('exception occurred in presubmit call') from exception
                else:
                    execmanager.upload_calculation(node, transport, calc_info, folder)

            raise Return((calc_info, script_filename))

    try:
        logger.info('scheduled request to upload CalcJob<{}>'.format(node.pk))
        # Cancellation, interruption and presubmit failures are not transient transport problems: retrying them is
        # pointless, so they bypass the backoff loop and are handed to the caller unchanged.
        ignore_exceptions = (plumpy.CancelledError, plumpy.Interruption, PreSubmitException)
        result = yield exponential_backoff_retry(
            do_upload, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions)
    except (PreSubmitException, plumpy.CancelledError, plumpy.Interruption):
        # Swallowing the cancellation made the task finish silently with `None` instead of the documented tuple.
        raise
    except Exception as exception:
        logger.warning('uploading CalcJob<{}> failed'.format(node.pk))
        # Chain the last underlying error so that the root cause stays visible in the traceback.
        raise TransportTaskException(
            'upload_calculation failed {} times consecutively'.format(max_attempts)) from exception
    else:
        logger.info('uploading CalcJob<{}> successful'.format(node.pk))
        node.set_state(CalcJobState.SUBMITTING)
        raise Return(result)