Example #1
def task_retrieve_job(node, transport_queue, retrieved_temporary_folder, cancellable):
    """Transport task that will attempt to retrieve all files of a completed job calculation.

    The task will first request a transport from the queue. Once the transport is yielded, the relevant execmanager
    function is called, wrapped in the exponential_backoff_retry coroutine. If an exception is caught, the call is
    retried after an interval that increases exponentially with the number of retries, up to a maximum number of
    retries. If all retries fail, the task will raise a TransportTaskException.

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param retrieved_temporary_folder: the absolute path to a directory in which retrieved files are stored temporarily
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return if the task was successfully completed
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    if node.get_state() == CalcJobState.PARSING:
        logger.warning('CalcJob<{}> already marked as PARSING, skipping task_retrieve_job'.format(node.pk))
        raise Return

    initial_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    max_attempts = TRANSPORT_TASK_MAXIMUM_ATTEMTPS

    authinfo = node.computer.get_authinfo(node.user)

    @coroutine
    def do_retrieve():
        with transport_queue.request_transport(authinfo) as request:
            transport = yield cancellable.with_interrupt(request)

            # Perform the job accounting and set it on the node if successful. If the scheduler does not implement this
            # still set the attribute but set it to `None`. This way we can distinguish calculation jobs for which the
            # accounting was called but could not be set.
            scheduler = node.computer.get_scheduler()
            scheduler.set_transport(transport)

            try:
                detailed_job_info = scheduler.get_detailed_job_info(node.get_job_id())
            except FeatureNotAvailable:
                logger.info('detailed job info not available for scheduler of CalcJob<{}>'.format(node.pk))
                node.set_detailed_job_info(None)
            else:
                node.set_detailed_job_info(detailed_job_info)

            raise Return(execmanager.retrieve_calculation(node, transport, retrieved_temporary_folder))

    try:
        logger.info('scheduled request to retrieve CalcJob<{}>'.format(node.pk))
        result = yield exponential_backoff_retry(
            do_retrieve, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=plumpy.Interruption)
    except plumpy.Interruption:
        raise
    except Exception:
        logger.warning('retrieving CalcJob<{}> failed'.format(node.pk))
        raise TransportTaskException('retrieve_calculation failed {} times consecutively'.format(max_attempts))
    else:
        node.set_state(CalcJobState.PARSING)
        logger.info('retrieving CalcJob<{}> successful'.format(node.pk))
        raise Return
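
This and the other transport tasks on this page delegate their retry behaviour to exponential_backoff_retry, whose implementation is not shown. Below is a minimal, self-contained sketch of the same pattern, assuming a doubling interval and the call shape visible above (a callable, initial_interval, max_attempts, plus logger and ignore_exceptions keyword arguments). It is an illustration written for this page, not AiiDA's actual exponential_backoff_retry; note also that the tasks above use the tornado-style @coroutine/Return variant of this pattern, while the sketch uses plain asyncio, like the tests further down the page.

import asyncio
import logging


async def backoff_retry_sketch(fct, initial_interval, max_attempts, logger=None, ignore_exceptions=None):
    """Await ``fct()`` until it succeeds, waiting exponentially longer after each failed attempt."""
    logger = logger or logging.getLogger(__name__)
    interval = initial_interval

    for attempt in range(max_attempts):
        try:
            result = await fct()
        except Exception as exception:
            # Exceptions listed in ``ignore_exceptions`` are not retried; they propagate immediately,
            # mirroring how the tasks above let plumpy.Interruption pass through untouched.
            if ignore_exceptions is not None and isinstance(exception, ignore_exceptions):
                raise

            if attempt == max_attempts - 1:
                logger.warning('maximum number of attempts %d reached, giving up', max_attempts)
                raise

            logger.warning('attempt %d failed, retrying in %.1f seconds', attempt + 1, interval)
            await asyncio.sleep(interval)
            interval *= 2  # assumed growth factor; the docstrings only say the interval "increases exponentially"
        else:
            return result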
Example #2
def task_submit_job(node, transport_queue, calc_info, script_filename,
                    cancellable):
    """Transport task that will attempt to submit a job calculation.

    The task will first request a transport from the queue. Once the transport is yielded, the relevant execmanager
    function is called, wrapped in the exponential_backoff_retry coroutine. If an exception is caught, the call is
    retried after an interval that increases exponentially with the number of retries, up to a maximum number of
    retries. If all retries fail, the task will raise a TransportTaskException.

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param calc_info: the calculation info datastructure returned by `CalcJobNode._presubmit`
    :param script_filename: the job launch script returned by `CalcJobNode._presubmit`
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return if the task was successfully completed
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    if node.get_state() == CalcJobState.WITHSCHEDULER:
        assert node.get_job_id() is not None, 'job is WITHSCHEDULER, however, it does not have a job id'
        logger.warning(
            'CalcJob<{}> already marked as WITHSCHEDULER, skipping task_submit_job'
            .format(node.pk))
        raise Return(node.get_job_id())

    initial_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    max_attempts = TRANSPORT_TASK_MAXIMUM_ATTEMTPS

    authinfo = node.computer.get_authinfo(node.user)

    @coroutine
    def do_submit():
        with transport_queue.request_transport(authinfo) as request:
            transport = yield cancellable.with_interrupt(request)
            raise Return(
                execmanager.submit_calculation(node, transport, calc_info,
                                               script_filename))

    try:
        logger.info('scheduled request to submit CalcJob<{}>'.format(node.pk))
        result = yield exponential_backoff_retry(
            do_submit,
            initial_interval,
            max_attempts,
            logger=node.logger,
            ignore_exceptions=plumpy.Interruption)
    except plumpy.Interruption:
        pass
    except Exception:
        logger.warning('submitting CalcJob<{}> failed'.format(node.pk))
        raise TransportTaskException(
            'submit_calculation failed {} times consecutively'.format(
                max_attempts))
    else:
        logger.info('submitting CalcJob<{}> successful'.format(node.pk))
        node.set_state(CalcJobState.WITHSCHEDULER)
        raise Return(result)
Example #3
def task_update_job(node, job_manager, cancellable):
    """Transport task that will attempt to update the scheduler status of the job calculation.

    The task will first request a job info update from the job manager. The request is wrapped in the
    exponential_backoff_retry coroutine. If an exception is caught, the call is retried after an interval that
    increases exponentially with the number of retries, up to a maximum number of retries. If all retries fail, the
    task will raise a TransportTaskException.

    :param node: the node that represents the job calculation
    :type node: :class:`aiida.orm.nodes.process.calculation.calcjob.CalcJobNode`
    :param job_manager: The job manager
    :type job_manager: :class:`aiida.engine.processes.calcjobs.manager.JobManager`
    :param cancellable: A cancel flag
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return containing True if the task was successfully completed, False otherwise
    """
    if node.get_state() == CalcJobState.RETRIEVING:
        logger.warning('CalcJob<{}> already marked as RETRIEVING, skipping task_update_job'.format(node.pk))
        raise Return(True)

    initial_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    max_attempts = TRANSPORT_TASK_MAXIMUM_ATTEMTPS

    authinfo = node.computer.get_authinfo(node.user)
    job_id = node.get_job_id()

    @coroutine
    def do_update():
        # Get the update request
        with job_manager.request_job_info_update(authinfo, job_id) as update_request:
            job_info = yield cancellable.with_interrupt(update_request)

        if job_info is None:
            # If the job has completed or cannot be found, assume it is done
            node.set_scheduler_state(JobState.DONE)
            job_done = True
        else:
            node.set_last_job_info(job_info)
            node.set_scheduler_state(job_info.job_state)
            job_done = job_info.job_state == JobState.DONE

        raise Return(job_done)

    try:
        logger.info('scheduled request to update CalcJob<{}>'.format(node.pk))
        job_done = yield exponential_backoff_retry(
            do_update, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=plumpy.Interruption)
    except plumpy.Interruption:
        raise
    except Exception:
        logger.warning('updating CalcJob<{}> failed'.format(node.pk))
        raise TransportTaskException('update_calculation failed {} times consecutively'.format(max_attempts))
    else:
        logger.info('updating CalcJob<{}> successful'.format(node.pk))
        if job_done:
            node.set_state(CalcJobState.RETRIEVING)

        raise Return(job_done)
Example #4
def task_kill_job(node, transport_queue, cancellable):
    """
    Transport task that will attempt to kill a job calculation

    The task will first request a transport from the queue. Once the transport is yielded, the relevant execmanager
    function is called, wrapped in the exponential_backoff_retry coroutine. If an exception is caught, the call is
    retried after an interval that increases exponentially with the number of retries, up to a maximum number of
    retries. If all retries fail, the task will raise a TransportTaskException.

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return if the task was successfully completed
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    initial_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    max_attempts = TRANSPORT_TASK_MAXIMUM_ATTEMTPS

    if node.get_state() in [CalcJobState.UPLOADING, CalcJobState.SUBMITTING]:
        logger.warning('CalcJob<{}> killed, it was in the {} state'.format(
            node.pk, node.get_state()))
        raise Return(True)

    authinfo = node.computer.get_authinfo(node.user)

    @coroutine
    def do_kill():
        with transport_queue.request_transport(authinfo) as request:
            transport = yield cancellable.with_interrupt(request)
            raise Return(execmanager.kill_calculation(node, transport))

    try:
        logger.info('scheduled request to kill CalcJob<{}>'.format(node.pk))
        result = yield exponential_backoff_retry(do_kill,
                                                 initial_interval,
                                                 max_attempts,
                                                 logger=node.logger)
    except plumpy.Interruption:
        raise
    except Exception:
        logger.warning('killing CalcJob<{}> failed'.format(node.pk))
        raise TransportTaskException(
            'kill_calculation failed {} times consecutively'.format(
                max_attempts))
    else:
        logger.info('killing CalcJob<{}> successful'.format(node.pk))
        node.set_scheduler_state(JobState.DONE)
        raise Return(result)
Example #5
    def test_exp_backoff_success():
        """Test that exponential backoff will successfully catch exceptions as long as max_attempts is not exceeded."""
        global ITERATION
        ITERATION = 0
        loop = asyncio.get_event_loop()

        async def coro():
            """A function that will raise RuntimeError as long as ITERATION is smaller than MAX_ITERATIONS."""
            global ITERATION
            ITERATION += 1
            if ITERATION < MAX_ITERATIONS:
                raise RuntimeError

        max_attempts = MAX_ITERATIONS + 1
        loop.run_until_complete(exponential_backoff_retry(coro, initial_interval=0.1, max_attempts=max_attempts))
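
For concreteness, with initial_interval=0.1 the waits between attempts would grow as 0.1, 0.2, 0.4, 0.8 seconds under the doubling assumption from the sketch above (hypothetical numbers; the tests only fix the initial interval, not the growth factor):

# Hypothetical illustration of how the retry intervals could grow, assuming doubling:
initial_interval = 0.1
max_attempts = 5
intervals = [initial_interval * 2 ** attempt for attempt in range(max_attempts - 1)]
print(intervals)  # [0.1, 0.2, 0.4, 0.8]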
Example #6
    def test_exp_backoff_max_attempts_exceeded(self):
        """Test that exponential backoff will finally raise if max_attempts is exceeded"""
        global ITERATION
        ITERATION = 0
        loop = asyncio.get_event_loop()

        async def coro():
            """A function that will raise RuntimeError as long as ITERATION is smaller than MAX_ITERATIONS."""
            global ITERATION
            ITERATION += 1
            if ITERATION < MAX_ITERATIONS:
                raise RuntimeError

        max_attempts = MAX_ITERATIONS - 1
        with self.assertRaises(RuntimeError):
            loop.run_until_complete(exponential_backoff_retry(coro, initial_interval=0.1, max_attempts=max_attempts))
Example #7
    def test_exponential_backoff_success(self):
        """Test that exponential backoff will successfully catch exceptions as long as max_attempts is not exceeded."""
        global ITERATION
        ITERATION = 0
        loop = IOLoop()

        @coroutine
        def coro():
            """A function that will raise RuntimeError as long as ITERATION is smaller than MAX_ITERATIONS."""
            global ITERATION
            ITERATION += 1
            if ITERATION < MAX_ITERATIONS:
                raise RuntimeError

        max_attempts = MAX_ITERATIONS + 1
        loop.run_sync(lambda: exponential_backoff_retry(
            coro, initial_interval=0.1, max_attempts=max_attempts))
Example #8
def task_upload_job(process, transport_queue, cancellable):
    """Transport task that will attempt to upload the files of a job calculation to the remote.

    The task will first request a transport from the queue. Once the transport is yielded, the relevant execmanager
    function is called, wrapped in the exponential_backoff_retry coroutine. If an exception is caught, the call is
    retried after an interval that increases exponentially with the number of retries, up to a maximum number of
    retries. If all retries fail, the task will raise a TransportTaskException.

    :param process: the process instance whose node represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return if the task was successfully completed
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    node = process.node

    if node.get_state() == CalcJobState.SUBMITTING:
        logger.warning(
            'CalcJob<{}> already marked as SUBMITTING, skipping task_upload_job'
            .format(node.pk))
        raise Return(True)

    initial_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    max_attempts = TRANSPORT_TASK_MAXIMUM_ATTEMTPS

    authinfo = node.computer.get_authinfo(node.user)

    @coroutine
    def do_upload():
        with transport_queue.request_transport(authinfo) as request:
            transport = yield cancellable.with_interrupt(request)

            with SandboxFolder() as folder:
                # Any exception thrown in `presubmit` call is not transient so we circumvent the exponential backoff
                try:
                    calc_info, script_filename = process.presubmit(folder)
                except Exception as exception:  # pylint: disable=broad-except
                    raise PreSubmitException(
                        'exception occurred in presubmit call') from exception
                else:
                    execmanager.upload_calculation(node, transport, calc_info,
                                                   folder)

            raise Return((calc_info, script_filename))

    try:
        logger.info('scheduled request to upload CalcJob<{}>'.format(node.pk))
        ignore_exceptions = (plumpy.CancelledError, PreSubmitException)
        result = yield exponential_backoff_retry(
            do_upload,
            initial_interval,
            max_attempts,
            logger=node.logger,
            ignore_exceptions=ignore_exceptions)
    except PreSubmitException:
        raise
    except plumpy.CancelledError:
        pass
    except Exception:
        logger.warning('uploading CalcJob<{}> failed'.format(node.pk))
        raise TransportTaskException(
            'upload_calculation failed {} times consecutively'.format(
                max_attempts))
    else:
        logger.info('uploading CalcJob<{}> successful'.format(node.pk))
        node.set_state(CalcJobState.SUBMITTING)
        raise Return(result)
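
Read together, the set_state calls in these examples trace the lifecycle of a calculation job; the process logic that chains the tasks is not shown on this page.

# State set on the node by each task on success, as read from the examples above:
#   task_upload_job    -> CalcJobState.SUBMITTING
#   task_submit_job    -> CalcJobState.WITHSCHEDULER
#   task_update_job    -> CalcJobState.RETRIEVING  (once the scheduler reports JobState.DONE)
#   task_retrieve_job  -> CalcJobState.PARSING
#   task_kill_job      -> sets the scheduler state to JobState.DONE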