예제 #1
0
def check_local_space(initial=True):
    """
    Check whether the current working directory has enough free disk space for the job.

    The required amount is read from the Pilot configuration: config.Pilot.free_space_limit
    for the initial check and config.Pilot.free_space_limit_running for checks done while the
    job is running (nominally 2 GB and 1 GB respectively; the effective values are whatever
    the configuration holds).

    :param initial: True selects the initial limit, False selects the running limit (optional Boolean)
    :return: tuple (pilot error code, diagnostics) - (0, "") when there is enough space,
             (errors.NOLOCALSPACE, message) otherwise
    """

    workdir = os.getcwd()
    logger.debug('checking local space on %s', workdir)

    # the disk space helper reports MB, so convert to bytes before comparing
    available = convert_mb_to_b(get_local_disk_space(workdir))

    # pick the configured limit first, then convert it to bytes once
    if initial:
        limit_setting = config.Pilot.free_space_limit
    else:
        limit_setting = config.Pilot.free_space_limit_running
    required = human2bytes(limit_setting)

    # failure path: available space must be strictly greater than the limit
    if available <= required:
        message = 'too little space left on local disk to run job: %d B (need > %d B)' % (available, required)
        failure_code = errors.NOLOCALSPACE
        logger.warning(message)
        return failure_code, message

    logger.info('sufficient remaining disk space (%d B)', available)
    return 0, ""
예제 #2
0
파일: data.py 프로젝트: ptrlv/pilot2
    def check_availablespace(self, files):
        """
        Verify that enough local space is available to stage in and run the job.

        The summed size of all input files is first compared against the maximum allowed
        input size (a zero/empty limit disables that check) and then against the free space
        in the current working directory.

        :param files: list of FileSpec objects (each must provide lfn, filesize and accessmode).
        :raise: SizeTooLarge if the total input size exceeds the allowed maximum,
                NoLocalSpace if it exceeds the locally available space
                (presumably both are PilotException subclasses - verify against their definitions).
        """

        for fspec in files:
            self.logger.debug('lfn=%s filesize=%d accessmode=%s',
                              fspec.lfn, fspec.filesize, fspec.accessmode)

        # NOTE(review): the limit is assumed to be given in MB since it is passed through convert_mb_to_b()
        maxinputsize = convert_mb_to_b(get_maximum_input_sizes())
        # built-in sum() avoids depending on reduce(), which is not a builtin on Python 3
        totalsize = sum(fspec.filesize for fspec in files)

        # verify total filesize
        if maxinputsize and totalsize > maxinputsize:
            error = "too many/too large input files (%s). total file size=%s B > maxinputsize=%s B" % \
                    (len(files), totalsize, maxinputsize)
            raise SizeTooLarge(error)

        self.logger.info(
            "total input file size=%s B within allowed limit=%s B (zero value means unlimited)",
            totalsize, maxinputsize)

        # get available space (the helper's value is converted from MB to bytes)
        available_space = convert_mb_to_b(get_local_disk_space(os.getcwd()))
        self.logger.info("locally available space: %d B", available_space)

        # are we within the limit?
        if totalsize > available_space:
            error = "not enough local space for staging input files and run the job (need %d B, but only have %d B)" % \
                    (totalsize, available_space)
            raise NoLocalSpace(error)
예제 #3
0
파일: monitoring.py 프로젝트: ptrlv/pilot2
def check_local_space():
    """
    Do we have enough local disk space left to run the job?

    The free space in the current working directory is compared against the limit
    configured in config.Pilot.free_space_limit.

    :return: tuple (pilot error code, diagnostics) - (0, "") if there is enough space,
             (errors.NOLOCALSPACE, message) if there is not
    """

    ec = 0
    diagnostics = ""

    # is there enough local space to run a job?
    cwd = os.getcwd()
    logger.debug('checking local space on %s', cwd)
    spaceleft = convert_mb_to_b(get_local_disk_space(cwd))  # B (diskspace is in MB)
    free_space_limit = human2bytes(config.Pilot.free_space_limit)

    # the remaining space must be strictly greater than the limit
    if spaceleft <= free_space_limit:
        diagnostics = 'too little space left on local disk to run job: %d B (need > %d B)' %\
                      (spaceleft, free_space_limit)
        ec = errors.NOLOCALSPACE
        logger.warning(diagnostics)
    else:
        logger.info('sufficient remaining disk space (%d B)', spaceleft)

    return ec, diagnostics
예제 #4
0
def interpret_payload_exit_info(job):
    """
    Translate the payload exit information into a Pilot error code.

    The checks run in a fixed order and only the first one that matches is acted upon:
    out of memory, missing installation, AtlasSetup failure, out of disk space,
    NFS/SQLite locking problem, missing user code. If none matches and the payload exit
    code is non-zero while the transform exit code is zero, a generic unknown payload
    failure is recorded. The result is stored in job.piloterrorcodes and
    job.piloterrordiags.

    :param job: job object.
    :return:
    """

    def _record(code):
        # store the outcome of add_error_code() on the job object
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(code, priority=True)

    if is_out_of_memory(job):
        # out of memory errors are identified from the stderr
        _record(errors.PAYLOADOUTOFMEMORY)
    elif is_installation_error(job):
        # specific errors in the stdout (tail)
        _record(errors.MISSINGINSTALLATION)
    elif is_atlassetup_error(job):
        # AtlasSetup failed
        _record(errors.SETUPFATAL)
    elif is_out_of_space(job):
        # the payload ran out of space
        _record(errors.NOLOCALSPACE)

        # double check local space; the value is only logged
        cwd_space_mb = get_local_disk_space(os.getcwd())
        spaceleft = convert_mb_to_b(cwd_space_mb)  # B (diskspace is in MB)
        logger.info('verifying local space: %d B' % spaceleft)
    elif is_nfssqlite_locking_problem(job):
        # specific errors in the stdout (full)
        _record(errors.NFSSQLITE)
    elif is_user_code_missing(job):
        # the user tarball is missing on the server
        _record(errors.MISSINGUSERCODE)
    elif job.transexitcode == 0 and job.exitcode != 0:
        # general Pilot error code when the payload error could not be identified
        _record(errors.UNKNOWNPAYLOADFAILURE)