def check_local_space(initial=True):
    """
    Check whether the current working directory has enough free disk space for the job.

    The required amount is read from the pilot configuration: config.Pilot.free_space_limit
    is used for the initial check and config.Pilot.free_space_limit_running for checks made
    while the job is running (historically documented as 2 GB and 1 GB respectively; the
    actual values are whatever the configuration holds).

    :param initial: True selects the initial limit, False the running limit (optional Boolean).
    :return: tuple (pilot error code, diagnostics string); (0, "") on success,
             (errors.NOLOCALSPACE, message) when the free space is at or below the limit.
    """

    workdir = os.getcwd()
    logger.debug('checking local space on %s', workdir)

    # the disk space helper reports MB; all comparisons below are done in bytes
    available = convert_mb_to_b(get_local_disk_space(workdir))

    if initial:
        configured_limit = config.Pilot.free_space_limit
    else:
        configured_limit = config.Pilot.free_space_limit_running
    required = human2bytes(configured_limit)

    if available <= required:
        message = 'too little space left on local disk to run job: %d B (need > %d B)' % \
            (available, required)
        exit_code = errors.NOLOCALSPACE
        logger.warning(message)
        return exit_code, message

    logger.info('sufficient remaining disk space (%d B)', available)
    return 0, ""
def check_availablespace(self, files):
    """
    Verify that enough local space is available to stage in and run the job.

    Two checks are made, both in bytes:
    1. the summed size of all input files must not exceed the maximum input size
       (a zero/empty maximum disables this check);
    2. the summed size must not exceed the free space in the current working directory.

    :param files: list of FileSpec objects (the lfn, filesize and accessmode attributes are read).
    :raise SizeTooLarge: if the total input size exceeds the maximum input size.
    :raise NoLocalSpace: if the total input size exceeds the locally available space.
                         (Both are presumably PilotException subclasses - confirm.)
    """

    for f in files:
        self.logger.debug('lfn=%s filesize=%d accessmode=%s' % (f.lfn, f.filesize, f.accessmode))

    maxinputsize = convert_mb_to_b(get_maximum_input_sizes())

    # built-in sum() replaces reduce(), which is not a builtin on Python 3
    totalsize = sum(f.filesize for f in files)

    # verify total filesize
    if maxinputsize and totalsize > maxinputsize:
        error = "too many/too large input files (%s). total file size=%s B > maxinputsize=%s B" % \
            (len(files), totalsize, maxinputsize)
        raise SizeTooLarge(error)

    self.logger.info(
        "total input file size=%s B within allowed limit=%s B (zero value means unlimited)" %
        (totalsize, maxinputsize))

    # get available space (the disk space helper reports MB, hence the conversion)
    available_space = convert_mb_to_b(get_local_disk_space(os.getcwd()))
    self.logger.info("locally available space: %d B" % available_space)

    # are we within the limit?
    if totalsize > available_space:
        error = "not enough local space for staging input files and run the job (need %d B, but only have %d B)" % \
            (totalsize, available_space)
        raise NoLocalSpace(error)
def check_local_space():
    """
    Do we have enough local disk space left to run the job?

    The free space in the current working directory is compared, in bytes, with the
    limit configured in config.Pilot.free_space_limit.

    :return: tuple (pilot error code, diagnostics string); (0, "") on success,
             (errors.NOLOCALSPACE, message) when the free space is at or below the limit.
    """

    ec = 0
    diagnostics = ""

    # is there enough local space to run a job?
    cwd = os.getcwd()
    # logger arguments are passed lazily so formatting only happens if the record is emitted
    logger.debug('checking local space on %s', cwd)
    spaceleft = convert_mb_to_b(get_local_disk_space(cwd))  # B (diskspace is in MB)
    free_space_limit = human2bytes(config.Pilot.free_space_limit)

    if spaceleft <= free_space_limit:
        # the diagnostics string is also returned to the caller, so it is formatted eagerly
        diagnostics = 'too little space left on local disk to run job: %d B (need > %d B)' %\
                      (spaceleft, free_space_limit)
        ec = errors.NOLOCALSPACE
        logger.warning(diagnostics)
    else:
        logger.info('sufficient remaining disk space (%d B)', spaceleft)

    return ec, diagnostics
def interpret_payload_exit_info(job):
    """
    Translate the payload's exit information into a pilot error code on the job.

    The checks are tried in a fixed order and only the first match is recorded
    (with priority=True) in job.piloterrorcodes / job.piloterrordiags. If nothing
    matches, the job object is left untouched.

    :param job: job object.
    :return:
    """

    space_problem = False

    if is_out_of_memory(job):
        # out of memory errors are identified from the stderr
        error_code = errors.PAYLOADOUTOFMEMORY
    elif is_installation_error(job):
        # specific errors in the stdout (tail)
        error_code = errors.MISSINGINSTALLATION
    elif is_atlassetup_error(job):
        # AtlasSetup failed
        error_code = errors.SETUPFATAL
    elif is_out_of_space(job):
        # the payload ran out of space
        error_code = errors.NOLOCALSPACE
        space_problem = True
    elif is_nfssqlite_locking_problem(job):
        # specific errors in the stdout (full)
        error_code = errors.NFSSQLITE
    elif is_user_code_missing(job):
        # the user tarball is missing on the server
        error_code = errors.MISSINGUSERCODE
    elif job.transexitcode == 0 and job.exitcode != 0:
        # general Pilot error code when the payload error could not be identified
        error_code = errors.UNKNOWNPAYLOADFAILURE
    else:
        return

    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error_code, priority=True)

    if space_problem:
        # double check local space (informational only; the result is just logged)
        free_bytes = convert_mb_to_b(get_local_disk_space(os.getcwd()))  # B (diskspace is in MB)
        logger.info('verifying local space: %d B' % free_bytes)