def testA_runCommand(self):
    """
    _runCommand_

    Check that we can run a basic shell command.
    Also check that we get the exit code back correctly.
    """
    # First ls a directory you know will have stuff in it
    stdout, stderr, retcode = SubprocessAlgos.runCommand(cmd='ls /tmp/')
    self.assertTrue(len(stdout) > 0)
    self.assertEqual(retcode, 0)

    # Now try the same with the shell off
    stdout, stderr, retcode = SubprocessAlgos.runCommand(cmd=['ls', '/tmp/'])
    self.assertTrue(len(stdout) > 0)
    self.assertEqual(retcode, 0)

    # Now check that we can catch a non-zero return code and its output
    stdout, stderr, retcode = SubprocessAlgos.runCommand(cmd='echo HELP; exit 5')
    self.assertEqual(stdout, 'HELP\n')
    self.assertEqual(stderr, '')
    self.assertEqual(retcode, 5)

    # Now see if we can time out a process that takes too long
    self.assertRaises(SubprocessAlgos.SubprocessAlgoException,
                      SubprocessAlgos.runCommand, cmd='sleep 10', timeout=1)

    # This one also runs longer than the timeout, but does not raise
    # because the timeout is not an integer
    SubprocessAlgos.runCommand(cmd='sleep 1', timeout=0.1)
    return
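# A minimal sketch of the kind of wrapper the test above exercises, assuming
# SubprocessAlgos.runCommand is a thin layer over subprocess.Popen that accepts
# either a shell string or an argv list and returns (stdout, stderr, returncode).
# The real implementation (including its timeout/exception behaviour) may differ.
import subprocess


def run_command_sketch(cmd):
    """Run `cmd` (shell string or argv list); return (stdout, stderr, retcode)."""
    use_shell = isinstance(cmd, str)
    proc = subprocess.Popen(cmd, shell=use_shell,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    return stdout.decode(), stderr.decode(), proc.returncode


# Example: mirrors the non-zero exit code assertion in the test above
# out, err, rc = run_command_sketch('echo HELP; exit 5')  # -> 'HELP\n', '', 5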
def submitWorker(input, results, timeout=None):
    """
    _submitWorker_

    Runs a subprocessed command.

    This takes whatever you send it (a single ID), executes the command,
    and then returns the stdout result.

    I planned this to do a glite-job-output command in massive parallel,
    possibly using the bulkID instead of the gridID.  Either way, all you
    have to change is the command here, and what is sent in in the
    complete() function.
    """
    # Get this started
    while True:
        try:
            work = input.get()
        except (EOFError, IOError) as ex:
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            crashMessage += str(ex)
            logging.error(crashMessage)
            break
        except Exception as ex:
            msg = "Hit unidentified exception getting work\n"
            msg += str(ex)
            msg += "Assuming everything's totally hosed.  Killing process.\n"
            logging.error(msg)
            break

        if work == 'STOP':
            # Put the brakes on
            logging.info("submitWorker multiprocess issued STOP command!")
            break

        command = work.get('command', None)
        idList = work.get('idList', [])
        if not command:
            results.put({'stdout': '', 'stderr': '999100\n Got no command!', 'idList': idList})
            continue

        try:
            stdout, stderr, returnCode = SubprocessAlgos.runCommand(cmd=command, shell=True, timeout=timeout)
            if returnCode == 0:
                results.put({'stdout': stdout, 'stderr': stderr,
                             'idList': idList, 'exitCode': returnCode})
            else:
                results.put({'stdout': stdout,
                             'stderr': 'Non-zero exit code: %s\n stderr: %s' % (returnCode, stderr),
                             'exitCode': returnCode, 'idList': idList})
        except Exception as ex:
            msg = "Critical error in subprocess while submitting to condor\n"
            msg += str(ex)
            msg += str(traceback.format_exc())
            logging.error(msg)
            results.put({'stdout': '', 'stderr': '999101\n %s' % msg,
                         'idList': idList, 'exitCode': 999101})

    return 0
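# Hedged usage sketch: driving submitWorker with multiprocessing queues. The
# queue wiring, command and idList below are illustrative assumptions only;
# the real caller sets these up elsewhere (e.g. around its complete() logic).
import multiprocessing


def _sketch_drive_submit_worker():
    workInput = multiprocessing.Queue()
    workResults = multiprocessing.Queue()

    worker = multiprocessing.Process(target=submitWorker,
                                     args=(workInput, workResults),
                                     kwargs={'timeout': 300})
    worker.start()

    # One unit of work: a shell command plus the job IDs it covers
    workInput.put({'command': 'echo submitting', 'idList': [1, 2, 3]})
    print(workResults.get())  # {'stdout': 'submitting\n', 'stderr': '', ...}

    workInput.put('STOP')  # tells the worker loop to exit
    worker.join()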
def periodicUpdate(self):
    """
    Run on the defined intervals.
    """
    killProc = False

    if self.disableStep:
        # Then we aren't doing CPU monitoring on this step
        return

    if self.currentStepName is None:
        # We're between steps
        return

    if self.currentStepSpace is None:
        # Then build the step space
        self.currentStepSpace = getStepSpace(self.stepHelper.name())

    stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

    if stepPID is None:
        # Then we have no step PID, we can do nothing
        return

    # Now we run the monitor command and collate the data
    cmd = self.monitorBase % (stepPID, stepPID)
    stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)

    output = stdout.split()
    if not len(output) > 7:
        # Then something went wrong in getting the ps data
        msg = "Error when grabbing output from process ps\n"
        msg += "output = %s\n" % output
        msg += "command = %s\n" % self.monitorCommand
        logging.error(msg)
        return

    rss = float(output[2])
    vsize = float(output[3])
    logging.info("Retrieved following performance figures:")
    logging.info("RSS: %s; VSize: %s; PCPU: %s; PMEM: %s" % (output[2], output[3],
                                                             output[4], output[5]))

    msg = 'Error in CMSSW step %s\n' % self.currentStepName
    if self.maxRSS is not None and rss >= self.maxRSS:
        msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
        msg += "Job has RSS: %s\n" % rss
        killProc = True
    if self.maxVSize is not None and vsize >= self.maxVSize:
        msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
        msg += "Job has VSize: %s\n" % vsize
        killProc = True

    if killProc:
        logging.error(msg)
        report = Report.Report()
        # Find the global report
        logPath = os.path.join(self.currentStepSpace.location,
                               '../../../',
                               os.path.basename(self.logPath))
        try:
            if os.path.isfile(logPath):
                # We should be able to find an existing job report.
                # If not, we're in trouble
                logging.debug("Found pre-existing error report in DashboardMonitor termination.")
                report.load(logPath)
            # Create a new step that won't be overridden by an exiting CMSSW
            if not report.retrieveStep(step="PerformanceError"):
                report.addStep(reportname="PerformanceError")
            report.addError(stepName="PerformanceError", exitCode=99900,
                            errorType="PerformanceKill", errorDetails=msg)
            report.save(logPath)
        except Exception as ex:
            # Basically, we can't write a log report and we're hosed
            # Kill anyway, and hope the logging file gets written out
            msg2 = "Exception while writing out jobReport.\n"
            msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
            msg2 += str(ex)
            msg2 += str(traceback.format_exc()) + '\n'
            logging.error(msg2)

        try:
            logging.error("Attempting to kill job using SIGUSR2")
            os.kill(stepPID, signal.SIGUSR2)
        except Exception:
            os.kill(stepPID, signal.SIGTERM)
def testD_MyProxyDelegation(self):
    """
    _MyProxyDelegation_

    Test whether we can delegate a proxy via myproxy to this job.

    IMPORTANT: If you are going to run this test you will have to set the
    serverCert/Key config options to point to your local server cert.  You
    will also have to run this job with your DN.  I don't recommend figuring
    out how to do this without knowing what you're doing in regards to proxy
    stuff.
    """
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    # Get the config and set the removal time to -10 for testing
    proxyDir = os.path.join(self.testDir, 'proxyDir')
    os.mkdir(proxyDir)
    config = self.getConfig()
    config.BossAir.removeTime = -10.0
    config.BossAir.pluginNames.append('VanillaCondorPlugin')
    config.BossAir.delegatedServerCert = '/uscms/home/mnorman/.globus/cms-xen39crab3devcert.pem'
    config.BossAir.delegatedServerKey = '/uscms/home/mnorman/.globus/cms-xen39crab3devkey.pem'
    config.BossAir.myproxyServer = 'myproxy.cern.ch'
    config.BossAir.proxyDir = proxyDir
    config.BossAir.delegatedServerHash = 'a6f078516a0beed5dcb31ba866868fa690069f9a'

    userDN = '/DC=org/DC=doegrids/OU=People/CN=Matthew Norman 453632'

    nJobs = 10
    jobDummies = self.createDummyJobs(nJobs=nJobs)

    baAPI = BossAirAPI(config=config)

    jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
    f = open(jobPackage, 'w')
    f.write(' ')
    f.close()

    sandbox = os.path.join(self.testDir, 'sandbox.box')
    f = open(sandbox, 'w')
    f.write(' ')
    f.close()

    jobList = []
    for j in jobDummies:
        tmpJob = {'id': j['id']}
        tmpJob['custom'] = {'location': 'malpaquet'}
        tmpJob['name'] = j['name']
        tmpJob['cache_dir'] = self.testDir
        tmpJob['retry_count'] = 0
        tmpJob['plugin'] = 'VanillaCondorPlugin'
        tmpJob['owner'] = userDN
        tmpJob['packageDir'] = self.testDir
        tmpJob['sandbox'] = sandbox
        tmpJob['priority'] = None
        jobList.append(tmpJob)

    info = {}
    # info['packageDir'] = self.testDir
    info['index'] = 0
    info['sandbox'] = sandbox

    baAPI.submit(jobs=jobList, info=info)

    proxyFile = os.listdir(proxyDir)[0]

    stdout, stderr, _retcode = SubprocessAlgos.runCommand(
        cmd='export X509_USER_PROXY=%s; voms-proxy-info' % os.path.join(proxyDir, proxyFile))
    self.assertEqual(stdout.split('\n')[0],
                     'subject : %s/CN=proxy/CN=proxy/CN=proxy/CN=proxy' % userDN)

    # Now kill 'em manually
    command = ['condor_rm', self.user]
    SubprocessAlgos.runCommand(cmd=command, shell=False)

    return
def periodicUpdate(self):
    """
    Run on the defined intervals.
    """
    killProc = False
    killHard = False
    reason = ''
    errorCodeLookup = {'PSS': 50660,
                       'Wallclock time': 50664,
                       '': 99999}

    if self.disableStep:
        # Then we aren't doing CPU monitoring on this step
        return

    if self.currentStepName is None:
        # We're between steps
        return

    if self.currentStepSpace is None:
        # Then build the step space
        self.currentStepSpace = getStepSpace(self.stepHelper.name())

    stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

    if stepPID is None:
        # Then we have no step PID, we can do nothing
        return

    # Now we run the ps monitor command and collate the data
    # Gathers RSS, %CPU and %MEM statistics from ps
    ps_cmd = self.monitorBase % (stepPID, stepPID)
    stdout, _stderr, _retcode = subprocessAlgos.runCommand(ps_cmd)

    ps_output = stdout.split()
    if not len(ps_output) > 6:
        # Then something went wrong in getting the ps data
        msg = "Error when grabbing output from process ps\n"
        msg += "output = %s\n" % ps_output
        msg += "command = %s\n" % ps_cmd
        logging.error(msg)
        return

    # Run the command to gather PSS memory statistics from /proc/<pid>/smaps
    smaps_cmd = self.pssMemoryCommand % (stepPID)
    stdout, _stderr, _retcode = subprocessAlgos.runCommand(smaps_cmd)

    smaps_output = stdout.split()
    if not len(smaps_output) == 1:
        # Then something went wrong in getting the smaps data
        msg = "Error when grabbing output from smaps\n"
        msg += "output = %s\n" % smaps_output
        msg += "command = %s\n" % smaps_cmd
        logging.error(msg)
        return

    # smaps also returns data in kiloBytes, let's make it megaBytes
    pss = int(smaps_output[0]) // 1000

    logging.info("PSS: %s; RSS: %s; PCPU: %s; PMEM: %s",
                 smaps_output[0], ps_output[2], ps_output[3], ps_output[4])

    msg = 'Error in CMSSW step %s\n' % self.currentStepName
    msg += 'Number of Cores: %s\n' % self.numOfCores

    if self.maxPSS is not None and pss >= self.maxPSS:
        msg += "Job has exceeded maxPSS: %s MB\n" % self.maxPSS
        msg += "Job has PSS: %s MB\n" % pss
        killProc = True
        reason = 'PSS'
    elif self.hardTimeout is not None and self.softTimeout is not None:
        currentTime = time.time()
        if (currentTime - self.startTime) > self.softTimeout:
            killProc = True
            reason = 'Wallclock time'
            msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
            msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
        if (currentTime - self.startTime) > self.hardTimeout:
            killHard = True
            msg += "Job exceeded soft timeout"

    if not killProc:
        # Then the job is behaving well, there is nothing to do
        return

    # Make sure we persist the performance error only once
    if not self.killRetry:
        logging.error(msg)
        report = Report.Report()
        # Find the global report
        logPath = os.path.join(self.currentStepSpace.location,
                               '../../../',
                               os.path.basename(self.logPath))
        try:
            if os.path.isfile(logPath):
                # We should be able to find an existing job report.
                # If not, we're in trouble
                logging.debug("Found pre-existing error report in PerformanceMonitor termination.")
                report.load(logPath)
            # Create a new step that won't be overridden by an exiting CMSSW
            if not report.retrieveStep(step="PerformanceError"):
                report.addStep(reportname="PerformanceError")
            report.addError(stepName="PerformanceError", exitCode=errorCodeLookup[reason],
                            errorType="PerformanceKill", errorDetails=msg)
            report.save(logPath)
        except Exception as ex:
            # Basically, we can't write a log report and we're hosed
            # Kill anyway, and hope the logging file gets written out
            msg2 = "Exception while writing out jobReport.\n"
            msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
            msg2 += "Error: %s" % str(ex)
            logging.exception(msg2)

    try:
        if not killHard and not self.killRetry:
            logging.error("Attempting to kill step using SIGUSR2")
            os.kill(stepPID, signal.SIGUSR2)
        else:
            logging.error("Attempting to kill step using SIGTERM")
            os.kill(stepPID, signal.SIGTERM)
    except Exception:
        logging.error("Attempting to kill step using SIGTERM")
        os.kill(stepPID, signal.SIGTERM)
    finally:
        self.killRetry = True

    return
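# Hedged sketch (assumptions): plausible shapes for the two command templates
# that periodicUpdate() above formats with the step PID. The real monitorBase
# and pssMemoryCommand are defined in the monitor's setup code and may differ
# in fields, options and exact output layout.
monitorBase = "ps -p %i -o pid,ppid,rss,pcpu,pmem,cmd -ww | grep %i"
pssMemoryCommand = "awk '/^Pss/ {pss += $2} END {print pss}' /proc/%i/smaps"

# Example formatting, mirroring periodicUpdate():
stepPID = 12345  # hypothetical PID
ps_cmd = monitorBase % (stepPID, stepPID)
smaps_cmd = pssMemoryCommand % (stepPID)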
def periodicUpdate(self):
    """
    Run on the defined intervals.
    """
    killProc = False
    killHard = False

    if self.disableStep:
        # Then we aren't doing CPU monitoring on this step
        return

    if self.currentStepName is None:
        # We're between steps
        return

    if self.currentStepSpace is None:
        # Then build the step space
        self.currentStepSpace = getStepSpace(self.stepHelper.name())

    stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

    if stepPID is None:
        # Then we have no step PID, we can do nothing
        return

    # Now we run the monitor command and collate the data
    cmd = self.monitorBase % (stepPID, stepPID)
    stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)

    output = stdout.split()
    if not len(output) > 7:
        # Then something went wrong in getting the ps data
        msg = "Error when grabbing output from process ps\n"
        msg += "output = %s\n" % output
        msg += "command = %s\n" % self.monitorCommand
        logging.error(msg)
        return

    rss = float(output[2])
    vsize = float(output[3])
    logging.info("Retrieved following performance figures:")
    logging.info("RSS: %s; VSize: %s; PCPU: %s; PMEM: %s" % (output[2], output[3],
                                                             output[4], output[5]))

    msg = 'Error in CMSSW step %s\n' % self.currentStepName

    if self.maxRSS is not None and rss >= self.maxRSS:
        msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
        msg += "Job has RSS: %s\n" % rss
        killProc = True
    if self.maxVSize is not None and vsize >= self.maxVSize:
        msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
        msg += "Job has VSize: %s\n" % vsize
        killProc = True

    # Let's check the running time
    currentTime = time.time()
    if self.hardTimeout is not None and self.softTimeout is not None:
        if (currentTime - self.startTime) > self.softTimeout:
            killProc = True
            msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
            msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
        if (currentTime - self.startTime) > self.hardTimeout:
            killHard = True
            msg += "Job exceeded soft timeout"

    if killProc:
        logging.error(msg)
        report = Report.Report()
        # Find the global report
        logPath = os.path.join(self.currentStepSpace.location,
                               '../../../',
                               os.path.basename(self.logPath))
        try:
            if os.path.isfile(logPath):
                # We should be able to find an existing job report.
                # If not, we're in trouble
                logging.debug("Found pre-existing error report in PerformanceMonitor termination.")
                report.load(logPath)
            # Create a new step that won't be overridden by an exiting CMSSW
            if not report.retrieveStep(step="PerformanceError"):
                report.addStep(reportname="PerformanceError")
            report.addError(stepName="PerformanceError", exitCode=99900,
                            errorType="PerformanceKill", errorDetails=msg)
            report.save(logPath)
        except Exception as ex:
            # Basically, we can't write a log report and we're hosed
            # Kill anyway, and hope the logging file gets written out
            msg2 = "Exception while writing out jobReport.\n"
            msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
            msg2 += str(ex)
            msg2 += str(traceback.format_exc()) + '\n'
            logging.error(msg2)

        try:
            if not killHard:
                logging.error("Attempting to kill step using SIGUSR2")
                os.kill(stepPID, signal.SIGUSR2)
            else:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)
        except Exception:
            logging.error("Attempting to kill step using SIGTERM")
            os.kill(stepPID, signal.SIGTERM)
def periodicUpdate(self):
    """
    Run on the defined intervals.
    """
    killProc = False
    killHard = False
    reason = ''
    errorCodeLookup = {'RSS': 50660,
                       'VSZ': 50661,
                       'Wallclock time': 50664,
                       '': 99999}

    if self.disableStep:
        # Then we aren't doing CPU monitoring on this step
        return

    if self.currentStepName is None:
        # We're between steps
        return

    if self.currentStepSpace is None:
        # Then build the step space
        self.currentStepSpace = getStepSpace(self.stepHelper.name())

    stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

    if stepPID is None:
        # Then we have no step PID, we can do nothing
        return

    # Now we run the monitor command and collate the data
    cmd = self.monitorBase % (stepPID, stepPID)
    stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)

    output = stdout.split()
    if not len(output) > 7:
        # Then something went wrong in getting the ps data
        msg = "Error when grabbing output from process ps\n"
        msg += "output = %s\n" % output
        msg += "command = %s\n" % self.monitorCommand
        logging.error(msg)
        return

    # FIXME: making it backwards compatible. Keep only the "else" block in HG1801
    if self.maxRSS is not None and self.maxRSS >= (1024 * 1024):
        # then workload value is still in KiB (old way)
        rss = int(output[2])
        vsize = int(output[3])
    else:
        rss = int(output[2]) // 1024  # convert it to MiB
        vsize = int(output[3]) // 1024  # convert it to MiB

    logging.info("Retrieved following performance figures:")
    logging.info("RSS: %s; VSize: %s; PCPU: %s; PMEM: %s",
                 output[2], output[3], output[4], output[5])

    msg = 'Error in CMSSW step %s\n' % self.currentStepName
    msg += 'Number of Cores: %s\n' % self.numOfCores

    if self.maxRSS is not None and rss >= self.maxRSS:
        msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
        msg += "Job has RSS: %s\n" % rss
        killProc = True
        reason = 'RSS'
    elif self.maxVSize is not None and vsize >= self.maxVSize:
        msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
        msg += "Job has VSize: %s\n" % vsize
        killProc = True
        reason = 'VSZ'
    elif self.hardTimeout is not None and self.softTimeout is not None:
        currentTime = time.time()
        if (currentTime - self.startTime) > self.softTimeout:
            killProc = True
            reason = 'Wallclock time'
            msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
            msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
        if (currentTime - self.startTime) > self.hardTimeout:
            killHard = True
            msg += "Job exceeded soft timeout"

    if killProc:
        logging.error(msg)
        report = Report.Report()
        # Find the global report
        logPath = os.path.join(self.currentStepSpace.location,
                               '../../../',
                               os.path.basename(self.logPath))
        try:
            if os.path.isfile(logPath):
                # We should be able to find an existing job report.
                # If not, we're in trouble
                logging.debug("Found pre-existing error report in PerformanceMonitor termination.")
                report.load(logPath)
            # Create a new step that won't be overridden by an exiting CMSSW
            if not report.retrieveStep(step="PerformanceError"):
                report.addStep(reportname="PerformanceError")
            report.addError(stepName="PerformanceError", exitCode=errorCodeLookup[reason],
                            errorType="PerformanceKill", errorDetails=msg)
            report.save(logPath)
        except Exception as ex:
            # Basically, we can't write a log report and we're hosed
            # Kill anyway, and hope the logging file gets written out
            msg2 = "Exception while writing out jobReport.\n"
            msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
            msg2 += str(ex)
            msg2 += str(traceback.format_exc()) + '\n'
            logging.error(msg2)

        try:
            if not killHard:
                logging.error("Attempting to kill step using SIGUSR2")
                os.kill(stepPID, signal.SIGUSR2)
            else:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)
        except Exception:
            logging.error("Attempting to kill step using SIGTERM")
            os.kill(stepPID, signal.SIGTERM)

    return
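# Worked example (illustrative numbers) of the backwards-compatibility branch
# above: a legacy maxRSS given in KiB (>= 1024 * 1024) is compared against the
# raw ps figure in KiB, while a new-style maxRSS in MiB is compared against the
# ps figure scaled down to MiB.
ps_rss_kib = 2500000                          # hypothetical RSS from ps, in KiB

legacy_max_rss = 2300000                      # legacy threshold, in KiB
print(ps_rss_kib >= legacy_max_rss)           # True -> job would be flagged

new_max_rss = 2300                            # new-style threshold, in MiB
print((ps_rss_kib // 1024) >= new_max_rss)    # 2441 >= 2300 -> True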
def periodicUpdate(self):
    """
    Run on the defined intervals.
    """
    killProc = False
    killHard = False
    reason = ''
    errorCodeLookup = {'RSS': 50660,
                       'VSZ': 50661,
                       'Wallclock time': 50664,
                       '': 99999}

    if self.disableStep:
        # Then we aren't doing CPU monitoring on this step
        return

    if self.currentStepName is None:
        # We're between steps
        return

    if self.currentStepSpace is None:
        # Then build the step space
        self.currentStepSpace = getStepSpace(self.stepHelper.name())

    stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

    if stepPID is None:
        # Then we have no step PID, we can do nothing
        return

    # Now we run the monitor command and collate the data
    cmd = self.monitorBase % (stepPID, stepPID)
    stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)

    output = stdout.split()
    if not len(output) > 7:
        # Then something went wrong in getting the ps data
        msg = "Error when grabbing output from process ps\n"
        msg += "output = %s\n" % output
        msg += "command = %s\n" % self.monitorCommand
        logging.error(msg)
        return

    # FIXME: making it backwards compatible. Keep only the "else" block in HG1801
    if self.maxRSS is not None and self.maxRSS >= (1024 * 1024):
        # then workload value is still in KiB (old way)
        rss = int(output[2])
        vsize = int(output[3])
    else:
        # ps returns data in kiloBytes, let's make it megaBytes
        # I'm so confused with these megabytes and mebibytes...
        rss = int(output[2]) // 1000  # convert it to MiB
        vsize = int(output[3]) // 1000  # convert it to MiB

    logging.info("Retrieved following performance figures:")
    logging.info("RSS: %s; VSize: %s; PCPU: %s; PMEM: %s",
                 output[2], output[3], output[4], output[5])

    msg = 'Error in CMSSW step %s\n' % self.currentStepName
    msg += 'Number of Cores: %s\n' % self.numOfCores

    if self.maxRSS is not None and rss >= self.maxRSS:
        msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
        msg += "Job has RSS: %s\n" % rss
        killProc = True
        reason = 'RSS'
    elif self.maxVSize is not None and vsize >= self.maxVSize:
        msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
        msg += "Job has VSize: %s\n" % vsize
        killProc = True
        reason = 'VSZ'
    elif self.hardTimeout is not None and self.softTimeout is not None:
        currentTime = time.time()
        if (currentTime - self.startTime) > self.softTimeout:
            killProc = True
            reason = 'Wallclock time'
            msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
            msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
        if (currentTime - self.startTime) > self.hardTimeout:
            killHard = True
            msg += "Job exceeded soft timeout"

    if killProc:
        logging.error(msg)
        report = Report.Report()
        # Find the global report
        logPath = os.path.join(self.currentStepSpace.location,
                               '../../../',
                               os.path.basename(self.logPath))
        try:
            if os.path.isfile(logPath):
                # We should be able to find an existing job report.
                # If not, we're in trouble
                logging.debug("Found pre-existing error report in PerformanceMonitor termination.")
                report.load(logPath)
            # Create a new step that won't be overridden by an exiting CMSSW
            if not report.retrieveStep(step="PerformanceError"):
                report.addStep(reportname="PerformanceError")
            report.addError(stepName="PerformanceError", exitCode=errorCodeLookup[reason],
                            errorType="PerformanceKill", errorDetails=msg)
            report.save(logPath)
        except Exception as ex:
            # Basically, we can't write a log report and we're hosed
            # Kill anyway, and hope the logging file gets written out
            msg2 = "Exception while writing out jobReport.\n"
            msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
            msg2 += str(ex)
            msg2 += str(traceback.format_exc()) + '\n'
            logging.error(msg2)

        try:
            if not killHard:
                logging.error("Attempting to kill step using SIGUSR2")
                os.kill(stepPID, signal.SIGUSR2)
            else:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)
        except Exception:
            logging.error("Attempting to kill step using SIGTERM")
            os.kill(stepPID, signal.SIGTERM)

    return