def run(self):
    '''Run the cleanup loop for the local-submission cleanup thread.

    Repeatedly scans the submit directory for ``*.cluster`` files belonging
    to clusters that are no longer in the queue and removes their submit
    directories from disk. Sleeps after each pass; exits when the stop
    event is set.
    '''
    logging.info('[cleaner] Local submission cleanup thread starting up')
    # TODO Should we warn the user if they're running as root? That's dangerous.
    # TODO Should we context switch to CONDOR_IDS automatically if we're root?
    while not self._stopevent.isSet():
        # Case #8236: Sleep when the cleanup thread starts. Gives the schedds a
        # chance to start up and avoids a race condition that can delete the
        # directories of jobs that may actually still be in the queue.
        logging.info('[cleaner] Sleeping for %d seconds' % self._sleeptime)
        self._stopevent.wait(self._sleeptime)
        # Condor returns the value quoted; strip the quotes to get the raw path.
        submitDir = util.getCondorConfigVal('CONDOR_AGENT_SUBMIT_DIR', default='""').replace('"', '')
        if submitDir == '':
            logging.error('[cleaner] Could not find a CONDOR_AGENT_SUBMIT_DIR setting for this host -- no cleanup performed')
        elif util.getCondorConfigVal('CONDOR_AGENT_SKIP_CLEANUP', default=False):
            # NOTE(review): config values may come back as strings; a literal
            # "false"/"FALSE" string would be truthy here -- verify the
            # semantics of util.getCondorConfigVal for boolean settings.
            logging.info('[cleaner] CONDOR_AGENT_SKIP_CLEANUP is True. Skipping cleanup')
        else:
            logging.info('[cleaner] Scanning submit directory \'%s\' for *.cluster files...' % submitDir)
            for c in self._locate(pattern='*.cluster', root=submitDir):
                # I never want this thread to exit because of an exception so
                # we'll blanket trap everything at this level and just report
                # it back as an error.
                try:
                    self._safeRemoveClusterFiles(c)
                except Exception as e:  # was Py2-only "except Exception, e"
                    logging.error('[cleaner] Caught unhandled exception: %s' % (str(e)))
def run(self):
    '''Run the cleanup loop for the local-submission cleanup thread.

    Repeatedly scans the submit directory for ``*.cluster`` files belonging
    to clusters that are no longer in the queue and removes their submit
    directories from disk. Sleeps after each pass; exits when the stop
    event is set.

    NOTE(review): unlike the other revision of run() in this file, this one
    does not consult CONDOR_AGENT_SKIP_CLEANUP before cleaning -- confirm
    which revision is current.
    '''
    logging.info('[cleaner] Local submission cleanup thread starting up')
    # TODO Should we warn the user if they're running as root? That's dangerous.
    # TODO Should we context switch to CONDOR_IDS automatically if we're root?
    while not self._stopevent.isSet():
        # Case #8236: Sleep when the cleanup thread starts. Gives the schedds a
        # chance to start up and avoids a race condition that can delete the
        # directories of jobs that may actually still be in the queue.
        logging.info('[cleaner] Sleeping for %d seconds' % self._sleeptime)
        self._stopevent.wait(self._sleeptime)
        # Condor returns the value quoted; strip the quotes to get the raw path.
        submitDir = util.getCondorConfigVal('CONDOR_AGENT_SUBMIT_DIR', default='""').replace('"', '')
        if submitDir == '':
            logging.error(
                '[cleaner] Could not find a CONDOR_AGENT_SUBMIT_DIR setting for this host -- no cleanup performed'
            )
        else:
            logging.info(
                '[cleaner] Scanning submit directory \'%s\' for *.cluster files...' % submitDir)
            for c in self._locate(pattern='*.cluster', root=submitDir):
                # I never want this thread to exit because of an exception so
                # we'll blanket trap everything at this level and just report
                # it back as an error.
                try:
                    self._safeRemoveClusterFiles(c)
                except Exception as e:  # was Py2-only "except Exception, e"
                    logging.error(
                        '[cleaner] Caught unhandled exception: %s' % (str(e)))
def cleanSubmissionDir(submission_dir):
    '''Clean up after a failed submission attempt so disk pollution does not
    get so bad.

    Does some minimal checking on submission_dir to make sure it does not do
    something stupid like delete / or something like that. Logs (rather than
    raises) on any failure.
    '''
    # Check if cleanup is disabled
    if util.getCondorConfigVal('CONDOR_AGENT_SKIP_CLEANUP', default=False):
        # logging.warn is a deprecated alias of logging.warning
        logging.warning('CONDOR_AGENT_SKIP_CLEANUP is true. Not cleaning failed submission.')
        return
    submit_dir_expected_prefix = CondorAgent.util.getCondorConfigVal("CONDOR_AGENT_SUBMIT_DIR").replace('"', '')
    if submit_dir_expected_prefix and len(submit_dir_expected_prefix) > 3:
        # That's a bit of a lame check, >3 -- it could be better. On !Windows it should suffice to
        # stop someone from accidentally removing /. On Windows it should suffice to stop someone
        # from accidentally removing something like C:\. In truth this path should always be longer
        # than 3 characters...so for now we'll say it's good enough.
        # TODO Improve the safety checks before we try to delete things off of disk
        # Fixed: the original used find() > -1, which matched the prefix
        # anywhere in the path; the log message (and safety intent) require
        # the path to *start with* the expected prefix.
        if submission_dir.startswith(submit_dir_expected_prefix):
            # We will trust that the user wasn't stupid enough to use / as their
            # submission dir.
            if os.path.isdir(submission_dir):
                logging.info('Cleaning up directory %s after failed submission' % submission_dir)
                try:
                    shutil.rmtree(submission_dir)
                except Exception as e:  # was Py2-only "except Exception, e"
                    logging.error('Unable to clean up %s after failed submission: %s' % (submission_dir, str(e)))
            else:
                logging.warning('Skipped cleanup of %s after failed submission, path is not a directory' % submission_dir)
        else:
            logging.warning('Skipped cleanup of %s after failed submission, path does not start with expected string "%s"' % (submission_dir, submit_dir_expected_prefix))
def getHistory(self, completed_since, jobs):
    '''Return a tuple of (history, new_completed_since).

    The new value for completed_since comes from the last job (with a
    non-zero CompletionDate) that was read. This ensures that the next time
    the client reads we resume from the last value we read in the file.
    (Previously we used the current timestamp.)

    Raises Exception when the HISTORY setting is missing, empty, or a
    directory.
    '''
    new_completed_since = completed_since
    history_file = util.getCondorConfigVal("HISTORY", "schedd", self.scheddName)
    if history_file is None:  # was "== None"; identity test is the correct idiom
        raise Exception("History is not enabled on this scheduler")
    # Case 5416: HISTORY should be a file, not a directory
    if os.path.isdir(history_file):
        raise Exception("The HISTORY setting is a directory")
    # Case 5458: Consider an empty string value for HISTORY to be the same as None
    # and raise an exception.
    if not history_file.strip():
        raise Exception("The HISTORY setting is an empty string")
    history_file = os.path.normpath(history_file)
    logging.info("History file for daemon %s: %s" % (self.scheddName, history_file))
    # Rotated history files share the base name as a prefix, so glob them all.
    files = glob.glob(history_file + "*")
    history_data = ''
    for f in files:
        if os.path.isfile(f):
            mod = os.path.getmtime(f)
            # allow for some overlap when testing
            # note: we don't skip the current file based on its timestamp because we don't
            # want to rely on that being updated properly
            if mod >= (completed_since - COMPLETED_SINCE_OVERLAP) or os.path.normpath(f) == history_file:
                # each output from condor_history has a trailing newline so we can
                # just concatenate them
                if jobs != "":
                    history_data = history_data + self.getItemizedHistoryFromFile(completed_since, jobs, f)
                else:
                    new_data, new_time = self.getHistoryFromFile(completed_since, f)
                    # keep the latest we've seen
                    new_completed_since = max(new_time, new_completed_since)
                    logging.debug("New CompletedSince: %s" % new_completed_since)
                    history_data = history_data + new_data
            else:
                logging.info("History file %s was last modified before given completedSince, skipped" % os.path.basename(f))
    return (history_data, new_completed_since)
def cleanSubmissionDir(submission_dir):
    '''Clean up after a failed submission attempt so disk pollution does not
    get so bad.

    Does some minimal checking on submission_dir to make sure it does not do
    something stupid like delete / or something like that. Logs (rather than
    raises) on any failure.
    '''
    # Check if cleanup is disabled
    if util.getCondorConfigVal('CONDOR_AGENT_SKIP_CLEANUP', default=False):
        # logging.warn is a deprecated alias of logging.warning
        logging.warning(
            'CONDOR_AGENT_SKIP_CLEANUP is true. Not cleaning failed submission.'
        )
        return
    submit_dir_expected_prefix = CondorAgent.util.getCondorConfigVal(
        "CONDOR_AGENT_SUBMIT_DIR").replace('"', '')
    if submit_dir_expected_prefix and len(submit_dir_expected_prefix) > 3:
        # That's a bit of a lame check, >3 -- it could be better. On !Windows it should suffice to
        # stop someone from accidentally removing /. On Windows it should suffice to stop someone
        # from accidentally removing something like C:\. In truth this path should always be longer
        # than 3 characters...so for now we'll say it's good enough.
        # TODO Improve the safety checks before we try to delete things off of disk
        # Fixed: the original used find() > -1, which matched the prefix
        # anywhere in the path; the log message (and safety intent) require
        # the path to *start with* the expected prefix.
        if submission_dir.startswith(submit_dir_expected_prefix):
            # We will trust that the user wasn't stupid enough to use / as their
            # submission dir.
            if os.path.isdir(submission_dir):
                logging.info(
                    'Cleaning up directory %s after failed submission' %
                    submission_dir)
                try:
                    shutil.rmtree(submission_dir)
                except Exception as e:  # was Py2-only "except Exception, e"
                    logging.error(
                        'Unable to clean up %s after failed submission: %s' %
                        (submission_dir, str(e)))
            else:
                logging.warning(
                    'Skipped cleanup of %s after failed submission, path is not a directory'
                    % submission_dir)
        else:
            logging.warning(
                'Skipped cleanup of %s after failed submission, path does not start with expected string "%s"'
                % (submission_dir, submit_dir_expected_prefix))
def getHistory(self, completed_since, jobs):
    '''Return a tuple of (history, new_completed_since).

    The new value for completed_since comes from the last job (with a
    non-zero CompletionDate) that was read. This ensures that the next time
    the client reads we resume from the last value we read in the file.
    (Previously we used the current timestamp.)

    Raises Exception when the HISTORY setting is missing, empty, or a
    directory.
    '''
    new_completed_since = completed_since
    history_file = util.getCondorConfigVal("HISTORY", "schedd", self.scheddName)
    if history_file is None:  # was "== None"; identity test is the correct idiom
        raise Exception("History is not enabled on this scheduler")
    # Case 5416: HISTORY should be a file, not a directory
    if os.path.isdir(history_file):
        raise Exception("The HISTORY setting is a directory")
    # Case 5458: Consider an empty string value for HISTORY to be the same as None
    # and raise an exception.
    if not history_file.strip():
        raise Exception("The HISTORY setting is an empty string")
    history_file = os.path.normpath(history_file)
    logging.info("History file for daemon %s: %s" % (self.scheddName, history_file))
    # Rotated history files share the base name as a prefix, so glob them all.
    files = glob.glob(history_file + "*")
    history_data = ''
    for f in files:
        if os.path.isfile(f):
            mod = os.path.getmtime(f)
            # allow for some overlap when testing
            # note: we don't skip the current file based on its timestamp because we don't
            # want to rely on that being updated properly
            if mod >= (completed_since - COMPLETED_SINCE_OVERLAP) or os.path.normpath(f) == history_file:
                # each output from condor_history has a trailing newline so we can
                # just concatenate them
                if jobs != "":
                    history_data = history_data + self.getItemizedHistoryFromFile(completed_since, jobs, f)
                else:
                    new_data, new_time = self.getHistoryFromFile(completed_since, f)
                    # keep the latest we've seen
                    new_completed_since = max(new_time, new_completed_since)
                    logging.debug("New CompletedSince: %s" % new_completed_since)
                    history_data = history_data + new_data
            else:
                logging.info("History file %s was last modified before given completedSince, skipped" % os.path.basename(f))
    return (history_data, new_completed_since)
def getHistory(self, completed_since, jobs):
    '''Return the concatenated condor_history output for all history files
    modified since ``completed_since`` (with some overlap allowance).

    Raises Exception when the HISTORY setting is missing or empty.

    NOTE(review): this revision lacks the "HISTORY is a directory" guard and
    the normpath/current-file handling present in the newer revisions of
    getHistory in this file -- confirm which revision is current.
    '''
    history_file = util.getCondorConfigVal("HISTORY", "schedd", self.scheddName)
    if history_file is None:  # was "== None"; identity test is the correct idiom
        raise Exception("History is not enabled on this scheduler")
    # Case 5458: Consider an empty string value for HISTORY to be the same as None
    # and raise an exception.
    if not history_file.strip():
        raise Exception("The HISTORY setting is an empty string")
    logging.info("History file for daemon %s: %s" % (self.scheddName, history_file))
    # Rotated history files share the base name as a prefix, so glob them all.
    files = glob.glob(history_file + "*")
    history_data = ''
    for f in files:
        if os.path.isfile(f):
            mod = os.path.getmtime(f)
            # allow for some overlap when testing
            if mod >= (completed_since - COMPLETED_SINCE_OVERLAP):
                # each output from condor_history has a trailing newline so we can
                # just concatenate them
                history_data = history_data + self.getHistoryFromFile(completed_since, jobs, f)
            else:
                logging.info("History file %s was last modified before given completedSince, skipped" % os.path.basename(f))
    return history_data