def disk_usage_alert(proj, max_disk, emails):
    """Email a PUBS alert if the latest logged disk usage exceeds max_disk.

    proj     -- sequence of log entries; only the last entry is inspected.
    max_disk -- threshold in percent (e.g. 90).
    emails   -- receiver list handed straight to pub_smtp.

    Only acts on round 5-minute wall-clock marks so repeated polls do not
    spam; otherwise returns immediately. Returns an error string if
    datetime cannot be imported (kept for backward compatibility),
    otherwise None.
    """
    try:
        import datetime
    except ImportError:
        return 'failed import of datetime'

    # Fire at most once per round 5-minute interval.
    timenow = datetime.datetime.now()
    if (timenow.minute % 5) != 0:
        return

    # Removed the unused 'log_time' local; only the log dict is needed.
    log_dict = proj[-1].get_log_dict()

    # DISK_USAGE_DATA is stored as a fraction; convert to percent.
    lastDISK = 0
    for key in log_dict:
        if str(key) == 'DISK_USAGE_DATA':
            lastDISK = float(log_dict[key]) * 100

    if lastDISK > max_disk:
        pub_smtp(receiver=emails,
                 subject='PUBS ALERT: Disk usage on %s above %i percent!'
                         % (pub_env.kSERVER_NAME, max_disk),
                 text='Current disk usage is at %.02f percent. Please take action and clear disk space!' % (lastDISK))
    return
def _jobstat_from_log(self, submit_time):
    """Fetch the batch-job status string from the jobsub log file.

    submit_time -- epoch time of the most recent job submission.

    Returns a 2-tuple (ok, contents): ok is False when the log is
    missing, predates the submission, or is too old relative to the
    polling period; otherwise ok is self._check_jobstat_str(contents).
    E-mails the experts when the log file itself is missing.
    """
    result = (False, '')
    if not os.path.isfile(self.JOBSUB_LOG):
        subject = 'Failed fetching job log'
        text = 'Batch job log file not available... (check daemon, should not happen)'
        text += '\n\n'
        pub_smtp(receiver=self._experts, subject=subject, text=text)
        return result

    # Make sure log has been updated more recently than most recent submit.
    mod_time = os.path.getmtime(self.JOBSUB_LOG)
    self.info('Job log modification time: %s' % time.ctime(mod_time))
    self.info('Submit time: %s' % time.ctime(submit_time))
    if mod_time < submit_time + 60:
        return result

    # Reject logs that have not been refreshed within the current
    # polling period (with a 10-second grace margin).
    log_age = time.time() - mod_time
    if log_age + 10 > self._period:
        return result

    # Bug fix: the original leaked the file handle via open(...).read();
    # a context manager guarantees it is closed.
    with open(self.JOBSUB_LOG, 'r') as log_file:
        contents = log_file.read()
    return (self._check_jobstat_str(contents), contents)
def check_db(self):
    """Verify a checksum exists in the DB for each (run, subrun) of this
    project at status 2.

    Per subrun: logs status 0 when the stored checksum is present,
    status 100 (plus an e-mail to the experts) when it is missing.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    self.info('Here, self._nruns=%d ... ' % (self._nruns))

    # Fetch runs from DB and process for # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_runs(self._project, 2):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))

        # Report starting
        self.info('Calculating the file checksum: run=%d, subrun=%d ...' % (run, subrun))

        statusCode = 2
        in_file_name = self._infile_format % (run, subrun)
        in_file = '%s/%s' % (self._in_dir, in_file_name)

        # Get status object
        status = self._api.get_status(ds_status(self._project, x[0], x[1], x[2]))
        self._data = status._data

        # Bug fix: test presence BEFORE the str() conversion. The
        # original converted first, turning a missing (None) checksum
        # into the truthy string 'None' and silently marking it present.
        if self._data:
            statusCode = 0
        else:
            subject = 'Checksum of the file %s not in database' % in_file
            text = """File: %s

Checksum is not in database

""" % (in_file)
            pub_smtp(os.environ['PUB_SMTP_ACCT'], os.environ['PUB_SMTP_SRVR'],
                     os.environ['PUB_SMTP_PASS'], self._experts, subject, text)
            statusCode = 100
        self._data = str(self._data)

        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project, run=run, subrun=subrun,
                           seq=0, status=statusCode, data=self._data)
        # Log status
        self.log_status(status)

        # Break from loop if counter became 0
        if not ctr:
            break
def disk_usage_alert(proj, max_disk, emails):
    """Send a PUBS e-mail alert when the disk usage recorded in the last
    log entry of ``proj`` is above ``max_disk`` percent.

    Checked only at round 5-minute wall-clock marks; returns an error
    string if datetime cannot be imported, otherwise None.
    """
    try:
        import datetime
    except ImportError:
        return 'failed import of datetime'

    # Act only on a round 5-minute interval.
    now = datetime.datetime.now()
    if now.minute % 5:
        return

    entry = proj[-1]
    log_time = entry.get_log_time()
    log_dict = entry.get_log_dict()

    # Usage is stored as a fraction; convert to percent.
    usage_pct = 0
    for k, v in log_dict.items():
        if str(k) == 'DISK_USAGE_DATA':
            usage_pct = float(v) * 100

    if usage_pct > max_disk:
        pub_smtp(receiver=emails,
                 subject='PUBS ALERT: Disk usage on %s above %i percent!' % (pub_env.kSERVER_NAME, max_disk),
                 text='Current disk usage is at %.02f percent. Please take action and clear disk space!' % (usage_pct))
    return
def find_checksum(self):
    """Look up each (run, subrun) file at SAM and record its enstore
    checksum.

    Per subrun status: 0 enstore checksum found (stored in data),
    1 a checksum exists but is not enstore, 100 file unknown to SAM
    (experts are e-mailed).
    """
    # Abort if the DB connection cannot be established.
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # Read in resource info on first use.
    if self._nruns is None:
        self.get_resource()

    # Process up to self._nruns (run, subrun) pairs.
    remaining = self._nruns
    for x in self.get_xtable_runs([self._project, self._parent_project], [2, 0]):
        remaining -= 1
        (run, subrun) = (int(x[0]), int(x[1]))

        self.info('Checking tape: run=%d, subrun=%d ...' % (run, subrun))

        statusCode = 2
        in_file = self._infile_format % (run, subrun)

        samweb = samweb_cli.SAMWebClient(experiment="uboone")
        meta = {}
        try:
            meta = samweb.getMetadata(filenameorid=in_file)
            # Checksum metadata entries look like 'source:value'.
            checksum_info = meta['checksum'][0].split(':')
            if checksum_info[0] == 'enstore':
                self._data = checksum_info[1]
                statusCode = 0
            else:
                statusCode = 1
        except samweb_cli.exceptions.FileNotFound:
            subject = 'Failed to locate file %s at SAM' % in_file
            text = 'File %s is not found at SAM!' % in_file
            pub_smtp(os.environ['PUB_SMTP_ACCT'], os.environ['PUB_SMTP_SRVR'],
                     os.environ['PUB_SMTP_PASS'], self._experts, subject, text)
            statusCode = 100

        # NOTE(review): on the FileNotFound path self._data keeps
        # whatever value it had before — presumably intentional; confirm.
        status = ds_status(project=self._project, run=run, subrun=subrun,
                           seq=0, status=statusCode, data=self._data)
        self.log_status(status)

        # Stop once the per-instance run quota is exhausted.
        if not remaining:
            break
def execute(self):
    """Launch this project's command as a child process.

    Splits self._info._command on whitespace and starts it via Popen
    with stdout/stderr piped; sets self._proc and self._running on
    success. On OSError the failure is logged, an e-mail report is
    attempted, and DSException is raised.
    """
    try:
        #self._logger.info('Executing: \"%s\"' % self._info._command)
        #self._logger.info('Executing: \"%s\"' % str(self._info._command.split()))
        self._proc = Popen(self._info._command.split(None),
                           #shell=True,
                           stdout=PIPE,
                           stderr=PIPE)
        self._running = True
    except OSError as e:
        self._logger.error(e.strerror)
        self._logger.error('Error executing %s! Sending e-mail report...'
                           % self._info._project)
        try:
            pub_smtp(receiver=self._info._email,
                     subject='Failed execution of project %s' % self.name(),
                     text=e.strerror)
        except BaseException as be:
            self._logger.critical(
                'Project %s error could not be reported via email!'
                % self._info._project)
            # Bug fix: BaseException has no attribute 'v'; the original
            # be.v.split() raised AttributeError and masked the report.
            for line in str(be).split('\n'):
                self._logger.error(line)
        raise DSException(e.strerror)
def execute(self):
    """Start the configured command as a piped subprocess.

    On success sets self._proc (the Popen handle) and self._running.
    On OSError: logs, tries to e-mail the project owner, and raises
    DSException with the OS error string.
    """
    try:
        #self._logger.info('Executing: \"%s\"' % self._info._command)
        #self._logger.info('Executing: \"%s\"' % str(self._info._command.split()))
        self._proc = Popen(self._info._command.split(None),
                           #shell=True,
                           stdout=PIPE,
                           stderr=PIPE)
        self._running = True
    except OSError as e:
        self._logger.error(e.strerror)
        self._logger.error('Error executing %s! Sending e-mail report...'
                           % self._info._project)
        try:
            pub_smtp(receiver=self._info._email,
                     subject='Failed execution of project %s' % self.name(),
                     text=e.strerror)
        except BaseException as be:
            self._logger.critical(
                'Project %s error could not be reported via email!'
                % self._info._project)
            # Bug fix: 'be.v' does not exist on BaseException; use the
            # exception's own string representation instead.
            for line in str(be).split('\n'):
                self._logger.error(line)
        raise DSException(e.strerror)
def email(cls, owner, subject, text):
    """Send ``text`` with ``subject`` to the address registered for
    ``owner``; the owner's subject prefix, if any, is prepended as
    '<<prefix>> subject'.

    Returns the pub_smtp result, or False for an unknown owner, a falsy
    send result, or any exception during sending.
    """
    if owner not in cls._address:
        return False
    try:
        if cls._sub_prefix[owner]:
            subject = '<<%s>> %s' % (cls._sub_prefix[owner], subject)
        res = pub_smtp(receiver=cls._address[owner],
                       subject=subject,
                       text=text)
        # Normalize a falsy result (e.g. None) to False.
        return res if res else False
    except BaseException as e:
        # Broad catch is deliberate: an e-mail failure must never kill
        # the caller. print() form keeps Python 2/3 compatibility
        # (the original used py2-only 'print e').
        print(e)
        return False
def email(cls, owner, subject, text):
    """E-mail ``owner`` via pub_smtp, prepending their configured
    subject prefix when one exists.

    Returns False for unknown owners, falsy send results, or any
    exception raised while sending; otherwise the pub_smtp result.
    """
    if owner not in cls._address:
        return False
    try:
        if cls._sub_prefix[owner]:
            subject = '<<%s>> %s' % (cls._sub_prefix[owner], subject)
        res = pub_smtp(receiver=cls._address[owner],
                       subject=subject,
                       text=text)
        if not res:
            res = False
        return res
    except BaseException as e:
        # Deliberate broad catch: never let e-mail failure propagate.
        # print() form fixes the py2-only 'print e' statement.
        print(e)
        return False
def _jobstat_from_log(self, submit_time):
    """Read the batch-job status from the jobsub log file.

    submit_time -- epoch time of the most recent job submission.

    Returns (ok, contents); ok is False when the log is missing, older
    than the submission, or stale relative to the polling period.
    Missing log files trigger an e-mail to the experts.
    """
    result = (False, '')
    if not os.path.isfile(self.JOBSUB_LOG):
        subject = 'Failed fetching job log'
        text = 'Batch job log file not available... (check daemon, should not happen)'
        text += '\n\n'
        pub_smtp(receiver=self._experts, subject=subject, text=text)
        return result

    # Make sure log has been updated more recently than most recent submit.
    mod_time = os.path.getmtime(self.JOBSUB_LOG)
    self.info('Job log modification time: %s' % time.ctime(mod_time))
    self.info('Submit time: %s' % time.ctime(submit_time))
    if mod_time < submit_time + 60:
        return result

    # Reject logs not refreshed within the polling period (10 s grace).
    log_age = time.time() - mod_time
    if log_age + 10 > self._period:
        return result

    # Bug fix: close the file handle deterministically instead of the
    # leaking open(...).read() of the original.
    with open(self.JOBSUB_LOG, 'r') as log_file:
        contents = log_file.read()
    return (self._check_jobstat_str(contents), contents)
def recover(self, statusCode, istage, run, subrun):
    """Resubmit batch jobs for a failed stage.

    Returns: istage + kSUBMITTED on successful resubmission; the
    unchanged current status when project calls raise; or
    current status + 1000 when no job id could be fetched (the experts
    are e-mailed in that case).
    """
    current_status = statusCode + istage
    error_status = current_status + 1000

    # Report starting
    # self.info()
    self._data = str(self._data)

    # Map the stage digit to its XML stage name.
    stage = self._digit_to_name[istage]

    # Get project and stage object.
    try:
        probj, stobj = project.get_pubs_stage(self._xml_file, '', stage,
                                              run, subrun, self._version)
    except Exception:
        # Narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt
        # still propagate; also fixed the 'raied' typo in the message.
        self.error('Exception raised by project.get_pubs_stage:')
        e = sys.exc_info()
        for item in e:
            self.error(item)
        for line in traceback.format_tb(e[2]):
            self.error(line)
        return current_status

    # Submit job.
    jobid = ''
    try:
        jobid = project.dosubmit(probj, stobj)
    except Exception:
        self.error('Exception raised by project.dosubmit:')
        e = sys.exc_info()
        for item in e:
            self.error(item)
        for line in traceback.format_tb(e[2]):
            self.error(line)
        return current_status

    self.info('Resubmit jobs: xml: %s, stage: %s' % (self._xml_file, stage))

    # Tentatively do so; need to change!!!
    if not jobid:
        self.error('Failed to fetch job log id...')
        subject = 'Failed to fetch job log id while submitting project %s stage %s.' % (
            probj.name, stobj.name)
        text = subject
        text += '\n'
        text += 'Status code is set to %d!\n\n' % error_status
        pub_smtp(receiver=self._experts, subject=subject, text=text)
        return error_status

    # Now grab the parent job id and submit time; entries accumulate as
    # ':'-separated 'jobid+epoch' tokens.
    if self._data is None or self._data == "None" or len(self._data) == 0:
        self._data = '%s+%f' % (jobid, time.time())
    else:
        self._data += ':%s+%f' % (jobid, time.time())

    statusCode = istage + self.kSUBMITTED
    self.info("Resubmitted jobs, job id: %s, status: %d" % (self._data, statusCode))

    # Pretend I'm doing something
    time.sleep(5)

    # Here we may need some checks
    return statusCode
def declare_to_sam(self):
    """Declare one file per (run, subrun) of this project to SAM.

    Per subrun status codes logged to the DB:
      2   declared successfully
      100 input file or its .json metadata missing
      101 file already known to SAM (experts e-mailed)
      102 samweb declareFile raised (experts e-mailed)
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()

    # self.info('Here, self._nruns=%d ... ' % (self._nruns))
    self._project_requirement[0] = 1

    # Fetch runs from DB and process for # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_xtable_runs(self._project_list, self._project_requirement):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))

        # Report starting
        self.info('Declaring a file to SAM: run=%d, subrun=%d ...' % (run, subrun))

        # status_code is the integer DB status; the original reused the
        # name 'status' for both this int and the ds_status object below.
        status_code = 1

        # Check input file exists. Otherwise report error
        in_file_holder = '%s/%s' % (self._in_dir, self._infile_format % (run, subrun))
        filelist = glob.glob(in_file_holder)
        if len(filelist) < 1:
            self.error('ERROR: Failed to find the file for (run,subrun) = %s @ %s !!!'
                       % (run, subrun))
            status_code = 100
            status = ds_status(project=self._project, run=run, subrun=subrun,
                               seq=0, status=status_code)
            self.log_status(status)
            continue
        if len(filelist) > 1:
            self.error('ERROR: Found too many files for (run,subrun) = %s @ %s !!!'
                       % (run, subrun))
            self.error('ERROR: List of files found %s' % filelist)

        in_file = filelist[0]
        in_json = '%s.json' % in_file
        self.info('Declaring ' + in_file + ' to SAM: using ' + in_json)

        if os.path.isfile(in_file) and os.path.isfile(in_json):
            self.info('Found %s' % (in_file))
            self.info('Found %s' % (in_json))
            # Bug fix: close the json file handle (was json.load(open(...))).
            with open(in_json) as json_file:
                json_dict = json.load(json_file)

            # native SAM python call, instead of a system call
            # make sure you've done get-cert
            # Perhaps we want a try block for samweb?
            samweb = samweb_cli.SAMWebClient(experiment="uboone")

            # Check if the file already exists at SAM
            try:
                in_file_base = os.path.basename(in_file)
                samweb.getMetadata(filenameorid=in_file_base)
                status_code = 101
                # Email the experts
                subject = 'File %s Existing at SAM' % in_file_base
                text = """
File %s has already exists at SAM!

""" % in_file_base
                pub_smtp(os.environ['PUB_SMTP_ACCT'],
                         os.environ['PUB_SMTP_SRVR'],
                         os.environ['PUB_SMTP_PASS'],
                         self._experts, subject, text)
            except samweb_cli.exceptions.FileNotFound:
                # metadata already validated in get_assembler_metadata_file.py
                try:
                    samweb.declareFile(md=json_dict)
                    status_code = 2
                except Exception as e:
                    self.error("Unexpected error: samweb declareFile problem: ")
                    self.error("%s" % e)
                    subject = "samweb declareFile problem: %s" % in_file_base
                    # Bug fix: traceback.print_exc() returns None (it
                    # prints to stderr); format_exc() returns the string.
                    text = """
File %s failed to be declared to SAM!

%s
""" % (in_file_base, traceback.format_exc())
                    pub_smtp(os.environ['PUB_SMTP_ACCT'],
                             os.environ['PUB_SMTP_SRVR'],
                             os.environ['PUB_SMTP_PASS'],
                             self._experts, subject, text)
                    self.error("Give this file a status 102")
                    status_code = 102
        else:
            status_code = 100

        # Pretend I'm doing something
        time.sleep(1)

        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project, run=int(x[0]), subrun=int(x[1]),
                           seq=0, status=status_code)
        # Log status
        self.log_status(status)

        # Break from loop if counter became 0
        if not ctr:
            break
def isRunning(self, statusCode, istage, run, subrun):
    """Poll the batch system for the most recently submitted job of this
    (run, subrun) and translate its state into a PUBS status.

    Returns kRUNNING/kFINISHED/kSUBMITTED (+ istage), or the unchanged
    current status when the job status could not be fetched (in which
    case the experts are e-mailed).
    """
    current_status = statusCode + istage
    error_status = current_status + 1000  # kept for parity; unused here

    # self._data holds ':'-separated 'jobid+submit_epoch' tokens;
    # only the most recent one matters.
    self._data = str(self._data)
    newest = self._data.strip().split(':')[-1]
    parts = newest.split('+')
    jobid = parts[0]
    submit_time = float(parts[1]) if len(parts) > 1 else float(0)

    # Prefer the log file; fall back to querying the batch system.
    jobstat = self._jobstat_from_log(submit_time)
    if not jobstat[0]:
        self.warning(
            'Fetching job status from log file failed! Try running cmd...')
        jobstat = self._jobstat_from_cmd(jobid)

    if not jobstat[0]:
        subject = 'Failed to fetch job status!'
        if jobstat[1]:
            text = 'Job log indicates query has failed (see below).\n %s' % jobstat[1]
        else:
            text = 'No job log found...'
        text += '\n'
        text += 'PUBS status remains same (%d)' % current_status
        self.error(subject)
        self.error(text)
        pub_smtp(receiver=self._experts, subject=subject, text=text)
        return current_status

    # Scan every report line for this job id; column 6 holds the state.
    is_running = False
    for report_line in jobstat[1].split('\n'):
        if not report_line.startswith(jobid):
            continue
        state = report_line.split()[5]
        if state == 'X':
            continue
        is_running = True
        if state == 'R':
            statusCode = self.kRUNNING
            break
    if not is_running:
        statusCode = self.kFINISHED

    label = {self.kRUNNING: 'RUNNING',
             self.kFINISHED: 'FINISHED',
             self.kSUBMITTED: 'SUBMITTED'}.get(statusCode, '')
    statusCode += istage
    self.info('jobid: %s ... status: %s (%d)' % (jobid, label, statusCode))
    return statusCode
def check(self, statusCode, istage, run, subrun):
    """Check finished batch jobs for one stage of a (run, subrun).

    Returns (new status + istage): kREADYFORSAM on success,
    kTOBERECOVERED to trigger a resubmission, or current status + 1000
    after too many resubmissions (experts are e-mailed).
    """
    self._data = str(self._data)
    nSubmit = None

    # Get the number of job submissions (':'-separated tokens).
    if self._data is not None and len(self._data) > 0:
        holder = self._data.split(':')
        nSubmit = len(holder)

    # Check the finished jobs
    stage = self._digit_to_name[istage]

    # Get project and stage object.
    try:
        probj, stobj = project.get_pubs_stage(self._xml_file, '', stage,
                                              run, subrun, self._version)
    except Exception:
        # Narrowed from bare 'except:'; fixed the 'raied' typo.
        self.error('Exception raised by project.get_pubs_stage:')
        e = sys.exc_info()
        for item in e:
            self.error(item)
        for line in traceback.format_tb(e[2]):
            self.error(line)
        return statusCode + istage

    # Do check, capturing project.py's stdout/stderr into our own log.
    try:
        real_stdout = sys.stdout
        real_stderr = sys.stderr
        sys.stdout = StringIO.StringIO()
        sys.stderr = StringIO.StringIO()
        project.doshorten(stobj)
        check_status = project.docheck(probj, stobj, ana=False)
        strout = sys.stdout.getvalue()
        strerr = sys.stderr.getvalue()
        sys.stdout = real_stdout
        sys.stderr = real_stderr
        if strout:
            self.info(strout)
        if strerr:
            self.warning(strerr)
    except Exception:
        sys.stdout = real_stdout
        sys.stderr = real_stderr
        self.error('Exception raised by project.docheck:')
        e = sys.exc_info()
        for item in e:
            self.error(item)
        for line in traceback.format_tb(e[2]):
            self.error(line)
        return statusCode + istage

    # Update pubs status.
    if check_status == 0:
        statusCode = self.kREADYFORSAM
        self._data = ''
    elif nSubmit is not None and nSubmit > self._nresubmission:
        # Guarding on nSubmit fixes a None-vs-int comparison (TypeError
        # on Python 3; silently False on Python 2).
        # If the sample has been submitted more than a certain number
        # of times, email the expert, and move on to the next stage
        subject = "MCC jobs fails after %d resubmissions" % nSubmit
        text = """
Sample     : %s
Stage      : %s
Job IDs    : %s
""" % (self._project, self._digit_to_name[istage], self._data.split(':')[2:])
        pub_smtp(os.environ['PUB_SMTP_ACCT'], os.environ['PUB_SMTP_SRVR'],
                 os.environ['PUB_SMTP_PASS'], self._experts, subject, text)
        #statusCode = self.kDONE
        #istage += 10
        statusCode += 1000
    else:
        statusCode = self.kTOBERECOVERED

    statusCode += istage
    self.info("Checked job, status: %d" % statusCode)

    # Pretend I'm doing something
    time.sleep(5)

    # Here we may need some checks
    return statusCode
def recover(self, statusCode, istage, run, subrun):
    """Resubmit batch jobs for a stage flagged for recovery.

    Returns istage + kSUBMITTED on success; the unchanged current
    status when project calls raise; or current status + 1000 when no
    job id could be fetched (experts e-mailed).
    """
    current_status = statusCode + istage
    error_status = current_status + 1000

    # Report starting
    # self.info()
    self._data = str(self._data)

    # Map the stage digit to its XML stage name.
    stage = self._digit_to_name[istage]

    # Get project and stage object.
    try:
        probj, stobj = project.get_pubs_stage(self._xml_file, '', stage,
                                              run, subrun, self._version)
    except Exception:
        # Narrowed from bare 'except:'; fixed the 'raied' typo.
        self.error('Exception raised by project.get_pubs_stage:')
        e = sys.exc_info()
        for item in e:
            self.error(item)
        for line in traceback.format_tb(e[2]):
            self.error(line)
        return current_status

    # Submit job.
    jobid = ''
    try:
        jobid = project.dosubmit(probj, stobj)
    except Exception:
        self.error('Exception raised by project.dosubmit:')
        e = sys.exc_info()
        for item in e:
            self.error(item)
        for line in traceback.format_tb(e[2]):
            self.error(line)
        return current_status

    self.info('Resubmit jobs: xml: %s, stage: %s' % (self._xml_file, stage))

    # Tentatively do so; need to change!!!
    if not jobid:
        self.error('Failed to fetch job log id...')
        subject = 'Failed to fetch job log id while submitting project %s stage %s.' % (
            probj.name, stobj.name)
        text = subject
        text += '\n'
        text += 'Status code is set to %d!\n\n' % error_status
        pub_smtp(receiver=self._experts, subject=subject, text=text)
        return error_status

    # Now grab the parent job id and submit time; entries accumulate as
    # ':'-separated 'jobid+epoch' tokens.
    if self._data is None or self._data == "None" or len(self._data) == 0:
        self._data = '%s+%f' % (jobid, time.time())
    else:
        self._data += ':%s+%f' % (jobid, time.time())

    statusCode = istage + self.kSUBMITTED
    self.info("Resubmitted jobs, job id: %s, status: %d" % (self._data, statusCode))

    # Pretend I'm doing something
    time.sleep(5)

    # Here we may need some checks
    return statusCode
def store(self, statusCode, istage, run, subrun):
    """Upload (store to SAM) the output of the FINAL stage only.

    Non-final stages are advanced straight to the next stage. On
    success of the final stage the status advances past the last stage
    and the experts are e-mailed a completion notice.
    """
    # Only store the final stage.
    # If this is not the final stage, advance to the next stage.
    if istage != self._stage_digits[-1]:
        statusCode = self.kDONE
        istage += 10
        return statusCode + istage

    # Get stage name.
    stage = self._digit_to_name[istage]

    # Get project and stage object.
    try:
        probj, stobj = project.get_pubs_stage(self._xml_file, '', stage,
                                              run, subrun, self._version)
    except Exception:
        # Narrowed from bare 'except:'; fixed the 'raied' typo.
        self.error('Exception raised by project.get_pubs_stage:')
        e = sys.exc_info()
        for item in e:
            self.error(item)
        for line in traceback.format_tb(e[2]):
            self.error(line)
        return statusCode + istage

    # Do store, capturing project.py output into our own log.
    try:
        real_stdout = sys.stdout
        real_stderr = sys.stderr
        sys.stdout = StringIO.StringIO()
        sys.stderr = StringIO.StringIO()

        # Store files (non-analysis outputs first, then analysis ones).
        dim = project_utilities.dimensions(probj, stobj, ana=False)
        store_status = project.docheck_locations(dim, stobj.outdir,
                                                 add=False, clean=False,
                                                 remove=False, upload=True)
        if store_status == 0:
            dim = project_utilities.dimensions(probj, stobj, ana=True)
            store_status = project.docheck_locations(dim, stobj.outdir,
                                                     add=False, clean=False,
                                                     remove=False, upload=True)

        strout = sys.stdout.getvalue()
        strerr = sys.stderr.getvalue()
        sys.stdout = real_stdout
        sys.stderr = real_stderr
        if strout:
            self.info(strout)
        if strerr:
            self.warning(strerr)
    except Exception:
        sys.stdout = real_stdout
        sys.stderr = real_stderr
        self.error('Exception raised by project.docheck_locations:')
        e = sys.exc_info()
        for item in e:
            self.error(item)
        for line in traceback.format_tb(e[2]):
            self.error(line)
        return statusCode + istage

    # Update pubs status.
    if store_status == 0:
        statusCode = self.kDONE
        istage += 10

    # Pretend I'm doing something
    time.sleep(5)

    # If all the stages complete, send an email to experts
    if istage not in self._stage_digits:
        subject = "Completed: MCC sample %s" % self._project
        text = """
Sample     : %s
Stage      : %s
""" % (self._project, self._digit_to_name[istage - 10])
        pub_smtp(os.environ['PUB_SMTP_ACCT'], os.environ['PUB_SMTP_SRVR'],
                 os.environ['PUB_SMTP_PASS'], self._experts, subject, text)

    statusCode += istage
    self.info("SAM store, status: %d" % statusCode)

    # Pretend I'm doing something
    time.sleep(5)

    # Here we may need some checks
    return statusCode
def calculate_checksum(self):
    """Compute the enstore checksum for each run/subrun input file of
    this project (status 1) and log it to the DB.

    Per subrun status: 0 success (checksum stored in data), 100
    checksum calculation failed, 200 input file not found; both failure
    modes e-mail the experts.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    self.info('Here, self._nruns=%d ... ' % (self._nruns))

    # Fetch runs from DB and process for # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_runs(self._project, 1):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))

        # Report starting
        self.info('Calculating the file checksum: run=%d, subrun=%d ...' % (run, subrun))

        statusCode = 1
        in_file_name = self._infile_format % (run, subrun)
        in_file_holder = '%s/%s' % (self._in_dir, in_file_name)
        filelist = glob.glob(in_file_holder)
        if len(filelist) > 1:
            self.error('ERROR: There is more than one file matching that pattern: %s'
                       % in_file_name)
        if len(filelist) < 1:
            errorMessage = "Failed to find file%s" % in_file_holder
            subject = "get_checksum_temp Failed to find file%s" % in_file_holder
            # Bug fix: this branch formatted the undefined name 'in_file'
            # (set only in the else branch below), raising a NameError on
            # the first iteration or reusing a stale value afterwards.
            text = """File: %s

Error message:

%s
""" % (in_file_holder, errorMessage)
            pub_smtp(os.environ['PUB_SMTP_ACCT'], os.environ['PUB_SMTP_SRVR'],
                     os.environ['PUB_SMTP_PASS'], self._experts, subject, text)
            statusCode = 200
        else:
            in_file = filelist[0]
            metadata = {}
            try:
                metadata['crc'] = samweb_client.utility.fileEnstoreChecksum(in_file)
                self._data = metadata['crc']['crc_value']
                statusCode = 0
            except Exception:
                # Bug fix: traceback.print_exc() returns None (it prints
                # to stderr); format_exc() returns the traceback string.
                errorMessage = traceback.format_exc()
                subject = 'Failed to obtain the checksum of the file %s' % in_file
                text = """File: %s

Error message:

%s
""" % (in_file, errorMessage)
                pub_smtp(os.environ['PUB_SMTP_ACCT'], os.environ['PUB_SMTP_SRVR'],
                         os.environ['PUB_SMTP_PASS'], self._experts, subject, text)
                statusCode = 100

        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project, run=run, subrun=subrun,
                           seq=0, status=statusCode, data=self._data)
        # Log status
        self.log_status(status)

        # Break from loop if counter became 0
        if not ctr:
            break
def isRunning(self, statusCode, istage, run, subrun):
    """Translate the batch state of the latest submitted job for this
    (run, subrun) into a PUBS status code.

    Returns kRUNNING/kFINISHED/kSUBMITTED plus istage, or the unchanged
    current status (with an e-mail to the experts) when the job status
    could not be fetched at all.
    """
    current_status = statusCode + istage
    error_status = current_status + 1000  # kept for parity; unused here

    # The data field accumulates ':'-separated 'jobid+submit_epoch'
    # tokens; only the last token is relevant.
    self._data = str(self._data)
    token = self._data.strip().split(':')[-1].split('+')
    jobid = token[0]
    if len(token) > 1:
        submit_time = float(token[1])
    else:
        submit_time = float(0)

    # Try the log file first, then fall back to the batch command.
    jobstat = self._jobstat_from_log(submit_time)
    if not jobstat[0]:
        self.warning('Fetching job status from log file failed! Try running cmd...')
        jobstat = self._jobstat_from_cmd(jobid)

    if not jobstat[0]:
        subject = 'Failed to fetch job status!'
        if not jobstat[1]:
            text = 'No job log found...'
        else:
            text = 'Job log indicates query has failed (see below).\n %s' % jobstat[1]
        text += '\n'
        text += 'PUBS status remains same (%d)' % current_status
        self.error(subject)
        self.error(text)
        pub_smtp(receiver=self._experts, subject=subject, text=text)
        return current_status

    # Column 6 of each matching report line holds the job state.
    is_running = False
    for entry in [l for l in jobstat[1].split('\n') if l.startswith(jobid)]:
        state = entry.split()[5]
        if state == 'X':
            continue
        is_running = True
        if state == 'R':
            statusCode = self.kRUNNING
            break
    if not is_running:
        statusCode = self.kFINISHED

    label = ''
    if statusCode == self.kRUNNING:
        label = 'RUNNING'
    elif statusCode == self.kFINISHED:
        label = 'FINISHED'
    elif statusCode == self.kSUBMITTED:
        label = 'SUBMITTED'
    statusCode += istage
    self.info('jobid: %s ... status: %s (%d)' % (jobid, label, statusCode))
    return statusCode
def check(self, statusCode, istage, run, subrun):
    """Check the finished batch jobs of one stage for a (run, subrun).

    Returns (new status + istage): kREADYFORSAM on success,
    kTOBERECOVERED to trigger resubmission, or current status + 1000
    after too many resubmissions (the experts are e-mailed).
    """
    self._data = str(self._data)
    nSubmit = None

    # Get the number of job submissions (':'-separated tokens).
    if self._data is not None and len(self._data) > 0:
        holder = self._data.split(':')
        nSubmit = len(holder)

    # Check the finished jobs
    stage = self._digit_to_name[istage]

    # Get project and stage object.
    try:
        probj, stobj = project.get_pubs_stage(self._xml_file, '', stage,
                                              run, subrun, self._version)
    except Exception:
        # Narrowed from bare 'except:'; fixed the 'raied' typo.
        self.error('Exception raised by project.get_pubs_stage:')
        e = sys.exc_info()
        for item in e:
            self.error(item)
        for line in traceback.format_tb(e[2]):
            self.error(line)
        return statusCode + istage

    # Do check, capturing project.py's stdout/stderr into our own log.
    try:
        real_stdout = sys.stdout
        real_stderr = sys.stderr
        sys.stdout = StringIO.StringIO()
        sys.stderr = StringIO.StringIO()
        project.doshorten(stobj)
        check_status = project.docheck(probj, stobj, ana=False)
        strout = sys.stdout.getvalue()
        strerr = sys.stderr.getvalue()
        sys.stdout = real_stdout
        sys.stderr = real_stderr
        if strout:
            self.info(strout)
        if strerr:
            self.warning(strerr)
    except Exception:
        sys.stdout = real_stdout
        sys.stderr = real_stderr
        self.error('Exception raised by project.docheck:')
        e = sys.exc_info()
        for item in e:
            self.error(item)
        for line in traceback.format_tb(e[2]):
            self.error(line)
        return statusCode + istage

    # Update pubs status.
    if check_status == 0:
        statusCode = self.kREADYFORSAM
        self._data = ''
    elif nSubmit is not None and nSubmit > self._nresubmission:
        # Guarding on nSubmit fixes a None-vs-int comparison (TypeError
        # on Python 3; silently False on Python 2).
        # If the sample has been submitted more than a certain number
        # of times, email the expert, and move on to the next stage
        subject = "MCC jobs fails after %d resubmissions" % nSubmit
        text = """
Sample     : %s
Stage      : %s
Job IDs    : %s
""" % (self._project, self._digit_to_name[istage], self._data.split(':')[2:])
        pub_smtp(os.environ['PUB_SMTP_ACCT'], os.environ['PUB_SMTP_SRVR'],
                 os.environ['PUB_SMTP_PASS'], self._experts, subject, text)
        #statusCode = self.kDONE
        #istage += 10
        statusCode += 1000
    else:
        statusCode = self.kTOBERECOVERED

    statusCode += istage
    self.info("Checked job, status: %d" % statusCode)

    # Pretend I'm doing something
    time.sleep(5)

    # Here we may need some checks
    return statusCode
def check_db(self):
    """Verify a checksum exists in the DB for each (run, subrun) of this
    project at status 2; log 0 when present, 100 (plus an expert
    e-mail) when missing."""
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    self.info('Here, self._nruns=%d ... ' % (self._nruns))

    # Fetch runs from DB and process for # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_runs(self._project, 2):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))

        # Report starting
        self.info('Calculating the file checksum: run=%d, subrun=%d ...'
                  % (run, subrun))

        statusCode = 2
        in_file_name = self._infile_format % (run, subrun)
        in_file = '%s/%s' % (self._in_dir, in_file_name)

        # Get status object
        status = self._api.get_status(
            ds_status(self._project, x[0], x[1], x[2]))
        self._data = status._data

        # Bug fix: test presence BEFORE the str() conversion — the
        # original str()'d first, so a missing (None) checksum became
        # the truthy string 'None' and was marked as present.
        if self._data:
            statusCode = 0
        else:
            subject = 'Checksum of the file %s not in database' % in_file
            text = """File: %s

Checksum is not in database

""" % (in_file)
            pub_smtp(os.environ['PUB_SMTP_ACCT'], os.environ['PUB_SMTP_SRVR'],
                     os.environ['PUB_SMTP_PASS'], self._experts, subject, text)
            statusCode = 100
        self._data = str(self._data)

        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project, run=run, subrun=subrun,
                           seq=0, status=statusCode, data=self._data)
        # Log status
        self.log_status(status)

        # Break from loop if counter became 0
        if not ctr:
            break
def store(self, statusCode, istage, run, subrun):
    """Store (upload to SAM) the output of the FINAL stage only;
    non-final stages advance straight to the next stage.

    On completion of the final stage the experts receive a
    sample-completed e-mail.
    """
    # Only store the final stage.
    # If this is not the final stage, advance to the next stage.
    if istage != self._stage_digits[-1]:
        statusCode = self.kDONE
        istage += 10
        return statusCode + istage

    # Get stage name.
    stage = self._digit_to_name[istage]

    # Get project and stage object.
    try:
        probj, stobj = project.get_pubs_stage(self._xml_file, '', stage,
                                              run, subrun, self._version)
    except Exception:
        # Narrowed from bare 'except:'; fixed the 'raied' typo.
        self.error('Exception raised by project.get_pubs_stage:')
        e = sys.exc_info()
        for item in e:
            self.error(item)
        for line in traceback.format_tb(e[2]):
            self.error(line)
        return statusCode + istage

    # Do store, capturing project.py output into our own log.
    try:
        real_stdout = sys.stdout
        real_stderr = sys.stderr
        sys.stdout = StringIO.StringIO()
        sys.stderr = StringIO.StringIO()

        # Store files (non-analysis outputs first, then analysis ones).
        dim = project_utilities.dimensions(probj, stobj, ana=False)
        store_status = project.docheck_locations(dim, stobj.outdir,
                                                 add=False, clean=False,
                                                 remove=False, upload=True)
        if store_status == 0:
            dim = project_utilities.dimensions(probj, stobj, ana=True)
            store_status = project.docheck_locations(dim, stobj.outdir,
                                                     add=False, clean=False,
                                                     remove=False, upload=True)

        strout = sys.stdout.getvalue()
        strerr = sys.stderr.getvalue()
        sys.stdout = real_stdout
        sys.stderr = real_stderr
        if strout:
            self.info(strout)
        if strerr:
            self.warning(strerr)
    except Exception:
        sys.stdout = real_stdout
        sys.stderr = real_stderr
        self.error('Exception raised by project.docheck_locations:')
        e = sys.exc_info()
        for item in e:
            self.error(item)
        for line in traceback.format_tb(e[2]):
            self.error(line)
        return statusCode + istage

    # Update pubs status.
    if store_status == 0:
        statusCode = self.kDONE
        istage += 10

    # Pretend I'm doing something
    time.sleep(5)

    # If all the stages complete, send an email to experts
    if istage not in self._stage_digits:
        subject = "Completed: MCC sample %s" % self._project
        text = """
Sample     : %s
Stage      : %s
""" % (self._project, self._digit_to_name[istage - 10])
        pub_smtp(os.environ['PUB_SMTP_ACCT'], os.environ['PUB_SMTP_SRVR'],
                 os.environ['PUB_SMTP_PASS'], self._experts, subject, text)

    statusCode += istage
    self.info("SAM store, status: %d" % statusCode)

    # Pretend I'm doing something
    time.sleep(5)

    # Here we may need some checks
    return statusCode
def compare(self):
    """Compare stored checksums between the reference and parent
    projects for each (run, subrun); log 0 on agreement or 1000 on
    mismatch (with an e-mail to the experts)."""
    # Abort if the DB connection cannot be established.
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    self.get_resource()

    # Process up to self._nruns (run, subrun) pairs.
    remaining = self._nruns
    for x in self.get_xtable_runs(
            [self._project, self._ref_project, self._parent_project],
            [1, 0, 0]):
        remaining -= 1

        # Sequence number is currently hard-coded to 0.
        (run, subrun, seq) = (int(x[0]), int(x[1]), 0)

        stamp = time.strftime('%Y-%m-%d %H:%M:%S')
        self.info('Comparing checksums: run=%d, subrun=%d @ %s'
                  % (run, subrun, stamp))

        statusCode = 1

        # Fetch both checksum records.
        ref_status = self._api.get_status(
            ds_status(self._ref_project, run, subrun, seq))
        parent_status = self._api.get_status(
            ds_status(self._parent_project, run, subrun, seq))

        if ref_status._data == parent_status._data:
            statusCode = 0
        else:
            subject = 'Checksum different in run %d, subrun %d between %s and %s' % (
                run, subrun, self._ref_project, self._parent_project)
            body = '%s\n' % subject
            body += 'Run %d, subrun %d\n' % (run, subrun)
            body += '%s checksum: %s\n' % (self._ref_project, ref_status._data)
            body += '%s checksum: %s\n' % (self._parent_project, parent_status._data)
            pub_smtp(os.environ['PUB_SMTP_ACCT'], os.environ['PUB_SMTP_SRVR'],
                     os.environ['PUB_SMTP_PASS'], self._experts, subject, body)
            statusCode = 1000

        self._data = '%s:%s;%s:%s' % (self._ref_project, ref_status._data,
                                      self._parent_project, parent_status._data)

        # Pretend I'm doing something
        time.sleep(0.5)

        stamp = time.strftime('%Y-%m-%d %H:%M:%S')
        self.info('Finished comparing checksums: run=%d, subrun=%d @ %s'
                  % (run, subrun, stamp))

        # Log the comparison result for this subrun.
        status = ds_status(project=self._project, run=int(x[0]), subrun=int(x[1]),
                           seq=seq, status=statusCode, data=self._data)
        self.log_status(status)

        # Stop once the per-instance run quota is exhausted.
        if not remaining:
            break
def compare( self ):
    """Compare checksum data between the reference and parent projects.

    Logs status 0 when the two projects' stored ``_data`` (checksums)
    agree for a (run, subrun), and status 1000 plus an expert e-mail
    when they differ.

    NOTE(review): this method is defined twice in this file; this later
    definition is the one that takes effect at class-creation time.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    self.get_resource()

    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_xtable_runs( [self._project,
                                    self._ref_project,
                                    self._parent_project],
                                   [1, 0, 0] ):
        # Counter decreases by 1
        ctr -=1

        # Currently hard-coded seq = 0
        seq = 0
        (run, subrun, seq) = (int(x[0]), int(x[1]), 0)

        # Report starting
        now_str = time.strftime('%Y-%m-%d %H:%M:%S')
        self.info('Comparing checksums: run=%d, subrun=%d @ %s' % ( run, subrun, now_str ))

        # 1 = undetermined; overwritten with 0 (match) or 1000 (mismatch).
        statusCode = 1

        # Get status objects for the two projects being compared.
        RefStatus = self._api.get_status( ds_status( self._ref_project, run, subrun, seq ))
        ParentStatus = self._api.get_status( ds_status( self._parent_project, run, subrun, seq ))

        if RefStatus._data == ParentStatus._data:
            statusCode = 0
        else:
            # Mismatch: e-mail the experts with both checksums.
            subject = 'Checksum different in run %d, subrun %d between %s and %s' % ( run, subrun, self._ref_project, self._parent_project )
            text = '%s\n' % subject
            text += 'Run %d, subrun %d\n' % ( run, subrun )
            text += '%s checksum: %s\n' % ( self._ref_project, RefStatus._data )
            text += '%s checksum: %s\n' % ( self._parent_project, ParentStatus._data )
            pub_smtp( os.environ['PUB_SMTP_ACCT'],
                      os.environ['PUB_SMTP_SRVR'],
                      os.environ['PUB_SMTP_PASS'],
                      self._experts, subject, text )
            statusCode = 1000
            # NOTE(review): self._data is refreshed only on mismatch; on a
            # match, whatever it held before is logged below — confirm that
            # is intended.
            self._data = '%s:%s;%s:%s' % ( self._ref_project, RefStatus._data,
                                           self._parent_project, ParentStatus._data )

        # Pretend I'm doing something
        time.sleep(0.5)

        # Report finishing
        now_str = time.strftime('%Y-%m-%d %H:%M:%S')
        self.info('Finished comparing checksums: run=%d, subrun=%d @ %s' % ( run, subrun, now_str ))

        # Create a status object to be logged to DB (if necessary)
        status = ds_status( project = self._project,
                            run     = int(x[0]),
                            subrun  = int(x[1]),
                            seq     = seq,
                            status  = statusCode,
                            data    = self._data )

        # Log status
        self.log_status( status )

        # Break from loop if counter became 0
        if not ctr: break
def compare_dropbox_checksum( self ):
    """Compare the reference project's checksum against the dropbox copy.

    For each (run, subrun), converts the stored 0-seeded adler32 of the
    reference project and the PNFS 1-seeded adler32 of the dropbox file
    and compares them.  If the dropbox file is gone (LookupError), falls
    back to SAM metadata to look for an enstore checksum on tape.

    NOTE(review): this method is defined twice in this file; the later
    definition is the one that takes effect at class-creation time.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
        self.info('Here, self._nruns=%d ... ' % (self._nruns))

    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_xtable_runs( [self._project, self._parent_project], [1, 0] ):
        # Counter decreases by 1
        ctr -= 1

        (run, subrun) = (int(x[0]), int(x[1]))

        # Report starting
        self.info('Calculating the file checksum: run=%d, subrun=%d ...' % (run,subrun))

        # 1 = undetermined; 0 match, 10 non-enstore, 100 missing, 1000 mismatch.
        statusCode = 1

        in_file_holder = '%s/%s' % (self._in_dir,self._infile_format % (run,subrun))
        filelist = glob.glob( in_file_holder )
        if (len(filelist)<1):
            # No input file: log status 100 and move to the next pair.
            self.error('ERROR: Failed to find the file for (run,subrun) = %s @ %s !!!' % (run,subrun))
            status_code=100
            status = ds_status( project = self._project,
                                run     = run,
                                subrun  = subrun,
                                seq     = 0,
                                status  = status_code )
            self.log_status( status )
            continue
        if (len(filelist)>1):
            # Ambiguous match: warn but proceed with the first file.
            self.error('ERROR: Found too many files for (run,subrun) = %s @ %s !!!' % (run,subrun))
            self.error('ERROR: List of files found %s' % filelist)

        in_file = filelist[0]
        in_file_name = os.path.basename(in_file)
        out_file = '%s/%s' % ( self._out_dir, in_file_name )

        # Note that this has the sequence number hard coded as number 0
        RefStatus = self._api.get_status( ds_status(self._ref_project, run, subrun, 0))
        near1_checksum = RefStatus._data

        try:
            pnfs_adler32_1, pnfs_size = get_pnfs_1_adler32_and_size( out_file )
            # Convert the 0-seeded adler32 to 1-seeded so both sides compare.
            near1_adler32_1 = convert_0_adler32_to_1_adler32(near1_checksum, pnfs_size)
            if near1_adler32_1 == pnfs_adler32_1:
                statusCode = 0
            else:
                # Mismatch: e-mail the experts with both converted checksums.
                subject = 'Checksum different in run %d, subrun %d between %s and PNFS' % ( run, subrun, self._ref_project )
                text = '%s\n' % subject
                text += 'Run %d, subrun %d\n' % ( run, subrun )
                text += 'Converted %s checksum: %s\n' % ( self._ref_project, near1_adler32_1 )
                text += 'Converted PNFS checksum: %s\n' % ( pnfs_adler32_1 )
                pub_smtp( os.environ['PUB_SMTP_ACCT'],
                          os.environ['PUB_SMTP_SRVR'],
                          os.environ['PUB_SMTP_PASS'],
                          self._experts, subject, text )
                statusCode = 1000
                self._data = '%s:%s;PNFS:%s' % ( self._ref_project, near1_adler32_1, pnfs_adler32_1 )
        except LookupError:
            # Dropbox copy is gone; fall back to SAM/enstore metadata.
            self.warning("Could not find file in the dropbox %s" % out_file)
            self.warning("Gonna go looking on tape %s" % in_file_name)
            samweb = samweb_cli.SAMWebClient(experiment="uboone")
            meta = {}
            try:
                meta = samweb.getMetadata(filenameorid=in_file_name)
                checksum_info = meta['checksum'][0].split(':')
                if checksum_info[0] == 'enstore':
                    self._data = checksum_info[1]
                    statusCode = 0
                else:
                    statusCode = 10
            except samweb_cli.exceptions.FileNotFound:
                subject = 'Failed to locate file %s at SAM' % in_file
                text = 'File %s is not found at SAM!' % in_file
                pub_smtp( os.environ['PUB_SMTP_ACCT'],
                          os.environ['PUB_SMTP_SRVR'],
                          os.environ['PUB_SMTP_PASS'],
                          self._experts, subject, text )
                statusCode = 100

        # Create a status object to be logged to DB (if necessary)
        # NOTE(review): self._data is not reset per iteration, so on the
        # statusCode 0/10/100 paths a value from an earlier subrun may be
        # logged here — confirm intended.
        status = ds_status( project = self._project,
                            run     = run,
                            subrun  = subrun,
                            seq     = 0,
                            status  = statusCode,
                            data    = self._data )

        # Log status
        self.log_status( status )

        # Break from loop if counter became 0
        if not ctr: break
def declare_to_sam( self ):
    """Declare finished files to SAM using their pre-generated JSON metadata.

    For each (run, subrun) whose first project in the chain completed,
    locate the file and its ``<file>.json`` sidecar, then declare it to
    SAM.  Outcomes logged to the DB: 2 declared, 100 file/json missing,
    101 already at SAM, 102 declareFile failed.

    Fixes vs. original:
      * ``traceback.print_exc()`` returns None (it prints to stderr), so
        the failure e-mail body contained the literal "None"; use
        ``traceback.format_exc()`` which returns the traceback string.
      * ``json.load(open(...))`` leaked the file handle; use ``with``.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()

    # Only pick up runs whose first project in the chain has completed.
    self._project_requirement[0] = 1

    # Process at most self._nruns (run, subrun) pairs per invocation.
    ctr = self._nruns
    for x in self.get_xtable_runs(self._project_list, self._project_requirement):
        # Counter decreases by 1
        ctr -= 1

        (run, subrun) = (int(x[0]), int(x[1]))

        # Report starting
        self.info('Declaring a file to SAM: run=%d, subrun=%d ...' % (run, subrun))

        # DB status code for this pair; 1 = undetermined.
        status_code = 1

        # Check input file exists. Otherwise report error
        in_file_holder = '%s/%s' % (self._in_dir, self._infile_format % (run, subrun))
        filelist = glob.glob(in_file_holder)
        if len(filelist) < 1:
            self.error('ERROR: Failed to find the file for (run,subrun) = %s @ %s !!!' % (run, subrun))
            self.log_status(ds_status(project=self._project,
                                      run=run, subrun=subrun, seq=0,
                                      status=100))
            continue
        if len(filelist) > 1:
            # Ambiguous match: warn but proceed with the first file.
            self.error('ERROR: Found too many files for (run,subrun) = %s @ %s !!!' % (run, subrun))
            self.error('ERROR: List of files found %s' % filelist)

        in_file = filelist[0]
        in_json = '%s.json' % in_file
        self.info('Declaring ' + in_file + ' to SAM: using ' + in_json)

        if os.path.isfile(in_file) and os.path.isfile(in_json):
            self.info('Found %s' % (in_file))
            self.info('Found %s' % (in_json))
            # FIX: close the metadata file deterministically.
            with open(in_json) as json_file:
                json_dict = json.load(json_file)
            # native SAM python call, instead of a system call
            # make sure you've done get-cert
            samweb = samweb_cli.SAMWebClient(experiment="uboone")
            in_file_base = os.path.basename(in_file)
            # Check if the file already exists at SAM: getMetadata raising
            # FileNotFound means it is safe to declare.
            try:
                samweb.getMetadata(filenameorid=in_file_base)
                status_code = 101
                # Email the experts
                subject = 'File %s Existing at SAM' % in_file_base
                text = """
                File %s has already exists at SAM!
                """ % in_file_base
                pub_smtp(os.environ['PUB_SMTP_ACCT'],
                         os.environ['PUB_SMTP_SRVR'],
                         os.environ['PUB_SMTP_PASS'],
                         self._experts, subject, text)
            except samweb_cli.exceptions.FileNotFound:
                # metadata already validated in get_assembler_metadata_file.py
                try:
                    samweb.declareFile(md=json_dict)
                    status_code = 2
                except Exception as e:
                    self.error("Unexpected error: samweb declareFile problem: ")
                    self.error("%s" % e)
                    subject = "samweb declareFile problem: %s" % in_file_base
                    # FIX: format_exc() returns the traceback as a string;
                    # print_exc() returned None, so the e-mail said "None".
                    text = """
                    File %s failed to be declared to SAM!
                    %s
                    """ % (in_file_base, traceback.format_exc())
                    pub_smtp(os.environ['PUB_SMTP_ACCT'],
                             os.environ['PUB_SMTP_SRVR'],
                             os.environ['PUB_SMTP_PASS'],
                             self._experts, subject, text)
                    self.error("Give this file a status 102")
                    status_code = 102
        else:
            # File or its JSON sidecar is missing.
            status_code = 100

        # Pretend I'm doing something
        time.sleep(1)

        # Create a status object to be logged to DB (if necessary)
        self.log_status(ds_status(project=self._project,
                                  run=int(x[0]),
                                  subrun=int(x[1]),
                                  seq=0,
                                  status=status_code))

        # Break from loop if counter became 0
        if not ctr:
            break
def calculate_checksum( self ):
    """Compute the enstore CRC of each run's input file and log it to the DB.

    For each (run, subrun) in status 1 for this project, compute the file's
    enstore checksum via samweb_client and store it in the status record
    (status 0).  Status 200 = file not found, 100 = checksum failed.

    Fixes vs. original:
      * the file-not-found e-mail referenced ``in_file``, which is unbound
        in that branch (NameError); report the glob pattern instead.
      * ``traceback.print_exc()`` returns None (it prints to stderr), so
        the failure e-mail contained "None"; use ``format_exc()``.
      * ``self._data`` is reset per iteration so a failing subrun no longer
        logs the checksum left over from the previous one.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    self.info('Here, self._nruns=%d ... ' % (self._nruns))

    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_runs(self._project, 1):
        # Counter decreases by 1
        ctr -= 1

        (run, subrun) = (int(x[0]), int(x[1]))

        # Report starting
        self.info('Calculating the file checksum: run=%d, subrun=%d ...' % (run, subrun))

        # 1 = undetermined; 0 success, 100 checksum failure, 200 no file.
        statusCode = 1
        # FIX: reset so failure paths never log a previous subrun's checksum.
        self._data = ''

        in_file_name = self._infile_format % (run, subrun)
        in_file_holder = '%s/%s' % (self._in_dir, in_file_name)
        filelist = glob.glob(in_file_holder)
        if len(filelist) > 1:
            # Ambiguous match: warn but proceed with the first file.
            self.error('ERROR: There is more than one file matching that pattern: %s' % in_file_name)
        if len(filelist) < 1:
            errorMessage = "Failed to find file%s" % in_file_holder
            subject = "get_checksum_temp Failed to find file%s" % in_file_holder
            # FIX: original used `in_file` here, which is unbound when no
            # file matched — report the glob pattern instead.
            text = """File: %s
            Error message: %s
            """ % (in_file_holder, errorMessage)
            pub_smtp(os.environ['PUB_SMTP_ACCT'],
                     os.environ['PUB_SMTP_SRVR'],
                     os.environ['PUB_SMTP_PASS'],
                     self._experts, subject, text)
            statusCode = 200
        else:
            in_file = filelist[0]
            metadata = {}
            try:
                metadata['crc'] = samweb_client.utility.fileEnstoreChecksum(in_file)
                self._data = metadata['crc']['crc_value']
                statusCode = 0
            except Exception:
                # FIX: format_exc() returns the traceback as a string;
                # print_exc() returned None.
                errorMessage = traceback.format_exc()
                subject = 'Failed to obtain the checksum of the file %s' % in_file
                text = """File: %s
                Error message: %s
                """ % (in_file, errorMessage)
                pub_smtp(os.environ['PUB_SMTP_ACCT'],
                         os.environ['PUB_SMTP_SRVR'],
                         os.environ['PUB_SMTP_PASS'],
                         self._experts, subject, text)
                statusCode = 100

        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project,
                           run=run,
                           subrun=subrun,
                           seq=0,
                           status=statusCode,
                           data=self._data)

        # Log status
        self.log_status(status)

        # Break from loop if counter became 0
        if not ctr:
            break
def compare_dropbox_checksum(self):
    """Compare the reference project's checksum against the dropbox copy.

    For each (run, subrun), converts the reference project's 0-seeded
    adler32 to 1-seeded and compares it with the PNFS adler32 of the
    dropbox file.  If the dropbox file is gone (LookupError), falls back
    to SAM metadata looking for an enstore checksum on tape.  Outcomes:
    0 match/enstore found, 10 non-enstore checksum, 100 missing file or
    not at SAM, 1000 mismatch.

    Fix vs. original: ``self._data`` was only assigned on the mismatch
    and enstore paths, so the 0/10/100 outcomes logged stale checksum
    data left over from an earlier subrun; it is now reset per iteration.
    (This definition shadows an identical earlier one in this file.)
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    self.info('Here, self._nruns=%d ... ' % (self._nruns))

    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_xtable_runs([self._project, self._parent_project], [1, 0]):
        # Counter decreases by 1
        ctr -= 1

        (run, subrun) = (int(x[0]), int(x[1]))

        # Report starting
        self.info('Calculating the file checksum: run=%d, subrun=%d ...' % (run, subrun))

        statusCode = 1
        # FIX: never log a previous subrun's checksum with this one's status.
        self._data = ''

        in_file_holder = '%s/%s' % (self._in_dir, self._infile_format % (run, subrun))
        filelist = glob.glob(in_file_holder)
        if len(filelist) < 1:
            self.error('ERROR: Failed to find the file for (run,subrun) = %s @ %s !!!' % (run, subrun))
            self.log_status(ds_status(project=self._project,
                                      run=run, subrun=subrun, seq=0,
                                      status=100))
            continue
        if len(filelist) > 1:
            # Ambiguous match: warn but proceed with the first file.
            self.error('ERROR: Found too many files for (run,subrun) = %s @ %s !!!' % (run, subrun))
            self.error('ERROR: List of files found %s' % filelist)

        in_file = filelist[0]
        in_file_name = os.path.basename(in_file)
        out_file = '%s/%s' % (self._out_dir, in_file_name)

        # Note that this has the sequence number hard coded as number 0
        RefStatus = self._api.get_status(ds_status(self._ref_project, run, subrun, 0))
        near1_checksum = RefStatus._data

        try:
            pnfs_adler32_1, pnfs_size = get_pnfs_1_adler32_and_size(out_file)
            # Convert the 0-seeded adler32 to 1-seeded so both sides compare.
            near1_adler32_1 = convert_0_adler32_to_1_adler32(near1_checksum, pnfs_size)
            if near1_adler32_1 == pnfs_adler32_1:
                statusCode = 0
            else:
                # Mismatch: e-mail the experts with both converted checksums.
                subject = 'Checksum different in run %d, subrun %d between %s and PNFS' % (
                    run, subrun, self._ref_project)
                text = '%s\n' % subject
                text += 'Run %d, subrun %d\n' % (run, subrun)
                text += 'Converted %s checksum: %s\n' % (self._ref_project, near1_adler32_1)
                text += 'Converted PNFS checksum: %s\n' % (pnfs_adler32_1)
                pub_smtp(os.environ['PUB_SMTP_ACCT'],
                         os.environ['PUB_SMTP_SRVR'],
                         os.environ['PUB_SMTP_PASS'],
                         self._experts, subject, text)
                statusCode = 1000
                self._data = '%s:%s;PNFS:%s' % (
                    self._ref_project, near1_adler32_1, pnfs_adler32_1)
        except LookupError:
            # Dropbox copy is gone; fall back to SAM/enstore metadata.
            self.warning("Could not find file in the dropbox %s" % out_file)
            self.warning("Gonna go looking on tape %s" % in_file_name)
            samweb = samweb_cli.SAMWebClient(experiment="uboone")
            try:
                meta = samweb.getMetadata(filenameorid=in_file_name)
                checksum_info = meta['checksum'][0].split(':')
                if checksum_info[0] == 'enstore':
                    self._data = checksum_info[1]
                    statusCode = 0
                else:
                    statusCode = 10
            except samweb_cli.exceptions.FileNotFound:
                subject = 'Failed to locate file %s at SAM' % in_file
                text = 'File %s is not found at SAM!' % in_file
                pub_smtp(os.environ['PUB_SMTP_ACCT'],
                         os.environ['PUB_SMTP_SRVR'],
                         os.environ['PUB_SMTP_PASS'],
                         self._experts, subject, text)
                statusCode = 100

        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project,
                           run=run,
                           subrun=subrun,
                           seq=0,
                           status=statusCode,
                           data=self._data)

        # Log status
        self.log_status(status)

        # Break from loop if counter became 0
        if not ctr:
            break