def check_db( self ):
    """Verify that each run's file checksum is recorded in the DB.

    For every (run, subrun) of this project in status 2, fetch the stored
    status record; if it carries checksum data, mark the run done (status 0),
    otherwise e-mail the experts and flag it failed (status 100).
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()

    self.info('Here, self._nruns=%d ... ' % (self._nruns))

    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_runs( self._project, 2 ):

        # Counter decreases by 1
        ctr -= 1

        (run, subrun) = (int(x[0]), int(x[1]))

        # Report starting
        self.info('Calculating the file checksum: run=%d, subrun=%d ...' % (run,subrun))

        statusCode = 2
        in_file_name = self._infile_format % ( run, subrun )
        in_file = '%s/%s' % ( self._in_dir, in_file_name )

        # Get status object
        status = self._api.get_status(ds_status(self._project, x[0], x[1], x[2]))
        self._data = status._data

        # BUG FIX: the original stringified first, so a missing checksum
        # (None) became the truthy string 'None' and was treated as present.
        # Test the raw value before converting.
        if self._data:
            self._data = str( self._data )
            statusCode = 0
        else:
            # Normalize missing data to an empty string for the DB record.
            self._data = ''
            subject = 'Checksum of the file %s not in database' % in_file
            text = """File: %s

Checksum is not in database

""" % ( in_file )
            pub_smtp( os.environ['PUB_SMTP_ACCT'],
                      os.environ['PUB_SMTP_SRVR'],
                      os.environ['PUB_SMTP_PASS'],
                      self._experts, subject, text )
            statusCode = 100

        # Create a status object to be logged to DB (if necessary)
        status = ds_status( project = self._project,
                            run     = run,
                            subrun  = subrun,
                            seq     = 0,
                            status  = statusCode,
                            data    = self._data )

        # Log status
        self.log_status( status )

        # Break from loop if counter became 0
        if not ctr: break
def process(self): ctr = self._nruns #return # Kazu's version of submit jobs stage_v = list(self._stage_digits) stage_v.reverse() status_v = list(self.PROD_STATUS) status_v.reverse() # temporary fix: record the processed run and only process once per process function call processed_run = [] for istage in stage_v: # self.warning('Inspecting stage %s @ %s' % (istage,self.now_str())) for istatus in status_v: fstatus = istage + istatus self.debug('Inspecting status %s @ %s' % (fstatus, self.now_str())) for x in self.get_runs(self._project, fstatus): run = int(x[0]) subrun = int(x[1]) runid = (run, subrun) if self._max_runid and runid > self._max_runid: continue if runid in processed_run: continue processed_run.append(runid) self.info('Found run/subrun: %s/%s ... inspecting @ %s' % (run, subrun, self.now_str())) statusCode = self.__decode_status__(fstatus) action = self.PROD_ACTION[statusCode] # Get status object status = self._api.get_status( ds_status(self._project, x[0], x[1], x[2])) self._data = status._data statusCode = action(statusCode, istage, run, subrun) self.info('Finished executing an action: %s @ %s' % (action.__name__, self.now_str())) # Create a status object to be logged to DB (if necessary) status = ds_status(project=self._project, run=run, subrun=subrun, seq=0, status=statusCode, data=self._data) # Log status self.log_status(status) # Counter decreases by 1 ctr -= 1 # Break from loop if counter became 0 if ctr < 0: return return
def process( self ): ctr = self._nruns #return # Kazu's version of submit jobs stage_v = list(self._stage_digits) stage_v.reverse() status_v = list(self.PROD_STATUS) status_v.reverse() # temporary fix: record the processed run and only process once per process function call processed_run=[] for istage in stage_v: # self.warning('Inspecting stage %s @ %s' % (istage,self.now_str())) for istatus in status_v: fstatus = istage + istatus self.debug('Inspecting status %s @ %s' % (fstatus,self.now_str())) for x in self.get_runs( self._project, fstatus ): run = int(x[0]) subrun = int(x[1]) runid = (run,subrun) if self._max_runid and runid > self._max_runid: continue if runid in processed_run: continue processed_run.append(runid) self.info('Found run/subrun: %s/%s ... inspecting @ %s' % (run,subrun,self.now_str())) statusCode = self.__decode_status__( fstatus ) action = self.PROD_ACTION[statusCode] # Get status object status = self._api.get_status(ds_status(self._project, x[0],x[1],x[2])) self._data = status._data statusCode = action( statusCode, istage, run, subrun ) self.info('Finished executing an action: %s @ %s' % (action.__name__,self.now_str())) # Create a status object to be logged to DB (if necessary) status = ds_status( project = self._project, run = run, subrun = subrun, seq = 0, status = statusCode, data = self._data ) # Log status self.log_status( status ) # Counter decreases by 1 ctr -=1 # Break from loop if counter became 0 if ctr < 0: return return
def find_checksum( self ):
    """Fetch each run's enstore checksum from SAM metadata and record it.

    Status codes: 0 = enstore checksum found and stored in self._data,
    1 = checksum present but not enstore-sourced, 100 = file unknown to SAM
    (experts are e-mailed), 2 = initial/unchanged.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()

    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    # This project must be in status 2 while the parent project is in 0.
    for x in self.get_xtable_runs( [self._project, self._parent_project],
                                   [2, 0] ):

        # Counter decreases by 1
        ctr -= 1

        (run, subrun) = (int(x[0]), int(x[1]))

        # Report starting
        self.info('Checking tape: run=%d, subrun=%d ...' % (run,subrun))

        statusCode = 2
        in_file = self._infile_format % ( run, subrun )

        samweb = samweb_cli.SAMWebClient(experiment="uboone")

        meta = {}
        try:
            meta = samweb.getMetadata(filenameorid=in_file)
            # Checksum entries look like '<source>:<value>'; only an
            # enstore-sourced checksum counts as success here.
            checksum_info = meta['checksum'][0].split(':')
            if checksum_info[0] == 'enstore':
                self._data = checksum_info[1]
                statusCode = 0
            else:
                statusCode = 1
        except samweb_cli.exceptions.FileNotFound:
            # File never declared to SAM: alert the experts, flag failed.
            subject = 'Failed to locate file %s at SAM' % in_file
            text = 'File %s is not found at SAM!' % in_file
            pub_smtp( os.environ['PUB_SMTP_ACCT'],
                      os.environ['PUB_SMTP_SRVR'],
                      os.environ['PUB_SMTP_PASS'],
                      self._experts, subject, text )
            statusCode = 100

        # Create a status object to be logged to DB (if necessary)
        status = ds_status( project = self._project,
                            run     = run,
                            subrun  = subrun,
                            seq     = 0,
                            status  = statusCode,
                            data    = self._data )

        # Log status
        self.log_status( status )

        # Break from loop if counter became 0
        if not ctr: break
def validate(self):
    """Validate a transfer by comparing md5 checksums of input and output.

    For each (run, subrun) in status 2, compute the md5sum of both the
    input and the transferred output file (over ssh when the path is of
    the form 'host:path') and log status 0 on a match, 100 on a mismatch
    or a missing file.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()

    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_runs(self._project,2):

        # Counter decreases by 1
        ctr -=1

        (run, subrun) = (int(x[0]), int(x[1]))
        status = 1
        in_file = '%s/%s' % (self._in_dir,self._name_pattern % (run,subrun))
        out_file = '%s/%s' % (self._out_dir,self._name_pattern % (run,subrun))

        # Report starting
        self.info('Checking %s transfer' % (out_file))

        fmd5=[]
        for fin in in_file, out_file:
            if ":" in fin:
                # Remote path ('host:path'): existence test & md5sum via ssh.
                if not os.system('ssh -x %s "test -f %s"'%(tuple(fin.split(":")))):
                    fmd5.append(os.popen('ssh -x %s "md5sum -b %s"'%(tuple(fin.split(":")))).read())
                else:
                    self.error('File %s does not exist'%fin)
                    status = 100
            else:
                if os.path.isfile(fin):
                    fmd5.append(os.popen('md5sum -b %s'%(fin)).read())
                else:
                    self.error('File %s does not exist'%fin)
                    status = 100

        # Only compare checksums when both files were found.
        if status == 1:
            # md5sum output is '<hash> <file>'; compare the hash field only.
            if (fmd5[0].split()[0]==fmd5[1].split()[0]):
                # BUG FIX: corrected 'Cheksum' -> 'Checksum' in the message.
                self.info('Checksum ok! %s %s'%tuple(fmd5[1].split()))
                status = 0
            else:
                self.error('Failed md5sum!')
                status = 100

        # Create a status object to be logged to DB (if necessary)
        status = ds_status( project = self._project,
                            run     = int(x[0]),
                            subrun  = int(x[1]),
                            seq     = int(x[2]),
                            status  = status )

        # Log status
        self.log_status( status )

        # Break from loop if counter became 0
        if not ctr: break
def validate(self):
    """Validate job log files: must exist, be non-empty and report events.

    The last line of the log is expected to start with the number of
    events written. Runs failing any check are skipped without logging a
    status, leaving them in their current state for a later retry.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        resource = self._api.get_resource(self._project)
        self._nruns = int(resource['NRUNS'])
        self._beamdir = resource['BEAMDIR']
        self._beamfile = resource['BEAMFILE']
        self._infodir = resource['INFODIR']
        self._infofile = resource['INFOFILE']

    ctr = self._nruns
    for x in self.get_runs(self._project,2):

        # Counter decreases by 1
        ctr -=1

        run = int(x[0])
        subrun = int(x[1])
        status = 0

        self.info('Parse into log file and check number events')
        logfname='%s/%s'%(self._logdir,self._logfile%(run,subrun))
        if not os.path.isfile(logfname):
            # change status
            # BUG FIX: the original "'% not created' % logfname" was an
            # invalid format string and raised "unsupported format
            # character" at runtime instead of logging the error.
            self.error('%s not created'%logfname)
            continue
        if os.stat(logfname).st_size == 0:
            # change status
            self.error('%s is empty'%logfname)
            continue

        # BUG FIX: use a context manager so the log file handle is closed
        # (the original leaked an open file per validated run).
        with open(logfname) as log_file:
            last_line = log_file.readlines()[-1].split()
        log_num_evts = int(last_line[0])
        if not log_num_evts > 0:
            self.error('No events written')
            continue

        # Create a status object to be logged to DB (if necessary)
        status = ds_status( project = self._project,
                            run     = run,
                            subrun  = subrun,
                            seq     = 0,
                            status  = status )

        # Log status
        self.log_status( status )

        # Break from loop if counter became 0
        if not ctr: break
def process_newruns(self):
    """Read each new run's json time window and build a bdaq_get command.

    NOTE(review): the command is only logged here — nothing in this
    function executes it.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        resource = self._api.get_resource(self._project)
        self._nruns = int(resource['NRUNS'])
        self._infodir = resource['INFODIR']
        self._infofile = resource['INFOFILE']
        self._jsondir = resource['JSONDIR']
        self._jsonfile = resource['JSONFILE']

    ctr = self._nruns
    for x in self.get_runs(self._project,1):

        # Counter decreases by 1
        ctr -=1

        run = int(x[0])
        subrun = int(x[1])

        # Wait until the json summary for this (run, subrun) appears.
        jsonfname='%s/%s'%(self._jsondir,self._jsonfile%(run,subrun))
        if (not os.path.isfile(jsonfname)):
            self.info('Waiting for json file %s'%jsonfname)
            continue

        # Parse the run start/end times from the json payload
        # (format like 'Mon Jan 01 00:00:00 UTC 2015').
        json_file=open(jsonfname)
        json_data=json.load(json_file)
        tbegin=datetime.datetime.strptime(json_data["stime"], "%a %b %d %H:%M:%S %Z %Y")
        tend=datetime.datetime.strptime(json_data["etime"], "%a %b %d %H:%M:%S %Z %Y")
        json_file.close()

        # Report starting
        self.info('Getting beam data: run=%d, subrun=%d' % (run,subrun))
        self.info(' t0=%s, t1=%s' % (tbegin,tend))

        # NOTE(review): '--subrun-number --%i' renders the subrun as '--N';
        # this looks like a typo for '--subrun-number %i' — confirm against
        # the bdaq_get CLI before this command string is ever executed.
        cmd='bdaq_get --run-number %i --subrun-number --%i --begin-time %i %i --end-time %i %i'%(run,subrun,int(tbegin.strftime("%s")),0,int(tend.strftime("%s"))+1,0)
        self.info('Run cmd: %s'%cmd)

        # Create a status object to be logged to DB (if necessary)
        status = ds_status( project = self._project,
                            run     = run,
                            subrun  = subrun,
                            seq     = 0,
                            status  = 2 )

        # Log status
        self.log_status( status )

        # Break from loop if counter became 0
        if not ctr: break
def validate(self):
    """Confirm the expected output file exists for each run in status 2.

    Logs status 0 when the file is present, 1 when it is missing.
    """
    # Bail out immediately when the DB is unreachable.
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # Lazily load the resource configuration on first use.
    if self._nruns is None:
        resource = self._api.get_resource(self._project)
        self._nruns = int(resource['NRUNS'])
        self._out_dir = '%s/%s' % (os.environ['PUB_TOP_DIR'], resource['OUTDIR'])
        self._outfile_format = resource['OUTFILE_FORMAT']

    remaining = self._nruns
    for row in self.get_runs(self._project, 2):
        remaining -= 1
        run, subrun = int(row[0]), int(row[1])

        out_path = '%s/%s' % (self._out_dir, self._outfile_format % (run, subrun))
        if os.path.isfile(out_path):
            outcome = 0
            self.info('validated run: run=%d, subrun=%d ...' % (run, subrun))
        else:
            outcome = 1
            self.error('error on run: run=%d, subrun=%d ...' % (run, subrun))

        # Pretend I'm doing something
        time.sleep(1)

        # Record the outcome for this (run, subrun).
        self.log_status(ds_status(project=self._project,
                                  run=run,
                                  subrun=subrun,
                                  seq=0,
                                  status=outcome))

        # Stop once the per-pass quota is exhausted.
        if not remaining:
            break
def process_newruns(self):
    """Transfer each new run's file to the output location with rsync.

    Status codes: 2 = transferred, 100 = rsync failed (stderr is logged).
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()

    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    # This project new (status 1) with the parent project finished (0).
    for x in self.get_xtable_runs([self._project, self._parent_project],
                                  [1,0]):
        # for x in self.get_runs(self._project,1):

        (run, subrun) = (int(x[0]), int(x[1]))
        status = 1

        # Counter decreases by 1
        ctr -=1

        # Generate input, output file names
        in_file = '%s/%s' % (self._in_dir,self._name_pattern % (run,subrun))
        out_file = '%s/%s' % (self._out_dir,self._name_pattern % (run,subrun))

        # Timestamped backup suffix: rsync -b keeps any pre-existing
        # destination file instead of overwriting it.
        dt=time.strftime("%Y%m%d-%H%M%S")
        cmd=["rsync","-e \"ssh -x\"","-bptgo","--suffix=_%s"%dt,in_file,out_file]
        # Optional bandwidth throttle from configuration.
        if self._bwlimit>0:
            cmd+=["--bwlimit=%i"%self._bwlimit]
        p=subprocess.Popen(' '.join(cmd),
                           shell=True,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)
        self.debug('Executing %s'%' '.join(cmd))
        (out,err)=p.communicate(None)

        if not p.returncode:
            # rsync exited 0: transfer succeeded.
            status=2
            self.info('Transfered %s to %s'%(in_file,out_file))
        else:
            status=100
            self.error("Failed to transfer %s to %s"%(in_file,out_file))
            for line in err.split("\n"):
                self.error(line)

        # Create a status object to be logged to DB (if necessary)
        status = ds_status( project = self._project,
                            run     = int(x[0]),
                            subrun  = int(x[1]),
                            seq     = 0,
                            status  = status )

        # Log status
        self.log_status( status )

        # Break from loop if counter became 0
        if not ctr: break
def process_newruns(self):
    """Copy the input file for each new run into the output directory.

    Logs status 2 on a successful copy, 100 when the input is missing.
    """
    # Abort early when the DB cannot be reached.
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # Load resource configuration once, lazily.
    if self._nruns is None:
        self.get_resource()

    quota = self._nruns
    # New runs of this project (status 1) whose parent finished (status 0).
    for row in self.get_xtable_runs([self._project, self._parent_project], [1, 0]):
        quota -= 1
        run, subrun = int(row[0]), int(row[1])

        self.info('processing new run: run=%d, subrun=%d ...' % (run, subrun))

        src = '%s/%s' % (self._in_dir, self._infile_format % (run, subrun))
        dst = '%s/%s' % (self._out_dir, self._outfile_format % (run, subrun))

        if os.path.isfile(src):
            shutil.copyfile(src, dst)
            outcome = 2
        else:
            outcome = 100

        # Pretend I'm doing something
        time.sleep(1)

        # Persist the outcome for this (run, subrun).
        self.log_status(ds_status(project=self._project,
                                  run=int(row[0]),
                                  subrun=int(row[1]),
                                  seq=0,
                                  status=outcome))

        # Respect the per-pass run quota.
        if not quota:
            break
def validate_sam( self ):
    """Check whether each run's file has been declared to SAM.

    Status codes: 10 = declared (metadata found), 1 = not yet declared.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()

    self.info('Here, self._nruns=%d ... ' % (self._nruns))

    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    # This project in status 12 while the parent project is in status 0.
    for x in self.get_xtable_runs([self._project,self._parent_project],
                                  [12,0]):

        # Counter decreases by 1
        ctr -= 1

        (run, subrun) = (int(x[0]), int(x[1]))

        # Report starting
        self.info('Checking the SAM declaration: run=%d, subrun=%d ...' % (run,subrun))

        status = 12
        in_file_base = self._infile_format % ( run, subrun )

        samweb = samweb_cli.SAMWebClient(experiment="uboone")

        # Check if the file already exists at SAM
        try:
            samweb.getMetadata(filenameorid=in_file_base)
            # Metadata exists: the file is declared.
            status = 10
        except samweb_cli.exceptions.FileNotFound:
            # Not declared yet: send the run back to status 1.
            status = 1

        # Pretend I'm doing something
        time.sleep(1)

        # Create a status object to be logged to DB (if necessary)
        status = ds_status( project = self._project,
                            run     = int(x[0]),
                            subrun  = int(x[1]),
                            seq     = 0,
                            status  = status )

        # Log status
        self.log_status( status )

        # Break from loop if counter became 0
        if not ctr: break
def process_newruns(self):
    """Write a placeholder output file per new run and mark it status 2."""
    # Give up immediately when the DB is unreachable.
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # Lazily read the resource configuration.
    if self._nruns is None:
        resource = self._api.get_resource(self._project)
        self._nruns = int(resource['NRUNS'])
        self._out_dir = '%s/%s' % (os.environ['PUB_TOP_DIR'], resource['OUTDIR'])
        self._outfile_format = resource['OUTFILE_FORMAT']

    quota = self._nruns
    for row in self.get_runs(self._project, 1):
        quota -= 1
        run, subrun = int(row[0]), int(row[1])

        self.info('processing new run: run=%d, subrun=%d ...' % (run, subrun))

        # Produce the placeholder output for this (run, subrun).
        out_path = '%s/%s' % (self._out_dir, self._outfile_format % (run, subrun))
        fout = open(out_path, 'w')
        fout.write('Dummy data for run %d, subrun %d' % (run, subrun))
        fout.close()

        # Pretend I'm doing something
        time.sleep(1)

        # Mark the run as processed (status 2).
        self.log_status(ds_status(project=self._project,
                                  run=run,
                                  subrun=subrun,
                                  seq=0,
                                  status=2))

        # Honor the per-pass run quota.
        if not quota:
            break
def validate(self):
    """Mark runs done (0) when the .json output exists, failed (100) otherwise."""
    # Stop right away if the DB connection fails.
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # Read configuration lazily.
    if self._nruns is None:
        self.get_resource()

    quota = self._nruns
    for row in self.get_runs(self._project, 2):
        quota -= 1
        run, subrun = int(row[0]), int(row[1])

        self.info('validating run: run=%d, subrun=%d ...' % (run, subrun))

        in_file = '%s/%s' % (self._in_dir, self._infile_format % (run, subrun))
        out_file = '%s/%s.json' % (self._out_dir, self._infile_format % (run, subrun))

        # os.system('rm %s' % in_file)
        outcome = 0 if os.path.isfile(out_file) else 100

        # Pretend I'm doing something
        time.sleep(1)

        # Persist the verdict for this (run, subrun, seq).
        self.log_status(ds_status(project=self._project,
                                  run=int(row[0]),
                                  subrun=int(row[1]),
                                  seq=int(row[2]),
                                  status=outcome))

        # Respect the per-pass quota.
        if not quota:
            break
def process_newruns(self):
    """Copy each new run's input file to the output directory.

    Status codes: 2 = copied, 100 = input file missing.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()

    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    # This project new (status 1) with the parent project finished (0).
    for x in self.get_xtable_runs([self._project,self._parent_project],
                                  [1,0]):

        # Counter decreases by 1
        ctr -=1

        (run, subrun) = (int(x[0]), int(x[1]))

        # Report starting
        self.info('processing new run: run=%d, subrun=%d ...' % (run,subrun))

        status = 1

        # Check input file exists. Otherwise report error
        in_file = '%s/%s' % (self._in_dir,self._infile_format % (run,subrun))
        out_file = '%s/%s' % (self._out_dir,self._outfile_format % (run,subrun))
        if os.path.isfile(in_file):
            shutil.copyfile(in_file,out_file)
            status = 2
        else:
            status = 100

        # Pretend I'm doing something
        time.sleep(1)

        # Create a status object to be logged to DB (if necessary)
        status = ds_status( project = self._project,
                            run     = int(x[0]),
                            subrun  = int(x[1]),
                            seq     = 0,
                            status  = status )

        # Log status
        self.log_status( status )

        # Break from loop if counter became 0
        if not ctr: break
def validate_dropbox( self ):
    """Check (via ssh ls) that each dropbox file reached its destination.

    Status codes: 0 = file found at destination, 10 = not found yet.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()

    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_runs(self._project, 32):

        # Counter decreases by 1
        ctr -=1

        (run, subrun) = (int(x[0]), int(x[1]))

        # Report starting
        self.info('Validating a file in dropbox: run=%d, subrun=%d ...' % (run,subrun))

        status = 32
        in_file = '%s/%s' % (self._in_dir,self._infile_format % (run,subrun))
        out_file = '%s/%s' % (self._out_dir,self._outfile_format % (run,subrun))

        # 'ls' over ssh: exit code 0 means the file is present remotely.
        res = subprocess.call(['ssh', 'uboonegpvm06', '-x', 'ls', out_file])
        if res:
            # didn't find the file
            status = 10
        else:
            status = 0

        # Pretend I'm doing something
        time.sleep(1)

        # Create a status object to be logged to DB (if necessary)
        status = ds_status( project = self._project,
                            run     = int(x[0]),
                            subrun  = int(x[1]),
                            seq     = int(x[2]),
                            status  = status )

        # Log status
        self.log_status( status )

        # Break from loop if counter became 0
        if not ctr: break
def error_handle(self):
    """Remove a failed transfer's output file so the transfer can be retried.

    Runs in status 100 have their (possibly partial) output deleted and
    are reset to status 1 for a fresh transfer attempt.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()

    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_runs(self._project, 100):

        # Counter decreases by 1
        ctr -= 1

        (run, subrun) = (int(x[0]), int(x[1]))

        # Report starting
        # Status 1 sends the run back for another transfer attempt.
        status = 1

        out_file = '%s/%s' % (self._out_dir, self._name_pattern % (run, subrun))
        self.info('Removing failed transfer %s' % (out_file))
        if ":" in out_file:
            #check that out_file is a file before trying to remove
            #(hopefully should avoid unintentional rm with bad out_dir/name_pattern combo)
            if not os.system('ssh -x %s "test -f %s"' % (tuple(out_file.split(":")))):
                os.system('ssh -x %s "rm %s"' % tuple(out_file.split(":")))
        else:
            if os.path.isfile(out_file):
                os.system('rm %s' % out_file)

        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project,
                           run=int(x[0]),
                           subrun=int(x[1]),
                           seq=0,
                           status=status)

        # Log status
        self.log_status(status)

        # Break from loop if counter became 0
        if not ctr: break
def validate(self):
    """Verify that input files were deleted: 0 when gone, 100 when present.

    A file that still exists means the earlier removal failed; the run is
    flagged (status 100) for the error handler.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()

    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    # Third argument False — presumably flips the run ordering; confirm
    # against the get_runs implementation.
    for x in self.get_runs(self._project, 2, False):

        # Counter decreases by 1
        ctr -= 1

        (run, subrun) = (int(x[0]), int(x[1]))

        in_file = '%s/%s' % (self._in_dir, self._name_pattern % (run, subrun))
        self.info('Check if file %s was deleted.' % in_file)

        tmp_status = 0
        if ":" in in_file:
            # Remote path: 'ls' exiting 0 means the file still exists.
            if not os.system('ssh -x %s "ls %s"' % (tuple(in_file.split(":")))):
                tmp_status = 100
        else:
            # Local path: any glob match means the file still exists.
            if (len(glob.glob(in_file)) > 0):
                tmp_status = 100

        if (tmp_status == 100):
            self.error('Failed to remove %s' % in_file)

        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project,
                           run=int(x[0]),
                           subrun=int(x[1]),
                           seq=int(x[2]),
                           status=tmp_status)

        # Log status
        self.log_status(status)

        # Break from loop if counter became 0
        if not ctr: break
def validate(self):
    """Mark runs done (0) when the .json output exists, failed (100) otherwise."""
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()

    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_runs(self._project,2):

        # Counter decreases by 1
        ctr -=1

        (run, subrun) = (int(x[0]), int(x[1]))

        # Report starting
        self.info('validating run: run=%d, subrun=%d ...' % (run,subrun))

        status = 1

        in_file = '%s/%s' % (self._in_dir,self._infile_format % (run,subrun))
        out_file = '%s/%s.json' % (self._out_dir,self._infile_format % (run,subrun))
        if os.path.isfile(out_file):
            # os.system('rm %s' % in_file)
            status = 0
        else:
            status = 100

        # Pretend I'm doing something
        time.sleep(1)

        # Create a status object to be logged to DB (if necessary)
        status = ds_status( project = self._project,
                            run     = int(x[0]),
                            subrun  = int(x[1]),
                            seq     = int(x[2]),
                            status  = status )

        # Log status
        self.log_status( status )

        # Break from loop if counter became 0
        if not ctr: break
def validate(self):
    """Verify input files were deleted: status 0 when gone, 100 when present."""
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()

    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    # Third argument False — presumably alters the run ordering; confirm
    # against the get_runs implementation.
    for x in self.get_runs(self._project,2,False):

        # Counter decreases by 1
        ctr -=1

        (run, subrun) = (int(x[0]), int(x[1]))

        in_file = '%s/%s' % (self._in_dir,self._name_pattern % (run,subrun))
        self.info('Check if file %s was deleted.' % in_file)

        tmp_status=0
        if ":" in in_file:
            # Remote path: 'ls' exiting 0 means the file still exists.
            if not os.system('ssh -x %s "ls %s"'%(tuple(in_file.split(":")))):
                tmp_status=100
        else:
            # Local path: any glob match means the file still exists.
            if (len(glob.glob(in_file))>0):
                tmp_status=100

        if (tmp_status==100):
            self.error('Failed to remove %s'%in_file)

        # Create a status object to be logged to DB (if necessary)
        status = ds_status( project = self._project,
                            run     = int(x[0]),
                            subrun  = int(x[1]),
                            seq     = int(x[2]),
                            status  = tmp_status )

        # Log status
        self.log_status( status )

        # Break from loop if counter became 0
        if not ctr: break
def error_handle(self):
    """Delete a failed transfer's output and reset the run for retry.

    Runs in status 100 have their (possibly partial) output removed and
    are put back to status 1 for a fresh transfer attempt.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()

    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_runs(self._project,100):

        # Counter decreases by 1
        ctr -=1

        (run, subrun) = (int(x[0]), int(x[1]))

        # Report starting
        # Status 1 re-queues the run for another transfer.
        status = 1

        out_file = '%s/%s' % (self._out_dir,self._name_pattern % (run,subrun))
        self.info('Removing failed transfer %s' %(out_file))
        if ":" in out_file:
            #check that out_file is a file before trying to remove
            #(hopefully should avoid unintentional rm with bad out_dir/name_pattern combo)
            if not os.system('ssh -x %s "test -f %s"'%(tuple(out_file.split(":")))):
                os.system('ssh -x %s "rm %s"' % tuple(out_file.split(":")))
        else:
            if os.path.isfile(out_file):
                os.system('rm %s' % out_file)

        # Create a status object to be logged to DB (if necessary)
        status = ds_status( project = self._project,
                            run     = int(x[0]),
                            subrun  = int(x[1]),
                            seq     = 0,
                            status  = status )

        # Log status
        self.log_status( status )

        # Break from loop if counter became 0
        if not ctr: break
def validate(self):
    """Locate each run's info file and log status 0.

    Placeholder: the info file is only named and reported here; the
    announced parsing/check of created files is not implemented yet.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        resource = self._api.get_resource(self._project)
        self._nruns = int(resource['NRUNS'])
        self._infodir = resource['INFODIR']
        self._infofile = resource['INFOFILE']

    ctr = self._nruns
    for x in self.get_runs(self._project,2):

        # Counter decreases by 1
        ctr -=1

        run = int(x[0])
        subrun = int(x[1])
        status = 0

        fname='%s/%s'%(self._infodir,self._infofile%(run,subrun))
        self.info('Parse info file %s and check created files'%fname)

        # Create a status object to be logged to DB (if necessary)
        status = ds_status( project = self._project,
                            run     = run,
                            subrun  = subrun,
                            seq     = 0,
                            status  = status )

        # Log status
        self.log_status( status )

        # Break from loop if counter became 0
        if not ctr: break
def validate(self):
    """Name each run's info file, report it, and log status 0.

    Placeholder implementation: the file is only reported, not parsed.
    """
    # Quit early when the DB is unreachable.
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # Lazily pull the resource configuration.
    if self._nruns is None:
        resource = self._api.get_resource(self._project)
        self._nruns = int(resource['NRUNS'])
        self._infodir = resource['INFODIR']
        self._infofile = resource['INFOFILE']

    quota = self._nruns
    for row in self.get_runs(self._project, 2):
        quota -= 1
        run, subrun = int(row[0]), int(row[1])

        info_path = '%s/%s' % (self._infodir, self._infofile % (run, subrun))
        self.info('Parse info file %s and check created files' % info_path)

        # Always log status 0 for now (no real validation performed).
        self.log_status(ds_status(project=self._project,
                                  run=run,
                                  subrun=subrun,
                                  seq=0,
                                  status=0))

        # Honor the per-pass quota.
        if not quota:
            break
def error_handle(self):
    """Placeholder error handler: reset failed runs (100) back to status 1."""
    # Abort when the DB cannot be reached.
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # Lazily read the resource configuration.
    if self._nruns is None:
        self.get_resource()

    quota = self._nruns
    for row in self.get_runs(self._project, 100):
        quota -= 1
        run, subrun = int(row[0]), int(row[1])

        self.info('cleaning failed run: run=%d, subrun=%d ...' % (run, subrun))

        # Pretend I'm doing something
        time.sleep(1)

        # Re-queue the run at status 1 so it is retried.
        self.log_status(ds_status(project=self._project,
                                  run=int(row[0]),
                                  subrun=int(row[1]),
                                  seq=0,
                                  status=1))

        # Honor the per-pass quota.
        if not quota:
            break
def error_handle(self):
    """Placeholder error handler: reset failed runs (100) back to status 1."""
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()

    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_runs(self._project,100):

        # Counter decreases by 1
        ctr -=1

        (run, subrun) = (int(x[0]), int(x[1]))

        # Report starting
        self.info('cleaning failed run: run=%d, subrun=%d ...' % (run,subrun))

        # Status 1 re-queues the run for another attempt.
        status = 1

        # Pretend I'm doing something
        time.sleep(1)

        # Create a status object to be logged to DB (if necessary)
        status = ds_status( project = self._project,
                            run     = int(x[0]),
                            subrun  = int(x[1]),
                            seq     = 0,
                            status  = status )

        # Log status
        self.log_status( status )

        # Break from loop if counter became 0
        if not ctr: break
def error_handle(self):
    """Keep failed removals at status 100 so deletion is retried later."""
    # Abort when the DB cannot be reached.
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # Lazily read the resource configuration.
    if self._nruns is None:
        self.get_resource()

    quota = self._nruns
    for row in self.get_runs(self._project, 100, False):
        quota -= 1
        run, subrun = int(row[0]), int(row[1])

        target = '%s/%s' % (self._in_dir, self._name_pattern % (run, subrun))
        self.info('Will try removing %s again later.' % (target))

        # Leave the run at status 100; a later pass retries the removal.
        self.log_status(ds_status(project=self._project,
                                  run=int(row[0]),
                                  subrun=int(row[1]),
                                  seq=0,
                                  status=100))

        # Honor the per-pass quota.
        if not quota:
            break
def error_handle(self):
    """Keep failed removals at status 100 so deletion is retried later."""
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()

    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_runs(self._project,100,False):

        # Counter decreases by 1
        ctr -=1

        (run, subrun) = (int(x[0]), int(x[1]))

        # Report starting
        in_file = '%s/%s' % (self._in_dir,self._name_pattern % (run,subrun))
        self.info('Will try removing %s again later.' % (in_file))

        # Stay at status 100 so a later pass retries the removal.
        tmp_status = 100

        # Create a status object to be logged to DB (if necessary)
        status = ds_status( project = self._project,
                            run     = int(x[0]),
                            subrun  = int(x[1]),
                            seq     = 0,
                            status  = tmp_status )

        # Log status
        self.log_status( status )

        # Break from loop if counter became 0
        if not ctr: break
def check_db(self):
    """Verify that each run's file checksum is recorded in the DB.

    For every (run, subrun) of this project in status 2, fetch the stored
    status record; if it carries checksum data, mark the run done (status 0),
    otherwise e-mail the experts and flag it failed (status 100).
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return

    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()

    self.info('Here, self._nruns=%d ... ' % (self._nruns))

    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_runs(self._project, 2):

        # Counter decreases by 1
        ctr -= 1

        (run, subrun) = (int(x[0]), int(x[1]))

        # Report starting
        self.info('Calculating the file checksum: run=%d, subrun=%d ...' % (run, subrun))

        statusCode = 2
        in_file_name = self._infile_format % (run, subrun)
        in_file = '%s/%s' % (self._in_dir, in_file_name)

        # Get status object
        status = self._api.get_status(
            ds_status(self._project, x[0], x[1], x[2]))
        self._data = status._data

        # BUG FIX: the original stringified first, so a missing checksum
        # (None) became the truthy string 'None' and was treated as present.
        # Test the raw value before converting.
        if self._data:
            self._data = str(self._data)
            statusCode = 0
        else:
            # Normalize missing data to an empty string for the DB record.
            self._data = ''
            subject = 'Checksum of the file %s not in database' % in_file
            text = """File: %s

Checksum is not in database

""" % (in_file)
            pub_smtp(os.environ['PUB_SMTP_ACCT'],
                     os.environ['PUB_SMTP_SRVR'],
                     os.environ['PUB_SMTP_PASS'],
                     self._experts, subject, text)
            statusCode = 100

        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project,
                           run=run,
                           subrun=subrun,
                           seq=0,
                           status=statusCode,
                           data=self._data)

        # Log status
        self.log_status(status)

        # Break from loop if counter became 0
        if not ctr:
            break
def process_newruns(self):
    """Disk-space janitor: when the input disk (local or remote 'host:path')
    exceeds self._disk_frac_limit percent usage, delete processed input files
    oldest-first, logging per-(run, subrun) removal status to the DB.

    Status codes logged: 1 untouched, 2 removal attempted, +200 when multiple
    files matched the pattern, 4 removal failed, 100 file not found.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return
    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    # Check available space. A 'host:path' input dir is probed over ssh.
    # Column 4 of the last `df` line is the Use% field.
    if ":" in self._in_dir:
        disk_frac_used = int(os.popen('ssh -x %s "df %s" | tail -n1' % tuple(self._in_dir.split(":"))).read().split()[4].strip("%"))
    else:
        disk_frac_used = int(os.popen('df %s | tail -n1' % (self._in_dir)).read().split()[4].strip("%"))
    self.info("%i%% of disk used. Removing files to get down to %i%%." % (disk_frac_used, self._disk_frac_limit))
    if (disk_frac_used < self._disk_frac_limit):
        self.info('Only %i%% of disk space used (%s), skip cleaning until %i%% is reached.' % (disk_frac_used, self._in_dir, self._disk_frac_limit))
        return
    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    # we want the last argument of this list get_xtable_runs call to be False
    # that way the list is old files first to newer files last and clean up that way
    # target_runs = self.get_xtable_runs([self._project, self._parent_project],
    #                                    [1, 0], False)
    target_runs = self.get_xtable_runs(self._project_list, self._project_requirement, False)
    self.info('Found %d runs to be processed (from project %s)...' % (len(target_runs), self._parent_project))
    for x in target_runs:
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))
        tmp_status = 1
        rm_status = 1
        multiple_file_status = 0
        # Check input file exists. Otherwise report error
        in_file_holder = '%s/%s' % (self._in_dir, self._name_pattern % (run, subrun))
        filelist = glob.glob(in_file_holder)
        if (len(filelist) > 1):
            self.error('ERROR: There is more than one file matching that pattern: %s' % filelist)
            # +200 offset is folded into the final status on successful removal.
            multiple_file_status = 200
        if (len(filelist) < 1):
            # Missing file: log status 100 and notify the experts by e-mail.
            self.info('ERROR: Failed to find the file for (run,subrun) = %s @ %s !!!' % (run, subrun))
            status_code = 100
            status = ds_status(project=self._project,
                               run=run,
                               subrun=subrun,
                               seq=0,
                               status=status_code)
            self.log_status(status)
            errorMessage = "Failed to find file%s" % in_file_holder
            subject = "get_checksum_temp Failed to find file%s" % in_file_holder
            text = """File: %s
Error message: %s
""" % (in_file_holder, errorMessage)
            pub_smtp(os.environ['PUB_SMTP_ACCT'],
                     os.environ['PUB_SMTP_SRVR'],
                     os.environ['PUB_SMTP_PASS'],
                     self._experts, subject, text)
        else:
            in_file = filelist[0]
            self.info('Removing %s' % in_file)
            # NOTE(review): the file path is interpolated into a shell command
            # below (os.system); paths are generated from _name_pattern, but a
            # parameterized call would be safer — flagging, not changing.
            if ":" in in_file:
                # check that out_file is a file before trying to remove
                # (hopefully should avoid unintentional rm with bad out_dir/name_pattern combo)
                if not os.system('ssh -x %s "test -f %s"' % (tuple(in_file.split(":")))):
                    rm_status = os.system('ssh -x %s "rm -f %s"' % tuple(in_file.split(":")))
                    tmp_status = 2
            else:
                self.info('Looks like the file is local on this node')
                if not os.path.isfile(in_file):
                    self.info("ERROR: os.path.isfile('%s') returned false?!" % in_file)
                self.info('Going to remove the file with rm...')
                rm_status = os.system('rm -f %s' % in_file)
                tmp_status = 2
            if rm_status == 0:
                # Successful removal: add the multiple-file offset (0 or 200).
                tmp_status = tmp_status + multiple_file_status
            else:
                self.info('Failed to remove the file %s' % in_file)
                tmp_status = 4
        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project,
                           run=int(x[0]),
                           subrun=int(x[1]),
                           seq=0,
                           status=tmp_status)
        # Log status
        self.log_status(status)
        # Break from loop if counter became 0
        if not ctr: break
def process_newruns(self):
    """For each (run, subrun) of this project in status 1, determine the
    begin/end timestamps (from a text file in test mode, or from the DB
    otherwise), invoke `bdaq_get` to fetch the matching beam data, and log
    status 2 back to the DB.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return
    # If resource info is not yet read-in, read in.
    # (This variant reads the raw resource dict instead of get_resource().)
    if self._nruns is None:
        resource = self._api.get_resource(self._project)
        self._nruns = int(resource['NRUNS'])
        self._istest = int(resource['ISTEST'])
        self._fcldir = resource['FCLDIR']
        self._fclfile = resource['FCLFILE']
        self._infodir = resource['INFODIR']
        self._infofile = resource['INFOFILE']
        if self._istest == 1:
            # Test mode reads timestamps from files rather than the DB.
            self._timedir = resource['TIMEDIR']
            self._timefile = resource['TIMEFILE']
    ctr = self._nruns
    for x in self.get_runs(self._project, 1):
        # Counter decreases by 1
        ctr -= 1
        run = int(x[0])
        subrun = int(x[1])
        if self._istest == 1:
            # Read timestamp from text file for testing
            tsfname = '%s/%s' % (self._timedir, self._timefile % (run, subrun))
            if not os.path.isfile(tsfname):
                self.info('Waiting for time stamp file %s' % tsfname)
                continue
            if os.stat(tsfname).st_size == 0:
                self.info('Waiting for time stamp info')
                continue
            # BUG FIX: the file handle was previously opened and never closed
            # (leaked one descriptor per processed subrun); use a with-block.
            with open(tsfname) as ts_file:
                ts_lines = ts_file.readlines()
            # Timestamps are the quoted fields of the first and last lines.
            bgn_line = ts_lines[0].split('"')
            end_line = ts_lines[-1].split('"')
            tbegin = datetime.datetime.strptime(bgn_line[1], "%a %b %d %H:%M:%S %Z %Y")
            tend = datetime.datetime.strptime(end_line[3], "%a %b %d %H:%M:%S %Z %Y")
        else:
            # Read timestamp from DB
            timestamp = self._api.run_timestamp('MainRun', run, subrun)
            tbegin = datetime.datetime.strptime(timestamp[0], "%a %b %d %H:%M:%S %Z %Y")
            tend = datetime.datetime.strptime(timestamp[1], "%a %b %d %H:%M:%S %Z %Y")
        # Report starting
        self.info('Getting beam data: run=%d, subrun=%d' % (run, subrun))
        self.info(' t0=%s, t1=%s' % (tbegin, tend))
        # NOTE(review): strftime("%s") (epoch seconds) is a glibc extension,
        # not portable — fine on the Linux DAQ hosts this targets.
        cmd = 'bdaq_get --run-number %i --subrun-number %i --begin-time %i %i --end-time %i %i -f %s/%s' % (
            run, subrun,
            int(tbegin.strftime("%s")), 0,
            int(tend.strftime("%s")) + 1, 0,
            self._fcldir, self._fclfile)
        self.info('Run cmd: %s' % cmd)
        subprocess.call(cmd, shell=True)
        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project,
                           run=run,
                           subrun=subrun,
                           seq=0,
                           status=2)
        # Log status
        self.log_status(status)
        # Break from loop if counter became 0
        if not ctr: break
# Script fragment: bulk status updater for one project table.
# Relies on `table`, `run`, `subrun`, `old_status`, `new_status` being defined
# earlier in the script (presumably parsed from the command line — confirm).
logger = pub_logger.get_logger('table')
reader = ds_reader(pubdb_conn_info.reader_info(), logger)
writer = ds_writer(pubdb_conn_info.writer_info(), logger)
# NOTE(review): this only warns when the table is missing and then proceeds
# to query it anyway — confirm whether it should abort instead.
if not reader.project_exist(table):
    print 'The table you gave me does not exist: %s' % table
# Walk every (run, subrun) currently in old_status; subrun == -1 acts as a
# wildcard matching all subruns of the requested run.
for x in reader.get_runs(table, old_status):
    if run == x[0]:
        if subrun == -1:
            logger.info('In table %s, changing status of run %d, subrun %d from old_status=%d to new_status=%d' % (table, int(x[0]), int(x[1]), old_status, new_status))
            updated_status = ds_status(project=table,
                                       run=int(x[0]),
                                       subrun=int(x[1]),
                                       seq=0,
                                       status=new_status)
            writer.log_status(updated_status)
        else:
            if subrun == x[1]:
                logger.info('In table %s, changing status of run %d, subrun %d from old_status=%d to new_status=%d' % (table, int(x[0]), int(x[1]), old_status, new_status))
                updated_status = ds_status(project=table,
                                           run=int(x[0]),
                                           subrun=int(x[1]),
                                           seq=0,
                                           status=new_status)
                writer.log_status(updated_status)
print 'Finished updating table %s' % table
def declare_to_sam(self):
    """Declare files to SAM for every (run, subrun) where this project is in
    status 1 and the parent project is in status 0.

    Status codes logged: 12 declared OK, 11 declareFile raised, 101 already
    known to SAM, 100 input file or JSON metadata missing.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return
    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    self.info('Here, self._nruns=%d ... ' % (self._nruns))
    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_xtable_runs([self._project, self._parent_project], [1, 0]):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))
        # Report starting
        self.info('Declaring a file to SAM: run=%d, subrun=%d ...' % (run, subrun))
        status = 1
        # Check input file exists. Otherwise report error
        in_file_base = self._infile_format % (run, subrun)
        in_file = '%s/%s' % (self._in_dir, in_file_base)
        in_json = '%s/%s.json' % (self._meta_dir, in_file_base)
        if os.path.isfile(in_file) and os.path.isfile(in_json):
            self.info('Found %s' % (in_file))
            self.info('Found %s' % (in_json))
            json_dict = json.load(open(in_json))
            # native SAM python call, instead of a system call
            # make sure you've done get-cert
            # Perhaps we want a try block for samweb?
            samweb = samweb_cli.SAMWebClient(experiment="uboone")
            # Check if the file already exists at SAM
            try:
                samweb.getMetadata(filenameorid=in_file_base)
                # Already declared: flag it (101).
                status = 101
                # Want to email the experts
            except samweb_cli.exceptions.FileNotFound:
                # metadata already validated in get_assembler_metadata_file.py
                try:
                    samweb.declareFile(md=json_dict)
                    status = 12
                except:
                    # NOTE(review): bare except keeps the declare step
                    # best-effort; failure is recorded as status 11.
                    print "Unexpected error: samweb declareFile problem: "
                    print traceback.print_exc()
                    # print "Give some null properties to this meta data"
                    print "Give this file a status 11"
                    status = 11
        else:
            # Input file and/or its JSON metadata are missing.
            status = 100
        # Pretend I'm doing something
        time.sleep(1)
        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project,
                           run=int(x[0]),
                           subrun=int(x[1]),
                           seq=0,
                           status=status)
        # Log status
        self.log_status(status)
        # Break from loop if counter became 0
        if not ctr: break
def error_handle(self):
    """Clean up after failed runs (status 100): delete the corresponding .root
    output file if present, log the (unchanged-from-1) status back, and finally
    remove the temporary fcl file.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return
    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_runs(self._project, 100):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))
        # Report starting
        self.info('cleaning failed run: run=%d, subrun=%d ...' % (run, subrun))
        status = 1
        in_file_holder = '%s/%s' % (self._in_dir, self._infile_format % (run, subrun))
        filelist = glob.glob(in_file_holder)
        if (len(filelist) < 1):
            # Input gone entirely: record 100 and move on.
            self.error(
                'ERROR: Failed to find the file for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            status_code = 100
            status = ds_status(project=self._project,
                               run=run,
                               subrun=subrun,
                               seq=0,
                               status=status_code)
            self.log_status(status)
            continue
        if (len(filelist) > 1):
            # Multiple matches: warn but proceed with the first one.
            self.error(
                'ERROR: Found too many files for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            self.error('ERROR: List of files found %s' % filelist)
        in_file = filelist[0]
        # Derive the expected output name (<input stem>.root) and delete it.
        in_file_base_no_ext = os.path.splitext(
            os.path.basename(in_file))[0]
        out_file_base = '%s.root' % in_file_base_no_ext
        out_file = '%s/%s' % (self._out_dir, out_file_base)
        if os.path.isfile(out_file):
            os.system('rm %s' % out_file)
        # Pretend I'm doing something
        time.sleep(1)
        # Create a status object to be logged to DB (if necessary)
        # (status 1 re-queues the run for processing.)
        status = ds_status(project=self._project,
                           run=int(x[0]),
                           subrun=int(x[1]),
                           seq=0,
                           status=status)
        # Log status
        self.log_status(status)
        # Break from loop if counter became 0
        if not ctr: break
    # Best-effort removal of the generated fcl file; absence is fine.
    try:
        os.remove(self._fcl_file_new)
    except OSError:
        pass
def transfer_file( self ):
    """PNNL-specific transfer: for each (run, subrun) where this project is in
    status 1 and the parent in status 0, ship the input file via
    self.pnnl_transfer and log 0 on success, 1 on failure, 100 when this
    project is not a PNNL transfer project.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return
    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    # self.info('Here, self._nruns=%d ... ' % (self._nruns) )
    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_xtable_runs([self._project, self._parent_project],
                                  [1, 0], self._sort_new_to_old):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))
        # Report starting
        self.info('Transferring a file: run=%d, subrun=%d ...' % (run,subrun) )
        status = 1
        # Check input file exists. Otherwise report error
        in_file = '%s/%s' % ( self._in_dir, self._infile_format % ( run, subrun ) )
        in_json = '%s/%s.json' %( self._meta_dir, self._infile_format % ( run, subrun ) )
        # out_file/out_json are computed but unused here — presumably kept for
        # symmetry with the non-PNNL transfer_file variant; confirm.
        out_file = '%s/%s' % ( self._out_dir, self._outfile_format % (run,subrun) )
        out_json = '%s/%s.json' %( self._out_dir, self._outfile_format % (run,subrun) )
        if "pnnl" in self._project:
            self.info('Will look for %s' % os.path.basename(in_file) )
            try:
                # NOTE(review): the inner "pnnl" check is redundant with the
                # outer one — kept as-is.
                if "pnnl" in self._project:
                    # pnnl_transfer returns a pair of return codes; 0/0 = OK.
                    (resi, resj) = self.pnnl_transfer(in_file)
                    if resi == 0 and resj == 0:
                        status = 0
                    else:
                        status = 1
            except:
                # Any transfer exception maps to "retry later" (status 1).
                status = 1
        else:
            status = 100
            self.error("Big problem: This project not doing a transfer to PNNL.")
        # Pretend I'm doing something
        time.sleep(1)
        # Create a status object to be logged to DB (if necessary)
        status = ds_status( project = self._project,
                            run     = int(x[0]),
                            subrun  = int(x[1]),
                            seq     = 0,
                            status  = status )
        # Log status
        self.log_status( status )
        # Break from loop if counter became 0
        if not ctr: break
def validate_outfile( self ):
    """Verify via ifdh that the transferred output file and its .json metadata
    exist at the destination for each (run, subrun) of this project in status
    2; log 0 when both located, 1 when lookup fails, 100 when the input file
    is missing locally.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return
    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_runs( self._project, 2 ):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))
        # Report starting
        self.info('Validating a file in the output directory: run=%d, subrun=%d ...' % (run,subrun))
        status = 2
        # Locate the input file; report and skip this subrun if missing.
        in_file_holder = '%s/%s' % (self._in_dir, self._infile_format % (run,subrun))
        filelist = glob.glob( in_file_holder )
        if (len(filelist) < 1):
            self.error('ERROR: Failed to find the file for (run,subrun) = %s @ %s !!!' % (run,subrun))
            status_code = 100
            status = ds_status( project = self._project,
                                run     = run,
                                subrun  = subrun,
                                seq     = 0,
                                status  = status_code )
            self.log_status( status )
            continue
        if (len(filelist) > 1):
            # Multiple matches: warn but proceed with the first one.
            self.error('ERROR: Found too many files for (run,subrun) = %s @ %s !!!' % (run,subrun))
            self.error('ERROR: List of files found %s' % filelist)
        in_file = filelist[0]
        # BUG FIX: this was assigned to 'if_file_basename' but read back as
        # 'in_file_base', raising NameError on every iteration that got here.
        in_file_base = os.path.basename(in_file)
        out_file = '%s/%s' % ( self._out_dir, in_file_base )
        out_json = '%s/%s.json' % ( self._out_dir, in_file_base )
        # construct ifdh object
        ih = ifdh.ifdh()
        try:
            ih.locateFile( out_file )
            ih.locateFile( out_json )
            status = 0
        except:
            # NOTE(review): bare except preserved — any ifdh failure (not only
            # "file not found") maps to status 1 / retry.
            status = 1
        # Pretend I'm doing something
        time.sleep(1)
        # Create a status object to be logged to DB (if necessary)
        status = ds_status( project = self._project,
                            run     = int(x[0]),
                            subrun  = int(x[1]),
                            seq     = int(x[2]),
                            status  = status )
        # Log status
        self.log_status( status )
        # Break from loop if counter became 0
        if not ctr: break
def transfer_file( self ):
    """Copy each eligible input file (and its .json sidecar) to self._out_dir
    with `ifdh cp`, optionally in parallel; track the child processes, enforce
    self._max_wait, and log per-(run, subrun) statuses to the DB.

    Status codes logged: 3 transfer started, 0 finished, 1 exception, 100
    input missing, 101/555 killed on timeout, 666 hard-killed.
    """
    proc_list = []   # running Popen handles
    done_list = []   # parallel flags: transfer i completed
    run_id = []      # (run, subrun) per transfer
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return
    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_xtable_runs([self._project, self._parent_project], [1, 0]):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))
        # Report starting
        self.info('Transferring a file: run=%d, subrun=%d ...' % (run,subrun) )
        status = 1
        # Check input file exists. Otherwise report error
        in_file_holder = '%s/%s' % (self._in_dir, self._infile_format % (run,subrun))
        filelist = glob.glob( in_file_holder )
        if (len(filelist) < 1):
            self.error('ERROR: Failed to find the file for (run,subrun) = %s @ %s !!!' % (run,subrun))
            status_code = 100
            status = ds_status( project = self._project,
                                run     = run,
                                subrun  = subrun,
                                seq     = 0,
                                status  = status_code )
            self.log_status( status )
            continue
        if (len(filelist) > 1):
            self.error('ERROR: Found too many files for (run,subrun) = %s @ %s !!!' % (run,subrun))
            self.error('ERROR: List of files found %s' % filelist)
        in_file = filelist[0]
        in_json = '%s.json' % in_file
        in_file_base = os.path.basename(in_file)
        out_file = '%s/%s' % ( self._out_dir, in_file_base )
        out_json = '%s/%s.json' % ( self._out_dir, in_file_base )
        # construct ifdh object
        # ih = ifdh.ifdh()
        # we're gonna use subprocess to parallelize these transfers and
        # construct an ifdh command by hand
        if (os.path.isfile( in_file ) and (os.path.isfile( in_json ))):
            self.info('Found %s' % (in_file) )
            self.info('Found %s' % (in_json) )
            try:
                cmd = ['ifdh', 'cp', '-D', in_file, in_json, self._out_dir]
                proc_list.append(subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE))
                done_list.append(False)
                run_id.append((run,subrun))
                # BUG FIX: the timestamp format used lowercase '%s' (glibc
                # epoch-seconds extension) instead of '%S' (seconds), unlike
                # every other strftime call in this function.
                self.info('Started transfer for (run,subrun)=%s @ %s' % (run_id[-1], time.strftime('%Y-%m-%d %H:%M:%S')))
                status_code = 3
                status = ds_status( project = self._project,
                                    run     = run,
                                    subrun  = subrun,
                                    seq     = 0,
                                    status  = status_code )
                self.log_status( status )
                # if not parallelized, wait till proc is done
                if not self._parallelize:
                    time_spent = 0
                    while ((len(proc_list) > 0) and (proc_list[-1].poll() is None)):
                        time.sleep(1)
                        time_spent += 1
                        if time_spent > self._max_wait:
                            self.error('Exceeding the max wait time (%d sec). Terminating the process...' % self._max_wait)
                            proc_list[-1].kill()
                            status = ds_status( project = self._project,
                                                run     = run_id[-1][0],
                                                subrun  = run_id[-1][1],
                                                seq     = 0,
                                                status  = 555 )
                            self.log_status( status )
                            # Give SIGKILL escalation a grace period.
                            time.sleep(5)
                            if proc_list[-1].poll() is None:
                                self.error('Process termination failed. Hard-killing it (kill -9 %d)' % proc_list[-1].pid)
                                subprocess.call(['kill','-9',str(proc_list[-1].pid)])
                                status = ds_status( project = self._project,
                                                    run     = run_id[-1][0],
                                                    subrun  = run_id[-1][1],
                                                    seq     = 0,
                                                    status  = 666 )
                                self.log_status( status )
                            break
                    self.info('Finished copy [%s] @ %s' % (run_id[-1], time.strftime('%Y-%m-%d %H:%M:%S')))
                    status = ds_status( project = self._project,
                                        run     = run_id[-1][0],
                                        subrun  = run_id[-1][1],
                                        seq     = 0,
                                        status  = 0 )
                    self.log_status( status )
                else:
                    time.sleep(1)
            except:
                self.error('Caught the exception and setting the status back to 1 for (run,subrun) = (%s, %s)' % (x[0],x[1]))
                status = 1
                status = ds_status( project = self._project,
                                    run     = int(x[0]),
                                    subrun  = int(x[1]),
                                    seq     = 0,
                                    status  = status )
                self.log_status( status )
        else:
            self.error('Did not find the files that you told me to look for (run,subrun) = (%s, %s)' % (x[0],x[1]))
            self.error('Not found: %s' % (in_file) )
            self.error('Or not found: %s' % (in_json) )
            status = 100
            status = ds_status( project = self._project,
                                run     = int(x[0]),
                                subrun  = int(x[1]),
                                seq     = 0,
                                status  = status )
            self.log_status( status )
        # Pretend I'm doing something
        time.sleep(1)
        # Break from loop if counter became 0
        if not ctr: break
    # Sequential mode: nothing left to monitor.
    if not self._parallelize: return
    # Parallel mode: poll all children until done or timed out.
    finished = False
    time_spent = 0
    while not finished:
        finished = True
        time.sleep(1)
        time_spent += 1
        active_counter = 0
        for x in xrange(len(proc_list)):
            if done_list[x]: continue
            if not proc_list[x].poll() is None:
                self.info('Finished copy [%s] @ %s' % (run_id[x], time.strftime('%Y-%m-%d %H:%M:%S')))
                status = ds_status( project = self._project,
                                    run     = run_id[x][0],
                                    subrun  = run_id[x][1],
                                    seq     = 0,
                                    status  = 0 )
                self.log_status( status )
                done_list[x] = True
            else:
                active_counter += 1
                finished = False
        # NOTE(review): `time_spent % 10` is truthy 9 seconds out of 10 — the
        # intent was likely `not time_spent % 10` (print every 10 s). Left
        # unchanged pending confirmation.
        if time_spent % 10:
            self.info('Waiting for copy to be done... (%d/%d processes) ... %d [sec]' % (active_counter, len(proc_list), time_spent))
        if time_spent > self._max_wait:
            self.error('Exceeding the max wait time (%d sec). Terminating the processes...' % self._max_wait)
            for x in xrange(len(proc_list)):
                proc_list[x].kill()
                status_code = 101
                status = ds_status( project = self._project,
                                    run     = run_id[x][0],
                                    subrun  = run_id[x][1],
                                    seq     = 0,
                                    status  = status_code )
                self.log_status( status )
                # hard kill if still alive
                time.sleep(5)
                if proc_list[x].poll() is None:
                    self.error('Process termination failed. Hard-killing it (kill -9 %d)' % proc_list[x].pid)
                    subprocess.call(['kill','-9',str(proc_list[x].pid)])
            break
    self.info('All finished @ %s' % time.strftime('%Y-%m-%d %H:%M:%S'))
def process_newruns(self):
    """Extract metadata for each eligible input file, dump it to a sidecar
    .json, and sanity-check against samweb; log the resulting status.

    Status codes logged: 2 metadata written + samweb OK, 3 extracted (pre-
    samweb), 100 extraction/samweb failure, 1000 metadata path not taken.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return
    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    # self.info('Here, self._nruns=%d ... ' % (self._nruns))
    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_xtable_runs([self._project, self._parent_project], [1, 0]):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))
        # Report starting
        self.info('processing new run: run=%d, subrun=%d ...' % (run, subrun))
        status = 1
        in_file_holder = '%s/%s' % (self._in_dir, self._infile_format % (run, subrun))
        filelist = glob.glob(in_file_holder)
        if (len(filelist) < 1):
            self.error(
                'ERROR: Failed to find the file for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            status_code = 100
            status = ds_status(project=self._project,
                               run=run,
                               subrun=subrun,
                               seq=0,
                               status=status_code)
            self.log_status(status)
            continue
        if (len(filelist) > 1):
            self.error(
                'ERROR: Found too many files for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            self.error('ERROR: List of files found %s' % filelist)
        in_file = filelist[0]
        # The metadata sidecar sits next to the input file.
        out_file = '%s.json' % in_file
        self.info('Found %s' % (in_file))
        if in_file.strip().split('.')[-1] == "ubdaq":
            # Raw DAQ file: dedicated extractor returns (status, dict).
            status, jsonData = self.get_ubdaq_metadata(in_file, run, subrun)
        else:
            # Swizzled (art) file: generic extractor.
            try:
                jsonData = extractor_dict.getmetadata(in_file)
                status = 3
                self.info(
                    'Successfully extract metadata from the swizzled file.'
                )
            except:
                status = 100
                self.error(
                    'Failed extracting metadata from the swizzled file.')
        if not status == 100:
            # Extraction succeeded: persist the metadata as pretty JSON.
            with open(out_file, 'w') as ofile:
                json.dump(jsonData, ofile, sort_keys=True,
                          indent=4, ensure_ascii=False)
            # To Eric: what are you doing here?
            try:
                samweb = samweb_cli.SAMWebClient(experiment="uboone")
                # samweb.validateFileMetadata(json_file) # this throws/raises exception
                status = 2
            except:
                # NOTE(review): self.error is called here with two arguments —
                # confirm the logger accepts that signature.
                self.error("Problem with samweb metadata: ", jsonData)
                self.error(sys.exc_info()[0])
                status = 100
        else:
            # NOTE(review): reconstructed as the else of `if not status == 100`;
            # the message text ('Did not find the input file') suggests it
            # predates a refactor — confirm against VCS history.
            status = 1000
            self.error('Did not find the input file %s' % in_file)
        # Pretend I'm doing something
        time.sleep(1)
        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project,
                           run=int(x[0]),
                           subrun=int(x[1]),
                           seq=0,
                           status=status)
        # Log status
        self.log_status(status)
        # Break from loop if counter became 0
        if not ctr: break
def validate(self): ctr = self._nruns # see if the status=2 files we've processed are indeed where they should be. for x in self.get_runs(self._project,2): # Counter decreases by 1 ctr -=1 run = int(x[0]) subrun = int(x[1]) status_code = 2 in_file_holder = '%s/%s' % (self._in_dir,self._infile_format % (run,subrun)) filelist = glob.glob( in_file_holder ) if (len(filelist)<1): self.error('ERROR: Failed to find the file for (run,subrun) = %s @ %s !!!' % (run,subrun)) status_code=100 status = ds_status( project = self._project, run = run, subrun = subrun, seq = 0, status = status_code ) self.log_status( status ) if (len(filelist)>1): self.error('ERROR: Found too many files for (run,subrun) = %s @ %s !!!' % (run,subrun)) self.error('ERROR: List of files found %s' % filelist) if (len(filelist)>0): in_file = filelist[0] in_file_segments = os.path.basename(in_file).split('-') if (len(in_file_segments)<2): self.error('ERROR: The file %s does not contain the - character' % in_file) self.error('ERROR: So have no idea what to do.') break out_file_prefix = in_file_segments[0] out_file = '%s/%s' % ( self._out_dir, self._outfile_format % (out_file_prefix,run,subrun) ) #res = subprocess.call(['ssh', 'ubdaq-prod-near1', '-x', 'ls', out_file]) res = subprocess.call(['ls', out_file]) if res: self.error('error on run: run=%d, subrun=%d ...' % (run,subrun)) status_code = 102 else: self.info('validated run: run=%d, subrun=%d ...' % (run,subrun)) status_code = 0 # Create a status object to be logged to DB (if necessary) status = ds_status( project = self._project, run = run, subrun = subrun, seq = 0, status = status_code ) # Log status self.log_status( status ) # Break from loop if counter became 0 if not ctr: break
def process_newruns(self):
    """Copy each status-1 input file to the output directory with `cp -v`,
    optionally in parallel; track the child processes, enforce self._max_wait,
    and log per-(run, subrun) statuses to the DB.

    Status codes logged: 3 copy started, 2 copy finished, 100 input missing,
    101 killed on timeout.
    """
    ctr = self._nruns
    proc_list = []   # running Popen handles
    done_list = []   # parallel flags: copy i completed
    run_id = []      # (run, subrun) per copy
    for x in self.get_runs(self._project, 1):
        # Counter decreases by 1
        ctr -= 1
        run = int(x[0])
        subrun = int(x[1])
        # Report starting
        self.info('processing new run: run=%d, subrun=%d ...' % (run, subrun))
        status_code = 1
        in_file_holder = '%s/%s' % (self._in_dir, self._infile_format % (run, subrun))
        filelist = glob.glob( in_file_holder )
        if (len(filelist) < 1):
            self.error('ERROR: Failed to find the file for (run,subrun) = %s @ %s !!!' % (run, subrun))
            status_code = 100
            status = ds_status( project = self._project,
                                run     = run,
                                subrun  = subrun,
                                seq     = 0,
                                status  = status_code )
            self.log_status( status )
            continue
        if (len(filelist) > 1):
            self.error('ERROR: Found too many files for (run,subrun) = %s @ %s !!!' % (run, subrun))
            self.error('ERROR: List of files found %s' % filelist)
        if (len(filelist) > 0):
            in_file = filelist[0]
            # Output name is built from the part of the input name before '-'.
            in_file_segments = os.path.basename(in_file).split('-')
            if (len(in_file_segments) < 2):
                self.error('ERROR: The file %s does not contain the - character' % in_file)
                self.error('ERROR: So have no idea what to do.')
                break
            out_file_prefix = in_file_segments[0]
            out_file = '%s/%s' % ( self._out_dir, self._outfile_format % (out_file_prefix, run, subrun) )
            # cmd = ['rsync', '-v', in_file, 'ubdaq-prod-near1:%s' % out_file]
            # cmd = ['rsync', '-v', in_file, out_file]
            cmd = ['cp', '-v', in_file, out_file]
            proc_list.append(subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE))
            done_list.append(False)
            run_id.append((run, subrun))
            self.info('Started copy (run,subrun)=%s @ %s' % (run_id[-1], time.strftime('%Y-%m-%d %H:%M:%S')))
            # Create a status object to be logged to DB (if necessary)
            status_code = 3
            status = ds_status( project = self._project,
                                run     = run,
                                subrun  = subrun,
                                seq     = 0,
                                status  = status_code )
            self.log_status( status )
            # if not parallelized, wait till proc is done
            if not self._parallelize:
                time_spent = 0
                while ((len(proc_list) > 0) and (proc_list[-1].poll() is None)):
                    time.sleep(1)
                    time_spent += 1
                    if time_spent > self._max_wait:
                        self.error('Exceeding the max wait time (%d sec). Terminating the process...' % self._max_wait)
                        proc_list[-1].kill()
                        # Give SIGKILL escalation a grace period.
                        time.sleep(5)
                        if proc_list[-1].poll() is None:
                            self.error('Process termination failed. Hard-killing it (kill -9 %d)' % proc_list[-1].pid)
                            subprocess.call(['kill','-9',str(proc_list[-1].pid)])
                        break
                self.info('Finished copy [%s] @ %s' % (run_id[-1], time.strftime('%Y-%m-%d %H:%M:%S')))
                status = ds_status( project = self._project,
                                    run     = run_id[-1][0],
                                    subrun  = run_id[-1][1],
                                    seq     = 0,
                                    status  = 2 )
                self.log_status( status )
            # if parallelized, just sleep 5 sec and go next run
            else:
                time.sleep(5)
        # Break from loop if counter became 0
        if not ctr: break
    # if not parallelized, done
    if not self._parallelize: return
    # Parallel mode: poll all children until done or timed out.
    finished = False
    time_spent = 0
    while not finished:
        finished = True
        time.sleep(1)
        time_spent += 1
        active_counter = 0
        for x in xrange(len(proc_list)):
            if done_list[x]: continue
            if not proc_list[x].poll() is None:
                self.info('Finished copy [%s] @ %s' % (run_id[x], time.strftime('%Y-%m-%d %H:%M:%S')))
                status_code = 2
                status = ds_status( project = self._project,
                                    run     = run_id[x][0],
                                    subrun  = run_id[x][1],
                                    seq     = 0,
                                    status  = status_code )
                self.log_status( status )
                done_list[x] = True
            else:
                active_counter += 1
                finished = False
        # NOTE(review): `time_spent % 10` is truthy 9 seconds out of 10 — the
        # intent was likely `not time_spent % 10` (print every 10 s). Left
        # unchanged pending confirmation.
        if time_spent % 10:
            self.info('Waiting for copy to be done... (%d/%d processes) ... %d [sec]' % (active_counter, len(proc_list), time_spent))
        if time_spent > self._max_wait:
            self.error('Exceeding the max wait time (%d sec). Terminating the processes...' % self._max_wait)
            for x in xrange(len(proc_list)):
                proc_list[x].kill()
                status_code = 101
                status = ds_status( project = self._project,
                                    run     = run_id[x][0],
                                    subrun  = run_id[x][1],
                                    seq     = 0,
                                    status  = status_code )
                self.log_status( status )
                # hard kill if still alive
                time.sleep(5)
                if proc_list[x].poll() is None:
                    self.error('Process termination failed. Hard-killing it (kill -9 %d)' % proc_list[x].pid)
                    subprocess.call(['kill','-9',str(proc_list[x].pid)])
            break
    self.info('All finished @ %s' % time.strftime('%Y-%m-%d %H:%M:%S'))
def declare_to_sam(self):
    """Declare files to SAM for each (run, subrun) matching this instance's
    project list (first project forced to status 1); e-mail the experts on
    duplicates or declare failures.

    Status codes logged: 2 declared OK, 101 already at SAM, 102 declareFile
    failed, 100 input file or JSON metadata missing.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return
    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    # self.info('Here, self._nruns=%d ... ' % (self._nruns))
    # This stage consumes entries where the first listed project is in status 1.
    self._project_requirement[0] = 1
    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_xtable_runs(self._project_list,
                                  self._project_requirement):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))
        # Report starting
        self.info('Declaring a file to SAM: run=%d, subrun=%d ...' % (run, subrun))
        status = 1
        # Check input file exists. Otherwise report error
        in_file_holder = '%s/%s' % (self._in_dir, self._infile_format % (run, subrun))
        filelist = glob.glob(in_file_holder)
        if (len(filelist) < 1):
            self.error(
                'ERROR: Failed to find the file for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            status_code = 100
            status = ds_status(project=self._project,
                               run=run,
                               subrun=subrun,
                               seq=0,
                               status=status_code)
            self.log_status(status)
            continue
        if (len(filelist) > 1):
            self.error(
                'ERROR: Found too many files for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            self.error('ERROR: List of files found %s' % filelist)
        in_file = filelist[0]
        in_json = '%s.json' % in_file
        self.info('Declaring ' + in_file + ' to SAM: using ' + in_json)
        if os.path.isfile(in_file) and os.path.isfile(in_json):
            self.info('Found %s' % (in_file))
            self.info('Found %s' % (in_json))
            json_dict = json.load(open(in_json))
            # native SAM python call, instead of a system call
            # make sure you've done get-cert
            # Perhaps we want a try block for samweb?
            samweb = samweb_cli.SAMWebClient(experiment="uboone")
            # Check if the file already exists at SAM
            try:
                in_file_base = os.path.basename(in_file)
                samweb.getMetadata(filenameorid=in_file_base)
                # Already declared: flag (101) and notify the experts.
                status = 101
                # Email the experts
                subject = 'File %s Existing at SAM' % in_file_base
                text = """
File %s has already exists at SAM!

""" % in_file_base
                pub_smtp(os.environ['PUB_SMTP_ACCT'],
                         os.environ['PUB_SMTP_SRVR'],
                         os.environ['PUB_SMTP_PASS'],
                         self._experts, subject, text)
            except samweb_cli.exceptions.FileNotFound:
                # metadata already validated in get_assembler_metadata_file.py
                try:
                    samweb.declareFile(md=json_dict)
                    status = 2
                except Exception as e:
                    # print "Unexpected error: samweb declareFile problem: "
                    self.error(
                        "Unexpected error: samweb declareFile problem: ")
                    self.error("%s" % e)
                    # NOTE(review): traceback.print_exc() returns None (it
                    # prints to stderr), so the e-mail body interpolates
                    # 'None' here — confirm format_exc() was intended.
                    subject = "samweb declareFile problem: %s" % in_file_base
                    text = """
File %s failed to be declared to SAM!

%s
""" % (in_file_base, traceback.print_exc())
                    pub_smtp(os.environ['PUB_SMTP_ACCT'],
                             os.environ['PUB_SMTP_SRVR'],
                             os.environ['PUB_SMTP_PASS'],
                             self._experts, subject, text)
                    # print "Give some null properties to this meta data"
                    self.error("Give this file a status 102")
                    status = 102
        else:
            # Input file and/or its JSON metadata are missing.
            status = 100
        # Pretend I'm doing something
        time.sleep(1)
        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project,
                           run=int(x[0]),
                           subrun=int(x[1]),
                           seq=0,
                           status=status)
        # Log status
        self.log_status(status)
        # Break from loop if counter became 0
        if not ctr: break
def validate_sam(self):
    """Confirm each declared file (first listed project in status 2) is known
    to SAM; log 0 when found, 1 when not found, 100 when the local input file
    is missing.
    """
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return
    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    # self.info('Here, self._nruns=%d ... ' % (self._nruns) )
    # This stage consumes entries where the first listed project is in status 2.
    self._project_requirement[0] = 2
    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    # for x in [(391,10,0,0)]:
    for x in self.get_xtable_runs(self._project_list,
                                  self._project_requirement):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))
        # Report starting
        self.info('Checking the SAM declaration: run=%d, subrun=%d ...' % (run, subrun))
        status = 12
        in_file_holder = '%s/%s' % (self._in_dir, self._infile_format % (run, subrun))
        filelist = glob.glob(in_file_holder)
        if (len(filelist) < 1):
            self.error(
                'ERROR: Failed to find the file for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            status_code = 100
            status = ds_status(project=self._project,
                               run=run,
                               subrun=subrun,
                               seq=0,
                               status=status_code)
            self.log_status(status)
            continue
        if (len(filelist) > 1):
            self.error(
                'ERROR: Found too many files for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            self.error('ERROR: List of files found %s' % filelist)
        in_file = filelist[0]
        in_file_base = os.path.basename(in_file)
        samweb = samweb_cli.SAMWebClient(experiment="uboone")
        # Check if the file already exists at SAM
        try:
            samweb.getMetadata(filenameorid=in_file_base)
            status = 0
        except samweb_cli.exceptions.FileNotFound:
            # Not declared yet: send back for (re-)declaration.
            status = 1
        # Pretend I'm doing something
        time.sleep(1)
        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project,
                           run=int(x[0]),
                           subrun=int(x[1]),
                           seq=0,
                           status=status)
        # Log status
        self.log_status(status)
        # Break from loop if counter became 0
        if not ctr: break
def calculate_checksum(self):
    """Compute the enstore CRC for each run/subrun file currently at
    status 1 and log the result to the project DB (0 = success,
    100 = checksum failure, 200 = file not found); failures are e-mailed
    to the experts list."""
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return
    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    self.info('Here, self._nruns=%d ... ' % (self._nruns))
    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_runs(self._project, 1):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))
        # Report starting
        self.info('Calculating the file checksum: run=%d, subrun=%d ...'
                  % (run, subrun))
        statusCode = 1
        in_file_name = self._infile_format % (run, subrun)
        in_file_holder = '%s/%s' % (self._in_dir, in_file_name)
        filelist = glob.glob(in_file_holder)
        if (len(filelist) > 1):
            self.error(
                'ERROR: There is more than one file matching that pattern: %s'
                % in_file_name)
        if (len(filelist) < 1):
            errorMessage = "Failed to find file%s" % in_file_holder
            subject = "get_checksum_temp Failed to find file%s" % in_file_holder
            # BUGFIX: this branch previously formatted the e-mail with
            # 'in_file', which is only assigned in the else-branch below,
            # raising NameError; report the glob pattern instead.
            text = """File: %s

Error message:

%s
""" % (in_file_holder, errorMessage)
            pub_smtp(os.environ['PUB_SMTP_ACCT'], os.environ['PUB_SMTP_SRVR'],
                     os.environ['PUB_SMTP_PASS'], self._experts, subject, text)
            statusCode = 200
        else:
            in_file = filelist[0]
            metadata = {}
            try:
                metadata['crc'] = samweb_client.utility.fileEnstoreChecksum(
                    in_file)
                self._data = metadata['crc']['crc_value']
                statusCode = 0
            except Exception:
                # BUGFIX: traceback.print_exc() prints to stderr and returns
                # None; format_exc() returns the traceback text so the e-mail
                # body actually contains the error.
                errorMessage = traceback.format_exc()
                subject = 'Failed to obtain the checksum of the file %s' % in_file
                text = """File: %s

Error message:

%s
""" % (in_file, errorMessage)
                pub_smtp(os.environ['PUB_SMTP_ACCT'],
                         os.environ['PUB_SMTP_SRVR'],
                         os.environ['PUB_SMTP_PASS'],
                         self._experts, subject, text)
                statusCode = 100
        # Create a status object to be logged to DB (if necessary)
        # NOTE(review): on failure paths self._data may be unset or stale
        # from a previous iteration -- confirm intended.
        status = ds_status(project=self._project,
                           run=run,
                           subrun=subrun,
                           seq=0,
                           status=statusCode,
                           data=self._data)
        # Log status
        self.log_status(status)
        # Break from loop if counter became 0
        if not ctr:
            break
def process_newruns(self):
    """For each (run, subrun) where this project is at status 1 and the
    parent project at status 0, extract file metadata, dump it as JSON next
    to the output file, and log the resulting status to the project DB
    (2 = ok, 3 = metadata extracted, 100 = failure, 1000 = input missing)."""
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return
    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    # self.info('Here, self._nruns=%d ... ' % (self._nruns))
    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_xtable_runs([self._project, self._parent_project], [1, 0]):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))
        # Report starting
        self.info('processing new run: run=%d, subrun=%d ...' % (run, subrun))
        status = 1
        # Check input file exists. Otherwise report error
        in_file = '%s/%s' % (self._in_dir, self._infile_format % (run, subrun))
        out_file = '%s/%s' % (self._out_dir, self._outfile_format % (run, subrun))
        #
        # Looks fine now, but if there are new troubles: run this project with NRUNS=1
        #
        if os.path.isfile(in_file):
            self.info('Found %s' % (in_file))
            # shutil.copyfile(in_file,out_file)
            # Raw .ubdaq files get metadata from the dedicated helper;
            # anything else is assumed to be a swizzled artroot file.
            if in_file.strip().split('.')[-1] == "ubdaq":
                status, jsonData = self.get_ubdaq_metadata(in_file, run, subrun)
            else:
                try:
                    jsonData = extractor_dict.getmetadata(in_file)
                    status = 3
                    self.info('Successfully extract metadata from the swizzled file.')
                except:
                    status = 100
                    self.error('Failed extracting metadata from the swizzled file.')
            if not status == 100:
                # Persist the extracted metadata as pretty-printed JSON.
                with open(out_file, 'w') as ofile:
                    json.dump(jsonData, ofile, sort_keys=True, indent=4,
                              ensure_ascii=False)
                # To Eric: what are you doing here?
                # NOTE(review): only the client construction is inside this
                # try, and the validate call is commented out, so status = 2
                # is effectively unconditional here -- confirm intent.
                try:
                    samweb = samweb_cli.SAMWebClient(experiment="uboone")
                    # samweb.validateFileMetadata(json_file) # this throws/raises exception
                    status = 2
                except:
                    print "Problem with samweb metadata: ", jsonData
                    print sys.exc_info()[0]
                    status = 100
        else:
            status = 1000
            self.error('Did not find the input file %s' % in_file)
        # Pretend I'm doing something
        time.sleep(1)
        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project,
                           run=int(x[0]),
                           subrun=int(x[1]),
                           seq=0,
                           status=status)
        # Log status
        self.log_status(status)
        # Break from loop if counter became 0
        if not ctr:
            break
def validate(self):
    """Validate swizzling for each (run, subrun) at status 2 by grepping the
    per-run lar log file for the Art success message; log status 0 on
    success, 101 after more than 3 failed trials (trial count is kept in the
    status row's data field)."""
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return
    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_runs(self._project, 2):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))
        # Per-(run,subrun) lar log file produced by the swizzling step.
        self._log_file_local = self._log_file + str(run) + "_" + str(
            subrun) + ".txt"
        # Report starting
        self.info('validating run: run=%d, subrun=%d ...' % (run, subrun))
        status = 1
        in_file_holder = '%s/%s' % (self._in_dir,
                                    self._infile_format % (run, subrun))
        filelist = glob.glob(in_file_holder)
        if (len(filelist) < 1):
            self.error(
                'ERROR: Failed to find the file for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            status_code = 100
            status = ds_status(project=self._project, run=run, subrun=subrun,
                               seq=0, status=status_code)
            self.log_status(status)
            continue
        if (len(filelist) > 1):
            self.error(
                'ERROR: Found too many files for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            self.error('ERROR: List of files found %s' % filelist)
        in_file = filelist[0]
        in_file_base_no_ext = os.path.splitext(os.path.basename(in_file))[0]
        out_file_base = '%s.root' % in_file_base_no_ext
        out_file = '%s/%s' % (self._out_dir, out_file_base)
        # Get status object
        proj_status = self._api.get_status(
            ds_status(self._project, x[0], x[1], x[2]))
        # get data string for this project for this (run,subrun)
        datastr = proj_status._data
        # variable to hold the number of attempts to run this project on this (run,subrun)
        trial = 0
        if (datastr != ''):
            try:
                trial = int(datastr)
                self.info('Trial number is %i' % trial)
            except:
                self.info(
                    'data field in status was neither string nor integer...')
        # NOTE(review): if the log file does not exist at all, status stays 1
        # and trial is not incremented -- confirm that is intended.
        if os.path.exists(self._log_file_local):
            contents = open(self._log_file_local, 'r').read()
            # NOTE(review): 'find(...) > 0' misses a match at offset 0;
            # '>= 0' may be intended -- confirm.
            if contents.find(
                    'Art has completed and will exit with status 0') > 0:
                self.info(
                    'Swizzling successfully completed for: run=%d, subrun=%d ...'
                    % (run, subrun))
                status = 0
            else:
                self.info(
                    'Swizzling has no corresponding logfile for: run=%d, subrun=%d ...'
                    % (run, subrun))
                trial += 1
                # if we tried this (run,subrun) too many times
                # change status to bad status =
                if (trial > 3):
                    self.info('more than 3 trials...changing status to 101')
                    status = 101
        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project,
                           run=int(x[0]),
                           subrun=int(x[1]),
                           seq=int(x[2]),
                           status=status,
                           data=str(trial))
        # Log status
        self.log_status(status)
        # Break from loop if counter became 0
        if not ctr:
            break
def process_newruns(self):
    """Copy each status-1 input file to the output area via a 'cp'
    subprocess, logging status 3 while the copy runs and 2 when it
    finishes (100 = no input file, 101 = killed on timeout). Copies run
    sequentially or in parallel depending on self._parallelize."""
    ctr = self._nruns
    # Bookkeeping for the launched copy subprocesses, index-aligned:
    # proc_list[i] is the Popen, done_list[i] its completion flag,
    # run_id[i] its (run, subrun).
    proc_list = []
    done_list = []
    run_id = []
    for x in self.get_runs(self._project, 1):
        # Counter decreases by 1
        ctr -= 1
        run = int(x[0])
        subrun = int(x[1])
        # Report starting
        self.info('processing new run: run=%d, subrun=%d ...' % (run, subrun))
        status_code = 1
        in_file_holder = '%s/%s' % (self._in_dir,
                                    self._infile_format % (run, subrun))
        filelist = glob.glob(in_file_holder)
        if (len(filelist) < 1):
            self.error(
                'ERROR: Failed to find the file for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            status_code = 100
            status = ds_status(project=self._project, run=run, subrun=subrun,
                               seq=0, status=status_code)
            self.log_status(status)
            continue
        if (len(filelist) > 1):
            self.error(
                'ERROR: Found too many files for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            self.error('ERROR: List of files found %s' % filelist)
        if (len(filelist) > 0):
            in_file = filelist[0]
            # Output name is built from the part of the file name before
            # the first '-'.
            in_file_segments = os.path.basename(in_file).split('-')
            if (len(in_file_segments) < 2):
                self.error(
                    'ERROR: The file %s does not contain the - character'
                    % in_file)
                self.error('ERROR: So have no idea what to do.')
                break
            out_file_prefix = in_file_segments[0]
            out_file = '%s/%s' % (self._out_dir, self._outfile_format %
                                  (out_file_prefix, run, subrun))
            #cmd = ['rsync', '-v', in_file, 'ubdaq-prod-near1:%s' % out_file]
            #cmd = ['rsync', '-v', in_file, out_file]
            cmd = ['cp', '-v', in_file, out_file]
            proc_list.append(
                subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE))
            done_list.append(False)
            run_id.append((run, subrun))
            self.info('Started copy (run,subrun)=%s @ %s' %
                      (run_id[-1], time.strftime('%Y-%m-%d %H:%M:%S')))
            # Create a status object to be logged to DB (if necessary)
            status_code = 3
            status = ds_status(project=self._project, run=run, subrun=subrun,
                               seq=0, status=status_code)
            self.log_status(status)
            # if not parallelized, wait till proc is done
            if not self._parallelize:
                time_spent = 0
                while ((len(proc_list) > 0)
                       and (proc_list[-1].poll() is None)):
                    time.sleep(1)
                    time_spent += 1
                    if time_spent > self._max_wait:
                        self.error(
                            'Exceeding the max wait time (%d sec). Terminating the process...'
                            % self._max_wait)
                        proc_list[-1].kill()
                        time.sleep(5)
                        if proc_list[-1].poll() is None:
                            self.error(
                                'Process termination failed. Hard-killing it (kill -9 %d)'
                                % proc_list[-1].pid)
                            subprocess.call(
                                ['kill', '-9', str(proc_list[-1].pid)])
                        break
                # NOTE(review): status 2 is logged even when the copy was
                # killed on timeout above -- confirm intended.
                self.info('Finished copy [%s] @ %s' %
                          (run_id[-1], time.strftime('%Y-%m-%d %H:%M:%S')))
                status = ds_status(project=self._project, run=run_id[-1][0],
                                   subrun=run_id[-1][1], seq=0, status=2)
                self.log_status(status)
            # if parallelized, just sleep 5 sec and go next run
            else:
                time.sleep(5)
        if not ctr:
            break
    # if not parallelized, done
    if not self._parallelize:
        return
    # Parallel mode: poll all copies until every one has finished or the
    # overall wait limit is exceeded.
    finished = False
    time_spent = 0
    while not finished:
        finished = True
        time.sleep(1)
        time_spent += 1
        active_counter = 0
        for x in xrange(len(proc_list)):
            if done_list[x]:
                continue
            if not proc_list[x].poll() is None:
                self.info('Finished copy [%s] @ %s' %
                          (run_id[x], time.strftime('%Y-%m-%d %H:%M:%S')))
                status_code = 2
                status = ds_status(project=self._project, run=run_id[x][0],
                                   subrun=run_id[x][1], seq=0,
                                   status=status_code)
                self.log_status(status)
                done_list[x] = True
            else:
                active_counter += 1
                finished = False
        # NOTE(review): this logs on 9 of every 10 seconds;
        # 'time_spent % 10 == 0' was probably intended -- confirm.
        if time_spent % 10:
            self.info(
                'Waiting for copy to be done... (%d/%d processes) ... %d [sec]'
                % (active_counter, len(proc_list), time_spent))
        if time_spent > self._max_wait:
            self.error(
                'Exceeding the max wait time (%d sec). Terminating the processes...'
                % self._max_wait)
            # NOTE(review): kill() is issued to every process, including
            # ones already finished -- confirm harmless.
            for x in xrange(len(proc_list)):
                proc_list[x].kill()
                status_code = 101
                status = ds_status(project=self._project, run=run_id[x][0],
                                   subrun=run_id[x][1], seq=0,
                                   status=status_code)
                self.log_status(status)
                # hard kill if still alive
                time.sleep(5)
                if proc_list[x].poll() is None:
                    self.error(
                        'Process termination failed. Hard-killing it (kill -9 %d)'
                        % proc_list[x].pid)
                    subprocess.call(['kill', '-9', str(proc_list[x].pid)])
            break
    self.info('All finished @ %s' % time.strftime('%Y-%m-%d %H:%M:%S'))
def process_newruns(self):
    """Launch a 'lar' swizzling job for each eligible (run, subrun) --
    project at status 1, parent at status 0 -- subject to disk/cpu/memory
    resource limits, then block and monitor the launched jobs until they
    all finish or exceed self._proc_lifetime. Per-run status codes:
    3 = job launched, 2 = finished, 100 = command build failure,
    404 = input missing."""
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return
    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    self.info('Here, self._nruns=%d ... ' % (self._nruns))
    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_xtable_runs([self._project, self._parent_project],
                                  [1, 0]):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))
        # Per-(run,subrun) lar log file name.
        self._log_file_local = self._log_file + str(run) + "_" + str(
            subrun) + ".txt"
        # Report starting
        self.info('processing new run: run=%d, subrun=%d ...' % (run, subrun))
        status = 1
        # Check input file exists. Otherwise report error
        in_file_holder = '%s/%s' % (self._in_dir,
                                    self._infile_format % (run, subrun))
        filelist = glob.glob(in_file_holder)
        if (len(filelist) < 1):
            self.error(
                'ERROR: Failed to find the file for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            status_code = 100
            status = ds_status(project=self._project, run=run, subrun=subrun,
                               seq=0, status=status_code)
            self.log_status(status)
            continue
        if (len(filelist) > 1):
            self.error(
                'ERROR: Found too many files for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            self.error('ERROR: List of files found %s' % filelist)
        in_file = filelist[0]
        in_file_base_no_ext = os.path.splitext(os.path.basename(in_file))[0]
        out_file_base = '%s.root' % in_file_base_no_ext
        out_file = '%s/%s' % (self._out_dir, out_file_base)
        #
        #
        # print "Looking for ", in_file
        # This is a hackity, hacky hack. But for now, run 1409 is the first run taken with the cable swap performed on Thurs Aug 13, 2015.
        # I'm putting this hack into place so that we don't swizzle the data before that with uboonecode v04_19_00 since there is not
        # yet an interval of validity for the database and channel mapping. Basically, the swizzler_data.py will ignore anything before
        #run 1409 and just not process it.
        if not run > 1408:
            continue
        if not os.path.isfile(in_file):
            self.error('Could not find file %s. Assigning status 404'
                       % in_file)
            status = 404
        else:
            self.info('Found %s' % (in_file))
            try:
                # setup the LArSoft envt
                # print "Putting together cmd "
                # print "\t fcl_file ", self._fcl_file_new
                # print "\t in_file", in_file
                # print "\t out_file", out_file
                # print "\t basename out_file", os.path.basename(out_file)
                # print "\t _log_file", self._log_file_local
                # NOTE(review): str.strip(".root") strips characters, not the
                # suffix -- it also eats leading/trailing 'r','o','t','.'
                # from the basename; confirm acceptable for these names.
                cmd = "lar -c " + self._fcl_file_new + " -s " + in_file + " -o " + out_file + " -T " + self._out_dir + "/" + os.path.basename(
                    out_file).strip(".root") + "_hist.root "
                # print "cmd is ", cmd
                self.info('Launch cmd is ' + cmd)
            except:
                self.error(sys.exc_info()[0])
                # print "Give some null properties to this meta data"
                self.error("Give this file a status 100")
                status = 100
            if not status == 100:
                # form the lar command
                # Check available space
                # Disk usage (%) of the input area; an 'ssh host "df dir"'
                # is used when _in_dir is of the form host:dir.
                if ":" in self._in_dir:
                    disk_frac_used = int(
                        os.popen('ssh -x %s "df %s" | tail -n1'
                                 % tuple(self._in_dir.split(":"))).read()
                        .split()[4].strip("%"))
                else:
                    disk_frac_used = int(
                        os.popen('df %s | tail -n1'
                                 % (self._in_dir)).read().split()[4]
                        .strip("%"))
                # NOTE(review): these raise Exception outside any local try,
                # so a resource limit aborts the whole method, not just this
                # run -- confirm intended.
                if (disk_frac_used > self._disk_frac_limit):
                    self.info(
                        '%i%% of disk space used (%s), will not swizzle until %i%% is reached.'
                        % (disk_frac_used, self._in_dir,
                           self._disk_frac_limit))
                    status = 1
                    raise Exception(
                        " raising Exception: not enough disk space.")
                # Check available cpu
                cpu_used = float(
                    os.popen(
                        "top -bn 2 -d 0.01 | grep '^Cpu.s.' | tail -n 1 | gawk '{print $2+$4+$6}'"
                    ).read().strip("\n"))
                if (cpu_used > self._cpu_frac_limit):
                    self.info(
                        '%i of cpu used; will not swizzle until %i is reached.'
                        % (cpu_used, self._cpu_frac_limit))
                    status = 1
                    raise Exception(" raising Exception: not enough cpu.")
                # Check available memory
                mem_avail = float(
                    os.popen(
                        "free -m | grep buffers | tail -n 1| gawk '{print $4}'"
                    ).read().strip("\n"))
                if (mem_avail < int(self._available_memory)):
                    self.info(
                        '%d Memory available, will not swizzle until %d is reached.'
                        % (mem_avail, int(self._available_memory)))
                    status = 1
                    raise Exception(
                        " raising Exception: not enough memory available.")
                # Launch the lar job and register it in the index-aligned
                # bookkeeping lists.
                self._proc_list.append(
                    sub.Popen(cmd, shell=True, stderr=sub.PIPE,
                              stdout=sub.PIPE))
                self._log_file_list.append(self._log_file_local)
                self._run_list.append(x[0])
                self._subrun_list.append(x[1])
                self.info(' Swizzling (run,subrun,processID) = (%d,%d,%d)...'
                          % (run, subrun, self._proc_list[-1].pid))
                self._proc_active.append(True)
                status = 3
                time.sleep(1)
        # Create a status object to be logged to DB (if necessary)
        self.info('logging (run,subrun) = (%i,%i) with status %i'
                  % (int(x[0]), int(x[1]), status))
        status = ds_status(project=self._project, run=int(x[0]),
                           subrun=int(x[1]), seq=0, status=status)
        # Log status
        self.log_status(status)
        # Break from run/subrun loop if counter became 0
        if not ctr:
            break
    #############################################################################################
    # NOTE that the below "poll" solution deadlocks with piping. Yet, I can't read to break the deadlock till
    # done, so let's for now just block each process till done. I need 'em all anyway before I can proceed.
    # The real time cost here seems to be in the reading (out,err) into local files. And then grep'ing
    # for the success mesage -- "Art has completed ...".
    #############################################################################################
    # Now continually loop over all the running processes and ask for them each to be finished before we break out
    # while (1):
    # proc_alive=False
    time_spent = 0
    while 1:
        active_counter = 0
        time.sleep(5)
        time_spent += 5
        for x in xrange(len(self._proc_list)):
            proc = self._proc_list[x]
            if not self._proc_active[x]:
                continue
            if not proc.poll() is None:
                # Job finished: capture its output into the log file and
                # record its return code (2 = clean exit).
                self._proc_active[x] = False
                self.info('The return code was %d ' % proc.returncode)
                self.info('Finished swizzler process %s' % proc.pid)
                (out, err) = proc.communicate()
                fout = open(str(self._log_file_list[x]), 'w')
                fout.write(out)
                fout.close()
                if proc.returncode != 0:
                    status = proc.returncode
                else:
                    status = 2
                status = ds_status(project=self._project,
                                   run=int(self._run_list[x]),
                                   subrun=int(self._subrun_list[x]),
                                   seq=0, status=status)
                # Log status
                self.log_status(status)
            else:
                active_counter += 1
        if not active_counter:
            break
        if time_spent % 20 == 0:
            self.info('Swizzling process %d/%d active... @ %d [sec]'
                      % (active_counter, len(self._proc_list), time_spent))
        else:
            self.debug('Swizzling process %d/%d active... @ %d [sec]'
                       % (active_counter, len(self._proc_list), time_spent))
        if time_spent > self._proc_lifetime:
            # NOTE(review): two args passed logging-style; self.error may
            # expect a single pre-formatted string -- confirm.
            self.error(
                'Exceeding the allowed time span (%d [sec])! Killing lar jobs...',
                self._proc_lifetime)
            # Loop over & kill
            for x in xrange(len(self._proc_list)):
                proc = self._proc_list[x]
                # ignore already finished ones
                if not self._proc_active[x]:
                    continue
                # kill
                proc.kill()
                # Log "finished" status
                status = 2
                status = ds_status(project=self._project,
                                   run=int(self._run_list[x]),
                                   subrun=int(self._subrun_list[x]),
                                   seq=0, status=status)
                self.log_status(status)
            # Wait 30 sec and make sure they are dead
            time.sleep(30)
            for x in xrange(len(self._proc_list)):
                proc = self._proc_list[x]
                if not self._proc_active[x]:
                    continue
                if proc.poll() is None:
                    self.warning(
                        'Process %d not ending 30 sec after SIGINT... kill -9 now...'
                        % proc.pid)
                    sub.call(['kill', '-9', str(proc.pid)])
def error_handle(self):
    """Clean up runs at error status 100: delete the stale .json metadata
    file next to the input file and reset the run to status 1 so it gets
    reprocessed."""
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return
    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_runs(self._project, 100):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))
        # Report starting
        self.info('cleaning failed run: run=%d, subrun=%d ...' % (run, subrun))
        status = 1
        in_file_holder = '%s/%s' % (self._in_dir,
                                    self._infile_format % (run, subrun))
        filelist = glob.glob(in_file_holder)
        if (len(filelist) < 1):
            self.error(
                'ERROR: Failed to find the file for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            status_code = 100
            status = ds_status(project=self._project,
                               run=run,
                               subrun=subrun,
                               seq=0,
                               status=status_code)
            self.log_status(status)
            continue
        if (len(filelist) > 1):
            self.error(
                'ERROR: Found too many files for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            self.error('ERROR: List of files found %s' % filelist)
        in_file = filelist[0]
        out_file = '%s.json' % in_file
        if os.path.isfile(out_file):
            # FIX: use os.remove() instead of os.system('rm %s' % out_file);
            # the shell form breaks on paths containing spaces or shell
            # metacharacters and spawns a needless subprocess.
            os.remove(out_file)
        # Pretend I'm doing something
        time.sleep(1)
        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project,
                           run=int(x[0]),
                           subrun=int(x[1]),
                           seq=0,
                           status=status)
        # Log status
        self.log_status(status)
        # Break from loop if counter became 0
        if not ctr:
            break
def compare_dropbox_checksum(self):
    """Compare the reference project's adler32(seed 0) checksum, converted
    to adler32(seed 1), against the PNFS checksum of the dropbox copy; if
    the dropbox file is gone, fall back to the enstore checksum recorded in
    SAM. Status codes: 0 = match/verified, 10 = SAM checksum not enstore,
    100 = not in SAM, 1000 = mismatch (experts e-mailed)."""
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return
    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    self.info('Here, self._nruns=%d ... ' % (self._nruns))
    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_xtable_runs([self._project, self._parent_project],
                                  [1, 0]):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))
        # Report starting
        self.info('Calculating the file checksum: run=%d, subrun=%d ...'
                  % (run, subrun))
        statusCode = 1
        in_file_holder = '%s/%s' % (self._in_dir,
                                    self._infile_format % (run, subrun))
        filelist = glob.glob(in_file_holder)
        if (len(filelist) < 1):
            self.error(
                'ERROR: Failed to find the file for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            status_code = 100
            status = ds_status(project=self._project, run=run, subrun=subrun,
                               seq=0, status=status_code)
            self.log_status(status)
            continue
        if (len(filelist) > 1):
            self.error(
                'ERROR: Found too many files for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            self.error('ERROR: List of files found %s' % filelist)
        in_file = filelist[0]
        in_file_name = os.path.basename(in_file)
        out_file = '%s/%s' % (self._out_dir, in_file_name)
        #Note that this has the sequence number hard coded as number 0
        # Reference checksum recorded by the reference project.
        RefStatus = self._api.get_status(
            ds_status(self._ref_project, run, subrun, 0))
        near1_checksum = RefStatus._data
        try:
            # PNFS layer-1 adler32 + file size for the dropbox copy;
            # get_pnfs_1_adler32_and_size raises LookupError if absent.
            pnfs_adler32_1, pnfs_size = get_pnfs_1_adler32_and_size(out_file)
            # Convert the seed-0 reference checksum to seed-1 for comparison.
            near1_adler32_1 = convert_0_adler32_to_1_adler32(
                near1_checksum, pnfs_size)
            if near1_adler32_1 == pnfs_adler32_1:
                statusCode = 0
            else:
                # Mismatch: alert the experts and mark status 1000.
                subject = 'Checksum different in run %d, subrun %d between %s and PNFS' % (
                    run, subrun, self._ref_project)
                text = '%s\n' % subject
                text += 'Run %d, subrun %d\n' % (run, subrun)
                text += 'Converted %s checksum: %s\n' % (self._ref_project,
                                                         near1_adler32_1)
                text += 'Converted PNFS checksum: %s\n' % (pnfs_adler32_1)
                pub_smtp(os.environ['PUB_SMTP_ACCT'],
                         os.environ['PUB_SMTP_SRVR'],
                         os.environ['PUB_SMTP_PASS'],
                         self._experts, subject, text)
                statusCode = 1000
            # Record both checksums in the status data field.
            self._data = '%s:%s;PNFS:%s' % (self._ref_project,
                                            near1_adler32_1, pnfs_adler32_1)
        except LookupError:
            # Dropbox copy is gone -- fall back to SAM/enstore metadata.
            self.warning("Could not find file in the dropbox %s" % out_file)
            self.warning("Gonna go looking on tape %s" % in_file_name)
            samweb = samweb_cli.SAMWebClient(experiment="uboone")
            meta = {}
            try:
                meta = samweb.getMetadata(filenameorid=in_file_name)
                # Expected form: 'enstore:<crc>' -- TODO confirm schema.
                checksum_info = meta['checksum'][0].split(':')
                if checksum_info[0] == 'enstore':
                    self._data = checksum_info[1]
                    statusCode = 0
                else:
                    statusCode = 10
            except samweb_cli.exceptions.FileNotFound:
                subject = 'Failed to locate file %s at SAM' % in_file
                text = 'File %s is not found at SAM!' % in_file
                pub_smtp(os.environ['PUB_SMTP_ACCT'],
                         os.environ['PUB_SMTP_SRVR'],
                         os.environ['PUB_SMTP_PASS'],
                         self._experts, subject, text)
                statusCode = 100
        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project,
                           run=run,
                           subrun=subrun,
                           seq=0,
                           status=statusCode,
                           data=self._data)
        # Log status
        self.log_status(status)
        # Break from loop if counter became 0
        if not ctr:
            break
def process_newruns(self):
    """Variant of the metadata-extraction step: for each (run, subrun) with
    this project at status 1 and the parent at 0, extract metadata from the
    input file, dump it as <in_file>.json, and log the outcome (2 = ok,
    3 = extracted, 100 = failure, 1000 = input missing)."""
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return
    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    # self.info('Here, self._nruns=%d ... ' % (self._nruns))
    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_xtable_runs([self._project, self._parent_project], [1, 0]):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))
        # Report starting
        self.info('processing new run: run=%d, subrun=%d ...' % (run, subrun))
        status = 1
        in_file_holder = '%s/%s' % (self._in_dir,
                                    self._infile_format % (run, subrun))
        filelist = glob.glob(in_file_holder)
        if (len(filelist) < 1):
            self.error(
                'ERROR: Failed to find the file for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            status_code = 100
            status = ds_status(project=self._project, run=run, subrun=subrun,
                               seq=0, status=status_code)
            self.log_status(status)
            continue
        if (len(filelist) > 1):
            self.error(
                'ERROR: Found too many files for (run,subrun) = %s @ %s !!!'
                % (run, subrun))
            self.error('ERROR: List of files found %s' % filelist)
        in_file = filelist[0]
        # JSON metadata sits right next to the input file.
        out_file = '%s.json' % in_file
        self.info('Found %s' % (in_file))
        # Raw .ubdaq files get metadata via the dedicated helper; anything
        # else is treated as a swizzled artroot file.
        if in_file.strip().split('.')[-1] == "ubdaq":
            status, jsonData = self.get_ubdaq_metadata(in_file, run, subrun)
        else:
            try:
                jsonData = extractor_dict.getmetadata(in_file)
                status = 3
                self.info('Successfully extract metadata from the swizzled file.')
            except:
                status = 100
                self.error('Failed extracting metadata from the swizzled file.')
        if not status == 100:
            with open(out_file, 'w') as ofile:
                json.dump(jsonData, ofile, sort_keys=True, indent=4,
                          ensure_ascii=False)
            # To Eric: what are you doing here?
            # NOTE(review): only the client construction is inside this try
            # (the validate call is commented out), so status = 2 is
            # effectively unconditional -- confirm intent. Also the two-arg
            # self.error(...) calls below assume a logging-style signature;
            # confirm self.error accepts extra positional args.
            try:
                samweb = samweb_cli.SAMWebClient(experiment="uboone")
                # samweb.validateFileMetadata(json_file) # this throws/raises exception
                status = 2
            except:
                self.error("Problem with samweb metadata: ", jsonData)
                self.error(sys.exc_info()[0])
                status = 100
        else:
            # NOTE(review): dead branch -- status 1000 is set only when
            # status == 100, then immediately overwritten semantics-wise;
            # the "Did not find" message does not match this condition.
            status = 1000
            self.error('Did not find the input file %s' % in_file)
        # Pretend I'm doing something
        time.sleep(1)
        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project,
                           run=int(x[0]),
                           subrun=int(x[1]),
                           seq=0,
                           status=status)
        # Log status
        self.log_status(status)
        # Break from loop if counter became 0
        if not ctr:
            break
def calculate_checksum(self):
    """Compute the enstore CRC for each run/subrun file currently at
    status 1 and log the result to the project DB (0 = success,
    100 = checksum failure, 200 = file not found); failures are e-mailed
    to the experts list."""
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return
    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        self.get_resource()
    self.info('Here, self._nruns=%d ... ' % (self._nruns))
    # Fetch runs from DB and process for
    # runs specified for this instance.
    ctr = self._nruns
    for x in self.get_runs(self._project, 1):
        # Counter decreases by 1
        ctr -= 1
        (run, subrun) = (int(x[0]), int(x[1]))
        # Report starting
        self.info('Calculating the file checksum: run=%d, subrun=%d ...'
                  % (run, subrun))
        statusCode = 1
        in_file_name = self._infile_format % (run, subrun)
        in_file_holder = '%s/%s' % (self._in_dir, in_file_name)
        filelist = glob.glob(in_file_holder)
        if (len(filelist) > 1):
            self.error(
                'ERROR: There is more than one file matching that pattern: %s'
                % in_file_name)
        if (len(filelist) < 1):
            errorMessage = "Failed to find file%s" % in_file_holder
            subject = "get_checksum_temp Failed to find file%s" % in_file_holder
            # BUGFIX: this branch previously formatted the e-mail with
            # 'in_file', which is only assigned in the else-branch below,
            # raising NameError; report the glob pattern instead.
            text = """File: %s

Error message:

%s
""" % (in_file_holder, errorMessage)
            pub_smtp(os.environ['PUB_SMTP_ACCT'], os.environ['PUB_SMTP_SRVR'],
                     os.environ['PUB_SMTP_PASS'], self._experts, subject, text)
            statusCode = 200
        else:
            in_file = filelist[0]
            metadata = {}
            try:
                metadata['crc'] = samweb_client.utility.fileEnstoreChecksum(
                    in_file)
                self._data = metadata['crc']['crc_value']
                statusCode = 0
            except Exception:
                # BUGFIX: traceback.print_exc() prints to stderr and returns
                # None; format_exc() returns the traceback text so the e-mail
                # body actually contains the error.
                errorMessage = traceback.format_exc()
                subject = 'Failed to obtain the checksum of the file %s' % in_file
                text = """File: %s

Error message:

%s
""" % (in_file, errorMessage)
                pub_smtp(os.environ['PUB_SMTP_ACCT'],
                         os.environ['PUB_SMTP_SRVR'],
                         os.environ['PUB_SMTP_PASS'],
                         self._experts, subject, text)
                statusCode = 100
        # Create a status object to be logged to DB (if necessary)
        # NOTE(review): on failure paths self._data may be unset or stale
        # from a previous iteration -- confirm intended.
        status = ds_status(project=self._project,
                           run=run,
                           subrun=subrun,
                           seq=0,
                           status=statusCode,
                           data=self._data)
        # Log status
        self.log_status(status)
        # Break from loop if counter became 0
        if not ctr:
            break
def validate(self):
    """Parse each (run, subrun) info file for beam-event counts and verify
    the corresponding beam .dat files exist and are non-empty; log status 0
    on success. Aborts the whole pass if a beam file is missing or empty."""
    # Attempt to connect DB. If failure, abort
    if not self.connect():
        self.error('Cannot connect to DB! Aborting...')
        return
    # If resource info is not yet read-in, read in.
    if self._nruns is None:
        resource = self._api.get_resource(self._project)
        self._nruns = int(resource['NRUNS'])
        self._infodir = resource['INFODIR']
        self._infofile = resource['INFOFILE']
    ctr = self._nruns
    for x in self.get_runs(self._project, 2):
        # Counter decreases by 1
        ctr -= 1
        run = int(x[0])
        subrun = int(x[1])
        status = 0
        fname = '%s/%s' % (self._infodir, self._infofile % (run, subrun))
        # check that info was created and look for beam events
        # if beam events, check for beam file
        self.info('Parse info file %s and check created files' % fname)
        if not os.path.isfile(fname):
            # change status?
            self.error('%s not created' % fname)
            # BUGFIX: skip to the next run; the original fell through and
            # crashed with IOError when open()-ing the missing file.
            continue
        # FIX: context manager closes the info file (it leaked before).
        with open(fname) as info_file:
            for line in info_file:
                if "events" in line:
                    wds = line.split()
                    # wds[2] is the event count; wds[0] presumably names
                    # the beam line -- TODO confirm info-file format.
                    if int(wds[2]) > 0:
                        beamfname = '%s/beam_%s_%07i_%05i.dat' % (
                            self._infodir, wds[0], run, subrun)
                        if not os.path.isfile(beamfname):
                            # change to appropriate status
                            self.error('%s not created' % beamfname)
                            return
                        if os.stat(beamfname).st_size == 0:
                            # change to appropriate status
                            self.error('%s is empty' % beamfname)
                            return
        # Create a status object to be logged to DB (if necessary)
        status = ds_status(project=self._project,
                           run=run,
                           subrun=subrun,
                           seq=0,
                           status=status)
        # Log status
        self.log_status(status)
        # Break from loop if counter became 0
        if not ctr:
            break
def process_newruns(self): # Attempt to connect DB. If failure, abort if not self.connect(): self.error('Cannot connect to DB! Aborting...') return # If resource info is not yet read-in, read in. if self._nruns is None: self.get_resource() self.info('Here, self._nruns=%d ... ' % (self._nruns)) # Fetch runs from DB and process for # runs specified for this instance. ctr = self._nruns # Below picks up successfully swizzled files for x in self.get_xtable_runs([self._project,self._parent_project], [1,0]): # Counter decreases by 1 ctr -= 1 (run, subrun) = (int(x[0]), int(x[1])) # Report starting self.info('processing new run: run=%d, subrun=%d ...' % (run,subrun)) status = 100 # Check input file exists. Otherwise report error in_file = '%s/%s' % (self._in_dir,self._infile_format % (run,subrun)) print 'Looking for %s' % (in_file) if os.path.isfile(in_file): self.info('Found %s' % (in_file)) # Check metadata has_metadata = False try: samweb = samweb_cli.SAMWebClient(experiment="uboone") md = samweb.getMetadata(filenameorid=in_file) print 'Here 0' % md self.info('Weirdly, metadata already registered in SAM for %s. ... ' % (in_file)) has_metadata = True except: pass # Should be that metadata is in the artroot file. (But not yet declared it to SAM.) # Thus, retrieve metadata from file; use it to declare file with SAM. if not has_metadata: try: self.info( ' I feel a couple woos comin on, cus ') md = extractor_dict.getmetadata(in_file) self.info( ' ... there it was. ... (Just extracted the meta data from root file.) ') status = 3 try: samweb = samweb_cli.SAMWebClient(experiment="uboone") samweb.declareFile(md=md) status = 2 self.info('Successful extraction of artroot metadata and declaring it to SAM for %s. ... ' % (in_file)) except: self.info('Failed declaring metadata to SAM for %s. ... ' % (in_file)) except: self.info('Failed extracting artroot metadata for %s. ... 
' % (in_file)) # Create a status object to be logged to DB (if necessary) status = ds_status( project = self._project, run = int(x[0]), subrun = int(x[1]), seq = 0, status = status ) # Log status self.log_status( status ) # Break from loop if counter became 0 if not ctr: break
# Script tail: force runs in `table` from old_status to new_status.
# `reader`, `logger`, `table`, `run`, `subrun`, `old_status` and
# `new_status` are defined earlier in this script (not visible here).
writer = ds_writer(pubdb_conn_info.writer_info(), logger)
if not reader.project_exist(table):
    # NOTE(review): only warns -- execution continues into the loop even
    # when the table does not exist; confirm get_runs handles that.
    print 'The table you gave me does not exist: %s' % table
for x in reader.get_runs(table, old_status):
    # Only touch rows matching the requested run; subrun == -1 means
    # "all subruns of that run".
    if run == x[0]:
        if subrun == -1:
            logger.info(
                'In table %s, changing status of run %d, subrun %d from old_status=%d to new_status=%d'
                % (table, int(x[0]), int(x[1]), old_status, new_status))
            updated_status = ds_status(project=table,
                                       run=int(x[0]),
                                       subrun=int(x[1]),
                                       seq=0,
                                       status=new_status)
            writer.log_status(updated_status)
        else:
            if subrun == x[1]:
                logger.info(
                    'In table %s, changing status of run %d, subrun %d from old_status=%d to new_status=%d'
                    % (table, int(x[0]), int(x[1]), old_status, new_status))
                updated_status = ds_status(project=table,
                                           run=int(x[0]),
                                           subrun=int(x[1]),
                                           seq=0,
                                           status=new_status)
                writer.log_status(updated_status)