def findLocalFiles(book, dataset):
    """Return the set of '<directory>/<file>' names found locally for book/dataset.

    The files are listed with ``hdfs dfs -ls`` first. When that listing has
    fewer than two output lines, the ``ls -1`` listing of the /mnt/hadoop
    mount is used instead. Both listings are filtered with ``grep root``.
    """
    print(" INFO - loading local physical files (Tier-3).")

    hdfsCmd = "hdfs dfs -ls /cms/store/user/paus/%s/%s " % (book, dataset) + "|grep root"
    (rc, out, err) = rex.Rex().executeLocalAction(hdfsCmd)

    # in case hdfs is not installed: fall back to the mounted file system
    if len(out.split("\n")) < 2:
        lsCmd = "ls -1 /mnt/hadoop/cms/store/user/paus/%s/%s " % (book, dataset) + "|grep root"
        (rc, out, err) = rex.Rex().executeLocalAction(lsCmd)

    found = set()
    for entry in out.split("\n"):
        # skip empty lines
        if len(entry) < 2:
            continue
        # a full path keeps its last two components, a bare name gets the dataset prepended
        pieces = entry.split("/")
        found.add("/".join(pieces[-2:]) if len(pieces) > 2 else "%s/%s" % (dataset, entry))

    return found
def findAllT3Files(dir):
    """Return a dict mapping '<directory>/<file>' to the file size for all files in dir/*.

    The listing is produced by the ``list`` tool with T2TOOLS_SERVER and
    T2TOOLS_USER exported in the same shell command. Every non-empty listing
    line is also written to the file '.sizes-t3' in the current directory.
    The size is read from the text after ':' in the first space-separated
    field, the name from the last field.
    """
    print(" INFO - loading all physical files on T3.")

    cmd = "export T2TOOLS_SERVER=t3serv015.mit.edu; export T2TOOLS_USER=cmsprod;" \
        + "list %s/* " % (dir) + "|grep root"
    (rc, out, err) = rex.Rex().executeLocalAction(cmd)

    files = {}
    with open(".sizes-t3", "w") as sizesFile:
        for row in out.split("\n"):
            if len(row) < 2:
                continue
            sizesFile.write(row + '\n')

            fields = row.split(" ")
            size = int(fields[0].split(":")[1])
            filename = "/".join(fields[-1].split('/')[-2:])
            files[filename] = size

            # anything below 10 is reported as an empty file
            if size < 10:
                print(" ERROR - zero size file found: %s/%s" % (dir, filename))

    return files
def findAllFiles(dir):
    """Return a dict mapping '<directory>/<file>' to the file size for all files in dir/*.

    The listing comes from the ``list`` tool, filtered with ``grep root``.
    Every non-empty listing line is also written to the file '.sizes' in the
    current directory. The size is read from the text after ':' in the first
    space-separated field, the name from the last field.
    """
    print(" INFO - loading all physical files on T2.")

    (rc, out, err) = rex.Rex().executeLocalAction("list %s/* " % (dir) + "|grep root")

    files = {}
    with open(".sizes", "w") as sizesFile:
        for row in out.split("\n"):
            if len(row) < 2:
                continue
            sizesFile.write(row + '\n')

            fields = row.split(" ")
            size = int(fields[0].split(":")[1])
            filename = "/".join(fields[-1].split('/')[-2:])
            files[filename] = size

            # anything below 10 is reported as an empty file
            if size < 10:
                print(" ERROR - zero size file found: " + filename)

    return files
def executeCondorCmd(self, cmd='condor_q', output=False):
    """Execute a condor command and return its (rc, out, err).

    The command runs through ``executeLocalAction`` when ``self.isLocal()``
    is true and through ``executeAction`` otherwise. Failures are reported on
    stdout; with ``output`` set the command, its output and any error text
    are printed as well.
    """
    if output:
        print(' execute condor command: %s' % (cmd))

    myRx = rex.Rex(self.host, self.user)

    # irc is only provided by executeAction, it stays zero for local execution
    irc = 0
    if self.isLocal():
        (rc, out, err) = myRx.executeLocalAction(cmd)
    else:
        (irc, rc, out, err) = myRx.executeAction(cmd)
        if irc != 0 or rc != 0:
            print(' ERROR -- IRC: %d' % (irc))

    failed = (irc != 0 or rc != 0)
    if failed:
        print(' ERROR -- RC: %d' % (rc))
        print(' ERROR -- ERR:\n%s' % (err))

    if output:
        print(' OUT:\n%s' % (out))
        if err != '':
            print('\n ERR:\n%s' % (err))

    return (rc, out, err)
def findFiles(book, dataset):
    """Return the set of '.err' file stems that are newer than the counts file.

    The ``ls -1t`` listing (newest first) of TRUNC/book/dataset/*.err is
    walked from the top; collection stops at the first name containing
    'ncounts', so only files listed before it are returned.
    """
    print(" INFO - analyzing book:%s dataset:%s." % (book, dataset))

    listing = "ls -1t %s/%s/%s/*.err " % (TRUNC, book, dataset)
    (rc, out, err) = rex.Rex().executeLocalAction(listing)

    files = set()
    for row in out.split("\n"):
        if len(row) < 2:
            continue

        # last path component, cut at its first '.'
        fileName = row.rsplit('/', 1)[-1].split('.', 1)[0]

        # everything from the counts file on was already analyzed
        if 'ncounts' in fileName:
            print(" Found the counts file: %s (%s) --> BREAK" % (row, fileName))
            break

        print(" Adding file: %s" % (fileName))
        files.add(fileName)

    return files
def findX509Proxy(self):
    """Return the file name (last path component) reported by ``voms-proxy-info -path``."""
    (rc, out, err) = rex.Rex().executeLocalAction("voms-proxy-info -path")

    # drop the last character of the output (presumably the trailing newline)
    x509Proxy = out[:-1]
    print(" X509Proxy: " + x509Proxy)

    return x509Proxy.rsplit("/", 1)[-1]
def clearLocalCache(datasetId):
    """Remove WORK_DIR/????/<datasetId>.???? with ``rm -f`` and return the command's rc."""
    removeCmd = 'rm -f %s/????/%s.????' % (WORK_DIR, datasetId)
    (rc, out, err) = rex.Rex().executeLocalAction(removeCmd)
    return rc
def numberOfFiles(config, version, dataset):
    """Return the number of *.root files listed under BASE/config/version/dataset.

    The files are listed with the ``list`` tool (errors discarded) and every
    non-empty output line counts as one file. Splitting the output on
    newlines also yields an empty element after the final newline, or a
    single empty element when nothing is listed at all; those are not files
    and are therefore not counted, so an empty listing gives 0.
    """
    myRx = rex.Rex()
    (rc, out, err) = myRx.executeLocalAction(
        "list %s/%s/%s/%s/*.root 2> /dev/null" % (BASE, config, version, dataset))

    return sum(1 for row in out.split("\n") if row.strip())
def __init__(self, task):
    """Store the task, the local user name, empty scripts and a rex.Rex for the scheduler."""
    self.task = task
    self.localUser = os.getenv('USER')

    # all three scripts start out empty (no '#!/bin/bash\n' header)
    self.logRemoveScript = ''
    self.webRemoveScript = ''
    self.logSaveScript = ''

    # commands are executed with the host and user of the task's scheduler
    scheduler = self.task.scheduler
    self.rex = rex.Rex(scheduler.host, scheduler.user)
def findCmsswVersion(self):
    """Return the CMSSW version found in $KRAKEN_CMSSW/<request version>/.

    The directory is listed with ``ls -1rt``; the last line containing
    'CMSSW_' is used and returned with every 'CMSSW_' removed. An empty
    string is returned when no line matches.
    """
    cmd = "ls -1rt %s/%s/" % (os.getenv('KRAKEN_CMSSW'), self.request.version)
    print(" CMD: " + cmd)
    (rc, out, err) = rex.Rex().executeLocalAction(cmd)

    # the last matching line of the listing wins
    candidates = [line for line in out.split("\n") if 'CMSSW_' in line]
    cmsswVersion = candidates[-1] if candidates else ""
    print(" CMSSW: " + cmsswVersion)

    return cmsswVersion.replace('CMSSW_', '')
def __init__(self, task):
    """Store the task, user and activity from the environment, empty scripts and a rex.Rex."""
    self.task = task

    # values taken from the environment (None when the variable is not set)
    self.localUser = os.getenv('USER')
    self.activity = os.getenv('KRAKEN_ACTIVITY')

    # all three scripts start out empty
    self.logRemoveScript = ''
    self.webRemoveScript = ''
    self.logSaveScript = ''

    # commands are executed with the host and user of the task's scheduler
    scheduler = self.task.scheduler
    self.rex = rex.Rex(scheduler.host, scheduler.user)
def find_files(dir):
    """Return the list of '<directory>/<file>' names of the entries listed for dir/*.

    The listing comes from the ``list`` tool filtered with ``grep root``; the
    name is built from the last two path components of the last
    space-separated field of every non-empty line.
    """
    (rc, out, err) = rex.Rex().executeLocalAction("list %s/* " % (dir) + "|grep root")

    return ["/".join(row.split(" ")[-1].split('/')[-2:])
            for row in out.split("\n")
            if len(row) >= 2]
def findOsVersion(self):
    """Return the OS tag derived from the lib directory of the software installation.

    The tag is the text before the first '_' of the last entry in
    $KRAKEN_SW/<request version>/SW_<swVersion>/lib; 'slc7' is returned when
    the command produces nothing.
    """
    cmd = "ls -1 %s/%s/SW_%s/lib|cut -d_ -f1|tail -1" % (
        os.getenv('KRAKEN_SW'), self.request.version, self.swVersion)
    (rc, out, err) = rex.Rex().executeLocalAction(cmd)

    # decode the raw output and drop its last character; fall back to the default if empty
    osVersion = out.decode()[:-1] or 'slc7'
    print(" OS: " + osVersion)

    return osVersion
def testTier2Disk(debug=0):
    """Check that the Tier-2 disk can be listed and return the listing command's rc.

    With ``debug`` above zero the command and its rc, output and error text
    are printed.
    """
    cmd = "list /cms/store/user/paus 2> /dev/null"
    verbose = debug > 0

    if verbose:
        print(" CMD: %s" % (cmd))

    (rc, out, err) = rex.Rex().executeLocalAction(cmd)

    if verbose:
        print(" RC: %d\n OUT:\n%s\n ERR:\n%s\n" % (rc, out, err))

    return rc
def makeDatasetList(config, version):
    """Return the dataset names found under BASE/config/version.

    Only listing lines starting with 'D:' are considered (grep); the name is
    the last path component of the line and is kept when it is longer than
    four characters. Reads the module-level names BASE and debug.
    """
    listing = "list %s/%s/%s 2> /dev/null | grep ^D:" % (BASE, config, version)
    (rc, out, err) = rex.Rex().executeLocalAction(listing)

    names = (line.split("/")[-1] for line in out.split("\n"))
    datasetList = [name for name in names if len(name) > 4]

    if debug > 0:
        print(" RC: %d\n OUT:\n%s\n ERR:\n%s\n" % (rc, out, err))

    return datasetList
def findSwVersion(self):
    """Return the software version found in $KRAKEN_SW/<request version>/.

    The directory is listed with ``ls -1rt`` and filtered for 'SW_'; the
    last matching line is used with everything up to and including 'SW_'
    removed. An empty string is returned when nothing matches.
    """
    cmd = "ls -1rt %s/%s/ |grep ^.*SW_" % (os.getenv('KRAKEN_SW'), self.request.version)
    print(" CMD: " + cmd)
    (rc, out, err) = rex.Rex().executeLocalAction(cmd)

    # the last matching line of the listing wins
    matches = [line for line in out.decode().split("\n") if 'SW_' in line]
    swVersion = re.sub(r'^.*SW_', '', matches[-1]) if matches else ""
    print(" SW: " + swVersion)

    return swVersion
def findAllFiles(book, dataset):
    """Return the set of '<dataset>/<file>' names listed remotely for book/dataset.

    The listing is done with ``gfal-ls`` on the gsiftp endpoint and filtered
    with ``grep root``; every non-empty output line is taken as a file name.
    """
    print(" INFO - loading all physical files.")

    cmd = "gfal-ls gsiftp://se01.cmsaf.mit.edu:2811/cms/store/user/paus/%s/%s " % (book, dataset) \
        + "|grep root"
    (rc, out, err) = rex.Rex().executeLocalAction(cmd)

    return set("%s/%s" % (dataset, row) for row in out.split("\n") if len(row) >= 2)
def findDatasetProperties(dataset, dbsInst, debug=0):
    """Determine the size, number of files and file catalog of a dataset.

    Depending on the request one of three sources is used:

    * dataset name containing '=': the 'Filesets' file below CATALOG_INPUT
      is read (the name is split at '=' into config, version and dataset,
      the first character of the config part is dropped);
    * dbsInst == 'private': the 'RawFiles.00' file below CATALOG_INPUT is read;
    * otherwise: the DBS file list is requested with curl, using the proxy
      returned by getProxy(). An invalid dataset only triggers a warning.

    Parameters:
        dataset -- dataset name
        dbsInst -- DBS instance, 'private' selects the private catalog
        debug   -- verbosity level

    Returns the tuple (sizeGb, nFiles, lfns), where lfns maps the name of
    each file id to its fileIds.lfn object. sizeGb is the fixed value 10 for
    the two catalog based sources.

    The process is terminated with sys.exit(1) when the curl command fails
    or the total size cannot be converted.
    """
    # catalog of the files found, filled by whichever source applies
    lfns = {}

    if "=" in dataset:                     # this is a dataset produced with Kraken
        # find config, version and original dataset name
        f = dataset.split("=")
        conf = (f[0])[1:]
        vers = f[1]
        dset = f[2].replace("/", "+")

        sizeGb = 10                        # does not matter
        nFiles = 0

        cmd = 'cat %s/%s/%s/%s/Filesets' % (CATALOG_INPUT, conf, vers, dset)
        myRex = rex.Rex()
        (rc, out, err) = myRex.executeLocalAction(cmd)

        for line in out.split("\n"):
            f = line.split()               # whitespace separated fields
            if len(f) > 1:
                nFiles += 1
                name = f[0]
                path = re.sub(r'root://.*/(/store/.*)', r'\1', f[1])
                nEvents = int(f[2])
                fId = fileIds.fileId(name + ".root", nEvents)
                lfn = fileIds.lfn(fId, name, path)
                lfns[fId.getName()] = lfn
                if debug > -1:
                    print(" Adding: %s, %s" % (name, fId.getName()))

        return (sizeGb, nFiles, lfns)

    if dbsInst == 'private':
        print(" Private dataset detected.")

        sizeGb = 10                        # does not matter
        nFiles = 0

        f = dataset.split("/")
        trunc = f[1]
        conf = f[2]
        vers = f[3]
        dset = f[4]

        cmd = 'cat %s/%s/%s/%s/%s/RawFiles.00' % (CATALOG_INPUT, trunc, conf, vers, dset)
        print(" CMD: %s" % cmd)
        myRex = rex.Rex()
        (rc, out, err) = myRex.executeLocalAction(cmd)

        for line in out.split("\n"):
            f = line.split()               # whitespace separated fields
            if len(f) > 1:
                nFiles += 1
                name = (f[0].split('/')[-1]).replace('.root', '')
                block = name[0:20]         # the first 20 characters of the name act as block
                path = "/".join(f[0].split('/')[0:-1])
                path = re.sub(r'root://.*/(/store/.*)', r'\1', path)
                nEvents = int(f[2])
                fId = fileIds.fileId(name + ".root", nEvents)
                lfn = fileIds.lfn(fId, block, path)
                lfns[fId.getName()] = lfn
                if debug > -1:
                    print(" Adding: %s, %s" % (name, path))

        return (sizeGb, nFiles, lfns)

    # dealing with a standard dataset: an invalid dataset is not fatal, it might be in production
    if not isDatasetValid(dataset, dbsInst, debug):
        print(' WARNING - dataset was not found to be valid.')
        print(' - continue and see whether it is in production.')
        print(' - to get all data this call has to be repeated')
        print(' - once the dataset is completed.')
    else:
        print(' INFO - dataset is valid.')

    proxy = getProxy()
    url = 'curl -s --cert %s -k -H "Accept: application/json"'%proxy \
        + ' "https://cmsweb.cern.ch/dbs/prod/global/DBSReader/' \
        + 'files?dataset=%s&detail=true"'%(dataset)

    if debug > 1:
        print(' CURL: ' + url)

    myRex = rex.Rex()
    (rc, out, err) = myRex.executeLocalAction(url)

    if rc != 0:
        print(' ERROR ocurred in %s' % (url))
        sys.exit(1)

    data = json.loads(out)

    units = 'GB'
    nFiles = 0
    totalSize = 0
    for entry in data:
        valid = int(entry["is_file_valid"])
        fileName = entry["logical_file_name"]
        path = "/".join(fileName.split("/")[:-1])
        size = int(entry["file_size"])
        block = entry["block_name"].split("#")[1]
        nEvents = int(entry["event_count"])
        # only files flagged as valid are counted and cataloged
        if valid == 1:
            nFiles += 1
            totalSize += size
            fId = fileIds.fileId(fileName, nEvents)
            lfn = fileIds.lfn(fId, block, path)
            lfns[fId.getName()] = lfn

    try:
        sizeGb = convertSizeToGb(str(totalSize))
    except Exception:
        print('\n Error - could not convert size and number of files (%s %s / %s).'\
              %(totalSize,units,nFiles))
        sys.exit(1)

    if debug > 1:
        for lfn in lfns:
            lfns[lfn].show()

    print('\n DBS - %s --> %.1f %s (nFiles: %d)\n' % (dataset, sizeGb, units, nFiles))

    return (sizeGb, nFiles, lfns)
# --------------------------------------------------------------------------------------------------
# Read new values from the command line
for opt, arg in opts:
    if opt == "--help":
        print(usage)
        sys.exit(0)

# Deal with obvious problems: both directories are needed to build the commands below, an unset
# variable would otherwise only surface as a TypeError during the string concatenation
for envName in ('KRAKEN_AGENTS_WWW', 'KRAKEN_AGENTS_LOG'):
    if not os.getenv(envName):
        print("\n Kraken agent environment is not initialized (%s).\n" % (envName))
        sys.exit(1)

# --------------------------------------------------------------------------------------------------
# Here is where the real action starts -------------------------------------------------------------
# --------------------------------------------------------------------------------------------------
myRx = rex.Rex()

# make sure to touch the heartbeat file first (the output of date is redirected into it)
cmd = "date >& " + os.getenv('KRAKEN_AGENTS_LOG') + '/heartbeat'
(rc, out, err) = myRx.executeLocalAction(cmd)
if rc != 0:
    print('\n ==== ERROR -- DATE (%s) ====\n\n%s' % (cmd, err))
    print('\n ==== OUTPUT -- DATE (%s) ====\n\n%s' % (cmd, out))
else:
    print(' ==== DATE (%s) ====' % (cmd))

# issue full rsync on the log directory (target is the parent of the web directory)
cmd = "rsync -Cavz --delete " + os.getenv('KRAKEN_AGENTS_LOG') + ' ' \
    + os.getenv('KRAKEN_AGENTS_WWW') + '/../'
(rc, out, err) = myRx.executeLocalAction(cmd)
#=================================================================================================== # M A I N #=================================================================================================== # make sure command line is complete if len(sys.argv) < 2: print "\n ERROR -- " + usage sys.exit(1) # command line variables directory = sys.argv[1] print "\n INFO - checkDirectory.py %s" % (directory) cmd = "t2tools.py --action ls --source " + directory + " | grep root" # make sure we can work remotely/locally remoteX = rex.Rex('none', 'none') (rc, out, err) = remoteX.executeLocalAction(cmd) content = out.split("\n") # get Ids in the database (requestId, datasetId) = getRequestId(directory) nEvents = findAllFilesInDb(requestId) # get disk resident Ids path = directory fileIds = [] for line in content: f = line.split(" ") if len(f) > 1: file = f[1]
#!/usr/bin/env python import os import sys import rex import requests import json import time import pprint myRex = rex.Rex() base = os.environ.get('KRAKEN_SE_BASE', '/cms/store/user/paus') def findFileSizes(config, version, dataset): fileSizes = {} cmd = "t2tools.py --action ls --source %s/%s/%s/%s |grep root" % ( base, config, version, dataset) (rc, out, err) = myRex.executeLocalAction(cmd) for line in out.split('\n'): if len(line.split(' ')) > 1: size = (line.split(' ')[0]).split(':')[1] fileName = (line.split('/')[-1]).split('.')[0] fileSizes[fileName] = int(size) return fileSizes def getRequestId(cursor, config, version, dataset): # extract the unique request id this file is part of
def findDatasetProperties(dataset, dbsInst, debug=0):
    """Determine the size, number of files and file catalog of a dataset.

    A dataset name containing '=' is looked up in the 'Filesets' file below
    CATALOG_INPUT (the name is split at '=' into config, version and dataset,
    the first character of the config part is dropped). Any other dataset is
    requested from DBS with curl, using the proxy returned by getProxy().

    Parameters:
        dataset -- dataset name
        dbsInst -- DBS instance (not used by the code of this function)
        debug   -- verbosity level, details are printed above 1

    Returns the tuple (sizeGb, nFiles, lfns), where lfns maps the name of
    each file id to its fileIds.lfn object. sizeGb is the fixed value 10 for
    the catalog based lookup.

    The process is terminated with sys.exit(1) when the curl command fails
    or the total size cannot be converted.
    """
    if "=" in dataset:
        # find config, version and original dataset name
        f = dataset.split("=")
        conf = (f[0])[1:]
        vers = f[1]
        dset = f[2].replace("/", "+")

        sizeGb = 10                        # does not matter
        nFiles = 0
        lfns = {}

        cmd = 'cat %s/%s/%s/%s/Filesets' % (CATALOG_INPUT, conf, vers, dset)
        myRex = rex.Rex()
        (rc, out, err) = myRex.executeLocalAction(cmd)

        for line in out.split("\n"):
            f = line.split()               # whitespace separated fields
            if len(f) > 1:
                nFiles += 1
                name = f[0]
                path = re.sub(r'root://.*/(/store/.*)', r'\1', f[1])
                nEvents = int(f[2])
                fId = fileIds.fileId(name + ".root", nEvents)
                lfn = fileIds.lfn(fId, name, path)
                lfns[fId.getName()] = lfn
                if debug > 1:
                    print(" Adding: %s, %s" % (name, lfn))

        return (sizeGb, nFiles, lfns)

    # dealing with a standard dataset (its validity is not tested here)
    proxy = getProxy()
    url = 'curl -s --cert %s -k -H "Accept: application/json"'%proxy \
        + ' "https://cmsweb.cern.ch/dbs/prod/global/DBSReader/' \
        + 'files?dataset=%s&detail=true"'%(dataset)

    if debug > 1:
        print(' CURL: ' + url)

    myRex = rex.Rex()
    (rc, out, err) = myRex.executeLocalAction(url)

    if rc != 0:
        print(' ERROR ocurred in %s' % (url))
        sys.exit(1)

    data = json.loads(out)

    units = 'GB'
    nFiles = 0
    totalSize = 0
    lfns = {}
    for entry in data:
        valid = int(entry["is_file_valid"])
        fileName = entry["logical_file_name"]
        path = "/".join(fileName.split("/")[:-1])
        size = int(entry["file_size"])
        block = entry["block_name"].split("#")[1]
        nEvents = int(entry["event_count"])
        # only files flagged as valid are counted and cataloged
        if valid == 1:
            nFiles += 1
            totalSize += size
            fId = fileIds.fileId(fileName, nEvents)
            lfn = fileIds.lfn(fId, block, path)
            lfns[fId.getName()] = lfn

    # only genuine conversion errors are handled; interrupts and exits pass through
    try:
        sizeGb = convertSizeToGb(str(totalSize))
    except Exception:
        print('\n Error - could not convert size and number of files (%s %s / %s).'\
              %(totalSize,units,nFiles))
        sys.exit(1)

    if debug > 1:
        for lfn in lfns:
            lfns[lfn].show()

    print('\n DBS - %s --> %.1f %s (nFiles: %d)\n' % (dataset, sizeGb, units, nFiles))

    return (sizeGb, nFiles, lfns)