def findSE(self, se):
    return S_OK(se)

def setTransFlavour(self, flavour):
    self.flavour = flavour
    return S_OK()

def setExtraname(self, extraname):
    self.extraname = extraname
    return S_OK()

def setMetaValues(self, values):
    if isinstance(values, list):
        self.metaValues = values
    else:
        self.metaValues = [val for val in values.split(",")]
    return S_OK()

def setMetadata(self, metadata):
    for pair in metadata.split(','):
        splitPair = pair.strip().split(':')
        if len(splitPair) == 2:
            self.extraData[splitPair[0]] = splitPair[1].strip()
    return S_OK()

def release( self, lockName ):
  try:
    self.__locks[ lockName ].release()
  except ValueError:
    return S_ERROR( "No lock named %s" % lockName )
  return S_OK()

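# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original source): a ValueError on release()
# is the behaviour of threading.BoundedSemaphore, which suggests __locks maps
# names to bounded semaphores; a plain threading.Lock raises a different error
# when over-released. Minimal standalone illustration:
#
#   import threading
#   sem = threading.BoundedSemaphore(1)
#   sem.acquire()
#   sem.release()    # fine
#   sem.release()    # raises ValueError -> would be caught above
# ---------------------------------------------------------------------------
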
def export_getJobPageSummaryWeb( self, selectDict, sortList, startItem, maxItems, selectJobs = True ):
  """ Get the summary of the job information for a given page in the
      job monitor in a generic format
  """
  resultDict = {}
  startDate = selectDict.get( 'FromDate', None )
  if startDate:
    del selectDict['FromDate']
  # For backward compatibility
  if startDate is None:
    startDate = selectDict.get( 'LastUpdate', None )
    if startDate:
      del selectDict['LastUpdate']
  endDate = selectDict.get( 'ToDate', None )
  if endDate:
    del selectDict['ToDate']

  result = self.jobPolicy.getControlledUsers( RIGHT_GET_INFO )
  if not result['OK']:
    return S_ERROR( 'Failed to evaluate user rights' )
  if result['Value'] != 'ALL':
    selectDict[ ( 'Owner', 'OwnerGroup' ) ] = result['Value']

  # Sorting instructions. Only one for the moment.
  if sortList:
    orderAttribute = sortList[0][0] + ":" + sortList[0][1]
  else:
    orderAttribute = None

  statusDict = {}
  result = gJobDB.getCounters( 'Jobs', ['Status'], selectDict,
                               newer = startDate, older = endDate, timeStamp = 'LastUpdateTime' )
  nJobs = 0
  if result['OK']:
    for stDict, count in result['Value']:
      nJobs += count
      statusDict[stDict['Status']] = count

  resultDict['TotalRecords'] = nJobs
  if nJobs == 0:
    return S_OK( resultDict )

  resultDict['Extras'] = statusDict

  if selectJobs:
    iniJob = startItem
    if iniJob >= nJobs:
      return S_ERROR( 'Item number out of range' )

    result = gJobDB.selectJobs( selectDict, orderAttribute = orderAttribute,
                                newer = startDate, older = endDate, limit = ( maxItems, iniJob ) )
    if not result['OK']:
      return S_ERROR( 'Failed to select jobs: ' + result['Message'] )

    summaryJobList = result['Value']
    if not self.globalJobsInfo:
      validJobs, invalidJobs, nonauthJobs, ownJobs = self.jobPolicy.evaluateJobRights( summaryJobList,
                                                                                       RIGHT_GET_INFO )
      summaryJobList = validJobs

    result = gJobDB.getAttributesForJobList( summaryJobList, SUMMARY )
    if not result['OK']:
      return S_ERROR( 'Failed to get job summary: ' + result['Message'] )

    summaryDict = result['Value']

    # Evaluate last sign of life time
    for jobID, jobDict in summaryDict.items():
      if jobDict['HeartBeatTime'] == 'None':
        jobDict['LastSignOfLife'] = jobDict['LastUpdateTime']
      else:
        lastTime = Time.fromString( jobDict['LastUpdateTime'] )
        hbTime = Time.fromString( jobDict['HeartBeatTime'] )
        # ( lastTime - lastTime ) is a zero timedelta: the heartbeat wins if it is more recent
        if ( hbTime - lastTime ) > ( lastTime - lastTime ) or jobDict['Status'] == "Stalled":
          jobDict['LastSignOfLife'] = jobDict['HeartBeatTime']
        else:
          jobDict['LastSignOfLife'] = jobDict['LastUpdateTime']

    tqDict = {}
    result = gTaskQueueDB.getTaskQueueForJobs( summaryJobList )
    if result['OK']:
      tqDict = result['Value']

    # Prepare the standard structure now
    key = summaryDict.keys()[0]
    paramNames = summaryDict[key].keys()

    records = []
    for jobID, jobDict in summaryDict.items():
      jParList = []
      for pname in paramNames:
        jParList.append( jobDict[pname] )
      jParList.append( tqDict.get( jobID, 0 ) )
      records.append( jParList )

    resultDict['ParameterNames'] = paramNames + ['TaskQueueID']
    resultDict['Records'] = records

  return S_OK( resultDict )

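# ---------------------------------------------------------------------------
# Editor's sketch of the generic web structure returned above, assuming a
# hypothetical call asking for the first 25 Running jobs sorted by
# LastUpdateTime (names and values below are illustrative, not from the source):
#
#   result = export_getJobPageSummaryWeb({'Status': 'Running'},
#                                        [('LastUpdateTime', 'DESC')], 0, 25)
#   # result['Value'] looks like:
#   # {'TotalRecords': 1234,
#   #  'Extras': {'Running': 1234},                    # per-status counters
#   #  'ParameterNames': [..., 'TaskQueueID'],         # SUMMARY attributes + TQ
#   #  'Records': [[...], ...]}                        # one row per job
# ---------------------------------------------------------------------------
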
def __resolveInputData( self ):
  """This method controls the execution of the DIRAC input data modules according
     to the VO policy defined in the configuration service.
  """
  if self.arguments['Configuration'].has_key( 'SiteName' ):
    site = self.arguments['Configuration']['SiteName']
  else:
    site = DIRAC.siteName()

  policy = []
  if not self.arguments.has_key( 'Job' ):
    self.arguments['Job'] = {}

  if self.arguments['Job'].has_key( 'InputDataPolicy' ):
    policy = self.arguments['Job']['InputDataPolicy']
    # In principle this can be a list of modules with the first taking precedence
    if type( policy ) in types.StringTypes:
      policy = [policy]
    self.log.info( 'Job has a specific policy setting: %s' % ( ', '.join( policy ) ) )
  else:
    self.log.verbose( 'Attempting to resolve input data policy for site %s' % site )
    inputDataPolicy = gConfig.getOptionsDict( '/Operations/InputDataPolicy' )
    if not inputDataPolicy['OK']:
      return S_ERROR( 'Could not resolve InputDataPolicy from /Operations/InputDataPolicy' )

    options = inputDataPolicy['Value']
    if options.has_key( site ):
      policy = options[site]
      policy = [x.strip() for x in policy.split( ',' )]
      self.log.info( 'Found specific input data policy for site %s:\n%s' % ( site, '\n'.join( policy ) ) )
    elif options.has_key( 'Default' ):
      policy = options['Default']
      policy = [x.strip() for x in policy.split( ',' )]
      self.log.info( 'Applying default input data policy for site %s:\n%s' % ( site, '\n'.join( policy ) ) )

  dataToResolve = None  # if None, all supplied input data is resolved
  allDataResolved = False
  successful = {}
  failedReplicas = []
  for modulePath in policy:
    if not allDataResolved:
      result = self.__runModule( modulePath, dataToResolve )
      if not result['OK']:
        self.log.warn( 'Problem during %s execution' % modulePath )
        return result

      if result.has_key( 'Failed' ):
        failedReplicas = result['Failed']

      if failedReplicas:
        self.log.info( '%s failed for the following files:\n%s' % ( modulePath, '\n'.join( failedReplicas ) ) )
        dataToResolve = failedReplicas
      else:
        self.log.info( 'All replicas resolved after %s execution' % ( modulePath ) )
        allDataResolved = True

      successful.update( result['Successful'] )
      self.log.verbose( successful )

  result = S_OK()
  result['Successful'] = successful
  result['Failed'] = failedReplicas
  return result

def downloadSandbox( self, sbLocation, destinationDir = "", inMemory = False, unpack = True ):
  """ Download a sandbox file and keep it in bundled form """
  if sbLocation.find( "SB:" ) != 0:
    return S_ERROR( "Invalid sandbox URL" )
  sbLocation = sbLocation[ 3: ]
  sbSplit = sbLocation.split( "|" )
  if len( sbSplit ) < 2:
    return S_ERROR( "Invalid sandbox URL" )
  SEName = sbSplit[0]
  SEPFN = "|".join( sbSplit[1:] )
  # If destination dir is not specified use current working dir
  # If it's defined ensure the dir structure is there
  if not destinationDir:
    destinationDir = os.getcwd()
  else:
    mkDir( destinationDir )

  try:
    tmpSBDir = tempfile.mkdtemp( prefix = "TMSB." )
  except Exception as e:
    return S_ERROR( "Cannot create temporary file: %s" % str( e ) )

  se = StorageElement( SEName, vo = self.__vo )
  result = returnSingleResult( se.getFile( SEPFN, localPath = tmpSBDir ) )
  if not result[ 'OK' ]:
    return result
  sbFileName = os.path.basename( SEPFN )

  result = S_OK()
  tarFileName = os.path.join( tmpSBDir, sbFileName )

  if inMemory:
    try:
      tfile = open( tarFileName, 'r' )
      data = tfile.read()
      tfile.close()
      os.unlink( tarFileName )
      os.rmdir( tmpSBDir )
    except Exception as e:
      os.unlink( tarFileName )
      os.rmdir( tmpSBDir )
      return S_ERROR( 'Failed to read the sandbox archive: %s' % str( e ) )
    return S_OK( data )

  if not unpack:
    result[ 'Value' ] = tarFileName
    return result

  try:
    sandboxSize = 0
    tf = tarfile.open( name = tarFileName, mode = "r" )
    for tarinfo in tf:
      tf.extract( tarinfo, path = destinationDir )
      sandboxSize += tarinfo.size
    tf.close()
    result[ 'Value' ] = sandboxSize
  except Exception as e:
    result = S_ERROR( "Could not open bundle: %s" % str( e ) )

  try:
    os.unlink( tarFileName )
    os.rmdir( tmpSBDir )
  except Exception as e:
    gLogger.warn( "Could not remove temporary dir %s: %s" % ( tmpSBDir, str( e ) ) )

  return result

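# ---------------------------------------------------------------------------
# Editor's sketch: how an "SB:" sandbox location decomposes into SE name and
# PFN. The PFN may itself contain '|' characters, hence the re-join above.
# Standalone and runnable; the helper name and sample URL are made up:
def _splitSandboxURL(sbLocation):
    if not sbLocation.startswith("SB:"):
        raise ValueError("Invalid sandbox URL")
    seName, sep, sePFN = sbLocation[3:].partition("|")
    if not sep:
        raise ValueError("Invalid sandbox URL")
    return seName, sePFN

# _splitSandboxURL("SB:ProductionSandboxSE|/SandBox/a/ab_123/sandbox.tar.bz2")
# -> ('ProductionSandboxSE', '/SandBox/a/ab_123/sandbox.tar.bz2')
# ---------------------------------------------------------------------------
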
 'AlternativeBDIIs', 'VO']}),
('DIRAC.ConfigurationSystem.Agent.GOCDB2CSAgent', {'IgnoreOptions': ['Cycles', 'DryRun', 'UpdatePerfSONARS']}),
('DIRAC.ConfigurationSystem.Agent.VOMS2CSAgent', {'IgnoreOptions': ['VO']}),
('DIRAC.DataManagementSystem.Agent.FTS3Agent', {}),
('DIRAC.FrameworkSystem.Agent.CAUpdateAgent', {}),
('DIRAC.FrameworkSystem.Agent.MyProxyRenewalAgent',
 {'IgnoreOptions': ['MinValidity', 'ValidityPeriod', 'MinimumLifeTime', 'RenewedLifeTime']}),
('DIRAC.FrameworkSystem.Agent.ErrorMessageMonitor', {}),
('DIRAC.FrameworkSystem.Agent.SystemLoggingDBCleaner', {'IgnoreOptions': ['RemoveDate']}),
('DIRAC.FrameworkSystem.Agent.TopErrorMessagesReporter', {}),
('DIRAC.RequestManagementSystem.Agent.CleanReqDBAgent', {}),
('DIRAC.RequestManagementSystem.Agent.RequestExecutingAgent',
 {'IgnoreOptions': ['FTSMode', 'OperationHandlers'], 'SpecialMocks': {'gConfig': S_OK([])}}),
('DIRAC.ResourceStatusSystem.Agent.CacheFeederAgent', {}),
('DIRAC.ResourceStatusSystem.Agent.ElementInspectorAgent', {}),
('DIRAC.ResourceStatusSystem.Agent.EmailAgent', {}),
('DIRAC.ResourceStatusSystem.Agent.SiteInspectorAgent', {}),
('DIRAC.ResourceStatusSystem.Agent.SummarizeLogsAgent', {}),
('DIRAC.ResourceStatusSystem.Agent.TokenAgent', {}),
('DIRAC.StorageManagementSystem.Agent.RequestFinalizationAgent', {}),
('DIRAC.StorageManagementSystem.Agent.RequestPreparationAgent', {}),
('DIRAC.StorageManagementSystem.Agent.StageMonitorAgent', {}),
('DIRAC.StorageManagementSystem.Agent.StageRequestAgent', {'IgnoreOptions': ['PinLifetime']}),
('DIRAC.TransformationSystem.Agent.DataRecoveryAgent', {}),
('DIRAC.TransformationSystem.Agent.InputDataAgent', {'IgnoreOptions': ['DateKey', 'TransformationTypes']}),
('DIRAC.TransformationSystem.Agent.MCExtensionAgent',
 {'IgnoreOptions': ['TransformationTypes', 'TasksPerIteration', 'MaxFailureRate',
def getClockDeviation(serverList=None):
    result = getNTPUTCTime(serverList)
    if not result['OK']:
        return result
    td = datetime.datetime.utcnow() - result['Value']
    return S_OK(abs(td.days * 86400 + td.seconds))

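# ---------------------------------------------------------------------------
# Editor's sketch of the timedelta folding above: days and seconds are combined
# into whole seconds, and microseconds are deliberately dropped.
#
#   import datetime
#   td = datetime.timedelta(days=1, seconds=5, microseconds=999999)
#   print(abs(td.days * 86400 + td.seconds))   # 86405
# ---------------------------------------------------------------------------
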
def removeJobsByStatus(self, condDict, delay=False):
    """ Remove deleted jobs """
    if delay:
        gLogger.verbose("Removing jobs with %s and older than %s day(s)" % (condDict, delay))
        result = self.jobDB.selectJobs(condDict, older=delay, limit=self.maxJobsAtOnce)
    else:
        gLogger.verbose("Removing jobs with %s " % condDict)
        result = self.jobDB.selectJobs(condDict, limit=self.maxJobsAtOnce)
    if not result['OK']:
        return result

    jobList = result['Value']
    if len(jobList) > self.maxJobsAtOnce:
        jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
        return S_OK()

    self.log.notice("Deleting %s jobs for %s" % (len(jobList), condDict))

    count = 0
    error_count = 0
    result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList)
    if not result['OK']:
        gLogger.error("Cannot unassign jobs to sandboxes", result['Message'])
        return result

    result = self.deleteJobOversizedSandbox(jobList)
    if not result['OK']:
        gLogger.error("Cannot schedule removal of oversized sandboxes", result['Message'])
        return result

    failedJobs = result['Value']['Failed']
    for job in failedJobs:
        jobList.pop(jobList.index(job))

    # TODO: we should not remove a job if it still has requests in the RequestManager.
    # But this logic should go in the client or in the service, and right now
    # no service exposes jobDB.removeJobFromDB
    if self.jobByJob:
        for jobID in jobList:
            resultJobDB = self.jobDB.removeJobFromDB(jobID)
            resultTQ = self.taskQueueDB.deleteJob(jobID)
            resultLogDB = self.jobLoggingDB.deleteJob(jobID)
            errorFlag = False
            if not resultJobDB['OK']:
                gLogger.warn('Failed to remove job %d from JobDB' % jobID, resultJobDB['Message'])
                errorFlag = True
            if not resultTQ['OK']:
                gLogger.warn('Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'])
                errorFlag = True
            if not resultLogDB['OK']:
                gLogger.warn('Failed to remove job %d from JobLoggingDB' % jobID, resultLogDB['Message'])
                errorFlag = True
            if errorFlag:
                error_count += 1
            else:
                count += 1
            if self.throttlingPeriod:
                time.sleep(self.throttlingPeriod)
    else:
        result = self.jobDB.removeJobFromDB(jobList)
        if not result['OK']:
            gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList))
        else:
            gLogger.info('Deleted %d jobs from JobDB' % len(jobList))

        for jobID in jobList:
            resultTQ = self.taskQueueDB.deleteJob(jobID)
            if not resultTQ['OK']:
                gLogger.warn('Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'])
                error_count += 1
            else:
                count += 1

        result = self.jobLoggingDB.deleteJob(jobList)
        if not result['OK']:
            gLogger.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList))
        else:
            gLogger.info('Deleted %d jobs from JobLoggingDB' % len(jobList))

    if count > 0 or error_count > 0:
        gLogger.info('Deleted %d jobs from JobDB, %d errors' % (count, error_count))
    return S_OK()

def submitJob(self, executableFile, proxy=None, numberOfJobs=1):
    """ Method to submit job
    """
    self.createClient()
    # Check if the client is ready
    if not self.BOINCClient:
        return S_ERROR('Soap client is not ready')

    self.log.verbose("Executable file path: %s" % executableFile)

    # If no proxy is supplied, the executable can be submitted directly,
    # otherwise a wrapper script is needed to get the proxy to the execution node.
    # The wrapper script makes debugging more complicated and thus it is
    # recommended to transfer a proxy inside the executable if possible.
    wrapperContent = ''
    if proxy:
        self.log.verbose('Setting up proxy for payload')

        compressedAndEncodedProxy = base64.encodestring(
            bz2.compress(proxy.dumpAllToString()['Value'])).replace('\n', '')
        compressedAndEncodedExecutable = base64.encodestring(
            bz2.compress(open(executableFile, "rb").read(), 9)).replace('\n', '')

        wrapperContent = """#!/bin/bash
/usr/bin/env python << EOF
# Wrapper script for executable and proxy
import os
import tempfile
import sys
import base64
import bz2
import shutil
import stat
try:
  workingDirectory = tempfile.mkdtemp( suffix = '_wrapper', prefix= 'TORQUE_' )
  os.chdir( workingDirectory )
  open( 'proxy', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedProxy)s" ) ) )
  open( '%(executable)s', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedExecutable)s" ) ) )
  os.chmod('proxy',stat.S_IRUSR | stat.S_IWUSR)
  os.chmod('%(executable)s',stat.S_IRWXU)
  os.environ["X509_USER_PROXY"]=os.path.join(workingDirectory, 'proxy')
except Exception as x:
  print >> sys.stderr, x
  sys.exit(-1)
cmd = "./%(executable)s"
print 'Executing: ', cmd
sys.stdout.flush()
os.system( cmd )
shutil.rmtree( workingDirectory )
EOF
""" % {'compressedAndEncodedProxy': compressedAndEncodedProxy,
       'compressedAndEncodedExecutable': compressedAndEncodedExecutable,
       'executable': os.path.basename(executableFile)}

        fd, name = tempfile.mkstemp(suffix='_pilotwrapper.py', prefix='DIRAC_', dir=os.getcwd())
        os.close(fd)
        submitFile = name
    else:  # no proxy
        submitFile = executableFile
        wrapperContent = self._fromFileToStr(submitFile)

    if not wrapperContent:
        self.log.error('Executable file is empty.')
        return S_ERROR('Executable file is empty.')

    # Some special symbols cannot be transported by xml,
    # such as less, greater, amp. So base64 is used here.
    wrapperContent = base64.encodestring(wrapperContent).replace("\n", '')

    prefix = os.path.splitext(os.path.basename(submitFile))[0].replace('_pilotwrapper', '').replace('DIRAC_', '')

    batchIDList = []
    stampDict = {}
    for i in range(0, numberOfJobs):
        jobID = "%s_%d@%s" % (prefix, i, self.suffix)
        try:
            # print jobID + "\n" + wrapperContent
            # print self.BOINCClient
            result = self.BOINCClient.service.submitJob(jobID, wrapperContent,
                                                        self.ceParameters['Platform'][0],
                                                        self.ceParameters['MarketPlaceID'])
        except Exception:
            self.log.error('Could not submit the pilot to the BOINC CE',
                           'Pilot %s, BOINC CE %s' % (jobID, self.wsdl))
            break

        if not result['ok']:
            self.log.warn("Didn't submit the pilot %s to the BOINC CE %s, the value returned is false!"
                          % (jobID, self.wsdl))
            break

        self.log.verbose('Submitted the pilot %s to the BOINC CE %s' % (jobID, self.wsdl))
        diracStamp = "%s_%d" % (prefix, i)
        batchIDList.append(jobID)
        stampDict[jobID] = diracStamp

    if batchIDList:
        resultRe = S_OK(batchIDList)
        resultRe['PilotStampDict'] = stampDict
    else:
        resultRe = S_ERROR('Submitted no pilots to BOINC CE %s' % self.wsdl)
    return resultRe

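# ---------------------------------------------------------------------------
# Editor's sketch of the compress-and-encode scheme used above to pass the
# proxy and executable through XML: bz2 then base64, newlines stripped, and
# the symmetric decode on the worker node. Runnable under Python 2, where
# base64.encodestring/decodestring operate on str:
#
#   import base64, bz2
#   payload = "example proxy or executable bytes"
#   encoded = base64.encodestring(bz2.compress(payload)).replace("\n", "")
#   assert bz2.decompress(base64.decodestring(encoded)) == payload
# ---------------------------------------------------------------------------
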
def addSE(self, se):
    return S_OK(se)

def setVerbose( optVal ):
  global verbose
  verbose = True
  return S_OK()

def getTimeLeft(self, cpuConsumed=0.0, processors=1):
    """ Returns the CPU Time Left for supported batch systems.
        The CPUConsumed is the current raw total CPU.
    """
    # Quit if no scale factor available
    if not self.scaleFactor:
        return S_ERROR('/LocalSite/CPUScalingFactor not defined for site %s' % DIRAC.siteName())

    if not self.batchPlugin:
        return S_ERROR(self.batchError)

    resourceDict = self.batchPlugin.getResourceUsage()
    if not resourceDict['OK']:
        self.log.warn('Could not determine timeleft for batch system at site %s' % DIRAC.siteName())
        return resourceDict

    resources = resourceDict['Value']
    self.log.debug("self.batchPlugin.getResourceUsage(): %s" % str(resources))
    if not resources.get('CPULimit') and not resources.get('WallClockLimit'):
        # This should never happen
        return S_ERROR('No CPU or WallClock limit obtained')

    # if one of CPULimit or WallClockLimit is missing, compute a reasonable value
    if not resources.get('CPULimit'):
        resources['CPULimit'] = resources['WallClockLimit'] * processors
    elif not resources.get('WallClockLimit'):
        resources['WallClockLimit'] = resources['CPULimit'] / processors

    # if one of CPU or WallClock is missing, compute a reasonable value
    if not resources.get('CPU'):
        resources['CPU'] = resources['WallClock'] * processors
    elif not resources.get('WallClock'):
        resources['WallClock'] = resources['CPU'] / processors

    cpu = float(resources['CPU'])
    cpuLimit = float(resources['CPULimit'])
    wallClock = float(resources['WallClock'])
    wallClockLimit = float(resources['WallClockLimit'])
    batchSystemTimeUnit = resources.get('Unit', 'Both')

    # Some batch systems rely on wall clock time and/or cpu time to make allocations
    if batchSystemTimeUnit == 'WallClock':
        time = wallClock
        timeLimit = wallClockLimit
    else:
        time = cpu
        timeLimit = cpuLimit

    if time and cpuConsumed > 3600. and self.normFactor:
        # If there has been more than 1 hour of consumed CPU and
        # there is a Normalization set for the current CPU
        # use that value to renormalize the values returned by the batch system
        # NOTE: cpuConsumed is non-zero for call by the JobAgent and 0 for call by the watchdog
        # cpuLimit and cpu may be in the units of the batch system, not real seconds...
        # (in this case the other case won't work)
        # therefore renormalise it using cpuConsumed (which is in real seconds)
        cpuWorkLeft = (timeLimit - time) * self.normFactor * cpuConsumed / time
    elif self.normFactor:
        # FIXME: this is always used by the watchdog... Also used by the JobAgent
        # if consumed less than 1 hour of CPU
        # It was using self.scaleFactor but this is inconsistent: use the same as above
        # In case the returned cpu and cpuLimit are not in real seconds, this is however rubbish
        cpuWorkLeft = (timeLimit - time) * self.normFactor
    else:
        # Last resort recovery...
        cpuWorkLeft = (timeLimit - time) * self.scaleFactor

    self.log.verbose('Remaining CPU in normalized units is: %.02f' % cpuWorkLeft)

    return S_OK(cpuWorkLeft)

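# ---------------------------------------------------------------------------
# Editor's worked example of the renormalisation branch above, with made-up
# numbers: the batch system reports time/timeLimit in its own units, so the
# ratio cpuConsumed/time (real seconds per batch unit) rescales the remainder.
#
#   timeLimit, timeUsed = 1000., 400.      # batch-system units
#   normFactor, cpuConsumed = 10., 7200.   # cpuConsumed in real seconds
#   cpuWorkLeft = (timeLimit - timeUsed) * normFactor * cpuConsumed / timeUsed
#   # -> 600 * 10 * 18 = 108000 normalized units left
# ---------------------------------------------------------------------------
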
def setTaskQueueID( optVal ):
  global taskQueueID
  taskQueueID = long( optVal )
  return S_OK()

instance.update()
while instance.state != u'running':
    if instance.state == u'terminated':
        # a terminated instance never reaches 'running': give up on it
        self.log.error("New instance terminated while starting",
                       "AMI: %s" % self.__vmAMI)
        break
    self.log.info("Sleeping for 10 secs for instance %s (current state %s)"
                  % (instance, instance.state))
    time.sleep(10)
    instance.update()
if instance.state != u'terminated':
    self.log.info("Instance %s started" % instance.id)
    idList.append(instance.id)
return S_OK(idList)

def __startSpotInstances(self, numImages, instanceType, waitForConfirmation):
    self.log.info("Starting %d new spot instances for AMI %s (type %s)"
                  % (numImages, self.__vmAMI, instanceType))
    try:
        spotInstanceRequests = self.__conn.request_spot_instances(price="%f" % self.__vmMaxAllowedPrice,
                                                                  image_id=self.__vmAMI,
                                                                  count=numImages,
                                                                  instance_type=instanceType)
        self.log.verbose("Got %d spot instance requests" % len(spotInstanceRequests))
    except Exception as e:
        return S_ERROR("Could not start spot instances: %s" % str(e))

def export_getJobsStatus( jobIDs ):
  if not jobIDs:
    return S_OK( {} )
  return gJobDB.getAttributesForJobList( jobIDs, ['Status'] )

def export_updateSoftware(self, version, rootPath="", gridVersion=""):
    """ Update the local DIRAC software installation to version
    """
    # Check that we have a sane local configuration
    result = gConfig.getOptionsDict('/LocalInstallation')
    if not result['OK']:
        return S_ERROR('Invalid installation - missing /LocalInstallation section in the configuration')
    elif not result['Value']:
        return S_ERROR('Invalid installation - empty /LocalInstallation section in the configuration')

    if rootPath and not os.path.exists(rootPath):
        return S_ERROR('Path "%s" does not exist' % rootPath)

    cmdList = ['dirac-install', '-r', version, '-t', 'server']
    if rootPath:
        cmdList.extend(['-P', rootPath])

    # Check if there are extensions
    extensionList = getCSExtensions()
    if extensionList:
        # by default we do not install WebApp
        if "WebApp" in extensionList:
            extensionList.remove("WebApp")

        webPortal = gConfig.getValue('/LocalInstallation/WebApp', False)  # this is the new portal
        if webPortal:
            if "WebAppDIRAC" not in extensionList:
                extensionList.append("WebAppDIRAC")

        cmdList += ['-e', ','.join(extensionList)]

    project = gConfig.getValue('/LocalInstallation/Project')
    if project:
        cmdList += ['-l', project]

    # Are grid middleware bindings required ?
    if gridVersion:
        cmdList.extend(['-g', gridVersion])

    targetPath = gConfig.getValue('/LocalInstallation/TargetPath',
                                  gConfig.getValue('/LocalInstallation/RootPath', ''))
    if targetPath and os.path.exists(targetPath + '/etc/dirac.cfg'):
        cmdList.append(targetPath + '/etc/dirac.cfg')
    else:
        return S_ERROR('Local configuration not found')

    result = systemCall(240, cmdList)
    if not result['OK']:
        return result

    status = result['Value'][0]
    if status != 0:
        # Get error messages
        error = []
        output = result['Value'][1].split('\n')
        for line in output:
            line = line.strip()
            if 'error' in line.lower():
                error.append(line)
        if error:
            message = '\n'.join(error)
        else:
            message = "Failed to update software to %s" % version
        return S_ERROR(message)

    return S_OK()

def prepareNewReplicas(self):
    """ This is the first logical task to be executed and manages
        the New->Waiting transition of the Replicas
    """
    res = self.__getNewReplicas()
    if not res['OK']:
        gLogger.fatal("RequestPreparation.prepareNewReplicas: Failed to get replicas from StagerDB.",
                      res['Message'])
        return res
    if not res['Value']:
        gLogger.info("There were no New replicas found")
        return res
    replicas = res['Value']['Replicas']
    replicaIDs = res['Value']['ReplicaIDs']
    gLogger.info("RequestPreparation.prepareNewReplicas: Obtained %s New replicas for preparation."
                 % len(replicaIDs))

    # Check that the files exist in the FileCatalog
    res = self.__getExistingFiles(replicas.keys())
    if not res['OK']:
        return res
    exist = res['Value']['Exist']
    terminal = res['Value']['Missing']
    failed = res['Value']['Failed']
    if not exist:
        gLogger.error('RequestPreparation.prepareNewReplicas: Failed to determine the existence of any files')
        return S_OK()
    terminalReplicaIDs = {}
    for lfn, reason in terminal.items():
        for se, replicaID in replicas[lfn].items():
            terminalReplicaIDs[replicaID] = reason
        replicas.pop(lfn)
    gLogger.info("RequestPreparation.prepareNewReplicas: %s files exist in the FileCatalog."
                 % len(exist))
    if terminal:
        gLogger.info("RequestPreparation.prepareNewReplicas: %s files do not exist in the FileCatalog."
                     % len(terminal))

    # Obtain the file sizes from the FileCatalog
    res = self.__getFileSize(exist)
    if not res['OK']:
        return res
    failed.update(res['Value']['Failed'])
    terminal = res['Value']['ZeroSize']
    fileSizes = res['Value']['FileSizes']
    if not fileSizes:
        gLogger.error('RequestPreparation.prepareNewReplicas: Failed to determine the sizes of any files')
        return S_OK()
    for lfn, reason in terminal.items():
        for se, replicaID in replicas[lfn].items():
            terminalReplicaIDs[replicaID] = reason
        replicas.pop(lfn)
    gLogger.info("RequestPreparation.prepareNewReplicas: Obtained %s file sizes from the FileCatalog."
                 % len(fileSizes))
    if terminal:
        gLogger.info("RequestPreparation.prepareNewReplicas: %s files registered with zero size in the FileCatalog."
                     % len(terminal))

    # Obtain the replicas from the FileCatalog
    res = self.__getFileReplicas(fileSizes.keys())
    if not res['OK']:
        return res
    failed.update(res['Value']['Failed'])
    terminal = res['Value']['ZeroReplicas']
    fileReplicas = res['Value']['Replicas']
    if not fileReplicas:
        gLogger.error('RequestPreparation.prepareNewReplicas: Failed to determine replicas for any files')
        return S_OK()
    for lfn, reason in terminal.items():
        for se, replicaID in replicas[lfn].items():
            terminalReplicaIDs[replicaID] = reason
        replicas.pop(lfn)
    gLogger.info("RequestPreparation.prepareNewReplicas: Obtained replica information for %s file(s) from the FileCatalog."
                 % len(fileReplicas))
    if terminal:
        gLogger.info("RequestPreparation.prepareNewReplicas: %s files registered with zero replicas in the FileCatalog."
                     % len(terminal))

    # Check the replicas exist at the requested site
    replicaMetadata = []
    for lfn, requestedSEs in replicas.items():
        lfnReplicas = fileReplicas[lfn]
        for requestedSE, replicaID in requestedSEs.items():
            if requestedSE not in lfnReplicas.keys():
                terminalReplicaIDs[replicaID] = "LFN not registered at requested SE"
                replicas[lfn].pop(requestedSE)
            else:
                replicaMetadata.append((replicaID, lfnReplicas[requestedSE], fileSizes[lfn]))

    # Update the states of the files in the database
    if terminalReplicaIDs:
        gLogger.info("RequestPreparation.prepareNewReplicas: %s replicas are terminally failed."
                     % len(terminalReplicaIDs))
        # res = self.stagerClient.updateReplicaFailure( terminalReplicaIDs )
        res = self.storageDB.updateReplicaFailure(terminalReplicaIDs)
        if not res['OK']:
            gLogger.error("RequestPreparation.prepareNewReplicas: Failed to update replica failures.",
                          res['Message'])
    if replicaMetadata:
        gLogger.info("RequestPreparation.prepareNewReplicas: %s replica metadata to be updated."
                     % len(replicaMetadata))
        # Sets the Status='Waiting' of CacheReplicas records that are OK with catalogue checks
        res = self.storageDB.updateReplicaInformation(replicaMetadata)
        if not res['OK']:
            gLogger.error("RequestPreparation.prepareNewReplicas: Failed to update replica metadata.",
                          res['Message'])
    return S_OK()

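# ---------------------------------------------------------------------------
# Editor's sketch of the data shapes assumed by prepareNewReplicas above
# (inferred from how the structures are indexed; values are illustrative):
#
#   replicas     = {'/lfn/a': {'SE-1': 101, 'SE-2': 102}}   # lfn -> {SE: replicaID}
#   fileSizes    = {'/lfn/a': 2048}                         # lfn -> bytes
#   fileReplicas = {'/lfn/a': {'SE-1': 'pfn-or-metadata'}}  # catalog replicas
#
# Each narrowing step (existence, size, replicas, requested-SE check) either
# feeds (replicaID, metadata, size) tuples into replicaMetadata or records the
# replicaID in terminalReplicaIDs with a failure reason.
# ---------------------------------------------------------------------------
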
def export_getProject(self):
    result = loadDIRACCFG()
    if not result['OK']:
        return result
    _cfgPath, diracCFG = result['Value']
    return S_OK(diracCFG.getOption("/LocalInstallation/Project", "DIRAC"))

def setMetaKey(self, key):
    self.metaKey = key
    return S_OK()

def export_checkComponentLog(self, component):
    """ Check component log for errors
    """
    componentList = []
    if '*' in component:
        if component == '*':
            result = gComponentInstaller.getSetupComponents()
            if result['OK']:
                for ctype in ['Services', 'Agents', 'Executors']:
                    if ctype in result['Value']:
                        for sname in result['Value'][ctype]:
                            for cname in result['Value'][ctype][sname]:
                                componentList.append('/'.join([sname, cname]))
    elif isinstance(component, basestring):
        componentList = [component]
    else:
        componentList = component

    resultDict = {}
    for comp in componentList:
        if '/' not in comp:
            continue
        system, cname = comp.split('/')

        startDir = gComponentInstaller.startDir
        currentLog = startDir + '/' + system + '_' + cname + '/log/current'
        try:
            logFile = open(currentLog, 'r')
        except IOError as err:
            gLogger.error("File does not exist:", currentLog)
            resultDict[comp] = {'ErrorsHour': -1, 'ErrorsDay': -1,
                                'LastError': currentLog + '::' + repr(err)}
            continue
        logLines = logFile.readlines()
        logFile.close()

        errors_1 = 0
        errors_24 = 0
        now = dateTime()
        lastError = ''
        for line in logLines:
            if "ERROR:" in line:
                fields = line.split()
                recent = False
                if len(fields) < 2:  # if the line contains only one word
                    lastError = line.split('ERROR:')[-1].strip()
                    continue
                timeStamp = fromString(fields[0] + ' ' + fields[1])
                if not timeStamp:  # if the timestamp is missing in the log
                    lastError = line.split('ERROR:')[-1].strip()
                    continue
                if (now - timeStamp) < hour:
                    errors_1 += 1
                    recent = True
                if (now - timeStamp) < day:
                    errors_24 += 1
                    recent = True
                if recent:
                    lastError = line.split('ERROR:')[-1].strip()

        resultDict[comp] = {'ErrorsHour': errors_1, 'ErrorsDay': errors_24, 'LastError': lastError}

    return S_OK(resultDict)

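# ---------------------------------------------------------------------------
# Editor's sketch of the sliding-window counting above using plain datetime
# objects (DIRAC's dateTime()/hour/day utilities behave analogously):
#
#   import datetime
#   now = datetime.datetime.utcnow()
#   stamp = now - datetime.timedelta(minutes=30)
#   (now - stamp) < datetime.timedelta(hours=1)   # True -> counts in ErrorsHour
#   (now - stamp) < datetime.timedelta(days=1)    # True -> counts in ErrorsDay
# ---------------------------------------------------------------------------
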
def setSourceSE(self, sourceSE):
    self.sourceSE = [sSE.strip() for sSE in sourceSE.split(",")]
    return S_OK()

def __readHostInfo():
    """ Get host current loads, memory, etc
    """
    result = dict()
    # Memory info
    re_parser = re.compile(r'^(?P<key>\S*):\s*(?P<value>\d*)\s*kB')
    for line in open('/proc/meminfo'):
        match = re_parser.match(line)
        if not match:
            continue
        key, value = match.groups(['key', 'value'])
        result[key] = int(value)

    for mtype in ['Mem', 'Swap']:
        memory = int(result.get(mtype + 'Total'))
        mfree = int(result.get(mtype + 'Free'))
        if memory > 0:
            percentage = float(memory - mfree) / float(memory) * 100.
        else:
            percentage = 0
        name = 'Memory'
        if mtype == "Swap":
            name = 'Swap'
        result[name] = '%.1f%%/%.1fMB' % (percentage, memory / 1024.)

    # Loads
    l1, l5, l15 = (str(lx) for lx in os.getloadavg())
    result['Load1'] = l1
    result['Load5'] = l5
    result['Load15'] = l15
    result['Load'] = '/'.join([l1, l5, l15])

    # CPU info
    with open('/proc/cpuinfo', 'r') as fd:
        lines = fd.readlines()
    processors = 0
    physCores = {}
    for line in lines:
        if line.strip():
            parameter, value = line.split(':')
            parameter = parameter.strip()
            value = value.strip()
            if parameter.startswith('processor'):
                processors += 1
            if parameter.startswith('physical id'):
                physCores[value] = parameter
            if parameter.startswith('model name'):
                result['CPUModel'] = value
            if parameter.startswith('cpu MHz'):
                result['CPUClock'] = value
    result['Cores'] = processors
    result['PhysicalCores'] = len(physCores)

    # Disk occupancy
    summary = ''
    _status, output = commands.getstatusoutput('df')
    lines = output.split('\n')
    for i in xrange(len(lines)):
        if lines[i].startswith('/dev'):
            fields = lines[i].split()
            if len(fields) == 1:
                fields += lines[i + 1].split()
            _disk = fields[0].replace('/dev/sd', '')
            partition = fields[5]
            occupancy = fields[4]
            summary += ",%s:%s" % (partition, occupancy)
    result['DiskOccupancy'] = summary[1:]
    result['RootDiskSpace'] = Os.getDiskSpace(rootPath)

    # Open files
    puser = getpass.getuser()
    _status, output = commands.getstatusoutput('lsof')
    pipes = 0
    files = 0
    sockets = 0
    lines = output.split('\n')
    for line in lines:
        fType = line.split()[4]
        user = line.split()[2]
        if user == puser:
            if fType in ['REG']:
                files += 1
            elif fType in ['unix', 'IPv4']:
                sockets += 1
            elif fType in ['FIFO']:
                pipes += 1
    result['OpenSockets'] = sockets
    result['OpenFiles'] = files
    result['OpenPipes'] = pipes

    infoResult = gComponentInstaller.getInfo()
    if infoResult['OK']:
        result.update(infoResult['Value'])
        # the infoResult value is {"Extensions": {'a1': 'v1', 'a2': 'v2'}}; we convert it to a string
        result.update({"Extensions": ";".join(["%s:%s" % (key, value)
                                               for (key, value) in infoResult["Value"].get('Extensions').iteritems()])})

    # Host certificate properties
    certFile, _keyFile = getHostCertificateAndKeyLocation()
    chain = X509Chain()
    chain.loadChainFromFile(certFile)
    resultCert = chain.getCredentials()
    if resultCert['OK']:
        result['SecondsLeft'] = resultCert['Value']['secondsLeft']
        result['CertificateValidity'] = str(timedelta(seconds=resultCert['Value']['secondsLeft']))
        result['CertificateDN'] = resultCert['Value']['subject']
        result['HostProperties'] = resultCert['Value']['groupProperties']
        result['CertificateIssuer'] = resultCert['Value']['issuer']

    # Host uptime
    result['Uptime'] = str(timedelta(seconds=(time.time() - psutil.boot_time())))

    return S_OK(result)

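# ---------------------------------------------------------------------------
# Editor's sketch of the /proc/meminfo parsing above: the named groups pull the
# key and the kB value out of each line. Standalone and runnable:
#
#   import re
#   re_parser = re.compile(r'^(?P<key>\S*):\s*(?P<value>\d*)\s*kB')
#   match = re_parser.match('MemTotal:       16337628 kB')
#   match.groups(['key', 'value'])   # -> ('MemTotal', '16337628')
# ---------------------------------------------------------------------------
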
def setTargetSE(self, targetSE):
    self.targetSE = [tSE.strip() for tSE in targetSE.split(",")]
    return S_OK()

def initializeHandler(cls, serviceInfoDict):
    """ Initialization of DB object """
    cls.dataIntegrityDB = DataIntegrityDB(parentLogger=cls.log)
    return S_OK()

def setGroupSize(self, size):
    try:
        self.groupSize = int(size)
    except ValueError:
        return S_ERROR("Expected integer for groupsize")
    return S_OK()

def getSEDefinition(self, seID):
    """ Get the Storage Element definition
    """
    if isinstance(seID, str):
        result = self.getSEID(seID)
        if not result['OK']:
            return result
        seID = result['Value']

    if seID in self.db.seDefinitions:
        if (time.time() - self.db.seDefinitions[seID]['LastUpdate']) < self.seUpdatePeriod:
            if self.db.seDefinitions[seID]['SEDict']:
                return S_OK(self.db.seDefinitions[seID])
        se = self.db.seDefinitions[seID]['SEName']
    else:
        result = self.getSEName(seID)
        if not result['OK']:
            return result
        se = result['Value']
        self.db.seDefinitions[seID] = {}
        self.db.seDefinitions[seID]['SEName'] = se
        self.db.seDefinitions[seID]['SEDict'] = {}
        self.db.seDefinitions[seID]['LastUpdate'] = 0.

    # We have to refresh the SE definition from the CS
    result = gConfig.getSections('/Resources/StorageElements/%s' % se)
    if not result['OK']:
        return result
    pluginSection = result['Value'][0]
    result = gConfig.getOptionsDict('/Resources/StorageElements/%s/%s' % (se, pluginSection))
    if not result['OK']:
        return result
    seDict = result['Value']
    self.db.seDefinitions[seID]['SEDict'] = seDict

    # Get VO paths if any
    voPathDict = None
    result = gConfig.getOptionsDict('/Resources/StorageElements/%s/%s/VOPath' % (se, pluginSection))
    if result['OK']:
        voPathDict = result['Value']

    if seDict:
        # A.T. Ports can be multiple, this can be better done using the Storage plugin
        # to provide the replica prefix to keep implementations in one place
        if 'Port' in seDict:
            ports = seDict['Port']
            if ',' in ports:
                portList = [x.strip() for x in ports.split(',')]
                random.shuffle(portList)
                seDict['Port'] = portList[0]
        tmpDict = dict(seDict)
        tmpDict['FileName'] = ''
        result = pfnunparse(tmpDict)
        if result['OK']:
            self.db.seDefinitions[seID]['SEDict']['PFNPrefix'] = result['Value']
        if voPathDict is not None:
            for vo in voPathDict:
                tmpDict['Path'] = voPathDict[vo]
                result = pfnunparse(tmpDict)
                if result['OK']:
                    self.db.seDefinitions[seID]['SEDict'].setdefault("VOPrefix", {})[vo] = result['Value']

    self.db.seDefinitions[seID]['LastUpdate'] = time.time()

    return S_OK(self.db.seDefinitions[seID])
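# ---------------------------------------------------------------------------
# Editor's sketch of the multi-port handling above: when the CS lists several
# ports, one is picked at random so PFN prefixes spread the load across them.
#
#   import random
#   ports = '8443, 8444, 8445'
#   portList = [x.strip() for x in ports.split(',')]
#   random.shuffle(portList)
#   portList[0]   # one of '8443', '8444', '8445', chosen at random
# ---------------------------------------------------------------------------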