def getDNFromUserName(username, log, ckey=None, cert=None):
    """Return the DN which CRIC associates to the given CMS username."""
    dn = ''
    with newX509env(X509_USER_CERT=cert, X509_USER_KEY=ckey):
        resourceCatalog = CRIC(logger=log)
        try:
            dn = resourceCatalog.userNameDn(username)
        except Exception:
            log.error("CRIC URL cannot be accessed")
    if not dn:
        log.error("user does not exist")
    return dn
def getDNFromUserName(username, log, ckey=None, cert=None):
    """Return the DN which CRIC associates to the given CMS username."""
    dn = ''
    with newX509env(X509_USER_CERT=cert, X509_USER_KEY=ckey):
        configDict = {"cacheduration": 1, "pycurl": True}  # cache duration is in hours
        # note: this is a module-level function, so the logger must be the
        # 'log' argument; the original 'self.logger' here was a bug
        resourceCatalog = CRIC(logger=log, configDict=configDict)
        try:
            dn = resourceCatalog.userNameDn(username)
        except Exception:
            log.error("CRIC URL cannot be accessed")
    if not dn:
        log.error("user does not exist")
    return dn
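
# A minimal usage sketch for the helper above, assuming the imports the
# function relies on (newX509env, CRIC) are in scope. The username and proxy
# path are illustrative assumptions, not values from the source.
if __name__ == '__main__':
    import logging
    logging.basicConfig(level=logging.INFO)
    exampleLog = logging.getLogger('getDN-example')
    proxy = '/tmp/x509up_u12345'  # hypothetical proxy file, used as both cert and key
    exampleDn = getDNFromUserName('someuser', exampleLog, ckey=proxy, cert=proxy)
    exampleLog.info("DN resolved to: %s", exampleDn)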
def executeInternal(self, *args):
    """The executeInternal method returns 4 if the "completion" threshold is not reached, 0 otherwise"""
    self.stage = args[0]
    self.completion = int(args[1])
    self.prefix = args[2]

    self.setupLog()

    self.statusCacheInfo = {}  # Will be filled with the status from the status cache
    self.readJobStatus()
    completed = set(self.completedJobs(stage=self.stage))
    if len(completed) < self.completion:
        return 4

    self.readProcessedJobs()
    unprocessed = completed - self.processedJobs
    estimates = copy.copy(unprocessed)
    self.logger.info("jobs remaining to process: %s", ", ".join(sorted(unprocessed)))
    if self.stage == 'tail' and len(estimates - set(self.failedJobs)) == 0:
        estimates = set(self.completedJobs(stage='processing', processFailed=False))
    self.logger.info("jobs remaining to process: %s", ", ".join(sorted(unprocessed)))

    # The TaskWorker saves some files that now we are gonna read
    with open('datadiscovery.pkl', 'rb') as fd:
        dataset = pickle.load(fd)  # Output from the discovery process
    with open('taskinformation.pkl', 'rb') as fd:
        task = pickle.load(fd)  # A dictionary containing information about the task as in the Oracle DB
    with open('taskworkerconfig.pkl', 'rb') as fd:
        config = pickle.load(fd)  # Task worker configuration

    # need to use user proxy as credential for talking with cmsweb
    config.TaskWorker.cmscert = os.environ.get('X509_USER_PROXY')
    config.TaskWorker.cmskey = os.environ.get('X509_USER_PROXY')
    config.TaskWorker.envForCMSWEB = newX509env(X509_USER_CERT=config.TaskWorker.cmscert,
                                                X509_USER_KEY=config.TaskWorker.cmskey)

    # need to get username from classAd to setup for Rucio access
    task_ad = classad.parseOne(open(os.environ['_CONDOR_JOB_AD']))
    username = task_ad['CRAB_UserHN']
    config.Services.Rucio_account = username

    # need the global black list
    config.TaskWorker.scratchDir = './scratchdir'
    if not os.path.exists(config.TaskWorker.scratchDir):
        os.makedirs(config.TaskWorker.scratchDir)
    from TaskWorker.Actions.Recurring.BanDestinationSites import CRAB3BanDestinationSites
    banSites = CRAB3BanDestinationSites(config, self.logger)
    with config.TaskWorker.envForCMSWEB:
        banSites.execute()

    # Read the automatic_splitting/throughputs/0-N files where the PJ
    # saved the EventThroughput
    # (report['steps']['cmsRun']['performance']['cpu']['EventThroughput'])
    # and the average size of the output per event
    sumEventsThr = 0
    sumEventsSize = 0
    count = 0
    for jid in estimates:
        if jid in self.failedJobs:
            continue
        fn = "automatic_splitting/throughputs/{0}".format(jid)
        with open(fn) as fd:
            throughput, eventsize = json.load(fd)
        sumEventsThr += throughput
        sumEventsSize += eventsize
        count += 1
    eventsThr = sumEventsThr / count
    eventsSize = sumEventsSize / count

    self.logger.info("average throughput for %s jobs: %s evt/s", count, eventsThr)
    self.logger.info("average eventsize for %s jobs: %s bytes", count, eventsSize)

    maxSize = getattr(config.TaskWorker, 'automaticOutputSizeMaximum', 5 * 1000**3)
    maxEvents = (maxSize / eventsSize) if eventsSize > 0 else 0

    runtime = task['tm_split_args'].get('minutes_per_job', -1)
    if self.stage == "processing":
        # Build in a 33% error margin in the runtime to not create too
        # many tails. This essentially moves the peak to lower
        # runtimes and cuts off less of the job distribution tail.
        target = int(0.75 * runtime)
    elif self.stage == 'tail':
        target = int(max(
            getattr(config.TaskWorker, 'automaticTailRuntimeMinimumMins', 45),
            getattr(config.TaskWorker, 'automaticTailRuntimeFraction', 0.2) * runtime
        ))
    # `target` is in minutes, `eventsThr` is in events/second!
    events = int(target * eventsThr * 60)
    if events > maxEvents and maxEvents > 0:
        self.logger.info("reduced the target event count from %s to %s to obey output size",
                         events, maxEvents)
        events = int(maxEvents)
    splitTask = dict(task)
    splitTask['tm_split_algo'] = 'EventAwareLumiBased'
    splitTask['tm_split_args']['events_per_job'] = events

    if self.stage == 'tail' and not self.adjustLumisForCompletion(splitTask, unprocessed):
        self.logger.info("nothing to process for completion")
        self.saveProcessedJobs(unprocessed)
        return 0

    # Disable retries for processing: every lumi is attempted to be
    # processed once in processing, thrice in the tails -> four times.
    # That should be enough "retries"
    #
    # See note in DagmanCreator about getting this from the Task DB
    if self.stage == "processing":
        config.TaskWorker.numAutomJobRetries = 0

    try:
        splitter = Splitter(config, crabserver=None)
        split_result = splitter.execute(dataset, task=splitTask)
        self.logger.info("Splitting results:")
        for g in split_result.result[0]:
            msg = "Created jobgroup with length {0}".format(len(g.getJobs()))
            self.logger.info(msg)
    except TaskWorkerException as e:
        retmsg = "Splitting failed with:\n{0}".format(e)
        self.logger.error(retmsg)
        # self.set_dashboard_state('FAILED')
        return 1
    try:
        parent = self.prefix if self.stage == 'tail' else None
        rucioClient = getNativeRucioClient(config=config, logger=self.logger)
        creator = DagmanCreator(config, crabserver=None, rucioClient=rucioClient)
        with config.TaskWorker.envForCMSWEB:
            creator.createSubdag(split_result.result, task=task, parent=parent, stage=self.stage)
        self.submitSubdag('RunJobs{0}.subdag'.format(self.prefix),
                          getattr(config.TaskWorker, 'maxIdle', MAX_IDLE_JOBS),
                          getattr(config.TaskWorker, 'maxPost', MAX_POST_JOBS),
                          self.stage)
    except TaskWorkerException as e:
        retmsg = "DAG creation failed with:\n{0}".format(e)
        self.logger.error(retmsg)
        # self.set_dashboard_state('FAILED')
        return 1
    self.saveProcessedJobs(unprocessed)
    return 0
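
# Hedged, self-contained sketch of the sizing math used above, with
# illustrative numbers (not values from the source): a processing-stage
# target of 0.75 * 480 min at 2.5 evt/s would ask for 54000 events per job,
# but a 2 MB/event output against the 5 GB default cap limits it to 2500.
def exampleEventsPerJob(runtimeMins=480, eventsThr=2.5, eventsSize=2e6,
                        maxSize=5 * 1000**3):
    """Return the events-per-job target the processing stage would pick."""
    target = int(0.75 * runtimeMins)       # minutes, with the 33% error margin
    events = int(target * eventsThr * 60)  # evt/s * 60 s/min * minutes
    maxEvents = maxSize / eventsSize       # cap so the output stays under maxSize
    return int(min(events, maxEvents)) if maxEvents > 0 else events

assert exampleEventsPerJob() == 2500  # the 54000-event target exceeds the size cap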
import os

from WMCore.Configuration import ConfigurationEx
from ServerUtilities import newX509env

config = ConfigurationEx()

config.section_("Services")
# dbsInstance is expected to be defined earlier in this test script
config.Services.DBSUrl = 'https://cmsweb.cern.ch/dbs/%s/DBSReader/' % dbsInstance

config.section_("TaskWorker")
# will use X509_USER_PROXY var for this test
#config.TaskWorker.cmscert = os.environ["X509_USER_PROXY"]
#config.TaskWorker.cmskey = os.environ["X509_USER_PROXY"]
# will use the service cert as defined for the TW
config.TaskWorker.cmscert = os.environ["X509_USER_CERT"]
config.TaskWorker.cmskey = os.environ["X509_USER_KEY"]
config.TaskWorker.envForCMSWEB = newX509env(X509_USER_CERT=config.TaskWorker.cmscert,
                                            X509_USER_KEY=config.TaskWorker.cmskey)
config.TaskWorker.DDMServer = 'dynamo.mit.edu'
config.TaskWorker.instance = 'prod'
#config.TaskWorker.restHost = 'cmsweb.cern.ch'
# The second word identifies the DB instance defined in CRABServerAuth.py on the REST
#config.TaskWorker.restURInoAPI = '/crabserver/prod/'

config.Services.Rucio_host = 'https://cms-rucio.cern.ch'
config.Services.Rucio_account = 'crab_server'
config.Services.Rucio_authUrl = 'https://cms-rucio-auth.cern.ch'
config.Services.Rucio_caPath = '/etc/grid-security/certificates/'

fileset = DBSDataDiscovery(config)
fileset.execute(task={
def executeInternal(self, *args):
    """The executeInternal method returns 4 if the "completion" threshold is not reached, 0 otherwise"""
    self.stage = args[0]
    self.completion = int(args[1])
    self.prefix = args[2]

    self.setupLog()

    self.statusCacheInfo = {}  # Will be filled with the status from the status cache
    self.readJobStatus()
    completed = set(self.completedJobs(stage=self.stage))
    if len(completed) < self.completion:
        return 4

    self.readProcessedJobs()
    unprocessed = completed - self.processedJobs
    estimates = copy.copy(unprocessed)
    self.logger.info("jobs remaining to process: %s", ", ".join(sorted(unprocessed)))
    if self.stage == 'tail' and len(estimates - set(self.failedJobs)) == 0:
        estimates = set(self.completedJobs(stage='processing', processFailed=False))
    self.logger.info("jobs remaining to process: %s", ", ".join(sorted(unprocessed)))

    # The TaskWorker saves some files that now we are gonna read
    with open('datadiscovery.pkl', 'rb') as fd:
        dataset = pickle.load(fd)  # Output from the discovery process
    with open('taskinformation.pkl', 'rb') as fd:
        task = pickle.load(fd)  # A dictionary containing information about the task as in the Oracle DB
    with open('taskworkerconfig.pkl', 'rb') as fd:
        config = pickle.load(fd)  # Task worker configuration

    # need to use user proxy as credential for talking with cmsweb
    config.TaskWorker.cmscert = os.environ.get('X509_USER_PROXY')
    config.TaskWorker.cmskey = os.environ.get('X509_USER_PROXY')
    config.TaskWorker.envForCMSWEB = newX509env(X509_USER_CERT=config.TaskWorker.cmscert,
                                                X509_USER_KEY=config.TaskWorker.cmskey)

    # need the global black list
    config.TaskWorker.scratchDir = './scratchdir'
    if not os.path.exists(config.TaskWorker.scratchDir):
        os.makedirs(config.TaskWorker.scratchDir)
    from TaskWorker.Actions.Recurring.BanDestinationSites import CRAB3BanDestinationSites
    banSites = CRAB3BanDestinationSites(config, 'dummy', 'dummy', self.logger)
    with config.TaskWorker.envForCMSWEB:
        banSites.execute()

    # Read the automatic_splitting/throughputs/0-N files where the PJ
    # saved the EventThroughput
    # (report['steps']['cmsRun']['performance']['cpu']['EventThroughput'])
    # and the average size of the output per event
    sumEventsThr = 0
    sumEventsSize = 0
    count = 0
    for jid in estimates:
        if jid in self.failedJobs:
            continue
        fn = "automatic_splitting/throughputs/{0}".format(jid)
        with open(fn) as fd:
            throughput, eventsize = json.load(fd)
        sumEventsThr += throughput
        sumEventsSize += eventsize
        count += 1
    eventsThr = sumEventsThr / count
    eventsSize = sumEventsSize / count

    self.logger.info("average throughput for %s jobs: %s evt/s", count, eventsThr)
    self.logger.info("average eventsize for %s jobs: %s bytes", count, eventsSize)

    maxSize = getattr(config.TaskWorker, 'automaticOutputSizeMaximum', 5 * 1000**3)
    maxEvents = (maxSize / eventsSize) if eventsSize > 0 else 0

    runtime = task['tm_split_args'].get('minutes_per_job', -1)
    if self.stage == "processing":
        # Build in a 33% error margin in the runtime to not create too
        # many tails. This essentially moves the peak to lower
        # runtimes and cuts off less of the job distribution tail.
        target = int(0.75 * runtime)
    elif self.stage == 'tail':
        target = int(max(
            getattr(config.TaskWorker, 'automaticTailRuntimeMinimumMins', 45),
            getattr(config.TaskWorker, 'automaticTailRuntimeFraction', 0.2) * runtime
        ))
    # `target` is in minutes, `eventsThr` is in events/second!
    events = int(target * eventsThr * 60)
    if events > maxEvents and maxEvents > 0:
        self.logger.info("reduced the target event count from %s to %s to obey output size",
                         events, maxEvents)
        events = int(maxEvents)
    splitTask = dict(task)
    splitTask['tm_split_algo'] = 'EventAwareLumiBased'
    splitTask['tm_split_args']['events_per_job'] = events

    if self.stage == 'tail' and not self.adjustLumisForCompletion(splitTask, unprocessed):
        self.logger.info("nothing to process for completion")
        self.saveProcessedJobs(unprocessed)
        return 0

    # Disable retries for processing: every lumi is attempted to be
    # processed once in processing, thrice in the tails -> four times.
    # That should be enough "retries"
    #
    # See note in DagmanCreator about getting this from the Task DB
    if self.stage == "processing":
        config.TaskWorker.numAutomJobRetries = 0

    try:
        splitter = Splitter(config, server=None, resturi='')
        split_result = splitter.execute(dataset, task=splitTask)
        self.logger.info("Splitting results:")
        for g in split_result.result[0]:
            msg = "Created jobgroup with length {0}".format(len(g.getJobs()))
            self.logger.info(msg)
    except TaskWorkerException as e:
        retmsg = "Splitting failed with:\n{0}".format(e)
        self.logger.error(retmsg)
        # self.set_dashboard_state('FAILED')
        return 1
    try:
        parent = self.prefix if self.stage == 'tail' else None
        creator = DagmanCreator(config, server=None, resturi='')
        with config.TaskWorker.envForCMSWEB:
            creator.createSubdag(split_result.result, task=task, parent=parent, stage=self.stage)
        self.submitSubdag('RunJobs{0}.subdag'.format(self.prefix),
                          getattr(config.TaskWorker, 'maxPost', 20), self.stage)
    except TaskWorkerException as e:
        retmsg = "DAG creation failed with:\n{0}".format(e)
        self.logger.error(retmsg)
        # self.set_dashboard_state('FAILED')
        return 1
    self.saveProcessedJobs(unprocessed)
    return 0
def __init__(self, config, logWarning, logDebug, sequential=False, console=False):
    """Initializer

    :arg WMCore.Configuration config: input TaskWorker configuration
    :arg bool logWarning: it tells if a quiet logger is needed
    :arg bool logDebug: it tells if a verbose logger is needed
    :arg bool sequential: it tells if to run in sequential (no subprocesses) mode.
    :arg bool console: it tells if to log to console."""

    def createLogdir(dirname):
        """Create the directory dirname ignoring errors in case it exists.
        Exit if the directory cannot be created.
        """
        try:
            os.mkdir(dirname)
        except OSError as ose:
            if ose.errno != 17:  # ignore the "Directory already exists" error
                print(str(ose))
                print("The task worker needs to access the '%s' directory" % dirname)
                sys.exit(1)

    def setRootLogger(logWarning, logDebug, console):
        """Sets the root logger with the desired verbosity level.
        The root logger logs to logsDir/twlog.txt and every single logging
        instruction is propagated to it (not really nice to read).

        :arg bool logWarning: it tells if a quiet logger is needed
        :arg bool logDebug: it tells if a verbose logger is needed
        :arg bool console: it tells if to log to console
        :return logger: a logger with the appropriate logger level."""
        logsDir = config.TaskWorker.logsDir
        createLogdir(logsDir)
        createLogdir(logsDir + '/processes')
        createLogdir(logsDir + '/tasks')

        if console:
            logging.getLogger().addHandler(logging.StreamHandler())
        else:
            logHandler = MultiProcessingLog(logsDir + '/twlog.txt', when='midnight')
            logFormatter = \
                logging.Formatter("%(asctime)s:%(levelname)s:%(module)s,%(lineno)d:%(message)s")
            logHandler.setFormatter(logFormatter)
            logging.getLogger().addHandler(logHandler)
        loglevel = logging.INFO
        if logWarning:
            loglevel = logging.WARNING
        if logDebug:
            loglevel = logging.DEBUG
        logging.getLogger().setLevel(loglevel)
        logger = setProcessLogger("master", logsDir)
        logger.debug("PID %s.", os.getpid())
        logger.debug("Logging level initialized to %s.", loglevel)
        return logger

    self.STOP = False
    self.TEST = sequential
    self.logger = setRootLogger(logWarning, logDebug, console)
    self.config = config
    resthost = None
    self.restURInoAPI = None
    if not self.config.TaskWorker.mode in MODEURL.keys():
        raise ConfigException("No mode provided: need to specify config.TaskWorker.mode in the configuration")
    elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
        resthost = MODEURL[self.config.TaskWorker.mode]['host']
        self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
    else:
        resthost = self.config.TaskWorker.resturl  # this should be called resthost in the TaskWorkerConfig -_-
        self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
    if resthost is None:
        raise ConfigException("No correct mode provided: need to specify config.TaskWorker.mode in the configuration")
    # Let's increase the server's retries for recoverable errors in the MasterWorker
    # 60 means we'll keep retrying for 1 hour basically (we retry at 20*NUMRETRY seconds, so at: 20s, 60s, 120s, 200s, 300s ...)
    self.server = HTTPRequests(resthost, self.config.TaskWorker.cmscert,
                               self.config.TaskWorker.cmskey, retry=20, logger=self.logger)
    self.logger.debug("Hostcert: %s, hostkey: %s", str(self.config.TaskWorker.cmscert),
                      str(self.config.TaskWorker.cmskey))
    # Retries for any failures
    if not hasattr(self.config.TaskWorker, 'max_retry'):
        self.config.TaskWorker.max_retry = 0
    if not hasattr(self.config.TaskWorker, 'retry_interval'):
        self.config.TaskWorker.retry_interval = [retry * 20 * 2 for retry in range(self.config.TaskWorker.max_retry)]
    if not len(self.config.TaskWorker.retry_interval) == self.config.TaskWorker.max_retry:
        raise ConfigException("No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry.")
    # use the config to pass some useful global stuff to all workers
    # will use TaskWorker.cmscert/key to talk with CMSWEB
    self.config.TaskWorker.envForCMSWEB = newX509env(X509_USER_CERT=self.config.TaskWorker.cmscert,
                                                     X509_USER_KEY=self.config.TaskWorker.cmskey)

    if self.TEST:
        self.slaves = TestWorker(self.config, resthost, self.restURInoAPI + '/workflowdb')
    else:
        self.slaves = Worker(self.config, resthost, self.restURInoAPI + '/workflowdb')
    self.slaves.begin()
    recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
    self.recurringActions = [self.getRecurringActionInst(name) for name in recurringActionsNames]
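
# Hedged illustration of the default retry schedule derived above, standalone
# and with an assumed max_retry of 3 (not a value from the source):
maxRetryExample = 3
retryIntervalExample = [retry * 20 * 2 for retry in range(maxRetryExample)]
assert retryIntervalExample == [0, 40, 80]           # seconds to wait before each retry
assert len(retryIntervalExample) == maxRetryExample  # the consistency check enforced above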
def __init__(self, config, quiet, debug, test=False):
    """Initializer

    :arg WMCore.Configuration config: input TaskWorker configuration
    :arg bool quiet: it tells if a quiet logger is needed
    :arg bool debug: it tells if a verbose logger is needed
    :arg bool test: it tells if to run in test (no subprocesses) mode."""

    def createLogdir(dirname):
        """Create the directory dirname ignoring errors in case it exists.
        Exit if the directory cannot be created.
        """
        try:
            os.mkdir(dirname)
        except OSError as ose:
            if ose.errno != 17:  # ignore the "Directory already exists" error
                print(str(ose))
                print("The task worker needs to access the '%s' directory" % dirname)
                sys.exit(1)

    def setRootLogger(quiet, debug):
        """Sets the root logger with the desired verbosity level.
        The root logger logs to logs/twlog.txt and every single logging
        instruction is propagated to it (not really nice to read).

        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if a verbose logger is needed
        :return logger: a logger with the appropriate logger level."""
        createLogdir('logs')
        createLogdir('logs/processes')
        createLogdir('logs/tasks')

        if self.TEST:  # if we are testing, logging to the console is easier
            logging.getLogger().addHandler(logging.StreamHandler())
        else:
            logHandler = MultiProcessingLog('logs/twlog.txt', when='midnight')
            logFormatter = \
                logging.Formatter("%(asctime)s:%(levelname)s:%(module)s,%(lineno)d:%(message)s")
            logHandler.setFormatter(logFormatter)
            logging.getLogger().addHandler(logHandler)
        loglevel = logging.INFO
        if quiet:
            loglevel = logging.WARNING
        if debug:
            loglevel = logging.DEBUG
        logging.getLogger().setLevel(loglevel)
        logger = setProcessLogger("master")
        logger.debug("PID %s.", os.getpid())
        logger.debug("Logging level initialized to %s.", loglevel)
        return logger

    self.STOP = False
    self.TEST = test
    self.logger = setRootLogger(quiet, debug)
    self.config = config
    resthost = None
    self.restURInoAPI = None
    if not self.config.TaskWorker.mode in MODEURL.keys():
        raise ConfigException("No mode provided: need to specify config.TaskWorker.mode in the configuration")
    elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
        resthost = MODEURL[self.config.TaskWorker.mode]['host']
        self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
    else:
        resthost = self.config.TaskWorker.resturl  # this should be called resthost in the TaskWorkerConfig -_-
        self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
    if resthost is None:
        raise ConfigException("No correct mode provided: need to specify config.TaskWorker.mode in the configuration")
    # Let's increase the server's retries for recoverable errors in the MasterWorker
    # 60 means we'll keep retrying for 1 hour basically (we retry at 20*NUMRETRY seconds, so at: 20s, 60s, 120s, 200s, 300s ...)
    self.server = HTTPRequests(resthost, self.config.TaskWorker.cmscert,
                               self.config.TaskWorker.cmskey, retry=20, logger=self.logger)
    self.logger.debug("Hostcert: %s, hostkey: %s", str(self.config.TaskWorker.cmscert),
                      str(self.config.TaskWorker.cmskey))
    # Retries for any failures
    if not hasattr(self.config.TaskWorker, 'max_retry'):
        self.config.TaskWorker.max_retry = 0
    if not hasattr(self.config.TaskWorker, 'retry_interval'):
        self.config.TaskWorker.retry_interval = [retry * 20 * 2 for retry in range(self.config.TaskWorker.max_retry)]
    if not len(self.config.TaskWorker.retry_interval) == self.config.TaskWorker.max_retry:
        raise ConfigException("No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry.")
    # use the config to pass some useful global stuff to all workers
    # will use TaskWorker.cmscert/key to talk with CMSWEB
    self.config.TaskWorker.envForCMSWEB = newX509env(X509_USER_CERT=self.config.TaskWorker.cmscert,
                                                     X509_USER_KEY=self.config.TaskWorker.cmskey)

    if self.TEST:
        self.slaves = TestWorker(self.config, resthost, self.restURInoAPI + '/workflowdb')
    else:
        self.slaves = Worker(self.config, resthost, self.restURInoAPI + '/workflowdb')
    self.slaves.begin()
    recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
    self.recurringActions = [self.getRecurringActionInst(name) for name in recurringActionsNames]
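
# Hedged sketch of the MODEURL mapping both constructors above rely on. The
# real table lives elsewhere in the TaskWorker code; the keys and values here
# are illustrative assumptions only. A 'host' of None means the REST host is
# read from config.TaskWorker.resturl instead.
MODEURL_EXAMPLE = {
    'cmsweb-prod': {'host': 'cmsweb.cern.ch', 'instance': 'prod'},
    'private': {'host': None, 'instance': 'dev'},
}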
def __init__(self, config, logWarning, logDebug, sequential=False, console=False, name='master'):
    """Initializer

    :arg WMCore.Configuration config: input TaskWorker configuration
    :arg bool logWarning: it tells if a quiet logger is needed
    :arg bool logDebug: it tells if a verbose logger is needed
    :arg bool sequential: it tells if to run in sequential (no subprocesses) mode.
    :arg bool console: it tells if to log to console.
    :arg string name: defines a name for the log of this master process"""

    def createLogdir(dirname):
        """Create the directory dirname ignoring errors in case it exists.
        Exit if the directory cannot be created.
        """
        try:
            os.mkdir(dirname)
        except OSError as ose:
            if ose.errno != 17:  # ignore the "Directory already exists" error
                print(str(ose))
                print("The task worker needs to access the '%s' directory" % dirname)
                sys.exit(1)

    def createAndCleanLogDirectories(logsDir):
        # it can be named with the time stamp a TW started
        createLogdir(logsDir)
        createLogdir(logsDir + '/tasks')
        currentProcessesDir = logsDir + '/processes/'
        createLogdir(currentProcessesDir)
        # when running inside a container, process logs will start with the same
        # process numbers, i.e. same name, at any container restart.
        # to avoid clashes and confusion, we will put away all previous processes
        # logs when a TW instance starts. To this goal each TW which runs
        # creates a directory where new containers will move its logs, so
        # identify the LatestLogs_timestamp directory
        latestLogDir = None  # the logs directory could be empty
        files = os.listdir(currentProcessesDir)
        files.sort(reverse=True)  # if there are multiple Latest*, will hit the latest first
        for f in files:
            if f.startswith('Latest'):
                latestLogDir = currentProcessesDir + f
                break
        if files and latestLogDir:
            # rename from Latest to Old
            oldLogsDir = latestLogDir.replace('Latest', 'Old')
            shutil.move(latestLogDir, oldLogsDir)
        else:
            print("LatestLogDir not found in logs/processes, create a dummy dir to store old files")
            oldLogsDir = currentProcessesDir + 'OldLog-Unknown'
            createLogdir(oldLogsDir)
        # move process logs for latest TW run to old directory
        for f in files:
            if f.startswith('proc.c3id'):
                shutil.move(currentProcessesDir + f, oldLogsDir)
        # create a new LatestLogs directory where to store logs from this TaskWorker
        YYMMDD_HHMMSS = time.strftime('%y%m%d_%H%M%S', time.localtime())
        myDir = currentProcessesDir + 'LatestLogs-' + YYMMDD_HHMMSS
        createLogdir(myDir)

    def setRootLogger(logWarning, logDebug, console, name):
        """Sets the root logger with the desired verbosity level.
        The root logger logs to logsDir/twlog.txt and every single logging
        instruction is propagated to it (not really nice to read).

        :arg bool logWarning: it tells if a quiet logger is needed
        :arg bool logDebug: it tells if a verbose logger is needed
        :arg bool console: it tells if to log to console
        :arg string name: define a name for the log file of this master process
        :return logger: a logger with the appropriate logger level."""
        # this must only be done for the real Master, not when it is used by TapeRecallStatus
        logsDir = config.TaskWorker.logsDir
        if name == 'master':
            createAndCleanLogDirectories(logsDir)

        if console:
            logging.getLogger().addHandler(logging.StreamHandler())
        else:
            logHandler = MultiProcessingLog(logsDir + '/twlog.txt', when='midnight')
            logFormatter = \
                logging.Formatter("%(asctime)s:%(levelname)s:%(module)s,%(lineno)d:%(message)s")
            logHandler.setFormatter(logFormatter)
            logging.getLogger().addHandler(logHandler)
        loglevel = logging.INFO
        if logWarning:
            loglevel = logging.WARNING
        if logDebug:
            loglevel = logging.DEBUG
        logging.getLogger().setLevel(loglevel)
        logger = setProcessLogger(name, logsDir)
        logger.info("PID %s.", os.getpid())
        logger.info("Logging level initialized to %s.", loglevel)
        return logger

    def logVersionAndConfig(config=None, logger=None):
        """Log version number and major config. parameters.

        args: config : a configuration object loaded from file
        args: logger : the logger instance to use
        """
        twstartDict = {}
        twstartDict['version'] = __version__
        twstartDict['DBSHostName'] = config.Services.DBSHostName
        twstartDict['name'] = config.TaskWorker.name
        twstartDict['instance'] = config.TaskWorker.instance
        if config.TaskWorker.instance == 'other':
            twstartDict['restHost'] = config.TaskWorker.restHost
            twstartDict['dbInstance'] = config.TaskWorker.dbInstance
        twstartDict['nslaves'] = config.TaskWorker.nslaves
        twstartDict['recurringActions'] = config.TaskWorker.recurringActions
        # one line for automatic parsing
        logger.info('TWSTART: %s', json.dumps(twstartDict))
        # multiple lines for humans to read
        for k, v in twstartDict.items():
            logger.info('%s: %s', k, v)
        return

    self.STOP = False
    self.TEST = sequential
    self.logger = setRootLogger(logWarning, logDebug, console, name)
    self.config = config
    self.restHost = None
    dbInstance = None

    logVersionAndConfig(self.config, self.logger)

    try:
        instance = self.config.TaskWorker.instance
    except Exception:
        msg = "No instance provided: need to specify config.TaskWorker.instance in the configuration"
        raise ConfigException(msg)

    if instance in SERVICE_INSTANCES:
        self.logger.info('Will connect to CRAB service: %s', instance)
        self.restHost = SERVICE_INSTANCES[instance]['restHost']
        dbInstance = SERVICE_INSTANCES[instance]['dbInstance']
    else:
        msg = "Invalid instance value '%s'" % instance
        raise ConfigException(msg)
    if instance == 'other':
        self.logger.info('Will use restHost and dbInstance from config file')
        try:
            self.restHost = self.config.TaskWorker.restHost
            dbInstance = self.config.TaskWorker.dbInstance
        except Exception:
            msg = "Need to specify config.TaskWorker.restHost and dbInstance in the configuration"
            raise ConfigException(msg)

    self.dbInstance = dbInstance
    self.logger.info('Will connect via URL: https://%s/%s', self.restHost, self.dbInstance)

    # Let's increase the server's retries for recoverable errors in the MasterWorker
    # 60 means we'll keep retrying for 1 hour basically (we retry at 20*NUMRETRY seconds, so at: 20s, 60s, 120s, 200s, 300s ...)
    self.crabserver = CRABRest(self.restHost, self.config.TaskWorker.cmscert,
                               self.config.TaskWorker.cmskey, retry=20,
                               logger=self.logger, userAgent='CRABTaskWorker')
    self.crabserver.setDbInstance(self.dbInstance)
    self.logger.debug("Hostcert: %s, hostkey: %s", str(self.config.TaskWorker.cmscert),
                      str(self.config.TaskWorker.cmskey))
    # Retries for any failures
    if not hasattr(self.config.TaskWorker, 'max_retry'):
        self.config.TaskWorker.max_retry = 0
    if not hasattr(self.config.TaskWorker, 'retry_interval'):
        self.config.TaskWorker.retry_interval = [retry * 20 * 2 for retry in range(self.config.TaskWorker.max_retry)]
    if not len(self.config.TaskWorker.retry_interval) == self.config.TaskWorker.max_retry:
        raise ConfigException("No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry.")
    # use the config to pass some useful global stuff to all workers
    # will use TaskWorker.cmscert/key to talk with CMSWEB
    self.config.TaskWorker.envForCMSWEB = newX509env(X509_USER_CERT=self.config.TaskWorker.cmscert,
                                                     X509_USER_KEY=self.config.TaskWorker.cmskey)

    if self.TEST:
        self.slaves = TestWorker(self.config, self.restHost, self.dbInstance)
    else:
        self.slaves = Worker(self.config, self.restHost, self.dbInstance)
    self.slaves.begin()
    recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
    self.recurringActions = [self.getRecurringActionInst(name) for name in recurringActionsNames]
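
# Hedged sketch of the minimal configuration the constructor above expects.
# Field names follow what __init__ and logVersionAndConfig actually read;
# the concrete values are illustrative assumptions, not from the source.
import os
from WMCore.Configuration import ConfigurationEx

exampleConfig = ConfigurationEx()
exampleConfig.section_("Services")
exampleConfig.Services.DBSHostName = 'cmsweb.cern.ch'  # logged at startup
exampleConfig.section_("TaskWorker")
exampleConfig.TaskWorker.instance = 'preprod'          # must be a key of SERVICE_INSTANCES
exampleConfig.TaskWorker.name = 'example-tw'
exampleConfig.TaskWorker.nslaves = 1
exampleConfig.TaskWorker.recurringActions = []
exampleConfig.TaskWorker.logsDir = './logs'
exampleConfig.TaskWorker.cmscert = os.environ.get('X509_USER_CERT')
exampleConfig.TaskWorker.cmskey = os.environ.get('X509_USER_KEY')
# Instantiating needs live CRAB services and valid credentials, so it is left
# commented out here:
# master = MasterWorker(exampleConfig, logWarning=False, logDebug=True,
#                       sequential=True, console=True)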